Initial commit

2026-06-06 05:21:10 +00:00
commit 6664758a6d
493 changed files with 135653 additions and 0 deletions
--- a/skills/xlsx/scenes/advanced.md
+++ b/skills/xlsx/scenes/advanced.md
@@ -0,0 +1,271 @@
+# Scene: Advanced Operations
+
+## When This Applies
+Batch processing multiple files, handling very large datasets, data validation, conditional formatting, sheet protection, or other power-user features.
+
+---
+
+## Large File Handling (>100K rows)
+
+### Read-Only Mode
+```python
+from openpyxl import load_workbook
+
+# Memory-efficient reading — does NOT load entire file
+wb = load_workbook('huge.xlsx', read_only=True)
+ws = wb.active
+
+for row in ws.iter_rows(min_row=2, values_only=True):
+    process(row)  # Yields rows one at a time
+
+wb.close()  # MUST close read-only workbooks
+```
+
+### Write-Only Mode
+```python
+from openpyxl import Workbook
+
+wb = Workbook(write_only=True)
+ws = wb.create_sheet()
+
+# Write rows sequentially — cannot random-access cells
+for data_row in large_dataset:
+    ws.append(data_row)
+
+wb.save('output.xlsx')
+```
+
+### Chunked Processing with pandas
+```python
+# Read in chunks
+chunks = pd.read_excel('huge.xlsx', chunksize=10000)
+# Note: chunksize only works with read_csv, not read_excel
+
+# For Excel, read specific columns/rows
+df = pd.read_excel('huge.xlsx',
+    usecols=['A', 'C', 'E'],     # Only needed columns
+    nrows=50000,                   # Limit rows
+    dtype={'id': str}              # Prevent type inference overhead
+)
+```
+
+---
+
+## Batch Processing Multiple Files
+
+```python
+import os
+import glob
+import pandas as pd
+
+# Collect all Excel files
+files = glob.glob('data/*.xlsx')
+
+# Method 1: Concatenate into one DataFrame
+all_data = []
+for f in files:
+    df = pd.read_excel(f)
+    df['source_file'] = os.path.basename(f)
+    all_data.append(df)
+
+combined = pd.concat(all_data, ignore_index=True)
+combined.to_excel('combined.xlsx', index=False)
+
+# Method 2: One sheet per file
+wb = Workbook()
+wb.remove(wb.active)  # Remove default sheet
+
+for f in files:
+    df = pd.read_excel(f)
+    ws = wb.create_sheet(title=os.path.splitext(os.path.basename(f))[0][:31])
+    for r in dataframe_to_rows(df, index=False, header=True):
+        ws.append(r)
+
+wb.save('all_files.xlsx')
+```
+
+---
+
+## Data Validation (Dropdown Lists)
+
+```python
+from openpyxl.worksheet.datavalidation import DataValidation
+
+# Dropdown list
+dv = DataValidation(
+    type="list",
+    formula1='"High,Medium,Low"',
+    allow_blank=True,
+    showErrorMessage=True,
+    errorTitle="Invalid",
+    error="Please select High, Medium, or Low"
+)
+ws.add_data_validation(dv)
+dv.add('D5:D100')  # Apply to range
+
+# Number range validation
+dv_num = DataValidation(
+    type="whole",
+    operator="between",
+    formula1=1,
+    formula2=100,
+    errorTitle="Out of range",
+    error="Enter a number between 1 and 100"
+)
+ws.add_data_validation(dv_num)
+dv_num.add('E5:E100')
+
+# Date validation
+dv_date = DataValidation(
+    type="date",
+    operator="greaterThan",
+    formula1="2024-01-01"
+)
+ws.add_data_validation(dv_date)
+dv_date.add('F5:F100')
+```
+
+---
+
+## Conditional Formatting
+
+For full conditional formatting rules, color usage, and code examples → see **`engines/design.md §8`**.
+
+Quick reference for advanced-only patterns (FormulaRule for row-level highlighting):
+
+```python
+from openpyxl.formatting.rule import FormulaRule
+from openpyxl.styles import PatternFill
+
+# Formula-based: highlight entire row if status = "Overdue"
+ws.conditional_formatting.add('B5:H100',
+    FormulaRule(formula=['$G5="Overdue"'],
+               fill=PatternFill('solid', fgColor='FFEBEE')))
+
+# Note: Icon sets are NOT supported by openpyxl — use color fills instead
+```
+
+---
+
+## Sheet Protection
+
+```python
+# Protect sheet (allow select + sort, prevent edits)
+ws.protection.sheet = True
+ws.protection.password = 'mypassword'
+ws.protection.sort = True
+ws.protection.autoFilter = True
+
+# Unlock specific cells for user input
+from openpyxl.styles import Protection
+unlocked = Protection(locked=False)
+for row in range(5, 101):
+    ws.cell(row=row, column=4).protection = unlocked  # Column D is editable
+
+# Protect workbook structure (prevent adding/deleting sheets)
+wb.security.workbookPassword = 'structpass'
+wb.security.lockStructure = True
+```
+
+---
+
+## Named Ranges
+
+```python
+from openpyxl.workbook.defined_name import DefinedName
+
+# Create named range
+ref = f"'Data'!$B$5:$B$100"
+defn = DefinedName('SalesData', attr_text=ref)
+wb.defined_names.add(defn)
+
+# Use in formulas
+ws['H5'] = '=SUM(SalesData)'
+```
+
+---
+
+## Auto-Filter & Sort
+
+```python
+# Apply auto-filter
+ws.auto_filter.ref = 'B4:H100'
+
+# Add filter criteria (for saved state — user can change in Excel)
+ws.auto_filter.add_filter_column(0, ['Active', 'Pending'])
+
+# Sort (openpyxl can set sort state, but actual reordering
+# must be done in Python before writing)
+df = df.sort_values(['Category', 'Revenue'], ascending=[True, False])
+```
+
+---
+
+## Merged Cells
+
+```python
+# Merge cells
+ws.merge_cells('B2:H2')  # Title spanning full width
+
+# Write to merged range (write to top-left cell)
+ws['B2'] = 'Report Title'
+
+# Check existing merges before editing
+for merge_range in ws.merged_cells.ranges:
+    print(f"Merged: {merge_range}")
+
+# Unmerge if needed
+ws.unmerge_cells('B2:H2')
+```
+
+**Warning**: Never write to cells within a merged range except the top-left cell. This causes corruption.
+
+---
+
+## Performance Tips
+
+| Technique | When | Impact |
+|-----------|------|--------|
+| `read_only=True` | Reading files >50K rows | ~10x less memory |
+| `write_only=True` | Writing files >50K rows | ~5x faster |
+| `usecols` parameter | Only need specific columns | Faster read |
+| Avoid `ws.cell()` in tight loops | Use `ws.append()` instead | Faster write |
+| Batch style application | Apply to ranges, not cell-by-cell | Faster formatting |
+| `data_only=True` for analysis | Need values not formulas | Faster read |
+
+---
+
+## VBA Module Inspection
+
+When working with `.xlsm` files, you can read and list VBA modules:
+
+```python
+from openpyxl import load_workbook
+import zipfile
+import os
+
+def list_vba_modules(filepath):
+    """List all VBA modules in an .xlsm file."""
+    if not filepath.endswith(('.xlsm', '.xlsb')):
+        return {"has_vba": False, "modules": []}
+    
+    modules = []
+    try:
+        with zipfile.ZipFile(filepath, 'r') as zf:
+            vba_files = [f for f in zf.namelist() if f.startswith('xl/vbaProject')]
+            if not vba_files:
+                return {"has_vba": False, "modules": []}
+            
+            # Read with keep_vba to access vba_archive
+            wb = load_workbook(filepath, keep_vba=True)
+            if wb.vba_archive:
+                for name in wb.vba_archive.namelist():
+                    modules.append(name)
+            wb.close()
+    except Exception as e:
+        return {"has_vba": False, "error": str(e)}
+    
+    return {"has_vba": True, "modules": modules}
+```
+
+Use this to inspect before editing — know what VBA exists before you touch the file.