Initial commit
This commit is contained in:
271
skills/xlsx/scenes/advanced.md
Executable file
271
skills/xlsx/scenes/advanced.md
Executable file
@@ -0,0 +1,271 @@
|
||||
# Scene: Advanced Operations
|
||||
|
||||
## When This Applies
|
||||
Batch processing multiple files, handling very large datasets, data validation, conditional formatting, sheet protection, or other power-user features.
|
||||
|
||||
---
|
||||
|
||||
## Large File Handling (>100K rows)
|
||||
|
||||
### Read-Only Mode
|
||||
```python
|
||||
from openpyxl import load_workbook
|
||||
|
||||
# Memory-efficient reading — does NOT load entire file
|
||||
wb = load_workbook('huge.xlsx', read_only=True)
|
||||
ws = wb.active
|
||||
|
||||
for row in ws.iter_rows(min_row=2, values_only=True):
|
||||
process(row) # Yields rows one at a time
|
||||
|
||||
wb.close() # MUST close read-only workbooks
|
||||
```
|
||||
|
||||
### Write-Only Mode
|
||||
```python
|
||||
from openpyxl import Workbook
|
||||
|
||||
wb = Workbook(write_only=True)
|
||||
ws = wb.create_sheet()
|
||||
|
||||
# Write rows sequentially — cannot random-access cells
|
||||
for data_row in large_dataset:
|
||||
ws.append(data_row)
|
||||
|
||||
wb.save('output.xlsx')
|
||||
```
|
||||
|
||||
### Chunked Processing with pandas
|
||||
```python
|
||||
# Read in chunks
|
||||
chunks = pd.read_excel('huge.xlsx', chunksize=10000)
|
||||
# Note: chunksize only works with read_csv, not read_excel
|
||||
|
||||
# For Excel, read specific columns/rows
|
||||
df = pd.read_excel('huge.xlsx',
|
||||
usecols=['A', 'C', 'E'], # Only needed columns
|
||||
nrows=50000, # Limit rows
|
||||
dtype={'id': str} # Prevent type inference overhead
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Batch Processing Multiple Files
|
||||
|
||||
```python
|
||||
import os
|
||||
import glob
|
||||
import pandas as pd
|
||||
|
||||
# Collect all Excel files
|
||||
files = glob.glob('data/*.xlsx')
|
||||
|
||||
# Method 1: Concatenate into one DataFrame
|
||||
all_data = []
|
||||
for f in files:
|
||||
df = pd.read_excel(f)
|
||||
df['source_file'] = os.path.basename(f)
|
||||
all_data.append(df)
|
||||
|
||||
combined = pd.concat(all_data, ignore_index=True)
|
||||
combined.to_excel('combined.xlsx', index=False)
|
||||
|
||||
# Method 2: One sheet per file
|
||||
wb = Workbook()
|
||||
wb.remove(wb.active) # Remove default sheet
|
||||
|
||||
for f in files:
|
||||
df = pd.read_excel(f)
|
||||
ws = wb.create_sheet(title=os.path.splitext(os.path.basename(f))[0][:31])
|
||||
for r in dataframe_to_rows(df, index=False, header=True):
|
||||
ws.append(r)
|
||||
|
||||
wb.save('all_files.xlsx')
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Data Validation (Dropdown Lists)
|
||||
|
||||
```python
|
||||
from openpyxl.worksheet.datavalidation import DataValidation
|
||||
|
||||
# Dropdown list
|
||||
dv = DataValidation(
|
||||
type="list",
|
||||
formula1='"High,Medium,Low"',
|
||||
allow_blank=True,
|
||||
showErrorMessage=True,
|
||||
errorTitle="Invalid",
|
||||
error="Please select High, Medium, or Low"
|
||||
)
|
||||
ws.add_data_validation(dv)
|
||||
dv.add('D5:D100') # Apply to range
|
||||
|
||||
# Number range validation
|
||||
dv_num = DataValidation(
|
||||
type="whole",
|
||||
operator="between",
|
||||
formula1=1,
|
||||
formula2=100,
|
||||
errorTitle="Out of range",
|
||||
error="Enter a number between 1 and 100"
|
||||
)
|
||||
ws.add_data_validation(dv_num)
|
||||
dv_num.add('E5:E100')
|
||||
|
||||
# Date validation
|
||||
dv_date = DataValidation(
|
||||
type="date",
|
||||
operator="greaterThan",
|
||||
formula1="2024-01-01"
|
||||
)
|
||||
ws.add_data_validation(dv_date)
|
||||
dv_date.add('F5:F100')
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Conditional Formatting
|
||||
|
||||
For full conditional formatting rules, color usage, and code examples → see **`engines/design.md §8`**.
|
||||
|
||||
Quick reference for advanced-only patterns (FormulaRule for row-level highlighting):
|
||||
|
||||
```python
|
||||
from openpyxl.formatting.rule import FormulaRule
|
||||
from openpyxl.styles import PatternFill
|
||||
|
||||
# Formula-based: highlight entire row if status = "Overdue"
|
||||
ws.conditional_formatting.add('B5:H100',
|
||||
FormulaRule(formula=['$G5="Overdue"'],
|
||||
fill=PatternFill('solid', fgColor='FFEBEE')))
|
||||
|
||||
# Note: Icon sets are NOT supported by openpyxl — use color fills instead
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Sheet Protection
|
||||
|
||||
```python
|
||||
# Protect sheet (allow select + sort, prevent edits)
|
||||
ws.protection.sheet = True
|
||||
ws.protection.password = 'mypassword'
|
||||
ws.protection.sort = True
|
||||
ws.protection.autoFilter = True
|
||||
|
||||
# Unlock specific cells for user input
|
||||
from openpyxl.styles import Protection
|
||||
unlocked = Protection(locked=False)
|
||||
for row in range(5, 101):
|
||||
ws.cell(row=row, column=4).protection = unlocked # Column D is editable
|
||||
|
||||
# Protect workbook structure (prevent adding/deleting sheets)
|
||||
wb.security.workbookPassword = 'structpass'
|
||||
wb.security.lockStructure = True
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Named Ranges
|
||||
|
||||
```python
|
||||
from openpyxl.workbook.defined_name import DefinedName
|
||||
|
||||
# Create named range
|
||||
ref = f"'Data'!$B$5:$B$100"
|
||||
defn = DefinedName('SalesData', attr_text=ref)
|
||||
wb.defined_names.add(defn)
|
||||
|
||||
# Use in formulas
|
||||
ws['H5'] = '=SUM(SalesData)'
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Auto-Filter & Sort
|
||||
|
||||
```python
|
||||
# Apply auto-filter
|
||||
ws.auto_filter.ref = 'B4:H100'
|
||||
|
||||
# Add filter criteria (for saved state — user can change in Excel)
|
||||
ws.auto_filter.add_filter_column(0, ['Active', 'Pending'])
|
||||
|
||||
# Sort (openpyxl can set sort state, but actual reordering
|
||||
# must be done in Python before writing)
|
||||
df = df.sort_values(['Category', 'Revenue'], ascending=[True, False])
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Merged Cells
|
||||
|
||||
```python
|
||||
# Merge cells
|
||||
ws.merge_cells('B2:H2') # Title spanning full width
|
||||
|
||||
# Write to merged range (write to top-left cell)
|
||||
ws['B2'] = 'Report Title'
|
||||
|
||||
# Check existing merges before editing
|
||||
for merge_range in ws.merged_cells.ranges:
|
||||
print(f"Merged: {merge_range}")
|
||||
|
||||
# Unmerge if needed
|
||||
ws.unmerge_cells('B2:H2')
|
||||
```
|
||||
|
||||
**Warning**: Never write to cells within a merged range except the top-left cell. This causes corruption.
|
||||
|
||||
---
|
||||
|
||||
## Performance Tips
|
||||
|
||||
| Technique | When | Impact |
|
||||
|-----------|------|--------|
|
||||
| `read_only=True` | Reading files >50K rows | ~10x less memory |
|
||||
| `write_only=True` | Writing files >50K rows | ~5x faster |
|
||||
| `usecols` parameter | Only need specific columns | Faster read |
|
||||
| Avoid `ws.cell()` in tight loops | Use `ws.append()` instead | Faster write |
|
||||
| Batch style application | Apply to ranges, not cell-by-cell | Faster formatting |
|
||||
| `data_only=True` for analysis | Need values not formulas | Faster read |
|
||||
|
||||
---
|
||||
|
||||
## VBA Module Inspection
|
||||
|
||||
When working with `.xlsm` files, you can read and list VBA modules:
|
||||
|
||||
```python
|
||||
from openpyxl import load_workbook
|
||||
import zipfile
|
||||
import os
|
||||
|
||||
def list_vba_modules(filepath):
|
||||
"""List all VBA modules in an .xlsm file."""
|
||||
if not filepath.endswith(('.xlsm', '.xlsb')):
|
||||
return {"has_vba": False, "modules": []}
|
||||
|
||||
modules = []
|
||||
try:
|
||||
with zipfile.ZipFile(filepath, 'r') as zf:
|
||||
vba_files = [f for f in zf.namelist() if f.startswith('xl/vbaProject')]
|
||||
if not vba_files:
|
||||
return {"has_vba": False, "modules": []}
|
||||
|
||||
# Read with keep_vba to access vba_archive
|
||||
wb = load_workbook(filepath, keep_vba=True)
|
||||
if wb.vba_archive:
|
||||
for name in wb.vba_archive.namelist():
|
||||
modules.append(name)
|
||||
wb.close()
|
||||
except Exception as e:
|
||||
return {"has_vba": False, "error": str(e)}
|
||||
|
||||
return {"has_vba": True, "modules": modules}
|
||||
```
|
||||
|
||||
Use this to inspect before editing — know what VBA exists before you touch the file.
|
||||
Reference in New Issue
Block a user