6.6 KiB
Executable File
6.6 KiB
Executable File
Scene: Advanced Operations
When This Applies
Batch processing multiple files, handling very large datasets, data validation, conditional formatting, sheet protection, or other power-user features.
Large File Handling (>100K rows)
Read-Only Mode
from openpyxl import load_workbook
# Memory-efficient reading — does NOT load entire file
wb = load_workbook('huge.xlsx', read_only=True)
ws = wb.active
for row in ws.iter_rows(min_row=2, values_only=True):
process(row) # Yields rows one at a time
wb.close() # MUST close read-only workbooks
Write-Only Mode
from openpyxl import Workbook
wb = Workbook(write_only=True)
ws = wb.create_sheet()
# Write rows sequentially — cannot random-access cells
for data_row in large_dataset:
ws.append(data_row)
wb.save('output.xlsx')
Chunked Processing with pandas
# Read in chunks
chunks = pd.read_excel('huge.xlsx', chunksize=10000)
# Note: chunksize only works with read_csv, not read_excel
# For Excel, read specific columns/rows
df = pd.read_excel('huge.xlsx',
usecols=['A', 'C', 'E'], # Only needed columns
nrows=50000, # Limit rows
dtype={'id': str} # Prevent type inference overhead
)
Batch Processing Multiple Files
import os
import glob
import pandas as pd
# Collect all Excel files
files = glob.glob('data/*.xlsx')
# Method 1: Concatenate into one DataFrame
all_data = []
for f in files:
df = pd.read_excel(f)
df['source_file'] = os.path.basename(f)
all_data.append(df)
combined = pd.concat(all_data, ignore_index=True)
combined.to_excel('combined.xlsx', index=False)
# Method 2: One sheet per file
wb = Workbook()
wb.remove(wb.active) # Remove default sheet
for f in files:
df = pd.read_excel(f)
ws = wb.create_sheet(title=os.path.splitext(os.path.basename(f))[0][:31])
for r in dataframe_to_rows(df, index=False, header=True):
ws.append(r)
wb.save('all_files.xlsx')
Data Validation (Dropdown Lists)
from openpyxl.worksheet.datavalidation import DataValidation
# Dropdown list
dv = DataValidation(
type="list",
formula1='"High,Medium,Low"',
allow_blank=True,
showErrorMessage=True,
errorTitle="Invalid",
error="Please select High, Medium, or Low"
)
ws.add_data_validation(dv)
dv.add('D5:D100') # Apply to range
# Number range validation
dv_num = DataValidation(
type="whole",
operator="between",
formula1=1,
formula2=100,
errorTitle="Out of range",
error="Enter a number between 1 and 100"
)
ws.add_data_validation(dv_num)
dv_num.add('E5:E100')
# Date validation
dv_date = DataValidation(
type="date",
operator="greaterThan",
formula1="2024-01-01"
)
ws.add_data_validation(dv_date)
dv_date.add('F5:F100')
Conditional Formatting
For full conditional formatting rules, color usage, and code examples → see engines/design.md §8.
Quick reference for advanced-only patterns (FormulaRule for row-level highlighting):
from openpyxl.formatting.rule import FormulaRule
from openpyxl.styles import PatternFill
# Formula-based: highlight entire row if status = "Overdue"
ws.conditional_formatting.add('B5:H100',
FormulaRule(formula=['$G5="Overdue"'],
fill=PatternFill('solid', fgColor='FFEBEE')))
# Note: Icon sets are NOT supported by openpyxl — use color fills instead
Sheet Protection
# Protect sheet (allow select + sort, prevent edits)
ws.protection.sheet = True
ws.protection.password = 'mypassword'
ws.protection.sort = True
ws.protection.autoFilter = True
# Unlock specific cells for user input
from openpyxl.styles import Protection
unlocked = Protection(locked=False)
for row in range(5, 101):
ws.cell(row=row, column=4).protection = unlocked # Column D is editable
# Protect workbook structure (prevent adding/deleting sheets)
wb.security.workbookPassword = 'structpass'
wb.security.lockStructure = True
Named Ranges
from openpyxl.workbook.defined_name import DefinedName
# Create named range
ref = f"'Data'!$B$5:$B$100"
defn = DefinedName('SalesData', attr_text=ref)
wb.defined_names.add(defn)
# Use in formulas
ws['H5'] = '=SUM(SalesData)'
Auto-Filter & Sort
# Apply auto-filter
ws.auto_filter.ref = 'B4:H100'
# Add filter criteria (for saved state — user can change in Excel)
ws.auto_filter.add_filter_column(0, ['Active', 'Pending'])
# Sort (openpyxl can set sort state, but actual reordering
# must be done in Python before writing)
df = df.sort_values(['Category', 'Revenue'], ascending=[True, False])
Merged Cells
# Merge cells
ws.merge_cells('B2:H2') # Title spanning full width
# Write to merged range (write to top-left cell)
ws['B2'] = 'Report Title'
# Check existing merges before editing
for merge_range in ws.merged_cells.ranges:
print(f"Merged: {merge_range}")
# Unmerge if needed
ws.unmerge_cells('B2:H2')
Warning: Never write to cells within a merged range except the top-left cell. This causes corruption.
Performance Tips
| Technique | When | Impact |
|---|---|---|
read_only=True |
Reading files >50K rows | ~10x less memory |
write_only=True |
Writing files >50K rows | ~5x faster |
usecols parameter |
Only need specific columns | Faster read |
Avoid ws.cell() in tight loops |
Use ws.append() instead |
Faster write |
| Batch style application | Apply to ranges, not cell-by-cell | Faster formatting |
data_only=True for analysis |
Need values not formulas | Faster read |
VBA Module Inspection
When working with .xlsm files, you can read and list VBA modules:
from openpyxl import load_workbook
import zipfile
import os
def list_vba_modules(filepath):
"""List all VBA modules in an .xlsm file."""
if not filepath.endswith(('.xlsm', '.xlsb')):
return {"has_vba": False, "modules": []}
modules = []
try:
with zipfile.ZipFile(filepath, 'r') as zf:
vba_files = [f for f in zf.namelist() if f.startswith('xl/vbaProject')]
if not vba_files:
return {"has_vba": False, "modules": []}
# Read with keep_vba to access vba_archive
wb = load_workbook(filepath, keep_vba=True)
if wb.vba_archive:
for name in wb.vba_archive.namelist():
modules.append(name)
wb.close()
except Exception as e:
return {"has_vba": False, "error": str(e)}
return {"has_vba": True, "modules": modules}
Use this to inspect before editing — know what VBA exists before you touch the file.