Initial commit
This commit is contained in:
271
skills/xlsx/scenes/advanced.md
Executable file
271
skills/xlsx/scenes/advanced.md
Executable file
@@ -0,0 +1,271 @@
|
||||
# Scene: Advanced Operations
|
||||
|
||||
## When This Applies
|
||||
Batch processing multiple files, handling very large datasets, data validation, conditional formatting, sheet protection, or other power-user features.
|
||||
|
||||
---
|
||||
|
||||
## Large File Handling (>100K rows)
|
||||
|
||||
### Read-Only Mode
|
||||
```python
|
||||
from openpyxl import load_workbook
|
||||
|
||||
# Memory-efficient reading — does NOT load entire file
|
||||
wb = load_workbook('huge.xlsx', read_only=True)
|
||||
ws = wb.active
|
||||
|
||||
for row in ws.iter_rows(min_row=2, values_only=True):
|
||||
process(row) # Yields rows one at a time
|
||||
|
||||
wb.close() # MUST close read-only workbooks
|
||||
```
|
||||
|
||||
### Write-Only Mode
|
||||
```python
|
||||
from openpyxl import Workbook
|
||||
|
||||
wb = Workbook(write_only=True)
|
||||
ws = wb.create_sheet()
|
||||
|
||||
# Write rows sequentially — cannot random-access cells
|
||||
for data_row in large_dataset:
|
||||
ws.append(data_row)
|
||||
|
||||
wb.save('output.xlsx')
|
||||
```
|
||||
|
||||
### Chunked Processing with pandas
|
||||
```python
|
||||
import pandas as pd

# NOTE: pd.read_excel has NO `chunksize` parameter (only read_csv supports it);
# passing chunksize=... to read_excel raises an error instead of streaming.
# For true row-by-row streaming of a workbook, use openpyxl's read-only mode.

# For Excel, bound memory by reading only the columns/rows you need.
df = pd.read_excel(
    'huge.xlsx',
    # A list of strings selects columns by HEADER NAME. To select by Excel
    # column letter instead, pass a single string such as 'A,C,E'.
    usecols=['A', 'C', 'E'],   # Only needed columns
    nrows=50000,               # Limit rows
    dtype={'id': str},         # Prevent type inference overhead
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Batch Processing Multiple Files
|
||||
|
||||
```python
|
||||
import os
|
||||
import glob
|
||||
import pandas as pd
|
||||
|
||||
# Collect all Excel files
|
||||
files = glob.glob('data/*.xlsx')
|
||||
|
||||
# Method 1: Concatenate into one DataFrame
|
||||
all_data = []
|
||||
for f in files:
|
||||
df = pd.read_excel(f)
|
||||
df['source_file'] = os.path.basename(f)
|
||||
all_data.append(df)
|
||||
|
||||
combined = pd.concat(all_data, ignore_index=True)
|
||||
combined.to_excel('combined.xlsx', index=False)
|
||||
|
||||
# Method 2: One sheet per file
|
||||
wb = Workbook()
|
||||
wb.remove(wb.active) # Remove default sheet
|
||||
|
||||
for f in files:
|
||||
df = pd.read_excel(f)
|
||||
ws = wb.create_sheet(title=os.path.splitext(os.path.basename(f))[0][:31])
|
||||
for r in dataframe_to_rows(df, index=False, header=True):
|
||||
ws.append(r)
|
||||
|
||||
wb.save('all_files.xlsx')
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Data Validation (Dropdown Lists)
|
||||
|
||||
```python
|
||||
from openpyxl.worksheet.datavalidation import DataValidation
|
||||
|
||||
# Dropdown list
|
||||
dv = DataValidation(
|
||||
type="list",
|
||||
formula1='"High,Medium,Low"',
|
||||
allow_blank=True,
|
||||
showErrorMessage=True,
|
||||
errorTitle="Invalid",
|
||||
error="Please select High, Medium, or Low"
|
||||
)
|
||||
ws.add_data_validation(dv)
|
||||
dv.add('D5:D100') # Apply to range
|
||||
|
||||
# Number range validation
|
||||
dv_num = DataValidation(
|
||||
type="whole",
|
||||
operator="between",
|
||||
formula1=1,
|
||||
formula2=100,
|
||||
errorTitle="Out of range",
|
||||
error="Enter a number between 1 and 100"
|
||||
)
|
||||
ws.add_data_validation(dv_num)
|
||||
dv_num.add('E5:E100')
|
||||
|
||||
# Date validation
|
||||
dv_date = DataValidation(
    type="date",
    operator="greaterThan",
    # Excel evaluates formula1 as an expression: a bare "2024-01-01" is the
    # subtraction 2024 - 1 - 1 = 2022 (a date serial in 1905), so nearly every
    # date would pass. Build the boundary with DATE() instead.
    formula1="DATE(2024,1,1)",
)
|
||||
ws.add_data_validation(dv_date)
|
||||
dv_date.add('F5:F100')
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Conditional Formatting
|
||||
|
||||
For full conditional formatting rules, color usage, and code examples → see **`engines/design.md §8`**.
|
||||
|
||||
Quick reference for advanced-only patterns (FormulaRule for row-level highlighting):
|
||||
|
||||
```python
|
||||
from openpyxl.formatting.rule import FormulaRule
|
||||
from openpyxl.styles import PatternFill
|
||||
|
||||
# Formula-based: highlight entire row if status = "Overdue"
|
||||
ws.conditional_formatting.add('B5:H100',
|
||||
FormulaRule(formula=['$G5="Overdue"'],
|
||||
fill=PatternFill('solid', fgColor='FFEBEE')))
|
||||
|
||||
# Note: Icon sets are NOT supported by openpyxl — use color fills instead
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Sheet Protection
|
||||
|
||||
```python
|
||||
# Protect sheet (allow select + sort, prevent edits)
|
||||
ws.protection.sheet = True
|
||||
ws.protection.password = 'mypassword'
|
||||
ws.protection.sort = True
|
||||
ws.protection.autoFilter = True
|
||||
|
||||
# Unlock specific cells for user input
|
||||
from openpyxl.styles import Protection
|
||||
unlocked = Protection(locked=False)
|
||||
for row in range(5, 101):
|
||||
ws.cell(row=row, column=4).protection = unlocked # Column D is editable
|
||||
|
||||
# Protect workbook structure (prevent adding/deleting sheets)
|
||||
wb.security.workbookPassword = 'structpass'
|
||||
wb.security.lockStructure = True
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Named Ranges
|
||||
|
||||
```python
|
||||
from openpyxl.workbook.defined_name import DefinedName
|
||||
|
||||
# Create named range
|
||||
ref = f"'Data'!$B$5:$B$100"
|
||||
defn = DefinedName('SalesData', attr_text=ref)
|
||||
wb.defined_names.add(defn)
|
||||
|
||||
# Use in formulas
|
||||
ws['H5'] = '=SUM(SalesData)'
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Auto-Filter & Sort
|
||||
|
||||
```python
|
||||
# Apply auto-filter
|
||||
ws.auto_filter.ref = 'B4:H100'
|
||||
|
||||
# Add filter criteria (for saved state — user can change in Excel)
|
||||
ws.auto_filter.add_filter_column(0, ['Active', 'Pending'])
|
||||
|
||||
# Sort (openpyxl can set sort state, but actual reordering
|
||||
# must be done in Python before writing)
|
||||
df = df.sort_values(['Category', 'Revenue'], ascending=[True, False])
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Merged Cells
|
||||
|
||||
```python
|
||||
# Merge cells
|
||||
ws.merge_cells('B2:H2') # Title spanning full width
|
||||
|
||||
# Write to merged range (write to top-left cell)
|
||||
ws['B2'] = 'Report Title'
|
||||
|
||||
# Check existing merges before editing
|
||||
for merge_range in ws.merged_cells.ranges:
|
||||
print(f"Merged: {merge_range}")
|
||||
|
||||
# Unmerge if needed
|
||||
ws.unmerge_cells('B2:H2')
|
||||
```
|
||||
|
||||
**Warning**: Never write to any cell of a merged range other than its top-left cell — writing to the other cells causes corruption.
|
||||
|
||||
---
|
||||
|
||||
## Performance Tips
|
||||
|
||||
| Technique | When | Impact |
|
||||
|-----------|------|--------|
|
||||
| `read_only=True` | Reading files >50K rows | ~10x less memory |
|
||||
| `write_only=True` | Writing files >50K rows | ~5x faster |
|
||||
| `usecols` parameter | Only need specific columns | Faster read |
|
||||
| Avoid `ws.cell()` in tight loops | Use `ws.append()` instead | Faster write |
|
||||
| Batch style application | Apply to ranges, not cell-by-cell | Faster formatting |
|
||||
| `data_only=True` for analysis | Need values not formulas | Faster read |
|
||||
|
||||
---
|
||||
|
||||
## VBA Module Inspection
|
||||
|
||||
When working with `.xlsm` files, you can read and list VBA modules:
|
||||
|
||||
```python
|
||||
from openpyxl import load_workbook
|
||||
import zipfile
|
||||
import os
|
||||
|
||||
def list_vba_modules(filepath):
    """Report the VBA project parts contained in an .xlsm/.xlsb package.

    The file is inspected purely as a ZIP archive; the workbook is never
    parsed. Individual module names live inside the binary
    ``xl/vbaProject.bin`` stream and are not decoded here, so "modules"
    holds the names of the VBA-related archive parts.

    Args:
        filepath: Path (``str`` or ``os.PathLike``) of the file to inspect.

    Returns:
        dict with the keys:
          * ``"has_vba"`` - True when at least one ``xl/vbaProject*`` part exists.
          * ``"modules"`` - list of those part names (always present).
          * ``"error"``   - only present when the file could not be read as a
            ZIP archive; holds the error message.
    """
    path = os.fspath(filepath)
    # Compare case-insensitively so "BOOK.XLSM" is recognised as well.
    if not path.lower().endswith(('.xlsm', '.xlsb')):
        return {"has_vba": False, "modules": []}

    try:
        with zipfile.ZipFile(path, 'r') as zf:
            # Keep only the VBA parts. (The archive that openpyxl attaches via
            # keep_vba=True is a copy of the whole package, so listing it
            # would report every sheet/style part as a "module".)
            modules = [name for name in zf.namelist()
                       if name.startswith('xl/vbaProject')]
    except (zipfile.BadZipFile, OSError) as e:
        # Same shape as the success result so callers can always read "modules".
        return {"has_vba": False, "modules": [], "error": str(e)}

    return {"has_vba": bool(modules), "modules": modules}
|
||||
```
|
||||
|
||||
Use this to inspect before editing — know what VBA exists before you touch the file.
|
||||
234
skills/xlsx/scenes/analyze-recipes.md
Executable file
234
skills/xlsx/scenes/analyze-recipes.md
Executable file
@@ -0,0 +1,234 @@
|
||||
# Analyze Recipes — Code Patterns for Data Analysis
|
||||
|
||||
> Load this file ON DEMAND when you need specific code patterns. Do NOT load upfront.
|
||||
|
||||
---
|
||||
|
||||
## Load & Explore
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
|
||||
df = pd.read_excel('input.xlsx') # or read_csv, read_json
|
||||
# Multi-sheet: pd.read_excel('input.xlsx', sheet_name=None) → dict
|
||||
|
||||
print(f"Shape: {df.shape}")
|
||||
print(f"Columns: {list(df.columns)}")
|
||||
print(f"Dtypes:\n{df.dtypes}")
|
||||
print(f"Nulls:\n{df.isnull().sum()}")
|
||||
print(f"Duplicates: {df.duplicated().sum()}")
|
||||
print(f"\nDescribe:\n{df.describe()}")
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Aggregation & Grouping
|
||||
|
||||
```python
|
||||
summary = df.groupby('Category').agg(
|
||||
total=('Revenue', 'sum'),
|
||||
avg=('Revenue', 'mean'),
|
||||
count=('Revenue', 'count'),
|
||||
max_val=('Revenue', 'max')
|
||||
).round(2)
|
||||
|
||||
pivot = df.pivot_table(
|
||||
values='Amount', index='Category', columns='Quarter',
|
||||
aggfunc='sum', margins=True
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Time Series
|
||||
|
||||
```python
|
||||
df['date'] = pd.to_datetime(df['date'])
|
||||
monthly = df.resample('M', on='date').agg({'revenue': 'sum', 'orders': 'count'})
|
||||
monthly['growth'] = monthly['revenue'].pct_change()
|
||||
monthly['rolling_3m'] = monthly['revenue'].rolling(3).mean()
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Comparison / Diff
|
||||
|
||||
```python
|
||||
df1 = pd.read_excel('this_month.xlsx')
|
||||
df2 = pd.read_excel('last_month.xlsx')
|
||||
merged = df1.merge(df2, on='ID', suffixes=('_new', '_old'))
|
||||
merged['change'] = merged['value_new'] - merged['value_old']
|
||||
merged['change_pct'] = (merged['change'] / merged['value_old'] * 100).round(1)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Statistical Analysis
|
||||
|
||||
```python
|
||||
stats = df.describe().T
# numeric_only=True: on pandas >= 2.0, median()/skew() raise TypeError when the
# frame also holds text columns; restricting to numeric columns also lines the
# results up with the numeric rows produced by describe() above.
stats['median'] = df.median(numeric_only=True)
stats['skew'] = df.skew(numeric_only=True)
corr = df.select_dtypes(include='number').corr().round(3)
top_10 = df.nlargest(10, 'Revenue')
bottom_10 = df.nsmallest(10, 'Revenue')
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Data Cleaning
|
||||
|
||||
```python
|
||||
df = df.drop_duplicates()
|
||||
df['amount'] = df['amount'].fillna(0)
|
||||
df['name'] = df['name'].fillna('Unknown')
|
||||
df['date'] = pd.to_datetime(df['date'], errors='coerce')
|
||||
df['amount'] = pd.to_numeric(df['amount'], errors='coerce')
|
||||
|
||||
# Remove outliers (IQR)
|
||||
Q1, Q3 = df['value'].quantile([0.25, 0.75])
|
||||
IQR = Q3 - Q1
|
||||
df = df[(df['value'] >= Q1 - 1.5*IQR) & (df['value'] <= Q3 + 1.5*IQR)]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Bridge Pattern: pandas → openpyxl
|
||||
|
||||
```python
|
||||
from openpyxl import Workbook
|
||||
from openpyxl.utils.dataframe import dataframe_to_rows
|
||||
|
||||
wb = Workbook()
|
||||
ws = wb.active
|
||||
ws.title = "Analysis"
|
||||
|
||||
for r_idx, row in enumerate(dataframe_to_rows(summary, index=True, header=True), 1):
|
||||
for c_idx, value in enumerate(row, 1):
|
||||
ws.cell(row=r_idx + 3, column=c_idx + 1, value=value)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## KPI Summary Card
|
||||
|
||||
```python
|
||||
kpis = [
|
||||
('Total Revenue', total_revenue, '$#,##0'),
|
||||
('Avg Order Value', avg_order, '$#,##0.00'),
|
||||
('Growth Rate', growth_rate, '0.0%'),
|
||||
('Total Orders', total_orders, '#,##0'),
|
||||
]
|
||||
col = 2
|
||||
for label, value, fmt in kpis:
|
||||
ws.cell(row=3, column=col, value=label)
|
||||
ws.cell(row=4, column=col, value=value)
|
||||
ws.cell(row=4, column=col).number_format = fmt
|
||||
col += 3
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Cross-Validation Review Sheet
|
||||
|
||||
```python
|
||||
review_ws = wb.create_sheet("Review")
|
||||
review_ws.sheet_properties.tabColor = "FFC000"
|
||||
|
||||
checks = [
|
||||
["Check", "Expected", "Actual", "Status"],
|
||||
["Total Revenue", "=SUM(Data!B2:B100)", "=Summary!B10", '=IF(B2=C2,"✓ PASS","✗ FAIL")'],
|
||||
["Row Count", "=COUNTA(Data!A:A)-1", "=Summary!B3", '=IF(B3=C3,"✓ PASS","✗ FAIL")'],
|
||||
]
|
||||
for i, row in enumerate(checks, 1):
|
||||
for j, val in enumerate(row, 1):
|
||||
review_ws.cell(row=i, column=j, value=val)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## xlsx.py Pivot Workflow
|
||||
|
||||
```bash
|
||||
python3 "$XLSX_SKILL_DIR/xlsx.py" inspect data.xlsx --pretty
|
||||
python3 "$XLSX_SKILL_DIR/xlsx.py" pivot data.xlsx output.xlsx \
|
||||
--source "Data!A1:F500" \
|
||||
--rows "Product,Region" \
|
||||
--values "Revenue:sum,Units:count" \
|
||||
--location "Summary!A3" \
|
||||
--style "finance" \
|
||||
--chart "bar"
|
||||
python3 "$XLSX_SKILL_DIR/xlsx.py" validate output.xlsx
|
||||
```
|
||||
|
||||
### PivotTable Best Practices
|
||||
- Source data: first row must have unique, non-blank headers
|
||||
- No merged cells or blank rows in source range
|
||||
- Place pivot on a dedicated sheet, position at A3 or B2
|
||||
- Row axis: primary grouping; Column axis: ≤10 distinct values
|
||||
- Values: numeric measures only
|
||||
|
||||
### PivotTable Troubleshooting
|
||||
| Symptom | Remedy |
|
||||
|---------|--------|
|
||||
| "Field not found" | Check header spelling via `inspect` |
|
||||
| PivotTable empty | Ensure `--source` covers all data rows |
|
||||
| `validate` reports pivot errors | Critical — must fix |
|
||||
| `validate` reports `pass_with_warnings` | Safe to deliver |
|
||||
|
||||
---
|
||||
|
||||
## Alternating Column Structure (Key-Value Pairs)
|
||||
|
||||
When odd columns contain identifiers and even columns contain corresponding values (e.g., O=PartNo, P=Qty, Q=PartNo, R=Qty, ...):
|
||||
|
||||
**Detection heuristic**:
|
||||
- Odd columns have repeated values or category codes
|
||||
- Even columns are numeric
|
||||
- Headers alternate between descriptive and quantitative names
|
||||
|
||||
**Solution**: Use SUMIF across the combined key/value ranges:
|
||||
|
||||
```python
|
||||
# Excel formula: =SUMIF(O2:W2, A2, P2:X2)
|
||||
# SUMIF matches position-by-position across multi-column ranges
|
||||
formula = f'=SUMIF(O{row}:W{row},A{row},P{row}:X{row})'
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## FIFO Allocation Formula (Cumulative Deduction)
|
||||
|
||||
Scenario: Allocate limited inventory to order lines in sequence — each row gets what's left after previous rows consumed their share.
|
||||
|
||||
**Formula template** (row N):
|
||||
```
|
||||
=MAX(0, MIN(OrderQty_N,
|
||||
TotalInventory_for_key - SUM_of_already_allocated_above))
|
||||
```
|
||||
|
||||
**Example** (H column = allocated qty):
|
||||
```python
|
||||
# Row 2 (first row): allocate up to available inventory
|
||||
f'=MIN(G2, SUMIFS(Sheet2!D:D, Sheet2!A:A, A2, Sheet2!B:B, D2))'
|
||||
|
||||
# Row 3+ (subsequent): subtract already-allocated from rows above
|
||||
f'=MAX(0, MIN(G{r}, SUMIFS(Sheet2!D:D, Sheet2!A:A, A{r}, Sheet2!B:B, D{r})'
|
||||
f' - SUMIFS(H$1:H{r-1}, A$1:A{r-1}, A{r}, D$1:D{r-1}, D{r})))'
|
||||
```
|
||||
|
||||
**Key**: `SUMIFS(H$1:H{r-1}, ...)` creates a running total of already-allocated amounts, achieving row-by-row deduction.
|
||||
|
||||
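⚠️ Each formula references the rows above it in its own column (H). This is not a circular reference, but openpyxl does not evaluate formulas and so cannot verify the result — open the file in Excel to confirm the calculation.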
|
||||
|
||||
### Data Provenance Implementation
|
||||
|
||||
```python
|
||||
src_ws = wb.create_sheet("Sources")
|
||||
src_ws.sheet_properties.tabColor = PRIMARY
|
||||
headers = ["Data Description", "Source Name", "Source URL", "Access Date"]
|
||||
for col, h in enumerate(headers, 1):
|
||||
cell = src_ws.cell(row=1, column=col, value=h)
|
||||
cell.font = Font(name=FONT_NAME, bold=HEADER_BOLD, color="FFFFFF")
|
||||
cell.fill = PatternFill(start_color=PRIMARY, end_color=PRIMARY, fill_type="solid")
|
||||
```
|
||||
95
skills/xlsx/scenes/analyze.md
Executable file
95
skills/xlsx/scenes/analyze.md
Executable file
@@ -0,0 +1,95 @@
|
||||
# Scene: Data Analysis → Excel Output
|
||||
|
||||
## When This Applies
|
||||
User wants to analyze data (statistics, trends, comparisons, pivots, aggregation) and receive results as an Excel file — possibly with charts, summary tables, or dashboards.
|
||||
|
||||
This scene bridges **pandas analysis** with **openpyxl output**. The deliverable is always an .xlsx file.
|
||||
|
||||
## Workflow
|
||||
|
||||
```
|
||||
1. LOAD → Read input data (CSV/XLSX/JSON/DB)
|
||||
2. EXPLORE → Understand structure, quality, distributions
|
||||
3. ANALYZE → Compute metrics, aggregations, statistical tests
|
||||
4. DESIGN → Plan Excel output (sheets, charts, KPIs)
|
||||
5. BUILD → Write analysis results to .xlsx with formatting
|
||||
6. CHART → Add charts (Excel-native or embedded matplotlib)
|
||||
7. QA → recalc → audit → scan → chart-verify
|
||||
8. PIVOT → If needed, run xlsx.py pivot as final step
|
||||
9. VALIDATE → validate → deliver
|
||||
```
|
||||
|
||||
## Analysis Framework
|
||||
|
||||
### Phase A: Problem Framing
|
||||
- What question is the user trying to answer?
|
||||
- Who will consume this output? (executive summary vs. detailed analysis)
|
||||
- What decisions will be made based on this data?
|
||||
|
||||
### Phase B: Data Quality Assessment
|
||||
- Missing values: count, pattern (random vs. systematic)
|
||||
- Outliers: statistical detection (IQR, z-score)
|
||||
- Data types: numeric vs. categorical, date parsing
|
||||
- Duplicates: exact and fuzzy
|
||||
|
||||
### Phase C: Exploratory Analysis
|
||||
- Distributions: histograms, box plots for key variables
|
||||
- Correlations: pairwise for numeric columns
|
||||
- Segmentation: group-by analysis on categorical dimensions
|
||||
- Time patterns: trends, seasonality if time-series data
|
||||
|
||||
### Phase D: Insight Extraction
|
||||
- Rank findings by business impact, not statistical significance
|
||||
- Each insight must be actionable — "so what?" test
|
||||
- Cross-validate: check the same insight from a different angle
|
||||
|
||||
### Phase E: Cross-Validation
|
||||
- Sanity check totals against known benchmarks
|
||||
- Verify computed metrics with alternative formulas
|
||||
- Document any assumptions or limitations in the output
|
||||
|
||||
**Industry-specific frameworks:**
|
||||
- **Finance**: Variance analysis → trend decomposition → ratio analysis → peer comparison
|
||||
- **Marketing**: Funnel analysis → cohort analysis → attribution → ROI calculation
|
||||
- **Operations**: Throughput analysis → bottleneck identification → utilization rates → SLA compliance
|
||||
|
||||
---
|
||||
|
||||
## Multi-Sheet Report Layout
|
||||
|
||||
```
|
||||
Sheet 1: "Dashboard" — KPI cards + summary chart
|
||||
Sheet 2: "Detail" — Full analysis table with formatting
|
||||
Sheet 3: "Charts" — Additional visualizations
|
||||
Sheet 4: "Raw Data" — Original data for reference (tab color: gray)
|
||||
```
|
||||
|
||||
### KPI Summary Card Pattern
|
||||
|
||||
Place 4-6 KPI metrics at the top of Dashboard sheet (row 3-4), each spaced 3 columns apart. Include label (small, gray) and value (large, bold, themed) with appropriate number format.
|
||||
|
||||
---
|
||||
|
||||
## PivotTable Decision
|
||||
|
||||
| Situation | Use |
|
||||
|-----------|-----|
|
||||
| Need interactive PivotTable in Excel | `"$XLSX_SKILL_DIR/xlsx.py" pivot` |
|
||||
| Just need a summary table (static) | pandas `pivot_table` → openpyxl |
|
||||
| Simple aggregation (1 dimension) | pandas `groupby` → openpyxl |
|
||||
|
||||
**Trigger phrases**: summarize, aggregate, group by, categorize, breakdown, distribution, tally, totals per, cross-tab, 汇总, 透视, 分类统计, 交叉分析
|
||||
|
||||
---
|
||||
|
||||
## Data Provenance
|
||||
|
||||
When analysis uses external data, create a **"Sources" sheet** (tab color: `PRIMARY`) with columns: Data Description | Source Name | Source URL | Access Date.
|
||||
|
||||
Skip when user provides all data directly.
|
||||
|
||||
---
|
||||
|
||||
## Code Recipes
|
||||
|
||||
For specific code patterns (aggregation, time series, comparison, cleaning, bridge pattern), load `scenes/analyze-recipes.md` on demand.
|
||||
133
skills/xlsx/scenes/convert.md
Executable file
133
skills/xlsx/scenes/convert.md
Executable file
@@ -0,0 +1,133 @@
|
||||
# Scene: Format Conversion
|
||||
|
||||
## When This Applies
|
||||
User wants to convert between tabular file formats: CSV↔XLSX, JSON→XLSX, TSV→XLSX, PDF table→XLSX, or XLSX→CSV/JSON.
|
||||
|
||||
## Conversion Matrix
|
||||
|
||||
| Conversion | Method | Notes |
|
||||
|------|-----|--------|
|
||||
| CSV/TSV → XLSX | pandas read → openpyxl write with formatting | Most common |
|
||||
| JSON → XLSX | pandas json_normalize → openpyxl | Flatten nested structures |
|
||||
| XLSX → CSV | pandas read_excel → to_csv | Simple export |
|
||||
| XLSX → JSON | pandas read_excel → to_json | With orient parameter |
|
||||
| PDF table → XLSX | pdfplumber/tabula extract → openpyxl | Needs table detection |
|
||||
| Image table → XLSX | OCR → pandas → openpyxl | Last resort, error-prone |
|
||||
|
||||
## CSV/TSV → XLSX
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
from openpyxl import Workbook
|
||||
from openpyxl.utils.dataframe import dataframe_to_rows
|
||||
|
||||
# Read with an explicit encoding (pandas does not detect it automatically — try the candidates below if decoding fails)
|
||||
df = pd.read_csv('input.csv', encoding='utf-8')
|
||||
# Common encodings: utf-8, gbk, gb2312, latin-1, shift_jis
|
||||
|
||||
# Handle messy CSVs
|
||||
df = pd.read_csv('input.csv',
|
||||
encoding='utf-8',
|
||||
sep=',', # or '\t', ';', '|'
|
||||
skiprows=2, # skip junk header rows
|
||||
na_values=['N/A', '-', ''],
|
||||
dtype=str, # read everything as string first, convert later
|
||||
on_bad_lines='skip' # skip malformed rows
|
||||
)
|
||||
|
||||
# Convert types after reading
|
||||
df['amount'] = pd.to_numeric(df['amount'], errors='coerce')
|
||||
df['date'] = pd.to_datetime(df['date'], errors='coerce')
|
||||
|
||||
# Write to Excel with formatting
|
||||
wb = Workbook()
|
||||
ws = wb.active
|
||||
|
||||
# Write data starting at B4 (with theme formatting)
|
||||
for r_idx, row in enumerate(dataframe_to_rows(df, index=False, header=True), 4):
|
||||
for c_idx, value in enumerate(row, 2):
|
||||
ws.cell(row=r_idx, column=c_idx, value=value)
|
||||
|
||||
# Apply design tokens from engines/design.md
|
||||
# ...
|
||||
|
||||
wb.save('output.xlsx')
|
||||
```
|
||||
|
||||
## JSON → XLSX
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
import json
|
||||
|
||||
# Flat JSON
|
||||
df = pd.read_json('input.json')
|
||||
|
||||
# Nested JSON — flatten
|
||||
with open('input.json') as f:
|
||||
data = json.load(f)
|
||||
|
||||
# If it's a list of objects
|
||||
df = pd.json_normalize(data, max_level=2)
|
||||
|
||||
# If nested with specific record path
|
||||
df = pd.json_normalize(data, record_path='items', meta=['id', 'name'])
|
||||
|
||||
# Write to Excel...
|
||||
```
|
||||
|
||||
## XLSX → CSV/JSON
|
||||
|
||||
```python
|
||||
# To CSV
|
||||
df = pd.read_excel('input.xlsx', sheet_name='Data')
|
||||
df.to_csv('output.csv', index=False, encoding='utf-8-sig') # utf-8-sig for Excel compatibility
|
||||
|
||||
# To JSON
|
||||
df.to_json('output.json', orient='records', force_ascii=False, indent=2)
|
||||
|
||||
# Multiple sheets → multiple CSVs
|
||||
sheets = pd.read_excel('input.xlsx', sheet_name=None)
|
||||
for name, df in sheets.items():
|
||||
df.to_csv(f'output_{name}.csv', index=False, encoding='utf-8-sig')
|
||||
```
|
||||
|
||||
## PDF Table → XLSX
|
||||
|
||||
```python
|
||||
# Method 1: pdfplumber (preferred for most PDFs)
|
||||
import pdfplumber
|
||||
|
||||
tables = []
|
||||
with pdfplumber.open('input.pdf') as pdf:
|
||||
for page in pdf.pages:
|
||||
page_tables = page.extract_tables()
|
||||
for table in page_tables:
|
||||
tables.extend(table)
|
||||
|
||||
# Clean and convert to DataFrame
|
||||
df = pd.DataFrame(tables[1:], columns=tables[0])
|
||||
|
||||
# Method 2: tabula-py (Java-based, good for complex tables)
|
||||
# import tabula
|
||||
# dfs = tabula.read_pdf('input.pdf', pages='all', multiple_tables=True)
|
||||
```
|
||||
|
||||
## Encoding Gotchas
|
||||
|
||||
| Scenario | Encoding | Tip |
|
||||
|----------|----------|-----|
|
||||
| Chinese data from Windows | `gbk` or `gb2312` | Try gbk first |
|
||||
| Japanese data | `shift_jis` or `cp932` | |
|
||||
| European data | `latin-1` or `cp1252` | |
|
||||
| Excel-generated CSV | `utf-8-sig` (has BOM) | pandas handles automatically |
|
||||
| Output CSV for Excel | Write with `utf-8-sig` | Prevents garbled Chinese in Excel |
|
||||
|
||||
## Quality Checks After Conversion
|
||||
|
||||
- [ ] Row count matches source
|
||||
- [ ] No garbled characters (encoding correct)
|
||||
- [ ] Numeric columns are numbers, not strings
|
||||
- [ ] Dates are date objects, not text
|
||||
- [ ] No blank rows/columns from source artifacts
|
||||
- [ ] Headers are in the correct row
|
||||
105
skills/xlsx/scenes/create.md
Executable file
105
skills/xlsx/scenes/create.md
Executable file
@@ -0,0 +1,105 @@
|
||||
# Scene: Create New Spreadsheet
|
||||
|
||||
## When This Applies
|
||||
User wants to create a new Excel file from scratch — a table, template, schedule, report, or any structured data output.
|
||||
|
||||
For financial models, also load `scenes/finance.md`.
|
||||
|
||||
## Workflow
|
||||
|
||||
```
|
||||
1. PLAN → Identify all sheets, their structure, formulas, cross-references
|
||||
2. STYLE → Load engines/design.md, apply default palette
|
||||
3. BUILD → Create workbook, write data/formulas/formatting per sheet
|
||||
4. QA → recalc → audit → scan → chart-verify (if charts)
|
||||
5. PIVOT → If needed, run pivot command LAST
|
||||
6. VALIDATE → validate → exit 0 = deliver
|
||||
```
|
||||
|
||||
## Layout & Styling
|
||||
|
||||
All layout rules (Canvas Origin B2, column widths, row heights, margins) and styling (title/header/data/totals) are defined in **`engines/design.md`** — the single source of truth. Do not duplicate here.
|
||||
|
||||
Quick reference for sheet structure:
|
||||
```
|
||||
Row 1: [top margin]
|
||||
Row 2: Title (B2)
|
||||
Row 3: [spacer]
|
||||
Row 4: Column headers
|
||||
Row 5+: Data rows
|
||||
Last+1: Totals row
|
||||
Last+3: Notes/sources
|
||||
```
|
||||
|
||||
## Multi-Sheet Workbooks
|
||||
|
||||
### Cross-Sheet References
|
||||
```python
|
||||
# Reference another sheet
|
||||
sheet['C5'] = "=Data!B10"
|
||||
|
||||
# Sheet names with spaces need quotes
|
||||
sheet['C5'] = "='Sales Data'!B10"
|
||||
|
||||
# Green font for cross-sheet links (Finance theme)
|
||||
sheet['C5'].font = Font(color="008000")
|
||||
```
|
||||
|
||||
### Common Multi-Sheet Patterns
|
||||
- **Data + Summary**: Raw data on Sheet1, formulas/charts on Summary
|
||||
- **Monthly tabs**: Jan, Feb, Mar... + Annual Summary
|
||||
- **Input + Output**: Assumptions sheet + Calculations sheet + Dashboard
|
||||
|
||||
## Template Patterns
|
||||
|
||||
### Simple Data Table
|
||||
```python
|
||||
wb = Workbook()
|
||||
ws = wb.active
|
||||
ws.title = "Data"
|
||||
|
||||
# Title + Headers + Data + Totals styling → see engines/design.md §11 Code Templates
|
||||
# Only show formula logic here:
|
||||
|
||||
# Headers at B4
|
||||
headers = ['Product', 'Q1', 'Q2', 'Q3', 'Q4', 'Total']
|
||||
for col, h in enumerate(headers, 2):
|
||||
cell = ws.cell(row=4, column=col, value=h)
|
||||
|
||||
# Data rows starting at row 5
|
||||
# ...
|
||||
|
||||
# Totals row
|
||||
total_row = last_data_row + 1
|
||||
ws.cell(row=total_row, column=2, value='Total')
|
||||
for col in range(3, 7): # Q1-Q4
|
||||
letter = get_column_letter(col)
|
||||
ws.cell(row=total_row, column=col).value = f'=SUM({letter}5:{letter}{last_data_row})'
|
||||
|
||||
# Grand total
|
||||
ws.cell(row=total_row, column=7).value = f'=SUM(C{total_row}:F{total_row})'
|
||||
```
|
||||
|
||||
### Schedule / Calendar
|
||||
- Use merged cells for day headers
|
||||
- Conditional formatting for weekends (light gray fill)
|
||||
- Freeze panes: `ws.freeze_panes = 'C5'` (freeze header + left labels)
|
||||
|
||||
### Checklist / Tracker
|
||||
- Checkbox column using data validation (`TRUE`/`FALSE`)
|
||||
- Status column with conditional formatting (green/amber/red)
|
||||
- Progress bar using data bar conditional formatting
|
||||
|
||||
## Freeze Panes & Print
|
||||
|
||||
```python
|
||||
# Freeze headers (row 4) and label column (col B)
|
||||
ws.freeze_panes = 'C5' # Rows 1-4 and cols A-B stay visible
|
||||
|
||||
# Print setup
|
||||
ws.page_setup.orientation = 'landscape'
|
||||
ws.page_setup.fitToWidth = 1
|
||||
ws.page_setup.fitToHeight = 0
|
||||
ws.print_area = 'B2:H50'
|
||||
ws.print_title_rows = '4:4' # Repeat header on each page
|
||||
```
|
||||
222
skills/xlsx/scenes/edit-patterns.md
Executable file
222
skills/xlsx/scenes/edit-patterns.md
Executable file
@@ -0,0 +1,222 @@
|
||||
# Edit Patterns — Reusable Code for Complex Edit Operations
|
||||
|
||||
> Load this file ON DEMAND when you encounter grouping, sorting, block detection, or other complex edit patterns.
|
||||
> Do NOT load upfront for simple edits.
|
||||
|
||||
---
|
||||
|
||||
## Pattern: Block Detection
|
||||
|
||||
Data is often split into independent blocks separated by blank rows or keyword rows (e.g., TOTAL, Subtotal).
|
||||
|
||||
```python
|
||||
def detect_blocks(ws, col=1, start_row=1, end_row=None,
|
||||
separator='blank', keyword='TOTAL'):
|
||||
"""
|
||||
Detect data block boundaries.
|
||||
separator: 'blank' (empty row) or 'keyword' (row containing keyword)
|
||||
Returns: list of (start_row, end_row) tuples
|
||||
"""
|
||||
if end_row is None:
|
||||
end_row = ws.max_row
|
||||
blocks, block_start = [], None
|
||||
for row in range(start_row, end_row + 1):
|
||||
val = ws.cell(row=row, column=col).value
|
||||
is_blank = val is None or (isinstance(val, str) and val.strip() == '')
|
||||
is_kw = (separator == 'keyword' and
|
||||
isinstance(val, str) and keyword in str(val).upper())
|
||||
if separator == 'blank':
|
||||
if not is_blank and block_start is None:
|
||||
block_start = row
|
||||
elif is_blank and block_start is not None:
|
||||
blocks.append((block_start, row - 1))
|
||||
block_start = None
|
||||
elif separator == 'keyword':
|
||||
if is_kw:
|
||||
if block_start:
|
||||
blocks.append((block_start, row))
|
||||
block_start = None
|
||||
elif not is_blank and block_start is None:
|
||||
block_start = row
|
||||
if block_start:
|
||||
blocks.append((block_start, end_row))
|
||||
return blocks
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Pattern: Pre-filter Null Rows
|
||||
|
||||
Before any groupby/aggregation, filter out rows where key columns are empty.
|
||||
|
||||
```python
|
||||
def pre_filter_rows(ws, key_cols, start_row, end_row):
    """Collect the row numbers in [start_row, end_row] whose key cells are all populated.

    A row is kept only when normalize_cell_value() returns something other
    than None for every column listed in key_cols.
    """
    kept = []
    for row_no in range(start_row, end_row + 1):
        for key_col in key_cols:
            cell_value = ws.cell(row=row_no, column=key_col).value
            if normalize_cell_value(cell_value) is None:
                # One empty key column disqualifies the whole row.
                break
        else:
            kept.append(row_no)
    return kept
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Pattern: Sort with Formula Rewrite
|
||||
|
||||
When sorting rows by swapping data (not using `insert_rows`), formulas must be regenerated with new row numbers.
|
||||
|
||||
```python
|
||||
def sort_block_with_formulas(ws, block_rows, sort_col, formula_templates,
                             descending=True):
    """
    Reorder the rows of a block in place and rebuild their formulas.

    block_rows: row numbers that make up the block (indexable sequence)
    sort_col: column index whose value is the sort key (empty/falsy sorts as 0)
    formula_templates: dict {col_index: '=B{row}+C{row}'}; these columns are
        regenerated for the destination row instead of being copied
    descending: largest key first when True
    """
    # Step 1: snapshot every row of the block as {column: value}.
    snapshots = []
    for source_row in block_rows:
        snapshot = {}
        for col_idx in range(1, ws.max_column + 1):
            snapshot[col_idx] = ws.cell(row=source_row, column=col_idx).value
        snapshots.append(snapshot)

    ordered = sorted(snapshots,
                     key=lambda snap: (snap.get(sort_col) or 0),
                     reverse=descending)

    # Step 2: write the snapshots back onto the block's row slots, in order.
    for position, snapshot in enumerate(ordered):
        target = block_rows[position]
        for col_idx, stored in snapshot.items():
            destination = ws.cell(row=target, column=col_idx)
            if col_idx in formula_templates:
                # Formula columns must reference the row they now live on.
                destination.value = formula_templates[col_idx].format(row=target)
            else:
                destination.value = stored
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Pattern: Group-Merge (Aggregate by Key)
|
||||
|
||||
Group rows by a key column. Take first-row values for some columns, sum for others.
|
||||
|
||||
```python
|
||||
from collections import OrderedDict
|
||||
|
||||
def group_merge_rows(ws, key_col, start_row, end_row, first_cols, sum_cols):
    """
    Aggregate rows that share the same key_col value.

    first_cols: columns whose value is taken from the first row of each group
    sum_cols: columns whose values are accumulated (as floats) across the group

    Returns an OrderedDict, keyed by group key in first-seen order, of
    {'first': {col: value}, 'sums': {col: float_total}}.
    Rows whose normalized key is None are skipped; cells that cannot be
    converted to float are ignored in the sums.
    """
    merged = OrderedDict()
    for row_num in range(start_row, end_row + 1):
        group_key = normalize_cell_value(ws.cell(row=row_num, column=key_col).value)
        if group_key is None:
            continue

        bucket = merged.get(group_key)
        if bucket is None:
            # First row of a new group: capture its "first" values, zero the totals.
            bucket = {
                'first': {c: ws.cell(row=row_num, column=c).value for c in first_cols},
                'sums': {c: 0.0 for c in sum_cols},
            }
            merged[group_key] = bucket

        totals = bucket['sums']
        for col_idx in sum_cols:
            raw = normalize_cell_value(ws.cell(row=row_num, column=col_idx).value)
            if raw is None:
                continue
            try:
                totals[col_idx] += float(raw)
            except (ValueError, TypeError):
                # Best effort: non-numeric cells do not contribute to the sum.
                pass
    return merged
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Pattern: Group-Max-Keep-Ties
|
||||
|
||||
Group by key, find max value per group, keep ALL rows that match the max (not just the first).
|
||||
|
||||
```python
|
||||
from collections import defaultdict
|
||||
|
||||
def group_max_keep_ties(rows, key_func, value_func, filter_null=True):
    """
    Keep all rows with the maximum value per group (ties preserved).

    rows: iterable of row dicts or tuples
    key_func: row → group key
    value_func: row → comparable value (e.g., date); called once per row
    filter_null: when True, rows whose value is None are dropped up front.
        When False they stay in their group but never win against a
        non-None value; a group made up only of None values is kept whole.

    Returns a list of the kept rows, groups in first-seen order and rows in
    their original order within each group.
    """
    groups = defaultdict(list)
    for row in rows:
        val = value_func(row)
        if filter_null and val is None:
            continue
        # Store the value with the row so value_func is not re-evaluated below.
        groups[key_func(row)].append((val, row))

    kept = []
    for members in groups.values():
        # None cannot be ordered against real values, so leave it out of max().
        candidates = [val for val, _ in members if val is not None]
        max_val = max(candidates) if candidates else None
        kept.extend(row for val, row in members if val == max_val)
    return kept
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Pattern: Sequence Fill (Smart Numbering)
|
||||
|
||||
Fill blank rows with "parent number + letter suffix" (e.g., 5 → 5a, 5b, ..., 5z, 5aa).
|
||||
|
||||
```python
|
||||
import re
|
||||
|
||||
def get_letter_suffix(n):
    """Return the spreadsheet-style lowercase suffix for a zero-based index.

    0=a, 25=z, 26=aa, 27=ab, ..., 701=zz, 702=aaa, ...

    Raises:
        ValueError: if n is negative.
    """
    if n < 0:
        raise ValueError(f"suffix index must be non-negative, got {n}")
    # Bijective base-26: there is no zero digit, so shift by one before each
    # division. This keeps every character inside 'a'..'z' at any length.
    letters = []
    remaining = n + 1
    while remaining > 0:
        remaining, digit = divmod(remaining - 1, 26)
        letters.append(chr(ord('a') + digit))
    return ''.join(reversed(letters))
|
||||
|
||||
def fill_sequential_labels(ws, col, start_row, end_row):
    """Fill empty cells in a column with "<parent number><letter suffix>" labels.

    The parent number is the leading run of digits of the most recent
    non-empty cell that starts with digits; the suffix counter restarts
    whenever a new parent is seen. Empty cells before the first parent are
    left untouched, as are non-empty cells that do not start with a digit.
    """
    base_number = None
    suffix_index = 0
    for row_num in range(start_row, end_row + 1):
        cell = ws.cell(row=row_num, column=col)
        current = cell.value

        if current is None:
            if base_number is None:
                # Nothing to derive a label from yet.
                continue
            cell.value = f"{base_number}{get_letter_suffix(suffix_index)}"
            suffix_index += 1
            continue

        leading_digits = re.match(r'^(\d+)', str(current))
        if leading_digits:
            base_number = leading_digits.group(1)
            suffix_index = 0
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Pattern: Zero-as-Blank Output
|
||||
|
||||
When merged/aggregated values of 0 should display as empty:
|
||||
|
||||
```python
|
||||
# Method 1: Write None (best for programmatic verification)
|
||||
cell.value = computed_value if computed_value != 0 else None
|
||||
|
||||
# Method 2: Number format (best for Excel viewing)
|
||||
cell.value = computed_value
|
||||
cell.number_format = '0.00;-0.00;""' # positive;negative;zero(blank)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Pattern: Side-by-Side Table Detection
|
||||
|
||||
Some sheets contain multiple independent tables arranged horizontally (separated by empty columns).
|
||||
|
||||
```python
|
||||
def detect_side_by_side_tables(ws):
    """Locate horizontally adjacent tables, split on columns that are entirely empty.

    Returns a list of (start_col, end_col) tuples, left to right.
    """
    def column_has_data(col_idx):
        # A column counts as populated as soon as one cell holds a non-None value.
        for row_idx in range(1, ws.max_row + 1):
            if ws.cell(row=row_idx, column=col_idx).value is not None:
                return True
        return False

    spans = []
    span_start = None
    for col_idx in range(1, ws.max_column + 1):
        if column_has_data(col_idx):
            if span_start is None:
                span_start = col_idx
        elif span_start is not None:
            # First empty column after a populated run closes the table.
            spans.append((span_start, col_idx - 1))
            span_start = None

    # A table touching the sheet's last column has no trailing empty column.
    if span_start:
        spans.append((span_start, ws.max_column))
    return spans  # [(start_col, end_col), ...]
|
||||
```
|
||||
195
skills/xlsx/scenes/edit.md
Executable file
195
skills/xlsx/scenes/edit.md
Executable file
@@ -0,0 +1,195 @@
|
||||
# Scene: Edit Existing Spreadsheet
|
||||
|
||||
## When This Applies
|
||||
User provides an existing .xlsx/.xlsm file and wants to modify it — fill data, fix formulas, beautify layout, add sheets, restructure.
|
||||
|
||||
## Core Principle: Preserve First
|
||||
|
||||
**Study the existing file before making ANY changes.** The original format, style, and conventions take absolute priority over default guidelines.
|
||||
|
||||
### VBA Preservation Rule
|
||||
When opening `.xlsm` files, **always** use `keep_vba=True`:
|
||||
```python
|
||||
wb = load_workbook('file.xlsm', keep_vba=True)
|
||||
# Edit data/formatting as usual
|
||||
wb.save('output.xlsm') # VBA modules preserved
|
||||
```
|
||||
**Never** save a `.xlsm` as `.xlsx` unless the user explicitly requests macro removal. This silently destroys all VBA code.
|
||||
|
||||
## Workflow
|
||||
|
||||
```
|
||||
1. INSPECT → Read the file, understand structure
|
||||
2. PLAN → Identify what to change vs what to preserve
|
||||
3. BACKUP → If destructive changes, suggest user keeps original
|
||||
4. MODIFY → Make targeted changes
|
||||
5. QA → recalc → audit → scan
|
||||
6. VALIDATE → validate → deliver
|
||||
```
|
||||
|
||||
## Step 1: Inspect the File
|
||||
|
||||
### 1a. Structure Survey
|
||||
|
||||
```python
|
||||
from openpyxl import load_workbook
|
||||
|
||||
# Read with formulas preserved
|
||||
wb = load_workbook('input.xlsx')
|
||||
|
||||
# Survey structure
|
||||
for name in wb.sheetnames:
|
||||
ws = wb[name]
|
||||
print(f"Sheet: {name}, Dimensions: {ws.dimensions}, "
|
||||
f"Rows: {ws.max_row}, Cols: {ws.max_column}")
|
||||
|
||||
# Check for existing styles
|
||||
sample = ws['B4']
|
||||
print(f"Font: {sample.font.name}, Size: {sample.font.size}, "
|
||||
f"Bold: {sample.font.bold}, Fill: {sample.fill.fgColor}")
|
||||
```
|
||||
|
||||
Also run `python3 "$XLSX_SKILL_DIR/xlsx.py" inspect input.xlsx --pretty` for structured overview.
|
||||
|
||||
### 1b. Semantic Data Sampling (MANDATORY for merge/copy/aggregate operations)
|
||||
|
||||
**Don't just print headers — print actual data rows to understand column semantics:**
|
||||
|
||||
```python
|
||||
from openpyxl.utils import get_column_letter  # needed for the column labels below

# Sample the first 5 rows of each sheet (header included) to see real values
for name in wb.sheetnames:
    ws = wb[name]
    print(f"\n=== {name} ===")
    for row in range(1, min(6, ws.max_row + 1)):
        vals = []
        for col in range(1, ws.max_column + 1):
            v = ws.cell(row=row, column=col).value
            if v is not None:
                # Label each value with its column letter, e.g. "C=1200"
                vals.append(f"{get_column_letter(col)}={v}")
        if vals:
            print(f"  Row {row}: {vals}")
|
||||
```
|
||||
|
||||
### 1c. Cross-Sheet Column Semantic Mapping (MANDATORY before any merge/copy)
|
||||
|
||||
**⚠️ NEVER copy columns by position index alone when merging sheets.**
|
||||
|
||||
When two sheets have similar headers (e.g., both have columns A-V), the same column position may hold completely different data. Always:
|
||||
|
||||
1. Print sample data (not just headers) from both source and target sheets
|
||||
2. For each column, identify the data type and value domain
|
||||
3. Create an explicit column mapping dict before writing any data
|
||||
|
||||
```python
|
||||
# Example: source sheet E column = amount, target sheet E column = type code
|
||||
# → Do NOT copy source.E → target.E. Build semantic mapping first.
|
||||
column_mapping = {
|
||||
'src_I': 'dst_E', # amount → amount (different positions!)
|
||||
'src_E': 'dst_I', # type → type
|
||||
}
|
||||
```
|
||||
|
||||
### 1d. Cell Value Normalization
|
||||
|
||||
Canonical implementation lives in **`templates/base.py → normalize_cell_value()`**.
|
||||
Referenced by `edit-patterns.md` and `quality/pipeline.md`.
|
||||
|
||||
```python
|
||||
from base import normalize_cell_value
|
||||
# normalize_cell_value(value) → None for blank/NBSP/ZWSP, otherwise original value
|
||||
```
|
||||
|
||||
**Always use this when checking for empty cells** — `\xa0` (NBSP) looks blank but fails `is None`.
|
||||
|
||||
## Step 2: Match Existing Styles
|
||||
|
||||
When adding new cells/rows to a styled file, use **`copy_style()` from `templates/base.py`**:
|
||||
|
||||
```python
|
||||
from base import copy_style
|
||||
|
||||
# copy_style(source_cell, target_cell)
|
||||
# → copies font, fill, border, alignment, number_format
|
||||
```
|
||||
|
||||
## Common Edit Operations
|
||||
|
||||
### Fill / Complete Data
|
||||
```python
|
||||
# Add data to empty cells while preserving existing formatting
|
||||
for row in range(start, end + 1):
|
||||
cell = ws.cell(row=row, column=col)
|
||||
if cell.value is None:
|
||||
cell.value = new_value
|
||||
# Copy style from the cell above
|
||||
copy_style(ws.cell(row=row-1, column=col), cell)
|
||||
```
|
||||
|
||||
### Insert Rows / Columns
|
||||
```python
|
||||
# Insert 3 rows at position 10
|
||||
ws.insert_rows(10, amount=3)
|
||||
# Note: openpyxl does NOT rewrite formulas, charts, or named ranges that reference the shifted rows — update them yourself
|
||||
|
||||
# Insert column at position D
|
||||
ws.insert_cols(4)
|
||||
```
|
||||
|
||||
**Warning**: Inserting/deleting rows can break chart references and named ranges. Verify after insertion.
|
||||
|
||||
### Restructure Data
|
||||
```python
|
||||
# Move data from one layout to another
|
||||
# Read all data first, then rewrite
|
||||
data = []
|
||||
for row in ws.iter_rows(min_row=2, values_only=True):
|
||||
data.append(row)
|
||||
|
||||
# Clear and rewrite in new structure
|
||||
# ...
|
||||
```
|
||||
|
||||
### Fix Formulas
|
||||
```python
|
||||
# Find cells with errors (after recalc)
|
||||
wb_data = load_workbook('input.xlsx', data_only=True)
|
||||
ws_data = wb_data.active
|
||||
|
||||
wb_formula = load_workbook('input.xlsx')
|
||||
ws_formula = wb_formula.active
|
||||
|
||||
for row in ws_data.iter_rows():
|
||||
for cell in row:
|
||||
if isinstance(cell.value, str) and cell.value.startswith('#'):
|
||||
formula_cell = ws_formula[cell.coordinate]
|
||||
print(f"Error at {cell.coordinate}: {cell.value}, Formula: {formula_cell.value}")
|
||||
```
|
||||
|
||||
## Format Beautification
|
||||
|
||||
When the user asks to "make it look better" or "format nicely":
|
||||
|
||||
→ **Load `engines/design.md`** and apply its complete styling system (tokens, fonts, layout, colors).
|
||||
|
||||
**But**: if the file already has a consistent style, enhance it rather than replacing it. Add what's missing (alignment, column widths, alternating fills) without changing existing colors or fonts. Use `copy_style()` (above) to match adjacent cells.
|
||||
|
||||
## ⚠️ Dangerous Operations
|
||||
|
||||
| Operation | Risk | Mitigation |
|
||||
|-----------|------|-----------|
|
||||
| `load_workbook(data_only=True)` then save | Formulas permanently lost | Never save after data_only read |
|
||||
| Delete rows/cols with formula dependencies | #REF! errors | Run audit after deletion |
|
||||
| Modify pivot table output with openpyxl | Corrupt pivotCache | Never — regenerate via xlsx.py pivot |
|
||||
| Overwrite merged cells | Layout breaks | Check `ws.merged_cells.ranges` first |
|
||||
| Manual row sort (swap row data) | Formulas still reference old row numbers | **Regenerate formula strings with target row number** (see Common Patterns → Sort with Formula Rewrite) |
|
||||
| Write SUM formula → verify with data_only | Get `None` — formula not evaluated | Compute value in Python for verification; write computed value or use recalc |
|
||||
|
||||
---
|
||||
|
||||
## Common Patterns
|
||||
|
||||
For complex edit operations (grouping, sorting, block detection, merging, sequence fill, etc.):
|
||||
|
||||
→ **Load `scenes/edit-patterns.md`** on demand.
|
||||
|
||||
Available patterns: Block Detection, Pre-filter Null, Sort with Formula Rewrite, Group-Merge, Group-Max-Keep-Ties, Sequence Fill, Zero-as-Blank, Side-by-Side Table Detection.
|
||||
318
skills/xlsx/scenes/finance.md
Executable file
318
skills/xlsx/scenes/finance.md
Executable file
@@ -0,0 +1,318 @@
|
||||
# Financial Model Specialist Guide
|
||||
|
||||
Load this reference when the task involves: financial statements, budgets, forecasts, DCF models, LBO, valuation, P&L, balance sheets, cash flow, or any investment banking deliverable.
|
||||
|
||||
Also load `engines/design.md` → use **Finance** scene overrides (IB text color rules, section dividers).
|
||||
|
||||
---
|
||||
|
||||
## Financial Model Architecture
|
||||
|
||||
### Standard Sheet Structure
|
||||
```
|
||||
Assumptions Sheet:
|
||||
- All inputs, growth rates, margins, multiples
|
||||
- Blue font for every changeable number
|
||||
- Yellow background for key assumptions
|
||||
- Source citations in adjacent cells or comments
|
||||
|
||||
Income Statement / P&L:
|
||||
- Revenue → COGS → Gross Profit → OpEx → EBIT → Interest → Tax → Net Income
|
||||
- All values are formulas referencing Assumptions
|
||||
|
||||
Balance Sheet:
|
||||
- Assets = Liabilities + Equity (must balance!)
|
||||
- Include balance check row: =Assets-Liabilities-Equity (should be 0)
|
||||
|
||||
Cash Flow Statement:
|
||||
- Operating → Investing → Financing → Net Change
|
||||
- Ending Cash = Beginning Cash + Net Change
|
||||
|
||||
Valuation / Output:
|
||||
- DCF, comparables, or whatever model the user needs
|
||||
- Green font for values pulled from other sheets
|
||||
```
|
||||
|
||||
### Formula Construction Rules
|
||||
|
||||
```python
|
||||
# ✅ CORRECT: Reference assumptions
|
||||
sheet['C10'] = '=C9*(1+Assumptions!$B$5)' # Growth rate from assumptions
|
||||
|
||||
# ❌ WRONG: Hardcoded magic number
|
||||
sheet['C10'] = '=C9*1.05'
|
||||
|
||||
# ✅ CORRECT: Protected division
|
||||
sheet['D15'] = '=IF(C15=0,"-",B15/C15)'
|
||||
|
||||
# ✅ CORRECT: Consistent formula across periods
|
||||
# If D10 = '=D9*(1+Assumptions!$B$5)' then E10 must follow the same pattern
|
||||
```
|
||||
|
||||
### Assumptions Sheet Layout
|
||||
```
|
||||
B4: "Key Assumptions" (section header, bold)
|
||||
B6: "Revenue Growth Rate" C6: 0.05 (blue font, yellow bg)
|
||||
B7: "Gross Margin" C7: 0.65 (blue font, yellow bg)
|
||||
B8: "OpEx as % Revenue" C8: 0.30 (blue font, yellow bg)
|
||||
B9: "Tax Rate" C9: 0.21 (blue font, yellow bg)
|
||||
B10: "Discount Rate (WACC)" C10: 0.10 (blue font, yellow bg)
|
||||
B11: "Terminal Growth Rate" C11: 0.02 (blue font, yellow bg)
|
||||
```
|
||||
|
||||
### Source Documentation for Hardcodes
|
||||
|
||||
Every hardcoded input MUST have a source citation:
|
||||
|
||||
```python
|
||||
# In cell comment
|
||||
ws['C6'].comment = Comment(
|
||||
"Source: Company 10-K, FY2024, Page 45, Revenue Growth",
|
||||
"Z.ai"
|
||||
)
|
||||
|
||||
# Or in adjacent cell (if end of table)
|
||||
ws['D6'] = "Source: Management guidance, Q3 2024 earnings call"
|
||||
ws['D6'].font = Font(size=8, italic=True, color="808080")
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Number Formatting (CRITICAL)
|
||||
|
||||
> Finance-specific formats below. For general number formats, see `engines/design.md §10`.
|
||||
> Finance formats take priority when both apply.
|
||||
|
||||
```python
|
||||
FINANCE_FORMATS = {
|
||||
# Currency — zeros as dash, negatives in parentheses
|
||||
'currency': '$#,##0;($#,##0);"-"',
|
||||
'currency_k': '$#,##0,"K";($#,##0,"K");"-"',
|
||||
'currency_mm': '$#,##0.0,,"M";($#,##0.0,,"M");"-"',
|
||||
|
||||
# Percentages — one decimal
|
||||
'pct': '0.0%;(0.0%);"-"',
|
||||
|
||||
# Multiples — for EV/EBITDA, P/E etc.
|
||||
'multiple': '0.0"x";(0.0"x");"-"',
|
||||
|
||||
# Years — MUST be text, not number (avoids "2,024")
|
||||
'year': '@',
|
||||
|
||||
# Integer with thousands separator
|
||||
'integer': '#,##0;(#,##0);"-"',
|
||||
|
||||
# Two decimal places
|
||||
'decimal': '#,##0.00;(#,##0.00);"-"',
|
||||
|
||||
# Shares (millions)
|
||||
'shares': '#,##0.0,,"M"',
|
||||
}
|
||||
|
||||
# Apply
|
||||
cell.number_format = FINANCE_FORMATS['currency_mm']
|
||||
```
|
||||
|
||||
**Always specify units in column headers**: "Revenue ($mm)", "Shares (M)", "Growth (%)"
|
||||
|
||||
---
|
||||
|
||||
## IB Model Layout Rules
|
||||
|
||||
> All colors below use **design tokens from `engines/design.md`**. Do not hardcode hex values.
|
||||
> Finance-specific overrides (IB text color rules, section dividers) are in `design.md §2.4`.
|
||||
|
||||
### Section Headers
|
||||
```python
|
||||
# Dark background, white bold text, merged across data width
|
||||
# Uses PRIMARY from design.md (or Finance palette PRIMARY from design.md)
|
||||
ws.merge_cells('B10:H10')
|
||||
ws['B10'] = 'Income Statement'
|
||||
ws['B10'].fill = PatternFill('solid', fgColor=PRIMARY)
|
||||
ws['B10'].font = Font(name=FONT_NAME, size=12, bold=HEADER_BOLD, color='FFFFFF')
|
||||
```
|
||||
|
||||
### Data Alignment
|
||||
- Column labels (years, quarters): **right-aligned**
|
||||
- Row labels (line items): **left-aligned**
|
||||
- Submetrics: **indented** (add 2-3 spaces prefix)
|
||||
|
||||
```python
|
||||
# Parent line item
|
||||
ws['B12'] = 'Revenue'
|
||||
ws['B12'].font = Font(name=FONT_NAME, bold=HEADER_BOLD)
|
||||
|
||||
# Sub line item (indented)
|
||||
ws['B13'] = ' Product Revenue'
|
||||
ws['B14'] = ' Service Revenue'
|
||||
```
|
||||
|
||||
### Totals Formatting
|
||||
```python
|
||||
# Uses design tokens — see engines/design.md §6.3
|
||||
total_border = Border(top=Side(style='thin', color=PRIMARY))
|
||||
for col in range(3, 9): # C through H
|
||||
cell = ws.cell(row=total_row, column=col)
|
||||
cell.font = Font(name=FONT_NAME, bold=HEADER_BOLD)
|
||||
cell.border = total_border
|
||||
```
|
||||
|
||||
### Grid Lines
|
||||
```python
|
||||
ws.sheet_view.showGridLines = False # Standard — defined in design.md §7.3
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Balance Check Pattern
|
||||
|
||||
For any financial model with a balance sheet:
|
||||
|
||||
```python
|
||||
# Balance check row (should always be 0)
|
||||
check_row = bs_end + 2
|
||||
ws.cell(row=check_row, column=2, value='Balance Check')
|
||||
for col in range(3, last_col + 1):
|
||||
letter = get_column_letter(col)
|
||||
ws.cell(row=check_row, column=col).value = \
|
||||
f'={letter}{assets_total_row}-{letter}{liab_total_row}-{letter}{equity_total_row}'
|
||||
# Conditional: red if not zero
|
||||
ws.conditional_formatting.add(
|
||||
f'{letter}{check_row}',
|
||||
CellIsRule(operator='notEqual', formula=['0'],
|
||||
font=Font(color='FF0000', bold=True))
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Sensitivity / Scenario Tables
|
||||
|
||||
```python
|
||||
# Two-way data table: vary growth rate (rows) × discount rate (cols)
|
||||
# Row headers: growth rates
|
||||
growth_rates = [0.02, 0.03, 0.04, 0.05, 0.06]
|
||||
# Col headers: discount rates
|
||||
discount_rates = [0.08, 0.09, 0.10, 0.11, 0.12]
|
||||
|
||||
# Write headers
|
||||
for i, g in enumerate(growth_rates):
|
||||
ws.cell(row=start_row + i + 1, column=start_col, value=g)
|
||||
ws.cell(row=start_row + i + 1, column=start_col).number_format = '0.0%'
|
||||
ws.cell(row=start_row + i + 1, column=start_col).font = Font(color='0000FF')
|
||||
|
||||
for j, d in enumerate(discount_rates):
|
||||
ws.cell(row=start_row, column=start_col + j + 1, value=d)
|
||||
ws.cell(row=start_row, column=start_col + j + 1).number_format = '0.0%'
|
||||
ws.cell(row=start_row, column=start_col + j + 1).font = Font(color='0000FF')
|
||||
|
||||
# Fill formulas for each combination
|
||||
# Yellow background for the cell matching base case assumptions
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Projection Period Patterns
|
||||
|
||||
```python
|
||||
# Historical + Projected columns
|
||||
years = ['FY2022', 'FY2023', 'FY2024', 'FY2025E', 'FY2026E', 'FY2027E']
|
||||
|
||||
for i, year in enumerate(years):
|
||||
col = start_col + i
|
||||
cell = ws.cell(row=header_row, column=col, value=year)
|
||||
cell.font = Font(name=FONT_NAME, bold=HEADER_BOLD)
|
||||
cell.alignment = Alignment(horizontal='center')
|
||||
|
||||
# Visual separator between historical and projected
|
||||
if year.endswith('E') and not years[i-1].endswith('E'):
|
||||
# Add left border to mark transition
|
||||
for row in range(header_row, last_row + 1):
|
||||
ws.cell(row=row, column=col).border = Border(
|
||||
left=Side(style='medium', color=PRIMARY))
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Additional Model Templates
|
||||
|
||||
### Template: P&L (Profit & Loss) Statement
|
||||
|
||||
```
|
||||
Sheet: "P&L"
|
||||
Row 1: Company Name + Period
|
||||
Row 3: Headers (Month/Quarter columns)
|
||||
|
||||
Revenue Section:
|
||||
Product Revenue =Assumptions!B5 * (1+Assumptions!C5)
|
||||
Service Revenue =Assumptions!B6 * (1+Assumptions!C6)
|
||||
Total Revenue =SUM(above)
|
||||
|
||||
COGS Section:
|
||||
Direct Costs =Total_Revenue * (1 - Assumptions!gross_margin)
|
||||
Gross Profit =Total_Revenue - Direct_Costs
|
||||
Gross Margin % =IFERROR(Gross_Profit/Total_Revenue, 0)
|
||||
|
||||
OpEx Section:
|
||||
S&M, R&D, G&A (each from Assumptions)
|
||||
Total OpEx =SUM(S&M:G&A)
|
||||
EBITDA =Gross_Profit - Total_OpEx
|
||||
EBITDA Margin % =IFERROR(EBITDA/Total_Revenue, 0)
|
||||
|
||||
Below the Line:
|
||||
D&A, Interest, Tax
|
||||
Net Income =EBITDA - D&A - Interest - Tax
|
||||
```
|
||||
|
||||
### Template: Budget vs Actual
|
||||
|
||||
```
|
||||
Sheet: "Budget vs Actual"
|
||||
Columns: Category | Budget | Actual | Variance | Var %
|
||||
|
||||
Key formulas:
|
||||
Variance = =Actual - Budget
|
||||
Var % = =IFERROR(Variance/Budget, 0)
|
||||
|
||||
Conditional formatting:
|
||||
Var % > 0 → Green font (favorable)
|
||||
Var % < -10% → Red font + red fill (unfavorable)
|
||||
Var % -10~0 → Orange font (watch)
|
||||
|
||||
Summary section:
|
||||
Total Budget =SUM(Budget range)
|
||||
Total Actual =SUM(Actual range)
|
||||
Overall Var % =IFERROR((Total_Actual-Total_Budget)/Total_Budget, 0)
|
||||
```
|
||||
|
||||
### Template: SaaS Metrics Dashboard
|
||||
|
||||
```
|
||||
Sheet: "SaaS Metrics"
|
||||
KPIs (each with formula, not hardcoded):
|
||||
MRR =SUMPRODUCT(Users * ARPU)
|
||||
ARR =MRR * 12
|
||||
Net Revenue Retention = =IFERROR((Starting_MRR + Expansion - Contraction - Churn) / Starting_MRR, 0)
|
||||
CAC =IFERROR(Total_S&M / New_Customers, 0)
|
||||
LTV =IFERROR(ARPU * Gross_Margin / Monthly_Churn_Rate, 0)
|
||||
LTV:CAC Ratio =IFERROR(LTV / CAC, 0)
|
||||
Payback Months =IFERROR(CAC / (ARPU * Gross_Margin), 0)
|
||||
|
||||
Chart: MRR waterfall (starting → new → expansion → contraction → churn → ending)
|
||||
Chart: LTV:CAC trend line
|
||||
```
|
||||
|
||||
### Template: Project Budget Tracker
|
||||
|
||||
```
|
||||
Sheet: "Project Budget"
|
||||
Columns: Phase | Task | Planned Cost | Actual Cost | Remaining | % Spent | Status
|
||||
|
||||
Key formulas:
|
||||
Remaining = =Planned - Actual
|
||||
% Spent = =IFERROR(Actual/Planned, 0)
|
||||
Status = =IF(% Spent>1, "Over Budget", IF(% Spent>0.9, "At Risk", "On Track"))
|
||||
|
||||
Phase subtotals with SUBTOTAL function
|
||||
Grand total row with project-level health indicator
|
||||
```
|
||||
192
skills/xlsx/scenes/finance_lite.md
Executable file
192
skills/xlsx/scenes/finance_lite.md
Executable file
@@ -0,0 +1,192 @@
|
||||
# Finance Lite — Simple Budget & Expense Guide
|
||||
|
||||
Load this reference for: simple budgets, expense reports, fee tracking, cost summaries, revenue/expense comparison, personal finance, project cost tracking — any financial table that does **NOT** need DCF, LBO, three-statement linkage, sensitivity analysis, or IB-grade formatting.
|
||||
|
||||
For complex financial models → use `scenes/finance.md` instead.
|
||||
|
||||
Also load `engines/design.md` for styling (use **standard** design tokens, NOT IB overrides).
|
||||
|
||||
---
|
||||
|
||||
## When to Use finance_lite vs finance
|
||||
|
||||
| Signal | finance_lite ✅ | finance.md ❌ |
|
||||
|--------|----------------|--------------|
|
||||
| 预算表 / budget | ✅ | |
|
||||
| 费用报表 / expense report | ✅ | |
|
||||
| 项目成本追踪 / project cost tracking | ✅ | |
|
||||
| 收支对比 / revenue vs cost | ✅ | |
|
||||
| 个人记账 / personal finance | ✅ | |
|
||||
| 简单 ROI 计算 / simple ROI calculation | ✅ | |
|
||||
| DCF / LBO / 估值模型 (valuation model) | | ✅ |
|
||||
| 三表联动 (P&L + BS + CF) | | ✅ |
|
||||
| 敏感性分析 / scenario table | | ✅ |
|
||||
| IB pitch book level formatting | | ✅ |
|
||||
|
||||
---
|
||||
|
||||
## Standard Sheet Structure
|
||||
|
||||
```
|
||||
Sheet: "Budget" (or user-specified name)
|
||||
Row 1: margin (whitespace)
|
||||
Row 2: Title (merged, styled via setup_sheet())
|
||||
Row 3: spacer
|
||||
Row 4: Headers
|
||||
Row 5+: Data rows
|
||||
Last row: Totals (if applicable)
|
||||
```
|
||||
|
||||
### Typical Column Patterns
|
||||
|
||||
**Budget Table:**
|
||||
```
|
||||
Category (类别) | Budget Amount (预算金额) | Actual Amount (实际金额) | Variance (差异) | Variance Rate (差异率) | Notes (备注)
|
||||
```
|
||||
|
||||
**Expense Report:**
|
||||
```
|
||||
Date (日期) | Category (类别) | Description (说明) | Amount (金额) | Claimant (报销人) | Status (状态)
|
||||
```
|
||||
|
||||
**Revenue vs Cost:**
|
||||
```
|
||||
Month (月份) | Revenue (收入) | Cost (成本) | Gross Profit (毛利) | Gross Margin (毛利率)
|
||||
```
|
||||
|
||||
**Project Cost:**
|
||||
```
|
||||
Phase (阶段) | Task (任务) | Budget (预算) | Used (已用) | Remaining (剩余) | Usage Rate (使用率) | Status (状态)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Formula Patterns
|
||||
|
||||
```python
|
||||
# r / start / end / last_data_row are row numbers supplied by the surrounding
# loop; f-strings substitute them so no literal "{r}" reaches the worksheet.

# Variance
cell.value = f'=C{r}-B{r}'  # Actual - Budget

# Variance percentage (safe division)
cell.value = f'=IFERROR((C{r}-B{r})/B{r},0)'

# Running total
cell.value = f'=SUM(D$5:D{r})'

# Gross margin
cell.value = f'=IFERROR((B{r}-C{r})/B{r},0)'

# Status formula (simple threshold)
cell.value = f'=IF(F{r}>1,"Over Budget",IF(F{r}>0.9,"At Risk","On Track"))'

# Subtotal
cell.value = f'=SUBTOTAL(9,D{start}:D{end})'

# Grand total
cell.value = f'=SUM(D5:D{last_data_row})'
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Number Formats
|
||||
|
||||
Use standard formats from `templates/base.py`:
|
||||
|
||||
```python
|
||||
from templates.base import FORMATS
|
||||
|
||||
cell.number_format = FORMATS['currency_cny'] # ¥#,##0.00
|
||||
cell.number_format = FORMATS['percentage'] # 0.0%
|
||||
cell.number_format = FORMATS['integer'] # #,##0
|
||||
cell.number_format = FORMATS['date'] # YYYY-MM-DD
|
||||
```
|
||||
|
||||
For budget-specific formatting (negatives in parentheses):
|
||||
```python
|
||||
BUDGET_FORMATS = {
|
||||
'currency': '¥#,##0.00;(¥#,##0.00);"-"',
|
||||
'variance': '#,##0.00;(#,##0.00);"-"',
|
||||
'var_pct': '0.0%;(0.0%);"-"',
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Styling
|
||||
|
||||
Use **standard** design tokens (NOT IB overrides):
|
||||
|
||||
```python
|
||||
from templates.base import (
|
||||
setup_sheet, style_header_row, style_data_row, style_total_row,
|
||||
FONT_NAME, HEADER_BOLD, PRIMARY, ACCENT_POSITIVE, ACCENT_NEGATIVE, ACCENT_WARNING,
|
||||
font_body, font_header, fill_header,
|
||||
)
|
||||
|
||||
# Setup
|
||||
setup_sheet(ws, title="2026年部门预算", last_col=7)
|
||||
|
||||
# Headers at row 4
|
||||
style_header_row(ws, row_num=4, col_start=2, col_end=7)
|
||||
|
||||
# Data rows
|
||||
for i, row_num in enumerate(range(5, last_row + 1)):
|
||||
style_data_row(ws, row_num=row_num, col_start=2, col_end=7, row_index=i)
|
||||
|
||||
# Totals
|
||||
style_total_row(ws, row_num=last_row + 1, col_start=2, col_end=7)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Conditional Formatting (Simple)
|
||||
|
||||
```python
|
||||
from openpyxl.formatting.rule import CellIsRule
|
||||
from templates.base import CF_POSITIVE_FONT, CF_POSITIVE_FILL, CF_NEGATIVE_FONT, CF_NEGATIVE_FILL
|
||||
|
||||
# Highlight positive variance (green)
|
||||
ws.conditional_formatting.add(
|
||||
f'D5:D{last_row}',
|
||||
CellIsRule(operator='greaterThan', formula=['0'],
|
||||
font=CF_POSITIVE_FONT, fill=CF_POSITIVE_FILL)
|
||||
)
|
||||
|
||||
# Highlight negative variance (red)
|
||||
ws.conditional_formatting.add(
|
||||
f'D5:D{last_row}',
|
||||
CellIsRule(operator='lessThan', formula=['0'],
|
||||
font=CF_NEGATIVE_FONT, fill=CF_NEGATIVE_FILL)
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Quick Templates
|
||||
|
||||
### Template: Monthly Budget
|
||||
|
||||
```python
|
||||
headers = ["类别", "预算金额", "实际金额", "差异", "差异率", "状态"]
|
||||
# Variance = Actual - Budget
|
||||
# Var% = IFERROR((Actual-Budget)/Budget, 0)
|
||||
# Status = IF(Var%>0.1,"超支"(Over Budget),IF(Var%>0,"注意"(Watch),"正常"(Normal)))
|
||||
```
|
||||
|
||||
### Template: Expense Report
|
||||
|
||||
```python
|
||||
headers = ["日期", "类别", "说明", "金额", "报销人", "状态"]
|
||||
# Date format: YYYY-MM-DD
|
||||
# Amount: currency_cny
|
||||
# Status: dropdown validation ["待审批"(Pending),"已审批"(Approved),"已报销"(Reimbursed),"已拒绝"(Rejected)]
|
||||
```
|
||||
|
||||
### Template: Project Cost Tracker
|
||||
|
||||
```python
|
||||
headers = ["阶段", "任务", "预算", "已用", "剩余", "使用率", "状态"]
|
||||
# Remaining = Budget - Used
|
||||
# Usage% = IFERROR(Used/Budget, 0)
|
||||
# Status = IF(Usage%>1,"超支"(Over Budget),IF(Usage%>0.9,"预警"(Warning),"正常"(Normal)))
|
||||
```
|
||||
298
skills/xlsx/scenes/vba.md
Executable file
298
skills/xlsx/scenes/vba.md
Executable file
@@ -0,0 +1,298 @@
|
||||
# VBA — Macro Generation & Management Guide
|
||||
|
||||
Load this reference when the task involves: creating Excel macros, writing VBA code, automating Excel workflows, adding buttons/forms, modifying existing macros, or any `.xlsm` deliverable that needs programmatic automation.
|
||||
|
||||
Also load `engines/vba-templates.md` for ready-to-use code templates.
|
||||
|
||||
---
|
||||
|
||||
## Core Principles
|
||||
|
||||
### 1. Safety First
|
||||
- **Never** generate VBA that deletes files, accesses filesystem outside the workbook, or sends data to external URLs without explicit user request
|
||||
- **Always** include error handling (`On Error GoTo`)
|
||||
- **Always** add `Application.ScreenUpdating` toggle for performance
|
||||
- Generated macros must be **read-audit-friendly**: clear naming, comments, structured layout
|
||||
|
||||
### 2. openpyxl VBA Workflow
|
||||
openpyxl can read/preserve/inject VBA but **cannot execute** it. The workflow:
|
||||
|
||||
```python
|
||||
# READ existing VBA
|
||||
from openpyxl import load_workbook
|
||||
wb = load_workbook('file.xlsm', keep_vba=True)
|
||||
# wb.vba_archive contains all VBA modules
|
||||
|
||||
# CREATE new .xlsm with VBA
|
||||
from openpyxl import Workbook
|
||||
wb = Workbook()
|
||||
# ... build sheets ...
|
||||
# Inject VBA via vbaProject.bin (see Injection section)
|
||||
wb.save('output.xlsm')
|
||||
```
|
||||
|
||||
### 3. File Format Rules
|
||||
| Need | Format | Extension |
|
||||
|------|--------|-----------|
|
||||
| Data only, no macros | OpenXML | `.xlsx` |
|
||||
| Contains VBA macros | Macro-Enabled | `.xlsm` |
|
||||
| Binary with macros | Binary | `.xlsb` |
|
||||
|
||||
**Critical**: If the user provides an `.xlsx` file but wants macros → the output must be `.xlsm`. Always warn the user about the format change.
|
||||
|
||||
---
|
||||
|
||||
## VBA Code Structure Standard
|
||||
|
||||
Every generated VBA module must follow this structure:
|
||||
|
||||
```vba
|
||||
Option Explicit
|
||||
|
||||
' ============================================================
|
||||
' Module: [ModuleName]
|
||||
' Purpose: [One-line description]
|
||||
' Author: Z.ai
|
||||
' Date: [YYYY-MM-DD]
|
||||
' ============================================================
|
||||
|
||||
' --- Constants ---
|
||||
Private Const MODULE_NAME As String = "[ModuleName]"
|
||||
|
||||
' --- Main Entry Point ---
|
||||
Public Sub Main()
|
||||
On Error GoTo ErrHandler
|
||||
Application.ScreenUpdating = False
|
||||
Application.Calculation = xlCalculationManual
|
||||
|
||||
' [Main logic here]
|
||||
|
||||
CleanUp:
|
||||
Application.ScreenUpdating = True
|
||||
Application.Calculation = xlCalculationAutomatic
|
||||
Exit Sub
|
||||
|
||||
ErrHandler:
|
||||
MsgBox "Error in " & MODULE_NAME & ": " & Err.Description, _
|
||||
vbCritical, "Error"
|
||||
Resume CleanUp
|
||||
End Sub
|
||||
```
|
||||
|
||||
### Naming Conventions
|
||||
| Element | Convention | Example |
|
||||
|---------|-----------|---------|
|
||||
| Sub/Function | PascalCase | `GenerateMonthlyReport` |
|
||||
| Variable | camelCase | `lastRow`, `wsData` |
|
||||
| Constant | UPPER_SNAKE | `MAX_ROWS`, `REPORT_TITLE` |
|
||||
| Module | PascalCase | `ModReport`, `ModUtils` |
|
||||
| Worksheet variable | ws + Name | `wsData`, `wsSummary` |
|
||||
| Range variable | rng + Desc | `rngData`, `rngHeaders` |
|
||||
|
||||
### Variable Declaration Rules
|
||||
```vba
|
||||
' Always use explicit types
|
||||
Dim lastRow As Long ' Not Integer (row limit)
|
||||
Dim ws As Worksheet
|
||||
Dim rng As Range
|
||||
Dim cell As Range
|
||||
Dim i As Long
|
||||
Dim strValue As String
|
||||
Dim dblAmount As Double
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Common Patterns
|
||||
|
||||
### Find Last Row/Column (Robust)
|
||||
```vba
|
||||
' Last row with data in column A
|
||||
Dim lastRow As Long
|
||||
lastRow = ws.Cells(ws.Rows.Count, "A").End(xlUp).Row
|
||||
|
||||
' Last column with data in row 1
|
||||
Dim lastCol As Long
|
||||
lastCol = ws.Cells(1, ws.Columns.Count).End(xlToLeft).Column
|
||||
|
||||
' Used range row count (less reliable: this is a count, not the last row number, if the used range does not start at row 1)
|
||||
Dim usedRows As Long
|
||||
usedRows = ws.UsedRange.Rows.Count
|
||||
```
|
||||
|
||||
### Loop Through Data
|
||||
```vba
|
||||
' Row loop
|
||||
Dim i As Long
|
||||
For i = 2 To lastRow ' Skip header
|
||||
If ws.Cells(i, 1).Value <> "" Then
|
||||
' Process row
|
||||
End If
|
||||
Next i
|
||||
|
||||
' For Each (range)
|
||||
Dim cell As Range
|
||||
For Each cell In ws.Range("A2:A" & lastRow)
|
||||
If Not IsEmpty(cell) Then
|
||||
' Process cell
|
||||
End If
|
||||
Next cell
|
||||
```
|
||||
|
||||
### Sheet Operations
|
||||
```vba
|
||||
' Reference sheet safely
|
||||
Dim ws As Worksheet
|
||||
On Error Resume Next
|
||||
Set ws = ThisWorkbook.Sheets("Data")
|
||||
On Error GoTo 0
|
||||
If ws Is Nothing Then
|
||||
MsgBox "Sheet 'Data' not found!", vbExclamation
|
||||
Exit Sub
|
||||
End If
|
||||
|
||||
' Create sheet if not exists
|
||||
Dim wsNew As Worksheet
|
||||
Dim sheetExists As Boolean
|
||||
For Each wsNew In ThisWorkbook.Sheets
|
||||
If wsNew.Name = "Summary" Then sheetExists = True
|
||||
Next wsNew
|
||||
If Not sheetExists Then
|
||||
Set wsNew = ThisWorkbook.Sheets.Add(After:=ThisWorkbook.Sheets(ThisWorkbook.Sheets.Count))
|
||||
wsNew.Name = "Summary"
|
||||
End If
|
||||
```
|
||||
|
||||
### User Interaction
|
||||
```vba
|
||||
' Simple input
|
||||
Dim userInput As String
|
||||
userInput = InputBox("Enter report month (YYYY-MM):", "Month Selection")
|
||||
If userInput = "" Then Exit Sub
|
||||
|
||||
' Confirmation
|
||||
If MsgBox("Generate report for " & userInput & "?", _
|
||||
vbYesNo + vbQuestion, "Confirm") = vbNo Then Exit Sub
|
||||
|
||||
' File picker
|
||||
Dim filePath As Variant
|
||||
filePath = Application.GetOpenFilename( _
|
||||
FileFilter:="Excel Files (*.xlsx;*.xlsm),*.xlsx;*.xlsm", _
|
||||
Title:="Select Source File")
|
||||
If filePath = False Then Exit Sub
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## VBA Injection via openpyxl
|
||||
|
||||
### Method 1: Preserve Existing VBA
|
||||
```python
|
||||
# Open with VBA preserved
|
||||
wb = load_workbook('source.xlsm', keep_vba=True)
|
||||
# Edit data/formatting as usual
|
||||
wb.save('output.xlsm') # VBA modules intact
|
||||
```
|
||||
|
||||
### Method 2: Copy VBA from Template
|
||||
```python
|
||||
# Use a template .xlsm that already has the VBA you need
|
||||
import shutil
|
||||
shutil.copy('template_with_macros.xlsm', 'output.xlsm')
|
||||
wb = load_workbook('output.xlsm', keep_vba=True)
|
||||
# Modify data
|
||||
wb.save('output.xlsm')
|
||||
```
|
||||
|
||||
### Method 3: Manual vbaProject.bin Injection
|
||||
```python
|
||||
# For advanced use: inject raw vbaProject.bin
|
||||
# 1. Create your VBA in Excel, save as .xlsm
|
||||
# 2. Extract vbaProject.bin from the .xlsm (it's a ZIP)
|
||||
# 3. Inject into new workbook
|
||||
|
||||
import zipfile
|
||||
import shutil
|
||||
|
||||
# Create the workbook first
|
||||
wb = Workbook()
|
||||
# ... add data ...
|
||||
wb.save('temp.xlsx')
|
||||
|
||||
# Convert to .xlsm by injecting VBA
|
||||
shutil.copy('temp.xlsx', 'output.xlsm')
|
||||
with zipfile.ZipFile('output.xlsm', 'a') as zf:
|
||||
zf.write('vbaProject.bin', 'xl/vbaProject.bin')
|
||||
|
||||
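# Still required: update [Content_Types].xml (macro-enabled workbook type + vbaProject.bin entry) and add the vbaProject relationship in xl/_rels/workbook.xml.rels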
|
||||
# (This is fragile — Method 1 or 2 preferred)
|
||||
```
|
||||
|
||||
**Recommendation**: Method 1 (preserve) and Method 2 (template) are robust. Method 3 is fragile and should be a last resort.
|
||||
|
||||
---
|
||||
|
||||
## Security Checklist
|
||||
|
||||
Before delivering any VBA-enabled file:
|
||||
|
||||
- [ ] No filesystem access outside workbook (no `Kill`, `FileCopy`, `MkDir` unless requested)
|
||||
- [ ] No network calls (`XMLHTTP`, `WinHttpRequest`) unless requested
|
||||
- [ ] No shell execution (`Shell`, `WScript.Shell`) unless requested
|
||||
- [ ] No registry access (`CreateObject("WScript.Shell").RegWrite`)
|
||||
- [ ] No auto-execution (`Auto_Open`, `Workbook_Open`) unless explicitly requested
|
||||
- [ ] Error handling in every Sub/Function
|
||||
- [ ] `ScreenUpdating` (and `Calculation` / `EnableEvents`, if changed) restored in cleanup
|
||||
- [ ] All variables explicitly declared (`Option Explicit`)
|
||||
- [ ] Module purpose documented in header comment
|
||||
|
||||
---
|
||||
|
||||
## Performance Guidelines
|
||||
|
||||
```vba
|
||||
' ALWAYS bracket bulk operations
|
||||
Application.ScreenUpdating = False
|
||||
Application.Calculation = xlCalculationManual
|
||||
Application.EnableEvents = False
|
||||
|
||||
' [Bulk operations here]
|
||||
|
||||
Application.EnableEvents = True
|
||||
Application.Calculation = xlCalculationAutomatic
|
||||
Application.ScreenUpdating = True
|
||||
```
|
||||
|
||||
### Array-Based Processing (for large data)
|
||||
```vba
|
||||
' Read range into array — much faster than cell-by-cell
|
||||
Dim data As Variant
|
||||
data = ws.Range("A1:Z" & lastRow).Value ' 2D array
|
||||
|
||||
' Process in memory (row 1 is included; start at LBound(data, 1) + 1 if row 1 holds headers)
|
||||
Dim i As Long
|
||||
For i = LBound(data, 1) To UBound(data, 1)
|
||||
data(i, 3) = data(i, 1) * data(i, 2) ' Column C = A * B
|
||||
Next i
|
||||
|
||||
' Write back in one shot
|
||||
ws.Range("A1:Z" & lastRow).Value = data
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Debugging Support
|
||||
|
||||
When the user reports VBA errors, include diagnostic code:
|
||||
|
||||
```vba
|
||||
' Debug logging to Immediate Window
|
||||
Debug.Print "Processing row " & i & ": " & ws.Cells(i, 1).Value
|
||||
|
||||
' Verbose error info
|
||||
ErrHandler:
|
||||
Debug.Print "ERROR in " & MODULE_NAME
|
||||
Debug.Print " Number: " & Err.Number
|
||||
Debug.Print " Description: " & Err.Description
|
||||
Debug.Print " Source: " & Err.Source
|
||||
```
|
||||
Reference in New Issue
Block a user