1300 lines
51 KiB
Python
Executable File
1300 lines
51 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
xlsx.py — Unified Excel Quality Assurance & Manipulation Tool
|
|
|
|
Commands:
|
|
recalc <xlsx> [timeout] Recalculate formulas via LibreOffice + error scan
|
|
audit <xlsx> Formula error + zero-value + implicit array detection
|
|
scan <xlsx> Reference anomaly detection
|
|
inspect <xlsx> [--pretty] Structure analysis → JSON
|
|
pivot <in> <out> [options] PivotTable with optional chart
|
|
chart-verify <xlsx> Verify chart data content
|
|
validate <xlsx> Structural validation (forbidden funcs, schema)
|
|
|
|
Usage:
|
|
python3 xlsx.py <command> [args...]
|
|
python3 xlsx.py --help
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import platform
|
|
import re
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
import zipfile
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple
|
|
|
|
try:
|
|
from openpyxl import load_workbook, Workbook
|
|
from openpyxl.utils import get_column_letter, column_index_from_string
|
|
from openpyxl.utils.cell import coordinate_from_string
|
|
except ImportError:
|
|
print("Error: openpyxl required. Install: pip install openpyxl", file=sys.stderr)
|
|
sys.exit(1)
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════
|
|
# Section 0: Command registry + shared constants & helpers
|
|
# ═══════════════════════════════════════════════════════════════
|
|
|
|
# Maps a CLI command name to the function that implements it.
_COMMANDS: Dict[str, Callable] = {}


def cmd(name: str):
    """Return a decorator that files a function under *name* in ``_COMMANDS``.

    The decorated function is handed back unchanged, so it remains directly
    callable. Registering a second function under the same name replaces the
    earlier entry.
    """
    def _register(handler: Callable) -> Callable:
        _COMMANDS[name] = handler
        return handler

    return _register
|
|
|
|
|
|
# --------------- constants ---------------
|
|
|
|
# Error literals looked for in calculated cell values.
EXCEL_ERRORS = {
    "#VALUE!",
    "#DIV/0!",
    "#REF!",
    "#NAME?",
    "#NULL!",
    "#NUM!",
    "#N/A",
}

# Implicit array formula patterns that work in LibreOffice but fail in MS Excel.
# Each entry is (compiled case-insensitive regex, advice shown on a match).
IMPLICIT_ARRAY_PATTERNS = [
    (re.compile(regex, re.IGNORECASE), advice)
    for regex, advice in (
        (r'MATCH\s*\(\s*TRUE\s*\(\s*\)',
         "MATCH(TRUE(), ...) requires CSE in Excel. Use SUMPRODUCT or helper column."),
        (r'MATCH\s*\(\s*TRUE\s*,',
         "MATCH(TRUE, ...) with comparison range requires CSE. Use SUMPRODUCT."),
        (r'MATCH\s*\([^,]+[<>=!]+[^,]+,',
         "MATCH with inline comparison requires CSE. Use SUMPRODUCT or helper column."),
    )
]

# Function names rejected by structural validation.
FORBIDDEN_FUNCTIONS = {
    "FILTER", "UNIQUE", "SORT", "SORTBY", "XLOOKUP", "XMATCH",
    "SEQUENCE", "LET", "LAMBDA", "RANDARRAY",
    "ARRAYFORMULA", "QUERY", "IMPORTRANGE",
}

# Detects formula-like content: text counts as a real formula when it holds
# at least one of
#   - a function call: two or more letters followed by "(", e.g. SUM(, IF(
#   - a cell reference: optional "$", 1-3 letters, optional "$", digits,
#     e.g. A1, $B$5, $A$1:$A$10
# Matching is case-insensitive.
_VALID_FORMULA_PATTERN = re.compile(
    "|".join((
        r'[A-Z]{2,}\s*\(',          # function call
        r'\$?[A-Z]{1,3}\$?\d+',     # cell reference
    )),
    re.IGNORECASE,
)

# Detects external workbook references inside a formula, such as
# [filename.xlsx]SheetName! or [filename.xls]SheetName!; group 1 is the
# bracketed file name.
_EXT_FILE_REF_PATTERN = re.compile(r"\[([^\]]+\.(xlsx?|xlsm|xlsb|csv))\]", re.IGNORECASE)
|
|
|
|
|
|
# --------------- helpers ---------------
|
|
|
|
def cell_ref(sheet_name: str, cell) -> str:
    """Return the ``Sheet!A1``-style address of *cell* on *sheet_name*.

    *cell* only needs a ``coordinate`` attribute.
    """
    return "{}!{}".format(sheet_name, cell.coordinate)
|
|
|
|
|
|
def is_formula(value) -> bool:
    """Return True when *value* is a string that begins with ``=``."""
    if not isinstance(value, str):
        return False
    return value.startswith("=")
|
|
|
|
|
|
def parse_range(range_str: str) -> Tuple[Optional[str], int, int, int, int]:
    """Parse 'Sheet!A1:F100' into (sheet, min_col, min_row, max_col, max_row).

    Args:
        range_str: A cell or range address, optionally prefixed with a sheet
            name and ``!``. Quote characters surrounding the sheet name are
            removed.

    Returns:
        ``(sheet, min_col, min_row, max_col, max_row)`` with 1-based column
        and row numbers. ``sheet`` is None when no ``!`` prefix is present.
        A single cell yields identical min and max values. The corners are
        normalised, so a reversed range such as ``F100:A1`` gives the same
        result as ``A1:F100``.

    Raises:
        Whatever ``coordinate_from_string`` / ``column_index_from_string``
        raise for a malformed address.
    """
    if "!" in range_str:
        # Split on the last "!" so sheet names containing "!" stay intact.
        sheet, rng = range_str.rsplit("!", 1)
        sheet = sheet.strip("'\"")
    else:
        sheet, rng = None, range_str

    corners = rng.split(":")
    col1, row1 = coordinate_from_string(corners[0])
    if len(corners) == 2:
        col2, row2 = coordinate_from_string(corners[1])
    else:
        # Anything that is not exactly "start:end" is treated as the single
        # cell named by its first part, as before.
        col2, row2 = col1, row1

    first_col = column_index_from_string(col1)
    second_col = column_index_from_string(col2)
    # Order the corners so callers can rely on min <= max when iterating.
    return (
        sheet,
        min(first_col, second_col),
        min(row1, row2),
        max(first_col, second_col),
        max(row1, row2),
    )
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════
|
|
# Section 1: recalc — LibreOffice recalculation + error scan
|
|
# ═══════════════════════════════════════════════════════════════
|
|
|
|
def _find_soffice() -> Optional[str]:
    """Find the ``soffice`` executable on macOS, Linux or Windows.

    ``PATH`` is consulted first; if that fails, a fixed list of install
    locations for the current platform is probed in order.

    Returns:
        The path of the first hit, or None when nothing was found.
    """
    # A binary reachable through PATH wins on every platform.
    on_path = shutil.which("soffice")
    if on_path:
        return on_path

    well_known = {
        "Darwin": [
            "/Applications/LibreOffice.app/Contents/MacOS/soffice",
            os.path.expanduser("~/Applications/LibreOffice.app/Contents/MacOS/soffice"),
        ],
        "Linux": [
            "/usr/bin/soffice",
            "/usr/local/bin/soffice",
            "/usr/lib/libreoffice/program/soffice",
            "/opt/libreoffice/program/soffice",
            "/snap/bin/libreoffice.soffice",  # Snap package
            "/var/lib/flatpak/exports/bin/org.libreoffice.LibreOffice",  # Flatpak
        ],
        "Windows": [
            r"C:\Program Files\LibreOffice\program\soffice.exe",
            r"C:\Program Files (x86)\LibreOffice\program\soffice.exe",
        ],
    }

    # Unknown platforms have no candidates, so the search yields None.
    return next(
        (
            location
            for location in well_known.get(platform.system(), [])
            if os.path.isfile(location)
        ),
        None,
    )
|
|
|
|
|
|
def _setup_libreoffice_macro() -> bool:
    """Setup LibreOffice macro for recalculation if not already configured.

    Ensures ``Module1.xba`` in the user's LibreOffice ``Standard`` Basic
    library defines ``RecalculateAndSave``, which recalculates the current
    document, stores it and closes it.

    Returns:
        True when the macro is already present or was written successfully,
        False when the macro file could not be read or written.
    """
    system = platform.system()
    if system == "Darwin":
        macro_dir = os.path.expanduser(
            "~/Library/Application Support/LibreOffice/4/user/basic/Standard"
        )
    elif system == "Windows":
        # On Windows the LibreOffice user profile lives under %APPDATA%.
        appdata = os.environ.get("APPDATA") or os.path.join(
            os.path.expanduser("~"), "AppData", "Roaming"
        )
        macro_dir = os.path.join(
            appdata, "LibreOffice", "4", "user", "basic", "Standard"
        )
    else:
        macro_dir = os.path.expanduser(
            "~/.config/libreoffice/4/user/basic/Standard"
        )

    macro_file = os.path.join(macro_dir, "Module1.xba")

    # Nothing to do when the macro is already installed.
    try:
        with open(macro_file, "r", encoding="utf-8", errors="replace") as f:
            if "RecalculateAndSave" in f.read():
                return True
    except FileNotFoundError:
        pass
    except OSError:
        # Unreadable macro file: report failure instead of crashing.
        return False

    if not os.path.exists(macro_dir):
        # Presumably a short headless start lets LibreOffice create its user
        # profile. This is best effort; the directory is created below anyway.
        soffice_bin = _find_soffice()
        if soffice_bin:
            try:
                subprocess.run(
                    [soffice_bin, "--headless", "--terminate_after_init"],
                    capture_output=True, timeout=10,
                )
            except (subprocess.TimeoutExpired, OSError):
                pass

    macro_content = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE script:module PUBLIC "-//OpenOffice.org//DTD OfficeDocument 1.0//EN" "module.dtd">
<script:module xmlns:script="http://openoffice.org/2000/script" script:name="Module1" script:language="StarBasic">
Sub RecalculateAndSave()
ThisComponent.calculateAll()
ThisComponent.store()
ThisComponent.close(True)
End Sub
</script:module>'''

    # NOTE(review): an existing Module1.xba without RecalculateAndSave is
    # overwritten here, which discards any other macros it held.
    try:
        os.makedirs(macro_dir, exist_ok=True)
        # The XML declaration promises UTF-8, so write it explicitly.
        with open(macro_file, "w", encoding="utf-8") as f:
            f.write(macro_content)
        return True
    except OSError:
        return False
|
|
|
|
|
|
def _recalc_scan_errors(filename: str, error_literals: Sequence[str]) -> Dict[str, List[str]]:
    """Map each error literal to the 'Sheet!A1' cells whose cached value contains it."""
    hits: Dict[str, List[str]] = {literal: [] for literal in error_literals}
    wb = load_workbook(filename, data_only=True)
    try:
        for sheet_name in wb.sheetnames:
            for row in wb[sheet_name].iter_rows():
                for cell in row:
                    value = cell.value
                    if not isinstance(value, str):
                        continue
                    # Only the first literal found is recorded, so each cell
                    # is counted at most once.
                    for literal in error_literals:
                        if literal in value:
                            hits[literal].append(f"{sheet_name}!{cell.coordinate}")
                            break
    finally:
        wb.close()
    return hits


def _recalc_count_formulas(filename: str) -> int:
    """Count the cells across all sheets whose stored content is a formula."""
    wb = load_workbook(filename, data_only=False)
    try:
        return sum(
            1
            for sheet_name in wb.sheetnames
            for row in wb[sheet_name].iter_rows()
            for cell in row
            if is_formula(cell.value)
        )
    finally:
        wb.close()


def _libreoffice_recalc(filename: str, timeout: int = 30) -> Dict[str, Any]:
    """
    Recalculate formulas in an Excel file via LibreOffice,
    then scan ALL cells for errors.

    Args:
        filename: Path of the workbook; the macro stores the recalculated
            document back to the same file.
        timeout: Seconds LibreOffice may run. Enforced through an external
            ``timeout``/``gtimeout`` wrapper when one is on PATH, and always
            backed by a Python-level limit so the call cannot hang where no
            wrapper exists. A value <= 0 disables the Python-level limit.

    Returns:
        On success a dict with ``status`` ("success" or "errors_found"),
        ``total_errors``, ``error_summary`` (per error literal: count and up
        to 20 locations) and ``total_formulas``. On failure a dict with an
        ``error`` message; LibreOffice failures additionally carry
        ``details`` (stderr text) and ``returncode``.
    """
    if not Path(filename).exists():
        return {"error": f"File {filename} does not exist"}

    abs_path = str(Path(filename).absolute())

    if not _setup_libreoffice_macro():
        return {"error": "Failed to setup LibreOffice macro"}

    # Locate soffice binary (cross-platform)
    soffice_bin = _find_soffice()
    if not soffice_bin:
        return {"error": "LibreOffice not found. Install it and ensure soffice is in PATH."}

    lo_cmd: List[str] = [
        soffice_bin, "--headless", "--norestore",
        "vnd.sun.star.script:Standard.Module1.RecalculateAndSave"
        "?language=Basic&location=application",
        abs_path,
    ]

    # Wrap with a timeout binary when one is actually installed.
    timeout_bin: Optional[str] = None
    system = platform.system()
    if system == "Linux":
        timeout_bin = shutil.which("timeout")
    elif system == "Darwin":
        timeout_bin = shutil.which("gtimeout")
    if timeout_bin:
        lo_cmd = [timeout_bin, str(timeout)] + lo_cmd

    # Python-level limit: the only enforcement where no wrapper exists, and a
    # safety net with a little grace where one does.
    hard_limit: Optional[float] = None
    if timeout > 0:
        hard_limit = timeout + 5 if timeout_bin else timeout

    try:
        result = subprocess.run(
            lo_cmd, capture_output=True, text=True, timeout=hard_limit
        )
    except subprocess.TimeoutExpired:
        # Handled like the wrapper's exit status 124 below: go on and scan
        # the file.
        result = None
    except OSError as exc:
        return {"error": f"Failed to launch LibreOffice: {exc}"}

    if result is not None and result.returncode not in (0, 124):
        error_msg = result.stderr or "Unknown error during recalculation"
        if "Module1" in error_msg or "RecalculateAndSave" not in error_msg:
            # Classification is unchanged; the raw stderr is attached so the
            # real cause is no longer lost.
            return {
                "error": "LibreOffice macro not configured properly",
                "details": error_msg,
                "returncode": result.returncode,
            }
        return {
            "error": error_msg,
            "details": error_msg,
            "returncode": result.returncode,
        }

    # Scan recalculated file for Excel errors
    try:
        # Kept as an ordered tuple so match priority and summary order are stable.
        excel_errors_list = (
            "#VALUE!", "#DIV/0!", "#REF!", "#NAME?", "#NULL!", "#NUM!", "#N/A",
        )
        error_details = _recalc_scan_errors(filename, excel_errors_list)
        total_errors = sum(len(locations) for locations in error_details.values())

        out: Dict[str, Any] = {
            "status": "success" if total_errors == 0 else "errors_found",
            "total_errors": total_errors,
            "error_summary": {
                err_type: {"count": len(locations), "locations": locations[:20]}
                for err_type, locations in error_details.items()
                if locations
            },
        }

        # Count formulas for context
        out["total_formulas"] = _recalc_count_formulas(filename)
        return out
    except Exception as e:
        # Boundary for unreadable or corrupt workbooks: report, don't raise.
        return {"error": str(e)}
|
|
|
|
|
|
@cmd("recalc")
def cmd_recalc(argv: Sequence[str]) -> int:
    """Run the ``recalc`` command: recalculate via LibreOffice and print the report.

    Args:
        argv: Command arguments: the workbook path and an optional timeout
            in seconds (default 30).

    Returns:
        1 when recalculation failed or the report counts any errors,
        otherwise 0. The report is printed to stdout as indented JSON.
    """
    cli = argparse.ArgumentParser(
        prog="xlsx.py recalc",
        description="Recalculate Excel formulas via LibreOffice",
    )
    cli.add_argument("file", help="Excel file path")
    cli.add_argument(
        "timeout",
        nargs="?",
        type=int,
        default=30,
        help="Timeout in seconds (default: 30)",
    )
    opts = cli.parse_args(argv)

    report = _libreoffice_recalc(opts.file, opts.timeout)
    print(json.dumps(report, indent=2, ensure_ascii=False))

    if "error" in report:
        return 1
    if report.get("total_errors", 0) == 0:
        return 0
    return 1
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════
|
|
# Section 2: audit — Formula error + zero-value + implicit array
|
|
# ═══════════════════════════════════════════════════════════════
|
|
|
|
def _run_libreoffice_recalc_best_effort(filepath: str) -> None:
    """Attempt LibreOffice recalc (best-effort, never raises).

    A failed recalculation means later reads may see stale cached values, so
    the failure is reported on stderr. Stdout is left untouched because the
    commands print their JSON result there.

    Args:
        filepath: Path of the workbook to recalculate.
    """
    try:
        outcome = _libreoffice_recalc(filepath, timeout=30)
    except Exception as exc:
        # Deliberately broad: this is a best-effort boundary and must not
        # abort the caller.
        print(f"Warning: LibreOffice recalculation failed: {exc}", file=sys.stderr)
        return

    if isinstance(outcome, dict) and "error" in outcome:
        print(
            f"Warning: LibreOffice recalculation skipped: {outcome['error']}",
            file=sys.stderr,
        )
|
|
|
|
|
|
@cmd("audit")
def cmd_audit(argv: Sequence[str]) -> int:
    """Detect formula errors, zero-value formulas, and implicit array formulas.

    The workbook is first recalculated on a best-effort basis, then loaded
    twice: once for cached values and once for formula text. Every formula
    cell is checked for

    * an Excel error literal in its calculated value,
    * a numeric result of exactly zero (booleans are not numbers here, so a
      ``FALSE`` result is not reported),
    * an implicit-array pattern from ``IMPLICIT_ARRAY_PATTERNS``.

    Args:
        argv: Command arguments: the workbook path.

    Returns:
        1 when the file is missing or any formula error was found, else 0.
        The JSON report printed to stdout lists at most 50 errors, 30 zero
        values and 20 implicit arrays.
    """
    parser = argparse.ArgumentParser(
        prog="xlsx.py audit",
        description="Formula error scan + zero-value + implicit array detection",
    )
    parser.add_argument("file", help="Excel file path")
    args = parser.parse_args(argv)

    path = Path(args.file)
    if not path.exists():
        print(json.dumps({"error": f"File not found: {path}"}))
        return 1

    # Best-effort LibreOffice recalculation first
    _run_libreoffice_recalc_best_effort(str(path))

    errors: List[Dict[str, Any]] = []
    zero_values: List[Dict[str, str]] = []
    implicit_arrays: List[Dict[str, str]] = []
    total_formulas = 0

    # Sorted once so the literal reported for a cell does not depend on set
    # iteration order.
    error_literals = sorted(EXCEL_ERRORS)

    wb_data = load_workbook(str(path), data_only=True)
    wb_form = None
    try:
        wb_form = load_workbook(str(path), data_only=False)

        for sname in wb_form.sheetnames:
            ws_d = wb_data[sname]
            ws_f = wb_form[sname]
            for row_d, row_f in zip(ws_d.iter_rows(), ws_f.iter_rows()):
                for cd, cf in zip(row_d, row_f):
                    fval = cf.value
                    if not is_formula(fval):
                        continue
                    total_formulas += 1
                    fstr = str(fval)
                    dval = cd.value

                    # Check for formula errors in calculated value
                    if isinstance(dval, str):
                        hit = next((e for e in error_literals if e in dval), None)
                        if hit is not None:
                            errors.append({
                                "cell": cell_ref(sname, cd),
                                "error": hit,
                                "formula": fstr[:120],
                            })

                    # Check for zero values (potential reference errors).
                    # bool is a subclass of int, so exclude it explicitly.
                    if (
                        isinstance(dval, (int, float))
                        and not isinstance(dval, bool)
                        and dval == 0
                    ):
                        zero_values.append({
                            "cell": cell_ref(sname, cd),
                            "formula": fstr[:120],
                        })

                    # Check for implicit array formula patterns (first hit only)
                    for pattern, msg in IMPLICIT_ARRAY_PATTERNS:
                        if pattern.search(fstr):
                            implicit_arrays.append({
                                "cell": cell_ref(sname, cf),
                                "formula": fstr[:120],
                                "issue": msg,
                            })
                            break
    finally:
        wb_data.close()
        if wb_form is not None:
            wb_form.close()

    result: Dict[str, Any] = {
        "total_formulas": total_formulas,
        "error_count": len(errors),
        "zero_value_count": len(zero_values),
        "implicit_array_count": len(implicit_arrays),
    }
    if errors:
        result["errors"] = errors[:50]
    if zero_values:
        result["zero_values"] = zero_values[:30]
    if implicit_arrays:
        result["implicit_arrays"] = implicit_arrays[:20]

    print(json.dumps(result, indent=2, ensure_ascii=False))
    return 1 if errors else 0
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════
|
|
# Section 3: scan — Reference anomaly detection
|
|
# ═══════════════════════════════════════════════════════════════
|
|
|
|
# Range with a sheet prefix, e.g. Data!A1:B20 or 'My Sheet'!$A$1:$B$20.
# Group 1 is a quoted sheet name ('' is an escaped quote), group 2 a bare one.
_SCAN_CROSS_SHEET_RANGE = re.compile(
    r"(?:'((?:[^']|'')+)'|([A-Za-z_]\w*))"
    r"!\$?([A-Z]{1,3})\$?(\d+):\$?([A-Z]{1,3})\$?(\d+)"
)

# Range without a sheet prefix; "$" markers are accepted and ignored.
_SCAN_LOCAL_RANGE = re.compile(r'\$?([A-Z]{1,3})\$?(\d+):\$?([A-Z]{1,3})\$?(\d+)')

# Aggregate call whose range starts at row 1.
_SCAN_HEADER_AGGREGATE = re.compile(
    r'(SUM|AVERAGE|AVG|COUNT|COUNTA|MIN|MAX|SUMPRODUCT)\s*\(\s*\$?([A-Z]{1,3})\$?1:',
    re.IGNORECASE,
)

# Aggregate call over one explicit range, used to spot very small spans.
_SCAN_SMALL_AGGREGATE = re.compile(
    r'(SUM|AVERAGE|AVG|COUNT|COUNTA)\s*\(\s*\$?([A-Z]{1,3})\$?(\d+):\$?([A-Z]{1,3})\$?(\d+)\s*\)',
    re.IGNORECASE,
)

# Row number of a cell reference; replaced by "#" to compare formula shapes.
_SCAN_ROW_NUMBER = re.compile(r'(\$?[A-Z]{1,3}\$?)\d+')


def _scan_formula_findings(
    sname: str,
    cell,
    fstr: str,
    max_data_row: int,
    sheet_max_rows: Dict[str, int],
) -> List[Dict[str, str]]:
    """Return out-of-range, header-inclusion and small-aggregate findings for one formula cell."""
    found: List[Dict[str, str]] = []
    where = cell_ref(sname, cell)

    # --- Out-of-range references ---
    cross_refs = _SCAN_CROSS_SHEET_RANGE.findall(fstr)
    if cross_refs:
        for quoted, bare, c1, r1, c2, r2 in cross_refs:
            tgt_sheet = quoted.replace("''", "'") if quoted else bare
            tgt_rows = sheet_max_rows.get(tgt_sheet, 0)
            last_row = int(r2)
            # Only flag if target sheet exists and range truly exceeds it
            if tgt_rows > 0 and last_row > tgt_rows * 3 and last_row > 100:
                found.append({
                    "type": "out_of_range",
                    "cell": where,
                    "detail": f"Range {tgt_sheet}!{c1}{r1}:{c2}{r2} extends far beyond {tgt_sheet} data ({tgt_rows} rows)",
                    "formula": fstr[:100],
                })
    else:
        # Same-sheet reference check (only when no cross-sheet range is present)
        for c1, r1, c2, r2 in _SCAN_LOCAL_RANGE.findall(fstr):
            last_row = int(r2)
            if last_row > max_data_row * 3 and last_row > 100:
                found.append({
                    "type": "out_of_range",
                    "cell": where,
                    "detail": f"Range {c1}{r1}:{c2}{r2} extends far beyond data ({max_data_row} rows)",
                    "formula": fstr[:100],
                })

    # --- Header row inclusion ---
    agg_match = _SCAN_HEADER_AGGREGATE.search(fstr)
    if agg_match and cell.row > 1:
        found.append({
            "type": "header_included",
            "cell": where,
            "detail": f"{agg_match.group(1)}() starts at row 1 (header row)",
            "formula": fstr[:100],
        })

    # --- Insufficient aggregate range ---
    for m in _SCAN_SMALL_AGGREGATE.finditer(fstr):
        func, _, r1s, _, r2s = m.groups()
        span = abs(int(r2s) - int(r1s)) + 1
        if span <= 2:
            found.append({
                "type": "small_aggregate",
                "cell": where,
                "detail": f"{func}() covers only {span} cell(s)",
                "formula": fstr[:100],
            })

    return found


def _scan_column_consistency(
    sheet: str, col: str, entries: List[Tuple[int, str]]
) -> List[Dict[str, str]]:
    """Return findings for formulas whose shape deviates from the dominant one in a column."""
    if len(entries) < 3:
        return []

    # Group formulas by shape, i.e. with row numbers masked out.
    shapes: Dict[str, List[Tuple[int, str]]] = defaultdict(list)
    for row_num, formula in entries:
        shapes[_SCAN_ROW_NUMBER.sub(r'\1#', formula)].append((row_num, formula))

    if len(shapes) <= 1:
        return []

    dominant = max(shapes, key=lambda k: len(shapes[k]))
    dominant_count = len(shapes[dominant])

    found: List[Dict[str, str]] = []
    for shape, members in shapes.items():
        if shape == dominant:
            continue
        # Only rare shapes (at most two rows, fewer than the dominant) are flagged.
        if len(members) <= 2 and len(members) < dominant_count:
            for row_num, formula in members:
                found.append({
                    "type": "inconsistent_pattern",
                    "cell": f"{sheet}!{col}{row_num}",
                    "detail": f"Formula differs from {dominant_count} other rows in column {col}",
                    "formula": formula[:100],
                })
    return found


@cmd("scan")
def cmd_scan(argv: Sequence[str]) -> int:
    """Detect reference anomalies in formulas.

    Reports four kinds of findings: ranges reaching far past the data
    (``out_of_range``), aggregates that start at row 1 (``header_included``),
    aggregates over one or two cells (``small_aggregate``) and formulas that
    differ from the dominant shape in their column (``inconsistent_pattern``).

    Args:
        argv: Command arguments: the workbook path.

    Returns:
        1 when the file is missing or any finding was produced, else 0. The
        JSON report printed to stdout lists at most 60 findings.
    """
    parser = argparse.ArgumentParser(
        prog="xlsx.py scan",
        description="Reference anomaly detection",
    )
    parser.add_argument("file", help="Excel file path")
    args = parser.parse_args(argv)

    path = Path(args.file)
    if not path.exists():
        print(json.dumps({"error": f"File not found: {path}"}))
        return 1

    findings: List[Dict[str, str]] = []
    wb = load_workbook(str(path), data_only=False)
    try:
        # Pre-collect extents for every sheet (used for cross-sheet ref check)
        sheet_max_rows: Dict[str, int] = {sn: wb[sn].max_row or 1 for sn in wb.sheetnames}
        sheet_max_cols: Dict[str, int] = {sn: wb[sn].max_column or 1 for sn in wb.sheetnames}

        for sname in wb.sheetnames:
            ws = wb[sname]
            max_data_row = sheet_max_rows[sname]
            max_data_col = sheet_max_cols[sname]

            # Collect formulas by column for pattern analysis
            col_formulas: Dict[str, List[Tuple[int, str]]] = defaultdict(list)

            for row in ws.iter_rows(min_row=1, max_row=max_data_row,
                                    min_col=1, max_col=max_data_col):
                for c in row:
                    if not is_formula(c.value):
                        continue
                    fstr = str(c.value)
                    col_formulas[get_column_letter(c.column)].append((c.row, fstr))
                    findings.extend(
                        _scan_formula_findings(sname, c, fstr, max_data_row, sheet_max_rows)
                    )

            # --- Inconsistent formula patterns within same column ---
            for col, entries in col_formulas.items():
                findings.extend(_scan_column_consistency(sname, col, entries))
    finally:
        wb.close()

    result: Dict[str, Any] = {
        "total_findings": len(findings),
        "by_type": {},
    }
    for finding in findings:
        kind = finding["type"]
        result["by_type"][kind] = result["by_type"].get(kind, 0) + 1

    if findings:
        result["findings"] = findings[:60]

    print(json.dumps(result, indent=2, ensure_ascii=False))
    return 1 if findings else 0
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════
|
|
# Section 4: inspect — Structure analysis → JSON
|
|
# ═══════════════════════════════════════════════════════════════
|
|
|
|
@cmd("inspect")
def cmd_inspect(argv: Sequence[str]) -> int:
    """Run the ``inspect`` command: describe every sheet of a workbook as JSON.

    For each sheet the report gives the name, the overall range, row and
    column counts, the number of non-empty rows below row 1, the number of
    formulas below row 1, the chart count, and the non-empty values of row 1
    as table headers.

    Args:
        argv: Command arguments: the workbook path and an optional
            ``--pretty`` flag for indented output.

    Returns:
        1 when the file does not exist, otherwise 0.
    """
    cli = argparse.ArgumentParser(
        prog="xlsx.py inspect",
        description="Analyse file structure → JSON",
    )
    cli.add_argument("file", help="Excel file path")
    cli.add_argument("--pretty", action="store_true", help="Pretty-print JSON")
    opts = cli.parse_args(argv)

    target = Path(opts.file)
    if not target.exists():
        print(json.dumps({"error": f"File not found: {target}"}))
        return 1

    book = load_workbook(str(target), data_only=False, read_only=False)
    sheets_info: List[Dict[str, Any]] = []

    for title in book.sheetnames:
        sheet = book[title]
        n_rows = sheet.max_row or 0
        n_cols = sheet.max_column or 0
        populated = n_rows > 0 and n_cols > 0

        # First-row values as strings; empty cells stay None.
        first_row: List[Optional[str]] = []
        if populated:
            first_row = [
                None if head.value is None else str(head.value)
                for head in sheet[1]
            ]

        extent = f"A1:{get_column_letter(n_cols)}{n_rows}" if populated else None

        # Tally formulas and non-empty rows, skipping row 1.
        n_formulas = 0
        n_data_rows = 0
        for cells in sheet.iter_rows(min_row=2, max_row=n_rows, max_col=n_cols):
            contents = [c.value for c in cells]
            n_formulas += sum(1 for v in contents if is_formula(v))
            if any(v is not None for v in contents):
                n_data_rows += 1

        # Charts are read from the worksheet's private ``_charts`` list when present.
        n_charts = len(sheet._charts) if hasattr(sheet, "_charts") else 0

        named_headers = [h for h in first_row if h is not None]
        sheets_info.append({
            "name": title,
            "dataRange": extent,
            "rows": n_rows,
            "columns": n_cols,
            "dataRows": n_data_rows,
            "formulaCount": n_formulas,
            "chartCount": n_charts,
            "tables": [{"headers": named_headers}] if named_headers else [],
        })

    book.close()

    print(json.dumps(
        {"sheets": sheets_info},
        indent=2 if opts.pretty else None,
        ensure_ascii=False,
    ))
    return 0
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════
|
|
# Section 5: pivot — PivotTable with optional chart
|
|
# ═══════════════════════════════════════════════════════════════
|
|
|
|
def _aggregate(values: List[float], method: str) -> float:
    """Reduce *values* to one number using the named aggregation.

    Recognised methods are "sum", "count", "average", "max" and "min". Any
    other method name falls back to a sum. An empty *values* yields 0.0
    whatever the method.
    """
    if not values:
        return 0.0

    reducers = {
        "sum": sum,
        "count": lambda nums: float(len(nums)),
        "average": lambda nums: sum(nums) / len(nums),
        "max": max,
        "min": min,
    }
    return reducers.get(method, sum)(values)
|
|
|
|
|
|
@cmd("pivot")
|
|
def cmd_pivot(argv: Sequence[str]) -> int:
|
|
"""Create a PivotTable-like summary with optional chart using openpyxl."""
|
|
parser = argparse.ArgumentParser(prog="xlsx.py pivot",
|
|
description="Create PivotTable summary with optional chart")
|
|
parser.add_argument("input", help="Input Excel file")
|
|
parser.add_argument("output", help="Output Excel file")
|
|
parser.add_argument("--source", required=True, help="Source range: 'Sheet!A1:Z100'")
|
|
parser.add_argument("--values", required=True, help="Value fields: 'Revenue:sum,Units:count'")
|
|
parser.add_argument("--rows", default=None, help="Row fields: 'Product,Region'")
|
|
parser.add_argument("--cols", default=None, help="Column fields: 'Quarter'")
|
|
parser.add_argument("--filters", default=None, help="Filter fields: 'Year'")
|
|
parser.add_argument("--location", default="PivotTable!A3", help="Output location: 'Sheet!A3'")
|
|
parser.add_argument("--name", default="PivotTable1", help="PivotTable name")
|
|
parser.add_argument("--style", default="monochrome", choices=["monochrome", "finance"],
|
|
help="Visual style theme")
|
|
parser.add_argument("--chart", default=None, choices=["bar", "line", "pie"],
|
|
help="Chart type (optional)")
|
|
args = parser.parse_args(argv)
|
|
|
|
input_path = Path(args.input)
|
|
output_path = Path(args.output)
|
|
|
|
if not input_path.exists():
|
|
print(json.dumps({"error": f"Input file not found: {input_path}"}))
|
|
return 1
|
|
|
|
# Parse source range
|
|
src_sheet, src_min_col, src_min_row, src_max_col, src_max_row = parse_range(args.source)
|
|
|
|
# Parse location
|
|
loc = parse_range(args.location)
|
|
loc_sheet = loc[0]
|
|
loc_start_col = loc[1]
|
|
loc_start_row = loc[2]
|
|
|
|
# Parse value fields
|
|
value_fields: List[Tuple[str, str]] = []
|
|
for vspec in args.values.split(","):
|
|
vspec = vspec.strip()
|
|
if ":" in vspec:
|
|
fname, agg = vspec.rsplit(":", 1)
|
|
agg = agg.strip().lower()
|
|
if agg in ("avg", "average"):
|
|
agg = "average"
|
|
else:
|
|
fname = vspec
|
|
agg = "sum"
|
|
value_fields.append((fname.strip(), agg))
|
|
|
|
row_fields = [f.strip() for f in args.rows.split(",") if f.strip()] if args.rows else []
|
|
col_fields = [f.strip() for f in args.cols.split(",") if f.strip()] if args.cols else []
|
|
filter_fields = [f.strip() for f in args.filters.split(",") if f.strip()] if args.filters else []
|
|
|
|
# Load workbook
|
|
wb = load_workbook(str(input_path))
|
|
if src_sheet not in wb.sheetnames:
|
|
print(json.dumps({"error": f"Source sheet '{src_sheet}' not found"}))
|
|
return 1
|
|
|
|
ws_src = wb[src_sheet]
|
|
|
|
# Read headers from first row of source
|
|
headers: List[str] = []
|
|
for col_idx in range(src_min_col, src_max_col + 1):
|
|
val = ws_src.cell(row=src_min_row, column=col_idx).value
|
|
headers.append(str(val) if val is not None else f"Col{col_idx}")
|
|
|
|
# Build column index map
|
|
col_map = {h: i for i, h in enumerate(headers)}
|
|
|
|
# Validate field names
|
|
all_fields = row_fields + col_fields + [vf[0] for vf in value_fields] + filter_fields
|
|
for f in all_fields:
|
|
if f not in col_map:
|
|
print(json.dumps({"error": f"Field '{f}' not found. Available: {headers}"}))
|
|
return 1
|
|
|
|
# Read data rows
|
|
data_rows: List[Dict[str, Any]] = []
|
|
for row_idx in range(src_min_row + 1, src_max_row + 1):
|
|
row_data: Dict[str, Any] = {}
|
|
for col_idx in range(src_min_col, src_max_col + 1):
|
|
h = headers[col_idx - src_min_col]
|
|
row_data[h] = ws_src.cell(row=row_idx, column=col_idx).value
|
|
if any(v is not None for v in row_data.values()):
|
|
data_rows.append(row_data)
|
|
|
|
# Aggregate data
|
|
def make_key(row: Dict[str, Any], fields: List[str]) -> Tuple[str, ...]:
|
|
return tuple(str(row.get(f, "")) for f in fields)
|
|
|
|
group_fields = row_fields + col_fields
|
|
groups: Dict[Tuple[str, ...], Dict[str, List[float]]] = defaultdict(lambda: defaultdict(list))
|
|
|
|
for row in data_rows:
|
|
key = make_key(row, group_fields)
|
|
for vname, _ in value_fields:
|
|
val = row.get(vname)
|
|
if isinstance(val, (int, float)):
|
|
groups[key][vname].append(float(val))
|
|
|
|
# Create or get output sheet
|
|
if loc_sheet and loc_sheet in wb.sheetnames:
|
|
ws_out = wb[loc_sheet]
|
|
elif loc_sheet:
|
|
ws_out = wb.create_sheet(loc_sheet)
|
|
else:
|
|
ws_out = wb.create_sheet("PivotTable")
|
|
|
|
# ---- Styling ----
|
|
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
|
|
|
|
# --- Font resolution (mirrors templates/base.py logic) ---
|
|
_platform_hints = {
|
|
"Darwin": {"PingFang SC", "Hiragino Sans GB"},
|
|
"Windows": {"Microsoft YaHei", "SimHei"},
|
|
"Linux": {"Noto Sans CJK SC", "WenQuanYi Micro Hei"},
|
|
}
|
|
_cjk_chain = ["PingFang SC", "Microsoft YaHei", "Noto Sans CJK SC",
|
|
"Hiragino Sans GB", "Source Han Sans SC", "SimHei"]
|
|
_avail = _platform_hints.get(platform.system(), set())
|
|
_font_name = next((f for f in _cjk_chain if f in _avail), _cjk_chain[0])
|
|
_heavy = {"SimHei", "Microsoft YaHei", "PingFang SC", "Noto Sans CJK SC",
|
|
"Source Han Sans SC", "Hiragino Sans GB", "WenQuanYi Micro Hei"}
|
|
_header_bold = _font_name not in _heavy
|
|
|
|
if args.style == "finance":
|
|
header_fill_color = "1B2A4A" # PRIMARY from design token
|
|
alt_row_color = "D6E4F0" # PRIMARY_LIGHT
|
|
else: # monochrome
|
|
header_fill_color = "333333"
|
|
alt_row_color = "F5F5F5"
|
|
|
|
header_fill = PatternFill(start_color=header_fill_color,
|
|
end_color=header_fill_color, fill_type="solid")
|
|
header_font = Font(name=_font_name, color="FFFFFF", bold=_header_bold, size=11)
|
|
data_font = Font(name=_font_name, size=11)
|
|
alt_fill = PatternFill(start_color=alt_row_color,
|
|
end_color=alt_row_color, fill_type="solid")
|
|
border = Border(bottom=Side(style="thin", color="D0D0D0"))
|
|
|
|
# Determine if cross-matrix mode (--cols provided)
|
|
use_cross_matrix = len(col_fields) > 0
|
|
|
|
if use_cross_matrix:
|
|
# ── Cross-matrix mode: row_fields as rows, col_fields expanded as columns ──
|
|
# Collect unique column dimension values
|
|
col_dim_values: List[str] = []
|
|
seen_col_vals: set = set()
|
|
for row in data_rows:
|
|
cv = str(row.get(col_fields[0], ""))
|
|
if cv not in seen_col_vals:
|
|
seen_col_vals.add(cv)
|
|
col_dim_values.append(cv)
|
|
col_dim_values.sort()
|
|
|
|
# Build cross-matrix groups: key = row_fields only, sub-key = col_dim value
|
|
cross_groups: Dict[Tuple[str, ...], Dict[str, Dict[str, List[float]]]] = defaultdict(
|
|
lambda: defaultdict(lambda: defaultdict(list))
|
|
)
|
|
for row in data_rows:
|
|
rkey = make_key(row, row_fields)
|
|
cval = str(row.get(col_fields[0], ""))
|
|
for vname, _ in value_fields:
|
|
val = row.get(vname)
|
|
if isinstance(val, (int, float)):
|
|
cross_groups[rkey][cval][vname].append(float(val))
|
|
|
|
# Build output headers: row_fields + (col_val - agg_name) for each combination
|
|
out_headers: List[str] = list(row_fields)
|
|
for cv in col_dim_values:
|
|
for vname, agg in value_fields:
|
|
if len(value_fields) == 1:
|
|
out_headers.append(f"{cv}")
|
|
else:
|
|
out_headers.append(f"{cv} ({vname} {agg})")
|
|
|
|
# Write headers
|
|
r = loc_start_row
|
|
for i, h in enumerate(out_headers):
|
|
cell = ws_out.cell(row=r, column=loc_start_col + i, value=h)
|
|
cell.fill = header_fill
|
|
cell.font = header_font
|
|
cell.alignment = Alignment(horizontal="center", vertical="center")
|
|
|
|
# Sort row keys
|
|
sorted_row_keys = sorted(cross_groups.keys())
|
|
|
|
# Write data rows
|
|
for idx, rkey in enumerate(sorted_row_keys):
|
|
r += 1
|
|
for i, val in enumerate(rkey):
|
|
cell = ws_out.cell(row=r, column=loc_start_col + i, value=val)
|
|
cell.font = data_font
|
|
cell.border = border
|
|
if idx % 2 == 1:
|
|
cell.fill = alt_fill
|
|
|
|
col_offset = len(row_fields)
|
|
for cv in col_dim_values:
|
|
for vname, agg in value_fields:
|
|
vals = cross_groups[rkey].get(cv, {}).get(vname, [])
|
|
agg_result = _aggregate(vals, agg) if vals else 0
|
|
cell = ws_out.cell(row=r, column=loc_start_col + col_offset,
|
|
value=round(agg_result, 2))
|
|
cell.font = data_font
|
|
cell.border = border
|
|
cell.number_format = "#,##0.00"
|
|
if idx % 2 == 1:
|
|
cell.fill = alt_fill
|
|
col_offset += 1
|
|
|
|
sorted_keys = sorted_row_keys
|
|
total_data_rows_for_chart = len(sorted_row_keys)
|
|
|
|
else:
|
|
# ── Flat mode: no --cols, original behavior ──
|
|
out_headers: List[str] = list(row_fields)
|
|
for vname, agg in value_fields:
|
|
out_headers.append(f"{vname} ({agg})")
|
|
|
|
# Write headers
|
|
r = loc_start_row
|
|
for i, h in enumerate(out_headers):
|
|
cell = ws_out.cell(row=r, column=loc_start_col + i, value=h)
|
|
cell.fill = header_fill
|
|
cell.font = header_font
|
|
cell.alignment = Alignment(horizontal="center", vertical="center")
|
|
|
|
# Sort keys for consistent output
|
|
sorted_keys = sorted(groups.keys())
|
|
|
|
# Write data rows
|
|
for idx, key in enumerate(sorted_keys):
|
|
r += 1
|
|
# Row fields
|
|
for i, val in enumerate(key[:len(row_fields)]):
|
|
cell = ws_out.cell(row=r, column=loc_start_col + i, value=val)
|
|
cell.font = data_font
|
|
cell.border = border
|
|
if idx % 2 == 1:
|
|
cell.fill = alt_fill
|
|
|
|
# Value fields
|
|
for i, (vname, agg) in enumerate(value_fields):
|
|
vals = groups[key].get(vname, [])
|
|
agg_result = _aggregate(vals, agg)
|
|
col_offset = len(row_fields) + i
|
|
cell = ws_out.cell(row=r, column=loc_start_col + col_offset,
|
|
value=round(agg_result, 2))
|
|
cell.font = data_font
|
|
cell.border = border
|
|
cell.number_format = "#,##0.00"
|
|
if idx % 2 == 1:
|
|
cell.fill = alt_fill
|
|
|
|
total_data_rows_for_chart = len(sorted_keys)
|
|
|
|
# Auto-adjust column widths (data-driven, headers wrap if too wide)
|
|
try:
|
|
from templates.base import auto_fit_columns
|
|
auto_fit_columns(ws_out, min_width=10, max_width=28,
|
|
header_row=loc_start_row, data_start_row=loc_start_row + 1)
|
|
except ImportError:
|
|
# Fallback: old logic
|
|
for i, h in enumerate(out_headers):
|
|
col_letter = get_column_letter(loc_start_col + i)
|
|
ws_out.column_dimensions[col_letter].width = max(len(str(h)) + 4, 14)
|
|
|
|
# Hide gridlines
|
|
ws_out.sheet_view.showGridLines = False
|
|
|
|
# Add chart if requested
|
|
has_chart = False
|
|
total_data_rows = total_data_rows_for_chart
|
|
if args.chart and total_data_rows > 0:
|
|
from openpyxl.chart import BarChart, LineChart, PieChart, Reference
|
|
|
|
chart_type_map = {
|
|
"bar": BarChart,
|
|
"line": LineChart,
|
|
"pie": PieChart,
|
|
}
|
|
ChartClass = chart_type_map.get(args.chart, BarChart)
|
|
chart = ChartClass()
|
|
chart.title = args.name or "PivotTable Summary"
|
|
chart.style = 10
|
|
|
|
data_ref = Reference(
|
|
ws_out,
|
|
min_col=loc_start_col + len(row_fields),
|
|
min_row=loc_start_row,
|
|
max_col=loc_start_col + len(out_headers) - 1,
|
|
max_row=loc_start_row + total_data_rows,
|
|
)
|
|
cats_ref = Reference(
|
|
ws_out,
|
|
min_col=loc_start_col,
|
|
min_row=loc_start_row + 1,
|
|
max_row=loc_start_row + total_data_rows,
|
|
)
|
|
|
|
chart.add_data(data_ref, titles_from_data=True)
|
|
chart.set_categories(cats_ref)
|
|
|
|
if isinstance(chart, BarChart):
|
|
chart.type = "col"
|
|
|
|
# Pie chart: prevent label overlap with bestFit + leader lines
|
|
if isinstance(chart, PieChart):
|
|
from openpyxl.chart.label import DataLabelList
|
|
chart.dataLabels = DataLabelList()
|
|
chart.dataLabels.dLblPos = 'bestFit'
|
|
chart.dataLabels.showLeaderLines = True
|
|
chart.dataLabels.showCatName = True
|
|
chart.dataLabels.showPercent = True
|
|
chart.dataLabels.showVal = False
|
|
chart.dataLabels.showSerName = False
|
|
|
|
# Anchor with enough vertical offset to avoid chart-to-chart overlap
|
|
# ~15 rows per chart height; leave 2 extra rows gap
|
|
chart_row_offset = 17
|
|
chart_anchor = ws_out.cell(
|
|
row=loc_start_row + total_data_rows + 3,
|
|
column=loc_start_col,
|
|
).coordinate
|
|
ws_out.add_chart(chart, chart_anchor)
|
|
has_chart = True
|
|
|
|
wb.save(str(output_path))
|
|
wb.close()
|
|
|
|
# [Fix ③] Auto-recalc after pivot if chart was created, so chart data cache is populated
|
|
if has_chart:
|
|
try:
|
|
_run_libreoffice_recalc_best_effort(str(output_path))
|
|
except Exception:
|
|
pass
|
|
|
|
print(json.dumps({
|
|
"status": "success",
|
|
"output": str(output_path),
|
|
"pivot_rows": total_data_rows,
|
|
"fields": {
|
|
"rows": row_fields,
|
|
"columns": col_fields,
|
|
"values": [f"{v}:{a}" for v, a in value_fields],
|
|
"filters": filter_fields,
|
|
},
|
|
"chart": args.chart or "none",
|
|
}, indent=2, ensure_ascii=False))
|
|
return 0
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════
# Section 6: chart-verify — Verify chart data content
# ═══════════════════════════════════════════════════════════════
|
|
|
|
def _chart_title_text(chart: Any) -> str:
    """Return the plain-text title of *chart*, or ``"untitled"``.

    The title is assembled from the text runs found under
    ``chart.title.tx.rich.paragraphs``. Any missing link in that attribute
    chain (or a non-iterable run list) yields ``"untitled"``.
    """
    if not chart.title:
        return "untitled"
    try:
        parts = [
            run.t
            for para in chart.title.tx.rich.paragraphs
            for run in para.r
            if getattr(run, "t", None)
        ]
    except (AttributeError, TypeError):
        # Title is not rich text (or a paragraph has no runs).
        return "untitled"
    return "".join(parts) if parts else "untitled"


def _series_has_data(series: Any) -> bool:
    """Return True when *series* carries cached or literal numeric values.

    A series counts as populated when its ``val`` reference has either a
    ``numRef.numCache`` with a positive ``ptCount`` or a truthy ``numLit``.
    """
    ref = getattr(series, "val", None)
    if not ref:
        return False

    num_ref = getattr(ref, "numRef", None)
    if num_ref:
        cache = getattr(num_ref, "numCache", None)
        pt_count = getattr(cache, "ptCount", None) if cache else None
        if pt_count and pt_count > 0:
            return True

    return bool(getattr(ref, "numLit", None))


def _check_charts(filepath: str) -> Tuple[List[Dict], List[Dict]]:
    """Core chart verification logic. Returns (ok_charts, empty_charts).

    Every chart on every sheet of the workbook at *filepath* is classified
    by whether at least one of its series has data. Each entry in the two
    returned lists is ``{"sheet": <sheet name>, "title": <chart title>}``.

    The workbook is closed even if inspection raises.
    """
    ok_charts: List[Dict[str, str]] = []
    empty_charts: List[Dict[str, str]] = []

    wb = load_workbook(filepath)
    try:
        for sname in wb.sheetnames:
            ws = wb[sname]
            # Sheets without a ``_charts`` attribute are simply skipped.
            for chart in getattr(ws, "_charts", ()):
                entry = {"sheet": sname, "title": _chart_title_text(chart)}
                has_data = any(
                    _series_has_data(series)
                    for series in getattr(chart, "series", ())
                )
                (ok_charts if has_data else empty_charts).append(entry)
    finally:
        wb.close()

    return ok_charts, empty_charts
|
|
|
|
|
|
@cmd("chart-verify")
def cmd_chart_verify(argv: Sequence[str]) -> int:
    """Verify all charts have actual data content.

    Prints a JSON summary (total / with data / empty, plus the per-chart
    lists) to stdout.

    If any chart looks empty and ``--no-auto-recalc`` was not given, a
    LibreOffice recalc is attempted and the charts are checked again. The
    recalc is best-effort: on failure a warning goes to stderr and the
    result of the first check is reported.

    Args:
        argv: Command-line arguments following the ``chart-verify`` command.

    Returns:
        1 if the file does not exist or any chart is empty, otherwise 0
        (including when the workbook has no charts at all).
    """
    parser = argparse.ArgumentParser(prog="xlsx.py chart-verify",
                                     description="Verify all charts have actual data")
    parser.add_argument("file", help="Excel file path")
    parser.add_argument("--no-auto-recalc", action="store_true",
                        help="Disable automatic recalc before checking")
    args = parser.parse_args(argv)

    path = Path(args.file)
    if not path.exists():
        print(json.dumps({"error": f"File not found: {path}"}))
        return 1

    ok_charts, empty_charts = _check_charts(str(path))

    if empty_charts and not args.no_auto_recalc:
        try:
            _run_libreoffice_recalc_best_effort(str(path))
            # Tuple assignment only happens if the re-check succeeds, so a
            # failure here leaves the pre-recalc result intact.
            ok_charts, empty_charts = _check_charts(str(path))
        except Exception as exc:
            # Best-effort step: report instead of silently swallowing.
            print(f"Warning: auto-recalc before chart check failed: {exc}",
                  file=sys.stderr)

    total_charts = len(ok_charts) + len(empty_charts)

    result: Dict[str, Any] = {
        "total_charts": total_charts,
        "charts_with_data": len(ok_charts),
        "empty_charts": len(empty_charts),
    }
    if empty_charts:
        result["empty"] = empty_charts
    if ok_charts:
        result["ok"] = ok_charts

    print(json.dumps(result, indent=2, ensure_ascii=False))

    if total_charts == 0:
        print("No charts found in workbook.", file=sys.stderr)
    return 1 if empty_charts else 0
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════
# Section 7: validate — Structural validation
# ═══════════════════════════════════════════════════════════════
|
|
|
|
def _formula_cell_issues(sname: str, c: Any,
                         forbidden_patterns: Sequence[Tuple[str, Any]]) -> List[Dict[str, str]]:
    """Return the validation issues found in one formula cell.

    Args:
        sname: Name of the sheet the cell belongs to.
        c: The cell; the caller has already confirmed ``is_formula(c.value)``.
        forbidden_patterns: ``(function name, compiled regex)`` pairs, built
            once per run by the caller.

    Issues are returned in a fixed order: forbidden functions, the two
    ``text_as_formula`` heuristics, then external file references.
    """
    raw = str(c.value)
    fstr = raw.upper()
    found: List[Dict[str, str]] = []

    # Forbidden functions
    for func, pattern in forbidden_patterns:
        if pattern.search(fstr):
            found.append({
                "type": "forbidden_function",
                "cell": cell_ref(sname, c),
                "function": func,
                "detail": f"{func}() is not supported in Excel 2019 and earlier",
                "formula": raw[:100],
            })

    is_formula_cell = getattr(c, "data_type", None) == "f"
    starts_with_eq = raw.startswith("=")

    # Text that starts with '=' but is not stored as a formula and contains
    # none of the usual formula punctuation.
    if (starts_with_eq and not is_formula_cell
            and not any(ch in raw for ch in "+-*/()&,!:$")):
        found.append({
            "type": "text_as_formula",
            "cell": cell_ref(sname, c),
            "detail": "Text starts with '=' — may be misinterpreted as formula",
            "value": raw[:80],
        })

    # Stored as a formula, but the body matches no valid formula element.
    # A body that starts with a quoted string is treated as intentional.
    if is_formula_cell and starts_with_eq:
        body = raw[1:]
        if (not body.strip().startswith('"')
                and not _VALID_FORMULA_PATTERN.search(body)):
            found.append({
                "type": "text_as_formula",
                "cell": cell_ref(sname, c),
                "detail": "Cell stored as formula but contains no function calls or cell references — likely text starting with '='",
                "value": raw[:80],
            })

    # External file references. The pattern is matched against the
    # upper-cased formula; each match is unpacked as a 2-tuple whose first
    # item is reported as the file (assumes the pattern has two groups).
    for ext_file, _ in _EXT_FILE_REF_PATTERN.findall(fstr):
        found.append({
            "type": "external_file_ref",
            "cell": cell_ref(sname, c),
            "detail": f"Formula references external file: {ext_file}",
            "formula": raw[:100],
        })

    return found


def _rels_path_issues(path: Path) -> List[Dict[str, str]]:
    """Scan the ``.rels`` parts of the package at *path* for local paths.

    Flags absolute Windows paths (``Target="X:\\...``, any letter case) and
    ``file:///`` references. Inspection is best-effort: if the archive
    cannot be read, a warning is printed to stderr and whatever was found
    up to that point is returned.
    """
    # Drive letters are case-insensitive on Windows, so accept both cases.
    abs_target = re.compile(r'Target="[A-Za-z]:\\')
    found: List[Dict[str, str]] = []
    try:
        with zipfile.ZipFile(str(path), "r") as zf:
            for name in zf.namelist():
                if not name.endswith(".rels"):
                    continue
                content = zf.read(name).decode("utf-8", errors="ignore")
                if abs_target.search(content):
                    found.append({
                        "type": "absolute_path",
                        "file": name,
                        "detail": "Absolute Windows path in .rels file — causes Excel crash",
                    })
                if "file:///" in content.lower():
                    found.append({
                        "type": "local_file_ref",
                        "file": name,
                        "detail": "Local file:// reference in .rels — may cause security warning",
                    })
    except Exception as exc:
        # Best-effort: never fail validation because of this extra check.
        print(f"Warning: could not inspect .rels files: {exc}", file=sys.stderr)
    return found


@cmd("validate")
def cmd_validate(argv: Sequence[str]) -> int:
    """Structural validation: forbidden functions, formula hygiene, schema basics.

    Checks every formula cell for forbidden functions, text that looks
    like it was accidentally stored as a formula, and external file
    references. It then scans the package's ``.rels`` parts for absolute
    Windows paths and ``file:///`` targets. A JSON report is printed to
    stdout; at most the first 50 issues are listed in it.

    Args:
        argv: Command-line arguments following the ``validate`` command.

    Returns:
        0 when no issues were found, 1 otherwise (or if the file is missing).
    """
    parser = argparse.ArgumentParser(prog="xlsx.py validate",
                                     description="Structural validation (forbidden funcs, schema)")
    parser.add_argument("file", help="Excel file path")
    args = parser.parse_args(argv)

    path = Path(args.file)
    if not path.exists():
        print(json.dumps({"error": f"File not found: {path}"}))
        return 1

    # Compile the forbidden-function patterns once instead of per cell.
    forbidden_patterns = [
        (func, re.compile(rf'\b{func}\s*\(', re.IGNORECASE))
        for func in FORBIDDEN_FUNCTIONS
    ]

    issues: List[Dict[str, str]] = []

    wb = load_workbook(str(path), data_only=False)
    try:
        for sname in wb.sheetnames:
            for row in wb[sname].iter_rows():
                for c in row:
                    if is_formula(c.value):
                        issues.extend(_formula_cell_issues(sname, c, forbidden_patterns))
    finally:
        wb.close()

    issues.extend(_rels_path_issues(path))

    by_type: Dict[str, int] = defaultdict(int)
    for iss in issues:
        by_type[iss["type"]] += 1

    result: Dict[str, Any] = {
        "status": "passed" if not issues else "failed",
        "total_issues": len(issues),
        "by_type": dict(by_type),
    }
    if issues:
        result["issues"] = issues[:50]

    print(json.dumps(result, indent=2, ensure_ascii=False))
    return 0 if not issues else 1
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════
# Section 8: CLI entry point
# ═══════════════════════════════════════════════════════════════
|
|
|
|
_HELP_TEXT = """\
|
|
xlsx.py — Unified Excel Quality Assurance & Manipulation Tool
|
|
|
|
Usage: python3 xlsx.py <command> [args...]
|
|
|
|
Commands:
|
|
recalc <xlsx> [timeout] Recalculate formulas via LibreOffice + error scan
|
|
audit <xlsx> Formula error + zero-value + implicit array detection
|
|
scan <xlsx> Reference anomaly detection
|
|
inspect <xlsx> [--pretty] Structure analysis → JSON
|
|
pivot <in> <out> --source ... [options] PivotTable with optional chart
|
|
chart-verify <xlsx> [--no-auto-recalc] Verify chart data content
|
|
validate <xlsx> Structural validation
|
|
|
|
Run 'python3 xlsx.py <command> --help' for command-specific options.
|
|
"""
|
|
|
|
|
|
def main() -> int:
    """Dispatch ``sys.argv`` to the registered command handler.

    With no command, or with ``--help`` / ``-h`` as the first argument,
    the help text is printed to stdout and 0 is returned. An unknown
    command prints an error plus the help text to stderr and returns 1.
    Otherwise the handler registered in ``_COMMANDS`` is called with the
    remaining arguments and its return value is passed through.
    """
    argv = sys.argv
    wants_help = len(argv) < 2 or argv[1] in ("--help", "-h")
    if wants_help:
        print(_HELP_TEXT)
        return 0

    name, remaining = argv[1], argv[2:]
    handler = _COMMANDS.get(name)
    if handler is not None:
        return handler(remaining)

    print(f"Unknown command: {name}\n", file=sys.stderr)
    print(_HELP_TEXT, file=sys.stderr)
    return 1
|
|
|
|
|
|
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|