808 lines
30 KiB
Python
Executable File
808 lines
30 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
"""
|
||
postcheck.py — Document business rule self-check script
|
||
|
||
Unlike traditional OpenXML Schema validation, this script does not check XML legality.
|
||
Instead, it checks document "visual quality" and "typesetting correctness" — issues visible to the human eye.
|
||
|
||
Usage:
|
||
python3 postcheck.py output.docx [--json] [--strict]
|
||
|
||
Checks:
|
||
1. Blank page detection — trailing/middle excess blank pages, double page breaks, consecutive empty paragraphs
|
||
2. Line spacing consistency — whether body paragraph line spacing is uniform
|
||
3. Table margins — whether cells have padding set
|
||
4. Table pagination control — whether header rows have tblHeader set, data rows have cantSplit
|
||
5. Image overflow — whether image width exceeds page usable area
|
||
6. Font fallback — whether fonts are used that may be missing on target systems
|
||
7. CJK indentation — whether Chinese body text has first-line indent (excluding table cells, lists, centered paragraphs)
|
||
8. Heading level continuity — whether headings skip levels (H1→H3 skipping H2)
|
||
9. Numbering continuity — whether numbered lists have gaps
|
||
10. Cover separation — whether cover and body are in different sections
|
||
11. ShadingType — whether SOLID is misused causing black cells
|
||
12. TOC quality — whether TOC field exists, whether headings use standard Heading styles
|
||
13. Image aspect ratio — whether images are stretched/distorted
|
||
14. Document cleanliness — whether placeholder text, Markdown syntax, or draft expressions remain
|
||
15. Report content quality — whether summary exists, whether titles are specific, whether vague conclusions are used
|
||
"""
|
||
|
||
import zipfile
|
||
import sys
|
||
import json
|
||
import re
|
||
from pathlib import Path
|
||
from xml.etree import ElementTree as ET
|
||
|
||
# Prefix → namespace URI map passed to ElementTree find/findall calls in this module.
NS = dict(
    w="http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    r="http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
    a="http://schemas.openxmlformats.org/drawingml/2006/main",
    pic="http://schemas.openxmlformats.org/drawingml/2006/picture",
)
|
||
|
||
|
||
class CheckResult:
    """Outcome of a single document check.

    Holds four public fields: ``name`` (check identifier), ``passed``
    (whether the check succeeded), ``message`` (human-readable detail) and
    ``severity`` ("error" | "warning" | "info").
    """

    def __init__(self, name: str, passed: bool, message: str, severity: str = "warning"):
        self.name, self.passed = name, passed
        self.message, self.severity = message, severity

    def to_dict(self):
        """Return the four fields as a plain dict (used for JSON output)."""
        return {
            field: getattr(self, field)
            for field in ("name", "passed", "message", "severity")
        }

    def __str__(self):
        """Format as ``<icon> [name] message``; the icon reflects pass / error / other."""
        if self.passed:
            icon = "✅"
        elif self.severity == "error":
            icon = "❌"
        else:
            icon = "⚠️"
        return f"{icon} [{self.name}] {self.message}"
|
||
|
||
|
||
def read_document_xml(docx_path: str) -> ET.Element:
    """Open the DOCX archive, parse ``word/document.xml`` and return its root element."""
    with zipfile.ZipFile(docx_path, "r") as archive:
        document_bytes = archive.read("word/document.xml")
    return ET.fromstring(document_bytes)
|
||
|
||
|
||
def get_sections(root: ET.Element) -> list:
    """Split the body's direct children into sections delimited by ``sectPr``.

    Returns a list of ``{"children": [...], "sectPr": element}`` dicts in
    document order. A section is closed either by a body-level ``sectPr``
    child or by a child that contains a ``w:pPr/w:sectPr`` descendant; in the
    latter case that child is included as the last entry of ``children``.

    Returns ``[]`` when there is no ``w:body``. Children that follow the last
    section boundary are only emitted when the body has a direct ``w:sectPr``
    child; if the body has no ``sectPr`` at all they are not returned.
    """
    body = root.find(".//w:body", NS)
    if body is None:
        return []

    sections = []
    pending = []

    def close_section(sect_pr):
        # Emit everything collected so far as one section and start a new one.
        nonlocal pending
        sections.append({"children": pending, "sectPr": sect_pr})
        pending = []

    for node in body:
        # Compare on the local tag name (text after the last "}").
        if node.tag.rsplit("}", 1)[-1] == "sectPr":
            close_section(node)
            continue

        pending.append(node)
        # A section break may also be carried inside paragraph properties.
        inline_sect = node.find(".//w:pPr/w:sectPr", NS)
        if inline_sect is not None:
            close_section(inline_sect)

    # Leftover children (after the last boundary) belong to the body-level sectPr.
    trailing_sect = body.find("w:sectPr", NS)
    if pending and trailing_sect is not None:
        close_section(trailing_sect)

    return sections
|
||
|
||
|
||
def _paragraph_break_and_text(paragraph: ET.Element) -> tuple:
    """Return ``(has_page_break, has_text)`` for a paragraph.

    Only the first ``w:br`` and the first ``w:t`` of each run are inspected.
    """
    type_attr = f"{{{NS['w']}}}type"
    has_break = False
    has_text = False
    for run in paragraph.findall(".//w:r", NS):
        br = run.find("w:br", NS)
        if br is not None and br.get(type_attr) == "page":
            has_break = True
        t = run.find("w:t", NS)
        if t is not None and t.text and t.text.strip():
            has_text = True
    return has_break, has_text


def check_blank_pages(root: ET.Element) -> CheckResult:
    """Detect patterns that are likely to produce blank pages.

    Four patterns are checked on the body's direct ``w:p`` children:

    1. the last paragraph holds a page break and no text (error);
    2. five or more consecutive paragraphs with no text, no page break and
       no inline drawing (error);
    3. a section whose last child is a paragraph containing a page break,
       followed by a section whose ``sectPr`` has ``w:type`` = ``nextPage`` (error);
    4. other paragraphs holding only a page break, excluding each section's
       last child and the final paragraph (warning).

    Returns a failing result with severity "error" if any of 1-3 fired
    (at most three messages, joined with "; "), else severity "warning" if 4
    fired, else a passing result. A document without ``w:body`` or without
    paragraphs passes with "No paragraph content".
    """
    body = root.find(".//w:body", NS)
    # A missing body is treated like an empty one instead of raising AttributeError.
    paragraphs = body.findall("w:p", NS) if body is not None else []
    if not paragraphs:
        return CheckResult("blank-pages", True, "No paragraph content")

    type_attr = f"{{{NS['w']}}}type"
    val_attr = f"{{{NS['w']}}}val"
    para_tag = f"{{{NS['w']}}}p"

    # Issues are classified where they are found rather than by matching message text.
    hard_issues = []
    soft_issues = []

    # Check 1: the last paragraph only has a page break.
    has_break, has_text = _paragraph_break_and_text(paragraphs[-1])
    if has_break and not has_text:
        hard_issues.append("Trailing page break at document end may cause blank page")

    # Check 2: longest run of consecutive empty paragraphs.
    longest_run = 0
    longest_run_end = 0
    current_run = 0
    for idx, p in enumerate(paragraphs):
        has_any_text = any(t.text and t.text.strip() for t in p.findall(".//w:t", NS))
        has_page_br = any(br.get(type_attr) == "page" for br in p.findall(".//w:br", NS))
        has_drawing = p.find(".//wp:inline", NS) is not None
        if has_any_text or has_page_br or has_drawing:
            current_run = 0
            continue
        current_run += 1
        if current_run > longest_run:
            longest_run, longest_run_end = current_run, idx

    if longest_run >= 5:
        # longest_run_end is the 0-based index of the run's last paragraph;
        # "+ 2" converts the run's first index to a 1-based position.
        hard_issues.append(
            f"Found {longest_run} consecutive empty paragraphs (starting around paragraph "
            f"{longest_run_end - longest_run + 2}), may form visual blank page"
        )

    # Check 3: page break at the end of a section followed by a nextPage section.
    sections = get_sections(root)
    for number, (section, following) in enumerate(zip(sections, sections[1:]), start=1):
        sec_children = section["children"]
        if not sec_children:
            continue
        last_child = sec_children[-1]
        if last_child.tag != para_tag:
            continue
        # any() reports the boundary once even if the paragraph holds several page breaks.
        if not any(br.get(type_attr) == "page" for br in last_child.findall(".//w:br", NS)):
            continue
        sect_type = following["sectPr"].find("w:type", NS)
        if sect_type is not None and sect_type.get(val_attr) == "nextPage":
            hard_issues.append(
                f"Section {number} ends with PageBreak and Section {number + 1} is type nextPage, "
                f"double page break causes blank page"
            )

    # Check 4: paragraphs holding only a page break. Section-ending children are
    # skipped (they act as section separators); the last paragraph was handled in Check 1.
    section_last_children = {id(sec["children"][-1]) for sec in sections if sec["children"]}
    empty_pb_count = 0
    for p in paragraphs[:-1]:
        if id(p) in section_last_children:
            continue
        p_has_break, p_has_text = _paragraph_break_and_text(p)
        if p_has_break and not p_has_text:
            empty_pb_count += 1

    if empty_pb_count > 0:
        soft_issues.append(
            f"Found {empty_pb_count} empty paragraphs with PageBreak "
            f"(suggest attaching PageBreak to content paragraphs)"
        )

    if hard_issues:
        return CheckResult("blank-pages", False, "; ".join(hard_issues[:3]), "error")
    if soft_issues:
        return CheckResult("blank-pages", False, "; ".join(soft_issues[:3]), "warning")
    return CheckResult("blank-pages", True, "No blank page issues detected")
|
||
|
||
|
||
def check_line_spacing(root: ET.Element) -> CheckResult:
    """Check that body paragraphs use a consistent ``w:spacing/@w:line`` value.

    Every ``w:p`` under the body (at any depth) that contains non-blank text
    is counted, except paragraphs whose style id starts with "Heading" or
    equals "Title". Paragraphs without a ``w:line`` value are grouped under
    "default". The check fails (severity "warning") when more than 20% of
    the counted paragraphs differ from the most common value.

    A document without ``w:body`` or without countable paragraphs passes with
    "No body paragraphs".
    """
    body = root.find(".//w:body", NS)
    if body is None:
        # Guard against a missing body instead of raising AttributeError.
        return CheckResult("line-spacing", True, "No body paragraphs")

    val_attr = f"{{{NS['w']}}}val"
    line_attr = f"{{{NS['w']}}}line"

    spacing_counts = {}
    for p in body.findall(".//w:p", NS):
        line_val = None
        ppr = p.find("w:pPr", NS)
        if ppr is not None:
            style = ppr.find("w:pStyle", NS)
            if style is not None:
                style_id = style.get(val_attr, "")
                # Headings and the title legitimately use different spacing.
                if style_id.startswith("Heading") or style_id == "Title":
                    continue
            spacing = ppr.find("w:spacing", NS)
            if spacing is not None:
                line_val = spacing.get(line_attr)

        # Only paragraphs with visible text count towards consistency.
        if not any(t.text and t.text.strip() for t in p.findall(".//w:t", NS)):
            continue

        key = line_val or "default"
        spacing_counts[key] = spacing_counts.get(key, 0) + 1

    total = sum(spacing_counts.values())
    if total == 0:
        return CheckResult("line-spacing", True, "No body paragraphs")

    # Most common value; ties resolve to the value encountered first.
    dominant = max(spacing_counts, key=spacing_counts.get)
    if len(spacing_counts) == 1:
        return CheckResult("line-spacing", True, f"Line spacing uniform (line={dominant})")

    inconsistent = total - spacing_counts[dominant]
    if inconsistent / total > 0.2:
        return CheckResult(
            "line-spacing", False,
            f"Line spacing inconsistent: {dict(spacing_counts)}, {inconsistent}/{total} paragraphs differ from dominant spacing {dominant}",
            "warning"
        )

    return CheckResult("line-spacing", True, f"Line spacing mostly uniform (line={dominant}, {inconsistent} exceptions)")
|
||
|
||
|
||
|
||
def check_image_overflow(root: ET.Element) -> CheckResult:
    """Flag drawings whose width exceeds the usable page width by more than 5%.

    Page width and left/right margins (in twips) come from the body-level
    ``sectPr``; when absent, 11906 / 1701 / 1417 are used. Drawing widths are
    the ``cx`` attribute of ``wp:extent`` (EMU) on ``wp:inline`` and
    ``wp:anchor`` elements. Fails with severity "error" if any drawing is
    too wide.
    """
    w_ns = NS["w"]
    page_width, margin_left, margin_right = 11906, 1701, 1417  # A4 defaults

    final_sect = root.find(".//w:body/w:sectPr", NS)
    size_el = final_sect.find("w:pgSz", NS) if final_sect is not None else None
    margin_el = final_sect.find("w:pgMar", NS) if final_sect is not None else None

    if size_el is not None:
        page_width = int(size_el.get(f"{{{w_ns}}}w", "11906"))
    if margin_el is not None:
        margin_left = int(margin_el.get(f"{{{w_ns}}}left", "1701"))
        margin_right = int(margin_el.get(f"{{{w_ns}}}right", "1417"))

    # 1 twip = 635 EMU
    usable_width_emu = (page_width - margin_left - margin_right) * 635
    width_limit = usable_width_emu * 1.05  # 5% tolerance

    drawings = [*root.findall(".//wp:inline", NS), *root.findall(".//wp:anchor", NS)]
    extents = (dwg.find("wp:extent", NS) for dwg in drawings)
    oversized = sum(
        1
        for extent in extents
        if extent is not None and int(extent.get("cx", "0")) > width_limit
    )

    if oversized > 0:
        return CheckResult(
            "image-overflow", False,
            f"{oversized} images exceed page usable area",
            "error"
        )

    return CheckResult(
        "image-overflow", True,
        f"All images within page width ({len(drawings)} images)"
    )
|
||
|
||
|
||
def check_image_aspect_ratio(docx_path: str, root: ET.Element) -> CheckResult:
    """Check whether images are stretched/distorted (aspect ratio drift).

    For each ``wp:inline`` / ``wp:anchor`` drawing, the display ratio
    (``wp:extent`` cx/cy) is compared with the pixel ratio of the embedded
    image resolved through ``word/_rels/document.xml.rels``. A relative drift
    above 10% counts as distortion.

    Returns:
        - a failing "warning" result listing up to three distorted images;
        - a passing result when no distortion was found;
        - a passing "info" result when the archive cannot be processed, or
          when an image needs measuring but Pillow is not installed.

    Drawings without an extent, a blip, a resolvable relationship or a
    readable image are skipped.
    """
    import io as _io

    check_name = "image-aspect-ratio"
    rels_path = "word/_rels/document.xml.rels"
    rels_ns = "http://schemas.openxmlformats.org/package/2006/relationships"
    embed_attr = f"{{{NS['r']}}}embed"

    pil_image = None  # PIL.Image, imported lazily on the first image that needs measuring
    distorted = []

    try:
        with zipfile.ZipFile(docx_path, "r") as z:
            # Build the member set once instead of calling namelist() per image.
            members = set(z.namelist())

            # Map relationship id -> image path inside the archive.
            rid_to_path = {}
            if rels_path in members:
                rels_root = ET.fromstring(z.read(rels_path))
                for rel in rels_root.findall(f"{{{rels_ns}}}Relationship"):
                    if "image" not in rel.get("Type", ""):
                        continue
                    target = rel.get("Target", "")
                    # Relative targets are resolved against the word/ directory.
                    if target.startswith("/"):
                        img_path = target.lstrip("/")
                    else:
                        img_path = "word/" + target
                    rid_to_path[rel.get("Id", "")] = img_path

            drawings = root.findall(".//wp:inline", NS) + root.findall(".//wp:anchor", NS)

            for dwg in drawings:
                extent = dwg.find("wp:extent", NS)
                if extent is None:
                    continue
                display_cx = int(extent.get("cx", "0"))
                display_cy = int(extent.get("cy", "0"))
                if display_cx == 0 or display_cy == 0:
                    continue

                blip = dwg.find(".//a:blip", NS)
                if blip is None:
                    continue
                r_embed = blip.get(embed_attr, "")
                if not r_embed or r_embed not in rid_to_path:
                    continue

                img_zip_path = rid_to_path[r_embed]
                if img_zip_path not in members:
                    continue

                if pil_image is None:
                    try:
                        from PIL import Image as pil_image
                    except ImportError:
                        # Report that nothing was verified rather than claiming success.
                        return CheckResult(
                            check_name, True,
                            "Cannot check image aspect ratio (Pillow not installed)",
                            "info"
                        )

                # Best effort: formats Pillow cannot decode are skipped.
                try:
                    with pil_image.open(_io.BytesIO(z.read(img_zip_path))) as img:
                        orig_w, orig_h = img.size
                except Exception:
                    continue
                if orig_w == 0 or orig_h == 0:
                    continue

                orig_ratio = orig_w / orig_h
                display_ratio = display_cx / display_cy
                drift = abs(orig_ratio - display_ratio) / orig_ratio

                if drift > 0.10:  # >10% distortion
                    distorted.append(
                        f"{img_zip_path.split('/')[-1]}: "
                        f"original {orig_w}×{orig_h} (ratio={orig_ratio:.2f}), "
                        f"display ratio={display_ratio:.2f}, distortion {drift * 100:.0f}%"
                    )

    except Exception:
        return CheckResult(
            check_name, True,
            "Cannot check image aspect ratio (zip read error)",
            "info"
        )

    if distorted:
        shown = distorted[:3]
        detail = "; ".join(shown)
        if len(distorted) > len(shown):
            # Count only the entries that were not listed.
            detail += f" ...and {len(distorted) - len(shown)} more"
        return CheckResult(
            check_name, False,
            f"{len(distorted)} images have aspect ratio distortion: {detail}",
            "warning"
        )

    return CheckResult(
        check_name, True,
        f"All images have correct aspect ratio ({len(drawings)} images)"
    )
|
||
|
||
|
||
def check_font_fallback(root: ET.Element) -> CheckResult:
    """Report fonts referenced by ``w:rFonts`` that are not in the built-in safe list.

    The ``ascii``, ``eastAsia``, ``hAnsi`` and ``cs`` attributes of the first
    ``w:rFonts`` child of every ``w:rPr`` are collected. Any name outside
    ``SAFE_FONTS`` makes the check fail with severity "info".
    """
    SAFE_FONTS = {
        # Chinese
        "宋体", "SimSun", "黑体", "SimHei", "微软雅黑", "Microsoft YaHei",
        "仿宋", "FangSong", "FangSong_GB2312", "楷体", "KaiTi",
        # English
        "Times New Roman", "Arial", "Calibri", "Helvetica",
        "Courier New", "Georgia", "Verdana", "Tahoma",
        # Universal
        "Symbol", "Wingdings",
    }

    font_attrs = [f"{{{NS['w']}}}{slot}" for slot in ("ascii", "eastAsia", "hAnsi", "cs")]

    fonts_used = set()
    for run_props in root.findall(".//w:rPr", NS):
        fonts_el = run_props.find("w:rFonts", NS)
        if fonts_el is None:
            continue
        # Keep only attributes that are present and non-empty.
        fonts_used.update(filter(None, map(fonts_el.get, font_attrs)))

    risky = fonts_used - SAFE_FONTS
    if not risky:
        return CheckResult("font-fallback", True, f"All fonts are common system fonts ({len(fonts_used)} types)")

    return CheckResult(
        "font-fallback", False,
        f"Following fonts may be missing on target system: {', '.join(sorted(risky))}",
        "info"
    )
|
||
|
||
|
||
|
||
def check_heading_levels(root: ET.Element) -> CheckResult:
    """Check that consecutive headings never jump down by more than one level.

    Heading levels are read from paragraph style ids matching ``Heading<N>``
    on every ``w:p`` under the body. A transition such as H1→H3 is reported;
    moving back up any number of levels is allowed. Fails with severity
    "warning" and lists at most five skips.

    With fewer than two headings (including a document without ``w:body``)
    the check passes with "Too few headings, skipping check".
    """
    body = root.find(".//w:body", NS)
    val_attr = f"{{{NS['w']}}}val"

    heading_levels = []
    # A missing body is treated as "no headings" instead of raising AttributeError.
    for p in (body.findall(".//w:p", NS) if body is not None else []):
        ppr = p.find("w:pPr", NS)
        style = ppr.find("w:pStyle", NS) if ppr is not None else None
        if style is None:
            continue
        m = re.match(r"Heading(\d+)", style.get(val_attr, ""))
        if m:
            heading_levels.append(int(m.group(1)))

    if len(heading_levels) < 2:
        return CheckResult("heading-levels", True, "Too few headings, skipping check")

    skips = [
        f"H{previous}→H{current}"
        for previous, current in zip(heading_levels, heading_levels[1:])
        if current - previous > 1
    ]

    if skips:
        return CheckResult(
            "heading-levels", False,
            f"Heading level skip: {', '.join(skips[:5])}",
            "warning"
        )

    return CheckResult("heading-levels", True, f"Heading levels continuous ({len(heading_levels)} headings)")
|
||
|
||
|
||
# check_cover_separation removed — false positives on complex covers (>15 elements is normal)
|
||
|
||
|
||
def check_shading_type(root: ET.Element) -> CheckResult:
    """Fail (severity "error") if any ``w:shd`` element has ``w:val="solid"``."""
    val_attr = f"{{{NS['w']}}}val"
    solid_count = sum(
        1 for shd in root.findall(".//w:shd", NS) if shd.get(val_attr, "") == "solid"
    )

    if solid_count == 0:
        return CheckResult("shading-type", True, "No ShadingType.SOLID misuse found")

    return CheckResult(
        "shading-type", False,
        f"Found {solid_count} instances of ShadingType.SOLID (should be CLEAR), may cause black cells",
        "error"
    )
|
||
|
||
|
||
|
||
def check_toc(root: ET.Element, docx_path: str = "") -> CheckResult:
    """Check TOC quality: field existence, headings presence, outlineLvl, updateFields.

    Args:
        root: Root element of ``word/document.xml``.
        docx_path: Path to the DOCX archive. When empty, the styles.xml and
            settings.xml checks are skipped.

    Issues reported (joined with "; ", at most five):
        - TOC_FIELD_MISSING: a top-level paragraph reads like a TOC title but
          the document has no TOC field.
        - TOC_NO_HEADINGS: a TOC field exists but no top-level paragraph uses
          a heading style.
        - TOC_OUTLINE_MISSING: a used ``HeadingN`` style is defined in
          styles.xml without ``w:outlineLvl`` in its ``w:pPr``.
        - TOC_UPDATE_DISABLED: settings.xml has no enabled ``w:updateFields``.
        - TOC_CHECK_ERROR: styles/settings could not be read or parsed.

    Severity is "error" if any of the first three occurred, else "warning".
    """
    body = root.find(".//w:body", NS)
    if body is None:
        return CheckResult("toc", True, "Document body is empty, skipping TOC check", "info")

    w_ns = NS["w"]

    def w(tag):
        # Qualified name in the WordprocessingML namespace.
        return f"{{{w_ns}}}{tag}"

    top_level_paras = [el for el in body if el.tag == w("p")]

    # --- Detect headings and their levels (top-level paragraphs only) ---
    heading_re = re.compile(r"(?i)heading\s*(\d)")
    heading_count = 0
    heading_levels_used = set()
    for p in top_level_paras:
        ppr = p.find(w("pPr"))
        ps = ppr.find(w("pStyle")) if ppr is not None else None
        if ps is None:
            continue
        m = heading_re.match(ps.get(w("val"), ""))
        if m:
            heading_count += 1
            heading_levels_used.add(int(m.group(1)))

    # --- Detect TOC field ---
    # Match TOC as a whole word so bookmark names such as "_Toc123456" in
    # PAGEREF/HYPERLINK field codes are not mistaken for a TOC field.
    # findall(".//...") covers the whole tree, including content inside w:sdt.
    toc_word_re = re.compile(r"\bTOC\b", re.IGNORECASE)
    has_toc = any(
        instr.text and toc_word_re.search(instr.text)
        for instr in root.findall(f".//{w('instrText')}")
    ) or any(
        toc_word_re.search(fld.get(w("instr"), ""))
        for fld in root.findall(f".//{w('fldSimple')}")
    )

    issues = []

    # Check 1: a "目录" / "目 录" / "Table of Contents" title without a TOC field
    toc_title_pattern = re.compile(r'^(?:目\s*录|table\s+of\s+contents|contents)$', re.IGNORECASE)
    has_toc_title = any(
        toc_title_pattern.match("".join(t.text or "" for t in p.findall(f".//{w('t')}")).strip())
        for p in top_level_paras
    )
    if has_toc_title and not has_toc:
        issues.append("TOC_FIELD_MISSING: document has a TOC title but no TOC field element — add TableOfContents in code")

    # Check 2: TOC field exists but no headings → TOC will be empty after update
    if has_toc and heading_count == 0:
        issues.append("TOC_NO_HEADINGS: TOC field exists but document has 0 Heading-styled paragraphs — TOC will be empty after update")

    # Check 3 & 4: inspect styles.xml and settings.xml (only when a TOC exists)
    if has_toc and docx_path:
        try:
            with zipfile.ZipFile(docx_path, "r") as zf:
                members = set(zf.namelist())

                # Check 3: outlineLvl missing in the Heading styles that are in use
                if "word/styles.xml" in members:
                    styles_root = ET.fromstring(zf.read("word/styles.xml"))
                    # Index styles once; the first definition of an id wins.
                    styles_by_id = {}
                    for style_elem in styles_root.findall(f".//{w('style')}"):
                        styles_by_id.setdefault(style_elem.get(w("styleId"), ""), style_elem)

                    missing_outline = []
                    for level in sorted(heading_levels_used):
                        style_id = f"Heading{level}"
                        style_elem = styles_by_id.get(style_id)
                        if style_elem is None:
                            continue
                        style_ppr = style_elem.find(w("pPr"))
                        if style_ppr is None or style_ppr.find(w("outlineLvl")) is None:
                            missing_outline.append(style_id)

                    if missing_outline:
                        issues.append(
                            f"TOC_OUTLINE_MISSING: {', '.join(missing_outline)} style(s) missing outlineLvl — "
                            "Word TOC update won't find these headings. "
                            "Run add_toc_placeholders.py to fix"
                        )

                # Check 4: updateFields must be enabled
                if "word/settings.xml" in members:
                    settings_root = ET.fromstring(zf.read("word/settings.xml"))
                    update_el = settings_root.find(f".//{w('updateFields')}")
                    # On/off elements are enabled by "true", "1", "on" or an omitted w:val.
                    update_ok = (
                        update_el is not None
                        and update_el.get(w("val"), "true") in ("true", "1", "on")
                    )
                    if not update_ok:
                        issues.append(
                            "TOC_UPDATE_DISABLED: settings.xml missing updateFields=true — "
                            "Word won't prompt to update TOC on open. "
                            "Run add_toc_placeholders.py to fix"
                        )
        except Exception as e:
            # Best effort: a broken archive becomes a reported issue, not a crash.
            issues.append(f"TOC_CHECK_ERROR: failed to read styles/settings from DOCX: {e}")

    if not issues:
        if has_toc:
            return CheckResult("toc", True, "TOC field present and update-ready")
        return CheckResult("toc", True, "No TOC needed")

    blocking_codes = ("FIELD_MISSING", "NO_HEADINGS", "OUTLINE_MISSING")
    severity = "error" if any(code in issue for issue in issues for code in blocking_codes) else "warning"
    return CheckResult("toc", False, "; ".join(issues[:5]), severity)
|
||
|
||
|
||
|
||
|
||
def check_cover_overflow(root: ET.Element) -> CheckResult:
    """Detect cover section issues: oversized fonts, excessive spacing, trailing empty content.

    The first section returned by ``get_sections`` is treated as the cover.
    Three conditions are reported, all with severity "error":

    1. a ``w:sz`` value above 88 half-points (44pt);
    2. a ``w:spacing/@w:before`` value above 5000 twips;
    3. more than two trailing paragraphs without text at the end of the section.

    Passes with "No sections found" when the document yields no sections.
    """
    sections = get_sections(root)
    if not sections:
        return CheckResult("cover-overflow", True, "No sections found")

    children = sections[0]["children"]
    w_ns = NS["w"]

    def largest_numeric(tag, attr):
        # Largest all-digit value of w:<attr> on any w:<tag> below the cover children; 0 if none.
        raw_values = (
            el.get(f"{{{w_ns}}}{attr}")
            for child in children
            for el in child.findall(f".//{{{w_ns}}}{tag}")
        )
        return max((int(v) for v in raw_values if v and v.isdigit()), default=0)

    issues = []

    # Check 1: oversized font (w:sz is in half-points, 88 = 44pt)
    max_font_size = largest_numeric("sz", "val")
    if max_font_size > 88:
        issues.append(
            f"Cover has font size {max_font_size // 2}pt (>44pt max). "
            f"Use calcTitleLayout() for dynamic sizing"
        )

    # Check 2: excessive spacing before a paragraph (twips)
    max_spacing = largest_numeric("spacing", "before")
    if max_spacing > 5000:
        issues.append(
            f"Cover has spacing.before={max_spacing} twips (>5000 max). "
            f"Use calcCoverSpacing() for dynamic spacing"
        )

    # Check 3: count text-less paragraphs from the end of the section,
    # stopping at the first non-paragraph or the first paragraph with text.
    trailing_empty = 0
    for child in reversed(children):
        if child.tag.rsplit("}", 1)[-1] != "p":
            break
        texts = child.findall(f".//{{{w_ns}}}t")
        if any(t.text and t.text.strip() for t in texts):
            break
        trailing_empty += 1

    if trailing_empty > 2:
        issues.append(
            f"Cover section ends with {trailing_empty} empty paragraphs (max 2 allowed) — "
            f"excessive empty paragraphs may cause blank page after cover"
        )

    if issues:
        return CheckResult("cover-overflow", False, "; ".join(issues), "error")

    return CheckResult("cover-overflow", True, "Cover section layout looks OK")
|
||
|
||
|
||
def run_all_checks(docx_path: str) -> list[CheckResult]:
    """Run every check against the document and return the results in a fixed order.

    ``word/document.xml`` is parsed once; errors while reading it propagate
    to the caller. An exception raised by an individual check is converted
    into a failing "error" result named after that check, so one broken check
    never hides the others.
    """
    root = read_document_xml(docx_path)

    root_only_checks = [
        check_blank_pages,
        check_cover_overflow,
        check_line_spacing,
        check_image_overflow,
        check_font_fallback,
        check_heading_levels,
        check_shading_type,
    ]

    # (result name, zero-argument runner). Names use hyphens so that an error
    # result carries the same name the check itself reports (e.g. "blank-pages").
    runners = [
        (fn.__name__.replace("check_", "").replace("_", "-"), lambda fn=fn: fn(root))
        for fn in root_only_checks
    ]
    # These two checks need the archive path as well as the parsed document.
    runners.append(("toc", lambda: check_toc(root, docx_path)))
    runners.append(("image-aspect-ratio", lambda: check_image_aspect_ratio(docx_path, root)))

    results = []
    for name, run in runners:
        try:
            results.append(run())
        except Exception as e:
            results.append(CheckResult(name, False, f"Check error: {e}", "error"))

    return results
|
||
|
||
|
||
def main():
    """Command-line entry point.

    Parses arguments, runs all checks and prints either a JSON array
    (``--json``) or a human-readable report.

    Exit codes:
        0 — no failing errors (and, with ``--strict``, no failing warnings);
        1 — input file missing or unreadable, or ``--strict`` with warnings;
        2 — at least one check failed with severity "error".
    """
    import argparse

    parser = argparse.ArgumentParser(description="docx business rule self-check")
    parser.add_argument("docx_path", help="Path to the .docx file to check")
    parser.add_argument("--json", action="store_true", help="Output in JSON format")
    parser.add_argument("--strict", action="store_true", help="Treat warnings as failures")
    args = parser.parse_args()

    if not Path(args.docx_path).exists():
        print(f"❌ File not found: {args.docx_path}")
        sys.exit(1)

    # A corrupt archive, a missing word/document.xml or malformed XML would
    # otherwise end in a traceback; report it and keep the same exit status (1).
    try:
        results = run_all_checks(args.docx_path)
    except (zipfile.BadZipFile, KeyError, ET.ParseError, OSError) as e:
        print(f"❌ Cannot read document: {args.docx_path} ({e})")
        sys.exit(1)

    failed = [r for r in results if not r.passed]
    errors = sum(1 for r in failed if r.severity == "error")
    warnings = sum(1 for r in failed if r.severity == "warning")

    if args.json:
        print(json.dumps([r.to_dict() for r in results], ensure_ascii=False, indent=2))
    else:
        print(f"\n📋 Document self-check report: {args.docx_path}\n")
        for r in results:
            print(f" {r}")

        passed = len(results) - len(failed)
        total = len(results)

        print(f"\n {'─' * 50}")
        print(f" Passed {passed}/{total} | ❌ {errors} errors | ⚠️ {warnings} warnings\n")

    # Exit code
    if errors:
        sys.exit(2)
    elif args.strict and warnings:
        sys.exit(1)
    else:
        sys.exit(0)
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|