2076 lines
76 KiB
Python
Executable File
2076 lines
76 KiB
Python
Executable File
#!/usr/bin/env python3
"""
toc_validate.py - Table of Contents Validation for DOCX and PDF files.

Checks DOCX and PDF files for TOC quality issues including missing TOC fields,
empty placeholders, heading style mismatches, page break issues, and more.
Also validates TOC consistency across DOCX→PDF conversions.

Usage:
    python3 toc_validate.py check-docx output.docx
    python3 toc_validate.py check-pdf output.pdf
    python3 toc_validate.py check-conversion input.docx output.pdf

Output:
    JSON to stdout with structure:
    {
        "pass": true/false,
        "source": "filename",
        "check_type": "docx-toc"|"pdf-toc"|"conversion-toc",
        "errors": [...],
        "warnings": [...],
        "info": [...]
    }

Exit codes:
    0 = pass (no errors)
    1 = fail (errors found)
    2 = script error (bad args, file not found, etc.)

Dependencies:
    - Standard library (zipfile, xml.etree.ElementTree, etc.)
    - pdfplumber (for PDF checks)
    - pikepdf (optional, for link annotation checks)
"""
|
||
|
||
import sys
|
||
import os
|
||
import json
|
||
import re
|
||
import zipfile
|
||
import tempfile
|
||
import xml.etree.ElementTree as ET
|
||
from pathlib import Path
|
||
from typing import List, Dict, Tuple, Optional, Any
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# XML namespace constants
|
||
# ---------------------------------------------------------------------------
|
||
# Prefix -> namespace URI map for the OOXML parts this script reads.
NS = {
    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
    'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
}

# Standard heading style names (case-insensitive comparison; callers
# lower-case the style value before testing membership).
STANDARD_HEADING_STYLES = {
    'heading1', 'heading2', 'heading3', 'heading4',
    'heading 1', 'heading 2', 'heading 3', 'heading 4',
    # Some localized variants
    '1', '2', '3', '4',
}
|
||
|
||
# TOC keywords to search for in PDF text
|
||
TOC_KEYWORDS = ['目录', '目 录', '目 录', 'table of contents', 'contents']
|
||
|
||
# Hint phrases that should not leak into final PDF
|
||
HINT_PHRASES = [
    '提示:本目录通过域代码生成',
    '右键更新域',
    'Update Field',
    'right-click',
    'Tip: This table of contents',
]

# Hint text indicators for DOCX styling check
HINT_INDICATORS = ['提示', 'Tip:', 'Update Field', '更新域']

# Gray color values (hex, case-insensitive; stored lower-case, so callers
# must lower-case the attribute value before testing membership).
GRAY_COLORS = {
    '808080', '999999', 'a0a0a0', 'a5a5a5', 'b0b0b0', 'c0c0c0',
    '888888', '777777', '666666', 'aaaaaa', 'bbbbbb', 'cccccc',
    '909090', '959595', '9a9a9a', 'a8a8a8', 'b8b8b8',
}
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Result helpers
|
||
# ---------------------------------------------------------------------------
|
||
def make_item(code: str, message: str, severity: str) -> Dict[str, str]:
    """Create a single result item.

    Args:
        code: Machine-readable identifier of the finding (e.g. "PARSE_ERROR").
        message: Human-readable description of the finding.
        severity: Severity label stored verbatim ("error", "warning", "info"
            are the values used in this file).

    Returns:
        A dict with exactly the keys "code", "message" and "severity".
    """
    return {"code": code, "message": message, "severity": severity}
|
||
|
||
|
||
def make_result(source: str, check_type: str, errors: List, warnings: List,
                info: List) -> Dict[str, Any]:
    """Build the final result dict.

    Args:
        source: Name of the checked file (or file pair) to report.
        check_type: Kind of check that produced the result.
        errors: Error items; the result passes only when this list is empty.
        warnings: Warning items (do not affect "pass").
        info: Informational items (do not affect "pass").

    Returns:
        A dict with the keys "pass", "source", "check_type", "errors",
        "warnings" and "info". The lists are stored by reference, not copied.
    """
    return {
        "pass": not errors,
        "source": source,
        "check_type": check_type,
        "errors": errors,
        "warnings": warnings,
        "info": info,
    }
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# DOCX XML parsing helpers
|
||
# ---------------------------------------------------------------------------
|
||
def parse_docx_xml(docx_path: str) -> Optional[ET.Element]:
    """Extract and parse ``word/document.xml`` from a .docx file.

    Args:
        docx_path: Path to the .docx (zip) archive.

    Returns:
        The root Element of the main document part, or None when the file
        is not a valid zip, lacks ``word/document.xml``, or the XML is
        malformed.

    Note:
        Other I/O errors (for example a missing file) are not caught here
        and propagate to the caller.
    """
    try:
        with zipfile.ZipFile(docx_path, 'r') as archive:
            with archive.open('word/document.xml') as part:
                return ET.parse(part).getroot()
    except (zipfile.BadZipFile, KeyError, ET.ParseError):
        # BadZipFile: not a zip; KeyError: part missing; ParseError: bad XML.
        return None
|
||
|
||
|
||
def get_all_paragraphs(root: ET.Element) -> List[ET.Element]:
    """Return all w:p elements below *root* in document order.

    The search covers every descendant level, so paragraphs nested in
    other elements are included as well.
    """
    return root.findall('.//' + _w('p'))
|
||
|
||
|
||
def _w(tag: str) -> str:
|
||
"""Shorthand for word namespace tag."""
|
||
return '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}' + tag
|
||
|
||
|
||
def get_paragraph_text(para: ET.Element) -> str:
    """Extract concatenated text from all w:t elements in a paragraph.

    w:t descendants at any depth are included; elements with no text are
    skipped. Returns an empty string when the paragraph has no text.
    """
    return ''.join(t.text for t in para.findall('.//' + _w('t')) if t.text)
|
||
|
||
|
||
def get_paragraph_style(para: ET.Element) -> Optional[str]:
    """Get the pStyle val from a paragraph, or None.

    Returns None when the paragraph has no w:pPr child, when w:pPr has no
    w:pStyle child, or when w:pStyle lacks a w:val attribute.
    """
    pPr = para.find(_w('pPr'))
    pStyle = pPr.find(_w('pStyle')) if pPr is not None else None
    return pStyle.get(_w('val')) if pStyle is not None else None
|
||
|
||
|
||
def is_heading_style(style_val: Optional[str]) -> bool:
    """Check if a style value is a standard heading style.

    The value is lower-cased and stripped, then accepted when it is listed
    in STANDARD_HEADING_STYLES, starts with "heading" (e.g. "Heading1",
    "Heading 2"), or is one of the numeric style IDs "1".."4".
    None is never a heading.
    """
    if style_val is None:
        return False
    lower = style_val.lower().strip()
    return (
        lower in STANDARD_HEADING_STYLES
        or lower.startswith('heading')
        # Numeric style IDs sometimes used for headings
        or lower in ('1', '2', '3', '4')
    )
|
||
|
||
|
||
def is_any_heading_style(style_val: Optional[str]) -> bool:
    """Check if a style looks like any heading (standard or custom).

    A style qualifies when its lower-cased, stripped value starts with
    "heading" or is one of the numeric IDs "1".."4". None never qualifies.
    """
    if style_val is None:
        return False
    lower = style_val.lower().strip()
    return lower.startswith('heading') or lower in ('1', '2', '3', '4')
|
||
|
||
|
||
def is_standard_heading_style(style_val: Optional[str]) -> bool:
    """Check if a style is specifically a standard Heading1-4.

    Matching is case-insensitive and ignores surrounding whitespace; both
    "HeadingN" and "Heading N" spellings are accepted for N in 1..4.
    """
    if style_val is None:
        return False
    lower = style_val.lower().strip()
    return lower in {
        'heading1', 'heading2', 'heading3', 'heading4',
        'heading 1', 'heading 2', 'heading 3', 'heading 4',
    }
|
||
|
||
|
||
def paragraph_is_bold_large(para: ET.Element) -> bool:
    """Check if a paragraph has bold text and large font (≥28 half-points / 14pt).

    Checks both the paragraph-level run properties (w:pPr/w:rPr) and the
    properties of each run that is a direct child of the paragraph.

    Note:
        Bold and large are accumulated independently: the result is True
        when *any* inspected w:rPr is bold and *any* inspected w:rPr is
        large; they need not be the same one.
    """
    candidates = []

    # Paragraph-level properties
    pPr = para.find(_w('pPr'))
    if pPr is not None:
        candidates.append(pPr.find(_w('rPr')))

    # Run-level properties (direct child runs only)
    candidates.extend(run.find(_w('rPr')) for run in para.findall(_w('r')))

    is_bold = False
    is_large = False
    for rPr in candidates:
        if rPr is None:
            continue
        bold, large = _rpr_bold_and_large(rPr)
        is_bold = is_bold or bold
        is_large = is_large or large

    return is_bold and is_large


def _rpr_bold_and_large(rPr: ET.Element) -> Tuple[bool, bool]:
    """Return (is_bold, is_large) for a single w:rPr element."""
    bold = False
    b = rPr.find(_w('b'))
    if b is not None:
        # <w:b/> with no w:val means "on"; only explicit off-values disable it.
        b_val = b.get(_w('val'))
        bold = b_val is None or b_val.lower() not in ('false', '0', 'off')

    large = False
    sz = rPr.find(_w('sz'))
    if sz is not None:
        try:
            # w:sz is read as half-points; 28 half-points = 14pt.
            large = int(sz.get(_w('val'), '0')) >= 28
        except (ValueError, TypeError):
            # Non-numeric size: treat as "not large".
            pass

    return bold, large
|
||
|
||
|
||
def docx_has_toc_field(root: ET.Element) -> bool:
    """Check if the document has a TOC field code.

    Looks, anywhere below *root*, for:
      - <w:fldSimple> with w:instr containing "TOC"
      - <w:instrText> containing "TOC"

    The comparison is a case-insensitive substring test.

    NOTE(review): because it is a substring test, instructions that merely
    reference a "_Toc..." bookmark (e.g. PAGEREF / HYPERLINK fields) also
    count as a TOC field here. Confirm whether that is intended.
    """
    # Check fldSimple
    if any('TOC' in fld.get(_w('instr'), '').upper()
           for fld in root.findall('.//' + _w('fldSimple'))):
        return True

    # Check instrText
    return any(instr.text and 'TOC' in instr.text.upper()
               for instr in root.findall('.//' + _w('instrText')))
|
||
|
||
|
||
def find_toc_field_boundaries(root: ET.Element) -> Tuple[Optional[int], Optional[int], Optional[int]]:
    """Find where the TOC field's begin/separate/end fldChar markers sit.

    Returns (begin_idx, separate_idx, end_idx) — any may be None. Each
    value is an index into the *direct children of w:body* (non-paragraph
    children are counted in the index but not inspected).

    We search for an instrText containing "TOC" and relate it to the
    surrounding fldChar markers. Only runs that are direct children of a
    paragraph are inspected.
    """
    body = root.find(_w('body'))
    if body is None:
        return None, None, None

    in_toc_field = False
    toc_begin_para_idx = None
    toc_separate_para_idx = None
    toc_end_para_idx = None
    # Body-child index of the 'begin' marker of every currently open field;
    # its length is the current field nesting depth.
    open_begins: List[int] = []

    for para_idx, elem in enumerate(body):
        if elem.tag != _w('p'):
            continue
        for run in elem.findall(_w('r')):
            # Check for instrText with TOC
            instr = run.find(_w('instrText'))
            if instr is not None and instr.text and 'TOC' in instr.text.upper():
                in_toc_field = True
                # The field's own 'begin' marker was seen *before* its
                # instruction text, so take it from the innermost open field.
                if toc_begin_para_idx is None and open_begins:
                    toc_begin_para_idx = open_begins[-1]

            fldChar = run.find(_w('fldChar'))
            if fldChar is None:
                continue

            fld_type = fldChar.get(_w('fldCharType'), '')
            if fld_type == 'begin':
                open_begins.append(para_idx)
                # Fallback for a TOC instruction that had no open 'begin'.
                if in_toc_field and toc_begin_para_idx is None:
                    toc_begin_para_idx = para_idx
            elif fld_type == 'separate':
                if in_toc_field and toc_separate_para_idx is None:
                    toc_separate_para_idx = para_idx
            elif fld_type == 'end':
                # Only an 'end' at nesting depth <= 1 closes the TOC field;
                # deeper ones belong to fields nested inside it.
                if in_toc_field and len(open_begins) <= 1:
                    toc_end_para_idx = para_idx
                    in_toc_field = False
                if open_begins:
                    open_begins.pop()

    return toc_begin_para_idx, toc_separate_para_idx, toc_end_para_idx
|
||
|
||
|
||
def find_toc_field_boundaries_v2(root: ET.Element) -> Dict[str, Any]:
    """Enhanced TOC boundary finder that works with nested fields.

    All indices refer to the list of w:p elements that are direct children
    of w:body (other body children are not counted).

    Returns dict with:
        'has_toc': bool
        'begin_para_idx': int or None
        'separate_para_idx': int or None
        'end_para_idx': int or None
        'toc_entry_texts': list of str — stripped, non-empty text of every
            paragraph from the separate paragraph through the end
            paragraph, both inclusive
    """
    result: Dict[str, Any] = {
        'has_toc': False,
        'begin_para_idx': None,
        'separate_para_idx': None,
        'end_para_idx': None,
        'toc_entry_texts': [],
    }

    body = root.find(_w('body'))
    if body is None:
        return result

    paragraphs = [e for e in body if e.tag == _w('p')]

    # Phase 1a: collect field markers and TOC instructions in document order.
    events = []  # (para_idx, event_type)
    for pi, para in enumerate(paragraphs):
        for run in para.findall('.//' + _w('r')):
            fldChar = run.find(_w('fldChar'))
            if fldChar is not None:
                events.append((pi, fldChar.get(_w('fldCharType'), '')))
            instr = run.find(_w('instrText'))
            if instr is not None and instr.text and 'TOC' in instr.text.upper():
                events.append((pi, 'toc_instr'))

    # Phase 1b: replay the events while tracking field nesting.
    open_begins: List[int] = []  # para idx of each open field's 'begin'
    found_toc = False
    toc_depth = None  # nesting depth at which the TOC field lives
    toc_begin_idx = None
    toc_separate_idx = None
    toc_end_idx = None

    for pi, evt in events:
        depth = len(open_begins)
        if evt == 'begin':
            open_begins.append(pi)
        elif evt == 'toc_instr':
            # Only the first TOC instruction inside an open field counts.
            if not found_toc and open_begins:
                found_toc = True
                toc_begin_idx = open_begins[-1]
                toc_depth = depth
        elif evt == 'separate':
            if found_toc and toc_separate_idx is None and depth == toc_depth:
                toc_separate_idx = pi
        elif evt == 'end':
            if found_toc and toc_end_idx is None and depth == toc_depth:
                toc_end_idx = pi
            if open_begins:
                open_begins.pop()

    # Phase 2: Extract TOC entry texts between separate and end
    toc_entry_texts = []
    if toc_separate_idx is not None and toc_end_idx is not None:
        for para in paragraphs[toc_separate_idx:toc_end_idx + 1]:
            text = get_paragraph_text(para).strip()
            if text:
                toc_entry_texts.append(text)

    result['has_toc'] = found_toc
    result['begin_para_idx'] = toc_begin_idx
    result['separate_para_idx'] = toc_separate_idx
    result['end_para_idx'] = toc_end_idx
    result['toc_entry_texts'] = toc_entry_texts
    return result
|
||
|
||
|
||
def check_toc_has_content(root: ET.Element, separate_para_idx: Optional[int],
                          end_para_idx: Optional[int]) -> bool:
    """Check if there are w:t elements between the separate and end markers.

    Looks at every paragraph from *separate_para_idx* through
    *end_para_idx* (inclusive, clamped to the paragraph count) and returns
    True as soon as one contains a w:t whose text is not just whitespace.
    Indices refer to the w:p elements that are direct children of w:body.

    Returns False when either index is None or the document has no body.
    """
    if separate_para_idx is None or end_para_idx is None:
        return False

    body = root.find(_w('body'))
    if body is None:
        return False

    paragraphs = [e for e in body if e.tag == _w('p')]
    last = min(end_para_idx + 1, len(paragraphs))

    return any(
        t.text and t.text.strip()
        for pi in range(separate_para_idx, last)
        for t in paragraphs[pi].findall('.//' + _w('t'))
    )
|
||
|
||
|
||
def fuzzy_match(text_a: str, text_b: str) -> bool:
    """Check if two strings match fuzzily.

    Both strings are stripped and lower-cased first. They match when one
    contains the other, or when the Jaccard similarity of their *character
    sets* (shared distinct characters / all distinct characters) is
    strictly greater than 0.6. An empty string never matches anything.
    """
    a = text_a.strip().lower()
    b = text_b.strip().lower()

    if not a or not b:
        return False

    # One contains the other
    if a in b or b in a:
        return True

    # Character-set overlap >60%. Both sets are non-empty here because
    # a and b are non-empty, so the union can never be empty.
    chars_a = set(a)
    chars_b = set(b)
    return len(chars_a & chars_b) / len(chars_a | chars_b) > 0.6
|
||
|
||
|
||
def _detect_language(texts: list) -> str:
|
||
"""Detect the primary language of a list of text strings.
|
||
|
||
Returns 'zh' if more than half contain Chinese characters, else 'en'.
|
||
"""
|
||
if not texts:
|
||
return 'en'
|
||
total = len(texts)
|
||
chinese_count = sum(1 for t in texts if re.search(r'[\u4e00-\u9fff]', t))
|
||
return 'zh' if chinese_count > total / 2 else 'en'
|
||
|
||
|
||
def _get_heading_level(style_val: Optional[str]) -> int:
|
||
"""Extract heading level (1-9) from a style value. Returns 0 if not a heading."""
|
||
if style_val is None:
|
||
return 0
|
||
lower = style_val.lower().strip()
|
||
# "heading1", "heading 1", "heading2", etc.
|
||
m = re.match(r'heading\s*(\d+)', lower)
|
||
if m:
|
||
return int(m.group(1))
|
||
# Numeric style IDs
|
||
if lower in ('1', '2', '3', '4', '5', '6', '7', '8', '9'):
|
||
return int(lower)
|
||
return 0
|
||
|
||
|
||
def check_run_hint_style(run: ET.Element) -> Tuple[bool, bool]:
    """Check if a run has gray color and small font size.

    Only the run's own w:rPr is inspected.

    Returns:
        (has_gray_color, has_small_font). Both are False when the run has
        no w:rPr. "Small" means w:sz <= 18 half-points (9pt).
    """
    rPr = run.find(_w('rPr'))
    if rPr is None:
        return False, False

    has_gray = False
    color = rPr.find(_w('color'))
    if color is not None:
        val = color.get(_w('val'), '').lower()
        if val in GRAY_COLORS:
            has_gray = True
        # Also accept any gray-ish RRGGBB value that is not in the list.
        if len(val) == 6:
            try:
                r = int(val[0:2], 16)
                g = int(val[2:4], 16)
                b = int(val[4:6], 16)
            except ValueError:
                # Not a hex triplet: rely on the list lookup above.
                pass
            else:
                # Gray if R, G, B are all close to each other and in mid-range
                channels_close = (abs(r - g) < 30 and abs(g - b) < 30
                                  and abs(r - b) < 30)
                if channels_close and 80 <= r <= 210:
                    has_gray = True

    has_small = False
    sz = rPr.find(_w('sz'))
    if sz is not None:
        try:
            # A w:sz without w:val is read as 0 and therefore counts as small.
            has_small = int(sz.get(_w('val'), '0')) <= 18  # 18 half-points = 9pt
        except (ValueError, TypeError):
            pass

    return has_gray, has_small
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# check-docx implementation
|
||
# ---------------------------------------------------------------------------
|
||
def check_docx(docx_path: str) -> Dict[str, Any]:
    """Run all DOCX TOC validation checks.

    Checks performed (result item codes):
      1. TOC_FIELD_MISSING      (warning) 3+ headings but no TOC
      2. TOC_PLACEHOLDER_EMPTY  (error)   TOC field has no entry text
      3. TOC_HEADING_STYLE      (error)   bold+large paragraph after the
                                          TOC that is not Heading1-4
      4. TOC_ENTRY_MISMATCH     (error)   < 50% of headings found in TOC
      5. TOC_NO_PAGEBREAK       (warning) no page break right after TOC
      6. TOC_HINT_STYLE         (warning) hint text not gray and small

    Paragraph indices used below refer to the w:p elements that are direct
    children of w:body.

    Args:
        docx_path: Path to the .docx file.

    Returns:
        The result dict built by make_result() with check_type "docx-toc".
    """
    errors: List[Dict] = []
    warnings: List[Dict] = []
    info: List[Dict] = []
    source = os.path.basename(docx_path)

    # Parse document.xml
    root = parse_docx_xml(docx_path)
    if root is None:
        return make_result(
            source, "docx-toc",
            [make_item("PARSE_ERROR",
                       "Failed to parse DOCX file. File may be corrupted.",
                       "error")],
            [], [])

    body = root.find(_w('body'))
    if body is None:
        return make_result(
            source, "docx-toc", [], [],
            [make_item("EMPTY_BODY",
                       "Document body is empty.",
                       "info")])

    paragraphs = [e for e in body if e.tag == _w('p')]

    # Detect TOC field boundaries
    toc_info = find_toc_field_boundaries_v2(root)
    has_toc = toc_info['has_toc']
    toc_separate_idx = toc_info['separate_para_idx']
    toc_end_idx = toc_info['end_para_idx']
    toc_entry_texts = toc_info['toc_entry_texts']

    # Fallback detection: fldSimple-based TOCs and TOC instructions that
    # are not in body-level paragraphs (e.g. SDT-wrapped, as generated by
    # fix-docx). docx_has_toc_field() covers both forms document-wide.
    if not has_toc:
        has_toc = docx_has_toc_field(root)

    # Count headings (all paragraphs with heading styles)
    all_heading_paras = []
    for pi, para in enumerate(paragraphs):
        style = get_paragraph_style(para)
        if is_any_heading_style(style):
            all_heading_paras.append((pi, para, style))

    # Content headings: headings AFTER the TOC end (or all if no TOC)
    if toc_end_idx is not None:
        content_heading_paras = [entry for entry in all_heading_paras
                                 if entry[0] > toc_end_idx]
    else:
        content_heading_paras = list(all_heading_paras)

    # ---- CHECK 1: TOC_FIELD_MISSING ----
    heading_count = len(all_heading_paras)
    if heading_count >= 3 and not has_toc:
        warnings.append(make_item(
            "TOC_FIELD_MISSING",
            f"Document has {heading_count} headings but no Table of Contents.",
            "warning"
        ))

    # ---- CHECK 2: TOC_PLACEHOLDER_EMPTY ----
    if has_toc and toc_separate_idx is not None and toc_end_idx is not None:
        if not check_toc_has_content(root, toc_separate_idx, toc_end_idx):
            errors.append(make_item(
                "TOC_PLACEHOLDER_EMPTY",
                "TOC field exists but has no placeholder entries. Run add_toc_placeholders.py.",
                "error"
            ))

    # ---- CHECK 3: TOC_HEADING_STYLE ----
    # Scan ALL paragraphs after TOC (not just those in content_heading_paras)
    # to catch bold+large paragraphs with non-heading styles that TOC won't see.
    if has_toc:
        start_idx = (toc_end_idx + 1) if toc_end_idx is not None else 0
        for para in paragraphs[start_idx:]:
            style = get_paragraph_style(para)
            if not paragraph_is_bold_large(para) or is_standard_heading_style(style):
                continue
            text = get_paragraph_text(para).strip()
            if not text:
                continue
            truncated = text[:50] + ('...' if len(text) > 50 else '')
            style_name = style if style else '(none)'
            errors.append(make_item(
                "TOC_HEADING_STYLE",
                f"Paragraph '{truncated}' uses custom style '{style_name}' "
                f"instead of HeadingLevel. TOC will not pick it up.",
                "error"
            ))

    # ---- CHECK 4: TOC_ENTRY_MISMATCH ----
    if toc_entry_texts and content_heading_paras:
        stripped = (get_paragraph_text(para).strip()
                    for _, para, _ in content_heading_paras)
        heading_texts = [text for text in stripped if text]
        if heading_texts:
            unmatched = sum(
                1 for ht in heading_texts
                if not any(fuzzy_match(ht, et) for et in toc_entry_texts)
            )
            match_ratio = (len(heading_texts) - unmatched) / len(heading_texts)
            if match_ratio < 0.5:
                errors.append(make_item(
                    "TOC_ENTRY_MISMATCH",
                    f"TOC placeholder entries don't match actual headings. "
                    f"{unmatched} of {len(heading_texts)} headings not found in TOC.",
                    "error"
                ))

    # ---- CHECK 5: TOC_NO_PAGEBREAK ----
    if toc_end_idx is not None:
        found_pagebreak = False
        # Check the TOC end paragraph itself plus up to 2 paragraphs after it.
        check_end = min(toc_end_idx + 3, len(paragraphs))
        for pi in range(toc_end_idx, check_end):
            para = paragraphs[pi]
            # Check for <w:br w:type="page"/>
            if any(br.get(_w('type')) == 'page'
                   for br in para.findall('.//' + _w('br'))):
                found_pagebreak = True
            # Check for <w:lastRenderedPageBreak/>
            if para.findall('.//' + _w('lastRenderedPageBreak')):
                found_pagebreak = True
            if found_pagebreak:
                break

        if not found_pagebreak:
            warnings.append(make_item(
                "TOC_NO_PAGEBREAK",
                "No page break found after TOC. Content may run into the table of contents.",
                "warning"
            ))

    # ---- CHECK 6: TOC_HINT_STYLE ----
    for para in paragraphs:
        text = get_paragraph_text(para).strip()
        if not any(indicator in text for indicator in HINT_INDICATORS):
            continue

        # Check if runs containing hint text are properly styled
        properly_styled = True
        for run in para.findall(_w('r')):
            run_text = ''.join(t.text for t in run.findall(_w('t')) if t.text)
            if any(ind in run_text for ind in HINT_INDICATORS):
                has_gray, has_small = check_run_hint_style(run)
                if not (has_gray and has_small):
                    properly_styled = False
                    break

        if not properly_styled:
            warnings.append(make_item(
                "TOC_HINT_STYLE",
                "TOC hint text found but not styled as gray/small. "
                "It may look like regular content.",
                "warning"
            ))
            break  # Only report once

    return make_result(source, "docx-toc", errors, warnings, info)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# check-pdf implementation
|
||
# ---------------------------------------------------------------------------
|
||
def check_pdf(pdf_path: str) -> Dict[str, Any]:
    """Run all PDF TOC validation checks.

    Only the first 5 pages are searched for a TOC. Checks performed
    (result item codes):
      1.  TOC_NOT_FOUND      (warning) no TOC keyword, document > 5 pages
      1b. TOC_ON_FIRST_PAGE  (error)   TOC keyword found on page 1
      2.  TOC_NO_ENTRIES     (error)   TOC page with fewer than 2 entries
      3.  TOC_PAGES_INVALID  (error)   entry points past the last page
      4.  TOC_ALL_SAME_PAGE  (error)   every entry has the same page number
      5.  TOC_LINKS_MISSING  (warning) no annotations/links on TOC pages

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The result dict built by make_result() with check_type "pdf-toc".
        A missing pdfplumber package or an unreadable PDF is reported as an
        error item rather than raised.
    """
    errors: List[Dict] = []
    warnings: List[Dict] = []
    info: List[Dict] = []
    source = os.path.basename(pdf_path)

    try:
        import pdfplumber
    except ImportError:
        return make_result(
            source, "pdf-toc",
            [make_item("DEPENDENCY_MISSING",
                       "pdfplumber is not installed. Run: pip install pdfplumber",
                       "error")],
            [], [])

    try:
        pdf = pdfplumber.open(pdf_path)
    except Exception as e:
        return make_result(
            source, "pdf-toc",
            [make_item("PARSE_ERROR",
                       f"Failed to open PDF: {str(e)[:100]}",
                       "error")],
            [], [])

    # try/finally guarantees the PDF handle is released on every exit path,
    # including unexpected exceptions raised by the checks below.
    try:
        total_pages = len(pdf.pages)
        if total_pages == 0:
            return make_result(
                source, "pdf-toc", [], [],
                [make_item("EMPTY_PDF", "PDF has no pages.", "info")])

        # Extract text from first 5 pages (or all if <5)
        check_pages = min(5, total_pages)
        page_texts = {}
        for i in range(check_pages):
            try:
                text = pdf.pages[i].extract_text() or ''
            except Exception:
                # Best effort: a page that fails to extract counts as empty.
                text = ''
            page_texts[i] = text

        # ---- CHECK 1: TOC_NOT_FOUND ----
        toc_pages = []
        for page_idx, text in page_texts.items():
            text_lower = text.lower()
            if any(kw.lower() in text_lower for kw in TOC_KEYWORDS):
                toc_pages.append(page_idx)

        if not toc_pages and total_pages > 5:
            warnings.append(make_item(
                "TOC_NOT_FOUND",
                f"No TOC detected in first 5 pages of a {total_pages}-page document.",
                "warning"
            ))

        # ---- CHECK 1b: TOC_ON_FIRST_PAGE ----
        # If TOC appears on page 1, it likely means either:
        # (a) there is no cover page before the TOC, or
        # (b) the TOC and body content are not separated by a page break
        if toc_pages and 0 in toc_pages and total_pages > 1:
            errors.append(make_item(
                "TOC_ON_FIRST_PAGE",
                "TOC detected on page 1. A cover page should precede the TOC "
                "(expected structure: Cover → TOC → Content). "
                "Either the cover page is missing or the TOC was not separated by a page break.",
                "error"
            ))

        # ---- CHECK 2 & 3 & 4: TOC entry analysis ----
        # Regex to find lines where the last token is a number (page reference)
        entry_pattern = re.compile(r'^(.+?)\s+(\d{1,4})\s*$')

        toc_entries = []  # List of (title_text, page_number)
        for page_idx in toc_pages:
            for line in page_texts.get(page_idx, '').split('\n'):
                line = line.strip()
                if not line:
                    continue
                m = entry_pattern.match(line)
                if not m:
                    continue
                title = m.group(1).strip()
                page_num = int(m.group(2))
                if 1 <= page_num <= 9999 and title:
                    toc_entries.append((title, page_num))

        # CHECK 2: TOC_NO_ENTRIES
        if toc_pages and len(toc_entries) < 2:
            errors.append(make_item(
                "TOC_NO_ENTRIES",
                "TOC page found but contains fewer than 2 entries.",
                "error"
            ))

        if toc_entries:
            # CHECK 3: TOC_PAGES_INVALID
            for title, page_num in toc_entries:
                if page_num < 1 or page_num > total_pages:
                    truncated = title[:50] + ('...' if len(title) > 50 else '')
                    errors.append(make_item(
                        "TOC_PAGES_INVALID",
                        f"TOC entry '{truncated}' references page {page_num} "
                        f"but document only has {total_pages} pages.",
                        "error"
                    ))

            # CHECK 4: TOC_ALL_SAME_PAGE
            if len(toc_entries) >= 2:
                page_nums = set(pn for _, pn in toc_entries)
                if len(page_nums) == 1:
                    same_page = page_nums.pop()
                    errors.append(make_item(
                        "TOC_ALL_SAME_PAGE",
                        f"All TOC entries point to page {same_page}. "
                        f"This likely means placeholder page numbers were not updated.",
                        "error"
                    ))

        # ---- CHECK 5: TOC_LINKS_MISSING ----
        if toc_entries and toc_pages:
            has_links = False
            for page_idx in toc_pages:
                try:
                    page = pdf.pages[page_idx]
                    # Try annots (annotations), then hyperlinks
                    if page.annots or page.hyperlinks:
                        has_links = True
                        break
                except Exception:
                    # Best effort: a page whose annotations cannot be read
                    # simply contributes no links.
                    pass

            if not has_links:
                # Also try pikepdf for more thorough annotation check
                try:
                    import pikepdf
                    with pikepdf.open(pdf_path) as pike_pdf:
                        for page_idx in toc_pages:
                            if page_idx >= len(pike_pdf.pages):
                                continue
                            pike_page = pike_pdf.pages[page_idx]
                            if '/Annots' in pike_page and len(pike_page['/Annots']) > 0:
                                has_links = True
                                break
                except Exception:
                    # pikepdf is optional; a missing package or a read
                    # failure just means this extra check is skipped.
                    pass

            if not has_links:
                warnings.append(make_item(
                    "TOC_LINKS_MISSING",
                    "TOC entries found but no clickable links detected.",
                    "warning"
                ))

        return make_result(source, "pdf-toc", errors, warnings, info)
    finally:
        pdf.close()
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# check-conversion implementation
|
||
# ---------------------------------------------------------------------------
|
||
def check_conversion(docx_path: str, pdf_path: str) -> Dict[str, Any]:
    """Run DOCX→PDF conversion TOC consistency checks.

    Checks performed (result item codes):
      1. CONV_TOC_LOST       (error)   DOCX has a TOC field, PDF (> 5 pages)
                                       shows no TOC keyword in its first
                                       5 pages
      2. CONV_HINT_LEAKED    (error)   a TOC hint phrase appears in the PDF
      3. CONV_HEADING_DRIFT  (warning) DOCX heading count and PDF TOC entry
                                       count differ by more than 30%

    Args:
        docx_path: Path to the source .docx file.
        pdf_path: Path to the converted PDF file.

    Returns:
        The result dict built by make_result() with check_type
        "conversion-toc". Parse failures and a missing pdfplumber package
        are reported as error items rather than raised.
    """
    errors: List[Dict] = []
    warnings: List[Dict] = []
    info: List[Dict] = []
    source = f"{os.path.basename(docx_path)} → {os.path.basename(pdf_path)}"

    # Parse DOCX
    docx_root = parse_docx_xml(docx_path)
    if docx_root is None:
        return make_result(
            source, "conversion-toc",
            [make_item("PARSE_ERROR",
                       "Failed to parse source DOCX file.",
                       "error")],
            [], [])

    # Check DOCX has TOC
    docx_has_toc = docx_has_toc_field(docx_root)

    # Parse PDF
    try:
        import pdfplumber
    except ImportError:
        return make_result(
            source, "conversion-toc",
            [make_item("DEPENDENCY_MISSING",
                       "pdfplumber is not installed.",
                       "error")],
            [], [])

    try:
        pdf = pdfplumber.open(pdf_path)
    except Exception as e:
        return make_result(
            source, "conversion-toc",
            [make_item("PARSE_ERROR",
                       f"Failed to open PDF: {str(e)[:100]}",
                       "error")],
            [], [])

    # try/finally guarantees the PDF handle is released on every exit path.
    try:
        total_pages = len(pdf.pages)

        # Extract all PDF text
        page_texts = {}
        for i in range(total_pages):
            try:
                text = pdf.pages[i].extract_text() or ''
            except Exception:
                # Best effort: a page that fails to extract counts as empty.
                text = ''
            page_texts[i] = text
        # Every page's text is followed by a newline, as before.
        all_pdf_text = ''.join(page_texts[i] + '\n' for i in range(total_pages))

        # Find TOC pages in PDF (first 5 pages only)
        toc_pages = []
        for i in range(min(5, total_pages)):
            text_lower = page_texts.get(i, '').lower()
            if any(kw.lower() in text_lower for kw in TOC_KEYWORDS):
                toc_pages.append(i)

        pdf_has_toc = len(toc_pages) > 0

        # ---- CHECK 1: CONV_TOC_LOST ----
        if docx_has_toc and not pdf_has_toc and total_pages > 5:
            errors.append(make_item(
                "CONV_TOC_LOST",
                "Source DOCX has TOC but converted PDF does not. "
                "TOC was lost during conversion.",
                "error"
            ))

        # ---- CHECK 2: CONV_HINT_LEAKED ----
        all_text_lower = all_pdf_text.lower()
        for phrase in HINT_PHRASES:
            idx = all_text_lower.find(phrase.lower())
            if idx == -1:
                continue
            # Find the actual matched text (up to 60 chars)
            matched = all_pdf_text[idx:idx + len(phrase)]
            truncated = matched[:60] + ('...' if len(matched) > 60 else '')
            errors.append(make_item(
                "CONV_HINT_LEAKED",
                f"TOC hint text leaked into PDF: '{truncated}'. "
                f"Clean hints before conversion.",
                "error"
            ))
            break  # Report only the first match

        # ---- CHECK 3: CONV_HEADING_DRIFT ----
        # Count DOCX headings (body-level paragraphs with a heading style)
        body = docx_root.find(_w('body'))
        docx_heading_count = 0
        if body is not None:
            docx_heading_count = sum(
                1 for para in body
                if para.tag == _w('p')
                and is_any_heading_style(get_paragraph_style(para))
            )

        # Count PDF TOC entries: lines ending in a 1-4 digit page number
        entry_pattern = re.compile(r'^(.+?)\s+(\d{1,4})\s*$')
        pdf_toc_entry_count = 0
        for page_idx in toc_pages:
            for line in page_texts.get(page_idx, '').split('\n'):
                line = line.strip()
                if not line:
                    continue
                m = entry_pattern.match(line)
                if m and 1 <= int(m.group(2)) <= 9999:
                    pdf_toc_entry_count += 1

        if docx_heading_count > 0 and pdf_toc_entry_count > 0:
            drift = abs(docx_heading_count - pdf_toc_entry_count)
            drift_pct = (drift / docx_heading_count) * 100
            if drift_pct > 30:
                warnings.append(make_item(
                    "CONV_HEADING_DRIFT",
                    f"DOCX has {docx_heading_count} headings but PDF TOC has "
                    f"{pdf_toc_entry_count} entries ({drift_pct:.0f}% drift).",
                    "warning"
                ))

        return make_result(source, "conversion-toc", errors, warnings, info)
    finally:
        pdf.close()
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# fix-docx implementation
|
||
# ---------------------------------------------------------------------------
|
||
def _find_toc_sdt_indices(body_elem) -> List[int]:
    """Locate the TOC content controls among the direct children of the body.

    A ``w:sdt`` child counts as a TOC when any of the following holds:
      * one of its ``w:instrText`` descendants contains "TOC" (case-insensitive),
      * its ``w:alias`` value is "TOC" or "目录",
      * its ``w:docPartGallery`` value contains "toc" (case-insensitive).

    Args:
        body_elem: The ``w:body`` element.

    Returns:
        Positions (into the body's direct children) of every matching SDT,
        in document order.
    """
    w_prefix = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'

    def _looks_like_toc(sdt) -> bool:
        # 1) A field instruction mentioning TOC anywhere inside the control.
        if any(node.text and 'TOC' in node.text.upper()
               for node in sdt.findall('.//' + _w('instrText'))):
            return True

        # 2) Otherwise fall back to the control's own properties.
        props = sdt.find(w_prefix + 'sdtPr')
        if props is None:
            return False

        alias_el = props.find(w_prefix + 'alias')
        if alias_el is not None and alias_el.get(_w('val'), '').upper() in ('TOC', '目录'):
            return True

        part_obj = props.find(w_prefix + 'docPartObj')
        if part_obj is None:
            return False
        gallery = part_obj.find(w_prefix + 'docPartGallery')
        return gallery is not None and 'toc' in gallery.get(_w('val'), '').lower()

    return [
        pos for pos, node in enumerate(body_elem)
        if node.tag == w_prefix + 'sdt' and _looks_like_toc(node)
    ]
|
||
|
||
|
||
def _find_toc_field_para_range(body_elem) -> Tuple[Optional[int], Optional[int]]:
    """Find the range of body children spanned by the TOC complex field.

    A complex field is serialized as ``fldChar begin`` -> ``instrText`` ->
    ``fldChar separate`` -> result runs -> ``fldChar end``, and the result of
    a TOC field usually contains nested PAGEREF/HYPERLINK fields. Because the
    ``begin`` marker always precedes the instruction text, the position of
    every open ``begin`` is kept on a stack; when a TOC instruction is seen,
    the innermost open ``begin`` is the start of the TOC field, and the TOC
    ends at the ``end`` marker that closes that same nesting level.

    Only instructions that actually start with the ``TOC`` keyword are
    accepted, so ``PAGEREF _Toc123`` / ``REF _Toc123`` are not mistaken for
    a table of contents.

    Args:
        body_elem: The ``w:body`` element.

    Returns:
        ``(start_idx, end_idx)`` inclusive, as indices into the body's direct
        children; ``(None, None)`` if no TOC field is found; ``(start_idx,
        None)`` if the TOC field is never closed.
    """
    para_tag = _w('p')
    run_path = './/' + _w('r')
    fld_char_tag = _w('fldChar')
    instr_tag = _w('instrText')
    fld_type_attr = _w('fldCharType')
    toc_instr = re.compile(r'\s*TOC\b', re.IGNORECASE)

    open_begins: List[int] = []   # body index of each currently open 'begin'
    start_idx: Optional[int] = None
    toc_depth: Optional[int] = None  # nesting depth at which the TOC field lives

    for ci, child in enumerate(body_elem):
        if child.tag != para_tag:
            continue
        for run in child.findall(run_path):
            # Walk the run's children in document order so that a begin and
            # an instruction sharing one run are processed in the right order.
            for node in run:
                if node.tag == fld_char_tag:
                    fld_type = node.get(fld_type_attr, '')
                    if fld_type == 'begin':
                        open_begins.append(ci)
                    elif fld_type == 'end':
                        if start_idx is not None and len(open_begins) == toc_depth:
                            return start_idx, ci
                        if open_begins:
                            open_begins.pop()
                elif (node.tag == instr_tag and start_idx is None and open_begins
                      and node.text and toc_instr.match(node.text)):
                    start_idx = open_begins[-1]
                    toc_depth = len(open_begins)

    return start_idx, None
|
||
|
||
|
||
def fix_docx(docx_path: str, output_path: Optional[str] = None) -> Dict[str, Any]:
    """Detect TOC issues in a DOCX and fix them, writing a new DOCX file.

    The document is first analysed with the low-level XML helpers. A TOC is
    (re)generated when there is no TOC and the document has at least three
    headings, when the existing TOC is empty, when its language differs from
    the headings' language, or when its entry count differs from the heading
    count by more than 50%. The regenerated TOC is an SDT wrapping a TOC
    field whose entries are HYPERLINK + PAGEREF pairs pointing at bookmarks
    that are added to the heading paragraphs. Page numbers written into the
    entries are rough estimates only.

    Args:
        docx_path: Path of the DOCX to inspect.
        output_path: Where to write the fixed file; defaults to
            ``<name>_fixed<ext>`` next to the input.

    Returns:
        A result dict with the keys ``pass``, ``source``, ``check_type``,
        ``action`` ("fixed", "skipped", "no_toc_needed" or "failed"),
        ``reason``, ``heading_count``, ``toc_entries_before``,
        ``toc_entries_after``, ``output``, ``errors``, ``warnings``, ``info``.

    Raises:
        ImportError: If python-docx is not installed.
    """
    from docx import Document as DocxDocument
    from docx.oxml import OxmlElement
    from docx.oxml.ns import qn

    xml_space = '{http://www.w3.org/XML/1998/namespace}space'
    caption_re = re.compile(r'^[表图]\s*\d')  # table/figure captions styled as headings
    toc_instr_re = re.compile(r'\s*TOC\b', re.IGNORECASE)
    source = os.path.basename(docx_path)
    info_list: List[Any] = []

    if output_path is None:
        base, ext = os.path.splitext(docx_path)
        output_path = base + '_fixed' + ext

    # ------------------------------------------------------------------
    # Local helpers
    # ------------------------------------------------------------------
    def _result(passed, action, reason, n_headings, before, after,
                errors=None, info=None) -> Dict[str, Any]:
        """Build the result dict shared by every exit path."""
        return {
            "pass": passed, "source": source, "check_type": "fix-docx",
            "action": action, "reason": reason,
            "heading_count": n_headings,
            "toc_entries_before": before, "toc_entries_after": after,
            "output": output_path,
            "errors": errors if errors is not None else [],
            "warnings": [],
            "info": info if info is not None else [],
        }

    def _wel(tag, **attrs):
        """Create a ``w:<tag>`` element with ``w:``-namespaced attributes."""
        el = OxmlElement('w:' + tag)
        for key, value in attrs.items():
            el.set(qn('w:' + key), value)
        return el

    def _run(*children):
        """Create a ``w:r`` holding the given children in order."""
        r = _wel('r')
        for c in children:
            r.append(c)
        return r

    def _text(text, preserve=True):
        """Create a ``w:t`` element, optionally with xml:space="preserve"."""
        t = _wel('t')
        if preserve:
            t.set(xml_space, 'preserve')
        t.text = text
        return t

    def _instr(text):
        """Create a ``w:instrText`` element with preserved whitespace."""
        el = _wel('instrText')
        el.set(xml_space, 'preserve')
        el.text = text
        return el

    def _sized_rpr(size, bold, hyperlink=False):
        """Create run properties: [Hyperlink style,] [bold,] sz, szCs."""
        rpr = _wel('rPr')
        if hyperlink:
            rpr.append(_wel('rStyle', val='Hyperlink'))
        if bold:
            rpr.append(_wel('b'))
        rpr.append(_wel('sz', val=size))
        rpr.append(_wel('szCs', val=size))
        return rpr

    def _heading_style(elem):
        """Return the style id of a heading paragraph, or None otherwise."""
        if elem.tag != qn('w:p'):
            return None
        ppr = elem.find(qn('w:pPr'))
        pstyle = ppr.find(qn('w:pStyle')) if ppr is not None else None
        if pstyle is None:
            return None
        val = pstyle.get(qn('w:val'))
        return val if is_any_heading_style(val) else None

    def _toc_field_range(children):
        """Return (start, end) child indices of the TOC field, or None.

        The 'begin' marker precedes the instruction text, so open begins are
        stacked and the innermost one is taken as the TOC start once a TOC
        instruction is seen; the TOC ends where that nesting level closes.
        """
        open_begins = []
        start = depth = None
        for ci, child in enumerate(children):
            if child.tag != qn('w:p'):
                continue
            for run in child.findall('.//' + qn('w:r')):
                for node in run:
                    if node.tag == qn('w:fldChar'):
                        kind = node.get(qn('w:fldCharType'), '')
                        if kind == 'begin':
                            open_begins.append(ci)
                        elif kind == 'end':
                            if start is not None and len(open_begins) == depth:
                                return start, ci
                            if open_begins:
                                open_begins.pop()
                    elif (node.tag == qn('w:instrText') and start is None
                          and open_begins and node.text
                          and toc_instr_re.match(node.text)):
                        start = open_begins[-1]
                        depth = len(open_begins)
        return None

    def _toc_title(title_text):
        """Create the TOC title paragraph (centered, 18pt, bold)."""
        ppr = _wel('pPr')
        ppr.append(_wel('jc', val='center'))
        ppr.append(_wel('spacing', after='200', line='360', lineRule='auto'))
        ppr.append(_sized_rpr('36', True))  # 18pt = 36 half-points
        p = _wel('p')
        p.append(ppr)
        p.append(_run(_sized_rpr('36', True), _text(title_text)))
        return p

    def _toc_entry(text, level, page_num, bm_name):
        """Create one TOC entry: hyperlinked text, tab, PAGEREF page number."""
        ppr = _wel('pPr')
        ppr.append(_wel('pStyle', val=f'TOC{level}' if level <= 3 else 'TOC3'))
        if level >= 2:
            ppr.append(_wel('ind', left=str((level - 1) * 420)))
        # Right-aligned tab stop with dot leader at 9026 twips (~15.9cm)
        tabs = _wel('tabs')
        tabs.append(_wel('tab', val='right', leader='dot', pos='9026'))
        ppr.append(tabs)
        ppr.append(_wel('spacing', before='120', after='60'))

        size = {1: '28', 2: '24'}.get(level, '22')
        link = _wel('hyperlink', anchor=bm_name, history='1')
        link.append(_run(_sized_rpr(size, level == 1, hyperlink=True), _text(text)))
        link.append(_run(_wel('tab')))
        link.append(_run(_wel('fldChar', fldCharType='begin')))
        link.append(_run(_instr(f' PAGEREF {bm_name} \\h ')))
        link.append(_run(_wel('fldChar', fldCharType='separate')))
        page_rpr = _wel('rPr')
        page_rpr.append(_wel('noProof'))
        link.append(_run(page_rpr, _text(str(page_num), preserve=False)))
        link.append(_run(_wel('fldChar', fldCharType='end')))

        p = _wel('p')
        p.append(ppr)
        p.append(link)
        return p

    # ------------------------------------------------------------------
    # Analysis on the raw XML
    # ------------------------------------------------------------------
    root = parse_docx_xml(docx_path)
    if root is None:
        return _result(False, "failed", "Failed to parse DOCX file", 0, 0, 0,
                       errors=[make_item("PARSE_ERROR", "Failed to parse DOCX", "error")])

    body = root.find(_w('body'))
    if body is None:
        return _result(False, "failed", "Document body is empty", 0, 0, 0,
                       errors=[make_item("EMPTY_BODY", "Document body is empty", "error")])

    body_children = list(body)
    heading_texts = []
    for child in body_children:
        if child.tag != _w('p'):
            continue
        style = get_paragraph_style(child)
        if not is_any_heading_style(style):
            continue
        text = get_paragraph_text(child).strip()
        level = _get_heading_level(style)
        if text and level > 0 and not caption_re.match(text):
            heading_texts.append(text)
    heading_count = len(heading_texts)

    toc_info = find_toc_field_boundaries_v2(root)
    has_toc = toc_info['has_toc']
    toc_entry_texts = toc_info['toc_entry_texts']

    sdt_indices = _find_toc_sdt_indices(body)
    has_sdt_toc = bool(sdt_indices)

    # If only an SDT-based TOC was found, use its text runs for the analysis.
    if has_sdt_toc and not toc_entry_texts:
        for si in sdt_indices:
            for t in body_children[si].findall('.//' + _w('t')):
                if t.text and t.text.strip():
                    toc_entry_texts.append(t.text.strip())
    toc_entries_before = len(toc_entry_texts)

    # ------------------------------------------------------------------
    # Decision logic
    # ------------------------------------------------------------------
    if not (has_toc or has_sdt_toc):
        # Case 1: no TOC at all.
        if heading_count < 3:
            return _result(
                True, "no_toc_needed",
                f"Document has only {heading_count} headings, no TOC needed",
                heading_count, 0, 0,
                info=[f"Document has {heading_count} headings (< 3), no TOC needed"])
        info_list.append(f"No TOC found, generating new TOC with {heading_count} entries")
        fix_reason = "no_toc"
    else:
        # Case 2 & 3: a TOC exists; decide whether it is stale or a placeholder.
        fix_reason = ""
        non_empty_entries = [t for t in toc_entry_texts if t.strip()]
        if not non_empty_entries:
            fix_reason = "empty_toc"
            info_list.append("TOC exists but has no text content (uninitialized)")
        else:
            heading_lang = _detect_language(heading_texts)
            toc_lang = _detect_language(non_empty_entries)
            if heading_lang != toc_lang and heading_count >= 3:
                fix_reason = "language_mismatch"
                info_list.append(
                    f"Deleted stale TOC with {toc_entries_before} "
                    f"{'English' if toc_lang == 'en' else 'Chinese'} placeholder entries"
                )
            elif (heading_count > 0
                  and abs(heading_count - toc_entries_before) / heading_count > 0.5):
                fix_reason = "count_mismatch"
                info_list.append(
                    f"TOC has {toc_entries_before} entries but document has "
                    f"{heading_count} headings (>50% drift)"
                )

        if not fix_reason:
            return _result(
                True, "skipped", "TOC appears to be up-to-date",
                heading_count, toc_entries_before, toc_entries_before,
                info=["TOC entries and headings are consistent, no fix needed"])

    # ------------------------------------------------------------------
    # Perform the fix using python-docx
    # ------------------------------------------------------------------
    try:
        doc = DocxDocument(docx_path)
    except Exception as e:
        msg = str(e)[:200]
        return _result(
            False, "failed", f"Failed to open DOCX with python-docx: {msg}",
            heading_count, toc_entries_before, 0,
            errors=[make_item("OPEN_ERROR", f"Failed to open: {msg}", "error")])

    doc_body = doc.element.body
    content_lang = _detect_language(heading_texts)
    toc_title = "目 录" if content_lang == 'zh' else "Table of Contents"

    # Re-extract headings from the python-docx tree. The paragraph ELEMENT is
    # kept alongside its index: indices go stale as soon as the old TOC is
    # removed and the new one inserted, element references do not.
    doc_headings = []  # (body index at scan time, text, level, paragraph element)
    for ci, child in enumerate(doc_body):
        style_val = _heading_style(child)
        if style_val is None:
            continue
        text = ''.join(t.text for t in child.findall('.//' + qn('w:t')) if t.text).strip()
        level = _get_heading_level(style_val)
        if text and level > 0 and not caption_re.match(text):
            doc_headings.append((ci, text, level, child))

    if not doc_headings:
        return _result(
            True, "no_toc_needed", "No headings found in document after re-parse",
            0, toc_entries_before, 0,
            info=["No headings found, skipping TOC generation"])

    # Step 1: remove the existing TOC (SDT, otherwise field-code range).
    insert_before_idx = None
    current = list(doc_body)
    toc_sdts = [current[i] for i in _find_toc_sdt_indices(doc_body)]
    for sdt in toc_sdts:
        insert_before_idx = doc_body.index(sdt)  # position of the last removed SDT wins
        doc_body.remove(sdt)

    if not toc_sdts and has_toc:
        current = list(doc_body)
        field_range = _toc_field_range(current)
        if field_range is not None:
            first, last = field_range
            insert_before_idx = first
            for stale in current[first:last + 1]:
                doc_body.remove(stale)

    # Step 2: with no TOC removed, insert before the first heading-styled
    # paragraph (or at the very top if there is none).
    if insert_before_idx is None:
        insert_before_idx = 0
        for ci, child in enumerate(doc_body):
            if _heading_style(child) is not None:
                insert_before_idx = ci
                break

    # Step 3: build the TOC as an SDT wrapping a TOC field so that both
    # check-docx and a later fix-docx run recognise it.
    used_bm_names = set()
    next_bm_id = 10
    for bm in doc_body.iter(qn('w:bookmarkStart')):
        used_bm_names.add(bm.get(qn('w:name')))
        try:
            next_bm_id = max(next_bm_id, int(bm.get(qn('w:id'), '')) + 1)
        except ValueError:
            pass  # non-numeric id: cannot collide with the integers used here

    sdt_pr = _wel('sdtPr')
    sdt_pr.append(_wel('alias', val='TOC'))
    part_obj = _wel('docPartObj')
    part_obj.append(_wel('docPartGallery', val='Table of Contents'))
    part_obj.append(_wel('docPartUnique'))
    sdt_pr.append(part_obj)

    sdt_content = _wel('sdtContent')
    sdt_content.append(_toc_title(toc_title))

    p_begin = _wel('p')
    p_begin.append(_run(_wel('fldChar', fldCharType='begin')))
    p_begin.append(_run(_instr(' TOC \\o "1-3" \\h \\z \\u ')))
    p_begin.append(_run(_wel('fldChar', fldCharType='separate')))
    sdt_content.append(p_begin)

    # Page numbers are estimates: cover (~1 page) + TOC (~2 pages), then
    # roughly 40 body children per page, scaled by the heading's position.
    toc_offset = 3
    body_count = len(doc_body)
    est_total_pages = max(toc_offset + 1, body_count // 40 + toc_offset)

    bookmark_targets = []  # (heading paragraph element, bookmark name)
    name_suffix = 100000
    toc_headings = [h for h in doc_headings if h[2] <= 3]
    for seq, (body_idx, h_text, h_level, h_para) in enumerate(toc_headings):
        # Skip names already present so HYPERLINK/PAGEREF targets stay unique.
        while f'_Toc{name_suffix}' in used_bm_names:
            name_suffix += 1
        bm_name = f'_Toc{name_suffix}'
        name_suffix += 1

        if body_count > 0:
            position_ratio = body_idx / body_count
            est_page = toc_offset + max(
                0, int(position_ratio * (est_total_pages - toc_offset)))
        else:
            est_page = toc_offset + seq
        est_page = max(toc_offset, est_page)

        sdt_content.append(_toc_entry(h_text, h_level, str(est_page), bm_name))
        bookmark_targets.append((h_para, bm_name))

    p_end = _wel('p')
    p_end.append(_run(_wel('fldChar', fldCharType='end')))
    sdt_content.append(p_end)

    toc_sdt = _wel('sdt')
    toc_sdt.append(sdt_pr)
    toc_sdt.append(sdt_content)

    page_break = _wel('p')
    page_break.append(_run(_wel('br', type='page')))

    # Insert the TOC followed by a page break at the chosen position.
    ref_children = list(doc_body)
    if insert_before_idx >= len(ref_children):
        doc_body.append(toc_sdt)
        doc_body.append(page_break)
    else:
        ref_element = ref_children[insert_before_idx]
        ref_element.addprevious(toc_sdt)
        ref_element.addprevious(page_break)

    # Bookmark each heading so PAGEREF and HYPERLINK can resolve. The start
    # marker goes after w:pPr, which must remain the paragraph's first child.
    for bm_id, (heading_para, bm_name) in enumerate(bookmark_targets, start=next_bm_id):
        first_child = heading_para[0] if len(heading_para) else None
        at = 1 if first_child is not None and first_child.tag == qn('w:pPr') else 0
        heading_para.insert(at, _wel('bookmarkStart', id=str(bm_id), name=bm_name))
        heading_para.append(_wel('bookmarkEnd', id=str(bm_id)))

    try:
        doc.save(output_path)
    except Exception as e:
        msg = str(e)[:200]
        return _result(
            False, "failed", f"Failed to save: {msg}",
            heading_count, toc_entries_before, 0,
            errors=[make_item("SAVE_ERROR", f"Failed to save: {msg}", "error")])

    toc_entries_after = len(toc_headings)
    info_list.append(f"Generated new TOC with {toc_entries_after} entries")

    return _result(True, "fixed", fix_reason, heading_count,
                   toc_entries_before, toc_entries_after, info=info_list)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# fix-pages implementation (two-pass page-number correction); CLI helpers follow
|
||
# ---------------------------------------------------------------------------
|
||
def fix_docx_accurate_pages(fixed_docx_path: str, pass1_pdf_path: str, output_path: Optional[str] = None) -> Dict[str, Any]:
    """Update TOC page numbers in a fix-docx output using page positions from a PDF.

    Two-pass approach:
      Pass 1: Convert the DOCX (without TOC fix or with estimated pages) to PDF
      Pass 2: Read heading positions from that PDF, shift them by the number
              of pages the TOC is estimated to add, and write the result into
              the cached PAGEREF result text of each TOC entry.

    A heading is located on the first PDF page whose extracted text contains
    the heading text; headings that cannot be located keep their old number.

    Args:
        fixed_docx_path: Path to the DOCX after fix-docx (has PAGEREF fields with estimated pages)
        pass1_pdf_path: Path to a PDF converted from the ORIGINAL docx (without TOC)
        output_path: Where to save the updated DOCX (defaults to overwrite fixed_docx_path)

    Returns:
        On success a dict with ``pass``, ``source``, ``check_type``
        ("fix-pages"), ``pages_updated``, ``total_headings``,
        ``toc_pages_estimated``, ``offset_applied`` and ``output``.
        If a required package is missing, ``{"pass": False, "error": ...}``.
        If the document has no headings, a short ``pass: True`` dict.
    """
    import zipfile as zf_mod

    try:
        import pdfplumber
    except ImportError:
        return {"pass": False, "error": "pdfplumber not installed — cannot extract page positions"}

    try:
        from docx import Document
    except ImportError:
        return {"pass": False, "error": "python-docx not installed"}

    try:
        from lxml import etree
    except ImportError:
        return {"pass": False, "error": "lxml not installed"}

    if output_path is None:
        output_path = fixed_docx_path

    source = os.path.basename(fixed_docx_path)

    # --- Step 1: Extract headings from the fixed DOCX ---
    caption_pattern = re.compile(r'^[表图]\s*\d')
    headings = []
    for p in Document(fixed_docx_path).paragraphs:
        style_name = p.style.name if p.style else ''
        if not style_name.startswith('Heading'):
            continue
        m = re.match(r'Heading\s*(\d+)', style_name)
        if not m:
            continue
        text = p.text.strip()
        if text and not caption_pattern.match(text):
            headings.append({'text': text, 'level': int(m.group(1))})

    if not headings:
        return {"pass": True, "source": source, "info": "No headings found, nothing to update"}

    # --- Step 2: Find page positions in the pass-1 PDF ---
    # The context manager closes the PDF even if text extraction raises.
    with pdfplumber.open(pass1_pdf_path) as pdf:
        page_texts = [page.extract_text() or '' for page in pdf.pages]

    heading_pages_pass1: Dict[str, int] = {}
    for h in headings:
        for page_num, pt in enumerate(page_texts):
            if h['text'] in pt:
                heading_pages_pass1[h['text']] = page_num + 1  # 1-indexed
                break

    # --- Step 3: Calculate offset ---
    toc_entry_count = sum(1 for h in headings if h['level'] <= 3)

    # Estimate: ~15 entries per page for CJK text with leader dots
    toc_pages = max(1, (toc_entry_count + 14) // 15)

    # If the first heading sits on page 1-2 the pass-1 PDF is assumed to have
    # no TOC pages; otherwise part of the TOC space is already accounted for.
    first_heading_page = min(heading_pages_pass1.values()) if heading_pages_pass1 else 1
    if first_heading_page <= 2:
        offset = toc_pages + 1  # +1 for page break after TOC
    else:
        offset = max(0, toc_pages - (first_heading_page - 2))

    heading_page_map: Dict[str, int] = {
        h_text: orig_page + offset for h_text, orig_page in heading_pages_pass1.items()
    }

    # --- Step 4: Update PAGEREF result text in the DOCX XML ---
    # All entries are read up front so the archive can be rewritten in place
    # when output_path is the same file as fixed_docx_path.
    with zf_mod.ZipFile(fixed_docx_path, 'r') as zf:
        doc_xml = zf.read('word/document.xml')
        entries = [(info, zf.read(info.filename))
                   for info in zf.infolist() if not info.is_dir()]

    root = etree.fromstring(doc_xml)
    w_ns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    fld_char = f'{{{w_ns}}}fldChar'
    fld_type = f'{{{w_ns}}}fldCharType'
    instr_text = f'{{{w_ns}}}instrText'
    w_t = f'{{{w_ns}}}t'

    all_runs = root.findall(f'.//{{{w_ns}}}r')
    run_count = len(all_runs)
    updates = 0
    for i, r in enumerate(all_runs):
        fld = r.find(fld_char)
        if fld is None or fld.get(fld_type) != 'begin' or i + 1 >= run_count:
            continue
        instr_t = all_runs[i + 1].find(instr_text)
        if instr_t is None or not instr_t.text or 'PAGEREF' not in instr_t.text:
            continue
        # The parent (a hyperlink for fix-docx output) holds the entry text.
        container = r.getparent()
        if container is None:
            continue
        text_nodes = container.findall(f'.//{w_t}')

        # Look for the 'separate' marker shortly after the instruction; the
        # run right after it carries the cached page number.
        for j in range(i + 2, min(i + 5, run_count)):
            sep_fld = all_runs[j].find(fld_char)
            if sep_fld is None or sep_fld.get(fld_type) != 'separate':
                continue
            if j + 1 < run_count:
                page_t = all_runs[j + 1].find(w_t)
                if page_t is not None:
                    heading_text = ''.join(
                        tn.text for tn in text_nodes if tn.text and tn is not page_t
                    ).strip()
                    correct_page = heading_page_map.get(heading_text)
                    if correct_page:
                        page_t.text = str(correct_page)
                        updates += 1
            break

    # --- Step 5: Save updated DOCX ---
    # Copy every entry in its original order and with its original metadata,
    # substituting only word/document.xml.
    new_doc_xml = etree.tostring(root, xml_declaration=True, encoding='UTF-8', standalone=True)
    with zf_mod.ZipFile(output_path, 'w', zf_mod.ZIP_DEFLATED) as out:
        for info, data in entries:
            out.writestr(info, new_doc_xml if info.filename == 'word/document.xml' else data)

    return {
        "pass": True,
        "source": source,
        "check_type": "fix-pages",
        "pages_updated": updates,
        "total_headings": len(headings),
        "toc_pages_estimated": toc_pages,
        "offset_applied": offset,
        "output": output_path,
    }
|
||
|
||
|
||
def print_usage():
    """Write the command-line synopsis for every sub-command to stderr."""
    usage_lines = (
        "Usage:",
        " toc_validate.py check-docx <file.docx>",
        " toc_validate.py check-pdf <file.pdf>",
        " toc_validate.py check-conversion <input.docx> <output.pdf>",
        " toc_validate.py fix-docx <input.docx> [-o output.docx]",
        " toc_validate.py fix-pages <fixed.docx> <pass1.pdf> [-o output.docx]",
        # Blank separator before the fix-pages explanation.
        "",
        "fix-pages: 2-pass page number correction. Requires a PDF converted",
        " from the ORIGINAL docx (without TOC) as reference.",
    )
    # One write; print() supplies the final newline, so the bytes emitted
    # match printing each line separately.
    print("\n".join(usage_lines), file=sys.stderr)
|
||
|
||
|
||
def _require_args(min_argc, message):
    """Exit with status 2 (after printing *message* and usage) if argv is too short."""
    if len(sys.argv) < min_argc:
        print(message, file=sys.stderr)
        print_usage()
        sys.exit(2)


def _require_file(path):
    """Return *path* if it is an existing regular file; otherwise report and exit 2."""
    if not os.path.isfile(path):
        print(f"Error: File not found: {path}", file=sys.stderr)
        sys.exit(2)
    return path


def _parse_output_flag():
    """Return the argument following ``-o`` in sys.argv, or None if ``-o`` is absent.

    Exits with status 2 when ``-o`` is the last argument, i.e. has no value.
    """
    if '-o' not in sys.argv:
        return None
    o_idx = sys.argv.index('-o')
    if o_idx + 1 >= len(sys.argv):
        print("Error: -o flag requires an output path.", file=sys.stderr)
        sys.exit(2)
    return sys.argv[o_idx + 1]


def main():
    """CLI entry point.

    Dispatches on ``sys.argv[1]`` (compared case-insensitively) to one of the
    sub-commands ``check-docx``, ``check-pdf``, ``check-conversion``,
    ``fix-docx`` or ``fix-pages``, prints the resulting dict as JSON on
    stdout, and terminates the process via ``sys.exit``.

    Exit codes:
        0 -- the result's ``"pass"`` entry is truthy
        1 -- the result's ``"pass"`` entry is falsy
        2 -- usage error, missing input file, or an unexpected exception
             (in the last case a JSON error object is still printed to stdout)
    """
    if len(sys.argv) < 2:
        print_usage()
        sys.exit(2)

    command = sys.argv[1].lower()

    try:
        if command == 'check-docx':
            _require_args(3, "Error: Missing DOCX file path.")
            docx_path = _require_file(sys.argv[2])
            result = check_docx(docx_path)

        elif command == 'check-pdf':
            _require_args(3, "Error: Missing PDF file path.")
            pdf_path = _require_file(sys.argv[2])
            result = check_pdf(pdf_path)

        elif command == 'check-conversion':
            _require_args(4, "Error: Missing file paths. Need both DOCX and PDF.")
            docx_path = _require_file(sys.argv[2])
            pdf_path = _require_file(sys.argv[3])
            result = check_conversion(docx_path, pdf_path)

        elif command == 'fix-docx':
            _require_args(3, "Error: Missing DOCX file path.")
            docx_path = _require_file(sys.argv[2])
            output_path = _parse_output_flag()
            result = fix_docx(docx_path, output_path)

        elif command == 'fix-pages':
            _require_args(4, "Error: Need both fixed DOCX and pass1 PDF paths.")
            fixed_docx = _require_file(sys.argv[2])
            pass1_pdf = _require_file(sys.argv[3])
            # A dangling "-o" is rejected here exactly as in fix-docx, rather
            # than being silently dropped in favour of the default output.
            output_path = _parse_output_flag()
            result = fix_docx_accurate_pages(fixed_docx, pass1_pdf, output_path)

        else:
            print(f"Error: Unknown command '{command}'", file=sys.stderr)
            print_usage()
            sys.exit(2)

        # Output JSON to stdout
        print(json.dumps(result, ensure_ascii=False, indent=2))

        # Exit code: 0=pass, 1=fail
        sys.exit(0 if result['pass'] else 1)

    except Exception as e:
        # Unexpected error: emit a JSON error object and exit 2.
        # sys.exit() raises SystemExit, which is not an Exception subclass,
        # so the deliberate exits above pass straight through this handler.
        error_result = {
            "pass": False,
            "source": sys.argv[2] if len(sys.argv) > 2 else "unknown",
            "check_type": command.replace('check-', '') + '-toc'
            if command.startswith('check-') else 'unknown',
            "errors": [make_item("SCRIPT_ERROR",
                                 f"Unexpected error: {str(e)[:200]}",
                                 "error")],
            "warnings": [],
            "info": [],
        }
        print(json.dumps(error_result, ensure_ascii=False, indent=2))
        sys.exit(2)
|
||
|
||
|
||
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|