mantle-ai-trader/skills/pdf/scripts/toc_validate.py

#!/usr/bin/env python3
"""
toc_validate.py - Table of Contents Validation for DOCX and PDF files.

Checks DOCX and PDF files for TOC quality issues including missing TOC fields,
empty placeholders, heading style mismatches, page break issues, and more.
Also validates TOC consistency across DOCX→PDF conversions.

Usage:
    python3 toc_validate.py check-docx output.docx
    python3 toc_validate.py check-pdf output.pdf
    python3 toc_validate.py check-conversion input.docx output.pdf

Output:
    JSON to stdout with structure:
    {
        "pass": true/false,
        "source": "filename",
        "check_type": "docx-toc"|"pdf-toc"|"conversion-toc",
        "errors": [...],
        "warnings": [...],
        "info": [...]
    }

Exit codes:
    0 = pass (no errors)
    1 = fail (errors found)
    2 = script error (bad args, file not found, etc.)

Dependencies:
    - Standard library (zipfile, xml.etree.ElementTree, etc.)
    - pdfplumber (for PDF checks)
    - pikepdf (optional, for link annotation checks)
"""

import sys
import os
import json
import re
import zipfile
import tempfile
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import List, Dict, Tuple, Optional, Any

# ---------------------------------------------------------------------------
# XML namespace constants
# ---------------------------------------------------------------------------
NS = dict(
    w='http://schemas.openxmlformats.org/wordprocessingml/2006/main',
    r='http://schemas.openxmlformats.org/officeDocument/2006/relationships',
)

# Lower-cased style identifiers treated as standard headings: "headingN" and
# "heading N" for N in 1..4, plus the bare numeric IDs "1".."4".
STANDARD_HEADING_STYLES = (
    {f'heading{sep}{level}' for sep in ('', ' ') for level in range(1, 5)}
    | {str(level) for level in range(1, 5)}
)

# Strings whose presence in extracted PDF page text marks a TOC page.
TOC_KEYWORDS = [
    '目录',
    '目 录',
    '目  录',
    'table of contents',
    'contents',
]

# Editing hints that must not survive into the final PDF.
HINT_PHRASES = [
    '提示：本目录通过域代码生成',
    '右键更新域',
    'Update Field',
    'right-click',
    'Tip: This table of contents',
]

# Substrings that identify hint text when checking DOCX run styling.
HINT_INDICATORS = [
    '提示',
    'Tip:',
    'Update Field',
    '更新域',
]

# Lower-case hex RGB values accepted as "gray" without further analysis.
GRAY_COLORS = {
    '666666', '777777', '808080', '888888',
    '909090', '959595', '999999', '9a9a9a',
    'a0a0a0', 'a5a5a5', 'a8a8a8', 'aaaaaa',
    'b0b0b0', 'b8b8b8', 'bbbbbb',
    'c0c0c0', 'cccccc',
}


# ---------------------------------------------------------------------------
# Result helpers
# ---------------------------------------------------------------------------
def make_item(code: str, message: str, severity: str) -> Dict[str, str]:
    """Build one finding record.

    Args:
        code: Machine-readable identifier of the finding.
        message: Human-readable description.
        severity: Severity label stored verbatim in the record.

    Returns:
        A dict with the keys ``code``, ``message`` and ``severity``.
    """
    item = dict(code=code, message=message, severity=severity)
    return item


def make_result(source: str, check_type: str, errors: List, warnings: List,
                info: List) -> Dict[str, Any]:
    """Assemble the report dict that is emitted for one validation run.

    Args:
        source: Name of the checked file (or file pair).
        check_type: Identifier of the check that produced the report.
        errors: Error items; the run passes only when this is empty.
        warnings: Warning items.
        info: Informational items.

    Returns:
        A dict with the keys ``pass``, ``source``, ``check_type``,
        ``errors``, ``warnings`` and ``info``.
    """
    passed = len(errors) == 0
    result: Dict[str, Any] = {}
    result["pass"] = passed
    result["source"] = source
    result["check_type"] = check_type
    result["errors"] = errors
    result["warnings"] = warnings
    result["info"] = info
    return result


# ---------------------------------------------------------------------------
# DOCX XML parsing helpers
# ---------------------------------------------------------------------------
def parse_docx_xml(docx_path: str) -> Optional[ET.Element]:
    """Load ``word/document.xml`` from a .docx archive and parse it.

    Args:
        docx_path: Path of the .docx (zip) file.

    Returns:
        The root element of the parsed document, or ``None`` when the file
        is not a valid zip, lacks ``word/document.xml``, or the XML is
        malformed. Other errors (for example a missing file) propagate.
    """
    try:
        with zipfile.ZipFile(docx_path, 'r') as archive:
            xml_bytes = archive.read('word/document.xml')
        return ET.fromstring(xml_bytes)
    except (zipfile.BadZipFile, KeyError, ET.ParseError):
        return None


def get_all_paragraphs(root: ET.Element) -> List[ET.Element]:
    """Collect every ``w:p`` descendant of *root*, in document order."""
    paragraph_path = './/{}'.format(_w('p'))
    return list(root.iterfind(paragraph_path))


def _w(tag: str) -> str:
    """Qualify *tag* with the WordprocessingML namespace in Clark notation."""
    namespace = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    return ''.join((namespace, tag))


def get_paragraph_text(para: ET.Element) -> str:
    """Join the text of every ``w:t`` descendant of *para*.

    Elements with no text (``None`` or empty) contribute nothing; no
    separator is inserted between fragments.
    """
    text_path = './/' + _w('t')
    return ''.join(node.text for node in para.iterfind(text_path) if node.text)


def get_paragraph_style(para: ET.Element) -> Optional[str]:
    """Return the ``w:val`` of the paragraph's ``w:pPr/w:pStyle``.

    Returns ``None`` when the paragraph has no ``w:pPr``, no ``w:pStyle``,
    or the ``w:pStyle`` carries no ``w:val`` attribute.
    """
    props = para.find(_w('pPr'))
    style_node = props.find(_w('pStyle')) if props is not None else None
    return style_node.get(_w('val')) if style_node is not None else None


def is_heading_style(style_val: Optional[str]) -> bool:
    """Tell whether *style_val* names a heading style.

    The value is lower-cased and stripped, then accepted when it is listed
    in ``STANDARD_HEADING_STYLES``, starts with ``heading``, or is one of
    the numeric IDs ``1``-``4``. ``None`` is never a heading.
    """
    if style_val is None:
        return False
    normalized = style_val.lower().strip()
    return (
        normalized in STANDARD_HEADING_STYLES
        or normalized.startswith('heading')
        or normalized in ('1', '2', '3', '4')
    )


def is_any_heading_style(style_val: Optional[str]) -> bool:
    """Tell whether a style looks like a heading of any kind.

    After lower-casing and stripping, any value beginning with ``heading``
    qualifies, as do the bare numeric IDs ``1``-``4``.
    """
    if style_val is None:
        return False
    normalized = style_val.lower().strip()
    if normalized.startswith('heading'):
        return True
    return normalized in {'1', '2', '3', '4'}


def is_standard_heading_style(style_val: Optional[str]) -> bool:
    """Tell whether a style is exactly Heading 1-4.

    Comparison is case-insensitive and ignores surrounding whitespace; the
    level digit may follow ``heading`` directly or after a single space.
    """
    if style_val is None:
        return False
    normalized = style_val.lower().strip()
    if not normalized.startswith('heading'):
        return False
    suffix = normalized[len('heading'):]
    return suffix in ('1', '2', '3', '4', ' 1', ' 2', ' 3', ' 4')


def paragraph_is_bold_large(para: ET.Element) -> bool:
    """Tell whether a paragraph carries both bold and large (>= 14pt) text.

    The paragraph-level ``w:pPr/w:rPr`` and the ``w:rPr`` of every direct
    ``w:r`` child are inspected. Bold and size are tracked independently,
    so they may come from different property sets.

    A ``w:b`` counts as bold unless its ``w:val`` is ``false``, ``0`` or
    ``off``. A ``w:sz`` counts as large at 28 half-points or more;
    unparsable sizes are ignored.
    """
    val_attr = _w('val')
    props_tag = _w('rPr')

    # Property sets to inspect: the paragraph-level one first, then each run's.
    candidates = []
    para_props = para.find(_w('pPr'))
    if para_props is not None:
        candidates.append(para_props.find(props_tag))
    candidates.extend(run.find(props_tag) for run in para.findall(_w('r')))

    bold_seen = False
    large_seen = False
    for props in candidates:
        if props is None:
            continue

        bold_node = props.find(_w('b'))
        if bold_node is not None:
            flag = bold_node.get(val_attr)
            if flag is None or flag.lower() not in ('false', '0', 'off'):
                bold_seen = True

        size_node = props.find(_w('sz'))
        if size_node is not None:
            try:
                if int(size_node.get(val_attr, '0')) >= 28:
                    large_seen = True
            except (ValueError, TypeError):
                pass

    return bold_seen and large_seen


def docx_has_toc_field(root: ET.Element) -> bool:
    """Check whether the document contains a TOC field code.

    Two encodings are recognised anywhere below *root*:
    - ``<w:fldSimple>`` whose ``w:instr`` attribute is a TOC instruction
    - ``<w:instrText>`` whose text is a TOC instruction

    An instruction counts as TOC only when its first token is the keyword
    ``TOC`` (case-insensitive, leading whitespace allowed). A plain
    substring test would also match unrelated fields that merely contain
    those letters, e.g. ``PAGEREF _Toc123``, ``HYPERLINK \\l "_Toc123"``
    or ``MERGEFIELD StockPrice``.

    Returns:
        True if at least one TOC instruction is found, otherwise False.
    """
    toc_keyword = re.compile(r'\s*TOC\b', re.IGNORECASE)

    # Simple fields keep the instruction in an attribute.
    for fld in root.findall('.//' + _w('fldSimple')):
        if toc_keyword.match(fld.get(_w('instr'), '')):
            return True

    # Complex fields keep the instruction in w:instrText runs.
    for instr in root.findall('.//' + _w('instrText')):
        if instr.text and toc_keyword.match(instr.text):
            return True

    return False


def find_toc_field_boundaries(root: ET.Element) -> Tuple[Optional[int], Optional[int], Optional[int]]:
    """Locate the begin/separate/end markers of the first TOC complex field.

    Only ``w:p`` elements that are direct children of ``w:body`` are
    scanned, and within them only their direct ``w:r`` children. Field
    nesting is tracked so that the markers of nested fields (for example
    the PAGEREF fields inside TOC entries) are not mistaken for the TOC's
    own markers.

    Returns:
        ``(begin_idx, separate_idx, end_idx)``. Each value is the position,
        among ALL direct children of ``w:body`` (not just paragraphs), of
        the paragraph holding that ``w:fldChar``, or ``None`` when the
        marker was not found.
    """
    body = root.find(_w('body'))
    if body is None:
        return None, None, None

    # The field keyword is the first token of the instruction; anchoring on
    # it keeps "PAGEREF _Toc123" and similar from being treated as a TOC.
    toc_keyword = re.compile(r'\s*TOC\b', re.IGNORECASE)
    para_tag = _w('p')
    fld_char_tag = _w('fldChar')
    instr_tag = _w('instrText')

    begin_idx: Optional[int] = None
    separate_idx: Optional[int] = None
    end_idx: Optional[int] = None
    open_begins: List[int] = []      # body index of each still-open field begin
    toc_depth: Optional[int] = None  # nesting depth of the TOC field once seen

    for body_idx, elem in enumerate(body):
        if elem.tag != para_tag:
            continue
        for run in elem.findall(_w('r')):
            # Walk the run's children in document order so a begin marker
            # is always registered before the instruction that follows it.
            for node in run:
                if node.tag == instr_tag:
                    if (toc_depth is None and open_begins and node.text
                            and toc_keyword.match(node.text)):
                        # The instruction belongs to the innermost open
                        # field, whose begin marker was already recorded.
                        begin_idx = open_begins[-1]
                        toc_depth = len(open_begins)
                elif node.tag == fld_char_tag:
                    kind = node.get(_w('fldCharType'), '')
                    if kind == 'begin':
                        open_begins.append(body_idx)
                    elif kind == 'separate':
                        if toc_depth == len(open_begins) and separate_idx is None:
                            separate_idx = body_idx
                    elif kind == 'end':
                        if toc_depth == len(open_begins):
                            # First TOC field is complete; stop so later
                            # fields cannot overwrite the result.
                            end_idx = body_idx
                            return begin_idx, separate_idx, end_idx
                        if open_begins:
                            open_begins.pop()

    return begin_idx, separate_idx, end_idx


def find_toc_field_boundaries_v2(root: ET.Element) -> Dict[str, Any]:
    """Enhanced TOC boundary finder that works with nested fields.

    Only ``w:p`` elements that are direct children of ``w:body`` are
    considered; all indices refer to positions in that paragraph-only list.
    The first field whose instruction starts with the keyword ``TOC`` is
    taken as the TOC. Fields that merely mention those letters (such as
    ``PAGEREF _Toc123``) are ignored.

    Returns dict with:
        'has_toc': bool
        'begin_para_idx': int or None
        'separate_para_idx': int or None
        'end_para_idx': int or None
        'toc_entry_texts': list of str, the non-empty stripped text of each
            paragraph from the separate paragraph through the end paragraph
            (both inclusive)
    """
    body = root.find(_w('body'))
    if body is None:
        return {'has_toc': False, 'begin_para_idx': None,
                'separate_para_idx': None, 'end_para_idx': None,
                'toc_entry_texts': []}

    paragraphs = [e for e in body if e.tag == _w('p')]
    toc_keyword = re.compile(r'\s*TOC\b', re.IGNORECASE)

    # Phase 1: flatten the field markers into (para_idx, event) pairs.
    # Runs at any depth inside the paragraph are scanned (e.g. inside
    # hyperlinks); a run's fldChar is recorded before its instrText.
    events = []
    for pi, para in enumerate(paragraphs):
        for run in para.findall('.//' + _w('r')):
            fld_char = run.find(_w('fldChar'))
            if fld_char is not None:
                events.append((pi, fld_char.get(_w('fldCharType'), '')))
            instr = run.find(_w('instrText'))
            if instr is not None and instr.text and toc_keyword.match(instr.text):
                events.append((pi, 'toc_instr'))

    # Phase 2: replay the events while tracking nesting. The stack holds
    # the paragraph index of every open field, so its length is the depth.
    toc_begin_idx = None
    toc_separate_idx = None
    toc_end_idx = None
    found_toc = False
    toc_depth = None
    open_begins = []

    for pi, evt in events:
        if evt == 'begin':
            open_begins.append(pi)
        elif evt == 'toc_instr':
            if not found_toc and open_begins:
                found_toc = True
                toc_begin_idx = open_begins[-1]
                toc_depth = len(open_begins)
        elif evt == 'separate':
            if found_toc and toc_separate_idx is None and len(open_begins) == toc_depth:
                toc_separate_idx = pi
        elif evt == 'end':
            if found_toc and toc_end_idx is None and len(open_begins) == toc_depth:
                toc_end_idx = pi
            if open_begins:
                open_begins.pop()

    # Phase 3: collect the text of the paragraphs spanned by the field result.
    toc_entry_texts = []
    if toc_separate_idx is not None and toc_end_idx is not None:
        for para in paragraphs[toc_separate_idx:toc_end_idx + 1]:
            text = get_paragraph_text(para).strip()
            if text:
                toc_entry_texts.append(text)

    return {
        'has_toc': found_toc,
        'begin_para_idx': toc_begin_idx,
        'separate_para_idx': toc_separate_idx,
        'end_para_idx': toc_end_idx,
        'toc_entry_texts': toc_entry_texts,
    }


def check_toc_has_content(root: ET.Element, separate_para_idx: Optional[int],
                          end_para_idx: Optional[int]) -> bool:
    """Tell whether any visible text lies between the TOC separate/end markers.

    The indices refer to the list of ``w:p`` elements that are direct
    children of ``w:body``. Every paragraph from *separate_para_idx*
    through *end_para_idx* (inclusive, clipped to the paragraph count) is
    searched for a ``w:t`` whose text is not just whitespace.

    Returns False when either index is ``None`` or the body is missing.
    """
    if separate_para_idx is None or end_para_idx is None:
        return False

    body = root.find(_w('body'))
    if body is None:
        return False

    para_tag = _w('p')
    text_path = './/' + _w('t')
    paragraphs = [child for child in body if child.tag == para_tag]
    last_exclusive = min(end_para_idx + 1, len(paragraphs))

    return any(
        node.text and node.text.strip()
        for idx in range(separate_para_idx, last_exclusive)
        for node in paragraphs[idx].iterfind(text_path)
    )


def fuzzy_match(text_a: str, text_b: str) -> bool:
    """Loosely compare two strings.

    Both inputs are stripped and lower-cased first. They match when one is
    a substring of the other, or when the Jaccard similarity of their
    character sets (distinct shared characters over distinct characters in
    either) exceeds 0.6. An empty string never matches.
    """
    left = text_a.strip().lower()
    right = text_b.strip().lower()

    if not (left and right):
        return False

    if left in right or right in left:
        return True

    chars_left, chars_right = set(left), set(right)
    shared = len(chars_left & chars_right)
    combined = len(chars_left | chars_right)
    return shared / combined > 0.6


def _detect_language(texts: list) -> str:
    """Guess the dominant language of a list of strings.

    Returns ``'zh'`` when strictly more than half of the strings contain at
    least one character in U+4E00-U+9FFF, otherwise ``'en'`` (including for
    an empty list).
    """
    if not texts:
        return 'en'
    cjk = re.compile(r'[\u4e00-\u9fff]')
    hits = len([entry for entry in texts if cjk.search(entry)])
    if hits > len(texts) / 2:
        return 'zh'
    return 'en'


def _get_heading_level(style_val: Optional[str]) -> int:
    """Derive the heading level from a style value; 0 means "not a heading".

    After lower-casing and stripping, ``heading`` followed by optional
    whitespace and digits yields those digits as the level. A bare single
    digit ``1``-``9`` is also accepted.
    """
    if style_val is None:
        return 0
    normalized = style_val.lower().strip()

    heading = re.match(r'heading\s*(\d+)', normalized)
    if heading is not None:
        return int(heading.group(1))

    numeric_ids = {'1', '2', '3', '4', '5', '6', '7', '8', '9'}
    return int(normalized) if normalized in numeric_ids else 0


def check_run_hint_style(run: ET.Element) -> Tuple[bool, bool]:
    """Report whether a run is styled gray and/or in a small font.

    Gray means the ``w:color`` value (lower-cased) is listed in
    ``GRAY_COLORS``, or is a six-digit hex colour whose three channels all
    differ by less than 30 and whose red channel lies in 80..210.
    Small means ``w:sz`` is at most 18 half-points (9pt).

    Returns:
        ``(has_gray_color, has_small_font)``; ``(False, False)`` when the
        run has no ``w:rPr``.
    """
    props = run.find(_w('rPr'))
    if props is None:
        return False, False

    val_attr = _w('val')

    gray = False
    color_node = props.find(_w('color'))
    if color_node is not None:
        hex_val = color_node.get(val_attr, '').lower()
        gray = hex_val in GRAY_COLORS
        if not gray and len(hex_val) == 6:
            # Not a listed value: fall back to a near-equal-channels test.
            try:
                red, green, blue = (int(hex_val[i:i + 2], 16) for i in (0, 2, 4))
            except ValueError:
                pass
            else:
                gray = (abs(red - green) < 30 and abs(green - blue) < 30
                        and abs(red - blue) < 30 and 80 <= red <= 210)

    small = False
    size_node = props.find(_w('sz'))
    if size_node is not None:
        try:
            small = int(size_node.get(val_attr, '0')) <= 18  # 18 half-points = 9pt
        except (ValueError, TypeError):
            pass

    return gray, small


# ---------------------------------------------------------------------------
# check-docx implementation
# ---------------------------------------------------------------------------
def check_docx(docx_path: str) -> Dict[str, Any]:
    """Validate the table of contents of a DOCX file.

    Only paragraphs that are direct children of the document body are
    examined. The checks, in order:

    1. TOC_FIELD_MISSING (warning): three or more headings but no TOC.
    2. TOC_PLACEHOLDER_EMPTY (error): TOC field has no text in its result.
    3. TOC_HEADING_STYLE (error): bold + large paragraph after the TOC that
       does not use a standard Heading 1-4 style.
    4. TOC_ENTRY_MISMATCH (error): fewer than half of the content headings
       fuzzily match a TOC entry.
    5. TOC_NO_PAGEBREAK (warning): no page break in the TOC's last paragraph
       or the two paragraphs after it.
    6. TOC_HINT_STYLE (warning): the first paragraph containing hint text
       has a hint run that is not both gray and small.

    Args:
        docx_path: Path of the .docx file.

    Returns:
        The result dict built by ``make_result`` with check type
        ``"docx-toc"``.
    """
    errors: List[Dict] = []
    warnings: List[Dict] = []
    info: List[Dict] = []
    source = os.path.basename(docx_path)

    root = parse_docx_xml(docx_path)
    if root is None:
        parse_failure = make_item(
            "PARSE_ERROR",
            "Failed to parse DOCX file. File may be corrupted.",
            "error")
        return make_result(source, "docx-toc", [parse_failure], [], [])

    body = root.find(_w('body'))
    if body is None:
        empty_note = make_item("EMPTY_BODY", "Document body is empty.", "info")
        return make_result(source, "docx-toc", [], [], [empty_note])

    para_tag = _w('p')
    paragraphs = [child for child in body if child.tag == para_tag]

    # Locate the TOC complex field, if any.
    toc_info = find_toc_field_boundaries_v2(root)
    has_toc = toc_info['has_toc']
    toc_separate_idx = toc_info['separate_para_idx']
    toc_end_idx = toc_info['end_para_idx']
    toc_entry_texts = toc_info['toc_entry_texts']

    # Fallback 1: a TOC expressed as a simple field.
    if not has_toc:
        has_toc = any(
            'TOC' in fld.get(_w('instr'), '').upper()
            for fld in root.findall('.//' + _w('fldSimple'))
        )

    # Fallback 2: a TOC anywhere in the tree (e.g. wrapped in an SDT).
    if not has_toc and docx_has_toc_field(root):
        has_toc = True

    # Every paragraph with a heading-like style, as (index, element, style).
    styled = ((idx, para, get_paragraph_style(para))
              for idx, para in enumerate(paragraphs))
    all_heading_paras = [entry for entry in styled
                         if is_any_heading_style(entry[2])]

    # Content headings are those after the TOC end (all of them if no end).
    if toc_end_idx is None:
        content_heading_paras = list(all_heading_paras)
    else:
        content_heading_paras = [entry for entry in all_heading_paras
                                 if entry[0] > toc_end_idx]

    # ---- CHECK 1: TOC_FIELD_MISSING ----
    heading_count = len(all_heading_paras)
    if not has_toc and heading_count >= 3:
        warnings.append(make_item(
            "TOC_FIELD_MISSING",
            f"Document has {heading_count} headings but no Table of Contents.",
            "warning"
        ))

    # ---- CHECK 2: TOC_PLACEHOLDER_EMPTY ----
    if (has_toc and toc_separate_idx is not None and toc_end_idx is not None
            and not check_toc_has_content(root, toc_separate_idx, toc_end_idx)):
        errors.append(make_item(
            "TOC_PLACEHOLDER_EMPTY",
            "TOC field exists but has no placeholder entries. Run add_toc_placeholders.py.",
            "error"
        ))

    # ---- CHECK 3: TOC_HEADING_STYLE ----
    # Every paragraph after the TOC is scanned (not only styled headings) so
    # that heading look-alikes the TOC cannot pick up are caught.
    if has_toc:
        first_content_idx = 0 if toc_end_idx is None else toc_end_idx + 1
        for para in paragraphs[first_content_idx:]:
            style = get_paragraph_style(para)
            if not paragraph_is_bold_large(para) or is_standard_heading_style(style):
                continue
            text = get_paragraph_text(para).strip()
            if not text:
                continue
            truncated = text[:50] + ('...' if len(text) > 50 else '')
            style_name = style if style else '(none)'
            errors.append(make_item(
                "TOC_HEADING_STYLE",
                f"Paragraph '{truncated}' uses custom style '{style_name}' "
                f"instead of HeadingLevel. TOC will not pick it up.",
                "error"
            ))

    # ---- CHECK 4: TOC_ENTRY_MISMATCH ----
    if toc_entry_texts and content_heading_paras:
        stripped_titles = (get_paragraph_text(para).strip()
                           for _, para, _ in content_heading_paras)
        heading_texts = [title for title in stripped_titles if title]
        if heading_texts:
            unmatched = sum(
                1 for title in heading_texts
                if not any(fuzzy_match(title, entry) for entry in toc_entry_texts)
            )
            match_ratio = (len(heading_texts) - unmatched) / len(heading_texts)
            if match_ratio < 0.5:
                errors.append(make_item(
                    "TOC_ENTRY_MISMATCH",
                    f"TOC placeholder entries don't match actual headings. "
                    f"{unmatched} of {len(heading_texts)} headings not found in TOC.",
                    "error"
                ))

    # ---- CHECK 5: TOC_NO_PAGEBREAK ----
    if toc_end_idx is not None:
        br_path = './/' + _w('br')
        rendered_break_path = './/' + _w('lastRenderedPageBreak')
        type_attr = _w('type')

        def _has_page_break(paragraph: ET.Element) -> bool:
            """True for an explicit page break or a rendered page-break mark."""
            explicit = any(br.get(type_attr) == 'page'
                           for br in paragraph.iterfind(br_path))
            return explicit or bool(paragraph.findall(rendered_break_path))

        # The TOC's closing paragraph plus up to two paragraphs after it.
        window_end = min(toc_end_idx + 3, len(paragraphs))
        window = paragraphs[toc_end_idx:window_end]
        if not any(_has_page_break(paragraph) for paragraph in window):
            warnings.append(make_item(
                "TOC_NO_PAGEBREAK",
                "No page break found after TOC. Content may run into the table of contents.",
                "warning"
            ))

    # ---- CHECK 6: TOC_HINT_STYLE ----
    for para in paragraphs:
        para_text = get_paragraph_text(para).strip()
        if not any(indicator in para_text for indicator in HINT_INDICATORS):
            continue

        # Each direct run that itself contains hint text must be gray AND small.
        properly_styled = True
        for run in para.findall(_w('r')):
            run_text = ''.join(t.text for t in run.findall(_w('t')) if t.text)
            if not any(ind in run_text for ind in HINT_INDICATORS):
                continue
            has_gray, has_small = check_run_hint_style(run)
            if not (has_gray and has_small):
                properly_styled = False
                break

        if not properly_styled:
            warnings.append(make_item(
                "TOC_HINT_STYLE",
                "TOC hint text found but not styled as gray/small. "
                "It may look like regular content.",
                "warning"
            ))
        break  # Only the first paragraph containing hint text is evaluated

    return make_result(source, "docx-toc", errors, warnings, info)


# ---------------------------------------------------------------------------
# check-pdf implementation
# ---------------------------------------------------------------------------
def check_pdf(pdf_path: str) -> Dict[str, Any]:
    """Run all PDF TOC validation checks.

    Opens the PDF with pdfplumber, runs the checks, and always closes the
    handle afterwards, including when a check raises unexpectedly.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        The result dict built by ``make_result`` with check type
        ``"pdf-toc"``. A missing pdfplumber installation or a PDF that
        cannot be opened is reported as a single error item.
    """
    source = os.path.basename(pdf_path)

    try:
        import pdfplumber
    except ImportError:
        return make_result(source, "pdf-toc",
                           [make_item("DEPENDENCY_MISSING",
                                      "pdfplumber is not installed. Run: pip install pdfplumber",
                                      "error")],
                           [], [])

    try:
        pdf = pdfplumber.open(pdf_path)
    except Exception as e:
        return make_result(source, "pdf-toc",
                           [make_item("PARSE_ERROR",
                                      f"Failed to open PDF: {str(e)[:100]}",
                                      "error")],
                           [], [])

    # Release the file handle on every exit path, not only the happy one.
    try:
        return _run_pdf_toc_checks(pdf, pdf_path, source)
    finally:
        pdf.close()


def _run_pdf_toc_checks(pdf: Any, pdf_path: str, source: str) -> Dict[str, Any]:
    """Run the individual TOC checks on an already-open pdfplumber PDF."""
    errors: List[Dict] = []
    warnings: List[Dict] = []
    info: List[Dict] = []

    total_pages = len(pdf.pages)
    if total_pages == 0:
        return make_result(source, "pdf-toc", [], [],
                           [make_item("EMPTY_PDF", "PDF has no pages.", "info")])

    # Extract text from the first 5 pages (or all of them if fewer).
    page_texts = {}
    for i in range(min(5, total_pages)):
        try:
            text = pdf.pages[i].extract_text() or ''
        except Exception:
            # Best effort: a page whose text cannot be extracted counts as blank.
            text = ''
        page_texts[i] = text

    # ---- CHECK 1: TOC_NOT_FOUND ----
    toc_pages = []
    for page_idx, text in page_texts.items():
        text_lower = text.lower()
        if any(kw.lower() in text_lower for kw in TOC_KEYWORDS):
            toc_pages.append(page_idx)

    if not toc_pages and total_pages > 5:
        warnings.append(make_item(
            "TOC_NOT_FOUND",
            f"No TOC detected in first 5 pages of a {total_pages}-page document.",
            "warning"
        ))

    # ---- CHECK 1b: TOC_ON_FIRST_PAGE ----
    # A TOC on page 1 likely means either there is no cover page before it,
    # or the TOC and the body content are not separated by a page break.
    if 0 in toc_pages and total_pages > 1:
        errors.append(make_item(
            "TOC_ON_FIRST_PAGE",
            "TOC detected on page 1. A cover page should precede the TOC "
            "(expected structure: Cover → TOC → Content). "
            "Either the cover page is missing or the TOC was not separated by a page break.",
            "error"
        ))

    # ---- CHECK 2 & 3 & 4: TOC entry analysis ----
    # A TOC entry is a line whose last token is a 1-4 digit page reference.
    entry_pattern = re.compile(r'^(.+?)\s+(\d{1,4})\s*$')

    toc_entries = []  # List of (title_text, page_number)
    for page_idx in toc_pages:
        for line in page_texts.get(page_idx, '').split('\n'):
            line = line.strip()
            if not line:
                continue
            m = entry_pattern.match(line)
            if not m:
                continue
            title = m.group(1).strip()
            page_num = int(m.group(2))
            if 1 <= page_num <= 9999 and title:
                toc_entries.append((title, page_num))

    if toc_pages:
        # CHECK 2: TOC_NO_ENTRIES
        if len(toc_entries) < 2:
            errors.append(make_item(
                "TOC_NO_ENTRIES",
                "TOC page found but contains fewer than 2 entries.",
                "error"
            ))

        if toc_entries:
            # CHECK 3: TOC_PAGES_INVALID
            for title, page_num in toc_entries:
                if page_num < 1 or page_num > total_pages:
                    truncated = title[:50] + ('...' if len(title) > 50 else '')
                    errors.append(make_item(
                        "TOC_PAGES_INVALID",
                        f"TOC entry '{truncated}' references page {page_num} "
                        f"but document only has {total_pages} pages.",
                        "error"
                    ))

            # CHECK 4: TOC_ALL_SAME_PAGE
            if len(toc_entries) >= 2:
                page_nums = set(pn for _, pn in toc_entries)
                if len(page_nums) == 1:
                    same_page = page_nums.pop()
                    errors.append(make_item(
                        "TOC_ALL_SAME_PAGE",
                        f"All TOC entries point to page {same_page}. "
                        f"This likely means placeholder page numbers were not updated.",
                        "error"
                    ))

    # ---- CHECK 5: TOC_LINKS_MISSING ----
    if toc_entries and toc_pages:
        if not _pdf_toc_has_links(pdf, pdf_path, toc_pages):
            warnings.append(make_item(
                "TOC_LINKS_MISSING",
                "TOC entries found but no clickable links detected.",
                "warning"
            ))

    return make_result(source, "pdf-toc", errors, warnings, info)


def _pdf_toc_has_links(pdf: Any, pdf_path: str, toc_pages: List[int]) -> bool:
    """Tell whether any TOC page carries annotations or hyperlinks.

    pdfplumber is consulted first. If it reports nothing, pikepdf (when
    installed) is used to look for a non-empty ``/Annots`` array. Both
    probes are best effort: a failure counts as "no links found".
    """
    for page_idx in toc_pages:
        try:
            page = pdf.pages[page_idx]
            if page.annots or page.hyperlinks:
                return True
        except Exception:
            # A page that cannot be inspected is skipped, not fatal.
            continue

    try:
        import pikepdf
    except ImportError:
        return False

    try:
        # The context manager closes the file even if a page lookup raises.
        with pikepdf.open(pdf_path) as pike_pdf:
            for page_idx in toc_pages:
                if page_idx >= len(pike_pdf.pages):
                    continue
                pike_page = pike_pdf.pages[page_idx]
                if '/Annots' in pike_page and len(pike_page['/Annots']) > 0:
                    return True
    except Exception:
        return False

    return False


# ---------------------------------------------------------------------------
# check-conversion implementation
# ---------------------------------------------------------------------------
def check_conversion(docx_path: str, pdf_path: str) -> Dict[str, Any]:
    """Run DOCX→PDF conversion TOC consistency checks.

    Three checks are performed on the pair of files:

    * ``CONV_TOC_LOST`` (error): the DOCX contains a TOC field, none of the
      first five PDF pages mentions a TOC keyword, and the PDF has more than
      five pages.
    * ``CONV_HINT_LEAKED`` (error): one of ``HINT_PHRASES`` appears anywhere in
      the extracted PDF text (only the first matching phrase is reported).
    * ``CONV_HEADING_DRIFT`` (warning): the number of heading-styled DOCX
      paragraphs and the number of "title ... page" lines on the PDF TOC pages
      differ by more than 30% of the DOCX heading count.

    Args:
        docx_path: Path to the source DOCX file.
        pdf_path: Path to the PDF converted from that DOCX.

    Returns:
        The result dict produced by ``make_result`` with check type
        ``"conversion-toc"``. Parse failures and a missing ``pdfplumber``
        are reported as a single error item instead of raising.
    """
    errors: List[Dict] = []
    warnings: List[Dict] = []
    info: List[Dict] = []
    source = f"{os.path.basename(docx_path)} → {os.path.basename(pdf_path)}"

    # Parse DOCX
    docx_root = parse_docx_xml(docx_path)
    if docx_root is None:
        return make_result(source, "conversion-toc",
                           [make_item("PARSE_ERROR",
                                      "Failed to parse source DOCX file.",
                                      "error")],
                           [], [])

    docx_has_toc = docx_has_toc_field(docx_root)

    # Parse PDF
    try:
        import pdfplumber
    except ImportError:
        return make_result(source, "conversion-toc",
                           [make_item("DEPENDENCY_MISSING",
                                      "pdfplumber is not installed.",
                                      "error")],
                           [], [])

    try:
        pdf = pdfplumber.open(pdf_path)
    except Exception as e:
        return make_result(source, "conversion-toc",
                           [make_item("PARSE_ERROR",
                                      f"Failed to open PDF: {str(e)[:100]}",
                                      "error")],
                           [], [])

    # The PDF handle is only needed for text extraction; release it in a
    # ``finally`` so it is closed even if an unexpected error propagates.
    page_texts: Dict[int, str] = {}
    try:
        total_pages = len(pdf.pages)
        for i in range(total_pages):
            try:
                page_texts[i] = pdf.pages[i].extract_text() or ''
            except Exception:
                # Best effort: a page that cannot be extracted counts as empty.
                page_texts[i] = ''
    finally:
        pdf.close()

    all_pdf_text = ''.join(text + '\n' for text in page_texts.values())

    # Find TOC pages in PDF (only the first five pages are considered)
    toc_pages = [
        i for i in range(min(5, total_pages))
        if any(kw.lower() in page_texts[i].lower() for kw in TOC_KEYWORDS)
    ]
    pdf_has_toc = len(toc_pages) > 0

    # ---- CHECK 1: CONV_TOC_LOST ----
    if docx_has_toc and not pdf_has_toc and total_pages > 5:
        errors.append(make_item(
            "CONV_TOC_LOST",
            "Source DOCX has TOC but converted PDF does not. "
            "TOC was lost during conversion.",
            "error"
        ))

    # ---- CHECK 2: CONV_HINT_LEAKED ----
    all_text_lower = all_pdf_text.lower()
    for phrase in HINT_PHRASES:
        idx = all_text_lower.find(phrase.lower())
        if idx == -1:
            continue
        # Report the text as it appears in the PDF (up to 60 chars)
        matched = all_pdf_text[idx:idx + len(phrase)]
        truncated = matched[:60] + ('...' if len(matched) > 60 else '')
        errors.append(make_item(
            "CONV_HINT_LEAKED",
            f"TOC hint text leaked into PDF: '{truncated}'. "
            f"Clean hints before conversion.",
            "error"
        ))
        break  # Report only the first match

    # ---- CHECK 3: CONV_HEADING_DRIFT ----
    # Count DOCX headings (direct body paragraphs with a heading style)
    body = docx_root.find(_w('body'))
    docx_heading_count = 0
    if body is not None:
        docx_heading_count = sum(
            1 for para in body
            if para.tag == _w('p')
            and is_any_heading_style(get_paragraph_style(para))
        )

    # Count PDF TOC entries: lines that end in a 1-4 digit, non-zero page number
    entry_pattern = re.compile(r'^(.+?)\s+(\d{1,4})\s*$')
    pdf_toc_entry_count = 0
    for page_idx in toc_pages:
        for line in page_texts.get(page_idx, '').split('\n'):
            line = line.strip()
            if not line:
                continue
            m = entry_pattern.match(line)
            if m and 1 <= int(m.group(2)) <= 9999:
                pdf_toc_entry_count += 1

    if docx_heading_count > 0 and pdf_toc_entry_count > 0:
        drift = abs(docx_heading_count - pdf_toc_entry_count)
        drift_pct = (drift / docx_heading_count) * 100
        if drift_pct > 30:
            warnings.append(make_item(
                "CONV_HEADING_DRIFT",
                f"DOCX has {docx_heading_count} headings but PDF TOC has "
                f"{pdf_toc_entry_count} entries ({drift_pct:.0f}% drift).",
                "warning"
            ))

    return make_result(source, "conversion-toc", errors, warnings, info)


# ---------------------------------------------------------------------------
# fix-docx implementation
# ---------------------------------------------------------------------------
def _find_toc_sdt_indices(body_elem) -> List[int]:
    """Find indices of SDT elements in body that contain TOC.

    Returns list of indices into body's direct children.
    """
    indices = []
    for idx, child in enumerate(body_elem):
        if child.tag == '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}sdt':
            # Check if this SDT contains TOC-related content
            for instr in child.findall('.//' + _w('instrText')):
                if instr.text and 'TOC' in instr.text.upper():
                    indices.append(idx)
                    break
            else:
                # Also check alias/tag
                sdtPr = child.find('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}sdtPr')
                if sdtPr is not None:
                    alias = sdtPr.find('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}alias')
                    if alias is not None and alias.get(_w('val'), '').upper() in ('TOC', '目录'):
                        indices.append(idx)
                        continue
                    docPartObj = sdtPr.find('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}docPartObj')
                    if docPartObj is not None:
                        docPartGallery = docPartObj.find('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}docPartGallery')
                        if docPartGallery is not None and 'toc' in docPartGallery.get(_w('val'), '').lower():
                            indices.append(idx)
    return indices


def _find_toc_field_para_range(body_elem) -> Tuple[Optional[int], Optional[int]]:
    """Find the range of paragraph indices that make up a TOC field code block.

    Returns (start_idx, end_idx) inclusive, or (None, None) if not found.
    These are indices into body's direct children.
    """
    children = list(body_elem)
    in_toc = False
    toc_depth = None
    depth = 0
    start_idx = None
    end_idx = None

    for ci, child in enumerate(children):
        if child.tag != _w('p'):
            continue
        for run in child.findall('.//' + _w('r')):
            instr = run.find(_w('instrText'))
            if instr is not None and instr.text and 'TOC' in instr.text.upper():
                in_toc = True

            fldChar = run.find(_w('fldChar'))
            if fldChar is not None:
                fld_type = fldChar.get(_w('fldCharType'), '')
                if fld_type == 'begin':
                    depth += 1
                    if in_toc and start_idx is None:
                        # The begin was before instrText; look back
                        start_idx = ci
                        toc_depth = depth
                    elif not in_toc and start_idx is None:
                        # tentative; may become TOC if instrText follows
                        pass
                elif fld_type == 'end':
                    if in_toc and depth == toc_depth:
                        end_idx = ci
                        in_toc = False
                    depth = max(0, depth - 1)

    # If we found instrText but start_idx wasn't set (begin was in the same para before instrText)
    # Re-scan more carefully
    if in_toc and start_idx is None:
        # Fall back to find_toc_field_boundaries_v2 style
        pass

    return start_idx, end_idx


def fix_docx(docx_path: str, output_path: Optional[str] = None) -> Dict[str, Any]:
    """Detect TOC issues in a DOCX and fix them, outputting a new DOCX file.

    The document is first analysed with the low-level XML helpers. A new TOC
    is generated (``action == "fixed"``) when one of these holds; the value in
    quotes is reported as ``reason``:

    * ``"no_toc"``: no TOC exists and the document has at least 3 headings;
    * ``"empty_toc"``: a TOC exists but carries no text;
    * ``"language_mismatch"``: TOC and heading languages differ and there
      are at least 3 headings;
    * ``"count_mismatch"``: TOC entry count and heading count differ by more
      than 50% of the heading count.

    Otherwise nothing is written and the action is ``"no_toc_needed"`` or
    ``"skipped"``.

    When fixing, any existing TOC (SDT-wrapped, or else a bare TOC field) is
    removed and replaced by an SDT holding a TOC field whose entries are
    HYPERLINK + PAGEREF paragraphs for headings of level 1-3, followed by a
    page break. Page numbers are rough estimates (about 40 body elements per
    page, offset by 3 pages). Each listed heading paragraph receives a
    matching bookmark.

    Args:
        docx_path: Path to the DOCX to inspect.
        output_path: Destination for the fixed DOCX. Defaults to
            ``<docx_path stem>_fixed<ext>``.

    Returns:
        Dict with keys ``pass``, ``source``, ``check_type``, ``action``,
        ``reason``, ``heading_count``, ``toc_entries_before``,
        ``toc_entries_after``, ``output``, ``errors``, ``warnings``, ``info``.

    Raises:
        ImportError: If python-docx is not installed.
    """
    from docx import Document as DocxDocument
    from docx.oxml import OxmlElement
    from docx.oxml.ns import qn

    xml_space = '{http://www.w3.org/XML/1998/namespace}space'
    # Table/figure captions styled as headings (e.g. "表 1：xxx", "图 2：xxx")
    caption_re = re.compile(r'^[表图]\s*\d')

    info_list: List[Any] = []
    source = os.path.basename(docx_path)

    if output_path is None:
        base, ext = os.path.splitext(docx_path)
        output_path = base + '_fixed' + ext

    def _result(passed, action, reason, n_headings, before, after,
                errors=None, info=None):
        """Assemble the fix-docx result dict with a stable key order."""
        return {
            "pass": passed, "source": source, "check_type": "fix-docx",
            "action": action, "reason": reason,
            "heading_count": n_headings,
            "toc_entries_before": before, "toc_entries_after": after,
            "output": output_path,
            "errors": errors if errors is not None else [],
            "warnings": [],
            "info": info if info is not None else [],
        }

    # ---- Small element builders (python-docx / lxml) ----

    def _el(tag, *children, **attrs):
        """Create an element, set ``w:``-namespaced attributes, append children."""
        node = OxmlElement(tag)
        for attr_name, attr_value in attrs.items():
            node.set(qn('w:' + attr_name), attr_value)
        for sub in children:
            node.append(sub)
        return node

    def _text_el(tag, text, preserve=True):
        """Create a text-bearing element, optionally with xml:space="preserve"."""
        node = OxmlElement(tag)
        if preserve:
            node.set(xml_space, 'preserve')
        node.text = text
        return node

    def _fld_run(kind):
        """Create a run holding a single fldChar of the given type."""
        return _el('w:r', _el('w:fldChar', fldCharType=kind))

    def _style_of(para):
        """Return the paragraph's pStyle value, or None when it has none."""
        pPr = para.find(qn('w:pPr'))
        if pPr is None:
            return None
        pStyle = pPr.find(qn('w:pStyle'))
        if pStyle is None:
            return None
        return pStyle.get(qn('w:val'))

    def _entry_run_props(level, as_hyperlink):
        """Run properties for a TOC entry: 14pt bold / 12pt / 11pt by level."""
        rPr = OxmlElement('w:rPr')
        if as_hyperlink:
            rPr.append(_el('w:rStyle', val='Hyperlink'))
        if level == 1:
            rPr.append(OxmlElement('w:b'))
        half_points = {1: '28', 2: '24'}.get(level, '22')
        rPr.append(_el('w:sz', val=half_points))
        rPr.append(_el('w:szCs', val=half_points))
        return rPr

    def _make_toc_paragraph(text, level, page_num='1', bookmark_name=''):
        """Create a TOC entry paragraph with HYPERLINK + PAGEREF for clickable links."""
        # Children follow the schema order of w:pPr: pStyle, tabs, spacing, ind.
        pPr = _el(
            'w:pPr',
            _el('w:pStyle', val=f'TOC{level}' if level <= 3 else 'TOC3'),
            # Right-aligned tab stop with dot leader at 9026 twips (~15.9cm)
            _el('w:tabs', _el('w:tab', val='right', leader='dot', pos='9026')),
            _el('w:spacing', before='120', after='60'),
        )
        if level >= 2:
            pPr.append(_el('w:ind', left=str((level - 1) * 420)))

        title_run = _el('w:r', _entry_run_props(level, bool(bookmark_name)),
                        _text_el('w:t', text))
        tab_run = _el('w:r', _el('w:tab'))
        page_text = _text_el('w:t', str(page_num), preserve=False)

        if not bookmark_name:
            # Fallback: plain text entry without hyperlink or field
            return _el('w:p', pPr, title_run, tab_run, _el('w:r', page_text))

        # Hyperlink to the heading bookmark wrapping: text, tab, PAGEREF field
        link = _el(
            'w:hyperlink',
            title_run,
            tab_run,
            _fld_run('begin'),
            _el('w:r', _text_el('w:instrText', f' PAGEREF {bookmark_name} \\h ')),
            _fld_run('separate'),
            _el('w:r', _el('w:rPr', _el('w:noProof')), page_text),
            _fld_run('end'),
            anchor=bookmark_name, history='1',
        )
        return _el('w:p', pPr, link)

    def _make_toc_title(title_text):
        """Create the TOC title paragraph (centered, 18pt, bold)."""
        def _bold_18pt():
            # 18pt = 36 half-points
            return _el('w:rPr', _el('w:b'), _el('w:sz', val='36'),
                       _el('w:szCs', val='36'))

        # Children follow the schema order of w:pPr: spacing, jc, rPr.
        pPr = _el(
            'w:pPr',
            _el('w:spacing', after='200', line='360', lineRule='auto'),
            _el('w:jc', val='center'),
            _bold_18pt(),
        )
        return _el('w:p', pPr,
                   _el('w:r', _bold_18pt(), _text_el('w:t', title_text)))

    def _is_toc_sdt(sdt):
        """Tell whether an SDT holds a TOC (instruction text, alias or gallery)."""
        for instr in sdt.findall('.//' + qn('w:instrText')):
            if instr.text and 'TOC' in instr.text.upper():
                return True
        sdtPr = sdt.find(qn('w:sdtPr'))
        if sdtPr is None:
            return False
        alias = sdtPr.find(qn('w:alias'))
        if alias is not None and alias.get(qn('w:val'), '').upper() in ('TOC', '目录'):
            return True
        docPartObj = sdtPr.find(qn('w:docPartObj'))
        if docPartObj is not None:
            dpg = docPartObj.find(qn('w:docPartGallery'))
            if dpg is not None and 'toc' in dpg.get(qn('w:val'), '').lower():
                return True
        return False

    def _toc_field_span(children):
        """Return (first, last) indices of the first TOC complex field, or None.

        The paragraph of each open ``begin`` is stacked so the span starts at
        the TOC field's own ``begin`` (which precedes its instrText) and ends
        at the ``end`` closing that same nesting level.
        """
        fld_char_tag = qn('w:fldChar')
        instr_tag = qn('w:instrText')
        open_starts = []
        span_start = None
        toc_depth = None
        for ci, child in enumerate(children):
            if child.tag != qn('w:p'):
                continue
            for run in child.findall('.//' + qn('w:r')):
                for node in run:
                    if node.tag == fld_char_tag:
                        fld_type = node.get(qn('w:fldCharType'), '')
                        if fld_type == 'begin':
                            open_starts.append(ci)
                        elif fld_type == 'end':
                            if toc_depth is not None and len(open_starts) == toc_depth:
                                return span_start, ci
                            if open_starts:
                                open_starts.pop()
                    elif node.tag == instr_tag:
                        if (toc_depth is None and open_starts
                                and node.text and 'TOC' in node.text.upper()):
                            span_start = open_starts[-1]
                            toc_depth = len(open_starts)
        return None

    # ---- Analysis on the low-level XML ----
    root = parse_docx_xml(docx_path)
    if root is None:
        return _result(False, "failed", "Failed to parse DOCX file", 0, 0, 0,
                       errors=[make_item("PARSE_ERROR", "Failed to parse DOCX", "error")])

    body = root.find(_w('body'))
    if body is None:
        return _result(False, "failed", "Document body is empty", 0, 0, 0,
                       errors=[make_item("EMPTY_BODY", "Document body is empty", "error")])

    body_children = list(body)
    heading_texts = []
    for child in body_children:
        if child.tag != _w('p'):
            continue
        style = get_paragraph_style(child)
        if not is_any_heading_style(style):
            continue
        text = get_paragraph_text(child).strip()
        level = _get_heading_level(style)
        if text and level > 0 and not caption_re.match(text):
            heading_texts.append(text)
    heading_count = len(heading_texts)

    toc_info = find_toc_field_boundaries_v2(root)
    has_toc = toc_info['has_toc']
    toc_entry_texts = toc_info['toc_entry_texts']
    toc_entries_before = len(toc_entry_texts)

    # Also check for SDT-based TOC; use its text when the field scan found none
    sdt_indices = _find_toc_sdt_indices(body)
    has_sdt_toc = len(sdt_indices) > 0
    if has_sdt_toc and not toc_entry_texts:
        for si in sdt_indices:
            for t in body_children[si].findall('.//' + _w('t')):
                if t.text and t.text.strip():
                    toc_entry_texts.append(t.text.strip())
        toc_entries_before = len(toc_entry_texts)

    # ---- Decision logic ----
    if not (has_toc or has_sdt_toc):
        # Case 1: No TOC exists
        if heading_count < 3:
            return _result(
                True, "no_toc_needed",
                f"Document has only {heading_count} headings, no TOC needed",
                heading_count, 0, 0,
                info=[f"Document has {heading_count} headings (< 3), no TOC needed"])
        info_list.append(f"No TOC found, generating new TOC with {heading_count} entries")
        fix_reason = "no_toc"
    else:
        # Case 2 & 3: TOC exists, check if it's stale/placeholder
        fix_reason = ""
        non_empty_entries = [t for t in toc_entry_texts if t.strip()]
        if not non_empty_entries:
            fix_reason = "empty_toc"
            info_list.append("TOC exists but has no text content (uninitialized)")
        else:
            heading_lang = _detect_language(heading_texts)
            toc_lang = _detect_language(non_empty_entries)
            if heading_lang != toc_lang and heading_count >= 3:
                fix_reason = "language_mismatch"
                info_list.append(
                    f"Deleted stale TOC with {toc_entries_before} "
                    f"{'English' if toc_lang == 'en' else 'Chinese'} placeholder entries"
                )
            elif heading_count > 0:
                # Count mismatch check (>50% difference)
                diff = abs(heading_count - toc_entries_before)
                if diff / heading_count > 0.5:
                    fix_reason = "count_mismatch"
                    info_list.append(
                        f"TOC has {toc_entries_before} entries but document has "
                        f"{heading_count} headings (>50% drift)"
                    )

        if not fix_reason:
            return _result(
                True, "skipped", "TOC appears to be up-to-date",
                heading_count, toc_entries_before, toc_entries_before,
                info=["TOC entries and headings are consistent, no fix needed"])

    # ---- Perform the fix using python-docx ----
    try:
        doc = DocxDocument(docx_path)
    except Exception as e:
        detail = str(e)[:200]
        return _result(
            False, "failed", f"Failed to open DOCX with python-docx: {detail}",
            heading_count, toc_entries_before, 0,
            errors=[make_item("OPEN_ERROR", f"Failed to open: {detail}", "error")])

    doc_body = doc.element.body

    # Determine language for TOC title
    content_lang = _detect_language(heading_texts)
    toc_title = "目  录" if content_lang == 'zh' else "Table of Contents"

    # Step 1: Remove existing TOC (SDT or field code range)
    insert_before_idx = None
    sdt_removed = False
    for child in list(doc_body):
        if child.tag == qn('w:sdt') and _is_toc_sdt(child):
            insert_before_idx = list(doc_body).index(child)
            doc_body.remove(child)
            sdt_removed = True

    if not sdt_removed and has_toc:
        current = list(doc_body)
        span = _toc_field_span(current)
        if span is not None:
            first, last = span
            insert_before_idx = first
            for stale in current[first:last + 1]:
                doc_body.remove(stale)

    # Step 2: Collect headings from the body as it is now, keeping the
    # paragraph elements themselves so later insertions cannot invalidate them.
    doc_headings = []  # (body index, text, level, paragraph element)
    first_heading_idx = None
    for ci, child in enumerate(list(doc_body)):
        if child.tag != qn('w:p'):
            continue
        style_val = _style_of(child)
        if style_val is None or not is_any_heading_style(style_val):
            continue
        if first_heading_idx is None:
            first_heading_idx = ci
        text = ''.join(t.text for t in child.findall('.//' + qn('w:t')) if t.text).strip()
        level = _get_heading_level(style_val)
        if text and level > 0 and not caption_re.match(text):
            doc_headings.append((ci, text, level, child))

    if not doc_headings:
        # Nothing has been saved, so the in-memory removal above has no effect.
        return _result(
            True, "no_toc_needed", "No headings found in document after re-parse",
            0, toc_entries_before, 0,
            info=["No headings found, skipping TOC generation"])

    # Step 3: Determine insertion point (before the first heading-styled
    # paragraph when no existing TOC was removed)
    if insert_before_idx is None:
        insert_before_idx = first_heading_idx if first_heading_idx is not None else 0

    # Step 4: Pick bookmark ids/names that do not clash with existing ones
    existing_names = set()
    max_bm_id = 9  # generated ids start at 10 when the document has none
    for bm in doc_body.iter(qn('w:bookmarkStart')):
        bm_name = bm.get(qn('w:name'))
        if bm_name:
            existing_names.add(bm_name)
        try:
            max_bm_id = max(max_bm_id, int(bm.get(qn('w:id'))))
        except (TypeError, ValueError):
            pass

    toc_entries = [h for h in doc_headings if h[2] <= 3]
    bookmark_names = []
    candidate = 100000
    while len(bookmark_names) < len(toc_entries):
        name = f'_Toc{candidate}'
        if name not in existing_names:
            bookmark_names.append(name)
        candidate += 1

    # Step 5: Build the TOC as an SDT wrapping a TOC field
    # TOC itself takes ~2 pages; cover takes ~1 page
    toc_offset = 3
    # Rough heuristic: ~40 body elements per page for typical documents
    doc_body_count = len(list(doc_body))
    estimated_total_pages = max(toc_offset + 1, doc_body_count // 40 + toc_offset)

    sdt_content = _el('w:sdtContent', _make_toc_title(toc_title))
    sdt_content.append(_el(
        'w:p',
        _fld_run('begin'),
        _el('w:r', _text_el('w:instrText', ' TOC \\o "1-3" \\h \\z \\u ')),
        _fld_run('separate'),
    ))
    for seq, (h_idx, h_text, h_level, _para) in enumerate(toc_entries):
        if doc_body_count > 0:
            position_ratio = h_idx / doc_body_count
            est_page = toc_offset + max(
                0, int(position_ratio * (estimated_total_pages - toc_offset)))
        else:
            est_page = toc_offset + seq
        est_page = max(toc_offset, est_page)  # never less than toc_offset
        sdt_content.append(
            _make_toc_paragraph(h_text, h_level, str(est_page), bookmark_names[seq]))
    sdt_content.append(_el('w:p', _fld_run('end')))

    toc_sdt = _el(
        'w:sdt',
        _el('w:sdtPr',
            _el('w:alias', val='TOC'),
            _el('w:docPartObj',
                _el('w:docPartGallery', val='Table of Contents'),
                _el('w:docPartUnique'))),
        sdt_content,
    )
    page_break = _el('w:p', _el('w:r', _el('w:br', type='page')))

    # Step 6: Insert TOC + page break at the determined position
    ref_children = list(doc_body)
    if insert_before_idx >= len(ref_children):
        doc_body.append(toc_sdt)
        doc_body.append(page_break)
    else:
        ref_element = ref_children[insert_before_idx]
        ref_element.addprevious(toc_sdt)
        ref_element.addprevious(page_break)

    # Step 7: Bookmark the heading paragraphs so PAGEREF and HYPERLINK resolve.
    # The stored elements are used directly; body indices shifted in step 6.
    next_bm_id = max_bm_id + 1
    for (_idx, _text, _level, heading_para), bm_name in zip(toc_entries, bookmark_names):
        bm_start = _el('w:bookmarkStart', id=str(next_bm_id), name=bm_name)
        bm_end = _el('w:bookmarkEnd', id=str(next_bm_id))
        # w:pPr must stay the first child of the paragraph
        pPr = heading_para.find(qn('w:pPr'))
        if pPr is not None:
            pPr.addnext(bm_start)
        else:
            heading_para.insert(0, bm_start)
        heading_para.append(bm_end)
        next_bm_id += 1

    # Save
    try:
        doc.save(output_path)
    except Exception as e:
        detail = str(e)[:200]
        return _result(
            False, "failed", f"Failed to save: {detail}",
            heading_count, toc_entries_before, 0,
            errors=[make_item("SAVE_ERROR", f"Failed to save: {detail}", "error")])

    toc_entries_after = len(toc_entries)
    info_list.append(f"Generated new TOC with {toc_entries_after} entries")

    return _result(True, "fixed", fix_reason, heading_count,
                   toc_entries_before, toc_entries_after, info=info_list)


# ---------------------------------------------------------------------------
# fix-docx accurate page numbers (two-pass) implementation
# ---------------------------------------------------------------------------
def fix_docx_accurate_pages(fixed_docx_path: str, pass1_pdf_path: str, output_path: Optional[str] = None) -> Dict[str, Any]:
    """Update TOC page numbers in a fix-docx output using page positions from a PDF.

    Two-pass approach:
      Pass 1: Convert the DOCX (without TOC fix or with estimated pages) to PDF
      Pass 2: Read actual heading positions from PDF, update PAGEREF placeholder text

    Headings are located in the PDF by exact substring match of the heading
    text against each page's extracted text; the first matching page wins.
    Headings that share the same text therefore share one page number, and
    headings whose text is not found verbatim are left untouched.

    Args:
        fixed_docx_path: Path to the DOCX after fix-docx (has PAGEREF fields with estimated pages)
        pass1_pdf_path: Path to a PDF converted from the ORIGINAL docx (without TOC)
        output_path: Where to save the updated DOCX (defaults to overwrite fixed_docx_path)

    Returns:
        A result dict. On success it has "pass": True plus "source",
        "check_type" ("fix-pages"), "pages_updated", "total_headings",
        "toc_pages_estimated", "offset_applied" and "output". When the DOCX
        has no usable headings it is {"pass": True, "source", "info"} and no
        file is written. When pdfplumber, python-docx or lxml cannot be
        imported it is {"pass": False, "error": ...}.
    """
    import zipfile as zf_mod

    try:
        import pdfplumber
    except ImportError:
        return {"pass": False, "error": "pdfplumber not installed — cannot extract page positions"}

    try:
        from docx import Document
    except ImportError:
        return {"pass": False, "error": "python-docx not installed"}

    try:
        from lxml import etree
    except ImportError:
        return {"pass": False, "error": "lxml not installed"}

    if output_path is None:
        output_path = fixed_docx_path

    source = os.path.basename(fixed_docx_path)

    # --- Step 1: Extract headings from the fixed DOCX ---
    doc = Document(fixed_docx_path)
    headings = []
    # Paragraphs starting with 表/图 followed by a digit are table/figure
    # captions that happen to use a heading style; they are not TOC entries.
    caption_pattern = re.compile(r'^[表图]\s*\d')
    for p in doc.paragraphs:
        style_name = p.style.name if p.style else ''
        m = re.match(r'Heading\s*(\d+)', style_name)
        if not m:
            continue
        text = p.text.strip()
        if text and not caption_pattern.match(text):
            headings.append({'text': text, 'level': int(m.group(1))})

    if not headings:
        return {"pass": True, "source": source, "info": "No headings found, nothing to update"}

    # --- Step 2: Find actual page positions in pass1 PDF ---
    # The context manager guarantees the PDF handle is released even if text
    # extraction raises on a malformed page.
    with pdfplumber.open(pass1_pdf_path) as pdf:
        page_texts = [page.extract_text() or '' for page in pdf.pages]

    heading_pages_pass1: Dict[str, int] = {}
    for h in headings:
        for page_num, pt in enumerate(page_texts):
            if h['text'] in pt:
                heading_pages_pass1[h['text']] = page_num + 1  # 1-indexed
                break

    # --- Step 3: Calculate offset ---
    # The offset is the number of pages the inserted TOC is expected to add
    # in front of the body, which depends on entry count and formatting.
    toc_entry_count = sum(1 for h in headings if h['level'] <= 3)

    # Estimate: ~15 entries per page for CJK text with leader dots
    toc_pages = max(1, (toc_entry_count + 14) // 15)

    # Guess whether pass1 already contains TOC space by looking at whether
    # the first heading is on page 1-2 (no TOC) or later (has TOC).
    first_heading_page = min(heading_pages_pass1.values()) if heading_pages_pass1 else 1

    if first_heading_page <= 2:
        # Pass1 has no significant TOC content, so we need full offset
        offset = toc_pages + 1  # +1 for page break after TOC
    else:
        # Pass1 already has some TOC pages, smaller offset needed
        offset = max(0, toc_pages - (first_heading_page - 2))

    heading_page_map: Dict[str, int] = {
        h_text: orig_page + offset
        for h_text, orig_page in heading_pages_pass1.items()
    }

    # --- Step 4: Update PAGEREF placeholder text in the DOCX XML ---
    # Read every package member up front: this keeps the original entry order
    # for the rewrite and makes in-place overwrite (output == input) safe,
    # because the input is fully consumed before the output is opened.
    with zf_mod.ZipFile(fixed_docx_path, 'r') as zf:
        doc_xml = zf.read('word/document.xml')
        entries = [(info.filename, zf.read(info))
                   for info in zf.infolist() if not info.is_dir()]

    root = etree.fromstring(doc_xml)
    w_ns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    fld_tag = f'{{{w_ns}}}fldChar'
    fld_type_attr = f'{{{w_ns}}}fldCharType'
    instr_tag = f'{{{w_ns}}}instrText'
    text_tag = f'{{{w_ns}}}t'

    all_runs = root.findall(f'.//{{{w_ns}}}r')
    updates = 0
    for i, r in enumerate(all_runs):
        # A PAGEREF field is laid out as consecutive runs:
        #   begin -> instrText("PAGEREF ...") -> ... -> separate -> cached page text
        fld = r.find(fld_tag)
        if fld is None or fld.get(fld_type_attr) != 'begin':
            continue
        if i + 1 >= len(all_runs):
            continue
        instr_t = all_runs[i + 1].find(instr_tag)
        if instr_t is None or not instr_t.text or 'PAGEREF' not in instr_t.text:
            continue

        # Parent of the field's first run; presumably the w:hyperlink that
        # wraps the whole TOC entry (heading text + page field) — its text
        # nodes are used to recover the heading text.
        hyperlink = r.getparent()
        if hyperlink is None:
            continue
        text_runs = hyperlink.findall(f'.//{text_tag}')

        # Look for the 'separate' marker within the next few runs; the run
        # right after it carries the cached page number.
        for j in range(i + 2, min(i + 5, len(all_runs))):
            sep_fld = all_runs[j].find(fld_tag)
            if sep_fld is None or sep_fld.get(fld_type_attr) != 'separate':
                continue
            if j + 1 < len(all_runs):
                page_t = all_runs[j + 1].find(text_tag)
                if page_t is not None:
                    # Heading text = every text node in the entry except the
                    # page-number node itself.
                    heading_text = ''.join(
                        tr.text for tr in text_runs
                        if tr.text and tr is not page_t
                    ).strip()

                    correct_page = heading_page_map.get(heading_text)
                    if correct_page:
                        page_t.text = str(correct_page)
                        updates += 1
            # Only the first 'separate' belongs to this field.
            break

    # --- Step 5: Save updated DOCX ---
    updated_xml = etree.tostring(root, xml_declaration=True, encoding='UTF-8', standalone=True)
    with zf_mod.ZipFile(output_path, 'w', zf_mod.ZIP_DEFLATED) as zf:
        for name, data in entries:
            zf.writestr(name, updated_xml if name == 'word/document.xml' else data)

    return {
        "pass": True,
        "source": source,
        "check_type": "fix-pages",
        "pages_updated": updates,
        "total_headings": len(headings),
        "toc_pages_estimated": toc_pages,
        "offset_applied": offset,
        "output": output_path,
    }


def print_usage():
    """Write the command-line synopsis for every sub-command to stderr."""
    usage_lines = (
        "Usage:",
        "  toc_validate.py check-docx <file.docx>",
        "  toc_validate.py check-pdf  <file.pdf>",
        "  toc_validate.py check-conversion <input.docx> <output.pdf>",
        "  toc_validate.py fix-docx <input.docx> [-o output.docx]",
        "  toc_validate.py fix-pages <fixed.docx> <pass1.pdf> [-o output.docx]",
        "",
        "fix-pages: 2-pass page number correction. Requires a PDF converted",
        "           from the ORIGINAL docx (without TOC) as reference.",
    )
    # One write; print() supplies the final newline so the emitted text is
    # the same as printing each line separately.
    print("\n".join(usage_lines), file=sys.stderr)


def main():
    """CLI entry point."""
    if len(sys.argv) < 2:
        print_usage()
        sys.exit(2)

    command = sys.argv[1].lower()

    try:
        if command == 'check-docx':
            if len(sys.argv) < 3:
                print("Error: Missing DOCX file path.", file=sys.stderr)
                print_usage()
                sys.exit(2)
            docx_path = sys.argv[2]
            if not os.path.isfile(docx_path):
                print(f"Error: File not found: {docx_path}", file=sys.stderr)
                sys.exit(2)
            result = check_docx(docx_path)

        elif command == 'check-pdf':
            if len(sys.argv) < 3:
                print("Error: Missing PDF file path.", file=sys.stderr)
                print_usage()
                sys.exit(2)
            pdf_path = sys.argv[2]
            if not os.path.isfile(pdf_path):
                print(f"Error: File not found: {pdf_path}", file=sys.stderr)
                sys.exit(2)
            result = check_pdf(pdf_path)

        elif command == 'check-conversion':
            if len(sys.argv) < 4:
                print("Error: Missing file paths. Need both DOCX and PDF.",
                      file=sys.stderr)
                print_usage()
                sys.exit(2)
            docx_path = sys.argv[2]
            pdf_path = sys.argv[3]
            if not os.path.isfile(docx_path):
                print(f"Error: File not found: {docx_path}", file=sys.stderr)
                sys.exit(2)
            if not os.path.isfile(pdf_path):
                print(f"Error: File not found: {pdf_path}", file=sys.stderr)
                sys.exit(2)
            result = check_conversion(docx_path, pdf_path)

        elif command == 'fix-docx':
            if len(sys.argv) < 3:
                print("Error: Missing DOCX file path.", file=sys.stderr)
                print_usage()
                sys.exit(2)
            docx_path = sys.argv[2]
            if not os.path.isfile(docx_path):
                print(f"Error: File not found: {docx_path}", file=sys.stderr)
                sys.exit(2)
            # Parse optional -o flag
            output_path = None
            if '-o' in sys.argv:
                o_idx = sys.argv.index('-o')
                if o_idx + 1 < len(sys.argv):
                    output_path = sys.argv[o_idx + 1]
                else:
                    print("Error: -o flag requires an output path.",
                          file=sys.stderr)
                    sys.exit(2)
            result = fix_docx(docx_path, output_path)

        elif command == 'fix-pages':
            if len(sys.argv) < 4:
                print("Error: Need both fixed DOCX and pass1 PDF paths.", file=sys.stderr)
                print_usage()
                sys.exit(2)
            fixed_docx = sys.argv[2]
            pass1_pdf = sys.argv[3]
            if not os.path.isfile(fixed_docx):
                print(f"Error: File not found: {fixed_docx}", file=sys.stderr)
                sys.exit(2)
            if not os.path.isfile(pass1_pdf):
                print(f"Error: File not found: {pass1_pdf}", file=sys.stderr)
                sys.exit(2)
            output_path = None
            if '-o' in sys.argv:
                o_idx = sys.argv.index('-o')
                if o_idx + 1 < len(sys.argv):
                    output_path = sys.argv[o_idx + 1]
            result = fix_docx_accurate_pages(fixed_docx, pass1_pdf, output_path)

        else:
            print(f"Error: Unknown command '{command}'", file=sys.stderr)
            print_usage()
            sys.exit(2)

        # Output JSON to stdout
        print(json.dumps(result, ensure_ascii=False, indent=2))

        # Exit code: 0=pass, 1=fail
        sys.exit(0 if result['pass'] else 1)

    except Exception as e:
        # Unexpected error — output JSON error and exit 2
        error_result = {
            "pass": False,
            "source": sys.argv[2] if len(sys.argv) > 2 else "unknown",
            "check_type": command.replace('check-', '') + '-toc'
                          if command.startswith('check-') else 'unknown',
            "errors": [make_item("SCRIPT_ERROR",
                                 f"Unexpected error: {str(e)[:200]}",
                                 "error")],
            "warnings": [],
            "info": [],
        }
        print(json.dumps(error_result, ensure_ascii=False, indent=2))
        sys.exit(2)


if __name__ == '__main__':
    main()