mantle-ai-trader/skills/docx/scripts/add_toc_placeholders.py

#!/usr/bin/env python3
"""
Add placeholder entries to Table of Contents in a DOCX file.

This script adds placeholder TOC entries between the 'separate' and 'end'
field characters, so users see some content on first open instead of an empty TOC.
The original file is replaced with the modified version.

Usage:
    python add_toc_placeholders.py <docx_file>              # auto-extract headings (default)
    python add_toc_placeholders.py <docx_file> --auto       # explicit auto mode
    python add_toc_placeholders.py <docx_file> --entries <entries_json>

    entries_json format: JSON string with array of objects:
    [
        {"level": 1, "text": "Chapter 1 Overview", "page": "1"},
        {"level": 2, "text": "Section 1.1 Details", "page": "1"}
    ]

    Default behavior (no flags): auto-extracts Heading 1-3 from the document.
    Filters out table/figure captions (e.g. "表 1：xxx", "图 2：xxx").

Example:
    python add_toc_placeholders.py document.docx
    python add_toc_placeholders.py document.docx --auto
    python add_toc_placeholders.py document.docx --entries '[{"level":1,"text":"Introduction","page":"1"}]'
"""

import argparse
import html
import json
import re
import shutil
import sys
import tempfile
import zipfile
from pathlib import Path


def _extract_headings_from_docx(docx_path: str, max_level: int = 3) -> list:
    """Extract headings from a DOCX file for auto-mode TOC generation.

    Args:
        docx_path: Path to DOCX file
        max_level: Maximum heading level to include (default 3)

    Returns:
        List of dicts with 'level', 'text', 'page' keys
    """
    from docx import Document

    doc = Document(docx_path)
    entries = []
    page_estimate = 1

    # Pattern to filter out table/figure captions styled as headings
    caption_pattern = re.compile(r'^[表图]\s*\d')

    for i, para in enumerate(doc.paragraphs):
        style_name = para.style.name if para.style else ''
        if not style_name.startswith('Heading'):
            continue
        m = re.search(r'(\d+)', style_name)
        if not m:
            continue
        level = int(m.group(1))
        if level > max_level:
            continue
        text = para.text.strip()
        if not text:
            continue
        # Filter table/figure captions
        if caption_pattern.match(text):
            continue

        # Rough page estimate: increment every ~8 headings
        page_estimate = max(1, 1 + i // 8)
        entries.append({"level": level, "text": text, "page": str(page_estimate)})

    return entries


def add_toc_placeholders(docx_path: str, entries: list = None) -> None:
    """Add placeholder TOC entries to a DOCX file (in-place replacement).

    Args:
        docx_path: Path to DOCX file (will be modified in-place)
        entries: Optional list of placeholder entries. Each entry should be a dict
                 with 'level' (1-3), 'text', and 'page' keys.
    """
    docx_path = Path(docx_path)

    # Create temp directory for extraction
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)
        extracted_dir = temp_path / "extracted"
        temp_output = temp_path / "output.docx"

        # Extract DOCX
        with zipfile.ZipFile(docx_path, 'r') as zip_ref:
            zip_ref.extractall(extracted_dir)

        # Ensure TOC styles exist in styles.xml
        styles_xml_path = extracted_dir / "word" / "styles.xml"
        toc_style_mapping = _ensure_toc_styles(styles_xml_path)
        print(f"TOC style mapping: {toc_style_mapping}")

        # Fix settings.xml: ensure updateFields has val="true"
        settings_xml_path = extracted_dir / "word" / "settings.xml"
        _fix_update_fields(settings_xml_path)

        # Fix Heading styles: ensure outlineLvl is set (required for TOC field update)
        _fix_heading_outline_levels(styles_xml_path)

        # Process document.xml
        document_xml = extracted_dir / "word" / "document.xml"
        if not document_xml.exists():
            raise ValueError("document.xml not found in the DOCX file")

        # Read and process XML
        content = document_xml.read_text(encoding='utf-8')

        # Fix fldChar structure: split merged begin+instrText+separate into separate <w:r> elements
        content = _fix_fld_char_structure(content)

        # Find TOC structure and add placeholders (uses lxml for robust XML parsing)
        modified_content = _insert_toc_placeholders(content, entries, toc_style_mapping)

        # Write back
        document_xml.write_text(modified_content, encoding='utf-8')

        # Repack DOCX to temp file
        with zipfile.ZipFile(temp_output, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for file_path in extracted_dir.rglob('*'):
                if file_path.is_file():
                    arcname = file_path.relative_to(extracted_dir)
                    zipf.write(file_path, arcname)

        # Replace original file with modified version (use shutil.move for cross-device support)
        docx_path.unlink()
        shutil.move(str(temp_output), str(docx_path))


def _fix_update_fields(settings_xml_path: Path) -> None:
    """Fix settings.xml to ensure <w:updateFields w:val="true"/> is present.

    The docx npm library generates <w:updateFields/> without val="true",
    which Word/WPS interprets as false, preventing TOC auto-update on open.
    """
    if not settings_xml_path.exists():
        return

    content = settings_xml_path.read_text(encoding='utf-8')
    original = content

    # Case 1: <w:updateFields/> (self-closing, no val) → add val="true"
    if '<w:updateFields/>' in content:
        content = content.replace('<w:updateFields/>', '<w:updateFields w:val="true"/>')
        print('Fixed: <w:updateFields/> → <w:updateFields w:val="true"/>')

    # Case 2: <w:updateFields w:val="false"/> → change to true (match precisely)
    elif re.search(r'<w:updateFields\s+w:val="false"\s*/>', content):
        content = re.sub(
            r'<w:updateFields\s+w:val="false"\s*/>',
            '<w:updateFields w:val="true"/>',
            content
        )
        print('Fixed: <w:updateFields w:val="false"/> → <w:updateFields w:val="true"/>')

    # Case 3: Not present at all → inject before </w:settings>
    elif '<w:updateFields' not in content:
        content = content.replace('</w:settings>', '<w:updateFields w:val="true"/></w:settings>')
        print('Fixed: added <w:updateFields w:val="true"/> to settings.xml')

    if content != original:
        settings_xml_path.write_text(content, encoding='utf-8')


def _fix_heading_outline_levels(styles_xml_path: Path) -> None:
    """Fix Heading styles to include outlineLvl in pPr.

    The docx npm library creates Heading styles but sometimes doesn't set outlineLvl
    in the style definition. Without outlineLvl, Word's TOC field update won't find
    headings even though they display correctly.

    This ensures Heading1 has outlineLvl=0, Heading2 has outlineLvl=1, etc.
    """
    if not styles_xml_path.exists():
        return

    content = styles_xml_path.read_text(encoding='utf-8')
    original = content

    W_NS = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'

    for level in range(1, 7):
        style_id = f'Heading{level}'
        outline_val = str(level - 1)

        # Pattern: find <w:style> with w:styleId="HeadingN"
        style_pattern = (
            rf'(<w:style[^>]*w:styleId="{style_id}"[^>]*>)'
            rf'(.*?)'
            rf'(</w:style>)'
        )

        match = re.search(style_pattern, content, flags=re.DOTALL)
        if not match:
            continue

        style_content = match.group(2)

        # Check if outlineLvl already exists in this style
        if f'<w:outlineLvl' in style_content:
            continue

        # Find or create <w:pPr> within this style
        ppr_match = re.search(r'(<w:pPr[^>]*>)(.*?)(</w:pPr>)', style_content, flags=re.DOTALL)
        if ppr_match:
            # Add outlineLvl inside existing pPr
            new_ppr_content = ppr_match.group(2) + f'<w:outlineLvl w:val="{outline_val}"/>'
            new_style_content = (
                style_content[:ppr_match.start()] +
                ppr_match.group(1) + new_ppr_content + ppr_match.group(3) +
                style_content[ppr_match.end():]
            )
        else:
            # No pPr exists, create one
            new_ppr = f'<w:pPr><w:outlineLvl w:val="{outline_val}"/></w:pPr>'
            # Insert pPr right after style opening (after name/basedOn if present)
            new_style_content = new_ppr + style_content

        new_style = match.group(1) + new_style_content + match.group(3)
        content = content[:match.start()] + new_style + content[match.end():]
        print(f'Fixed: added outlineLvl={outline_val} to {style_id} style')

    if content != original:
        styles_xml_path.write_text(content, encoding='utf-8')


def _fix_fld_char_structure(xml_content: str) -> str:
    """Fix malformed fldChar structure where begin+instrText+separate are in one <w:r>.

    The docx npm library generates:
        <w:r><w:fldChar begin/><w:instrText>TOC...</w:instrText><w:fldChar separate/></w:r>

    Word/WPS requires the standard structure:
        <w:r><w:fldChar begin/></w:r>
        <w:r><w:instrText>TOC...</w:instrText></w:r>
        <w:r><w:fldChar separate/></w:r>
    """
    # Match a <w:r> that contains both begin fldChar AND instrText AND separate fldChar
    pattern = (
        r'<w:r(?:\s[^>]*)?>('
        r'<w:fldChar[^>]*w:fldCharType="begin"[^>]*/>'  # begin
        r')('
        r'<w:instrText[^>]*>.*?</w:instrText>'           # instrText
        r')('
        r'<w:fldChar[^>]*w:fldCharType="separate"[^>]*/>'  # separate
        r')</w:r>'
    )

    def split_run(match):
        begin = match.group(1)
        instr = match.group(2)
        separate = match.group(3)
        return f'<w:r>{begin}</w:r><w:r>{instr}</w:r><w:r>{separate}</w:r>'

    modified = re.sub(pattern, split_run, xml_content, flags=re.DOTALL)
    if modified != xml_content:
        print("Fixed: split merged fldChar begin+instrText+separate into separate <w:r> elements")

    # Fix TOC instrText: remove \t switch with wrong style names
    # docx npm lib generates \t "Heading1,1,Heading2,2,..." but Word expects "Heading 1,1,..."
    # Since we already have \o "1-3" which uses outlineLvl (now fixed), \t is redundant and harmful
    toc_t_pattern = r'(TOC\s+[^<]*?)\\t\s+&quot;[^&]*&quot;'
    modified2 = re.sub(toc_t_pattern, r'\1', modified)
    if modified2 != modified:
        print("Fixed: removed \\t switch from TOC instrText (\\o with outlineLvl is sufficient)")
        modified = modified2

    return modified


def _detect_toc_styles(styles_xml_path: Path) -> dict:
    """Detect TOC style IDs from styles.xml.

    Args:
        styles_xml_path: Path to styles.xml

    Returns:
        Dictionary mapping level (1-3) to style ID string
    """
    if not styles_xml_path.exists():
        return {}

    content = styles_xml_path.read_text(encoding='utf-8')
    result = {}

    for level in range(1, 4):
        # Standard TOC style names: "TOC 1", "TOC 2", "TOC 3" (with space)
        # or "TOC1", "TOC2", "TOC3" (no space) — docx-js uses numeric IDs like "9", "11", "12"
        patterns = [
            rf'w:styleId="(TOC{level})"',
            rf'w:styleId="(TOC {level})"',
            rf'<w:name\s+w:val="toc\s*{level}"[^/]*/>\s*</w:name>|<w:name\s+w:val="toc\s*{level}"[^/]*/>',
        ]
        for pattern in patterns[:2]:
            m = re.search(pattern, content)
            if m:
                result[level] = m.group(1)
                break
        else:
            # Try matching by w:name (case insensitive toc N)
            # Find <w:style> blocks with name containing "toc N"
            name_pattern = rf'<w:style[^>]*w:styleId="([^"]*)"[^>]*>.*?<w:name\s+w:val="[Tt][Oo][Cc]\s*{level}"'
            m = re.search(name_pattern, content, flags=re.DOTALL)
            if m:
                result[level] = m.group(1)

    return result


def _ensure_toc_styles(styles_xml_path: Path) -> dict:
    """Ensure TOC styles exist in styles.xml, adding them if necessary.

    Returns:
        Dictionary mapping level (1-3) to style ID string
    """
    if not styles_xml_path.exists():
        return {1: "9", 2: "11", 3: "12"}

    content = styles_xml_path.read_text(encoding='utf-8')
    detected = _detect_toc_styles(styles_xml_path)
    result = dict(detected)

    W_NS = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'

    # Define TOC styles to add if missing
    toc_style_defs = {
        1: {
            'id': '9',
            'name': 'toc 1',
            'xml': f'''<w:style w:type="paragraph" w:styleId="9" xmlns:w="{W_NS}">
  <w:name w:val="toc 1"/>
  <w:basedOn w:val="Normal"/>
  <w:uiPriority w:val="39"/>
  <w:pPr>
    <w:tabs><w:tab w:val="right" w:leader="dot" w:pos="9026"/></w:tabs>
    <w:spacing w:before="120" w:after="60"/>
  </w:pPr>
  <w:rPr><w:b/><w:bCs/></w:rPr>
</w:style>'''
        },
        2: {
            'id': '11',
            'name': 'toc 2',
            'xml': f'''<w:style w:type="paragraph" w:styleId="11" xmlns:w="{W_NS}">
  <w:name w:val="toc 2"/>
  <w:basedOn w:val="Normal"/>
  <w:uiPriority w:val="39"/>
  <w:pPr>
    <w:tabs><w:tab w:val="right" w:leader="dot" w:pos="9026"/></w:tabs>
    <w:ind w:left="360"/>
    <w:spacing w:before="60" w:after="40"/>
  </w:pPr>
</w:style>'''
        },
        3: {
            'id': '12',
            'name': 'toc 3',
            'xml': f'''<w:style w:type="paragraph" w:styleId="12" xmlns:w="{W_NS}">
  <w:name w:val="toc 3"/>
  <w:basedOn w:val="Normal"/>
  <w:uiPriority w:val="39"/>
  <w:pPr>
    <w:tabs><w:tab w:val="right" w:leader="dot" w:pos="9026"/></w:tabs>
    <w:ind w:left="720"/>
    <w:spacing w:before="40" w:after="20"/>
  </w:pPr>
</w:style>'''
        },
    }

    modified = False
    for level in range(1, 4):
        if level not in result:
            style_def = toc_style_defs[level]
            result[level] = style_def['id']
            # Add style before </w:styles>
            insert_point = content.rfind('</w:styles>')
            if insert_point == -1:
                print(f"WARNING: Could not find </w:styles> to insert TOC {level} style", file=sys.stderr)
                continue
            content = content[:insert_point] + style_def['xml'] + '\n' + content[insert_point:]
            print(f"Added TOC {level} style (ID: {style_def['id']})")
            modified = True

    if modified:
        styles_xml_path.write_text(content, encoding='utf-8')

    # Ensure Hyperlink style exists
    _ensure_hyperlink_style(styles_xml_path)

    return result


def _ensure_hyperlink_style(styles_xml_path: Path) -> None:
    """Ensure Hyperlink character style exists in styles.xml."""
    if not styles_xml_path.exists():
        return

    content = styles_xml_path.read_text(encoding='utf-8')
    if 'w:styleId="Hyperlink"' in content:
        return

    W_NS = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    hyperlink_style = f'''<w:style w:type="character" w:styleId="Hyperlink" xmlns:w="{W_NS}">
  <w:name w:val="Hyperlink"/>
  <w:uiPriority w:val="99"/>
  <w:rPr>
    <w:color w:val="0563C1"/>
    <w:u w:val="single"/>
  </w:rPr>
</w:style>'''

    insert_point = content.rfind('</w:styles>')
    if insert_point != -1:
        content = content[:insert_point] + hyperlink_style + '\n' + content[insert_point:]
        styles_xml_path.write_text(content, encoding='utf-8')
        print("Added Hyperlink character style")


def _insert_toc_placeholders(xml_content: str, entries: list = None, toc_style_mapping: dict = None) -> str:
    """Insert placeholder TOC entries and heading bookmarks into XML content.

    Uses lxml ElementTree for robust XML manipulation instead of fragile regex.

    This function does TWO things:
    1. Adds bookmark anchors to each Heading paragraph (so Word can link TOC → heading)
    2. Replaces TOC placeholder area with proper entries containing HYPERLINK + PAGEREF

    Args:
        xml_content: The XML content of document.xml
        entries: List of placeholder entries with 'level', 'text', 'page' keys
        toc_style_mapping: Dictionary mapping level to style ID

    Returns:
        Modified XML content with bookmarks and TOC placeholders

    Raises:
        RuntimeError: If TOC structure cannot be found or is malformed
    """
    from lxml import etree

    if entries is None:
        entries = [{"level": 1, "text": "Contents", "page": "1"}]

    if toc_style_mapping is None:
        toc_style_mapping = {1: "9", 2: "11", 3: "12"}

    W = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    R_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"

    # Parse XML
    root = etree.fromstring(xml_content.encode('utf-8'))
    nsmap = {'w': W, 'r': R_NS}

    # ── Step 1: Add bookmarks to Heading paragraphs ──
    bookmark_id_counter = 100000
    heading_bookmark_map = {}  # text → first bookmark_name (backward compat)
    heading_bookmark_map_all = {}  # text → [list of bookmark_names] for duplicate headings

    for para in root.iter(f'{{{W}}}p'):
        # Find pStyle
        ppr = para.find(f'{{{W}}}pPr')
        if ppr is None:
            continue
        pstyle = ppr.find(f'{{{W}}}pStyle')
        if pstyle is None:
            continue
        style_val = pstyle.get(f'{{{W}}}val', '')
        if not re.match(r'Heading\d$', style_val):
            continue

        # Extract heading text
        texts = []
        for t_elem in para.iter(f'{{{W}}}t'):
            if t_elem.text:
                texts.append(t_elem.text)
        heading_text = ''.join(texts).strip()
        if not heading_text:
            continue

        # Skip if already has bookmark
        if para.find(f'{{{W}}}bookmarkStart') is not None:
            continue

        # Generate bookmark
        bm_name = f"_Toc{bookmark_id_counter}"
        bm_id_str = str(bookmark_id_counter)
        bookmark_id_counter += 1

        # Store mapping (support duplicate headings)
        if heading_text not in heading_bookmark_map_all:
            heading_bookmark_map_all[heading_text] = []
        heading_bookmark_map_all[heading_text].append(bm_name)
        if heading_text not in heading_bookmark_map:
            heading_bookmark_map[heading_text] = bm_name

        # Insert bookmarkStart after pPr
        bm_start = etree.Element(f'{{{W}}}bookmarkStart')
        bm_start.set(f'{{{W}}}id', bm_id_str)
        bm_start.set(f'{{{W}}}name', bm_name)

        bm_end = etree.Element(f'{{{W}}}bookmarkEnd')
        bm_end.set(f'{{{W}}}id', bm_id_str)

        ppr_index = list(para).index(ppr)
        para.insert(ppr_index + 1, bm_start)
        # bookmarkEnd at end of paragraph
        para.append(bm_end)

    bookmarks_added = len(heading_bookmark_map)
    if bookmarks_added > 0:
        print(f"Added {bookmarks_added} bookmarks to Heading paragraphs")

    # ── Step 2: Find TOC field structure (begin → instrText → separate → end) ──
    toc_separate_para = None
    toc_end_para = None

    # Track field nesting to handle nested fields correctly
    field_stack = []
    toc_field_depth = None

    for fld_char in root.iter(f'{{{W}}}fldChar'):
        fld_type = fld_char.get(f'{{{W}}}fldCharType')
        run = fld_char.getparent()

        if fld_type == 'begin':
            para = run.getparent()
            instr_text = ''
            found_run = False
            for sibling in para:
                if sibling is run:
                    found_run = True
                    it = sibling.find(f'{{{W}}}instrText')
                    if it is not None and it.text:
                        instr_text += it.text
                    continue
                if found_run and sibling.tag == f'{{{W}}}r':
                    it = sibling.find(f'{{{W}}}instrText')
                    if it is not None and it.text:
                        instr_text += it.text
                    if sibling.find(f'{{{W}}}fldChar') is not None:
                        break

            field_stack.append(instr_text.strip())
            if 'TOC' in instr_text and toc_field_depth is None:
                toc_field_depth = len(field_stack)

        elif fld_type == 'separate':
            if toc_field_depth is not None and len(field_stack) == toc_field_depth:
                toc_separate_para = run.getparent()

        elif fld_type == 'end':
            if toc_field_depth is not None and len(field_stack) == toc_field_depth:
                toc_end_para = run.getparent()
                break
            if field_stack:
                field_stack.pop()

    if toc_separate_para is None or toc_end_para is None:
        has_begin = root.find(f'.//{{{W}}}fldChar[@{{{W}}}fldCharType="begin"]') is not None
        has_separate = root.find(f'.//{{{W}}}fldChar[@{{{W}}}fldCharType="separate"]') is not None
        if not has_begin:
            raise RuntimeError(
                "TOC FAILED: No field structure found in document. "
                "Ensure the code includes a TableOfContents element."
            )
        elif not has_separate:
            raise RuntimeError(
                "TOC FAILED: TOC field has 'begin' but no 'separate' fldChar. "
                "Run _fix_fld_char_structure() first or check the docx-js version."
            )
        else:
            raise RuntimeError(
                "TOC FAILED: Field structure found but no TOC instrText detected. "
                "Ensure TableOfContents element generates a TOC \\o field code."
            )

    # ── Step 3: Remove everything between separate-para and end-para ──
    # The TOC paragraphs may be direct children of <w:body> or wrapped in <w:sdt><w:sdtContent>
    toc_container = toc_separate_para.getparent()  # could be body or sdtContent
    container_children = list(toc_container)

    sep_idx = container_children.index(toc_separate_para)
    end_idx = container_children.index(toc_end_para)

    for elem in container_children[sep_idx + 1:end_idx]:
        toc_container.remove(elem)

    # ── Step 4: Build and insert placeholder paragraphs ──
    indent_mapping = {1: 0, 2: 360, 3: 720, 4: 1080, 5: 1440, 6: 1800}
    heading_occurrence_counter = {}

    insert_pos = list(toc_container).index(toc_end_para)

    for entry in entries:
        level = entry.get('level', 1)
        text_raw = entry.get('text', '')
        page = entry.get('page', '1')

        toc_style = toc_style_mapping.get(level, toc_style_mapping.get(1, "9"))
        indent = indent_mapping.get(level, 0)

        # Resolve bookmark (handle duplicate headings correctly)
        bm_name = ''
        if text_raw in heading_bookmark_map_all:
            occ = heading_occurrence_counter.get(text_raw, 0)
            bm_list = heading_bookmark_map_all[text_raw]
            if occ < len(bm_list):
                bm_name = bm_list[occ]
            heading_occurrence_counter[text_raw] = occ + 1

        # Build paragraph element
        p = etree.Element(f'{{{W}}}p')
        toc_container.insert(insert_pos, p)
        insert_pos += 1

        # pPr
        ppr = etree.SubElement(p, f'{{{W}}}pPr')
        pstyle = etree.SubElement(ppr, f'{{{W}}}pStyle')
        pstyle.set(f'{{{W}}}val', str(toc_style))
        if indent > 0:
            ind = etree.SubElement(ppr, f'{{{W}}}ind')
            ind.set(f'{{{W}}}left', str(indent))
        tabs = etree.SubElement(ppr, f'{{{W}}}tabs')
        tab = etree.SubElement(tabs, f'{{{W}}}tab')
        tab.set(f'{{{W}}}val', 'right')
        tab.set(f'{{{W}}}leader', 'dot')
        tab.set(f'{{{W}}}pos', '9026')
        spacing = etree.SubElement(ppr, f'{{{W}}}spacing')
        spacing.set(f'{{{W}}}before', '120')
        spacing.set(f'{{{W}}}after', '60')

        if bm_name:
            hyperlink = etree.SubElement(p, f'{{{W}}}hyperlink')
            hyperlink.set(f'{{{W}}}anchor', bm_name)
            hyperlink.set(f'{{{W}}}history', '1')

            r_text = etree.SubElement(hyperlink, f'{{{W}}}r')
            rpr = etree.SubElement(r_text, f'{{{W}}}rPr')
            rstyle = etree.SubElement(rpr, f'{{{W}}}rStyle')
            rstyle.set(f'{{{W}}}val', 'Hyperlink')
            t = etree.SubElement(r_text, f'{{{W}}}t')
            t.text = text_raw

            r_tab = etree.SubElement(hyperlink, f'{{{W}}}r')
            etree.SubElement(r_tab, f'{{{W}}}tab')

            r_begin = etree.SubElement(hyperlink, f'{{{W}}}r')
            fc_begin = etree.SubElement(r_begin, f'{{{W}}}fldChar')
            fc_begin.set(f'{{{W}}}fldCharType', 'begin')

            r_instr = etree.SubElement(hyperlink, f'{{{W}}}r')
            instr = etree.SubElement(r_instr, f'{{{W}}}instrText')
            instr.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
            instr.text = f' PAGEREF {bm_name} \\h '

            r_sep = etree.SubElement(hyperlink, f'{{{W}}}r')
            fc_sep = etree.SubElement(r_sep, f'{{{W}}}fldChar')
            fc_sep.set(f'{{{W}}}fldCharType', 'separate')

            r_page = etree.SubElement(hyperlink, f'{{{W}}}r')
            t_page = etree.SubElement(r_page, f'{{{W}}}t')
            t_page.text = str(page)

            r_end = etree.SubElement(hyperlink, f'{{{W}}}r')
            fc_end = etree.SubElement(r_end, f'{{{W}}}fldChar')
            fc_end.set(f'{{{W}}}fldCharType', 'end')
        else:
            r_text = etree.SubElement(p, f'{{{W}}}r')
            t = etree.SubElement(r_text, f'{{{W}}}t')
            t.text = text_raw

            r_tab = etree.SubElement(p, f'{{{W}}}r')
            etree.SubElement(r_tab, f'{{{W}}}tab')

            r_page = etree.SubElement(p, f'{{{W}}}r')
            t_page = etree.SubElement(r_page, f'{{{W}}}t')
            t_page.text = str(page)

    placeholders_inserted = len(entries)
    print(f"Inserted {placeholders_inserted} TOC placeholder entries")

    # Serialize back to string
    result = etree.tostring(root, xml_declaration=True, encoding='UTF-8', standalone=True)
    return result.decode('utf-8')


def main():
    parser = argparse.ArgumentParser(
        description='Add placeholder entries to Table of Contents in a DOCX file (in-place)'
    )
    parser.add_argument('docx_file', help='DOCX file to modify (will be replaced)')
    parser.add_argument(
        '--auto', action='store_true',
        help='Auto-extract Heading 1-3 from the DOCX as TOC entries (recommended)'
    )
    parser.add_argument(
        '--entries',
        help='JSON string with placeholder entries: [{"level":1,"text":"Chapter 1","page":"1"}]'
    )

    args = parser.parse_args()

    # Determine entries
    entries = None
    if args.entries:
        try:
            entries = json.loads(args.entries)
        except json.JSONDecodeError as e:
            print(f"Error parsing entries JSON: {e}", file=sys.stderr)
            sys.exit(1)
    elif args.auto or True:
        # Default to auto mode — always extract from document headings
        entries = _extract_headings_from_docx(args.docx_file)
        if entries:
            print(f"Auto-extracted {len(entries)} headings from document", file=sys.stderr)
        else:
            print("No headings found in document, using minimal placeholder", file=sys.stderr)
            entries = [{"level": 1, "text": "Contents", "page": "1"}]

    # Add placeholders
    try:
        add_toc_placeholders(args.docx_file, entries)
        print(f"Successfully added TOC placeholders to {args.docx_file}")
    except RuntimeError as e:
        # TOC structure errors — hard fail with exit code 1
        print(f"ERROR: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()