#!/usr/bin/env python3
"""
Add placeholder entries to Table of Contents in a DOCX file.

This script adds placeholder TOC entries between the 'separate' and 'end'
field characters, so users see some content on first open instead of an
empty TOC. The original file is replaced with the modified version.

Usage:
    python add_toc_placeholders.py <docx_file>            # auto-extract headings (default)
    python add_toc_placeholders.py <docx_file> --auto     # explicit auto mode
    python add_toc_placeholders.py <docx_file> --entries <entries_json>

    entries_json format: JSON string with array of objects:
    [
        {"level": 1, "text": "Chapter 1 Overview", "page": "1"},
        {"level": 2, "text": "Section 1.1 Details", "page": "1"}
    ]

    Default behavior (no flags): auto-extracts Heading 1-3 from the document.
    Filters out table/figure captions (e.g. "表 1:xxx", "图 2:xxx").

Example:
    python add_toc_placeholders.py document.docx
    python add_toc_placeholders.py document.docx --auto
    python add_toc_placeholders.py document.docx --entries '[{"level":1,"text":"Introduction","page":"1"}]'
"""

import argparse
import html
import json
import re
import shutil
import sys
import tempfile
import zipfile
from pathlib import Path


def _extract_headings_from_docx(docx_path: str, max_level: int = 3) -> list:
    """Extract headings from a DOCX file for auto-mode TOC generation.

    Args:
        docx_path: Path to DOCX file
        max_level: Maximum heading level to include (default 3)

    Returns:
        List of dicts with 'level', 'text', 'page' keys. 'page' is only a
        rough estimate derived from the paragraph's position in the document.
    """
    from docx import Document

    doc = Document(docx_path)
    entries = []

    # Pattern to filter out table/figure captions styled as headings
    caption_pattern = re.compile(r'^[表图]\s*\d')

    for i, para in enumerate(doc.paragraphs):
        style_name = para.style.name if para.style else ''
        if not style_name.startswith('Heading'):
            continue

        m = re.search(r'(\d+)', style_name)
        if not m:
            continue

        level = int(m.group(1))
        if level > max_level:
            continue

        text = para.text.strip()
        if not text:
            continue

        # Filter table/figure captions
        if caption_pattern.match(text):
            continue

        # Rough page estimate: one page per ~8 paragraphs (i is the index of
        # the paragraph in the document, not a count of headings).
        page_estimate = max(1, 1 + i // 8)
        entries.append({"level": level, "text": text, "page": str(page_estimate)})

    return entries


def add_toc_placeholders(docx_path: str, entries: list = None) -> None:
    """Add placeholder TOC entries to a DOCX file (in-place replacement).

    The package is unpacked into a temporary directory, styles.xml,
    settings.xml and document.xml are patched, and the result is re-zipped.
    The original file is only replaced once the new archive has been written
    completely, so a failure never leaves the caller without a document.

    Args:
        docx_path: Path to DOCX file (will be modified in-place)
        entries: Optional list of placeholder entries. Each entry should be a
            dict with 'level' (1-3), 'text', and 'page' keys.

    Raises:
        ValueError: If the archive has no word/document.xml.
        RuntimeError: If no usable TOC field is found in the document.
    """
    docx_path = Path(docx_path)

    # Create temp directory for extraction
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)
        extracted_dir = temp_path / "extracted"
        temp_output = temp_path / "output.docx"

        # Extract DOCX
        with zipfile.ZipFile(docx_path, 'r') as zip_ref:
            zip_ref.extractall(extracted_dir)

        # Fail early, before touching any other part, if this is not a
        # word-processing package.
        document_xml = extracted_dir / "word" / "document.xml"
        if not document_xml.exists():
            raise ValueError("document.xml not found in the DOCX file")

        # Ensure TOC styles exist in styles.xml
        styles_xml_path = extracted_dir / "word" / "styles.xml"
        toc_style_mapping = _ensure_toc_styles(styles_xml_path)
        print(f"TOC style mapping: {toc_style_mapping}")

        # Fix settings.xml: ensure updateFields has val="true"
        settings_xml_path = extracted_dir / "word" / "settings.xml"
        _fix_update_fields(settings_xml_path)

        # Fix Heading styles: ensure outlineLvl is set (required for TOC field update)
        _fix_heading_outline_levels(styles_xml_path)

        # Read and process document.xml
        content = document_xml.read_text(encoding='utf-8')

        # Fix fldChar structure: split merged begin+instrText+separate into separate runs
        content = _fix_fld_char_structure(content)

        # Find TOC structure and add placeholders (uses lxml for robust XML parsing)
        modified_content = _insert_toc_placeholders(content, entries, toc_style_mapping)

        # Write back
        document_xml.write_text(modified_content, encoding='utf-8')

        # Repack DOCX to temp file
        with zipfile.ZipFile(temp_output, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for file_path in extracted_dir.rglob('*'):
                if file_path.is_file():
                    arcname = file_path.relative_to(extracted_dir)
                    zipf.write(file_path, arcname)

        # Stage the new archive next to the target (same filesystem), then swap
        # it in with an atomic replace. Deleting the original first would lose
        # the document if the copy failed half-way (disk full, cross-device
        # error, ...).
        staged = docx_path.with_name(docx_path.name + ".toc-tmp")
        try:
            shutil.copyfile(temp_output, staged)
            staged.replace(docx_path)
        finally:
            if staged.exists():
                staged.unlink()


def _fix_update_fields(settings_xml_path: Path) -> None:
    """Fix settings.xml to ensure <w:updateFields w:val="true"/> is present.

    The docx npm library generates <w:updateFields/> without val="true",
    which Word/WPS interprets as false, preventing TOC auto-update on open.
    The file is rewritten only when something actually changed; a missing
    settings.xml is silently ignored.
    """
    if not settings_xml_path.exists():
        return

    content = settings_xml_path.read_text(encoding='utf-8')
    original = content

    enabled = '<w:updateFields w:val="true"/>'
    bare_pattern = r'<w:updateFields\s*/>'
    disabled_pattern = r'<w:updateFields\s+w:val="(?:false|0|off)"\s*/>'

    # Case 1: <w:updateFields/> (self-closing, no val) -> add val="true"
    if re.search(bare_pattern, content):
        content = re.sub(bare_pattern, enabled, content)
        print('Fixed: <w:updateFields/> now has w:val="true"')
    # Case 2: explicitly disabled -> change to true (match precisely)
    elif re.search(disabled_pattern, content):
        content = re.sub(disabled_pattern, enabled, content)
        print('Fixed: <w:updateFields> switched to w:val="true"')
    # Case 3: Not present at all -> inject before </w:settings>
    elif '<w:updateFields' not in content:
        content = content.replace('</w:settings>', enabled + '</w:settings>')
        if content != original:
            print('Fixed: added <w:updateFields w:val="true"/> to settings.xml')

    if content != original:
        settings_xml_path.write_text(content, encoding='utf-8')


def _fix_heading_outline_levels(styles_xml_path: Path) -> None:
    """Fix Heading styles to include outlineLvl in pPr.

    The docx npm library creates Heading styles but sometimes doesn't set
    outlineLvl in the style definition. Without outlineLvl, Word's TOC field
    update won't find headings even though they display correctly.

    This ensures Heading1 has outlineLvl=0, Heading2 has outlineLvl=1, etc.
    (Heading1 through Heading6). Styles that already carry an outlineLvl are
    left untouched.
    """
    if not styles_xml_path.exists():
        return

    content = styles_xml_path.read_text(encoding='utf-8')
    original = content

    for level in range(1, 7):
        style_id = f'Heading{level}'
        outline_val = str(level - 1)
        outline_xml = f'<w:outlineLvl w:val="{outline_val}"/>'

        # Pattern: find <w:style> with w:styleId="HeadingN"
        style_pattern = (
            rf'(<w:style\b[^>]*w:styleId="{style_id}"[^>]*>)'
            r'(.*?)'
            r'(</w:style>)'
        )
        match = re.search(style_pattern, content, flags=re.DOTALL)
        if not match:
            continue

        style_content = match.group(2)

        # Check if outlineLvl already exists in this style
        if '<w:outlineLvl' in style_content:
            continue

        # Find <w:pPr> within this style (open/close form, or self-closing)
        ppr_match = re.search(
            r'(<w:pPr(?:\s[^>/]*)?>)(.*?)(</w:pPr>)', style_content, flags=re.DOTALL
        )
        empty_ppr_match = re.search(r'<w:pPr\s*/>', style_content)

        if ppr_match:
            # Add outlineLvl as the last child of the existing pPr
            inner_end = ppr_match.end(2)
            new_style_content = (
                style_content[:inner_end] + outline_xml + style_content[inner_end:]
            )
        elif empty_ppr_match:
            # <w:pPr/> -> <w:pPr><w:outlineLvl .../></w:pPr>
            new_style_content = (
                style_content[:empty_ppr_match.start()]
                + f'<w:pPr>{outline_xml}</w:pPr>'
                + style_content[empty_ppr_match.end():]
            )
        else:
            # No pPr exists, create one. In a style definition pPr must come
            # after name/basedOn/next/... and before rPr, so insert it right
            # before rPr when there is one, otherwise at the end.
            new_ppr = f'<w:pPr>{outline_xml}</w:pPr>'
            rpr_pos = style_content.find('<w:rPr')
            if rpr_pos == -1:
                new_style_content = style_content + new_ppr
            else:
                new_style_content = (
                    style_content[:rpr_pos] + new_ppr + style_content[rpr_pos:]
                )

        new_style = match.group(1) + new_style_content + match.group(3)
        content = content[:match.start()] + new_style + content[match.end():]
        print(f'Fixed: added outlineLvl={outline_val} to {style_id} style')

    if content != original:
        styles_xml_path.write_text(content, encoding='utf-8')


def _fix_fld_char_structure(xml_content: str) -> str:
    """Fix malformed fldChar structure where begin+instrText+separate are in one <w:r>.

    The docx npm library generates:
        <w:r><w:fldChar begin/><w:instrText>TOC...</w:instrText><w:fldChar separate/></w:r>

    Word/WPS requires the standard structure:
        <w:r><w:fldChar begin/></w:r>
        <w:r><w:instrText>TOC...</w:instrText></w:r>
        <w:r><w:fldChar separate/></w:r>

    It also drops the ``\\t`` switch from TOC field codes (see below).

    Args:
        xml_content: Raw text of document.xml

    Returns:
        The (possibly) rewritten XML text
    """
    # Match a <w:r> that contains begin fldChar AND instrText AND separate fldChar.
    # The instrText body is limited to [^<]* (it only ever holds text) so a
    # match can never run across several runs.
    pattern = (
        r'<w:r(?:\s[^>]*)?>\s*('
        r'<w:fldChar[^>]*w:fldCharType="begin"[^>]*/>'        # begin
        r')\s*('
        r'<w:instrText[^>]*>[^<]*</w:instrText>'              # instrText
        r')\s*('
        r'<w:fldChar[^>]*w:fldCharType="separate"[^>]*/>'     # separate
        r')\s*</w:r>'
    )

    def split_run(match):
        begin = match.group(1)
        instr = match.group(2)
        separate = match.group(3)
        return f'<w:r>{begin}</w:r><w:r>{instr}</w:r><w:r>{separate}</w:r>'

    modified = re.sub(pattern, split_run, xml_content)

    if modified != xml_content:
        print("Fixed: split merged fldChar begin+instrText+separate into separate <w:r> elements")

    # Fix TOC instrText: remove \t switch with wrong style names
    # docx npm lib generates \t "Heading1,1,Heading2,2,..." but Word expects "Heading 1,1,..."
    # Since we already have \o "1-3" which uses outlineLvl (now fixed), \t is redundant and harmful.
    # The quotes may be serialized either as &quot; or as literal characters.
    toc_t_pattern = r'(TOC\s+[^<]*?)\\t\s+(?:&quot;[^&<]*&quot;|"[^"<]*")'
    modified2 = re.sub(toc_t_pattern, r'\1', modified)
    if modified2 != modified:
        print("Fixed: removed \\t switch from TOC instrText (\\o with outlineLvl is sufficient)")
        modified = modified2

    return modified


def _iter_style_blocks(styles_xml: str):
    """Yield (style_id, inner_xml) for every <w:style> element in styles.xml text."""
    style_re = re.compile(
        r'<w:style\b[^>]*\bw:styleId="([^"]*)"[^>]*(?<!/)>(.*?)</w:style>',
        flags=re.DOTALL,
    )
    for m in style_re.finditer(styles_xml):
        yield m.group(1), m.group(2)


def _detect_toc_styles(styles_xml_path: Path) -> dict:
    """Detect TOC style IDs from styles.xml.

    A level is first looked up by style ID ("TOC1" / "TOC 1"); if that fails,
    by style name "toc N" (case-insensitive), which covers generators such as
    docx-js that use numeric IDs like "9", "11", "12".

    Args:
        styles_xml_path: Path to styles.xml

    Returns:
        Dictionary mapping level (1-3) to style ID string; levels without a
        matching style are absent.
    """
    if not styles_xml_path.exists():
        return {}

    content = styles_xml_path.read_text(encoding='utf-8')
    style_blocks = list(_iter_style_blocks(content))
    result = {}

    for level in range(1, 4):
        id_patterns = (
            rf'w:styleId="(TOC{level})"',
            rf'w:styleId="(TOC {level})"',
        )
        for pattern in id_patterns:
            m = re.search(pattern, content)
            if m:
                result[level] = m.group(1)
                break
        else:
            # Match by w:name inside each style block. Searching the whole
            # file with "<w:style ...>.*?<w:name ...>" would anchor at the
            # first style of the file and return the wrong ID.
            wanted = f'toc {level}'
            for style_id, body in style_blocks:
                name = re.search(r'<w:name\s+w:val="([^"]*)"', body)
                if name and name.group(1).strip().lower() == wanted:
                    result[level] = style_id
                    break

    return result


def _toc_style_xml(style_id: str, level: int) -> str:
    """Build the XML of a minimal 'toc N' paragraph style."""
    indent = {1: 0, 2: 360, 3: 720}.get(level, 0)
    ind_xml = f'<w:ind w:left="{indent}"/>' if indent > 0 else ''
    return (
        f'<w:style w:type="paragraph" w:styleId="{style_id}">'
        f'<w:name w:val="toc {level}"/>'
        '<w:uiPriority w:val="39"/>'
        '<w:unhideWhenUsed/>'
        '<w:pPr>'
        '<w:tabs><w:tab w:val="right" w:leader="dot" w:pos="9026"/></w:tabs>'
        '<w:spacing w:before="120" w:after="60"/>'
        f'{ind_xml}'
        '</w:pPr>'
        '</w:style>'
    )


def _ensure_toc_styles(styles_xml_path: Path) -> dict:
    """Ensure TOC styles exist in styles.xml, adding them if necessary.

    Also makes sure the "Hyperlink" character style used by the placeholder
    entries exists.

    Returns:
        Dictionary mapping level (1-3) to style ID string
    """
    default_ids = {1: "9", 2: "11", 3: "12"}

    if not styles_xml_path.exists():
        return dict(default_ids)

    content = styles_xml_path.read_text(encoding='utf-8')
    result = dict(_detect_toc_styles(styles_xml_path))

    modified = False
    for level in range(1, 4):
        if level in result:
            continue

        # Prefer the conventional numeric ID, but never reuse an ID that
        # already belongs to another style. "TOC<n>" is guaranteed free here
        # because detection above would have found it otherwise.
        style_id = default_ids[level]
        if f'w:styleId="{style_id}"' in content:
            style_id = f'TOC{level}'
        result[level] = style_id

        # Add style before </w:styles>
        insert_point = content.rfind('</w:styles>')
        if insert_point == -1:
            print(f"WARNING: Could not find </w:styles> to insert TOC {level} style",
                  file=sys.stderr)
            continue

        content = (
            content[:insert_point]
            + _toc_style_xml(style_id, level) + '\n'
            + content[insert_point:]
        )
        print(f"Added TOC {level} style (ID: {style_id})")
        modified = True

    if modified:
        styles_xml_path.write_text(content, encoding='utf-8')

    # Ensure Hyperlink style exists
    _ensure_hyperlink_style(styles_xml_path)

    return result


def _ensure_hyperlink_style(styles_xml_path: Path) -> None:
    """Ensure Hyperlink character style exists in styles.xml."""
    if not styles_xml_path.exists():
        return

    content = styles_xml_path.read_text(encoding='utf-8')

    if 'w:styleId="Hyperlink"' in content:
        return

    hyperlink_style = (
        '<w:style w:type="character" w:styleId="Hyperlink">'
        '<w:name w:val="Hyperlink"/>'
        '<w:uiPriority w:val="99"/>'
        '<w:unhideWhenUsed/>'
        '<w:rPr><w:color w:val="0563C1"/><w:u w:val="single"/></w:rPr>'
        '</w:style>'
    )

    insert_point = content.rfind('</w:styles>')
    if insert_point != -1:
        content = content[:insert_point] + hyperlink_style + '\n' + content[insert_point:]
        styles_xml_path.write_text(content, encoding='utf-8')
        print("Added Hyperlink character style")


def _insert_toc_placeholders(xml_content: str, entries: list = None,
                             toc_style_mapping: dict = None) -> str:
    """Insert placeholder TOC entries and heading bookmarks into XML content.

    Uses lxml ElementTree for robust XML manipulation instead of fragile regex.

    This function does TWO things:
    1. Adds bookmark anchors to each Heading paragraph (so Word can link TOC -> heading)
    2. Replaces TOC placeholder area with proper entries containing HYPERLINK + PAGEREF

    Args:
        xml_content: The XML content of document.xml
        entries: List of placeholder entries with 'level', 'text', 'page' keys
        toc_style_mapping: Dictionary mapping level to style ID

    Returns:
        Modified XML content with bookmarks and TOC placeholders

    Raises:
        RuntimeError: If TOC structure cannot be found or is malformed
    """
    from lxml import etree

    if entries is None:
        entries = [{"level": 1, "text": "Contents", "page": "1"}]

    if toc_style_mapping is None:
        toc_style_mapping = {1: "9", 2: "11", 3: "12"}

    W = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"

    def w(tag):
        """Return *tag* qualified with the WordprocessingML namespace."""
        return f'{{{W}}}{tag}'

    def add_text_run(parent, text, rstyle=None):
        """Append <w:r>[<w:rPr><w:rStyle/></w:rPr>]<w:t>text</w:t></w:r>."""
        r = etree.SubElement(parent, w('r'))
        if rstyle:
            rpr = etree.SubElement(r, w('rPr'))
            etree.SubElement(rpr, w('rStyle')).set(w('val'), rstyle)
        etree.SubElement(r, w('t')).text = text

    def add_fld_char_run(parent, kind):
        """Append <w:r><w:fldChar w:fldCharType=kind/></w:r>."""
        r = etree.SubElement(parent, w('r'))
        etree.SubElement(r, w('fldChar')).set(w('fldCharType'), kind)

    # Parse XML
    root = etree.fromstring(xml_content.encode('utf-8'))

    # ── Step 1: Add bookmarks to Heading paragraphs ──
    # Collect bookmark ids/names already in use so new ones never collide.
    used_ids = set()
    used_names = set()
    for bm in root.iter(w('bookmarkStart')):
        used_names.add(bm.get(w('name'), ''))
        try:
            used_ids.add(int(bm.get(w('id'), '')))
        except ValueError:
            pass

    next_bookmark_id = 100000
    bookmarks_added = 0
    # heading text -> bookmark names in document order (one per occurrence,
    # so duplicate headings link to the right paragraph)
    heading_bookmark_map_all = {}

    # Materialize the paragraph list first: the loop adds children to the tree.
    for para in list(root.iter(w('p'))):
        # Find pStyle
        ppr = para.find(w('pPr'))
        if ppr is None:
            continue
        pstyle = ppr.find(w('pStyle'))
        if pstyle is None:
            continue
        style_val = pstyle.get(w('val'), '')
        if not re.match(r'Heading\d$', style_val):
            continue

        # Extract heading text
        heading_text = ''.join(
            t_elem.text for t_elem in para.iter(w('t')) if t_elem.text
        ).strip()
        if not heading_text:
            continue

        # Reuse an existing _Toc bookmark (e.g. from a previous run of this
        # script) instead of skipping the heading; skipping would leave the
        # entry unlinked and shift the links of later duplicate headings.
        existing_name = next(
            (bm.get(w('name')) for bm in para.findall(w('bookmarkStart'))
             if (bm.get(w('name')) or '').startswith('_Toc')),
            None,
        )
        if existing_name:
            heading_bookmark_map_all.setdefault(heading_text, []).append(existing_name)
            continue

        # Generate a bookmark id/name that is not in use yet
        while next_bookmark_id in used_ids or f"_Toc{next_bookmark_id}" in used_names:
            next_bookmark_id += 1
        bm_name = f"_Toc{next_bookmark_id}"
        bm_id_str = str(next_bookmark_id)
        used_ids.add(next_bookmark_id)
        used_names.add(bm_name)
        next_bookmark_id += 1

        heading_bookmark_map_all.setdefault(heading_text, []).append(bm_name)

        # Insert bookmarkStart after pPr
        bm_start = etree.Element(w('bookmarkStart'))
        bm_start.set(w('id'), bm_id_str)
        bm_start.set(w('name'), bm_name)

        bm_end = etree.Element(w('bookmarkEnd'))
        bm_end.set(w('id'), bm_id_str)

        ppr_index = list(para).index(ppr)
        para.insert(ppr_index + 1, bm_start)
        # bookmarkEnd at end of paragraph
        para.append(bm_end)
        bookmarks_added += 1

    if bookmarks_added > 0:
        print(f"Added {bookmarks_added} bookmarks to Heading paragraphs")

    # ── Step 2: Find TOC field structure (begin → instrText → separate → end) ──
    toc_separate_para = None
    toc_end_para = None

    # Track field nesting to handle nested fields correctly
    field_stack = []
    toc_field_depth = None

    for fld_char in root.iter(w('fldChar')):
        fld_type = fld_char.get(w('fldCharType'))
        run = fld_char.getparent()

        if fld_type == 'begin':
            # Gather the field code: instrText of the begin run itself plus
            # the following sibling runs up to the next run holding a fldChar.
            para = run.getparent()
            instr_text = ''
            found_run = False
            for sibling in para:
                if sibling is run:
                    found_run = True
                    it = sibling.find(w('instrText'))
                    if it is not None and it.text:
                        instr_text += it.text
                    continue
                if found_run and sibling.tag == w('r'):
                    it = sibling.find(w('instrText'))
                    if it is not None and it.text:
                        instr_text += it.text
                    if sibling.find(w('fldChar')) is not None:
                        break
            field_stack.append(instr_text.strip())
            # The field keyword must lead the instruction; a plain substring
            # test would also accept e.g. a field that merely mentions "TOC".
            if toc_field_depth is None and re.match(r'\s*TOC\b', instr_text, re.IGNORECASE):
                toc_field_depth = len(field_stack)

        elif fld_type == 'separate':
            if toc_field_depth is not None and len(field_stack) == toc_field_depth:
                toc_separate_para = run.getparent()

        elif fld_type == 'end':
            if toc_field_depth is not None and len(field_stack) == toc_field_depth:
                toc_end_para = run.getparent()
                break
            if field_stack:
                field_stack.pop()

    if toc_separate_para is None or toc_end_para is None:
        has_begin = root.find(f'.//{{{W}}}fldChar[@{{{W}}}fldCharType="begin"]') is not None
        has_separate = root.find(f'.//{{{W}}}fldChar[@{{{W}}}fldCharType="separate"]') is not None
        if not has_begin:
            raise RuntimeError(
                "TOC FAILED: No field structure found in document. "
                "Ensure the code includes a TableOfContents element."
            )
        elif not has_separate:
            raise RuntimeError(
                "TOC FAILED: TOC field has 'begin' but no 'separate' fldChar. "
                "Run _fix_fld_char_structure() first or check the docx-js version."
            )
        else:
            raise RuntimeError(
                "TOC FAILED: Field structure found but no TOC instrText detected. "
                "Ensure TableOfContents element generates a TOC \\o field code."
            )

    # ── Step 3: Remove everything between separate-para and end-para ──
    # The TOC paragraphs may be direct children of <w:body> or wrapped in <w:sdtContent>
    toc_container = toc_separate_para.getparent()  # could be body or sdtContent
    if toc_end_para.getparent() is not toc_container:
        raise RuntimeError(
            "TOC FAILED: TOC field 'separate' and 'end' are not siblings in the same container."
        )
    container_children = list(toc_container)
    sep_idx = container_children.index(toc_separate_para)
    end_idx = container_children.index(toc_end_para)

    for elem in container_children[sep_idx + 1:end_idx]:
        toc_container.remove(elem)

    # ── Step 4: Build and insert placeholder paragraphs ──
    # NOTE(review): when 'separate' and 'end' live in the same paragraph the
    # placeholders land in front of that paragraph — confirm whether that
    # layout can occur with the supported generators.
    indent_mapping = {1: 0, 2: 360, 3: 720, 4: 1080, 5: 1440, 6: 1800}
    heading_occurrence_counter = {}

    insert_pos = list(toc_container).index(toc_end_para)

    for entry in entries:
        try:
            level = int(entry.get('level', 1))
        except (TypeError, ValueError):
            level = 1
        text_raw = entry.get('text', '')
        page = entry.get('page', '1')
        toc_style = toc_style_mapping.get(level, toc_style_mapping.get(1, "9"))
        indent = indent_mapping.get(level, 0)

        # Resolve bookmark (handle duplicate headings correctly): the n-th
        # entry with a given text links to the n-th heading with that text.
        bm_name = ''
        if text_raw in heading_bookmark_map_all:
            occ = heading_occurrence_counter.get(text_raw, 0)
            bm_list = heading_bookmark_map_all[text_raw]
            if occ < len(bm_list):
                bm_name = bm_list[occ]
            heading_occurrence_counter[text_raw] = occ + 1

        # Build paragraph element
        p = etree.Element(w('p'))
        toc_container.insert(insert_pos, p)
        insert_pos += 1

        # pPr — children in schema order: pStyle, tabs, spacing, ind
        ppr = etree.SubElement(p, w('pPr'))
        etree.SubElement(ppr, w('pStyle')).set(w('val'), str(toc_style))

        tabs = etree.SubElement(ppr, w('tabs'))
        tab = etree.SubElement(tabs, w('tab'))
        tab.set(w('val'), 'right')
        tab.set(w('leader'), 'dot')
        tab.set(w('pos'), '9026')

        spacing = etree.SubElement(ppr, w('spacing'))
        spacing.set(w('before'), '120')
        spacing.set(w('after'), '60')

        if indent > 0:
            etree.SubElement(ppr, w('ind')).set(w('left'), str(indent))

        if bm_name:
            # Linked entry: text, tab, then a PAGEREF field showing the page
            hyperlink = etree.SubElement(p, w('hyperlink'))
            hyperlink.set(w('anchor'), bm_name)
            hyperlink.set(w('history'), '1')

            add_text_run(hyperlink, text_raw, rstyle='Hyperlink')

            r_tab = etree.SubElement(hyperlink, w('r'))
            etree.SubElement(r_tab, w('tab'))

            add_fld_char_run(hyperlink, 'begin')

            r_instr = etree.SubElement(hyperlink, w('r'))
            instr = etree.SubElement(r_instr, w('instrText'))
            instr.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
            instr.text = f' PAGEREF {bm_name} \\h '

            add_fld_char_run(hyperlink, 'separate')
            add_text_run(hyperlink, str(page))
            add_fld_char_run(hyperlink, 'end')
        else:
            # No matching heading bookmark: plain text, tab, page number
            add_text_run(p, text_raw)

            r_tab = etree.SubElement(p, w('r'))
            etree.SubElement(r_tab, w('tab'))

            add_text_run(p, str(page))

    placeholders_inserted = len(entries)
    print(f"Inserted {placeholders_inserted} TOC placeholder entries")

    # Serialize back to string
    result = etree.tostring(root, xml_declaration=True, encoding='UTF-8', standalone=True)
    return result.decode('utf-8')


def main():
    """Command-line entry point; exits with status 1 on any failure."""
    parser = argparse.ArgumentParser(
        description='Add placeholder entries to Table of Contents in a DOCX file (in-place)'
    )
    parser.add_argument('docx_file', help='DOCX file to modify (will be replaced)')
    parser.add_argument(
        '--auto', action='store_true',
        help='Auto-extract Heading 1-3 from the DOCX as TOC entries (recommended)'
    )
    parser.add_argument(
        '--entries',
        help='JSON string with placeholder entries: [{"level":1,"text":"Chapter 1","page":"1"}]'
    )

    args = parser.parse_args()

    # Determine entries
    if args.entries:
        try:
            entries = json.loads(args.entries)
        except json.JSONDecodeError as e:
            print(f"Error parsing entries JSON: {e}", file=sys.stderr)
            sys.exit(1)
        if not isinstance(entries, list) or not all(isinstance(item, dict) for item in entries):
            print("Error parsing entries JSON: expected an array of objects", file=sys.stderr)
            sys.exit(1)
    else:
        # Auto mode is the default; --auto is accepted only for explicitness.
        try:
            entries = _extract_headings_from_docx(args.docx_file)
        except Exception as e:
            print(f"Error: {e}", file=sys.stderr)
            sys.exit(1)
        if entries:
            print(f"Auto-extracted {len(entries)} headings from document", file=sys.stderr)
        else:
            print("No headings found in document, using minimal placeholder", file=sys.stderr)
            entries = [{"level": 1, "text": "Contents", "page": "1"}]

    # Add placeholders
    try:
        add_toc_placeholders(args.docx_file, entries)
        print(f"Successfully added TOC placeholders to {args.docx_file}")
    except RuntimeError as e:
        # TOC structure errors — hard fail with exit code 1
        print(f"ERROR: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()