Initial commit

This commit is contained in:
Z User
2026-06-06 05:21:10 +00:00
Unverified
commit 6664758a6d
493 changed files with 135653 additions and 0 deletions

View File

@@ -0,0 +1 @@
# Make scripts directory a package for relative imports in tests

View File

@@ -0,0 +1,749 @@
#!/usr/bin/env python3
"""
Add placeholder entries to Table of Contents in a DOCX file.
This script adds placeholder TOC entries between the 'separate' and 'end'
field characters, so users see some content on first open instead of an empty TOC.
The original file is replaced with the modified version.
Usage:
python add_toc_placeholders.py <docx_file> # auto-extract headings (default)
python add_toc_placeholders.py <docx_file> --auto # explicit auto mode
python add_toc_placeholders.py <docx_file> --entries <entries_json>
entries_json format: JSON string with array of objects:
[
{"level": 1, "text": "Chapter 1 Overview", "page": "1"},
{"level": 2, "text": "Section 1.1 Details", "page": "1"}
]
Default behavior (no flags): auto-extracts Heading 1-3 from the document.
Filters out table/figure captions (e.g. "表 1xxx", "图 2xxx").
Example:
python add_toc_placeholders.py document.docx
python add_toc_placeholders.py document.docx --auto
python add_toc_placeholders.py document.docx --entries '[{"level":1,"text":"Introduction","page":"1"}]'
"""
import argparse
import html
import json
import re
import shutil
import sys
import tempfile
import zipfile
from pathlib import Path
def _extract_headings_from_docx(docx_path: str, max_level: int = 3) -> list:
    """Extract headings from a DOCX file for auto-mode TOC generation.

    A paragraph is treated as a heading when its style name starts with
    ``Heading`` and contains a number that is at most *max_level*. Headings
    with no text, and table/figure captions styled as headings (text starting
    with 表 or 图 followed by a digit), are skipped.

    Args:
        docx_path: Path to DOCX file
        max_level: Maximum heading level to include (default 3)

    Returns:
        List of dicts with 'level', 'text', 'page' keys. 'page' is only a
        rough placeholder derived from the paragraph's position in the
        document, not a real page number.
    """
    from docx import Document

    # Placeholder page numbers advance by one for every this-many paragraphs
    # (counting all paragraphs, not just headings).
    paragraphs_per_page = 8
    # Pattern to filter out table/figure captions styled as headings
    caption_pattern = re.compile(r'^[表图]\s*\d')
    level_pattern = re.compile(r'(\d+)')

    entries = []
    for index, para in enumerate(Document(docx_path).paragraphs):
        style_name = para.style.name if para.style else ''
        if not style_name.startswith('Heading'):
            continue
        level_match = level_pattern.search(style_name)
        if not level_match:
            continue
        level = int(level_match.group(1))
        if level > max_level:
            continue
        text = para.text.strip()
        if not text or caption_pattern.match(text):
            continue
        page_estimate = 1 + index // paragraphs_per_page
        entries.append({"level": level, "text": text, "page": str(page_estimate)})
    return entries
def add_toc_placeholders(docx_path: str, entries: list = None) -> None:
    """Add placeholder TOC entries to a DOCX file (in-place replacement).

    The package is unpacked into a temporary directory, styles.xml,
    settings.xml and document.xml are patched, and the result is re-zipped.
    The new archive is staged next to the original and swapped in with an
    atomic rename, so a failure at any point leaves the original untouched.

    Args:
        docx_path: Path to DOCX file (will be modified in-place)
        entries: Optional list of placeholder entries. Each entry should be a dict
            with 'level' (1-3), 'text', and 'page' keys.

    Raises:
        ValueError: If the archive has no word/document.xml.
        RuntimeError: Propagated from _insert_toc_placeholders when no usable
            TOC field is found.
    """
    docx_path = Path(docx_path)

    with tempfile.TemporaryDirectory() as temp_dir:
        extracted_dir = Path(temp_dir) / "extracted"

        # Extract DOCX
        with zipfile.ZipFile(docx_path, 'r') as zip_ref:
            zip_ref.extractall(extracted_dir)

        # Bail out before patching anything if this is not a Word package.
        document_xml = extracted_dir / "word" / "document.xml"
        if not document_xml.exists():
            raise ValueError("document.xml not found in the DOCX file")

        # Ensure TOC styles exist in styles.xml
        styles_xml_path = extracted_dir / "word" / "styles.xml"
        toc_style_mapping = _ensure_toc_styles(styles_xml_path)
        print(f"TOC style mapping: {toc_style_mapping}")

        # Fix settings.xml: ensure updateFields has val="true"
        _fix_update_fields(extracted_dir / "word" / "settings.xml")

        # Fix Heading styles: ensure outlineLvl is set (required for TOC field update)
        _fix_heading_outline_levels(styles_xml_path)

        # Normalise the field runs, then insert bookmarks and placeholders.
        content = document_xml.read_text(encoding='utf-8')
        content = _fix_fld_char_structure(content)
        modified_content = _insert_toc_placeholders(content, entries, toc_style_mapping)
        document_xml.write_text(modified_content, encoding='utf-8')

        # Stage the repacked archive in the destination directory: a rename
        # within one directory is atomic and never crosses devices, unlike a
        # move out of the system temp directory.
        with tempfile.NamedTemporaryFile(
            dir=docx_path.parent,
            prefix=f".{docx_path.name}.",
            suffix=".tmp",
            delete=False,
        ) as staged_file:
            staged_path = Path(staged_file.name)

        replaced = False
        try:
            with zipfile.ZipFile(staged_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
                for file_path in extracted_dir.rglob('*'):
                    if file_path.is_file():
                        zipf.write(file_path, file_path.relative_to(extracted_dir))
            # The staging file is created owner-only; keep the original's mode.
            shutil.copymode(docx_path, staged_path)
            staged_path.replace(docx_path)
            replaced = True
        finally:
            if not replaced and staged_path.exists():
                staged_path.unlink()
def _fix_update_fields(settings_xml_path: Path) -> None:
    """Fix settings.xml to ensure <w:updateFields w:val="true"/> is present.

    Three situations are handled, in this order:

    1. ``<w:updateFields/>`` without a value (with or without whitespace
       before ``/>``) gets ``w:val="true"``.
    2. ``<w:updateFields w:val="false"/>`` (also ``"0"`` / ``"off"``) is
       switched to ``"true"``.
    3. No ``<w:updateFields`` element at all: one is injected right before
       ``</w:settings>``.

    The file is rewritten only when its content actually changed. A missing
    file is silently ignored.

    NOTE(review): the original author states that the docx npm library emits
    the value-less form and that Word/WPS then skips the TOC refresh on open;
    that cannot be confirmed from this file.
    """
    if not settings_xml_path.exists():
        return

    content = settings_xml_path.read_text(encoding='utf-8')
    original = content

    enabled = '<w:updateFields w:val="true"/>'
    bare_pattern = re.compile(r'<w:updateFields\s*/>')
    false_pattern = re.compile(r'<w:updateFields\s+w:val="(?:false|0|off)"\s*/>')

    if bare_pattern.search(content):
        # Case 1: self-closing element with no val
        content = bare_pattern.sub(enabled, content)
        print('Fixed: <w:updateFields/> → <w:updateFields w:val="true"/>')
    elif false_pattern.search(content):
        # Case 2: explicitly disabled
        content = false_pattern.sub(enabled, content)
        print('Fixed: <w:updateFields w:val="false"/> → <w:updateFields w:val="true"/>')
    elif '<w:updateFields' not in content:
        # Case 3: not present at all
        if '</w:settings>' in content:
            content = content.replace('</w:settings>', enabled + '</w:settings>')
            print('Fixed: added <w:updateFields w:val="true"/> to settings.xml')
        else:
            # Nothing to anchor the insertion on; do not claim a fix was made.
            print(
                'WARNING: Could not find </w:settings> to add <w:updateFields w:val="true"/>',
                file=sys.stderr,
            )

    if content != original:
        settings_xml_path.write_text(content, encoding='utf-8')
def _fix_heading_outline_levels(styles_xml_path: Path) -> None:
    """Fix Heading styles to include outlineLvl in pPr.

    For each style with ``w:styleId="Heading1"`` … ``"Heading6"`` that has no
    ``<w:outlineLvl>``, one is added with value N-1 (Heading1 → 0, Heading2 → 1,
    and so on):

    * an existing ``<w:pPr>…</w:pPr>`` gets the element appended inside it;
    * a self-closing ``<w:pPr/>`` is expanded to hold it;
    * otherwise a new ``<w:pPr>`` is inserted before the style's ``<w:rPr>``
      (or at the end of the style when there is none), so it lands after
      ``<w:name>``/``<w:basedOn>`` rather than in front of them.

    The file is rewritten only when something changed; a missing file is
    ignored.

    NOTE(review): per the original author, the docx npm library sometimes
    omits outlineLvl and Word's TOC update then misses the headings; that
    rationale is not verifiable from this file.
    """
    if not styles_xml_path.exists():
        return

    content = styles_xml_path.read_text(encoding='utf-8')
    original = content

    # (?<!/) keeps a self-closing <w:pPr /> out of the "open element" match,
    # and requiring whitespace or '>' after the name excludes <w:pPrChange>.
    open_ppr_re = re.compile(r'(<w:pPr(?:\s[^>]*)?(?<!/)>)(.*?)(</w:pPr>)', re.DOTALL)
    empty_ppr_re = re.compile(r'<w:pPr(?:\s[^>]*)?/>')

    for level in range(1, 7):
        style_id = f'Heading{level}'
        outline_val = str(level - 1)

        # \b stops the pattern from starting on the <w:styles> root element.
        style_re = re.compile(
            rf'(<w:style\b[^>]*w:styleId="{style_id}"[^>]*>)(.*?)(</w:style>)',
            re.DOTALL,
        )
        match = style_re.search(content)
        if not match:
            continue

        style_open, style_body, style_close = match.groups()
        if '<w:outlineLvl' in style_body:
            continue

        outline_xml = f'<w:outlineLvl w:val="{outline_val}"/>'
        open_ppr = open_ppr_re.search(style_body)
        empty_ppr = None if open_ppr else empty_ppr_re.search(style_body)

        if open_ppr:
            # Append as the last child of the existing pPr.
            cut = open_ppr.end(2)
            new_body = style_body[:cut] + outline_xml + style_body[cut:]
        elif empty_ppr:
            new_body = (
                style_body[:empty_ppr.start()]
                + f'<w:pPr>{outline_xml}</w:pPr>'
                + style_body[empty_ppr.end():]
            )
        else:
            # No pPr at all: place the new one before rPr, after name/basedOn.
            rpr_at = style_body.find('<w:rPr')
            insert_at = rpr_at if rpr_at != -1 else len(style_body)
            new_body = (
                style_body[:insert_at]
                + f'<w:pPr>{outline_xml}</w:pPr>'
                + style_body[insert_at:]
            )

        content = (
            content[:match.start()]
            + style_open + new_body + style_close
            + content[match.end():]
        )
        print(f'Fixed: added outlineLvl={outline_val} to {style_id} style')

    if content != original:
        styles_xml_path.write_text(content, encoding='utf-8')
def _fix_fld_char_structure(xml_content: str) -> str:
    """Fix malformed fldChar structure where begin+instrText+separate are in one <w:r>.

    A run of the form
        <w:r><w:fldChar begin/><w:instrText>TOC...</w:instrText><w:fldChar separate/></w:r>
    is rewritten as three runs:
        <w:r><w:fldChar begin/></w:r>
        <w:r><w:instrText>TOC...</w:instrText></w:r>
        <w:r><w:fldChar separate/></w:r>
    Attributes on the original <w:r> are not carried over.

    Afterwards any ``\\t "..."`` switch is stripped from TOC instruction text
    (quotes written either as ``&quot;`` or literally).

    NOTE(review): per the original author, the merged form comes from the docx
    npm library, and its ``\\t`` switch names styles Word does not recognise
    while ``\\o`` already covers the headings; not verifiable from this file.

    Args:
        xml_content: Serialized document.xml.

    Returns:
        The rewritten XML, or the input unchanged when nothing matched.
    """
    # The instruction text is matched with [^<]* so the pattern can never run
    # past the first </w:instrText> and glue unrelated runs together.
    merged_run_re = re.compile(
        r'<w:r(?:\s[^>]*)?>'
        r'(<w:fldChar[^>]*w:fldCharType="begin"[^>]*/>)'        # begin
        r'(<w:instrText[^>]*>[^<]*</w:instrText>)'              # instrText
        r'(<w:fldChar[^>]*w:fldCharType="separate"[^>]*/>)'     # separate
        r'</w:r>'
    )

    def split_run(match):
        begin, instr, separate = match.groups()
        return f'<w:r>{begin}</w:r><w:r>{instr}</w:r><w:r>{separate}</w:r>'

    modified = merged_run_re.sub(split_run, xml_content)
    if modified != xml_content:
        print("Fixed: split merged fldChar begin+instrText+separate into separate <w:r> elements")

    # Drop the \t switch and its quoted argument; [^<] keeps the match inside
    # a single text node.
    toc_t_pattern = r'(TOC\s+[^<]*?)\\t\s+(?:&quot;[^&]*&quot;|"[^"<]*")'
    without_t_switch = re.sub(toc_t_pattern, r'\1', modified)
    if without_t_switch != modified:
        print("Fixed: removed \\t switch from TOC instrText (\\o with outlineLvl is sufficient)")
        modified = without_t_switch

    return modified
def _detect_toc_styles(styles_xml_path: Path) -> dict:
    """Detect TOC style IDs from styles.xml.

    For each level 1-3 the lookup order is:

    1. a style whose ID is literally ``TOC<level>``;
    2. a style whose ID is literally ``TOC <level>`` (with a space);
    3. the first ``<w:style>`` whose own ``<w:name>`` is ``toc <level>``
       (case-insensitive, optional whitespace before the digit). This covers
       files that use arbitrary IDs such as "9", "11", "12".

    The name lookup is evaluated per style element, so a style is only
    reported when the matching ``<w:name>`` is inside that very style.

    Args:
        styles_xml_path: Path to styles.xml

    Returns:
        Dictionary mapping level (1-3) to style ID string. Levels with no
        matching style are omitted; a missing file yields an empty dict.
    """
    if not styles_xml_path.exists():
        return {}

    content = styles_xml_path.read_text(encoding='utf-8')

    # (style id, inner XML) for every non-self-closing <w:style> element.
    # \b keeps the <w:styles> root from being mistaken for a style.
    style_block_re = re.compile(
        r'<w:style\b[^>]*\bw:styleId="([^"]*)"[^>]*(?<!/)>(.*?)</w:style>',
        re.DOTALL,
    )
    style_blocks = style_block_re.findall(content)

    result = {}
    for level in range(1, 4):
        for candidate_id in (f'TOC{level}', f'TOC {level}'):
            if f'w:styleId="{candidate_id}"' in content:
                result[level] = candidate_id
                break
        else:
            # The closing quote after the digit keeps "toc 1" from matching "toc 10".
            name_re = re.compile(rf'<w:name\s+w:val="[Tt][Oo][Cc]\s*{level}"')
            for style_id, style_body in style_blocks:
                if name_re.search(style_body):
                    result[level] = style_id
                    break
    return result
def _ensure_toc_styles(styles_xml_path: Path) -> dict:
    """Ensure TOC styles exist in styles.xml, adding them if necessary.

    Levels already found by _detect_toc_styles keep their detected ID. For
    every other level a paragraph style named "toc <level>" is inserted just
    before ``</w:styles>`` using the default IDs "9", "11" and "12". If a
    default ID is already taken by some other style, ``TOC<level>`` is used
    instead so no duplicate style ID is written. Finally the Hyperlink
    character style is ensured via _ensure_hyperlink_style.

    Returns:
        Dictionary mapping level (1-3) to style ID string. When styles.xml is
        missing, or has no ``</w:styles>`` to insert before, the default IDs
        are returned for the affected levels even though no style was written.
    """
    default_ids = {1: "9", 2: "11", 3: "12"}
    if not styles_xml_path.exists():
        return dict(default_ids)

    content = styles_xml_path.read_text(encoding='utf-8')
    result = dict(_detect_toc_styles(styles_xml_path))

    W_NS = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'

    # level -> (left indent, spacing before, spacing after, bold)
    layouts = {
        1: (0, 120, 60, True),
        2: (360, 60, 40, False),
        3: (720, 40, 20, False),
    }

    modified = False
    for level in range(1, 4):
        if level in result:
            continue

        style_id = default_ids[level]
        result[level] = style_id

        # Add style before </w:styles>
        insert_point = content.rfind('</w:styles>')
        if insert_point == -1:
            print(f"WARNING: Could not find </w:styles> to insert TOC {level} style", file=sys.stderr)
            continue

        # An exact "TOC<level>" ID would have been detected above, so it is
        # guaranteed to be free when the numeric default is already in use.
        if f'w:styleId="{style_id}"' in content:
            style_id = f'TOC{level}'
            result[level] = style_id

        indent, before, after, bold = layouts[level]
        style_lines = [
            f'<w:style w:type="paragraph" w:styleId="{style_id}" xmlns:w="{W_NS}">',
            f'<w:name w:val="toc {level}"/>',
            '<w:basedOn w:val="Normal"/>',
            '<w:uiPriority w:val="39"/>',
            '<w:pPr>',
            '<w:tabs><w:tab w:val="right" w:leader="dot" w:pos="9026"/></w:tabs>',
        ]
        if indent:
            style_lines.append(f'<w:ind w:left="{indent}"/>')
        style_lines.append(f'<w:spacing w:before="{before}" w:after="{after}"/>')
        style_lines.append('</w:pPr>')
        if bold:
            style_lines.append('<w:rPr><w:b/><w:bCs/></w:rPr>')
        style_lines.append('</w:style>')

        content = content[:insert_point] + '\n'.join(style_lines) + '\n' + content[insert_point:]
        print(f"Added TOC {level} style (ID: {style_id})")
        modified = True

    if modified:
        styles_xml_path.write_text(content, encoding='utf-8')

    # Ensure Hyperlink style exists
    _ensure_hyperlink_style(styles_xml_path)

    return result
def _ensure_hyperlink_style(styles_xml_path: Path) -> None:
    """Add a "Hyperlink" character style to styles.xml when it has none.

    Nothing happens when the file is missing, when a style with
    ``w:styleId="Hyperlink"`` is already present, or when there is no
    ``</w:styles>`` closing tag to insert in front of.
    """
    if not styles_xml_path.exists():
        return

    styles_text = styles_xml_path.read_text(encoding='utf-8')
    if 'w:styleId="Hyperlink"' in styles_text:
        return

    closing_at = styles_text.rfind('</w:styles>')
    if closing_at == -1:
        return

    word_ns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    style_lines = (
        f'<w:style w:type="character" w:styleId="Hyperlink" xmlns:w="{word_ns}">',
        '<w:name w:val="Hyperlink"/>',
        '<w:uiPriority w:val="99"/>',
        '<w:rPr>',
        '<w:color w:val="0563C1"/>',
        '<w:u w:val="single"/>',
        '</w:rPr>',
        '</w:style>',
    )
    snippet = '\n'.join(style_lines)

    # Splice the new style in as the last child of <w:styles>.
    patched = styles_text[:closing_at] + snippet + '\n' + styles_text[closing_at:]
    styles_xml_path.write_text(patched, encoding='utf-8')
    print("Added Hyperlink character style")
_WORD_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"


def _w_tag(local_name: str) -> str:
    """Return *local_name* in Clark notation under the WordprocessingML namespace."""
    return f'{{{_WORD_NS}}}{local_name}'


def _bookmark_headings(root) -> dict:
    """Add a ``_Toc<N>`` bookmark to every non-empty HeadingN paragraph that has none.

    Returns a dict mapping heading text to the list of bookmark names created
    for paragraphs with that text, in document order (a list so that repeated
    headings keep distinct targets).
    """
    from lxml import etree

    bookmarks_by_text = {}
    next_bookmark_id = 100000

    for para in root.iter(_w_tag('p')):
        ppr = para.find(_w_tag('pPr'))
        if ppr is None:
            continue
        pstyle = ppr.find(_w_tag('pStyle'))
        if pstyle is None:
            continue
        if not re.match(r'Heading\d$', pstyle.get(_w_tag('val'), '')):
            continue

        heading_text = ''.join(
            t.text for t in para.iter(_w_tag('t')) if t.text
        ).strip()
        if not heading_text:
            continue

        # Paragraphs that already carry a bookmark are left alone (and are
        # therefore not linked from the placeholder entries).
        if para.find(_w_tag('bookmarkStart')) is not None:
            continue

        bm_name = f"_Toc{next_bookmark_id}"
        bm_id = str(next_bookmark_id)
        next_bookmark_id += 1
        bookmarks_by_text.setdefault(heading_text, []).append(bm_name)

        bm_start = etree.Element(_w_tag('bookmarkStart'))
        bm_start.set(_w_tag('id'), bm_id)
        bm_start.set(_w_tag('name'), bm_name)
        bm_end = etree.Element(_w_tag('bookmarkEnd'))
        bm_end.set(_w_tag('id'), bm_id)

        # bookmarkStart right after pPr, bookmarkEnd at the end of the paragraph
        para.insert(list(para).index(ppr) + 1, bm_start)
        para.append(bm_end)

    return bookmarks_by_text


def _locate_toc_field(root):
    """Return ``(separate_run, end_run)`` of the first field whose instruction contains "TOC".

    Field nesting is tracked with a stack so that 'separate'/'end' markers of
    fields nested inside the TOC field are not mistaken for its own.

    Raises:
        RuntimeError: If no such field with both markers is found.
    """
    fld_char_tag = _w_tag('fldChar')
    instr_tag = _w_tag('instrText')
    run_tag = _w_tag('r')

    field_stack = []
    toc_field_depth = None
    separate_run = None
    end_run = None

    for fld_char in root.iter(fld_char_tag):
        fld_type = fld_char.get(_w_tag('fldCharType'))
        run = fld_char.getparent()

        if fld_type == 'begin':
            # Collect the instruction text from the begin run and the runs
            # after it, up to the next run that holds another fldChar.
            instr_text = ''
            found_run = False
            for sibling in run.getparent():
                if sibling is run:
                    found_run = True
                    it = sibling.find(instr_tag)
                    if it is not None and it.text:
                        instr_text += it.text
                    continue
                if found_run and sibling.tag == run_tag:
                    it = sibling.find(instr_tag)
                    if it is not None and it.text:
                        instr_text += it.text
                    if sibling.find(fld_char_tag) is not None:
                        break
            field_stack.append(instr_text.strip())
            if 'TOC' in instr_text and toc_field_depth is None:
                toc_field_depth = len(field_stack)

        elif fld_type == 'separate':
            if toc_field_depth is not None and len(field_stack) == toc_field_depth:
                separate_run = run

        elif fld_type == 'end':
            if toc_field_depth is not None and len(field_stack) == toc_field_depth:
                end_run = run
                break
            if field_stack:
                field_stack.pop()

    if separate_run is None or end_run is None:
        has_begin = root.find(f'.//{{{_WORD_NS}}}fldChar[@{{{_WORD_NS}}}fldCharType="begin"]') is not None
        has_separate = root.find(f'.//{{{_WORD_NS}}}fldChar[@{{{_WORD_NS}}}fldCharType="separate"]') is not None
        if not has_begin:
            raise RuntimeError(
                "TOC FAILED: No field structure found in document. "
                "Ensure the code includes a TableOfContents element."
            )
        elif not has_separate:
            raise RuntimeError(
                "TOC FAILED: TOC field has 'begin' but no 'separate' fldChar. "
                "Run _fix_fld_char_structure() first or check the docx-js version."
            )
        else:
            raise RuntimeError(
                "TOC FAILED: Field structure found but no TOC instrText detected. "
                "Ensure TableOfContents element generates a TOC \\o field code."
            )

    return separate_run, end_run


def _populate_toc_entry(p, text, page, style_id, indent, bookmark_name) -> None:
    """Fill the (already inserted) paragraph *p* with one placeholder TOC line.

    With a bookmark name the line is a hyperlink to that bookmark whose page
    number is the cached result of a PAGEREF field; without one it is plain
    text, a tab and the page number.
    """
    from lxml import etree

    sub = etree.SubElement

    ppr = sub(p, _w_tag('pPr'))
    sub(ppr, _w_tag('pStyle')).set(_w_tag('val'), str(style_id))
    # Children are emitted in schema order: tabs, spacing, ind.
    tabs = sub(ppr, _w_tag('tabs'))
    tab = sub(tabs, _w_tag('tab'))
    tab.set(_w_tag('val'), 'right')
    tab.set(_w_tag('leader'), 'dot')
    tab.set(_w_tag('pos'), '9026')
    spacing = sub(ppr, _w_tag('spacing'))
    spacing.set(_w_tag('before'), '120')
    spacing.set(_w_tag('after'), '60')
    if indent > 0:
        sub(ppr, _w_tag('ind')).set(_w_tag('left'), str(indent))

    if bookmark_name:
        hyperlink = sub(p, _w_tag('hyperlink'))
        hyperlink.set(_w_tag('anchor'), bookmark_name)
        hyperlink.set(_w_tag('history'), '1')

        r_text = sub(hyperlink, _w_tag('r'))
        rpr = sub(r_text, _w_tag('rPr'))
        sub(rpr, _w_tag('rStyle')).set(_w_tag('val'), 'Hyperlink')
        sub(r_text, _w_tag('t')).text = text

        sub(sub(hyperlink, _w_tag('r')), _w_tag('tab'))

        r_begin = sub(hyperlink, _w_tag('r'))
        sub(r_begin, _w_tag('fldChar')).set(_w_tag('fldCharType'), 'begin')

        r_instr = sub(hyperlink, _w_tag('r'))
        instr = sub(r_instr, _w_tag('instrText'))
        instr.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
        instr.text = f' PAGEREF {bookmark_name} \\h '

        r_sep = sub(hyperlink, _w_tag('r'))
        sub(r_sep, _w_tag('fldChar')).set(_w_tag('fldCharType'), 'separate')

        sub(sub(hyperlink, _w_tag('r')), _w_tag('t')).text = str(page)

        r_end = sub(hyperlink, _w_tag('r'))
        sub(r_end, _w_tag('fldChar')).set(_w_tag('fldCharType'), 'end')
    else:
        sub(sub(p, _w_tag('r')), _w_tag('t')).text = text
        sub(sub(p, _w_tag('r')), _w_tag('tab'))
        sub(sub(p, _w_tag('r')), _w_tag('t')).text = str(page)


def _insert_toc_placeholders(xml_content: str, entries: list = None, toc_style_mapping: dict = None) -> str:
    """Insert placeholder TOC entries and heading bookmarks into XML content.

    Uses lxml ElementTree for robust XML manipulation instead of fragile regex.

    This function does TWO things:
    1. Adds bookmark anchors to each Heading paragraph (so Word can link TOC → heading)
    2. Replaces the content between the TOC field's 'separate' and 'end'
       markers with one paragraph per entry (HYPERLINK + PAGEREF when a
       heading with the same text was bookmarked, plain text otherwise)

    When 'separate' and 'end' sit in the same paragraph, that paragraph is
    split so the placeholders still land inside the field result.

    Args:
        xml_content: The XML content of document.xml
        entries: List of placeholder entries with 'level', 'text', 'page' keys.
            Defaults to a single "Contents" entry.
        toc_style_mapping: Dictionary mapping level to style ID
            (defaults to {1: "9", 2: "11", 3: "12"})

    Returns:
        Modified XML content with bookmarks and TOC placeholders

    Raises:
        RuntimeError: If TOC structure cannot be found or is malformed
    """
    from lxml import etree

    if entries is None:
        entries = [{"level": 1, "text": "Contents", "page": "1"}]
    if toc_style_mapping is None:
        toc_style_mapping = {1: "9", 2: "11", 3: "12"}

    root = etree.fromstring(xml_content.encode('utf-8'))

    # ── Step 1: Add bookmarks to Heading paragraphs ──
    bookmarks_by_text = _bookmark_headings(root)
    bookmarks_added = sum(len(names) for names in bookmarks_by_text.values())
    if bookmarks_added > 0:
        print(f"Added {bookmarks_added} bookmarks to Heading paragraphs")

    # ── Step 2: Find TOC field structure (begin → instrText → separate → end) ──
    separate_run, end_run = _locate_toc_field(root)
    toc_separate_para = separate_run.getparent()
    toc_end_para = end_run.getparent()

    if toc_separate_para is toc_end_para and toc_separate_para.tag == _w_tag('p'):
        # Both markers share one paragraph: drop the stale result runs between
        # them and move the 'end' run (plus whatever follows it) into a new
        # paragraph, so there is room between the two for the placeholders.
        for stale in list(separate_run.itersiblings()):
            if stale is end_run:
                break
            toc_separate_para.remove(stale)
        tail_para = etree.Element(_w_tag('p'))
        toc_separate_para.addnext(tail_para)
        for moved in [end_run, *end_run.itersiblings()]:
            tail_para.append(moved)
        toc_end_para = tail_para

    # ── Step 3: Remove everything between separate-para and end-para ──
    # The TOC paragraphs may be direct children of <w:body> or wrapped in <w:sdt><w:sdtContent>
    toc_container = toc_separate_para.getparent()
    container_children = list(toc_container)
    sep_idx = container_children.index(toc_separate_para)
    try:
        end_idx = container_children.index(toc_end_para)
    except ValueError as err:
        raise RuntimeError(
            "TOC FAILED: TOC field 'separate' and 'end' markers are not in the same container."
        ) from err
    for elem in container_children[sep_idx + 1:end_idx]:
        toc_container.remove(elem)

    # ── Step 4: Build and insert placeholder paragraphs ──
    indent_mapping = {1: 0, 2: 360, 3: 720, 4: 1080, 5: 1440, 6: 1800}
    heading_occurrence_counter = {}
    insert_pos = list(toc_container).index(toc_end_para)

    for entry in entries:
        # Entries usually come from user-supplied JSON: tolerate "2" for 2
        # and non-string text instead of silently mis-styling or crashing.
        try:
            level = int(entry.get('level', 1))
        except (TypeError, ValueError):
            level = 1
        raw_text = entry.get('text', '')
        text = '' if raw_text is None else str(raw_text)
        page = entry.get('page', '1')

        toc_style = toc_style_mapping.get(level, toc_style_mapping.get(1, "9"))
        indent = indent_mapping.get(level, 0)

        # Resolve bookmark: the k-th entry with a given text links to the
        # k-th heading with that text.
        bm_name = ''
        if text in bookmarks_by_text:
            occurrence = heading_occurrence_counter.get(text, 0)
            candidates = bookmarks_by_text[text]
            if occurrence < len(candidates):
                bm_name = candidates[occurrence]
            heading_occurrence_counter[text] = occurrence + 1

        # Insert first, then populate, so children are created inside the tree.
        p = etree.Element(_w_tag('p'))
        toc_container.insert(insert_pos, p)
        insert_pos += 1
        _populate_toc_entry(p, text, page, toc_style, indent, bm_name)

    placeholders_inserted = len(entries)
    print(f"Inserted {placeholders_inserted} TOC placeholder entries")

    # Serialize back to string
    result = etree.tostring(root, xml_declaration=True, encoding='UTF-8', standalone=True)
    return result.decode('utf-8')
def main():
    """Command-line entry point: parse arguments, pick the TOC entries, patch the file.

    Entries come from ``--entries`` (a JSON array of objects) when given;
    otherwise they are auto-extracted from the document's headings, falling
    back to a single "Contents" entry when none are found. ``--auto`` is
    accepted for explicitness but is also the default. Any failure is
    reported on stderr and exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description='Add placeholder entries to Table of Contents in a DOCX file (in-place)'
    )
    parser.add_argument('docx_file', help='DOCX file to modify (will be replaced)')
    parser.add_argument(
        '--auto', action='store_true',
        help='Auto-extract Heading 1-3 from the DOCX as TOC entries (recommended)'
    )
    parser.add_argument(
        '--entries',
        help='JSON string with placeholder entries: [{"level":1,"text":"Chapter 1","page":"1"}]'
    )
    args = parser.parse_args()

    # Determine entries
    if args.entries:
        try:
            entries = json.loads(args.entries)
        except json.JSONDecodeError as e:
            print(f"Error parsing entries JSON: {e}", file=sys.stderr)
            sys.exit(1)
        # Valid JSON of the wrong shape would otherwise fail much later with
        # an unhelpful attribute error.
        if not isinstance(entries, list) or not all(isinstance(item, dict) for item in entries):
            print("Error parsing entries JSON: expected an array of objects", file=sys.stderr)
            sys.exit(1)
    else:
        # Default to auto mode — always extract from document headings
        try:
            entries = _extract_headings_from_docx(args.docx_file)
        except Exception as e:
            print(f"Error: {e}", file=sys.stderr)
            sys.exit(1)
        if entries:
            print(f"Auto-extracted {len(entries)} headings from document", file=sys.stderr)
        else:
            print("No headings found in document, using minimal placeholder", file=sys.stderr)
            entries = [{"level": 1, "text": "Contents", "page": "1"}]

    # Add placeholders
    try:
        add_toc_placeholders(args.docx_file, entries)
        print(f"Successfully added TOC placeholders to {args.docx_file}")
    except RuntimeError as e:
        # TOC structure errors — hard fail with exit code 1
        print(f"ERROR: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()

1333
skills/docx/scripts/document.py Executable file

File diff suppressed because it is too large Load Diff

807
skills/docx/scripts/postcheck.py Executable file
View File

@@ -0,0 +1,807 @@
#!/usr/bin/env python3
"""
postcheck.py — Document business rule self-check script
Unlike traditional OpenXML Schema validation, this script does not check XML legality.
Instead, it checks document "visual quality" and "typesetting correctness" — issues visible to the human eye.
Usage:
python3 postcheck.py output.docx [--fix] [--json]
Checks:
1. Blank page detection — trailing/middle excess blank pages, double page breaks, consecutive empty paragraphs
2. Line spacing consistency — whether body paragraph line spacing is uniform
3. Table margins — whether cells have padding set
4. Table pagination control — whether header rows have tblHeader set, data rows have cantSplit
5. Image overflow — whether image width exceeds page usable area
6. Font fallback — whether fonts are used that may be missing on target systems
7. CJK indentation — whether Chinese body text has first-line indent (excluding table cells, lists, centered paragraphs)
8. Heading level continuity — whether headings skip levels (H1→H3 skipping H2)
9. Numbering continuity — whether numbered lists have gaps
10. Cover separation — whether cover and body are in different sections
11. ShadingType — whether SOLID is misused causing black cells
12. TOC quality — whether TOC field exists, whether headings use standard Heading styles
13. Image aspect ratio — whether images are stretched/distorted
14. Document cleanliness — whether placeholder text, Markdown syntax, or draft expressions remain
15. Report content quality — whether summary exists, whether titles are specific, whether vague conclusions are used
"""
import zipfile
import sys
import json
import re
from pathlib import Path
from xml.etree import ElementTree as ET
# Prefix → namespace URI map handed to ElementTree's find()/findall() so the
# XPath expressions in this module can use short prefixes such as "w:p".
_OOXML_SCHEMAS = "http://schemas.openxmlformats.org"
NS = {
    "w": f"{_OOXML_SCHEMAS}/wordprocessingml/2006/main",
    "r": f"{_OOXML_SCHEMAS}/officeDocument/2006/relationships",
    "wp": f"{_OOXML_SCHEMAS}/drawingml/2006/wordprocessingDrawing",
    "a": f"{_OOXML_SCHEMAS}/drawingml/2006/main",
    "pic": f"{_OOXML_SCHEMAS}/drawingml/2006/picture",
}
class CheckResult:
    """Outcome of a single document check.

    Attributes are plain and public: ``name`` (check identifier), ``passed``,
    ``message`` (human-readable detail) and ``severity``.
    """

    def __init__(self, name: str, passed: bool, message: str, severity: str = "warning"):
        self.name = name
        self.passed = passed
        self.message = message
        self.severity = severity  # "error" | "warning" | "info"

    def to_dict(self):
        """Return the result as a plain dict with name/passed/message/severity keys."""
        return {
            "name": self.name,
            "passed": self.passed,
            "message": self.message,
            "severity": self.severity,
        }

    def __repr__(self):
        return (
            f"{type(self).__name__}(name={self.name!r}, passed={self.passed!r}, "
            f"message={self.message!r}, severity={self.severity!r})"
        )

    def __str__(self):
        """Render as "<icon> [name] message".

        Each outcome gets its own icon so that passed, failed-with-error and
        failed-with-warning lines can be told apart in text output.
        """
        if self.passed:
            icon = "✅"
        elif self.severity == "error":
            icon = "❌"
        else:
            icon = "⚠️"
        return f"{icon} [{self.name}] {self.message}"
def read_document_xml(docx_path: str) -> ET.Element:
    """Parse word/document.xml from the DOCX archive and return its root element."""
    with zipfile.ZipFile(docx_path, "r") as archive:
        document_bytes = archive.read("word/document.xml")
    return ET.fromstring(document_bytes)
def get_sections(root: ET.Element) -> list:
    """Split the body's children into sections, using sectPr elements as boundaries.

    Returns a list of dicts with keys "children" (the body-level elements of
    the section) and "sectPr" (the element that closes it). A body child that
    contains a ``w:pPr/w:sectPr`` is included as the last child of the section
    it closes. Returns an empty list when there is no ``w:body``.
    """
    body = root.find(".//w:body", NS)
    if body is None:
        return []

    found = []
    pending = []
    for node in body:
        local_name = node.tag.split("}")[-1] if "}" in node.tag else node.tag
        if local_name == "sectPr":
            # Body-level sectPr closes whatever has been collected so far.
            sect_pr = node
        else:
            pending.append(node)
            # Section break carried inside a paragraph's pPr
            sect_pr = node.find(".//w:pPr/w:sectPr", NS)
            if sect_pr is None:
                continue
        found.append({"children": pending, "sectPr": sect_pr})
        pending = []

    # Elements left over after the last boundary are attributed to the
    # body-level sectPr, if there is one.
    trailing_sect = body.find("w:sectPr", NS)
    if trailing_sect is not None and pending:
        found.append({"children": pending, "sectPr": trailing_sect})

    return found
def _page_break_and_text(paragraph: ET.Element) -> tuple:
    """Return ``(has_page_break, has_text)`` looking at every run of *paragraph*."""
    w_type = f"{{{NS['w']}}}type"
    has_page_break = False
    has_text = False
    for run in paragraph.findall(".//w:r", NS):
        if any(br.get(w_type) == "page" for br in run.findall("w:br", NS)):
            has_page_break = True
        if any(t.text and t.text.strip() for t in run.findall("w:t", NS)):
            has_text = True
    return has_page_break, has_text


def check_blank_pages(root: ET.Element) -> CheckResult:
    """Detect excess blank pages — multi-pattern detection.

    Four patterns are checked on the body's top-level paragraphs:

    1. the last paragraph holds only a page break (error);
    2. five or more consecutive paragraphs with no text, page break or
       drawing (error);
    3. a section whose last paragraph has a page break while the following
       section is explicitly of type ``nextPage`` (error);
    4. other paragraphs that hold only a page break, excluding the last
       paragraph of each section (warning).

    Returns a failing result carrying up to three error messages if any error
    pattern matched, otherwise a failing warning result for pattern 4,
    otherwise a passing result. A document without a body or without
    paragraphs passes.
    """
    body = root.find(".//w:body", NS)
    paragraphs = body.findall("w:p", NS) if body is not None else []
    if not paragraphs:
        return CheckResult("blank-pages", True, "No paragraph content")

    w_ns = NS['w']
    w_type = f"{{{w_ns}}}type"
    hard_issues = []
    soft_issues = []

    # Check 1: Whether the last paragraph only has a page break
    has_page_break, has_text = _page_break_and_text(paragraphs[-1])
    if has_page_break and not has_text:
        hard_issues.append("Trailing page break at document end may cause blank page")

    # Check 2: Consecutive empty paragraphs (≥5 consecutive may form visual blank page)
    consecutive_empty = 0
    max_empty = 0
    max_empty_pos = 0
    for idx, p in enumerate(paragraphs):
        has_any_text = any(t.text and t.text.strip() for t in p.findall(".//w:t", NS))
        has_br = any(br.get(w_type) == "page" for br in p.findall(".//w:br", NS))
        # Inline and floating images both count as content.
        has_drawing = (
            p.find(".//wp:inline", NS) is not None
            or p.find(".//wp:anchor", NS) is not None
        )
        if has_any_text or has_br or has_drawing:
            consecutive_empty = 0
            continue
        consecutive_empty += 1
        if consecutive_empty > max_empty:
            max_empty = consecutive_empty
            max_empty_pos = idx
    if max_empty >= 5:
        hard_issues.append(f"Found {max_empty} consecutive empty paragraphs (starting around paragraph {max_empty_pos - max_empty + 2}), may form visual blank page")

    # Check 3: Double page break at section boundary (PageBreak at section end + NEXT_PAGE in next section)
    sections = get_sections(root)
    for i, (section, following) in enumerate(zip(sections, sections[1:])):
        sec_children = section["children"]
        if not sec_children:
            continue
        last_child = sec_children[-1]
        if last_child.tag != f"{{{w_ns}}}p":
            continue
        ends_with_break = any(
            br.get(w_type) == "page" for br in last_child.findall(".//w:br", NS)
        )
        if not ends_with_break:
            continue
        sect_type = following["sectPr"].find("w:type", NS)
        if sect_type is not None and sect_type.get(f"{{{w_ns}}}val") == "nextPage":
            # Reported once per boundary, however many breaks the paragraph has.
            hard_issues.append(f"Section {i+1} ends with PageBreak and Section {i+2} is type nextPage, double page break causes blank page")

    # Check 4: Empty paragraph + PageBreak (paragraph has only PageBreak, no text)
    # Exclude section-ending PageBreaks — they are normal section separators
    # (e.g., cover page ending with an empty para + PageBreak before a new section)
    section_last_paras = {
        id(sec["children"][-1]) for sec in sections if sec["children"]
    }
    empty_pb_count = 0
    for p in paragraphs[:-1]:  # Last paragraph already handled in Check 1
        if id(p) in section_last_paras:
            continue
        p_has_break, p_has_text = _page_break_and_text(p)
        if p_has_break and not p_has_text:
            empty_pb_count += 1
    if empty_pb_count > 0:
        soft_issues.append(f"Found {empty_pb_count} empty paragraphs with PageBreak (suggest attaching PageBreak to content paragraphs)")

    if hard_issues:
        return CheckResult(
            "blank-pages", False,
            "; ".join(hard_issues[:3]),
            "error"
        )
    if soft_issues:
        return CheckResult(
            "blank-pages", False,
            "; ".join(soft_issues[:3]),
            "warning"
        )
    return CheckResult("blank-pages", True, "No blank page issues detected")
def check_line_spacing(root: ET.Element) -> CheckResult:
    """Check body paragraph line spacing consistency.

    Every paragraph under the body (including nested ones) that has visible
    text and is not styled ``Heading*`` or ``Title`` is bucketed by its
    ``w:spacing/@w:line`` value ("default" when absent). The check fails with
    a warning when more than 20% of those paragraphs differ from the most
    common value. A document without a body, or without such paragraphs,
    passes.
    """
    body = root.find(".//w:body", NS)
    if body is None:
        return CheckResult("line-spacing", True, "No body paragraphs")

    w_ns = NS['w']
    spacing_values = {}

    for p in body.findall(".//w:p", NS):
        ppr = p.find("w:pPr", NS)
        line_val = None
        if ppr is not None:
            # Skip heading paragraphs
            style = ppr.find("w:pStyle", NS)
            if style is not None:
                val = style.get(f"{{{w_ns}}}val", "")
                if val.startswith("Heading") or val == "Title":
                    continue
            spacing = ppr.find("w:spacing", NS)
            if spacing is not None:
                line_val = spacing.get(f"{{{w_ns}}}line")

        # Only count paragraphs with text content
        if not any(t.text and t.text.strip() for t in p.findall(".//w:t", NS)):
            continue

        key = line_val or "default"
        spacing_values[key] = spacing_values.get(key, 0) + 1

    if not spacing_values:
        return CheckResult("line-spacing", True, "No body paragraphs")

    # Most common line spacing (first seen wins on a tie)
    dominant = max(spacing_values, key=spacing_values.get)
    if len(spacing_values) == 1:
        return CheckResult("line-spacing", True, f"Line spacing uniform (line={dominant})")

    total = sum(spacing_values.values())
    inconsistent = total - spacing_values[dominant]

    if inconsistent / total > 0.2:
        return CheckResult(
            "line-spacing", False,
            f"Line spacing inconsistent: {dict(spacing_values)}, {inconsistent}/{total} paragraphs differ from dominant spacing {dominant}",
            "warning"
        )

    return CheckResult("line-spacing", True, f"Line spacing mostly uniform (line={dominant}, {inconsistent} exceptions)")
def check_image_overflow(root: ET.Element) -> CheckResult:
    """Check whether image width may exceed page bounds.

    Page width and left/right margins are read from the ``w:sectPr`` that is
    a direct child of ``w:body``; missing elements or attributes fall back to
    11906 / 1701 / 1417 twips. Every ``wp:inline`` and ``wp:anchor`` drawing
    whose ``wp:extent`` ``cx`` is more than 5% wider than the usable width
    is counted as oversized.

    Args:
        root: Parsed root element of the main document part.

    Returns:
        A failing ``CheckResult`` with severity ``"error"`` when at least one
        drawing is oversized, otherwise a passing result.
    """
    w = NS["w"]
    # Fallback geometry in twips (11906 is the A4 page width).
    geometry = {"w": 11906, "left": 1701, "right": 1417}

    final_sect = root.find(".//w:body/w:sectPr", NS)
    if final_sect is not None:
        size_el = final_sect.find("w:pgSz", NS)
        margin_el = final_sect.find("w:pgMar", NS)
        if size_el is not None:
            geometry["w"] = int(size_el.get(f"{{{w}}}w", "11906"))
        if margin_el is not None:
            for side, fallback in (("left", "1701"), ("right", "1417")):
                geometry[side] = int(margin_el.get(f"{{{w}}}{side}", fallback))

    # twips -> EMU is a factor of 635; the 1.05 factor is the 5% tolerance.
    usable_width_emu = (geometry["w"] - geometry["left"] - geometry["right"]) * 635
    limit = usable_width_emu * 1.05

    drawings = root.findall(".//wp:inline", NS) + root.findall(".//wp:anchor", NS)
    extents = (dwg.find("wp:extent", NS) for dwg in drawings)
    oversized = sum(
        1
        for ext in extents
        if ext is not None and int(ext.get("cx", "0")) > limit
    )

    if oversized <= 0:
        return CheckResult(
            "image-overflow", True,
            f"All images within page width ({len(drawings)} images)"
        )
    return CheckResult(
        "image-overflow", False,
        f"{oversized} images exceed page usable area",
        "error"
    )
def check_image_aspect_ratio(docx_path: str, root: ET.Element) -> CheckResult:
    """Check whether images are stretched/distorted (aspect ratio drift).

    Compares the original aspect ratio of embedded images with the display
    aspect ratio set in ``wp:extent``. Drift >10% is considered distortion
    (pie charts becoming elliptical, radar charts becoming diamond-shaped,
    etc).

    Drawings are silently skipped when they have no extent, a zero extent,
    no ``a:blip``, an embed id that does not resolve to a member of the
    archive, or image data that cannot be opened (including the case where
    Pillow is not installed).

    Args:
        docx_path: Path to the .docx archive, used to read the relationship
            part and the embedded image bytes.
        root: Parsed root element of the main document part.

    Returns:
        A failing ``CheckResult`` with severity ``"warning"`` listing up to
        three distorted images; a passing result otherwise. Any exception
        while reading the archive yields a passing result with severity
        ``"info"``.
    """
    import io as _io
    import posixpath as _posixpath
    import zipfile as _zf

    rels_path = 'word/_rels/document.xml.rels'
    rels_ns = 'http://schemas.openxmlformats.org/package/2006/relationships'
    max_listed = 3  # number of distorted images spelled out in the message

    try:
        with _zf.ZipFile(docx_path, 'r') as z:
            # Build the member set once instead of calling namelist() per drawing.
            members = set(z.namelist())

            # Build a mapping: rId -> image file path inside the zip
            rid_to_path = {}
            if rels_path in members:
                rels_root = ET.fromstring(z.read(rels_path))
                for rel in rels_root.findall(f'{{{rels_ns}}}Relationship'):
                    if 'image' not in rel.get('Type', ''):
                        continue
                    target = rel.get('Target', '')
                    if target.startswith('/'):
                        img_path = target.lstrip('/')
                    else:
                        # Target is relative to word/; collapse "../" and "./"
                        # segments so the path matches the zip member name.
                        img_path = _posixpath.normpath('word/' + target)
                    rid_to_path[rel.get('Id', '')] = img_path

            # Now check each drawing
            drawings = root.findall(".//wp:inline", NS) + root.findall(".//wp:anchor", NS)
            distorted = []
            for dwg in drawings:
                extent = dwg.find("wp:extent", NS)
                if extent is None:
                    continue
                display_cx = int(extent.get("cx", "0"))
                display_cy = int(extent.get("cy", "0"))
                if display_cx == 0 or display_cy == 0:
                    continue

                # Find the blip rId
                blip = dwg.find(".//a:blip", NS)
                if blip is None:
                    continue
                r_embed = blip.get(f"{{{NS['r']}}}embed", "")
                img_zip_path = rid_to_path.get(r_embed) if r_embed else None
                if img_zip_path is None or img_zip_path not in members:
                    continue

                # Read actual image dimensions. Best effort: an unreadable
                # image or a missing Pillow install skips this drawing.
                try:
                    from PIL import Image as _PILImage
                    with _PILImage.open(_io.BytesIO(z.read(img_zip_path))) as pil_img:
                        orig_w, orig_h = pil_img.size
                except Exception:
                    continue
                if orig_w == 0 or orig_h == 0:
                    continue

                # Compare aspect ratios
                orig_ratio = orig_w / orig_h
                display_ratio = display_cx / display_cy
                drift = abs(orig_ratio - display_ratio) / orig_ratio
                if drift > 0.10:  # >10% distortion
                    pct = drift * 100
                    distorted.append(
                        f"{img_zip_path.split('/')[-1]}: "
                        f"original {orig_w}×{orig_h} (ratio={orig_ratio:.2f}), "
                        f"display ratio={display_ratio:.2f}, distortion {pct:.0f}%"
                    )
    except Exception:
        return CheckResult(
            "image-aspect-ratio", True,
            "Cannot check image aspect ratio (zip read error)",
            "info"
        )

    if distorted:
        detail = "; ".join(distorted[:max_listed])
        if len(distorted) > max_listed:
            # Report only the images that were NOT listed above.
            detail += f" ...and {len(distorted) - max_listed} more"
        return CheckResult(
            "image-aspect-ratio", False,
            f"{len(distorted)} images have aspect ratio distortion: {detail}",
            "warning"
        )

    img_count = len(drawings)
    return CheckResult(
        "image-aspect-ratio", True,
        f"All images have correct aspect ratio ({img_count} images)"
    )
def check_font_fallback(root: ET.Element) -> CheckResult:
    """Check whether potentially missing fonts are used.

    Gathers the ``ascii``, ``eastAsia``, ``hAnsi`` and ``cs`` font names from
    the ``w:rFonts`` child of every ``w:rPr`` in the tree and compares them
    with a built-in allowlist of common fonts.

    Args:
        root: Parsed root element of the main document part.

    Returns:
        A failing ``CheckResult`` with severity ``"info"`` naming every font
        outside the allowlist, otherwise a passing result.
    """
    SAFE_FONTS = {
        # Chinese
        "宋体", "SimSun", "黑体", "SimHei", "微软雅黑", "Microsoft YaHei",
        "仿宋", "FangSong", "FangSong_GB2312", "楷体", "KaiTi",
        # English
        "Times New Roman", "Arial", "Calibri", "Helvetica",
        "Courier New", "Georgia", "Verdana", "Tahoma",
        # Universal
        "Symbol", "Wingdings",
    }

    w = NS['w']
    slot_attrs = [f"{{{w}}}{slot}" for slot in ("ascii", "eastAsia", "hAnsi", "cs")]

    fonts_used = set()
    for run_props in root.findall(".//w:rPr", NS):
        fonts_el = run_props.find("w:rFonts", NS)
        if fonts_el is None:
            continue
        # Empty or absent attributes are dropped by the truthiness filter.
        fonts_used.update(name for name in map(fonts_el.get, slot_attrs) if name)

    risky = fonts_used - SAFE_FONTS
    if not risky:
        return CheckResult("font-fallback", True, f"All fonts are common system fonts ({len(fonts_used)} types)")
    return CheckResult(
        "font-fallback", False,
        f"Following fonts may be missing on target system: {', '.join(sorted(risky))}",
        "info"
    )
def check_heading_levels(root: ET.Element) -> CheckResult:
    """Check whether headings skip levels.

    Collects, in document order, the numeric level of every paragraph under
    ``w:body`` whose style id matches ``Heading<digits>`` and flags each
    place where the level increases by more than one (for example H1
    directly followed by H3). Decreases of any size are allowed.

    Args:
        root: Parsed root element of the main document part.

    Returns:
        A passing ``CheckResult`` when fewer than two headings exist
        (including a document without a body) or no level is skipped;
        otherwise a failing result with severity ``"warning"`` listing up to
        five skips.
    """
    body = root.find(".//w:body", NS)
    # A missing body is treated as "no headings" rather than crashing.
    paragraphs = body.findall(".//w:p", NS) if body is not None else []

    val_attr = f"{{{NS['w']}}}val"
    heading_levels = []
    for p in paragraphs:
        ppr = p.find("w:pPr", NS)
        if ppr is None:
            continue
        style = ppr.find("w:pStyle", NS)
        if style is None:
            continue
        m = re.match(r"Heading(\d+)", style.get(val_attr, ""))
        if m:
            heading_levels.append(int(m.group(1)))

    if len(heading_levels) < 2:
        return CheckResult("heading-levels", True, "Too few headings, skipping check")

    skips = [
        f"H{prev}→H{cur}"
        for prev, cur in zip(heading_levels, heading_levels[1:])
        if cur - prev > 1
    ]
    if skips:
        return CheckResult(
            "heading-levels", False,
            f"Heading level skip: {', '.join(skips[:5])}",
            "warning"
        )
    return CheckResult("heading-levels", True, f"Heading levels continuous ({len(heading_levels)} headings)")
# check_cover_separation removed — false positives on complex covers (>15 elements is normal)
def check_shading_type(root: ET.Element) -> CheckResult:
    """Check whether ShadingType.SOLID is misused.

    Counts every ``w:shd`` element in the tree whose ``w:val`` attribute is
    exactly ``"solid"``.

    Args:
        root: Parsed root element of the main document part.

    Returns:
        A failing ``CheckResult`` with severity ``"error"`` when at least one
        solid shading is present, otherwise a passing result.
    """
    val_attr = f"{{{NS['w']}}}val"
    solid_count = sum(
        1
        for shading in root.findall(".//w:shd", NS)
        if shading.get(val_attr, "") == "solid"
    )
    if solid_count <= 0:
        return CheckResult("shading-type", True, "No ShadingType.SOLID misuse found")
    return CheckResult(
        "shading-type", False,
        f"Found {solid_count} instances of ShadingType.SOLID (should be CLEAR), may cause black cells",
        "error"
    )
def check_toc(root: ET.Element, docx_path: str = "") -> CheckResult:
    """Check TOC quality: field existence, headings presence, outlineLvl, updateFields.

    Up to four problems are reported:

    * ``TOC_FIELD_MISSING``: a top-level paragraph reads like a TOC title
      but no TOC field exists anywhere in the document.
    * ``TOC_NO_HEADINGS``: a TOC field exists but no top-level paragraph
      uses a heading style.
    * ``TOC_OUTLINE_MISSING``: a ``HeadingN`` style found in
      ``word/styles.xml`` has no ``w:outlineLvl`` (only levels actually
      used by the document are checked).
    * ``TOC_UPDATE_DISABLED``: ``word/settings.xml`` does not enable
      ``w:updateFields``.

    The last two are evaluated only when a TOC field exists and
    ``docx_path`` is given. A failure while reading the archive is reported
    as ``TOC_CHECK_ERROR`` instead of being raised.

    Args:
        root: Parsed root element of the main document part.
        docx_path: Path to the .docx archive; when empty, the styles and
            settings checks are skipped.

    Returns:
        A passing ``CheckResult`` when nothing is wrong (severity ``"info"``
        when the document has no body). Otherwise a failing result whose
        severity is ``"error"`` if any issue is FIELD_MISSING, NO_HEADINGS
        or OUTLINE_MISSING, and ``"warning"`` for anything else.
    """
    import zipfile

    body = root.find(".//w:body", NS)
    if body is None:
        return CheckResult("toc", True, "Document body is empty, skipping TOC check", "info")

    w_ns = NS["w"]
    val_attr = f"{{{w_ns}}}val"
    # Only paragraphs that are direct children of w:body are examined for
    # headings and for the TOC title.
    paragraphs = [el for el in body if el.tag == f"{{{w_ns}}}p"]

    # --- Detect headings and their levels ---
    heading_re = re.compile(r"(?i)heading\s*(\d)")
    heading_count = 0
    heading_levels_used = set()  # e.g. {1, 2, 3}
    for p in paragraphs:
        ppr = p.find(f"{{{w_ns}}}pPr")
        if ppr is None:
            continue
        ps = ppr.find(f"{{{w_ns}}}pStyle")
        if ps is None:
            continue
        m = heading_re.match(ps.get(val_attr, ""))
        if m:
            heading_count += 1
            heading_levels_used.add(int(m.group(1)))

    # --- Detect TOC field ---
    # The instrText scan covers the whole tree, so fields wrapped in a w:sdt
    # are found here as well; no separate SDT pass is needed.
    has_toc = any(
        instr.text and "TOC" in instr.text.upper()
        for instr in root.findall(f".//{{{w_ns}}}instrText")
    )
    if not has_toc:
        has_toc = any(
            "TOC" in fld.get(f"{{{w_ns}}}instr", "").upper()
            for fld in root.findall(f".//{{{w_ns}}}fldSimple")
        )

    issues = []

    # Check 1: Document has a "目录" / "目 录" / "Table of Contents" title but no TOC field
    toc_title_pattern = re.compile(r'^(?:目\s*录|table\s+of\s+contents|contents)$', re.IGNORECASE)
    has_toc_title = any(
        toc_title_pattern.match(
            "".join(t.text or "" for t in p.findall(f".//{{{w_ns}}}t")).strip()
        )
        for p in paragraphs
    )
    if has_toc_title and not has_toc:
        issues.append("TOC_FIELD_MISSING: document has a TOC title but no TOC field element — add TableOfContents in code")

    # Check 2: TOC field exists but no headings in document → TOC will be empty after update
    if has_toc and heading_count == 0:
        issues.append("TOC_NO_HEADINGS: TOC field exists but document has 0 Heading-styled paragraphs — TOC will be empty after update")

    # Check 3 & 4: Read styles.xml and settings.xml from DOCX (only when TOC exists)
    if has_toc and docx_path:
        try:
            with zipfile.ZipFile(docx_path, 'r') as zf:
                members = set(zf.namelist())

                # Check 3: outlineLvl missing in Heading styles
                if 'word/styles.xml' in members:
                    # Parse the raw bytes so the XML declaration (and any
                    # BOM) decides the encoding.
                    styles_root = ET.fromstring(zf.read('word/styles.xml'))
                    styles_by_id = {}
                    for style_elem in styles_root.findall(f".//{{{w_ns}}}style"):
                        # Keep the first definition of each style id.
                        styles_by_id.setdefault(style_elem.get(f"{{{w_ns}}}styleId", ""), style_elem)

                    missing_outline = []
                    for level in sorted(heading_levels_used):
                        style_id = f"Heading{level}"
                        style_elem = styles_by_id.get(style_id)
                        if style_elem is None:
                            # Style not defined in styles.xml: nothing to check.
                            continue
                        ppr = style_elem.find(f"{{{w_ns}}}pPr")
                        if ppr is None or ppr.find(f"{{{w_ns}}}outlineLvl") is None:
                            missing_outline.append(style_id)

                    if missing_outline:
                        issues.append(
                            "TOC_OUTLINE_MISSING: %s style(s) missing outlineLvl — "
                            "Word TOC update won't find these headings. "
                            "Run add_toc_placeholders.py to fix" % ", ".join(missing_outline)
                        )

                # Check 4: updateFields not enabled
                if 'word/settings.xml' in members:
                    settings_root = ET.fromstring(zf.read('word/settings.xml'))
                    update_elem = settings_root.find(f".//{{{w_ns}}}updateFields")
                    # w:val is an on/off value: "true", "1" and "on" all
                    # enable it, and omitting the attribute means true.
                    update_ok = (
                        update_elem is not None
                        and update_elem.get(val_attr, "true").strip().lower() in ("true", "1", "on")
                    )
                    if not update_ok:
                        issues.append(
                            "TOC_UPDATE_DISABLED: settings.xml missing updateFields=true — "
                            "Word won't prompt to update TOC on open. "
                            "Run add_toc_placeholders.py to fix"
                        )
        except Exception as e:
            issues.append(f"TOC_CHECK_ERROR: failed to read styles/settings from DOCX: {e}")

    if not issues:
        if has_toc:
            return CheckResult("toc", True, "TOC field present and update-ready")
        return CheckResult("toc", True, "No TOC needed")

    blocking = ("FIELD_MISSING", "NO_HEADINGS", "OUTLINE_MISSING")
    severity = "error" if any(k in i for i in issues for k in blocking) else "warning"
    return CheckResult("toc", False, "; ".join(issues[:5]), severity)
def check_cover_overflow(root: ET.Element) -> CheckResult:
    """Detect cover section issues: oversized fonts, excessive spacing, trailing empty content.

    Only the first section returned by ``get_sections`` is inspected, through
    its ``"children"`` list. Three rules are applied:

    1. No ``w:sz`` value above 88 half-points (44pt).
    2. No ``w:spacing`` ``w:before`` value above 5000 twips.
    3. No more than two text-less paragraphs at the end of the section.

    Args:
        root: Parsed root element of the main document part.

    Returns:
        A failing ``CheckResult`` with severity ``"error"`` joining every
        violated rule, otherwise a passing result (also when no sections
        are found).
    """
    sections = get_sections(root)
    if not sections:
        return CheckResult("cover-overflow", True, "No sections found")

    # Page height and margins were previously parsed here but never used;
    # that dead code could raise on a missing sectPr or a non-integer
    # attribute, so it has been dropped.
    children = sections[0]["children"]
    w = NS['w']

    def _largest(tag: str, attr: str) -> int:
        """Return the largest all-digit ``w:<attr>`` on any ``w:<tag>`` in the cover, or 0."""
        best = 0
        for child in children:
            for elem in child.findall(f".//{{{w}}}{tag}"):
                raw = elem.get(f"{{{w}}}{attr}")
                if raw and raw.isdigit():
                    best = max(best, int(raw))
        return best

    issues = []

    # Check 1: Oversized font in cover section (> 44pt = 88 half-points)
    max_font_size = _largest("sz", "val")
    if max_font_size > 88:
        issues.append(
            f"Cover has font size {max_font_size // 2}pt (>{44}pt max). "
            f"Use calcTitleLayout() for dynamic sizing"
        )

    # Check 2: Excessive spacing.before in cover section (> 5000 twips)
    max_spacing = _largest("spacing", "before")
    if max_spacing > 5000:
        issues.append(
            f"Cover has spacing.before={max_spacing} twips (>5000 max). "
            f"Use calcCoverSpacing() for dynamic spacing"
        )

    # Check 3: Trailing empty paragraphs in cover section
    trailing_empty = 0
    for child in reversed(children):
        tag = child.tag.split("}")[-1] if "}" in child.tag else child.tag
        if tag != "p":
            break
        texts = child.findall(f".//{{{w}}}t")
        if any(t.text and t.text.strip() for t in texts):
            break
        trailing_empty += 1
    if trailing_empty > 2:
        issues.append(
            f"Cover section ends with {trailing_empty} empty paragraphs (max 2 allowed) — "
            f"excessive empty paragraphs may cause blank page after cover"
        )

    if issues:
        return CheckResult(
            "cover-overflow", False,
            "; ".join(issues),
            "error"
        )
    return CheckResult("cover-overflow", True, "Cover section layout looks OK")
def run_all_checks(docx_path: str) -> list[CheckResult]:
    """Run all checks.

    The document XML is read once with ``read_document_xml`` and handed to
    every check. A check that raises does not abort the run: it is recorded
    as a failed result with severity ``"error"`` and the message
    ``"Check error: <exception>"``.

    Args:
        docx_path: Path to the .docx file to check.

    Returns:
        One ``CheckResult`` per check: the root-only checks in the order
        listed below, then the TOC check, then the image aspect ratio check.
    """
    root = read_document_xml(docx_path)

    checks = [
        check_blank_pages,
        check_cover_overflow,
        check_line_spacing,
        check_image_overflow,
        check_font_fallback,
        check_heading_levels,
        check_shading_type,
    ]

    results = []
    for check_fn in checks:
        try:
            results.append(check_fn(root))
        except Exception as e:
            # Derive the same hyphenated name the check reports on success
            # (check_blank_pages -> "blank-pages"), so consumers see one
            # stable identifier per check.
            name = check_fn.__name__.replace("check_", "").replace("_", "-")
            results.append(CheckResult(name, False, f"Check error: {e}", "error"))

    # TOC check needs both root and docx_path
    try:
        results.append(check_toc(root, docx_path))
    except Exception as e:
        results.append(CheckResult("toc", False, f"Check error: {e}", "error"))

    # Image aspect ratio check needs both root and docx_path
    try:
        results.append(check_image_aspect_ratio(docx_path, root))
    except Exception as e:
        results.append(CheckResult("image-aspect-ratio", False, f"Check error: {e}", "error"))

    return results
def main():
    """Command-line entry point: run every check on one .docx and report.

    Prints either a JSON array (``--json``) or a human-readable report.

    Exit status:
        2 when any check fails with severity ``"error"``;
        1 when the file does not exist, or when ``--strict`` is given and a
        check fails with severity ``"warning"``;
        0 otherwise.
    """
    import argparse

    parser = argparse.ArgumentParser(description="docx business rule self-check")
    parser.add_argument("docx_path", help="Path to the .docx file to check")
    parser.add_argument("--json", action="store_true", help="Output in JSON format")
    parser.add_argument("--strict", action="store_true", help="Treat warnings as failures")
    args = parser.parse_args()

    if not Path(args.docx_path).exists():
        print(f"❌ File not found: {args.docx_path}")
        sys.exit(1)

    results = run_all_checks(args.docx_path)
    errors = sum(1 for r in results if not r.passed and r.severity == "error")
    warnings = sum(1 for r in results if not r.passed and r.severity == "warning")

    if args.json:
        print(json.dumps([r.to_dict() for r in results], ensure_ascii=False, indent=2))
    else:
        print(f"\n📋 Document self-check report: {args.docx_path}\n")
        for r in results:
            print(f" {r}")
        passed = sum(1 for r in results if r.passed)
        total = len(results)
        # Horizontal rule between the per-check lines and the summary; the
        # previous code multiplied an empty string and printed nothing.
        print(f"\n {'─' * 50}")
        print(f" Passed {passed}/{total} | ❌ {errors} errors | ⚠️ {warnings} warnings\n")

    # Exit code
    if errors:
        sys.exit(2)
    elif args.strict and warnings:
        sys.exit(1)
    else:
        sys.exit(0)
# Run the self-check CLI only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,3 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:comments xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" 
xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14">
</w:comments>

View File

@@ -0,0 +1,3 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w15:commentsEx xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" 
xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14">
</w15:commentsEx>

View File

@@ -0,0 +1,3 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w16cex:commentsExtensible xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" 
xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" xmlns:cr="http://schemas.microsoft.com/office/comments/2020/reactions" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl cr w16du wp14">
</w16cex:commentsExtensible>

View File

@@ -0,0 +1,3 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w16cid:commentsIds xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" 
xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14">
</w16cid:commentsIds>

View File

@@ -0,0 +1,3 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w15:people xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml">
</w15:people>

374
skills/docx/scripts/utilities.py Executable file
View File

@@ -0,0 +1,374 @@
#!/usr/bin/env python3
"""
Utilities for editing OOXML documents.
This module provides XMLEditor, a tool for manipulating XML files with support for
line-number-based node finding and DOM manipulation. Each element is automatically
annotated with its original line and column position during parsing.
Example usage:
editor = XMLEditor("document.xml")
# Find node by line number or range
elem = editor.get_node(tag="w:r", line_number=519)
elem = editor.get_node(tag="w:p", line_number=range(100, 200))
# Find node by text content
elem = editor.get_node(tag="w:p", contains="specific text")
# Find node by attributes
elem = editor.get_node(tag="w:r", attrs={"w:id": "target"})
# Combine filters
elem = editor.get_node(tag="w:p", line_number=range(1, 50), contains="text")
# Replace, insert, or manipulate
new_nodes = editor.replace_node(elem, "<w:r><w:t>new text</w:t></w:r>")
editor.insert_after(new_nodes[-1], "<w:r><w:t>more</w:t></w:r>")
# Save changes
editor.save()
"""
import html
from pathlib import Path
from typing import Optional, Union
import defusedxml.minidom
import defusedxml.sax
class XMLEditor:
"""
Editor for manipulating OOXML XML files with line-number-based node finding.
This class parses XML files and tracks the original line and column position
of each element. This enables finding nodes by their line number in the original
file, which is useful when working with Read tool output.
Attributes:
xml_path: Path to the XML file being edited
encoding: Detected encoding of the XML file ('ascii' or 'utf-8')
dom: Parsed DOM tree with parse_position attributes on elements
"""
def __init__(self, xml_path):
"""
Initialize with path to XML file and parse with line number tracking.
Args:
xml_path: Path to XML file to edit (str or Path)
Raises:
ValueError: If the XML file does not exist
"""
self.xml_path = Path(xml_path)
if not self.xml_path.exists():
raise ValueError(f"XML file not found: {xml_path}")
with open(self.xml_path, "rb") as f:
header = f.read(200).decode("utf-8", errors="ignore")
self.encoding = "ascii" if 'encoding="ascii"' in header else "utf-8"
parser = _create_line_tracking_parser()
self.dom = defusedxml.minidom.parse(str(self.xml_path), parser)
def get_node(
self,
tag: str,
attrs: Optional[dict[str, str]] = None,
line_number: Optional[Union[int, range]] = None,
contains: Optional[str] = None,
):
"""
Get a DOM element by tag and identifier.
Finds an element by either its line number in the original file or by
matching attribute values. Exactly one match must be found.
Args:
tag: The XML tag name (e.g., "w:del", "w:ins", "w:r")
attrs: Dictionary of attribute name-value pairs to match (e.g., {"w:id": "1"})
line_number: Line number (int) or line range (range) in original XML file (1-indexed)
contains: Text string that must appear in any text node within the element.
Supports both entity notation (&#8220;) and Unicode characters (\u201c).
Returns:
defusedxml.minidom.Element: The matching DOM element
Raises:
ValueError: If node not found or multiple matches found
Example:
elem = editor.get_node(tag="w:r", line_number=519)
elem = editor.get_node(tag="w:r", line_number=range(100, 200))
elem = editor.get_node(tag="w:del", attrs={"w:id": "1"})
elem = editor.get_node(tag="w:p", attrs={"w14:paraId": "12345678"})
elem = editor.get_node(tag="w:commentRangeStart", attrs={"w:id": "0"})
elem = editor.get_node(tag="w:p", contains="specific text")
elem = editor.get_node(tag="w:t", contains="&#8220;Agreement") # Entity notation
elem = editor.get_node(tag="w:t", contains="\u201cAgreement") # Unicode character
"""
matches = []
for elem in self.dom.getElementsByTagName(tag):
# Check line_number filter
if line_number is not None:
parse_pos = getattr(elem, "parse_position", (None,))
elem_line = parse_pos[0]
# Handle both single line number and range
if isinstance(line_number, range):
if elem_line not in line_number:
continue
else:
if elem_line != line_number:
continue
# Check attrs filter
if attrs is not None:
if not all(
elem.getAttribute(attr_name) == attr_value
for attr_name, attr_value in attrs.items()
):
continue
# Check contains filter
if contains is not None:
elem_text = self._get_element_text(elem)
# Normalize the search string: convert HTML entities to Unicode characters
# This allows searching for both "&#8220;Rowan" and ""Rowan"
normalized_contains = html.unescape(contains)
if normalized_contains not in elem_text:
continue
# If all applicable filters passed, this is a match
matches.append(elem)
if not matches:
# Build descriptive error message
filters = []
if line_number is not None:
line_str = (
f"lines {line_number.start}-{line_number.stop - 1}"
if isinstance(line_number, range)
else f"line {line_number}"
)
filters.append(f"at {line_str}")
if attrs is not None:
filters.append(f"with attributes {attrs}")
if contains is not None:
filters.append(f"containing '{contains}'")
filter_desc = " ".join(filters) if filters else ""
base_msg = f"Node not found: <{tag}> {filter_desc}".strip()
# Add helpful hint based on filters used
if contains:
hint = "Text may be split across elements or use different wording."
elif line_number:
hint = "Line numbers may have changed if document was modified."
elif attrs:
hint = "Verify attribute values are correct."
else:
hint = "Try adding filters (attrs, line_number, or contains)."
raise ValueError(f"{base_msg}. {hint}")
if len(matches) > 1:
raise ValueError(
f"Multiple nodes found: <{tag}>. "
f"Add more filters (attrs, line_number, or contains) to narrow the search."
)
return matches[0]
def _get_element_text(self, elem):
    """
    Gather the text held by an element and by every element nested inside it.

    Text nodes made up solely of whitespace (spaces, tabs, newlines) are
    ignored, because they usually come from XML formatting rather than from
    the document content. Any other text node is kept verbatim, including
    its own leading and trailing whitespace.

    Args:
        elem: DOM element whose text should be collected

    Returns:
        str: The kept text pieces joined together in document order
    """
    def fragments():
        for child in elem.childNodes:
            kind = child.nodeType
            if kind == child.ELEMENT_NODE:
                # Descend into nested elements and splice their text in place
                yield self._get_element_text(child)
            elif kind == child.TEXT_NODE and child.data.strip():
                yield child.data

    return "".join(fragments())
def replace_node(self, elem, new_content):
    """
    Swap a DOM element for the nodes parsed from an XML string.

    The new nodes take the element's position among its siblings, in the
    order they appear in new_content; the element is then detached from
    its parent.

    Args:
        elem: DOM element to be replaced (it must have a parent node)
        new_content: XML string whose nodes replace the element

    Returns:
        list: Every node that was inserted, in document order

    Example:
        new_nodes = editor.replace_node(old_elem, "<w:r><w:t>text</w:t></w:r>")
    """
    container = elem.parentNode
    replacements = self._parse_fragment(new_content)
    # Place each replacement just ahead of the old element, then drop it
    for replacement in replacements:
        container.insertBefore(replacement, elem)
    container.removeChild(elem)
    return replacements
def insert_after(self, elem, xml_content):
    """
    Add the nodes parsed from an XML string directly after a DOM element.

    The new nodes become siblings of elem, keeping the order in which they
    appear in xml_content.

    Args:
        elem: DOM element that the new nodes should follow
        xml_content: XML string to parse and insert

    Returns:
        list: Every node that was inserted, in document order

    Example:
        new_nodes = editor.insert_after(elem, "<w:r><w:t>text</w:t></w:r>")
    """
    host = elem.parentNode
    # Capture the follower before inserting: it is the fixed anchor that
    # every new node is placed in front of.
    anchor = elem.nextSibling
    inserted = self._parse_fragment(xml_content)
    if anchor:
        for item in inserted:
            host.insertBefore(item, anchor)
    else:
        # elem is the last child, so the new nodes simply go at the end
        for item in inserted:
            host.appendChild(item)
    return inserted
def insert_before(self, elem, xml_content):
    """
    Add the nodes parsed from an XML string directly before a DOM element.

    The new nodes become siblings of elem, keeping the order in which they
    appear in xml_content.

    Args:
        elem: DOM element that the new nodes should precede
        xml_content: XML string to parse and insert

    Returns:
        list: Every node that was inserted, in document order

    Example:
        new_nodes = editor.insert_before(elem, "<w:r><w:t>text</w:t></w:r>")
    """
    host = elem.parentNode
    added = self._parse_fragment(xml_content)
    for item in added:
        host.insertBefore(item, elem)
    return added
def append_to(self, elem, xml_content):
    """
    Add the nodes parsed from an XML string as the last children of an element.

    Args:
        elem: DOM element that receives the new children
        xml_content: XML string to parse and append

    Returns:
        list: Every node that was appended, in document order

    Example:
        new_nodes = editor.append_to(elem, "<w:r><w:t>text</w:t></w:r>")
    """
    children = self._parse_fragment(xml_content)
    for child in children:
        elem.appendChild(child)
    return children
def get_next_rid(self):
    """
    Get the next available rId for relationships files.

    Looks at the Id attribute of every <Relationship> element in the
    document and returns "rId<N>", where N is one more than the largest
    integer suffix found among ids that start with "rId". Ids whose suffix
    is not an integer are ignored; with no usable ids the result is "rId1".

    Returns:
        str: The next relationship id, e.g. "rId8"
    """
    # Seed with 0 so that a document without numbered ids yields rId1
    used = [0]
    for rel in self.dom.getElementsByTagName("Relationship"):
        ident = rel.getAttribute("Id")
        if not ident.startswith("rId"):
            continue
        try:
            used.append(int(ident[3:]))
        except ValueError:
            # Suffix is not an integer (e.g. "rIdFoo"); it cannot clash
            continue
    return f"rId{max(used) + 1}"
def save(self):
    """
    Write the current DOM tree back to the XML file it belongs to.

    The tree is serialized with self.encoding (so the original ascii or
    utf-8 encoding is preserved) and the resulting bytes overwrite the
    file at self.xml_path.
    """
    self.xml_path.write_bytes(self.dom.toxml(encoding=self.encoding))
def _parse_fragment(self, xml_content):
    """
    Parse an XML fragment and return its nodes imported into this document.

    The fragment is wrapped in a temporary <root> element that repeats the
    xmlns declarations found on the document's root element, so namespace
    prefixes used in the fragment resolve while it is parsed.

    Args:
        xml_content: String containing the XML fragment (one or more nodes)

    Returns:
        list: All top-level nodes of the fragment (elements as well as any
        text nodes between them), deep-imported into self.dom, in order

    Raises:
        AssertionError: If the fragment contains no element nodes
    """
    # Local import: quoteattr is only needed here to build the wrapper
    from xml.sax.saxutils import quoteattr

    # Extract namespace declarations from the root document element
    root_elem = self.dom.documentElement
    namespaces = []
    if root_elem and root_elem.attributes:
        for name, value in root_elem.attributes.items():
            if name.startswith("xmlns"):
                # quoteattr escapes &, < and quote characters, so an unusual
                # namespace URI cannot produce a malformed wrapper element
                namespaces.append(f"{name}={quoteattr(value)}")

    ns_decl = " ".join(namespaces)
    wrapper = f"<root {ns_decl}>{xml_content}</root>"
    fragment_doc = defusedxml.minidom.parseString(wrapper)
    nodes = [
        self.dom.importNode(child, deep=True)
        for child in fragment_doc.documentElement.childNodes  # type: ignore
    ]
    # Raised explicitly (not via an assert statement) so the check still
    # runs when Python is started with -O; the exception type is unchanged.
    if not any(n.nodeType == n.ELEMENT_NODE for n in nodes):
        raise AssertionError("Fragment must contain at least one element")
    return nodes
def _create_line_tracking_parser():
    """
    Build a SAX parser that records where each element started in the source.

    The parser's setContentHandler is replaced with a wrapper. When a DOM
    content handler is installed through it, the handler's startElementNS is
    wrapped in turn: after the original callback has run, the element on top
    of the handler's elementStack receives a parse_position attribute, a
    (line, column) tuple read from CurrentLineNumber / CurrentColumnNumber
    of the parser's private _parser object (the underlying expat parser).

    Returns:
        The parser returned by defusedxml.sax.make_parser(), with the
        patched setContentHandler in place
    """
    reader = defusedxml.sax.make_parser()
    install_handler = reader.setContentHandler

    def tracking_set_content_handler(dom_handler):
        build_element = dom_handler.startElementNS

        def start_element_with_position(name, tagName, attrs):
            # Let the DOM builder create the element first; it is then the
            # last entry on the handler's element stack.
            build_element(name, tagName, attrs)
            expat = reader._parser  # type: ignore
            dom_handler.elementStack[-1].parse_position = (
                expat.CurrentLineNumber,
                expat.CurrentColumnNumber,
            )

        dom_handler.startElementNS = start_element_with_position
        install_handler(dom_handler)

    reader.setContentHandler = tracking_set_content_handler  # type: ignore
    return reader