Initial commit
This commit is contained in:
1
skills/docx/scripts/__init__.py
Executable file
1
skills/docx/scripts/__init__.py
Executable file
@@ -0,0 +1 @@
|
||||
# Make scripts directory a package for relative imports in tests
|
||||
749
skills/docx/scripts/add_toc_placeholders.py
Executable file
749
skills/docx/scripts/add_toc_placeholders.py
Executable file
@@ -0,0 +1,749 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Add placeholder entries to Table of Contents in a DOCX file.
|
||||
|
||||
This script adds placeholder TOC entries between the 'separate' and 'end'
|
||||
field characters, so users see some content on first open instead of an empty TOC.
|
||||
The original file is replaced with the modified version.
|
||||
|
||||
Usage:
|
||||
python add_toc_placeholders.py <docx_file> # auto-extract headings (default)
|
||||
python add_toc_placeholders.py <docx_file> --auto # explicit auto mode
|
||||
python add_toc_placeholders.py <docx_file> --entries <entries_json>
|
||||
|
||||
entries_json format: JSON string with array of objects:
|
||||
[
|
||||
{"level": 1, "text": "Chapter 1 Overview", "page": "1"},
|
||||
{"level": 2, "text": "Section 1.1 Details", "page": "1"}
|
||||
]
|
||||
|
||||
Default behavior (no flags): auto-extracts Heading 1-3 from the document.
|
||||
Filters out table/figure captions (e.g. "表 1:xxx", "图 2:xxx").
|
||||
|
||||
Example:
|
||||
python add_toc_placeholders.py document.docx
|
||||
python add_toc_placeholders.py document.docx --auto
|
||||
python add_toc_placeholders.py document.docx --entries '[{"level":1,"text":"Introduction","page":"1"}]'
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import html
|
||||
import json
|
||||
import re
|
||||
import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def _extract_headings_from_docx(docx_path: str, max_level: int = 3) -> list:
    """Collect heading paragraphs from a DOCX file for auto-mode TOC entries.

    Args:
        docx_path: Path to the DOCX file to scan.
        max_level: Deepest heading level to keep (default 3).

    Returns:
        List of dicts with 'level' (int), 'text' (str) and 'page' (str) keys,
        in document order.
    """
    from docx import Document

    # Headings that are really table/figure captions ("表 1", "图 2") are skipped.
    caption_re = re.compile(r'^[表图]\s*\d')
    digits_re = re.compile(r'(\d+)')
    found = []

    for position, paragraph in enumerate(Document(docx_path).paragraphs):
        style = paragraph.style
        name = style.name if style else ''

        # Only styles named "Heading…" that carry a number are considered.
        level_match = digits_re.search(name) if name.startswith('Heading') else None
        if level_match is None:
            continue

        depth = int(level_match.group(1))
        heading = paragraph.text.strip()
        if depth > max_level or not heading or caption_re.match(heading):
            continue

        # Rough page guess: one page per 8 paragraphs, based on the
        # paragraph's index in the document (never below page 1).
        page_guess = max(1, 1 + position // 8)
        found.append({"level": depth, "text": heading, "page": str(page_guess)})

    return found
|
||||
|
||||
|
||||
def add_toc_placeholders(docx_path: str, entries: list = None) -> None:
    """Add placeholder TOC entries to a DOCX file (in-place replacement).

    The archive is unpacked into a temporary directory, styles.xml,
    settings.xml and document.xml are patched there, and the result is
    re-zipped. The original file is only replaced once the new archive has
    been written completely, so a failure never leaves the caller without a
    document.

    Args:
        docx_path: Path to DOCX file (will be modified in-place)
        entries: Optional list of placeholder entries. Each entry should be a dict
                 with 'level' (1-3), 'text', and 'page' keys.

    Raises:
        ValueError: If the archive does not contain word/document.xml.
        RuntimeError: Propagated from _insert_toc_placeholders when the TOC
            field structure cannot be located.
    """
    docx_path = Path(docx_path)

    # Create temp directory for extraction
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)
        extracted_dir = temp_path / "extracted"
        temp_output = temp_path / "output.docx"

        # Extract DOCX
        with zipfile.ZipFile(docx_path, 'r') as zip_ref:
            zip_ref.extractall(extracted_dir)

        # Ensure TOC styles exist in styles.xml
        styles_xml_path = extracted_dir / "word" / "styles.xml"
        toc_style_mapping = _ensure_toc_styles(styles_xml_path)
        print(f"TOC style mapping: {toc_style_mapping}")

        # Fix settings.xml: ensure updateFields has val="true"
        settings_xml_path = extracted_dir / "word" / "settings.xml"
        _fix_update_fields(settings_xml_path)

        # Fix Heading styles: ensure outlineLvl is set (required for TOC field update)
        _fix_heading_outline_levels(styles_xml_path)

        # Process document.xml
        document_xml = extracted_dir / "word" / "document.xml"
        if not document_xml.exists():
            raise ValueError("document.xml not found in the DOCX file")

        content = document_xml.read_text(encoding='utf-8')

        # Split merged begin+instrText+separate runs into separate <w:r> elements
        content = _fix_fld_char_structure(content)

        # Find TOC structure and add placeholders (uses lxml for XML parsing)
        modified_content = _insert_toc_placeholders(content, entries, toc_style_mapping)
        document_xml.write_text(modified_content, encoding='utf-8')

        # Repack DOCX to temp file
        with zipfile.ZipFile(temp_output, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for file_path in extracted_dir.rglob('*'):
                if file_path.is_file():
                    zipf.write(file_path, file_path.relative_to(extracted_dir))

        # Replace the original without ever deleting it first: stage the new
        # archive next to the target (shutil.move copes with the temp dir
        # being on another device), then swap it in with a same-directory
        # rename. If anything fails the original file is still intact.
        staged = docx_path.with_name(docx_path.name + ".toc-tmp")
        try:
            shutil.move(str(temp_output), str(staged))
            staged.replace(docx_path)
        finally:
            # Only present when the final rename did not happen.
            if staged.exists():
                staged.unlink()
|
||||
|
||||
|
||||
def _fix_update_fields(settings_xml_path: Path) -> None:
|
||||
"""Fix settings.xml to ensure <w:updateFields w:val="true"/> is present.
|
||||
|
||||
The docx npm library generates <w:updateFields/> without val="true",
|
||||
which Word/WPS interprets as false, preventing TOC auto-update on open.
|
||||
"""
|
||||
if not settings_xml_path.exists():
|
||||
return
|
||||
|
||||
content = settings_xml_path.read_text(encoding='utf-8')
|
||||
original = content
|
||||
|
||||
# Case 1: <w:updateFields/> (self-closing, no val) → add val="true"
|
||||
if '<w:updateFields/>' in content:
|
||||
content = content.replace('<w:updateFields/>', '<w:updateFields w:val="true"/>')
|
||||
print('Fixed: <w:updateFields/> → <w:updateFields w:val="true"/>')
|
||||
|
||||
# Case 2: <w:updateFields w:val="false"/> → change to true (match precisely)
|
||||
elif re.search(r'<w:updateFields\s+w:val="false"\s*/>', content):
|
||||
content = re.sub(
|
||||
r'<w:updateFields\s+w:val="false"\s*/>',
|
||||
'<w:updateFields w:val="true"/>',
|
||||
content
|
||||
)
|
||||
print('Fixed: <w:updateFields w:val="false"/> → <w:updateFields w:val="true"/>')
|
||||
|
||||
# Case 3: Not present at all → inject before </w:settings>
|
||||
elif '<w:updateFields' not in content:
|
||||
content = content.replace('</w:settings>', '<w:updateFields w:val="true"/></w:settings>')
|
||||
print('Fixed: added <w:updateFields w:val="true"/> to settings.xml')
|
||||
|
||||
if content != original:
|
||||
settings_xml_path.write_text(content, encoding='utf-8')
|
||||
|
||||
|
||||
def _fix_heading_outline_levels(styles_xml_path: Path) -> None:
|
||||
"""Fix Heading styles to include outlineLvl in pPr.
|
||||
|
||||
The docx npm library creates Heading styles but sometimes doesn't set outlineLvl
|
||||
in the style definition. Without outlineLvl, Word's TOC field update won't find
|
||||
headings even though they display correctly.
|
||||
|
||||
This ensures Heading1 has outlineLvl=0, Heading2 has outlineLvl=1, etc.
|
||||
"""
|
||||
if not styles_xml_path.exists():
|
||||
return
|
||||
|
||||
content = styles_xml_path.read_text(encoding='utf-8')
|
||||
original = content
|
||||
|
||||
W_NS = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
|
||||
|
||||
for level in range(1, 7):
|
||||
style_id = f'Heading{level}'
|
||||
outline_val = str(level - 1)
|
||||
|
||||
# Pattern: find <w:style> with w:styleId="HeadingN"
|
||||
style_pattern = (
|
||||
rf'(<w:style[^>]*w:styleId="{style_id}"[^>]*>)'
|
||||
rf'(.*?)'
|
||||
rf'(</w:style>)'
|
||||
)
|
||||
|
||||
match = re.search(style_pattern, content, flags=re.DOTALL)
|
||||
if not match:
|
||||
continue
|
||||
|
||||
style_content = match.group(2)
|
||||
|
||||
# Check if outlineLvl already exists in this style
|
||||
if f'<w:outlineLvl' in style_content:
|
||||
continue
|
||||
|
||||
# Find or create <w:pPr> within this style
|
||||
ppr_match = re.search(r'(<w:pPr[^>]*>)(.*?)(</w:pPr>)', style_content, flags=re.DOTALL)
|
||||
if ppr_match:
|
||||
# Add outlineLvl inside existing pPr
|
||||
new_ppr_content = ppr_match.group(2) + f'<w:outlineLvl w:val="{outline_val}"/>'
|
||||
new_style_content = (
|
||||
style_content[:ppr_match.start()] +
|
||||
ppr_match.group(1) + new_ppr_content + ppr_match.group(3) +
|
||||
style_content[ppr_match.end():]
|
||||
)
|
||||
else:
|
||||
# No pPr exists, create one
|
||||
new_ppr = f'<w:pPr><w:outlineLvl w:val="{outline_val}"/></w:pPr>'
|
||||
# Insert pPr right after style opening (after name/basedOn if present)
|
||||
new_style_content = new_ppr + style_content
|
||||
|
||||
new_style = match.group(1) + new_style_content + match.group(3)
|
||||
content = content[:match.start()] + new_style + content[match.end():]
|
||||
print(f'Fixed: added outlineLvl={outline_val} to {style_id} style')
|
||||
|
||||
if content != original:
|
||||
styles_xml_path.write_text(content, encoding='utf-8')
|
||||
|
||||
|
||||
def _fix_fld_char_structure(xml_content: str) -> str:
|
||||
"""Fix malformed fldChar structure where begin+instrText+separate are in one <w:r>.
|
||||
|
||||
The docx npm library generates:
|
||||
<w:r><w:fldChar begin/><w:instrText>TOC...</w:instrText><w:fldChar separate/></w:r>
|
||||
|
||||
Word/WPS requires the standard structure:
|
||||
<w:r><w:fldChar begin/></w:r>
|
||||
<w:r><w:instrText>TOC...</w:instrText></w:r>
|
||||
<w:r><w:fldChar separate/></w:r>
|
||||
"""
|
||||
# Match a <w:r> that contains both begin fldChar AND instrText AND separate fldChar
|
||||
pattern = (
|
||||
r'<w:r(?:\s[^>]*)?>('
|
||||
r'<w:fldChar[^>]*w:fldCharType="begin"[^>]*/>' # begin
|
||||
r')('
|
||||
r'<w:instrText[^>]*>.*?</w:instrText>' # instrText
|
||||
r')('
|
||||
r'<w:fldChar[^>]*w:fldCharType="separate"[^>]*/>' # separate
|
||||
r')</w:r>'
|
||||
)
|
||||
|
||||
def split_run(match):
|
||||
begin = match.group(1)
|
||||
instr = match.group(2)
|
||||
separate = match.group(3)
|
||||
return f'<w:r>{begin}</w:r><w:r>{instr}</w:r><w:r>{separate}</w:r>'
|
||||
|
||||
modified = re.sub(pattern, split_run, xml_content, flags=re.DOTALL)
|
||||
if modified != xml_content:
|
||||
print("Fixed: split merged fldChar begin+instrText+separate into separate <w:r> elements")
|
||||
|
||||
# Fix TOC instrText: remove \t switch with wrong style names
|
||||
# docx npm lib generates \t "Heading1,1,Heading2,2,..." but Word expects "Heading 1,1,..."
|
||||
# Since we already have \o "1-3" which uses outlineLvl (now fixed), \t is redundant and harmful
|
||||
toc_t_pattern = r'(TOC\s+[^<]*?)\\t\s+"[^&]*"'
|
||||
modified2 = re.sub(toc_t_pattern, r'\1', modified)
|
||||
if modified2 != modified:
|
||||
print("Fixed: removed \\t switch from TOC instrText (\\o with outlineLvl is sufficient)")
|
||||
modified = modified2
|
||||
|
||||
return modified
|
||||
|
||||
|
||||
def _detect_toc_styles(styles_xml_path: Path) -> dict:
|
||||
"""Detect TOC style IDs from styles.xml.
|
||||
|
||||
Args:
|
||||
styles_xml_path: Path to styles.xml
|
||||
|
||||
Returns:
|
||||
Dictionary mapping level (1-3) to style ID string
|
||||
"""
|
||||
if not styles_xml_path.exists():
|
||||
return {}
|
||||
|
||||
content = styles_xml_path.read_text(encoding='utf-8')
|
||||
result = {}
|
||||
|
||||
for level in range(1, 4):
|
||||
# Standard TOC style names: "TOC 1", "TOC 2", "TOC 3" (with space)
|
||||
# or "TOC1", "TOC2", "TOC3" (no space) — docx-js uses numeric IDs like "9", "11", "12"
|
||||
patterns = [
|
||||
rf'w:styleId="(TOC{level})"',
|
||||
rf'w:styleId="(TOC {level})"',
|
||||
rf'<w:name\s+w:val="toc\s*{level}"[^/]*/>\s*</w:name>|<w:name\s+w:val="toc\s*{level}"[^/]*/>',
|
||||
]
|
||||
for pattern in patterns[:2]:
|
||||
m = re.search(pattern, content)
|
||||
if m:
|
||||
result[level] = m.group(1)
|
||||
break
|
||||
else:
|
||||
# Try matching by w:name (case insensitive toc N)
|
||||
# Find <w:style> blocks with name containing "toc N"
|
||||
name_pattern = rf'<w:style[^>]*w:styleId="([^"]*)"[^>]*>.*?<w:name\s+w:val="[Tt][Oo][Cc]\s*{level}"'
|
||||
m = re.search(name_pattern, content, flags=re.DOTALL)
|
||||
if m:
|
||||
result[level] = m.group(1)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _ensure_toc_styles(styles_xml_path: Path) -> dict:
    """Guarantee that styles.xml defines paragraph styles for TOC levels 1-3.

    Levels already found by _detect_toc_styles keep their detected ID. For
    every other level a built-in definition (IDs "9", "11", "12") is spliced
    in just before the closing ``</w:styles>`` tag. Afterwards the Hyperlink
    character style is ensured as well.

    Returns:
        Dictionary mapping level (1-3) to style ID string. When styles.xml
        does not exist, the built-in IDs are returned without touching disk.
    """
    if not styles_xml_path.exists():
        return {1: "9", 2: "11", 3: "12"}

    content = styles_xml_path.read_text(encoding='utf-8')
    result = dict(_detect_toc_styles(styles_xml_path))

    W_NS = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    tab_stop = '<w:tabs><w:tab w:val="right" w:leader="dot" w:pos="9026"/></w:tabs>'

    def build_style(style_id, display_name, ppr_extra, after_ppr=()):
        """Assemble one fallback TOC paragraph style, one element per line."""
        return '\n'.join([
            f'<w:style w:type="paragraph" w:styleId="{style_id}" xmlns:w="{W_NS}">',
            f'<w:name w:val="{display_name}"/>',
            '<w:basedOn w:val="Normal"/>',
            '<w:uiPriority w:val="39"/>',
            '<w:pPr>',
            tab_stop,
            *ppr_extra,
            '</w:pPr>',
            *after_ppr,
            '</w:style>',
        ])

    # level -> (style ID, style XML) used when the level was not detected
    fallback_styles = {
        1: ('9', build_style(
            '9', 'toc 1',
            ['<w:spacing w:before="120" w:after="60"/>'],
            ['<w:rPr><w:b/><w:bCs/></w:rPr>'],
        )),
        2: ('11', build_style(
            '11', 'toc 2',
            ['<w:ind w:left="360"/>', '<w:spacing w:before="60" w:after="40"/>'],
        )),
        3: ('12', build_style(
            '12', 'toc 3',
            ['<w:ind w:left="720"/>', '<w:spacing w:before="40" w:after="20"/>'],
        )),
    }

    changed = False
    for level in (1, 2, 3):
        if level in result:
            continue

        style_id, style_xml = fallback_styles[level]
        result[level] = style_id

        # New definitions go right before </w:styles>
        closing_at = content.rfind('</w:styles>')
        if closing_at == -1:
            print(f"WARNING: Could not find </w:styles> to insert TOC {level} style", file=sys.stderr)
            continue

        content = content[:closing_at] + style_xml + '\n' + content[closing_at:]
        print(f"Added TOC {level} style (ID: {style_id})")
        changed = True

    if changed:
        styles_xml_path.write_text(content, encoding='utf-8')

    # TOC entries reference the Hyperlink character style as well
    _ensure_hyperlink_style(styles_xml_path)

    return result
|
||||
|
||||
|
||||
def _ensure_hyperlink_style(styles_xml_path: Path) -> None:
|
||||
"""Ensure Hyperlink character style exists in styles.xml."""
|
||||
if not styles_xml_path.exists():
|
||||
return
|
||||
|
||||
content = styles_xml_path.read_text(encoding='utf-8')
|
||||
if 'w:styleId="Hyperlink"' in content:
|
||||
return
|
||||
|
||||
W_NS = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
|
||||
hyperlink_style = f'''<w:style w:type="character" w:styleId="Hyperlink" xmlns:w="{W_NS}">
|
||||
<w:name w:val="Hyperlink"/>
|
||||
<w:uiPriority w:val="99"/>
|
||||
<w:rPr>
|
||||
<w:color w:val="0563C1"/>
|
||||
<w:u w:val="single"/>
|
||||
</w:rPr>
|
||||
</w:style>'''
|
||||
|
||||
insert_point = content.rfind('</w:styles>')
|
||||
if insert_point != -1:
|
||||
content = content[:insert_point] + hyperlink_style + '\n' + content[insert_point:]
|
||||
styles_xml_path.write_text(content, encoding='utf-8')
|
||||
print("Added Hyperlink character style")
|
||||
|
||||
|
||||
def _insert_toc_placeholders(xml_content: str, entries: list = None, toc_style_mapping: dict = None) -> str:
    """Insert placeholder TOC entries and heading bookmarks into XML content.

    Uses lxml for XML manipulation. The function does two things:
      1. Gives each Heading paragraph a bookmark (so a TOC entry can link to
         it). A heading that already carries a ``_Toc...`` bookmark reuses
         it, which keeps links intact when the script is run again on its
         own output; headings with only other bookmarks are left untouched.
      2. Replaces everything between the TOC field's 'separate' paragraph
         and its 'end' paragraph with one paragraph per entry, containing a
         hyperlink + PAGEREF field when a bookmark is known for the entry
         text, or plain text otherwise.

    Args:
        xml_content: The XML content of document.xml
        entries: List of placeholder entries with 'level', 'text', 'page' keys
            (defaults to a single "Contents" entry)
        toc_style_mapping: Dictionary mapping level to style ID
            (defaults to {1: "9", 2: "11", 3: "12"})

    Returns:
        Modified XML content with bookmarks and TOC placeholders

    Raises:
        RuntimeError: If TOC structure cannot be found or is malformed
    """
    from lxml import etree

    if entries is None:
        entries = [{"level": 1, "text": "Contents", "page": "1"}]

    if toc_style_mapping is None:
        toc_style_mapping = {1: "9", 2: "11", 3: "12"}

    W = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"

    def w(local_name):
        """Qualify *local_name* with the WordprocessingML namespace."""
        return f'{{{W}}}{local_name}'

    # Parse XML
    root = etree.fromstring(xml_content.encode('utf-8'))

    # ── Step 1: Add bookmarks to Heading paragraphs ──
    # IDs/names already present in the document must not be handed out again.
    used_ids = set()
    used_names = set()
    for existing_bm in root.iter(w('bookmarkStart')):
        used_ids.add(existing_bm.get(w('id')))
        used_names.add(existing_bm.get(w('name')))

    bookmark_id_counter = 100000
    heading_bookmark_map_all = {}  # text → [bookmark names], in document order
    bookmarks_added = 0

    for para in root.iter(w('p')):
        # Find pStyle
        ppr = para.find(w('pPr'))
        if ppr is None:
            continue
        pstyle = ppr.find(w('pStyle'))
        if pstyle is None:
            continue
        style_val = pstyle.get(w('val'), '')
        if not re.match(r'Heading\d$', style_val):
            continue

        # Extract heading text
        heading_text = ''.join(
            t_elem.text for t_elem in para.iter(w('t')) if t_elem.text
        ).strip()
        if not heading_text:
            continue

        # Paragraph already bookmarked: never add a second bookmark, but
        # reuse a _Toc bookmark so the TOC entry can still link to it.
        existing = para.findall(w('bookmarkStart'))
        if existing:
            for bm in existing:
                existing_name = bm.get(w('name')) or ''
                if existing_name.startswith('_Toc'):
                    heading_bookmark_map_all.setdefault(heading_text, []).append(existing_name)
                    break
            continue

        # Generate a bookmark whose id and name are both unused
        while (str(bookmark_id_counter) in used_ids
               or f"_Toc{bookmark_id_counter}" in used_names):
            bookmark_id_counter += 1
        bm_name = f"_Toc{bookmark_id_counter}"
        bm_id_str = str(bookmark_id_counter)
        bookmark_id_counter += 1

        # Store mapping (support duplicate headings)
        heading_bookmark_map_all.setdefault(heading_text, []).append(bm_name)
        bookmarks_added += 1

        # bookmarkStart right after pPr, bookmarkEnd at end of paragraph
        bm_start = etree.Element(w('bookmarkStart'))
        bm_start.set(w('id'), bm_id_str)
        bm_start.set(w('name'), bm_name)

        bm_end = etree.Element(w('bookmarkEnd'))
        bm_end.set(w('id'), bm_id_str)

        para.insert(list(para).index(ppr) + 1, bm_start)
        para.append(bm_end)

    if bookmarks_added > 0:
        print(f"Added {bookmarks_added} bookmarks to Heading paragraphs")

    # ── Step 2: Find TOC field structure (begin → instrText → separate → end) ──
    toc_separate_para = None
    toc_end_para = None

    # Track field nesting to handle nested fields correctly
    field_stack = []
    toc_field_depth = None

    for fld_char in root.iter(w('fldChar')):
        fld_type = fld_char.get(w('fldCharType'))
        run = fld_char.getparent()

        if fld_type == 'begin':
            # Collect the instruction text from this run and the following
            # sibling runs, up to the next run that holds a fldChar.
            parent = run.getparent()
            instr_text = ''
            found_run = False
            for sibling in parent:
                if sibling is run:
                    found_run = True
                    it = sibling.find(w('instrText'))
                    if it is not None and it.text:
                        instr_text += it.text
                    continue
                if found_run and sibling.tag == w('r'):
                    it = sibling.find(w('instrText'))
                    if it is not None and it.text:
                        instr_text += it.text
                    if sibling.find(w('fldChar')) is not None:
                        break

            field_stack.append(instr_text.strip())
            if 'TOC' in instr_text and toc_field_depth is None:
                toc_field_depth = len(field_stack)

        elif fld_type == 'separate':
            if toc_field_depth is not None and len(field_stack) == toc_field_depth:
                toc_separate_para = run.getparent()

        elif fld_type == 'end':
            if toc_field_depth is not None and len(field_stack) == toc_field_depth:
                toc_end_para = run.getparent()
                break
            if field_stack:
                field_stack.pop()

    if toc_separate_para is None or toc_end_para is None:
        has_begin = root.find(f'.//{{{W}}}fldChar[@{{{W}}}fldCharType="begin"]') is not None
        has_separate = root.find(f'.//{{{W}}}fldChar[@{{{W}}}fldCharType="separate"]') is not None
        if not has_begin:
            raise RuntimeError(
                "TOC FAILED: No field structure found in document. "
                "Ensure the code includes a TableOfContents element."
            )
        elif not has_separate:
            raise RuntimeError(
                "TOC FAILED: TOC field has 'begin' but no 'separate' fldChar. "
                "Run _fix_fld_char_structure() first or check the docx-js version."
            )
        else:
            raise RuntimeError(
                "TOC FAILED: Field structure found but no TOC instrText detected. "
                "Ensure TableOfContents element generates a TOC \\o field code."
            )

    # ── Step 3: Remove everything between separate-para and end-para ──
    # The TOC paragraphs may be direct children of <w:body> or wrapped in <w:sdt><w:sdtContent>
    # NOTE(review): assumes the separate and end paragraphs are distinct
    # siblings of one container; list.index raises ValueError otherwise.
    toc_container = toc_separate_para.getparent()  # could be body or sdtContent
    container_children = list(toc_container)

    sep_idx = container_children.index(toc_separate_para)
    end_idx = container_children.index(toc_end_para)

    for elem in container_children[sep_idx + 1:end_idx]:
        toc_container.remove(elem)

    # ── Step 4: Build and insert placeholder paragraphs ──
    indent_mapping = {1: 0, 2: 360, 3: 720, 4: 1080, 5: 1440, 6: 1800}
    heading_occurrence_counter = {}

    insert_pos = list(toc_container).index(toc_end_para)

    for entry in entries:
        # Levels given as strings ("2") are accepted; anything unusable
        # falls back to level 1.
        try:
            level = int(entry.get('level', 1))
        except (TypeError, ValueError):
            level = 1
        text_raw = entry.get('text', '')
        page = entry.get('page', '1')

        toc_style = toc_style_mapping.get(level, toc_style_mapping.get(1, "9"))
        indent = indent_mapping.get(level, 0)

        # Resolve bookmark: the n-th entry with a given text links to the
        # n-th heading with that text.
        bm_name = ''
        if text_raw in heading_bookmark_map_all:
            occ = heading_occurrence_counter.get(text_raw, 0)
            bm_list = heading_bookmark_map_all[text_raw]
            if occ < len(bm_list):
                bm_name = bm_list[occ]
            heading_occurrence_counter[text_raw] = occ + 1

        # Build paragraph element
        p = etree.Element(w('p'))
        toc_container.insert(insert_pos, p)
        insert_pos += 1

        # pPr children in schema order: pStyle, tabs, spacing, ind
        ppr = etree.SubElement(p, w('pPr'))
        pstyle = etree.SubElement(ppr, w('pStyle'))
        pstyle.set(w('val'), str(toc_style))
        tabs = etree.SubElement(ppr, w('tabs'))
        tab = etree.SubElement(tabs, w('tab'))
        tab.set(w('val'), 'right')
        tab.set(w('leader'), 'dot')
        tab.set(w('pos'), '9026')
        spacing = etree.SubElement(ppr, w('spacing'))
        spacing.set(w('before'), '120')
        spacing.set(w('after'), '60')
        if indent > 0:
            ind = etree.SubElement(ppr, w('ind'))
            ind.set(w('left'), str(indent))

        if bm_name:
            # Linked entry: text, tab and a PAGEREF field, all inside a hyperlink
            hyperlink = etree.SubElement(p, w('hyperlink'))
            hyperlink.set(w('anchor'), bm_name)
            hyperlink.set(w('history'), '1')

            r_text = etree.SubElement(hyperlink, w('r'))
            rpr = etree.SubElement(r_text, w('rPr'))
            rstyle = etree.SubElement(rpr, w('rStyle'))
            rstyle.set(w('val'), 'Hyperlink')
            t = etree.SubElement(r_text, w('t'))
            t.text = text_raw

            r_tab = etree.SubElement(hyperlink, w('r'))
            etree.SubElement(r_tab, w('tab'))

            r_begin = etree.SubElement(hyperlink, w('r'))
            fc_begin = etree.SubElement(r_begin, w('fldChar'))
            fc_begin.set(w('fldCharType'), 'begin')

            r_instr = etree.SubElement(hyperlink, w('r'))
            instr = etree.SubElement(r_instr, w('instrText'))
            instr.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
            instr.text = f' PAGEREF {bm_name} \\h '

            r_sep = etree.SubElement(hyperlink, w('r'))
            fc_sep = etree.SubElement(r_sep, w('fldChar'))
            fc_sep.set(w('fldCharType'), 'separate')

            r_page = etree.SubElement(hyperlink, w('r'))
            t_page = etree.SubElement(r_page, w('t'))
            t_page.text = str(page)

            r_end = etree.SubElement(hyperlink, w('r'))
            fc_end = etree.SubElement(r_end, w('fldChar'))
            fc_end.set(w('fldCharType'), 'end')
        else:
            # No bookmark known for this text: plain text, tab, page number
            r_text = etree.SubElement(p, w('r'))
            t = etree.SubElement(r_text, w('t'))
            t.text = text_raw

            r_tab = etree.SubElement(p, w('r'))
            etree.SubElement(r_tab, w('tab'))

            r_page = etree.SubElement(p, w('r'))
            t_page = etree.SubElement(r_page, w('t'))
            t_page.text = str(page)

    print(f"Inserted {len(entries)} TOC placeholder entries")

    # Serialize back to string
    result = etree.tostring(root, xml_declaration=True, encoding='UTF-8', standalone=True)
    return result.decode('utf-8')
|
||||
|
||||
|
||||
def main():
    """Command-line entry point.

    Entries come from ``--entries`` (a JSON array of objects) when given;
    otherwise headings are auto-extracted from the document, with a single
    "Contents" entry as last resort. ``--auto`` is accepted for explicitness
    but auto-extraction is the default either way.

    Exits with status 1 on invalid JSON, on a malformed TOC structure, or on
    any other failure while reading or rewriting the document.
    """
    parser = argparse.ArgumentParser(
        description='Add placeholder entries to Table of Contents in a DOCX file (in-place)'
    )
    parser.add_argument('docx_file', help='DOCX file to modify (will be replaced)')
    parser.add_argument(
        '--auto', action='store_true',
        help='Auto-extract Heading 1-3 from the DOCX as TOC entries (recommended)'
    )
    parser.add_argument(
        '--entries',
        help='JSON string with placeholder entries: [{"level":1,"text":"Chapter 1","page":"1"}]'
    )

    args = parser.parse_args()

    # Determine entries
    entries = None
    if args.entries:
        try:
            entries = json.loads(args.entries)
        except json.JSONDecodeError as e:
            print(f"Error parsing entries JSON: {e}", file=sys.stderr)
            sys.exit(1)
        # Valid JSON of the wrong shape would otherwise fail much later with
        # an unrelated-looking attribute error.
        if not (isinstance(entries, list) and all(isinstance(item, dict) for item in entries)):
            print("Error parsing entries JSON: expected an array of objects", file=sys.stderr)
            sys.exit(1)

    try:
        if entries is None:
            # Default to auto mode — extract from document headings
            entries = _extract_headings_from_docx(args.docx_file)
            if entries:
                print(f"Auto-extracted {len(entries)} headings from document", file=sys.stderr)
            else:
                print("No headings found in document, using minimal placeholder", file=sys.stderr)
                entries = [{"level": 1, "text": "Contents", "page": "1"}]

        # Add placeholders
        add_toc_placeholders(args.docx_file, entries)
        print(f"Successfully added TOC placeholders to {args.docx_file}")
    except RuntimeError as e:
        # TOC structure errors — hard fail with exit code 1
        print(f"ERROR: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|
||||
1333
skills/docx/scripts/document.py
Executable file
1333
skills/docx/scripts/document.py
Executable file
File diff suppressed because it is too large
Load Diff
807
skills/docx/scripts/postcheck.py
Executable file
807
skills/docx/scripts/postcheck.py
Executable file
@@ -0,0 +1,807 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
postcheck.py — Document business rule self-check script
|
||||
|
||||
Unlike traditional OpenXML Schema validation, this script does not check XML legality.
|
||||
Instead, it checks document "visual quality" and "typesetting correctness" — issues visible to the human eye.
|
||||
|
||||
Usage:
|
||||
python3 postcheck.py output.docx [--fix] [--json]
|
||||
|
||||
Checks:
|
||||
1. Blank page detection — trailing/middle excess blank pages, double page breaks, consecutive empty paragraphs
|
||||
2. Line spacing consistency — whether body paragraph line spacing is uniform
|
||||
3. Table margins — whether cells have padding set
|
||||
4. Table pagination control — whether header rows have tblHeader set, data rows have cantSplit
|
||||
5. Image overflow — whether image width exceeds page usable area
|
||||
6. Font fallback — whether fonts are used that may be missing on target systems
|
||||
7. CJK indentation — whether Chinese body text has first-line indent (excluding table cells, lists, centered paragraphs)
|
||||
8. Heading level continuity — whether headings skip levels (H1→H3 skipping H2)
|
||||
9. Numbering continuity — whether numbered lists have gaps
|
||||
10. Cover separation — whether cover and body are in different sections
|
||||
11. ShadingType — whether SOLID is misused causing black cells
|
||||
12. TOC quality — whether TOC field exists, whether headings use standard Heading styles
|
||||
13. Image aspect ratio — whether images are stretched/distorted
|
||||
14. Document cleanliness — whether placeholder text, Markdown syntax, or draft expressions remain
|
||||
15. Report content quality — whether summary exists, whether titles are specific, whether vague conclusions are used
|
||||
"""
|
||||
|
||||
import zipfile
|
||||
import sys
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
from xml.etree import ElementTree as ET
|
||||
|
||||
# Prefix → namespace URI map handed to ElementTree find()/findall() calls in
# this file so that paths can use short prefixes such as "w:body" or "w:p".
NS = {
    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
    "pic": "http://schemas.openxmlformats.org/drawingml/2006/picture",
}
|
||||
|
||||
|
||||
class CheckResult:
    """Outcome of a single document check.

    Attributes are plain values: the check's name, whether it passed, a
    human-readable message, and a severity ("error" | "warning" | "info").
    """

    _FIELDS = ("name", "passed", "message", "severity")

    def __init__(self, name: str, passed: bool, message: str, severity: str = "warning"):
        self.name, self.passed = name, passed
        self.message, self.severity = message, severity

    def to_dict(self):
        """Return the result as a dict keyed by attribute name."""
        return {field: getattr(self, field) for field in self._FIELDS}

    def __str__(self):
        """Render as "<icon> [name] message"; the icon reflects pass/severity."""
        if self.passed:
            icon = "✅"
        elif self.severity == "error":
            icon = "❌"
        else:
            icon = "⚠️"
        return f"{icon} [{self.name}] {self.message}"
|
||||
|
||||
|
||||
def read_document_xml(docx_path: str) -> ET.Element:
    """Parse ``word/document.xml`` out of the DOCX archive and return its root element."""
    archive = zipfile.ZipFile(docx_path, "r")
    with archive:
        xml_bytes = archive.read("word/document.xml")
        return ET.fromstring(xml_bytes)
|
||||
|
||||
|
||||
def get_sections(root: ET.Element) -> list:
    """Split the body's direct children into sections delimited by ``sectPr``.

    Returns a list of ``{"children": [...], "sectPr": element}`` dicts in
    document order; an empty list when the document has no ``w:body``.
    A child that carries a ``w:pPr/w:sectPr`` closes its section and is itself
    the last entry of that section's children.
    """
    body = root.find(".//w:body", NS)
    if body is None:
        return []

    found = []
    pending = []

    def close_section(sect_pr):
        # Emit everything collected so far as one section and start afresh.
        nonlocal pending
        found.append({"children": pending, "sectPr": sect_pr})
        pending = []

    for node in body:
        local_name = node.tag.rsplit("}", 1)[-1]
        if local_name == "sectPr":
            # Body-level sectPr: closes the section without being one of its children.
            close_section(node)
            continue

        pending.append(node)
        # Section break stored inside a paragraph's properties.
        inner_sect = node.find(".//w:pPr/w:sectPr", NS)
        if inner_sect is not None:
            close_section(inner_sect)

    # Leftover children (content after the last sectPr seen in the loop) are
    # attributed to the body-level sectPr, when there is one.
    trailing_sect = body.find("w:sectPr", NS)
    if pending and trailing_sect is not None:
        close_section(trailing_sect)

    return found
|
||||
|
||||
|
||||
def _has_page_break(elem: ET.Element) -> bool:
    """Return True if *elem* has any descendant ``<w:br w:type="page"/>``."""
    type_attr = f"{{{NS['w']}}}type"
    return any(br.get(type_attr) == "page" for br in elem.findall(".//w:br", NS))


def _has_visible_text(elem: ET.Element) -> bool:
    """Return True if any descendant ``<w:t>`` of *elem* holds non-whitespace text."""
    return any(t.text and t.text.strip() for t in elem.findall(".//w:t", NS))


def check_blank_pages(root: ET.Element) -> CheckResult:
    """Detect constructs likely to produce blank pages.

    Four patterns are examined on the body's top-level paragraphs:

    1. the last paragraph contains a page break and no text;
    2. a run of five or more consecutive empty paragraphs;
    3. a section whose last paragraph has a page break, followed by a section
       whose ``w:type`` is ``nextPage`` (double break);
    4. paragraphs (other than the last one and section-ending ones) that hold
       only a page break.

    Patterns 1-3 are reported with severity "error", pattern 4 with "warning".

    Args:
        root: Root element of ``word/document.xml``.

    Returns:
        A "blank-pages" CheckResult; passed when no pattern matched or the
        body has no top-level paragraphs (including a missing ``w:body``).
    """
    body = root.find(".//w:body", NS)
    # A document without <w:body> has no paragraphs to inspect.
    paragraphs = body.findall("w:p", NS) if body is not None else []
    if not paragraphs:
        return CheckResult("blank-pages", True, "No paragraph content")

    w_ns = NS["w"]
    hard_issues = []  # reported as errors
    soft_issues = []  # reported as warnings

    # Check 1: the last paragraph only has a page break.
    last_p = paragraphs[-1]
    if _has_page_break(last_p) and not _has_visible_text(last_p):
        hard_issues.append("Trailing page break at document end may cause blank page")

    # Check 2: consecutive empty paragraphs (>=5 may form a visual blank page).
    current_run = 0
    max_empty = 0
    max_empty_pos = 0  # 0-based index of the last paragraph of the longest run
    for idx, p in enumerate(paragraphs):
        # Inline and floating (anchored) drawings both count as content.
        has_drawing = (
            p.find(".//wp:inline", NS) is not None
            or p.find(".//wp:anchor", NS) is not None
        )
        if _has_visible_text(p) or _has_page_break(p) or has_drawing:
            current_run = 0
            continue
        current_run += 1
        if current_run > max_empty:
            max_empty = current_run
            max_empty_pos = idx

    if max_empty >= 5:
        hard_issues.append(f"Found {max_empty} consecutive empty paragraphs (starting around paragraph {max_empty_pos - max_empty + 2}), may form visual blank page")

    # Check 3: page break at the end of a section followed by a nextPage section.
    # NOTE(review): only an explicit w:type="nextPage" is matched; a sectPr with
    # no w:type is not flagged — confirm whether the spec default should count.
    sections = get_sections(root)
    p_tag = f"{{{w_ns}}}p"
    for number, (section, following) in enumerate(zip(sections, sections[1:]), start=1):
        sec_children = section["children"]
        if not sec_children:
            continue
        last_child = sec_children[-1]
        if last_child.tag != p_tag or not _has_page_break(last_child):
            continue
        sect_type = following["sectPr"].find("w:type", NS)
        if sect_type is not None and sect_type.get(f"{{{w_ns}}}val") == "nextPage":
            # Reported once per boundary, however many breaks the paragraph holds.
            hard_issues.append(f"Section {number} ends with PageBreak and Section {number + 1} is type nextPage, double page break causes blank page")

    # Check 4: paragraph with only a page break and no text.
    # Section-ending paragraphs are excluded — they are normal section separators
    # (e.g. a cover page ending with an empty paragraph + page break).
    section_last_paras = {
        id(section["children"][-1]) for section in sections if section["children"]
    }
    empty_pb_count = sum(
        1
        for p in paragraphs[:-1]  # the last paragraph is handled by Check 1
        if id(p) not in section_last_paras
        and _has_page_break(p)
        and not _has_visible_text(p)
    )
    if empty_pb_count > 0:
        soft_issues.append(f"Found {empty_pb_count} empty paragraphs with PageBreak (suggest attaching PageBreak to content paragraphs)")

    if hard_issues:
        return CheckResult("blank-pages", False, "; ".join(hard_issues[:3]), "error")
    if soft_issues:
        return CheckResult("blank-pages", False, "; ".join(soft_issues[:3]), "warning")

    return CheckResult("blank-pages", True, "No blank page issues detected")
|
||||
|
||||
|
||||
def check_line_spacing(root: ET.Element) -> CheckResult:
    """Check that text-bearing, non-heading paragraphs share one ``w:line`` value.

    Paragraphs styled ``Heading*`` or ``Title`` and paragraphs without visible
    text are ignored. Paragraphs with no ``w:line`` attribute are grouped under
    "default". The check fails (warning) when more than 20% of the counted
    paragraphs differ from the most common value.
    """
    w_ns = NS["w"]
    body = root.find(".//w:body", NS)

    # line value (or "default") -> number of counted paragraphs using it
    tally = {}
    for para in body.findall(".//w:p", NS):
        props = para.find("w:pPr", NS)
        line_attr = None
        if props is not None:
            style_el = props.find("w:pStyle", NS)
            style_name = "" if style_el is None else style_el.get(f"{{{w_ns}}}val", "")
            if style_name == "Title" or style_name.startswith("Heading"):
                continue  # headings are not body text
            spacing_el = props.find("w:spacing", NS)
            if spacing_el is not None:
                line_attr = spacing_el.get(f"{{{w_ns}}}line")

        # Only paragraphs with visible text take part in the statistics.
        if all(not (t.text and t.text.strip()) for t in para.findall(".//w:t", NS)):
            continue

        bucket = line_attr or "default"
        tally[bucket] = tally.get(bucket, 0) + 1

    if not tally:
        return CheckResult("line-spacing", True, "No body paragraphs")

    # Most common spacing; ties resolve to the value seen first.
    dominant = max(tally, key=tally.get)
    if len(tally) == 1:
        return CheckResult("line-spacing", True, f"Line spacing uniform (line={dominant})")

    total = sum(tally.values())
    inconsistent = total - tally[dominant]

    if inconsistent / total > 0.2:
        return CheckResult(
            "line-spacing",
            False,
            f"Line spacing inconsistent: {dict(tally)}, {inconsistent}/{total} paragraphs differ from dominant spacing {dominant}",
            "warning",
        )

    return CheckResult("line-spacing", True, f"Line spacing mostly uniform (line={dominant}, {inconsistent} exceptions)")
|
||||
|
||||
|
||||
|
||||
def check_image_overflow(root: ET.Element) -> CheckResult:
    """Check whether any drawing is wider than the usable page width.

    Page width and left/right margins come from the body-level ``sectPr``
    (falling back to 11906 / 1701 / 1417 twips). A drawing is oversized when
    its ``wp:extent`` ``cx`` exceeds the usable width by more than 5%.
    """
    w_ns = NS["w"]
    page_width, margin_left, margin_right = 11906, 1701, 1417  # twips; A4 default width

    sect_pr = root.find(".//w:body/w:sectPr", NS)
    if sect_pr is not None:
        pg_sz = sect_pr.find("w:pgSz", NS)
        if pg_sz is not None:
            page_width = int(pg_sz.get(f"{{{w_ns}}}w", "11906"))
        pg_mar = sect_pr.find("w:pgMar", NS)
        if pg_mar is not None:
            margin_left = int(pg_mar.get(f"{{{w_ns}}}left", "1701"))
            margin_right = int(pg_mar.get(f"{{{w_ns}}}right", "1417"))

    # 1 twip = 635 EMU; extents are expressed in EMU.
    usable_width_emu = (page_width - margin_left - margin_right) * 635
    limit = usable_width_emu * 1.05  # 5% tolerance

    drawings = root.findall(".//wp:inline", NS) + root.findall(".//wp:anchor", NS)
    extents = (dwg.find("wp:extent", NS) for dwg in drawings)
    oversized = sum(
        1
        for extent in extents
        if extent is not None and int(extent.get("cx", "0")) > limit
    )

    if oversized > 0:
        return CheckResult(
            "image-overflow",
            False,
            f"{oversized} images exceed page usable area",
            "error",
        )

    return CheckResult(
        "image-overflow",
        True,
        f"All images within page width ({len(drawings)} images)",
    )
|
||||
|
||||
|
||||
def check_image_aspect_ratio(docx_path: str, root: ET.Element) -> CheckResult:
    """Check whether images are stretched/distorted (aspect ratio drift).

    Compares the pixel aspect ratio of each embedded image with the display
    aspect ratio set in ``wp:extent``. Drift >10% is considered distortion
    (pie charts becoming elliptical, radar charts becoming diamond-shaped, etc).

    Args:
        docx_path: Path of the DOCX archive holding the image parts.
        root: Root element of ``word/document.xml``.

    Returns:
        An "image-aspect-ratio" CheckResult:
        failed/"warning" when at least one image is distorted;
        passed/"info" when the archive cannot be read or when Pillow is not
        installed and images therefore could not be measured;
        passed otherwise. Images that Pillow cannot decode are skipped.
    """
    import io
    import posixpath

    try:
        from PIL import Image as pil_image
    except ImportError:
        # Without Pillow the pixel size is unknown; report that instead of
        # claiming every image is fine.
        pil_image = None

    rels_ns = "http://schemas.openxmlformats.org/package/2006/relationships"
    rels_path = "word/_rels/document.xml.rels"
    embed_attr = f"{{{NS['r']}}}embed"

    drawings = root.findall(".//wp:inline", NS) + root.findall(".//wp:anchor", NS)
    distorted = []
    unmeasured = 0  # embedded images present in the archive but not measurable

    try:
        with zipfile.ZipFile(docx_path, "r") as z:
            members = set(z.namelist())  # built once; membership tests are O(1)

            # Map relationship id -> image path inside the archive.
            rid_to_path = {}
            if rels_path in members:
                rels_root = ET.fromstring(z.read(rels_path))
                for rel in rels_root.findall(f"{{{rels_ns}}}Relationship"):
                    if "image" not in rel.get("Type", ""):
                        continue
                    target = rel.get("Target", "")
                    if target.startswith("/"):
                        img_path = target.lstrip("/")
                    else:
                        # Relative targets resolve against word/; normalise so
                        # "../media/x.png" maps to the real member name.
                        img_path = posixpath.normpath(posixpath.join("word", target))
                    rid_to_path[rel.get("Id", "")] = img_path

            for dwg in drawings:
                extent = dwg.find("wp:extent", NS)
                if extent is None:
                    continue
                display_cx = int(extent.get("cx", "0"))
                display_cy = int(extent.get("cy", "0"))
                if display_cx == 0 or display_cy == 0:
                    continue

                blip = dwg.find(".//a:blip", NS)
                if blip is None:
                    continue
                img_zip_path = rid_to_path.get(blip.get(embed_attr, ""))
                if not img_zip_path or img_zip_path not in members:
                    continue

                if pil_image is None:
                    unmeasured += 1
                    continue

                # Read the actual pixel dimensions. Best effort: formats Pillow
                # cannot decode are skipped rather than failing the whole check.
                try:
                    with pil_image.open(io.BytesIO(z.read(img_zip_path))) as pil_img:
                        orig_w, orig_h = pil_img.size
                except Exception:
                    continue
                if orig_w == 0 or orig_h == 0:
                    continue

                orig_ratio = orig_w / orig_h
                display_ratio = display_cx / display_cy
                drift = abs(orig_ratio - display_ratio) / orig_ratio

                if drift > 0.10:  # >10% distortion
                    distorted.append(
                        f"{img_zip_path.split('/')[-1]}: "
                        f"original {orig_w}×{orig_h} (ratio={orig_ratio:.2f}), "
                        f"display ratio={display_ratio:.2f}, distortion {drift * 100:.0f}%"
                    )
    except Exception:
        # Deliberate best-effort boundary: an unreadable archive or malformed
        # part downgrades the check to an informational pass.
        return CheckResult(
            "image-aspect-ratio", True,
            "Cannot check image aspect ratio (zip read error)",
            "info"
        )

    if distorted:
        shown = 3
        detail = "; ".join(distorted[:shown])
        if len(distorted) > shown:
            # Count only the entries that were not listed above.
            detail += f" ...and {len(distorted) - shown} more"
        return CheckResult(
            "image-aspect-ratio", False,
            f"{len(distorted)} images have aspect ratio distortion: {detail}",
            "warning"
        )

    if unmeasured:
        return CheckResult(
            "image-aspect-ratio", True,
            f"Cannot check image aspect ratio (Pillow not installed, {unmeasured} images unchecked)",
            "info"
        )

    return CheckResult(
        "image-aspect-ratio", True,
        f"All images have correct aspect ratio ({len(drawings)} images)"
    )
|
||||
|
||||
|
||||
def check_font_fallback(root: ET.Element) -> CheckResult:
    """Report fonts referenced by ``w:rFonts`` that are not in the safe-font list.

    Collects the ``ascii``, ``eastAsia``, ``hAnsi`` and ``cs`` attributes of
    every ``w:rPr/w:rFonts`` and fails (severity "info") when any collected
    name is missing from ``SAFE_FONTS``.
    """
    SAFE_FONTS = {
        # Chinese
        "宋体", "SimSun", "黑体", "SimHei", "微软雅黑", "Microsoft YaHei",
        "仿宋", "FangSong", "FangSong_GB2312", "楷体", "KaiTi",
        # English
        "Times New Roman", "Arial", "Calibri", "Helvetica",
        "Courier New", "Georgia", "Verdana", "Tahoma",
        # Universal
        "Symbol", "Wingdings",
    }

    slot_attrs = [
        f"{{{NS['w']}}}{slot}" for slot in ("ascii", "eastAsia", "hAnsi", "cs")
    ]

    fonts_used = set()
    for run_props in root.findall(".//w:rPr", NS):
        rfonts = run_props.find("w:rFonts", NS)
        if rfonts is None:
            continue
        # Keep only the slots that are set to a non-empty font name.
        fonts_used.update(name for name in map(rfonts.get, slot_attrs) if name)

    risky = sorted(fonts_used - SAFE_FONTS)
    if not risky:
        return CheckResult("font-fallback", True, f"All fonts are common system fonts ({len(fonts_used)} types)")

    return CheckResult(
        "font-fallback",
        False,
        f"Following fonts may be missing on target system: {', '.join(risky)}",
        "info",
    )
|
||||
|
||||
|
||||
|
||||
def check_heading_levels(root: ET.Element) -> CheckResult:
    """Check that consecutive ``HeadingN`` paragraphs never jump down by more than one level.

    With fewer than two headings the check is skipped (passes). At most the
    first five skips are listed in the warning message.
    """
    val_attr = f"{{{NS['w']}}}val"
    body = root.find(".//w:body", NS)

    levels = []
    for para in body.findall(".//w:p", NS):
        props = para.find("w:pPr", NS)
        style_el = props.find("w:pStyle", NS) if props is not None else None
        if style_el is None:
            continue
        hit = re.match(r"Heading(\d+)", style_el.get(val_attr, ""))
        if hit:
            levels.append(int(hit.group(1)))

    if len(levels) < 2:
        return CheckResult("heading-levels", True, "Too few headings, skipping check")

    # A skip is a step to a deeper level that passes over at least one level.
    skips = [
        f"H{prev}→H{cur}"
        for prev, cur in zip(levels, levels[1:])
        if cur - prev > 1
    ]

    if not skips:
        return CheckResult("heading-levels", True, f"Heading levels continuous ({len(levels)} headings)")

    return CheckResult(
        "heading-levels",
        False,
        f"Heading level skip: {', '.join(skips[:5])}",
        "warning",
    )
|
||||
|
||||
|
||||
# check_cover_separation removed — false positives on complex covers (>15 elements is normal)
|
||||
|
||||
|
||||
def check_shading_type(root: ET.Element) -> CheckResult:
    """Count ``w:shd`` elements whose ``w:val`` is "solid" and fail (error) if any exist."""
    val_attr = f"{{{NS['w']}}}val"
    solid_count = sum(
        1 for shd in root.findall(".//w:shd", NS) if shd.get(val_attr, "") == "solid"
    )

    if solid_count == 0:
        return CheckResult("shading-type", True, "No ShadingType.SOLID misuse found")

    return CheckResult(
        "shading-type",
        False,
        f"Found {solid_count} instances of ShadingType.SOLID (should be CLEAR), may cause black cells",
        "error",
    )
|
||||
|
||||
|
||||
|
||||
def check_toc(root: ET.Element, docx_path: str = "") -> CheckResult:
    """Check TOC quality: field existence, headings presence, outlineLvl, updateFields.

    Args:
        root: Root element of ``word/document.xml``.
        docx_path: Path of the DOCX archive; when given (and a TOC field
            exists) ``word/styles.xml`` and ``word/settings.xml`` are inspected
            as well.

    Returns:
        A "toc" CheckResult. Issues are joined into the message with these tags:
        TOC_FIELD_MISSING (TOC title paragraph but no TOC field),
        TOC_NO_HEADINGS (TOC field but no heading-styled paragraphs),
        TOC_OUTLINE_MISSING (a used HeadingN style lacks ``outlineLvl``),
        TOC_UPDATE_DISABLED (settings.xml does not enable ``updateFields``),
        TOC_CHECK_ERROR (styles/settings could not be read).
        Severity is "error" when any of the first three is present, otherwise
        "warning".
    """
    body = root.find(".//w:body", NS)
    if body is None:
        return CheckResult("toc", True, "Document body is empty, skipping TOC check", "info")

    w_ns = NS["w"]
    val_attr = f"{{{w_ns}}}val"
    # Only top-level paragraphs are considered for headings and the TOC title.
    paragraphs = [el for el in body if el.tag == f"{{{w_ns}}}p"]

    # --- Detect headings and their levels ---
    heading_re = re.compile(r"(?i)heading\s*(\d)")
    heading_count = 0
    heading_levels_used = set()  # e.g. {1, 2, 3}
    for p in paragraphs:
        ppr = p.find(f"{{{w_ns}}}pPr")
        ps = ppr.find(f"{{{w_ns}}}pStyle") if ppr is not None else None
        if ps is None:
            continue
        m = heading_re.match(ps.get(val_attr, ""))
        if m:
            heading_count += 1
            heading_levels_used.add(int(m.group(1)))

    # --- Detect TOC field ---
    # Match TOC as a whole word so that bookmark references such as
    # "PAGEREF _Toc123456" or 'HYPERLINK \l "_Toc..."' are not mistaken for a
    # TOC field. The document-wide scan also covers fields wrapped in w:sdt.
    toc_word = re.compile(r"\bTOC\b", re.IGNORECASE)
    has_toc = any(
        instr.text and toc_word.search(instr.text)
        for instr in root.findall(f".//{{{w_ns}}}instrText")
    )
    if not has_toc:
        has_toc = any(
            toc_word.search(fld.get(f"{{{w_ns}}}instr", ""))
            for fld in root.findall(f".//{{{w_ns}}}fldSimple")
        )

    issues = []

    # Check 1: Document has a "目录" / "目 录" / "Table of Contents" title but no TOC field
    toc_title_pattern = re.compile(r'^(?:目\s*录|table\s+of\s+contents|contents)$', re.IGNORECASE)
    has_toc_title = any(
        toc_title_pattern.match(
            "".join(t.text or "" for t in p.findall(f".//{{{w_ns}}}t")).strip()
        )
        for p in paragraphs
    )
    if has_toc_title and not has_toc:
        issues.append("TOC_FIELD_MISSING: document has a TOC title but no TOC field element — add TableOfContents in code")

    # Check 2: TOC field exists but no headings in document → TOC will be empty after update
    if has_toc and heading_count == 0:
        issues.append("TOC_NO_HEADINGS: TOC field exists but document has 0 Heading-styled paragraphs — TOC will be empty after update")

    # Check 3 & 4: Read styles.xml and settings.xml from DOCX (only when TOC exists)
    if has_toc and docx_path:
        try:
            with zipfile.ZipFile(docx_path, 'r') as zf:
                members = set(zf.namelist())

                # Check 3: outlineLvl missing in Heading styles
                if 'word/styles.xml' in members:
                    styles_root = ET.fromstring(zf.read('word/styles.xml'))
                    # First definition wins when a styleId is duplicated.
                    styles_by_id = {}
                    for style_elem in styles_root.findall(f".//{{{w_ns}}}style"):
                        styles_by_id.setdefault(style_elem.get(f"{{{w_ns}}}styleId", ""), style_elem)

                    missing_outline = []
                    for level in sorted(heading_levels_used):
                        style_id = f"Heading{level}"
                        style_elem = styles_by_id.get(style_id)
                        if style_elem is None:
                            continue  # style not defined in styles.xml; nothing to inspect
                        if style_elem.find(f"{{{w_ns}}}pPr/{{{w_ns}}}outlineLvl") is None:
                            missing_outline.append(style_id)

                    if missing_outline:
                        issues.append(
                            "TOC_OUTLINE_MISSING: %s style(s) missing outlineLvl — "
                            "Word TOC update won't find these headings. "
                            "Run add_toc_placeholders.py to fix" % ", ".join(missing_outline)
                        )

                # Check 4: updateFields not enabled
                if 'word/settings.xml' in members:
                    settings_root = ET.fromstring(zf.read('word/settings.xml'))
                    update_el = next(settings_root.iter(f"{{{w_ns}}}updateFields"), None)
                    # On/off elements accept "true"/"1"/"on"; an omitted w:val
                    # means the setting is enabled.
                    update_ok = (
                        update_el is not None
                        and update_el.get(val_attr, "true").strip().lower() in ("true", "1", "on")
                    )
                    if not update_ok:
                        issues.append(
                            "TOC_UPDATE_DISABLED: settings.xml missing updateFields=true — "
                            "Word won't prompt to update TOC on open. "
                            "Run add_toc_placeholders.py to fix"
                        )
        except Exception as e:
            # Best effort: any read/parse failure is surfaced as an issue
            # instead of aborting the whole check.
            issues.append(f"TOC_CHECK_ERROR: failed to read styles/settings from DOCX: {e}")

    if not issues:
        if has_toc:
            return CheckResult("toc", True, "TOC field present and update-ready")
        return CheckResult("toc", True, "No TOC needed")

    error_tags = ("FIELD_MISSING", "NO_HEADINGS", "OUTLINE_MISSING")
    severity = "error" if any(tag in issue for issue in issues for tag in error_tags) else "warning"
    return CheckResult("toc", False, "; ".join(issues[:5]), severity)
|
||||
|
||||
|
||||
|
||||
|
||||
def _max_numeric_attr(elements, tag: str, attr: str) -> int:
    """Return the largest all-digit *attr* value on *tag* descendants of *elements* (0 if none)."""
    best = 0
    for element in elements:
        for node in element.findall(f".//{tag}"):
            raw = node.get(attr)
            if raw and raw.isdigit():
                best = max(best, int(raw))
    return best


def check_cover_overflow(root: ET.Element) -> CheckResult:
    """Detect cover section issues: oversized fonts, excessive spacing, trailing empty content.

    The first section returned by ``get_sections`` is treated as the cover.

    Args:
        root: Root element of ``word/document.xml``.

    Returns:
        A "cover-overflow" CheckResult; failed with severity "error" when the
        first section has a ``w:sz`` above 88 half-points (44pt), a
        ``w:spacing`` ``before`` above 5000 twips, or ends with more than two
        empty paragraphs. Passed when no sections are found.
    """
    sections = get_sections(root)
    if not sections:
        return CheckResult("cover-overflow", True, "No sections found")

    w_ns = NS["w"]
    children = sections[0]["children"]
    issues = []

    # Check 1: Oversized font in cover section (w:sz is in half-points; 88 = 44pt)
    max_font_size = _max_numeric_attr(children, f"{{{w_ns}}}sz", f"{{{w_ns}}}val")
    if max_font_size > 88:
        issues.append(
            f"Cover has font size {max_font_size // 2}pt (>44pt max). "
            f"Use calcTitleLayout() for dynamic sizing"
        )

    # Check 2: Excessive spacing.before in cover section (> 5000 twips)
    max_spacing = _max_numeric_attr(children, f"{{{w_ns}}}spacing", f"{{{w_ns}}}before")
    if max_spacing > 5000:
        issues.append(
            f"Cover has spacing.before={max_spacing} twips (>5000 max). "
            f"Use calcCoverSpacing() for dynamic spacing"
        )

    # Check 3: Trailing empty paragraphs in cover section.
    # Walk backwards until the first non-paragraph or the first paragraph with text.
    trailing_empty = 0
    for child in reversed(children):
        if child.tag.rsplit("}", 1)[-1] != "p":
            break
        texts = child.findall(f".//{{{w_ns}}}t")
        if any(t.text and t.text.strip() for t in texts):
            break
        trailing_empty += 1

    if trailing_empty > 2:
        issues.append(
            f"Cover section ends with {trailing_empty} empty paragraphs (max 2 allowed) — "
            f"excessive empty paragraphs may cause blank page after cover"
        )

    if issues:
        return CheckResult(
            "cover-overflow", False,
            "; ".join(issues),
            "error"
        )

    return CheckResult("cover-overflow", True, "Cover section layout looks OK")
|
||||
|
||||
|
||||
def run_all_checks(docx_path: str) -> list[CheckResult]:
    """Run every check against the document and return the results in a fixed order.

    A check that raises is reported as a failed "error" result carrying the
    exception text, so one broken check never hides the others. Reading
    ``word/document.xml`` itself is not guarded: a failure there propagates
    to the caller.

    Args:
        docx_path: Path to the .docx file to check.

    Returns:
        One CheckResult per check: the root-only checks first, then "toc",
        then "image-aspect-ratio".
    """
    root = read_document_xml(docx_path)

    root_only_checks = [
        check_blank_pages,
        check_cover_overflow,
        check_line_spacing,
        check_image_overflow,
        check_font_fallback,
        check_heading_levels,
        check_shading_type,
    ]

    results = []
    for check_fn in root_only_checks:
        try:
            results.append(check_fn(root))
        except Exception as e:
            # Derive the same hyphenated name the check itself reports
            # (check_blank_pages -> "blank-pages") so consumers can key on it.
            check_name = check_fn.__name__.replace("check_", "", 1).replace("_", "-")
            results.append(CheckResult(check_name, False, f"Check error: {e}", "error"))

    # TOC check needs both root and docx_path
    try:
        results.append(check_toc(root, docx_path))
    except Exception as e:
        results.append(CheckResult("toc", False, f"Check error: {e}", "error"))

    # Image aspect ratio check needs both root and docx_path
    try:
        results.append(check_image_aspect_ratio(docx_path, root))
    except Exception as e:
        results.append(CheckResult("image-aspect-ratio", False, f"Check error: {e}", "error"))

    return results
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: run all checks on one DOCX and report the results.

    Exit codes: 2 if any check failed with severity "error"; 1 if the file
    does not exist, or if --strict is given and any check failed with severity
    "warning"; 0 otherwise.
    """
    import argparse

    parser = argparse.ArgumentParser(description="docx business rule self-check")
    parser.add_argument("docx_path", help="Path to the .docx file to check")
    parser.add_argument("--json", action="store_true", help="Output in JSON format")
    parser.add_argument("--strict", action="store_true", help="Treat warnings as failures")
    args = parser.parse_args()
    docx_path = args.docx_path

    if not Path(docx_path).exists():
        print(f"❌ File not found: {docx_path}")
        sys.exit(1)

    results = run_all_checks(docx_path)
    failed = [r for r in results if not r.passed]
    errors = sum(1 for r in failed if r.severity == "error")
    warnings = sum(1 for r in failed if r.severity == "warning")

    if args.json:
        payload = [r.to_dict() for r in results]
        print(json.dumps(payload, ensure_ascii=False, indent=2))
    else:
        print(f"\n📋 Document self-check report: {docx_path}\n")
        for r in results:
            print(f" {r}")

        total = len(results)
        passed = total - len(failed)

        print(f"\n {'─' * 50}")
        print(f" Passed {passed}/{total} | ❌ {errors} errors | ⚠️ {warnings} warnings\n")

    # Exit code: errors always fail; warnings fail only under --strict.
    if errors:
        sys.exit(2)
    if args.strict and warnings:
        sys.exit(1)
    sys.exit(0)


if __name__ == "__main__":
    main()
|
||||
3
skills/docx/scripts/templates/comments.xml
Executable file
3
skills/docx/scripts/templates/comments.xml
Executable file
@@ -0,0 +1,3 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:comments xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" 
xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14">
|
||||
</w:comments>
|
||||
3
skills/docx/scripts/templates/commentsExtended.xml
Executable file
3
skills/docx/scripts/templates/commentsExtended.xml
Executable file
@@ -0,0 +1,3 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w15:commentsEx xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" 
xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14">
|
||||
</w15:commentsEx>
|
||||
3
skills/docx/scripts/templates/commentsExtensible.xml
Executable file
3
skills/docx/scripts/templates/commentsExtensible.xml
Executable file
@@ -0,0 +1,3 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w16cex:commentsExtensible xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" 
xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" xmlns:cr="http://schemas.microsoft.com/office/comments/2020/reactions" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl cr w16du wp14">
|
||||
</w16cex:commentsExtensible>
|
||||
3
skills/docx/scripts/templates/commentsIds.xml
Executable file
3
skills/docx/scripts/templates/commentsIds.xml
Executable file
@@ -0,0 +1,3 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w16cid:commentsIds xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" 
xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14">
|
||||
</w16cid:commentsIds>
|
||||
3
skills/docx/scripts/templates/people.xml
Executable file
3
skills/docx/scripts/templates/people.xml
Executable file
@@ -0,0 +1,3 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w15:people xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml">
|
||||
</w15:people>
|
||||
374
skills/docx/scripts/utilities.py
Executable file
374
skills/docx/scripts/utilities.py
Executable file
@@ -0,0 +1,374 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Utilities for editing OOXML documents.
|
||||
|
||||
This module provides XMLEditor, a tool for manipulating XML files with support for
|
||||
line-number-based node finding and DOM manipulation. Each element is automatically
|
||||
annotated with its original line and column position during parsing.
|
||||
|
||||
Example usage:
|
||||
editor = XMLEditor("document.xml")
|
||||
|
||||
# Find node by line number or range
|
||||
elem = editor.get_node(tag="w:r", line_number=519)
|
||||
elem = editor.get_node(tag="w:p", line_number=range(100, 200))
|
||||
|
||||
# Find node by text content
|
||||
elem = editor.get_node(tag="w:p", contains="specific text")
|
||||
|
||||
# Find node by attributes
|
||||
elem = editor.get_node(tag="w:r", attrs={"w:id": "target"})
|
||||
|
||||
# Combine filters
|
||||
elem = editor.get_node(tag="w:p", line_number=range(1, 50), contains="text")
|
||||
|
||||
# Replace, insert, or manipulate
|
||||
new_nodes = editor.replace_node(elem, "<w:r><w:t>new text</w:t></w:r>")
|
||||
editor.insert_after(new_nodes[-1], "<w:r><w:t>more</w:t></w:r>")
|
||||
|
||||
# Save changes
|
||||
editor.save()
|
||||
"""
|
||||
|
||||
import html
|
||||
from pathlib import Path
|
||||
from typing import Optional, Union
|
||||
|
||||
import defusedxml.minidom
|
||||
import defusedxml.sax
|
||||
|
||||
|
||||
class XMLEditor:
    """
    Editor for manipulating OOXML XML files with line-number-based node finding.

    The file is parsed with a line-tracking SAX parser so that elements carry a
    ``parse_position`` (line, column) tuple. ``get_node`` uses that position to
    look elements up by their line number in the original file.

    Attributes:
        xml_path: Path to the XML file being edited
        encoding: Encoding used when saving ('ascii' or 'utf-8')
        dom: Parsed DOM tree with parse_position attributes on elements
    """

    def __init__(self, xml_path):
        """
        Initialize with path to XML file and parse with line number tracking.

        Args:
            xml_path: Path to XML file to edit (str or Path)

        Raises:
            ValueError: If the XML file does not exist
        """
        self.xml_path = Path(xml_path)
        if not self.xml_path.exists():
            raise ValueError(f"XML file not found: {xml_path}")

        # Only the first 200 bytes are inspected: the XML declaration, if
        # present, sits at the very start of the file.
        with open(self.xml_path, "rb") as f:
            header = f.read(200).decode("utf-8", errors="ignore")

        # Normalise case and quote style so that encoding='ASCII' is detected
        # the same way as encoding="ascii".
        declaration = header.lower().replace("'", '"')
        self.encoding = "ascii" if 'encoding="ascii"' in declaration else "utf-8"

        parser = _create_line_tracking_parser()
        self.dom = defusedxml.minidom.parse(str(self.xml_path), parser)

    def get_node(
        self,
        tag: str,
        attrs: Optional[dict[str, str]] = None,
        line_number: Optional[Union[int, range]] = None,
        contains: Optional[str] = None,
    ):
        """
        Get a DOM element by tag and identifier.

        Every filter that is supplied must pass for an element to match, and
        exactly one element must match.

        Args:
            tag: The XML tag name (e.g., "w:del", "w:ins", "w:r")
            attrs: Dictionary of attribute name-value pairs to match (e.g., {"w:id": "1"})
            line_number: Line number (int) or line range (range) in original XML file (1-indexed)
            contains: Text string that must appear in the element's text content.
                HTML/XML entities in the search string (e.g. "&#8220;") are
                unescaped before comparing, so both entity notation and the
                literal Unicode character are accepted.

        Returns:
            The single matching DOM element

        Raises:
            ValueError: If node not found or multiple matches found

        Example:
            elem = editor.get_node(tag="w:r", line_number=519)
            elem = editor.get_node(tag="w:r", line_number=range(100, 200))
            elem = editor.get_node(tag="w:del", attrs={"w:id": "1"})
            elem = editor.get_node(tag="w:p", attrs={"w14:paraId": "12345678"})
            elem = editor.get_node(tag="w:commentRangeStart", attrs={"w:id": "0"})
            elem = editor.get_node(tag="w:p", contains="specific text")
            elem = editor.get_node(tag="w:t", contains="&#8220;Agreement")  # Entity notation
        """
        # The search string does not depend on the element, so normalise it once.
        normalized_contains = None if contains is None else html.unescape(contains)

        matches = []
        for elem in self.dom.getElementsByTagName(tag):
            # Check line_number filter
            if line_number is not None:
                # Elements without a recorded position get line None, which
                # never equals an int and is never inside a range.
                elem_line = getattr(elem, "parse_position", (None,))[0]
                if isinstance(line_number, range):
                    if elem_line not in line_number:
                        continue
                elif elem_line != line_number:
                    continue

            # Check attrs filter
            if attrs is not None and not all(
                elem.getAttribute(attr_name) == attr_value
                for attr_name, attr_value in attrs.items()
            ):
                continue

            # Check contains filter
            if (
                normalized_contains is not None
                and normalized_contains not in self._get_element_text(elem)
            ):
                continue

            # If all applicable filters passed, this is a match
            matches.append(elem)

        if not matches:
            # Build descriptive error message
            filters = []
            if line_number is not None:
                line_str = (
                    f"lines {line_number.start}-{line_number.stop - 1}"
                    if isinstance(line_number, range)
                    else f"line {line_number}"
                )
                filters.append(f"at {line_str}")
            if attrs is not None:
                filters.append(f"with attributes {attrs}")
            if contains is not None:
                filters.append(f"containing '{contains}'")

            filter_desc = " ".join(filters) if filters else ""
            base_msg = f"Node not found: <{tag}> {filter_desc}".strip()

            # Choose the hint with the same "is not None" tests used above, so
            # falsy-but-supplied filters (line 0, an empty range, "") still get
            # the hint that belongs to them.
            if contains is not None:
                hint = "Text may be split across elements or use different wording."
            elif line_number is not None:
                hint = "Line numbers may have changed if document was modified."
            elif attrs is not None:
                hint = "Verify attribute values are correct."
            else:
                hint = "Try adding filters (attrs, line_number, or contains)."

            raise ValueError(f"{base_msg}. {hint}")

        if len(matches) > 1:
            raise ValueError(
                f"Multiple nodes found: <{tag}>. "
                f"Add more filters (attrs, line_number, or contains) to narrow the search."
            )
        return matches[0]

    def _get_element_text(self, elem):
        """
        Recursively extract all text content from an element.

        Skips text nodes that contain only whitespace (spaces, tabs, newlines),
        which typically represent XML formatting rather than document content.

        Args:
            elem: DOM element to extract text from

        Returns:
            str: Concatenated text from all non-whitespace text nodes within the element
        """
        text_parts = []
        for node in elem.childNodes:
            if node.nodeType == node.TEXT_NODE:
                # Skip whitespace-only text nodes (XML formatting)
                if node.data.strip():
                    text_parts.append(node.data)
            elif node.nodeType == node.ELEMENT_NODE:
                text_parts.append(self._get_element_text(node))
        return "".join(text_parts)

    def replace_node(self, elem, new_content):
        """
        Replace a DOM element with new XML content.

        Args:
            elem: DOM element to replace
            new_content: String containing XML to replace the node with

        Returns:
            list: All inserted nodes, in document order

        Example:
            new_nodes = editor.replace_node(old_elem, "<w:r><w:t>text</w:t></w:r>")
        """
        parent = elem.parentNode
        nodes = self._parse_fragment(new_content)
        # Insert the replacements in front of the old element, then drop it.
        for node in nodes:
            parent.insertBefore(node, elem)
        parent.removeChild(elem)
        return nodes

    def insert_after(self, elem, xml_content):
        """
        Insert XML content after a DOM element.

        Args:
            elem: DOM element to insert after
            xml_content: String containing XML to insert

        Returns:
            list: All inserted nodes, in document order

        Example:
            new_nodes = editor.insert_after(elem, "<w:r><w:t>text</w:t></w:r>")
        """
        parent = elem.parentNode
        next_sibling = elem.nextSibling
        nodes = self._parse_fragment(xml_content)
        for node in nodes:
            # insertBefore with a reference node of None appends, which covers
            # the case where elem is the last child of its parent.
            parent.insertBefore(node, next_sibling)
        return nodes

    def insert_before(self, elem, xml_content):
        """
        Insert XML content before a DOM element.

        Args:
            elem: DOM element to insert before
            xml_content: String containing XML to insert

        Returns:
            list: All inserted nodes, in document order

        Example:
            new_nodes = editor.insert_before(elem, "<w:r><w:t>text</w:t></w:r>")
        """
        parent = elem.parentNode
        nodes = self._parse_fragment(xml_content)
        for node in nodes:
            parent.insertBefore(node, elem)
        return nodes

    def append_to(self, elem, xml_content):
        """
        Append XML content as a child of a DOM element.

        Args:
            elem: DOM element to append to
            xml_content: String containing XML to append

        Returns:
            list: All inserted nodes, in document order

        Example:
            new_nodes = editor.append_to(elem, "<w:r><w:t>text</w:t></w:r>")
        """
        nodes = self._parse_fragment(xml_content)
        for node in nodes:
            elem.appendChild(node)
        return nodes

    def get_next_rid(self):
        """
        Get the next available rId for relationships files.

        Scans every <Relationship> element, takes the largest numeric suffix
        among Id values of the form "rId<number>", and returns the next one.
        Ids that do not follow that form are ignored.

        Returns:
            str: "rId<N>", where N is one more than the largest number found
            (or "rId1" if none was found)
        """
        max_id = 0
        for rel_elem in self.dom.getElementsByTagName("Relationship"):
            rel_id = rel_elem.getAttribute("Id")
            if rel_id.startswith("rId"):
                try:
                    max_id = max(max_id, int(rel_id[3:]))
                except ValueError:
                    # Non-numeric suffix (e.g. "rIdFoo"): not part of the sequence.
                    pass
        return f"rId{max_id + 1}"

    def save(self):
        """
        Save the edited XML back to the file.

        Serializes the DOM tree and writes it back to the original file path,
        using the encoding chosen at load time (ascii or utf-8).
        """
        content = self.dom.toxml(encoding=self.encoding)
        self.xml_path.write_bytes(content)

    def _parse_fragment(self, xml_content):
        """
        Parse XML fragment and return list of imported nodes.

        The fragment is wrapped in a temporary <root> element that repeats the
        namespace declarations of this document's root element, so prefixed
        names in the fragment (e.g. "w:r") resolve while parsing.

        Args:
            xml_content: String containing XML fragment

        Returns:
            list: Every top-level node of the fragment (elements and any text
            between them), imported into this document

        Raises:
            AssertionError: If fragment contains no element nodes
        """
        # Extract namespace declarations from the root document element
        root_elem = self.dom.documentElement
        namespaces = []
        if root_elem and root_elem.attributes:
            for i in range(root_elem.attributes.length):
                attr = root_elem.attributes.item(i)
                if attr.name.startswith("xmlns"):  # type: ignore
                    # Escape the value so that '&' or '"' inside a namespace
                    # URI cannot produce a malformed wrapper element.
                    value = html.escape(attr.value, quote=True)  # type: ignore
                    namespaces.append(f'{attr.name}="{value}"')  # type: ignore

        ns_decl = " ".join(namespaces)
        wrapper = f"<root {ns_decl}>{xml_content}</root>"
        fragment_doc = defusedxml.minidom.parseString(wrapper)
        nodes = [
            self.dom.importNode(child, deep=True)
            for child in fragment_doc.documentElement.childNodes  # type: ignore
        ]
        # Raised explicitly rather than with an assert statement so the check
        # still runs when Python is started with -O.
        if not any(n.nodeType == n.ELEMENT_NODE for n in nodes):
            raise AssertionError("Fragment must contain at least one element")
        return nodes
|
||||
|
||||
|
||||
def _create_line_tracking_parser():
    """
    Build a SAX parser that records where each element started.

    The parser's setContentHandler is replaced with a version that first wraps
    the handler's startElementNS. After the handler's own startElementNS has
    run, the wrapper reads the current line and column from the parser's
    underlying ``_parser`` object and stores them on the element at the top of
    the handler's element stack as ``parse_position = (line, column)``.

    Returns:
        The SAX parser created by defusedxml.sax.make_parser(), with the
        patched setContentHandler installed
    """
    sax_parser = defusedxml.sax.make_parser()
    # Keep the genuine registration method; it is still needed after patching.
    register_handler = sax_parser.setContentHandler

    def _install_tracking(dom_handler):
        build_element = dom_handler.startElementNS

        def _start_and_tag(name, tagName, attrs):
            # Let the handler process the start tag first, then tag whatever
            # element it left on top of its stack.
            build_element(name, tagName, attrs)
            newest = dom_handler.elementStack[-1]
            newest.parse_position = (
                sax_parser._parser.CurrentLineNumber,  # type: ignore
                sax_parser._parser.CurrentColumnNumber,  # type: ignore
            )

        dom_handler.startElementNS = _start_and_tag
        register_handler(dom_handler)

    sax_parser.setContentHandler = _install_tracking  # type: ignore
    return sax_parser
|
||||
Reference in New Issue
Block a user