8.7 KiB
Executable File
8.7 KiB
Executable File
OOXML Editing Reference — Document Library API
Important: Read this entire document before editing. This is the primary reference for modifying existing .docx files.
Document Library (Python) — Primary API
Use the Document class from "$DOCX_SCRIPTS/document.py" for all edits, tracked changes, and comments. It handles infrastructure automatically (people.xml, RSIDs, settings.xml, comments, relationships, content types).
Working with Unicode and Entities:
- Both entity notation and Unicode work for search:
contains="“Company"≡contains="\u201cCompany" - Both work for replacement too
Setup
# Find the docx skill root
find /mnt/skills -name "document.py" -path "*/docx/scripts/*" 2>/dev/null | head -1
# Skill root = parent of scripts/
# Run with PYTHONPATH
PYTHONPATH=/mnt/skills/docx python your_script.py
from scripts.document import Document, DocxXMLEditor
# Basic init (auto-creates temp copy, sets up infrastructure)
doc = Document('unpacked')
# Custom author/initials
doc = Document('unpacked', author="John Doe", initials="JD")
# Enable tracked changes
doc = Document('unpacked', track_revisions=True)
# Custom RSID (auto-generated if omitted)
doc = Document('unpacked', rsid="07DC5ECB")
Finding Nodes
# By text
node = doc["word/document.xml"].get_node(tag="w:p", contains="specific text")
# By line range
para = doc["word/document.xml"].get_node(tag="w:p", line_number=range(100, 150))
# By attributes
node = doc["word/document.xml"].get_node(tag="w:del", attrs={"w:id": "1"})
# By exact line number
para = doc["word/document.xml"].get_node(tag="w:p", line_number=42)
# Combined filters (disambiguation)
node = doc["word/document.xml"].get_node(tag="w:r", contains="Section", line_number=range(2400, 2500))
Tracked Changes
CRITICAL: Only mark text that actually changes. Keep unchanged text outside <w:del>/<w:ins> tags.
Method Selection:
- Regular text →
replace_node()with<w:del>/<w:ins>, orsuggest_deletion()for whole elements - Partially modify another's tracked change →
replace_node()to nest changes - Reject another's insertion →
revert_insertion()(NOTsuggest_deletion()) - Reject another's deletion →
revert_deletion()
# Change one word: "monthly" → "quarterly"
node = doc["word/document.xml"].get_node(tag="w:r", contains="The report is monthly")
rpr = tags[0].toxml() if (tags := node.getElementsByTagName("w:rPr")) else ""
replacement = f'<w:r w:rsidR="00AB12CD">{rpr}<w:t>The report is </w:t></w:r><w:del><w:r>{rpr}<w:delText>monthly</w:delText></w:r></w:del><w:ins><w:r>{rpr}<w:t>quarterly</w:t></w:r></w:ins>'
doc["word/document.xml"].replace_node(node, replacement)
# Delete entire run
node = doc["word/document.xml"].get_node(tag="w:r", contains="text to delete")
doc["word/document.xml"].suggest_deletion(node)
# Delete entire paragraph
para = doc["word/document.xml"].get_node(tag="w:p", contains="paragraph to delete")
doc["word/document.xml"].suggest_deletion(para)
# Insert new content after a node
node = doc["word/document.xml"].get_node(tag="w:r", contains="existing text")
doc["word/document.xml"].insert_after(node, '<w:ins><w:r><w:t>new text</w:t></w:r></w:ins>')
# Add new numbered list item
target_para = doc["word/document.xml"].get_node(tag="w:p", contains="existing list item")
pPr = tags[0].toxml() if (tags := target_para.getElementsByTagName("w:pPr")) else ""
new_item = f'<w:p>{pPr}<w:r><w:t>New item</w:t></w:r></w:p>'
tracked_para = DocxXMLEditor.suggest_paragraph(new_item)
doc["word/document.xml"].insert_after(target_para, tracked_para)
Handling Other Authors' Changes
# Partially delete another author's insertion
node = doc["word/document.xml"].get_node(tag="w:ins", attrs={"w:id": "5"})
replacement = '''<w:ins w:author="Jane Smith" w:date="2025-01-15T10:00:00Z">
<w:r><w:t>quarterly </w:t></w:r>
<w:del><w:r><w:delText>financial </w:delText></w:r></w:del>
<w:r><w:t>report</w:t></w:r>
</w:ins>'''
doc["word/document.xml"].replace_node(node, replacement)
# Reject insertion (wraps in deletion)
ins = doc["word/document.xml"].get_node(tag="w:ins", attrs={"w:id": "5"})
doc["word/document.xml"].revert_insertion(ins)
# Reject deletion (restores deleted content)
del_elem = doc["word/document.xml"].get_node(tag="w:del", attrs={"w:id": "3"})
doc["word/document.xml"].revert_deletion(del_elem)
Comments
doc = Document('unpacked', author="Z.ai", initials="Z")
# Comment on a range
start = doc["word/document.xml"].get_node(tag="w:del", attrs={"w:id": "1"})
end = doc["word/document.xml"].get_node(tag="w:ins", attrs={"w:id": "2"})
doc.add_comment(start=start, end=end, text="Explanation of this change")
# Comment on paragraph
para = doc["word/document.xml"].get_node(tag="w:p", contains="text")
doc.add_comment(start=para, end=para, text="Comment here")
# Comment on newly created tracked change
node = doc["word/document.xml"].get_node(tag="w:r", contains="old")
new_nodes = doc["word/document.xml"].replace_node(
node, '<w:del><w:r><w:delText>old</w:delText></w:r></w:del><w:ins><w:r><w:t>new</w:t></w:r></w:ins>')
doc.add_comment(start=new_nodes[0], end=new_nodes[1], text="Changed per requirements")
# Reply to comment
doc.reply_to_comment(parent_comment_id=0, text="I agree")
Images
from PIL import Image
import shutil, os
doc = Document('unpacked')
media_dir = os.path.join(doc.unpacked_path, 'word/media')
os.makedirs(media_dir, exist_ok=True)
shutil.copy('image.png', os.path.join(media_dir, 'image1.png'))
img = Image.open(os.path.join(media_dir, 'image1.png'))
width_emus = int(6.5 * 914400) # 6.5" usable width
height_emus = int(width_emus * img.size[1] / img.size[0])
# Add relationship
rels_editor = doc['word/_rels/document.xml.rels']
next_rid = rels_editor.get_next_rid()
rels_editor.append_to(rels_editor.dom.documentElement,
f'<Relationship Id="{next_rid}" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="media/image1.png"/>')
doc['[Content_Types].xml'].append_to(doc['[Content_Types].xml'].dom.documentElement,
'<Default Extension="png" ContentType="image/png"/>')
# Insert
node = doc["word/document.xml"].get_node(tag="w:p", line_number=100)
doc["word/document.xml"].insert_after(node, f'''<w:p><w:r><w:drawing>
<wp:inline distT="0" distB="0" distL="0" distR="0">
<wp:extent cx="{width_emus}" cy="{height_emus}"/>
<wp:docPr id="1" name="Picture 1"/>
<a:graphic xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">
<a:graphicData uri="http://schemas.openxmlformats.org/drawingml/2006/picture">
<pic:pic xmlns:pic="http://schemas.openxmlformats.org/drawingml/2006/picture">
<pic:nvPicPr><pic:cNvPr id="1" name="image1.png"/><pic:cNvPicPr/></pic:nvPicPr>
<pic:blipFill><a:blip r:embed="{next_rid}"/><a:stretch><a:fillRect/></a:stretch></pic:blipFill>
<pic:spPr><a:xfrm><a:ext cx="{width_emus}" cy="{height_emus}"/></a:xfrm><a:prstGeom prst="rect"><a:avLst/></a:prstGeom></pic:spPr>
</pic:pic>
</a:graphicData>
</a:graphic>
</wp:inline>
</w:drawing></w:r></w:p>''')
Saving
doc.save() # Validates + copies back to original dir
doc.save('modified-unpacked') # Save to different location
doc.save(validate=False) # Skip validation (debug only)
Direct DOM Manipulation
editor = doc["word/document.xml"]
node = doc["word/document.xml"].get_node(tag="w:p", line_number=5)
parent = node.parentNode
parent.removeChild(node)
# General replacement (without tracked changes)
old = doc["word/document.xml"].get_node(tag="w:p", contains="original")
doc["word/document.xml"].replace_node(old, "<w:p><w:r><w:t>replacement</w:t></w:r></w:p>")
# Chained insertions
node = doc["word/document.xml"].get_node(tag="w:r", line_number=100)
nodes = doc["word/document.xml"].insert_after(node, "<w:r><w:t>A</w:t></w:r>")
nodes = doc["word/document.xml"].insert_after(nodes[-1], "<w:r><w:t>B</w:t></w:r>")
Schema Compliance Quick Reference
- Element ordering in
<w:pPr>:<w:pStyle>→<w:numPr>→<w:spacing>→<w:ind>→<w:jc> - Whitespace:
xml:space='preserve'on<w:t>with leading/trailing spaces - RSIDs: 8-digit hex only (0-9, A-F)
- trackRevisions: Add
<w:trackRevisions/>after<w:proofState>in settings.xml <w:del>/<w:ins>placement: At paragraph level, containing complete<w:r>elements. Never nest inside<w:r>.
Validation Rules
The validator ensures document text matches the original after reverting GLM's changes:
- Never modify text inside another author's
<w:ins>or<w:del>tags - Use nested deletions to remove another author's insertions
- Every edit must be tracked with
<w:ins>or<w:del>tags