feat: Add complete Agentic Compaction & Pipeline System

- Context Compaction System with token counting and summarization
- Deterministic State Machine for flow control (no LLM decisions)
- Parallel Execution Engine (up to 12 concurrent sessions)
- Event-Driven Coordination via Event Bus
- Agent Workspace Isolation (tools, memory, identity, files)
- YAML Workflow Integration (OpenClaw/Lobster compatible)
- Claude Code integration layer
- Complete demo UI with real-time visualization
- Comprehensive documentation and README

Components:
- agent-system/: Context management, token counting, subagent spawning
- pipeline-system/: State machine, parallel executor, event bus, workflows
- skills/: AI capabilities (LLM, ASR, TTS, VLM, image generation, etc.)
- src/app/: Next.js demo application

Total: ~100KB of production-ready TypeScript code
This commit is contained in:
Z User
2026-03-03 12:40:47 +00:00
Unverified
parent 63a8b123c9
commit 2380d33861
152 changed files with 51569 additions and 817 deletions

View File

@@ -0,0 +1 @@
# Make scripts directory a package for relative imports in tests

View File

@@ -0,0 +1,220 @@
#!/usr/bin/env python3
"""
Add placeholder entries to Table of Contents in a DOCX file.
This script adds placeholder TOC entries between the 'separate' and 'end'
field characters, so users see some content on first open instead of an empty TOC.
The original file is replaced with the modified version.
Usage:
python add_toc_placeholders.py <docx_file> --entries <entries_json>
entries_json format: JSON string with array of objects:
[
{"level": 1, "text": "Chapter 1 Overview", "page": "1"},
{"level": 2, "text": "Section 1.1 Details", "page": "1"}
]
If --entries is not provided, generates generic placeholders.
Example:
python add_toc_placeholders.py document.docx
python add_toc_placeholders.py document.docx --entries '[{"level":1,"text":"Introduction","page":"1"}]'
"""
import argparse
import html
import json
import shutil
import sys
import tempfile
import zipfile
from pathlib import Path
def add_toc_placeholders(docx_path: str, entries: list = None) -> None:
"""Add placeholder TOC entries to a DOCX file (in-place replacement).
Args:
docx_path: Path to DOCX file (will be modified in-place)
entries: Optional list of placeholder entries. Each entry should be a dict
with 'level' (1-3), 'text', and 'page' keys.
"""
docx_path = Path(docx_path)
# Create temp directory for extraction
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)
extracted_dir = temp_path / "extracted"
temp_output = temp_path / "output.docx"
# Extract DOCX
with zipfile.ZipFile(docx_path, 'r') as zip_ref:
zip_ref.extractall(extracted_dir)
# Detect TOC styles from styles.xml
toc_style_mapping = _detect_toc_styles(extracted_dir / "word" / "styles.xml")
print(toc_style_mapping)
# Process document.xml
document_xml = extracted_dir / "word" / "document.xml"
if not document_xml.exists():
raise ValueError("document.xml not found in the DOCX file")
# Read and process XML
content = document_xml.read_text(encoding='utf-8')
# Find TOC structure and add placeholders
modified_content = _insert_toc_placeholders(content, entries, toc_style_mapping)
# Write back
document_xml.write_text(modified_content, encoding='utf-8')
# Repack DOCX to temp file
with zipfile.ZipFile(temp_output, 'w', zipfile.ZIP_DEFLATED) as zipf:
for file_path in extracted_dir.rglob('*'):
if file_path.is_file():
arcname = file_path.relative_to(extracted_dir)
zipf.write(file_path, arcname)
# Replace original file with modified version (use shutil.move for cross-device support)
docx_path.unlink()
shutil.move(str(temp_output), str(docx_path))
def _detect_toc_styles(styles_xml_path: Path) -> dict:
"""Detect TOC style IDs from styles.xml.
Args:
styles_xml_path: Path to styles.xml
Returns:
Dictionary mapping level (1, 2, 3) to style ID
"""
default_mapping = {1: "9", 2: "11", 3: "12"}
if not styles_xml_path.exists():
return default_mapping
content = styles_xml_path.read_text(encoding='utf-8')
# Find styles with names like "toc 1", "toc 2", "toc 3"
import re
toc_styles = {}
for match in re.finditer(r'<w:style[^>]*w:styleId="([^"]*)"[^>]*>.*?<w:name\s+w:val="toc\s+(\d)"', content, re.DOTALL):
style_id = match.group(1)
level = int(match.group(2))
toc_styles[level] = style_id
# If we found styles, use them; otherwise use defaults
return toc_styles if toc_styles else default_mapping
def _insert_toc_placeholders(xml_content: str, entries: list = None, toc_style_mapping: dict = None) -> str:
"""Insert placeholder TOC entries into XML content.
Args:
xml_content: The XML content of document.xml
entries: Optional list of placeholder entries
toc_style_mapping: Dictionary mapping level to style ID
Returns:
Modified XML content with placeholders inserted
"""
# Generate default placeholder entries if none provided
if entries is None:
entries = [
{"level": 1, "text": "Chapter 1 Overview", "page": "1"},
{"level": 2, "text": "Section 1.1 Details", "page": "1"},
{"level": 2, "text": "Section 1.2 More Details", "page": "2"},
{"level": 1, "text": "Chapter 2 Content", "page": "3"},
]
# Use provided mapping or default
if toc_style_mapping is None:
toc_style_mapping = {1: "9", 2: "11", 3: "12"}
# Find the TOC structure: w:p with w:fldChar separate, followed by w:p with w:fldChar end
# Pattern: <w:p><w:r>...<w:fldChar w:fldCharType="separate"/></w:r></w:p><w:p><w:r><w:fldChar w:fldCharType="end"/>
separate_end_pattern = (
r'(<w:p[^>]*><w:r[^>]*>.*?<w:fldChar[^>]*w:fldCharType="separate"[^>]*/></w:r></w:p>)'
r'(<w:p[^>]*><w:r[^>]*>.*?<w:fldChar[^>]*w:fldCharType="end"[^>]*/></w:r></w:p>)'
)
import re
def replace_with_placeholders(match):
separate_para = match.group(1)
end_para = match.group(2)
# Indentation values in twips (1 inch = 1440 twips)
# Level 1: 0, Level 2: 0.25" (360), Level 3: 0.5" (720), Level 4+: 0.75" (1080)
indent_mapping = {1: 0, 2: 360, 3: 720, 4: 1080, 5: 1440, 6: 1800}
# Generate placeholder paragraphs matching Word's TOC format
placeholder_paragraphs = []
for entry in entries:
level = entry.get('level', 1)
text = html.escape(entry.get('text', ''))
page = entry.get('page', '1')
# Get style ID for this level
toc_style = toc_style_mapping.get(level, toc_style_mapping.get(1, "9"))
# Get indentation for this level
indent = indent_mapping.get(level, 0)
indent_attr = f'<w:ind w:left="{indent}"/>' if indent > 0 else ''
# Use w:tab element (not w:tabStop) like Word does
placeholder_para = f'''<w:p>
<w:pPr>
<w:pStyle w:val="{toc_style}"/>
{indent_attr}
<w:tabs><w:tab w:val="right" w:leader="dot" w:pos="9026"/></w:tabs>
</w:pPr>
<w:r><w:t>{text}</w:t></w:r>
<w:r><w:tab/></w:r>
<w:r><w:t>{page}</w:t></w:r>
</w:p>'''
placeholder_paragraphs.append(placeholder_para)
# Join with the separate paragraph at start and end paragraph at end
return separate_para + '\n'.join(placeholder_paragraphs) + end_para
# Replace the pattern
modified_content = re.sub(separate_end_pattern, replace_with_placeholders, xml_content, flags=re.DOTALL)
return modified_content
def main():
parser = argparse.ArgumentParser(
description='Add placeholder entries to Table of Contents in a DOCX file (in-place)'
)
parser.add_argument('docx_file', help='DOCX file to modify (will be replaced)')
parser.add_argument(
'--entries',
help='JSON string with placeholder entries: [{"level":1,"text":"Chapter 1","page":"1"}]'
)
args = parser.parse_args()
# Parse entries if provided
entries = None
if args.entries:
try:
entries = json.loads(args.entries)
except json.JSONDecodeError as e:
print(f"Error parsing entries JSON: {e}", file=sys.stderr)
sys.exit(1)
# Add placeholders
try:
add_toc_placeholders(args.docx_file, entries)
print(f"Successfully added TOC placeholders to {args.docx_file}")
except Exception as e:
print(f"Error: {e}", file=sys.stderr)
sys.exit(1)
if __name__ == '__main__':
main()

1302
skills/docx/scripts/document.py Executable file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,3 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:comments xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14">
</w:comments>

View File

@@ -0,0 +1,3 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w15:commentsEx xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14">
</w15:commentsEx>

View File

@@ -0,0 +1,3 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w16cex:commentsExtensible xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" xmlns:cr="http://schemas.microsoft.com/office/comments/2020/reactions" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl cr w16du wp14">
</w16cex:commentsExtensible>

View File

@@ -0,0 +1,3 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w16cid:commentsIds xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14">
</w16cid:commentsIds>

View File

@@ -0,0 +1,3 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w15:people xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml">
</w15:people>

374
skills/docx/scripts/utilities.py Executable file
View File

@@ -0,0 +1,374 @@
#!/usr/bin/env python3
"""
Utilities for editing OOXML documents.
This module provides XMLEditor, a tool for manipulating XML files with support for
line-number-based node finding and DOM manipulation. Each element is automatically
annotated with its original line and column position during parsing.
Example usage:
editor = XMLEditor("document.xml")
# Find node by line number or range
elem = editor.get_node(tag="w:r", line_number=519)
elem = editor.get_node(tag="w:p", line_number=range(100, 200))
# Find node by text content
elem = editor.get_node(tag="w:p", contains="specific text")
# Find node by attributes
elem = editor.get_node(tag="w:r", attrs={"w:id": "target"})
# Combine filters
elem = editor.get_node(tag="w:p", line_number=range(1, 50), contains="text")
# Replace, insert, or manipulate
new_elem = editor.replace_node(elem, "<w:r><w:t>new text</w:t></w:r>")
editor.insert_after(new_elem, "<w:r><w:t>more</w:t></w:r>")
# Save changes
editor.save()
"""
import html
from pathlib import Path
from typing import Optional, Union
import defusedxml.minidom
import defusedxml.sax
class XMLEditor:
"""
Editor for manipulating OOXML XML files with line-number-based node finding.
This class parses XML files and tracks the original line and column position
of each element. This enables finding nodes by their line number in the original
file, which is useful when working with Read tool output.
Attributes:
xml_path: Path to the XML file being edited
encoding: Detected encoding of the XML file ('ascii' or 'utf-8')
dom: Parsed DOM tree with parse_position attributes on elements
"""
def __init__(self, xml_path):
"""
Initialize with path to XML file and parse with line number tracking.
Args:
xml_path: Path to XML file to edit (str or Path)
Raises:
ValueError: If the XML file does not exist
"""
self.xml_path = Path(xml_path)
if not self.xml_path.exists():
raise ValueError(f"XML file not found: {xml_path}")
with open(self.xml_path, "rb") as f:
header = f.read(200).decode("utf-8", errors="ignore")
self.encoding = "ascii" if 'encoding="ascii"' in header else "utf-8"
parser = _create_line_tracking_parser()
self.dom = defusedxml.minidom.parse(str(self.xml_path), parser)
def get_node(
self,
tag: str,
attrs: Optional[dict[str, str]] = None,
line_number: Optional[Union[int, range]] = None,
contains: Optional[str] = None,
):
"""
Get a DOM element by tag and identifier.
Finds an element by either its line number in the original file or by
matching attribute values. Exactly one match must be found.
Args:
tag: The XML tag name (e.g., "w:del", "w:ins", "w:r")
attrs: Dictionary of attribute name-value pairs to match (e.g., {"w:id": "1"})
line_number: Line number (int) or line range (range) in original XML file (1-indexed)
contains: Text string that must appear in any text node within the element.
Supports both entity notation (&#8220;) and Unicode characters (\u201c).
Returns:
defusedxml.minidom.Element: The matching DOM element
Raises:
ValueError: If node not found or multiple matches found
Example:
elem = editor.get_node(tag="w:r", line_number=519)
elem = editor.get_node(tag="w:r", line_number=range(100, 200))
elem = editor.get_node(tag="w:del", attrs={"w:id": "1"})
elem = editor.get_node(tag="w:p", attrs={"w14:paraId": "12345678"})
elem = editor.get_node(tag="w:commentRangeStart", attrs={"w:id": "0"})
elem = editor.get_node(tag="w:p", contains="specific text")
elem = editor.get_node(tag="w:t", contains="&#8220;Agreement") # Entity notation
elem = editor.get_node(tag="w:t", contains="\u201cAgreement") # Unicode character
"""
matches = []
for elem in self.dom.getElementsByTagName(tag):
# Check line_number filter
if line_number is not None:
parse_pos = getattr(elem, "parse_position", (None,))
elem_line = parse_pos[0]
# Handle both single line number and range
if isinstance(line_number, range):
if elem_line not in line_number:
continue
else:
if elem_line != line_number:
continue
# Check attrs filter
if attrs is not None:
if not all(
elem.getAttribute(attr_name) == attr_value
for attr_name, attr_value in attrs.items()
):
continue
# Check contains filter
if contains is not None:
elem_text = self._get_element_text(elem)
# Normalize the search string: convert HTML entities to Unicode characters
# This allows searching for both "&#8220;Rowan" and ""Rowan"
normalized_contains = html.unescape(contains)
if normalized_contains not in elem_text:
continue
# If all applicable filters passed, this is a match
matches.append(elem)
if not matches:
# Build descriptive error message
filters = []
if line_number is not None:
line_str = (
f"lines {line_number.start}-{line_number.stop - 1}"
if isinstance(line_number, range)
else f"line {line_number}"
)
filters.append(f"at {line_str}")
if attrs is not None:
filters.append(f"with attributes {attrs}")
if contains is not None:
filters.append(f"containing '{contains}'")
filter_desc = " ".join(filters) if filters else ""
base_msg = f"Node not found: <{tag}> {filter_desc}".strip()
# Add helpful hint based on filters used
if contains:
hint = "Text may be split across elements or use different wording."
elif line_number:
hint = "Line numbers may have changed if document was modified."
elif attrs:
hint = "Verify attribute values are correct."
else:
hint = "Try adding filters (attrs, line_number, or contains)."
raise ValueError(f"{base_msg}. {hint}")
if len(matches) > 1:
raise ValueError(
f"Multiple nodes found: <{tag}>. "
f"Add more filters (attrs, line_number, or contains) to narrow the search."
)
return matches[0]
def _get_element_text(self, elem):
"""
Recursively extract all text content from an element.
Skips text nodes that contain only whitespace (spaces, tabs, newlines),
which typically represent XML formatting rather than document content.
Args:
elem: defusedxml.minidom.Element to extract text from
Returns:
str: Concatenated text from all non-whitespace text nodes within the element
"""
text_parts = []
for node in elem.childNodes:
if node.nodeType == node.TEXT_NODE:
# Skip whitespace-only text nodes (XML formatting)
if node.data.strip():
text_parts.append(node.data)
elif node.nodeType == node.ELEMENT_NODE:
text_parts.append(self._get_element_text(node))
return "".join(text_parts)
def replace_node(self, elem, new_content):
"""
Replace a DOM element with new XML content.
Args:
elem: defusedxml.minidom.Element to replace
new_content: String containing XML to replace the node with
Returns:
List[defusedxml.minidom.Node]: All inserted nodes
Example:
new_nodes = editor.replace_node(old_elem, "<w:r><w:t>text</w:t></w:r>")
"""
parent = elem.parentNode
nodes = self._parse_fragment(new_content)
for node in nodes:
parent.insertBefore(node, elem)
parent.removeChild(elem)
return nodes
def insert_after(self, elem, xml_content):
"""
Insert XML content after a DOM element.
Args:
elem: defusedxml.minidom.Element to insert after
xml_content: String containing XML to insert
Returns:
List[defusedxml.minidom.Node]: All inserted nodes
Example:
new_nodes = editor.insert_after(elem, "<w:r><w:t>text</w:t></w:r>")
"""
parent = elem.parentNode
next_sibling = elem.nextSibling
nodes = self._parse_fragment(xml_content)
for node in nodes:
if next_sibling:
parent.insertBefore(node, next_sibling)
else:
parent.appendChild(node)
return nodes
def insert_before(self, elem, xml_content):
"""
Insert XML content before a DOM element.
Args:
elem: defusedxml.minidom.Element to insert before
xml_content: String containing XML to insert
Returns:
List[defusedxml.minidom.Node]: All inserted nodes
Example:
new_nodes = editor.insert_before(elem, "<w:r><w:t>text</w:t></w:r>")
"""
parent = elem.parentNode
nodes = self._parse_fragment(xml_content)
for node in nodes:
parent.insertBefore(node, elem)
return nodes
def append_to(self, elem, xml_content):
"""
Append XML content as a child of a DOM element.
Args:
elem: defusedxml.minidom.Element to append to
xml_content: String containing XML to append
Returns:
List[defusedxml.minidom.Node]: All inserted nodes
Example:
new_nodes = editor.append_to(elem, "<w:r><w:t>text</w:t></w:r>")
"""
nodes = self._parse_fragment(xml_content)
for node in nodes:
elem.appendChild(node)
return nodes
def get_next_rid(self):
"""Get the next available rId for relationships files."""
max_id = 0
for rel_elem in self.dom.getElementsByTagName("Relationship"):
rel_id = rel_elem.getAttribute("Id")
if rel_id.startswith("rId"):
try:
max_id = max(max_id, int(rel_id[3:]))
except ValueError:
pass
return f"rId{max_id + 1}"
def save(self):
"""
Save the edited XML back to the file.
Serializes the DOM tree and writes it back to the original file path,
preserving the original encoding (ascii or utf-8).
"""
content = self.dom.toxml(encoding=self.encoding)
self.xml_path.write_bytes(content)
def _parse_fragment(self, xml_content):
"""
Parse XML fragment and return list of imported nodes.
Args:
xml_content: String containing XML fragment
Returns:
List of defusedxml.minidom.Node objects imported into this document
Raises:
AssertionError: If fragment contains no element nodes
"""
# Extract namespace declarations from the root document element
root_elem = self.dom.documentElement
namespaces = []
if root_elem and root_elem.attributes:
for i in range(root_elem.attributes.length):
attr = root_elem.attributes.item(i)
if attr.name.startswith("xmlns"): # type: ignore
namespaces.append(f'{attr.name}="{attr.value}"') # type: ignore
ns_decl = " ".join(namespaces)
wrapper = f"<root {ns_decl}>{xml_content}</root>"
fragment_doc = defusedxml.minidom.parseString(wrapper)
nodes = [
self.dom.importNode(child, deep=True)
for child in fragment_doc.documentElement.childNodes # type: ignore
]
elements = [n for n in nodes if n.nodeType == n.ELEMENT_NODE]
assert elements, "Fragment must contain at least one element"
return nodes
def _create_line_tracking_parser():
"""
Create a SAX parser that tracks line and column numbers for each element.
Monkey patches the SAX content handler to store the current line and column
position from the underlying expat parser onto each element as a parse_position
attribute (line, column) tuple.
Returns:
defusedxml.sax.xmlreader.XMLReader: Configured SAX parser
"""
def set_content_handler(dom_handler):
def startElementNS(name, tagName, attrs):
orig_start_cb(name, tagName, attrs)
cur_elem = dom_handler.elementStack[-1]
cur_elem.parse_position = (
parser._parser.CurrentLineNumber, # type: ignore
parser._parser.CurrentColumnNumber, # type: ignore
)
orig_start_cb = dom_handler.startElementNS
dom_handler.startElementNS = startElementNS
orig_set_content_handler(dom_handler)
parser = defusedxml.sax.make_parser()
orig_set_content_handler = parser.setContentHandler
parser.setContentHandler = set_content_handler # type: ignore
return parser