#!/usr/bin/env python3 """ Library for working with Word documents: comments, tracked changes, and editing. Usage: from skills.docx.scripts.document import Document # Initialize doc = Document('workspace/unpacked') doc = Document('workspace/unpacked', author="John Doe", initials="JD") # Find nodes node = doc["word/document.xml"].get_node(tag="w:del", attrs={"w:id": "1"}) node = doc["word/document.xml"].get_node(tag="w:p", line_number=10) # Add comments doc.add_comment(start=node, end=node, text="Comment text") doc.reply_to_comment(parent_comment_id=0, text="Reply text") # Suggest tracked changes doc["word/document.xml"].suggest_deletion(node) # Delete content doc["word/document.xml"].revert_insertion(ins_node) # Reject insertion doc["word/document.xml"].revert_deletion(del_node) # Reject deletion # Save doc.save() """ import html import random import shutil import tempfile from datetime import datetime, timezone from pathlib import Path import zipfile import defusedxml.minidom from defusedxml import minidom from .utilities import XMLEditor # --------------------------------------------------------------------------- # Inline pack utility (replaces former ooxml.scripts.pack dependency) # --------------------------------------------------------------------------- def _condense_xml(xml_file): """Strip unnecessary whitespace from XML, preserving text content.""" with open(xml_file, "r", encoding="utf-8") as f: dom = defusedxml.minidom.parse(f) for element in dom.getElementsByTagName("*"): if element.tagName.endswith(":t"): continue for child in list(element.childNodes): if ( child.nodeType == child.TEXT_NODE and child.nodeValue and child.nodeValue.strip() == "" ) or child.nodeType == child.COMMENT_NODE: element.removeChild(child) with open(xml_file, "wb") as f: f.write(dom.toxml(encoding="UTF-8")) def _pack_document(input_dir, output_file): """Pack an unpacked directory back into a .docx file.""" input_dir = Path(input_dir) output_file = Path(output_file) with tempfile.TemporaryDirectory() as temp_dir: temp_content_dir = Path(temp_dir) / "content" shutil.copytree(input_dir, temp_content_dir) for pattern in ["*.xml", "*.rels"]: for xml_file in temp_content_dir.rglob(pattern): _condense_xml(xml_file) output_file.parent.mkdir(parents=True, exist_ok=True) with zipfile.ZipFile(output_file, "w", zipfile.ZIP_DEFLATED) as zf: for f in temp_content_dir.rglob("*"): if f.is_file(): zf.write(f, f.relative_to(temp_content_dir)) # Path to template files TEMPLATE_DIR = Path(__file__).parent / "templates" class DocxXMLEditor(XMLEditor): """XMLEditor that automatically applies RSID, author, and date to new elements. Automatically adds attributes to elements that support them when inserting new content: - w:rsidR, w:rsidRDefault, w:rsidP (for w:p and w:r elements) - w:author and w:date (for w:ins, w:del, w:comment elements) - w:id (for w:ins and w:del elements) Attributes: dom (defusedxml.minidom.Document): The DOM document for direct manipulation """ def __init__( self, xml_path, rsid: str, author: str = "Z.AI", initials: str = "Z" ): """Initialize with required RSID and optional author. Args: xml_path: Path to XML file to edit rsid: RSID to automatically apply to new elements author: Author name for tracked changes and comments (default: "Z.AI") initials: Author initials (default: "C") """ super().__init__(xml_path) self.rsid = rsid self.author = author self.initials = initials def _get_next_change_id(self): """Get the next available change ID by checking all tracked change elements.""" max_id = -1 for tag in ("w:ins", "w:del"): elements = self.dom.getElementsByTagName(tag) for elem in elements: change_id = elem.getAttribute("w:id") if change_id: try: max_id = max(max_id, int(change_id)) except ValueError: pass return max_id + 1 def _ensure_w16du_namespace(self): """Ensure w16du namespace is declared on the root element.""" root = self.dom.documentElement if not root.hasAttribute("xmlns:w16du"): # type: ignore root.setAttribute( # type: ignore "xmlns:w16du", "http://schemas.microsoft.com/office/word/2023/wordml/word16du", ) def _ensure_w16cex_namespace(self): """Ensure w16cex namespace is declared on the root element.""" root = self.dom.documentElement if not root.hasAttribute("xmlns:w16cex"): # type: ignore root.setAttribute( # type: ignore "xmlns:w16cex", "http://schemas.microsoft.com/office/word/2018/wordml/cex", ) def _ensure_w14_namespace(self): """Ensure w14 namespace is declared on the root element.""" root = self.dom.documentElement if not root.hasAttribute("xmlns:w14"): # type: ignore root.setAttribute( # type: ignore "xmlns:w14", "http://schemas.microsoft.com/office/word/2010/wordml", ) def _inject_attributes_to_nodes(self, nodes): """Inject RSID, author, and date attributes into DOM nodes where applicable. Adds attributes to elements that support them: - w:r: gets w:rsidR (or w:rsidDel if inside w:del) - w:p: gets w:rsidR, w:rsidRDefault, w:rsidP, w14:paraId, w14:textId - w:t: gets xml:space="preserve" if text has leading/trailing whitespace - w:ins, w:del: get w:id, w:author, w:date, w16du:dateUtc - w:comment: gets w:author, w:date, w:initials - w16cex:commentExtensible: gets w16cex:dateUtc Args: nodes: List of DOM nodes to process """ from datetime import datetime, timezone timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") def is_inside_deletion(elem): """Check if element is inside a w:del element.""" parent = elem.parentNode while parent: if parent.nodeType == parent.ELEMENT_NODE and parent.tagName == "w:del": return True parent = parent.parentNode return False def add_rsid_to_p(elem): if not elem.hasAttribute("w:rsidR"): elem.setAttribute("w:rsidR", self.rsid) if not elem.hasAttribute("w:rsidRDefault"): elem.setAttribute("w:rsidRDefault", self.rsid) if not elem.hasAttribute("w:rsidP"): elem.setAttribute("w:rsidP", self.rsid) # Add w14:paraId and w14:textId if not present if not elem.hasAttribute("w14:paraId"): self._ensure_w14_namespace() elem.setAttribute("w14:paraId", _generate_hex_id()) if not elem.hasAttribute("w14:textId"): self._ensure_w14_namespace() elem.setAttribute("w14:textId", _generate_hex_id()) def add_rsid_to_r(elem): # Use w:rsidDel for inside , otherwise w:rsidR if is_inside_deletion(elem): if not elem.hasAttribute("w:rsidDel"): elem.setAttribute("w:rsidDel", self.rsid) else: if not elem.hasAttribute("w:rsidR"): elem.setAttribute("w:rsidR", self.rsid) def add_tracked_change_attrs(elem): # Auto-assign w:id if not present if not elem.hasAttribute("w:id"): elem.setAttribute("w:id", str(self._get_next_change_id())) if not elem.hasAttribute("w:author"): elem.setAttribute("w:author", self.author) if not elem.hasAttribute("w:date"): elem.setAttribute("w:date", timestamp) # Add w16du:dateUtc for tracked changes (same as w:date since we generate UTC timestamps) if elem.tagName in ("w:ins", "w:del") and not elem.hasAttribute( "w16du:dateUtc" ): self._ensure_w16du_namespace() elem.setAttribute("w16du:dateUtc", timestamp) def add_comment_attrs(elem): if not elem.hasAttribute("w:author"): elem.setAttribute("w:author", self.author) if not elem.hasAttribute("w:date"): elem.setAttribute("w:date", timestamp) if not elem.hasAttribute("w:initials"): elem.setAttribute("w:initials", self.initials) def add_comment_extensible_date(elem): # Add w16cex:dateUtc for comment extensible elements if not elem.hasAttribute("w16cex:dateUtc"): self._ensure_w16cex_namespace() elem.setAttribute("w16cex:dateUtc", timestamp) def add_xml_space_to_t(elem): # Add xml:space="preserve" to w:t if text has leading/trailing whitespace if ( elem.firstChild and elem.firstChild.nodeType == elem.firstChild.TEXT_NODE ): text = elem.firstChild.data if text and (text[0].isspace() or text[-1].isspace()): if not elem.hasAttribute("xml:space"): elem.setAttribute("xml:space", "preserve") for node in nodes: if node.nodeType != node.ELEMENT_NODE: continue # Handle the node itself if node.tagName == "w:p": add_rsid_to_p(node) elif node.tagName == "w:r": add_rsid_to_r(node) elif node.tagName == "w:t": add_xml_space_to_t(node) elif node.tagName in ("w:ins", "w:del"): add_tracked_change_attrs(node) elif node.tagName == "w:comment": add_comment_attrs(node) elif node.tagName == "w16cex:commentExtensible": add_comment_extensible_date(node) # Process descendants (getElementsByTagName doesn't return the element itself) for elem in node.getElementsByTagName("w:p"): add_rsid_to_p(elem) for elem in node.getElementsByTagName("w:r"): add_rsid_to_r(elem) for elem in node.getElementsByTagName("w:t"): add_xml_space_to_t(elem) for tag in ("w:ins", "w:del"): for elem in node.getElementsByTagName(tag): add_tracked_change_attrs(elem) for elem in node.getElementsByTagName("w:comment"): add_comment_attrs(elem) for elem in node.getElementsByTagName("w16cex:commentExtensible"): add_comment_extensible_date(elem) def replace_node(self, elem, new_content): """Replace node with automatic attribute injection.""" nodes = super().replace_node(elem, new_content) self._inject_attributes_to_nodes(nodes) return nodes def insert_after(self, elem, xml_content): """Insert after with automatic attribute injection.""" nodes = super().insert_after(elem, xml_content) self._inject_attributes_to_nodes(nodes) return nodes def insert_before(self, elem, xml_content): """Insert before with automatic attribute injection.""" nodes = super().insert_before(elem, xml_content) self._inject_attributes_to_nodes(nodes) return nodes def append_to(self, elem, xml_content): """Append to with automatic attribute injection.""" nodes = super().append_to(elem, xml_content) self._inject_attributes_to_nodes(nodes) return nodes def revert_insertion(self, elem): """Reject an insertion by wrapping its content in a deletion. Wraps all runs inside w:ins in w:del, converting w:t to w:delText. Can process a single w:ins element or a container element with multiple w:ins. Args: elem: Element to process (w:ins, w:p, w:body, etc.) Returns: list: List containing the processed element(s) Raises: ValueError: If the element contains no w:ins elements Example: # Reject a single insertion ins = doc["word/document.xml"].get_node(tag="w:ins", attrs={"w:id": "5"}) doc["word/document.xml"].revert_insertion(ins) # Reject all insertions in a paragraph para = doc["word/document.xml"].get_node(tag="w:p", line_number=42) doc["word/document.xml"].revert_insertion(para) """ # Collect insertions ins_elements = [] if elem.tagName == "w:ins": ins_elements.append(elem) else: ins_elements.extend(elem.getElementsByTagName("w:ins")) # Validate that there are insertions to reject if not ins_elements: raise ValueError( f"revert_insertion requires w:ins elements. " f"The provided element <{elem.tagName}> contains no insertions. " ) # Process all insertions - wrap all children in w:del for ins_elem in ins_elements: runs = list(ins_elem.getElementsByTagName("w:r")) if not runs: continue # Create deletion wrapper del_wrapper = self.dom.createElement("w:del") # Process each run for run in runs: # Convert w:t → w:delText and w:rsidR → w:rsidDel if run.hasAttribute("w:rsidR"): run.setAttribute("w:rsidDel", run.getAttribute("w:rsidR")) run.removeAttribute("w:rsidR") elif not run.hasAttribute("w:rsidDel"): run.setAttribute("w:rsidDel", self.rsid) for t_elem in list(run.getElementsByTagName("w:t")): del_text = self.dom.createElement("w:delText") # Copy ALL child nodes (not just firstChild) to handle entities while t_elem.firstChild: del_text.appendChild(t_elem.firstChild) for i in range(t_elem.attributes.length): attr = t_elem.attributes.item(i) del_text.setAttribute(attr.name, attr.value) t_elem.parentNode.replaceChild(del_text, t_elem) # Move all children from ins to del wrapper while ins_elem.firstChild: del_wrapper.appendChild(ins_elem.firstChild) # Add del wrapper back to ins ins_elem.appendChild(del_wrapper) # Inject attributes to the deletion wrapper self._inject_attributes_to_nodes([del_wrapper]) return [elem] def revert_deletion(self, elem): """Reject a deletion by re-inserting the deleted content. Creates w:ins elements after each w:del, copying deleted content and converting w:delText back to w:t. Can process a single w:del element or a container element with multiple w:del. Args: elem: Element to process (w:del, w:p, w:body, etc.) Returns: list: If elem is w:del, returns [elem, new_ins]. Otherwise returns [elem]. Raises: ValueError: If the element contains no w:del elements Example: # Reject a single deletion - returns [w:del, w:ins] del_elem = doc["word/document.xml"].get_node(tag="w:del", attrs={"w:id": "3"}) nodes = doc["word/document.xml"].revert_deletion(del_elem) # Reject all deletions in a paragraph - returns [para] para = doc["word/document.xml"].get_node(tag="w:p", line_number=42) nodes = doc["word/document.xml"].revert_deletion(para) """ # Collect deletions FIRST - before we modify the DOM del_elements = [] is_single_del = elem.tagName == "w:del" if is_single_del: del_elements.append(elem) else: del_elements.extend(elem.getElementsByTagName("w:del")) # Validate that there are deletions to reject if not del_elements: raise ValueError( f"revert_deletion requires w:del elements. " f"The provided element <{elem.tagName}> contains no deletions. " ) # Track created insertion (only relevant if elem is a single w:del) created_insertion = None # Process all deletions - create insertions that copy the deleted content for del_elem in del_elements: # Clone the deleted runs and convert them to insertions runs = list(del_elem.getElementsByTagName("w:r")) if not runs: continue # Create insertion wrapper ins_elem = self.dom.createElement("w:ins") for run in runs: # Clone the run new_run = run.cloneNode(True) # Convert w:delText → w:t for del_text in list(new_run.getElementsByTagName("w:delText")): t_elem = self.dom.createElement("w:t") # Copy ALL child nodes (not just firstChild) to handle entities while del_text.firstChild: t_elem.appendChild(del_text.firstChild) for i in range(del_text.attributes.length): attr = del_text.attributes.item(i) t_elem.setAttribute(attr.name, attr.value) del_text.parentNode.replaceChild(t_elem, del_text) # Update run attributes: w:rsidDel → w:rsidR if new_run.hasAttribute("w:rsidDel"): new_run.setAttribute("w:rsidR", new_run.getAttribute("w:rsidDel")) new_run.removeAttribute("w:rsidDel") elif not new_run.hasAttribute("w:rsidR"): new_run.setAttribute("w:rsidR", self.rsid) ins_elem.appendChild(new_run) # Insert the new insertion after the deletion nodes = self.insert_after(del_elem, ins_elem.toxml()) # If processing a single w:del, track the created insertion if is_single_del and nodes: created_insertion = nodes[0] # Return based on input type if is_single_del and created_insertion: return [elem, created_insertion] else: return [elem] @staticmethod def suggest_paragraph(xml_content: str) -> str: """Transform paragraph XML to add tracked change wrapping for insertion. Wraps runs in and adds to w:rPr in w:pPr for numbered lists. Args: xml_content: XML string containing a element Returns: str: Transformed XML with tracked change wrapping """ wrapper = f'{xml_content}' doc = minidom.parseString(wrapper) para = doc.getElementsByTagName("w:p")[0] # Ensure w:pPr exists pPr_list = para.getElementsByTagName("w:pPr") if not pPr_list: pPr = doc.createElement("w:pPr") para.insertBefore( pPr, para.firstChild ) if para.firstChild else para.appendChild(pPr) else: pPr = pPr_list[0] # Ensure w:rPr exists in w:pPr rPr_list = pPr.getElementsByTagName("w:rPr") if not rPr_list: rPr = doc.createElement("w:rPr") pPr.appendChild(rPr) else: rPr = rPr_list[0] # Add to w:rPr ins_marker = doc.createElement("w:ins") rPr.insertBefore( ins_marker, rPr.firstChild ) if rPr.firstChild else rPr.appendChild(ins_marker) # Wrap all non-pPr children in ins_wrapper = doc.createElement("w:ins") for child in [c for c in para.childNodes if c.nodeName != "w:pPr"]: para.removeChild(child) ins_wrapper.appendChild(child) para.appendChild(ins_wrapper) return para.toxml() def suggest_deletion(self, elem): """Mark a w:r or w:p element as deleted with tracked changes (in-place DOM manipulation). For w:r: wraps in , converts to , preserves w:rPr For w:p (regular): wraps content in , converts to For w:p (numbered list): adds to w:rPr in w:pPr, wraps content in Args: elem: A w:r or w:p DOM element without existing tracked changes Returns: Element: The modified element Raises: ValueError: If element has existing tracked changes or invalid structure """ if elem.nodeName == "w:r": # Check for existing w:delText if elem.getElementsByTagName("w:delText"): raise ValueError("w:r element already contains w:delText") # Convert w:t → w:delText for t_elem in list(elem.getElementsByTagName("w:t")): del_text = self.dom.createElement("w:delText") # Copy ALL child nodes (not just firstChild) to handle entities while t_elem.firstChild: del_text.appendChild(t_elem.firstChild) # Preserve attributes like xml:space for i in range(t_elem.attributes.length): attr = t_elem.attributes.item(i) del_text.setAttribute(attr.name, attr.value) t_elem.parentNode.replaceChild(del_text, t_elem) # Update run attributes: w:rsidR → w:rsidDel if elem.hasAttribute("w:rsidR"): elem.setAttribute("w:rsidDel", elem.getAttribute("w:rsidR")) elem.removeAttribute("w:rsidR") elif not elem.hasAttribute("w:rsidDel"): elem.setAttribute("w:rsidDel", self.rsid) # Wrap in w:del del_wrapper = self.dom.createElement("w:del") parent = elem.parentNode parent.insertBefore(del_wrapper, elem) parent.removeChild(elem) del_wrapper.appendChild(elem) # Inject attributes to the deletion wrapper self._inject_attributes_to_nodes([del_wrapper]) return del_wrapper elif elem.nodeName == "w:p": # Check for existing tracked changes if elem.getElementsByTagName("w:ins") or elem.getElementsByTagName("w:del"): raise ValueError("w:p element already contains tracked changes") # Check if it's a numbered list item pPr_list = elem.getElementsByTagName("w:pPr") is_numbered = pPr_list and pPr_list[0].getElementsByTagName("w:numPr") if is_numbered: # Add to w:rPr in w:pPr pPr = pPr_list[0] rPr_list = pPr.getElementsByTagName("w:rPr") if not rPr_list: rPr = self.dom.createElement("w:rPr") pPr.appendChild(rPr) else: rPr = rPr_list[0] # Add marker del_marker = self.dom.createElement("w:del") rPr.insertBefore( del_marker, rPr.firstChild ) if rPr.firstChild else rPr.appendChild(del_marker) # Convert w:t → w:delText in all runs for t_elem in list(elem.getElementsByTagName("w:t")): del_text = self.dom.createElement("w:delText") # Copy ALL child nodes (not just firstChild) to handle entities while t_elem.firstChild: del_text.appendChild(t_elem.firstChild) # Preserve attributes like xml:space for i in range(t_elem.attributes.length): attr = t_elem.attributes.item(i) del_text.setAttribute(attr.name, attr.value) t_elem.parentNode.replaceChild(del_text, t_elem) # Update run attributes: w:rsidR → w:rsidDel for run in elem.getElementsByTagName("w:r"): if run.hasAttribute("w:rsidR"): run.setAttribute("w:rsidDel", run.getAttribute("w:rsidR")) run.removeAttribute("w:rsidR") elif not run.hasAttribute("w:rsidDel"): run.setAttribute("w:rsidDel", self.rsid) # Wrap all non-pPr children in del_wrapper = self.dom.createElement("w:del") for child in [c for c in elem.childNodes if c.nodeName != "w:pPr"]: elem.removeChild(child) del_wrapper.appendChild(child) elem.appendChild(del_wrapper) # Inject attributes to the deletion wrapper self._inject_attributes_to_nodes([del_wrapper]) return elem else: raise ValueError(f"Element must be w:r or w:p, got {elem.nodeName}") def _generate_hex_id() -> str: """Generate random 8-character hex ID for para/durable IDs. Values are constrained to be less than 0x7FFFFFFF per OOXML spec: - paraId must be < 0x80000000 - durableId must be < 0x7FFFFFFF We use the stricter constraint (0x7FFFFFFF) for both. """ return f"{random.randint(1, 0x7FFFFFFE):08X}" def _generate_rsid() -> str: """Generate random 8-character hex RSID.""" return "".join(random.choices("0123456789ABCDEF", k=8)) class Document: """Manages comments in unpacked Word documents.""" def __init__( self, unpacked_dir, rsid=None, track_revisions=False, author="Z.AI", initials="C", ): """ Initialize with path to unpacked Word document directory. Automatically sets up comment infrastructure (people.xml, RSIDs). Args: unpacked_dir: Path to unpacked DOCX directory (must contain word/ subdirectory) rsid: Optional RSID to use for all comment elements. If not provided, one will be generated. track_revisions: If True, enables track revisions in settings.xml (default: False) author: Default author name for comments (default: "Z.AI") initials: Default author initials for comments (default: "C") """ self.original_path = Path(unpacked_dir) if not self.original_path.exists() or not self.original_path.is_dir(): raise ValueError(f"Directory not found: {unpacked_dir}") # Create temporary directory with subdirectories for unpacked content and baseline self.temp_dir = tempfile.mkdtemp(prefix="docx_") self.unpacked_path = Path(self.temp_dir) / "unpacked" shutil.copytree(self.original_path, self.unpacked_path) # Pack original directory into temporary .docx for validation baseline (outside unpacked dir) self.original_docx = Path(self.temp_dir) / "original.docx" _pack_document(self.original_path, self.original_docx) self.word_path = self.unpacked_path / "word" # Generate RSID if not provided self.rsid = rsid if rsid else _generate_rsid() print(f"Using RSID: {self.rsid}") # Set default author and initials self.author = author self.initials = initials # Cache for lazy-loaded editors self._editors = {} # Comment file paths self.comments_path = self.word_path / "comments.xml" self.comments_extended_path = self.word_path / "commentsExtended.xml" self.comments_ids_path = self.word_path / "commentsIds.xml" self.comments_extensible_path = self.word_path / "commentsExtensible.xml" # Load existing comments and determine next ID (before setup modifies files) self.existing_comments = self._load_existing_comments() self.next_comment_id = self._get_next_comment_id() # Convenient access to document.xml editor (semi-private) self._document = self["word/document.xml"] # Setup tracked changes infrastructure self._setup_tracking(track_revisions=track_revisions) # Add author to people.xml self._add_author_to_people(author) def __getitem__(self, xml_path: str) -> DocxXMLEditor: """ Get or create a DocxXMLEditor for the specified XML file. Enables lazy-loaded editors with bracket notation: node = doc["word/document.xml"].get_node(tag="w:p", line_number=42) Args: xml_path: Relative path to XML file (e.g., "word/document.xml", "word/comments.xml") Returns: DocxXMLEditor instance for the specified file Raises: ValueError: If the file does not exist Example: # Get node from document.xml node = doc["word/document.xml"].get_node(tag="w:del", attrs={"w:id": "1"}) # Get node from comments.xml comment = doc["word/comments.xml"].get_node(tag="w:comment", attrs={"w:id": "0"}) """ if xml_path not in self._editors: file_path = self.unpacked_path / xml_path if not file_path.exists(): raise ValueError(f"XML file not found: {xml_path}") # Use DocxXMLEditor with RSID, author, and initials for all editors self._editors[xml_path] = DocxXMLEditor( file_path, rsid=self.rsid, author=self.author, initials=self.initials ) return self._editors[xml_path] def add_comment(self, start, end, text: str) -> int: """ Add a comment spanning from one element to another. Args: start: DOM element for the starting point end: DOM element for the ending point text: Comment content Returns: The comment ID that was created Example: start_node = cm.get_document_node(tag="w:del", id="1") end_node = cm.get_document_node(tag="w:ins", id="2") cm.add_comment(start=start_node, end=end_node, text="Explanation") """ comment_id = self.next_comment_id para_id = _generate_hex_id() durable_id = _generate_hex_id() timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") # Add comment ranges to document.xml immediately self._document.insert_before(start, self._comment_range_start_xml(comment_id)) # If end node is a paragraph, append comment markup inside it # Otherwise insert after it (for run-level anchors) if end.tagName == "w:p": self._document.append_to(end, self._comment_range_end_xml(comment_id)) else: self._document.insert_after(end, self._comment_range_end_xml(comment_id)) # Add to comments.xml immediately self._add_to_comments_xml( comment_id, para_id, text, self.author, self.initials, timestamp ) # Add to commentsExtended.xml immediately self._add_to_comments_extended_xml(para_id, parent_para_id=None) # Add to commentsIds.xml immediately self._add_to_comments_ids_xml(para_id, durable_id) # Add to commentsExtensible.xml immediately self._add_to_comments_extensible_xml(durable_id) # Update existing_comments so replies work self.existing_comments[comment_id] = {"para_id": para_id} self.next_comment_id += 1 return comment_id def reply_to_comment( self, parent_comment_id: int, text: str, ) -> int: """ Add a reply to an existing comment. Args: parent_comment_id: The w:id of the parent comment to reply to text: Reply text Returns: The comment ID that was created for the reply Example: cm.reply_to_comment(parent_comment_id=0, text="I agree with this change") """ if parent_comment_id not in self.existing_comments: raise ValueError(f"Parent comment with id={parent_comment_id} not found") parent_info = self.existing_comments[parent_comment_id] comment_id = self.next_comment_id para_id = _generate_hex_id() durable_id = _generate_hex_id() timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") # Add comment ranges to document.xml immediately parent_start_elem = self._document.get_node( tag="w:commentRangeStart", attrs={"w:id": str(parent_comment_id)} ) parent_ref_elem = self._document.get_node( tag="w:commentReference", attrs={"w:id": str(parent_comment_id)} ) self._document.insert_after( parent_start_elem, self._comment_range_start_xml(comment_id) ) parent_ref_run = parent_ref_elem.parentNode self._document.insert_after( parent_ref_run, f'' ) self._document.insert_after( parent_ref_run, self._comment_ref_run_xml(comment_id) ) # Add to comments.xml immediately self._add_to_comments_xml( comment_id, para_id, text, self.author, self.initials, timestamp ) # Add to commentsExtended.xml immediately (with parent) self._add_to_comments_extended_xml( para_id, parent_para_id=parent_info["para_id"] ) # Add to commentsIds.xml immediately self._add_to_comments_ids_xml(para_id, durable_id) # Add to commentsExtensible.xml immediately self._add_to_comments_extensible_xml(durable_id) # Update existing_comments so replies work self.existing_comments[comment_id] = {"para_id": para_id} self.next_comment_id += 1 return comment_id def __del__(self): """Clean up temporary directory on deletion.""" if hasattr(self, "temp_dir") and Path(self.temp_dir).exists(): shutil.rmtree(self.temp_dir) def validate(self) -> None: """ Validate the document (lightweight check). Currently performs basic structural checks. XSD schema validation and redlining validation have been removed. Use save(validate=False) to skip validation entirely. """ # Basic structural check: ensure word/document.xml exists doc_xml = self.unpacked_path / "word" / "document.xml" if not doc_xml.exists(): raise ValueError("Validation failed: word/document.xml not found") def save(self, destination=None, validate=True) -> None: """ Save all modified XML files to disk and copy to destination directory. This persists all changes made via add_comment() and reply_to_comment(). Args: destination: Optional path to save to. If None, saves back to original directory. validate: If True, validates document before saving (default: True). """ # Only ensure comment relationships and content types if comment files exist if self.comments_path.exists(): self._ensure_comment_relationships() self._ensure_comment_content_types() # Save all modified XML files in temp directory for editor in self._editors.values(): editor.save() # Validate by default if validate: self.validate() # Copy contents from temp directory to destination (or original directory) target_path = Path(destination) if destination else self.original_path shutil.copytree(self.unpacked_path, target_path, dirs_exist_ok=True) # ==================== Private: Initialization ==================== def _get_next_comment_id(self): """Get the next available comment ID.""" if not self.comments_path.exists(): return 0 editor = self["word/comments.xml"] max_id = -1 for comment_elem in editor.dom.getElementsByTagName("w:comment"): comment_id = comment_elem.getAttribute("w:id") if comment_id: try: max_id = max(max_id, int(comment_id)) except ValueError: pass return max_id + 1 def _load_existing_comments(self): """Load existing comments from files to enable replies.""" if not self.comments_path.exists(): return {} editor = self["word/comments.xml"] existing = {} for comment_elem in editor.dom.getElementsByTagName("w:comment"): comment_id = comment_elem.getAttribute("w:id") if not comment_id: continue # Find para_id from the w:p element within the comment para_id = None for p_elem in comment_elem.getElementsByTagName("w:p"): para_id = p_elem.getAttribute("w14:paraId") if para_id: break if not para_id: continue existing[int(comment_id)] = {"para_id": para_id} return existing # ==================== Private: Setup Methods ==================== def _setup_tracking(self, track_revisions=False): """Set up comment infrastructure in unpacked directory. Args: track_revisions: If True, enables track revisions in settings.xml """ # Create or update word/people.xml people_file = self.word_path / "people.xml" self._update_people_xml(people_file) # Update XML files self._add_content_type_for_people(self.unpacked_path / "[Content_Types].xml") self._add_relationship_for_people( self.word_path / "_rels" / "document.xml.rels" ) # Always add RSID to settings.xml, optionally enable trackRevisions self._update_settings( self.word_path / "settings.xml", track_revisions=track_revisions ) def _update_people_xml(self, path): """Create people.xml if it doesn't exist.""" if not path.exists(): # Copy from template shutil.copy(TEMPLATE_DIR / "people.xml", path) def _add_content_type_for_people(self, path): """Add people.xml content type to [Content_Types].xml if not already present.""" editor = self["[Content_Types].xml"] if self._has_override(editor, "/word/people.xml"): return # Add Override element root = editor.dom.documentElement override_xml = '' editor.append_to(root, override_xml) def _add_relationship_for_people(self, path): """Add people.xml relationship to document.xml.rels if not already present.""" editor = self["word/_rels/document.xml.rels"] if self._has_relationship(editor, "people.xml"): return root = editor.dom.documentElement root_tag = root.tagName # type: ignore prefix = root_tag.split(":")[0] + ":" if ":" in root_tag else "" next_rid = editor.get_next_rid() # Create the relationship entry rel_xml = f'<{prefix}Relationship Id="{next_rid}" Type="http://schemas.microsoft.com/office/2011/relationships/people" Target="people.xml"/>' editor.append_to(root, rel_xml) def _update_settings(self, path, track_revisions=False, update_fields=True): """Add RSID and optionally enable track revisions and update fields in settings.xml. Args: path: Path to settings.xml track_revisions: If True, adds trackRevisions element update_fields: If True, adds updateFields element to auto-update fields on open Places elements per OOXML schema order: - trackRevisions: early (before defaultTabStop) - updateFields: early (before defaultTabStop) - rsids: late (after compat) """ editor = self["word/settings.xml"] root = editor.get_node(tag="w:settings") prefix = root.tagName.split(":")[0] if ":" in root.tagName else "w" # Conditionally add trackRevisions if requested if track_revisions: track_revisions_exists = any( elem.tagName == f"{prefix}:trackRevisions" for elem in editor.dom.getElementsByTagName(f"{prefix}:trackRevisions") ) if not track_revisions_exists: track_rev_xml = f"<{prefix}:trackRevisions/>" # Try to insert before documentProtection, defaultTabStop, or at start inserted = False for tag in [f"{prefix}:documentProtection", f"{prefix}:defaultTabStop"]: elements = editor.dom.getElementsByTagName(tag) if elements: editor.insert_before(elements[0], track_rev_xml) inserted = True break if not inserted: # Insert as first child of settings if root.firstChild: editor.insert_before(root.firstChild, track_rev_xml) else: editor.append_to(root, track_rev_xml) # Conditionally add updateFields if requested if update_fields: update_fields_exists = any( elem.tagName == f"{prefix}:updateFields" for elem in editor.dom.getElementsByTagName(f"{prefix}:updateFields") ) if not update_fields_exists: update_fields_xml = f'<{prefix}:updateFields {prefix}:val="true"/>' # Try to insert before defaultTabStop, hyphenationZone, or at start inserted = False for tag in [f"{prefix}:defaultTabStop", f"{prefix}:hyphenationZone"]: elements = editor.dom.getElementsByTagName(tag) if elements: editor.insert_before(elements[0], update_fields_xml) inserted = True break if not inserted: # Insert as first child of settings if root.firstChild: editor.insert_before(root.firstChild, update_fields_xml) else: editor.append_to(root, update_fields_xml) # Always check if rsids section exists rsids_elements = editor.dom.getElementsByTagName(f"{prefix}:rsids") if not rsids_elements: # Add new rsids section rsids_xml = f'''<{prefix}:rsids> <{prefix}:rsidRoot {prefix}:val="{self.rsid}"/> <{prefix}:rsid {prefix}:val="{self.rsid}"/> ''' # Try to insert after compat, before clrSchemeMapping, or before closing tag inserted = False compat_elements = editor.dom.getElementsByTagName(f"{prefix}:compat") if compat_elements: editor.insert_after(compat_elements[0], rsids_xml) inserted = True if not inserted: clr_elements = editor.dom.getElementsByTagName( f"{prefix}:clrSchemeMapping" ) if clr_elements: editor.insert_before(clr_elements[0], rsids_xml) inserted = True if not inserted: editor.append_to(root, rsids_xml) else: # Check if this rsid already exists rsids_elem = rsids_elements[0] rsid_exists = any( elem.getAttribute(f"{prefix}:val") == self.rsid for elem in rsids_elem.getElementsByTagName(f"{prefix}:rsid") ) if not rsid_exists: rsid_xml = f'<{prefix}:rsid {prefix}:val="{self.rsid}"/>' editor.append_to(rsids_elem, rsid_xml) # ==================== Private: XML File Creation ==================== def _add_to_comments_xml( self, comment_id, para_id, text, author, initials, timestamp ): """Add a single comment to comments.xml.""" if not self.comments_path.exists(): shutil.copy(TEMPLATE_DIR / "comments.xml", self.comments_path) editor = self["word/comments.xml"] root = editor.get_node(tag="w:comments") escaped_text = ( text.replace("&", "&").replace("<", "<").replace(">", ">") ) # Note: w:rsidR, w:rsidRDefault, w:rsidP on w:p, w:rsidR on w:r, # and w:author, w:date, w:initials on w:comment are automatically added by DocxXMLEditor comment_xml = f''' {escaped_text} ''' editor.append_to(root, comment_xml) def _add_to_comments_extended_xml(self, para_id, parent_para_id): """Add a single comment to commentsExtended.xml.""" if not self.comments_extended_path.exists(): shutil.copy( TEMPLATE_DIR / "commentsExtended.xml", self.comments_extended_path ) editor = self["word/commentsExtended.xml"] root = editor.get_node(tag="w15:commentsEx") if parent_para_id: xml = f'' else: xml = f'' editor.append_to(root, xml) def _add_to_comments_ids_xml(self, para_id, durable_id): """Add a single comment to commentsIds.xml.""" if not self.comments_ids_path.exists(): shutil.copy(TEMPLATE_DIR / "commentsIds.xml", self.comments_ids_path) editor = self["word/commentsIds.xml"] root = editor.get_node(tag="w16cid:commentsIds") xml = f'' editor.append_to(root, xml) def _add_to_comments_extensible_xml(self, durable_id): """Add a single comment to commentsExtensible.xml.""" if not self.comments_extensible_path.exists(): shutil.copy( TEMPLATE_DIR / "commentsExtensible.xml", self.comments_extensible_path ) editor = self["word/commentsExtensible.xml"] root = editor.get_node(tag="w16cex:commentsExtensible") xml = f'' editor.append_to(root, xml) # ==================== Private: XML Fragments ==================== def _comment_range_start_xml(self, comment_id): """Generate XML for comment range start.""" return f'' def _comment_range_end_xml(self, comment_id): """Generate XML for comment range end with reference run. Note: w:rsidR is automatically added by DocxXMLEditor. """ return f''' ''' def _comment_ref_run_xml(self, comment_id): """Generate XML for comment reference run. Note: w:rsidR is automatically added by DocxXMLEditor. """ return f''' ''' # ==================== Private: Metadata Updates ==================== def _has_relationship(self, editor, target): """Check if a relationship with given target exists.""" for rel_elem in editor.dom.getElementsByTagName("Relationship"): if rel_elem.getAttribute("Target") == target: return True return False def _has_override(self, editor, part_name): """Check if an override with given part name exists.""" for override_elem in editor.dom.getElementsByTagName("Override"): if override_elem.getAttribute("PartName") == part_name: return True return False def _has_author(self, editor, author): """Check if an author already exists in people.xml.""" for person_elem in editor.dom.getElementsByTagName("w15:person"): if person_elem.getAttribute("w15:author") == author: return True return False def _add_author_to_people(self, author): """Add author to people.xml (called during initialization).""" people_path = self.word_path / "people.xml" # people.xml should already exist from _setup_tracking if not people_path.exists(): raise ValueError("people.xml should exist after _setup_tracking") editor = self["word/people.xml"] root = editor.get_node(tag="w15:people") # Check if author already exists if self._has_author(editor, author): return # Add author with proper XML escaping to prevent injection escaped_author = html.escape(author, quote=True) person_xml = f''' ''' editor.append_to(root, person_xml) def _ensure_comment_relationships(self): """Ensure word/_rels/document.xml.rels has comment relationships.""" editor = self["word/_rels/document.xml.rels"] if self._has_relationship(editor, "comments.xml"): return root = editor.dom.documentElement root_tag = root.tagName # type: ignore prefix = root_tag.split(":")[0] + ":" if ":" in root_tag else "" next_rid_num = int(editor.get_next_rid()[3:]) # Add relationship elements rels = [ ( next_rid_num, "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments", "comments.xml", ), ( next_rid_num + 1, "http://schemas.microsoft.com/office/2011/relationships/commentsExtended", "commentsExtended.xml", ), ( next_rid_num + 2, "http://schemas.microsoft.com/office/2016/09/relationships/commentsIds", "commentsIds.xml", ), ( next_rid_num + 3, "http://schemas.microsoft.com/office/2018/08/relationships/commentsExtensible", "commentsExtensible.xml", ), ] for rel_id, rel_type, target in rels: rel_xml = f'<{prefix}Relationship Id="rId{rel_id}" Type="{rel_type}" Target="{target}"/>' editor.append_to(root, rel_xml) def _ensure_comment_content_types(self): """Ensure [Content_Types].xml has comment content types.""" editor = self["[Content_Types].xml"] if self._has_override(editor, "/word/comments.xml"): return root = editor.dom.documentElement # Add Override elements overrides = [ ( "/word/comments.xml", "application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml", ), ( "/word/commentsExtended.xml", "application/vnd.openxmlformats-officedocument.wordprocessingml.commentsExtended+xml", ), ( "/word/commentsIds.xml", "application/vnd.openxmlformats-officedocument.wordprocessingml.commentsIds+xml", ), ( "/word/commentsExtensible.xml", "application/vnd.openxmlformats-officedocument.wordprocessingml.commentsExtensible+xml", ), ] for part_name, content_type in overrides: override_xml = ( f'' ) editor.append_to(root, override_xml)