Initial commit

This commit is contained in:
Z User
2026-06-06 05:21:10 +00:00
Unverified
commit 6664758a6d
493 changed files with 135653 additions and 0 deletions

Binary file not shown.

1337
skills/ppt/scripts/html2pptx.js Executable file

File diff suppressed because it is too large Load Diff

512
skills/ppt/scripts/inventory.py Executable file
View File

@@ -0,0 +1,512 @@
#!/usr/bin/env python3
"""
Extract structured text content from PowerPoint presentations.
Usage:
python inventory.py input.pptx output.json [--issues-only]
"""
import argparse
import json
import sys
import unicodedata
from pathlib import Path
from typing import Any, Dict, List, Optional

from pptx import Presentation
from pptx.enum.text import PP_ALIGN
from pptx.shapes.base import BaseShape
# Public type alias used by replace.py: slide_id -> {shape_id -> ShapeData}
InventoryData = Dict[str, Dict[str, "ShapeData"]]
# Divisor used to convert shape geometry from EMUs to inches.
_EMU = 914400 # EMUs per inch
# DrawingML main namespace in Clark notation; prefixed to "buChar" and
# "buAutoNum" when searching a paragraph's pPr for bullet elements.
_BULLET_NS = "{http://schemas.openxmlformats.org/drawingml/2006/main}"
# Alignment enum -> name written to the inventory. LEFT is deliberately absent:
# it is treated as the default and omitted from the output.
_ALIGN_MAP = {
PP_ALIGN.CENTER: "CENTER",
PP_ALIGN.RIGHT: "RIGHT",
PP_ALIGN.JUSTIFY: "JUSTIFY",
}
def _is_cjk(ch: str) -> bool:
"""True for full-width CJK characters (Chinese, Japanese, Korean, full-width forms)."""
cp = ord(ch)
return (
0x4E00 <= cp <= 0x9FFF # CJK Unified Ideographs
or 0x3400 <= cp <= 0x4DBF # CJK Extension A
or 0x3040 <= cp <= 0x30FF # Hiragana / Katakana
or 0xFF00 <= cp <= 0xFFEF # Full-width ASCII & half-width Katakana
or 0xAC00 <= cp <= 0xD7AF # Hangul syllables
)
class ParagraphData:
    """Text and formatting captured from one paragraph.

    Every attribute other than ``text`` and ``bullet`` defaults to ``None``,
    meaning the value was not found on the paragraph or its first run. Such
    attributes are left out of :meth:`to_dict`.
    """

    def __init__(self, paragraph: Any):
        """Read text, bullet state, spacing and first-run font from *paragraph*.

        Args:
            paragraph: Paragraph-like object exposing ``text`` and ``runs``.
                ``_p``, ``level``, ``alignment``, ``space_before``,
                ``space_after`` and ``line_spacing`` are read when present.
        """
        self.text: str = paragraph.text.strip()
        self.bullet: bool = False
        self.level: Optional[int] = None
        self.alignment: Optional[str] = None
        self.space_before: Optional[float] = None
        self.space_after: Optional[float] = None
        self.font_name: Optional[str] = None
        self.font_size: Optional[float] = None
        self.bold: Optional[bool] = None
        self.italic: Optional[bool] = None
        self.underline: Optional[bool] = None
        self.color: Optional[str] = None
        self.theme_color: Optional[str] = None
        self.line_spacing: Optional[float] = None

        # Bullet detection: the paragraph is bulleted when its pPr carries an
        # explicit character bullet or auto-number element.
        pPr = getattr(getattr(paragraph, "_p", None), "pPr", None)
        if pPr is not None and (
            pPr.find(f"{_BULLET_NS}buChar") is not None
            or pPr.find(f"{_BULLET_NS}buAutoNum") is not None
        ):
            self.bullet = True
            self.level = getattr(paragraph, "level", None)

        # Alignment (omit LEFT — it's the default)
        align = getattr(paragraph, "alignment", None)
        if align in _ALIGN_MAP:
            self.alignment = _ALIGN_MAP[align]

        # Spacing. Compare against None rather than relying on truthiness: an
        # explicit spacing of zero is falsy but is still a value that was set,
        # and to_dict() serialises zero values.
        sb = getattr(paragraph, "space_before", None)
        if sb is not None:
            self.space_before = sb.pt
        sa = getattr(paragraph, "space_after", None)
        if sa is not None:
            self.space_after = sa.pt

        # Font from first run
        if paragraph.runs:
            font = paragraph.runs[0].font
            self.font_name = font.name or None
            self.font_size = font.size.pt if font.size else None
            self.bold = font.bold
            self.italic = font.italic
            self.underline = font.underline
            try:
                self.color = str(font.color.rgb) if font.color.rgb else None
            except (AttributeError, TypeError):
                # No RGB value available; fall back to a theme colour name.
                try:
                    tc = font.color.theme_color
                    self.theme_color = tc.name if tc else None
                except (AttributeError, TypeError):
                    pass

        # Line spacing (after font so font_size is available)
        ls = getattr(paragraph, "line_spacing", None)
        if ls is not None:
            if hasattr(ls, "pt"):
                self.line_spacing = round(ls.pt, 2)
            else:
                # Multiplier — convert to points using current font size
                self.line_spacing = round(ls * (self.font_size or 12.0), 2)

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-ready dict holding ``text`` plus every populated field.

        Numeric and boolean fields are emitted whenever they are not ``None``
        (so ``0`` and ``False`` are kept); string fields are emitted when
        non-empty. ``color`` takes precedence over ``theme_color``.
        """
        d: Dict[str, Any] = {"text": self.text}
        if self.bullet:
            d["bullet"] = True
        if self.level is not None:
            d["level"] = self.level
        if self.alignment:
            d["alignment"] = self.alignment
        for key in ("space_before", "space_after", "font_size", "line_spacing"):
            val = getattr(self, key)
            if val is not None:
                d[key] = val
        if self.font_name:
            d["font_name"] = self.font_name
        for key in ("bold", "italic", "underline"):
            val = getattr(self, key)
            if val is not None:
                d[key] = val
        if self.color:
            d["color"] = self.color
        elif self.theme_color:
            d["theme_color"] = self.theme_color
        return d
class ShapeData:
"""Position, formatting metadata, and text content for one shape."""
def __init__(
self,
shape: BaseShape,
absolute_left: Optional[int] = None,
absolute_top: Optional[int] = None,
slide: Optional[Any] = None,
):
self.shape = shape
self.shape_id: str = "" # assigned after sorting
# Slide dimensions (for overflow checking)
self.slide_width_emu: Optional[int] = None
self.slide_height_emu: Optional[int] = None
if slide:
try:
prs_xml = slide.part.package.presentation_part.presentation
self.slide_width_emu = prs_xml.slide_width
self.slide_height_emu = prs_xml.slide_height
except (AttributeError, TypeError):
pass
# Placeholder metadata
self.placeholder_type: Optional[str] = None
self.default_font_size: Optional[float] = None
if getattr(shape, "is_placeholder", False):
pf = shape.placeholder_format # type: ignore
if pf and pf.type:
self.placeholder_type = str(pf.type).split(".")[-1].split(" ")[0]
if slide and hasattr(slide, "slide_layout"):
self.default_font_size = _layout_font_size(shape, slide.slide_layout)
# Position in inches (use absolute coords for shapes inside groups)
left_emu = absolute_left if absolute_left is not None else getattr(shape, "left", 0)
top_emu = absolute_top if absolute_top is not None else getattr(shape, "top", 0)
self.left = round(left_emu / _EMU, 2)
self.top = round(top_emu / _EMU, 2)
self.width = round(getattr(shape, "width", 0) / _EMU, 2)
self.height = round(getattr(shape, "height", 0) / _EMU, 2)
# EMU positions kept for overflow arithmetic
self.left_emu = left_emu
self.top_emu = top_emu
self.width_emu = getattr(shape, "width", 0)
self.height_emu = getattr(shape, "height", 0)
# Issue detection
self.frame_overflow_bottom: Optional[float] = None
self.slide_overflow_right: Optional[float] = None
self.slide_overflow_bottom: Optional[float] = None
self.overlapping_shapes: Dict[str, float] = {}
self.warnings: List[str] = []
self._estimate_frame_overflow()
self._calculate_slide_overflow()
self._detect_bullet_issues()
# ------------------------------------------------------------------
# Issue detection helpers
# ------------------------------------------------------------------
def _default_font_size_pts(self) -> float:
"""Best-effort default font size from theme styles."""
if self.default_font_size:
return self.default_font_size
try:
master = self.shape.part.slide_layout.slide_master # type: ignore
style = "titleStyle" if (self.placeholder_type and "TITLE" in self.placeholder_type) else "bodyStyle"
for child in master.element.iter():
tag = child.tag.split("}")[-1] if "}" in child.tag else child.tag
if tag == style:
for elem in child.iter():
if "sz" in elem.attrib:
return int(elem.attrib["sz"]) / 100.0
except Exception:
pass
return 14.0 # conservative fallback
def _estimate_frame_overflow(self) -> None:
"""Estimate text overflow via character-count heuristic (no external deps)."""
if not hasattr(self.shape, "text_frame"):
return
tf = self.shape.text_frame # type: ignore
if not tf or not tf.paragraphs:
return
# Usable area after text frame margins
def e2i(v: Any) -> float:
return (v or 0) / _EMU
margin_h = e2i(tf.margin_top) + e2i(tf.margin_bottom)
margin_w = e2i(tf.margin_left) + e2i(tf.margin_right)
if margin_h == 0:
margin_h = 0.10 # PowerPoint default: ~0.05" top + 0.05" bottom
if margin_w == 0:
margin_w = 0.20 # PowerPoint default: ~0.1" left + 0.1" right
usable_w = self.width - margin_w
usable_h = self.height - margin_h
if usable_w <= 0 or usable_h <= 0:
return
default_size = self._default_font_size_pts()
total_h = 0.0
for para in tf.paragraphs:
if not para.text.strip():
continue
pd = ParagraphData(para)
size_pt = pd.font_size or default_size
# Estimate text width: CJK chars ≈ 1.0× font_size pts, others ≈ 0.5×
text_w_pts = sum(
size_pt if _is_cjk(c) else size_pt * 0.5
for c in para.text
)
usable_w_pts = usable_w * 72.0
n_lines = max(1, -(-int(text_w_pts) // max(1, int(usable_w_pts)))) # ceiling div
line_h_in = (pd.line_spacing or size_pt) / 72.0
total_h += (pd.space_before or 0) / 72.0
total_h += n_lines * line_h_in
total_h += (pd.space_after or 0) / 72.0
if total_h > usable_h + 0.05: # ignore sub-0.05" rounding noise
self.frame_overflow_bottom = round(total_h - usable_h, 2)
def _calculate_slide_overflow(self) -> None:
if self.slide_width_emu is None or self.slide_height_emu is None:
return
r = self.left_emu + self.width_emu - self.slide_width_emu
if r > 0:
v = round(r / _EMU, 2)
if v > 0.01:
self.slide_overflow_right = v
b = self.top_emu + self.height_emu - self.slide_height_emu
if b > 0:
v = round(b / _EMU, 2)
if v > 0.01:
self.slide_overflow_bottom = v
def _detect_bullet_issues(self) -> None:
if not hasattr(self.shape, "text_frame"):
return
for para in self.shape.text_frame.paragraphs: # type: ignore
text = para.text.strip()
if text and any(text.startswith(s + " ") for s in ("", "", "")):
self.warnings.append("manual_bullet_symbol: use proper bullet formatting")
break
# ------------------------------------------------------------------
# Public interface
# ------------------------------------------------------------------
@property
def paragraphs(self) -> List[ParagraphData]:
if not hasattr(self.shape, "text_frame"):
return []
return [ParagraphData(p) for p in self.shape.text_frame.paragraphs if p.text.strip()] # type: ignore
@property
def has_any_issues(self) -> bool:
return bool(
self.frame_overflow_bottom is not None
or self.slide_overflow_right is not None
or self.slide_overflow_bottom is not None
or self.overlapping_shapes
or self.warnings
)
def to_dict(self) -> Dict[str, Any]:
d: Dict[str, Any] = {
"left": self.left, "top": self.top,
"width": self.width, "height": self.height,
}
if self.placeholder_type:
d["placeholder_type"] = self.placeholder_type
if self.default_font_size:
d["default_font_size"] = self.default_font_size
overflow: Dict[str, Any] = {}
if self.frame_overflow_bottom is not None:
overflow["frame"] = {"overflow_bottom": self.frame_overflow_bottom}
slide_ov: Dict[str, float] = {}
if self.slide_overflow_right is not None:
slide_ov["overflow_right"] = self.slide_overflow_right
if self.slide_overflow_bottom is not None:
slide_ov["overflow_bottom"] = self.slide_overflow_bottom
if slide_ov:
overflow["slide"] = slide_ov
if overflow:
d["overflow"] = overflow
if self.overlapping_shapes:
d["overlap"] = {"overlapping_shapes": self.overlapping_shapes}
if self.warnings:
d["warnings"] = self.warnings
d["paragraphs"] = [p.to_dict() for p in self.paragraphs]
return d
# ------------------------------------------------------------------
# Module-level helpers
# ------------------------------------------------------------------
def _layout_font_size(shape: BaseShape, slide_layout: Any) -> Optional[float]:
    """Look up the default font size (points) for *shape* on its layout.

    Only the first layout placeholder whose type equals the shape's
    placeholder type is inspected. The first element under it whose tag
    contains ``defRPr`` and which has a non-empty ``sz`` attribute supplies
    the size (``sz`` is divided by 100). Returns ``None`` when nothing
    matches or when any step of the lookup raises.
    """
    try:
        wanted = shape.placeholder_format.type  # type: ignore
        match = next(
            (
                candidate
                for candidate in slide_layout.placeholders
                if candidate.placeholder_format.type == wanted
            ),
            None,
        )
        if match is None:
            return None
        for node in match.element.iter():
            if "defRPr" not in node.tag:
                continue
            raw_size = node.get("sz")
            if raw_size:
                return float(raw_size) / 100.0
    except Exception:
        # Best-effort lookup: any failure simply means "no default known".
        return None
    return None
def _is_valid_shape(shape: BaseShape) -> bool:
    """Decide whether *shape* belongs in the text inventory.

    A shape qualifies when it has a text frame with non-blank text, unless it
    is a SLIDE_NUMBER placeholder, or a FOOTER placeholder whose text consists
    solely of digits.
    """
    if not hasattr(shape, "text_frame"):
        return False
    frame = shape.text_frame  # type: ignore
    if not frame:
        return False
    content = frame.text.strip()
    if not content:
        return False

    if not getattr(shape, "is_placeholder", False):
        return True
    fmt = shape.placeholder_format  # type: ignore
    if not (fmt and fmt.type):
        return True

    # Reduce the enum's string form to its bare member name.
    kind = str(fmt.type).split(".")[-1].split(" ")[0]
    if kind == "SLIDE_NUMBER":
        return False
    return not (kind == "FOOTER" and content.isdigit())
def _collect_shapes(shape: BaseShape, parent_left: int = 0, parent_top: int = 0):
"""Yield (shape, abs_left, abs_top) tuples, recursing into GroupShapes."""
if hasattr(shape, "shapes"): # GroupShape
g_left = parent_left + getattr(shape, "left", 0)
g_top = parent_top + getattr(shape, "top", 0)
for child in shape.shapes: # type: ignore
yield from _collect_shapes(child, g_left, g_top)
elif _is_valid_shape(shape):
yield (
shape,
parent_left + getattr(shape, "left", 0),
parent_top + getattr(shape, "top", 0),
)
def _sort_by_position(shapes: List[ShapeData]) -> List[ShapeData]:
    """Order shapes in reading order: rows top-to-bottom, left-to-right within a row.

    Shapes are first sorted by ``(top, left)``. A row is opened by the first
    shape that lies more than 0.5" below the current row's opening shape;
    every later shape within 0.5" of that opening top joins the row. An empty
    input is returned unchanged.
    """
    if not shapes:
        return shapes

    by_top = sorted(shapes, key=lambda item: (item.top, item.left))
    rows = []
    anchor_top = None
    for item in by_top:
        if rows and abs(item.top - anchor_top) <= 0.5:
            rows[-1].append(item)
        else:
            # Start a new row anchored at this shape's top edge.
            rows.append([item])
            anchor_top = item.top

    ordered = []
    for row in rows:
        ordered.extend(sorted(row, key=lambda item: item.left))
    return ordered
def _detect_overlaps(shapes: List[ShapeData]) -> None:
    """Record pairwise overlaps between shapes, in place.

    For every unordered pair whose bounding boxes intersect by more than
    0.05" in both width and height, the intersection area (square inches,
    rounded to 2 decimals) is stored in each shape's ``overlapping_shapes``
    under the other shape's ``shape_id``.
    """
    for offset, first in enumerate(shapes, start=1):
        first_right = first.left + first.width
        first_bottom = first.top + first.height
        for second in shapes[offset:]:
            shared_w = min(first_right, second.left + second.width) - max(first.left, second.left)
            shared_h = min(first_bottom, second.top + second.height) - max(first.top, second.top)
            if not (shared_w > 0.05 and shared_h > 0.05):
                continue
            area = round(shared_w * shared_h, 2)
            first.overlapping_shapes[second.shape_id] = area
            second.overlapping_shapes[first.shape_id] = area
# ------------------------------------------------------------------
# Public API
# ------------------------------------------------------------------
def extract_text_inventory(
    pptx_path: Path,
    prs: Optional[Any] = None,
    issues_only: bool = False,
) -> InventoryData:
    """Build the text inventory for every slide of a presentation.

    Args:
        pptx_path: Path of the deck; only opened when *prs* is not supplied.
        prs: Already-loaded Presentation to reuse instead of re-loading.
        issues_only: Keep only shapes for which ``has_any_issues`` is true.

    Returns:
        ``{"slide-N": {"shape-M": ShapeData}}`` where N is the 0-based slide
        index and M the shape's rank in visual order. Slides that end up with
        no shapes are omitted. Shape ids and overlaps are computed before the
        ``issues_only`` filter is applied.
    """
    presentation = prs if prs is not None else Presentation(str(pptx_path))

    inventory: InventoryData = {}
    for slide_idx, slide in enumerate(presentation.slides):
        found = list(_collect_shapes_from_slide(slide))
        if not found:
            continue

        ordered = _sort_by_position(
            [ShapeData(shape, left, top, slide) for shape, left, top in found]
        )
        for rank, entry in enumerate(ordered):
            entry.shape_id = f"shape-{rank}"
        if len(ordered) > 1:
            _detect_overlaps(ordered)

        kept = [entry for entry in ordered if entry.has_any_issues] if issues_only else ordered
        if not kept:
            continue
        inventory[f"slide-{slide_idx}"] = {entry.shape_id: entry for entry in kept}
    return inventory
def _collect_shapes_from_slide(slide):
"""Yield (shape, abs_left, abs_top) for all valid text shapes on a slide."""
for shape in slide.shapes: # type: ignore
yield from _collect_shapes(shape)
def save_inventory(inventory: InventoryData, output_path: Path) -> None:
    """Write *inventory* to *output_path* as indented UTF-8 JSON.

    Each ShapeData is converted with ``to_dict()``; non-ASCII text is written
    as-is rather than escaped.
    """
    serializable = {}
    for slide_key, shapes in inventory.items():
        serializable[slide_key] = {
            shape_key: entry.to_dict() for shape_key, entry in shapes.items()
        }
    with open(output_path, "w", encoding="utf-8") as handle:
        json.dump(serializable, handle, indent=2, ensure_ascii=False)
def main() -> None:
    """Command-line entry point: inventory a .pptx file into a JSON file.

    Exits with status 1 when the input is missing, is not a ``.pptx`` file,
    or when extraction/saving raises; in the last case the traceback is
    printed as well.
    """
    cli = argparse.ArgumentParser(description="Extract text inventory from a PowerPoint file.")
    cli.add_argument("input", help="Input .pptx file")
    cli.add_argument("output", help="Output .json file")
    cli.add_argument(
        "--issues-only",
        action="store_true",
        help="Include only shapes with overflow/overlap issues",
    )
    args = cli.parse_args()

    source = Path(args.input)
    if not source.exists():
        print(f"Error: File not found: {args.input}")
        sys.exit(1)
    if source.suffix.lower() != ".pptx":
        print("Error: Input must be a .pptx file")
        sys.exit(1)

    try:
        inventory = extract_text_inventory(source, issues_only=args.issues_only)
        destination = Path(args.output)
        destination.parent.mkdir(parents=True, exist_ok=True)
        save_inventory(inventory, destination)

        total = sum(map(len, inventory.values()))
        noun = "shapes with issues" if args.issues_only else "text shapes"
        print(f"Found {total} {noun} across {len(inventory)} slides → {args.output}")
    except Exception as e:
        import traceback

        print(f"Error: {e}")
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()

2046
skills/ppt/scripts/pdf.py Executable file

File diff suppressed because it is too large Load Diff

144
skills/ppt/scripts/rearrange.py Executable file
View File

@@ -0,0 +1,144 @@
#!/usr/bin/env python3
"""
Rearrange PowerPoint slides based on a sequence of indices.
Usage:
python rearrange.py template.pptx output.pptx 0,34,34,50,52
Slides are 0-indexed. Indices can repeat to duplicate slides.
"""
import argparse
import sys
from copy import deepcopy
from pathlib import Path
from pptx import Presentation
from pptx.oxml.ns import qn
def copy_slide(src_prs: Presentation, dst_prs: Presentation, index: int, dst_layouts: dict) -> None:
    """Append a copy of slide[index] from src_prs into dst_prs.

    Args:
        src_prs: Presentation the slide is read from.
        dst_prs: Presentation the new slide is appended to.
        index: 0-based index of the slide in *src_prs*.
        dst_layouts: Mapping of layout name -> layout object in *dst_prs*.
            When the source slide's layout name is missing, the first layout
            of *dst_prs* is used.
    """
    src_slide = src_prs.slides[index]

    # Match layout by name across all masters; fall back to first available layout
    layout_name = src_slide.slide_layout.name
    dst_layout = dst_layouts.get(layout_name) or dst_prs.slide_layouts[0]
    new_slide = dst_prs.slides.add_slide(dst_layout)

    # Clear auto-added placeholder shapes
    for shape in list(new_slide.shapes):
        sp = shape.element
        sp.getparent().remove(sp)

    # Copy the non-layout relationships from source and build an old→new rId
    # mapping, so that r:embed / r:id / r:link attributes in the copied shapes
    # keep pointing at relationships that exist on the new slide.
    R_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
    SKIP_TYPES = {"slideLayout", "notesSlide", "slide"}  # handled by python-pptx infrastructure
    dst_rels = new_slide.part.rels
    rId_mapping: dict = {}
    for rel_id, rel in src_slide.part.rels.items():
        rel_short = rel.reltype.split("/")[-1]
        if rel_short in SKIP_TYPES:
            continue
        if rel.is_external:
            # External targets (e.g. URL hyperlinks) are plain references, not
            # package parts; registering them as a part would leave a string
            # where a part is expected.
            new_rId = dst_rels.get_or_add_ext_rel(rel.reltype, rel.target_ref)
        else:
            new_rId = dst_rels.get_or_add(rel.reltype, rel._target)
        rId_mapping[rel_id] = new_rId

    # Copy all shape elements
    rel_attrs = (f"{{{R_NS}}}embed", f"{{{R_NS}}}id", f"{{{R_NS}}}link")
    for shape in src_slide.shapes:
        new_el = deepcopy(shape.element)
        new_slide.shapes._spTree.insert_element_before(new_el, "p:extLst")
        # Remap relationship references inside the copied subtree.
        for el in new_el.iter():
            for attr in rel_attrs:
                old_rId = el.get(attr)
                if old_rId and old_rId in rId_mapping:
                    el.set(attr, rId_mapping[old_rId])

    # Copy slide-level background if defined.
    # p:bg lives inside p:cSld, not directly under p:sld.
    src_cSld = src_slide.element.find(qn("p:cSld"))
    dst_cSld = new_slide.element.find(qn("p:cSld"))
    if src_cSld is not None and dst_cSld is not None:
        src_bg = src_cSld.find(qn("p:bg"))
        if src_bg is not None:
            existing_bg = dst_cSld.find(qn("p:bg"))
            if existing_bg is not None:
                dst_cSld.remove(existing_bg)
            dst_cSld.insert(0, deepcopy(src_bg))
def rearrange_presentation(
    template_path: Path, output_path: Path, slide_sequence: list[int]
) -> None:
    """Write a new deck containing the template's slides in *slide_sequence* order.

    Args:
        template_path: Source .pptx file.
        output_path: Destination file; parent directories are created.
        slide_sequence: 0-based slide indices; an index may repeat to
            duplicate a slide.

    Raises:
        ValueError: If any index is outside the template's slide range.
    """
    src_prs = Presentation(str(template_path))
    total = len(src_prs.slides)
    valid_range = f"0-{total - 1}" if total else "presentation has no slides"
    for idx in slide_sequence:
        if idx < 0 or idx >= total:
            raise ValueError(f"Slide index {idx} out of range ({valid_range})")

    # Build a fresh presentation with the same dimensions
    dst_prs = Presentation(str(template_path))

    # Remove all existing slides from dst_prs
    sldIdLst = dst_prs.slides._sldIdLst
    for sldId in list(sldIdLst):
        rId = sldId.get(qn("r:id"))  # must use full namespace via qn(), not bare "r:id"
        if rId:
            dst_prs.part.drop_rel(rId)
        sldIdLst.remove(sldId)

    # Search all slide masters for layout matching (templates may have multiple masters)
    all_layouts = {
        layout.name: layout
        for master in dst_prs.slide_masters
        for layout in master.slide_layouts
    }

    # Append slides in requested order (duplicates included)
    for idx in slide_sequence:
        copy_slide(src_prs, dst_prs, idx, all_layouts)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    dst_prs.save(str(output_path))
    print(f"Saved {len(slide_sequence)} slides → {output_path}")
def main() -> None:
    """Command-line entry point for rearranging slides.

    Exits with status 1 when the template is missing, when the sequence is
    not a comma-separated list of integers, or when rearranging raises
    ValueError.
    """
    cli = argparse.ArgumentParser(
        description="Rearrange PowerPoint slides.",
        epilog="Example: python rearrange.py template.pptx output.pptx 0,34,34,50,52",
    )
    cli.add_argument("template", help="Path to template PPTX")
    cli.add_argument("output", help="Path for output PPTX")
    cli.add_argument("sequence", help="Comma-separated 0-based slide indices")
    args = cli.parse_args()

    template_path = Path(args.template)
    if not template_path.exists():
        print(f"Error: Template not found: {args.template}")
        sys.exit(1)

    slide_sequence = []
    try:
        for token in args.sequence.split(","):
            slide_sequence.append(int(token.strip()))
    except ValueError:
        print("Error: sequence must be comma-separated integers (e.g. 0,34,34,50,52)")
        sys.exit(1)

    try:
        rearrange_presentation(template_path, Path(args.output), slide_sequence)
    except ValueError as e:
        print(f"Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()

231
skills/ppt/scripts/replace.py Executable file
View File

@@ -0,0 +1,231 @@
#!/usr/bin/env python3
"""Apply text replacements to PowerPoint presentation.
Usage:
python replace.py <input.pptx> <replacements.json> <output.pptx>
The replacements JSON should have the structure output by inventory.py.
ALL text shapes identified by inventory.py will have their text cleared
unless "paragraphs" is specified in the replacements for that shape.
"""
import json
import sys
from pathlib import Path
from typing import Any, Dict, List
from inventory import InventoryData, extract_text_inventory
from pptx import Presentation
from pptx.dml.color import RGBColor
from pptx.enum.dml import MSO_THEME_COLOR
from pptx.enum.text import PP_ALIGN
from pptx.oxml.xmlchemy import OxmlElement
from pptx.util import Pt
# Alignment names accepted in the replacement JSON -> python-pptx enum member.
# Names outside this map are ignored when paragraph properties are applied.
_ALIGN_MAP = {
"LEFT": PP_ALIGN.LEFT,
"CENTER": PP_ALIGN.CENTER,
"RIGHT": PP_ALIGN.RIGHT,
"JUSTIFY": PP_ALIGN.JUSTIFY,
}
# Bullet indentation constants
# marL = font_size × (1 + level) × 1.6 pts, converted to EMUs (1 pt = 12700 EMU)
_INDENT_FACTOR = 1.6
_EMU_PER_PT = 12700
def _clear_paragraph_bullets(paragraph):
"""Remove all bullet XML elements from a paragraph's pPr."""
pPr = paragraph._element.get_or_add_pPr()
for child in list(pPr):
if any(child.tag.endswith(t) for t in ("buChar", "buNone", "buAutoNum", "buFont")):
pPr.remove(child)
return pPr
def _apply_paragraph_properties(paragraph, para_data: Dict[str, Any]):
    """Apply bullet state, alignment, spacing, text and font to *paragraph*.

    Args:
        paragraph: Paragraph to modify in place.
        para_data: One paragraph entry from the replacement JSON. Recognised
            keys: ``text``, ``bullet``, ``level``, ``alignment``,
            ``space_before``, ``space_after``, ``line_spacing`` (points), plus
            the font keys handled by ``_apply_font_properties``.
    """
    text = para_data.get("text", "")
    pPr = _clear_paragraph_bullets(paragraph)

    # A bullet element must precede these pPr children (and follow the spacing
    # elements), so it is inserted before the first of them that exists and
    # appended otherwise.
    bullet_successors = ("a:tabLst", "a:defRPr", "a:extLst")

    if para_data.get("bullet", False):
        level = para_data.get("level") or 0  # tolerate an explicit null level
        paragraph.level = level
        font_size = para_data.get("font_size", 18.0)
        # Hanging indent: left margin grows with the level, first line is
        # pulled back so the bullet glyph sits in the margin.
        pPr.attrib["marL"] = str(int(font_size * _INDENT_FACTOR * (1 + level) * _EMU_PER_PT))
        pPr.attrib["indent"] = str(int(-font_size * 0.8 * _EMU_PER_PT))
        buChar = OxmlElement("a:buChar")
        buChar.set("char", "\u2022")  # U+2022 BULLET; an empty char renders no glyph
        pPr.insert_element_before(buChar, *bullet_successors)
        if "alignment" not in para_data:
            paragraph.alignment = PP_ALIGN.LEFT
    else:
        # Plain paragraph: no indent and an explicit "no bullet" marker.
        pPr.attrib["marL"] = "0"
        pPr.attrib["indent"] = "0"
        pPr.insert_element_before(OxmlElement("a:buNone"), *bullet_successors)

    if para_data.get("alignment") in _ALIGN_MAP:
        paragraph.alignment = _ALIGN_MAP[para_data["alignment"]]
    if "space_before" in para_data:
        paragraph.space_before = Pt(para_data["space_before"])
    if "space_after" in para_data:
        paragraph.space_after = Pt(para_data["space_after"])
    if "line_spacing" in para_data:
        paragraph.line_spacing = Pt(para_data["line_spacing"])

    # Reuse the first run when there is one so the text lands in a single run.
    run = paragraph.runs[0] if paragraph.runs else paragraph.add_run()
    run.text = text
    _apply_font_properties(run, para_data)
def _apply_font_properties(run, para_data: Dict[str, Any]):
for attr in ("bold", "italic", "underline"):
if attr in para_data:
setattr(run.font, attr, para_data[attr])
if "font_size" in para_data:
run.font.size = Pt(para_data["font_size"])
if "font_name" in para_data:
run.font.name = para_data["font_name"]
if "color" in para_data:
h = para_data["color"].lstrip("#")
if len(h) == 6:
run.font.color.rgb = RGBColor(int(h[0:2], 16), int(h[2:4], 16), int(h[4:6], 16))
elif "theme_color" in para_data:
try:
run.font.color.theme_color = getattr(MSO_THEME_COLOR, para_data["theme_color"])
except AttributeError:
print(f" WARNING: Unknown theme color '{para_data['theme_color']}'")
def _check_duplicate_keys(pairs):
result = {}
for key, value in pairs:
if key in result:
raise ValueError(f"Duplicate key in JSON: '{key}'")
result[key] = value
return result
def _validate_replacements(inventory: InventoryData, replacements: Dict) -> List[str]:
    """Check that every slide/shape named in *replacements* exists in *inventory*.

    Top-level keys that do not start with ``slide-`` are skipped. Returns a
    list of human-readable error messages, empty when everything resolves.
    """
    problems: List[str] = []
    for slide_key, shapes_data in replacements.items():
        if not slide_key.startswith("slide-"):
            continue
        if slide_key not in inventory:
            problems.append(f"Slide '{slide_key}' not found in inventory")
            continue

        known = inventory[slide_key]
        unknown = [shape_key for shape_key in shapes_data if shape_key not in known]
        for shape_key in unknown:
            available = sorted(known.keys())
            problems.append(
                f"Shape '{shape_key}' not found on '{slide_key}'. "
                f"Available: {', '.join(available)}"
            )
    return problems
def apply_replacements(pptx_file: str, json_file: str, output_file: str):
    """Replace the text of every inventoried shape and save the result.

    Every shape found by ``extract_text_inventory`` is cleared; shapes that
    have a non-empty ``paragraphs`` list in the replacement JSON are then
    refilled. Replacement paragraphs without ``font_size`` inherit the
    explicit font size of the shape's first original paragraph, when it had
    one. After editing, the deck is inventoried again and any growth in frame
    overflow or any shape warning is printed before saving.

    Args:
        pptx_file: Path of the input .pptx.
        json_file: Path of the replacement JSON (read as UTF-8).
        output_file: Path the modified deck is written to.

    Raises:
        ValueError: If the JSON repeats a key or names a slide/shape that is
            not in the inventory.
    """
    prs = Presentation(pptx_file)
    inventory = extract_text_inventory(Path(pptx_file), prs)

    # Snapshot, per underlying shape element, the inventory key and the
    # overflow measured before editing. Shape keys cannot be used for the
    # later comparison: cleared shapes drop out of the second inventory and
    # the remaining "shape-N" keys are renumbered.
    baseline: Dict[Any, Any] = {}
    for shapes in inventory.values():
        for shape_key, sd in shapes.items():
            element = getattr(sd.shape, "element", None)
            if element is not None:
                baseline[element] = (shape_key, sd.frame_overflow_bottom)

    # inventory.py writes UTF-8 with non-ASCII text unescaped, so read it back
    # the same way instead of relying on the platform default encoding.
    with open(json_file, encoding="utf-8") as f:
        replacements = json.load(f, object_pairs_hook=_check_duplicate_keys)

    errors = _validate_replacements(inventory, replacements)
    if errors:
        print("ERROR: Invalid shapes in replacement JSON:")
        for e in errors:
            print(f" - {e}")
        raise ValueError(f"Found {len(errors)} validation error(s)")

    shapes_cleared = shapes_replaced = 0
    for slide_key, shapes_dict in inventory.items():
        if not slide_key.startswith("slide-"):
            continue
        for shape_key, shape_data in shapes_dict.items():
            if not shape_data.shape:
                continue

            # Must be read BEFORE clearing: ShapeData.paragraphs re-reads the
            # live text frame, which is empty once clear() has run.
            orig_paras = shape_data.paragraphs
            orig_font_size = orig_paras[0].font_size if orig_paras else None

            tf = shape_data.shape.text_frame  # type: ignore
            tf.clear()
            shapes_cleared += 1

            para_list = replacements.get(slide_key, {}).get(shape_key, {}).get("paragraphs")
            if not para_list:
                continue
            shapes_replaced += 1

            for i, para_data in enumerate(para_list):
                p = tf.paragraphs[0] if i == 0 else tf.add_paragraph()
                # Inherit original font_size if not specified in replacement
                if orig_font_size is not None and "font_size" not in para_data:
                    para_data = {**para_data, "font_size": orig_font_size}
                _apply_paragraph_properties(p, para_data)

    # Re-check overflow on the updated in-memory presentation.
    # Note: extract_text_inventory may add benign empty <a:solidFill/> elements
    # while reading font colors — these are harmless and ignored by PowerPoint.
    updated_inventory = extract_text_inventory(Path(pptx_file), prs)
    overflow_errors: List[str] = []
    warnings: List[str] = []
    for slide_key, shapes_dict in updated_inventory.items():
        for shape_key, sd in shapes_dict.items():
            element = getattr(sd.shape, "element", None)
            # Report under the key the shape had in the original inventory,
            # which is the key used in the replacement JSON.
            orig_key, orig_ov = baseline.get(element, (shape_key, None))
            for w in sd.warnings:
                warnings.append(f"{slide_key}/{orig_key}: {w}")
            new_ov = sd.frame_overflow_bottom
            if new_ov is not None:
                old_ov = orig_ov if orig_ov is not None else 0.0
                if new_ov > old_ov + 0.01:
                    overflow_errors.append(
                        f'{slide_key}/{orig_key}: overflow increased by {new_ov - old_ov:.2f}" '
                        f'(was {old_ov:.2f}", now {new_ov:.2f}")'
                    )

    if overflow_errors or warnings:
        print("\nWARNING: Issues in replacement output:")
        for e in overflow_errors:
            print(f" overflow - {e}")
        for w in warnings:
            print(f" warning - {w}")

    prs.save(output_file)
    print(f"Saved: {output_file}")
    print(f" Shapes cleared: {shapes_cleared}, replaced: {shapes_replaced}")
def main():
    """Command-line entry point: ``replace.py <input.pptx> <replacements.json> <output.pptx>``.

    Prints the module docstring and exits with status 1 on a wrong argument
    count; exits with status 1 when an input file is missing or when applying
    the replacements raises (the traceback is printed too).
    """
    if len(sys.argv) != 4:
        print(__doc__)
        sys.exit(1)

    input_pptx, replacements_json, output_pptx = map(Path, sys.argv[1:4])

    missing = next(
        (candidate for candidate in (input_pptx, replacements_json) if not candidate.exists()),
        None,
    )
    if missing is not None:
        print(f"Error: File not found: {missing}")
        sys.exit(1)

    try:
        apply_replacements(str(input_pptx), str(replacements_json), str(output_pptx))
    except Exception as e:
        import traceback

        print(f"Error: {e}")
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()

BIN
skills/ppt/scripts/tectonic Executable file

Binary file not shown.

352
skills/ppt/scripts/thumbnail.py Executable file
View File

@@ -0,0 +1,352 @@
#!/usr/bin/env python3
"""
Create thumbnail grids from PowerPoint presentation slides.
Creates a grid layout of slide thumbnails with configurable columns (max 6).
Each grid contains up to cols×(cols+1) images. For presentations with more
slides, multiple numbered grid files are created automatically.
The program outputs the names of all files created.
Output:
- Single grid: {prefix}.jpg (if slides fit in one grid)
- Multiple grids: {prefix}-1.jpg, {prefix}-2.jpg, etc.
Grid limits by column count:
- 3 cols: max 12 slides per grid (3×4)
- 4 cols: max 20 slides per grid (4×5)
- 5 cols: max 30 slides per grid (5×6) [default]
- 6 cols: max 42 slides per grid (6×7)
Usage:
python thumbnail.py input.pptx [output_prefix] [--cols N] [--outline-placeholders]
Examples:
python thumbnail.py presentation.pptx
# Creates: thumbnails.jpg (using default prefix)
# Outputs:
# Created 1 grid(s):
# - thumbnails.jpg
python thumbnail.py large-deck.pptx grid --cols 4
# Creates: grid-1.jpg, grid-2.jpg, grid-3.jpg
# Outputs:
# Created 3 grid(s):
# - grid-1.jpg
# - grid-2.jpg
# - grid-3.jpg
python thumbnail.py template.pptx analysis --outline-placeholders
# Creates thumbnail grids with red outlines around text placeholders
"""
import argparse
import subprocess
import sys
import tempfile
from pathlib import Path
from inventory import extract_text_inventory
from PIL import Image, ImageDraw, ImageFont
from pptx import Presentation
# Constants
THUMBNAIL_WIDTH = 300 # Fixed thumbnail width in pixels; passed to create_grids() by main()
CONVERSION_DPI = 100 # DPI for PDF to image conversion; passed to convert_to_images() by main()
MAX_COLS = 6 # Maximum number of columns; main() clamps larger --cols requests to this
DEFAULT_COLS = 5 # Default number of columns when --cols is not given
JPEG_QUALITY = 95 # JPEG compression quality
# Grid layout constants
GRID_PADDING = 20 # Padding between thumbnails
BORDER_WIDTH = 2 # Border width around thumbnails
FONT_SIZE_RATIO = 0.12 # Font size as fraction of thumbnail width
LABEL_PADDING_RATIO = 0.4 # Label padding as fraction of font size
def main():
    """Command-line entry point: render a deck's slides into thumbnail grid images.

    Parses the arguments, optionally collects text regions for outlining,
    converts the slides to images in a temporary directory and hands them to
    ``create_grids``. ``--cols`` above ``MAX_COLS`` is clamped with a warning;
    values below 1 are rejected, since a grid needs at least one column.
    A ``RuntimeError`` raised during processing ends the program with an
    ``Error: ...`` message.
    """
    parser = argparse.ArgumentParser(
        description="Create thumbnail grids from PowerPoint slides."
    )
    parser.add_argument("input", help="Input PowerPoint file (.pptx)")
    parser.add_argument(
        "output_prefix",
        nargs="?",
        default="thumbnails",
        help="Output prefix for image files (default: thumbnails, will create prefix.jpg or prefix-N.jpg)",
    )
    parser.add_argument(
        "--cols",
        type=int,
        default=DEFAULT_COLS,
        help=f"Number of columns (default: {DEFAULT_COLS}, max: {MAX_COLS})",
    )
    parser.add_argument(
        "--outline-placeholders",
        action="store_true",
        help="Outline text placeholders with a colored border",
    )
    args = parser.parse_args()

    if args.cols < 1:
        # Zero or negative columns cannot describe a grid; fail with usage
        # information instead of passing the value on to the layout code.
        parser.error("--cols must be at least 1")
    cols = min(args.cols, MAX_COLS)
    if args.cols > MAX_COLS:
        print(f"Warning: Columns limited to {MAX_COLS} (requested {args.cols})")

    input_path = Path(args.input)
    if not input_path.is_file() or input_path.suffix.lower() != ".pptx":
        sys.exit(f"Error: Invalid PowerPoint file: {args.input}")

    output_path = Path(f"{args.output_prefix}.jpg")
    print(f"Processing: {args.input}")

    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            placeholder_regions = None
            slide_dimensions = None
            if args.outline_placeholders:
                print("Extracting placeholder regions...")
                placeholder_regions, slide_dimensions = get_placeholder_regions(input_path)
                if placeholder_regions:
                    print(f"Found placeholders on {len(placeholder_regions)} slides")

            prs = Presentation(str(input_path))
            total_slides = len(prs.slides)
            # 1-based numbers of slides whose root element carries show="0".
            hidden_slides = {
                idx + 1
                for idx, slide in enumerate(prs.slides)
                if slide.element.get("show") == "0"
            }
            hidden_info = f" ({len(hidden_slides)} hidden)" if hidden_slides else ""
            print(f"Found {total_slides} slides{hidden_info}")

            slide_images = convert_to_images(input_path, temp_path, CONVERSION_DPI, total_slides, hidden_slides)
            if not slide_images:
                sys.exit("Error: No slides found")

            grid_files = create_grids(
                slide_images, cols, THUMBNAIL_WIDTH, output_path,
                placeholder_regions, slide_dimensions,
            )
            print(f"Created {len(grid_files)} grid(s):")
            for grid_file in grid_files:
                print(f" - {grid_file}")
    except RuntimeError as e:
        sys.exit(f"Error: {e}")
def create_hidden_slide_placeholder(size):
    """Build the stand-in image shown in place of a hidden slide.

    Args:
        size: (width, height) of the image to create, in pixels.

    Returns:
        A light-grey RGB image crossed out corner to corner with two
        darker grey diagonals.
    """
    canvas = Image.new("RGB", size, color="#F0F0F0")
    pen = ImageDraw.Draw(canvas)
    # Keep the cross visible on large renders: at least 5 px, otherwise 1% of
    # the shorter side.
    stroke = max(5, min(size) // 100)
    diagonals = (
        [(0, 0), size],
        [(size[0], 0), (0, size[1])],
    )
    for segment in diagonals:
        pen.line(segment, fill="#CCCCCC", width=stroke)
    return canvas
def get_placeholder_regions(pptx_path):
    """Collect the bounding box of every inventoried text shape, per slide.

    Args:
        pptx_path: Path to the presentation to inspect.

    Returns:
        A tuple ``(placeholder_regions, slide_dimensions)``:

        * ``placeholder_regions`` maps the integer slide index (parsed from
          the inventory key, e.g. ``"slide-3"`` -> ``3``) to a list of dicts
          with 'left', 'top', 'width' and 'height' entries, in inches.
          Slides without any region are left out.
        * ``slide_dimensions`` is ``(width_inches, height_inches)``.
    """
    emu_per_inch = 914400.0

    prs = Presentation(str(pptx_path))
    inventory = extract_text_inventory(pptx_path, prs)

    # Fall back to a 10in x 5.625in slide when the file carries no size.
    dimensions = (
        (prs.slide_width or 9144000) / emu_per_inch,
        (prs.slide_height or 5143500) / emu_per_inch,
    )

    regions_by_slide = {}
    for slide_key, shapes in inventory.items():
        slide_idx = int(slide_key.split("-")[1])
        boxes = []
        for shape in shapes.values():
            boxes.append(
                {
                    "left": shape.left,
                    "top": shape.top,
                    "width": shape.width,
                    "height": shape.height,
                }
            )
        if boxes:
            regions_by_slide[slide_idx] = boxes
    return regions_by_slide, dimensions
def _pptx_to_pdf(pptx_path, temp_dir):
"""Convert PPTX to PDF via LibreOffice. Returns path to the PDF file."""
pdf_path = temp_dir / f"{pptx_path.stem}.pdf"
result = subprocess.run(
["soffice", "--headless", "--convert-to", "pdf", "--outdir", str(temp_dir), str(pptx_path)],
capture_output=True,
text=True,
)
if result.returncode != 0 or not pdf_path.exists():
raise RuntimeError("PDF conversion failed")
return pdf_path
def _pdf_to_images(pdf_path, temp_dir, dpi):
"""Convert PDF pages to JPEG images via pdftoppm. Returns sorted image paths."""
result = subprocess.run(
["pdftoppm", "-jpeg", "-r", str(dpi), str(pdf_path), str(temp_dir / "slide")],
capture_output=True,
text=True,
)
if result.returncode != 0:
raise RuntimeError("Image conversion failed")
return sorted(temp_dir.glob("slide-*.jpg"))
def convert_to_images(pptx_path, temp_dir, dpi, total_slides, hidden_slides):
    """Render a presentation to one image per slide, hidden slides included.

    The deck is converted to PDF and then to JPEGs. Slides listed in
    ``hidden_slides`` get a generated placeholder image; the rendered pages
    are assigned, in order, to the remaining slide numbers.

    Args:
        pptx_path: Path to the presentation.
        temp_dir: Path of the working directory for intermediate files.
        dpi: Rendering resolution for the JPEGs.
        total_slides: Number of slides in the deck, hidden ones included.
        hidden_slides: Collection of 1-based numbers of the hidden slides.

    Returns:
        Image paths in slide order, or an empty list when no page was
        rendered.
    """
    rendered = _pdf_to_images(_pptx_to_pdf(pptx_path, temp_dir), temp_dir, dpi)
    if not rendered:
        return []

    # Placeholders are sized like the first rendered page.
    with Image.open(rendered[0]) as first_page:
        stub_size = first_page.size

    pending = iter(rendered)
    ordered = []
    for number in range(1, total_slides + 1):
        if number in hidden_slides:
            stub_path = temp_dir / f"hidden-{number:03d}.jpg"
            create_hidden_slide_placeholder(stub_size).save(stub_path, "JPEG")
            ordered.append(stub_path)
            continue
        # If fewer pages were rendered than expected, the surplus slide
        # numbers simply get no image.
        page = next(pending, None)
        if page is not None:
            ordered.append(page)
    return ordered
def create_grids(
    image_paths,
    cols,
    width,
    output_path,
    placeholder_regions=None,
    slide_dimensions=None,
):
    """Write the slide images out as one or more thumbnail grid files.

    Each grid holds at most ``cols * (cols + 1)`` images. If everything fits
    in a single grid it is saved to ``output_path``; otherwise the files are
    named ``<stem>-1<suffix>``, ``<stem>-2<suffix>``, ... next to it.

    Args:
        image_paths: Slide image paths in display order.
        cols: Number of thumbnails per row.
        width: Thumbnail width in pixels.
        output_path: Path of the (single-grid) output file.
        placeholder_regions: Optional regions to outline, passed through to
            ``create_grid``.
        slide_dimensions: Optional slide size, passed through to
            ``create_grid``.

    Returns:
        The saved grid file names as strings, in creation order.
    """
    per_grid = cols * (cols + 1)
    total = len(image_paths)
    needs_numbering = total > per_grid

    written = []
    for grid_no, offset in enumerate(range(0, total, per_grid), start=1):
        batch = image_paths[offset:offset + per_grid]
        # The batch offset doubles as the number of its first slide.
        sheet = create_grid(batch, cols, width, offset, placeholder_regions, slide_dimensions)
        if needs_numbering:
            target = output_path.parent / f"{output_path.stem}-{grid_no}{output_path.suffix}"
        else:
            target = output_path
        target.parent.mkdir(parents=True, exist_ok=True)
        sheet.save(str(target), quality=JPEG_QUALITY)
        written.append(str(target))
    return written
def create_grid(
    image_paths,
    cols,
    width,
    start_slide_num=0,
    placeholder_regions=None,
    slide_dimensions=None,
):
    """Compose one labelled thumbnail grid from a batch of slide images.

    Every image gets a cell ``width`` pixels wide; the cell height follows the
    aspect ratio of the first image. The label above each thumbnail is
    ``start_slide_num`` plus the image's position in ``image_paths``.

    Args:
        image_paths: Non-empty sequence of image files, in display order.
        cols: Number of thumbnails per row.
        width: Thumbnail cell width in pixels.
        start_slide_num: Number given to the first image; the same numbers
            are used to look slides up in ``placeholder_regions``.
        placeholder_regions: Optional dict mapping slide number to a list of
            region dicts with 'left', 'top', 'width' and 'height' entries.
            Those regions are outlined in red on the full-size image before
            it is scaled down.
        slide_dimensions: Optional (width, height) of a slide in the units of
            the regions. When omitted, it is derived from the image size and
            CONVERSION_DPI.

    Returns:
        The assembled grid as an RGB image.
    """
    label_size = int(width * FONT_SIZE_RATIO)
    pad = int(label_size * LABEL_PADDING_RATIO)

    # The first image fixes the aspect ratio used for every cell.
    with Image.open(image_paths[0]) as first:
        ratio = first.height / first.width
    height = int(width * ratio)

    # One cell = label strip (text plus padding above and below) + thumbnail.
    cell_h = height + label_size + pad * 2
    rows = (len(image_paths) + cols - 1) // cols
    canvas_size = (
        cols * width + (cols + 1) * GRID_PADDING,
        rows * cell_h + (rows + 1) * GRID_PADDING,
    )
    grid = Image.new("RGB", canvas_size, "white")
    draw = ImageDraw.Draw(grid)

    # Fall back to the unsized default font when requesting a size fails
    # (presumably Pillow versions without the ``size`` parameter).
    try:
        font = ImageFont.load_default(size=label_size)
    except Exception:
        font = ImageFont.load_default()

    for position, img_path in enumerate(image_paths):
        slide_no = start_slide_num + position
        row, col = divmod(position, cols)
        x = col * width + (col + 1) * GRID_PADDING
        y_top = row * cell_h + (row + 1) * GRID_PADDING

        # Centre the slide number horizontally above its thumbnail.
        caption = f"{slide_no}"
        text_left, _, text_right, _ = draw.textbbox((0, 0), caption, font=font)
        caption_x = x + (width - (text_right - text_left)) // 2
        draw.text((caption_x, y_top + pad), caption, fill="black", font=font)

        y_thumbnail = y_top + pad + label_size + pad

        with Image.open(img_path) as source:
            thumb = source
            orig_w, orig_h = source.size

            if placeholder_regions and slide_no in placeholder_regions:
                # Outlines are drawn on a transparent overlay at full
                # resolution, then flattened back to RGB.
                if thumb.mode != "RGBA":
                    thumb = thumb.convert("RGBA")

                if slide_dimensions:
                    slide_w_in, slide_h_in = slide_dimensions
                else:
                    slide_w_in = orig_w / CONVERSION_DPI
                    slide_h_in = orig_h / CONVERSION_DPI
                x_scale = orig_w / slide_w_in
                y_scale = orig_h / slide_h_in

                overlay = Image.new("RGBA", thumb.size, (255, 255, 255, 0))
                pen = ImageDraw.Draw(overlay)
                stroke = max(5, min(orig_w, orig_h) // 150)
                for region in placeholder_regions[slide_no]:
                    left_px = int(region["left"] * x_scale)
                    top_px = int(region["top"] * y_scale)
                    right_px = left_px + int(region["width"] * x_scale)
                    bottom_px = top_px + int(region["height"] * y_scale)
                    pen.rectangle(
                        [(left_px, top_px), (right_px, bottom_px)],
                        outline=(255, 0, 0, 255),
                        width=stroke,
                    )
                thumb = Image.alpha_composite(thumb, overlay).convert("RGB")

            # Shrink in place, then centre the result inside its cell.
            thumb.thumbnail((width, height), Image.Resampling.LANCZOS)
            w, h = thumb.size
            tx = x + (width - w) // 2
            ty = y_thumbnail + (height - h) // 2
            grid.paste(thumb, (tx, ty))

            if BORDER_WIDTH > 0:
                frame = [
                    (tx - BORDER_WIDTH, ty - BORDER_WIDTH),
                    (tx + w + BORDER_WIDTH - 1, ty + h + BORDER_WIDTH - 1),
                ]
                draw.rectangle(frame, outline="gray", width=BORDER_WIDTH)
    return grid
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()