Initial commit

2026-06-06 05:21:10 +00:00
commit 6664758a6d
493 changed files with 135653 additions and 0 deletions
--- a/skills/pdf/scripts/pdf_qa.py
+++ b/skills/pdf/scripts/pdf_qa.py
@@ -0,0 +1,901 @@
+#!/usr/bin/env python3
+"""
+PDF Quality Assurance Checker
+=============================
+Automatically detects common typesetting issues in PDFs.
+
+Usage: python3 pdf_qa.py <pdf_path>
+
+Checks:
+  1. Page size consistency across all pages
+  2. Blank page detection
+  3. CJK punctuation placement (line-start/end forbidden punctuation)
+  4. Color analysis (informational only — counts and lists colors)
+  5. Font embedding check (warns on non-embedded fonts)
+  6. PDF metadata check (title/author/creator)
+  7. Content overflow detection (text exceeding page boundaries)
+  8. Content fill ratio per page (multi-page docs, warns if < 40%)
+  9. Cover/poster full-bleed check (background extends to page edges)
+ 10. Margin symmetry check (left/right text margins)
+ 11. Table centering check (if detected)
+ 12. Formula overflow check (optional)
+"""
+
+import sys
+import os
+import re
+import json
+from collections import Counter
+
+try:
+    import pymupdf  # PyMuPDF
+except ImportError:
+    import fitz as pymupdf
+
+# ============================================================
+# Config
+# ============================================================
+
+# CJK punctuation forbidden at line start
+LINE_START_FORBIDDEN = set(
+    "。、，；：！？）】〛〉」』"
+    "\u201c\u201d"  # "" curly double quotes
+    "\u2026"        # … ellipsis
+    "\u2014"        # — em dash
+    "\uff5e"        # ～ fullwidth tilde
+    "\u00b7"        # · middle dot
+)
+
+# CJK punctuation forbidden at line end
+LINE_END_FORBIDDEN = set(
+    "（【《〈「"
+    "\u2018\u2019"  # '' curly single quotes
+    "\u201c"        # " left curly double quote
+)
+
+# Minimum fill ratio for last page (DISABLED — caused false positives)
+# LAST_PAGE_MIN_FILL = 0.40
+
+# Maximum allowed color count — REMOVED (color count is now info-only)
+# MAX_COLORS = 8
+
+# ============================================================
+# Checks
+# ============================================================
+
+class QAResult:
+    def __init__(self):
+        self.issues = []     # (severity, category, message)
+        self.passes = []     # passed checks
+        self.info = []       # informational
+    
+    def error(self, cat, msg):
+        self.issues.append(('ERROR', cat, msg))
+    
+    def warn(self, cat, msg):
+        self.issues.append(('WARN', cat, msg))
+    
+    def ok(self, msg):
+        self.passes.append(msg)
+    
+    def add_info(self, msg):
+        self.info.append(msg)
+
+
+def check_last_page_fill(doc, result):
+    """Check content fill ratio of the last page"""
+    if len(doc) < 2:
+        result.ok("Single-page document, no last-page blank check needed")
+        return
+    
+    last_page = doc[-1]
+    page_rect = last_page.rect
+    page_area = page_rect.width * page_rect.height
+    
+    # Get bounding boxes of all content on last page
+    blocks = last_page.get_text("blocks")
+    if not blocks:
+        result.error("Last page blank", f"Page {len(doc)} (last page) has no content at all!")
+        return
+    
+    # Calculate max y-coordinate covered by content
+    max_y = 0
+    min_y = page_rect.height
+    for b in blocks:
+        if b[4].strip():  # Has text content
+            min_y = min(min_y, b[1])
+            max_y = max(max_y, b[3])
+    
+    if max_y == 0:
+        result.error("Last page blank", f"Page {len(doc)} (last page) has no valid text content")
+        return
+    
+    content_height = max_y - min_y
+    fill_ratio = content_height / page_rect.height
+    
+    result.add_info(f"Last page fill ratio: {fill_ratio:.0%} (content height {content_height:.0f}px / page height {page_rect.height:.0f}px)")
+    
+    if fill_ratio < 0.25:
+        result.error("Last page blank", f"Last page fill ratio only {fill_ratio:.0%}, mostly blank! Consider compressing preceding page spacing or trimming content")
+    elif fill_ratio < LAST_PAGE_MIN_FILL:
+        result.warn("Last page blank", f"Last page fill ratio {fill_ratio:.0%}, somewhat sparse — optimization recommended")
+    else:
+        result.ok(f"Last page fill ratio {fill_ratio:.0%} ✓")
+
+
+def check_punctuation(doc, result):
+    """Check CJK punctuation placement rules"""
+    violations = []
+    
+    for page_num in range(len(doc)):
+        page = doc[page_num]
+        # Extract text by line
+        text_dict = page.get_text("dict")
+        
+        for block in text_dict.get("blocks", []):
+            if block.get("type") != 0:  # Only check text blocks
+                continue
+            for line in block.get("lines", []):
+                line_text = ""
+                for span in line.get("spans", []):
+                    line_text += span.get("text", "")
+                
+                line_text = line_text.strip()
+                if not line_text:
+                    continue
+                
+                # Check line start
+                first_char = line_text[0]
+                if first_char in LINE_START_FORBIDDEN:
+                    violations.append((page_num + 1, f"Forbidden line-start punctuation '{first_char}': ...{line_text[:30]}"))
+                
+                # Check line end
+                last_char = line_text[-1] if len(line_text) > 0 else ''
+                if last_char in LINE_END_FORBIDDEN:
+                    violations.append((page_num + 1, f"Forbidden line-end punctuation '{last_char}': {line_text[-30:]}..."))
+    
+    if violations:
+        # Show at most 10
+        shown = violations[:10]
+        for page_num, desc in shown:
+            result.warn("Punctuation rules", f"Page {page_num} - {desc}")
+        if len(violations) > 10:
+            result.warn("Punctuation rules", f"...{len(violations) - 10} more violations")
+    else:
+        result.ok("Punctuation placement check passed ✓")
+
+
+def check_blank_pages(doc, result):
+    """Check for completely blank pages"""
+    blank_pages = []
+    for i in range(len(doc)):
+        page = doc[i]
+        text = page.get_text().strip()
+        # Also check for images
+        images = page.get_images()
+        drawings = page.get_drawings()
+        
+        if not text and not images and not drawings:
+            blank_pages.append(i + 1)
+    
+    if blank_pages:
+        result.error("Blank pages", f"Found blank pages: {blank_pages}")
+    else:
+        result.ok("No blank pages ✓")
+
+
+def check_colors(doc, result):
+    """Analyze colors used in the document (informational only, no pass/fail)"""
+    colors = set()
+    
+    for page_num in range(len(doc)):
+        page = doc[page_num]
+        text_dict = page.get_text("dict")
+        
+        for block in text_dict.get("blocks", []):
+            if block.get("type") != 0:
+                continue
+            for line in block.get("lines", []):
+                for span in line.get("spans", []):
+                    color = span.get("color", 0)
+                    if color != 0:  # Exclude pure black
+                        r = (color >> 16) & 0xFF
+                        g = (color >> 8) & 0xFF
+                        b = color & 0xFF
+                        hex_color = f"#{r:02x}{g:02x}{b:02x}"
+                        colors.add(hex_color)
+        
+        # Check drawing colors
+        drawings = page.get_drawings()
+        for d in drawings:
+            if d.get("color"):
+                c = d["color"]
+                if isinstance(c, (tuple, list)) and len(c) >= 3:
+                    hex_color = f"#{int(c[0]*255):02x}{int(c[1]*255):02x}{int(c[2]*255):02x}"
+                    colors.add(hex_color)
+            if d.get("fill"):
+                c = d["fill"]
+                if isinstance(c, (tuple, list)) and len(c) >= 3:
+                    hex_color = f"#{int(c[0]*255):02x}{int(c[1]*255):02x}{int(c[2]*255):02x}"
+                    colors.add(hex_color)
+    
+    # Filter out near-black/white/gray colors
+    distinct_colors = []
+    for c in colors:
+        r = int(c[1:3], 16)
+        g = int(c[3:5], 16)
+        b = int(c[5:7], 16)
+        max_diff = max(abs(r-g), abs(g-b), abs(r-b))
+        if max_diff > 20:
+            distinct_colors.append(c)
+    
+    result.add_info(f"Total text colors: {len(colors)} (chromatic: {len(distinct_colors)})")
+    
+    if distinct_colors:
+        result.add_info(f"Chromatic colors: {', '.join(sorted(distinct_colors)[:10])}")
+
+
+def check_page_size_consistency(doc, result):
+    """Check whether all page sizes are consistent"""
+    if len(doc) < 2:
+        result.ok("Single-page document, size consistent ✓")
+        return
+    
+    sizes = set()
+    for i in range(len(doc)):
+        page = doc[i]
+        w = round(page.rect.width, 1)
+        h = round(page.rect.height, 1)
+        sizes.add((w, h))
+    
+    if len(sizes) > 1:
+        result.warn("Page size", f"Inconsistent page sizes: {sizes}")
+    else:
+        size = list(sizes)[0]
+        # Convert to mm
+        w_mm = size[0] * 25.4 / 72
+        h_mm = size[1] * 25.4 / 72
+        result.add_info(f"Page size: {w_mm:.0f}mm × {h_mm:.0f}mm ({len(doc)} pages)")
+        result.ok("Page size consistent ✓")
+
+
+def check_text_overflow(doc, result):
+    """Check whether text overflows page boundaries"""
+    overflow_pages = []
+    
+    for i in range(len(doc)):
+        page = doc[i]
+        rect = page.rect
+        blocks = page.get_text("blocks")
+        
+        for b in blocks:
+            # b = (x0, y0, x1, y1, text, block_no, block_type)
+            if b[2] > rect.width + 2 or b[3] > rect.height + 2:  # 2px tolerance
+                overflow_pages.append(i + 1)
+                break
+            if b[0] < -2 or b[1] < -2:
+                overflow_pages.append(i + 1)
+                break
+    
+    if overflow_pages:
+        result.warn("Content overflow", f"Pages {overflow_pages} may have content exceeding page boundaries")
+    else:
+        result.ok("No content overflow ✓")
+
+
+def check_content_fill_ratio(doc, result):
+    """Check content fill ratio per page — warns when content is crammed at top leaving large void below.
+    
+    Rules:
+    - Skip single-page documents (may be intentional design)
+    - Skip page 1 (usually cover with intentional whitespace)
+    - Middle pages: warn if fill ratio < 40%
+    - Last page: warn if fill ratio < 25% (naturally has less content)
+    """
+    if len(doc) < 2:
+        result.ok("Single-page document, skipping content fill ratio check ✓")
+        return
+    
+    low_fill_pages = []
+    
+    for i in range(len(doc)):
+        page = doc[i]
+        page_rect = page.rect
+        page_height = page_rect.height
+        
+        # Skip page 1 (cover)
+        if i == 0:
+            continue
+        
+        blocks = page.get_text("blocks")
+        images = page.get_images()
+        drawings = page.get_drawings()
+        
+        if not blocks and not images and not drawings:
+            continue  # Blank page check handles this
+        
+        # Calculate content bbox
+        max_y = 0
+        for b in blocks:
+            if b[4].strip():
+                max_y = max(max_y, b[3])
+        
+        # Include images in bbox
+        for img in images:
+            try:
+                img_rects = page.get_image_rects(img[0])
+                for r in img_rects:
+                    max_y = max(max_y, r.y1)
+            except Exception:
+                pass
+        
+        if max_y == 0:
+            continue
+        
+        fill_ratio = max_y / page_height
+        is_last = (i == len(doc) - 1)
+        threshold = 0.25 if is_last else 0.40
+        
+        if fill_ratio < threshold:
+            low_fill_pages.append((i + 1, fill_ratio, threshold))
+    
+    if low_fill_pages:
+        for pg, ratio, thresh in low_fill_pages:
+            result.warn(
+                "Content fill ratio",
+                f"Page {pg} content only fills {ratio:.0%} of page height "
+                f"(threshold: {thresh:.0%}). Content may be crammed at the top "
+                f"with a large blank area below."
+            )
+    else:
+        result.ok("Content fill ratio adequate on all pages ✓")
+
+
+def check_cover_bleed(doc, result, poster=False):
+    """Check if the cover page (page 1) fills the entire page area (full-bleed).
+
+    A properly designed cover should have background color/graphics extending
+    to the page edges. If the content bbox has significant margins on all sides,
+    the cover likely wasn't rendered full-bleed (e.g. ReportLab with default margins).
+
+    For poster mode: checks ALL pages (not just the cover) since every page of a
+    seamlessly-paginated poster should have consistent background fill.
+
+    Strategy: combine bounding boxes of drawings (rects, paths), images, and colored
+    backgrounds. If the union bbox leaves > 5% margin on any side, warn.
+    """
+    if not poster and len(doc) < 2:
+        # Single page doc (non-poster) — not necessarily a cover scenario
+        return
+
+    pages_to_check = range(len(doc)) if poster else [0]
+    
+    for page_idx in pages_to_check:
+        page = doc[page_idx]
+        page_rect = page.rect
+        pw, ph = page_rect.width, page_rect.height
+
+        # Collect all content bounding boxes
+        min_x, min_y = pw, ph
+        max_x, max_y = 0.0, 0.0
+        has_content = False
+
+        # 1. Drawings (vector paths, rectangles — typical for colored backgrounds)
+        for d in page.get_drawings():
+            r = d.get("rect")
+            if r:
+                min_x = min(min_x, r.x0)
+                min_y = min(min_y, r.y0)
+                max_x = max(max_x, r.x1)
+                max_y = max(max_y, r.y1)
+                has_content = True
+
+        # 2. Images
+        for img in page.get_images():
+            try:
+                for r in page.get_image_rects(img[0]):
+                    min_x = min(min_x, r.x0)
+                    min_y = min(min_y, r.y0)
+                    max_x = max(max_x, r.x1)
+                    max_y = max(max_y, r.y1)
+                    has_content = True
+            except Exception:
+                pass
+
+        page_label = f"Page {page_idx + 1}" if poster else "Cover page (p1)"
+
+        if not has_content:
+            blocks = page.get_text("blocks")
+            if blocks:
+                result.warn(
+                    f"{page_label} not full-bleed",
+                    f"{page_label} has no background graphics (no filled rectangles or images). "
+                    "A proper cover/poster page should have a full-page background color or image "
+                    "extending to all edges."
+                )
+            continue
+
+        # Calculate margin ratios (how far content is from page edges)
+        margin_left = max(0, min_x) / pw
+        margin_top = max(0, min_y) / ph
+        margin_right = max(0, pw - max_x) / pw
+        margin_bottom = max(0, ph - max_y) / ph
+
+        threshold = 0.05
+        margins_ok = (margin_left <= threshold and margin_top <= threshold and
+                      margin_right <= threshold and margin_bottom <= threshold)
+
+        if margins_ok:
+            result.ok(f"{page_label} content extends to page edges (full-bleed) ✓")
+        else:
+            sides = []
+            if margin_left > threshold:
+                sides.append(f"left {margin_left:.0%}")
+            if margin_top > threshold:
+                sides.append(f"top {margin_top:.0%}")
+            if margin_right > threshold:
+                sides.append(f"right {margin_right:.0%}")
+            if margin_bottom > threshold:
+                sides.append(f"bottom {margin_bottom:.0%}")
+            result.warn(
+                f"{page_label} not full-bleed",
+                f"{page_label} has visible margins: {', '.join(sides)}. "
+                f"Background/graphics should extend to page edges."
+            )
+
+
+def check_margin_symmetry(doc, result, skip_cover=False):
+    """Check left/right margin symmetry using text block bounds."""
+    warn_pages = []
+
+    for page_num in range(len(doc)):
+        if skip_cover and page_num == 0:
+            continue
+
+        page = doc[page_num]
+        blocks = page.get_text("blocks")
+        text_blocks = [b for b in blocks if b[4].strip()]
+
+        if len(text_blocks) < 3:
+            continue  # Skip decorative/cover-like pages
+
+        left_margin = min(b[0] for b in text_blocks)
+        right_margin = page.rect.width - max(b[2] for b in text_blocks)
+        diff = abs(left_margin - right_margin)
+
+        if diff > page.rect.width * 0.05:
+            warn_pages.append((page_num + 1, left_margin, right_margin, diff))
+
+    if warn_pages:
+        for pg, left, right, diff in warn_pages:
+            result.warn(
+                "Margin symmetry",
+                f"Page {pg} left/right margins differ by {diff:.0f}pt "
+                f"(L {left:.0f}pt, R {right:.0f}pt)"
+            )
+    else:
+        result.ok("Left/right margins appear symmetric \u2713")
+
+
+def check_table_centering(doc, result):
+    """Check if detected table regions are centered."""
+    def _bbox_intersects(a, b, tol=6):
+        return not (a[2] < b[0] - tol or a[0] > b[2] + tol or
+                    a[3] < b[1] - tol or a[1] > b[3] + tol)
+
+    def _rect_tuple(r):
+        if hasattr(r, "x0"):
+            return (r.x0, r.y0, r.x1, r.y1)
+        return (r[0], r[1], r[2], r[3])
+
+    any_tables = False
+
+    for page_num in range(len(doc)):
+        page = doc[page_num]
+        drawings = page.get_drawings()
+        segments = []
+
+        for d in drawings:
+            for item in d.get("items", []):
+                if not item:
+                    continue
+                op = item[0]
+                if op == "l" and len(item) >= 3:
+                    p0, p1 = item[1], item[2]
+                    segments.append((p0[0], p0[1], p1[0], p1[1]))
+                elif op == "re" and len(item) >= 2:
+                    x0, y0, x1, y1 = _rect_tuple(item[1])
+                    segments.extend([
+                        (x0, y0, x1, y0),
+                        (x0, y1, x1, y1),
+                        (x0, y0, x0, y1),
+                        (x1, y0, x1, y1),
+                    ])
+
+        if not segments:
+            continue
+
+        cluster_list = []
+        for x0, y0, x1, y1 in segments:
+            min_x, max_x = min(x0, x1), max(x0, x1)
+            min_y, max_y = min(y0, y1), max(y0, y1)
+            bbox = (min_x, min_y, max_x, max_y)
+            is_h = abs(y0 - y1) < 1 and (max_x - min_x) > 20
+            is_v = abs(x0 - x1) < 1 and (max_y - min_y) > 20
+            if not is_h and not is_v:
+                continue
+
+            placed = False
+            for cl in cluster_list:
+                if _bbox_intersects(bbox, cl["bbox"]):
+                    cl["segments"].append((x0, y0, x1, y1, is_h, is_v))
+                    cl["bbox"] = (
+                        min(cl["bbox"][0], bbox[0]),
+                        min(cl["bbox"][1], bbox[1]),
+                        max(cl["bbox"][2], bbox[2]),
+                        max(cl["bbox"][3], bbox[3]),
+                    )
+                    if is_h:
+                        cl["h"] += 1
+                    if is_v:
+                        cl["v"] += 1
+                    placed = True
+                    break
+            if not placed:
+                cluster_list.append({
+                    "bbox": bbox,
+                    "segments": [(x0, y0, x1, y1, is_h, is_v)],
+                    "h": 1 if is_h else 0,
+                    "v": 1 if is_v else 0,
+                })
+
+        for cl in cluster_list:
+            if cl["h"] < 2 or cl["v"] < 2:
+                continue
+            any_tables = True
+            bbox = cl["bbox"]
+            page_width = page.rect.width
+            left_margin = bbox[0]
+            right_margin = page_width - bbox[2]
+            if abs(left_margin - right_margin) > page_width * 0.05:
+                result.warn(
+                    "Table centering",
+                    f"Page {page_num + 1}: Table not centered "
+                    f"(L {left_margin:.0f}pt, R {right_margin:.0f}pt)"
+                )
+
+    if any_tables:
+        result.ok("Table centering check complete \u2713")
+
+
+def check_font_embedding(doc, result):
+    """Check font embedding status using PyMuPDF font list."""
+    fonts_used = set()
+    non_embedded = set()
+
+    for page_num in range(len(doc)):
+        page = doc[page_num]
+        for font in page.get_fonts():
+            basefont = font[3] if len(font) > 3 else "unknown"
+            ext = font[1] if len(font) > 1 else ""
+            fonts_used.add(basefont)
+            if not ext:
+                non_embedded.add(basefont)
+
+    if fonts_used:
+        result.add_info(f"Fonts used: {', '.join(sorted(fonts_used))}")
+    else:
+        result.add_info("Fonts used: (none detected)")
+
+    if non_embedded:
+        for basefont in sorted(non_embedded):
+            result.warn(
+                "Font embedding",
+                f"Font {basefont} is not embedded. May display differently on other systems."
+            )
+    else:
+        result.ok("All fonts are embedded \u2713")
+
+
+def check_helvetica_in_cjk(doc, result):
+    """Detect Helvetica rendering visible text in documents containing CJK text.
+
+    Helvetica is a Latin-only built-in PDF font. When it appears rendering
+    actual text content in a CJK document, it almost always means a raw string
+    was passed to a ReportLab Table or flowable without wrapping it in
+    Paragraph() with a CJK font. The CJK characters rendered via Helvetica
+    become garbled (fall back to ZapfDingbats symbols).
+
+    We only check Helvetica (not ZapfDingbats) because ZapfDingbats is
+    legitimately used for bullet symbols in list items.
+
+    We check actual rendered text spans (not just font presence in font list)
+    because ReportLab internally registers Helvetica on every page even when
+    only CJK fonts are used in visible content.
+    """
+    has_cjk = False
+    helvetica_pages = []
+
+    for page_num in range(len(doc)):
+        page = doc[page_num]
+        text = page.get_text("text") or ""
+
+        # Check if document contains CJK characters
+        if not has_cjk:
+            for ch in text:
+                if '\u4e00' <= ch <= '\u9fff' or '\u3400' <= ch <= '\u4dbf':
+                    has_cjk = True
+                    break
+
+        # Check if Helvetica is actually used to render visible text on this page
+        blocks = page.get_text("dict", sort=True).get("blocks", [])
+        found_on_page = False
+        for block in blocks:
+            if found_on_page:
+                break
+            for line in block.get("lines", []):
+                if found_on_page:
+                    break
+                for span in line.get("spans", []):
+                    font = span.get("font", "")
+                    txt = span.get("text", "").strip()
+                    if "Helvetica" in font and len(txt) > 0:
+                        helvetica_pages.append(page_num + 1)
+                        found_on_page = True
+                        break
+
+    if has_cjk and helvetica_pages:
+        pages_str = ', '.join(str(p) for p in helvetica_pages[:5])
+        if len(helvetica_pages) > 5:
+            pages_str += f' ...and {len(helvetica_pages) - 5} more'
+        result.warn(
+            "Helvetica in CJK document",
+            f"Helvetica font detected rendering text on page(s) {pages_str} in a CJK document. "
+            f"This usually means a raw string was passed to a ReportLab Table or flowable "
+            f"without wrapping in Paragraph(text, style) with a CJK-capable font. "
+            f"CJK characters rendered via Helvetica will appear as garbled symbols."
+        )
+
+
+def check_metadata(doc, result):
+    """Check PDF metadata presence for title, author, creator."""
+    meta = doc.metadata or {}
+
+    def _missing(v):
+        if v is None:
+            return True
+        if not str(v).strip():
+            return True
+        return False
+
+    title = meta.get("title")
+    author = meta.get("author")
+    creator = meta.get("creator")
+
+    if _missing(title) or str(title).strip().lower() in ("untitled", "(anonymous)"):
+        result.warn("Metadata", "Missing/invalid title metadata")
+    else:
+        result.ok("Title metadata present \u2713")
+
+    if _missing(author):
+        result.warn("Metadata", "Missing author metadata")
+    else:
+        result.ok("Author metadata present \u2713")
+
+    if _missing(creator):
+        result.warn("Metadata", "Missing creator metadata")
+    else:
+        result.ok("Creator metadata present \u2713")
+
+
+def check_toc_without_cover(doc, result):
+    """Detect TOC on page 1 without a preceding cover page.
+    
+    If the first page contains Table of Contents / 目录, it means the document
+    has a TOC but no cover page. This is a structural issue — documents with
+    TOC should have: Cover (p1) → TOC (p2) → Content (p3+).
+    """
+    if len(doc) < 2:
+        # Single-page docs don't need TOC/cover checks
+        return
+    
+    page1 = doc[0]
+    text = page1.get_text("text", sort=True).strip()
+    
+    # Normalize for matching
+    text_lower = text.lower()
+    first_300 = text_lower[:300]
+    
+    toc_keywords = [
+        "table of contents", "contents",
+        "目录", "目 录",
+    ]
+    
+    has_toc = any(kw in first_300 for kw in toc_keywords)
+    
+    if has_toc:
+        result.warn(
+            "TOC without cover",
+            "Page 1 appears to be a Table of Contents with no preceding cover page. "
+            "Documents with TOC should have: Cover (p1) → TOC (p2) → Content (p3+)."
+        )
+
+
+def check_formula_overflow(doc, result):
+    """Detect likely formula overflow past right content margin."""
+    math_re = re.compile(r"[=+\-*/<>\u2264\u2265\u2211\u222b\u221a\u03c0\u00b5\u221e\u2202\u2206\u2248\u2260\u00b1\u00d7\u00f7]")
+
+    for page_num in range(len(doc)):
+        page = doc[page_num]
+        blocks = page.get_text("blocks")
+        text_blocks = [b for b in blocks if b[4].strip()]
+
+        if len(text_blocks) < 3:
+            continue
+
+        right_edges = sorted(b[2] for b in text_blocks)
+        mid = len(right_edges) // 2
+        content_right = right_edges[mid] if right_edges else 0
+
+        for b in text_blocks:
+            x0, x1, text = b[0], b[2], b[4]
+            if x1 <= content_right + 10:
+                continue
+
+            is_single_line = "\n" not in text.strip()
+            is_wide = (x1 - x0) > page.rect.width * 0.5
+            has_math = bool(math_re.search(text))
+
+            if (is_single_line and is_wide) or has_math:
+                delta = x1 - content_right
+                result.warn(
+                    "Formula overflow",
+                    f"Page {page_num + 1}: Content extends {delta:.0f}pt beyond right content margin "
+                    "(possible formula overflow)"
+                )
+                break
+
+
+# ============================================================
+# Main
+# ============================================================
+
+def run_qa(pdf_path, poster=False, skip_cover=False, check_tables=True, check_formulas=False):
+    result = QAResult()
+    
+    if not os.path.exists(pdf_path):
+        result.error("File", f"File not found: {pdf_path}")
+        return result
+    
+    doc = pymupdf.open(pdf_path)
+    
+    result.add_info(f"File: {os.path.basename(pdf_path)}")
+    result.add_info(f"Size: {os.path.getsize(pdf_path) / 1024:.1f} KB")
+    if poster:
+        result.add_info("Mode: poster (creative)")
+    
+    # Run all checks
+    check_metadata(doc, result)
+    check_page_size_consistency(doc, result)
+    check_blank_pages(doc, result)
+    check_punctuation(doc, result)
+    check_colors(doc, result)
+    check_font_embedding(doc, result)
+    check_helvetica_in_cjk(doc, result)
+    check_text_overflow(doc, result)
+    if not poster:
+        # Content fill ratio is not meaningful for posters — the last page
+        # of a seamlessly-paginated poster naturally has less content.
+        check_content_fill_ratio(doc, result)
+    check_cover_bleed(doc, result, poster=poster)
+    check_margin_symmetry(doc, result, skip_cover=skip_cover)
+    if check_tables:
+        check_table_centering(doc, result)
+    if check_formulas:
+        check_formula_overflow(doc, result)
+    if not poster:
+        check_toc_without_cover(doc, result)
+    
+    doc.close()
+    return result
+
+
+def format_report(result):
+    lines = []
+    lines.append("=" * 56)
+    lines.append("  PDF Quality Assurance Report")
+    lines.append("=" * 56)
+    
+    # Info
+    if result.info:
+        lines.append("")
+        lines.append("ℹ️  Info:")
+        for msg in result.info:
+            lines.append(f"   {msg}")
+    
+    # Passes
+    if result.passes:
+        lines.append("")
+        lines.append(f"✅ Passed ({len(result.passes)}):")
+        for msg in result.passes:
+            lines.append(f"   {msg}")
+    
+    # Issues
+    errors = [(s, c, m) for s, c, m in result.issues if s == 'ERROR']
+    warns = [(s, c, m) for s, c, m in result.issues if s == 'WARN']
+    
+    if errors:
+        lines.append("")
+        lines.append(f"❌ Errors ({len(errors)}):")
+        for _, cat, msg in errors:
+            lines.append(f"   [{cat}] {msg}")
+    
+    if warns:
+        lines.append("")
+        lines.append(f"⚠️  Warnings ({len(warns)}):")
+        for _, cat, msg in warns:
+            lines.append(f"   [{cat}] {msg}")
+    
+    # Summary
+    lines.append("")
+    lines.append("-" * 56)
+    total_issues = len(result.issues)
+    if total_issues == 0:
+        lines.append("🎉 PASS — All checks passed!")
+    elif errors:
+        lines.append(f"💀 FAIL — {len(errors)} error(s), {len(warns)} warning(s)")
+    else:
+        lines.append(f"⚠️  WARN — {len(warns)} warning(s), optimization recommended")
+    lines.append("-" * 56)
+    
+    return "\n".join(lines)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("Usage: python3 pdf_qa.py <pdf_path>")
+        print("       python3 pdf_qa.py *.pdf  (batch check)")
+        print("Options:")
+        print("  --poster      Poster mode (creative)")
+        print("  --skip-cover  Skip page 1 margin symmetry check")
+        print("  --no-tables   Disable table centering check")
+        print("  --formulas    Enable formula overflow check")
+        sys.exit(1)
+    
+    import glob
+    files = []
+    poster = False
+    skip_cover = False
+    check_tables = True
+    check_formulas = False
+    args = sys.argv[1:]
+    if '--poster' in args:
+        poster = True
+        args.remove('--poster')
+    if '--skip-cover' in args:
+        skip_cover = True
+        args.remove('--skip-cover')
+    if '--no-tables' in args:
+        check_tables = False
+        args.remove('--no-tables')
+    if '--formulas' in args:
+        check_formulas = True
+        args.remove('--formulas')
+    for arg in args:
+        files.extend(glob.glob(arg))
+    
+    if not files:
+        print(f"File not found: {args}")
+        sys.exit(1)
+    
+    for pdf_path in files:
+        result = run_qa(
+            pdf_path,
+            poster=poster,
+            skip_cover=skip_cover,
+            check_tables=check_tables,
+            check_formulas=check_formulas
+        )
+        print(format_report(result))
+        if len(files) > 1:
+            print("\n")