mantle-ai-trader/skills/pdf/scripts/pdf_qa.py

#!/usr/bin/env python3
"""
PDF Quality Assurance Checker
=============================
Automatically detects common typesetting issues in PDFs.

Usage: python3 pdf_qa.py <pdf_path>

Checks:
  1. Page size consistency across all pages
  2. Blank page detection
  3. CJK punctuation placement (line-start/end forbidden punctuation)
  4. Color analysis (informational only — counts and lists colors)
  5. Font embedding check (warns on non-embedded fonts)
  6. PDF metadata check (title/author/creator)
  7. Content overflow detection (text exceeding page boundaries)
  8. Content fill ratio per page (multi-page docs, warns if < 40%)
  9. Cover/poster full-bleed check (background extends to page edges)
 10. Margin symmetry check (left/right text margins)
 11. Table centering check (if detected)
 12. Formula overflow check (optional)
"""

import sys
import os
import re
import json
from collections import Counter

try:
    import pymupdf  # PyMuPDF
except ImportError:
    import fitz as pymupdf

# ============================================================
# Config
# ============================================================

# CJK punctuation forbidden at line start
LINE_START_FORBIDDEN = set(
    "。、，；：！？）】〛〉」』"
    "\u201c\u201d"  # "" curly double quotes
    "\u2026"        # … ellipsis
    "\u2014"        # — em dash
    "\uff5e"        # ～ fullwidth tilde
    "\u00b7"        # · middle dot
)

# CJK punctuation forbidden at line end
LINE_END_FORBIDDEN = set(
    "（【《〈「"
    "\u2018\u2019"  # '' curly single quotes
    "\u201c"        # " left curly double quote
)

# Minimum fill ratio for last page (DISABLED — caused false positives)
# LAST_PAGE_MIN_FILL = 0.40

# Maximum allowed color count — REMOVED (color count is now info-only)
# MAX_COLORS = 8

# ============================================================
# Checks
# ============================================================

class QAResult:
    def __init__(self):
        self.issues = []     # (severity, category, message)
        self.passes = []     # passed checks
        self.info = []       # informational

    def error(self, cat, msg):
        self.issues.append(('ERROR', cat, msg))

    def warn(self, cat, msg):
        self.issues.append(('WARN', cat, msg))

    def ok(self, msg):
        self.passes.append(msg)

    def add_info(self, msg):
        self.info.append(msg)


def check_last_page_fill(doc, result):
    """Check content fill ratio of the last page"""
    if len(doc) < 2:
        result.ok("Single-page document, no last-page blank check needed")
        return

    last_page = doc[-1]
    page_rect = last_page.rect
    page_area = page_rect.width * page_rect.height

    # Get bounding boxes of all content on last page
    blocks = last_page.get_text("blocks")
    if not blocks:
        result.error("Last page blank", f"Page {len(doc)} (last page) has no content at all!")
        return

    # Calculate max y-coordinate covered by content
    max_y = 0
    min_y = page_rect.height
    for b in blocks:
        if b[4].strip():  # Has text content
            min_y = min(min_y, b[1])
            max_y = max(max_y, b[3])

    if max_y == 0:
        result.error("Last page blank", f"Page {len(doc)} (last page) has no valid text content")
        return

    content_height = max_y - min_y
    fill_ratio = content_height / page_rect.height

    result.add_info(f"Last page fill ratio: {fill_ratio:.0%} (content height {content_height:.0f}px / page height {page_rect.height:.0f}px)")

    if fill_ratio < 0.25:
        result.error("Last page blank", f"Last page fill ratio only {fill_ratio:.0%}, mostly blank! Consider compressing preceding page spacing or trimming content")
    elif fill_ratio < LAST_PAGE_MIN_FILL:
        result.warn("Last page blank", f"Last page fill ratio {fill_ratio:.0%}, somewhat sparse — optimization recommended")
    else:
        result.ok(f"Last page fill ratio {fill_ratio:.0%} ✓")


def check_punctuation(doc, result):
    """Check CJK punctuation placement rules"""
    violations = []

    for page_num in range(len(doc)):
        page = doc[page_num]
        # Extract text by line
        text_dict = page.get_text("dict")

        for block in text_dict.get("blocks", []):
            if block.get("type") != 0:  # Only check text blocks
                continue
            for line in block.get("lines", []):
                line_text = ""
                for span in line.get("spans", []):
                    line_text += span.get("text", "")

                line_text = line_text.strip()
                if not line_text:
                    continue

                # Check line start
                first_char = line_text[0]
                if first_char in LINE_START_FORBIDDEN:
                    violations.append((page_num + 1, f"Forbidden line-start punctuation '{first_char}': ...{line_text[:30]}"))

                # Check line end
                last_char = line_text[-1] if len(line_text) > 0 else ''
                if last_char in LINE_END_FORBIDDEN:
                    violations.append((page_num + 1, f"Forbidden line-end punctuation '{last_char}': {line_text[-30:]}..."))

    if violations:
        # Show at most 10
        shown = violations[:10]
        for page_num, desc in shown:
            result.warn("Punctuation rules", f"Page {page_num} - {desc}")
        if len(violations) > 10:
            result.warn("Punctuation rules", f"...{len(violations) - 10} more violations")
    else:
        result.ok("Punctuation placement check passed ✓")


def check_blank_pages(doc, result):
    """Check for completely blank pages"""
    blank_pages = []
    for i in range(len(doc)):
        page = doc[i]
        text = page.get_text().strip()
        # Also check for images
        images = page.get_images()
        drawings = page.get_drawings()

        if not text and not images and not drawings:
            blank_pages.append(i + 1)

    if blank_pages:
        result.error("Blank pages", f"Found blank pages: {blank_pages}")
    else:
        result.ok("No blank pages ✓")


def check_colors(doc, result):
    """Analyze colors used in the document (informational only, no pass/fail)"""
    colors = set()

    for page_num in range(len(doc)):
        page = doc[page_num]
        text_dict = page.get_text("dict")

        for block in text_dict.get("blocks", []):
            if block.get("type") != 0:
                continue
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    color = span.get("color", 0)
                    if color != 0:  # Exclude pure black
                        r = (color >> 16) & 0xFF
                        g = (color >> 8) & 0xFF
                        b = color & 0xFF
                        hex_color = f"#{r:02x}{g:02x}{b:02x}"
                        colors.add(hex_color)

        # Check drawing colors
        drawings = page.get_drawings()
        for d in drawings:
            if d.get("color"):
                c = d["color"]
                if isinstance(c, (tuple, list)) and len(c) >= 3:
                    hex_color = f"#{int(c[0]*255):02x}{int(c[1]*255):02x}{int(c[2]*255):02x}"
                    colors.add(hex_color)
            if d.get("fill"):
                c = d["fill"]
                if isinstance(c, (tuple, list)) and len(c) >= 3:
                    hex_color = f"#{int(c[0]*255):02x}{int(c[1]*255):02x}{int(c[2]*255):02x}"
                    colors.add(hex_color)

    # Filter out near-black/white/gray colors
    distinct_colors = []
    for c in colors:
        r = int(c[1:3], 16)
        g = int(c[3:5], 16)
        b = int(c[5:7], 16)
        max_diff = max(abs(r-g), abs(g-b), abs(r-b))
        if max_diff > 20:
            distinct_colors.append(c)

    result.add_info(f"Total text colors: {len(colors)} (chromatic: {len(distinct_colors)})")

    if distinct_colors:
        result.add_info(f"Chromatic colors: {', '.join(sorted(distinct_colors)[:10])}")


def check_page_size_consistency(doc, result):
    """Check whether all page sizes are consistent"""
    if len(doc) < 2:
        result.ok("Single-page document, size consistent ✓")
        return

    sizes = set()
    for i in range(len(doc)):
        page = doc[i]
        w = round(page.rect.width, 1)
        h = round(page.rect.height, 1)
        sizes.add((w, h))

    if len(sizes) > 1:
        result.warn("Page size", f"Inconsistent page sizes: {sizes}")
    else:
        size = list(sizes)[0]
        # Convert to mm
        w_mm = size[0] * 25.4 / 72
        h_mm = size[1] * 25.4 / 72
        result.add_info(f"Page size: {w_mm:.0f}mm × {h_mm:.0f}mm ({len(doc)} pages)")
        result.ok("Page size consistent ✓")


def check_text_overflow(doc, result):
    """Check whether text overflows page boundaries"""
    overflow_pages = []

    for i in range(len(doc)):
        page = doc[i]
        rect = page.rect
        blocks = page.get_text("blocks")

        for b in blocks:
            # b = (x0, y0, x1, y1, text, block_no, block_type)
            if b[2] > rect.width + 2 or b[3] > rect.height + 2:  # 2px tolerance
                overflow_pages.append(i + 1)
                break
            if b[0] < -2 or b[1] < -2:
                overflow_pages.append(i + 1)
                break

    if overflow_pages:
        result.warn("Content overflow", f"Pages {overflow_pages} may have content exceeding page boundaries")
    else:
        result.ok("No content overflow ✓")


def check_content_fill_ratio(doc, result):
    """Check content fill ratio per page — warns when content is crammed at top leaving large void below.

    Rules:
    - Skip single-page documents (may be intentional design)
    - Skip page 1 (usually cover with intentional whitespace)
    - Middle pages: warn if fill ratio < 40%
    - Last page: warn if fill ratio < 25% (naturally has less content)
    """
    if len(doc) < 2:
        result.ok("Single-page document, skipping content fill ratio check ✓")
        return

    low_fill_pages = []

    for i in range(len(doc)):
        page = doc[i]
        page_rect = page.rect
        page_height = page_rect.height

        # Skip page 1 (cover)
        if i == 0:
            continue

        blocks = page.get_text("blocks")
        images = page.get_images()
        drawings = page.get_drawings()

        if not blocks and not images and not drawings:
            continue  # Blank page check handles this

        # Calculate content bbox
        max_y = 0
        for b in blocks:
            if b[4].strip():
                max_y = max(max_y, b[3])

        # Include images in bbox
        for img in images:
            try:
                img_rects = page.get_image_rects(img[0])
                for r in img_rects:
                    max_y = max(max_y, r.y1)
            except Exception:
                pass

        if max_y == 0:
            continue

        fill_ratio = max_y / page_height
        is_last = (i == len(doc) - 1)
        threshold = 0.25 if is_last else 0.40

        if fill_ratio < threshold:
            low_fill_pages.append((i + 1, fill_ratio, threshold))

    if low_fill_pages:
        for pg, ratio, thresh in low_fill_pages:
            result.warn(
                "Content fill ratio",
                f"Page {pg} content only fills {ratio:.0%} of page height "
                f"(threshold: {thresh:.0%}). Content may be crammed at the top "
                f"with a large blank area below."
            )
    else:
        result.ok("Content fill ratio adequate on all pages ✓")


def check_cover_bleed(doc, result, poster=False):
    """Check if the cover page (page 1) fills the entire page area (full-bleed).

    A properly designed cover should have background color/graphics extending
    to the page edges. If the content bbox has significant margins on all sides,
    the cover likely wasn't rendered full-bleed (e.g. ReportLab with default margins).

    For poster mode: checks ALL pages (not just the cover) since every page of a
    seamlessly-paginated poster should have consistent background fill.

    Strategy: combine bounding boxes of drawings (rects, paths), images, and colored
    backgrounds. If the union bbox leaves > 5% margin on any side, warn.
    """
    if not poster and len(doc) < 2:
        # Single page doc (non-poster) — not necessarily a cover scenario
        return

    pages_to_check = range(len(doc)) if poster else [0]

    for page_idx in pages_to_check:
        page = doc[page_idx]
        page_rect = page.rect
        pw, ph = page_rect.width, page_rect.height

        # Collect all content bounding boxes
        min_x, min_y = pw, ph
        max_x, max_y = 0.0, 0.0
        has_content = False

        # 1. Drawings (vector paths, rectangles — typical for colored backgrounds)
        for d in page.get_drawings():
            r = d.get("rect")
            if r:
                min_x = min(min_x, r.x0)
                min_y = min(min_y, r.y0)
                max_x = max(max_x, r.x1)
                max_y = max(max_y, r.y1)
                has_content = True

        # 2. Images
        for img in page.get_images():
            try:
                for r in page.get_image_rects(img[0]):
                    min_x = min(min_x, r.x0)
                    min_y = min(min_y, r.y0)
                    max_x = max(max_x, r.x1)
                    max_y = max(max_y, r.y1)
                    has_content = True
            except Exception:
                pass

        page_label = f"Page {page_idx + 1}" if poster else "Cover page (p1)"

        if not has_content:
            blocks = page.get_text("blocks")
            if blocks:
                result.warn(
                    f"{page_label} not full-bleed",
                    f"{page_label} has no background graphics (no filled rectangles or images). "
                    "A proper cover/poster page should have a full-page background color or image "
                    "extending to all edges."
                )
            continue

        # Calculate margin ratios (how far content is from page edges)
        margin_left = max(0, min_x) / pw
        margin_top = max(0, min_y) / ph
        margin_right = max(0, pw - max_x) / pw
        margin_bottom = max(0, ph - max_y) / ph

        threshold = 0.05
        margins_ok = (margin_left <= threshold and margin_top <= threshold and
                      margin_right <= threshold and margin_bottom <= threshold)

        if margins_ok:
            result.ok(f"{page_label} content extends to page edges (full-bleed) ✓")
        else:
            sides = []
            if margin_left > threshold:
                sides.append(f"left {margin_left:.0%}")
            if margin_top > threshold:
                sides.append(f"top {margin_top:.0%}")
            if margin_right > threshold:
                sides.append(f"right {margin_right:.0%}")
            if margin_bottom > threshold:
                sides.append(f"bottom {margin_bottom:.0%}")
            result.warn(
                f"{page_label} not full-bleed",
                f"{page_label} has visible margins: {', '.join(sides)}. "
                f"Background/graphics should extend to page edges."
            )


def check_margin_symmetry(doc, result, skip_cover=False):
    """Check left/right margin symmetry using text block bounds."""
    warn_pages = []

    for page_num in range(len(doc)):
        if skip_cover and page_num == 0:
            continue

        page = doc[page_num]
        blocks = page.get_text("blocks")
        text_blocks = [b for b in blocks if b[4].strip()]

        if len(text_blocks) < 3:
            continue  # Skip decorative/cover-like pages

        left_margin = min(b[0] for b in text_blocks)
        right_margin = page.rect.width - max(b[2] for b in text_blocks)
        diff = abs(left_margin - right_margin)

        if diff > page.rect.width * 0.05:
            warn_pages.append((page_num + 1, left_margin, right_margin, diff))

    if warn_pages:
        for pg, left, right, diff in warn_pages:
            result.warn(
                "Margin symmetry",
                f"Page {pg} left/right margins differ by {diff:.0f}pt "
                f"(L {left:.0f}pt, R {right:.0f}pt)"
            )
    else:
        result.ok("Left/right margins appear symmetric \u2713")


def check_table_centering(doc, result):
    """Check if detected table regions are centered."""
    def _bbox_intersects(a, b, tol=6):
        return not (a[2] < b[0] - tol or a[0] > b[2] + tol or
                    a[3] < b[1] - tol or a[1] > b[3] + tol)

    def _rect_tuple(r):
        if hasattr(r, "x0"):
            return (r.x0, r.y0, r.x1, r.y1)
        return (r[0], r[1], r[2], r[3])

    any_tables = False

    for page_num in range(len(doc)):
        page = doc[page_num]
        drawings = page.get_drawings()
        segments = []

        for d in drawings:
            for item in d.get("items", []):
                if not item:
                    continue
                op = item[0]
                if op == "l" and len(item) >= 3:
                    p0, p1 = item[1], item[2]
                    segments.append((p0[0], p0[1], p1[0], p1[1]))
                elif op == "re" and len(item) >= 2:
                    x0, y0, x1, y1 = _rect_tuple(item[1])
                    segments.extend([
                        (x0, y0, x1, y0),
                        (x0, y1, x1, y1),
                        (x0, y0, x0, y1),
                        (x1, y0, x1, y1),
                    ])

        if not segments:
            continue

        cluster_list = []
        for x0, y0, x1, y1 in segments:
            min_x, max_x = min(x0, x1), max(x0, x1)
            min_y, max_y = min(y0, y1), max(y0, y1)
            bbox = (min_x, min_y, max_x, max_y)
            is_h = abs(y0 - y1) < 1 and (max_x - min_x) > 20
            is_v = abs(x0 - x1) < 1 and (max_y - min_y) > 20
            if not is_h and not is_v:
                continue

            placed = False
            for cl in cluster_list:
                if _bbox_intersects(bbox, cl["bbox"]):
                    cl["segments"].append((x0, y0, x1, y1, is_h, is_v))
                    cl["bbox"] = (
                        min(cl["bbox"][0], bbox[0]),
                        min(cl["bbox"][1], bbox[1]),
                        max(cl["bbox"][2], bbox[2]),
                        max(cl["bbox"][3], bbox[3]),
                    )
                    if is_h:
                        cl["h"] += 1
                    if is_v:
                        cl["v"] += 1
                    placed = True
                    break
            if not placed:
                cluster_list.append({
                    "bbox": bbox,
                    "segments": [(x0, y0, x1, y1, is_h, is_v)],
                    "h": 1 if is_h else 0,
                    "v": 1 if is_v else 0,
                })

        for cl in cluster_list:
            if cl["h"] < 2 or cl["v"] < 2:
                continue
            any_tables = True
            bbox = cl["bbox"]
            page_width = page.rect.width
            left_margin = bbox[0]
            right_margin = page_width - bbox[2]
            if abs(left_margin - right_margin) > page_width * 0.05:
                result.warn(
                    "Table centering",
                    f"Page {page_num + 1}: Table not centered "
                    f"(L {left_margin:.0f}pt, R {right_margin:.0f}pt)"
                )

    if any_tables:
        result.ok("Table centering check complete \u2713")


def check_font_embedding(doc, result):
    """Check font embedding status using PyMuPDF font list."""
    fonts_used = set()
    non_embedded = set()

    for page_num in range(len(doc)):
        page = doc[page_num]
        for font in page.get_fonts():
            basefont = font[3] if len(font) > 3 else "unknown"
            ext = font[1] if len(font) > 1 else ""
            fonts_used.add(basefont)
            if not ext:
                non_embedded.add(basefont)

    if fonts_used:
        result.add_info(f"Fonts used: {', '.join(sorted(fonts_used))}")
    else:
        result.add_info("Fonts used: (none detected)")

    if non_embedded:
        for basefont in sorted(non_embedded):
            result.warn(
                "Font embedding",
                f"Font {basefont} is not embedded. May display differently on other systems."
            )
    else:
        result.ok("All fonts are embedded \u2713")


def check_helvetica_in_cjk(doc, result):
    """Detect Helvetica rendering visible text in documents containing CJK text.

    Helvetica is a Latin-only built-in PDF font. When it appears rendering
    actual text content in a CJK document, it almost always means a raw string
    was passed to a ReportLab Table or flowable without wrapping it in
    Paragraph() with a CJK font. The CJK characters rendered via Helvetica
    become garbled (fall back to ZapfDingbats symbols).

    We only check Helvetica (not ZapfDingbats) because ZapfDingbats is
    legitimately used for bullet symbols in list items.

    We check actual rendered text spans (not just font presence in font list)
    because ReportLab internally registers Helvetica on every page even when
    only CJK fonts are used in visible content.
    """
    has_cjk = False
    helvetica_pages = []

    for page_num in range(len(doc)):
        page = doc[page_num]
        text = page.get_text("text") or ""

        # Check if document contains CJK characters
        if not has_cjk:
            for ch in text:
                if '\u4e00' <= ch <= '\u9fff' or '\u3400' <= ch <= '\u4dbf':
                    has_cjk = True
                    break

        # Check if Helvetica is actually used to render visible text on this page
        blocks = page.get_text("dict", sort=True).get("blocks", [])
        found_on_page = False
        for block in blocks:
            if found_on_page:
                break
            for line in block.get("lines", []):
                if found_on_page:
                    break
                for span in line.get("spans", []):
                    font = span.get("font", "")
                    txt = span.get("text", "").strip()
                    if "Helvetica" in font and len(txt) > 0:
                        helvetica_pages.append(page_num + 1)
                        found_on_page = True
                        break

    if has_cjk and helvetica_pages:
        pages_str = ', '.join(str(p) for p in helvetica_pages[:5])
        if len(helvetica_pages) > 5:
            pages_str += f' ...and {len(helvetica_pages) - 5} more'
        result.warn(
            "Helvetica in CJK document",
            f"Helvetica font detected rendering text on page(s) {pages_str} in a CJK document. "
            f"This usually means a raw string was passed to a ReportLab Table or flowable "
            f"without wrapping in Paragraph(text, style) with a CJK-capable font. "
            f"CJK characters rendered via Helvetica will appear as garbled symbols."
        )


def check_metadata(doc, result):
    """Check PDF metadata presence for title, author, creator."""
    meta = doc.metadata or {}

    def _missing(v):
        if v is None:
            return True
        if not str(v).strip():
            return True
        return False

    title = meta.get("title")
    author = meta.get("author")
    creator = meta.get("creator")

    if _missing(title) or str(title).strip().lower() in ("untitled", "(anonymous)"):
        result.warn("Metadata", "Missing/invalid title metadata")
    else:
        result.ok("Title metadata present \u2713")

    if _missing(author):
        result.warn("Metadata", "Missing author metadata")
    else:
        result.ok("Author metadata present \u2713")

    if _missing(creator):
        result.warn("Metadata", "Missing creator metadata")
    else:
        result.ok("Creator metadata present \u2713")


def check_toc_without_cover(doc, result):
    """Detect TOC on page 1 without a preceding cover page.

    If the first page contains Table of Contents / 目录, it means the document
    has a TOC but no cover page. This is a structural issue — documents with
    TOC should have: Cover (p1) → TOC (p2) → Content (p3+).
    """
    if len(doc) < 2:
        # Single-page docs don't need TOC/cover checks
        return

    page1 = doc[0]
    text = page1.get_text("text", sort=True).strip()

    # Normalize for matching
    text_lower = text.lower()
    first_300 = text_lower[:300]

    toc_keywords = [
        "table of contents", "contents",
        "目录", "目 录",
    ]

    has_toc = any(kw in first_300 for kw in toc_keywords)

    if has_toc:
        result.warn(
            "TOC without cover",
            "Page 1 appears to be a Table of Contents with no preceding cover page. "
            "Documents with TOC should have: Cover (p1) → TOC (p2) → Content (p3+)."
        )


def check_formula_overflow(doc, result):
    """Detect likely formula overflow past right content margin."""
    math_re = re.compile(r"[=+\-*/<>\u2264\u2265\u2211\u222b\u221a\u03c0\u00b5\u221e\u2202\u2206\u2248\u2260\u00b1\u00d7\u00f7]")

    for page_num in range(len(doc)):
        page = doc[page_num]
        blocks = page.get_text("blocks")
        text_blocks = [b for b in blocks if b[4].strip()]

        if len(text_blocks) < 3:
            continue

        right_edges = sorted(b[2] for b in text_blocks)
        mid = len(right_edges) // 2
        content_right = right_edges[mid] if right_edges else 0

        for b in text_blocks:
            x0, x1, text = b[0], b[2], b[4]
            if x1 <= content_right + 10:
                continue

            is_single_line = "\n" not in text.strip()
            is_wide = (x1 - x0) > page.rect.width * 0.5
            has_math = bool(math_re.search(text))

            if (is_single_line and is_wide) or has_math:
                delta = x1 - content_right
                result.warn(
                    "Formula overflow",
                    f"Page {page_num + 1}: Content extends {delta:.0f}pt beyond right content margin "
                    "(possible formula overflow)"
                )
                break


# ============================================================
# Main
# ============================================================

def run_qa(pdf_path, poster=False, skip_cover=False, check_tables=True, check_formulas=False):
    result = QAResult()

    if not os.path.exists(pdf_path):
        result.error("File", f"File not found: {pdf_path}")
        return result

    doc = pymupdf.open(pdf_path)

    result.add_info(f"File: {os.path.basename(pdf_path)}")
    result.add_info(f"Size: {os.path.getsize(pdf_path) / 1024:.1f} KB")
    if poster:
        result.add_info("Mode: poster (creative)")

    # Run all checks
    check_metadata(doc, result)
    check_page_size_consistency(doc, result)
    check_blank_pages(doc, result)
    check_punctuation(doc, result)
    check_colors(doc, result)
    check_font_embedding(doc, result)
    check_helvetica_in_cjk(doc, result)
    check_text_overflow(doc, result)
    if not poster:
        # Content fill ratio is not meaningful for posters — the last page
        # of a seamlessly-paginated poster naturally has less content.
        check_content_fill_ratio(doc, result)
    check_cover_bleed(doc, result, poster=poster)
    check_margin_symmetry(doc, result, skip_cover=skip_cover)
    if check_tables:
        check_table_centering(doc, result)
    if check_formulas:
        check_formula_overflow(doc, result)
    if not poster:
        check_toc_without_cover(doc, result)

    doc.close()
    return result


def format_report(result):
    lines = []
    lines.append("=" * 56)
    lines.append("  PDF Quality Assurance Report")
    lines.append("=" * 56)

    # Info
    if result.info:
        lines.append("")
        lines.append("ℹ️  Info:")
        for msg in result.info:
            lines.append(f"   {msg}")

    # Passes
    if result.passes:
        lines.append("")
        lines.append(f"✅ Passed ({len(result.passes)}):")
        for msg in result.passes:
            lines.append(f"   {msg}")

    # Issues
    errors = [(s, c, m) for s, c, m in result.issues if s == 'ERROR']
    warns = [(s, c, m) for s, c, m in result.issues if s == 'WARN']

    if errors:
        lines.append("")
        lines.append(f"❌ Errors ({len(errors)}):")
        for _, cat, msg in errors:
            lines.append(f"   [{cat}] {msg}")

    if warns:
        lines.append("")
        lines.append(f"⚠️  Warnings ({len(warns)}):")
        for _, cat, msg in warns:
            lines.append(f"   [{cat}] {msg}")

    # Summary
    lines.append("")
    lines.append("-" * 56)
    total_issues = len(result.issues)
    if total_issues == 0:
        lines.append("🎉 PASS — All checks passed!")
    elif errors:
        lines.append(f"💀 FAIL — {len(errors)} error(s), {len(warns)} warning(s)")
    else:
        lines.append(f"⚠️  WARN — {len(warns)} warning(s), optimization recommended")
    lines.append("-" * 56)

    return "\n".join(lines)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python3 pdf_qa.py <pdf_path>")
        print("       python3 pdf_qa.py *.pdf  (batch check)")
        print("Options:")
        print("  --poster      Poster mode (creative)")
        print("  --skip-cover  Skip page 1 margin symmetry check")
        print("  --no-tables   Disable table centering check")
        print("  --formulas    Enable formula overflow check")
        sys.exit(1)

    import glob
    files = []
    poster = False
    skip_cover = False
    check_tables = True
    check_formulas = False
    args = sys.argv[1:]
    if '--poster' in args:
        poster = True
        args.remove('--poster')
    if '--skip-cover' in args:
        skip_cover = True
        args.remove('--skip-cover')
    if '--no-tables' in args:
        check_tables = False
        args.remove('--no-tables')
    if '--formulas' in args:
        check_formulas = True
        args.remove('--formulas')
    for arg in args:
        files.extend(glob.glob(arg))

    if not files:
        print(f"File not found: {args}")
        sys.exit(1)

    for pdf_path in files:
        result = run_qa(
            pdf_path,
            poster=poster,
            skip_cover=skip_cover,
            check_tables=check_tables,
            check_formulas=check_formulas
        )
        print(format_report(result))
        if len(files) > 1:
            print("\n")