#!/usr/bin/env python3
"""
PDF Quality Assurance Checker
=============================
Automatically detects common typesetting issues in PDFs.

Usage: python3 pdf_qa.py <pdf_path>

Checks:
1. Page size consistency across all pages
2. Blank page detection
3. CJK punctuation placement (line-start/end forbidden punctuation)
4. Color analysis (informational only — counts and lists colors)
5. Font embedding check (warns on non-embedded fonts)
6. PDF metadata check (title/author/creator)
7. Content overflow detection (text exceeding page boundaries)
8. Content fill ratio per page (multi-page docs; warns if < 40%, or < 25% on the last page)
9. Cover/poster full-bleed check (background extends to page edges)
10. Margin symmetry check (left/right text margins)
11. Table centering check (if detected)
12. Formula overflow check (optional)
13. Helvetica-in-CJK check (Helvetica rendering text in a document that contains CJK)
14. TOC-without-cover check (table of contents found on page 1)
"""
|
||
|
||
import sys
|
||
import os
|
||
import re
|
||
import json
|
||
from collections import Counter
|
||
|
||
try:
|
||
import pymupdf # PyMuPDF
|
||
except ImportError:
|
||
import fitz as pymupdf
|
||
|
||
# ============================================================
|
||
# Config
|
||
# ============================================================
|
||
|
||
# CJK punctuation forbidden at line start.
# Only closing marks and trailing punctuation belong here: they must stay
# attached to the text that precedes them.  Opening quotes are deliberately
# NOT listed, because a line may legitimately begin with an opening quote.
LINE_START_FORBIDDEN = set(
    "。、,;:!?)】〛〉」』"
    "\u2019"  # ’ right (closing) curly single quote
    "\u201d"  # ” right (closing) curly double quote
    "\u2026"  # … ellipsis
    "\u2014"  # — em dash
    "\uff5e"  # ~ fullwidth tilde
    "\u00b7"  # · middle dot
)
|
||
|
||
# CJK punctuation forbidden at line end.
# Only opening marks belong here: they must stay attached to the text that
# follows them.  Closing quotes are deliberately NOT listed, because a line
# may legitimately end with a closing quote or an apostrophe.
LINE_END_FORBIDDEN = set(
    "(【《〈「"
    "\u2018"  # ‘ left (opening) curly single quote
    "\u201c"  # “ left (opening) curly double quote
)
|
||
|
||
# Minimum fill ratio for last page (DISABLED — caused false positives)
|
||
# LAST_PAGE_MIN_FILL = 0.40
|
||
|
||
# Maximum allowed color count — REMOVED (color count is now info-only)
|
||
# MAX_COLORS = 8
|
||
|
||
# ============================================================
|
||
# Checks
|
||
# ============================================================
|
||
|
||
class QAResult:
    """Accumulates the findings of one QA run.

    Attributes are plain lists so callers can inspect them directly:
    ``issues`` holds ``(severity, category, message)`` tuples where severity
    is ``'ERROR'`` or ``'WARN'``; ``passes`` holds messages of checks that
    succeeded; ``info`` holds purely informational messages.
    """

    def __init__(self):
        self.issues = []
        self.passes = []
        self.info = []

    def _record_issue(self, severity, cat, msg):
        """Append one issue tuple with the given severity tag."""
        self.issues.append((severity, cat, msg))

    def error(self, cat, msg):
        """Record a blocking problem under category *cat*."""
        self._record_issue('ERROR', cat, msg)

    def warn(self, cat, msg):
        """Record a non-blocking problem under category *cat*."""
        self._record_issue('WARN', cat, msg)

    def ok(self, msg):
        """Record a check that passed."""
        self.passes.append(msg)

    def add_info(self, msg):
        """Record an informational note (no pass/fail meaning)."""
        self.info.append(msg)
|
||
|
||
|
||
def check_last_page_fill(doc, result, min_fill=0.40):
    """Check how much of the last page's height is occupied by text.

    The fill ratio is the vertical span between the top of the highest and
    the bottom of the lowest non-empty text block, divided by page height.

    Args:
        doc: Opened PDF document (supports len() and negative indexing).
        result: QAResult-like collector receiving the findings.
        min_fill: Ratio below which the page is reported as sparse (warning).
            Ratios below 0.25 are always reported as errors.  The default
            mirrors the module's disabled ``LAST_PAGE_MIN_FILL`` setting; it
            is a parameter so the function no longer depends on a constant
            that is commented out in the config section.
    """
    page_count = len(doc)
    if page_count < 2:
        result.ok("Single-page document, no last-page blank check needed")
        return

    last_page = doc[-1]
    page_height = last_page.rect.height

    blocks = last_page.get_text("blocks")
    if not blocks:
        result.error("Last page blank", f"Page {page_count} (last page) has no content at all!")
        return

    # Only blocks whose text payload is non-blank count as content.
    text_blocks = [b for b in blocks if b[4].strip()]

    # Vertical extent of the text; clamped the same way the running
    # min/max accumulation would clamp it (top <= page height, bottom >= 0).
    min_y = min([page_height] + [b[1] for b in text_blocks])
    max_y = max([0] + [b[3] for b in text_blocks])

    if max_y == 0:
        result.error("Last page blank", f"Page {page_count} (last page) has no valid text content")
        return

    content_height = max_y - min_y
    fill_ratio = content_height / page_height

    result.add_info(f"Last page fill ratio: {fill_ratio:.0%} (content height {content_height:.0f}px / page height {page_height:.0f}px)")

    if fill_ratio < 0.25:
        result.error("Last page blank", f"Last page fill ratio only {fill_ratio:.0%}, mostly blank! Consider compressing preceding page spacing or trimming content")
    elif fill_ratio < min_fill:
        result.warn("Last page blank", f"Last page fill ratio {fill_ratio:.0%}, somewhat sparse — optimization recommended")
    else:
        result.ok(f"Last page fill ratio {fill_ratio:.0%} ✓")
|
||
|
||
|
||
def check_punctuation(doc, result):
    """Flag text lines that start or end with punctuation forbidden there.

    Every line of every text block is reassembled from its spans, stripped,
    and its first/last character compared against LINE_START_FORBIDDEN and
    LINE_END_FORBIDDEN.  At most ten violations are reported individually;
    any remainder is summarised in one extra warning.

    Args:
        doc: Opened PDF document.
        result: QAResult-like collector receiving ok()/warn() entries.
    """
    report_limit = 10
    found = []  # (1-based page number, description)

    for page_no, page in enumerate(doc, start=1):
        for block in page.get_text("dict").get("blocks", []):
            # Type 0 marks text blocks; anything else is skipped.
            if block.get("type") != 0:
                continue

            for line in block.get("lines", []):
                content = "".join(
                    span.get("text", "") for span in line.get("spans", [])
                ).strip()
                if not content:
                    continue

                head, tail = content[0], content[-1]
                if head in LINE_START_FORBIDDEN:
                    found.append((page_no, f"Forbidden line-start punctuation '{head}': ...{content[:30]}"))
                if tail in LINE_END_FORBIDDEN:
                    found.append((page_no, f"Forbidden line-end punctuation '{tail}': {content[-30:]}..."))

    if not found:
        result.ok("Punctuation placement check passed ✓")
        return

    for page_no, desc in found[:report_limit]:
        result.warn("Punctuation rules", f"Page {page_no} - {desc}")

    if len(found) > report_limit:
        result.warn("Punctuation rules", f"...{len(found) - report_limit} more violations")
|
||
|
||
|
||
def check_blank_pages(doc, result):
    """Report pages that carry no text, no images and no vector drawings.

    Args:
        doc: Opened PDF document.
        result: QAResult-like collector; receives one error listing the
            1-based numbers of all blank pages, or a single ok() entry.
    """
    empty_pages = []

    for page_no, page in enumerate(doc, start=1):
        has_text = bool(page.get_text().strip())
        has_images = bool(page.get_images())
        has_drawings = bool(page.get_drawings())

        # A page only counts as blank when all three kinds of content are absent.
        if not (has_text or has_images or has_drawings):
            empty_pages.append(page_no)

    if empty_pages:
        result.error("Blank pages", f"Found blank pages: {empty_pages}")
        return

    result.ok("No blank pages ✓")
|
||
|
||
|
||
def check_colors(doc, result):
    """Collect the colors used by text spans and vector drawings (info only).

    Text span colors are read as packed 24-bit integers; pure black (0) text
    is ignored.  Drawing stroke (``color``) and fill (``fill``) values are
    read as sequences of at least three floats and converted to ``#rrggbb``.
    Colors whose channels differ by more than 20 are counted as chromatic.
    Nothing is passed or failed; two info lines are recorded at most.

    Args:
        doc: Opened PDF document.
        result: QAResult-like collector receiving add_info() entries.
    """
    def _float_rgb_to_hex(rgb):
        """Return '#rrggbb' for a float RGB sequence, or None if it is not one."""
        if not isinstance(rgb, (tuple, list)) or len(rgb) < 3:
            return None
        # Round (not truncate) so 0.5 maps to 0x80 and 0.999 to 0xff, and
        # clamp so out-of-range components cannot yield a malformed string.
        r, g, b = (min(255, max(0, round(v * 255))) for v in rgb[:3])
        return f"#{r:02x}{g:02x}{b:02x}"

    def _is_chromatic(hex_color):
        """True when the channels differ enough to not be black/white/gray."""
        r = int(hex_color[1:3], 16)
        g = int(hex_color[3:5], 16)
        b = int(hex_color[5:7], 16)
        return max(abs(r - g), abs(g - b), abs(r - b)) > 20

    colors = set()

    for page in doc:
        for block in page.get_text("dict").get("blocks", []):
            if block.get("type") != 0:  # text blocks only
                continue
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    packed = span.get("color", 0)
                    if packed != 0:  # exclude pure black text
                        colors.add(f"#{packed & 0xFFFFFF:06x}")

        # Stroke and fill colors of vector drawings.
        for drawing in page.get_drawings():
            for key in ("color", "fill"):
                hex_color = _float_rgb_to_hex(drawing.get(key))
                if hex_color is not None:
                    colors.add(hex_color)

    distinct_colors = [c for c in colors if _is_chromatic(c)]

    result.add_info(f"Total text colors: {len(colors)} (chromatic: {len(distinct_colors)})")

    if distinct_colors:
        result.add_info(f"Chromatic colors: {', '.join(sorted(distinct_colors)[:10])}")
|
||
|
||
|
||
def check_page_size_consistency(doc, result):
    """Verify that every page has the same width and height.

    Dimensions are compared after rounding to one decimal.  When all pages
    agree, the common size is also reported in millimetres (the conversion
    treats the page dimensions as 1/72-inch units).

    Args:
        doc: Opened PDF document.
        result: QAResult-like collector receiving ok()/warn()/add_info().
    """
    page_count = len(doc)
    if page_count < 2:
        result.ok("Single-page document, size consistent ✓")
        return

    sizes = {
        (round(page.rect.width, 1), round(page.rect.height, 1))
        for page in doc
    }

    if len(sizes) > 1:
        result.warn("Page size", f"Inconsistent page sizes: {sizes}")
        return

    width, height = next(iter(sizes))
    w_mm = width * 25.4 / 72
    h_mm = height * 25.4 / 72
    result.add_info(f"Page size: {w_mm:.0f}mm × {h_mm:.0f}mm ({page_count} pages)")
    result.ok("Page size consistent ✓")
|
||
|
||
|
||
def check_text_overflow(doc, result):
    """Warn about pages on which a block's bounding box leaves the page.

    A block overflows when its right/bottom edge lies more than 2 units past
    the page width/height, or its left/top edge lies more than 2 units
    before zero.  Each affected page is listed once.

    Args:
        doc: Opened PDF document.
        result: QAResult-like collector receiving ok()/warn() entries.
    """
    tolerance = 2

    def _sticks_out(block, bounds):
        """True if the block tuple (x0, y0, x1, y1, ...) exceeds the page bounds."""
        x0, y0, x1, y1 = block[0], block[1], block[2], block[3]
        return (
            x1 > bounds.width + tolerance
            or y1 > bounds.height + tolerance
            or x0 < -tolerance
            or y0 < -tolerance
        )

    overflow_pages = []
    for page_no, page in enumerate(doc, start=1):
        bounds = page.rect
        if any(_sticks_out(block, bounds) for block in page.get_text("blocks")):
            overflow_pages.append(page_no)

    if overflow_pages:
        result.warn("Content overflow", f"Pages {overflow_pages} may have content exceeding page boundaries")
    else:
        result.ok("No content overflow ✓")
|
||
|
||
|
||
def check_content_fill_ratio(doc, result):
    """Warn when a page's content stops high up, leaving a large void below.

    The fill ratio of a page is the lowest bottom edge among its non-empty
    text blocks and its placed images, divided by the page height.

    Rules:
    - Single-page documents are skipped (may be intentional design)
    - Page 1 is skipped (usually a cover with intentional whitespace)
    - Middle pages warn below 40%
    - The last page warns below 25% (it naturally has less content)
    - Pages with no blocks/images/drawings, or with no measurable bottom
      edge, are skipped

    Args:
        doc: Opened PDF document.
        result: QAResult-like collector receiving ok()/warn() entries.
    """
    total_pages = len(doc)
    if total_pages < 2:
        result.ok("Single-page document, skipping content fill ratio check ✓")
        return

    last_index = total_pages - 1
    sparse = []  # (1-based page number, ratio, threshold applied)

    for index, page in enumerate(doc):
        if index == 0:  # cover page
            continue

        page_height = page.rect.height
        blocks = page.get_text("blocks")
        images = page.get_images()
        drawings = page.get_drawings()

        if not (blocks or images or drawings):
            continue  # Blank page check handles this

        # Bottom edges of everything that counts as content; the leading 0
        # is the "nothing found" baseline.
        bottoms = [0]
        bottoms.extend(b[3] for b in blocks if b[4].strip())
        for img in images:
            try:
                bottoms.extend(r.y1 for r in page.get_image_rects(img[0]))
            except Exception:
                # Best effort: an image whose placement cannot be resolved
                # simply does not contribute to the extent.
                pass

        lowest = max(bottoms)
        if lowest == 0:
            continue

        ratio = lowest / page_height
        limit = 0.25 if index == last_index else 0.40
        if ratio < limit:
            sparse.append((index + 1, ratio, limit))

    if not sparse:
        result.ok("Content fill ratio adequate on all pages ✓")
        return

    for pg, ratio, thresh in sparse:
        result.warn(
            "Content fill ratio",
            f"Page {pg} content only fills {ratio:.0%} of page height "
            f"(threshold: {thresh:.0%}). Content may be crammed at the top "
            f"with a large blank area below."
        )
|
||
|
||
|
||
def check_cover_bleed(doc, result, poster=False):
    """Check that the cover (or every poster page) is filled edge to edge.

    The bounding boxes of all vector drawings and all placed images on a page
    are merged into one box.  If that box stops more than 5% of the page
    width/height short of any edge, a warning naming the offending sides is
    recorded; otherwise the page is reported as full-bleed.  A page that has
    text blocks but no drawings or images at all is warned about as having no
    background graphics.

    Args:
        doc: Opened PDF document.
        result: QAResult-like collector receiving ok()/warn() entries.
        poster: When True, every page is checked (including single-page
            documents).  When False, only page 1 of documents with at least
            two pages is checked.
    """
    if len(doc) < 2 and not poster:
        # Single page doc (non-poster) — not necessarily a cover scenario
        return

    edge_tolerance = 0.05

    def _absorb(extent, r):
        """Grow extent [x0, y0, x1, y1, seen] so that it also covers rect r."""
        extent[0] = min(extent[0], r.x0)
        extent[1] = min(extent[1], r.y0)
        extent[2] = max(extent[2], r.x1)
        extent[3] = max(extent[3], r.y1)
        extent[4] = True

    page_indices = range(len(doc)) if poster else [0]

    for page_idx in page_indices:
        page = doc[page_idx]
        pw, ph = page.rect.width, page.rect.height

        # Starts "inside out" so the first absorbed rect defines the extent;
        # the last slot records whether anything was absorbed at all.
        extent = [pw, ph, 0.0, 0.0, False]

        # 1. Drawings (vector paths, rectangles — typical for colored backgrounds)
        for drawing in page.get_drawings():
            drawing_rect = drawing.get("rect")
            if drawing_rect:
                _absorb(extent, drawing_rect)

        # 2. Images
        for img in page.get_images():
            try:
                for image_rect in page.get_image_rects(img[0]):
                    _absorb(extent, image_rect)
            except Exception:
                # Best effort: skip images whose placement cannot be resolved.
                pass

        page_label = f"Page {page_idx + 1}" if poster else "Cover page (p1)"

        if not extent[4]:
            if page.get_text("blocks"):
                result.warn(
                    f"{page_label} not full-bleed",
                    f"{page_label} has no background graphics (no filled rectangles or images). "
                    "A proper cover/poster page should have a full-page background color or image "
                    "extending to all edges."
                )
            continue

        left_edge, top_edge, right_edge, bottom_edge = extent[:4]

        # Gap between the merged extent and each page edge, as a fraction of
        # the page dimension (negative gaps count as zero).
        margins = [
            ("left", max(0, left_edge) / pw),
            ("top", max(0, top_edge) / ph),
            ("right", max(0, pw - right_edge) / pw),
            ("bottom", max(0, ph - bottom_edge) / ph),
        ]

        if all(gap <= edge_tolerance for _, gap in margins):
            result.ok(f"{page_label} content extends to page edges (full-bleed) ✓")
            continue

        sides = [f"{side} {gap:.0%}" for side, gap in margins if gap > edge_tolerance]
        result.warn(
            f"{page_label} not full-bleed",
            f"{page_label} has visible margins: {', '.join(sides)}. "
            f"Background/graphics should extend to page edges."
        )
|
||
|
||
|
||
def check_margin_symmetry(doc, result, skip_cover=False):
    """Compare the left and right text margins of each page.

    The left margin is the smallest x0 and the right margin is the page width
    minus the largest x1 over all non-empty text blocks.  A page is reported
    when the two differ by more than 5% of the page width.  Pages with fewer
    than three text blocks are treated as decorative and skipped.

    Args:
        doc: Opened PDF document.
        result: QAResult-like collector receiving ok()/warn() entries.
        skip_cover: When True, page 1 is not examined.
    """
    uneven = []  # (1-based page number, left margin, right margin, difference)

    for index, page in enumerate(doc):
        if skip_cover and index == 0:
            continue

        text_blocks = [b for b in page.get_text("blocks") if b[4].strip()]
        if len(text_blocks) < 3:
            continue  # Skip decorative/cover-like pages

        page_width = page.rect.width
        left = min(b[0] for b in text_blocks)
        right = page_width - max(b[2] for b in text_blocks)
        gap = abs(left - right)

        if gap > page_width * 0.05:
            uneven.append((index + 1, left, right, gap))

    if not uneven:
        result.ok("Left/right margins appear symmetric \u2713")
        return

    for pg, left, right, gap in uneven:
        result.warn(
            "Margin symmetry",
            f"Page {pg} left/right margins differ by {gap:.0f}pt "
            f"(L {left:.0f}pt, R {right:.0f}pt)"
        )
|
||
|
||
|
||
def check_table_centering(doc, result):
    """Detect ruled tables from vector drawings and check they are centered.

    Straight line items and the four edges of rectangle items are taken as
    candidate rules.  Horizontal/vertical rules longer than 20 units are
    grouped greedily: each rule joins the first existing group whose bounding
    box it touches (within 6 units), otherwise it starts a new group.  A group
    with at least two horizontal and two vertical rules is treated as a
    table, and a warning is recorded when its left and right margins differ
    by more than 5% of the page width.

    Args:
        doc: Opened PDF document.
        result: QAResult-like collector.  Receives one warning per
            off-center table and a single ok() if any table was detected.
    """
    def _coords(rect_like):
        """Return (x0, y0, x1, y1) from a rect object or a 4-sequence."""
        if hasattr(rect_like, "x0"):
            return (rect_like.x0, rect_like.y0, rect_like.x1, rect_like.y1)
        return (rect_like[0], rect_like[1], rect_like[2], rect_like[3])

    def _touches(a, b, tol=6):
        """True if boxes a and b overlap or lie within tol of each other."""
        return not (a[2] < b[0] - tol or a[0] > b[2] + tol or
                    a[3] < b[1] - tol or a[1] > b[3] + tol)

    def _collect_segments(page):
        """Flatten the page's drawing items into (x0, y0, x1, y1) segments."""
        found = []
        for drawing in page.get_drawings():
            for item in drawing.get("items", []):
                if not item:
                    continue
                kind = item[0]
                if kind == "l" and len(item) >= 3:
                    start, end = item[1], item[2]
                    found.append((start[0], start[1], end[0], end[1]))
                elif kind == "re" and len(item) >= 2:
                    x0, y0, x1, y1 = _coords(item[1])
                    # top, bottom, left, right edge — in this order, because
                    # the greedy grouping below depends on arrival order
                    found += [
                        (x0, y0, x1, y0),
                        (x0, y1, x1, y1),
                        (x0, y0, x0, y1),
                        (x1, y0, x1, y1),
                    ]
        return found

    def _group_rules(segments):
        """Greedily group axis-aligned rules; returns dicts with bbox/h/v."""
        groups = []
        for x0, y0, x1, y1 in segments:
            box = (min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1))
            horizontal = abs(y0 - y1) < 1 and (box[2] - box[0]) > 20
            vertical = abs(x0 - x1) < 1 and (box[3] - box[1]) > 20
            if not (horizontal or vertical):
                continue

            home = next((g for g in groups if _touches(box, g["bbox"])), None)
            if home is None:
                groups.append({
                    "bbox": box,
                    "h": 1 if horizontal else 0,
                    "v": 1 if vertical else 0,
                })
                continue

            current = home["bbox"]
            home["bbox"] = (
                min(current[0], box[0]),
                min(current[1], box[1]),
                max(current[2], box[2]),
                max(current[3], box[3]),
            )
            if horizontal:
                home["h"] += 1
            if vertical:
                home["v"] += 1
        return groups

    tables_seen = False

    for page_no, page in enumerate(doc, start=1):
        for group in _group_rules(_collect_segments(page)):
            if group["h"] < 2 or group["v"] < 2:
                continue  # not enough rules in both directions to be a grid

            tables_seen = True
            page_width = page.rect.width
            left_margin = group["bbox"][0]
            right_margin = page_width - group["bbox"][2]
            if abs(left_margin - right_margin) > page_width * 0.05:
                result.warn(
                    "Table centering",
                    f"Page {page_no}: Table not centered "
                    f"(L {left_margin:.0f}pt, R {right_margin:.0f}pt)"
                )

    if tables_seen:
        result.ok("Table centering check complete \u2713")
|
||
|
||
|
||
def check_font_embedding(doc, result):
    """Check font embedding status using PyMuPDF's per-page font list.

    Each entry of ``page.get_fonts()`` is read positionally: index 1 is the
    font file extension, index 2 the font type and index 3 the base font
    name.  PyMuPDF reports the extension ``"n/a"`` when no font file can be
    extracted (for example the PDF base-14 fonts), so both an empty extension
    and ``"n/a"`` are treated as "not embedded".  Type3 fonts are exempt:
    they also report ``"n/a"`` but are defined inside the PDF itself.

    Args:
        doc: Opened PDF document.
        result: QAResult-like collector.  Receives an info line listing all
            fonts, then one warning per non-embedded font or a single ok().
    """
    no_font_file = ("", "n/a")

    fonts_used = set()
    non_embedded = set()

    for page in doc:
        for font in page.get_fonts():
            basefont = font[3] if len(font) > 3 else "unknown"
            ext = font[1] if len(font) > 1 else ""
            font_type = font[2] if len(font) > 2 else ""

            fonts_used.add(basefont)
            if (not ext or ext in no_font_file) and font_type != "Type3":
                non_embedded.add(basefont)

    if fonts_used:
        result.add_info(f"Fonts used: {', '.join(sorted(fonts_used))}")
    else:
        result.add_info("Fonts used: (none detected)")

    if not non_embedded:
        result.ok("All fonts are embedded \u2713")
        return

    for basefont in sorted(non_embedded):
        result.warn(
            "Font embedding",
            f"Font {basefont} is not embedded. May display differently on other systems."
        )
|
||
|
||
|
||
def check_helvetica_in_cjk(doc, result):
    """Warn when Helvetica renders visible text in a document containing CJK.

    Helvetica is a Latin-only built-in PDF font.  Per the original author's
    notes, finding it on real text in a CJK document almost always means a
    raw string was handed to a ReportLab Table or flowable instead of a
    Paragraph() styled with a CJK font, and the CJK characters then come out
    garbled.

    ZapfDingbats is intentionally not checked because it is legitimately
    used for bullet symbols.  Rendered spans are inspected rather than the
    page font list because (again per the author's notes) ReportLab registers
    Helvetica on every page even when it is not used for visible content.

    The document counts as CJK when any page's text contains a character in
    U+4E00–U+9FFF or U+3400–U+4DBF.  Nothing is recorded when the document has
    no CJK text or no page renders text with a font whose name contains
    "Helvetica".

    Args:
        doc: Opened PDF document.
        result: QAResult-like collector receiving at most one warning.
    """
    def _is_cjk(ch):
        return '\u4e00' <= ch <= '\u9fff' or '\u3400' <= ch <= '\u4dbf'

    def _uses_helvetica(page):
        """True if any span with non-blank text is set in a Helvetica font."""
        blocks = page.get_text("dict", sort=True).get("blocks", [])
        return any(
            "Helvetica" in span.get("font", "") and len(span.get("text", "").strip()) > 0
            for block in blocks
            for line in block.get("lines", [])
            for span in line.get("spans", [])
        )

    contains_cjk = False
    offending_pages = []

    for page_no, page in enumerate(doc, start=1):
        page_text = page.get_text("text") or ""

        # Once CJK has been seen anywhere, later pages need no rescan.
        if not contains_cjk:
            contains_cjk = any(_is_cjk(ch) for ch in page_text)

        if _uses_helvetica(page):
            offending_pages.append(page_no)

    if not (contains_cjk and offending_pages):
        return

    pages_str = ', '.join(str(p) for p in offending_pages[:5])
    extra = len(offending_pages) - 5
    if extra > 0:
        pages_str += f' ...and {extra} more'

    result.warn(
        "Helvetica in CJK document",
        f"Helvetica font detected rendering text on page(s) {pages_str} in a CJK document. "
        f"This usually means a raw string was passed to a ReportLab Table or flowable "
        f"without wrapping in Paragraph(text, style) with a CJK-capable font. "
        f"CJK characters rendered via Helvetica will appear as garbled symbols."
    )
|
||
|
||
|
||
def check_metadata(doc, result):
    """Check that the PDF metadata carries a title, an author and a creator.

    A field is missing when it is None or blank after stripping.  The title
    is additionally rejected when it is one of the placeholder values
    "untitled" or "(anonymous)" (case-insensitive).

    Args:
        doc: Opened PDF document; ``doc.metadata`` may be None.
        result: QAResult-like collector.  Receives exactly three entries, in
            the order title, author, creator.
    """
    metadata = doc.metadata or {}
    placeholder_titles = ("untitled", "(anonymous)")

    def _is_blank(value):
        return value is None or not str(value).strip()

    title = metadata.get("title")
    title_ok = not _is_blank(title) and str(title).strip().lower() not in placeholder_titles
    if title_ok:
        result.ok("Title metadata present \u2713")
    else:
        result.warn("Metadata", "Missing/invalid title metadata")

    simple_fields = (
        ("author", "Missing author metadata", "Author metadata present \u2713"),
        ("creator", "Missing creator metadata", "Creator metadata present \u2713"),
    )
    for key, missing_msg, present_msg in simple_fields:
        if _is_blank(metadata.get(key)):
            result.warn("Metadata", missing_msg)
        else:
            result.ok(present_msg)
|
||
|
||
|
||
def check_toc_without_cover(doc, result):
    """Warn when page 1 looks like a table of contents (i.e. no cover page).

    The first 300 characters of page 1's stripped, lower-cased text are
    searched for a TOC keyword ("table of contents", "contents", "目录",
    "目 录").  A hit means the document opens directly with its TOC, whereas
    the expected structure is Cover (p1) → TOC (p2) → Content (p3+).

    Args:
        doc: Opened PDF document.  Documents with fewer than two pages are
            not checked.
        result: QAResult-like collector receiving at most one warning.
    """
    if len(doc) < 2:
        # Single-page docs don't need TOC/cover checks
        return

    toc_markers = (
        "table of contents", "contents",
        "目录", "目 录",
    )

    opening = doc[0].get_text("text", sort=True).strip().lower()[:300]
    if not any(marker in opening for marker in toc_markers):
        return

    result.warn(
        "TOC without cover",
        "Page 1 appears to be a Table of Contents with no preceding cover page. "
        "Documents with TOC should have: Cover (p1) → TOC (p2) → Content (p3+)."
    )
|
||
|
||
|
||
def check_formula_overflow(doc, result):
    """Warn about text blocks that stick out past the usual right text edge.

    For each page with at least three non-empty text blocks, the median
    right edge of those blocks is taken as the content margin.  The first
    block that extends more than 10 units past it and either contains a
    math-looking character or is a single line wider than half the page
    triggers one warning for that page.

    Args:
        doc: Opened PDF document.
        result: QAResult-like collector receiving at most one warning per page.
    """
    math_chars = re.compile(r"[=+\-*/<>\u2264\u2265\u2211\u222b\u221a\u03c0\u00b5\u221e\u2202\u2206\u2248\u2260\u00b1\u00d7\u00f7]")
    slack = 10

    def _looks_like_formula(block, page):
        """True for math-bearing text or a wide single-line block."""
        text = block[4]
        if math_chars.search(text):
            return True
        single_line = "\n" not in text.strip()
        wide = (block[2] - block[0]) > page.rect.width * 0.5
        return single_line and wide

    for page_no, page in enumerate(doc, start=1):
        text_blocks = [b for b in page.get_text("blocks") if b[4].strip()]
        if len(text_blocks) < 3:
            continue

        right_edges = sorted(b[2] for b in text_blocks)
        content_right = right_edges[len(right_edges) // 2]

        # Only the first offender on a page is reported.
        offender = next(
            (b for b in text_blocks
             if b[2] > content_right + slack and _looks_like_formula(b, page)),
            None,
        )
        if offender is None:
            continue

        delta = offender[2] - content_right
        result.warn(
            "Formula overflow",
            f"Page {page_no}: Content extends {delta:.0f}pt beyond right content margin "
            "(possible formula overflow)"
        )
|
||
|
||
|
||
# ============================================================
|
||
# Main
|
||
# ============================================================
|
||
|
||
def run_qa(pdf_path, poster=False, skip_cover=False, check_tables=True, check_formulas=False):
    """Run the QA checks against one PDF file and return the collected result.

    Args:
        pdf_path: Path of the PDF to inspect.
        poster: Poster mode.  Skips the content-fill-ratio and
            TOC-without-cover checks and makes the full-bleed check cover
            every page.
        skip_cover: Exclude page 1 from the margin symmetry check.
        check_tables: Run the table centering check.
        check_formulas: Run the (optional) formula overflow check.

    Returns:
        A QAResult.  If *pdf_path* does not exist it contains a single
        "File" error and no check is run.

    Any exception raised while opening the document or by a check propagates
    to the caller; the document is closed in either case.
    """
    result = QAResult()

    if not os.path.exists(pdf_path):
        result.error("File", f"File not found: {pdf_path}")
        return result

    doc = pymupdf.open(pdf_path)
    try:
        result.add_info(f"File: {os.path.basename(pdf_path)}")
        result.add_info(f"Size: {os.path.getsize(pdf_path) / 1024:.1f} KB")
        if poster:
            result.add_info("Mode: poster (creative)")

        # Run all checks
        check_metadata(doc, result)
        check_page_size_consistency(doc, result)
        check_blank_pages(doc, result)
        check_punctuation(doc, result)
        check_colors(doc, result)
        check_font_embedding(doc, result)
        check_helvetica_in_cjk(doc, result)
        check_text_overflow(doc, result)
        if not poster:
            # Content fill ratio is not meaningful for posters — the last page
            # of a seamlessly-paginated poster naturally has less content.
            check_content_fill_ratio(doc, result)
        check_cover_bleed(doc, result, poster=poster)
        check_margin_symmetry(doc, result, skip_cover=skip_cover)
        if check_tables:
            check_table_centering(doc, result)
        if check_formulas:
            check_formula_overflow(doc, result)
        if not poster:
            check_toc_without_cover(doc, result)
    finally:
        # Release the file handle even when a check raises, so a failure in
        # one file of a batch does not leak the open document.
        doc.close()

    return result
|
||
|
||
|
||
def format_report(result):
    """Render a QA result as a plain-text report.

    Sections appear in the order info, passed checks, errors, warnings; an
    empty section is omitted.  The report ends with a one-line verdict: PASS
    when there are no issues, FAIL when at least one error exists, otherwise
    WARN.

    Args:
        result: Object exposing ``info`` and ``passes`` (lists of strings) and
            ``issues`` (list of ``(severity, category, message)`` tuples).

    Returns:
        The report as a single newline-joined string.
    """
    heavy_rule = "=" * 56
    light_rule = "-" * 56

    errors = [(cat, msg) for sev, cat, msg in result.issues if sev == 'ERROR']
    warns = [(cat, msg) for sev, cat, msg in result.issues if sev == 'WARN']

    out = [heavy_rule, " PDF Quality Assurance Report", heavy_rule]

    # Info
    if result.info:
        out += ["", "ℹ️ Info:"]
        out += [f" {msg}" for msg in result.info]

    # Passes
    if result.passes:
        out += ["", f"✅ Passed ({len(result.passes)}):"]
        out += [f" {msg}" for msg in result.passes]

    # Issues
    if errors:
        out += ["", f"❌ Errors ({len(errors)}):"]
        out += [f" [{cat}] {msg}" for cat, msg in errors]

    if warns:
        out += ["", f"⚠️ Warnings ({len(warns)}):"]
        out += [f" [{cat}] {msg}" for cat, msg in warns]

    # Summary
    if len(result.issues) == 0:
        verdict = "🎉 PASS — All checks passed!"
    elif errors:
        verdict = f"💀 FAIL — {len(errors)} error(s), {len(warns)} warning(s)"
    else:
        verdict = f"⚠️ WARN — {len(warns)} warning(s), optimization recommended"

    out += ["", light_rule, verdict, light_rule]

    return "\n".join(out)
|
||
|
||
|
||
if __name__ == "__main__":
    # Command-line entry point: parse switches, expand file patterns, then
    # print one report per matching PDF.
    if len(sys.argv) < 2:
        for usage_line in (
            "Usage: python3 pdf_qa.py <pdf_path>",
            " python3 pdf_qa.py *.pdf (batch check)",
            "Options:",
            " --poster Poster mode (creative)",
            " --skip-cover Skip page 1 margin symmetry check",
            " --no-tables Disable table centering check",
            " --formulas Enable formula overflow check",
        ):
            print(usage_line)
        sys.exit(1)

    import glob

    args = sys.argv[1:]

    # Each recognised switch is recorded and removed (first occurrence only),
    # so that whatever remains is treated as a file pattern.
    switches = {
        '--poster': False,
        '--skip-cover': False,
        '--no-tables': False,
        '--formulas': False,
    }
    for flag in switches:
        if flag in args:
            switches[flag] = True
            args.remove(flag)

    poster = switches['--poster']
    skip_cover = switches['--skip-cover']
    check_tables = not switches['--no-tables']
    check_formulas = switches['--formulas']

    # Expand patterns ourselves so quoted wildcards work too; patterns that
    # match nothing contribute no files.
    files = [path for pattern in args for path in glob.glob(pattern)]

    if not files:
        print(f"File not found: {args}")
        sys.exit(1)

    batch_mode = len(files) > 1
    for pdf_path in files:
        result = run_qa(
            pdf_path,
            poster=poster,
            skip_cover=skip_cover,
            check_tables=check_tables,
            check_formulas=check_formulas,
        )
        print(format_report(result))
        if batch_mode:
            print("\n")  # visual gap between consecutive reports
|