1338 lines
57 KiB
Python
Executable File
1338 lines
57 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
"""
|
||
poster_validate.py — Pre- and post-generation quality checks for poster/creative PDFs.
|
||
|
||
Usage:
|
||
# Check HTML before PDF generation
|
||
python3 poster_validate.py check-html poster.html [--fix] [--output fixed.html]
|
||
|
||
# Check PDF after generation
|
||
python3 poster_validate.py check-pdf poster.pdf --source-html poster.html
|
||
|
||
Both commands emit a JSON report to stdout:
|
||
{"pass": bool, "source": "...", "check_type": "html"|"pdf",
|
||
"errors": [...], "warnings": [...], "info": [...]}
|
||
|
||
Exit codes:
|
||
0 pass (no errors; warnings/info are OK)
|
||
1 fail (at least one error)
|
||
2 script-level failure (bad arguments, unreadable file, …)
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
import math
|
||
import os
|
||
import re
|
||
import sys
|
||
from html.parser import HTMLParser
|
||
from pathlib import Path
|
||
from typing import Any
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Constants
|
||
# ---------------------------------------------------------------------------
|
||
|
||
GENERIC_FAMILIES = frozenset(
|
||
["serif", "sans-serif", "monospace", "cursive", "fantasy", "system-ui", "ui-serif",
|
||
"ui-sans-serif", "ui-monospace", "ui-rounded", "math", "emoji", "fangsong"]
|
||
)
|
||
|
||
SERIF_FONTS = frozenset(f.lower() for f in [
|
||
"Playfair Display", "Georgia", "Times New Roman", "Times", "Noto Serif",
|
||
"Noto Serif SC", "Noto Serif TC", "Noto Serif JP", "Noto Serif KR",
|
||
"Source Serif Pro", "Source Serif 4", "Merriweather", "Lora", "PT Serif",
|
||
"Libre Baskerville", "EB Garamond", "Cormorant Garamond", "Crimson Text",
|
||
"STSong", "FangSong", "KaiTi", "STKaiti", "Songti SC",
|
||
])
|
||
|
||
CHINESE_FONTS = frozenset(f.lower() for f in [
|
||
"SimHei", "Microsoft YaHei", "Noto Sans SC", "Noto Sans TC",
|
||
"Noto Sans CJK SC", "Noto Sans CJK TC", "PingFang SC", "PingFang TC",
|
||
"Source Han Sans SC", "Source Han Sans TC", "WenQuanYi Micro Hei",
|
||
"WenQuanYi Zen Hei", "Hiragino Sans GB", "STHeiti", "STXihei",
|
||
"Noto Serif SC", "Noto Serif TC", "Noto Serif CJK SC",
|
||
"Source Han Serif SC", "SimSun", "NSimSun", "FangSong", "KaiTi",
|
||
"STSong", "STFangsong", "STKaiti", "Songti SC", "Heiti SC",
|
||
])
|
||
|
||
# Selectors we treat as "main containers" whose overflow:hidden is dangerous.
|
||
# NOTE: .poster and .page are EXCLUDED because html2poster.js auto-injects
|
||
# overflow:hidden on them at render time. See SKILL.md Engine Selection Rules.
|
||
CONTAINER_SELECTORS = {"body", "html", ".slide",
|
||
"#app", "#root", ".container", ".wrapper", "main",
|
||
"section", "article"}
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def _issue(code: str, message: str, severity: str = "error", line: int | None = None) -> dict:
|
||
d: dict[str, Any] = {"code": code, "message": message, "severity": severity}
|
||
if line is not None:
|
||
d["line"] = line
|
||
return d
|
||
|
||
|
||
def _line_number(full_text: str, pos: int) -> int:
|
||
"""Return 1-based line number for character position *pos*."""
|
||
return full_text.count("\n", 0, pos) + 1
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# CSS regex helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
_RE_FONT_FAMILY = re.compile(
|
||
r"font-family\s*:\s*([^;}\n]+)", re.IGNORECASE
|
||
)
|
||
|
||
_RE_FONT_SIZE = re.compile(
|
||
r"font-size\s*:\s*(\d+(?:\.\d+)?)\s*(px|pt|em|rem)", re.IGNORECASE
|
||
)
|
||
|
||
_RE_PAGE_SIZE = re.compile(
|
||
r"@page\s*\{[^}]*\bsize\s*:", re.IGNORECASE | re.DOTALL
|
||
)
|
||
|
||
_RE_CSS_URL = re.compile(
|
||
r"url\(\s*['\"]?(https?://[^'\")\s]+)['\"]?\s*\)", re.IGNORECASE
|
||
)
|
||
|
||
_RE_OVERFLOW = re.compile(
|
||
r"overflow\s*:\s*hidden", re.IGNORECASE
|
||
)
|
||
|
||
_RE_BG_WHITE = re.compile(
|
||
r"background(?:-color)?\s*:\s*(white|#fff(?:fff)?|transparent)\b", re.IGNORECASE
|
||
)
|
||
|
||
_RE_COLOR_HEX = re.compile(r"#([0-9a-fA-F]{3,8})")
|
||
_RE_COLOR_RGB = re.compile(r"rgb\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\)")
|
||
|
||
_RE_STYLE_BLOCK = re.compile(r"<style[^>]*>(.*?)</style>", re.IGNORECASE | re.DOTALL)
|
||
_RE_INLINE_STYLE = re.compile(r'style\s*=\s*["\']([^"\']*)["\']', re.IGNORECASE)
|
||
|
||
_RE_CSS_RULE = re.compile(
|
||
r"([^{]+)\{([^}]*)\}", re.DOTALL
|
||
)
|
||
|
||
_RE_WIDTH_PX = re.compile(r"width\s*:\s*(\d+(?:\.\d+)?)\s*px", re.IGNORECASE)
|
||
|
||
|
||
def _parse_font_list(raw: str) -> list[str]:
|
||
"""Split a font-family value into individual font names (unquoted, stripped)."""
|
||
fonts: list[str] = []
|
||
for part in raw.split(","):
|
||
name = part.strip().strip("'\"").strip()
|
||
if name:
|
||
fonts.append(name)
|
||
return fonts
|
||
|
||
|
||
def _has_generic(fonts: list[str]) -> bool:
|
||
return any(f.lower() in GENERIC_FAMILIES for f in fonts)
|
||
|
||
|
||
def _best_generic(fonts: list[str]) -> str:
|
||
"""Pick the best generic fallback for a list of named fonts."""
|
||
lower = [f.lower() for f in fonts]
|
||
if any(f in CHINESE_FONTS for f in lower):
|
||
return "sans-serif"
|
||
if any(f in SERIF_FONTS for f in lower):
|
||
return "serif"
|
||
return "sans-serif"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Color / contrast helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _hex_to_rgb(h: str) -> tuple[int, int, int] | None:
|
||
h = h.lstrip("#")
|
||
if len(h) == 3:
|
||
h = h[0]*2 + h[1]*2 + h[2]*2
|
||
if len(h) == 4:
|
||
h = h[0]*2 + h[1]*2 + h[2]*2 # ignore alpha
|
||
if len(h) == 8:
|
||
h = h[:6] # strip alpha
|
||
if len(h) != 6:
|
||
return None
|
||
try:
|
||
return int(h[0:2], 16), int(h[2:4], 16), int(h[4:6], 16)
|
||
except ValueError:
|
||
return None
|
||
|
||
|
||
def _relative_luminance(r: int, g: int, b: int) -> float:
|
||
"""WCAG 2.x relative luminance."""
|
||
def _c(v: int) -> float:
|
||
s = v / 255.0
|
||
return s / 12.92 if s <= 0.03928 else ((s + 0.055) / 1.055) ** 2.4
|
||
return 0.2126 * _c(r) + 0.7152 * _c(g) + 0.0722 * _c(b)
|
||
|
||
|
||
def _contrast_ratio(rgb1: tuple[int, int, int], rgb2: tuple[int, int, int]) -> float:
|
||
l1 = _relative_luminance(*rgb1)
|
||
l2 = _relative_luminance(*rgb2)
|
||
lighter = max(l1, l2)
|
||
darker = min(l1, l2)
|
||
return (lighter + 0.05) / (darker + 0.05)
|
||
|
||
|
||
def _extract_color(css_text: str, prop: str) -> tuple[int, int, int] | None:
|
||
"""Try to extract an RGB color for a given CSS property from a rule body."""
|
||
pat = re.compile(rf"{prop}\s*:\s*([^;]+)", re.IGNORECASE)
|
||
m = pat.search(css_text)
|
||
if not m:
|
||
return None
|
||
val = m.group(1).strip()
|
||
# Named colours (just the common ones)
|
||
named = {
|
||
"white": (255, 255, 255), "black": (0, 0, 0), "red": (255, 0, 0),
|
||
"green": (0, 128, 0), "blue": (0, 0, 255), "yellow": (255, 255, 0),
|
||
"grey": (128, 128, 128), "gray": (128, 128, 128), "transparent": (255, 255, 255),
|
||
}
|
||
low = val.lower().split()[0].rstrip(";")
|
||
if low in named:
|
||
return named[low]
|
||
m_rgb = _RE_COLOR_RGB.search(val)
|
||
if m_rgb:
|
||
return int(m_rgb.group(1)), int(m_rgb.group(2)), int(m_rgb.group(3))
|
||
m_hex = _RE_COLOR_HEX.search(val)
|
||
if m_hex:
|
||
return _hex_to_rgb(m_hex.group(1))
|
||
return None
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# HTML visible text extractor (stdlib only)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class _TextExtractor(HTMLParser):
|
||
"""Extract visible text from HTML, skipping <script>, <style>, etc."""
|
||
|
||
_SKIP_TAGS = frozenset(["script", "style", "noscript", "template", "svg"])
|
||
|
||
def __init__(self) -> None:
|
||
super().__init__()
|
||
self._skip_depth = 0
|
||
self.parts: list[str] = []
|
||
|
||
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
|
||
if tag.lower() in self._SKIP_TAGS:
|
||
self._skip_depth += 1
|
||
|
||
def handle_endtag(self, tag: str) -> None:
|
||
if tag.lower() in self._SKIP_TAGS and self._skip_depth > 0:
|
||
self._skip_depth -= 1
|
||
|
||
def handle_data(self, data: str) -> None:
|
||
if self._skip_depth == 0:
|
||
self.parts.append(data)
|
||
|
||
def get_text(self) -> str:
|
||
return " ".join(self.parts)
|
||
|
||
|
||
def _html_visible_text(html: str) -> str:
|
||
ext = _TextExtractor()
|
||
try:
|
||
ext.feed(html)
|
||
except Exception:
|
||
pass
|
||
return ext.get_text()
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# CHECK-HTML
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def check_html(html_path: str, *, fix: bool = False, output_path: str | None = None) -> dict:
|
||
"""Run all HTML pre-checks. Return the JSON-serialisable report dict."""
|
||
|
||
path = Path(html_path)
|
||
if not path.is_file():
|
||
return {"pass": False, "source": html_path, "check_type": "html",
|
||
"errors": [_issue("FILE_NOT_FOUND", f"Cannot read '{html_path}'.")],
|
||
"warnings": [], "info": []}
|
||
|
||
raw = path.read_text(encoding="utf-8", errors="replace")
|
||
original = raw # keep for line-number lookup
|
||
# For fix mode: we collect all replacements and apply them at the end
|
||
# using re.sub on the original to avoid offset corruption
|
||
_all_fixes: list[tuple[str, str]] = [] # (old_text, new_text) applied in order
|
||
|
||
errors: list[dict] = []
|
||
warnings: list[dict] = []
|
||
info: list[dict] = []
|
||
|
||
# Collect all CSS text (style blocks + inline styles)
|
||
all_css_positions: list[tuple[str, int]] = [] # (css_text, char_offset_in_raw)
|
||
for m in _RE_STYLE_BLOCK.finditer(raw):
|
||
all_css_positions.append((m.group(1), m.start(1)))
|
||
for m in _RE_INLINE_STYLE.finditer(raw):
|
||
all_css_positions.append((m.group(1), m.start(1)))
|
||
|
||
all_css = "\n".join(c for c, _ in all_css_positions)
|
||
|
||
# ---- 1. FONT_NO_FALLBACK ----
|
||
for css_text, css_offset in all_css_positions:
|
||
for m in _RE_FONT_FAMILY.finditer(css_text):
|
||
fonts = _parse_font_list(m.group(1))
|
||
if fonts and not _has_generic(fonts):
|
||
abs_pos = css_offset + m.start()
|
||
ln = _line_number(original, abs_pos)
|
||
generic = _best_generic(fonts)
|
||
errors.append(_issue(
|
||
"FONT_NO_FALLBACK",
|
||
f"font-family {', '.join(repr(f) for f in fonts)} has no generic fallback. "
|
||
f"Add '{generic}' at the end.",
|
||
line=ln
|
||
))
|
||
if fix:
|
||
old_decl = m.group(0) # e.g. "font-family: 'Montserrat'"
|
||
val_part = m.group(1).rstrip().rstrip(";").rstrip()
|
||
new_decl = f"font-family: {val_part}, {generic}"
|
||
_all_fixes.append((old_decl, new_decl))
|
||
|
||
# ---- 2. OVERFLOW_HIDDEN_CONTAINER ----
|
||
for css_text, css_offset in all_css_positions:
|
||
for rule_m in _RE_CSS_RULE.finditer(css_text):
|
||
selector_raw = rule_m.group(1).strip()
|
||
body = rule_m.group(2)
|
||
if not _RE_OVERFLOW.search(body):
|
||
continue
|
||
# Check if this selector is a container
|
||
selectors = [s.strip().lower() for s in selector_raw.split(",")]
|
||
for sel in selectors:
|
||
# Strip pseudo-classes/elements for matching
|
||
base_sel = re.split(r"[:>\s+~]", sel)[0].strip()
|
||
if base_sel in CONTAINER_SELECTORS:
|
||
# Check for width < 200px exemption
|
||
w_m = _RE_WIDTH_PX.search(body)
|
||
if w_m and float(w_m.group(1)) < 200:
|
||
continue
|
||
abs_pos = css_offset + rule_m.start()
|
||
ln = _line_number(original, abs_pos)
|
||
errors.append(_issue(
|
||
"OVERFLOW_HIDDEN_CONTAINER",
|
||
f"'{base_sel}' has 'overflow: hidden' which clips content in PDF rendering. "
|
||
"Remove it or use 'overflow: visible'.",
|
||
line=ln
|
||
))
|
||
if fix:
|
||
old_rule = rule_m.group(0)
|
||
fixed_body = re.sub(r"overflow\s*:\s*hidden\s*;?\s*", "", body, flags=re.IGNORECASE)
|
||
new_rule = f"{rule_m.group(1)}{{{fixed_body}}}"
|
||
_all_fixes.append((old_rule, new_rule))
|
||
break # only report once per rule
|
||
|
||
# Also check inline styles on body/html tags
|
||
for tag_name in ("body", "html"):
|
||
tag_pat = re.compile(rf"<{tag_name}([^>]*)>", re.IGNORECASE)
|
||
for tm in tag_pat.finditer(raw):
|
||
attrs_str = tm.group(1)
|
||
style_m = re.search(r'style\s*=\s*["\']([^"\']*)["\']', attrs_str, re.IGNORECASE)
|
||
if style_m and _RE_OVERFLOW.search(style_m.group(1)):
|
||
ln = _line_number(original, tm.start())
|
||
errors.append(_issue(
|
||
"OVERFLOW_HIDDEN_CONTAINER",
|
||
f"<{tag_name}> has inline 'overflow: hidden' which clips content. Remove it.",
|
||
line=ln
|
||
))
|
||
if fix:
|
||
old_style = style_m.group(1)
|
||
new_style = re.sub(r"overflow\s*:\s*hidden\s*;?\s*", "", old_style, flags=re.IGNORECASE)
|
||
_all_fixes.append((old_style, new_style))
|
||
|
||
# ---- 2b. FIXED_SIZE_NO_SCREEN_ADAPT ----
|
||
# For fixed-size single-page designs (poster, infographic, certificate, card),
|
||
# check that @media screen auto-scale CSS is present.
|
||
# Detect fixed-size pages: html/body with explicit px width+height
|
||
_has_fixed_width = False
|
||
_has_fixed_height = False
|
||
_fixed_h_value = None
|
||
for css_text, css_offset in all_css_positions:
|
||
for rule_m in _RE_CSS_RULE.finditer(css_text):
|
||
selector_raw = rule_m.group(1).strip().lower()
|
||
body_text = rule_m.group(2)
|
||
selectors = [s.strip() for s in selector_raw.split(",")]
|
||
if any(s in ("body", "html", "html, body", "body, html") for s in selectors):
|
||
if re.search(r"(?<!max-)(?<!min-)width\s*:\s*\d+(?:\.\d+)?\s*px", body_text, re.IGNORECASE):
|
||
_has_fixed_width = True
|
||
h_m = re.search(r"(?<!max-)(?<!min-)height\s*:\s*(\d+(?:\.\d+)?)\s*px", body_text, re.IGNORECASE)
|
||
if h_m:
|
||
_has_fixed_height = True
|
||
_fixed_h_value = h_m.group(1)
|
||
# Also check .page / .poster containers
|
||
for css_text, css_offset in all_css_positions:
|
||
for rule_m in _RE_CSS_RULE.finditer(css_text):
|
||
selector_raw = rule_m.group(1).strip().lower()
|
||
body_text = rule_m.group(2)
|
||
selectors = [s.strip() for s in selector_raw.split(",")]
|
||
if any(s in (".page", ".poster", "#page", "#poster", ".slide") for s in selectors):
|
||
if re.search(r"(?<!max-)(?<!min-)width\s*:\s*\d+(?:\.\d+)?\s*px", body_text, re.IGNORECASE):
|
||
_has_fixed_width = True
|
||
h_m = re.search(r"(?<!max-)(?<!min-)height\s*:\s*(\d+(?:\.\d+)?)\s*px", body_text, re.IGNORECASE)
|
||
if h_m:
|
||
_has_fixed_height = True
|
||
if not _fixed_h_value:
|
||
_fixed_h_value = h_m.group(1)
|
||
|
||
_is_fixed_size_page = _has_fixed_width and _has_fixed_height
|
||
|
||
if _is_fixed_size_page:
|
||
# Check for @media screen rule
|
||
_has_media_screen = bool(re.search(r"@media\s+screen\s*\{", all_css, re.IGNORECASE))
|
||
if not _has_media_screen:
|
||
_height_hint = _fixed_h_value or "1400"
|
||
warnings.append(_issue(
|
||
"FIXED_SIZE_NO_SCREEN_ADAPT",
|
||
f"Fixed-size page detected ({_height_hint}px tall) but no @media screen rule found. "
|
||
"Browser preview will require scrolling to see the full page. "
|
||
"Add @media screen {{ html {{ height:auto; display:flex; justify-content:center; }} "
|
||
f"body {{ transform-origin:top center; scale:min(1, calc(100vh / {_height_hint})); "
|
||
"margin:0 auto; }} }} for auto-scaling preview.",
|
||
severity="warning"
|
||
))
|
||
|
||
# Check for @media screen + scale/transform (more specific)
|
||
if _has_media_screen:
|
||
# Extract full @media screen block content (handles nested braces)
|
||
_media_screen_content = ""
|
||
_ms_match = re.search(r"@media\s+screen\s*\{", all_css, re.IGNORECASE)
|
||
if _ms_match:
|
||
# Find matching closing brace (count nested braces)
|
||
_depth = 1
|
||
_start = _ms_match.end()
|
||
for _ci in range(_start, len(all_css)):
|
||
if all_css[_ci] == '{':
|
||
_depth += 1
|
||
elif all_css[_ci] == '}':
|
||
_depth -= 1
|
||
if _depth == 0:
|
||
_media_screen_content = all_css[_start:_ci]
|
||
break
|
||
_has_scale = bool(re.search(
|
||
r"(?:scale|transform|zoom)",
|
||
_media_screen_content, re.IGNORECASE
|
||
))
|
||
if not _has_scale:
|
||
warnings.append(_issue(
|
||
"SCREEN_ADAPT_NO_SCALE",
|
||
"@media screen block exists but lacks scale/transform/zoom for auto-fitting. "
|
||
f"Add: body {{ scale: min(1, calc(100vh / {_fixed_h_value or '1400'})); }}",
|
||
severity="warning"
|
||
))
|
||
|
||
# ---- 3. REMOTE_IMAGE ----
|
||
# <img src="http...">
|
||
for m in re.finditer(r'<img\s[^>]*src\s*=\s*["\']?(https?://[^\s"\'>\)]+)', raw, re.IGNORECASE):
|
||
url = m.group(1)
|
||
ln = _line_number(original, m.start())
|
||
warnings.append(_issue(
|
||
"REMOTE_IMAGE",
|
||
f"img src='{_truncate(url, 80)}' is a remote URL. "
|
||
"Download to images/ subdirectory and use relative path (src=\"images/filename.jpg\").",
|
||
severity="warning", line=ln
|
||
))
|
||
# CSS url(http...)
|
||
for css_text, css_offset in all_css_positions:
|
||
for m in _RE_CSS_URL.finditer(css_text):
|
||
url = m.group(1)
|
||
abs_pos = css_offset + m.start()
|
||
ln = _line_number(original, abs_pos)
|
||
warnings.append(_issue(
|
||
"REMOTE_IMAGE",
|
||
f"CSS url('{_truncate(url, 80)}') is a remote URL. "
|
||
"Download locally for reliable PDF generation.",
|
||
severity="warning", line=ln
|
||
))
|
||
|
||
# ---- 3b. ABSOLUTE_PATH ----
|
||
# <img src="file:///..." or src="/absolute/path">
|
||
for m in re.finditer(r'<img\s[^>]*src\s*=\s*["\']?(file://[^\s"\'>\)]+|/[^\s"\'>\)]+)', raw, re.IGNORECASE):
|
||
path_val = m.group(1)
|
||
ln = _line_number(original, m.start())
|
||
warnings.append(_issue(
|
||
"ABSOLUTE_PATH",
|
||
f"img src='{_truncate(path_val, 80)}' uses an absolute path. "
|
||
"Use relative path (src=\"images/filename.jpg\") for portability.",
|
||
severity="warning", line=ln
|
||
))
|
||
|
||
# ---- 4. NO_PAGE_SIZE ----
|
||
if not _RE_PAGE_SIZE.search(all_css):
|
||
warnings.append(_issue(
|
||
"NO_PAGE_SIZE",
|
||
"@page { size: ... } not found in CSS. Playwright will use default A4 "
|
||
"which may not match the poster design. Add explicit page size.",
|
||
severity="warning"
|
||
))
|
||
|
||
# ---- 4b. MISSING_MARGIN_RESET ----
|
||
# Check if html/body has margin:0 reset (Chromium defaults to body { margin: 8px })
|
||
has_margin_reset = bool(re.search(
|
||
r'(?:html|body|\*)\s*(?:,\s*(?:html|body|\*)\s*)?{[^}]*margin\s*:\s*0',
|
||
all_css, re.IGNORECASE
|
||
))
|
||
if not has_margin_reset:
|
||
warnings.append(_issue(
|
||
"MISSING_MARGIN_RESET",
|
||
"No 'margin: 0' found on html/body/*. Chromium's default body { margin: 8px } "
|
||
"will cause black edges on top/left/bottom of the poster PDF. "
|
||
"Add: html, body { margin: 0; padding: 0; }",
|
||
severity="warning"
|
||
))
|
||
|
||
# ---- 4c. PAGE_SIZE_CSS_VAR ----
|
||
# CSS variables are NOT resolved in @page rules — will silently fallback to A4
|
||
page_size_var_match = re.search(
|
||
r'@page\s*\{[^}]*\bsize\s*:[^;]*var\s*\(',
|
||
all_css, re.IGNORECASE | re.DOTALL
|
||
)
|
||
if page_size_var_match:
|
||
errors.append(_issue(
|
||
"PAGE_SIZE_CSS_VAR",
|
||
"@page { size } uses CSS variables (var(...)). Chromium does NOT resolve "
|
||
"CSS variables in @page rules — the page size will silently fallback to A4, "
|
||
"causing content to be off-center. Use concrete px values instead: "
|
||
"@page { size: 720px 960px; }",
|
||
severity="error"
|
||
))
|
||
|
||
# ---- 5. WHITE_BACKGROUND ----
|
||
_has_styled_bg = False
|
||
for css_text, _ in all_css_positions:
|
||
for rule_m in _RE_CSS_RULE.finditer(css_text):
|
||
selector_raw = rule_m.group(1).strip().lower()
|
||
body = rule_m.group(2)
|
||
selectors = [s.strip() for s in selector_raw.split(",")]
|
||
if any(s in ("body", "html", ":root") for s in selectors):
|
||
bg_m = re.search(r"background(?:-color)?\s*:\s*([^;]+)", body, re.IGNORECASE)
|
||
if bg_m:
|
||
val = bg_m.group(1).strip().lower()
|
||
if val in ("white", "#fff", "#ffffff", "transparent", "rgba(0,0,0,0)",
|
||
"rgba(255,255,255,1)", "rgb(255,255,255)"):
|
||
pass # still "white"-ish
|
||
else:
|
||
_has_styled_bg = True
|
||
if not _has_styled_bg:
|
||
warnings.append(_issue(
|
||
"WHITE_BACKGROUND",
|
||
"body/html has white, transparent, or no background. "
|
||
"Posters typically need a styled background colour or gradient.",
|
||
severity="warning"
|
||
))
|
||
|
||
# ---- 6. TINY_FONT ----
|
||
for css_text, css_offset in all_css_positions:
|
||
for m in _RE_FONT_SIZE.finditer(css_text):
|
||
size = float(m.group(1))
|
||
unit = m.group(2).lower()
|
||
# Convert to rough px
|
||
if unit == "pt":
|
||
size_px = size * (4 / 3)
|
||
elif unit in ("em", "rem"):
|
||
size_px = size * 16 # assume base 16px
|
||
else:
|
||
size_px = size
|
||
if size_px < 10:
|
||
abs_pos = css_offset + m.start()
|
||
ln = _line_number(original, abs_pos)
|
||
warnings.append(_issue(
|
||
"TINY_FONT",
|
||
f"font-size: {m.group(1)}{m.group(2)} ({size_px:.0f}px) may be unreadable in PDF output.",
|
||
severity="warning", line=ln
|
||
))
|
||
|
||
# ---- 7. COLOR_CONTRAST ----
|
||
_checked_pairs: set[tuple] = set()
|
||
for css_text, css_offset in all_css_positions:
|
||
for rule_m in _RE_CSS_RULE.finditer(css_text):
|
||
body = rule_m.group(2)
|
||
fg = _extract_color(body, "color")
|
||
bg = _extract_color(body, "background(?:-color)?")
|
||
if fg and bg:
|
||
pair = (fg, bg)
|
||
if pair not in _checked_pairs:
|
||
_checked_pairs.add(pair)
|
||
ratio = _contrast_ratio(fg, bg)
|
||
if ratio < 3.0:
|
||
abs_pos = css_offset + rule_m.start()
|
||
ln = _line_number(original, abs_pos)
|
||
warnings.append(_issue(
|
||
"COLOR_CONTRAST",
|
||
f"Text color rgb{fg} on background rgb{bg} has contrast ratio "
|
||
f"{ratio:.1f}:1 (< 3:1 minimum). May be hard to read.",
|
||
severity="warning", line=ln
|
||
))
|
||
|
||
# ---- 8. MISSING_PRINT_BG_NOTE ----
|
||
info.append(_issue(
|
||
"MISSING_PRINT_BG_NOTE",
|
||
"Remember to set print_background=True when calling page.pdf() in Playwright, "
|
||
"otherwise CSS backgrounds won't render.",
|
||
severity="info"
|
||
))
|
||
|
||
# ---- 9. OVERFLOW_DECORATION ----
|
||
# Detect decorative position:absolute elements that might overflow body width
|
||
# Look for patterns like: left: -50px, right: -100px, left: 120%, transform: translateX(...)
|
||
_overflow_positions = re.findall(
|
||
r'(?:left|right)\s*:\s*(-\d+(?:\.\d+)?(?:px|%|rem|em))\s*;',
|
||
all_css, re.IGNORECASE
|
||
) if all_css else []
|
||
for pos_val in _overflow_positions:
|
||
if pos_val.startswith('-') or (pos_val.endswith('%') and float(re.match(r'-?[\d.]+', pos_val).group()) > 100):
|
||
warnings.append(_issue(
|
||
"OVERFLOW_DECORATION",
|
||
f"CSS position '{pos_val}' may push decorative elements outside body width, "
|
||
"causing black edges in PDF. Keep absolute-positioned elements within [0, body_width].",
|
||
severity="warning"
|
||
))
|
||
break # Only warn once
|
||
|
||
# ---- 10. BG_COLOR_MISMATCH ----
|
||
# Check if body/html background matches main content container background
|
||
_body_bg = None
|
||
_canvas_bg = None
|
||
for css_text, css_offset in all_css_positions:
|
||
for rule_m in _RE_CSS_RULE.finditer(css_text):
|
||
selector_raw = rule_m.group(1).strip().lower()
|
||
body_text = rule_m.group(2)
|
||
selectors = [s.strip() for s in selector_raw.split(",")]
|
||
bg_m = re.search(r"background(?:-color)?\s*:\s*([^;]+)", body_text, re.IGNORECASE)
|
||
if bg_m:
|
||
bg_val = bg_m.group(1).strip().lower()
|
||
if any(s in ("body", "html") for s in selectors):
|
||
_body_bg = bg_val
|
||
if any(s in (".canvas", ".poster", ".poster-container", ".page") for s in selectors):
|
||
_canvas_bg = bg_val
|
||
# Helper: normalize a CSS background value to (r,g,b) for comparison.
|
||
# Returns None if value uses var(), gradient, url(), or unparseable.
|
||
def _bg_to_rgb(val: str):
|
||
if not val:
|
||
return None
|
||
v = val.strip().lower()
|
||
# Skip CSS variables, gradients, url() — can't compare reliably
|
||
if v.startswith("var(") or "gradient" in v or "url(" in v:
|
||
return None
|
||
# Use _extract_color machinery: wrap as "background: <val>" and extract
|
||
return _extract_color(f"background: {val};", "background")
|
||
|
||
_body_bg_rgb = _bg_to_rgb(_body_bg)
|
||
_canvas_bg_rgb = _bg_to_rgb(_canvas_bg)
|
||
|
||
if _body_bg and _canvas_bg and _body_bg != _canvas_bg:
|
||
# Skip if either uses CSS variables / gradients (can't reliably compare)
|
||
if _body_bg_rgb is not None and _canvas_bg_rgb is not None:
|
||
if _body_bg_rgb != _canvas_bg_rgb:
|
||
warnings.append(_issue(
|
||
"BG_COLOR_MISMATCH",
|
||
f"body background '{_body_bg}' differs from content container background "
|
||
f"'{_canvas_bg}'. This may cause visible color borders in PDF output. "
|
||
"Ensure html/body background matches the poster canvas color.",
|
||
severity="warning"
|
||
))
|
||
# else: unparseable (var/gradient/url) → skip, no warning
|
||
|
||
# ---- 10b. SCREEN_BG_MISMATCH ----
|
||
# Check if @media screen html background matches body/canvas background
|
||
# Parse @media screen independently (check 2b only runs for fixed-size pages)
|
||
_screen_has_media = bool(re.search(r"@media\s+screen\s*\{", all_css, re.IGNORECASE))
|
||
_screen_content = ""
|
||
if _screen_has_media:
|
||
_ms_m = re.search(r"@media\s+screen\s*\{", all_css, re.IGNORECASE)
|
||
if _ms_m:
|
||
_d = 1
|
||
_s = _ms_m.end()
|
||
for _ci in range(_s, len(all_css)):
|
||
if all_css[_ci] == '{':
|
||
_d += 1
|
||
elif all_css[_ci] == '}':
|
||
_d -= 1
|
||
if _d == 0:
|
||
_screen_content = all_css[_s:_ci]
|
||
break
|
||
if _screen_has_media and _screen_content:
|
||
_screen_html_bg = None
|
||
for rule_m in _RE_CSS_RULE.finditer(_screen_content):
|
||
selector_raw = rule_m.group(1).strip().lower()
|
||
body_text = rule_m.group(2)
|
||
selectors = [s.strip() for s in selector_raw.split(",")]
|
||
if any(s in ("html",) for s in selectors):
|
||
bg_m = re.search(r"background(?:-color)?\s*:\s*([^;]+)", body_text, re.IGNORECASE)
|
||
if bg_m:
|
||
_screen_html_bg = bg_m.group(1).strip().lower()
|
||
_ref_bg = _body_bg or _canvas_bg
|
||
if _screen_html_bg and _ref_bg and _screen_html_bg != _ref_bg:
|
||
# Normalize to RGB for comparison (skip if either is var/gradient/url)
|
||
_screen_rgb = _bg_to_rgb(_screen_html_bg)
|
||
_ref_rgb = _bg_to_rgb(_ref_bg)
|
||
if _screen_rgb is not None and _ref_rgb is not None:
|
||
if _screen_rgb != _ref_rgb:
|
||
warnings.append(_issue(
|
||
"SCREEN_BG_MISMATCH",
|
||
f"@media screen html background '{_screen_html_bg}' differs from "
|
||
f"body/canvas background '{_ref_bg}'. Browser preview will show "
|
||
"a different color border around the poster. "
|
||
"Set @media screen html background to match the poster background.",
|
||
severity="warning"
|
||
))
|
||
# else: unparseable (var/gradient) → skip
|
||
if _is_fixed_size_page and not _screen_html_bg:
|
||
# Check if body/html already sets background globally (would be inherited)
|
||
# If so, @media screen doesn't need its own background
|
||
if not _body_bg:
|
||
warnings.append(_issue(
|
||
"SCREEN_NO_BG",
|
||
"@media screen block exists but html has no explicit background color "
|
||
"(neither globally nor in @media screen). "
|
||
"Browser preview may show white borders around the poster. "
|
||
"Add: @media screen { html { background: <poster-bg-color>; } }",
|
||
severity="warning"
|
||
))
|
||
# ---- 10c. MULTIPAGE_BODY_BG_MISSING ----
|
||
# Multi-page documents with dark/colored .page backgrounds but no body background
|
||
# → sub-pixel gap between .page and @page reveals white body, causing white edges
|
||
_has_colored_page = False
|
||
# Resolve CSS variables from :root for var() expansion
|
||
_css_vars = {}
|
||
for css_text, _ in all_css_positions:
|
||
for rule_m in _RE_CSS_RULE.finditer(css_text):
|
||
sel = rule_m.group(1).strip().lower()
|
||
if ":root" in sel:
|
||
for vm in re.finditer(r"--(\w[\w-]*)\s*:\s*([^;]+)", rule_m.group(2)):
|
||
_css_vars[vm.group(1)] = vm.group(2).strip()
|
||
|
||
def _resolve_bg(val):
|
||
"""Resolve a CSS background value, expanding var() references once."""
|
||
v = val.strip().lower()
|
||
var_m = re.match(r"var\(--(\w[\w-]*)\)", v)
|
||
if var_m and var_m.group(1) in _css_vars:
|
||
return _css_vars[var_m.group(1)].strip().lower()
|
||
return v
|
||
|
||
for css_text, css_offset in all_css_positions:
|
||
for rule_m in _RE_CSS_RULE.finditer(css_text):
|
||
selector_raw = rule_m.group(1).strip().lower()
|
||
body_text = rule_m.group(2)
|
||
selectors = [s.strip() for s in selector_raw.split(",")]
|
||
if any(".page" in s for s in selectors):
|
||
bg_m = re.search(r"background(?:-color)?\s*:\s*([^;]+)", body_text, re.IGNORECASE)
|
||
if bg_m:
|
||
val = _resolve_bg(bg_m.group(1))
|
||
rgb = _bg_to_rgb(val)
|
||
if rgb is not None:
|
||
r_v, g_v, b_v = rgb
|
||
luminance = 0.299 * r_v + 0.587 * g_v + 0.114 * b_v
|
||
if luminance < 80:
|
||
_has_colored_page = True
|
||
elif "gradient" in val:
|
||
dark_hex = re.findall(r"#([0-9a-f]{3,6})", val)
|
||
for h in dark_hex:
|
||
if len(h) == 3:
|
||
h = h[0]*2 + h[1]*2 + h[2]*2
|
||
if len(h) == 6:
|
||
rv = int(h[0:2], 16)
|
||
gv = int(h[2:4], 16)
|
||
bv = int(h[4:6], 16)
|
||
if 0.299*rv + 0.587*gv + 0.114*bv < 80:
|
||
_has_colored_page = True
|
||
break
|
||
_page_div_count = len(re.findall(r'class\s*=\s*["\'][^"\']*\bpage\b', original, re.IGNORECASE))
|
||
if _page_div_count >= 2 and _has_colored_page and not _has_styled_bg:
|
||
warnings.append(_issue(
|
||
"MULTIPAGE_BODY_BG_MISSING",
|
||
"Multi-page document has dark/colored .page backgrounds but html/body has no "
|
||
"explicit background color. Playwright sub-pixel rounding creates <1px gaps "
|
||
"at .page edges where body background shows through — white body = visible "
|
||
"white edges on dark pages. Fix: set html,body { background: <darkest-page-color> }.",
|
||
severity="warning"
|
||
))
|
||
# ---- 11. HEIGHT_LOCKED ----
|
||
# Check if content containers use height:100% instead of min-height
|
||
_height_locked_selectors = set()
|
||
# Selectors that are content containers (not SVG/img/shape)
|
||
_content_containers = {".glass-canvas", ".shaped-canvas", ".process-list-container",
|
||
".process-list", ".grid-item", ".stat-block", ".delta-widget"}
|
||
for css_text, css_offset in all_css_positions:
|
||
for rule_m in _RE_CSS_RULE.finditer(css_text):
|
||
selector_raw = rule_m.group(1).strip()
|
||
body_text = rule_m.group(2)
|
||
# Check for height: 100% (not min-height)
|
||
if re.search(r"(?<!min-)height\s*:\s*100%", body_text, re.IGNORECASE):
|
||
selectors = [s.strip().lower() for s in selector_raw.split(",")]
|
||
for sel in selectors:
|
||
base_sel = re.split(r"[:>\s+~]", sel)[0].strip()
|
||
# Skip non-content elements
|
||
if base_sel in (".bg-layer", "svg", "img", ".shape-float",
|
||
".shape-circle", ".shape-wave", ".shape-diagonal_slash",
|
||
".shape-diamond", ".shape-wedge_right"):
|
||
continue
|
||
if base_sel in _content_containers:
|
||
_height_locked_selectors.add(base_sel)
|
||
# Also check inline styles
|
||
for m in re.finditer(r'class\s*=\s*["\']([^"\']*)["\']\s*[^>]*style\s*=\s*["\']([^"\']*)["\']', raw, re.IGNORECASE):
|
||
classes = m.group(1).lower().split()
|
||
style = m.group(2)
|
||
if re.search(r"(?<!min-)height\s*:\s*100%", style, re.IGNORECASE):
|
||
for cls in classes:
|
||
dotcls = f".{cls}"
|
||
if dotcls in _content_containers:
|
||
_height_locked_selectors.add(dotcls)
|
||
if _height_locked_selectors:
|
||
warnings.append(_issue(
|
||
"HEIGHT_LOCKED",
|
||
f"Content containers {', '.join(sorted(_height_locked_selectors))} use 'height: 100%' "
|
||
"instead of 'min-height: 100%'. This locks the container height and may clip content "
|
||
"when it exceeds the allocated grid area. Use 'min-height: 100%' to allow content to grow.",
|
||
severity="warning"
|
||
))
|
||
|
||
# ---- Apply all fixes (unified, avoids offset corruption) ----
|
||
# Apply larger replacements (full CSS rules like overflow fixes) before smaller
|
||
# ones (font-family declarations), so the larger match strings are still valid.
|
||
# Sort by length of old_text descending to ensure this order.
|
||
if fix and _all_fixes:
|
||
_all_fixes.sort(key=lambda pair: len(pair[0]), reverse=True)
|
||
for old_text, new_text in _all_fixes:
|
||
if old_text in raw:
|
||
raw = raw.replace(old_text, new_text, 1)
|
||
|
||
# ---- Build report ----
|
||
has_errors = len(errors) > 0
|
||
report: dict[str, Any] = {
|
||
"pass": not has_errors,
|
||
"source": html_path,
|
||
"check_type": "html",
|
||
"errors": errors,
|
||
"warnings": warnings,
|
||
"info": info,
|
||
}
|
||
|
||
# ---- Fix mode output ----
|
||
if fix:
|
||
if output_path:
|
||
Path(output_path).write_text(raw, encoding="utf-8")
|
||
report["fixed_file"] = output_path
|
||
else:
|
||
# Write fixed HTML to stdout after the JSON report to stderr
|
||
report["fixed_output"] = "stdout"
|
||
# We'll handle this in main()
|
||
|
||
report["_fixed_html"] = raw if fix else None
|
||
return report
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# CHECK-PDF
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def check_pdf(pdf_path: str, *, source_html: str | None = None, poster: bool = False) -> dict:
|
||
"""Run all PDF post-checks. Return the JSON-serialisable report dict."""
|
||
|
||
path = Path(pdf_path)
|
||
errors: list[dict] = []
|
||
warnings: list[dict] = []
|
||
info: list[dict] = []
|
||
|
||
if not path.is_file():
|
||
return {"pass": False, "source": pdf_path, "check_type": "pdf",
|
||
"errors": [_issue("FILE_NOT_FOUND", f"Cannot read '{pdf_path}'.")],
|
||
"warnings": [], "info": []}
|
||
|
||
# ---- 1. FILE_TOO_SMALL ----
|
||
file_size = path.stat().st_size
|
||
if file_size < 10 * 1024:
|
||
errors.append(_issue(
|
||
"FILE_TOO_SMALL",
|
||
f"PDF is only {file_size} bytes ({file_size/1024:.1f} KB). "
|
||
"Likely empty or broken.",
|
||
))
|
||
|
||
# ---- Import pdfplumber ----
|
||
try:
|
||
import pdfplumber
|
||
except ImportError:
|
||
errors.append(_issue(
|
||
"DEPENDENCY_MISSING",
|
||
"pdfplumber is not installed. Cannot perform PDF text checks. "
|
||
"Install with: pip install pdfplumber",
|
||
))
|
||
return {"pass": False, "source": pdf_path, "check_type": "pdf",
|
||
"errors": errors, "warnings": warnings, "info": info}
|
||
|
||
# ---- Open PDF ----
|
||
try:
|
||
pdf = pdfplumber.open(str(path))
|
||
except Exception as exc:
|
||
errors.append(_issue(
|
||
"PDF_UNREADABLE",
|
||
f"Cannot open PDF: {exc}",
|
||
))
|
||
return {"pass": False, "source": pdf_path, "check_type": "pdf",
|
||
"errors": errors, "warnings": warnings, "info": info}
|
||
|
||
pages = pdf.pages
|
||
|
||
# ---- 2. TEXT_MISSING (requires source HTML) ----
|
||
pdf_text_parts: list[str] = []
|
||
page_char_counts: list[int] = []
|
||
|
||
for page in pages:
|
||
try:
|
||
txt = page.extract_text() or ""
|
||
except Exception:
|
||
txt = ""
|
||
pdf_text_parts.append(txt)
|
||
meaningful = len(re.sub(r"\s", "", txt))
|
||
page_char_counts.append(meaningful)
|
||
|
||
pdf_chars = sum(page_char_counts)
|
||
|
||
if source_html:
|
||
html_p = Path(source_html)
|
||
if html_p.is_file():
|
||
html_raw = html_p.read_text(encoding="utf-8", errors="replace")
|
||
html_text = _html_visible_text(html_raw)
|
||
html_chars = len(re.sub(r"\s", "", html_text))
|
||
|
||
if html_chars > 0 and pdf_chars < html_chars * 0.30:
|
||
errors.append(_issue(
|
||
"TEXT_MISSING",
|
||
f"PDF contains only {pdf_chars} meaningful characters vs {html_chars} in "
|
||
f"source HTML ({pdf_chars/html_chars*100:.0f}%). "
|
||
"Fonts may have failed to load during rendering.",
|
||
))
|
||
else:
|
||
warnings.append(_issue(
|
||
"SOURCE_HTML_NOT_FOUND",
|
||
f"Source HTML '{source_html}' not found; skipping TEXT_MISSING check.",
|
||
severity="warning"
|
||
))
|
||
|
||
# ---- 3. DIMENSIONS_UNREASONABLE ----
|
||
for i, page in enumerate(pages):
|
||
w = page.width # in points
|
||
h = page.height
|
||
if w < 200 or h < 200:
|
||
warnings.append(_issue(
|
||
"DIMENSIONS_UNREASONABLE",
|
||
f"Page {i+1} dimensions are {w:.0f}×{h:.0f}pt "
|
||
f"({w/72:.1f}×{h/72:.1f}in). Unusually small for a poster.",
|
||
severity="warning"
|
||
))
|
||
break # only warn once
|
||
|
||
# ---- 4. ORPHAN_PAGE ----
|
||
# Skip for poster mode: seamlessly-paginated posters naturally have less content on the last page
|
||
if not poster and len(pages) > 1:
|
||
avg_chars = sum(page_char_counts[:-1]) / max(len(page_char_counts) - 1, 1)
|
||
last_chars = page_char_counts[-1]
|
||
if avg_chars > 0 and last_chars < avg_chars * 0.10:
|
||
warnings.append(_issue(
|
||
"ORPHAN_PAGE",
|
||
f"Last page (page {len(pages)}) has very little content "
|
||
f"({last_chars} chars vs avg {avg_chars:.0f}). "
|
||
"This may indicate accidental overflow. Check your page sizing.",
|
||
severity="warning"
|
||
))
|
||
|
||
pdf.close()
|
||
|
||
has_errors = len(errors) > 0
|
||
return {
|
||
"pass": not has_errors,
|
||
"source": pdf_path,
|
||
"check_type": "pdf",
|
||
"errors": errors,
|
||
"warnings": warnings,
|
||
"info": info,
|
||
}
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Utilities
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _truncate(s: str, max_len: int = 80) -> str:
|
||
return s if len(s) <= max_len else s[:max_len - 3] + "..."
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# CLI
|
||
# ---------------------------------------------------------------------------
|
||
# LaTeX .tex file checks
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def check_tex(tex_path: str) -> dict:
|
||
"""Check a LaTeX .tex file for common issues, especially table overflow in dual-column layouts."""
|
||
errors = []
|
||
warnings = []
|
||
info = []
|
||
|
||
if not os.path.exists(tex_path):
|
||
return {"pass": False, "source": tex_path, "check_type": "tex",
|
||
"errors": [_issue("FILE_NOT_FOUND", f"File not found: {tex_path}")], "warnings": [], "info": []}
|
||
|
||
with open(tex_path, "r", encoding="utf-8", errors="replace") as f:
|
||
content = f.read()
|
||
|
||
lines = content.split("\n")
|
||
|
||
# Detect if this is a two-column document
|
||
is_twocolumn = bool(re.search(
|
||
r"\\documentclass\[.*?twocolumn.*?\]"
|
||
r"|\\twocolumn"
|
||
r"|\\begin\{multicols\}"
|
||
r"|IEEEtran|acmart|sig-alternate",
|
||
content, re.IGNORECASE
|
||
))
|
||
|
||
# ---- 1. BARE_TABULAR_IN_TWOCOLUMN ----
|
||
# Find \begin{tabular} not wrapped in resizebox/adjustbox/tabularx/tabular*
|
||
# and not inside a table* (full-width) environment
|
||
if is_twocolumn:
|
||
info.append(_issue("TWOCOLUMN_DETECTED",
|
||
"Two-column layout detected. Checking tables for overflow risk.",
|
||
severity="info"))
|
||
|
||
# Parse all tabular environments with line numbers
|
||
tabular_pattern = re.compile(r"\\begin\{tabular\}\s*\{([^}]*)\}", re.IGNORECASE)
|
||
for i, line in enumerate(lines, 1):
|
||
m = tabular_pattern.search(line)
|
||
if not m:
|
||
continue
|
||
|
||
col_spec = m.group(1)
|
||
# Count data columns (l, c, r, p{}, X — skip @{}, |, >{}, <{})
|
||
col_count = len(re.findall(r"[lcrX]|p\{[^}]*\}", col_spec))
|
||
|
||
# Check if this tabular is wrapped in protective environments
|
||
# Look backwards up to 10 lines for wrapping context
|
||
context_start = max(0, i - 11)
|
||
context = "\n".join(lines[context_start:i])
|
||
has_resizebox = bool(re.search(r"\\resizebox", context))
|
||
has_adjustbox = bool(re.search(r"\\adjustbox|\\begin\{adjustbox\}", context))
|
||
has_table_star = bool(re.search(r"\\begin\{table\*\}", context))
|
||
has_makebox = bool(re.search(r"\\makebox\[.*?\\textwidth\]", context))
|
||
is_protected = has_resizebox or has_adjustbox or has_table_star or has_makebox
|
||
|
||
if is_twocolumn and col_count >= 5 and not is_protected:
|
||
errors.append(_issue(
|
||
"BARE_TABULAR_OVERFLOW",
|
||
f"Line {i}: \\begin{{tabular}} with {col_count} columns in a two-column document, "
|
||
f"not wrapped in resizebox/adjustbox/table*. This WILL overflow \\columnwidth. "
|
||
f"Fix: use tabularx{{\\columnwidth}}, or wrap in \\resizebox{{\\columnwidth}}{{!}}{{...}}, "
|
||
f"or use table* for full-width.",
|
||
severity="error"
|
||
))
|
||
elif is_twocolumn and col_count >= 4 and not is_protected:
|
||
warnings.append(_issue(
|
||
"TABULAR_OVERFLOW_RISK",
|
||
f"Line {i}: \\begin{{tabular}} with {col_count} columns in two-column layout. "
|
||
f"May overflow \\columnwidth depending on content width. "
|
||
f"Consider tabularx{{\\columnwidth}} or \\small font size.",
|
||
severity="warning"
|
||
))
|
||
elif not is_twocolumn and col_count >= 7 and not is_protected:
|
||
warnings.append(_issue(
|
||
"TABULAR_WIDE",
|
||
f"Line {i}: \\begin{{tabular}} with {col_count} columns. "
|
||
f"May overflow \\textwidth. Consider tabularx or resizebox.",
|
||
severity="warning"
|
||
))
|
||
|
||
# ---- 2. TABULAR_WITHOUT_TABLE ----
|
||
# tabular not inside table/table* environment (no caption, no float)
|
||
for i, line in enumerate(lines, 1):
|
||
if re.search(r"\\begin\{tabular\}", line):
|
||
context_start = max(0, i - 6)
|
||
context = "\n".join(lines[context_start:i])
|
||
if not re.search(r"\\begin\{table\*?\}", context):
|
||
warnings.append(_issue(
|
||
"TABULAR_NO_FLOAT",
|
||
f"Line {i}: \\begin{{tabular}} not inside a table/table* float environment. "
|
||
f"Table will be inline (no caption, no \\label, no float positioning).",
|
||
severity="warning"
|
||
))
|
||
|
||
# ---- 3. PREFER_TABULARX ----
|
||
# Check if tabularx is loaded in preamble
|
||
has_tabularx_pkg = bool(re.search(r"\\usepackage.*\{tabularx\}", content))
|
||
tabular_count = len(re.findall(r"\\begin\{tabular\}", content))
|
||
tabularx_count = len(re.findall(r"\\begin\{tabularx\}", content))
|
||
tabular_star_count = len(re.findall(r"\\begin\{tabular\*\}", content))
|
||
|
||
if tabular_count > 0 and tabularx_count == 0 and tabular_star_count == 0:
|
||
if not has_tabularx_pkg:
|
||
warnings.append(_issue(
|
||
"TABULARX_NOT_LOADED",
|
||
f"Document has {tabular_count} tabular environment(s) but tabularx package is not loaded. "
|
||
f"Add \\usepackage{{tabularx}} to preamble for auto-width column support.",
|
||
severity="warning"
|
||
))
|
||
|
||
# ---- 4. IMAGE_NO_WIDTH ----
|
||
# \includegraphics without width/max width constraint
|
||
img_pattern = re.compile(r"\\includegraphics\s*(\[[^\]]*\])?\s*\{")
|
||
for i, line in enumerate(lines, 1):
|
||
m = img_pattern.search(line)
|
||
if m:
|
||
opts = m.group(1) or ""
|
||
if not re.search(r"width|max width|scale|height", opts, re.IGNORECASE):
|
||
warnings.append(_issue(
|
||
"IMAGE_NO_WIDTH",
|
||
f"Line {i}: \\includegraphics without width/height constraint. "
|
||
f"Add [max width=\\columnwidth] or [width=\\columnwidth] to prevent overflow.",
|
||
severity="warning"
|
||
))
|
||
|
||
# ---- 5. CJK_ASCII_QUOTES ----
|
||
# Detect ASCII " adjacent to CJK characters (common LLM mistake).
|
||
# LaTeX interprets " as right double quote, so "北漂" renders with two
|
||
# right quotes. Chinese text must use Unicode smart quotes “…”.
|
||
cjk_quote_pattern = re.compile(
|
||
r'[\u4e00-\u9fff\u3400-\u4dbf]"'
|
||
r'|"[\u4e00-\u9fff\u3400-\u4dbf]'
|
||
)
|
||
# Regex to strip inline command content that legitimately contains ASCII quotes
|
||
_inline_cmd_re = re.compile(
|
||
r'\\(?:texttt|url|path|lstinline)\{[^}]*\}'
|
||
r'|\\href\{[^}]*\}\{[^}]*\}'
|
||
r"|\\verb([|!@#])(.*?)\1"
|
||
)
|
||
# Track verbatim-like environments to skip
|
||
in_verbatim = False
|
||
for i, line in enumerate(lines, 1):
|
||
# Skip comment lines
|
||
if line.lstrip().startswith('%'):
|
||
continue
|
||
# Track verbatim/lstlisting/minted environments
|
||
if re.search(r'\\begin\{(verbatim|lstlisting|minted|Verbatim|lstcode)\}', line):
|
||
in_verbatim = True
|
||
continue
|
||
if re.search(r'\\end\{(verbatim|lstlisting|minted|Verbatim|lstcode)\}', line):
|
||
in_verbatim = False
|
||
continue
|
||
if in_verbatim:
|
||
continue
|
||
# Strip inline command arguments that may legitimately contain ASCII quotes
|
||
check_line = _inline_cmd_re.sub('', line)
|
||
if cjk_quote_pattern.search(check_line):
|
||
errors.append(_issue(
|
||
"CJK_ASCII_QUOTES",
|
||
f'Line {i}: ASCII \'"\' found adjacent to CJK characters. '
|
||
f'LaTeX interprets " as right double quote (\u201d), so "\u5317\u6f02" renders as \u201d\u5317\u6f02\u201d. '
|
||
f'Fix: use Unicode smart quotes \u201c\u5317\u6f02\u201d (U+201C / U+201D) for Chinese text. '
|
||
f'(This check skips verbatim/lstlisting/minted environments and '
|
||
f'\\texttt{{}}/\\url{{}}/\\href{{}}{{}}/\\verb|| inline commands.)',
|
||
severity="error"
|
||
))
|
||
break # One error is enough
|
||
|
||
# ---- 6. EQUATION_OVERFLOW_RISK ----
|
||
# Detect equations that are likely too wide for dual-column
|
||
if is_twocolumn:
|
||
in_equation = False
|
||
eq_start_line = 0
|
||
eq_content = ""
|
||
eq_env_name = ""
|
||
eq_envs = ["equation", "displaymath"]
|
||
for i, line in enumerate(lines, 1):
|
||
stripped = line.strip()
|
||
if stripped.startswith('%'):
|
||
continue
|
||
for env in eq_envs:
|
||
if f"\\begin{{{env}}}" in stripped:
|
||
in_equation = True
|
||
eq_start_line = i
|
||
eq_content = ""
|
||
eq_env_name = env
|
||
if f"\\end{{{env}}}" in stripped and in_equation:
|
||
in_equation = False
|
||
# Check for two equations joined by \quad on same line
|
||
if re.search(r'\\quad\s*\\math', eq_content) or eq_content.count('=') >= 2:
|
||
if '\\\\' not in eq_content and 'aligned' not in eq_content and 'split' not in eq_content:
|
||
warnings.append(_issue(
|
||
"EQUATION_DUAL_ON_LINE",
|
||
f"Line {eq_start_line}: {eq_env_name} environment contains multiple equations "
|
||
f"joined by \\quad (or 2+ '=' signs) without line breaks. "
|
||
f"In dual-column format this will overflow. "
|
||
f"Fix: use align/split/aligned environment with \\\\ line breaks.",
|
||
severity="warning"
|
||
))
|
||
# Check for very long equation content
|
||
math_len = len(re.sub(r'\\[a-zA-Z]+|\s|\{|\}|\[|\]|\^|_', '', eq_content))
|
||
if math_len > 80:
|
||
warnings.append(_issue(
|
||
"EQUATION_OVERFLOW_RISK",
|
||
f"Line {eq_start_line}: {eq_env_name} has ~{math_len} math chars "
|
||
f"(threshold 80). Likely overflows single column. "
|
||
f"Consider split, multline, or factoring out sub-expressions.",
|
||
severity="warning"
|
||
))
|
||
if in_equation:
|
||
eq_content += stripped + " "
|
||
|
||
# ---- 7. RESIZEBOX_TEXTWIDTH_IN_TWOCOLUMN ----
|
||
# \resizebox{\textwidth} inside table (not table*) in twocolumn
|
||
if is_twocolumn:
|
||
in_table_star = False
|
||
for i, line in enumerate(lines, 1):
|
||
stripped = line.strip()
|
||
if '\\begin{table*}' in stripped:
|
||
in_table_star = True
|
||
if '\\end{table*}' in stripped:
|
||
in_table_star = False
|
||
if not in_table_star and '\\resizebox' in stripped and '\\textwidth' in stripped:
|
||
errors.append(_issue(
|
||
"RESIZEBOX_TEXTWIDTH",
|
||
f"Line {i}: \\resizebox{{\\textwidth}} used inside single-column float. "
|
||
f"In two-column layouts, \\textwidth = full page width (~504pt), but "
|
||
f"a table float is only one column (~252pt). This causes ~50% overflow. "
|
||
f"Fix: use \\resizebox{{\\columnwidth}}{{!}} for single-column, "
|
||
f"or change to table* for full-width.",
|
||
severity="error"
|
||
))
|
||
|
||
# ---- 8. ALGORITHM_OVERFLOW_RISK ----
|
||
if is_twocolumn:
|
||
in_algo = False
|
||
algo_start = 0
|
||
has_small_font = False
|
||
for i, line in enumerate(lines, 1):
|
||
if '\\SetAlFnt' in line and '\\small' in line:
|
||
has_small_font = True
|
||
if '\\begin{algorithm}' in line and '\\begin{algorithm*}' not in line:
|
||
in_algo = True
|
||
algo_start = i
|
||
if '\\end{algorithm}' in line and in_algo:
|
||
in_algo = False
|
||
if not has_small_font:
|
||
warnings.append(_issue(
|
||
"ALGORITHM_NO_SMALL_FONT",
|
||
f"Line {algo_start}: algorithm environment in dual-column without \\SetAlFnt{{\\small}}. "
|
||
f"Algorithm text at default size frequently overflows narrow columns. "
|
||
f"Add \\SetAlFnt{{\\small}} before the algorithm.",
|
||
severity="warning"
|
||
))
|
||
# Check for very long KwInput/KwOutput lines
|
||
if in_algo and ('\\KwInput' in line or '\\KwOutput' in line or '\\KwIn' in line or '\\KwOut' in line):
|
||
content_len = len(line.strip())
|
||
if content_len > 120:
|
||
warnings.append(_issue(
|
||
"ALGORITHM_LONG_IO",
|
||
f"Line {i}: Algorithm Input/Output line is {content_len} chars (threshold 120). "
|
||
f"Long I/O lines overflow column width. Break into multiple lines with \\\\ or "
|
||
f"abbreviate parameter names.",
|
||
severity="warning"
|
||
))
|
||
|
||
# Summary info
|
||
info.append(_issue(
|
||
"TABLE_SUMMARY",
|
||
f"Tables: {tabular_count} tabular, {tabularx_count} tabularx, {tabular_star_count} tabular*. "
|
||
f"Two-column: {'yes' if is_twocolumn else 'no'}.",
|
||
severity="info"
|
||
))
|
||
|
||
has_errors = any(e["severity"] == "error" for e in errors)
|
||
return {
|
||
"pass": not has_errors,
|
||
"source": tex_path,
|
||
"check_type": "tex",
|
||
"errors": errors,
|
||
"warnings": warnings,
|
||
"info": info,
|
||
}
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def main() -> int:
|
||
parser = argparse.ArgumentParser(
|
||
description="Validate poster HTML/PDF for common quality issues."
|
||
)
|
||
sub = parser.add_subparsers(dest="command")
|
||
|
||
# -- check-html --
|
||
p_html = sub.add_parser("check-html", help="Pre-check an HTML file before PDF generation.")
|
||
p_html.add_argument("html_file", help="Path to the HTML file.")
|
||
p_html.add_argument("--fix", action="store_true",
|
||
help="Auto-fix issues where possible and output corrected HTML.")
|
||
p_html.add_argument("--output", "-o", default=None,
|
||
help="Write fixed HTML to this file (default: stdout).")
|
||
|
||
# -- check-pdf --
|
||
p_pdf = sub.add_parser("check-pdf", help="Post-check a generated PDF file.")
|
||
p_pdf.add_argument("pdf_file", help="Path to the PDF file.")
|
||
p_pdf.add_argument("--source-html", default=None,
|
||
help="Path to the source HTML (enables TEXT_MISSING check).")
|
||
p_pdf.add_argument("--poster", action="store_true",
|
||
help="Poster mode: suppress ORPHAN_PAGE for seamlessly-paginated posters.")
|
||
|
||
# -- check-tex --
|
||
p_tex = sub.add_parser("check-tex", help="Check a LaTeX .tex file for common issues (table overflow, etc.).")
|
||
p_tex.add_argument("tex_file", help="Path to the .tex file.")
|
||
|
||
args = parser.parse_args()
|
||
|
||
if not args.command:
|
||
parser.print_help()
|
||
return 2
|
||
|
||
try:
|
||
if args.command == "check-html":
|
||
report = check_html(
|
||
args.html_file,
|
||
fix=args.fix,
|
||
output_path=args.output,
|
||
)
|
||
fixed_html = report.pop("_fixed_html", None)
|
||
|
||
if args.fix and fixed_html and not args.output:
|
||
# JSON report goes to stderr, fixed HTML to stdout
|
||
print(json.dumps(report, indent=2, ensure_ascii=False), file=sys.stderr)
|
||
print(fixed_html)
|
||
else:
|
||
report.pop("_fixed_html", None)
|
||
print(json.dumps(report, indent=2, ensure_ascii=False))
|
||
|
||
return 0 if report["pass"] else 1
|
||
|
||
elif args.command == "check-pdf":
|
||
report = check_pdf(
|
||
args.pdf_file,
|
||
source_html=args.source_html,
|
||
poster=getattr(args, 'poster', False),
|
||
)
|
||
print(json.dumps(report, indent=2, ensure_ascii=False))
|
||
return 0 if report["pass"] else 1
|
||
|
||
elif args.command == "check-tex":
|
||
report = check_tex(args.tex_file)
|
||
print(json.dumps(report, indent=2, ensure_ascii=False))
|
||
return 0 if report["pass"] else 1
|
||
|
||
except Exception as exc:
|
||
err_report = {
|
||
"pass": False,
|
||
"error": f"Script error: {exc}",
|
||
"check_type": args.command.replace("check-", ""),
|
||
}
|
||
print(json.dumps(err_report, indent=2, ensure_ascii=False), file=sys.stderr)
|
||
return 2
|
||
|
||
return 0
|
||
|
||
|
||
if __name__ == "__main__":
|
||
sys.exit(main())
|