Files
mantle-ai-trader/skills/pdf/scripts/poster_validate.py
2026-06-06 05:21:10 +00:00

1338 lines
57 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
poster_validate.py — Pre- and post-generation quality checks for poster/creative PDFs.
Usage:
# Check HTML before PDF generation
python3 poster_validate.py check-html poster.html [--fix] [--output fixed.html]
# Check PDF after generation
python3 poster_validate.py check-pdf poster.pdf --source-html poster.html
Both commands emit a JSON report to stdout:
{"pass": bool, "source": "...", "check_type": "html"|"pdf",
"errors": [...], "warnings": [...], "info": [...]}
Exit codes:
0 pass (no errors; warnings/info are OK)
1 fail (at least one error)
2 script-level failure (bad arguments, unreadable file, …)
"""
from __future__ import annotations
import argparse
import json
import math
import os
import re
import sys
from html.parser import HTMLParser
from pathlib import Path
from typing import Any
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
GENERIC_FAMILIES = frozenset(
["serif", "sans-serif", "monospace", "cursive", "fantasy", "system-ui", "ui-serif",
"ui-sans-serif", "ui-monospace", "ui-rounded", "math", "emoji", "fangsong"]
)
SERIF_FONTS = frozenset(f.lower() for f in [
"Playfair Display", "Georgia", "Times New Roman", "Times", "Noto Serif",
"Noto Serif SC", "Noto Serif TC", "Noto Serif JP", "Noto Serif KR",
"Source Serif Pro", "Source Serif 4", "Merriweather", "Lora", "PT Serif",
"Libre Baskerville", "EB Garamond", "Cormorant Garamond", "Crimson Text",
"STSong", "FangSong", "KaiTi", "STKaiti", "Songti SC",
])
CHINESE_FONTS = frozenset(f.lower() for f in [
"SimHei", "Microsoft YaHei", "Noto Sans SC", "Noto Sans TC",
"Noto Sans CJK SC", "Noto Sans CJK TC", "PingFang SC", "PingFang TC",
"Source Han Sans SC", "Source Han Sans TC", "WenQuanYi Micro Hei",
"WenQuanYi Zen Hei", "Hiragino Sans GB", "STHeiti", "STXihei",
"Noto Serif SC", "Noto Serif TC", "Noto Serif CJK SC",
"Source Han Serif SC", "SimSun", "NSimSun", "FangSong", "KaiTi",
"STSong", "STFangsong", "STKaiti", "Songti SC", "Heiti SC",
])
# Selectors we treat as "main containers" whose overflow:hidden is dangerous.
# NOTE: .poster and .page are EXCLUDED because html2poster.js auto-injects
# overflow:hidden on them at render time. See SKILL.md Engine Selection Rules.
CONTAINER_SELECTORS = {"body", "html", ".slide",
"#app", "#root", ".container", ".wrapper", "main",
"section", "article"}
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _issue(code: str, message: str, severity: str = "error", line: int | None = None) -> dict:
d: dict[str, Any] = {"code": code, "message": message, "severity": severity}
if line is not None:
d["line"] = line
return d
def _line_number(full_text: str, pos: int) -> int:
"""Return 1-based line number for character position *pos*."""
return full_text.count("\n", 0, pos) + 1
# ---------------------------------------------------------------------------
# CSS regex helpers
# ---------------------------------------------------------------------------
_RE_FONT_FAMILY = re.compile(
r"font-family\s*:\s*([^;}\n]+)", re.IGNORECASE
)
_RE_FONT_SIZE = re.compile(
r"font-size\s*:\s*(\d+(?:\.\d+)?)\s*(px|pt|em|rem)", re.IGNORECASE
)
_RE_PAGE_SIZE = re.compile(
r"@page\s*\{[^}]*\bsize\s*:", re.IGNORECASE | re.DOTALL
)
_RE_CSS_URL = re.compile(
r"url\(\s*['\"]?(https?://[^'\")\s]+)['\"]?\s*\)", re.IGNORECASE
)
_RE_OVERFLOW = re.compile(
r"overflow\s*:\s*hidden", re.IGNORECASE
)
_RE_BG_WHITE = re.compile(
r"background(?:-color)?\s*:\s*(white|#fff(?:fff)?|transparent)\b", re.IGNORECASE
)
_RE_COLOR_HEX = re.compile(r"#([0-9a-fA-F]{3,8})")
_RE_COLOR_RGB = re.compile(r"rgb\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\)")
_RE_STYLE_BLOCK = re.compile(r"<style[^>]*>(.*?)</style>", re.IGNORECASE | re.DOTALL)
_RE_INLINE_STYLE = re.compile(r'style\s*=\s*["\']([^"\']*)["\']', re.IGNORECASE)
_RE_CSS_RULE = re.compile(
r"([^{]+)\{([^}]*)\}", re.DOTALL
)
_RE_WIDTH_PX = re.compile(r"width\s*:\s*(\d+(?:\.\d+)?)\s*px", re.IGNORECASE)
def _parse_font_list(raw: str) -> list[str]:
"""Split a font-family value into individual font names (unquoted, stripped)."""
fonts: list[str] = []
for part in raw.split(","):
name = part.strip().strip("'\"").strip()
if name:
fonts.append(name)
return fonts
def _has_generic(fonts: list[str]) -> bool:
return any(f.lower() in GENERIC_FAMILIES for f in fonts)
def _best_generic(fonts: list[str]) -> str:
"""Pick the best generic fallback for a list of named fonts."""
lower = [f.lower() for f in fonts]
if any(f in CHINESE_FONTS for f in lower):
return "sans-serif"
if any(f in SERIF_FONTS for f in lower):
return "serif"
return "sans-serif"
# ---------------------------------------------------------------------------
# Color / contrast helpers
# ---------------------------------------------------------------------------
def _hex_to_rgb(h: str) -> tuple[int, int, int] | None:
h = h.lstrip("#")
if len(h) == 3:
h = h[0]*2 + h[1]*2 + h[2]*2
if len(h) == 4:
h = h[0]*2 + h[1]*2 + h[2]*2 # ignore alpha
if len(h) == 8:
h = h[:6] # strip alpha
if len(h) != 6:
return None
try:
return int(h[0:2], 16), int(h[2:4], 16), int(h[4:6], 16)
except ValueError:
return None
def _relative_luminance(r: int, g: int, b: int) -> float:
"""WCAG 2.x relative luminance."""
def _c(v: int) -> float:
s = v / 255.0
return s / 12.92 if s <= 0.03928 else ((s + 0.055) / 1.055) ** 2.4
return 0.2126 * _c(r) + 0.7152 * _c(g) + 0.0722 * _c(b)
def _contrast_ratio(rgb1: tuple[int, int, int], rgb2: tuple[int, int, int]) -> float:
l1 = _relative_luminance(*rgb1)
l2 = _relative_luminance(*rgb2)
lighter = max(l1, l2)
darker = min(l1, l2)
return (lighter + 0.05) / (darker + 0.05)
def _extract_color(css_text: str, prop: str) -> tuple[int, int, int] | None:
"""Try to extract an RGB color for a given CSS property from a rule body."""
pat = re.compile(rf"{prop}\s*:\s*([^;]+)", re.IGNORECASE)
m = pat.search(css_text)
if not m:
return None
val = m.group(1).strip()
# Named colours (just the common ones)
named = {
"white": (255, 255, 255), "black": (0, 0, 0), "red": (255, 0, 0),
"green": (0, 128, 0), "blue": (0, 0, 255), "yellow": (255, 255, 0),
"grey": (128, 128, 128), "gray": (128, 128, 128), "transparent": (255, 255, 255),
}
low = val.lower().split()[0].rstrip(";")
if low in named:
return named[low]
m_rgb = _RE_COLOR_RGB.search(val)
if m_rgb:
return int(m_rgb.group(1)), int(m_rgb.group(2)), int(m_rgb.group(3))
m_hex = _RE_COLOR_HEX.search(val)
if m_hex:
return _hex_to_rgb(m_hex.group(1))
return None
# ---------------------------------------------------------------------------
# HTML visible text extractor (stdlib only)
# ---------------------------------------------------------------------------
class _TextExtractor(HTMLParser):
"""Extract visible text from HTML, skipping <script>, <style>, etc."""
_SKIP_TAGS = frozenset(["script", "style", "noscript", "template", "svg"])
def __init__(self) -> None:
super().__init__()
self._skip_depth = 0
self.parts: list[str] = []
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
if tag.lower() in self._SKIP_TAGS:
self._skip_depth += 1
def handle_endtag(self, tag: str) -> None:
if tag.lower() in self._SKIP_TAGS and self._skip_depth > 0:
self._skip_depth -= 1
def handle_data(self, data: str) -> None:
if self._skip_depth == 0:
self.parts.append(data)
def get_text(self) -> str:
return " ".join(self.parts)
def _html_visible_text(html: str) -> str:
ext = _TextExtractor()
try:
ext.feed(html)
except Exception:
pass
return ext.get_text()
# ---------------------------------------------------------------------------
# CHECK-HTML
# ---------------------------------------------------------------------------
def check_html(html_path: str, *, fix: bool = False, output_path: str | None = None) -> dict:
"""Run all HTML pre-checks. Return the JSON-serialisable report dict."""
path = Path(html_path)
if not path.is_file():
return {"pass": False, "source": html_path, "check_type": "html",
"errors": [_issue("FILE_NOT_FOUND", f"Cannot read '{html_path}'.")],
"warnings": [], "info": []}
raw = path.read_text(encoding="utf-8", errors="replace")
original = raw # keep for line-number lookup
# For fix mode: we collect all replacements and apply them at the end
# using re.sub on the original to avoid offset corruption
_all_fixes: list[tuple[str, str]] = [] # (old_text, new_text) applied in order
errors: list[dict] = []
warnings: list[dict] = []
info: list[dict] = []
# Collect all CSS text (style blocks + inline styles)
all_css_positions: list[tuple[str, int]] = [] # (css_text, char_offset_in_raw)
for m in _RE_STYLE_BLOCK.finditer(raw):
all_css_positions.append((m.group(1), m.start(1)))
for m in _RE_INLINE_STYLE.finditer(raw):
all_css_positions.append((m.group(1), m.start(1)))
all_css = "\n".join(c for c, _ in all_css_positions)
# ---- 1. FONT_NO_FALLBACK ----
for css_text, css_offset in all_css_positions:
for m in _RE_FONT_FAMILY.finditer(css_text):
fonts = _parse_font_list(m.group(1))
if fonts and not _has_generic(fonts):
abs_pos = css_offset + m.start()
ln = _line_number(original, abs_pos)
generic = _best_generic(fonts)
errors.append(_issue(
"FONT_NO_FALLBACK",
f"font-family {', '.join(repr(f) for f in fonts)} has no generic fallback. "
f"Add '{generic}' at the end.",
line=ln
))
if fix:
old_decl = m.group(0) # e.g. "font-family: 'Montserrat'"
val_part = m.group(1).rstrip().rstrip(";").rstrip()
new_decl = f"font-family: {val_part}, {generic}"
_all_fixes.append((old_decl, new_decl))
# ---- 2. OVERFLOW_HIDDEN_CONTAINER ----
for css_text, css_offset in all_css_positions:
for rule_m in _RE_CSS_RULE.finditer(css_text):
selector_raw = rule_m.group(1).strip()
body = rule_m.group(2)
if not _RE_OVERFLOW.search(body):
continue
# Check if this selector is a container
selectors = [s.strip().lower() for s in selector_raw.split(",")]
for sel in selectors:
# Strip pseudo-classes/elements for matching
base_sel = re.split(r"[:>\s+~]", sel)[0].strip()
if base_sel in CONTAINER_SELECTORS:
# Check for width < 200px exemption
w_m = _RE_WIDTH_PX.search(body)
if w_m and float(w_m.group(1)) < 200:
continue
abs_pos = css_offset + rule_m.start()
ln = _line_number(original, abs_pos)
errors.append(_issue(
"OVERFLOW_HIDDEN_CONTAINER",
f"'{base_sel}' has 'overflow: hidden' which clips content in PDF rendering. "
"Remove it or use 'overflow: visible'.",
line=ln
))
if fix:
old_rule = rule_m.group(0)
fixed_body = re.sub(r"overflow\s*:\s*hidden\s*;?\s*", "", body, flags=re.IGNORECASE)
new_rule = f"{rule_m.group(1)}{{{fixed_body}}}"
_all_fixes.append((old_rule, new_rule))
break # only report once per rule
# Also check inline styles on body/html tags
for tag_name in ("body", "html"):
tag_pat = re.compile(rf"<{tag_name}([^>]*)>", re.IGNORECASE)
for tm in tag_pat.finditer(raw):
attrs_str = tm.group(1)
style_m = re.search(r'style\s*=\s*["\']([^"\']*)["\']', attrs_str, re.IGNORECASE)
if style_m and _RE_OVERFLOW.search(style_m.group(1)):
ln = _line_number(original, tm.start())
errors.append(_issue(
"OVERFLOW_HIDDEN_CONTAINER",
f"<{tag_name}> has inline 'overflow: hidden' which clips content. Remove it.",
line=ln
))
if fix:
old_style = style_m.group(1)
new_style = re.sub(r"overflow\s*:\s*hidden\s*;?\s*", "", old_style, flags=re.IGNORECASE)
_all_fixes.append((old_style, new_style))
# ---- 2b. FIXED_SIZE_NO_SCREEN_ADAPT ----
# For fixed-size single-page designs (poster, infographic, certificate, card),
# check that @media screen auto-scale CSS is present.
# Detect fixed-size pages: html/body with explicit px width+height
_has_fixed_width = False
_has_fixed_height = False
_fixed_h_value = None
for css_text, css_offset in all_css_positions:
for rule_m in _RE_CSS_RULE.finditer(css_text):
selector_raw = rule_m.group(1).strip().lower()
body_text = rule_m.group(2)
selectors = [s.strip() for s in selector_raw.split(",")]
if any(s in ("body", "html", "html, body", "body, html") for s in selectors):
if re.search(r"(?<!max-)(?<!min-)width\s*:\s*\d+(?:\.\d+)?\s*px", body_text, re.IGNORECASE):
_has_fixed_width = True
h_m = re.search(r"(?<!max-)(?<!min-)height\s*:\s*(\d+(?:\.\d+)?)\s*px", body_text, re.IGNORECASE)
if h_m:
_has_fixed_height = True
_fixed_h_value = h_m.group(1)
# Also check .page / .poster containers
for css_text, css_offset in all_css_positions:
for rule_m in _RE_CSS_RULE.finditer(css_text):
selector_raw = rule_m.group(1).strip().lower()
body_text = rule_m.group(2)
selectors = [s.strip() for s in selector_raw.split(",")]
if any(s in (".page", ".poster", "#page", "#poster", ".slide") for s in selectors):
if re.search(r"(?<!max-)(?<!min-)width\s*:\s*\d+(?:\.\d+)?\s*px", body_text, re.IGNORECASE):
_has_fixed_width = True
h_m = re.search(r"(?<!max-)(?<!min-)height\s*:\s*(\d+(?:\.\d+)?)\s*px", body_text, re.IGNORECASE)
if h_m:
_has_fixed_height = True
if not _fixed_h_value:
_fixed_h_value = h_m.group(1)
_is_fixed_size_page = _has_fixed_width and _has_fixed_height
if _is_fixed_size_page:
# Check for @media screen rule
_has_media_screen = bool(re.search(r"@media\s+screen\s*\{", all_css, re.IGNORECASE))
if not _has_media_screen:
_height_hint = _fixed_h_value or "1400"
warnings.append(_issue(
"FIXED_SIZE_NO_SCREEN_ADAPT",
f"Fixed-size page detected ({_height_hint}px tall) but no @media screen rule found. "
"Browser preview will require scrolling to see the full page. "
"Add @media screen {{ html {{ height:auto; display:flex; justify-content:center; }} "
f"body {{ transform-origin:top center; scale:min(1, calc(100vh / {_height_hint})); "
"margin:0 auto; }} }} for auto-scaling preview.",
severity="warning"
))
# Check for @media screen + scale/transform (more specific)
if _has_media_screen:
# Extract full @media screen block content (handles nested braces)
_media_screen_content = ""
_ms_match = re.search(r"@media\s+screen\s*\{", all_css, re.IGNORECASE)
if _ms_match:
# Find matching closing brace (count nested braces)
_depth = 1
_start = _ms_match.end()
for _ci in range(_start, len(all_css)):
if all_css[_ci] == '{':
_depth += 1
elif all_css[_ci] == '}':
_depth -= 1
if _depth == 0:
_media_screen_content = all_css[_start:_ci]
break
_has_scale = bool(re.search(
r"(?:scale|transform|zoom)",
_media_screen_content, re.IGNORECASE
))
if not _has_scale:
warnings.append(_issue(
"SCREEN_ADAPT_NO_SCALE",
"@media screen block exists but lacks scale/transform/zoom for auto-fitting. "
f"Add: body {{ scale: min(1, calc(100vh / {_fixed_h_value or '1400'})); }}",
severity="warning"
))
# ---- 3. REMOTE_IMAGE ----
# <img src="http...">
for m in re.finditer(r'<img\s[^>]*src\s*=\s*["\']?(https?://[^\s"\'>\)]+)', raw, re.IGNORECASE):
url = m.group(1)
ln = _line_number(original, m.start())
warnings.append(_issue(
"REMOTE_IMAGE",
f"img src='{_truncate(url, 80)}' is a remote URL. "
"Download to images/ subdirectory and use relative path (src=\"images/filename.jpg\").",
severity="warning", line=ln
))
# CSS url(http...)
for css_text, css_offset in all_css_positions:
for m in _RE_CSS_URL.finditer(css_text):
url = m.group(1)
abs_pos = css_offset + m.start()
ln = _line_number(original, abs_pos)
warnings.append(_issue(
"REMOTE_IMAGE",
f"CSS url('{_truncate(url, 80)}') is a remote URL. "
"Download locally for reliable PDF generation.",
severity="warning", line=ln
))
# ---- 3b. ABSOLUTE_PATH ----
# <img src="file:///..." or src="/absolute/path">
for m in re.finditer(r'<img\s[^>]*src\s*=\s*["\']?(file://[^\s"\'>\)]+|/[^\s"\'>\)]+)', raw, re.IGNORECASE):
path_val = m.group(1)
ln = _line_number(original, m.start())
warnings.append(_issue(
"ABSOLUTE_PATH",
f"img src='{_truncate(path_val, 80)}' uses an absolute path. "
"Use relative path (src=\"images/filename.jpg\") for portability.",
severity="warning", line=ln
))
# ---- 4. NO_PAGE_SIZE ----
if not _RE_PAGE_SIZE.search(all_css):
warnings.append(_issue(
"NO_PAGE_SIZE",
"@page { size: ... } not found in CSS. Playwright will use default A4 "
"which may not match the poster design. Add explicit page size.",
severity="warning"
))
# ---- 4b. MISSING_MARGIN_RESET ----
# Check if html/body has margin:0 reset (Chromium defaults to body { margin: 8px })
has_margin_reset = bool(re.search(
r'(?:html|body|\*)\s*(?:,\s*(?:html|body|\*)\s*)?{[^}]*margin\s*:\s*0',
all_css, re.IGNORECASE
))
if not has_margin_reset:
warnings.append(_issue(
"MISSING_MARGIN_RESET",
"No 'margin: 0' found on html/body/*. Chromium's default body { margin: 8px } "
"will cause black edges on top/left/bottom of the poster PDF. "
"Add: html, body { margin: 0; padding: 0; }",
severity="warning"
))
# ---- 4c. PAGE_SIZE_CSS_VAR ----
# CSS variables are NOT resolved in @page rules — will silently fallback to A4
page_size_var_match = re.search(
r'@page\s*\{[^}]*\bsize\s*:[^;]*var\s*\(',
all_css, re.IGNORECASE | re.DOTALL
)
if page_size_var_match:
errors.append(_issue(
"PAGE_SIZE_CSS_VAR",
"@page { size } uses CSS variables (var(...)). Chromium does NOT resolve "
"CSS variables in @page rules — the page size will silently fallback to A4, "
"causing content to be off-center. Use concrete px values instead: "
"@page { size: 720px 960px; }",
severity="error"
))
# ---- 5. WHITE_BACKGROUND ----
_has_styled_bg = False
for css_text, _ in all_css_positions:
for rule_m in _RE_CSS_RULE.finditer(css_text):
selector_raw = rule_m.group(1).strip().lower()
body = rule_m.group(2)
selectors = [s.strip() for s in selector_raw.split(",")]
if any(s in ("body", "html", ":root") for s in selectors):
bg_m = re.search(r"background(?:-color)?\s*:\s*([^;]+)", body, re.IGNORECASE)
if bg_m:
val = bg_m.group(1).strip().lower()
if val in ("white", "#fff", "#ffffff", "transparent", "rgba(0,0,0,0)",
"rgba(255,255,255,1)", "rgb(255,255,255)"):
pass # still "white"-ish
else:
_has_styled_bg = True
if not _has_styled_bg:
warnings.append(_issue(
"WHITE_BACKGROUND",
"body/html has white, transparent, or no background. "
"Posters typically need a styled background colour or gradient.",
severity="warning"
))
# ---- 6. TINY_FONT ----
for css_text, css_offset in all_css_positions:
for m in _RE_FONT_SIZE.finditer(css_text):
size = float(m.group(1))
unit = m.group(2).lower()
# Convert to rough px
if unit == "pt":
size_px = size * (4 / 3)
elif unit in ("em", "rem"):
size_px = size * 16 # assume base 16px
else:
size_px = size
if size_px < 10:
abs_pos = css_offset + m.start()
ln = _line_number(original, abs_pos)
warnings.append(_issue(
"TINY_FONT",
f"font-size: {m.group(1)}{m.group(2)} ({size_px:.0f}px) may be unreadable in PDF output.",
severity="warning", line=ln
))
# ---- 7. COLOR_CONTRAST ----
_checked_pairs: set[tuple] = set()
for css_text, css_offset in all_css_positions:
for rule_m in _RE_CSS_RULE.finditer(css_text):
body = rule_m.group(2)
fg = _extract_color(body, "color")
bg = _extract_color(body, "background(?:-color)?")
if fg and bg:
pair = (fg, bg)
if pair not in _checked_pairs:
_checked_pairs.add(pair)
ratio = _contrast_ratio(fg, bg)
if ratio < 3.0:
abs_pos = css_offset + rule_m.start()
ln = _line_number(original, abs_pos)
warnings.append(_issue(
"COLOR_CONTRAST",
f"Text color rgb{fg} on background rgb{bg} has contrast ratio "
f"{ratio:.1f}:1 (< 3:1 minimum). May be hard to read.",
severity="warning", line=ln
))
# ---- 8. MISSING_PRINT_BG_NOTE ----
info.append(_issue(
"MISSING_PRINT_BG_NOTE",
"Remember to set print_background=True when calling page.pdf() in Playwright, "
"otherwise CSS backgrounds won't render.",
severity="info"
))
# ---- 9. OVERFLOW_DECORATION ----
# Detect decorative position:absolute elements that might overflow body width
# Look for patterns like: left: -50px, right: -100px, left: 120%, transform: translateX(...)
_overflow_positions = re.findall(
r'(?:left|right)\s*:\s*(-\d+(?:\.\d+)?(?:px|%|rem|em))\s*;',
all_css, re.IGNORECASE
) if all_css else []
for pos_val in _overflow_positions:
if pos_val.startswith('-') or (pos_val.endswith('%') and float(re.match(r'-?[\d.]+', pos_val).group()) > 100):
warnings.append(_issue(
"OVERFLOW_DECORATION",
f"CSS position '{pos_val}' may push decorative elements outside body width, "
"causing black edges in PDF. Keep absolute-positioned elements within [0, body_width].",
severity="warning"
))
break # Only warn once
# ---- 10. BG_COLOR_MISMATCH ----
# Check if body/html background matches main content container background
_body_bg = None
_canvas_bg = None
for css_text, css_offset in all_css_positions:
for rule_m in _RE_CSS_RULE.finditer(css_text):
selector_raw = rule_m.group(1).strip().lower()
body_text = rule_m.group(2)
selectors = [s.strip() for s in selector_raw.split(",")]
bg_m = re.search(r"background(?:-color)?\s*:\s*([^;]+)", body_text, re.IGNORECASE)
if bg_m:
bg_val = bg_m.group(1).strip().lower()
if any(s in ("body", "html") for s in selectors):
_body_bg = bg_val
if any(s in (".canvas", ".poster", ".poster-container", ".page") for s in selectors):
_canvas_bg = bg_val
# Helper: normalize a CSS background value to (r,g,b) for comparison.
# Returns None if value uses var(), gradient, url(), or unparseable.
def _bg_to_rgb(val: str):
if not val:
return None
v = val.strip().lower()
# Skip CSS variables, gradients, url() — can't compare reliably
if v.startswith("var(") or "gradient" in v or "url(" in v:
return None
# Use _extract_color machinery: wrap as "background: <val>" and extract
return _extract_color(f"background: {val};", "background")
_body_bg_rgb = _bg_to_rgb(_body_bg)
_canvas_bg_rgb = _bg_to_rgb(_canvas_bg)
if _body_bg and _canvas_bg and _body_bg != _canvas_bg:
# Skip if either uses CSS variables / gradients (can't reliably compare)
if _body_bg_rgb is not None and _canvas_bg_rgb is not None:
if _body_bg_rgb != _canvas_bg_rgb:
warnings.append(_issue(
"BG_COLOR_MISMATCH",
f"body background '{_body_bg}' differs from content container background "
f"'{_canvas_bg}'. This may cause visible color borders in PDF output. "
"Ensure html/body background matches the poster canvas color.",
severity="warning"
))
# else: unparseable (var/gradient/url) → skip, no warning
# ---- 10b. SCREEN_BG_MISMATCH ----
# Check if @media screen html background matches body/canvas background
# Parse @media screen independently (check 2b only runs for fixed-size pages)
_screen_has_media = bool(re.search(r"@media\s+screen\s*\{", all_css, re.IGNORECASE))
_screen_content = ""
if _screen_has_media:
_ms_m = re.search(r"@media\s+screen\s*\{", all_css, re.IGNORECASE)
if _ms_m:
_d = 1
_s = _ms_m.end()
for _ci in range(_s, len(all_css)):
if all_css[_ci] == '{':
_d += 1
elif all_css[_ci] == '}':
_d -= 1
if _d == 0:
_screen_content = all_css[_s:_ci]
break
if _screen_has_media and _screen_content:
_screen_html_bg = None
for rule_m in _RE_CSS_RULE.finditer(_screen_content):
selector_raw = rule_m.group(1).strip().lower()
body_text = rule_m.group(2)
selectors = [s.strip() for s in selector_raw.split(",")]
if any(s in ("html",) for s in selectors):
bg_m = re.search(r"background(?:-color)?\s*:\s*([^;]+)", body_text, re.IGNORECASE)
if bg_m:
_screen_html_bg = bg_m.group(1).strip().lower()
_ref_bg = _body_bg or _canvas_bg
if _screen_html_bg and _ref_bg and _screen_html_bg != _ref_bg:
# Normalize to RGB for comparison (skip if either is var/gradient/url)
_screen_rgb = _bg_to_rgb(_screen_html_bg)
_ref_rgb = _bg_to_rgb(_ref_bg)
if _screen_rgb is not None and _ref_rgb is not None:
if _screen_rgb != _ref_rgb:
warnings.append(_issue(
"SCREEN_BG_MISMATCH",
f"@media screen html background '{_screen_html_bg}' differs from "
f"body/canvas background '{_ref_bg}'. Browser preview will show "
"a different color border around the poster. "
"Set @media screen html background to match the poster background.",
severity="warning"
))
# else: unparseable (var/gradient) → skip
if _is_fixed_size_page and not _screen_html_bg:
# Check if body/html already sets background globally (would be inherited)
# If so, @media screen doesn't need its own background
if not _body_bg:
warnings.append(_issue(
"SCREEN_NO_BG",
"@media screen block exists but html has no explicit background color "
"(neither globally nor in @media screen). "
"Browser preview may show white borders around the poster. "
"Add: @media screen { html { background: <poster-bg-color>; } }",
severity="warning"
))
# ---- 10c. MULTIPAGE_BODY_BG_MISSING ----
# Multi-page documents with dark/colored .page backgrounds but no body background
# → sub-pixel gap between .page and @page reveals white body, causing white edges
_has_colored_page = False
# Resolve CSS variables from :root for var() expansion
_css_vars = {}
for css_text, _ in all_css_positions:
for rule_m in _RE_CSS_RULE.finditer(css_text):
sel = rule_m.group(1).strip().lower()
if ":root" in sel:
for vm in re.finditer(r"--(\w[\w-]*)\s*:\s*([^;]+)", rule_m.group(2)):
_css_vars[vm.group(1)] = vm.group(2).strip()
def _resolve_bg(val):
"""Resolve a CSS background value, expanding var() references once."""
v = val.strip().lower()
var_m = re.match(r"var\(--(\w[\w-]*)\)", v)
if var_m and var_m.group(1) in _css_vars:
return _css_vars[var_m.group(1)].strip().lower()
return v
for css_text, css_offset in all_css_positions:
for rule_m in _RE_CSS_RULE.finditer(css_text):
selector_raw = rule_m.group(1).strip().lower()
body_text = rule_m.group(2)
selectors = [s.strip() for s in selector_raw.split(",")]
if any(".page" in s for s in selectors):
bg_m = re.search(r"background(?:-color)?\s*:\s*([^;]+)", body_text, re.IGNORECASE)
if bg_m:
val = _resolve_bg(bg_m.group(1))
rgb = _bg_to_rgb(val)
if rgb is not None:
r_v, g_v, b_v = rgb
luminance = 0.299 * r_v + 0.587 * g_v + 0.114 * b_v
if luminance < 80:
_has_colored_page = True
elif "gradient" in val:
dark_hex = re.findall(r"#([0-9a-f]{3,6})", val)
for h in dark_hex:
if len(h) == 3:
h = h[0]*2 + h[1]*2 + h[2]*2
if len(h) == 6:
rv = int(h[0:2], 16)
gv = int(h[2:4], 16)
bv = int(h[4:6], 16)
if 0.299*rv + 0.587*gv + 0.114*bv < 80:
_has_colored_page = True
break
_page_div_count = len(re.findall(r'class\s*=\s*["\'][^"\']*\bpage\b', original, re.IGNORECASE))
if _page_div_count >= 2 and _has_colored_page and not _has_styled_bg:
warnings.append(_issue(
"MULTIPAGE_BODY_BG_MISSING",
"Multi-page document has dark/colored .page backgrounds but html/body has no "
"explicit background color. Playwright sub-pixel rounding creates <1px gaps "
"at .page edges where body background shows through — white body = visible "
"white edges on dark pages. Fix: set html,body { background: <darkest-page-color> }.",
severity="warning"
))
# ---- 11. HEIGHT_LOCKED ----
# Check if content containers use height:100% instead of min-height
_height_locked_selectors = set()
# Selectors that are content containers (not SVG/img/shape)
_content_containers = {".glass-canvas", ".shaped-canvas", ".process-list-container",
".process-list", ".grid-item", ".stat-block", ".delta-widget"}
for css_text, css_offset in all_css_positions:
for rule_m in _RE_CSS_RULE.finditer(css_text):
selector_raw = rule_m.group(1).strip()
body_text = rule_m.group(2)
# Check for height: 100% (not min-height)
if re.search(r"(?<!min-)height\s*:\s*100%", body_text, re.IGNORECASE):
selectors = [s.strip().lower() for s in selector_raw.split(",")]
for sel in selectors:
base_sel = re.split(r"[:>\s+~]", sel)[0].strip()
# Skip non-content elements
if base_sel in (".bg-layer", "svg", "img", ".shape-float",
".shape-circle", ".shape-wave", ".shape-diagonal_slash",
".shape-diamond", ".shape-wedge_right"):
continue
if base_sel in _content_containers:
_height_locked_selectors.add(base_sel)
# Also check inline styles
for m in re.finditer(r'class\s*=\s*["\']([^"\']*)["\']\s*[^>]*style\s*=\s*["\']([^"\']*)["\']', raw, re.IGNORECASE):
classes = m.group(1).lower().split()
style = m.group(2)
if re.search(r"(?<!min-)height\s*:\s*100%", style, re.IGNORECASE):
for cls in classes:
dotcls = f".{cls}"
if dotcls in _content_containers:
_height_locked_selectors.add(dotcls)
if _height_locked_selectors:
warnings.append(_issue(
"HEIGHT_LOCKED",
f"Content containers {', '.join(sorted(_height_locked_selectors))} use 'height: 100%' "
"instead of 'min-height: 100%'. This locks the container height and may clip content "
"when it exceeds the allocated grid area. Use 'min-height: 100%' to allow content to grow.",
severity="warning"
))
# ---- Apply all fixes (unified, avoids offset corruption) ----
# Apply larger replacements (full CSS rules like overflow fixes) before smaller
# ones (font-family declarations), so the larger match strings are still valid.
# Sort by length of old_text descending to ensure this order.
if fix and _all_fixes:
_all_fixes.sort(key=lambda pair: len(pair[0]), reverse=True)
for old_text, new_text in _all_fixes:
if old_text in raw:
raw = raw.replace(old_text, new_text, 1)
# ---- Build report ----
has_errors = len(errors) > 0
report: dict[str, Any] = {
"pass": not has_errors,
"source": html_path,
"check_type": "html",
"errors": errors,
"warnings": warnings,
"info": info,
}
# ---- Fix mode output ----
if fix:
if output_path:
Path(output_path).write_text(raw, encoding="utf-8")
report["fixed_file"] = output_path
else:
# Write fixed HTML to stdout after the JSON report to stderr
report["fixed_output"] = "stdout"
# We'll handle this in main()
report["_fixed_html"] = raw if fix else None
return report
# ---------------------------------------------------------------------------
# CHECK-PDF
# ---------------------------------------------------------------------------
def check_pdf(pdf_path: str, *, source_html: str | None = None, poster: bool = False) -> dict:
"""Run all PDF post-checks. Return the JSON-serialisable report dict."""
path = Path(pdf_path)
errors: list[dict] = []
warnings: list[dict] = []
info: list[dict] = []
if not path.is_file():
return {"pass": False, "source": pdf_path, "check_type": "pdf",
"errors": [_issue("FILE_NOT_FOUND", f"Cannot read '{pdf_path}'.")],
"warnings": [], "info": []}
# ---- 1. FILE_TOO_SMALL ----
file_size = path.stat().st_size
if file_size < 10 * 1024:
errors.append(_issue(
"FILE_TOO_SMALL",
f"PDF is only {file_size} bytes ({file_size/1024:.1f} KB). "
"Likely empty or broken.",
))
# ---- Import pdfplumber ----
try:
import pdfplumber
except ImportError:
errors.append(_issue(
"DEPENDENCY_MISSING",
"pdfplumber is not installed. Cannot perform PDF text checks. "
"Install with: pip install pdfplumber",
))
return {"pass": False, "source": pdf_path, "check_type": "pdf",
"errors": errors, "warnings": warnings, "info": info}
# ---- Open PDF ----
try:
pdf = pdfplumber.open(str(path))
except Exception as exc:
errors.append(_issue(
"PDF_UNREADABLE",
f"Cannot open PDF: {exc}",
))
return {"pass": False, "source": pdf_path, "check_type": "pdf",
"errors": errors, "warnings": warnings, "info": info}
pages = pdf.pages
# ---- 2. TEXT_MISSING (requires source HTML) ----
pdf_text_parts: list[str] = []
page_char_counts: list[int] = []
for page in pages:
try:
txt = page.extract_text() or ""
except Exception:
txt = ""
pdf_text_parts.append(txt)
meaningful = len(re.sub(r"\s", "", txt))
page_char_counts.append(meaningful)
pdf_chars = sum(page_char_counts)
if source_html:
html_p = Path(source_html)
if html_p.is_file():
html_raw = html_p.read_text(encoding="utf-8", errors="replace")
html_text = _html_visible_text(html_raw)
html_chars = len(re.sub(r"\s", "", html_text))
if html_chars > 0 and pdf_chars < html_chars * 0.30:
errors.append(_issue(
"TEXT_MISSING",
f"PDF contains only {pdf_chars} meaningful characters vs {html_chars} in "
f"source HTML ({pdf_chars/html_chars*100:.0f}%). "
"Fonts may have failed to load during rendering.",
))
else:
warnings.append(_issue(
"SOURCE_HTML_NOT_FOUND",
f"Source HTML '{source_html}' not found; skipping TEXT_MISSING check.",
severity="warning"
))
# ---- 3. DIMENSIONS_UNREASONABLE ----
for i, page in enumerate(pages):
w = page.width # in points
h = page.height
if w < 200 or h < 200:
warnings.append(_issue(
"DIMENSIONS_UNREASONABLE",
f"Page {i+1} dimensions are {w:.0f}×{h:.0f}pt "
f"({w/72:.1f}×{h/72:.1f}in). Unusually small for a poster.",
severity="warning"
))
break # only warn once
# ---- 4. ORPHAN_PAGE ----
# Skip for poster mode: seamlessly-paginated posters naturally have less content on the last page
if not poster and len(pages) > 1:
avg_chars = sum(page_char_counts[:-1]) / max(len(page_char_counts) - 1, 1)
last_chars = page_char_counts[-1]
if avg_chars > 0 and last_chars < avg_chars * 0.10:
warnings.append(_issue(
"ORPHAN_PAGE",
f"Last page (page {len(pages)}) has very little content "
f"({last_chars} chars vs avg {avg_chars:.0f}). "
"This may indicate accidental overflow. Check your page sizing.",
severity="warning"
))
pdf.close()
has_errors = len(errors) > 0
return {
"pass": not has_errors,
"source": pdf_path,
"check_type": "pdf",
"errors": errors,
"warnings": warnings,
"info": info,
}
# ---------------------------------------------------------------------------
# Utilities
# ---------------------------------------------------------------------------
def _truncate(s: str, max_len: int = 80) -> str:
return s if len(s) <= max_len else s[:max_len - 3] + "..."
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
# LaTeX .tex file checks
# ---------------------------------------------------------------------------
def check_tex(tex_path: str) -> dict:
"""Check a LaTeX .tex file for common issues, especially table overflow in dual-column layouts."""
errors = []
warnings = []
info = []
if not os.path.exists(tex_path):
return {"pass": False, "source": tex_path, "check_type": "tex",
"errors": [_issue("FILE_NOT_FOUND", f"File not found: {tex_path}")], "warnings": [], "info": []}
with open(tex_path, "r", encoding="utf-8", errors="replace") as f:
content = f.read()
lines = content.split("\n")
# Detect if this is a two-column document
is_twocolumn = bool(re.search(
r"\\documentclass\[.*?twocolumn.*?\]"
r"|\\twocolumn"
r"|\\begin\{multicols\}"
r"|IEEEtran|acmart|sig-alternate",
content, re.IGNORECASE
))
# ---- 1. BARE_TABULAR_IN_TWOCOLUMN ----
# Find \begin{tabular} not wrapped in resizebox/adjustbox/tabularx/tabular*
# and not inside a table* (full-width) environment
if is_twocolumn:
info.append(_issue("TWOCOLUMN_DETECTED",
"Two-column layout detected. Checking tables for overflow risk.",
severity="info"))
# Parse all tabular environments with line numbers
tabular_pattern = re.compile(r"\\begin\{tabular\}\s*\{([^}]*)\}", re.IGNORECASE)
for i, line in enumerate(lines, 1):
m = tabular_pattern.search(line)
if not m:
continue
col_spec = m.group(1)
# Count data columns (l, c, r, p{}, X — skip @{}, |, >{}, <{})
col_count = len(re.findall(r"[lcrX]|p\{[^}]*\}", col_spec))
# Check if this tabular is wrapped in protective environments
# Look backwards up to 10 lines for wrapping context
context_start = max(0, i - 11)
context = "\n".join(lines[context_start:i])
has_resizebox = bool(re.search(r"\\resizebox", context))
has_adjustbox = bool(re.search(r"\\adjustbox|\\begin\{adjustbox\}", context))
has_table_star = bool(re.search(r"\\begin\{table\*\}", context))
has_makebox = bool(re.search(r"\\makebox\[.*?\\textwidth\]", context))
is_protected = has_resizebox or has_adjustbox or has_table_star or has_makebox
if is_twocolumn and col_count >= 5 and not is_protected:
errors.append(_issue(
"BARE_TABULAR_OVERFLOW",
f"Line {i}: \\begin{{tabular}} with {col_count} columns in a two-column document, "
f"not wrapped in resizebox/adjustbox/table*. This WILL overflow \\columnwidth. "
f"Fix: use tabularx{{\\columnwidth}}, or wrap in \\resizebox{{\\columnwidth}}{{!}}{{...}}, "
f"or use table* for full-width.",
severity="error"
))
elif is_twocolumn and col_count >= 4 and not is_protected:
warnings.append(_issue(
"TABULAR_OVERFLOW_RISK",
f"Line {i}: \\begin{{tabular}} with {col_count} columns in two-column layout. "
f"May overflow \\columnwidth depending on content width. "
f"Consider tabularx{{\\columnwidth}} or \\small font size.",
severity="warning"
))
elif not is_twocolumn and col_count >= 7 and not is_protected:
warnings.append(_issue(
"TABULAR_WIDE",
f"Line {i}: \\begin{{tabular}} with {col_count} columns. "
f"May overflow \\textwidth. Consider tabularx or resizebox.",
severity="warning"
))
# ---- 2. TABULAR_WITHOUT_TABLE ----
# tabular not inside table/table* environment (no caption, no float)
for i, line in enumerate(lines, 1):
if re.search(r"\\begin\{tabular\}", line):
context_start = max(0, i - 6)
context = "\n".join(lines[context_start:i])
if not re.search(r"\\begin\{table\*?\}", context):
warnings.append(_issue(
"TABULAR_NO_FLOAT",
f"Line {i}: \\begin{{tabular}} not inside a table/table* float environment. "
f"Table will be inline (no caption, no \\label, no float positioning).",
severity="warning"
))
# ---- 3. PREFER_TABULARX ----
# Check if tabularx is loaded in preamble
has_tabularx_pkg = bool(re.search(r"\\usepackage.*\{tabularx\}", content))
tabular_count = len(re.findall(r"\\begin\{tabular\}", content))
tabularx_count = len(re.findall(r"\\begin\{tabularx\}", content))
tabular_star_count = len(re.findall(r"\\begin\{tabular\*\}", content))
if tabular_count > 0 and tabularx_count == 0 and tabular_star_count == 0:
if not has_tabularx_pkg:
warnings.append(_issue(
"TABULARX_NOT_LOADED",
f"Document has {tabular_count} tabular environment(s) but tabularx package is not loaded. "
f"Add \\usepackage{{tabularx}} to preamble for auto-width column support.",
severity="warning"
))
# ---- 4. IMAGE_NO_WIDTH ----
# \includegraphics without width/max width constraint
img_pattern = re.compile(r"\\includegraphics\s*(\[[^\]]*\])?\s*\{")
for i, line in enumerate(lines, 1):
m = img_pattern.search(line)
if m:
opts = m.group(1) or ""
if not re.search(r"width|max width|scale|height", opts, re.IGNORECASE):
warnings.append(_issue(
"IMAGE_NO_WIDTH",
f"Line {i}: \\includegraphics without width/height constraint. "
f"Add [max width=\\columnwidth] or [width=\\columnwidth] to prevent overflow.",
severity="warning"
))
# ---- 5. CJK_ASCII_QUOTES ----
# Detect ASCII " adjacent to CJK characters (common LLM mistake).
# LaTeX interprets " as right double quote, so "北漂" renders with two
# right quotes. Chinese text must use Unicode smart quotes “…”.
cjk_quote_pattern = re.compile(
r'[\u4e00-\u9fff\u3400-\u4dbf]"'
r'|"[\u4e00-\u9fff\u3400-\u4dbf]'
)
# Regex to strip inline command content that legitimately contains ASCII quotes
_inline_cmd_re = re.compile(
r'\\(?:texttt|url|path|lstinline)\{[^}]*\}'
r'|\\href\{[^}]*\}\{[^}]*\}'
r"|\\verb([|!@#])(.*?)\1"
)
# Track verbatim-like environments to skip
in_verbatim = False
for i, line in enumerate(lines, 1):
# Skip comment lines
if line.lstrip().startswith('%'):
continue
# Track verbatim/lstlisting/minted environments
if re.search(r'\\begin\{(verbatim|lstlisting|minted|Verbatim|lstcode)\}', line):
in_verbatim = True
continue
if re.search(r'\\end\{(verbatim|lstlisting|minted|Verbatim|lstcode)\}', line):
in_verbatim = False
continue
if in_verbatim:
continue
# Strip inline command arguments that may legitimately contain ASCII quotes
check_line = _inline_cmd_re.sub('', line)
if cjk_quote_pattern.search(check_line):
errors.append(_issue(
"CJK_ASCII_QUOTES",
f'Line {i}: ASCII \'"\' found adjacent to CJK characters. '
f'LaTeX interprets " as right double quote (\u201d), so "\u5317\u6f02" renders as \u201d\u5317\u6f02\u201d. '
f'Fix: use Unicode smart quotes \u201c\u5317\u6f02\u201d (U+201C / U+201D) for Chinese text. '
f'(This check skips verbatim/lstlisting/minted environments and '
f'\\texttt{{}}/\\url{{}}/\\href{{}}{{}}/\\verb|| inline commands.)',
severity="error"
))
break # One error is enough
# ---- 6. EQUATION_OVERFLOW_RISK ----
# Detect equations that are likely too wide for dual-column
if is_twocolumn:
in_equation = False
eq_start_line = 0
eq_content = ""
eq_env_name = ""
eq_envs = ["equation", "displaymath"]
for i, line in enumerate(lines, 1):
stripped = line.strip()
if stripped.startswith('%'):
continue
for env in eq_envs:
if f"\\begin{{{env}}}" in stripped:
in_equation = True
eq_start_line = i
eq_content = ""
eq_env_name = env
if f"\\end{{{env}}}" in stripped and in_equation:
in_equation = False
# Check for two equations joined by \quad on same line
if re.search(r'\\quad\s*\\math', eq_content) or eq_content.count('=') >= 2:
if '\\\\' not in eq_content and 'aligned' not in eq_content and 'split' not in eq_content:
warnings.append(_issue(
"EQUATION_DUAL_ON_LINE",
f"Line {eq_start_line}: {eq_env_name} environment contains multiple equations "
f"joined by \\quad (or 2+ '=' signs) without line breaks. "
f"In dual-column format this will overflow. "
f"Fix: use align/split/aligned environment with \\\\ line breaks.",
severity="warning"
))
# Check for very long equation content
math_len = len(re.sub(r'\\[a-zA-Z]+|\s|\{|\}|\[|\]|\^|_', '', eq_content))
if math_len > 80:
warnings.append(_issue(
"EQUATION_OVERFLOW_RISK",
f"Line {eq_start_line}: {eq_env_name} has ~{math_len} math chars "
f"(threshold 80). Likely overflows single column. "
f"Consider split, multline, or factoring out sub-expressions.",
severity="warning"
))
if in_equation:
eq_content += stripped + " "
# ---- 7. RESIZEBOX_TEXTWIDTH_IN_TWOCOLUMN ----
# \resizebox{\textwidth} inside table (not table*) in twocolumn
if is_twocolumn:
in_table_star = False
for i, line in enumerate(lines, 1):
stripped = line.strip()
if '\\begin{table*}' in stripped:
in_table_star = True
if '\\end{table*}' in stripped:
in_table_star = False
if not in_table_star and '\\resizebox' in stripped and '\\textwidth' in stripped:
errors.append(_issue(
"RESIZEBOX_TEXTWIDTH",
f"Line {i}: \\resizebox{{\\textwidth}} used inside single-column float. "
f"In two-column layouts, \\textwidth = full page width (~504pt), but "
f"a table float is only one column (~252pt). This causes ~50% overflow. "
f"Fix: use \\resizebox{{\\columnwidth}}{{!}} for single-column, "
f"or change to table* for full-width.",
severity="error"
))
# ---- 8. ALGORITHM_OVERFLOW_RISK ----
if is_twocolumn:
in_algo = False
algo_start = 0
has_small_font = False
for i, line in enumerate(lines, 1):
if '\\SetAlFnt' in line and '\\small' in line:
has_small_font = True
if '\\begin{algorithm}' in line and '\\begin{algorithm*}' not in line:
in_algo = True
algo_start = i
if '\\end{algorithm}' in line and in_algo:
in_algo = False
if not has_small_font:
warnings.append(_issue(
"ALGORITHM_NO_SMALL_FONT",
f"Line {algo_start}: algorithm environment in dual-column without \\SetAlFnt{{\\small}}. "
f"Algorithm text at default size frequently overflows narrow columns. "
f"Add \\SetAlFnt{{\\small}} before the algorithm.",
severity="warning"
))
# Check for very long KwInput/KwOutput lines
if in_algo and ('\\KwInput' in line or '\\KwOutput' in line or '\\KwIn' in line or '\\KwOut' in line):
content_len = len(line.strip())
if content_len > 120:
warnings.append(_issue(
"ALGORITHM_LONG_IO",
f"Line {i}: Algorithm Input/Output line is {content_len} chars (threshold 120). "
f"Long I/O lines overflow column width. Break into multiple lines with \\\\ or "
f"abbreviate parameter names.",
severity="warning"
))
# Summary info
info.append(_issue(
"TABLE_SUMMARY",
f"Tables: {tabular_count} tabular, {tabularx_count} tabularx, {tabular_star_count} tabular*. "
f"Two-column: {'yes' if is_twocolumn else 'no'}.",
severity="info"
))
has_errors = any(e["severity"] == "error" for e in errors)
return {
"pass": not has_errors,
"source": tex_path,
"check_type": "tex",
"errors": errors,
"warnings": warnings,
"info": info,
}
# ---------------------------------------------------------------------------
def main() -> int:
parser = argparse.ArgumentParser(
description="Validate poster HTML/PDF for common quality issues."
)
sub = parser.add_subparsers(dest="command")
# -- check-html --
p_html = sub.add_parser("check-html", help="Pre-check an HTML file before PDF generation.")
p_html.add_argument("html_file", help="Path to the HTML file.")
p_html.add_argument("--fix", action="store_true",
help="Auto-fix issues where possible and output corrected HTML.")
p_html.add_argument("--output", "-o", default=None,
help="Write fixed HTML to this file (default: stdout).")
# -- check-pdf --
p_pdf = sub.add_parser("check-pdf", help="Post-check a generated PDF file.")
p_pdf.add_argument("pdf_file", help="Path to the PDF file.")
p_pdf.add_argument("--source-html", default=None,
help="Path to the source HTML (enables TEXT_MISSING check).")
p_pdf.add_argument("--poster", action="store_true",
help="Poster mode: suppress ORPHAN_PAGE for seamlessly-paginated posters.")
# -- check-tex --
p_tex = sub.add_parser("check-tex", help="Check a LaTeX .tex file for common issues (table overflow, etc.).")
p_tex.add_argument("tex_file", help="Path to the .tex file.")
args = parser.parse_args()
if not args.command:
parser.print_help()
return 2
try:
if args.command == "check-html":
report = check_html(
args.html_file,
fix=args.fix,
output_path=args.output,
)
fixed_html = report.pop("_fixed_html", None)
if args.fix and fixed_html and not args.output:
# JSON report goes to stderr, fixed HTML to stdout
print(json.dumps(report, indent=2, ensure_ascii=False), file=sys.stderr)
print(fixed_html)
else:
report.pop("_fixed_html", None)
print(json.dumps(report, indent=2, ensure_ascii=False))
return 0 if report["pass"] else 1
elif args.command == "check-pdf":
report = check_pdf(
args.pdf_file,
source_html=args.source_html,
poster=getattr(args, 'poster', False),
)
print(json.dumps(report, indent=2, ensure_ascii=False))
return 0 if report["pass"] else 1
elif args.command == "check-tex":
report = check_tex(args.tex_file)
print(json.dumps(report, indent=2, ensure_ascii=False))
return 0 if report["pass"] else 1
except Exception as exc:
err_report = {
"pass": False,
"error": f"Script error: {exc}",
"check_type": args.command.replace("check-", ""),
}
print(json.dumps(err_report, indent=2, ensure_ascii=False), file=sys.stderr)
return 2
return 0
if __name__ == "__main__":
sys.exit(main())