#!/usr/bin/env python3
"""
poster_validate.py — Pre- and post-generation quality checks for poster/creative PDFs.

Usage:
    # Check HTML before PDF generation
    python3 poster_validate.py check-html poster.html [--fix] [--output fixed.html]

    # Check PDF after generation
    python3 poster_validate.py check-pdf poster.pdf --source-html poster.html

Both commands emit a JSON report to stdout:
    {"pass": bool, "source": "...", "check_type": "html"|"pdf",
     "errors": [...], "warnings": [...], "info": [...]}

Exit codes:
    0  pass (no errors; warnings/info are OK)
    1  fail (at least one error)
    2  script-level failure (bad arguments, unreadable file, …)
"""

from __future__ import annotations

import argparse
import json
import math
import os
import re
import sys
from html.parser import HTMLParser
from pathlib import Path
from typing import Any

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# CSS generic font families (compared case-insensitively by the helpers below).
GENERIC_FAMILIES = frozenset(
    [
        "serif", "sans-serif", "monospace", "cursive", "fantasy",
        "system-ui", "ui-serif", "ui-sans-serif", "ui-monospace",
        "ui-rounded", "math", "emoji", "fangsong",
    ]
)

# Named serif fonts, stored lower-cased for case-insensitive lookup.
SERIF_FONTS = frozenset(f.lower() for f in [
    "Playfair Display", "Georgia", "Times New Roman", "Times",
    "Noto Serif", "Noto Serif SC", "Noto Serif TC", "Noto Serif JP",
    "Noto Serif KR", "Source Serif Pro", "Source Serif 4", "Merriweather",
    "Lora", "PT Serif", "Libre Baskerville", "EB Garamond",
    "Cormorant Garamond", "Crimson Text", "STSong", "FangSong", "KaiTi",
    "STKaiti", "Songti SC",
])

# Named Chinese fonts (sans and serif), stored lower-cased.
CHINESE_FONTS = frozenset(f.lower() for f in [
    "SimHei", "Microsoft YaHei", "Noto Sans SC", "Noto Sans TC",
    "Noto Sans CJK SC", "Noto Sans CJK TC", "PingFang SC", "PingFang TC",
    "Source Han Sans SC", "Source Han Sans TC", "WenQuanYi Micro Hei",
    "WenQuanYi Zen Hei", "Hiragino Sans GB", "STHeiti", "STXihei",
    "Noto Serif SC", "Noto Serif TC", "Noto Serif CJK SC",
    "Source Han Serif SC", "SimSun", "NSimSun", "FangSong", "KaiTi",
    "STSong", "STFangsong", "STKaiti", "Songti SC", "Heiti SC",
])

# Selectors we treat as "main containers" whose overflow:hidden is dangerous.
# NOTE: .poster and .page are EXCLUDED because html2poster.js auto-injects
# overflow:hidden on them at render time. See SKILL.md Engine Selection Rules.
CONTAINER_SELECTORS = {
    "body", "html", ".slide", "#app", "#root", ".container", ".wrapper",
    "main", "section", "article",
}


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _issue(code: str, message: str, severity: str = "error",
           line: int | None = None) -> dict:
    """Build one report entry.

    The ``"line"`` key is only present when *line* is not ``None`` (a value
    of ``0`` is kept).
    """
    entry: dict[str, Any] = {
        "code": code,
        "message": message,
        "severity": severity,
    }
    if line is not None:
        entry["line"] = line
    return entry


def _line_number(full_text: str, pos: int) -> int:
    """Return 1-based line number for character position *pos*."""
    return full_text.count("\n", 0, pos) + 1


# ---------------------------------------------------------------------------
# CSS regex helpers
# ---------------------------------------------------------------------------

_RE_FONT_FAMILY = re.compile(
    r"font-family\s*:\s*([^;}\n]+)", re.IGNORECASE
)
_RE_FONT_SIZE = re.compile(
    r"font-size\s*:\s*(\d+(?:\.\d+)?)\s*(px|pt|em|rem)", re.IGNORECASE
)
_RE_PAGE_SIZE = re.compile(
    r"@page\s*\{[^}]*\bsize\s*:", re.IGNORECASE | re.DOTALL
)
_RE_CSS_URL = re.compile(
    r"url\(\s*['\"]?(https?://[^'\")\s]+)['\"]?\s*\)", re.IGNORECASE
)
_RE_OVERFLOW = re.compile(
    r"overflow\s*:\s*hidden", re.IGNORECASE
)
_RE_BG_WHITE = re.compile(
    r"background(?:-color)?\s*:\s*(white|#fff(?:fff)?|transparent)\b",
    re.IGNORECASE,
)
_RE_COLOR_HEX = re.compile(r"#([0-9a-fA-F]{3,8})")
_RE_COLOR_RGB = re.compile(r"rgb\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\)")
# Group 1 is the text between an opening <style ...> tag and its closing
# </style>.  The tag literals are required: without them the pattern matches
# every ">" in the document and captures nothing.
_RE_STYLE_BLOCK = re.compile(
    r"<style[^>]*>(.*?)</style>", re.IGNORECASE | re.DOTALL
)
_RE_INLINE_STYLE = re.compile(
    r'style\s*=\s*["\']([^"\']*)["\']', re.IGNORECASE
)
_RE_CSS_RULE = re.compile(
    r"([^{]+)\{([^}]*)\}", re.DOTALL
)
_RE_WIDTH_PX = re.compile(r"width\s*:\s*(\d+(?:\.\d+)?)\s*px", re.IGNORECASE)

# One or more hexadecimal digits and nothing else (used with fullmatch).
_RE_HEX_DIGITS = re.compile(r"[0-9a-fA-F]+")


def _parse_font_list(raw: str) -> list[str]:
    """Split a font-family value into individual font names (unquoted, stripped).

    Empty entries (e.g. from a trailing comma) are dropped.
    """
    candidates = (part.strip().strip("'\"").strip() for part in raw.split(","))
    return [name for name in candidates if name]


def _has_generic(fonts: list[str]) -> bool:
    """Return True if any entry is a CSS generic family (case-insensitive)."""
    return any(f.lower() in GENERIC_FAMILIES for f in fonts)


def _best_generic(fonts: list[str]) -> str:
    """Pick the best generic fallback for a list of named fonts.

    A Chinese font anywhere in the list wins and yields ``"sans-serif"``;
    otherwise a known serif font yields ``"serif"``; the default is
    ``"sans-serif"``.
    """
    lowered = [f.lower() for f in fonts]
    # Chinese fonts are checked first, so a font that is listed in both
    # CHINESE_FONTS and SERIF_FONTS resolves to "sans-serif".
    if any(f in CHINESE_FONTS for f in lowered):
        return "sans-serif"
    if any(f in SERIF_FONTS for f in lowered):
        return "serif"
    return "sans-serif"


# ---------------------------------------------------------------------------
# Color / contrast helpers
# ---------------------------------------------------------------------------

def _hex_to_rgb(h: str) -> tuple[int, int, int] | None:
    """Convert a CSS hex colour (with or without leading ``#``) to RGB.

    Accepts 3, 4, 6 and 8 digit forms; the alpha channel of the 4 and 8
    digit forms is ignored.  Returns ``None`` for any other length or when
    the string contains non-hexadecimal characters.
    """
    digits = h.lstrip("#")
    # int(x, 16) tolerates signs and whitespace ("+f", " f"), so validate
    # the characters explicitly instead of relying on ValueError.
    if not _RE_HEX_DIGITS.fullmatch(digits):
        return None
    if len(digits) in (3, 4):
        # Shorthand: double each of the first three digits; a 4th digit is alpha.
        digits = "".join(ch * 2 for ch in digits[:3])
    elif len(digits) == 8:
        digits = digits[:6]  # strip alpha
    if len(digits) != 6:
        return None
    return int(digits[0:2], 16), int(digits[2:4], 16), int(digits[4:6], 16)


def _relative_luminance(r: int, g: int, b: int) -> float:
    """WCAG 2.x relative luminance of an 8-bit sRGB colour."""
    def _c(v: int) -> float:
        s = v / 255.0
        return s / 12.92 if s <= 0.03928 else ((s + 0.055) / 1.055) ** 2.4
    return 0.2126 * _c(r) + 0.7152 * _c(g) + 0.0722 * _c(b)


def _contrast_ratio(rgb1: tuple[int, int, int],
                    rgb2: tuple[int, int, int]) -> float:
    """Return the contrast ratio of two colours; argument order is irrelevant."""
    l1 = _relative_luminance(*rgb1)
    l2 = _relative_luminance(*rgb2)
    lighter = max(l1, l2)
    darker = min(l1, l2)
    return (lighter + 0.05) / (darker + 0.05)


# Named colours (just the common ones).  "transparent" is treated as white.
_NAMED_COLORS = {
    "white": (255, 255, 255),
    "black": (0, 0, 0),
    "red": (255, 0, 0),
    "green": (0, 128, 0),
    "blue": (0, 0, 255),
    "yellow": (255, 255, 0),
    "grey": (128, 128, 128),
    "gray": (128, 128, 128),
    "transparent": (255, 255, 255),
}


def _extract_color(css_text: str, prop: str) -> tuple[int, int, int] | None:
    """Try to extract an RGB color for a given CSS property from a rule body.

    *prop* is interpolated into a regular expression as-is, so it may be a
    plain property name or a regex fragment.  The match is anchored so that
    the property name cannot start in the middle of a longer identifier:
    asking for ``color`` no longer picks up ``background-color``.

    Returns the colour of the first matching declaration as an
    ``(r, g, b)`` tuple, or ``None`` when the property is absent, its value
    is empty, or the value is not a recognised named / ``rgb()`` / hex colour.
    """
    pattern = re.compile(
        rf"(?<![\w-])(?:{prop})\s*:\s*([^;]+)", re.IGNORECASE
    )
    match = pattern.search(css_text)
    if match is None:
        return None

    value = match.group(1).strip()
    tokens = value.lower().split()
    if not tokens:
        # Declaration with a whitespace-only value, e.g. "color:  ;".
        return None

    # Only the first token is compared, so "white !important" still resolves.
    named = _NAMED_COLORS.get(tokens[0])
    if named is not None:
        return named

    m_rgb = _RE_COLOR_RGB.search(value)
    if m_rgb:
        return int(m_rgb.group(1)), int(m_rgb.group(2)), int(m_rgb.group(3))

    m_hex = _RE_COLOR_HEX.search(value)
    if m_hex:
        return _hex_to_rgb(m_hex.group(1))
    return None


# ---------------------------------------------------------------------------
# HTML visible text extractor (stdlib only)
# ---------------------------------------------------------------------------