Initial commit
This commit is contained in:
367
skills/pdf/scripts/cover_validate.js
Executable file
367
skills/pdf/scripts/cover_validate.js
Executable file
@@ -0,0 +1,367 @@
|
||||
#!/usr/bin/env node
|
||||
/**
 * cover_validate.js — Cover page overlap detection via Playwright rendering
 *
 * Detects text-vs-decorative-line overlap on cover HTML pages by:
 *   1. Rendering the HTML in Playwright
 *   2. Waiting for fonts to load
 *   3. Measuring bounding boxes of text elements and decorative line elements
 *   4. Checking the gap between each text box and each nearby line — the
 *      vertical gap for horizontal lines, the horizontal gap for vertical
 *      lines (minimum spacing = 1U = 5% of page width ≈ 30pt)
 *
 * Usage:
 *   node cover_validate.js cover.html
 *   node cover_validate.js cover.html --width 210mm --height 297mm
 *   node cover_validate.js cover.html --min-gap 30   # custom min gap in px (default: auto = 5% of width)
 *
 * Exit codes:
 *   0 = no overlap issues found
 *   1 = overlap detected (prints details to stderr)
 *   2 = script error (missing file, browser launch failure, etc.)
 *
 * This script is ONLY for cover pages. Do NOT use it on:
 *   - Multi-page documents (use html2pdf-next.js pre-render checks)
 *   - Posters (use html2poster.js which handles overflow automatically)
 */
|
||||
|
||||
'use strict';
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
|
||||
// ── Playwright import ──
|
||||
|
||||
// Load Playwright: try the `playwright` package first, then `playwright-core`.
// If neither can be required, report it and exit with code 2 (script error).
let playwright;
try {
  playwright = require('playwright');
} catch {
  try {
    playwright = require('playwright-core');
  } catch {
    console.error('✗ Neither playwright nor playwright-core is installed.');
    process.exit(2);
  }
}
|
||||
|
||||
// ── Chromium resolution (shared logic with html2poster.js) ──
|
||||
|
||||
/**
 * Locate a Chromium executable that can be launched.
 *
 * Resolution order:
 *   1. The path reported by `chromiumObj.executablePath()`, if it exists on disk.
 *   2. `PLAYWRIGHT_CHROMIUM_PATH` (when set), then a fixed list of macOS and
 *      Linux install locations; the first existing path wins.
 *
 * @param {{ executablePath: function(): string }} chromiumObj
 *   Object exposing `executablePath()`; a throwing call is treated as "no path".
 * @returns {{ status: ('ok'|'fallback'|'missing'), executablePath: string }}
 *   `ok` = path from `chromiumObj`, `fallback` = path from the candidate list,
 *   `missing` = nothing found (executablePath is the unusable reported path or '').
 */
function resolveChromium(chromiumObj) {
  let exe;
  try {
    exe = chromiumObj.executablePath();
  } catch (_) {
    exe = null;
  }
  if (exe && fs.existsSync(exe)) return { status: 'ok', executablePath: exe };

  const candidates = [
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
    '/usr/bin/chromium-browser',
    '/usr/bin/chromium',
    '/usr/bin/google-chrome',
  ];
  // An explicit override is checked before the built-in candidates.
  if (process.env.PLAYWRIGHT_CHROMIUM_PATH) candidates.unshift(process.env.PLAYWRIGHT_CHROMIUM_PATH);

  for (const candidate of candidates) {
    if (fs.existsSync(candidate)) return { status: 'fallback', executablePath: candidate };
  }
  return { status: 'missing', executablePath: exe || '' };
}
|
||||
|
||||
// ── CLI parsing ──
|
||||
|
||||
/**
 * Parse the cover_validate command line.
 *
 * Recognised options: `--width <val>`, `--height <val>`, `--min-gap <px>`,
 * `--help` / `-h` (prints usage and exits with code 0). The first token that
 * does not start with `-` and is not consumed as an option value is the input
 * file; later positional tokens and unknown options are ignored.
 *
 * @param {string[]} argv  Full argument vector (the first two entries are skipped).
 * @returns {{ input: (string|null), width: string, height: string, minGap: (number|null) }}
 *   `minGap` is null when not given or not a finite number.
 */
function parseArgs(argv) {
  const tokens = argv.slice(2);
  let input = null;
  let width = '210mm';
  let height = '297mm';
  let minGap = null;

  for (let i = 0; i < tokens.length; i++) {
    const t = tokens[i];
    // A value-taking option given as the very last token has no value;
    // keep the default rather than overwriting it with undefined.
    const hasValue = i + 1 < tokens.length;

    if (t === '--width') {
      if (hasValue) width = tokens[++i];
    } else if (t === '--height') {
      if (hasValue) height = tokens[++i];
    } else if (t === '--min-gap') {
      if (hasValue) {
        const parsed = parseFloat(tokens[++i]);
        // Unparsable values fall back to "auto" (null) instead of NaN.
        minGap = Number.isFinite(parsed) ? parsed : null;
      }
    } else if (t === '--help' || t === '-h') {
      console.log(`Usage: node cover_validate.js <cover.html> [options]

Options:
  --width <val>     Page width (default: 210mm)
  --height <val>    Page height (default: 297mm)
  --min-gap <px>    Minimum gap between text and decorative lines (default: 5% of width)
  --help            Show this help`);
      process.exit(0);
    } else if (!t.startsWith('-') && !input) {
      input = t;
    }
  }

  return { input, width, height, minGap };
}
|
||||
|
||||
// ── Convert CSS dimension string to px for viewport ──
|
||||
|
||||
/**
 * Convert a CSS length to whole CSS pixels (96 px per inch).
 *
 * Accepts a bare number (taken as px) or a non-negative number followed by one
 * of `px`, `in`, `cm`, `mm`, `pt` (case-insensitive). Anything else — a unit
 * this function cannot convert without layout context (`em`, `%`, ...), a
 * signed value, or non-numeric text — yields null so the caller can fall back
 * to its default instead of silently using a wrong size.
 *
 * @param {(string|number|null|undefined)} dim  Length such as '210mm', '8.5in', '720px', 720.
 * @returns {(number|null)} Rounded pixel value, or null when `dim` is empty or not convertible.
 */
function dimToPx(dim) {
  if (!dim) return null;

  const parsed = /^(\d+\.?\d*|\.\d+)\s*([a-z%]*)$/i.exec(String(dim).trim());
  if (!parsed) return null;

  const pxPerUnit = {
    '': 1,
    px: 1,
    in: 96,
    cm: 96 / 2.54,
    mm: 96 / 25.4,
    pt: 96 / 72,
  };
  const unit = parsed[2].toLowerCase();
  // Own-property check so units like "constructor" are not resolved via the prototype.
  if (!Object.prototype.hasOwnProperty.call(pxPerUnit, unit)) return null;

  return Math.round(parseFloat(parsed[1]) * pxPerUnit[unit]);
}
|
||||
|
||||
// ── Decorative line detection heuristics ──
// An element is classified as a decorative line when ANY of these holds:
//   - it is an <hr>
//   - it is at most 5px in one dimension and more than 20px in the other
//   - its aspect ratio exceeds 15:1 and its thin side is at most 8px
//   - it has no text, no background image, at least one non-zero border,
//     and is at most 8px in one dimension
// Any other visible element with a known text tag or a direct, non-blank text
// node is classified as text. Zero-sized and invisible elements are ignored.
//
// The constant holds the SOURCE TEXT of a function expression that takes the
// minimum gap in px and returns
//   { textElements: number, lineElements: number, overlaps: Array<object> }.
// It is kept as a string so it can be evaluated inside the rendered page.

const DECORATIVE_LINE_DETECTION = `
(function detectOverlaps(minGapPx) {
  const TEXT_TAGS = ['h1','h2','h3','h4','h5','h6','p','span','a','li','td','th','label','summary'];

  // className is an SVGAnimatedString object (not a string) on SVG elements
  // such as <line> or <rect>; read the class attribute there so the reported
  // class is always a plain string.
  function classOf(el) {
    if (typeof el.className === 'string') return el.className;
    return (el.getAttribute && el.getAttribute('class')) || '';
  }

  // Plain-object copy of a DOMRect so it can be returned from the page.
  function boxOf(rect) {
    return { x: rect.x, y: rect.y, width: rect.width, height: rect.height };
  }

  // True when two 1-D ranges share at least one point (touching counts).
  function rangesMeet(aStart, aEnd, bStart, bEnd) {
    return !(aEnd < bStart || bEnd < aStart);
  }

  // Distance between two 1-D ranges; 0 when they intersect.
  function rangeGap(aStart, aEnd, bStart, bEnd) {
    if (bStart >= aEnd) return bStart - aEnd;
    if (aStart >= bEnd) return aStart - bEnd;
    return 0;
  }

  const textElements = [];
  const lineElements = [];

  // ── Classify every element as line, text, or neither ──
  for (const el of document.querySelectorAll('*')) {
    const rect = el.getBoundingClientRect();
    if (rect.width === 0 || rect.height === 0) continue;

    const tag = el.tagName.toLowerCase();
    const style = getComputedStyle(el);

    // Skip invisible elements
    if (style.display === 'none' || style.visibility === 'hidden' || style.opacity === '0') continue;

    const isHR = tag === 'hr';
    const isThinH = rect.height <= 5 && rect.width > 20;   // thin horizontal line
    const isThinV = rect.width <= 5 && rect.height > 20;   // thin vertical line
    const isWideRatio = rect.width / rect.height > 15 && rect.height <= 8;  // very wide, very thin
    const isTallRatio = rect.height / rect.width > 15 && rect.width <= 8;   // very tall, very thin

    // Empty element drawn only by its border (no text, no background image)
    const hasOnlyBorder = (
      el.textContent.trim() === '' &&
      style.backgroundImage === 'none' &&
      (style.borderTopWidth !== '0px' || style.borderBottomWidth !== '0px' ||
       style.borderLeftWidth !== '0px' || style.borderRightWidth !== '0px')
    );
    const isBorderLine = hasOnlyBorder && (rect.height <= 8 || rect.width <= 8);

    if (isHR || isThinH || isThinV || isWideRatio || isTallRatio || isBorderLine) {
      let type;
      if (isThinH || isWideRatio) type = 'horizontal';
      else if (isThinV || isTallRatio) type = 'vertical';
      else type = rect.width >= rect.height ? 'horizontal' : 'vertical';

      lineElements.push({ tag: tag, class: classOf(el), rect: boxOf(rect), type: type });
      continue;
    }

    // Text element: a known text tag, or any element with a non-blank direct text node
    const hasDirectText = Array.from(el.childNodes).some(n => n.nodeType === 3 && n.textContent.trim());
    if (TEXT_TAGS.includes(tag) || hasDirectText) {
      // Ignore boxes under 3px tall
      if (rect.height < 3) continue;

      textElements.push({
        tag: tag,
        class: classOf(el),
        text: el.textContent.trim().substring(0, 60),
        rect: boxOf(rect),
      });
    }
  }

  // Smallest text boxes first: each line is reported at most once (see
  // reportedLines), so the smallest offending text box is the one reported
  // rather than a larger ancestor.
  textElements.sort((a, b) => (a.rect.width * a.rect.height) - (b.rect.width * b.rect.height));

  const overlaps = [];
  const reportedLines = new Set();  // keys: orientation + rounded line origin

  for (const text of textElements) {
    for (const line of lineElements) {
      const tr = text.rect;
      const lr = line.rect;
      const horizontal = line.type === 'horizontal';

      // Along the line's own direction the two boxes must share some range;
      // otherwise the line is not next to this text at all.
      const sideBySide = horizontal
        ? rangesMeet(tr.x, tr.x + tr.width, lr.x, lr.x + lr.width)
        : rangesMeet(tr.y, tr.y + tr.height, lr.y, lr.y + lr.height);
      if (!sideBySide) continue;

      // Gap is measured across the line: vertical for horizontal lines,
      // horizontal for vertical lines.
      const gap = horizontal
        ? rangeGap(tr.y, tr.y + tr.height, lr.y, lr.y + lr.height)
        : rangeGap(tr.x, tr.x + tr.width, lr.x, lr.x + lr.width);
      if (!(gap < minGapPx)) continue;

      const lineKey = (horizontal ? 'h:' : 'v:') + Math.round(lr.x) + ',' + Math.round(lr.y);
      if (reportedLines.has(lineKey)) continue;
      reportedLines.add(lineKey);

      overlaps.push({
        text: text.text,
        textTag: text.tag,
        textClass: text.class,
        textRect: tr,
        lineTag: line.tag,
        lineClass: line.class,
        lineRect: lr,
        lineType: line.type,
        gap: Math.round(gap * 10) / 10,
        required: minGapPx,
      });
    }
  }

  return {
    textElements: textElements.length,
    lineElements: lineElements.length,
    overlaps: overlaps,
  };
})
`;
|
||||
|
||||
// ── Main ──
|
||||
|
||||
/**
 * Validate one cover HTML file and exit with the documented status code.
 *
 * Flow: parse CLI → resolve page size and minimum gap → launch Chromium →
 * load the file → wait for fonts → run DECORATIVE_LINE_DETECTION in the page →
 * print the result.
 *
 * Exit codes: 0 = no overlaps, 1 = overlaps found (details on stderr),
 * 2 = script error (no input, file not found, no browser, launch failure).
 * Errors thrown while the page is being processed propagate to the caller
 * after the browser has been closed.
 *
 * @returns {Promise<void>} Does not resolve normally: the process is exited explicitly.
 */
async function main() {
  // Local require so this function brings its own dependency into scope.
  const { pathToFileURL } = require('url');

  const { input, width, height, minGap } = parseArgs(process.argv);

  if (!input) {
    console.error('✗ No input file specified. Usage: node cover_validate.js cover.html');
    process.exit(2);
  }

  const absIn = path.resolve(input);
  if (!fs.existsSync(absIn)) {
    console.error(`✗ File not found: ${absIn}`);
    process.exit(2);
  }

  // Fall back to A4 at 96dpi when a dimension is missing, unparsable or not positive.
  const parsedWidth = dimToPx(width);
  const parsedHeight = dimToPx(height);
  const widthPx = parsedWidth > 0 ? parsedWidth : 794;
  const heightPx = parsedHeight > 0 ? parsedHeight : 1123;
  // 1U = 5% of page width. A zero/negative/NaN min gap could never be
  // violated (the check is "gap < min"), so it is treated as "auto" too.
  const gap = minGap > 0 ? minGap : Math.round(widthPx * 0.05);

  console.log(`🔍 cover_validate — Cover overlap detection`);
  console.log(`   Input:   ${absIn}`);
  console.log(`   Page:    ${widthPx}×${heightPx}px`);
  console.log(`   Min gap: ${gap}px (1U)`);

  const { chromium } = playwright;
  const bInfo = resolveChromium(chromium);

  if (bInfo.status === 'missing') {
    console.error('✗ No Chromium found. Install via: npx playwright install chromium');
    process.exit(2);
  }

  let browser;
  try {
    const opts = { headless: true };
    // Only override the executable when the default one was not usable.
    if (bInfo.status === 'fallback') opts.executablePath = bInfo.executablePath;
    browser = await chromium.launch(opts);
  } catch (err) {
    console.error(`✗ Browser launch failed: ${err.message}`);
    process.exit(2);
  }

  // The exit code is recorded here and applied only AFTER the finally block:
  // calling process.exit() inside the try would terminate immediately and
  // skip browser.close().
  let exitCode = 0;
  try {
    const page = await browser.newPage({ viewport: { width: widthPx, height: heightPx } });
    // pathToFileURL percent-encodes characters such as '#', '%' and spaces
    // that a plain 'file://' + path concatenation would pass through raw.
    await page.goto(pathToFileURL(absIn).href, { waitUntil: 'networkidle' });
    console.log(`   ✓ HTML loaded`);

    // Wait for fonts. The number printed is document.fonts.size once
    // document.fonts.ready has resolved (0 if that evaluation fails).
    const fontsLoaded = await page.evaluate(() =>
      document.fonts.ready.then(() => document.fonts.size)
    ).catch(() => 0);
    console.log(`   ✓ Fonts: ${fontsLoaded} loaded`);

    // Run overlap detection inside the page
    const result = await page.evaluate(`(${DECORATIVE_LINE_DETECTION})(${gap})`);
    console.log(`   ✓ Found ${result.textElements} text elements, ${result.lineElements} decorative lines`);

    if (result.overlaps.length === 0) {
      console.log(`\n   ✅ No overlap issues found`);
    } else {
      exitCode = 1;
      console.error(`\n   ❌ Found ${result.overlaps.length} text-line overlap(s):\n`);

      for (const o of result.overlaps) {
        const isVertical = o.lineType === 'vertical';
        // The gap is measured across the line, i.e. perpendicular to it.
        const direction = isVertical ? 'horizontal' : 'vertical';
        // Print the coordinates on the axis the gap was measured on.
        const axis = isVertical ? 'x' : 'y';
        const span = (r) => (isVertical
          ? `${Math.round(r.x)}-${Math.round(r.x + r.width)}`
          : `${Math.round(r.y)}-${Math.round(r.y + r.height)}`);
        // Guard: the class may not be a plain string for every element type.
        const lineCls = typeof o.lineClass === 'string' && o.lineClass
          ? '.' + o.lineClass.split(' ')[0]
          : '';

        console.error(`   ERROR: ${direction} gap = ${o.gap}px (required ≥ ${o.required}px)`);
        console.error(`     Text: <${o.textTag}> "${o.text}" @ ${axis}=${span(o.textRect)}`);
        console.error(`     Line: <${o.lineTag}${lineCls}> [${o.lineType}] @ ${axis}=${span(o.lineRect)}`);
        console.error(`     Fix:  Move the decorative line at least ${Math.ceil(o.required - o.gap)}px away from the text.`);
        console.error('');
      }
    }
  } finally {
    // Best-effort cleanup: a failure to close must not mask the real outcome.
    try {
      await browser.close();
    } catch (_) {
      /* ignore close errors */
    }
  }

  process.exit(exitCode);
}
|
||||
|
||||
// Entry point: any rejection that escapes main() is a script error (exit code 2).
main().catch((err) => {
  // Rejections are not guaranteed to be Error instances.
  const reason = err && err.message ? err.message : String(err);
  console.error(`✗ Unexpected error: ${reason}`);
  process.exit(2);
});
|
||||
2816
skills/pdf/scripts/design_engine.py
Executable file
2816
skills/pdf/scripts/design_engine.py
Executable file
File diff suppressed because it is too large
Load Diff
754
skills/pdf/scripts/html2pdf-next.js
Executable file
754
skills/pdf/scripts/html2pdf-next.js
Executable file
@@ -0,0 +1,754 @@
|
||||
#!/usr/bin/env node
|
||||
/**
 * html2pdf-next.js — HTML → PDF converter using Playwright + pdf-lib
 *
 * Drop-in replacement for html2pdf.js, WITHOUT Paged.js dependency.
 * Uses Chromium native @page CSS for pagination + pdf-lib for post-processing.
 *
 * Usage:
 *   node html2pdf-next.js input.html
 *   node html2pdf-next.js input.html --output result.pdf
 *   node html2pdf-next.js input.html --css extra.css
 *   node html2pdf-next.js input.html --width 720px --height 960px
 *   node html2pdf-next.js input.html --direct              (same as default now — no Paged.js to skip)
 *   node html2pdf-next.js input.html --merge a.pdf b.pdf   (merge additional PDFs after)
 *
 * Architecture:
 *   1. Playwright renders HTML → raw PDF via Chromium's native print engine
 *   2. Pre-render hooks: Mermaid, KaTeX, oversized element fixes
 *   3. Post-render: pdf-lib for merge, metadata, page count extraction
 *   4. No Paged.js, no paged.polyfill.js — CSS @page handles pagination natively
 */
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const { execSync, spawnSync } = require('child_process');
|
||||
|
||||
// Promise-based delay: resolves (with no value) after `ms` milliseconds.
const sleep = ms => new Promise(r => setTimeout(r, ms));
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
// Playwright / Chromium resolution (self-contained, no external helper)
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Load the `playwright` package, searching beyond normal module resolution.
 *
 * Order: plain `require('playwright')`, then a `playwright/` directory under
 * each of: `PLAYWRIGHT_PATH`, every `NODE_PATH` entry, and the directory
 * printed by `npm root -g` (skipped if that command fails).
 *
 * @returns {object} The loaded playwright module.
 * @throws {Error} When no candidate location yields a loadable package.
 */
function loadPlaywright() {
  // Try direct require first
  try { return require('playwright'); } catch (_) { /* fall through to the search below */ }

  // Collect candidate roots (a Set removes duplicates, keeps insertion order)
  const roots = new Set();
  if (process.env.PLAYWRIGHT_PATH) roots.add(process.env.PLAYWRIGHT_PATH);
  if (process.env.NODE_PATH) {
    process.env.NODE_PATH.split(path.delimiter).filter(Boolean).forEach(p => roots.add(p));
  }
  try {
    const globalRoot = execSync('npm root -g', { stdio: ['ignore', 'pipe', 'ignore'] }).toString().trim();
    if (globalRoot) roots.add(globalRoot);
  } catch (_) { /* npm unavailable: just skip the global root */ }

  for (const base of roots) {
    const pkgDir = path.join(base, 'playwright');
    if (!fs.existsSync(path.join(pkgDir, 'package.json'))) continue;
    // Require the located directory itself. A bare-specifier lookup
    // ('playwright') from inside it would only search node_modules folders
    // and miss the package when `base` is not a node_modules directory.
    try { return require(pkgDir); } catch (_) { /* unusable install: try the next root */ }
  }
  throw new Error('Playwright not found. Install: npm install -g playwright');
}
|
||||
|
||||
/**
 * Load the `pdf-lib` package.
 *
 * Order: plain `require('pdf-lib')`, then a `pdf-lib/` directory under the
 * directory printed by `npm root -g`.
 *
 * @returns {object} The loaded pdf-lib module.
 * @throws {Error} When pdf-lib cannot be loaded from either location.
 */
function loadPdfLib() {
  try { return require('pdf-lib'); } catch (_) { /* fall through to the global install */ }

  try {
    const globalRoot = execSync('npm root -g', { stdio: ['ignore', 'pipe', 'ignore'] }).toString().trim();
    const pkgDir = path.join(globalRoot, 'pdf-lib');
    // Require the located directory directly rather than re-resolving the name.
    if (fs.existsSync(path.join(pkgDir, 'package.json'))) return require(pkgDir);
  } catch (_) { /* npm unavailable or package unusable: report "not found" below */ }

  throw new Error('pdf-lib not found. Install: npm install -g pdf-lib');
}
|
||||
|
||||
/**
 * Locate a Chromium executable, optionally installing one.
 *
 * Resolution order:
 *   1. The path reported by `chromiumObj.executablePath()`, if it exists on disk.
 *   2. `PLAYWRIGHT_CHROMIUM_PATH` (when set), then a fixed list of macOS and
 *      Linux install locations; the first existing path wins.
 *   3. When `allowInstall` is true: run `npx playwright install chromium` and
 *      re-check the reported path.
 *
 * @param {{ executablePath: function(): string }} chromiumObj
 *   Object exposing `executablePath()`; a throwing call is treated as "no path".
 * @param {boolean} [allowInstall=false]  Permit step 3.
 * @returns {{ status: ('ok'|'fallback'|'installed'|'missing'), executablePath: string }}
 */
function resolveChromium(chromiumObj, allowInstall = false) {
  let exe;
  try {
    exe = chromiumObj.executablePath();
  } catch (_) {
    exe = null;
  }

  if (exe && fs.existsSync(exe)) {
    return { status: 'ok', executablePath: exe };
  }

  // Try system Chrome/Chromium; an explicit override is checked first.
  const candidates = [
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
    '/usr/bin/chromium-browser',
    '/usr/bin/chromium',
    '/usr/bin/google-chrome',
  ];
  if (process.env.PLAYWRIGHT_CHROMIUM_PATH) candidates.unshift(process.env.PLAYWRIGHT_CHROMIUM_PATH);

  for (const candidate of candidates) {
    if (fs.existsSync(candidate)) return { status: 'fallback', executablePath: candidate };
  }

  if (allowInstall) {
    const install = spawnSync('npx', ['playwright', 'install', 'chromium'], { stdio: 'inherit', shell: true });
    if (install.status === 0) {
      // Keep the previous value of `exe` if the path still cannot be queried.
      try { exe = chromiumObj.executablePath(); } catch (_) { /* leave exe unchanged */ }
      if (exe && fs.existsSync(exe)) return { status: 'installed', executablePath: exe };
    }
  }

  return { status: 'missing', executablePath: exe || '' };
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
// CLI
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Parse `process.argv` for html2pdf-next.
 *
 * The first argument is the input HTML file. With no arguments, or when the
 * first argument is `-h` / `--help`, usage is printed and the process exits
 * with code 0. Unknown options are ignored.
 *
 * @returns {{
 *   inputFile: string, outputFile: string, customCSS: (string|null),
 *   width: (string|null), height: (string|null),
 *   mergeFiles: string[], title: (string|null)
 * }} `outputFile` defaults to the input path with its extension replaced by `.pdf`.
 */
function cli() {
  const tokens = process.argv.slice(2);
  if (!tokens.length || tokens[0] === '-h' || tokens[0] === '--help') {
    console.log(`
Usage: node html2pdf-next.js <input.html> [options]

Options:
  --output, -o <file>   Output PDF path (default: <input>.pdf)
  --css <file>          Inject extra stylesheet
  --width <px>          Custom page width (e.g. 720px)
  --height <px>         Custom page height (e.g. 960px)
  --direct              (no-op, kept for backward compat — always direct now)
  --merge <files...>    Append additional PDF files after conversion
  --title <text>        Set PDF document title metadata
  --help, -h            Show help
`);
    process.exit(0);
  }

  const inputFile = tokens[0];
  let outputFile = null;
  let customCSS = null;
  let width = null;
  let height = null;
  let title = null;
  const mergeFiles = [];

  for (let i = 1; i < tokens.length; i++) {
    const t = tokens[i];
    if (t === '--output' || t === '-o') outputFile = tokens[++i];
    else if (t === '--css') customCSS = tokens[++i];
    else if (t === '--width') width = tokens[++i];
    else if (t === '--height') height = tokens[++i];
    else if (t === '--direct') { /* no-op, always direct */ }
    else if (t === '--title') title = tokens[++i];
    else if (t === '--merge') {
      // Collect files up to the next option. Stop at ANY dash-prefixed token,
      // so a short flag such as `-o` ends the list instead of being merged.
      while (i + 1 < tokens.length && !tokens[i + 1].startsWith('-')) {
        mergeFiles.push(tokens[++i]);
      }
    }
  }

  if (!outputFile) {
    const parts = path.parse(inputFile);
    outputFile = path.join(parts.dir || '.', parts.name + '.pdf');
  }

  return { inputFile, outputFile, customCSS, width, height, mergeFiles, title };
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
// Helpers
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Format a byte count with one decimal and a binary-scaled unit.
 *
 * The value is divided by 1024 until it is below 1024 or the largest unit
 * (GB) is reached, so very large inputs are expressed in GB.
 *
 * @param {number} n  Size in bytes.
 * @returns {string} e.g. '0.0 B', '1.5 KB', '1024.0 GB'.
 */
function prettyBytes(n) {
  const units = ['B', 'KB', 'MB', 'GB'];
  let value = n;
  let unitIndex = 0;
  while (value >= 1024 && unitIndex < units.length - 1) {
    value /= 1024;
    unitIndex++;
  }
  return `${value.toFixed(1)} ${units[unitIndex]}`;
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
// Pre-render hooks (run in browser context before PDF export)
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
|
||||
async function preRenderHooks(page) {
|
||||
const warnings = [];
|
||||
|
||||
// 1. Wait for Mermaid diagrams
|
||||
const hasMermaid = await page.evaluate(() => document.querySelectorAll('.mermaid').length > 0);
|
||||
if (hasMermaid) {
|
||||
console.log(' ⏳ Waiting for Mermaid diagrams...');
|
||||
try {
|
||||
await page.waitForFunction(() => {
|
||||
for (const m of document.querySelectorAll('.mermaid'))
|
||||
if (!m.querySelector('svg') && !m.getAttribute('data-processed')) return false;
|
||||
return true;
|
||||
}, { timeout: 30000 });
|
||||
await sleep(2000);
|
||||
console.log(' ✓ Mermaid rendered');
|
||||
} catch (_) {
|
||||
warnings.push('Mermaid rendering timed out (30s)');
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Trigger KaTeX math rendering
|
||||
const katexStatus = await page.evaluate(() => ({
|
||||
lib: typeof renderMathInElement === 'function' || typeof katex !== 'undefined',
|
||||
rendered: document.querySelectorAll('.katex').length > 0,
|
||||
raw: /\$[^$]+\$|\$\$[^$]+\$\$|\\\(.*?\\\)|\\\[.*?\\\]/.test(document.body.innerText),
|
||||
}));
|
||||
|
||||
// Auto-inject KaTeX CDN if raw math detected but library not loaded
|
||||
if (!katexStatus.lib && katexStatus.raw && !katexStatus.rendered) {
|
||||
console.log(' ⏳ Auto-injecting KaTeX CDN (math formulas detected but KaTeX not loaded)...');
|
||||
await page.addStyleTag({ url: 'https://cdn.jsdelivr.net/npm/katex@0.16.22/dist/katex.min.css' });
|
||||
await page.addScriptTag({ url: 'https://cdn.jsdelivr.net/npm/katex@0.16.22/dist/katex.min.js' });
|
||||
await page.addScriptTag({ url: 'https://cdn.jsdelivr.net/npm/katex@0.16.22/dist/contrib/auto-render.min.js' });
|
||||
await sleep(2000); // Wait for CDN scripts to load
|
||||
// Re-check
|
||||
const recheckLib = await page.evaluate(() => typeof renderMathInElement === 'function');
|
||||
if (recheckLib) {
|
||||
console.log(' ✓ KaTeX CDN loaded successfully');
|
||||
} else {
|
||||
console.log(' ⚠ KaTeX CDN failed to load — math will render as raw text');
|
||||
warnings.push('KaTeX CDN injection failed; math formulas may appear as raw LaTeX code');
|
||||
}
|
||||
}
|
||||
|
||||
// Re-evaluate after potential CDN injection
|
||||
const katexReady = await page.evaluate(() => ({
|
||||
lib: typeof renderMathInElement === 'function' || typeof katex !== 'undefined',
|
||||
rendered: document.querySelectorAll('.katex').length > 0,
|
||||
raw: /\$[^$]+\$|\$\$[^$]+\$\$|\\\(.*?\\\)|\\\[.*?\\\]/.test(document.body.innerText),
|
||||
}));
|
||||
|
||||
if (katexReady.lib && !katexReady.rendered && katexReady.raw) {
|
||||
console.log(' ⏳ Triggering KaTeX rendering...');
|
||||
await page.evaluate(() => {
|
||||
if (typeof renderMathInElement === 'function')
|
||||
renderMathInElement(document.body, {
|
||||
delimiters: [
|
||||
{ left: '$$', right: '$$', display: true },
|
||||
{ left: '$', right: '$', display: false },
|
||||
{ left: '\\(', right: '\\)', display: false },
|
||||
{ left: '\\[', right: '\\]', display: true },
|
||||
],
|
||||
throwOnError: false,
|
||||
});
|
||||
});
|
||||
await sleep(1000);
|
||||
console.log(' ✓ KaTeX rendered');
|
||||
} else if (katexReady.rendered) {
|
||||
await sleep(500); // Font loading settle
|
||||
}
|
||||
|
||||
// 3. Fix oversized elements that prevent page breaks
|
||||
const nFixed = await page.evaluate(() => {
|
||||
const LIMIT = 1000;
|
||||
let n = 0;
|
||||
document.querySelectorAll(
|
||||
'[style*="page-break-inside: avoid"],[style*="break-inside: avoid"],' +
|
||||
'.avoid-break,table,figure,.theorem,.algorithm'
|
||||
).forEach(el => {
|
||||
if (el.getBoundingClientRect().height > LIMIT) {
|
||||
el.style.pageBreakInside = 'auto';
|
||||
el.style.breakInside = 'auto';
|
||||
n++;
|
||||
}
|
||||
});
|
||||
return n;
|
||||
});
|
||||
if (nFixed) {
|
||||
console.log(` ⚠ Fixed ${nFixed} oversized elements (removed break-inside: avoid)`);
|
||||
}
|
||||
|
||||
// 4. Detect overflow (horizontal AND vertical)
|
||||
const overflows = await page.evaluate(() => {
|
||||
const out = [];
|
||||
document.querySelectorAll('pre,table,figure,img,svg,.mermaid,blockquote,.equation').forEach(el => {
|
||||
const hDiff = el.scrollWidth - el.clientWidth;
|
||||
const vDiff = el.scrollHeight - el.clientHeight;
|
||||
if (hDiff > 2 || vDiff > 2) out.push({
|
||||
tag: el.tagName.toLowerCase(),
|
||||
cls: el.className || '',
|
||||
hOverflow: hDiff > 2 ? hDiff : 0,
|
||||
vOverflow: vDiff > 2 ? vDiff : 0,
|
||||
preview: (el.textContent || '').slice(0, 50).replace(/\s+/g, ' '),
|
||||
});
|
||||
});
|
||||
return out;
|
||||
});
|
||||
if (overflows.length) {
|
||||
console.log(' ⚠ Overflow detected:');
|
||||
overflows.forEach(o => {
|
||||
const parts = [];
|
||||
if (o.hOverflow) parts.push(`H +${o.hOverflow}px`);
|
||||
if (o.vOverflow) parts.push(`V +${o.vOverflow}px`);
|
||||
console.log(` <${o.tag}${o.cls ? '.' + o.cls.split(' ')[0] : ''}> ${parts.join(', ')}`);
|
||||
});
|
||||
warnings.push(`${overflows.length} element(s) have overflow`);
|
||||
}
|
||||
|
||||
// 4b. Fix vertical overflow on page-level containers
|
||||
// When html/body or the main content canvas has a fixed height + overflow:hidden,
|
||||
// content gets clipped. For documents (html2pdf-next.js), we DON'T expand the
|
||||
// container to its scrollHeight — that creates an oversized single "page" that
|
||||
// Playwright splits unevenly. Instead, we remove the fixed height and overflow:hidden
|
||||
// so content flows naturally and @page CSS handles pagination.
|
||||
//
|
||||
// (The old "expand to scrollHeight" logic belongs in html2poster.js where a single
|
||||
// continuous canvas is the desired output.)
|
||||
const vOverflowFix = await page.evaluate(() => {
|
||||
const fixes = [];
|
||||
// Candidates: html, body, and any direct child of body that acts as a full-page canvas
|
||||
const candidates = [document.documentElement, document.body];
|
||||
const bodyChildren = document.body.children;
|
||||
for (let i = 0; i < bodyChildren.length; i++) {
|
||||
const child = bodyChildren[i];
|
||||
// Skip SVG defs, script, style elements
|
||||
const tag = child.tagName.toLowerCase();
|
||||
if (tag === 'svg' || tag === 'script' || tag === 'style' || tag === 'link') continue;
|
||||
candidates.push(child);
|
||||
// Also check one level deeper (e.g., .canvas > .content)
|
||||
for (let j = 0; j < child.children.length; j++) {
|
||||
const grandchild = child.children[j];
|
||||
const gtag = grandchild.tagName.toLowerCase();
|
||||
if (gtag === 'svg' || gtag === 'script' || gtag === 'style') continue;
|
||||
candidates.push(grandchild);
|
||||
}
|
||||
}
|
||||
|
||||
for (const el of candidates) {
|
||||
const computed = getComputedStyle(el);
|
||||
const overflow = computed.overflow || computed.overflowY;
|
||||
const hasHiddenOverflow = overflow === 'hidden' || overflow === 'clip';
|
||||
const diff = el.scrollHeight - el.clientHeight;
|
||||
|
||||
if (hasHiddenOverflow && diff > 5) {
|
||||
// This element is clipping content vertically
|
||||
const tag = el.tagName.toLowerCase();
|
||||
const id = el.id ? `#${el.id}` : '';
|
||||
const cls = el.className ? `.${String(el.className).split(' ')[0]}` : '';
|
||||
const selector = `${tag}${id}${cls}`;
|
||||
|
||||
const oldHeight = el.clientHeight;
|
||||
|
||||
// Document mode: remove fixed height + overflow:hidden,
|
||||
// let @page handle natural pagination
|
||||
el.style.height = 'auto';
|
||||
el.style.minHeight = 'auto';
|
||||
el.style.maxHeight = 'none';
|
||||
el.style.overflow = 'visible';
|
||||
el.style.overflowY = 'visible';
|
||||
|
||||
fixes.push({
|
||||
selector,
|
||||
oldHeight,
|
||||
clipped: diff,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// After fixing containers, re-measure to get the final content height
|
||||
const finalHeight = Math.max(
|
||||
document.documentElement.scrollHeight,
|
||||
document.body.scrollHeight
|
||||
);
|
||||
|
||||
return { fixes, finalHeight };
|
||||
});
|
||||
|
||||
if (vOverflowFix.fixes.length) {
|
||||
console.log(' ⚠️ Removed fixed height + overflow:hidden — content will paginate naturally:');
|
||||
vOverflowFix.fixes.forEach(f => {
|
||||
console.log(` ${f.selector}: was ${f.oldHeight}px with ${f.clipped}px clipped → now auto (content will flow to next page)`);
|
||||
});
|
||||
}
|
||||
|
||||
// 4c. Convert absolute-bottom elements to document flow
|
||||
// Elements with `position: absolute; bottom: Npx` inside page containers
|
||||
// are pinned relative to their containing block. When content paginates
|
||||
// across multiple @page pages, these elements either overlap with body
|
||||
// text or land on the wrong page. Fix: convert them to static positioning
|
||||
// so they participate in normal document flow and paginate naturally.
|
||||
const absBottomFix = await page.evaluate(() => {
|
||||
const converted = [];
|
||||
// Scan inside page-level containers (body children and their children)
|
||||
const containers = [];
|
||||
for (let i = 0; i < document.body.children.length; i++) {
|
||||
const child = document.body.children[i];
|
||||
const tag = child.tagName.toLowerCase();
|
||||
if (tag === 'svg' || tag === 'script' || tag === 'style' || tag === 'link') continue;
|
||||
containers.push(child);
|
||||
}
|
||||
|
||||
for (const container of containers) {
|
||||
const descendants = container.querySelectorAll('*');
|
||||
for (const el of descendants) {
|
||||
const computed = getComputedStyle(el);
|
||||
if (computed.position === 'absolute' && computed.bottom !== 'auto' && computed.bottom !== '') {
|
||||
// Check if this element contains visible text (not just decorative)
|
||||
const hasText = el.textContent && el.textContent.trim().length > 0;
|
||||
if (!hasText) continue;
|
||||
|
||||
const tag = el.tagName.toLowerCase();
|
||||
const id = el.id ? `#${el.id}` : '';
|
||||
const cls = el.className ? `.${String(el.className).split(' ')[0]}` : '';
|
||||
const selector = `${tag}${id}${cls}`;
|
||||
|
||||
// Convert to static flow: remove absolute positioning
|
||||
el.style.position = 'static';
|
||||
el.style.bottom = 'auto';
|
||||
el.style.left = 'auto';
|
||||
el.style.right = 'auto';
|
||||
// Preserve horizontal padding/margin from the original left/right values
|
||||
// by keeping any existing padding or margin on the element
|
||||
|
||||
converted.push({ selector, bottom: computed.bottom });
|
||||
}
|
||||
}
|
||||
}
|
||||
return converted;
|
||||
});
|
||||
|
||||
if (absBottomFix.length) {
|
||||
console.log(' ⚠️ Converted absolute-bottom elements to document flow (prevents overlap on multi-page):');
|
||||
absBottomFix.forEach(f => {
|
||||
console.log(` ${f.selector}: was position:absolute;bottom:${f.bottom} → now static (flows with content)`);
|
||||
});
|
||||
}
|
||||
|
||||
// 5. Inject minimal @page CSS fallback
|
||||
await page.evaluate(() => {
|
||||
const styles = Array.from(document.querySelectorAll('style'));
|
||||
const hasPageRule = styles.some(s => (s.textContent || '').includes('@page'));
|
||||
if (!hasPageRule) {
|
||||
const s = document.createElement('style');
|
||||
s.textContent = `@page { margin: 20mm; }`;
|
||||
document.head.appendChild(s);
|
||||
}
|
||||
});
|
||||
|
||||
// 6. Fix full-page cover sections for print
|
||||
// In screen mode, height:100vh = viewport height. In print mode, 100vh ≠ page height.
|
||||
// Detect elements using 100vh and convert to print-safe page-filling behavior.
|
||||
const coverFixed = await page.evaluate(() => {
|
||||
let fixed = 0;
|
||||
// Find elements with height: 100vh (inline or computed)
|
||||
const allEls = document.querySelectorAll('*');
|
||||
for (const el of allEls) {
|
||||
const style = el.style;
|
||||
const computed = getComputedStyle(el);
|
||||
const isVh = style.height === '100vh' || computed.height === '100vh' ||
|
||||
style.minHeight === '100vh' || computed.minHeight === '100vh';
|
||||
// Also detect via class name hints
|
||||
const isCover = el.classList.contains('cover') || el.classList.contains('cover-page') ||
|
||||
el.id === 'cover' || el.getAttribute('data-role') === 'cover';
|
||||
if (isVh || (isCover && el.offsetHeight > 0)) {
|
||||
// Force the element to fill the print page
|
||||
el.style.height = '100vh';
|
||||
el.style.minHeight = '100vh';
|
||||
el.style.pageBreakAfter = 'always';
|
||||
el.style.pageBreakInside = 'avoid';
|
||||
el.style.boxSizing = 'border-box';
|
||||
el.style.overflow = 'hidden';
|
||||
fixed++;
|
||||
}
|
||||
}
|
||||
// Inject print-specific CSS to make 100vh work correctly
|
||||
if (fixed > 0) {
|
||||
const s = document.createElement('style');
|
||||
s.textContent = `
|
||||
@media print {
|
||||
.cover, .cover-page, [data-role="cover"] {
|
||||
height: 100vh !important;
|
||||
min-height: 100vh !important;
|
||||
page-break-after: always !important;
|
||||
page-break-inside: avoid !important;
|
||||
overflow: hidden !important;
|
||||
}
|
||||
}
|
||||
`;
|
||||
document.head.appendChild(s);
|
||||
}
|
||||
return fixed;
|
||||
});
|
||||
if (coverFixed) {
|
||||
console.log(` ✓ Fixed ${coverFixed} full-page cover section(s) for print`);
|
||||
// Also inject named @page rule for cover with zero margins
|
||||
await page.evaluate(() => {
|
||||
const s = document.createElement('style');
|
||||
s.textContent = `
|
||||
@page cover-page {
|
||||
margin: 0 !important;
|
||||
}
|
||||
@media print {
|
||||
.cover, .cover-page, [data-role="cover"] {
|
||||
page: cover-page;
|
||||
margin: 0 !important;
|
||||
padding: 40px !important;
|
||||
}
|
||||
}
|
||||
`;
|
||||
document.head.appendChild(s);
|
||||
});
|
||||
}
|
||||
|
||||
return { warnings, contentHeight: vOverflowFix.finalHeight };
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
// Content statistics (post-render, from PDF or page)
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Collect rough content statistics from the rendered page.
 *
 * Runs inside the browser context via `page.evaluate`, so the callback must be
 * self-contained (no references to Node-side variables).
 *
 * @param {object} page - Playwright page (anything exposing `evaluate(fn)`).
 * @returns {Promise<{wordCount: number, figures: number, tables: number}>}
 *   `wordCount` is the number of CJK ideographs (U+4E00–U+9FA5) plus the number
 *   of runs of ASCII letters in `document.body.innerText`; `figures` counts
 *   outermost figure-like elements; `tables` counts `<table>` elements.
 */
async function collectStats(page) {
  return page.evaluate(() => {
    const text = document.body.innerText || '';
    const zhChars = (text.match(/[\u4e00-\u9fa5]/g) || []).length;
    const enWords = (text.match(/[a-zA-Z]+/g) || []).length;

    // Count only the outermost figure-like elements. Previously an <img>
    // wrapped in a <figure>/.figure matched the selector twice (once as the
    // wrapper, once as the image) and inflated the figure count.
    const wrapperSelector = 'figure,.figure';
    const figures = Array.from(document.querySelectorAll('figure,.figure,img')).filter((el) => {
      const parent = el.parentElement;
      return !(parent && parent.closest(wrapperSelector));
    }).length;

    return {
      wordCount: zhChars + enWords,
      figures,
      tables: document.querySelectorAll('table').length,
    };
  });
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
// pdf-lib post-processing: page count, metadata, merge
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Post-process a rendered PDF in place with pdf-lib: stamp metadata and
 * optionally append the pages of additional PDF files.
 *
 * @param {string} pdfPath - PDF to read, modify, and overwrite.
 * @param {object} [options]
 * @param {string} [options.title] - Document title; only set when truthy.
 * @param {string[]} [options.mergeFiles] - PDFs whose pages are appended in
 *   order. Paths that do not exist are reported and skipped.
 * @returns {Promise<{pageCount: number, originalPages: number}>} Page count
 *   after merging and the page count before any merge.
 */
async function postProcess(pdfPath, options = {}) {
  const { PDFDocument } = loadPdfLib();
  const target = await PDFDocument.load(fs.readFileSync(pdfPath));

  // Metadata
  if (options.title) {
    target.setTitle(options.title);
  }
  target.setProducer('html2pdf-next (Playwright + pdf-lib)');
  target.setCreationDate(new Date());

  // Remember the size before appending so the caller can report the delta.
  const originalPages = target.getPageCount();

  // Append donor documents, one after another.
  const donors = options.mergeFiles && options.mergeFiles.length ? options.mergeFiles : [];
  for (const donorPath of donors) {
    if (fs.existsSync(donorPath)) {
      console.log(` 📎 Merging: ${path.basename(donorPath)}`);
      const donor = await PDFDocument.load(fs.readFileSync(donorPath));
      const imported = await target.copyPages(donor, donor.getPageIndices());
      for (const importedPage of imported) {
        target.addPage(importedPage);
      }
    } else {
      console.log(` ⚠ Merge file not found: ${donorPath}`);
    }
  }

  // Overwrite the original file with the processed document.
  fs.writeFileSync(pdfPath, await target.save());

  return { pageCount: target.getPageCount(), originalPages };
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
// Main pipeline
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Main pipeline: render an HTML file to PDF with Playwright/Chromium, then
 * post-process the result with pdf-lib and print a report.
 *
 * Steps: validate input → resolve and launch Chromium → optionally inject a
 * custom stylesheet (via a temporary sibling HTML file so relative asset URLs
 * keep resolving) → run pre-render hooks → render (continuous-canvas mode or
 * standard document mode) → collect stats → post-process → report.
 *
 * Failure handling: every failure path prints a message and terminates the
 * process (exit code 1, or 2 when Chromium is missing). The temporary HTML
 * file and the browser are always cleaned up before exiting.
 *
 * @param {string} inputFile - Path to the source HTML file.
 * @param {string} outputFile - Path of the PDF to write.
 * @param {string} [customCSS] - Optional path to a CSS file injected into the page.
 * @param {object} [options]
 * @param {string} [options.width] - Explicit page width passed to `page.pdf`.
 * @param {string} [options.height] - Explicit page height passed to `page.pdf`.
 * @param {string[]} [options.mergeFiles] - Extra PDFs appended after rendering.
 * @param {string} [options.title] - PDF title metadata.
 * @returns {Promise<void>}
 */
async function convert(inputFile, outputFile, customCSS, options = {}) {
  // Local require: `url` is a Node core module; pathToFileURL percent-encodes
  // characters such as '#', '%', '?' and spaces that break 'file://' + path.
  const { pathToFileURL } = require('url');
  const { width, height, mergeFiles, title } = options;

  if (!fs.existsSync(inputFile)) {
    console.error(`✗ File not found: ${inputFile}`);
    process.exit(1);
  }

  const playwright = loadPlaywright();
  const { chromium } = playwright;

  // Resolve browser
  const canInstall = process.env.PDF_SKIP_BROWSER_INSTALL !== '1';
  const bInfo = resolveChromium(chromium, canInstall);

  if (bInfo.status === 'missing') {
    console.error('\n✗ Chromium not found. Run: npx playwright install chromium\n');
    process.exit(2);
  }
  if (bInfo.status === 'fallback') {
    console.log(`⚠ Using fallback Chromium: ${bInfo.executablePath}`);
  }

  const absIn = path.resolve(inputFile);
  const absOut = path.resolve(outputFile);

  console.log(`\n🔄 Converting ${path.basename(inputFile)}...`);
  console.log(` Engine: Playwright + Chromium native @page (no Paged.js)`);

  // Path of the temporary HTML file, set only when this call created it.
  // Tracking it explicitly means we never delete a file we did not write.
  let tmpHtml = null;
  const removeTmpHtml = () => {
    if (tmpHtml === null) return;
    try {
      fs.unlinkSync(tmpHtml);
    } catch (cleanupErr) {
      // Cleanup is best effort; a leftover temp file must not fail the run.
      if (cleanupErr.code !== 'ENOENT') {
        console.log(` ⚠ Could not remove temp file ${tmpHtml}: ${cleanupErr.message}`);
      }
    }
    tmpHtml = null;
  };

  // Optionally inject CSS into a temporary copy of the input
  if (customCSS) {
    if (!fs.existsSync(customCSS)) {
      console.error(`✗ CSS file not found: ${customCSS}`);
      process.exit(1);
    }
    const tag = `<style>${fs.readFileSync(customCSS, 'utf-8')}</style>`;
    const html = fs.readFileSync(absIn, 'utf-8');
    // Function replacement: a plain string would have `$&`, `$1`, ... inside
    // the CSS interpreted as replacement patterns.
    const injected = html.includes('</head>')
      ? html.replace('</head>', () => tag + '\n</head>')
      : tag + '\n' + html;
    // Written next to the input so relative URLs in the document still resolve.
    const tmpPath = absIn + '.tmp.html';
    fs.writeFileSync(tmpPath, injected);
    tmpHtml = tmpPath;
  }

  // Launch browser
  let browser;
  try {
    const opts = { headless: true };
    if (bInfo.status === 'fallback') opts.executablePath = bInfo.executablePath;
    browser = await chromium.launch(opts);
  } catch (err) {
    const msg = err.message || '';
    if (msg.includes('shared libraries') || msg.includes('.so')) {
      console.error('\n✗ Missing system libraries. Run: npx playwright install-deps chromium\n');
    } else {
      console.error(`\n✗ Browser launch failed: ${msg}\n`);
    }
    removeTmpHtml();
    process.exit(1);
  }

  // process.exit() inside catch would skip `finally`, leaving the browser open
  // and the temp file behind, so the failure is recorded and acted on afterwards.
  let failed = false;
  try {
    const page = await browser.newPage();
    const loadFile = tmpHtml || absIn;
    await page.goto(pathToFileURL(loadFile).href, { waitUntil: 'networkidle' });

    // ── Pre-render hooks ──
    console.log('\n📋 Pre-render checks:');
    const preRenderResult = await preRenderHooks(page);
    const warnings = preRenderResult.warnings;
    const measuredContentHeight = preRenderResult.contentHeight;

    // ── Detect continuous-canvas mode (design_engine.py output) ──
    const continuousInfo = await page.evaluate(() => {
      const el = document.querySelector('.continuous-canvas');
      if (!el) return null;
      const root = getComputedStyle(document.documentElement);
      return {
        width: root.getPropertyValue('--canvas-w').trim() || '720px',
        height: root.getPropertyValue('--canvas-h').trim() || '960px',
        pages: el.querySelectorAll('.page-section').length,
      };
    });

    if (continuousInfo) {
      // Creative PDF: seamless multi-page canvas
      console.log(`\n🎨 Continuous canvas: ${continuousInfo.pages} pages @ ${continuousInfo.width} × ${continuousInfo.height}`);
      await page.pdf({
        path: absOut,
        printBackground: true,
        margin: { top: 0, right: 0, bottom: 0, left: 0 },
        width: continuousInfo.width,
        height: continuousInfo.height,
      });
    } else {
      // Standard document
      console.log('\n📄 Rendering PDF...');
      const pdfOpts = {
        path: absOut,
        printBackground: true,
        preferCSSPageSize: true,
        tagged: true,
      };

      if (width || height) {
        if (width) pdfOpts.width = width;
        if (height) pdfOpts.height = height;
        pdfOpts.margin = { top: 0, right: 0, bottom: 0, left: 0 };
        console.log(` Custom size: ${pdfOpts.width || 'auto'} × ${pdfOpts.height || 'auto'}`);
      } else {
        // No explicit size: check if @page CSS defines a fixed size
        const pageSize = await page.evaluate(() => {
          const styles = Array.from(document.querySelectorAll('style'));
          for (const s of styles) {
            const text = s.textContent || '';
            const match = text.match(/@page\s*\{[^}]*size:\s*([\d.]+)px\s+([\d.]+)px/);
            if (match) return { width: parseFloat(match[1]), height: parseFloat(match[2]) };
          }
          return null;
        });

        if (pageSize) {
          // @page defines a fixed size — use preferCSSPageSize (already set above).
          // Playwright will paginate content at @page height boundaries seamlessly.
          // This is correct for both posters (seamless multi-page) and documents.
          pdfOpts.margin = { top: 0, right: 0, bottom: 0, left: 0 };
          console.log(` @page size: ${pageSize.width}px × ${pageSize.height}px`);
          if (measuredContentHeight && measuredContentHeight > pageSize.height + 5) {
            const estPages = Math.ceil(measuredContentHeight / pageSize.height);
            console.log(` Content height: ${measuredContentHeight}px → ~${estPages} pages`);
          }
        } else {
          pdfOpts.format = 'A4';
        }
      }

      await page.pdf(pdfOpts);
    }

    // Collect content stats from the page
    const stats = await collectStats(page);

    // ── pdf-lib post-processing ──
    console.log('\n🔧 Post-processing (pdf-lib):');
    const postResult = await postProcess(absOut, { mergeFiles, title });

    // ── Report ──
    const sz = fs.statSync(absOut).size;
    console.log('\n' + '═'.repeat(40));
    console.log(' PDF Generated Successfully');
    console.log('═'.repeat(40));
    console.log(` File: ${path.basename(absOut)}`);
    console.log(` Pages: ${postResult.pageCount}`);
    console.log(` Size: ${prettyBytes(sz)}`);
    console.log(` Words: ~${stats.wordCount.toLocaleString()}`);
    console.log(` Assets: ${stats.figures} figures, ${stats.tables} tables`);
    console.log(` Engine: Playwright (no Paged.js)`);
    console.log(` Path: ${absOut}`);

    if (mergeFiles && mergeFiles.length && postResult.pageCount > postResult.originalPages) {
      console.log(` Merged: +${postResult.pageCount - postResult.originalPages} pages from ${mergeFiles.length} file(s)`);
    }

    if (warnings.length) {
      console.log('\n⚠ Warnings:');
      warnings.forEach(w => console.log(` · ${w}`));
    }

    // Anomaly detection
    if (postResult.pageCount > 1 && stats.wordCount > 0) {
      const avgWordsPerPage = stats.wordCount / postResult.pageCount;
      if (avgWordsPerPage < 30) {
        console.log(`\n⚠ Low content density: ~${Math.round(avgWordsPerPage)} words/page (expected 100+)`);
      }
    }
  } catch (err) {
    console.error('\n✗ Conversion failed:', err.message);
    failed = true;
  } finally {
    removeTmpHtml();
    await browser.close();
  }

  if (failed) process.exit(1);
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
// Entry
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
|
||||
// Script entry point: parse the command line, run the conversion, and turn any
// error that escapes into a one-line message plus exit code 1.
(async function entry() {
  const { inputFile, outputFile, customCSS, width, height, mergeFiles, title } = cli();
  const pdfOptions = { width, height, mergeFiles, title };
  await convert(inputFile, outputFile, customCSS, pdfOptions);
})().catch((failure) => {
  console.error('Error:', failure.message);
  process.exit(1);
});
|
||||
256
skills/pdf/scripts/html2poster.js
Executable file
256
skills/pdf/scripts/html2poster.js
Executable file
@@ -0,0 +1,256 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* html2poster.js — Single-page poster/long-image HTML → PDF converter
|
||||
*
|
||||
* Purpose: Convert a fixed-width, dynamic-height HTML poster into a single-page
|
||||
* vector PDF with zero margins. This script is PURPOSE-BUILT for posters and
|
||||
* infographics — it does NOT handle multi-page documents, A4 pagination, or
|
||||
* document-style margins. For those, use html2pdf-next.js.
|
||||
*
|
||||
* Usage:
|
||||
* node html2poster.js poster.html
|
||||
* node html2poster.js poster.html --output out.pdf
|
||||
* node html2poster.js poster.html --width 720px
|
||||
* node html2poster.js poster.html --width 720px --max-height 8000
|
||||
*
|
||||
* What it does (in order):
|
||||
* 1. Load HTML in Playwright
|
||||
* 2. Force overflow:hidden on .poster/.page containers (clip decorative overflow)
|
||||
* 3. Inject @page { margin: 0 } (override any existing margin)
|
||||
* 4. Ensure html/body have margin:0, padding:0, matching background
|
||||
* 5. Measure .poster scrollHeight (actual content height)
|
||||
* 6. Generate single-page PDF with exact dimensions
|
||||
*
|
||||
* What it does NOT do:
|
||||
* - No pagination / page breaks
|
||||
* - No A4 fallback
|
||||
* - No margin injection (always zero)
|
||||
* - No cover adaptation
|
||||
* - No pdf-lib post-processing
|
||||
* - No continuous-canvas detection
|
||||
* - No vertical overflow expansion (posters WANT overflow:hidden)
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const { spawnSync } = require('child_process');
|
||||
|
||||
// ── Chromium resolution (shared logic with html2pdf-next.js) ──
|
||||
|
||||
/**
 * Locate a usable Chromium/Chrome executable.
 *
 * Lookup order: the path reported by Playwright's browser type, then the
 * PLAYWRIGHT_CHROMIUM_PATH environment variable (if set), then a fixed list of
 * well-known macOS/Linux install locations.
 *
 * @param {{executablePath: function(): string}} chromiumObj - Playwright's
 *   `chromium` browser type (or anything exposing `executablePath()`).
 * @returns {{status: 'ok'|'fallback'|'missing', executablePath: string}}
 *   'ok' when Playwright's own binary exists, 'fallback' when another binary
 *   was found, 'missing' (with Playwright's reported path or '') otherwise.
 */
function resolveChromium(chromiumObj) {
  let bundled;
  try {
    bundled = chromiumObj.executablePath();
  } catch (_) {
    // executablePath() can throw when no browser is registered; treat as absent.
    bundled = null;
  }

  if (bundled && fs.existsSync(bundled)) {
    return { status: 'ok', executablePath: bundled };
  }

  const envOverride = process.env.PLAYWRIGHT_CHROMIUM_PATH;
  const searchOrder = [
    ...(envOverride ? [envOverride] : []),
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
    '/usr/bin/chromium-browser',
    '/usr/bin/chromium',
    '/usr/bin/google-chrome',
  ];

  const located = searchOrder.find((candidate) => fs.existsSync(candidate));
  if (located) {
    return { status: 'fallback', executablePath: located };
  }
  return { status: 'missing', executablePath: bundled || '' };
}
|
||||
|
||||
// ── CLI parsing ──
|
||||
|
||||
/**
 * Parse the html2poster command line.
 *
 * Recognised options: `--output`/`-o <path>`, `--width <length>`,
 * `--max-height <px>`, `--help`/`-h`. The first positional token is the input
 * file, the second (if no `--output` was given) the output file; further
 * positionals are ignored.
 *
 * Invalid usage (no input, an option without its value, a non-positive or
 * non-numeric `--max-height`, or an unknown `-`/`--` option) prints an error
 * and exits with code 1. `--help` prints usage and exits with code 0.
 *
 * @param {string[]} argv - Full argument vector (`process.argv` layout: the
 *   first two entries are skipped).
 * @returns {{input: string, output: string, width: string, maxHeight: number}}
 */
function parseArgs(argv) {
  const tokens = argv.slice(2);
  let input = null;
  let output = null;
  let width = '720px';
  let maxHeight = 16000;

  const fail = (message) => {
    console.error(`Error: ${message}`);
    process.exit(1);
  };

  for (let i = 0; i < tokens.length; i++) {
    const t = tokens[i];

    if (t === '--help' || t === '-h') {
      console.log(`
Usage: node html2poster.js <input.html> [options]

Options:
--output, -o Output PDF path (default: input with .pdf extension)
--width Poster width (default: 720px)
--max-height Maximum allowed height in px (default: 16000, safety limit)
-h, --help Show this help
`);
      process.exit(0);
    } else if (t === '--output' || t === '-o' || t === '--width' || t === '--max-height') {
      const value = tokens[++i];
      // Previously a trailing option silently produced `undefined`.
      if (value === undefined) fail(`Option ${t} requires a value.`);

      if (t === '--width') {
        width = value;
      } else if (t === '--max-height') {
        maxHeight = Number.parseInt(value, 10);
        // NaN would make every `height > maxHeight` comparison false and
        // silently disable the safety limit.
        if (!Number.isFinite(maxHeight) || maxHeight <= 0) {
          fail(`Invalid --max-height value: ${value}`);
        }
      } else {
        output = value;
      }
    } else if (t.length > 1 && t.startsWith('-')) {
      // Previously an unknown flag was taken as the input/output path.
      fail(`Unknown option: ${t}`);
    } else if (!input) {
      input = t;
    } else if (!output) {
      output = t;
    }
  }

  if (!input) {
    console.error('Error: No input HTML file specified.');
    process.exit(1);
  }

  if (!output) {
    output = input.replace(/\.html?$/i, '.pdf');
    // Input without an .htm/.html extension: append instead of replacing.
    if (output === input) output = input + '.pdf';
  }

  return { input, output, width, maxHeight };
}
|
||||
|
||||
// ── Main ──
|
||||
|
||||
/**
 * html2poster entry point: render one HTML poster to a single-page PDF whose
 * page size is the requested width by the measured content height.
 *
 * Steps: parse CLI → load Playwright and resolve Chromium → open the file with
 * a viewport as wide as the poster → clip overflowing decoration → zero page
 * and body margins → measure the poster container → write the PDF.
 *
 * Exits the process with code 1 when the input file, Playwright, or Chromium
 * is missing. Other errors reject the returned promise.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // Local require: `url` is a Node core module; pathToFileURL percent-encodes
  // characters ('#', '%', '?', spaces) that break 'file://' + path.
  const { pathToFileURL } = require('url');

  const { input, output, width, maxHeight } = parseArgs(process.argv);
  const absIn = path.resolve(input);
  const absOut = path.resolve(output);

  if (!fs.existsSync(absIn)) {
    console.error(`Error: File not found: ${absIn}`);
    process.exit(1);
  }

  console.log(`\n🖼 html2poster — Single-page poster PDF generator`);
  console.log(` Input: ${absIn}`);
  console.log(` Output: ${absOut}`);
  console.log(` Width: ${width}`);

  // Load Playwright
  let playwright;
  try {
    playwright = require('playwright');
  } catch {
    try {
      playwright = require('playwright-core');
    } catch {
      console.error('Error: playwright or playwright-core not installed.');
      process.exit(1);
    }
  }

  const { chromium } = playwright;
  const bInfo = resolveChromium(chromium);

  if (bInfo.status === 'missing') {
    console.error('Error: No Chromium found. Run: npx playwright install chromium');
    process.exit(1);
  }
  if (bInfo.status === 'fallback') {
    console.log(` ⚠ Using fallback Chromium: ${bInfo.executablePath}`);
  }

  // Convert a page.pdf()-style length to whole CSS pixels for the viewport.
  // Factors are the ones Playwright documents for page.pdf (96px per inch).
  // Previously parseInt() dropped the unit, so "210mm" became a 210px viewport
  // and the height was measured against the wrong layout width.
  const pxPerUnit = { px: 1, in: 96, cm: 37.8, mm: 3.78 };
  const toViewportPx = (length, fallback) => {
    const parts = /^\s*(\d*\.?\d+)\s*(px|in|cm|mm)\s*$/i.exec(String(length));
    if (parts) {
      const px = Math.round(Number.parseFloat(parts[1]) * pxPerUnit[parts[2].toLowerCase()]);
      if (px > 0) return px;
    }
    // Unitless or unrecognised input: keep the original lenient behavior.
    return Number.parseInt(length, 10) || fallback;
  };

  // Launch browser
  const launchOpts = { headless: true };
  if (bInfo.status === 'fallback') launchOpts.executablePath = bInfo.executablePath;

  const browser = await chromium.launch(launchOpts);

  try {
    // Match the viewport width to the poster width so the screen layout that
    // gets measured wraps the same way as the generated page.
    const widthPx = toViewportPx(width, 720);
    const page = await browser.newPage({ viewport: { width: widthPx, height: 1200 } });

    await page.goto(pathToFileURL(absIn).href, { waitUntil: 'networkidle' });
    console.log(`\n ✓ HTML loaded`);

    // ── Step 1: Force overflow:hidden on page containers ──
    // Decorative elements with negative offsets or width>100% inflate scrollWidth,
    // causing Playwright to shrink content to fit. overflow:hidden clips them.
    const overflowFixed = await page.evaluate(() => {
      const selectors = ['.poster', '.page', '#poster', '#page'];
      let fixed = 0;
      for (const sel of selectors) {
        const el = document.querySelector(sel);
        if (!el) continue;
        const computed = getComputedStyle(el);
        if (computed.overflow !== 'hidden') {
          el.style.overflow = 'hidden';
          fixed++;
        }
      }
      return fixed;
    });
    if (overflowFixed > 0) {
      console.log(` ✓ Added overflow:hidden to ${overflowFixed} container(s)`);
    }

    // ── Step 2: Inject @page { margin: 0 } — override any existing @page rule ──
    await page.evaluate(() => {
      const s = document.createElement('style');
      // Appended last in <head> so it wins the cascade over earlier @page rules.
      s.textContent = `@page { margin: 0 !important; size: auto; }`;
      document.head.appendChild(s);
    });

    // ── Step 3: Ensure html/body have zero margin/padding ──
    const bgSync = await page.evaluate(() => {
      const html = document.documentElement;
      const body = document.body;
      html.style.margin = '0';
      html.style.padding = '0';
      body.style.margin = '0';
      body.style.padding = '0';

      // Sync body background with poster background to avoid color gaps
      const poster = document.querySelector('.poster') || document.querySelector('.page');
      if (poster) {
        const posterBg = getComputedStyle(poster).backgroundColor;
        if (posterBg && posterBg !== 'rgba(0, 0, 0, 0)' && posterBg !== 'transparent') {
          body.style.backgroundColor = posterBg;
          html.style.backgroundColor = posterBg;
          return posterBg;
        }
      }
      return null;
    });
    if (bgSync) {
      console.log(` ✓ Synced body background: ${bgSync}`);
    }

    // ── Step 4: Measure actual content height ──
    const measurement = await page.evaluate(() => {
      const poster = document.querySelector('.poster') || document.querySelector('.page') || document.body;
      return {
        scrollHeight: poster.scrollHeight,
        scrollWidth: poster.scrollWidth,
        offsetWidth: poster.offsetWidth,
        selector: poster.className ? '.' + poster.className.split(' ')[0] : poster.tagName,
      };
    });

    console.log(` ✓ Measured: ${measurement.selector} = ${measurement.scrollWidth}×${measurement.scrollHeight}px`);

    if (measurement.scrollWidth > widthPx + 2) {
      console.log(` ⚠ WARNING: scrollWidth (${measurement.scrollWidth}px) > width (${widthPx}px)`);
      console.log(` Decorative elements may still overflow. Check for position:absolute elements with negative offsets.`);
    }

    let contentHeight = measurement.scrollHeight;
    if (contentHeight > maxHeight) {
      console.log(` ⚠ Content height ${contentHeight}px exceeds max ${maxHeight}px, clamping.`);
      contentHeight = maxHeight;
    }
    if (contentHeight < 100) {
      console.log(` ⚠ Content height ${contentHeight}px seems too small, using 960px fallback.`);
      contentHeight = 960;
    }

    // ── Step 5: Generate PDF ──
    console.log(`\n 📄 Generating PDF: ${width} × ${contentHeight}px`);
    await page.pdf({
      path: absOut,
      width: width,
      height: contentHeight + 'px',
      printBackground: true,
      margin: { top: '0', right: '0', bottom: '0', left: '0' },
    });

    console.log(`\n ✅ Done: ${absOut}`);
    console.log(` Size: ${(fs.statSync(absOut).size / 1024).toFixed(1)} KB`);
  } finally {
    // Always release the browser, even when rendering throws.
    await browser.close();
  }
}
|
||||
|
||||
// Run the poster pipeline; any rejection is reported as fatal and the process
// exits with code 1.
main().then(undefined, (fatal) => {
  console.error(`\n✗ Fatal: ${fatal.message}`);
  process.exit(1);
});
|
||||
2959
skills/pdf/scripts/pdf.py
Executable file
2959
skills/pdf/scripts/pdf.py
Executable file
File diff suppressed because it is too large
Load Diff
901
skills/pdf/scripts/pdf_qa.py
Executable file
901
skills/pdf/scripts/pdf_qa.py
Executable file
@@ -0,0 +1,901 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
PDF Quality Assurance Checker
|
||||
=============================
|
||||
Automatically detects common typesetting issues in PDFs.
|
||||
|
||||
Usage: python3 pdf_qa.py <pdf_path>
|
||||
|
||||
Checks:
|
||||
1. Page size consistency across all pages
|
||||
2. Blank page detection
|
||||
3. CJK punctuation placement (line-start/end forbidden punctuation)
|
||||
4. Color analysis (informational only — counts and lists colors)
|
||||
5. Font embedding check (warns on non-embedded fonts)
|
||||
6. PDF metadata check (title/author/creator)
|
||||
7. Content overflow detection (text exceeding page boundaries)
|
||||
8. Content fill ratio per page (multi-page docs, warns if < 40%)
|
||||
9. Cover/poster full-bleed check (background extends to page edges)
|
||||
10. Margin symmetry check (left/right text margins)
|
||||
11. Table centering check (if detected)
|
||||
12. Formula overflow check (optional)
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
from collections import Counter
|
||||
|
||||
try:
|
||||
import pymupdf # PyMuPDF
|
||||
except ImportError:
|
||||
import fitz as pymupdf
|
||||
|
||||
# ============================================================
|
||||
# Config
|
||||
# ============================================================
|
||||
|
||||
# CJK punctuation forbidden at line start: closing marks and marks that must
# stay attached to the preceding text. Opening quotes are NOT listed here — a
# line may legitimately begin with one.
LINE_START_FORBIDDEN = set(
    "。、,;:!?)】》〛〉」』"
    "\u201d\u2019"  # ” ’ closing curly quotes
    "\u2026"        # … ellipsis
    "\u2014"        # — em dash
    "\uff5e"        # ~ fullwidth tilde
    "\u00b7"        # · middle dot
)

# CJK punctuation forbidden at line end: opening marks, which must stay
# attached to the text that follows them. Closing quotes are NOT listed here —
# a line may legitimately end with one.
LINE_END_FORBIDDEN = set(
    "(【《〚〈「『"
    "\u2018\u201c"  # ‘ “ opening curly quotes
)
|
||||
|
||||
# Minimum fill ratio for last page (DISABLED — caused false positives)
|
||||
# LAST_PAGE_MIN_FILL = 0.40
|
||||
|
||||
# Maximum allowed color count — REMOVED (color count is now info-only)
|
||||
# MAX_COLORS = 8
|
||||
|
||||
# ============================================================
|
||||
# Checks
|
||||
# ============================================================
|
||||
|
||||
class QAResult:
    """Collector for the findings produced by the individual QA checks.

    Attributes:
        issues: list of ``(severity, category, message)`` tuples, where
            severity is ``'ERROR'`` or ``'WARN'``.
        passes: messages of checks that passed.
        info: purely informational messages.
    """

    def __init__(self):
        self.issues, self.passes, self.info = [], [], []

    def _flag(self, severity, cat, msg):
        # Single place where issue tuples are built so both severities share a shape.
        self.issues.append((severity, cat, msg))

    def error(self, cat, msg):
        """Record an ERROR-level issue under category ``cat``."""
        self._flag('ERROR', cat, msg)

    def warn(self, cat, msg):
        """Record a WARN-level issue under category ``cat``."""
        self._flag('WARN', cat, msg)

    def ok(self, msg):
        """Record a passed check."""
        self.passes.append(msg)

    def add_info(self, msg):
        """Record an informational message."""
        self.info.append(msg)
|
||||
|
||||
|
||||
def check_last_page_fill(doc, result, min_fill=0.40):
    """Check how much of the last page's height is covered by text.

    The fill ratio is the vertical span of all non-empty text blocks on the
    last page divided by the page height.

    Args:
        doc: Document-like object supporting ``len()`` and indexing, whose
            pages expose ``rect.height`` and ``get_text("blocks")``.
        result: Collector receiving the findings (``ok``/``warn``/``error``/
            ``add_info``).
        min_fill: Ratio below which a warning is issued. The module-level
            ``LAST_PAGE_MIN_FILL`` constant this function used to read is
            commented out, which raised ``NameError`` for every last page
            filled 25% or more; its former value is now the default here.

    Outcomes: single-page documents pass immediately; a last page with no text
    is an error; a ratio below 25% is an error; below ``min_fill`` a warning;
    otherwise the check passes.
    """
    if len(doc) < 2:
        result.ok("Single-page document, no last-page blank check needed")
        return

    last_page = doc[-1]
    page_rect = last_page.rect

    # Each block is (x0, y0, x1, y1, text, block_no, block_type).
    blocks = last_page.get_text("blocks")
    if not blocks:
        result.error("Last page blank", f"Page {len(doc)} (last page) has no content at all!")
        return

    # Vertical extent covered by blocks that actually contain text.
    max_y = 0
    min_y = page_rect.height
    for b in blocks:
        if b[4].strip():
            min_y = min(min_y, b[1])
            max_y = max(max_y, b[3])

    if max_y == 0:
        result.error("Last page blank", f"Page {len(doc)} (last page) has no valid text content")
        return

    content_height = max_y - min_y
    fill_ratio = content_height / page_rect.height

    result.add_info(f"Last page fill ratio: {fill_ratio:.0%} (content height {content_height:.0f}px / page height {page_rect.height:.0f}px)")

    if fill_ratio < 0.25:
        result.error("Last page blank", f"Last page fill ratio only {fill_ratio:.0%}, mostly blank! Consider compressing preceding page spacing or trimming content")
    elif fill_ratio < min_fill:
        result.warn("Last page blank", f"Last page fill ratio {fill_ratio:.0%}, somewhat sparse — optimization recommended")
    else:
        result.ok(f"Last page fill ratio {fill_ratio:.0%} ✓")
|
||||
|
||||
|
||||
def check_punctuation(doc, result):
    """Flag text lines that begin or end with punctuation that must not sit there.

    Every text line of every page is rebuilt from its spans and stripped; its
    first character is tested against ``LINE_START_FORBIDDEN`` and its last
    against ``LINE_END_FORBIDDEN``. At most ten violations are reported
    individually as warnings, followed by one summary warning for the rest.
    With no violations the check is recorded as passed.
    """
    report_limit = 10
    violations = []

    for page_no in range(1, len(doc) + 1):
        layout = doc[page_no - 1].get_text("dict")

        for block in layout.get("blocks", []):
            # type 0 marks a text block; other block types carry no lines to check
            if block.get("type") != 0:
                continue

            for line in block.get("lines", []):
                content = "".join(span.get("text", "") for span in line.get("spans", [])).strip()
                if not content:
                    continue

                head, tail = content[0], content[-1]
                if head in LINE_START_FORBIDDEN:
                    violations.append((page_no, f"Forbidden line-start punctuation '{head}': ...{content[:30]}"))
                if tail in LINE_END_FORBIDDEN:
                    violations.append((page_no, f"Forbidden line-end punctuation '{tail}': {content[-30:]}..."))

    if not violations:
        result.ok("Punctuation placement check passed ✓")
        return

    for page_no, desc in violations[:report_limit]:
        result.warn("Punctuation rules", f"Page {page_no} - {desc}")

    remaining = len(violations) - report_limit
    if remaining > 0:
        result.warn("Punctuation rules", f"...{remaining} more violations")
|
||||
|
||||
|
||||
def check_blank_pages(doc, result):
    """Report pages that have no text, no images and no vector drawings.

    Page numbers in the error message are 1-based. When no page is blank the
    check is recorded as passed.
    """

    def _is_blank(page):
        # Query all three content kinds; a page counts as blank only when
        # every one of them is empty.
        stripped_text = page.get_text().strip()
        image_refs = page.get_images()
        vector_paths = page.get_drawings()
        return not (stripped_text or image_refs or vector_paths)

    blank_pages = [idx + 1 for idx in range(len(doc)) if _is_blank(doc[idx])]

    if not blank_pages:
        result.ok("No blank pages ✓")
    else:
        result.error("Blank pages", f"Found blank pages: {blank_pages}")
|
||||
|
||||
|
||||
def check_colors(doc, result):
    """Analyze colors used in the document (informational only, no pass/fail).

    Collects the colors of all text spans (pure black, value 0, is skipped)
    and the stroke/fill colors of all vector drawings, then reports the total
    number of distinct colors and how many of them are chromatic, i.e. have
    RGB channels differing by more than 20 (not black/white/gray).

    Drawing colors arrive as float components in [0, 1]; they are converted to
    8-bit channels by rounding. The previous truncation mapped e.g. 0.29
    (73.95) to 0x49 instead of 0x4a, producing off-by-one hex values.
    """

    def _components_to_hex(components):
        # Round to the nearest 8-bit value and clamp against out-of-range input.
        channels = [max(0, min(255, round(v * 255))) for v in components[:3]]
        return "#{:02x}{:02x}{:02x}".format(*channels)

    colors = set()

    for page_num in range(len(doc)):
        page = doc[page_num]
        text_dict = page.get_text("dict")

        for block in text_dict.get("blocks", []):
            if block.get("type") != 0:  # text blocks only
                continue
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    color = span.get("color", 0)
                    if color != 0:  # Exclude pure black
                        # Span colors are packed 0xRRGGBB integers.
                        r = (color >> 16) & 0xFF
                        g = (color >> 8) & 0xFF
                        b = color & 0xFF
                        colors.add(f"#{r:02x}{g:02x}{b:02x}")

        # Stroke ("color") and fill colors of vector drawings
        for d in page.get_drawings():
            for key in ("color", "fill"):
                c = d.get(key)
                if c and isinstance(c, (tuple, list)) and len(c) >= 3:
                    colors.add(_components_to_hex(c))

    # Filter out near-black/white/gray colors
    distinct_colors = []
    for c in colors:
        r = int(c[1:3], 16)
        g = int(c[3:5], 16)
        b = int(c[5:7], 16)
        max_diff = max(abs(r - g), abs(g - b), abs(r - b))
        if max_diff > 20:
            distinct_colors.append(c)

    result.add_info(f"Total text colors: {len(colors)} (chromatic: {len(distinct_colors)})")

    if distinct_colors:
        result.add_info(f"Chromatic colors: {', '.join(sorted(distinct_colors)[:10])}")
|
||||
|
||||
|
||||
def check_page_size_consistency(doc, result):
    """Verify that every page of the document has the same dimensions.

    Page width/height are rounded to one decimal before comparison. Differing
    sizes produce a warning; a uniform size is reported in millimetres
    (converted from points at 72 per inch) and the check passes. Single-page
    documents pass without inspection.
    """
    page_total = len(doc)
    if page_total < 2:
        result.ok("Single-page document, size consistent ✓")
        return

    sizes = {
        (round(doc[idx].rect.width, 1), round(doc[idx].rect.height, 1))
        for idx in range(page_total)
    }

    if len(sizes) > 1:
        result.warn("Page size", f"Inconsistent page sizes: {sizes}")
        return

    width_pt, height_pt = next(iter(sizes))
    mm_per_pt = 25.4 / 72
    w_mm = width_pt * 25.4 / 72
    h_mm = height_pt * 25.4 / 72
    result.add_info(f"Page size: {w_mm:.0f}mm × {h_mm:.0f}mm ({len(doc)} pages)")
    result.ok("Page size consistent ✓")
|
||||
|
||||
|
||||
def check_text_overflow(doc, result):
    """Flag pages on which any extracted block sticks out past the page rectangle.

    Each entry from ``page.get_text("blocks")`` is read as
    ``(x0, y0, x1, y1, text, block_no, block_type)``. A page is reported (once)
    when a block's right/bottom edge exceeds the page width/height, or its
    left/top edge is negative, by more than a 2-unit tolerance.
    """
    slack = 2  # tolerance in page-coordinate units

    def _escapes(block, bounds):
        left, top, right, bottom = block[0], block[1], block[2], block[3]
        past_far_edge = right > bounds.width + slack or bottom > bounds.height + slack
        past_near_edge = left < -slack or top < -slack
        return past_far_edge or past_near_edge

    flagged = []
    for idx in range(len(doc)):
        page = doc[idx]
        bounds = page.rect
        if any(_escapes(block, bounds) for block in page.get_text("blocks")):
            flagged.append(idx + 1)

    if flagged:
        result.warn("Content overflow", f"Pages {flagged} may have content exceeding page boundaries")
    else:
        result.ok("No content overflow ✓")
|
||||
|
||||
|
||||
def check_content_fill_ratio(doc, result):
    """Warn about pages whose content ends high up, leaving a large void below.

    The fill ratio of a page is the lowest bottom edge among its non-empty
    text blocks and image rectangles, divided by the page height.

    Rules:
    - Documents with fewer than two pages are skipped (may be intentional design).
    - Page 1 is skipped (usually a cover with intentional whitespace).
    - Middle pages warn below 40%; the last page warns below 25%.
    - Pages with no blocks, images or drawings are left to the blank-page check.
    """
    total = len(doc)
    if total < 2:
        result.ok("Single-page document, skipping content fill ratio check ✓")
        return

    final_index = total - 1
    sparse = []

    # Index 0 is the cover and is never evaluated.
    for idx in range(1, total):
        page = doc[idx]
        page_height = page.rect.height

        blocks = page.get_text("blocks")
        images = page.get_images()
        drawings = page.get_drawings()
        if not (blocks or images or drawings):
            continue  # Blank page check handles this

        # Bottom edges of everything that counts as content; drawings are only
        # used for the emptiness test above, not for the extent.
        bottoms = [0]
        bottoms.extend(block[3] for block in blocks if block[4].strip())
        for image in images:
            try:
                bottoms.extend(box.y1 for box in page.get_image_rects(image[0]))
            except Exception:
                pass  # best effort: an unresolvable image simply does not count

        lowest = max(bottoms)
        if lowest == 0:
            continue

        ratio = lowest / page_height
        limit = 0.25 if idx == final_index else 0.40
        if ratio < limit:
            sparse.append((idx + 1, ratio, limit))

    if not sparse:
        result.ok("Content fill ratio adequate on all pages ✓")
        return

    for pg, ratio, thresh in sparse:
        result.warn(
            "Content fill ratio",
            f"Page {pg} content only fills {ratio:.0%} of page height "
            f"(threshold: {thresh:.0%}). Content may be crammed at the top "
            f"with a large blank area below."
        )
|
||||
|
||||
|
||||
def check_cover_bleed(doc, result, poster=False):
    """Check whether the cover page (page 1) is rendered full-bleed.

    The bounding boxes of all drawings and placed images on the page are
    merged. If the merged box leaves more than 5% of the page width/height
    uncovered on any side, a warning naming the offending sides is recorded;
    otherwise a pass entry is added. A page with text but no drawings or
    images at all is warned about as having no background graphics.

    In poster mode every page is checked, not just the first. Outside poster
    mode a document with fewer than two pages is not treated as having a
    cover and nothing is reported.
    """
    if not poster and len(doc) < 2:
        # Single page doc (non-poster): not necessarily a cover scenario
        return

    edge_limit = 0.05
    targets = range(len(doc)) if poster else [0]

    for page_idx in targets:
        page = doc[page_idx]
        pw, ph = page.rect.width, page.rect.height

        # Corner tuples (x0, y0, x1, y1) of every graphic element on the page.
        boxes = []

        # Vector drawings: filled rectangles/paths are the usual backgrounds.
        for drawing in page.get_drawings():
            box = drawing.get("rect")
            if box:
                boxes.append((box.x0, box.y0, box.x1, box.y1))

        # Raster images placed on the page.
        for image in page.get_images():
            try:
                for box in page.get_image_rects(image[0]):
                    boxes.append((box.x0, box.y0, box.x1, box.y1))
            except Exception:
                pass  # best effort: skip images whose placement cannot be resolved

        page_label = f"Page {page_idx + 1}" if poster else "Cover page (p1)"

        if not boxes:
            if page.get_text("blocks"):
                result.warn(
                    f"{page_label} not full-bleed",
                    f"{page_label} has no background graphics (no filled rectangles or images). "
                    "A proper cover/poster page should have a full-page background color or image "
                    "extending to all edges."
                )
            continue

        # Union extent, seeded so that it never grows beyond the page itself.
        left_edge = min(pw, *(b[0] for b in boxes))
        top_edge = min(ph, *(b[1] for b in boxes))
        right_edge = max(0.0, *(b[2] for b in boxes))
        bottom_edge = max(0.0, *(b[3] for b in boxes))

        # Uncovered fraction on each side, in reporting order.
        margins = [
            ("left", max(0, left_edge) / pw),
            ("top", max(0, top_edge) / ph),
            ("right", max(0, pw - right_edge) / pw),
            ("bottom", max(0, ph - bottom_edge) / ph),
        ]
        sides = [f"{side} {ratio:.0%}" for side, ratio in margins if ratio > edge_limit]

        if sides:
            result.warn(
                f"{page_label} not full-bleed",
                f"{page_label} has visible margins: {', '.join(sides)}. "
                f"Background/graphics should extend to page edges."
            )
        else:
            result.ok(f"{page_label} content extends to page edges (full-bleed) ✓")
|
||||
|
||||
|
||||
def check_margin_symmetry(doc, result, skip_cover=False):
    """Compare left and right margins derived from the text blocks of each page.

    The left margin is the smallest block x0, the right margin is the page
    width minus the largest block x1. Pages with fewer than three non-empty
    text blocks are ignored, as is page 1 when ``skip_cover`` is true. A page
    is reported when the two margins differ by more than 5% of its width.
    """
    lopsided = []

    for page_num in range(len(doc)):
        if skip_cover and page_num == 0:
            continue

        page = doc[page_num]
        text_blocks = [blk for blk in page.get_text("blocks") if blk[4].strip()]

        # Too little text to judge margins (decorative or cover-like page).
        if len(text_blocks) < 3:
            continue

        page_width = page.rect.width
        left_margin = min(blk[0] for blk in text_blocks)
        right_margin = page_width - max(blk[2] for blk in text_blocks)
        gap = abs(left_margin - right_margin)

        if gap > page_width * 0.05:
            lopsided.append((page_num + 1, left_margin, right_margin, gap))

    if not lopsided:
        result.ok("Left/right margins appear symmetric \u2713")
        return

    for pg, left, right, diff in lopsided:
        result.warn(
            "Margin symmetry",
            f"Page {pg} left/right margins differ by {diff:.0f}pt "
            f"(L {left:.0f}pt, R {right:.0f}pt)"
        )
|
||||
|
||||
|
||||
def check_table_centering(doc, result):
    """Check whether table-like line grids are horizontally centred on the page.

    Straight line items ("l") and the four edges of rectangle items ("re") are
    taken from each page's drawings. Edges that are (nearly) horizontal or
    vertical and longer than 20 units are grouped greedily: an edge joins the
    first existing cluster whose bounding box it touches (6-unit tolerance),
    otherwise it starts a new cluster. A cluster with at least two horizontal
    and two vertical edges is treated as a table, and a warning is recorded
    when its left and right page margins differ by more than 5% of the page
    width. If at least one table was seen, a completion entry is added.
    """
    def _touches(a, b, tol=6):
        return (a[2] >= b[0] - tol and a[0] <= b[2] + tol and
                a[3] >= b[1] - tol and a[1] <= b[3] + tol)

    def _corners(r):
        # Accept both Rect-like objects and plain 4-sequences.
        if hasattr(r, "x0"):
            return (r.x0, r.y0, r.x1, r.y1)
        return (r[0], r[1], r[2], r[3])

    def _edges(drawings):
        edges = []
        for drawing in drawings:
            for item in drawing.get("items", []):
                if not item:
                    continue
                kind = item[0]
                if kind == "l" and len(item) >= 3:
                    start, end = item[1], item[2]
                    edges.append((start[0], start[1], end[0], end[1]))
                elif kind == "re" and len(item) >= 2:
                    rx0, ry0, rx1, ry1 = _corners(item[1])
                    edges.append((rx0, ry0, rx1, ry0))  # top
                    edges.append((rx0, ry1, rx1, ry1))  # bottom
                    edges.append((rx0, ry0, rx0, ry1))  # left
                    edges.append((rx1, ry0, rx1, ry1))  # right
        return edges

    any_tables = False

    for page_num in range(len(doc)):
        page = doc[page_num]
        edges = _edges(page.get_drawings())
        if not edges:
            continue

        clusters = []
        for x0, y0, x1, y1 in edges:
            lo_x, hi_x = min(x0, x1), max(x0, x1)
            lo_y, hi_y = min(y0, y1), max(y0, y1)
            horizontal = abs(y0 - y1) < 1 and (hi_x - lo_x) > 20
            vertical = abs(x0 - x1) < 1 and (hi_y - lo_y) > 20
            if not (horizontal or vertical):
                continue

            box = (lo_x, lo_y, hi_x, hi_y)
            record = (x0, y0, x1, y1, horizontal, vertical)

            for cluster in clusters:
                if not _touches(box, cluster["bbox"]):
                    continue
                # First touching cluster wins; grow it to include this edge.
                current = cluster["bbox"]
                cluster["segments"].append(record)
                cluster["bbox"] = (
                    min(current[0], box[0]),
                    min(current[1], box[1]),
                    max(current[2], box[2]),
                    max(current[3], box[3]),
                )
                cluster["h"] += 1 if horizontal else 0
                cluster["v"] += 1 if vertical else 0
                break
            else:
                clusters.append({
                    "bbox": box,
                    "segments": [record],
                    "h": 1 if horizontal else 0,
                    "v": 1 if vertical else 0,
                })

        page_width = page.rect.width
        for cluster in clusters:
            if cluster["h"] < 2 or cluster["v"] < 2:
                continue  # not grid-like enough to be a table
            any_tables = True
            left_margin = cluster["bbox"][0]
            right_margin = page_width - cluster["bbox"][2]
            if abs(left_margin - right_margin) > page_width * 0.05:
                result.warn(
                    "Table centering",
                    f"Page {page_num + 1}: Table not centered "
                    f"(L {left_margin:.0f}pt, R {right_margin:.0f}pt)"
                )

    if any_tables:
        result.ok("Table centering check complete \u2713")
|
||||
|
||||
|
||||
def check_font_embedding(doc, result):
    """Report the fonts referenced by the document and warn about non-embedded ones.

    Each entry of ``page.get_fonts()`` is read positionally: index 1 is the
    font file extension, index 2 the font type and index 3 the base font name.

    A font is treated as not embedded when its extension is empty or the
    placeholder ``"n/a"``, which PyMuPDF reports when no font file can be
    extracted (e.g. the PDF base-14 fonts such as Helvetica). The previous
    check only tested for an empty extension and therefore never fired for
    such fonts. Type 3 fonts also report ``"n/a"`` but are presumably defined
    by glyph procedures inside the PDF itself, so they are not flagged.

    Args:
        doc: Opened document; indexed by page number, pages expose ``get_fonts()``.
        result: Collector receiving ``add_info`` / ``warn`` / ``ok`` entries.
    """
    def _lacks_font_file(ext, font_type):
        if not ext:
            return True
        if str(ext).strip().lower() != "n/a":
            return False
        return str(font_type).strip().lower() != "type3"

    fonts_used = set()
    non_embedded = set()

    for page_num in range(len(doc)):
        for font in doc[page_num].get_fonts():
            basefont = font[3] if len(font) > 3 else "unknown"
            ext = font[1] if len(font) > 1 else ""
            font_type = font[2] if len(font) > 2 else ""
            fonts_used.add(basefont)
            if _lacks_font_file(ext, font_type):
                non_embedded.add(basefont)

    if fonts_used:
        result.add_info(f"Fonts used: {', '.join(sorted(fonts_used))}")
    else:
        result.add_info("Fonts used: (none detected)")

    if non_embedded:
        for basefont in sorted(non_embedded):
            result.warn(
                "Font embedding",
                f"Font {basefont} is not embedded. May display differently on other systems."
            )
    else:
        result.ok("All fonts are embedded \u2713")
|
||||
|
||||
|
||||
def check_helvetica_in_cjk(doc, result):
    """Detect Helvetica rendering visible text in documents containing CJK text.

    Helvetica is a Latin-only built-in PDF font. When it appears rendering
    actual text content in a CJK document, it almost always means a raw string
    was passed to a ReportLab Table or flowable without wrapping it in
    Paragraph() with a CJK font. The CJK characters rendered via Helvetica
    become garbled (fall back to ZapfDingbats symbols).

    We only check Helvetica (not ZapfDingbats) because ZapfDingbats is
    legitimately used for bullet symbols in list items.

    We check actual rendered text spans (not just font presence in font list)
    because ReportLab internally registers Helvetica on every page even when
    only CJK fonts are used in visible content.

    A document counts as CJK when any page contains a Han ideograph
    (U+4E00-U+9FFF, U+3400-U+4DBF), Japanese kana (U+3040-U+30FF) or a Hangul
    syllable (U+AC00-U+D7AF). Documents without such characters return early,
    before the per-span extraction is run.

    Args:
        doc: Opened document; indexed by page number, pages expose ``get_text``.
        result: Collector receiving a ``warn`` entry when the problem is found.
    """
    def _is_cjk(ch):
        return ('\u4e00' <= ch <= '\u9fff' or '\u3400' <= ch <= '\u4dbf'
                or '\u3040' <= ch <= '\u30ff' or '\uac00' <= ch <= '\ud7af')

    def _page_uses_helvetica(page):
        # Only spans that carry visible (non-whitespace) text count.
        blocks = page.get_text("dict", sort=True).get("blocks", [])
        return any(
            "Helvetica" in span.get("font", "") and span.get("text", "").strip()
            for block in blocks
            for line in block.get("lines", [])
            for span in line.get("spans", [])
        )

    page_count = len(doc)

    # Stops at the first CJK character found anywhere in the document.
    has_cjk = any(
        _is_cjk(ch)
        for page_num in range(page_count)
        for ch in (doc[page_num].get_text("text") or "")
    )
    if not has_cjk:
        return

    helvetica_pages = [
        page_num + 1
        for page_num in range(page_count)
        if _page_uses_helvetica(doc[page_num])
    ]
    if not helvetica_pages:
        return

    pages_str = ', '.join(str(p) for p in helvetica_pages[:5])
    if len(helvetica_pages) > 5:
        pages_str += f' ...and {len(helvetica_pages) - 5} more'
    result.warn(
        "Helvetica in CJK document",
        f"Helvetica font detected rendering text on page(s) {pages_str} in a CJK document. "
        f"This usually means a raw string was passed to a ReportLab Table or flowable "
        f"without wrapping in Paragraph(text, style) with a CJK-capable font. "
        f"CJK characters rendered via Helvetica will appear as garbled symbols."
    )
|
||||
|
||||
|
||||
def check_metadata(doc, result):
    """Check that the document metadata carries a title, an author and a creator.

    A field is missing when it is ``None`` or blank after stripping. The title
    is additionally rejected when it is one of the placeholders "untitled" or
    "(anonymous)" (case-insensitive). One warn/ok entry is recorded per field,
    in the order title, author, creator.
    """
    meta = doc.metadata or {}

    def _blank(value):
        return value is None or not str(value).strip()

    placeholder_titles = ("untitled", "(anonymous)")

    title = meta.get("title")
    author = meta.get("author")
    creator = meta.get("creator")

    title_ok = not _blank(title) and str(title).strip().lower() not in placeholder_titles
    if title_ok:
        result.ok("Title metadata present \u2713")
    else:
        result.warn("Metadata", "Missing/invalid title metadata")

    if not _blank(author):
        result.ok("Author metadata present \u2713")
    else:
        result.warn("Metadata", "Missing author metadata")

    if not _blank(creator):
        result.ok("Creator metadata present \u2713")
    else:
        result.warn("Metadata", "Missing creator metadata")
|
||||
|
||||
|
||||
def check_toc_without_cover(doc, result):
    """Warn when page 1 looks like a table of contents, i.e. there is no cover.

    The first 300 characters of page 1's sorted, lower-cased text are searched
    for a table-of-contents heading (English or Chinese). Documents with a
    TOC are expected to be laid out as Cover (p1) → TOC (p2) → Content (p3+).
    Documents with fewer than two pages are not checked.
    """
    if len(doc) < 2:
        # Single-page docs don't need TOC/cover checks
        return

    toc_keywords = (
        "table of contents", "contents",
        "目录", "目 录",
    )

    opening = doc[0].get_text("text", sort=True).strip().lower()[:300]

    for keyword in toc_keywords:
        if keyword in opening:
            result.warn(
                "TOC without cover",
                "Page 1 appears to be a Table of Contents with no preceding cover page. "
                "Documents with TOC should have: Cover (p1) → TOC (p2) → Content (p3+)."
            )
            return
|
||||
|
||||
|
||||
def check_formula_overflow(doc, result):
    """Flag text blocks that run past the typical right edge of a page's content.

    The content's right margin is taken as the median right edge of the
    page's non-empty text blocks (pages with fewer than three such blocks are
    skipped). A block extending more than 10 units beyond it is reported when
    it either contains a math-like character or is a single line wider than
    half the page. At most one warning is recorded per page.
    """
    math_re = re.compile(r"[=+\-*/<>\u2264\u2265\u2211\u222b\u221a\u03c0\u00b5\u221e\u2202\u2206\u2248\u2260\u00b1\u00d7\u00f7]")

    for page_num in range(len(doc)):
        page = doc[page_num]
        text_blocks = [blk for blk in page.get_text("blocks") if blk[4].strip()]
        if len(text_blocks) < 3:
            continue

        # Median right edge stands in for the content's right margin.
        right_edges = sorted(blk[2] for blk in text_blocks)
        content_right = right_edges[len(right_edges) // 2]

        for blk in text_blocks:
            left, right, text = blk[0], blk[2], blk[4]
            if right <= content_right + 10:
                continue

            wide_single_line = (
                "\n" not in text.strip()
                and (right - left) > page.rect.width * 0.5
            )
            if not wide_single_line and not math_re.search(text):
                continue

            overshoot = right - content_right
            result.warn(
                "Formula overflow",
                f"Page {page_num + 1}: Content extends {overshoot:.0f}pt beyond right content margin "
                "(possible formula overflow)"
            )
            break
|
||||
break
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Main
|
||||
# ============================================================
|
||||
|
||||
def run_qa(pdf_path, poster=False, skip_cover=False, check_tables=True, check_formulas=False):
    """Run the QA checks against one PDF file and return the collected result.

    Args:
        pdf_path: Path of the PDF to inspect.
        poster: Poster mode. Skips the content-fill-ratio and TOC-without-cover
            checks and makes the bleed check cover every page.
        skip_cover: Exclude page 1 from the margin symmetry check.
        check_tables: Run the table centering check.
        check_formulas: Run the formula overflow check.

    Returns:
        The ``QAResult`` holding info, pass and issue entries. If the file
        does not exist it contains a single "File" error and no checks run.
    """
    result = QAResult()

    if not os.path.exists(pdf_path):
        result.error("File", f"File not found: {pdf_path}")
        return result

    doc = pymupdf.open(pdf_path)
    # try/finally so the document handle is released even when a check raises.
    try:
        result.add_info(f"File: {os.path.basename(pdf_path)}")
        result.add_info(f"Size: {os.path.getsize(pdf_path) / 1024:.1f} KB")
        if poster:
            result.add_info("Mode: poster (creative)")

        # Run all checks
        check_metadata(doc, result)
        check_page_size_consistency(doc, result)
        check_blank_pages(doc, result)
        check_punctuation(doc, result)
        check_colors(doc, result)
        check_font_embedding(doc, result)
        check_helvetica_in_cjk(doc, result)
        check_text_overflow(doc, result)
        if not poster:
            # Content fill ratio is not meaningful for posters: the last page
            # of a seamlessly-paginated poster naturally has less content.
            check_content_fill_ratio(doc, result)
        check_cover_bleed(doc, result, poster=poster)
        check_margin_symmetry(doc, result, skip_cover=skip_cover)
        if check_tables:
            check_table_centering(doc, result)
        if check_formulas:
            check_formula_overflow(doc, result)
        if not poster:
            check_toc_without_cover(doc, result)
    finally:
        doc.close()

    return result
|
||||
|
||||
|
||||
def format_report(result):
    """Render a QA result as a plain-text report and return it as one string.

    Layout: banner, then the optional sections Info, Passed, Errors and
    Warnings (each preceded by a blank line and omitted when empty), then a
    one-line verdict framed by dashed rules. The verdict is PASS when there
    are no issues at all, FAIL when at least one issue has severity 'ERROR',
    and WARN otherwise.
    """
    banner = "=" * 56
    rule = "-" * 56
    lines = [banner, "  PDF Quality Assurance Report", banner]

    def _section(heading, rows):
        lines.append("")
        lines.append(heading)
        lines.extend(rows)

    if result.info:
        _section("ℹ️  Info:", [f"    {msg}" for msg in result.info])

    if result.passes:
        _section(f"✅ Passed ({len(result.passes)}):", [f"    {msg}" for msg in result.passes])

    # Split issues by severity; entries are (severity, category, message).
    errors = []
    warns = []
    for severity, cat, msg in result.issues:
        if severity == 'ERROR':
            errors.append((cat, msg))
        elif severity == 'WARN':
            warns.append((cat, msg))

    if errors:
        _section(f"❌ Errors ({len(errors)}):", [f"    [{cat}] {msg}" for cat, msg in errors])

    if warns:
        _section(f"⚠️  Warnings ({len(warns)}):", [f"    [{cat}] {msg}" for cat, msg in warns])

    if len(result.issues) == 0:
        verdict = "🎉 PASS — All checks passed!"
    elif errors:
        verdict = f"💀 FAIL — {len(errors)} error(s), {len(warns)} warning(s)"
    else:
        verdict = f"⚠️  WARN — {len(warns)} warning(s), optimization recommended"

    lines.extend(["", rule, verdict, rule])
    return "\n".join(lines)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: parse flags, expand file patterns, then print
    # one QA report per PDF.
    if len(sys.argv) < 2:
        print("Usage: python3 pdf_qa.py <pdf_path>")
        print("       python3 pdf_qa.py *.pdf  (batch check)")
        print("Options:")
        print("  --poster       Poster mode (creative)")
        print("  --skip-cover   Skip page 1 margin symmetry check")
        print("  --no-tables    Disable table centering check")
        print("  --formulas     Enable formula overflow check")
        sys.exit(1)

    import glob
    import os

    args = sys.argv[1:]

    def _take_flag(flag):
        """Remove ``flag`` from args (first occurrence) and report whether it was present."""
        if flag in args:
            args.remove(flag)
            return True
        return False

    poster = _take_flag('--poster')
    skip_cover = _take_flag('--skip-cover')
    check_tables = not _take_flag('--no-tables')
    check_formulas = _take_flag('--formulas')

    files = []
    for arg in args:
        matches = glob.glob(arg)
        # A real file whose name contains glob metacharacters (e.g.
        # "report[1].pdf") does not match itself as a pattern; fall back to
        # the literal path so it is not silently dropped.
        if not matches and os.path.exists(arg):
            matches = [arg]
        files.extend(matches)

    if not files:
        print(f"File not found: {args}")
        sys.exit(1)

    for pdf_path in files:
        result = run_qa(
            pdf_path,
            poster=poster,
            skip_cover=skip_cover,
            check_tables=check_tables,
            check_formulas=check_formulas
        )
        print(format_report(result))
        if len(files) > 1:
            print("\n")
|
||||
1337
skills/pdf/scripts/poster_validate.py
Executable file
1337
skills/pdf/scripts/poster_validate.py
Executable file
File diff suppressed because it is too large
Load Diff
269
skills/pdf/scripts/setup.sh
Executable file
269
skills/pdf/scripts/setup.sh
Executable file
@@ -0,0 +1,269 @@
|
||||
#!/usr/bin/env bash
# ---
# name: pdf-setup
# author: Z.AI
# version: "1.0"
# description: Environment setup for the PDF skill. Checks and installs all required dependencies.
# ---
#
# Installs only dependencies required by the PDF skill.
#
# The script is a sequence of independent checks. Each prints a status line
# and, when something is missing, an install hint. Only missing Python
# packages are installed automatically, and only after interactive
# confirmation. Because the script runs under `set -euo pipefail`, purely
# informational probes (version strings, counts) end in `|| true` so that a
# failing probe cannot abort the remaining checks.
set -euo pipefail

RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'

# Status-line helpers: present / missing / optional-or-warning / hint.
ok()   { echo -e "  ${GREEN}✓${NC} $1"; }
fail() { echo -e "  ${RED}✗${NC} $1"; }
warn() { echo -e "  ${YELLOW}○${NC} $1"; }
info() { echo -e "  ${BLUE}→${NC} $1"; }

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

echo "============================================"
echo "  PDF Skill — Environment Setup"
echo "============================================"
echo ""

# ── Detect platform ──
OS="$(uname -s)"
ARCH="$(uname -m)"
echo "Platform: $OS $ARCH"
echo ""

# ── 0. macOS: Homebrew ──
if [ "$OS" = "Darwin" ]; then
    echo "--- Homebrew (macOS package manager) ---"
    if command -v brew &>/dev/null; then
        BREW_VER=$(brew --version 2>/dev/null | head -1) || true
        ok "brew ($BREW_VER)"
    else
        fail "brew not found — most dependencies below need Homebrew on macOS"
        info "Install: /bin/bash -c \"\$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)\""
    fi
    echo ""
fi

# ── 1. Python 3 ──
echo "--- Python ---"
if command -v python3 &>/dev/null; then
    PY_VER=$(python3 --version 2>&1) || true
    ok "python3 ($PY_VER)"
    # macOS: warn if using system Python
    if [ "$OS" = "Darwin" ]; then
        PY_PATH=$(command -v python3 2>/dev/null) || true
        if [[ "$PY_PATH" == "/usr/bin/python3" ]]; then
            warn "Using macOS system Python (limited). Recommend: brew install python3"
        fi
    fi
else
    fail "python3 not found"
    case "$OS" in
        Darwin) info "Install: brew install python3" ;;
        Linux)  info "Install: sudo apt install python3 python3-pip  (Debian/Ubuntu)"
                info "         sudo dnf install python3 python3-pip  (Fedora/RHEL)" ;;
        *)      info "Install: https://www.python.org/downloads/" ;;
    esac
fi

# ── 2. pip ──
echo ""
echo "--- pip ---"
if python3 -m pip --version &>/dev/null; then
    PIP_VER=$(python3 -m pip --version 2>/dev/null | head -1) || true
    ok "pip ($PIP_VER)"
else
    fail "pip not found"
    case "$OS" in
        Darwin) info "Install: python3 -m ensurepip --upgrade"
                info "    or: brew install python3  (includes pip)" ;;
        Linux)  info "Install: sudo apt install python3-pip  (Debian/Ubuntu)" ;;
        *)      info "Install: python3 -m ensurepip --upgrade" ;;
    esac
fi

# ── 3. Python packages (pip) ──
echo ""
echo "--- Python Packages ---"
# Entries are "<import name>:<pip package name>".
PY_PKGS=(
    "pikepdf:pikepdf"
    "pdfplumber:pdfplumber"
    "pypdf:pypdf"
    "reportlab:reportlab"
    "pymupdf:PyMuPDF"
)

MISSING_PY=()
for entry in "${PY_PKGS[@]}"; do
    mod="${entry%%:*}"
    pkg="${entry##*:}"
    if python3 -c "import $mod" 2>/dev/null; then
        ver=$(python3 -c "import $mod; print(getattr($mod, '__version__', 'installed'))" 2>/dev/null) || true
        ok "$pkg ($ver)"
    else
        fail "$pkg not installed"
        MISSING_PY+=("$pkg")
    fi
done

if [ ${#MISSING_PY[@]} -gt 0 ]; then
    echo ""
    if [ -t 0 ]; then
        # `|| true`: an EOF at the prompt must not abort the script; an empty
        # reply falls through to the default (yes).
        read -p "  Install missing Python packages? [Y/n] " -n 1 -r REPLY || true
        echo ""
        REPLY=${REPLY:-Y}
    else
        warn "Non-interactive mode — skipping auto-install. Run interactively or install manually."
        REPLY=N
    fi
    if [[ ! $REPLY =~ ^[Nn]$ ]]; then
        # Try a plain install first, then per-user, then overriding the
        # externally-managed-environment guard. Report success only if one of
        # the attempts actually succeeded.
        if python3 -m pip install -q "${MISSING_PY[@]}" 2>/dev/null \
            || python3 -m pip install -q --user "${MISSING_PY[@]}" 2>/dev/null \
            || python3 -m pip install -q --break-system-packages "${MISSING_PY[@]}" 2>/dev/null; then
            ok "Installed: ${MISSING_PY[*]}"
        else
            fail "pip install failed. Try manually: pip install ${MISSING_PY[*]}"
        fi
    fi
fi

# ── 4. Node.js ──
echo ""
echo "--- Node.js ---"
if command -v node &>/dev/null; then
    NODE_VER=$(node --version 2>/dev/null) || true
    ok "node ($NODE_VER)"
else
    fail "node not found"
    case "$OS" in
        Darwin) info "Install: brew install node" ;;
        Linux)  info "Install: curl -fsSL https://deb.nodesource.com/setup_20.x | sudo -E bash -"
                info "         sudo apt install -y nodejs" ;;
        *)      info "Install: https://nodejs.org/" ;;
    esac
fi

# ── 5. npm ──
echo ""
echo "--- npm ---"
if command -v npm &>/dev/null; then
    NPM_VER=$(npm --version 2>/dev/null) || true
    ok "npm ($NPM_VER)"
else
    fail "npm not found"
    case "$OS" in
        Darwin) info "Install: brew install node  (includes npm)" ;;
        Linux)  info "Install: comes with nodejs" ;;
        *)      info "Install: https://nodejs.org/" ;;
    esac
fi

# ── 6. Playwright + Chromium ──
echo ""
echo "--- Playwright (HTML→PDF engine) ---"
if node -e "require('playwright')" 2>/dev/null; then
    PW_VER=$(node -e "console.log(require('playwright/package.json').version)" 2>/dev/null) || true
    ok "playwright ($PW_VER)"
else
    fail "playwright not installed"
    info "Install: npm install -g playwright"
fi

# Check Chromium (looked up in Playwright's per-user browser cache)
if [ "$OS" = "Darwin" ]; then
    PW_CACHE="$HOME/Library/Caches/ms-playwright"
else
    PW_CACHE="$HOME/.cache/ms-playwright"
fi
if ls "$PW_CACHE"/chromium-* &>/dev/null; then
    CR_DIR=$(ls -d "$PW_CACHE"/chromium-* 2>/dev/null | tail -1) || true
    ok "chromium ($(basename "$CR_DIR"))"
else
    fail "chromium not installed"
    info "Install: npx playwright install chromium"
    if [ "$OS" = "Linux" ]; then
        info "         npx playwright install-deps  (system libs, needs sudo)"
    fi
fi

# ── 7. Tectonic (LaTeX engine, optional) ──
echo ""
echo "--- Tectonic (LaTeX→PDF, optional) ---"
BUNDLED="$SCRIPT_DIR/tectonic"
if [ -x "$BUNDLED" ]; then
    if [ "$OS" = "Darwin" ] && [ "$ARCH" = "arm64" ]; then
        ok "tectonic (bundled, macOS arm64)"
    else
        warn "bundled tectonic is macOS arm64 only — cannot run on $OS $ARCH"
        if command -v tectonic &>/dev/null; then
            TEC_VER=$(tectonic --version 2>&1 | head -1) || true
            ok "tectonic (system: $TEC_VER)"
        else
            fail "tectonic not in PATH"
            case "$OS" in
                Darwin) info "Install: brew install tectonic" ;;
                Linux)  info "Install: conda install -c conda-forge tectonic"
                        info "    or: curl -fsSL https://drop-sh.fullyjustified.net | sh" ;;
                MINGW*|MSYS*|CYGWIN*) info "Install: scoop install tectonic  /  choco install tectonic" ;;
            esac
        fi
    fi
elif command -v tectonic &>/dev/null; then
    TEC_VER=$(tectonic --version 2>&1 | head -1) || true
    ok "tectonic ($TEC_VER)"
elif [ -x "$HOME/tectonic" ]; then
    ok "tectonic (~/tectonic)"
else
    warn "tectonic not installed (needed only for LaTeX/academic PDFs)"
    case "$OS" in
        Darwin) info "Install: brew install tectonic" ;;
        Linux)  info "Install: conda install -c conda-forge tectonic"
                info "    or: curl -fsSL https://drop-sh.fullyjustified.net | sh" ;;
        MINGW*|MSYS*|CYGWIN*) info "Install: scoop install tectonic  /  choco install tectonic" ;;
    esac
fi

# ── 8. LibreOffice (optional, for Office→PDF conversion) ──
echo ""
echo "--- LibreOffice (optional, Office→PDF) ---"
if command -v soffice &>/dev/null; then
    LO_VER=$(soffice --version 2>/dev/null | head -1) || true
    ok "libreoffice ($LO_VER)"
else
    warn "libreoffice not installed (needed only for .docx/.xlsx→PDF conversion)"
    case "$OS" in
        Darwin) info "Install: brew install --cask libreoffice" ;;
        Linux)  info "Install: sudo apt install libreoffice-core  (Debian/Ubuntu)" ;;
    esac
fi

# ── 9. CJK Fonts ──
echo ""
echo "--- CJK Fonts ---"
FONT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)/fonts"
if [ -d "$FONT_DIR" ]; then
    # Count every .ttf/.otf file. No `head` here: it would cap the reported
    # number and, under pipefail, could abort the script when `find` is cut
    # off mid-output.
    FONT_COUNT=$(find "$FONT_DIR" \( -name "*.ttf" -o -name "*.otf" \) 2>/dev/null | wc -l | tr -d ' ') || true
    ok "fonts directory ($FONT_COUNT font files in $FONT_DIR)"
else
    warn "no fonts/ directory found — CJK PDFs may have missing glyphs"
    info "Expected at: $FONT_DIR"
fi
# Check system CJK fonts
if [ "$OS" = "Darwin" ]; then
    if ls /System/Library/Fonts/PingFang.ttc &>/dev/null \
        || ls /System/Library/Fonts/STHeiti*.ttc &>/dev/null \
        || ls "$HOME/Library/Fonts/"*SimHei* &>/dev/null; then
        ok "macOS CJK system fonts available"
    else
        warn "no common CJK system fonts found"
    fi
elif [ "$OS" = "Linux" ]; then
    if fc-list :lang=zh 2>/dev/null | head -1 | grep -q .; then
        ok "system CJK fonts available (fc-list)"
    else
        warn "no CJK fonts found. Install: sudo apt install fonts-noto-cjk"
    fi
fi

# ── Summary ──
echo ""
echo "============================================"
echo "  Setup complete."
echo "  Run 'python3 pdf.py env.check' for detailed status."
echo "  Run 'python3 pdf.py env.fix' to auto-install Python deps."
echo "============================================"
|
||||
2075
skills/pdf/scripts/toc_validate.py
Executable file
2075
skills/pdf/scripts/toc_validate.py
Executable file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user