Initial commit

This commit is contained in:
Z User
2026-06-06 05:21:10 +00:00
Unverified
commit 6664758a6d
493 changed files with 135653 additions and 0 deletions

View File

@@ -0,0 +1,367 @@
#!/usr/bin/env node
/**
* cover_validate.js — Cover page overlap detection via Playwright rendering
*
* Detects text-vs-decorative-line overlap on cover HTML pages by:
* 1. Rendering the HTML in Playwright
* 2. Waiting for fonts to load
* 3. Measuring bounding boxes of text elements and decorative line elements
* 4. Checking the gap between each text box and nearby lines: vertical gap for horizontal lines, horizontal gap for vertical lines (minimum spacing = 1U = 5% of page width ≈ 30pt)
*
* Usage:
* node cover_validate.js cover.html
* node cover_validate.js cover.html --width 210mm --height 297mm
* node cover_validate.js cover.html --min-gap 30 # custom min gap in px (default: auto = 5% of width)
*
* Exit codes:
* 0 = no overlap issues found
* 1 = overlap detected (prints details to stderr)
* 2 = script error (missing file, browser launch failure, etc.)
*
* This script is ONLY for cover pages. Do NOT use it on:
* - Multi-page documents (use html2pdf-next.js pre-render checks)
* - Posters (use html2poster.js which handles overflow automatically)
*/
'use strict';
const fs = require('fs');
const path = require('path');
// ── Playwright import ──
// Load the Playwright API, preferring the full `playwright` package and
// falling back to `playwright-core`. Exits with code 2 when neither resolves.
let playwright;
for (const packageName of ['playwright', 'playwright-core']) {
  try {
    playwright = require(packageName);
    break;
  } catch {
    // Not loadable under this name — try the next candidate.
  }
}
if (!playwright) {
  console.error('✗ Neither playwright nor playwright-core is installed.');
  process.exit(2);
}
// ── Chromium resolution (shared logic with html2poster.js) ──
/**
 * Find a Chromium executable to launch.
 *
 * Resolution order:
 *   1. the path reported by `chromiumObj.executablePath()`, if it exists on disk
 *   2. `PLAYWRIGHT_CHROMIUM_PATH` (when set), then a list of well-known
 *      macOS / Linux install locations
 *
 * @param {{ executablePath: () => string }} chromiumObj - Playwright browser type.
 * @returns {{ status: 'ok' | 'fallback' | 'missing', executablePath: string }}
 *   `ok` = Playwright's own browser, `fallback` = a system browser,
 *   `missing` = nothing found (executablePath is the reported path or '').
 */
function resolveChromium(chromiumObj) {
  let bundledPath;
  try {
    bundledPath = chromiumObj.executablePath();
  } catch (_) {
    // executablePath() may throw when no browser build is registered.
    bundledPath = null;
  }
  if (bundledPath && fs.existsSync(bundledPath)) {
    return { status: 'ok', executablePath: bundledPath };
  }

  const systemPaths = [
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
    '/usr/bin/chromium-browser',
    '/usr/bin/chromium',
    '/usr/bin/google-chrome',
  ];
  // An explicit override is probed before the well-known locations.
  const override = process.env.PLAYWRIGHT_CHROMIUM_PATH;
  const searchOrder = override ? [override, ...systemPaths] : systemPaths;

  const found = searchOrder.find((candidate) => fs.existsSync(candidate));
  if (found !== undefined) {
    return { status: 'fallback', executablePath: found };
  }
  return { status: 'missing', executablePath: bundledPath || '' };
}
// ── CLI parsing ──
/**
 * Parse the command line.
 *
 * The first token that does not start with '-' becomes the input file; later
 * positionals and unknown options are ignored. `--help` / `-h` prints usage
 * and exits with code 0.
 *
 * @param {string[]} argv - Full argv (the first two entries are skipped).
 * @returns {{ input: string|null, width: string, height: string, minGap: number|null }}
 *   `minGap` is the parseFloat of the `--min-gap` value (NaN if not numeric).
 */
function parseArgs(argv) {
  const args = argv.slice(2);
  const parsed = { input: null, width: '210mm', height: '297mm', minGap: null };

  let pos = 0;
  while (pos < args.length) {
    const arg = args[pos];
    switch (arg) {
      case '--width':
        pos += 1; // consume the option's value
        parsed.width = args[pos];
        break;
      case '--height':
        pos += 1;
        parsed.height = args[pos];
        break;
      case '--min-gap':
        pos += 1;
        parsed.minGap = parseFloat(args[pos]);
        break;
      case '--help':
      case '-h':
        console.log(`Usage: node cover_validate.js <cover.html> [options]
Options:
--width <val> Page width (default: 210mm)
--height <val> Page height (default: 297mm)
--min-gap <px> Minimum gap between text and decorative lines (default: 5% of width)
--help Show this help`);
        process.exit(0);
        break;
      default:
        // Only the first positional argument is kept as the input path.
        if (!arg.startsWith('-') && !parsed.input) {
          parsed.input = arg;
        }
    }
    pos += 1;
  }

  return parsed;
}
// ── Convert CSS dimension string to px for viewport ──
/**
 * Convert a CSS length to whole CSS pixels (96 px per inch).
 *
 * Accepted forms: a bare number (treated as px) or a number followed by one of
 * px, mm, cm, in, pt, pc (case-insensitive). Anything else — a missing value,
 * a non-numeric string, or an unsupported unit such as `%` or `em` — yields
 * null so the caller can fall back to its default page size.
 *
 * @param {string|number|null|undefined} dim - Dimension, e.g. '210mm' or 720.
 * @returns {number|null} Rounded pixel value, or null when it cannot be parsed.
 */
function dimToPx(dim) {
  if (!dim) return null;

  const match = /^([+-]?(?:\d+\.?\d*|\.\d+))\s*([a-z%]*)$/i.exec(String(dim).trim());
  if (!match) return null;

  // A Map avoids accidental hits on Object.prototype keys such as "constructor".
  const pxPerUnit = new Map([
    ['', 1],
    ['px', 1],
    ['mm', 96 / 25.4],
    ['cm', 96 / 2.54],
    ['in', 96],
    ['pt', 96 / 72],
    ['pc', 16],
  ]);
  const factor = pxPerUnit.get(match[2].toLowerCase());
  if (factor === undefined) return null;

  const px = Math.round(Number.parseFloat(match[1]) * factor);
  return Number.isFinite(px) ? px : null;
}
// ── Decorative line detection heuristics ──
// Browser-side detector, kept as source text so it can be evaluated in the page
// as `(<source>)(<minGapPx>)`.
//
// A visible element is classified as a decorative line when ANY of these holds:
//   - it is an <hr>
//   - it is thin: ≤ 5px in one dimension and > 20px in the other
//   - it has an extreme aspect ratio: > 15:1 with the short side ≤ 8px
//   - it is an empty, border-only element whose width or height is ≤ 8px
// Any other visible element that is a known text tag, or has a direct non-blank
// text node, and is at least 3px tall is treated as text.
//
// The function returns { textElements, lineElements, overlaps }: two counts plus
// one entry per line that has text closer than minGapPx. Each line is reported
// at most once, against the smallest-area text element that violates the gap.
const DECORATIVE_LINE_DETECTION = `
(function detectOverlaps(minGapPx) {
  // className is an SVGAnimatedString (an object) on SVG elements such as
  // <line> or <rect>; always hand back a plain string so callers can split it.
  function classNameOf(el) {
    if (typeof el.className === 'string') return el.className;
    return (el.getAttribute && el.getAttribute('class')) || '';
  }

  const textTags = ['h1','h2','h3','h4','h5','h6','p','span','a','li','td','th','label','summary'];
  const allElements = document.querySelectorAll('*');
  const textElements = [];
  const lineElements = [];

  // Classify every rendered element as line, text, or neither.
  for (const el of allElements) {
    const rect = el.getBoundingClientRect();
    if (rect.width === 0 || rect.height === 0) continue;
    const tag = el.tagName.toLowerCase();
    const style = getComputedStyle(el);
    // Skip invisible elements
    if (style.display === 'none' || style.visibility === 'hidden' || style.opacity === '0') continue;

    // Detect decorative lines
    const isHR = tag === 'hr';
    const isThinH = rect.height <= 5 && rect.width > 20; // thin horizontal line
    const isThinV = rect.width <= 5 && rect.height > 20; // thin vertical line
    const aspectH = rect.width / rect.height;
    const aspectV = rect.height / rect.width;
    const isWideRatio = aspectH > 15 && rect.height <= 8; // very wide, very thin
    const isTallRatio = aspectV > 15 && rect.width <= 8; // very tall, very thin
    // Empty element drawn purely by its border (no text, no background image)
    const hasOnlyBorder = (
      el.textContent.trim() === '' &&
      style.backgroundImage === 'none' &&
      (style.borderTopWidth !== '0px' || style.borderBottomWidth !== '0px' ||
       style.borderLeftWidth !== '0px' || style.borderRightWidth !== '0px')
    );
    const isBorderLine = hasOnlyBorder && (rect.height <= 8 || rect.width <= 8);

    if (isHR || isThinH || isThinV || isWideRatio || isTallRatio || isBorderLine) {
      let type;
      if (isThinH || isWideRatio) {
        type = 'horizontal';
      } else if (isThinV || isTallRatio) {
        type = 'vertical';
      } else {
        // <hr> / border-only boxes: orientation follows the longer side.
        type = rect.width >= rect.height ? 'horizontal' : 'vertical';
      }
      lineElements.push({
        tag: tag,
        class: classNameOf(el),
        rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height },
        type: type,
      });
      continue;
    }

    // Detect text elements (has direct text content or is a heading/paragraph)
    const hasDirectText = Array.from(el.childNodes).some(n => n.nodeType === 3 && n.textContent.trim());
    if (textTags.includes(tag) || hasDirectText) {
      // Boxes under 3px tall are too small to be readable text.
      if (rect.height < 3) continue;
      textElements.push({
        tag: tag,
        class: classNameOf(el),
        text: el.textContent.trim().substring(0, 60),
        rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height },
      });
    }
  }

  // Smallest text boxes first, so that when a parent and a child both violate
  // the gap against the same line, the more specific (smaller) one is reported.
  textElements.sort((a, b) => (a.rect.width * a.rect.height) - (b.rect.width * b.rect.height));

  const overlaps = [];
  const reportedLines = new Set(); // one report per line position

  for (const text of textElements) {
    for (const line of lineElements) {
      const tr = text.rect;
      const lr = line.rect;

      if (line.type === 'horizontal') {
        // Only text sharing some X range with the line can collide with it.
        const xOverlap = !(tr.x + tr.width < lr.x || lr.x + lr.width < tr.x);
        if (!xOverlap) continue;

        const textTop = tr.y;
        const textBottom = tr.y + tr.height;
        const lineTop = lr.y;
        const lineBottom = lr.y + lr.height;
        let vGap;
        if (lineTop >= textBottom) {
          vGap = lineTop - textBottom; // line is below text
        } else if (textTop >= lineBottom) {
          vGap = textTop - lineBottom; // line is above text
        } else {
          vGap = 0; // overlapping
        }

        if (vGap < minGapPx) {
          const lineKey = 'h:' + Math.round(lr.x) + ',' + Math.round(lr.y);
          if (!reportedLines.has(lineKey)) {
            reportedLines.add(lineKey);
            overlaps.push({
              text: text.text,
              textTag: text.tag,
              textClass: text.class,
              textRect: tr,
              lineTag: line.tag,
              lineClass: line.class,
              lineRect: lr,
              lineType: line.type,
              gap: Math.round(vGap * 10) / 10,
              required: minGapPx,
            });
          }
        }
      } else if (line.type === 'vertical') {
        // Only text sharing some Y range with the line can collide with it.
        const yOverlap = !(tr.y + tr.height < lr.y || lr.y + lr.height < tr.y);
        if (!yOverlap) continue;

        const textLeft = tr.x;
        const textRight = tr.x + tr.width;
        const lineLeft = lr.x;
        const lineRight = lr.x + lr.width;
        let hGap;
        if (lineLeft >= textRight) {
          hGap = lineLeft - textRight; // line is right of text
        } else if (textLeft >= lineRight) {
          hGap = textLeft - lineRight; // line is left of text
        } else {
          hGap = 0; // overlapping
        }

        if (hGap < minGapPx) {
          const lineKey = 'v:' + Math.round(lr.x) + ',' + Math.round(lr.y);
          if (!reportedLines.has(lineKey)) {
            reportedLines.add(lineKey);
            overlaps.push({
              text: text.text,
              textTag: text.tag,
              textClass: text.class,
              textRect: tr,
              lineTag: line.tag,
              lineClass: line.class,
              lineRect: lr,
              lineType: line.type,
              gap: Math.round(hGap * 10) / 10,
              required: minGapPx,
            });
          }
        }
      }
    }
  }

  return {
    textElements: textElements.length,
    lineElements: lineElements.length,
    overlaps: overlaps,
  };
})
`;
// ── Main ──
/**
 * Entry point: render the cover HTML in Chromium, run the in-page overlap
 * detector and report the findings.
 *
 * Terminates the process with:
 *   0 — no text/line spacing violations
 *   1 — at least one violation (details on stderr)
 *   2 — usage or environment error (no input, missing file, no browser, launch failure)
 *
 * The exit code is only applied after the browser has been closed; calling
 * process.exit() inside the try block would skip the `finally` cleanup.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // Required locally so this function carries its own dependency.
  const { pathToFileURL } = require('url');

  const { input, width, height, minGap } = parseArgs(process.argv);
  if (!input) {
    console.error('✗ No input file specified. Usage: node cover_validate.js cover.html');
    process.exit(2);
  }
  const absIn = path.resolve(input);
  if (!fs.existsSync(absIn)) {
    console.error(`✗ File not found: ${absIn}`);
    process.exit(2);
  }

  const widthPx = dimToPx(width) || 794; // A4 width in px
  const heightPx = dimToPx(height) || 1123; // A4 height in px
  // A missing, zero or non-numeric --min-gap falls back to 1U = 5% of page width.
  const gap = minGap || Math.round(widthPx * 0.05);

  console.log(`🔍 cover_validate — Cover overlap detection`);
  console.log(` Input: ${absIn}`);
  console.log(` Page: ${widthPx}×${heightPx}px`);
  console.log(` Min gap: ${gap}px (1U)`);

  const { chromium } = playwright;
  const bInfo = resolveChromium(chromium);
  if (bInfo.status === 'missing') {
    console.error('✗ No Chromium found. Install via: npx playwright install chromium');
    process.exit(2);
  }

  let browser;
  try {
    const opts = { headless: true };
    // Playwright locates its own bundled browser; only a system fallback needs a path.
    if (bInfo.status === 'fallback') opts.executablePath = bInfo.executablePath;
    browser = await chromium.launch(opts);
  } catch (err) {
    console.error(`✗ Browser launch failed: ${err.message}`);
    process.exit(2);
  }

  let exitCode = 0;
  try {
    const page = await browser.newPage({ viewport: { width: widthPx, height: heightPx } });
    // pathToFileURL percent-encodes spaces, '#', '?' etc. and handles Windows
    // drive letters, which plain 'file://' + path concatenation does not.
    await page.goto(pathToFileURL(absIn).href, { waitUntil: 'networkidle' });
    console.log(` ✓ HTML loaded`);

    // Wait for fonts so the measured text boxes match the final layout.
    const fontsLoaded = await page.evaluate(() =>
      document.fonts.ready.then(() => document.fonts.size)
    ).catch(() => 0);
    console.log(` ✓ Fonts: ${fontsLoaded} loaded`);

    // Run overlap detection inside the page.
    const result = await page.evaluate(`(${DECORATIVE_LINE_DETECTION})(${gap})`);
    console.log(` ✓ Found ${result.textElements} text elements, ${result.lineElements} decorative lines`);

    if (result.overlaps.length === 0) {
      console.log(`\n ✅ No overlap issues found`);
    } else {
      exitCode = 1;
      console.error(`\n ❌ Found ${result.overlaps.length} text-line overlap(s):\n`);
      for (const o of result.overlaps) {
        // The gap is measured across the line: vertically for a horizontal
        // line, horizontally for a vertical one — report along that same axis.
        const isVerticalLine = o.lineType === 'vertical';
        const direction = isVerticalLine ? 'horizontal' : 'vertical';
        const axis = isVerticalLine ? 'x' : 'y';
        const span = (r) => (isVerticalLine
          ? `${Math.round(r.x)}-${Math.round(r.x + r.width)}`
          : `${Math.round(r.y)}-${Math.round(r.y + r.height)}`);
        // The class may not be a plain string (e.g. for SVG elements); guard before splitting.
        const firstClass = typeof o.lineClass === 'string'
          ? o.lineClass.trim().split(/\s+/)[0]
          : '';
        console.error(` ERROR: ${direction} gap = ${o.gap}px (required ≥ ${o.required}px)`);
        console.error(` Text: <${o.textTag}> "${o.text}" @ ${axis}=${span(o.textRect)}`);
        console.error(` Line: <${o.lineTag}${firstClass ? '.' + firstClass : ''}> [${o.lineType}] @ ${axis}=${span(o.lineRect)}`);
        console.error(` Fix: Move the decorative line at least ${Math.ceil(o.required - o.gap)}px away from the text.`);
        console.error('');
      }
    }
  } finally {
    await browser.close();
  }
  process.exit(exitCode);
}
// Anything main() did not handle itself is a script error: report it, exit 2.
main().catch((error) => {
  const detail = error.message;
  console.error(`✗ Unexpected error: ${detail}`);
  process.exit(2);
});

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,754 @@
#!/usr/bin/env node
/**
* html2pdf-next.js — HTML → PDF converter using Playwright + pdf-lib
*
* Drop-in replacement for html2pdf.js, WITHOUT Paged.js dependency.
* Uses Chromium native @page CSS for pagination + pdf-lib for post-processing.
*
* Usage:
* node html2pdf-next.js input.html
* node html2pdf-next.js input.html --output result.pdf
* node html2pdf-next.js input.html --css extra.css
* node html2pdf-next.js input.html --width 720px --height 960px
* node html2pdf-next.js input.html --direct (same as default now — no Paged.js to skip)
* node html2pdf-next.js input.html --merge a.pdf b.pdf (merge additional PDFs after)
*
* Architecture:
* 1. Playwright renders HTML → raw PDF via Chromium's native print engine
* 2. Pre-render hooks: Mermaid, KaTeX, oversized element fixes
* 3. Post-render: pdf-lib for merge, metadata, page count extraction
* 4. No Paged.js, no paged.polyfill.js — CSS @page handles pagination natively
*/
const fs = require('fs');
const path = require('path');
const { execSync, spawnSync } = require('child_process');
/** Resolve (with no value) after `ms` milliseconds. */
const sleep = (ms) => new Promise((resolve) => {
  setTimeout(resolve, ms);
});
// ═══════════════════════════════════════════════════════════════════
// Playwright / Chromium resolution (self-contained, no external helper)
// ═══════════════════════════════════════════════════════════════════
/**
 * Load the `playwright` module.
 *
 * Tries a normal `require` first. If that fails, looks for a `playwright`
 * package directory under each of: `PLAYWRIGHT_PATH`, every entry of
 * `NODE_PATH`, and the directory printed by `npm root -g`, and requires it
 * through a `createRequire` anchored at that package's package.json.
 *
 * @returns {object} The Playwright module.
 * @throws {Error} When no candidate location yields a loadable module.
 */
function loadPlaywright() {
  // Try direct require first
  try {
    return require('playwright');
  } catch (_) {
    // Not resolvable from here — search the extra roots below.
  }

  const { createRequire } = require('module');
  const { PLAYWRIGHT_PATH, NODE_PATH } = process.env;
  const searchRoots = new Set();

  if (PLAYWRIGHT_PATH) searchRoots.add(PLAYWRIGHT_PATH);
  if (NODE_PATH) {
    for (const entry of NODE_PATH.split(path.delimiter)) {
      if (entry) searchRoots.add(entry);
    }
  }
  try {
    const globalRoot = execSync('npm root -g', { stdio: ['ignore', 'pipe', 'ignore'] }).toString().trim();
    if (globalRoot) searchRoots.add(globalRoot);
  } catch (_) {
    // npm unavailable or failed — continue with whatever roots we have.
  }

  for (const root of searchRoots) {
    const manifest = path.join(root, 'playwright', 'package.json');
    if (fs.existsSync(manifest)) {
      try {
        return createRequire(manifest)('playwright');
      } catch (_) {
        // Present but not loadable — try the next root.
      }
    }
  }
  throw new Error('Playwright not found. Install: npm install -g playwright');
}
/**
 * Load the `pdf-lib` module, first via a normal `require`, then from the
 * global npm root (as reported by `npm root -g`).
 *
 * @returns {object} The pdf-lib module.
 * @throws {Error} When pdf-lib cannot be loaded from either location.
 */
function loadPdfLib() {
  try {
    return require('pdf-lib');
  } catch (_) {
    // Not resolvable locally — fall back to the global install.
  }

  const { createRequire } = require('module');
  try {
    const globalRoot = execSync('npm root -g', { stdio: ['ignore', 'pipe', 'ignore'] }).toString().trim();
    const manifest = path.join(globalRoot, 'pdf-lib', 'package.json');
    if (fs.existsSync(manifest)) {
      return createRequire(manifest)('pdf-lib');
    }
  } catch (_) {
    // npm missing or the global copy failed to load — report below.
  }
  throw new Error('pdf-lib not found. Install: npm install -g pdf-lib');
}
/**
 * Find a Chromium executable, optionally installing one.
 *
 * Resolution order:
 *   1. the path reported by `chromiumObj.executablePath()`, if it exists
 *   2. `PLAYWRIGHT_CHROMIUM_PATH` (when set), then well-known macOS / Linux paths
 *   3. when `allowInstall` is true: run `npx playwright install chromium`
 *      and re-check the reported path
 *
 * @param {{ executablePath: () => string }} chromiumObj - Playwright browser type.
 * @param {boolean} [allowInstall=false] - Permit the install step.
 * @returns {{ status: 'ok' | 'fallback' | 'installed' | 'missing', executablePath: string }}
 */
function resolveChromium(chromiumObj, allowInstall = false) {
  let bundledPath;
  try {
    bundledPath = chromiumObj.executablePath();
  } catch (_) {
    bundledPath = null;
  }
  if (bundledPath && fs.existsSync(bundledPath)) {
    return { status: 'ok', executablePath: bundledPath };
  }

  // Try system Chrome/Chromium
  const systemPaths = [
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
    '/usr/bin/chromium-browser',
    '/usr/bin/chromium',
    '/usr/bin/google-chrome',
  ];
  const override = process.env.PLAYWRIGHT_CHROMIUM_PATH;
  const searchOrder = override ? [override, ...systemPaths] : systemPaths;
  const found = searchOrder.find((candidate) => fs.existsSync(candidate));
  if (found !== undefined) {
    return { status: 'fallback', executablePath: found };
  }

  if (allowInstall) {
    const install = spawnSync('npx', ['playwright', 'install', 'chromium'], { stdio: 'inherit', shell: true });
    if (install.status === 0) {
      try {
        bundledPath = chromiumObj.executablePath();
      } catch (_) {
        // Keep the previously reported path (if any).
      }
      if (bundledPath && fs.existsSync(bundledPath)) {
        return { status: 'installed', executablePath: bundledPath };
      }
    }
  }
  return { status: 'missing', executablePath: bundledPath || '' };
}
// ═══════════════════════════════════════════════════════════════════
// CLI
// ═══════════════════════════════════════════════════════════════════
/**
 * Parse `process.argv` for html2pdf-next.
 *
 * The first token is the input HTML file; the remaining tokens are options.
 * With no arguments, or with -h/--help as the first token, prints usage and
 * exits with code 0. Unknown options are ignored.
 *
 * `--merge` collects every following token up to the next option. Both long
 * (`--x`) and short (`-o`) options end the list, so
 * `--merge a.pdf b.pdf -o out.pdf` yields two merge files and an output path.
 * A file whose name starts with '-' can be passed as `./-name.pdf`.
 *
 * @returns {{ inputFile: string, outputFile: string, customCSS: string|null,
 *   width: string|null, height: string|null, mergeFiles: string[], title: string|null }}
 *   `outputFile` defaults to the input path with a `.pdf` extension.
 */
function cli() {
  const tokens = process.argv.slice(2);
  if (!tokens.length || tokens[0] === '-h' || tokens[0] === '--help') {
    console.log(`
Usage: node html2pdf-next.js <input.html> [options]
Options:
--output, -o <file> Output PDF path (default: <input>.pdf)
--css <file> Inject extra stylesheet
--width <px> Custom page width (e.g. 720px)
--height <px> Custom page height (e.g. 960px)
--direct (no-op, kept for backward compat — always direct now)
--merge <files...> Append additional PDF files after conversion
--title <text> Set PDF document title metadata
--help, -h Show help
`);
    process.exit(0);
  }

  const inputFile = tokens[0];
  let outputFile = null;
  let customCSS = null;
  let width = null;
  let height = null;
  let title = null;
  const mergeFiles = [];

  for (let i = 1; i < tokens.length; i++) {
    const t = tokens[i];
    if (t === '--output' || t === '-o') outputFile = tokens[++i];
    else if (t === '--css') customCSS = tokens[++i];
    else if (t === '--width') width = tokens[++i];
    else if (t === '--height') height = tokens[++i];
    else if (t === '--direct') { /* no-op, always direct */ }
    else if (t === '--title') title = tokens[++i];
    else if (t === '--merge') {
      // Stop at ANY option token; checking only for '--' would swallow a
      // following short option such as `-o out.pdf` into the merge list.
      while (i + 1 < tokens.length && !tokens[i + 1].startsWith('-')) {
        mergeFiles.push(tokens[++i]);
      }
    }
  }

  if (!outputFile) {
    // Default: same directory and base name as the input, with .pdf.
    const p = path.parse(inputFile);
    outputFile = path.join(p.dir || '.', p.name + '.pdf');
  }
  return { inputFile, outputFile, customCSS, width, height, mergeFiles, title };
}
// ═══════════════════════════════════════════════════════════════════
// Helpers
// ═══════════════════════════════════════════════════════════════════
/**
 * Format a byte count with one decimal place and a binary unit (B, KB, MB, GB).
 * Values of 1024 GB or more stay in GB.
 *
 * @param {number} n - Size in bytes.
 * @returns {string} e.g. "1.5 KB".
 */
function prettyBytes(n) {
  const labels = ['B', 'KB', 'MB', 'GB'];
  const lastIndex = labels.length - 1;
  let value = n;
  let index = 0;
  for (; value >= 1024 && index < lastIndex; index += 1) {
    value /= 1024;
  }
  return `${value.toFixed(1)} ${labels[index]}`;
}
// ═══════════════════════════════════════════════════════════════════
// Pre-render hooks (run in browser context before PDF export)
// ═══════════════════════════════════════════════════════════════════
/**
 * Prepare a loaded page for PDF export. Runs, in order:
 *   1.  wait for Mermaid diagrams to render (up to 30s)
 *   2.  load KaTeX from a CDN if raw math is present without the library,
 *       then trigger math rendering
 *   3.  drop `break-inside: avoid` on elements taller than 1000px
 *   4.  report elements whose content overflows their box
 *   4b. un-clip page-level containers that hide vertical overflow
 *   4c. move text-bearing `position:absolute; bottom:…` elements into flow
 *   5.  add a default `@page { margin: 20mm }` when no <style> has an @page rule
 *   6.  make 100vh / cover sections fill exactly one printed page
 *
 * Every callback passed to page.evaluate() is serialised and executed in the
 * browser, so those callbacks must not reference variables from this scope.
 *
 * @param {object} page - Playwright Page with the document already loaded.
 * @returns {Promise<{ warnings: string[], contentHeight: number }>}
 *   Human-readable warnings and the document scroll height measured after 4b.
 */
async function preRenderHooks(page) {
  const warnings = [];

  // 1. Wait for Mermaid diagrams
  const hasMermaid = await page.evaluate(() => document.querySelectorAll('.mermaid').length > 0);
  if (hasMermaid) {
    console.log(' ⏳ Waiting for Mermaid diagrams...');
    try {
      await page.waitForFunction(() => {
        for (const m of document.querySelectorAll('.mermaid'))
          if (!m.querySelector('svg') && !m.getAttribute('data-processed')) return false;
        return true;
      }, { timeout: 30000 });
      await sleep(2000);
      console.log(' ✓ Mermaid rendered');
    } catch (_) {
      warnings.push('Mermaid rendering timed out (30s)');
    }
  }

  // 2. Trigger KaTeX math rendering
  // Snapshot of the page's math state: is a KaTeX API present, is anything
  // already rendered, and does the visible text contain raw delimiters?
  const readKatexState = () => page.evaluate(() => ({
    lib: typeof renderMathInElement === 'function' || typeof katex !== 'undefined',
    rendered: document.querySelectorAll('.katex').length > 0,
    raw: /\$[^$]+\$|\$\$[^$]+\$\$|\\\(.*?\\\)|\\\[.*?\\\]/.test(document.body.innerText),
  }));

  const katexStatus = await readKatexState();
  // Auto-inject KaTeX CDN if raw math detected but library not loaded
  if (!katexStatus.lib && katexStatus.raw && !katexStatus.rendered) {
    console.log(' ⏳ Auto-injecting KaTeX CDN (math formulas detected but KaTeX not loaded)...');
    await page.addStyleTag({ url: 'https://cdn.jsdelivr.net/npm/katex@0.16.22/dist/katex.min.css' });
    await page.addScriptTag({ url: 'https://cdn.jsdelivr.net/npm/katex@0.16.22/dist/katex.min.js' });
    await page.addScriptTag({ url: 'https://cdn.jsdelivr.net/npm/katex@0.16.22/dist/contrib/auto-render.min.js' });
    await sleep(2000); // Wait for CDN scripts to load
    // Re-check
    const recheckLib = await page.evaluate(() => typeof renderMathInElement === 'function');
    if (recheckLib) {
      console.log(' ✓ KaTeX CDN loaded successfully');
    } else {
      console.log(' ⚠ KaTeX CDN failed to load — math will render as raw text');
      warnings.push('KaTeX CDN injection failed; math formulas may appear as raw LaTeX code');
    }
  }

  // Re-evaluate after potential CDN injection
  const katexReady = await readKatexState();
  if (katexReady.lib && !katexReady.rendered && katexReady.raw) {
    console.log(' ⏳ Triggering KaTeX rendering...');
    await page.evaluate(() => {
      if (typeof renderMathInElement === 'function')
        renderMathInElement(document.body, {
          delimiters: [
            { left: '$$', right: '$$', display: true },
            { left: '$', right: '$', display: false },
            { left: '\\(', right: '\\)', display: false },
            { left: '\\[', right: '\\]', display: true },
          ],
          throwOnError: false,
        });
    });
    await sleep(1000);
    console.log(' ✓ KaTeX rendered');
  } else if (katexReady.rendered) {
    await sleep(500); // Font loading settle
  }

  // 3. Fix oversized elements that prevent page breaks
  const nFixed = await page.evaluate(() => {
    const LIMIT = 1000;
    let n = 0;
    document.querySelectorAll(
      '[style*="page-break-inside: avoid"],[style*="break-inside: avoid"],' +
      '.avoid-break,table,figure,.theorem,.algorithm'
    ).forEach(el => {
      if (el.getBoundingClientRect().height > LIMIT) {
        el.style.pageBreakInside = 'auto';
        el.style.breakInside = 'auto';
        n++;
      }
    });
    return n;
  });
  if (nFixed) {
    console.log(` ⚠ Fixed ${nFixed} oversized elements (removed break-inside: avoid)`);
  }

  // 4. Detect overflow (horizontal AND vertical)
  const overflows = await page.evaluate(() => {
    const out = [];
    document.querySelectorAll('pre,table,figure,img,svg,.mermaid,blockquote,.equation').forEach(el => {
      const hDiff = el.scrollWidth - el.clientWidth;
      const vDiff = el.scrollHeight - el.clientHeight;
      // 2px tolerance absorbs sub-pixel rounding.
      if (hDiff > 2 || vDiff > 2) out.push({
        tag: el.tagName.toLowerCase(),
        cls: el.className || '',
        hOverflow: hDiff > 2 ? hDiff : 0,
        vOverflow: vDiff > 2 ? vDiff : 0,
        preview: (el.textContent || '').slice(0, 50).replace(/\s+/g, ' '),
      });
    });
    return out;
  });
  if (overflows.length) {
    console.log(' ⚠ Overflow detected:');
    overflows.forEach(o => {
      const parts = [];
      if (o.hOverflow) parts.push(`H +${o.hOverflow}px`);
      if (o.vOverflow) parts.push(`V +${o.vOverflow}px`);
      console.log(` <${o.tag}${o.cls ? '.' + o.cls.split(' ')[0] : ''}> ${parts.join(', ')}`);
    });
    warnings.push(`${overflows.length} element(s) have overflow`);
  }

  // 4b. Fix vertical overflow on page-level containers
  // When html/body or the main content canvas has a fixed height + overflow:hidden,
  // content gets clipped. For documents (html2pdf-next.js), we DON'T expand the
  // container to its scrollHeight — that creates an oversized single "page" that
  // Playwright splits unevenly. Instead, we remove the fixed height and overflow:hidden
  // so content flows naturally and @page CSS handles pagination.
  //
  // (The old "expand to scrollHeight" logic belongs in html2poster.js where a single
  // continuous canvas is the desired output.)
  const vOverflowFix = await page.evaluate(() => {
    const fixes = [];
    // Candidates: html, body, and any direct child of body that acts as a full-page canvas
    const candidates = [document.documentElement, document.body];
    const bodyChildren = document.body.children;
    for (let i = 0; i < bodyChildren.length; i++) {
      const child = bodyChildren[i];
      // Skip SVG defs, script, style elements
      const tag = child.tagName.toLowerCase();
      if (tag === 'svg' || tag === 'script' || tag === 'style' || tag === 'link') continue;
      candidates.push(child);
      // Also check one level deeper (e.g., .canvas > .content)
      for (let j = 0; j < child.children.length; j++) {
        const grandchild = child.children[j];
        const gtag = grandchild.tagName.toLowerCase();
        if (gtag === 'svg' || gtag === 'script' || gtag === 'style') continue;
        candidates.push(grandchild);
      }
    }
    for (const el of candidates) {
      const computed = getComputedStyle(el);
      // Test the Y axis specifically: the `overflow` shorthand serialises as
      // two values (e.g. "auto hidden") when the axes differ, which would
      // never equal 'hidden' and so would hide a vertically clipping element.
      const overflowY = computed.overflowY || computed.overflow;
      const hasHiddenOverflow = overflowY === 'hidden' || overflowY === 'clip';
      const diff = el.scrollHeight - el.clientHeight;
      if (hasHiddenOverflow && diff > 5) {
        // This element is clipping content vertically
        const tag = el.tagName.toLowerCase();
        const id = el.id ? `#${el.id}` : '';
        const cls = el.className ? `.${String(el.className).split(' ')[0]}` : '';
        const selector = `${tag}${id}${cls}`;
        // Measure before mutating so the report shows the original height.
        const oldHeight = el.clientHeight;
        // Document mode: remove fixed height + overflow:hidden,
        // let @page handle natural pagination
        el.style.height = 'auto';
        el.style.minHeight = 'auto';
        el.style.maxHeight = 'none';
        el.style.overflow = 'visible';
        el.style.overflowY = 'visible';
        fixes.push({
          selector,
          oldHeight,
          clipped: diff,
        });
      }
    }
    // After fixing containers, re-measure to get the final content height
    const finalHeight = Math.max(
      document.documentElement.scrollHeight,
      document.body.scrollHeight
    );
    return { fixes, finalHeight };
  });
  if (vOverflowFix.fixes.length) {
    console.log(' ⚠️ Removed fixed height + overflow:hidden — content will paginate naturally:');
    vOverflowFix.fixes.forEach(f => {
      console.log(` ${f.selector}: was ${f.oldHeight}px with ${f.clipped}px clipped → now auto (content will flow to next page)`);
    });
  }

  // 4c. Convert absolute-bottom elements to document flow
  // Elements with `position: absolute; bottom: Npx` inside page containers
  // are pinned relative to their containing block. When content paginates
  // across multiple @page pages, these elements either overlap with body
  // text or land on the wrong page. Fix: convert them to static positioning
  // so they participate in normal document flow and paginate naturally.
  const absBottomFix = await page.evaluate(() => {
    const converted = [];
    // Scan inside page-level containers (body children and their children)
    const containers = [];
    for (let i = 0; i < document.body.children.length; i++) {
      const child = document.body.children[i];
      const tag = child.tagName.toLowerCase();
      if (tag === 'svg' || tag === 'script' || tag === 'style' || tag === 'link') continue;
      containers.push(child);
    }
    for (const container of containers) {
      const descendants = container.querySelectorAll('*');
      for (const el of descendants) {
        const computed = getComputedStyle(el);
        if (computed.position === 'absolute' && computed.bottom !== 'auto' && computed.bottom !== '') {
          // Check if this element contains visible text (not just decorative)
          const hasText = el.textContent && el.textContent.trim().length > 0;
          if (!hasText) continue;
          const tag = el.tagName.toLowerCase();
          const id = el.id ? `#${el.id}` : '';
          const cls = el.className ? `.${String(el.className).split(' ')[0]}` : '';
          const selector = `${tag}${id}${cls}`;
          // getComputedStyle() returns a LIVE object: read the original value
          // now, before the style changes below, or the report would show the
          // post-change value instead of what the page author wrote.
          const originalBottom = computed.bottom;
          // Convert to static flow: remove absolute positioning
          el.style.position = 'static';
          el.style.bottom = 'auto';
          el.style.left = 'auto';
          el.style.right = 'auto';
          // Existing padding/margin on the element is left untouched.
          converted.push({ selector, bottom: originalBottom });
        }
      }
    }
    return converted;
  });
  if (absBottomFix.length) {
    console.log(' ⚠️ Converted absolute-bottom elements to document flow (prevents overlap on multi-page):');
    absBottomFix.forEach(f => {
      console.log(` ${f.selector}: was position:absolute;bottom:${f.bottom} → now static (flows with content)`);
    });
  }

  // 5. Inject minimal @page CSS fallback
  // Only inline <style> elements are inspected here.
  await page.evaluate(() => {
    const styles = Array.from(document.querySelectorAll('style'));
    const hasPageRule = styles.some(s => (s.textContent || '').includes('@page'));
    if (!hasPageRule) {
      const s = document.createElement('style');
      s.textContent = `@page { margin: 20mm; }`;
      document.head.appendChild(s);
    }
  });

  // 6. Fix full-page cover sections for print
  // In screen mode, height:100vh = viewport height. In print mode, 100vh ≠ page height.
  // Detect elements using 100vh and convert to print-safe page-filling behavior.
  const coverFixed = await page.evaluate(() => {
    let fixed = 0;
    // Find elements with height: 100vh (inline or computed)
    const allEls = document.querySelectorAll('*');
    for (const el of allEls) {
      const style = el.style;
      const computed = getComputedStyle(el);
      // NOTE(review): computed heights are normally reported in px, so the
      // `computed.* === '100vh'` comparisons may never match — confirm.
      const isVh = style.height === '100vh' || computed.height === '100vh' ||
        style.minHeight === '100vh' || computed.minHeight === '100vh';
      // Also detect via class name hints
      const isCover = el.classList.contains('cover') || el.classList.contains('cover-page') ||
        el.id === 'cover' || el.getAttribute('data-role') === 'cover';
      if (isVh || (isCover && el.offsetHeight > 0)) {
        // Force the element to fill the print page
        el.style.height = '100vh';
        el.style.minHeight = '100vh';
        el.style.pageBreakAfter = 'always';
        el.style.pageBreakInside = 'avoid';
        el.style.boxSizing = 'border-box';
        el.style.overflow = 'hidden';
        fixed++;
      }
    }
    // Inject print-specific CSS to make 100vh work correctly
    if (fixed > 0) {
      const s = document.createElement('style');
      s.textContent = `
@media print {
.cover, .cover-page, [data-role="cover"] {
height: 100vh !important;
min-height: 100vh !important;
page-break-after: always !important;
page-break-inside: avoid !important;
overflow: hidden !important;
}
}
`;
      document.head.appendChild(s);
    }
    return fixed;
  });
  if (coverFixed) {
    console.log(` ✓ Fixed ${coverFixed} full-page cover section(s) for print`);
    // Also inject named @page rule for cover with zero margins
    await page.evaluate(() => {
      const s = document.createElement('style');
      s.textContent = `
@page cover-page {
margin: 0 !important;
}
@media print {
.cover, .cover-page, [data-role="cover"] {
page: cover-page;
margin: 0 !important;
padding: 40px !important;
}
}
`;
      document.head.appendChild(s);
    });
  }

  return { warnings, contentHeight: vOverflowFix.finalHeight };
}
// ═══════════════════════════════════════════════════════════════════
// Content statistics (post-render, from PDF or page)
// ═══════════════════════════════════════════════════════════════════
/**
 * Gather rough content statistics from the rendered page.
 *
 * The callback runs inside the browser, so it is fully self-contained.
 *
 * @param {object} page - Playwright Page (anything exposing `evaluate(fn)`).
 * @returns {Promise<{ wordCount: number, figures: number, tables: number }>}
 *   `wordCount` = characters in U+4E00–U+9FA5 plus runs of ASCII letters in
 *   the body's innerText; `figures` counts `figure`, `.figure` and `img`.
 */
async function collectStats(page) {
  return page.evaluate(() => {
    const countMatches = (source, pattern) => {
      const hits = source.match(pattern);
      return hits ? hits.length : 0;
    };
    const countElements = (selector) => document.querySelectorAll(selector).length;

    const visibleText = document.body.innerText || '';
    const cjkCharacters = countMatches(visibleText, /[\u4e00-\u9fa5]/g);
    const latinWords = countMatches(visibleText, /[a-zA-Z]+/g);

    return {
      wordCount: cjkCharacters + latinWords,
      figures: countElements('figure,.figure,img'),
      tables: countElements('table'),
    };
  });
}
// ═══════════════════════════════════════════════════════════════════
// pdf-lib post-processing: page count, metadata, merge
// ═══════════════════════════════════════════════════════════════════
/**
 * Post-process a rendered PDF in place with pdf-lib: set metadata, append any
 * additional PDFs, and write the result back to `pdfPath`.
 *
 * @param {string} pdfPath - PDF file to read and overwrite.
 * @param {{ title?: string, mergeFiles?: string[] }} [options]
 *   `title` sets the document title; `mergeFiles` are appended in order and
 *   missing files are skipped with a console warning.
 * @returns {Promise<{ pageCount: number, originalPages: number }>}
 *   Page count after merging and the page count before merging.
 */
async function postProcess(pdfPath, options = {}) {
  const { PDFDocument } = loadPdfLib();
  const doc = await PDFDocument.load(fs.readFileSync(pdfPath));

  // Set metadata
  const { title, mergeFiles } = options;
  if (title) {
    doc.setTitle(title);
  }
  doc.setProducer('html2pdf-next (Playwright + pdf-lib)');
  doc.setCreationDate(new Date());

  // Captured before merging so callers can tell how many pages were appended.
  const originalPages = doc.getPageCount();

  // Merge additional PDFs
  const extras = mergeFiles && mergeFiles.length ? mergeFiles : [];
  for (const extraPath of extras) {
    if (!fs.existsSync(extraPath)) {
      console.log(` ⚠ Merge file not found: ${extraPath}`);
      continue;
    }
    console.log(` 📎 Merging: ${path.basename(extraPath)}`);
    const donorDoc = await PDFDocument.load(fs.readFileSync(extraPath));
    const copiedPages = await doc.copyPages(donorDoc, donorDoc.getPageIndices());
    for (const copied of copiedPages) {
      doc.addPage(copied);
    }
  }

  // Save
  fs.writeFileSync(pdfPath, await doc.save());
  return { pageCount: doc.getPageCount(), originalPages };
}
// ═══════════════════════════════════════════════════════════════════
// Main pipeline
// ═══════════════════════════════════════════════════════════════════
/**
 * Convert an HTML file into a PDF using Playwright's Chromium print engine,
 * then hand the result to the pdf-lib post-processing step.
 *
 * Steps: validate input → resolve/launch Chromium → (optionally) inject a
 * custom stylesheet via a temporary HTML copy → run pre-render hooks → print
 * (continuous-canvas, explicit size, CSS @page size, or A4 fallback) →
 * post-process → print a summary report.
 *
 * The process exits with a non-zero code on any failure. Unlike a bare
 * `process.exit()` inside the try block, failures here are recorded first so
 * that the browser is closed and the temporary HTML copy is removed before
 * the process terminates.
 *
 * @param {string} inputFile - Path to the source HTML file.
 * @param {string} outputFile - Path of the PDF to write.
 * @param {string} [customCSS] - Optional path to a CSS file injected into the page.
 * @param {object} [options]
 * @param {string} [options.width] - Explicit page width passed to page.pdf().
 * @param {string} [options.height] - Explicit page height passed to page.pdf().
 * @param {string[]} [options.mergeFiles] - Forwarded to postProcess().
 * @param {string} [options.title] - Forwarded to postProcess().
 * @returns {Promise<void>}
 */
async function convert(inputFile, outputFile, customCSS, options = {}) {
  // Local require: file paths must be turned into properly escaped file:// URLs.
  const { pathToFileURL } = require('url');
  const { width, height, mergeFiles, title } = options;
  if (!fs.existsSync(inputFile)) {
    console.error(`✗ File not found: ${inputFile}`);
    process.exit(1);
  }
  const playwright = loadPlaywright();
  const { chromium } = playwright;
  // Resolve browser
  const canInstall = process.env.PDF_SKIP_BROWSER_INSTALL !== '1';
  const bInfo = resolveChromium(chromium, canInstall);
  if (bInfo.status === 'missing') {
    console.error('\n✗ Chromium not found. Run: npx playwright install chromium\n');
    process.exit(2);
  }
  if (bInfo.status === 'fallback') {
    console.log(`⚠ Using fallback Chromium: ${bInfo.executablePath}`);
  }
  const absIn = path.resolve(inputFile);
  const absOut = path.resolve(outputFile);
  console.log(`\n🔄 Converting ${path.basename(inputFile)}...`);
  console.log(`   Engine: Playwright + Chromium native @page (no Paged.js)`);

  // Read and optionally inject CSS. Only a temp file created here is ever
  // deleted, so an unrelated "<input>.tmp.html" on disk is left alone.
  let tmpHtml = null;
  const removeTmpHtml = () => {
    if (tmpHtml && fs.existsSync(tmpHtml)) fs.unlinkSync(tmpHtml);
    tmpHtml = null;
  };
  let html = fs.readFileSync(absIn, 'utf-8');
  if (customCSS) {
    if (!fs.existsSync(customCSS)) {
      console.error(`✗ CSS file not found: ${customCSS}`);
      process.exit(1);
    }
    const tag = `<style>${fs.readFileSync(customCSS, 'utf-8')}</style>`;
    html = html.includes('</head>') ? html.replace('</head>', tag + '\n</head>') : tag + '\n' + html;
    // Write modified HTML next to the input so relative asset URLs still resolve.
    tmpHtml = absIn + '.tmp.html';
    fs.writeFileSync(tmpHtml, html);
  }

  // Launch browser
  let browser;
  try {
    const opts = { headless: true };
    if (bInfo.status === 'fallback') opts.executablePath = bInfo.executablePath;
    browser = await chromium.launch(opts);
  } catch (err) {
    const msg = err.message || '';
    if (msg.includes('shared libraries') || msg.includes('.so')) {
      console.error('\n✗ Missing system libraries. Run: npx playwright install-deps chromium\n');
    } else {
      console.error(`\n✗ Browser launch failed: ${msg}\n`);
    }
    removeTmpHtml();
    process.exit(1);
  }

  let failed = false;
  try {
    const page = await browser.newPage();
    const loadFile = tmpHtml || absIn;
    // pathToFileURL percent-encodes characters such as '#', '?' and '%' that
    // would otherwise be parsed as URL fragment/query/escape sequences.
    await page.goto(pathToFileURL(loadFile).href, { waitUntil: 'networkidle' });

    // ── Pre-render hooks ──
    console.log('\n📋 Pre-render checks:');
    const preRenderResult = await preRenderHooks(page);
    const warnings = preRenderResult.warnings;
    const measuredContentHeight = preRenderResult.contentHeight;

    // ── Detect continuous-canvas mode (design_engine.py output) ──
    const continuousInfo = await page.evaluate(() => {
      const el = document.querySelector('.continuous-canvas');
      if (!el) return null;
      const root = getComputedStyle(document.documentElement);
      return {
        width: root.getPropertyValue('--canvas-w').trim() || '720px',
        height: root.getPropertyValue('--canvas-h').trim() || '960px',
        pages: el.querySelectorAll('.page-section').length,
      };
    });

    if (continuousInfo) {
      // Creative PDF: seamless multi-page canvas
      console.log(`\n🎨 Continuous canvas: ${continuousInfo.pages} pages @ ${continuousInfo.width} × ${continuousInfo.height}`);
      await page.pdf({
        path: absOut,
        printBackground: true,
        margin: { top: 0, right: 0, bottom: 0, left: 0 },
        width: continuousInfo.width,
        height: continuousInfo.height,
      });
    } else {
      // Standard document
      console.log('\n📄 Rendering PDF...');
      const pdfOpts = {
        path: absOut,
        printBackground: true,
        preferCSSPageSize: true,
        tagged: true,
      };
      if (width || height) {
        if (width) pdfOpts.width = width;
        if (height) pdfOpts.height = height;
        pdfOpts.margin = { top: 0, right: 0, bottom: 0, left: 0 };
        console.log(`   Custom size: ${pdfOpts.width || 'auto'} × ${pdfOpts.height || 'auto'}`);
      } else {
        // No explicit size: check if @page CSS defines a fixed size (px units only)
        const pageSize = await page.evaluate(() => {
          const styles = Array.from(document.querySelectorAll('style'));
          for (const s of styles) {
            const text = s.textContent || '';
            const match = text.match(/@page\s*\{[^}]*size:\s*([\d.]+)px\s+([\d.]+)px/);
            if (match) return { width: parseFloat(match[1]), height: parseFloat(match[2]) };
          }
          return null;
        });
        if (pageSize) {
          // @page defines a fixed size — rely on preferCSSPageSize (already set above)
          // and only zero the margins here.
          pdfOpts.margin = { top: 0, right: 0, bottom: 0, left: 0 };
          console.log(`   @page size: ${pageSize.width}px × ${pageSize.height}px`);
          if (measuredContentHeight && measuredContentHeight > pageSize.height + 5) {
            const estPages = Math.ceil(measuredContentHeight / pageSize.height);
            console.log(`   Content height: ${measuredContentHeight}px → ~${estPages} pages`);
          }
        } else {
          pdfOpts.format = 'A4';
        }
      }
      await page.pdf(pdfOpts);
    }

    // Collect content stats from the page
    const stats = await collectStats(page);

    // ── pdf-lib post-processing ──
    console.log('\n🔧 Post-processing (pdf-lib):');
    const postResult = await postProcess(absOut, { mergeFiles, title });

    // ── Report ──
    const sz = fs.statSync(absOut).size;
    console.log('\n' + '═'.repeat(40));
    console.log('  PDF Generated Successfully');
    console.log('═'.repeat(40));
    console.log(`  File:   ${path.basename(absOut)}`);
    console.log(`  Pages:  ${postResult.pageCount}`);
    console.log(`  Size:   ${prettyBytes(sz)}`);
    console.log(`  Words:  ~${stats.wordCount.toLocaleString()}`);
    console.log(`  Assets: ${stats.figures} figures, ${stats.tables} tables`);
    console.log(`  Engine: Playwright (no Paged.js)`);
    console.log(`  Path:   ${absOut}`);
    if (mergeFiles && mergeFiles.length && postResult.pageCount > postResult.originalPages) {
      console.log(`  Merged: +${postResult.pageCount - postResult.originalPages} pages from ${mergeFiles.length} file(s)`);
    }
    if (warnings.length) {
      console.log('\n⚠ Warnings:');
      warnings.forEach(w => console.log(`  · ${w}`));
    }
    // Anomaly detection
    if (postResult.pageCount > 1 && stats.wordCount > 0) {
      const avgWordsPerPage = stats.wordCount / postResult.pageCount;
      if (avgWordsPerPage < 30) {
        console.log(`\n⚠ Low content density: ~${Math.round(avgWordsPerPage)} words/page (expected 100+)`);
      }
    }
  } catch (err) {
    console.error('\n✗ Conversion failed:', err.message);
    // Defer the exit until after cleanup; process.exit() here would skip `finally`.
    failed = true;
  } finally {
    try {
      await browser.close();
    } finally {
      removeTmpHtml();
    }
  }
  if (failed) process.exit(1);
}
// ═══════════════════════════════════════════════════════════════════
// Entry
// ═══════════════════════════════════════════════════════════════════
// Script entry point: parse the command line, run the conversion, and turn
// any thrown/rejected error into a one-line message plus exit code 1.
(async function entry() {
  const args = cli();
  const { inputFile, outputFile, customCSS, width, height, mergeFiles, title } = args;
  await convert(inputFile, outputFile, customCSS, { width, height, mergeFiles, title });
})().catch((err) => {
  console.error('Error:', err.message);
  process.exit(1);
});

256
skills/pdf/scripts/html2poster.js Executable file
View File

@@ -0,0 +1,256 @@
#!/usr/bin/env node
/**
* html2poster.js — Single-page poster/long-image HTML → PDF converter
*
* Purpose: Convert a fixed-width, dynamic-height HTML poster into a single-page
* vector PDF with zero margins. This script is PURPOSE-BUILT for posters and
* infographics — it does NOT handle multi-page documents, A4 pagination, or
* document-style margins. For those, use html2pdf-next.js.
*
* Usage:
* node html2poster.js poster.html
* node html2poster.js poster.html --output out.pdf
* node html2poster.js poster.html --width 720px
* node html2poster.js poster.html --width 720px --max-height 8000
*
* What it does (in order):
* 1. Load HTML in Playwright
* 2. Force overflow:hidden on .poster/.page containers (clip decorative overflow)
* 3. Inject @page { margin: 0 } (override any existing margin)
* 4. Ensure html/body have margin:0, padding:0, matching background
* 5. Measure .poster scrollHeight (actual content height)
* 6. Generate single-page PDF with exact dimensions
*
* What it does NOT do:
* - No pagination / page breaks
* - No A4 fallback
* - No margin injection (always zero)
* - No cover adaptation
* - No pdf-lib post-processing
* - No continuous-canvas detection
* - No vertical overflow expansion (posters WANT overflow:hidden)
*/
const fs = require('fs');
const path = require('path');
const { spawnSync } = require('child_process');
// ── Chromium resolution (shared logic with html2pdf-next.js) ──
/**
 * Locate a usable Chromium executable.
 *
 * Resolution order:
 *   1. The executable reported by the given browser type, if it exists on disk.
 *   2. PLAYWRIGHT_CHROMIUM_PATH (when set), then a fixed list of common
 *      macOS/Linux install locations — first existing path wins.
 *
 * @param {{ executablePath: function(): string }} chromiumObj - Object exposing executablePath().
 * @returns {{ status: 'ok'|'fallback'|'missing', executablePath: string }}
 */
function resolveChromium(chromiumObj) {
  let bundled = null;
  try {
    bundled = chromiumObj.executablePath();
  } catch (_) {
    // executablePath() may throw; treat that the same as "nothing bundled".
    bundled = null;
  }
  if (bundled && fs.existsSync(bundled)) {
    return { status: 'ok', executablePath: bundled };
  }

  const systemPaths = [
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
    '/usr/bin/chromium-browser',
    '/usr/bin/chromium',
    '/usr/bin/google-chrome',
  ];
  const override = process.env.PLAYWRIGHT_CHROMIUM_PATH;
  // The environment override is probed before any built-in location.
  const searchOrder = override ? [override, ...systemPaths] : systemPaths;
  const found = searchOrder.find((candidate) => fs.existsSync(candidate));
  if (found !== undefined) {
    return { status: 'fallback', executablePath: found };
  }
  return { status: 'missing', executablePath: bundled || '' };
}
// ── CLI parsing ──
/**
 * Parse html2poster command-line arguments.
 *
 * Recognised flags: --output/-o <path>, --width <css-length>,
 * --max-height <px>, --help/-h. The first two bare tokens are taken as the
 * input and output paths respectively.
 *
 * A flag given without a value, or a --max-height that is not a positive
 * integer, keeps the default and prints a warning. Previously such input
 * produced `undefined`/`NaN`, which silently disabled the height clamp.
 *
 * Exits the process with code 0 after printing help, or 1 when no input file
 * is given.
 *
 * @param {string[]} argv - Full argv array (the first two entries are skipped).
 * @returns {{ input: string, output: string, width: string, maxHeight: number }}
 */
function parseArgs(argv) {
  const DEFAULT_MAX_HEIGHT = 16000;
  const tokens = argv.slice(2);
  let input = null;
  let output = null;
  let width = '720px';
  let maxHeight = DEFAULT_MAX_HEIGHT;

  for (let i = 0; i < tokens.length; i++) {
    const t = tokens[i];
    if (t === '--help' || t === '-h') {
      console.log(`
Usage: node html2poster.js <input.html> [options]
Options:
--output, -o Output PDF path (default: input with .pdf extension)
--width Poster width (default: 720px)
--max-height Maximum allowed height in px (default: 16000, safety limit)
-h, --help Show this help
`);
      process.exit(0);
    } else if (t === '--output' || t === '-o' || t === '--width' || t === '--max-height') {
      const value = tokens[i + 1];
      if (value === undefined) {
        console.error(`Warning: ${t} expects a value; keeping the default.`);
        continue;
      }
      i++; // consume the value token
      if (t === '--width') {
        width = value;
      } else if (t === '--max-height') {
        const parsed = Number.parseInt(value, 10);
        if (Number.isFinite(parsed) && parsed > 0) {
          maxHeight = parsed;
        } else {
          console.error(`Warning: invalid --max-height "${value}"; using ${DEFAULT_MAX_HEIGHT}.`);
        }
      } else {
        output = value;
      }
    } else if (!input) {
      input = t;
    } else if (!output) {
      output = t;
    }
  }

  if (!input) {
    console.error('Error: No input HTML file specified.');
    process.exit(1);
  }
  if (!output) {
    output = input.replace(/\.html?$/i, '.pdf');
    // Input had no .html/.htm extension: append instead of overwriting the input.
    if (output === input) output = input + '.pdf';
  }
  return { input, output, width, maxHeight };
}
// ── Main ──
/**
 * Render a poster HTML file to a single-page, zero-margin PDF.
 *
 * Pipeline: parse CLI args → load Playwright and resolve Chromium → open the
 * HTML → force overflow:hidden on poster containers → inject a zero-margin
 * @page rule → zero html/body margin/padding and sync their background with
 * the poster → measure the poster's scrollHeight → print a PDF of exactly
 * that height (clamped to --max-height, with a 960px fallback for
 * implausibly small measurements).
 *
 * Exits with code 1 when the input is missing, Playwright is not installed,
 * or no Chromium can be found. Other errors reject and are handled by the
 * caller below.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // Local require: file paths must be turned into properly escaped file:// URLs.
  const { pathToFileURL } = require('url');
  const { input, output, width, maxHeight } = parseArgs(process.argv);
  const absIn = path.resolve(input);
  const absOut = path.resolve(output);
  if (!fs.existsSync(absIn)) {
    console.error(`Error: File not found: ${absIn}`);
    process.exit(1);
  }
  console.log(`\n🖼 html2poster — Single-page poster PDF generator`);
  console.log(`   Input:  ${absIn}`);
  console.log(`   Output: ${absOut}`);
  console.log(`   Width:  ${width}`);

  // Load Playwright (full package preferred, playwright-core as a fallback)
  let playwright;
  try {
    playwright = require('playwright');
  } catch {
    try {
      playwright = require('playwright-core');
    } catch {
      console.error('Error: playwright or playwright-core not installed.');
      process.exit(1);
    }
  }
  const { chromium } = playwright;
  const bInfo = resolveChromium(chromium);
  if (bInfo.status === 'missing') {
    console.error('Error: No Chromium found. Run: npx playwright install chromium');
    process.exit(1);
  }
  if (bInfo.status === 'fallback') {
    console.log(`   ⚠ Using fallback Chromium: ${bInfo.executablePath}`);
  }

  // Launch browser
  const launchOpts = { headless: true };
  if (bInfo.status === 'fallback') launchOpts.executablePath = bInfo.executablePath;
  const browser = await chromium.launch(launchOpts);
  try {
    // Match the viewport width to the poster width so layout is computed at
    // the final size. Only the leading number of `width` is used here; units
    // other than px are not converted.
    const widthPx = parseInt(width, 10) || 720;
    const page = await browser.newPage({ viewport: { width: widthPx, height: 1200 } });
    // pathToFileURL percent-encodes characters such as '#', '?' and '%' that
    // would otherwise be parsed as URL fragment/query/escape sequences.
    await page.goto(pathToFileURL(absIn).href, { waitUntil: 'networkidle' });
    console.log(`\n   ✓ HTML loaded`);

    // ── Step 1: Force overflow:hidden on page containers ──
    // Decorative elements with negative offsets or width>100% inflate scrollWidth,
    // causing Playwright to shrink content to fit. overflow:hidden clips them.
    const overflowFixed = await page.evaluate(() => {
      const selectors = ['.poster', '.page', '#poster', '#page'];
      let fixed = 0;
      for (const sel of selectors) {
        // Only the first match of each selector is adjusted.
        const el = document.querySelector(sel);
        if (!el) continue;
        const computed = getComputedStyle(el);
        if (computed.overflow !== 'hidden') {
          el.style.overflow = 'hidden';
          fixed++;
        }
      }
      return fixed;
    });
    if (overflowFixed > 0) {
      console.log(`   ✓ Added overflow:hidden to ${overflowFixed} container(s)`);
    }

    // ── Step 2: Inject @page { margin: 0 } — override any existing @page rule ──
    await page.evaluate(() => {
      const s = document.createElement('style');
      // Appended last in <head> so it is the latest @page rule in document order.
      s.textContent = `@page { margin: 0 !important; size: auto; }`;
      document.head.appendChild(s);
    });

    // ── Step 3: Ensure html/body have zero margin/padding ──
    const bgSync = await page.evaluate(() => {
      const html = document.documentElement;
      const body = document.body;
      html.style.margin = '0';
      html.style.padding = '0';
      body.style.margin = '0';
      body.style.padding = '0';
      // Sync body background with poster background to avoid color gaps
      const poster = document.querySelector('.poster') || document.querySelector('.page');
      if (poster) {
        const posterBg = getComputedStyle(poster).backgroundColor;
        if (posterBg && posterBg !== 'rgba(0, 0, 0, 0)' && posterBg !== 'transparent') {
          body.style.backgroundColor = posterBg;
          html.style.backgroundColor = posterBg;
          return posterBg;
        }
      }
      return null;
    });
    if (bgSync) {
      console.log(`   ✓ Synced body background: ${bgSync}`);
    }

    // ── Step 4: Measure actual content height ──
    const measurement = await page.evaluate(() => {
      const poster = document.querySelector('.poster') || document.querySelector('.page') || document.body;
      return {
        scrollHeight: poster.scrollHeight,
        scrollWidth: poster.scrollWidth,
        offsetWidth: poster.offsetWidth,
        selector: poster.className ? '.' + poster.className.split(' ')[0] : poster.tagName,
      };
    });
    console.log(`   ✓ Measured: ${measurement.selector} = ${measurement.scrollWidth}×${measurement.scrollHeight}px`);
    if (measurement.scrollWidth > widthPx + 2) {
      console.log(`   ⚠ WARNING: scrollWidth (${measurement.scrollWidth}px) > width (${widthPx}px)`);
      console.log(`     Decorative elements may still overflow. Check for position:absolute elements with negative offsets.`);
    }
    let contentHeight = measurement.scrollHeight;
    if (contentHeight > maxHeight) {
      console.log(`   ⚠ Content height ${contentHeight}px exceeds max ${maxHeight}px, clamping.`);
      contentHeight = maxHeight;
    }
    if (contentHeight < 100) {
      console.log(`   ⚠ Content height ${contentHeight}px seems too small, using 960px fallback.`);
      contentHeight = 960;
    }

    // ── Step 5: Generate PDF ──
    console.log(`\n   📄 Generating PDF: ${width} × ${contentHeight}px`);
    await page.pdf({
      path: absOut,
      width: width,
      height: contentHeight + 'px',
      printBackground: true,
      margin: { top: '0', right: '0', bottom: '0', left: '0' },
    });
    console.log(`\n   ✅ Done: ${absOut}`);
    console.log(`   Size: ${(fs.statSync(absOut).size / 1024).toFixed(1)} KB`);
  } finally {
    await browser.close();
  }
}

// Any rejection from main() (launch failure, navigation error, …) ends the
// process with a short message and exit code 1.
main().catch(err => {
  console.error(`\n✗ Fatal: ${err.message}`);
  process.exit(1);
});

2959
skills/pdf/scripts/pdf.py Executable file

File diff suppressed because it is too large Load Diff

901
skills/pdf/scripts/pdf_qa.py Executable file
View File

@@ -0,0 +1,901 @@
#!/usr/bin/env python3
"""
PDF Quality Assurance Checker
=============================
Automatically detects common typesetting issues in PDFs.
Usage: python3 pdf_qa.py <pdf_path>
Checks:
1. Page size consistency across all pages
2. Blank page detection
3. CJK punctuation placement (line-start/end forbidden punctuation)
4. Color analysis (informational only — counts and lists colors)
5. Font embedding check (warns on non-embedded fonts)
6. PDF metadata check (title/author/creator)
7. Content overflow detection (text exceeding page boundaries)
8. Content fill ratio per page (multi-page docs, warns if < 40%)
9. Cover/poster full-bleed check (background extends to page edges)
10. Margin symmetry check (left/right text margins)
11. Table centering check (if detected)
12. Formula overflow check (optional)
"""
import sys
import os
import re
import json
from collections import Counter
try:
import pymupdf # PyMuPDF
except ImportError:
import fitz as pymupdf
# ============================================================
# Config
# ============================================================
# CJK punctuation forbidden at line start: closing / trailing marks only.
# Opening quotes are legitimate at the start of a line, so the left curly
# double quote (U+201C) is NOT listed here; the closing quotes are.
LINE_START_FORBIDDEN = set(
    "。、,;:!?)】〛〉」』"
    "\u201d"  # ” right curly double quote
    "\u2019"  # ’ right curly single quote
    "\u2026"  # … ellipsis
    "\u2014"  # — em dash
    "\uff5e"  # fullwidth tilde
    "\u00b7"  # · middle dot
)
# CJK punctuation forbidden at line end: opening marks only.
# Closing quotes are legitimate at the end of a line, so the right curly
# single quote (U+2019) is NOT listed here.
LINE_END_FORBIDDEN = set(
    "(【《〈「"
    "\u2018"  # ‘ left curly single quote
    "\u201c"  # “ left curly double quote
)
# Minimum fill ratio for last page (DISABLED — caused false positives)
# LAST_PAGE_MIN_FILL = 0.40
# Maximum allowed color count — REMOVED (color count is now info-only)
# MAX_COLORS = 8
# ============================================================
# Checks
# ============================================================
class QAResult:
    """Accumulator for the outcome of all QA checks.

    Attributes:
        issues: list of (severity, category, message) tuples, where severity
            is 'ERROR' or 'WARN'.
        passes: list of messages for checks that passed.
        info: list of purely informational messages.
    """

    def __init__(self):
        self.issues = []
        self.passes = []
        self.info = []

    def _record_issue(self, severity, cat, msg):
        # Single place where the issue tuple shape is defined.
        self.issues.append((severity, cat, msg))

    def error(self, cat, msg):
        """Record an ERROR-level issue under category `cat`."""
        self._record_issue('ERROR', cat, msg)

    def warn(self, cat, msg):
        """Record a WARN-level issue under category `cat`."""
        self._record_issue('WARN', cat, msg)

    def ok(self, msg):
        """Record a passed check."""
        self.passes.append(msg)

    def add_info(self, msg):
        """Record an informational note."""
        self.info.append(msg)
def check_last_page_fill(doc, result, min_fill=0.40):
    """Check how much of the last page's height is covered by text.

    The fill ratio is (bottom of lowest text block - top of highest text
    block) / page height, using the page's "blocks" text extraction.

    Outcomes recorded on `result`:
      - single-page document: pass, nothing measured
      - no blocks / no non-empty text: error
      - ratio < 25%: error
      - ratio < `min_fill`: warning
      - otherwise: pass

    Args:
        doc: Sequence of pages supporting len(), negative indexing,
            `.rect.height` and `.get_text("blocks")`.
        result: Collector exposing ok/warn/error/add_info.
        min_fill: Warning threshold for the fill ratio. The module-level
            LAST_PAGE_MIN_FILL constant this function used to read is
            commented out, which made the call raise NameError; the value
            is now a defaulted parameter.
    """
    page_count = len(doc)
    if page_count < 2:
        result.ok("Single-page document, no last-page blank check needed")
        return

    last_page = doc[-1]
    page_height = last_page.rect.height

    blocks = last_page.get_text("blocks")
    if not blocks:
        result.error("Last page blank", f"Page {page_count} (last page) has no content at all!")
        return

    # Vertical extent of all blocks that carry non-whitespace text.
    max_y = 0
    min_y = page_height
    for b in blocks:
        if b[4].strip():
            min_y = min(min_y, b[1])
            max_y = max(max_y, b[3])
    if max_y == 0:
        result.error("Last page blank", f"Page {page_count} (last page) has no valid text content")
        return

    content_height = max_y - min_y
    fill_ratio = content_height / page_height
    result.add_info(
        f"Last page fill ratio: {fill_ratio:.0%} "
        f"(content height {content_height:.0f}px / page height {page_height:.0f}px)"
    )
    if fill_ratio < 0.25:
        result.error(
            "Last page blank",
            f"Last page fill ratio only {fill_ratio:.0%}, mostly blank! "
            "Consider compressing preceding page spacing or trimming content",
        )
    elif fill_ratio < min_fill:
        result.warn(
            "Last page blank",
            f"Last page fill ratio {fill_ratio:.0%}, somewhat sparse — optimization recommended",
        )
    else:
        result.ok(f"Last page fill ratio {fill_ratio:.0%}")
def check_punctuation(doc, result):
    """Flag lines that begin or end with punctuation forbidden in that position.

    Each text line (spans concatenated, then stripped) is tested against
    LINE_START_FORBIDDEN (first character) and LINE_END_FORBIDDEN (last
    character). At most ten violations are reported individually; any
    remainder is summarised in one extra warning.
    """
    found = []
    for page_index in range(len(doc)):
        page_no = page_index + 1
        layout = doc[page_index].get_text("dict")
        for block in layout.get("blocks", []):
            # type 0 == text block; everything else is skipped
            if block.get("type") != 0:
                continue
            for line in block.get("lines", []):
                content = "".join(span.get("text", "") for span in line.get("spans", [])).strip()
                if not content:
                    continue
                head, tail = content[0], content[-1]
                if head in LINE_START_FORBIDDEN:
                    found.append((page_no, f"Forbidden line-start punctuation '{head}': ...{content[:30]}"))
                if tail in LINE_END_FORBIDDEN:
                    found.append((page_no, f"Forbidden line-end punctuation '{tail}': {content[-30:]}..."))

    if not found:
        result.ok("Punctuation placement check passed ✓")
        return

    report_limit = 10
    for page_no, description in found[:report_limit]:
        result.warn("Punctuation rules", f"Page {page_no} - {description}")
    overflow = len(found) - report_limit
    if overflow > 0:
        result.warn("Punctuation rules", f"...{overflow} more violations")
def check_blank_pages(doc, result):
    """Report pages that contain no text, no images and no vector drawings.

    Records one error listing the 1-based numbers of all blank pages, or a
    pass when every page has some content.
    """
    def _is_blank(page):
        stripped_text = page.get_text().strip()
        image_list = page.get_images()
        drawing_list = page.get_drawings()
        return not (stripped_text or image_list or drawing_list)

    blank_pages = [idx + 1 for idx in range(len(doc)) if _is_blank(doc[idx])]
    if not blank_pages:
        result.ok("No blank pages ✓")
    else:
        result.error("Blank pages", f"Found blank pages: {blank_pages}")
def check_colors(doc, result):
    """Collect the colors used by text spans and drawings (informational only).

    Text span colors are packed 0xRRGGBB integers (pure black, 0, is ignored);
    drawing stroke ("color") and fill values are sequences of 0..1 floats.
    Two info lines may be recorded: the total number of colors together with
    the number of chromatic ones, and up to ten chromatic colors in sorted
    order. A color counts as chromatic when two of its channels differ by
    more than 20.
    """
    def _hex(r, g, b):
        return f"#{r:02x}{g:02x}{b:02x}"

    seen = set()
    for page_index in range(len(doc)):
        page = doc[page_index]

        # Text colors
        for block in page.get_text("dict").get("blocks", []):
            if block.get("type") != 0:
                continue
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    packed = span.get("color", 0)
                    if packed == 0:  # pure black is not counted
                        continue
                    seen.add(_hex((packed >> 16) & 0xFF, (packed >> 8) & 0xFF, packed & 0xFF))

        # Drawing stroke and fill colors
        for drawing in page.get_drawings():
            for key in ("color", "fill"):
                if not drawing.get(key):
                    continue
                value = drawing[key]
                if isinstance(value, (tuple, list)) and len(value) >= 3:
                    seen.add(_hex(int(value[0] * 255), int(value[1] * 255), int(value[2] * 255)))

    # Drop near-black/white/gray entries: channels that are almost equal.
    chromatic = []
    for code in seen:
        red, green, blue = int(code[1:3], 16), int(code[3:5], 16), int(code[5:7], 16)
        spread = max(abs(red - green), abs(green - blue), abs(red - blue))
        if spread > 20:
            chromatic.append(code)

    result.add_info(f"Total text colors: {len(seen)} (chromatic: {len(chromatic)})")
    if chromatic:
        result.add_info(f"Chromatic colors: {', '.join(sorted(chromatic)[:10])}")
def check_page_size_consistency(doc, result):
    """Verify that every page has the same (width, height), rounded to 0.1pt.

    A single-page document passes trivially. When all pages agree, the size
    is also reported in millimetres (points * 25.4 / 72) as an info line.
    """
    page_total = len(doc)
    if page_total < 2:
        result.ok("Single-page document, size consistent ✓")
        return

    sizes = {
        (round(doc[n].rect.width, 1), round(doc[n].rect.height, 1))
        for n in range(page_total)
    }
    if len(sizes) > 1:
        result.warn("Page size", f"Inconsistent page sizes: {sizes}")
        return

    width_pt, height_pt = next(iter(sizes))
    w_mm = width_pt * 25.4 / 72
    h_mm = height_pt * 25.4 / 72
    result.add_info(f"Page size: {w_mm:.0f}mm × {h_mm:.0f}mm ({page_total} pages)")
    result.ok("Page size consistent ✓")
def check_text_overflow(doc, result):
    """Flag pages on which any text block's bbox leaves the page rectangle.

    A block tuple is (x0, y0, x1, y1, text, block_no, block_type). A 2-unit
    tolerance is applied on every side. Each offending page is listed once.
    """
    tolerance = 2

    def _outside(block, bounds):
        x0, y0, x1, y1 = block[0], block[1], block[2], block[3]
        past_far_edge = x1 > bounds.width + tolerance or y1 > bounds.height + tolerance
        past_near_edge = x0 < -tolerance or y0 < -tolerance
        return past_far_edge or past_near_edge

    overflow_pages = []
    for idx in range(len(doc)):
        page = doc[idx]
        bounds = page.rect
        if any(_outside(block, bounds) for block in page.get_text("blocks")):
            overflow_pages.append(idx + 1)

    if overflow_pages:
        result.warn("Content overflow", f"Pages {overflow_pages} may have content exceeding page boundaries")
    else:
        result.ok("No content overflow ✓")
def check_content_fill_ratio(doc, result):
    """Warn about pages whose content stops high up, leaving a large void below.

    The fill ratio of a page is the lowest bottom edge among its non-empty
    text blocks and image rectangles, divided by the page height.

    Rules:
      - single-page documents are skipped (recorded as a pass)
      - page 1 is skipped
      - pages with no blocks, images or drawings, or with no measurable
        content bottom, are skipped
      - the last page warns below 25%, every other page below 40%
    """
    page_total = len(doc)
    if page_total < 2:
        result.ok("Single-page document, skipping content fill ratio check ✓")
        return

    sparse = []
    # Start at index 1: the first page is never evaluated.
    for idx in range(1, page_total):
        page = doc[idx]
        page_height = page.rect.height
        blocks = page.get_text("blocks")
        images = page.get_images()
        drawings = page.get_drawings()
        if not (blocks or images or drawings):
            continue

        bottom = 0
        for block in blocks:
            if block[4].strip():
                bottom = max(bottom, block[3])
        for image in images:
            try:
                for placed in page.get_image_rects(image[0]):
                    bottom = max(bottom, placed.y1)
            except Exception:
                # Image placement lookup is best-effort.
                pass
        if bottom == 0:
            continue

        ratio = bottom / page_height
        limit = 0.25 if idx == page_total - 1 else 0.40
        if ratio < limit:
            sparse.append((idx + 1, ratio, limit))

    if not sparse:
        result.ok("Content fill ratio adequate on all pages ✓")
        return
    for page_no, ratio, limit in sparse:
        result.warn(
            "Content fill ratio",
            f"Page {page_no} content only fills {ratio:.0%} of page height "
            f"(threshold: {limit:.0%}). Content may be crammed at the top "
            f"with a large blank area below."
        )
def check_cover_bleed(doc, result, poster=False):
    """Check whether graphics on the cover (or every poster page) reach the page edges.

    The union bounding box of all drawing rectangles and image placements on
    a page is compared with the page rectangle; a gap of more than 5% of the
    page dimension on any side yields a warning naming the affected sides.

    Non-poster mode inspects only page 1 and does nothing for single-page
    documents. Poster mode inspects every page. A page with text but no
    drawings or images at all is warned about as having no background
    graphics.
    """
    if not poster and len(doc) < 2:
        # A lone non-poster page is not treated as a cover.
        return

    limit = 0.05
    targets = range(len(doc)) if poster else [0]
    for idx in targets:
        page = doc[idx]
        pw, ph = page.rect.width, page.rect.height

        # Running union bbox as [min_x, min_y, max_x, max_y], plus a flag
        # telling whether anything was merged into it.
        box = [pw, ph, 0.0, 0.0]
        seen = [False]

        def _merge(r):
            box[0] = min(box[0], r.x0)
            box[1] = min(box[1], r.y0)
            box[2] = max(box[2], r.x1)
            box[3] = max(box[3], r.y1)
            seen[0] = True

        # 1. Vector drawings
        for drawing in page.get_drawings():
            r = drawing.get("rect")
            if r:
                _merge(r)
        # 2. Images (placement lookup is best-effort)
        for image in page.get_images():
            try:
                for r in page.get_image_rects(image[0]):
                    _merge(r)
            except Exception:
                pass

        page_label = f"Page {idx + 1}" if poster else "Cover page (p1)"

        if not seen[0]:
            if page.get_text("blocks"):
                result.warn(
                    f"{page_label} not full-bleed",
                    f"{page_label} has no background graphics (no filled rectangles or images). "
                    "A proper cover/poster page should have a full-page background color or image "
                    "extending to all edges."
                )
            continue

        min_x, min_y, max_x, max_y = box
        # Distance from the union bbox to each page edge, as a fraction of the page.
        gaps = [
            ("left", max(0, min_x) / pw),
            ("top", max(0, min_y) / ph),
            ("right", max(0, pw - max_x) / pw),
            ("bottom", max(0, ph - max_y) / ph),
        ]
        sides = [f"{name} {gap:.0%}" for name, gap in gaps if gap > limit]
        if not sides:
            result.ok(f"{page_label} content extends to page edges (full-bleed) ✓")
        else:
            result.warn(
                f"{page_label} not full-bleed",
                f"{page_label} has visible margins: {', '.join(sides)}. "
                f"Background/graphics should extend to page edges."
            )
def check_margin_symmetry(doc, result, skip_cover=False):
    """Compare left and right text margins on each page.

    The left margin is the smallest x0 and the right margin is the page width
    minus the largest x1 over all non-empty text blocks. Pages with fewer
    than three text blocks are ignored. A difference above 5% of the page
    width produces a warning for that page; if no page is flagged a single
    pass is recorded.

    Args:
        doc: Sequence of pages.
        result: Collector exposing ok/warn.
        skip_cover: When true, page 1 is not examined.
    """
    lopsided = []
    first_index = 1 if skip_cover else 0
    for idx in range(first_index, len(doc)):
        page = doc[idx]
        text_blocks = [b for b in page.get_text("blocks") if b[4].strip()]
        if len(text_blocks) < 3:
            # Too little text to infer margins (decorative / cover-like page).
            continue
        page_width = page.rect.width
        left = min(b[0] for b in text_blocks)
        right = page_width - max(b[2] for b in text_blocks)
        gap = abs(left - right)
        if gap > page_width * 0.05:
            lopsided.append((idx + 1, left, right, gap))

    if not lopsided:
        result.ok("Left/right margins appear symmetric \u2713")
        return
    for page_no, left, right, gap in lopsided:
        result.warn(
            "Margin symmetry",
            f"Page {page_no} left/right margins differ by {gap:.0f}pt "
            f"(L {left:.0f}pt, R {right:.0f}pt)"
        )
def check_table_centering(doc, result):
    """Heuristically find ruled tables and warn when one is off-centre.

    Straight line segments are gathered from each page's drawings ("l" line
    items and the four edges of "re" rectangle items). Axis-aligned segments
    longer than 20 units are greedily grouped: a segment joins the first
    existing cluster whose bounding box it touches (6-unit tolerance),
    otherwise it starts a new cluster. A cluster with at least two horizontal
    and two vertical segments is treated as a table; its left and right
    margins must not differ by more than 5% of the page width.
    """
    def _bbox_intersects(a, b, tol=6):
        apart = (a[2] < b[0] - tol or a[0] > b[2] + tol or
                 a[3] < b[1] - tol or a[1] > b[3] + tol)
        return not apart

    def _rect_tuple(r):
        # Accept both rect-like objects and plain 4-sequences.
        if hasattr(r, "x0"):
            return (r.x0, r.y0, r.x1, r.y1)
        return (r[0], r[1], r[2], r[3])

    def _page_segments(page):
        found = []
        for drawing in page.get_drawings():
            for item in drawing.get("items", []):
                if not item:
                    continue
                kind = item[0]
                if kind == "l" and len(item) >= 3:
                    start, end = item[1], item[2]
                    found.append((start[0], start[1], end[0], end[1]))
                elif kind == "re" and len(item) >= 2:
                    x0, y0, x1, y1 = _rect_tuple(item[1])
                    found.append((x0, y0, x1, y0))
                    found.append((x0, y1, x1, y1))
                    found.append((x0, y0, x0, y1))
                    found.append((x1, y0, x1, y1))
        return found

    def _cluster(segments):
        groups = []
        for x0, y0, x1, y1 in segments:
            lo_x, hi_x = min(x0, x1), max(x0, x1)
            lo_y, hi_y = min(y0, y1), max(y0, y1)
            horizontal = abs(y0 - y1) < 1 and (hi_x - lo_x) > 20
            vertical = abs(x0 - x1) < 1 and (hi_y - lo_y) > 20
            if not (horizontal or vertical):
                continue
            seg_box = (lo_x, lo_y, hi_x, hi_y)
            # Greedy: first touching cluster wins; order of segments matters.
            home = next((g for g in groups if _bbox_intersects(seg_box, g["bbox"])), None)
            if home is None:
                groups.append({
                    "bbox": seg_box,
                    "h": 1 if horizontal else 0,
                    "v": 1 if vertical else 0,
                })
                continue
            old = home["bbox"]
            home["bbox"] = (
                min(old[0], seg_box[0]),
                min(old[1], seg_box[1]),
                max(old[2], seg_box[2]),
                max(old[3], seg_box[3]),
            )
            if horizontal:
                home["h"] += 1
            if vertical:
                home["v"] += 1
        return groups

    any_tables = False
    for page_index in range(len(doc)):
        page = doc[page_index]
        segments = _page_segments(page)
        if not segments:
            continue
        for group in _cluster(segments):
            if group["h"] < 2 or group["v"] < 2:
                continue
            any_tables = True
            page_width = page.rect.width
            left_margin = group["bbox"][0]
            right_margin = page_width - group["bbox"][2]
            if abs(left_margin - right_margin) > page_width * 0.05:
                result.warn(
                    "Table centering",
                    f"Page {page_index + 1}: Table not centered "
                    f"(L {left_margin:.0f}pt, R {right_margin:.0f}pt)"
                )
    if any_tables:
        result.ok("Table centering check complete \u2713")
def check_font_embedding(doc, result):
    """Report the fonts used by the document and warn about non-embedded ones.

    Each entry of `page.get_fonts()` is read as a tuple whose index 1 is the
    font file extension and index 3 the base font name. PyMuPDF reports the
    extension "n/a" for fonts that have no embedded font file (e.g. the PDF
    base-14 fonts), so both an empty extension and "n/a" are treated as
    "not embedded". Checking only for an empty string never flagged them.

    Records an info line listing all fonts, then either one warning per
    non-embedded font or a single pass.
    """
    fonts_used = set()
    non_embedded = set()
    for page_num in range(len(doc)):
        for font in doc[page_num].get_fonts():
            basefont = font[3] if len(font) > 3 else "unknown"
            ext = font[1] if len(font) > 1 else ""
            fonts_used.add(basefont)
            if not ext or str(ext).strip().lower() == "n/a":
                non_embedded.add(basefont)

    if fonts_used:
        result.add_info(f"Fonts used: {', '.join(sorted(fonts_used))}")
    else:
        result.add_info("Fonts used: (none detected)")

    if non_embedded:
        for basefont in sorted(non_embedded):
            result.warn(
                "Font embedding",
                f"Font {basefont} is not embedded. May display differently on other systems."
            )
    else:
        result.ok("All fonts are embedded \u2713")
def check_helvetica_in_cjk(doc, result):
    """Warn when Helvetica renders visible text in a document that contains CJK.

    Two facts are gathered over all pages:
      * whether any character falls in U+4E00–U+9FFF or U+3400–U+4DBF
      * which pages have at least one non-empty text span whose font name
        contains "Helvetica"

    Rendered spans are inspected rather than the page font list, so a font
    that is merely registered on a page but draws no text is not reported.
    A single warning naming up to five pages is recorded only when both
    conditions hold; nothing is recorded otherwise.
    """
    def _is_cjk(ch):
        return '\u4e00' <= ch <= '\u9fff' or '\u3400' <= ch <= '\u4dbf'

    def _renders_helvetica(page):
        for block in page.get_text("dict", sort=True).get("blocks", []):
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    visible = span.get("text", "").strip()
                    if "Helvetica" in span.get("font", "") and len(visible) > 0:
                        return True
        return False

    cjk_seen = False
    hit_pages = []
    for idx in range(len(doc)):
        page = doc[idx]
        plain = page.get_text("text") or ""
        # Once CJK has been seen anywhere, later pages need no re-scan.
        if not cjk_seen and any(_is_cjk(ch) for ch in plain):
            cjk_seen = True
        if _renders_helvetica(page):
            hit_pages.append(idx + 1)

    if not (cjk_seen and hit_pages):
        return

    pages_str = ', '.join(str(p) for p in hit_pages[:5])
    extra = len(hit_pages) - 5
    if extra > 0:
        pages_str += f' ...and {extra} more'
    result.warn(
        "Helvetica in CJK document",
        f"Helvetica font detected rendering text on page(s) {pages_str} in a CJK document. "
        f"This usually means a raw string was passed to a ReportLab Table or flowable "
        f"without wrapping in Paragraph(text, style) with a CJK-capable font. "
        f"CJK characters rendered via Helvetica will appear as garbled symbols."
    )
def check_metadata(doc, result):
    """Check that the PDF metadata carries a title, an author and a creator.

    A field is missing when it is None or blank after stripping. The title
    is additionally rejected when it equals "untitled" or "(anonymous)"
    (case-insensitive). One warning or pass is recorded per field, in the
    order title, author, creator.
    """
    meta = doc.metadata or {}

    def _missing(value):
        return value is None or not str(value).strip()

    title = meta.get("title")
    placeholder_titles = ("untitled", "(anonymous)")
    title_bad = _missing(title) or str(title).strip().lower() in placeholder_titles

    verdicts = (
        (title_bad, "Missing/invalid title metadata", "Title metadata present \u2713"),
        (_missing(meta.get("author")), "Missing author metadata", "Author metadata present \u2713"),
        (_missing(meta.get("creator")), "Missing creator metadata", "Creator metadata present \u2713"),
    )
    for is_bad, warn_msg, ok_msg in verdicts:
        if is_bad:
            result.warn("Metadata", warn_msg)
        else:
            result.ok(ok_msg)
def check_toc_without_cover(doc, result):
    """Warn when page 1 looks like a table of contents (i.e. no cover page).

    The first 300 characters of the lower-cased, stripped text of page 1
    are searched for a TOC heading keyword (English or Chinese). A hit
    means the document starts directly with its TOC, whereas the expected
    structure is Cover (p1) → TOC (p2) → Content (p3+).

    Documents with fewer than two pages are not checked.
    """
    if len(doc) < 2:
        # Too short to have both a cover and a TOC; nothing to verify.
        return

    toc_keywords = (
        "table of contents", "contents",
        "目录", "目 录",
    )
    # Only the top of the page matters: a TOC heading appears first.
    head = doc[0].get_text("text", sort=True).strip().lower()[:300]

    for keyword in toc_keywords:
        if keyword in head:
            result.warn(
                "TOC without cover",
                "Page 1 appears to be a Table of Contents with no preceding cover page. "
                "Documents with TOC should have: Cover (p1) → TOC (p2) → Content (p3+)."
            )
            return
def check_formula_overflow(doc, result):
    """Detect likely formula overflow past the right content margin.

    For every page with at least three non-empty text blocks, the median
    right edge of those blocks is taken as the page's content margin. A
    block sticking out more than 10pt past that margin is reported when it
    either contains a math-like character, or is a single line wider than
    half the page. At most one warning is emitted per page.

    Args:
        doc: Document supporting ``len()`` and integer indexing; each page
            provides ``get_text("blocks")`` and ``rect.width``.
        result: Collector exposing ``warn(category, message)``.
    """
    math_re = re.compile(r"[=+\-*/<>\u2264\u2265\u2211\u222b\u221a\u03c0\u00b5\u221e\u2202\u2206\u2248\u2260\u00b1\u00d7\u00f7]")
    for page_num in range(len(doc)):
        page = doc[page_num]
        # Block tuples are (x0, y0, x1, y1, text, block_no, block_type).
        # Image blocks (block_type 1) carry "<image: ...>" metadata in the
        # text slot; they must be excluded or they skew the median margin
        # and always match the math regex through '<' and '>'.
        text_blocks = [
            b for b in page.get_text("blocks")
            if (len(b) < 7 or b[6] == 0) and b[4].strip()
        ]
        if len(text_blocks) < 3:
            # Too few blocks for the median to be a meaningful margin.
            continue
        right_edges = sorted(b[2] for b in text_blocks)
        content_right = right_edges[len(right_edges) // 2]
        half_page_width = page.rect.width * 0.5
        for b in text_blocks:
            x0, x1, text = b[0], b[2], b[4]
            if x1 <= content_right + 10:
                # Within the 10pt tolerance of the content margin.
                continue
            is_single_line = "\n" not in text.strip()
            is_wide = (x1 - x0) > half_page_width
            has_math = bool(math_re.search(text))
            if (is_single_line and is_wide) or has_math:
                delta = x1 - content_right
                result.warn(
                    "Formula overflow",
                    f"Page {page_num + 1}: Content extends {delta:.0f}pt beyond right content margin "
                    "(possible formula overflow)"
                )
                # One warning per page is enough.
                break
# ============================================================
# Main
# ============================================================
def run_qa(pdf_path, poster=False, skip_cover=False, check_tables=True, check_formulas=False):
    """Run the QA checks against one PDF and return the collected result.

    Args:
        pdf_path: Path of the PDF to inspect.
        poster: Poster mode. Skips the content-fill-ratio and
            TOC-without-cover checks and is forwarded to the cover-bleed check.
        skip_cover: Forwarded to the margin-symmetry check.
        check_tables: When true, run the table-centering check.
        check_formulas: When true, run the formula-overflow check.

    Returns:
        The ``QAResult`` populated by the checks. If ``pdf_path`` does not
        exist, it contains a single "File" error and no checks are run.
    """
    result = QAResult()
    if not os.path.exists(pdf_path):
        result.error("File", f"File not found: {pdf_path}")
        return result

    doc = pymupdf.open(pdf_path)
    try:
        result.add_info(f"File: {os.path.basename(pdf_path)}")
        result.add_info(f"Size: {os.path.getsize(pdf_path) / 1024:.1f} KB")
        if poster:
            result.add_info("Mode: poster (creative)")

        # Run all checks
        check_metadata(doc, result)
        check_page_size_consistency(doc, result)
        check_blank_pages(doc, result)
        check_punctuation(doc, result)
        check_colors(doc, result)
        check_font_embedding(doc, result)
        check_helvetica_in_cjk(doc, result)
        check_text_overflow(doc, result)
        if not poster:
            # Content fill ratio is not meaningful for posters — the last page
            # of a seamlessly-paginated poster naturally has less content.
            check_content_fill_ratio(doc, result)
        check_cover_bleed(doc, result, poster=poster)
        check_margin_symmetry(doc, result, skip_cover=skip_cover)
        if check_tables:
            check_table_centering(doc, result)
        if check_formulas:
            check_formula_overflow(doc, result)
        if not poster:
            check_toc_without_cover(doc, result)
    finally:
        # Release the document handle even if a check raises, so a failing
        # file in a batch run does not leak the open document.
        doc.close()
    return result
def format_report(result):
    """Render a QA result as a plain-text report.

    Sections appear in the order: banner, info, passes, errors, warnings,
    summary. A section is omitted when it has no entries. The summary line
    reads PASS when there are no issues, FAIL when at least one issue has
    severity 'ERROR', and WARN otherwise.

    Args:
        result: Object exposing ``info`` and ``passes`` (lists of strings)
            and ``issues`` (list of ``(severity, category, message)`` tuples).

    Returns:
        The report as a single newline-joined string.
    """
    banner = "=" * 56
    divider = "-" * 56
    out = [banner, " PDF Quality Assurance Report", banner]

    # Info
    if result.info:
        out += ["", " Info:"]
        out.extend(f" {msg}" for msg in result.info)

    # Passes
    if result.passes:
        out += ["", f"✅ Passed ({len(result.passes)}):"]
        out.extend(f" {msg}" for msg in result.passes)

    # Issues, bucketed by severity in a single pass
    errors, warns = [], []
    for severity, category, message in result.issues:
        if severity == 'ERROR':
            errors.append((severity, category, message))
        elif severity == 'WARN':
            warns.append((severity, category, message))

    if errors:
        out += ["", f"❌ Errors ({len(errors)}):"]
        out.extend(f" [{cat}] {msg}" for _, cat, msg in errors)
    if warns:
        out += ["", f"⚠️ Warnings ({len(warns)}):"]
        out.extend(f" [{cat}] {msg}" for _, cat, msg in warns)

    # Summary
    if len(result.issues) == 0:
        verdict = "🎉 PASS — All checks passed!"
    elif errors:
        verdict = f"💀 FAIL — {len(errors)} error(s), {len(warns)} warning(s)"
    else:
        verdict = f"⚠️ WARN — {len(warns)} warning(s), optimization recommended"
    out += ["", divider, verdict, divider]
    return "\n".join(out)
if __name__ == "__main__":
    # Command-line entry point: parse flags, expand file arguments, and
    # print one QA report per PDF.
    import glob

    usage_lines = (
        "Usage: python3 pdf_qa.py <pdf_path>",
        " python3 pdf_qa.py *.pdf (batch check)",
        "Options:",
        " --poster Poster mode (creative)",
        " --skip-cover Skip page 1 margin symmetry check",
        " --no-tables Disable table centering check",
        " --formulas Enable formula overflow check",
    )
    known_flags = ('--poster', '--skip-cover', '--no-tables', '--formulas')

    args = sys.argv[1:]
    poster = '--poster' in args
    skip_cover = '--skip-cover' in args
    check_tables = '--no-tables' not in args
    check_formulas = '--formulas' in args
    # Drop every occurrence of a flag so a repeated flag is never mistaken
    # for a file pattern.
    patterns = [a for a in args if a not in known_flags]

    if not patterns:
        # No file argument at all (nothing given, or flags only).
        for usage_line in usage_lines:
            print(usage_line)
        sys.exit(1)

    files = []
    for pattern in patterns:
        if os.path.exists(pattern):
            # Literal path: take it as-is so names containing glob
            # metacharacters such as '[' are still found.
            files.append(pattern)
            continue
        matches = glob.glob(pattern)
        if matches:
            files.extend(matches)
        else:
            # Report every argument that matched nothing instead of
            # silently dropping it when other arguments did match.
            print(f"File not found: {pattern}")

    if not files:
        sys.exit(1)

    for pdf_path in files:
        result = run_qa(
            pdf_path,
            poster=poster,
            skip_cover=skip_cover,
            check_tables=check_tables,
            check_formulas=check_formulas
        )
        print(format_report(result))
        if len(files) > 1:
            # Blank separator between reports in batch mode.
            print("\n")

File diff suppressed because it is too large Load Diff

269
skills/pdf/scripts/setup.sh Executable file
View File

@@ -0,0 +1,269 @@
#!/usr/bin/env bash
# ---
# name: pdf-setup
# author: Z.AI
# version: "1.0"
# description: Environment setup for the PDF skill. Checks and installs all required dependencies.
# ---
#
# Installs only dependencies required by the PDF skill.
# Strict mode: exit on a failing command (-e), on use of an unset variable (-u),
# and treat a pipeline as failed if any stage fails (pipefail).
set -euo pipefail
# ANSI color escape sequences, expanded by `echo -e` in the helpers below; NC resets the color.
RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'
# Status-line helpers: each prints its first argument after a color/reset escape pair.
# NOTE(review): nothing is visible between the color code and ${NC} — confirm whether
# a status glyph was intended there.
ok() { echo -e " ${GREEN}${NC} $1"; }
fail() { echo -e " ${RED}${NC} $1"; }
warn() { echo -e " ${YELLOW}${NC} $1"; }
info() { echo -e " ${BLUE}${NC} $1"; }
# Absolute path of the directory containing this script (used later to locate
# the bundled tectonic binary and the fonts/ directory).
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
echo "============================================"
echo " PDF Skill — Environment Setup"
echo "============================================"
echo ""
# ── Detect platform ──
# OS is the kernel name (e.g. Darwin, Linux); ARCH is the machine hardware name.
OS="$(uname -s)"
ARCH="$(uname -m)"
echo "Platform: $OS $ARCH"
echo ""
# ── 0. macOS: Homebrew ──
# Only relevant on macOS: report the installed brew version or how to get it.
if [[ "$OS" == "Darwin" ]]; then
  echo "--- Homebrew (macOS package manager) ---"
  if ! command -v brew >/dev/null 2>&1; then
    fail "brew not found — most dependencies below need Homebrew on macOS"
    info "Install: /bin/bash -c \"\$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)\""
  else
    BREW_VER=$(brew --version 2>/dev/null | head -1)
    ok "brew ($BREW_VER)"
  fi
  echo ""
fi
# ── 1. Python 3 ──
# Report the python3 version, or print per-platform install hints when absent.
echo "--- Python ---"
if ! command -v python3 >/dev/null 2>&1; then
  fail "python3 not found"
  case "$OS" in
    Darwin)
      info "Install: brew install python3"
      ;;
    Linux)
      info "Install: sudo apt install python3 python3-pip (Debian/Ubuntu)"
      info " sudo dnf install python3 python3-pip (Fedora/RHEL)"
      ;;
    *)
      info "Install: https://www.python.org/downloads/"
      ;;
  esac
else
  PY_VER=$(python3 --version 2>&1)
  ok "python3 ($PY_VER)"
  # macOS: warn if using system Python
  if [[ "$OS" == "Darwin" ]]; then
    PY_PATH=$(which python3 2>/dev/null)
    if [[ "$PY_PATH" == "/usr/bin/python3" ]]; then
      warn "Using macOS system Python (limited). Recommend: brew install python3"
    fi
  fi
fi
# ── 2. pip ──
# pip is probed through `python3 -m pip` so it matches the interpreter checked above.
echo ""
echo "--- pip ---"
if ! python3 -m pip --version >/dev/null 2>&1; then
  fail "pip not found"
  case "$OS" in
    Darwin)
      info "Install: python3 -m ensurepip --upgrade"
      info " or: brew install python3 (includes pip)"
      ;;
    Linux)
      info "Install: sudo apt install python3-pip (Debian/Ubuntu)"
      ;;
    *)
      info "Install: python3 -m ensurepip --upgrade"
      ;;
  esac
else
  PIP_VER=$(python3 -m pip --version 2>/dev/null | head -1)
  ok "pip ($PIP_VER)"
fi
# ── 3. Python packages (pip) ──
# Each entry is "import_name:pip_package". A package is considered present
# when `python3 -c "import <import_name>"` succeeds; the rest are collected
# in MISSING_PY and optionally installed.
echo ""
echo "--- Python Packages ---"
PY_PKGS=(
  "pikepdf:pikepdf"
  "pdfplumber:pdfplumber"
  "pypdf:pypdf"
  "reportlab:reportlab"
  "pymupdf:PyMuPDF"
)
MISSING_PY=()
for entry in "${PY_PKGS[@]}"; do
  mod="${entry%%:*}"   # text before the first ':' → module to import
  pkg="${entry##*:}"   # text after the last ':'  → name to pip-install
  if python3 -c "import $mod" 2>/dev/null; then
    ver=$(python3 -c "import $mod; print(getattr($mod, '__version__', 'installed'))" 2>/dev/null)
    ok "$pkg ($ver)"
  else
    fail "$pkg not installed"
    MISSING_PY+=("$pkg")
  fi
done
if [ ${#MISSING_PY[@]} -gt 0 ]; then
  echo ""
  if [ -t 0 ]; then
    # Interactive terminal: ask once; a bare Enter means "yes".
    read -p " Install missing Python packages? [Y/n] " -n 1 -r REPLY
    echo ""
    REPLY=${REPLY:-Y}
  else
    warn "Non-interactive mode — skipping auto-install. Run interactively or install manually."
    REPLY=N
  fi
  if [[ ! $REPLY =~ ^[Nn]$ ]]; then
    # Try a plain install first, then --user, then --break-system-packages
    # (for externally-managed environments). Report success only when one
    # of the attempts actually succeeded.
    if python3 -m pip install -q "${MISSING_PY[@]}" 2>/dev/null \
      || python3 -m pip install -q --user "${MISSING_PY[@]}" 2>/dev/null \
      || python3 -m pip install -q --break-system-packages "${MISSING_PY[@]}" 2>/dev/null; then
      ok "Installed: ${MISSING_PY[*]}"
    else
      fail "pip install failed. Try manually: pip install ${MISSING_PY[*]}"
    fi
  fi
fi
# ── 4. Node.js ──
# Report the node version, or print per-platform install hints when absent.
echo ""
echo "--- Node.js ---"
if ! command -v node >/dev/null 2>&1; then
  fail "node not found"
  case "$OS" in
    Darwin)
      info "Install: brew install node"
      ;;
    Linux)
      info "Install: curl -fsSL https://deb.nodesource.com/setup_20.x | sudo -E bash -"
      info " sudo apt install -y nodejs"
      ;;
    *)
      info "Install: https://nodejs.org/"
      ;;
  esac
else
  NODE_VER=$(node --version)
  ok "node ($NODE_VER)"
fi
# ── 5. npm ──
# Report the npm version, or print per-platform install hints when absent.
echo ""
echo "--- npm ---"
if ! command -v npm >/dev/null 2>&1; then
  fail "npm not found"
  case "$OS" in
    Darwin)
      info "Install: brew install node (includes npm)"
      ;;
    Linux)
      info "Install: comes with nodejs"
      ;;
    *)
      info "Install: https://nodejs.org/"
      ;;
  esac
else
  NPM_VER=$(npm --version 2>/dev/null)
  ok "npm ($NPM_VER)"
fi
# ── 6. Playwright + Chromium ──
# Playwright is detected by asking node to resolve the module; Chromium is
# detected by looking for a chromium-* directory in Playwright's browser cache.
echo ""
echo "--- Playwright (HTML→PDF engine) ---"
if ! node -e "require('playwright')" 2>/dev/null; then
  fail "playwright not installed"
  info "Install: npm install -g playwright"
else
  PW_VER=$(node -e "console.log(require('playwright/package.json').version)" 2>/dev/null)
  ok "playwright ($PW_VER)"
fi
# Check Chromium
case "$OS" in
  Darwin) PW_CACHE="$HOME/Library/Caches/ms-playwright" ;;
  *)      PW_CACHE="$HOME/.cache/ms-playwright" ;;
esac
if ! ls "$PW_CACHE"/chromium-* >/dev/null 2>&1; then
  fail "chromium not installed"
  info "Install: npx playwright install chromium"
  if [[ "$OS" == "Linux" ]]; then
    info " npx playwright install-deps (system libs, needs sudo)"
  fi
else
  # Several builds may be cached; report the last one listed.
  CR_DIR=$(ls -d "$PW_CACHE"/chromium-* 2>/dev/null | tail -1)
  ok "chromium ($(basename "$CR_DIR"))"
fi
# ── 7. Tectonic (LaTeX engine, optional) ──
# Lookup order: binary bundled next to this script, then PATH, then ~/tectonic.
echo ""
echo "--- Tectonic (LaTeX→PDF, optional) ---"

# Print per-platform install hints for tectonic (nothing on unlisted platforms).
tectonic_install_hints() {
  case "$OS" in
    Darwin)
      info "Install: brew install tectonic"
      ;;
    Linux)
      info "Install: conda install -c conda-forge tectonic"
      info " or: curl -fsSL https://drop-sh.fullyjustified.net | sh"
      ;;
    MINGW*|MSYS*|CYGWIN*)
      info "Install: scoop install tectonic / choco install tectonic"
      ;;
  esac
}

BUNDLED="$SCRIPT_DIR/tectonic"
if [[ -x "$BUNDLED" ]]; then
  if [[ "$OS" == "Darwin" && "$ARCH" == "arm64" ]]; then
    ok "tectonic (bundled, macOS arm64)"
  else
    # The bundled binary cannot run here; fall back to a system install.
    warn "bundled tectonic is macOS arm64 only — cannot run on $OS $ARCH"
    if ! command -v tectonic >/dev/null 2>&1; then
      fail "tectonic not in PATH"
      tectonic_install_hints
    else
      TEC_VER=$(tectonic --version 2>&1 | head -1)
      ok "tectonic (system: $TEC_VER)"
    fi
  fi
elif command -v tectonic >/dev/null 2>&1; then
  TEC_VER=$(tectonic --version 2>&1 | head -1)
  ok "tectonic ($TEC_VER)"
elif [[ -x "$HOME/tectonic" ]]; then
  ok "tectonic (~/tectonic)"
else
  warn "tectonic not installed (needed only for LaTeX/academic PDFs)"
  tectonic_install_hints
fi
# ── 8. LibreOffice (optional, for Office→PDF conversion) ──
# Detected through the `soffice` launcher; absence is only a warning.
echo ""
echo "--- LibreOffice (optional, Office→PDF) ---"
if ! command -v soffice >/dev/null 2>&1; then
  warn "libreoffice not installed (needed only for .docx/.xlsx→PDF conversion)"
  case "$OS" in
    Darwin)
      info "Install: brew install --cask libreoffice"
      ;;
    Linux)
      info "Install: sudo apt install libreoffice-core (Debian/Ubuntu)"
      ;;
  esac
else
  LO_VER=$(soffice --version 2>/dev/null | head -1)
  ok "libreoffice ($LO_VER)"
fi
# ── 9. CJK Fonts ──
# Two independent checks: fonts shipped with the skill (../fonts relative to
# this script) and CJK fonts installed on the system.
echo ""
echo "--- CJK Fonts ---"
FONT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)/fonts"
if [ -d "$FONT_DIR" ]; then
  # Count every .ttf/.otf file. No `head` in the pipeline: it capped the
  # reported count at 20 and, under `set -eo pipefail`, could abort the whole
  # script when `find` was killed by SIGPIPE after `head` exited.
  FONT_COUNT=$(find "$FONT_DIR" -type f \( -name "*.ttf" -o -name "*.otf" \) 2>/dev/null | wc -l | tr -d ' ')
  ok "fonts directory ($FONT_COUNT font files in $FONT_DIR)"
else
  warn "no fonts/ directory found — CJK PDFs may have missing glyphs"
  info "Expected at: $FONT_DIR"
fi
# Check system CJK fonts
if [ "$OS" = "Darwin" ]; then
  if ls /System/Library/Fonts/PingFang.ttc >/dev/null 2>&1 \
    || ls /System/Library/Fonts/STHeiti*.ttc >/dev/null 2>&1 \
    || ls "$HOME/Library/Fonts/"*SimHei* >/dev/null 2>&1; then
    ok "macOS CJK system fonts available"
  else
    warn "no common CJK system fonts found"
  fi
elif [ "$OS" = "Linux" ]; then
  # Capture the full listing first instead of piping into `head | grep -q`:
  # with pipefail, fc-list being cut off by SIGPIPE made the pipeline "fail"
  # and reported missing fonts even though some were found. `|| true` keeps
  # a missing fc-list from aborting the script under `set -e`.
  CJK_FONT_LIST=$(fc-list :lang=zh 2>/dev/null || true)
  if [ -n "$CJK_FONT_LIST" ]; then
    ok "system CJK fonts available (fc-list)"
  else
    warn "no CJK fonts found. Install: sudo apt install fonts-noto-cjk"
  fi
fi
# ── Summary ──
# Closing banner with pointers to the follow-up commands.
cat <<'EOF'

============================================
 Setup complete.
 Run 'python3 pdf.py env.check' for detailed status.
 Run 'python3 pdf.py env.fix' to auto-install Python deps.
============================================
EOF

2075
skills/pdf/scripts/toc_validate.py Executable file

File diff suppressed because it is too large Load Diff