Files
mantle-ai-trader/skills/pdf/scripts/html2pdf-next.js
2026-06-06 05:21:10 +00:00

755 lines
30 KiB
JavaScript
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env node
/**
* html2pdf-next.js — HTML → PDF converter using Playwright + pdf-lib
*
* Drop-in replacement for html2pdf.js, WITHOUT Paged.js dependency.
* Uses Chromium native @page CSS for pagination + pdf-lib for post-processing.
*
* Usage:
* node html2pdf-next.js input.html
* node html2pdf-next.js input.html --output result.pdf
* node html2pdf-next.js input.html --css extra.css
* node html2pdf-next.js input.html --width 720px --height 960px
* node html2pdf-next.js input.html --direct (same as default now — no Paged.js to skip)
* node html2pdf-next.js input.html --merge a.pdf b.pdf (merge additional PDFs after)
*
* Architecture:
* 1. Playwright renders HTML → raw PDF via Chromium's native print engine
* 2. Pre-render hooks: Mermaid, KaTeX, oversized element fixes
* 3. Post-render: pdf-lib for merge, metadata, page count extraction
* 4. No Paged.js, no paged.polyfill.js — CSS @page handles pagination natively
*/
const fs = require('fs');
const path = require('path');
const { execSync, spawnSync } = require('child_process');
/**
 * Pause for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
// ═══════════════════════════════════════════════════════════════════
// Playwright / Chromium resolution (self-contained, no external helper)
// ═══════════════════════════════════════════════════════════════════
/**
 * Locate and load the `playwright` package.
 *
 * Resolution order:
 *   1. A plain `require('playwright')`.
 *   2. `<root>/playwright` for `PLAYWRIGHT_PATH` and every `NODE_PATH` entry.
 *   3. `<root>/playwright` under the directory printed by `npm root -g`
 *      (only queried when the steps above found nothing, since it spawns npm).
 *
 * @returns {object} The loaded Playwright module.
 * @throws {Error} When Playwright cannot be loaded from any location.
 */
function loadPlaywright() {
  // Try direct require first
  try { return require('playwright'); } catch (_) { /* fall through to manual search */ }

  const Module = require('module');
  const visited = new Set();

  // Attempt to load <base>/playwright; returns the module or null.
  const tryRoot = (base) => {
    // createRequire() only accepts absolute paths, so normalise relative roots.
    const pkgDir = path.resolve(base, 'playwright');
    if (visited.has(pkgDir)) return null;
    visited.add(pkgDir);

    const pkg = path.join(pkgDir, 'package.json');
    if (!fs.existsSync(pkg)) return null;

    let localRequire;
    try { localRequire = Module.createRequire(pkg); } catch (_) { return null; }

    // Bare-name lookup works when <base> is a node_modules directory.
    try { return localRequire('playwright'); } catch (_) { /* try by path next */ }
    // Loading by directory works for any <base>, e.g. a custom PLAYWRIGHT_PATH.
    try { return localRequire(pkgDir); } catch (_) { /* give up on this root */ }
    return null;
  };

  // Search roots supplied through the environment first
  const envRoots = [];
  if (process.env.PLAYWRIGHT_PATH) envRoots.push(process.env.PLAYWRIGHT_PATH);
  if (process.env.NODE_PATH) {
    envRoots.push(...process.env.NODE_PATH.split(path.delimiter).filter(Boolean));
  }
  for (const base of envRoots) {
    const mod = tryRoot(base);
    if (mod) return mod;
  }

  // Last resort: the global npm root (npm may be missing, hence best-effort)
  try {
    const g = execSync('npm root -g', { stdio: ['ignore', 'pipe', 'ignore'] }).toString().trim();
    if (g) {
      const mod = tryRoot(g);
      if (mod) return mod;
    }
  } catch (_) { /* npm unavailable — report "not found" below */ }

  throw new Error('Playwright not found. Install: npm install -g playwright');
}
/**
 * Load the `pdf-lib` package, falling back to the global npm root.
 *
 * @returns {object} The loaded pdf-lib module.
 * @throws {Error} When pdf-lib is neither locally resolvable nor installed globally.
 */
function loadPdfLib() {
  try {
    return require('pdf-lib');
  } catch (_) {
    // Not resolvable from this script — look in the global npm root instead.
  }

  const { createRequire } = require('module');
  try {
    const npmOutput = execSync('npm root -g', { stdio: ['ignore', 'pipe', 'ignore'] });
    const manifest = path.join(npmOutput.toString().trim(), 'pdf-lib', 'package.json');
    if (fs.existsSync(manifest)) {
      return createRequire(manifest)('pdf-lib');
    }
  } catch (_) {
    // Best effort: any failure here ends in the "not found" error below.
  }

  throw new Error('pdf-lib not found. Install: npm install -g pdf-lib');
}
/**
 * Work out which Chromium executable should be used.
 *
 * @param {object} chromiumObj - Object exposing `executablePath()` (Playwright's `chromium`).
 * @param {boolean} [allowInstall=false] - When true, run `npx playwright install chromium`
 *   if neither the bundled browser nor a system browser exists.
 * @returns {{status: 'ok'|'fallback'|'installed'|'missing', executablePath: string}}
 *   `ok` = bundled browser present, `fallback` = system/env browser,
 *   `installed` = bundled browser present after installing, `missing` = nothing usable.
 */
function resolveChromium(chromiumObj, allowInstall = false) {
  // Ask Playwright for its bundled path; keep `previous` if the call throws.
  const bundledPath = (previous) => {
    try {
      return chromiumObj.executablePath();
    } catch (_) {
      return previous;
    }
  };
  const isUsable = (candidate) => Boolean(candidate) && fs.existsSync(candidate);

  let bundled = bundledPath(null);
  if (isUsable(bundled)) {
    return { status: 'ok', executablePath: bundled };
  }

  // System Chrome/Chromium locations, with the env override checked first
  const systemBrowsers = [
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
    '/usr/bin/chromium-browser', '/usr/bin/chromium', '/usr/bin/google-chrome',
  ];
  const override = process.env.PLAYWRIGHT_CHROMIUM_PATH;
  const searchOrder = override ? [override, ...systemBrowsers] : systemBrowsers;
  const systemHit = searchOrder.find((candidate) => fs.existsSync(candidate));
  if (systemHit !== undefined) {
    return { status: 'fallback', executablePath: systemHit };
  }

  if (allowInstall) {
    const install = spawnSync('npx', ['playwright', 'install', 'chromium'], { stdio: 'inherit', shell: true });
    if (install.status === 0) {
      bundled = bundledPath(bundled);
      if (isUsable(bundled)) {
        return { status: 'installed', executablePath: bundled };
      }
    }
  }

  return { status: 'missing', executablePath: bundled || '' };
}
// ═══════════════════════════════════════════════════════════════════
// CLI
// ═══════════════════════════════════════════════════════════════════
/**
 * Parse `process.argv` into the converter's options.
 *
 * The first argument is always the input HTML file. With no arguments, or
 * with `-h`/`--help` as the first argument, the usage text is printed and the
 * process exits with status 0. Unknown options are ignored.
 *
 * @returns {{inputFile: string, outputFile: string, customCSS: ?string,
 *   width: ?string, height: ?string, mergeFiles: string[], title: ?string}}
 *   `outputFile` defaults to the input path with a `.pdf` extension.
 */
function cli() {
  const tokens = process.argv.slice(2);
  if (!tokens.length || tokens[0] === '-h' || tokens[0] === '--help') {
    console.log(`
Usage: node html2pdf-next.js <input.html> [options]
Options:
--output, -o <file> Output PDF path (default: <input>.pdf)
--css <file> Inject extra stylesheet
--width <px> Custom page width (e.g. 720px)
--height <px> Custom page height (e.g. 960px)
--direct (no-op, kept for backward compat — always direct now)
--merge <files...> Append additional PDF files after conversion
--title <text> Set PDF document title metadata
--help, -h Show help
`);
    process.exit(0);
  }

  // A token that ends a `--merge` file list: any long option or a known short
  // flag. Checking only for `--` made `--merge a.pdf -o out.pdf` treat `-o`
  // and `out.pdf` as files to merge.
  const isOptionToken = (tok) => tok.startsWith('--') || tok === '-o' || tok === '-h';

  const inputFile = tokens[0];
  let outputFile = null;
  let customCSS = null;
  let width = null;
  let height = null;
  let title = null;
  const mergeFiles = [];

  for (let i = 1; i < tokens.length; i++) {
    const t = tokens[i];
    if (t === '--output' || t === '-o') outputFile = tokens[++i];
    else if (t === '--css') customCSS = tokens[++i];
    else if (t === '--width') width = tokens[++i];
    else if (t === '--height') height = tokens[++i];
    else if (t === '--direct') { /* no-op, always direct */ }
    else if (t === '--title') title = tokens[++i];
    else if (t === '--merge') {
      while (i + 1 < tokens.length && !isOptionToken(tokens[i + 1])) {
        mergeFiles.push(tokens[++i]);
      }
    }
  }

  if (!outputFile) {
    // Default: same directory and base name as the input, with .pdf extension
    const p = path.parse(inputFile);
    outputFile = path.join(p.dir || '.', p.name + '.pdf');
  }
  return { inputFile, outputFile, customCSS, width, height, mergeFiles, title };
}
// ═══════════════════════════════════════════════════════════════════
// Helpers
// ═══════════════════════════════════════════════════════════════════
/**
 * Format a byte count with one decimal place and a binary unit (B, KB, MB, GB).
 *
 * @param {number} n - Size in bytes.
 * @returns {string} For example `"1.5 KB"`; values past the GB range stay in GB.
 */
function prettyBytes(n) {
  const units = ['B', 'KB', 'MB', 'GB'];
  const lastUnit = units.length - 1;
  let scaled = n;
  let unitIndex = 0;
  // Step up one unit per division until the value fits or units run out.
  for (; unitIndex < lastUnit && scaled >= 1024; unitIndex += 1) {
    scaled /= 1024;
  }
  return `${scaled.toFixed(1)} ${units[unitIndex]}`;
}
// ═══════════════════════════════════════════════════════════════════
// Pre-render hooks (run in browser context before PDF export)
// ═══════════════════════════════════════════════════════════════════
/**
 * Prepare a loaded page for printing. Runs, in order:
 *   1. wait for Mermaid diagrams to render;
 *   2. load KaTeX from the CDN if needed and trigger math rendering;
 *   3. drop `break-inside: avoid` on elements taller than 1000px;
 *   4. report overflowing elements; 4b. un-clip page-level containers;
 *      4c. move text-bearing `position:absolute; bottom:…` elements into flow;
 *   5. add a default `@page { margin: 20mm }` when no inline `<style>` has `@page`;
 *   6. make cover sections fill a full print page.
 *
 * The page DOM is mutated in place.
 *
 * @param {object} page - Playwright page (uses `evaluate`, `waitForFunction`,
 *   `addStyleTag`, `addScriptTag`).
 * @returns {Promise<{warnings: string[], contentHeight: number}>} Collected
 *   warnings and the document scroll height measured after step 4b.
 */
async function preRenderHooks(page) {
  const warnings = [];

  // Snapshot of KaTeX state: library loaded, output present, raw delimiters in text.
  const probeKatex = () => page.evaluate(() => ({
    lib: typeof renderMathInElement === 'function' || typeof katex !== 'undefined',
    rendered: document.querySelectorAll('.katex').length > 0,
    raw: /\$[^$]+\$|\$\$[^$]+\$\$|\\\(.*?\\\)|\\\[.*?\\\]/.test(document.body.innerText),
  }));

  // 1. Wait for Mermaid diagrams
  const hasMermaid = await page.evaluate(() => document.querySelectorAll('.mermaid').length > 0);
  if (hasMermaid) {
    console.log(' ⏳ Waiting for Mermaid diagrams...');
    try {
      await page.waitForFunction(() => {
        for (const m of document.querySelectorAll('.mermaid'))
          if (!m.querySelector('svg') && !m.getAttribute('data-processed')) return false;
        return true;
      }, { timeout: 30000 });
      await sleep(2000);
      console.log(' ✓ Mermaid rendered');
    } catch (_) {
      warnings.push('Mermaid rendering timed out (30s)');
    }
  }

  // 2. Trigger KaTeX math rendering
  const katexStatus = await probeKatex();
  // Auto-inject KaTeX CDN if raw math detected but library not loaded
  if (!katexStatus.lib && katexStatus.raw && !katexStatus.rendered) {
    console.log(' ⏳ Auto-injecting KaTeX CDN (math formulas detected but KaTeX not loaded)...');
    try {
      await page.addStyleTag({ url: 'https://cdn.jsdelivr.net/npm/katex@0.16.22/dist/katex.min.css' });
      await page.addScriptTag({ url: 'https://cdn.jsdelivr.net/npm/katex@0.16.22/dist/katex.min.js' });
      await page.addScriptTag({ url: 'https://cdn.jsdelivr.net/npm/katex@0.16.22/dist/contrib/auto-render.min.js' });
      await sleep(2000); // Wait for CDN scripts to load
    } catch (_) {
      // An unreachable CDN rejects the tag injection. Swallow it here so the
      // re-check below downgrades the failure to a warning instead of
      // aborting the whole conversion.
    }
    // Re-check
    const recheckLib = await page.evaluate(() => typeof renderMathInElement === 'function');
    if (recheckLib) {
      console.log(' ✓ KaTeX CDN loaded successfully');
    } else {
      console.log(' ⚠ KaTeX CDN failed to load — math will render as raw text');
      warnings.push('KaTeX CDN injection failed; math formulas may appear as raw LaTeX code');
    }
  }
  // Re-evaluate after potential CDN injection
  const katexReady = await probeKatex();
  if (katexReady.lib && !katexReady.rendered && katexReady.raw) {
    console.log(' ⏳ Triggering KaTeX rendering...');
    await page.evaluate(() => {
      if (typeof renderMathInElement === 'function')
        renderMathInElement(document.body, {
          delimiters: [
            { left: '$$', right: '$$', display: true },
            { left: '$', right: '$', display: false },
            { left: '\\(', right: '\\)', display: false },
            { left: '\\[', right: '\\]', display: true },
          ],
          throwOnError: false,
        });
    });
    await sleep(1000);
    console.log(' ✓ KaTeX rendered');
  } else if (katexReady.rendered) {
    await sleep(500); // Font loading settle
  }

  // 3. Fix oversized elements that prevent page breaks
  const nFixed = await page.evaluate(() => {
    const LIMIT = 1000;
    let n = 0;
    document.querySelectorAll(
      '[style*="page-break-inside: avoid"],[style*="break-inside: avoid"],' +
      '.avoid-break,table,figure,.theorem,.algorithm'
    ).forEach(el => {
      if (el.getBoundingClientRect().height > LIMIT) {
        el.style.pageBreakInside = 'auto';
        el.style.breakInside = 'auto';
        n++;
      }
    });
    return n;
  });
  if (nFixed) {
    console.log(` ⚠ Fixed ${nFixed} oversized elements (removed break-inside: avoid)`);
  }

  // 4. Detect overflow (horizontal AND vertical)
  const overflows = await page.evaluate(() => {
    const out = [];
    document.querySelectorAll('pre,table,figure,img,svg,.mermaid,blockquote,.equation').forEach(el => {
      const hDiff = el.scrollWidth - el.clientWidth;
      const vDiff = el.scrollHeight - el.clientHeight;
      if (hDiff > 2 || vDiff > 2) out.push({
        tag: el.tagName.toLowerCase(),
        cls: el.className || '',
        hOverflow: hDiff > 2 ? hDiff : 0,
        vOverflow: vDiff > 2 ? vDiff : 0,
        preview: (el.textContent || '').slice(0, 50).replace(/\s+/g, ' '),
      });
    });
    return out;
  });
  if (overflows.length) {
    console.log(' ⚠ Overflow detected:');
    overflows.forEach(o => {
      const parts = [];
      if (o.hOverflow) parts.push(`H +${o.hOverflow}px`);
      if (o.vOverflow) parts.push(`V +${o.vOverflow}px`);
      console.log(` <${o.tag}${o.cls ? '.' + o.cls.split(' ')[0] : ''}> ${parts.join(', ')}`);
    });
    warnings.push(`${overflows.length} element(s) have overflow`);
  }

  // 4b. Fix vertical overflow on page-level containers
  // When html/body or the main content canvas has a fixed height + overflow:hidden,
  // content gets clipped. For documents (html2pdf-next.js), we DON'T expand the
  // container to its scrollHeight — that creates an oversized single "page" that
  // Playwright splits unevenly. Instead, we remove the fixed height and overflow:hidden
  // so content flows naturally and @page CSS handles pagination.
  //
  // (The old "expand to scrollHeight" logic belongs in html2poster.js where a single
  // continuous canvas is the desired output.)
  const vOverflowFix = await page.evaluate(() => {
    const fixes = [];
    // Candidates: html, body, and any direct child of body that acts as a full-page canvas
    const candidates = [document.documentElement, document.body];
    const bodyChildren = document.body.children;
    for (let i = 0; i < bodyChildren.length; i++) {
      const child = bodyChildren[i];
      // Skip SVG defs, script, style elements
      const tag = child.tagName.toLowerCase();
      if (tag === 'svg' || tag === 'script' || tag === 'style' || tag === 'link') continue;
      candidates.push(child);
      // Also check one level deeper (e.g., .canvas > .content)
      for (let j = 0; j < child.children.length; j++) {
        const grandchild = child.children[j];
        const gtag = grandchild.tagName.toLowerCase();
        if (gtag === 'svg' || gtag === 'script' || gtag === 'style') continue;
        candidates.push(grandchild);
      }
    }
    for (const el of candidates) {
      const computed = getComputedStyle(el);
      // Test the vertical axis explicitly: the `overflow` shorthand serialises
      // as two values when the axes differ, so comparing it to 'hidden' missed
      // elements that only set overflow-y.
      const overflowY = computed.overflowY;
      const hasHiddenOverflow = overflowY === 'hidden' || overflowY === 'clip';
      const diff = el.scrollHeight - el.clientHeight;
      if (hasHiddenOverflow && diff > 5) {
        // This element is clipping content vertically
        const tag = el.tagName.toLowerCase();
        const id = el.id ? `#${el.id}` : '';
        const cls = el.className ? `.${String(el.className).split(' ')[0]}` : '';
        const selector = `${tag}${id}${cls}`;
        const oldHeight = el.clientHeight;
        // Document mode: remove fixed height + overflow:hidden,
        // let @page handle natural pagination
        el.style.height = 'auto';
        el.style.minHeight = 'auto';
        el.style.maxHeight = 'none';
        el.style.overflow = 'visible';
        el.style.overflowY = 'visible';
        fixes.push({
          selector,
          oldHeight,
          clipped: diff,
        });
      }
    }
    // After fixing containers, re-measure to get the final content height
    const finalHeight = Math.max(
      document.documentElement.scrollHeight,
      document.body.scrollHeight
    );
    return { fixes, finalHeight };
  });
  if (vOverflowFix.fixes.length) {
    console.log(' ⚠️ Removed fixed height + overflow:hidden — content will paginate naturally:');
    vOverflowFix.fixes.forEach(f => {
      console.log(` ${f.selector}: was ${f.oldHeight}px with ${f.clipped}px clipped → now auto (content will flow to next page)`);
    });
  }

  // 4c. Convert absolute-bottom elements to document flow
  // Elements with `position: absolute; bottom: Npx` inside page containers
  // are pinned relative to their containing block. When content paginates
  // across multiple @page pages, these elements either overlap with body
  // text or land on the wrong page. Fix: convert them to static positioning
  // so they participate in normal document flow and paginate naturally.
  const absBottomFix = await page.evaluate(() => {
    const converted = [];
    // Scan inside page-level containers (body children and their children)
    const containers = [];
    for (let i = 0; i < document.body.children.length; i++) {
      const child = document.body.children[i];
      const tag = child.tagName.toLowerCase();
      if (tag === 'svg' || tag === 'script' || tag === 'style' || tag === 'link') continue;
      containers.push(child);
    }
    for (const container of containers) {
      const descendants = container.querySelectorAll('*');
      for (const el of descendants) {
        const computed = getComputedStyle(el);
        if (computed.position === 'absolute' && computed.bottom !== 'auto' && computed.bottom !== '') {
          // Check if this element contains visible text (not just decorative)
          const hasText = el.textContent && el.textContent.trim().length > 0;
          if (!hasText) continue;
          const tag = el.tagName.toLowerCase();
          const id = el.id ? `#${el.id}` : '';
          const cls = el.className ? `.${String(el.className).split(' ')[0]}` : '';
          const selector = `${tag}${id}${cls}`;
          // Read the value BEFORE mutating: `computed` is a live object and
          // would otherwise report the post-change value in the log.
          const originalBottom = computed.bottom;
          // Convert to static flow: remove absolute positioning
          el.style.position = 'static';
          el.style.bottom = 'auto';
          el.style.left = 'auto';
          el.style.right = 'auto';
          // Existing padding/margin on the element is left untouched.
          converted.push({ selector, bottom: originalBottom });
        }
      }
    }
    return converted;
  });
  if (absBottomFix.length) {
    console.log(' ⚠️ Converted absolute-bottom elements to document flow (prevents overlap on multi-page):');
    absBottomFix.forEach(f => {
      console.log(` ${f.selector}: was position:absolute;bottom:${f.bottom} → now static (flows with content)`);
    });
  }

  // 5. Inject minimal @page CSS fallback
  // Only inline <style> elements are inspected for an existing @page rule.
  await page.evaluate(() => {
    const styles = Array.from(document.querySelectorAll('style'));
    const hasPageRule = styles.some(s => (s.textContent || '').includes('@page'));
    if (!hasPageRule) {
      const s = document.createElement('style');
      s.textContent = `@page { margin: 20mm; }`;
      document.head.appendChild(s);
    }
  });

  // 6. Fix full-page cover sections for print
  // In screen mode, height:100vh = viewport height. In print mode, 100vh ≠ page height.
  // Detect elements using 100vh and convert to print-safe page-filling behavior.
  const coverFixed = await page.evaluate(() => {
    let fixed = 0;
    // Find elements with height: 100vh (inline or computed)
    // NOTE(review): computed values are normally resolved to px, so the
    // `computed.* === '100vh'` comparisons probably never match — confirm.
    const allEls = document.querySelectorAll('*');
    for (const el of allEls) {
      const style = el.style;
      const computed = getComputedStyle(el);
      const isVh = style.height === '100vh' || computed.height === '100vh' ||
        style.minHeight === '100vh' || computed.minHeight === '100vh';
      // Also detect via class name hints
      const isCover = el.classList.contains('cover') || el.classList.contains('cover-page') ||
        el.id === 'cover' || el.getAttribute('data-role') === 'cover';
      if (isVh || (isCover && el.offsetHeight > 0)) {
        // Force the element to fill the print page
        el.style.height = '100vh';
        el.style.minHeight = '100vh';
        el.style.pageBreakAfter = 'always';
        el.style.pageBreakInside = 'avoid';
        el.style.boxSizing = 'border-box';
        el.style.overflow = 'hidden';
        fixed++;
      }
    }
    // Inject print-specific CSS to make 100vh work correctly
    if (fixed > 0) {
      const s = document.createElement('style');
      s.textContent = `
@media print {
.cover, .cover-page, [data-role="cover"] {
height: 100vh !important;
min-height: 100vh !important;
page-break-after: always !important;
page-break-inside: avoid !important;
overflow: hidden !important;
}
}
`;
      document.head.appendChild(s);
    }
    return fixed;
  });
  if (coverFixed) {
    console.log(` ✓ Fixed ${coverFixed} full-page cover section(s) for print`);
    // Also inject named @page rule for cover with zero margins
    await page.evaluate(() => {
      const s = document.createElement('style');
      s.textContent = `
@page cover-page {
margin: 0 !important;
}
@media print {
.cover, .cover-page, [data-role="cover"] {
page: cover-page;
margin: 0 !important;
padding: 40px !important;
}
}
`;
      document.head.appendChild(s);
    });
  }

  return { warnings, contentHeight: vOverflowFix.finalHeight };
}
// ═══════════════════════════════════════════════════════════════════
// Content statistics (post-render, read from the page DOM)
// ═══════════════════════════════════════════════════════════════════
/**
 * Gather rough content statistics from the page DOM.
 *
 * @param {object} page - Object whose `evaluate(fn)` runs `fn` in the page context.
 * @returns {Promise<{wordCount: number, figures: number, tables: number}>}
 *   `wordCount` is the number of characters in U+4E00–U+9FA5 plus the number
 *   of ASCII-letter runs in `document.body.innerText`.
 */
async function collectStats(page) {
  const stats = await page.evaluate(() => {
    // Everything in this callback must be self-contained: it runs in the page.
    const occurrences = (source, pattern) => {
      const hits = source.match(pattern);
      return hits ? hits.length : 0;
    };
    const countNodes = (selector) => document.querySelectorAll(selector).length;

    const pageText = document.body.innerText || '';
    const cjkChars = occurrences(pageText, /[\u4e00-\u9fa5]/g);
    const latinWords = occurrences(pageText, /[a-zA-Z]+/g);

    return {
      wordCount: cjkChars + latinWords,
      figures: countNodes('figure,.figure,img'),
      tables: countNodes('table'),
    };
  });
  return stats;
}
// ═══════════════════════════════════════════════════════════════════
// pdf-lib post-processing: page count, metadata, merge
// ═══════════════════════════════════════════════════════════════════
/**
 * Post-process a generated PDF in place using pdf-lib: set metadata,
 * optionally append other PDFs, and rewrite the file.
 *
 * @param {string} pdfPath - PDF to read and overwrite.
 * @param {{title?: string, mergeFiles?: string[]}} [options] - Optional title
 *   metadata and extra PDFs to append in order; missing files are skipped
 *   with a console message.
 * @returns {Promise<{pageCount: number, originalPages: number}>} Page count
 *   after merging and page count before merging.
 */
async function postProcess(pdfPath, options = {}) {
  const { PDFDocument } = loadPdfLib();
  const { title, mergeFiles } = options;

  const target = await PDFDocument.load(fs.readFileSync(pdfPath));

  // Metadata
  if (title) {
    target.setTitle(title);
  }
  target.setProducer('html2pdf-next (Playwright + pdf-lib)');
  target.setCreationDate(new Date());

  // Remember the size before anything is appended
  const originalPages = target.getPageCount();

  // Append donor PDFs one at a time so output order matches argument order
  if (mergeFiles && mergeFiles.length) {
    for (const donorPath of mergeFiles) {
      if (fs.existsSync(donorPath)) {
        console.log(` 📎 Merging: ${path.basename(donorPath)}`);
        const donor = await PDFDocument.load(fs.readFileSync(donorPath));
        const pages = await target.copyPages(donor, donor.getPageIndices());
        for (const donorPage of pages) {
          target.addPage(donorPage);
        }
      } else {
        console.log(` ⚠ Merge file not found: ${donorPath}`);
      }
    }
  }

  // Persist the result over the original file
  fs.writeFileSync(pdfPath, await target.save());

  return { pageCount: target.getPageCount(), originalPages };
}
// ═══════════════════════════════════════════════════════════════════
// Main pipeline
// ═══════════════════════════════════════════════════════════════════
/**
 * Convert an HTML file to PDF: load it in Chromium, run the pre-render hooks,
 * print it, post-process it with pdf-lib, and print a summary report.
 *
 * Exits the process with status 1 when the input or CSS file is missing, the
 * browser cannot be launched, or the conversion fails, and with status 2 when
 * no Chromium executable is available.
 *
 * @param {string} inputFile - Path to the source HTML file.
 * @param {string} outputFile - Path of the PDF to write.
 * @param {?string} customCSS - Optional stylesheet injected into the HTML.
 * @param {{width?: ?string, height?: ?string, mergeFiles?: string[], title?: ?string}} [options]
 *   Custom page size, PDFs to append, and title metadata.
 * @returns {Promise<void>}
 */
async function convert(inputFile, outputFile, customCSS, options = {}) {
  const { width, height, mergeFiles, title } = options;
  // Local require keeps this function self-contained.
  const { pathToFileURL } = require('url');

  if (!fs.existsSync(inputFile)) {
    console.error(`✗ File not found: ${inputFile}`);
    process.exit(1);
  }
  const playwright = loadPlaywright();
  const { chromium } = playwright;

  // Resolve browser
  const canInstall = process.env.PDF_SKIP_BROWSER_INSTALL !== '1';
  const bInfo = resolveChromium(chromium, canInstall);
  if (bInfo.status === 'missing') {
    console.error('\n✗ Chromium not found. Run: npx playwright install chromium\n');
    process.exit(2);
  }
  if (bInfo.status === 'fallback') {
    console.log(`⚠ Using fallback Chromium: ${bInfo.executablePath}`);
  }

  const absIn = path.resolve(inputFile);
  const absOut = path.resolve(outputFile);
  console.log(`\n🔄 Converting ${path.basename(inputFile)}...`);
  console.log(` Engine: Playwright + Chromium native @page (no Paged.js)`);

  // Temp HTML is only created when extra CSS is injected; track it so every
  // exit path removes exactly the file this run created.
  let tmpHtml = null;
  const removeTmpHtml = () => {
    if (!tmpHtml) return;
    try {
      if (fs.existsSync(tmpHtml)) fs.unlinkSync(tmpHtml);
    } catch (_) { /* best-effort cleanup */ }
    tmpHtml = null;
  };

  // Read and optionally inject CSS
  let html = fs.readFileSync(absIn, 'utf-8');
  if (customCSS) {
    if (!fs.existsSync(customCSS)) {
      console.error(`✗ CSS file not found: ${customCSS}`);
      process.exit(1);
    }
    const tag = `<style>${fs.readFileSync(customCSS, 'utf-8')}</style>`;
    // A replacer function inserts the CSS verbatim; a plain replacement string
    // would expand `$&`, `$'`, `$$` etc. found in the stylesheet.
    html = html.includes('</head>')
      ? html.replace('</head>', () => tag + '\n</head>')
      : tag + '\n' + html;
    // Write modified HTML for Playwright to load
    tmpHtml = absIn + '.tmp.html';
    fs.writeFileSync(tmpHtml, html);
  }

  // Launch browser
  let browser;
  try {
    const opts = { headless: true };
    if (bInfo.status === 'fallback') opts.executablePath = bInfo.executablePath;
    browser = await chromium.launch(opts);
  } catch (err) {
    const msg = err.message || '';
    if (msg.includes('shared libraries') || msg.includes('.so')) {
      console.error('\n✗ Missing system libraries. Run: npx playwright install-deps chromium\n');
    } else {
      console.error(`\n✗ Browser launch failed: ${msg}\n`);
    }
    removeTmpHtml();
    process.exit(1);
  }

  // Exit only after `finally` has run: process.exit() inside `catch` would
  // skip closing the browser and removing the temp file.
  let failed = false;
  try {
    const page = await browser.newPage();
    const loadFile = tmpHtml || absIn;
    // pathToFileURL escapes characters such as '#', '?' and '%' and handles
    // Windows drive letters, which plain 'file://' + path does not.
    await page.goto(pathToFileURL(loadFile).href, { waitUntil: 'networkidle' });

    // ── Pre-render hooks ──
    console.log('\n📋 Pre-render checks:');
    const preRenderResult = await preRenderHooks(page);
    const warnings = preRenderResult.warnings;
    const measuredContentHeight = preRenderResult.contentHeight;

    // ── Detect continuous-canvas mode (design_engine.py output) ──
    const continuousInfo = await page.evaluate(() => {
      const el = document.querySelector('.continuous-canvas');
      if (!el) return null;
      const root = getComputedStyle(document.documentElement);
      return {
        width: root.getPropertyValue('--canvas-w').trim() || '720px',
        height: root.getPropertyValue('--canvas-h').trim() || '960px',
        pages: el.querySelectorAll('.page-section').length,
      };
    });

    if (continuousInfo) {
      // Creative PDF: seamless multi-page canvas
      console.log(`\n🎨 Continuous canvas: ${continuousInfo.pages} pages @ ${continuousInfo.width} × ${continuousInfo.height}`);
      await page.pdf({
        path: absOut,
        printBackground: true,
        margin: { top: 0, right: 0, bottom: 0, left: 0 },
        width: continuousInfo.width,
        height: continuousInfo.height,
      });
    } else {
      // Standard document
      console.log('\n📄 Rendering PDF...');
      const pdfOpts = {
        path: absOut,
        printBackground: true,
        preferCSSPageSize: true,
        tagged: true,
      };
      if (width || height) {
        if (width) pdfOpts.width = width;
        if (height) pdfOpts.height = height;
        pdfOpts.margin = { top: 0, right: 0, bottom: 0, left: 0 };
        console.log(` Custom size: ${pdfOpts.width || 'auto'} × ${pdfOpts.height || 'auto'}`);
      } else {
        // No explicit size: check if @page CSS defines a fixed size
        const pageSize = await page.evaluate(() => {
          const styles = Array.from(document.querySelectorAll('style'));
          for (const s of styles) {
            const text = s.textContent || '';
            const match = text.match(/@page\s*\{[^}]*size:\s*([\d.]+)px\s+([\d.]+)px/);
            if (match) return { width: parseFloat(match[1]), height: parseFloat(match[2]) };
          }
          return null;
        });
        if (pageSize) {
          // @page defines a fixed size — rely on preferCSSPageSize (set above)
          // and zero the margins.
          pdfOpts.margin = { top: 0, right: 0, bottom: 0, left: 0 };
          console.log(` @page size: ${pageSize.width}px × ${pageSize.height}px`);
          if (measuredContentHeight && measuredContentHeight > pageSize.height + 5) {
            const estPages = Math.ceil(measuredContentHeight / pageSize.height);
            console.log(` Content height: ${measuredContentHeight}px → ~${estPages} pages`);
          }
        } else {
          pdfOpts.format = 'A4';
        }
      }
      await page.pdf(pdfOpts);
    }

    // Collect content stats from the page
    const stats = await collectStats(page);

    // ── pdf-lib post-processing ──
    console.log('\n🔧 Post-processing (pdf-lib):');
    const postResult = await postProcess(absOut, { mergeFiles, title });

    // ── Report ──
    const sz = fs.statSync(absOut).size;
    console.log('\n' + '═'.repeat(40));
    console.log(' PDF Generated Successfully');
    console.log('═'.repeat(40));
    console.log(` File: ${path.basename(absOut)}`);
    console.log(` Pages: ${postResult.pageCount}`);
    console.log(` Size: ${prettyBytes(sz)}`);
    console.log(` Words: ~${stats.wordCount.toLocaleString()}`);
    console.log(` Assets: ${stats.figures} figures, ${stats.tables} tables`);
    console.log(` Engine: Playwright (no Paged.js)`);
    console.log(` Path: ${absOut}`);
    if (mergeFiles && mergeFiles.length && postResult.pageCount > postResult.originalPages) {
      console.log(` Merged: +${postResult.pageCount - postResult.originalPages} pages from ${mergeFiles.length} file(s)`);
    }
    if (warnings.length) {
      console.log('\n⚠ Warnings:');
      warnings.forEach(w => console.log(` · ${w}`));
    }
    // Anomaly detection
    if (postResult.pageCount > 1 && stats.wordCount > 0) {
      const avgWordsPerPage = stats.wordCount / postResult.pageCount;
      if (avgWordsPerPage < 30) {
        console.log(`\n⚠ Low content density: ~${Math.round(avgWordsPerPage)} words/page (expected 100+)`);
      }
    }
  } catch (err) {
    console.error('\n✗ Conversion failed:', err.message);
    failed = true;
  } finally {
    try {
      await browser.close();
    } catch (_) { /* browser may already be gone */ }
    // Clean up temp HTML on success and on failure
    removeTmpHtml();
  }
  if (failed) process.exit(1);
}
// ═══════════════════════════════════════════════════════════════════
// Entry
// ═══════════════════════════════════════════════════════════════════
/**
 * Script entry point: parse the command line, then run the conversion.
 * Any error raised along the way is reported and the process exits with status 1.
 */
async function main() {
  const { inputFile, outputFile, customCSS, width, height, mergeFiles, title } = cli();
  await convert(inputFile, outputFile, customCSS, { width, height, mergeFiles, title });
}

main().catch((err) => {
  console.error('Error:', err.message);
  process.exit(1);
});