ddia/bin/pdf.py

#!/usr/bin/env python3
"""PDF generation from Markdown using pandoc + LaTeX."""

import os
import re
import sys
import subprocess
import argparse
from pathlib import Path
from typing import Optional, List, Dict
import shutil
import importlib.util


# File names, in reading order, that generate_pdf() looks for in the
# preprocessed directory. Files missing from that directory are skipped;
# files not listed here are never passed to pandoc.
CHAPTER_ORDER = [
    "_index.md",
    "preface.md",
    "part-i.md",
    "ch1.md", "ch2.md", "ch3.md", "ch4.md",
    "part-ii.md",
    "ch5.md", "ch6.md", "ch7.md", "ch8.md", "ch9.md",
    "part-iii.md",
    "ch10.md", "ch11.md", "ch12.md", "ch13.md", "ch14.md",
    "colophon.md", "glossary.md",
]

# Font settings that main() hands to generate_pdf() as its `fonts` argument.
DEFAULT_FONTS = {
    "mainfont": "PingFang SC",
    "sansfont": "Heiti SC",
}

# A YAML front matter block: '---' line, any content (DOTALL lets '.' span
# newlines), closing '---' line. Group 1 is the content between the fences.
YAML_FRONT_RE = re.compile(r'^---\n(.*?)\n---\n', re.DOTALL)
# A 'title:' line; group 1 is the value with one optional surrounding quote
# character removed on each side.
TITLE_RE = re.compile(r'^title:\s*["\']?([^"\'\n]+)["\']?\s*$', re.MULTILINE)
# A leading "<digits>." numbering prefix (plus any following whitespace).
CHAPTER_NUM_RE = re.compile(r'^\d+\.\s*')
# File names of the form ch<N>.md; group 1 is the chapter number.
CHAPTER_FILE_RE = re.compile(r'^ch(\d+)\.md$')
# A blockquote line opening with a callout marker such as '> [!NOTE] '.
# NOTE(review): the pattern requires a space after the closing bracket, so a
# bare '> [!NOTE]' line with nothing after it does not match — confirm this
# agrees with the Markdown this script receives.
CALLOUT_RE = re.compile(r'^> \[!(NOTE|TIP|WARNING|CAUTION|DANGER)\] ', re.MULTILINE)


def convert_pdf_markdown(text: str, filename: str) -> str:
    """Apply the PDF-only Markdown rewrites to a single document.

    Callout markers are rewritten first; afterwards a heading built from the
    YAML front matter title is inserted.

    Args:
        text: Markdown source of the document.
        filename: Base name of the source file, forwarded to the heading
            step, which uses it to choose the heading level.

    Returns:
        The rewritten Markdown.
    """
    without_callouts = _convert_callouts(text)
    return _add_title_heading(without_callouts, filename)


def _convert_callouts(text: str) -> str:
    """Swap callout markers for bold Chinese labels and unquote the text.

    Every match of CALLOUT_RE (e.g. ``> [!NOTE] ``) is replaced by a bold
    label followed by ``: ``. After that, a leading ``>`` and one optional
    space are removed from *every* line of the document, so ordinary
    blockquotes lose their marker as well, not only callouts.

    Args:
        text: Markdown source.

    Returns:
        The Markdown with callouts relabelled and quote prefixes stripped.
    """
    labels = {
        'note': '注',
        'tip': '提示',
        'warning': '警告',
        'caution': '注意',
        'danger': '危险'
    }

    def to_label(found):
        kind = found.group(1).lower()
        # Unknown kinds fall back to the lower-cased marker name itself.
        return f"**{labels.get(kind, kind)}**: "

    relabelled = CALLOUT_RE.sub(to_label, text)
    return re.sub(r'^> ?', '', relabelled, flags=re.MULTILINE)


def _add_title_heading(text: str, filename: str) -> str:
    """Insert a Markdown heading built from the front matter ``title``.

    The front matter block is kept as-is and the heading is placed between
    it and the body. Any leading ``<digits>.`` prefix is removed from the
    title. Files named ``ch<N>.md`` get a level-1 heading; every other file
    gets a level-2 heading.

    Args:
        text: Markdown source, expected to start with YAML front matter.
        filename: Base name of the source file.

    Returns:
        The Markdown with the heading inserted, or ``text`` unchanged when
        there is no front matter or no ``title`` entry in it.
    """
    front = YAML_FRONT_RE.match(text)
    if front is None:
        return text

    frontmatter = front.group(1)
    found_title = TITLE_RE.search(frontmatter)
    if found_title is None:
        return text

    clean_title = CHAPTER_NUM_RE.sub('', found_title.group(1))
    is_chapter = CHAPTER_FILE_RE.match(filename)
    heading = f"# {clean_title}" if is_chapter else f"## {clean_title}"
    body = text[front.end():]

    return f"---\n{frontmatter}\n---\n\n{heading}\n\n{body}"


def check_cmd(cmd: str) -> bool:
    """Report whether ``cmd`` resolves to an executable.

    The lookup is done in-process with ``shutil.which`` rather than by
    spawning the external ``which`` program, so it also works on systems
    where ``which`` is not installed and does not cost a subprocess.

    Args:
        cmd: Command name to search for on PATH, or a path to an executable.

    Returns:
        True if an executable was found, False otherwise.
    """
    return shutil.which(cmd) is not None


def get_available_engine() -> Optional[str]:
    """Return the first installed LaTeX engine, or None if there is none.

    Candidates are probed with check_cmd() in order of preference:
    ``xelatex`` first, then ``lualatex``.
    """
    for candidate in ("xelatex", "lualatex"):
        if check_cmd(candidate):
            return candidate
    return None


def check_dependencies() -> List[str]:
    """List the external tools this script needs but cannot find.

    Returns:
        Human-readable descriptions of the missing tools, in the order
        pandoc, LaTeX engine. The list is empty when everything is present.
    """
    requirements = (
        ("pandoc", check_cmd("pandoc")),
        ("xelatex or lualatex (LaTeX engine)", bool(get_available_engine())),
    )
    return [label for label, present in requirements if not present]


def preprocess_markdown(input_dir: Path, output_dir: Path) -> None:
    """Preprocess every ``*.md`` file in ``input_dir`` into ``output_dir``.

    Each file is first run through ``process_file`` from the sibling script
    ``preprocess-epub.py`` and then through convert_pdf_markdown(). The
    result is written to ``output_dir`` under the original file name.

    Args:
        input_dir: Directory holding the source Markdown files.
        output_dir: Directory to write the processed files to; created if it
            does not exist.

    Raises:
        RuntimeError: If no import spec or loader can be obtained for the
            preprocess script.
    """
    script_dir = Path(__file__).parent
    preprocess_script = script_dir / "preprocess-epub.py"
    # The script name contains a hyphen, so a plain `import` statement cannot
    # name it; it is loaded from its path instead.
    spec = importlib.util.spec_from_file_location("preprocess_epub", preprocess_script)
    if spec is None:
        raise RuntimeError("Failed to load preprocess module")
    module = importlib.util.module_from_spec(spec)
    if spec.loader is None:
        raise RuntimeError("Failed to load preprocess module loader")
    spec.loader.exec_module(module)

    output_dir.mkdir(parents=True, exist_ok=True)
    md_files = sorted(input_dir.glob("*.md"))
    temp_output = output_dir / "tmp_preprocess.md"

    print(f"Preprocessing {len(md_files)} files...")
    for md_file in md_files:
        try:
            # process_file() is expected to write its result to temp_output;
            # the file is read back right after the call.
            module.process_file(str(md_file), str(temp_output))
            content = temp_output.read_text(encoding='utf-8')
        finally:
            # Remove the scratch file even when preprocessing fails, so a
            # failed run does not leave it behind in the output directory.
            temp_output.unlink(missing_ok=True)

        content = convert_pdf_markdown(content, md_file.name)
        (output_dir / md_file.name).write_text(content, encoding='utf-8')


def generate_pdf(
    temp_dir: Path,
    output_file: Path,
    metadata_file: Optional[str],
    engine: str,
    fonts: Dict[str, str],
    margin: str = "1in",
) -> None:
    """Run pandoc over the preprocessed chapters to produce the PDF.

    Args:
        temp_dir: Directory containing the preprocessed Markdown files.
        output_file: Path of the PDF to create.
        metadata_file: Optional pandoc metadata YAML file. When it is None
            or empty, no ``--metadata-file`` option is passed at all.
        engine: LaTeX engine passed to ``--pdf-engine``.
        fonts: Pandoc font variables (e.g. ``mainfont``, ``sansfont``) mapped
            to font names. ``mainfont`` defaults to "PingFang SC" when the
            mapping does not provide it.
        margin: Page margin used for the ``geometry`` variable.

    Raises:
        ValueError: If none of the files in CHAPTER_ORDER exist in
            ``temp_dir``.
        RuntimeError: If pandoc exits with a non-zero status; the message
            carries pandoc's stderr.
    """
    chapters = [str(temp_dir / ch) for ch in CHAPTER_ORDER if (temp_dir / ch).exists()]

    if not chapters:
        raise ValueError("No valid chapter files found")

    script_dir = Path(__file__).parent
    header_file = script_dir / "header.tex"

    cmd = ["pandoc", "-o", str(output_file)]
    # Add the flag and its value together: a flag without a value would
    # consume the next option as its argument.
    if metadata_file:
        cmd += ["--metadata-file", metadata_file]
    cmd += [
        "-H", str(header_file),
        "--toc",
        "--toc-depth=2",
        "--top-level-division=chapter",
        "--file-scope",
        f"--pdf-engine={engine}",
    ]

    variables = [
        f"geometry:margin={margin}",
        "linestretch=1.5",
        "book=true",
        "classoption=openany",
    ]
    font_settings = {"mainfont": "PingFang SC", **fonts}
    variables.extend(f"{name}={value}" for name, value in font_settings.items())
    # "-V" and its value must be separate argv entries. Passed as the single
    # string "-V key=value", the option argument starts with a space and
    # pandoc sees a variable named " key".
    for variable in variables:
        cmd += ["-V", variable]

    cmd.extend(chapters)

    print(f"Generating PDF with {engine}...")
    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode != 0:
        raise RuntimeError(f"PDF generation failed: {result.stderr}")


def main():
    """Command-line entry point: build ``ddia.pdf`` from the Markdown sources.

    Input and output directories are resolved relative to the project root
    (the parent of this script's directory). The process exits with status 1
    when pandoc or a LaTeX engine is missing, or when an engine requested
    with ``--engine`` is not installed. The temporary directory is removed
    afterwards, including when a build step fails, unless ``--no-cleanup``
    is given.
    """
    parser = argparse.ArgumentParser(description="Generate PDF from Markdown")
    parser.add_argument("-i", "--input", default="content/zh", help="Input directory")
    parser.add_argument("-o", "--output", default="output", help="Output directory")
    parser.add_argument("-m", "--metadata", help="Metadata YAML file")
    parser.add_argument("-e", "--engine", choices=["xelatex", "lualatex"], help="PDF engine")
    parser.add_argument("--no-cleanup", action="store_true", help="Keep temp files")
    args = parser.parse_args()

    project_root = Path(__file__).parent.parent
    input_dir = project_root / args.input
    output_dir = project_root / args.output
    temp_dir = output_dir / "temp"

    missing = check_dependencies()
    if missing:
        print("Error: Missing dependencies:")
        for dep in missing:
            print(f"  - {dep}")
        print("\nInstall: brew install pandoc && brew install --cask mactex")
        sys.exit(1)

    if args.engine:
        # The dependency check only proves that *some* engine exists; make
        # sure the one explicitly asked for is actually installed.
        if not check_cmd(args.engine):
            print(f"Error: Requested PDF engine not found: {args.engine}")
            sys.exit(1)
        engine = args.engine
    else:
        engine = get_available_engine()
        if engine is None:
            print("Error: No PDF engine available")
            sys.exit(1)

    metadata = args.metadata or str(project_root / "metadata.yaml")

    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "ddia.pdf"
    # Remove any previous result so a failed run cannot leave a stale PDF.
    output_file.unlink(missing_ok=True)

    try:
        preprocess_markdown(input_dir, temp_dir)
        generate_pdf(temp_dir, output_file, metadata, engine, DEFAULT_FONTS)
    finally:
        # Clean up on failure as well as on success.
        if not args.no_cleanup and temp_dir.exists():
            shutil.rmtree(temp_dir)

    print(f"PDF created: {output_file}")


# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()