2
0
Fork 0
mirror of https://github.com/Vonng/ddia.git synced 2026-06-21 00:47:05 +08:00
ddia/bin/pdf.py
2026-03-28 10:06:38 +08:00

220 lines
6.6 KiB
Python
Executable file

#!/usr/bin/env python3
"""PDF generation from Markdown using pandoc + LaTeX."""
import os
import re
import sys
import subprocess
import argparse
from pathlib import Path
from typing import Optional, List, Dict
import shutil
import importlib.util
# File names in the order generate_pdf() hands them to pandoc. Files that are
# not present in the temp directory are skipped there.
CHAPTER_ORDER = [
    "_index.md",
    "preface.md",
    "part-i.md",
    *(f"ch{number}.md" for number in range(1, 5)),
    "part-ii.md",
    *(f"ch{number}.md" for number in range(5, 10)),
    "part-iii.md",
    *(f"ch{number}.md" for number in range(10, 15)),
    "colophon.md",
    "glossary.md",
]

# Font settings that main() passes to generate_pdf().
DEFAULT_FONTS = dict(
    mainfont="PingFang SC",
    sansfont="Heiti SC",
)

# Leading YAML front matter block; group 1 is the text between the "---" lines.
YAML_FRONT_RE = re.compile(r'^---\n(.*?)\n---\n', flags=re.DOTALL)

# "title:" line inside the front matter; group 1 is the (optionally quoted) value.
TITLE_RE = re.compile(r'^title:\s*["\']?([^"\'\n]+)["\']?\s*$', flags=re.MULTILINE)

# Leading "<digits>. " prefix at the start of a title.
CHAPTER_NUM_RE = re.compile(r'^\d+\.\s*')

# File names of the form "ch<digits>.md"; group 1 is the chapter number.
CHAPTER_FILE_RE = re.compile(r'^ch(\d+)\.md$')

# Callout marker at the start of a quoted line, e.g. "> [!TIP] ".
CALLOUT_RE = re.compile(
    r'^> \[!(NOTE|TIP|WARNING|CAUTION|DANGER)\] ',
    flags=re.MULTILINE,
)
def convert_pdf_markdown(text: str, filename: str) -> str:
    """Apply the PDF-only Markdown rewrites to the content of one file.

    Callout markers are rewritten first; afterwards a heading derived from
    the front matter title is inserted.

    Args:
        text: Markdown content of the file.
        filename: Base name of the file, used to choose the heading level.

    Returns:
        The rewritten Markdown.
    """
    without_callouts = _convert_callouts(text)
    return _add_title_heading(without_callouts, filename)
def _convert_callouts(text: str) -> str:
    """Replace callout markers with bold Chinese labels and drop quote markers.

    A line starting with ``> [!TIP] `` (or WARNING, CAUTION, DANGER) has that
    marker replaced by ``**<label>**: ``. NOTE has no label, so its marker is
    simply removed. Afterwards one leading ``>`` (plus one optional space) is
    stripped from every line, which flattens all block quotes in the text,
    not only callouts.

    Args:
        text: Markdown content.

    Returns:
        The Markdown with callout markers and quote prefixes rewritten.
    """
    # Built once per call instead of once per regex match.
    labels = {
        'note': '',
        'tip': '提示',
        'warning': '警告',
        'caution': '注意',
        'danger': '危险',
    }

    def replace_callout(match):
        kind = match.group(1).lower()
        label = labels.get(kind, kind)
        # An empty label would otherwise yield "****: ", i.e. empty bold
        # markers and a dangling colon in front of the note text.
        return f"**{label}**: " if label else ""

    text = CALLOUT_RE.sub(replace_callout, text)
    return re.sub(r'^> ?', '', text, flags=re.MULTILINE)
def _add_title_heading(text: str, filename: str) -> str:
    """Insert a Markdown heading built from the front matter ``title``.

    The front matter block is kept and the heading is placed between it and
    the body. A leading "<digits>. " prefix is removed from the title. Files
    named ``ch<digits>.md`` get a level-1 heading, all others a level-2
    heading. Text without front matter, or without a ``title:`` line in it,
    is returned unchanged.

    Args:
        text: Markdown content, possibly starting with YAML front matter.
        filename: Base name of the file the content came from.

    Returns:
        The Markdown with the heading inserted, or ``text`` as-is.
    """
    front = YAML_FRONT_RE.match(text)
    if not front:
        return text

    frontmatter = front.group(1)
    title_found = TITLE_RE.search(frontmatter)
    if not title_found:
        return text

    clean_title = CHAPTER_NUM_RE.sub('', title_found.group(1))
    is_chapter = CHAPTER_FILE_RE.match(filename)
    heading = f"# {clean_title}" if is_chapter else f"## {clean_title}"
    body = text[front.end():]
    return f"---\n{frontmatter}\n---\n\n{heading}\n\n{body}"
def check_cmd(cmd: str) -> bool:
    """Return True when ``cmd`` resolves to an executable on the PATH.

    Uses ``shutil.which`` rather than spawning the external ``which``
    program, so the check also works on systems that do not ship ``which``.

    Args:
        cmd: Command name (or path) to look up.

    Returns:
        True if an executable was found, False otherwise.
    """
    return shutil.which(cmd) is not None
def get_available_engine() -> Optional[str]:
    """Return the first installed LaTeX engine, preferring xelatex.

    Returns:
        ``"xelatex"`` or ``"lualatex"``, or None when neither is found.
    """
    for candidate in ("xelatex", "lualatex"):
        if check_cmd(candidate):
            return candidate
    return None
def check_dependencies() -> List[str]:
    """List the external tools that are required but not installed.

    Returns:
        Human-readable names of the missing tools; empty when pandoc and at
        least one LaTeX engine are available.
    """
    checks = (
        (check_cmd("pandoc"), "pandoc"),
        (get_available_engine() is not None, "xelatex or lualatex (LaTeX engine)"),
    )
    return [label for available, label in checks if not available]
def preprocess_markdown(input_dir: Path, output_dir: Path) -> None:
    """Preprocess every ``*.md`` file in ``input_dir`` into ``output_dir``.

    Each file is first run through ``process_file`` from the sibling script
    ``preprocess-epub.py`` and then through ``convert_pdf_markdown``. The
    result is written to ``output_dir`` under the original file name.

    Args:
        input_dir: Directory containing the source Markdown files.
        output_dir: Directory receiving the processed files; created if
            it does not exist.

    Raises:
        RuntimeError: If the preprocess module cannot be loaded.
    """
    # The script name contains a hyphen, so it is loaded by path rather than
    # with a regular import statement.
    preprocess_script = Path(__file__).parent / "preprocess-epub.py"
    spec = importlib.util.spec_from_file_location("preprocess_epub", preprocess_script)
    if spec is None:
        raise RuntimeError("Failed to load preprocess module")
    module = importlib.util.module_from_spec(spec)
    if spec.loader is None:
        raise RuntimeError("Failed to load preprocess module loader")
    spec.loader.exec_module(module)

    output_dir.mkdir(parents=True, exist_ok=True)
    md_files = sorted(input_dir.glob("*.md"))
    print(f"Preprocessing {len(md_files)} files...")

    temp_output = output_dir / "tmp_preprocess.md"
    for md_file in md_files:
        try:
            module.process_file(str(md_file), str(temp_output))
            content = temp_output.read_text(encoding='utf-8')
            content = convert_pdf_markdown(content, md_file.name)
            (output_dir / md_file.name).write_text(content, encoding='utf-8')
        finally:
            # Remove the scratch file after every file, even on failure, so a
            # stale copy is never left behind or read for the next input.
            temp_output.unlink(missing_ok=True)
def generate_pdf(
    temp_dir: Path,
    output_file: Path,
    metadata_file: Optional[str],
    engine: str,
    fonts: Dict[str, str],
    margin: str = "1in",
) -> None:
    """Run pandoc to build the PDF from the preprocessed chapter files.

    Chapters are taken from ``CHAPTER_ORDER``; entries that do not exist in
    ``temp_dir`` are skipped. ``header.tex`` next to this script is included
    in the LaTeX header.

    Args:
        temp_dir: Directory containing the preprocessed Markdown files.
        output_file: Path of the PDF to create.
        metadata_file: Path to a pandoc metadata YAML file, or None/empty to
            run without ``--metadata-file``.
        engine: Value for pandoc's ``--pdf-engine`` option.
        fonts: Mapping of pandoc font variable name (e.g. ``mainfont``) to
            font name; each non-empty entry is passed as ``-V name=value``.
        margin: Page margin passed as ``geometry:margin``.

    Raises:
        ValueError: If none of the files in ``CHAPTER_ORDER`` exist.
        RuntimeError: If pandoc exits with a non-zero status.
    """
    chapters = [str(temp_dir / ch) for ch in CHAPTER_ORDER if (temp_dir / ch).exists()]
    if not chapters:
        raise ValueError("No valid chapter files found")

    header_file = Path(__file__).parent / "header.tex"

    cmd = ["pandoc", "-o", str(output_file)]
    # Emit the flag only together with its value; a bare "--metadata-file"
    # would consume the following option as its argument.
    if metadata_file:
        cmd += ["--metadata-file", metadata_file]
    cmd += [
        "-H", str(header_file),
        "--toc",
        "--toc-depth=2",
        "--top-level-division=chapter",
        "--file-scope",
        f"--pdf-engine={engine}",
    ]

    variables = [
        f"geometry:margin={margin}",
        "linestretch=1.5",
        "book=true",
        "classoption=openany",
    ]
    variables += [f"{name}={value}" for name, value in (fonts or {}).items() if value]
    # "-V" and its value are separate argv elements: a combined
    # "-V key=value" string reaches pandoc as one argument whose value starts
    # with a space, which risks the variable name being read with that space.
    for variable in variables:
        cmd += ["-V", variable]

    cmd += chapters

    print(f"Generating PDF with {engine}...")
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"PDF generation failed: {result.stderr}")
def main():
    """Command-line entry point: preprocess the Markdown and build ``ddia.pdf``.

    Input and output directories are resolved relative to the project root
    (the parent of this script's directory). Exits with status 1 when a
    required tool, the requested PDF engine, or the input directory is
    missing.
    """
    parser = argparse.ArgumentParser(description="Generate PDF from Markdown")
    parser.add_argument("-i", "--input", default="content/zh", help="Input directory")
    parser.add_argument("-o", "--output", default="output", help="Output directory")
    parser.add_argument("-m", "--metadata", help="Metadata YAML file")
    parser.add_argument("-e", "--engine", choices=["xelatex", "lualatex"], help="PDF engine")
    parser.add_argument("--no-cleanup", action="store_true", help="Keep temp files")
    args = parser.parse_args()

    project_root = Path(__file__).parent.parent
    input_dir = project_root / args.input
    output_dir = project_root / args.output
    temp_dir = output_dir / "temp"

    missing = check_dependencies()
    if missing:
        print("Error: Missing dependencies:", file=sys.stderr)
        for dep in missing:
            print(f" - {dep}", file=sys.stderr)
        print("\nInstall: brew install pandoc && brew install --cask mactex", file=sys.stderr)
        sys.exit(1)

    engine = args.engine or get_available_engine()
    if engine is None:
        print("Error: No PDF engine available", file=sys.stderr)
        sys.exit(1)
    # The dependency check only guarantees that *some* engine exists; an
    # explicitly requested one has to be verified separately.
    if not check_cmd(engine):
        print(f"Error: Requested PDF engine '{engine}' is not installed", file=sys.stderr)
        sys.exit(1)

    # Fail before the previous PDF is deleted when there is nothing to build from.
    if not input_dir.is_dir():
        print(f"Error: Input directory not found: {input_dir}", file=sys.stderr)
        sys.exit(1)

    metadata = args.metadata or str(project_root / "metadata.yaml")

    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "ddia.pdf"
    output_file.unlink(missing_ok=True)

    preprocess_markdown(input_dir, temp_dir)
    generate_pdf(temp_dir, output_file, metadata, engine, DEFAULT_FONTS)

    if not args.no_cleanup and temp_dir.exists():
        shutil.rmtree(temp_dir)

    print(f"PDF created: {output_file}")
if __name__ == "__main__":
main()