Support generating a single PDF

2026-06-21 00:47:05 +08:00 · 2026-03-28 10:06:38 +08:00 · 2026-03-28 10:06:38 +08:00 · b1e0ad3245
commit b1e0ad3245
parent 573bb53a05
8 changed files with 554 additions and 2 deletions
--- a/.github/workflows/build-pdf.yaml
+++ b/.github/workflows/build-pdf.yaml
@ -0,0 +1,143 @@
+# Builds the book as a single PDF (output/ddia.pdf) on GitHub-hosted runners.
+name: Build and Release PDF
+
+# Triggers: pushes of tags matching v* or release*, plus manual runs.
+on:
+  push:
+    tags:
+      - 'v*'
+      - 'release*'
+  workflow_dispatch:
+
+# Write access to repository contents; presumably required by the release
+# step later in this job, which attaches the PDF to a release.
+permissions:
+  contents: write
+
+jobs:
+  build-pdf:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          # 0 requests the full history instead of a shallow clone
+          fetch-depth: 0
+
+      # Python is needed for bin/preprocess-epub.py in the build step
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      # pandoc is installed from a pinned upstream .deb (3.1.11); the TeX
+      # toolchain and the Noto CJK fonts come from the distribution packages
+      - name: Install dependencies
+        run: |
+          # Install pandoc
+          wget -q https://github.com/jgm/pandoc/releases/download/3.1.11/pandoc-3.1.11-1-amd64.deb
+          sudo dpkg -i pandoc-3.1.11-1-amd64.deb
+          rm pandoc-3.1.11-1-amd64.deb
+          
+          # Install TeX Live with XeLaTeX and Chinese support
+          sudo apt-get update
+          sudo apt-get install -y \
+            texlive-xetex \
+            texlive-lang-chinese \
+            texlive-fonts-recommended \
+            texlive-fonts-extra \
+            fonts-noto-cjk \
+            fonts-noto-cjk-extra
+          
+          # Verify installations
+          pandoc --version
+          xelatex --version
+
+
+      # Preprocess the Chinese Markdown sources and render them into one PDF
+      - name: Build PDF
+        run: |
+          chmod +x bin/pdf bin/preprocess-epub.py
+          mkdir -p output output/temp
+
+          # Write preprocessed copies of the sources into output/temp
+          python3 bin/preprocess-epub.py content/zh output/temp
+
+          # Input files in book order: front matter, each part followed by
+          # its chapters, then back matter
+          sources=()
+          for stem in \
+            _index preface \
+            part-i ch1 ch2 ch3 ch4 \
+            part-ii ch5 ch6 ch7 ch8 ch9 \
+            part-iii ch10 ch11 ch12 ch13 ch14 \
+            colophon glossary
+          do
+            sources+=("output/temp/${stem}.md")
+          done
+
+          # Render with the CI header, which selects the Noto CJK fonts
+          pandoc -o output/ddia.pdf \
+            --metadata-file=metadata.yaml \
+            -H bin/header-ci.tex \
+            --toc \
+            --toc-depth=2 \
+            --top-level-division=chapter \
+            --file-scope=true \
+            --pdf-engine=xelatex \
+            -V geometry:margin=1in \
+            -V linestretch=1.5 \
+            "${sources[@]}"
+
+          rm -rf output/temp
+
+          # Fail the step explicitly if no PDF was produced
+          [ -f "output/ddia.pdf" ] || { echo "Error: PDF file was not created"; exit 1; }
+
+          # Log size and detected file type of the result
+          ls -lh output/ddia.pdf
+          file output/ddia.pdf
+
+
+      # Store the PDF as a workflow artifact for 30 days. This step has no
+      # `if:` condition, so it also runs for manually dispatched builds.
+      - name: Upload PDF artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: ddia-pdf
+          path: output/ddia.pdf
+          retention-days: 30
+
+      # Publish (or update) the release named after the tag and attach the
+      # PDF. Only runs when the workflow ref is a tag.
+      # v2 is the maintained major version of the action; v1 targets the
+      # deprecated Node 16 action runtime. The inputs used here are unchanged.
+      - name: Create/Update Release
+        if: startsWith(github.ref, 'refs/tags/')
+        uses: softprops/action-gh-release@v2
+        with:
+          files: output/ddia.pdf
+          name: ${{ github.ref_name }}
+          body: |
+            ## 《设计数据密集型应用》PDF 版本
+            
+            此版本为自动生成的 PDF 电子书。
+            
+            ### 文件信息
+            - 文件名: `ddia.pdf`
+            - 生成时间: ${{ github.event.head_commit.timestamp }}
+            - 标签: ${{ github.ref_name }}
+            
+            ### 依赖工具
+            - Pandoc 3.1.11
+            - XeLaTeX (TeX Live)
+            - 中文字体: Noto CJK
+            
+            ### 本地生成
+            如需本地生成 PDF，请确保安装以下依赖：
+            ```bash
+            # macOS
+            brew install pandoc
+            brew install --cask mactex
+            
+            # Ubuntu/Debian
+            sudo apt install pandoc texlive-xetex texlive-lang-chinese
+            
+            # 生成 PDF
+            make pdf
+            ```
+          draft: false
+          prerelease: false
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.gitignore
+++ b/.gitignore
@ -10,4 +10,7 @@ public/
 CLAUDE.md
 content/cn/
 zh.md
-en.md
+en.md
+.venv
+AGENTS.md
+
--- a/Makefile
+++ b/Makefile
@ -17,4 +17,7 @@ translate:
 epub:
 	bin/epub

-.PHONY: default doc translate
+pdf:
+	bin/pdf
+
+.PHONY: default doc translate epub pdf
--- a/README.md
+++ b/README.md
@ -12,6 +12,19 @@

 **阅读**：访问 [https://ddia.vonng.com](https://ddia.vonng.com) 阅读本书在线版本，或使用 [hugo](https://gohugo.io/documentation/) / [hextra](https://imfing.github.io/hextra/zh-cn/) 主题自行构建。

+**下载**：可以使用以下命令在本地生成 PDF 电子书：
+
+```bash
+# 安装依赖
+brew install pandoc
+brew install --cask mactex  # 提供 XeLaTeX（生成 PDF 所需的 LaTeX 引擎）
+brew install poppler  # 用于 pdftotext
+
+# 生成 PDF
+make pdf
+```
+
+生成的 PDF 文件位于 `output/ddia.pdf`。
+
 > [!NOTE] 
 > [**DDIA 第二版**](https://ddia.vonng.com) 正在翻译中（翻译至至第十章），欢迎阅览并提出您的宝贵意见！[点击此处阅览第一版](https://ddia.vonng.com/v1)。

--- a/bin/header-ci.tex
+++ b/bin/header-ci.tex
@ -0,0 +1,31 @@
+% Preamble fragment for the CI build; the workflow passes it to pandoc via -H.
+% Chinese support with xeCJK (a XeTeX package), using the Noto CJK fonts
+\usepackage{xeCJK}
+\setCJKmainfont{Noto Serif CJK SC}
+\setCJKsansfont{Noto Sans CJK SC}
+
+% Enable Chinese line breaking
+\XeTeXlinebreaklocale "zh"
+
+% Paragraph settings: load parskip, then set the indent explicitly to 2em
+\usepackage{parskip}
+\setlength{\parindent}{2em}
+\usepackage{ragged2e}
+
+% Chinese punctuation style
+\punctstyle{quanjiao}
+
+% Cover page, emitted at \begin{document}.
+% Every line ends its own paragraph (\par inside the size group). Without the
+% paragraph breaks all four lines form a single paragraph and the \vspace
+% commands cannot separate them; with \par inside the group each line is also
+% set with the leading of its own font size.
+\AtBeginDocument{%
+  \thispagestyle{empty}
+  \begin{center}
+    \vspace*{0.4\textheight}
+    {\Huge\bfseries 设计数据密集型应用\par}
+    \vspace{1cm}
+    {\LARGE 第二版\par}
+    \vspace{2cm}
+    {\Large Martin Kleppmann\par}
+    \vspace{0.5cm}
+    {\large 冯若航 译\par}
+  \end{center}
+  \clearpage
+}
--- a/bin/header.tex
+++ b/bin/header.tex
@ -0,0 +1,31 @@
+% Preamble fragment for local builds; bin/pdf passes it to pandoc via -H.
+% Chinese support with xeCJK (a XeTeX package), using PingFang SC / Heiti SC
+\usepackage{xeCJK}
+\setCJKmainfont{PingFang SC}
+\setCJKsansfont{Heiti SC}
+
+% Enable Chinese line breaking
+\XeTeXlinebreaklocale "zh"
+
+% Paragraph settings: load parskip, then set the indent explicitly to 2em
+\usepackage{parskip}
+\setlength{\parindent}{2em}
+\usepackage{ragged2e}
+
+% Chinese punctuation style
+\punctstyle{quanjiao}
+
+% Cover page, emitted at \begin{document}.
+% Every line ends its own paragraph (\par inside the size group). Without the
+% paragraph breaks all four lines form a single paragraph and the \vspace
+% commands cannot separate them; with \par inside the group each line is also
+% set with the leading of its own font size.
+\AtBeginDocument{%
+  \thispagestyle{empty}
+  \begin{center}
+    \vspace*{0.4\textheight}
+    {\Huge\bfseries 设计数据密集型应用\par}
+    \vspace{1cm}
+    {\LARGE 第二版\par}
+    \vspace{2cm}
+    {\Large Martin Kleppmann\par}
+    \vspace{0.5cm}
+    {\large 冯若航 译\par}
+  \end{center}
+  \clearpage
+}
--- a/bin/pdf
+++ b/bin/pdf
@ -0,0 +1,108 @@
+#!/usr/bin/env bash
+
+set -e
+
+# Check for required dependencies.
+# Verifies that every external tool this script invokes is on PATH. If any
+# are missing, lists them with install hints on stderr and exits with
+# status 1; otherwise returns normally.
+check_dependencies() {
+	local missing_deps=()
+	local dep
+
+	if ! command -v pandoc &> /dev/null; then
+		missing_deps+=("pandoc")
+	fi
+
+	# Either LaTeX engine satisfies the check; xelatex is preferred later on
+	if ! command -v xelatex &> /dev/null && ! command -v lualatex &> /dev/null; then
+		missing_deps+=("xelatex or lualatex (LaTeX engine)")
+	fi
+
+	# The preprocessing step runs preprocess-epub.py through python3
+	if ! command -v python3 &> /dev/null; then
+		missing_deps+=("python3")
+	fi
+
+	if [ ${#missing_deps[@]} -ne 0 ]; then
+		{
+			echo "Error: Missing required dependencies:"
+			for dep in "${missing_deps[@]}"; do
+				echo "  - $dep"
+			done
+			echo ""
+			echo "Installation:"
+			echo "  macOS: brew install pandoc"
+			echo "  macOS: brew install --cask mactex"
+			echo ""
+			echo "  Linux: apt install pandoc texlive-xetex"
+		} >&2
+		exit 1
+	fi
+}
+
+check_dependencies
+
+# Detect available PDF engine
+if command -v xelatex &> /dev/null; then
+	PDF_ENGINE="xelatex"
+elif command -v lualatex &> /dev/null; then
+	PDF_ENGINE="lualatex"
+else
+	echo "Error: No suitable PDF engine found"
+	exit 1
+fi
+
+SCRIPT_DIR=$(dirname "$0")
+INPUT_DIR=$(cd "$(dirname "$SCRIPT_DIR")" && pwd)
+OUTPUT_DIR="$INPUT_DIR/output"
+TEMP_DIR="$OUTPUT_DIR/temp"
+
+# Create output directory if it doesn't exist
+mkdir -p "$OUTPUT_DIR"
+mkdir -p "$TEMP_DIR"
+
+# Preprocess Markdown files to convert Hugo shortcodes
+echo "Preprocessing Markdown files..."
+python3 "${SCRIPT_DIR}/preprocess-epub.py" "${INPUT_DIR}/content/zh" "$TEMP_DIR"
+
+convert_to_pdf() {
+	# Convert all Markdown files into a single PDF book
+	OUTPUT_BOOK="$OUTPUT_DIR/ddia.pdf"
+	rm -f "$OUTPUT_BOOK"
+	echo "Converting all Markdown files into $OUTPUT_BOOK..."
+
+	local meta_file=${INPUT_DIR}/metadata.yaml
+	local header_file=${SCRIPT_DIR}/header.tex
+
+	# Use xelatex for Chinese support with custom header
+	pandoc -o "$OUTPUT_BOOK" \
+		--metadata-file="$meta_file" \
+		-H "$header_file" \
+		--toc \
+		--toc-depth=2 \
+		--top-level-division=chapter \
+		--file-scope=true \
+		--pdf-engine="$PDF_ENGINE" \
+		-V geometry:margin=1in \
+		-V linestretch=1.5 \
+		"${TEMP_DIR}"/_index.md \
+		"${TEMP_DIR}"/preface.md \
+		"${TEMP_DIR}"/part-i.md \
+		"${TEMP_DIR}"/ch1.md \
+		"${TEMP_DIR}"/ch2.md \
+		"${TEMP_DIR}"/ch3.md \
+		"${TEMP_DIR}"/ch4.md \
+		"${TEMP_DIR}"/part-ii.md \
+		"${TEMP_DIR}"/ch5.md \
+		"${TEMP_DIR}"/ch6.md \
+		"${TEMP_DIR}"/ch7.md \
+		"${TEMP_DIR}"/ch8.md \
+		"${TEMP_DIR}"/ch9.md \
+		"${TEMP_DIR}"/part-iii.md \
+		"${TEMP_DIR}"/ch10.md \
+		"${TEMP_DIR}"/ch11.md \
+		"${TEMP_DIR}"/ch12.md \
+		"${TEMP_DIR}"/ch13.md \
+		"${TEMP_DIR}"/ch14.md \
+		"${TEMP_DIR}"/colophon.md \
+		"${TEMP_DIR}"/glossary.md
+
+	echo "PDF book created at $OUTPUT_BOOK."
+}
+
+convert_to_pdf
+
+# Clean up temporary files
+rm -rf "$TEMP_DIR"
--- a/bin/pdf.py
+++ b/bin/pdf.py
@ -0,0 +1,220 @@
+#!/usr/bin/env python3
+"""PDF generation from Markdown using pandoc + LaTeX."""
+
+import os
+import re
+import sys
+import subprocess
+import argparse
+from pathlib import Path
+from typing import Optional, List, Dict
+import shutil
+import importlib.util
+
+
+CHAPTER_ORDER = [
+    "_index.md",
+    "preface.md",
+    "part-i.md",
+    "ch1.md", "ch2.md", "ch3.md", "ch4.md",
+    "part-ii.md",
+    "ch5.md", "ch6.md", "ch7.md", "ch8.md", "ch9.md",
+    "part-iii.md",
+    "ch10.md", "ch11.md", "ch12.md", "ch13.md", "ch14.md",
+    "colophon.md", "glossary.md",
+]
+
+DEFAULT_FONTS = {
+    "mainfont": "PingFang SC",
+    "sansfont": "Heiti SC",
+}
+
+YAML_FRONT_RE = re.compile(r'^---\n(.*?)\n---\n', re.DOTALL)
+TITLE_RE = re.compile(r'^title:\s*["\']?([^"\'\n]+)["\']?\s*$', re.MULTILINE)
+CHAPTER_NUM_RE = re.compile(r'^\d+\.\s*')
+CHAPTER_FILE_RE = re.compile(r'^ch(\d+)\.md$')
+CALLOUT_RE = re.compile(r'^> \[!(NOTE|TIP|WARNING|CAUTION|DANGER)\] ', re.MULTILINE)
+
+
+def convert_pdf_markdown(text: str, filename: str) -> str:
+    """PDF-specific markdown conversions."""
+    text = _convert_callouts(text)
+    text = _add_title_heading(text, filename)
+    return text
+
+
+def _convert_callouts(text: str) -> str:
+    """Convert [!NOTE], [!TIP], etc. to Chinese."""
+    def replace_callout(match):
+        callout_type = match.group(1).lower()
+        title_map = {
+            'note': '注',
+            'tip': '提示',
+            'warning': '警告',
+            'caution': '注意',
+            'danger': '危险'
+        }
+        return f"**{title_map.get(callout_type, callout_type)}**: "
+
+    text = CALLOUT_RE.sub(replace_callout, text)
+    text = re.sub(r'^> ?', '', text, flags=re.MULTILINE)
+    return text
+
+
+def _add_title_heading(text: str, filename: str) -> str:
+    """Add title heading from YAML frontmatter."""
+    match = YAML_FRONT_RE.match(text)
+    if match:
+        frontmatter = match.group(1)
+        title_match = TITLE_RE.search(frontmatter)
+        if title_match:
+            title = title_match.group(1)
+            body = text[match.end():]
+            clean_title = CHAPTER_NUM_RE.sub('', title)
+
+            if CHAPTER_FILE_RE.match(filename):
+                heading = f"# {clean_title}"
+            else:
+                heading = f"## {clean_title}"
+
+            return f"---\n{frontmatter}\n---\n\n{heading}\n\n{body}"
+    return text
+
+
+def check_cmd(cmd: str) -> bool:
+    return subprocess.run(["which", cmd], capture_output=True).returncode == 0
+
+
+def get_available_engine() -> Optional[str]:
+    if check_cmd("xelatex"):
+        return "xelatex"
+    if check_cmd("lualatex"):
+        return "lualatex"
+    return None
+
+
+def check_dependencies() -> List[str]:
+    missing = []
+    if not check_cmd("pandoc"):
+        missing.append("pandoc")
+    if not get_available_engine():
+        missing.append("xelatex or lualatex (LaTeX engine)")
+    return missing
+
+
+def preprocess_markdown(input_dir: Path, output_dir: Path) -> None:
+    script_dir = Path(__file__).parent
+    preprocess_script = script_dir / "preprocess-epub.py"
+    spec = importlib.util.spec_from_file_location("preprocess_epub", preprocess_script)
+    if spec is None:
+        raise RuntimeError("Failed to load preprocess module")
+    module = importlib.util.module_from_spec(spec)
+    if spec.loader is None:
+        raise RuntimeError("Failed to load preprocess module loader")
+    spec.loader.exec_module(module)
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    md_files = sorted(input_dir.glob("*.md"))
+
+    print(f"Preprocessing {len(md_files)} files...")
+    for md_file in md_files:
+        temp_output = output_dir / "tmp_preprocess.md"
+        module.process_file(str(md_file), str(temp_output))
+
+        with open(temp_output, 'r', encoding='utf-8') as f:
+            content = f.read()
+
+        content = convert_pdf_markdown(content, md_file.name)
+
+        with open(output_dir / md_file.name, 'w', encoding='utf-8') as f:
+            f.write(content)
+
+        temp_output.unlink()
+
+
+def generate_pdf(
+    temp_dir: Path,
+    output_file: Path,
+    metadata_file: Optional[str],
+    engine: str,
+    fonts: Dict[str, str],
+    margin: str = "1in",
+) -> None:
+    chapters = [str(temp_dir / ch) for ch in CHAPTER_ORDER if (temp_dir / ch).exists()]
+
+    if not chapters:
+        raise ValueError("No valid chapter files found")
+
+    script_dir = Path(__file__).parent
+    header_file = script_dir / "header.tex"
+
+    cmd = [
+        "pandoc", "-o", str(output_file),
+        "--metadata-file", metadata_file or "",
+        "-H", str(header_file),
+        "--toc",
+        "--toc-depth=2",
+        "--top-level-division=chapter",
+        "--file-scope",
+        f"--pdf-engine={engine}",
+        f"-V geometry:margin={margin}",
+        "-V linestretch=1.5",
+        "-V book=true",
+        "-V classoption=openany",
+        "-V mainfont=PingFang SC",
+    ]
+    cmd = [c for c in cmd if c]
+    cmd.extend(chapters)
+
+    print(f"Generating PDF with {engine}...")
+    result = subprocess.run(cmd, capture_output=True, text=True)
+
+    if result.returncode != 0:
+        raise RuntimeError(f"PDF generation failed: {result.stderr}")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Generate PDF from Markdown")
+    parser.add_argument("-i", "--input", default="content/zh", help="Input directory")
+    parser.add_argument("-o", "--output", default="output", help="Output directory")
+    parser.add_argument("-m", "--metadata", help="Metadata YAML file")
+    parser.add_argument("-e", "--engine", choices=["xelatex", "lualatex"], help="PDF engine")
+    parser.add_argument("--no-cleanup", action="store_true", help="Keep temp files")
+    args = parser.parse_args()
+
+    project_root = Path(__file__).parent.parent
+    input_dir = project_root / args.input
+    output_dir = project_root / args.output
+    temp_dir = output_dir / "temp"
+
+    missing = check_dependencies()
+    if missing:
+        print("Error: Missing dependencies:")
+        for dep in missing:
+            print(f"  - {dep}")
+        print("\nInstall: brew install pandoc && brew install --cask mactex")
+        sys.exit(1)
+
+    detected_engine = get_available_engine()
+    if detected_engine is None:
+        print("Error: No PDF engine available")
+        sys.exit(1)
+
+    engine = args.engine or detected_engine
+    metadata = args.metadata or str(project_root / "metadata.yaml")
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    output_file = output_dir / "ddia.pdf"
+    output_file.unlink(missing_ok=True)
+
+    preprocess_markdown(input_dir, temp_dir)
+    generate_pdf(temp_dir, output_file, metadata, engine, DEFAULT_FONTS)
+
+    if not args.no_cleanup and temp_dir.exists():
+        shutil.rmtree(temp_dir)
+
+    print(f"PDF created: {output_file}")
+
+
+if __name__ == "__main__":
+    main()