Merge b1e0ad3245 into 900a2550dc

2026-06-22 17:37:04 +08:00 · 2026-04-09 11:34:44 +08:00 · 2026-04-09 11:34:44 +08:00 · eba72e0351
commit eba72e0351
parent 900a2550dc b1e0ad3245
8 changed files with 554 additions and 2 deletions
--- a/.github/workflows/build-pdf.yaml
+++ b/.github/workflows/build-pdf.yaml
@ -0,0 +1,143 @@
 # Builds the Chinese DDIA PDF with pandoc + XeLaTeX.
 # Triggered by tags matching v* / release* and by manual dispatch. The release
 # step at the end is guarded on tag refs, so a manual run only uploads the
 # workflow artifact.
 name: Build and Release PDF
 on:
  push:
    tags:
      - 'v*'
      - 'release*'
  workflow_dispatch:
 # Write access to repository contents is requested for the release step.
 permissions:
  contents: write
 jobs:
  build-pdf:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      # Python is needed for bin/preprocess-epub.py in the build step.
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      # pandoc is installed from a pinned upstream .deb (3.1.11); TeX Live and
      # the Noto CJK fonts referenced by bin/header-ci.tex come from apt.
      - name: Install dependencies
        run: |
          # Install pandoc
          wget -q https://github.com/jgm/pandoc/releases/download/3.1.11/pandoc-3.1.11-1-amd64.deb
          sudo dpkg -i pandoc-3.1.11-1-amd64.deb
          rm pandoc-3.1.11-1-amd64.deb
          # Install TeX Live with XeLaTeX and Chinese support
          sudo apt-get update
          sudo apt-get install -y \
            texlive-xetex \
            texlive-lang-chinese \
            texlive-fonts-recommended \
            texlive-fonts-extra \
            fonts-noto-cjk \
            fonts-noto-cjk-extra
          # Verify installations
          pandoc --version
          xelatex --version
      # Same pandoc options and chapter order as bin/pdf, but with the
      # CI header (header-ci.tex) instead of header.tex.
      - name: Build PDF
        run: |
          chmod +x bin/pdf bin/preprocess-epub.py
          mkdir -p output output/temp
          # Preprocess Markdown files
          python3 bin/preprocess-epub.py content/zh output/temp
          # Generate PDF with CI-specific fonts (Noto CJK)
          pandoc -o output/ddia.pdf \
            --metadata-file=metadata.yaml \
            -H bin/header-ci.tex \
            --toc \
            --toc-depth=2 \
            --top-level-division=chapter \
            --file-scope=true \
            --pdf-engine=xelatex \
            -V geometry:margin=1in \
            -V linestretch=1.5 \
            output/temp/_index.md \
            output/temp/preface.md \
            output/temp/part-i.md \
            output/temp/ch1.md \
            output/temp/ch2.md \
            output/temp/ch3.md \
            output/temp/ch4.md \
            output/temp/part-ii.md \
            output/temp/ch5.md \
            output/temp/ch6.md \
            output/temp/ch7.md \
            output/temp/ch8.md \
            output/temp/ch9.md \
            output/temp/part-iii.md \
            output/temp/ch10.md \
            output/temp/ch11.md \
            output/temp/ch12.md \
            output/temp/ch13.md \
            output/temp/ch14.md \
            output/temp/colophon.md \
            output/temp/glossary.md
          rm -rf output/temp
          if [ ! -f "output/ddia.pdf" ]; then
            echo "Error: PDF file was not created"
            exit 1
          fi
          ls -lh output/ddia.pdf
          file output/ddia.pdf
      # Always keep the PDF as a workflow artifact, including manual runs.
      - name: Upload PDF artifact
        uses: actions/upload-artifact@v4
        with:
          name: ddia-pdf
          path: output/ddia.pdf
          retention-days: 30
      # Only tag pushes publish a release. v2 of the action accepts the same
      # inputs used here; v1 is the older major line of the same action.
      - name: Create/Update Release
        if: startsWith(github.ref, 'refs/tags/')
        uses: softprops/action-gh-release@v2
        with:
          files: output/ddia.pdf
          name: ${{ github.ref_name }}
          body: |
            ## 《设计数据密集型应用》PDF 版本
            此版本为自动生成的 PDF 电子书。
            ### 文件信息
            - 文件名: `ddia.pdf`
            - 生成时间: ${{ github.event.head_commit.timestamp }}
            - 标签: ${{ github.ref_name }}
            ### 依赖工具
            - Pandoc 3.1.11
            - XeLaTeX (TeX Live)
            - 中文字体: Noto CJK
            ### 本地生成
            如需本地生成 PDF，请确保安装以下依赖：
            ```bash
            # macOS
            brew install pandoc
            brew install --cask mactex
            # Ubuntu/Debian
            sudo apt install pandoc texlive-xetex texlive-lang-chinese
            # 生成 PDF
            make pdf
            ```
          draft: false
          prerelease: false
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.gitignore
+++ b/.gitignore
@ -11,3 +11,6 @@ CLAUDE.md
 content/cn/
 zh.md
 en.md
 .venv
 AGENTS.md
--- a/5
+++ b/5
@ -17,4 +17,7 @@ translate:
 epub:
 	bin/epub
-.PHONY: default doc translate
+pdf:
 	bin/pdf
 .PHONY: default doc translate epub pdf
--- a/README.md
+++ b/README.md
@ -12,6 +12,19 @@
 **阅读**：访问 [https://ddia.vonng.com](https://ddia.vonng.com) 阅读本书在线版本，或使用 [hugo](https://gohugo.io/documentation/) / [hextra](https://imfing.github.io/hextra/zh-cn/) 主题自行构建。
 **下载**：可以使用以下命令生成 PDF 电子书：
 ```bash
 # 安装依赖
 brew install pandoc
 brew install --cask mactex  # 提供 xelatex（PDF 引擎）
 brew install poppler  # 用于 pdftotext
 # 生成 PDF
 make pdf
 ```
 生成的 PDF 文件位于 `output/ddia.pdf`
 > [!NOTE] 
 > [**DDIA 第二版**](https://ddia.vonng.com) 正在翻译中（翻译至第十章），欢迎阅览并提出您的宝贵意见！[点击此处阅览第一版](https://ddia.vonng.com/v1)。
--- a/bin/header-ci.tex
+++ b/bin/header-ci.tex
@ -0,0 +1,31 @@
 % LaTeX preamble for the CI build; the workflow passes it to pandoc with
 % `-H bin/header-ci.tex`. Same structure as bin/header.tex, different fonts.
 % Chinese support with xeCJK
 \usepackage{xeCJK}
 % Noto CJK families; the workflow installs the fonts-noto-cjk packages.
 \setCJKmainfont{Noto Serif CJK SC}
 \setCJKsansfont{Noto Sans CJK SC}
 % Enable Chinese line breaking
 \XeTeXlinebreaklocale "zh"
 % Paragraph settings
 % NOTE(review): \parindent is set after loading parskip, presumably to
 % restore a 2em first-line indent that parskip would otherwise remove.
 \usepackage{parskip}
 \setlength{\parindent}{2em}
 \usepackage{ragged2e}
 % Chinese punctuation style
 \punctstyle{quanjiao}
 % Cover page
 % Typeset at the start of the document body: title, edition, author and
 % translator centred on an unnumbered page, followed by a page break.
 \AtBeginDocument{%
  \thispagestyle{empty}
  \begin{center}
    \vspace*{0.4\textheight}
    {\Huge\bfseries 设计数据密集型应用}
    \vspace{1cm}
    {\LARGE 第二版}
    \vspace{2cm}
    {\Large Martin Kleppmann}
    \vspace{0.5cm}
    {\large 冯若航 译}
  \end{center}
  \clearpage
 }
--- a/bin/header.tex
+++ b/bin/header.tex
@ -0,0 +1,31 @@
 % LaTeX preamble for local builds; bin/pdf and bin/pdf.py pass it to pandoc
 % with `-H`. Same structure as bin/header-ci.tex, different fonts.
 % Chinese support with xeCJK
 \usepackage{xeCJK}
 % NOTE(review): PingFang SC / Heiti SC are presumably the macOS system
 % fonts; on other platforms these names may not resolve - confirm.
 \setCJKmainfont{PingFang SC}
 \setCJKsansfont{Heiti SC}
 % Enable Chinese line breaking
 \XeTeXlinebreaklocale "zh"
 % Paragraph settings
 % NOTE(review): \parindent is set after loading parskip, presumably to
 % restore a 2em first-line indent that parskip would otherwise remove.
 \usepackage{parskip}
 \setlength{\parindent}{2em}
 \usepackage{ragged2e}
 % Chinese punctuation style
 \punctstyle{quanjiao}
 % Cover page
 % Typeset at the start of the document body: title, edition, author and
 % translator centred on an unnumbered page, followed by a page break.
 \AtBeginDocument{%
  \thispagestyle{empty}
  \begin{center}
    \vspace*{0.4\textheight}
    {\Huge\bfseries 设计数据密集型应用}
    \vspace{1cm}
    {\LARGE 第二版}
    \vspace{2cm}
    {\Large Martin Kleppmann}
    \vspace{0.5cm}
    {\large 冯若航 译}
  \end{center}
  \clearpage
 }
--- a/bin/pdf
+++ b/bin/pdf
@ -0,0 +1,108 @@
 #!/usr/bin/env bash
 set -e
 # Check for required dependencies
 check_dependencies() {
 	# Verify that pandoc and at least one LaTeX engine are on PATH.
 	# Prints every missing tool plus install hints, then exits with status 1.
 	local -a absent=()
 	local item
 	command -v pandoc > /dev/null 2>&1 || absent+=("pandoc")
 	# Either engine satisfies the requirement, so report only when both are gone.
 	if ! command -v xelatex > /dev/null 2>&1 && ! command -v lualatex > /dev/null 2>&1; then
 		absent+=("xelatex or lualatex (LaTeX engine)")
 	fi
 	# Nothing missing: hand control back to the caller.
 	if [ "${#absent[@]}" -eq 0 ]; then
 		return 0
 	fi
 	echo "Error: Missing required dependencies:"
 	for item in "${absent[@]}"; do
 		echo "  - $item"
 	done
 	printf '%s\n' \
 		"" \
 		"Installation:" \
 		"  macOS: brew install pandoc" \
 		"  macOS: brew install --cask mactex" \
 		"" \
 		"  Linux: apt install pandoc texlive-xetex"
 	exit 1
 }
 check_dependencies
 # Detect available PDF engine (xelatex preferred, lualatex as fallback).
 # NOTE(review): header.tex loads xeCJK and sets \XeTeXlinebreaklocale, which
 # look XeTeX-specific; the lualatex fallback may not compile - confirm.
 if command -v xelatex &> /dev/null; then
 	PDF_ENGINE="xelatex"
 elif command -v lualatex &> /dev/null; then
 	PDF_ENGINE="lualatex"
 else
 	echo "Error: No suitable PDF engine found"
 	exit 1
 fi
 # Resolve both directories to absolute paths. With a relative SCRIPT_DIR,
 # starting the script from inside bin/ (SCRIPT_DIR=".") made
 # `dirname "$SCRIPT_DIR"` evaluate to "." as well, so the project root was
 # wrongly taken to be bin/ itself.
 SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
 INPUT_DIR=$(cd "$SCRIPT_DIR/.." && pwd)
 OUTPUT_DIR="$INPUT_DIR/output"
 TEMP_DIR="$OUTPUT_DIR/temp"
 # Create output directory if it doesn't exist
 mkdir -p "$OUTPUT_DIR"
 mkdir -p "$TEMP_DIR"
 # Preprocess Markdown files to convert Hugo shortcodes
 echo "Preprocessing Markdown files..."
 python3 "${SCRIPT_DIR}/preprocess-epub.py" "${INPUT_DIR}/content/zh" "$TEMP_DIR"
 convert_to_pdf() {
 	# Render the preprocessed chapters in $TEMP_DIR into one PDF book.
 	# Reads INPUT_DIR, SCRIPT_DIR, OUTPUT_DIR, TEMP_DIR and PDF_ENGINE;
 	# sets the global OUTPUT_BOOK to the path of the generated file.
 	OUTPUT_BOOK="$OUTPUT_DIR/ddia.pdf"
 	local meta_file="${INPUT_DIR}/metadata.yaml"
 	local header_file="${SCRIPT_DIR}/header.tex"
 	# Reading order of the book; every entry is a .md file inside $TEMP_DIR.
 	local -a chapter_names=(
 		_index preface
 		part-i ch1 ch2 ch3 ch4
 		part-ii ch5 ch6 ch7 ch8 ch9
 		part-iii ch10 ch11 ch12 ch13 ch14
 		colophon glossary
 	)
 	local -a chapter_paths=()
 	local chapter
 	for chapter in "${chapter_names[@]}"; do
 		chapter_paths+=("${TEMP_DIR}/${chapter}.md")
 	done
 	# Custom header supplies the CJK font setup and the cover page.
 	local -a pandoc_args=(
 		-o "$OUTPUT_BOOK"
 		--metadata-file="$meta_file"
 		-H "$header_file"
 		--toc
 		--toc-depth=2
 		--top-level-division=chapter
 		--file-scope=true
 		--pdf-engine="$PDF_ENGINE"
 		-V geometry:margin=1in
 		-V linestretch=1.5
 	)
 	# Remove any previous build so a stale PDF is never left behind.
 	rm -f "$OUTPUT_BOOK"
 	echo "Converting all Markdown files into $OUTPUT_BOOK..."
 	pandoc "${pandoc_args[@]}" "${chapter_paths[@]}"
 	echo "PDF book created at $OUTPUT_BOOK."
 }
 # Build the PDF from the preprocessed files in $TEMP_DIR.
 convert_to_pdf
 # Clean up temporary files (not reached when an earlier command fails,
 # because the script runs under `set -e`).
 rm -rf "$TEMP_DIR"
--- a/bin/pdf.py
+++ b/bin/pdf.py
@ -0,0 +1,220 @@
 #!/usr/bin/env python3
 """PDF generation from Markdown using pandoc + LaTeX."""
 import os
 import re
 import sys
 import subprocess
 import argparse
 from pathlib import Path
 from typing import Optional, List, Dict
 import shutil
 import importlib.util
 # Reading order of the book. generate_pdf() passes the files from this list
 # that exist in the temp directory to pandoc, in exactly this order.
 CHAPTER_ORDER = [
    "_index.md",
    "preface.md",
    "part-i.md",
    "ch1.md", "ch2.md", "ch3.md", "ch4.md",
    "part-ii.md",
    "ch5.md", "ch6.md", "ch7.md", "ch8.md", "ch9.md",
    "part-iii.md",
    "ch10.md", "ch11.md", "ch12.md", "ch13.md", "ch14.md",
    "colophon.md", "glossary.md",
 ]
 # Font names that main() hands to generate_pdf(); same fonts as bin/header.tex.
 DEFAULT_FONTS = {
    "mainfont": "PingFang SC",
    "sansfont": "Heiti SC",
 }
 # YAML front matter at the very start of a file (used with .match()):
 # group 1 is everything between the opening and closing `---` lines.
 YAML_FRONT_RE = re.compile(r'^---\n(.*?)\n---\n', re.DOTALL)
 # A `title:` line; group 1 is the value without an optional surrounding quote.
 # The value itself cannot contain a quote character (' or ").
 TITLE_RE = re.compile(r'^title:\s*["\']?([^"\'\n]+)["\']?\s*$', re.MULTILINE)
 # Leading "N. " numbering at the start of a title.
 CHAPTER_NUM_RE = re.compile(r'^\d+\.\s*')
 # File names of the form chN.md; group 1 is the chapter number.
 CHAPTER_FILE_RE = re.compile(r'^ch(\d+)\.md$')
 # Callout marker at the start of a quoted line, e.g. "> [!NOTE] ".
 # The pattern requires a space after the closing bracket.
 CALLOUT_RE = re.compile(r'^> \[!(NOTE|TIP|WARNING|CAUTION|DANGER)\] ', re.MULTILINE)
 def convert_pdf_markdown(text: str, filename: str) -> str:
    """Apply the PDF-only Markdown rewrites to the text of one file.
    Callout markers are rewritten first; afterwards a heading built from the
    front-matter title is inserted. *filename* is the bare file name and is
    used to choose the heading level.
    """
    without_callouts = _convert_callouts(text)
    return _add_title_heading(without_callouts, filename)
 def _convert_callouts(text: str) -> str:
    """Turn callout markers into bold Chinese labels and drop quote prefixes.
    A marker such as ``> [!NOTE] `` becomes ``**注**: ``. Afterwards the
    leading ``>`` (and one optional space) is removed from every line.
    NOTE(review): the second pass applies to the whole text, so ordinary
    blockquotes lose their quote marker as well - confirm this is intended.
    """
    labels = {
        'note': '注',
        'tip': '提示',
        'warning': '警告',
        'caution': '注意',
        'danger': '危险'
    }
    def to_label(found):
        kind = found.group(1).lower()
        label = labels.get(kind, kind)
        return f"**{label}**: "
    labelled = CALLOUT_RE.sub(to_label, text)
    return re.sub(r'^> ?', '', labelled, flags=re.MULTILINE)
 def _add_title_heading(text: str, filename: str) -> str:
    """Insert a Markdown heading taken from the front-matter ``title``.
    The text is returned unchanged when it has no leading front matter or the
    front matter has no ``title`` line. Otherwise the front matter is kept and
    a heading follows it: level 1 for ``chN.md`` files, level 2 for all other
    files. Leading "N. " numbering is removed from the title.
    """
    front = YAML_FRONT_RE.match(text)
    if front is None:
        return text
    frontmatter = front.group(1)
    title_found = TITLE_RE.search(frontmatter)
    if title_found is None:
        return text
    clean_title = CHAPTER_NUM_RE.sub('', title_found.group(1))
    # Chapters become top-level headings; everything else sits one level below.
    marker = "#" if CHAPTER_FILE_RE.match(filename) else "##"
    body = text[front.end():]
    return f"---\n{frontmatter}\n---\n\n{marker} {clean_title}\n\n{body}"
 def check_cmd(cmd: str) -> bool:
    """Return True when *cmd* resolves to an executable on the current PATH.
    Uses ``shutil.which`` rather than spawning the external ``which`` program,
    so the check also works where no ``which`` binary is installed and costs
    no subprocess.
    """
    return shutil.which(cmd) is not None
 def get_available_engine() -> Optional[str]:
    """Return the first installed LaTeX engine, or None when neither exists.
    ``xelatex`` is probed before ``lualatex``.
    """
    for candidate in ("xelatex", "lualatex"):
        if check_cmd(candidate):
            return candidate
    return None
 def check_dependencies() -> List[str]:
    """Return human-readable names of the required tools that are missing.
    The list is empty when pandoc and at least one LaTeX engine are available.
    """
    requirements = [
        ("pandoc", check_cmd("pandoc")),
        ("xelatex or lualatex (LaTeX engine)", get_available_engine()),
    ]
    return [label for label, found in requirements if not found]
 def preprocess_markdown(input_dir: Path, output_dir: Path) -> None:
    """Preprocess every ``*.md`` file in *input_dir* into *output_dir*.
    Each file is first run through ``process_file`` from the sibling
    ``preprocess-epub.py`` script, then through ``convert_pdf_markdown``, and
    written to *output_dir* under its original name.
    Args:
        input_dir: Directory holding the source Markdown files.
        output_dir: Destination directory; created when missing.
    Raises:
        RuntimeError: If the module spec or its loader for
            ``preprocess-epub.py`` cannot be created.
    """
    preprocess_script = Path(__file__).parent / "preprocess-epub.py"
    # The script name contains a hyphen, so it cannot be imported with a plain
    # ``import`` statement; load it from its path instead.
    spec = importlib.util.spec_from_file_location("preprocess_epub", preprocess_script)
    if spec is None:
        raise RuntimeError("Failed to load preprocess module")
    if spec.loader is None:
        raise RuntimeError("Failed to load preprocess module loader")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    output_dir.mkdir(parents=True, exist_ok=True)
    md_files = sorted(input_dir.glob("*.md"))
    print(f"Preprocessing {len(md_files)} files...")
    scratch = output_dir / "tmp_preprocess.md"
    for md_file in md_files:
        try:
            module.process_file(str(md_file), str(scratch))
            content = scratch.read_text(encoding='utf-8')
            content = convert_pdf_markdown(content, md_file.name)
            (output_dir / md_file.name).write_text(content, encoding='utf-8')
        finally:
            # Remove the scratch file even when a step above raises, so a
            # failed run does not leave it behind in the output directory.
            scratch.unlink(missing_ok=True)
 def generate_pdf(
    temp_dir: Path,
    output_file: Path,
    metadata_file: Optional[str],
    engine: str,
    fonts: Dict[str, str],
    margin: str = "1in",
 ) -> None:
    """Run pandoc to turn the preprocessed chapters into a single PDF.
    Args:
        temp_dir: Directory holding the preprocessed Markdown files.
        output_file: Path of the PDF to write.
        metadata_file: pandoc metadata YAML file; the option is omitted
            entirely when this is None or empty.
        engine: Value for pandoc's ``--pdf-engine``.
        fonts: Mapping of pandoc font variable name to font name; each entry
            is passed as ``-V name=value``. When empty, ``mainfont`` falls
            back to "PingFang SC".
        margin: Page margin passed as ``geometry:margin``.
    Raises:
        ValueError: If none of the files in CHAPTER_ORDER exist in *temp_dir*.
        RuntimeError: If pandoc exits with a non-zero status; the message
            carries pandoc's stderr.
    """
    chapters = [str(temp_dir / ch) for ch in CHAPTER_ORDER if (temp_dir / ch).exists()]
    if not chapters:
        raise ValueError("No valid chapter files found")
    header_file = Path(__file__).parent / "header.tex"
    cmd = ["pandoc", "-o", str(output_file)]
    # Add the flag together with its value. Previously an empty value was
    # filtered out while the flag stayed, so pandoc took "-H" as the file name.
    if metadata_file:
        cmd += ["--metadata-file", metadata_file]
    # Every -V option is two argv entries, so the variable name never carries
    # the space that a single "-V key=value" token would put in front of it.
    cmd += [
        "-H", str(header_file),
        "--toc",
        "--toc-depth=2",
        "--top-level-division=chapter",
        "--file-scope",
        f"--pdf-engine={engine}",
        "-V", f"geometry:margin={margin}",
        "-V", "linestretch=1.5",
        "-V", "book=true",
        "-V", "classoption=openany",
    ]
    # Honour the caller's fonts instead of a hard-coded main font.
    for name, value in (fonts or {"mainfont": "PingFang SC"}).items():
        cmd += ["-V", f"{name}={value}"]
    cmd.extend(chapters)
    print(f"Generating PDF with {engine}...")
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"PDF generation failed: {result.stderr}")
 def main():
    """Command-line entry point: preprocess the Markdown and build the PDF.
    Input and output directories are resolved relative to the project root
    (the parent of this script's directory). The PDF is written to
    ``<output>/ddia.pdf``; the temp directory is removed afterwards unless
    ``--no-cleanup`` is given. Exits with status 1 when a dependency or the
    requested engine is missing.
    """
    parser = argparse.ArgumentParser(description="Generate PDF from Markdown")
    parser.add_argument("-i", "--input", default="content/zh", help="Input directory")
    parser.add_argument("-o", "--output", default="output", help="Output directory")
    parser.add_argument("-m", "--metadata", help="Metadata YAML file")
    parser.add_argument("-e", "--engine", choices=["xelatex", "lualatex"], help="PDF engine")
    parser.add_argument("--no-cleanup", action="store_true", help="Keep temp files")
    args = parser.parse_args()
    project_root = Path(__file__).parent.parent
    input_dir = project_root / args.input
    output_dir = project_root / args.output
    temp_dir = output_dir / "temp"
    missing = check_dependencies()
    if missing:
        print("Error: Missing dependencies:")
        for dep in missing:
            print(f"  - {dep}")
        print("\nInstall: brew install pandoc && brew install --cask mactex")
        sys.exit(1)
    # The dependency check passes when either engine exists, so an explicitly
    # requested engine still has to be verified on its own.
    if args.engine and not check_cmd(args.engine):
        print(f"Error: Requested PDF engine not found: {args.engine}")
        sys.exit(1)
    detected_engine = get_available_engine()
    if detected_engine is None:
        print("Error: No PDF engine available")
        sys.exit(1)
    engine = args.engine or detected_engine
    metadata = args.metadata or str(project_root / "metadata.yaml")
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "ddia.pdf"
    # Drop any previous build so a stale PDF cannot pass for a fresh one.
    output_file.unlink(missing_ok=True)
    preprocess_markdown(input_dir, temp_dir)
    generate_pdf(temp_dir, output_file, metadata, engine, DEFAULT_FONTS)
    if not args.no_cleanup and temp_dir.exists():
        shutil.rmtree(temp_dir)
    print(f"PDF created: {output_file}")
 # Run the CLI only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()