From e391301906a999f401aaa66a9f62dc3dbc0df5a9 Mon Sep 17 00:00:00 2001
From: Feng Ruohang <rh@vonng.com>
Date: Sun, 10 Aug 2025 16:45:19 +0800
Subject: [PATCH] add toc generate script

---
 bin/toc.py | 268 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 268 insertions(+)
 create mode 100755 bin/toc.py
diff --git a/bin/toc.py b/bin/toc.py
new file mode 100755
index 0000000..5a29432
--- /dev/null
+++ b/bin/toc.py
@@ -0,0 +1,268 @@
+#!/usr/bin/env python3
+"""
+TOC Generator for DDIA book
+Usage: python toc.py <lang> <depth> [output_file]
+Example: python toc.py zh 2
+         python toc.py en 3 en-toc.md
+"""
+
+import os
+import sys
+import re
+from pathlib import Path
+
+
+def extract_front_matter_title(content):
+    """Extract title from Hugo front matter"""
+    lines = content.split('\n')
+    in_front_matter = False
+    for line in lines:
+        if line.strip() == '---':
+            if not in_front_matter:
+                in_front_matter = True
+            else:
+                break
+        elif in_front_matter and line.startswith('title:'):
+            # Extract title, removing quotes
+            title = line.split(':', 1)[1].strip()
+            if title.startswith('"') and title.endswith('"'):
+                title = title[1:-1]
+            elif title.startswith("'") and title.endswith("'"):
+                title = title[1:-1]
+            return title
+    return None
+
+
+def extract_headings(content, max_depth):
+    """Extract headings up to specified depth from markdown content
+    
+    max_depth=1 -> extract H2 only
+    max_depth=2 -> extract H2-H3
+    max_depth=3 -> extract H2-H4
+    max_depth=4 -> extract H2-H5
+    """
+    headings = []
+    lines = content.split('\n')
+    
+    # Skip front matter
+    skip_until = 0
+    if lines[0].strip() == '---':
+        for i, line in enumerate(lines[1:], 1):
+            if line.strip() == '---':
+                skip_until = i + 1
+                break
+    
+    for line in lines[skip_until:]:
+        # Match markdown headings with optional ID
+        # Format: ## Heading Text {#heading-id}
+        match = re.match(r'^(#{2,5})\s+(.*?)(?:\s*\{#([\w-]+)\})?$', line)
+        if match:
+            level = len(match.group(1))
+            # max_depth=1 -> extract level 2 only (H2)
+            # max_depth=2 -> extract level 2-3 (H2-H3)
+            # max_depth=3 -> extract level 2-4 (H2-H4)
+            # max_depth=4 -> extract level 2-5 (H2-H5)
+            max_level = max_depth + 1
+            if level <= max_level:
+                heading_text = match.group(2).strip()
+                heading_id = match.group(3)
+                headings.append({
+                    'level': level,  # Keep original level: 2 for H2, 3 for H3, etc.
+                    'text': heading_text,
+                    'id': heading_id
+                })
+    
+    return headings
+
+
+def generate_toc_entry(file_name, title, lang, depth, content_dir):
+    """Generate TOC entry for a file"""
+    entries = []
+    
+    # Determine URL path
+    base_name = file_name.replace('.md', '')
+    if lang == 'zh':
+        url = f"/{base_name}"
+    else:
+        url = f"/{lang}/{base_name}"
+    
+    # Add main entry (level 1)
+    entries.append({
+        'level': 1,
+        'text': f"[{title}]({url})",
+        'raw_text': title
+    })
+    
+    # Special case: glossary.md only shows main title (no sub-headings)
+    if file_name == 'glossary.md':
+        effective_depth = 0  # Don't extract any sub-headings
+    else:
+        effective_depth = depth - 1  # Adjust depth: user depth 1 = no extraction, 2 = extract H2, etc.
+    
+    # If effective_depth >= 1, extract headings from file
+    if effective_depth >= 1:
+        file_path = content_dir / file_name
+        if file_path.exists():
+            with open(file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+            
+            headings = extract_headings(content, effective_depth)
+            for heading in headings:
+                # Create link with anchor
+                if heading['id']:
+                    anchor_url = f"{url}#{heading['id']}"
+                else:
+                    # Generate anchor from heading text (simplified)
+                    anchor = heading['text'].lower()
+                    anchor = re.sub(r'[^\w\s-]', '', anchor)
+                    anchor = re.sub(r'\s+', '-', anchor)
+                    anchor_url = f"{url}#{anchor}"
+                
+                # Adjust level: H2 becomes level 2, H3 becomes level 3, etc.
+                # This ensures proper indentation under the main entry
+                entries.append({
+                    'level': heading['level'],
+                    'text': f"[{heading['text']}]({anchor_url})",
+                    'raw_text': heading['text']
+                })
+    
+    return entries
+
+
+def format_toc_entries(entries):
+    """Format TOC entries with proper indentation"""
+    formatted = []
+    for entry in entries:
+        level = entry['level']
+        text = entry['text']
+        
+        if level == 0:
+            # Blank line separator
+            formatted.append('')
+        elif level == 1:
+            # Main entry (chapter/section level)
+            formatted.append(f"## {text}")
+        elif level == 2:
+            # H2 heading
+            formatted.append(f"- {text}")
+        elif level == 3:
+            # H3 heading
+            formatted.append(f"    - {text}")
+        elif level == 4:
+            # H4 heading
+            formatted.append(f"        - {text}")
+        elif level == 5:
+            # H5 heading
+            formatted.append(f"            - {text}")
+    
+    return '\n'.join(formatted)
+
+
+def check_file_status(file_path, lang):
+    """Check if a file exists and add status marker if needed"""
+    if not file_path.exists():
+        return " (未发布)" if lang == 'zh' else " (未發布)" if lang == 'tw' else " (WIP)"
+    
+    # Check if file has minimal content (you can adjust this logic)
+    with open(file_path, 'r', encoding='utf-8') as f:
+        content = f.read()
+        # Simple heuristic: if file has less than 500 characters of actual content, consider it WIP
+        # Remove front matter for content check
+        lines = content.split('\n')
+        if lines[0].strip() == '---':
+            for i, line in enumerate(lines[1:], 1):
+                if line.strip() == '---':
+                    content = '\n'.join(lines[i+1:])
+                    break
+        
+        content_length = len(content.strip())
+        if content_length < 500:
+            return " (未发布)" if lang == 'zh' else " (未發布)" if lang == 'tw' else " (WIP)"
+    
+    return ""
+
+
+def main():
+    # Parse arguments
+    if len(sys.argv) < 3:
+        print("Usage: python toc.py <lang> <depth> [output_file]")
+        print("Example: python toc.py zh 2")
+        sys.exit(1)
+    
+    lang = sys.argv[1]
+    if lang not in ['zh', 'en', 'tw']:
+        print(f"Error: Language must be one of: zh, en, tw")
+        sys.exit(1)
+    
+    try:
+        depth = int(sys.argv[2])
+        if depth not in [1, 2, 3, 4]:
+            raise ValueError
+    except ValueError:
+        print(f"Error: Depth must be 1, 2, 3, or 4")
+        sys.exit(1)
+    
+    # Determine output file
+    if len(sys.argv) > 3:
+        output_file = sys.argv[3]
+    else:
+        output_file = f"{lang}.md"
+    
+    # Get content directory
+    script_dir = Path(__file__).parent
+    project_root = script_dir.parent
+    content_dir = project_root / 'content' / lang
+    
+    if not content_dir.exists():
+        print(f"Error: Content directory {content_dir} does not exist")
+        sys.exit(1)
+    
+    # Define file order
+    file_order = [
+        'preface.md',
+        'ch1.md', 'ch2.md', 'ch3.md', 'ch4.md', 'ch5.md', 'ch6.md',
+        'ch7.md', 'ch8.md', 'ch9.md', 'ch10.md', 'ch11.md', 'ch12.md', 'ch13.md',
+        'glossary.md',
+        'colophon.md'
+    ]
+    
+    # Generate TOC
+    all_entries = []
+    
+    for file_name in file_order:
+        file_path = content_dir / file_name
+        if file_path.exists():
+            with open(file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+            
+            title = extract_front_matter_title(content)
+            if title:
+                entries = generate_toc_entry(file_name, title, lang, depth, content_dir)
+                
+                # Add status marker to main entry if needed
+                status = check_file_status(file_path, lang)
+                if status and entries:
+                    # Update the first entry (main title) with status
+                    entries[0]['text'] = entries[0]['text'].replace(')', f'){status}')
+                
+                all_entries.extend(entries)
+                if entries:  # Add blank line between chapters
+                    all_entries.append({'level': 0, 'text': ''})
+    
+    # Format and write output
+    formatted_toc = format_toc_entries(all_entries)
+    
+    # Clean up extra blank lines
+    formatted_toc = re.sub(r'\n{3,}', '\n\n', formatted_toc)
+    
+    # Write to file
+    output_path = Path(output_file)
+    with open(output_path, 'w', encoding='utf-8') as f:
+        f.write(formatted_toc)
+    
+    print(f"TOC generated successfully: {output_path}")
+    print(f"Language: {lang}, Depth: {depth}")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file