#!/usr/bin/env python3
"""
预处理 Markdown 文件,将 Hugo shortcode 转换为 Pandoc 可识别的格式
处理两种 shortcode:
1. {{< figure src="/fig/xxx.png" caption="xxx" >}} → ![xxx](static/fig/xxx.png)
2. {{< figure ... >}} (无 src) → 移除(通常用于代码示例)
"""
import os
import re
import sys
from pathlib import Path
# Hugo ``{{< figure ... >}}`` shortcode; group 1 is the raw attribute text.
FIGURE_SHORTCODE_RE = re.compile(r"\{\{<\s*figure\b(.*?)>\}\}", re.DOTALL)
# Paired ``{{< callout ... >}}body{{< /callout >}}``;
# group 1 is the attribute text, group 2 the enclosed body.
CALLOUT_SHORTCODE_RE = re.compile(r"\{\{<\s*callout\b(.*?)>\}\}(.*?)\{\{<\s*/callout\s*>\}\}", re.DOTALL)
# A single ``key="value"`` pair; group 1 is the key, group 2 the value.
ATTR_RE = re.compile(r'([\w-]+)="([^"]*)"')
# Markdown image whose URL starts with "/" but is not already under
# ``/static/``; group 1 is the alt text, group 2 the path without the slash.
ABS_IMAGE_RE = re.compile(r'!\[([^\]]*)\]\(/(?!static/)([^)]+)\)')
# ``---`` delimited front matter at the very start of the document;
# group 1 is the text between the two fences.
FRONT_MATTER_RE = re.compile(r"\A---\s*\n(.*?)\n---\s*\n?", re.DOTALL)
# ``title:`` line; the value lands in group 1 (double-quoted),
# group 2 (single-quoted) or group 3 (bare).
TITLE_RE = re.compile(r'^title:\s*(?:"([^"]*)"|\'([^\']*)\'|(.+?))\s*$', re.MULTILINE)
# A level 2-6 heading whose whole text is one Markdown link;
# group 1 is the ``#`` run, group 2 the link.
LINK_HEADING_RE = re.compile(r"^(#{2,6})\s+(\[[^\]]+\]\([^)]+\))\s*$", re.MULTILINE)
# Explicit header identifier of the form ``{#some-id}``; group 1 is the id.
HEADER_ID_RE = re.compile(r"\{#([A-Za-z0-9_:-]+)\}")
# ``id="..."`` attribute inside a raw HTML opening tag. Groups: 1 = everything
# from ``<`` through ``id="``, 2 = the id value, 3 = the closing quote.
# The pattern is anchored to ``<tag`` so that ``id="..."`` appearing in prose
# or code samples is not rewritten.
# NOTE(review): the previous pattern started with a stray ``]*`` and had no
# tag anchor; confirm any-element matching is the intended scope.
RAW_ID_RE = re.compile(r'(<[A-Za-z][^>]*\bid=")([^"]+)(")', re.IGNORECASE)
# Site-absolute ``href`` inside a raw ``<a ...>`` tag. Groups: 1 = everything
# from ``<a`` through ``href="``, 2 = the path (starts with "/"),
# 3 = optional ``#fragment``, 4 = the closing quote.
# NOTE(review): anchored to ``<a`` for the same reason as RAW_ID_RE; confirm
# no other element's href needs rewriting.
RAW_HREF_RE = re.compile(r'(<a\b[^>]*\bhref=")(/[^"#?)]*)(#[^"]*)?(")', re.IGNORECASE)
MD_HREF_RE = re.compile(r"(? {line}" if line else ">" for line in body.splitlines())
return f"> **注意**\n>\n{quoted}"
text = CALLOUT_SHORTCODE_RE.sub(replace_callout_shortcode, text)
text = FIGURE_SHORTCODE_RE.sub(replace_figure_shortcode, text)
# Rewrite absolute-path Markdown images such as ![alt](/map/ch01.png) so they point at static/map/ch01.png
text = ABS_IMAGE_RE.sub(r'', text)
# 网站目录页里用二级标题承载跳转链接;EPUB 目录应指向真实章节页。
text = LINK_HEADING_RE.sub(r"**\2**", text)
text = HEADER_ID_RE.sub(lambda m: "{#" + _page_anchor(slug, m.group(1)) + "}", text)
text = RAW_ID_RE.sub(lambda m: f"{m.group(1)}{_page_anchor(slug, m.group(2))}{m.group(3)}", text)
text = _rewrite_links(text, slug, known_pages)
text = _rewrite_footnotes(text, slug)
title = meta.get("title")
if title:
text = f"# {title} {{#{slug}}}\n\n{text.lstrip()}"
return text
def process_file(input_path, output_path, known_pages=None):
    """Convert one Markdown file and write the result to ``output_path``.

    Args:
        input_path: Path of the Markdown file to read (decoded as UTF-8).
        output_path: Path of the file to write (encoded as UTF-8). May be
            the same as ``input_path``; a missing parent directory is
            created.
        known_pages: Passed through unchanged to ``convert_markdown``.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # The page slug is derived from the input path and handed to the converter.
    converted_content = convert_markdown(content, _slug_for_path(input_path), known_pages)

    # os.path.dirname() is "" for a bare filename (e.g. in-place conversion of
    # "a.md" from the current directory) and os.makedirs("") raises
    # FileNotFoundError, so only create a directory when there is one to create.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(converted_content)

    print(f"Processed: {input_path} -> {output_path}")
def main():
    """Run the preprocessor from the command line.

    The mode is chosen by what ``sys.argv[1]`` refers to:

    * a file: convert it and write to ``sys.argv[2]``, or overwrite the
      input in place when no output file is given;
    * a directory: convert every ``*.md`` file directly inside it
      (non-recursive) into the directory ``sys.argv[2]``, which is required.

    Prints a usage or error message and exits with status 1 when the
    arguments are missing or the input path is neither a file nor a directory.
    """
    usage = (
        "Usage: preprocess.py <input_file> [output_file]\n"
        "   or: preprocess.py <input_dir> <output_dir>"
    )
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)

    input_path = sys.argv[1]

    if os.path.isfile(input_path):
        # Single-file mode: default to rewriting the input in place.
        output_path = sys.argv[2] if len(sys.argv) > 2 else input_path
        process_file(input_path, output_path)
    elif os.path.isdir(input_path):
        # Directory mode has no sensible default destination, so a missing
        # output directory is a usage error rather than an IndexError.
        if len(sys.argv) < 3:
            print(usage)
            sys.exit(1)
        output_dir = sys.argv[2]

        md_files = sorted(Path(input_path).glob('*.md'))
        # Gather every page slug first so each conversion receives the
        # complete set of pages in the directory.
        known_pages = {_slug_for_path(path) for path in md_files}

        for md_file in md_files:
            output_file = os.path.join(output_dir, md_file.name)
            process_file(str(md_file), output_file, known_pages)

        print(f"\nTotal processed: {len(md_files)} files")
    else:
        print(f"Error: {input_path} is not a valid file or directory")
        sys.exit(1)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()