Files
forsetsystem/docs/_scripts/md_to_docx_henau.py
2026-04-27 12:00:47 +08:00

671 lines
23 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations
import re
from dataclasses import dataclass
from pathlib import Path
from docx import Document
from docx.enum.section import WD_SECTION_START
from docx.enum.style import WD_STYLE_TYPE
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_BREAK, WD_LINE_SPACING
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Cm, Pt
# Third ancestor of this file's resolved path; the input and output paths below hang off it.
ROOT = Path(__file__).resolve().parents[2]
# Markdown file read by read_source().
SOURCE = ROOT / 'docs' / '11_毕业论文正文初稿.md'
# .docx file written by build_document().
OUTPUT = ROOT / 'docs' / '11_毕业论文正文初稿_河农大格式.docx'
# Captions for Markdown tables; render_blocks() indexes this list by the
# position of each table within the block list it is rendering.
TABLE_TITLES = [
    '表 6-1 测试环境',
    '表 6-2 功能测试结果',
    '表 6-3 主要接口测试情况',
    '表 6-4 各模型在测试集上的回归性能',
    '表 6-5 主要分类模型结果',
]
# Placeholder date printed on the cover's completion-date line.
CHINESE_TITLE_PLACEHOLDER_DATE = '二〇二六年〇月〇日'
# Placeholder printed under the Chinese title on the cover.
ENGLISH_TITLE_PLACEHOLDER = 'ENGLISH TITLE PLACEHOLDER'
# Font family names written into the document's font attributes.
FONT_SONG = '宋体'
FONT_HEI = '黑体'
# NOTE(review): FONT_KAI is not referenced in the visible part of this file.
FONT_KAI = '楷体'
FONT_TNR = 'Times New Roman'
FONT_MONO = 'Courier New'
@dataclass
class Block:
    """One parsed unit of the Markdown source, as produced by parse_markdown_blocks()."""

    # Block type tag; the parser emits 'paragraph', 'code', 'heading',
    # 'table' and 'list_item'.
    kind: str
    # Text payload; the parser sets it for paragraph, heading and list_item blocks.
    text: str | None = None
    # Number of leading '#' characters; the parser sets it for heading blocks only.
    level: int | None = None
    # Raw source lines; the parser sets it for code and table blocks.
    lines: list[str] | None = None
def set_run_fonts(run, east_asia: str, ascii_font: str, size: float | int | None = None, bold: bool | None = None):
    """Apply font faces, and optionally size and weight, to a single run.

    Args:
        run: Run whose character formatting is modified in place.
        east_asia: Face written to the ``w:eastAsia`` font attribute.
        ascii_font: Face written to ``w:ascii``, ``w:hAnsi`` and ``w:cs``.
        size: Point size; left untouched when ``None``.
        bold: Bold flag; left untouched when ``None``.
    """
    font = run.font
    if size is not None:
        font.size = Pt(size)
    if bold is not None:
        run.bold = bold
    font.name = ascii_font

    run_props = run._element.get_or_add_rPr()
    fonts_el = run_props.rFonts
    if fonts_el is None:
        fonts_el = OxmlElement('w:rFonts')
        run_props.insert(0, fonts_el)

    # The East Asian slot is written explicitly, right alongside the other three.
    face_by_slot = (
        ('w:ascii', ascii_font),
        ('w:hAnsi', ascii_font),
        ('w:eastAsia', east_asia),
        ('w:cs', ascii_font),
    )
    for slot, face in face_by_slot:
        fonts_el.set(qn(slot), face)
def set_style_fonts(style, east_asia: str, ascii_font: str, size: float | int, bold: bool = False):
    """Set size, weight and font faces on a document style.

    Args:
        style: Style object modified in place.
        east_asia: Face written to the ``w:eastAsia`` font attribute.
        ascii_font: Face written to ``w:ascii``, ``w:hAnsi`` and ``w:cs``.
        size: Point size.
        bold: Bold flag.
    """
    font = style.font
    font.size = Pt(size)
    font.bold = bold
    font.name = ascii_font

    # Dereferenced without a None check: assumes rPr/rFonts exist once the
    # font name has been assigned above.
    fonts_el = style._element.rPr.rFonts
    for slot, face in (
        ('w:ascii', ascii_font),
        ('w:hAnsi', ascii_font),
        ('w:eastAsia', east_asia),
        ('w:cs', ascii_font),
    ):
        fonts_el.set(qn(slot), face)
def configure_document(doc: Document):
    """Apply page layout and define the styles used by the rest of the script.

    Configures, in order: the page layout of every existing section, the
    ``Normal`` body style, ``Heading 1`` to ``Heading 4``, any of
    ``TOC 1`` to ``TOC 3`` that already exist, and the ``CodeBlock``
    paragraph style (created when missing). Finally it flags the document
    so fields are updated on open.
    """
    for section in doc.sections:
        apply_page_layout(section)

    styles = doc.styles

    body = styles['Normal']
    set_style_fonts(body, FONT_SONG, FONT_TNR, 10.5, False)
    body_fmt = body.paragraph_format
    body_fmt.line_spacing_rule = WD_LINE_SPACING.EXACTLY
    body_fmt.line_spacing = Pt(20)
    body_fmt.first_line_indent = Pt(21)
    body_fmt.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY

    # (style name, font size, space before, space after), all in points.
    heading_specs = (
        ('Heading 1', 14, 12, 6),
        ('Heading 2', 12, 10, 4),
        ('Heading 3', 10.5, 8, 3),
        ('Heading 4', 10.5, 6, 3),
    )
    for style_name, font_pt, before_pt, after_pt in heading_specs:
        heading = styles[style_name]
        set_style_fonts(heading, FONT_HEI, FONT_TNR, font_pt, True)
        heading_fmt = heading.paragraph_format
        heading_fmt.alignment = WD_ALIGN_PARAGRAPH.LEFT
        heading_fmt.first_line_indent = Pt(0)
        heading_fmt.space_before = Pt(before_pt)
        heading_fmt.space_after = Pt(after_pt)
        heading_fmt.line_spacing_rule = WD_LINE_SPACING.EXACTLY
        heading_fmt.line_spacing = Pt(20)

    # TOC styles are only adjusted when the document already defines them.
    for toc_name in ('TOC 1', 'TOC 2', 'TOC 3'):
        if toc_name not in styles:
            continue
        toc_style = styles[toc_name]
        set_style_fonts(toc_style, FONT_SONG, FONT_TNR, 12, False)
        toc_fmt = toc_style.paragraph_format
        toc_fmt.line_spacing_rule = WD_LINE_SPACING.MULTIPLE
        toc_fmt.line_spacing = 1.5

    if 'CodeBlock' in styles:
        code_style = styles['CodeBlock']
    else:
        code_style = styles.add_style('CodeBlock', WD_STYLE_TYPE.PARAGRAPH)
    set_style_fonts(code_style, FONT_MONO, FONT_MONO, 10.5, False)
    code_fmt = code_style.paragraph_format
    code_fmt.first_line_indent = Pt(0)
    code_fmt.left_indent = Pt(12)
    code_fmt.line_spacing_rule = WD_LINE_SPACING.EXACTLY
    code_fmt.line_spacing = Pt(20)

    enable_update_fields_on_open(doc)
def apply_page_layout(section):
    """Set the page size (21 x 29.7 cm) and margins (2.5 cm top/bottom, 3 cm left/right) on *section*."""
    layout_cm = (
        ('page_width', 21),
        ('page_height', 29.7),
        ('top_margin', 2.5),
        ('bottom_margin', 2.5),
        ('left_margin', 3),
        ('right_margin', 3),
    )
    for attribute, centimetres in layout_cm:
        setattr(section, attribute, Cm(centimetres))
def enable_update_fields_on_open(doc: Document):
    """Ensure the document settings contain ``w:updateFields`` with ``w:val="true"``.

    An existing element is reused; otherwise one is appended to the
    settings element.
    """
    settings_el = doc.settings.element
    flag = settings_el.find(qn('w:updateFields'))
    if flag is None:
        flag = OxmlElement('w:updateFields')
        settings_el.append(flag)
    flag.set(qn('w:val'), 'true')
def add_field(paragraph, instruction: str, placeholder: str | None = None):
    """Append a complex field to *paragraph*.

    The field is emitted as the sequence: ``begin`` marker, instruction
    text, ``separate`` marker, optional placeholder result, ``end`` marker.

    Args:
        paragraph: Paragraph that receives the field.
        instruction: Field code, e.g. ``' PAGE '``.
        placeholder: Optional text inserted as the field's current result,
            between the ``separate`` and ``end`` markers.
    """

    def append_in_run(element):
        # w:fldChar and w:instrText are run-level content: they must be
        # children of a w:r, not of the paragraph itself. Wrapping them
        # also makes the field visible through ``paragraph.runs`` so that
        # callers can apply run formatting to it.
        carrier = paragraph.add_run()
        carrier._element.append(element)

    def field_char(char_type):
        marker = OxmlElement('w:fldChar')
        marker.set(qn('w:fldCharType'), char_type)
        return marker

    instr = OxmlElement('w:instrText')
    # Keep the leading and trailing spaces of the field code.
    instr.set(qn('xml:space'), 'preserve')
    instr.text = instruction

    append_in_run(field_char('begin'))
    append_in_run(instr)
    append_in_run(field_char('separate'))
    if placeholder:
        run = paragraph.add_run(placeholder)
        set_run_fonts(run, FONT_SONG, FONT_TNR, 12, False)
    append_in_run(field_char('end'))
def configure_footer_page_number(section, fmt: str, start: int):
    """Give *section* its own centred footer containing a PAGE field.

    Args:
        section: Section whose numbering and footer are configured.
        fmt: Value written to the ``w:fmt`` attribute of ``w:pgNumType``.
        start: First page number, written to ``w:start``.
    """
    sect_pr = section._sectPr
    numbering = sect_pr.find(qn('w:pgNumType'))
    if numbering is None:
        numbering = OxmlElement('w:pgNumType')
        sect_pr.append(numbering)
    for attribute, value in (('w:fmt', fmt), ('w:start', str(start))):
        numbering.set(qn(attribute), value)

    # Unlink the footer from the previous section before editing it.
    section.footer.is_linked_to_previous = False
    footer_paragraph = section.footer.paragraphs[0]
    footer_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
    add_field(footer_paragraph, ' PAGE ')
    for footer_run in footer_paragraph.runs:
        set_run_fonts(footer_run, FONT_TNR, FONT_TNR, 10.5, False)
def add_cover(doc: Document, title: str):
    """Append the cover content to *doc*.

    Layout, top to bottom: four empty paragraphs, the title, the English
    title placeholder, four more empty paragraphs, then the fill-in lines
    and the completion-date line. Every text line is centred.
    """

    def add_blank_paragraphs(count):
        for _ in range(count):
            doc.add_paragraph()

    def add_centered_line(text, east_asia, size, bold, gap_after):
        paragraph = doc.add_paragraph()
        paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
        paragraph.paragraph_format.space_after = Pt(gap_after)
        set_run_fonts(paragraph.add_run(text), east_asia, FONT_TNR, size, bold)

    add_blank_paragraphs(4)
    add_centered_line(title, FONT_HEI, 18, True, 18)
    add_centered_line(ENGLISH_TITLE_PLACEHOLDER, FONT_TNR, 18, True, 30)
    add_blank_paragraphs(4)

    info_lines = (
        '作 者________________________',
        '学 院________________________',
        '专 业________________________',
        '班 级________________________',
        '学 号________________________',
        '指导教师________________________',
        f'完成日期:{CHINESE_TITLE_PLACEHOLDER_DATE}',
    )
    for info in info_lines:
        add_centered_line(info, FONT_SONG, 15, False, 8)
def add_toc_page(doc: Document):
    """Append a centred contents heading, a TOC field with placeholder text, and a page break."""
    heading = doc.add_paragraph()
    heading.alignment = WD_ALIGN_PARAGRAPH.CENTER
    heading_fmt = heading.paragraph_format
    heading_fmt.space_before = Pt(16)
    heading_fmt.space_after = Pt(12)
    heading_fmt.line_spacing_rule = WD_LINE_SPACING.MULTIPLE
    heading_fmt.line_spacing = 1.5
    set_run_fonts(heading.add_run('目 录'), FONT_HEI, FONT_TNR, 16, True)

    # The placeholder is inserted as the field's result text.
    add_field(doc.add_paragraph(), r' TOC \o "1-3" \h \z \u ', '右键更新目录')
    doc.add_page_break()
def add_heading_paragraph(doc: Document, text: str, style_name: str, align=WD_ALIGN_PARAGRAPH.LEFT, size: float | int | None = None):
    """Append a bold heading paragraph in the given style and return it.

    Args:
        doc: Target document.
        text: Heading text.
        style_name: Paragraph style applied to the new paragraph.
        align: Paragraph alignment.
        size: Point size of the run; 10.5 is used when this is falsy.

    Returns:
        The newly created paragraph.
    """
    heading = doc.add_paragraph(style=style_name)
    heading.alignment = align
    if size is not None:
        for existing_run in heading.runs:
            existing_run.clear()
    # The same Hei / Times New Roman pair is used for every style name.
    set_run_fonts(heading.add_run(text), FONT_HEI, FONT_TNR, size or 10.5, True)
    return heading
def render_inline(paragraph, text: str, east_asia: str, ascii_font: str, size: float | int, default_bold: bool = False):
    """Append *text* to *paragraph* as runs, honouring ``**bold**`` and `` `code` `` markers.

    Args:
        paragraph: Paragraph that receives the runs.
        text: Source text with optional inline markers.
        east_asia: East Asian face for plain and bold pieces.
        ascii_font: Latin face for plain and bold pieces.
        size: Point size for every run.
        default_bold: Bold flag for pieces carrying no marker.
    """
    for piece in re.split(r'(\*\*.*?\*\*|`[^`]+`)', text):
        if not piece:
            continue
        # Pieces are classified by their delimiters; the delimiters are
        # stripped from bold and code pieces before they are written.
        if piece.startswith('**') and piece.endswith('**'):
            content, faces, bold = piece[2:-2], (east_asia, ascii_font), True
        elif piece.startswith('`') and piece.endswith('`'):
            content, faces, bold = piece[1:-1], (FONT_MONO, FONT_MONO), False
        else:
            content, faces, bold = piece, (east_asia, ascii_font), default_bold
        set_run_fonts(paragraph.add_run(content), faces[0], faces[1], size, bold)
def add_body_paragraph(doc: Document, text: str, indent: bool = True, hanging: bool = False):
    """Append a justified body paragraph with exact 20 pt line spacing and return it.

    Args:
        doc: Target document.
        text: Paragraph text; inline markers are handled by render_inline().
        indent: Whether to indent the first line by 21 pt (ignored when
            *hanging* is truthy).
        hanging: When truthy, use a 21 pt left indent with a -21 pt first
            line instead.

    Returns:
        The newly created paragraph.
    """
    paragraph = doc.add_paragraph()
    paragraph.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    fmt = paragraph.paragraph_format
    fmt.line_spacing_rule = WD_LINE_SPACING.EXACTLY
    fmt.line_spacing = Pt(20)

    if hanging:
        fmt.left_indent = Pt(21)
        first_line_pt = -21
    elif indent:
        first_line_pt = 21
    else:
        first_line_pt = 0
    fmt.first_line_indent = Pt(first_line_pt)

    render_inline(paragraph, text, FONT_SONG, FONT_TNR, 10.5, False)
    return paragraph
def add_code_paragraph(doc: Document, text: str):
    """Append one left-aligned line of code in the ``CodeBlock`` style and return the paragraph."""
    code_paragraph = doc.add_paragraph(style='CodeBlock')
    code_paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT
    code_paragraph.paragraph_format.first_line_indent = Pt(0)
    set_run_fonts(code_paragraph.add_run(text), FONT_MONO, FONT_MONO, 10.5, False)
    return code_paragraph
def clean_title_text(text: str) -> str:
    """Rewrite a ``第N章 <name>`` chapter title as ``N <name>``.

    Text that does not have that shape (ASCII-style digits, then
    whitespace, then a name) is returned unchanged.
    """
    chapter = re.match(r'^第(\d+)章\s+(.+)$', text)
    if chapter is None:
        return text
    number, name = chapter.groups()
    return f'{number} {name}'
def split_cells(line: str) -> list[str]:
    """Split one Markdown table row into its stripped cell texts.

    Surrounding whitespace and any leading or trailing ``|`` characters
    are removed before the row is split on ``|``.
    """
    inner = line.strip().strip('|')
    return list(map(str.strip, inner.split('|')))
def ensure_cell_has_paragraph(cell):
    """Return a paragraph of *cell* that is ready to receive content.

    The cell's first paragraph is cleared and returned when one exists;
    otherwise a new paragraph is added and returned.
    """
    existing = cell.paragraphs
    if not existing:
        return cell.add_paragraph()
    first = existing[0]
    first.clear()
    return first
def set_cell_border(cell, **kwargs):
    """Set border attributes on individual edges of a table cell.

    Args:
        cell: Cell whose ``w:tcBorders`` element is created or updated.
        **kwargs: Keyed by edge name (``top``, ``left``, ``bottom``,
            ``right``, ``insideH``, ``insideV``). Each value is a dict of
            attribute name to value, written as ``w:<name>="<value>"``.
            Missing or empty entries leave that edge untouched.
    """
    cell_props = cell._tc.get_or_add_tcPr()
    borders = cell_props.first_child_found_in('w:tcBorders')
    if borders is None:
        borders = OxmlElement('w:tcBorders')
        cell_props.append(borders)

    for edge in ('top', 'left', 'bottom', 'right', 'insideH', 'insideV'):
        attributes = kwargs.get(edge)
        if not attributes:
            continue
        tag = 'w:' + edge
        edge_el = borders.find(qn(tag))
        if edge_el is None:
            edge_el = OxmlElement(tag)
            borders.append(edge_el)
        for name, value in attributes.items():
            edge_el.set(qn(f'w:{name}'), str(value))
def remove_all_table_borders(table):
    """Set every table-level border edge of *table* to ``nil``.

    The ``w:tblBorders`` element and each of its six edge children are
    created when missing.
    """
    table_props = table._tbl.tblPr
    borders = table_props.first_child_found_in('w:tblBorders')
    if borders is None:
        borders = OxmlElement('w:tblBorders')
        table_props.append(borders)

    for edge in ('top', 'left', 'bottom', 'right', 'insideH', 'insideV'):
        tag = f'w:{edge}'
        edge_el = borders.find(qn(tag))
        if edge_el is None:
            edge_el = OxmlElement(tag)
            borders.append(edge_el)
        edge_el.set(qn('w:val'), 'nil')
def apply_three_line_table(table):
    """Format *table* with only three horizontal rules.

    All table-level and cell-level borders are cleared first. The first
    row then gets a top rule (``sz`` 12) and a bottom rule (``sz`` 4), and
    the last row gets a bottom rule (``sz`` 12). For a single-row table
    the last-row rule overwrites the first row's bottom rule.
    """

    def solid(weight):
        return {'val': 'single', 'sz': weight, 'space': 0, 'color': '000000'}

    def blank():
        return {'val': 'nil'}

    remove_all_table_borders(table)
    last_row_index = len(table.rows) - 1
    column_total = len(table.columns)

    for row in table.rows:
        for cell in row.cells:
            set_cell_border(cell, top=blank(), bottom=blank(), left=blank(), right=blank())

    for column in range(column_total):
        header_cell = table.rows[0].cells[column]
        set_cell_border(header_cell, top=solid(12), bottom=solid(4), left=blank(), right=blank())
        closing_cell = table.rows[last_row_index].cells[column]
        set_cell_border(closing_cell, bottom=solid(12), left=blank(), right=blank())
def add_table_title(doc: Document, title: str):
    """Append a centred, bold 9 pt caption paragraph containing *title*."""
    caption = doc.add_paragraph()
    caption.alignment = WD_ALIGN_PARAGRAPH.CENTER
    caption_fmt = caption.paragraph_format
    caption_fmt.first_line_indent = Pt(0)
    caption_fmt.space_before = Pt(6)
    caption_fmt.space_after = Pt(2)
    set_run_fonts(caption.add_run(title), FONT_HEI, FONT_TNR, 9, True)
def add_markdown_table(doc: Document, lines: list[str], title: str):
    """Render a Markdown pipe table as a captioned three-line table.

    Args:
        doc: Target document.
        lines: Table source lines. ``lines[0]`` is the header row,
            ``lines[1]`` is assumed to be the delimiter row and is
            skipped, and the remaining lines are body rows.
        title: Caption placed above the table.

    Nothing is added when *lines* is empty. An empty paragraph is appended
    after the table.
    """
    if not lines:
        # Nothing to render; avoid leaving a caption without a table.
        return

    add_table_title(doc, title)
    header = split_cells(lines[0])
    rows = [split_cells(line) for line in lines[2:]]

    # Size the grid to the widest row so a body row with more cells than
    # the header cannot index past the last column.
    column_total = max([len(header)] + [len(row) for row in rows])
    table = doc.add_table(rows=len(rows) + 1, cols=column_total)
    table.alignment = WD_TABLE_ALIGNMENT.CENTER
    table.autofit = True

    header_cells = table.rows[0].cells
    for col_idx, text in enumerate(header):
        _write_table_cell(header_cells[col_idx], text, bold=True)

    for row_idx, row in enumerate(rows, start=1):
        row_cells = table.rows[row_idx].cells
        for col_idx, text in enumerate(row):
            _write_table_cell(row_cells[col_idx], text, bold=False)

    apply_three_line_table(table)
    doc.add_paragraph()


def _write_table_cell(cell, text: str, bold: bool):
    """Write *text* into *cell* as a centred 9 pt run with exact 14 pt line spacing."""
    paragraph = ensure_cell_has_paragraph(cell)
    paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
    paragraph.paragraph_format.first_line_indent = Pt(0)
    paragraph.paragraph_format.line_spacing_rule = WD_LINE_SPACING.EXACTLY
    paragraph.paragraph_format.line_spacing = Pt(14)
    run = paragraph.add_run(text)
    set_run_fonts(run, FONT_SONG, FONT_TNR, 9, bold)
def parse_markdown_blocks(lines: list[str]) -> list[Block]:
    """Group Markdown source lines into a flat list of blocks.

    Recognised constructs, checked in this order for each line:

    * a line starting with three backticks opens or closes a code block
      (``kind='code'``, raw lines in ``lines``);
    * a blank line ends the current paragraph;
    * ``##`` to ``####`` headings (``kind='heading'``, with ``level`` and
      ``text``);
    * consecutive lines starting with ``|`` (``kind='table'``, stripped
      lines in ``lines``);
    * ``1. `` / ``- `` / ``* `` items (``kind='list_item'``, ``text``
      keeps the marker);
    * anything else is paragraph text. Consecutive paragraph lines are
      joined without a separator.

    A code fence still open at the end of input is emitted as a code block
    instead of being discarded.

    Args:
        lines: Source lines.

    Returns:
        The blocks in source order.
    """
    heading_re = re.compile(r'^(#{2,4})\s+(.+)$')
    list_re = re.compile(r'^(\d+\.\s+|[-*]\s+)(.+)$')

    blocks: list[Block] = []
    paragraph_lines: list[str] = []
    code_lines: list[str] = []
    in_code = False

    def flush_paragraph():
        if paragraph_lines:
            blocks.append(Block(kind='paragraph', text=''.join(paragraph_lines).strip()))
            paragraph_lines.clear()

    i = 0
    total = len(lines)
    while i < total:
        line = lines[i]
        stripped = line.strip()

        if stripped.startswith('```'):
            flush_paragraph()
            if in_code:
                blocks.append(Block(kind='code', lines=code_lines.copy()))
                code_lines.clear()
            in_code = not in_code
            i += 1
            continue

        if in_code:
            # Keep leading whitespace so code indentation survives.
            code_lines.append(line.rstrip('\n'))
            i += 1
            continue

        if not stripped:
            flush_paragraph()
            i += 1
            continue

        heading_match = heading_re.match(stripped)
        if heading_match:
            flush_paragraph()
            blocks.append(
                Block(
                    kind='heading',
                    level=len(heading_match.group(1)),
                    text=heading_match.group(2).strip(),
                )
            )
            i += 1
            continue

        if stripped.startswith('|'):
            flush_paragraph()
            table_lines: list[str] = []
            # Consume the whole run of pipe-prefixed lines as one table.
            while i < total and lines[i].strip().startswith('|'):
                table_lines.append(lines[i].strip())
                i += 1
            blocks.append(Block(kind='table', lines=table_lines))
            continue

        if list_re.match(stripped):
            flush_paragraph()
            blocks.append(Block(kind='list_item', text=stripped))
            i += 1
            continue

        paragraph_lines.append(stripped)
        i += 1

    flush_paragraph()
    if in_code and code_lines:
        # Unterminated fence: keep what was collected rather than dropping it.
        blocks.append(Block(kind='code', lines=code_lines.copy()))
    return blocks
def split_document_blocks(blocks: list[Block]) -> tuple[list[Block], list[Block]]:
    """Split *blocks* at the first chapter heading.

    A chapter heading is a heading block whose text starts with
    ``第<digits>章`` followed by whitespace.

    Returns:
        ``(abstract_blocks, body_blocks)``: everything before the first
        chapter heading, and everything from that heading onwards. When no
        chapter heading exists, all blocks land in the first list.
    """
    items = list(blocks)

    def opens_body(block):
        return bool(
            block.kind == 'heading'
            and block.text
            and re.match(r'^第\d+章\s+', block.text)
        )

    boundary = next(
        (position for position, block in enumerate(items) if opens_body(block)),
        len(items),
    )
    return items[:boundary], items[boundary:]
def read_source() -> tuple[str, list[Block]]:
    """Read SOURCE and return the thesis title with the parsed content blocks.

    The first line must be a level-1 heading (``# <title>``). Everything
    from the line ``## 后续补强建议`` onwards is excluded from the output.

    Returns:
        ``(title, blocks)``, where *blocks* comes from
        parse_markdown_blocks() applied to the lines after the title.

    Raises:
        ValueError: If the file is empty or does not start with ``# ``.
    """
    # 'utf-8-sig' strips a leading byte-order mark when present; without
    # this a BOM would sit in front of the '# ' title marker and trip the
    # check below.
    text = SOURCE.read_text(encoding='utf-8-sig')
    lines = text.splitlines()
    if not lines or not lines[0].startswith('# '):
        raise ValueError('源 Markdown 缺少一级标题作为论文题目。')
    title = lines[0][2:].strip()

    content_lines = lines[1:]
    for idx, line in enumerate(content_lines):
        if line.strip() == '## 后续补强建议':
            # Drop this section and everything after it.
            content_lines = content_lines[:idx]
            break

    return title, parse_markdown_blocks(content_lines)
def add_section_heading(doc: Document, text: str, level: int):
    """Append a left-aligned body heading for a Markdown heading level.

    Level 2 maps to ``Heading 1`` at 14 pt, with the chapter title
    normalised by clean_title_text(). Level 3 maps to ``Heading 2`` at
    12 pt. Any other level maps to ``Heading 3`` at 10.5 pt.
    """
    if level == 2:
        style_name, font_pt, label = 'Heading 1', 14, clean_title_text(text)
    elif level == 3:
        style_name, font_pt, label = 'Heading 2', 12, text
    else:
        style_name, font_pt, label = 'Heading 3', 10.5, text

    heading = doc.add_paragraph(style=style_name)
    heading.alignment = WD_ALIGN_PARAGRAPH.LEFT
    heading.paragraph_format.first_line_indent = Pt(0)
    heading.clear()
    set_run_fonts(heading.add_run(label), FONT_HEI, FONT_TNR, font_pt, True)
def render_blocks(doc: Document, blocks: list[Block], is_abstract: bool = False):
    """Append *blocks* to *doc*, one block after another.

    Args:
        doc: Target document.
        blocks: Parsed blocks to render, in order.
        is_abstract: When true, a heading whose text is ``摘要`` is drawn
            as a centred 16 pt title.

    Headings named ``参考文献`` or ``致谢`` start on a new page and are
    centred. Inside ``参考文献``, paragraphs beginning with ``[n]`` get a
    hanging indent. Tables take their caption from TABLE_TITLES by
    position within this call; beyond that list the caption is the
    table's ordinal number. Blocks of any other kind are ignored.
    """
    tables_seen = 0
    section_name = '摘要' if is_abstract else ''

    for block in blocks:
        kind = block.kind

        if kind == 'heading':
            if is_abstract and block.text == '摘要':
                section_name = '摘要'
                heading = doc.add_paragraph(style='Heading 1')
                heading.alignment = WD_ALIGN_PARAGRAPH.CENTER
                heading_fmt = heading.paragraph_format
                heading_fmt.first_line_indent = Pt(0)
                heading_fmt.space_before = Pt(16)
                heading_fmt.space_after = Pt(10)
                heading.clear()
                set_run_fonts(heading.add_run('摘 要'), FONT_HEI, FONT_TNR, 16, True)
                continue

            label = block.text or ''
            section_name = label
            if label not in ('参考文献', '致谢'):
                add_section_heading(doc, label, block.level or 3)
                continue

            # Back-matter headings: new page, centred, 1.5 line spacing.
            doc.add_page_break()
            heading = doc.add_paragraph(style='Heading 1')
            heading.alignment = WD_ALIGN_PARAGRAPH.CENTER
            heading_fmt = heading.paragraph_format
            heading_fmt.first_line_indent = Pt(0)
            heading_fmt.line_spacing_rule = WD_LINE_SPACING.MULTIPLE
            heading_fmt.line_spacing = 1.5
            heading.clear()
            set_run_fonts(heading.add_run(label), FONT_HEI, FONT_TNR, 14, True)

        elif kind == 'paragraph':
            body = block.text or ''
            keyword_line = body.startswith(('**关键词', '关键词:'))
            reference_entry = section_name == '参考文献' and re.match(r'^\[\d+\]', body)
            add_body_paragraph(
                doc,
                body,
                indent=not (keyword_line or reference_entry),
                hanging=reference_entry,
            )

        elif kind == 'list_item':
            add_body_paragraph(doc, block.text or '', indent=False)

        elif kind == 'code':
            # An empty code block still produces one empty code paragraph.
            for code_line in block.lines or ['']:
                add_code_paragraph(doc, code_line)

        elif kind == 'table':
            if tables_seen < len(TABLE_TITLES):
                caption = TABLE_TITLES[tables_seen]
            else:
                caption = f'{tables_seen + 1}'
            add_markdown_table(doc, block.lines or [], caption)
            tables_seen += 1
def build_document() -> None:
    """Convert the Markdown source into a formatted .docx and save it to OUTPUT.

    Content is appended in this order: cover, then a new-page section
    holding the table of contents and the abstract blocks, then a
    new-page section holding the body blocks. The order of the calls
    below decides which section each part belongs to.
    """
    title, blocks = read_source()
    abstract_blocks, body_blocks = split_document_blocks(blocks)
    doc = Document()
    configure_document(doc)
    # The cover is added before any section is created here; no footer
    # page number is configured for it.
    add_cover(doc, title)
    # Front matter: page numbers in 'upperRoman' format, starting at 1.
    preface_section = doc.add_section(WD_SECTION_START.NEW_PAGE)
    apply_page_layout(preface_section)
    configure_footer_page_number(preface_section, 'upperRoman', 1)
    add_toc_page(doc)
    render_blocks(doc, abstract_blocks, is_abstract=True)
    # Body: page numbers in 'decimal' format, restarting at 1.
    body_section = doc.add_section(WD_SECTION_START.NEW_PAGE)
    apply_page_layout(body_section)
    configure_footer_page_number(body_section, 'decimal', 1)
    render_blocks(doc, body_blocks, is_abstract=False)
    doc.save(OUTPUT)
def main() -> None:
    """Build the document, then print the path it was written to."""
    build_document()
    print(f'Generated: {OUTPUT}')
# Run the conversion only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()