"""Convert the thesis Markdown draft into a formatted Word (.docx) document.

The script reads ``SOURCE``, parses a small Markdown subset (headings,
paragraphs, list items, fenced code, pipe tables), and writes ``OUTPUT``
with a cover page, a table-of-contents page, the abstract, and the body.
"""

from __future__ import annotations

import re
from dataclasses import dataclass
from pathlib import Path

from docx import Document
from docx.enum.section import WD_SECTION_START
from docx.enum.style import WD_STYLE_TYPE
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_BREAK, WD_LINE_SPACING
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Cm, Pt

ROOT = Path(__file__).resolve().parents[2]
SOURCE = ROOT / 'docs' / '11_毕业论文正文初稿.md'
OUTPUT = ROOT / 'docs' / '11_毕业论文正文初稿_河农大格式.docx'

# Captions assigned to tables in order of appearance within one render pass.
TABLE_TITLES = [
    '表 6-1 测试环境',
    '表 6-2 功能测试结果',
    '表 6-3 主要接口测试情况',
    '表 6-4 各模型在测试集上的回归性能',
    '表 6-5 主要分类模型结果',
]

CHINESE_TITLE_PLACEHOLDER_DATE = '二〇二六年〇月〇日'
ENGLISH_TITLE_PLACEHOLDER = 'ENGLISH TITLE PLACEHOLDER'

FONT_SONG = '宋体'
FONT_HEI = '黑体'
FONT_KAI = '楷体'
FONT_TNR = 'Times New Roman'
FONT_MONO = 'Courier New'

# Inline tokens: **bold** or `code`. The capturing group makes re.split()
# return the tokens themselves at odd indices.
_INLINE_TOKEN_RE = re.compile(r'(\*\*.*?\*\*|`[^`]+`)')
_CHAPTER_TITLE_RE = re.compile(r'^第(\d+)章\s+(.+)$')
_CHAPTER_PREFIX_RE = re.compile(r'^第\d+章\s+')
_HEADING_RE = re.compile(r'^(#{2,4})\s+(.+)$')
_LIST_ITEM_RE = re.compile(r'^(\d+\.\s+|[-*]\s+)(.+)$')
_REFERENCE_ENTRY_RE = re.compile(r'^\[\d+\]')
_TABLE_SEPARATOR_CELL_RE = re.compile(r':?-+:?')

_BORDER_EDGES = ('top', 'left', 'bottom', 'right', 'insideH', 'insideV')

# (style name, font size in pt, space before in pt, space after in pt)
_HEADING_STYLE_SPECS = (
    ('Heading 1', 14, 12, 6),
    ('Heading 2', 12, 10, 4),
    ('Heading 3', 10.5, 8, 3),
    ('Heading 4', 10.5, 6, 3),
)


@dataclass
class Block:
    """One parsed Markdown block."""

    # 'heading', 'paragraph', 'list_item', 'code' or 'table'.
    kind: str
    # Text of heading / paragraph / list_item blocks.
    text: str | None = None
    # Number of leading '#' characters for heading blocks.
    level: int | None = None
    # Raw lines of code / table blocks.
    lines: list[str] | None = None


def _apply_font_names(r_fonts, east_asia: str, ascii_font: str) -> None:
    """Write the four font-slot attributes onto a ``w:rFonts`` element."""
    r_fonts.set(qn('w:ascii'), ascii_font)
    r_fonts.set(qn('w:hAnsi'), ascii_font)
    r_fonts.set(qn('w:eastAsia'), east_asia)
    r_fonts.set(qn('w:cs'), ascii_font)


def set_run_fonts(run, east_asia: str, ascii_font: str, size: float | int | None = None, bold: bool | None = None):
    """Set the fonts of *run*, and optionally its size (pt) and bold flag.

    ``size`` and ``bold`` are left untouched when ``None``.
    """
    if size is not None:
        run.font.size = Pt(size)
    if bold is not None:
        run.bold = bold
    run.font.name = ascii_font
    r_pr = run._element.get_or_add_rPr()
    r_fonts = r_pr.rFonts
    if r_fonts is None:
        r_fonts = OxmlElement('w:rFonts')
        r_pr.insert(0, r_fonts)
    _apply_font_names(r_fonts, east_asia, ascii_font)


def set_style_fonts(style, east_asia: str, ascii_font: str, size: float | int, bold: bool = False):
    """Set size (pt), bold flag and fonts on a document style."""
    style.font.size = Pt(size)
    style.font.bold = bold
    style.font.name = ascii_font
    _apply_font_names(style._element.rPr.rFonts, east_asia, ascii_font)


def configure_document(doc: Document):
    """Apply page layout and configure the styles used by the renderer."""
    for section in doc.sections:
        apply_page_layout(section)

    normal = doc.styles['Normal']
    set_style_fonts(normal, FONT_SONG, FONT_TNR, 10.5, False)
    normal.paragraph_format.line_spacing_rule = WD_LINE_SPACING.EXACTLY
    normal.paragraph_format.line_spacing = Pt(20)
    normal.paragraph_format.first_line_indent = Pt(21)
    normal.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY

    for style_name, size, before, after in _HEADING_STYLE_SPECS:
        heading = doc.styles[style_name]
        set_style_fonts(heading, FONT_HEI, FONT_TNR, size, True)
        fmt = heading.paragraph_format
        fmt.alignment = WD_ALIGN_PARAGRAPH.LEFT
        fmt.first_line_indent = Pt(0)
        fmt.space_before = Pt(before)
        fmt.space_after = Pt(after)
        fmt.line_spacing_rule = WD_LINE_SPACING.EXACTLY
        fmt.line_spacing = Pt(20)

    for toc_name in ('TOC 1', 'TOC 2', 'TOC 3'):
        # Only styles already present in the document are adjusted.
        if toc_name in doc.styles:
            toc_style = doc.styles[toc_name]
            set_style_fonts(toc_style, FONT_SONG, FONT_TNR, 12, False)
            toc_style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.MULTIPLE
            toc_style.paragraph_format.line_spacing = 1.5

    if 'CodeBlock' not in doc.styles:
        code_style = doc.styles.add_style('CodeBlock', WD_STYLE_TYPE.PARAGRAPH)
    else:
        code_style = doc.styles['CodeBlock']
    set_style_fonts(code_style, FONT_MONO, FONT_MONO, 10.5, False)
    code_style.paragraph_format.first_line_indent = Pt(0)
    code_style.paragraph_format.left_indent = Pt(12)
    code_style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.EXACTLY
    code_style.paragraph_format.line_spacing = Pt(20)

    enable_update_fields_on_open(doc)


def apply_page_layout(section):
    """Set a 21 x 29.7 cm page with 2.5 cm top/bottom and 3 cm side margins."""
    section.page_width = Cm(21)
    section.page_height = Cm(29.7)
    section.top_margin = Cm(2.5)
    section.bottom_margin = Cm(2.5)
    section.left_margin = Cm(3)
    section.right_margin = Cm(3)


def enable_update_fields_on_open(doc: Document):
    """Set ``w:updateFields`` to ``true`` in the document settings."""
    settings = doc.settings.element
    update_fields = settings.find(qn('w:updateFields'))
    if update_fields is None:
        update_fields = OxmlElement('w:updateFields')
        settings.append(update_fields)
    update_fields.set(qn('w:val'), 'true')


def _field_char(char_type: str):
    """Build a ``w:fldChar`` element of the given type."""
    element = OxmlElement('w:fldChar')
    element.set(qn('w:fldCharType'), char_type)
    return element


def add_field(paragraph, instruction: str, placeholder: str | None = None):
    """Append a complex field (begin / instruction / separate / end).

    Each field part is placed inside its own run: ``w:fldChar`` and
    ``w:instrText`` are run content in WordprocessingML, so they must not
    be direct children of the paragraph element. Putting them in runs also
    lets callers style the field through ``paragraph.runs``.

    Args:
        paragraph: Paragraph that receives the field.
        instruction: Field code, e.g. ``' PAGE '``.
        placeholder: Optional text added as the field result, between the
            ``separate`` and ``end`` markers.
    """
    instr = OxmlElement('w:instrText')
    instr.set(qn('xml:space'), 'preserve')
    instr.text = instruction

    for element in (_field_char('begin'), instr, _field_char('separate')):
        paragraph.add_run()._element.append(element)

    if placeholder:
        run = paragraph.add_run(placeholder)
        set_run_fonts(run, FONT_SONG, FONT_TNR, 12, False)

    paragraph.add_run()._element.append(_field_char('end'))


def configure_footer_page_number(section, fmt: str, start: int):
    """Give *section* its own centered footer with a PAGE field.

    Args:
        section: Section to configure.
        fmt: Value written to ``w:pgNumType/@w:fmt`` (e.g. ``'decimal'``).
        start: Value written to ``w:pgNumType/@w:start``.
    """
    sect_pr = section._sectPr
    pg_num_type = sect_pr.find(qn('w:pgNumType'))
    if pg_num_type is None:
        pg_num_type = OxmlElement('w:pgNumType')
        sect_pr.append(pg_num_type)
    pg_num_type.set(qn('w:fmt'), fmt)
    pg_num_type.set(qn('w:start'), str(start))

    section.footer.is_linked_to_previous = False
    footer_p = section.footer.paragraphs[0]
    footer_p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    add_field(footer_p, ' PAGE ')
    for run in footer_p.runs:
        set_run_fonts(run, FONT_TNR, FONT_TNR, 10.5, False)


def add_cover(doc: Document, title: str):
    """Append the cover: title, English title placeholder and info lines."""
    for _ in range(4):
        doc.add_paragraph()

    title_p = doc.add_paragraph()
    title_p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    title_p.paragraph_format.space_after = Pt(18)
    set_run_fonts(title_p.add_run(title), FONT_HEI, FONT_TNR, 18, True)

    en_title_p = doc.add_paragraph()
    en_title_p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    en_title_p.paragraph_format.space_after = Pt(30)
    set_run_fonts(en_title_p.add_run(ENGLISH_TITLE_PLACEHOLDER), FONT_TNR, FONT_TNR, 18, True)

    for _ in range(4):
        doc.add_paragraph()

    cover_lines = [
        '作 者:________________________',
        '学 院:________________________',
        '专 业:________________________',
        '班 级:________________________',
        '学 号:________________________',
        '指导教师:________________________',
        f'完成日期:{CHINESE_TITLE_PLACEHOLDER_DATE}',
    ]
    for line in cover_lines:
        p = doc.add_paragraph()
        p.alignment = WD_ALIGN_PARAGRAPH.CENTER
        p.paragraph_format.space_after = Pt(8)
        set_run_fonts(p.add_run(line), FONT_SONG, FONT_TNR, 15, False)


def add_toc_page(doc: Document):
    """Append the contents title, a TOC field and a page break."""
    toc_title = doc.add_paragraph()
    toc_title.alignment = WD_ALIGN_PARAGRAPH.CENTER
    toc_title.paragraph_format.space_before = Pt(16)
    toc_title.paragraph_format.space_after = Pt(12)
    toc_title.paragraph_format.line_spacing_rule = WD_LINE_SPACING.MULTIPLE
    toc_title.paragraph_format.line_spacing = 1.5
    set_run_fonts(toc_title.add_run('目 录'), FONT_HEI, FONT_TNR, 16, True)

    toc_p = doc.add_paragraph()
    add_field(toc_p, r' TOC \o "1-3" \h \z \u ', '右键更新目录')
    doc.add_page_break()


def add_heading_paragraph(doc: Document, text: str, style_name: str, align=WD_ALIGN_PARAGRAPH.LEFT, size: float | int | None = None):
    """Append a paragraph in *style_name* holding one bold heading run.

    Args:
        doc: Target document.
        text: Heading text.
        style_name: Name of the paragraph style to apply.
        align: Paragraph alignment.
        size: Run font size in points; 10.5 is used when falsy.

    Returns:
        The new paragraph.
    """
    p = doc.add_paragraph(style=style_name)
    p.alignment = align
    run = p.add_run(text)
    set_run_fonts(run, FONT_HEI, FONT_TNR, size or 10.5, True)
    return p


def _add_flush_heading(doc: Document, text: str, style_name: str, size: float | int, align=WD_ALIGN_PARAGRAPH.LEFT):
    """Append a heading paragraph with its first-line indent forced to zero."""
    p = add_heading_paragraph(doc, text, style_name, align, size)
    p.paragraph_format.first_line_indent = Pt(0)
    return p


def render_inline(paragraph, text: str, east_asia: str, ascii_font: str, size: float | int, default_bold: bool = False):
    """Append *text* to *paragraph*, honouring ``**bold**`` and `` `code` ``.

    Bold spans keep the given fonts with bold on; code spans use the
    monospace font; everything else uses ``default_bold``.
    """
    for index, part in enumerate(_INLINE_TOKEN_RE.split(text)):
        if not part:
            continue
        # Odd indices are the captured tokens. Classifying by position, not
        # by the fragment's first/last characters, keeps a plain fragment
        # such as a lone '**' or '`' from being mistaken for a token.
        if index % 2 == 0:
            run = paragraph.add_run(part)
            set_run_fonts(run, east_asia, ascii_font, size, default_bold)
        elif part.startswith('**'):
            run = paragraph.add_run(part[2:-2])
            set_run_fonts(run, east_asia, ascii_font, size, True)
        else:
            run = paragraph.add_run(part[1:-1])
            set_run_fonts(run, FONT_MONO, FONT_MONO, size, False)


def add_body_paragraph(doc: Document, text: str, indent: bool = True, hanging: bool = False):
    """Append a justified body paragraph with exact 20 pt line spacing.

    Args:
        doc: Target document.
        text: Paragraph text; inline Markdown is rendered.
        indent: Use a 21 pt first-line indent (ignored when ``hanging``).
        hanging: Use a 21 pt hanging indent instead.

    Returns:
        The new paragraph.
    """
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    fmt = p.paragraph_format
    fmt.line_spacing_rule = WD_LINE_SPACING.EXACTLY
    fmt.line_spacing = Pt(20)
    if hanging:
        fmt.left_indent = Pt(21)
        fmt.first_line_indent = Pt(-21)
    else:
        fmt.first_line_indent = Pt(21) if indent else Pt(0)
    render_inline(p, text, FONT_SONG, FONT_TNR, 10.5, False)
    return p


def add_code_paragraph(doc: Document, text: str):
    """Append one line of code as a left-aligned 'CodeBlock' paragraph."""
    p = doc.add_paragraph(style='CodeBlock')
    p.alignment = WD_ALIGN_PARAGRAPH.LEFT
    p.paragraph_format.first_line_indent = Pt(0)
    set_run_fonts(p.add_run(text), FONT_MONO, FONT_MONO, 10.5, False)
    return p


def clean_title_text(text: str) -> str:
    """Rewrite '第N章 Title' as 'N Title'; return other text unchanged."""
    match = _CHAPTER_TITLE_RE.match(text)
    if match is None:
        return text
    return f'{match.group(1)} {match.group(2)}'


def split_cells(line: str) -> list[str]:
    """Split a pipe-table row into stripped cell strings."""
    inner = line.strip().strip('|')
    return [cell.strip() for cell in inner.split('|')]


def ensure_cell_has_paragraph(cell):
    """Return an empty paragraph of *cell*, reusing its first one if any."""
    if not cell.paragraphs:
        return cell.add_paragraph()
    p = cell.paragraphs[0]
    p.clear()
    return p


def set_cell_border(cell, **kwargs):
    """Set border attributes on *cell*.

    Keyword names are edges ('top', 'left', 'bottom', 'right', 'insideH',
    'insideV'); each value is a dict of ``w:``-attribute names to values,
    e.g. ``top={'val': 'single', 'sz': 12}``. Falsy values are skipped.
    """
    tc_pr = cell._tc.get_or_add_tcPr()
    tc_borders = tc_pr.first_child_found_in('w:tcBorders')
    if tc_borders is None:
        tc_borders = OxmlElement('w:tcBorders')
        tc_pr.append(tc_borders)
    for edge in _BORDER_EDGES:
        edge_data = kwargs.get(edge)
        if not edge_data:
            continue
        tag = 'w:' + edge
        element = tc_borders.find(qn(tag))
        if element is None:
            element = OxmlElement(tag)
            tc_borders.append(element)
        for key, value in edge_data.items():
            element.set(qn(f'w:{key}'), str(value))


def remove_all_table_borders(table):
    """Set every table-level border edge of *table* to 'nil'."""
    tbl_pr = table._tbl.tblPr
    tbl_borders = tbl_pr.first_child_found_in('w:tblBorders')
    if tbl_borders is None:
        tbl_borders = OxmlElement('w:tblBorders')
        tbl_pr.append(tbl_borders)
    for edge in _BORDER_EDGES:
        element = tbl_borders.find(qn(f'w:{edge}'))
        if element is None:
            element = OxmlElement(f'w:{edge}')
            tbl_borders.append(element)
        element.set(qn('w:val'), 'nil')


def apply_three_line_table(table):
    """Give *table* three rules: above the header, below it, and at the end.

    All borders are cleared first; the header row then gets a sz-12 top
    and sz-4 bottom rule, and the last row a sz-12 bottom rule.
    """
    remove_all_table_borders(table)
    no_border = {'val': 'nil'}
    thick = {'val': 'single', 'sz': 12, 'space': 0, 'color': '000000'}
    thin = {'val': 'single', 'sz': 4, 'space': 0, 'color': '000000'}

    for row in table.rows:
        for cell in row.cells:
            set_cell_border(cell, top=no_border, bottom=no_border, left=no_border, right=no_border)

    row_count = len(table.rows)
    header_cells = table.rows[0].cells
    last_cells = table.rows[row_count - 1].cells
    for col in range(len(table.columns)):
        set_cell_border(header_cells[col], top=thick, bottom=thin, left=no_border, right=no_border)
        # For a header-only table this is the same cell, so the thick bottom
        # rule written here replaces the thin one set just above.
        set_cell_border(last_cells[col], bottom=thick, left=no_border, right=no_border)


def add_table_title(doc: Document, title: str):
    """Append a centered, bold 9 pt table caption."""
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p.paragraph_format.first_line_indent = Pt(0)
    p.paragraph_format.space_before = Pt(6)
    p.paragraph_format.space_after = Pt(2)
    set_run_fonts(p.add_run(title), FONT_HEI, FONT_TNR, 9, True)


def _is_separator_row(line: str) -> bool:
    """Return True when every cell of *line* looks like '---' or ':--:'."""
    return all(_TABLE_SEPARATOR_CELL_RE.fullmatch(cell) for cell in split_cells(line))


def _fill_table_cell(cell, text: str, bold: bool) -> None:
    """Write *text* into *cell* as a centered 9 pt run."""
    p = ensure_cell_has_paragraph(cell)
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p.paragraph_format.first_line_indent = Pt(0)
    p.paragraph_format.line_spacing_rule = WD_LINE_SPACING.EXACTLY
    p.paragraph_format.line_spacing = Pt(14)
    set_run_fonts(p.add_run(text), FONT_SONG, FONT_TNR, 9, bold)


def add_markdown_table(doc: Document, lines: list[str], title: str):
    """Render a Markdown pipe table, with its caption, as a three-line table.

    Args:
        doc: Target document.
        lines: Table lines; the first is the header. The second line is
            skipped only when it is a separator row.
        title: Caption placed above the table.

    Nothing is added when *lines* is empty.
    """
    if not lines:
        return

    add_table_title(doc, title)
    body_lines = lines[1:]
    if body_lines and _is_separator_row(body_lines[0]):
        body_lines = body_lines[1:]
    grid = [split_cells(lines[0])] + [split_cells(line) for line in body_lines]

    # Size the table to the widest row so that a body row with more cells
    # than the header cannot index past the last column.
    col_count = max(len(cells) for cells in grid)
    table = doc.add_table(rows=len(grid), cols=col_count)
    table.alignment = WD_TABLE_ALIGNMENT.CENTER
    table.autofit = True

    for row_idx, (table_row, cells_text) in enumerate(zip(table.rows, grid)):
        for cell, text in zip(table_row.cells, cells_text):
            _fill_table_cell(cell, text, bold=row_idx == 0)

    apply_three_line_table(table)
    doc.add_paragraph()


def parse_markdown_blocks(lines: list[str]) -> list[Block]:
    """Parse Markdown lines into a flat list of blocks.

    Recognised: ``` fences, '##'-'####' headings, runs of lines starting
    with '|', list items ('1. ', '- ', '* '), and paragraphs. Blank lines
    end a paragraph.
    """
    blocks: list[Block] = []
    paragraph_lines: list[str] = []
    code_lines: list[str] = []
    in_code = False

    def flush_paragraph():
        if paragraph_lines:
            # Lines are joined with no separator (presumably because the
            # source text is Chinese, where a line break is not a space).
            blocks.append(Block(kind='paragraph', text=''.join(paragraph_lines).strip()))
            paragraph_lines.clear()

    i = 0
    total = len(lines)
    while i < total:
        line = lines[i]
        stripped = line.strip()

        if stripped.startswith('```'):
            flush_paragraph()
            if in_code:
                blocks.append(Block(kind='code', lines=code_lines))
                code_lines = []
            in_code = not in_code
            i += 1
            continue

        if in_code:
            code_lines.append(line.rstrip('\n'))
            i += 1
            continue

        if stripped == '':
            flush_paragraph()
            i += 1
            continue

        heading_match = _HEADING_RE.match(stripped)
        if heading_match:
            flush_paragraph()
            blocks.append(
                Block(
                    kind='heading',
                    level=len(heading_match.group(1)),
                    text=heading_match.group(2).strip(),
                )
            )
            i += 1
            continue

        if stripped.startswith('|'):
            flush_paragraph()
            table_lines: list[str] = []
            while i < total and lines[i].strip().startswith('|'):
                table_lines.append(lines[i].strip())
                i += 1
            blocks.append(Block(kind='table', lines=table_lines))
            continue

        if _LIST_ITEM_RE.match(stripped):
            flush_paragraph()
            blocks.append(Block(kind='list_item', text=stripped))
            i += 1
            continue

        paragraph_lines.append(stripped)
        i += 1

    flush_paragraph()
    # An unterminated fence would otherwise drop the code collected so far.
    if in_code and code_lines:
        blocks.append(Block(kind='code', lines=code_lines))
    return blocks


def split_document_blocks(blocks: list[Block]) -> tuple[list[Block], list[Block]]:
    """Split blocks at the first '第N章 ' heading.

    Returns:
        ``(abstract_blocks, body_blocks)``; the chapter heading itself is
        the first body block.
    """
    abstract_blocks: list[Block] = []
    body_blocks: list[Block] = []
    in_body = False
    for block in blocks:
        if block.kind == 'heading' and block.text and _CHAPTER_PREFIX_RE.match(block.text):
            in_body = True
        (body_blocks if in_body else abstract_blocks).append(block)
    return abstract_blocks, body_blocks


def read_source() -> tuple[str, list[Block]]:
    """Read ``SOURCE`` and return ``(title, blocks)``.

    The first line must be a '# ' heading and supplies the title. Content
    from the line '## 后续补强建议' onward is excluded.

    Raises:
        ValueError: If the first line is not a '# ' heading.
    """
    # 'utf-8-sig' drops a leading BOM, which would otherwise make the
    # '# ' check below fail on an otherwise valid file.
    lines = SOURCE.read_text(encoding='utf-8-sig').splitlines()
    if not lines or not lines[0].startswith('# '):
        raise ValueError('源 Markdown 缺少一级标题作为论文题目。')

    title = lines[0][2:].strip()
    content_lines = lines[1:]
    cutoff = next(
        (idx for idx, line in enumerate(content_lines) if line.strip() == '## 后续补强建议'),
        len(content_lines),
    )
    return title, parse_markdown_blocks(content_lines[:cutoff])


def add_section_heading(doc: Document, text: str, level: int):
    """Append a body heading for a Markdown heading level.

    Level 2 uses 'Heading 1' (14 pt) with the chapter prefix rewritten by
    ``clean_title_text``; level 3 uses 'Heading 2' (12 pt); any other level
    uses 'Heading 3' (10.5 pt).
    """
    if level == 2:
        _add_flush_heading(doc, clean_title_text(text), 'Heading 1', 14)
    elif level == 3:
        _add_flush_heading(doc, text, 'Heading 2', 12)
    else:
        _add_flush_heading(doc, text, 'Heading 3', 10.5)


def render_blocks(doc: Document, blocks: list[Block], is_abstract: bool = False):
    """Render parsed blocks into *doc*.

    Args:
        doc: Target document.
        blocks: Blocks to render, in order.
        is_abstract: When true, a '摘要' heading is rendered as the
            centered abstract title.
    """
    table_index = 0
    current_section = '摘要' if is_abstract else ''

    for block in blocks:
        if block.kind == 'heading':
            heading_text = block.text or ''
            if is_abstract and block.text == '摘要':
                current_section = '摘要'
                p = _add_flush_heading(doc, '摘 要', 'Heading 1', 16, WD_ALIGN_PARAGRAPH.CENTER)
                p.paragraph_format.space_before = Pt(16)
                p.paragraph_format.space_after = Pt(10)
                continue

            current_section = heading_text
            if heading_text in ('参考文献', '致谢'):
                # These two sections start on a new page with a centered title.
                doc.add_page_break()
                p = _add_flush_heading(doc, heading_text, 'Heading 1', 14, WD_ALIGN_PARAGRAPH.CENTER)
                p.paragraph_format.line_spacing_rule = WD_LINE_SPACING.MULTIPLE
                p.paragraph_format.line_spacing = 1.5
            else:
                add_section_heading(doc, heading_text, block.level or 3)

        elif block.kind == 'paragraph':
            text = block.text or ''
            no_indent = text.startswith(('**关键词', '关键词:'))
            # '[n]' entries under the references heading get a hanging indent.
            is_reference = current_section == '参考文献' and bool(_REFERENCE_ENTRY_RE.match(text))
            add_body_paragraph(doc, text, indent=not no_indent and not is_reference, hanging=is_reference)

        elif block.kind == 'list_item':
            add_body_paragraph(doc, block.text or '', indent=False)

        elif block.kind == 'code':
            code_lines = block.lines or []
            for line in code_lines:
                add_code_paragraph(doc, line)
            if not code_lines:
                # An empty fence still produces one empty code paragraph.
                add_code_paragraph(doc, '')

        elif block.kind == 'table':
            if table_index < len(TABLE_TITLES):
                title = TABLE_TITLES[table_index]
            else:
                title = f'表 {table_index + 1}'
            add_markdown_table(doc, block.lines or [], title)
            table_index += 1


def build_document():
    """Build the document from ``SOURCE`` and save it to ``OUTPUT``.

    Layout: cover; a new section with upper-Roman page numbers holding the
    contents page and the abstract; a new section with decimal page
    numbers (restarting at 1) holding the body.
    """
    title, blocks = read_source()
    abstract_blocks, body_blocks = split_document_blocks(blocks)

    doc = Document()
    configure_document(doc)
    add_cover(doc, title)

    preface_section = doc.add_section(WD_SECTION_START.NEW_PAGE)
    apply_page_layout(preface_section)
    configure_footer_page_number(preface_section, 'upperRoman', 1)
    add_toc_page(doc)
    render_blocks(doc, abstract_blocks, is_abstract=True)

    body_section = doc.add_section(WD_SECTION_START.NEW_PAGE)
    apply_page_layout(body_section)
    configure_footer_page_number(body_section, 'decimal', 1)
    render_blocks(doc, body_blocks, is_abstract=False)

    doc.save(OUTPUT)


def main():
    """Generate the document and print the output path."""
    build_document()
    print(f'Generated: {OUTPUT}')


if __name__ == '__main__':
    main()