feat: 论文

This commit is contained in:
shuo
2026-04-27 12:00:47 +08:00
parent 304441c888
commit 1393689fe7
15 changed files with 1436 additions and 37 deletions

View File

@@ -0,0 +1,670 @@
from __future__ import annotations
import re
from dataclasses import dataclass
from pathlib import Path
from docx import Document
from docx.enum.section import WD_SECTION_START
from docx.enum.style import WD_STYLE_TYPE
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_BREAK, WD_LINE_SPACING
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Cm, Pt
# Base directory: parents[2] of this file's resolved path, i.e. two levels
# above the directory that contains this file (presumably the repository root).
ROOT = Path(__file__).resolve().parents[2]
# Markdown draft read by read_source() and the .docx written by build_document().
SOURCE = ROOT / 'docs' / '11_毕业论文正文初稿.md'
OUTPUT = ROOT / 'docs' / '11_毕业论文正文初稿_河农大格式.docx'
# Table captions, consumed by render_blocks() in order of table appearance.
TABLE_TITLES = [
    '表 6-1 测试环境',
    '表 6-2 功能测试结果',
    '表 6-3 主要接口测试情况',
    '表 6-4 各模型在测试集上的回归性能',
    '表 6-5 主要分类模型结果',
]
# Placeholder strings printed on the cover page by add_cover().
CHINESE_TITLE_PLACEHOLDER_DATE = '二〇二六年〇月〇日'
ENGLISH_TITLE_PLACEHOLDER = 'ENGLISH TITLE PLACEHOLDER'
# Font family names passed to set_run_fonts() / set_style_fonts().
FONT_SONG = '宋体'
FONT_HEI = '黑体'
FONT_KAI = '楷体'
FONT_TNR = 'Times New Roman'
FONT_MONO = 'Courier New'
@dataclass
class Block:
    """One parsed unit of the Markdown source.

    ``parse_markdown_blocks`` produces the kinds ``'paragraph'``,
    ``'heading'``, ``'list_item'``, ``'code'`` and ``'table'``.
    """

    # Block category; see the class docstring for the values in use.
    kind: str
    # Text payload of 'paragraph', 'heading' and 'list_item' blocks.
    text: str | None = None
    # Number of leading '#' characters; set only for 'heading' blocks.
    level: int | None = None
    # Raw source lines of 'code' and 'table' blocks.
    lines: list[str] | None = None
def set_run_fonts(run, east_asia: str, ascii_font: str, size: float | int | None = None, bold: bool | None = None):
    """Set the font families of a run, and optionally its size and weight.

    Args:
        run: The run to format.
        east_asia: Family written to the ``w:eastAsia`` font slot.
        ascii_font: Family written to the ``w:ascii``, ``w:hAnsi`` and
            ``w:cs`` slots and to ``run.font.name``.
        size: Font size in points; left untouched when ``None``.
        bold: Bold flag; left untouched when ``None``.
    """
    font = run.font
    if size is not None:
        font.size = Pt(size)
    if bold is not None:
        run.bold = bold
    font.name = ascii_font

    run_props = run._element.get_or_add_rPr()
    fonts_el = run_props.rFonts
    if fonts_el is None:
        fonts_el = OxmlElement('w:rFonts')
        run_props.insert(0, fonts_el)

    slot_families = (
        ('w:ascii', ascii_font),
        ('w:hAnsi', ascii_font),
        ('w:eastAsia', east_asia),
        ('w:cs', ascii_font),
    )
    for slot, family in slot_families:
        fonts_el.set(qn(slot), family)
def set_style_fonts(style, east_asia: str, ascii_font: str, size: float | int, bold: bool = False):
    """Set size, weight and font families on a document style.

    Args:
        style: The style to modify.
        east_asia: Family written to the ``w:eastAsia`` font slot.
        ascii_font: Family written to the ``w:ascii``, ``w:hAnsi`` and
            ``w:cs`` slots and to ``style.font.name``.
        size: Font size in points.
        bold: Bold flag for the style.
    """
    font = style.font
    font.size = Pt(size)
    font.bold = bold
    font.name = ascii_font

    # The rFonts element is read straight off the style's rPr after the
    # font properties above have been assigned.
    fonts_el = style._element.rPr.rFonts
    for slot, family in (
        ('w:ascii', ascii_font),
        ('w:hAnsi', ascii_font),
        ('w:eastAsia', east_asia),
        ('w:cs', ascii_font),
    ):
        fonts_el.set(qn(slot), family)
def configure_document(doc: Document):
    """Apply page layout and the paragraph styles used by the thesis.

    Configures every existing section's page layout, the ``Normal`` style,
    ``Heading 1``-``Heading 4``, any of ``TOC 1``-``TOC 3`` that exist, a
    ``CodeBlock`` paragraph style (created when missing), and finally turns
    on the update-fields-on-open setting.
    """
    for section in doc.sections:
        apply_page_layout(section)

    styles = doc.styles

    body_style = styles['Normal']
    set_style_fonts(body_style, FONT_SONG, FONT_TNR, 10.5, False)
    body_fmt = body_style.paragraph_format
    body_fmt.line_spacing_rule = WD_LINE_SPACING.EXACTLY
    body_fmt.line_spacing = Pt(20)
    body_fmt.first_line_indent = Pt(21)
    body_fmt.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY

    # (style name, font size, space before, space after), all in points.
    heading_specs = (
        ('Heading 1', 14, 12, 6),
        ('Heading 2', 12, 10, 4),
        ('Heading 3', 10.5, 8, 3),
        ('Heading 4', 10.5, 6, 3),
    )
    for style_name, font_size, before, after in heading_specs:
        heading_style = styles[style_name]
        set_style_fonts(heading_style, FONT_HEI, FONT_TNR, font_size, True)
        heading_fmt = heading_style.paragraph_format
        heading_fmt.alignment = WD_ALIGN_PARAGRAPH.LEFT
        heading_fmt.first_line_indent = Pt(0)
        heading_fmt.space_before = Pt(before)
        heading_fmt.space_after = Pt(after)
        heading_fmt.line_spacing_rule = WD_LINE_SPACING.EXACTLY
        heading_fmt.line_spacing = Pt(20)

    for toc_name in ('TOC 1', 'TOC 2', 'TOC 3'):
        if toc_name not in styles:
            continue
        toc_style = styles[toc_name]
        set_style_fonts(toc_style, FONT_SONG, FONT_TNR, 12, False)
        toc_fmt = toc_style.paragraph_format
        toc_fmt.line_spacing_rule = WD_LINE_SPACING.MULTIPLE
        toc_fmt.line_spacing = 1.5

    if 'CodeBlock' in styles:
        code_style = styles['CodeBlock']
    else:
        code_style = styles.add_style('CodeBlock', WD_STYLE_TYPE.PARAGRAPH)
    set_style_fonts(code_style, FONT_MONO, FONT_MONO, 10.5, False)
    code_fmt = code_style.paragraph_format
    code_fmt.first_line_indent = Pt(0)
    code_fmt.left_indent = Pt(12)
    code_fmt.line_spacing_rule = WD_LINE_SPACING.EXACTLY
    code_fmt.line_spacing = Pt(20)

    enable_update_fields_on_open(doc)
def apply_page_layout(section):
    """Set a 21 cm x 29.7 cm page with 2.5 cm top/bottom and 3 cm side margins."""
    layout_cm = (
        ('page_width', 21),
        ('page_height', 29.7),
        ('top_margin', 2.5),
        ('bottom_margin', 2.5),
        ('left_margin', 3),
        ('right_margin', 3),
    )
    for attribute, centimetres in layout_cm:
        setattr(section, attribute, Cm(centimetres))
def enable_update_fields_on_open(doc: Document):
    """Set ``w:updateFields`` to ``true`` in the document settings.

    The element is created and appended to the settings element when it is
    not already present.
    """
    settings_el = doc.settings.element
    flag = settings_el.find(qn('w:updateFields'))
    if flag is None:
        flag = OxmlElement('w:updateFields')
        settings_el.append(flag)
    flag.set(qn('w:val'), 'true')
def add_field(paragraph, instruction: str, placeholder: str | None = None):
    """Append a complex field (begin / instruction / separate / end) to a paragraph.

    Each field character and the instruction text is wrapped in its own run.
    ``w:fldChar`` and ``w:instrText`` are run-level content, so appending them
    directly to the paragraph element produces markup outside the WordprocessingML
    schema and leaves nothing for callers to format through ``paragraph.runs``.

    Args:
        paragraph: Paragraph that receives the field.
        instruction: Field instruction text, e.g. ``' PAGE '``.
        placeholder: Optional text shown as the field result until the field
            is updated; rendered in FONT_SONG / FONT_TNR at 12 pt.
    """

    def append_in_new_run(child):
        # Field markup must live inside a w:r element.
        holder = paragraph.add_run()
        holder._element.append(child)
        return holder

    def field_char(char_type: str):
        element = OxmlElement('w:fldChar')
        element.set(qn('w:fldCharType'), char_type)
        return element

    instr = OxmlElement('w:instrText')
    # Keep the leading/trailing spaces of the instruction significant.
    instr.set(qn('xml:space'), 'preserve')
    instr.text = instruction

    append_in_new_run(field_char('begin'))
    append_in_new_run(instr)
    append_in_new_run(field_char('separate'))
    if placeholder:
        run = paragraph.add_run(placeholder)
        set_run_fonts(run, FONT_SONG, FONT_TNR, 12, False)
    append_in_new_run(field_char('end'))
def configure_footer_page_number(section, fmt: str, start: int):
    """Set the section's page-number format and start, and add a PAGE field footer.

    Args:
        section: Section whose ``w:pgNumType`` and footer are configured.
        fmt: Value written to ``w:fmt`` (callers pass ``'upperRoman'`` or
            ``'decimal'``).
        start: Value written to ``w:start``.
    """
    section_props = section._sectPr
    numbering = section_props.find(qn('w:pgNumType'))
    if numbering is None:
        numbering = OxmlElement('w:pgNumType')
        section_props.append(numbering)
    for attribute, value in (('w:fmt', fmt), ('w:start', str(start))):
        numbering.set(qn(attribute), value)

    footer = section.footer
    # Give this section its own footer instead of inheriting the previous one.
    footer.is_linked_to_previous = False
    footer_para = footer.paragraphs[0]
    footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    add_field(footer_para, ' PAGE ')
    # Apply the footer font to every run the footer paragraph exposes.
    for footer_run in footer_para.runs:
        set_run_fonts(footer_run, FONT_TNR, FONT_TNR, 10.5, False)
def add_cover(doc: Document, title: str):
    """Write the cover page: title, English title placeholder and blank info lines.

    Args:
        doc: Document to append to.
        title: Thesis title shown at the top of the cover.
    """

    def add_blank_paragraphs(count: int):
        for _ in range(count):
            doc.add_paragraph()

    def add_centered_line(text: str, east_asia: str, size, bold: bool, gap_after):
        paragraph = doc.add_paragraph()
        paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
        paragraph.paragraph_format.space_after = Pt(gap_after)
        set_run_fonts(paragraph.add_run(text), east_asia, FONT_TNR, size, bold)

    add_blank_paragraphs(4)
    add_centered_line(title, FONT_HEI, 18, True, 18)
    add_centered_line(ENGLISH_TITLE_PLACEHOLDER, FONT_TNR, 18, True, 30)
    add_blank_paragraphs(4)

    info_lines = (
        '作 者________________________',
        '学 院________________________',
        '专 业________________________',
        '班 级________________________',
        '学 号________________________',
        '指导教师________________________',
        f'完成日期:{CHINESE_TITLE_PLACEHOLDER_DATE}',
    )
    for info in info_lines:
        add_centered_line(info, FONT_SONG, 15, False, 8)
def add_toc_page(doc: Document):
    """Add a centered contents heading, a TOC field and a trailing page break."""
    heading = doc.add_paragraph()
    heading.alignment = WD_ALIGN_PARAGRAPH.CENTER
    heading_fmt = heading.paragraph_format
    heading_fmt.space_before = Pt(16)
    heading_fmt.space_after = Pt(12)
    heading_fmt.line_spacing_rule = WD_LINE_SPACING.MULTIPLE
    heading_fmt.line_spacing = 1.5
    set_run_fonts(heading.add_run('目 录'), FONT_HEI, FONT_TNR, 16, True)

    # The placeholder text is what the field shows until it is updated.
    field_para = doc.add_paragraph()
    add_field(field_para, r' TOC \o "1-3" \h \z \u ', '右键更新目录')

    doc.add_page_break()
def add_heading_paragraph(doc: Document, text: str, style_name: str, align=WD_ALIGN_PARAGRAPH.LEFT, size: float | int | None = None):
    """Append a heading paragraph in the given style and return it.

    The run is always bold and uses FONT_HEI / FONT_TNR whatever the style
    name is.

    Args:
        doc: Document to append to.
        text: Heading text.
        style_name: Name of the paragraph style to apply.
        align: Paragraph alignment.
        size: Font size in points; 10.5 is used when omitted (or falsy).

    Returns:
        The newly created paragraph.
    """
    # A paragraph created without text has no runs, so there is nothing to
    # clear before adding the heading run.
    p = doc.add_paragraph(style=style_name)
    p.alignment = align
    run = p.add_run(text)
    set_run_fonts(run, FONT_HEI, FONT_TNR, size or 10.5, True)
    return p
# Inline Markdown tokens: **bold** spans and `code` spans.
_INLINE_TOKEN_RE = re.compile(r'(\*\*.*?\*\*|`[^`]+`)')


def render_inline(paragraph, text: str, east_asia: str, ascii_font: str, size: float | int, default_bold: bool = False):
    """Append ``text`` to ``paragraph`` as runs, honouring ``**bold**`` and `` `code` ``.

    Args:
        paragraph: Paragraph that receives the runs.
        text: Source text that may contain inline Markdown tokens.
        east_asia: East Asian font family for normal and bold runs.
        ascii_font: Latin font family for normal and bold runs.
        size: Font size in points for every run.
        default_bold: Bold flag for text outside any token.
    """
    # Splitting on a single capturing group puts the matched tokens at odd
    # indices and the plain text between them at even indices. Classifying by
    # position keeps a stray '**' or '`' in plain text from being mistaken for
    # an empty token and dropped.
    for index, part in enumerate(_INLINE_TOKEN_RE.split(text)):
        is_token = index % 2 == 1
        if is_token and part.startswith('**'):
            run = paragraph.add_run(part[2:-2])
            set_run_fonts(run, east_asia, ascii_font, size, True)
        elif is_token:
            run = paragraph.add_run(part[1:-1])
            set_run_fonts(run, FONT_MONO, FONT_MONO, size, False)
        elif part:
            run = paragraph.add_run(part)
            set_run_fonts(run, east_asia, ascii_font, size, default_bold)
def add_body_paragraph(doc: Document, text: str, indent: bool = True, hanging: bool = False):
    """Append a justified body paragraph with exact 20 pt line spacing.

    Args:
        doc: Document to append to.
        text: Paragraph text; inline Markdown is handled by ``render_inline``.
        indent: Apply a 21 pt first-line indent (ignored when ``hanging``).
        hanging: Use a 21 pt hanging indent instead.

    Returns:
        The newly created paragraph.
    """
    para = doc.add_paragraph()
    para.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY

    fmt = para.paragraph_format
    fmt.line_spacing_rule = WD_LINE_SPACING.EXACTLY
    fmt.line_spacing = Pt(20)
    if hanging:
        fmt.left_indent = Pt(21)
        fmt.first_line_indent = Pt(-21)
    elif indent:
        fmt.first_line_indent = Pt(21)
    else:
        fmt.first_line_indent = Pt(0)

    render_inline(para, text, FONT_SONG, FONT_TNR, 10.5, False)
    return para
def add_code_paragraph(doc: Document, text: str):
    """Append one line of code as a left-aligned ``CodeBlock`` paragraph and return it."""
    code_para = doc.add_paragraph(style='CodeBlock')
    code_para.alignment = WD_ALIGN_PARAGRAPH.LEFT
    code_para.paragraph_format.first_line_indent = Pt(0)
    set_run_fonts(code_para.add_run(text), FONT_MONO, FONT_MONO, 10.5, False)
    return code_para
def clean_title_text(text: str) -> str:
    """Rewrite a ``第N章 <name>`` chapter title as ``N <name>``.

    Titles that do not match that pattern are returned unchanged.
    """
    chapter = re.match(r'^第(\d+)章\s+(.+)$', text)
    if chapter is None:
        return text
    number, name = chapter.groups()
    return f'{number} {name}'
def split_cells(line: str) -> list[str]:
    """Split one Markdown table line into its trimmed cell texts.

    Surrounding whitespace and outer ``|`` characters are removed before the
    line is split on ``|``.
    """
    inner = line.strip().strip('|')
    return [cell.strip() for cell in inner.split('|')]
def ensure_cell_has_paragraph(cell):
    """Return an empty paragraph in ``cell``.

    The cell's first paragraph is cleared and returned when one exists;
    otherwise a new paragraph is added and returned.
    """
    if not cell.paragraphs:
        return cell.add_paragraph()
    first = cell.paragraphs[0]
    first.clear()
    return first
def set_cell_border(cell, **kwargs):
    """Set border attributes on individual edges of a table cell.

    Args:
        cell: Table cell to modify.
        **kwargs: Per-edge attribute mappings keyed by ``top``, ``left``,
            ``bottom``, ``right``, ``insideH`` or ``insideV``; each key/value
            pair becomes a ``w:<key>`` attribute on that edge element. Edges
            that are absent or empty are left untouched.
    """
    cell_props = cell._tc.get_or_add_tcPr()
    borders = cell_props.first_child_found_in('w:tcBorders')
    if borders is None:
        borders = OxmlElement('w:tcBorders')
        cell_props.append(borders)

    for edge in ('top', 'left', 'bottom', 'right', 'insideH', 'insideV'):
        attributes = kwargs.get(edge)
        if not attributes:
            continue
        edge_tag = 'w:' + edge
        edge_el = borders.find(qn(edge_tag))
        if edge_el is None:
            edge_el = OxmlElement(edge_tag)
            borders.append(edge_el)
        for name, value in attributes.items():
            edge_el.set(qn(f'w:{name}'), str(value))
def remove_all_table_borders(table):
    """Set every table-level border edge, including the inside ones, to ``nil``."""
    table_props = table._tbl.tblPr
    borders = table_props.first_child_found_in('w:tblBorders')
    if borders is None:
        borders = OxmlElement('w:tblBorders')
        table_props.append(borders)

    for edge in ('top', 'left', 'bottom', 'right', 'insideH', 'insideV'):
        edge_tag = f'w:{edge}'
        edge_el = borders.find(qn(edge_tag))
        if edge_el is None:
            edge_el = OxmlElement(edge_tag)
            borders.append(edge_el)
        edge_el.set(qn('w:val'), 'nil')
def apply_three_line_table(table):
    """Format ``table`` as a three-line table.

    All borders are removed first; the first row then gets a size-12 top rule
    and a size-4 bottom rule, and the last row gets a size-12 bottom rule.
    """
    no_line = {'val': 'nil'}
    thick_line = {'val': 'single', 'sz': 12, 'space': 0, 'color': '000000'}
    thin_line = {'val': 'single', 'sz': 4, 'space': 0, 'color': '000000'}

    remove_all_table_borders(table)
    last_row_index = len(table.rows) - 1

    # Clear the four outer edges of every cell.
    for row in table.rows:
        for cell in row.cells:
            set_cell_border(cell, top=no_line, bottom=no_line, left=no_line, right=no_line)

    # Draw the three horizontal rules column by column.
    for col in range(len(table.columns)):
        header_cell = table.rows[0].cells[col]
        set_cell_border(header_cell, top=thick_line, bottom=thin_line, left=no_line, right=no_line)
        closing_cell = table.rows[last_row_index].cells[col]
        set_cell_border(closing_cell, bottom=thick_line, left=no_line, right=no_line)
def add_table_title(doc: Document, title: str):
    """Append a centered, bold 9 pt caption paragraph for a table."""
    caption = doc.add_paragraph()
    caption.alignment = WD_ALIGN_PARAGRAPH.CENTER
    caption_fmt = caption.paragraph_format
    caption_fmt.first_line_indent = Pt(0)
    caption_fmt.space_before = Pt(6)
    caption_fmt.space_after = Pt(2)
    set_run_fonts(caption.add_run(title), FONT_HEI, FONT_TNR, 9, True)
def add_markdown_table(doc: Document, lines: list[str], title: str):
    """Render a Markdown pipe table as a captioned three-line Word table.

    ``lines[0]`` is the header row, ``lines[1]`` is treated as the Markdown
    delimiter row and skipped, and every following line is a body row. A blank
    paragraph is appended after the table.

    Args:
        doc: Document to append to.
        lines: Raw table lines; nothing is rendered when the list is empty.
        title: Caption placed above the table.
    """
    if not lines:
        # An empty block has no header row to build a table from.
        return

    def fill_cell(cell, text: str, bold: bool):
        # Header and body cells share everything except the bold flag.
        p = ensure_cell_has_paragraph(cell)
        p.alignment = WD_ALIGN_PARAGRAPH.CENTER
        p.paragraph_format.first_line_indent = Pt(0)
        p.paragraph_format.line_spacing_rule = WD_LINE_SPACING.EXACTLY
        p.paragraph_format.line_spacing = Pt(14)
        run = p.add_run(text)
        set_run_fonts(run, FONT_SONG, FONT_TNR, 9, bold)

    header = split_cells(lines[0])
    rows = [split_cells(line) for line in lines[2:]]
    # Size the grid to the widest row so that a body row with more cells than
    # the header does not index past the last column.
    col_count = max(len(cells) for cells in (header, *rows))

    add_table_title(doc, title)
    table = doc.add_table(rows=len(rows) + 1, cols=col_count)
    table.alignment = WD_TABLE_ALIGNMENT.CENTER
    table.autofit = True

    for row_idx, texts in enumerate((header, *rows)):
        cells = table.rows[row_idx].cells
        for col_idx, text in enumerate(texts):
            fill_cell(cells[col_idx], text, bold=row_idx == 0)

    apply_three_line_table(table)
    doc.add_paragraph()
def parse_markdown_blocks(lines: list[str]) -> list[Block]:
    """Group Markdown source lines into ``Block`` records.

    Recognised constructs:

    * fenced code (lines between two lines starting with three backticks) -> ``'code'``
    * headings with two to four ``#`` characters -> ``'heading'``
    * consecutive lines starting with ``|`` -> ``'table'``
    * lines starting with ``N. ``, ``- `` or ``* `` -> ``'list_item'``
    * anything else -> ``'paragraph'``; adjacent lines are concatenated
      without a separator and blank lines end the paragraph.

    Args:
        lines: Source lines.

    Returns:
        The blocks in document order.
    """
    blocks: list[Block] = []
    paragraph_lines: list[str] = []
    code_lines: list[str] = []
    in_code = False

    def flush_paragraph():
        if paragraph_lines:
            blocks.append(Block(kind='paragraph', text=''.join(paragraph_lines).strip()))
            paragraph_lines.clear()

    def flush_code():
        blocks.append(Block(kind='code', lines=code_lines.copy()))
        code_lines.clear()

    i = 0
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()

        if stripped.startswith('```'):
            flush_paragraph()
            if in_code:
                flush_code()
            in_code = not in_code
            i += 1
            continue

        if in_code:
            # Code keeps its leading whitespace; only the newline is dropped.
            code_lines.append(line.rstrip('\n'))
            i += 1
            continue

        if stripped == '':
            flush_paragraph()
            i += 1
            continue

        heading_match = re.match(r'^(#{2,4})\s+(.+)$', stripped)
        if heading_match:
            flush_paragraph()
            blocks.append(
                Block(
                    kind='heading',
                    level=len(heading_match.group(1)),
                    text=heading_match.group(2).strip(),
                )
            )
            i += 1
            continue

        if stripped.startswith('|'):
            flush_paragraph()
            table_lines: list[str] = []
            # Consume the whole run of table lines in one go.
            while i < len(lines) and lines[i].strip().startswith('|'):
                table_lines.append(lines[i].strip())
                i += 1
            blocks.append(Block(kind='table', lines=table_lines))
            continue

        if re.match(r'^(\d+\.\s+|[-*]\s+)(.+)$', stripped):
            flush_paragraph()
            # The list marker stays part of the text.
            blocks.append(Block(kind='list_item', text=stripped))
            i += 1
            continue

        paragraph_lines.append(stripped)
        i += 1

    flush_paragraph()
    if in_code and code_lines:
        # A fence that is never closed would otherwise lose its content.
        flush_code()
    return blocks
def split_document_blocks(blocks: list[Block]) -> tuple[list[Block], list[Block]]:
    """Split blocks into front matter and body at the first chapter heading.

    The body starts at the first ``'heading'`` block whose text matches
    ``第N章 `` and runs to the end; everything before it is front matter.

    Returns:
        A ``(front_matter_blocks, body_blocks)`` tuple.
    """
    front_matter: list[Block] = []
    body: list[Block] = []
    target = front_matter
    for block in blocks:
        starts_chapter = (
            block.kind == 'heading'
            and block.text
            and re.match(r'^第\d+章\s+', block.text)
        )
        if starts_chapter:
            # Once a chapter heading is seen, everything goes to the body.
            target = body
        target.append(block)
    return front_matter, body
def read_source() -> tuple[str, list[Block]]:
    """Read the Markdown draft and return its title and parsed blocks.

    The first line must be a level-one heading (``# ...``) and supplies the
    title. Parsing stops before a ``## 后续补强建议`` line when one is present.

    Returns:
        A ``(title, blocks)`` tuple.

    Raises:
        ValueError: If the file is empty or does not start with ``# ``.
    """
    # 'utf-8-sig' decodes plain UTF-8 and also drops a leading byte-order
    # mark, which would otherwise make the '# ' check on the first line fail.
    text = SOURCE.read_text(encoding='utf-8-sig')
    lines = text.splitlines()
    if not lines or not lines[0].startswith('# '):
        raise ValueError('源 Markdown 缺少一级标题作为论文题目。')
    title = lines[0][2:].strip()

    content_lines = lines[1:]
    for idx, line in enumerate(content_lines):
        if line.strip() == '## 后续补强建议':
            # Everything from this heading onwards is left out of the output.
            content_lines = content_lines[:idx]
            break

    return title, parse_markdown_blocks(content_lines)
def add_section_heading(doc: Document, text: str, level: int):
    """Append a body heading, mapping Markdown level to a Word heading style.

    Level 2 becomes ``Heading 1`` at 14 pt with the chapter title normalised
    by ``clean_title_text``; level 3 becomes ``Heading 2`` at 12 pt; any other
    level becomes ``Heading 3`` at 10.5 pt.
    """
    if level == 2:
        style_name, font_size, label = 'Heading 1', 14, clean_title_text(text)
    elif level == 3:
        style_name, font_size, label = 'Heading 2', 12, text
    else:
        style_name, font_size, label = 'Heading 3', 10.5, text

    heading = doc.add_paragraph(style=style_name)
    heading.alignment = WD_ALIGN_PARAGRAPH.LEFT
    heading.paragraph_format.first_line_indent = Pt(0)
    heading.clear()
    set_run_fonts(heading.add_run(label), FONT_HEI, FONT_TNR, font_size, True)
def render_blocks(doc: Document, blocks: list[Block], is_abstract: bool = False):
    """Append parsed blocks to the document in order.

    Args:
        doc: Document to append to.
        blocks: Blocks to render.
        is_abstract: When true, a ``摘要`` heading is rendered as a centered
            16 pt title instead of a regular section heading.
    """

    def start_centered_heading():
        paragraph = doc.add_paragraph(style='Heading 1')
        paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
        paragraph.paragraph_format.first_line_indent = Pt(0)
        return paragraph

    def finish_heading(paragraph, label: str, size):
        paragraph.clear()
        set_run_fonts(paragraph.add_run(label), FONT_HEI, FONT_TNR, size, True)

    tables_seen = 0
    section_name = '摘要' if is_abstract else ''

    for block in blocks:
        kind = block.kind

        if kind == 'heading':
            if is_abstract and block.text == '摘要':
                section_name = '摘要'
                heading = start_centered_heading()
                heading.paragraph_format.space_before = Pt(16)
                heading.paragraph_format.space_after = Pt(10)
                finish_heading(heading, '摘 要', 16)
                continue

            heading_text = block.text or ''
            section_name = heading_text
            if heading_text not in ('参考文献', '致谢'):
                add_section_heading(doc, heading_text, block.level or 3)
                continue

            # These two headings start on a fresh page and are centered.
            doc.add_page_break()
            heading = start_centered_heading()
            heading.paragraph_format.line_spacing_rule = WD_LINE_SPACING.MULTIPLE
            heading.paragraph_format.line_spacing = 1.5
            finish_heading(heading, heading_text, 14)

        elif kind == 'paragraph':
            text = block.text or ''
            no_indent = text.startswith(('**关键词', '关键词:'))
            # Numbered entries under the references heading get a hanging indent.
            is_reference = section_name == '参考文献' and re.match(r'^\[\d+\]', text)
            add_body_paragraph(
                doc,
                text,
                indent=not (no_indent or is_reference),
                hanging=is_reference,
            )

        elif kind == 'list_item':
            add_body_paragraph(doc, block.text or '', indent=False)

        elif kind == 'code':
            code_lines = block.lines or []
            for code_line in code_lines:
                add_code_paragraph(doc, code_line)
            if not code_lines:
                # An empty fence still produces one empty code paragraph.
                add_code_paragraph(doc, '')

        elif kind == 'table':
            if tables_seen < len(TABLE_TITLES):
                title = TABLE_TITLES[tables_seen]
            else:
                # No caption configured: fall back to the table's ordinal.
                title = f'{tables_seen + 1}'
            add_markdown_table(doc, block.lines or [], title)
            tables_seen += 1
def build_document():
    """Build the formatted thesis from SOURCE and save it to OUTPUT.

    Layout: cover page, then a section numbered with upper-case Roman
    numerals holding the table of contents and the front matter, then a
    section numbered with decimals holding the chapters.
    """
    title, blocks = read_source()
    front_matter, chapters = split_document_blocks(blocks)

    doc = Document()
    configure_document(doc)
    add_cover(doc, title)

    def start_numbered_section(number_format: str):
        # Each part starts on a new page with numbering restarted at 1.
        section = doc.add_section(WD_SECTION_START.NEW_PAGE)
        apply_page_layout(section)
        configure_footer_page_number(section, number_format, 1)

    start_numbered_section('upperRoman')
    add_toc_page(doc)
    render_blocks(doc, front_matter, is_abstract=True)

    start_numbered_section('decimal')
    render_blocks(doc, chapters, is_abstract=False)

    doc.save(OUTPUT)
def main():
    """Build the document and print the path it was written to."""
    build_document()
    message = f'Generated: {OUTPUT}'
    print(message)
# Run the conversion only when this file is executed as a script.
if __name__ == '__main__':
    main()