news-classifier/ml-module/config_hybrid.yaml

# Hybrid-strategy classifier configuration file
# Based on the graduation project's technical approach: BERT semantic features + a traditional classifier
# ================================
# Baseline model configuration
# ================================
baseline:
  # Model type: nb = Naive Bayes, svm = Support Vector Machine, lr = Logistic Regression
  model_type: svm
  # Training data path
  data_path: ./data/processed/news_data.csv
  # Test set proportion
  test_size: 0.2
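  # The baseline presumably trains these classifiers on traditional text
  # features (e.g. bag-of-words / TF-IDF) rather than BERT embeddings; the
  # exact featurization is defined by the ml-module code, not by this file.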
# ================================
# Hybrid classifier configuration
# ================================
hybrid:
  # Classifier type: svm = Support Vector Machine, lr = Logistic Regression
  classifier_type: svm
  # BERT pretrained model selection
  # Recommended options:
  # - bert-base-chinese: Google's original Chinese BERT
  # - hfl/chinese-bert-wwm-ext: HFL (HIT & iFLYTEK) whole-word-masking version (recommended)
  # - hfl/chinese-roberta-wwm-ext: HFL RoBERTa, best observed performance
  bert_model: hfl/chinese-roberta-wwm-ext
  # Training data path
  data_path: ./data/processed/news_data.csv
  # Test set proportion
  test_size: 0.2
  # SVM regularization parameter (only used when classifier_type is svm)
  # Larger C values penalize misclassification more heavily
  C: 1.0
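  # If the classifiers come from scikit-learn (an assumption; the library is
  # not named in this file), C is inversely proportional to regularization
  # strength: smaller values mean stronger regularization and a wider margin,
  # and 0.1 / 1.0 / 10.0 are common values to grid-search.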
# ================================
# BERT feature extraction configuration
# ================================
feature_extraction:
  # Maximum sequence length
  max_length: 512
  # Batch size (adjust according to available GPU memory)
  batch_size: 32
  # Whether to use the GPU
  use_gpu: true
  # Feature cache directory
  cache_dir: ./data/bert_features
# ================================
# Visualization configuration
# ================================
visualization:
  # Chart output directory
  output_dir: ./outputs/visualizations
  # Whether to display charts during training
  show: false
  # t-SNE parameters
  tsne_perplexity: 30
  tsne_n_iter: 1000
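  # Assuming scikit-learn's sklearn.manifold.TSNE is used here, these map to
  # its perplexity and n_iter arguments: perplexity roughly balances local vs.
  # global structure (values between 5 and 50 are typical), and n_iter is the
  # number of optimization iterations (1000 is the library default).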
# ================================
# Logging configuration
# ================================
logging:
  level: INFO
  file: ./training.log
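
# ================================
# Usage note
# ================================
# A minimal sketch of how this config is typically consumed, assuming the
# ml-module loads it with PyYAML (the script name below is hypothetical; the
# keys are the ones defined in this file):
#
#   # load_config.py (hypothetical)
#   import yaml
#   with open("config_hybrid.yaml", encoding="utf-8") as f:
#       cfg = yaml.safe_load(f)
#   bert_model = cfg["hybrid"]["bert_model"]              # hfl/chinese-roberta-wwm-ext
#   max_length = cfg["feature_extraction"]["max_length"]  # 512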