news-classifier/ml-module/test_feature_dimension.py

174 lines
5.6 KiB
Python

"""
测试不同特征维度对训练时间的影响
"""
import time
import pandas as pd
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import logging
# Configure the root logger at INFO level on import and create the
# module-level logger used by every function below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Category mapping: internal category code -> display name as it appears in
# the dataset's ``category_name`` column.
CATEGORY_MAP = {
    'ENTERTAINMENT': '娱乐',
    'SPORTS': '体育',
    'FINANCE': '财经',
    'TECHNOLOGY': '科技',
    'MILITARY': '军事',
    'AUTOMOTIVE': '汽车',
    'GOVERNMENT': '政务',
    'HEALTH': '健康',
    'AI': 'AI',
    'HOUSE': '房产',
}

# Inverse lookup: display name -> internal category code.
REVERSE_CATEGORY_MAP = {
    display_name: code for code, display_name in CATEGORY_MAP.items()
}
# Load stop words
def load_stopwords(path='data/news_stopwords.txt'):
    """Read a stop-word list (one word per line) into a set.

    Blank lines are skipped and surrounding whitespace is stripped.
    Loading is best-effort: if the file is missing, unreadable or not
    valid UTF-8, a warning is logged and an empty set is returned so the
    rest of the script still runs (just without stop-word filtering).

    Args:
        path: Location of the UTF-8 encoded stop-word file.

    Returns:
        A set of stop words; empty when the file could not be loaded.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return {word for word in map(str.strip, f) if word}
    except (OSError, UnicodeDecodeError) as exc:
        # Only I/O and decoding failures are expected here; anything else
        # (e.g. KeyboardInterrupt) must not be silently swallowed.
        logging.getLogger(__name__).warning(
            "停用词文件加载失败，将不过滤停用词: %s", exc
        )
        return set()


stopwords = load_stopwords()
def preprocess_text(text):
    """Turn raw text into a space-separated string of jieba tokens.

    Missing values (``None`` or anything ``pd.isna`` reports as missing)
    become the empty string. Non-string values are converted with
    ``str()``. Runs of whitespace are collapsed to a single space before
    segmentation, and tokens that are one character long or listed in the
    module-level ``stopwords`` set are dropped.
    """
    if text is None or pd.isna(text):
        return ''

    raw = text if isinstance(text, str) else str(text)
    collapsed = ' '.join(raw.split())

    return ' '.join(
        token
        for token in jieba.cut(collapsed)
        if len(token) > 1 and token not in stopwords
    )
def test_dimension(max_features, ngram_range):
    """Benchmark one TF-IDF configuration end to end and return its timings.

    Reads ``data/processed/news_data.csv``, keeps the rows whose
    ``category_name`` is a display name in ``CATEGORY_MAP``, tokenises
    title + content with ``preprocess_text``, performs a stratified 80/20
    split, fits a ``TfidfVectorizer`` and a linear ``SVC``, and measures
    test-set accuracy. Each stage is timed separately.

    NOTE(review): this is a benchmark helper, not a unit test; because its
    name starts with ``test_`` a test runner may try to collect it.

    Args:
        max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.
        ngram_range: Forwarded to ``TfidfVectorizer(ngram_range=...)``.

    Returns:
        dict with keys ``max_features``, ``ngram_range``, ``feature_dim``,
        ``load_time``, ``tfidf_time``, ``train_time``, ``eval_time``,
        ``total_time`` (all times in seconds) and ``accuracy``.
    """
    logger.info(f"\n{'='*60}")
    logger.info(f"测试配置: max_features={max_features}, ngram_range={ngram_range}")
    logger.info(f"{'='*60}")

    # Load data. perf_counter() is a monotonic, high-resolution clock meant
    # for measuring intervals; time.time() can jump with wall-clock changes.
    start_time = time.perf_counter()
    df = pd.read_csv('data/processed/news_data.csv')
    # Drop rows with unknown categories first so they are not tokenised for
    # nothing, and copy so the column assignments below operate on an
    # independent frame rather than a slice of the original.
    df['category_code'] = df['category_name'].map(REVERSE_CATEGORY_MAP)
    df = df[df['category_code'].notna()].copy()
    df['category_code'] = df['category_code'].astype(str)
    # NaN + str is NaN in pandas, so a missing title or content would blank
    # the whole document; fill missing parts with '' to keep the other half.
    title = df['title'].fillna('').astype(str)
    content = df['content'].fillna('').astype(str)
    df['text'] = title + ' ' + content
    df['processed_text'] = df['text'].apply(preprocess_text)
    load_time = time.perf_counter() - start_time
    logger.info(f"数据加载: {load_time:.2f}")

    # Split into train / test sets, preserving the class distribution.
    X = df['processed_text'].values
    y = df['category_code'].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    logger.info(f"训练集: {len(X_train)}条, 测试集: {len(X_test)}")

    # TF-IDF feature extraction (fit on the training split only).
    start_time = time.perf_counter()
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        ngram_range=ngram_range,
        min_df=2
    )
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)
    tfidf_time = time.perf_counter() - start_time
    logger.info(f"TF-IDF特征提取: {tfidf_time:.2f}")
    logger.info(f"特征维度: {X_train_tfidf.shape[1]}")

    # SVM training. probability=True makes fit() run an extra internal
    # cross-validation for probability calibration, which is part of the
    # measured training time; it is kept so the benchmark measures the same
    # classifier settings as before.
    start_time = time.perf_counter()
    classifier = SVC(kernel='linear', probability=True, random_state=42)
    classifier.fit(X_train_tfidf, y_train)
    train_time = time.perf_counter() - start_time
    logger.info(f"SVM训练: {train_time:.2f}")

    # Evaluation on the held-out split.
    start_time = time.perf_counter()
    y_pred = classifier.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    eval_time = time.perf_counter() - start_time
    logger.info(f"模型评估: {eval_time:.2f}")
    logger.info(f"准确率: {accuracy:.4f}")

    total_time = load_time + tfidf_time + train_time + eval_time
    logger.info(f"总时间: {total_time:.2f}")

    return {
        'max_features': max_features,
        'ngram_range': ngram_range,
        'feature_dim': X_train_tfidf.shape[1],
        'load_time': load_time,
        'tfidf_time': tfidf_time,
        'train_time': train_time,
        'eval_time': eval_time,
        'total_time': total_time,
        'accuracy': accuracy
    }
def main():
    """Run the benchmark for every configuration and print a comparison.

    Each configuration is passed to ``test_dimension``; a failing
    configuration is logged with its traceback and skipped. A table of
    absolute timings is printed for all successful runs, followed by the
    time ratios and accuracy delta of each run relative to the baseline
    (the first entry of ``configs``). If the baseline itself failed, the
    relative comparison is skipped rather than silently using another
    configuration as the reference.
    """
    logger.info("开始测试不同特征维度对训练时间的影响\n")

    # Test configurations; the first entry is the baseline.
    configs = [
        (5000, (1, 2)),   # current configuration
        (10000, (1, 2)),  # larger feature dimension only
        (5000, (1, 3)),   # longer n-grams only
        (10000, (1, 3)),  # both increased
    ]

    results = []
    baseline = None
    for index, (max_features, ngram_range) in enumerate(configs):
        try:
            result = test_dimension(max_features, ngram_range)
        except Exception as e:
            # One broken configuration must not abort the whole benchmark;
            # logger.exception keeps the traceback for diagnosis.
            logger.exception(f"测试失败: {e}")
            continue
        results.append(result)
        if index == 0:
            baseline = result

    # Print the absolute results side by side.
    logger.info(f"\n{'='*80}")
    logger.info("测试结果对比")
    logger.info(f"{'='*80}")
    print(f"\n{'配置':<30} {'特征维度':<10} {'TF-IDF':<10} {'SVM训练':<10} {'总时间':<10} {'准确率':<10}")
    print("-" * 90)
    for r in results:
        config_str = f"max={r['max_features']}, ngram={r['ngram_range']}"
        print(f"{config_str:<30} {r['feature_dim']:<10} {r['tfidf_time']:<10.2f} {r['train_time']:<10.2f} {r['total_time']:<10.2f} {r['accuracy']:<10.4f}")

    # Relative slowdown versus the baseline configuration.
    if baseline is None:
        if results:
            logger.warning("基线配置测试失败，无法计算相对于基线的时间增加倍数")
        return

    comparisons = [r for r in results if r is not baseline]
    if not comparisons:
        return

    logger.info(f"\n{'='*80}")
    logger.info("相对于基线配置的时间增加倍数")
    logger.info(f"{'='*80}")
    for r in comparisons:
        config_str = f"max={r['max_features']}, ngram={r['ngram_range']}"
        tfidf_ratio = r['tfidf_time'] / baseline['tfidf_time']
        train_ratio = r['train_time'] / baseline['train_time']
        total_ratio = r['total_time'] / baseline['total_time']
        # Accuracy difference in percentage points.
        acc_diff = (r['accuracy'] - baseline['accuracy']) * 100
        print(f"\n{config_str}:")
        print(f" TF-IDF时间: {tfidf_ratio:.2f}x")
        print(f" SVM训练时间: {train_ratio:.2f}x")
        print(f" 总时间: {total_ratio:.2f}x")
        print(f" 准确率变化: {acc_diff:+.2f}%")
# Run the benchmark only when executed as a script, not when imported.
if __name__ == '__main__':
    main()