174 lines
5.6 KiB
Python
174 lines
5.6 KiB
Python
"""
|
|
测试不同特征维度对训练时间的影响
|
|
"""
|
|
|
|
import time
|
|
import pandas as pd
|
|
import jieba
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.svm import SVC
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.metrics import accuracy_score
|
|
import logging
|
|
|
|
# Configure root logging at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
|
|
|
# Category code -> Chinese display name.
CATEGORY_MAP = {
    'ENTERTAINMENT': '娱乐',
    'SPORTS': '体育',
    'FINANCE': '财经',
    'TECHNOLOGY': '科技',
    'MILITARY': '军事',
    'AUTOMOTIVE': '汽车',
    'GOVERNMENT': '政务',
    'HEALTH': '健康',
    'AI': 'AI',
    'HOUSE': '房产',
}

# Inverse lookup: Chinese display name -> category code.
REVERSE_CATEGORY_MAP = {
    display_name: code for code, display_name in CATEGORY_MAP.items()
}
|
|
|
|
# Load stop words, one per line, skipping blank lines.
# This is best effort: if the file is missing or unreadable the script keeps
# running with an empty stop-word set, but the failure is logged instead of
# being swallowed silently. Only I/O and decoding errors are caught, so
# KeyboardInterrupt and programming errors still propagate.
stopwords = set()
try:
    with open('data/news_stopwords.txt', 'r', encoding='utf-8') as f:
        stopwords = {line.strip() for line in f if line.strip()}
except (OSError, UnicodeDecodeError) as exc:
    logging.getLogger(__name__).warning("停用词文件加载失败, 将不使用停用词: %s", exc)
|
|
|
|
def preprocess_text(text):
    """Segment ``text`` with jieba and return the kept tokens joined by spaces.

    ``None`` and values that ``pd.isna`` reports as missing yield ``''``.
    Non-string values are converted with ``str``. Runs of whitespace are
    collapsed to single spaces before segmentation. Tokens of length 1 and
    tokens found in the module-level ``stopwords`` set are dropped.
    """
    if text is None or pd.isna(text):
        return ''

    raw = text if isinstance(text, str) else str(text)
    normalized = ' '.join(raw.split())

    kept = []
    for token in jieba.cut(normalized):
        if len(token) <= 1 or token in stopwords:
            continue
        kept.append(token)
    return ' '.join(kept)
|
|
|
|
def test_dimension(max_features, ngram_range):
    """Benchmark one TF-IDF configuration end to end and time each stage.

    Reads ``data/processed/news_data.csv``, preprocesses title + content,
    keeps rows whose ``category_name`` is a known display name, splits 80/20
    (stratified, ``random_state=42``), fits a ``TfidfVectorizer`` and a
    linear-kernel ``SVC`` (probability estimates enabled), and evaluates
    accuracy on the held-out split.

    Args:
        max_features: Vocabulary size cap passed to ``TfidfVectorizer``.
        ngram_range: ``(min_n, max_n)`` tuple passed to ``TfidfVectorizer``.

    Returns:
        dict with keys ``max_features``, ``ngram_range``, ``feature_dim``
        (actual number of TF-IDF columns), ``load_time``, ``tfidf_time``,
        ``train_time``, ``eval_time``, ``total_time`` (all in seconds) and
        ``accuracy``.
    """
    logger.info(f"\n{'='*60}")
    logger.info(f"测试配置: max_features={max_features}, ngram_range={ngram_range}")
    logger.info(f"{'='*60}")

    # Load and preprocess the data. perf_counter() is monotonic, so the
    # measured durations are not affected by system clock adjustments.
    start_time = time.perf_counter()
    df = pd.read_csv('data/processed/news_data.csv')
    # Treat a missing title/content as empty: otherwise one NaN field makes
    # the whole concatenation NaN and the other field's text is discarded.
    df['text'] = df['title'].fillna('') + ' ' + df['content'].fillna('')
    df['processed_text'] = df['text'].apply(preprocess_text)
    df['category_code'] = df['category_name'].map(REVERSE_CATEGORY_MAP)
    # Copy the filtered frame so the column assignment below operates on an
    # independent object and does not raise SettingWithCopyWarning.
    df = df[df['category_code'].notna()].copy()
    df['category_code'] = df['category_code'].astype(str)
    load_time = time.perf_counter() - start_time
    logger.info(f"数据加载: {load_time:.2f}秒")

    # Train/test split
    X = df['processed_text'].values
    y = df['category_code'].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    logger.info(f"训练集: {len(X_train)}条, 测试集: {len(X_test)}条")

    # TF-IDF feature extraction (fit on the training split only)
    start_time = time.perf_counter()
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        ngram_range=ngram_range,
        min_df=2,
    )
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)
    tfidf_time = time.perf_counter() - start_time
    logger.info(f"TF-IDF特征提取: {tfidf_time:.2f}秒")
    logger.info(f"特征维度: {X_train_tfidf.shape[1]}")

    # SVM training
    start_time = time.perf_counter()
    classifier = SVC(kernel='linear', probability=True, random_state=42)
    classifier.fit(X_train_tfidf, y_train)
    train_time = time.perf_counter() - start_time
    logger.info(f"SVM训练: {train_time:.2f}秒")

    # Evaluation
    start_time = time.perf_counter()
    y_pred = classifier.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    eval_time = time.perf_counter() - start_time
    logger.info(f"模型评估: {eval_time:.2f}秒")
    logger.info(f"准确率: {accuracy:.4f}")

    total_time = load_time + tfidf_time + train_time + eval_time
    logger.info(f"总时间: {total_time:.2f}秒")

    return {
        'max_features': max_features,
        'ngram_range': ngram_range,
        'feature_dim': X_train_tfidf.shape[1],
        'load_time': load_time,
        'tfidf_time': tfidf_time,
        'train_time': train_time,
        'eval_time': eval_time,
        'total_time': total_time,
        'accuracy': accuracy,
    }
|
|
|
|
def main():
    """Run every benchmark configuration and print a comparison.

    Each configuration is passed to ``test_dimension``; a failing
    configuration is logged with its traceback and skipped. A results table
    is printed for the successful runs. Time ratios and the accuracy change
    are printed relative to the FIRST configuration only; if that baseline
    run failed, the relative comparison is skipped rather than silently
    using a different configuration as the baseline.
    """
    logger.info("开始测试不同特征维度对训练时间的影响\n")

    # Configurations to test: (max_features, ngram_range)
    configs = [
        (5000, (1, 2)),   # current configuration (baseline)
        (10000, (1, 2)),  # larger feature dimension only
        (5000, (1, 3)),   # larger n-gram range only
        (10000, (1, 3)),  # both increased
    ]

    def _ratio(value, base):
        """Return value / base, or NaN when the baseline measurement is zero."""
        return value / base if base else float('nan')

    results = []
    baseline = None
    for index, (max_features, ngram_range) in enumerate(configs):
        try:
            result = test_dimension(max_features, ngram_range)
        except Exception as e:
            # Keep going with the remaining configs, but record the traceback.
            logger.exception("测试失败: %s", e)
            continue
        results.append(result)
        if index == 0:
            baseline = result

    # Print the comparison table
    logger.info(f"\n{'='*80}")
    logger.info("测试结果对比")
    logger.info(f"{'='*80}")

    print(f"\n{'配置':<30} {'特征维度':<10} {'TF-IDF':<10} {'SVM训练':<10} {'总时间':<10} {'准确率':<10}")
    print("-" * 90)

    for r in results:
        config_str = f"max={r['max_features']}, ngram={r['ngram_range']}"
        print(f"{config_str:<30} {r['feature_dim']:<10} {r['tfidf_time']:<10.2f} {r['train_time']:<10.2f} {r['total_time']:<10.2f} {r['accuracy']:<10.4f}")

    # Relative slow-down versus the baseline configuration
    if baseline is None:
        if results:
            logger.warning("基线配置测试失败, 跳过倍数对比")
        return

    # When the baseline succeeded it is always results[0].
    others = results[1:]
    if not others:
        return

    logger.info(f"\n{'='*80}")
    logger.info("相对于基线配置的时间增加倍数")
    logger.info(f"{'='*80}")

    for r in others:
        config_str = f"max={r['max_features']}, ngram={r['ngram_range']}"
        tfidf_ratio = _ratio(r['tfidf_time'], baseline['tfidf_time'])
        train_ratio = _ratio(r['train_time'], baseline['train_time'])
        total_ratio = _ratio(r['total_time'], baseline['total_time'])
        # Accuracy difference expressed in percentage points.
        acc_diff = (r['accuracy'] - baseline['accuracy']) * 100

        print(f"\n{config_str}:")
        print(f"  TF-IDF时间: {tfidf_ratio:.2f}x")
        print(f"  SVM训练时间: {train_ratio:.2f}x")
        print(f"  总时间: {total_ratio:.2f}x")
        print(f"  准确率变化: {acc_diff:+.2f}%")
|
|
|
|
# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()