""" 测试不同特征维度对训练时间的影响 """ import time import pandas as pd import jieba from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.svm import SVC from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score import logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # 分类映射 CATEGORY_MAP = { 'ENTERTAINMENT': '娱乐', 'SPORTS': '体育', 'FINANCE': '财经', 'TECHNOLOGY': '科技', 'MILITARY': '军事', 'AUTOMOTIVE': '汽车', 'GOVERNMENT': '政务', 'HEALTH': '健康', 'AI': 'AI', 'HOUSE': '房产' } REVERSE_CATEGORY_MAP = {v: k for k, v in CATEGORY_MAP.items()} # 加载停用词 stopwords = set() try: with open('data/news_stopwords.txt', 'r', encoding='utf-8') as f: stopwords = set(line.strip() for line in f if line.strip()) except: pass def preprocess_text(text): """文本预处理""" if pd.isna(text) or text is None: return '' if not isinstance(text, str): text = str(text) text = ' '.join(text.split()) words = jieba.cut(text) result = [w for w in words if len(w) > 1 and w not in stopwords] return ' '.join(result) def test_dimension(max_features, ngram_range): """测试特定特征维度""" logger.info(f"\n{'='*60}") logger.info(f"测试配置: max_features={max_features}, ngram_range={ngram_range}") logger.info(f"{'='*60}") # 加载数据 start_time = time.time() df = pd.read_csv('data/processed/news_data.csv') df['text'] = df['title'] + ' ' + df['content'] df['processed_text'] = df['text'].apply(preprocess_text) df['category_code'] = df['category_name'].map(REVERSE_CATEGORY_MAP) df = df[df['category_code'].notna()] df['category_code'] = df['category_code'].astype(str) load_time = time.time() - start_time logger.info(f"数据加载: {load_time:.2f}秒") # 划分数据集 X = df['processed_text'].values y = df['category_code'].values X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42, stratify=y ) logger.info(f"训练集: {len(X_train)}条, 测试集: {len(X_test)}条") # TF-IDF特征提取 start_time = time.time() vectorizer = TfidfVectorizer( max_features=max_features, ngram_range=ngram_range, min_df=2 ) X_train_tfidf = vectorizer.fit_transform(X_train) X_test_tfidf = vectorizer.transform(X_test) tfidf_time = time.time() - start_time logger.info(f"TF-IDF特征提取: {tfidf_time:.2f}秒") logger.info(f"特征维度: {X_train_tfidf.shape[1]}") # SVM训练 start_time = time.time() classifier = SVC(kernel='linear', probability=True, random_state=42) classifier.fit(X_train_tfidf, y_train) train_time = time.time() - start_time logger.info(f"SVM训练: {train_time:.2f}秒") # 评估 start_time = time.time() y_pred = classifier.predict(X_test_tfidf) accuracy = accuracy_score(y_test, y_pred) eval_time = time.time() - start_time logger.info(f"模型评估: {eval_time:.2f}秒") logger.info(f"准确率: {accuracy:.4f}") total_time = load_time + tfidf_time + train_time + eval_time logger.info(f"总时间: {total_time:.2f}秒") return { 'max_features': max_features, 'ngram_range': ngram_range, 'feature_dim': X_train_tfidf.shape[1], 'load_time': load_time, 'tfidf_time': tfidf_time, 'train_time': train_time, 'eval_time': eval_time, 'total_time': total_time, 'accuracy': accuracy } def main(): """主函数""" logger.info("开始测试不同特征维度对训练时间的影响\n") # 测试配置 configs = [ (5000, (1, 2)), # 当前配置 (10000, (1, 2)), # 仅增加特征维度 (5000, (1, 3)), # 仅增加n-gram (10000, (1, 3)), # 同时增加 ] results = [] for max_features, ngram_range in configs: try: result = test_dimension(max_features, ngram_range) results.append(result) except Exception as e: logger.error(f"测试失败: {e}") # 打印对比结果 logger.info(f"\n{'='*80}") logger.info("测试结果对比") logger.info(f"{'='*80}") print(f"\n{'配置':<30} {'特征维度':<10} {'TF-IDF':<10} {'SVM训练':<10} {'总时间':<10} {'准确率':<10}") print("-" * 90) for r in results: config_str = f"max={r['max_features']}, ngram={r['ngram_range']}" print(f"{config_str:<30} {r['feature_dim']:<10} {r['tfidf_time']:<10.2f} {r['train_time']:<10.2f} {r['total_time']:<10.2f} {r['accuracy']:<10.4f}") # 计算时间增加倍数 if len(results) >= 2: baseline = results[0] logger.info(f"\n{'='*80}") logger.info("相对于基线配置的时间增加倍数") logger.info(f"{'='*80}") for r in results[1:]: config_str = f"max={r['max_features']}, ngram={r['ngram_range']}" tfidf_ratio = r['tfidf_time'] / baseline['tfidf_time'] train_ratio = r['train_time'] / baseline['train_time'] total_ratio = r['total_time'] / baseline['total_time'] acc_diff = (r['accuracy'] - baseline['accuracy']) * 100 print(f"\n{config_str}:") print(f" TF-IDF时间: {tfidf_ratio:.2f}x") print(f" SVM训练时间: {train_ratio:.2f}x") print(f" 总时间: {total_ratio:.2f}x") print(f" 准确率变化: {acc_diff:+.2f}%") if __name__ == '__main__': main()