feat: 将数据集从国外员工缺勤数据替换为中国企业缺勤模拟数据
- 新增中国企业员工缺勤模拟数据集生成脚本(generate_dataset.py),覆盖7个行业、180家企业、2600名员工
- 重构 config.py,更新特征字段为中文名称,调整目标列、员工ID、行业类型等配置
- 重构 clustering.py,简化聚类逻辑,更新聚类特征和群体命名(高压通勤型、健康波动型等)
- 重构 feature_mining.py,更新相关性分析和群体比较维度(按行业、班次、婚姻状态等)
- 新增 model_features.py 定义模型训练特征
- 更新 preprocessing.py 和 train_model.py 适配新数据结构
- 更新各 API 路由默认参数(model: random_forest, dimension: industry)
- 前端更新主题样式和各视图组件适配中文字段
- 更新系统名称为 China Enterprise Absence Analysis System
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
import os
|
||||
|
||||
import joblib
|
||||
import numpy as np
|
||||
|
||||
import config
|
||||
from core.feature_mining import get_correlation_for_heatmap, group_comparison
|
||||
@@ -10,109 +10,95 @@ class AnalysisService:
|
||||
# Constructor: start with an empty model cache and no loaded artifacts.
# All four attributes are filled in on demand by _ensure_models_loaded().
def __init__(self):
|
||||
# Maps a model name (e.g. 'random_forest') to the object returned by joblib.load.
self.models = {}
|
||||
# Contents of feature_names.pkl once loaded; None until then.
self.feature_names = None
|
||||
|
||||
# Contents of selected_features.pkl once loaded; None until then.
self.selected_features = None
|
||||
# Contents of training_metadata.pkl once loaded; empty dict until then.
self.training_metadata = {}
|
||||
|
||||
# Lazily load persisted models and feature artifacts from config.MODELS_DIR
# into self.models / self.feature_names / self.selected_features /
# self.training_metadata.
# NOTE(review): this span is a diff rendering without +/- markers, so two
# bodies appear back to back. The first (guarded by `if not self.models:`)
# looks like the pre-commit version and the second (starting at
# `if self.models:` / `return`) like its replacement -- confirm against the
# repository before relying on this reading.
def _ensure_models_loaded(self):
|
||||
# --- First body (presumably the removed version) ---
# Only attempt loading while the model cache is still empty.
if not self.models:
|
||||
model_files = {
|
||||
'random_forest': 'random_forest_model.pkl',
|
||||
'xgboost': 'xgboost_model.pkl',
|
||||
'lightgbm': 'lightgbm_model.pkl',
|
||||
}
|
||||
|
||||
# Best-effort load: a missing file is skipped, a failing load is printed.
for name, filename in model_files.items():
|
||||
model_path = os.path.join(config.MODELS_DIR, filename)
|
||||
if os.path.exists(model_path):
|
||||
try:
|
||||
self.models[name] = joblib.load(model_path)
|
||||
except Exception as e:
|
||||
print(f"Failed to load {name}: {e}")
|
||||
|
||||
feature_names_path = os.path.join(config.MODELS_DIR, 'feature_names.pkl')
|
||||
if os.path.exists(feature_names_path):
|
||||
self.feature_names = joblib.load(feature_names_path)
|
||||
|
||||
# --- Second body (presumably the added version) ---
# Return early once at least one model is cached.
if self.models:
|
||||
return
|
||||
# Training metadata is optional; when the file is absent the attribute
# keeps whatever value it already had.
metadata_path = os.path.join(config.MODELS_DIR, 'training_metadata.pkl')
|
||||
if os.path.exists(metadata_path):
|
||||
self.training_metadata = joblib.load(metadata_path)
|
||||
# Candidate model files; this list adds 'gradient_boosting' compared with
# the first body.
model_files = {
|
||||
'random_forest': 'random_forest_model.pkl',
|
||||
'xgboost': 'xgboost_model.pkl',
|
||||
'lightgbm': 'lightgbm_model.pkl',
|
||||
'gradient_boosting': 'gradient_boosting_model.pkl',
|
||||
}
|
||||
# When the metadata carries a truthy 'available_models' entry, only the
# model names it contains are attempted.
allowed_models = self.training_metadata.get('available_models')
|
||||
if allowed_models:
|
||||
model_files = {k: v for k, v in model_files.items() if k in allowed_models}
|
||||
# Best-effort load: a missing file is skipped, a failing load is printed.
for name, filename in model_files.items():
|
||||
path = os.path.join(config.MODELS_DIR, filename)
|
||||
if os.path.exists(path):
|
||||
try:
|
||||
self.models[name] = joblib.load(path)
|
||||
except Exception as exc:
|
||||
print(f'Failed to load model {name}: {exc}')
# Load the optional feature-name artifacts onto the attribute named in
# each (filename, attribute) pair.
for filename, attr in [('feature_names.pkl', 'feature_names'), ('selected_features.pkl', 'selected_features')]:
|
||||
path = os.path.join(config.MODELS_DIR, filename)
|
||||
if os.path.exists(path):
|
||||
try:
|
||||
setattr(self, attr, joblib.load(path))
|
||||
except Exception as exc:
|
||||
print(f'Failed to load artifact {filename}: {exc}')
|
||||
|
||||
# Return the highest-ranked feature importances of a loaded model as
# {'model_type': ..., 'features': [{'name', 'name_cn', 'importance', 'rank'}, ...]}.
# Falls back to _get_default_importance() when no model is available or the
# chosen model has no `feature_importances_` attribute.
# NOTE(review): this span is a diff rendering without +/- markers; removed and
# added lines are interleaved (sometimes mid-statement), so it is not runnable
# as shown. The inline notes below mark which variant each run of lines
# appears to belong to -- confirm against the repository.
def get_feature_importance(self, model_type='random_forest'):
|
||||
self._ensure_models_loaded()
|
||||
|
||||
if model_type not in self.models:
|
||||
# Variant A (presumably removed): fall back to the first loaded model,
# else to the static defaults.
if self.models:
|
||||
model_type = list(self.models.keys())[0]
|
||||
else:
|
||||
return self._get_default_importance()
|
||||
|
||||
# Variant B (presumably added): same fallback, using 'default' as the
# sentinel for "no model loaded".
model_type = next(iter(self.models), 'default')
|
||||
if model_type == 'default':
|
||||
return self._get_default_importance()
|
||||
model = self.models[model_type]
|
||||
|
||||
# Variant A continues: importance extraction wrapped in try/except.
try:
|
||||
if hasattr(model, 'feature_importances_'):
|
||||
importances = model.feature_importances_
|
||||
else:
|
||||
return self._get_default_importance()
|
||||
|
||||
feature_names = self.feature_names or [f'feature_{i}' for i in range(len(importances))]
|
||||
|
||||
if len(feature_names) != len(importances):
|
||||
feature_names = [f'feature_{i}' for i in range(len(importances))]
|
||||
|
||||
feature_importance = list(zip(feature_names, importances))
|
||||
feature_importance.sort(key=lambda x: x[1], reverse=True)
|
||||
|
||||
features = []
|
||||
for i, (name, imp) in enumerate(feature_importance[:15]):
|
||||
features.append({
|
||||
# Variant B continues: guard clause instead of if/else, no try/except.
if not hasattr(model, 'feature_importances_'):
|
||||
return self._get_default_importance()
|
||||
|
||||
importances = model.feature_importances_
|
||||
# Prefer selected_features over feature_names; generated feature_{idx}
# placeholders are used when the name count does not match.
feature_names = self.selected_features or self.feature_names or []
|
||||
if len(feature_names) != len(importances):
|
||||
feature_names = [f'feature_{idx}' for idx in range(len(importances))]
|
||||
# Keep the 15 entries with the largest importance, in descending order.
ranked = sorted(zip(feature_names, importances), key=lambda item: item[1], reverse=True)[:15]
|
||||
return {
|
||||
'model_type': model_type,
|
||||
'features': [
|
||||
{
|
||||
'name': name,
|
||||
# Display name: looked up in config.FEATURE_NAME_CN, falling back to the raw name.
'name_cn': config.FEATURE_NAME_CN.get(name, name),
|
||||
# Tail of Variant A's loop body and its return / except handler.
'importance': round(float(imp), 4),
|
||||
'rank': i + 1
|
||||
})
|
||||
|
||||
return {
|
||||
'model_type': model_type,
|
||||
'features': features
|
||||
}
|
||||
except Exception as e:
|
||||
print(f"Error getting feature importance: {e}")
|
||||
return self._get_default_importance()
|
||||
|
||||
# Tail of Variant B's comprehension: importance rounded to 4 decimals,
# rank is 1-based.
'importance': round(float(importance), 4),
|
||||
'rank': idx + 1,
|
||||
}
|
||||
for idx, (name, importance) in enumerate(ranked)
|
||||
],
|
||||
}
|
||||
|
||||
# Static fallback ranking returned with model_type 'default'; each entry gets
# 'name', 'name_cn' (via config.FEATURE_NAME_CN, falling back to the raw name),
# 'importance' and a 1-based 'rank'.
# NOTE(review): this span is a diff rendering without +/- markers. Two lists
# (`default_features`, then `defaults`) and two ways of building the payload
# are interleaved, so it is not runnable as shown. `default_features` with
# English column names looks like the removed version and `defaults` with
# Chinese field names like the added one -- confirm against the repository.
def _get_default_importance(self):
|
||||
# Presumably removed: defaults keyed by the previous dataset's column names.
default_features = [
|
||||
('Reason for absence', 0.25),
|
||||
('Transportation expense', 0.12),
|
||||
('Distance from Residence to Work', 0.10),
|
||||
('Service time', 0.08),
|
||||
('Age', 0.07),
|
||||
('Work load Average/day', 0.06),
|
||||
('Body mass index', 0.05),
|
||||
('Social drinker', 0.04),
|
||||
('Hit target', 0.03),
|
||||
('Son', 0.03),
|
||||
('Pet', 0.02),
|
||||
('Education', 0.02),
|
||||
('Social smoker', 0.01)
|
||||
# Presumably added: defaults keyed by the Chinese field names.
defaults = [
|
||||
('加班通勤压力指数', 0.24),
|
||||
('健康风险指数', 0.18),
|
||||
('请假类型', 0.12),
|
||||
('通勤时长分钟', 0.1),
|
||||
('月均加班时长', 0.08),
|
||||
('近90天缺勤次数', 0.07),
|
||||
('心理压力等级', 0.06),
|
||||
('家庭负担指数', 0.05),
|
||||
]
|
||||
|
||||
# Loop-and-append construction over `default_features` (older variant).
features = []
|
||||
for i, (name, imp) in enumerate(default_features):
|
||||
features.append({
|
||||
'name': name,
|
||||
'name_cn': config.FEATURE_NAME_CN.get(name, name),
|
||||
'importance': imp,
|
||||
'rank': i + 1
|
||||
})
|
||||
|
||||
return {
|
||||
'model_type': 'default',
|
||||
'features': features
|
||||
# Comprehension over `defaults` (newer variant); same keys per entry.
'features': [
|
||||
{
|
||||
'name': name,
|
||||
'name_cn': config.FEATURE_NAME_CN.get(name, name),
|
||||
'importance': importance,
|
||||
'rank': idx + 1,
|
||||
}
|
||||
for idx, (name, importance) in enumerate(defaults)
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
# Thin wrapper: returns the result of get_correlation_for_heatmap()
# (imported from core.feature_mining) unchanged.
def get_correlation(self):
|
||||
return get_correlation_for_heatmap()
|
||||
|
||||
|
||||
# Validate `dimension` against a fixed allow-list, then delegate to
# group_comparison(dimension) from core.feature_mining.
# Raises ValueError when `dimension` is not in the allow-list.
# NOTE(review): the commit message for this change mentions a new default
# dimension "industry" and comparisons by industry / shift / marital status,
# none of which appear in the list below -- confirm whether this allow-list
# still matches what group_comparison() accepts.
def get_group_comparison(self, dimension):
|
||||
valid_dimensions = ['drinker', 'smoker', 'education', 'children', 'pet']
|
||||
|
||||
if dimension not in valid_dimensions:
|
||||
raise ValueError(f"Invalid dimension: {dimension}. Must be one of {valid_dimensions}")
|
||||
|
||||
return group_comparison(dimension)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user