Files
forsetsystem/backend/services/data_service.py
shenjianZ e63267cef6 feat: 将数据集从国外员工缺勤数据替换为中国企业缺勤模拟数据
- 新增中国企业员工缺勤模拟数据集生成脚本(generate_dataset.py),覆盖7个行业、180家企业、2600名员工
  - 重构 config.py,更新特征字段为中文名称,调整目标列、员工ID、行业类型等配置
  - 重构 clustering.py,简化聚类逻辑,更新聚类特征和群体命名(高压通勤型、健康波动型等)
  - 重构 feature_mining.py,更新相关性分析和群体比较维度(按行业、班次、婚姻状态等)
  - 新增 model_features.py 定义模型训练特征
  - 更新 preprocessing.py 和 train_model.py 适配新数据结构
  - 更新各 API 路由默认参数(model: random_forest, dimension: industry)
  - 前端更新主题样式和各视图组件适配中文字段
  - 更新系统名称为 China Enterprise Absence Analysis System
2026-03-11 10:46:58 +08:00

109 lines
4.6 KiB
Python

import config
from core.preprocessing import get_clean_data
class DataService:
    """Descriptive statistics over the cleaned absence dataset.

    The dataset is obtained from ``get_clean_data()`` on first access to
    :attr:`df` and then kept on the instance for later calls.
    """

    def __init__(self):
        # Populated lazily by the ``df`` property.
        self._df = None

    @property
    def df(self):
        """Return the cleaned dataset, loading it on first access."""
        if self._df is None:
            self._df = get_clean_data()
        return self._df

    def _grouped_hours(self, column):
        """Aggregate the target column per distinct value of ``column``.

        Returns a DataFrame indexed by the group key with the columns
        ``total_hours`` (sum), ``avg_hours`` (mean) and ``record_count``
        (count of non-null target values).
        """
        stats = self.df.groupby(column)[config.TARGET_COLUMN].agg(
            ['sum', 'mean', 'count']
        )
        stats.columns = ['total_hours', 'avg_hours', 'record_count']
        return stats

    @staticmethod
    def _row_for(stats, code):
        """Return the aggregate row whose group key equals ``code``, or None."""
        matches = stats[stats.index == code]
        return matches.iloc[0] if len(matches) else None

    def _hours_by_code(self, column, codes):
        """Build aligned per-code lists of totals, averages and counts.

        Codes with no records contribute ``0`` to every list so that the
        result always has one entry per requested code.
        """
        stats = self._grouped_hours(column)
        totals, averages, counts = [], [], []
        for code in codes:
            row = self._row_for(stats, code)
            if row is None:
                totals.append(0)
                averages.append(0)
                counts.append(0)
                continue
            totals.append(round(float(row['total_hours']), 1))
            averages.append(round(float(row['avg_hours']), 2))
            counts.append(int(row['record_count']))
        return totals, averages, counts

    def get_basic_stats(self, high_risk_threshold=8):
        """Return headline figures for the whole dataset.

        Args:
            high_risk_threshold: Records whose target value is strictly
                greater than this count as high risk. Defaults to 8.

        Returns:
            dict with ``total_records``, ``total_employees``,
            ``avg_absent_hours``, ``max_absent_hours``, ``min_absent_hours``,
            ``high_risk_ratio`` (fraction of records, 4 decimals) and
            ``industries_covered``. All values are zero for an empty dataset.
        """
        df = self.df
        total_records = len(df)
        if total_records == 0:
            # Avoid dividing by zero and returning NaN statistics.
            return {
                'total_records': 0,
                'total_employees': 0,
                'avg_absent_hours': 0.0,
                'max_absent_hours': 0.0,
                'min_absent_hours': 0.0,
                'high_risk_ratio': 0.0,
                'industries_covered': 0,
            }

        hours = df[config.TARGET_COLUMN]
        high_risk_count = int((hours > high_risk_threshold).sum())
        return {
            'total_records': total_records,
            'total_employees': int(df[config.EMPLOYEE_ID_COLUMN].nunique()),
            'avg_absent_hours': round(float(hours.mean()), 2),
            'max_absent_hours': round(float(hours.max()), 1),
            'min_absent_hours': round(float(hours.min()), 1),
            'high_risk_ratio': round(high_risk_count / total_records, 4),
            'industries_covered': int(df['所属行业'].nunique()),
        }

    def get_monthly_trend(self):
        """Return totals, averages and record counts for months 1 through 12.

        Months without records are reported with zeros.
        """
        month_codes = range(1, 13)
        totals, averages, counts = self._hours_by_code('缺勤月份', month_codes)
        return {
            'months': [f'{month}' for month in month_codes],
            'total_hours': totals,
            'avg_hours': averages,
            'record_counts': counts,
        }

    def get_weekday_distribution(self):
        """Return totals, averages and record counts for weekday codes 1-7.

        Display names come from ``config.WEEKDAY_NAMES``; a code with no
        configured name is shown as its string form. Weekdays without
        records are reported with zeros.
        """
        weekday_codes = list(range(1, 8))
        totals, averages, counts = self._hours_by_code('星期几', weekday_codes)
        return {
            'weekdays': [
                config.WEEKDAY_NAMES.get(code, str(code))
                for code in weekday_codes
            ],
            'weekday_codes': weekday_codes,
            'total_hours': totals,
            'avg_hours': averages,
            'record_counts': counts,
        }

    def get_reason_distribution(self):
        """Return record counts per leave-reason category, largest first.

        Each entry carries the category ``name``, its ``count`` and its
        ``percentage`` of all counted records (one decimal place).
        """
        counts = (
            self.df.groupby('请假原因大类')[config.TARGET_COLUMN]
            .count()
            .sort_values(ascending=False)
        )
        total = int(counts.sum())
        return {
            'reasons': [
                {
                    'name': name,
                    'count': int(count),
                    # total is 0 only when no group has a non-null target.
                    'percentage': (
                        round(float(count) / total * 100, 1) if total else 0.0
                    ),
                }
                for name, count in counts.items()
            ]
        }

    def get_season_distribution(self):
        """Return per-season aggregates for season codes 1-4.

        Seasons without records are omitted. ``percentage`` is relative to
        the record count over every season value present in the data.
        """
        stats = self._grouped_hours('季节')
        total_records = int(stats['record_count'].sum())
        seasons = []
        for code in (1, 2, 3, 4):
            row = self._row_for(stats, code)
            if row is None:
                continue
            record_count = int(row['record_count'])
            seasons.append({
                'code': code,
                'name': config.SEASON_NAMES.get(code, f'季节{code}'),
                'total_hours': round(float(row['total_hours']), 1),
                'avg_hours': round(float(row['avg_hours']), 2),
                'record_count': record_count,
                'percentage': (
                    round(record_count / total_records * 100, 1)
                    if total_records else 0.0
                ),
            })
        return {'seasons': seasons}
# Shared module-level instance. Constructing it does not load any data; the
# dataset is fetched on the first access to `data_service.df`.
data_service = DataService()