feat: 将数据集从国外员工缺勤数据替换为中国企业缺勤模拟数据
- 新增中国企业员工缺勤模拟数据集生成脚本(generate_dataset.py),覆盖7个行业、180家企业、2600名员工
- 重构 config.py,更新特征字段为中文名称,调整目标列、员工ID、行业类型等配置
- 重构 clustering.py,简化聚类逻辑,更新聚类特征和群体命名(高压通勤型、健康波动型等)
- 重构 feature_mining.py,更新相关性分析和群体比较维度(按行业、班次、婚姻状态等)
- 新增 model_features.py 定义模型训练特征
- 更新 preprocessing.py 和 train_model.py 适配新数据结构
- 更新各 API 路由默认参数(model: random_forest, dimension: industry)
- 前端更新主题样式和各视图组件适配中文字段
- 更新系统名称为 China Enterprise Absence Analysis System
This commit is contained in:
@@ -1,6 +1,3 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
import config
|
||||
from core.preprocessing import get_clean_data
|
||||
|
||||
@@ -8,154 +5,103 @@ from core.preprocessing import get_clean_data
|
||||
class DataService:
|
||||
def __init__(self):
|
||||
self._df = None
|
||||
|
||||
|
||||
@property
|
||||
def df(self):
|
||||
if self._df is None:
|
||||
self._df = get_clean_data()
|
||||
return self._df
|
||||
|
||||
|
||||
def get_basic_stats(self):
|
||||
df = self.df
|
||||
|
||||
total_records = len(df)
|
||||
total_employees = df['ID'].nunique()
|
||||
total_absent_hours = df['Absenteeism time in hours'].sum()
|
||||
avg_absent_hours = round(df['Absenteeism time in hours'].mean(), 2)
|
||||
max_absent_hours = int(df['Absenteeism time in hours'].max())
|
||||
min_absent_hours = int(df['Absenteeism time in hours'].min())
|
||||
|
||||
high_risk_count = len(df[df['Absenteeism time in hours'] > 8])
|
||||
high_risk_ratio = round(high_risk_count / total_records, 4)
|
||||
|
||||
total_employees = df[config.EMPLOYEE_ID_COLUMN].nunique()
|
||||
avg_absent_hours = round(df[config.TARGET_COLUMN].mean(), 2)
|
||||
max_absent_hours = round(float(df[config.TARGET_COLUMN].max()), 1)
|
||||
min_absent_hours = round(float(df[config.TARGET_COLUMN].min()), 1)
|
||||
high_risk_count = len(df[df[config.TARGET_COLUMN] > 8])
|
||||
return {
|
||||
'total_records': total_records,
|
||||
'total_employees': total_employees,
|
||||
'total_absent_hours': int(total_absent_hours),
|
||||
'avg_absent_hours': avg_absent_hours,
|
||||
'max_absent_hours': max_absent_hours,
|
||||
'min_absent_hours': min_absent_hours,
|
||||
'high_risk_ratio': high_risk_ratio
|
||||
'high_risk_ratio': round(high_risk_count / total_records, 4),
|
||||
'industries_covered': int(df['所属行业'].nunique()),
|
||||
}
|
||||
|
||||
|
||||
def get_monthly_trend(self):
|
||||
df = self.df
|
||||
|
||||
monthly = df.groupby('Month of absence').agg({
|
||||
'Absenteeism time in hours': ['sum', 'mean', 'count']
|
||||
}).reset_index()
|
||||
|
||||
monthly = df.groupby('缺勤月份').agg({config.TARGET_COLUMN: ['sum', 'mean', 'count']}).reset_index()
|
||||
monthly.columns = ['month', 'total_hours', 'avg_hours', 'record_count']
|
||||
|
||||
months = ['1月', '2月', '3月', '4月', '5月', '6月',
|
||||
'7月', '8月', '9月', '10月', '11月', '12月']
|
||||
|
||||
result = {
|
||||
'months': months,
|
||||
'total_hours': [],
|
||||
'avg_hours': [],
|
||||
'record_counts': []
|
||||
}
|
||||
|
||||
for i in range(1, 13):
|
||||
row = monthly[monthly['month'] == i]
|
||||
if len(row) > 0:
|
||||
result['total_hours'].append(int(row['total_hours'].values[0]))
|
||||
result = {'months': [], 'total_hours': [], 'avg_hours': [], 'record_counts': []}
|
||||
for month in range(1, 13):
|
||||
row = monthly[monthly['month'] == month]
|
||||
result['months'].append(f'{month}月')
|
||||
if len(row):
|
||||
result['total_hours'].append(round(float(row['total_hours'].values[0]), 1))
|
||||
result['avg_hours'].append(round(float(row['avg_hours'].values[0]), 2))
|
||||
result['record_counts'].append(int(row['record_count'].values[0]))
|
||||
else:
|
||||
result['total_hours'].append(0)
|
||||
result['avg_hours'].append(0)
|
||||
result['record_counts'].append(0)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def get_weekday_distribution(self):
|
||||
df = self.df
|
||||
|
||||
weekday = df.groupby('Day of the week').agg({
|
||||
'Absenteeism time in hours': ['sum', 'mean', 'count']
|
||||
}).reset_index()
|
||||
|
||||
weekday = df.groupby('星期几').agg({config.TARGET_COLUMN: ['sum', 'mean', 'count']}).reset_index()
|
||||
weekday.columns = ['weekday', 'total_hours', 'avg_hours', 'record_count']
|
||||
|
||||
result = {
|
||||
'weekdays': [],
|
||||
'weekday_codes': [],
|
||||
'total_hours': [],
|
||||
'avg_hours': [],
|
||||
'record_counts': []
|
||||
}
|
||||
|
||||
for code in [2, 3, 4, 5, 6]:
|
||||
result = {'weekdays': [], 'weekday_codes': [], 'total_hours': [], 'avg_hours': [], 'record_counts': []}
|
||||
for code in range(1, 8):
|
||||
row = weekday[weekday['weekday'] == code]
|
||||
result['weekdays'].append(config.WEEKDAY_NAMES.get(code, str(code)))
|
||||
result['weekday_codes'].append(code)
|
||||
if len(row) > 0:
|
||||
result['total_hours'].append(int(row['total_hours'].values[0]))
|
||||
if len(row):
|
||||
result['total_hours'].append(round(float(row['total_hours'].values[0]), 1))
|
||||
result['avg_hours'].append(round(float(row['avg_hours'].values[0]), 2))
|
||||
result['record_counts'].append(int(row['record_count'].values[0]))
|
||||
else:
|
||||
result['total_hours'].append(0)
|
||||
result['avg_hours'].append(0)
|
||||
result['record_counts'].append(0)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def get_reason_distribution(self):
|
||||
df = self.df
|
||||
|
||||
reason = df.groupby('Reason for absence').agg({
|
||||
'Absenteeism time in hours': 'count'
|
||||
}).reset_index()
|
||||
|
||||
reason.columns = ['code', 'count']
|
||||
reason = df.groupby('请假原因大类').agg({config.TARGET_COLUMN: 'count'}).reset_index()
|
||||
reason.columns = ['name', 'count']
|
||||
reason = reason.sort_values('count', ascending=False)
|
||||
|
||||
total = reason['count'].sum()
|
||||
|
||||
result = {
|
||||
'reasons': []
|
||||
return {
|
||||
'reasons': [
|
||||
{
|
||||
'name': row['name'],
|
||||
'count': int(row['count']),
|
||||
'percentage': round(float(row['count']) / total * 100, 1),
|
||||
}
|
||||
for _, row in reason.iterrows()
|
||||
]
|
||||
}
|
||||
|
||||
for _, row in reason.iterrows():
|
||||
code = int(row['code'])
|
||||
result['reasons'].append({
|
||||
'code': code,
|
||||
'name': config.REASON_NAMES.get(code, f'原因{code}'),
|
||||
'count': int(row['count']),
|
||||
'percentage': round(row['count'] / total * 100, 1)
|
||||
})
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def get_season_distribution(self):
|
||||
df = self.df
|
||||
|
||||
season = df.groupby('Seasons').agg({
|
||||
'Absenteeism time in hours': ['sum', 'mean', 'count']
|
||||
}).reset_index()
|
||||
|
||||
season = df.groupby('季节').agg({config.TARGET_COLUMN: ['sum', 'mean', 'count']}).reset_index()
|
||||
season.columns = ['season', 'total_hours', 'avg_hours', 'record_count']
|
||||
|
||||
total_records = season['record_count'].sum()
|
||||
|
||||
result = {
|
||||
'seasons': []
|
||||
}
|
||||
|
||||
result = {'seasons': []}
|
||||
for code in [1, 2, 3, 4]:
|
||||
row = season[season['season'] == code]
|
||||
if len(row) > 0:
|
||||
result['seasons'].append({
|
||||
'code': int(code),
|
||||
'name': config.SEASON_NAMES.get(code, f'季节{code}'),
|
||||
'total_hours': int(row['total_hours'].values[0]),
|
||||
'avg_hours': round(float(row['avg_hours'].values[0]), 2),
|
||||
'record_count': int(row['record_count'].values[0]),
|
||||
'percentage': round(row['record_count'].values[0] / total_records * 100, 1)
|
||||
})
|
||||
|
||||
if not len(row):
|
||||
continue
|
||||
result['seasons'].append({
|
||||
'code': code,
|
||||
'name': config.SEASON_NAMES.get(code, f'季节{code}'),
|
||||
'total_hours': round(float(row['total_hours'].values[0]), 1),
|
||||
'avg_hours': round(float(row['avg_hours'].values[0]), 2),
|
||||
'record_count': int(row['record_count'].values[0]),
|
||||
'percentage': round(float(row['record_count'].values[0]) / total_records * 100, 1),
|
||||
})
|
||||
return result
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user