Files
forsetsystem/backend/core/clustering.py
shenjianZ e63267cef6 feat: 将数据集从国外员工缺勤数据替换为中国企业缺勤模拟数据
- 新增中国企业员工缺勤模拟数据集生成脚本(generate_dataset.py),覆盖7个行业、180家企业、2600名员工
  - 重构 config.py,更新特征字段为中文名称,调整目标列、员工ID、行业类型等配置
  - 重构 clustering.py,简化聚类逻辑,更新聚类特征和群体命名(高压通勤型、健康波动型等)
  - 重构 feature_mining.py,更新相关性分析和群体比较维度(按行业、班次、婚姻状态等)
  - 新增 model_features.py 定义模型训练特征
  - 更新 preprocessing.py 和 train_model.py 适配新数据结构
  - 更新各 API 路由默认参数(model: random_forest, dimension: industry)
  - 前端更新主题样式和各视图组件适配中文字段
  - 更新系统名称为 China Enterprise Absence Analysis System
2026-03-11 10:46:58 +08:00

134 lines
5.2 KiB
Python

import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import config
from core.preprocessing import get_clean_data
class KMeansAnalyzer:
    """K-Means clustering of employee rows over a fixed list of numeric columns.

    Rows are loaded through ``get_clean_data()``, the columns named in
    ``feature_cols`` are min-max scaled, a ``KMeans`` model is fitted, and the
    outcome is exposed as three dictionary payloads: a per-cluster summary
    (``get_cluster_results``), scaled centers for a profile chart
    (``get_cluster_profile``) and per-row points (``get_scatter_data``).

    Instance state:
        n_clusters   -- cluster count of the last successful fit (or the
                        constructor value before any fit)
        model        -- fitted ``KMeans`` instance, ``None`` before ``fit``
        scaler       -- ``MinMaxScaler`` fitted on the same data as ``model``
        labels       -- cluster label per row from the last fit, else ``None``
        feature_cols -- column names fed to the model, in model column order
    """

    # Base colours for cluster ids 0-4; reused cyclically for higher ids.
    _CLUSTER_PALETTE = ('#5B8FF9', '#61DDAA', '#F6BD16', '#E8684A', '#6DC8EC')

    def __init__(self, n_clusters=3):
        """Create an unfitted analyzer.

        Args:
            n_clusters: Cluster count used by ``fit`` when none is passed.
        """
        self.n_clusters = n_clusters
        self.model = None
        self.scaler = MinMaxScaler()
        self.labels = None
        # NOTE: _generate_cluster_names unpacks centers positionally, so the
        # order of this list must stay in sync with that method.
        self.feature_cols = [
            '年龄',
            '司龄年数',
            '月均加班时长',
            '通勤时长分钟',
            'BMI',
            '缺勤时长(小时)',
        ]

    def fit(self, n_clusters=None):
        """Fit the scaler and the K-Means model on freshly loaded data.

        Args:
            n_clusters: Optional cluster count. A falsy value (``None`` or
                ``0``) keeps the current ``self.n_clusters``.

        Returns:
            The fitted ``KMeans`` model, which is also stored on ``self.model``.
        """
        requested = n_clusters if n_clusters else self.n_clusters
        frame = get_clean_data().reset_index(drop=True)
        scaled = self.scaler.fit_transform(frame[self.feature_cols].values)
        model = KMeans(
            n_clusters=requested,
            random_state=config.RANDOM_STATE,
            n_init=10,
        )
        labels = model.fit_predict(scaled)
        # Commit only after a successful fit so that a failure cannot leave an
        # unfitted model behind that later calls would mistake for a fitted one.
        self.n_clusters = requested
        self.model = model
        self.labels = labels
        return self.model

    def _ensure_fitted(self, n_clusters):
        """Fit the model unless one already exists for ``n_clusters``."""
        if self.model is None or self.n_clusters != n_clusters:
            self.fit(n_clusters)

    def _cluster_colors(self):
        """Map cluster id strings to colours, covering at least ids 0-4.

        With more clusters than palette entries the palette is cycled so every
        cluster id still has a colour.
        """
        palette = self._CLUSTER_PALETTE
        size = max(len(palette), self.n_clusters)
        return {str(i): palette[i % len(palette)] for i in range(size)}

    def get_cluster_results(self, n_clusters=3):
        """Summarise every non-empty cluster.

        Args:
            n_clusters: Desired cluster count; triggers a (re)fit when it
                differs from the currently fitted one.

        Returns:
            ``{'n_clusters': int, 'clusters': [...]}`` where each cluster entry
            holds its id, display name, member count, percentage of all rows
            (one decimal), the center in original feature units (two decimals)
            and a description string.
        """
        self._ensure_fitted(n_clusters)
        # Centers are learnt in scaled space; convert back to feature units.
        centers = self.scaler.inverse_transform(self.model.cluster_centers_)
        names = self._generate_cluster_names(centers)
        cluster_ids, sizes = np.unique(self.labels, return_counts=True)
        total = len(self.labels)

        clusters = []
        for raw_id, size in zip(cluster_ids, sizes):
            cid = int(raw_id)
            clusters.append({
                'id': cid,
                'name': names.get(cid, f'群体{cid + 1}'),
                'member_count': int(size),
                # Cast so the payload contains only built-in Python numbers.
                'percentage': float(round(size / total * 100, 1)),
                'center': {
                    feature: round(float(value), 2)
                    for feature, value in zip(self.feature_cols, centers[cid])
                },
                'description': self._generate_description(names.get(cid, '')),
            })
        return {'n_clusters': self.n_clusters, 'clusters': clusters}

    def get_cluster_profile(self, n_clusters=3):
        """Return the scaled cluster centers for a per-dimension profile chart.

        Args:
            n_clusters: Desired cluster count; triggers a (re)fit when it
                differs from the currently fitted one.

        Returns:
            A dict with short dimension labels, the matching column names and,
            per cluster, its id, name and min-max-scaled center values rounded
            to two decimals.
        """
        self._ensure_fitted(n_clusters)
        centers_scaled = self.model.cluster_centers_
        # Naming thresholds are expressed in original units, not scaled ones.
        names = self._generate_cluster_names(
            self.scaler.inverse_transform(centers_scaled)
        )
        clusters = [
            {
                'id': idx,
                'name': names.get(idx, f'群体{idx + 1}'),
                'values': [round(float(v), 2) for v in centers_scaled[idx]],
            }
            for idx in range(self.n_clusters)
        ]
        return {
            'dimensions': ['年龄', '司龄', '加班', '通勤', 'BMI', '缺勤'],
            'dimension_keys': self.feature_cols,
            'clusters': clusters,
        }

    def get_scatter_data(self, n_clusters=3, x_axis='月均加班时长',
                         y_axis='缺勤时长(小时)', max_points=500):
        """Return per-row scatter points tagged with their cluster id.

        Args:
            n_clusters: Desired cluster count; triggers a (re)fit when it
                differs from the currently fitted one.
            x_axis: Column for the x value; falls back to ``'月均加班时长'`` when
                the column does not exist.
            y_axis: Column for the y value; falls back to
                ``config.TARGET_COLUMN`` when the column does not exist.
            max_points: Maximum number of leading rows to return (default 500,
                the previously hard-coded cap). ``None`` returns all rows.

        Returns:
            A dict with the resolved axis keys, their display names from
            ``config.FEATURE_NAME_CN``, the list of points and a colour per
            cluster id.

        Note:
            Axis values are passed through ``float()``; a column whose values
            cannot be converted will raise from that call.
        """
        self._ensure_fitted(n_clusters)
        df = get_clean_data().reset_index(drop=True)
        if x_axis not in df.columns:
            x_axis = '月均加班时长'
        if y_axis not in df.columns:
            y_axis = config.TARGET_COLUMN

        # Labels come from the fit-time load; guard against a length mismatch
        # with this fresh load, and only build the rows that are returned.
        count = min(len(df), len(self.labels))
        if max_points is not None:
            count = min(count, max(max_points, 0))

        id_col = config.EMPLOYEE_ID_COLUMN
        points = [
            {
                'employee_id': str(row[id_col]),
                'x': float(row[x_axis]),
                'y': float(row[y_axis]),
                'cluster_id': int(label),
            }
            for (_, row), label in zip(
                df.iloc[:count].iterrows(), self.labels[:count]
            )
        ]
        return {
            'x_axis': x_axis,
            'x_axis_name': config.FEATURE_NAME_CN.get(x_axis, x_axis),
            'y_axis': y_axis,
            'y_axis_name': config.FEATURE_NAME_CN.get(y_axis, y_axis),
            'points': points,
            'cluster_colors': self._cluster_colors(),
        }

    def _generate_cluster_names(self, centers):
        """Assign a display name to each cluster center by threshold rules.

        Args:
            centers: Iterable of six-value centers in original feature units,
                ordered like ``feature_cols`` (the first value is unused).

        Returns:
            Dict mapping center index to a name. Rules are tried in order and
            the first match wins; centers matching none get ``'群体<index+1>'``.
            Several clusters may receive the same name.
        """
        names = {}
        for idx, center in enumerate(centers):
            _, tenure, overtime, commute, bmi, absence = center
            if overtime > 38 and commute > 55 and absence > 8:
                label = '高压通勤型'
            elif bmi > 27 and absence > 8:
                label = '健康波动型'
            elif tenure > 8 and absence < 6:
                label = '稳定低风险型'
            elif overtime > 28 and absence > 7:
                label = '轮班负荷型'
            else:
                label = f'群体{idx + 1}'
            names[idx] = label
        return names

    def _generate_description(self, name):
        """Return the description for a cluster name, or a generic fallback."""
        descriptions = {
            '高压通勤型': '加班和通勤压力都高,缺勤时长偏长。',
            '健康波动型': '健康相关风险更高,需要重点关注。',
            '稳定低风险型': '司龄较长,缺勤水平稳定且偏低。',
            '轮班负荷型': '排班和工作负荷较重,缺勤风险较高。',
        }
        return descriptions.get(name, '常规员工群体。')
# Module-level analyzer instance, constructed at import time with the default
# cluster count; the model itself is not fitted until a method requests it.
kmeans_analyzer = KMeansAnalyzer()