Files
forsetsystem/backend/core/clustering.py
shenjianZ a39d8b2fd2 feat: 初始化员工缺勤分析系统项目
搭建完整的前后端分离架构,实现数据概览、预测分析、聚类分析等核心功能模块

  详细版:
  feat: 初始化员工缺勤分析系统项目

  - 后端:基于 Flask 搭建 RESTful API,包含数据概览、特征分析、预测模型、聚类分析四大模块
  - 前端:基于 Vue.js 构建单页应用,实现 Dashboard、预测、聚类、因子分析等页面
  - 模型:集成随机森林、XGBoost、LightGBM、Stacking 等多种机器学习模型
  - 文档:完成需求规格说明、系统架构设计、接口设计、数据设计、UI原型设计等文档
2026-03-08 14:48:26 +08:00

230 lines
7.5 KiB
Python

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import joblib
import os
import config
from core.preprocessing import get_clean_data
class KMeansAnalyzer:
    """K-Means clustering of the cleaned dataset on five numeric features.

    The data comes from ``get_clean_data()``. Features are min-max scaled
    before clustering, and the model is fitted lazily: every ``get_*``
    accessor (re)fits when no usable fitted state exists or when a different
    number of clusters is requested.

    Attributes:
        n_clusters: Number of clusters used by the current/next fit.
        model: Fitted ``KMeans`` instance, or ``None`` before the first fit.
        scaler: ``MinMaxScaler`` fitted on the same matrix as ``model``.
        data: Raw feature matrix used for the last fit.
        data_scaled: Scaled feature matrix used for the last fit.
        labels: Cluster id per row of ``data``; ``None`` when not fitted.
        feature_cols: Column names of ``data``, in matrix column order.
    """

    # Canonical feature names, in the column order used for the model matrix.
    _FEATURE_KEYS = (
        'Age',
        'Service time',
        'Work load Average/day',
        'Body mass index',
        'Absenteeism time in hours',
    )

    # Display label of each feature, keyed by normalised feature name
    # (see ``_normalize``).
    _DIMENSION_LABELS = {
        'age': '年龄',
        'servicetime': '工龄',
        'workloadaverage/day': '工作负荷',
        'bodymassindex': 'BMI',
        'absenteeismtimeinhours': '缺勤倾向',
    }

    _DESCRIPTIONS = {
        '模范型员工': '工龄长、工作稳定、缺勤率低',
        '压力型员工': '工作负荷大、缺勤较多',
        '生活习惯型员工': 'BMI偏高、需关注健康',
    }

    _CLUSTER_COLORS = {
        '0': '#67C23A',
        '1': '#E6A23C',
        '2': '#F56C6C',
        '3': '#909399',
        '4': '#409EFF',
    }

    # Upper bound on the number of points returned by get_scatter_data().
    _MAX_SCATTER_POINTS = 500

    def __init__(self, n_clusters=3):
        """Create an unfitted analyzer that will use ``n_clusters`` clusters."""
        self.n_clusters = n_clusters
        self.model = None
        self.scaler = MinMaxScaler()
        self.data = None
        self.data_scaled = None
        self.labels = None
        self.feature_cols = None

    @staticmethod
    def _normalize(name):
        """Return ``name`` lower-cased with all spaces removed."""
        return str(name).replace(' ', '').lower()

    @classmethod
    def _resolve_column(cls, columns, wanted):
        """Find the column that best matches the requested axis name.

        An exact match (ignoring case and spaces) always wins. Otherwise the
        last column whose normalised name contains the normalised request is
        returned. Plain substring matching alone is not enough: ``'Age'`` is
        contained in ``'Work load Average/day'``.

        Returns:
            The matching column label, or ``None`` when nothing matches.
        """
        target = cls._normalize(wanted)
        partial = None
        for col in columns:
            normalized = cls._normalize(col)
            if normalized == target:
                return col
            if target in normalized:
                partial = col
        return partial

    @classmethod
    def _center_roles(cls, center, feature_cols=None):
        """Extract ``(service_time, work_load, bmi, absent)`` from a centre.

        When ``feature_cols`` names every component of ``center`` the values
        are looked up by feature name and a missing feature counts as 0.
        Without names the values are read by position: five or more
        components are read as age, service time, work load, BMI, absence;
        fewer components are read as age, service time, BMI, absence.
        """
        size = len(center)
        if feature_cols is not None and len(feature_cols) == size:
            by_name = {
                cls._normalize(name): value
                for name, value in zip(feature_cols, center)
            }
            return (
                by_name.get('servicetime', 0),
                by_name.get('workloadaverage/day', 0),
                by_name.get('bodymassindex', 0),
                by_name.get('absenteeismtimeinhours', 0),
            )
        if size >= 5:
            return center[1], center[2], center[3], center[4]
        return (
            center[1] if size > 1 else 0,
            0,
            center[2] if size > 2 else 0,
            center[3] if size > 3 else 0,
        )

    def _get_feature_columns(self, df):
        """Return the columns of ``df`` that hold the clustering features.

        Column names of ``df`` are stripped of surrounding whitespace *in
        place*. Each canonical feature is then matched exactly, or failing
        that by comparing names with spaces removed and case ignored.

        Returns:
            The matched column names, in canonical feature order. Features
            without a matching column are left out.
        """
        df.columns = [col.strip() for col in df.columns]
        columns = list(df.columns)
        found = []
        for key in self._FEATURE_KEYS:
            if key in columns:
                found.append(key)
                continue
            wanted = self._normalize(key)
            match = next(
                (col for col in columns if self._normalize(col) == wanted),
                None,
            )
            if match is not None:
                found.append(match)
        return found

    def fit(self, n_clusters=None):
        """Load the data and fit the scaler and the K-Means model.

        Args:
            n_clusters: New cluster count; a falsy value keeps the current one.

        Returns:
            The fitted ``KMeans`` model.

        Raises:
            ValueError: If none of the clustering features is present.
        """
        if n_clusters:
            self.n_clusters = n_clusters
        df = get_clean_data().reset_index(drop=True)
        feature_cols = self._get_feature_columns(df)
        if not feature_cols:
            raise ValueError(
                'No clustering feature column found in the cleaned data; '
                f'expected at least one of {list(self._FEATURE_KEYS)}'
            )
        self.feature_cols = feature_cols
        self.data = df[feature_cols].values
        self.scaler = MinMaxScaler()
        self.data_scaled = self.scaler.fit_transform(self.data)
        self.model = KMeans(
            n_clusters=self.n_clusters,
            random_state=config.RANDOM_STATE,
            n_init=10
        )
        self.labels = self.model.fit_predict(self.data_scaled)
        return self.model

    def _ensure_fitted(self, n_clusters):
        """Fit unless a complete fit for ``n_clusters`` is already available.

        A model alone is not enough: after ``load_model()`` the scaler,
        labels and feature names are missing and must be rebuilt.
        """
        if (
            self.model is None
            or self.labels is None
            or self.feature_cols is None
            or self.n_clusters != n_clusters
        ):
            self.fit(n_clusters)

    def get_cluster_results(self, n_clusters=3):
        """Summarise every non-empty cluster.

        Returns:
            ``{'n_clusters': int, 'clusters': [...]}`` where each cluster has
            ``id``, ``name``, ``member_count``, ``percentage`` (of all rows,
            one decimal), ``center`` (feature name -> value in original
            units, two decimals) and ``description``.
        """
        self._ensure_fitted(n_clusters)
        centers = self.scaler.inverse_transform(self.model.cluster_centers_)
        cluster_ids, counts = np.unique(self.labels, return_counts=True)
        total = len(self.labels)
        names = self._generate_cluster_names(centers, self.feature_cols)
        clusters = []
        for cluster_id, count in zip(cluster_ids, counts):
            cid = int(cluster_id)
            name = names.get(cid, f'群体{cid+1}')
            # Index by cluster id, not by loop position: the two differ as
            # soon as one cluster has no members.
            center = {
                fname: round(float(value), 2)
                for fname, value in zip(self.feature_cols, centers[cid])
            }
            clusters.append({
                'id': cid,
                'name': name,
                'member_count': int(count),
                'percentage': round(float(count) / total * 100, 1),
                'center': center,
                'description': self._generate_description(name)
            })
        return {
            'n_clusters': self.n_clusters,
            'clusters': clusters
        }

    def get_cluster_profile(self, n_clusters=3):
        """Return the scaled cluster centres, one value per feature.

        Returns:
            ``{'dimensions': [...], 'dimension_keys': [...], 'clusters': [...]}``
            where ``dimensions`` are display labels aligned with
            ``dimension_keys`` (the feature column names) and each cluster
            carries ``id``, ``name`` and its scaled centre as ``values``.
        """
        self._ensure_fitted(n_clusters)
        centers_scaled = self.model.cluster_centers_
        feature_cols = list(self.feature_cols)
        # Label each axis by feature name so labels stay correct when a
        # feature column is missing.
        dimensions = [
            self._DIMENSION_LABELS.get(self._normalize(col), col)
            for col in feature_cols
        ]
        names = self._generate_cluster_names(
            self.scaler.inverse_transform(centers_scaled), feature_cols
        )
        clusters = [
            {
                'id': i,
                'name': names.get(i, f'群体{i+1}'),
                'values': [round(float(v), 2) for v in center]
            }
            for i, center in enumerate(centers_scaled)
        ]
        return {
            'dimensions': dimensions,
            'dimension_keys': feature_cols,
            'clusters': clusters
        }

    def get_scatter_data(self, n_clusters=3, x_axis='Age', y_axis='Absenteeism time in hours'):
        """Return up to 500 labelled points for a 2-D scatter plot.

        Args:
            n_clusters: Number of clusters to use.
            x_axis: Requested x column; see ``_resolve_column`` for matching.
                Falls back to the first column when nothing matches.
            y_axis: Requested y column; falls back to the last column.

        Returns:
            A dict with the resolved axis columns, their display names from
            ``config.FEATURE_NAME_CN``, the ``points`` and ``cluster_colors``.
        """
        self._ensure_fitted(n_clusters)
        df = get_clean_data().reset_index(drop=True)
        df.columns = [col.strip() for col in df.columns]
        x_col = self._resolve_column(df.columns, x_axis)
        if x_col is None:
            x_col = df.columns[0]
        y_col = self._resolve_column(df.columns, y_axis)
        if y_col is None:
            y_col = df.columns[-1]
        # Only materialise the rows that are actually returned.
        limit = min(len(df), len(self.labels), self._MAX_SCATTER_POINTS)
        head = df.iloc[:limit]
        points = [
            {
                'employee_id': int(emp_id),
                'x': float(x),
                'y': float(y),
                'cluster_id': int(label)
            }
            for emp_id, x, y, label in zip(
                head['ID'].to_numpy(),
                head[x_col].to_numpy(),
                head[y_col].to_numpy(),
                self.labels[:limit],
            )
        ]
        return {
            'x_axis': x_col,
            'x_axis_name': config.FEATURE_NAME_CN.get(x_col, x_col),
            'y_axis': y_col,
            'y_axis_name': config.FEATURE_NAME_CN.get(y_col, y_col),
            'points': points,
            'cluster_colors': dict(self._CLUSTER_COLORS)
        }

    def _generate_cluster_names(self, centers, feature_cols=None):
        """Assign a display name to every cluster centre.

        Args:
            centers: Cluster centres in original (unscaled) units.
            feature_cols: Optional feature name per centre component. When
                omitted the components are interpreted by position.

        Returns:
            Dict mapping the centre index to its name.
        """
        names = {}
        for i, center in enumerate(centers):
            service_time, work_load, bmi, absent = self._center_roles(
                center, feature_cols
            )
            if service_time > 15 and absent < 3:
                names[i] = '模范型员工'
            elif work_load > 260 and absent > 5:
                names[i] = '压力型员工'
            elif bmi > 28:
                names[i] = '生活习惯型员工'
            else:
                names[i] = f'群体{i+1}'
        return names

    def _generate_description(self, name):
        """Return the description for a cluster name, or a generic default."""
        return self._DESCRIPTIONS.get(name, '常规员工群体')

    def save_model(self):
        """Persist the K-Means model to ``config.KMEANS_MODEL_PATH``.

        Only the model is written; the scaler and labels are not persisted.
        """
        os.makedirs(config.MODELS_DIR, exist_ok=True)
        joblib.dump(self.model, config.KMEANS_MODEL_PATH)

    def load_model(self):
        """Load a persisted model, if the model file exists.

        The scaler and labels are not part of the file, so any in-memory
        state from an earlier fit is discarded. The next accessor then refits
        instead of combining the loaded model with unrelated labels.
        """
        if not os.path.exists(config.KMEANS_MODEL_PATH):
            return
        loaded = joblib.load(config.KMEANS_MODEL_PATH)
        if loaded is None:
            # save_model() was called before any fit; nothing usable stored.
            return
        self.model = loaded
        self.n_clusters = loaded.n_clusters
        self.labels = None
        self.data_scaled = None
# Module-level analyzer instance created with the default of 3 clusters.
# Construction does not load data or fit anything; the model stays None
# until fit() or one of the get_* accessors is called.
kmeans_analyzer = KMeansAnalyzer()