Files
forsetsystem/backend/core/clustering.py
2026-04-27 11:59:35 +08:00

270 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import config
from core.preprocessing import get_clean_data
class KMeansAnalyzer:
    """K-Means clustering of employees with human-readable cluster naming.

    Features are min-max scaled before fitting; cluster centers are mapped
    back to original units via ``scaler.inverse_transform`` for reporting.
    Fitting is lazy: each public getter refits only when no model exists yet
    or the requested cluster count differs from the current one.
    """

    def __init__(self, n_clusters=3):
        self.n_clusters = n_clusters   # cluster count of the current/next fit
        self.model = None              # fitted sklearn KMeans (None until fit)
        self.scaler = MinMaxScaler()   # fitted together with the model
        self.labels = None             # per-row cluster ids from the last fit
        # Feature columns (Chinese headers of the cleaned dataset): age,
        # tenure in years, avg monthly overtime hours, commute minutes,
        # BMI, absence hours.
        self.feature_cols = [
            '年龄',
            '司龄年数',
            '月均加班时长',
            '通勤时长分钟',
            'BMI',
            '缺勤时长(小时)',
        ]

    def fit(self, n_clusters=None):
        """Fit the scaler and the KMeans model on the cleaned dataset.

        Args:
            n_clusters: when truthy, replaces ``self.n_clusters`` before fitting.

        Returns:
            The fitted ``KMeans`` model.
        """
        if n_clusters:
            self.n_clusters = n_clusters
        df = get_clean_data().reset_index(drop=True)
        data = df[self.feature_cols].values
        data_scaled = self.scaler.fit_transform(data)
        self.model = KMeans(
            n_clusters=self.n_clusters,
            random_state=config.RANDOM_STATE,
            n_init=10,
        )
        self.labels = self.model.fit_predict(data_scaled)
        return self.model

    def _ensure_fitted(self, n_clusters):
        """Refit only when no model exists yet or the cluster count changed.

        Extracted helper: the three public getters previously repeated this
        check inline.
        """
        if self.model is None or self.n_clusters != n_clusters:
            self.fit(n_clusters)

    def get_cluster_results(self, n_clusters=3):
        """Return per-cluster size, share, unscaled center and description."""
        self._ensure_fitted(n_clusters)
        centers = self.scaler.inverse_transform(self.model.cluster_centers_)
        unique, counts = np.unique(self.labels, return_counts=True)
        total = len(self.labels)
        names = self._generate_cluster_names(centers)
        clusters = []
        for cluster_id, count in zip(unique, counts):
            cid = int(cluster_id)
            center = centers[cid]
            # Look the name up once; it is needed both as the label and for
            # the generated description (previously looked up twice).
            name = names.get(cid, '常规稳态型')
            clusters.append({
                'id': cid,
                'name': name,
                'member_count': int(count),
                'percentage': round(count / total * 100, 1),
                'center': {
                    feature: round(float(value), 2)
                    for feature, value in zip(self.feature_cols, center)
                },
                'description': self._generate_description(name, center),
            })
        return {'n_clusters': self.n_clusters, 'clusters': clusters}

    def get_cluster_profile(self, n_clusters=3):
        """Return scaled cluster centers shaped for a radar-chart profile."""
        self._ensure_fitted(n_clusters)
        centers_scaled = self.model.cluster_centers_
        names = self._generate_cluster_names(
            self.scaler.inverse_transform(centers_scaled))
        return {
            'dimensions': ['年龄', '司龄', '加班', '通勤', 'BMI', '缺勤'],
            'dimension_keys': self.feature_cols,
            'clusters': [
                {
                    'id': idx,
                    'name': names.get(idx, '常规稳态型'),
                    'values': [round(float(v), 2) for v in centers_scaled[idx]],
                }
                for idx in range(self.n_clusters)
            ],
        }

    def get_scatter_data(self, n_clusters=3, x_axis='月均加班时长', y_axis='缺勤时长(小时)'):
        """Return up to 500 scatter points of two features, tagged by cluster.

        Unknown axis names fall back to defaults so a bad query parameter
        never raises.
        """
        self._ensure_fitted(n_clusters)
        df = get_clean_data().reset_index(drop=True)
        centers = self.scaler.inverse_transform(self.model.cluster_centers_)
        names = self._generate_cluster_names(centers)
        if x_axis not in df.columns:
            x_axis = '月均加班时长'
        if y_axis not in df.columns:
            y_axis = config.TARGET_COLUMN
        # Cap while building instead of materialising every row and slicing
        # points[:500] afterwards — identical output, less work.
        limit = min(len(df), len(self.labels), 500)
        points = []
        for idx in range(limit):
            row = df.iloc[idx]
            points.append({
                'employee_id': str(row[config.EMPLOYEE_ID_COLUMN]),
                'x': float(row[x_axis]),
                'y': float(row[y_axis]),
                'cluster_id': int(self.labels[idx]),
            })
        return {
            'x_axis': x_axis,
            'x_axis_name': config.FEATURE_NAME_CN.get(x_axis, x_axis),
            'y_axis': y_axis,
            'y_axis_name': config.FEATURE_NAME_CN.get(y_axis, y_axis),
            'points': points,
            'cluster_colors': {
                '0': '#5B8FF9',
                '1': '#61DDAA',
                '2': '#F6BD16',
                '3': '#E8684A',
                '4': '#6DC8EC',
            },
            'cluster_names': {
                str(idx): names.get(idx, '常规稳态型')
                for idx in range(self.n_clusters)
            },
        }

    def _generate_cluster_names(self, centers):
        """Name every cluster from its unscaled center, deduplicating ties."""
        rank_info = self._build_rank_info(centers)
        base_names = {
            idx: self._classify_cluster(center, rank_info, idx)
            for idx, center in enumerate(centers)
        }
        return self._deduplicate_cluster_names(base_names, centers)

    def _build_rank_info(self, centers):
        """Rank clusters per feature; rank 0 is the highest center value."""
        centers = np.asarray(centers, dtype=float)
        return {
            '年龄': self._rank_desc(centers[:, 0]),
            '司龄': self._rank_desc(centers[:, 1]),
            '加班': self._rank_desc(centers[:, 2]),
            '通勤': self._rank_desc(centers[:, 3]),
            'BMI': self._rank_desc(centers[:, 4]),
            '缺勤': self._rank_desc(centers[:, 5]),
        }

    def _rank_desc(self, values):
        """Map cluster index -> descending rank (0 = largest value)."""
        ordered = np.argsort(-np.asarray(values, dtype=float))
        return {int(idx): rank for rank, idx in enumerate(ordered)}

    def _classify_cluster(self, center, rank_info, idx):
        """Pick an archetype name from absolute thresholds and relative ranks.

        Each rule pairs an absolute cutoff on the unscaled center with a
        rank-based condition, so names stay meaningful even when all
        clusters sit on the same side of a cutoff.
        """
        age, tenure, overtime, commute, bmi, absence = center
        high_absence = rank_info['缺勤'][idx] == 0
        low_absence = rank_info['缺勤'][idx] == len(rank_info['缺勤']) - 1
        high_overtime = rank_info['加班'][idx] <= 1
        high_commute = rank_info['通勤'][idx] <= 1
        high_bmi = rank_info['BMI'][idx] <= 1
        high_tenure = rank_info['司龄'][idx] <= 1
        low_tenure = rank_info['司龄'][idx] >= len(rank_info['司龄']) - 1
        young_group = rank_info['年龄'][idx] >= len(rank_info['年龄']) - 1
        if (absence >= 7.5 and overtime >= 28 and commute >= 40) or (high_absence and high_overtime and high_commute):
            return '压力奔波型'
        if (absence >= 7.0 and bmi >= 25.5) or (high_absence and high_bmi):
            return '健康关注型'
        if (overtime >= 30 and absence >= 6.0) or (high_overtime and rank_info['缺勤'][idx] <= 1):
            return '负荷承压型'
        if (tenure >= 8 and absence <= 6.0) or (high_tenure and low_absence):
            return '稳定成熟型'
        if (tenure <= 4 and age <= 32) or (low_tenure and young_group):
            return '新锐成长型'
        if commute <= 35 and absence <= 6.5:
            return '通勤平衡型'
        if tenure >= 6 and absence <= 6.8:
            return '经验稳健型'
        return '常规稳态型'

    def _deduplicate_cluster_names(self, names, centers):
        """Append suffixes when several clusters share the same base name."""
        grouped = {}
        for idx, name in names.items():
            grouped.setdefault(name, []).append(idx)
        deduplicated = names.copy()
        for name, indices in grouped.items():
            if len(indices) == 1:
                continue
            order = self._build_duplicate_order(indices, centers)
            suffixes = self._suffix_candidates(name)
            for rank, idx in enumerate(order):
                # Past the curated suffixes, fall back to plain numbering.
                suffix = suffixes[rank] if rank < len(suffixes) else f'{rank + 1}'
                deduplicated[idx] = f'{name}{suffix}'
        return deduplicated

    def _build_duplicate_order(self, indices, centers):
        """Order duplicates by severity: absence first, then overtime, etc."""
        return sorted(
            indices,
            key=lambda idx: (
                centers[idx][5],  # absence hours
                centers[idx][2],  # overtime
                centers[idx][1],  # tenure
                centers[idx][3],  # commute
                centers[idx][4],  # BMI
                centers[idx][0],  # age
            ),
            reverse=True,
        )

    def _suffix_candidates(self, name):
        """Return the ordered suffix list used to disambiguate *name*."""
        suffix_map = {
            '压力奔波型': ['-高压组', '-长途组', '-持续关注组'],
            '健康关注型': ['-重点关注组', '-预警组', '-干预组'],
            '负荷承压型': ['-高负荷组', '-轮班组', '-调节组'],
            '稳定成熟型': ['-资深组', '-成熟组', '-稳健组'],
            '新锐成长型': ['-适应组', '-成长组', '-潜力组'],
            '通勤平衡型': ['-均衡组', '-稳态组', '-协同组'],
            '经验稳健型': ['-资深组', '-稳健组', '-协同组'],
            '常规稳态型': ['-平衡组', '-常态组', '-协同组'],
        }
        return suffix_map.get(name, [f'{idx}' for idx in range(1, 10)])

    def _generate_description(self, name, center=None):
        """Return a description for *name*; dynamic when *center* is given.

        Matching is by prefix so deduplicated names like '压力奔波型-高压组'
        still resolve to their base description.
        """
        descriptions = {
            '压力奔波型': '加班与通勤压力同时偏高,缺勤波动更明显。',
            '健康关注型': '健康负担更突出,缺勤时长偏高,建议优先关注。',
            '负荷承压型': '工作负荷较重,缺勤风险处于偏高水平。',
            '稳定成熟型': '司龄较长,整体状态稳定,缺勤水平偏低。',
            '新锐成长型': '整体更年轻、司龄较短,仍处于适应与成长阶段。',
            '通勤平衡型': '通勤与缺勤表现较均衡,整体波动相对可控。',
            '经验稳健型': '具备一定经验积累,整体表现稳健,缺勤风险较低。',
            '常规稳态型': '整体表现接近企业常态,是较典型的员工群体。',
        }
        for key, description in descriptions.items():
            if name.startswith(key):
                if center is None:
                    return description
                return self._build_dynamic_description(key, center, description)
        return descriptions.get(name, '整体表现接近企业常态。')

    def _build_dynamic_description(self, base_name, center, default_description):
        """Compose a description from at most three threshold-based clauses.

        Falls back to *default_description* when no threshold fires.
        """
        age, tenure, overtime, commute, bmi, absence = center
        clauses = []
        if tenure >= 8:
            clauses.append('司龄较长')
        elif tenure <= 4:
            clauses.append('司龄较短')
        if overtime >= 30:
            clauses.append('加班负荷偏高')
        elif overtime <= 18:
            clauses.append('加班压力相对可控')
        if commute >= 45:
            clauses.append('通勤压力偏高')
        elif commute <= 30:
            clauses.append('通勤节奏较平衡')
        if bmi >= 26:
            clauses.append('健康管理压力更明显')
        if absence >= 7.5:
            clauses.append('缺勤时长偏高')
        elif absence <= 5.5:
            clauses.append('缺勤水平偏低')
        if age <= 32:
            clauses.append('群体整体更年轻')
        elif age >= 40:
            clauses.append('群体整体更成熟')
        # Preserve order while dropping duplicate clauses.
        unique_clauses = []
        for clause in clauses:
            if clause not in unique_clauses:
                unique_clauses.append(clause)
        if not unique_clauses:
            return default_description
        return ''.join(unique_clauses[:3])
kmeans_analyzer = KMeansAnalyzer()