feat: update clustering implementation and docs

This commit is contained in:
shuo
2026-04-21 11:13:11 +08:00
parent 5655eb0cda
commit 27c394fd8c
17 changed files with 540 additions and 215 deletions

View File

@@ -76,6 +76,8 @@ class KMeansAnalyzer:
if self.model is None or self.n_clusters != n_clusters:
self.fit(n_clusters)
df = get_clean_data().reset_index(drop=True)
centers = self.scaler.inverse_transform(self.model.cluster_centers_)
names = self._generate_cluster_names(centers)
if x_axis not in df.columns:
x_axis = '月均加班时长'
if y_axis not in df.columns:
@@ -102,23 +104,67 @@ class KMeansAnalyzer:
'3': '#E8684A',
'4': '#6DC8EC',
},
'cluster_names': {
str(idx): names.get(idx, f'群体{idx + 1}')
for idx in range(self.n_clusters)
},
}
def _generate_cluster_names(self, centers):
names = {}
base_names = {}
for idx, center in enumerate(centers):
_, tenure, overtime, commute, bmi, absence = center
if overtime > 38 and commute > 55 and absence > 8:
names[idx] = '高压通勤型'
base_names[idx] = '高压通勤型'
elif bmi > 27 and absence > 8:
names[idx] = '健康波动型'
base_names[idx] = '健康波动型'
elif tenure > 8 and absence < 6:
names[idx] = '稳定低风险型'
base_names[idx] = '稳定低风险型'
elif overtime > 28 and absence > 7:
names[idx] = '轮班负荷型'
base_names[idx] = '轮班负荷型'
else:
names[idx] = f'群体{idx + 1}'
return names
base_names[idx] = f'群体{idx + 1}'
return self._deduplicate_cluster_names(base_names, centers)
def _deduplicate_cluster_names(self, names, centers):
grouped = {}
for idx, name in names.items():
grouped.setdefault(name, []).append(idx)
deduplicated = names.copy()
for name, indices in grouped.items():
if len(indices) == 1:
continue
order = self._build_duplicate_order(indices, centers)
suffixes = self._suffix_candidates(name)
for rank, idx in enumerate(order):
suffix = suffixes[rank] if rank < len(suffixes) else f'{rank + 1}'
deduplicated[idx] = f'{name}{suffix}'
return deduplicated
def _build_duplicate_order(self, indices, centers):
return sorted(
indices,
key=lambda idx: (
centers[idx][5], # 缺勤时长
centers[idx][2], # 加班
centers[idx][1], # 司龄
centers[idx][3], # 通勤
centers[idx][4], # BMI
centers[idx][0], # 年龄
),
reverse=True,
)
def _suffix_candidates(self, name):
suffix_map = {
'高压通勤型': ['-高风险组', '-关注组', '-观察组'],
'健康波动型': ['-重点关注组', '-预警组', '-观察组'],
'稳定低风险型': ['-资深组', '-成熟组', '-稳健组'],
'轮班负荷型': ['-高负荷组', '-轮班组', '-强化组'],
}
return suffix_map.get(name, [f'{idx}' for idx in range(1, 10)])
def _generate_description(self, name):
descriptions = {
@@ -127,6 +173,9 @@ class KMeansAnalyzer:
'稳定低风险型': '司龄较长,缺勤水平稳定且偏低。',
'轮班负荷型': '排班和工作负荷较重,缺勤风险较高。',
}
for key, description in descriptions.items():
if name.startswith(key):
return description
return descriptions.get(name, '常规员工群体。')