feat: update clustering implementation and docs
This commit is contained in:
@@ -76,6 +76,8 @@ class KMeansAnalyzer:
|
||||
if self.model is None or self.n_clusters != n_clusters:
|
||||
self.fit(n_clusters)
|
||||
df = get_clean_data().reset_index(drop=True)
|
||||
centers = self.scaler.inverse_transform(self.model.cluster_centers_)
|
||||
names = self._generate_cluster_names(centers)
|
||||
if x_axis not in df.columns:
|
||||
x_axis = '月均加班时长'
|
||||
if y_axis not in df.columns:
|
||||
@@ -102,23 +104,67 @@ class KMeansAnalyzer:
|
||||
'3': '#E8684A',
|
||||
'4': '#6DC8EC',
|
||||
},
|
||||
'cluster_names': {
|
||||
str(idx): names.get(idx, f'群体{idx + 1}')
|
||||
for idx in range(self.n_clusters)
|
||||
},
|
||||
}
|
||||
|
||||
def _generate_cluster_names(self, centers):
|
||||
names = {}
|
||||
base_names = {}
|
||||
for idx, center in enumerate(centers):
|
||||
_, tenure, overtime, commute, bmi, absence = center
|
||||
if overtime > 38 and commute > 55 and absence > 8:
|
||||
names[idx] = '高压通勤型'
|
||||
base_names[idx] = '高压通勤型'
|
||||
elif bmi > 27 and absence > 8:
|
||||
names[idx] = '健康波动型'
|
||||
base_names[idx] = '健康波动型'
|
||||
elif tenure > 8 and absence < 6:
|
||||
names[idx] = '稳定低风险型'
|
||||
base_names[idx] = '稳定低风险型'
|
||||
elif overtime > 28 and absence > 7:
|
||||
names[idx] = '轮班负荷型'
|
||||
base_names[idx] = '轮班负荷型'
|
||||
else:
|
||||
names[idx] = f'群体{idx + 1}'
|
||||
return names
|
||||
base_names[idx] = f'群体{idx + 1}'
|
||||
return self._deduplicate_cluster_names(base_names, centers)
|
||||
|
||||
def _deduplicate_cluster_names(self, names, centers):
|
||||
grouped = {}
|
||||
for idx, name in names.items():
|
||||
grouped.setdefault(name, []).append(idx)
|
||||
|
||||
deduplicated = names.copy()
|
||||
for name, indices in grouped.items():
|
||||
if len(indices) == 1:
|
||||
continue
|
||||
|
||||
order = self._build_duplicate_order(indices, centers)
|
||||
suffixes = self._suffix_candidates(name)
|
||||
for rank, idx in enumerate(order):
|
||||
suffix = suffixes[rank] if rank < len(suffixes) else f'{rank + 1}'
|
||||
deduplicated[idx] = f'{name}{suffix}'
|
||||
return deduplicated
|
||||
|
||||
def _build_duplicate_order(self, indices, centers):
|
||||
return sorted(
|
||||
indices,
|
||||
key=lambda idx: (
|
||||
centers[idx][5], # 缺勤时长
|
||||
centers[idx][2], # 加班
|
||||
centers[idx][1], # 司龄
|
||||
centers[idx][3], # 通勤
|
||||
centers[idx][4], # BMI
|
||||
centers[idx][0], # 年龄
|
||||
),
|
||||
reverse=True,
|
||||
)
|
||||
|
||||
def _suffix_candidates(self, name):
|
||||
suffix_map = {
|
||||
'高压通勤型': ['-高风险组', '-关注组', '-观察组'],
|
||||
'健康波动型': ['-重点关注组', '-预警组', '-观察组'],
|
||||
'稳定低风险型': ['-资深组', '-成熟组', '-稳健组'],
|
||||
'轮班负荷型': ['-高负荷组', '-轮班组', '-强化组'],
|
||||
}
|
||||
return suffix_map.get(name, [f'({idx})' for idx in range(1, 10)])
|
||||
|
||||
def _generate_description(self, name):
|
||||
descriptions = {
|
||||
@@ -127,6 +173,9 @@ class KMeansAnalyzer:
|
||||
'稳定低风险型': '司龄较长,缺勤水平稳定且偏低。',
|
||||
'轮班负荷型': '排班和工作负荷较重,缺勤风险较高。',
|
||||
}
|
||||
for key, description in descriptions.items():
|
||||
if name.startswith(key):
|
||||
return description
|
||||
return descriptions.get(name, '常规员工群体。')
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user