"""K-means clustering analysis over cleaned employee data."""
import numpy as np

from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

import config
from core.preprocessing import get_clean_data
class KMeansAnalyzer:
    """Segments employees into behavioral clusters with K-means.

    Clusters on six numeric features from the cleaned HR dataset and exposes
    cluster summaries, scaled radar-profile data and scatter-plot data.
    """

    def __init__(self, n_clusters=3):
        # Cluster count used on the next fit; callers may override per call.
        self.n_clusters = n_clusters
        # Lazily fitted sklearn KMeans model and its per-row labels.
        self.model = None
        self.labels = None
        # Features are min-max scaled before clustering so no single
        # feature dominates the distance metric.
        self.scaler = MinMaxScaler()
        # Column order matters: helper methods index centers positionally.
        self.feature_cols = [
            '年龄',
            '司龄年数',
            '月均加班时长',
            '通勤时长分钟',
            'BMI',
            '缺勤时长(小时)',
        ]
def fit(self, n_clusters=None):
|
||
if n_clusters:
|
||
self.n_clusters = n_clusters
|
||
df = get_clean_data().reset_index(drop=True)
|
||
data = df[self.feature_cols].values
|
||
data_scaled = self.scaler.fit_transform(data)
|
||
self.model = KMeans(n_clusters=self.n_clusters, random_state=config.RANDOM_STATE, n_init=10)
|
||
self.labels = self.model.fit_predict(data_scaled)
|
||
return self.model
|
||
|
||
def get_cluster_results(self, n_clusters=3):
|
||
if self.model is None or self.n_clusters != n_clusters:
|
||
self.fit(n_clusters)
|
||
centers = self.scaler.inverse_transform(self.model.cluster_centers_)
|
||
unique, counts = np.unique(self.labels, return_counts=True)
|
||
total = len(self.labels)
|
||
names = self._generate_cluster_names(centers)
|
||
clusters = []
|
||
for cluster_id, count in zip(unique, counts):
|
||
center = centers[int(cluster_id)]
|
||
clusters.append({
|
||
'id': int(cluster_id),
|
||
'name': names.get(int(cluster_id), '常规稳态型'),
|
||
'member_count': int(count),
|
||
'percentage': round(count / total * 100, 1),
|
||
'center': {
|
||
feature: round(float(value), 2)
|
||
for feature, value in zip(self.feature_cols, center)
|
||
},
|
||
'description': self._generate_description(names.get(int(cluster_id), '常规稳态型'), center),
|
||
})
|
||
return {'n_clusters': self.n_clusters, 'clusters': clusters}
|
||
|
||
def get_cluster_profile(self, n_clusters=3):
|
||
if self.model is None or self.n_clusters != n_clusters:
|
||
self.fit(n_clusters)
|
||
centers_scaled = self.model.cluster_centers_
|
||
names = self._generate_cluster_names(self.scaler.inverse_transform(centers_scaled))
|
||
return {
|
||
'dimensions': ['年龄', '司龄', '加班', '通勤', 'BMI', '缺勤'],
|
||
'dimension_keys': self.feature_cols,
|
||
'clusters': [
|
||
{
|
||
'id': idx,
|
||
'name': names.get(idx, '常规稳态型'),
|
||
'values': [round(float(v), 2) for v in centers_scaled[idx]],
|
||
}
|
||
for idx in range(self.n_clusters)
|
||
],
|
||
}
|
||
|
||
def get_scatter_data(self, n_clusters=3, x_axis='月均加班时长', y_axis='缺勤时长(小时)'):
|
||
if self.model is None or self.n_clusters != n_clusters:
|
||
self.fit(n_clusters)
|
||
df = get_clean_data().reset_index(drop=True)
|
||
centers = self.scaler.inverse_transform(self.model.cluster_centers_)
|
||
names = self._generate_cluster_names(centers)
|
||
if x_axis not in df.columns:
|
||
x_axis = '月均加班时长'
|
||
if y_axis not in df.columns:
|
||
y_axis = config.TARGET_COLUMN
|
||
points = []
|
||
for idx in range(min(len(df), len(self.labels))):
|
||
row = df.iloc[idx]
|
||
points.append({
|
||
'employee_id': str(row[config.EMPLOYEE_ID_COLUMN]),
|
||
'x': float(row[x_axis]),
|
||
'y': float(row[y_axis]),
|
||
'cluster_id': int(self.labels[idx]),
|
||
})
|
||
return {
|
||
'x_axis': x_axis,
|
||
'x_axis_name': config.FEATURE_NAME_CN.get(x_axis, x_axis),
|
||
'y_axis': y_axis,
|
||
'y_axis_name': config.FEATURE_NAME_CN.get(y_axis, y_axis),
|
||
'points': points[:500],
|
||
'cluster_colors': {
|
||
'0': '#5B8FF9',
|
||
'1': '#61DDAA',
|
||
'2': '#F6BD16',
|
||
'3': '#E8684A',
|
||
'4': '#6DC8EC',
|
||
},
|
||
'cluster_names': {
|
||
str(idx): names.get(idx, '常规稳态型')
|
||
for idx in range(self.n_clusters)
|
||
},
|
||
}
|
||
|
||
def _generate_cluster_names(self, centers):
|
||
rank_info = self._build_rank_info(centers)
|
||
base_names = {}
|
||
for idx, center in enumerate(centers):
|
||
base_names[idx] = self._classify_cluster(center, rank_info, idx)
|
||
return self._deduplicate_cluster_names(base_names, centers)
|
||
|
||
def _build_rank_info(self, centers):
|
||
centers = np.asarray(centers, dtype=float)
|
||
return {
|
||
'年龄': self._rank_desc(centers[:, 0]),
|
||
'司龄': self._rank_desc(centers[:, 1]),
|
||
'加班': self._rank_desc(centers[:, 2]),
|
||
'通勤': self._rank_desc(centers[:, 3]),
|
||
'BMI': self._rank_desc(centers[:, 4]),
|
||
'缺勤': self._rank_desc(centers[:, 5]),
|
||
}
|
||
|
||
def _rank_desc(self, values):
|
||
ordered = np.argsort(-np.asarray(values, dtype=float))
|
||
ranks = {}
|
||
for rank, idx in enumerate(ordered):
|
||
ranks[int(idx)] = rank
|
||
return ranks
|
||
|
||
def _classify_cluster(self, center, rank_info, idx):
|
||
age, tenure, overtime, commute, bmi, absence = center
|
||
high_absence = rank_info['缺勤'][idx] == 0
|
||
low_absence = rank_info['缺勤'][idx] == len(rank_info['缺勤']) - 1
|
||
high_overtime = rank_info['加班'][idx] <= 1
|
||
high_commute = rank_info['通勤'][idx] <= 1
|
||
high_bmi = rank_info['BMI'][idx] <= 1
|
||
high_tenure = rank_info['司龄'][idx] <= 1
|
||
low_tenure = rank_info['司龄'][idx] >= len(rank_info['司龄']) - 1
|
||
young_group = rank_info['年龄'][idx] >= len(rank_info['年龄']) - 1
|
||
|
||
if (absence >= 7.5 and overtime >= 28 and commute >= 40) or (high_absence and high_overtime and high_commute):
|
||
return '压力奔波型'
|
||
if (absence >= 7.0 and bmi >= 25.5) or (high_absence and high_bmi):
|
||
return '健康关注型'
|
||
if (overtime >= 30 and absence >= 6.0) or (high_overtime and rank_info['缺勤'][idx] <= 1):
|
||
return '负荷承压型'
|
||
if (tenure >= 8 and absence <= 6.0) or (high_tenure and low_absence):
|
||
return '稳定成熟型'
|
||
if (tenure <= 4 and age <= 32) or (low_tenure and young_group):
|
||
return '新锐成长型'
|
||
if commute <= 35 and absence <= 6.5:
|
||
return '通勤平衡型'
|
||
if tenure >= 6 and absence <= 6.8:
|
||
return '经验稳健型'
|
||
return '常规稳态型'
|
||
|
||
def _deduplicate_cluster_names(self, names, centers):
|
||
grouped = {}
|
||
for idx, name in names.items():
|
||
grouped.setdefault(name, []).append(idx)
|
||
|
||
deduplicated = names.copy()
|
||
for name, indices in grouped.items():
|
||
if len(indices) == 1:
|
||
continue
|
||
|
||
order = self._build_duplicate_order(indices, centers)
|
||
suffixes = self._suffix_candidates(name)
|
||
for rank, idx in enumerate(order):
|
||
suffix = suffixes[rank] if rank < len(suffixes) else f'{rank + 1}'
|
||
deduplicated[idx] = f'{name}{suffix}'
|
||
return deduplicated
|
||
|
||
def _build_duplicate_order(self, indices, centers):
|
||
return sorted(
|
||
indices,
|
||
key=lambda idx: (
|
||
centers[idx][5], # 缺勤时长
|
||
centers[idx][2], # 加班
|
||
centers[idx][1], # 司龄
|
||
centers[idx][3], # 通勤
|
||
centers[idx][4], # BMI
|
||
centers[idx][0], # 年龄
|
||
),
|
||
reverse=True,
|
||
)
|
||
|
||
def _suffix_candidates(self, name):
|
||
suffix_map = {
|
||
'压力奔波型': ['-高压组', '-长途组', '-持续关注组'],
|
||
'健康关注型': ['-重点关注组', '-预警组', '-干预组'],
|
||
'负荷承压型': ['-高负荷组', '-轮班组', '-调节组'],
|
||
'稳定成熟型': ['-资深组', '-成熟组', '-稳健组'],
|
||
'新锐成长型': ['-适应组', '-成长组', '-潜力组'],
|
||
'通勤平衡型': ['-均衡组', '-稳态组', '-协同组'],
|
||
'经验稳健型': ['-资深组', '-稳健组', '-协同组'],
|
||
'常规稳态型': ['-平衡组', '-常态组', '-协同组'],
|
||
}
|
||
return suffix_map.get(name, [f'({idx})' for idx in range(1, 10)])
|
||
|
||
def _generate_description(self, name, center=None):
|
||
descriptions = {
|
||
'压力奔波型': '加班与通勤压力同时偏高,缺勤波动更明显。',
|
||
'健康关注型': '健康负担更突出,缺勤时长偏高,建议优先关注。',
|
||
'负荷承压型': '工作负荷较重,缺勤风险处于偏高水平。',
|
||
'稳定成熟型': '司龄较长,整体状态稳定,缺勤水平偏低。',
|
||
'新锐成长型': '整体更年轻、司龄较短,仍处于适应与成长阶段。',
|
||
'通勤平衡型': '通勤与缺勤表现较均衡,整体波动相对可控。',
|
||
'经验稳健型': '具备一定经验积累,整体表现稳健,缺勤风险较低。',
|
||
'常规稳态型': '整体表现接近企业常态,是较典型的员工群体。',
|
||
}
|
||
for key, description in descriptions.items():
|
||
if name.startswith(key):
|
||
if center is None:
|
||
return description
|
||
return self._build_dynamic_description(key, center, description)
|
||
return descriptions.get(name, '整体表现接近企业常态。')
|
||
|
||
def _build_dynamic_description(self, base_name, center, default_description):
|
||
age, tenure, overtime, commute, bmi, absence = center
|
||
clauses = []
|
||
|
||
if tenure >= 8:
|
||
clauses.append('司龄较长')
|
||
elif tenure <= 4:
|
||
clauses.append('司龄较短')
|
||
|
||
if overtime >= 30:
|
||
clauses.append('加班负荷偏高')
|
||
elif overtime <= 18:
|
||
clauses.append('加班压力相对可控')
|
||
|
||
if commute >= 45:
|
||
clauses.append('通勤压力偏高')
|
||
elif commute <= 30:
|
||
clauses.append('通勤节奏较平衡')
|
||
|
||
if bmi >= 26:
|
||
clauses.append('健康管理压力更明显')
|
||
|
||
if absence >= 7.5:
|
||
clauses.append('缺勤时长偏高')
|
||
elif absence <= 5.5:
|
||
clauses.append('缺勤水平偏低')
|
||
|
||
if age <= 32:
|
||
clauses.append('群体整体更年轻')
|
||
elif age >= 40:
|
||
clauses.append('群体整体更成熟')
|
||
|
||
unique_clauses = []
|
||
for clause in clauses:
|
||
if clause not in unique_clauses:
|
||
unique_clauses.append(clause)
|
||
|
||
if not unique_clauses:
|
||
return default_description
|
||
return ','.join(unique_clauses[:3]) + '。'
|
||
|
||
|
||
# Module-level singleton shared by the API layer.
kmeans_analyzer = KMeansAnalyzer()