import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

import config
from core.preprocessing import get_clean_data


class KMeansAnalyzer:
    """KMeans-based employee segmentation over six numeric HR features.

    The model is fitted lazily on data returned by ``get_clean_data()`` and
    results are exposed as JSON-serializable dicts suitable for charting
    (cluster summaries, radar profiles, scatter plots). Cluster archetype
    names and descriptions are generated in Chinese from the cluster centers.
    """

    def __init__(self, n_clusters=3):
        self.n_clusters = n_clusters
        self.model = None             # fitted sklearn KMeans, None before fit()
        self.scaler = MinMaxScaler()  # fitted on the raw feature matrix in fit()
        self.labels = None            # per-row cluster ids from the last fit()
        # Feature columns read from the cleaned DataFrame. Order matters:
        # helper methods below index cluster centers positionally as
        # (age, tenure, overtime, commute, BMI, absence).
        self.feature_cols = [
            '年龄', '司龄年数', '月均加班时长', '通勤时长分钟', 'BMI', '缺勤时长(小时)',
        ]

    def fit(self, n_clusters=None):
        """Fit KMeans on min-max-scaled features and return the fitted model.

        ``n_clusters`` overrides the instance setting when given. The check is
        an explicit ``is not None`` (the original truthiness test would have
        silently ignored any falsy override).
        """
        if n_clusters is not None:
            self.n_clusters = n_clusters
        df = get_clean_data().reset_index(drop=True)
        data_scaled = self.scaler.fit_transform(df[self.feature_cols].values)
        self.model = KMeans(
            n_clusters=self.n_clusters,
            random_state=config.RANDOM_STATE,
            n_init=10,
        )
        self.labels = self.model.fit_predict(data_scaled)
        return self.model

    def _ensure_fitted(self, n_clusters):
        """Refit only when no model exists yet or the requested k changed."""
        if self.model is None or self.n_clusters != n_clusters:
            self.fit(n_clusters)

    def get_cluster_results(self, n_clusters=3):
        """Return per-cluster summaries: name, size, share, center, description."""
        self._ensure_fitted(n_clusters)
        # Centers are reported in original (unscaled) feature units.
        centers = self.scaler.inverse_transform(self.model.cluster_centers_)
        unique, counts = np.unique(self.labels, return_counts=True)
        total = len(self.labels)
        names = self._generate_cluster_names(centers)
        clusters = []
        for cluster_id, count in zip(unique, counts):
            cid = int(cluster_id)
            center = centers[cid]
            name = names.get(cid, '常规稳态型')
            clusters.append({
                'id': cid,
                'name': name,
                'member_count': int(count),
                'percentage': round(count / total * 100, 1),
                'center': {
                    feature: round(float(value), 2)
                    for feature, value in zip(self.feature_cols, center)
                },
                'description': self._generate_description(name, center),
            })
        return {'n_clusters': self.n_clusters, 'clusters': clusters}

    def get_cluster_profile(self, n_clusters=3):
        """Return radar-chart data: scaled (0-1) center values per cluster."""
        self._ensure_fitted(n_clusters)
        centers_scaled = self.model.cluster_centers_
        # Names are derived from the unscaled centers so the absolute
        # thresholds in _classify_cluster remain meaningful.
        names = self._generate_cluster_names(
            self.scaler.inverse_transform(centers_scaled)
        )
        return {
            'dimensions': ['年龄', '司龄', '加班', '通勤', 'BMI', '缺勤'],
            'dimension_keys': self.feature_cols,
            'clusters': [
                {
                    'id': idx,
                    'name': names.get(idx, '常规稳态型'),
                    'values': [round(float(v), 2) for v in centers_scaled[idx]],
                }
                for idx in range(self.n_clusters)
            ],
        }

    def get_scatter_data(self, n_clusters=3, x_axis='月均加班时长', y_axis='缺勤时长(小时)'):
        """Return up to 500 scatter points labelled by cluster id.

        Unknown axis names fall back to defaults instead of raising, so the
        caller (presumably a web endpoint) cannot crash on bad query params.
        """
        self._ensure_fitted(n_clusters)
        df = get_clean_data().reset_index(drop=True)
        centers = self.scaler.inverse_transform(self.model.cluster_centers_)
        names = self._generate_cluster_names(centers)
        if x_axis not in df.columns:
            x_axis = '月均加班时长'
        if y_axis not in df.columns:
            y_axis = config.TARGET_COLUMN
        points = []
        # min() guards against a df/labels length mismatch, since labels come
        # from an earlier fit over a possibly different data snapshot.
        for idx in range(min(len(df), len(self.labels))):
            row = df.iloc[idx]
            points.append({
                'employee_id': str(row[config.EMPLOYEE_ID_COLUMN]),
                'x': float(row[x_axis]),
                'y': float(row[y_axis]),
                'cluster_id': int(self.labels[idx]),
            })
        return {
            'x_axis': x_axis,
            'x_axis_name': config.FEATURE_NAME_CN.get(x_axis, x_axis),
            'y_axis': y_axis,
            'y_axis_name': config.FEATURE_NAME_CN.get(y_axis, y_axis),
            'points': points[:500],
            'cluster_colors': {
                '0': '#5B8FF9', '1': '#61DDAA', '2': '#F6BD16',
                '3': '#E8684A', '4': '#6DC8EC',
            },
            'cluster_names': {
                str(idx): names.get(idx, '常规稳态型')
                for idx in range(self.n_clusters)
            },
        }

    def _generate_cluster_names(self, centers):
        """Name every cluster from its center, then de-duplicate collisions."""
        rank_info = self._build_rank_info(centers)
        base_names = {
            idx: self._classify_cluster(center, rank_info, idx)
            for idx, center in enumerate(centers)
        }
        return self._deduplicate_cluster_names(base_names, centers)

    def _build_rank_info(self, centers):
        """Per-dimension descending rank of each cluster (0 = highest value)."""
        centers = np.asarray(centers, dtype=float)
        return {
            '年龄': self._rank_desc(centers[:, 0]),
            '司龄': self._rank_desc(centers[:, 1]),
            '加班': self._rank_desc(centers[:, 2]),
            '通勤': self._rank_desc(centers[:, 3]),
            'BMI': self._rank_desc(centers[:, 4]),
            '缺勤': self._rank_desc(centers[:, 5]),
        }

    def _rank_desc(self, values):
        """Map each index to its rank when *values* is sorted descending."""
        ordered = np.argsort(-np.asarray(values, dtype=float))
        return {int(idx): rank for rank, idx in enumerate(ordered)}

    def _classify_cluster(self, center, rank_info, idx):
        """Map one (unscaled) cluster center to a Chinese archetype name.

        Each rule fires on absolute thresholds OR relative ranks, so naming
        stays meaningful both for typical data ranges and for skewed
        distributions where no cluster crosses an absolute cutoff.
        """
        age, tenure, overtime, commute, bmi, absence = center
        n = len(rank_info['缺勤'])
        high_absence = rank_info['缺勤'][idx] == 0
        low_absence = rank_info['缺勤'][idx] == n - 1
        high_overtime = rank_info['加班'][idx] <= 1
        high_commute = rank_info['通勤'][idx] <= 1
        high_bmi = rank_info['BMI'][idx] <= 1
        high_tenure = rank_info['司龄'][idx] <= 1
        low_tenure = rank_info['司龄'][idx] >= len(rank_info['司龄']) - 1
        young_group = rank_info['年龄'][idx] >= len(rank_info['年龄']) - 1
        # Rules are ordered by priority; the first match wins.
        if (absence >= 7.5 and overtime >= 28 and commute >= 40) or (
            high_absence and high_overtime and high_commute
        ):
            return '压力奔波型'
        if (absence >= 7.0 and bmi >= 25.5) or (high_absence and high_bmi):
            return '健康关注型'
        if (overtime >= 30 and absence >= 6.0) or (
            high_overtime and rank_info['缺勤'][idx] <= 1
        ):
            return '负荷承压型'
        if (tenure >= 8 and absence <= 6.0) or (high_tenure and low_absence):
            return '稳定成熟型'
        if (tenure <= 4 and age <= 32) or (low_tenure and young_group):
            return '新锐成长型'
        if commute <= 35 and absence <= 6.5:
            return '通勤平衡型'
        if tenure >= 6 and absence <= 6.8:
            return '经验稳健型'
        return '常规稳态型'

    def _deduplicate_cluster_names(self, names, centers):
        """Append ranked suffixes when several clusters share a base name."""
        grouped = {}
        for idx, name in names.items():
            grouped.setdefault(name, []).append(idx)
        deduplicated = names.copy()
        for name, indices in grouped.items():
            if len(indices) == 1:
                continue
            order = self._build_duplicate_order(indices, centers)
            suffixes = self._suffix_candidates(name)
            for rank, idx in enumerate(order):
                # Fall back to a plain number once curated suffixes run out.
                suffix = suffixes[rank] if rank < len(suffixes) else f'{rank + 1}'
                deduplicated[idx] = f'{name}{suffix}'
        return deduplicated

    def _build_duplicate_order(self, indices, centers):
        """Order duplicate clusters by descending severity for suffixing."""
        return sorted(
            indices,
            key=lambda idx: (
                centers[idx][5],  # absence hours
                centers[idx][2],  # overtime
                centers[idx][1],  # tenure
                centers[idx][3],  # commute
                centers[idx][4],  # BMI
                centers[idx][0],  # age
            ),
            reverse=True,
        )

    def _suffix_candidates(self, name):
        """Curated suffix lists per archetype; numeric fallback otherwise."""
        suffix_map = {
            '压力奔波型': ['-高压组', '-长途组', '-持续关注组'],
            '健康关注型': ['-重点关注组', '-预警组', '-干预组'],
            '负荷承压型': ['-高负荷组', '-轮班组', '-调节组'],
            '稳定成熟型': ['-资深组', '-成熟组', '-稳健组'],
            '新锐成长型': ['-适应组', '-成长组', '-潜力组'],
            '通勤平衡型': ['-均衡组', '-稳态组', '-协同组'],
            '经验稳健型': ['-资深组', '-稳健组', '-协同组'],
            '常规稳态型': ['-平衡组', '-常态组', '-协同组'],
        }
        return suffix_map.get(name, [f'({idx})' for idx in range(1, 10)])

    def _generate_description(self, name, center=None):
        """Describe a cluster; prefix-match handles de-duplication suffixes."""
        descriptions = {
            '压力奔波型': '加班与通勤压力同时偏高,缺勤波动更明显。',
            '健康关注型': '健康负担更突出,缺勤时长偏高,建议优先关注。',
            '负荷承压型': '工作负荷较重,缺勤风险处于偏高水平。',
            '稳定成熟型': '司龄较长,整体状态稳定,缺勤水平偏低。',
            '新锐成长型': '整体更年轻、司龄较短,仍处于适应与成长阶段。',
            '通勤平衡型': '通勤与缺勤表现较均衡,整体波动相对可控。',
            '经验稳健型': '具备一定经验积累,整体表现稳健,缺勤风险较低。',
            '常规稳态型': '整体表现接近企业常态,是较典型的员工群体。',
        }
        for key, description in descriptions.items():
            if name.startswith(key):
                if center is None:
                    return description
                return self._build_dynamic_description(key, center, description)
        # No prefix matched, so `name` cannot be a dict key either; the
        # original `descriptions.get(name, ...)` always hit this default.
        return '整体表现接近企业常态。'

    def _build_dynamic_description(self, base_name, center, default_description):
        """Build up to three center-driven clauses; fall back to the default.

        ``base_name`` is currently unused but kept for interface stability.
        """
        age, tenure, overtime, commute, bmi, absence = center
        clauses = []
        if tenure >= 8:
            clauses.append('司龄较长')
        elif tenure <= 4:
            clauses.append('司龄较短')
        if overtime >= 30:
            clauses.append('加班负荷偏高')
        elif overtime <= 18:
            clauses.append('加班压力相对可控')
        if commute >= 45:
            clauses.append('通勤压力偏高')
        elif commute <= 30:
            clauses.append('通勤节奏较平衡')
        if bmi >= 26:
            clauses.append('健康管理压力更明显')
        if absence >= 7.5:
            clauses.append('缺勤时长偏高')
        elif absence <= 5.5:
            clauses.append('缺勤水平偏低')
        if age <= 32:
            clauses.append('群体整体更年轻')
        elif age >= 40:
            clauses.append('群体整体更成熟')
        # De-duplicate while preserving first-seen order.
        unique_clauses = list(dict.fromkeys(clauses))
        if not unique_clauses:
            return default_description
        return ','.join(unique_clauses[:3]) + '。'


# Shared module-level instance used by the rest of the application.
kmeans_analyzer = KMeansAnalyzer()