"""K-Means clustering of employee records, with helpers that shape the
results for result tables, profile (radar) charts and scatter plots."""

import os

import joblib
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

import config
from core.preprocessing import get_clean_data


class KMeansAnalyzer:
    """Fit K-Means on a fixed set of employee features and expose the result.

    The features are looked up by name in the frame returned by
    ``get_clean_data()``; features that are not present are skipped.
    """

    # Canonical feature name -> display label. The insertion order is the
    # order in which features are fed to the scaler and the model.
    FEATURE_LABELS = {
        'Age': '年龄',
        'Service time': '工龄',
        'Work load Average/day': '工作负荷',
        'Body mass index': 'BMI',
        'Absenteeism time in hours': '缺勤倾向',
    }

    CLUSTER_DESCRIPTIONS = {
        '模范型员工': '工龄长、工作稳定、缺勤率低',
        '压力型员工': '工作负荷大、缺勤较多',
        '生活习惯型员工': 'BMI偏高、需关注健康',
    }

    # Colours keyed by the cluster id rendered as a string.
    CLUSTER_COLORS = {
        '0': '#67C23A',
        '1': '#E6A23C',
        '2': '#F56C6C',
        '3': '#909399',
        '4': '#409EFF',
    }

    def __init__(self, n_clusters=3):
        self.n_clusters = n_clusters
        self.model = None
        self.scaler = MinMaxScaler()
        self.data = None
        self.data_scaled = None
        self.labels = None
        # Canonical names / actual column names of the features used by the
        # last fit(), in model column order.
        self.feature_keys = []
        self.feature_cols = []
        # The model object that produced scaler/labels; lets us notice when
        # self.model was swapped (e.g. by load_model) without a matching fit.
        self._fitted_model = None

    @staticmethod
    def _normalize(name):
        """Return *name* lower-cased with spaces removed, for loose matching."""
        return name.replace(' ', '').lower()

    def _resolve_features(self, df):
        """Return ``(canonical_name, column_name)`` pairs for features in *df*.

        Side effect: the column names of *df* are stripped of surrounding
        whitespace in place, so the returned column names index *df* directly.
        An exact name match is preferred; otherwise the first column whose
        space-insensitive, case-insensitive form equals the canonical name.
        """
        df.columns = [col.strip() for col in df.columns]
        first_by_normalized = {}
        for col in df.columns:
            first_by_normalized.setdefault(self._normalize(col), col)

        resolved = []
        for key in self.FEATURE_LABELS:
            if key in df.columns:
                resolved.append((key, key))
                continue
            col = first_by_normalized.get(self._normalize(key))
            if col is not None:
                resolved.append((key, col))
        return resolved

    def _get_feature_columns(self, df):
        """Return the columns of *df* that hold the clustering features.

        Strips whitespace from the column names of *df* in place.
        """
        return [col for _, col in self._resolve_features(df)]

    def fit(self, n_clusters=None):
        """Fit the scaler and the K-Means model on the cleaned data.

        Args:
            n_clusters: number of clusters; a falsy value keeps the current
                ``self.n_clusters``.

        Returns:
            The fitted ``KMeans`` model.

        Raises:
            ValueError: if none of the expected feature columns is present.
        """
        if n_clusters:
            self.n_clusters = n_clusters

        df = get_clean_data().reset_index(drop=True)
        features = self._resolve_features(df)
        if not features:
            raise ValueError(
                'None of the expected feature columns were found: '
                + ', '.join(self.FEATURE_LABELS)
            )
        self.feature_keys = [key for key, _ in features]
        self.feature_cols = [col for _, col in features]

        self.data = df[self.feature_cols].values
        self.scaler = MinMaxScaler()
        self.data_scaled = self.scaler.fit_transform(self.data)
        self.model = KMeans(
            n_clusters=self.n_clusters,
            random_state=config.RANDOM_STATE,
            n_init=10,
        )
        self.labels = self.model.fit_predict(self.data_scaled)
        self._fitted_model = self.model
        return self.model

    def _ensure_fitted(self, n_clusters):
        """Refit unless the current model, scaler and labels match *n_clusters*.

        A model restored by load_model() comes without a fitted scaler or
        labels, so it is treated as stale and replaced by a fresh fit.
        """
        n_clusters = n_clusters or self.n_clusters
        stale = (
            self.model is None
            or self._fitted_model is not self.model
            or self.n_clusters != n_clusters
        )
        if stale:
            self.fit(n_clusters)

    def get_cluster_results(self, n_clusters=3):
        """Return per-cluster size, share, centre and generated name.

        Returns:
            ``{'n_clusters': int, 'clusters': [...]}`` where each cluster has
            ``id``, ``name``, ``member_count``, ``percentage`` (0-100, one
            decimal), ``center`` (column name -> value in original units,
            two decimals) and ``description``.
        """
        self._ensure_fitted(n_clusters)

        centers = self.scaler.inverse_transform(self.model.cluster_centers_)
        cluster_names = self._generate_cluster_names(centers)
        ids, counts = np.unique(self.labels, return_counts=True)
        total = len(self.labels)

        clusters = []
        for cluster_id, count in zip(ids, counts):
            cid = int(cluster_id)
            # Index centres by cluster id, not by position in `ids`, so an
            # id missing from the labels cannot shift the pairing.
            center = centers[cid]
            clusters.append({
                'id': cid,
                'name': cluster_names.get(cid, f'群体{cid+1}'),
                'member_count': int(count),
                'percentage': round(float(count) / total * 100, 1),
                'center': {
                    col: round(float(value), 2)
                    for col, value in zip(self.feature_cols, center)
                },
                'description': self._generate_description(
                    cluster_names.get(cid, '')
                ),
            })

        return {'n_clusters': self.n_clusters, 'clusters': clusters}

    def get_cluster_profile(self, n_clusters=3):
        """Return the scaled cluster centres for a profile (radar) chart.

        Returns:
            ``{'dimensions': [...], 'dimension_keys': [...], 'clusters': [...]}``;
            ``dimensions`` are display labels aligned one-to-one with
            ``dimension_keys`` (column names) and with each cluster's
            ``values`` (MinMax-scaled centre, two decimals).
        """
        self._ensure_fitted(n_clusters)

        centers_scaled = self.model.cluster_centers_
        cluster_names = self._generate_cluster_names(
            self.scaler.inverse_transform(centers_scaled)
        )
        # Look labels up by feature so a missing feature cannot shift them.
        dimensions = [
            self.FEATURE_LABELS.get(key, key) for key in self.feature_keys
        ]
        clusters = [
            {
                'id': i,
                'name': cluster_names.get(i, f'群体{i+1}'),
                'values': [round(float(v), 2) for v in centers_scaled[i]],
            }
            for i in range(self.n_clusters)
        ]

        return {
            'dimensions': dimensions,
            'dimension_keys': list(self.feature_cols),
            'clusters': clusters,
        }

    @classmethod
    def _match_column(cls, columns, wanted):
        """Return the column of *columns* best matching *wanted*, or None.

        Matching ignores spaces and case. An exact match wins; without one,
        the last column containing *wanted* as a substring is used. Plain
        substring matching alone would let 'Age' resolve to
        'Work load Average/day' ("average" contains "age").
        """
        target = cls._normalize(wanted)
        partial = None
        for col in columns:
            candidate = cls._normalize(col)
            if candidate == target:
                return col
            if target in candidate:
                partial = col
        return partial

    def get_scatter_data(self, n_clusters=3, x_axis='Age',
                         y_axis='Absenteeism time in hours', max_points=500):
        """Return per-employee points for a 2-D scatter plot coloured by cluster.

        Args:
            n_clusters: number of clusters to use (refits when it differs).
            x_axis: name of the column for the x axis (loosely matched).
            y_axis: name of the column for the y axis (loosely matched).
            max_points: maximum number of points returned; ``None`` for all.

        Returns:
            A dict with the resolved axis columns, their display names,
            ``points`` and ``cluster_colors``. An unmatched x axis falls back
            to the first column, an unmatched y axis to the last.
        """
        self._ensure_fitted(n_clusters)

        df = get_clean_data().reset_index(drop=True)
        df.columns = [col.strip() for col in df.columns]

        x_col = self._match_column(df.columns, x_axis)
        y_col = self._match_column(df.columns, y_axis)
        if x_col is None:
            x_col = df.columns[0]
        if y_col is None:
            y_col = df.columns[-1]

        # Rows are paired with labels by position; this assumes
        # get_clean_data() returns rows in the same order as during fit().
        limit = min(len(df), len(self.labels))
        if max_points is not None:
            limit = min(limit, max_points)
        head = df.iloc[:limit]

        points = [
            {
                'employee_id': int(emp_id),
                'x': float(x),
                'y': float(y),
                'cluster_id': int(label),
            }
            for emp_id, x, y, label in zip(
                head['ID'], head[x_col], head[y_col], self.labels[:limit]
            )
        ]

        return {
            'x_axis': x_col,
            'x_axis_name': config.FEATURE_NAME_CN.get(x_col, x_col),
            'y_axis': y_col,
            'y_axis_name': config.FEATURE_NAME_CN.get(y_col, y_col),
            'points': points,
            'cluster_colors': dict(self.CLUSTER_COLORS),
        }

    def _generate_cluster_names(self, centers, feature_keys=None):
        """Return ``{cluster index: display name}`` derived from the centres.

        Args:
            centers: cluster centres in original (unscaled) units.
            feature_keys: canonical feature name for each centre column;
                defaults to the features recorded by the last fit(). When the
                names are unavailable or do not match the centre width, the
                historical positional layout is assumed instead.
        """
        if feature_keys is None:
            feature_keys = self.feature_keys

        names = {}
        for i, center in enumerate(centers):
            if feature_keys and len(feature_keys) == len(center):
                by_key = dict(zip(feature_keys, center))
                service_time = by_key.get('Service time', 0)
                work_load = by_key.get('Work load Average/day', 0)
                bmi = by_key.get('Body mass index', 0)
                absent = by_key.get('Absenteeism time in hours', 0)
            elif len(center) >= 5:
                service_time, work_load, bmi, absent = center[1:5]
            else:
                # Legacy layout: the work-load column is the one missing.
                service_time = center[1] if len(center) > 1 else 0
                work_load = 0
                bmi = center[2] if len(center) > 2 else 0
                absent = center[3] if len(center) > 3 else 0

            if service_time > 15 and absent < 3:
                names[i] = '模范型员工'
            elif work_load > 260 and absent > 5:
                names[i] = '压力型员工'
            elif bmi > 28:
                names[i] = '生活习惯型员工'
            else:
                names[i] = f'群体{i+1}'
        return names

    def _generate_description(self, name):
        """Return the description for a generated cluster name."""
        return self.CLUSTER_DESCRIPTIONS.get(name, '常规员工群体')

    def save_model(self):
        """Persist the K-Means model to ``config.KMEANS_MODEL_PATH``.

        Only the model is written; the scaler and labels are not persisted.
        """
        os.makedirs(config.MODELS_DIR, exist_ok=True)
        joblib.dump(self.model, config.KMEANS_MODEL_PATH)

    def load_model(self):
        """Load a persisted model, if the file exists, and adopt its cluster count.

        The scaler and labels are not part of the file, so the next
        ``get_*`` call refits on the current data with the loaded
        ``n_clusters``.
        """
        if not os.path.exists(config.KMEANS_MODEL_PATH):
            return
        # NOTE: joblib.load unpickles; only load model files from a trusted
        # location.
        loaded = joblib.load(config.KMEANS_MODEL_PATH)
        if loaded is None:
            # save_model() was called before fit(); nothing usable to restore.
            return
        self.model = loaded
        self.n_clusters = loaded.n_clusters


kmeans_analyzer = KMeansAnalyzer()