import numpy as np

import config
from core.preprocessing import get_clean_data

# Columns offered to the correlation heatmap, in display order.
_HEATMAP_FEATURES = (
    '月均加班时长',
    '通勤时长分钟',
    '近90天缺勤次数',
    'BMI',
    '近30天睡眠时长均值',
    '缺勤时长(小时)',
)

# dimension key -> (data column, optional raw-value -> label map, display name)
_DIMENSION_MAP = {
    'industry': ('所属行业', None, '所属行业'),
    'shift_type': ('班次类型', None, '班次类型'),
    'job_family': ('岗位序列', None, '岗位序列'),
    'marital_status': ('婚姻状态', None, '婚姻状态'),
    'chronic_disease': (
        '是否慢性病史',
        {0: '无慢性病史', 1: '有慢性病史'},
        '慢性病史',
    ),
}


def calculate_correlation():
    """Return the pairwise correlation matrix of the numeric columns.

    The data comes from ``get_clean_data()``. The column named by
    ``config.EMPLOYEE_ID_COLUMN`` is left out when it is numeric, so it does
    not take part in the correlation.

    Returns:
        The result of ``DataFrame.corr()`` over the remaining numeric columns.
    """
    df = get_clean_data()
    excluded = {config.EMPLOYEE_ID_COLUMN}
    numeric_cols = [
        col
        for col in df.select_dtypes(include=[np.number]).columns
        if col not in excluded
    ]
    return df[numeric_cols].corr()


def get_correlation_for_heatmap() -> dict:
    """Build the heatmap payload for the key features.

    Only the features from ``_HEATMAP_FEATURES`` that are present in the
    correlation matrix are used, in that order.

    Returns:
        A dict with:
          * ``'features'``: one label per feature, looked up in
            ``config.FEATURE_NAME_CN`` and falling back to the column name.
          * ``'matrix'``: the square sub-matrix as nested lists, rounded to
            two decimals.
    """
    corr_matrix = calculate_correlation()
    key_features = [f for f in _HEATMAP_FEATURES if f in corr_matrix.columns]
    sub_matrix = corr_matrix.loc[key_features, key_features]
    # NOTE(review): NaN coefficients, if any, are passed through unchanged.
    return {
        'features': [config.FEATURE_NAME_CN.get(f, f) for f in key_features],
        'matrix': sub_matrix.values.round(2).tolist(),
    }


def group_comparison(dimension: str) -> dict:
    """Compare the mean of ``config.TARGET_COLUMN`` across one dimension.

    Args:
        dimension: One of the keys of ``_DIMENSION_MAP`` (``'industry'``,
            ``'shift_type'``, ``'job_family'``, ``'marital_status'``,
            ``'chronic_disease'``).

    Returns:
        A dict with:
          * ``'dimension'`` / ``'dimension_name'``: the requested key and its
            display name.
          * ``'groups'``: one entry per distinct value of the dimension
            column (``name``, ``value``, ``avg_hours``, ``count``,
            ``percentage``), sorted by ``avg_hours`` descending.
          * ``'difference'``: the gap between the highest and lowest group
            average, as an absolute ``value`` and as a ``percentage`` of the
            lowest average (0 when that average is 0).

    Raises:
        ValueError: If ``dimension`` is not a supported key.
    """
    # Validate first so a bad request fails before the data is loaded.
    if dimension not in _DIMENSION_MAP:
        raise ValueError(f"Invalid dimension: {dimension}")
    column, value_map, dimension_name = _DIMENSION_MAP[dimension]

    df = get_clean_data()
    total_rows = len(df)

    groups = []
    # Missing values are dropped: they would form an empty group (NaN never
    # equals itself) and cannot be ordered against strings by sorted().
    for value in sorted(df[column].dropna().unique()):
        group_df = df[df[column] == value]
        label = value_map.get(value, value) if value_map else value
        is_integer = isinstance(value, (int, np.integer))
        groups.append({
            # Always a string, including raw values missing from value_map.
            'name': str(label),
            'value': int(value) if is_integer else str(value),
            'avg_hours': round(float(group_df[config.TARGET_COLUMN].mean()), 2),
            'count': len(group_df),
            'percentage': round(len(group_df) / total_rows * 100, 1),
        })

    groups.sort(key=lambda item: item['avg_hours'], reverse=True)

    if groups:
        top = groups[0]['avg_hours']
        # With a single group, top and bottom coincide and the gap is 0.
        bottom = groups[-1]['avg_hours']
    else:
        top = bottom = 0

    diff_value = round(top - bottom, 2)
    # Guard against dividing by a zero baseline.
    diff_percentage = round(diff_value / bottom * 100, 1) if bottom else 0

    return {
        'dimension': dimension,
        'dimension_name': dimension_name,
        'groups': groups,
        'difference': {
            'value': diff_value,
            'percentage': diff_percentage,
        },
    }