"""Correlation, feature-importance and group-comparison analytics.

Every data-backed function loads its frame through
``core.preprocessing.get_clean_data`` on each call and returns plain
containers (dicts / lists) built from the computed statistics.
"""
import pandas as pd
import numpy as np

import config
from core.preprocessing import get_clean_data

# Column holding the quantity being analysed (hours of absence).
_TARGET_COLUMN = 'Absenteeism time in hours'

# Identifier column excluded from the correlation matrix.
_ID_COLUMN = 'ID'

# Columns shown on the heatmap, in display order.
# NOTE(review): the trailing space in 'Work load Average/day ' is reproduced
# exactly from the original code; presumably it matches the raw column
# header -- confirm against the dataset before "fixing" it.
_HEATMAP_FEATURES = (
    'Age',
    'Service time',
    'Distance from Residence to Work',
    'Work load Average/day ',
    'Body mass index',
    'Absenteeism time in hours',
)

# dimension key -> (column, {raw value: display label}); one group per
# distinct value found in the column.
_CATEGORICAL_DIMENSIONS = {
    'drinker': ('Social drinker', {0: '不饮酒', 1: '饮酒'}),
    'smoker': ('Social smoker', {0: '不吸烟', 1: '吸烟'}),
    'education': ('Education', {1: '高中', 2: '本科', 3: '研究生', 4: '博士'}),
}

# dimension key -> (column, label for value == 0, label for value > 0);
# the column is collapsed into "none" versus "at least one".
_PRESENCE_DIMENSIONS = {
    'children': ('Son', '无子女', '有子女'),
    'pet': ('Pet', '无宠物', '有宠物'),
}

# dimension key -> display name of the dimension itself.
_DIMENSION_NAMES = {
    'drinker': '饮酒习惯',
    'smoker': '吸烟习惯',
    'education': '学历',
    'children': '子女',
    'pet': '宠物',
}


def calculate_correlation() -> pd.DataFrame:
    """Return the pairwise correlation matrix of the numeric columns.

    The data is loaded with ``get_clean_data()``; the ``'ID'`` column, when
    present among the numeric columns, is left out.

    Returns:
        The result of ``DataFrame.corr()`` over the remaining numeric columns.
    """
    df = get_clean_data()
    numeric_cols = [
        col
        for col in df.select_dtypes(include=[np.number]).columns
        if col != _ID_COLUMN
    ]
    return df[numeric_cols].corr()


def get_correlation_for_heatmap() -> dict:
    """Build the heatmap payload for a fixed selection of features.

    Features missing from the correlation matrix are skipped, so the payload
    only ever references columns that actually exist.

    Returns:
        A dict with two keys:

        * ``'features'`` -- display names looked up in
          ``config.FEATURE_NAME_CN``, falling back to the raw column name.
        * ``'matrix'`` -- the square sub-matrix as nested lists, rounded to
          two decimals, in the same order as ``'features'``.
    """
    corr_matrix = calculate_correlation()
    present = [name for name in _HEATMAP_FEATURES if name in corr_matrix.columns]
    sub_matrix = corr_matrix.loc[present, present]
    return {
        'features': [config.FEATURE_NAME_CN.get(name, name) for name in present],
        'matrix': sub_matrix.values.round(2).tolist(),
    }


def calculate_feature_importance(model, feature_names) -> list:
    """Pair feature names with a model's importances, most important first.

    Args:
        model: Any object exposing a ``feature_importances_`` attribute.
        feature_names: Iterable of names, positionally aligned with
            ``model.feature_importances_``.

    Returns:
        A list of ``(name, importance)`` tuples sorted by descending
        importance.

    Raises:
        ValueError: If ``model`` has no ``feature_importances_`` attribute.
    """
    if not hasattr(model, 'feature_importances_'):
        raise ValueError("Model does not have feature_importances_ attribute")

    # zip() stops at the shorter input and dict() keeps the last value for a
    # repeated name, so surplus entries and duplicates are dropped silently.
    importance_by_name = dict(zip(feature_names, model.feature_importances_))
    return sorted(
        importance_by_name.items(),
        key=lambda item: item[1],
        reverse=True,
    )


def get_feature_importance_from_model(model_path, feature_names) -> list:
    """Load a persisted model and return its sorted feature importances.

    Args:
        model_path: Location of the model file, passed to ``joblib.load``.
        feature_names: Names aligned with the model's importances.

    Returns:
        Same as :func:`calculate_feature_importance`.

    Raises:
        ValueError: If the loaded object has no ``feature_importances_``.
    """
    # Imported lazily so joblib is only required when this function is used.
    import joblib

    # SECURITY: joblib.load unpickles the file and can therefore execute
    # arbitrary code; only pass paths to model files you trust.
    model = joblib.load(model_path)
    return calculate_feature_importance(model, feature_names)


def _summarize_group(group_df, total_rows, name, value) -> dict:
    """Describe one group: label, mean absence hours, size and share of rows."""
    return {
        'name': name,
        'value': value,
        # float() turns the NumPy scalar into a plain Python float so the
        # payload prints and serialises without NumPy wrappers.
        'avg_hours': round(float(group_df[_TARGET_COLUMN].mean()), 2),
        'count': len(group_df),
        'percentage': round(len(group_df) / total_rows * 100, 1),
    }


def group_comparison(dimension: str) -> dict:
    """Compare average absence hours between the groups of one dimension.

    Supported dimensions are ``'drinker'``, ``'smoker'``, ``'education'``,
    ``'children'`` and ``'pet'``. The first three produce one group per
    distinct value in their column; ``'children'`` and ``'pet'`` are reduced
    to two groups, "none" (value ``0``) and "at least one" (value ``> 0``).
    Empty groups are omitted.

    Args:
        dimension: Key of the dimension to compare.

    Returns:
        A dict with:

        * ``'dimension'`` -- the key that was passed in.
        * ``'dimension_name'`` -- display name of the dimension.
        * ``'groups'`` -- list of dicts with ``name``, ``value``,
          ``avg_hours``, ``count`` and ``percentage`` (share of all rows).
        * ``'difference'`` -- ``value`` is the gap between the highest and the
          lowest group average, rounded to two decimals; ``percentage`` is
          that gap relative to the lowest average, or ``0`` when the lowest
          average is not positive. Both are ``0`` with fewer than two groups.
          With exactly two groups this is simply their pairwise difference.

    Raises:
        ValueError: If ``dimension`` is not one of the supported keys.
    """
    # Validate before loading anything so a bad key fails immediately.
    if (dimension not in _CATEGORICAL_DIMENSIONS
            and dimension not in _PRESENCE_DIMENSIONS):
        raise ValueError(f"Invalid dimension: {dimension}")

    df = get_clean_data()
    total_rows = len(df)
    groups = []

    if dimension in _PRESENCE_DIMENSIONS:
        col, none_label, some_label = _PRESENCE_DIMENSIONS[dimension]
        # The "some" group is reported with the synthetic value 1 because it
        # merges every positive count into a single bucket.
        buckets = (
            (df[df[col] == 0], none_label, 0),
            (df[df[col] > 0], some_label, 1),
        )
        for group_df, label, value in buckets:
            if len(group_df) > 0:
                groups.append(
                    _summarize_group(group_df, total_rows, label, value)
                )
    else:
        col, value_map = _CATEGORICAL_DIMENSIONS[dimension]
        for val in sorted(df[col].unique()):
            group_df = df[df[col] == val]
            # A NaN value never equals itself, so its selection is empty and
            # is skipped here before int(val) could fail on it.
            if len(group_df) > 0:
                groups.append(
                    _summarize_group(
                        group_df,
                        total_rows,
                        value_map.get(val, str(val)),
                        int(val),
                    )
                )

    if len(groups) >= 2:
        # Use the spread over *all* groups rather than only the first two, so
        # multi-level dimensions such as education are fully represented.
        averages = [group['avg_hours'] for group in groups]
        lowest, highest = min(averages), max(averages)
        raw_diff = highest - lowest
        diff_percentage = round(raw_diff / lowest * 100, 1) if lowest > 0 else 0
        # Rounded last to strip float-subtraction noise from the output.
        diff_value = round(raw_diff, 2)
    else:
        diff_value = 0
        diff_percentage = 0

    return {
        'dimension': dimension,
        'dimension_name': _DIMENSION_NAMES.get(dimension, dimension),
        'groups': groups,
        'difference': {
            'value': diff_value,
            'percentage': diff_percentage,
        },
    }


if __name__ == '__main__':
    # Manual smoke run: print one payload of each kind.
    print("Correlation matrix:")
    corr = get_correlation_for_heatmap()
    print(corr)

    print("\nGroup comparison (drinker):")
    comp = group_comparison('drinker')
    print(comp)