import pandas as pd
import numpy as np

import config
from core.preprocessing import get_clean_data


class DataService:
    """Aggregate absenteeism records into chart-ready summary dictionaries.

    The dataset is obtained from ``get_clean_data()`` the first time ``df`` is
    accessed and is then kept on the instance for subsequent calls.
    """

    # Column holding the absence duration of each record.
    HOURS_COLUMN = 'Absenteeism time in hours'
    # A record counts as high risk when its hours are strictly above this.
    HIGH_RISK_HOURS = 8
    # Display labels for months 1..12, in order.
    MONTH_LABELS = ('1月', '2月', '3月', '4月', '5月', '6月',
                    '7月', '8月', '9月', '10月', '11月', '12月')
    # Codes reported by the weekday and season distributions, in output order.
    WEEKDAY_CODES = (2, 3, 4, 5, 6)
    SEASON_CODES = (1, 2, 3, 4)

    def __init__(self):
        self._df = None

    @property
    def df(self):
        """Return the dataset, loading it via ``get_clean_data()`` on first use."""
        if self._df is None:
            self._df = get_clean_data()
        return self._df

    def _hours_by(self, column):
        """Return sum/mean/count of absence hours per distinct value of *column*.

        The result is a DataFrame indexed by the group value with the columns
        ``'sum'``, ``'mean'`` and ``'count'``.
        """
        return self.df.groupby(column)[self.HOURS_COLUMN].agg(
            ['sum', 'mean', 'count']
        )

    @staticmethod
    def _lookup(stats, code):
        """Return ``(sum, mean, count)`` for *code*, or ``None`` if it has no group."""
        if code not in stats.index:
            return None
        return (
            stats.loc[code, 'sum'],
            stats.loc[code, 'mean'],
            stats.loc[code, 'count'],
        )

    def _series_for(self, column, codes):
        """Build parallel total/average/count lists for *codes* grouped by *column*.

        Codes that do not occur in the data contribute ``0`` to all three lists
        so the lists always line up with *codes*.
        """
        stats = self._hours_by(column)
        totals, averages, counts = [], [], []
        for code in codes:
            found = self._lookup(stats, code)
            if found is None:
                totals.append(0)
                averages.append(0)
                counts.append(0)
                continue
            total, mean, count = found
            totals.append(int(total))
            averages.append(round(float(mean), 2))
            counts.append(int(count))
        return totals, averages, counts

    def get_basic_stats(self):
        """Return headline figures for the whole dataset.

        Returns:
            dict with the keys ``total_records``, ``total_employees``,
            ``total_absent_hours``, ``avg_absent_hours``, ``max_absent_hours``,
            ``min_absent_hours`` and ``high_risk_ratio`` (share of records whose
            hours exceed ``HIGH_RISK_HOURS``, rounded to 4 decimals).
            An empty dataset yields zeros for every key.
        """
        df = self.df
        total_records = len(df)
        if total_records == 0:
            # mean/max/min of an empty column are NaN and the ratio would
            # divide by zero, so report an explicit all-zero summary instead.
            return {
                'total_records': 0,
                'total_employees': 0,
                'total_absent_hours': 0,
                'avg_absent_hours': 0.0,
                'max_absent_hours': 0,
                'min_absent_hours': 0,
                'high_risk_ratio': 0.0,
            }

        hours = df[self.HOURS_COLUMN]
        high_risk_count = int((hours > self.HIGH_RISK_HOURS).sum())
        return {
            'total_records': total_records,
            'total_employees': df['ID'].nunique(),
            'total_absent_hours': int(hours.sum()),
            'avg_absent_hours': float(round(hours.mean(), 2)),
            'max_absent_hours': int(hours.max()),
            'min_absent_hours': int(hours.min()),
            'high_risk_ratio': round(high_risk_count / total_records, 4),
        }

    def get_monthly_trend(self):
        """Return absence totals, averages and record counts for months 1..12.

        Returns:
            dict with ``months`` (labels) and the parallel lists
            ``total_hours``, ``avg_hours`` and ``record_counts``; months with
            no records are reported as ``0``.
        """
        totals, averages, counts = self._series_for(
            'Month of absence', range(1, 13)
        )
        return {
            # Copy so callers cannot mutate the shared class-level labels.
            'months': list(self.MONTH_LABELS),
            'total_hours': totals,
            'avg_hours': averages,
            'record_counts': counts,
        }

    def get_weekday_distribution(self):
        """Return absence totals, averages and record counts per weekday code.

        Returns:
            dict with ``weekdays`` (names from ``config.WEEKDAY_NAMES``, falling
            back to the code as a string), ``weekday_codes`` and the parallel
            lists ``total_hours``, ``avg_hours`` and ``record_counts``; codes
            with no records are reported as ``0``.
        """
        codes = list(self.WEEKDAY_CODES)
        totals, averages, counts = self._series_for('Day of the week', codes)
        return {
            'weekdays': [
                config.WEEKDAY_NAMES.get(code, str(code)) for code in codes
            ],
            'weekday_codes': codes,
            'total_hours': totals,
            'avg_hours': averages,
            'record_counts': counts,
        }

    def get_reason_distribution(self):
        """Return the number and share of records per absence reason.

        Returns:
            dict with ``reasons``: a list of ``{code, name, count, percentage}``
            entries ordered by descending count. Names come from
            ``config.REASON_NAMES`` with a generated fallback.
        """
        reason = self.df.groupby('Reason for absence').agg({
            self.HOURS_COLUMN: 'count'
        }).reset_index()
        reason.columns = ['code', 'count']
        reason = reason.sort_values('count', ascending=False)
        total = reason['count'].sum()

        reasons = []
        for raw_code, count in zip(reason['code'], reason['count']):
            code = int(raw_code)
            reasons.append({
                'code': code,
                'name': config.REASON_NAMES.get(code, f'原因{code}'),
                'count': int(count),
                'percentage': float(round(count / total * 100, 1)),
            })
        return {'reasons': reasons}

    def get_season_distribution(self):
        """Return absence statistics for each season code present in the data.

        Returns:
            dict with ``seasons``: a list of ``{code, name, total_hours,
            avg_hours, record_count, percentage}`` entries in code order.
            Season codes without any records are omitted.
        """
        stats = self._hours_by('Seasons')
        # Percentages are relative to every grouped record, not only the
        # season codes listed in SEASON_CODES.
        total_records = stats['count'].sum()

        seasons = []
        for code in self.SEASON_CODES:
            found = self._lookup(stats, code)
            if found is None:
                continue
            total, mean, count = found
            seasons.append({
                'code': int(code),
                'name': config.SEASON_NAMES.get(code, f'季节{code}'),
                'total_hours': int(total),
                'avg_hours': round(float(mean), 2),
                'record_count': int(count),
                'percentage': float(round(count / total_records * 100, 1)),
            })
        return {'seasons': seasons}


# Shared module-level instance; the data itself is loaded on first use of ``df``.
data_service = DataService()