forsetsystem/backend/core/deep_learning_model.py

import os
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import config
from core.model_features import engineer_features

try:
    import torch
    import torch.nn as nn
    from torch.utils.data import DataLoader, TensorDataset
except ImportError:
    torch = None
    nn = None
    DataLoader = None
    TensorDataset = None


WINDOW_SIZE = 5
SEQUENCE_FEATURES = [
    '缺勤月份',
    '星期几',
    '是否节假日前后',
    '请假类型',
    '请假原因大类',
    '是否提供医院证明',
    '是否临时请假',
    '是否连续缺勤',
    '前一工作日是否加班',
    '月均加班时长',
    '通勤时长分钟',
    '是否夜班岗位',
    '是否慢性病史',
    '加班通勤压力指数',
    '缺勤历史强度',
]
STATIC_FEATURES = [
    '所属行业',
    '婚姻状态',
    '岗位序列',
    '岗位级别',
    '年龄',
    '司龄年数',
    '子女数量',
    '班次类型',
    '绩效等级',
    'BMI',
    '健康风险指数',
    '家庭负担指数',
    '岗位稳定性指数',
]


class LSTMMLPRegressor(nn.Module):
    def __init__(self, seq_input_dim: int, static_input_dim: int):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=seq_input_dim,
            hidden_size=48,
            num_layers=1,
            batch_first=True,
            dropout=0.0,
        )
        self.static_net = nn.Sequential(
            nn.Linear(static_input_dim, 32),
            nn.ReLU(),
            nn.Dropout(0.1),
        )
        self.fusion = nn.Sequential(
            nn.Linear(48 + 32, 48),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(48, 1),
        )

    def forward(self, sequence_x, static_x):
        lstm_output, _ = self.lstm(sequence_x)
        sequence_repr = lstm_output[:, -1, :]
        static_repr = self.static_net(static_x)
        fused = torch.cat([sequence_repr, static_repr], dim=1)
        return self.fusion(fused).squeeze(1)


def is_available() -> bool:
    return torch is not None


def _fit_category_maps(df: pd.DataFrame, features: List[str]) -> Dict[str, Dict[str, int]]:
    category_maps = {}
    for feature in features:
        if feature not in df.columns:
            continue
        if pd.api.types.is_numeric_dtype(df[feature]):
            continue
        values = sorted(df[feature].astype(str).unique().tolist())
        category_maps[feature] = {value: idx for idx, value in enumerate(values)}
    return category_maps


def _apply_category_maps(df: pd.DataFrame, features: List[str], category_maps: Dict[str, Dict[str, int]]) -> pd.DataFrame:
    encoded = df.copy()
    for feature in features:
        if feature not in encoded.columns:
            encoded[feature] = 0
            continue
        if feature in category_maps:
            mapper = category_maps[feature]
            encoded[feature] = encoded[feature].astype(str).map(lambda value: mapper.get(value, 0))
    return encoded


def _safe_standardize(values: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    mean = values.mean(axis=0)
    std = values.std(axis=0)
    std = np.where(std < 1e-6, 1.0, std)
    return mean.astype(np.float32), std.astype(np.float32)


def _build_sequence_arrays(
    df: pd.DataFrame,
    category_maps: Dict[str, Dict[str, int]],
    target_transform: str,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    df = engineer_features(df.copy())
    features = sorted(set(SEQUENCE_FEATURES + STATIC_FEATURES))
    df = _apply_category_maps(df, features, category_maps)
    df = df.sort_values(
        [config.EMPLOYEE_ID_COLUMN, config.EVENT_DATE_INDEX_COLUMN, config.EVENT_SEQUENCE_COLUMN]
    ).reset_index(drop=True)

    sequence_samples = []
    static_samples = []
    targets = []

    for _, group in df.groupby(config.EMPLOYEE_ID_COLUMN, sort=False):
        seq_values = group[SEQUENCE_FEATURES].astype(float).values
        static_values = group[STATIC_FEATURES].astype(float).values
        target_values = group[config.TARGET_COLUMN].astype(float).values

        for index in range(len(group)):
            window_slice = seq_values[max(0, index - WINDOW_SIZE + 1): index + 1]
            sequence_window = np.zeros((WINDOW_SIZE, len(SEQUENCE_FEATURES)), dtype=np.float32)
            sequence_window[-len(window_slice):] = window_slice
            sequence_samples.append(sequence_window)
            static_samples.append(static_values[index].astype(np.float32))
            targets.append(float(target_values[index]))

    targets = np.array(targets, dtype=np.float32)
    if target_transform == 'log1p':
        targets = np.log1p(np.clip(targets, a_min=0, a_max=None)).astype(np.float32)

    return (
        np.array(sequence_samples, dtype=np.float32),
        np.array(static_samples, dtype=np.float32),
        targets,
    )


def train_lstm_mlp(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    model_path: str,
    target_transform: str = 'log1p',
    epochs: int = 24,
    batch_size: int = 128,
) -> Optional[Dict]:
    if torch is None:
        return None

    used_features = sorted(set(SEQUENCE_FEATURES + STATIC_FEATURES))
    category_maps = _fit_category_maps(train_df, used_features)
    train_seq, train_static, y_train = _build_sequence_arrays(train_df, category_maps, target_transform)
    test_seq, test_static, y_test_transformed = _build_sequence_arrays(test_df, category_maps, target_transform)

    seq_mean, seq_std = _safe_standardize(train_seq.reshape(-1, train_seq.shape[-1]))
    static_mean, static_std = _safe_standardize(train_static)

    train_seq = ((train_seq - seq_mean) / seq_std).astype(np.float32)
    test_seq = ((test_seq - seq_mean) / seq_std).astype(np.float32)
    train_static = ((train_static - static_mean) / static_std).astype(np.float32)
    test_static = ((test_static - static_mean) / static_std).astype(np.float32)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if device.type == 'cuda':
        device_name = torch.cuda.get_device_name(device)
        print(f'[lstm_mlp] Training device: CUDA ({device_name})')
    else:
        print('[lstm_mlp] Training device: CPU')
    model = LSTMMLPRegressor(seq_input_dim=train_seq.shape[-1], static_input_dim=train_static.shape[-1]).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.MSELoss()

    train_dataset = TensorDataset(
        torch.tensor(train_seq),
        torch.tensor(train_static),
        torch.tensor(y_train),
    )
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    model.train()
    for _ in range(epochs):
        for batch_seq, batch_static, batch_target in train_loader:
            batch_seq = batch_seq.to(device)
            batch_static = batch_static.to(device)
            batch_target = batch_target.to(device)

            optimizer.zero_grad()
            predictions = model(batch_seq, batch_static)
            loss = criterion(predictions, batch_target)
            loss.backward()
            optimizer.step()

    model.eval()
    with torch.no_grad():
        predictions = model(
            torch.tensor(test_seq).to(device),
            torch.tensor(test_static).to(device),
        ).cpu().numpy()

    if target_transform == 'log1p':
        y_pred = np.expm1(predictions)
    else:
        y_pred = predictions
    y_true = test_df[config.TARGET_COLUMN].astype(float).values
    y_pred = np.clip(y_pred, a_min=0, a_max=None)
    mse = mean_squared_error(y_true, y_pred)

    default_prefix = train_seq[:, :-1, :].mean(axis=0).astype(np.float32)
    bundle = {
        'state_dict': model.state_dict(),
        'sequence_features': SEQUENCE_FEATURES,
        'static_features': STATIC_FEATURES,
        'category_maps': category_maps,
        'seq_mean': seq_mean,
        'seq_std': seq_std,
        'static_mean': static_mean,
        'static_std': static_std,
        'default_sequence_prefix': default_prefix,
        'window_size': WINDOW_SIZE,
        'target_transform': target_transform,
        'sequence_input_dim': train_seq.shape[-1],
        'static_input_dim': train_static.shape[-1],
    }
    torch.save(bundle, model_path)

    return {
        'metrics': {
            'r2': round(r2_score(y_true, y_pred), 4),
            'mse': round(mse, 4),
            'rmse': round(float(np.sqrt(mse)), 4),
            'mae': round(mean_absolute_error(y_true, y_pred), 4),
        },
        'metadata': {
            'sequence_window_size': WINDOW_SIZE,
            'sequence_feature_names': SEQUENCE_FEATURES,
            'static_feature_names': STATIC_FEATURES,
        },
    }


def load_lstm_mlp_bundle(model_path: str) -> Optional[Dict]:
    if torch is None or not os.path.exists(model_path):
        return None
    bundle = torch.load(model_path, map_location='cpu')
    model = LSTMMLPRegressor(
        seq_input_dim=bundle['sequence_input_dim'],
        static_input_dim=bundle['static_input_dim'],
    )
    model.load_state_dict(bundle['state_dict'])
    model.eval()
    bundle['model'] = model
    return bundle


def predict_lstm_mlp(bundle: Dict, current_df: pd.DataFrame) -> float:
    df = engineer_features(current_df.copy())
    used_features = sorted(set(bundle['sequence_features'] + bundle['static_features']))
    df = _apply_category_maps(df, used_features, bundle['category_maps'])

    sequence_row = df[bundle['sequence_features']].astype(float).values[0].astype(np.float32)
    static_row = df[bundle['static_features']].astype(float).values[0].astype(np.float32)

    prefix = bundle['default_sequence_prefix']
    sequence_window = np.vstack([prefix, sequence_row.reshape(1, -1)]).astype(np.float32)
    sequence_window = (sequence_window - bundle['seq_mean']) / bundle['seq_std']
    static_row = ((static_row - bundle['static_mean']) / bundle['static_std']).astype(np.float32)

    with torch.no_grad():
        prediction = bundle['model'](
            torch.tensor(sequence_window).unsqueeze(0),
            torch.tensor(static_row).unsqueeze(0),
        ).cpu().numpy()[0]

    if bundle.get('target_transform') == 'log1p':
        prediction = np.expm1(prediction)
    return float(max(0.5, prediction))