fix(training): patch lightgbm sklearn compatibility

2026-03-12 18:15:09 +08:00
parent d7c8019f96
commit d70bd54c41
16 changed files with 885 additions and 203 deletions


@@ -1,6 +1,7 @@
 import os
 import sys
 import time
+import inspect
 from datetime import datetime
 import joblib
@@ -14,6 +15,8 @@ from sklearn.preprocessing import RobustScaler
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 import config
+from core.deep_learning_model import is_available as deep_learning_available
+from core.deep_learning_model import train_lstm_mlp
 from core.model_features import (
     NUMERICAL_OUTLIER_COLUMNS,
     ORDINAL_COLUMNS,
@@ -43,6 +46,37 @@ except ImportError:
     xgb = None
 
 
+def patch_lightgbm_sklearn_compatibility():
+    """Bridge LightGBM's legacy force_all_finite keyword to scikit-learn
+    builds whose check_X_y only accepts ensure_all_finite.
+    """
+    if lgb is None:
+        return
+    try:
+        from sklearn.utils.validation import check_X_y
+    except Exception:
+        return
+    params = inspect.signature(check_X_y).parameters
+    if 'force_all_finite' in params or 'ensure_all_finite' not in params:
+        return
+
+    def wrapped_check_X_y(*args, force_all_finite=None, **kwargs):
+        if force_all_finite is not None and 'ensure_all_finite' not in kwargs:
+            kwargs['ensure_all_finite'] = force_all_finite
+        return check_X_y(*args, **kwargs)
+
+    try:
+        import lightgbm.compat as lgb_compat
+        import lightgbm.sklearn as lgb_sklearn
+        lgb_compat._LGBMCheckXY = wrapped_check_X_y
+        lgb_sklearn._LGBMCheckXY = wrapped_check_X_y
+    except Exception:
+        pass
+
+
+patch_lightgbm_sklearn_compatibility()
+
+
 def print_training_log(model_name, start_time, best_score, best_params, n_iter, cv_folds):
     elapsed = time.time() - start_time
     print(f' {"-" * 50}')
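The underlying incompatibility: scikit-learn renamed the force_all_finite keyword of its validation helpers to ensure_all_finite (deprecated in 1.6, slated for removal afterwards), while LightGBM releases that still pass the old name through _LGBMCheckXY fail with a TypeError on builds where it is gone. A minimal smoke test for the patched path, assuming lightgbm and a post-rename scikit-learn are installed and the patch above has already run:

    import numpy as np
    import lightgbm as lgb

    X = np.random.default_rng(0).random((50, 4))
    y = np.random.default_rng(1).random(50)
    # Without the shim, fit() can raise on post-rename scikit-learn:
    #   TypeError: check_X_y() got an unexpected keyword argument 'force_all_finite'
    lgb.LGBMRegressor(n_estimators=10, verbose=-1).fit(X, y)
    print('LGBMRegressor.fit OK')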
@@ -68,6 +102,10 @@ class OptimizedModelTrainer:
         self.feature_k = 22
         self.target_transform = 'log1p'
         self.enabled_models = ['random_forest', 'gradient_boosting', 'extra_trees', 'lightgbm', 'xgboost']
+        if deep_learning_available():
+            self.enabled_models.append('lstm_mlp')
+        self.raw_train_df = None
+        self.raw_test_df = None
 
     def analyze_data(self, df):
         y = df[TARGET_COLUMN]
@@ -96,19 +134,21 @@ class OptimizedModelTrainer:
         return self.feature_selector.transform(X) if self.feature_selector else X
 
     def prepare_data(self):
-        df = normalize_columns(get_clean_data())
-        df = prepare_modeling_dataframe(df)
-        self.analyze_data(df)
+        raw_df = normalize_columns(get_clean_data())
+        self.analyze_data(prepare_modeling_dataframe(raw_df.copy()))
 
-        target_bins = make_target_bins(df[TARGET_COLUMN].values)
-        train_df, test_df = train_test_split(
-            df,
+        target_bins = make_target_bins(raw_df[TARGET_COLUMN].values)
+        raw_train_df, raw_test_df = train_test_split(
+            raw_df,
             test_size=config.TEST_SIZE,
             random_state=config.RANDOM_STATE,
             stratify=target_bins,
         )
-        train_df = train_df.reset_index(drop=True)
-        test_df = test_df.reset_index(drop=True)
+        self.raw_train_df = raw_train_df.reset_index(drop=True)
+        self.raw_test_df = raw_test_df.reset_index(drop=True)
+
+        train_df = prepare_modeling_dataframe(self.raw_train_df)
+        test_df = prepare_modeling_dataframe(self.raw_test_df)
 
         self.outlier_bounds = fit_outlier_bounds(train_df, NUMERICAL_OUTLIER_COLUMNS)
         train_df = apply_outlier_bounds(train_df, self.outlier_bounds)
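The reworked prepare_data splits the raw frame first and derives model features per split, so everything fitted downstream (outlier bounds, scaler, feature selector) only ever sees training rows, while the untouched raw splits stay available for the LSTM/MLP trainer. A condensed, self-contained sketch of that ordering, using stand-ins for the project's make_target_bins and prepare_modeling_dataframe helpers:

    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split

    def bin_target(values, bins=5):
        # Stand-in for make_target_bins: quantile bins for stratification.
        return pd.qcut(values, q=bins, labels=False, duplicates='drop')

    def prepare_features(df):
        # Stand-in for prepare_modeling_dataframe: per-split feature derivation.
        return df.assign(log_target=np.log1p(df['target']))

    raw_df = pd.DataFrame({'target': np.random.default_rng(0).random(100)})
    raw_train, raw_test = train_test_split(
        raw_df, test_size=0.2, random_state=42,
        stratify=bin_target(raw_df['target'].values),
    )
    # Feature engineering happens per split: nothing is fitted on test rows,
    # and the raw frames remain available for the deep-learning path.
    train_df = prepare_features(raw_train.reset_index(drop=True))
    test_df = prepare_features(raw_test.reset_index(drop=True))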
@@ -138,7 +178,8 @@
             'feature_count_after_selection': int(X_train.shape[1]),
             'training_date': datetime.now().strftime('%Y-%m-%d'),
             'target_transform': self.target_transform,
-            'available_models': list(self.enabled_models),
+            'available_models': [],
+            'deep_learning_available': False,
         }
         return X_train, X_test, y_train, y_test
@@ -206,20 +247,25 @@
     def train_lightgbm(self, X_train, y_train):
         if lgb is None:
             return
-        self._run_search(
-            'lightgbm',
-            lgb.LGBMRegressor(random_state=config.RANDOM_STATE, n_jobs=-1, verbose=-1),
-            {
-                'n_estimators': [180, 260, 340],
-                'max_depth': [7, 9, -1],
-                'learning_rate': [0.03, 0.05, 0.08],
-                'subsample': [0.7, 0.85, 1.0],
-                'colsample_bytree': [0.7, 0.85, 1.0],
-                'num_leaves': [31, 50, 70],
-            },
-            X_train,
-            y_train,
-        )
+        try:
+            self._run_search(
+                'lightgbm',
+                lgb.LGBMRegressor(random_state=config.RANDOM_STATE, n_jobs=-1, verbose=-1),
+                {
+                    'n_estimators': [180, 260, 340],
+                    'max_depth': [7, 9, -1],
+                    'learning_rate': [0.03, 0.05, 0.08],
+                    'subsample': [0.7, 0.85, 1.0],
+                    'colsample_bytree': [0.7, 0.85, 1.0],
+                    'num_leaves': [31, 50, 70],
+                },
+                X_train,
+                y_train,
+            )
+        except Exception as exc:
+            print(f' {"-" * 50}')
+            print(' Model: lightgbm')
+            print(f' Skipped: {exc}')
 
     def train_xgboost(self, X_train, y_train):
         if xgb is None:
@@ -254,6 +300,7 @@
         os.makedirs(config.MODELS_DIR, exist_ok=True)
         for name, model in self.models.items():
             joblib.dump(model, os.path.join(config.MODELS_DIR, f'{name}_model.pkl'))
+        self.training_metadata['available_models'] = list(self.model_metrics.keys())
         joblib.dump(self.scaler, config.SCALER_PATH)
         joblib.dump(self.feature_names, os.path.join(config.MODELS_DIR, 'feature_names.pkl'))
         joblib.dump(self.selected_features, os.path.join(config.MODELS_DIR, 'selected_features.pkl'))
@@ -282,6 +329,23 @@
             self.model_metrics[name] = metrics
             print(f' {name:20s} R2={metrics["r2"]:.4f} RMSE={metrics["rmse"]:.4f} MAE={metrics["mae"]:.4f}')
 
+        if 'lstm_mlp' in self.enabled_models and self.raw_train_df is not None and self.raw_test_df is not None:
+            deep_model_path = os.path.join(config.MODELS_DIR, 'lstm_mlp_model.pt')
+            deep_result = train_lstm_mlp(
+                self.raw_train_df,
+                self.raw_test_df,
+                deep_model_path,
+                target_transform=self.target_transform,
+            )
+            if deep_result:
+                self.model_metrics['lstm_mlp'] = deep_result['metrics']
+                self.training_metadata['deep_learning_available'] = True
+                self.training_metadata.update(deep_result['metadata'])
+                print(
+                    f' {"lstm_mlp":20s} R2={deep_result["metrics"]["r2"]:.4f} '
+                    f'RMSE={deep_result["metrics"]["rmse"]:.4f} MAE={deep_result["metrics"]["mae"]:.4f}'
+                )
+
         self.save_models()
         return self.model_metrics
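The call site above implies a contract for core.deep_learning_model.train_lstm_mlp: it receives the raw train/test frames, a model path, and the target transform, and returns either None (skip the model) or a dict with 'metrics' and 'metadata' keys. The stub below is a hypothetical stand-in inferred from how the result is consumed, not the real implementation:

    def train_lstm_mlp(train_df, test_df, model_path, target_transform='log1p'):
        # Real implementation fits the network and writes model_path;
        # returning None tells the caller to skip the model entirely.
        if train_df is None or test_df is None:
            return None
        return {
            'metrics': {'r2': 0.0, 'rmse': 0.0, 'mae': 0.0},  # placeholder scores
            'metadata': {},  # extra keys merged into training_metadata
        }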