import json
from functools import lru_cache
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb

# Module-relative artifact paths (kept for reference):
# BASE_DIR = Path(__file__).resolve().parent
# M1_MODEL_PATH = BASE_DIR / "xgboost_model_M1.joblib"
# M1_CATEGORY_ORDERS_PATH = BASE_DIR / "category_orders_train_M1.json"
# M2_MODEL_PATH = BASE_DIR / "xgboost_model_M2.joblib"
# M2_CATEGORY_ORDERS_PATH = BASE_DIR / "category_orders_train_M2.json"
# M2_ISO_PATH = BASE_DIR / "isotonic_model_M2.joblib"

M1_MODEL_PATH = "./xgboost_model_M1.joblib"
M1_CATEGORY_ORDERS_PATH = "./category_orders_train_M1.json"
M2_MODEL_PATH = "./xgboost_model_M2.joblib"
M2_CATEGORY_ORDERS_PATH = "./category_orders_train_M2.json"
M2_ISO_PATH = "./isotonic_model_M2.joblib"

# Values treated as "missing" before categorical encoding.
_MISSING_SENTINELS = [None, "", "null", "nan", " "]


def _load_category_orders(path: str) -> dict:
    with open(path, "r") as f:
        return json.load(f)


@lru_cache(maxsize=1)
def _load_m1_model():
    return joblib.load(M1_MODEL_PATH)


@lru_cache(maxsize=1)
def _load_m2_model():
    return joblib.load(M2_MODEL_PATH)


@lru_cache(maxsize=1)
def _load_m2_iso_model():
    return joblib.load(M2_ISO_PATH)


@lru_cache(maxsize=None)
def _load_category_orders_cached(path: str):
    # Cache category orders per path to avoid disk I/O on each scoring call.
    return _load_category_orders(path)


def _prepare_m1(df: pd.DataFrame, category_orders: dict) -> pd.DataFrame:
    # Normalize missing markers and cast each categorical column to the
    # ordered dtype the M1 model was trained with.
    df = df.copy()
    for col, categories in category_orders.items():
        if col not in df.columns:
            df[col] = np.nan
        # Assign back rather than calling replace(inplace=True) on df[col]:
        # chained in-place replacement is deprecated and breaks under
        # pandas copy-on-write.
        df[col] = df[col].replace(_MISSING_SENTINELS, np.nan)
        df[col] = pd.Categorical(df[col], categories=categories, ordered=True)
    return df


def _prepare_m2(df: pd.DataFrame, category_orders: dict) -> pd.DataFrame:
    # Same as _prepare_m1, except M2 category values are lowercased first.
    df = df.copy()
    for col, categories in category_orders.items():
        if col not in df.columns:
            df[col] = np.nan
        # astype(str) turns NaN into the literal "nan"; the sentinel replace
        # below maps it back to a real missing value.
        df[col] = df[col].astype(str).str.lower()
        df[col] = df[col].replace(_MISSING_SENTINELS, np.nan)
        df[col] = pd.Categorical(df[col], categories=categories, ordered=True)
    return df


def processing_m1(input_data: pd.DataFrame) -> pd.DataFrame:
    df = pd.DataFrame(input_data)
    if df.empty:
        raise ValueError("Input DataFrame is empty.")
    model = _load_m1_model()
    # Upstream payloads use lowercase column names; the model expects CamelCase.
    df = df.rename(columns={"riskrating": "RiskRating", "trueipgeo": "TrueIpGeo"})
    category_orders = _load_category_orders_cached(M1_CATEGORY_ORDERS_PATH)
    df = _prepare_m1(df, category_orders)
    expected_features = model.feature_names
    # Unlike processing_m2, missing non-categorical features are not backfilled
    # here, so the selection below raises KeyError if an expected column is absent.
    dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=np.nan)
    df["prediction"] = model.predict(dmatrix)
    return df


def processing_m2(input_data: pd.DataFrame) -> pd.DataFrame:
    df = pd.DataFrame(input_data)
    if df.empty:
        raise ValueError("Input DataFrame is empty.")
    model = _load_m2_model()
    category_orders = _load_category_orders_cached(M2_CATEGORY_ORDERS_PATH)
    df = _prepare_m2(df, category_orders)
    expected_features = model.feature_names
    # Backfill any expected feature the payload omitted so DMatrix
    # construction does not fail.
    for feature in expected_features:
        if feature not in df.columns:
            df[feature] = np.nan
    dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=np.nan)
    pd_arr = model.predict(dmatrix)
    df["pd_m2"] = pd_arr
    # Calibrate raw M2 scores with the fitted isotonic regression model.
    iso_model = _load_m2_iso_model()
    df["pd_m2_iso"] = iso_model.predict(pd_arr)
    return df


def processing_all(df_m1: pd.DataFrame, df_m2: pd.DataFrame, df_thx: pd.DataFrame):
    # Score both models; df_thx is passed through unchanged.
    return processing_m1(df_m1), processing_m2(df_m2), df_thx


# Legacy single-model entry point
def processing(input_data: pd.DataFrame) -> pd.DataFrame:
    return processing_m1(input_data)
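

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the scoring API). It assumes
# the model and JSON artifacts referenced by the *_PATH constants exist on
# disk, and that the M1 model's feature set covers the two renamed columns;
# the sample column values below are hypothetical placeholders.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample = pd.DataFrame(
        {
            # Lowercase inputs are renamed to "RiskRating"/"TrueIpGeo"
            # inside processing_m1 before categorical encoding.
            "riskrating": ["high", None, "low"],
            "trueipgeo": ["US", "DE", ""],
        }
    )
    scored = processing(sample)  # legacy entry point; delegates to processing_m1
    print(scored["prediction"].tolist())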