117 lines
3.7 KiB
Python
117 lines
3.7 KiB
Python
import json
|
|
from functools import lru_cache
|
|
from pathlib import Path
|
|
|
|
import joblib
|
|
import numpy as np
|
|
import pandas as pd
|
|
import xgboost as xgb
|
|
|
|
# NOTE(review): a package-relative variant of these paths (resolved against
# this file's directory) is commented out below in favor of CWD-relative "./"
# strings. That makes artifact resolution depend on the process working
# directory — confirm the service always starts in the directory holding the
# model files, or restore the BASE_DIR form.
# BASE_DIR = Path(__file__).resolve().parent
# M1_MODEL_PATH = BASE_DIR / "xgboost_model_M1.joblib"
# M1_CATEGORY_ORDERS_PATH = BASE_DIR / "category_orders_train_M1.json"
# M2_MODEL_PATH = BASE_DIR / "xgboost_model_M2.joblib"
# M2_CATEGORY_ORDERS_PATH = BASE_DIR / "category_orders_train_M2.json"
# M2_ISO_PATH = BASE_DIR / "isotonic_model_M2.joblib"

# On-disk locations of the two XGBoost boosters, their training-time category
# orders, and the isotonic calibrator applied on top of the M2 scores.
M1_MODEL_PATH = "./xgboost_model_M1.joblib"
M1_CATEGORY_ORDERS_PATH = "./category_orders_train_M1.json"
M2_MODEL_PATH = "./xgboost_model_M2.joblib"
M2_CATEGORY_ORDERS_PATH = "./category_orders_train_M2.json"
M2_ISO_PATH = "./isotonic_model_M2.joblib"
|
|
|
|
|
|
def _load_category_orders(path: Path) -> dict:
|
|
with open(path, "r") as f:
|
|
return json.load(f)
|
|
|
|
|
|
@lru_cache(maxsize=1)
def _load_m1_model():
    """Deserialize the M1 XGBoost model once; later calls hit the cache."""
    model = joblib.load(M1_MODEL_PATH)
    return model
|
|
|
|
|
|
@lru_cache(maxsize=1)
def _load_m2_model():
    """Deserialize the M2 XGBoost model once; later calls hit the cache."""
    model = joblib.load(M2_MODEL_PATH)
    return model
|
|
|
|
|
|
@lru_cache(maxsize=1)
def _load_m2_iso_model():
    """Deserialize the M2 isotonic calibrator once; later calls hit the cache."""
    calibrator = joblib.load(M2_ISO_PATH)
    return calibrator
|
|
|
|
|
|
@lru_cache(maxsize=None)
def _load_category_orders_cached(path: Path):
    """Memoized wrapper around ``_load_category_orders``.

    Keeps one parsed copy per distinct path so repeated scoring calls do not
    re-read the JSON artifact from disk. The key space is the small fixed set
    of module-level path constants, so the unbounded cache cannot grow.
    """
    orders = _load_category_orders(path)
    return orders
|
|
|
|
|
|
def _prepare_m1(df: pd.DataFrame, category_orders: dict) -> pd.DataFrame:
|
|
df = df.copy()
|
|
for col, categories in category_orders.items():
|
|
if col not in df.columns:
|
|
df[col] = np.nan
|
|
df[col].replace([None, "", "null", np.nan, "nan", " "], np.nan, inplace=True)
|
|
df[col] = pd.Categorical(df[col], categories=categories, ordered=True)
|
|
return df
|
|
|
|
|
|
def _prepare_m2(df: pd.DataFrame, category_orders: dict) -> pd.DataFrame:
|
|
df = df.copy()
|
|
for col, categories in category_orders.items():
|
|
if col not in df.columns:
|
|
df[col] = np.nan
|
|
df[col] = df[col].astype(str).str.lower()
|
|
df[col].replace([None, "", "null", np.nan, "nan", " "], np.nan, inplace=True)
|
|
df[col] = pd.Categorical(df[col], categories=categories, ordered=True)
|
|
return df
|
|
|
|
|
|
def processing_m1(input_data: pd.DataFrame) -> pd.DataFrame:
    """Score input rows with the M1 XGBoost model.

    Args:
        input_data: Feature rows (a DataFrame, or anything ``pd.DataFrame``
            accepts).

    Returns:
        The prepared DataFrame with a ``prediction`` column appended.

    Raises:
        ValueError: If the input is empty.
    """
    df = pd.DataFrame(input_data)
    if df.empty:
        raise ValueError("Input DataFrame is empty.")

    model = _load_m1_model()
    # Map caller-side lower-case column names to the training-time names.
    df.rename(columns={"riskrating": "RiskRating", "trueipgeo": "TrueIpGeo"}, inplace=True)
    category_orders = _load_category_orders_cached(M1_CATEGORY_ORDERS_PATH)
    df = _prepare_m1(df, category_orders)

    expected_features = model.feature_names
    # Backfill any expected feature the caller did not supply (mirrors
    # processing_m2); previously a missing feature raised a KeyError at the
    # df[expected_features] selection instead of being scored as missing.
    for feature in expected_features:
        if feature not in df.columns:
            df[feature] = np.nan

    dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=np.nan)
    df["prediction"] = model.predict(dmatrix)
    return df
|
|
|
|
|
|
def processing_m2(input_data: pd.DataFrame) -> pd.DataFrame:
    """Score input rows with the M2 model plus isotonic calibration.

    Args:
        input_data: Feature rows (a DataFrame, or anything ``pd.DataFrame``
            accepts).

    Returns:
        The prepared DataFrame with raw model scores in ``pd_m2`` and
        isotonic-calibrated scores in ``pd_m2_iso``.

    Raises:
        ValueError: If the input is empty.
    """
    frame = pd.DataFrame(input_data)
    if frame.empty:
        raise ValueError("Input DataFrame is empty.")

    booster = _load_m2_model()
    orders = _load_category_orders_cached(M2_CATEGORY_ORDERS_PATH)
    frame = _prepare_m2(frame, orders)

    features = booster.feature_names
    # Any expected feature the caller did not supply is scored as missing.
    for name in features:
        if name not in frame.columns:
            frame[name] = np.nan

    matrix = xgb.DMatrix(frame[features], enable_categorical=True, missing=np.nan)
    raw_scores = booster.predict(matrix)
    frame["pd_m2"] = raw_scores

    # Calibrate raw scores with the fitted isotonic regressor.
    calibrator = _load_m2_iso_model()
    frame["pd_m2_iso"] = calibrator.predict(raw_scores)
    return frame
|
|
|
|
|
|
def processing_all(df_m1: pd.DataFrame, df_m2: pd.DataFrame, df_thx: pd.DataFrame):
    """Score the M1 and M2 inputs; pass the third frame through untouched.

    Returns:
        Tuple of (scored M1 frame, scored M2 frame, ``df_thx`` unchanged).
    """
    scored_m1 = processing_m1(df_m1)
    scored_m2 = processing_m2(df_m2)
    return scored_m1, scored_m2, df_thx
|
|
|
|
|
|
# Legacy single-model entry point
def processing(input_data: pd.DataFrame) -> pd.DataFrame:
    """Backward-compatible alias that scores with the M1 model only."""
    return processing_m1(input_data)
|