blocks-transformer/processing.py
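
"""Scoring helpers for the M1 and M2 XGBoost models.

Loads the serialized boosters, the category orders captured at training time,
and the isotonic calibrator for M2, then prepares incoming DataFrames and
attaches the model predictions.
"""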
import json
from functools import lru_cache
from pathlib import Path
import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
# BASE_DIR = Path(__file__).resolve().parent
# M1_MODEL_PATH = BASE_DIR / "xgboost_model_M1.joblib"
# M1_CATEGORY_ORDERS_PATH = BASE_DIR / "category_orders_train_M1.json"
# M2_MODEL_PATH = BASE_DIR / "xgboost_model_M2.joblib"
# M2_CATEGORY_ORDERS_PATH = BASE_DIR / "category_orders_train_M2.json"
# M2_ISO_PATH = BASE_DIR / "isotonic_model_M2.joblib"
M1_MODEL_PATH = "./xgboost_model_M1.joblib"
M1_CATEGORY_ORDERS_PATH = "./category_orders_train_M1.json"
M2_MODEL_PATH = "./xgboost_model_M2.joblib"
M2_CATEGORY_ORDERS_PATH = "./category_orders_train_M2.json"
M2_ISO_PATH = "./isotonic_model_M2.joblib"
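
# Note: the "./" paths resolve against the process working directory, so the
# service must be started from the directory containing the model artifacts
# (the commented-out BASE_DIR variant resolves them next to this module instead).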

def _load_category_orders(path: Path) -> dict:
    with open(path, "r") as f:
        return json.load(f)
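
# The category-order JSON files are expected to map each categorical column to
# its ordered list of training categories, e.g. (values are illustrative only):
#   {"RiskRating": ["low", "medium", "high"], "TrueIpGeo": ["domestic", "foreign"]}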


@lru_cache(maxsize=1)
def _load_m1_model():
    return joblib.load(M1_MODEL_PATH)


@lru_cache(maxsize=1)
def _load_m2_model():
    return joblib.load(M2_MODEL_PATH)


@lru_cache(maxsize=1)
def _load_m2_iso_model():
    return joblib.load(M2_ISO_PATH)


@lru_cache(maxsize=None)
def _load_category_orders_cached(path: Path):
    # Cache category orders per path to avoid disk I/O on each scoring
    return _load_category_orders(path)


def _prepare_m1(df: pd.DataFrame, category_orders: dict) -> pd.DataFrame:
    df = df.copy()
    for col, categories in category_orders.items():
        if col not in df.columns:
            df[col] = np.nan
        # Normalise null-like tokens before casting to the training categories.
        df[col] = df[col].replace([None, "", "null", np.nan, "nan", " "], np.nan)
        df[col] = pd.Categorical(df[col], categories=categories, ordered=True)
    return df


def _prepare_m2(df: pd.DataFrame, category_orders: dict) -> pd.DataFrame:
    df = df.copy()
    for col, categories in category_orders.items():
        if col not in df.columns:
            df[col] = np.nan
        # M2 categories are lower-cased strings; astype(str) turns NaN into "nan",
        # which the replace below maps back to np.nan.
        df[col] = df[col].astype(str).str.lower()
        df[col] = df[col].replace([None, "", "null", np.nan, "nan", " "], np.nan)
        df[col] = pd.Categorical(df[col], categories=categories, ordered=True)
    return df


def processing_m1(input_data: pd.DataFrame) -> pd.DataFrame:
    df = pd.DataFrame(input_data)
    if df.empty:
        raise ValueError("Input DataFrame is empty.")
    model = _load_m1_model()
    # Restore the column casing the M1 model was trained with.
    df.rename(columns={"riskrating": "RiskRating", "trueipgeo": "TrueIpGeo"}, inplace=True)
    category_orders = _load_category_orders_cached(M1_CATEGORY_ORDERS_PATH)
    df = _prepare_m1(df, category_orders)
    expected_features = model.feature_names
    dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=np.nan)
    predictions = model.predict(dmatrix)
    df["prediction"] = predictions
    return df


def processing_m2(input_data: pd.DataFrame) -> pd.DataFrame:
    df = pd.DataFrame(input_data)
    if df.empty:
        raise ValueError("Input DataFrame is empty.")
    model = _load_m2_model()
    category_orders = _load_category_orders_cached(M2_CATEGORY_ORDERS_PATH)
    df = _prepare_m2(df, category_orders)
    expected_features = model.feature_names
    # Ensure every feature the model expects exists, even when the input rows
    # carry no value for it.
    for feature in expected_features:
        if feature not in df.columns:
            df[feature] = np.nan
    dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=np.nan)
    pd_arr = model.predict(dmatrix)
    df["pd_m2"] = pd_arr
    # Calibrate the raw M2 scores with the isotonic model.
    iso_model = _load_m2_iso_model()
    df["pd_m2_iso"] = iso_model.predict(pd_arr)
    return df


def processing_all(df_m1: pd.DataFrame, df_m2: pd.DataFrame, df_thx: pd.DataFrame):
    return processing_m1(df_m1), processing_m2(df_m2), df_thx


# Legacy single-model entry point
def processing(input_data: pd.DataFrame) -> pd.DataFrame:
    return processing_m1(input_data)
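

# Minimal usage sketch, assuming the model artifacts sit in the working
# directory. The column names and values below are illustrative only; the real
# feature set comes from the serialized boosters and category-order JSON files.
if __name__ == "__main__":
    sample = pd.DataFrame(
        [
            {"riskrating": "High", "trueipgeo": "US"},
            {"riskrating": None, "trueipgeo": None},
        ]
    )
    print(processing_m1(sample)[["prediction"]])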