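Add digital_id_confidence_rating str dtype to pre_processing_m1; split _prepare into _prepare_m1/_prepare_m2 (only M2 lowercases category columns); remove empty row from lat/long reference CSV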
All checks were successful
Build and Push Docker Image / test (push) Successful in 27s
Build and Push Docker Image / build_and_push (push) Successful in 2m39s

This commit is contained in:
Ankur Malik 2025-11-25 05:58:44 -05:00
parent d0f4d225ee
commit 67c2174ab3
3 changed files with 14 additions and 5 deletions

View File

@ -1,5 +1,4 @@
postal_code_ref,latitute_ref,longitude_ref postal_code_ref,latitute_ref,longitude_ref
,,
c1001,-34.61178,-58.41731 c1001,-34.61178,-58.41731
8001,-33.9249,18.4241 8001,-33.9249,18.4241
2176,-33.87706,150.87529 2176,-33.87706,150.87529

Can't render this file because it is too large.

View File

@ -276,6 +276,7 @@ def pre_processing_m1(data_df: pd.DataFrame) -> pd.DataFrame:
"true_ip_state_confidence": float, "true_ip_state_confidence": float,
"IP_Negative_History": int, "IP_Negative_History": int,
"fuzzy_device_worst_score": float, "fuzzy_device_worst_score": float,
"digital_id_confidence_rating" : str,
"day_of_week_sin": float, "day_of_week_sin": float,
"riskrating": str, "riskrating": str,
"payfrequency": str, "payfrequency": str,

View File

@ -47,12 +47,21 @@ def _load_category_orders_cached(path: Path):
return _load_category_orders(path) return _load_category_orders(path)
def _prepare(df: pd.DataFrame, category_orders: dict) -> pd.DataFrame: def _prepare_m1(df: pd.DataFrame, category_orders: dict) -> pd.DataFrame:
df = df.copy() df = df.copy()
for col, categories in category_orders.items(): for col, categories in category_orders.items():
if col not in df.columns: if col not in df.columns:
df[col] = np.nan df[col] = np.nan
df[col].replace([None, "", "null", np.nan, "nan", " "], np.nan, inplace=True)
df[col] = pd.Categorical(df[col], categories=categories, ordered=True)
return df
def _prepare_m2(df: pd.DataFrame, category_orders: dict) -> pd.DataFrame:
df = df.copy()
for col, categories in category_orders.items():
if col not in df.columns:
df[col] = np.nan
df[col] = df[col].astype(str).str.lower() df[col] = df[col].astype(str).str.lower()
df[col].replace([None, "", "null", np.nan, "nan", " "], np.nan, inplace=True) df[col].replace([None, "", "null", np.nan, "nan", " "], np.nan, inplace=True)
df[col] = pd.Categorical(df[col], categories=categories, ordered=True) df[col] = pd.Categorical(df[col], categories=categories, ordered=True)
@ -67,7 +76,7 @@ def processing_m1(input_data: pd.DataFrame) -> pd.DataFrame:
model = _load_m1_model() model = _load_m1_model()
df.rename(columns={"riskrating": "RiskRating", "trueipgeo": "TrueIpGeo"}, inplace=True) df.rename(columns={"riskrating": "RiskRating", "trueipgeo": "TrueIpGeo"}, inplace=True)
category_orders = _load_category_orders_cached(M1_CATEGORY_ORDERS_PATH) category_orders = _load_category_orders_cached(M1_CATEGORY_ORDERS_PATH)
df = _prepare(df, category_orders) df = _prepare_m1(df, category_orders)
expected_features = model.feature_names expected_features = model.feature_names
dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=np.nan) dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=np.nan)
@ -83,7 +92,7 @@ def processing_m2(input_data: pd.DataFrame) -> pd.DataFrame:
model = _load_m2_model() model = _load_m2_model()
category_orders = _load_category_orders_cached(M2_CATEGORY_ORDERS_PATH) category_orders = _load_category_orders_cached(M2_CATEGORY_ORDERS_PATH)
df = _prepare(df, category_orders) df = _prepare_m2(df, category_orders)
expected_features = model.feature_names expected_features = model.feature_names
for feature in expected_features: for feature in expected_features: