# Patch summary (descriptive header only; the patch text below is unchanged).
#
# latitute_longitute_reference.csv
#   - Removes the `,,` row that followed the header row.
#
# pre_processing.py (hunk under pre_processing_m1)
#   - Adds a `"digital_id_confidence_rating" : str` entry between the
#     "fuzzy_device_worst_score" and "day_of_week_sin" entries.
#
# processing.py
#   - Replaces `_prepare` with two helpers: `_prepare_m1` (added in full) and
#     `_prepare_m2` (the former `_prepare` body with the
#     `df[col].astype(str).str.lower()` line removed).
#   - `processing_m1` now calls `_prepare_m1`; `processing_m2` now calls
#     `_prepare_m2`.
#   - Consequence visible in the hunk: column values are no longer lowercased
#     before `pd.Categorical(..., categories=categories, ordered=True)`.
#
# NOTE(review): as shown here, the statements in `_prepare_m1` and in the
#   resulting `_prepare_m2` are the same (the tail of `_prepare_m2` lies outside
#   the hunk) -- confirm whether the two helpers are meant to differ.
# NOTE(review): `df[col].replace(..., inplace=True)` is invoked on a column
#   selection; under pandas Copy-on-Write this may not write back into `df` --
#   confirm against the pandas version in use.
# NOTE(review): the hunk line breaks appear to have been collapsed into spaces
#   in this copy -- verify against the original patch before applying.
diff --git a/latitute_longitute_reference.csv b/latitute_longitute_reference.csv index 67f96b2..1fd4bd6 100644 --- a/latitute_longitute_reference.csv +++ b/latitute_longitute_reference.csv @@ -1,5 +1,4 @@ postal_code_ref,latitute_ref,longitude_ref -,, c1001,-34.61178,-58.41731 8001,-33.9249,18.4241 2176,-33.87706,150.87529 diff --git a/pre_processing.py b/pre_processing.py index fcfe17c..b1e4e5c 100644 --- a/pre_processing.py +++ b/pre_processing.py @@ -276,6 +276,7 @@ def pre_processing_m1(data_df: pd.DataFrame) -> pd.DataFrame: "true_ip_state_confidence": float, "IP_Negative_History": int, "fuzzy_device_worst_score": float, + "digital_id_confidence_rating" : str, "day_of_week_sin": float, "riskrating": str, "payfrequency": str, diff --git a/processing.py b/processing.py index 4f25b1e..95c5d42 100644 --- a/processing.py +++ b/processing.py @@ -47,12 +47,21 @@ def _load_category_orders_cached(path: Path): return _load_category_orders(path) -def _prepare(df: pd.DataFrame, category_orders: dict) -> pd.DataFrame: +def _prepare_m1(df: pd.DataFrame, category_orders: dict) -> pd.DataFrame: + df = df.copy() + for col, categories in category_orders.items(): + if col not in df.columns: + df[col] = np.nan + df[col].replace([None, "", "null", np.nan, "nan", " "], np.nan, inplace=True) + df[col] = pd.Categorical(df[col], categories=categories, ordered=True) + return df + + +def _prepare_m2(df: pd.DataFrame, category_orders: dict) -> pd.DataFrame: df = df.copy() for col, categories in category_orders.items(): if col not in df.columns: df[col] = np.nan - df[col] = df[col].astype(str).str.lower() df[col].replace([None, "", "null", np.nan, "nan", " "], np.nan, inplace=True) df[col] = pd.Categorical(df[col], categories=categories, ordered=True) @@ -67,7 +76,7 @@ def processing_m1(input_data: pd.DataFrame) -> pd.DataFrame: model = _load_m1_model() df.rename(columns={"riskrating": "RiskRating", "trueipgeo": "TrueIpGeo"}, inplace=True) category_orders = 
_load_category_orders_cached(M1_CATEGORY_ORDERS_PATH) - df = _prepare(df, category_orders) + df = _prepare_m1(df, category_orders) expected_features = model.feature_names dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=np.nan) @@ -83,7 +92,7 @@ def processing_m2(input_data: pd.DataFrame) -> pd.DataFrame: model = _load_m2_model() category_orders = _load_category_orders_cached(M2_CATEGORY_ORDERS_PATH) - df = _prepare(df, category_orders) + df = _prepare_m2(df, category_orders) expected_features = model.feature_names for feature in expected_features: