Add digital_id_confidence_rating dtype, split prepare for M1/M2, clean lat/long CSV
This commit is contained in:
parent
d0f4d225ee
commit
67c2174ab3
@ -1,5 +1,4 @@
|
||||
postal_code_ref,latitute_ref,longitude_ref
|
||||
,,
|
||||
c1001,-34.61178,-58.41731
|
||||
8001,-33.9249,18.4241
|
||||
2176,-33.87706,150.87529
|
||||
|
||||
|
Can't render this file because it is too large.
|
@ -276,6 +276,7 @@ def pre_processing_m1(data_df: pd.DataFrame) -> pd.DataFrame:
|
||||
"true_ip_state_confidence": float,
|
||||
"IP_Negative_History": int,
|
||||
"fuzzy_device_worst_score": float,
|
||||
"digital_id_confidence_rating" : str,
|
||||
"day_of_week_sin": float,
|
||||
"riskrating": str,
|
||||
"payfrequency": str,
|
||||
|
||||
@ -47,12 +47,21 @@ def _load_category_orders_cached(path: Path):
|
||||
return _load_category_orders(path)
|
||||
|
||||
|
||||
def _prepare(df: pd.DataFrame, category_orders: dict) -> pd.DataFrame:
|
||||
def _prepare_m1(df: pd.DataFrame, category_orders: dict) -> pd.DataFrame:
    """Return a copy of ``df`` with the configured columns made categorical.

    For every ``column -> categories`` entry in ``category_orders``:

    * a column absent from ``df`` is created and filled with NaN;
    * placeholder values (``None``, ``""``, ``"null"``, NaN, ``"nan"``,
      ``" "``) are replaced with NaN;
    * the column is converted to an ordered ``pd.Categorical`` using the
      given categories, so any value outside that list also becomes NaN.

    Args:
        df: Input frame. It is copied first and never modified.
        category_orders: Mapping of column name to its ordered category list.

    Returns:
        A new DataFrame with the listed columns as ordered categoricals.
    """
    placeholders = [None, "", "null", np.nan, "nan", " "]

    df = df.copy()
    for col, categories in category_orders.items():
        if col not in df.columns:
            df[col] = np.nan
        # Assign the result back instead of calling replace(inplace=True) on
        # the column selection: with pandas Copy-on-Write the in-place form
        # only touches a temporary Series and leaves ``df`` unchanged.
        df[col] = df[col].replace(placeholders, np.nan)
        df[col] = pd.Categorical(df[col], categories=categories, ordered=True)
    return df
|
||||
|
||||
|
||||
def _prepare_m2(df: pd.DataFrame, category_orders: dict) -> pd.DataFrame:
|
||||
df = df.copy()
|
||||
for col, categories in category_orders.items():
|
||||
if col not in df.columns:
|
||||
df[col] = np.nan
|
||||
df[col] = df[col].astype(str).str.lower()
|
||||
df[col].replace([None, "", "null", np.nan, "nan", " "], np.nan, inplace=True)
|
||||
df[col] = pd.Categorical(df[col], categories=categories, ordered=True)
|
||||
@ -67,7 +76,7 @@ def processing_m1(input_data: pd.DataFrame) -> pd.DataFrame:
|
||||
model = _load_m1_model()
|
||||
df.rename(columns={"riskrating": "RiskRating", "trueipgeo": "TrueIpGeo"}, inplace=True)
|
||||
category_orders = _load_category_orders_cached(M1_CATEGORY_ORDERS_PATH)
|
||||
df = _prepare(df, category_orders)
|
||||
df = _prepare_m1(df, category_orders)
|
||||
|
||||
expected_features = model.feature_names
|
||||
dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=np.nan)
|
||||
@ -83,7 +92,7 @@ def processing_m2(input_data: pd.DataFrame) -> pd.DataFrame:
|
||||
|
||||
model = _load_m2_model()
|
||||
category_orders = _load_category_orders_cached(M2_CATEGORY_ORDERS_PATH)
|
||||
df = _prepare(df, category_orders)
|
||||
df = _prepare_m2(df, category_orders)
|
||||
|
||||
expected_features = model.feature_names
|
||||
for feature in expected_features:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user