Add digital_id_confidence_rating dtype, split prepare for M1/M2, clean lat/long CSV
parent d0f4d225ee
commit 67c2174ab3
@@ -1,5 +1,4 @@
 postal_code_ref,latitute_ref,longitude_ref
-,,
 c1001,-34.61178,-58.41731
 8001,-33.9249,18.4241
 2176,-33.87706,150.87529

Can't render this file because it is too large.
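The change above removes the empty ",," row from the postal-code reference CSV. As a hedged illustration only (the file name below is a placeholder, and the latitute_ref spelling is kept exactly as it appears in the header), the cleaned file could be loaded and guarded against any remaining blank rows like this:

import pandas as pd

# Placeholder file name; the real path is not shown on this page.
ref = pd.read_csv("postal_code_latlong_ref.csv", dtype={"postal_code_ref": str})

# Drop rows that are entirely empty, mirroring the ',,' row removed in this commit.
ref = ref.dropna(how="all")

print(ref.columns.tolist())  # ['postal_code_ref', 'latitute_ref', 'longitude_ref']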
@@ -276,6 +276,7 @@ def pre_processing_m1(data_df: pd.DataFrame) -> pd.DataFrame:
     "true_ip_state_confidence": float,
     "IP_Negative_History": int,
     "fuzzy_device_worst_score": float,
+    "digital_id_confidence_rating": str,
     "day_of_week_sin": float,
     "riskrating": str,
     "payfrequency": str,
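For context, the hunk above adds digital_id_confidence_rating to what appears to be a column-dtype map inside pre_processing_m1. A minimal sketch of how such a map is typically applied, assuming it is used to coerce incoming columns (the dtype_map name and the coerce_dtypes helper are illustrative, not the repo's actual code):

import pandas as pd

# Illustrative subset of the dtype map from pre_processing_m1.
dtype_map = {
    "true_ip_state_confidence": float,
    "IP_Negative_History": int,           # assumed to have no missing values
    "fuzzy_device_worst_score": float,
    "digital_id_confidence_rating": str,  # column added in this commit
    "riskrating": str,
}

def coerce_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    # Cast only the columns that are present; astype raises KeyError otherwise.
    present = {col: t for col, t in dtype_map.items() if col in df.columns}
    return df.astype(present)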
@@ -47,12 +47,21 @@ def _load_category_orders_cached(path: Path):
     return _load_category_orders(path)
 
 
-def _prepare(df: pd.DataFrame, category_orders: dict) -> pd.DataFrame:
+def _prepare_m1(df: pd.DataFrame, category_orders: dict) -> pd.DataFrame:
     df = df.copy()
     for col, categories in category_orders.items():
         if col not in df.columns:
             df[col] = np.nan
+        df[col].replace([None, "", "null", np.nan, "nan", " "], np.nan, inplace=True)
+        df[col] = pd.Categorical(df[col], categories=categories, ordered=True)
+    return df
+
+
+def _prepare_m2(df: pd.DataFrame, category_orders: dict) -> pd.DataFrame:
+    df = df.copy()
+    for col, categories in category_orders.items():
+        if col not in df.columns:
+            df[col] = np.nan
         df[col] = df[col].astype(str).str.lower()
         df[col].replace([None, "", "null", np.nan, "nan", " "], np.nan, inplace=True)
         df[col] = pd.Categorical(df[col], categories=categories, ordered=True)
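The split above means the two models prepare categorical columns differently: _prepare_m2 lowercases values before the ordered-categorical cast (so its category orders presumably need to be stored in lowercase), while _prepare_m1 casts the raw values directly. A self-contained toy illustration of that difference, using made-up data and a made-up category order:

import numpy as np
import pandas as pd

category_orders = {"riskrating": ["low", "medium", "high"]}
df = pd.DataFrame({"riskrating": ["Low", "HIGH", None]})

# M1-style: cast as-is; values whose casing does not match the category list become NaN.
m1 = df.copy()
m1["riskrating"] = pd.Categorical(m1["riskrating"], categories=category_orders["riskrating"], ordered=True)
print(m1["riskrating"].tolist())  # [nan, nan, nan]

# M2-style: lowercase first, clean the string artifacts, then cast.
m2 = df.copy()
m2["riskrating"] = m2["riskrating"].astype(str).str.lower()
m2["riskrating"] = m2["riskrating"].replace(["none", "nan", ""], np.nan)
m2["riskrating"] = pd.Categorical(m2["riskrating"], categories=category_orders["riskrating"], ordered=True)
print(m2["riskrating"].tolist())  # ['low', 'high', nan]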
@@ -67,7 +76,7 @@ def processing_m1(input_data: pd.DataFrame) -> pd.DataFrame:
     model = _load_m1_model()
     df.rename(columns={"riskrating": "RiskRating", "trueipgeo": "TrueIpGeo"}, inplace=True)
     category_orders = _load_category_orders_cached(M1_CATEGORY_ORDERS_PATH)
-    df = _prepare(df, category_orders)
+    df = _prepare_m1(df, category_orders)
 
     expected_features = model.feature_names
     dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=np.nan)
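processing_m1 feeds the prepared ordered-categorical columns straight into xgb.DMatrix with enable_categorical=True and NaN as the missing marker. A standalone sketch of that construction; the data values are made up, and the column names are only borrowed from the dtype hunk above for flavor:

import numpy as np
import pandas as pd
import xgboost as xgb

df = pd.DataFrame({
    "fuzzy_device_worst_score": [0.2, np.nan, 0.9],
    "riskrating": pd.Categorical(["low", "high", None], categories=["low", "medium", "high"], ordered=True),
})

# Categorical dtypes are only accepted when enable_categorical=True (xgboost >= 1.6).
dmatrix = xgb.DMatrix(df, enable_categorical=True, missing=np.nan)
print(dmatrix.num_row(), dmatrix.num_col())  # 3 2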
@@ -83,7 +92,7 @@ def processing_m2(input_data: pd.DataFrame) -> pd.DataFrame:
 
     model = _load_m2_model()
     category_orders = _load_category_orders_cached(M2_CATEGORY_ORDERS_PATH)
-    df = _prepare(df, category_orders)
+    df = _prepare_m2(df, category_orders)
 
     expected_features = model.feature_names
     for feature in expected_features: