blocks-transformer/pre_processing.py

"""Pre-processing for the Early Term Default/Fraud indicator v1 block."""

import logging

import pandas as pd

# Configure module-level logging.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
)
logger = logging.getLogger(__name__)
def pre_processing(input_data):
    # input_data is expected to be a DataFrame (an earlier revision wrapped a
    # single record with pd.DataFrame([input_data])).
    combined_df = input_data

    # Coarse applicant age: calendar-year difference between the application
    # timestamp and the date of birth (month and day are ignored).
    combined_df["app_age"] = combined_df.apply(
        lambda row: pd.to_datetime(row["application_timestamp"]).year
        - pd.to_datetime(row["application_date_of_birth"]).year
        if pd.notnull(row["application_timestamp"]) and pd.notnull(row["application_date_of_birth"])
        else None,
        axis=1,
    )
    # Presence flags for identity/device fields: 1 when populated, 0 when
    # missing or a stringified null ("nan"/"None"). An earlier batch version
    # measured per-application uniqueness with
    # groupby("application_key")[col].transform("nunique") instead.
    for col in ["requestid", "inputipaddress", "deviceid", "fuzzydeviceid", "trueip", "dnsip"]:
        combined_df[f"{col}_consistency"] = combined_df[col].apply(
            lambda x: 1 if pd.notnull(x) and str(x).lower() not in ("nan", "none") else 0
        )
    combined_df.rename(columns={'inputipaddress_consistency': 'inputip_consistency'}, inplace=True)
    combined_df.rename(columns={'requestid_consistency': 'request_consistency'}, inplace=True)
    # Per-application aggregates of digital-ID confidence; for a single-row
    # application these all equal the raw value.
    combined_df['digitalidconfidence'] = pd.to_numeric(
        combined_df['digitalidconfidence'], errors='coerce'
    ).astype('Int64')
    grouped = combined_df.groupby("application_key")["digitalidconfidence"]
    combined_df["avg_digitalidconfidence"] = grouped.transform("mean")
    combined_df["min_digitalidconfidence"] = grouped.transform("min")
    combined_df["max_digitalidconfidence"] = grouped.transform("max")
    # Binary flags for TMX summary reason codes (literal substring matches;
    # none of the patterns contain regex metacharacters).
    tmx = combined_df['tmxsummaryreasoncode'].astype(str)
    reason_flags = {
        'Level_1_Link_Accept': 'Level_1_Link_Accept',
        'Identity_Negative_History_Max': 'Identity_Negative_History',
        'Level_1_Link_Accept_Max': 'Level_1_Link_Accept',
        'Device_Negative_History_Max': 'Device_Negative_History',
        'Level_1_Link_Reject_Max': 'Level_1_Link_Reject',
        'IP_Negative_History_Max': 'IP_Negative_History',
        'Identity_Spoofing_Max': 'Identity_Spoofing',
        'Bot_Max': 'Bot',
    }
    for flag_col, pattern in reason_flags.items():
        combined_df[flag_col] = tmx.str.contains(pattern, na=False, regex=False).astype(int)
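    # e.g. a tmxsummaryreasoncode of "Level_1_Link_Accept;Bot" sets
    # Level_1_Link_Accept, Level_1_Link_Accept_Max and Bot_Max to 1
    # and leaves the remaining flags at 0.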
    def map_fraud_risk(risk):
        # Normalize free-text risk labels into fixed buckets. Order matters:
        # "very low" must be tested before the bare "low" substring.
        risk = str(risk).lower()
        if "very low" in risk:
            return "Very Low"
        elif "low" in risk:
            return "Low"
        elif "moderate" in risk:
            return "Moderate"
        elif "review" in risk:
            return "Review"
        elif "very high" in risk:
            return "Very High"
        else:
            return None
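    # e.g. map_fraud_risk("VERY LOW risk") -> "Very Low", while an unmapped
    # label such as "high" falls through to None.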
combined_df["fraud_risk"] = combined_df["fraud_risk"].apply(map_fraud_risk)
# combined_df.replace({'nan': None, 'None': None}, inplace=True)
combined_df.replace({'nan': None, 'None': None}, inplace=True)
    # Expected output schema; the values feed the final astype cast below.
    dtype_dict = {
        'app_age': 'int64', 'first_seen_days': 'int64', 'request_consistency': 'int64',
        'application_source_name': str, 'fuzzydeviceid_consistency': 'int64',
        'domain_creation_days': 'int64', 'employmentstatus': str, 'Identity_Spoofing_Max': 'int64',
        'trueip_consistency': 'int64', 'inputip_consistency': 'int64', 'ea_score': 'int64',
        'lengthatbank': float, 'lengthatjob': float, 'max_digitalidconfidence': float,
        'Identity_Negative_History_Max': 'int64', 'digitalidconfidence': 'int64',
        'IP_Negative_History_Max': 'int64', 'Device_Negative_History_Max': 'int64',
        'Bot_Max': 'int64', 'avg_digitalidconfidence': float, 'min_digitalidconfidence': float,
        'Level_1_Link_Reject_Max': 'int64', 'dnsip_consistency': 'int64', 'ip_country_confidence': 'int64',
        'riskrating': str, 'ownhome': str, 'deviceid_consistency': 'int64',
        'payfrequency': str, 'fraud_risk': str, 'Level_1_Link_Accept': 'int64',
        'ip_net_speed_cell': str, 'ip_region_confidence': 'int64', 'Level_1_Link_Accept_Max': 'int64'
    }
    output_columns = list(dtype_dict.keys())
    filtered_df = combined_df[output_columns].copy()

    # Columns declared as 'int64' are coerced to numeric first so the final
    # astype cast can succeed.
    int_columns = [col for col, dtype in dtype_dict.items() if dtype == 'int64']
    for col in int_columns:
        filtered_df[col] = pd.to_numeric(filtered_df[col], errors='coerce')

    # errors='ignore' leaves a column unchanged when its cast fails
    # (e.g. an 'int64' column that still contains nulls).
    filtered_df = filtered_df.astype(dtype_dict, errors='ignore')
    return filtered_df.to_dict(orient="records")
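

if __name__ == "__main__":
    # Minimal smoke test, not part of the deployed block. The column names and
    # placeholder values below are assumptions inferred solely from the fields
    # pre_processing references above; the real upstream payload may differ.
    sample = pd.DataFrame([{
        "application_key": "APP-1",
        "application_timestamp": "2025-01-21T21:17:40Z",
        "application_date_of_birth": "1990-05-01",
        "requestid": "req-1",
        "inputipaddress": "203.0.113.7",
        "deviceid": "dev-1",
        "fuzzydeviceid": "fuzzy-1",
        "trueip": "203.0.113.7",
        "dnsip": "203.0.113.8",
        "digitalidconfidence": "87",
        "tmxsummaryreasoncode": "Level_1_Link_Accept;Device_Negative_History",
        "fraud_risk": "very low risk",
        "first_seen_days": 120,
        "application_source_name": "web",
        "domain_creation_days": 4000,
        "employmentstatus": "employed",
        "ea_score": 500,
        "lengthatbank": 24.0,
        "lengthatjob": 36.0,
        "riskrating": "low",
        "ownhome": "Y",
        "payfrequency": "biweekly",
        "ip_net_speed_cell": "broadband",
        "ip_region_confidence": 90,
        "ip_country_confidence": 99,
    }])
    records = pre_processing(sample)
    logger.info("pre-processed record: %s", records[0])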