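"""Feature pre-processing for a fraud-risk scoring pipeline.

Given application records, this module derives applicant age, presence flags
for device/IP identifiers, per-application digital-ID confidence aggregates,
indicator columns from tmxsummaryreasoncode, and a normalised fraud_risk
label, then casts everything to the fixed output schema in dtype_dict.
"""
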
import json
import logging

import jmespath
import pandas as pd

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
)
logger = logging.getLogger(__name__)

def pre_processing(input_data):
    # input_data is expected to be a DataFrame; a single-record dict would
    # first need wrapping: combined_df = pd.DataFrame([input_data])
    combined_df = input_data

    # Approximate applicant age as a calendar-year difference (this does not
    # account for whether the birthday has already passed in the application year).
    combined_df["app_age"] = combined_df.apply(
        lambda row: pd.to_datetime(row["application_timestamp"]).year
        - pd.to_datetime(row["application_date_of_birth"]).year
        if pd.notnull(row["application_timestamp"]) and pd.notnull(row["application_date_of_birth"])
        else None,
        axis=1,
    )

    # A previous revision measured per-application cardinality instead:
    # combined_df[f"{col}_consistency"] = combined_df.groupby("application_key")[col].transform("nunique")
    # The current version flags whether each identifier is present at all.
    for col in ["requestid", "inputipaddress", "deviceid", "fuzzydeviceid", "trueip", "dnsip"]:
        combined_df[f"{col}_consistency"] = combined_df[col].apply(
            lambda x: 1 if pd.notnull(x) and str(x).lower() not in ("nan", "none") else 0
        )

    combined_df.rename(
        columns={
            "inputipaddress_consistency": "inputip_consistency",
            "requestid_consistency": "request_consistency",
        },
        inplace=True,
    )

    # Nullable Int64 keeps missing confidence values intact through aggregation.
    combined_df["digitalidconfidence"] = pd.to_numeric(
        combined_df["digitalidconfidence"], errors="coerce"
    ).astype("Int64")

    # Per-application aggregates (loop form so further columns can be added).
    for col in ["digitalidconfidence"]:
        grouped = combined_df.groupby("application_key")[col]
        combined_df[f"avg_{col}"] = grouped.transform("mean")
        combined_df[f"min_{col}"] = grouped.transform("min")
        combined_df[f"max_{col}"] = grouped.transform("max")

    # Indicator columns derived from tmxsummaryreasoncode. The patterns are
    # plain substrings, so regex matching is unnecessary. Level_1_Link_Accept
    # and Level_1_Link_Accept_Max carry the same flag; both names appear in
    # the output schema below.
    reason_flags = {
        "Level_1_Link_Accept": "Level_1_Link_Accept",
        "Identity_Negative_History_Max": "Identity_Negative_History",
        "Level_1_Link_Accept_Max": "Level_1_Link_Accept",
        "Device_Negative_History_Max": "Device_Negative_History",
        "Level_1_Link_Reject_Max": "Level_1_Link_Reject",
        "IP_Negative_History_Max": "IP_Negative_History",
        "Identity_Spoofing_Max": "Identity_Spoofing",
        "Bot_Max": "Bot",
    }
    reason_codes = combined_df["tmxsummaryreasoncode"].astype(str)
    for out_col, pattern in reason_flags.items():
        combined_df[out_col] = reason_codes.str.contains(pattern, na=False, regex=False).astype(int)

    def map_fraud_risk(risk):
        # "very low" must be checked before "low"; anything unmatched
        # (including values that only say "high") maps to None.
        risk = str(risk).lower()
        if "very low" in risk:
            return "Very Low"
        elif "low" in risk:
            return "Low"
        elif "moderate" in risk:
            return "Moderate"
        elif "review" in risk:
            return "Review"
        elif "very high" in risk:
            return "Very High"
        else:
            return None

    combined_df["fraud_risk"] = combined_df["fraud_risk"].apply(map_fraud_risk)

    # Normalise string placeholders back to real nulls.
    combined_df.replace({"nan": None, "None": None}, inplace=True)

    dtype_dict = {
        "app_age": "int64", "first_seen_days": "int64", "request_consistency": "int64",
        "application_source_name": str, "fuzzydeviceid_consistency": "int64",
        "domain_creation_days": "int64", "employmentstatus": str, "Identity_Spoofing_Max": "int64",
        "trueip_consistency": "int64", "inputip_consistency": "int64", "ea_score": "int64",
        "lengthatbank": float, "lengthatjob": float, "max_digitalidconfidence": float,
        "Identity_Negative_History_Max": "int64", "digitalidconfidence": "int64",
        "IP_Negative_History_Max": "int64", "Device_Negative_History_Max": "int64",
        "Bot_Max": "int64", "avg_digitalidconfidence": float, "min_digitalidconfidence": float,
        "Level_1_Link_Reject_Max": "int64", "dnsip_consistency": "int64", "ip_country_confidence": "int64",
        "riskrating": str, "ownhome": str, "deviceid_consistency": "int64",
        "payfrequency": str, "fraud_risk": str, "Level_1_Link_Accept": "int64",
        "ip_net_speed_cell": str, "ip_region_confidence": "int64", "Level_1_Link_Accept_Max": "int64",
    }

    output_columns = list(dtype_dict.keys())
    # .copy() so the assignments below do not write into a view of combined_df.
    filtered_df = combined_df[output_columns].copy()

    # dtype_dict stores integer dtypes as the string "int64", so compare
    # against that string when selecting columns to coerce.
    int_columns = [col for col, dtype in dtype_dict.items() if dtype == "int64"]

    for col in int_columns:
        filtered_df[col] = pd.to_numeric(filtered_df[col], errors="coerce")

    # errors="ignore" leaves a column's dtype unchanged when the cast fails
    # (e.g. nulls remaining in an int64 column).
    filtered_df = filtered_df.astype(dtype_dict, errors="ignore")

    return filtered_df.to_dict(orient="records")
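
# Minimal usage sketch, assuming a single-row DataFrame shaped like the
# upstream payload. Every value below is illustrative, not real data; the
# column set is just what pre_processing itself references.
if __name__ == "__main__":
    sample = {
        "application_timestamp": "2024-05-01T10:00:00",
        "application_date_of_birth": "1990-03-15",
        "application_key": "APP-1",
        "requestid": "r-001", "inputipaddress": "203.0.113.7",
        "deviceid": "dev-1", "fuzzydeviceid": "fuzzy-1",
        "trueip": "203.0.113.7", "dnsip": "198.51.100.2",
        "digitalidconfidence": "87",
        "tmxsummaryreasoncode": "Level_1_Link_Accept,Bot",
        "fraud_risk": "low risk",
        "first_seen_days": 120, "application_source_name": "web",
        "domain_creation_days": 3650, "employmentstatus": "employed",
        "ea_score": 500, "lengthatbank": 24.0, "lengthatjob": 36.0,
        "riskrating": "B", "ownhome": "Y", "payfrequency": "biweekly",
        "ip_net_speed_cell": "broadband",
        "ip_region_confidence": 90, "ip_country_confidence": 99,
    }
    records = pre_processing(pd.DataFrame([sample]))
    logger.info("Processed %d record(s); first: %s", len(records), records[0])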