import logging

import pandas as pd

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
)
logger = logging.getLogger(__name__)


def pre_processing(input_data):
    """Derive model-ready features from raw application records.

    Expects a pandas DataFrame with one row per application event and
    returns a list of feature dicts (one per row).
    """
    # combined_df = pd.DataFrame([input_data])
    combined_df = input_data

    # Applicant age as a coarse difference of calendar years between the
    # application timestamp and the date of birth (does not adjust for
    # whether the birthday has already passed in the application year).
    combined_df["app_age"] = combined_df.apply(
        lambda row: (
            pd.to_datetime(row["application_timestamp"]).year
            - pd.to_datetime(row["application_date_of_birth"]).year
        )
        if pd.notnull(row["application_timestamp"])
        and pd.notnull(row["application_date_of_birth"])
        else None,
        axis=1,
    )

    # Presence flags: 1 if the identifier is populated, 0 if missing or a
    # stringified null. (An earlier per-application variant is kept below
    # for reference.)
    # for col in ["requestid", "inputipaddress", "deviceid", "fuzzydeviceid", "trueip", "dnsip"]:
    #     combined_df[f"{col}_consistency"] = (
    #         combined_df.groupby("application_key")[col].transform("nunique")
    #     )
    for col in ["requestid", "inputipaddress", "deviceid", "fuzzydeviceid", "trueip", "dnsip"]:
        combined_df[f"{col}_consistency"] = combined_df[col].apply(
            lambda x: 1 if pd.notnull(x) and str(x).lower() not in ("nan", "none") else 0
        )
    combined_df.rename(
        columns={
            "inputipaddress_consistency": "inputip_consistency",
            "requestid_consistency": "request_consistency",
        },
        inplace=True,
    )

    # Per-application aggregates of the digital-ID confidence score.
    combined_df["digitalidconfidence"] = pd.to_numeric(
        combined_df["digitalidconfidence"], errors="coerce"
    ).astype("Int64")
    for col in ["digitalidconfidence"]:
        grouped = combined_df.groupby("application_key")[col]
        combined_df[f"avg_{col}"] = grouped.transform("mean")
        combined_df[f"min_{col}"] = grouped.transform("min")
        combined_df[f"max_{col}"] = grouped.transform("max")

    # Binary indicators for substrings of the TMX summary reason codes.
    reason_code_flags = {
        "Level_1_Link_Accept": "Level_1_Link_Accept",
        "Identity_Negative_History_Max": "Identity_Negative_History",
        "Level_1_Link_Accept_Max": "Level_1_Link_Accept",
        "Device_Negative_History_Max": "Device_Negative_History",
        "Level_1_Link_Reject_Max": "Level_1_Link_Reject",
        "IP_Negative_History_Max": "IP_Negative_History",
        "Identity_Spoofing_Max": "Identity_Spoofing",
        "Bot_Max": "Bot",
    }
    reason_codes = combined_df["tmxsummaryreasoncode"].astype(str)
    for flag_col, pattern in reason_code_flags.items():
        combined_df[flag_col] = reason_codes.str.contains(
            pattern, na=False, regex=True
        ).astype(int)

    def map_fraud_risk(risk):
        # Normalize free-text risk labels to a fixed vocabulary. "very low"
        # must be checked before "low" because the latter is a substring.
        risk = str(risk).lower()
        if "very low" in risk:
            return "Very Low"
        elif "low" in risk:
            return "Low"
        elif "moderate" in risk:
            return "Moderate"
        elif "review" in risk:
            return "Review"
        elif "very high" in risk:
            return "Very High"
        else:
            return None

    combined_df["fraud_risk"] = combined_df["fraud_risk"].apply(map_fraud_risk)

    # Convert stringified nulls to real None values.
    combined_df.replace({"nan": None, "None": None}, inplace=True)

    dtype_dict = {
        "app_age": "int64",
        "first_seen_days": "int64",
        "request_consistency": "int64",
        "application_source_name": str,
        "fuzzydeviceid_consistency": "int64",
        "domain_creation_days": "int64",
        "employmentstatus": str,
        "Identity_Spoofing_Max": "int64",
        "trueip_consistency": "int64",
        "inputip_consistency": "int64",
        "ea_score": "int64",
        "lengthatbank": float,
        "lengthatjob": float,
        "max_digitalidconfidence": float,
        "Identity_Negative_History_Max": "int64",
        "digitalidconfidence": "int64",
        "IP_Negative_History_Max": "int64",
        "Device_Negative_History_Max": "int64",
        "Bot_Max": "int64",
        "avg_digitalidconfidence": float,
        "min_digitalidconfidence": float,
        "Level_1_Link_Reject_Max": "int64",
        "dnsip_consistency": "int64",
        "ip_country_confidence": "int64",
        "riskrating": str,
        "ownhome": str,
        "deviceid_consistency": "int64",
        "payfrequency": str,
        "fraud_risk": str,
        "Level_1_Link_Accept": "int64",
        "ip_net_speed_cell": str,
        "ip_region_confidence": "int64",
        "Level_1_Link_Accept_Max": "int64",
    }
    output_columns = list(dtype_dict.keys())
    # .copy() so assignments below write to an independent frame rather than
    # a view of combined_df.
    filtered_df = combined_df[output_columns].copy()

    # Coerce the integer-typed columns to numeric so the astype below does
    # not choke on leftover strings; non-parsable values become NaN.
    int_columns = [col for col, dtype in dtype_dict.items() if dtype == "int64"]
    for col in int_columns:
        filtered_df[col] = pd.to_numeric(filtered_df[col], errors="coerce")
    # errors="ignore" leaves a column at its current dtype when the cast
    # fails (e.g. an int64 column that still contains NaN).
    filtered_df = filtered_df.astype(dtype_dict, errors="ignore")
    return filtered_df.to_dict(orient="records")
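

if __name__ == "__main__":
    # Minimal smoke test, sketching how pre_processing might be invoked.
    # The single-row frame below is a hypothetical illustration of the
    # expected input schema: column names match those referenced above, but
    # all values are made up, and every pass-through column in dtype_dict
    # must be present for the column selection to succeed.
    sample = pd.DataFrame(
        [
            {
                "application_key": "app-1",
                "application_timestamp": "2024-03-01T10:00:00",
                "application_date_of_birth": "1990-05-17",
                "requestid": "req-1",
                "inputipaddress": "203.0.113.5",
                "deviceid": "dev-1",
                "fuzzydeviceid": "fuzzy-1",
                "trueip": "203.0.113.5",
                "dnsip": "203.0.113.9",
                "digitalidconfidence": "87",
                "tmxsummaryreasoncode": "Level_1_Link_Accept;Bot",
                "fraud_risk": "very low risk",
                # Pass-through columns required by dtype_dict (dummy values).
                "first_seen_days": 120,
                "application_source_name": "web",
                "domain_creation_days": 4000,
                "employmentstatus": "employed",
                "ea_score": 512,
                "lengthatbank": 3.5,
                "lengthatjob": 2.0,
                "ip_country_confidence": 95,
                "riskrating": "low",
                "ownhome": "Y",
                "payfrequency": "biweekly",
                "ip_net_speed_cell": "broadband",
                "ip_region_confidence": 80,
            }
        ]
    )
    records = pre_processing(sample)
    logger.info("Produced %d feature record(s)", len(records))
    logger.info("First record: %s", records[0])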