blocks-transformer/pre_processing.py

import pandas as pd
import json
import jmespath
import logging

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
)
logger = logging.getLogger(__name__)

def pre_processing(input_data):

    # combined_df = pd.DataFrame([input_data])
    combined_df = input_data
    combined_df["app_age"] = combined_df.apply(
        lambda row: pd.to_datetime(row["application_timestamp"]).year - pd.to_datetime(row["application_date_of_birth"]).year
        if pd.notnull(row["application_timestamp"]) and pd.notnull(row["application_date_of_birth"]) else None,
        axis=1
    )

    # for col in ["requestid", "inputipaddress", "deviceid", "fuzzydeviceid", "trueip", "dnsip"]:
    #     combined_df[f"{col}_consistency"] = combined_df.groupby("application_key")[col].transform("nunique")

    for col in ["requestid", "inputipaddress", "deviceid", "fuzzydeviceid", "trueip", "dnsip"]:
        combined_df[f"{col}_consistency"] = combined_df[col].apply(
            lambda x: 1 if pd.notnull(x) and str(x).lower() not in ("nan", "none", None) else 0
        )

    combined_df.rename(columns={'inputipaddress_consistency': 'inputip_consistency'}, inplace=True)
    combined_df.rename(columns={'requestid_consistency': 'request_consistency'}, inplace=True)

    combined_df['digitalidconfidence'] = pd.to_numeric(combined_df['digitalidconfidence'], errors='coerce').astype('Int64')

    for col in ["digitalidconfidence"]:
        combined_df[f"avg_{col}"] = combined_df.groupby("application_key")[col].transform("mean")
        combined_df[f"min_{col}"] = combined_df.groupby("application_key")[col].transform("min")
        combined_df[f"max_{col}"] = combined_df.groupby("application_key")[col].transform("max")

    combined_df['Level_1_Link_Accept'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Level_1_Link_Accept', na=False, regex=True).astype(int)
    combined_df['Identity_Negative_History_Max'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Identity_Negative_History', na=False, regex=True).astype(int)
    combined_df['Level_1_Link_Accept_Max'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Level_1_Link_Accept', na=False, regex=True).astype(int)
    combined_df['Device_Negative_History_Max'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Device_Negative_History', na=False, regex=True).astype(int)
    combined_df['Level_1_Link_Reject_Max'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Level_1_Link_Reject', na=False, regex=True).astype(int)
    combined_df['IP_Negative_History_Max'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('IP_Negative_History', na=False, regex=True).astype(int)
    combined_df['Identity_Spoofing_Max'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Identity_Spoofing', na=False, regex=True).astype(int)
    combined_df['Bot_Max'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Bot', na=False, regex=True).astype(int)

    def map_fraud_risk(risk):
        risk = str(risk).lower()
        if "very low" in risk:
            return "Very Low"
        elif "low" in risk:
            return "Low"
        elif "moderate" in risk:
            return "Moderate"
        elif "review" in risk:
            return "Review"
        elif "very high" in risk:
            return "Very High"
        else:
            return None

    combined_df["fraud_risk"] = combined_df["fraud_risk"].apply(map_fraud_risk)

    # combined_df.replace({'nan': None, 'None': None}, inplace=True)
    combined_df.replace({'nan': None, 'None': None}, inplace=True)

    dtype_dict = {
        'app_age': 'int64', 'first_seen_days': 'int64', 'request_consistency': 'int64',
        'application_source_name': str, 'fuzzydeviceid_consistency': 'int64',
        'domain_creation_days': 'int64', 'employmentstatus': str, 'Identity_Spoofing_Max': 'int64',
        'trueip_consistency': 'int64', 'inputip_consistency': 'int64', 'ea_score': 'int64',
        'lengthatbank': float, 'lengthatjob': float, 'max_digitalidconfidence': float,
        'Identity_Negative_History_Max': 'int64', 'digitalidconfidence': 'int64',
        'IP_Negative_History_Max': 'int64', 'Device_Negative_History_Max': 'int64',
        'Bot_Max': 'int64', 'avg_digitalidconfidence': float, 'min_digitalidconfidence': float,
        'Level_1_Link_Reject_Max': 'int64', 'dnsip_consistency': 'int64', 'ip_country_confidence': 'int64',
        'riskrating': str, 'ownhome': str, 'deviceid_consistency': 'int64',
        'payfrequency': str, 'fraud_risk': str, 'Level_1_Link_Accept': 'int64',
        'ip_net_speed_cell': str, 'ip_region_confidence': 'int64', 'Level_1_Link_Accept_Max': 'int64'
    }

    output_columns = list(dtype_dict.keys())
    filtered_df = combined_df[output_columns]

    int_columns = [col for col, dtype in dtype_dict.items() if dtype == int]

    for col in int_columns:
        filtered_df[col] = pd.to_numeric(filtered_df[col], errors='coerce')

    filtered_df = filtered_df.astype(dtype_dict, errors='ignore')

    return filtered_df.to_dict(orient="records")