# Pre-processing for application fraud-model features: temporal feature
# engineering, reason-code flags, dtype casting, capping, and categorical
# unknown-value handling.
import pandas as pd
|
|
import numpy as np
|
|
import logging
|
|
|
|
# Module-wide logging configuration: timestamped records tagged with
# level and logger name.
_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(name)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)

# Module-level logger, named after this module per stdlib convention.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Static configuration for pre_processing
# ---------------------------------------------------------------------------

# Identifier / join-key columns forwarded to the next pipeline stage when
# present in the input; they are not model features themselves.
_PASSTHROUGH_COLS = [
    'application_key', 'application_timestamp', 'deviceid',
    'fuzzydeviceid', 'application_email_address',
]

# Substrings of `tmxsummaryreasoncode` that are turned into 0/1 flag columns.
_REASON_CODE_FLAGS = [
    'Identity_Negative_History',
    'Device_Negative_History',
    'Level_1_Link_Reject',
    'IP_Negative_History',
    'Identity_Spoofing',
]

# Target dtype per model feature; the keys double as the feature whitelist
# for the returned frame.  Entries mapped to the string 'int32' are
# deliberately left uncast (they come out of the datetime accessor already
# integral), matching the original casting loop which only handled
# int / float / str.
_DTYPE_DICT = {
    "applicant_age": int,
    "digitalidconfidence": float,
    "first_seen_days": float,
    "employmentstatus": str,
    "ea_score": float,
    "trueipgeo": str,
    "hour": int,
    "email_creation_days": float,
    "lengthatjob": float,
    "day_cos": float,
    "summary_risk_score": float,
    "digital_id_trust_score_rating": str,
    "day": 'int32',
    "lengthatbank": float,
    "day_of_week_cos": float,
    "Level_1_Link_Reject": int,
    "Identity_Negative_History": int,
    "educationlevel": str,
    "os_version": str,
    "account_email_worst_score": float,
    "true_ip_score": float,
    "ip_net_speed_cell": str,
    "account_email_score": float,
    "day_of_week": 'int32',
    "true_ip_worst_score": float,
    "proxy_ip_worst_score": float,
    "day_night": str,
    "proxy_ip_score": float,
    "monthsatresidence": float,
    "Device_Negative_History": int,
    "fuzzy_device_score": float,
    "day_sin": float,
    "ip_region_confidence": float,
    "true_ip_state_confidence": float,
    "IP_Negative_History": int,
    "fuzzy_device_worst_score": float,
    "digital_id_confidence_rating": str,
    "day_of_week_sin": float,
    "riskrating": str,
    "payfrequency": str,
    "ownhome": str,
    "Identity_Spoofing": int,
}

# (min, max) capping bounds per numeric feature — presumably derived from the
# training-data distribution; TODO confirm against training statistics.
_CAPPING_DICT = {
    "applicant_age": (18, 93),
    "digitalidconfidence": (0, 9017),
    "first_seen_days": (0, 10486),
    "ea_score": (1, 930),
    "hour": (0, 23),
    "email_creation_days": (2438, 9661),
    "lengthatjob": (1, 24),
    "day_cos": (-0.9948693234, 1),
    "summary_risk_score": (-100, 30),
    "day": (1, 31),
    "lengthatbank": (0, 25),
    "day_of_week_cos": (-0.9009688679, 1),
    "Level_1_Link_Reject": (0, 1),
    "Identity_Negative_History": (0, 1),
    "account_email_worst_score": (-52, 0),
    "true_ip_score": (-38, 49),
    "account_email_score": (-18, 9),
    "day_of_week": (0, 6),
    "true_ip_worst_score": (-100, 0),
    "proxy_ip_worst_score": (-100, 0),
    "proxy_ip_score": (-29, 60),
    "monthsatresidence": (0, 25),
    "Device_Negative_History": (0, 1),
    "fuzzy_device_score": (-29, 14),
    "day_sin": (-0.9987165072, 0.9987165072),
    "ip_region_confidence": (75, 99),
    "IP_Negative_History": (0, 1),
    "fuzzy_device_worst_score": (-100, 0),
    "day_of_week_sin": (-0.9749279122, 0.9749279122),
    "Identity_Spoofing": (0, 1),
}

# Per-column whitelists of known categorical levels plus the bucket that
# anything else collapses into (see _handle_unknowns).
_UNKNOWN_TREATMENTS = {
    "employmentstatus": {
        "valid_values": [
            "disability", "fixed income", "full time employed",
            "part time employment", "retired benefits", "self employed",
            "student", "unemployed", "welfare",
        ],
        "default_treatment": "other",
    },
    "trueipgeo": {
        "valid_values": ["US"],
        "default_treatment": "other",
    },
    "digital_id_trust_score_rating": {
        "valid_values": ["very_high", "high", "neutral", "low"],
        "default_treatment": "very_low",
    },
    "educationlevel": {
        "valid_values": [
            "associate's degree", "bachelor's degree", "doctorate",
            "high school", "master's degree",
        ],
        "default_treatment": "other",
    },
    "os_version": {
        "valid_values": [
            '18', '17', '16', '15', '14', '13', '12', '11', '10', '9', '8',
        ],
        "default_treatment": 'unknown',
    },
    "ip_net_speed_cell": {
        "valid_values": [
            "broadband", "cable", "dialup", "dsl", "fixed wireless", "mobile",
            "mobile wireless", "ocx", "satellite", "t1", "tx", "wireless",
            "xdsl",
        ],
        "default_treatment": "mobile",
    },
    "digital_id_confidence_rating": {
        "valid_values": ["high", "medium", "very_high"],
        "default_treatment": "very_low",
    },
    "riskrating": {
        "valid_values": ["low", "medium", "neutral", "trusted"],
        "default_treatment": "high",
    },
    "ownhome": {
        "valid_values": ["true", "false"],
        "default_treatment": np.nan,
    },
}

# Raw spellings mapped onto canonical pay-frequency labels.
_PAYFREQUENCY_MAP = {
    "biweekly": ["biweekly", "bi-weekly", "bi weekly", "bw"],
    "semimonthly": ["semi-monthly", "semimonthly"],
}


def _classify_day_night(hour):
    """Return 'Day' for hours in [6, 18), 'Night' otherwise."""
    return 'Day' if 6 <= hour < 18 else 'Night'


def _major_os_version(value):
    """Reduce a version string like '14.2' or '14_2' to its major part.

    Non-strings and strings without a separator pass through unchanged.
    """
    if isinstance(value, str):
        if '.' in value:
            return value.split('.')[0]
        if '_' in value:
            return value.split('_')[0]
    return value


def _canonical_payfrequency(value):
    """Map a raw pay-frequency spelling to its canonical label, else NaN."""
    lowered = str(value).lower()
    for canonical, spellings in _PAYFREQUENCY_MAP.items():
        if lowered in spellings:
            return canonical
    return np.nan


def _handle_unknowns(X, column, known_values, default_treatment=None):
    """Lower-case `column` and collapse out-of-vocabulary values in place.

    Known values (case-insensitive) are kept lower-cased; genuine missing
    markers (NaN / 'none' / 'nan') become NaN; everything else becomes
    `default_treatment`.  Returns `X` for chaining; a missing column is a
    no-op.
    """
    if column not in X.columns:
        return X
    known = {str(v).lower() for v in known_values}
    missing_tokens = {"none", "nan"}

    def _bucket(value):
        if pd.isna(value):
            return np.nan
        lowered = str(value).lower()
        if lowered in known:
            return lowered
        if lowered in missing_tokens:
            return np.nan
        return default_treatment

    X[column] = X[column].apply(_bucket)
    return X


def pre_processing(data_df):
    """Engineer model features from a raw application DataFrame.

    Pipeline: derive applicant age and cyclical temporal features from
    `application_timestamp`, flag risk reason codes found in
    `tmxsummaryreasoncode`, cast feature dtypes, cap numeric features to
    training-time bounds, collapse unknown categorical levels, and
    normalise pay frequency.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Raw application records. `application_timestamp` and
        `application_date_of_birth` are required; most other columns are
        optional and skipped when absent.

    Returns
    -------
    pandas.DataFrame
        Feature columns (in `_DTYPE_DICT` order) followed by whichever
        passthrough identifier columns are present.  The input frame is
        not modified.
    """
    # Work on a copy so the caller's DataFrame is never mutated
    # (the original wrote dozens of columns into the caller's object).
    combined_df = data_df.copy()

    # Normalise the mixed-case column name BEFORE it is referenced below.
    # The original renamed only after using 'digitalidconfidence', so
    # camel-case inputs raised KeyError before the rename could help.
    combined_df.rename(columns={'DigitalIdConfidence': 'digitalidconfidence'},
                       inplace=True)

    # --- Temporal features -------------------------------------------------
    combined_df['application_timestamp'] = pd.to_datetime(
        combined_df['application_timestamp'], errors='coerce')
    ts = combined_df['application_timestamp']

    # Age as a plain calendar-year difference (birthdays deliberately
    # ignored, matching the original definition).  Vectorised replacement
    # for the original row-wise apply; unparseable rows yield NaN instead
    # of raising.
    dob = pd.to_datetime(combined_df['application_date_of_birth'],
                         errors='coerce')
    combined_df['applicant_age'] = ts.dt.year - dob.dt.year

    combined_df['day'] = ts.dt.day
    combined_df['day_of_week'] = ts.dt.weekday  # 0=Monday, 6=Sunday
    combined_df['hour'] = ts.dt.hour

    # Cyclical encodings keep boundary values (e.g. day 31 vs day 1) close
    # together in feature space.
    combined_df['day_sin'] = np.sin(2 * np.pi * combined_df['day'] / 31)
    combined_df['day_cos'] = np.cos(2 * np.pi * combined_df['day'] / 31)
    combined_df['day_of_week_sin'] = np.sin(2 * np.pi * combined_df['day_of_week'] / 7)
    combined_df['day_of_week_cos'] = np.cos(2 * np.pi * combined_df['day_of_week'] / 7)

    combined_df['day_night'] = combined_df['hour'].apply(
        lambda h: _classify_day_night(h) if pd.notnull(h) else 'Unknown')

    # --- Categorical clean-up / flags --------------------------------------
    if 'os_version' in combined_df.columns:
        combined_df['os_version'] = combined_df['os_version'].apply(_major_os_version)

    if 'tmxsummaryreasoncode' in combined_df.columns:
        reason_codes = combined_df['tmxsummaryreasoncode'].astype(str)
        for flag in _REASON_CODE_FLAGS:
            combined_df[flag] = reason_codes.str.contains(
                flag, na=False, regex=True).astype(int)

    if 'digitalidconfidence' in combined_df.columns:
        combined_df['digitalidconfidence'] = pd.to_numeric(
            combined_df['digitalidconfidence'], errors='coerce').astype('Int64')

    # --- Dtype casting ------------------------------------------------------
    for col, dtype in _DTYPE_DICT.items():
        if col not in combined_df.columns:
            continue
        if dtype is int:
            combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce',
                                             downcast='integer')
        elif dtype is float:
            combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce',
                                             downcast='float')
        elif dtype is str:
            combined_df[col] = combined_df[col].astype(str)
        # 'int32' entries fall through on purpose (see _DTYPE_DICT note).

    # --- Capping ------------------------------------------------------------
    for col, (cap_min, cap_max) in _CAPPING_DICT.items():
        if col in combined_df.columns:
            combined_df[col] = combined_df[col].clip(lower=cap_min, upper=cap_max)

    # --- Unknown-level collapsing -------------------------------------------
    for col, treatment in _UNKNOWN_TREATMENTS.items():
        combined_df = _handle_unknowns(combined_df, col,
                                       treatment["valid_values"],
                                       treatment["default_treatment"])

    # Guarded, unlike the original, so a frame without `payfrequency` no
    # longer raises KeyError.
    if 'payfrequency' in combined_df.columns:
        combined_df['payfrequency'] = combined_df['payfrequency'].apply(
            _canonical_payfrequency)

    # Deterministic column order — the original used list(set(...)), which is
    # unordered — and tolerant of absent passthrough columns (the original
    # raised KeyError when e.g. 'deviceid' was missing).
    feature_cols = [c for c in _DTYPE_DICT if c in combined_df.columns]
    passthrough = [c for c in _PASSTHROUGH_COLS
                   if c in combined_df.columns and c not in feature_cols]
    return combined_df[feature_cols + passthrough]
|
|
|
|
|
|
|
|
|
|
|
|
|