# blocks-transformer/pre_processing.py
import logging
import math
import re
from pathlib import Path
from typing import Dict, Iterable, List, Tuple, Union
import numpy as np
import pandas as pd
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
)
logger = logging.getLogger(__name__)
BASE_DIR = Path(__file__).resolve().parent
M2_LATLONG_REF_PATH = BASE_DIR / "latitute_longitute_reference.csv"  # (sic) spelling matches the reference CSV's actual filename and its column names below
THX_FIELDS = [
"application_key",
"application_timestamp",
"digital_id_first_seen",
"summary_risk_score",
"cpu_clock",
"account_login_first_seen",
"account_telephone_first_seen",
"true_ip_first_seen",
"ssn_hash_first_seen",
"account_email_attributes",
"tps_ip_latitude",
"tps_ip_longitude",
]
# Hardcoded M2 data dictionary (replaces file lookup)
M2_DATA_DICTIONARY: Dict[str, Dict[str, Union[float, str, None]]] = {
"account_email_attributes_challenge_passed": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"account_email_attributes_challenged": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"account_email_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"account_login_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"account_name_activities_trusted_prob": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"account_telephone_attributes_challenge_failed": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"account_telephone_attributes_loan_app": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"account_telephone_attributes_trusted": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"account_telephone_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"accphone_gbl_velocity_hour": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"applicant_age": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"browser_spoof_score": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"day": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"day_cos": {"data_type": "float", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"day_sin": {"data_type": "float", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"di_autofill_count_login": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"digital_id_attributes_challenged": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"digital_id_attributes_trusted": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"digital_id_day_diff": {"data_type": "int", "valid_min": -999999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"digital_id_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"digital_id_month_diff": {"data_type": "int", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"digital_id_trust_score": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"digitalidconfidence": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"dist_dnsip_ref_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"dist_em_ip_ref_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"dist_inputip_ref_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"dist_proxyip_ref_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"dist_trueip_dnsip_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"dist_trueip_em_ip_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"dist_trueip_ref_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"domain_creation_days": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"emailtofullnameconfidence": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"emailtolastnameconfidence": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"first_seen_days": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"fraudrisk": {"data_type": "string", "valid_min": None, "valid_max": None, "observed_cap_min": None, "observed_cap_max": None},
"fuzzy_device_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"fuzzy_device_id_confidence": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"hour": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"input_ip_connection_type": {"data_type": "string", "valid_min": None, "valid_max": None, "observed_cap_min": None, "observed_cap_max": None},
"iptophoneconfidence": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"national_id_first_seen_age": {"data_type": "int", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"overalldigitalidentityscore": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"policy_score": {"data_type": "float", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"proxy_ip_first_seen_age": {"data_type": "int", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"proxy_ip_worst_score": {"data_type": "float", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"proxy_score": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"reasoncode_new_smartid_lt_1wk_global": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"riskrating": {"data_type": "string", "valid_min": None, "valid_max": None, "observed_cap_min": None, "observed_cap_max": None},
"ssn_hash_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"totalhits": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"true_ip_activities_trusted_prob": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"true_ip_attributes_trusted": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"true_ip_attributes_trusted_conf": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"true_ip_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"true_ip_worst_score": {"data_type": "float", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"uniquehits": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
}
# Hardcoded one-hot config (parsed_feature, model_var, contains)
M2_ONEHOT_CONFIG: List[Tuple[str, str, str]] = [
("reasoncode", "reasoncode_new_smartid_lt_1wk_global", "new_smartid_lt_1wk_global"),
("account_name_activities", "account_name_activities_trusted_prob", "trusted_prob"),
("account_email_attributes", "account_email_attributes_challenged", "challenged"),
("account_email_attributes", "account_email_attributes_challenge_passed", "challenge_passed"),
("true_ip_attributes", "true_ip_attributes_trusted", "trusted"),
("true_ip_attributes", "true_ip_attributes_trusted_conf", "trusted_conf"),
("digital_id_attributes", "digital_id_attributes_challenged", "challenged"),
("digital_id_attributes", "digital_id_attributes_trusted", "trusted"),
("account_telephone_attributes", "account_telephone_attributes_challenge_failed", "challenge_failed"),
("account_telephone_attributes", "account_telephone_attributes_loan_app", "loan_app"),
("account_telephone_attributes", "account_telephone_attributes_trusted", "trusted"),
("true_ip_activities", "true_ip_activities_trusted_prob", "trusted_prob"),
]
# ----------------------------
# Helpers
# ----------------------------
def _handle_unknowns(X: pd.DataFrame, column: str, known_values: Iterable[str], default_treatment=None):
    """Lowercase recognised values in `column`; map present-but-unknown values to
    `default_treatment`, and missing/null-like values to NaN."""
    if column not in X.columns:
        return X
    known_values = {str(val).lower() for val in known_values}
    invalid_values = {"none", "nan"}  # string forms of null-like inputs
    X[column] = X[column].apply(
        lambda x: str(x).lower()
        if pd.notna(x) and str(x).lower() in known_values
        else (default_treatment if pd.notna(x) and str(x).lower() not in invalid_values else np.nan)
    )
    return X
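# A quick sketch of the mapping above (values here are illustrative, not from
# any real config): with known_values={"low", "high"} and default_treatment="other",
#   "LOW"   -> "low"    (known, lowercased)
#   "weird" -> "other"  (present but unknown)
#   None, np.nan, "nan" -> np.nan  (missing / null-like)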
def _haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points; None if any
    coordinate is missing or cannot be coerced to float."""
    if None in (lat1, lon1, lat2, lon2):
        return None
try:
rlat1 = float(lat1) * math.pi / 180.0
rlat2 = float(lat2) * math.pi / 180.0
dlat = (float(lat2) - float(lat1)) * math.pi / 180.0
dlon = (float(lon2) - float(lon1)) * math.pi / 180.0
except Exception:
return None
a = (
math.sin(dlat / 2.0) ** 2
+ math.cos(rlat1) * math.cos(rlat2) * math.sin(dlon / 2.0) ** 2
)
a = min(1.0, max(0.0, a))
return 2 * 6371.0088 * math.asin(math.sqrt(a))
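# Sanity check for the haversine above, using the textbook New York -> Los
# Angeles pair (coordinates are illustrative):
#   _haversine_km(40.7128, -74.0060, 34.0522, -118.2437)
# should come out near 3936 km on Earth's mean radius of 6371.0088 km.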
def _prep_latlong_ref():
if not M2_LATLONG_REF_PATH.exists():
logger.warning("latitute_longitute_reference.csv missing at %s", M2_LATLONG_REF_PATH)
return pd.DataFrame()
try:
ref = pd.read_csv(M2_LATLONG_REF_PATH, usecols=["postal_code_ref", "latitute_ref", "longitude_ref"])
except Exception:
ref = pd.read_csv(M2_LATLONG_REF_PATH)
# keep lower string version for matching
if "postal_code_ref" in ref.columns:
ref["postal_code_ref"] = ref["postal_code_ref"].astype(str).str.lower()
return ref
def _normalize_zip_for_ref(zip_val):
"""
Normalize zip/postal code values so they match reference CSV keys.
- Floats like 89503.0 -> "89503"
- Int-like strings "89503.0" -> "89503"
Note: we intentionally avoid zero-filling to preserve behaviour seen in UAT references
where leading-zero ZIPs are not matched to the reference table.
"""
if pd.isna(zip_val):
return None
if isinstance(zip_val, (int, float)) and not isinstance(zip_val, bool):
return str(int(zip_val)).lower()
zip_str = str(zip_val).strip()
if zip_str.replace(".", "", 1).isdigit():
try:
return str(int(float(zip_str))).lower()
except Exception:
pass
return zip_str.lower() if zip_str else None
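# Illustrative inputs for the normalization above:
#   _normalize_zip_for_ref(89503.0)   -> "89503"
#   _normalize_zip_for_ref("89503.0") -> "89503"
#   _normalize_zip_for_ref("01234")   -> "1234"  (leading zero dropped, so it
#       intentionally will not match a zero-padded reference key)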
# ----------------------------
# M1 Pre-processing (existing behaviour)
# ----------------------------
def pre_processing_m1(data_df: pd.DataFrame) -> pd.DataFrame:
combined_df = data_df.copy()
combined_df["applicant_age"] = combined_df.apply(
lambda row: pd.to_datetime(row["application_timestamp"]).year - pd.to_datetime(row["application_date_of_birth"]).year
if pd.notnull(row["application_timestamp"]) and pd.notnull(row["application_date_of_birth"])
else None,
axis=1,
)
combined_df["application_timestamp"] = pd.to_datetime(combined_df["application_timestamp"])
combined_df.loc[:, "application_time"] = pd.to_datetime(combined_df["application_timestamp"]).dt.time
combined_df["day"] = combined_df["application_timestamp"].dt.day
combined_df["day_of_week"] = combined_df["application_timestamp"].dt.weekday
combined_df["day_sin"] = np.sin(2 * np.pi * combined_df["day"] / 31)
combined_df["day_cos"] = np.cos(2 * np.pi * combined_df["day"] / 31)
combined_df["day_of_week_sin"] = np.sin(2 * np.pi * combined_df["day_of_week"] / 7)
combined_df["day_of_week_cos"] = np.cos(2 * np.pi * combined_df["day_of_week"] / 7)
    def classify_day_night(hour):
        if 6 <= hour < 18:
            return "Day"
        return "Night"
combined_df["hour"] = combined_df["application_time"].apply(lambda x: x.hour if pd.notnull(x) else np.nan)
combined_df["day_night"] = combined_df["hour"].apply(lambda hour: classify_day_night(hour) if pd.notnull(hour) else "Unknown")
combined_df["os_version"] = combined_df["os_version"].apply(
lambda x: x.split(".")[0] if isinstance(x, str) and "." in x else x.split("_")[0] if isinstance(x, str) and "_" in x else x
)
combined_df["Identity_Negative_History"] = combined_df["tmxsummaryreasoncode"].astype(str).str.contains(
"Identity_Negative_History", na=False, regex=True
).astype(int)
combined_df["Device_Negative_History"] = combined_df["tmxsummaryreasoncode"].astype(str).str.contains(
"Device_Negative_History", na=False, regex=True
).astype(int)
combined_df["Level_1_Link_Reject"] = combined_df["tmxsummaryreasoncode"].astype(str).str.contains(
"Level_1_Link_Reject", na=False, regex=True
).astype(int)
combined_df["IP_Negative_History"] = combined_df["tmxsummaryreasoncode"].astype(str).str.contains(
"IP_Negative_History", na=False, regex=True
).astype(int)
combined_df["Identity_Spoofing"] = combined_df["tmxsummaryreasoncode"].astype(str).str.contains(
"Identity_Spoofing", na=False, regex=True
).astype(int)
combined_df["digitalidconfidence"] = pd.to_numeric(combined_df["digitalidconfidence"], errors="coerce").astype("Int64")
combined_df.rename(
columns={
"DigitalIdConfidence": "digitalidconfidence",
},
inplace=True,
)
2025-03-12 16:12:18 +00:00
dtype_dict = {
"applicant_age": int,
"digitalidconfidence": float,
"first_seen_days": float,
"employmentstatus": str,
"ea_score": float,
"trueipgeo": str,
"hour": int,
"email_creation_days": float,
"lengthatjob": float,
"day_cos": float,
"summary_risk_score": float,
"digital_id_trust_score_rating": str,
"day": "int32",
"lengthatbank": float,
"day_of_week_cos": float,
"Level_1_Link_Reject": int,
"Identity_Negative_History": int,
"educationlevel": str,
"os_version": str,
"account_email_worst_score": float,
"true_ip_score": float,
"ip_net_speed_cell": str,
"account_email_score": float,
"day_of_week": "int32",
"true_ip_worst_score": float,
"proxy_ip_worst_score": float,
"day_night": str,
"proxy_ip_score": float,
"monthsatresidence": float,
"Device_Negative_History": int,
"fuzzy_device_score": float,
"day_sin": float,
"ip_region_confidence": float,
"true_ip_state_confidence": float,
"IP_Negative_History": int,
"fuzzy_device_worst_score": float,
"digital_id_confidence_rating" : str,
2025-11-23 23:22:32 -05:00
"day_of_week_sin": float,
"riskrating": str,
"payfrequency": str,
"ownhome": str,
"Identity_Spoofing": int,
}
next_block_cols = ["application_key", "application_timestamp", "deviceid", "fuzzydeviceid", "application_email_address"]
cols_to_keep = [col for col in dtype_dict.keys() if col in combined_df.columns]
final_cols = list(set(next_block_cols).union(set(cols_to_keep)))
    for col, dtype in dtype_dict.items():
        if col in combined_df.columns:
            if dtype == int:
                combined_df[col] = pd.to_numeric(combined_df[col], errors="coerce", downcast="integer")
            elif dtype == float:
                combined_df[col] = pd.to_numeric(combined_df[col], errors="coerce", downcast="float")
            elif dtype == str:
                combined_df[col] = combined_df[col].astype(str)
capping_dict = {
"applicant_age": (18, 93),
"digitalidconfidence": (0, 9017),
"first_seen_days": (0, 10486),
"ea_score": (1, 930),
"hour": (0, 23),
"email_creation_days": (2438, 9661),
"lengthatjob": (1, 24),
"day_cos": (-0.9948693234, 1),
"summary_risk_score": (-100, 30),
"day": (1, 31),
"lengthatbank": (0, 25),
"day_of_week_cos": (-0.9009688679, 1),
"Level_1_Link_Reject": (0, 1),
"Identity_Negative_History": (0, 1),
"account_email_worst_score": (-52, 0),
"true_ip_score": (-38, 49),
"account_email_score": (-18, 9),
"day_of_week": (0, 6),
"true_ip_worst_score": (-100, 0),
"proxy_ip_worst_score": (-100, 0),
"proxy_ip_score": (-29, 60),
"monthsatresidence": (0, 25),
"Device_Negative_History": (0, 1),
"fuzzy_device_score": (-29, 14),
"day_sin": (-0.9987165072, 0.9987165072),
"ip_region_confidence": (75, 99),
"IP_Negative_History": (0, 1),
"fuzzy_device_worst_score": (-100, 0),
"day_of_week_sin": (-0.9749279122, 0.9749279122),
"Identity_Spoofing": (0, 1),
}
for column, (cap_min, cap_max) in capping_dict.items():
if column in combined_df.columns:
combined_df[column] = combined_df[column].clip(lower=cap_min, upper=cap_max)
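    # Example of the clipping above: an applicant_age of 105 becomes 93, and a
    # summary_risk_score of -250 becomes -100.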
    unknown_treatments = {
        "employmentstatus": {
            "valid_values": [
                "disability",
                "fixed income",
                "full time employed",
                "part time employment",
                "retired benefits",
                "self employed",
                "student",
                "unemployed",
                "welfare",
            ],
            "default_treatment": "other",
        },
        "trueipgeo": {"valid_values": ["US"], "default_treatment": "other"},
        "digital_id_trust_score_rating": {"valid_values": ["very_high", "high", "neutral", "low"], "default_treatment": "very_low"},
        "educationlevel": {
            "valid_values": ["associate's degree", "bachelor's degree", "doctorate", "high school", "master's degree"],
            "default_treatment": "other",
        },
        "os_version": {
            "valid_values": ["18", "17", "16", "15", "14", "13", "12", "11", "10", "9", "8"],
            "default_treatment": "unknown",
        },
        "ip_net_speed_cell": {
            "valid_values": [
                "broadband",
                "cable",
                "dialup",
                "dsl",
                "fixed wireless",
                "mobile",
                "mobile wireless",
                "ocx",
                "satellite",
                "t1",
                "tx",
                "wireless",
                "xdsl",
            ],
            "default_treatment": "mobile",
        },
        "digital_id_confidence_rating": {"valid_values": ["high", "medium", "very_high"], "default_treatment": "very_low"},
        "riskrating": {"valid_values": ["low", "medium", "neutral", "trusted"], "default_treatment": "high"},
        "ownhome": {"valid_values": ["true", "false"], "default_treatment": np.nan},
    }
    for column, treatment in unknown_treatments.items():
        combined_df = _handle_unknowns(combined_df, column, treatment["valid_values"], treatment["default_treatment"])

    payfrequency_map = {"biweekly": ["biweekly", "bi-weekly", "bi weekly", "bw"], "semimonthly": ["semi-monthly", "semimonthly"]}

    combined_df["payfrequency"] = combined_df["payfrequency"].apply(
        lambda x: next((key for key, values in payfrequency_map.items() if str(x).lower() in values), np.nan)
    )

    return combined_df[final_cols]
# ----------------------------
# M2 Pre-processing
# ----------------------------
def _apply_onehot_features(df: pd.DataFrame) -> pd.DataFrame:
    """Set each configured one-hot flag to 1 when the parsed feature value contains the target substring."""
    df = df.copy()

    def _norm(text: str) -> str:
        # Collapse anything that is not a lowercase letter or digit to spaces,
        # so e.g. "Trusted_Conf" and "trusted conf" compare equal.
        return re.sub(r"[^a-z0-9]+", " ", text.lower())

    for parsed_feature, model_var, contains_val in M2_ONEHOT_CONFIG:
        value = df.get(parsed_feature, pd.Series([None])).iloc[0]
        flag = 0
        if isinstance(value, list):
            flag = int(any(contains_val in str(v).lower() or _norm(contains_val) in _norm(str(v)) for v in value))
        elif isinstance(value, str):
            flag = int(contains_val in value.lower() or _norm(contains_val) in _norm(value))
        df[model_var] = flag
    return df
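# Worked example for the flags above (the value is hypothetical): if the parsed
# "true_ip_attributes" field arrives as "Trusted|Trusted_Conf", the substring
# checks set both true_ip_attributes_trusted and true_ip_attributes_trusted_conf
# to 1; an absent field leaves every configured flag at 0.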
def _extract_first_seen_days(ts_value, app_ts):
    """Whole calendar days from `ts_value` to `app_ts`; None if either value fails to parse."""
    ts = pd.to_datetime(ts_value, errors="coerce", utc=True)
app = pd.to_datetime(app_ts, errors="coerce", utc=True)
# align to naive for subtraction
if isinstance(ts, pd.Timestamp) and ts.tzinfo is not None:
ts = ts.tz_localize(None)
if isinstance(app, pd.Timestamp) and app.tzinfo is not None:
app = app.tz_localize(None)
if pd.isna(ts) or pd.isna(app):
return None
return (app.normalize() - ts.normalize()).days
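# Example: _extract_first_seen_days("2025-01-10", "2025-01-15T08:00:00Z") -> 5.
# The result counts whole calendar days and is positive when the first-seen
# date precedes the application timestamp.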
def _to_naive_ts(val):
ts = pd.to_datetime(val, errors="coerce", utc=True)
if isinstance(ts, pd.Timestamp) and ts.tzinfo is not None:
ts = ts.tz_localize(None)
return ts
def _month_diff(earlier, later):
"""Month difference (earlier - later) using year/month buckets."""
ts_earlier = _to_naive_ts(earlier)
ts_later = _to_naive_ts(later)
if pd.isna(ts_earlier) or pd.isna(ts_later):
return None
return (ts_earlier.year - ts_later.year) * 12 + (ts_earlier.month - ts_later.month)
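# Example: _month_diff("2024-11-15", "2025-02-01") -> (2024 - 2025) * 12 + (11 - 2) = -3,
# i.e. the first argument falls three calendar months before the second.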
def pre_processing_m2(data_df: pd.DataFrame) -> pd.DataFrame:
df = data_df.copy()
df.columns = df.columns.str.lower()
# Timestamp-derived features (align with M1 behaviour to keep probabilities consistent)
df["application_timestamp"] = pd.to_datetime(df["application_timestamp"], errors="coerce", utc=True)
df["day"] = df["application_timestamp"].dt.day
df["hour"] = df["application_timestamp"].dt.hour
df["day_sin"] = np.sin(2 * np.pi * df["day"] / 31)
df["day_cos"] = np.cos(2 * np.pi * df["day"] / 31)
def _classify_day_night(hour_val):
if pd.isna(hour_val):
return np.nan
return "day" if 6 <= hour_val < 18 else "night"
df["day_night"] = df["hour"].apply(_classify_day_night)
# Apply onehot flags from attributes
df = _apply_onehot_features(df)
# Distances
lat_ref = _prep_latlong_ref()
if not lat_ref.empty and "zip" in df.columns:
zip_value = df["zip"].iloc[0]
zip_lookup = _normalize_zip_for_ref(zip_value)
ref_row = lat_ref[lat_ref["postal_code_ref"] == zip_lookup] if zip_lookup else pd.DataFrame()
lat_ref_val = ref_row["latitute_ref"].iloc[0] if not ref_row.empty else None
lon_ref_val = ref_row["longitude_ref"].iloc[0] if not ref_row.empty else None
else:
lat_ref_val = None
lon_ref_val = None
df["dist_inputip_ref_km"] = df.apply(
lambda r: _haversine_km(lat_ref_val, lon_ref_val, r.get("input_ip_latitude"), r.get("input_ip_longitude")), axis=1
)
df["dist_em_ip_ref_km"] = df.apply(
lambda r: _haversine_km(lat_ref_val, lon_ref_val, r.get("ip_latitude") or r.get("tps_ip_latitude"), r.get("ip_longitude") or r.get("tps_ip_longitude")),
axis=1,
)
df["dist_proxyip_ref_km"] = df.apply(
lambda r: _haversine_km(lat_ref_val, lon_ref_val, r.get("proxy_ip_latitude"), r.get("proxy_ip_longitude")), axis=1
)
df["dist_dnsip_ref_km"] = df.apply(
lambda r: _haversine_km(lat_ref_val, lon_ref_val, r.get("dns_ip_latitude"), r.get("dns_ip_longitude")), axis=1
)
df["dist_trueip_ref_km"] = df.apply(
lambda r: _haversine_km(lat_ref_val, lon_ref_val, r.get("true_ip_latitude"), r.get("true_ip_longitude")), axis=1
)
df["dist_trueip_em_ip_km"] = df.apply(
lambda r: _haversine_km(r.get("true_ip_latitude"), r.get("true_ip_longitude"), r.get("ip_latitude") or r.get("tps_ip_latitude"), r.get("ip_longitude") or r.get("tps_ip_longitude")),
axis=1,
)
df["dist_trueip_dnsip_km"] = df.apply(
lambda r: _haversine_km(r.get("true_ip_latitude"), r.get("true_ip_longitude"), r.get("dns_ip_latitude"), r.get("dns_ip_longitude")),
axis=1,
)
# Ages
app_ts_val = df.get("application_timestamp", pd.Series([None])).iloc[0]
    def _safe_day_diff(row):
        # Negated so the value is negative when the digital ID was first seen
        # before the application (consistent with the asymmetric valid range
        # for digital_id_day_diff in M2_DATA_DICTIONARY).
        if not row.get("digital_id_first_seen"):
            return None
        val = _extract_first_seen_days(row.get("digital_id_first_seen"), app_ts_val)
        return -val if val is not None else None
df["digital_id_day_diff"] = df.apply(_safe_day_diff, axis=1)
df["digital_id_month_diff"] = df.apply(lambda r: _month_diff(r.get("digital_id_first_seen"), app_ts_val), axis=1)
for col_name in [
"digital_id_first_seen",
"account_email_first_seen",
"account_login_first_seen",
"account_telephone_first_seen",
"true_ip_first_seen",
"ssn_hash_first_seen",
"fuzzy_device_first_seen",
"national_id_first_seen",
"proxy_ip_first_seen",
]:
out_col = f"{col_name}_age"
df[out_col] = df.apply(lambda r: _extract_first_seen_days(r.get(col_name), app_ts_val), axis=1)
    # Compute applicant_age for consistency when it is not already present
if "applicant_age" not in df.columns:
df["applicant_age"] = df.apply(
lambda row: pd.to_datetime(row["application_timestamp"]).year - pd.to_datetime(row["application_date_of_birth"]).year
if pd.notnull(row.get("application_timestamp")) and pd.notnull(row.get("application_date_of_birth"))
else None,
axis=1,
)
# Safe casting and capping using data dictionary
for var_name, rules in M2_DATA_DICTIONARY.items():
if var_name not in df.columns:
continue
col = pd.to_numeric(df[var_name], errors="coerce") if rules.get("data_type") in ["float", "int"] else df[var_name]
if rules.get("data_type") == "int":
col = col.astype("float")
valid_min = rules.get("valid_min")
valid_max = rules.get("valid_max")
observed_min = rules.get("observed_cap_min")
observed_max = rules.get("observed_cap_max")
if observed_min is not None or observed_max is not None:
col = col.clip(lower=observed_min, upper=observed_max)
        # Validity-range filtering is currently disabled; the bounds are kept
        # in M2_DATA_DICTIONARY for reference.
        # if valid_min is not None:
        #     col = col.where(col >= valid_min, np.nan)
        # if valid_max is not None:
        #     col = col.where(col <= valid_max, np.nan)
df[var_name] = col
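    # Example of the rules above: "applicant_age" is typed "int" with a valid
    # range of [0, 999]; since every observed cap in M2_DATA_DICTIONARY is
    # currently None, the clip is a no-op and values pass through after
    # numeric coercion.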
return df
def pre_processing_all(data_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
# Ensure requested THX fields exist so downstream packaging always has keys
df_base = data_df.copy()
for field in THX_FIELDS:
if field in df_base.columns:
df_base[field] = df_base[field].astype(str)
else:
df_base[field] = None
df_thx = df_base[THX_FIELDS].copy()
df_m1 = pre_processing_m1(df_base.copy())
df_m2 = pre_processing_m2(df_base.copy())
return df_m1, df_m2, df_thx
# Backwards compatible entry point (used by legacy code/tests if any)
def pre_processing(data_df: pd.DataFrame) -> pd.DataFrame:
df_m1, _, _ = pre_processing_all(data_df)
return df_m1
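

if __name__ == "__main__":
    # Minimal smoke test with a hypothetical single-row payload; real inputs
    # carry many more fields, and every value below is illustrative. Missing
    # THX fields are filled with None by pre_processing_all, and if the
    # reference CSV is absent, _prep_latlong_ref logs a warning and all
    # distance features come out as None.
    sample = pd.DataFrame(
        [
            {
                "application_key": "APP-0001",
                "application_timestamp": "2025-01-15T10:30:00Z",
                "application_date_of_birth": "1990-06-01",
                "application_email_address": "user@example.com",
                "deviceid": "dev-123",
                "fuzzydeviceid": "fuzzy-123",
                "os_version": "17.2.1",
                "tmxsummaryreasoncode": "Identity_Negative_History",
                "digitalidconfidence": "512",
                "payfrequency": "Bi-Weekly",
            }
        ]
    )
    df_m1_out, df_m2_out, df_thx_out = pre_processing_all(sample)
    logger.info("M1 columns: %s", sorted(df_m1_out.columns))
    logger.info("M2 day/hour: %s/%s", df_m2_out["day"].iloc[0], df_m2_out["hour"].iloc[0])
    logger.info("THX keys: %s", list(df_thx_out.columns))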