import logging

import numpy as np
import pandas as pd

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
)
logger = logging.getLogger(__name__)


def pre_processing(data_df):
    # Work on a copy so the caller's DataFrame is not mutated in place.
    combined_df = data_df.copy()

    # Applicant age as a plain year difference (approximate: it ignores
    # whether the birthday has already occurred in the application year).
    timestamps = pd.to_datetime(combined_df['application_timestamp'], errors='coerce')
    dob = pd.to_datetime(combined_df['application_date_of_birth'], errors='coerce')
    combined_df['applicant_age'] = timestamps.dt.year - dob.dt.year

    # Temporal features, with cyclical (sin/cos) encodings so that
    # day 31 -> day 1 and Sunday -> Monday wrap around smoothly.
    combined_df['application_timestamp'] = timestamps
    combined_df['application_time'] = timestamps.dt.time
    combined_df['day'] = timestamps.dt.day
    combined_df['day_of_week'] = timestamps.dt.weekday  # 0=Monday, 6=Sunday
    combined_df['day_sin'] = np.sin(2 * np.pi * combined_df['day'] / 31)
    combined_df['day_cos'] = np.cos(2 * np.pi * combined_df['day'] / 31)
    combined_df['day_of_week_sin'] = np.sin(2 * np.pi * combined_df['day_of_week'] / 7)
    combined_df['day_of_week_cos'] = np.cos(2 * np.pi * combined_df['day_of_week'] / 7)
    # combined_df['is_weekend'] = combined_df['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)

    # Create a day/night variable
    def classify_day_night(hour):
        return 'Day' if 6 <= hour < 18 else 'Night'

    combined_df['hour'] = timestamps.dt.hour
    combined_df['day_night'] = combined_df['hour'].apply(
        lambda hour: classify_day_night(hour) if pd.notnull(hour) else 'Unknown'
    )

    # Keep only the major OS version: '17.4.1' -> '17', '10_0' -> '10'.
    combined_df['os_version'] = combined_df['os_version'].apply(
        lambda x: x.split('.')[0] if isinstance(x, str) and '.' in x
        else x.split('_')[0] if isinstance(x, str) and '_' in x
        else x
    )
    # Binary indicator flags parsed out of the tmxsummaryreasoncode string.
    # ('Level_1_Link_Accept' and 'Bot' were considered but are currently unused.)
    reason_codes = combined_df['tmxsummaryreasoncode'].astype(str)
    for flag in [
        'Identity_Negative_History',
        'Device_Negative_History',
        'Level_1_Link_Reject',
        'IP_Negative_History',
        'Identity_Spoofing',
    ]:
        combined_df[flag] = reason_codes.str.contains(flag, na=False, regex=True).astype(int)

    # Rename columns to their canonical lower-case names *before* the numeric
    # conversion below, so the cast works whichever spelling the input uses.
    combined_df.rename(columns={
        'DigitalIdConfidence': 'digitalidconfidence',
        # 'inputipaddress_consistency': 'inputip_consistency',
        # 'requestid_consistency': 'request_consistency',
        # Add others as required if present in your DataFrame.
    }, inplace=True)

    combined_df['digitalidconfidence'] = pd.to_numeric(
        combined_df['digitalidconfidence'], errors='coerce'
    ).astype('Int64')

    dtype_dict = {
        'applicant_age': int,
        'digitalidconfidence': float,
        'first_seen_days': float,
        'employmentstatus': str,
        'ea_score': float,
        'trueipgeo': str,
        'hour': int,
        'email_creation_days': float,
        'lengthatjob': float,
        'day_cos': float,
        'summary_risk_score': float,
        'digital_id_trust_score_rating': str,
        'day': 'int32',
        'lengthatbank': float,
        'day_of_week_cos': float,
        'Level_1_Link_Reject': int,
        'Identity_Negative_History': int,
        'educationlevel': str,
        'os_version': str,
        'account_email_worst_score': float,
        'true_ip_score': float,
        'ip_net_speed_cell': str,
        'account_email_score': float,
        'day_of_week': 'int32',
        'true_ip_worst_score': float,
        'proxy_ip_worst_score': float,
        'day_night': str,
        'proxy_ip_score': float,
        'monthsatresidence': float,
        'Device_Negative_History': int,
        'fuzzy_device_score': float,
        'day_sin': float,
        'ip_region_confidence': float,
        'true_ip_state_confidence': float,
        'IP_Negative_History': int,
        'fuzzy_device_worst_score': float,
        'digital_id_confidence_rating': str,
        'day_of_week_sin': float,
        'riskrating': str,
        'payfrequency': str,
        'ownhome': str,
        'Identity_Spoofing': int,
    }

    next_block_cols = [
        'application_key', 'application_timestamp', 'deviceid',
        'fuzzydeviceid', 'application_email_address',
    ]
    cols_to_keep = [col for col in dtype_dict if col in combined_df.columns]
    # Build the output column list deterministically, and skip pass-through
    # columns that are absent so the final selection cannot raise KeyError.
    final_cols = [col for col in next_block_cols if col in combined_df.columns]
    final_cols += [col for col in cols_to_keep if col not in final_cols]

    # Type casting
    for col, dtype in dtype_dict.items():
        if col not in combined_df.columns:
            continue
        if dtype == int:
            combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce', downcast='integer')
        elif dtype == float:
            combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce', downcast='float')
        elif dtype == str:
            combined_df[col] = combined_df[col].astype(str)
        elif dtype == 'int32':
            # Nullable Int32 so rows with a missing timestamp (NaN day /
            # day_of_week) do not raise on the cast.
            combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce').astype('Int32')
    # Cap numeric features to fixed bounds so scoring-time outliers cannot
    # push values outside the expected range.
    capping_dict = {
        'applicant_age': (18, 93),
        'digitalidconfidence': (0, 9017),
        'first_seen_days': (0, 10486),
        'ea_score': (1, 930),
        'hour': (0, 23),
        'email_creation_days': (2438, 9661),
        'lengthatjob': (1, 24),
        'day_cos': (-0.9948693234, 1),
        'summary_risk_score': (-100, 30),
        'day': (1, 31),
        'lengthatbank': (0, 25),
        'day_of_week_cos': (-0.9009688679, 1),
        'Level_1_Link_Reject': (0, 1),
        'Identity_Negative_History': (0, 1),
        'account_email_worst_score': (-52, 0),
        'true_ip_score': (-38, 49),
        'account_email_score': (-18, 9),
        'day_of_week': (0, 6),
        'true_ip_worst_score': (-100, 0),
        'proxy_ip_worst_score': (-100, 0),
        'proxy_ip_score': (-29, 60),
        'monthsatresidence': (0, 25),
        'Device_Negative_History': (0, 1),
        'fuzzy_device_score': (-29, 14),
        'day_sin': (-0.9987165072, 0.9987165072),
        'ip_region_confidence': (75, 99),
        # 'true_ip_state_confidence': (5, 98),
        'IP_Negative_History': (0, 1),
        'fuzzy_device_worst_score': (-100, 0),
        'day_of_week_sin': (-0.9749279122, 0.9749279122),
        'Identity_Spoofing': (0, 1),
    }

    # Apply capping
    for column, (cap_min, cap_max) in capping_dict.items():
        if column in combined_df.columns:
            combined_df[column] = combined_df[column].clip(lower=cap_min, upper=cap_max)

    def handle_unknowns(X, column, known_values, default_treatment=None):
        """Lower-case `column`, keep values in `known_values`, map anything
        else to `default_treatment`, and map missing markers to NaN."""
        if column not in X.columns:
            return X  # Nothing to do; always return X so callers never get None.
        known_values = {str(val).lower() for val in known_values}
        # str(x).lower() always yields a string, so only the string forms of
        # the missing markers can ever match here.
        invalid_values = {'none', 'nan'}
        X[column] = X[column].apply(
            lambda x: str(x).lower()
            if pd.notna(x) and str(x).lower() in known_values
            else (
                default_treatment
                if pd.notna(x) and str(x).lower() not in invalid_values
                else np.nan
            )
        )
        return X

    unknown_treatments = {
        'employmentstatus': {
            'valid_values': [
                'disability', 'fixed income', 'full time employed',
                'part time employment', 'retired benefits', 'self employed',
                'student', 'unemployed', 'welfare',
            ],
            'default_treatment': 'other',
        },
        'trueipgeo': {
            'valid_values': ['US'],
            'default_treatment': 'other',
        },
        'digital_id_trust_score_rating': {
            'valid_values': ['very_high', 'high', 'neutral', 'low'],
            'default_treatment': 'very_low',
        },
        'educationlevel': {
            'valid_values': [
                "associate's degree", "bachelor's degree", 'doctorate',
                'high school', "master's degree",
            ],
            'default_treatment': 'other',
        },
        'os_version': {
            'valid_values': ['18', '17', '16', '15', '14', '13', '12', '11', '10', '9', '8'],
            'default_treatment': 'unknown',
        },
        'ip_net_speed_cell': {
            'valid_values': [
                'broadband', 'cable', 'dialup', 'dsl', 'fixed wireless',
                'mobile', 'mobile wireless', 'ocx', 'satellite', 't1',
                'tx', 'wireless', 'xdsl',
            ],
            'default_treatment': 'mobile',
        },
        'digital_id_confidence_rating': {
            'valid_values': ['high', 'medium', 'very_high'],
            'default_treatment': 'very_low',
        },
        'riskrating': {
            'valid_values': ['low', 'medium', 'neutral', 'trusted'],
            'default_treatment': 'high',
        },
        'ownhome': {
            'valid_values': ['true', 'false'],
            'default_treatment': np.nan,
        },
    }

    for column, treatment in unknown_treatments.items():
        combined_df = handle_unknowns(
            combined_df, column, treatment['valid_values'], treatment['default_treatment']
        )

    # Normalize pay-frequency spellings; anything outside the map
    # (including e.g. 'monthly') becomes NaN.
    payfrequency_map = {
        'biweekly': ['biweekly', 'bi-weekly', 'bi weekly', 'bw'],
        'semimonthly': ['semi-monthly', 'semimonthly'],
    }
    if 'payfrequency' in combined_df.columns:
        combined_df['payfrequency'] = combined_df['payfrequency'].apply(
            lambda x: next(
                (key for key, values in payfrequency_map.items() if str(x).lower() in values),
                np.nan,
            )
        )

    result = combined_df[final_cols]
    logger.info('pre_processing: %d rows, %d columns retained', len(result), result.shape[1])
    return result
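
# ---------------------------------------------------------------------------
# Minimal smoke-test sketch (illustrative only, not part of the pipeline).
# The one-row frame below covers just the columns pre_processing touches
# unconditionally; the real input schema is wider, and every value here is
# made up for demonstration.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    sample = pd.DataFrame([{
        'application_key': 'APP-0001',            # hypothetical identifier
        'application_timestamp': '2024-05-17 21:35:00',
        'application_date_of_birth': '1990-03-02',
        'os_version': '17.4.1',                   # -> major version '17'
        'tmxsummaryreasoncode': 'Identity_Negative_History,Level_1_Link_Reject',
        'digitalidconfidence': '8500',
        'payfrequency': 'Bi-Weekly',              # -> normalized to 'biweekly'
        'employmentstatus': 'Full Time Employed',
        'riskrating': 'low',
        'ownhome': 'TRUE',
    }])
    processed = pre_processing(sample)
    logger.info('Processed columns: %s', sorted(processed.columns))
    print(processed.T)  # transpose: one row reads better as column/value pairs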