Sync m-1-v-1 block with local updates
This commit is contained in:
parent 1bf55226e1
commit d0f4d225ee
block.py | 244
@@ -1,12 +1,12 @@
import pandas as pd
import logging
import json

import jmespath
import regex as re
from pre_processing import pre_processing
from processing import processing
from post_processing import post_processing
import json_repair
import pandas as pd
import regex as re
from pre_processing import pre_processing_all
from processing import processing_all
from post_processing import post_processing_all


# Configure logging
@@ -16,7 +16,7 @@ logging.basicConfig(
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_JSON_LIKE = re.compile(r'^\s*\?*[\{\[].*[\}\]]\s*$', re.DOTALL)
|
||||
_JSON_LIKE = re.compile(r"^\s*\?*[\{\[].*[\}\]]\s*$", re.DOTALL)
|
||||
|
||||
|
||||
def extract_value(blob, expression):
@@ -25,14 +25,13 @@ def extract_value(blob, expression):
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def coalesce(*args):
|
||||
for value in args:
|
||||
if value is not None:
|
||||
return value
|
||||
return None
|
||||
|
||||
# New sanitize blob function
|
||||
|
||||
|
||||
def deep_repair(obj):
|
||||
# 1) If it's a string that *looks* like JSON (with or without one leading '?'),
@@ -40,18 +39,17 @@ def deep_repair(obj):
if isinstance(obj, str):
|
||||
s = obj.strip()
|
||||
if _JSON_LIKE.match(s):
|
||||
# strip one leading '?' if present
|
||||
if s.startswith('?'):
|
||||
if s.startswith("?"):
|
||||
s = s[1:]
|
||||
parsed = json_repair.loads(s)
|
||||
return deep_repair(parsed)
|
||||
return obj
|
||||
|
||||
# 2) Dict → recurse on each value
|
||||
# 2) Dict – recurse on each value
|
||||
if isinstance(obj, dict):
|
||||
return {k: deep_repair(v) for k, v in obj.items()}
|
||||
|
||||
# 3) List → recurse on each element
|
||||
# 3) List – recurse on each element
|
||||
if isinstance(obj, list):
|
||||
return [deep_repair(v) for v in obj]
|
||||
@@ -66,34 +64,21 @@ def sanitize_blob(blob):
logger.error("Failed to sanitize blob: %s", e)
|
||||
return None
|
||||
|
||||
# Expressions to extract values
|
||||
# Expressions to extract values (M1 + added M2 fields)
|
||||
expressions = {
|
||||
# M1 (existing)
|
||||
"first_seen_days": [
|
||||
# 1) any vendor under integration_hub_results → first_seen_days
|
||||
"(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].first_seen_days)[0]",
|
||||
|
||||
# 2) the flat “dotted” key
|
||||
"Blob.\"emailage.emailriskscore.first_seen_days\"",
|
||||
|
||||
# 3) fallback to the top level tps_vendor_raw_response path
|
||||
'Blob."emailage.emailriskscore.first_seen_days"',
|
||||
"Blob.tps_vendor_raw_response.query.results[0].first_seen_days",
|
||||
],
|
||||
"ea_score": [
|
||||
# 1) any vendor under integration_hub_results
|
||||
'Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].EAScore',
|
||||
|
||||
# 2) the flat “dotted” key
|
||||
"Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].EAScore",
|
||||
'Blob."emailage.emailriskscore.eascore"',
|
||||
|
||||
# 3) fallback to the top level tps_vendor_raw_response
|
||||
'Blob.tps_vendor_raw_response.query.results[0].EAScore',
|
||||
"Blob.tps_vendor_raw_response.query.results[0].EAScore",
|
||||
],
|
||||
"email_creation_days": [
|
||||
# 1) any vendor under integration_hub_results → results[0].email_creation_days
|
||||
"(Blob.integration_hub_results.*"
|
||||
".tps_vendor_raw_response.query.results[0].email_creation_days)[0]",
|
||||
|
||||
# 2) fallback to the top level tps_vendor_raw_response path
|
||||
"(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].email_creation_days)[0]",
|
||||
"Blob.tps_vendor_raw_response.query.results[0].email_creation_days",
|
||||
],
|
||||
"summary_risk_score": ["Blob.summary_risk_score"],
@@ -102,11 +87,7 @@ expressions = {
"account_email_worst_score": ["Blob.account_email_worst_score"],
|
||||
"true_ip_score": ["Blob.true_ip_score"],
|
||||
"ip_net_speed_cell": [
|
||||
# 1) any vendor under integration_hub_results → results[0].ip_netSpeedCell
|
||||
"(Blob.integration_hub_results.*"
|
||||
".tps_vendor_raw_response.query.results[0].ip_netSpeedCell)[0]",
|
||||
|
||||
# 2) fallback to the top level tps_vendor_raw_response path
|
||||
"(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ip_netSpeedCell)[0]",
|
||||
"Blob.tps_vendor_raw_response.query.results[0].ip_netSpeedCell",
|
||||
],
|
||||
"account_email_score": ["Blob.account_email_score"],
@@ -115,17 +96,105 @@ expressions = {
"proxy_ip_score": ["Blob.proxy_ip_score"],
|
||||
"fuzzy_device_score": ["Blob.fuzzy_device_score"],
|
||||
"ip_region_confidence": [
|
||||
# 1) any vendor under integration_hub_results → results[0].ip_regionconf
|
||||
"(Blob.integration_hub_results.*"
|
||||
".tps_vendor_raw_response.query.results[0].ip_regionconf)[0]",
|
||||
|
||||
# 2) fallback to the top level tps_vendor_raw_response path
|
||||
"(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ip_regionconf)[0]",
|
||||
"Blob.tps_vendor_raw_response.query.results[0].ip_regionconf",
|
||||
],
|
||||
"true_ip_state_confidence": ["Blob.true_ip_state_confidence"],
|
||||
"fuzzy_device_worst_score": ["Blob.fuzzy_device_worst_score"],
|
||||
"digital_id_confidence_rating": ["Blob.digital_id_confidence_rating"],
|
||||
"trueipgeo": ["TrueIpGeo","Blob.true_ip_geo"],
|
||||
"trueipgeo": ["TrueIpGeo", "Blob.true_ip_geo"],
|
||||
# M2 additions
|
||||
"policy_score": ["Blob.policy_score"],
|
||||
"digital_id_trust_score": ["Blob.digital_id_trust_score"],
|
||||
"proxy_score": ["Blob.proxy_score"],
|
||||
"browser_spoof_score": ["Blob.browser_spoof_score"],
|
||||
"input_ip_connection_type": ["Blob.input_ip_connection_type"],
|
||||
"fuzzy_device_id_confidence": ["Blob.fuzzy_device_id_confidence"],
|
||||
"fraudrisk": [
|
||||
"(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].fraudRisk)[0]",
|
||||
"Blob.tps_vendor_raw_response.query.results[0].fraudRisk",
|
||||
'Blob."emailage.emailriskscore.fraudRisk"',
|
||||
],
|
||||
"overalldigitalidentityscore": [
|
||||
"(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].overallDigitalIdentityScore)[0]",
|
||||
"Blob.tps_vendor_raw_response.query.results[0].overallDigitalIdentityScore",
|
||||
'Blob."emailage.emailriskscore.overallDigitalIdentityScore"',
|
||||
],
|
||||
"totalhits": [
|
||||
"(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].totalhits)[0]",
|
||||
"Blob.tps_vendor_raw_response.query.results[0].totalhits",
|
||||
'Blob."emailage.emailriskscore.totalhits"',
|
||||
],
|
||||
"uniquehits": [
|
||||
"(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].uniquehits)[0]",
|
||||
"Blob.tps_vendor_raw_response.query.results[0].uniquehits",
|
||||
'Blob."emailage.emailriskscore.uniquehits"',
|
||||
],
|
||||
"emailtofullnameconfidence": [
|
||||
"(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].emailToFullNameConfidence)[0]",
|
||||
"Blob.tps_vendor_raw_response.query.results[0].emailToFullNameConfidence",
|
||||
'Blob."emailage.emailriskscore.emailToFullNameConfidence"',
|
||||
],
|
||||
"emailtolastnameconfidence": [
|
||||
"(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].emailToLastNameConfidence)[0]",
|
||||
"Blob.tps_vendor_raw_response.query.results[0].emailToLastNameConfidence",
|
||||
'Blob."emailage.emailriskscore.emailToLastNameConfidence"',
|
||||
],
|
||||
"domain_creation_days": [
|
||||
"(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].domain_creation_days)[0]",
|
||||
"Blob.tps_vendor_raw_response.query.results[0].domain_creation_days",
|
||||
'Blob."emailage.emailriskscore.domain_creation_days"',
|
||||
],
|
||||
"iptophoneconfidence": [
|
||||
"(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ipToPhoneConfidence)[0]",
|
||||
"Blob.tps_vendor_raw_response.query.results[0].ipToPhoneConfidence",
|
||||
'Blob."emailage.emailriskscore.ipToPhoneConfidence"',
|
||||
],
|
||||
"di_autofill_count_login": [
|
||||
"Blob.tmx_variables.di_autofill_count_login",
|
||||
"Blob.policy_details_api.policy_detail_api.customer.rules.vars.variable.di_autofill_count_login",
|
||||
],
|
||||
"accphone_gbl_velocity_hour": [
|
||||
"Blob.tmx_variables.accphone_gbl_velocity_hour",
|
||||
"Blob.tmx_variables._accphone_gbl_velocity_hour",
|
||||
],
|
||||
# Lat/long fields for distance engineering
|
||||
"ip_latitude": [
|
||||
"(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ip_latitude)[0]",
|
||||
"Blob.tps_vendor_raw_response.query.results[0].ip_latitude",
|
||||
],
|
||||
"ip_longitude": [
|
||||
"(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ip_longitude)[0]",
|
||||
"Blob.tps_vendor_raw_response.query.results[0].ip_longitude",
|
||||
],
|
||||
"tps_ip_latitude": ["Blob.tps_vendor_raw_response.query.results[0].ip_latitude"],
|
||||
"tps_ip_longitude": ["Blob.tps_vendor_raw_response.query.results[0].ip_longitude"],
|
||||
"true_ip_latitude": ["Blob.true_ip_latitude"],
|
||||
"true_ip_longitude": ["Blob.true_ip_longitude"],
|
||||
"proxy_ip_latitude": ["Blob.proxy_ip_latitude"],
|
||||
"proxy_ip_longitude": ["Blob.proxy_ip_longitude"],
|
||||
"dns_ip_latitude": ["Blob.dns_ip_latitude"],
|
||||
"dns_ip_longitude": ["Blob.dns_ip_longitude"],
|
||||
"input_ip_latitude": ["Blob.input_ip_latitude"],
|
||||
"input_ip_longitude": ["Blob.input_ip_longitude"],
|
||||
# First-seen timestamps for age deltas
|
||||
"digital_id_first_seen": ["Blob.digital_id_first_seen"],
|
||||
"account_email_first_seen": ["Blob.account_email_first_seen"],
|
||||
"account_login_first_seen": ["Blob.account_login_first_seen"],
|
||||
"account_telephone_first_seen": ["Blob.account_telephone_first_seen"],
|
||||
"true_ip_first_seen": ["Blob.true_ip_first_seen"],
|
||||
"ssn_hash_first_seen": ["Blob.ssn_hash_first_seen"],
|
||||
"fuzzy_device_first_seen": ["Blob.fuzzy_device_first_seen"],
|
||||
"national_id_first_seen": ["Blob.national_id_first_seen"],
|
||||
"proxy_ip_first_seen": ["Blob.proxy_ip_first_seen"],
|
||||
# Attribute arrays (used for one-hot style parsing)
|
||||
"account_name_activities": ["Blob.account_name_activities"],
|
||||
"account_email_attributes": ["Blob.account_email_attributes"],
|
||||
"true_ip_attributes": ["Blob.true_ip_attributes"],
|
||||
"true_ip_activities": ["Blob.true_ip_activities"],
|
||||
"digital_id_attributes": ["Blob.digital_id_attributes"],
|
||||
"account_telephone_attributes": ["Blob.account_telephone_attributes"],
|
||||
"cpu_clock": ["Blob.cpu_clock"]
|
||||
}
|
||||
|
||||
@@ -156,9 +225,9 @@ def __main__(
TrueIpGeo: str,
|
||||
Blob: str,
|
||||
DeviceId: str,
|
||||
FuzzyDeviceId: str
|
||||
FuzzyDeviceId: str,
|
||||
ReasonCode: str,
|
||||
) -> dict:
|
||||
|
||||
# Convert input parameters into a flat dictionary
|
||||
data = {
|
||||
"application_key": application_key,
@@ -184,49 +253,82 @@ def __main__(
"TrueIpGeo": TrueIpGeo,
|
||||
"Blob": Blob,
|
||||
"DeviceId": DeviceId,
|
||||
"FuzzyDeviceId": FuzzyDeviceId
|
||||
"FuzzyDeviceId": FuzzyDeviceId,
|
||||
"ReasonCode": ReasonCode,
|
||||
}
|
||||
|
||||
# Convert dictionary to a single-row DataFrame
|
||||
combined_df = pd.DataFrame([data])
|
||||
combined_df.columns = combined_df.columns.str.lower()
|
||||
|
||||
# Uncomment Below For Testing using Uprova Batch Data
|
||||
# combined_df["educationlevel"] = None
|
||||
# combined_df["monthsatresidence"] = None
|
||||
# combined_df["ownhome"] = False
|
||||
# combined_df['lengthatbank'] = 0
|
||||
|
||||
combined_df["application_email_address"] = combined_df["application_email_address"].str.lower()
|
||||
if Blob:
|
||||
combined_df["blob"] = combined_df["blob"].apply(sanitize_blob)
|
||||
|
||||
# Step 2: Extract values using the expressions dictionary
|
||||
for column, expressions_list in expressions.items():
|
||||
combined_df[column] = combined_df["blob"].apply(lambda x: coalesce(
|
||||
*[extract_value(x, expr) for expr in expressions_list]))
|
||||
def _extract_with_fallback(blob_obj):
|
||||
values = []
|
||||
for expr in expressions_list:
|
||||
val = extract_value(blob_obj, expr)
|
||||
if val is None and isinstance(expr, str) and expr.startswith("Blob."):
|
||||
val = extract_value(blob_obj, expr[len("Blob.") :])
|
||||
values.append(val)
|
||||
return coalesce(*values)
|
||||
|
||||
logger.info("pre_flowx data")
|
||||
logger.info(combined_df.iloc[0].drop('blob').to_dict())
|
||||
extracted = combined_df["blob"].apply(_extract_with_fallback)
|
||||
if column in combined_df.columns:
|
||||
combined_df[column] = extracted.where(extracted.notnull(), combined_df[column])
|
||||
else:
|
||||
for column, expressions_list in expressions.items():
|
||||
combined_df[column] = extracted
|
||||
|
||||
# logger.info("pre_flowx data")
|
||||
# logger.info(combined_df.iloc[0].drop("blob").to_dict())
|
||||
else:
|
||||
for column in expressions:
|
||||
combined_df[column] = None
|
||||
logger.info("pre_flowx data")
|
||||
logger.info(combined_df.iloc[0].to_dict())
|
||||
pre_processed_data = pre_processing(combined_df)
|
||||
# logger.info(f"pre_processed_data: {pre_processed_data}")
|
||||
logger.info("pre_processed data")
|
||||
logger.info(pre_processed_data.iloc[0].to_dict())
|
||||
df = processing(pre_processed_data)
|
||||
logger.info("processed_data")
|
||||
logger.info(df.iloc[0].to_dict())
|
||||
df["application_timestamp"] = df["application_timestamp"].astype(str)
|
||||
# logger.info("prediction: %.8f", float(df['prediction'].iloc[0]))
|
||||
result = post_processing(df)
|
||||
logger.info("post_processed_data")
|
||||
logger.info(result)
|
||||
# logger.info("pre_flowx data")
|
||||
# logger.info(combined_df.iloc[0].to_dict())
|
||||
df_m1, df_m2, df_thx = pre_processing_all(combined_df)
|
||||
# logger.info("pre_processed data m1")
|
||||
# logger.info(df_m1.iloc[0].to_dict())
|
||||
# logger.info("pre_processed data m2")
|
||||
# logger.info(df_m2.iloc[0].to_dict())
|
||||
|
||||
processed_m1, processed_m2, df_thx = processing_all(df_m1, df_m2, df_thx)
|
||||
# logger.info("processed_data m1")
|
||||
# logger.info(processed_m1.iloc[0].to_dict())
|
||||
# logger.info("processed_data m2")
|
||||
# logger.info(processed_m2.iloc[0].to_dict())
|
||||
|
||||
result = post_processing_all(processed_m1, processed_m2, df_thx)
|
||||
# State Check
|
||||
state_value = combined_df["state"].iloc[0]
|
||||
zip_value = combined_df["zip"].iloc[0]
|
||||
if (pd.notnull(state_value) and state_value == "ZZ") or (pd.notnull(zip_value) and zip_value == "86445"):
|
||||
if (pd.notnull(state_value) and state_value == "ZZ") or (
|
||||
pd.notnull(zip_value) and zip_value == "86445"
|
||||
):
|
||||
result["hd_score_m1"] = 1250
|
||||
logger.info("post_processed_data after state check")
|
||||
logger.info(result)
|
||||
result["hd_score_m2"] = 1250
|
||||
result["hd_score_iso_m2"] = 1250
|
||||
# logger.info("post_processed_data after state check")
|
||||
# logger.info(result)
|
||||
|
||||
# Normalize numeric scores to built-in float so JSON encoding (e.g. Temporal)
|
||||
# does not fail on NumPy scalar types like np.float32/np.float64.
|
||||
for key in ("hd_score_m1", "hd_score_m2", "hd_score_iso_m2"):
|
||||
if key in result and result[key] is not None:
|
||||
try:
|
||||
result[key] = float(result[key])
|
||||
except (TypeError, ValueError):
|
||||
logger.warning("Failed to cast %s=%r to float", key, result[key])
|
||||
|
||||
print(result)
|
||||
|
||||
return result
|
||||
|
||||
# testing :
|
||||
# __main__
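Illustrative aside (not part of the commit): the expressions table above feeds a coalesce-over-jmespath lookup with fallback paths; a minimal, standalone sketch of that extraction pattern on a toy blob:

import jmespath

def extract_value(blob, expression):
    # jmespath returns None when the path does not exist
    try:
        return jmespath.search(expression, blob)
    except Exception:
        return None

def coalesce(*args):
    # first non-None value wins
    return next((value for value in args if value is not None), None)

# toy blob and fallback paths; the 410 value is made up for illustration
blob = {"Blob": {"tps_vendor_raw_response": {"query": {"results": [{"EAScore": 410}]}}}}
paths = [
    "Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].EAScore",
    "Blob.tps_vendor_raw_response.query.results[0].EAScore",
]
print(coalesce(*[extract_value(blob, expr) for expr in paths]))  # -> 410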
category_orders_train_M1.json | 88 (Normal file)
@@ -0,0 +1,88 @@
{
|
||||
"employmentstatus": [
|
||||
"disability",
|
||||
"fixed income",
|
||||
"full time employed",
|
||||
"other",
|
||||
"part time employment",
|
||||
"retired benefits",
|
||||
"self employed",
|
||||
"student",
|
||||
"unemployed",
|
||||
"welfare"
|
||||
],
|
||||
"TrueIpGeo": [
|
||||
"other",
|
||||
"us"
|
||||
],
|
||||
"digital_id_trust_score_rating": [
|
||||
"high",
|
||||
"low",
|
||||
"neutral",
|
||||
"very_high",
|
||||
"very_low"
|
||||
],
|
||||
"educationlevel": [
|
||||
"associate's degree",
|
||||
"bachelor's degree",
|
||||
"doctorate",
|
||||
"high school",
|
||||
"master's degree",
|
||||
"other"
|
||||
],
|
||||
"os_version": [
|
||||
"10",
|
||||
"11",
|
||||
"12",
|
||||
"13",
|
||||
"14",
|
||||
"15",
|
||||
"16",
|
||||
"17",
|
||||
"18",
|
||||
"8",
|
||||
"9",
|
||||
"unknown"
|
||||
],
|
||||
"ip_net_speed_cell": [
|
||||
"broadband",
|
||||
"cable",
|
||||
"dialup",
|
||||
"dsl",
|
||||
"fixed wireless",
|
||||
"mobile",
|
||||
"mobile wireless",
|
||||
"ocx",
|
||||
"satellite",
|
||||
"t1",
|
||||
"tx",
|
||||
"wireless",
|
||||
"xdsl"
|
||||
],
|
||||
"day_night": [
|
||||
"Day",
|
||||
"Night"
|
||||
],
|
||||
"digital_id_confidence_rating": [
|
||||
"high",
|
||||
"medium",
|
||||
"very_high",
|
||||
"very_low"
|
||||
],
|
||||
"RiskRating": [
|
||||
"high",
|
||||
"low",
|
||||
"medium",
|
||||
"neutral",
|
||||
"trusted"
|
||||
],
|
||||
"payfrequency": [
|
||||
"biweekly",
|
||||
"semimonthly"
|
||||
],
|
||||
"ownhome": [
|
||||
"false",
|
||||
"true"
|
||||
]
|
||||
|
||||
}
category_orders_train_M2.json | 303 (Normal file)
@@ -0,0 +1,303 @@
{
|
||||
"riskrating": [
|
||||
"high",
|
||||
"low",
|
||||
"medium",
|
||||
"neutral",
|
||||
"trusted"
|
||||
],
|
||||
"input_ip_connection_type": [
|
||||
"cable",
|
||||
"consumer satellite",
|
||||
"dialup",
|
||||
"dsl",
|
||||
"fixed wireless",
|
||||
"framerelay",
|
||||
"isdn",
|
||||
"mobile wireless",
|
||||
"ocx",
|
||||
"tx"
|
||||
],
|
||||
"fraudrisk": [
|
||||
"001 very low",
|
||||
"003 very low",
|
||||
"005 very low",
|
||||
"006 very low",
|
||||
"008 very low",
|
||||
"009 very low",
|
||||
"010 very low",
|
||||
"011 very low",
|
||||
"012 very low",
|
||||
"014 very low",
|
||||
"015 very low",
|
||||
"016 very low",
|
||||
"017 very low",
|
||||
"018 very low",
|
||||
"020 very low",
|
||||
"021 very low",
|
||||
"022 very low",
|
||||
"023 very low",
|
||||
"024 very low",
|
||||
"025 very low",
|
||||
"026 very low",
|
||||
"027 very low",
|
||||
"028 very low",
|
||||
"029 very low",
|
||||
"030 very low",
|
||||
"031 very low",
|
||||
"032 very low",
|
||||
"033 very low",
|
||||
"034 very low",
|
||||
"035 very low",
|
||||
"036 very low",
|
||||
"037 very low",
|
||||
"038 very low",
|
||||
"039 very low",
|
||||
"040 very low",
|
||||
"041 very low",
|
||||
"042 very low",
|
||||
"043 very low",
|
||||
"044 very low",
|
||||
"045 very low",
|
||||
"046 very low",
|
||||
"047 very low",
|
||||
"048 very low",
|
||||
"049 very low",
|
||||
"050 very low",
|
||||
"051 very low",
|
||||
"052 very low",
|
||||
"053 very low",
|
||||
"054 very low",
|
||||
"055 very low",
|
||||
"056 very low",
|
||||
"057 very low",
|
||||
"058 very low",
|
||||
"059 very low",
|
||||
"060 very low",
|
||||
"061 very low",
|
||||
"062 very low",
|
||||
"063 very low",
|
||||
"064 very low",
|
||||
"065 very low",
|
||||
"066 very low",
|
||||
"067 very low",
|
||||
"068 very low",
|
||||
"069 very low",
|
||||
"070 very low",
|
||||
"071 very low",
|
||||
"072 very low",
|
||||
"073 very low",
|
||||
"074 very low",
|
||||
"075 very low",
|
||||
"076 very low",
|
||||
"077 very low",
|
||||
"078 very low",
|
||||
"079 very low",
|
||||
"080 very low",
|
||||
"081 very low",
|
||||
"082 very low",
|
||||
"083 very low",
|
||||
"084 very low",
|
||||
"085 very low",
|
||||
"086 very low",
|
||||
"087 very low",
|
||||
"088 very low",
|
||||
"089 very low",
|
||||
"090 very low",
|
||||
"091 very low",
|
||||
"092 very low",
|
||||
"093 very low",
|
||||
"094 very low",
|
||||
"095 very low",
|
||||
"096 very low",
|
||||
"097 very low",
|
||||
"098 very low",
|
||||
"099 very low",
|
||||
"100 very low",
|
||||
"101 low",
|
||||
"102 low",
|
||||
"103 low",
|
||||
"104 low",
|
||||
"105 low",
|
||||
"106 low",
|
||||
"107 low",
|
||||
"108 low",
|
||||
"109 low",
|
||||
"110 low",
|
||||
"111 low",
|
||||
"112 low",
|
||||
"113 low",
|
||||
"114 low",
|
||||
"115 low",
|
||||
"116 low",
|
||||
"117 low",
|
||||
"118 low",
|
||||
"119 low",
|
||||
"120 low",
|
||||
"121 low",
|
||||
"122 low",
|
||||
"123 low",
|
||||
"124 low",
|
||||
"125 low",
|
||||
"126 low",
|
||||
"127 low",
|
||||
"128 low",
|
||||
"129 low",
|
||||
"130 low",
|
||||
"131 low",
|
||||
"132 low",
|
||||
"133 low",
|
||||
"134 low",
|
||||
"135 low",
|
||||
"136 low",
|
||||
"137 low",
|
||||
"138 low",
|
||||
"139 low",
|
||||
"140 low",
|
||||
"141 low",
|
||||
"142 low",
|
||||
"143 low",
|
||||
"144 low",
|
||||
"145 low",
|
||||
"146 low",
|
||||
"147 low",
|
||||
"148 low",
|
||||
"149 low",
|
||||
"153 low",
|
||||
"154 low",
|
||||
"156 low",
|
||||
"157 low",
|
||||
"158 low",
|
||||
"159 low",
|
||||
"160 low",
|
||||
"161 low",
|
||||
"162 low",
|
||||
"163 low",
|
||||
"164 low",
|
||||
"165 low",
|
||||
"166 low",
|
||||
"167 low",
|
||||
"168 low",
|
||||
"169 low",
|
||||
"170 low",
|
||||
"171 low",
|
||||
"172 low",
|
||||
"173 low",
|
||||
"174 low",
|
||||
"175 low",
|
||||
"177 low",
|
||||
"178 low",
|
||||
"179 low",
|
||||
"180 low",
|
||||
"181 low",
|
||||
"182 low",
|
||||
"183 low",
|
||||
"184 low",
|
||||
"185 low",
|
||||
"186 low",
|
||||
"187 low",
|
||||
"188 low",
|
||||
"189 low",
|
||||
"190 low",
|
||||
"191 low",
|
||||
"192 low",
|
||||
"193 low",
|
||||
"194 low",
|
||||
"195 low",
|
||||
"196 low",
|
||||
"197 low",
|
||||
"198 low",
|
||||
"199 low",
|
||||
"200 low",
|
||||
"201 low",
|
||||
"202 low",
|
||||
"203 low",
|
||||
"204 low",
|
||||
"205 low",
|
||||
"206 low",
|
||||
"207 low",
|
||||
"208 low",
|
||||
"209 low",
|
||||
"210 low",
|
||||
"211 low",
|
||||
"212 low",
|
||||
"213 low",
|
||||
"214 low",
|
||||
"215 low",
|
||||
"216 low",
|
||||
"217 low",
|
||||
"218 low",
|
||||
"219 low",
|
||||
"220 low",
|
||||
"221 low",
|
||||
"222 low",
|
||||
"224 low",
|
||||
"225 low",
|
||||
"226 low",
|
||||
"227 low",
|
||||
"228 low",
|
||||
"229 low",
|
||||
"230 low",
|
||||
"231 low",
|
||||
"232 low",
|
||||
"233 low",
|
||||
"234 low",
|
||||
"235 low",
|
||||
"236 low",
|
||||
"237 low",
|
||||
"238 low",
|
||||
"239 low",
|
||||
"240 low",
|
||||
"241 low",
|
||||
"242 low",
|
||||
"243 low",
|
||||
"244 low",
|
||||
"245 low",
|
||||
"246 low",
|
||||
"247 low",
|
||||
"248 low",
|
||||
"250 low",
|
||||
"252 low",
|
||||
"254 low",
|
||||
"259 low",
|
||||
"267 low",
|
||||
"268 low",
|
||||
"271 low",
|
||||
"272 low",
|
||||
"274 low",
|
||||
"275 low",
|
||||
"278 low",
|
||||
"282 low",
|
||||
"287 low",
|
||||
"288 low",
|
||||
"289 low",
|
||||
"290 low",
|
||||
"291 low",
|
||||
"293 low",
|
||||
"296 low",
|
||||
"297 low",
|
||||
"464 moderate",
|
||||
"467 moderate",
|
||||
"485 moderate",
|
||||
"491 moderate",
|
||||
"492 moderate",
|
||||
"496 moderate",
|
||||
"702 review",
|
||||
"703 review",
|
||||
"705 review",
|
||||
"706 review",
|
||||
"707 review",
|
||||
"708 review",
|
||||
"710 review",
|
||||
"730 review",
|
||||
"790 review",
|
||||
"801 high",
|
||||
"890 high",
|
||||
"902 very high",
|
||||
"906 very high"
|
||||
],
|
||||
"day_night": [
|
||||
"day",
|
||||
"night"
|
||||
]
|
||||
}
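Illustrative aside (assumption; the consuming code is not part of this diff): category-order files like the one above presumably pin the category levels seen at training time, e.g. for building ordered pandas categoricals:

import json

import pandas as pd

with open("category_orders_train_M2.json") as fh:
    category_orders = json.load(fh)

df = pd.DataFrame({"riskrating": ["low", "trusted", "unexpected"]})
# values outside the training-time levels become NaN instead of adding new levels
df["riskrating"] = pd.Categorical(df["riskrating"], categories=category_orders["riskrating"], ordered=True)
print(df["riskrating"])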
isotonic_model_M2.joblib | BIN (Normal file)
Binary file not shown.
latitute_longitute_reference.csv | 28079 (Normal file)
File diff suppressed because it is too large.
@@ -1,5 +1,10 @@
import logging
|
||||
from typing import Dict
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from pre_processing import THX_FIELDS
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
@@ -9,17 +14,85 @@ logging.basicConfig(
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def post_processing(df):
|
||||
|
||||
def post_processing_m1(df: pd.DataFrame) -> pd.DataFrame:
|
||||
df = df.copy()
|
||||
try:
|
||||
df['hd_score_m1'] = np.round(
|
||||
np.minimum(df['prediction'] * 100 + 0.00001, 1) * 85 +
|
||||
np.maximum(np.log2(df['prediction'] * 100 + 0.000001) * 185, 0),
|
||||
0
|
||||
df["hd_score_m1"] = np.round(
|
||||
np.minimum(df["prediction"] * 100 + 0.00001, 1) * 85
|
||||
+ np.maximum(np.log2(df["prediction"] * 100 + 0.000001) * 185, 0),
|
||||
0,
|
||||
)
|
||||
logging.info(f"hd_score_m1 calculated: {df['hd_score_m1'].iloc[0]}")
|
||||
logging.info("hd_score_m1 calculated: %s", df["hd_score_m1"].iloc[0])
|
||||
except Exception as e:
|
||||
logging.error(f"Error processing hd_score_m1 calculations: {e}")
|
||||
logging.error("Error processing hd_score_m1 calculations: %s", e)
|
||||
return df
|
||||
|
||||
|
||||
return df[['application_key', 'application_timestamp', 'deviceid', 'fuzzydeviceid', 'application_email_address', 'hd_score_m1']].iloc[0].to_dict()
|
||||
def post_processing_m2(df: pd.DataFrame) -> pd.DataFrame:
|
||||
df = df.copy()
|
||||
try:
|
||||
df["hd_score_m2"] = np.round(
|
||||
np.minimum(df["pd_m2"] * 100.0 + 0.00001, 1.0) * 75.0
|
||||
+ np.maximum(np.log2(df["pd_m2"] * 100.0 + 0.000001) * 180.0, 0.0),
|
||||
0,
|
||||
)
|
||||
df["hd_score_iso_m2"] = np.round(
|
||||
np.minimum(df["pd_m2_iso"] * 100.0 + 0.00001, 1.0) * 97.0
|
||||
+ np.maximum(np.log2(df["pd_m2_iso"] * 100.0 + 0.000001) * 246.0, 0.0),
|
||||
0,
|
||||
)
|
||||
logging.info("hd_score_m2 calculated: %s", df["hd_score_m2"].iloc[0])
|
||||
logging.info("hd_score_iso_m2 calculated: %s", df["hd_score_iso_m2"].iloc[0])
|
||||
except Exception as e:
|
||||
logging.error("Error processing hd_score_m2 calculations: %s", e)
|
||||
return df
|
||||
|
||||
|
||||
def _safe_get(df: pd.DataFrame, column: str):
|
||||
"""Return scalar from single-row DataFrame, normalizing NaN/None to None."""
|
||||
if column not in df.columns:
|
||||
return None
|
||||
val = df[column].iloc[0]
|
||||
if isinstance(val, (list, dict)):
|
||||
return val
|
||||
try:
|
||||
if pd.isna(val):
|
||||
return None
|
||||
except TypeError:
|
||||
pass
|
||||
return val
|
||||
|
||||
|
||||
def post_processing_all(df_m1: pd.DataFrame, df_m2: pd.DataFrame, df_thx: pd.DataFrame) -> Dict[str, object]:
|
||||
df_m1_scored = post_processing_m1(df_m1)
|
||||
df_m2_scored = post_processing_m2(df_m2)
|
||||
row_m1 = df_m1_scored.iloc[0]
|
||||
row_m2 = df_m2_scored.iloc[0]
|
||||
result = {
|
||||
"application_key": row_m1.get("application_key"),
|
||||
"application_timestamp": str(row_m1.get("application_timestamp")) if row_m1.get("application_timestamp") is not None else None,
|
||||
"deviceid": row_m1.get("deviceid"),
|
||||
"fuzzydeviceid": row_m1.get("fuzzydeviceid"),
|
||||
"application_email_address": row_m1.get("application_email_address"),
|
||||
"hd_score_m1": row_m1.get("hd_score_m1"),
|
||||
"hd_score_m2": row_m2.get("hd_score_m2"),
|
||||
"hd_score_iso_m2": row_m2.get("hd_score_iso_m2"),
|
||||
"action": None,
|
||||
}
|
||||
flattened_thx = {field: _safe_get(df_thx, field) for field in THX_FIELDS if field not in result}
|
||||
result.update(flattened_thx)
|
||||
return result
|
||||
|
||||
|
||||
# Legacy entry point for backward compatibility
|
||||
def post_processing(df: pd.DataFrame) -> Dict[str, object]:
|
||||
df_scored = post_processing_m1(df)
|
||||
row = df_scored.iloc[0]
|
||||
return {
|
||||
"application_key": row.get("application_key"),
|
||||
"application_timestamp": str(row.get("application_timestamp")) if row.get("application_timestamp") is not None else None,
|
||||
"deviceid": row.get("deviceid"),
|
||||
"fuzzydeviceid": row.get("fuzzydeviceid"),
|
||||
"application_email_address": row.get("application_email_address"),
|
||||
"hd_score_m1": row.get("hd_score_m1"),
|
||||
}
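Illustrative aside (not part of the commit): a quick numeric check of the hd_score_m1 mapping used above, for one example probability:

import numpy as np

prediction = 0.042  # example model probability, made up for illustration
hd_score_m1 = np.round(
    np.minimum(prediction * 100 + 0.00001, 1) * 85
    + np.maximum(np.log2(prediction * 100 + 0.000001) * 185, 0),
    0,
)
print(hd_score_m1)  # ~468.0 for this example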
@@ -1,6 +1,11 @@
import pandas as pd
|
||||
import numpy as np
|
||||
import logging
|
||||
import math
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
@@ -9,127 +14,288 @@ logging.basicConfig(
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
BASE_DIR = Path(__file__).resolve().parent
|
||||
M2_LATLONG_REF_PATH = BASE_DIR / "latitute_longitute_reference.csv"
|
||||
THX_FIELDS = [
|
||||
"application_key",
|
||||
"application_timestamp",
|
||||
"digital_id_first_seen",
|
||||
"summary_risk_score",
|
||||
"cpu_clock",
|
||||
"account_login_first_seen",
|
||||
"account_telephone_first_seen",
|
||||
"true_ip_first_seen",
|
||||
"ssn_hash_first_seen",
|
||||
"account_email_attributes",
|
||||
"tps_ip_latitude",
|
||||
"tps_ip_longitude",
|
||||
]
|
||||
|
||||
def pre_processing(data_df):
|
||||
# Hardcoded M2 data dictionary (replaces file lookup)
|
||||
M2_DATA_DICTIONARY: Dict[str, Dict[str, Union[float, str, None]]] = {
|
||||
"account_email_attributes_challenge_passed": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"account_email_attributes_challenged": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"account_email_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"account_login_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"account_name_activities_trusted_prob": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"account_telephone_attributes_challenge_failed": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"account_telephone_attributes_loan_app": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"account_telephone_attributes_trusted": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"account_telephone_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"accphone_gbl_velocity_hour": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"applicant_age": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"browser_spoof_score": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"day": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"day_cos": {"data_type": "float", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"day_sin": {"data_type": "float", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"di_autofill_count_login": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"digital_id_attributes_challenged": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"digital_id_attributes_trusted": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"digital_id_day_diff": {"data_type": "int", "valid_min": -999999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"digital_id_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"digital_id_month_diff": {"data_type": "int", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"digital_id_trust_score": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"digitalidconfidence": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"dist_dnsip_ref_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"dist_em_ip_ref_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"dist_inputip_ref_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"dist_proxyip_ref_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"dist_trueip_dnsip_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"dist_trueip_em_ip_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"dist_trueip_ref_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"domain_creation_days": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"emailtofullnameconfidence": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"emailtolastnameconfidence": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"first_seen_days": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"fraudrisk": {"data_type": "string", "valid_min": None, "valid_max": None, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"fuzzy_device_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"fuzzy_device_id_confidence": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"hour": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"input_ip_connection_type": {"data_type": "string", "valid_min": None, "valid_max": None, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"iptophoneconfidence": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"national_id_first_seen_age": {"data_type": "int", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"overalldigitalidentityscore": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"policy_score": {"data_type": "float", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"proxy_ip_first_seen_age": {"data_type": "int", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"proxy_ip_worst_score": {"data_type": "float", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"proxy_score": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"reasoncode_new_smartid_lt_1wk_global": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"riskrating": {"data_type": "string", "valid_min": None, "valid_max": None, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"ssn_hash_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"totalhits": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"true_ip_activities_trusted_prob": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"true_ip_attributes_trusted": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"true_ip_attributes_trusted_conf": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"true_ip_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"true_ip_worst_score": {"data_type": "float", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
"uniquehits": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
|
||||
}
|
||||
|
||||
# combined_df = pd.DataFrame([input_data])
|
||||
# data = pd.DataFrame(data)
|
||||
combined_df = data_df
|
||||
combined_df["applicant_age"] = combined_df.apply(lambda row: pd.to_datetime(row["application_timestamp"]).year - pd.to_datetime(row["application_date_of_birth"]).year if pd.notnull(row["application_timestamp"]) and pd.notnull(row["application_date_of_birth"]) else None,axis=1
|
||||
# Hardcoded one-hot config (parsed_feature, model_var, contains)
|
||||
M2_ONEHOT_CONFIG: List[Tuple[str, str, str]] = [
|
||||
("reasoncode", "reasoncode_new_smartid_lt_1wk_global", "new_smartid_lt_1wk_global"),
|
||||
("account_name_activities", "account_name_activities_trusted_prob", "trusted_prob"),
|
||||
("account_email_attributes", "account_email_attributes_challenged", "challenged"),
|
||||
("account_email_attributes", "account_email_attributes_challenge_passed", "challenge_passed"),
|
||||
("true_ip_attributes", "true_ip_attributes_trusted", "trusted"),
|
||||
("true_ip_attributes", "true_ip_attributes_trusted_conf", "trusted_conf"),
|
||||
("digital_id_attributes", "digital_id_attributes_challenged", "challenged"),
|
||||
("digital_id_attributes", "digital_id_attributes_trusted", "trusted"),
|
||||
("account_telephone_attributes", "account_telephone_attributes_challenge_failed", "challenge_failed"),
|
||||
("account_telephone_attributes", "account_telephone_attributes_loan_app", "loan_app"),
|
||||
("account_telephone_attributes", "account_telephone_attributes_trusted", "trusted"),
|
||||
("true_ip_activities", "true_ip_activities_trusted_prob", "trusted_prob"),
|
||||
]
|
||||
|
||||
|
||||
# ----------------------------
|
||||
# Helpers
|
||||
# ----------------------------
|
||||
def _handle_unknowns(X: pd.DataFrame, column: str, known_values: Iterable[str], default_treatment=None):
|
||||
if column not in X.columns:
|
||||
return X
|
||||
known_values = {str(val).lower() for val in known_values}
|
||||
invalid_values = {None, "none", "nan", pd.NA}
|
||||
X[column] = X[column].apply(
|
||||
lambda x: str(x).lower()
|
||||
if pd.notna(x) and str(x).lower() in known_values
|
||||
else (default_treatment if pd.notna(x) and str(x).lower() not in invalid_values else np.nan)
|
||||
)
|
||||
return X
|
||||
|
||||
|
||||
def _haversine_km(lat1, lon1, lat2, lon2):
|
||||
if None in (lat1, lon1, lat2, lon2):
|
||||
return None
|
||||
try:
|
||||
rlat1 = float(lat1) * math.pi / 180.0
|
||||
rlat2 = float(lat2) * math.pi / 180.0
|
||||
dlat = (float(lat2) - float(lat1)) * math.pi / 180.0
|
||||
dlon = (float(lon2) - float(lon1)) * math.pi / 180.0
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
a = (
|
||||
math.sin(dlat / 2.0) ** 2
|
||||
+ math.cos(rlat1) * math.cos(rlat2) * math.sin(dlon / 2.0) ** 2
|
||||
)
|
||||
a = min(1.0, max(0.0, a))
|
||||
return 2 * 6371.0088 * math.asin(math.sqrt(a))
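# Illustrative aside (not part of the commit): a quick sanity check of
# _haversine_km with two known points, e.g. roughly Reno (39.53, -119.81)
# to Las Vegas (36.17, -115.14), should land around 550-560 km:
# _haversine_km(39.53, -119.81, 36.17, -115.14)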
|
||||
|
||||
|
||||
def _prep_latlong_ref():
|
||||
if not M2_LATLONG_REF_PATH.exists():
|
||||
logger.warning("latitute_longitute_reference.csv missing at %s", M2_LATLONG_REF_PATH)
|
||||
return pd.DataFrame()
|
||||
try:
|
||||
ref = pd.read_csv(M2_LATLONG_REF_PATH, usecols=["postal_code_ref", "latitute_ref", "longitude_ref"])
|
||||
except Exception:
|
||||
ref = pd.read_csv(M2_LATLONG_REF_PATH)
|
||||
# keep lower string version for matching
|
||||
if "postal_code_ref" in ref.columns:
|
||||
ref["postal_code_ref"] = ref["postal_code_ref"].astype(str).str.lower()
|
||||
return ref
|
||||
|
||||
|
||||
def _normalize_zip_for_ref(zip_val):
|
||||
"""
|
||||
Normalize zip/postal code values so they match reference CSV keys.
|
||||
|
||||
- Floats like 89503.0 -> "89503"
|
||||
- Int-like strings "89503.0" -> "89503"
|
||||
Note: we intentionally avoid zero-filling to preserve behaviour seen in UAT references
|
||||
where leading-zero ZIPs are not matched to the reference table.
|
||||
"""
|
||||
if pd.isna(zip_val):
|
||||
return None
|
||||
if isinstance(zip_val, (int, float)) and not isinstance(zip_val, bool):
|
||||
return str(int(zip_val)).lower()
|
||||
zip_str = str(zip_val).strip()
|
||||
if zip_str.replace(".", "", 1).isdigit():
|
||||
try:
|
||||
return str(int(float(zip_str))).lower()
|
||||
except Exception:
|
||||
pass
|
||||
return zip_str.lower() if zip_str else None
|
||||
|
||||
|
||||
# ----------------------------
|
||||
# M1 Pre-processing (existing behaviour)
|
||||
# ----------------------------
|
||||
def pre_processing_m1(data_df: pd.DataFrame) -> pd.DataFrame:
|
||||
combined_df = data_df.copy()
|
||||
combined_df["applicant_age"] = combined_df.apply(
|
||||
lambda row: pd.to_datetime(row["application_timestamp"]).year - pd.to_datetime(row["application_date_of_birth"]).year
|
||||
if pd.notnull(row["application_timestamp"]) and pd.notnull(row["application_date_of_birth"])
|
||||
else None,
|
||||
axis=1,
|
||||
)
|
||||
|
||||
# Extracting Temporal features
|
||||
combined_df['application_timestamp'] = pd.to_datetime(combined_df["application_timestamp"])
|
||||
combined_df.loc[:, 'application_time'] = pd.to_datetime(combined_df['application_timestamp']).dt.time
|
||||
combined_df["application_timestamp"] = pd.to_datetime(combined_df["application_timestamp"])
|
||||
combined_df.loc[:, "application_time"] = pd.to_datetime(combined_df["application_timestamp"]).dt.time
|
||||
|
||||
combined_df['day'] = combined_df['application_timestamp'].dt.day
|
||||
combined_df['day_of_week'] = combined_df['application_timestamp'].dt.weekday # 0=Monday, 6=Sunday
|
||||
combined_df["day"] = combined_df["application_timestamp"].dt.day
|
||||
combined_df["day_of_week"] = combined_df["application_timestamp"].dt.weekday
|
||||
|
||||
combined_df['day_sin'] = np.sin(2 * np.pi * combined_df['day'] / 31)
|
||||
combined_df['day_cos'] = np.cos(2 * np.pi * combined_df['day'] / 31)
|
||||
combined_df['day_of_week_sin'] = np.sin(2 * np.pi * combined_df['day_of_week'] / 7)
|
||||
combined_df['day_of_week_cos'] = np.cos(2 * np.pi * combined_df['day_of_week'] / 7)
|
||||
combined_df["day_sin"] = np.sin(2 * np.pi * combined_df["day"] / 31)
|
||||
combined_df["day_cos"] = np.cos(2 * np.pi * combined_df["day"] / 31)
|
||||
combined_df["day_of_week_sin"] = np.sin(2 * np.pi * combined_df["day_of_week"] / 7)
|
||||
combined_df["day_of_week_cos"] = np.cos(2 * np.pi * combined_df["day_of_week"] / 7)
|
||||
|
||||
# combined_df['is_weekend'] = combined_df['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)
|
||||
|
||||
# Create a day/night variable
|
||||
def classify_day_night(hour):
|
||||
if 6 <= hour < 18:
|
||||
return 'Day'
|
||||
else:
|
||||
return 'Night'
|
||||
return "Day"
|
||||
return "Night"
|
||||
|
||||
# Extract hour from application_time
|
||||
combined_df['hour'] = combined_df['application_time'].apply(lambda x: x.hour if pd.notnull(x) else np.nan)
|
||||
combined_df['day_night'] = combined_df['hour'].apply(lambda hour: classify_day_night(hour) if pd.notnull(hour) else 'Unknown')
|
||||
combined_df["hour"] = combined_df["application_time"].apply(lambda x: x.hour if pd.notnull(x) else np.nan)
|
||||
combined_df["day_night"] = combined_df["hour"].apply(lambda hour: classify_day_night(hour) if pd.notnull(hour) else "Unknown")
|
||||
|
||||
# combined_df['os_version'] = combined_df['os_version'].str.replace(r'[^a-zA-Z0-9]', '_', regex=True)
|
||||
combined_df['os_version'] = combined_df['os_version'].apply(lambda x: x.split('.')[0] if isinstance(x, str) and '.' in x
|
||||
else x.split('_')[0] if isinstance(x, str) and '_' in x
|
||||
else x)
|
||||
combined_df["os_version"] = combined_df["os_version"].apply(
|
||||
lambda x: x.split(".")[0] if isinstance(x, str) and "." in x else x.split("_")[0] if isinstance(x, str) and "_" in x else x
|
||||
)
|
||||
|
||||
combined_df["Identity_Negative_History"] = combined_df["tmxsummaryreasoncode"].astype(str).str.contains(
|
||||
"Identity_Negative_History", na=False, regex=True
|
||||
).astype(int)
|
||||
combined_df["Device_Negative_History"] = combined_df["tmxsummaryreasoncode"].astype(str).str.contains(
|
||||
"Device_Negative_History", na=False, regex=True
|
||||
).astype(int)
|
||||
combined_df["Level_1_Link_Reject"] = combined_df["tmxsummaryreasoncode"].astype(str).str.contains(
|
||||
"Level_1_Link_Reject", na=False, regex=True
|
||||
).astype(int)
|
||||
combined_df["IP_Negative_History"] = combined_df["tmxsummaryreasoncode"].astype(str).str.contains(
|
||||
"IP_Negative_History", na=False, regex=True
|
||||
).astype(int)
|
||||
combined_df["Identity_Spoofing"] = combined_df["tmxsummaryreasoncode"].astype(str).str.contains(
|
||||
"Identity_Spoofing", na=False, regex=True
|
||||
).astype(int)
|
||||
|
||||
# Datatype conversions
|
||||
# combined_df['Level_1_Link_Accept'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Level_1_Link_Accept', na=False, regex=True).astype(int)
|
||||
combined_df['Identity_Negative_History'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Identity_Negative_History', na=False, regex=True).astype(int)
|
||||
combined_df['Device_Negative_History'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Device_Negative_History', na=False, regex=True).astype(int)
|
||||
combined_df['Level_1_Link_Reject'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Level_1_Link_Reject', na=False, regex=True).astype(int)
|
||||
combined_df['IP_Negative_History'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('IP_Negative_History', na=False, regex=True).astype(int)
|
||||
combined_df['Identity_Spoofing'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Identity_Spoofing', na=False, regex=True).astype(int)
|
||||
# combined_df['Bot'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Bot', na=False, regex=True).astype(int)
|
||||
combined_df["digitalidconfidence"] = pd.to_numeric(combined_df["digitalidconfidence"], errors="coerce").astype("Int64")
|
||||
|
||||
combined_df['digitalidconfidence'] = pd.to_numeric(combined_df['digitalidconfidence'], errors='coerce').astype('Int64')
|
||||
|
||||
# Rename Columns if Required
|
||||
combined_df.rename(columns={
|
||||
'DigitalIdConfidence': 'digitalidconfidence',
|
||||
# 'inputipaddress_consistency': 'inputip_consistency',
|
||||
# 'requestid_consistency': 'request_consistency',
|
||||
# Add others as required if present in your DataFrame and needing renaming.
|
||||
}, inplace=True)
|
||||
|
||||
# #Testing : remove below
|
||||
# combined_df.to_csv('op-pre-processing_intermediate.csv', index=False)
|
||||
combined_df.rename(
|
||||
columns={
|
||||
"DigitalIdConfidence": "digitalidconfidence",
|
||||
},
|
||||
inplace=True,
|
||||
)
|
||||
|
||||
dtype_dict = {
|
||||
"applicant_age" : int,
|
||||
"digitalidconfidence" : float,
|
||||
"first_seen_days" : float,
|
||||
"employmentstatus" : str,
|
||||
"ea_score" : float,
|
||||
"trueipgeo" : str,
|
||||
"hour" : int,
|
||||
"email_creation_days" : float,
|
||||
"lengthatjob" : float,
|
||||
"day_cos" : float,
|
||||
"summary_risk_score" : float,
|
||||
"digital_id_trust_score_rating" : str,
|
||||
"day" : 'int32',
|
||||
"lengthatbank" : float,
|
||||
"day_of_week_cos" : float,
|
||||
"Level_1_Link_Reject" : int,
|
||||
"Identity_Negative_History" : int,
|
||||
"educationlevel" : str,
|
||||
"os_version" : str,
|
||||
"account_email_worst_score" : float,
|
||||
"true_ip_score" : float,
|
||||
"ip_net_speed_cell" : str,
|
||||
"account_email_score" : float,
|
||||
"day_of_week" : 'int32',
|
||||
"true_ip_worst_score" : float,
|
||||
"proxy_ip_worst_score" : float,
|
||||
"day_night" : str,
|
||||
"proxy_ip_score" : float,
|
||||
"monthsatresidence" : float,
|
||||
"Device_Negative_History" : int,
|
||||
"fuzzy_device_score" : float,
|
||||
"day_sin" : float,
|
||||
"ip_region_confidence" : float,
|
||||
"true_ip_state_confidence" : float,
|
||||
"IP_Negative_History" : int,
|
||||
"fuzzy_device_worst_score" : float,
|
||||
"digital_id_confidence_rating" : str,
|
||||
"day_of_week_sin" : float,
|
||||
"riskrating" : str,
|
||||
"payfrequency" : str,
|
||||
"ownhome" : str,
|
||||
"Identity_Spoofing" : int
|
||||
"applicant_age": int,
|
||||
"digitalidconfidence": float,
|
||||
"first_seen_days": float,
|
||||
"employmentstatus": str,
|
||||
"ea_score": float,
|
||||
"trueipgeo": str,
|
||||
"hour": int,
|
||||
"email_creation_days": float,
|
||||
"lengthatjob": float,
|
||||
"day_cos": float,
|
||||
"summary_risk_score": float,
|
||||
"digital_id_trust_score_rating": str,
|
||||
"day": "int32",
|
||||
"lengthatbank": float,
|
||||
"day_of_week_cos": float,
|
||||
"Level_1_Link_Reject": int,
|
||||
"Identity_Negative_History": int,
|
||||
"educationlevel": str,
|
||||
"os_version": str,
|
||||
"account_email_worst_score": float,
|
||||
"true_ip_score": float,
|
||||
"ip_net_speed_cell": str,
|
||||
"account_email_score": float,
|
||||
"day_of_week": "int32",
|
||||
"true_ip_worst_score": float,
|
||||
"proxy_ip_worst_score": float,
|
||||
"day_night": str,
|
||||
"proxy_ip_score": float,
|
||||
"monthsatresidence": float,
|
||||
"Device_Negative_History": int,
|
||||
"fuzzy_device_score": float,
|
||||
"day_sin": float,
|
||||
"ip_region_confidence": float,
|
||||
"true_ip_state_confidence": float,
|
||||
"IP_Negative_History": int,
|
||||
"fuzzy_device_worst_score": float,
|
||||
"day_of_week_sin": float,
|
||||
"riskrating": str,
|
||||
"payfrequency": str,
|
||||
"ownhome": str,
|
||||
"Identity_Spoofing": int,
|
||||
}
|
||||
|
||||
next_block_cols = ['application_key', 'application_timestamp', 'deviceid', 'fuzzydeviceid', 'application_email_address']
|
||||
next_block_cols = ["application_key", "application_timestamp", "deviceid", "fuzzydeviceid", "application_email_address"]
|
||||
cols_to_keep = [col for col in dtype_dict.keys() if col in combined_df.columns]
|
||||
|
||||
final_cols = list(set(next_block_cols).union(set(cols_to_keep)))
|
||||
# Type casting
|
||||
for col, dtype in dtype_dict.items():
|
||||
if col in combined_df.columns:
|
||||
if dtype == int:
|
||||
combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce', downcast='integer')
|
||||
combined_df[col] = pd.to_numeric(combined_df[col], errors="coerce", downcast="integer")
|
||||
elif dtype == float:
|
||||
combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce', downcast='float')
|
||||
combined_df[col] = pd.to_numeric(combined_df[col], errors="coerce", downcast="float")
|
||||
elif dtype == str:
|
||||
combined_df[col] = combined_df[col].astype(str)
|
||||
# cross check data type
|
||||
|
||||
capping_dict = {
|
||||
"applicant_age": (18, 93),
|
||||
"digitalidconfidence": (0, 9017),
@@ -157,98 +323,254 @@ def pre_processing(data_df):
"fuzzy_device_score": (-29, 14),
|
||||
"day_sin": (-0.9987165072, 0.9987165072),
|
||||
"ip_region_confidence": (75, 99),
|
||||
# "true_ip_state_confidence": (5, 98),
|
||||
"IP_Negative_History": (0, 1),
|
||||
"fuzzy_device_worst_score": (-100, 0),
|
||||
"day_of_week_sin": (-0.9749279122, 0.9749279122),
|
||||
"Identity_Spoofing": (0, 1),
|
||||
}
|
||||
|
||||
|
||||
|
||||
# Apply capping
|
||||
for column, (cap_min, cap_max) in capping_dict.items():
|
||||
if column in combined_df.columns:
|
||||
combined_df[column] = combined_df[column].clip(lower=cap_min, upper=cap_max)
|
||||
|
||||
|
||||
def handle_unknowns(X, column, known_values, default_treatment=None):
|
||||
if column not in X.columns:
|
||||
return X # Return X to avoid NoneType error
|
||||
known_values = {str(val).lower() for val in known_values}
|
||||
invalid_values = {None, "none", "nan", pd.NA}
|
||||
X[column] = X[column].apply(
|
||||
lambda x: str(x).lower() if pd.notna(x) and str(x).lower() in known_values
|
||||
else (default_treatment if pd.notna(x) and str(x).lower() not in invalid_values else np.nan)
|
||||
)
|
||||
return X # Always return the DataFrame
|
||||
|
||||
|
||||
|
||||
|
||||
unknown_treatments = {
|
||||
"employmentstatus": {
|
||||
"valid_values": [
|
||||
"disability", "fixed income", "full time employed", "part time employment",
|
||||
"retired benefits", "self employed", "student", "unemployed", "welfare"
|
||||
"disability",
|
||||
"fixed income",
|
||||
"full time employed",
|
||||
"part time employment",
|
||||
"retired benefits",
|
||||
"self employed",
|
||||
"student",
|
||||
"unemployed",
|
||||
"welfare",
|
||||
],
|
||||
"default_treatment": "other"
|
||||
},
|
||||
"trueipgeo": {
|
||||
"valid_values": ["US"],
|
||||
"default_treatment": "other"
|
||||
},
|
||||
"digital_id_trust_score_rating": {
|
||||
"valid_values": ["very_high", "high", "neutral", "low"],
|
||||
"default_treatment": "very_low"
|
||||
"default_treatment": "other",
|
||||
},
|
||||
"trueipgeo": {"valid_values": ["US"], "default_treatment": "other"},
|
||||
"digital_id_trust_score_rating": {"valid_values": ["very_high", "high", "neutral", "low"], "default_treatment": "very_low"},
|
||||
"educationlevel": {
|
||||
"valid_values": ["associate's degree", "bachelor's degree", "doctorate", "high school", "master's degree"],
|
||||
"default_treatment": "other"
|
||||
"default_treatment": "other",
|
||||
},
|
||||
"os_version": {
|
||||
"valid_values": [
|
||||
'18', '17', '16', '15', '14', '13', '12', '11', '10', '9', '8'
|
||||
],
|
||||
"default_treatment": 'unknown'
|
||||
"valid_values": ["18", "17", "16", "15", "14", "13", "12", "11", "10", "9", "8"],
|
||||
"default_treatment": "unknown",
|
||||
},
|
||||
"ip_net_speed_cell": {
|
||||
"valid_values": [
|
||||
"broadband", "cable", "dialup", "dsl", "fixed wireless", "mobile", "mobile wireless", "ocx", "satellite",
|
||||
"t1", "tx", "wireless", "xdsl"
|
||||
"broadband",
|
||||
"cable",
|
||||
"dialup",
|
||||
"dsl",
|
||||
"fixed wireless",
|
||||
"mobile",
|
||||
"mobile wireless",
|
||||
"ocx",
|
||||
"satellite",
|
||||
"t1",
|
||||
"tx",
|
||||
"wireless",
|
||||
"xdsl",
|
||||
],
|
||||
"default_treatment": "mobile"
|
||||
},
|
||||
"digital_id_confidence_rating": {
|
||||
"valid_values": ["high", "medium", "very_high"],
|
||||
"default_treatment": "very_low"
|
||||
},
|
||||
"riskrating": {
|
||||
"valid_values": ["low", "medium", "neutral", "trusted"],
|
||||
"default_treatment": "high"
|
||||
},
|
||||
"ownhome": {
|
||||
"valid_values": ["true", "false"],
|
||||
"default_treatment": np.nan
|
||||
"default_treatment": "mobile",
|
||||
},
|
||||
"digital_id_confidence_rating": {"valid_values": ["high", "medium", "very_high"], "default_treatment": "very_low"},
|
||||
"riskrating": {"valid_values": ["low", "medium", "neutral", "trusted"], "default_treatment": "high"},
|
||||
"ownhome": {"valid_values": ["true", "false"], "default_treatment": np.nan},
|
||||
}
|
||||
|
||||
for column, treatment in unknown_treatments.items():
|
||||
combined_df = handle_unknowns(combined_df, column, treatment["valid_values"], treatment["default_treatment"])
|
||||
combined_df = _handle_unknowns(combined_df, column, treatment["valid_values"], treatment["default_treatment"])
|
||||
|

    payfrequency_map = {"biweekly": ["biweekly", "bi-weekly", "bi weekly", "bw"], "semimonthly": ["semi-monthly", "semimonthly"]}

    combined_df["payfrequency"] = combined_df["payfrequency"].apply(
        lambda x: next((key for key, values in payfrequency_map.items() if str(x).lower() in values), np.nan)
    )

    return combined_df[final_cols]


# ----------------------------
# M2 Pre-processing
# ----------------------------
def _apply_onehot_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    for parsed_feature, model_var, contains_val in M2_ONEHOT_CONFIG:
        value = df.get(parsed_feature, pd.Series([None])).iloc[0]
        flag = 0
        if isinstance(value, list):
            flag = int(any(contains_val in str(v).lower() or re.sub(r"[^a-z0-9]+", " ", contains_val) in re.sub(r"[^a-z0-9]+", " ", str(v).lower()) for v in value))
        elif isinstance(value, str):
            val_norm = re.sub(r"[^a-z0-9]+", " ", value.lower())
            contains_norm = re.sub(r"[^a-z0-9]+", " ", contains_val)
            flag = int(contains_val in value.lower() or contains_norm in val_norm)
        df[model_var] = flag
    return df
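
# M2_ONEHOT_CONFIG (defined elsewhere in this module) is assumed to hold
# (parsed_feature, model_var, contains_val) triples; a hypothetical entry such as
# ("account_email_attributes", "email_attr_disposable", "disposable") would set
# df["email_attr_disposable"] = 1 whenever any attribute value contains "disposable".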


def _extract_first_seen_days(ts_value, app_ts):
    ts = pd.to_datetime(ts_value, errors="coerce", utc=True)
    app = pd.to_datetime(app_ts, errors="coerce", utc=True)
    # align to naive for subtraction
    if isinstance(ts, pd.Timestamp) and ts.tzinfo is not None:
        ts = ts.tz_localize(None)
    if isinstance(app, pd.Timestamp) and app.tzinfo is not None:
        app = app.tz_localize(None)
    if pd.isna(ts) or pd.isna(app):
        return None
    return (app.normalize() - ts.normalize()).days
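    # Example: ts_value="2024-01-01T08:30:00Z" and app_ts="2024-01-31T02:00:00Z"
    # both normalize to midnight, so the function returns 30 (whole days between them).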


def _to_naive_ts(val):
    ts = pd.to_datetime(val, errors="coerce", utc=True)
    if isinstance(ts, pd.Timestamp) and ts.tzinfo is not None:
        ts = ts.tz_localize(None)
    return ts


def _month_diff(earlier, later):
    """Month difference (earlier - later) using year/month buckets."""
    ts_earlier = _to_naive_ts(earlier)
    ts_later = _to_naive_ts(later)
    if pd.isna(ts_earlier) or pd.isna(ts_later):
        return None
    return (ts_earlier.year - ts_later.year) * 12 + (ts_earlier.month - ts_later.month)
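    # Example: _month_diff("2023-11-15", "2024-02-03") = (2023 - 2024) * 12 + (11 - 2) = -3,
    # i.e. the first timestamp falls three calendar months before the second.


# _haversine_km is used below but defined elsewhere in this module; a minimal sketch of
# the assumed helper (great-circle distance in km, None when a coordinate is missing):
#
#     def _haversine_km(lat1, lon1, lat2, lon2):
#         import math
#         try:
#             lat1, lon1, lat2, lon2 = (float(v) for v in (lat1, lon1, lat2, lon2))
#         except (TypeError, ValueError):
#             return None
#         phi1, phi2 = math.radians(lat1), math.radians(lat2)
#         dphi = math.radians(lat2 - lat1)
#         dlmb = math.radians(lon2 - lon1)
#         a = math.sin(dphi / 2) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlmb / 2) ** 2
#         return 2 * 6371.0 * math.asin(math.sqrt(a))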


def pre_processing_m2(data_df: pd.DataFrame) -> pd.DataFrame:
    df = data_df.copy()
    df.columns = df.columns.str.lower()

    # Timestamp-derived features (align with M1 behaviour to keep probabilities consistent)
    df["application_timestamp"] = pd.to_datetime(df["application_timestamp"], errors="coerce", utc=True)
    df["day"] = df["application_timestamp"].dt.day
    df["hour"] = df["application_timestamp"].dt.hour
    df["day_sin"] = np.sin(2 * np.pi * df["day"] / 31)
    df["day_cos"] = np.cos(2 * np.pi * df["day"] / 31)
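    # The sin/cos pair encodes day-of-month cyclically, so e.g. day 31 and day 1 land
    # close together on the unit circle instead of 30 "units" apart.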

    def _classify_day_night(hour_val):
        if pd.isna(hour_val):
            return np.nan
        return "day" if 6 <= hour_val < 18 else "night"

    df["day_night"] = df["hour"].apply(_classify_day_night)

    # Apply onehot flags from attributes
    df = _apply_onehot_features(df)

    # Distances
    lat_ref = _prep_latlong_ref()
    if not lat_ref.empty and "zip" in df.columns:
        zip_value = df["zip"].iloc[0]
        zip_lookup = _normalize_zip_for_ref(zip_value)
        ref_row = lat_ref[lat_ref["postal_code_ref"] == zip_lookup] if zip_lookup else pd.DataFrame()
        lat_ref_val = ref_row["latitute_ref"].iloc[0] if not ref_row.empty else None
        lon_ref_val = ref_row["longitude_ref"].iloc[0] if not ref_row.empty else None
    else:
        lat_ref_val = None
        lon_ref_val = None

    df["dist_inputip_ref_km"] = df.apply(
        lambda r: _haversine_km(lat_ref_val, lon_ref_val, r.get("input_ip_latitude"), r.get("input_ip_longitude")), axis=1
    )
    df["dist_em_ip_ref_km"] = df.apply(
        lambda r: _haversine_km(lat_ref_val, lon_ref_val, r.get("ip_latitude") or r.get("tps_ip_latitude"), r.get("ip_longitude") or r.get("tps_ip_longitude")),
        axis=1,
    )
    df["dist_proxyip_ref_km"] = df.apply(
        lambda r: _haversine_km(lat_ref_val, lon_ref_val, r.get("proxy_ip_latitude"), r.get("proxy_ip_longitude")), axis=1
    )
    df["dist_dnsip_ref_km"] = df.apply(
        lambda r: _haversine_km(lat_ref_val, lon_ref_val, r.get("dns_ip_latitude"), r.get("dns_ip_longitude")), axis=1
    )
    df["dist_trueip_ref_km"] = df.apply(
        lambda r: _haversine_km(lat_ref_val, lon_ref_val, r.get("true_ip_latitude"), r.get("true_ip_longitude")), axis=1
    )
    df["dist_trueip_em_ip_km"] = df.apply(
        lambda r: _haversine_km(r.get("true_ip_latitude"), r.get("true_ip_longitude"), r.get("ip_latitude") or r.get("tps_ip_latitude"), r.get("ip_longitude") or r.get("tps_ip_longitude")),
        axis=1,
    )
    df["dist_trueip_dnsip_km"] = df.apply(
        lambda r: _haversine_km(r.get("true_ip_latitude"), r.get("true_ip_longitude"), r.get("dns_ip_latitude"), r.get("dns_ip_longitude")),
        axis=1,
    )

    # Ages
    app_ts_val = df.get("application_timestamp", pd.Series([None])).iloc[0]

    def _safe_day_diff(row):
        if not row.get("digital_id_first_seen"):
            return None
        val = _extract_first_seen_days(row.get("digital_id_first_seen"), app_ts_val)
        return -val if val is not None else None

    df["digital_id_day_diff"] = df.apply(_safe_day_diff, axis=1)
    df["digital_id_month_diff"] = df.apply(lambda r: _month_diff(r.get("digital_id_first_seen"), app_ts_val), axis=1)

    for col_name in [
        "digital_id_first_seen",
        "account_email_first_seen",
        "account_login_first_seen",
        "account_telephone_first_seen",
        "true_ip_first_seen",
        "ssn_hash_first_seen",
        "fuzzy_device_first_seen",
        "national_id_first_seen",
        "proxy_ip_first_seen",
    ]:
        out_col = f"{col_name}_age"
        df[out_col] = df.apply(lambda r: _extract_first_seen_days(r.get(col_name), app_ts_val), axis=1)

    # applicant_age for consistency if not present
    if "applicant_age" not in df.columns:
        df["applicant_age"] = df.apply(
            lambda row: pd.to_datetime(row["application_timestamp"]).year - pd.to_datetime(row["application_date_of_birth"]).year
            if pd.notnull(row.get("application_timestamp")) and pd.notnull(row.get("application_date_of_birth"))
            else None,
            axis=1,
        )

    # Safe casting and capping using data dictionary
    for var_name, rules in M2_DATA_DICTIONARY.items():
        if var_name not in df.columns:
            continue
        col = pd.to_numeric(df[var_name], errors="coerce") if rules.get("data_type") in ["float", "int"] else df[var_name]
        if rules.get("data_type") == "int":
            col = col.astype("float")
        valid_min = rules.get("valid_min")
        valid_max = rules.get("valid_max")
        observed_min = rules.get("observed_cap_min")
        observed_max = rules.get("observed_cap_max")
        if observed_min is not None or observed_max is not None:
            col = col.clip(lower=observed_min, upper=observed_max)
        # if valid_min is not None:
        #     col = col.where(col >= valid_min, np.nan)
        # if valid_max is not None:
        #     col = col.where(col <= valid_max, np.nan)
        df[var_name] = col

    return df
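
# M2_DATA_DICTIONARY (defined elsewhere in this module) is assumed to map each variable
# to casting/capping rules keyed by "data_type", "valid_min", "valid_max",
# "observed_cap_min" and "observed_cap_max"; a hypothetical entry such as
#   "digital_id_first_seen_age": {"data_type": "int", "observed_cap_min": 0, "observed_cap_max": 3650}
# would cast the column to float and clip it to [0, 3650]. Only the observed caps are
# applied above; the valid_min/valid_max filter is left commented out.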


def pre_processing_all(data_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    # Ensure requested THX fields exist so downstream packaging always has keys
    df_base = data_df.copy()

    for field in THX_FIELDS:
        if field in df_base.columns:
            df_base[field] = df_base[field].astype(str)
        else:
            df_base[field] = None
    df_thx = df_base[THX_FIELDS].copy()

    df_m1 = pre_processing_m1(df_base.copy())
    df_m2 = pre_processing_m2(df_base.copy())
    return df_m1, df_m2, df_thx


# Backwards compatible entry point (used by legacy code/tests if any)
def pre_processing(data_df: pd.DataFrame) -> pd.DataFrame:
    df_m1, _, _ = pre_processing_all(data_df)
    return df_m1
129
processing.py
129
processing.py
@ -1,46 +1,107 @@

import json
from functools import lru_cache
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb


# BASE_DIR = Path(__file__).resolve().parent
# M1_MODEL_PATH = BASE_DIR / "xgboost_model_M1.joblib"
# M1_CATEGORY_ORDERS_PATH = BASE_DIR / "category_orders_train_M1.json"
# M2_MODEL_PATH = BASE_DIR / "xgboost_model_M2.joblib"
# M2_CATEGORY_ORDERS_PATH = BASE_DIR / "category_orders_train_M2.json"
# M2_ISO_PATH = BASE_DIR / "isotonic_model_M2.joblib"

M1_MODEL_PATH = "./xgboost_model_M1.joblib"
M1_CATEGORY_ORDERS_PATH = "./category_orders_train_M1.json"
M2_MODEL_PATH = "./xgboost_model_M2.joblib"
M2_CATEGORY_ORDERS_PATH = "./category_orders_train_M2.json"
M2_ISO_PATH = "./isotonic_model_M2.joblib"


def _load_category_orders(path: Path) -> dict:
    with open(path, "r") as f:
        return json.load(f)


@lru_cache(maxsize=1)
def _load_m1_model():
    return joblib.load(M1_MODEL_PATH)


@lru_cache(maxsize=1)
def _load_m2_model():
    return joblib.load(M2_MODEL_PATH)


@lru_cache(maxsize=1)
def _load_m2_iso_model():
    return joblib.load(M2_ISO_PATH)


@lru_cache(maxsize=None)
def _load_category_orders_cached(path: Path):
    # Cache category orders per path to avoid disk I/O on each scoring
    return _load_category_orders(path)


def _prepare(df: pd.DataFrame, category_orders: dict) -> pd.DataFrame:
    df = df.copy()
    for col, categories in category_orders.items():
        if col not in df.columns:
            df[col] = np.nan

        df[col] = df[col].astype(str).str.lower()
        df[col].replace([None, "", "null", np.nan, "nan", " "], np.nan, inplace=True)
        df[col] = pd.Categorical(df[col], categories=categories, ordered=True)
    return df
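
# category_orders_train_M*.json is assumed to map each categorical feature to its ordered
# training categories, e.g. {"RiskRating": ["low", "medium", "neutral", "trusted", "high"], ...};
# _prepare lower-cases the column and coerces it onto exactly those categories, so unseen
# values end up as NaN in the resulting Categorical.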


def processing_m1(input_data: pd.DataFrame) -> pd.DataFrame:
    df = pd.DataFrame(input_data)
    if df.empty:
        raise ValueError("Input DataFrame is empty.")

    model = _load_m1_model()
    df.rename(columns={"riskrating": "RiskRating", "trueipgeo": "TrueIpGeo"}, inplace=True)
    category_orders = _load_category_orders_cached(M1_CATEGORY_ORDERS_PATH)
    df = _prepare(df, category_orders)

    # Ensure all expected features exist
    expected_features = model.feature_names

    # missing_features = [feature for feature in expected_features if feature not in df.columns]
    # for feature in missing_features:
    #     df[feature] = np.nan  # Use NaN to avoid dtype issues

    # Create XGBoost DMatrix
    dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=np.nan)

    # Make predictions
    predictions = model.predict(dmatrix)
    df["prediction"] = predictions
    return df


def processing_m2(input_data: pd.DataFrame) -> pd.DataFrame:
    df = pd.DataFrame(input_data)
    if df.empty:
        raise ValueError("Input DataFrame is empty.")

    model = _load_m2_model()
    category_orders = _load_category_orders_cached(M2_CATEGORY_ORDERS_PATH)
    df = _prepare(df, category_orders)

    expected_features = model.feature_names
    for feature in expected_features:
        if feature not in df.columns:
            df[feature] = np.nan
    dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=np.nan)
    pd_arr = model.predict(dmatrix)
    df["pd_m2"] = pd_arr

    iso_model = _load_m2_iso_model()
    df["pd_m2_iso"] = iso_model.predict(pd_arr)
    return df
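
# isotonic_model_M2.joblib is assumed to be a fitted scikit-learn isotonic regressor
# (hence the new scikit-learn pin in requirements); its predict() rescales the raw M2
# probability into the calibrated pd_m2_iso score.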


def processing_all(df_m1: pd.DataFrame, df_m2: pd.DataFrame, df_thx: pd.DataFrame):
    return processing_m1(df_m1), processing_m2(df_m2), df_thx


# Legacy single-model entry point
def processing(input_data: pd.DataFrame) -> pd.DataFrame:
    return processing_m1(input_data)
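

# Minimal local-run sketch (assumes the joblib/JSON artifacts above sit next to this file
# and that stdin carries one raw application as a JSON object; pre_processing_all comes
# from pre_processing.py):
if __name__ == "__main__":
    import sys

    from pre_processing import pre_processing_all

    raw_df = pd.DataFrame([json.load(sys.stdin)])
    df_m1, df_m2, df_thx = pre_processing_all(raw_df)
    out_m1, out_m2, _ = processing_all(df_m1, df_m2, df_thx)
    print({"m1": float(out_m1["prediction"].iloc[0]), "m2_iso": float(out_m2["pd_m2_iso"].iloc[0])})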
@ -97,6 +97,10 @@
    "zip": {
      "type": ["string", "null"],
      "description": "Zip of the current residence."
    },
    "ReasonCode": {
      "type": ["string", "null"],
      "description": "Reason code from ThreatMetrix."
    }
  },
  "required": []
@ -5,3 +5,4 @@ joblib == 1.4.2
jmespath == 1.0.1
regex == 2023.12.25
json_repair == 0.47.6
scikit-learn == 1.5.2
@ -26,6 +26,54 @@
      "type": ["number", "null"],
      "description": "HD Fraud Score M1"
    },
    "hd_score_m2": {
      "type": ["number", "null"],
      "description": "HD Fraud Score M2"
    },
    "hd_score_iso_m2": {
      "type": ["number", "null"],
      "description": "HD Fraud Score M2 Scaled"
    },
    "digital_id_first_seen": {
      "type": ["string", "null"],
      "description": "Digital ID first seen timestamp"
    },
    "summary_risk_score": {
      "type": ["string", "null"],
      "description": "Summary risk score"
    },
    "cpu_clock": {
      "type": ["string", "null"],
      "description": "CPU clock value from device profiling"
    },
    "account_login_first_seen": {
      "type": ["string", "null"],
      "description": "Account login first seen timestamp"
    },
    "account_telephone_first_seen": {
      "type": ["string", "null"],
      "description": "Account telephone first seen timestamp"
    },
    "true_ip_first_seen": {
      "type": ["string", "null"],
      "description": "True IP first seen timestamp"
    },
    "ssn_hash_first_seen": {
      "type": ["string", "null"],
      "description": "SSN hash first seen timestamp"
    },
    "account_email_attributes": {
      "type": ["string", "null"],
      "description": "Account email attributes"
    },
    "tps_ip_latitude": {
      "type": ["string", "null"],
      "description": "TPS IP latitude"
    },
    "tps_ip_longitude": {
      "type": ["string", "null"],
      "description": "TPS IP longitude"
    },
    "action": {
      "type": ["string", "null"],
      "description": "Recommended Action."
File diff suppressed because one or more lines are too long
BIN
xgboost_model_M1.joblib
Normal file
BIN
xgboost_model_M1.joblib
Normal file
Binary file not shown.
BIN
xgboost_model_M2.joblib
Normal file
BIN
xgboost_model_M2.joblib
Normal file
Binary file not shown.