Sync m-1-v-1 block with local updates

parent 1bf55226e1
commit d0f4d225ee

block.py (242 lines changed)
@@ -1,12 +1,12 @@
-import pandas as pd
 import logging
-import json
 import jmespath
-import regex as re
-from pre_processing import pre_processing
-from processing import processing
-from post_processing import post_processing
 import json_repair
+import pandas as pd
+import regex as re
+from pre_processing import pre_processing_all
+from processing import processing_all
+from post_processing import post_processing_all


 # Configure logging
@@ -16,7 +16,7 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)

-_JSON_LIKE = re.compile(r'^\s*\?*[\{\[].*[\}\]]\s*$', re.DOTALL)
+_JSON_LIKE = re.compile(r"^\s*\?*[\{\[].*[\}\]]\s*$", re.DOTALL)


 def extract_value(blob, expression):
@@ -25,14 +25,13 @@ def extract_value(blob, expression):
     except Exception:
         return None


 def coalesce(*args):
     for value in args:
         if value is not None:
             return value
     return None

-# New sanitize blob function


 def deep_repair(obj):
     # 1) If it's a string that *looks* like JSON (with or without one leading '?'),
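For orientation while reading the hunks below: coalesce() returns the first argument that is not None, which is what lets each list in the expressions dictionary act as an ordered fallback chain. A tiny illustration with made-up values:

assert coalesce(None, None, 42, 7) == 42   # first non-None value wins
assert coalesce(None, None) is None        # nothing matched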
@@ -40,18 +39,17 @@ def deep_repair(obj):
     if isinstance(obj, str):
         s = obj.strip()
         if _JSON_LIKE.match(s):
-            # strip one leading '?' if present
-            if s.startswith('?'):
+            if s.startswith("?"):
                 s = s[1:]
             parsed = json_repair.loads(s)
             return deep_repair(parsed)
         return obj

-    # 2) Dict → recurse on each value
+    # 2) Dict – recurse on each value
     if isinstance(obj, dict):
         return {k: deep_repair(v) for k, v in obj.items()}

-    # 3) List → recurse on each element
+    # 3) List – recurse on each element
     if isinstance(obj, list):
         return [deep_repair(v) for v in obj]

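A minimal sketch of what deep_repair does to one vendor payload, assuming json_repair.loads parses the embedded string the way the json-repair package normally does; the sample blob below is invented for illustration:

# A nested string that looks like JSON (here with a stray leading '?') is
# stripped, parsed with json_repair, and then repaired recursively.
raw = {"tps_vendor_raw_response": '?{"query": {"results": [{"EAScore": "991"}]}}'}
fixed = deep_repair(raw)
# fixed["tps_vendor_raw_response"]["query"]["results"][0]["EAScore"] -> "991"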
@@ -66,34 +64,21 @@ def sanitize_blob(blob):
         logger.error("Failed to sanitize blob: %s", e)
         return None

-# Expressions to extract values
+# Expressions to extract values (M1 + added M2 fields)
 expressions = {
+    # M1 (existing)
     "first_seen_days": [
-        # 1) any vendor under integration_hub_results → first_seen_days
         "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].first_seen_days)[0]",
-        # 2) the flat “dotted” key
-        "Blob.\"emailage.emailriskscore.first_seen_days\"",
-
-        # 3) fallback to the top level tps_vendor_raw_response path
+        'Blob."emailage.emailriskscore.first_seen_days"',
         "Blob.tps_vendor_raw_response.query.results[0].first_seen_days",
     ],
     "ea_score": [
-        # 1) any vendor under integration_hub_results
-        'Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].EAScore',
-        # 2) the flat “dotted” key
+        "Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].EAScore",
         'Blob."emailage.emailriskscore.eascore"',
-
-        # 3) fallback to the top level tps_vendor_raw_response
-        'Blob.tps_vendor_raw_response.query.results[0].EAScore',
+        "Blob.tps_vendor_raw_response.query.results[0].EAScore",
     ],
     "email_creation_days": [
-        # 1) any vendor under integration_hub_results → results[0].email_creation_days
-        "(Blob.integration_hub_results.*"
-        ".tps_vendor_raw_response.query.results[0].email_creation_days)[0]",
-
-        # 2) fallback to the top level tps_vendor_raw_response path
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].email_creation_days)[0]",
         "Blob.tps_vendor_raw_response.query.results[0].email_creation_days",
     ],
     "summary_risk_score": ["Blob.summary_risk_score"],
@@ -102,11 +87,7 @@ expressions = {
     "account_email_worst_score": ["Blob.account_email_worst_score"],
     "true_ip_score": ["Blob.true_ip_score"],
     "ip_net_speed_cell": [
-        # 1) any vendor under integration_hub_results → results[0].ip_netSpeedCell
-        "(Blob.integration_hub_results.*"
-        ".tps_vendor_raw_response.query.results[0].ip_netSpeedCell)[0]",
-
-        # 2) fallback to the top level tps_vendor_raw_response path
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ip_netSpeedCell)[0]",
         "Blob.tps_vendor_raw_response.query.results[0].ip_netSpeedCell",
     ],
     "account_email_score": ["Blob.account_email_score"],
@@ -115,17 +96,105 @@ expressions = {
     "proxy_ip_score": ["Blob.proxy_ip_score"],
     "fuzzy_device_score": ["Blob.fuzzy_device_score"],
     "ip_region_confidence": [
-        # 1) any vendor under integration_hub_results → results[0].ip_regionconf
-        "(Blob.integration_hub_results.*"
-        ".tps_vendor_raw_response.query.results[0].ip_regionconf)[0]",
-
-        # 2) fallback to the top level tps_vendor_raw_response path
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ip_regionconf)[0]",
         "Blob.tps_vendor_raw_response.query.results[0].ip_regionconf",
     ],
     "true_ip_state_confidence": ["Blob.true_ip_state_confidence"],
     "fuzzy_device_worst_score": ["Blob.fuzzy_device_worst_score"],
     "digital_id_confidence_rating": ["Blob.digital_id_confidence_rating"],
     "trueipgeo": ["TrueIpGeo", "Blob.true_ip_geo"],
+    # M2 additions
+    "policy_score": ["Blob.policy_score"],
+    "digital_id_trust_score": ["Blob.digital_id_trust_score"],
+    "proxy_score": ["Blob.proxy_score"],
+    "browser_spoof_score": ["Blob.browser_spoof_score"],
+    "input_ip_connection_type": ["Blob.input_ip_connection_type"],
+    "fuzzy_device_id_confidence": ["Blob.fuzzy_device_id_confidence"],
+    "fraudrisk": [
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].fraudRisk)[0]",
+        "Blob.tps_vendor_raw_response.query.results[0].fraudRisk",
+        'Blob."emailage.emailriskscore.fraudRisk"',
+    ],
+    "overalldigitalidentityscore": [
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].overallDigitalIdentityScore)[0]",
+        "Blob.tps_vendor_raw_response.query.results[0].overallDigitalIdentityScore",
+        'Blob."emailage.emailriskscore.overallDigitalIdentityScore"',
+    ],
+    "totalhits": [
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].totalhits)[0]",
+        "Blob.tps_vendor_raw_response.query.results[0].totalhits",
+        'Blob."emailage.emailriskscore.totalhits"',
+    ],
+    "uniquehits": [
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].uniquehits)[0]",
+        "Blob.tps_vendor_raw_response.query.results[0].uniquehits",
+        'Blob."emailage.emailriskscore.uniquehits"',
+    ],
+    "emailtofullnameconfidence": [
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].emailToFullNameConfidence)[0]",
+        "Blob.tps_vendor_raw_response.query.results[0].emailToFullNameConfidence",
+        'Blob."emailage.emailriskscore.emailToFullNameConfidence"',
+    ],
+    "emailtolastnameconfidence": [
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].emailToLastNameConfidence)[0]",
+        "Blob.tps_vendor_raw_response.query.results[0].emailToLastNameConfidence",
+        'Blob."emailage.emailriskscore.emailToLastNameConfidence"',
+    ],
+    "domain_creation_days": [
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].domain_creation_days)[0]",
+        "Blob.tps_vendor_raw_response.query.results[0].domain_creation_days",
+        'Blob."emailage.emailriskscore.domain_creation_days"',
+    ],
+    "iptophoneconfidence": [
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ipToPhoneConfidence)[0]",
+        "Blob.tps_vendor_raw_response.query.results[0].ipToPhoneConfidence",
+        'Blob."emailage.emailriskscore.ipToPhoneConfidence"',
+    ],
+    "di_autofill_count_login": [
+        "Blob.tmx_variables.di_autofill_count_login",
+        "Blob.policy_details_api.policy_detail_api.customer.rules.vars.variable.di_autofill_count_login",
+    ],
+    "accphone_gbl_velocity_hour": [
+        "Blob.tmx_variables.accphone_gbl_velocity_hour",
+        "Blob.tmx_variables._accphone_gbl_velocity_hour",
+    ],
+    # Lat/long fields for distance engineering
+    "ip_latitude": [
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ip_latitude)[0]",
+        "Blob.tps_vendor_raw_response.query.results[0].ip_latitude",
+    ],
+    "ip_longitude": [
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ip_longitude)[0]",
+        "Blob.tps_vendor_raw_response.query.results[0].ip_longitude",
+    ],
+    "tps_ip_latitude": ["Blob.tps_vendor_raw_response.query.results[0].ip_latitude"],
+    "tps_ip_longitude": ["Blob.tps_vendor_raw_response.query.results[0].ip_longitude"],
+    "true_ip_latitude": ["Blob.true_ip_latitude"],
+    "true_ip_longitude": ["Blob.true_ip_longitude"],
+    "proxy_ip_latitude": ["Blob.proxy_ip_latitude"],
+    "proxy_ip_longitude": ["Blob.proxy_ip_longitude"],
+    "dns_ip_latitude": ["Blob.dns_ip_latitude"],
+    "dns_ip_longitude": ["Blob.dns_ip_longitude"],
+    "input_ip_latitude": ["Blob.input_ip_latitude"],
+    "input_ip_longitude": ["Blob.input_ip_longitude"],
+    # First-seen timestamps for age deltas
+    "digital_id_first_seen": ["Blob.digital_id_first_seen"],
+    "account_email_first_seen": ["Blob.account_email_first_seen"],
+    "account_login_first_seen": ["Blob.account_login_first_seen"],
+    "account_telephone_first_seen": ["Blob.account_telephone_first_seen"],
+    "true_ip_first_seen": ["Blob.true_ip_first_seen"],
+    "ssn_hash_first_seen": ["Blob.ssn_hash_first_seen"],
+    "fuzzy_device_first_seen": ["Blob.fuzzy_device_first_seen"],
+    "national_id_first_seen": ["Blob.national_id_first_seen"],
+    "proxy_ip_first_seen": ["Blob.proxy_ip_first_seen"],
+    # Attribute arrays (used for one-hot style parsing)
+    "account_name_activities": ["Blob.account_name_activities"],
+    "account_email_attributes": ["Blob.account_email_attributes"],
+    "true_ip_attributes": ["Blob.true_ip_attributes"],
+    "true_ip_activities": ["Blob.true_ip_activities"],
+    "digital_id_attributes": ["Blob.digital_id_attributes"],
+    "account_telephone_attributes": ["Blob.account_telephone_attributes"],
+    "cpu_clock": ["Blob.cpu_clock"]
 }
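Each entry above is a list of JMESPath expressions tried in order against the sanitized blob. A hedged sketch of that lookup, using jmespath directly on an invented blob shape:

import jmespath

sample_blob = {"Blob": {"summary_risk_score": 55}}
value = None
for expr in ["Blob.summary_risk_score"]:       # first expression that matches wins
    value = jmespath.search(expr, sample_blob)
    if value is not None:
        break
# value == 55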
@@ -156,9 +225,9 @@ def __main__(
     TrueIpGeo: str,
     Blob: str,
     DeviceId: str,
-    FuzzyDeviceId: str
+    FuzzyDeviceId: str,
+    ReasonCode: str,
 ) -> dict:

     # Convert input parameters into a flat dictionary
     data = {
         "application_key": application_key,
@@ -184,49 +253,82 @@ def __main__(
         "TrueIpGeo": TrueIpGeo,
         "Blob": Blob,
         "DeviceId": DeviceId,
-        "FuzzyDeviceId": FuzzyDeviceId
+        "FuzzyDeviceId": FuzzyDeviceId,
+        "ReasonCode": ReasonCode,
     }

     # Convert dictionary to a single-row DataFrame
     combined_df = pd.DataFrame([data])
     combined_df.columns = combined_df.columns.str.lower()

+    # Uncomment Below For Testing using Uprova Batch Data
+    # combined_df["educationlevel"] = None
+    # combined_df["monthsatresidence"] = None
+    # combined_df["ownhome"] = False
+    # combined_df['lengthatbank'] = 0
+
     combined_df["application_email_address"] = combined_df["application_email_address"].str.lower()
     if Blob:
         combined_df["blob"] = combined_df["blob"].apply(sanitize_blob)

         # Step 2: Extract values using the expressions dictionary
         for column, expressions_list in expressions.items():
-            combined_df[column] = combined_df["blob"].apply(lambda x: coalesce(
-                *[extract_value(x, expr) for expr in expressions_list]))
+            def _extract_with_fallback(blob_obj):
+                values = []
+                for expr in expressions_list:
+                    val = extract_value(blob_obj, expr)
+                    if val is None and isinstance(expr, str) and expr.startswith("Blob."):
+                        val = extract_value(blob_obj, expr[len("Blob.") :])
+                    values.append(val)
+                return coalesce(*values)

-        logger.info("pre_flowx data")
-        logger.info(combined_df.iloc[0].drop('blob').to_dict())
+            extracted = combined_df["blob"].apply(_extract_with_fallback)
+            if column in combined_df.columns:
+                combined_df[column] = extracted.where(extracted.notnull(), combined_df[column])
+            else:
+                combined_df[column] = extracted
+
+        # logger.info("pre_flowx data")
+        # logger.info(combined_df.iloc[0].drop("blob").to_dict())
     else:
-        for column, expressions_list in expressions.items():
+        for column in expressions:
             combined_df[column] = None
-    logger.info("pre_flowx data")
-    logger.info(combined_df.iloc[0].to_dict())
-    pre_processed_data = pre_processing(combined_df)
-    # logger.info(f"pre_processed_data: {pre_processed_data}")
-    logger.info("pre_processed data")
-    logger.info(pre_processed_data.iloc[0].to_dict())
-    df = processing(pre_processed_data)
-    logger.info("processed_data")
-    logger.info(df.iloc[0].to_dict())
-    df["application_timestamp"] = df["application_timestamp"].astype(str)
-    # logger.info("prediction: %.8f", float(df['prediction'].iloc[0]))
-    result = post_processing(df)
-    logger.info("post_processed_data")
-    logger.info(result)
+    # logger.info("pre_flowx data")
+    # logger.info(combined_df.iloc[0].to_dict())
+    df_m1, df_m2, df_thx = pre_processing_all(combined_df)
+    # logger.info("pre_processed data m1")
+    # logger.info(df_m1.iloc[0].to_dict())
+    # logger.info("pre_processed data m2")
+    # logger.info(df_m2.iloc[0].to_dict())
+
+    processed_m1, processed_m2, df_thx = processing_all(df_m1, df_m2, df_thx)
+    # logger.info("processed_data m1")
+    # logger.info(processed_m1.iloc[0].to_dict())
+    # logger.info("processed_data m2")
+    # logger.info(processed_m2.iloc[0].to_dict())
+
+    result = post_processing_all(processed_m1, processed_m2, df_thx)
     # State Check
     state_value = combined_df["state"].iloc[0]
     zip_value = combined_df["zip"].iloc[0]
-    if (pd.notnull(state_value) and state_value == "ZZ") or (pd.notnull(zip_value) and zip_value == "86445"):
+    if (pd.notnull(state_value) and state_value == "ZZ") or (
+        pd.notnull(zip_value) and zip_value == "86445"
+    ):
         result["hd_score_m1"] = 1250
-        logger.info("post_processed_data after state check")
-        logger.info(result)
+        result["hd_score_m2"] = 1250
+        result["hd_score_iso_m2"] = 1250
+        # logger.info("post_processed_data after state check")
+        # logger.info(result)
+
+    # Normalize numeric scores to built-in float so JSON encoding (e.g. Temporal)
+    # does not fail on NumPy scalar types like np.float32/np.float64.
+    for key in ("hd_score_m1", "hd_score_m2", "hd_score_iso_m2"):
+        if key in result and result[key] is not None:
+            try:
+                result[key] = float(result[key])
+            except (TypeError, ValueError):
+                logger.warning("Failed to cast %s=%r to float", key, result[key])
+
+    print(result)

     return result

-# testing :
-# __main__
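A short sketch of why the float() normalization added at the end of __main__ matters before the result dictionary is JSON-encoded; the score value is illustrative, and np.float32 is assumed to be what the model hands back:

import json
import numpy as np

result = {"hd_score_m1": np.float32(742.0)}
# json.dumps(result) would raise: Object of type float32 is not JSON serializable
result = {k: float(v) for k, v in result.items()}
print(json.dumps(result))  # {"hd_score_m1": 742.0}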
category_orders_train_M1.json (new file, 88 lines)
@@ -0,0 +1,88 @@
{
    "employmentstatus": [
        "disability",
        "fixed income",
        "full time employed",
        "other",
        "part time employment",
        "retired benefits",
        "self employed",
        "student",
        "unemployed",
        "welfare"
    ],
    "TrueIpGeo": [
        "other",
        "us"
    ],
    "digital_id_trust_score_rating": [
        "high",
        "low",
        "neutral",
        "very_high",
        "very_low"
    ],
    "educationlevel": [
        "associate's degree",
        "bachelor's degree",
        "doctorate",
        "high school",
        "master's degree",
        "other"
    ],
    "os_version": [
        "10", "11", "12", "13", "14", "15", "16", "17", "18", "8", "9", "unknown"
    ],
    "ip_net_speed_cell": [
        "broadband", "cable", "dialup", "dsl", "fixed wireless", "mobile", "mobile wireless",
        "ocx", "satellite", "t1", "tx", "wireless", "xdsl"
    ],
    "day_night": [
        "Day",
        "Night"
    ],
    "digital_id_confidence_rating": [
        "high",
        "medium",
        "very_high",
        "very_low"
    ],
    "RiskRating": [
        "high",
        "low",
        "medium",
        "neutral",
        "trusted"
    ],
    "payfrequency": [
        "biweekly",
        "semimonthly"
    ],
    "ownhome": [
        "false",
        "true"
    ]
}
category_orders_train_M2.json (new file, 303 lines)
@@ -0,0 +1,303 @@
{
    "riskrating": [
        "high",
        "low",
        "medium",
        "neutral",
        "trusted"
    ],
    "input_ip_connection_type": [
        "cable",
        "consumer satellite",
        "dialup",
        "dsl",
        "fixed wireless",
        "framerelay",
        "isdn",
        "mobile wireless",
        "ocx",
        "tx"
    ],
    "fraudrisk": [
        "001 very low", "003 very low", "005 very low", "006 very low", "008 very low", "009 very low",
        "010 very low", "011 very low", "012 very low", "014 very low", "015 very low", "016 very low",
        "017 very low", "018 very low", "020 very low", "021 very low", "022 very low", "023 very low",
        "024 very low", "025 very low", "026 very low", "027 very low", "028 very low", "029 very low",
        "030 very low", "031 very low", "032 very low", "033 very low", "034 very low", "035 very low",
        "036 very low", "037 very low", "038 very low", "039 very low", "040 very low", "041 very low",
        "042 very low", "043 very low", "044 very low", "045 very low", "046 very low", "047 very low",
        "048 very low", "049 very low", "050 very low", "051 very low", "052 very low", "053 very low",
        "054 very low", "055 very low", "056 very low", "057 very low", "058 very low", "059 very low",
        "060 very low", "061 very low", "062 very low", "063 very low", "064 very low", "065 very low",
        "066 very low", "067 very low", "068 very low", "069 very low", "070 very low", "071 very low",
        "072 very low", "073 very low", "074 very low", "075 very low", "076 very low", "077 very low",
        "078 very low", "079 very low", "080 very low", "081 very low", "082 very low", "083 very low",
        "084 very low", "085 very low", "086 very low", "087 very low", "088 very low", "089 very low",
        "090 very low", "091 very low", "092 very low", "093 very low", "094 very low", "095 very low",
        "096 very low", "097 very low", "098 very low", "099 very low", "100 very low",
        "101 low", "102 low", "103 low", "104 low", "105 low", "106 low",
        "107 low", "108 low", "109 low", "110 low", "111 low", "112 low",
        "113 low", "114 low", "115 low", "116 low", "117 low", "118 low",
        "119 low", "120 low", "121 low", "122 low", "123 low", "124 low",
        "125 low", "126 low", "127 low", "128 low", "129 low", "130 low",
        "131 low", "132 low", "133 low", "134 low", "135 low", "136 low",
        "137 low", "138 low", "139 low", "140 low", "141 low", "142 low",
        "143 low", "144 low", "145 low", "146 low", "147 low", "148 low",
        "149 low", "153 low", "154 low", "156 low", "157 low", "158 low",
        "159 low", "160 low", "161 low", "162 low", "163 low", "164 low",
        "165 low", "166 low", "167 low", "168 low", "169 low", "170 low",
        "171 low", "172 low", "173 low", "174 low", "175 low", "177 low",
        "178 low", "179 low", "180 low", "181 low", "182 low", "183 low",
        "184 low", "185 low", "186 low", "187 low", "188 low", "189 low",
        "190 low", "191 low", "192 low", "193 low", "194 low", "195 low",
        "196 low", "197 low", "198 low", "199 low", "200 low", "201 low",
        "202 low", "203 low", "204 low", "205 low", "206 low", "207 low",
        "208 low", "209 low", "210 low", "211 low", "212 low", "213 low",
        "214 low", "215 low", "216 low", "217 low", "218 low", "219 low",
        "220 low", "221 low", "222 low", "224 low", "225 low", "226 low",
        "227 low", "228 low", "229 low", "230 low", "231 low", "232 low",
        "233 low", "234 low", "235 low", "236 low", "237 low", "238 low",
        "239 low", "240 low", "241 low", "242 low", "243 low", "244 low",
        "245 low", "246 low", "247 low", "248 low", "250 low", "252 low",
        "254 low", "259 low", "267 low", "268 low", "271 low", "272 low",
        "274 low", "275 low", "278 low", "282 low", "287 low", "288 low",
        "289 low", "290 low", "291 low", "293 low", "296 low", "297 low",
        "464 moderate", "467 moderate", "485 moderate", "491 moderate", "492 moderate", "496 moderate",
        "702 review", "703 review", "705 review", "706 review", "707 review", "708 review",
        "710 review", "730 review", "790 review",
        "801 high", "890 high",
        "902 very high", "906 very high"
    ],
    "day_night": [
        "day",
        "night"
    ]
}
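These category_orders_train_*.json files enumerate the category levels each model was trained on. A hedged sketch of how such a list is typically consumed during pre-processing; the loading path and the Categorical wiring here are illustrative, not taken from this commit:

import json
import pandas as pd

with open("category_orders_train_M1.json") as f:
    category_orders = json.load(f)

df = pd.DataFrame({"day_night": ["Day", "Night", "Dawn"]})
# Values outside the trained level list drop to NaN, mirroring _handle_unknowns.
df["day_night"] = pd.Categorical(df["day_night"], categories=category_orders["day_night"])
print(df["day_night"].tolist())  # ['Day', 'Night', nan]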
isotonic_model_M2.joblib (new binary file; binary file not shown)
latitute_longitute_reference.csv (new file, 28079 lines; file diff suppressed because it is too large)
post_processing.py
@@ -1,5 +1,10 @@
 import logging
+from typing import Dict
+
 import numpy as np
+import pandas as pd
+
+from pre_processing import THX_FIELDS

 # Configure logging
 logging.basicConfig(
@@ -9,17 +14,85 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)


-def post_processing(df):
+def post_processing_m1(df: pd.DataFrame) -> pd.DataFrame:
+    df = df.copy()
     try:
-        df['hd_score_m1'] = np.round(
-            np.minimum(df['prediction'] * 100 + 0.00001, 1) * 85 +
-            np.maximum(np.log2(df['prediction'] * 100 + 0.000001) * 185, 0),
-            0
+        df["hd_score_m1"] = np.round(
+            np.minimum(df["prediction"] * 100 + 0.00001, 1) * 85
+            + np.maximum(np.log2(df["prediction"] * 100 + 0.000001) * 185, 0),
+            0,
         )
-        logging.info(f"hd_score_m1 calculated: {df['hd_score_m1'].iloc[0]}")
+        logging.info("hd_score_m1 calculated: %s", df["hd_score_m1"].iloc[0])
     except Exception as e:
-        logging.error(f"Error processing hd_score_m1 calculations: {e}")
+        logging.error("Error processing hd_score_m1 calculations: %s", e)
+    return df


-    return df[['application_key', 'application_timestamp', 'deviceid', 'fuzzydeviceid', 'application_email_address', 'hd_score_m1']].iloc[0].to_dict()
+def post_processing_m2(df: pd.DataFrame) -> pd.DataFrame:
+    df = df.copy()
+    try:
+        df["hd_score_m2"] = np.round(
+            np.minimum(df["pd_m2"] * 100.0 + 0.00001, 1.0) * 75.0
+            + np.maximum(np.log2(df["pd_m2"] * 100.0 + 0.000001) * 180.0, 0.0),
+            0,
+        )
+        df["hd_score_iso_m2"] = np.round(
+            np.minimum(df["pd_m2_iso"] * 100.0 + 0.00001, 1.0) * 97.0
+            + np.maximum(np.log2(df["pd_m2_iso"] * 100.0 + 0.000001) * 246.0, 0.0),
+            0,
+        )
+        logging.info("hd_score_m2 calculated: %s", df["hd_score_m2"].iloc[0])
+        logging.info("hd_score_iso_m2 calculated: %s", df["hd_score_iso_m2"].iloc[0])
+    except Exception as e:
+        logging.error("Error processing hd_score_m2 calculations: %s", e)
+    return df
+
+
+def _safe_get(df: pd.DataFrame, column: str):
+    """Return scalar from single-row DataFrame, normalizing NaN/None to None."""
+    if column not in df.columns:
+        return None
+    val = df[column].iloc[0]
+    if isinstance(val, (list, dict)):
+        return val
+    try:
+        if pd.isna(val):
+            return None
+    except TypeError:
+        pass
+    return val
+
+
+def post_processing_all(df_m1: pd.DataFrame, df_m2: pd.DataFrame, df_thx: pd.DataFrame) -> Dict[str, object]:
+    df_m1_scored = post_processing_m1(df_m1)
+    df_m2_scored = post_processing_m2(df_m2)
+    row_m1 = df_m1_scored.iloc[0]
+    row_m2 = df_m2_scored.iloc[0]
+    result = {
+        "application_key": row_m1.get("application_key"),
+        "application_timestamp": str(row_m1.get("application_timestamp")) if row_m1.get("application_timestamp") is not None else None,
+        "deviceid": row_m1.get("deviceid"),
+        "fuzzydeviceid": row_m1.get("fuzzydeviceid"),
+        "application_email_address": row_m1.get("application_email_address"),
+        "hd_score_m1": row_m1.get("hd_score_m1"),
+        "hd_score_m2": row_m2.get("hd_score_m2"),
+        "hd_score_iso_m2": row_m2.get("hd_score_iso_m2"),
+        "action": None,
+    }
+    flattened_thx = {field: _safe_get(df_thx, field) for field in THX_FIELDS if field not in result}
+    result.update(flattened_thx)
+    return result
+
+
+# Legacy entry point for backward compatibility
+def post_processing(df: pd.DataFrame) -> Dict[str, object]:
+    df_scored = post_processing_m1(df)
+    row = df_scored.iloc[0]
+    return {
+        "application_key": row.get("application_key"),
+        "application_timestamp": str(row.get("application_timestamp")) if row.get("application_timestamp") is not None else None,
+        "deviceid": row.get("deviceid"),
+        "fuzzydeviceid": row.get("fuzzydeviceid"),
+        "application_email_address": row.get("application_email_address"),
+        "hd_score_m1": row.get("hd_score_m1"),
+    }
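To make the score formulas above concrete, here is the hd_score_m1 expression evaluated for one illustrative prediction value (0.02):

import numpy as np

prediction = 0.02  # illustrative model output, not from the commit
score = np.round(
    np.minimum(prediction * 100 + 0.00001, 1) * 85
    + np.maximum(np.log2(prediction * 100 + 0.000001) * 185, 0),
    0,
)
print(score)  # 270.0: the capped linear term contributes 85, the log2 term about 185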
pre_processing.py
@@ -1,6 +1,11 @@
-import pandas as pd
-import numpy as np
 import logging
+import math
+import re
+from pathlib import Path
+from typing import Dict, Iterable, List, Tuple, Union
+
+import numpy as np
+import pandas as pd

 # Configure logging
 logging.basicConfig(
@@ -9,67 +14,230 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)

+BASE_DIR = Path(__file__).resolve().parent
+M2_LATLONG_REF_PATH = BASE_DIR / "latitute_longitute_reference.csv"
+THX_FIELDS = [
+    "application_key",
+    "application_timestamp",
+    "digital_id_first_seen",
+    "summary_risk_score",
+    "cpu_clock",
+    "account_login_first_seen",
+    "account_telephone_first_seen",
+    "true_ip_first_seen",
+    "ssn_hash_first_seen",
+    "account_email_attributes",
+    "tps_ip_latitude",
+    "tps_ip_longitude",
+]
+
-def pre_processing(data_df):
+# Hardcoded M2 data dictionary (replaces file lookup)
+M2_DATA_DICTIONARY: Dict[str, Dict[str, Union[float, str, None]]] = {
+    "account_email_attributes_challenge_passed": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "account_email_attributes_challenged": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "account_email_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "account_login_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "account_name_activities_trusted_prob": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "account_telephone_attributes_challenge_failed": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "account_telephone_attributes_loan_app": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "account_telephone_attributes_trusted": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "account_telephone_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "accphone_gbl_velocity_hour": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "applicant_age": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "browser_spoof_score": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "day": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "day_cos": {"data_type": "float", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "day_sin": {"data_type": "float", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "di_autofill_count_login": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "digital_id_attributes_challenged": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "digital_id_attributes_trusted": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "digital_id_day_diff": {"data_type": "int", "valid_min": -999999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "digital_id_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "digital_id_month_diff": {"data_type": "int", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "digital_id_trust_score": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "digitalidconfidence": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "dist_dnsip_ref_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "dist_em_ip_ref_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "dist_inputip_ref_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "dist_proxyip_ref_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "dist_trueip_dnsip_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "dist_trueip_em_ip_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "dist_trueip_ref_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "domain_creation_days": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "emailtofullnameconfidence": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "emailtolastnameconfidence": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "first_seen_days": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "fraudrisk": {"data_type": "string", "valid_min": None, "valid_max": None, "observed_cap_min": None, "observed_cap_max": None},
+    "fuzzy_device_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "fuzzy_device_id_confidence": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "hour": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "input_ip_connection_type": {"data_type": "string", "valid_min": None, "valid_max": None, "observed_cap_min": None, "observed_cap_max": None},
+    "iptophoneconfidence": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "national_id_first_seen_age": {"data_type": "int", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "overalldigitalidentityscore": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "policy_score": {"data_type": "float", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "proxy_ip_first_seen_age": {"data_type": "int", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "proxy_ip_worst_score": {"data_type": "float", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "proxy_score": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "reasoncode_new_smartid_lt_1wk_global": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "riskrating": {"data_type": "string", "valid_min": None, "valid_max": None, "observed_cap_min": None, "observed_cap_max": None},
+    "ssn_hash_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "totalhits": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "true_ip_activities_trusted_prob": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "true_ip_attributes_trusted": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "true_ip_attributes_trusted_conf": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "true_ip_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "true_ip_worst_score": {"data_type": "float", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "uniquehits": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+}

-    # combined_df = pd.DataFrame([input_data])
-    # data = pd.DataFrame(data)
-    combined_df = data_df
-    combined_df["applicant_age"] = combined_df.apply(lambda row: pd.to_datetime(row["application_timestamp"]).year - pd.to_datetime(row["application_date_of_birth"]).year if pd.notnull(row["application_timestamp"]) and pd.notnull(row["application_date_of_birth"]) else None,axis=1
+# Hardcoded one-hot config (parsed_feature, model_var, contains)
+M2_ONEHOT_CONFIG: List[Tuple[str, str, str]] = [
+    ("reasoncode", "reasoncode_new_smartid_lt_1wk_global", "new_smartid_lt_1wk_global"),
+    ("account_name_activities", "account_name_activities_trusted_prob", "trusted_prob"),
+    ("account_email_attributes", "account_email_attributes_challenged", "challenged"),
+    ("account_email_attributes", "account_email_attributes_challenge_passed", "challenge_passed"),
+    ("true_ip_attributes", "true_ip_attributes_trusted", "trusted"),
+    ("true_ip_attributes", "true_ip_attributes_trusted_conf", "trusted_conf"),
+    ("digital_id_attributes", "digital_id_attributes_challenged", "challenged"),
+    ("digital_id_attributes", "digital_id_attributes_trusted", "trusted"),
+    ("account_telephone_attributes", "account_telephone_attributes_challenge_failed", "challenge_failed"),
+    ("account_telephone_attributes", "account_telephone_attributes_loan_app", "loan_app"),
+    ("account_telephone_attributes", "account_telephone_attributes_trusted", "trusted"),
+    ("true_ip_activities", "true_ip_activities_trusted_prob", "trusted_prob"),
+]
+
+
+# ----------------------------
+# Helpers
+# ----------------------------
+def _handle_unknowns(X: pd.DataFrame, column: str, known_values: Iterable[str], default_treatment=None):
+    if column not in X.columns:
+        return X
+    known_values = {str(val).lower() for val in known_values}
+    invalid_values = {None, "none", "nan", pd.NA}
+    X[column] = X[column].apply(
+        lambda x: str(x).lower()
+        if pd.notna(x) and str(x).lower() in known_values
+        else (default_treatment if pd.notna(x) and str(x).lower() not in invalid_values else np.nan)
+    )
+    return X
+
+
+def _haversine_km(lat1, lon1, lat2, lon2):
+    if None in (lat1, lon1, lat2, lon2):
+        return None
+    try:
+        rlat1 = float(lat1) * math.pi / 180.0
+        rlat2 = float(lat2) * math.pi / 180.0
+        dlat = (float(lat2) - float(lat1)) * math.pi / 180.0
+        dlon = (float(lon2) - float(lon1)) * math.pi / 180.0
+    except Exception:
+        return None
+
+    a = (
+        math.sin(dlat / 2.0) ** 2
+        + math.cos(rlat1) * math.cos(rlat2) * math.sin(dlon / 2.0) ** 2
+    )
+    a = min(1.0, max(0.0, a))
+    return 2 * 6371.0088 * math.asin(math.sqrt(a))
+
+
+def _prep_latlong_ref():
+    if not M2_LATLONG_REF_PATH.exists():
+        logger.warning("latitute_longitute_reference.csv missing at %s", M2_LATLONG_REF_PATH)
+        return pd.DataFrame()
+    try:
+        ref = pd.read_csv(M2_LATLONG_REF_PATH, usecols=["postal_code_ref", "latitute_ref", "longitude_ref"])
+    except Exception:
+        ref = pd.read_csv(M2_LATLONG_REF_PATH)
+    # keep lower string version for matching
+    if "postal_code_ref" in ref.columns:
+        ref["postal_code_ref"] = ref["postal_code_ref"].astype(str).str.lower()
+    return ref
+
+
+def _normalize_zip_for_ref(zip_val):
+    """
+    Normalize zip/postal code values so they match reference CSV keys.
+
+    - Floats like 89503.0 -> "89503"
+    - Int-like strings "89503.0" -> "89503"
+    Note: we intentionally avoid zero-filling to preserve behaviour seen in UAT references
+    where leading-zero ZIPs are not matched to the reference table.
+    """
+    if pd.isna(zip_val):
+        return None
+    if isinstance(zip_val, (int, float)) and not isinstance(zip_val, bool):
+        return str(int(zip_val)).lower()
+    zip_str = str(zip_val).strip()
+    if zip_str.replace(".", "", 1).isdigit():
+        try:
+            return str(int(float(zip_str))).lower()
+        except Exception:
+            pass
+    return zip_str.lower() if zip_str else None
+
+
+# ----------------------------
+# M1 Pre-processing (existing behaviour)
+# ----------------------------
+def pre_processing_m1(data_df: pd.DataFrame) -> pd.DataFrame:
+    combined_df = data_df.copy()
+    combined_df["applicant_age"] = combined_df.apply(
+        lambda row: pd.to_datetime(row["application_timestamp"]).year - pd.to_datetime(row["application_date_of_birth"]).year
+        if pd.notnull(row["application_timestamp"]) and pd.notnull(row["application_date_of_birth"])
+        else None,
+        axis=1,
     )

-    # Extracting Temporal features
-    combined_df['application_timestamp'] = pd.to_datetime(combined_df["application_timestamp"])
-    combined_df.loc[:, 'application_time'] = pd.to_datetime(combined_df['application_timestamp']).dt.time
+    combined_df["application_timestamp"] = pd.to_datetime(combined_df["application_timestamp"])
+    combined_df.loc[:, "application_time"] = pd.to_datetime(combined_df["application_timestamp"]).dt.time

-    combined_df['day'] = combined_df['application_timestamp'].dt.day
-    combined_df['day_of_week'] = combined_df['application_timestamp'].dt.weekday # 0=Monday, 6=Sunday
+    combined_df["day"] = combined_df["application_timestamp"].dt.day
+    combined_df["day_of_week"] = combined_df["application_timestamp"].dt.weekday

-    combined_df['day_sin'] = np.sin(2 * np.pi * combined_df['day'] / 31)
-    combined_df['day_cos'] = np.cos(2 * np.pi * combined_df['day'] / 31)
-    combined_df['day_of_week_sin'] = np.sin(2 * np.pi * combined_df['day_of_week'] / 7)
-    combined_df['day_of_week_cos'] = np.cos(2 * np.pi * combined_df['day_of_week'] / 7)
+    combined_df["day_sin"] = np.sin(2 * np.pi * combined_df["day"] / 31)
+    combined_df["day_cos"] = np.cos(2 * np.pi * combined_df["day"] / 31)
+    combined_df["day_of_week_sin"] = np.sin(2 * np.pi * combined_df["day_of_week"] / 7)
+    combined_df["day_of_week_cos"] = np.cos(2 * np.pi * combined_df["day_of_week"] / 7)

-    # combined_df['is_weekend'] = combined_df['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)

-    # Create a day/night variable
     def classify_day_night(hour):
         if 6 <= hour < 18:
-            return 'Day'
-        else:
-            return 'Night'
+            return "Day"
+        return "Night"

-    # Extract hour from application_time
-    combined_df['hour'] = combined_df['application_time'].apply(lambda x: x.hour if pd.notnull(x) else np.nan)
-    combined_df['day_night'] = combined_df['hour'].apply(lambda hour: classify_day_night(hour) if pd.notnull(hour) else 'Unknown')
+    combined_df["hour"] = combined_df["application_time"].apply(lambda x: x.hour if pd.notnull(x) else np.nan)
+    combined_df["day_night"] = combined_df["hour"].apply(lambda hour: classify_day_night(hour) if pd.notnull(hour) else "Unknown")

-    # combined_df['os_version'] = combined_df['os_version'].str.replace(r'[^a-zA-Z0-9]', '_', regex=True)
-    combined_df['os_version'] = combined_df['os_version'].apply(lambda x: x.split('.')[0] if isinstance(x, str) and '.' in x
-                                                                else x.split('_')[0] if isinstance(x, str) and '_' in x
-                                                                else x)
+    combined_df["os_version"] = combined_df["os_version"].apply(
+        lambda x: x.split(".")[0] if isinstance(x, str) and "." in x else x.split("_")[0] if isinstance(x, str) and "_" in x else x
+    )

-    # Datatype conversions
-    # combined_df['Level_1_Link_Accept'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Level_1_Link_Accept', na=False, regex=True).astype(int)
-    combined_df['Identity_Negative_History'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Identity_Negative_History', na=False, regex=True).astype(int)
-    combined_df['Device_Negative_History'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Device_Negative_History', na=False, regex=True).astype(int)
-    combined_df['Level_1_Link_Reject'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Level_1_Link_Reject', na=False, regex=True).astype(int)
-    combined_df['IP_Negative_History'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('IP_Negative_History', na=False, regex=True).astype(int)
-    combined_df['Identity_Spoofing'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Identity_Spoofing', na=False, regex=True).astype(int)
-    # combined_df['Bot'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Bot', na=False, regex=True).astype(int)
+    combined_df["Identity_Negative_History"] = combined_df["tmxsummaryreasoncode"].astype(str).str.contains(
+        "Identity_Negative_History", na=False, regex=True
+    ).astype(int)
+    combined_df["Device_Negative_History"] = combined_df["tmxsummaryreasoncode"].astype(str).str.contains(
+        "Device_Negative_History", na=False, regex=True
+    ).astype(int)
+    combined_df["Level_1_Link_Reject"] = combined_df["tmxsummaryreasoncode"].astype(str).str.contains(
+        "Level_1_Link_Reject", na=False, regex=True
+    ).astype(int)
+    combined_df["IP_Negative_History"] = combined_df["tmxsummaryreasoncode"].astype(str).str.contains(
+        "IP_Negative_History", na=False, regex=True
+    ).astype(int)
+    combined_df["Identity_Spoofing"] = combined_df["tmxsummaryreasoncode"].astype(str).str.contains(
+        "Identity_Spoofing", na=False, regex=True
+    ).astype(int)

-    combined_df['digitalidconfidence'] = pd.to_numeric(combined_df['digitalidconfidence'], errors='coerce').astype('Int64')
-    # Rename Columns if Required
-    combined_df.rename(columns={
-        'DigitalIdConfidence': 'digitalidconfidence',
-        # 'inputipaddress_consistency': 'inputip_consistency',
-        # 'requestid_consistency': 'request_consistency',
-        # Add others as required if present in your DataFrame and needing renaming.
-    }, inplace=True)
-    # #Testing : remove below
-    # combined_df.to_csv('op-pre-processing_intermediate.csv', index=False)
+    combined_df["digitalidconfidence"] = pd.to_numeric(combined_df["digitalidconfidence"], errors="coerce").astype("Int64")
+    combined_df.rename(
+        columns={
+            "DigitalIdConfidence": "digitalidconfidence",
+        },
+        inplace=True,
+    )

     dtype_dict = {
         "applicant_age": int,
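A quick sketch of how the new _haversine_km and _normalize_zip_for_ref helpers combine for the distance features; the coordinates and ZIP codes below are invented:

# Great-circle distance in km between two lat/long pairs.
km = _haversine_km(36.17, -115.14, 34.05, -118.24)  # roughly 367 km

# ZIPs arriving as floats or "89503.0"-style strings collapse to "89503"
# before the join against postal_code_ref in the reference CSV.
assert _normalize_zip_for_ref(89503.0) == "89503"
assert _normalize_zip_for_ref("89503.0") == "89503"
assert _normalize_zip_for_ref(None) is None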
@@ -84,7 +252,7 @@ def pre_processing(data_df):
         "day_cos": float,
         "summary_risk_score": float,
         "digital_id_trust_score_rating": str,
-        "day" : 'int32',
+        "day": "int32",
         "lengthatbank": float,
         "day_of_week_cos": float,
         "Level_1_Link_Reject": int,
@@ -95,7 +263,7 @@ def pre_processing(data_df):
         "true_ip_score": float,
         "ip_net_speed_cell": str,
         "account_email_score": float,
-        "day_of_week" : 'int32',
+        "day_of_week": "int32",
         "true_ip_worst_score": float,
         "proxy_ip_worst_score": float,
         "day_night": str,
@@ -108,28 +276,26 @@ def pre_processing(data_df):
         "true_ip_state_confidence": float,
         "IP_Negative_History": int,
         "fuzzy_device_worst_score": float,
-        "digital_id_confidence_rating" : str,
         "day_of_week_sin": float,
         "riskrating": str,
         "payfrequency": str,
         "ownhome": str,
-        "Identity_Spoofing" : int
+        "Identity_Spoofing": int,
     }

-    next_block_cols = ['application_key', 'application_timestamp', 'deviceid', 'fuzzydeviceid', 'application_email_address']
+    next_block_cols = ["application_key", "application_timestamp", "deviceid", "fuzzydeviceid", "application_email_address"]
     cols_to_keep = [col for col in dtype_dict.keys() if col in combined_df.columns]

     final_cols = list(set(next_block_cols).union(set(cols_to_keep)))
-    # Type casting
     for col, dtype in dtype_dict.items():
         if col in combined_df.columns:
             if dtype == int:
-                combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce', downcast='integer')
+                combined_df[col] = pd.to_numeric(combined_df[col], errors="coerce", downcast="integer")
             elif dtype == float:
-                combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce', downcast='float')
+                combined_df[col] = pd.to_numeric(combined_df[col], errors="coerce", downcast="float")
             elif dtype == str:
                 combined_df[col] = combined_df[col].astype(str)
-    # cross check data type
+
     capping_dict = {
         "applicant_age": (18, 93),
         "digitalidconfidence": (0, 9017),
@@ -157,98 +323,254 @@ def pre_processing(data_df):
    "fuzzy_device_score": (-29, 14),
    "day_sin": (-0.9987165072, 0.9987165072),
    "ip_region_confidence": (75, 99),
    # "true_ip_state_confidence": (5, 98),
    "IP_Negative_History": (0, 1),
    "fuzzy_device_worst_score": (-100, 0),
    "day_of_week_sin": (-0.9749279122, 0.9749279122),
    "Identity_Spoofing": (0, 1),
}

# Apply capping
for column, (cap_min, cap_max) in capping_dict.items():
    if column in combined_df.columns:
        combined_df[column] = combined_df[column].clip(lower=cap_min, upper=cap_max)
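clip() simply saturates out-of-range values at the configured bounds and leaves missing values untouched; a quick standalone check with made-up numbers:

    import pandas as pd

    toy = pd.DataFrame({"applicant_age": [15.0, 42.0, 120.0, None]})
    toy["applicant_age"] = toy["applicant_age"].clip(lower=18, upper=93)
    # -> 18.0, 42.0, 93.0, NaN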
def _handle_unknowns(X, column, known_values, default_treatment=None):
    if column not in X.columns:
        return X  # Return X to avoid NoneType error
    known_values = {str(val).lower() for val in known_values}
    invalid_values = {None, "none", "nan", pd.NA}
    X[column] = X[column].apply(
        lambda x: str(x).lower() if pd.notna(x) and str(x).lower() in known_values
        else (default_treatment if pd.notna(x) and str(x).lower() not in invalid_values else np.nan)
    )
    return X  # Always return the DataFrame
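A toy illustration of the rule implemented above (hypothetical values):

    toy = pd.DataFrame({"riskrating": ["LOW", "weird-value", None]})
    toy = _handle_unknowns(toy, "riskrating", ["low", "medium", "neutral", "trusted"], default_treatment="high")
    # -> "low" (known value, lower-cased), "high" (unexpected but non-null maps to the default), NaN (null-like stays missing)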
unknown_treatments = {
    "employmentstatus": {
        "valid_values": [
            "disability",
            "fixed income",
            "full time employed",
            "part time employment",
            "retired benefits",
            "self employed",
            "student",
            "unemployed",
            "welfare",
        ],
        "default_treatment": "other",
    },
    "trueipgeo": {"valid_values": ["US"], "default_treatment": "other"},
    "digital_id_trust_score_rating": {"valid_values": ["very_high", "high", "neutral", "low"], "default_treatment": "very_low"},
    "educationlevel": {
        "valid_values": ["associate's degree", "bachelor's degree", "doctorate", "high school", "master's degree"],
        "default_treatment": "other",
    },
    "os_version": {
        "valid_values": ["18", "17", "16", "15", "14", "13", "12", "11", "10", "9", "8"],
        "default_treatment": "unknown",
    },
    "ip_net_speed_cell": {
        "valid_values": [
            "broadband",
            "cable",
            "dialup",
            "dsl",
            "fixed wireless",
            "mobile",
            "mobile wireless",
            "ocx",
            "satellite",
            "t1",
            "tx",
            "wireless",
            "xdsl",
        ],
        "default_treatment": "mobile",
    },
    "digital_id_confidence_rating": {"valid_values": ["high", "medium", "very_high"], "default_treatment": "very_low"},
    "riskrating": {"valid_values": ["low", "medium", "neutral", "trusted"], "default_treatment": "high"},
    "ownhome": {"valid_values": ["true", "false"], "default_treatment": np.nan},
}

for column, treatment in unknown_treatments.items():
    combined_df = _handle_unknowns(combined_df, column, treatment["valid_values"], treatment["default_treatment"])
payfrequency_map = {"biweekly": ["biweekly", "bi-weekly", "bi weekly", "bw"], "semimonthly": ["semi-monthly", "semimonthly"]}

combined_df["payfrequency"] = combined_df["payfrequency"].apply(
    lambda x: next((key for key, values in payfrequency_map.items() if str(x).lower() in values), np.nan)
)

return combined_df[final_cols]
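The pay-frequency normalisation collapses the aliases above onto two canonical labels and sends everything else to NaN; for example (hypothetical inputs):

    pd.Series(["Bi-Weekly", "bw", "weekly"]).apply(
        lambda x: next((key for key, values in payfrequency_map.items() if str(x).lower() in values), np.nan)
    )
    # -> "biweekly", "biweekly", NaN ("weekly" is not in either alias list)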
# ----------------------------
# M2 Pre-processing
# ----------------------------
def _apply_onehot_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    for parsed_feature, model_var, contains_val in M2_ONEHOT_CONFIG:
        value = df.get(parsed_feature, pd.Series([None])).iloc[0]
        flag = 0
        if isinstance(value, list):
            flag = int(any(contains_val in str(v).lower() or re.sub(r"[^a-z0-9]+", " ", contains_val) in re.sub(r"[^a-z0-9]+", " ", str(v).lower()) for v in value))
        elif isinstance(value, str):
            val_norm = re.sub(r"[^a-z0-9]+", " ", value.lower())
            contains_norm = re.sub(r"[^a-z0-9]+", " ", contains_val)
            flag = int(contains_val in value.lower() or contains_norm in val_norm)
        df[model_var] = flag
    return df
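M2_ONEHOT_CONFIG itself is not part of this hunk; assuming it holds (parsed_feature, model_var, contains_val) triples as the loop unpacks, each flag is a case-insensitive substring match that also compares punctuation-normalised forms:

    import regex as re

    val_norm = re.sub(r"[^a-z0-9]+", " ", "Disposable-Domain!".lower())  # "disposable domain "
    contains_norm = re.sub(r"[^a-z0-9]+", " ", "disposable")             # "disposable"
    contains_norm in val_norm                                            # True, so the flag is set to 1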
def _extract_first_seen_days(ts_value, app_ts):
    ts = pd.to_datetime(ts_value, errors="coerce", utc=True)
    app = pd.to_datetime(app_ts, errors="coerce", utc=True)
    # align to naive for subtraction
    if isinstance(ts, pd.Timestamp) and ts.tzinfo is not None:
        ts = ts.tz_localize(None)
    if isinstance(app, pd.Timestamp) and app.tzinfo is not None:
        app = app.tz_localize(None)
    if pd.isna(ts) or pd.isna(app):
        return None
    return (app.normalize() - ts.normalize()).days
def _to_naive_ts(val):
    ts = pd.to_datetime(val, errors="coerce", utc=True)
    if isinstance(ts, pd.Timestamp) and ts.tzinfo is not None:
        ts = ts.tz_localize(None)
    return ts
def _month_diff(earlier, later):
    """Month difference (earlier - later) using year/month buckets."""
    ts_earlier = _to_naive_ts(earlier)
    ts_later = _to_naive_ts(later)
    if pd.isna(ts_earlier) or pd.isna(ts_later):
        return None
    return (ts_earlier.year - ts_later.year) * 12 + (ts_earlier.month - ts_later.month)
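Both helpers coerce to UTC, drop the timezone, and then take calendar differences; a worked example with hypothetical timestamps:

    _extract_first_seen_days("2024-01-15T10:30:00Z", "2024-03-01T02:00:00Z")  # 46 whole days (application date minus first-seen date)
    _month_diff("2024-01-15", "2024-03-01")                                   # -2 (year/month buckets only, days ignored)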
def pre_processing_m2(data_df: pd.DataFrame) -> pd.DataFrame:
    df = data_df.copy()
    df.columns = df.columns.str.lower()

    # Timestamp-derived features (align with M1 behaviour to keep probabilities consistent)
    df["application_timestamp"] = pd.to_datetime(df["application_timestamp"], errors="coerce", utc=True)
    df["day"] = df["application_timestamp"].dt.day
    df["hour"] = df["application_timestamp"].dt.hour
    df["day_sin"] = np.sin(2 * np.pi * df["day"] / 31)
    df["day_cos"] = np.cos(2 * np.pi * df["day"] / 31)

    def _classify_day_night(hour_val):
        if pd.isna(hour_val):
            return np.nan
        return "day" if 6 <= hour_val < 18 else "night"

    df["day_night"] = df["hour"].apply(_classify_day_night)

    # Apply onehot flags from attributes
    df = _apply_onehot_features(df)

    # Distances
    lat_ref = _prep_latlong_ref()
    if not lat_ref.empty and "zip" in df.columns:
        zip_value = df["zip"].iloc[0]
        zip_lookup = _normalize_zip_for_ref(zip_value)
        ref_row = lat_ref[lat_ref["postal_code_ref"] == zip_lookup] if zip_lookup else pd.DataFrame()
        lat_ref_val = ref_row["latitute_ref"].iloc[0] if not ref_row.empty else None
        lon_ref_val = ref_row["longitude_ref"].iloc[0] if not ref_row.empty else None
    else:
        lat_ref_val = None
        lon_ref_val = None
    df["dist_inputip_ref_km"] = df.apply(
        lambda r: _haversine_km(lat_ref_val, lon_ref_val, r.get("input_ip_latitude"), r.get("input_ip_longitude")), axis=1
    )
    df["dist_em_ip_ref_km"] = df.apply(
        lambda r: _haversine_km(lat_ref_val, lon_ref_val, r.get("ip_latitude") or r.get("tps_ip_latitude"), r.get("ip_longitude") or r.get("tps_ip_longitude")),
        axis=1,
    )
    df["dist_proxyip_ref_km"] = df.apply(
        lambda r: _haversine_km(lat_ref_val, lon_ref_val, r.get("proxy_ip_latitude"), r.get("proxy_ip_longitude")), axis=1
    )
    df["dist_dnsip_ref_km"] = df.apply(
        lambda r: _haversine_km(lat_ref_val, lon_ref_val, r.get("dns_ip_latitude"), r.get("dns_ip_longitude")), axis=1
    )
    df["dist_trueip_ref_km"] = df.apply(
        lambda r: _haversine_km(lat_ref_val, lon_ref_val, r.get("true_ip_latitude"), r.get("true_ip_longitude")), axis=1
    )
    df["dist_trueip_em_ip_km"] = df.apply(
        lambda r: _haversine_km(r.get("true_ip_latitude"), r.get("true_ip_longitude"), r.get("ip_latitude") or r.get("tps_ip_latitude"), r.get("ip_longitude") or r.get("tps_ip_longitude")),
        axis=1,
    )
    df["dist_trueip_dnsip_km"] = df.apply(
        lambda r: _haversine_km(r.get("true_ip_latitude"), r.get("true_ip_longitude"), r.get("dns_ip_latitude"), r.get("dns_ip_longitude")),
        axis=1,
    )
    # Ages
    app_ts_val = df.get("application_timestamp", pd.Series([None])).iloc[0]

    def _safe_day_diff(row):
        if not row.get("digital_id_first_seen"):
            return None
        val = _extract_first_seen_days(row.get("digital_id_first_seen"), app_ts_val)
        return -val if val is not None else None

    df["digital_id_day_diff"] = df.apply(_safe_day_diff, axis=1)
    df["digital_id_month_diff"] = df.apply(lambda r: _month_diff(r.get("digital_id_first_seen"), app_ts_val), axis=1)

    for col_name in [
        "digital_id_first_seen",
        "account_email_first_seen",
        "account_login_first_seen",
        "account_telephone_first_seen",
        "true_ip_first_seen",
        "ssn_hash_first_seen",
        "fuzzy_device_first_seen",
        "national_id_first_seen",
        "proxy_ip_first_seen",
    ]:
        out_col = f"{col_name}_age"
        df[out_col] = df.apply(lambda r: _extract_first_seen_days(r.get(col_name), app_ts_val), axis=1)
    # applicant_age for consistency if not present
    if "applicant_age" not in df.columns:
        df["applicant_age"] = df.apply(
            lambda row: pd.to_datetime(row["application_timestamp"]).year - pd.to_datetime(row["application_date_of_birth"]).year
            if pd.notnull(row.get("application_timestamp")) and pd.notnull(row.get("application_date_of_birth"))
            else None,
            axis=1,
        )
    # Safe casting and capping using data dictionary
    for var_name, rules in M2_DATA_DICTIONARY.items():
        if var_name not in df.columns:
            continue
        col = pd.to_numeric(df[var_name], errors="coerce") if rules.get("data_type") in ["float", "int"] else df[var_name]
        if rules.get("data_type") == "int":
            col = col.astype("float")
        valid_min = rules.get("valid_min")
        valid_max = rules.get("valid_max")
        observed_min = rules.get("observed_cap_min")
        observed_max = rules.get("observed_cap_max")
        if observed_min is not None or observed_max is not None:
            col = col.clip(lower=observed_min, upper=observed_max)
        # if valid_min is not None:
        #     col = col.where(col >= valid_min, np.nan)
        # if valid_max is not None:
        #     col = col.where(col <= valid_max, np.nan)
        df[var_name] = col

    return df
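_prep_latlong_ref, _normalize_zip_for_ref and _haversine_km are defined outside this hunk. For orientation only, a minimal sketch of what a great-circle helper with that call signature is assumed to compute (not necessarily this module's exact implementation):

    import math

    def _haversine_km_sketch(lat1, lon1, lat2, lon2):
        try:
            lat1, lon1, lat2, lon2 = (float(v) for v in (lat1, lon1, lat2, lon2))
        except (TypeError, ValueError):
            return None  # any missing or non-numeric coordinate yields no distance
        r = 6371.0  # mean Earth radius in km
        p1, p2 = math.radians(lat1), math.radians(lat2)
        dphi = math.radians(lat2 - lat1)
        dlmb = math.radians(lon2 - lon1)
        a = math.sin(dphi / 2) ** 2 + math.cos(p1) * math.cos(p2) * math.sin(dlmb / 2) ** 2
        return 2 * r * math.asin(math.sqrt(a))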
def pre_processing_all(data_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    # Ensure requested THX fields exist so downstream packaging always has keys
    df_base = data_df.copy()

    for field in THX_FIELDS:
        if field in df_base.columns:
            df_base[field] = df_base[field].astype(str)
        else:
            df_base[field] = None
    df_thx = df_base[THX_FIELDS].copy()

    df_m1 = pre_processing_m1(df_base.copy())
    df_m2 = pre_processing_m2(df_base.copy())
    return df_m1, df_m2, df_thx


# Backwards compatible entry point (used by legacy code/tests if any)
def pre_processing(data_df: pd.DataFrame) -> pd.DataFrame:
    df_m1, _, _ = pre_processing_all(data_df)
    return df_m1
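THX_FIELDS is the pass-through column list defined elsewhere in this module; given a fully parsed single-application DataFrame (here called raw, a hypothetical name), the three frames split out above are consumed as:

    df_m1, df_m2, df_thx = pre_processing_all(raw)
    # df_m1 feeds the M1 model, df_m2 feeds the M2 model, df_thx keeps only the THX_FIELDS key columns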
processing.py (129 changed lines)
@@ -1,46 +1,107 @@
import json
from functools import lru_cache
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb

# BASE_DIR = Path(__file__).resolve().parent
# M1_MODEL_PATH = BASE_DIR / "xgboost_model_M1.joblib"
# M1_CATEGORY_ORDERS_PATH = BASE_DIR / "category_orders_train_M1.json"
# M2_MODEL_PATH = BASE_DIR / "xgboost_model_M2.joblib"
# M2_CATEGORY_ORDERS_PATH = BASE_DIR / "category_orders_train_M2.json"
# M2_ISO_PATH = BASE_DIR / "isotonic_model_M2.joblib"

M1_MODEL_PATH = "./xgboost_model_M1.joblib"
M1_CATEGORY_ORDERS_PATH = "./category_orders_train_M1.json"
M2_MODEL_PATH = "./xgboost_model_M2.joblib"
M2_CATEGORY_ORDERS_PATH = "./category_orders_train_M2.json"
M2_ISO_PATH = "./isotonic_model_M2.joblib"


def _load_category_orders(path: Path) -> dict:
    with open(path, "r") as f:
        return json.load(f)


@lru_cache(maxsize=1)
def _load_m1_model():
    return joblib.load(M1_MODEL_PATH)


@lru_cache(maxsize=1)
def _load_m2_model():
    return joblib.load(M2_MODEL_PATH)


@lru_cache(maxsize=1)
def _load_m2_iso_model():
    return joblib.load(M2_ISO_PATH)


@lru_cache(maxsize=None)
def _load_category_orders_cached(path: Path):
    # Cache category orders per path to avoid disk I/O on each scoring
    return _load_category_orders(path)
def _prepare(df: pd.DataFrame, category_orders: dict) -> pd.DataFrame:
    df = df.copy()
    for col, categories in category_orders.items():
        if col not in df.columns:
            df[col] = np.nan

        df[col] = df[col].astype(str).str.lower()
        df[col].replace([None, "", "null", np.nan, "nan", " "], np.nan, inplace=True)
        df[col] = pd.Categorical(df[col], categories=categories, ordered=True)
    return df
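A small illustration of _prepare with a toy category-orders mapping (hypothetical values); strings outside the declared categories end up as NaN in the ordered categorical:

    toy_orders = {"riskrating": ["low", "medium", "neutral", "trusted"]}
    toy = pd.DataFrame({"riskrating": ["LOW", "bogus", None]})
    out = _prepare(toy, toy_orders)
    # out["riskrating"] -> ["low", NaN, NaN] as an ordered Categorical with the declared category order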
def processing_m1(input_data: pd.DataFrame) -> pd.DataFrame:
    df = pd.DataFrame(input_data)

    if df.empty:
        raise ValueError("Input DataFrame is empty.")

    model = _load_m1_model()
    df.rename(columns={"riskrating": "RiskRating", "trueipgeo": "TrueIpGeo"}, inplace=True)
    category_orders = _load_category_orders_cached(M1_CATEGORY_ORDERS_PATH)
    df = _prepare(df, category_orders)

    # Ensure all expected features exist
    expected_features = model.feature_names

    # missing_features = [feature for feature in expected_features if feature not in df.columns]
    # for feature in missing_features:
    #     df[feature] = np.nan  # Use NaN to avoid dtype issues

    # Create XGBoost DMatrix
    dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=np.nan)

    # Make predictions
    predictions = model.predict(dmatrix)
    df["prediction"] = predictions

    return df
def processing_m2(input_data: pd.DataFrame) -> pd.DataFrame:
    df = pd.DataFrame(input_data)
    if df.empty:
        raise ValueError("Input DataFrame is empty.")

    model = _load_m2_model()
    category_orders = _load_category_orders_cached(M2_CATEGORY_ORDERS_PATH)
    df = _prepare(df, category_orders)

    expected_features = model.feature_names
    for feature in expected_features:
        if feature not in df.columns:
            df[feature] = np.nan
    dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=np.nan)
    pd_arr = model.predict(dmatrix)
    df["pd_m2"] = pd_arr

    iso_model = _load_m2_iso_model()
    df["pd_m2_iso"] = iso_model.predict(pd_arr)
    return df
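The isotonic step maps the raw M2 probability onto a calibrated scale; the calibrator is simply a joblib-persisted object exposing predict. If it were produced with scikit-learn (which the pinned scikit-learn in the requirements change suggests), training it could look roughly like this sketch, where raw_scores_train and fraud_labels_train are hypothetical arrays of historical raw scores and 0/1 outcomes:

    from sklearn.isotonic import IsotonicRegression
    import joblib

    iso = IsotonicRegression(out_of_bounds="clip")
    iso.fit(raw_scores_train, fraud_labels_train)
    joblib.dump(iso, "./isotonic_model_M2.joblib")
    # at scoring time, iso.predict(pd_arr) returns the calibrated value for each raw score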
def processing_all(df_m1: pd.DataFrame, df_m2: pd.DataFrame, df_thx: pd.DataFrame):
    return processing_m1(df_m1), processing_m2(df_m2), df_thx


# Legacy single-model entry point
def processing(input_data: pd.DataFrame) -> pd.DataFrame:
    return processing_m1(input_data)
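Chaining the two entry points, a condensed usage sketch (raw is a hypothetical parsed single-application DataFrame):

    df_m1, df_m2, df_thx = pre_processing_all(raw)
    scored_m1, scored_m2, df_thx = processing_all(df_m1, df_m2, df_thx)
    # scored_m1["prediction"] is the M1 score; scored_m2["pd_m2"] and scored_m2["pd_m2_iso"] are the raw and calibrated M2 scores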
@@ -97,6 +97,10 @@
        "zip": {
            "type": ["string", "null"],
            "description": "Zip of the current residence."
        },
        "ReasonCode": {
            "type": ["string", "null"],
            "description": "Reason code from ThreatMetrix."
        }
    },
    "required": []
@@ -5,3 +5,4 @@ joblib == 1.4.2
jmespath == 1.0.1
regex == 2023.12.25
json_repair == 0.47.6
scikit-learn==1.5.2
@@ -26,6 +26,54 @@
        "type": ["number", "null"],
        "description": "HD Fraud Score M1"
    },
    "hd_score_m2": {
        "type": ["number", "null"],
        "description": "HD Fraud Score M2"
    },
    "hd_score_iso_m2": {
        "type": ["number", "null"],
        "description": "HD Fraud Score M2 Scaled"
    },
    "digital_id_first_seen": {
        "type": ["string", "null"],
        "description": "Digital ID first seen timestamp"
    },
    "summary_risk_score": {
        "type": ["string", "null"],
        "description": "Summary risk score"
    },
    "cpu_clock": {
        "type": ["string", "null"],
        "description": "CPU clock value from device profiling"
    },
    "account_login_first_seen": {
        "type": ["string", "null"],
        "description": "Account login first seen timestamp"
    },
    "account_telephone_first_seen": {
        "type": ["string", "null"],
        "description": "Account telephone first seen timestamp"
    },
    "true_ip_first_seen": {
        "type": ["string", "null"],
        "description": "True IP first seen timestamp"
    },
    "ssn_hash_first_seen": {
        "type": ["string", "null"],
        "description": "SSN hash first seen timestamp"
    },
    "account_email_attributes": {
        "type": ["string", "null"],
        "description": "Account email attributes"
    },
    "tps_ip_latitude": {
        "type": ["string", "null"],
        "description": "TPS IP latitude"
    },
    "tps_ip_longitude": {
        "type": ["string", "null"],
        "description": "TPS IP longitude"
    },
    "action": {
        "type": ["string", "null"],
        "description": "Recommended Action."
File diff suppressed because one or more lines are too long
BIN xgboost_model_M1.joblib (new file): binary file not shown.
BIN xgboost_model_M2.joblib (new file): binary file not shown.