Compare commits

...

6 Commits

Author SHA1 Message Date
Ankur Malik
67c2174ab3 Add digital_id_confidence_rating dtype, split prepare for M1/M2, clean lat/long CSV
All checks were successful
Build and Push Docker Image / test (push) Successful in 27s
Build and Push Docker Image / build_and_push (push) Successful in 2m39s
2025-11-25 05:58:44 -05:00
Ankur Malik
d0f4d225ee Sync m-1-v-1 block with local updates
All checks were successful
Build and Push Docker Image / test (push) Successful in 2m45s
Build and Push Docker Image / build_and_push (push) Successful in 4m16s
2025-11-23 23:22:32 -05:00
1bf55226e1 Upload files to "/"
All checks were successful
Build and Push Docker Image / test (push) Successful in 2m53s
Build and Push Docker Image / build_and_push (push) Successful in 4m32s
2025-07-11 14:42:06 +00:00
1d3d28213e Upload files to "/"
All checks were successful
Build and Push Docker Image / test (push) Successful in 1m1s
Build and Push Docker Image / build_and_push (push) Successful in 2m30s
2025-03-31 13:53:20 +00:00
1152a701b4 Upload files to "/"
All checks were successful
Build and Push Docker Image / test (push) Successful in 54s
Build and Push Docker Image / build_and_push (push) Successful in 2m12s
2025-03-31 12:36:37 +00:00
ead9a776da Advanced M series V1 model block
All checks were successful
Build and Push Docker Image / test (push) Successful in 55s
Build and Push Docker Image / build_and_push (push) Successful in 3m21s
2025-03-12 16:12:18 +00:00
17 changed files with 29946 additions and 22 deletions

View File

@@ -1 +1,3 @@
-**Hello world!!!**
## Advanced M series V1 model block

M Series Model trained on historical data to identify fraudulent patterns.

347
block.py
View File

@@ -1,21 +1,334 @@
-@flowx_block
-def example_function(request: dict) -> dict:
-# Processing logic here...
-return {
-"meta_info": [
-{
-"name": "created_date",
-"type": "string",
-"value": "2024-11-05"
-}
-],
-"fields": [
-{
-"name": "",
-"type": "",
-"value": ""
-}
-]
-}
import logging
import jmespath
import json_repair
import pandas as pd
import regex as re
from pre_processing import pre_processing_all
from processing import processing_all
from post_processing import post_processing_all
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
)
logger = logging.getLogger(__name__)
_JSON_LIKE = re.compile(r"^\s*\?*[\{\[].*[\}\]]\s*$", re.DOTALL)
def extract_value(blob, expression):
try:
return jmespath.search(expression, blob)
except Exception:
return None
def coalesce(*args):
for value in args:
if value is not None:
return value
return None
def deep_repair(obj):
# 1) If it's a string that *looks* like JSON (with or without one leading '?'),
# strip exactly one leading '?', re-parse, and recurse.
if isinstance(obj, str):
s = obj.strip()
if _JSON_LIKE.match(s):
if s.startswith("?"):
s = s[1:]
parsed = json_repair.loads(s)
return deep_repair(parsed)
return obj
# 2) Dict recurse on each value
if isinstance(obj, dict):
return {k: deep_repair(v) for k, v in obj.items()}
# 3) List recurse on each element
if isinstance(obj, list):
return [deep_repair(v) for v in obj]
# 4) Otherwise, leave it alone
return obj
def sanitize_blob(blob):
try:
return deep_repair(blob)
except Exception as e:
logger.error("Failed to sanitize blob: %s", e)
return None
# Expressions to extract values (M1 + added M2 fields)
expressions = {
# M1 (existing)
"first_seen_days": [
"(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].first_seen_days)[0]",
'Blob."emailage.emailriskscore.first_seen_days"',
"Blob.tps_vendor_raw_response.query.results[0].first_seen_days",
],
"ea_score": [
"Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].EAScore",
'Blob."emailage.emailriskscore.eascore"',
"Blob.tps_vendor_raw_response.query.results[0].EAScore",
],
"email_creation_days": [
"(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].email_creation_days)[0]",
"Blob.tps_vendor_raw_response.query.results[0].email_creation_days",
],
"summary_risk_score": ["Blob.summary_risk_score"],
"digital_id_trust_score_rating": ["Blob.digital_id_trust_score_rating"],
"os_version": ["Blob.os_version"],
"account_email_worst_score": ["Blob.account_email_worst_score"],
"true_ip_score": ["Blob.true_ip_score"],
"ip_net_speed_cell": [
"(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ip_netSpeedCell)[0]",
"Blob.tps_vendor_raw_response.query.results[0].ip_netSpeedCell",
],
"account_email_score": ["Blob.account_email_score"],
"true_ip_worst_score": ["Blob.true_ip_worst_score"],
"proxy_ip_worst_score": ["Blob.proxy_ip_worst_score"],
"proxy_ip_score": ["Blob.proxy_ip_score"],
"fuzzy_device_score": ["Blob.fuzzy_device_score"],
"ip_region_confidence": [
"(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ip_regionconf)[0]",
"Blob.tps_vendor_raw_response.query.results[0].ip_regionconf",
],
"true_ip_state_confidence": ["Blob.true_ip_state_confidence"],
"fuzzy_device_worst_score": ["Blob.fuzzy_device_worst_score"],
"digital_id_confidence_rating": ["Blob.digital_id_confidence_rating"],
"trueipgeo": ["TrueIpGeo", "Blob.true_ip_geo"],
# M2 additions
"policy_score": ["Blob.policy_score"],
"digital_id_trust_score": ["Blob.digital_id_trust_score"],
"proxy_score": ["Blob.proxy_score"],
"browser_spoof_score": ["Blob.browser_spoof_score"],
"input_ip_connection_type": ["Blob.input_ip_connection_type"],
"fuzzy_device_id_confidence": ["Blob.fuzzy_device_id_confidence"],
"fraudrisk": [
"(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].fraudRisk)[0]",
"Blob.tps_vendor_raw_response.query.results[0].fraudRisk",
'Blob."emailage.emailriskscore.fraudRisk"',
],
"overalldigitalidentityscore": [
"(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].overallDigitalIdentityScore)[0]",
"Blob.tps_vendor_raw_response.query.results[0].overallDigitalIdentityScore",
'Blob."emailage.emailriskscore.overallDigitalIdentityScore"',
],
"totalhits": [
"(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].totalhits)[0]",
"Blob.tps_vendor_raw_response.query.results[0].totalhits",
'Blob."emailage.emailriskscore.totalhits"',
],
"uniquehits": [
"(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].uniquehits)[0]",
"Blob.tps_vendor_raw_response.query.results[0].uniquehits",
'Blob."emailage.emailriskscore.uniquehits"',
],
"emailtofullnameconfidence": [
"(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].emailToFullNameConfidence)[0]",
"Blob.tps_vendor_raw_response.query.results[0].emailToFullNameConfidence",
'Blob."emailage.emailriskscore.emailToFullNameConfidence"',
],
"emailtolastnameconfidence": [
"(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].emailToLastNameConfidence)[0]",
"Blob.tps_vendor_raw_response.query.results[0].emailToLastNameConfidence",
'Blob."emailage.emailriskscore.emailToLastNameConfidence"',
],
"domain_creation_days": [
"(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].domain_creation_days)[0]",
"Blob.tps_vendor_raw_response.query.results[0].domain_creation_days",
'Blob."emailage.emailriskscore.domain_creation_days"',
],
"iptophoneconfidence": [
"(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ipToPhoneConfidence)[0]",
"Blob.tps_vendor_raw_response.query.results[0].ipToPhoneConfidence",
'Blob."emailage.emailriskscore.ipToPhoneConfidence"',
],
"di_autofill_count_login": [
"Blob.tmx_variables.di_autofill_count_login",
"Blob.policy_details_api.policy_detail_api.customer.rules.vars.variable.di_autofill_count_login",
],
"accphone_gbl_velocity_hour": [
"Blob.tmx_variables.accphone_gbl_velocity_hour",
"Blob.tmx_variables._accphone_gbl_velocity_hour",
],
# Lat/long fields for distance engineering
"ip_latitude": [
"(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ip_latitude)[0]",
"Blob.tps_vendor_raw_response.query.results[0].ip_latitude",
],
"ip_longitude": [
"(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ip_longitude)[0]",
"Blob.tps_vendor_raw_response.query.results[0].ip_longitude",
],
"tps_ip_latitude": ["Blob.tps_vendor_raw_response.query.results[0].ip_latitude"],
"tps_ip_longitude": ["Blob.tps_vendor_raw_response.query.results[0].ip_longitude"],
"true_ip_latitude": ["Blob.true_ip_latitude"],
"true_ip_longitude": ["Blob.true_ip_longitude"],
"proxy_ip_latitude": ["Blob.proxy_ip_latitude"],
"proxy_ip_longitude": ["Blob.proxy_ip_longitude"],
"dns_ip_latitude": ["Blob.dns_ip_latitude"],
"dns_ip_longitude": ["Blob.dns_ip_longitude"],
"input_ip_latitude": ["Blob.input_ip_latitude"],
"input_ip_longitude": ["Blob.input_ip_longitude"],
# First-seen timestamps for age deltas
"digital_id_first_seen": ["Blob.digital_id_first_seen"],
"account_email_first_seen": ["Blob.account_email_first_seen"],
"account_login_first_seen": ["Blob.account_login_first_seen"],
"account_telephone_first_seen": ["Blob.account_telephone_first_seen"],
"true_ip_first_seen": ["Blob.true_ip_first_seen"],
"ssn_hash_first_seen": ["Blob.ssn_hash_first_seen"],
"fuzzy_device_first_seen": ["Blob.fuzzy_device_first_seen"],
"national_id_first_seen": ["Blob.national_id_first_seen"],
"proxy_ip_first_seen": ["Blob.proxy_ip_first_seen"],
# Attribute arrays (used for one-hot style parsing)
"account_name_activities": ["Blob.account_name_activities"],
"account_email_attributes": ["Blob.account_email_attributes"],
"true_ip_attributes": ["Blob.true_ip_attributes"],
"true_ip_activities": ["Blob.true_ip_activities"],
"digital_id_attributes": ["Blob.digital_id_attributes"],
"account_telephone_attributes": ["Blob.account_telephone_attributes"],
"cpu_clock": ["Blob.cpu_clock"]
}
def __main__(
# Application->
application_key: str,
application_timestamp: str,
application_ssn: str,
application_email_address: str,
application_bank_account_number: str,
application_is_rejected: str,
application_date_of_birth: str,
# uprovaloanapplication->
educationlevel: str,
employmentstatus: str,
lengthatbank: str,
lengthatjob: str,
ownhome: str,
payfrequency: str,
monthsatresidence: str,
state: str,
zip: str,
# thxresponse->
EventType: str,
DigitalIdConfidence: str,
RiskRating: str,
TmxSummaryReasonCode: str,
TrueIpGeo: str,
Blob: str,
DeviceId: str,
FuzzyDeviceId: str,
ReasonCode: str,
) -> dict:
# Convert input parameters into a flat dictionary
data = {
"application_key": application_key,
"application_timestamp": application_timestamp,
"application_ssn ": application_ssn,
"application_email_address": application_email_address,
"application_bank_account_number": application_bank_account_number,
"application_is_rejected": application_is_rejected,
"application_date_of_birth": application_date_of_birth,
"educationlevel": educationlevel,
"employmentstatus": employmentstatus,
"lengthatbank": lengthatbank,
"lengthatjob": lengthatjob,
"ownhome": ownhome,
"payfrequency": payfrequency,
"monthsatresidence": monthsatresidence,
"state": state,
"zip": zip,
"EventType": EventType,
"DigitalIdConfidence": DigitalIdConfidence,
"RiskRating": RiskRating,
"TmxSummaryReasonCode": TmxSummaryReasonCode,
"TrueIpGeo": TrueIpGeo,
"Blob": Blob,
"DeviceId": DeviceId,
"FuzzyDeviceId": FuzzyDeviceId,
"ReasonCode": ReasonCode,
}
# Convert dictionary to a single-row DataFrame
combined_df = pd.DataFrame([data])
combined_df.columns = combined_df.columns.str.lower()
# Uncomment Below For Testing using Uprova Batch Data
# combined_df["educationlevel"] = None
# combined_df["monthsatresidence"] = None
# combined_df["ownhome"] = False
# combined_df['lengthatbank'] = 0
combined_df["application_email_address"] = combined_df["application_email_address"].str.lower()
if Blob:
combined_df["blob"] = combined_df["blob"].apply(sanitize_blob)
# Step 2: Extract values using the expressions dictionary
for column, expressions_list in expressions.items():
def _extract_with_fallback(blob_obj):
values = []
for expr in expressions_list:
val = extract_value(blob_obj, expr)
if val is None and isinstance(expr, str) and expr.startswith("Blob."):
val = extract_value(blob_obj, expr[len("Blob.") :])
values.append(val)
return coalesce(*values)
extracted = combined_df["blob"].apply(_extract_with_fallback)
if column in combined_df.columns:
combined_df[column] = extracted.where(extracted.notnull(), combined_df[column])
else:
combined_df[column] = extracted
# logger.info("pre_flowx data")
# logger.info(combined_df.iloc[0].drop("blob").to_dict())
else:
for column in expressions:
combined_df[column] = None
# logger.info("pre_flowx data")
# logger.info(combined_df.iloc[0].to_dict())
df_m1, df_m2, df_thx = pre_processing_all(combined_df)
# logger.info("pre_processed data m1")
# logger.info(df_m1.iloc[0].to_dict())
# logger.info("pre_processed data m2")
# logger.info(df_m2.iloc[0].to_dict())
processed_m1, processed_m2, df_thx = processing_all(df_m1, df_m2, df_thx)
# logger.info("processed_data m1")
# logger.info(processed_m1.iloc[0].to_dict())
# logger.info("processed_data m2")
# logger.info(processed_m2.iloc[0].to_dict())
result = post_processing_all(processed_m1, processed_m2, df_thx)
# State Check
state_value = combined_df["state"].iloc[0]
zip_value = combined_df["zip"].iloc[0]
if (pd.notnull(state_value) and state_value == "ZZ") or (
pd.notnull(zip_value) and zip_value == "86445"
):
result["hd_score_m1"] = 1250
result["hd_score_m2"] = 1250
result["hd_score_iso_m2"] = 1250
# logger.info("post_processed_data after state check")
# logger.info(result)
# Normalize numeric scores to built-in float so JSON encoding (e.g. Temporal)
# does not fail on NumPy scalar types like np.float32/np.float64.
for key in ("hd_score_m1", "hd_score_m2", "hd_score_iso_m2"):
if key in result and result[key] is not None:
try:
result[key] = float(result[key])
except (TypeError, ValueError):
logger.warning("Failed to cast %s=%r to float", key, result[key])
logger.info("result: %s", result)
return result
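A minimal sketch (not part of the diff) of what the sanitization and fallback-extraction helpers above do, assuming `deep_repair`, `coalesce`, and `extract_value` are importable from `block.py`; the payload and values are invented:

```python
# Synthetic blob: a '?'-prefixed JSON string with an escaped JSON string nested inside.
from block import coalesce, deep_repair, extract_value

raw_blob = '?{"summary_risk_score": "12", "nested": "?{\\"os_version\\": \\"17.1\\"}"}'

# deep_repair strips one leading '?' from JSON-like strings, re-parses them with
# json_repair, and recurses, so the inner escaped string also becomes a dict.
blob = deep_repair(raw_blob)
assert blob["nested"] == {"os_version": "17.1"}

# Each feature in `expressions` lists several JMESPath paths; extract_value tries
# one path, and coalesce keeps the first non-None result across the fallbacks.
value = coalesce(extract_value(blob, "no.such.path"), extract_value(blob, "summary_risk_score"))
assert value == "12"
```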

View File

@@ -0,0 +1,88 @@
{
"employmentstatus": [
"disability",
"fixed income",
"full time employed",
"other",
"part time employment",
"retired benefits",
"self employed",
"student",
"unemployed",
"welfare"
],
"TrueIpGeo": [
"other",
"us"
],
"digital_id_trust_score_rating": [
"high",
"low",
"neutral",
"very_high",
"very_low"
],
"educationlevel": [
"associate's degree",
"bachelor's degree",
"doctorate",
"high school",
"master's degree",
"other"
],
"os_version": [
"10",
"11",
"12",
"13",
"14",
"15",
"16",
"17",
"18",
"8",
"9",
"unknown"
],
"ip_net_speed_cell": [
"broadband",
"cable",
"dialup",
"dsl",
"fixed wireless",
"mobile",
"mobile wireless",
"ocx",
"satellite",
"t1",
"tx",
"wireless",
"xdsl"
],
"day_night": [
"Day",
"Night"
],
"digital_id_confidence_rating": [
"high",
"medium",
"very_high",
"very_low"
],
"RiskRating": [
"high",
"low",
"medium",
"neutral",
"trusted"
],
"payfrequency": [
"biweekly",
"semimonthly"
],
"ownhome": [
"false",
"true"
]
}
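For reviewers, a sketch of how a category-orders file like the one above is consumed at scoring time (mirrors `_prepare_m1`/`_prepare_m2` in `processing.py`; the filename and sample row are assumptions):

```python
import json

import numpy as np
import pandas as pd

# Filename taken from M1_CATEGORY_ORDERS_PATH in processing.py; adjust if this
# JSON is actually one of the other category files in this diff.
with open("category_orders_train_M1.json") as f:
    category_orders = json.load(f)

df = pd.DataFrame([{"payfrequency": "biweekly", "os_version": "99"}])
for col, categories in category_orders.items():
    if col not in df.columns:
        df[col] = np.nan
    # Pin each column to the training-time category list so categorical codes
    # stay aligned with the model; unseen values ("99") become NaN (= missing).
    df[col] = pd.Categorical(df[col], categories=categories, ordered=True)

assert df["os_version"].isna().iloc[0]
assert df["payfrequency"].cat.codes.iloc[0] == category_orders["payfrequency"].index("biweekly")
```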

View File

@@ -0,0 +1,88 @@
{
"employmentstatus": [
"disability",
"fixed income",
"full time employed",
"other",
"part time employment",
"retired benefits",
"self employed",
"student",
"unemployed",
"welfare"
],
"TrueIpGeo": [
"other",
"us"
],
"digital_id_trust_score_rating": [
"high",
"low",
"neutral",
"very_high",
"very_low"
],
"educationlevel": [
"associate's degree",
"bachelor's degree",
"doctorate",
"high school",
"master's degree",
"other"
],
"os_version": [
"10",
"11",
"12",
"13",
"14",
"15",
"16",
"17",
"18",
"8",
"9",
"unknown"
],
"ip_net_speed_cell": [
"broadband",
"cable",
"dialup",
"dsl",
"fixed wireless",
"mobile",
"mobile wireless",
"ocx",
"satellite",
"t1",
"tx",
"wireless",
"xdsl"
],
"day_night": [
"Day",
"Night"
],
"digital_id_confidence_rating": [
"high",
"medium",
"very_high",
"very_low"
],
"RiskRating": [
"high",
"low",
"medium",
"neutral",
"trusted"
],
"payfrequency": [
"biweekly",
"semimonthly"
],
"ownhome": [
"false",
"true"
]
}

View File

@@ -0,0 +1,303 @@
{
"riskrating": [
"high",
"low",
"medium",
"neutral",
"trusted"
],
"input_ip_connection_type": [
"cable",
"consumer satellite",
"dialup",
"dsl",
"fixed wireless",
"framerelay",
"isdn",
"mobile wireless",
"ocx",
"tx"
],
"fraudrisk": [
"001 very low",
"003 very low",
"005 very low",
"006 very low",
"008 very low",
"009 very low",
"010 very low",
"011 very low",
"012 very low",
"014 very low",
"015 very low",
"016 very low",
"017 very low",
"018 very low",
"020 very low",
"021 very low",
"022 very low",
"023 very low",
"024 very low",
"025 very low",
"026 very low",
"027 very low",
"028 very low",
"029 very low",
"030 very low",
"031 very low",
"032 very low",
"033 very low",
"034 very low",
"035 very low",
"036 very low",
"037 very low",
"038 very low",
"039 very low",
"040 very low",
"041 very low",
"042 very low",
"043 very low",
"044 very low",
"045 very low",
"046 very low",
"047 very low",
"048 very low",
"049 very low",
"050 very low",
"051 very low",
"052 very low",
"053 very low",
"054 very low",
"055 very low",
"056 very low",
"057 very low",
"058 very low",
"059 very low",
"060 very low",
"061 very low",
"062 very low",
"063 very low",
"064 very low",
"065 very low",
"066 very low",
"067 very low",
"068 very low",
"069 very low",
"070 very low",
"071 very low",
"072 very low",
"073 very low",
"074 very low",
"075 very low",
"076 very low",
"077 very low",
"078 very low",
"079 very low",
"080 very low",
"081 very low",
"082 very low",
"083 very low",
"084 very low",
"085 very low",
"086 very low",
"087 very low",
"088 very low",
"089 very low",
"090 very low",
"091 very low",
"092 very low",
"093 very low",
"094 very low",
"095 very low",
"096 very low",
"097 very low",
"098 very low",
"099 very low",
"100 very low",
"101 low",
"102 low",
"103 low",
"104 low",
"105 low",
"106 low",
"107 low",
"108 low",
"109 low",
"110 low",
"111 low",
"112 low",
"113 low",
"114 low",
"115 low",
"116 low",
"117 low",
"118 low",
"119 low",
"120 low",
"121 low",
"122 low",
"123 low",
"124 low",
"125 low",
"126 low",
"127 low",
"128 low",
"129 low",
"130 low",
"131 low",
"132 low",
"133 low",
"134 low",
"135 low",
"136 low",
"137 low",
"138 low",
"139 low",
"140 low",
"141 low",
"142 low",
"143 low",
"144 low",
"145 low",
"146 low",
"147 low",
"148 low",
"149 low",
"153 low",
"154 low",
"156 low",
"157 low",
"158 low",
"159 low",
"160 low",
"161 low",
"162 low",
"163 low",
"164 low",
"165 low",
"166 low",
"167 low",
"168 low",
"169 low",
"170 low",
"171 low",
"172 low",
"173 low",
"174 low",
"175 low",
"177 low",
"178 low",
"179 low",
"180 low",
"181 low",
"182 low",
"183 low",
"184 low",
"185 low",
"186 low",
"187 low",
"188 low",
"189 low",
"190 low",
"191 low",
"192 low",
"193 low",
"194 low",
"195 low",
"196 low",
"197 low",
"198 low",
"199 low",
"200 low",
"201 low",
"202 low",
"203 low",
"204 low",
"205 low",
"206 low",
"207 low",
"208 low",
"209 low",
"210 low",
"211 low",
"212 low",
"213 low",
"214 low",
"215 low",
"216 low",
"217 low",
"218 low",
"219 low",
"220 low",
"221 low",
"222 low",
"224 low",
"225 low",
"226 low",
"227 low",
"228 low",
"229 low",
"230 low",
"231 low",
"232 low",
"233 low",
"234 low",
"235 low",
"236 low",
"237 low",
"238 low",
"239 low",
"240 low",
"241 low",
"242 low",
"243 low",
"244 low",
"245 low",
"246 low",
"247 low",
"248 low",
"250 low",
"252 low",
"254 low",
"259 low",
"267 low",
"268 low",
"271 low",
"272 low",
"274 low",
"275 low",
"278 low",
"282 low",
"287 low",
"288 low",
"289 low",
"290 low",
"291 low",
"293 low",
"296 low",
"297 low",
"464 moderate",
"467 moderate",
"485 moderate",
"491 moderate",
"492 moderate",
"496 moderate",
"702 review",
"703 review",
"705 review",
"706 review",
"707 review",
"708 review",
"710 review",
"730 review",
"790 review",
"801 high",
"890 high",
"902 very high",
"906 very high"
],
"day_night": [
"day",
"night"
]
}

BIN
isotonic_model_M2.joblib Normal file

Binary file not shown.

File diff suppressed because it is too large

98
post_processing.py Normal file
View File

@@ -0,0 +1,98 @@
import logging
from typing import Dict
import numpy as np
import pandas as pd
from pre_processing import THX_FIELDS
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
)
logger = logging.getLogger(__name__)
def post_processing_m1(df: pd.DataFrame) -> pd.DataFrame:
df = df.copy()
try:
df["hd_score_m1"] = np.round(
np.minimum(df["prediction"] * 100 + 0.00001, 1) * 85
+ np.maximum(np.log2(df["prediction"] * 100 + 0.000001) * 185, 0),
0,
)
logging.info("hd_score_m1 calculated: %s", df["hd_score_m1"].iloc[0])
except Exception as e:
logging.error("Error processing hd_score_m1 calculations: %s", e)
return df
def post_processing_m2(df: pd.DataFrame) -> pd.DataFrame:
df = df.copy()
try:
df["hd_score_m2"] = np.round(
np.minimum(df["pd_m2"] * 100.0 + 0.00001, 1.0) * 75.0
+ np.maximum(np.log2(df["pd_m2"] * 100.0 + 0.000001) * 180.0, 0.0),
0,
)
df["hd_score_iso_m2"] = np.round(
np.minimum(df["pd_m2_iso"] * 100.0 + 0.00001, 1.0) * 97.0
+ np.maximum(np.log2(df["pd_m2_iso"] * 100.0 + 0.000001) * 246.0, 0.0),
0,
)
logging.info("hd_score_m2 calculated: %s", df["hd_score_m2"].iloc[0])
logging.info("hd_score_iso_m2 calculated: %s", df["hd_score_iso_m2"].iloc[0])
except Exception as e:
logging.error("Error processing hd_score_m2 calculations: %s", e)
return df
def _safe_get(df: pd.DataFrame, column: str):
"""Return scalar from single-row DataFrame, normalizing NaN/None to None."""
if column not in df.columns:
return None
val = df[column].iloc[0]
if isinstance(val, (list, dict)):
return val
try:
if pd.isna(val):
return None
except TypeError:
pass
return val
def post_processing_all(df_m1: pd.DataFrame, df_m2: pd.DataFrame, df_thx: pd.DataFrame) -> Dict[str, object]:
df_m1_scored = post_processing_m1(df_m1)
df_m2_scored = post_processing_m2(df_m2)
row_m1 = df_m1_scored.iloc[0]
row_m2 = df_m2_scored.iloc[0]
result = {
"application_key": row_m1.get("application_key"),
"application_timestamp": str(row_m1.get("application_timestamp")) if row_m1.get("application_timestamp") is not None else None,
"deviceid": row_m1.get("deviceid"),
"fuzzydeviceid": row_m1.get("fuzzydeviceid"),
"application_email_address": row_m1.get("application_email_address"),
"hd_score_m1": row_m1.get("hd_score_m1"),
"hd_score_m2": row_m2.get("hd_score_m2"),
"hd_score_iso_m2": row_m2.get("hd_score_iso_m2"),
"action": None,
}
flattened_thx = {field: _safe_get(df_thx, field) for field in THX_FIELDS if field not in result}
result.update(flattened_thx)
return result
# Legacy entry point for backward compatibility
def post_processing(df: pd.DataFrame) -> Dict[str, object]:
df_scored = post_processing_m1(df)
row = df_scored.iloc[0]
return {
"application_key": row.get("application_key"),
"application_timestamp": str(row.get("application_timestamp")) if row.get("application_timestamp") is not None else None,
"deviceid": row.get("deviceid"),
"fuzzydeviceid": row.get("fuzzydeviceid"),
"application_email_address": row.get("application_email_address"),
"hd_score_m1": row.get("hd_score_m1"),
}
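A quick numeric check of the M1 score curve above, with invented probabilities to show the two regimes of the formula:

```python
import numpy as np

# Same arithmetic as post_processing_m1, isolated for inspection.
def hd_score_m1(p: float) -> float:
    return float(np.round(
        np.minimum(p * 100 + 0.00001, 1) * 85
        + np.maximum(np.log2(p * 100 + 0.000001) * 185, 0),
        0,
    ))

# p = 0.005: p*100 = 0.5, so the log2 term is negative and floors at 0;
# only the linear term contributes: 0.50001 * 85 ~= 42.5 -> 43.
assert hd_score_m1(0.005) == 43.0

# p = 0.04: the linear term saturates at 85 and log2(4) * 185 adds 370 -> 455.
assert hd_score_m1(0.04) == 455.0
```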

577
pre_processing.py Normal file
View File

@@ -0,0 +1,577 @@
import logging
import math
import re
from pathlib import Path
from typing import Dict, Iterable, List, Tuple, Union
import numpy as np
import pandas as pd
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
)
logger = logging.getLogger(__name__)
BASE_DIR = Path(__file__).resolve().parent
M2_LATLONG_REF_PATH = BASE_DIR / "latitute_longitute_reference.csv"
THX_FIELDS = [
"application_key",
"application_timestamp",
"digital_id_first_seen",
"summary_risk_score",
"cpu_clock",
"account_login_first_seen",
"account_telephone_first_seen",
"true_ip_first_seen",
"ssn_hash_first_seen",
"account_email_attributes",
"tps_ip_latitude",
"tps_ip_longitude",
]
# Hardcoded M2 data dictionary (replaces file lookup)
M2_DATA_DICTIONARY: Dict[str, Dict[str, Union[float, str, None]]] = {
"account_email_attributes_challenge_passed": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"account_email_attributes_challenged": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"account_email_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"account_login_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"account_name_activities_trusted_prob": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"account_telephone_attributes_challenge_failed": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"account_telephone_attributes_loan_app": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"account_telephone_attributes_trusted": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"account_telephone_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"accphone_gbl_velocity_hour": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"applicant_age": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"browser_spoof_score": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"day": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"day_cos": {"data_type": "float", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"day_sin": {"data_type": "float", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"di_autofill_count_login": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"digital_id_attributes_challenged": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"digital_id_attributes_trusted": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"digital_id_day_diff": {"data_type": "int", "valid_min": -999999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"digital_id_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"digital_id_month_diff": {"data_type": "int", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"digital_id_trust_score": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"digitalidconfidence": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"dist_dnsip_ref_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"dist_em_ip_ref_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"dist_inputip_ref_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"dist_proxyip_ref_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"dist_trueip_dnsip_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"dist_trueip_em_ip_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"dist_trueip_ref_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"domain_creation_days": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"emailtofullnameconfidence": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"emailtolastnameconfidence": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"first_seen_days": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"fraudrisk": {"data_type": "string", "valid_min": None, "valid_max": None, "observed_cap_min": None, "observed_cap_max": None},
"fuzzy_device_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"fuzzy_device_id_confidence": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"hour": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"input_ip_connection_type": {"data_type": "string", "valid_min": None, "valid_max": None, "observed_cap_min": None, "observed_cap_max": None},
"iptophoneconfidence": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"national_id_first_seen_age": {"data_type": "int", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"overalldigitalidentityscore": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"policy_score": {"data_type": "float", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"proxy_ip_first_seen_age": {"data_type": "int", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"proxy_ip_worst_score": {"data_type": "float", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"proxy_score": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"reasoncode_new_smartid_lt_1wk_global": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"riskrating": {"data_type": "string", "valid_min": None, "valid_max": None, "observed_cap_min": None, "observed_cap_max": None},
"ssn_hash_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"totalhits": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"true_ip_activities_trusted_prob": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"true_ip_attributes_trusted": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"true_ip_attributes_trusted_conf": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"true_ip_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
"true_ip_worst_score": {"data_type": "float", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
"uniquehits": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
}
# Hardcoded one-hot config (parsed_feature, model_var, contains)
M2_ONEHOT_CONFIG: List[Tuple[str, str, str]] = [
("reasoncode", "reasoncode_new_smartid_lt_1wk_global", "new_smartid_lt_1wk_global"),
("account_name_activities", "account_name_activities_trusted_prob", "trusted_prob"),
("account_email_attributes", "account_email_attributes_challenged", "challenged"),
("account_email_attributes", "account_email_attributes_challenge_passed", "challenge_passed"),
("true_ip_attributes", "true_ip_attributes_trusted", "trusted"),
("true_ip_attributes", "true_ip_attributes_trusted_conf", "trusted_conf"),
("digital_id_attributes", "digital_id_attributes_challenged", "challenged"),
("digital_id_attributes", "digital_id_attributes_trusted", "trusted"),
("account_telephone_attributes", "account_telephone_attributes_challenge_failed", "challenge_failed"),
("account_telephone_attributes", "account_telephone_attributes_loan_app", "loan_app"),
("account_telephone_attributes", "account_telephone_attributes_trusted", "trusted"),
("true_ip_activities", "true_ip_activities_trusted_prob", "trusted_prob"),
]
# ----------------------------
# Helpers
# ----------------------------
def _handle_unknowns(X: pd.DataFrame, column: str, known_values: Iterable[str], default_treatment=None):
if column not in X.columns:
return X
known_values = {str(val).lower() for val in known_values}
invalid_values = {None, "none", "nan", pd.NA}
X[column] = X[column].apply(
lambda x: str(x).lower()
if pd.notna(x) and str(x).lower() in known_values
else (default_treatment if pd.notna(x) and str(x).lower() not in invalid_values else np.nan)
)
return X
def _haversine_km(lat1, lon1, lat2, lon2):
if None in (lat1, lon1, lat2, lon2):
return None
try:
rlat1 = float(lat1) * math.pi / 180.0
rlat2 = float(lat2) * math.pi / 180.0
dlat = (float(lat2) - float(lat1)) * math.pi / 180.0
dlon = (float(lon2) - float(lon1)) * math.pi / 180.0
except Exception:
return None
a = (
math.sin(dlat / 2.0) ** 2
+ math.cos(rlat1) * math.cos(rlat2) * math.sin(dlon / 2.0) ** 2
)
a = min(1.0, max(0.0, a))
return 2 * 6371.0088 * math.asin(math.sqrt(a))
def _prep_latlong_ref():
if not M2_LATLONG_REF_PATH.exists():
logger.warning("latitute_longitute_reference.csv missing at %s", M2_LATLONG_REF_PATH)
return pd.DataFrame()
try:
ref = pd.read_csv(M2_LATLONG_REF_PATH, usecols=["postal_code_ref", "latitute_ref", "longitude_ref"])
except Exception:
ref = pd.read_csv(M2_LATLONG_REF_PATH)
# keep lower string version for matching
if "postal_code_ref" in ref.columns:
ref["postal_code_ref"] = ref["postal_code_ref"].astype(str).str.lower()
return ref
def _normalize_zip_for_ref(zip_val):
"""
Normalize zip/postal code values so they match reference CSV keys.
- Floats like 89503.0 -> "89503"
- Int-like strings "89503.0" -> "89503"
Note: we intentionally avoid zero-filling to preserve behaviour seen in UAT references
where leading-zero ZIPs are not matched to the reference table.
"""
if pd.isna(zip_val):
return None
if isinstance(zip_val, (int, float)) and not isinstance(zip_val, bool):
return str(int(zip_val)).lower()
zip_str = str(zip_val).strip()
if zip_str.replace(".", "", 1).isdigit():
try:
return str(int(float(zip_str))).lower()
except Exception:
pass
return zip_str.lower() if zip_str else None
# ----------------------------
# M1 Pre-processing (existing behaviour)
# ----------------------------
def pre_processing_m1(data_df: pd.DataFrame) -> pd.DataFrame:
combined_df = data_df.copy()
combined_df["applicant_age"] = combined_df.apply(
lambda row: pd.to_datetime(row["application_timestamp"]).year - pd.to_datetime(row["application_date_of_birth"]).year
if pd.notnull(row["application_timestamp"]) and pd.notnull(row["application_date_of_birth"])
else None,
axis=1,
)
combined_df["application_timestamp"] = pd.to_datetime(combined_df["application_timestamp"])
combined_df.loc[:, "application_time"] = pd.to_datetime(combined_df["application_timestamp"]).dt.time
combined_df["day"] = combined_df["application_timestamp"].dt.day
combined_df["day_of_week"] = combined_df["application_timestamp"].dt.weekday
combined_df["day_sin"] = np.sin(2 * np.pi * combined_df["day"] / 31)
combined_df["day_cos"] = np.cos(2 * np.pi * combined_df["day"] / 31)
combined_df["day_of_week_sin"] = np.sin(2 * np.pi * combined_df["day_of_week"] / 7)
combined_df["day_of_week_cos"] = np.cos(2 * np.pi * combined_df["day_of_week"] / 7)
def classify_day_night(hour):
if 6 <= hour < 18:
return "Day"
return "Night"
combined_df["hour"] = combined_df["application_time"].apply(lambda x: x.hour if pd.notnull(x) else np.nan)
combined_df["day_night"] = combined_df["hour"].apply(lambda hour: classify_day_night(hour) if pd.notnull(hour) else "Unknown")
combined_df["os_version"] = combined_df["os_version"].apply(
lambda x: x.split(".")[0] if isinstance(x, str) and "." in x else x.split("_")[0] if isinstance(x, str) and "_" in x else x
)
combined_df["Identity_Negative_History"] = combined_df["tmxsummaryreasoncode"].astype(str).str.contains(
"Identity_Negative_History", na=False, regex=True
).astype(int)
combined_df["Device_Negative_History"] = combined_df["tmxsummaryreasoncode"].astype(str).str.contains(
"Device_Negative_History", na=False, regex=True
).astype(int)
combined_df["Level_1_Link_Reject"] = combined_df["tmxsummaryreasoncode"].astype(str).str.contains(
"Level_1_Link_Reject", na=False, regex=True
).astype(int)
combined_df["IP_Negative_History"] = combined_df["tmxsummaryreasoncode"].astype(str).str.contains(
"IP_Negative_History", na=False, regex=True
).astype(int)
combined_df["Identity_Spoofing"] = combined_df["tmxsummaryreasoncode"].astype(str).str.contains(
"Identity_Spoofing", na=False, regex=True
).astype(int)
combined_df["digitalidconfidence"] = pd.to_numeric(combined_df["digitalidconfidence"], errors="coerce").astype("Int64")
combined_df.rename(
columns={
"DigitalIdConfidence": "digitalidconfidence",
},
inplace=True,
)
dtype_dict = {
"applicant_age": int,
"digitalidconfidence": float,
"first_seen_days": float,
"employmentstatus": str,
"ea_score": float,
"trueipgeo": str,
"hour": int,
"email_creation_days": float,
"lengthatjob": float,
"day_cos": float,
"summary_risk_score": float,
"digital_id_trust_score_rating": str,
"day": "int32",
"lengthatbank": float,
"day_of_week_cos": float,
"Level_1_Link_Reject": int,
"Identity_Negative_History": int,
"educationlevel": str,
"os_version": str,
"account_email_worst_score": float,
"true_ip_score": float,
"ip_net_speed_cell": str,
"account_email_score": float,
"day_of_week": "int32",
"true_ip_worst_score": float,
"proxy_ip_worst_score": float,
"day_night": str,
"proxy_ip_score": float,
"monthsatresidence": float,
"Device_Negative_History": int,
"fuzzy_device_score": float,
"day_sin": float,
"ip_region_confidence": float,
"true_ip_state_confidence": float,
"IP_Negative_History": int,
"fuzzy_device_worst_score": float,
"digital_id_confidence_rating" : str,
"day_of_week_sin": float,
"riskrating": str,
"payfrequency": str,
"ownhome": str,
"Identity_Spoofing": int,
}
next_block_cols = ["application_key", "application_timestamp", "deviceid", "fuzzydeviceid", "application_email_address"]
cols_to_keep = [col for col in dtype_dict.keys() if col in combined_df.columns]
final_cols = list(set(next_block_cols).union(set(cols_to_keep)))
for col, dtype in dtype_dict.items():
if col in combined_df.columns:
if dtype == int:
combined_df[col] = pd.to_numeric(combined_df[col], errors="coerce", downcast="integer")
elif dtype == float:
combined_df[col] = pd.to_numeric(combined_df[col], errors="coerce", downcast="float")
elif dtype == str:
combined_df[col] = combined_df[col].astype(str)
capping_dict = {
"applicant_age": (18, 93),
"digitalidconfidence": (0, 9017),
"first_seen_days": (0, 10486),
"ea_score": (1, 930),
"hour": (0, 23),
"email_creation_days": (2438, 9661),
"lengthatjob": (1, 24),
"day_cos": (-0.9948693234, 1),
"summary_risk_score": (-100, 30),
"day": (1, 31),
"lengthatbank": (0, 25),
"day_of_week_cos": (-0.9009688679, 1),
"Level_1_Link_Reject": (0, 1),
"Identity_Negative_History": (0, 1),
"account_email_worst_score": (-52, 0),
"true_ip_score": (-38, 49),
"account_email_score": (-18, 9),
"day_of_week": (0, 6),
"true_ip_worst_score": (-100, 0),
"proxy_ip_worst_score": (-100, 0),
"proxy_ip_score": (-29, 60),
"monthsatresidence": (0, 25),
"Device_Negative_History": (0, 1),
"fuzzy_device_score": (-29, 14),
"day_sin": (-0.9987165072, 0.9987165072),
"ip_region_confidence": (75, 99),
"IP_Negative_History": (0, 1),
"fuzzy_device_worst_score": (-100, 0),
"day_of_week_sin": (-0.9749279122, 0.9749279122),
"Identity_Spoofing": (0, 1),
}
for column, (cap_min, cap_max) in capping_dict.items():
if column in combined_df.columns:
combined_df[column] = combined_df[column].clip(lower=cap_min, upper=cap_max)
unknown_treatments = {
"employmentstatus": {
"valid_values": [
"disability",
"fixed income",
"full time employed",
"part time employment",
"retired benefits",
"self employed",
"student",
"unemployed",
"welfare",
],
"default_treatment": "other",
},
"trueipgeo": {"valid_values": ["US"], "default_treatment": "other"},
"digital_id_trust_score_rating": {"valid_values": ["very_high", "high", "neutral", "low"], "default_treatment": "very_low"},
"educationlevel": {
"valid_values": ["associate's degree", "bachelor's degree", "doctorate", "high school", "master's degree"],
"default_treatment": "other",
},
"os_version": {
"valid_values": ["18", "17", "16", "15", "14", "13", "12", "11", "10", "9", "8"],
"default_treatment": "unknown",
},
"ip_net_speed_cell": {
"valid_values": [
"broadband",
"cable",
"dialup",
"dsl",
"fixed wireless",
"mobile",
"mobile wireless",
"ocx",
"satellite",
"t1",
"tx",
"wireless",
"xdsl",
],
"default_treatment": "mobile",
},
"digital_id_confidence_rating": {"valid_values": ["high", "medium", "very_high"], "default_treatment": "very_low"},
"riskrating": {"valid_values": ["low", "medium", "neutral", "trusted"], "default_treatment": "high"},
"ownhome": {"valid_values": ["true", "false"], "default_treatment": np.nan},
}
for column, treatment in unknown_treatments.items():
combined_df = _handle_unknowns(combined_df, column, treatment["valid_values"], treatment["default_treatment"])
payfrequency_map = {"biweekly": ["biweekly", "bi-weekly", "bi weekly", "bw"], "semimonthly": ["semi-monthly", "semimonthly"]}
combined_df["payfrequency"] = combined_df["payfrequency"].apply(
lambda x: next((key for key, values in payfrequency_map.items() if str(x).lower() in values), np.nan)
)
return combined_df[final_cols]
# ----------------------------
# M2 Pre-processing
# ----------------------------
def _apply_onehot_features(df: pd.DataFrame) -> pd.DataFrame:
df = df.copy()
for parsed_feature, model_var, contains_val in M2_ONEHOT_CONFIG:
value = df.get(parsed_feature, pd.Series([None])).iloc[0]
flag = 0
if isinstance(value, list):
flag = int(any(contains_val in str(v).lower() or re.sub(r"[^a-z0-9]+", " ", contains_val) in re.sub(r"[^a-z0-9]+", " ", str(v).lower()) for v in value))
elif isinstance(value, str):
val_norm = re.sub(r"[^a-z0-9]+", " ", value.lower())
contains_norm = re.sub(r"[^a-z0-9]+", " ", contains_val)
flag = int(contains_val in value.lower() or contains_norm in val_norm)
df[model_var] = flag
return df
def _extract_first_seen_days(ts_value, app_ts):
ts = pd.to_datetime(ts_value, errors="coerce", utc=True)
app = pd.to_datetime(app_ts, errors="coerce", utc=True)
# align to naive for subtraction
if isinstance(ts, pd.Timestamp) and ts.tzinfo is not None:
ts = ts.tz_localize(None)
if isinstance(app, pd.Timestamp) and app.tzinfo is not None:
app = app.tz_localize(None)
if pd.isna(ts) or pd.isna(app):
return None
return (app.normalize() - ts.normalize()).days
def _to_naive_ts(val):
ts = pd.to_datetime(val, errors="coerce", utc=True)
if isinstance(ts, pd.Timestamp) and ts.tzinfo is not None:
ts = ts.tz_localize(None)
return ts
def _month_diff(earlier, later):
"""Month difference (earlier - later) using year/month buckets."""
ts_earlier = _to_naive_ts(earlier)
ts_later = _to_naive_ts(later)
if pd.isna(ts_earlier) or pd.isna(ts_later):
return None
return (ts_earlier.year - ts_later.year) * 12 + (ts_earlier.month - ts_later.month)
def pre_processing_m2(data_df: pd.DataFrame) -> pd.DataFrame:
df = data_df.copy()
df.columns = df.columns.str.lower()
# Timestamp-derived features (align with M1 behaviour to keep probabilities consistent)
df["application_timestamp"] = pd.to_datetime(df["application_timestamp"], errors="coerce", utc=True)
df["day"] = df["application_timestamp"].dt.day
df["hour"] = df["application_timestamp"].dt.hour
df["day_sin"] = np.sin(2 * np.pi * df["day"] / 31)
df["day_cos"] = np.cos(2 * np.pi * df["day"] / 31)
def _classify_day_night(hour_val):
if pd.isna(hour_val):
return np.nan
return "day" if 6 <= hour_val < 18 else "night"
df["day_night"] = df["hour"].apply(_classify_day_night)
# Apply onehot flags from attributes
df = _apply_onehot_features(df)
# Distances
lat_ref = _prep_latlong_ref()
if not lat_ref.empty and "zip" in df.columns:
zip_value = df["zip"].iloc[0]
zip_lookup = _normalize_zip_for_ref(zip_value)
ref_row = lat_ref[lat_ref["postal_code_ref"] == zip_lookup] if zip_lookup else pd.DataFrame()
lat_ref_val = ref_row["latitute_ref"].iloc[0] if not ref_row.empty else None
lon_ref_val = ref_row["longitude_ref"].iloc[0] if not ref_row.empty else None
else:
lat_ref_val = None
lon_ref_val = None
df["dist_inputip_ref_km"] = df.apply(
lambda r: _haversine_km(lat_ref_val, lon_ref_val, r.get("input_ip_latitude"), r.get("input_ip_longitude")), axis=1
)
df["dist_em_ip_ref_km"] = df.apply(
lambda r: _haversine_km(lat_ref_val, lon_ref_val, r.get("ip_latitude") or r.get("tps_ip_latitude"), r.get("ip_longitude") or r.get("tps_ip_longitude")),
axis=1,
)
df["dist_proxyip_ref_km"] = df.apply(
lambda r: _haversine_km(lat_ref_val, lon_ref_val, r.get("proxy_ip_latitude"), r.get("proxy_ip_longitude")), axis=1
)
df["dist_dnsip_ref_km"] = df.apply(
lambda r: _haversine_km(lat_ref_val, lon_ref_val, r.get("dns_ip_latitude"), r.get("dns_ip_longitude")), axis=1
)
df["dist_trueip_ref_km"] = df.apply(
lambda r: _haversine_km(lat_ref_val, lon_ref_val, r.get("true_ip_latitude"), r.get("true_ip_longitude")), axis=1
)
df["dist_trueip_em_ip_km"] = df.apply(
lambda r: _haversine_km(r.get("true_ip_latitude"), r.get("true_ip_longitude"), r.get("ip_latitude") or r.get("tps_ip_latitude"), r.get("ip_longitude") or r.get("tps_ip_longitude")),
axis=1,
)
df["dist_trueip_dnsip_km"] = df.apply(
lambda r: _haversine_km(r.get("true_ip_latitude"), r.get("true_ip_longitude"), r.get("dns_ip_latitude"), r.get("dns_ip_longitude")),
axis=1,
)
# Ages
app_ts_val = df.get("application_timestamp", pd.Series([None])).iloc[0]
def _safe_day_diff(row):
if not row.get("digital_id_first_seen"):
return None
val = _extract_first_seen_days(row.get("digital_id_first_seen"), app_ts_val)
return -val if val is not None else None
df["digital_id_day_diff"] = df.apply(_safe_day_diff, axis=1)
df["digital_id_month_diff"] = df.apply(lambda r: _month_diff(r.get("digital_id_first_seen"), app_ts_val), axis=1)
for col_name in [
"digital_id_first_seen",
"account_email_first_seen",
"account_login_first_seen",
"account_telephone_first_seen",
"true_ip_first_seen",
"ssn_hash_first_seen",
"fuzzy_device_first_seen",
"national_id_first_seen",
"proxy_ip_first_seen",
]:
out_col = f"{col_name}_age"
df[out_col] = df.apply(lambda r: _extract_first_seen_days(r.get(col_name), app_ts_val), axis=1)
# applicant_age for consistency if not present
if "applicant_age" not in df.columns:
df["applicant_age"] = df.apply(
lambda row: pd.to_datetime(row["application_timestamp"]).year - pd.to_datetime(row["application_date_of_birth"]).year
if pd.notnull(row.get("application_timestamp")) and pd.notnull(row.get("application_date_of_birth"))
else None,
axis=1,
)
# Safe casting and capping using data dictionary
for var_name, rules in M2_DATA_DICTIONARY.items():
if var_name not in df.columns:
continue
col = pd.to_numeric(df[var_name], errors="coerce") if rules.get("data_type") in ["float", "int"] else df[var_name]
if rules.get("data_type") == "int":
col = col.astype("float")
valid_min = rules.get("valid_min")
valid_max = rules.get("valid_max")
observed_min = rules.get("observed_cap_min")
observed_max = rules.get("observed_cap_max")
if observed_min is not None or observed_max is not None:
col = col.clip(lower=observed_min, upper=observed_max)
# if valid_min is not None:
# col = col.where(col >= valid_min, np.nan)
# if valid_max is not None:
# col = col.where(col <= valid_max, np.nan)
df[var_name] = col
return df
def pre_processing_all(data_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
# Ensure requested THX fields exist so downstream packaging always has keys
df_base = data_df.copy()
for field in THX_FIELDS:
if field in df_base.columns:
df_base[field] = df_base[field].astype(str)
else:
df_base[field] = None
df_thx = df_base[THX_FIELDS].copy()
df_m1 = pre_processing_m1(df_base.copy())
df_m2 = pre_processing_m2(df_base.copy())
return df_m1, df_m2, df_thx
# Backwards compatible entry point (used by legacy code/tests if any)
def pre_processing(data_df: pd.DataFrame) -> pd.DataFrame:
df_m1, _, _ = pre_processing_all(data_df)
return df_m1
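Two of the helpers above in isolation, as a sanity check (coordinates and ZIPs are arbitrary examples; `_haversine_km` and `_normalize_zip_for_ref` are module-private, imported here only for illustration):

```python
from pre_processing import _haversine_km, _normalize_zip_for_ref

# Great-circle distance, roughly New York -> Los Angeles (~3940 km).
print(_haversine_km(40.7128, -74.0060, 34.0522, -118.2437))

# Any missing or non-numeric coordinate degrades to None instead of raising,
# which is why the dist_* features can simply be NaN downstream.
assert _haversine_km(None, -74.0060, 34.0522, -118.2437) is None
assert _haversine_km("n/a", -74.0060, 34.0522, -118.2437) is None

# ZIP normalization: float-ish values collapse to int-like strings, and leading
# zeros are deliberately dropped (so "02134" will not match the reference CSV).
assert _normalize_zip_for_ref(89503.0) == "89503"
assert _normalize_zip_for_ref("89503.0") == "89503"
assert _normalize_zip_for_ref("02134") == "2134"
```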

116
processing.py Normal file
View File

@@ -0,0 +1,116 @@
import json
from functools import lru_cache
from pathlib import Path
import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
# BASE_DIR = Path(__file__).resolve().parent
# M1_MODEL_PATH = BASE_DIR / "xgboost_model_M1.joblib"
# M1_CATEGORY_ORDERS_PATH = BASE_DIR / "category_orders_train_M1.json"
# M2_MODEL_PATH = BASE_DIR / "xgboost_model_M2.joblib"
# M2_CATEGORY_ORDERS_PATH = BASE_DIR / "category_orders_train_M2.json"
# M2_ISO_PATH = BASE_DIR / "isotonic_model_M2.joblib"
M1_MODEL_PATH = "./xgboost_model_M1.joblib"
M1_CATEGORY_ORDERS_PATH = "./category_orders_train_M1.json"
M2_MODEL_PATH = "./xgboost_model_M2.joblib"
M2_CATEGORY_ORDERS_PATH = "./category_orders_train_M2.json"
M2_ISO_PATH = "./isotonic_model_M2.joblib"
def _load_category_orders(path: Path) -> dict:
with open(path, "r") as f:
return json.load(f)
@lru_cache(maxsize=1)
def _load_m1_model():
return joblib.load(M1_MODEL_PATH)
@lru_cache(maxsize=1)
def _load_m2_model():
return joblib.load(M2_MODEL_PATH)
@lru_cache(maxsize=1)
def _load_m2_iso_model():
return joblib.load(M2_ISO_PATH)
@lru_cache(maxsize=None)
def _load_category_orders_cached(path: Path):
# Cache category orders per path to avoid disk I/O on each scoring
return _load_category_orders(path)
def _prepare_m1(df: pd.DataFrame, category_orders: dict) -> pd.DataFrame:
df = df.copy()
for col, categories in category_orders.items():
if col not in df.columns:
df[col] = np.nan
df[col] = df[col].replace([None, "", "null", np.nan, "nan", " "], np.nan)
df[col] = pd.Categorical(df[col], categories=categories, ordered=True)
return df
def _prepare_m2(df: pd.DataFrame, category_orders: dict) -> pd.DataFrame:
df = df.copy()
for col, categories in category_orders.items():
if col not in df.columns:
df[col] = np.nan
df[col] = df[col].astype(str).str.lower()
df[col] = df[col].replace(["", "null", "nan", "none", " "], np.nan)
df[col] = pd.Categorical(df[col], categories=categories, ordered=True)
return df
def processing_m1(input_data: pd.DataFrame) -> pd.DataFrame:
df = pd.DataFrame(input_data)
if df.empty:
raise ValueError("Input DataFrame is empty.")
model = _load_m1_model()
df.rename(columns={"riskrating": "RiskRating", "trueipgeo": "TrueIpGeo"}, inplace=True)
category_orders = _load_category_orders_cached(M1_CATEGORY_ORDERS_PATH)
df = _prepare_m1(df, category_orders)
expected_features = model.feature_names
dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=np.nan)
predictions = model.predict(dmatrix)
df["prediction"] = predictions
return df
def processing_m2(input_data: pd.DataFrame) -> pd.DataFrame:
df = pd.DataFrame(input_data)
if df.empty:
raise ValueError("Input DataFrame is empty.")
model = _load_m2_model()
category_orders = _load_category_orders_cached(M2_CATEGORY_ORDERS_PATH)
df = _prepare_m2(df, category_orders)
expected_features = model.feature_names
for feature in expected_features:
if feature not in df.columns:
df[feature] = np.nan
dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=np.nan)
pd_arr = model.predict(dmatrix)
df["pd_m2"] = pd_arr
iso_model = _load_m2_iso_model()
df["pd_m2_iso"] = iso_model.predict(pd_arr)
return df
def processing_all(df_m1: pd.DataFrame, df_m2: pd.DataFrame, df_thx: pd.DataFrame):
return processing_m1(df_m1), processing_m2(df_m2), df_thx
# Legacy single-model entry point
def processing(input_data: pd.DataFrame) -> pd.DataFrame:
return processing_m1(input_data)
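A small sketch of the caching behaviour above: the `@lru_cache(maxsize=1)` loaders make `joblib.load` a one-time cost per process, assuming the model artifacts sit on the relative paths the module expects:

```python
from processing import _load_m1_model

m1_a = _load_m1_model()   # first call: reads ./xgboost_model_M1.joblib from disk
m1_b = _load_m1_model()   # second call: served from the cache
assert m1_a is m1_b       # same object, not a re-load

# CacheInfo(hits=1, misses=1, maxsize=1, currsize=1)
print(_load_m1_model.cache_info())
```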

View File

@@ -1 +1,107 @@
-{}
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"properties": {
"application_key": {
"type": ["string", "null"],
"description": "Unique identifier for the application."
},
"application_timestamp": {
"type": ["string", "null"],
"description": "Timestamp when the application was submitted in UTC."
},
"application_ssn": {
"type": ["string", "null"],
"description": "Social Security Number of the applicant."
},
"application_email_address": {
"type": ["string", "null"],
"description": "Email address of the applicant."
},
"application_bank_account_number": {
"type": ["string", "null"],
"description": "Bank account number of the applicant."
},
"application_is_rejected": {
"type": ["boolean", "null"],
"description": "Indicates whether the application was rejected."
},
"application_date_of_birth": {
"type": ["string", "null"],
"description": "Date of birth of the applicant."
},
"EventType": {
"type": ["string", "null"],
"description": "Type of event associated with the application."
},
"RiskRating": {
"type": ["string", "null"],
"description": "Risk rating assigned to the application."
},
"TmxSummaryReasonCode": {
"type": ["string", "null"],
"description": "Reason code summary from third-party risk assessment."
},
"DigitalIdConfidence": {
"type": ["string", "null"],
"description": "Confidence score for the digital identity of the applicant."
},
"TrueIpGeo": {
"type": ["string", "null"],
"description": "Geolocation information of the true IP address used in the application."
},
"Blob": {
"type": ["string", "null"],
"description": "Raw data blob containing additional information related to the application."
},
"DeviceId": {
"type": ["string", "null"],
"description": "Unique identifier for the device used to submit the application."
},
"FuzzyDeviceId": {
"type": ["string", "null"],
"description": "Hashed or partially anonymized identifier for the device."
},
"ownhome": {
"type": ["boolean", "null"],
"description": "Indicates whether the applicant owns a home."
},
"employmentstatus": {
"type": ["string", "null"],
"description": "Employment status of the applicant."
},
"lengthatjob": {
"type": ["number", "null"],
"description": "Length of time (in months) the applicant has been at their current job."
},
"payfrequency": {
"type": ["string", "null"],
"description": "Frequency of pay for the applicant (e.g., weekly, biweekly, monthly)."
},
"lengthatbank": {
"type": ["string", "null"],
"description": "Length of time the applicant has been with their bank."
},
"educationlevel": {
"type": ["string", "null"],
"description": "Highest level of education attained by the applicant."
},
"monthsatresidence": {
"type": ["number", "null"],
"description": "Number of months the applicant has lived at their current residence."
},
"state": {
"type": ["string", "null"],
"description": "State of the current residence."
},
"zip": {
"type": ["string", "null"],
"description": "Zip of the current residence."
},
"ReasonCode": {
"type": ["string", "null"],
"description": "Reason code from ThreatMetrix."
}
},
"required": []
}
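A sketch of exercising the input contract above with the third-party `jsonschema` package (not one of this repo's requirements; the filename and payload are invented for illustration):

```python
import json

from jsonschema import validate  # pip install jsonschema

with open("input_schema.json") as f:  # hypothetical filename; the diff view omits it
    schema = json.load(f)

payload = {
    "application_key": "app-123",
    "application_is_rejected": None,
    "ownhome": True,
    "zip": "86445",
}

# Passes: "required" is empty and every supplied value matches its declared type.
validate(instance=payload, schema=schema)
```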

View File

@@ -1 +1,8 @@
-{}
pandas==2.2.3
numpy==2.2.3
xgboost==2.1.4
joblib==1.4.2
jmespath==1.0.1
regex==2023.12.25
json_repair==0.47.6
scikit-learn==1.5.2

View File

@@ -1 +1,82 @@
-{}
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"properties": {
"application_key": {
"type": ["string", "null"],
"description": "Application Key"
},
"application_timestamp": {
"type": ["string", "null"],
"description": "Application Timestamp"
},
"deviceid": {
"type": ["string", "null"],
"description": "Deviceid"
},
"fuzzydeviceid": {
"type": ["string", "null"],
"description": "Fuzzy Deviceid"
},
"application_email_address": {
"type": ["string", "null"],
"description": "Application Email Address"
},
"hd_score_m1": {
"type": ["number", "null"],
"description": "HD Fraud Score M1"
},
"hd_score_m2": {
"type": ["number", "null"],
"description": "HD Fraud Score M2"
},
"hd_score_iso_m2": {
"type": ["number", "null"],
"description": "HD Fraud Score M2 Scaled"
},
"digital_id_first_seen": {
"type": ["string", "null"],
"description": "Digital ID first seen timestamp"
},
"summary_risk_score": {
"type": ["string", "null"],
"description": "Summary risk score"
},
"cpu_clock": {
"type": ["string", "null"],
"description": "CPU clock value from device profiling"
},
"account_login_first_seen": {
"type": ["string", "null"],
"description": "Account login first seen timestamp"
},
"account_telephone_first_seen": {
"type": ["string", "null"],
"description": "Account telephone first seen timestamp"
},
"true_ip_first_seen": {
"type": ["string", "null"],
"description": "True IP first seen timestamp"
},
"ssn_hash_first_seen": {
"type": ["string", "null"],
"description": "SSN hash first seen timestamp"
},
"account_email_attributes": {
"type": ["string", "null"],
"description": "Account email attributes"
},
"tps_ip_latitude": {
"type": ["string", "null"],
"description": "TPS IP latitude"
},
"tps_ip_longitude": {
"type": ["string", "null"],
"description": "TPS IP longitude"
},
"action": {
"type": ["string", "null"],
"description": "Recommended Action."
}
}
}

67
test_block.py Normal file

File diff suppressed because one or more lines are too long

BIN
xgboost_model.joblib Normal file

Binary file not shown.

BIN
xgboost_model_M1.joblib Normal file

Binary file not shown.

BIN
xgboost_model_M2.joblib Normal file

Binary file not shown.