import logging

import jmespath
import json_repair
import pandas as pd
import regex as re

from pre_processing import pre_processing_all
from processing import processing_all
from post_processing import post_processing_all

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
)
logger = logging.getLogger(__name__)
# Matches strings that look like a JSON object/array, optionally prefixed with
# one or more '?' characters (seen in some upstream payloads).
_JSON_LIKE = re.compile(r"^\s*\?*[\{\[].*[\}\]]\s*$", re.DOTALL)


def extract_value(blob, expression):
    """Evaluate a JMESPath expression against blob, returning None on any error."""
    try:
        return jmespath.search(expression, blob)
    except Exception:
        return None


def coalesce(*args):
    """Return the first argument that is not None, or None if all are None."""
    for value in args:
        if value is not None:
            return value
    return None
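
# Illustrative usage (assumed shapes, not part of the pipeline): extract_value
# returns None instead of raising when a path does not exist, so coalesce can
# walk a list of candidate expressions in priority order, e.g.:
#
#     blob = {"Blob": {"true_ip_score": 42}}
#     coalesce(
#         extract_value(blob, "Blob.missing_field"),   # -> None
#         extract_value(blob, "Blob.true_ip_score"),   # -> 42
#     )                                                # -> 42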


def deep_repair(obj):
    # 1) If it's a string that *looks* like JSON (with or without a leading '?'),
    #    strip exactly one leading '?', re-parse, and recurse.
    if isinstance(obj, str):
        s = obj.strip()
        if _JSON_LIKE.match(s):
            if s.startswith("?"):
                s = s[1:]
            parsed = json_repair.loads(s)
            return deep_repair(parsed)
        return obj
    # 2) Dict: recurse on each value
    if isinstance(obj, dict):
        return {k: deep_repair(v) for k, v in obj.items()}
    # 3) List: recurse on each element
    if isinstance(obj, list):
        return [deep_repair(v) for v in obj]
    # 4) Otherwise, leave it alone
    return obj
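
# Illustrative sketch (assumed input shape, not part of the pipeline): deep_repair
# normalizes doubly-encoded payloads, e.g. a '?'-prefixed JSON string whose values
# are themselves JSON-encoded strings:
#
#     deep_repair('?{"a": 1, "nested": "{\\"b\\": 2}"}')
#     # -> {'a': 1, 'nested': {'b': 2}}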


def sanitize_blob(blob):
    """Best-effort repair of the raw blob; returns None if repair itself fails."""
    try:
        return deep_repair(blob)
    except Exception as e:
        logger.error("Failed to sanitize blob: %s", e)
        return None


# Expressions to extract values (M1 + added M2 fields). Each key maps to a list
# of JMESPath expressions tried in priority order; the first non-None hit wins.
expressions = {
    # M1 (existing)
    "first_seen_days": [
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].first_seen_days)[0]",
        'Blob."emailage.emailriskscore.first_seen_days"',
        "Blob.tps_vendor_raw_response.query.results[0].first_seen_days",
    ],
    "ea_score": [
        "Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].EAScore",
        'Blob."emailage.emailriskscore.eascore"',
        "Blob.tps_vendor_raw_response.query.results[0].EAScore",
    ],
    "email_creation_days": [
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].email_creation_days)[0]",
        "Blob.tps_vendor_raw_response.query.results[0].email_creation_days",
    ],
    "summary_risk_score": ["Blob.summary_risk_score"],
    "digital_id_trust_score_rating": ["Blob.digital_id_trust_score_rating"],
    "os_version": ["Blob.os_version"],
    "account_email_worst_score": ["Blob.account_email_worst_score"],
    "true_ip_score": ["Blob.true_ip_score"],
    "ip_net_speed_cell": [
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ip_netSpeedCell)[0]",
        "Blob.tps_vendor_raw_response.query.results[0].ip_netSpeedCell",
    ],
    "account_email_score": ["Blob.account_email_score"],
    "true_ip_worst_score": ["Blob.true_ip_worst_score"],
    "proxy_ip_worst_score": ["Blob.proxy_ip_worst_score"],
    "proxy_ip_score": ["Blob.proxy_ip_score"],
    "fuzzy_device_score": ["Blob.fuzzy_device_score"],
    "ip_region_confidence": [
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ip_regionconf)[0]",
        "Blob.tps_vendor_raw_response.query.results[0].ip_regionconf",
    ],
    "true_ip_state_confidence": ["Blob.true_ip_state_confidence"],
    "fuzzy_device_worst_score": ["Blob.fuzzy_device_worst_score"],
    "digital_id_confidence_rating": ["Blob.digital_id_confidence_rating"],
    "trueipgeo": ["TrueIpGeo", "Blob.true_ip_geo"],
    # M2 additions
    "policy_score": ["Blob.policy_score"],
    "digital_id_trust_score": ["Blob.digital_id_trust_score"],
    "proxy_score": ["Blob.proxy_score"],
    "browser_spoof_score": ["Blob.browser_spoof_score"],
    "input_ip_connection_type": ["Blob.input_ip_connection_type"],
    "fuzzy_device_id_confidence": ["Blob.fuzzy_device_id_confidence"],
    "fraudrisk": [
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].fraudRisk)[0]",
        "Blob.tps_vendor_raw_response.query.results[0].fraudRisk",
        'Blob."emailage.emailriskscore.fraudRisk"',
    ],
    "overalldigitalidentityscore": [
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].overallDigitalIdentityScore)[0]",
        "Blob.tps_vendor_raw_response.query.results[0].overallDigitalIdentityScore",
        'Blob."emailage.emailriskscore.overallDigitalIdentityScore"',
    ],
    "totalhits": [
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].totalhits)[0]",
        "Blob.tps_vendor_raw_response.query.results[0].totalhits",
        'Blob."emailage.emailriskscore.totalhits"',
    ],
    "uniquehits": [
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].uniquehits)[0]",
        "Blob.tps_vendor_raw_response.query.results[0].uniquehits",
        'Blob."emailage.emailriskscore.uniquehits"',
    ],
    "emailtofullnameconfidence": [
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].emailToFullNameConfidence)[0]",
        "Blob.tps_vendor_raw_response.query.results[0].emailToFullNameConfidence",
        'Blob."emailage.emailriskscore.emailToFullNameConfidence"',
    ],
    "emailtolastnameconfidence": [
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].emailToLastNameConfidence)[0]",
        "Blob.tps_vendor_raw_response.query.results[0].emailToLastNameConfidence",
        'Blob."emailage.emailriskscore.emailToLastNameConfidence"',
    ],
    "domain_creation_days": [
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].domain_creation_days)[0]",
        "Blob.tps_vendor_raw_response.query.results[0].domain_creation_days",
        'Blob."emailage.emailriskscore.domain_creation_days"',
    ],
    "iptophoneconfidence": [
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ipToPhoneConfidence)[0]",
        "Blob.tps_vendor_raw_response.query.results[0].ipToPhoneConfidence",
        'Blob."emailage.emailriskscore.ipToPhoneConfidence"',
    ],
    "di_autofill_count_login": [
        "Blob.tmx_variables.di_autofill_count_login",
        "Blob.policy_details_api.policy_detail_api.customer.rules.vars.variable.di_autofill_count_login",
    ],
    "accphone_gbl_velocity_hour": [
        "Blob.tmx_variables.accphone_gbl_velocity_hour",
        "Blob.tmx_variables._accphone_gbl_velocity_hour",
    ],
    # Lat/long fields for distance engineering
    "ip_latitude": [
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ip_latitude)[0]",
        "Blob.tps_vendor_raw_response.query.results[0].ip_latitude",
    ],
    "ip_longitude": [
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ip_longitude)[0]",
        "Blob.tps_vendor_raw_response.query.results[0].ip_longitude",
    ],
    "tps_ip_latitude": ["Blob.tps_vendor_raw_response.query.results[0].ip_latitude"],
    "tps_ip_longitude": ["Blob.tps_vendor_raw_response.query.results[0].ip_longitude"],
    "true_ip_latitude": ["Blob.true_ip_latitude"],
    "true_ip_longitude": ["Blob.true_ip_longitude"],
    "proxy_ip_latitude": ["Blob.proxy_ip_latitude"],
    "proxy_ip_longitude": ["Blob.proxy_ip_longitude"],
    "dns_ip_latitude": ["Blob.dns_ip_latitude"],
    "dns_ip_longitude": ["Blob.dns_ip_longitude"],
    "input_ip_latitude": ["Blob.input_ip_latitude"],
    "input_ip_longitude": ["Blob.input_ip_longitude"],
    # First-seen timestamps for age deltas
    "digital_id_first_seen": ["Blob.digital_id_first_seen"],
    "account_email_first_seen": ["Blob.account_email_first_seen"],
    "account_login_first_seen": ["Blob.account_login_first_seen"],
    "account_telephone_first_seen": ["Blob.account_telephone_first_seen"],
    "true_ip_first_seen": ["Blob.true_ip_first_seen"],
    "ssn_hash_first_seen": ["Blob.ssn_hash_first_seen"],
    "fuzzy_device_first_seen": ["Blob.fuzzy_device_first_seen"],
    "national_id_first_seen": ["Blob.national_id_first_seen"],
    "proxy_ip_first_seen": ["Blob.proxy_ip_first_seen"],
    # Attribute arrays (used for one-hot style parsing)
    "account_name_activities": ["Blob.account_name_activities"],
    "account_email_attributes": ["Blob.account_email_attributes"],
    "true_ip_attributes": ["Blob.true_ip_attributes"],
    "true_ip_activities": ["Blob.true_ip_activities"],
    "digital_id_attributes": ["Blob.digital_id_attributes"],
    "account_telephone_attributes": ["Blob.account_telephone_attributes"],
    "cpu_clock": ["Blob.cpu_clock"],
}
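
# Illustrative sketch (hypothetical blob, not part of the pipeline): for keys with
# several candidate paths, later expressions cover older payload layouts. For
# "ea_score", a blob using the flattened Emailage key resolves via the second
# expression:
#
#     blob = {"Blob": {"emailage.emailriskscore.eascore": "812"}}
#     [extract_value(blob, e) for e in expressions["ea_score"]]
#     # -> [None, '812', None]  -> coalesce picks '812'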


def __main__(
    # Application ->
    application_key: str,
    application_timestamp: str,
    application_ssn: str,
    application_email_address: str,
    application_bank_account_number: str,
    application_is_rejected: str,
    application_date_of_birth: str,
    # uprovaloanapplication ->
    educationlevel: str,
    employmentstatus: str,
    lengthatbank: str,
    lengthatjob: str,
    ownhome: str,
    payfrequency: str,
    monthsatresidence: str,
    state: str,
    zip: str,  # shadows the built-in zip() inside this function
    # thxresponse ->
    EventType: str,
    DigitalIdConfidence: str,
    RiskRating: str,
    TmxSummaryReasonCode: str,
    TrueIpGeo: str,
    Blob: str,
    DeviceId: str,
    FuzzyDeviceId: str,
    ReasonCode: str,
) -> dict:
    # Convert input parameters into a flat dictionary
    data = {
        "application_key": application_key,
        "application_timestamp": application_timestamp,
        "application_ssn": application_ssn,
        "application_email_address": application_email_address,
        "application_bank_account_number": application_bank_account_number,
        "application_is_rejected": application_is_rejected,
        "application_date_of_birth": application_date_of_birth,
        "educationlevel": educationlevel,
        "employmentstatus": employmentstatus,
        "lengthatbank": lengthatbank,
        "lengthatjob": lengthatjob,
        "ownhome": ownhome,
        "payfrequency": payfrequency,
        "monthsatresidence": monthsatresidence,
        "state": state,
        "zip": zip,
        "EventType": EventType,
        "DigitalIdConfidence": DigitalIdConfidence,
        "RiskRating": RiskRating,
        "TmxSummaryReasonCode": TmxSummaryReasonCode,
        "TrueIpGeo": TrueIpGeo,
        "Blob": Blob,
        "DeviceId": DeviceId,
        "FuzzyDeviceId": FuzzyDeviceId,
        "ReasonCode": ReasonCode,
    }
    # Convert the dictionary to a single-row DataFrame
    combined_df = pd.DataFrame([data])
    combined_df.columns = combined_df.columns.str.lower()
    # Uncomment below for testing with Uprova batch data
    # combined_df["educationlevel"] = None
    # combined_df["monthsatresidence"] = None
    # combined_df["ownhome"] = False
    # combined_df["lengthatbank"] = 0
    combined_df["application_email_address"] = combined_df["application_email_address"].str.lower()
    if Blob:
        combined_df["blob"] = combined_df["blob"].apply(sanitize_blob)
        # Step 2: Extract values using the expressions dictionary
        for column, expressions_list in expressions.items():

            def _extract_with_fallback(blob_obj):
                # Try each expression as written, then retry with the leading
                # "Blob." prefix stripped (some payloads are already unwrapped).
                values = []
                for expr in expressions_list:
                    val = extract_value(blob_obj, expr)
                    if val is None and isinstance(expr, str) and expr.startswith("Blob."):
                        val = extract_value(blob_obj, expr[len("Blob.") :])
                    values.append(val)
                return coalesce(*values)

            extracted = combined_df["blob"].apply(_extract_with_fallback)
            if column in combined_df.columns:
                # Keep any existing column value wherever extraction came up empty.
                combined_df[column] = extracted.where(extracted.notnull(), combined_df[column])
            else:
                combined_df[column] = extracted
        # logger.info("pre_flowx data")
        # logger.info(combined_df.iloc[0].drop("blob").to_dict())
    else:
        # No blob at all: still create the columns so downstream steps see them.
        for column in expressions:
            combined_df[column] = None
        # logger.info("pre_flowx data")
        # logger.info(combined_df.iloc[0].to_dict())
    df_m1, df_m2, df_thx = pre_processing_all(combined_df)
    # logger.info("pre_processed data m1")
    # logger.info(df_m1.iloc[0].to_dict())
    # logger.info("pre_processed data m2")
    # logger.info(df_m2.iloc[0].to_dict())
    processed_m1, processed_m2, df_thx = processing_all(df_m1, df_m2, df_thx)
    # logger.info("processed_data m1")
    # logger.info(processed_m1.iloc[0].to_dict())
    # logger.info("processed_data m2")
    # logger.info(processed_m2.iloc[0].to_dict())
    result = post_processing_all(processed_m1, processed_m2, df_thx)
    # State check: a sentinel state ("ZZ") or zip ("86445") forces fixed scores.
    state_value = combined_df["state"].iloc[0]
    zip_value = combined_df["zip"].iloc[0]
    if (pd.notnull(state_value) and state_value == "ZZ") or (
        pd.notnull(zip_value) and zip_value == "86445"
    ):
        result["hd_score_m1"] = 1250
        result["hd_score_m2"] = 1250
        result["hd_score_iso_m2"] = 1250
    # logger.info("post_processed_data after state check")
    # logger.info(result)
    # Normalize numeric scores to built-in float so JSON encoding (e.g. Temporal)
    # does not fail on NumPy scalar types like np.float32/np.float64.
    for key in ("hd_score_m1", "hd_score_m2", "hd_score_iso_m2"):
        if key in result and result[key] is not None:
            try:
                result[key] = float(result[key])
            except (TypeError, ValueError):
                logger.warning("Failed to cast %s=%r to float", key, result[key])
    logger.info("final result: %s", result)
    return result
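
# Illustrative invocation sketch (all values hypothetical, keyword names taken
# from the signature above; a realistic Blob is needed for the downstream
# pre/processing/post steps to produce meaningful scores):
#
#     result = __main__(
#         application_key="APP-123",
#         application_timestamp="2025-01-01T00:00:00Z",
#         application_ssn="", application_email_address="User@Example.com",
#         application_bank_account_number="", application_is_rejected="false",
#         application_date_of_birth="1990-01-01",
#         educationlevel="", employmentstatus="", lengthatbank="", lengthatjob="",
#         ownhome="", payfrequency="", monthsatresidence="", state="ZZ", zip="00000",
#         EventType="", DigitalIdConfidence="", RiskRating="", TmxSummaryReasonCode="",
#         TrueIpGeo="", Blob='{"Blob": {"true_ip_score": 42}}', DeviceId="",
#         FuzzyDeviceId="", ReasonCode="",
#     )
#     # state == "ZZ" forces hd_score_m1 / hd_score_m2 / hd_score_iso_m2 to 1250.0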