"""Single-application scoring pipeline.

Flattens the raw request parameters into a one-row DataFrame, repairs and
mines the ThreatMetrix ``Blob`` payload via JMESPath expressions, then runs
the M1/M2/THX pre-processing, processing and post-processing stages and
returns the resulting score dictionary.
"""

import logging

import jmespath
import json_repair
import pandas as pd
import regex as re

from pre_processing import pre_processing_all
from processing import processing_all
from post_processing import post_processing_all

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Matches strings that look like a JSON object/array, optionally prefixed by
# one or more '?' characters (an artifact seen in upstream blob payloads).
_JSON_LIKE = re.compile(r"^\s*\?*[\{\[].*[\}\]]\s*$", re.DOTALL)


def extract_value(blob, expression):
    """Evaluate JMESPath *expression* against *blob*; return None on any failure.

    Malformed blobs / expressions are expected in production traffic, so every
    exception is treated as "value not present".
    """
    try:
        return jmespath.search(expression, blob)
    except Exception:
        return None


def coalesce(*args):
    """Return the first argument that is not None, or None if all are None."""
    for value in args:
        if value is not None:
            return value
    return None


def deep_repair(obj):
    """Recursively repair JSON-ish strings nested anywhere inside *obj*.

    - A string that looks like JSON (with or without one leading '?') has
      exactly one leading '?' stripped, is re-parsed with json_repair, and the
      parsed result is recursed into.
    - Dicts and lists are walked element-wise.
    - Anything else is returned unchanged.
    """
    if isinstance(obj, str):
        s = obj.strip()
        if _JSON_LIKE.match(s):
            if s.startswith("?"):
                s = s[1:]
            return deep_repair(json_repair.loads(s))
        return obj
    if isinstance(obj, dict):
        return {k: deep_repair(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [deep_repair(v) for v in obj]
    return obj


def sanitize_blob(blob):
    """Best-effort ``deep_repair`` wrapper: log and return None instead of raising."""
    try:
        return deep_repair(blob)
    except Exception as e:
        logger.error("Failed to sanitize blob: %s", e)
        return None


# JMESPath expressions used to extract model features from the sanitized blob
# (M1 originals + M2 additions). For each feature, expressions are tried in
# order and the first non-None result wins (see _extract_with_fallback).
expressions = {
    # M1 (existing)
    "first_seen_days": [
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].first_seen_days)[0]",
        'Blob."emailage.emailriskscore.first_seen_days"',
        "Blob.tps_vendor_raw_response.query.results[0].first_seen_days",
    ],
    "ea_score": [
        # BUGFIX(review): wrapped in (...)[0] for consistency with every other
        # integration_hub_results projection in this table — a bare projection
        # yields a list, and a non-empty/empty list would defeat coalesce().
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].EAScore)[0]",
        'Blob."emailage.emailriskscore.eascore"',
        "Blob.tps_vendor_raw_response.query.results[0].EAScore",
    ],
    "email_creation_days": [
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].email_creation_days)[0]",
        "Blob.tps_vendor_raw_response.query.results[0].email_creation_days",
    ],
    "summary_risk_score": ["Blob.summary_risk_score"],
    "digital_id_trust_score_rating": ["Blob.digital_id_trust_score_rating"],
    "os_version": ["Blob.os_version"],
    "account_email_worst_score": ["Blob.account_email_worst_score"],
    "true_ip_score": ["Blob.true_ip_score"],
    "ip_net_speed_cell": [
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ip_netSpeedCell)[0]",
        "Blob.tps_vendor_raw_response.query.results[0].ip_netSpeedCell",
    ],
    "account_email_score": ["Blob.account_email_score"],
    "true_ip_worst_score": ["Blob.true_ip_worst_score"],
    "proxy_ip_worst_score": ["Blob.proxy_ip_worst_score"],
    "proxy_ip_score": ["Blob.proxy_ip_score"],
    "fuzzy_device_score": ["Blob.fuzzy_device_score"],
    "ip_region_confidence": [
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ip_regionconf)[0]",
        "Blob.tps_vendor_raw_response.query.results[0].ip_regionconf",
    ],
    "true_ip_state_confidence": ["Blob.true_ip_state_confidence"],
    "fuzzy_device_worst_score": ["Blob.fuzzy_device_worst_score"],
    "digital_id_confidence_rating": ["Blob.digital_id_confidence_rating"],
    "trueipgeo": ["TrueIpGeo", "Blob.true_ip_geo"],
    # M2 additions
    "policy_score": ["Blob.policy_score"],
    "digital_id_trust_score": ["Blob.digital_id_trust_score"],
    "proxy_score": ["Blob.proxy_score"],
    "browser_spoof_score": ["Blob.browser_spoof_score"],
    "input_ip_connection_type": ["Blob.input_ip_connection_type"],
    "fuzzy_device_id_confidence": ["Blob.fuzzy_device_id_confidence"],
    "fraudrisk": [
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].fraudRisk)[0]",
        "Blob.tps_vendor_raw_response.query.results[0].fraudRisk",
        'Blob."emailage.emailriskscore.fraudRisk"',
    ],
    "overalldigitalidentityscore": [
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].overallDigitalIdentityScore)[0]",
        "Blob.tps_vendor_raw_response.query.results[0].overallDigitalIdentityScore",
        'Blob."emailage.emailriskscore.overallDigitalIdentityScore"',
    ],
    "totalhits": [
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].totalhits)[0]",
        "Blob.tps_vendor_raw_response.query.results[0].totalhits",
        'Blob."emailage.emailriskscore.totalhits"',
    ],
    "uniquehits": [
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].uniquehits)[0]",
        "Blob.tps_vendor_raw_response.query.results[0].uniquehits",
        'Blob."emailage.emailriskscore.uniquehits"',
    ],
    "emailtofullnameconfidence": [
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].emailToFullNameConfidence)[0]",
        "Blob.tps_vendor_raw_response.query.results[0].emailToFullNameConfidence",
        'Blob."emailage.emailriskscore.emailToFullNameConfidence"',
    ],
    "emailtolastnameconfidence": [
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].emailToLastNameConfidence)[0]",
        "Blob.tps_vendor_raw_response.query.results[0].emailToLastNameConfidence",
        'Blob."emailage.emailriskscore.emailToLastNameConfidence"',
    ],
    "domain_creation_days": [
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].domain_creation_days)[0]",
        "Blob.tps_vendor_raw_response.query.results[0].domain_creation_days",
        'Blob."emailage.emailriskscore.domain_creation_days"',
    ],
    "iptophoneconfidence": [
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ipToPhoneConfidence)[0]",
        "Blob.tps_vendor_raw_response.query.results[0].ipToPhoneConfidence",
        'Blob."emailage.emailriskscore.ipToPhoneConfidence"',
    ],
    "di_autofill_count_login": [
        "Blob.tmx_variables.di_autofill_count_login",
        "Blob.policy_details_api.policy_detail_api.customer.rules.vars.variable.di_autofill_count_login",
    ],
    "accphone_gbl_velocity_hour": [
        "Blob.tmx_variables.accphone_gbl_velocity_hour",
        "Blob.tmx_variables._accphone_gbl_velocity_hour",
    ],
    # Lat/long fields for distance engineering
    "ip_latitude": [
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ip_latitude)[0]",
        "Blob.tps_vendor_raw_response.query.results[0].ip_latitude",
    ],
    "ip_longitude": [
        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ip_longitude)[0]",
        "Blob.tps_vendor_raw_response.query.results[0].ip_longitude",
    ],
    "tps_ip_latitude": ["Blob.tps_vendor_raw_response.query.results[0].ip_latitude"],
    "tps_ip_longitude": ["Blob.tps_vendor_raw_response.query.results[0].ip_longitude"],
    "true_ip_latitude": ["Blob.true_ip_latitude"],
    "true_ip_longitude": ["Blob.true_ip_longitude"],
    "proxy_ip_latitude": ["Blob.proxy_ip_latitude"],
    "proxy_ip_longitude": ["Blob.proxy_ip_longitude"],
    "dns_ip_latitude": ["Blob.dns_ip_latitude"],
    "dns_ip_longitude": ["Blob.dns_ip_longitude"],
    "input_ip_latitude": ["Blob.input_ip_latitude"],
    "input_ip_longitude": ["Blob.input_ip_longitude"],
    # First-seen timestamps for age deltas
    "digital_id_first_seen": ["Blob.digital_id_first_seen"],
    "account_email_first_seen": ["Blob.account_email_first_seen"],
    "account_login_first_seen": ["Blob.account_login_first_seen"],
    "account_telephone_first_seen": ["Blob.account_telephone_first_seen"],
    "true_ip_first_seen": ["Blob.true_ip_first_seen"],
    "ssn_hash_first_seen": ["Blob.ssn_hash_first_seen"],
    "fuzzy_device_first_seen": ["Blob.fuzzy_device_first_seen"],
    "national_id_first_seen": ["Blob.national_id_first_seen"],
    "proxy_ip_first_seen": ["Blob.proxy_ip_first_seen"],
    # Attribute arrays (used for one-hot style parsing)
    "account_name_activities": ["Blob.account_name_activities"],
    "account_email_attributes": ["Blob.account_email_attributes"],
    "true_ip_attributes": ["Blob.true_ip_attributes"],
    "true_ip_activities": ["Blob.true_ip_activities"],
    "digital_id_attributes": ["Blob.digital_id_attributes"],
    "account_telephone_attributes": ["Blob.account_telephone_attributes"],
    "cpu_clock": ["Blob.cpu_clock"],
}


def _extract_with_fallback(blob_obj, exprs):
    """Try each JMESPath expression against *blob_obj* and coalesce the results.

    For expressions rooted at ``Blob.``, a second attempt is made with the
    prefix stripped, since some payloads are the blob body itself rather than
    being wrapped under a top-level ``Blob`` key.
    """
    values = []
    for expr in exprs:
        val = extract_value(blob_obj, expr)
        if val is None and isinstance(expr, str) and expr.startswith("Blob."):
            val = extract_value(blob_obj, expr[len("Blob."):])
        values.append(val)
    return coalesce(*values)


def __main__(
    # Application ->
    application_key: str,
    application_timestamp: str,
    application_ssn: str,
    application_email_address: str,
    application_bank_account_number: str,
    application_is_rejected: str,
    application_date_of_birth: str,
    # uprovaloanapplication ->
    educationlevel: str,
    employmentstatus: str,
    lengthatbank: str,
    lengthatjob: str,
    ownhome: str,
    payfrequency: str,
    monthsatresidence: str,
    state: str,
    zip: str,  # NOTE: shadows the builtin; name is fixed by the calling workflow
    # thxresponse ->
    EventType: str,
    DigitalIdConfidence: str,
    RiskRating: str,
    TmxSummaryReasonCode: str,
    TrueIpGeo: str,
    Blob: str,
    DeviceId: str,
    FuzzyDeviceId: str,
    ReasonCode: str,
) -> dict:
    """Score one application and return the post-processed result dict.

    Builds a single-row DataFrame from the inputs, sanitizes and mines the
    ThreatMetrix ``Blob`` for model features, runs the M1/M2/THX pipeline,
    applies the ZZ/86445 test-traffic override, and normalizes the score
    fields to built-in floats for JSON encoding.
    """
    # Flatten the inputs into one record.
    data = {
        "application_key": application_key,
        "application_timestamp": application_timestamp,
        # BUGFIX(review): key previously had a trailing space
        # ("application_ssn ") which produced a misnamed DataFrame column.
        "application_ssn": application_ssn,
        "application_email_address": application_email_address,
        "application_bank_account_number": application_bank_account_number,
        "application_is_rejected": application_is_rejected,
        "application_date_of_birth": application_date_of_birth,
        "educationlevel": educationlevel,
        "employmentstatus": employmentstatus,
        "lengthatbank": lengthatbank,
        "lengthatjob": lengthatjob,
        "ownhome": ownhome,
        "payfrequency": payfrequency,
        "monthsatresidence": monthsatresidence,
        "state": state,
        "zip": zip,
        "EventType": EventType,
        "DigitalIdConfidence": DigitalIdConfidence,
        "RiskRating": RiskRating,
        "TmxSummaryReasonCode": TmxSummaryReasonCode,
        "TrueIpGeo": TrueIpGeo,
        "Blob": Blob,
        "DeviceId": DeviceId,
        "FuzzyDeviceId": FuzzyDeviceId,
        "ReasonCode": ReasonCode,
    }

    # Single-row DataFrame with lower-cased column names (so "Blob" -> "blob",
    # "TrueIpGeo" -> "trueipgeo", etc.).
    combined_df = pd.DataFrame([data])
    combined_df.columns = combined_df.columns.str.lower()

    # Uncomment below for testing with Uprova batch data:
    # combined_df["educationlevel"] = None
    # combined_df["monthsatresidence"] = None
    # combined_df["ownhome"] = False
    # combined_df["lengthatbank"] = 0

    combined_df["application_email_address"] = combined_df["application_email_address"].str.lower()

    if Blob:
        # Repair nested JSON-as-string artifacts, then extract every feature.
        combined_df["blob"] = combined_df["blob"].apply(sanitize_blob)
        for column, expressions_list in expressions.items():
            extracted = combined_df["blob"].apply(
                _extract_with_fallback, args=(expressions_list,)
            )
            if column in combined_df.columns:
                # Keep the original input value where extraction found nothing.
                combined_df[column] = extracted.where(
                    extracted.notnull(), combined_df[column]
                )
            else:
                combined_df[column] = extracted
    else:
        for column in expressions:
            # BUGFIX(review): only create *missing* columns. Previously this
            # branch assigned None unconditionally, clobbering input columns
            # whose lowercased name collides with an expression key (e.g.
            # TrueIpGeo -> "trueipgeo"); the Blob branch preserves them.
            if column not in combined_df.columns:
                combined_df[column] = None

    df_m1, df_m2, df_thx = pre_processing_all(combined_df)
    processed_m1, processed_m2, df_thx = processing_all(df_m1, df_m2, df_thx)
    result = post_processing_all(processed_m1, processed_m2, df_thx)

    # Test-traffic override: state "ZZ" or zip "86445" forces fixed scores.
    state_value = combined_df["state"].iloc[0]
    zip_value = combined_df["zip"].iloc[0]
    if (pd.notnull(state_value) and state_value == "ZZ") or (
        pd.notnull(zip_value) and zip_value == "86445"
    ):
        result["hd_score_m1"] = 1250
        result["hd_score_m2"] = 1250
        result["hd_score_iso_m2"] = 1250

    # Normalize numeric scores to built-in float so JSON encoding (e.g.
    # Temporal) does not fail on NumPy scalar types like np.float32/np.float64.
    for key in ("hd_score_m1", "hd_score_m2", "hd_score_iso_m2"):
        if key in result and result[key] is not None:
            try:
                result[key] = float(result[key])
            except (TypeError, ValueError):
                logger.warning("Failed to cast %s=%r to float", key, result[key])

    print(result)
    return result