diff --git a/block.py b/block.py index cbe3bce..6227df9 100644 --- a/block.py +++ b/block.py @@ -6,6 +6,8 @@ import regex as re from pre_processing import pre_processing from processing import processing from post_processing import post_processing +import json_repair + # Configure logging logging.basicConfig( @@ -14,6 +16,8 @@ logging.basicConfig( ) logger = logging.getLogger(__name__) +_JSON_LIKE = re.compile(r'^\s*\?*[\{\[].*[\}\]]\s*$', re.DOTALL) + def extract_value(blob, expression): try: @@ -21,7 +25,6 @@ def extract_value(blob, expression): except Exception: return None -# Coalesce function to return the first non-None value def coalesce(*args): for value in args: if value is not None: @@ -29,71 +32,114 @@ def coalesce(*args): return None # New sanitize blob function + + +def deep_repair(obj): + # 1) If it's a string that *looks* like JSON (with or without one leading '?'), + # strip exactly one leading '?', reparses, and recurse. + if isinstance(obj, str): + s = obj.strip() + if _JSON_LIKE.match(s): + # strip one leading '?' if present + if s.startswith('?'): + s = s[1:] + parsed = json_repair.loads(s) + return deep_repair(parsed) + return obj + + # 2) Dict → recurse on each value + if isinstance(obj, dict): + return {k: deep_repair(v) for k, v in obj.items()} + + # 3) List → recurse on each element + if isinstance(obj, list): + return [deep_repair(v) for v in obj] + + # 4) Otherwise, leave it alone + return obj + + def sanitize_blob(blob): try: - blob = re.sub(r'"(\w+)":"(\{[^}]+\})"', r'"\1":\2', blob) - blob = re.sub(r'"tps_vendor_raw_response"\s*:\s*"\?\{', '"tps_vendor_raw_response":{', blob) - blob = blob.replace('\\"', '"') - blob = blob.replace('\\n', '') - blob = blob.replace('\\t', '') - blob = blob.replace('\\\\', '') - blob = re.sub(r'(\}\})"', r'\1', blob) - blob = re.sub(r',\s*([\}\]])', r'\1', blob) - return json.loads(blob) - except json.JSONDecodeError as e: - logger.error(f"JSON Decode Error: {e}") - error_pos = e.pos - snippet = blob[max(0, error_pos - 50): error_pos + 50] - logger.error(f"Error near:\n{snippet}") + return deep_repair(blob) + except Exception as e: + logger.error("Failed to sanitize blob: %s", e) return None -#---------------- Sanitise ends here - -# Function to extract a value using JMESPath + # Expressions to extract values expressions = { "first_seen_days": [ - "tps_vendor_raw_response.query.results[0].first_seen_days", - "emailage.emailriskscore.first_seen_days" + # 1) any vendor under integration_hub_results → first_seen_days + "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].first_seen_days)[0]", + + # 2) the flat “dotted” key + "Blob.\"emailage.emailriskscore.first_seen_days\"", + + # 3) fallback to the top level tps_vendor_raw_response path + "Blob.tps_vendor_raw_response.query.results[0].first_seen_days", ], "ea_score": [ - "tps_vendor_raw_response.query.results[0].EAScore", - "emailage.emailriskscore.eascore" + # 1) any vendor under integration_hub_results + 'Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].EAScore', + + # 2) the flat “dotted” key + 'Blob."emailage.emailriskscore.eascore"', + + # 3) fallback to the top level tps_vendor_raw_response + 'Blob.tps_vendor_raw_response.query.results[0].EAScore', ], "email_creation_days": [ - "tps_vendor_raw_response.query.results[0].email_creation_days" + # 1) any vendor under integration_hub_results → results[0].email_creation_days + "(Blob.integration_hub_results.*" + ".tps_vendor_raw_response.query.results[0].email_creation_days)[0]", + + # 2) fallback to the top level tps_vendor_raw_response path + "Blob.tps_vendor_raw_response.query.results[0].email_creation_days", ], - "summary_risk_score": ["summary_risk_score"], - "digital_id_trust_score_rating": ["digital_id_trust_score_rating"], - "os_version": ["os_version"], - "account_email_worst_score": ["account_email_worst_score"], - "true_ip_score": ["true_ip_score"], + "summary_risk_score": ["Blob.summary_risk_score"], + "digital_id_trust_score_rating": ["Blob.digital_id_trust_score_rating"], + "os_version": ["Blob.os_version"], + "account_email_worst_score": ["Blob.account_email_worst_score"], + "true_ip_score": ["Blob.true_ip_score"], "ip_net_speed_cell": [ - "tps_vendor_raw_response.query.results[0].ip_netSpeedCell", - # "true_ip_connection_type" + # 1) any vendor under integration_hub_results → results[0].ip_netSpeedCell + "(Blob.integration_hub_results.*" + ".tps_vendor_raw_response.query.results[0].ip_netSpeedCell)[0]", + + # 2) fallback to the top level tps_vendor_raw_response path + "Blob.tps_vendor_raw_response.query.results[0].ip_netSpeedCell", ], - "account_email_score": ["account_email_score"], - "true_ip_worst_score": ["true_ip_worst_score"], - "proxy_ip_worst_score": ["proxy_ip_worst_score"], - "proxy_ip_score": ["proxy_ip_score"], - "fuzzy_device_score": ["fuzzy_device_score"], - "ip_region_confidence": ["tps_vendor_raw_response.query.results[0].ip_regionconf"], - "true_ip_state_confidence": ["true_ip_state_confidence"], - "fuzzy_device_worst_score": ["fuzzy_device_worst_score"], - "digital_id_confidence_rating": ["digital_id_confidence_rating"] + "account_email_score": ["Blob.account_email_score"], + "true_ip_worst_score": ["Blob.true_ip_worst_score"], + "proxy_ip_worst_score": ["Blob.proxy_ip_worst_score"], + "proxy_ip_score": ["Blob.proxy_ip_score"], + "fuzzy_device_score": ["Blob.fuzzy_device_score"], + "ip_region_confidence": [ + # 1) any vendor under integration_hub_results → results[0].ip_regionconf + "(Blob.integration_hub_results.*" + ".tps_vendor_raw_response.query.results[0].ip_regionconf)[0]", + + # 2) fallback to the top level tps_vendor_raw_response path + "Blob.tps_vendor_raw_response.query.results[0].ip_regionconf", + ], + "true_ip_state_confidence": ["Blob.true_ip_state_confidence"], + "fuzzy_device_worst_score": ["Blob.fuzzy_device_worst_score"], + "digital_id_confidence_rating": ["Blob.digital_id_confidence_rating"], + "trueipgeo": ["TrueIpGeo","Blob.true_ip_geo"], } def __main__( - #Application-> + # Application-> application_key: str, application_timestamp: str, - application_ssn : str, + application_ssn: str, application_email_address: str, application_bank_account_number: str, application_is_rejected: str, application_date_of_birth: str, - #uprovaloanapplication-> - educationlevel:str, + # uprovaloanapplication-> + educationlevel: str, employmentstatus: str, lengthatbank: str, lengthatjob: str, @@ -102,43 +148,43 @@ def __main__( monthsatresidence: str, state: str, zip: str, - #thxresponse-> + # thxresponse-> EventType: str, DigitalIdConfidence: str, RiskRating: str, TmxSummaryReasonCode: str, TrueIpGeo: str, - Blob:str, - DeviceId:str, + Blob: str, + DeviceId: str, FuzzyDeviceId: str - ) -> dict: +) -> dict: # Convert input parameters into a flat dictionary data = { - "application_key" : application_key, - "application_timestamp" : application_timestamp, - "application_ssn " : application_ssn , - "application_email_address" : application_email_address, - "application_bank_account_number" : application_bank_account_number, - "application_is_rejected" : application_is_rejected, - "application_date_of_birth" : application_date_of_birth, - "educationlevel" : educationlevel, - "employmentstatus" : employmentstatus, - "lengthatbank" : lengthatbank, - "lengthatjob" : lengthatjob, - "ownhome" : ownhome, - "payfrequency" : payfrequency, - "monthsatresidence" : monthsatresidence, - "state" : state, - "zip" : zip, - "EventType" : EventType, - "DigitalIdConfidence" : DigitalIdConfidence, - "RiskRating" : RiskRating, - "TmxSummaryReasonCode" : TmxSummaryReasonCode, - "TrueIpGeo" : TrueIpGeo, - "Blob" : Blob, - "DeviceId" : DeviceId, - "FuzzyDeviceId" : FuzzyDeviceId + "application_key": application_key, + "application_timestamp": application_timestamp, + "application_ssn ": application_ssn, + "application_email_address": application_email_address, + "application_bank_account_number": application_bank_account_number, + "application_is_rejected": application_is_rejected, + "application_date_of_birth": application_date_of_birth, + "educationlevel": educationlevel, + "employmentstatus": employmentstatus, + "lengthatbank": lengthatbank, + "lengthatjob": lengthatjob, + "ownhome": ownhome, + "payfrequency": payfrequency, + "monthsatresidence": monthsatresidence, + "state": state, + "zip": zip, + "EventType": EventType, + "DigitalIdConfidence": DigitalIdConfidence, + "RiskRating": RiskRating, + "TmxSummaryReasonCode": TmxSummaryReasonCode, + "TrueIpGeo": TrueIpGeo, + "Blob": Blob, + "DeviceId": DeviceId, + "FuzzyDeviceId": FuzzyDeviceId } # Convert dictionary to a single-row DataFrame @@ -150,7 +196,8 @@ def __main__( # Step 2: Extract values using the expressions dictionary for column, expressions_list in expressions.items(): - combined_df[column] = combined_df["blob"].apply(lambda x: coalesce(*[extract_value(x, expr) for expr in expressions_list])) + combined_df[column] = combined_df["blob"].apply(lambda x: coalesce( + *[extract_value(x, expr) for expr in expressions_list])) logger.info("pre_flowx data") logger.info(combined_df.iloc[0].drop('blob').to_dict()) @@ -164,7 +211,7 @@ def __main__( logger.info("pre_processed data") logger.info(pre_processed_data.iloc[0].to_dict()) df = processing(pre_processed_data) - logger.info("procesed_data") + logger.info("processed_data") logger.info(df.iloc[0].to_dict()) df["application_timestamp"] = df["application_timestamp"].astype(str) # logger.info("prediction: %.8f", float(df['prediction'].iloc[0])) @@ -177,7 +224,7 @@ def __main__( if (pd.notnull(state_value) and state_value == "ZZ") or (pd.notnull(zip_value) and zip_value == "86445"): result["hd_score_m1"] = 1250 logger.info("post_processed_data after state check") - logger.info(result) + logger.info(result) return result diff --git a/requirements.txt b/requirements.txt index 049e4db..76ec1ff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ xgboost == 2.1.4 joblib == 1.4.2 jmespath == 1.0.1 regex == 2023.12.25 +json_repair == 0.47.6