Upload files to "/"

2025-07-11 14:42:06 +00:00 · 2025-07-11 14:42:06 +00:00 · 1bf55226e1
commit 1bf55226e1
parent 1d3d28213e
2 changed files with 122 additions and 74 deletions
--- a/block.py
+++ b/block.py
@@ -6,6 +6,8 @@ import regex as re
 from pre_processing import pre_processing
 from processing import processing
 from post_processing import post_processing
+import json_repair
+

 # Configure logging
 logging.basicConfig(
@@ -14,6 +16,8 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)

+_JSON_LIKE = re.compile(r'^\s*\?*[\{\[].*[\}\]]\s*$', re.DOTALL)
+

 def extract_value(blob, expression):
    try:
@@ -21,7 +25,6 @@ def extract_value(blob, expression):
    except Exception:
        return None

-# Coalesce function to return the first non-None value
 def coalesce(*args):
    for value in args:
        if value is not None:
@@ -29,71 +32,114 @@ def coalesce(*args):
    return None

 # New sanitize blob function
+
+
+def deep_repair(obj):
+    # 1) If it's a string that *looks* like JSON (the pattern tolerates any number of leading '?'),
+    #    strip one leading '?' if present, re-parse it with json_repair, and recurse on the result.
+    if isinstance(obj, str):
+        s = obj.strip()
+        if _JSON_LIKE.match(s):
+            # strip one leading '?' if present
+            if s.startswith('?'):
+                s = s[1:]
+            parsed = json_repair.loads(s)
+            return deep_repair(parsed)
+        return obj
+
+    # 2) Dict → recurse on each value
+    if isinstance(obj, dict):
+        return {k: deep_repair(v) for k, v in obj.items()}
+
+    # 3) List → recurse on each element
+    if isinstance(obj, list):
+        return [deep_repair(v) for v in obj]
+
+    # 4) Otherwise, leave it alone
+    return obj
+
+
 def sanitize_blob(blob):
    try:
-        blob = re.sub(r'"(\w+)":"(\{[^}]+\})"', r'"\1":\2', blob)
-        blob = re.sub(r'"tps_vendor_raw_response"\s*:\s*"\?\{', '"tps_vendor_raw_response":{', blob)
-        blob = blob.replace('\\"', '"')
-        blob = blob.replace('\\n', '')
-        blob = blob.replace('\\t', '')
-        blob = blob.replace('\\\\', '')
-        blob = re.sub(r'(\}\})"', r'\1', blob)
-        blob = re.sub(r',\s*([\}\]])', r'\1', blob)
-        return json.loads(blob)
-    except json.JSONDecodeError as e:
-        logger.error(f"JSON Decode Error: {e}")
-        error_pos = e.pos
-        snippet = blob[max(0, error_pos - 50): error_pos + 50]
-        logger.error(f"Error near:\n{snippet}")
+        return deep_repair(blob)
+    except Exception as e:
+        logger.error("Failed to sanitize blob: %s", e)
        return None
-#---------------- Sanitise ends here 
-
-# Function to extract a value using JMESPath
+    
 # Expressions to extract values
 expressions = {
    "first_seen_days": [
-        "tps_vendor_raw_response.query.results[0].first_seen_days",
-        "emailage.emailriskscore.first_seen_days"
+        # 1) any vendor under integration_hub_results → first_seen_days
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].first_seen_days)[0]",
+
+        # 2) the flat “dotted” key
+        "Blob.\"emailage.emailriskscore.first_seen_days\"",
+
+        # 3) fallback to the top level tps_vendor_raw_response path
+        "Blob.tps_vendor_raw_response.query.results[0].first_seen_days",
    ],
    "ea_score": [
-        "tps_vendor_raw_response.query.results[0].EAScore",
-        "emailage.emailriskscore.eascore"
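+        # 1) any vendor under integration_hub_results (NOTE(review): no "(...)[0]" wrapper here, unlike the sibling expressions, so this likely yields a list - confirm)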
+        'Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].EAScore',
+
+        # 2) the flat “dotted” key
+        'Blob."emailage.emailriskscore.eascore"',
+
+        # 3) fallback to the top level tps_vendor_raw_response
+        'Blob.tps_vendor_raw_response.query.results[0].EAScore',
    ],
    "email_creation_days": [
-        "tps_vendor_raw_response.query.results[0].email_creation_days"
+        # 1) any vendor under integration_hub_results → results[0].email_creation_days
+        "(Blob.integration_hub_results.*"
+        ".tps_vendor_raw_response.query.results[0].email_creation_days)[0]",
+
+        # 2) fallback to the top level tps_vendor_raw_response path
+        "Blob.tps_vendor_raw_response.query.results[0].email_creation_days",
    ],
-    "summary_risk_score": ["summary_risk_score"],
-    "digital_id_trust_score_rating": ["digital_id_trust_score_rating"],
-    "os_version": ["os_version"],
-    "account_email_worst_score": ["account_email_worst_score"],
-    "true_ip_score": ["true_ip_score"],
+    "summary_risk_score": ["Blob.summary_risk_score"],
+    "digital_id_trust_score_rating": ["Blob.digital_id_trust_score_rating"],
+    "os_version": ["Blob.os_version"],
+    "account_email_worst_score": ["Blob.account_email_worst_score"],
+    "true_ip_score": ["Blob.true_ip_score"],
    "ip_net_speed_cell": [
-        "tps_vendor_raw_response.query.results[0].ip_netSpeedCell",
-        # "true_ip_connection_type"
+        # 1) any vendor under integration_hub_results → results[0].ip_netSpeedCell
+        "(Blob.integration_hub_results.*"
+        ".tps_vendor_raw_response.query.results[0].ip_netSpeedCell)[0]",
+
+        # 2) fallback to the top level tps_vendor_raw_response path
+        "Blob.tps_vendor_raw_response.query.results[0].ip_netSpeedCell",
    ],
-    "account_email_score": ["account_email_score"],
-    "true_ip_worst_score": ["true_ip_worst_score"],
-    "proxy_ip_worst_score": ["proxy_ip_worst_score"],
-    "proxy_ip_score": ["proxy_ip_score"],
-    "fuzzy_device_score": ["fuzzy_device_score"],
-    "ip_region_confidence": ["tps_vendor_raw_response.query.results[0].ip_regionconf"],
-    "true_ip_state_confidence": ["true_ip_state_confidence"],
-    "fuzzy_device_worst_score": ["fuzzy_device_worst_score"],
-    "digital_id_confidence_rating": ["digital_id_confidence_rating"]
+    "account_email_score": ["Blob.account_email_score"],
+    "true_ip_worst_score": ["Blob.true_ip_worst_score"],
+    "proxy_ip_worst_score": ["Blob.proxy_ip_worst_score"],
+    "proxy_ip_score": ["Blob.proxy_ip_score"],
+    "fuzzy_device_score": ["Blob.fuzzy_device_score"],
+    "ip_region_confidence": [
+        # 1) any vendor under integration_hub_results → results[0].ip_regionconf
+        "(Blob.integration_hub_results.*"
+        ".tps_vendor_raw_response.query.results[0].ip_regionconf)[0]",
+
+        # 2) fallback to the top level tps_vendor_raw_response path
+        "Blob.tps_vendor_raw_response.query.results[0].ip_regionconf",
+    ],
+    "true_ip_state_confidence": ["Blob.true_ip_state_confidence"],
+    "fuzzy_device_worst_score": ["Blob.fuzzy_device_worst_score"],
+    "digital_id_confidence_rating": ["Blob.digital_id_confidence_rating"],
+    "trueipgeo": ["TrueIpGeo","Blob.true_ip_geo"],
 }


 def __main__(
-    #Application->
+    # Application->
    application_key: str,
    application_timestamp: str,
-    application_ssn : str,
+    application_ssn: str,
    application_email_address: str,
    application_bank_account_number: str,
    application_is_rejected: str,
    application_date_of_birth: str,
-    #uprovaloanapplication->
-    educationlevel:str,
+    # uprovaloanapplication->
+    educationlevel: str,
    employmentstatus: str,
    lengthatbank: str,
    lengthatjob: str,
@@ -102,43 +148,43 @@ def __main__(
    monthsatresidence: str,
    state: str,
    zip: str,
-    #thxresponse->
+    # thxresponse->
    EventType: str,
    DigitalIdConfidence: str,
    RiskRating: str,
    TmxSummaryReasonCode: str,
    TrueIpGeo: str,
-    Blob:str,
-    DeviceId:str,
+    Blob: str,
+    DeviceId: str,
    FuzzyDeviceId: str
-    ) -> dict:
+) -> dict:

    # Convert input parameters into a flat dictionary
    data = {
-        "application_key" : application_key,
-        "application_timestamp" : application_timestamp,
-        "application_ssn " : application_ssn ,
-        "application_email_address" : application_email_address,
-        "application_bank_account_number" : application_bank_account_number,
-        "application_is_rejected" : application_is_rejected,
-        "application_date_of_birth" : application_date_of_birth,
-        "educationlevel" : educationlevel,
-        "employmentstatus" : employmentstatus,
-        "lengthatbank" : lengthatbank,
-        "lengthatjob" : lengthatjob,
-        "ownhome" : ownhome,
-        "payfrequency" : payfrequency,
-        "monthsatresidence" : monthsatresidence,
-        "state" : state,
-        "zip" : zip,
-        "EventType" : EventType,
-        "DigitalIdConfidence" : DigitalIdConfidence,
-        "RiskRating" : RiskRating,
-        "TmxSummaryReasonCode" : TmxSummaryReasonCode,
-        "TrueIpGeo" : TrueIpGeo,
-        "Blob" : Blob,
-        "DeviceId" : DeviceId,
-        "FuzzyDeviceId" : FuzzyDeviceId
+        "application_key": application_key,
+        "application_timestamp": application_timestamp,
+        "application_ssn ": application_ssn,
+        "application_email_address": application_email_address,
+        "application_bank_account_number": application_bank_account_number,
+        "application_is_rejected": application_is_rejected,
+        "application_date_of_birth": application_date_of_birth,
+        "educationlevel": educationlevel,
+        "employmentstatus": employmentstatus,
+        "lengthatbank": lengthatbank,
+        "lengthatjob": lengthatjob,
+        "ownhome": ownhome,
+        "payfrequency": payfrequency,
+        "monthsatresidence": monthsatresidence,
+        "state": state,
+        "zip": zip,
+        "EventType": EventType,
+        "DigitalIdConfidence": DigitalIdConfidence,
+        "RiskRating": RiskRating,
+        "TmxSummaryReasonCode": TmxSummaryReasonCode,
+        "TrueIpGeo": TrueIpGeo,
+        "Blob": Blob,
+        "DeviceId": DeviceId,
+        "FuzzyDeviceId": FuzzyDeviceId
    }

    # Convert dictionary to a single-row DataFrame
@@ -150,7 +196,8 @@ def __main__(

        # Step 2: Extract values using the expressions dictionary
        for column, expressions_list in expressions.items():
-            combined_df[column] = combined_df["blob"].apply(lambda x: coalesce(*[extract_value(x, expr) for expr in expressions_list]))
+            combined_df[column] = combined_df["blob"].apply(lambda x: coalesce(
+                *[extract_value(x, expr) for expr in expressions_list]))

        logger.info("pre_flowx data")
        logger.info(combined_df.iloc[0].drop('blob').to_dict())
@@ -164,7 +211,7 @@ def __main__(
    logger.info("pre_processed data")
    logger.info(pre_processed_data.iloc[0].to_dict())
    df = processing(pre_processed_data)
-    logger.info("procesed_data")
+    logger.info("processed_data")
    logger.info(df.iloc[0].to_dict())
    df["application_timestamp"] = df["application_timestamp"].astype(str)
    # logger.info("prediction: %.8f", float(df['prediction'].iloc[0]))
@@ -177,7 +224,7 @@ def __main__(
    if (pd.notnull(state_value) and state_value == "ZZ") or (pd.notnull(zip_value) and zip_value == "86445"):
        result["hd_score_m1"] = 1250
        logger.info("post_processed_data after state check")
-        logger.info(result) 
+        logger.info(result)

    return result

--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ xgboost == 2.1.4
 joblib == 1.4.2
 jmespath == 1.0.1
 regex == 2023.12.25
+json_repair == 0.47.6