Upload files to "/"
All checks were successful
Build and Push Docker Image / test (push) Successful in 2m53s
Build and Push Docker Image / build_and_push (push) Successful in 4m32s

admin user 2025-07-11 14:42:06 +00:00
parent 1d3d28213e
commit 1bf55226e1
2 changed files with 122 additions and 74 deletions

block.py (195 changed lines)

@@ -6,6 +6,8 @@ import regex as re
 from pre_processing import pre_processing
 from processing import processing
 from post_processing import post_processing
+import json_repair
 
 # Configure logging
 logging.basicConfig(
@@ -14,6 +16,8 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
 
+_JSON_LIKE = re.compile(r'^\s*\?*[\{\[].*[\}\]]\s*$', re.DOTALL)
+
 def extract_value(blob, expression):
     try:
@@ -21,7 +25,6 @@ def extract_value(blob, expression):
     except Exception:
         return None
 
-# Coalesce function to return the first non-None value
 def coalesce(*args):
     for value in args:
         if value is not None:
@@ -29,71 +32,114 @@ def coalesce(*args):
     return None
 
 # New sanitize blob function
+def deep_repair(obj):
+    # 1) If it's a string that *looks* like JSON (with or without one leading '?'),
+    #    strip exactly one leading '?', reparse, and recurse.
+    if isinstance(obj, str):
+        s = obj.strip()
+        if _JSON_LIKE.match(s):
+            # strip one leading '?' if present
+            if s.startswith('?'):
+                s = s[1:]
+            parsed = json_repair.loads(s)
+            return deep_repair(parsed)
+        return obj
+    # 2) Dict → recurse on each value
+    if isinstance(obj, dict):
+        return {k: deep_repair(v) for k, v in obj.items()}
+    # 3) List → recurse on each element
+    if isinstance(obj, list):
+        return [deep_repair(v) for v in obj]
+    # 4) Otherwise, leave it alone
+    return obj
+
 def sanitize_blob(blob):
     try:
-        blob = re.sub(r'"(\w+)":"(\{[^}]+\})"', r'"\1":\2', blob)
-        blob = re.sub(r'"tps_vendor_raw_response"\s*:\s*"\?\{', '"tps_vendor_raw_response":{', blob)
-        blob = blob.replace('\\"', '"')
-        blob = blob.replace('\\n', '')
-        blob = blob.replace('\\t', '')
-        blob = blob.replace('\\\\', '')
-        blob = re.sub(r'(\}\})"', r'\1', blob)
-        blob = re.sub(r',\s*([\}\]])', r'\1', blob)
-        return json.loads(blob)
-    except json.JSONDecodeError as e:
-        logger.error(f"JSON Decode Error: {e}")
-        error_pos = e.pos
-        snippet = blob[max(0, error_pos - 50): error_pos + 50]
-        logger.error(f"Error near:\n{snippet}")
+        return deep_repair(blob)
+    except Exception as e:
+        logger.error("Failed to sanitize blob: %s", e)
         return None
-#---------------- Sanitise ends here
-# Function to extract a value using JMESPath
 # Expressions to extract values
 expressions = {
     "first_seen_days": [
-        "tps_vendor_raw_response.query.results[0].first_seen_days",
-        "emailage.emailriskscore.first_seen_days"
+        # 1) any vendor under integration_hub_results → first_seen_days
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].first_seen_days)[0]",
+        # 2) the flat “dotted” key
+        "Blob.\"emailage.emailriskscore.first_seen_days\"",
+        # 3) fallback to the top level tps_vendor_raw_response path
+        "Blob.tps_vendor_raw_response.query.results[0].first_seen_days",
     ],
     "ea_score": [
-        "tps_vendor_raw_response.query.results[0].EAScore",
-        "emailage.emailriskscore.eascore"
+        # 1) any vendor under integration_hub_results
+        'Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].EAScore',
+        # 2) the flat “dotted” key
+        'Blob."emailage.emailriskscore.eascore"',
+        # 3) fallback to the top level tps_vendor_raw_response
+        'Blob.tps_vendor_raw_response.query.results[0].EAScore',
     ],
     "email_creation_days": [
-        "tps_vendor_raw_response.query.results[0].email_creation_days"
+        # 1) any vendor under integration_hub_results → results[0].email_creation_days
+        "(Blob.integration_hub_results.*"
+        ".tps_vendor_raw_response.query.results[0].email_creation_days)[0]",
+        # 2) fallback to the top level tps_vendor_raw_response path
+        "Blob.tps_vendor_raw_response.query.results[0].email_creation_days",
     ],
-    "summary_risk_score": ["summary_risk_score"],
-    "digital_id_trust_score_rating": ["digital_id_trust_score_rating"],
-    "os_version": ["os_version"],
-    "account_email_worst_score": ["account_email_worst_score"],
-    "true_ip_score": ["true_ip_score"],
+    "summary_risk_score": ["Blob.summary_risk_score"],
+    "digital_id_trust_score_rating": ["Blob.digital_id_trust_score_rating"],
+    "os_version": ["Blob.os_version"],
+    "account_email_worst_score": ["Blob.account_email_worst_score"],
+    "true_ip_score": ["Blob.true_ip_score"],
     "ip_net_speed_cell": [
-        "tps_vendor_raw_response.query.results[0].ip_netSpeedCell",
-        # "true_ip_connection_type"
+        # 1) any vendor under integration_hub_results → results[0].ip_netSpeedCell
+        "(Blob.integration_hub_results.*"
+        ".tps_vendor_raw_response.query.results[0].ip_netSpeedCell)[0]",
+        # 2) fallback to the top level tps_vendor_raw_response path
+        "Blob.tps_vendor_raw_response.query.results[0].ip_netSpeedCell",
     ],
-    "account_email_score": ["account_email_score"],
-    "true_ip_worst_score": ["true_ip_worst_score"],
-    "proxy_ip_worst_score": ["proxy_ip_worst_score"],
-    "proxy_ip_score": ["proxy_ip_score"],
-    "fuzzy_device_score": ["fuzzy_device_score"],
-    "ip_region_confidence": ["tps_vendor_raw_response.query.results[0].ip_regionconf"],
-    "true_ip_state_confidence": ["true_ip_state_confidence"],
-    "fuzzy_device_worst_score": ["fuzzy_device_worst_score"],
-    "digital_id_confidence_rating": ["digital_id_confidence_rating"]
+    "account_email_score": ["Blob.account_email_score"],
+    "true_ip_worst_score": ["Blob.true_ip_worst_score"],
+    "proxy_ip_worst_score": ["Blob.proxy_ip_worst_score"],
+    "proxy_ip_score": ["Blob.proxy_ip_score"],
+    "fuzzy_device_score": ["Blob.fuzzy_device_score"],
+    "ip_region_confidence": [
+        # 1) any vendor under integration_hub_results → results[0].ip_regionconf
+        "(Blob.integration_hub_results.*"
+        ".tps_vendor_raw_response.query.results[0].ip_regionconf)[0]",
+        # 2) fallback to the top level tps_vendor_raw_response path
+        "Blob.tps_vendor_raw_response.query.results[0].ip_regionconf",
+    ],
+    "true_ip_state_confidence": ["Blob.true_ip_state_confidence"],
+    "fuzzy_device_worst_score": ["Blob.fuzzy_device_worst_score"],
+    "digital_id_confidence_rating": ["Blob.digital_id_confidence_rating"],
+    "trueipgeo": ["TrueIpGeo","Blob.true_ip_geo"],
 }
 def __main__(
-    #Application->
+    # Application->
     application_key: str,
     application_timestamp: str,
-    application_ssn : str,
+    application_ssn: str,
     application_email_address: str,
     application_bank_account_number: str,
     application_is_rejected: str,
     application_date_of_birth: str,
-    #uprovaloanapplication->
-    educationlevel:str,
+    # uprovaloanapplication->
+    educationlevel: str,
     employmentstatus: str,
     lengthatbank: str,
     lengthatjob: str,
@@ -102,43 +148,43 @@ def __main__(
     monthsatresidence: str,
     state: str,
     zip: str,
-    #thxresponse->
+    # thxresponse->
     EventType: str,
     DigitalIdConfidence: str,
     RiskRating: str,
     TmxSummaryReasonCode: str,
     TrueIpGeo: str,
-    Blob:str,
-    DeviceId:str,
+    Blob: str,
+    DeviceId: str,
     FuzzyDeviceId: str
 ) -> dict:
     # Convert input parameters into a flat dictionary
     data = {
-        "application_key" : application_key,
-        "application_timestamp" : application_timestamp,
-        "application_ssn " : application_ssn ,
-        "application_email_address" : application_email_address,
-        "application_bank_account_number" : application_bank_account_number,
-        "application_is_rejected" : application_is_rejected,
-        "application_date_of_birth" : application_date_of_birth,
-        "educationlevel" : educationlevel,
-        "employmentstatus" : employmentstatus,
-        "lengthatbank" : lengthatbank,
-        "lengthatjob" : lengthatjob,
-        "ownhome" : ownhome,
-        "payfrequency" : payfrequency,
-        "monthsatresidence" : monthsatresidence,
-        "state" : state,
-        "zip" : zip,
-        "EventType" : EventType,
-        "DigitalIdConfidence" : DigitalIdConfidence,
-        "RiskRating" : RiskRating,
-        "TmxSummaryReasonCode" : TmxSummaryReasonCode,
-        "TrueIpGeo" : TrueIpGeo,
-        "Blob" : Blob,
-        "DeviceId" : DeviceId,
-        "FuzzyDeviceId" : FuzzyDeviceId
+        "application_key": application_key,
+        "application_timestamp": application_timestamp,
+        "application_ssn ": application_ssn,
+        "application_email_address": application_email_address,
+        "application_bank_account_number": application_bank_account_number,
+        "application_is_rejected": application_is_rejected,
+        "application_date_of_birth": application_date_of_birth,
+        "educationlevel": educationlevel,
+        "employmentstatus": employmentstatus,
+        "lengthatbank": lengthatbank,
+        "lengthatjob": lengthatjob,
+        "ownhome": ownhome,
+        "payfrequency": payfrequency,
+        "monthsatresidence": monthsatresidence,
+        "state": state,
+        "zip": zip,
+        "EventType": EventType,
+        "DigitalIdConfidence": DigitalIdConfidence,
+        "RiskRating": RiskRating,
+        "TmxSummaryReasonCode": TmxSummaryReasonCode,
+        "TrueIpGeo": TrueIpGeo,
+        "Blob": Blob,
+        "DeviceId": DeviceId,
+        "FuzzyDeviceId": FuzzyDeviceId
     }
     # Convert dictionary to a single-row DataFrame
@@ -150,7 +196,8 @@ def __main__(
     # Step 2: Extract values using the expressions dictionary
     for column, expressions_list in expressions.items():
-        combined_df[column] = combined_df["blob"].apply(lambda x: coalesce(*[extract_value(x, expr) for expr in expressions_list]))
+        combined_df[column] = combined_df["blob"].apply(lambda x: coalesce(
+            *[extract_value(x, expr) for expr in expressions_list]))
 
     logger.info("pre_flowx data")
     logger.info(combined_df.iloc[0].drop('blob').to_dict())
@@ -164,7 +211,7 @@ def __main__(
     logger.info("pre_processed data")
     logger.info(pre_processed_data.iloc[0].to_dict())
     df = processing(pre_processed_data)
-    logger.info("procesed_data")
+    logger.info("processed_data")
     logger.info(df.iloc[0].to_dict())
     df["application_timestamp"] = df["application_timestamp"].astype(str)
     # logger.info("prediction: %.8f", float(df['prediction'].iloc[0]))
@@ -177,7 +224,7 @@ def __main__(
     if (pd.notnull(state_value) and state_value == "ZZ") or (pd.notnull(zip_value) and zip_value == "86445"):
         result["hd_score_m1"] = 1250
     logger.info("post_processed_data after state check")
     logger.info(result)
     return result
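
Worked example (illustrative, not part of the commit): a minimal sketch of the new sanitise-then-extract path, assuming the blob arrives as a JSON string whose vendor payload is itself stringified with a leading '?'. The vendor key "emailage" and the sample payload are hypothetical; deep_repair and the JMESPath expression are taken verbatim from the diff above.

import json

import jmespath
import json_repair
import regex as re

_JSON_LIKE = re.compile(r'^\s*\?*[\{\[].*[\}\]]\s*$', re.DOTALL)

def deep_repair(obj):
    # As committed above: unwrap JSON-looking strings (tolerating one
    # leading '?'), then recurse into dicts and lists.
    if isinstance(obj, str):
        s = obj.strip()
        if _JSON_LIKE.match(s):
            if s.startswith('?'):
                s = s[1:]
            return deep_repair(json_repair.loads(s))
        return obj
    if isinstance(obj, dict):
        return {k: deep_repair(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [deep_repair(v) for v in obj]
    return obj

# Hypothetical raw blob: the vendor response is a stringified,
# '?'-prefixed JSON document, the case the '?'-stripping anticipates.
inner = '?{"query": {"results": [{"first_seen_days": 42}]}}'
raw = json.dumps(
    {"integration_hub_results": {"emailage": {"tps_vendor_raw_response": inner}}})

doc = {"Blob": deep_repair(raw)}

# The first first_seen_days expression from the diff: project over every
# vendor under integration_hub_results, then take the first match.
expr = ("(Blob.integration_hub_results.*"
        ".tps_vendor_raw_response.query.results[0].first_seen_days)[0]")
print(jmespath.search(expr, doc))  # -> 42

The parentheses stop the wildcard projection so that the trailing [0] indexes the collected list; that is why the multi-vendor expressions above wrap the wildcard before indexing.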

@@ -4,3 +4,4 @@ xgboost == 2.1.4
 joblib == 1.4.2
 jmespath == 1.0.1
 regex == 2023.12.25
+json_repair == 0.47.6
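
The json_repair pin above backs the new deep_repair path, replacing the hand-rolled regex cleanup that sanitize_blob previously did. A quick illustration of the kind of breakage it tolerates (the sample string is ours, not from the commit):

import json_repair

# json_repair parses JSON that json.loads rejects outright:
# single quotes, trailing commas, and similar vendor-side breakage.
broken = "{'EAScore': 510, 'results': [1, 2,],}"
print(json_repair.loads(broken))
# expected: {'EAScore': 510, 'results': [1, 2]}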