Upload files to "/"
parent 1d3d28213e
commit 1bf55226e1

block.py (195 changed lines)
@@ -6,6 +6,8 @@ import regex as re
 from pre_processing import pre_processing
 from processing import processing
 from post_processing import post_processing
+import json_repair
+
 
 # Configure logging
 logging.basicConfig(
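
Reviewer note (not part of the commit): json_repair is brought in to parse vendor payloads that are almost-JSON. Its loads mirrors json.loads but tolerates common damage; a quick illustrative check on made-up input:

    import json_repair

    # Trailing commas and single-quoted keys are repaired rather than rejected.
    print(json_repair.loads('{"a": 1,}'))      # -> {'a': 1}
    print(json_repair.loads("{'b': [2, 3]}"))  # -> {'b': [2, 3]}
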
@@ -14,6 +16,8 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
 
+_JSON_LIKE = re.compile(r'^\s*\?*[\{\[].*[\}\]]\s*$', re.DOTALL)
+
 
 def extract_value(blob, expression):
     try:
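
Reviewer note (not part of the commit): a sanity sketch of what the new _JSON_LIKE pattern accepts — object- or array-shaped strings, optionally prefixed with one or more '?' characters:

    import regex as re

    _JSON_LIKE = re.compile(r'^\s*\?*[\{\[].*[\}\]]\s*$', re.DOTALL)

    assert _JSON_LIKE.match('{"a": 1}')
    assert _JSON_LIKE.match('?{"query": {"results": []}}')
    assert _JSON_LIKE.match('  [1, 2, 3]  ')
    assert not _JSON_LIKE.match('plain scalar string')
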
@@ -21,7 +25,6 @@ def extract_value(blob, expression):
     except Exception:
         return None
 
-# Coalesce function to return the first non-None value
 def coalesce(*args):
     for value in args:
         if value is not None:
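
Reviewer note (not part of the commit): extract_value's body is elided by this hunk. Given the jmespath pin in requirements and the removed "# Function to extract a value using JMESPath" comment below, it presumably wraps jmespath.search; a minimal sketch under that assumption:

    import jmespath

    def extract_value(blob, expression):
        try:
            return jmespath.search(expression, blob)  # assumed implementation
        except Exception:
            return None

    row = {"Blob": {"summary_risk_score": 42}}
    print(coalesce(extract_value(row, "Blob.missing_key"),
                   extract_value(row, "Blob.summary_risk_score")))  # -> 42
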
@@ -29,71 +32,114 @@ def coalesce(*args):
     return None
 
 
 # New sanitize blob function
 
 
+def deep_repair(obj):
+    # 1) If it's a string that *looks* like JSON (with or without one leading '?'),
+    #    strip exactly one leading '?', reparse, and recurse.
+    if isinstance(obj, str):
+        s = obj.strip()
+        if _JSON_LIKE.match(s):
+            # strip one leading '?' if present
+            if s.startswith('?'):
+                s = s[1:]
+            parsed = json_repair.loads(s)
+            return deep_repair(parsed)
+        return obj
+
+    # 2) Dict → recurse on each value
+    if isinstance(obj, dict):
+        return {k: deep_repair(v) for k, v in obj.items()}
+
+    # 3) List → recurse on each element
+    if isinstance(obj, list):
+        return [deep_repair(v) for v in obj]
+
+    # 4) Otherwise, leave it alone
+    return obj
+
+
 def sanitize_blob(blob):
     try:
-        blob = re.sub(r'"(\w+)":"(\{[^}]+\})"', r'"\1":\2', blob)
-        blob = re.sub(r'"tps_vendor_raw_response"\s*:\s*"\?\{', '"tps_vendor_raw_response":{', blob)
-        blob = blob.replace('\\"', '"')
-        blob = blob.replace('\\n', '')
-        blob = blob.replace('\\t', '')
-        blob = blob.replace('\\\\', '')
-        blob = re.sub(r'(\}\})"', r'\1', blob)
-        blob = re.sub(r',\s*([\}\]])', r'\1', blob)
-        return json.loads(blob)
-    except json.JSONDecodeError as e:
-        logger.error(f"JSON Decode Error: {e}")
-        error_pos = e.pos
-        snippet = blob[max(0, error_pos - 50): error_pos + 50]
-        logger.error(f"Error near:\n{snippet}")
+        return deep_repair(blob)
+    except Exception as e:
+        logger.error("Failed to sanitize blob: %s", e)
         return None
-#---------------- Sanitise ends here
 
 
-# Function to extract a value using JMESPath
 # Expressions to extract values
 expressions = {
     "first_seen_days": [
-        "tps_vendor_raw_response.query.results[0].first_seen_days",
-        "emailage.emailriskscore.first_seen_days"
+        # 1) any vendor under integration_hub_results → first_seen_days
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].first_seen_days)[0]",
+
+        # 2) the flat “dotted” key
+        "Blob.\"emailage.emailriskscore.first_seen_days\"",
+
+        # 3) fallback to the top-level tps_vendor_raw_response path
+        "Blob.tps_vendor_raw_response.query.results[0].first_seen_days",
     ],
     "ea_score": [
-        "tps_vendor_raw_response.query.results[0].EAScore",
-        "emailage.emailriskscore.eascore"
+        # 1) any vendor under integration_hub_results
+        '(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].EAScore)[0]',
+
+        # 2) the flat “dotted” key
+        'Blob."emailage.emailriskscore.eascore"',
+
+        # 3) fallback to the top-level tps_vendor_raw_response
+        'Blob.tps_vendor_raw_response.query.results[0].EAScore',
     ],
     "email_creation_days": [
-        "tps_vendor_raw_response.query.results[0].email_creation_days"
+        # 1) any vendor under integration_hub_results → results[0].email_creation_days
+        "(Blob.integration_hub_results.*"
+        ".tps_vendor_raw_response.query.results[0].email_creation_days)[0]",
+
+        # 2) fallback to the top-level tps_vendor_raw_response path
+        "Blob.tps_vendor_raw_response.query.results[0].email_creation_days",
     ],
-    "summary_risk_score": ["summary_risk_score"],
+    "summary_risk_score": ["Blob.summary_risk_score"],
-    "digital_id_trust_score_rating": ["digital_id_trust_score_rating"],
+    "digital_id_trust_score_rating": ["Blob.digital_id_trust_score_rating"],
-    "os_version": ["os_version"],
+    "os_version": ["Blob.os_version"],
-    "account_email_worst_score": ["account_email_worst_score"],
+    "account_email_worst_score": ["Blob.account_email_worst_score"],
-    "true_ip_score": ["true_ip_score"],
+    "true_ip_score": ["Blob.true_ip_score"],
     "ip_net_speed_cell": [
-        "tps_vendor_raw_response.query.results[0].ip_netSpeedCell",
-        # "true_ip_connection_type"
+        # 1) any vendor under integration_hub_results → results[0].ip_netSpeedCell
+        "(Blob.integration_hub_results.*"
+        ".tps_vendor_raw_response.query.results[0].ip_netSpeedCell)[0]",
+
+        # 2) fallback to the top-level tps_vendor_raw_response path
+        "Blob.tps_vendor_raw_response.query.results[0].ip_netSpeedCell",
     ],
-    "account_email_score": ["account_email_score"],
+    "account_email_score": ["Blob.account_email_score"],
-    "true_ip_worst_score": ["true_ip_worst_score"],
+    "true_ip_worst_score": ["Blob.true_ip_worst_score"],
-    "proxy_ip_worst_score": ["proxy_ip_worst_score"],
+    "proxy_ip_worst_score": ["Blob.proxy_ip_worst_score"],
-    "proxy_ip_score": ["proxy_ip_score"],
+    "proxy_ip_score": ["Blob.proxy_ip_score"],
-    "fuzzy_device_score": ["fuzzy_device_score"],
+    "fuzzy_device_score": ["Blob.fuzzy_device_score"],
-    "ip_region_confidence": ["tps_vendor_raw_response.query.results[0].ip_regionconf"],
-    "true_ip_state_confidence": ["true_ip_state_confidence"],
-    "fuzzy_device_worst_score": ["fuzzy_device_worst_score"],
-    "digital_id_confidence_rating": ["digital_id_confidence_rating"]
+    "ip_region_confidence": [
+        # 1) any vendor under integration_hub_results → results[0].ip_regionconf
+        "(Blob.integration_hub_results.*"
+        ".tps_vendor_raw_response.query.results[0].ip_regionconf)[0]",
+
+        # 2) fallback to the top-level tps_vendor_raw_response path
+        "Blob.tps_vendor_raw_response.query.results[0].ip_regionconf",
+    ],
+    "true_ip_state_confidence": ["Blob.true_ip_state_confidence"],
+    "fuzzy_device_worst_score": ["Blob.fuzzy_device_worst_score"],
+    "digital_id_confidence_rating": ["Blob.digital_id_confidence_rating"],
+    "trueipgeo": ["TrueIpGeo", "Blob.true_ip_geo"],
 }
 
 
 def __main__(
-    #Application->
+    # Application->
     application_key: str,
     application_timestamp: str,
-    application_ssn : str,
+    application_ssn: str,
     application_email_address: str,
     application_bank_account_number: str,
     application_is_rejected: str,
     application_date_of_birth: str,
-    #uprovaloanapplication->
+    # uprovaloanapplication->
-    educationlevel:str,
+    educationlevel: str,
     employmentstatus: str,
     lengthatbank: str,
     lengthatjob: str,
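
Reviewer note (not part of the commit): the net effect of deep_repair plus the rewritten expressions is to unwrap the (possibly string-encoded, '?'-prefixed) vendor payload, then try each JMESPath path in order and keep the first non-None hit. An end-to-end sketch on made-up data — the vendor name ("emailage") and score are illustrative, and the "Blob" wrapper wiring is assumed from the expression prefixes:

    import jmespath

    raw = (
        '{"summary_risk_score": 10,'
        ' "integration_hub_results": {"emailage": {'
        '"tps_vendor_raw_response":'
        ' "?{\\"query\\": {\\"results\\": [{\\"EAScore\\": 912}]}}"}}}'
    )

    # deep_repair (via sanitize_blob above) strips the '?' and re-parses the
    # embedded string, so tps_vendor_raw_response becomes a real dict.
    row = {"TrueIpGeo": "US", "Blob": sanitize_blob(raw)}

    expr = ('(Blob.integration_hub_results.*'
            '.tps_vendor_raw_response.query.results[0].EAScore)[0]')
    print(jmespath.search(expr, row))   # -> 912
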
@@ -102,43 +148,43 @@ def __main__(
     monthsatresidence: str,
     state: str,
     zip: str,
-    #thxresponse->
+    # thxresponse->
     EventType: str,
     DigitalIdConfidence: str,
     RiskRating: str,
     TmxSummaryReasonCode: str,
     TrueIpGeo: str,
-    Blob:str,
+    Blob: str,
-    DeviceId:str,
+    DeviceId: str,
     FuzzyDeviceId: str
 ) -> dict:
 
     # Convert input parameters into a flat dictionary
     data = {
-        "application_key" : application_key,
+        "application_key": application_key,
-        "application_timestamp" : application_timestamp,
+        "application_timestamp": application_timestamp,
-        "application_ssn " : application_ssn ,
+        "application_ssn ": application_ssn,
-        "application_email_address" : application_email_address,
+        "application_email_address": application_email_address,
-        "application_bank_account_number" : application_bank_account_number,
+        "application_bank_account_number": application_bank_account_number,
-        "application_is_rejected" : application_is_rejected,
+        "application_is_rejected": application_is_rejected,
-        "application_date_of_birth" : application_date_of_birth,
+        "application_date_of_birth": application_date_of_birth,
-        "educationlevel" : educationlevel,
+        "educationlevel": educationlevel,
-        "employmentstatus" : employmentstatus,
+        "employmentstatus": employmentstatus,
-        "lengthatbank" : lengthatbank,
+        "lengthatbank": lengthatbank,
-        "lengthatjob" : lengthatjob,
+        "lengthatjob": lengthatjob,
-        "ownhome" : ownhome,
+        "ownhome": ownhome,
-        "payfrequency" : payfrequency,
+        "payfrequency": payfrequency,
-        "monthsatresidence" : monthsatresidence,
+        "monthsatresidence": monthsatresidence,
-        "state" : state,
+        "state": state,
-        "zip" : zip,
+        "zip": zip,
-        "EventType" : EventType,
+        "EventType": EventType,
-        "DigitalIdConfidence" : DigitalIdConfidence,
+        "DigitalIdConfidence": DigitalIdConfidence,
-        "RiskRating" : RiskRating,
+        "RiskRating": RiskRating,
-        "TmxSummaryReasonCode" : TmxSummaryReasonCode,
+        "TmxSummaryReasonCode": TmxSummaryReasonCode,
-        "TrueIpGeo" : TrueIpGeo,
+        "TrueIpGeo": TrueIpGeo,
-        "Blob" : Blob,
+        "Blob": Blob,
-        "DeviceId" : DeviceId,
+        "DeviceId": DeviceId,
-        "FuzzyDeviceId" : FuzzyDeviceId
+        "FuzzyDeviceId": FuzzyDeviceId
     }
 
     # Convert dictionary to a single-row DataFrame
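
Reviewer note: the "application_ssn " key keeps a trailing space inside the quotes on both sides of this hunk (only the spacing around the colon changed), so downstream code presumably looks up that exact key; worth confirming it is intentional.
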
@@ -150,7 +196,8 @@ def __main__(
 
     # Step 2: Extract values using the expressions dictionary
     for column, expressions_list in expressions.items():
-        combined_df[column] = combined_df["blob"].apply(lambda x: coalesce(*[extract_value(x, expr) for expr in expressions_list]))
+        combined_df[column] = combined_df["blob"].apply(lambda x: coalesce(
+            *[extract_value(x, expr) for expr in expressions_list]))
 
     logger.info("pre_flowx data")
     logger.info(combined_df.iloc[0].drop('blob').to_dict())
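
Reviewer note (not part of the commit): the rewrap of the apply(...) call is cosmetic. For readability, an equivalent explicit form on a hypothetical single-row frame (it short-circuits where coalesce evaluates every expression eagerly):

    import pandas as pd

    combined_df = pd.DataFrame({"blob": [{"Blob": {"summary_risk_score": 7}}]})

    def first_hit(cell, exprs):
        # Same result as coalesce(*[extract_value(cell, e) for e in exprs]),
        # but stops at the first non-None value instead of evaluating all.
        for e in exprs:
            value = extract_value(cell, e)
            if value is not None:
                return value
        return None

    combined_df["summary_risk_score"] = combined_df["blob"].apply(
        lambda x: first_hit(x, expressions["summary_risk_score"]))
    print(combined_df["summary_risk_score"].iloc[0])   # -> 7
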
@@ -164,7 +211,7 @@ def __main__(
     logger.info("pre_processed data")
     logger.info(pre_processed_data.iloc[0].to_dict())
     df = processing(pre_processed_data)
-    logger.info("procesed_data")
+    logger.info("processed_data")
     logger.info(df.iloc[0].to_dict())
     df["application_timestamp"] = df["application_timestamp"].astype(str)
     # logger.info("prediction: %.8f", float(df['prediction'].iloc[0]))
@@ -177,7 +224,7 @@ def __main__(
     if (pd.notnull(state_value) and state_value == "ZZ") or (pd.notnull(zip_value) and zip_value == "86445"):
         result["hd_score_m1"] = 1250
     logger.info("post_processed_data after state check")
     logger.info(result)
 
     return result
 
requirements.txt

@@ -4,3 +4,4 @@ xgboost == 2.1.4
 joblib == 1.4.2
 jmespath == 1.0.1
 regex == 2023.12.25
+json_repair == 0.47.6