Early Term Default/Fraud indicator v1 block
All checks were successful
Build and Push Docker Image / test (push) Successful in 2m8s
Build and Push Docker Image / build_and_push (push) Successful in 2m13s

admin user 2025-01-21 21:17:40 +00:00
parent 4ca7b2486f
commit 8f303a4993
9 changed files with 392 additions and 24 deletions

block.py

@@ -1,21 +1,111 @@
Removed (the previous template stub):

@flowx_block
def example_function(request: dict) -> dict:
    # Processing logic here...
    return {
        "meta_info": [
            {
                "name": "created_date",
                "type": "string",
                "value": "2024-11-05"
            }
        ],
        "fields": [
            {
                "name": "",
                "type": "",
                "value": ""
            }
        ]
    }

Added (the new scoring entry point):

import pandas as pd
import json
import jmespath
import xgboost as xgb
import math
import joblib
import logging
from typing import Optional, List, Dict
from pre_processing import pre_processing
from processing import processing
from post_processing import post_processing

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
)
logger = logging.getLogger(__name__)

def __main__(application_key: str, application_timestamp: str, application_source_name: str, application_date_of_birth: str, ownhome: str, employmentstatus: str, lengthatjob: float, payfrequency: str, lengthatbank: str, inputipaddress: str, deviceid: str, fuzzydeviceid: str, trueip: str, dnsip: str, requestid: str, riskrating: str, tmxsummaryreasoncode: str, digitalidconfidence: str, results: Optional[List[Dict]] = None) -> dict:
    data = {
        "application_key": application_key,
        "application_timestamp": application_timestamp,
        "application_source_name": application_source_name,
        "application_date_of_birth": application_date_of_birth,
        "ownhome": ownhome,
        "employmentstatus": employmentstatus,
        "lengthatjob": lengthatjob,
        "payfrequency": payfrequency,
        "lengthatbank": lengthatbank,
        "inputipaddress": inputipaddress,
        "deviceid": deviceid,
        "fuzzydeviceid": fuzzydeviceid,
        "trueip": trueip,
        "dnsip": dnsip,
        "requestid": requestid,
        "riskrating": riskrating,
        "tmxsummaryreasoncode": tmxsummaryreasoncode,
        "digitalidconfidence": digitalidconfidence
    }
    data = pd.DataFrame([data])
    expected_schema = {
        "ea_score": str,
        "ip_net_speed_cell": str,
        "ip_country_confidence": str,
        "ip_region_confidence": str,
        "fraud_risk": str,
        "first_seen_days": str,
        "domain_creation_days": str
    }
    expressions = {
        "ea_score": "EAScore",
        "ip_net_speed_cell": "ip_netSpeedCell",
        "ip_country_confidence": "ip_countryconf",
        "ip_region_confidence": "ip_regionconf",
        "fraud_risk": "fraudRisk",
        "first_seen_days": "first_seen_days",
        "domain_creation_days": "domain_creation_days",
    }
    if results:
        first_result = results[0]
        for column, expression in expressions.items():
            try:
                extracted_value = jmespath.search(expression, first_result)
                expected_type = expected_schema[column]
                if extracted_value is not None and not isinstance(extracted_value, expected_type):
                    try:
                        extracted_value = expected_type(extracted_value)
                    except (ValueError, TypeError) as cast_error:
                        logger.error(f"Failed to cast {column} value to {expected_type.__name__}: {cast_error}")
                        extracted_value = None
                data[column] = extracted_value
            except Exception as e:
                logger.error(f"Error extracting value for {column}: {e}")
                data[column] = None
    else:
        for column in expressions.keys():
            data[column] = None
    logger.info(f"pre_pre_processed_data: {data.to_dict(orient='records')}")
    pre_processed_data = pre_processing(data)
    logger.info(f"pre_processed_data: {pre_processed_data}")
    prediction = processing(pre_processed_data)
    logger.info("prediction: %.8f", float(prediction))
    result = post_processing(prediction[0])
    logger.info("Score: %.0f", float(result["score"]))
    logger.info("Action: %s", result["action"])
    logger.info("Description: %s", result["description"])
    return {
        # 'prediction': prediction,
        'score': result["score"],
        'action': result["action"],
        'description': result["description"]
    }
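For reference, a minimal invocation sketch of the entry point above. All field values and the shape of the results blob are hypothetical, chosen only to line up with the jmespath expressions (EAScore, ip_netSpeedCell, and so on) that the block extracts and with the category lists in category_orders_train.json:

sample_results = [{
    "EAScore": "500",
    "ip_netSpeedCell": "mobile",
    "ip_countryconf": "90",
    "ip_regionconf": "75",
    "fraudRisk": "low",
    "first_seen_days": "120",
    "domain_creation_days": "3650",
}]
output = __main__(
    application_key="APP-0001",                      # hypothetical application
    application_timestamp="2025-01-21T12:00:00Z",
    application_source_name="uprwebnew",
    application_date_of_birth="1990-05-01",
    ownhome="true",
    employmentstatus="full time employed",
    lengthatjob=24.0,
    payfrequency="bi-weekly",
    lengthatbank="36",
    inputipaddress="203.0.113.10",
    deviceid="dev-123",
    fuzzydeviceid="fuzzy-123",
    trueip="203.0.113.10",
    dnsip="203.0.113.53",
    requestid="req-001",
    riskrating="low",
    tmxsummaryreasoncode="Level_1_Link_Accept",
    digitalidconfidence="95",
    results=sample_results,
)
# output -> {'score': ..., 'action': ..., 'description': ...}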

category_orders_train.json Normal file

@@ -0,0 +1,11 @@
{
"application_source_name": ["arrowshade organic low", "arrowshade pr", "arrowshade pr hq", "upab5010", "upab5016", "upab5555", "uplm100pr", "uplm5555", "upr new lead - s1", "uprpadmc1", "uprreact1", "uprrefi1", "uprwebnew", "uprwebrfi", "uprwebvip"],
"ownhome": ["false", "none", "true"],
"employmentstatus": ["disability", "fixed income", "full time employed", "none", "other", "part time employment", "retired benefits", "self employed", "student", "unemployed", "welfare"],
"payfrequency": ["bi weekly", "bi-weekly", "bw", "none", "semi-monthly", "semimonthly"],
"fraud_risk": ["low", "moderate", "none", "review", "very high", "very low"],
"ip_net_speed_cell": ["broadband", "cable", "dialup", "mobile", "none", "satellite", "t1", "wireless", "xdsl"],
"riskrating": ["high", "low", "medium", "neutral", "none", "trusted"]
}
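A small sketch of how this file is consumed (mirroring processing.py below): each categorical column is coerced to a pandas Categorical whose allowed levels come from these training-time lists, so labels not seen in training fall out as NaN rather than becoming new categories. The sample values are illustrative:

import json
import pandas as pd

with open("category_orders_train.json") as f:
    category_orders = json.load(f)

s = pd.Series(["bi-weekly", "weekly"])  # "weekly" never appeared in training
cat = pd.Categorical(s, categories=category_orders["payfrequency"])
print(list(cat))  # ['bi-weekly', nan] -- the unseen label is coerced to NaN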

post_processing.py Normal file

@@ -0,0 +1,27 @@
import logging
import math

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
)
logger = logging.getLogger(__name__)

def post_processing(value):
    try:
        part1 = min(value * 100 + 0.00001, 1) * 85
        part2 = max(math.log(value * 100 + 0.000001, 2) * 185, 0)
        score = round((part1 + part2), 0)
        score_threshold = 1230
        action = "Application Decline" if score >= score_threshold else "Application Pass"
        description = (
            f"HD Fraud Score is above the risk threshold {score_threshold}, Recommended action: {action}."
            if score >= score_threshold
            else f"HD Fraud Score is below the risk threshold {score_threshold}, Recommended action: {action}."
        )
        # logger.info({'score': score, 'action': action, 'description': description})
        return {'score': score, 'action': action, 'description': description}
    except Exception as e:
        logger.error(f"Error in post_processing: {e}")
        return {'score': None, 'action': 'Unknown', 'description': 'Error processing the score'}
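As a sanity check on the curve (probe values chosen for illustration, not from the source): the linear term saturates at 85 once the model probability passes ~0.01, so the 1230 threshold is governed almost entirely by the log term, and solving 85 + 185 * log2(100p) = 1230 puts the decline cutover near p ≈ 0.73:

import math

def approx_score(p):
    # Same formula as post_processing above, without the logging wrapper.
    part1 = min(p * 100 + 0.00001, 1) * 85
    part2 = max(math.log(p * 100 + 0.000001, 2) * 185, 0)
    return round(part1 + part2)

print(approx_score(0.50))  # 1129 -> below 1230, Application Pass
print(approx_score(0.73))  # 1230 -> at the threshold, Application Decline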

pre_processing.py Normal file

@@ -0,0 +1,95 @@
import pandas as pd
import json
import jmespath
import logging

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
)
logger = logging.getLogger(__name__)

def pre_processing(input_data):
    # combined_df = pd.DataFrame([input_data])
    combined_df = input_data
    combined_df["app_age"] = combined_df.apply(
        lambda row: pd.to_datetime(row["application_timestamp"]).year - pd.to_datetime(row["application_date_of_birth"]).year
        if pd.notnull(row["application_timestamp"]) and pd.notnull(row["application_date_of_birth"]) else None,
        axis=1
    )
    # for col in ["requestid", "inputipaddress", "deviceid", "fuzzydeviceid", "trueip", "dnsip"]:
    #     combined_df[f"{col}_consistency"] = combined_df.groupby("application_key")[col].transform("nunique")
    for col in ["requestid", "inputipaddress", "deviceid", "fuzzydeviceid", "trueip", "dnsip"]:
        combined_df[f"{col}_consistency"] = combined_df[col].apply(
            lambda x: 1 if pd.notnull(x) and str(x).lower() not in ("nan", "none") else 0
        )
    combined_df.rename(columns={'inputipaddress_consistency': 'inputip_consistency'}, inplace=True)
    combined_df.rename(columns={'requestid_consistency': 'request_consistency'}, inplace=True)
    combined_df['digitalidconfidence'] = pd.to_numeric(combined_df['digitalidconfidence'], errors='coerce').astype('Int64')
    for col in ["digitalidconfidence"]:
        combined_df[f"avg_{col}"] = combined_df.groupby("application_key")[col].transform("mean")
        combined_df[f"min_{col}"] = combined_df.groupby("application_key")[col].transform("min")
        combined_df[f"max_{col}"] = combined_df.groupby("application_key")[col].transform("max")
    combined_df['Level_1_Link_Accept'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Level_1_Link_Accept', na=False, regex=True).astype(int)
    combined_df['Identity_Negative_History_Max'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Identity_Negative_History', na=False, regex=True).astype(int)
    combined_df['Level_1_Link_Accept_Max'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Level_1_Link_Accept', na=False, regex=True).astype(int)
    combined_df['Device_Negative_History_Max'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Device_Negative_History', na=False, regex=True).astype(int)
    combined_df['Level_1_Link_Reject_Max'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Level_1_Link_Reject', na=False, regex=True).astype(int)
    combined_df['IP_Negative_History_Max'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('IP_Negative_History', na=False, regex=True).astype(int)
    combined_df['Identity_Spoofing_Max'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Identity_Spoofing', na=False, regex=True).astype(int)
    combined_df['Bot_Max'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Bot', na=False, regex=True).astype(int)

    def map_fraud_risk(risk):
        risk = str(risk).lower()
        if "very low" in risk:
            return "Very Low"
        elif "low" in risk:
            return "Low"
        elif "moderate" in risk:
            return "Moderate"
        elif "review" in risk:
            return "Review"
        elif "very high" in risk:
            return "Very High"
        else:
            return None

    combined_df["fraud_risk"] = combined_df["fraud_risk"].apply(map_fraud_risk)
    combined_df.replace({'nan': None, 'None': None}, inplace=True)
    dtype_dict = {
        'app_age': 'int64', 'first_seen_days': 'int64', 'request_consistency': 'int64',
        'application_source_name': str, 'fuzzydeviceid_consistency': 'int64',
        'domain_creation_days': 'int64', 'employmentstatus': str, 'Identity_Spoofing_Max': 'int64',
        'trueip_consistency': 'int64', 'inputip_consistency': 'int64', 'ea_score': 'int64',
        'lengthatbank': float, 'lengthatjob': float, 'max_digitalidconfidence': float,
        'Identity_Negative_History_Max': 'int64', 'digitalidconfidence': 'int64',
        'IP_Negative_History_Max': 'int64', 'Device_Negative_History_Max': 'int64',
        'Bot_Max': 'int64', 'avg_digitalidconfidence': float, 'min_digitalidconfidence': float,
        'Level_1_Link_Reject_Max': 'int64', 'dnsip_consistency': 'int64', 'ip_country_confidence': 'int64',
        'riskrating': str, 'ownhome': str, 'deviceid_consistency': 'int64',
        'payfrequency': str, 'fraud_risk': str, 'Level_1_Link_Accept': 'int64',
        'ip_net_speed_cell': str, 'ip_region_confidence': 'int64', 'Level_1_Link_Accept_Max': 'int64'
    }
    output_columns = list(dtype_dict.keys())
    filtered_df = combined_df[output_columns].copy()
    int_columns = [col for col, dtype in dtype_dict.items() if dtype == 'int64']
    for col in int_columns:
        filtered_df[col] = pd.to_numeric(filtered_df[col], errors='coerce')
    filtered_df = filtered_df.astype(dtype_dict, errors='ignore')
    return filtered_df.to_dict(orient="records")
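For intuition, the three main derivations above traced by hand on hypothetical values, independent of the full input frame:

import pandas as pd

# app_age: a plain year difference, exactly as the lambda above computes it.
ts = pd.to_datetime("2025-01-21T12:00:00Z")
dob = pd.to_datetime("1990-05-01")
app_age = ts.year - dob.year  # 35

# *_consistency: 1 when the identifier is present and not a null-like string.
deviceid = "dev-123"  # hypothetical
deviceid_consistency = int(pd.notnull(deviceid) and str(deviceid).lower() not in ("nan", "none"))  # 1

# Reason-code flags: substring matches on the ThreatMetrix summary codes.
codes = "Level_1_Link_Accept|Device_Negative_History"  # hypothetical
level_1_link_accept = int("Level_1_Link_Accept" in codes)  # 1
bot_max = int("Bot" in codes)  # 0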

processing.py Normal file

@@ -0,0 +1,41 @@
import pandas as pd
import xgboost as xgb
import math
import joblib
import json

def processing(input_data):
    df = pd.DataFrame(input_data)
    model = joblib.load("./xgboost_model.joblib")
    with open('./category_orders_train.json', 'r') as f:
        category_orders = json.load(f)
    if df.empty:
        raise ValueError("Input DataFrame is empty.")
    categorical_columns = ["application_source_name", "ownhome", "employmentstatus", "payfrequency", "fraud_risk", "ip_net_speed_cell", "riskrating"]
    for col in categorical_columns:
        if col in df.columns:
            df[col] = df[col].str.lower()
            df[col] = df[col].replace([None, "", "null", float("nan"), pd.NA], "none")
            df[col] = pd.Categorical(df[col], categories=category_orders[col])
        else:
            df[col] = pd.Categorical(["none"], categories=category_orders.get(col, ["none"]))
    non_categorical_columns = [col for col in df.columns if col not in categorical_columns]
    for col in non_categorical_columns:
        if col in df.columns:
            df[col] = df[col].astype(str).str.lower().replace(["null", "nan", "", None], pd.NA)
            df[col] = pd.to_numeric(df[col], errors="coerce")
        else:
            df[col] = pd.NA
    expected_features = model.feature_names
    missing_features = [feature for feature in expected_features if feature not in df.columns]
    for feature in missing_features:
        df[feature] = None
    dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True)
    predictions = model.predict(dmatrix)
    return predictions
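A small preflight sketch for the saved artifact; it assumes the joblib file holds an xgb.Booster, which is what the model.feature_names attribute and the DMatrix call above imply:

import joblib

model = joblib.load("./xgboost_model.joblib")
print(type(model))          # expected: <class 'xgboost.core.Booster'>
print(model.feature_names)  # the exact column order the DMatrix is built with
print(model.feature_types)  # 'c' marks categoricals; may be None if not stored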


@@ -1 +1,84 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"properties": {
"application_key": {
"type": ["string", "null"],
"description": "Unique identifier for the application."
},
"application_timestamp": {
"type": ["string", "null"],
"description": "Timestamp of the application in UTC."
},
"application_source_name": {
"type": ["string", "null"],
"description": "Source from which the application was submitted."
},
"application_date_of_birth": {
"type": ["string", "null"],
"description": "Date of birth of the applicant."
},
"ownhome": {
"type": ["string", "null"],
"description": "Indicates if the applicant owns a home."
},
"employmentstatus": {
"type": ["string", "null"],
"description": "Employment status of the applicant."
},
"lengthatjob": {
"type": ["number", "null"],
"description": "Length of time the applicant has been at their current job."
},
"payfrequency": {
"type": ["string", "null"],
"description": "Frequency of pay for the applicant."
},
"lengthatbank": {
"type": ["string", "null"],
"description": "Length of time the applicant has been with their bank."
},
"inputipaddress": {
"type": ["string", "null"],
"description": "IP address of the device used to submit the application."
},
"deviceid": {
"type": ["string", "null"],
"description": "Unique identifier for the device used to submit the application."
},
"fuzzydeviceid": {
"type": ["string", "null"],
"description": "Hashed or partially anonymized identifier for the device."
},
"trueip": {
"type": ["string", "null"],
"description": "Actual IP address of the applicant's device."
},
"dnsip": {
"type": ["string", "null"],
"description": "DNS IP address of the device used to submit the application."
},
"requestid": {
"type": ["string", "null"],
"description": "Unique identifier for the application request."
},
"riskrating": {
"type": ["string", "null"],
"description": "Risk rating assigned to the application."
},
"tmxsummaryreasoncode": {
"type": ["string", "null"],
"description": "Reason code summary from third-party risk assessment."
},
"digitalidconfidence": {
"type": ["string", "null"],
"description": "Confidence score for the digital identity of the applicant."
},
"results": {
"type": ["array", "null"],
"items": {"type": "object"},
"description": "ThreatMetrixResponse emailage blob."
}
},
"required": []
}
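This input contract can be exercised directly; a short sketch using the jsonschema package (an assumed extra, as it is not in the requirements file below) with a hypothetical partial payload, which validates because "required" is empty:

import json
import jsonschema

with open("input_schema.json") as f:  # hypothetical filename for the schema above
    schema = json.load(f)

payload = {
    "application_key": "APP-0001",  # hypothetical values throughout
    "application_timestamp": "2025-01-21T12:00:00Z",
    "lengthatjob": 24.0,
    "digitalidconfidence": "95",    # typed as string per the schema
    "results": [{"EAScore": "500"}],
}
jsonschema.validate(instance=payload, schema=schema)  # raises ValidationError on a type mismatch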


@@ -1 +1,5 @@
pandas==2.2.2
joblib==1.3.2
xgboost==1.7.5
jmespath==1.0.1
numpy==1.23.5


@@ -1 +1,18 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"properties": {
"score": {
"type": "number",
"description": "HD Fraud Score."
},
"action": {
"type": "string",
"description": "Recommended Action."
},
"description": {
"type": "string",
"description": "Description"
}
}
}
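For reference, a payload that satisfies this output contract, shaped like the post_processing return value (numbers reused from the worked scoring example above):

{
    "score": 1129.0,
    "action": "Application Pass",
    "description": "HD Fraud Score is below the risk threshold 1230, Recommended action: Application Pass."
}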

xgboost_model.joblib Normal file

Binary file not shown.