Add G2 pipeline, models, and schema for g1_v1
All checks were successful
Build and Push Docker Image / test (push) Successful in 25s
Build and Push Docker Image / build_and_push (push) Successful in 2m37s

Ankur Malik 2025-11-26 11:49:57 -05:00
parent e58625ad19
commit 9f3cb9ca4f
8 changed files with 179 additions and 44 deletions

View File

@@ -1,8 +1,8 @@
 import logging
 from typing import List, Dict
-from graph_pre_processing import pre_processing
-from graph_processing import processing
-from graph_post_processing import post_processing
+from graph_pre_processing import pre_processing_g1, pre_processing_g2
+from graph_processing import processing_g1, processing_g2
+from graph_post_processing import post_processing_g1, post_processing_g2

 # Configure logging
 logging.basicConfig(
@@ -12,20 +12,30 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)

-def __main__(results: List[Dict]) -> List[Dict]:
-    logger.info(f"data receiving in g1v1 block: {results}")
-    data = pre_processing(results)
-    logger.info(f"pre_processed_data, new_user_app_data: {data}")
-    # df = processing(data)
-    if data.get("cluster_size", 2) < 2:
-        data["prediction"] = 0
+def __main__(results: List[Dict]) -> Dict:
+    logger.info("data receiving in g1v1 block: %s", results)
+    g1_input = pre_processing_g1(results)
+    g2_input = pre_processing_g2(results)
+    logger.info("pre_processed_data_g1: %s", g1_input)
+    logger.info("pre_processed_data_g2: %s", g2_input)
+    cluster_size = g1_input.get("cluster_size", 2)
+    if cluster_size is None:
+        cluster_size = 2
+    if cluster_size < 2:
+        g1_processed = {**g1_input, "prediction": 0}
+        g2_processed = {**g2_input, "prediction_g2": 0}
     else:
-        data = processing(data)
-        logger.info("prediction: %.8f", float(data['prediction']))
-    # Post-processing: calculate the Final Score and update the dataframe.
-    final = post_processing(data)
+        g1_processed = processing_g1(g1_input)
+        g2_processed = processing_g2(g2_input)
+        logger.info("prediction_g1: %.8f", float(g1_processed.get("prediction", 0)))
+        logger.info("prediction_g2: %.8f", float(g2_processed.get("prediction_g2", 0)))
+    final_g1 = post_processing_g1(g1_processed)
+    final_g2 = post_processing_g2(g2_processed)
+    final = {**final_g1, **final_g2}
     logger.info(final)
     return final
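The merged result relies on Python's dict-unpacking precedence: on a key collision, values from `final_g2` overwrite those from `final_g1`. As written, `post_processing_g2` only returns `hd_score_g2`, so no collision occurs in practice, but the ordering still matters if G2 post-processing ever grows more keys. A minimal sketch of that behavior, with illustrative values only:

# Later unpacking wins on key collisions, so G2 output would take
# precedence over G1 output for any shared key.
final_g1 = {"hd_score_g1": 1235, "cluster_size": 10}   # illustrative values
final_g2 = {"hd_score_g2": 987}                        # illustrative values
final = {**final_g1, **final_g2}
print(final)  # {'hd_score_g1': 1235, 'cluster_size': 10, 'hd_score_g2': 987}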

View File

@@ -1,5 +1,6 @@
 import logging
 import math
+from decimal import Decimal, ROUND_HALF_UP

 # Configure logging
 logging.basicConfig(
@@ -8,7 +9,8 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)

-def post_processing(data):
+
+def post_processing_g1(data):
     try:
         prediction = data.get("prediction", 0)
         score_g1 = round(
@@ -17,9 +19,9 @@ def post_processing(data):
             0
         )
         data["hd_score_g1"] = score_g1
-        logger.info(f"score_g1 calculated: {score_g1}")
+        logger.info("score_g1 calculated: %s", score_g1)
     except Exception as e:
-        logger.error(f"Error processing score_g1 calculations: {e}")
+        logger.error("Error processing score_g1 calculations: %s", e)
     return {
         key: data.get(key, None)
@@ -29,3 +31,25 @@ def post_processing(data):
             "app_dt_day_cnt", "hd_score_iso_m2"
         ]
     }
+
+
+def post_processing_g2(data):
+    prediction = data.get("prediction_g2", data.get("prediction"))
+    hd_score_g2 = None
+    try:
+        if prediction is not None:
+            prediction_val = float(prediction)
+            raw_score = (prediction_val * 100) * 20 + math.log((prediction_val + 0.000001) * 100, 2) * 41.6
+            # SQL-like rounding (half up)
+            hd_score_g2 = int(Decimal(str(raw_score)).quantize(Decimal("1"), rounding=ROUND_HALF_UP))
+            hd_score_g2 = max(hd_score_g2, 0)
+            logger.info("score_g2 calculated: %s", hd_score_g2)
+    except Exception as e:
+        logger.error("Error processing score_g2 calculations: %s", e)
+    return {"hd_score_g2": hd_score_g2}
+
+
+# Backward compatibility alias
+post_processing = post_processing_g1
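The `Decimal` quantize step exists because Python 3's built-in `round` uses banker's rounding (ties go to the nearest even integer), while SQL engines typically round ties away from zero. A short sketch contrasting the two, plus the G2 score formula worked through for an illustrative prediction of 0.5:

import math
from decimal import Decimal, ROUND_HALF_UP

# Built-in round() rounds halves to the nearest even integer...
print(round(0.5), round(1.5), round(2.5))   # 0 2 2
# ...while Decimal with ROUND_HALF_UP matches SQL-style ROUND().
print([int(Decimal(str(x)).quantize(Decimal("1"), rounding=ROUND_HALF_UP))
       for x in (0.5, 1.5, 2.5)])           # [1, 2, 3]

# Worked example of the G2 raw score for an illustrative prediction of 0.5:
p = 0.5
raw = (p * 100) * 20 + math.log((p + 0.000001) * 100, 2) * 41.6
print(raw)  # ~1234.78, which rounds half-up to 1235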

View File

@@ -1,4 +1,5 @@
 import logging
+import numpy as np

 # Configure logging
 logging.basicConfig(
@@ -7,7 +8,41 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)

-def pre_processing(results):
+G2_PREDICTORS = [
+    "hd_score_m2",
+    "rejected_app_count",
+    "hd_score_m2_connected_max",
+    "hd_score_m2_connected_avg",
+    "applicant_age_connected_max",
+    "applicant_age_connected_avg",
+    "account_tel_first_seen_min_conn",
+    "account_tel_first_seen_max_conn",
+    "account_tel_first_seen_avg_conn",
+    "ssn_hash_first_seen_min_conn",
+    "ssn_hash_first_seen_avg_conn",
+    "account_login_first_seen_min_conn",
+    "digital_id_first_seen_max_conn",
+    "true_ip_first_seen_min_conn",
+    "true_ip_first_seen_max_conn",
+    "dist_em_ip_ref_km_min_conn",
+    "pct_acc_email_attr_challenged_1_conn",
+    "account_login_first_seen_range_conn",
+    "account_login_first_seen_stddev_conn",
+    "cpu_clock_range_conn",
+    "summary_risk_score_max_conn",
+]
+
+
+def _coerce_float(value):
+    if value is None:
+        return np.nan
+    try:
+        return float(value)
+    except (TypeError, ValueError):
+        return np.nan
+
+
+def pre_processing_g1(results):
     result = results[0]
     dtypes = {
         "hd_score_m1": float,
@@ -35,3 +70,22 @@ def pre_processing(results):
         data[col] = dtype(value) if value.replace(".", "", 1).isdigit() else None
     return data
+
+
+def pre_processing_g2(results):
+    result = results[0]
+    working = dict(result)
+    if "rejected_app_count_g2" in working:
+        # Always prefer the G2-specific count for G2 preprocessing
+        working["rejected_app_count"] = working.get("rejected_app_count_g2")
+    data = {}
+    for feature in G2_PREDICTORS:
+        data[feature] = _coerce_float(working.get(feature))
+    data["cluster_size"] = working.get("cluster_size")
+    return data
+
+
+# Backward compatibility alias
+pre_processing = pre_processing_g1
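`_coerce_float` deliberately maps anything non-numeric to `np.nan` rather than raising, so one malformed feature cannot take down G2 preprocessing; xgboost then treats the NaN as a missing value. A self-contained sketch with illustrative inputs:

import numpy as np

def _coerce_float(value):
    if value is None:
        return np.nan
    try:
        return float(value)
    except (TypeError, ValueError):
        return np.nan

print(_coerce_float("12.5"))   # 12.5  (numeric strings are accepted)
print(_coerce_float(None))     # nan
print(_coerce_float("N/A"))    # nan   (unparseable strings degrade to missing)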

View File

@@ -11,24 +11,51 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)

-def processing(data):
+def processing_g1(data):
     df = pd.DataFrame([data])
     if df.empty:
         logger.error("Input DataFrame is empty.")

     # Load Model
-    model_path = "./xgboost_model.joblib"
-    # model_path = "C:/Users/abinisha/habemco_flowx/g1_v1/xgboost_model.joblib"
+    # model_path = "C:/Users/abinisha/habemco_flowx/g1_v1/xgboost_model_G1.joblib"
+    model_path = "./xgboost_model_G1.joblib"
     model = joblib.load(model_path)
     expected_features = model.feature_names
-    df = df.applymap(lambda x: float('nan') if x is None else x)
-    dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=float('nan'))
+    df = df.applymap(lambda x: float("nan") if x is None else x)
+    dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=float("nan"))
     prediction = model.predict(dmatrix)
-    df['prediction'] = prediction
+    df["prediction"] = prediction
     return df.iloc[0].to_dict()
+
+
+def processing_g2(data):
+    df = pd.DataFrame([data])
+    if df.empty:
+        logger.error("Input DataFrame is empty.")
+
+    # model_path = "C:/Users/abinisha/habemco_flowx/g1_v1/xgboost_model_G2.joblib"
+    model_path = "./xgboost_model_G2.joblib"
+    model = joblib.load(model_path)
+    expected_features = model.feature_names
+    df = df.reindex(columns=expected_features)
+    df = df.applymap(lambda x: float("nan") if x is None else x)
+    dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=float("nan"))
+    prediction = model.predict(dmatrix)
+    df["prediction_g2"] = prediction
+    return df.iloc[0].to_dict()
+
+
+# Backward compatibility alias
+processing = processing_g1
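The `reindex(columns=expected_features)` call is what lets `processing_g2` accept a partial feature dict: columns the model expects but the input lacks are created as NaN (which DMatrix then treats as missing), and the columns are put in the model's training order. A sketch using a few of the real G2 feature names:

import pandas as pd

expected_features = ["hd_score_m2", "rejected_app_count", "cpu_clock_range_conn"]
df = pd.DataFrame([{"rejected_app_count": 6.0, "hd_score_m2": 1188.0}])

# Missing columns appear as NaN; column order now matches the training layout.
df = df.reindex(columns=expected_features)
print(df)
#    hd_score_m2  rejected_app_count  cpu_clock_range_conn
# 0       1188.0                 6.0                   NaN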

View File

@@ -10,6 +10,10 @@
         "type": ["number", "null"],
         "description": "HD fraud Score G1"
     },
+    "hd_score_g2": {
+        "type": ["number", "null"],
+        "description": "HD fraud Score G2"
+    },
     "cluster_size_users_v2": {
         "type": ["number", "null"],
         "description": "Size of the user cluster in version 2."
@@ -36,6 +40,3 @@
     }
   }
 }
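A hedged sketch of checking a block result against the new property, using the third-party jsonschema package and a trimmed-down copy of the schema (only the two score properties are reproduced here; the full schema file is not shown in the diff):

from jsonschema import validate  # pip install jsonschema

schema = {
    "type": "object",
    "properties": {
        "hd_score_g1": {"type": ["number", "null"],
                        "description": "HD fraud Score G1"},
        "hd_score_g2": {"type": ["number", "null"],
                        "description": "HD fraud Score G2"},
    },
}

# Both a numeric score and an explicit null pass, matching ["number", "null"].
validate({"hd_score_g1": 1235, "hd_score_g2": None}, schema)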

View File

@@ -1,27 +1,46 @@
 import unittest
+import pandas as pd
 from block import __main__

 data = [{
-    # "application_key": "0A123C7F-BE45-4912-8E22-0904707325E7",
-    "hd_score_m1": 1211,
-    "cluster_size_users_v2": 2,
-    "target_connected_30_sum": 0,
-    "email_cnt": 1,
-    "rejected_app_count": 2,
-    "app_dt_day_cnt": 2,
-    "cluster_size": 3,
-    "hd_score_iso_m2": 1202
+    "application_key": "A3CDD39F-10F8-40B0-A4C9-0E1558B75131",
+    "hd_score_m1": 1101.0,
+    "hd_score_iso_m2": 1113,
+    "cluster_size": 10,
+    "cluster_size_users_v2": 3,
+    "target_connected_30_sum": 0.0,
+    "email_cnt": 3,
+    "rejected_app_count": 6.0,
+    "app_dt_day_cnt": 7,
+    "hd_score_m2": 1188,
+    "hd_score_m2_connected_max": 1197.0,
+    "hd_score_m2_connected_avg": 1184.888889,
+    "applicant_age_connected_max": 60.0,
+    "applicant_age_connected_avg": 52.44444444,
+    "account_tel_first_seen_min_conn": 879.0,
+    "account_tel_first_seen_max_conn": 989.0,
+    "account_tel_first_seen_avg_conn": 949.6666667,
+    "ssn_hash_first_seen_min_conn": 5.0,
+    "ssn_hash_first_seen_avg_conn": 58.0,
+    "account_login_first_seen_min_conn": 0.0,
+    "digital_id_first_seen_max_conn": 2652.0,
+    "true_ip_first_seen_min_conn": 1857.0,
+    "true_ip_first_seen_max_conn": 1967.0,
+    "dist_em_ip_ref_km_min_conn": 17.43689023,
+    "pct_acc_email_attr_challenged_1_conn": 0.0,
+    "account_login_first_seen_range_conn": 2313.0,
+    "account_login_first_seen_stddev_conn": 1042.4994,
+    "cpu_clock_range_conn": 9054.0,
+    "summary_risk_score_max_conn": 14.0
 }]

 class TestBlock(unittest.TestCase):
-    def test_main_success(self):
-        blockResult = __main__(data)
-        # breakpoint()
-        self.assertIsInstance(blockResult, dict, "Result should be a dictionary.")
-        self.assertIn("hd_score_g1", blockResult, "Result dictionary should contain 'hd_score_g1' if success.")
+    def test_main_returns_scores(self):
+        block_result = __main__(data)
+        self.assertIsInstance(block_result, dict)
+        self.assertIn("hd_score_g1", block_result)
+        self.assertIn("hd_score_g2", block_result)

 if __name__ == "__main__":

BIN
xgboost_model_G1.joblib Normal file

Binary file not shown.

BIN
xgboost_model_G2.joblib Normal file

Binary file not shown.
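Since the two .joblib artifacts cannot be rendered in the diff, a quick sanity check, assuming each file is a pickled xgboost Booster as graph_processing implies, is to load one and inspect its expected feature list:

import joblib

# Assumes the artifact is an xgboost Booster, as graph_processing implies.
booster = joblib.load("./xgboost_model_G2.joblib")
print(booster.feature_names)   # should line up with G2_PREDICTORS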