Add G2 pipeline, models, and schema for g1_v1

2025-11-26 11:49:57 -05:00 · 2025-11-26 11:49:57 -05:00 · 9f3cb9ca4f
commit 9f3cb9ca4f
parent e58625ad19
8 changed files with 179 additions and 44 deletions
--- a/block.py
+++ b/block.py
@ -1,8 +1,8 @@
 import logging
 from typing import List, Dict
-from graph_pre_processing import pre_processing
-from graph_processing import processing
-from graph_post_processing import post_processing
+from graph_pre_processing import pre_processing_g1, pre_processing_g2
+from graph_processing import processing_g1, processing_g2
+from graph_post_processing import post_processing_g1, post_processing_g2

 # Configure logging
 logging.basicConfig(
@ -12,20 +12,30 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)


-def __main__(results: List[Dict]) -> List[Dict]:
-    logger.info(f"data receiving in g1v1 block: {results}")
-    data = pre_processing(results)
-    logger.info(f"pre_processed_data, new_user_app_data: {data}")
+def __main__(results: List[Dict]) -> Dict:
+    logger.info("data receiving in g1v1 block: %s", results)
+    g1_input = pre_processing_g1(results)
+    g2_input = pre_processing_g2(results)
+    logger.info("pre_processed_data_g1: %s", g1_input)
+    logger.info("pre_processed_data_g2: %s", g2_input)

-    # df = processing(data)
-    if data.get("cluster_size", 2) < 2:
-        data["prediction"] = 0
+    cluster_size = g1_input.get("cluster_size", 2)
+    if cluster_size is None:
+        cluster_size = 2
+
+    if cluster_size < 2:
+        g1_processed = {**g1_input, "prediction": 0}
+        g2_processed = {**g2_input, "prediction_g2": 0}
    else:
-        data = processing(data)
-    logger.info("prediction: %.8f", float(data['prediction']))
+        g1_processed = processing_g1(g1_input)
+        g2_processed = processing_g2(g2_input)

-    # Post-processing: calculate the Final Score and update the dataframe.
-    final = post_processing(data)
+    logger.info("prediction_g1: %.8f", float(g1_processed.get("prediction", 0)))
+    logger.info("prediction_g2: %.8f", float(g2_processed.get("prediction_g2", 0)))
+
+    final_g1 = post_processing_g1(g1_processed)
+    final_g2 = post_processing_g2(g2_processed)
+    final = {**final_g1, **final_g2}
    logger.info(final)

    return final
--- a/graph_post_processing.py
+++ b/graph_post_processing.py
@ -1,5 +1,6 @@
 import logging
 import math
+from decimal import Decimal, ROUND_HALF_UP

 # Configure logging
 logging.basicConfig(
@ -8,7 +9,8 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)

-def post_processing(data):
+
+def post_processing_g1(data):
    try:
        prediction = data.get("prediction", 0)
        score_g1 = round(
@ -17,9 +19,9 @@ def post_processing(data):
            0
        )
        data["hd_score_g1"] = score_g1
-        logger.info(f"score_g1 calculated: {score_g1}")
+        logger.info("score_g1 calculated: %s", score_g1)
    except Exception as e:
-        logger.error(f"Error processing score_g1 calculations: {e}")
+        logger.error("Error processing score_g1 calculations: %s", e)

    return {
        key: data.get(key, None)
@ -29,3 +31,25 @@ def post_processing(data):
            "app_dt_day_cnt", "hd_score_iso_m2"
        ]
    }
+
+
+def post_processing_g2(data):
+    prediction = data.get("prediction_g2", data.get("prediction"))
+    hd_score_g2 = None
+
+    try:
+        if prediction is not None:
+            prediction_val = float(prediction)
+            raw_score = (prediction_val * 100) * 20 + math.log((prediction_val + 0.000001) * 100, 2) * 41.6
+            # SQL-like rounding (half up)
+            hd_score_g2 = int(Decimal(str(raw_score)).quantize(Decimal("1"), rounding=ROUND_HALF_UP))
+            hd_score_g2 = max(hd_score_g2, 0.0)
+            logger.info("score_g2 calculated: %s", hd_score_g2)
+    except Exception as e:
+        logger.error("Error processing score_g2 calculations: %s", e)
+
+    return {"hd_score_g2": hd_score_g2}
+
+
+# Backward compatibility alias
+post_processing = post_processing_g1
--- a/graph_pre_processing.py
+++ b/graph_pre_processing.py
@ -1,4 +1,5 @@
 import logging
+import numpy as np

 # Configure logging
 logging.basicConfig(
@ -7,7 +8,41 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)

-def pre_processing(results):
+G2_PREDICTORS = [
+    "hd_score_m2",
+    "rejected_app_count",
+    "hd_score_m2_connected_max",
+    "hd_score_m2_connected_avg",
+    "applicant_age_connected_max",
+    "applicant_age_connected_avg",
+    "account_tel_first_seen_min_conn",
+    "account_tel_first_seen_max_conn",
+    "account_tel_first_seen_avg_conn",
+    "ssn_hash_first_seen_min_conn",
+    "ssn_hash_first_seen_avg_conn",
+    "account_login_first_seen_min_conn",
+    "digital_id_first_seen_max_conn",
+    "true_ip_first_seen_min_conn",
+    "true_ip_first_seen_max_conn",
+    "dist_em_ip_ref_km_min_conn",
+    "pct_acc_email_attr_challenged_1_conn",
+    "account_login_first_seen_range_conn",
+    "account_login_first_seen_stddev_conn",
+    "cpu_clock_range_conn",
+    "summary_risk_score_max_conn",
+]
+
+
+def _coerce_float(value):
+    if value is None:
+        return np.nan
+    try:
+        return float(value)
+    except (TypeError, ValueError):
+        return np.nan
+
+
+def pre_processing_g1(results):
    result = results[0]
    dtypes = {
        "hd_score_m1": float,
@ -35,3 +70,22 @@ def pre_processing(results):
            data[col] = dtype(value) if value.replace(".", "", 1).isdigit() else None

    return data
+
+
+def pre_processing_g2(results):
+    result = results[0]
+    working = dict(result)
+    if "rejected_app_count_g2" in working:
+        # Always prefer the G2-specific count for G2 preprocessing
+        working["rejected_app_count"] = working.get("rejected_app_count_g2")
+
+    data = {}
+    for feature in G2_PREDICTORS:
+        data[feature] = _coerce_float(working.get(feature))
+
+    data["cluster_size"] = working.get("cluster_size")
+    return data
+
+
+# Backward compatibility alias
+pre_processing = pre_processing_g1
--- a/graph_processing.py
+++ b/graph_processing.py
@ -11,24 +11,51 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)


-def processing(data):
+def processing_g1(data):
    df = pd.DataFrame([data])
    if df.empty:
        logger.error("Input DataFrame is empty.")

    # Load Model
-    model_path = "./xgboost_model.joblib"
-    # model_path ="C:/Users/abinisha/habemco_flowx/g1_v1/xgboost_model.joblib"
+    # model_path = "C:/Users/abinisha/habemco_flowx/g1_v1/xgboost_model_G1.joblib"
+    model_path = "./xgboost_model_G1.joblib"
    model = joblib.load(model_path)

    expected_features = model.feature_names

-    df = df.applymap(lambda x: float('nan') if x is None else x)
+    df = df.applymap(lambda x: float("nan") if x is None else x)

-    dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=float('nan'))
+    dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=float("nan"))

    prediction = model.predict(dmatrix)

-    df['prediction'] = prediction
+    df["prediction"] = prediction

    return df.iloc[0].to_dict()
+
+
+def processing_g2(data):
+    df = pd.DataFrame([data])
+    if df.empty:
+        logger.error("Input DataFrame is empty.")
+
+    # model_path = "C:/Users/abinisha/habemco_flowx/g1_v1/xgboost_model_G2.joblib"
+    model_path = "./xgboost_model_G2.joblib"
+    model = joblib.load(model_path)
+
+    expected_features = model.feature_names
+
+    df = df.reindex(columns=expected_features)
+    df = df.applymap(lambda x: float("nan") if x is None else x)
+
+    dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=float("nan"))
+
+    prediction = model.predict(dmatrix)
+
+    df["prediction_g2"] = prediction
+
+    return df.iloc[0].to_dict()
+
+
+# Backward compatibility alias
+processing = processing_g1
--- a/response_schema.json
+++ b/response_schema.json
@ -10,6 +10,10 @@
      "type": ["number", "null"],
      "description": "HD fraud Score G1"
    },
+    "hd_score_g2": {
+      "type": ["number", "null"],
+      "description": "HD fraud Score G2"
+    },
    "cluster_size_users_v2": {
      "type": ["number", "null"],
      "description": "Size of the user cluster in version 2."
@ -36,6 +40,3 @@
    }
  }
 }
-
-
-
--- a/test_block.py
+++ b/test_block.py
@ -1,27 +1,46 @@
 import unittest
-import pandas as pd
 from block import __main__


 data = [{
-    #   "application_key": "0A123C7F-BE45-4912-8E22-0904707325E7",
-      "hd_score_m1": 1211,
-      "cluster_size_users_v2": 2,
-      "target_connected_30_sum": 0,
-      "email_cnt": 1,
-      "rejected_app_count": 2,
-      "app_dt_day_cnt": 2,
-      "cluster_size": 3,
-      "hd_score_iso_m2": 1202
+            "application_key": "A3CDD39F-10F8-40B0-A4C9-0E1558B75131",
+            "hd_score_m1": 1101.0,
+            "hd_score_iso_m2": 1113,
+            "cluster_size": 10,
+            "cluster_size_users_v2": 3,
+            "target_connected_30_sum": 0.0,
+            "email_cnt": 3,
+            "rejected_app_count": 6.0,
+            "app_dt_day_cnt": 7,
+            "hd_score_m2": 1188,
+            "hd_score_m2_connected_max": 1197.0,
+            "hd_score_m2_connected_avg": 1184.888889,
+            "applicant_age_connected_max": 60.0,
+            "applicant_age_connected_avg": 52.44444444,
+            "account_tel_first_seen_min_conn": 879.0,
+            "account_tel_first_seen_max_conn": 989.0,
+            "account_tel_first_seen_avg_conn": 949.6666667,
+            "ssn_hash_first_seen_min_conn": 5.0,
+            "ssn_hash_first_seen_avg_conn": 58.0,
+            "account_login_first_seen_min_conn": 0.0,
+            "digital_id_first_seen_max_conn": 2652.0,
+            "true_ip_first_seen_min_conn": 1857.0,
+            "true_ip_first_seen_max_conn": 1967.0,
+            "dist_em_ip_ref_km_min_conn": 17.43689023,
+            "pct_acc_email_attr_challenged_1_conn": 0.0,
+            "account_login_first_seen_range_conn": 2313.0,
+            "account_login_first_seen_stddev_conn": 1042.4994,
+            "cpu_clock_range_conn": 9054.0,
+            "summary_risk_score_max_conn": 14.0
        }]

-class TestBlock(unittest.TestCase):
-    def test_main_success(self):
-        blockResult = __main__(data) 

-        # breakpoint()
-        self.assertIsInstance(blockResult, dict, "Result should be a dictionary.")
-        self.assertIn("hd_score_g1", blockResult, "Result dictionary should contain 'hd_score_g1' if success.")
+class TestBlock(unittest.TestCase):
+    def test_main_returns_scores(self):
+        block_result = __main__(data)
+        self.assertIsInstance(block_result, dict)
+        self.assertIn("hd_score_g1", block_result)
+        self.assertIn("hd_score_g2", block_result)


 if __name__ == "__main__":
--- a/xgboost_model_G1.joblib
+++ b/xgboost_model_G1.joblib
--- a/xgboost_model_G2.joblib
+++ b/xgboost_model_G2.joblib