diff --git a/block.py b/block.py index 662bfba..cf1829b 100644 --- a/block.py +++ b/block.py @@ -1,8 +1,8 @@ import logging from typing import List, Dict -from graph_pre_processing import pre_processing -from graph_processing import processing -from graph_post_processing import post_processing +from graph_pre_processing import pre_processing_g1, pre_processing_g2 +from graph_processing import processing_g1, processing_g2 +from graph_post_processing import post_processing_g1, post_processing_g2 # Configure logging logging.basicConfig( @@ -12,20 +12,30 @@ logging.basicConfig( logger = logging.getLogger(__name__) -def __main__(results: List[Dict]) -> List[Dict]: - logger.info(f"data receiving in g1v1 block: {results}") - data = pre_processing(results) - logger.info(f"pre_processed_data, new_user_app_data: {data}") +def __main__(results: List[Dict]) -> Dict: + logger.info("data receiving in g1v1 block: %s", results) + g1_input = pre_processing_g1(results) + g2_input = pre_processing_g2(results) + logger.info("pre_processed_data_g1: %s", g1_input) + logger.info("pre_processed_data_g2: %s", g2_input) - # df = processing(data) - if data.get("cluster_size", 2) < 2: - data["prediction"] = 0 + cluster_size = g1_input.get("cluster_size", 2) + if cluster_size is None: + cluster_size = 2 + + if cluster_size < 2: + g1_processed = {**g1_input, "prediction": 0} + g2_processed = {**g2_input, "prediction_g2": 0} else: - data = processing(data) - logger.info("prediction: %.8f", float(data['prediction'])) + g1_processed = processing_g1(g1_input) + g2_processed = processing_g2(g2_input) - # Post-processing: calculate the Final Score and update the dataframe. - final = post_processing(data) + logger.info("prediction_g1: %.8f", float(g1_processed.get("prediction", 0))) + logger.info("prediction_g2: %.8f", float(g2_processed.get("prediction_g2", 0))) + + final_g1 = post_processing_g1(g1_processed) + final_g2 = post_processing_g2(g2_processed) + final = {**final_g1, **final_g2} logger.info(final) return final diff --git a/graph_post_processing.py b/graph_post_processing.py index 77c58a0..7f82e4d 100644 --- a/graph_post_processing.py +++ b/graph_post_processing.py @@ -1,5 +1,6 @@ import logging import math +from decimal import Decimal, ROUND_HALF_UP # Configure logging logging.basicConfig( @@ -8,7 +9,8 @@ logging.basicConfig( ) logger = logging.getLogger(__name__) -def post_processing(data): + +def post_processing_g1(data): try: prediction = data.get("prediction", 0) score_g1 = round( @@ -17,9 +19,9 @@ def post_processing(data): 0 ) data["hd_score_g1"] = score_g1 - logger.info(f"score_g1 calculated: {score_g1}") + logger.info("score_g1 calculated: %s", score_g1) except Exception as e: - logger.error(f"Error processing score_g1 calculations: {e}") + logger.error("Error processing score_g1 calculations: %s", e) return { key: data.get(key, None) @@ -29,3 +31,25 @@ def post_processing(data): "app_dt_day_cnt", "hd_score_iso_m2" ] } + + +def post_processing_g2(data): + prediction = data.get("prediction_g2", data.get("prediction")) + hd_score_g2 = None + + try: + if prediction is not None: + prediction_val = float(prediction) + raw_score = (prediction_val * 100) * 20 + math.log((prediction_val + 0.000001) * 100, 2) * 41.6 + # SQL-like rounding (half up) + hd_score_g2 = int(Decimal(str(raw_score)).quantize(Decimal("1"), rounding=ROUND_HALF_UP)) + hd_score_g2 = max(hd_score_g2, 0.0) + logger.info("score_g2 calculated: %s", hd_score_g2) + except Exception as e: + logger.error("Error processing score_g2 calculations: %s", e) + + return {"hd_score_g2": hd_score_g2} + + +# Backward compatibility alias +post_processing = post_processing_g1 diff --git a/graph_pre_processing.py b/graph_pre_processing.py index 56f7023..b67dd0a 100644 --- a/graph_pre_processing.py +++ b/graph_pre_processing.py @@ -1,4 +1,5 @@ import logging +import numpy as np # Configure logging logging.basicConfig( @@ -7,7 +8,41 @@ logging.basicConfig( ) logger = logging.getLogger(__name__) -def pre_processing(results): +G2_PREDICTORS = [ + "hd_score_m2", + "rejected_app_count", + "hd_score_m2_connected_max", + "hd_score_m2_connected_avg", + "applicant_age_connected_max", + "applicant_age_connected_avg", + "account_tel_first_seen_min_conn", + "account_tel_first_seen_max_conn", + "account_tel_first_seen_avg_conn", + "ssn_hash_first_seen_min_conn", + "ssn_hash_first_seen_avg_conn", + "account_login_first_seen_min_conn", + "digital_id_first_seen_max_conn", + "true_ip_first_seen_min_conn", + "true_ip_first_seen_max_conn", + "dist_em_ip_ref_km_min_conn", + "pct_acc_email_attr_challenged_1_conn", + "account_login_first_seen_range_conn", + "account_login_first_seen_stddev_conn", + "cpu_clock_range_conn", + "summary_risk_score_max_conn", +] + + +def _coerce_float(value): + if value is None: + return np.nan + try: + return float(value) + except (TypeError, ValueError): + return np.nan + + +def pre_processing_g1(results): result = results[0] dtypes = { "hd_score_m1": float, @@ -35,3 +70,22 @@ def pre_processing(results): data[col] = dtype(value) if value.replace(".", "", 1).isdigit() else None return data + + +def pre_processing_g2(results): + result = results[0] + working = dict(result) + if "rejected_app_count_g2" in working: + # Always prefer the G2-specific count for G2 preprocessing + working["rejected_app_count"] = working.get("rejected_app_count_g2") + + data = {} + for feature in G2_PREDICTORS: + data[feature] = _coerce_float(working.get(feature)) + + data["cluster_size"] = working.get("cluster_size") + return data + + +# Backward compatibility alias +pre_processing = pre_processing_g1 diff --git a/graph_processing.py b/graph_processing.py index 7ecae86..9611c21 100644 --- a/graph_processing.py +++ b/graph_processing.py @@ -11,24 +11,51 @@ logging.basicConfig( logger = logging.getLogger(__name__) -def processing(data): +def processing_g1(data): df = pd.DataFrame([data]) if df.empty: logger.error("Input DataFrame is empty.") # Load Model - model_path = "./xgboost_model.joblib" - # model_path ="C:/Users/abinisha/habemco_flowx/g1_v1/xgboost_model.joblib" + # model_path = "C:/Users/abinisha/habemco_flowx/g1_v1/xgboost_model_G1.joblib" + model_path = "./xgboost_model_G1.joblib" model = joblib.load(model_path) expected_features = model.feature_names - df = df.applymap(lambda x: float('nan') if x is None else x) + df = df.applymap(lambda x: float("nan") if x is None else x) - dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=float('nan')) + dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=float("nan")) prediction = model.predict(dmatrix) - df['prediction'] = prediction + df["prediction"] = prediction return df.iloc[0].to_dict() + + +def processing_g2(data): + df = pd.DataFrame([data]) + if df.empty: + logger.error("Input DataFrame is empty.") + + # model_path = "C:/Users/abinisha/habemco_flowx/g1_v1/xgboost_model_G2.joblib" + model_path = "./xgboost_model_G2.joblib" + model = joblib.load(model_path) + + expected_features = model.feature_names + + df = df.reindex(columns=expected_features) + df = df.applymap(lambda x: float("nan") if x is None else x) + + dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=float("nan")) + + prediction = model.predict(dmatrix) + + df["prediction_g2"] = prediction + + return df.iloc[0].to_dict() + + +# Backward compatibility alias +processing = processing_g1 diff --git a/response_schema.json b/response_schema.json index 1b53838..57aefb2 100644 --- a/response_schema.json +++ b/response_schema.json @@ -10,6 +10,10 @@ "type": ["number", "null"], "description": "HD fraud Score G1" }, + "hd_score_g2": { + "type": ["number", "null"], + "description": "HD fraud Score G2" + }, "cluster_size_users_v2": { "type": ["number", "null"], "description": "Size of the user cluster in version 2." @@ -36,6 +40,3 @@ } } } - - - diff --git a/test_block.py b/test_block.py index 5de8a6e..4dc4a3f 100644 --- a/test_block.py +++ b/test_block.py @@ -1,27 +1,46 @@ import unittest -import pandas as pd from block import __main__ data = [{ - # "application_key": "0A123C7F-BE45-4912-8E22-0904707325E7", - "hd_score_m1": 1211, - "cluster_size_users_v2": 2, - "target_connected_30_sum": 0, - "email_cnt": 1, - "rejected_app_count": 2, - "app_dt_day_cnt": 2, - "cluster_size": 3, - "hd_score_iso_m2": 1202 - }] + "application_key": "A3CDD39F-10F8-40B0-A4C9-0E1558B75131", + "hd_score_m1": 1101.0, + "hd_score_iso_m2": 1113, + "cluster_size": 10, + "cluster_size_users_v2": 3, + "target_connected_30_sum": 0.0, + "email_cnt": 3, + "rejected_app_count": 6.0, + "app_dt_day_cnt": 7, + "hd_score_m2": 1188, + "hd_score_m2_connected_max": 1197.0, + "hd_score_m2_connected_avg": 1184.888889, + "applicant_age_connected_max": 60.0, + "applicant_age_connected_avg": 52.44444444, + "account_tel_first_seen_min_conn": 879.0, + "account_tel_first_seen_max_conn": 989.0, + "account_tel_first_seen_avg_conn": 949.6666667, + "ssn_hash_first_seen_min_conn": 5.0, + "ssn_hash_first_seen_avg_conn": 58.0, + "account_login_first_seen_min_conn": 0.0, + "digital_id_first_seen_max_conn": 2652.0, + "true_ip_first_seen_min_conn": 1857.0, + "true_ip_first_seen_max_conn": 1967.0, + "dist_em_ip_ref_km_min_conn": 17.43689023, + "pct_acc_email_attr_challenged_1_conn": 0.0, + "account_login_first_seen_range_conn": 2313.0, + "account_login_first_seen_stddev_conn": 1042.4994, + "cpu_clock_range_conn": 9054.0, + "summary_risk_score_max_conn": 14.0 + }] + class TestBlock(unittest.TestCase): - def test_main_success(self): - blockResult = __main__(data) - - # breakpoint() - self.assertIsInstance(blockResult, dict, "Result should be a dictionary.") - self.assertIn("hd_score_g1", blockResult, "Result dictionary should contain 'hd_score_g1' if success.") + def test_main_returns_scores(self): + block_result = __main__(data) + self.assertIsInstance(block_result, dict) + self.assertIn("hd_score_g1", block_result) + self.assertIn("hd_score_g2", block_result) if __name__ == "__main__": diff --git a/xgboost_model_G1.joblib b/xgboost_model_G1.joblib new file mode 100644 index 0000000..0805b92 Binary files /dev/null and b/xgboost_model_G1.joblib differ diff --git a/xgboost_model_G2.joblib b/xgboost_model_G2.joblib new file mode 100644 index 0000000..592f853 Binary files /dev/null and b/xgboost_model_G2.joblib differ