diff --git a/README.md b/README.md index 59a3efc..d76e1ab 100644 --- a/README.md +++ b/README.md @@ -1 +1,2 @@ -**Hello world!!!** +# G Series Block +Loads and scores the G 1 v1 model diff --git a/block.py b/block.py index 3b227f9..662bfba 100644 --- a/block.py +++ b/block.py @@ -1,21 +1,34 @@ -@flowx_block -def example_function(request: dict) -> dict: +import logging +from typing import List, Dict +from graph_pre_processing import pre_processing +from graph_processing import processing +from graph_post_processing import post_processing - # Processing logic here... +# Configure logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(name)s - %(message)s", +) +logger = logging.getLogger(__name__) - return { - "meta_info": [ - { - "name": "created_date", - "type": "string", - "value": "2024-11-05" - } - ], - "fields": [ - { - "name": "", - "type": "", - "value": "" - } - ] - } + +def __main__(results: List[Dict]) -> List[Dict]: + logger.info(f"data receiving in g1v1 block: {results}") + data = pre_processing(results) + logger.info(f"pre_processed_data, new_user_app_data: {data}") + + # df = processing(data) + if data.get("cluster_size", 2) < 2: + data["prediction"] = 0 + else: + data = processing(data) + logger.info("prediction: %.8f", float(data['prediction'])) + + # Post-processing: calculate the Final Score and update the dataframe. 
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Keys copied from the scored record into the block response, in output order.
_RESPONSE_KEYS = (
    "hd_score_m1",
    "hd_score_g1",
    "cluster_size_users_v2",
    "target_connected_30_sum",
    "email_cnt",
    "rejected_app_count",
    "app_dt_day_cnt",
)


def _score_g1(prediction):
    """Convert a model prediction into the rounded G1 score.

    The score is the sum of a linear term capped at 89 and a logarithmic
    term floored at 0, both computed from ``prediction * 100``. The
    0.000001 offset keeps ``log2`` defined for a prediction of exactly 0.

    Raises:
        TypeError: if ``prediction`` is not numeric (e.g. ``None`` or a str).
        ValueError: if the logarithm is undefined (negative prediction) or
            the resulting score is NaN or infinite.
    """
    scaled = prediction * 100
    linear_part = min(scaled + 0.00001, 1) * 89
    log_part = max(math.log2(scaled + 0.000001) * 193, 0)
    score = round(linear_part + log_part, 0)
    # NaN/inf predictions flow through min/max/round without raising; reject
    # them here so a non-JSON-serialisable number never reaches the response.
    if not math.isfinite(score):
        raise ValueError(f"non-finite score for prediction {prediction!r}")
    return score


def post_processing(data):
    """Compute ``hd_score_g1`` from the prediction and build the response.

    Args:
        data: Mapping holding the pre-processed features and, optionally,
            ``"prediction"`` (treated as 0 when absent). On success the
            mapping is updated in place with ``"hd_score_g1"``.

    Returns:
        A dict with exactly the response keys (``hd_score_m1``,
        ``hd_score_g1``, ``cluster_size_users_v2``,
        ``target_connected_30_sum``, ``email_cnt``, ``rejected_app_count``,
        ``app_dt_day_cnt``). Keys missing from ``data`` map to ``None``;
        ``hd_score_g1`` stays ``None`` when the score cannot be computed.
    """
    prediction = data.get("prediction", 0)
    try:
        score_g1 = _score_g1(prediction)
    except (TypeError, ValueError, ArithmeticError):
        # Best effort: a bad prediction must not fail the whole block, but
        # keep the traceback so the cause is visible in the logs.
        logger.exception(
            "Error processing score_g1 calculations for prediction %r",
            prediction,
        )
    else:
        data["hd_score_g1"] = score_g1
        logger.info("score_g1 calculated: %s", score_g1)

    return {key: data.get(key) for key in _RESPONSE_KEYS}
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
)
logger = logging.getLogger(__name__)


def processing(data, model_path="./xgboost_model.joblib"):
    """Score one feature record with the stored XGBoost model.

    Args:
        data: Dict mapping feature name to value. ``None`` values are
            treated as missing and passed to the model as NaN.
        model_path: Location of the joblib-serialised model. The default is
            relative to the current working directory.

    Returns:
        The record as a dict (as produced by ``DataFrame.iloc[0].to_dict()``)
        with the model output added under ``"prediction"``.

    Raises:
        ValueError: if ``data`` is empty or ``None``.
    """
    if not data:
        # Fail before loading the model: an empty record can never supply
        # the model's features and would otherwise die later with an
        # unrelated KeyError.
        logger.error("Input DataFrame is empty.")
        raise ValueError("processing() requires a non-empty feature record")

    # Replace None with NaN up front so the frame is built with proper
    # missing values (avoids the deprecated DataFrame.applymap).
    nan = float("nan")
    row = {key: (nan if value is None else value) for key, value in data.items()}
    df = pd.DataFrame([row])

    # joblib.load unpickles the file, so model_path must point to a trusted
    # artifact.
    # NOTE(review): the model is re-read from disk on every call; consider
    # caching it if the hosting runtime allows sharing one instance.
    model = joblib.load(model_path)

    # Presumably an xgboost Booster (exposes feature_names and accepts a
    # DMatrix in predict) - confirm against the training pipeline.
    expected_features = model.feature_names

    dmatrix = xgb.DMatrix(
        df[expected_features],
        enable_categorical=True,
        missing=nan,
    )
    df["prediction"] = model.predict(dmatrix)

    return df.iloc[0].to_dict()
# Single-record request payload used as the block input in the smoke test.
data = [
    {
        "hd_score_m1": 1211,
        "cluster_size_users_v2": 2,
        "target_connected_30_sum": 0,
        "email_cnt": 1,
        "rejected_app_count": 2,
        "app_dt_day_cnt": 2,
        "cluster_size": 3,
    }
]


class TestBlock(unittest.TestCase):
    """Smoke test for the block entry point."""

    def test_main_success(self):
        """The block returns a dict that carries the G1 score key."""
        outcome = __main__(data)

        self.assertIsInstance(
            outcome,
            dict,
            msg="Result should be a dictionary.",
        )
        self.assertIn(
            "hd_score_g1",
            outcome,
            msg="Result dictionary should contain 'hd_score_g1' if success.",
        )


if __name__ == "__main__":
    unittest.main()