Sync m-1-v-1 block with local updates

parent 1bf55226e1
commit d0f4d225ee

block.py (242 lines changed)
@@ -1,12 +1,12 @@
-import pandas as pd
 import logging
-import json
 import jmespath
-import regex as re
-from pre_processing import pre_processing
-from processing import processing
-from post_processing import post_processing
 import json_repair
+import pandas as pd
+import regex as re
+from pre_processing import pre_processing_all
+from processing import processing_all
+from post_processing import post_processing_all


 # Configure logging
@@ -16,7 +16,7 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)

-_JSON_LIKE = re.compile(r'^\s*\?*[\{\[].*[\}\]]\s*$', re.DOTALL)
+_JSON_LIKE = re.compile(r"^\s*\?*[\{\[].*[\}\]]\s*$", re.DOTALL)


 def extract_value(blob, expression):
@@ -25,14 +25,13 @@ def extract_value(blob, expression):
     except Exception:
         return None


 def coalesce(*args):
     for value in args:
         if value is not None:
             return value
     return None

-# New sanitize blob function


 def deep_repair(obj):
     # 1) If it's a string that *looks* like JSON (with or without one leading '?'),
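For orientation while reading the hunks below: coalesce() returns the first argument that is not None, which is what lets each list in the expressions dictionary act as an ordered fallback chain. A tiny illustration with made-up values:

assert coalesce(None, None, 42, 7) == 42   # first non-None value wins
assert coalesce(None, None) is None        # nothing matched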
@@ -40,18 +39,17 @@ def deep_repair(obj):
     if isinstance(obj, str):
         s = obj.strip()
         if _JSON_LIKE.match(s):
-            # strip one leading '?' if present
-            if s.startswith('?'):
+            if s.startswith("?"):
                 s = s[1:]
             parsed = json_repair.loads(s)
             return deep_repair(parsed)
         return obj

-    # 2) Dict → recurse on each value
+    # 2) Dict – recurse on each value
     if isinstance(obj, dict):
         return {k: deep_repair(v) for k, v in obj.items()}

-    # 3) List → recurse on each element
+    # 3) List – recurse on each element
     if isinstance(obj, list):
         return [deep_repair(v) for v in obj]

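A minimal sketch of what deep_repair does to one vendor payload, assuming json_repair.loads parses the embedded string the way the json-repair package normally does; the sample blob below is invented for illustration:

# A nested string that looks like JSON (here with a stray leading '?') is
# stripped, parsed with json_repair, and then repaired recursively.
raw = {"tps_vendor_raw_response": '?{"query": {"results": [{"EAScore": "991"}]}}'}
fixed = deep_repair(raw)
# fixed["tps_vendor_raw_response"]["query"]["results"][0]["EAScore"] -> "991"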
@@ -66,34 +64,21 @@ def sanitize_blob(blob):
         logger.error("Failed to sanitize blob: %s", e)
         return None

-# Expressions to extract values
+# Expressions to extract values (M1 + added M2 fields)
 expressions = {
+    # M1 (existing)
     "first_seen_days": [
-        # 1) any vendor under integration_hub_results → first_seen_days
         "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].first_seen_days)[0]",
-        # 2) the flat “dotted” key
-        "Blob.\"emailage.emailriskscore.first_seen_days\"",
-
-        # 3) fallback to the top level tps_vendor_raw_response path
+        'Blob."emailage.emailriskscore.first_seen_days"',
         "Blob.tps_vendor_raw_response.query.results[0].first_seen_days",
     ],
     "ea_score": [
-        # 1) any vendor under integration_hub_results
-        'Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].EAScore',
-        # 2) the flat “dotted” key
+        "Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].EAScore",
         'Blob."emailage.emailriskscore.eascore"',
-
-        # 3) fallback to the top level tps_vendor_raw_response
-        'Blob.tps_vendor_raw_response.query.results[0].EAScore',
+        "Blob.tps_vendor_raw_response.query.results[0].EAScore",
     ],
     "email_creation_days": [
-        # 1) any vendor under integration_hub_results → results[0].email_creation_days
-        "(Blob.integration_hub_results.*"
-        ".tps_vendor_raw_response.query.results[0].email_creation_days)[0]",
-
-        # 2) fallback to the top level tps_vendor_raw_response path
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].email_creation_days)[0]",
         "Blob.tps_vendor_raw_response.query.results[0].email_creation_days",
     ],
     "summary_risk_score": ["Blob.summary_risk_score"],
@@ -102,11 +87,7 @@ expressions = {
     "account_email_worst_score": ["Blob.account_email_worst_score"],
     "true_ip_score": ["Blob.true_ip_score"],
     "ip_net_speed_cell": [
-        # 1) any vendor under integration_hub_results → results[0].ip_netSpeedCell
-        "(Blob.integration_hub_results.*"
-        ".tps_vendor_raw_response.query.results[0].ip_netSpeedCell)[0]",
-
-        # 2) fallback to the top level tps_vendor_raw_response path
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ip_netSpeedCell)[0]",
         "Blob.tps_vendor_raw_response.query.results[0].ip_netSpeedCell",
     ],
     "account_email_score": ["Blob.account_email_score"],
@@ -115,17 +96,105 @@ expressions = {
     "proxy_ip_score": ["Blob.proxy_ip_score"],
     "fuzzy_device_score": ["Blob.fuzzy_device_score"],
     "ip_region_confidence": [
-        # 1) any vendor under integration_hub_results → results[0].ip_regionconf
-        "(Blob.integration_hub_results.*"
-        ".tps_vendor_raw_response.query.results[0].ip_regionconf)[0]",
-
-        # 2) fallback to the top level tps_vendor_raw_response path
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ip_regionconf)[0]",
         "Blob.tps_vendor_raw_response.query.results[0].ip_regionconf",
     ],
     "true_ip_state_confidence": ["Blob.true_ip_state_confidence"],
     "fuzzy_device_worst_score": ["Blob.fuzzy_device_worst_score"],
     "digital_id_confidence_rating": ["Blob.digital_id_confidence_rating"],
     "trueipgeo": ["TrueIpGeo", "Blob.true_ip_geo"],
+    # M2 additions
+    "policy_score": ["Blob.policy_score"],
+    "digital_id_trust_score": ["Blob.digital_id_trust_score"],
+    "proxy_score": ["Blob.proxy_score"],
+    "browser_spoof_score": ["Blob.browser_spoof_score"],
+    "input_ip_connection_type": ["Blob.input_ip_connection_type"],
+    "fuzzy_device_id_confidence": ["Blob.fuzzy_device_id_confidence"],
+    "fraudrisk": [
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].fraudRisk)[0]",
+        "Blob.tps_vendor_raw_response.query.results[0].fraudRisk",
+        'Blob."emailage.emailriskscore.fraudRisk"',
+    ],
+    "overalldigitalidentityscore": [
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].overallDigitalIdentityScore)[0]",
+        "Blob.tps_vendor_raw_response.query.results[0].overallDigitalIdentityScore",
+        'Blob."emailage.emailriskscore.overallDigitalIdentityScore"',
+    ],
+    "totalhits": [
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].totalhits)[0]",
+        "Blob.tps_vendor_raw_response.query.results[0].totalhits",
+        'Blob."emailage.emailriskscore.totalhits"',
+    ],
+    "uniquehits": [
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].uniquehits)[0]",
+        "Blob.tps_vendor_raw_response.query.results[0].uniquehits",
+        'Blob."emailage.emailriskscore.uniquehits"',
+    ],
+    "emailtofullnameconfidence": [
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].emailToFullNameConfidence)[0]",
+        "Blob.tps_vendor_raw_response.query.results[0].emailToFullNameConfidence",
+        'Blob."emailage.emailriskscore.emailToFullNameConfidence"',
+    ],
+    "emailtolastnameconfidence": [
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].emailToLastNameConfidence)[0]",
+        "Blob.tps_vendor_raw_response.query.results[0].emailToLastNameConfidence",
+        'Blob."emailage.emailriskscore.emailToLastNameConfidence"',
+    ],
+    "domain_creation_days": [
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].domain_creation_days)[0]",
+        "Blob.tps_vendor_raw_response.query.results[0].domain_creation_days",
+        'Blob."emailage.emailriskscore.domain_creation_days"',
+    ],
+    "iptophoneconfidence": [
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ipToPhoneConfidence)[0]",
+        "Blob.tps_vendor_raw_response.query.results[0].ipToPhoneConfidence",
+        'Blob."emailage.emailriskscore.ipToPhoneConfidence"',
+    ],
+    "di_autofill_count_login": [
+        "Blob.tmx_variables.di_autofill_count_login",
+        "Blob.policy_details_api.policy_detail_api.customer.rules.vars.variable.di_autofill_count_login",
+    ],
+    "accphone_gbl_velocity_hour": [
+        "Blob.tmx_variables.accphone_gbl_velocity_hour",
+        "Blob.tmx_variables._accphone_gbl_velocity_hour",
+    ],
+    # Lat/long fields for distance engineering
+    "ip_latitude": [
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ip_latitude)[0]",
+        "Blob.tps_vendor_raw_response.query.results[0].ip_latitude",
+    ],
+    "ip_longitude": [
+        "(Blob.integration_hub_results.*.tps_vendor_raw_response.query.results[0].ip_longitude)[0]",
+        "Blob.tps_vendor_raw_response.query.results[0].ip_longitude",
+    ],
+    "tps_ip_latitude": ["Blob.tps_vendor_raw_response.query.results[0].ip_latitude"],
+    "tps_ip_longitude": ["Blob.tps_vendor_raw_response.query.results[0].ip_longitude"],
+    "true_ip_latitude": ["Blob.true_ip_latitude"],
+    "true_ip_longitude": ["Blob.true_ip_longitude"],
+    "proxy_ip_latitude": ["Blob.proxy_ip_latitude"],
+    "proxy_ip_longitude": ["Blob.proxy_ip_longitude"],
+    "dns_ip_latitude": ["Blob.dns_ip_latitude"],
+    "dns_ip_longitude": ["Blob.dns_ip_longitude"],
+    "input_ip_latitude": ["Blob.input_ip_latitude"],
+    "input_ip_longitude": ["Blob.input_ip_longitude"],
+    # First-seen timestamps for age deltas
+    "digital_id_first_seen": ["Blob.digital_id_first_seen"],
+    "account_email_first_seen": ["Blob.account_email_first_seen"],
+    "account_login_first_seen": ["Blob.account_login_first_seen"],
+    "account_telephone_first_seen": ["Blob.account_telephone_first_seen"],
+    "true_ip_first_seen": ["Blob.true_ip_first_seen"],
+    "ssn_hash_first_seen": ["Blob.ssn_hash_first_seen"],
+    "fuzzy_device_first_seen": ["Blob.fuzzy_device_first_seen"],
+    "national_id_first_seen": ["Blob.national_id_first_seen"],
+    "proxy_ip_first_seen": ["Blob.proxy_ip_first_seen"],
+    # Attribute arrays (used for one-hot style parsing)
+    "account_name_activities": ["Blob.account_name_activities"],
+    "account_email_attributes": ["Blob.account_email_attributes"],
+    "true_ip_attributes": ["Blob.true_ip_attributes"],
+    "true_ip_activities": ["Blob.true_ip_activities"],
+    "digital_id_attributes": ["Blob.digital_id_attributes"],
+    "account_telephone_attributes": ["Blob.account_telephone_attributes"],
+    "cpu_clock": ["Blob.cpu_clock"]
 }
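Each entry above is a list of JMESPath expressions tried in order against the sanitized blob. A hedged sketch of that lookup, using jmespath directly on an invented blob shape:

import jmespath

sample_blob = {"Blob": {"summary_risk_score": 55}}
value = None
for expr in ["Blob.summary_risk_score"]:       # first expression that matches wins
    value = jmespath.search(expr, sample_blob)
    if value is not None:
        break
# value == 55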
@@ -156,9 +225,9 @@ def __main__(
     TrueIpGeo: str,
     Blob: str,
     DeviceId: str,
-    FuzzyDeviceId: str
+    FuzzyDeviceId: str,
+    ReasonCode: str,
 ) -> dict:

     # Convert input parameters into a flat dictionary
     data = {
         "application_key": application_key,
@@ -184,49 +253,82 @@ def __main__(
         "TrueIpGeo": TrueIpGeo,
         "Blob": Blob,
         "DeviceId": DeviceId,
-        "FuzzyDeviceId": FuzzyDeviceId
+        "FuzzyDeviceId": FuzzyDeviceId,
+        "ReasonCode": ReasonCode,
     }

     # Convert dictionary to a single-row DataFrame
     combined_df = pd.DataFrame([data])
     combined_df.columns = combined_df.columns.str.lower()

+    # Uncomment Below For Testing using Uprova Batch Data
+    # combined_df["educationlevel"] = None
+    # combined_df["monthsatresidence"] = None
+    # combined_df["ownhome"] = False
+    # combined_df['lengthatbank'] = 0
+
     combined_df["application_email_address"] = combined_df["application_email_address"].str.lower()
     if Blob:
         combined_df["blob"] = combined_df["blob"].apply(sanitize_blob)

         # Step 2: Extract values using the expressions dictionary
         for column, expressions_list in expressions.items():
-            combined_df[column] = combined_df["blob"].apply(lambda x: coalesce(
-                *[extract_value(x, expr) for expr in expressions_list]))
+            def _extract_with_fallback(blob_obj):
+                values = []
+                for expr in expressions_list:
+                    val = extract_value(blob_obj, expr)
+                    if val is None and isinstance(expr, str) and expr.startswith("Blob."):
+                        val = extract_value(blob_obj, expr[len("Blob.") :])
+                    values.append(val)
+                return coalesce(*values)

-        logger.info("pre_flowx data")
-        logger.info(combined_df.iloc[0].drop('blob').to_dict())
+            extracted = combined_df["blob"].apply(_extract_with_fallback)
+            if column in combined_df.columns:
+                combined_df[column] = extracted.where(extracted.notnull(), combined_df[column])
+            else:
+                combined_df[column] = extracted
+
+        # logger.info("pre_flowx data")
+        # logger.info(combined_df.iloc[0].drop("blob").to_dict())
     else:
-        for column, expressions_list in expressions.items():
+        for column in expressions:
             combined_df[column] = None
-    logger.info("pre_flowx data")
-    logger.info(combined_df.iloc[0].to_dict())
-    pre_processed_data = pre_processing(combined_df)
-    # logger.info(f"pre_processed_data: {pre_processed_data}")
-    logger.info("pre_processed data")
-    logger.info(pre_processed_data.iloc[0].to_dict())
-    df = processing(pre_processed_data)
-    logger.info("processed_data")
-    logger.info(df.iloc[0].to_dict())
-    df["application_timestamp"] = df["application_timestamp"].astype(str)
-    # logger.info("prediction: %.8f", float(df['prediction'].iloc[0]))
-    result = post_processing(df)
-    logger.info("post_processed_data")
-    logger.info(result)
+    # logger.info("pre_flowx data")
+    # logger.info(combined_df.iloc[0].to_dict())
+    df_m1, df_m2, df_thx = pre_processing_all(combined_df)
+    # logger.info("pre_processed data m1")
+    # logger.info(df_m1.iloc[0].to_dict())
+    # logger.info("pre_processed data m2")
+    # logger.info(df_m2.iloc[0].to_dict())
+
+    processed_m1, processed_m2, df_thx = processing_all(df_m1, df_m2, df_thx)
+    # logger.info("processed_data m1")
+    # logger.info(processed_m1.iloc[0].to_dict())
+    # logger.info("processed_data m2")
+    # logger.info(processed_m2.iloc[0].to_dict())
+
+    result = post_processing_all(processed_m1, processed_m2, df_thx)
     # State Check
     state_value = combined_df["state"].iloc[0]
     zip_value = combined_df["zip"].iloc[0]
-    if (pd.notnull(state_value) and state_value == "ZZ") or (pd.notnull(zip_value) and zip_value == "86445"):
+    if (pd.notnull(state_value) and state_value == "ZZ") or (
+        pd.notnull(zip_value) and zip_value == "86445"
+    ):
         result["hd_score_m1"] = 1250
-        logger.info("post_processed_data after state check")
-        logger.info(result)
+        result["hd_score_m2"] = 1250
+        result["hd_score_iso_m2"] = 1250
+        # logger.info("post_processed_data after state check")
+        # logger.info(result)
+
+    # Normalize numeric scores to built-in float so JSON encoding (e.g. Temporal)
+    # does not fail on NumPy scalar types like np.float32/np.float64.
+    for key in ("hd_score_m1", "hd_score_m2", "hd_score_iso_m2"):
+        if key in result and result[key] is not None:
+            try:
+                result[key] = float(result[key])
+            except (TypeError, ValueError):
+                logger.warning("Failed to cast %s=%r to float", key, result[key])
+
+    print(result)

     return result

-# testing :
-# __main__
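A short sketch of why the float() normalization added at the end of __main__ matters before the result dictionary is JSON-encoded; the score value is illustrative, and np.float32 is assumed to be what the model hands back:

import json
import numpy as np

result = {"hd_score_m1": np.float32(742.0)}
# json.dumps(result) would raise: Object of type float32 is not JSON serializable
result = {k: float(v) for k, v in result.items()}
print(json.dumps(result))  # {"hd_score_m1": 742.0}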
category_orders_train_M1.json (new file, 88 lines)
@@ -0,0 +1,88 @@
{
    "employmentstatus": [
        "disability",
        "fixed income",
        "full time employed",
        "other",
        "part time employment",
        "retired benefits",
        "self employed",
        "student",
        "unemployed",
        "welfare"
    ],
    "TrueIpGeo": [
        "other",
        "us"
    ],
    "digital_id_trust_score_rating": [
        "high",
        "low",
        "neutral",
        "very_high",
        "very_low"
    ],
    "educationlevel": [
        "associate's degree",
        "bachelor's degree",
        "doctorate",
        "high school",
        "master's degree",
        "other"
    ],
    "os_version": [
        "10", "11", "12", "13", "14", "15", "16", "17", "18", "8", "9", "unknown"
    ],
    "ip_net_speed_cell": [
        "broadband", "cable", "dialup", "dsl", "fixed wireless", "mobile", "mobile wireless",
        "ocx", "satellite", "t1", "tx", "wireless", "xdsl"
    ],
    "day_night": [
        "Day",
        "Night"
    ],
    "digital_id_confidence_rating": [
        "high",
        "medium",
        "very_high",
        "very_low"
    ],
    "RiskRating": [
        "high",
        "low",
        "medium",
        "neutral",
        "trusted"
    ],
    "payfrequency": [
        "biweekly",
        "semimonthly"
    ],
    "ownhome": [
        "false",
        "true"
    ]
}
category_orders_train_M2.json (new file, 303 lines)
@@ -0,0 +1,303 @@
{
    "riskrating": [
        "high",
        "low",
        "medium",
        "neutral",
        "trusted"
    ],
    "input_ip_connection_type": [
        "cable",
        "consumer satellite",
        "dialup",
        "dsl",
        "fixed wireless",
        "framerelay",
        "isdn",
        "mobile wireless",
        "ocx",
        "tx"
    ],
    "fraudrisk": [
        "001 very low", "003 very low", "005 very low", "006 very low", "008 very low", "009 very low",
        "010 very low", "011 very low", "012 very low", "014 very low", "015 very low", "016 very low",
        "017 very low", "018 very low", "020 very low", "021 very low", "022 very low", "023 very low",
        "024 very low", "025 very low", "026 very low", "027 very low", "028 very low", "029 very low",
        "030 very low", "031 very low", "032 very low", "033 very low", "034 very low", "035 very low",
        "036 very low", "037 very low", "038 very low", "039 very low", "040 very low", "041 very low",
        "042 very low", "043 very low", "044 very low", "045 very low", "046 very low", "047 very low",
        "048 very low", "049 very low", "050 very low", "051 very low", "052 very low", "053 very low",
        "054 very low", "055 very low", "056 very low", "057 very low", "058 very low", "059 very low",
        "060 very low", "061 very low", "062 very low", "063 very low", "064 very low", "065 very low",
        "066 very low", "067 very low", "068 very low", "069 very low", "070 very low", "071 very low",
        "072 very low", "073 very low", "074 very low", "075 very low", "076 very low", "077 very low",
        "078 very low", "079 very low", "080 very low", "081 very low", "082 very low", "083 very low",
        "084 very low", "085 very low", "086 very low", "087 very low", "088 very low", "089 very low",
        "090 very low", "091 very low", "092 very low", "093 very low", "094 very low", "095 very low",
        "096 very low", "097 very low", "098 very low", "099 very low", "100 very low",
        "101 low", "102 low", "103 low", "104 low", "105 low", "106 low",
        "107 low", "108 low", "109 low", "110 low", "111 low", "112 low",
        "113 low", "114 low", "115 low", "116 low", "117 low", "118 low",
        "119 low", "120 low", "121 low", "122 low", "123 low", "124 low",
        "125 low", "126 low", "127 low", "128 low", "129 low", "130 low",
        "131 low", "132 low", "133 low", "134 low", "135 low", "136 low",
        "137 low", "138 low", "139 low", "140 low", "141 low", "142 low",
        "143 low", "144 low", "145 low", "146 low", "147 low", "148 low",
        "149 low", "153 low", "154 low", "156 low", "157 low", "158 low",
        "159 low", "160 low", "161 low", "162 low", "163 low", "164 low",
        "165 low", "166 low", "167 low", "168 low", "169 low", "170 low",
        "171 low", "172 low", "173 low", "174 low", "175 low", "177 low",
        "178 low", "179 low", "180 low", "181 low", "182 low", "183 low",
        "184 low", "185 low", "186 low", "187 low", "188 low", "189 low",
        "190 low", "191 low", "192 low", "193 low", "194 low", "195 low",
        "196 low", "197 low", "198 low", "199 low", "200 low", "201 low",
        "202 low", "203 low", "204 low", "205 low", "206 low", "207 low",
        "208 low", "209 low", "210 low", "211 low", "212 low", "213 low",
        "214 low", "215 low", "216 low", "217 low", "218 low", "219 low",
        "220 low", "221 low", "222 low", "224 low", "225 low", "226 low",
        "227 low", "228 low", "229 low", "230 low", "231 low", "232 low",
        "233 low", "234 low", "235 low", "236 low", "237 low", "238 low",
        "239 low", "240 low", "241 low", "242 low", "243 low", "244 low",
        "245 low", "246 low", "247 low", "248 low", "250 low", "252 low",
        "254 low", "259 low", "267 low", "268 low", "271 low", "272 low",
        "274 low", "275 low", "278 low", "282 low", "287 low", "288 low",
        "289 low", "290 low", "291 low", "293 low", "296 low", "297 low",
        "464 moderate", "467 moderate", "485 moderate", "491 moderate", "492 moderate", "496 moderate",
        "702 review", "703 review", "705 review", "706 review", "707 review", "708 review",
        "710 review", "730 review", "790 review",
        "801 high", "890 high",
        "902 very high", "906 very high"
    ],
    "day_night": [
        "day",
        "night"
    ]
}
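These category_orders_train_*.json files enumerate the category levels each model was trained on. A hedged sketch of how such a list is typically consumed during pre-processing; the loading path and the Categorical wiring here are illustrative, not taken from this commit:

import json
import pandas as pd

with open("category_orders_train_M1.json") as f:
    category_orders = json.load(f)

df = pd.DataFrame({"day_night": ["Day", "Night", "Dawn"]})
# Values outside the trained level list drop to NaN, mirroring _handle_unknowns.
df["day_night"] = pd.Categorical(df["day_night"], categories=category_orders["day_night"])
print(df["day_night"].tolist())  # ['Day', 'Night', nan]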
isotonic_model_M2.joblib (new binary file; binary file not shown)
latitute_longitute_reference.csv (new file, 28079 lines; file diff suppressed because it is too large)
post_processing.py
@@ -1,5 +1,10 @@
 import logging
+from typing import Dict
+
 import numpy as np
+import pandas as pd
+
+from pre_processing import THX_FIELDS

 # Configure logging
 logging.basicConfig(
@@ -9,17 +14,85 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)


-def post_processing(df):
+def post_processing_m1(df: pd.DataFrame) -> pd.DataFrame:
+    df = df.copy()
     try:
-        df['hd_score_m1'] = np.round(
-            np.minimum(df['prediction'] * 100 + 0.00001, 1) * 85 +
-            np.maximum(np.log2(df['prediction'] * 100 + 0.000001) * 185, 0),
-            0
+        df["hd_score_m1"] = np.round(
+            np.minimum(df["prediction"] * 100 + 0.00001, 1) * 85
+            + np.maximum(np.log2(df["prediction"] * 100 + 0.000001) * 185, 0),
+            0,
         )
-        logging.info(f"hd_score_m1 calculated: {df['hd_score_m1'].iloc[0]}")
+        logging.info("hd_score_m1 calculated: %s", df["hd_score_m1"].iloc[0])
     except Exception as e:
-        logging.error(f"Error processing hd_score_m1 calculations: {e}")
+        logging.error("Error processing hd_score_m1 calculations: %s", e)
+    return df


-    return df[['application_key', 'application_timestamp', 'deviceid', 'fuzzydeviceid', 'application_email_address', 'hd_score_m1']].iloc[0].to_dict()
+def post_processing_m2(df: pd.DataFrame) -> pd.DataFrame:
+    df = df.copy()
+    try:
+        df["hd_score_m2"] = np.round(
+            np.minimum(df["pd_m2"] * 100.0 + 0.00001, 1.0) * 75.0
+            + np.maximum(np.log2(df["pd_m2"] * 100.0 + 0.000001) * 180.0, 0.0),
+            0,
+        )
+        df["hd_score_iso_m2"] = np.round(
+            np.minimum(df["pd_m2_iso"] * 100.0 + 0.00001, 1.0) * 97.0
+            + np.maximum(np.log2(df["pd_m2_iso"] * 100.0 + 0.000001) * 246.0, 0.0),
+            0,
+        )
+        logging.info("hd_score_m2 calculated: %s", df["hd_score_m2"].iloc[0])
+        logging.info("hd_score_iso_m2 calculated: %s", df["hd_score_iso_m2"].iloc[0])
+    except Exception as e:
+        logging.error("Error processing hd_score_m2 calculations: %s", e)
+    return df
+
+
+def _safe_get(df: pd.DataFrame, column: str):
+    """Return scalar from single-row DataFrame, normalizing NaN/None to None."""
+    if column not in df.columns:
+        return None
+    val = df[column].iloc[0]
+    if isinstance(val, (list, dict)):
+        return val
+    try:
+        if pd.isna(val):
+            return None
+    except TypeError:
+        pass
+    return val
+
+
+def post_processing_all(df_m1: pd.DataFrame, df_m2: pd.DataFrame, df_thx: pd.DataFrame) -> Dict[str, object]:
+    df_m1_scored = post_processing_m1(df_m1)
+    df_m2_scored = post_processing_m2(df_m2)
+    row_m1 = df_m1_scored.iloc[0]
+    row_m2 = df_m2_scored.iloc[0]
+    result = {
+        "application_key": row_m1.get("application_key"),
+        "application_timestamp": str(row_m1.get("application_timestamp")) if row_m1.get("application_timestamp") is not None else None,
+        "deviceid": row_m1.get("deviceid"),
+        "fuzzydeviceid": row_m1.get("fuzzydeviceid"),
+        "application_email_address": row_m1.get("application_email_address"),
+        "hd_score_m1": row_m1.get("hd_score_m1"),
+        "hd_score_m2": row_m2.get("hd_score_m2"),
+        "hd_score_iso_m2": row_m2.get("hd_score_iso_m2"),
+        "action": None,
+    }
+    flattened_thx = {field: _safe_get(df_thx, field) for field in THX_FIELDS if field not in result}
+    result.update(flattened_thx)
+    return result
+
+
+# Legacy entry point for backward compatibility
+def post_processing(df: pd.DataFrame) -> Dict[str, object]:
+    df_scored = post_processing_m1(df)
+    row = df_scored.iloc[0]
+    return {
+        "application_key": row.get("application_key"),
+        "application_timestamp": str(row.get("application_timestamp")) if row.get("application_timestamp") is not None else None,
+        "deviceid": row.get("deviceid"),
+        "fuzzydeviceid": row.get("fuzzydeviceid"),
+        "application_email_address": row.get("application_email_address"),
+        "hd_score_m1": row.get("hd_score_m1"),
+    }
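To make the score formulas above concrete, here is the hd_score_m1 expression evaluated for one illustrative prediction value (0.02):

import numpy as np

prediction = 0.02  # illustrative model output, not from the commit
score = np.round(
    np.minimum(prediction * 100 + 0.00001, 1) * 85
    + np.maximum(np.log2(prediction * 100 + 0.000001) * 185, 0),
    0,
)
print(score)  # 270.0: the capped linear term contributes 85, the log2 term about 185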
pre_processing.py
@@ -1,6 +1,11 @@
-import pandas as pd
-import numpy as np
 import logging
+import math
+import re
+from pathlib import Path
+from typing import Dict, Iterable, List, Tuple, Union
+
+import numpy as np
+import pandas as pd

 # Configure logging
 logging.basicConfig(
@@ -9,67 +14,230 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)

+BASE_DIR = Path(__file__).resolve().parent
+M2_LATLONG_REF_PATH = BASE_DIR / "latitute_longitute_reference.csv"
+THX_FIELDS = [
+    "application_key",
+    "application_timestamp",
+    "digital_id_first_seen",
+    "summary_risk_score",
+    "cpu_clock",
+    "account_login_first_seen",
+    "account_telephone_first_seen",
+    "true_ip_first_seen",
+    "ssn_hash_first_seen",
+    "account_email_attributes",
+    "tps_ip_latitude",
+    "tps_ip_longitude",
+]
+
-def pre_processing(data_df):
+# Hardcoded M2 data dictionary (replaces file lookup)
+M2_DATA_DICTIONARY: Dict[str, Dict[str, Union[float, str, None]]] = {
+    "account_email_attributes_challenge_passed": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "account_email_attributes_challenged": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "account_email_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "account_login_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "account_name_activities_trusted_prob": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "account_telephone_attributes_challenge_failed": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "account_telephone_attributes_loan_app": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "account_telephone_attributes_trusted": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "account_telephone_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "accphone_gbl_velocity_hour": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "applicant_age": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "browser_spoof_score": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "day": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "day_cos": {"data_type": "float", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "day_sin": {"data_type": "float", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "di_autofill_count_login": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "digital_id_attributes_challenged": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "digital_id_attributes_trusted": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "digital_id_day_diff": {"data_type": "int", "valid_min": -999999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "digital_id_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "digital_id_month_diff": {"data_type": "int", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "digital_id_trust_score": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "digitalidconfidence": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "dist_dnsip_ref_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "dist_em_ip_ref_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "dist_inputip_ref_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "dist_proxyip_ref_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "dist_trueip_dnsip_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "dist_trueip_em_ip_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "dist_trueip_ref_km": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "domain_creation_days": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "emailtofullnameconfidence": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "emailtolastnameconfidence": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "first_seen_days": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "fraudrisk": {"data_type": "string", "valid_min": None, "valid_max": None, "observed_cap_min": None, "observed_cap_max": None},
+    "fuzzy_device_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "fuzzy_device_id_confidence": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "hour": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "input_ip_connection_type": {"data_type": "string", "valid_min": None, "valid_max": None, "observed_cap_min": None, "observed_cap_max": None},
+    "iptophoneconfidence": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "national_id_first_seen_age": {"data_type": "int", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "overalldigitalidentityscore": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "policy_score": {"data_type": "float", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "proxy_ip_first_seen_age": {"data_type": "int", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "proxy_ip_worst_score": {"data_type": "float", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "proxy_score": {"data_type": "float", "valid_min": 0.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "reasoncode_new_smartid_lt_1wk_global": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "riskrating": {"data_type": "string", "valid_min": None, "valid_max": None, "observed_cap_min": None, "observed_cap_max": None},
+    "ssn_hash_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "totalhits": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "true_ip_activities_trusted_prob": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "true_ip_attributes_trusted": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "true_ip_attributes_trusted_conf": {"data_type": "int", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "true_ip_first_seen_age": {"data_type": "int", "valid_min": -999.0, "valid_max": 999999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "true_ip_worst_score": {"data_type": "float", "valid_min": -999.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+    "uniquehits": {"data_type": "float", "valid_min": 0.0, "valid_max": 999.0, "observed_cap_min": None, "observed_cap_max": None},
+}

-    # combined_df = pd.DataFrame([input_data])
-    # data = pd.DataFrame(data)
-    combined_df = data_df
-    combined_df["applicant_age"] = combined_df.apply(lambda row: pd.to_datetime(row["application_timestamp"]).year - pd.to_datetime(row["application_date_of_birth"]).year if pd.notnull(row["application_timestamp"]) and pd.notnull(row["application_date_of_birth"]) else None,axis=1
+# Hardcoded one-hot config (parsed_feature, model_var, contains)
+M2_ONEHOT_CONFIG: List[Tuple[str, str, str]] = [
+    ("reasoncode", "reasoncode_new_smartid_lt_1wk_global", "new_smartid_lt_1wk_global"),
+    ("account_name_activities", "account_name_activities_trusted_prob", "trusted_prob"),
+    ("account_email_attributes", "account_email_attributes_challenged", "challenged"),
+    ("account_email_attributes", "account_email_attributes_challenge_passed", "challenge_passed"),
+    ("true_ip_attributes", "true_ip_attributes_trusted", "trusted"),
+    ("true_ip_attributes", "true_ip_attributes_trusted_conf", "trusted_conf"),
+    ("digital_id_attributes", "digital_id_attributes_challenged", "challenged"),
+    ("digital_id_attributes", "digital_id_attributes_trusted", "trusted"),
+    ("account_telephone_attributes", "account_telephone_attributes_challenge_failed", "challenge_failed"),
+    ("account_telephone_attributes", "account_telephone_attributes_loan_app", "loan_app"),
+    ("account_telephone_attributes", "account_telephone_attributes_trusted", "trusted"),
+    ("true_ip_activities", "true_ip_activities_trusted_prob", "trusted_prob"),
+]
+
+
+# ----------------------------
+# Helpers
+# ----------------------------
+def _handle_unknowns(X: pd.DataFrame, column: str, known_values: Iterable[str], default_treatment=None):
+    if column not in X.columns:
+        return X
+    known_values = {str(val).lower() for val in known_values}
+    invalid_values = {None, "none", "nan", pd.NA}
+    X[column] = X[column].apply(
+        lambda x: str(x).lower()
+        if pd.notna(x) and str(x).lower() in known_values
+        else (default_treatment if pd.notna(x) and str(x).lower() not in invalid_values else np.nan)
+    )
+    return X
+
+
+def _haversine_km(lat1, lon1, lat2, lon2):
+    if None in (lat1, lon1, lat2, lon2):
+        return None
+    try:
+        rlat1 = float(lat1) * math.pi / 180.0
+        rlat2 = float(lat2) * math.pi / 180.0
+        dlat = (float(lat2) - float(lat1)) * math.pi / 180.0
+        dlon = (float(lon2) - float(lon1)) * math.pi / 180.0
+    except Exception:
+        return None
+
+    a = (
+        math.sin(dlat / 2.0) ** 2
+        + math.cos(rlat1) * math.cos(rlat2) * math.sin(dlon / 2.0) ** 2
+    )
+    a = min(1.0, max(0.0, a))
+    return 2 * 6371.0088 * math.asin(math.sqrt(a))
+
+
+def _prep_latlong_ref():
+    if not M2_LATLONG_REF_PATH.exists():
+        logger.warning("latitute_longitute_reference.csv missing at %s", M2_LATLONG_REF_PATH)
+        return pd.DataFrame()
+    try:
+        ref = pd.read_csv(M2_LATLONG_REF_PATH, usecols=["postal_code_ref", "latitute_ref", "longitude_ref"])
+    except Exception:
+        ref = pd.read_csv(M2_LATLONG_REF_PATH)
+    # keep lower string version for matching
+    if "postal_code_ref" in ref.columns:
+        ref["postal_code_ref"] = ref["postal_code_ref"].astype(str).str.lower()
+    return ref
+
+
+def _normalize_zip_for_ref(zip_val):
+    """
+    Normalize zip/postal code values so they match reference CSV keys.
+
+    - Floats like 89503.0 -> "89503"
+    - Int-like strings "89503.0" -> "89503"
+    Note: we intentionally avoid zero-filling to preserve behaviour seen in UAT references
+    where leading-zero ZIPs are not matched to the reference table.
+    """
+    if pd.isna(zip_val):
+        return None
+    if isinstance(zip_val, (int, float)) and not isinstance(zip_val, bool):
+        return str(int(zip_val)).lower()
+    zip_str = str(zip_val).strip()
+    if zip_str.replace(".", "", 1).isdigit():
+        try:
+            return str(int(float(zip_str))).lower()
+        except Exception:
+            pass
+    return zip_str.lower() if zip_str else None
+
+
+# ----------------------------
+# M1 Pre-processing (existing behaviour)
+# ----------------------------
+def pre_processing_m1(data_df: pd.DataFrame) -> pd.DataFrame:
+    combined_df = data_df.copy()
+    combined_df["applicant_age"] = combined_df.apply(
+        lambda row: pd.to_datetime(row["application_timestamp"]).year - pd.to_datetime(row["application_date_of_birth"]).year
+        if pd.notnull(row["application_timestamp"]) and pd.notnull(row["application_date_of_birth"])
+        else None,
+        axis=1,
     )

-    # Extracting Temporal features
-    combined_df['application_timestamp'] = pd.to_datetime(combined_df["application_timestamp"])
-    combined_df.loc[:, 'application_time'] = pd.to_datetime(combined_df['application_timestamp']).dt.time
+    combined_df["application_timestamp"] = pd.to_datetime(combined_df["application_timestamp"])
+    combined_df.loc[:, "application_time"] = pd.to_datetime(combined_df["application_timestamp"]).dt.time

-    combined_df['day'] = combined_df['application_timestamp'].dt.day
-    combined_df['day_of_week'] = combined_df['application_timestamp'].dt.weekday # 0=Monday, 6=Sunday
+    combined_df["day"] = combined_df["application_timestamp"].dt.day
+    combined_df["day_of_week"] = combined_df["application_timestamp"].dt.weekday

-    combined_df['day_sin'] = np.sin(2 * np.pi * combined_df['day'] / 31)
-    combined_df['day_cos'] = np.cos(2 * np.pi * combined_df['day'] / 31)
-    combined_df['day_of_week_sin'] = np.sin(2 * np.pi * combined_df['day_of_week'] / 7)
-    combined_df['day_of_week_cos'] = np.cos(2 * np.pi * combined_df['day_of_week'] / 7)
+    combined_df["day_sin"] = np.sin(2 * np.pi * combined_df["day"] / 31)
+    combined_df["day_cos"] = np.cos(2 * np.pi * combined_df["day"] / 31)
+    combined_df["day_of_week_sin"] = np.sin(2 * np.pi * combined_df["day_of_week"] / 7)
+    combined_df["day_of_week_cos"] = np.cos(2 * np.pi * combined_df["day_of_week"] / 7)

-    # combined_df['is_weekend'] = combined_df['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)

-    # Create a day/night variable
     def classify_day_night(hour):
         if 6 <= hour < 18:
-            return 'Day'
-        else:
-            return 'Night'
+            return "Day"
+        return "Night"

-    # Extract hour from application_time
-    combined_df['hour'] = combined_df['application_time'].apply(lambda x: x.hour if pd.notnull(x) else np.nan)
-    combined_df['day_night'] = combined_df['hour'].apply(lambda hour: classify_day_night(hour) if pd.notnull(hour) else 'Unknown')
+    combined_df["hour"] = combined_df["application_time"].apply(lambda x: x.hour if pd.notnull(x) else np.nan)
+    combined_df["day_night"] = combined_df["hour"].apply(lambda hour: classify_day_night(hour) if pd.notnull(hour) else "Unknown")

-    # combined_df['os_version'] = combined_df['os_version'].str.replace(r'[^a-zA-Z0-9]', '_', regex=True)
-    combined_df['os_version'] = combined_df['os_version'].apply(lambda x: x.split('.')[0] if isinstance(x, str) and '.' in x
-                                                                else x.split('_')[0] if isinstance(x, str) and '_' in x
-                                                                else x)
+    combined_df["os_version"] = combined_df["os_version"].apply(
+        lambda x: x.split(".")[0] if isinstance(x, str) and "." in x else x.split("_")[0] if isinstance(x, str) and "_" in x else x
+    )

-    # Datatype conversions
-    # combined_df['Level_1_Link_Accept'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Level_1_Link_Accept', na=False, regex=True).astype(int)
-    combined_df['Identity_Negative_History'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Identity_Negative_History', na=False, regex=True).astype(int)
-    combined_df['Device_Negative_History'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Device_Negative_History', na=False, regex=True).astype(int)
-    combined_df['Level_1_Link_Reject'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Level_1_Link_Reject', na=False, regex=True).astype(int)
-    combined_df['IP_Negative_History'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('IP_Negative_History', na=False, regex=True).astype(int)
-    combined_df['Identity_Spoofing'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Identity_Spoofing', na=False, regex=True).astype(int)
-    # combined_df['Bot'] = combined_df['tmxsummaryreasoncode'].astype(str).str.contains('Bot', na=False, regex=True).astype(int)
+    combined_df["Identity_Negative_History"] = combined_df["tmxsummaryreasoncode"].astype(str).str.contains(
+        "Identity_Negative_History", na=False, regex=True
+    ).astype(int)
+    combined_df["Device_Negative_History"] = combined_df["tmxsummaryreasoncode"].astype(str).str.contains(
+        "Device_Negative_History", na=False, regex=True
+    ).astype(int)
+    combined_df["Level_1_Link_Reject"] = combined_df["tmxsummaryreasoncode"].astype(str).str.contains(
+        "Level_1_Link_Reject", na=False, regex=True
+    ).astype(int)
+    combined_df["IP_Negative_History"] = combined_df["tmxsummaryreasoncode"].astype(str).str.contains(
+        "IP_Negative_History", na=False, regex=True
+    ).astype(int)
+    combined_df["Identity_Spoofing"] = combined_df["tmxsummaryreasoncode"].astype(str).str.contains(
+        "Identity_Spoofing", na=False, regex=True
+    ).astype(int)

-    combined_df['digitalidconfidence'] = pd.to_numeric(combined_df['digitalidconfidence'], errors='coerce').astype('Int64')
-    # Rename Columns if Required
-    combined_df.rename(columns={
-        'DigitalIdConfidence': 'digitalidconfidence',
-        # 'inputipaddress_consistency': 'inputip_consistency',
-        # 'requestid_consistency': 'request_consistency',
-        # Add others as required if present in your DataFrame and needing renaming.
-    }, inplace=True)
-    # #Testing : remove below
-    # combined_df.to_csv('op-pre-processing_intermediate.csv', index=False)
+    combined_df["digitalidconfidence"] = pd.to_numeric(combined_df["digitalidconfidence"], errors="coerce").astype("Int64")
+    combined_df.rename(
+        columns={
+            "DigitalIdConfidence": "digitalidconfidence",
+        },
+        inplace=True,
+    )

     dtype_dict = {
         "applicant_age": int,
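A quick sketch of how the new _haversine_km and _normalize_zip_for_ref helpers combine for the distance features; the coordinates and ZIP codes below are invented:

# Great-circle distance in km between two lat/long pairs.
km = _haversine_km(36.17, -115.14, 34.05, -118.24)  # roughly 367 km

# ZIPs arriving as floats or "89503.0"-style strings collapse to "89503"
# before the join against postal_code_ref in the reference CSV.
assert _normalize_zip_for_ref(89503.0) == "89503"
assert _normalize_zip_for_ref("89503.0") == "89503"
assert _normalize_zip_for_ref(None) is None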
@@ -84,7 +252,7 @@ def pre_processing(data_df):
         "day_cos": float,
         "summary_risk_score": float,
         "digital_id_trust_score_rating": str,
-        "day" : 'int32',
+        "day": "int32",
         "lengthatbank": float,
         "day_of_week_cos": float,
         "Level_1_Link_Reject": int,
@@ -95,7 +263,7 @@ def pre_processing(data_df):
         "true_ip_score": float,
         "ip_net_speed_cell": str,
         "account_email_score": float,
-        "day_of_week" : 'int32',
+        "day_of_week": "int32",
         "true_ip_worst_score": float,
         "proxy_ip_worst_score": float,
         "day_night": str,
@@ -108,28 +276,26 @@ def pre_processing(data_df):
         "true_ip_state_confidence": float,
         "IP_Negative_History": int,
         "fuzzy_device_worst_score": float,
-        "digital_id_confidence_rating" : str,
         "day_of_week_sin": float,
         "riskrating": str,
         "payfrequency": str,
         "ownhome": str,
-        "Identity_Spoofing" : int
+        "Identity_Spoofing": int,
     }

-    next_block_cols = ['application_key', 'application_timestamp', 'deviceid', 'fuzzydeviceid', 'application_email_address']
+    next_block_cols = ["application_key", "application_timestamp", "deviceid", "fuzzydeviceid", "application_email_address"]
     cols_to_keep = [col for col in dtype_dict.keys() if col in combined_df.columns]

     final_cols = list(set(next_block_cols).union(set(cols_to_keep)))
-    # Type casting
     for col, dtype in dtype_dict.items():
         if col in combined_df.columns:
             if dtype == int:
-                combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce', downcast='integer')
+                combined_df[col] = pd.to_numeric(combined_df[col], errors="coerce", downcast="integer")
             elif dtype == float:
-                combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce', downcast='float')
+                combined_df[col] = pd.to_numeric(combined_df[col], errors="coerce", downcast="float")
             elif dtype == str:
                 combined_df[col] = combined_df[col].astype(str)
-    # cross check data type
+
     capping_dict = {
         "applicant_age": (18, 93),
         "digitalidconfidence": (0, 9017),
@@ -157,98 +323,254 @@ def pre_processing(data_df):
    "fuzzy_device_score": (-29, 14),
    "day_sin": (-0.9987165072, 0.9987165072),
    "ip_region_confidence": (75, 99),
    # "true_ip_state_confidence": (5, 98),
    "IP_Negative_History": (0, 1),
    "fuzzy_device_worst_score": (-100, 0),
    "day_of_week_sin": (-0.9749279122, 0.9749279122),
    "Identity_Spoofing": (0, 1),
}

# Apply capping
for column, (cap_min, cap_max) in capping_dict.items():
    if column in combined_df.columns:
        combined_df[column] = combined_df[column].clip(lower=cap_min, upper=cap_max)
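clip() simply saturates out-of-range values at the configured bounds and leaves missing values untouched; a quick standalone check with made-up numbers:

    import pandas as pd

    toy = pd.DataFrame({"applicant_age": [15.0, 42.0, 120.0, None]})
    toy["applicant_age"] = toy["applicant_age"].clip(lower=18, upper=93)
    # -> 18.0, 42.0, 93.0, NaN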
def _handle_unknowns(X, column, known_values, default_treatment=None):
    if column not in X.columns:
        return X  # Return X to avoid NoneType error
    known_values = {str(val).lower() for val in known_values}
    invalid_values = {None, "none", "nan", pd.NA}
    X[column] = X[column].apply(
        lambda x: str(x).lower() if pd.notna(x) and str(x).lower() in known_values
        else (default_treatment if pd.notna(x) and str(x).lower() not in invalid_values else np.nan)
    )
    return X  # Always return the DataFrame
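A toy illustration of the rule implemented above (hypothetical values):

    toy = pd.DataFrame({"riskrating": ["LOW", "weird-value", None]})
    toy = _handle_unknowns(toy, "riskrating", ["low", "medium", "neutral", "trusted"], default_treatment="high")
    # -> "low" (known value, lower-cased), "high" (unexpected but non-null maps to the default), NaN (null-like stays missing)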
unknown_treatments = {
    "employmentstatus": {
        "valid_values": [
            "disability",
            "fixed income",
            "full time employed",
            "part time employment",
            "retired benefits",
            "self employed",
            "student",
            "unemployed",
            "welfare",
        ],
        "default_treatment": "other",
    },
    "trueipgeo": {"valid_values": ["US"], "default_treatment": "other"},
    "digital_id_trust_score_rating": {"valid_values": ["very_high", "high", "neutral", "low"], "default_treatment": "very_low"},
    "educationlevel": {
        "valid_values": ["associate's degree", "bachelor's degree", "doctorate", "high school", "master's degree"],
        "default_treatment": "other",
    },
    "os_version": {
        "valid_values": ["18", "17", "16", "15", "14", "13", "12", "11", "10", "9", "8"],
        "default_treatment": "unknown",
    },
    "ip_net_speed_cell": {
        "valid_values": [
            "broadband",
            "cable",
            "dialup",
            "dsl",
            "fixed wireless",
            "mobile",
            "mobile wireless",
            "ocx",
            "satellite",
            "t1",
            "tx",
            "wireless",
            "xdsl",
        ],
        "default_treatment": "mobile",
    },
    "digital_id_confidence_rating": {"valid_values": ["high", "medium", "very_high"], "default_treatment": "very_low"},
    "riskrating": {"valid_values": ["low", "medium", "neutral", "trusted"], "default_treatment": "high"},
    "ownhome": {"valid_values": ["true", "false"], "default_treatment": np.nan},
}

for column, treatment in unknown_treatments.items():
    combined_df = _handle_unknowns(combined_df, column, treatment["valid_values"], treatment["default_treatment"])
payfrequency_map = {"biweekly": ["biweekly", "bi-weekly", "bi weekly", "bw"], "semimonthly": ["semi-monthly", "semimonthly"]}

combined_df["payfrequency"] = combined_df["payfrequency"].apply(
    lambda x: next((key for key, values in payfrequency_map.items() if str(x).lower() in values), np.nan)
)

return combined_df[final_cols]
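The pay-frequency normalisation collapses the aliases above onto two canonical labels and sends everything else to NaN; for example (hypothetical inputs):

    pd.Series(["Bi-Weekly", "bw", "weekly"]).apply(
        lambda x: next((key for key, values in payfrequency_map.items() if str(x).lower() in values), np.nan)
    )
    # -> "biweekly", "biweekly", NaN ("weekly" is not in either alias list)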
# ----------------------------
# M2 Pre-processing
# ----------------------------
def _apply_onehot_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    for parsed_feature, model_var, contains_val in M2_ONEHOT_CONFIG:
        value = df.get(parsed_feature, pd.Series([None])).iloc[0]
        flag = 0
        if isinstance(value, list):
            flag = int(any(contains_val in str(v).lower() or re.sub(r"[^a-z0-9]+", " ", contains_val) in re.sub(r"[^a-z0-9]+", " ", str(v).lower()) for v in value))
        elif isinstance(value, str):
            val_norm = re.sub(r"[^a-z0-9]+", " ", value.lower())
            contains_norm = re.sub(r"[^a-z0-9]+", " ", contains_val)
            flag = int(contains_val in value.lower() or contains_norm in val_norm)
        df[model_var] = flag
    return df
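M2_ONEHOT_CONFIG itself is not part of this hunk; assuming it holds (parsed_feature, model_var, contains_val) triples as the loop unpacks, each flag is a case-insensitive substring match that also compares punctuation-normalised forms:

    import regex as re

    val_norm = re.sub(r"[^a-z0-9]+", " ", "Disposable-Domain!".lower())  # "disposable domain "
    contains_norm = re.sub(r"[^a-z0-9]+", " ", "disposable")             # "disposable"
    contains_norm in val_norm                                            # True, so the flag is set to 1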
def _extract_first_seen_days(ts_value, app_ts):
    ts = pd.to_datetime(ts_value, errors="coerce", utc=True)
    app = pd.to_datetime(app_ts, errors="coerce", utc=True)
    # align to naive for subtraction
    if isinstance(ts, pd.Timestamp) and ts.tzinfo is not None:
        ts = ts.tz_localize(None)
    if isinstance(app, pd.Timestamp) and app.tzinfo is not None:
        app = app.tz_localize(None)
    if pd.isna(ts) or pd.isna(app):
        return None
    return (app.normalize() - ts.normalize()).days
def _to_naive_ts(val):
    ts = pd.to_datetime(val, errors="coerce", utc=True)
    if isinstance(ts, pd.Timestamp) and ts.tzinfo is not None:
        ts = ts.tz_localize(None)
    return ts
def _month_diff(earlier, later):
    """Month difference (earlier - later) using year/month buckets."""
    ts_earlier = _to_naive_ts(earlier)
    ts_later = _to_naive_ts(later)
    if pd.isna(ts_earlier) or pd.isna(ts_later):
        return None
    return (ts_earlier.year - ts_later.year) * 12 + (ts_earlier.month - ts_later.month)
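Both helpers coerce to UTC, drop the timezone, and then take calendar differences; a worked example with hypothetical timestamps:

    _extract_first_seen_days("2024-01-15T10:30:00Z", "2024-03-01T02:00:00Z")  # 46 whole days (application date minus first-seen date)
    _month_diff("2024-01-15", "2024-03-01")                                   # -2 (year/month buckets only, days ignored)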
def pre_processing_m2(data_df: pd.DataFrame) -> pd.DataFrame:
    df = data_df.copy()
    df.columns = df.columns.str.lower()

    # Timestamp-derived features (align with M1 behaviour to keep probabilities consistent)
    df["application_timestamp"] = pd.to_datetime(df["application_timestamp"], errors="coerce", utc=True)
    df["day"] = df["application_timestamp"].dt.day
    df["hour"] = df["application_timestamp"].dt.hour
    df["day_sin"] = np.sin(2 * np.pi * df["day"] / 31)
    df["day_cos"] = np.cos(2 * np.pi * df["day"] / 31)

    def _classify_day_night(hour_val):
        if pd.isna(hour_val):
            return np.nan
        return "day" if 6 <= hour_val < 18 else "night"

    df["day_night"] = df["hour"].apply(_classify_day_night)

    # Apply onehot flags from attributes
    df = _apply_onehot_features(df)

    # Distances
    lat_ref = _prep_latlong_ref()
    if not lat_ref.empty and "zip" in df.columns:
        zip_value = df["zip"].iloc[0]
        zip_lookup = _normalize_zip_for_ref(zip_value)
        ref_row = lat_ref[lat_ref["postal_code_ref"] == zip_lookup] if zip_lookup else pd.DataFrame()
        lat_ref_val = ref_row["latitute_ref"].iloc[0] if not ref_row.empty else None
        lon_ref_val = ref_row["longitude_ref"].iloc[0] if not ref_row.empty else None
    else:
        lat_ref_val = None
        lon_ref_val = None
    df["dist_inputip_ref_km"] = df.apply(
        lambda r: _haversine_km(lat_ref_val, lon_ref_val, r.get("input_ip_latitude"), r.get("input_ip_longitude")), axis=1
    )
    df["dist_em_ip_ref_km"] = df.apply(
        lambda r: _haversine_km(lat_ref_val, lon_ref_val, r.get("ip_latitude") or r.get("tps_ip_latitude"), r.get("ip_longitude") or r.get("tps_ip_longitude")),
        axis=1,
    )
    df["dist_proxyip_ref_km"] = df.apply(
        lambda r: _haversine_km(lat_ref_val, lon_ref_val, r.get("proxy_ip_latitude"), r.get("proxy_ip_longitude")), axis=1
    )
    df["dist_dnsip_ref_km"] = df.apply(
        lambda r: _haversine_km(lat_ref_val, lon_ref_val, r.get("dns_ip_latitude"), r.get("dns_ip_longitude")), axis=1
    )
    df["dist_trueip_ref_km"] = df.apply(
        lambda r: _haversine_km(lat_ref_val, lon_ref_val, r.get("true_ip_latitude"), r.get("true_ip_longitude")), axis=1
    )
    df["dist_trueip_em_ip_km"] = df.apply(
        lambda r: _haversine_km(r.get("true_ip_latitude"), r.get("true_ip_longitude"), r.get("ip_latitude") or r.get("tps_ip_latitude"), r.get("ip_longitude") or r.get("tps_ip_longitude")),
        axis=1,
    )
    df["dist_trueip_dnsip_km"] = df.apply(
        lambda r: _haversine_km(r.get("true_ip_latitude"), r.get("true_ip_longitude"), r.get("dns_ip_latitude"), r.get("dns_ip_longitude")),
        axis=1,
    )
    # Ages
    app_ts_val = df.get("application_timestamp", pd.Series([None])).iloc[0]

    def _safe_day_diff(row):
        if not row.get("digital_id_first_seen"):
            return None
        val = _extract_first_seen_days(row.get("digital_id_first_seen"), app_ts_val)
        return -val if val is not None else None

    df["digital_id_day_diff"] = df.apply(_safe_day_diff, axis=1)
    df["digital_id_month_diff"] = df.apply(lambda r: _month_diff(r.get("digital_id_first_seen"), app_ts_val), axis=1)

    for col_name in [
        "digital_id_first_seen",
        "account_email_first_seen",
        "account_login_first_seen",
        "account_telephone_first_seen",
        "true_ip_first_seen",
        "ssn_hash_first_seen",
        "fuzzy_device_first_seen",
        "national_id_first_seen",
        "proxy_ip_first_seen",
    ]:
        out_col = f"{col_name}_age"
        df[out_col] = df.apply(lambda r: _extract_first_seen_days(r.get(col_name), app_ts_val), axis=1)
    # applicant_age for consistency if not present
    if "applicant_age" not in df.columns:
        df["applicant_age"] = df.apply(
            lambda row: pd.to_datetime(row["application_timestamp"]).year - pd.to_datetime(row["application_date_of_birth"]).year
            if pd.notnull(row.get("application_timestamp")) and pd.notnull(row.get("application_date_of_birth"))
            else None,
            axis=1,
        )
    # Safe casting and capping using data dictionary
    for var_name, rules in M2_DATA_DICTIONARY.items():
        if var_name not in df.columns:
            continue
        col = pd.to_numeric(df[var_name], errors="coerce") if rules.get("data_type") in ["float", "int"] else df[var_name]
        if rules.get("data_type") == "int":
            col = col.astype("float")
        valid_min = rules.get("valid_min")
        valid_max = rules.get("valid_max")
        observed_min = rules.get("observed_cap_min")
        observed_max = rules.get("observed_cap_max")
        if observed_min is not None or observed_max is not None:
            col = col.clip(lower=observed_min, upper=observed_max)
        # if valid_min is not None:
        #     col = col.where(col >= valid_min, np.nan)
        # if valid_max is not None:
        #     col = col.where(col <= valid_max, np.nan)
        df[var_name] = col

    return df
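_prep_latlong_ref, _normalize_zip_for_ref and _haversine_km are defined outside this hunk. For orientation only, a minimal sketch of what a great-circle helper with that call signature is assumed to compute (not necessarily this module's exact implementation):

    import math

    def _haversine_km_sketch(lat1, lon1, lat2, lon2):
        try:
            lat1, lon1, lat2, lon2 = (float(v) for v in (lat1, lon1, lat2, lon2))
        except (TypeError, ValueError):
            return None  # any missing or non-numeric coordinate yields no distance
        r = 6371.0  # mean Earth radius in km
        p1, p2 = math.radians(lat1), math.radians(lat2)
        dphi = math.radians(lat2 - lat1)
        dlmb = math.radians(lon2 - lon1)
        a = math.sin(dphi / 2) ** 2 + math.cos(p1) * math.cos(p2) * math.sin(dlmb / 2) ** 2
        return 2 * r * math.asin(math.sqrt(a))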
def pre_processing_all(data_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    # Ensure requested THX fields exist so downstream packaging always has keys
    df_base = data_df.copy()

    for field in THX_FIELDS:
        if field in df_base.columns:
            df_base[field] = df_base[field].astype(str)
        else:
            df_base[field] = None
    df_thx = df_base[THX_FIELDS].copy()

    df_m1 = pre_processing_m1(df_base.copy())
    df_m2 = pre_processing_m2(df_base.copy())
    return df_m1, df_m2, df_thx


# Backwards compatible entry point (used by legacy code/tests if any)
def pre_processing(data_df: pd.DataFrame) -> pd.DataFrame:
    df_m1, _, _ = pre_processing_all(data_df)
    return df_m1
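THX_FIELDS is the pass-through column list defined elsewhere in this module; given a fully parsed single-application DataFrame (here called raw, a hypothetical name), the three frames split out above are consumed as:

    df_m1, df_m2, df_thx = pre_processing_all(raw)
    # df_m1 feeds the M1 model, df_m2 feeds the M2 model, df_thx keeps only the THX_FIELDS key columns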
processing.py (129 changed lines)
@@ -1,46 +1,107 @@
import json
from functools import lru_cache
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb

# BASE_DIR = Path(__file__).resolve().parent
# M1_MODEL_PATH = BASE_DIR / "xgboost_model_M1.joblib"
# M1_CATEGORY_ORDERS_PATH = BASE_DIR / "category_orders_train_M1.json"
# M2_MODEL_PATH = BASE_DIR / "xgboost_model_M2.joblib"
# M2_CATEGORY_ORDERS_PATH = BASE_DIR / "category_orders_train_M2.json"
# M2_ISO_PATH = BASE_DIR / "isotonic_model_M2.joblib"

M1_MODEL_PATH = "./xgboost_model_M1.joblib"
M1_CATEGORY_ORDERS_PATH = "./category_orders_train_M1.json"
M2_MODEL_PATH = "./xgboost_model_M2.joblib"
M2_CATEGORY_ORDERS_PATH = "./category_orders_train_M2.json"
M2_ISO_PATH = "./isotonic_model_M2.joblib"


def _load_category_orders(path: Path) -> dict:
    with open(path, "r") as f:
        return json.load(f)


@lru_cache(maxsize=1)
def _load_m1_model():
    return joblib.load(M1_MODEL_PATH)


@lru_cache(maxsize=1)
def _load_m2_model():
    return joblib.load(M2_MODEL_PATH)


@lru_cache(maxsize=1)
def _load_m2_iso_model():
    return joblib.load(M2_ISO_PATH)


@lru_cache(maxsize=None)
def _load_category_orders_cached(path: Path):
    # Cache category orders per path to avoid disk I/O on each scoring
    return _load_category_orders(path)
def _prepare(df: pd.DataFrame, category_orders: dict) -> pd.DataFrame:
    df = df.copy()
    for col, categories in category_orders.items():
        if col not in df.columns:
            df[col] = np.nan

        df[col] = df[col].astype(str).str.lower()
        df[col].replace([None, "", "null", np.nan, "nan", " "], np.nan, inplace=True)
        df[col] = pd.Categorical(df[col], categories=categories, ordered=True)
    return df
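A small illustration of _prepare with a toy category-orders mapping (hypothetical values); strings outside the declared categories end up as NaN in the ordered categorical:

    toy_orders = {"riskrating": ["low", "medium", "neutral", "trusted"]}
    toy = pd.DataFrame({"riskrating": ["LOW", "bogus", None]})
    out = _prepare(toy, toy_orders)
    # out["riskrating"] -> ["low", NaN, NaN] as an ordered Categorical with the declared category order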
def processing_m1(input_data: pd.DataFrame) -> pd.DataFrame:
    df = pd.DataFrame(input_data)

    if df.empty:
        raise ValueError("Input DataFrame is empty.")

    model = _load_m1_model()
    df.rename(columns={"riskrating": "RiskRating", "trueipgeo": "TrueIpGeo"}, inplace=True)
    category_orders = _load_category_orders_cached(M1_CATEGORY_ORDERS_PATH)
    df = _prepare(df, category_orders)

    # Ensure all expected features exist
    expected_features = model.feature_names

    # missing_features = [feature for feature in expected_features if feature not in df.columns]
    # for feature in missing_features:
    #     df[feature] = np.nan  # Use NaN to avoid dtype issues

    # Create XGBoost DMatrix
    dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=np.nan)

    # Make predictions
    predictions = model.predict(dmatrix)
    df["prediction"] = predictions

    return df
def processing_m2(input_data: pd.DataFrame) -> pd.DataFrame:
    df = pd.DataFrame(input_data)
    if df.empty:
        raise ValueError("Input DataFrame is empty.")

    model = _load_m2_model()
    category_orders = _load_category_orders_cached(M2_CATEGORY_ORDERS_PATH)
    df = _prepare(df, category_orders)

    expected_features = model.feature_names
    for feature in expected_features:
        if feature not in df.columns:
            df[feature] = np.nan
    dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=np.nan)
    pd_arr = model.predict(dmatrix)
    df["pd_m2"] = pd_arr

    iso_model = _load_m2_iso_model()
    df["pd_m2_iso"] = iso_model.predict(pd_arr)
    return df
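The isotonic step maps the raw M2 probability onto a calibrated scale; the calibrator is simply a joblib-persisted object exposing predict. If it were produced with scikit-learn (which the pinned scikit-learn in the requirements change suggests), training it could look roughly like this sketch, where raw_scores_train and fraud_labels_train are hypothetical arrays of historical raw scores and 0/1 outcomes:

    from sklearn.isotonic import IsotonicRegression
    import joblib

    iso = IsotonicRegression(out_of_bounds="clip")
    iso.fit(raw_scores_train, fraud_labels_train)
    joblib.dump(iso, "./isotonic_model_M2.joblib")
    # at scoring time, iso.predict(pd_arr) returns the calibrated value for each raw score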
def processing_all(df_m1: pd.DataFrame, df_m2: pd.DataFrame, df_thx: pd.DataFrame):
    return processing_m1(df_m1), processing_m2(df_m2), df_thx


# Legacy single-model entry point
def processing(input_data: pd.DataFrame) -> pd.DataFrame:
    return processing_m1(input_data)
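Chaining the two entry points, a condensed usage sketch (raw is a hypothetical parsed single-application DataFrame):

    df_m1, df_m2, df_thx = pre_processing_all(raw)
    scored_m1, scored_m2, df_thx = processing_all(df_m1, df_m2, df_thx)
    # scored_m1["prediction"] is the M1 score; scored_m2["pd_m2"] and scored_m2["pd_m2_iso"] are the raw and calibrated M2 scores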
@@ -97,6 +97,10 @@
        "zip": {
            "type": ["string", "null"],
            "description": "Zip of the current residence."
        },
        "ReasonCode": {
            "type": ["string", "null"],
            "description": "Reason code from ThreatMetrix."
        }
    },
    "required": []
@@ -5,3 +5,4 @@ joblib == 1.4.2
jmespath == 1.0.1
regex == 2023.12.25
json_repair == 0.47.6
scikit-learn==1.5.2
@@ -26,6 +26,54 @@
        "type": ["number", "null"],
        "description": "HD Fraud Score M1"
    },
    "hd_score_m2": {
        "type": ["number", "null"],
        "description": "HD Fraud Score M2"
    },
    "hd_score_iso_m2": {
        "type": ["number", "null"],
        "description": "HD Fraud Score M2 Scaled"
    },
    "digital_id_first_seen": {
        "type": ["string", "null"],
        "description": "Digital ID first seen timestamp"
    },
    "summary_risk_score": {
        "type": ["string", "null"],
        "description": "Summary risk score"
    },
    "cpu_clock": {
        "type": ["string", "null"],
        "description": "CPU clock value from device profiling"
    },
    "account_login_first_seen": {
        "type": ["string", "null"],
        "description": "Account login first seen timestamp"
    },
    "account_telephone_first_seen": {
        "type": ["string", "null"],
        "description": "Account telephone first seen timestamp"
    },
    "true_ip_first_seen": {
        "type": ["string", "null"],
        "description": "True IP first seen timestamp"
    },
    "ssn_hash_first_seen": {
        "type": ["string", "null"],
        "description": "SSN hash first seen timestamp"
    },
    "account_email_attributes": {
        "type": ["string", "null"],
        "description": "Account email attributes"
    },
    "tps_ip_latitude": {
        "type": ["string", "null"],
        "description": "TPS IP latitude"
    },
    "tps_ip_longitude": {
        "type": ["string", "null"],
        "description": "TPS IP longitude"
    },
    "action": {
        "type": ["string", "null"],
        "description": "Recommended Action."
File diff suppressed because one or more lines are too long
BIN xgboost_model_M1.joblib (new file): binary file not shown.
BIN xgboost_model_M2.joblib (new file): binary file not shown.