diff --git a/main.sql b/main.sql index a3d7e2c..7fdebe9 100644 --- a/main.sql +++ b/main.sql @@ -1,20 +1,47 @@ WITH abc AS ( - SELECT DISTINCT ON (app.application_key) + SELECT DISTINCT ON (app.application_key) app.application_key, CAST(app.application_timestamp AS TIMESTAMP) AS application_timestamp, + CAST(app.application_date_of_birth AS DATE) AS application_date_of_birth, LOWER(app.application_email_address) AS application_email_address, app.application_is_rejected, - NULLIF(app.application_ssn, '') AS application_ssn, + NULLIF(app.application_ssn, '') AS application_ssn, NULLIF(app.application_bank_account_number, '') AS application_bank_account_number, loans.originated_loan_is_fraud, - loans.originated_loan_is_never_paid + loans.originated_loan_is_never_paid, + CAST(CASE WHEN LOWER(TRIM(thx.digital_id_first_seen::text)) IN ('', 'none', 'null') THEN NULL ELSE thx.digital_id_first_seen::date END AS DATE) AS digital_id_first_seen, + CAST(NULLIF(NULLIF(LOWER(TRIM(thx.summary_risk_score::text)), ''), 'none') AS DOUBLE PRECISION) AS summary_risk_score, + CAST(NULLIF(NULLIF(LOWER(TRIM(thx.cpu_clock::text)), ''), 'none') AS DOUBLE PRECISION) AS cpu_clock, + CAST(NULLIF(NULLIF(thx.true_ip_first_seen::text, ''), 'None') AS DATE) AS true_ip_first_seen, + CAST(NULLIF(NULLIF(thx.ssn_hash_first_seen::text, ''),'None') AS DATE) AS ssn_hash_first_seen, + thx.account_email_attributes, + CAST(NULLIF(NULLIF(LOWER(TRIM(thx.tps_ip_latitude::text)), ''), 'none') AS DOUBLE PRECISION) AS tps_ip_latitude, + CAST(NULLIF(NULLIF(LOWER(TRIM(thx.tps_ip_longitude::text)), ''), 'none') AS DOUBLE PRECISION) AS tps_ip_longitude, + CAST(NULLIF(NULLIF(thx.account_telephone_first_seen::text, ''),'None') AS DATE) AS account_telephone_first_seen, + CAST(NULLIF(NULLIF(thx.account_login_first_seen::text, ''),'None') AS DATE) AS account_login_first_seen, + CAST(NULLIF(NULLIF(LOWER(TRIM(ref.latitute_ref::text)), ''), 'none') AS DOUBLE PRECISION) AS latitute_ref, + CAST(NULLIF(NULLIF(LOWER(TRIM(ref.longitude_ref::text)), ''), 'none') AS DOUBLE PRECISION) AS longitude_ref, + CAST(NULLIF(NULLIF(LOWER(TRIM(scores.hd_score_m2::text)), ''), 'none') AS DOUBLE PRECISION) AS hd_score_m2 + -- CAST(NULLIF(NULLIF(LOWER(TRIM(scores.hd_score_iso_m2::text)), ''), 'none') AS DOUBLE PRECISION) AS hd_score_iso_m2 FROM public.applications AS app LEFT JOIN public.bridge_funding_lifecycle AS master ON app.application_key = master.application_key LEFT JOIN public.originated_loan AS loans ON master.originated_loan_key = loans.originated_loan_key + LEFT JOIN public.thx AS thx + ON app.application_key = thx.application_key + LEFT JOIN public.uprova AS up + ON app.application_key = up.application_key + LEFT JOIN public.latitute_longitute_reference AS ref + ON + /* safe ZIP join: strip non-digits, take first 5, cast to int on both sides */ + CAST(SUBSTRING(REGEXP_REPLACE(up.zip::text, '[^0-9]', '', 'g') FOR 5) AS INTEGER) + = + CAST(SUBSTRING(REGEXP_REPLACE(ref.postal_code_ref::text, '[^0-9]', '', 'g') FOR 5) AS INTEGER) + LEFT JOIN public.scores AS scores + ON app.application_key = scores.application_key WHERE app.application_key = $application_key - OR app.application_key = ANY(ARRAY$connected_application_keys::text[]) + OR app.application_key = ANY(ARRAY$connected_application_keys::text[]) ORDER BY app.application_key, app.application_timestamp DESC ), @@ -23,9 +50,10 @@ current_app AS ( application_key AS current_app_key, application_timestamp AS current_app_created_at, $hd_score_m1 AS hd_score_m1, - $hd_score_iso_m2 AS hd_score_iso_m2 + $hd_score_iso_m2 AS hd_score_iso_m2, + hd_score_m2 FROM abc - WHERE application_key = $application_key + WHERE application_key = $application_key ), connected_apps AS ( @@ -52,7 +80,7 @@ clusters_alt AS ( THEN 1 ELSE 0 END AS target_connected_30, - CONCAT(a.application_ssn, '_', a.application_bank_account_number) AS user_connected_v2, + CONCAT(a.application_ssn, '_', a.application_bank_account_number) AS user_connected_v2, a.application_timestamp AS connected_date, a.application_email_address AS connected_email, a.application_is_rejected AS application_is_rejected_connected @@ -78,7 +106,7 @@ clusters AS ( SELECT a.current_app_key AS application_key, CAST(MAX(hd_score_m1) AS FLOAT) AS hd_score_m1, - CAST(MAX(hd_score_iso_m2) AS FLOAT) AS hd_score_iso_m2, + CAST(MAX(hd_score_iso_m2) AS FLOAT) as hd_score_iso_m2, COUNT(DISTINCT a.user_connected_v2) AS cluster_size_users_v2, MAX(b.target_connected_30_sum) AS target_connected_30_sum, COUNT(DISTINCT a.connected_email) AS email_cnt, @@ -89,15 +117,163 @@ clusters AS ( LEFT JOIN full_apps_past b ON a.current_app_key = b.current_app_key GROUP BY 1 +), + +clusters_alt_g2 AS ( + SELECT + b.current_app_key, + a.connected_app_key, + a.application_ssn, + a.application_bank_account_number, + a.application_is_rejected, + a.hd_score_m2 AS hd_score_m2_connected, + CASE + WHEN a.application_timestamp IS NULL OR a.application_date_of_birth IS NULL THEN NULL + ELSE DATE_PART('year', AGE(a.application_timestamp, a.application_date_of_birth)) + END AS applicant_age_connected, + CASE + WHEN a.application_timestamp IS NULL OR a.account_telephone_first_seen IS NULL THEN NULL + ELSE (a.application_timestamp::date - a.account_telephone_first_seen::date) + END AS account_telephone_first_seen_age_connected, + CASE + WHEN a.application_timestamp IS NULL OR a.ssn_hash_first_seen IS NULL THEN NULL + ELSE (a.application_timestamp::date - a.ssn_hash_first_seen::date) + END AS ssn_hash_first_seen_age_connected, + CASE + WHEN a.application_timestamp IS NULL OR a.account_login_first_seen IS NULL THEN NULL + ELSE (a.application_timestamp::date - a.account_login_first_seen::date) + END AS account_login_first_seen_age_connected, + CASE + WHEN a.application_timestamp IS NULL OR a.digital_id_first_seen IS NULL THEN NULL + ELSE (a.application_timestamp::date - a.digital_id_first_seen::date) + END AS digital_id_first_seen_age_connected, + CASE + WHEN a.application_timestamp IS NULL OR a.true_ip_first_seen IS NULL THEN NULL + ELSE (a.application_timestamp::date - a.true_ip_first_seen::date) + END AS true_ip_first_seen_age_connected, + CASE + WHEN a.latitute_ref IS NULL OR a.longitude_ref IS NULL + OR a.tps_ip_latitude IS NULL OR a.tps_ip_longitude IS NULL + THEN NULL + ELSE 2 * 6371.0088 * ASIN( + LEAST( + 1.0, + SQRT( + POWER(SIN(((a.tps_ip_latitude - a.latitute_ref) * 0.017453292519943295) / 2), 2) + + COS(a.latitute_ref * 0.017453292519943295) * COS(a.tps_ip_latitude * 0.017453292519943295) + * POWER(SIN(((a.tps_ip_longitude - a.longitude_ref) * 0.017453292519943295) / 2), 2) + ) + ) + ) + END AS dist_em_ip_ref_km_connected, + CASE + WHEN LOWER(a.account_email_attributes) ILIKE '%challenged%' THEN 1 + ELSE 0 + END AS account_email_attributes_challenged_connected, + a.summary_risk_score AS summary_risk_score_connected, + a.cpu_clock AS cpu_clock_connected, + CONCAT(a.application_ssn, '_', a.application_bank_account_number) AS connected_user_id + FROM connected_apps a + CROSS JOIN current_app b +), + +full_apps_past_g2 AS ( + SELECT + current_app_key, + COUNT(DISTINCT connected_user_id) AS connected_user_count, + COUNT(DISTINCT CASE + WHEN LOWER(CAST(application_is_rejected AS TEXT)) = 'true' + THEN connected_user_id + END) AS rejected_app_count_g2, + MAX(hd_score_m2_connected) AS hd_score_m2_connected_max, + AVG(hd_score_m2_connected) AS hd_score_m2_connected_avg, + MAX(applicant_age_connected) AS applicant_age_connected_max, + AVG(applicant_age_connected) AS applicant_age_connected_avg, + MIN(account_telephone_first_seen_age_connected) AS account_tel_first_seen_min_conn, + MAX(account_telephone_first_seen_age_connected) AS account_tel_first_seen_max_conn, + AVG(account_telephone_first_seen_age_connected) AS account_tel_first_seen_avg_conn, + MIN(cpu_clock_connected) AS cpu_clock_min_conn, + MAX(cpu_clock_connected) AS cpu_clock_max_conn, + MIN(ssn_hash_first_seen_age_connected) AS ssn_hash_first_seen_min_conn, + AVG(ssn_hash_first_seen_age_connected) AS ssn_hash_first_seen_avg_conn, + MIN(account_login_first_seen_age_connected) AS account_login_first_seen_min_conn, + MAX(account_login_first_seen_age_connected) AS account_login_first_seen_max_conn, + STDDEV_SAMP(account_login_first_seen_age_connected) AS account_login_first_seen_stddev_conn, + MAX(digital_id_first_seen_age_connected) AS digital_id_first_seen_max_conn, + MIN(true_ip_first_seen_age_connected) AS true_ip_first_seen_min_conn, + MAX(true_ip_first_seen_age_connected) AS true_ip_first_seen_max_conn, + MIN(dist_em_ip_ref_km_connected) AS dist_em_ip_ref_km_min_conn, + MAX(summary_risk_score_connected) AS summary_risk_score_max_conn, + COUNT(DISTINCT CASE + WHEN account_email_attributes_challenged_connected = 1 THEN connected_user_id + END) AS acc_email_attr_challenged_1_conn + FROM clusters_alt_g2 + WHERE current_app_key != connected_app_key + GROUP BY current_app_key +), + +clusters_g2 AS ( + SELECT + cur.current_app_key AS application_key, + cur.hd_score_m2 AS hd_score_m2, + agg.hd_score_m2_connected_avg, + agg.hd_score_m2_connected_max, + agg.applicant_age_connected_max, + agg.applicant_age_connected_avg, + agg.account_tel_first_seen_min_conn, + agg.account_tel_first_seen_max_conn, + agg.account_tel_first_seen_avg_conn, + agg.ssn_hash_first_seen_min_conn, + agg.ssn_hash_first_seen_avg_conn, + agg.account_login_first_seen_min_conn, + agg.account_login_first_seen_stddev_conn, + (agg.account_login_first_seen_max_conn - agg.account_login_first_seen_min_conn) AS account_login_first_seen_range_conn, + agg.digital_id_first_seen_max_conn, + agg.true_ip_first_seen_min_conn, + agg.true_ip_first_seen_max_conn, + (agg.cpu_clock_max_conn - agg.cpu_clock_min_conn) AS cpu_clock_range_conn, + agg.summary_risk_score_max_conn, + CASE + WHEN agg.connected_user_count IS NULL OR agg.connected_user_count = 0 THEN NULL + ELSE (agg.acc_email_attr_challenged_1_conn::DOUBLE PRECISION / agg.connected_user_count::DOUBLE PRECISION) * 100 + END AS pct_acc_email_attr_challenged_1_conn, + agg.rejected_app_count_g2, + agg.dist_em_ip_ref_km_min_conn + FROM current_app cur + LEFT JOIN full_apps_past_g2 agg + ON cur.current_app_key = agg.current_app_key ) SELECT - COALESCE(MAX(hd_score_m1), $hd_score_m1) AS hd_score_m1, - COALESCE(MAX(hd_score_iso_m2), $hd_score_iso_m2) AS hd_score_iso_m2, - COALESCE(MAX(cluster_size_users_v2), 0) AS cluster_size_users_v2, - COALESCE(MAX(target_connected_30_sum), 0) AS target_connected_30_sum, - COALESCE(MAX(email_cnt), 0) AS email_cnt, - COALESCE(MAX(rejected_app_count), 0) AS rejected_app_count, - COALESCE(MAX(app_dt_day_cnt), 0) AS app_dt_day_cnt, - COALESCE(MAX(cluster_size), 0) AS cluster_size -FROM clusters; \ No newline at end of file + COALESCE(MAX(c.hd_score_m1), $hd_score_m1) AS hd_score_m1, + COALESCE(MAX(c.hd_score_iso_m2), $hd_score_iso_m2) AS hd_score_iso_m2, + COALESCE(MAX(c.cluster_size_users_v2), 0) AS cluster_size_users_v2, + COALESCE(MAX(c.target_connected_30_sum), 0) AS target_connected_30_sum, + COALESCE(MAX(c.email_cnt), 0) AS email_cnt, + COALESCE(MAX(c.rejected_app_count), 0) AS rejected_app_count, + COALESCE(MAX(c.app_dt_day_cnt), 0) AS app_dt_day_cnt, + COALESCE(MAX(c.cluster_size), 0) AS cluster_size, + MAX(g2.hd_score_m2) AS hd_score_m2, + MAX(g2.hd_score_m2_connected_avg) AS hd_score_m2_connected_avg, + MAX(g2.rejected_app_count_g2) AS rejected_app_count_g2, + MAX(g2.summary_risk_score_max_conn) AS summary_risk_score_max_conn, + MAX(g2.cpu_clock_range_conn) AS cpu_clock_range_conn, + MAX(g2.ssn_hash_first_seen_avg_conn) AS ssn_hash_first_seen_avg_conn, + MAX(g2.account_tel_first_seen_avg_conn) AS account_tel_first_seen_avg_conn, + MAX(g2.account_tel_first_seen_min_conn) AS account_tel_first_seen_min_conn, + MAX(g2.account_login_first_seen_stddev_conn) AS account_login_first_seen_stddev_conn, + MAX(g2.account_login_first_seen_min_conn) AS account_login_first_seen_min_conn, + MAX(g2.ssn_hash_first_seen_min_conn) AS ssn_hash_first_seen_min_conn, + MAX(g2.account_tel_first_seen_max_conn) AS account_tel_first_seen_max_conn, + MAX(g2.applicant_age_connected_max) AS applicant_age_connected_max, + MAX(g2.applicant_age_connected_avg) AS applicant_age_connected_avg, + MAX(g2.digital_id_first_seen_max_conn) AS digital_id_first_seen_max_conn, + MAX(g2.account_login_first_seen_range_conn) AS account_login_first_seen_range_conn, + MAX(g2.true_ip_first_seen_min_conn) AS true_ip_first_seen_min_conn, + MAX(g2.true_ip_first_seen_max_conn) AS true_ip_first_seen_max_conn, + MAX(g2.hd_score_m2_connected_max) AS hd_score_m2_connected_max, + MAX(g2.pct_acc_email_attr_challenged_1_conn) AS pct_acc_email_attr_challenged_1_conn, + MAX(g2.dist_em_ip_ref_km_min_conn) AS dist_em_ip_ref_km_min_conn +FROM clusters c +LEFT JOIN clusters_g2 g2 + ON c.application_key = g2.application_key;