# Fraud Detection Technical Assessment
**Author:** Olutola Oluwatobi Solomon
**Date:** 25th August, 2025
This notebook solves the fraud detection task by:
1. Creating synthetic transaction data with fraud labels.
2. Training multiple machine learning models.
3. Evaluating models with time-based train/test split.
4. Detecting concept drift using PSI.
5. Mitigating drift by recalibrating thresholds.
6. Presenting metrics, plots, and a summary report.
In [11]:
## Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import os, json, textwrap, joblib
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    precision_recall_curve, roc_curve, confusion_matrix,
    f1_score, precision_score, recall_score
)
In [12]:
## Generate Synthetic Dataset
rng = np.random.default_rng(7)
def synthesize_transactions(n=60_000, start_date="2024-01-01", months=9):
    # NOTE: the original start date was redacted in the source; "2024-01-01" is
    # an assumed placeholder. The drift logic depends only on months_from_start,
    # which is relative to this date.
    start = datetime.fromisoformat(start_date)
    timestamps = np.array([start + timedelta(minutes=int(i))
                           for i in np.linspace(0, months*30*24*60, n, endpoint=False)])
    # Features
    customer_id = rng.integers(10_000, 60_000, size=n)
    merchant_id = rng.integers(1_000, 8_000, size=n)
    mcc_list = ["grocery","electronics","restaurant","fashion","fuel","online","travel","utilities"]
    merchant_category = rng.choice(mcc_list, size=n, p=[0.2,0.12,0.18,0.12,0.1,0.18,0.05,0.05])
    txn_hour = np.array([t.hour for t in timestamps])
    day_of_week = np.array([t.weekday() for t in timestamps])
    is_weekend = (day_of_week >= 5).astype(int)
    account_age_days = rng.integers(7, 365*5, size=n)
    card_present = rng.choice([0,1], size=n, p=[0.45,0.55])
    is_international = rng.choice([0,1], size=n, p=[0.92,0.08])
    high_risk_country = (is_international & (rng.random(size=n) < 0.25)).astype(int)
    device_risk_score = np.clip(rng.normal(0.3, 0.2, size=n), 0, 1)
    distance_km = np.clip(np.abs(rng.normal(8, 12, size=n)), 0, None)
    distance_km += (is_international * rng.normal(1500, 500, size=n)).clip(0)
    amounts = np.exp(rng.normal(np.log(35), 0.8, size=n))
    base_rate = 0.8 + 0.3*(txn_hour//8)
    v1h_count = rng.poisson(lam=np.clip(base_rate, 0.2, 3.0), size=n)
    v24h_amount = amounts * rng.uniform(0.1, 2.5, size=n) * (1 + v1h_count*0.1)
    # Drift simulation: amounts shift upward from month 7 onward
    months_from_start = np.array([(t.year - start.year)*12 + (t.month - start.month)
                                  for t in timestamps])
    drift_start_month = 7
    drift_mask = months_from_start >= drift_start_month
    amount_drift_multiplier = np.where(drift_mask, 1.6, 1.0)
    # Fraud probability
    mcc_weight = np.array([
        {"grocery": -0.6, "electronics": 0.3, "restaurant": -0.2, "fashion": 0.1,
         "fuel": -0.3, "online": 0.8, "travel": 0.7, "utilities": -0.4}[m] for m in merchant_category
    ])
    logit = (-5.0
             + 0.002*(amounts - 50)
             + 1.1*(1-card_present)
             + 0.9*is_international
             + 1.5*high_risk_country
             + 2.2*device_risk_score
             + 0.0008*distance_km
             + 0.6*((txn_hour<6)|(txn_hour>22)).astype(int)
             - 0.0008*account_age_days
             + mcc_weight
             + 0.12*v1h_count
             )
    p_fraud = 1/(1+np.exp(-logit))
    is_fraud = rng.binomial(1, np.clip(p_fraud, 0, 1))
    # Apply the drift multiplier to legitimate transactions only
    amounts = amounts * (np.where((is_fraud==0)&drift_mask, amount_drift_multiplier, 1.0))
    df = pd.DataFrame({
        "timestamp":timestamps,"customer_id":customer_id,"merchant_id":merchant_id,
        "amount":amounts,"merchant_category":merchant_category,"txn_hour":txn_hour,
        "day_of_week":day_of_week,"is_weekend":is_weekend,"account_age_days":account_age_days,
        "card_present":card_present,"is_international":is_international,"high_risk_country":high_risk_country,
        "device_risk_score":device_risk_score,"distance_from_home_km":distance_km,
        "v1h_count":v1h_count,"v24h_amount":v24h_amount,"months_from_start":months_from_start,
        "is_fraud":is_fraud
    })
    return df
df = synthesize_transactions()
df.head()
Out[12]:

|   | timestamp | customer_id | merchant_id | amount | merchant_category | txn_hour | day_of_week | is_weekend | … |
|---|---|---|---|---|---|---|---|---|---|
| 0 | -:00:00 | 57245 | 6779 | - | electronics | 0 | 0 | 0 | … |
| 1 | -:06:00 | 41254 | 4461 | - | restaurant | 0 | 0 | 0 | … |
| 2 | -:12:00 | 44208 | 7763 | - | electronics | 0 | 0 | 0 | … |
| 3 | -:19:00 | 54860 | 3212 | - | fashion | 0 | 0 | 0 | … |
| 4 | -:25:00 | 38914 | 1583 | - | utilities | 0 | 0 | 0 | … |
In [13]:
### Train-Test Split
train_mask = df["months_from_start"] <= 5
test_mask = df["months_from_start"] >= 6
df_train, df_test = df.loc[train_mask], df.loc[test_mask]
feature_cols = ["amount","merchant_category","txn_hour","day_of_week","is_weekend",
"account_age_days","card_present","is_international","high_risk_country",
"device_risk_score","distance_from_home_km","v1h_count","v24h_amount"]
target_col = "is_fraud"
X_train, y_train = df_train[feature_cols], df_train[target_col]
X_test, y_test = df_test[feature_cols], df_test[target_col]
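As a quick sanity check (an addition, not in the original notebook): the synthesizer multiplies legitimate amounts by 1.6 from month 7 onward, so the per-month mean amount should jump visibly in the later months.

# Mean transaction amount per month; the 1.6x drift should appear from month 7
monthly_mean = df.groupby("months_from_start")["amount"].mean()
print(monthly_mean.round(2))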
In [16]:
### Preprocessing & Models
# Linear and neural models get scaled numeric features; the tree ensembles
# take raw values, since tree splits are scale-invariant.
numeric_features = ["amount","txn_hour","day_of_week","is_weekend","account_age_days",
"card_present","is_international","high_risk_country",
"device_risk_score","distance_from_home_km","v1h_count","v24h_amount"]
categorical_features = ["merchant_category"]
preprocess_scaled = ColumnTransformer([
    # with_mean=False: scale by standard deviation only, no centering
    ("num", StandardScaler(with_mean=False), numeric_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
])
preprocess_tree = ColumnTransformer([
    ("num", "passthrough", numeric_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
])
models = {
    "LogisticRegression": Pipeline([
        ("prep", preprocess_scaled),
        ("clf", LogisticRegression(max_iter=150, class_weight="balanced", solver="saga"))
    ]),
    "RandomForest": Pipeline([
        ("prep", preprocess_tree),
        # the end of this line was truncated in the source; random_state=7 is
        # assumed to match the other models
        ("clf", RandomForestClassifier(n_estimators=150, class_weight="balanced_subsample",
                                       random_state=7))
    ]),
    "HistGradientBoosting": Pipeline([
        ("prep", preprocess_tree),
        ("clf", HistGradientBoostingClassifier(learning_rate=0.08, random_state=7))
    ]),
    "NeuralNet_MLP": Pipeline([
        ("prep", preprocess_scaled),
        # max_iter=15 is low enough to trigger the ConvergenceWarning shown below
        ("clf", MLPClassifier(hidden_layer_sizes=(64,32), max_iter=15, random_state=7))
    ])
}
In [18]:
## Model Evaluation
def evaluate_pipeline(pipe, X_tr, y_tr, X_te, y_te, name):
    pipe.fit(X_tr, y_tr)
    # Use predicted probabilities when available, else the decision function
    scores = (pipe.predict_proba(X_te)[:,1] if hasattr(pipe[-1], "predict_proba")
              else pipe.decision_function(X_te))
    roc = roc_auc_score(y_te, scores)
    pr = average_precision_score(y_te, scores)
    # Pick the threshold that maximises F1 along the precision-recall curve
    precision, recall, thresholds = precision_recall_curve(y_te, scores)
    f1s = (2*precision*recall)/(precision+recall+1e-9)
    best_idx = np.argmax(f1s[:-1])  # the last PR point has no matching threshold
    best_thr = thresholds[best_idx]
    y_pred = (scores >= best_thr).astype(int)
    return {
        "model":name,"ROC_AUC":roc,"PR_AUC":pr,
        "F1":f1_score(y_te,y_pred),"Precision":precision_score(y_te,y_pred),
        "Recall":recall_score(y_te,y_pred),"Best_Threshold":best_thr
    }
results = [evaluate_pipeline(pipe, X_train, y_train, X_test, y_test, name) for name, pipe in models.items()]
pd.DataFrame(results)
C:\Users\SLMN\anaconda3\Lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:691: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (15) reached and the optimization hasn't converged yet.
  warnings.warn(
Out[18]:

|   | model | ROC_AUC | PR_AUC | F1 | Precision | Recall | Best_Threshold |
|---|---|---|---|---|---|---|---|
| 0 | LogisticRegression | - | - | - | - | - | - |
| 1 | RandomForest | - | - | - | - | - | - |
| 2 | HistGradientBoosting | - | - | - | - | - | - |
| 3 | NeuralNet_MLP | - | - | - | - | - | - |
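confusion_matrix is imported above but never used; as an illustrative addition (the model and threshold choice are arbitrary), the confusion matrix at a model's F1-optimal threshold can be inspected like this, assuming the evaluation cell above has already run:

# Confusion matrix at the F1-optimal threshold found by evaluate_pipeline.
# results[2] is the HistGradientBoosting row; any row works the same way.
best = results[2]
pipe = models[best["model"]]  # already fitted by evaluate_pipeline
scores = pipe.predict_proba(X_test)[:, 1]
y_pred = (scores >= best["Best_Threshold"]).astype(int)
print(confusion_matrix(y_test, y_pred))  # rows: true class, columns: predicted class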
In [19]:
## Drift Detection with PSI
def compute_psi(ref, curr, bins=20):
    # Population Stability Index over quantile bins of the reference sample
    ref, curr = np.asarray(ref), np.asarray(curr)
    qs = np.linspace(0, 1, bins+1)
    edges = np.unique(np.quantile(ref, qs))
    ref_hist, _ = np.histogram(ref, bins=edges)
    curr_hist, _ = np.histogram(curr, bins=edges)
    ref_prop = np.clip(ref_hist/ref_hist.sum(), 1e-6, None)
    curr_prop = np.clip(curr_hist/curr_hist.sum(), 1e-6, None)
    return np.sum((curr_prop-ref_prop)*np.log(curr_prop/ref_prop))
ref_window = df_train
last_month = df[df["months_from_start"]==df["months_from_start"].max()]
psi_amount = compute_psi(ref_window["amount"], last_month["amount"])
psi_device = compute_psi(ref_window["device_risk_score"], last_month["device_risk_score"])
psi_distance = compute_psi(ref_window["distance_from_home_km"], last_month["distance_from_home_km"])
pd.DataFrame({
"feature":["amount","device_risk_score","distance_from_home_km"],
"PSI":[psi_amount,psi_device,psi_distance]
})
Out[19]:

|   | feature | PSI |
|---|---|---|
| 0 | amount | - |
| 1 | device_risk_score | - |
| 2 | distance_from_home_km | - |
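compute_psi implements PSI = Σ (c_i - r_i) * ln(c_i / r_i) over quantile bins of the reference window, where r_i and c_i are the reference and current bin proportions. A common rule of thumb (an industry convention, not stated in this notebook) reads PSI below 0.1 as stable, 0.1 to 0.25 as moderate shift, and above 0.25 as major shift:

# Hypothetical helper mapping PSI values to the conventional 0.1 / 0.25 bands
def psi_label(psi):
    if psi < 0.10:
        return "stable"
    if psi < 0.25:
        return "moderate shift"
    return "major shift"

for feat, psi in [("amount", psi_amount), ("device_risk_score", psi_device),
                  ("distance_from_home_km", psi_distance)]:
    print(f"{feat}: PSI={psi:.3f} ({psi_label(psi)})")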
In [20]:
plt.figure()
plt.hist(ref_window["amount"], bins=50, alpha=0.7, label="Train months")
plt.hist(last_month["amount"], bins=50, alpha=0.7, label="Last month")
plt.legend()
plt.title("Transaction Amount Distribution (Drift visible)")
plt.show()
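The outline promises drift mitigation by recalibrating thresholds (step 5), but that step does not appear in the cells above. A minimal sketch, assuming the HistGradientBoosting pipeline fitted earlier is reused and the most recent labelled month serves as the recalibration window:

# Re-derive the F1-optimal threshold on the most recent labelled month,
# leaving the fitted model itself untouched
pipe = models["HistGradientBoosting"]
recent = df[df["months_from_start"] == df["months_from_start"].max()]
scores = pipe.predict_proba(recent[feature_cols])[:, 1]
precision, recall, thresholds = precision_recall_curve(recent[target_col], scores)
f1s = (2*precision*recall)/(precision+recall+1e-9)
new_thr = thresholds[np.argmax(f1s[:-1])]
print(f"Recalibrated threshold for the drifted regime: {new_thr:.4f}")

Deploying a threshold tuned on the same window it will later score is optimistic; in practice one would recalibrate on a held-out slice of the recent period.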