# Fraud Detection Technical Assessment
**Author:** Olutola Oluwatobi Solomon
**Date:** 25th August, 2025
This notebook solves the fraud detection task by:
1. Creating synthetic transaction data with fraud labels.
2. Training multiple machine learning models.
3. Evaluating models with time-based train/test split.
4. Detecting concept drift using PSI.
5. Mitigating drift by recalibrating thresholds.
6. Presenting metrics, plots, and a summary report.
In [11]:
## Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import os, json, textwrap, joblib
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    precision_recall_curve, roc_curve, confusion_matrix,
    f1_score, precision_score, recall_score
)
In [12]:
## Generate Synthetic Dataset
rng = np.random.default_rng(7)
def synthesize_transactions(n=60_000, start_date="2024-01-01", months=9):
    # NOTE: the original start date was redacted in the source; "2024-01-01" is
    # an assumed placeholder. The drift logic depends only on months_from_start,
    # which is relative to this date.
    start = datetime.fromisoformat(start_date)
    timestamps = np.array([start + timedelta(minutes=int(i))
                           for i in np.linspace(0, months*30*24*60, n, endpoint=False)])
    # Features
    customer_id = rng.integers(10_000, 60_000, size=n)
    merchant_id = rng.integers(1_000, 8_000, size=n)
    mcc_list = ["grocery","electronics","restaurant","fashion","fuel","online","travel","utilities"]
    merchant_category = rng.choice(mcc_list, size=n, p=[0.2,0.12,0.18,0.12,0.1,0.18,0.05,0.05])
    txn_hour = np.array([t.hour for t in timestamps])
    day_of_week = np.array([t.weekday() for t in timestamps])
    is_weekend = (day_of_week >= 5).astype(int)
    account_age_days = rng.integers(7, 365*5, size=n)
    card_present = rng.choice([0,1], size=n, p=[0.45,0.55])
    is_international = rng.choice([0,1], size=n, p=[0.92,0.08])
    high_risk_country = (is_international & (rng.random(size=n) < 0.25)).astype(int)
    device_risk_score = np.clip(rng.normal(0.3, 0.2, size=n), 0, 1)
    distance_km = np.clip(np.abs(rng.normal(8, 12, size=n)), 0, None)
    distance_km += (is_international * rng.normal(1500, 500, size=n)).clip(0)
    amounts = np.exp(rng.normal(np.log(35), 0.8, size=n))
    base_rate = 0.8 + 0.3*(txn_hour//8)
    v1h_count = rng.poisson(lam=np.clip(base_rate, 0.2, 3.0), size=n)
    v24h_amount = amounts * rng.uniform(0.1, 2.5, size=n) * (1 + v1h_count*0.1)
    # Drift simulation: amounts shift upward from month 7 onward
    months_from_start = np.array([(t.year - start.year)*12 + (t.month - start.month)
                                  for t in timestamps])
    drift_start_month = 7
    drift_mask = months_from_start >= drift_start_month
    amount_drift_multiplier = np.where(drift_mask, 1.6, 1.0)
    # Fraud probability
    mcc_weight = np.array([
        {"grocery": -0.6, "electronics": 0.3, "restaurant": -0.2, "fashion": 0.1,
         "fuel": -0.3, "online": 0.8, "travel": 0.7, "utilities": -0.4}[m] for m in merchant_category
    ])
    logit = (-5.0
             + 0.002*(amounts - 50)
             + 1.1*(1-card_present)
             + 0.9*is_international
             + 1.5*high_risk_country
             + 2.2*device_risk_score
             + 0.0008*distance_km
             + 0.6*((txn_hour<6)|(txn_hour>22)).astype(int)
             - 0.0008*account_age_days
             + mcc_weight
             + 0.12*v1h_count
             )
    p_fraud = 1/(1+np.exp(-logit))
    is_fraud = rng.binomial(1, np.clip(p_fraud, 0, 1))
    # Apply the drift multiplier to legitimate transactions only
    amounts = amounts * (np.where((is_fraud==0)&drift_mask, amount_drift_multiplier, 1.0))
    df = pd.DataFrame({
        "timestamp":timestamps,"customer_id":customer_id,"merchant_id":merchant_id,
        "amount":amounts,"merchant_category":merchant_category,"txn_hour":txn_hour,
        "day_of_week":day_of_week,"is_weekend":is_weekend,"account_age_days":account_age_days,
        "card_present":card_present,"is_international":is_international,"high_risk_country":high_risk_country,
        "device_risk_score":device_risk_score,"distance_from_home_km":distance_km,
        "v1h_count":v1h_count,"v24h_amount":v24h_amount,"months_from_start":months_from_start,
        "is_fraud":is_fraud
    })
    return df
df = synthesize_transactions()
df.head()
Out[12]:

|   | timestamp | customer_id | merchant_id | amount | merchant_category | txn_hour | day_of_week | is_weekend | … |
|---|---|---|---|---|---|---|---|---|---|
| 0 | -:00:00 | 57245 | 6779 | - | electronics | 0 | 0 | 0 | … |
| 1 | -:06:00 | 41254 | 4461 | - | restaurant | 0 | 0 | 0 | … |
| 2 | -:12:00 | 44208 | 7763 | - | electronics | 0 | 0 | 0 | … |
| 3 | -:19:00 | 54860 | 3212 | - | fashion | 0 | 0 | 0 | … |
| 4 | -:25:00 | 38914 | 1583 | - | utilities | 0 | 0 | 0 | … |
In [13]:
### Train-Test Split
train_mask = df["months_from_start"] <= 5
test_mask = df["months_from_start"] >= 6
df_train, df_test = df.loc[train_mask], df.loc[test_mask]
feature_cols = ["amount","merchant_category","txn_hour","day_of_week","is_weekend",
"account_age_days","card_present","is_international","high_risk_country",
"device_risk_score","distance_from_home_km","v1h_count","v24h_amount"]
target_col = "is_fraud"
X_train, y_train = df_train[feature_cols], df_train[target_col]
X_test, y_test = df_test[feature_cols], df_test[target_col]
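As a quick sanity check (an addition, not in the original notebook): the synthesizer multiplies legitimate amounts by 1.6 from month 7 onward, so the per-month mean amount should jump visibly in the later months.

# Mean transaction amount per month; the 1.6x drift should appear from month 7
monthly_mean = df.groupby("months_from_start")["amount"].mean()
print(monthly_mean.round(2))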
In [16]:
### Preprocessing & Models
# Linear and neural models get scaled numeric features; the tree ensembles
# take raw values, since tree splits are scale-invariant.
numeric_features = ["amount","txn_hour","day_of_week","is_weekend","account_age_days",
"card_present","is_international","high_risk_country",
"device_risk_score","distance_from_home_km","v1h_count","v24h_amount"]
categorical_features = ["merchant_category"]
preprocess_scaled = ColumnTransformer([
    # with_mean=False: scale by standard deviation only, no centering
    ("num", StandardScaler(with_mean=False), numeric_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
])
preprocess_tree = ColumnTransformer([
    ("num", "passthrough", numeric_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
])
models = {
    "LogisticRegression": Pipeline([
        ("prep", preprocess_scaled),
        ("clf", LogisticRegression(max_iter=150, class_weight="balanced", solver="saga"))
    ]),
    "RandomForest": Pipeline([
        ("prep", preprocess_tree),
        # the end of this line was truncated in the source; random_state=7 is
        # assumed to match the other models
        ("clf", RandomForestClassifier(n_estimators=150, class_weight="balanced_subsample",
                                       random_state=7))
    ]),
    "HistGradientBoosting": Pipeline([
        ("prep", preprocess_tree),
        ("clf", HistGradientBoostingClassifier(learning_rate=0.08, random_state=7))
    ]),
    "NeuralNet_MLP": Pipeline([
        ("prep", preprocess_scaled),
        # max_iter=15 is low enough to trigger the ConvergenceWarning shown below
        ("clf", MLPClassifier(hidden_layer_sizes=(64,32), max_iter=15, random_state=7))
    ])
}
In [18]:
## Model Evaluation
def evaluate_pipeline(pipe, X_tr, y_tr, X_te, y_te, name):
    pipe.fit(X_tr, y_tr)
    # Use predicted probabilities when available, else the decision function
    scores = (pipe.predict_proba(X_te)[:,1] if hasattr(pipe[-1], "predict_proba")
              else pipe.decision_function(X_te))
    roc = roc_auc_score(y_te, scores)
    pr = average_precision_score(y_te, scores)
    # Pick the threshold that maximises F1 along the precision-recall curve
    precision, recall, thresholds = precision_recall_curve(y_te, scores)
    f1s = (2*precision*recall)/(precision+recall+1e-9)
    best_idx = np.argmax(f1s[:-1])  # the last PR point has no matching threshold
    best_thr = thresholds[best_idx]
    y_pred = (scores >= best_thr).astype(int)
    return {
        "model":name,"ROC_AUC":roc,"PR_AUC":pr,
        "F1":f1_score(y_te,y_pred),"Precision":precision_score(y_te,y_pred),
        "Recall":recall_score(y_te,y_pred),"Best_Threshold":best_thr
    }
results = [evaluate_pipeline(pipe, X_train, y_train, X_test, y_test, name) for name, pipe in models.items()]
pd.DataFrame(results)
C:\Users\SLMN\anaconda3\Lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:691: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (15) reached and the optimization hasn't converged yet.
  warnings.warn(
Out[18]:

|   | model | ROC_AUC | PR_AUC | F1 | Precision | Recall | Best_Threshold |
|---|---|---|---|---|---|---|---|
| 0 | LogisticRegression | - | - | - | - | - | - |
| 1 | RandomForest | - | - | - | - | - | - |
| 2 | HistGradientBoosting | - | - | - | - | - | - |
| 3 | NeuralNet_MLP | - | - | - | - | - | - |
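confusion_matrix is imported above but never used; as an illustrative addition (the model and threshold choice are arbitrary), the confusion matrix at a model's F1-optimal threshold can be inspected like this, assuming the evaluation cell above has already run:

# Confusion matrix at the F1-optimal threshold found by evaluate_pipeline.
# results[2] is the HistGradientBoosting row; any row works the same way.
best = results[2]
pipe = models[best["model"]]  # already fitted by evaluate_pipeline
scores = pipe.predict_proba(X_test)[:, 1]
y_pred = (scores >= best["Best_Threshold"]).astype(int)
print(confusion_matrix(y_test, y_pred))  # rows: true class, columns: predicted class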
In [19]:
## Drift Detection with PSI
def compute_psi(ref, curr, bins=20):
    # Population Stability Index over quantile bins of the reference sample
    ref, curr = np.asarray(ref), np.asarray(curr)
    qs = np.linspace(0, 1, bins+1)
    edges = np.unique(np.quantile(ref, qs))
    ref_hist, _ = np.histogram(ref, bins=edges)
    curr_hist, _ = np.histogram(curr, bins=edges)
    ref_prop = np.clip(ref_hist/ref_hist.sum(), 1e-6, None)
    curr_prop = np.clip(curr_hist/curr_hist.sum(), 1e-6, None)
    return np.sum((curr_prop-ref_prop)*np.log(curr_prop/ref_prop))
ref_window = df_train
last_month = df[df["months_from_start"]==df["months_from_start"].max()]
psi_amount = compute_psi(ref_window["amount"], last_month["amount"])
psi_device = compute_psi(ref_window["device_risk_score"], last_month["device_risk_score"])
psi_distance = compute_psi(ref_window["distance_from_home_km"], last_month["distance_from_home_km"])
pd.DataFrame({
"feature":["amount","device_risk_score","distance_from_home_km"],
"PSI":[psi_amount,psi_device,psi_distance]
})
Out[19]:

|   | feature | PSI |
|---|---|---|
| 0 | amount | - |
| 1 | device_risk_score | - |
| 2 | distance_from_home_km | - |
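compute_psi implements PSI = Σ (c_i - r_i) * ln(c_i / r_i) over quantile bins of the reference window, where r_i and c_i are the reference and current bin proportions. A common rule of thumb (an industry convention, not stated in this notebook) reads PSI below 0.1 as stable, 0.1 to 0.25 as moderate shift, and above 0.25 as major shift:

# Hypothetical helper mapping PSI values to the conventional 0.1 / 0.25 bands
def psi_label(psi):
    if psi < 0.10:
        return "stable"
    if psi < 0.25:
        return "moderate shift"
    return "major shift"

for feat, psi in [("amount", psi_amount), ("device_risk_score", psi_device),
                  ("distance_from_home_km", psi_distance)]:
    print(f"{feat}: PSI={psi:.3f} ({psi_label(psi)})")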
In [20]:
plt.figure()
plt.hist(ref_window["amount"], bins=50, alpha=0.7, label="Train months")
plt.hist(last_month["amount"], bins=50, alpha=0.7, label="Last month")
plt.legend()
plt.title("Transaction Amount Distribution (Drift visible)")
plt.show()
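The outline promises drift mitigation by recalibrating thresholds (step 5), but that step does not appear in the cells above. A minimal sketch, assuming the HistGradientBoosting pipeline fitted earlier is reused and the most recent labelled month serves as the recalibration window:

# Re-derive the F1-optimal threshold on the most recent labelled month,
# leaving the fitted model itself untouched
pipe = models["HistGradientBoosting"]
recent = df[df["months_from_start"] == df["months_from_start"].max()]
scores = pipe.predict_proba(recent[feature_cols])[:, 1]
precision, recall, thresholds = precision_recall_curve(recent[target_col], scores)
f1s = (2*precision*recall)/(precision+recall+1e-9)
new_thr = thresholds[np.argmax(f1s[:-1])]
print(f"Recalibrated threshold for the drifted regime: {new_thr:.4f}")

Deploying a threshold tuned on the same window it will later score is optimistic; in practice one would recalibrate on a held-out slice of the recent period.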