## Problem Statement
# Load the financial anomaly dataset straight from GitHub and preview the
# first few rows to inspect the schema.
import pandas as pd

DATA_URL = (
    "https://raw.githubusercontent.com/svgoudar/datasets/"
    "refs/heads/main/financial_anomaly_data.csv"
)
df = pd.read_csv(DATA_URL)
df.head()
| | Timestamp | TransactionID | AccountID | Amount | Merchant | TransactionType | Location |
|---|---|---|---|---|---|---|---|
| 0 | 01-01-2023 08:00 | TXN1127 | ACC4 | 95071.92 | MerchantH | Purchase | Tokyo |
| 1 | 01-01-2023 08:01 | TXN1639 | ACC10 | 15607.89 | MerchantH | Purchase | London |
| 2 | 01-01-2023 08:02 | TXN872 | ACC8 | 65092.34 | MerchantE | Withdrawal | London |
| 3 | 01-01-2023 08:03 | TXN1438 | ACC6 | 87.87 | MerchantE | Purchase | London |
| 4 | 01-01-2023 08:04 | TXN1338 | ACC6 | 716.56 | MerchantI | Purchase | Los Angeles |
Here’s a good Kaggle dataset for anomaly detection + starter code sketch using DBSCAN.
## Dataset recommendation
“Network Traffic Anomaly Detection Dataset” on Kaggle. ([Kaggle][1])
Contains network traffic data.
Useful for detecting anomalous patterns (e.g. intrusion, unusual behaviour). ([Kaggle][1])
Another option: Financial Anomaly Data on Kaggle. ([Kaggle][2])
## Starter code sketch (in Python) using DBSCAN
# Starter sketch: unsupervised anomaly detection with DBSCAN.
# Noise points (cluster label -1) are treated as anomalies.
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.metrics import classification_report, confusion_matrix

# 1. Load dataset
df = pd.read_csv('path/to/network_traffic_anomaly_detection.csv')

# 2. Preprocess
#    - handle missing values
#    - select relevant features
#    - encode categorical variables (if any)
#    - scale features (DBSCAN is distance-based, so scaling is essential)
# TODO: replace with real column names from the dataset.
# NOTE: the placeholder list must contain only strings — a literal `...`
# element would make `df[features]` raise a KeyError.
features = ['feature1', 'feature2', 'feature3']
X = df[features].copy()
# Impute numeric NaNs with the column mean. Assign the result instead of
# using inplace=True (discouraged in pandas); numeric_only=True keeps
# mean() from failing if any selected column is non-numeric.
X = X.fillna(X.mean(numeric_only=True))
# if categorical:
# X = pd.get_dummies(X, columns=['cat_feature1', ...])

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 3. DBSCAN clustering
eps = 0.5         # neighborhood radius — tune (e.g. via a k-distance plot)
min_samples = 5   # min points to form a dense region — tune
db = DBSCAN(eps=eps, min_samples=min_samples, metric='euclidean')
labels = db.fit_predict(X_scaled)

# 4. Mark anomalies — DBSCAN assigns label -1 to noise points
df['cluster'] = labels
df['anomaly'] = (labels == -1).astype(int)

# 5. If ground-truth labels exist, evaluate.
#    (Fix: the body of this `if` was unindented, which is a syntax error.)
if 'true_label' in df.columns:
    y_true = df['true_label']
    y_pred = df['anomaly']
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))

# 6. Explore clusters
# Vectorized .sum() instead of Python-level sum() over the Series.
print("Anomalies detected:", int(df['anomaly'].sum()))
# Number of real clusters, excluding the noise label if present.
print("Clusters found:", len(set(labels)) - (1 if -1 in labels else 0))