# -*- coding: utf-8 -*-
"""Huynh_Do_Lab_7.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1fPRq2s6SvYQPe8L9jELg0IQKJjstjFby
"""

from google.colab import files
import os
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    accuracy_score,
    roc_curve,
    auc
)
from scipy.cluster.hierarchy import dendrogram, linkage

# ===== Step 1: Load & Inspect Data =====
# Open the Colab upload widget, then read the CSV by its fixed file name.
# NOTE(review): assumes the uploaded file ends up in the working directory
# under exactly this name -- confirm if the file is ever re-uploaded/renamed.
uploaded = files.upload()  # select your 'bill_authentication.csv'
df = pd.read_csv('bill_authentication.csv')

print("\n== Data Info ==")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print("\n== Statistical Summary ==")
print(df.describe())
print("\n== Class Distribution ==")
print(df['Class'].value_counts(), "\n")

# ===== Step 2: Preprocess =====
# Target is the 'Class' column; every remaining column is used as a feature.
y = df['Class']
X = df.drop(columns='Class')

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ===== Step 3: Train Models =====
# Display name -> unfitted classifier. The names are reused as headings and
# plot labels by the later steps, and "Random Forest" is looked up by key.
models = {
    "Logistic Regression":   LogisticRegression(max_iter=1000),
    "K-Nearest Neighbors":   KNeighborsClassifier(),
    "Gaussian Naive Bayes":  GaussianNB(),
    "Random Forest":         RandomForestClassifier(random_state=42),
    # probability=True so the SVC offers predict_proba like the other models.
    "Support Vector Machine": SVC(probability=True, random_state=42)
}

# Fit each classifier in place on the scaled training split. Only the
# estimator objects are needed here, so iterate the values directly.
for model in models.values():
    model.fit(X_train_scaled, y_train)

# ===== Step 4: Evaluate & Print Results =====
# Score every fitted model on the scaled test split, print its metrics, and
# remember the accuracy per model name so the best one can be reported.
results = {}
print("\n== Model Performance ==")
for name, model in models.items():
    y_pred = model.predict(X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred, digits=4)
    results[name] = acc

    print(f"\n--- {name} ---")
    print(f"Accuracy: {acc:.4f}")
    print("Confusion Matrix:", cm, sep="\n")
    print("Classification Report:", report, sep="\n")

# Highest accuracy wins; on a tie the model listed first in `models` is kept.
best = max(results, key=lambda model_name: results[model_name])
print(f"\n>> Best performing model: {best} ({results[best]:.4f})")

# ===== Step 5: Confusion Matrix Heatmaps =====
# One heatmap per fitted model, laid out on a grid three columns wide. The row
# count is derived from the number of models so none is silently dropped by
# zip() if the `models` dict grows (5 models -> 2x3 grid, 18x10 inch figure).
n_cols = 3
n_rows = max(1, -(-len(models) // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(18, 5 * n_rows), squeeze=False
)
axes = axes.flatten()

for ax, (name, model) in zip(axes, models.items()):
    cm = confusion_matrix(y_test, model.predict(X_test_scaled))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
    ax.set_title(name)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")

# The grid can have more cells than there are models; hide the leftover axes
# so no empty frame is drawn next to the heatmaps.
for unused_ax in axes[len(models):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

# ===== Step 6: ROC Curve Comparison =====
# Binarize the test labels against classes [0, 1] and flatten to one dimension.
y_test_bin = label_binarize(y_test, classes=[0, 1]).ravel()
plt.figure(figsize=(10, 8))
ax = plt.gca()

for name, model in models.items():
    # Use the second predict_proba column when the model offers it; otherwise
    # fall back to the raw decision_function output.
    scores = (
        model.predict_proba(X_test_scaled)[:, 1]
        if hasattr(model, "predict_proba")
        else model.decision_function(X_test_scaled)
    )
    fpr, tpr, _ = roc_curve(y_test_bin, scores)
    roc_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, label=f"{name} (AUC={roc_auc:.4f})")

# Dashed diagonal as the chance-level reference line.
ax.plot([0, 1], [0, 1], "k--", label="Random Guess")
ax.set_title("ROC Curve Comparison")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend(loc="lower right")
ax.grid(True)
plt.tight_layout()
plt.show()

# ===== Step 7: Feature Importance (Random Forest) =====
# Pair the fitted forest's importances with the feature column names and
# order them from most to least important for the bar chart.
rf = models["Random Forest"]
feat_imp = pd.Series(rf.feature_importances_, index=X.columns)
feat_imp = feat_imp.sort_values(ascending=False)

plt.figure(figsize=(8, 5))
ax = sns.barplot(x=feat_imp.values, y=feat_imp.index)
ax.set_title("Feature Importance (Random Forest)")
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
plt.tight_layout()
plt.show()