# -*- coding: utf-8 -*-
"""Huynh_Do_Lab_6.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1Mk8mGpa-_HooyfgFWHqFNgaNgrWcFcCn
"""

from google.colab import files
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.linear_model import LinearRegression
import numpy as np

# Step 1: Upload the dataset (expected file: 'ClusterData.csv')
# files.upload() opens the Colab file picker and returns a mapping keyed by
# the name each uploaded file was saved under in the working directory.
uploaded = files.upload()

# Use the expected name when it was uploaded (or when nothing was uploaded and
# the file may already be on disk); otherwise fall back to the first uploaded
# file, e.g. a renamed copy such as "ClusterData (1).csv".
if "ClusterData.csv" in uploaded or not uploaded:
    csv_name = "ClusterData.csv"
else:
    csv_name = next(iter(uploaded))

data = pd.read_csv(csv_name)
print("Data Summary:\n", data.describe())

# Step 2: Remove the non-numeric identifier columns so only the remaining
# columns are passed to the scaler.
categorical_columns = ['State', 'state_code']
data_numeric = data.drop(columns=categorical_columns)

# Step 3: Standardize every remaining column. The fitted scaler is kept in
# `scaler`; `data_scaled` holds the transformed values in the same column
# order as `data_numeric`.
scaler = StandardScaler().fit(data_numeric)
data_scaled = scaler.transform(data_numeric)

# Step 4: Hierarchical (agglomerative) clustering on the standardized data.
# No arguments are given, so the estimator runs with the library defaults
# (number of clusters and linkage are whatever scikit-learn ships with).
hierarchical = AgglomerativeClustering()
hierarchical_labels = hierarchical.fit(data_scaled).labels_

# Step 5: KMeans with k=7; the fixed random_state makes centroid seeding
# repeatable. n_init is left unset, so it follows the installed version's
# default.
kmeans = KMeans(random_state=42, n_clusters=7)
kmeans_labels = kmeans.fit(data_scaled).labels_

# Step 6: Apply Gaussian Naive Bayes to predict 'PsychRegions'
# data_scaled was produced from every column of data_numeric, which still
# includes the 'PsychRegions' target. Mask that column out of the feature
# matrix so the label is not leaked into the classifier's inputs.
target = data_numeric['PsychRegions']
feature_mask = data_numeric.columns != 'PsychRegions'
gnb_features = data_scaled[:, feature_mask]

gnb = GaussianNB()
gnb.fit(gnb_features, target)
gnb_predictions = gnb.predict(gnb_features)

# Step 7: Classification report
# NOTE: predictions are made on the same rows the model was fitted on, so
# these are in-sample scores, not an estimate of generalization.
classification_report_gnb = classification_report(target, gnb_predictions)
print("Classification Report:\n", classification_report_gnb)

# Step 8: Visualize the KMeans clusters.
# Only the first two standardized columns are drawn; points are coloured by
# their KMeans label and the fitted centroids are overlaid in the same
# (scaled) coordinates.
fig, ax = plt.subplots(figsize=(12, 6))
first_feature = data_scaled[:, 0]
second_feature = data_scaled[:, 1]
centroids = kmeans.cluster_centers_

sns.scatterplot(
    x=first_feature,
    y=second_feature,
    hue=kmeans_labels,
    palette='Set1',
    s=100,
    ax=ax,
)
ax.scatter(
    centroids[:, 0],
    centroids[:, 1],
    s=300,
    c='black',
    marker='X',
    label='Centroids',
)
ax.set_title('KMeans Clustering with k=7')
ax.legend()
plt.show()