# -*- coding: utf-8 -*-
"""HUYNH_DO_LAB#4.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1y7aANFvWq2fNQiLfflDQNlpc8r-55GDJ
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from mpl_toolkits.mplot3d import Axes3D
from google.colab import files

# ----------------------------------------------------------
# Upload and Load the Cereals Dataset
# ----------------------------------------------------------
# In Colab, prompt the user to upload 'Cereals.csv' into the runtime.
uploaded = files.upload()

# Read the uploaded CSV into a DataFrame.
df = pd.read_csv("Cereals.csv")
# A bare `df.head()` only displays inside a notebook cell; print it so the
# preview also shows when this file is executed as a plain .py script.
print(df.head())

# ----------------------------------------------------------
# Remove All Cereals with Missing Values
# ----------------------------------------------------------
# Keep only complete rows. The explicit .copy() detaches df_clean from
# df so the cluster-label columns added later don't raise
# chained-assignment warnings.
df_clean = df.dropna(axis="index").copy()

print("\nData After Dropping Missing Values:")
print(df_clean.head())
print(f"\nRows after cleaning: {len(df_clean)} rows remaining")

# ----------------------------------------------------------
# Select Only Numeric Features and Standardize
# ----------------------------------------------------------
# Distance-based clustering is only meaningful on numeric columns, and
# z-scoring puts every feature on a comparable scale so no single
# column dominates the Euclidean distance.
df_numeric = df_clean.select_dtypes(include="number")
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_numeric)

# ----------------------------------------------------------
# Run K-Means Clustering
# ----------------------------------------------------------
# Partition the standardized cereals into 4 clusters; the fixed
# random_state makes the centroid initialization reproducible.
kmeans = KMeans(n_clusters=4, random_state=42)
labels_kmeans = kmeans.fit_predict(df_scaled)

# Attach KMeans Cluster labels
df_clean['KMeans_Cluster'] = labels_kmeans

print("\n KMeans Cluster Assignments:")
print(df_clean[['KMeans_Cluster']].head())

print("\n KMeans Cluster Centroids:")
# NOTE: centroids are expressed in standardized (z-score) units, not the
# original cereal measurement scales.
kmeans_centroids = pd.DataFrame(kmeans.cluster_centers_, columns=df_numeric.columns)
print(kmeans_centroids)

# ----------------------------------------------------------
# Apply Hierarchical Clustering
# ----------------------------------------------------------
# Build agglomerative merge trees under two inter-cluster distance
# definitions: 'single' (nearest neighbour) tends to chain, while
# 'complete' (farthest neighbour) favours compact clusters.
linkage_single = linkage(df_scaled, method='single')
linkage_complete = linkage(df_scaled, method='complete')

# ----------------------------------------------------------
# Compare Dendrograms (Single vs Complete Linkage)
# ----------------------------------------------------------
plt.figure(figsize=(14, 6))
panels = [
    (linkage_single, 'Single Linkage Dendrogram'),
    (linkage_complete, 'Complete Linkage Dendrogram'),
]
for position, (tree, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    dendrogram(tree)
    plt.title(panel_title)
    plt.xlabel('Samples')
    plt.ylabel('Distance')
plt.tight_layout()
plt.show()

# ----------------------------------------------------------
# Analyze Cluster Centroids Again
# ----------------------------------------------------------
# (KMeans centroids were already printed above.)
# Cut the complete-linkage tree into a fixed number of flat clusters;
# 'maxclust' picks the tree height that yields at most num_clusters groups.
num_clusters = 4
complete_clusters = fcluster(linkage_complete, t=num_clusters, criterion='maxclust')
df_clean['CompleteLinkage_Cluster'] = complete_clusters

print("\n Hierarchical Clustering (Complete Linkage) Cluster Assignments:")
print(df_clean[['CompleteLinkage_Cluster']].head())

# ----------------------------------------------------------
# How Many Clusters Would You Use?
# ----------------------------------------------------------
# Elbow method: fit KMeans for k = 1..9 and record the within-cluster
# sum of squared errors (inertia); the "elbow" in the curve suggests
# a good trade-off between fit and cluster count.
k_values = range(1, 10)
sse = [
    KMeans(n_clusters=k, random_state=42).fit(df_scaled).inertia_
    for k in k_values
]

plt.figure(figsize=(8,5))
plt.plot(k_values, sse, marker='o')
plt.title('Elbow Method For Optimal k')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Sum of Squared Errors (SSE)')
plt.grid(True)
plt.show()

print("""
FINAL ANSWER:
 Based on Elbow Method + Dendrogram visual splits
The best number of clusters to use = 4 clusters.
""")