# -*- coding: utf-8 -*-
"""Huynh_Do_Lab3.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1FHA5TbIDs5UJoCuJ39yBOoHOaBWcmTCT
"""

# Import required Python packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns  # Optional, for nicer default plot styling
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from google.colab import files

# Upload the 'Cereals.csv' file
uploaded = files.upload()

# Load the dataset; the filename passed to read_csv must match the uploaded file
# (e.g. "Cereals.csv" or a local copy such as "b5.csv")
df = pd.read_csv("Cereals.csv")
df.head()
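
# Optional: inspect column types and missing values before preprocessing,
# since PCA needs purely numeric input with no NaNs.
df.info()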

# Step 1: Standardize the dataset (keep numeric columns only and drop rows with
# missing values first, since StandardScaler cannot handle text columns or NaNs)
df_numeric = df.select_dtypes(include=np.number).dropna()
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_numeric)
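
# Quick sanity check: each standardized column should now have mean ~0 and
# standard deviation ~1.
print("Column means (~0):", np.round(df_scaled.mean(axis=0), 4))
print("Column stds (~1):", np.round(df_scaled.std(axis=0), 4))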

# Step 2: Calculate covariance matrix
cov_matrix = np.cov(df_scaled.T)
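
# Optional check: np.cov expects variables in rows (hence the transpose above),
# so the result should be a square, symmetric p x p matrix (p = number of features).
print("Covariance matrix shape:", cov_matrix.shape)
print("Symmetric:", np.allclose(cov_matrix, cov_matrix.T))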

# Step 3: Compute eigenvalues and eigenvectors
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)
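
# np.linalg.eigh is appropriate here because the covariance matrix is symmetric;
# note that it returns eigenvalues in ascending order, which is why Step 4 re-sorts them.
# Optional check: each eigenvector v should satisfy cov_matrix @ v = lambda * v.
assert np.allclose(cov_matrix @ eigenvectors, eigenvectors * eigenvalues)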

# Step 4: Sort eigenvalues and eigenvectors in descending order
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues_sorted = eigenvalues[sorted_indices]
eigenvectors_sorted = eigenvectors[:, sorted_indices]
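
# Optional check: the eigenvalues are now in non-increasing order.
assert np.all(np.diff(eigenvalues_sorted) <= 0)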

# Step 5: Select top k eigenvectors and project the data (2D)
k2 = 2
top_k2_eigenvectors = eigenvectors_sorted[:, :k2]
df_pca_2d = df_scaled.dot(top_k2_eigenvectors)
df_pca_2d = pd.DataFrame(df_pca_2d, columns=[f'PC{i+1}' for i in range(k2)])
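
# Cross-check (a sketch, assuming no repeated eigenvalues): sklearn's PCA should
# produce the same 2D scores up to a sign flip per component, since eigenvectors
# are only defined up to sign.
scores_sklearn = PCA(n_components=k2).fit_transform(df_scaled)
print("Matches sklearn PCA (up to sign):",
      np.allclose(np.abs(scores_sklearn), np.abs(df_pca_2d.values)))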

# Step 6: Visualize 2D PCA
plt.figure(figsize=(8, 6))
plt.scatter(df_pca_2d['PC1'], df_pca_2d['PC2'], alpha=0.5, s=10)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA Projection (2D)')
plt.grid(True)
plt.tight_layout()
plt.show()

# Step 7: Select top k eigenvectors and project the data (3D)
k3 = 3
top_k3_eigenvectors = eigenvectors_sorted[:, :k3]
df_pca_3d = df_scaled.dot(top_k3_eigenvectors)
df_pca_3d = pd.DataFrame(df_pca_3d, columns=[f'PC{i+1}' for i in range(k3)])
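
# Optional check: PCA projections are nested, so the first two columns of the 3D
# projection are identical to the 2D projection computed above.
print("First two 3D components match 2D projection:",
      np.allclose(df_pca_3d[['PC1', 'PC2']].values, df_pca_2d.values))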

# Step 8: Visualize 3D PCA
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- registers the '3d' projection on older Matplotlib versions

fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(df_pca_3d['PC1'], df_pca_3d['PC2'], df_pca_3d['PC3'], alpha=0.4, s=10)
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_zlabel('Principal Component 3')
ax.set_title('PCA Projection (3D)')
plt.tight_layout()
plt.show()

# Total sum of all eigenvalues
total_variance = np.sum(eigenvalues_sorted)

# Variance explained by each principal component
variance_explained = eigenvalues_sorted / total_variance

# Print variance explained by PC1, PC2, PC3
for i in range(3):
    print(f"PC{i+1} explains {variance_explained[i]*100:.2f}% of the total variance.")