06.02 - PCA
Contents
06.02 - PCA¶
!wget --no-cache -O init.py -q https://raw.githubusercontent.com/fagonzalezo/ai4eng-unal/main/content/init.py
import init; init.init(force_download=False); init.get_weblink()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
consulta A Tutorial on Principal Component Analysis para una descripción intuitiva y detallada de PCA y SVD
Intuición¶
tenemos los siguientes datos 2D y nos gustaría encontrar una proyección en 1D que preserve la máxima cantidad de variabilidad.
# Build a correlated 2-D point cloud: an isotropic Gaussian sample pushed
# through a random linear map, then centered on the origin.
np.random.seed(1)
mixing = np.random.random(size=(2, 2))       # random 2x2 map -> correlated coords
gaussian = np.random.normal(size=(2, 200))   # 200 standard-normal 2-D samples
X = np.dot(mixing, gaussian).T + 10
# center data on 0,0
X = X - np.mean(X, axis=0)
print (X.shape)
plt.scatter(X[:,0], X[:,1])
(200, 2)
<matplotlib.collections.PathCollection at 0x7fa907e03890>
recuerda que la proyección de un vector \(\vec{x}\) en otro vector \(\vec{v}\) (consulta here) viene dada por:

\[\text{proj}_{\vec{v}}\,\vec{x} = c\,\vec{v} \qquad \text{con} \qquad c = \frac{\vec{x}\cdot\vec{v}}{\|\vec{v}\|^2}\]

donde \(c\) es el tamaño de la proyección de \(\vec{x}\) sobre \(\vec{v}\)
inspeccionamos algunas proyecciones¶
def unit_vector(angle):
    """Unit vector at the given angle (radians) from the x-axis."""
    return np.array([np.cos(angle), np.sin(angle)])

plt.figure(figsize=(15,3))
for i in range(3):
    plt.subplot(1, 3, i + 1)
    # first panel uses a fixed angle; the other two draw a random direction
    angle = 1.8 if i == 0 else np.random.random() * np.pi * 2
    v = unit_vector(angle)
    # scalar projection of every sample onto v
    c = X.dot(v.reshape(-1, 1)) / (np.linalg.norm(v) ** 2)
    # projected points: each scalar coordinate times the direction vector
    Xp = c * v.reshape(1, -1)
    plt.scatter(X[:,0], X[:,1], color="blue", alpha=.5, label="original data")
    plt.scatter(Xp[:,0], Xp[:,1], color="red", alpha=.5, label="projected data")
    plt.axvline(0, color="gray")
    plt.axhline(0, color="gray")
    plt.plot([0,v[0]], [0,v[1]], color="black", lw=3, label="projection vector")
    plt.axis('equal')
    plt.ylim(-2, 2)
    plt.title("$\\alpha$=%.2f rads, proj std=%.3f"%(angle, np.std(c)))
    if i == 2:
        plt.legend(loc="center left", bbox_to_anchor=(1.01,.5))
encontremos las proyecciones con mayor y menor std por fuerza bruta¶
def get_maxmin_projections(X, n_angles=100):
    """Brute-force search for the 1-D projection directions of maximum
    and minimum standard deviation.

    Parameters
    ----------
    X : array of shape (n_samples, 2)
        2-D data (assumed centered — TODO confirm for other callers).
    n_angles : int, default 100
        Number of candidate angles sampled uniformly in [0, 2*pi].

    Returns
    -------
    angles : array of shape (n_angles,)
        Candidate angles in radians.
    stds : list of float
        Projection standard deviation for each candidate angle.
    v1 : array of shape (2,)
        Unit vector along the direction of maximal projection std.
    v2 : array of shape (2,)
        Unit vector along the direction of minimal projection std.
    """
    angles = np.linspace(0, np.pi * 2, n_angles)
    stds = []
    for a in angles:
        v = np.array([np.cos(a), np.sin(a)])
        # scalar projection of each sample onto v; the norm**2 divisor is 1
        # here (v is unit) but kept for clarity with the projection formula
        c = X.dot(v.reshape(-1, 1)) / (np.linalg.norm(v) ** 2)
        stds.append(np.std(c))
    # self-contained: build the best/worst unit vectors inline instead of
    # relying on a module-level `unit_vector` helper
    a_max = angles[np.argmax(stds)]
    a_min = angles[np.argmin(stds)]
    v1 = np.array([np.cos(a_max), np.sin(a_max)])
    v2 = np.array([np.cos(a_min), np.sin(a_min)])
    return angles, stds, v1, v2
# Sweep all candidate angles and plot how the projection spread changes;
# v1/v2 (max/min std directions) are reused by the next cells.
angles, stds, v1, v2 = get_maxmin_projections(X)
plt.plot(angles, stds)
plt.ylabel("projection std")
plt.xlabel("projection $\\alpha$ (in rads)")
Text(0,0.5,'projection std')
# Overlay the max-std and min-std projection directions on the data cloud.
plt.scatter(X[:,0], X[:,1], color="blue", alpha=.5, label="original data")
plt.axvline(0, color="gray")
plt.axhline(0, color="gray")
direction_specs = [
    (v1, dict(lw=5, label="max std projection vector")),
    (v2, dict(ls="--", lw=2, label="min std projection vector")),
]
for vec, kw in direction_specs:
    plt.plot([0, vec[0]], [0, vec[1]], color="black", **kw)
plt.axis('equal')
plt.ylim(-2, 2)
plt.legend(loc="center left", bbox_to_anchor=(1.01,.5))
<matplotlib.legend.Legend at 0x7fa907afc450>
estos son los componentes principales!! observa que su dimensionalidad es la misma que los datos originales
esto es lo que PCA nos da
from sklearn.decomposition import PCA

# fit() returns the estimator itself, so construction and fitting chain
pca = PCA(n_components=1).fit(X)

# compare the component found by sklearn against our brute-force search
print("sklearn PCA components")
print(pca.components_)
print("brute force components")
print(v1)
print(v2)
sklearn PCA components
[[-0.94446029 -0.32862557]]
brute force components
[-0.93969262 -0.34202014]
[-0.32706796 0.94500082]
# transform() returns, for each sample, its coordinate along the
# principal component kept by the fitted PCA
c = pca.transform(X)
print (c.shape)  # (n_samples, n_components) -> (200, 1)
c
(200, 1)
array([[ 6.76769235e-01],
[-1.07121393e+00],
[ 7.27912364e-01],
[ 2.30964136e+00],
[ 6.30052323e-01],
[ 1.02448887e+00],
[ 7.77183507e-01],
[-1.39656414e+00],
[-2.91049874e-01],
[ 1.88864221e+00],
[-7.11544293e-01],
[ 6.38884130e-01],
[ 5.48059617e-01],
[-2.19312436e-01],
[-3.87789490e-01],
[ 7.15219956e-01],
[-1.08373816e+00],
[-2.99917403e-01],
[-7.96849021e-01],
[-8.12568346e-01],
[-1.54018281e+00],
[-2.52920476e-01],
[ 6.26464454e-01],
[-1.61007571e+00],
[ 5.04240563e-01],
[ 5.53935753e-01],
[ 6.81911252e-01],
[-2.00157228e-02],
[ 1.13550833e-01],
[ 2.92286085e-02],
[-2.14393483e-01],
[-1.03406124e+00],
[ 3.88635004e-01],
[ 9.96727811e-01],
[ 1.39223653e+00],
[ 4.57043694e-01],
[ 6.81839901e-01],
[-9.05233246e-01],
[ 4.94316334e-01],
[ 6.22411280e-01],
[ 3.26088548e-01],
[ 4.52560386e-01],
[ 6.81840663e-01],
[-2.44832816e-01],
[-5.27149562e-01],
[-4.51448737e-01],
[-1.42864453e+00],
[ 8.05233004e-01],
[ 1.81049742e-01],
[ 3.49039347e-01],
[ 2.65803583e+00],
[-1.34272221e+00],
[-1.73026340e-01],
[ 6.13676729e-01],
[-1.89940741e+00],
[-7.93074429e-01],
[-4.17072486e-01],
[ 1.54913526e-01],
[ 2.44646603e-01],
[ 7.26337140e-01],
[-7.91592424e-01],
[ 4.39666794e-01],
[-2.66630687e-01],
[-8.77131636e-01],
[-6.37447634e-01],
[-7.72982393e-01],
[-1.04616382e+00],
[ 1.15209837e+00],
[-5.26661400e-02],
[-9.74296354e-01],
[-6.24348505e-01],
[-1.00475074e+00],
[ 5.89973268e-01],
[ 1.50344054e+00],
[ 1.27433349e+00],
[-1.25658172e+00],
[ 1.37852445e-01],
[-1.36126475e+00],
[ 7.27518820e-01],
[ 4.50501231e-01],
[-1.17577071e-01],
[-8.49638130e-01],
[-9.51657336e-02],
[-1.81175961e-01],
[ 2.81596080e-01],
[-2.56560634e-01],
[ 8.52804745e-01],
[-4.77688980e-01],
[-2.96471868e-01],
[ 1.68108524e-03],
[-2.05727542e-01],
[ 8.12610001e-01],
[-7.06157363e-02],
[ 2.31690062e-01],
[-1.59605923e-01],
[-5.98727081e-01],
[ 1.01944512e+00],
[-7.01462226e-01],
[-1.40420099e+00],
[ 6.94997907e-01],
[ 5.18636606e-01],
[ 4.83061626e-01],
[ 6.79198052e-01],
[-1.30170017e+00],
[-2.71805220e-01],
[ 9.47603686e-01],
[-3.49630397e-01],
[ 4.85113462e-01],
[-3.04715098e-01],
[-3.31839520e-01],
[-1.38578436e+00],
[ 8.84502948e-01],
[-2.47084475e+00],
[-9.56899804e-02],
[-4.64806358e-01],
[ 7.06669625e-01],
[ 1.54312708e-01],
[ 5.45819213e-01],
[ 1.46023727e-01],
[ 9.57253276e-01],
[-6.91815248e-01],
[-1.00443516e-01],
[ 2.77924488e-01],
[-1.20207491e+00],
[-6.04953108e-02],
[-1.03273685e+00],
[ 6.88215760e-01],
[-1.21050656e+00],
[-2.40052449e-01],
[-6.06855334e-01],
[ 1.29217575e+00],
[-1.03282074e-01],
[-1.41361475e+00],
[ 7.57783205e-01],
[ 1.41360423e+00],
[ 1.99564613e+00],
[ 1.66865955e+00],
[ 1.66032125e+00],
[ 4.24742508e-01],
[-9.26445715e-01],
[ 3.28504629e-02],
[-5.17521702e-01],
[-9.24887775e-02],
[-3.05962249e-02],
[ 1.30795754e-01],
[-7.74659629e-02],
[-4.20826569e-01],
[ 6.78334448e-01],
[-6.35104074e-01],
[ 2.72075594e-01],
[-2.26801066e-01],
[-1.45908094e+00],
[ 4.03275391e-01],
[ 4.88618199e-01],
[-3.77797862e-02],
[ 2.25514691e-01],
[ 3.73320407e-01],
[ 9.96559672e-01],
[ 6.68655132e-01],
[-3.09207055e-01],
[ 1.44746288e+00],
[-1.27674147e-01],
[ 1.95898129e-02],
[-4.68331172e-01],
[-7.59794861e-01],
[ 2.11566325e+00],
[-1.28843614e+00],
[ 5.24455206e-01],
[ 2.68082969e-01],
[ 4.06271559e-02],
[-1.63087335e+00],
[ 4.50273668e-01],
[-1.41736985e+00],
[-3.20579341e-01],
[-2.16095416e+00],
[ 7.55938440e-01],
[ 1.13147728e+00],
[-4.01022769e-01],
[-1.33261395e-01],
[-1.20765775e-01],
[ 1.03185993e+00],
[-1.29878689e-01],
[-4.08011754e-01],
[ 4.17084437e-01],
[-1.00930809e-01],
[ 7.22839507e-02],
[ 6.47903117e-01],
[ 4.74689466e-01],
[ 6.85499472e-01],
[-1.49366216e+00],
[-3.49297457e-01],
[-7.79713261e-01],
[ 5.67446775e-01],
[ 5.18831382e-02],
[ 1.25350822e+00],
[-8.53016941e-01],
[-2.61547685e-01],
[-2.02667441e+00],
[ 1.20688282e+00],
[-3.53816725e-01]])
pero de modo mucho más eficiente
%timeit pca.fit(X)
144 µs ± 2.69 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
%timeit get_maxmin_projections(X)
3.1 ms ± 62.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
we can use the largest component to reduce our data from 2D to 1D¶
podemos usar el componente mayor para reducir la dimensionalidad de nuestros datos de 2D a 1D¶
observa que:

\[\mathbf{X_t} = \mathbf{X} \cdot \mathbf{V}^T\]

donde:
\(\mathbf{X}\) son nuestros datos
\(\mathbf{V}\) es el vector de componentes seleccionados
\(\mathbf{X_t}\) son los datos transformados
así que nos estamos restringiendo a transformaciones lineales (rotaciones y escalado)
# Fit a 1-component PCA and project the data onto its principal axis.
pca = PCA(n_components=1)
pca.fit(X)
Xt = pca.transform(X)[:,0]   # 1-D coordinate of each sample

# raw strings: "\m" in "$\mathbf{...}$" is an invalid escape sequence in a
# plain literal (SyntaxWarning on modern Python); the rendered text is identical
plt.scatter(X[:,0], X[:,1], color="blue", alpha=.5, label=r"$\mathbf{X}$: original data")
# the reduced data is 1-D, so draw it along the horizontal axis
plt.scatter(Xt, [0]*len(Xt), color="red", alpha=.5, label=r"$\mathbf{X_t}$: reduced data")
plt.axis("equal")
plt.legend(loc="center left", bbox_to_anchor=(1.01,.5))
<matplotlib.legend.Legend at 0x7fa905b33650>
y podemos también reconstruir los datos 2D después de la transformación
# Reconstruct 2-D points from their 1-D projection: x_r = (x . v0) * v0
v0 = pca.components_[0]   # principal direction found by PCA
c = X.dot(v0)             # scalar coordinate of each sample along v0
# vectorized outer product replaces the Python-level loop
# np.r_[[i*v0 for i in c]] — same values, computed in C
Xr = np.outer(c, v0)
plt.scatter(X[:,0], X[:,1], color="blue", alpha=.5, label="original data")
plt.scatter(Xr[:,0], Xr[:,1], color="red", alpha=.5, label="reconstructed data from largest component")
plt.legend(loc="center left", bbox_to_anchor=(1.01,.5))
<matplotlib.legend.Legend at 0x7fa907c51d90>