In [2]:
# This is a sketch of the SVD material presented in class on Sept 18, translated into python.
# This sketch is limited to the use of the SVD in dimensionality reduction.
# We start with some toy data for 4 real persons using 3 attributes per person.
# We use the SVD to discover that each real person can be represented as linear combinations of 2 "canonical persons".

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd  # Panda is used just to make the matrix output look nice.
from pdb import set_trace,pm
import scipy as sc
import scipy.linalg as lg
import scipy.sparse as sp
In [18]:
# Toy data: one row per real person, one column per attribute.
A = np.array([
    [ 1, 2,  3],
    [ 4, 2,  6],
    [ 7, 8,  9],
    [10, 8, 12],
])
cols = ['age', 'wgt', 'hgt']              # The attributes
rows = ['tom', 'dick', 'harry', 'janet']  # The Real Persons

def LLL(A, rows=rows, cols=cols):
    """Wrap a matrix in a labeled DataFrame so printouts are annotated."""
    return pd.DataFrame(A, index=rows, columns=cols)

print(LLL(A))
       age  wgt  hgt
tom      1    2    3
dick     4    2    6
harry    7    8    9
janet   10    8   12
In [21]:
# Center the data: subtract each attribute's mean from every person's row.
col_means = np.mean(A, axis=0)
print('means =', col_means)
# The outer product of a ones-vector with the means replicates the mean row
# once per person, producing a matrix we can subtract elementwise.
print('outer product =\n', LLL(np.outer([1, 1, 1, 1], col_means)))
AA = A - np.outer([1, 1, 1, 1], col_means)
print('centered data =\n', LLL(AA))
means = [5.5 5.  7.5]
outer product =
        age  wgt  hgt
tom    5.5  5.0  7.5
dick   5.5  5.0  7.5
harry  5.5  5.0  7.5
janet  5.5  5.0  7.5
centered data =
        age  wgt  hgt
tom   -4.5 -3.0 -4.5
dick  -1.5 -3.0 -1.5
harry  1.5  3.0  1.5
janet  4.5  3.0  4.5
In [23]:
# Scatter matrix = AA^T @ AA: accumulates squared deviations over the persons.
ScatterMatrix = AA.T @ AA  # '@' denotes matrix-matrix product.  ".T" denotes transpose.
print('Scatter Matrix = \n',LLL(ScatterMatrix,rows=cols))
# Sample covariance divides the scatter matrix by (number of samples - 1),
# i.e. len(AA)-1 = 3 here.  (Dividing by len(ScatterMatrix) -- the number of
# attributes -- was only coincidentally equal to n-1 for this 4x3 data set.)
CovarianceMatrix = ScatterMatrix/(len(AA)-1)
print('Covariance Matrix = \n',LLL(CovarianceMatrix,rows=cols))
Scatter Matrix = 
       age   wgt   hgt
age  45.0  36.0  45.0
wgt  36.0  36.0  36.0
hgt  45.0  36.0  45.0
Covariance Matrix = 
       age   wgt   hgt
age  15.0  12.0  15.0
wgt  12.0  12.0  12.0
hgt  15.0  12.0  15.0
In [33]:
# Singular Value Decomposition of Centered Data yields representation of each person as a linear combination of
# 2 canonical persons (usually called principal components):
# lg.svd returns U (4x4, per-person mixing coefficients), the singular values
# sigma (length 3, in descending order), and VT = V-transpose (3x3), so that
# AA = U @ diag(sigma) @ VT (with diag(sigma) padded to 4x3).
U,sigma,VT = lg.svd(AA)
print(' U sigma V.T =')
print(U)
# The third singular value is ~1e-17, i.e. numerically zero: AA has rank 2.
print('sigma = ',sigma)
# Each row of VT is a unit-length principal direction in attribute space.
print(LLL(VT,rows=['PC1','PC2','PC3']))
 U sigma V.T =
[[-0.63731763  0.30631069  0.59313071  0.38496229]
 [-0.30631069 -0.63731763 -0.38496229  0.59313071]
 [ 0.30631069  0.63731763 -0.38496229  0.59313071]
 [ 0.63731763 -0.30631069  0.59313071  0.38496229]]
sigma =  [1.09830833e+01 2.31773205e+00 4.72717559e-17]
          age           wgt       hgt
PC1  0.605913  5.154991e-01  0.605913
PC2 -0.364513  8.568901e-01 -0.364513
PC3 -0.707107  1.000653e-16  0.707107
In [38]:
# Because there are only two nonzero sigma entries, the centered data has
# rank 2: keep just the first 2 columns of U and the first 2 rows of VT.
NUM_PCS = 2
U1 = U[:, :NUM_PCS]       # 4x2: left singular vectors for the retained PCs
sigma1 = sigma[:NUM_PCS]  # the two nonzero singular values
VT1 = VT[:NUM_PCS, :]     # 2x3: the two principal components
PCS = VT1                 # alias used by the cells below
In [39]:
## Each person can be represented by two coefficients in terms of the canonical persons:
# Projecting the centered data onto the two principal directions gives the
# SCORES matrix (4x2): row i holds the coordinates of real person i in PC space.
SCORES = AA @ VT1.T
print('SCORES = ')
print(LLL(SCORES,cols=['PC1','PC2']))
print('PCS = \n',LLL(PCS,rows=['PC1','PC2']))
SCORES = 
            PC1       PC2
tom   -6.999713  0.709946
dick  -3.364236 -1.477131
harry  3.364236  1.477131
janet  6.999713 -0.709946
PCS = 
           age       wgt       hgt
PC1  0.605913  0.515499  0.605913
PC2 -0.364513  0.856890 -0.364513
In [40]:
# The rows of PCS are the Principal Components (canonical persons).
# The rows of SCORES are the amounts of each canonical person present in the corresponding real person.
# To verify that the real persons can be completely characterized by the canonical persons (PCs),
# we reconstruct the centered data.  SCORES @ PCS reproduces AA (up to floating-point
# round-off) because the discarded third singular value was numerically zero.
print('reconstructed centered data =')
print(LLL(SCORES @ PCS))
reconstructed centered data =
       age  wgt  hgt
tom   -4.5 -3.0 -4.5
dick  -1.5 -3.0 -1.5
harry  1.5  3.0  1.5
janet  4.5  3.0  4.5
In [56]:
# The matrix VT can be obtained directly from the covariance matrix (at the cost of some accuracy):
# The principal directions are eigenvectors of the covariance matrix.
# lg.eigh returns eigenvalues in ascending order; negating the input matrix
# (and then negating the eigenvalues back) yields them in descending order,
# matching the SVD's ordering of the singular values.
# ('lamda' avoids colliding with the Python keyword 'lambda'.)

(lamda,V ) = lg.eigh(-CovarianceMatrix)
lamda=-lamda
print('eigenvalues ',lamda,'\neigenvectors =\n',V)
# NOTE(review): eigenvectors are determined only up to sign, so columns of V
# may differ from the columns of VT.T by a factor of -1 (visible in the
# printed comparison below).
print('Compare eigenvectors to transpose of VT = Principal Components: \n',VT.T)
eigenvalues  [4.02093727e+01 1.79062729e+00 2.92448971e-16] 
eigenvectors =
 [[ 6.05912800e-01  3.64512933e-01  7.07106781e-01]
 [ 5.15499134e-01 -8.56890100e-01  1.11022302e-16]
 [ 6.05912800e-01  3.64512933e-01 -7.07106781e-01]]
Compare eigenvectors to transpose of VT = Principal Components: 
 [[ 6.05912800e-01 -3.64512933e-01 -7.07106781e-01]
 [ 5.15499134e-01  8.56890100e-01  1.00065300e-16]
 [ 6.05912800e-01 -3.64512933e-01  7.07106781e-01]]
In [ ]: