# Generate data ----------------------------------------------------------------
# Fixed parameters
n   <- 1e3 # sample size
p   <- 4   # final number of items
K   <- 2   # target number of components
rho <- .8  # Fixed correlation between observed items

# Define correlation matrix with blocks
Sigma_K <- lapply(1:K, function (x){
  Sigma <- matrix(rho,
                  nrow = p/K,
                  ncol = p/K)
  diag(Sigma) <- 1
  Sigma
})
Sigma <- Matrix::bdiag(Sigma_K)

# Define vector of observed item means
mu <- rep(5, p)

# Sample data from multivariate normal distribution
set.seed(1235)
X <- MASS::mvrnorm(n, mu, Sigma)

# Scale it for future use in PCA
X <- scale(X)
Introduction
Principal Component Analysis (PCA) is a technique that finds a low-dimensional representation of a large set of variables contained in an \(n \times p\) data matrix \(\mathbf{X}\) with minimal loss of information. We refer to this low-dimensional representation as the \(n \times r\) matrix \(\mathbf{T}\), where \(r < p\).
The columns of \(\mathbf{T}\) are called principal components (PCs) of \(\mathbf{X}\). We follow the common practice of assuming that the columns of \(\mathbf{X}\) are mean-centered and scaled to have a variance of 1. The first PC of \(\mathbf{X}\) is the linear combination of the columns of \(\mathbf{X}\) with the largest variance: \[ \mathbf{t}_1 = \lambda_{11} \mathbf{x}_1 + \lambda_{12} \mathbf{x}_2 + \dots + \lambda_{1p} \mathbf{x}_p = \mathbf{X} \mathbf{\lambda}_1 \] with \(\mathbf{\lambda}_1\) being the \(p \times 1\) vector of coefficients \(\lambda_{11}, \dots, \lambda_{1p}\). The second principal component (\(\mathbf{t}_2\)) is defined by finding the vector of coefficients \(\mathbf{\lambda}_2\) giving the linear combination of \(\mathbf{x}_1, \dots, \mathbf{x}_p\) with maximal variance out of all the linear combinations that are uncorrelated with \(\mathbf{t}_1\). Every subsequent column of \(\mathbf{T}\) can be understood in the same way. As a result, the PCs are uncorrelated by construction, and every subsequent PC has less variance than the preceding one. We can write the relationship between all the PCs and \(\mathbf{X}\) in matrix notation: \[\begin{equation} \label{eq:PCAmatnot} \mathbf{T} = \mathbf{X} \mathbf{\Lambda} \end{equation}\] where \(\mathbf{\Lambda}\) is a \(p \times r\) matrix of weights, with columns \(\mathbf{\lambda}_1, \dots, \mathbf{\lambda}_r\). PCA can be thought of as the process of projecting the original data from a \(p\)-dimensional space to a lower \(r\)-dimensional space. The coefficient vectors \(\mathbf{\lambda}_1, \dots, \mathbf{\lambda}_r\) define the directions in which we project the \(n\) observations of \(\mathbf{x}_1, \dots, \mathbf{x}_p\), and the projected values are the principal component scores \(\mathbf{T}\).
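These two defining properties, uncorrelated scores with decreasing variance, are easy to verify numerically. The following is a minimal check that assumes the scaled data matrix X generated in the code chunk at the top of this page; it uses R's built-in prcomp() (introduced more formally below) purely as a convenience, and the object name T_all is only used for this illustration.
# Verify the two defining properties of the PCs ---------------------------------
T_all <- prcomp(X)$x # scores on all p principal components

# The PCs are uncorrelated: off-diagonal correlations are numerically zero
round(cor(T_all), 3)

# Each PC has less variance than the preceding one
round(apply(T_all, 2, var), 3)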
The goal of PCA is to find the values of \(\mathbf{\Lambda}\) that maximize the variance of the columns of \(\mathbf{T}\). One way to find the PCA solution for \(\mathbf{\Lambda}\) is by taking the singular value decomposition (SVD) of \(\mathbf{X}\):
\[\begin{equation} \label{eq:SVD} \mathbf{X} = \mathbf{U}\mathbf{D}\mathbf{V}^T \end{equation}\]
where:
- \(\mathbf{D}\) is the \(p \times p\) diagonal matrix with elements equal to the square roots of the non-zero eigenvalues of \(\mathbf{XX}^T\) and \(\mathbf{X}^T\mathbf{X}\);
- \(\mathbf{U}\) is an \(n \times p\) matrix with orthonormal columns, such that \(\mathbf{U}^T\mathbf{U=I}\);
- \(\mathbf{V}\) is the \(p \times p\) orthogonal matrix of eigenvectors of \(\mathbf{X}^T\mathbf{X}\) such that \(\mathbf{V}^T\mathbf{V=I}\);
The scores on the first \(r\) PCs are given by the \(n \times r\) matrix \(\mathbf{U}_{r}\mathbf{D}_{r}\), where \(\mathbf{U}_{r}\) and \(\mathbf{D}_{r}\) stand for the reduced forms of \(\mathbf{U}\) and \(\mathbf{D}\) obtained by taking their first \(r\) columns and diagonal values. The weights \(\mathbf{\Lambda}\) are given by the \(p \times r\) matrix \(\mathbf{V}_{r}\), so that: \[ \mathbf{T} = \mathbf{U}_{r}\mathbf{D}_{r} = \mathbf{X}\mathbf{V}_{r} \]
Another way to achieve the same solution relies on the eigenvalue decomposition (EVD) of the cross-product matrix (or of the correlation matrix). In particular, the EVD of \(\mathbf{X}^{T} \mathbf{X}\) is \(\mathbf{V} \mathbf{D}^{2} \mathbf{V}^{T}\), where \(\mathbf{V}\) is the same matrix of eigenvectors defined above and \(\mathbf{D}^{2}\) contains the corresponding eigenvalues (the squared singular values).
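As a quick numerical check of this identity (assuming the scaled data matrix X generated in the chunk at the top of this page; the object names sv and ev are only used for this illustration), the squared singular values should match the eigenvalues of \(\mathbf{X}^T\mathbf{X}\), and \(\mathbf{V}\mathbf{D}^2\mathbf{V}^T\) should reconstruct the cross-product matrix:
# Check that the SVD and the EVD of XtX carry the same information --------------
sv <- svd(X)
ev <- eigen(t(X) %*% X)

# Squared singular values = eigenvalues of XtX
all.equal(sv$d^2, ev$values)

# V D^2 V^T reconstructs XtX
all.equal(t(X) %*% X, ev$vectors %*% diag(ev$values) %*% t(ev$vectors),
          check.attributes = FALSE)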
Learn by coding
Let us start by sampling 1000 observations (n) from a multivariate normal distribution. In this example, we use a block structure in the correlation matrix to generate a collection of 4 items (p) that can be summarized well by two principal components (K). The data-generation code at the top of this page implements this setup.
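To see that the simulated items follow the intended block structure, we can inspect their empirical correlation matrix (a quick check, assuming the data-generation chunk at the top of this page has been run): items 1-2 and 3-4 should correlate at about .8 within their block and at about 0 across blocks.
# Inspect the empirical correlation matrix of the generated items ---------------
round(cor(X), 2)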
PCA as SVD
As we discussed, PCA can be obtained from the SVD of \(X\).
# PCA as singular value decomposition of X -------------------------------------
# Perform SVD
uSVt <- svd(X)

# Extract objects
U     <- uSVt$u
Sigma <- diag(uSVt$d)
V     <- uSVt$v

# Compute the PC scores
T_svd <- U %*% Sigma

# Compute the PC scores (equivalent way)
T_svd <- X %*% V

# Define eigenvalues
eigenv_svd <- uSVt$d^2

# Compute cumulative proportion of explained variance
CPVE_svd <- cumsum(prop.table(eigenv_svd))
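In the chunk above, the scores are computed in two equivalent ways. As a quick sanity check (not part of the original chunk), the two versions should agree up to floating-point error:
# Sanity check: both ways of computing the PC scores give the same matrix -------
all.equal(U %*% Sigma, X %*% V, check.attributes = FALSE)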
PCA as eigenvalue decomposition
PCA can also be obtained from the eigenvalue decomposition of the cross-product matrix of \(X\).
# PCA as eigenvalue decomposition of XtX ----------------------------------------
# Compute the cross-product matrix XtX
XtX <- t(X) %*% X

# Perform eigenvalue decomposition
eigenmat_XtX <- eigen(XtX)

# Extract eigenvalues
eigenvalues_XtX <- eigenmat_XtX$values

# Extract component loadings
eigenvectors_XtX <- eigenmat_XtX$vectors

# Compute the PC scores
T_eig_XtX <- X %*% eigenvectors_XtX

# Compute cumulative proportion of explained variance
CPVE_eig_XtX <- cumsum(prop.table(eigenvalues_XtX))
It can also be computed from the eigenvalue decomposition of the correlation matrix of \(X\). Because the columns of \(X\) are standardized, the correlation matrix is just \(\mathbf{X}^T\mathbf{X}\) rescaled by \(n - 1\), so both decompositions yield the same eigenvectors and the same proportions of explained variance.
# PCA as eigenvalue decomposition of cor(X) ------------------------------------
# Compute the correlation matrix
X_cormat <- cor(X, method = "pearson")

# Perform eigenvalue decomposition
eigenmaT_eig_corx <- eigen(X_cormat)

# Extract eigenvalues
eigenvalues_corx <- eigenmaT_eig_corx$values

# Extract component loadings
eigenvectors_corx <- eigenmaT_eig_corx$vectors

# Compute the PC scores
T_eig_corx <- X %*% eigenvectors_corx

# Compute cumulative proportion of explained variance
CPVE_corx <- cumsum(prop.table(eigenvalues_corx))
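A quick numerical check of this relationship (a small addition, assuming the objects created in the chunks above):
# Link between the EVD of cor(X) and the EVD of XtX -----------------------------
all.equal(cor(X), t(X) %*% X / (n - 1), check.attributes = FALSE)

# Eigenvalues differ only by the factor n - 1 ...
all.equal(eigenvalues_corx * (n - 1), eigenvalues_XtX)

# ... which cancels out in the cumulative proportions of explained variance
all.equal(CPVE_corx, CPVE_eig_XtX)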
Compare
To check our results, we can compare the PC scores, CPVEs, and loading matrices with the results of prcomp, the standard R function to compute PCA. Let's compute the PCs and all the relevant quantities with prcomp:
# PCA with prcomp --------------------------------------------------------------
# Compute PCA with prcomp
PCX <- prcomp(X)

# Extract component scores
T_prcomp <- PCX$x

# Extract eigenvalues
eigenvalues_prcomp <- PCX$sdev^2

# Extract component loadings
V_prcomp <- as.matrix(PCX$rotation)

# Extract cumulative explained variance
CPVE_PCX <- cumsum(prop.table(eigenvalues_prcomp))
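Note that prcomp reports sdev, the standard deviations of the PC scores, so eigenvalues_prcomp lives on the same \(n - 1\) scale as the eigenvalues of cor(X), not on the scale of \(\mathbf{X}^T\mathbf{X}\). A quick check (not part of the original chunk):
# prcomp eigenvalues match those of cor(X); rescaling recovers those of XtX -----
all.equal(eigenvalues_prcomp, eigenvalues_corx)
all.equal(eigenvalues_prcomp * (n - 1), eigenv_svd)

# The scaling factor cancels in the cumulative proportions of explained variance
all.equal(CPVE_PCX, CPVE_svd)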
To compare the PC score and loading matrices, we will use the Tucker congruence coefficient. This is a measure of similarity between matrices that ranges between -1 and +1; a congruence of 1 means the two matrices are identical. To compare the vectors of the cumulative proportion of explained variance, we will simply print the vectors next to each other. As the output below shows, all of the approaches lead to the same PCA solution.
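To make the coefficient concrete, here is a minimal sketch of what it computes for a single pair of columns; the helper function congruence() is only an illustration, not the RegularizedSCA implementation. Since the sign of a PC is arbitrary, we look at the absolute value.
# Minimal sketch of the congruence coefficient for one pair of columns ----------
congruence <- function(x, y) sum(x * y) / sqrt(sum(x^2) * sum(y^2))

# Congruence between the first PC scores from prcomp and from the SVD
abs(congruence(T_prcomp[, 1], T_svd[, 1]))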
PC scores
# Compare solutions ------------------------------------------------------------
# Load package for tucker congruence
library(RegularizedSCA)
# > PC scores ------------------------------------------------------------------
# prcomp results = SVD
TuckerCoef(T_prcomp, T_svd)$tucker_value
[1] 1
# prcomp results = eigenvalues_XtX
TuckerCoef(T_prcomp, T_eig_XtX)$tucker_value
[1] 1
# prcomp results = eigenvectors_corx
TuckerCoef(T_prcomp, T_eig_corx)$tucker_value
[1] 1
Component loadings
# > Component loadings ---------------------------------------------------------
# prcomp results = SVD
TuckerCoef(V_prcomp, V)$tucker_value
[1] 1
# prcomp results = eigenvalues_XtX
TuckerCoef(V_prcomp, eigenvectors_XtX)$tucker_value
[1] 1
# prcomp results = eigenvectors_corx
TuckerCoef(V_prcomp, eigenvectors_corx)$tucker_value
[1] 1
Cumulative proportion of explained variance
# Cumulative proportion of explained variance
data.frame(
SVD = CPVE_svd,
eig_XtX = CPVE_eig_XtX,
eig_corx = CPVE_corx,
prcomp = CPVE_PCX
)
SVD eig_XtX eig_corx prcomp
1 0.4680609 0.4680609 0.4680609 0.4680609
2 0.8992696 0.8992696 0.8992696 0.8992696
3 0.9505615 0.9505615 0.9505615 0.9505615
4 1.0000000 1.0000000 1.0000000 1.0000000
TL;DR, just give me the code!
# Generate data ----------------------------------------------------------------
# Fixed parameters
n   <- 1e3 # sample size
p   <- 4   # final number of items
K   <- 2   # target number of components
rho <- .8  # Fixed correlation between observed items

# Define correlation matrix with blocks
Sigma_K <- lapply(1:K, function (x){
  Sigma <- matrix(rho,
                  nrow = p/K,
                  ncol = p/K)
  diag(Sigma) <- 1
  Sigma
})
Sigma <- Matrix::bdiag(Sigma_K)

# Define vector of observed item means
mu <- rep(5, p)

# Sample data from multivariate normal distribution
set.seed(1235)
X <- MASS::mvrnorm(n, mu, Sigma)

# Scale it for future use in PCA
X <- scale(X)

# PCA as singular value decomposition of X -------------------------------------
# Perform SVD
uSVt <- svd(X)

# Extract objects
U     <- uSVt$u
Sigma <- diag(uSVt$d)
V     <- uSVt$v

# Compute the PC scores
T_svd <- U %*% Sigma

# Compute the PC scores (equivalent way)
T_svd <- X %*% V

# Define eigenvalues
eigenv_svd <- uSVt$d^2

# Compute cumulative proportion of explained variance
CPVE_svd <- cumsum(prop.table(eigenv_svd))

# PCA as eigenvalue decomposition of XtX ----------------------------------------
# Compute the cross-product matrix XtX
XtX <- t(X) %*% X

# Perform eigenvalue decomposition
eigenmat_XtX <- eigen(XtX)

# Extract eigenvalues
eigenvalues_XtX <- eigenmat_XtX$values

# Extract component loadings
eigenvectors_XtX <- eigenmat_XtX$vectors

# Compute the PC scores
T_eig_XtX <- X %*% eigenvectors_XtX

# Compute cumulative proportion of explained variance
CPVE_eig_XtX <- cumsum(prop.table(eigenvalues_XtX))

# PCA as eigenvalue decomposition of cor(X) ------------------------------------
# Compute the correlation matrix
X_cormat <- cor(X, method = "pearson")

# Perform eigenvalue decomposition
eigenmaT_eig_corx <- eigen(X_cormat)

# Extract eigenvalues
eigenvalues_corx <- eigenmaT_eig_corx$values

# Extract component loadings
eigenvectors_corx <- eigenmaT_eig_corx$vectors

# Compute the PC scores
T_eig_corx <- X %*% eigenvectors_corx

# Compute cumulative proportion of explained variance
CPVE_corx <- cumsum(prop.table(eigenvalues_corx))

# PCA with prcomp --------------------------------------------------------------
# Compute PCA with prcomp
PCX <- prcomp(X)

# Extract component scores
T_prcomp <- PCX$x

# Extract eigenvalues
eigenvalues_prcomp <- PCX$sdev^2

# Extract component loadings
V_prcomp <- as.matrix(PCX$rotation)

# Extract cumulative explained variance
CPVE_PCX <- cumsum(prop.table(eigenvalues_prcomp))
# Compare solutions ------------------------------------------------------------
# Load package for tucker congruence
library(RegularizedSCA)
# > PC scores ------------------------------------------------------------------
# prcomp results = SVD
TuckerCoef(T_prcomp, T_svd)$tucker_value
# prcomp results = eigenvalues_XtX
TuckerCoef(T_prcomp, T_eig_XtX)$tucker_value
# prcomp results = eigenvectors_corx
TuckerCoef(T_prcomp, T_eig_corx)$tucker_value
# > Component loadings ---------------------------------------------------------
# prcomp results = SVD
TuckerCoef(V_prcomp, V)$tucker_value
# prcomp results = eigenvalues_XtX
TuckerCoef(V_prcomp, eigenvectors_XtX)$tucker_value
# prcomp results = eigenvectors_corx
TuckerCoef(V_prcomp, eigenvectors_corx)$tucker_value
# Cumulative proportion of explained variance
data.frame(
SVD = CPVE_svd,
eig_XtX = CPVE_eig_XtX,
eig_corx = CPVE_corx,
prcomp = CPVE_PCX
)