Principle Component Analysis (PCA)

Following post shows an overview of Principle Components Analysis (PCA) using the Chronic Kidney Disease dataset, from UCI Machine Learning Repository. PCA is a dimensionality reduction technique that converts correlated predictors into a set of uncorrelated predictors called principle components using orthogonal transformations.

Chronic Kidney Disease Data, collected from a hospital for nearly 2 months, contains features that can be used to predict chronic kidney disease incidence. Collected attributes include age, blood pressure, specific gravity, albumin, sugar, red blood cells, pus cell, pus cell clumps, bacteria, blood glucose random, blood urea, serum creatinine, sodium, potassium, hemoglobin, packed cell volume, white blood cell count, red blood cell count, hypertension, diabetes mellitus, coronary artery disease, appetite, pedal edema and anemia. Features are used to predict coronary kidney disease incidence as defined by the class predictor. For further details about feature definition and data collection, see citation.

Import packages

library(stats)
library(rpart)

Read the Chronic Kidney Disease dataset (archived) from UCI Machine Learning Repository

ckd_data <- read.csv("https://raw.githubusercontent.com/ajav17/Chronic-Kidney-Disease-UCI-Dataset/master/chronic_kidney_disease_dataset.csv")
ckd_data <- ckd_data[ ,!(colnames(ckd_data) %in% c("rbc", "pc"))]

Replace missing values with NA for later imputation

ckd_data[ckd_data == "?"] <- NA
ckd_data <- base::droplevels(ckd_data)

Convert variable types

ckd_data$age <- as.numeric(as.character(ckd_data$age))
ckd_data$bp <- as.numeric(as.character(ckd_data$bp))
ckd_data$sg <- as.numeric(as.character(ckd_data$sg))
ckd_data$al <- as.numeric(as.character(ckd_data$al))
ckd_data$su <- as.numeric(as.character(ckd_data$su))
ckd_data$bgr <- as.numeric(as.character(ckd_data$bgr))
ckd_data$bu <- as.numeric(as.character(ckd_data$bu))
ckd_data$sc <- as.numeric(as.character(ckd_data$sc))
ckd_data$sod <- as.numeric(as.character(ckd_data$sod))
ckd_data$pot <- as.numeric(as.character(ckd_data$pot))
ckd_data$hemo <- as.numeric(as.character(ckd_data$hemo))
ckd_data$pcv <- as.numeric(as.character(ckd_data$pcv))
ckd_data$wbcc <- as.numeric(as.character(ckd_data$wbcc))
ckd_data$rbcc <- as.numeric(as.character(ckd_data$rbcc))
ckd_data$pcc <- as.factor(ckd_data$pcc)
ckd_data$ba <- as.factor(ckd_data$ba)
ckd_data$htn <- as.factor(ckd_data$htn)
ckd_data$dm <- as.factor(ckd_data$dm)
ckd_data$cad <- as.factor(ckd_data$cad)
ckd_data$appet <- as.factor(ckd_data$appet)
ckd_data$pe <- as.factor(ckd_data$pe)
ckd_data$ane <- as.factor(ckd_data$ane)
ckd_data$class <- as.factor(ckd_data$class)

Impute missing values with mean

ckd_copy <- ckd_data
imputeMean <- function(x)
    {
        if (class(x) == "numeric")
        {
            replace(x, is.na(x), mean(x, na.rm = TRUE))
        }
    }

ckd_data <- lapply(ckd_data, imputeMean)
ckd <- data.frame(t(do.call(rbind, ckd_data)))

ckd <- cbind(ckd, ckd_copy$pcc, ckd_copy$ba, ckd_copy$htn,
           ckd_copy$dm, ckd_copy$cad, ckd_copy$appet, ckd_copy$pe, ckd_copy$ane, ckd_copy$class)
colnames(ckd) <- c("age", "bp", "sg", "al", "su", "bgr", "bu", "sc", "sod", "pot", "hemo", "pcv", "wbcc", "rbcc",
                "pcc", "ba", "htn", "dm", "cad", "appet", "pe", "ane", "class")

Omit any remaining values

ckd_data <- stats::na.omit(ckd)
nrow(ckd_data)

## [1] 333

PCA

Selecting numeric predictors for PCA

numeric <- sapply(ckd_data, is.numeric)
ckd_data_num <- ckd_data[, numeric]

Partitioning train and test sets

n = nrow(ckd_data)
index = base::sample(1:n, size = round(0.75*n), replace = FALSE)
train2 = ckd_data[index ,]
test2 = ckd_data[-index ,]

num_index_test <- sapply(test2, is.numeric)
num_index_train <- sapply(train2, is.numeric)
train2_num <- train2[, num_index_train]
test2_num <- test2[, num_index_test]

Applying principle components

principle_comp <- prcomp(train2_num, center = TRUE, scale. = T)
dim(principle_comp$x)

## [1] 250  14

print(principle_comp)

## Standard deviations (1, .., p=14):
##  [1] 2.1356539 1.3095935 1.1427020 1.0895520 1.0118050 0.9217633 0.9126997
##  [8] 0.7884078 0.7144548 0.6482579 0.5699756 0.5500808 0.4665634 0.3566861
## 
## Rotation (n x k) = (14 x 14):
##              PC1         PC2         PC3         PC4          PC5
## age   0.18568534 -0.17753082 -0.01522342  0.07163808 -0.259122149
## bp    0.14224600 -0.06096325  0.01023196 -0.21816811 -0.753714791
## sg   -0.28408381  0.19397838 -0.08372604 -0.09834507 -0.079331169
## al    0.32074464 -0.09668204  0.11527488 -0.02914664  0.304899748
## su    0.18225833 -0.47366373 -0.45419059  0.02005034 -0.030996731
## bgr   0.20315545 -0.48387235 -0.31621651  0.21516216 -0.003578622
## bu    0.32176736  0.27937508 -0.15752930 -0.22991428  0.190878611
## sc    0.24473474  0.40660766 -0.35624425  0.27295528  0.041297545
## sod  -0.22178142 -0.32830865  0.18627870 -0.58252764  0.070832354
## pot   0.12879353  0.07563276 -0.42480054 -0.61217479  0.241389043
## hemo -0.39747992 -0.04151640 -0.28043678  0.08301244  0.027836462
## pcv  -0.39742088 -0.03274801 -0.26889647  0.09283823  0.067927108
## wbcc  0.05639762 -0.30642626  0.32113766  0.16222212  0.389218295
## rbcc -0.37268972 -0.07563917 -0.22830468  0.08224969  0.078949133
##              PC6          PC7          PC8         PC9         PC10
## age   0.38107638 -0.836458433  0.034051016 -0.10113476  0.030437335
## bp    0.28768385  0.416439791 -0.070003826 -0.26565074  0.044451928
## sg    0.28809467  0.049686527  0.823316652  0.28234625  0.105209625
## al   -0.05405720  0.083316805  0.377341403 -0.70660250  0.312758271
## su    0.01215736  0.167991350  0.107505491  0.11614701 -0.219389115
## bgr  -0.23253742  0.033653576  0.186836060  0.16397884  0.059675977
## bu    0.07736250 -0.043518758  0.160869073 -0.14260331 -0.606381283
## sc    0.26438650  0.091288016 -0.128929647 -0.02475725 -0.157213404
## sod  -0.02494634 -0.085140048  0.005944041 -0.06364227 -0.380939539
## pot   0.16013606 -0.037867603 -0.236084281  0.16044111  0.474656659
## hemo  0.12030748 -0.048457282 -0.086920409 -0.22563429  0.118676716
## pcv   0.07908303 -0.005326901 -0.060800312 -0.31590475  0.023707811
## wbcc  0.70375227  0.256511416 -0.123270476  0.15166872 -0.008497769
## rbcc  0.11717638  0.032654766 -0.050176955 -0.27452758 -0.242840195
##              PC11        PC12        PC13          PC14
## age   0.002140315  0.05017720 -0.01258517  0.0402359275
## bp   -0.140660707 -0.07834737 -0.02658567  0.0002937661
## sg   -0.000282000  0.02889846  0.05813752  0.0192951310
## al    0.089281175  0.10591501  0.10242703 -0.0404737905
## su    0.579438952  0.23498919 -0.20477001  0.0180125332
## bgr  -0.562197308 -0.29468777  0.23353223 -0.0169339759
## bu   -0.201342020 -0.31309454 -0.37423449 -0.0624076547
## sc    0.115437689  0.05904818  0.65927189 -0.0041900558
## sod   0.083534380 -0.06632398  0.54310495 -0.0236515783
## pot  -0.133772009  0.08409883 -0.05540173  0.0481587674
## hemo  0.145279970 -0.37742270 -0.04932235 -0.7061277301
## pcv   0.088802381 -0.38335927 -0.04611510  0.6986644563
## wbcc -0.079468220 -0.11017304 -0.06431116  0.0077562753
## rbcc -0.454289249  0.65014871 -0.08588973 -0.0464742259

summary(principle_comp)

## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6
## Standard deviation     2.1357 1.3096 1.14270 1.08955 1.01180 0.92176
## Proportion of Variance 0.3258 0.1225 0.09327 0.08479 0.07312 0.06069
## Cumulative Proportion  0.3258 0.4483 0.54156 0.62635 0.69948 0.76017
##                           PC7    PC8     PC9    PC10    PC11    PC12
## Standard deviation     0.9127 0.7884 0.71445 0.64826 0.56998 0.55008
## Proportion of Variance 0.0595 0.0444 0.03646 0.03002 0.02321 0.02161
## Cumulative Proportion  0.8197 0.8641 0.90053 0.93055 0.95375 0.97536
##                           PC13    PC14
## Standard deviation     0.46656 0.35669
## Proportion of Variance 0.01555 0.00909
## Cumulative Proportion  0.99091 1.00000

Biplot

biplot(principle_comp, scale = 0)

Scree plot

plot(principle_comp, type = "l")

Training data with appropriate components

train_prin <- data.frame(train2$class, principle_comp$x)
train_prin <- train_prin[, 1:10] # select 9 components

Model decision tree

tree_prin <- rpart(train2.class ~ ., data = train_prin)

Construct test data from PCA

# Converting test to PCA
test_prin <- as.data.frame(predict(principle_comp, test2_num))

Predict decision tree on test data

# Select 9 components and predict on test set
test_prin <- test_prin[, 1:9]
tree_predict <- predict(tree_prin, test_prin, type = "class")

Calculating accuracy

tab <- base::table(tree_predict, test2$class)
round(sum(diag(tab))/sum(tab), 3)

## [1] 0.952

Import packages

Read the Chronic Kidney Disease dataset (archived) from UCI Machine Learning Repository

Replace missing values with NA for later imputation

Convert variable types

Impute missing values with mean

Omit any remaining values

PCA

Selecting numeric predictors for PCA

Partitioning train and test sets

Applying principle components

Biplot

Scree plot

Training data with appropriate components

Model decision tree

Construct test data from PCA

Predict decision tree on test data

Calculating accuracy

Part 8 of the Machine Learning series

Previous articles

Next articles