Learning PCA, tSNE and UMAP from simple example

PCA, tSNE, and UMAP are the most commonly used l dimensionality reduction techniques visualizing high-dimensional datasets.
r
pca
tsne
umap
unsupervised
dimensional reduction
Author
Published

Saturday, May 27, 2023

Modified

Friday, April 12, 2024

### Load packages
pacman::p_load(
    tidyverse,  # tidy data
    FactoMineR, # compute principal component methods
    factoextra,  # extract, visualize and interpretate the results
    corrplot,  # visualize cos2 of variables
    broom,
    ggforce,
    palmerpenguins,
    umap,
    Rtsne
)

Library and data

### load packages
rm(list = ls())
### demo data
penguins <- penguins |> 
  drop_na() |> 
  select(-year)
head(penguins)
## # A tibble: 6 × 7
##   species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##   <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
## 1 Adelie  Torgersen           39.1          18.7               181        3750
## 2 Adelie  Torgersen           39.5          17.4               186        3800
## 3 Adelie  Torgersen           40.3          18                 195        3250
## 4 Adelie  Torgersen           36.7          19.3               193        3450
## 5 Adelie  Torgersen           39.3          20.6               190        3650
## 6 Adelie  Torgersen           38.9          17.8               181        3625
## # ℹ 1 more variable: sex <fct>

PCA

### Perform PCA using prcomp()
pca_fit <- penguins |> 
  select(where(is.numeric)) |> 
  scale() |> 
  prcomp()

### importance of components
pca_fit
## Standard deviations (1, .., p=4):
## [1] 1.6569115 0.8821095 0.6071594 0.3284579
## 
## Rotation (n x k) = (4 x 4):
##                          PC1         PC2        PC3        PC4
## bill_length_mm     0.4537532 -0.60019490 -0.6424951  0.1451695
## bill_depth_mm     -0.3990472 -0.79616951  0.4258004 -0.1599044
## flipper_length_mm  0.5768250 -0.00578817  0.2360952 -0.7819837
## body_mass_g        0.5496747 -0.07646366  0.5917374  0.5846861
summary(pca_fit)
## Importance of components:
##                           PC1    PC2     PC3     PC4
## Standard deviation     1.6569 0.8821 0.60716 0.32846
## Proportion of Variance 0.6863 0.1945 0.09216 0.02697
## Cumulative Proportion  0.6863 0.8809 0.97303 1.00000

### visualize the PC1 and PC2
pca_fit |> 
  augment(penguins) |> 
  rename_at(
    vars(starts_with(".fitted")), 
    list(~str_replace(., ".fitted", ""))
    ) |> 
    ggplot(aes(x = PC1, y = PC2, color = species, shape = sex)) +
    geom_point()

UMAP

rm(list = ls())
### theme set
theme_set(theme_bw(18))
### subset data
penguins <- penguins |> 
  drop_na() |> 
  select(-year) |> 
  mutate(ID = row_number())

### meta data
penguins_meta <- penguins |> 
  select(ID, species, island, sex)
### umap analysis
set.seed(123)
umap_fit <- penguins |>
  select(where(is.numeric)) |> 
  column_to_rownames("ID") |> 
  scale() |> 
  umap()
### retrieve data from umap analysis
umap_df <- umap_fit$layout |> 
  as.data.frame() |> 
  rename(UMAP1 = "V1", UMAP2 = "V2") |> 
  mutate(ID = row_number()) |> 
  inner_join(penguins_meta, by = "ID") |> 
  as_tibble()  
head(umap_df)
## # A tibble: 6 × 6
##   UMAP1 UMAP2    ID species island    sex   
##   <dbl> <dbl> <int> <fct>   <fct>     <fct> 
## 1 -7.00 -1.14     1 Adelie  Torgersen male  
## 2 -5.79 -1.28     2 Adelie  Torgersen female
## 3 -5.39 -1.87     3 Adelie  Torgersen female
## 4 -7.55 -2.53     4 Adelie  Torgersen female
## 5 -8.77 -2.36     5 Adelie  Torgersen male  
## 6 -6.22 -1.30     6 Adelie  Torgersen female
### visualize umap
umap_df |> 
  ggplot(aes(x = UMAP1, y = UMAP2, color = species, shape = sex)) +
  geom_point() +
  labs(x = "UMAP1", y = "UMAP2", title = "UMAP plot")


### facet by island
umap_df |> 
  ggplot(aes(x = UMAP1, y = UMAP2, color = species))+
  geom_point(size = 3, alpha = 0.5) +
  facet_wrap(~island) +
  labs(x = "UMAP1", y = "UMAP2", subtitle = "UMAP plot") +
  theme(legend.position = "bottom")


### circlize the outlier cases
umap_df |> 
  ggplot(aes(x = UMAP1, y = UMAP2, color = species, shape = sex)) +
  geom_point()+
  labs(x = "UMAP1", y = "UMAP2", subtitle = "UMAP plot") +
  geom_circle(aes(x0 = -5, y0= -3.8, r = 0.65), color = "green", inherit.aes = FALSE)
## Warning in geom_circle(aes(x0 = -5, y0 = -3.8, r = 0.65), color = "green", : All aesthetics have length 1, but the data has 333 rows.
## ℹ Did you mean to use `annotate()`?

tSNE

rm(list = ls())
### theme set
theme_set(theme_bw(18))
### subset data
penguins <- penguins |> 
  drop_na() |> 
  select(-year) |> 
  mutate(ID = row_number())

### meta data
penguins_meta <- penguins |> 
  select(ID, species, island, sex)

### tSNE analysis
set.seed(135)
tsne_fit <- penguins |> 
  select(where(is.numeric)) |> 
  column_to_rownames("ID") |> 
  scale() |> 
  Rtsne()

### retrive data
tsne_df  <- tsne_fit$Y |> 
  as.data.frame() |> 
  rename(tSNE1 = "V1", tSNE2 = "V2") |> 
  mutate(ID = row_number())

tsne_df <- tsne_df |> 
  inner_join(penguins_meta, by = "ID")

### check data
head(tsne_df)
##        tSNE1     tSNE2 ID species    island    sex
## 1  -5.788624  5.231530  1  Adelie Torgersen   male
## 2  -6.308533  8.950800  2  Adelie Torgersen female
## 3  -6.015678 12.082790  3  Adelie Torgersen female
## 4  -9.649822  6.857170  4  Adelie Torgersen female
## 5 -12.064101  4.828783  5  Adelie Torgersen   male
## 6  -5.480114  7.486022  6  Adelie Torgersen female

### visualize 
tsne_df |> 
  ggplot(
    aes(x = tSNE1, y = tSNE2, color = species, shape = sex)) +
  geom_point()+
  theme(legend.position = "right") +
  ### identify potential sample mismatch
  geom_circle(
    aes(x0 = 9, y0 = -2, r = 2),
    color = "green",
    inherit.aes = FALSE
  )
## Warning in geom_circle(aes(x0 = 9, y0 = -2, r = 2), color = "green", inherit.aes = FALSE): All aesthetics have length 1, but the data has 333 rows.
## ℹ Did you mean to use `annotate()`?

Reference