---
title: "Learning [ropls] for multivariate analysis and feature selection of omics data"
date: 2023-07-05
# date-modified: 2023-04-05
categories:
- r
- pca
- plsda
- supervised
image: plsda.png
---
PCA (also called eigenvector analysis) is unsupervised pattern recognition technique mostly utilized as data reduction and modelling technique. It determines the degree or extent to which variables are related. Large data of many variables are unavoidably superfluous and overlap, the use of correlation matrix generally quantifies these anomalies by extracting the eigenvalues and eigenvectors from the square matrix originated by multiplying the data matrix. The purpose of PCA is to find orthogonal variables that capture the maximum amount of variance in the data without considering class information. PCA provide the information about the relationships and patterns and help identify major sources of variation and potential outliers
PLS discriminant analysis is a supervised technique that uses the PLS algorithm to explain and predict the membership of observations to several classes using quantitative or qualitative explanatory variables or parameters. The purpose of PLS-DA is to identify the latent variables that maximize the discrimination between the predefined classes in the data. PLS-DA focus on the the separation of classes in the dataset and provide information on important features that serparate classes.
## Packages and Data
```{r}
pacman::p_load(ropls, tidyverse, ggsci)
### load data
data(sacurine)
names(sacurine)
attach(sacurine)
strF(dataMatrix)
strF(variableMetadata)
# View(dataMatrix)
# View(variableMetadata)
# View(sampleMetadata)
```
## PCA
```{r}
pca <- opls(dataMatrix)
genderFc <- sampleMetadata[, "gender"]
plot(pca,
typeVc = "x-score",
parAsColFcVn = genderFc,
parEllipsesL = TRUE
)
dev.off()
plot(pca,
typeVc = "x-score",
parAsColFcVn = genderFc,
parLabVc = as.character(sampleMetadata[, "age"]),
parPaletteVc = c("green4", "magenta"))
dev.off()
```
## PLS-DA
```{r}
### PLSDA analysis
plsda <- opls(dataMatrix, genderFc)
### sample scores plot
sample_score <- plsda@scoreMN |>
as.data.frame() |>
mutate(gender = sacurine[["sampleMetadata"]][["gender"]])
### plot
ggplot(sample_score, aes(x = p1, y = p2, color = gender)) +
geom_hline(yintercept = 0, linetype = "dashed", linewidth = 0.5) +
geom_vline(xintercept = 0, linetype = "dashed", linewidth = 0.5) +
geom_point() +
geom_point(aes(x = -10, y = -10), color = "white") +
labs(x = "P1(10.0%)", y = "P2(9%)") +
stat_ellipse(
level = 0.95, linetype = "solid", size = 1, show.legend = FALSE
) +
scale_color_manual(values = c("#3CB371", "#FF6347")) +
theme_bw() +
theme(
legend.position = c(0.9, 0.8),
legend.text = element_text(color = "black", size = 12, family = "Arial", face = "plain"),
panel.background = element_blank(),
panel.grid = element_blank(),
axis.text = element_text(color = "black", size = 15, family = 'Arial', face = "plain"),
axis.title = element_text(color = "black", size = 15, family = 'Arial', face = "plain"),
axis.ticks = element_line(color = "black")
)
```
```{r}
### find the discrinminative variable that VIP greater than 1
### VIP scores plot
vip_score <- as.data.frame(plsda@vipVn)
colnames(vip_score) <- "vip"
vip_score$metabolites <- rownames(vip_score)
vip_score <- vip_score[order(-vip_score$vip), ]
vip_score$metabolites <- factor(
vip_score$metabolites, levels = vip_score$metabolites)
loading_score <- plsda@loadingMN |>
as.data.frame()
loading_score$metabolites <- rownames(loading_score)
all_score <- merge(vip_score, loading_score, by = "metabolites")
all_score$cat <- paste("feature", 1:nrow(all_score), sep = "")
### plot
ggplot(all_score[all_score$vip >=1, ], aes(x = cat, y = vip)) +
geom_segment(aes(x = cat, xend = cat,
y = 0, yend = vip)) +
geom_point(shape = 21, size = 5, color = "#008000", fill = "#008000")+
geom_point(aes(1, 2.5), color = "white") +
geom_hline(yintercept = 1, linetype = "dashed") +
scale_y_continuous(expand = c(0, 0)) +
labs(x = "", y = "VIP value") +
theme_bw() +
theme(
legend.position = "none",
legend.text = element_text(color = "black", size = 12, family = "Arial", face = "plain"),
panel.background = element_blank(),
panel.grid = element_blank(),
axis.text = element_text(color = "black", size = 15, family = 'Arial', face = "plain"),
axis.text.x = element_text(angle = 90),
axis.title = element_text(color = "black", size = 15, family = 'Arial', face = "plain"),
axis.ticks = element_line(color = "black"),
axis.ticks.x = element_blank()
)
```
## OPLS-DA
```{r}
### OPLS-DA analysis
oplsda <- opls(dataMatrix, genderFc, predI = 1, orthoI = NA)
### sample scores plot
sample_score <- oplsda@scoreMN |>
as.data.frame() |>
mutate(
gender = sacurine[["sampleMetadata"]][["gender"]],
o1 = oplsda@orthoScoreMN[, 1]
)
### plot
ggplot(sample_score, aes(p1, o1, color = gender)) +
geom_hline(yintercept = 0, linetype = "dashed", size = 0.5) +
geom_vline(xintercept = 0, linetype = "dashed", size = 0.5) +
geom_point() +
#geom_point(aes(-10,-10), color = 'white') +
labs(x = "P1(5.0%)", y = "to1") +
stat_ellipse(level = 0.95, linetype = "solid",
size = 1, show.legend = FALSE) +
scale_color_manual(values = c("#3CB371", "#FF6347")) +
theme_bw() +
theme(legend.position = c(0.1, 0.85),
legend.title = element_blank(),
legend.text = element_text(color = "black", size = 12, family = "Arial",
face = "plain"),
panel.background = element_blank(),
panel.grid = element_blank(),
axis.text = element_text(color = "black", size = 15, family = "Arial", face = "plain"),
axis.title = element_text(color = "black", size = 15, family = "Arial", face = "plain"),
axis.ticks = element_line(color = 'black'))
```
```{r}
### VIP scores plot
vip_score <- as.data.frame(oplsda@vipVn)
colnames(vip_score) <- "vip"
vip_score$metabolites <- rownames(vip_score)
vip_score <- vip_score[order(-vip_score$vip), ]
vip_score$metabolites <- factor(
vip_score$metabolites, levels = vip_score$metabolites)
loading_score <- oplsda@loadingMN |>
as.data.frame()
loading_score$metabolites <- rownames(loading_score)
all_score <- merge(vip_score, loading_score, by = "metabolites")
all_score$cat <- paste("feature", 1:nrow(all_score), sep = "")
### plot
ggplot(all_score[all_score$vip >=1, ], aes(x = cat, y = vip)) +
geom_segment(aes(x = cat, xend = cat,
y = 0, yend = vip)) +
geom_point(shape = 21, size = 5, color = "#FFA07A", fill = "#FFA07A")+
geom_point(aes(1, 2.5), color = "white") +
geom_hline(yintercept = 1, linetype = "dashed") +
scale_y_continuous(expand = c(0, 0)) +
labs(x = "", y = "VIP value") +
theme_bw() +
theme(
legend.position = "none",
legend.text = element_text(color = "black", size = 12, family = "Arial", face = "plain"),
panel.background = element_blank(),
panel.grid = element_blank(),
axis.text = element_text(color = "black", size = 15, family = 'Arial', face = "plain"),
axis.text.x = element_text(angle = 90),
axis.title = element_text(color = "black", size = 15, family = 'Arial', face = "plain"),
axis.ticks = element_line(color = "black"),
axis.ticks.x = element_blank()
)
```
## Predict models
```{r}
### OPLS-DA model training
oplsda.2 <- opls(dataMatrix, genderFc, predI = 1, orthoI = NA,subset = "odd")
### training dataset accuracy
trainVi <- getSubsetVi(oplsda.2)
tab <- table(genderFc[trainVi], fitted(oplsda.2))
print(paste("model accuracy:", round(sum(diag(tab))/sum(tab)*100, 2), "%", sep = ""))
### testing dataset accuracy
tab2 <- table(genderFc[-trainVi], predict(oplsda.2, dataMatrix[-trainVi, ]))
print(paste("model accuracy:", round(sum(diag(tab2))/sum(tab2)*100, 2),'%', sep = ''))
```
## Other
```{r}
# volcano plot
df <- dataMatrix %>% as.data.frame()
df$gender <- sacurine[["sampleMetadata"]][["gender"]]
df <- df[order(df$gender), ]
df <- df[,-110]
M.mean <- apply(df[1:100,], 2, FUN = mean)
F.mean <- apply(df[101:183,], 2, FUN = mean)
FC <- M.mean / F.mean
log2FC <- log(FC, 2)
pvalue <- apply(df, 2, function(x) {t.test(x[1:100],x[101:183])$p.value})
p.adj <- p.adjust(pvalue, method = 'BH')
p.adj.log <- -log10(p.adj)
colcano.df <- data.frame(log2FC, p.adj, p.adj.log)
colcano.df$cat <- ifelse(colcano.df$log2FC >= 1 & colcano.df$p.adj < 0.05, "Up",
ifelse(colcano.df$log2FC <= -1 & colcano.df$p.adj < 0.05, "Down","NS"))
ggplot(colcano.df, aes(log2FC, p.adj.log)) +
geom_point(colour = "#A9A9A9", size = 1) +
labs(y = "-log10(p-value.adj)") +
theme_bw() +
scale_x_continuous(limits = c(-2, 2)) +
theme(legend.position = 'none',
legend.text = element_text(color = 'black', size = 12, family = 'Arial', face = 'plain'),
panel.background = element_blank(),
panel.grid = element_blank(),
axis.text = element_text(color = 'black', size = 15, family = 'Arial', face = 'plain'),
axis.text.x = element_text(angle = 90),
axis.title = element_text(color = 'black', size = 15, family = 'Arial', face = 'plain'),
axis.ticks = element_line(color = 'black'),
axis.ticks.x = element_blank())
```