General workflow for RNAseq data analysis with R

Use lightweight and open-source software with R programming language and the Bioconductor suite of packages to perfrm RNA-seq data analysis.
r
rnaseq
Author
Published

Monday, September 11, 2023

Modified

Wednesday, January 10, 2024

Step1: Check the quality of raw reads

### go to row reads folder
cd ~/../..DIY_Transcriptomics/data/fastq
### activate env
conda active rnaseq
### use fastqc to check the quality of fastq files
fastqc *.gz -t 4

Step2: Mapping reads with Kallisto pseudoaligment

  • Download reference transciptome files from here

  • Build an index from reference fasta file

kallisto index -i Homo_sapiens.GRCh38.cdna.all.index Homo_sapiens.GRCh38.cdna.all.fa.gz
  • Map reads to the indexed reference host transcriptome

  • Summarize output with Multiqc in a single summary html
multiqc -d .
### first the healthy subjects (HS)
kallisto quant -i Homo_sapiens.GRCh38.cdna.all.index -o HS01 -t 4 --single -l 250 -s 30 SRR8668755.fastq.gz &> HS01.log
kallisto quant -i Homo_sapiens.GRCh38.cdna.all.index -o HS02 -t 4 --single -l 250 -s 30 SRR8668756.fastq.gz &> HS02.log
kallisto quant -i Homo_sapiens.GRCh38.cdna.all.index -o HS03 -t 4 --single -l 250 -s 30 SRR8668757.fastq.gz &> HS03.log
kallisto quant -i Homo_sapiens.GRCh38.cdna.all.index -o HS04 -t 4 --single -l 250 -s 30 SRR8668758.fastq.gz &> HS04.log
kallisto quant -i Homo_sapiens.GRCh38.cdna.all.index -o HS05 -t 4 --single -l 250 -s 30 SRR8668759.fastq.gz &> HS05.log

### then the cutaneous leishmaniasis (CL) patients
kallisto quant -i Homo_sapiens.GRCh38.cdna.all.index -o CL08 -t 4 --single -l 250 -s 30 SRR8668769.fastq.gz &> CL08.log
kallisto quant -i Homo_sapiens.GRCh38.cdna.all.index -o CL10 -t 4 --single -l 250 -s 30 SRR8668771.fastq.gz &> CL10.log
kallisto quant -i Homo_sapiens.GRCh38.cdna.all.index -o CL11 -t 4 --single -l 250 -s 30 SRR8668772.fastq.gz &> CL11.log
kallisto quant -i Homo_sapiens.GRCh38.cdna.all.index -o CL12 -t 4 --single -l 250 -s 30 SRR8668773.fastq.gz &> CL12.log
kallisto quant -i Homo_sapiens.GRCh38.cdna.all.index -o CL13 -t 4 --single -l 250 -s 30 SRR8668774.fastq.gz &> CL13.log

### Summarize outputs
multiqc -d .

Step3: Import data into R

Load packages

### load packages
library(rhdf5) # provides functions for handling hdf5 file formats (kallisto outputs bootstraps in this format)
library(tidyverse) # Hadley Wickham's collection of R packages for data science, which we will use throughout the course
library(tximport) # package for getting Kallisto results into R
library(ensembldb) # helps deal with ensembl
library(EnsDb.Hsapiens.v86) # replace with your organism-specific database package
library(here) # file path
library(datapasta) # paste data into R from clipboard
library(edgeR) # well known package for differential expression analysis, but we only use for the DGEList object and for normalization methods
library(matrixStats) # easily calculate stats on rows or columns of a data matrix
library(cowplot) # allows you to combine multiple plots in one figure
library(DT) # for making interactive tables
library(plotly) # for making interactive plots
library(gt) # A layered 'grammar of tables' - think ggplot, but for tables
library(limma) # venerable package for differential gene expression using linear modeling
library(edgeR)
library(RColorBrewer) # need colors to make heatmaps
# library(gameofthrones) #because...why not.  Install using 'devtools::install_github("aljrico/gameofthrones")'
library(heatmaply) #for making interactive heatmaps using plotly
library(gplots)
# library(d3heatmap) # for making interactive heatmaps using D3
library(GSEABase) # functions and methods for Gene Set Enrichment Analysis
library(Biobase) # base functions for bioconductor; required by GSEABase
library(GSVA) # Gene Set Variation Analysis, a non-parametric and unsupervised method for estimating variation of gene set enrichment across samples.
library(gprofiler2) # tools for accessing the GO enrichment results using g:Profiler web resources
library(clusterProfiler) # provides a suite of tools for functional enrichment analysis
library(msigdbr) # access to msigdb collections directly within R
library(enrichplot) # great for making the standard GSEA enrichment plots

Import study design files

### load study design
targets <- read_tsv(
    here("learn", "DIY_Transcriptomics", "config", "studydesign.txt")
)
targets
## # A tibble: 10 × 3
##    sample sra_accession group  
##    <chr>  <chr>         <chr>  
##  1 HS01   SRR8668755    healthy
##  2 HS02   SRR8668756    healthy
##  3 HS03   SRR8668757    healthy
##  4 HS04   SRR8668758    healthy
##  5 HS05   SRR8668759    healthy
##  6 CL08   SRR8668769    disease
##  7 CL10   SRR8668771    disease
##  8 CL11   SRR8668772    disease
##  9 CL12   SRR8668773    disease
## 10 CL13   SRR8668774    disease

### create file paths for the abundance files
path <- file.path(
    "learn", "DIY_Transcriptomics", "data", "fastq", targets$sample, "abundance.tsv"
)
path
##  [1] "learn/DIY_Transcriptomics/data/fastq/HS01/abundance.tsv"
##  [2] "learn/DIY_Transcriptomics/data/fastq/HS02/abundance.tsv"
##  [3] "learn/DIY_Transcriptomics/data/fastq/HS03/abundance.tsv"
##  [4] "learn/DIY_Transcriptomics/data/fastq/HS04/abundance.tsv"
##  [5] "learn/DIY_Transcriptomics/data/fastq/HS05/abundance.tsv"
##  [6] "learn/DIY_Transcriptomics/data/fastq/CL08/abundance.tsv"
##  [7] "learn/DIY_Transcriptomics/data/fastq/CL10/abundance.tsv"
##  [8] "learn/DIY_Transcriptomics/data/fastq/CL11/abundance.tsv"
##  [9] "learn/DIY_Transcriptomics/data/fastq/CL12/abundance.tsv"
## [10] "learn/DIY_Transcriptomics/data/fastq/CL13/abundance.tsv"

### make sure path is correct
all(file.exists(path))
## [1] TRUE

Get organsim specific annotations

tx <- transcripts(
    EnsDb.Hsapiens.v86, columns = c("tx_id", "gene_name")
) |>
    as_tibble() |> 
    ### need to change first column name to 'target_id'
    rename(target_id = tx_id)
tx
## # A tibble: 216,741 × 7
##    seqnames start   end width strand target_id       gene_name
##    <fct>    <int> <int> <int> <fct>  <chr>           <chr>    
##  1 1        11869 14409  2541 +      ENST00000456328 DDX11L1  
##  2 1        12010 13670  1661 +      ENST00000450305 DDX11L1  
##  3 1        14404 29570 15167 -      ENST00000488147 WASH7P   
##  4 1        17369 17436    68 -      ENST00000619216 MIR6859-1
##  5 1        29554 31097  1544 +      ENST00000473358 MIR1302-2
##  6 1        30267 31109   843 +      ENST00000469289 MIR1302-2
##  7 1        30366 30503   138 +      ENST00000607096 MIR1302-2
##  8 1        34554 36081  1528 -      ENST00000417324 FAM138A  
##  9 1        35245 36073   829 -      ENST00000461467 FAM138A  
## 10 1        52473 53312   840 +      ENST00000606857 OR4G4P   
## # ℹ 216,731 more rows

### move transcrip ID the first column in the dataframe
tx <- select(tx, "target_id", "gene_name")

Load transcript counts

txi_gene <- tximport(
    path,
    type = "kallisto",
    tx2gene = tx,
    txOut = FALSE, # How does the result change if this =FALSE vs =TRUE?
    countsFromAbundance = "lengthScaledTPM",
    ignoreTxVersion = TRUE
)

### take a look at the type of object
class(txi_gene)
## [1] "list"
names(txi_gene)
## [1] "abundance"           "counts"              "length"             
## [4] "countsFromAbundance"

Step4: Wrangling gene expression

### examine the data
tpm <- txi_gene$abundance
counts <- txi_gene$counts

### generate summary stats
tpm_stats <- transform(
    tpm,
    SD = rowSds(tpm),
    AVG = rowMeans(tpm),
    MED = rowMedians(tpm)
)

### look at the summary data
head(tpm_stats)
##                  X1          X2         X3           X4         X5          X6
## 5S_rRNA   0.0000000   0.0000000   0.000000   0.00000000   0.000000   0.0000000
## A1BG      6.3823340   6.7399420   3.842043   6.70710000   8.591181  14.6144150
## A1CF      0.0757872   0.0616752   0.214226   0.00859446   0.000000   0.0265036
## A2M     162.5622990 341.8312630 295.814803 261.47230600 169.453100 345.6983730
## A2ML1   109.6624900  80.6018680  90.356840  64.11152700  67.035650  52.5762220
## A2MP1     0.2237170   0.3143760   0.390022   0.15622800   0.122260   0.2700100
##                 X7          X8           X9         X10           SD
## 5S_rRNA   0.000000   0.0000000   0.00000000   0.0000000   0.00000000
## A1BG     18.182700  12.8662800  12.05296100   9.8268802   4.41172300
## A1CF      0.072777   0.1359934   0.03722713   0.0581254   0.06414927
## A2M     643.554440 341.8972850 324.80208900 415.9663840 135.78415021
## A2ML1     4.401309   5.1981760   1.75721000  15.1189910  39.78108982
## A2MP1     1.087410   0.7737610   0.68834900   0.6378640   0.31619850
##                  AVG         MED
## 5S_rRNA   0.00000000   0.0000000
## A1BG      9.98058361   9.2090306
## A1CF      0.06909094   0.0599003
## A2M     330.30523424 333.3166760
## A2ML1    49.08202830  58.3438745
## A2MP1     0.46639970   0.3521990

### visualize it using scatter plot
ggplot(tpm_stats, aes(x = SD, y = MED)) +
    geom_point(shape = 18, size = 2) +
    geom_smooth(method = lm) +
    geom_hex(show.legend = FALSE) +
    labs(
        y = "Median", x = "Standard deviation",
        title = "Transcripts per million (TPM)",
        subtitle = "unfiltered, non-normalized data",
        caption = "DIYtranscriptomics - Spring 2020"
    ) +
    theme_classic() +
    theme_dark() +
    theme_bw()

### capture sample labels from study design
sampleLabels <- targets$sample

### make DGElist from the counts
DGEList <- DGEList(txi_gene$counts)
DGEList
## An object of class "DGEList"
## $counts
##             Sample1     Sample2     Sample3     Sample4  Sample5     Sample6
## 5S_rRNA     0.00000     0.00000     0.00000    0.000000   0.0000     0.00000
## A1BG      184.19070   245.13169   101.73418   79.204797  13.3335   682.72256
## A1CF       22.94477    23.53171    59.50815    1.064719   0.0000    12.98875
## A2M     11901.13136 31538.08430 19870.31859 7832.900974 667.1460 40967.58721
## A2ML1   15162.86062 14045.04908 11463.06052 3627.335147 498.4618 11767.57706
##            Sample7     Sample8     Sample9    Sample10
## 5S_rRNA     0.0000     0.00000     0.00000     0.00000
## A1BG      391.5812   326.10430   397.97768   499.96534
## A1CF       16.4421    36.15938    12.89508    31.02343
## A2M     35158.3894 21982.61779 27205.95758 53686.20520
## A2ML1     454.1300   631.23251   277.98636  3685.37686
## 35366 more rows ...
## 
## $samples
##          group lib.size norm.factors
## Sample1      1 42300696            1
## Sample2      1 51775632            1
## Sample3      1 40485664            1
## Sample4      1 16661634            1
## Sample5      1  2191577            1
## Sample6      1 55389922            1
## Sample7      1 28800520            1
## Sample8      1 31632164            1
## Sample9      1 39464569            1
## Sample10     1 67827185            1

### get counts per million
cpm <- cpm(DGEList)
colSums(cpm)
##  Sample1  Sample2  Sample3  Sample4  Sample5  Sample6  Sample7  Sample8 
##    1e+06    1e+06    1e+06    1e+06    1e+06    1e+06    1e+06    1e+06 
##  Sample9 Sample10 
##    1e+06    1e+06

log2_cpm <- cpm(DGEList, log=TRUE)
### connvert data matrix to dataframe
log2_cpm_df <- as_tibble(log2_cpm, rownames = "geneID")
### add sample names
colnames(log2_cpm_df) <- c("geneID", sampleLabels)

### pivot from wide to long 
log2_cpm_df_pivot <- pivot_longer(
  log2_cpm_df, 
  cols = HS01:CL13, 
  names_to = "samples", 
  values_to = "expression" 
) 

p1 <- ggplot(
    log2_cpm_df_pivot,
    aes(x = samples, y = expression, fill = samples)
) +
    geom_violin(trim = FALSE, show.legend = FALSE) +
    stat_summary(fun = "median",
        geom = "point",
        shape = 95,
        size = 10,
        color = "black",
        show.legend = FALSE
    ) +
    labs(
        y = "log2 expression", x = "sample",
        title = "Log2 Counts per Million (CPM)",
        subtitle = "unfiltered, non-normalized",
        caption = paste0("produced on ", Sys.time())
    ) +
    theme_bw()
p1

  • How many genes or transcripts have no read counts at all?
### filter data
table(rowSums(DGEList$counts==0)==10)
## 
## FALSE  TRUE 
## 29373  5998

### how many genes had more than 1 CPM (TRUE) in at least 3 samples

### set cutoff
keepers <- rowSums(cpm>1) >= 5 # user defined
DGEList_filtered <- DGEList[keepers, ]

log2_cpm_filtered <- cpm(DGEList_filtered, log=TRUE)
log2_cpm_filtered_df <- as_tibble(log2_cpm_filtered, rownames = "geneID")
colnames(log2_cpm_filtered_df) <- c("geneID", sampleLabels)
log2_cpm_filtered_df_pivot <- pivot_longer(
    log2_cpm_filtered_df,
    cols = HS01:CL13,
    names_to = "samples",
    values_to = "expression"
)

p2 <- ggplot(
    log2_cpm_filtered_df_pivot,
    aes(x = samples, y = expression, fill = samples)
) +
    geom_violin(trim = FALSE, show.legend = FALSE) +
    stat_summary(
        fun = "median",
        geom = "point",
        shape = 95,
        size = 10,
        color = "black",
        show.legend = FALSE
    ) +
    labs(y = "log2 expression", x = "sample",
        title = "Log2 Counts per Million (CPM)",
        subtitle = "filtered, non-normalized",
        caption = paste0("produced on ", Sys.time())) +
    theme_bw()

p2

### normalize data
DGEList_filtered_norm <- calcNormFactors(
    DGEList_filtered, method = "TMM"
)
log2_cpm_filtered_norm <- cpm(
    DGEList_filtered_norm, log = TRUE
)
log2_cpm_filtered_norm_df <- as_tibble(
    log2_cpm_filtered_norm, rownames = "geneID"
)
colnames(log2_cpm_filtered_norm_df) <- c("geneID", sampleLabels)

log2_cpm_filtered_norm_df_pivot <- pivot_longer(
    log2_cpm_filtered_norm_df,
    cols = HS01:CL13,
    names_to = "samples",
    values_to = "expression"
)

p3 <- ggplot(
    log2_cpm_filtered_norm_df_pivot,
    aes(x = samples, y = expression, fill = samples)
) +
    geom_violin(trim = FALSE, show.legend = FALSE) +
    stat_summary(
        fun = "median",
        geom = "point",
        shape = 95,
        size = 10,
        color = "black",
        show.legend = FALSE
    ) +
    labs(
        y = "log2 expression", x = "sample",
        title = "Log2 Counts per Million (CPM)",
        subtitle = "filtered, TMM normalized",
        caption = paste0("produced on ", Sys.time())
    ) +
    theme_bw()

### combine plots
plot_grid(
  p1, p2, p3, 
  labels = c('A', 'B', 'C'), 
  label_size = 12
)

Step5: Data exploration with multivariate analysis

### Identify variables of interest in study design file
group <- targets$group
group <- factor(group)
group
##  [1] healthy healthy healthy healthy healthy disease disease disease disease
## [10] disease
## Levels: disease healthy

Hierarchical clustering

### calculate distance  methods (e.g. switch from 'maximum' to 'euclidean')
distance <- dist(
    t(log2_cpm_filtered_norm), method = "maximum"
)
### other distance methods are "euclidean", maximum", "manhattan", "canberra", "binary" or "minkowski"

clusters <- hclust(distance, method = "average") 
### other agglomeration methods are "ward.D", "ward.D2", "single", "complete", "average", "mcquitty", "median", or "centroid"
plot(clusters, labels=sampleLabels)

Principal compoent analysis

pca_res <- prcomp(
    t(log2_cpm_filtered_norm),
    scale. = FALSE, retx = TRUE
)


### prints variance summary for all principal components.
summary(pca_res)
## Importance of components:
##                             PC1     PC2      PC3      PC4      PC5      PC6
## Standard deviation     113.9821 49.8074 33.42376 27.08082 26.40296 22.23750
## Proportion of Variance   0.6656  0.1271  0.05723  0.03757  0.03571  0.02533
## Cumulative Proportion    0.6656  0.7927  0.84993  0.88750  0.92322  0.94855
##                             PC7      PC8      PC9      PC10
## Standard deviation     20.50645 17.71945 16.42366 1.816e-13
## Proportion of Variance  0.02154  0.01609  0.01382 0.000e+00
## Cumulative Proportion   0.97010  0.98618  1.00000 1.000e+00

### look at the PCA result (pca.res)
ls(pca_res)
## [1] "center"   "rotation" "scale"    "sdev"     "x"

### $rotation shows you how much each gene influenced each PC (called 'scores')
head(pca_res$rotation) 
##                 PC1          PC2           PC3          PC4           PC5
## A1BG    0.006058601 -0.005265865  0.0030322868  0.008759217  0.0021903806
## A2M     0.004897345  0.002459393 -0.0029437295  0.010128989  0.0010754828
## A2ML1  -0.015426376  0.002131922  0.0236833694  0.012776801  0.0151330784
## A2MP1   0.008361769  0.005416383 -0.0080073403 -0.004989098  0.0019137300
## A4GALT -0.004861196 -0.010071400 -0.0040747825  0.004365382 -0.0073287027
## AAAS    0.002247684 -0.005546829 -0.0003891962  0.005401109 -0.0008667994
##                 PC6           PC7         PC8          PC9        PC10
## A1BG    0.002600255  1.165903e-02 0.002109927  0.006017461 -0.62642253
## A2M    -0.003082754 -2.988929e-03 0.007583352 -0.002958490 -0.07649096
## A2ML1  -0.013272183 -1.389944e-02 0.008501332  0.020687796  0.29435235
## A2MP1  -0.002415560  1.626299e-03 0.011281299 -0.005577339 -0.50451276
## A4GALT -0.009698589  6.199904e-05 0.012467687  0.004110126 -0.31572438
## AAAS    0.006991981  3.783003e-03 0.004628266  0.003568768 -0.14061416

### 'x' shows you how much each sample influenced each PC (called 'loadings')
pca_res$x 
##                 PC1          PC2        PC3        PC4         PC5        PC6
## Sample1  -116.29139   38.2334807  21.154790 -31.204512  24.3342557  -5.032445
## Sample2   -98.08443   28.5182187   6.488451  20.455248 -24.7150117  -5.625522
## Sample3   -98.57599   46.3377421 -21.880766 -22.638077  26.9619112   8.312240
## Sample4  -102.83232   27.1997919 -10.521681  36.884584 -34.6489813  11.837625
## Sample5  -119.48548 -130.0448432  -8.771308  -8.078419   0.7747427  -2.609032
## Sample6    76.83934  -13.0282654  65.541177  34.668976  32.4966464   9.756467
## Sample7   114.80802    0.9427947 -66.411954  27.226075  30.8708143  -6.756527
## Sample8   122.14072   -0.8992447  -2.600281 -31.000051 -26.0328136  19.278692
## Sample9   129.23356   -4.8764679   3.983424 -16.285254 -13.4446636  25.667565
## Sample10   92.24799    7.6167930  13.018146 -10.028570 -16.5969001 -54.829063
##                 PC7         PC8         PC9          PC10
## Sample1   37.289161  -8.2755495   9.2338103  6.432316e-13
## Sample2    9.263960  37.6085903 -14.4523481  7.064106e-14
## Sample3  -40.584484   0.6580365  -8.3486393  3.455381e-13
## Sample4   -3.089166 -28.1333136  11.4331050 -5.619926e-14
## Sample5   -2.014396  -0.2124505  -0.6860841 -1.205842e-13
## Sample6  -10.563273   2.1965073   5.3204553 -6.485607e-14
## Sample7   15.193876   4.0838551   4.1812220 -2.588419e-13
## Sample8   -5.936520  15.6564254  28.6671255 -1.908206e-13
## Sample9   11.836127 -13.5624080 -32.5109827 -3.271781e-13
## Sample10 -11.395285 -10.0196929  -2.8376640  1.788202e-13

### A screeplot is a standard way to view eigenvalues for each PCA
screeplot(pca_res) 


### sdev^2 captures these eigenvalues from the PCA result
pc_var <- pca_res$sdev^2 

### use these eigenvalues to calculate the percentage variance explained by each PC
pc_per <- round(pc_var/sum(pc_var)*100, 1) 
pc_per
##  [1] 66.6 12.7  5.7  3.8  3.6  2.5  2.2  1.6  1.4  0.0
  • Can we figure out the identify of outlier?
### visualize PCA result 
pca_res_df <- as_tibble(pca_res$x) |> 
  bind_cols(targets) |> 
  mutate(group = factor(group))
pca_plot1 <- pca_res_df |>
    ggplot(
        aes(x = PC1, y = PC2, label = sample, color = group)
    ) +
    geom_point(size = 4) +
    stat_ellipse() +
    xlab(paste0("PC1 (", pc_per[1], "%", ")")) +
    ylab(paste0("PC2 (", pc_per[2], "%", ")")) +
    labs(title = "PCA plot",
        caption = paste0("produced on ", Sys.time())) +
    coord_fixed() +
    theme_bw()

pca_plot1

  • Another way to view PCA laodings to understand impact of each sample on each pricipal component
pca_res_df <- pca_res$x[, 1:4] |>
    as_tibble() |>
    add_column(
        sample = sampleLabels,
        group = group
    )
### dataframe to be pivoted  
pca_pivot <- pivot_longer(
  pca_res_df, 
  cols = PC1:PC4, 
  names_to = "PC", 
  values_to = "loadings"
) 
### plot
pca_plot2 <- ggplot(pca_pivot) +
    aes(x = sample, y = loadings, fill = group) +
    geom_bar(stat = "identity") +
    facet_wrap(~PC) +
    labs(
        title = "PCA 'small multiples' plot",
        caption = paste0("produced on ", Sys.time())
    ) +
    theme_bw() +
    coord_flip()
pca_plot2

### combine plot
# plot_grid(pca_plot1, pca_plot2)
# make columns comparing each of the averages above that you're interested in
mydata_df <- mutate(
  log2_cpm_filtered_norm_df,
  healthy.AVG = (HS01 + HS02 + HS03 + HS04 + HS05)/5, 
  disease.AVG = (CL08 + CL10 + CL11 + CL12 + CL13)/5,
  LogFC = (disease.AVG - healthy.AVG)
  ) |> 
  mutate_if(is.numeric, round, 2)

### sort data
mydata_sort <- mydata_df |>
  arrange(desc(LogFC)) |> 
  select(geneID, LogFC)
mydata_sort
## # A tibble: 15,765 × 2
##    geneID    LogFC
##    <chr>     <dbl>
##  1 IGLC3     10.6 
##  2 MMP1      10.4 
##  3 IGKV1-5   10.2 
##  4 IGLV3-1    9.96
##  5 IGLV3-19   9.9 
##  6 IGLC1      9.88
##  7 IGKV1D-39  9.87
##  8 IGLL5      9.85
##  9 IGKV1D-33  9.83
## 10 IGLV2-8    9.65
## # ℹ 15,755 more rows

### filter data
mydata_filter <- mydata_df |>
    filter(
        geneID == "MMP1" | geneID == "GZMB" | geneID == "IL1B" | geneID == "GNLY" | geneID == "IFNG"
        | geneID == "CCL4" | geneID == "PRF1" | geneID == "APOBEC3A" | geneID == "UNC13A"
    ) |>
    select(geneID, healthy.AVG, disease.AVG, LogFC) |>
    arrange(desc(LogFC))
mydata_filter
## # A tibble: 9 × 4
##   geneID   healthy.AVG disease.AVG LogFC
##   <chr>          <dbl>       <dbl> <dbl>
## 1 MMP1            1.44       11.9  10.4 
## 2 IFNG           -3.65        4.42  8.07
## 3 CCL4           -1.75        6.14  7.89
## 4 GZMB           -0.58        6.47  7.05
## 5 GNLY            0.98        7.17  6.19
## 6 IL1B            0.63        6.48  5.85
## 7 PRF1            1.03        6.64  5.61
## 8 APOBEC3A        0.27        5.6   5.33
## 9 UNC13A         -0.78        1.95  2.73
### Produce publication-quality tables using `gt` package
mydata_filter |>
    gt() |>
    fmt_number(columns = 2:4, decimals = 1) |>
    tab_header(
        title = md("**Regulators of skin pathogenesis**"),
        subtitle = md("*during cutaneous leishmaniasis*")
    ) |>
    tab_footnote(
        footnote = "Deletion or blockaid ameliorates disease in mice",
        locations = cells_body(columns = geneID, rows = c(6, 7))
    ) |>
    tab_footnote(
        footnote = "Associated with treatment failure in multiple studies",
        locations = cells_body(columns = geneID, rows = c(2:9))
    ) |>
    tab_footnote(
        footnote = "Implicated in parasite control",
        locations = cells_body(columns = geneID, rows = c(2))
    ) |>
    tab_source_note(
        source_note = md("Reference: Amorim *et al*., (2019). DOI: 10.1126/scitranslmed.aar3619")
    )
Regulators of skin pathogenesis
during cutaneous leishmaniasis
geneID healthy.AVG disease.AVG LogFC
MMP1 1.4 11.9 10.4
IFNG1,2 −3.6 4.4 8.1
CCL41 −1.8 6.1 7.9
GZMB1 −0.6 6.5 7.0
GNLY1 1.0 7.2 6.2
IL1B3,1 0.6 6.5 5.8
PRF13,1 1.0 6.6 5.6
APOBEC3A1 0.3 5.6 5.3
UNC13A1 −0.8 1.9 2.7
Reference: Amorim et al., (2019). DOI: 10.1126/scitranslmed.aar3619
1 Associated with treatment failure in multiple studies
2 Implicated in parasite control
3 Deletion or blockaid ameliorates disease in mice
  • Make an interactive table using the DT package
# datatable(
#     mydata_df[, c(1, 12:14)],
#     extensions = c('KeyTable', "FixedHeader"),
#     filter = 'top',
#     options = list(keys = TRUE,
#         searchHighlight = TRUE,
#         pageLength = 10,
#         # dom = "Blfrtip",
#         # buttons = c("copy", "csv", "excel"),
#         lengthMenu = c("10", "25", "50", "100"))
# )
  • Make an interactive scatter plot with plotly
myplot <- ggplot(mydata_df) +
  aes(x=healthy.AVG, y=disease.AVG, 
      text = paste("Symbol:", geneID)) +
  geom_point(shape=16, size=1) +
  ggtitle("disease vs. healthy") +
  theme_bw()

myplot

# ggplotly(myplot)

Step6: Differential gene expression

Set up design matrix

group <- factor(targets$group)
design <- model.matrix(~0 + group)
colnames(design) <- levels(group)

# NOTE: if you need a paired analysis (a.k.a.'blocking' design) or have a batch effect, the following design is useful
# design <- model.matrix(~block + treatment)
# this is just an example. 'block' and 'treatment' would need to be objects in your environment

Model mean-variance trend and fit linear model to data

### Use VOOM function from Limma package to model the mean-variance relationship
head(DGEList_filtered_norm)
## An object of class "DGEList"
## $counts
##            Sample1     Sample2     Sample3    Sample4     Sample5     Sample6
## A1BG     184.19070   245.13169   101.73418   79.20480  13.3334957   682.72256
## A2M    11901.13136 31538.08430 19870.31859 7832.90097 667.1460374 40967.58721
## A2ML1  15162.86062 14045.04908 11463.06052 3627.33515 498.4617540 11767.57706
## A2MP1     29.31478    51.91491    46.89141    8.37674   0.8615391    57.27197
## A4GALT   128.32345   355.39448   131.42493   75.43359  30.6454026   112.79123
## AAAS     269.07806   457.62142   257.49394  147.56154  27.9749726   636.62246
##            Sample7     Sample8    Sample9   Sample10
## A1BG     391.58119   326.10430   397.9777   499.9653
## A2M    35158.38935 21982.61779 27205.9576 53686.2052
## A2ML1    454.12997   631.23251   277.9864  3685.3769
## A2MP1    106.33010    89.04506   103.1984   147.3506
## A4GALT    87.64872    88.03357    62.6173   230.0622
## AAAS     382.02897   412.70476   446.8081   509.4904
## 
## $samples
##          group lib.size norm.factors
## Sample1      1 42300696    0.9838341
## Sample2      1 51775632    1.0056785
## Sample3      1 40485664    1.1055948
## Sample4      1 16661634    1.0332639
## Sample5      1  2191577    1.0154852
## Sample6      1 55389922    0.8810033
## Sample7      1 28800520    1.0117840
## Sample8      1 31632164    1.0093077
## Sample9      1 39464569    0.9639270
## Sample10     1 67827185    1.0046274
v_DEGList_filtered_norm <- voom(DGEList_filtered_norm, design, plot = TRUE)


# fit a linear model to your data
fit <- lmFit(v_DEGList_filtered_norm, design)

### contrast matrix
contrast_matrix <- makeContrasts(infection = disease - healthy,
                                 levels=design)
### extract the linear model fit
fits <- contrasts.fit(fit, contrast_matrix)

### get bayesian stats for the linear model fit
ebFit <- eBayes(fits)

#write.fit(ebFit, file="lmfit_results.txt")

### top table to view DEGs
TopHits <- topTable(ebFit, adjust ="BH", coef=1, number=40000, sort.by="logFC")

### convert to a tibble
TopHits_df <- TopHits |>
  as_tibble(rownames = "geneID")

TopHits_df
## # A tibble: 15,765 × 7
##    geneID    logFC AveExpr     t      P.Value  adj.P.Val     B
##    <chr>     <dbl>   <dbl> <dbl>        <dbl>      <dbl> <dbl>
##  1 MMP1      10.8    6.65  13.4  0.0000000182 0.00000123 10.0 
##  2 IGLC3     10.3    3.49   7.50 0.00000815   0.0000778   4.06
##  3 IGKV1D-33 10.2    1.12   9.29 0.000000913  0.0000160   6.11
##  4 IGKV2D-28 10.1    0.898  9.50 0.000000723  0.0000137   6.32
##  5 IGLV3-25  10.0    0.242  6.89 0.0000186    0.000147    3.27
##  6 IGLV3-1   10.0    1.75  11.8  0.0000000715 0.00000281  8.36
##  7 IGKV1-5    9.93   3.36  13.8  0.0000000124 0.00000101 10.0 
##  8 IGHV1-3    9.89   0.447  6.24 0.0000468    0.000302    2.37
##  9 IGLV3-19   9.84   2.18   6.93 0.0000175    0.000140    3.33
## 10 IGHV3-21   9.80   1.42   9.00 0.00000128   0.0000200   5.80
## # ℹ 15,755 more rows
Limma outputs
  • TopTable (from Limma) outputs a few different stats:
  • logFC, AveExpr, and P.Value should be self-explanatory
  • adj.P.Val is your adjusted P value, also known as an FDR (if BH method was used for multiple testing correction)
  • B statistic is the log-odds that that gene is differentially expressed. If B = 1.5, then log odds is e^1.5, where e is euler’s constant (approx. 2.718). So, the odds of differential expression os about 4.8 to 1
  • t statistic is ratio of the logFC to the standard error (where the error has been moderated across all genes…because of Bayesian approach)
### volcano plot
vplot <- TopHits_df |>
    ggplot(
        aes(y = -log10(adj.P.Val), x = logFC, text = paste("Symbol:", geneID))
    ) +
    geom_point(size = 2) +
    geom_hline(yintercept = -log10(0.01), linetype = "longdash", colour = "grey", linewidth = 1) +
    geom_vline(xintercept = 1, linetype = "longdash", colour = "#BE684D", linewidth = 1) +
    geom_vline(xintercept = -1, linetype = "longdash", colour = "#2C467A", linewidth = 1) +
    # annotate("rect", xmin = 1, xmax = 12, ymin = -log10(0.01), ymax = 7.5, alpha=.2, fill="#BE684D") +
    # annotate("rect", xmin = -1, xmax = -12, ymin = -log10(0.01), ymax = 7.5, alpha=.2, fill="#2C467A") +
    labs(title = "Volcano plot",
        subtitle = "Cutaneous leishmaniasis",
        caption = paste0("produced on ", Sys.time())) +
    theme_bw()

vplot

### pull out DEGs and make venn diagram
results <- decideTests(
    ebFit, method = "global", adjust.method = "BH", p.value = 0.01, lfc = 2
)
head(results)
## TestResults matrix
##         Contrasts
##          infection
##   A1BG           0
##   A2M            0
##   A2ML1         -1
##   A2MP1          0
##   A4GALT         0
##   AAAS           0
summary(results)
##        infection
## Down        1012
## NotSig     13446
## Up          1307
vennDiagram(results, include="up")

### retrieve expression data for DEGs
head(v_DEGList_filtered_norm$E)
##           Sample1     Sample2    Sample3    Sample4    Sample5   Sample6
## A1BG    2.1498709 2.237982296 1.19157184  2.2109278  2.6359548 3.8074408
## A2M     8.1597752 9.242459704 8.79420169  8.8297563  8.2288005 9.7134419
## A2ML1   8.5092054 8.075454826 8.00060559  7.7192264  7.8086463 7.9138199
## A2MP1  -0.4811408 0.009534871 0.08239155 -0.9556369 -0.7089007 0.2435267
## A4GALT  1.6301549 2.772935178 1.55941146  2.1409994  3.8068082 1.2151214
## AAAS    2.6954627 3.137215647 2.52703139  3.1043863  3.6774835 3.7066558
##          Sample7  Sample8   Sample9 Sample10
## A1BG    3.750084 3.354713 3.3888732 2.876674
## A2M    10.236678 9.427422 9.4821796 9.621825
## A2ML1   3.963625 4.306483 2.8719810 5.757340
## A2MP1   1.874250 1.487855 1.4467684 1.117544
## A4GALT  1.596943 1.471466 0.7304817 1.758560
## AAAS    3.714501 3.694027 3.5556433 2.903874
colnames(v_DEGList_filtered_norm$E) <- sampleLabels

diffGenes <- v_DEGList_filtered_norm$E[results[,1] !=0,]
head(diffGenes)
##             HS01     HS02     HS03     HS04     HS05     CL08       CL10
## A2ML1   8.509205 8.075455 8.000606 7.719226 7.808646 7.913820  3.9636246
## AADAC   3.783598 3.145125 3.421001 3.846583 4.124090 2.241617 -3.5057299
## AADACL2 6.884775 7.025967 6.096531 6.700546 6.104662 5.215819 -5.8649243
## AARD    2.817882 1.719728 2.562007 2.090062 2.784928 1.028134  0.2785801
## ABCA10  4.429922 4.330043 6.261231 3.871370 2.785400 2.261820  2.9336235
## ABCA12  8.609129 7.810834 7.884251 7.603805 7.963791 7.338358  3.3247149
##              CL11       CL12       CL13
## A2ML1   4.3064835  2.8719810  5.7573398
## AADAC   0.1777154 -0.9997370 -0.5843452
## AADACL2 2.9288016  2.0251967  3.5424241
## AARD    0.2142361 -0.4255113  0.1288021
## ABCA10  2.4838643  2.4430992  0.6882190
## ABCA12  5.2844659  4.7055495  6.6999992
dim(diffGenes)
## [1] 2319   10

#convert DEGs to a dataframe using as_tibble
diffGenes_df <- as_tibble(diffGenes, rownames = "geneID")

### create interactive tables to display DEGs
# datatable(diffGenes_df,
#           extensions = c('KeyTable', "FixedHeader"),
#           caption = 'Table 1: DEGs in cutaneous leishmaniasis',
#           options = list(keys = TRUE, searchHighlight = TRUE, pageLength = 10, lengthMenu = c("10", "25", "50", "100"))) |>
#   formatRound(columns=c(2:11), digits=2)

### write DEGs to a file
# write_tsv(diffGenes_df, "DiffGenes.txt") 
#NOTE: this .txt file can be directly used for input into other clustering or network analysis tools (e.g., String, Clust (https://github.com/BaselAbujamous/clust, etc.)

Step7: Module identification

heatcolors <- rev(brewer.pal(name="RdBu", n=11))

#cluster rows by pearson correlation
clustRows <- hclust(
  as.dist(1-cor(t(diffGenes), method="pearson")), 
  method="complete"
)
#cluster columns by pearson correlation
clustColumns <- hclust(
  as.dist(1-cor(diffGenes, method="spearman")), 
  method="complete"
  )

module_assign <- cutree(clustRows, k=2)

module_color <- rainbow(length(unique(module_assign)), start=0.1, end=0.9) 

module_color <- module_color[as.vector(module_assign)] 

heatmap.2(
    diffGenes,
    Rowv = as.dendrogram(clustRows),
    Colv = as.dendrogram(clustColumns),
    RowSideColors = module_color,
    col = heatcolors, scale = 'row', labRow = NA,
    density.info = "none", trace = "none",
    cexRow = 1, cexCol = 1, margins = c(8, 20)
)


modulePick <- 2 
Module_up <- diffGenes[names(module_assign[module_assign %in% modulePick]),] 
hrsub_up <- hclust(
    as.dist(1 - cor(t(Module_up), method = "pearson")),
    method = "complete"
)

heatmap.2(
    Module_up, 
    Rowv=as.dendrogram(hrsub_up), 
    Colv=NA, 
    labRow = NA,
    col=heatcolors, scale="row", 
    density.info="none", trace="none", 
    RowSideColors=module_color[module_assign%in%modulePick], margins=c(8,20)
)


modulePick <- 1 
Module_down <- diffGenes[names(module_assign[module_assign %in% modulePick]),] 
hrsub_down <- hclust(
    as.dist(1 - cor(t(Module_down), method = "pearson")),
    method = "complete"
)

heatmap.2(Module_down, 
          Rowv=as.dendrogram(hrsub_down), 
          Colv=NA, 
          labRow = NA,
          col=heatcolors, scale="row", 
          density.info="none", trace="none", 
          RowSideColors=module_color[module_assign%in%modulePick], margins=c(8,20))

Step8: Functional enrichment analysis

Go enrichment analysis

### carry out GO enrichment using gProfiler2
#### use topTable result to pick the top genes for carrying out a Gene Ontology (GO) enrichment analysis
myTopHits <- topTable(
    ebFit,
    adjust = "BH",
    coef = 1,
    number = 50,
    sort.by = "logFC"
)

### use the 'gost' function from the gprofiler2 package to run GO enrichment analysis
gost_res <- gost(
    rownames(myTopHits),
    organism = "hsapiens",
    correction_method = "fdr"
)

### produce an interactive manhattan plot of enriched GO terms
gostplot(gost_res, interactive = TRUE, capped = T) 

### produce a publication quality static manhattan plot with specific GO terms highlighted
mygostplot <- gostplot(
  gost_res, 
  interactive = FALSE, 
  capped = TRUE #set interactive=FALSE to get plot for publications
) 

### rerun the above gostplot function with 'interactive=F' and save to an object 'mygostplot'
publish_gostplot(
  mygostplot, #your static gostplot from above
  highlight_terms = c("GO:0034987"),
  filename = NULL,
  width = NA,
  height = NA)


### generate a table of  gost results
publish_gosttable(
  gost_res,
  highlight_terms = NULL,
  use_colors = TRUE,
  show_columns = c("source", "term_name", "term_size", "intersection_size"),
  filename = NULL,
  ggplot=TRUE)

# now repeat the above steps using only genes from a single module from the step 6 script, by using `rownames(myModule)`
# what is value in breaking up DEGs into modules for functional enrichment analysis?

Perform GSEA using clusterProfiler

There are a few ways to get msigDB collections into R:

  • Option1: download directly from msigdb and load from your computer. can use the ‘read.gmt’ function from clusterProfiler package to create a dataframe. alternatively, you can read in using ‘getGmt’ function from GSEABase package if you need a GeneSetCollection object
c2cp <- read.gmt("/Users/.../MSigDB/c2.cp.v7.1.symbols.gmt")
  • Option2: use the msigdb package to access up-to-date collections. this option has the additional advantage of providing access to species-specific collections are also retrieved as tibbles
msigdbr_species()
## # A tibble: 20 × 2
##    species_name                    species_common_name                          
##    <chr>                           <chr>                                        
##  1 Anolis carolinensis             Carolina anole, green anole                  
##  2 Bos taurus                      bovine, cattle, cow, dairy cow, domestic cat…
##  3 Caenorhabditis elegans          <NA>                                         
##  4 Canis lupus familiaris          dog, dogs                                    
##  5 Danio rerio                     leopard danio, zebra danio, zebra fish, zebr…
##  6 Drosophila melanogaster         fruit fly                                    
##  7 Equus caballus                  domestic horse, equine, horse                
##  8 Felis catus                     cat, cats, domestic cat                      
##  9 Gallus gallus                   bantam, chicken, chickens, Gallus domesticus 
## 10 Homo sapiens                    human                                        
## 11 Macaca mulatta                  rhesus macaque, rhesus macaques, Rhesus monk…
## 12 Monodelphis domestica           gray short-tailed opossum                    
## 13 Mus musculus                    house mouse, mouse                           
## 14 Ornithorhynchus anatinus        duck-billed platypus, duckbill platypus, pla…
## 15 Pan troglodytes                 chimpanzee                                   
## 16 Rattus norvegicus               brown rat, Norway rat, rat, rats             
## 17 Saccharomyces cerevisiae        baker's yeast, brewer's yeast, S. cerevisiae 
## 18 Schizosaccharomyces pombe 972h- <NA>                                         
## 19 Sus scrofa                      pig, pigs, swine, wild boar                  
## 20 Xenopus tropicalis              tropical clawed frog, western clawed frog
### Gets all collections/signatures with human gene IDs
hs_gsea <- msigdbr(species = "Homo sapiens") 

### Take a look at the categories and subcategories of signatures available to you
hs_gsea |> 
  distinct(gs_cat, gs_subcat) |> 
  arrange(gs_cat, gs_subcat)
## # A tibble: 23 × 2
##    gs_cat gs_subcat        
##    <chr>  <chr>            
##  1 C1     ""               
##  2 C2     "CGP"            
##  3 C2     "CP"             
##  4 C2     "CP:BIOCARTA"    
##  5 C2     "CP:KEGG"        
##  6 C2     "CP:PID"         
##  7 C2     "CP:REACTOME"    
##  8 C2     "CP:WIKIPATHWAYS"
##  9 C3     "MIR:MIRDB"      
## 10 C3     "MIR:MIR_Legacy" 
## # ℹ 13 more rows

### Choose a specific msigdb collection/subcollection
hs_gsea_c2 <- msigdbr(
  ### change depending on species your data came from
  species = "Homo sapiens", 
  category = "C2"
  ) |> 
  ###just get the columns corresponding to signature name and gene symbols of genes in each signature 
  select(gs_name, gene_symbol) 

### Pull out just the columns corresponding to gene symbols and LogFC for at least one pairwise comparison for the enrichment analysis
mydata_df_sub <- select(mydata_df, geneID, LogFC)

### Construct a named vector
mydata_gsea <- mydata_df_sub$LogFC
names(mydata_gsea) <- as.character(mydata_df_sub$geneID)
mydata_gsea <- sort(mydata_gsea, decreasing = TRUE)

### Run GSEA using the 'GSEA' function from clusterProfiler
myGSEA_res <- GSEA(mydata_gsea, TERM2GENE=hs_gsea_c2, verbose=FALSE)
myGSEA_df <- as_tibble(myGSEA_res@result)

### View results as an interactive table
# datatable(
#     myGSEA_df,
#     extensions = c('KeyTable', "FixedHeader"),
#     caption = 'Signatures enriched in leishmaniasis',
#     options = list(
#         keys = TRUE,
#         searchHighlight = TRUE,
#         pageLength = 10,
#         lengthMenu = c("10", "25", "50", "100")
#     )
# ) |>
#     formatRound(columns = c(2:10), digits = 2)

### Create enrichment plots using the enrichplot package
gseaplot2(
    myGSEA_res,
    ### can choose multiple signatures to overlay in this plot
    geneSetID = c(6, 47, 262), 
    ### can set this to FALSE for a cleaner plot
    pvalue_table = FALSE, 
    ### can also turn off this title
    title = myGSEA_res$Description[47]
) 


### Add a variable to this result that matches enrichment direction with phenotype
myGSEA_df <- myGSEA_df |>
    mutate(
        phenotype = case_when(
            NES > 0 ~ "disease",
            NES < 0 ~ "healthy"
        )
    )

### Create 'bubble plot' to summarize y signatures across x phenotypes
ggplot(myGSEA_df[1:20,], aes(x=phenotype, y=ID)) + 
  geom_point(aes(size=setSize, color = NES, alpha=-log10(p.adjust))) +
  scale_color_gradient(low="blue", high="red") +
  theme_bw()

Competitive GSEA using CAMERA

For competitive tests the null hypothesis is that genes in the set are, at most, as often differentially expressed as genes outside the set.

### Create a few signatures to test in our enrichment analysis
### if your own data is from mice, wrap this in 'toupper()' to make gene symbols all caps
mySig <- rownames(myTopHits) 

mySig2 <- sample(
  (rownames(v_DEGList_filtered_norm$E)), 
  size = 50, replace = FALSE
  )

collection <- list(real = mySig, fake = mySig2)

###Test for enrichment using CAMERA
camera_res <- camera(v_DEGList_filtered_norm$E, collection, design, contrast_matrix[,1]) 
camera_df <- as_tibble(camera_res, rownames = "setName")
camera_df
## # A tibble: 2 × 5
##   setName NGenes Direction   PValue      FDR
##   <chr>    <dbl> <chr>        <dbl>    <dbl>
## 1 real        50 Up        5.85e-20 1.17e-19
## 2 fake        50 Up        9.57e- 1 9.57e- 1

Self-contained GSEA using ROAST

For self-contained the null hypothesis is that no genes in the set are differentially expressed.

mroast(
  v_DEGList_filtered_norm$E, 
  collection, design, 
  #mroast adjusts for multiple testing
  contrast=1
  ) 
##      NGenes PropDown PropUp Direction PValue   FDR PValue.Mixed FDR.Mixed
## real     50     0.06   0.94        Up  5e-04 5e-04        5e-04     5e-04
## fake     50     0.04   0.86        Up  5e-04 5e-04        5e-04     5e-04

### Repeat with an actual gene set collection
### camera requires collections to be presented as a list, rather than a tibble, so we must read in our signatures using the 'getGmt' function
broadSet_C2_ALL <- getGmt(
    here("learn", "DIY_Transcriptomics", "data", "c2.all.v7.2.symbols.gmt"),
    geneIdType = SymbolIdentifier()
)

### Extract as a list
broadSet_C2_ALL <- geneIds(broadSet_C2_ALL)
camera_res <- camera(
    v_DEGList_filtered_norm$E,
    broadSet_C2_ALL, design,
    contrast_matrix[, 1]
)
camera_df <- as_tibble(camera_res, rownames = "setName")
camera_df
## # A tibble: 6,229 × 5
##    setName                                    NGenes Direction   PValue      FDR
##    <chr>                                       <dbl> <chr>        <dbl>    <dbl>
##  1 MCLACHLAN_DENTAL_CARIES_UP                    245 Up        2.80e-33 1.74e-29
##  2 POOLA_INVASIVE_BREAST_CANCER_UP               275 Up        2.59e-32 8.06e-29
##  3 WIELAND_UP_BY_HBV_INFECTION                    99 Up        2.10e-31 4.36e-28
##  4 REACTOME_IMMUNOREGULATORY_INTERACTIONS_BE…    158 Up        9.85e-31 1.53e-27
##  5 REACTOME_FCGR_ACTIVATION                       62 Up        1.88e-29 2.34e-26
##  6 REACTOME_CD22_MEDIATED_BCR_REGULATION          53 Up        2.62e-27 2.72e-24
##  7 REACTOME_CREATION_OF_C4_AND_C2_ACTIVATORS      58 Up        1.70e-26 1.52e-23
##  8 REACTOME_INITIAL_TRIGGERING_OF_COMPLEMENT      66 Up        2.81e-26 2.19e-23
##  9 WALLACE_PROSTATE_CANCER_RACE_UP               277 Up        6.49e-26 4.49e-23
## 10 FLECHNER_BIOPSY_KIDNEY_TRANSPLANT_REJECTE…     86 Up        4.32e-25 2.69e-22
## # ℹ 6,219 more rows

### Filter based on FDR and display as interactive table
camera_df <- filter(camera_df, FDR<=0.01)

# datatable(
#     camera_df,
#     extensions = c('KeyTable', "FixedHeader"),
#     caption = 'Signatures enriched in leishmaniasis',
#     options = list(
#         keys = TRUE,
#         searchHighlight = TRUE,
#         pageLength = 10,
#         lengthMenu = c("10", "25", "50", "100")
#     )
# ) |>
#     formatRound(columns = c(2, 4, 5), digits = 2)

### Add a variable that maps up/down regulated pathways with phenotype
camera_df <- camera_df |>
    mutate(
        phenotype = case_when(
            Direction == "Up" ~ "disease",
            Direction == "Down" ~ "healthy"
        )
    )

### Filtering to return anything that has 'CD8' or 'CYTOTOX' in the name of the signature
camera_df_sub <- camera_df |>
  filter(str_detect(setName, "CD8|CYTOTOX"))

### Graph camera results as bubble chart 
ggplot(camera_df[1:25,], aes(x=phenotype, y=setName)) + 
  geom_point(aes(size=NGenes, color = Direction, alpha=-log10(FDR))) +
  theme_bw()

Single sample GSEA using the GSVA package

The GSVA package offers a different way of approaching functional enrichment analysis.

  • In contrast to most GSE methods, GSVA performs a change in coordinate systems,transforming the data from a gene by sample matrix to a gene set (signature) by sample matrix. This allows for the evaluation of pathway enrichment for each sample.
  • The method is both non-parametric and unsupervised bypasses the conventional approach of explicitly modeling phenotypes within enrichment scoring algorithms.
  • Focus is therefore placed on the RELATIVE enrichment of pathways across the sample space rather than the absolute enrichment with respect to a phenotype. However, with data with a moderate to small sample size (< 30), other GSE methods that explicitly include the phenotype in their model are more likely to provide greater statistical power to detect functional enrichment.
# Be aware that if you choose a large MsigDB file here, this step may take a while
GSVA_res_C2CP <- gsva(
    v_DEGList_filtered_norm$E,
    ### signatures
    broadSet_C2_ALL,
    ### criteria for filtering gene sets
    min.sz = 5, max.sz = 500,
    mx.diff = FALSE,
    ### options for method are "gsva", ssgsea', "zscore" or "plage"
    method = "gsva"
)
## Estimating GSVA scores for 5919 gene sets.
## Estimating ECDFs with Gaussian kernels
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |                                                                      |   1%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |==                                                                    |   4%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |===                                                                   |   5%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |=====                                                                 |   8%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |=======                                                               |  11%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |=========                                                             |  14%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |===========                                                           |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |============                                                          |  18%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |==============                                                        |  21%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |================                                                      |  24%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |=================                                                     |  25%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |===================                                                   |  28%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=====================                                                 |  29%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |=====================                                                 |  31%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |=======================                                               |  32%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |=======================                                               |  34%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |=========================                                             |  35%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |==========================                                            |  36%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |==========================                                            |  38%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |============================                                          |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |============================                                          |  41%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |==============================                                        |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |==============================                                        |  44%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |===============================                                       |  45%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  46%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |=================================                                     |  48%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |===================================                                   |  49%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |===================================                                   |  51%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |=====================================                                 |  52%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |=====================================                                 |  54%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |=======================================                               |  55%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |========================================                              |  58%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==========================================                            |  59%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |==========================================                            |  61%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |============================================                          |  62%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |============================================                          |  64%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |=============================================                         |  65%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |===============================================                       |  68%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |=================================================                     |  69%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |=================================================                     |  71%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |==================================================                    |  72%
  |                                                                            
  |===================================================                   |  72%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |===================================================                   |  74%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |=====================================================                 |  75%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |======================================================                |  76%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |======================================================                |  78%
  |                                                                            
  |=======================================================               |  78%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |========================================================              |  79%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |========================================================              |  81%
  |                                                                            
  |=========================================================             |  81%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |==========================================================            |  82%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |==========================================================            |  84%
  |                                                                            
  |===========================================================           |  84%
  |                                                                            
  |===========================================================           |  85%
  |                                                                            
  |============================================================          |  85%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |=============================================================         |  86%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |=============================================================         |  88%
  |                                                                            
  |==============================================================        |  88%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |===============================================================       |  89%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |===============================================================       |  91%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |================================================================      |  92%
  |                                                                            
  |=================================================================     |  92%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |=================================================================     |  94%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |==================================================================    |  95%
  |                                                                            
  |===================================================================   |  95%
  |                                                                            
  |===================================================================   |  96%
  |                                                                            
  |====================================================================  |  96%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |====================================================================  |  98%
  |                                                                            
  |===================================================================== |  98%
  |                                                                            
  |===================================================================== |  99%
  |                                                                            
  |======================================================================|  99%
  |                                                                            
  |======================================================================| 100%
  • Apply linear model to GSVA result. Now using Limma to find significantly enriched gene sets in the same way you did to find diffGenes. This means you’ll be using topTable, decideTests, etc.
  • note that you need to reference your design and contrast matrix here.
fit_C2CP <- lmFit(GSVA_res_C2CP, design)
ebfit_C2CP <- eBayes(fit_C2CP)

### Use topTable and decideTests functions to identify the differentially enriched gene sets
topPaths_C2CP <- topTable(ebfit_C2CP, adjust ="BH", coef=1, number=50, sort.by="logFC")
res_C2CP <- decideTests(ebfit_C2CP, method="global", adjust.method="BH", p.value=0.05, lfc=0.58)

### The summary of the decideTests result shows how many sets were enriched in induced and repressed genes in all sample types
summary(res_C2CP)
##        disease healthy
## Down        12      42
## NotSig    5851    5864
## Up          56      13

# Pull out the gene sets that are differentially enriched between groups
diffSets_C2CP <- GSVA_res_C2CP[res_C2CP[,1] !=0,]
head(diffSets_C2CP)
##                                                  HS01       HS02       HS03
## ST_IL_13_PATHWAY                           -0.7173858 -0.7147717 -0.7164712
## WATANABE_ULCERATIVE_COLITIS_WITH_CANCER_DN -0.6188325 -0.5710992 -0.4131349
## FARMER_BREAST_CANCER_CLUSTER_1             -0.6886216 -0.5784651 -0.5593251
## BOWIE_RESPONSE_TO_EXTRACELLULAR_MATRIX     -0.6707584 -0.5317945 -0.5360570
## BOWIE_RESPONSE_TO_TAMOXIFEN                -0.6717424 -0.5377614 -0.5172726
## KAUFFMANN_MELANOMA_RELAPSE_DN               0.5698331  0.7398284  0.6624807
##                                                  HS04       HS05       CL08
## ST_IL_13_PATHWAY                           -0.5968289 -0.4130711  0.8128353
## WATANABE_ULCERATIVE_COLITIS_WITH_CANCER_DN -0.6270975 -0.5185058  0.2930852
## FARMER_BREAST_CANCER_CLUSTER_1             -0.6325020 -0.4714246  0.5083211
## BOWIE_RESPONSE_TO_EXTRACELLULAR_MATRIX     -0.6457163 -0.5252770  0.5529664
## BOWIE_RESPONSE_TO_TAMOXIFEN                -0.6692727 -0.5128578  0.5408430
## KAUFFMANN_MELANOMA_RELAPSE_DN               0.4426951  0.6730961 -0.6256108
##                                                  CL10       CL11       CL12
## ST_IL_13_PATHWAY                            0.6256980  0.5670608  0.6341371
## WATANABE_ULCERATIVE_COLITIS_WITH_CANCER_DN  0.5838839  0.6787619  0.7118710
## FARMER_BREAST_CANCER_CLUSTER_1              0.5593513  0.6517882  0.7463300
## BOWIE_RESPONSE_TO_EXTRACELLULAR_MATRIX      0.4914990  0.7723819  0.6409015
## BOWIE_RESPONSE_TO_TAMOXIFEN                 0.5064169  0.7568069  0.6405129
## KAUFFMANN_MELANOMA_RELAPSE_DN              -0.6277048 -0.6895166 -0.8205882
##                                                  CL13
## ST_IL_13_PATHWAY                            0.6455147
## WATANABE_ULCERATIVE_COLITIS_WITH_CANCER_DN  0.6389032
## FARMER_BREAST_CANCER_CLUSTER_1              0.5802493
## BOWIE_RESPONSE_TO_EXTRACELLULAR_MATRIX      0.6434423
## BOWIE_RESPONSE_TO_TAMOXIFEN                 0.6232986
## KAUFFMANN_MELANOMA_RELAPSE_DN              -0.3907835
dim(diffSets_C2CP)
## [1] 68 10

### Make a heatmap of differentially enriched gene sets 
hr_C2CP <- hclust(
    as.dist(1 - cor(t(diffSets_C2CP), method = "pearson")),
    ### cluster rows by pearson correlation
    method = "complete"
)

hc_C2CP <- hclust(
    as.dist(1 - cor(diffSets_C2CP, method = "spearman")),
    ### cluster columns by spearman correlation
    method = "complete"
)

### Cut the resulting tree and create color vector for clusters.  Vary the cut height to give more or fewer clusters, or you the 'k' argument to force n number of clusters
mycl_C2CP <- cutree(hr_C2CP, k=2)
mycolhc_C2CP <- rainbow(
    length(unique(mycl_C2CP)),
    start = 0.1, end = 0.9
)

mycolhc_C2CP <- mycolhc_C2CP[as.vector(mycl_C2CP)] 

### assign your favorite heatmap color scheme. Some useful examples: colorpanel(40, "darkblue", "yellow", "white"); heat.colors(75); cm.colors(75); rainbow(75); redgreen(75); library(RColorBrewer); rev(brewer.pal(9,"Blues")[-1]). Type demo.col(20) to see more color schemes.
myheatcol <- colorRampPalette(colors=c("yellow","white","blue"))(100)

### Plot the hclust results as a heatmap
heatmap.2(
    diffSets_C2CP,
    Rowv = as.dendrogram(hr_C2CP),
    Colv = as.dendrogram(hc_C2CP),
    col = myheatcol, scale = "row",
    density.info = "none", trace = "none",
    ### Creates heatmap for entire data set where the obtained clusters are indicated in the color bar.
    cexRow = 0.9, cexCol = 1, margins = c(10, 14)
) 

Reference

SessionInfo

sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Sonoma 14.1.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Asia/Singapore
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] enrichplot_1.20.0         msigdbr_7.5.1            
##  [3] clusterProfiler_4.8.2     gprofiler2_0.2.2         
##  [5] GSVA_1.48.3               GSEABase_1.62.0          
##  [7] graph_1.78.0              annotate_1.80.0          
##  [9] XML_3.99-0.15             gplots_3.1.3             
## [11] heatmaply_1.5.0           viridis_0.6.4            
## [13] viridisLite_0.4.2         RColorBrewer_1.1-3       
## [15] gt_0.10.0                 plotly_4.10.3            
## [17] DT_0.30                   cowplot_1.1.1            
## [19] matrixStats_1.1.0         edgeR_4.0.1              
## [21] limma_3.58.1              datapasta_3.1.0          
## [23] here_1.0.1                EnsDb.Hsapiens.v86_2.99.0
## [25] ensembldb_2.24.1          AnnotationFilter_1.24.0  
## [27] GenomicFeatures_1.52.2    AnnotationDbi_1.64.1     
## [29] Biobase_2.62.0            GenomicRanges_1.52.1     
## [31] GenomeInfoDb_1.38.0       IRanges_2.36.0           
## [33] S4Vectors_0.40.1          BiocGenerics_0.48.1      
## [35] tximport_1.28.0           lubridate_1.9.3          
## [37] forcats_1.0.0             stringr_1.5.1            
## [39] dplyr_1.1.4               purrr_1.0.2              
## [41] readr_2.1.4               tidyr_1.3.0              
## [43] tibble_3.2.1              ggplot2_3.4.4            
## [45] tidyverse_2.0.0           rhdf5_2.44.0             
## 
## loaded via a namespace (and not attached):
##   [1] fs_1.6.3                    ProtGenerics_1.32.0        
##   [3] bitops_1.0-7                HDO.db_0.99.1              
##   [5] httr_1.4.7                  webshot_0.5.5              
##   [7] tools_4.3.1                 utf8_1.2.4                 
##   [9] R6_2.5.1                    HDF5Array_1.28.1           
##  [11] mgcv_1.9-0                  lazyeval_0.2.2             
##  [13] rhdf5filters_1.12.1         withr_2.5.2                
##  [15] prettyunits_1.2.0           gridExtra_2.3              
##  [17] cli_3.6.1                   TSP_1.2-4                  
##  [19] scatterpie_0.2.1            sass_0.4.7                 
##  [21] labeling_0.4.3              commonmark_1.9.0           
##  [23] Rsamtools_2.16.0            yulab.utils_0.1.0          
##  [25] gson_0.1.0                  DOSE_3.26.2                
##  [27] RSQLite_2.3.3               generics_0.1.3             
##  [29] gridGraphics_0.5-1          BiocIO_1.10.0              
##  [31] crosstalk_1.2.0             vroom_1.6.4                
##  [33] gtools_3.9.5                dendextend_1.17.1          
##  [35] GO.db_3.17.0                Matrix_1.6-3               
##  [37] fansi_1.0.5                 abind_1.4-5                
##  [39] lifecycle_1.0.4             yaml_2.3.7                 
##  [41] SummarizedExperiment_1.30.2 qvalue_2.32.0              
##  [43] BiocFileCache_2.8.0         grid_4.3.1                 
##  [45] blob_1.2.4                  promises_1.2.1             
##  [47] crayon_1.5.2                lattice_0.22-5             
##  [49] beachmat_2.16.0             KEGGREST_1.42.0            
##  [51] pillar_1.9.0                knitr_1.45                 
##  [53] fgsea_1.26.0                rjson_0.2.21               
##  [55] codetools_0.2-19            fastmatch_1.1-4            
##  [57] glue_1.6.2                  downloader_0.4             
##  [59] ggfun_0.1.3                 data.table_1.14.8          
##  [61] vctrs_0.6.4                 png_0.1-8                  
##  [63] treeio_1.24.3               gtable_0.3.4               
##  [65] assertthat_0.2.1            cachem_1.0.8               
##  [67] xfun_0.41                   mime_0.12                  
##  [69] S4Arrays_1.0.6              tidygraph_1.2.3            
##  [71] SingleCellExperiment_1.22.0 seriation_1.4.2            
##  [73] iterators_1.0.14            statmod_1.5.0              
##  [75] ellipsis_0.3.2              nlme_3.1-163               
##  [77] ggtree_3.8.2                bit64_4.0.5                
##  [79] progress_1.2.2              filelock_1.0.2             
##  [81] rprojroot_2.0.4             irlba_2.3.5.1              
##  [83] KernSmooth_2.23-22          colorspace_2.1-0           
##  [85] DBI_1.1.3                   tidyselect_1.2.0           
##  [87] bit_4.0.5                   compiler_4.3.1             
##  [89] curl_5.1.0                  xml2_1.3.5                 
##  [91] DelayedArray_0.26.7         shadowtext_0.1.2           
##  [93] rtracklayer_1.60.1          scales_1.2.1               
##  [95] caTools_1.18.2              hexbin_1.28.3              
##  [97] rappdirs_0.3.3              digest_0.6.33              
##  [99] rmarkdown_2.25              ca_0.71.1                  
## [101] XVector_0.42.0              htmltools_0.5.7            
## [103] pkgconfig_2.0.3             sparseMatrixStats_1.12.2   
## [105] MatrixGenerics_1.14.0       dbplyr_2.4.0               
## [107] fastmap_1.1.1               rlang_1.1.2                
## [109] htmlwidgets_1.6.2           shiny_1.8.0                
## [111] DelayedMatrixStats_1.22.6   farver_2.1.1               
## [113] jsonlite_1.8.7              BiocParallel_1.36.0        
## [115] GOSemSim_2.26.1             BiocSingular_1.16.0        
## [117] RCurl_1.98-1.13             magrittr_2.0.3             
## [119] GenomeInfoDbData_1.2.11     ggplotify_0.1.2            
## [121] patchwork_1.1.3             Rhdf5lib_1.22.1            
## [123] munsell_0.5.0               Rcpp_1.0.11                
## [125] ape_5.7-1                   babelgene_22.9             
## [127] stringi_1.8.1               ggraph_2.1.0               
## [129] zlibbioc_1.48.0             MASS_7.3-60                
## [131] plyr_1.8.9                  parallel_4.3.1             
## [133] ggrepel_0.9.4               Biostrings_2.70.1          
## [135] graphlayouts_1.0.2          splines_4.3.1              
## [137] hms_1.1.3                   locfit_1.5-9.8             
## [139] igraph_1.5.1                markdown_1.11              
## [141] reshape2_1.4.4              biomaRt_2.56.1             
## [143] ScaledMatrix_1.8.1          evaluate_0.23              
## [145] httpuv_1.6.12               tzdb_0.4.0                 
## [147] foreach_1.5.2               tweenr_2.0.2               
## [149] polyclip_1.10-6             ggforce_0.4.1              
## [151] rsvd_1.0.5                  xtable_1.8-4               
## [153] restfulr_0.0.15             tidytree_0.4.5             
## [155] later_1.3.1                 aplot_0.2.2                
## [157] memoise_2.0.1               registry_0.5-1             
## [159] GenomicAlignments_1.36.0    timechange_0.2.0