Differential Gene Expression

Setup

library(Rubrary)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(tibble)

# For `airway` install
r = getOption("repos")
r["CRAN"] = "http://cran.us.r-project.org"
options(repos = r)

Introduction

The example dataset for this vignette comes from the paper [1] below:

Himes et al. (2014). “RNA-Seq Transcriptome Profiling Identifies CRISPLD2 as a Glucocorticoid Responsive Gene that Modulates Cytokine Function in Airway Smooth Muscle Cells.” PLoS ONE, 9(6), e99625. https://doi.org/10.1371/journal.pone.0099625.

From the abstract:

Using RNA-Seq, a high-throughput sequencing method, we characterized transcriptomic changes in four primary human ASM cell lines that were treated with dexamethasone—a potent synthetic glucocorticoid (1 µM for 18 hours).

The experiment was performed on human airway smooth muscle (ASM) cells. Glucocorticoids are a commonly used drug to treat asthma for its anti-inflammatory effects in lung tissue. The goal of the experiment was to characterize the mechanism of glucocorticoid inflammation suppression through RNAseq. Himes et al. found a number of glucocorticoid-responsive genes such as DUSP1 and CRISPLD2. They also identified enrichment in functional annotation of terms such as “glycoprotein/extracellular matrix”, “vasculature development”, and “circulatory system process”.

In the original paper, the authors used Cuffdiff 2 to find differential gene expression and used the NIH Database for Annotation, Visualization, and Integrated Discovery (DAVID) for gene set enrichment analysis.

Load data

The package airway contains data from the Himes et al. paper in the form of a RangedSummarizedExperiment object and is hosted on Bioconductor.

Get `airway` data from package

if (!requireNamespace("airway", quietly = TRUE)){
  BiocManager::install("airway")
}

library(airway)
data(airway)

Get counts and sample annotation

The gene expression data is represented in counts. There are 4 samples treated with dexamethasone (anno$dex == "trt") and 4 untreated samples (anno$dex == "untrt").

# Counts
cts <- assay(airway, "counts")
# Annotation
anno <- colData(airway) %>% as.data.frame()
anno$dex <- relevel(anno$dex, "untrt") # Set untreated as first factor

rmarkdown::paged_table(data.frame(cts))

rmarkdown::paged_table(anno)

Replace Ensembl ID w/ HGNC symbol

airway gene names are given in Ensembl ID, but HGNC gene symbols can be easier to interpret in downstream analysis and more compatible with gene set enrichment analysis (GSEA) tools.

# Gene format is currently in Ensembl ID format
head(rownames(cts))
#> [1] "ENSG00000000003" "ENSG00000000005" "ENSG00000000419" "ENSG00000000457"
#> [5] "ENSG00000000460" "ENSG00000000938"

library(biomaRt)
# Load Mart object - database of Homo sapiens genes from Ensembl
mart <- biomaRt::useDataset(
  dataset = "hsapiens_gene_ensembl",
  mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL",
                          host="https://useast.ensembl.org"))

conv <- Rubrary::convert_genes(
    genes = rownames(cts),
    from_to = c("ensembl_gene_id", "hgnc_symbol"),
    mart = mart, table = T) %>%
  filter(hgnc_symbol != "") %>% # Exclude unmatched HGNC symbols
  distinct(hgnc_symbol, .keep_all = T)

cts <- cts %>%
  as.data.frame() %>%
  tibble::rownames_to_column("ensembl_gene_id")

cts_hgnc <- dplyr::left_join( # Join stats with HGNC symbol table
  conv, cts, by = "ensembl_gene_id") %>%
  dplyr::select(!ensembl_gene_id) %>%
  tibble::column_to_rownames("hgnc_symbol")

# Genes now have associated HGNC symbols
rmarkdown::paged_table(head(cts_hgnc))

Visualize samples w/ PCA

Since we already created the Mart object when converting gene symbols, we can reuse it to avoid querying the BioMart database again and load protein coding genes efficiently.

# Upper quartile normalize
cts_UQ <- Rubrary::quantile_norm(cts_hgnc, q = 75)
# Log transform
pc_genes <- Rubrary::get_PC_genes(mart)
cts_log2UQ <- Rubrary::get_log_coding_geneexp(cts_UQ, pc_genes = pc_genes)
#> Warning: 400 genes not found in dataframe gene list: 
#> MAFIP, PRAMEF27, HNRNPCL3, HNRNPCL2, PRAMEF26, PRAMEF13, PRAMEF18, RPS4Y2, FCGBP, DUX4L8, ...
#> ** Search may take a long time to run!
# Rubrary::run_PCA
airway_PCA <- Rubrary::run_PCA(cts_log2UQ)
#> ** Cumulative var. exp. >= 80% at PC 4 (81.3%)

# Rubrary::plot_PCA
Rubrary::plot_PCA(
  df_pca = airway_PCA,
  anno = tibble::rownames_to_column(anno, "Sample_ID"),
  annoname = "Sample_ID",
  annotype = "dex",
  label = TRUE
)

# Rubrary::plot_PCA_matrix
Rubrary::plot_PCA_matrix(
  df_pca = airway_PCA,
  anno = tibble::rownames_to_column(anno, "Sample_ID"),
  annoname = "Sample_ID",
  annotype = "cell"
)
#> Registered S3 method overwritten by 'GGally':
#>   method from   
#>   +.gg   ggplot2

`DESeq2` differentially expressed genes

DESeq2 [2] performs differential gene expression analysis my modeling counts as a negative binomial distribution.

Create DESeqDataSet object and run DESeq

From DESeq2::results documentation, the contrast argument can be formatted as:

a character vector with exactly three elements: the name of a factor in the design formula, the name of the numerator level for the fold change, and the name of the denominator level for the fold change (simplest case)

Thus, the fold change per gene in the results represents the change in the dex condition, $FC = \frac{\text{trt}}{\text{untrt}}$. $FC > 1$ results in $\log(FC) > 0$, an increase in gene expression in treated samples relative to untreated samples.

Rubrary::output_DESeq takes the DESeqResults object, transforms it to a dataframe, and adds a sign log2 p-value column.

library(DESeq2)

dds <- DESeqDataSetFromMatrix(
  countData = cts_hgnc[,rownames(anno)],
  colData = anno,
  design = ~ dex)

dds <- DESeq(dds)
#> estimating size factors
#> estimating dispersions
#> gene-wise dispersion estimates
#> mean-dispersion relationship
#> final dispersion estimates
#> fitting model and testing
res <- results(dds, contrast = c("dex", "trt", "untrt"))
deseq_res_df <- Rubrary::output_DESeq(res)

Filter to protein coding genes

For ease of interpretation, the DE genes results can also be filtered to protein coding genes only. Rubrary::filter_genes() can filter with the list of protein coding genes retrieved with Rubrary::get_PC_genes().

deseq_res_df_PC <- Rubrary::filter_genes(
  df = deseq_res_df,
  gene_col = "gene",
  genes_filt = pc_genes
) %>%
  `rownames<-`(NULL) %>%
  as.data.frame() %>%
  tibble::column_to_rownames("gene")
#> Warning: 3021 genes not found in dataframe gene list: 
#> MAFIP, PRAMEF11, PRAMEF4, PRAMEF10, PRAMEF27, HNRNPCL3, HNRNPCL2, PRAMEF26, PRAMEF13, PRAMEF18, ...
#> ** Search may take a long time to run!

Gene expression boxplot

Rubrary::plot_DEgene_boxplot(
  data = cts_log2UQ,
  anno = anno,
  anno_sig = "dex",
  anno_type = "cell",
  sig_DE = deseq_res_df_PC
)

Volcano plot

DESeq results can be visualized through volcano plots. Volcano plots show that top and bottom differentially expressed genes when sorted by decreasing signed log p value present as outliers on the top right and top left.

rmarkdown::paged_table(
  head(deseq_res_df_PC[order(deseq_res_df_PC$sign_log_p, decreasing = T),]))

rmarkdown::paged_table(
  tail(deseq_res_df_PC[order(deseq_res_df_PC$sign_log_p, decreasing = T),]))


Rubrary::plot_volcano(
  df = tibble::rownames_to_column(deseq_res_df_PC, "gene"),
  names = "gene",
  x = "log2FoldChange", y = "pvalue",
  FCcutoff = 1, pCutoff = 5e-04,
  title = "Airway Treated vs. Untreated DE Genes",
  xlab_high = "\U2191 in treated\n\U2193 in untreated",
  xlab_low = "\U2191 in untreated\n\U2193 in treated",
  subtitle = "Protein coding genes"
)

Citations

[1]

Himes BE, Jiang X, Wagner P, Hu R, Wang Q, Klanderman B, et al. RNA-Seq Transcriptome Profiling Identifies CRISPLD2 as a Glucocorticoid Responsive Gene that Modulates Cytokine Function in Airway Smooth Muscle Cells. PLOS ONE 2014;9:e99625. https://doi.org/10.1371/journal.pone.0099625.

[2]

Love MI, Huber W, Anders S. Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biology 2014;15:550. https://doi.org/10.1186/s13059-014-0550-8.