3 Male v Female DGE

3.1 Transcriptome - edgeR

3.1.1 Libraries

library(dplyr)
library(edgeR)
library(ggfortify)
library(gplots)

3.1.2 Data

SampleInfo <- 
  read.csv("Input Files/metadata.csv", 
           stringsAsFactors = FALSE) %>% 
  mutate(year=as.factor(year)) 

Raw <- read.csv(file = "Output Files/txome_genecounts_locid_nohemo.csv", 
                stringsAsFactors = FALSE,
                header = TRUE, 
                row.names = 1)

3.1.3 Differential Gene Expression Analysis

3.1.3.1 DGElist

DGE <- DGEList(counts=Raw, group=SampleInfo$sex)
DGE$samples
##           group lib.size norm.factors
## Hg11513  female  2842670            1
## Hg1354   female  3618804            1
## Hg1403   female  6100522            1
## Hg1404   female  3533565            1
## Hg908b   female  5871501            1
## Hg1274     male  3026478            1
## Hg1407     male  4238107            1
## Hg1256   female  4573826            1
## Hg1265  unknown  2979315            1
## Hg1288   female  2823770            1
## Hg266    female  3543715            1
## Hg1402     male  6560784            1
## Hg270      male  3772971            1
## Hg1261   female  1943267            1
## Hg1272   female  4708624            1
## Hg1380   female  3204059            1
## Hg1387   female  4158987            1
## Hg1397   female  4343361            1
## Hg1421   female  4508013            1
## Hg1528   female  2894695            1
## Hg861a   female  5407908            1
## Hg862a   female  4075756            1
## Hg934    female  3956989            1
## Hg1262   female  4653972            1
## Hg1273   female  5011031            1
## Hg1282   female  2951861            1
## Hg1283   female  4658064            1
## Hg1286   female  4284639            1
## Hg1351   female  1430600            1
## Hg1516   female  4688415            1
## Hg1527   female  3763080            1

3.1.3.2 Filter out lowly expressed genes

keep <- filterByExpr(DGE)
DGE <- DGE[keep, , keep.lib.sizes=FALSE]
length(rownames(DGE$counts))
## [1] 13093

3.1.3.3 Normalize based on library sizes

DGE <- calcNormFactors(DGE)

3.1.3.4 Estimate Dispersion

On average, the true abundance for each gene can vary up or down by xx% (BCV) between replicates Differentially expressed = needs to vary by more than xx%

# Overall common dispersion
DGE <- estimateCommonDisp(DGE, verbose = TRUE)
## Disp = 0.18267 , BCV = 0.4274
# Dispersion trend based on gene abundance
DGE <- estimateTrendedDisp(DGE)

# Tagwise dispersion - allows different dispersion for each gene
DGE <- estimateTagwiseDisp(DGE, verbose = TRUE)
## Using interpolation to estimate tagwise dispersion.
plotBCV(DGE)

3.1.4 Exploratory Plots

3.1.4.1 Normalized counts

CPM <- 
  cpm(DGE, normalized.lib.sizes = TRUE, log = TRUE) %>% 
  data.frame()
boxplot(CPM, las = 2, ylab = "log2 CPM", main = "Normalized Data")

3.1.4.2 Cluster dendrogram

Males = 1274, 1407, 1402, 270 Unknown = 1265

RawDist <- dist(t(CPM), method = "euclidean")
plot(hclust(RawDist, method = "average"), xlab="Average Euclidean Distance")

3.1.4.3 PCA Plots

PCA <- prcomp(t(RawDist))

# PCs 1-2
autoplot(PCA, data=SampleInfo, colour="sex") +
  theme(legend.position="bottom")

autoplot(PCA, data=SampleInfo, colour="disease_stage") +
  theme(legend.position="bottom")

autoplot(PCA, data=SampleInfo, colour="year") +
  theme(legend.position="bottom")

autoplot(PCA, data=SampleInfo, colour="molt") +
  theme(legend.position="bottom")

# PCs 3-4
autoplot(PCA, data=SampleInfo, x=3, y=4, colour="sex") +
  theme(legend.position="bottom")

autoplot(PCA, data=SampleInfo, x=3, y=4, colour="disease_stage") +
  theme(legend.position="bottom")

autoplot(PCA, data=SampleInfo, x=3, y=4, colour="year") +
  theme(legend.position="bottom")

autoplot(PCA, data=SampleInfo, x=3, y=4, colour="molt") +
  theme(legend.position="bottom")

3.1.5 Test for DEGs

et <- exactTest(DGE, pair=c("male", "female"))

3.1.5.1 Select DEGs by fold change & pvalue

Uncorrected p < 0.05 log2FC > 1 (2-fold change)

results <- topTags(et, n = dim(DGE)[1]) %>% 
  data.frame() %>% 
  filter(abs(logFC) > 1 & PValue < 0.05)
nrow(results)
## [1] 198
write.csv(results, "Output Files/txome_sex_edgeR_degs.csv")

3.1.5.2 DEG Heatmap

jpeg("Figures/transcriptome_sex_edgeR_heatmap.jpg", width=9, height=9, units='in', res=900)
heatmap.2(as.matrix(CPM[rownames(CPM) %in% rownames(results), ]),
          scale = "row", trace = "none", margins=c(5,8), 
          cexRow = 0.5, labRow=FALSE)
dev.off()
## png 
##   2

3.2 Genome - edgeR

3.2.1 Libraries

library(dplyr)
library(edgeR)
library(ggfortify)
library(gplots)

3.2.2 Data

SampleInfo <- 
  read.csv("Input Files/metadata.csv", 
           stringsAsFactors = FALSE) %>% 
  mutate(year=as.factor(year)) 

Raw <- read.csv(file = "Output Files/gnome_genecounts_locid_nohemo.csv", 
                stringsAsFactors = FALSE,
                header = TRUE, 
                row.names = 1)

3.2.3 Differential Gene Expression Analysis

3.2.3.1 DGElist

DGE <- DGEList(counts=Raw, group=SampleInfo$sex)
DGE$samples
##           group lib.size norm.factors
## Hg11513  female  3894792            1
## Hg1354   female  4928497            1
## Hg1403   female  7571672            1
## Hg1404   female  4696012            1
## Hg908b   female  7291936            1
## Hg1274     male  3984216            1
## Hg1407     male  5400490            1
## Hg1256   female  5728747            1
## Hg1265  unknown  3851318            1
## Hg1288   female  3740785            1
## Hg266    female  4414778            1
## Hg1402     male  8016958            1
## Hg270      male  4804685            1
## Hg1261   female  2938200            1
## Hg1272   female  6665010            1
## Hg1380   female  4719984            1
## Hg1387   female  5562169            1
## Hg1397   female  5747795            1
## Hg1421   female  5958925            1
## Hg1528   female  3655704            1
## Hg861a   female  6717503            1
## Hg862a   female  5273447            1
## Hg934    female  5089019            1
## Hg1262   female  5954066            1
## Hg1273   female  6511255            1
## Hg1282   female  3997507            1
## Hg1283   female  5907705            1
## Hg1286   female  5521600            1
## Hg1351   female  2230773            1
## Hg1516   female  6012343            1
## Hg1527   female  4895128            1

3.2.3.2 Filter out lowly expressed genes

keep <- filterByExpr(DGE)
DGE <- DGE[keep, , keep.lib.sizes=FALSE]
length(rownames(DGE$counts))
## [1] 14063

3.2.3.3 Normalize based on library sizes

DGE <- calcNormFactors(DGE)

3.2.3.4 Estimate Dispersion

On average, the true abundance for each gene can vary up or down by xx% (BCV) between replicates Differentially expressed = needs to vary by more than xx%

# Overall common dispersion
DGE <- estimateCommonDisp(DGE, verbose = TRUE)
## Disp = 0.22568 , BCV = 0.4751
# Dispersion trend based on gene abundance
DGE <- estimateTrendedDisp(DGE)

# Tagwise dispersion - allows different dispersion for each gene
DGE <- estimateTagwiseDisp(DGE, verbose = TRUE)
## Using interpolation to estimate tagwise dispersion.
plotBCV(DGE)

3.2.4 Exploratory Plots

3.2.4.1 Normalized counts

CPM <- 
  cpm(DGE, normalized.lib.sizes = TRUE, log = TRUE) %>% 
  data.frame()
boxplot(CPM, las = 2, ylab = "log2 CPM", main = "Normalized Data")

3.2.4.2 Cluster dendrogram

Males = 1274, 1407, 1402, 270 Unknown = 1265

RawDist <- dist(t(CPM), method = "euclidean")
plot(hclust(RawDist, method = "average"), xlab="Average Euclidean Distance")

3.2.4.3 PCA Plots

PCA <- prcomp(t(RawDist))

# PCs 1-2
autoplot(PCA, data=SampleInfo, colour="sex") +
  theme(legend.position="bottom")

autoplot(PCA, data=SampleInfo, colour="disease_stage") +
  theme(legend.position="bottom")

autoplot(PCA, data=SampleInfo, colour="year") +
  theme(legend.position="bottom")

autoplot(PCA, data=SampleInfo, colour="molt") +
  theme(legend.position="bottom")

# PCs 3-4
autoplot(PCA, data=SampleInfo, x=3, y=4, colour="sex") +
  theme(legend.position="bottom")

autoplot(PCA, data=SampleInfo, x=3, y=4, colour="disease_stage") +
  theme(legend.position="bottom")

autoplot(PCA, data=SampleInfo, x=3, y=4, colour="year") +
  theme(legend.position="bottom")

autoplot(PCA, data=SampleInfo, x=3, y=4, colour="molt") +
  theme(legend.position="bottom")

3.2.5 Test for DEGs

et <- exactTest(DGE, pair=c("male", "female"))

3.2.5.1 Select DEGs by fold change & pvalue

Uncorrected p < 0.05 log2FC > 1 (2-fold change)

results <- topTags(et, n = dim(DGE)[1]) %>% 
  data.frame() %>% 
  filter(abs(logFC) > 1 & PValue < 0.05)
nrow(results)
## [1] 343
write.csv(results, "Output Files/gnome_sex_edgeR_degs.csv")

3.2.5.2 DEG Heatmap

jpeg("Figures/genome_sex_edgeR_heatmap.jpg", width=9, height=9, units='in', res=900)
heatmap.2(as.matrix(CPM[rownames(CPM) %in% rownames(results), ]),
          scale = "row", trace = "none", margins=c(5,8), 
          cexRow = 0.5, labRow=FALSE)
dev.off()
## png 
##   2