4 Virus+ vs Virus- DGE

4.1 Transcriptome - edgeR - Females

4.1.1 Libraries

library(tidyverse)
library(edgeR)
library(ggfortify)
library(kableExtra)
library(knitr)
library(WebGestaltR)

4.1.2 Data

SampleInfo <- read.csv("Input Files/metadata.csv", 
                       stringsAsFactors = FALSE) %>% 
  mutate(RNA=ifelse(disease_stage=="acute" | disease_stage=="peak", "virus+", "virus-")) %>% 
  filter(!sex=="male")
  
#Raw RSEM Gene Counts
Raw <- read.csv(file = "Output Files/txome_genecounts_locid_nohemo.csv", 
                stringsAsFactors = FALSE,
                header = TRUE, 
                row.names = 1) %>% 
  select(SampleInfo$sample)

4.1.3 Differential Gene Expression Analysis

4.1.3.1 DGElist

#Create DGE Object with counts & sample groups
DGE <- DGEList(counts = Raw, group = SampleInfo$RNA)

#Check DGE Object Samples, Groups, Lib Size, and Norm Factor
DGE$samples

##          group lib.size norm.factors
## Hg11513 virus+  2842670            1
## Hg1354  virus+  3618804            1
## Hg1403  virus+  6100522            1
## Hg1404  virus+  3533565            1
## Hg908b  virus+  5871501            1
## Hg1256  virus+  4573826            1
## Hg1265  virus+  2979315            1
## Hg1288  virus+  2823770            1
## Hg266   virus+  3543715            1
## Hg1261  virus-  1943267            1
## Hg1272  virus-  4708624            1
## Hg1380  virus-  3204059            1
## Hg1387  virus-  4158987            1
## Hg1397  virus-  4343361            1
## Hg1421  virus-  4508013            1
## Hg1528  virus-  2894695            1
## Hg861a  virus-  5407908            1
## Hg862a  virus-  4075756            1
## Hg934   virus-  3956989            1
## Hg1262  virus-  4653972            1
## Hg1273  virus-  5011031            1
## Hg1282  virus-  2951861            1
## Hg1283  virus-  4658064            1
## Hg1286  virus-  4284639            1
## Hg1351  virus-  1430600            1
## Hg1516  virus-  4688415            1
## Hg1527  virus-  3763080            1

4.1.3.2 Filter out lowly expressed genes

keep <- filterByExpr(DGE)
DGE <- DGE[keep, , keep.lib.sizes=FALSE]
length(rownames(DGE$counts))

## [1] 11387

postfilter <- rownames(DGE$counts)
write.table(postfilter, "Output Files/txome_female_edgeR_viralRNA_postfilter.txt",
            row.name=FALSE, col.name=FALSE, quote=FALSE)

4.1.3.3 Normalize based on library sizes

DGE <- calcNormFactors(DGE)

4.1.3.4 Estimate Dispersion

On average, the true abundance for each gene can vary up or down by xx% (BCV) between replicates Differentially expressed = needs to vary by more than xx%

# Overall common dispersion
DGE <- estimateCommonDisp(DGE, verbose = TRUE)

## Disp = 0.16755 , BCV = 0.4093

# Dispersion trend based on gene abundance
DGE <- estimateTrendedDisp(DGE)

# Tagwise dispersion - allows different dispersion for each gene
DGE <- estimateTagwiseDisp(DGE, verbose = TRUE)

## Using interpolation to estimate tagwise dispersion.

plotBCV(DGE)

4.1.4 Exploratory Plots

4.1.4.1 Normalized counts

CPM <- 
  cpm(DGE, normalized.lib.sizes = TRUE, log = TRUE) %>% 
  data.frame()
boxplot(CPM, las = 2, ylab = "log2 CPM", main = "Normalized Data")

write.csv(CPM, "Output Files/txome_female_edgeR_viralRNA_normcounts.csv")

4.1.4.2 Cluster dendrogram

RawDist <- dist(t(CPM), method = "euclidean")
plot(hclust(RawDist, method = "average"), xlab="Average Euclidean Distance")

4.1.4.3 PCA Plots

PCA <- prcomp(t(RawDist))

# PCs 1-2
autoplot(PCA, data=SampleInfo, colour="RNA") +
  theme(legend.position="bottom")

autoplot(PCA, data=SampleInfo, colour="sex") +
  theme(legend.position="bottom")

autoplot(PCA, data=SampleInfo, colour="year") +
  theme(legend.position="bottom")

autoplot(PCA, data=SampleInfo, colour="molt") +
  theme(legend.position="bottom")

# PCs 3-4
autoplot(PCA, data=SampleInfo, x=3, y=4, colour="RNA") +
  theme(legend.position="bottom")

autoplot(PCA, data=SampleInfo, x=3, y=4, colour="sex") +
  theme(legend.position="bottom")

autoplot(PCA, data=SampleInfo, x=3, y=4, colour="year") +
  theme(legend.position="bottom")

autoplot(PCA, data=SampleInfo, x=3, y=4, colour="molt") +
  theme(legend.position="bottom")

4.1.5 Test for DEGs

et <- exactTest(DGE, pair=c("virus-", "virus+"))

write.csv(et, "Output Files/txome_female_edgeR_viralRNA_DGE_results.csv")

4.1.5.1 Select DEGs by fold change & pvalue

Uncorrected p < 0.05 log2FC > 1 (2-fold change)

results <- 
  topTags(et, n = dim(DGE)[1]) %>% 
  data.frame() %>% 
  filter(abs(logFC) > 1 & PValue < 0.05) %>% 
  rownames_to_column(var="gene")
nrow(results)

## [1] 136

write.csv(results, "Output Files/txome_female_edgeR_viralRNA_DEG.csv")

4.1.5.2 Make sure genes are up/down regulated in Virus+ vs Virus-

Visually compare gene expression across viral RNA status to ensure directional change is correct.

CPM %>% 
  rownames_to_column(var="gene") %>% 
  filter(gene %in% results$gene) %>% 
  pivot_longer(cols=!gene, names_to="sample", values_to="expression") %>% 
  mutate(RNA=ifelse(sample %in% SampleInfo$sample, 
                              SampleInfo[match(sample, SampleInfo$sample), 8],
                              NA)) %>% 
  filter(gene %in% results$gene[1:10]) %>% 
  mutate(RNA=factor(RNA, levels=c("virus-", "virus+"))) %>% 
  ggplot(., aes(x=RNA, y=expression)) +
  geom_boxplot() +
  facet_wrap(~gene)

4.1.5.3 Heatmap

# Add +/- to sample names to indicate IAV status
sampiav <- 
  SampleInfo %>% 
  select(sample, RNA) %>% 
  mutate(sample=ifelse(RNA=="virus+", 
                       paste0(sample, " (+)"), 
                       paste0(sample, " (-)")))

CPMiav <-
  CPM %>% 
  as.data.frame() %>%
  setNames(sampiav$sample) %>% 
  as.matrix()


# Heatmap of top 100 DE genes (50 up and 50 down)
topde <-
  data.frame(rbind(slice_min(results, logFC, n=50), 
                   slice_max(results, logFC, n=50))) 

jpeg(filename="Figures/heatmap_top100.jpeg", width=8, height=5, units="in", res=300)
heatmap.2(as.matrix(CPMiav[rownames(CPMiav) %in% topde$gene,]),
          scale="row", 
          trace="none", 
          dendrogram="column", 
          key=FALSE, 
          labRow=FALSE,
          lmat=rbind(c(4,3), c(2,1)),
          lwid=c(0.05,1),
          lhei=c(0.2, 1), 
          margins=c(5,2.5))
dev.off()

## png 
##   2

## All genes - gene expression patterns not clear
heatmap.2(as.matrix(CPMiav),
          scale="row", 
          trace="none", 
          dendrogram="column", 
          key=FALSE, 
          labRow=FALSE,
          lmat=rbind(c(4,3), c(2,1)),
          lwid=c(0.05,1),
          lhei=c(0.2, 1), 
          colsep=c(2,5),
          margins=c(5,2.5))

4.1.6 GO Enrichment Analysis

4.1.6.1 Configure Gene Lists

background <- rownames(CPM)

up <- 
  results %>% 
  filter(logFC>=1)

down <-
  results %>% 
  filter(logFC < 1)

4.1.6.2 Up-regulated genes

4.1.6.2.1 Biological Process

go_bp_up <-
    WebGestaltR(enrichMethod="ORA", organism="hsapiens", 
                enrichDatabase="geneontology_Biological_Process",
                interestGene = up$gene, interestGeneType = "genesymbol", 
                referenceGene = background, referenceGeneType = "genesymbol", 
                sigMethod = "fdr", fdrThr = 0.05, 
                isOutput = FALSE) %>% 
  mutate(direction="up",
         go="bp")

## Loading the functional categories...
## Loading the ID list...
## Loading the reference list...
## Performing the enrichment analysis...

4.1.6.2.2 Molecular Function

go_mf_up <- 
    WebGestaltR(enrichMethod="ORA", organism="hsapiens", 
                enrichDatabase="geneontology_Molecular_Function",
                interestGene = up$gene, interestGeneType = "genesymbol", 
                referenceGene = background, referenceGeneType = "genesymbol", 
                sigMethod = "fdr", fdrThr = 0.05, 
                isOutput = FALSE)

## Loading the functional categories...
## Loading the ID list...
## Loading the reference list...
## Performing the enrichment analysis...

## Warning in oraEnrichment(interestGeneList, referenceGeneList, geneSet, minNum =
## minNum, : No significant gene set is identified based on FDR 0.05!

4.1.6.2.3 Cellular Component

go_cc_up <- 
    WebGestaltR(enrichMethod="ORA", organism="hsapiens", 
                enrichDatabase="geneontology_Cellular_Component",
                interestGene = up$gene, interestGeneType = "genesymbol", 
                referenceGene = background, referenceGeneType = "genesymbol", 
                sigMethod = "fdr", fdrThr = 0.05, 
                isOutput = FALSE)

## Loading the functional categories...
## Loading the ID list...
## Loading the reference list...
## Performing the enrichment analysis...

## Warning in oraEnrichment(interestGeneList, referenceGeneList, geneSet, minNum =
## minNum, : No significant gene set is identified based on FDR 0.05!

4.1.6.3 Down-regulated Genes

4.1.6.3.1 Biological Process

go_bp_down <- 
    WebGestaltR(enrichMethod="ORA", organism="hsapiens", 
                enrichDatabase="geneontology_Biological_Process",
                interestGene = down$gene, interestGeneType = "genesymbol", 
                referenceGene = background, referenceGeneType = "genesymbol", 
                sigMethod = "fdr", fdrThr = 0.05, 
                isOutput = FALSE) %>% 
  mutate(direction="down",
         go="bp")

## Loading the functional categories...
## Loading the ID list...
## Loading the reference list...
## Performing the enrichment analysis...

4.1.6.3.2 Molecular Function

go_mf_down <- 
    WebGestaltR(enrichMethod="ORA", organism="hsapiens", 
                enrichDatabase="geneontology_Molecular_Function",
                interestGene = down$gene, interestGeneType = "genesymbol", 
                referenceGene = background, referenceGeneType = "genesymbol", 
                sigMethod = "fdr", fdrThr = 0.05, 
                isOutput = FALSE) %>% 
  mutate(direction="down", 
         go="mf")

## Loading the functional categories...
## Loading the ID list...
## Loading the reference list...
## Performing the enrichment analysis...

4.1.6.3.3 Cellular Component

go_cc_down <- 
    WebGestaltR(enrichMethod="ORA", organism="hsapiens", 
                enrichDatabase="geneontology_Cellular_Component",
                interestGene = down$gene, interestGeneType = "genesymbol", 
                referenceGene = background, referenceGeneType = "genesymbol", 
                sigMethod = "fdr", fdrThr = 0.05, 
                isOutput = FALSE) %>% 
  mutate(direction="down", 
         go="cc")

## Loading the functional categories...
## Loading the ID list...
## Loading the reference list...
## Performing the enrichment analysis...

4.1.6.4 Save Results

go <-
  bind_rows(go_bp_down, go_bp_up,
            go_cc_down, 
            go_mf_down)

go <- 
  go %>% 
  select(!link & !overlapId) 

go %>% 
  mutate(across(where(is.numeric), \(x) round(x, digits=5))) %>% 
  kable() %>% 
  kable_styling("basic", font_size=9)

geneSet	description	size	overlap	expect	enrichmentRatio	pValue	FDR	userId	direction	go
GO:0045087	innate immune response	491	26	5.07520	5.12295	0.00000	0.00000	IFI6;C1QB;NR1H3;ACTG1;C1QC;C2;C1QA;NECTIN2;CFB;CFD;HLA-DQA1;MNDA;UBC;APOBEC3G;DHX58;RPL13A;APOE;MX2;AXL;SOCS1;UBA52;ISG15;IFIT3;AIF1;MX1;RSAD2	down	bp
GO:0002253	activation of immune response	370	18	3.82449	4.70651	0.00000	0.00009	C1QB;NR1H3;ACTG1;C1QC;C2;C1QA;RPS3;NECTIN2;CD79A;CFB;ACTB;CFD;HLA-DQA1;MNDA;UBC;DHX58;UBA52;RSAD2	down	bp
GO:0050778	positive regulation of immune response	476	20	4.92016	4.06491	0.00000	0.00010	C1QB;NR1H3;ACTG1;C1QC;C2;C1QA;RPS3;NECTIN2;HLX;CD79A;CFB;IDO1;ACTB;CFD;HLA-DQA1;MNDA;UBC;DHX58;UBA52;RSAD2	down	bp
GO:0072376	protein activation cascade	38	7	0.39279	17.82143	0.00000	0.00012	C1QB;C1QC;C2;GP1BB;C1QA;CFB;CFD	down	bp
GO:0006956	complement activation	27	6	0.27908	21.49887	0.00000	0.00026	C1QB;C1QC;C2;C1QA;CFB;CFD	down	bp
GO:0019221	cytokine-mediated signaling pathway	458	18	4.73410	3.80220	0.00000	0.00070	IL32;IFI6;NR1H3;RACK1;HLA-DQA1;UBC;MPL;HSPA8;CARD14;MX2;AXL;SOCS1;UBA52;ISG15;COL1A2;IFIT3;MX1;RSAD2	down	bp
GO:0019058	viral life cycle	208	12	2.14998	5.58144	0.00000	0.00101	ITGB5;NECTIN2;UBC;HSPA8;APOBEC3G;APOE;AXL;UBA52;ISG15;DDX5;MX1;RSAD2	down	bp
GO:0006614	SRP-dependent cotranslational protein targeting to membrane	83	8	0.85793	9.32481	0.00000	0.00117	RPS26;RPL36;RPS3;RPS14;RPL13A;RPL14;UBA52;RPS12	down	bp
GO:0031347	regulation of defense response	438	17	4.52737	3.75494	0.00000	0.00117	C1QB;NR1H3;C1QC;C2;C1QA;NECTIN2;GPSM3;CFB;IDO1;MNDA;UBC;APOBEC3G;DHX58;APOE;SOCS1;UBA52;RSAD2	down	bp
GO:0030449	regulation of complement activation	22	5	0.22740	21.98748	0.00000	0.00123	C1QB;C1QC;C2;C1QA;CFB	down	bp
GO:0006613	cotranslational protein targeting to membrane	87	8	0.89927	8.89608	0.00000	0.00129	RPS26;RPL36;RPS3;RPS14;RPL13A;RPL14;UBA52;RPS12	down	bp
GO:2000257	regulation of protein activation cascade	23	5	0.23774	21.03150	0.00000	0.00129	C1QB;C1QC;C2;C1QA;CFB	down	bp
GO:0045047	protein targeting to ER	93	8	0.96129	8.32214	0.00000	0.00187	RPS26;RPL36;RPS3;RPS14;RPL13A;RPL14;UBA52;RPS12	down	bp
GO:0072599	establishment of protein localization to endoplasmic reticulum	96	8	0.99230	8.06207	0.00001	0.00220	RPS26;RPL36;RPS3;RPS14;RPL13A;RPL14;UBA52;RPS12	down	bp
GO:0006958	complement activation, classical pathway	13	4	0.13437	29.76766	0.00001	0.00239	C1QB;C1QC;C2;C1QA	down	bp
GO:0006413	translational initiation	167	10	1.72619	5.79311	0.00001	0.00254	RPS26;RPL36;RPS3;RPS14;ATF4;RPL13A;EIF4G2;RPL14;UBA52;RPS12	down	bp
GO:0002920	regulation of humoral immune response	31	5	0.32043	15.60402	0.00001	0.00407	C1QB;C1QC;C2;C1QA;CFB	down	bp
GO:0000184	nuclear-transcribed mRNA catabolic process, nonsense-mediated decay	108	8	1.11634	7.16629	0.00001	0.00407	RPS26;RPL36;RPS3;RPS14;RPL13A;RPL14;UBA52;RPS12	down	bp
GO:0060337	type I interferon signaling pathway	53	6	0.54783	10.95225	0.00002	0.00407	IFI6;MX2;ISG15;IFIT3;MX1;RSAD2	down	bp
GO:0071357	cellular response to type I interferon	53	6	0.54783	10.95225	0.00002	0.00407	IFI6;MX2;ISG15;IFIT3;MX1;RSAD2	down	bp
GO:0006612	protein targeting to membrane	147	9	1.51946	5.92316	0.00002	0.00472	RPS26;RPL36;RPS3;RPS14;RPL13A;RPL14;UBA52;RPS12;RTP5	down	bp
GO:0034340	response to type I interferon	57	6	0.58918	10.18367	0.00002	0.00566	IFI6;MX2;ISG15;IFIT3;MX1;RSAD2	down	bp
GO:0050866	negative regulation of cell activation	118	8	1.21970	6.55898	0.00003	0.00613	BPI;NR1H3;HLX;IDO1;MNDA;APOE;AXL;SOCS1	down	bp
GO:0070972	protein localization to endoplasmic reticulum	119	8	1.23004	6.50386	0.00003	0.00625	RPS26;RPL36;RPS3;RPS14;RPL13A;RPL14;UBA52;RPS12	down	bp
GO:0006959	humoral immune response	93	7	0.96129	7.28187	0.00005	0.00936	BPI;C1QB;C1QC;C2;C1QA;CFB;CFD	down	bp
GO:0045069	regulation of viral genome replication	65	6	0.67187	8.93030	0.00005	0.01010	HSPA8;APOBEC3G;ISG15;DDX5;MX1;RSAD2	down	bp
GO:0098542	defense response to other organism	252	11	2.60479	4.22299	0.00005	0.01010	BPI;IFI6;BATF2;CEBPE;APOBEC3G;DHX58;MX2;ISG15;IFIT3;MX1;RSAD2	down	bp
GO:0050792	regulation of viral process	130	8	1.34374	5.95353	0.00006	0.01010	NECTIN2;HSPA8;APOBEC3G;APOE;ISG15;DDX5;MX1;RSAD2	down	bp
GO:1903900	regulation of viral life cycle	100	7	1.03365	6.77214	0.00007	0.01285	NECTIN2;HSPA8;APOBEC3G;ISG15;DDX5;MX1;RSAD2	down	bp
GO:0002455	humoral immune response mediated by circulating immunoglobulin	23	4	0.23774	16.82520	0.00008	0.01368	C1QB;C1QC;C2;C1QA	down	bp
GO:0006402	mRNA catabolic process	270	11	2.79084	3.94146	0.00010	0.01646	RPS26;RPL36;RPS3;RPS14;UBC;HSPA8;RPL13A;RPL14;UBA52;RPS12;DDX5	down	bp
GO:0002695	negative regulation of leukocyte activation	106	7	1.09567	6.38881	0.00011	0.01670	BPI;NR1H3;HLX;IDO1;MNDA;AXL;SOCS1	down	bp
GO:0000956	nuclear-transcribed mRNA catabolic process	183	9	1.89157	4.75795	0.00011	0.01670	RPS26;RPL36;RPS3;RPS14;RPL13A;RPL14;UBA52;RPS12;DDX5	down	bp
GO:0002673	regulation of acute inflammatory response	47	5	0.48581	10.29201	0.00012	0.01711	C1QB;C1QC;C2;C1QA;CFB	down	bp
GO:0043903	regulation of symbiosis, encompassing mutualism through parasitism	146	8	1.50912	5.30109	0.00013	0.01831	NECTIN2;HSPA8;APOBEC3G;APOE;ISG15;DDX5;MX1;RSAD2	down	bp
GO:0030168	platelet activation	112	7	1.15768	6.04656	0.00015	0.02091	ACTG1;GP1BB;ACTB;MPL;APOE;AXL;COL1A2	down	bp
GO:0009636	response to toxic substance	283	11	2.92522	3.76040	0.00015	0.02091	G6PD;RPS3;RACK1;EEF2;ACTB;PCGF2;ATF4;APOE;AXL;UBA52;TXNIP	down	bp
GO:0051607	defense response to virus	151	8	1.56081	5.12556	0.00016	0.02130	IFI6;APOBEC3G;DHX58;MX2;ISG15;IFIT3;MX1;RSAD2	down	bp
GO:0002757	immune response-activating signal transduction	338	12	3.49372	3.43473	0.00018	0.02301	NR1H3;ACTG1;RPS3;NECTIN2;CD79A;ACTB;HLA-DQA1;MNDA;UBC;DHX58;UBA52;RSAD2	down	bp
GO:0010608	posttranscriptional regulation of gene expression	392	13	4.05189	3.20838	0.00019	0.02336	RPS3;RACK1;EEF2;RPS14;EIF5A;UBC;NCL;HSPA8;ATF4;RPL13A;EIF4G2;UBA52;DDX5	down	bp
GO:0006401	RNA catabolic process	292	11	3.01825	3.64450	0.00020	0.02481	RPS26;RPL36;RPS3;RPS14;UBC;HSPA8;RPL13A;RPL14;UBA52;RPS12;DDX5	down	bp
GO:0090150	establishment of protein localization to membrane	245	10	2.53243	3.94877	0.00021	0.02517	RPS26;RPL36;RPS3;RACK1;RPS14;RPL13A;RPL14;UBA52;RPS12;RTP5	down	bp
GO:0019079	viral genome replication	85	6	0.87860	6.82905	0.00024	0.02765	HSPA8;APOBEC3G;ISG15;DDX5;MX1;RSAD2	down	bp
GO:0070613	regulation of protein processing	57	5	0.58918	8.48639	0.00029	0.03326	C1QB;C1QC;C2;C1QA;CFB	down	bp
GO:0002697	regulation of immune effector process	256	10	2.64613	3.77910	0.00030	0.03346	C1QB;C1QC;C2;C1QA;NECTIN2;HLX;CFB;APOBEC3G;DHX58;RSAD2	down	bp
GO:0050727	regulation of inflammatory response	210	9	2.17066	4.14621	0.00031	0.03374	C1QB;NR1H3;C1QC;C2;C1QA;GPSM3;CFB;IDO1;APOE	down	bp
GO:0002764	immune response-regulating signaling pathway	360	12	3.72113	3.22483	0.00032	0.03415	NR1H3;ACTG1;RPS3;NECTIN2;CD79A;ACTB;HLA-DQA1;MNDA;UBC;DHX58;UBA52;RSAD2	down	bp
GO:1903317	regulation of protein maturation	59	5	0.60985	8.19872	0.00034	0.03466	C1QB;C1QC;C2;C1QA;CFB	down	bp
GO:0050865	regulation of cell activation	363	12	3.75214	3.19818	0.00034	0.03466	BPI;NR1H3;RPS3;NECTIN2;HLX;IDO1;MNDA;MPL;APOE;AXL;SOCS1;AIF1	down	bp
GO:0045071	negative regulation of viral genome replication	33	4	0.34110	11.72665	0.00035	0.03466	APOBEC3G;ISG15;MX1;RSAD2	down	bp
GO:0007596	blood coagulation	214	9	2.21200	4.06871	0.00036	0.03466	ACTG1;GP1BB;USF1;ACTB;MPL;HMG20B;APOE;AXL;COL1A2	down	bp
GO:0035455	response to interferon-alpha	14	3	0.14471	20.73105	0.00036	0.03466	MX2;AXL;IFIT3	down	bp
GO:0050817	coagulation	216	9	2.23268	4.03104	0.00038	0.03604	ACTG1;GP1BB;USF1;ACTB;MPL;HMG20B;APOE;AXL;COL1A2	down	bp
GO:0002683	negative regulation of immune system process	265	10	2.73916	3.65075	0.00039	0.03671	BPI;NR1H3;C1QC;HLX;IDO1;MNDA;DHX58;TSC22D3;AXL;SOCS1	down	bp
GO:0007599	hemostasis	218	9	2.25335	3.99406	0.00041	0.03716	ACTG1;GP1BB;USF1;ACTB;MPL;HMG20B;APOE;AXL;COL1A2	down	bp
GO:1901361	organic cyclic compound catabolic process	484	14	5.00285	2.79841	0.00042	0.03808	RPS26;RPL36;RPS3;RPS14;IDO1;UBC;HSPA8;APOBEC3G;RPL13A;APOE;RPL14;UBA52;RPS12;DDX5	down	bp
GO:0006412	translation	489	14	5.05453	2.76979	0.00047	0.04148	RPS26;RPL36;RPS3;RACK1;EEF2;RPS14;EIF5A;NCL;ATF4;RPL13A;EIF4G2;RPL14;UBA52;RPS12	down	bp
GO:0006605	protein targeting	324	11	3.34901	3.28455	0.00049	0.04273	RPS26;RPL36;RPS3;RPS14;UBC;HSPA8;RPL13A;RPL14;UBA52;RPS12;RTP5	down	bp
GO:0015701	bicarbonate transport	17	3	0.03228	92.95098	0.00000	0.01930	CA2;SLC4A1;RHAG	up	bp
GO:0072562	blood microparticle	46	7	0.46867	14.93587	0.00000	0.00014	C1QB;ACTG1;C1QC;CFB;ACTB;HSPA8;APOE	down	cc
GO:0022626	cytosolic ribosome	94	9	0.95772	9.39734	0.00000	0.00014	RPS26;RPL36;RPS3;RACK1;RPS14;RPL13A;RPL14;UBA52;RPS12	down	cc
GO:0022627	cytosolic small ribosomal subunit	35	6	0.35660	16.82571	0.00000	0.00027	RPS26;RPS3;RACK1;RPS14;UBA52;RPS12	down	cc
GO:0005840	ribosome	192	10	1.95619	5.11198	0.00002	0.00279	RPS26;RPL36;RPS3;RACK1;EEF2;RPS14;RPL13A;RPL14;UBA52;RPS12	down	cc
GO:0015935	small ribosomal subunit	58	6	0.59093	10.15345	0.00003	0.00279	RPS26;RPS3;RACK1;RPS14;UBA52;RPS12	down	cc
GO:0044445	cytosolic part	193	10	1.96638	5.08549	0.00003	0.00279	RPS26;RPL36;RPS3;RACK1;RPS14;HSPA8;RPL13A;RPL14;UBA52;RPS12	down	cc
GO:0044391	ribosomal subunit	157	9	1.59959	5.62643	0.00003	0.00281	RPS26;RPL36;RPS3;RACK1;RPS14;RPL13A;RPL14;UBA52;RPS12	down	cc
GO:0005581	collagen trimer	28	4	0.28528	14.02143	0.00017	0.01412	C1QB;C1QC;C1QA;COL1A2	down	cc
GO:0005198	structural molecule activity	407	17	4.17174	4.07504	0.00000	0.00060	RPS26;ACTG1;SORBS3;RPL36;RPS3;RACK1;DES;RPS14;ACTB;HSPB6;RPL13A;APOE;TPM2;RPL14;UBA52;RPS12;COL1A2	down	mf
GO:0003735	structural constituent of ribosome	133	8	1.36325	5.86834	0.00006	0.02771	RPS26;RPL36;RPS3;RPS14;RPL13A;RPL14;UBA52;RPS12	down	mf
GO:0004252	serine-type endopeptidase activity	74	6	0.75850	7.91037	0.00010	0.03103	C1QB;C1QC;C2;C1QA;CFB;CFD	down	mf

write.csv(go, "Output Files/txome_female_edgeR_viralRNA_go.csv",
          row.names=FALSE)

4.1.7 KEGG Analysis

kegg <-
  WebGestaltR(enrichMethod = "ORA", organism = "hsapiens", enrichDatabase = "pathway_KEGG",
              interestGene = results$gene, interestGeneType = "genesymbol", 
              referenceGene = background, referenceGeneType = "genesymbol", 
              minNum = 5, sigMethod = "fdr", isOutput = FALSE)

## Loading the functional categories...
## Loading the ID list...
## Loading the reference list...
## Performing the enrichment analysis...

write.csv(kegg, "Output Files/txome_female_edgeR_viralRNA_kegg.csv")

4.2 Genome - edgeR - Females

4.2.1 Libraries

library(tidyverse)
library(edgeR)
library(ggfortify)
library(kableExtra)
library(knitr)
library(WebGestaltR)

4.2.2 Data

SampleInfo <- read.csv("Input Files/metadata.csv", 
                       stringsAsFactors = FALSE) %>% 
  mutate(RNA=ifelse(disease_stage=="acute" | disease_stage=="peak", "virus+", "virus-")) %>% 
  filter(!sex=="male")
  
#Raw RSEM Gene Counts
Raw <- read.csv(file = "Output Files/gnome_genecounts_locid_nohemo.csv", 
                stringsAsFactors = FALSE,
                header = TRUE, 
                row.names = 1) %>% 
  select(SampleInfo$sample)

4.2.3 Differential Gene Expression Analysis

4.2.3.1 DGElist

#Create DGE Object with counts & sample groups
DGE <- DGEList(counts = Raw, group = SampleInfo$RNA)

#Check DGE Object Samples, Groups, Lib Size, and Norm Factor
DGE$samples

##          group lib.size norm.factors
## Hg11513 virus+  3894792            1
## Hg1354  virus+  4928497            1
## Hg1403  virus+  7571672            1
## Hg1404  virus+  4696012            1
## Hg908b  virus+  7291936            1
## Hg1256  virus+  5728747            1
## Hg1265  virus+  3851318            1
## Hg1288  virus+  3740785            1
## Hg266   virus+  4414778            1
## Hg1261  virus-  2938200            1
## Hg1272  virus-  6665010            1
## Hg1380  virus-  4719984            1
## Hg1387  virus-  5562169            1
## Hg1397  virus-  5747795            1
## Hg1421  virus-  5958925            1
## Hg1528  virus-  3655704            1
## Hg861a  virus-  6717503            1
## Hg862a  virus-  5273447            1
## Hg934   virus-  5089019            1
## Hg1262  virus-  5954066            1
## Hg1273  virus-  6511255            1
## Hg1282  virus-  3997507            1
## Hg1283  virus-  5907705            1
## Hg1286  virus-  5521600            1
## Hg1351  virus-  2230773            1
## Hg1516  virus-  6012343            1
## Hg1527  virus-  4895128            1

4.2.3.2 Filter out lowly expressed genes

keep <- filterByExpr(DGE)
DGE <- DGE[keep, , keep.lib.sizes=FALSE]
length(rownames(DGE$counts))

## [1] 11969

postfilter <- rownames(DGE$counts)
write.table(postfilter, "Output Files/gnome_female_edgeR_viralRNA_postfilter.txt",
            row.name=FALSE, col.name=FALSE, quote=FALSE)

4.2.3.3 Normalize based on library sizes

DGE <- calcNormFactors(DGE)

4.2.3.4 Estimate Dispersion

On average, the true abundance for each gene can vary up or down by xx% (BCV) between replicates Differentially expressed = needs to vary by more than xx%

# Overall common dispersion
DGE <- estimateCommonDisp(DGE, verbose = TRUE)

## Disp = 0.20107 , BCV = 0.4484

# Dispersion trend based on gene abundance
DGE <- estimateTrendedDisp(DGE)

# Tagwise dispersion - allows different dispersion for each gene
DGE <- estimateTagwiseDisp(DGE, verbose = TRUE)

## Using interpolation to estimate tagwise dispersion.

plotBCV(DGE)

4.2.4 Exploratory Plots

4.2.4.1 Normalized counts

CPM <- 
  cpm(DGE, normalized.lib.sizes = TRUE, log = TRUE) %>% 
  data.frame()
boxplot(CPM, las = 2, ylab = "log2 CPM", main = "Normalized Data")

write.csv(CPM, "Output Files/gnome_female_edgeR_viralRNA_normcounts.csv")

4.2.4.2 Cluster dendrogram

RawDist <- dist(t(CPM), method = "euclidean")
plot(hclust(RawDist, method = "average"), xlab="Average Euclidean Distance")

4.2.4.3 PCA Plots

PCA <- prcomp(t(RawDist))

# PCs 1-2
autoplot(PCA, data=SampleInfo, colour="RNA") +
  theme(legend.position="bottom")

autoplot(PCA, data=SampleInfo, colour="sex") +
  theme(legend.position="bottom")

autoplot(PCA, data=SampleInfo, colour="year") +
  theme(legend.position="bottom")

autoplot(PCA, data=SampleInfo, colour="molt") +
  theme(legend.position="bottom")

# PCs 3-4
autoplot(PCA, data=SampleInfo, x=3, y=4, colour="RNA") +
  theme(legend.position="bottom")

autoplot(PCA, data=SampleInfo, x=3, y=4, colour="sex") +
  theme(legend.position="bottom")

autoplot(PCA, data=SampleInfo, x=3, y=4, colour="year") +
  theme(legend.position="bottom")

autoplot(PCA, data=SampleInfo, x=3, y=4, colour="molt") +
  theme(legend.position="bottom")

4.2.5 Test for DEGs

et <- exactTest(DGE, pair=c("virus-", "virus+"))

write.csv(et, "Output Files/gnome_female_edgeR_viralRNA_DGE_results.csv")

4.2.5.1 Select DEGs by fold change & pvalue

Uncorrected p < 0.05 log2FC > 1 (2-fold change)

results <- 
  topTags(et, n = dim(DGE)[1]) %>% 
  data.frame() %>% 
  filter(abs(logFC) > 1 & PValue < 0.05) %>% 
  rownames_to_column(var="gene")
nrow(results)

## [1] 132

write.csv(results, "Output Files/gnome_female_edgeR_viralRNA_DEG.csv")

4.2.5.2 Make sure genes are up/down regulated in Virus+ vs Virus-

Visually compare gene expression across viral RNA status to ensure directional change is correct.

CPM %>% 
  rownames_to_column(var="gene") %>% 
  filter(gene %in% results$gene) %>% 
  pivot_longer(cols=!gene, names_to="sample", values_to="expression") %>% 
  mutate(RNA=ifelse(sample %in% SampleInfo$sample, 
                              SampleInfo[match(sample, SampleInfo$sample), 8],
                              NA)) %>% 
  filter(gene %in% results$gene[1:10]) %>% 
  mutate(RNA=factor(RNA, levels=c("virus-", "virus+"))) %>% 
  ggplot(., aes(x=RNA, y=expression)) +
  geom_boxplot() +
  facet_wrap(~gene)

4.2.6 GO Enrichment Analysis

4.2.6.1 Configure Gene Lists

background <- rownames(CPM)

up <- 
  results %>% 
  filter(logFC>=1)
down <-
  results %>% 
  filter(logFC < 1)

4.2.6.2 Up-regulated genes

4.2.6.2.1 Biological Process

go_bp_up <-
    WebGestaltR(enrichMethod="ORA", organism="hsapiens", 
                enrichDatabase="geneontology_Biological_Process",
                interestGene = up$gene, interestGeneType = "genesymbol", 
                referenceGene = background, referenceGeneType = "genesymbol", 
                sigMethod = "fdr", fdrThr = 0.05, 
                isOutput = FALSE) %>% 
  mutate(direction="up",
         go="bp")

## Loading the functional categories...
## Loading the ID list...
## Loading the reference list...
## Performing the enrichment analysis...

4.2.6.2.2 Molecular Function

go_mf_up <- 
    WebGestaltR(enrichMethod="ORA", organism="hsapiens", 
                enrichDatabase="geneontology_Molecular_Function",
                interestGene = up$gene, interestGeneType = "genesymbol", 
                referenceGene = background, referenceGeneType = "genesymbol", 
                sigMethod = "fdr", fdrThr = 0.05, 
                isOutput = FALSE)

## Loading the functional categories...
## Loading the ID list...
## Loading the reference list...
## Performing the enrichment analysis...

## Warning in oraEnrichment(interestGeneList, referenceGeneList, geneSet, minNum =
## minNum, : No significant gene set is identified based on FDR 0.05!

4.2.6.2.3 Cellular Component

go_cc_up <- 
    WebGestaltR(enrichMethod="ORA", organism="hsapiens", 
                enrichDatabase="geneontology_Cellular_Component",
                interestGene = up$gene, interestGeneType = "genesymbol", 
                referenceGene = background, referenceGeneType = "genesymbol", 
                sigMethod = "fdr", fdrThr = 0.05, 
                isOutput = FALSE)

## Loading the functional categories...
## Loading the ID list...
## Loading the reference list...
## Performing the enrichment analysis...

## Warning in oraEnrichment(interestGeneList, referenceGeneList, geneSet, minNum =
## minNum, : No significant gene set is identified based on FDR 0.05!

4.2.6.3 Down-regulated Genes

4.2.6.3.1 Biological Process

go_bp_down <- 
    WebGestaltR(enrichMethod="ORA", organism="hsapiens", 
                enrichDatabase="geneontology_Biological_Process",
                interestGene = down$gene, interestGeneType = "genesymbol", 
                referenceGene = background, referenceGeneType = "genesymbol", 
                sigMethod = "fdr", fdrThr = 0.05, 
                isOutput = FALSE) %>% 
  mutate(direction="down",
         go="bp")

## Loading the functional categories...
## Loading the ID list...
## Loading the reference list...
## Performing the enrichment analysis...

4.2.6.3.2 Molecular Function

go_mf_down <- 
    WebGestaltR(enrichMethod="ORA", organism="hsapiens", 
                enrichDatabase="geneontology_Molecular_Function",
                interestGene = down$gene, interestGeneType = "genesymbol", 
                referenceGene = background, referenceGeneType = "genesymbol", 
                sigMethod = "fdr", fdrThr = 0.05, 
                isOutput = FALSE) %>% 
  mutate(direction="down", 
         go="mf")

## Loading the functional categories...
## Loading the ID list...
## Loading the reference list...
## Performing the enrichment analysis...

4.2.6.3.3 Cellular Component

go_cc_down <- 
    WebGestaltR(enrichMethod="ORA", organism="hsapiens", 
                enrichDatabase="geneontology_Cellular_Component",
                interestGene = down$gene, interestGeneType = "genesymbol", 
                referenceGene = background, referenceGeneType = "genesymbol", 
                sigMethod = "fdr", fdrThr = 0.05, 
                isOutput = FALSE) %>% 
  mutate(direction="down", 
         go="cc")

## Loading the functional categories...
## Loading the ID list...
## Loading the reference list...
## Performing the enrichment analysis...

4.2.6.4 Save Results

go <-
  bind_rows(go_bp_down, go_bp_up,
            go_cc_down,
            go_mf_down)

go <- 
  go %>% 
  select(!link & !overlapId) 

go %>% 
  mutate(across(where(is.numeric), \(x) round(x, digits=5))) %>% 
  kable() %>% 
  kable_styling("basic", font_size=9)

geneSet	description	size	overlap	expect	enrichmentRatio	pValue	FDR	userId	direction	go
GO:0060337	type I interferon signaling pathway	59	8	0.50066	15.97902	0.00000	0.00005	IFI6;IFITM3;OASL;MX1;MX2;IRF7;ZBP1;ISG15	down	bp
GO:0071357	cellular response to type I interferon	59	8	0.50066	15.97902	0.00000	0.00005	IFI6;IFITM3;OASL;MX1;MX2;IRF7;ZBP1;ISG15	down	bp
GO:0072376	protein activation cascade	39	7	0.33094	21.15171	0.00000	0.00005	C1QB;C2;C1QC;C1QA;GP1BB;CFB;CFD	down	bp
GO:0034340	response to type I interferon	63	8	0.53460	14.96447	0.00000	0.00006	IFI6;IFITM3;OASL;MX1;MX2;IRF7;ZBP1;ISG15	down	bp
GO:0006956	complement activation	27	6	0.22911	26.18783	0.00000	0.00008	C1QB;C2;C1QC;C1QA;CFB;CFD	down	bp
GO:0006959	humoral immune response	97	9	0.82311	10.93409	0.00000	0.00010	C1QB;C2;C1QC;C1QA;CFB;CFD;LCN2;CXCL10;CAMP	down	bp
GO:0098542	defense response to other organism	270	13	2.29114	5.67403	0.00000	0.00028	IFI6;BATF2;IFITM3;OASL;APOBEC3G;MX1;MX2;DHX58;IRF7;LCN2;ISG15;CXCL10;CAMP	down	bp
GO:0030449	regulation of complement activation	22	5	0.18669	26.78301	0.00000	0.00058	C1QB;C2;C1QC;C1QA;CFB	down	bp
GO:0051607	defense response to virus	163	10	1.38317	7.22977	0.00000	0.00059	IFI6;IFITM3;OASL;APOBEC3G;MX1;MX2;DHX58;IRF7;ISG15;CXCL10	down	bp
GO:2000257	regulation of protein activation cascade	23	5	0.19517	25.61853	0.00000	0.00059	C1QB;C2;C1QC;C1QA;CFB	down	bp
GO:0009615	response to virus	219	11	1.85837	5.91917	0.00000	0.00101	IFI6;IFITM3;OASL;APOBEC3G;MX1;MX2;DHX58;IRF7;LCN2;ISG15;CXCL10	down	bp
GO:0006958	complement activation, classical pathway	13	4	0.11031	36.26007	0.00000	0.00138	C1QB;C2;C1QC;C1QA	down	bp
GO:0002920	regulation of humoral immune response	31	5	0.26306	19.00730	0.00001	0.00215	C1QB;C2;C1QC;C1QA;CFB	down	bp
GO:0031347	regulation of defense response	458	15	3.88645	3.85956	0.00001	0.00215	C1QB;NR1H3;C2;C1QC;C1QA;CFB;IDO1;NCR3;APOBEC3G;DHX58;IRF7;APOE;ZBP1;LILRA5;SOCS1	down	bp
GO:0045071	negative regulation of viral genome replication	34	5	0.28851	17.33018	0.00001	0.00304	IFITM3;OASL;APOBEC3G;MX1;ISG15	down	bp
GO:0006954	inflammatory response	441	14	3.74220	3.74112	0.00002	0.00571	C1QB;NR1H3;C2;C1QC;C1QA;CFB;IDO1;SIGLEC1;NCR3;GAL;APOE;APOL3;LILRA5;CXCL10	down	bp
GO:0002455	humoral immune response mediated by circulating immunoglobulin	22	4	0.18669	21.42641	0.00003	0.00917	C1QB;C2;C1QC;C1QA	down	bp
GO:0043901	negative regulation of multi-organism process	107	7	0.90797	7.70950	0.00003	0.00917	IFITM3;OASL;APOBEC3G;MX1;DHX58;ISG15;CAMP	down	bp
GO:0002673	regulation of acute inflammatory response	46	5	0.39034	12.80927	0.00004	0.01092	C1QB;C2;C1QC;C1QA;CFB	down	bp
GO:0019221	cytokine-mediated signaling pathway	485	14	4.11557	3.40172	0.00005	0.01306	IFI6;NR1H3;IL15RA;IFITM3;OASL;MPL;MX1;MX2;IRF7;ZBP1;LCN2;SOCS1;ISG15;CXCL10	down	bp
GO:1903901	negative regulation of viral life cycle	49	5	0.41580	12.02502	0.00006	0.01348	IFITM3;OASL;APOBEC3G;MX1;ISG15	down	bp
GO:0019079	viral genome replication	88	6	0.74674	8.03490	0.00010	0.02159	ATG16L2;IFITM3;OASL;APOBEC3G;MX1;ISG15	down	bp
GO:0050727	regulation of inflammatory response	220	9	1.86686	4.82094	0.00010	0.02159	C1QB;NR1H3;C2;C1QC;C1QA;CFB;IDO1;APOE;LILRA5	down	bp
GO:0070613	regulation of protein processing	56	5	0.47520	10.52190	0.00011	0.02255	C1QB;C2;C1QC;C1QA;CFB	down	bp
GO:1903317	regulation of protein maturation	58	5	0.49217	10.15907	0.00013	0.02562	C1QB;C2;C1QC;C1QA;CFB	down	bp
GO:0048525	negative regulation of viral process	59	5	0.50066	9.98688	0.00014	0.02674	IFITM3;OASL;APOBEC3G;MX1;ISG15	down	bp
GO:0050778	positive regulation of immune response	490	13	4.15800	3.12651	0.00023	0.04085	C1QB;NR1H3;C2;C1QC;C1QA;HLX;CFB;IDO1;CFD;NCR3;DHX58;IRF7;ZBP1	down	bp
GO:0043903	regulation of symbiosis, encompassing mutualism through parasitism	146	7	1.23891	5.65011	0.00023	0.04085	IFITM3;OASL;APOBEC3G;MX1;APOE;ISG15;CAMP	down	bp
GO:0045069	regulation of viral genome replication	66	5	0.56006	8.92767	0.00023	0.04085	IFITM3;OASL;APOBEC3G;MX1;ISG15	down	bp
GO:0015701	bicarbonate transport	16	3	0.04202	71.38702	0.00001	0.04491	CA2;SLC4A1;RHAG	up	bp
GO:0072562	blood microparticle	45	5	0.38203	13.08812	0.00004	0.02457	C1QB;C1QC;CFB;APOE;ACTA1	down	cc
GO:0004252	serine-type endopeptidase activity	68	6	0.51862	11.56927	0.00001	0.01059	C1QB;C2;C1QC;C1QA;CFB;CFD	down	mf
GO:0008236	serine-type peptidase activity	81	6	0.61776	9.71248	0.00003	0.01059	C1QB;C2;C1QC;C1QA;CFB;CFD	down	mf
GO:0017171	serine hydrolase activity	82	6	0.62539	9.59403	0.00004	0.01059	C1QB;C2;C1QC;C1QA;CFB;CFD	down	mf

write.csv(go, "Output Files/gnome_female_edgeR_viralRNA_go.csv")

4.2.7 KEGG Analysis

kegg <-
  WebGestaltR(enrichMethod = "ORA", organism = "hsapiens", enrichDatabase = "pathway_KEGG",
              interestGene = results$gene, interestGeneType = "genesymbol", 
              referenceGene = background, referenceGeneType = "genesymbol", 
              minNum = 5, sigMethod = "fdr", isOutput = FALSE)

## Loading the functional categories...
## Loading the ID list...
## Loading the reference list...
## Performing the enrichment analysis...

write.csv(kegg, "Output Files/gnome_female_edgeR_viralRNA_kegg.csv")

4.3 Transcriptome Null Set - edgeR - Females

4.3.1 Libraries

library(tidyverse)
library(edgeR)
library(WebGestaltR)

4.3.2 Data

Unknown = female

SampleInfo <- read.csv("Input Files/metadata.csv", 
                       stringsAsFactors = FALSE) %>% 
  mutate(RNA=ifelse(disease_stage=="acute" | disease_stage=="peak", "virus+", "virus-")) %>% 
  filter(!sex=="male")

Raw <- read.csv(file = "Output Files/txome_genecounts_locid_nohemo.csv", 
                stringsAsFactors = FALSE,
                header = TRUE, 
                row.names = 1) %>% 
  select(SampleInfo$sample)

4.3.3 Loop Differential Gene Expression Analysis

4.3.3.1 Configure inputs & set up results lists

rna <- SampleInfo$RNA

degenes <- setNames(vector('list', 1000), 1:1000)

postfilter <- setNames(vector('list', 1000), 1:1000)

4.3.3.2 Loop

Random number generator used to generate 9 for set.seed() function.

set.seed(9)

for (i in 1:1000) {

#Shuffle group assignments without replacement
groups <- sample(rna, replace = FALSE, prob = NULL)

# Create DGE Object
DGE <- DGEList(counts=Raw, group=groups)

# Filter out lowly expressed genes
keep <- filterByExpr(DGE)
DGE <- DGE[keep, , keep.lib.sizes=FALSE]

postfilter[[i]] <- 
  DGE %>% 
  '[['(1) %>% 
  rownames()

# Normalize based on library sizes & calculate dispersion
DGE <- calcNormFactors(DGE)
DGE <- estimateCommonDisp(DGE)
DGE <- estimateTrendedDisp(DGE)
DGE <- estimateTagwiseDisp(DGE)

# Test for DEGs
et <- exactTest(DGE, pair=c("virus-", "virus+"))

# Select DEGs by fold change & pvalue
results <- 
  topTags(et, n = dim(DGE)[1]) %>% 
  data.frame() %>% 
  filter(abs(logFC) > 1 & PValue < 0.05) %>% 
  rownames_to_column(var="gene")

# Save output to list
degenes[[i]] <- results$gene

# End loop
}

4.3.3.3 Save lists of DEGs

# Postfilter List
postfilter_df <- data.frame(postfilter[[1]])
write.csv(postfilter_df, file = "Output Files/txome_female_edgeR_viralRNA_null_DEG_postfilter.csv", row.names = FALSE)

# DEGs
max_length <- max(unlist(lapply(degenes, length)))
filled <-
  lapply(degenes, function(x) {ans <- rep(NA,length=max_length);
    ans[0:length(x)]<- x;
    return(ans)})
df <- do.call(cbind, filled)
write.csv(df, file = "Output Files/txome_female_edgeR_viralRNA_null_DEG.csv", row.names = FALSE)

4.3.4 GO Enrichment Analysis

4.3.4.1 Import data

degenes <- read.csv("Output Files/txome_female_edgeR_viralRNA_null_DEG.csv") %>% 
  as.list() %>% 
  lapply(., function(x) x[!is.na(x)])

postfilter <- read.csv("Output Files/txome_female_edgeR_viralRNA_null_DEG_postfilter.csv")

4.3.4.2 Set up results lists

go_bp <- setNames(vector('list', 1000), 1:1000)
go_mf <- setNames(vector('list', 1000), 1:1000)
go_cc <- setNames(vector('list', 1000), 1:1000)

4.3.4.3 Loop

for (i in 1:1000) {
  genes <- degenes[[i]]
  background <- postfilter[,1]
  bp_test <- 
        WebGestaltR(enrichMethod="ORA", organism="hsapiens", 
                enrichDatabase="geneontology_Biological_Process",
                interestGene = genes, interestGeneType = "genesymbol", 
                referenceGene = background, referenceGeneType = "genesymbol", 
                sigMethod = "fdr", fdrThr = 0.05, 
                isOutput = FALSE)
go_bp[[i]] <- bp_test$description
  
mf_test <- 
    WebGestaltR(enrichMethod="ORA", organism="hsapiens", 
                enrichDatabase="geneontology_Molecular_Function",
                interestGene = genes, interestGeneType = "genesymbol", 
                referenceGene = background, referenceGeneType = "genesymbol", 
                sigMethod = "fdr", fdrThr = 0.05, 
                isOutput = FALSE)
go_mf[[i]] <- mf_test$description

cc_test <- 
    WebGestaltR(enrichMethod="ORA", organism="hsapiens", 
                enrichDatabase="geneontology_Cellular_Component",
                interestGene = genes, interestGeneType = "genesymbol", 
                referenceGene = background, referenceGeneType = "genesymbol", 
                sigMethod = "fdr", fdrThr = 0.05, 
                isOutput = FALSE)    
go_cc[[i]] <- cc_test$description
}

4.3.4.4 Save files

max_length <- max(unlist(lapply(go_bp, length)))
go_bp_filled <-
  lapply(go_bp, function(x) {ans <- rep(NA,length=max_length);
  ans[0:length(x)]<- x;
  return(ans)})
go_bp_final <- do.call(cbind, go_bp_filled)
write.csv(go_bp_final, file = "Output Files/txome_female_edgeR_viralRNA_null_GO_bp.csv", row.names = FALSE)

max_length <- max(unlist(lapply(go_mf, length)))
go_mf_filled <-
  lapply(go_mf, function(x) {ans <- rep(NA,length=max_length);
  ans[0:length(x)]<- x;
  return(ans)})
go_mf_final <- do.call(cbind, go_mf_filled)
write.csv(go_mf_final, file = "Output Files/txome_female_edgeR_viralRNA_null_GO_mf.csv", row.names = FALSE)

max_length <- max(unlist(lapply(go_cc, length)))
go_cc_filled <-
  lapply(go_cc, function(x) {ans <- rep(NA,length=max_length);
  ans[0:length(x)]<- x;
  return(ans)})
go_cc_final <- do.call(cbind, go_cc_filled)
write.csv(go_cc_final, file = "Output Files/txome_female_edgeR_viralRNA_null_GO_cc.csv", row.names = FALSE)

4.3.5 KEGG Analysis

4.3.5.1 Set up results lists

kegg <- setNames(vector('list', 1000), 1:1000)

4.3.5.2 Loop

for (i in 164:1000) {
genes <- degenes[[i]]
background <- postfilter[,1]
test <-
  WebGestaltR(enrichMethod = "ORA", organism = "hsapiens", enrichDatabase = "pathway_KEGG",
              interestGene = genes, interestGeneType = "genesymbol", 
              referenceGene = background, referenceGeneType = "genesymbol", 
              minNum = 5, sigMethod = "fdr", isOutput = FALSE)

ifelse(is.data.frame(test)=="FALSE", 
       kegg[[i]] <- NA, 
       kegg[[i]] <- test$description)
}

4.3.5.3 Save results

max_length <- max(unlist(lapply(kegg, length)))
kegg <-
  lapply(kegg, function(x) {ans <- rep(NA,length=max_length);
  ans[0:length(x)]<- x;
  return(ans)})
kegg <- do.call(cbind, kegg)
write.csv(kegg, file = "Output Files/txome_female_edgeR_viralRNA_null_kegg.csv", row.names = FALSE)

4.4 Genome Null Set - edgeR - Females

4.4.1 Libraries

library(tidyverse)
library(edgeR)
library(WebGestaltR)

4.4.2 Data

Unknown = female

SampleInfo <- read.csv("Input Files/metadata.csv", 
                       stringsAsFactors = FALSE) %>% 
  mutate(RNA=ifelse(disease_stage=="acute" | disease_stage=="peak", "virus+", "virus-")) %>% 
  filter(!sex=="male")

Raw <- read.csv(file = "Output Files/gnome_genecounts_locid_nohemo.csv", 
                stringsAsFactors = FALSE,
                header = TRUE, 
                row.names = 1) %>% 
  select(SampleInfo$sample)

4.4.3 Loop Differential Gene Expression Analysis

4.4.3.1 Configure inputs & set up results lists

rna <- SampleInfo$RNA

degenes <- setNames(vector('list', 1000), 1:1000)

postfilter <- setNames(vector('list', 1000), 1:1000)

4.4.3.2 Loop

Random number generator used to generate 9 for set.seed() function.

set.seed(9)

for (i in 1:1000) {

#Shuffle group assignments without replacement
groups <- sample(rna, replace = FALSE, prob = NULL)

# Create DGE Object
DGE <- DGEList(counts=Raw, group=groups)

# Filter out lowly expressed genes
keep <- filterByExpr(DGE)
DGE <- DGE[keep, , keep.lib.sizes=FALSE]

postfilter[[i]] <- 
  DGE %>% 
  '[['(1) %>% 
  rownames()

# Normalize based on library sizes & calculate dispersion
DGE <- calcNormFactors(DGE)
DGE <- estimateCommonDisp(DGE)
DGE <- estimateTrendedDisp(DGE)
DGE <- estimateTagwiseDisp(DGE)

# Test for DEGs
et <- exactTest(DGE, pair=c("virus-", "virus+"))

# Select DEGs by fold change & pvalue
results <- 
  topTags(et, n = dim(DGE)[1]) %>% 
  data.frame() %>% 
  filter(abs(logFC) > 1 & PValue < 0.05) %>% 
  rownames_to_column(var="gene")

# Save output to list
degenes[[i]] <- results$gene

# End loop
}

4.4.3.3 Save lists of DEGs

# Postfilter Lists
postfilter_df <- data.frame(postfilter[[1]])
write.csv(postfilter_df, file = "Output Files/gnome_female_edgeR_viralRNA_null_DEG_postfilter.csv", row.names = FALSE)

# DEGs
max_length <- max(unlist(lapply(degenes, length)))
filled <-
  lapply(degenes, function(x) {ans <- rep(NA,length=max_length);
    ans[0:length(x)]<- x;
    return(ans)})
df <- do.call(cbind, filled)
write.csv(df, file = "Output Files/gnome_female_edgeR_viralRNA_null_DEG.csv", row.names = FALSE)

4.4.4 GO Enrichment Analysis

4.4.4.1 Import data

degenes <- read.csv("Output Files/gnome_female_edgeR_viralRNA_null_DEG.csv")

postfilter <- read.csv("Output Files/gnome_female_edgeR_viralRNA_null_DEG_postfilter.csv")

4.4.4.2 Set up results lists

go_bp <- setNames(vector('list', 1000), 1:1000)
go_mf <- setNames(vector('list', 1000), 1:1000)
go_cc <- setNames(vector('list', 1000), 1:1000)

4.4.4.3 Loop

for (i in 1:1000) {
  genes <- degenes[[i]]
  background <- postfilter[,1]
  bp_test <- 
        WebGestaltR(enrichMethod="ORA", organism="hsapiens", 
                enrichDatabase="geneontology_Biological_Process",
                interestGene = genes, interestGeneType = "genesymbol", 
                referenceGene = background, referenceGeneType = "genesymbol", 
                sigMethod = "fdr", fdrThr = 0.05, 
                isOutput = FALSE)
go_bp[[i]] <- bp_test$description

mf_test <- 
    WebGestaltR(enrichMethod="ORA", organism="hsapiens", 
                enrichDatabase="geneontology_Molecular_Function",
                interestGene = genes, interestGeneType = "genesymbol", 
                referenceGene = background, referenceGeneType = "genesymbol", 
                sigMethod = "fdr", fdrThr = 0.05, 
                isOutput = FALSE)
go_mf[[i]] <- mf_test$description

cc_test <- 
    WebGestaltR(enrichMethod="ORA", organism="hsapiens", 
                enrichDatabase="geneontology_Cellular_Component",
                interestGene = genes, interestGeneType = "genesymbol", 
                referenceGene = background, referenceGeneType = "genesymbol", 
                sigMethod = "fdr", fdrThr = 0.05, 
                isOutput = FALSE)    
go_cc[[i]] <- cc_test$description
}

4.4.4.4 Save files

max_length <- max(unlist(lapply(go_bp, length)))
go_bp_filled <-
  lapply(go_bp, function(x) {ans <- rep(NA,length=max_length);
  ans[0:length(x)]<- x;
  return(ans)})
go_bp_final <- do.call(cbind, go_bp_filled)
write.csv(go_bp_final, file = "Output Files/gnome_female_edgeR_viralRNA_null_GO_bp.csv", row.names = FALSE)

max_length <- max(unlist(lapply(go_mf, length)))
go_mf_filled <-
  lapply(go_mf, function(x) {ans <- rep(NA,length=max_length);
  ans[0:length(x)]<- x;
  return(ans)})
go_mf_final <- do.call(cbind, go_mf_filled)
write.csv(go_mf_final, file = "Output Files/gnome_female_edgeR_viralRNA_null_GO_mf.csv", row.names = FALSE)

max_length <- max(unlist(lapply(go_cc, length)))
go_cc_filled <-
  lapply(go_cc, function(x) {ans <- rep(NA,length=max_length);
  ans[0:length(x)]<- x;
  return(ans)})
go_cc_final <- do.call(cbind, go_cc_filled)
write.csv(go_cc_final, file = "Output Files/gnome_female_edgeR_viralRNA_null_GO_cc.csv", row.names = FALSE)

4.4.5 KEGG Analysis

4.4.5.1 Set up results lists

kegg <- setNames(vector('list', 1000), 1:1000)

4.4.5.2 Loop

for (i in 1:1000) {
genes <- degenes[[i]]
background <- postfilter[,1]
test <-
  WebGestaltR(enrichMethod = "ORA", organism = "hsapiens", enrichDatabase = "pathway_KEGG",
              interestGene = genes, interestGeneType = "genesymbol", 
              referenceGene = background, referenceGeneType = "genesymbol", 
              minNum = 5, sigMethod = "fdr", isOutput = FALSE)

ifelse(is.data.frame(test)=="FALSE", 
       kegg[[i]] <- NA, 
       kegg[[i]] <- test$description)
}

4.4.5.3 Save results

max_length <- max(unlist(lapply(kegg, length)))
kegg <-
  lapply(kegg, function(x) {ans <- rep(NA,length=max_length);
  ans[0:length(x)]<- x;
  return(ans)})
kegg <- do.call(cbind, kegg)
write.csv(kegg, file = "Output Files/gnome_female_edgeR_viralRNA_null_kegg.csv", row.names = FALSE)

4.5 Refine GO/KEGG

*Based on Null Set results

4.5.1 Libraries

library(tidyverse)
library(kableExtra)
library(knitr)
library(ggh4x)
library(gridExtra)

4.5.2 Data

4.5.2.1 GO/KEGG Data

txome_go <- read.csv("Output Files/txome_female_edgeR_viralRNA_go.csv")
t_null_bp <- read.csv("Output Files/txome_female_edgeR_viralRNA_null_GO_bp.csv")
t_null_mf <- read.csv("Output Files/txome_female_edgeR_viralRNA_null_GO_mf.csv")
t_null_cc <- read.csv("Output Files/txome_female_edgeR_viralRNA_null_GO_cc.csv")


txome_kegg <- read.csv("Output Files/txome_female_edgeR_viralRNA_kegg.csv")
t_kegg <- read.csv("Output Files/txome_female_edgeR_viralRNA_null_kegg.csv")

gnome_go <- read.csv("Output Files/gnome_female_edgeR_viralRNA_go.csv", row.names=1)
g_null_bp <- read.csv("Output Files/gnome_female_edgeR_viralRNA_null_GO_bp.csv")
g_null_mf <- read.csv("Output Files/gnome_female_edgeR_viralRNA_null_GO_mf.csv")
g_null_cc <- read.csv("Output Files/gnome_female_edgeR_viralRNA_null_GO_cc.csv")

gnome_kegg <- read.csv("Output Files/gnome_female_edgeR_viralRNA_kegg.csv")
g_kegg <- read.csv("Output Files/gnome_female_edgeR_viralRNA_null_kegg.csv")

4.5.2.2 Raw Gene Expression

t_raw <- read.csv("Output Files/txome_female_edgeR_viralRNA_normcounts.csv") %>% 
  dplyr::rename("gene" = "X")

g_raw <- read.csv("Output Files/gnome_female_edgeR_viralRNA_normcounts.csv") %>% 
  dplyr::rename("gene" = "X")

meta <- read.csv("Input Files/metadata.csv")

4.5.3 Transcriptome

4.5.3.1 GO Term

4.5.3.1.1 What terms are overrepresented in >10% of nullsets?

# BP
t_null_bp_10 <-
  t_null_bp %>% 
  pivot_longer(cols=1:1000) %>% 
  na.omit(value) %>% 
  group_by(value) %>% 
  summarise(count=n()) %>% 
  filter(count>100) 

# MF
t_null_mf_10 <-
  t_null_mf %>% 
  pivot_longer(cols=1:1000) %>% 
  na.omit(value) %>% 
  group_by(value) %>% 
  summarise(count=n()) %>% 
  filter(count>100) 

# CC
t_null_cc_10 <-
  t_null_cc %>% 
  pivot_longer(cols=1:1000) %>% 
  na.omit(value) %>% 
  group_by(value) %>% 
  summarise(count=n()) %>% 
  filter(count>100)

4.5.3.1.2 Filter results

final_t_bp <- 
  txome_go %>% 
  filter(go=="bp" &
         !description %in% t_null_bp_10$value)
         
final_t_mf <-
  txome_go %>% 
  filter(go=="mf" &
         !description %in% t_null_mf_10$value)

final_t_cc <-
  txome_go %>% 
  filter(go=="cc" &
         !description %in% t_null_cc_10$value)

4.5.3.2 KEGG Pathway

4.5.3.2.1 What pathways are overrepreesented in >10% of nullsets?

t_kegg_10 <-
  t_kegg %>% 
  pivot_longer(cols=1:1000) %>% 
  na.omit(value) %>% 
  group_by(value) %>% 
  summarise(count=n()) %>% 
  filter(count>100)

4.5.3.2.2 Filter Results

txome_kegg_final <- 
  txome_kegg %>% 
  filter(!description %in% t_kegg_10$value)

txome_kegg %>% 
  kable() %>%    
  kable_styling("basic")

4.5.4 Genome

4.5.4.1 GO Term

4.5.4.1.1 What terms are overrepresented in >10% of nullsets?

# BP
g_null_bp_10 <-
  g_null_bp %>% 
  pivot_longer(cols=1:1000) %>% 
  na.omit(value) %>% 
  group_by(value) %>% 
  summarise(count=n()) %>% 
  filter(count>100) 

# MF
g_null_mf_10 <-
  g_null_mf %>% 
  pivot_longer(cols=1:1000) %>% 
  na.omit(value) %>% 
  group_by(value) %>% 
  summarise(count=n()) %>% 
  filter(count>100) 

# CC
g_null_cc_10 <-
  g_null_cc %>% 
  pivot_longer(cols=1:1000) %>% 
  na.omit(value) %>% 
  group_by(value) %>% 
  summarise(count=n()) %>% 
  filter(count>100)

4.5.4.1.2 Filter results

final_g_bp <- 
  gnome_go %>% 
  filter(go=="bp" &
         !description %in% g_null_bp_10$value)
         
final_g_mf <-
  gnome_go %>% 
  filter(go=="mf" &
         !description %in% g_null_mf_10$value)

final_g_cc <-
  gnome_go %>% 
  filter(go=="cc" &
         !description %in% g_null_cc_10$value)

4.5.4.2 KEGG Pathway

4.5.4.2.1 What pathways are overrepreesented in >10% of nullsets?

g_kegg_10 <-
  g_kegg %>% 
  pivot_longer(cols=1:1000) %>% 
  na.omit(value) %>% 
  group_by(value) %>% 
  summarise(count=n()) %>% 
  filter(count>100)

4.5.4.2.2 Filter Results

gnome_kegg_final <- 
  gnome_kegg %>% 
  filter(!description %in% g_kegg_10$value) 

gnome_kegg %>% 
  kable() %>%    
  kable_styling("basic")

4.5.5 GO Term Figure

t_final <- 
  rbind(final_t_bp, final_t_mf, final_t_cc) %>% 
  mutate(approach="Transcriptome")
write.csv(t_final, "Output Files/txome_female_edgeR_viralRNA_go_refined.csv", row.names=FALSE)

g_final <-
  rbind(final_g_bp, final_g_mf, final_g_cc) %>% 
  mutate(approach= "Genome")
write.csv(g_final, "Output Files/gnome_female_edgeR_viralRNA_go_refined.csv", row.names=FALSE)

go_all <-
  rbind(t_final, g_final)

go_fig <-
  go_all %>% 
  mutate(go=factor(
           ifelse(go=="bp", "BP", 
                  ifelse(go=="cc", "CC", "MF")),
           levels=c("BP", "CC", "MF")), 
         enrichmentRatio=ifelse(direction=="down", enrichmentRatio*-1, enrichmentRatio)) %>% 
  
  ggplot(data=., aes(x=fct_reorder(description, enrichmentRatio, .desc=TRUE), 
                     y=enrichmentRatio, fill=factor(approach))) +
  geom_bar(stat="identity", position="dodge") +
  scale_fill_manual(values=c("slategray3", "slategrey"), name=NULL) +
  labs(x="GO Term", y="Enrichment Ratio") +
  scale_x_discrete(labels = function(description) str_wrap(description, width = 65)) +
  scale_y_continuous(n.breaks=10) +
  facet_grid(vars(go), scales="free_y", space="free_y") +
  theme_bw() +
  theme(panel.grid=element_blank(), text=element_text(size=11), 
        legend.position="bottom") +
  coord_flip() 
  
ggsave(filename="Figures/viralRNA_goterm.jpg", go_fig, width=9, height=8, units="in")

4.5.5.1 Presentation figure

go_fig_pres <-
  t_final %>% 
  mutate(go=factor(
           ifelse(go=="bp", "BP", 
                  ifelse(go=="cc", "CC", "MF")),
           levels=c("BP", "CC", "MF")), 
         enrichmentRatio=ifelse(direction=="down", enrichmentRatio*-1, enrichmentRatio)) %>% 
  
  ggplot(data=., aes(x=fct_reorder(description, enrichmentRatio, .desc=TRUE), 
                     y=enrichmentRatio)) +
  geom_bar(stat="identity", position="dodge", fill="slategray") +
  labs(x="GO Term", y="Enrichment Ratio") +
  scale_x_discrete(labels = function(description) str_wrap(description, width = 65)) +
  scale_y_continuous(n.breaks=10) +
  facet_grid(vars(go), scales="free_y", space="free_y") +
  theme_bw() +
  theme(panel.grid=element_blank(), text=element_text(size=16), 
        legend.position="bottom") +
  coord_flip()
go_fig_pres

ggsave(filename="Figures/viralRNA_goterm_t_pres.jpg", go_fig_pres, width=10, height=8, units="in")


go_fig_pres2 <-
  go_fig +
  theme(text = element_text(size = 16))

ggsave(filename="Figures/viralRNA_goterm_tg_pres.jpg", go_fig_pres2, width=10, height=9, units="in")

4.5.6 Immunne GO term Genes

4.5.6.1 Extracting Immune GO Genes

Manually look up function of each gene & categorize. Select subset of 16 genes for presentation in manuscript.

immune_go_genes <- 
  go_all %>% 
  filter(grepl("immune|viral|inflammatory|interferon|leukocyte", description)) %>% 
  select(userId) %>% 
  mutate(genes = str_split_fixed(userId, ";", n=18)) %>% 
  select(!userId) %>% 
  unlist() %>% 
  data.frame() %>% 
  setNames("genes") %>% 
  filter(!duplicated(genes)) %>% 
  filter(!genes=="") %>% 
  arrange(genes) %>% 
  mutate(description = c(
    "Innate Immunity", 
    "extra", 
    "Antiviral", 
    "Innate Immunity & adaptive", 
    "extra", 
    "extra", 
    "Complement System", 
    "Complement System", 
    "Complement System", 
    "Complement System", 
    "adaptive", 
    "Complement System", 
    "Complement System",
    "Immunoregulation",
    "Innate Immunity", 
    "inflammation", 
    "mhc", 
    "Innate Immunity",
    "Immunoregulation",
    "Immunoregulation", 
    "Innate Immunity", 
    "Immunoregulation", 
    "Innate Immunity", 
    "extra",
    "extra", 
    "Antiviral", 
    "Antiviral", 
    "Innate Immunity", 
    "extra", 
    "extra", 
    "extra", 
    "Antiviral", 
    "Immunoregulation", 
    "inflammation", 
    "extra", 
    "Innate Immunity", 
    "Immunoregulation"
  ))

immune_go_genes_subset <- 
  immune_go_genes %>% 
  filter(description == "Innate Immunity" |
         description == "Complement System" |
         description == "Immunoregulation" |
         description == "Antiviral") %>% 
  filter(!genes == "C2" & 
         !genes == "CFD",
         !genes == "HSPA8",
         !genes == "ACTB", 
         !genes == "DHX58") %>% 
  mutate(description = factor(description, levels=c("Innate Immunity", "Immunoregulation", "Antiviral", "Complement System")))

4.5.6.2 Extract Expression Data

t_immunegenes_subset <- 
  t_raw %>% 
  filter(gene %in% immune_go_genes_subset$genes) %>% 
  pivot_longer(!gene, names_to="sample", values_to="expression") %>% 
  mutate(disease=ifelse(sample %in% meta$sample,
                     meta[match(sample, meta$sample),4],
                     sample),
         disease=factor(ifelse(disease=="acute", "A", 
                               ifelse(disease=="peak", "P", 
                                      ifelse(disease=="late", "L", "C"))),
                        levels=c("C", "A", "P", "L")),
         description = ifelse(gene %in% immune_go_genes$genes, 
                              immune_go_genes[match(gene, immune_go_genes$genes),2],
                              gene)) 

t_immune_genes_all <-
  t_raw %>% 
  filter(gene %in% immune_go_genes$genes) %>% 
  pivot_longer(!gene, names_to="sample", values_to="expression") %>% 
  mutate(disease=ifelse(sample %in% meta$sample,
                     meta[match(sample, meta$sample),4],
                     sample),
         disease=factor(ifelse(disease=="acute", "A", 
                               ifelse(disease=="peak", "P", 
                                      ifelse(disease=="late", "L", "C"))),
                        levels=c("C", "A", "P", "L")))

g_immune_genes_all <- 
  g_raw %>% 
  filter(gene %in% immune_go_genes$genes) %>% 
  pivot_longer(!gene, names_to="sample", values_to="expression") %>% 
  mutate(disease=ifelse(sample %in% meta$sample,
                     meta[match(sample, meta$sample),4],
                     sample),
         disease=factor(ifelse(disease=="acute", "A", 
                               ifelse(disease=="peak", "P", 
                                      ifelse(disease=="late", "L", "C"))),
                        levels=c("C", "A", "P", "L")))

4.5.6.3 Plot for manuscript

t_plot_manuscript <-
  t_immunegenes_subset %>% 
  ggplot(., aes(x=disease, y=expression)) +
  geom_boxplot() +
  facet_nested_wrap(vars(description, gene),
                    nest_line=element_line(colour="black")) +
  theme(ggh4x.facet.nestline = element_line(linetype = 1)) +
  labs(x = "Influenza Stage", y = "log2(CPM)") +
  theme_bw() +
  theme(panel.grid=element_blank())
t_plot_manuscript

ggsave("Figures/T_female_edgeR_immunegenes.jpg", t_plot_manuscript, 
       width=8, height=8, units="in")

4.5.6.4 Plot for supplemental

t_plot_supplemental <-
  t_immune_genes_all %>% 
  ggplot(aes(x=disease, y=expression)) +
  geom_boxplot() +
  facet_wrap(~gene, scales="free", ncol=5) +
  labs(title=NULL,
       x="Influenza Stage", y="log2(CPM)") +
  theme_bw() +
  theme(panel.grid=element_blank())

ggsave("Figures/supplemental_T_female_edgeR_immunegenes.jpg", t_plot_supplemental,
          width=8, height=10, units="in")

g_plot_supplemental <-
  g_immune_genes_all %>% 
  ggplot(aes(x=disease, y=expression)) +
  geom_boxplot() +
  facet_wrap(~gene, scales="free", ncol=5) +
  labs(title=NULL,
       x="Influenza Stage", y="log2(CPM)") +
  theme_bw() +
  theme(panel.grid=element_blank())

ggsave("Figures/supplemental_G_female_edgeR_immunegenes.jpg", g_plot_supplemental,
          width=8, height=10, units="in")

4.5.7 KEGG Table - Manuscript

t_kegg_table <- 
  txome_kegg_final %>% 
  mutate(`Pairwise Comparison (Approach)` = "Virus+ vs Virus- (Transcriptome)",
         `KEGG Pathway` = description,
         `Enrichment Ratio (FDR)` = paste0(round(enrichmentRatio, digits=3), " (", round(FDR, digits=4), ")"),
         Genes=userId) %>% 
  select(`Pairwise Comparison (Approach)`, `KEGG Pathway`, `Enrichment Ratio (FDR)`, Genes)

g_kegg_table <-
  gnome_kegg_final %>% 
  mutate(`Pairwise Comparison (Approach)` = "Virus+ vs Virus- (Genome)",
         `KEGG Pathway` = description,
         `Enrichment Ratio (FDR)` = paste0(round(enrichmentRatio, digits=2), " (", round(FDR, digits=4), ")"),
         Genes=userId) %>% 
  select(`Pairwise Comparison (Approach)`, `KEGG Pathway`, `Enrichment Ratio (FDR)`, Genes)


kegg_table <- 
  rbind(t_kegg_table, g_kegg_table)

write.csv(kegg_table, "Output Files/manuscript_keggtable.csv", row.names=FALSE)

4.5.8 Supplemental Table

4.5.8.1 Biological Process

t_sup_bp <- 
  txome_go %>% 
  filter(go=="bp") %>% 
  mutate(GOTerm = paste(description, " ", "(", geneSet, ")", sep=""), 
         enrichmentRatio = round(enrichmentRatio, digits=1),
         FDR = round(FDR, digits=3),
         enrichmentRatio = ifelse(direction=="down", paste("-", enrichmentRatio, sep=""),
                                  paste("+", enrichmentRatio, sep="")),
         Transcriptome = paste(enrichmentRatio, " ", "(", FDR, ")", sep=""), 
         Transcriptome_null = ifelse(description %in% t_null_bp_10$value, "x", "")) %>% 
  select(GOTerm, Transcriptome, Transcriptome_null)


g_sup_bp <- 
  gnome_go %>% 
  filter(go=="bp") %>% 
  mutate(GOTerm = paste(description, " ", "(", geneSet, ")", sep=""), 
         enrichmentRatio = round(enrichmentRatio, digits=1),
         FDR = round(FDR, digits=3),
         enrichmentRatio = ifelse(direction=="down", paste("-", enrichmentRatio, sep=""),
                                  paste("+", enrichmentRatio, sep="")),
         Genome = paste(enrichmentRatio, " ", "(", FDR, ")", sep=""), 
         Genome_null = ifelse(description %in% g_null_bp_10$value, "x", "")) %>% 
  select(GOTerm, Genome, Genome_null)

sup_bp <- 
  merge(t_sup_bp, g_sup_bp, all=TRUE) %>% 
  mutate(go="BP")

4.5.8.2 Cellular Component

t_sup_cc <- 
  txome_go %>% 
  filter(go=="cc") %>% 
  mutate(GOTerm = paste(description, " ", "(", geneSet, ")", sep=""), 
         enrichmentRatio = round(enrichmentRatio, digits=1),
         FDR = round(FDR, digits=3),
         enrichmentRatio = ifelse(direction=="down", paste("-", enrichmentRatio, sep=""),
                                  paste("+", enrichmentRatio, sep="")),
         Transcriptome = paste(enrichmentRatio, " ", "(", FDR, ")", sep=""), 
         Transcriptome_null = ifelse(description %in% t_null_bp_10$value, "x", "")) %>% 
  select(GOTerm, Transcriptome, Transcriptome_null)


g_sup_cc <- 
  gnome_go %>% 
  filter(go=="cc") %>% 
  mutate(GOTerm = paste(description, " ", "(", geneSet, ")", sep=""), 
         enrichmentRatio = round(enrichmentRatio, digits=1),
         FDR = round(FDR, digits=3),
         enrichmentRatio = ifelse(direction=="down", paste("-", enrichmentRatio, sep=""),
                                  paste("+", enrichmentRatio, sep="")),
         Genome = paste(enrichmentRatio, " ", "(", FDR, ")", sep=""), 
         Genome_null = ifelse(description %in% g_null_bp_10$value, "x", "")) %>% 
  select(GOTerm, Genome, Genome_null)

sup_cc <- 
  merge(t_sup_cc, g_sup_cc, all=TRUE) %>% 
  mutate(go="CC")

4.5.8.3 Molecular Function

t_sup_mf <- 
  txome_go %>% 
  filter(go=="mf") %>% 
  mutate(GOTerm = paste(description, " ", "(", geneSet, ")", sep=""), 
         enrichmentRatio = round(enrichmentRatio, digits=1),
         FDR = round(FDR, digits=3),
         enrichmentRatio = ifelse(direction=="down", paste("-", enrichmentRatio, sep=""),
                                  paste("+", enrichmentRatio, sep="")),
         Transcriptome = paste(enrichmentRatio, " ", "(", FDR, ")", sep=""), 
         Transcriptome_null = ifelse(description %in% t_null_bp_10$value, "x", "")) %>% 
  select(GOTerm, Transcriptome, Transcriptome_null)


g_sup_mf <- 
  gnome_go %>% 
  filter(go=="mf") %>% 
  mutate(GOTerm = paste(description, " ", "(", geneSet, ")", sep=""), 
         enrichmentRatio = round(enrichmentRatio, digits=1),
         FDR = round(FDR, digits=3),
         enrichmentRatio = ifelse(direction=="down", paste("-", enrichmentRatio, sep=""),
                                  paste("+", enrichmentRatio, sep="")),
         Genome = paste(enrichmentRatio, " ", "(", FDR, ")", sep=""), 
         Genome_null = ifelse(description %in% g_null_bp_10$value, "x", "")) %>% 
  select(GOTerm, Genome, Genome_null)

sup_mf <- 
  merge(t_sup_mf, g_sup_mf, all=TRUE) %>% 
  mutate(go="MF")

4.5.8.4 Save GO Tables

sup <- 
  rbind(sup_bp, sup_cc, sup_mf)

sup %>% 
  kable() %>%    
  kable_styling("basic")

write.csv(sup, "Output Files/supplemental_female_edgeR_viralRNA_go.csv", row.names=FALSE)

4.5.8.5 KEGG

t_sup_kegg <- 
  txome_kegg %>% 
  mutate(KEGG = paste(description, " ", "(", geneSet, ")", sep=""), 
         Enrichment = round(enrichmentRatio, digits=1),
         FDR = round(FDR, digits=3),
         Transcriptome = paste(Enrichment, " ", "(", FDR, ")", sep=""), 
         Transcriptome_null = ifelse(description %in% t_kegg$value, "x", ""),
         Transcriptome_Genes = userId) %>% 
  select(KEGG, Transcriptome, Transcriptome_null, Transcriptome_Genes)

g_sup_kegg <- 
  gnome_kegg %>% 
  mutate(KEGG = paste(description, " ", "(", geneSet, ")", sep=""), 
         Enrichment = round(enrichmentRatio, digits=1),
         FDR = round(FDR, digits=3),
         Genome = paste(Enrichment, " ", "(", FDR, ")", sep=""), 
         Genome_null = ifelse(description %in% g_kegg$value, "x", ""),
         Genome_Genes = userId) %>% 
  select(KEGG, Genome, Genome_null, Genome_Genes)


sup_kegg <- 
  merge(t_sup_kegg, g_sup_kegg, all=TRUE)

sup_kegg %>% 
  kable() %>%    
  kable_styling("basic")

write.csv(sup_kegg, "Output Files/supplemental_female_edgeR_viralRNA_kegg.csv", row.names=FALSE)