2 LOC Gene Identification

2.1 Libraries

library(tidyverse)
library(Biostrings)
library(BSgenome)
library(gridExtra)

2.2 Load data

txome_genes <- 
  read.table("Input Files/txome_genecounts.txt") %>% 
  rownames_to_column(var="gene")
txome_map <- read.table("Input Files/txome_transcript-gene_map.txt", 
                        header=1)
txome_seqs <- readDNAStringSet(
  "C:/Users/Christina/OneDrive - University of Maine System/hg-rna/txome_denovo_GG.okay_trim.txt", 
                   format = "fasta", seek.first.rec = TRUE, use.names = TRUE)

gnome_genes <- 
  read.table("Input Files/gnome_genecounts.txt") %>% 
  rownames_to_column(var="gene")
gnome_map <- read.table("Input Files/gnome_transcript-gene_map.txt", 
                        header=1)
gnome_seqs <- readDNAStringSet(
  "C:/Users/Christina/OneDrive - University of Maine System/hg-rna/gnome_Transcript_Seqs_exons_trim.txt", 
                               format = "fasta", seek.first.rec = TRUE, use.names = TRUE)

meta <- 
  read.csv("Input Files/metadata.csv") %>% 
  mutate(rna=ifelse(disease_stage=="acute", "Virus+", 
                    ifelse(disease_stage=="peak", "Virus+", "Virus-")))

2.3 Extract gene sequences

2.3.1 Extract LOC genes from gene count matrices

txome_loc <- 
  txome_genes %>% 
  select(gene) %>% 
  filter(str_detect(gene, "^LOC"))

gnome_loc <-
  gnome_genes %>% 
  select(gene) %>% 
  filter(str_detect(gene, "^LOC"))

2.3.2 Extract transcript IDs for all sequence IDs

txome_loc_txpts <- 
  merge(txome_loc, txome_map, by.x="gene", by.y="gene_id", all.x=TRUE)
  
gnome_loc_txpts <- 
  merge(gnome_loc, gnome_map, by.x="gene", by.y="gene_id", all.x=TRUE)

2.3.3 Extract transcript sequences for blast

txome_loc_seq <- getSeq(x = txome_seqs, txome_loc_txpts$transcript_id)

gnome_loc_seq <- getSeq(x = gnome_seqs, gnome_loc_txpts$transcript_id)

2.3.4 Export sequences

writeXStringSet(txome_loc_seq, "Output Files/txome_loc_seqs.fasta")
writeXStringSet(gnome_loc_seq, "Output Files/gnome_loc_seqs.fasta")

2.4 Blast sequences

blastn against nt database with code:

  • blastn -db nt -query input.fasta -max_target_seqs 2 -max_hsps 1 -evalue 0.01 -perc_identity 80 -outfmt “10 std qcovs qcovhsp stitle” -out output.csv -remote

2.5 Process BLASTn Results

2.5.1 Transcriptome

2.5.1.1 Read in & tidy results

txome_blast <- 
  read.table("Input Files/txome_loc_blastresults.csv",
             header=FALSE, sep=",", col.names=paste0("V", seq_len(24)), 
             fill=TRUE, quote="") %>%
  mutate_all(as.character) %>% 
  mutate_if(is.character, ~replace_na(.,"")) %>% 
  mutate(V15=paste0(V15, V16, V17, V18, V19, V20, V21, V22, V23, V24)) %>% 
  select(!c(16:24)) %>% 
  setNames(c("transcript_id", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", 
           "qend", "sstart", "send", "evalue", "bitscore", "qcovs", "qcovhsp", "stitle")) %>% 
  select(transcript_id, pident, qcovs, evalue, stitle) %>% 
  left_join(., txome_map)

2.5.1.2 Prepare output dataframes

txome_geneassignments <- 
  data.frame(matrix(ncol = 6, nrow = 0)) %>% 
  setNames(c("transcript_id", "pident", "qcovs", "evalue", "stitle", "gene_id"))

txome_needcheck <- 
  data.frame(matrix(ncol = 6, nrow = 0)) %>% 
  setNames(c("transcript_id", "pident", "qcovs", "evalue", "stitle", "gene_id"))

2.5.1.3 Loop

for (i in 1:nrow(txome_loc)) {
  data <- txome_blast[txome_blast$gene_id==txome_loc[i,1],]
  
# Remove lines with vague terms in stitle title
data <- data[grep("uncharacterized", data$stitle, invert = TRUE),]
data <- data[grep("genome assembly", data$stitle, invert = TRUE),]
data <- data[grep(" chromosome", data$stitle, invert=TRUE),]

if (nrow(data)==0) {
  next
}

# Extract gene names in ()
for (i in 1:nrow(data)) {
  stitle <- data[i,]$stitle
  {if(grepl('\\(', stitle)) {
    newstitle <- str_extract(stitle, "(?<=\\()([^()]*?)(?=\\)[^()]*$)")
    }
    else{newstitle <- NA}
  }
  data[i,]$stitle <- newstitle
}

# Select first stitle for each ID in transcript
data <- data[ !duplicated(data$transcript_id), ]

# Select only unique stitles for each gene (across transcripts)
data <- data %>% distinct(stitle, .keep_all=TRUE)

# Exclude rows with irrelevant gene names (NA, LOCs, long names)
data <- 
  data %>% 
  filter(!stitle=="NA" &
         !stitle=="" & 
         !grepl("^LOC", stitle) &
         !nchar(stitle) > 20)

# If there is one row left of the dataframe, save to a list
{if((nrow(data))==1) {
  txome_geneassignments <- rbind(data, txome_geneassignments)
}
  else{txome_needcheck <- rbind(data, txome_needcheck)}
}

# End Loop

}

2.5.2 Genome

2.5.2.1 Read in & configure gnome blastn files

gnome_blast <- 
  read.csv("Input Files/gnome_loc_blastresults.csv", 
           header=FALSE, sep=",", col.names=paste0("V", seq_len(21)), 
           fill=TRUE, quote="") %>%
  mutate_all(as.character) %>% 
  mutate_if(is.character, ~replace_na(.,"")) %>% 
  mutate(V15=paste0(V15, V16, V17, V18, V19, V20, V21)) %>% 
  select(!c(16:21)) %>% 
  setNames(c("transcript_id", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", 
           "qend", "sstart", "send", "evalue", "bitscore", "qcovs", "qcovhsp", "stitle")) %>% 
  select(transcript_id, pident, qcovs, evalue, stitle) %>% 
  left_join(., gnome_map)

2.5.2.2 Prepare output dataframes

gnome_geneassignments <- 
  data.frame(matrix(ncol = 6, nrow = 0)) %>% 
  setNames(c("transcript_id", "pident", "qcovs", "evalue", "stitle", "gene_id"))

gnome_needcheck <- 
  data.frame(matrix(ncol = 6, nrow = 0)) %>% 
  setNames(c("transcript_id", "pident", "qcovs", "evalue", "stitle", "gene_id"))

2.5.2.3 Loop

for (i in 1:nrow(gnome_loc)) {
  data <- gnome_blast[gnome_blast$gene_id==gnome_loc[i,1],]
  
# Remove lines with vague terms in stitle title
data <- data[grep("uncharacterized", data$stitle, invert = TRUE),]
data <- data[grep("genome assembly", data$stitle, invert = TRUE),]
data <- data[grep(" chromosome", data$stitle, invert=TRUE),]

if (nrow(data)==0) {
  next
}

# Extract gene names in ()
for (i in 1:nrow(data)) {
  stitle <- data[i,]$stitle
  {if(grepl('\\(', stitle)) {
    newstitle <- str_extract(stitle, "(?<=\\()([^()]*?)(?=\\)[^()]*$)")
    }
    else{newstitle <- NA}
  }
  data[i,]$stitle <- newstitle
}

# Select first stitle for each ID in transcript
data <- data[ !duplicated(data$transcript_id), ]

# Select only unique stitles for each gene (across transcripts)
data <- data %>% distinct(stitle, .keep_all=TRUE)

# Exclude rows with irrelevant gene names (NA, LOCs, long names)
data <- 
  data %>% 
  filter(!stitle=="NA" &
         !stitle=="" & 
         !grepl("^LOC", stitle) &
         !nchar(stitle) > 20 &
         pident > 89)

# If there is one row left of the dataframe, save to a list
{if((nrow(data))==1) {
  gnome_geneassignments <- rbind(data, gnome_geneassignments)
}
  else{gnome_needcheck <- rbind(data, gnome_needcheck)}
}

# End Loop

}

2.5.3 Combine Transcriptome & Genome Blast Results

Note: For duplicates, only using genes for which both T & G blasts yielded the same result (distinct())

blast_all <-
  bind_rows(txome_geneassignments, gnome_geneassignments) %>% 
  select(stitle, gene_id) %>%
  distinct() %>%
  distinct(gene_id, .keep_all=TRUE) %>% 
  mutate(stitle=toupper(stitle))

2.6 Combine with prior LOC ID master list

LOCs identified through blastn & manually through extracting NCBI’s description

null_loc <-
  read.csv("Input Files/TG_nullset_masterLOC.csv") %>% 
  mutate(symbol=ifelse(gene=="LOC118528561", "ANKRD13A", symbol)) %>% 
  filter(!grepl('LOC', symbol))

loc_update <- 
  merge(blast_all, null_loc, by.x="gene_id", by.y="gene", all=TRUE) %>% 
  mutate_all(~replace(., is.na(.), 0))

for (i in 1:nrow(loc_update)) {
  if(loc_update$stitle[i]=="0" && !loc_update$symbol[i]=="0") {
    loc_update$stitle[i] <- loc_update$symbol[i]
  }
  else{loc_update$stitle[i] <- loc_update$stitle[i]}
}

loc_update <- 
  loc_update %>% 
  select(!symbol)

write.csv(loc_update, "Output Files/rna_locid_all.csv")

2.7 Update & tidy gene count matrices

Replace LOCs with new IDs & aggregate gene counts

txome_genes_update <- 
  txome_genes %>% 
  mutate(gene=ifelse(gene %in% loc_update$gene_id, 
                     loc_update[match(gene, loc_update$gene_id),2],
                     gene)) %>% 
  aggregate(. ~ gene, sum)

gnome_genes_update <- 
  gnome_genes %>% 
  mutate(gene=ifelse(gene %in% loc_update$gene_id, 
                     loc_update[match(gene, loc_update$gene_id),2],
                     gene)) %>% 
  aggregate(. ~ gene, sum)

2.8 Hemoglobin

2.8.1 Extract hemoglobin genes

Hemoglobin genes identified from Harrington et al. 2020 (suppl table 1) and from NCBI gray seal gene search for “hemoglobin”

hemoglobin_genes <-
  c("HBA1", "HBA2", "HBB", "HBBP1", "HBD", "HBE1", "HBG1", "HBG2", "HBM", "HBQ1",
    "HBZ", "HBZP1","AHSP", "RHEX", "LOC118528782", "LOC118528783", "LOC118528784", 
    "LOC118528786", "LOC118528797", "LOC118530184", "LOC118530187")

2.8.2 Prop of total gene counts

t_all <- 
  txome_genes_update %>% 
  pivot_longer(!gene, names_to="sample", values_to="expression") %>% 
  group_by(sample) %>% 
  summarise(total=sum(expression))

g_all <-
  gnome_genes_update %>% 
  pivot_longer(!gene, names_to="sample", values_to="expression") %>% 
  group_by(sample) %>% 
  summarise(total=sum(expression))

t_hemo <- 
  txome_genes_update %>% 
  filter(gene %in% hemoglobin_genes) %>%
  pivot_longer(!gene, names_to="sample", values_to="expression") %>% 
  group_by(sample) %>% 
  summarise(hemoglobin=sum(expression)) %>% 
  merge(., t_all, by="sample") %>% 
  mutate(Transcriptome = (hemoglobin/total)*100)

g_hemo <- 
  gnome_genes_update %>% 
  filter(gene %in% hemoglobin_genes) %>% 
  pivot_longer(!gene, names_to="sample", values_to="expression") %>% 
  group_by(sample) %>% 
  summarise(hemoglobin=sum(expression)) %>% 
  merge(., g_all, by="sample") %>% 
  mutate(Genome = (hemoglobin/total)*100)

hemo_plotdata <-
  merge(t_hemo, g_hemo, by="sample") %>% 
  select(sample, Transcriptome, Genome) %>% 
  pivot_longer(!sample, names_to="approach", values_to="prop") %>% 
  merge(., meta, by="sample") %>% 
  mutate(approach=factor(approach, levels=c("Transcriptome", "Genome")),
         disease_stage=factor(disease_stage, levels=c("control", "acute", "peak", "late")),
         rna=factor(rna, levels=c("Virus-", "Virus+")))

hemo_plot1 <- 
  ggplot(hemo_plotdata, aes(x=rna, y=prop)) +
  geom_boxplot() +
  facet_grid(~approach) +
  labs(x="Influenza Viral RNA Status", y=NULL) +
  theme_bw() +
  theme(panel.grid=element_blank())

hemo_plot2 <-
  ggplot(hemo_plotdata, aes(x=disease_stage, y=prop)) +
  geom_boxplot() +
  facet_grid(~approach) +
  labs(x="Influenza Stage", y=NULL) +
  scale_x_discrete(labels=c("Control", "Acute", "Peak", "Late")) +
  theme_bw() +
  theme(panel.grid=element_blank())

hemo_plot <-
  grid.arrange(hemo_plot1, hemo_plot2, ncol=1, left="Proportion of Hemoglobin Reads")

ggsave("Figures/supplemental_hemoplot.jpg", hemo_plot, width=5, height=6, units="in")

2.8.3 Remove hemoglobin genes

txome_hemo <- 
  txome_genes_update %>% 
  filter(!gene %in% hemoglobin_genes)

gnome_hemo <- 
  gnome_genes_update %>% 
  filter(!gene %in% hemoglobin_genes)

2.8.4 Export new gene count matrices

write.csv(txome_hemo, "Output Files/txome_genecounts_locid_nohemo.csv", row.names=FALSE)
write.csv(gnome_hemo, "Output Files/gnome_genecounts_locid_nohemo.csv", row.names=FALSE)