Skip to content

Commit 611c7aa

Browse files
committed
ann rte fix
1 parent 0667135 commit 611c7aa

File tree

1 file changed

+69
-52
lines changed

1 file changed

+69
-52
lines changed

workflow/aref/scripts/annotate_rtes.R

Lines changed: 69 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -308,16 +308,16 @@ tryCatch(
308308
names(ss) <- c(row$element_orf_id)
309309
orf_ss_all <- c(orf_ss_all, ss)
310310
}
311-
311+
312312
system(sprintf("makeblastdb -in %s -dbtype 'prot'", orf_aa_consensus_path))
313313

314314
orf_aa_ss_all <- Biostrings::translate(orf_ss_all)
315315
orf_aa_ss_all_path <- sprintf("%s/intactness_annotation_workdir/%s_all_orfs_aa.fa", outputdir, element)
316-
writeXStringSet(orf_aa_ss_all,orf_aa_ss_all_path)
316+
writeXStringSet(orf_aa_ss_all, orf_aa_ss_all_path)
317317

318318
orf_aa_ss_all_blast_results_path <- sprintf("%s/intactness_annotation_workdir/%s_orf_length_%s_aa_blast_to_consensus_results.tsv", outputdir, element, modal_width)
319-
system(sprintf("blastp -db %s -query %s -out %s -outfmt 7",orf_aa_consensus_path, orf_aa_ss_all_path, orf_aa_ss_all_blast_results_path))
320-
bres <- read_delim(orf_aa_ss_all_blast_results_path, comment = "#", delim = "\t", col_names = c("qseqid","sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"))
319+
system(sprintf("blastp -db %s -query %s -out %s -outfmt 7", orf_aa_consensus_path, orf_aa_ss_all_path, orf_aa_ss_all_blast_results_path))
320+
bres <- read_delim(orf_aa_ss_all_blast_results_path, comment = "#", delim = "\t", col_names = c("qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"))
321321
bres <- bres %>%
322322
separate_wider_delim(cols = qseqid, delim = ":", names = c("gene_id", "orf_start")) %>%
323323
mutate(gene_orf_id = paste(gene_id, orf_start, delim = ":"))
@@ -414,7 +414,7 @@ tryCatch(
414414
partial_orf_passes_mutation_threshold = map_chr(partial_orf_passes_mutation_threshold, ~ paste(.x, collapse = ";"))
415415
) %>%
416416
dplyr::select(gene_id, intactness_req, orf_passes_mutation_threshold, partial_orf_passes_mutation_threshold)
417-
},
417+
},
418418
error = function(e) {
419419
print("no elements annotated for intactness")
420420
intactness_ann <- rmfragments %>%
@@ -722,7 +722,7 @@ getannotation <- function(to_be_annotated, regions_of_interest, property, name_i
722722
tibble() %>%
723723
dplyr::select(gene_id, !!property)
724724

725-
#now do it in a stranded fashion
725+
# now do it in a stranded fashion
726726
inregions <- to_be_annotated %>% subsetByOverlaps(regions_of_interest, invert = FALSE, ignore.strand = FALSE)
727727
tryCatch(
728728
{
@@ -747,7 +747,9 @@ getannotation <- function(to_be_annotated, regions_of_interest, property, name_i
747747
tibble() %>%
748748
dplyr::select(gene_id, stranded)
749749

750-
annot <- full_join(annot_unstranded, annot_stranded) %>% mutate(!!paste0(property, "_orientation") := ifelse(!!sym(property) == name_out, name_out, ifelse(!!sym(property) == stranded, "Sense", "Antisense"))) %>% dplyr::select(-stranded)
750+
annot <- full_join(annot_unstranded, annot_stranded) %>%
751+
mutate(!!paste0(property, "_orientation") := ifelse(!!sym(property) == name_out, name_out, ifelse(!!sym(property) == stranded, "Sense", "Antisense"))) %>%
752+
dplyr::select(-stranded)
751753
return(annot)
752754
}
753755

@@ -823,47 +825,49 @@ region_annot <- region_annot %>%
823825
TRUE ~ "Other"
824826
)) %>%
825827
mutate(
826-
loc_integrative_stranded = case_when(
827-
exonic == "Exonic" & exonic_orientation == "Sense" ~ "Exonic_Sense",
828-
intronic == "Intronic" & intronic_orientation == "Sense" ~ "Intronic_Sense",
829-
coding_tx == "CdgTx" & coding_tx_orientation == "Sense" ~ "CdgTxOther_Sense",
830-
noncoding_tx == "NoncdgTx" & noncoding_tx_orientation == "Sense" ~ "NoncdgTx_Sense",
831-
exonic == "Exonic" & exonic_orientation == "Antisense" ~ "Exonic_Antisense",
832-
intronic == "Intronic" & intronic_orientation == "Antisense" ~ "Intronic_Antisense",
833-
coding_tx == "CdgTx" & coding_tx_orientation == "Antisense" ~ "CdgTxOther_Antisense",
834-
noncoding_tx == "NoncdgTx" & noncoding_tx_orientation == "Antisense" ~ "NoncdgTx_Antisense",
835-
coding_tx_adjacent == "CdgTxAdj" & coding_tx_adjacent_orientation == "Sense" ~ "CdgTxAdj_Sense",
836-
noncoding_tx_adjacent == "NoncdgTxAdj" & noncoding_tx_adjacent_orientation == "Sense" ~ "NoncdgTxAdj_Sense",
837-
coding_tx_adjacent == "CdgTxAdj" & coding_tx_adjacent_orientation == "Antisense" ~ "CdgTxAdj_Antisense",
838-
noncoding_tx_adjacent == "NoncdgTxAdj" & noncoding_tx_adjacent_orientation == "Antisense" ~ "NoncdgTxAdj_Antisense",
839-
genic == "Intergenic" ~ "Intergenic",
840-
TRUE ~ "Other"
841-
)) %>%
828+
loc_integrative_stranded = case_when(
829+
exonic == "Exonic" & exonic_orientation == "Sense" ~ "Exonic_Sense",
830+
intronic == "Intronic" & intronic_orientation == "Sense" ~ "Intronic_Sense",
831+
coding_tx == "CdgTx" & coding_tx_orientation == "Sense" ~ "CdgTxOther_Sense",
832+
noncoding_tx == "NoncdgTx" & noncoding_tx_orientation == "Sense" ~ "NoncdgTx_Sense",
833+
exonic == "Exonic" & exonic_orientation == "Antisense" ~ "Exonic_Antisense",
834+
intronic == "Intronic" & intronic_orientation == "Antisense" ~ "Intronic_Antisense",
835+
coding_tx == "CdgTx" & coding_tx_orientation == "Antisense" ~ "CdgTxOther_Antisense",
836+
noncoding_tx == "NoncdgTx" & noncoding_tx_orientation == "Antisense" ~ "NoncdgTx_Antisense",
837+
coding_tx_adjacent == "CdgTxAdj" & coding_tx_adjacent_orientation == "Sense" ~ "CdgTxAdj_Sense",
838+
noncoding_tx_adjacent == "NoncdgTxAdj" & noncoding_tx_adjacent_orientation == "Sense" ~ "NoncdgTxAdj_Sense",
839+
coding_tx_adjacent == "CdgTxAdj" & coding_tx_adjacent_orientation == "Antisense" ~ "CdgTxAdj_Antisense",
840+
noncoding_tx_adjacent == "NoncdgTxAdj" & noncoding_tx_adjacent_orientation == "Antisense" ~ "NoncdgTxAdj_Antisense",
841+
genic == "Intergenic" ~ "Intergenic",
842+
TRUE ~ "Other"
843+
)
844+
) %>%
842845
mutate(
843-
loc_highres_integrative_stranded = case_when(
844-
utr5 == "5UTR" & utr5_orientation == "Sense" ~ "UTR_Sense",
845-
utr3 == "3UTR" & utr3_orientation == "Sense" ~ "UTR_Sense",
846-
exonic == "Exonic" & exonic_orientation == "Sense" ~ "Exonic_Sense",
847-
intronic == "Intronic" & intronic_orientation == "Sense" ~ "Intronic_Sense",
848-
coding_tx == "CdgTx" & coding_tx_orientation == "Sense" ~ "CdgTxOther_Sense",
849-
noncoding_tx == "NoncdgTx" & noncoding_tx_orientation == "Sense" ~ "NoncdgTx_Sense",
850-
utr5 == "5UTR" & utr5_orientation == "Antisense" ~ "UTR_Antisense",
851-
utr3 == "3UTR" & utr3_orientation == "Antisense" ~ "UTR_Antisense",
852-
exonic == "Exonic" & exonic_orientation == "Antisense" ~ "Exonic_Antisense",
853-
intronic == "Intronic" & intronic_orientation == "Antisense" ~ "Intronic_Antisense",
854-
coding_tx == "CdgTx" & coding_tx_orientation == "Antisense" ~ "CdgTxOther_Antisense",
855-
noncoding_tx == "NoncdgTx" & noncoding_tx_orientation == "Antisense" ~ "NoncdgTx_Antisense",
856-
coding_tx_downstream == "CdgTxAdjDwn" & coding_tx_downstream_orientation == "Sense" ~ "CdgTxAdjDwn_Sense",
857-
noncoding_tx_downstream == "NoncdgTxAdjDwn" & noncoding_tx_downstream_orientation == "Sense" ~ "NoncdgTxAdjDwn_Sense",
858-
coding_tx_upstream == "CdgTxAdjUp" & coding_tx_upstream_orientation == "Sense" ~ "CdgTxAdjUp_Sense",
859-
noncoding_tx_upstream == "NoncdgTxAdjUp" & noncoding_tx_upstream_orientation == "Sense" ~ "NoncdgTxAdjUp_Sense",
860-
coding_tx_downstream == "CdgTxAdjDwn" & coding_tx_downstream_orientation == "Antisense" ~ "CdgTxAdjDwn_Antisense",
861-
noncoding_tx_downstream == "NoncdgTxAdjDwn" & noncoding_tx_downstream_orientation == "Antisense" ~ "NoncdgTxAdjDwn_Antisense",
862-
coding_tx_upstream == "CdgTxAdjUp" & coding_tx_upstream_orientation == "Antisense" ~ "CdgTxAdjUp_Antisense",
863-
noncoding_tx_upstream == "NoncdgTxAdjUp" & noncoding_tx_upstream_orientation == "Antisense" ~ "NoncdgTxAdjUp_Antisense",
864-
genic == "Intergenic" ~ "Intergenic",
865-
TRUE ~ "Other"
866-
)) %>%
846+
loc_highres_integrative_stranded = case_when(
847+
utr5 == "5UTR" & utr5_orientation == "Sense" ~ "UTR_Sense",
848+
utr3 == "3UTR" & utr3_orientation == "Sense" ~ "UTR_Sense",
849+
exonic == "Exonic" & exonic_orientation == "Sense" ~ "Exonic_Sense",
850+
intronic == "Intronic" & intronic_orientation == "Sense" ~ "Intronic_Sense",
851+
coding_tx == "CdgTx" & coding_tx_orientation == "Sense" ~ "CdgTxOther_Sense",
852+
noncoding_tx == "NoncdgTx" & noncoding_tx_orientation == "Sense" ~ "NoncdgTx_Sense",
853+
utr5 == "5UTR" & utr5_orientation == "Antisense" ~ "UTR_Antisense",
854+
utr3 == "3UTR" & utr3_orientation == "Antisense" ~ "UTR_Antisense",
855+
exonic == "Exonic" & exonic_orientation == "Antisense" ~ "Exonic_Antisense",
856+
intronic == "Intronic" & intronic_orientation == "Antisense" ~ "Intronic_Antisense",
857+
coding_tx == "CdgTx" & coding_tx_orientation == "Antisense" ~ "CdgTxOther_Antisense",
858+
noncoding_tx == "NoncdgTx" & noncoding_tx_orientation == "Antisense" ~ "NoncdgTx_Antisense",
859+
coding_tx_downstream == "CdgTxAdjDwn" & coding_tx_downstream_orientation == "Sense" ~ "CdgTxAdjDwn_Sense",
860+
noncoding_tx_downstream == "NoncdgTxAdjDwn" & noncoding_tx_downstream_orientation == "Sense" ~ "NoncdgTxAdjDwn_Sense",
861+
coding_tx_upstream == "CdgTxAdjUp" & coding_tx_upstream_orientation == "Sense" ~ "CdgTxAdjUp_Sense",
862+
noncoding_tx_upstream == "NoncdgTxAdjUp" & noncoding_tx_upstream_orientation == "Sense" ~ "NoncdgTxAdjUp_Sense",
863+
coding_tx_downstream == "CdgTxAdjDwn" & coding_tx_downstream_orientation == "Antisense" ~ "CdgTxAdjDwn_Antisense",
864+
noncoding_tx_downstream == "NoncdgTxAdjDwn" & noncoding_tx_downstream_orientation == "Antisense" ~ "NoncdgTxAdjDwn_Antisense",
865+
coding_tx_upstream == "CdgTxAdjUp" & coding_tx_upstream_orientation == "Antisense" ~ "CdgTxAdjUp_Antisense",
866+
noncoding_tx_upstream == "NoncdgTxAdjUp" & noncoding_tx_upstream_orientation == "Antisense" ~ "NoncdgTxAdjUp_Antisense",
867+
genic == "Intergenic" ~ "Intergenic",
868+
TRUE ~ "Other"
869+
)
870+
) %>%
867871
mutate(loc_lowres_integrative_stranded = case_when(
868872
loc_integrative == "Exonic" & exonic_orientation == "Sense" ~ "Genic_Sense",
869873
loc_integrative == "Intronic" & intronic_orientation == "Sense" ~ "Genic_Sense",
@@ -885,19 +889,32 @@ dist_to_nearest_coding_tx <- distanceToNearest(rmfragmentsgr_properinsertloc, co
885889
as.data.frame() %>%
886890
tibble()
887891
dist_to_nearest_coding_tx_df <- tibble(gene_id = mcols(rmfragmentsgr_properinsertloc[dist_to_nearest_coding_tx$queryHits, ])$gene_id, dist_to_nearest_coding_tx = dist_to_nearest_coding_tx$distance)
888-
nearest_coding_tx_df <- tibble(gene_id =mcols(rmfragmentsgr_properinsertloc)$gene_id, nearest_coding_tx = mcols(coding_transcripts[nearest(rmfragmentsgr_properinsertloc, coding_transcripts, ignore.strand = TRUE)])$gene_id)
892+
nearest_indices <- nearest(rmfragmentsgr_properinsertloc, coding_transcripts, ignore.strand = TRUE)
893+
valid_indices <- !is.na(nearest_indices)
894+
result <- rep(NA, length(nearest_indices))
895+
result[valid_indices] <- mcols(coding_transcripts[nearest_indices[valid_indices]])$gene_id
896+
nearest_coding_tx_df <- tibble(gene_id = mcols(rmfragmentsgr_properinsertloc)$gene_id, nearest_coding_tx = result)
889897

890898
dist_to_nearest_noncoding_tx <- distanceToNearest(rmfragmentsgr_properinsertloc, noncoding_transcripts, ignore.strand = TRUE) %>%
891899
as.data.frame() %>%
892900
tibble()
893901
dist_to_nearest_noncoding_tx_df <- tibble(gene_id = mcols(rmfragmentsgr_properinsertloc[dist_to_nearest_noncoding_tx$queryHits, ])$gene_id, dist_to_nearest_noncoding_tx = dist_to_nearest_noncoding_tx$distance)
894-
nearest_noncoding_tx_df <- tibble(gene_id =mcols(rmfragmentsgr_properinsertloc)$gene_id, nearest_noncoding_tx = mcols(noncoding_transcripts[nearest(rmfragmentsgr_properinsertloc, noncoding_transcripts, ignore.strand = TRUE)])$gene_id)
902+
nearest_indices <- nearest(rmfragmentsgr_properinsertloc, noncoding_transcripts, ignore.strand = TRUE)
903+
valid_indices <- !is.na(nearest_indices)
904+
result <- rep(NA, length(nearest_indices))
905+
result[valid_indices] <- mcols(noncoding_transcripts[nearest_indices[valid_indices]])$gene_id
906+
nearest_noncoding_tx_df <- tibble(gene_id = mcols(rmfragmentsgr_properinsertloc)$gene_id, nearest_noncoding_tx = result)
907+
895908

896909
dist_to_nearest_tx <- distanceToNearest(rmfragmentsgr_properinsertloc, transcripts, ignore.strand = TRUE) %>%
897910
as.data.frame() %>%
898911
tibble()
899912
dist_to_nearest_tx_df <- tibble(gene_id = mcols(rmfragmentsgr_properinsertloc[dist_to_nearest_tx$queryHits, ])$gene_id, dist_to_nearest_tx = dist_to_nearest_tx$distance)
900-
nearest_tx_df <- tibble(gene_id =mcols(rmfragmentsgr_properinsertloc)$gene_id, nearest_tx = mcols(transcripts[nearest(rmfragmentsgr_properinsertloc, transcripts, ignore.strand = TRUE)])$gene_id)
913+
nearest_indices <- nearest(rmfragmentsgr_properinsertloc, transcripts, ignore.strand = TRUE)
914+
valid_indices <- !is.na(nearest_indices)
915+
result <- rep(NA, length(nearest_indices))
916+
result[valid_indices] <- mcols(transcripts[nearest_indices[valid_indices]])$gene_id
917+
nearest_tx_df <- tibble(gene_id = mcols(rmfragmentsgr_properinsertloc)$gene_id, nearest_tx = result)
901918

902919
dist_to_nearest_txs_df <- left_join(rmfamilies %>% dplyr::select(gene_id), dist_to_nearest_coding_tx_df) %>%
903920
left_join(dist_to_nearest_noncoding_tx_df) %>%
@@ -909,7 +926,7 @@ dist_to_nearest_txs_df <- left_join(rmfamilies %>% dplyr::select(gene_id), dist_
909926

910927

911928
########################################################## PULL EVERYTHING TOEGETHER
912-
length(dist_to_nearest_noncoding_tx)
929+
length(dist_to_nearest_noncoding_tx)
913930
length(dist_to_nearest_coding_tx)
914931
length(rownames(rmfamilies))
915932
length(rmfragmentsgr_properinsertloc)
@@ -919,7 +936,7 @@ annots <- rmfamilies %>%
919936
full_join(req_annot) %>%
920937
full_join(ltr_viral_status) %>%
921938
full_join(ltr_proviral_groups) %>%
922-
full_join(region_annot %>% rename_at(vars(-gene_id, -loc_integrative, -loc_lowres_integrative,-loc_highres_integrative, -loc_integrative_stranded, -loc_lowres_integrative_stranded, -loc_highres_integrative_stranded), ~ paste0(., "_loc"))) %>%
939+
full_join(region_annot %>% rename_at(vars(-gene_id, -loc_integrative, -loc_lowres_integrative, -loc_highres_integrative, -loc_integrative_stranded, -loc_lowres_integrative_stranded, -loc_highres_integrative_stranded), ~ paste0(., "_loc"))) %>%
923940
full_join(dist_to_nearest_txs_df)
924941

925942

0 commit comments

Comments
 (0)