@@ -308,16 +308,16 @@ tryCatch(
308
308
names(ss ) <- c(row $ element_orf_id )
309
309
orf_ss_all <- c(orf_ss_all , ss )
310
310
}
311
-
311
+
312
312
system(sprintf(" makeblastdb -in %s -dbtype 'prot'" , orf_aa_consensus_path ))
313
313
314
314
orf_aa_ss_all <- Biostrings :: translate(orf_ss_all )
315
315
orf_aa_ss_all_path <- sprintf(" %s/intactness_annotation_workdir/%s_all_orfs_aa.fa" , outputdir , element )
316
- writeXStringSet(orf_aa_ss_all ,orf_aa_ss_all_path )
316
+ writeXStringSet(orf_aa_ss_all , orf_aa_ss_all_path )
317
317
318
318
orf_aa_ss_all_blast_results_path <- sprintf(" %s/intactness_annotation_workdir/%s_orf_length_%s_aa_blast_to_consensus_results.tsv" , outputdir , element , modal_width )
319
- system(sprintf(" blastp -db %s -query %s -out %s -outfmt 7" ,orf_aa_consensus_path , orf_aa_ss_all_path , orf_aa_ss_all_blast_results_path ))
320
- bres <- read_delim(orf_aa_ss_all_blast_results_path , comment = " #" , delim = " \t " , col_names = c(" qseqid" ," sseqid" , " pident" , " length" , " mismatch" , " gapopen" , " qstart" , " qend" , " sstart" , " send" , " evalue" , " bitscore" ))
319
+ system(sprintf(" blastp -db %s -query %s -out %s -outfmt 7" , orf_aa_consensus_path , orf_aa_ss_all_path , orf_aa_ss_all_blast_results_path ))
320
+ bres <- read_delim(orf_aa_ss_all_blast_results_path , comment = " #" , delim = " \t " , col_names = c(" qseqid" , " sseqid" , " pident" , " length" , " mismatch" , " gapopen" , " qstart" , " qend" , " sstart" , " send" , " evalue" , " bitscore" ))
321
321
bres <- bres %> %
322
322
separate_wider_delim(cols = qseqid , delim = " :" , names = c(" gene_id" , " orf_start" )) %> %
323
323
mutate(gene_orf_id = paste(gene_id , orf_start , delim = " :" ))
@@ -414,7 +414,7 @@ tryCatch(
414
414
partial_orf_passes_mutation_threshold = map_chr(partial_orf_passes_mutation_threshold , ~ paste(.x , collapse = " ;" ))
415
415
) %> %
416
416
dplyr :: select(gene_id , intactness_req , orf_passes_mutation_threshold , partial_orf_passes_mutation_threshold )
417
- },
417
+ },
418
418
error = function (e ) {
419
419
print(" no elements annotated for intactness" )
420
420
intactness_ann <- rmfragments %> %
@@ -722,7 +722,7 @@ getannotation <- function(to_be_annotated, regions_of_interest, property, name_i
722
722
tibble() %> %
723
723
dplyr :: select(gene_id , !! property )
724
724
725
- # now do it in a stranded fashion
725
+ # now do it in a stranded fashion
726
726
inregions <- to_be_annotated %> % subsetByOverlaps(regions_of_interest , invert = FALSE , ignore.strand = FALSE )
727
727
tryCatch(
728
728
{
@@ -747,7 +747,9 @@ getannotation <- function(to_be_annotated, regions_of_interest, property, name_i
747
747
tibble() %> %
748
748
dplyr :: select(gene_id , stranded )
749
749
750
- annot <- full_join(annot_unstranded , annot_stranded ) %> % mutate(!! paste0(property , " _orientation" ) : = ifelse(!! sym(property ) == name_out , name_out , ifelse(!! sym(property ) == stranded , " Sense" , " Antisense" ))) %> % dplyr :: select(- stranded )
750
+ annot <- full_join(annot_unstranded , annot_stranded ) %> %
751
+ mutate(!! paste0(property , " _orientation" ) : = ifelse(!! sym(property ) == name_out , name_out , ifelse(!! sym(property ) == stranded , " Sense" , " Antisense" ))) %> %
752
+ dplyr :: select(- stranded )
751
753
return (annot )
752
754
}
753
755
@@ -823,47 +825,49 @@ region_annot <- region_annot %>%
823
825
TRUE ~ " Other"
824
826
)) %> %
825
827
mutate(
826
- loc_integrative_stranded = case_when(
827
- exonic == " Exonic" & exonic_orientation == " Sense" ~ " Exonic_Sense" ,
828
- intronic == " Intronic" & intronic_orientation == " Sense" ~ " Intronic_Sense" ,
829
- coding_tx == " CdgTx" & coding_tx_orientation == " Sense" ~ " CdgTxOther_Sense" ,
830
- noncoding_tx == " NoncdgTx" & noncoding_tx_orientation == " Sense" ~ " NoncdgTx_Sense" ,
831
- exonic == " Exonic" & exonic_orientation == " Antisense" ~ " Exonic_Antisense" ,
832
- intronic == " Intronic" & intronic_orientation == " Antisense" ~ " Intronic_Antisense" ,
833
- coding_tx == " CdgTx" & coding_tx_orientation == " Antisense" ~ " CdgTxOther_Antisense" ,
834
- noncoding_tx == " NoncdgTx" & noncoding_tx_orientation == " Antisense" ~ " NoncdgTx_Antisense" ,
835
- coding_tx_adjacent == " CdgTxAdj" & coding_tx_adjacent_orientation == " Sense" ~ " CdgTxAdj_Sense" ,
836
- noncoding_tx_adjacent == " NoncdgTxAdj" & noncoding_tx_adjacent_orientation == " Sense" ~ " NoncdgTxAdj_Sense" ,
837
- coding_tx_adjacent == " CdgTxAdj" & coding_tx_adjacent_orientation == " Antisense" ~ " CdgTxAdj_Antisense" ,
838
- noncoding_tx_adjacent == " NoncdgTxAdj" & noncoding_tx_adjacent_orientation == " Antisense" ~ " NoncdgTxAdj_Antisense" ,
839
- genic == " Intergenic" ~ " Intergenic" ,
840
- TRUE ~ " Other"
841
- )) %> %
828
+ loc_integrative_stranded = case_when(
829
+ exonic == " Exonic" & exonic_orientation == " Sense" ~ " Exonic_Sense" ,
830
+ intronic == " Intronic" & intronic_orientation == " Sense" ~ " Intronic_Sense" ,
831
+ coding_tx == " CdgTx" & coding_tx_orientation == " Sense" ~ " CdgTxOther_Sense" ,
832
+ noncoding_tx == " NoncdgTx" & noncoding_tx_orientation == " Sense" ~ " NoncdgTx_Sense" ,
833
+ exonic == " Exonic" & exonic_orientation == " Antisense" ~ " Exonic_Antisense" ,
834
+ intronic == " Intronic" & intronic_orientation == " Antisense" ~ " Intronic_Antisense" ,
835
+ coding_tx == " CdgTx" & coding_tx_orientation == " Antisense" ~ " CdgTxOther_Antisense" ,
836
+ noncoding_tx == " NoncdgTx" & noncoding_tx_orientation == " Antisense" ~ " NoncdgTx_Antisense" ,
837
+ coding_tx_adjacent == " CdgTxAdj" & coding_tx_adjacent_orientation == " Sense" ~ " CdgTxAdj_Sense" ,
838
+ noncoding_tx_adjacent == " NoncdgTxAdj" & noncoding_tx_adjacent_orientation == " Sense" ~ " NoncdgTxAdj_Sense" ,
839
+ coding_tx_adjacent == " CdgTxAdj" & coding_tx_adjacent_orientation == " Antisense" ~ " CdgTxAdj_Antisense" ,
840
+ noncoding_tx_adjacent == " NoncdgTxAdj" & noncoding_tx_adjacent_orientation == " Antisense" ~ " NoncdgTxAdj_Antisense" ,
841
+ genic == " Intergenic" ~ " Intergenic" ,
842
+ TRUE ~ " Other"
843
+ )
844
+ ) %> %
842
845
mutate(
843
- loc_highres_integrative_stranded = case_when(
844
- utr5 == " 5UTR" & utr5_orientation == " Sense" ~ " UTR_Sense" ,
845
- utr3 == " 3UTR" & utr3_orientation == " Sense" ~ " UTR_Sense" ,
846
- exonic == " Exonic" & exonic_orientation == " Sense" ~ " Exonic_Sense" ,
847
- intronic == " Intronic" & intronic_orientation == " Sense" ~ " Intronic_Sense" ,
848
- coding_tx == " CdgTx" & coding_tx_orientation == " Sense" ~ " CdgTxOther_Sense" ,
849
- noncoding_tx == " NoncdgTx" & noncoding_tx_orientation == " Sense" ~ " NoncdgTx_Sense" ,
850
- utr5 == " 5UTR" & utr5_orientation == " Antisense" ~ " UTR_Antisense" ,
851
- utr3 == " 3UTR" & utr3_orientation == " Antisense" ~ " UTR_Antisense" ,
852
- exonic == " Exonic" & exonic_orientation == " Antisense" ~ " Exonic_Antisense" ,
853
- intronic == " Intronic" & intronic_orientation == " Antisense" ~ " Intronic_Antisense" ,
854
- coding_tx == " CdgTx" & coding_tx_orientation == " Antisense" ~ " CdgTxOther_Antisense" ,
855
- noncoding_tx == " NoncdgTx" & noncoding_tx_orientation == " Antisense" ~ " NoncdgTx_Antisense" ,
856
- coding_tx_downstream == " CdgTxAdjDwn" & coding_tx_downstream_orientation == " Sense" ~ " CdgTxAdjDwn_Sense" ,
857
- noncoding_tx_downstream == " NoncdgTxAdjDwn" & noncoding_tx_downstream_orientation == " Sense" ~ " NoncdgTxAdjDwn_Sense" ,
858
- coding_tx_upstream == " CdgTxAdjUp" & coding_tx_upstream_orientation == " Sense" ~ " CdgTxAdjUp_Sense" ,
859
- noncoding_tx_upstream == " NoncdgTxAdjUp" & noncoding_tx_upstream_orientation == " Sense" ~ " NoncdgTxAdjUp_Sense" ,
860
- coding_tx_downstream == " CdgTxAdjDwn" & coding_tx_downstream_orientation == " Antisense" ~ " CdgTxAdjDwn_Antisense" ,
861
- noncoding_tx_downstream == " NoncdgTxAdjDwn" & noncoding_tx_downstream_orientation == " Antisense" ~ " NoncdgTxAdjDwn_Antisense" ,
862
- coding_tx_upstream == " CdgTxAdjUp" & coding_tx_upstream_orientation == " Antisense" ~ " CdgTxAdjUp_Antisense" ,
863
- noncoding_tx_upstream == " NoncdgTxAdjUp" & noncoding_tx_upstream_orientation == " Antisense" ~ " NoncdgTxAdjUp_Antisense" ,
864
- genic == " Intergenic" ~ " Intergenic" ,
865
- TRUE ~ " Other"
866
- )) %> %
846
+ loc_highres_integrative_stranded = case_when(
847
+ utr5 == " 5UTR" & utr5_orientation == " Sense" ~ " UTR_Sense" ,
848
+ utr3 == " 3UTR" & utr3_orientation == " Sense" ~ " UTR_Sense" ,
849
+ exonic == " Exonic" & exonic_orientation == " Sense" ~ " Exonic_Sense" ,
850
+ intronic == " Intronic" & intronic_orientation == " Sense" ~ " Intronic_Sense" ,
851
+ coding_tx == " CdgTx" & coding_tx_orientation == " Sense" ~ " CdgTxOther_Sense" ,
852
+ noncoding_tx == " NoncdgTx" & noncoding_tx_orientation == " Sense" ~ " NoncdgTx_Sense" ,
853
+ utr5 == " 5UTR" & utr5_orientation == " Antisense" ~ " UTR_Antisense" ,
854
+ utr3 == " 3UTR" & utr3_orientation == " Antisense" ~ " UTR_Antisense" ,
855
+ exonic == " Exonic" & exonic_orientation == " Antisense" ~ " Exonic_Antisense" ,
856
+ intronic == " Intronic" & intronic_orientation == " Antisense" ~ " Intronic_Antisense" ,
857
+ coding_tx == " CdgTx" & coding_tx_orientation == " Antisense" ~ " CdgTxOther_Antisense" ,
858
+ noncoding_tx == " NoncdgTx" & noncoding_tx_orientation == " Antisense" ~ " NoncdgTx_Antisense" ,
859
+ coding_tx_downstream == " CdgTxAdjDwn" & coding_tx_downstream_orientation == " Sense" ~ " CdgTxAdjDwn_Sense" ,
860
+ noncoding_tx_downstream == " NoncdgTxAdjDwn" & noncoding_tx_downstream_orientation == " Sense" ~ " NoncdgTxAdjDwn_Sense" ,
861
+ coding_tx_upstream == " CdgTxAdjUp" & coding_tx_upstream_orientation == " Sense" ~ " CdgTxAdjUp_Sense" ,
862
+ noncoding_tx_upstream == " NoncdgTxAdjUp" & noncoding_tx_upstream_orientation == " Sense" ~ " NoncdgTxAdjUp_Sense" ,
863
+ coding_tx_downstream == " CdgTxAdjDwn" & coding_tx_downstream_orientation == " Antisense" ~ " CdgTxAdjDwn_Antisense" ,
864
+ noncoding_tx_downstream == " NoncdgTxAdjDwn" & noncoding_tx_downstream_orientation == " Antisense" ~ " NoncdgTxAdjDwn_Antisense" ,
865
+ coding_tx_upstream == " CdgTxAdjUp" & coding_tx_upstream_orientation == " Antisense" ~ " CdgTxAdjUp_Antisense" ,
866
+ noncoding_tx_upstream == " NoncdgTxAdjUp" & noncoding_tx_upstream_orientation == " Antisense" ~ " NoncdgTxAdjUp_Antisense" ,
867
+ genic == " Intergenic" ~ " Intergenic" ,
868
+ TRUE ~ " Other"
869
+ )
870
+ ) %> %
867
871
mutate(loc_lowres_integrative_stranded = case_when(
868
872
loc_integrative == " Exonic" & exonic_orientation == " Sense" ~ " Genic_Sense" ,
869
873
loc_integrative == " Intronic" & intronic_orientation == " Sense" ~ " Genic_Sense" ,
@@ -885,19 +889,32 @@ dist_to_nearest_coding_tx <- distanceToNearest(rmfragmentsgr_properinsertloc, co
885
889
as.data.frame() %> %
886
890
tibble()
887
891
dist_to_nearest_coding_tx_df <- tibble(gene_id = mcols(rmfragmentsgr_properinsertloc [dist_to_nearest_coding_tx $ queryHits , ])$ gene_id , dist_to_nearest_coding_tx = dist_to_nearest_coding_tx $ distance )
888
- nearest_coding_tx_df <- tibble(gene_id = mcols(rmfragmentsgr_properinsertloc )$ gene_id , nearest_coding_tx = mcols(coding_transcripts [nearest(rmfragmentsgr_properinsertloc , coding_transcripts , ignore.strand = TRUE )])$ gene_id )
892
+ nearest_indices <- nearest(rmfragmentsgr_properinsertloc , coding_transcripts , ignore.strand = TRUE )
893
+ valid_indices <- ! is.na(nearest_indices )
894
+ result <- rep(NA , length(nearest_indices ))
895
+ result [valid_indices ] <- mcols(coding_transcripts [nearest_indices [valid_indices ]])$ gene_id
896
+ nearest_coding_tx_df <- tibble(gene_id = mcols(rmfragmentsgr_properinsertloc )$ gene_id , nearest_coding_tx = result )
889
897
890
898
dist_to_nearest_noncoding_tx <- distanceToNearest(rmfragmentsgr_properinsertloc , noncoding_transcripts , ignore.strand = TRUE ) %> %
891
899
as.data.frame() %> %
892
900
tibble()
893
901
dist_to_nearest_noncoding_tx_df <- tibble(gene_id = mcols(rmfragmentsgr_properinsertloc [dist_to_nearest_noncoding_tx $ queryHits , ])$ gene_id , dist_to_nearest_noncoding_tx = dist_to_nearest_noncoding_tx $ distance )
894
- nearest_noncoding_tx_df <- tibble(gene_id = mcols(rmfragmentsgr_properinsertloc )$ gene_id , nearest_noncoding_tx = mcols(noncoding_transcripts [nearest(rmfragmentsgr_properinsertloc , noncoding_transcripts , ignore.strand = TRUE )])$ gene_id )
902
+ nearest_indices <- nearest(rmfragmentsgr_properinsertloc , noncoding_transcripts , ignore.strand = TRUE )
903
+ valid_indices <- ! is.na(nearest_indices )
904
+ result <- rep(NA , length(nearest_indices ))
905
+ result [valid_indices ] <- mcols(noncoding_transcripts [nearest_indices [valid_indices ]])$ gene_id
906
+ nearest_noncoding_tx_df <- tibble(gene_id = mcols(rmfragmentsgr_properinsertloc )$ gene_id , nearest_noncoding_tx = result )
907
+
895
908
896
909
dist_to_nearest_tx <- distanceToNearest(rmfragmentsgr_properinsertloc , transcripts , ignore.strand = TRUE ) %> %
897
910
as.data.frame() %> %
898
911
tibble()
899
912
dist_to_nearest_tx_df <- tibble(gene_id = mcols(rmfragmentsgr_properinsertloc [dist_to_nearest_tx $ queryHits , ])$ gene_id , dist_to_nearest_tx = dist_to_nearest_tx $ distance )
900
- nearest_tx_df <- tibble(gene_id = mcols(rmfragmentsgr_properinsertloc )$ gene_id , nearest_tx = mcols(transcripts [nearest(rmfragmentsgr_properinsertloc , transcripts , ignore.strand = TRUE )])$ gene_id )
913
+ nearest_indices <- nearest(rmfragmentsgr_properinsertloc , transcripts , ignore.strand = TRUE )
914
+ valid_indices <- ! is.na(nearest_indices )
915
+ result <- rep(NA , length(nearest_indices ))
916
+ result [valid_indices ] <- mcols(transcripts [nearest_indices [valid_indices ]])$ gene_id
917
+ nearest_tx_df <- tibble(gene_id = mcols(rmfragmentsgr_properinsertloc )$ gene_id , nearest_tx = result )
901
918
902
919
dist_to_nearest_txs_df <- left_join(rmfamilies %> % dplyr :: select(gene_id ), dist_to_nearest_coding_tx_df ) %> %
903
920
left_join(dist_to_nearest_noncoding_tx_df ) %> %
@@ -909,7 +926,7 @@ dist_to_nearest_txs_df <- left_join(rmfamilies %>% dplyr::select(gene_id), dist_
909
926
910
927
911
928
# ######################################################### PULL EVERYTHING TOEGETHER
912
- length(dist_to_nearest_noncoding_tx )
929
+ length(dist_to_nearest_noncoding_tx )
913
930
length(dist_to_nearest_coding_tx )
914
931
length(rownames(rmfamilies ))
915
932
length(rmfragmentsgr_properinsertloc )
@@ -919,7 +936,7 @@ annots <- rmfamilies %>%
919
936
full_join(req_annot ) %> %
920
937
full_join(ltr_viral_status ) %> %
921
938
full_join(ltr_proviral_groups ) %> %
922
- full_join(region_annot %> % rename_at(vars(- gene_id , - loc_integrative , - loc_lowres_integrative ,- loc_highres_integrative , - loc_integrative_stranded , - loc_lowres_integrative_stranded , - loc_highres_integrative_stranded ), ~ paste0(. , " _loc" ))) %> %
939
+ full_join(region_annot %> % rename_at(vars(- gene_id , - loc_integrative , - loc_lowres_integrative , - loc_highres_integrative , - loc_integrative_stranded , - loc_lowres_integrative_stranded , - loc_highres_integrative_stranded ), ~ paste0(. , " _loc" ))) %> %
923
940
full_join(dist_to_nearest_txs_df )
924
941
925
942
0 commit comments