20
20
# ' species <- c("Colletes similis","Halictus ligatus","Perdita californica")
21
21
# ' ncbi_byname(taxa=species, gene = c("coi", "co1"), seqrange = "1:2000")
22
22
# ' }
23
- ncbi_byname <- function (taxa , gene = " COI" , seqrange = " 1:3000" , getrelated = FALSE ,
24
- verbose = TRUE , ... ) {
23
+ ncbi_byname <- function (taxa , gene = " COI" , seqrange = " 1:3000" , getrelated = FALSE , verbose = TRUE , batch_size = 100 , ... ) {
25
24
foo <- function (xx ) {
26
25
mssg(verbose , paste(" Working on " , xx , " ..." , sep = " " ))
27
26
mssg(verbose , " ...retrieving sequence IDs..." )
@@ -51,9 +50,10 @@ ncbi_byname <- function(taxa, gene="COI", seqrange="1:3000", getrelated=FALSE,
51
50
if (! getrelated ) {
52
51
mssg(verbose , paste(" no sequences of " , gene , " for " , xx , sep = " " ))
53
52
res <- data.frame (
54
- xx , NA_character_ , NA_real_ , NA_character_ , NA_real_ , NA_character_ , NA_character_ ,
53
+ taxon = xx , gene_desc = NA_character_ ,
54
+ gi_no = NA_real_ , acc_no = NA_character_ , length = NA_real_ ,
55
+ sequence = NA_character_ , spused = NA_character_ ,
55
56
stringsAsFactors = FALSE )
56
- names(res ) <- NULL
57
57
} else {
58
58
mssg(verbose , " ...retrieving sequence IDs for related species..." )
59
59
newname <- strsplit(xx , " " )[[1 ]][[1 ]]
@@ -68,9 +68,10 @@ ncbi_byname <- function(taxa, gene="COI", seqrange="1:3000", getrelated=FALSE,
68
68
if (as.numeric(xml2 :: xml_text(xml2 :: xml_find_all(out , " //Count" )[[1 ]])) == 0 ) {
69
69
mssg(verbose , paste(" no sequences of " , gene , " for " , xx , " or " , newname , sep = " " ))
70
70
res <- data.frame (
71
- xx , NA_character_ , NA_real_ , NA_character_ , NA_real_ , NA_character_ , NA_character_ ,
71
+ taxon = xx , gene_desc = NA_character_ ,
72
+ gi_no = NA_real_ , acc_no = NA_character_ , length = NA_real_ ,
73
+ sequence = NA_character_ , spused = NA_character_ ,
72
74
stringsAsFactors = FALSE )
73
- names(res ) <- NULL
74
75
} else {
75
76
# # For each species = get GI number with longest sequence
76
77
mssg(verbose , " ...retrieving sequence ID with longest sequence length..." )
@@ -88,30 +89,53 @@ ncbi_byname <- function(taxa, gene="COI", seqrange="1:3000", getrelated=FALSE,
88
89
# # For each species = get GI number with longest sequence
89
90
mssg(verbose , " ...retrieving sequence ID with longest sequence length..." )
90
91
# construct query for species
91
- querysum <- list (db = " nucleotide" , api_key = ncbi_key(),
92
- id = paste(make_ids(out ), collapse = " " ))
93
- z <- con $ get(" entrez/eutils/esummary.fcgi" , query = querysum )
94
- txt <- z $ parse(" UTF-8" )
95
- res <- parse_ncbi(xx , xml2 :: xml_find_all(xml2 :: read_xml(txt ), " //eSummaryResult" )[[1 ]], verbose )
92
+ ids <- make_ids(out )
93
+ res_list <- lapply(seq(1 , length(ids ), by = batch_size ), function (i ) {
94
+ querysum <- list (db = " nucleotide" ,
95
+ id = paste(ids [i : min(i + batch_size - 1 , length(ids ))], collapse = " " ),
96
+ api_key = ncbi_key())
97
+ z <- con $ get(" entrez/eutils/esummary.fcgi" , query = querysum )
98
+ z $ raise_for_status()
99
+ xml2 :: xml_find_all(xml2 :: read_xml(z $ parse(" UTF-8" )), " //eSummaryResult" )
100
+ })
101
+
102
+ res <- do.call(rbind , lapply(res_list , function (x ) parse_ncbi(xx , x , verbose )))
96
103
}
97
-
104
+
98
105
mssg(verbose , " ...done." )
99
106
stats :: setNames(res , c(" taxon" , " gene_desc" , " gi_no" , " acc_no" , " length" , " sequence" , " spused" ))
100
107
}
101
-
108
+
102
109
foo_safe <- tryfail(NULL , foo )
103
- if (length(taxa ) == 1 ){ foo_safe(taxa ) } else { lapply(taxa , foo_safe ) }
110
+ if (length(taxa ) == 1 ){
111
+ return (foo_safe(taxa ))
112
+ } else {
113
+ return (do.call(rbind , lapply(taxa , foo_safe )))
114
+ }
104
115
}
105
116
106
117
parse_ncbi <- function (xx , z , verbose ) {
118
+
107
119
names <- xml2 :: xml_attr(xml2 :: xml_find_all(z , " //Item" ), " Name" ) # gets names of values in summary
120
+
121
+ if (length(names ) == 0 ) {
122
+ message(" No sequences found for " , xx )
123
+ return (data.frame (taxon = xx , gene_desc = NA ,
124
+ gi_no = NA , acc_no = NA , length = NA ,
125
+ sequence = NA , spused = NA ,
126
+ stringsAsFactors = FALSE ))
127
+ }
128
+
108
129
prd <- xml2 :: xml_text(xml2 :: xml_find_all(z , ' //Item[@Name="Caption"]' )) # get access numbers
109
130
prd <- sapply(prd , function (x ) strsplit(x , " _" )[[1 ]][[1 ]], USE.NAMES = FALSE )
110
131
l_ <- as.numeric(xml2 :: xml_text(xml2 :: xml_find_all(z , ' //Item[@Name="Length"]' ))) # gets seq lengths
111
132
gis <- as.numeric(xml2 :: xml_text(xml2 :: xml_find_all(z , ' //Item[@Name="Gi"]' ))) # gets GI numbers
112
133
sns <- xml2 :: xml_text(xml2 :: xml_find_all(z , ' //Item[@Name="Title"]' )) # gets seq lengths # get spp names
113
134
df <- data.frame (gis = gis , length = l_ , spnames = sns , predicted = prd , stringsAsFactors = FALSE )
114
135
df <- df [! df $ predicted %in% c(" XM" ," XR" ),] # remove predicted sequences
136
+ if (nrow(df ) == 0 ) {
137
+ return (data.frame (taxon = xx , gene_desc = NA , gi_no = NA , acc_no = NA , length = NA , sequence = NA , spused = NA , stringsAsFactors = FALSE ))
138
+ }
115
139
gisuse <- df [which.max(x = df $ length ),] # picks longest sequnence length
116
140
if (NROW(gisuse ) > 1 ) {
117
141
gisuse <- gisuse [sample(NROW(gisuse ), 1 ), ]
@@ -126,12 +150,25 @@ parse_ncbi <- function(xx, z, verbose) {
126
150
outseq <- w $ parse(" UTF-8" )
127
151
seq <- gsub(" \n " , " " , strsplit(sub(" \n " , " <<<" , outseq ), " <<<" )[[1 ]][[2 ]])
128
152
accessnum <- strsplit(outseq , " \\ |" )[[1 ]][4 ]
129
- outt <- list (xx , as.character(gisuse [,3 ]), gisuse [,1 ], accessnum , gisuse [,2 ], seq )
153
+ outt <- list (taxon = xx , gene_desc = as.character(gisuse [,3 ]),
154
+ gi_no = gisuse [,1 ], acc_no = accessnum ,
155
+ length = gisuse [,2 ], sequence = seq ,
156
+ spused = paste(
157
+ strsplit(as.character(gisuse [,3 ]),
158
+ " " )[[1 ]][1 : 2 ],
159
+ collapse = " " ))
130
160
131
161
spused <- paste(strsplit(outt [[2 ]], " " )[[1 ]][1 : 2 ], sep = " " , collapse = " " )
132
- outoutout <- data.frame (outt , spused = spused , stringsAsFactors = FALSE )
133
- names( outoutout ) <- NULL
134
- outoutout
162
+ outoutout <- data.frame (outt , stringsAsFactors = FALSE )
163
+
164
+ return ( outoutout )
135
165
}
136
166
137
167
make_ids <- function (x ) as.numeric(xml2 :: xml_text(xml2 :: xml_find_all(x , " //IdList//Id" )))
168
+
169
+ tryfail <- function (default , code ) {
170
+ tryCatch(code , error = function (e ) {
171
+ message(e )
172
+ return (default )
173
+ })
174
+ }
0 commit comments