Skip to content

Commit 25eeb1b

Browse files
authored
Merge pull request #132 from ropensci/129_ncbi_byname_error
fix errors with ncbi_byname reported in #126
2 parents 57c73ef + a8fafee commit 25eeb1b

File tree

2 files changed

+109
-18
lines changed

2 files changed

+109
-18
lines changed

R/ncbi_byname.R

Lines changed: 55 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,7 @@
2020
#' species <- c("Colletes similis","Halictus ligatus","Perdita californica")
2121
#' ncbi_byname(taxa=species, gene = c("coi", "co1"), seqrange = "1:2000")
2222
#' }
23-
ncbi_byname <- function(taxa, gene="COI", seqrange="1:3000", getrelated=FALSE,
24-
verbose=TRUE, ...) {
23+
ncbi_byname <- function(taxa, gene="COI", seqrange="1:3000", getrelated=FALSE, verbose=TRUE, batch_size = 100, ...) {
2524
foo <- function(xx) {
2625
mssg(verbose, paste("Working on ", xx, "...", sep = ""))
2726
mssg(verbose, "...retrieving sequence IDs...")
@@ -51,9 +50,10 @@ ncbi_byname <- function(taxa, gene="COI", seqrange="1:3000", getrelated=FALSE,
5150
if (!getrelated) {
5251
mssg(verbose, paste("no sequences of ", gene, " for ", xx, sep = ""))
5352
res <- data.frame(
54-
xx, NA_character_, NA_real_, NA_character_, NA_real_, NA_character_, NA_character_,
53+
taxon = xx, gene_desc = NA_character_,
54+
gi_no = NA_real_, acc_no = NA_character_, length = NA_real_,
55+
sequence = NA_character_, spused = NA_character_,
5556
stringsAsFactors = FALSE)
56-
names(res) <- NULL
5757
} else {
5858
mssg(verbose, "...retrieving sequence IDs for related species...")
5959
newname <- strsplit(xx, " ")[[1]][[1]]
@@ -68,9 +68,10 @@ ncbi_byname <- function(taxa, gene="COI", seqrange="1:3000", getrelated=FALSE,
6868
if (as.numeric(xml2::xml_text(xml2::xml_find_all(out, "//Count")[[1]])) == 0) {
6969
mssg(verbose, paste("no sequences of ", gene, " for ", xx, " or ", newname, sep = ""))
7070
res <- data.frame(
71-
xx, NA_character_, NA_real_, NA_character_, NA_real_, NA_character_, NA_character_,
71+
taxon = xx, gene_desc = NA_character_,
72+
gi_no = NA_real_, acc_no = NA_character_, length = NA_real_,
73+
sequence = NA_character_, spused = NA_character_,
7274
stringsAsFactors = FALSE)
73-
names(res) <- NULL
7475
} else {
7576
## For each species = get GI number with longest sequence
7677
mssg(verbose, "...retrieving sequence ID with longest sequence length...")
@@ -88,30 +89,53 @@ ncbi_byname <- function(taxa, gene="COI", seqrange="1:3000", getrelated=FALSE,
8889
## For each species = get GI number with longest sequence
8990
mssg(verbose, "...retrieving sequence ID with longest sequence length...")
9091
# construct query for species
91-
querysum <- list(db = "nucleotide", api_key = ncbi_key(),
92-
id = paste(make_ids(out), collapse = " "))
93-
z <- con$get("entrez/eutils/esummary.fcgi", query = querysum)
94-
txt <- z$parse("UTF-8")
95-
res <- parse_ncbi(xx, xml2::xml_find_all(xml2::read_xml(txt), "//eSummaryResult")[[1]], verbose)
92+
ids <- make_ids(out)
93+
res_list <- lapply(seq(1, length(ids), by = batch_size), function(i) {
94+
querysum <- list(db = "nucleotide",
95+
id = paste(ids[i:min(i + batch_size - 1, length(ids))], collapse = " "),
96+
api_key = ncbi_key())
97+
z <- con$get("entrez/eutils/esummary.fcgi", query = querysum)
98+
z$raise_for_status()
99+
xml2::xml_find_all(xml2::read_xml(z$parse("UTF-8")), "//eSummaryResult")
100+
})
101+
102+
res <- do.call(rbind, lapply(res_list, function(x) parse_ncbi(xx, x, verbose)))
96103
}
97-
104+
98105
mssg(verbose, "...done.")
99106
stats::setNames(res, c("taxon", "gene_desc", "gi_no", "acc_no", "length", "sequence", "spused"))
100107
}
101-
108+
102109
foo_safe <- tryfail(NULL, foo)
103-
if (length(taxa) == 1){ foo_safe(taxa) } else { lapply(taxa, foo_safe) }
110+
if (length(taxa) == 1){
111+
return(foo_safe(taxa))
112+
} else {
113+
return(do.call(rbind, lapply(taxa, foo_safe)))
114+
}
104115
}
105116

106117
parse_ncbi <- function(xx, z, verbose) {
118+
107119
names <- xml2::xml_attr(xml2::xml_find_all(z, "//Item"), "Name") # gets names of values in summary
120+
121+
if(length(names) == 0) {
122+
message("No sequences found for ", xx)
123+
return(data.frame(taxon = xx, gene_desc = NA,
124+
gi_no = NA, acc_no = NA, length = NA,
125+
sequence = NA, spused = NA,
126+
stringsAsFactors = FALSE))
127+
}
128+
108129
prd <- xml2::xml_text(xml2::xml_find_all(z, '//Item[@Name="Caption"]')) # get access numbers
109130
prd <- sapply(prd, function(x) strsplit(x, "_")[[1]][[1]], USE.NAMES=FALSE)
110131
l_ <- as.numeric(xml2::xml_text(xml2::xml_find_all(z, '//Item[@Name="Length"]'))) # gets seq lengths
111132
gis <- as.numeric(xml2::xml_text(xml2::xml_find_all(z, '//Item[@Name="Gi"]'))) # gets GI numbers
112133
sns <- xml2::xml_text(xml2::xml_find_all(z, '//Item[@Name="Title"]')) # gets seq lengths # get spp names
113134
df <- data.frame(gis=gis, length=l_, spnames=sns, predicted=prd, stringsAsFactors = FALSE)
114135
df <- df[!df$predicted %in% c("XM","XR"),] # remove predicted sequences
136+
if (nrow(df) == 0) {
137+
return(data.frame(taxon = xx, gene_desc = NA, gi_no = NA, acc_no = NA, length = NA, sequence = NA, spused = NA, stringsAsFactors = FALSE))
138+
}
115139
gisuse <- df[which.max(x=df$length),] # picks longest sequnence length
116140
if (NROW(gisuse) > 1) {
117141
gisuse <- gisuse[sample(NROW(gisuse), 1), ]
@@ -126,12 +150,25 @@ parse_ncbi <- function(xx, z, verbose) {
126150
outseq <- w$parse("UTF-8")
127151
seq <- gsub("\n", "", strsplit(sub("\n", "<<<", outseq), "<<<")[[1]][[2]])
128152
accessnum <- strsplit(outseq, "\\|")[[1]][4]
129-
outt <- list(xx, as.character(gisuse[,3]), gisuse[,1], accessnum, gisuse[,2], seq)
153+
outt <- list(taxon = xx, gene_desc = as.character(gisuse[,3]),
154+
gi_no = gisuse[,1], acc_no = accessnum,
155+
length = gisuse[,2], sequence = seq,
156+
spused = paste(
157+
strsplit(as.character(gisuse[,3]),
158+
" ")[[1]][1:2],
159+
collapse = " "))
130160

131161
spused <- paste(strsplit(outt[[2]], " ")[[1]][1:2], sep = "", collapse = " ")
132-
outoutout <- data.frame(outt, spused = spused, stringsAsFactors = FALSE)
133-
names(outoutout) <- NULL
134-
outoutout
162+
outoutout <- data.frame(outt, stringsAsFactors = FALSE)
163+
164+
return(outoutout)
135165
}
136166

137167
make_ids <- function(x) as.numeric(xml2::xml_text(xml2::xml_find_all(x, "//IdList//Id")))
168+
169+
tryfail <- function(default, code) {
170+
tryCatch(code, error = function(e) {
171+
message(e)
172+
return(default)
173+
})
174+
}

tests/testthat/test-ncbi.R

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
context("NCBI tests")
2+
3+
test_one_species <- "Acer rubrum"
4+
test_three_species <- c("Colletes similis", "Halictus ligatus", "Perdita californica")
5+
test_genes <- c("coi", "co1")
6+
test_that("ncbi_byname gets seq for single species", {
7+
skip_on_cran()
8+
result <- ncbi_byname("Acer rubrum")
9+
expect_equal(ncol(result), 7)
10+
expect_equal(nrow(result), 1)
11+
expect_true(!is.na(result$taxon))
12+
expect_true(!is.na(result$sequence))
13+
})
14+
15+
test_that("ncbi_byname gets seq for multiple species", {
16+
skip_on_cran()
17+
result <- ncbi_byname(test_three_species)
18+
expect_equal(ncol(result), 7)
19+
expect_gte(nrow(result), 3)
20+
expect_true(all(!is.na(result$taxon)))
21+
expect_true(all(!is.na(result$sequence)))
22+
})
23+
24+
test_that("ncbi_byname handles cases with no results"), {
25+
skip_on_cran()
26+
result <- ncbi_byname("This is not a species")
27+
expect_equal(ncol(result), 7)
28+
expect_equal(nrow(result), 1)
29+
expect_true(all(is.na(result[,2:7])))
30+
})
31+
32+
test_that("ncbi_byname gets seq for single gene", {
33+
skip_on_cran()
34+
result <- ncbi_byname(test_one_species, gene = "coi")
35+
expect_equal(ncol(result), 7)
36+
expect_gte(nrow(result), 1)
37+
expect_true(all(!is.na(result$taxon)))
38+
expect_true(all(!is.na(result$sequence)))
39+
})
40+
41+
## regression tests for https://github.com/ropensci/traits/issues/126
42+
43+
test_that("ncbi_byname works under subset of conditions from issue #126", {
44+
skip_on_cran()
45+
46+
res <- ncbi_byname(taxa = "Coryphaena hippurus", gene = c("Coi"), seqrange = "1:2000")
47+
expect_true(is.data.frame(res))
48+
res2 <- ncbi_byname(taxa = "Coryphaena hippurus", gene = c("Coi"), seqrange = "500:750")
49+
expect_true(is.data.frame(res2))
50+
51+
res3 <- ncbi_byname(taxa = "Sardinops melanostictus", gene = c("12s"), seqrange = "1:2000"))
52+
expect_true(is.data.frame(res3))
53+
54+
})

0 commit comments

Comments
 (0)