Skip to content

Commit 1b3a919

Browse files
committed
fix bugs for name2taxid
1 parent 80997c5 commit 1b3a919

File tree

2 files changed

+40
-18
lines changed

2 files changed

+40
-18
lines changed

src/detaxa/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__='0.5.14'
1+
__version__='0.5.15'

src/detaxa/taxonomy.py

Lines changed: 39 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,9 @@ def _checkTaxonomy(tid: Union[int, str]):
7777
"""Check if a taxonomy ID is present in the taxonomy database"""
7878

7979
if not len(taxParents):
80-
logger.fatal("Taxonomy not loaded. \"loadTaxonomy()\" must be called first.")
81-
_die("Taxonomy not loaded. \"loadTaxonomy()\" must be called first.")
80+
logger.info("Taxonomy not loaded. Call \"loadTaxonomy()\" first to avoid this message...")
81+
logger.info("Loading taxonomy...")
82+
loadTaxonomy()
8283

8384
if tid:
8485
# tid must be in string type
@@ -103,7 +104,6 @@ def _taxid2fullLink(tid: Union[int, str]) -> dict:
103104
104105
Returns:
105106
dict: A dictionary containing the full lineage of the target taxon.
106-
107107
"""
108108
tid = _checkTaxonomy(tid)
109109
if tid == "unknown": return {}
@@ -371,7 +371,14 @@ def taxid2parent(tid: Union[int, str], norank: bool=False) -> str:
371371

372372
return tid
373373

374-
def name2taxid(name, rank=None, superkingdom=None, fuzzy=True, cutoff=0.7, max_matches=3, reset=False, expand=True) -> list:
374+
def name2taxid(name: str,
375+
rank: str=None,
376+
superkingdom: str=None,
377+
fuzzy: bool=True,
378+
cutoff: float=0.7,
379+
max_matches: int=3,
380+
reset: bool=False,
381+
expand: bool=True) -> list:
375382
"""
376383
Get the taxonomic ID of a given taxonomic name.
377384
@@ -400,22 +407,35 @@ def name2taxid(name, rank=None, superkingdom=None, fuzzy=True, cutoff=0.7, max_m
400407
names_dmp_file = taxonomy_dir+"/names.dmp"
401408

402409
if df_names is None and expand and os.path.isfile( names_dmp_file ):
410+
logging.debug(f"Loading {names_dmp_file}")
403411
df_names = pd.read_csv(names_dmp_file,
404-
sep='\t|\t',
405-
engine='python',
406-
header=None,
407-
names=['taxid', 'name', 'annot', 'type'],
408-
usecols=['taxid','name'],
409-
index_col='name')
412+
sep='\t',
413+
header=None,
414+
names=['taxid', 'sep1', 'name', 'sep2', 'annot', 'sep3', 'type', 'sep4'],
415+
usecols=['taxid','name'],
416+
index_col='name')
417+
logging.debug(f"names.dmp loaded")
410418

411419
if not name in nameTid or reset:
412420
matched_taxid = []
421+
df_temp = None
422+
logging.debug(f"Searching {name}...")
413423

414424
if expand:
415-
import difflib
416-
matches = difflib.get_close_matches(name, df_names.index, max_matches, cutoff)
417-
logger.debug(f'{name}: {matches}')
418-
df_temp = df_names.loc[matches,:]
425+
if fuzzy==True:
426+
import difflib
427+
matches = difflib.get_close_matches(name, df_names.index, max_matches, cutoff)
428+
logger.debug(f'{name}: {matches}')
429+
df_temp = df_names.loc[matches,:]
430+
else:
431+
if name in df_names.index:
432+
df_temp = df_names.loc[[name],:]
433+
else:
434+
df_temp = df_names.head(0)
435+
436+
if len(df_temp)==0:
437+
nameTid[name] = []
438+
return nameTid[name]
419439

420440
if rank:
421441
df_temp['rank'] = df_temp.taxid.apply(taxid2rank)
@@ -429,8 +449,7 @@ def name2taxid(name, rank=None, superkingdom=None, fuzzy=True, cutoff=0.7, max_m
429449

430450
nameTid[name] = df_temp.head(max_matches).taxid.to_list()
431451
return nameTid[name]
432-
433-
else: # searching scientific names only
452+
else: # The 'expand' flag is False, we will search scientific names only
434453
for taxid in taxNames:
435454
if fuzzy==True:
436455
if not name in taxNames[taxid]:
@@ -453,7 +472,10 @@ def name2taxid(name, rank=None, superkingdom=None, fuzzy=True, cutoff=0.7, max_m
453472
nameTid[name] = matched_taxid
454473
return nameTid[name][:max_matches]
455474
else:
456-
return nameTid[name][:max_matches]
475+
if len(nameTid[name]):
476+
return nameTid[name][:max_matches]
477+
else:
478+
return []
457479

458480
def taxid2nameOnRank(tid: Union[int, str], target_rank=None) -> str:
459481
"""

0 commit comments

Comments
 (0)