Skip to content

Commit bb57c0c

Browse files
committed
performance fix
1 parent c430e03 commit bb57c0c

File tree

1 file changed

+44
-58
lines changed

1 file changed

+44
-58
lines changed

src/detaxa/taxonomy.py

Lines changed: 44 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -376,13 +376,21 @@ def taxid2parent(tid: Union[int, str], norank: bool=False) -> str:
376376

377377
return tid
378378

379+
def name2taxid_reset():
380+
"""
381+
Clear the cached name-to-taxid results and the loaded names table.
382+
"""
383+
global nameTid, df_names
384+
nameTid = {}
385+
df_names = None
386+
return
387+
379388
def name2taxid(name: str,
380389
rank: str=None,
381390
superkingdom: str=None,
382-
fuzzy: bool=True,
391+
fuzzy: bool=False,
383392
cutoff: float=0.7,
384-
max_matches: int=3,
385-
reset: bool=False,
393+
max_matches: int=3,
386394
expand: bool=True) -> list:
387395
"""
388396
Get the taxonomic ID of a given taxonomic name.
@@ -391,26 +399,22 @@ def name2taxid(name: str,
391399
name (str): Taxonomic scientific name.
392400
rank (str, optional): The expected rank of the taxonomic name.
393401
superkingdom (str, optional): The expected superkingdom of the taxonomic name.
394-
fuzzy (bool, optional): Whether to allow fuzzy search. Defaults to True.
395-
cutoff (float, optional): Similarity cutoff for difflib.get_close_matches().
396-
Only apply to `expand` mode. Cutoff will set to 1 if `fuzzy` set to False. Defaults to 0.7.
402+
fuzzy (bool, optional): Whether to allow fuzzy search. Defaults to False.
403+
cutoff (float, optional): Similarity cutoff for `difflib.get_close_matches`.
404+
Only applies when `fuzzy` is True. Defaults to 0.7.
397405
max_matches (int, optional): Maximum number of taxids to report. Defaults to 3.
398-
reset (bool, optional): Mapping results are cached. Whether to clean up previous searches.
399-
Defaults to False.
400406
expand (bool, optional): Search the entire 'names.dmp' if True, otherwise search scientific names only.
401407
Defaults to True.
402-
403408
Returns:
404409
list: The list of matched taxonomic ID.
405410
"""
406411
global nameTid, df_names, taxonomy_dir
407412
import pandas as pd
408413

409-
if not fuzzy: cutoff=1
410-
411414
# if expand is True, loading names.dmp
412415
names_dmp_file = taxonomy_dir+"/names.dmp"
413416

417+
# "expand" mode is ON
414418
if df_names is None and expand and os.path.isfile( names_dmp_file ):
415419
logging.debug(f"Loading {names_dmp_file}")
416420
df_names = pd.read_csv(names_dmp_file,
@@ -420,62 +424,44 @@ def name2taxid(name: str,
420424
usecols=['taxid','name'],
421425
index_col='name')
422426
logging.debug(f"names.dmp loaded")
427+
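# Otherwise ("expand" is OFF, names.dmp is missing, or df_names is already loaded): rebuild the table from the loaded scientific names (taxNames). NOTE(review): this also replaces an already-loaded names.dmp table on later calls -- confirm intended.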
428+
else:
429+
df_names = pd.DataFrame.from_dict(taxNames, orient='index', columns=['name'])
430+
df_names = df_names.reset_index().rename(columns={'index': 'taxid'})
431+
df_names = df_names.set_index('name')
423432

424-
if not name in nameTid or reset:
433+
if not name in nameTid:
425434
matched_taxid = []
426435
df_temp = None
427436
logging.debug(f"Searching {name}...")
428437

429-
if expand:
430-
if fuzzy==True:
431-
import difflib
432-
matches = difflib.get_close_matches(name, df_names.index, max_matches, cutoff)
433-
logger.debug(f'{name}: {matches}')
434-
df_temp = df_names.loc[matches,:]
438+
if fuzzy==True:
439+
import difflib
440+
matches = difflib.get_close_matches(name, df_names.index, max_matches, cutoff)
441+
logger.debug(f'{name}: {matches}')
442+
df_temp = df_names.loc[matches,:]
443+
else:
444+
if name in df_names.index:
445+
df_temp = df_names.loc[[name],:]
435446
else:
436-
if name in df_names.index:
437-
df_temp = df_names.loc[[name],:]
438-
else:
439-
df_temp = df_names.head(0)
447+
df_temp = df_names.head(0)
440448

441-
if len(df_temp)==0:
442-
nameTid[name] = []
443-
return nameTid[name]
444-
445-
if rank:
446-
df_temp['rank'] = df_temp.taxid.apply(taxid2rank)
447-
idx = df_temp['rank']==rank
448-
df_temp = df_temp[idx]
449-
450-
if superkingdom:
451-
df_temp['sk'] = df_temp.taxid.apply(lambda x: taxid2nameOnRank(x, 'superkingdom'))
452-
idx = df_temp['sk']==superkingdom
453-
df_temp = df_temp[idx]
454-
455-
nameTid[name] = df_temp.head(max_matches).taxid.to_list()
449+
if len(df_temp)==0:
450+
nameTid[name] = []
456451
return nameTid[name]
457-
else: # The 'expand' flag is False, we will search scientific names only
458-
for taxid in taxNames:
459-
if fuzzy==True:
460-
if not name in taxNames[taxid]:
461-
continue
462-
else:
463-
if name!=taxNames[taxid]:
464-
continue
465-
466-
if rank:
467-
if _getTaxRank(taxid)==rank:
468-
matched_taxid.append(taxid)
469-
else:
470-
matched_taxid.append(taxid)
471452

472-
# return when the first match found
473-
if len(matched_taxid)==max_matches:
474-
nameTid[name] = matched_taxid
475-
return nameTid[name]
476-
477-
nameTid[name] = matched_taxid
478-
return nameTid[name][:max_matches]
453+
if rank:
454+
df_temp['rank'] = df_temp.taxid.apply(taxid2rank)
455+
idx = df_temp['rank']==rank
456+
df_temp = df_temp[idx]
457+
458+
if superkingdom:
459+
df_temp['sk'] = df_temp.taxid.apply(lambda x: taxid2nameOnRank(x, 'superkingdom'))
460+
idx = df_temp['sk']==superkingdom
461+
df_temp = df_temp[idx]
462+
463+
nameTid[name] = df_temp.head(max_matches).taxid.to_list()
464+
return nameTid[name]
479465
else:
480466
if len(nameTid[name]):
481467
return nameTid[name][:max_matches]

0 commit comments

Comments
 (0)