@@ -376,13 +376,21 @@ def taxid2parent(tid: Union[int, str], norank: bool=False) -> str:
376
376
377
377
return tid
378
378
379
def name2taxid_reset():
    """
    Discard the state that name2taxid() keeps between calls.

    Rebinds the module-level `nameTid` (the name -> taxid-list cache) to a
    fresh empty dict and the module-level `df_names` (the table that names
    are searched against) to None.

    Returns:
        None
    """
    global df_names, nameTid
    # Drop the searching domain as well as the cached answers.
    df_names = None
    nameTid = dict()
387
+
379
388
def name2taxid (name : str ,
380
389
rank : str = None ,
381
390
superkingdom : str = None ,
382
- fuzzy : bool = True ,
391
+ fuzzy : bool = False ,
383
392
cutoff : float = 0.7 ,
384
- max_matches : int = 3 ,
385
- reset : bool = False ,
393
+ max_matches : int = 3 ,
386
394
expand : bool = True ) -> list :
387
395
"""
388
396
Get the taxonomic ID of a given taxonomic name.
@@ -391,26 +399,22 @@ def name2taxid(name: str,
391
399
name (str): Taxonomic scientific name.
392
400
rank (str, optional): The expected rank of the taxonomic name.
393
401
superkingdom (str, optional): The expected superkingdom of the taxonomic name.
394
- fuzzy (bool, optional): Whether to allow fuzzy search. Defaults to True .
395
- cutoff (float, optional): Similarity cutoff for difflib.get_close_matches() .
396
- Only apply to `expand` mode. Cutoff will set to 1 if `fuzzy` set to False. Defaults to 0.7.
402
+ fuzzy (bool, optional): Whether to allow fuzzy search. Defaults to False .
403
+ cutoff (float, optional): Similarity cutoff for ` difflib.get_close_matches` .
404
+ Only apply to `expand` mode. Defaults to 0.7.
397
405
max_matches (int, optional): Reporting max number of taxid. Defaults to 3.
398
- reset (bool, optional): Mapping results are cached. Whether to clean up previous searches.
399
- Defaults to False.
400
406
expand (bool, optional): Search the entire 'names.dmp' if True, otherwise search scientific names only.
401
407
Defaults to True.
402
-
403
408
Returns:
404
409
list: The list of matched taxonomic ID.
405
410
"""
406
411
global nameTid , df_names , taxonomy_dir
407
412
import pandas as pd
408
413
409
- if not fuzzy : cutoff = 1
410
-
411
414
# if expand is True, loading names.dmp
412
415
names_dmp_file = taxonomy_dir + "/names.dmp"
413
416
417
+ # "expand" mode is ON
414
418
if df_names is None and expand and os .path .isfile ( names_dmp_file ):
415
419
logging .debug (f"Loading { names_dmp_file } " )
416
420
df_names = pd .read_csv (names_dmp_file ,
@@ -420,62 +424,44 @@ def name2taxid(name: str,
420
424
usecols = ['taxid' ,'name' ],
421
425
index_col = 'name' )
422
426
logging .debug (f"names.dmp loaded" )
427
+ # "expand" mode is OFF, search loaded names only
428
+ else :
429
+ df_names = pd .DataFrame .from_dict (taxNames , orient = 'index' , columns = ['name' ])
430
+ df_names = df_names .reset_index ().rename (columns = {'index' : 'taxid' })
431
+ df_names = df_names .set_index ('name' )
423
432
424
- if not name in nameTid or reset :
433
+ if not name in nameTid :
425
434
matched_taxid = []
426
435
df_temp = None
427
436
logging .debug (f"Searching { name } ..." )
428
437
429
- if expand :
430
- if fuzzy == True :
431
- import difflib
432
- matches = difflib .get_close_matches (name , df_names .index , max_matches , cutoff )
433
- logger .debug (f'{ name } : { matches } ' )
434
- df_temp = df_names .loc [matches ,:]
438
+ if fuzzy == True :
439
+ import difflib
440
+ matches = difflib .get_close_matches (name , df_names .index , max_matches , cutoff )
441
+ logger .debug (f'{ name } : { matches } ' )
442
+ df_temp = df_names .loc [matches ,:]
443
+ else :
444
+ if name in df_names .index :
445
+ df_temp = df_names .loc [[name ],:]
435
446
else :
436
- if name in df_names .index :
437
- df_temp = df_names .loc [[name ],:]
438
- else :
439
- df_temp = df_names .head (0 )
447
+ df_temp = df_names .head (0 )
440
448
441
- if len (df_temp )== 0 :
442
- nameTid [name ] = []
443
- return nameTid [name ]
444
-
445
- if rank :
446
- df_temp ['rank' ] = df_temp .taxid .apply (taxid2rank )
447
- idx = df_temp ['rank' ]== rank
448
- df_temp = df_temp [idx ]
449
-
450
- if superkingdom :
451
- df_temp ['sk' ] = df_temp .taxid .apply (lambda x : taxid2nameOnRank (x , 'superkingdom' ))
452
- idx = df_temp ['sk' ]== superkingdom
453
- df_temp = df_temp [idx ]
454
-
455
- nameTid [name ] = df_temp .head (max_matches ).taxid .to_list ()
449
+ if len (df_temp )== 0 :
450
+ nameTid [name ] = []
456
451
return nameTid [name ]
457
- else : # The 'expand' flag is False, we will search scientific names only
458
- for taxid in taxNames :
459
- if fuzzy == True :
460
- if not name in taxNames [taxid ]:
461
- continue
462
- else :
463
- if name != taxNames [taxid ]:
464
- continue
465
-
466
- if rank :
467
- if _getTaxRank (taxid )== rank :
468
- matched_taxid .append (taxid )
469
- else :
470
- matched_taxid .append (taxid )
471
452
472
- # return when the first match found
473
- if len (matched_taxid )== max_matches :
474
- nameTid [name ] = matched_taxid
475
- return nameTid [name ]
476
-
477
- nameTid [name ] = matched_taxid
478
- return nameTid [name ][:max_matches ]
453
+ if rank :
454
+ df_temp ['rank' ] = df_temp .taxid .apply (taxid2rank )
455
+ idx = df_temp ['rank' ]== rank
456
+ df_temp = df_temp [idx ]
457
+
458
+ if superkingdom :
459
+ df_temp ['sk' ] = df_temp .taxid .apply (lambda x : taxid2nameOnRank (x , 'superkingdom' ))
460
+ idx = df_temp ['sk' ]== superkingdom
461
+ df_temp = df_temp [idx ]
462
+
463
+ nameTid [name ] = df_temp .head (max_matches ).taxid .to_list ()
464
+ return nameTid [name ]
479
465
else :
480
466
if len (nameTid [name ]):
481
467
return nameTid [name ][:max_matches ]
0 commit comments