Commit 9531315

Merge pull request #185 from lanl/develop
## New Modules

#### Adds Penguin, Bunny, Peacock, and SeaLion modules:

- **Penguin:** Text storage tool.
- **Bunny:** Dataset generation tool for documents and their citations/references.
- **Peacock:** Data visualization and generation of actionable statistics.
- **SeaLion:** Generic report generation tool.

## Bugs

- Fixes query index issue in Cheetah
2 parents: 02c0c7b + 03a9bb7

File tree

141 files changed: +19539 additions, -301 deletions


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+poetry.lock
 # mac
 results/
 .DS_Store

CITATION.cff

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-version: 0.0.37
+version: 0.0.38
 message: "If you use this software, please cite it as below."
 authors:
   - family-names: Eren
@@ -20,7 +20,7 @@ authors:
   - family-names: Alexandrov
     given-names: Boian
 title: "Tensor Extraction of Latent Features (T-ELF)"
-version: 0.0.37
+version: 0.0.38
 url: https://github.com/lanl/T-ELF
 doi: 10.5281/zenodo.10257897
 date-released: 2023-12-04

README.md

Lines changed: 4 additions & 3 deletions
@@ -115,16 +115,17 @@
 | **Method** | **Description** | **Example** | **Release Status** |
 |:----------:|:----------------------------------------------------------:|:-----------:|:------------------:|
 | Wolf | Graph centrality and ranking tool | [Link](examples/Wolf) | :white_check_mark: |
-| Peacock | Data visualization and generation of actionable statistics | | :soon: |
+| Peacock | Data visualization and generation of actionable statistics | [Link](examples/Peacock) | :white_check_mark: |
+| SeaLion | Generic report generation tool | [Link](examples/SeaLion) | :white_check_mark: |
 | Fox | Report generation tool for text data | | :soon: |
-| SeaLion | Generic report generation tool | | :soon: |

 ### TELF.applications

 | **Method** | **Description** | **Example** | **Release Status** |
 |:----------:|:--------------------------------------------------------------------:|:-----------:|:------------------:|
 | Cheetah | Fast search by keywords and phrases | [Link](examples/Cheetah) | :white_check_mark: |
-| Bunny | Dataset generation tool for documents and their citations/references | | :soon: |
+| Bunny | Dataset generation tool for documents and their citations/references | [Link](examples/Bunny) | :white_check_mark: |
+| Penguin | Text storage tool | [Link](examples/Penguin) | :white_check_mark: |
 | Termite | Knowledge graph building tool | | :soon: |

TELF/applications/Bunny/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+from .bunny import Bunny, BunnyFilter, BunnyOperation
+from .auto_bunny import AutoBunny, AutoBunnyStep
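
These exports make both the manual (Bunny) and automated (AutoBunny) pipelines importable from one place. A minimal sketch of the resulting import, assuming the package layout shown in this commit's file tree (TELF/applications/Bunny):

    from TELF.applications.Bunny import Bunny, AutoBunny, AutoBunnyStep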

TELF/applications/Bunny/auto_bunny.py

Lines changed: 346 additions & 0 deletions
@@ -0,0 +1,346 @@
import os
import re
import sys
import pathlib
import pandas as pd
from dataclasses import dataclass, field

from .bunny import Bunny
from ..Cheetah import Cheetah
from ...pre_processing.iPenguin.Scopus import Scopus
from ...pre_processing.iPenguin.SemanticScholar import SemanticScholar
from ...pre_processing.Vulture import Vulture

@dataclass
class AutoBunnyStep:
    """Class for keeping track of AutoBunny args"""
    modes: list
    max_papers: int = 0
    hop_priority: str = 'random'
    cheetah_settings: dict = field(default_factory=lambda: {'query': None})
    vulture_settings: list = field(default_factory=lambda: [])

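# A hypothetical configuration example (the mode string and query below are
# illustrative placeholders, not values defined in this commit):
#
#   step = AutoBunnyStep(
#       modes=['citations'],                 # which expansion edges to follow
#       max_papers=1000,                     # cap on papers added by this hop
#       cheetah_settings={'query': 'tensor decomposition'},
#   )
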
class AutoBunny:

    CHEETAH_INDEX = {
        'title': None,
        'abstract': 'clean_title_abstract',
        'year': 'year',
        'author_ids': 'author_ids',
        'affiliations': 'affiliations',
        'country': 'affiliations',
    }

    def __init__(self, core, s2_key=None, scopus_keys=None, output_dir=None,
                 cache_dir=None, cheetah_index=None, verbose=False):
        self.core = core
        self.s2_key = s2_key
        self.scopus_keys = scopus_keys
        self.output_dir = output_dir
        self.cache_dir = cache_dir
        self.cheetah_index = cheetah_index
        self.verbose = verbose

    def run(self, steps, *, s2_key=None, scopus_keys=None, cheetah_index=None,
            max_papers=250000, checkpoint=True):

        # validate input
        if not isinstance(steps, (list, tuple)):
            steps = [steps]
        for i, x in enumerate(steps):
            if not isinstance(x, AutoBunnyStep):
                raise ValueError(f'Step at index {i} in `steps` is not valid')

        if s2_key is not None:
            self.s2_key = s2_key
        if scopus_keys is not None:
            self.scopus_keys = scopus_keys
        if cheetah_index is not None:
            self.cheetah_index = cheetah_index

        # init search
        df = self.core
        cheetah_table = None

        # run for specified steps
        for i, s in enumerate(steps):
            modes = s.modes
            cheetah_settings = s.cheetah_settings
            vulture_settings = s.vulture_settings
            step_max_papers = s.max_papers
            hop_priority = s.hop_priority
            hop = int(df.type.max())  # current hop = highest 'type' value seen so far

            if checkpoint:
                df.to_csv(os.path.join(self.output_dir, f'hop-{hop}.csv'), index=False)
                cheetah_settings['do_results_table'] = True

            # on the first step, clean and filter the core papers before hopping
            if i == 0 and len(cheetah_settings) > 1:
                tmp_df = self.__vulture_clean(df, vulture_settings)
                tmp_df, cheetah_table = self.__cheetah_filter(tmp_df, cheetah_settings)
                if cheetah_table is not None:
                    cheetah_table.to_csv(os.path.join(self.output_dir, f'cheetah_table-{hop}.csv'), index=False)

            hop_estimate = Bunny.estimate_hop(df, modes[0])  # TODO: fix estimate_hop to use all modes
            if hop_estimate > max_papers:
                print(f'Early termination after {i} hops due to max papers in next hop', file=sys.stderr)
                return df

            # expand, clean, and filter the paper set for this hop
            df = self.__bunny_hop(df, modes, step_max_papers, hop_priority)
            df = self.__vulture_clean(df, vulture_settings)
            df, cheetah_table = self.__cheetah_filter(df, cheetah_settings)

        # format df
        df.drop(columns=['clean_title_abstract'], inplace=True)
        df = df.reset_index(drop=True)

        # save final results if checkpointing
        if checkpoint:
            hop = int(df.type.max())
            df.to_csv(os.path.join(self.output_dir, 'final_bunny_papers.csv'), index=False)
            if cheetah_table is not None:
                cheetah_table.to_csv(os.path.join(self.output_dir, f'cheetah_table-{hop}.csv'), index=False)
            final_table = self.__final_cheetah_table()
            final_table.to_csv(os.path.join(self.output_dir, 'final_cheetah_table.csv'), index=False)
        return df

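    # Note: with checkpoint=True, run() leaves a paper trail in output_dir:
    # hop-<n>.csv (papers at the start of each hop), cheetah_table-<n>.csv
    # (per-hop Cheetah filter summaries), final_bunny_papers.csv, and
    # final_cheetah_table.csv (the per-hop tables merged by the helper below).
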
    ### Helpers

    def __final_cheetah_table(self, stem='cheetah_table'):
        files = [x for x in os.listdir(self.output_dir) if x.endswith('.csv') and stem in x]
        frames = {}
        for f in files:
            match = re.search(rf"{stem}-(\d+)\.csv", f)
            if match:
                x = int(match.group(1))
                frames[x] = pd.read_csv(os.path.join(self.output_dir, f))

        # tag each hop's paper-count column with its hop number
        for hop, df in frames.items():
            df = df[df.columns[:-2]].copy()
            num_papers_col = df.columns[-1]
            df.rename(columns={num_papers_col: f'hop{hop}-{num_papers_col}'}, inplace=True)
            frames[hop] = df

        # merge the per-hop tables on their first two (shared) columns
        frames = list(frames.values())
        df = frames[0]
        for tmp_df in frames[1:]:
            df = df.merge(tmp_df, on=list(df.columns[:2]), how='outer')
        return df

    def __bunny_hop(self, df, modes, max_papers, hop_priority):
        bunny = Bunny(s2_key=self.s2_key, output_dir=self.cache_dir, verbose=self.verbose)
        use_scopus = self.scopus_keys is not None
        hop_df = bunny.hop(df, 1, modes, use_scopus=use_scopus, filters=None,
                           max_papers=max_papers, hop_priority=hop_priority,
                           scopus_keys=self.scopus_keys, s2_dir='s2', scopus_dir='scopus')
        return hop_df

    def __cheetah_filter(self, df, cheetah_settings):

        # index settings (configured mapping; defaults to CHEETAH_INDEX)
        cheetah_columns = self.cheetah_index

        # preserve the previously filtered papers
        max_type = df.type.max()
        df_prev = df.loc[df.type < max_type]
        df_curr = df.loc[df.type == max_type]

        # setup cheetah
        cheetah = Cheetah(verbose=self.verbose)
        index_file = os.path.join(self.output_dir, 'cheetah_index.p')
        cheetah.index(df_curr,
                      columns=cheetah_columns,
                      index_file=index_file,
                      reindex=True)

        # filter with cheetah
        cheetah_df, cheetah_table = cheetah.search(**cheetah_settings)

        # fix the cheetah_table (if being computed)
        # the cheetah table uses indices set by df. These indices will be reset by the rest of
        # this function. It is more robust to replace indices with s2ids.
        if cheetah_table is not None and not cheetah_table.empty:
            # e.g. included_ids '3;5' becomes the index list [3, 5]
            cheetah_table['included_ids'] = cheetah_table.included_ids.fillna('').str.split(';')\
                .apply(lambda x: [int(i) for i in x if i] if x else [])

            def include_s2ids(indices):
                if not indices:
                    return None
                return ';'.join(map(str, df_curr.loc[indices].s2id.to_list()))

            def exclude_s2ids(indices):
                all_s2ids = {x for x in df_curr.s2id.to_list() if not pd.isna(x)}
                if not indices:
                    return ';'.join(list(all_s2ids))
                curr_s2ids = set(df_curr.loc[indices].s2id.to_list())
                return ';'.join(list(all_s2ids - curr_s2ids)) or None

            cheetah_table['selected_s2ids'] = cheetah_table.included_ids.apply(include_s2ids)
            cheetah_table['excluded_s2ids'] = cheetah_table.included_ids.apply(exclude_s2ids)
            cheetah_table = cheetah_table.drop(columns='included_ids')

        # combine cheetah filter results with frozen results from previous hops
        cheetah_df = pd.concat([df_prev, cheetah_df], ignore_index=True)
        cheetah_df = cheetah_df.drop_duplicates(subset=['s2id'], keep='first')
        cheetah_df = cheetah_df.reset_index(drop=True)
        return cheetah_df, cheetah_table

    def __vulture_clean(self, df, vulture_settings):

        # setup vulture
        vulture = Vulture(n_jobs=-1, cache=self.output_dir, verbose=self.verbose)

        dataframe_clean_args = {
            "df": df,
            "columns": ['title', 'abstract'],
            "append_to_original_df": True,
            "concat_cleaned_cols": True,
        }
        if vulture_settings:
            dataframe_clean_args["steps"] = vulture_settings
        return vulture.clean_dataframe(**dataframe_clean_args)

    ### Getters / Setters

    @property
    def core(self):
        return self._core

    @property
    def s2_key(self):
        return self._s2_key

    @property
    def scopus_keys(self):
        return self._scopus_keys

    @property
    def cheetah_index(self):
        return self._cheetah_index

    @property
    def output_dir(self):
        return self._output_dir

    @property
    def cache_dir(self):
        return self._cache_dir

    @core.setter
    def core(self, core):
        if not isinstance(core, pd.DataFrame):
            raise ValueError('AutoBunny expects core to be a SLIC DataFrame!')
        if 'type' not in core:
            core['type'] = [0] * len(core)
        self._core = core

    @s2_key.setter
    def s2_key(self, key):
        if key is None:
            self._s2_key = None
        elif isinstance(key, str):
            try:
                ip = SemanticScholar(key=key)  # validate the key against the API
                self._s2_key = key
            except ValueError:
                raise ValueError(f'The key "{key}" was rejected by the Semantic Scholar API')
        else:
            raise TypeError(f'Unsupported type "{type(key)}" for Semantic Scholar key')

    @scopus_keys.setter
    def scopus_keys(self, scopus_keys):
        if scopus_keys is None:
            self._scopus_keys = scopus_keys
        elif isinstance(scopus_keys, (list, set)):
            for key in scopus_keys:
                try:
                    ip = Scopus(keys=[key])  # validate each key against the API
                except ValueError:
                    raise ValueError(f'The key "{key}" was rejected by the Scopus API')
            self._scopus_keys = list(scopus_keys)
        else:
            raise TypeError(f'Unsupported type "{type(scopus_keys)}" for Scopus keys')

    @cheetah_index.setter
    def cheetah_index(self, cheetah_index):
        if cheetah_index is None:
            self._cheetah_index = self.CHEETAH_INDEX
        elif isinstance(cheetah_index, dict):
            if not all(key in self.CHEETAH_INDEX for key in cheetah_index.keys()):
                raise ValueError(f'Invalid index key in `cheetah_index`. Valid keys are in '
                                 f'{list(self.CHEETAH_INDEX.keys())}')

            # fill in any missing keys from cheetah_index with default
            self._cheetah_index = {**self.CHEETAH_INDEX, **cheetah_index}
        else:
            raise TypeError(f'Unsupported type "{type(cheetah_index)}" for `cheetah_index`')

    def __check_path(self, path, var_name):
        """
        Checks and ensures the given path exists as a directory. If path does not exist, a new directory
        will be created. If the path exists but is a file, a ValueError will be raised. A TypeError is
        raised if the provided path is neither a string nor a `pathlib.Path` object.

        Parameters:
        -----------
        path: str, pathlib.Path
            The path to be checked and ensured as a directory.
        var_name: str
            The name of the checked variable, used in error messages.

        Raises:
        -------
        TypeError:
            If the provided path is neither a string nor a `pathlib.Path` object.
        ValueError:
            If the path points to an existing file.
        """
        if isinstance(path, str):
            path = pathlib.Path(path)
        if not isinstance(path, pathlib.Path):
            raise TypeError(f'Unsupported type "{type(path)}" for `path`')
        path = path.resolve()
        if path.exists():
            if path.is_file():
                raise ValueError(f'`{var_name}` points to a file instead of a directory')
        else:
            path.mkdir(parents=True, exist_ok=True)  # create all missing parent directories

    def __process_path(self, path, var_name):
        if path is None:
            return pathlib.Path('/tmp')
        elif isinstance(path, str):
            _path = pathlib.Path(path)
        elif isinstance(path, pathlib.Path):
            _path = path
        else:
            raise TypeError(f'Unsupported type "{type(path)}" for `{var_name}`')
        self.__check_path(_path, var_name)
        return _path

    @output_dir.setter
    def output_dir(self, output_dir):
        self._output_dir = self.__process_path(output_dir, 'output_dir')

    @cache_dir.setter
    def cache_dir(self, cache_dir):
        self._cache_dir = self.__process_path(cache_dir, 'cache_dir')
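
For orientation, a minimal end-to-end sketch of driving AutoBunny, assembled from the constructor and run() signatures above; the file name, API key, mode string, and query are hypothetical placeholders, not values from this commit:

    import pandas as pd
    from TELF.applications.Bunny import AutoBunny, AutoBunnyStep

    # hypothetical core paper set; AutoBunny adds a 'type' column (hop 0) if missing
    core = pd.read_csv('core_papers.csv')

    steps = [AutoBunnyStep(modes=['citations'],   # placeholder mode string
                           max_papers=1000,
                           cheetah_settings={'query': 'tensor decomposition'})]

    ab = AutoBunny(core, s2_key='YOUR_S2_KEY',    # Semantic Scholar API key
                   output_dir='results',          # hop-*.csv and tables land here
                   verbose=True)
    df = ab.run(steps, max_papers=250000, checkpoint=True)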
