Skip to content

Commit ce17cfa

Browse files
authored
Merge pull request #188 from lanl/develop
Develop
2 parents bf34fd5 + 77dc571 commit ce17cfa

File tree

257 files changed

+15797
-15363
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

257 files changed

+15797
-15363
lines changed

.gitignore

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,20 @@
1-
poetry.lock
1+
VOCAB_CONSOLIDATOR_SubstitutionOperator.p
2+
VOCAB_CONSOLIDATOR_changes.csv
3+
clean_documents
4+
corrected_substitutions_df.csv
5+
SeaLion_post_processing/
6+
example_figures/
7+
graph.p
8+
example_Semantic_HNMFk/
9+
sample_index.p
10+
EXAMPLE_OUT
11+
result_example/
12+
search_terms.md
13+
scopus_cache/
14+
215
# mac
316
results/
17+
poetry.lock
418
.DS_Store
519
# contains the data for the project
620
# data/

CITATION.cff

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
version: 0.0.39
1+
version: 0.0.40
22
message: "If you use this software, please cite it as below."
33
authors:
44
- family-names: Eren
@@ -20,7 +20,7 @@ authors:
2020
- family-names: Alexandrov
2121
given-names: Boian
2222
title: "Tensor Extraction of Latent Features (T-ELF)"
23-
version: 0.0.39
23+
version: 0.0.40
2424
url: https://github.com/lanl/T-ELF
2525
doi: 10.5281/zenodo.10257897
2626
date-released: 2023-12-04

README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,14 @@ python post_install.py # use the following, for example, for GPU system: <python
130130
| Termite | Knowledge graph building tool | :soon: |
131131

132132

133+
## Use Cases
134+
135+
| **Example** | **Description** | **Link** |
136+
|:----------:|:--------------------------------------------------------------------:|:-----------:|
137+
| NM Law Data | Domain-specific data for the AI and RAG system described in our [paper](https://arxiv.org/abs/2502.20364) about New Mexico Law that uses the TELF pipeline | [Link](examples/NM%20Law%20Data)|
138+
| Full TELF Pipeline | An end-to-end pipeline demonstration, from data collection to analysis | :soon: |
139+
140+
133141
## How to Cite T-ELF?
134142
If you use T-ELF please cite.
135143

TELF/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,5 @@
33
sys.path += ["factorization"]
44
sys.path += ["pre_processing"]
55
sys.path += ["post_processing"]
6-
sys.path += ["applications"]
6+
sys.path += ["applications"]
7+
sys.path += ["helpers"]

TELF/applications/Bunny/auto_bunny.py

Lines changed: 3 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from ...pre_processing.iPenguin.Scopus import Scopus
1111
from ...pre_processing.iPenguin.SemanticScholar import SemanticScholar
1212
from ...pre_processing.Vulture import Vulture
13+
from ...helpers.file_system import check_path_var as check_path
1314

1415
@dataclass
1516
class AutoBunnyStep:
@@ -288,7 +289,7 @@ def scopus_keys(self, scopus_keys):
288289
try:
289290
ip = Scopus(keys=[key])
290291
except ValueError:
291-
raise ValueError(f'The key "{k}" was rejected by the Scopus API')
292+
raise ValueError(f'The key "{key}" was rejected by the Scopus API')
292293
self._scopus_keys = list(scopus_keys)
293294
else:
294295
raise TypeError(f'Unsupported type "{type(key)}" for Scopus key')
@@ -306,41 +307,6 @@ def cheetah_index(self, cheetah_index):
306307
self._cheetah_index = {**self.CHEETAH_INDEX, **cheetah_index}
307308
else:
308309
raise TypeError(f'Unsupported type "{type(cheetah_index)}" for `cheetah_index`')
309-
310-
def __check_path(self, path, var_name):
311-
if path.exists() and path.is_file(): # handle the path already existing as file
312-
raise ValueError(f'The path `{var_name}` points to a file instead of a directory')
313-
if not path.exists():
314-
path.mkdir(parents=True) # parents=True ensures all missing parent directories are also created
315-
316-
def __check_path(self, path, var_name):
317-
"""
318-
Checks and ensures the given path exists as a directory. If path does not exist, a new directory
319-
will be created. If the path exists but is a file, a ValueError will be raised. A TypeError is
320-
raised if the provided path is neither a string nor a `pathlib.Path` object.
321-
322-
Parameters:
323-
-----------
324-
path: str, pathlib.Path
325-
The path to be checked and ensured as a directory.
326-
327-
Raises:
328-
-------
329-
TypeError:
330-
If the provided path is neither a string nor a `pathlib.Path` object.
331-
ValueError:
332-
If the path points to an existing file.
333-
"""
334-
if isinstance(path, str):
335-
path = pathlib.Path(path)
336-
if not isinstance(path, pathlib.Path):
337-
raise TypeError(f'Unsupported type "{type(path)}" for `path`')
338-
path = path.resolve()
339-
if path.exists():
340-
if path.is_file():
341-
raise ValueError(f'`{var_name}` points to a file instead of a directory')
342-
else:
343-
path.mkdir(parents=True, exist_ok=True)
344310

345311
def __process_path(self, path, var_name):
346312
if path is None:
@@ -351,7 +317,7 @@ def __process_path(self, path, var_name):
351317
_path = path
352318
else:
353319
raise TypeError(f'Unsupported type "{type(path)}" for `{var_name}`')
354-
self.__check_path(_path, var_name)
320+
check_path(_path, var_name)
355321
return _path
356322

357323
@output_dir.setter

TELF/applications/Bunny/bunny.py

Lines changed: 3 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
from dataclasses import dataclass
1111
pd.set_option('future.no_silent_downcasting', True)
1212

13-
from ...applications.Penguin import Penguin, form_df
13+
from ..Penguin import Penguin
14+
from ..Penguin.crocodile import form_df
1415
from ...pre_processing.iPenguin.Scopus import Scopus
1516
from ...pre_processing.iPenguin.SemanticScholar import SemanticScholar
1617
from ...pre_processing.iPenguin.utils import format_pubyear
@@ -135,14 +136,6 @@ def find_doi(f):
135136
return match.group(1) if match else None
136137

137138

138-
def gen_chunks(l, n):
139-
"""Yield n number of sequential chunks from l."""
140-
d, r = divmod(len(l), n)
141-
for i in range(n):
142-
si = (d+1)*(i if i < r else r) + d*(0 if i < r else i - r)
143-
yield l[si:si+(d+1 if i < r else d)]
144-
145-
146139
class Bunny():
147140

148141
MODES = {'references', 'citations', 's2_author_ids'}
@@ -335,7 +328,7 @@ def form_core_scopus(self, data, data_type, keys, s2_dir='s2', scopus_dir='scopu
335328

336329
scopus_df.doi = scopus_df.doi.str.lower()
337330
scopus_df = scopus_df.loc[scopus_df.doi.isin(s2_dois)].copy()
338-
return s2_join_scopus(s2_df, scopus_df)
331+
return form_df(s2_df, scopus_df)
339332

340333

341334
def form_core(self, data, data_type, s2_dir='s2'):
@@ -752,20 +745,6 @@ def __evaluate_query(self, query, df, auth_map):
752745
return set.intersection(*results)
753746
elif query.operator == 'OR':
754747
return set.union(*results)
755-
756-
757-
def __form_query_str(self, query):
758-
if isinstance(query, BunnyFilter):
759-
ffunc = self.filter_funcs[query.filter_type]
760-
result = ffunc(df, query.filter_value, auth_map)
761-
return result
762-
elif isinstance(query, BunnyOperation):
763-
results = [self.__evaluate_query(operand, df, auth_map) for operand in query.operands]
764-
if query.operator == 'AND':
765-
return set.intersection(*results)
766-
elif query.operator == 'OR':
767-
return set.union(*results)
768-
769748

770749
def apply_filter(self, df, filters, filter_in_core=True, do_author_match=True):
771750
if 'eid' not in df and do_author_match:

TELF/applications/Cheetah/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
11
from .cheetah import Cheetah
2+
from .term_formatter import CheetahTermFormatter, convert_txt_to_cheetah_markdown
3+
from .term_generator import SearchTermGenerator
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
import os
2+
import warnings
3+
import pandas as pd
4+
from .cheetah import Cheetah
5+
6+
class CheetahTermFormatter:
    """
    Reads Cheetah search terms out of a Markdown file and exposes them as a
    list of plain strings or ``{term: {"positives": [...], "negatives": [...]}}``
    dicts, with optional filtering by ``# Category:`` section. When
    ``substitutions=True`` the formatter also builds a two-way lookup between
    each term and its underscore-joined form.

    Parameters
    ----------
    markdown_file : str
        Path to the Markdown term file.
    lower : bool
        Lowercase every term header when True.
    category : str, optional
        Keep only terms under ``# Category: <category>`` sections.
    include_general : bool
        When filtering by category, also keep terms that appear before any
        section header.
    substitutions : bool
        Build the forward/reverse substitution maps after parsing.
    all_categories : bool
        If True, ignore `category` and `include_general` and include every
        section.
    """

    def __init__(self, markdown_file, lower=False, category=None,
                 include_general=True, substitutions=False, all_categories=False):
        self.markdown_file = markdown_file
        self.lower = lower
        self.category = category
        self.include_general = include_general
        self.substitutions = substitutions
        self.all_categories = all_categories

        # term -> underscored form, and the inverse mapping
        self.substitution_forward = {}
        self.substitution_reverse = {}

        # parse the markdown into self.terms
        self.terms = self._parse_markdown()

        # optionally build the substitution lookup tables
        if self.substitutions:
            self._build_substitutions_lookup()

    def _section_included(self, section):
        """Decide whether terms under `section` (None = before any header) are kept."""
        if self.all_categories or self.category is None:
            return True
        if section is None and self.include_general:
            return True
        return section == self.category

    @staticmethod
    def _make_entry(term, positives, negatives):
        """Package one finished term block as a bare string or a constraint dict."""
        if positives or negatives:
            return {term: {"positives": positives, "negatives": negatives}}
        return term

    @staticmethod
    def _split_csv(line):
        """Return the stripped, non-empty comma-separated values after the first ':'."""
        return [piece.strip() for piece in line.split(":", 1)[1].split(",") if piece.strip()]

    def _parse_markdown(self):
        """Parse the markdown file into the list of term entries (empty on missing file)."""
        try:
            with open(self.markdown_file, 'r', encoding='utf-8') as handle:
                raw_lines = handle.readlines()
        except FileNotFoundError:
            warnings.warn(f"File '{self.markdown_file}' not found. Returning empty list.")
            return []

        parsed = []
        term = None          # header of the block currently being read
        must_have = []       # "Must have:" values collected for that block
        exclude_with = []    # "Exclude with:" values collected for that block
        keep_block = False   # whether the current block's section is included
        section = None       # current "# Category:" name; None before the first one

        for raw in raw_lines:
            text = raw.strip()

            # Section header: switch sections, nothing else to do for this line
            if text.startswith("# Category:"):
                section = text.split(":", 1)[1].strip()
                continue

            if text.startswith("##"):
                # Term header: flush the previous block before starting a new one
                if term is not None and keep_block:
                    parsed.append(self._make_entry(term, must_have, exclude_with))
                must_have = []
                exclude_with = []
                term = text.lstrip("#").strip()
                if self.lower:
                    term = term.lower()
                keep_block = self._section_included(section)
            elif keep_block and text.lower().startswith("must have:"):
                must_have.extend(self._split_csv(text))
            elif keep_block and text.lower().startswith("exclude with:"):
                exclude_with.extend(self._split_csv(text))

        # Flush the trailing block, if any
        if term is not None and keep_block:
            parsed.append(self._make_entry(term, must_have, exclude_with))

        return parsed

    def _build_substitutions_lookup(self):
        """Record term <-> underscored-term mappings for every parsed entry."""
        for entry in self.terms:
            if isinstance(entry, str):
                names = [entry]
            elif isinstance(entry, dict):
                names = list(entry.keys())
            else:
                names = []
            for name in names:
                joined = name.replace(" ", "_")
                self.substitution_forward[name] = joined
                self.substitution_reverse[joined] = name

    def get_terms(self):
        """Return the parsed term entries."""
        return self.terms

    def get_substitution_maps(self):
        """
        Return the (forward, reverse) substitution dicts.

        Both are empty unless the formatter was built with substitutions=True.
        """
        return self.substitution_forward, self.substitution_reverse
141+
142+
143+
def convert_txt_to_cheetah_markdown(txt_path, markdown_path):
    """
    Convert a plain-text term list into the Markdown format that
    CheetahTermFormatter parses.

    Each non-empty line of the input is either:

    * a dict literal such as ``{"term": ["+pos", "neg"]}`` — values prefixed
      with ``'+'`` become "Must have:" entries and all other values become
      "Exclude with:" entries; or
    * a bare term, emitted as a plain ``## term`` header.

    Lines that look like dicts but fail to parse are skipped with a message.

    Parameters
    ----------
    txt_path : str
        Path of the input text file.
    markdown_path : str
        Path where the generated Markdown is written.
    """
    import ast

    with open(txt_path, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f if line.strip()]

    markdown_lines = []

    for line in lines:
        if line.startswith("{") and line.endswith("}"):
            try:
                parsed = ast.literal_eval(line)
                for key, value in parsed.items():
                    positives = [v.lstrip('+') for v in value if v.startswith('+')]
                    negatives = [v for v in value if not v.startswith('+')]
                    markdown_lines.append(f"## {key}")
                    # Emit the headers CheetahTermFormatter._parse_markdown
                    # actually recognizes ("Must have:" / "Exclude with:");
                    # the previous "positives:"/"negatives:" labels were
                    # silently ignored by the parser.
                    if positives:
                        markdown_lines.append(f"Must have: {', '.join(positives)}")
                    if negatives:
                        markdown_lines.append(f"Exclude with: {', '.join(negatives)}")
            except Exception as e:
                print(f"Skipping line due to parse error: {line}\nError: {e}")
        else:
            markdown_lines.append(f"## {line.strip()}")

    with open(markdown_path, 'w', encoding='utf-8') as f:
        f.write("\n".join(markdown_lines))

    print(f"Converted markdown saved to: {markdown_path}")

0 commit comments

Comments
 (0)