Skip to content

Commit 1729628

Browse files
authored
Merge pull request #194 from lanl/develop
Develop
2 parents a629e66 + ab1e875 commit 1729628

File tree

248 files changed

+19600
-2778
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

248 files changed

+19600
-2778
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
example_out
2+
example_results
3+
hidden_keys.py
14
example_output/
25
VOCAB_CONSOLIDATOR_SubstitutionOperator.p
36
VOCAB_CONSOLIDATOR_changes.csv

CITATION.cff

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
version: 0.0.41
1+
version: 0.0.42
22
message: "If you use this software, please cite it as below."
33
authors:
44
- family-names: Eren
@@ -20,7 +20,7 @@ authors:
2020
- family-names: Alexandrov
2121
given-names: Boian
2222
title: "Tensor Extraction of Latent Features (T-ELF)"
23-
version: 0.0.41
23+
version: 0.0.42
2424
url: https://github.com/lanl/T-ELF
2525
doi: 10.5281/zenodo.10257897
2626
date-released: 2023-12-04

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ python post_install.py # use the following, for example, for GPU system: <python
128128
| Cheetah | Fast search by keywords and phrases | [Link](examples/Cheetah) |
129129
| Bunny | Dataset generation tool for documents and their citations/references | [Link](examples/Bunny) |
130130
| Penguin | Text storage tool | [Link](examples/Penguin) |
131+
| Lynx | Streamlit UI | [Link](examples/Lynx) |
131132
| Termite | Knowledge graph building tool | :soon: |
132133

133134

@@ -136,7 +137,7 @@ python post_install.py # use the following, for example, for GPU system: <python
136137
| **Example** | **Description** | **Link** |
137138
|:----------:|:--------------------------------------------------------------------:|:-----------:|
138139
| NM Law Data | Domain specific data for AI and RAG system written in our [paper](https://arxiv.org/abs/2502.20364) about New Mexico Law that uses the TELF pipeline | [Link](examples/NM%20Law%20Data)|
139-
| Full TELF Pipeline | An end-to-end pipeline demonstration, from data collection to analysis | :soon: |
140+
| Full TELF Pipeline | An end-to-end pipeline demonstration, from collection to analysis | [Link](examples/Full%20TELF%20Pipeline) |
140141

141142

142143
## How to Cite T-ELF?

TELF/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@
44
sys.path += ["pre_processing"]
55
sys.path += ["post_processing"]
66
sys.path += ["applications"]
7-
sys.path += ["helpers"]
7+
sys.path += ["helpers"]
8+
sys.path += ["pipeline"]
Lines changed: 129 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -1,49 +1,76 @@
11
import os
22
import warnings
3-
import pandas as pd
4-
from .cheetah import Cheetah
3+
from typing import Dict, List, Tuple, Any, Optional, Set
4+
from ...helpers.terms import resolve_substitution_conflicts
5+
56

67
class CheetahTermFormatter:
    """
    Loads search terms from a Markdown file and returns them as
    plain strings or dict blocks, with optional category filtering.

    Optionally generates a substitution lookup map (with underscore variants),
    and can drop conflicts if requested.

    Parameters
    ----------
    markdown_file : str | Path
        Path to the .md file to load.
    lower : bool
        Whether to lowercase all term headers.
    category : str | None
        If set, include only `# Category: <category>` sections.
    include_general : bool
        If filtering by category, whether to include pre-category terms.
    substitutions : bool
        If True, builds substitution maps.
    all_categories : bool
        If True, overrides `category` and `include_general`.
    drop_conflicts : bool
        If True, resolve substitution conflicts and prune dropped entries.
        If False, keep all substitutions as-is (even if conflicting).
    """

    def __init__(
        self,
        markdown_file,
        lower: bool = False,
        category: Optional[str] = None,
        include_general: bool = True,
        substitutions: bool = False,
        all_categories: bool = False,
        drop_conflicts: bool = True,
    ):
        self.markdown_file = markdown_file
        self.lower = lower
        self.category = category
        self.include_general = include_general
        self.substitutions = substitutions
        self.all_categories = all_categories
        self.drop_conflicts = drop_conflicts

        # forward: "two words" -> "two_words"; reverse: the inverse lookup.
        # NOTE(review): after _postprocess_conflicts the reverse map values
        # become *lists* of source terms — confirm callers tolerate both shapes.
        self.substitution_forward: Dict[str, str] = {}
        self.substitution_reverse: Dict[str, str] = {}

        # parse markdown into the raw terms list (strings or dict blocks)
        self.terms: List[Any] = self._parse_markdown()

        # optionally build lookup tables
        if self.substitutions:
            self._build_substitutions_lookup()
            if self.drop_conflicts:
                self._postprocess_conflicts()

    # ──────────────────────────────────────────────────────────────── #
    #                         markdown parsing                         #
    # ──────────────────────────────────────────────────────────────── #
    def _parse_markdown(self) -> List[Any]:
        """
        Parse the markdown file into a list of terms.

        Each `## <term>` header yields either the plain header string or,
        when followed by `positives:` / `negatives:` lines, a dict of the
        form ``{term: {"positives": [...], "negatives": [...]}}``.
        `# Category: <name>` headers delimit sections used for filtering.
        Returns an empty list (with a warning) if the file does not exist.
        """
        terms: List[Any] = []
        current_term, positives, negatives = None, [], []
        active_block, current_section = False, None

        try:
            with open(self.markdown_file, "r", encoding="utf-8") as f:
                lines = f.readlines()
        except FileNotFoundError:
            warnings.warn(f"File '{self.markdown_file}' not found. Returning empty list.")
            return terms

        for raw in lines:
            line = raw.strip()

            # section header → switch current category
            if line.startswith("# Category:"):
                current_section = line.split(":", 1)[1].strip()
                continue

            # decide whether this section's terms are included
            include_section = self.all_categories or self.category is None
            if self.category and not self.all_categories:
                include_section = (current_section == self.category) or (
                    current_section is None and self.include_general
                )

            # term header: flush the previous block, start a new one
            if line.startswith("##"):
                if current_term is not None and active_block:
                    if positives or negatives:
                        terms.append({current_term: {"positives": positives, "negatives": negatives}})
                    else:
                        terms.append(current_term)

                positives, negatives = [], []
                header = line.lstrip("#").strip()
                if self.lower:
                    header = header.lower()
                current_term = header
                active_block = include_section

            elif active_block and line.lower().startswith("positives:"):
                items = [i.strip() for i in line.split(":", 1)[1].split(",") if i.strip()]
                positives.extend(items)

            elif active_block and line.lower().startswith("negatives:"):
                items = [i.strip() for i in line.split(":", 1)[1].split(",") if i.strip()]
                negatives.extend(items)

        # flush the final block
        if current_term is not None and active_block:
            if positives or negatives:
                terms.append({current_term: {"positives": positives, "negatives": negatives}})
            else:
                terms.append(current_term)

        return terms

    # ──────────────────────────────────────────────────────────────── #
    #                      substitutions lookup                        #
    # ──────────────────────────────────────────────────────────────── #
    def _build_substitutions_lookup(self) -> None:
        """Create forward & reverse maps (no filtering yet)."""
        for entry in self.terms:
            if isinstance(entry, str):
                term = entry
                underscored = term.replace(" ", "_")
                self.substitution_forward[term] = underscored
                self.substitution_reverse[underscored] = term
            else:  # dict block: {term: {"positives": ..., "negatives": ...}}
                for term in entry.keys():
                    underscored = term.replace(" ", "_")
                    self.substitution_forward[term] = underscored
                    self.substitution_reverse[underscored] = term

    def _postprocess_conflicts(self) -> None:
        """Resolve substitution conflicts and prune dropped terms."""
        clean_forward, dropped = resolve_substitution_conflicts(
            self.substitution_forward, warn=True
        )
        self.substitution_forward = clean_forward

        # rebuild reverse map (target → list of source terms)
        rev: Dict[str, List[str]] = {}
        for src, tgt in clean_forward.items():
            rev.setdefault(tgt, []).append(src)
        self.substitution_reverse = rev

        if not dropped:
            return

        # prune self.terms to match cleaned substitutions
        pruned_terms: List[Any] = []
        for entry in self.terms:
            if isinstance(entry, str):
                if entry not in dropped:
                    pruned_terms.append(entry)
            else:
                kept = {k: v for k, v in entry.items() if k not in dropped}
                if kept:
                    pruned_terms.append(kept)
        self.terms = pruned_terms

    # ──────────────────────────────────────────────────────────────── #
    #                          public access                           #
    # ──────────────────────────────────────────────────────────────── #
    # Fix: the committed file defined get_terms/get_substitution_maps twice
    # (once in the "public access" section and again in "public helpers");
    # the second definitions silently shadowed the first. One pair remains.
    def get_terms(self) -> List[Any]:
        """Return the parsed terms list."""
        return self.terms

    def get_substitution_maps(self) -> Tuple[Dict[str, str], Dict[str, str]]:
        """Return (forward_map, reverse_map); both empty if substitutions=False."""
        return self.substitution_forward, self.substitution_reverse
142187

188+
# ═══════════════════════════════════════════════════════════════════ #
189+
# utility: convert TXT dump → cheetah markdown #
190+
# ═══════════════════════════════════════════════════════════════════ #
143191
def convert_txt_to_cheetah_markdown(txt_path, markdown_path):
    """
    Convert a plain-text term list (optionally containing Python dict
    literals) into the markdown format expected by CheetahTermFormatter.

    Each plain line becomes a `## <term>` header. A dict-literal line maps
    a term to a list of qualifiers: entries prefixed with '+' are emitted as
    `positives:`, the rest as `negatives:`. Lines that look like dicts but
    fail to parse are skipped with a printed notice.
    """
    import ast

    with open(txt_path, "r", encoding="utf-8") as handle:
        entries = [stripped for stripped in (row.strip() for row in handle) if stripped]

    output: List[str] = []

    for entry in entries:
        if not (entry.startswith("{") and entry.endswith("}")):
            # plain term → simple header
            output.append(f"## {entry.strip()}")
            continue
        try:
            mapping = ast.literal_eval(entry)
            for term, qualifiers in mapping.items():
                plus = [q.lstrip("+") for q in qualifiers if q.startswith("+")]
                minus = [q for q in qualifiers if not q.startswith("+")]
                output.append(f"## {term}")
                if plus:
                    output.append(f"positives: {', '.join(plus)}")
                if minus:
                    output.append(f"negatives: {', '.join(minus)}")
        except Exception as e:
            print(f"Skipping line due to parse error: {entry}\nError: {e}")

    with open(markdown_path, "w", encoding="utf-8") as handle:
        handle.write("\n".join(output))

    print(f"Converted markdown saved to: {markdown_path}")

TELF/applications/Lynx/__init__.py

Whitespace-only changes.

TELF/applications/Lynx/backend/__init__.py

Whitespace-only changes.

TELF/applications/Lynx/frontend/__init__.py

Whitespace-only changes.
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
import os
import sys

import streamlit as st

# Fix: the original repeated `import sys;sys.path.append(...)` on three
# separate semicolon-joined lines; import once and append paths plainly.
# These appends make the local pages/backend modules importable when the
# app is launched via `streamlit run` from this directory.
sys.path.append(os.path.join("pages"))
sys.path.append(os.path.join("..", "backend"))
sys.path.append("..")

# Track whether a project has been loaded; persists across page switches.
if "project_loaded" not in st.session_state:
    st.session_state.project_loaded = False

load_project_page = st.Page(os.path.join("pages", "load_project.py"), title="Load Project", icon=":material/flag:", default=True)
tree_view_page = st.Page(os.path.join("pages", "tree_view.py"), title="Tree Search", icon=":material/allergy:", default=False)
document_analysis_view_page = st.Page(os.path.join("pages", "doc_view.py"), title="Document Analysis", icon=":material/lan:", default=False)
link_view_page = st.Page(os.path.join("pages", "link_view.py"), title="Link Prediction", icon=":material/linked_services:", default=False)

# Sidebar navigation: the loader page plus the three analysis views.
# (Original used f"Lynx" — an f-string with no placeholders.)
pg = st.navigation(
    {
        "Lynx": [load_project_page],
        "Views": [tree_view_page, document_analysis_view_page, link_view_page],
    }
)
pg.run()

TELF/applications/Lynx/frontend/pages/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)