Skip to content

Commit ce17cfa

Browse files
authored
Merge pull request #188 from lanl/develop
Develop
2 parents bf34fd5 + 77dc571 commit ce17cfa

File tree

257 files changed

+15797
-15363
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

257 files changed

+15797
-15363
lines changed

.gitignore

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,20 @@
1-
poetry.lock
1+
VOCAB_CONSOLIDATOR_SubstitutionOperator.p
2+
VOCAB_CONSOLIDATOR_changes.csv
3+
clean_documents
4+
corrected_substitutions_df.csv
5+
SeaLion_post_processing/
6+
example_figures/
7+
graph.p
8+
example_Semantic_HNMFk/
9+
sample_index.p
10+
EXAMPLE_OUT
11+
result_example/
12+
search_terms.md
13+
scopus_cache/
14+
215
# mac
316
results/
17+
poetry.lock
418
.DS_Store
519
# contains the data for the project
620
# data/

CITATION.cff

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
version: 0.0.39
1+
version: 0.0.40
22
message: "If you use this software, please cite it as below."
33
authors:
44
- family-names: Eren
@@ -20,7 +20,7 @@ authors:
2020
- family-names: Alexandrov
2121
given-names: Boian
2222
title: "Tensor Extraction of Latent Features (T-ELF)"
23-
version: 0.0.39
23+
version: 0.0.40
2424
url: https://github.com/lanl/T-ELF
2525
doi: 10.5281/zenodo.10257897
2626
date-released: 2023-12-04

README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,14 @@ python post_install.py # use the following, for example, for GPU system: <python
130130
| Termite | Knowledge graph building tool | :soon: |
131131

132132

133+
## Use Cases
134+
135+
| **Example** | **Description** | **Link** |
136+
|:----------:|:--------------------------------------------------------------------:|:-----------:|
137+
| NM Law Data | Domain-specific data for the AI and RAG system described in our [paper](https://arxiv.org/abs/2502.20364) about New Mexico Law that uses the TELF pipeline | [Link](examples/NM%20Law%20Data)|
138+
| Full TELF Pipeline | An end-to-end pipeline demonstration, from data collection to analysis | :soon: |
139+
140+
133141
## How to Cite T-ELF?
134142
If you use T-ELF please cite.
135143

TELF/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,5 @@
33
sys.path += ["factorization"]
44
sys.path += ["pre_processing"]
55
sys.path += ["post_processing"]
6-
sys.path += ["applications"]
6+
sys.path += ["applications"]
7+
sys.path += ["helpers"]

TELF/applications/Bunny/auto_bunny.py

Lines changed: 3 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from ...pre_processing.iPenguin.Scopus import Scopus
1111
from ...pre_processing.iPenguin.SemanticScholar import SemanticScholar
1212
from ...pre_processing.Vulture import Vulture
13+
from ...helpers.file_system import check_path_var as check_path
1314

1415
@dataclass
1516
class AutoBunnyStep:
@@ -288,7 +289,7 @@ def scopus_keys(self, scopus_keys):
288289
try:
289290
ip = Scopus(keys=[key])
290291
except ValueError:
291-
raise ValueError(f'The key "{k}" was rejected by the Scopus API')
292+
raise ValueError(f'The key "{key}" was rejected by the Scopus API')
292293
self._scopus_keys = list(scopus_keys)
293294
else:
294295
raise TypeError(f'Unsupported type "{type(key)}" for Scopus key')
@@ -306,41 +307,6 @@ def cheetah_index(self, cheetah_index):
306307
self._cheetah_index = {**self.CHEETAH_INDEX, **cheetah_index}
307308
else:
308309
raise TypeError(f'Unsupported type "{type(cheetah_index)}" for `cheetah_index`')
309-
310-
def __check_path(self, path, var_name):
311-
if path.exists() and path.is_file(): # handle the path already existing as file
312-
raise ValueError(f'The path `{var_name}` points to a file instead of a directory')
313-
if not path.exists():
314-
path.mkdir(parents=True) # parents=True ensures all missing parent directories are also created
315-
316-
def __check_path(self, path, var_name):
317-
"""
318-
Checks and ensures the given path exists as a directory. If path does not exist, a new directory
319-
will be created. If the path exists but is a file, a ValueError will be raised. A TypeError is
320-
raised if the provided path is neither a string nor a `pathlib.Path` object.
321-
322-
Parameters:
323-
-----------
324-
path: str, pathlib.Path
325-
The path to be checked and ensured as a directory.
326-
327-
Raises:
328-
-------
329-
TypeError:
330-
If the provided path is neither a string nor a `pathlib.Path` object.
331-
ValueError:
332-
If the path points to an existing file.
333-
"""
334-
if isinstance(path, str):
335-
path = pathlib.Path(path)
336-
if not isinstance(path, pathlib.Path):
337-
raise TypeError(f'Unsupported type "{type(path)}" for `path`')
338-
path = path.resolve()
339-
if path.exists():
340-
if path.is_file():
341-
raise ValueError(f'`{var_name}` points to a file instead of a directory')
342-
else:
343-
path.mkdir(parents=True, exist_ok=True)
344310

345311
def __process_path(self, path, var_name):
346312
if path is None:
@@ -351,7 +317,7 @@ def __process_path(self, path, var_name):
351317
_path = path
352318
else:
353319
raise TypeError(f'Unsupported type "{type(path)}" for `{var_name}`')
354-
self.__check_path(_path, var_name)
320+
check_path(_path, var_name)
355321
return _path
356322

357323
@output_dir.setter

TELF/applications/Bunny/bunny.py

Lines changed: 3 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
from dataclasses import dataclass
1111
pd.set_option('future.no_silent_downcasting', True)
1212

13-
from ...applications.Penguin import Penguin, form_df
13+
from ..Penguin import Penguin
14+
from ..Penguin.crocodile import form_df
1415
from ...pre_processing.iPenguin.Scopus import Scopus
1516
from ...pre_processing.iPenguin.SemanticScholar import SemanticScholar
1617
from ...pre_processing.iPenguin.utils import format_pubyear
@@ -135,14 +136,6 @@ def find_doi(f):
135136
return match.group(1) if match else None
136137

137138

138-
def gen_chunks(l, n):
139-
"""Yield n number of sequential chunks from l."""
140-
d, r = divmod(len(l), n)
141-
for i in range(n):
142-
si = (d+1)*(i if i < r else r) + d*(0 if i < r else i - r)
143-
yield l[si:si+(d+1 if i < r else d)]
144-
145-
146139
class Bunny():
147140

148141
MODES = {'references', 'citations', 's2_author_ids'}
@@ -335,7 +328,7 @@ def form_core_scopus(self, data, data_type, keys, s2_dir='s2', scopus_dir='scopu
335328

336329
scopus_df.doi = scopus_df.doi.str.lower()
337330
scopus_df = scopus_df.loc[scopus_df.doi.isin(s2_dois)].copy()
338-
return s2_join_scopus(s2_df, scopus_df)
331+
return form_df(s2_df, scopus_df)
339332

340333

341334
def form_core(self, data, data_type, s2_dir='s2'):
@@ -752,20 +745,6 @@ def __evaluate_query(self, query, df, auth_map):
752745
return set.intersection(*results)
753746
elif query.operator == 'OR':
754747
return set.union(*results)
755-
756-
757-
def __form_query_str(self, query):
758-
if isinstance(query, BunnyFilter):
759-
ffunc = self.filter_funcs[query.filter_type]
760-
result = ffunc(df, query.filter_value, auth_map)
761-
return result
762-
elif isinstance(query, BunnyOperation):
763-
results = [self.__evaluate_query(operand, df, auth_map) for operand in query.operands]
764-
if query.operator == 'AND':
765-
return set.intersection(*results)
766-
elif query.operator == 'OR':
767-
return set.union(*results)
768-
769748

770749
def apply_filter(self, df, filters, filter_in_core=True, do_author_match=True):
771750
if 'eid' not in df and do_author_match:

TELF/applications/Cheetah/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
11
from .cheetah import Cheetah
2+
from .term_formatter import CheetahTermFormatter, convert_txt_to_cheetah_markdown
3+
from .term_generator import SearchTermGenerator
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
import os
2+
import warnings
3+
import pandas as pd
4+
from .cheetah import Cheetah
5+
6+
class CheetahTermFormatter:
    """
    Reads Cheetah search terms out of a Markdown file and exposes them as a
    list of plain strings or ``{term: {"positives": [...], "negatives": [...]}}``
    dicts, with optional filtering by ``# Category:`` section. When
    ``substitutions=True`` the formatter also builds a two-way lookup between
    each term and its underscore-joined form.

    Parameters
    ----------
    markdown_file : str
        Path to the Markdown term file.
    lower : bool
        Lowercase every term header when True.
    category : str, optional
        Keep only terms under ``# Category: <category>`` sections.
    include_general : bool
        When filtering by category, also keep terms that appear before any
        section header.
    substitutions : bool
        Build the forward/reverse substitution maps after parsing.
    all_categories : bool
        If True, ignore `category` and `include_general` and include every
        section.
    """

    def __init__(self, markdown_file, lower=False, category=None,
                 include_general=True, substitutions=False, all_categories=False):
        self.markdown_file = markdown_file
        self.lower = lower
        self.category = category
        self.include_general = include_general
        self.substitutions = substitutions
        self.all_categories = all_categories

        # term -> underscored form, and the inverse mapping
        self.substitution_forward = {}
        self.substitution_reverse = {}

        # parse the markdown into self.terms
        self.terms = self._parse_markdown()

        # optionally build the substitution lookup tables
        if self.substitutions:
            self._build_substitutions_lookup()

    def _section_included(self, section):
        """Decide whether terms under `section` (None = before any header) are kept."""
        if self.all_categories or self.category is None:
            return True
        if section is None and self.include_general:
            return True
        return section == self.category

    @staticmethod
    def _make_entry(term, positives, negatives):
        """Package one finished term block as a bare string or a constraint dict."""
        if positives or negatives:
            return {term: {"positives": positives, "negatives": negatives}}
        return term

    @staticmethod
    def _split_csv(line):
        """Return the stripped, non-empty comma-separated values after the first ':'."""
        return [piece.strip() for piece in line.split(":", 1)[1].split(",") if piece.strip()]

    def _parse_markdown(self):
        """Parse the markdown file into the list of term entries (empty on missing file)."""
        try:
            with open(self.markdown_file, 'r', encoding='utf-8') as handle:
                raw_lines = handle.readlines()
        except FileNotFoundError:
            warnings.warn(f"File '{self.markdown_file}' not found. Returning empty list.")
            return []

        parsed = []
        term = None          # header of the block currently being read
        must_have = []       # "Must have:" values collected for that block
        exclude_with = []    # "Exclude with:" values collected for that block
        keep_block = False   # whether the current block's section is included
        section = None       # current "# Category:" name; None before the first one

        for raw in raw_lines:
            text = raw.strip()

            # Section header: switch sections, nothing else to do for this line
            if text.startswith("# Category:"):
                section = text.split(":", 1)[1].strip()
                continue

            if text.startswith("##"):
                # Term header: flush the previous block before starting a new one
                if term is not None and keep_block:
                    parsed.append(self._make_entry(term, must_have, exclude_with))
                must_have = []
                exclude_with = []
                term = text.lstrip("#").strip()
                if self.lower:
                    term = term.lower()
                keep_block = self._section_included(section)
            elif keep_block and text.lower().startswith("must have:"):
                must_have.extend(self._split_csv(text))
            elif keep_block and text.lower().startswith("exclude with:"):
                exclude_with.extend(self._split_csv(text))

        # Flush the trailing block, if any
        if term is not None and keep_block:
            parsed.append(self._make_entry(term, must_have, exclude_with))

        return parsed

    def _build_substitutions_lookup(self):
        """Record term <-> underscored-term mappings for every parsed entry."""
        for entry in self.terms:
            if isinstance(entry, str):
                names = [entry]
            elif isinstance(entry, dict):
                names = list(entry.keys())
            else:
                names = []
            for name in names:
                joined = name.replace(" ", "_")
                self.substitution_forward[name] = joined
                self.substitution_reverse[joined] = name

    def get_terms(self):
        """Return the parsed term entries."""
        return self.terms

    def get_substitution_maps(self):
        """
        Return the (forward, reverse) substitution dicts.

        Both are empty unless the formatter was built with substitutions=True.
        """
        return self.substitution_forward, self.substitution_reverse
141+
142+
143+
def convert_txt_to_cheetah_markdown(txt_path, markdown_path):
    """
    Convert a plain-text term list into the Markdown format that
    CheetahTermFormatter parses.

    Each non-empty line of the input is either:

    * a dict literal such as ``{"term": ["+pos", "neg"]}`` — values prefixed
      with ``'+'`` become "Must have:" entries and all other values become
      "Exclude with:" entries; or
    * a bare term, emitted as a plain ``## term`` header.

    Lines that look like dicts but fail to parse are skipped with a message.

    Parameters
    ----------
    txt_path : str
        Path of the input text file.
    markdown_path : str
        Path where the generated Markdown is written.
    """
    import ast

    with open(txt_path, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f if line.strip()]

    markdown_lines = []

    for line in lines:
        if line.startswith("{") and line.endswith("}"):
            try:
                parsed = ast.literal_eval(line)
                for key, value in parsed.items():
                    positives = [v.lstrip('+') for v in value if v.startswith('+')]
                    negatives = [v for v in value if not v.startswith('+')]
                    markdown_lines.append(f"## {key}")
                    # Emit the headers CheetahTermFormatter._parse_markdown
                    # actually recognizes ("Must have:" / "Exclude with:");
                    # the previous "positives:"/"negatives:" labels were
                    # silently ignored by the parser.
                    if positives:
                        markdown_lines.append(f"Must have: {', '.join(positives)}")
                    if negatives:
                        markdown_lines.append(f"Exclude with: {', '.join(negatives)}")
            except Exception as e:
                print(f"Skipping line due to parse error: {line}\nError: {e}")
        else:
            markdown_lines.append(f"## {line.strip()}")

    with open(markdown_path, 'w', encoding='utf-8') as f:
        f.write("\n".join(markdown_lines))

    print(f"Converted markdown saved to: {markdown_path}")

0 commit comments

Comments
 (0)