@@ -32,18 +32,10 @@ def __init__(self,
32
32
self .node_name = node_name
33
33
self .parent_node = parent_node
34
34
self .child_nodes = child_nodes
35
- self .node_data = None
35
+ self .original_child_nodes = child_nodes
36
36
37
- def __call__ (self , persistent = False ):
38
-
39
- if persistent :
40
- if self .node_data is None :
41
- self .node_data = pickle .load (open (self .node_path , "rb" ))
42
-
43
- return self .node_data
44
-
45
- else :
46
- return pickle .load (open (self .node_path , "rb" ))
37
+ def __call__ (self ):
38
+ return pickle .load (open (self .node_path , "rb" ))
47
39
48
40
49
41
class Node ():
@@ -78,6 +70,7 @@ def __init__(self,
78
70
self .parent_node_k = parent_node_k
79
71
self .parent_node_name = parent_node_name
80
72
self .child_node_names = child_node_names
73
+ self .original_child_node_names = child_node_names
81
74
self .original_indices = original_indices
82
75
self .num_samples = num_samples
83
76
self .leaf = leaf
@@ -109,7 +102,8 @@ def __init__(self,
109
102
n_nodes = 1 ,
110
103
verbose = True ,
111
104
comm_buff_size = 10000000 ,
112
- random_identifiers = False
105
+ random_identifiers = False ,
106
+ root_node_name = "Root"
113
107
):
114
108
"""
115
109
HNMFk is a Hierarchical Non-negative Matrix Factorization module with the capability to do automatic model determination.
@@ -154,6 +148,8 @@ def __init__(self,
154
148
If True, it prints progress. The default is True.
155
149
random_identifiers : bool, optional
156
150
If True, model will use randomly generated strings as the identifiers of the nodes. Otherwise, it will use the k for ancestry naming convention.
151
+ root_node_name : str, optional
152
+ Naming convention to be used when saving the root name. Default is "Root".
157
153
Returns
158
154
-------
159
155
None.
@@ -174,6 +170,7 @@ def __init__(self,
174
170
self .verbose = verbose
175
171
self .comm_buff_size = comm_buff_size
176
172
self .random_identifiers = random_identifiers
173
+ self .root_node_name = root_node_name
177
174
178
175
organized_nmfk_params = []
179
176
for params in nmfk_params :
@@ -309,7 +306,7 @@ def fit(self, X, Ks, from_checkpoint=False, save_checkpoint=True):
309
306
if self .random_identifiers :
310
307
self .root_name = str (uuid .uuid1 ())
311
308
else :
312
- self .root_name = "*"
309
+ self .root_name = self . root_node_name
313
310
314
311
self .target_jobs [self .root_name ] = {
315
312
"parent_node_name" :"None" ,
@@ -726,6 +723,110 @@ def traverse_nodes(self):
726
723
727
724
return return_data
728
725
726
+ def traverse_tiny_leaf_topics (self , threshold = 5 ):
727
+ """
728
+ Graph iterator with thresholding on the number of documents. Returns a list of nodes where the number of documents is less than the threshold.\n
729
+ This operation is online, only the nodes that are outliers based on the number of documents are kept in the memory.
730
+
731
+ Parameters
732
+ ----------
733
+ threshold : int
734
+ Minimum number of documents each node should have.
735
+
736
+ Returns
737
+ -------
738
+ data : list
739
+ List of dictionaries in the node format, one for each entry in the list.
740
+
741
+ """
742
+ self ._all_nodes = []
743
+ self ._get_traversal (self .root , small_docs_thresh = threshold )
744
+ return_data = self ._all_nodes .copy ()
745
+ self ._all_nodes = []
746
+
747
+ return return_data
748
+
749
+ def get_tiny_leaf_topics (self ):
750
+ """
751
+ Graph iterator for tiny leaf topics, if already processed with self.process_tiny_leaf_topics(threshold:int).\n
752
+
753
+ Returns
754
+ -------
755
+ tiny_leafs : list
756
+ List of dictionaries in the node format, one for each entry in the list.
757
+
758
+ """
759
+ try :
760
+ return pickle .load (open (os .path .join (self .experiment_name , "tiny_leafs.p" ), "rb" ))
761
+ except Exception as e :
762
+ print ("Could not load the tiny leafs. Did you call process_tiny_leaf_topics(threshold:int)?" , e )
763
+ return None
764
+
765
+ def process_tiny_leaf_topics (self , threshold = 5 ):
766
+ """
767
+ Graph post-processing with thresholding on number of documents.\n
768
+ Returns a list of all tiny nodes, with all the nodes that had number of documents less than the threshold.\n
769
+ Removes these outlier nodes from child-node lists on the original graph from their parents.\n
770
+ Graph is re-set each time this function is called such that original child nodes are re-assigned.\n
771
+ If threshold=None, this function will re-assign the original child indices only, and return None.
772
+
773
+ Parameters
774
+ ----------
775
+ threshold : int
776
+ Minimum number of documents each node should have.
777
+
778
+ Returns
779
+ -------
780
+ tiny_leafs : list
781
+ List of dictionaries in the node format, one for each entry in the list.
782
+
783
+ """
784
+
785
+ # set the old child nodes on each node
786
+ self ._update_child_nodes_traversal (self .root )
787
+
788
+ # remove the old saved tiny leafs
789
+ try :
790
+ os .remove (os .path .join (self .experiment_name , "tiny_leafs.p" ))
791
+ except :
792
+ pass
793
+
794
+ # if threshold is None, we have reverted everything; nothing left to do
795
+ if threshold is None :
796
+ return
797
+
798
+ tiny_leafs = self .traverse_tiny_leaf_topics (threshold = threshold )
799
+ pickle .dump (tiny_leafs , open (os .path .join (self .experiment_name , "tiny_leafs.p" ), "wb" ))
800
+
801
+ # remove tiny leafs from their parents
802
+ for tf in tiny_leafs :
803
+ my_name = tf ["node_name" ]
804
+ parent_name = tf ["parent_node_name" ]
805
+ parent_node = self ._search_traversal (self .root , parent_name )
806
+
807
+ # remove from online iterator
808
+ parent_node .child_nodes = [node for node in parent_node .child_nodes if node .node_name != my_name ]
809
+
810
+ # also need to remove from saved node data
811
+ parent_node_loaded = parent_node ()
812
+ parent_node_loaded .child_node_names = [node_name for node_name in parent_node_loaded .child_node_names if node_name != my_name ]
813
+ pickle .dump (parent_node_loaded , open (os .path .join (self .experiment_name , * parent_node_loaded .node_save_path .split (os .sep )[1 :]), "wb" ))
814
+
815
+ return tiny_leafs
816
+
817
+ def _update_child_nodes_traversal (self , node ):
818
+
819
+ for nn in node .original_child_nodes :
820
+ self ._update_child_nodes_traversal (nn )
821
+
822
+ if node .child_nodes != node .original_child_nodes :
823
+ node .child_nodes = node .original_child_nodes
824
+
825
+ node_loaded = node ()
826
+ if node_loaded .original_child_node_names != node_loaded .child_node_names :
827
+ node_loaded .child_node_names = node_loaded .original_child_node_names
828
+ pickle .dump (node_loaded , open (os .path .join (self .experiment_name , * node_loaded .node_save_path .split (os .sep )[1 :]), "wb" ))
829
+
729
830
def _search_traversal (self , node , name ):
730
831
731
832
# Base case: if the current node matches the target name
@@ -743,12 +844,17 @@ def _search_traversal(self, node, name):
743
844
# If the node is not found in this branch, return None
744
845
return None
745
846
746
- def _get_traversal (self , node ):
847
+ def _get_traversal (self , node , small_docs_thresh = None ):
747
848
748
849
for nn in node .child_nodes :
749
- self ._get_traversal (nn )
850
+ self ._get_traversal (nn , small_docs_thresh = small_docs_thresh )
851
+
852
+ if small_docs_thresh is not None :
853
+ tmp_node_data = vars (node ()).copy ()
854
+ if not (tmp_node_data ["leaf" ] and tmp_node_data ["num_samples" ] < small_docs_thresh ):
855
+ return
750
856
751
- data = vars (node (persistent = True )).copy ()
857
+ data = vars (node ()).copy ()
752
858
data ["node_save_path" ] = os .path .join (self .experiment_name , * data ["node_save_path" ].split (os .sep )[1 :])
753
859
if data ["node_name" ] != self .root_name :
754
860
data ["parent_node_save_path" ] = os .path .join (self .experiment_name , * data ["parent_node_save_path" ].split (os .sep )[1 :])
0 commit comments