Skip to content

Commit 4561828

Browse files
authored
Merge pull request #183 from lanl/develop
Develop
2 parents a84b855 + 9336994 commit 4561828

File tree

92 files changed

+1131
-847
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

92 files changed

+1131
-847
lines changed

CITATION.cff

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
version: 0.0.35
1+
version: 0.0.36
22
message: "If you use this software, please cite it as below."
33
authors:
44
- family-names: Eren
@@ -20,7 +20,7 @@ authors:
2020
- family-names: Alexandrov
2121
given-names: Boian
2222
title: "Tensor Extraction of Latent Features (T-ELF)"
23-
version: 0.0.35
23+
version: 0.0.36
2424
url: https://github.com/lanl/T-ELF
2525
doi: 10.5281/zenodo.10257897
2626
date-released: 2023-12-04

TELF/factorization/HNMFk.py

Lines changed: 122 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -32,18 +32,10 @@ def __init__(self,
3232
self.node_name = node_name
3333
self.parent_node = parent_node
3434
self.child_nodes = child_nodes
35-
self.node_data = None
35+
self.original_child_nodes = child_nodes
3636

37-
def __call__(self, persistent=False):
38-
39-
if persistent:
40-
if self.node_data is None:
41-
self.node_data = pickle.load(open(self.node_path, "rb"))
42-
43-
return self.node_data
44-
45-
else:
46-
return pickle.load(open(self.node_path, "rb"))
37+
def __call__(self):
38+
return pickle.load(open(self.node_path, "rb"))
4739

4840

4941
class Node():
@@ -78,6 +70,7 @@ def __init__(self,
7870
self.parent_node_k = parent_node_k
7971
self.parent_node_name = parent_node_name
8072
self.child_node_names = child_node_names
73+
self.original_child_node_names = child_node_names
8174
self.original_indices = original_indices
8275
self.num_samples = num_samples
8376
self.leaf = leaf
@@ -109,7 +102,8 @@ def __init__(self,
109102
n_nodes=1,
110103
verbose=True,
111104
comm_buff_size=10000000,
112-
random_identifiers=False
105+
random_identifiers=False,
106+
root_node_name = "Root"
113107
):
114108
"""
115109
HNMFk is a Hierarchical Non-negative Matrix Factorization module with the capability to do automatic model determination.
@@ -154,6 +148,8 @@ def __init__(self,
154148
If True, it prints progress. The default is True.
155149
random_identifiers : bool, optional
156150
If True, model will use randomly generated strings as the identifiers of the nodes. Otherwise, it will use the k for ancestry naming convention.
151+
root_node_name : str, optional
152+
Naming convention to be used when saving the root name. Default is "Root".
157153
Returns
158154
-------
159155
None.
@@ -174,6 +170,7 @@ def __init__(self,
174170
self.verbose = verbose
175171
self.comm_buff_size = comm_buff_size
176172
self.random_identifiers = random_identifiers
173+
self.root_node_name = root_node_name
177174

178175
organized_nmfk_params = []
179176
for params in nmfk_params:
@@ -309,7 +306,7 @@ def fit(self, X, Ks, from_checkpoint=False, save_checkpoint=True):
309306
if self.random_identifiers:
310307
self.root_name = str(uuid.uuid1())
311308
else:
312-
self.root_name = "*"
309+
self.root_name = self.root_node_name
313310

314311
self.target_jobs[self.root_name] = {
315312
"parent_node_name":"None",
@@ -726,6 +723,110 @@ def traverse_nodes(self):
726723

727724
return return_data
728725

726+
def traverse_tiny_leaf_topics(self, threshold=5):
727+
"""
728+
Graph iterator with thresholding on number of documents. Returns a list of nodes where the number of documents is less than the threshold.\n
729+
This operation is online; only the nodes that are outliers based on the number of documents are kept in memory.
730+
731+
Parameters
732+
----------
733+
threshold : int
734+
Minimum number of documents each node should have.
735+
736+
Returns
737+
-------
738+
data : list
739+
List of dictionaries, one node-format dictionary per entry in the list.
740+
741+
"""
742+
self._all_nodes = []
743+
self._get_traversal(self.root, small_docs_thresh=threshold)
744+
return_data = self._all_nodes.copy()
745+
self._all_nodes = []
746+
747+
return return_data
748+
749+
def get_tiny_leaf_topics(self):
750+
"""
751+
Returns the tiny leaf topics, if already processed with self.process_tiny_leaf_topics(threshold:int).\n
752+
753+
Returns
754+
-------
755+
tiny_leafs : list
756+
List of dictionaries, one node-format dictionary per entry in the list.
757+
758+
"""
759+
try:
760+
return pickle.load(open(os.path.join(self.experiment_name, "tiny_leafs.p"), "rb"))
761+
except Exception as e:
762+
print("Could not load the tiny leafs. Did you call process_tiny_leaf_topics(threshold:int)?", e)
763+
return None
764+
765+
def process_tiny_leaf_topics(self, threshold=5):
766+
"""
767+
Graph post-processing with thresholding on number of documents.\n
768+
Returns a list of all tiny nodes, with all the nodes that had number of documents less than the threshold.\n
769+
Removes these outlier nodes from their parents' child-node lists on the original graph.\n
770+
Graph is re-set each time this function is called such that original child nodes are re-assigned.\n
771+
If threshold=None, this function will re-assign the original child indices only, and return None.
772+
773+
Parameters
774+
----------
775+
threshold : int
776+
Minimum number of documents each node should have.
777+
778+
Returns
779+
-------
780+
tiny_leafs : list
781+
List of dictionaries, one node-format dictionary per entry in the list.
782+
783+
"""
784+
785+
# set the old child nodes on each node
786+
self._update_child_nodes_traversal(self.root)
787+
788+
# remove the old saved tiny leafs
789+
try:
790+
os.remove(os.path.join(self.experiment_name, "tiny_leafs.p"))
791+
except:
792+
pass
793+
794+
# if threshold is None, we have reverted everything; nothing more to do
795+
if threshold is None:
796+
return
797+
798+
tiny_leafs = self.traverse_tiny_leaf_topics(threshold=threshold)
799+
pickle.dump(tiny_leafs, open(os.path.join(self.experiment_name, "tiny_leafs.p"), "wb"))
800+
801+
# remove tiny leafs from their parents
802+
for tf in tiny_leafs:
803+
my_name = tf["node_name"]
804+
parent_name = tf["parent_node_name"]
805+
parent_node = self._search_traversal(self.root, parent_name)
806+
807+
# remove from online iterator
808+
parent_node.child_nodes = [node for node in parent_node.child_nodes if node.node_name != my_name]
809+
810+
# also need to remove from saved node data
811+
parent_node_loaded = parent_node()
812+
parent_node_loaded.child_node_names = [node_name for node_name in parent_node_loaded.child_node_names if node_name != my_name]
813+
pickle.dump(parent_node_loaded, open(os.path.join(self.experiment_name, *parent_node_loaded.node_save_path.split(os.sep)[1:]), "wb"))
814+
815+
return tiny_leafs
816+
817+
def _update_child_nodes_traversal(self, node):
818+
819+
for nn in node.original_child_nodes:
820+
self._update_child_nodes_traversal(nn)
821+
822+
if node.child_nodes != node.original_child_nodes:
823+
node.child_nodes = node.original_child_nodes
824+
825+
node_loaded = node()
826+
if node_loaded.original_child_node_names != node_loaded.child_node_names:
827+
node_loaded.child_node_names = node_loaded.original_child_node_names
828+
pickle.dump(node_loaded, open(os.path.join(self.experiment_name, *node_loaded.node_save_path.split(os.sep)[1:]), "wb"))
829+
729830
def _search_traversal(self, node, name):
730831

731832
# Base case: if the current node matches the target name
@@ -743,12 +844,17 @@ def _search_traversal(self, node, name):
743844
# If the node is not found in this branch, return None
744845
return None
745846

746-
def _get_traversal(self, node):
847+
def _get_traversal(self, node, small_docs_thresh=None):
747848

748849
for nn in node.child_nodes:
749-
self._get_traversal(nn)
850+
self._get_traversal(nn, small_docs_thresh=small_docs_thresh)
851+
852+
if small_docs_thresh is not None:
853+
tmp_node_data = vars(node()).copy()
854+
if not (tmp_node_data["leaf"] and tmp_node_data["num_samples"] < small_docs_thresh):
855+
return
750856

751-
data = vars(node(persistent=True)).copy()
857+
data = vars(node()).copy()
752858
data["node_save_path"] = os.path.join(self.experiment_name, *data["node_save_path"].split(os.sep)[1:])
753859
if data["node_name"] != self.root_name:
754860
data["parent_node_save_path"] = os.path.join(self.experiment_name, *data["parent_node_save_path"].split(os.sep)[1:])

TELF/pre_processing/Beaver/beaver.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -877,7 +877,7 @@ def documents_words(self,
877877
for widx, token in tqdm(enumerate(highlighting), disable=not verbose):
878878
idxs = np.where(vocabulary == token)[0]
879879
if len(idxs):
880-
X[idxs[0]] = X[idxs[0]] * weights[widx]
880+
X[:, idxs] = X[:, idxs] * weights[widx]
881881

882882
# convert to pydata sparse for consistency across all beaver methods
883883
X = sparse.COO.from_scipy_sparse(X)

TELF/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.0.35"
1+
__version__ = "0.0.36"

docs/Beaver.html

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
<meta charset="utf-8" />
99
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
1010

11-
<title>TELF.pre_processing.Beaver: Fast matrix and tensor building tool &#8212; TELF 0.0.35 documentation</title>
11+
<title>TELF.pre_processing.Beaver: Fast matrix and tensor building tool &#8212; TELF 0.0.36 documentation</title>
1212

1313

1414

@@ -40,7 +40,7 @@
4040
<link rel="preload" as="script" href="_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
4141
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
4242

43-
<script src="_static/documentation_options.js?v=6aa38c3a"></script>
43+
<script src="_static/documentation_options.js?v=7dd70c5c"></script>
4444
<script src="_static/doctools.js?v=9bcbadda"></script>
4545
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
4646
<script src="_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -126,7 +126,7 @@
126126

127127

128128

129-
<p class="title logo__title">TELF 0.0.35 documentation</p>
129+
<p class="title logo__title">TELF 0.0.36 documentation</p>
130130

131131
</a></div>
132132
<div class="sidebar-primary-item">

docs/Cheetah.html

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
<meta charset="utf-8" />
99
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
1010

11-
<title>TELF.applications.Cheetah: Advanced search by keywords and phrases &#8212; TELF 0.0.35 documentation</title>
11+
<title>TELF.applications.Cheetah: Advanced search by keywords and phrases &#8212; TELF 0.0.36 documentation</title>
1212

1313

1414

@@ -40,7 +40,7 @@
4040
<link rel="preload" as="script" href="_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
4141
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
4242

43-
<script src="_static/documentation_options.js?v=6aa38c3a"></script>
43+
<script src="_static/documentation_options.js?v=7dd70c5c"></script>
4444
<script src="_static/doctools.js?v=9bcbadda"></script>
4545
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
4646
<script src="_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -126,7 +126,7 @@
126126

127127

128128

129-
<p class="title logo__title">TELF 0.0.35 documentation</p>
129+
<p class="title logo__title">TELF 0.0.36 documentation</p>
130130

131131
</a></div>
132132
<div class="sidebar-primary-item">

0 commit comments

Comments
 (0)