Skip to content

Commit 581cceb

Browse files
authored
Merge pull request #176 from lanl/develop
Develop
2 parents 309eb02 + 979a22a commit 581cceb

File tree

93 files changed

+380
-227
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

93 files changed

+380
-227
lines changed

CITATION.cff

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ authors:
2020
- family-names: Alexandrov
2121
given-names: Boian
2222
title: "Tensor Extraction of Latent Features (T-ELF)"
23-
version: 0.0.19
23+
version: 0.0.20
2424
url: https://github.com/lanl/T-ELF
2525
doi: 10.5281/zenodo.10257897
2626
date-released: 2023-12-04

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ If you use T-ELF please cite.
153153

154154
**APA:**
155155
```latex
156-
Eren, M., Solovyev, N., Barron, R., Bhattarai, M., Truong, D., Boureima, I., Skau, E., Rasmussen, K., & Alexandrov, B. (2023). Tensor Extraction of Latent Features (T-ELF) (Version 0.0.19) [Computer software]. https://doi.org/10.5281/zenodo.10257897
156+
Eren, M., Solovyev, N., Barron, R., Bhattarai, M., Truong, D., Boureima, I., Skau, E., Rasmussen, K., & Alexandrov, B. (2023). Tensor Extraction of Latent Features (T-ELF) (Version 0.0.20) [Computer software]. https://doi.org/10.5281/zenodo.10257897
157157
```
158158

159159
**BibTeX:**

TELF/factorization/HNMFk.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -536,6 +536,8 @@ def _process_node(self, Ks, depth, original_indices, node_name, parent_node_name
536536
if len(cluster_c_indices) == 0:
537537
continue
538538

539+
extracted_indicies = [current_node.original_indices[i] for i in cluster_c_indices]
540+
539541
# save current results
540542
next_name = str(uuid.uuid1())
541543
current_node.child_node_names.append(next_name)
@@ -544,8 +546,8 @@ def _process_node(self, Ks, depth, original_indices, node_name, parent_node_name
544546
next_job = {
545547
"parent_node_name":node_name,
546548
"node_name":next_name,
547-
"Ks":self._get_curr_Ks(node_k=current_node.k, num_samples=len(cluster_c_indices)),
548-
"original_indices":cluster_c_indices.copy(),
549+
"Ks":self._get_curr_Ks(node_k=current_node.k, num_samples=len(extracted_indicies)),
550+
"original_indices":extracted_indicies.copy(),
549551
"depth":current_node.depth+1,
550552
"parent_topic":c,
551553
}
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
from nltk.stem import PorterStemmer
2+
import re
3+
from concurrent.futures import ThreadPoolExecutor
4+
from tqdm import tqdm
5+
from TELF.pre_processing.Vulture.tokens_analysis.levenstein import compare_keys
6+
7+
"""
8+
SAMPLE USAGE
9+
----------
10+
stem_processor = StemProcessor(vocabulary)
11+
subs_stemed, new_vocabulary = stem_processor()
12+
"""
13+
class StemProcessor:
    """
    Consolidate a vocabulary by mapping word variants onto one representative.

    Words are grouped by their Porter stem, groups whose stems are judged
    similar by ``compare_keys`` are merged, and every word in a group is
    mapped to that group's destination word.
    """

    # Default English suffixes, listed longest first so that the longest
    # matching suffix is the one stripped.
    SUFFIXES = ['acity', 'ation', 'ative', 'cracy', 'craft', 'esque', 'able',
                'ance', 'ancy', 'cide', 'ence', 'ency', 'hood', 'ible', 'less',
                'like', 'ment', 'ness', 'ship', 'sion', 'ster', 'tion', 'ward',
                'ware', 'wise', 'acy', 'ant', 'ary', 'ate', 'dom', 'ent', 'ern',
                'ese', 'ess', 'est', 'ful', 'ian', 'ice', 'ify', 'ing', 'ion',
                'ish', 'ism', 'ist', 'ity', 'ive', 'ize', 'ory', 'ous', 'ac',
                'al', 'ar', 'ed', 'ee', 'en', 'er', 'fy', 'ic', 'ly', 'or', 'ty',
                'y']

    def __init__(self, vocabulary, suffixes=None):
        """
        Store values for processing in functions

        Parameters
        ----------
        vocabulary : list
            words from the corpus
        suffixes : list, optional
            suffixes to strip before comparing stems. When omitted (or empty)
            the class-level ``SUFFIXES`` are used.
        """
        chosen = suffixes if suffixes else self.SUFFIXES
        # Always build a fresh, longest-first list: this keeps the instance
        # from aliasing (and accidentally mutating) the class-level default,
        # and drops empty suffixes, which would otherwise match every word.
        self.suffixes = sorted((suffix for suffix in chosen if suffix), key=len, reverse=True)
        self.vocabulary = vocabulary

    def strip_suffixes(self, word):
        """
        Removes the first (i.e. longest) matching suffix from a word

        Only one suffix is removed per call; the result is not stripped again.

        Parameters
        ----------
        word : str
            word (or stem) to strip

        Returns
        -------
        word : str
            word without the matched suffix, or the word unchanged when no
            suffix matches
        """
        for suffix in self.suffixes:
            # An empty suffix matches everything and word[:-0] is '', so it
            # must never be treated as a match.
            if suffix and word.endswith(suffix):
                return word[:-len(suffix)]
        return word

    def unify_common_stems(self, vocab_stems, similarity_threshold=0.9, min_word_length=5, n_jobs=None):
        """
        Merges stem groups whose stems are similar once suffixes are stripped

        Two stems are compared only when they share the same first character
        and are both strictly longer than ``min_word_length``. For each similar
        pair, the words of the longer stem are moved into the group of the
        shorter stem. If the shorter stem was itself merged earlier, the group
        it was merged into is used instead. The surviving group keeps its own
        'dest' word.

        Parameters
        ----------
        vocab_stems : dict (str : dict)
            maps stem -> {'src': list of words, 'dest': word}, as produced by
            ``build_stem_map``. Modified in place.
        similarity_threshold : float
            similarity cutoff forwarded to ``compare_keys``
        min_word_length : int
            only stems strictly longer than this are considered
        n_jobs : int
            number of worker threads (``None`` uses the executor default)

        Returns
        -------
        vocab_stems : dict (str : dict)
            the same dictionary, with similar stem groups merged
        """
        def compare_stems(stem_pair):
            stem_i, stem_j = stem_pair
            is_similar, _ = compare_keys(
                self.strip_suffixes(stem_i),
                self.strip_suffixes(stem_j),
                threshold=similarity_threshold,
            )
            return stem_pair if is_similar else None

        # Bucket eligible stems by first character before pairing. Filtering
        # first avoids generating pairs that could never match, and skipping
        # empty stems avoids indexing into ''.
        by_initial = {}
        for stem in vocab_stems:
            if stem and len(stem) > min_word_length:
                by_initial.setdefault(stem[0], []).append(stem)

        stem_pairs = [
            (first, second)
            for group in by_initial.values()
            for position, first in enumerate(group)
            for second in group[position + 1:]
        ]

        with ThreadPoolExecutor(max_workers=n_jobs) as executor:
            results = list(tqdm(executor.map(compare_stems, stem_pairs), total=len(stem_pairs)))

        matches = [result for result in results if result is not None]

        # merged_into[stem] is the stem whose group absorbed `stem`.
        merged_into = {}
        for stem_i, stem_j in matches:
            shortest_stem = min(stem_i, stem_j, key=len)
            longest_stem = stem_j if shortest_stem == stem_i else stem_i

            if longest_stem not in vocab_stems:
                # Already absorbed by an earlier pair.
                continue

            # The shorter stem may already have been absorbed; follow the
            # chain of merges until a surviving stem is found.
            target = shortest_stem
            while target not in vocab_stems and target in merged_into:
                target = merged_into[target]
            if target not in vocab_stems or target == longest_stem:
                continue

            vocab_stems[target]['src'].extend(vocab_stems.pop(longest_stem)['src'])
            merged_into[longest_stem] = target

        return vocab_stems

    def build_stem_map(self):
        """
        Stems vocabulary map, ununified

        Returns
        -------
        vocab_stems : dict (str : dict)
            maps each Porter stem -> {'src': words sharing that stem, in
            vocabulary order, 'dest': the first shortest word among them}
        """
        ps = PorterStemmer()
        vocab_stems = {}
        for word in self.vocabulary:
            stem = ps.stem(word)
            entry = vocab_stems.get(stem)
            if entry is None:
                vocab_stems[stem] = {'src': [word], 'dest': word}
                continue

            entry['src'].append(word)
            # Strict comparison keeps the earliest word among equally short
            # ones, matching min(src, key=len) without rescanning the list.
            if len(word) < len(entry['dest']):
                entry['dest'] = word

        return vocab_stems

    def __call__(self):
        """
        Stems vocabulary, constructs map of all variants to their destination word.

        Returns
        -------
        subs_stemed : dict (str:str)
            maps every word that is not its group's destination word to that
            destination word
        shortened_vocabulary : list
            new vocabulary post-consolidation (the destination words), sorted
            so that the result is reproducible between runs
        """
        vocab_stems = self.unify_common_stems(self.build_stem_map())

        subs_stemed = {}
        shortened_vocabulary = set()
        for info in vocab_stems.values():
            destination_word = info['dest']
            shortened_vocabulary.add(destination_word)
            for src in info['src']:
                if src != destination_word:
                    subs_stemed[src] = destination_word

        return subs_stemed, sorted(shortened_vocabulary)

TELF/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = '0.0.19'
1+
__version__ = '0.0.20'

docs/Beaver.html

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
<meta charset="utf-8" />
99
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
1010

11-
<title>TELF.pre_processing.Beaver: Fast matrix and tensor building tool &#8212; TELF 0.0.19 documentation</title>
11+
<title>TELF.pre_processing.Beaver: Fast matrix and tensor building tool &#8212; TELF 0.0.20 documentation</title>
1212

1313

1414

@@ -37,7 +37,7 @@
3737
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=365ca57ee442770a23c6" />
3838
<script src="_static/vendor/fontawesome/6.1.2/js/all.min.js?digest=365ca57ee442770a23c6"></script>
3939

40-
<script src="_static/documentation_options.js?v=f00aad14"></script>
40+
<script src="_static/documentation_options.js?v=30839ccb"></script>
4141
<script src="_static/doctools.js?v=888ff710"></script>
4242
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
4343
<script src="_static/scripts/sphinx-book-theme.js?digest=5a5c038af52cf7bc1a1ec88eea08e6366ee68824"></script>
@@ -127,7 +127,7 @@
127127

128128

129129

130-
<p class="title logo__title">TELF 0.0.19 documentation</p>
130+
<p class="title logo__title">TELF 0.0.20 documentation</p>
131131

132132
</a></div>
133133
<div class="sidebar-primary-item"><nav class="bd-links" id="bd-docs-nav" aria-label="Main">

docs/Cheetah.html

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
<meta charset="utf-8" />
99
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
1010

11-
<title>TELF.applications.Cheetah: Advanced search by keywords and phrases &#8212; TELF 0.0.19 documentation</title>
11+
<title>TELF.applications.Cheetah: Advanced search by keywords and phrases &#8212; TELF 0.0.20 documentation</title>
1212

1313

1414

@@ -37,7 +37,7 @@
3737
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=365ca57ee442770a23c6" />
3838
<script src="_static/vendor/fontawesome/6.1.2/js/all.min.js?digest=365ca57ee442770a23c6"></script>
3939

40-
<script src="_static/documentation_options.js?v=f00aad14"></script>
40+
<script src="_static/documentation_options.js?v=30839ccb"></script>
4141
<script src="_static/doctools.js?v=888ff710"></script>
4242
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
4343
<script src="_static/scripts/sphinx-book-theme.js?digest=5a5c038af52cf7bc1a1ec88eea08e6366ee68824"></script>
@@ -127,7 +127,7 @@
127127

128128

129129

130-
<p class="title logo__title">TELF 0.0.19 documentation</p>
130+
<p class="title logo__title">TELF 0.0.20 documentation</p>
131131

132132
</a></div>
133133
<div class="sidebar-primary-item"><nav class="bd-links" id="bd-docs-nav" aria-label="Main">

docs/HNMFk.html

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
<meta charset="utf-8" />
99
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
1010

11-
<title>TELF.factorization.HNMFk: Hierarchical Non-negative Matrix Factorization with Automatic Model Determination &#8212; TELF 0.0.19 documentation</title>
11+
<title>TELF.factorization.HNMFk: Hierarchical Non-negative Matrix Factorization with Automatic Model Determination &#8212; TELF 0.0.20 documentation</title>
1212

1313

1414

@@ -37,7 +37,7 @@
3737
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=365ca57ee442770a23c6" />
3838
<script src="_static/vendor/fontawesome/6.1.2/js/all.min.js?digest=365ca57ee442770a23c6"></script>
3939

40-
<script src="_static/documentation_options.js?v=f00aad14"></script>
40+
<script src="_static/documentation_options.js?v=30839ccb"></script>
4141
<script src="_static/doctools.js?v=888ff710"></script>
4242
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
4343
<script src="_static/scripts/sphinx-book-theme.js?digest=5a5c038af52cf7bc1a1ec88eea08e6366ee68824"></script>
@@ -127,7 +127,7 @@
127127

128128

129129

130-
<p class="title logo__title">TELF 0.0.19 documentation</p>
130+
<p class="title logo__title">TELF 0.0.20 documentation</p>
131131

132132
</a></div>
133133
<div class="sidebar-primary-item"><nav class="bd-links" id="bd-docs-nav" aria-label="Main">

docs/NMFk.html

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
<meta charset="utf-8" />
99
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
1010

11-
<title>TELF.factorization.NMFk: Non-negative Matrix Factorization with Automatic Model Determination &#8212; TELF 0.0.19 documentation</title>
11+
<title>TELF.factorization.NMFk: Non-negative Matrix Factorization with Automatic Model Determination &#8212; TELF 0.0.20 documentation</title>
1212

1313

1414

@@ -37,7 +37,7 @@
3737
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=365ca57ee442770a23c6" />
3838
<script src="_static/vendor/fontawesome/6.1.2/js/all.min.js?digest=365ca57ee442770a23c6"></script>
3939

40-
<script src="_static/documentation_options.js?v=f00aad14"></script>
40+
<script src="_static/documentation_options.js?v=30839ccb"></script>
4141
<script src="_static/doctools.js?v=888ff710"></script>
4242
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
4343
<script src="_static/scripts/sphinx-book-theme.js?digest=5a5c038af52cf7bc1a1ec88eea08e6366ee68824"></script>
@@ -127,7 +127,7 @@
127127

128128

129129

130-
<p class="title logo__title">TELF 0.0.19 documentation</p>
130+
<p class="title logo__title">TELF 0.0.20 documentation</p>
131131

132132
</a></div>
133133
<div class="sidebar-primary-item"><nav class="bd-links" id="bd-docs-nav" aria-label="Main">

docs/RESCALk.html

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
<meta charset="utf-8" />
99
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
1010

11-
<title>TELF.factorization.RESCALk: RESCAL with Automatic Model Determination &#8212; TELF 0.0.19 documentation</title>
11+
<title>TELF.factorization.RESCALk: RESCAL with Automatic Model Determination &#8212; TELF 0.0.20 documentation</title>
1212

1313

1414

@@ -37,7 +37,7 @@
3737
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=365ca57ee442770a23c6" />
3838
<script src="_static/vendor/fontawesome/6.1.2/js/all.min.js?digest=365ca57ee442770a23c6"></script>
3939

40-
<script src="_static/documentation_options.js?v=f00aad14"></script>
40+
<script src="_static/documentation_options.js?v=30839ccb"></script>
4141
<script src="_static/doctools.js?v=888ff710"></script>
4242
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
4343
<script src="_static/scripts/sphinx-book-theme.js?digest=5a5c038af52cf7bc1a1ec88eea08e6366ee68824"></script>
@@ -127,7 +127,7 @@
127127

128128

129129

130-
<p class="title logo__title">TELF 0.0.19 documentation</p>
130+
<p class="title logo__title">TELF 0.0.20 documentation</p>
131131

132132
</a></div>
133133
<div class="sidebar-primary-item"><nav class="bd-links" id="bd-docs-nav" aria-label="Main">

0 commit comments

Comments
 (0)