Skip to content

Commit 24bc96a

Browse files
feat: tool and scripts to interactively explore webgraph
Add methods to access and view successors/predecessors and count top-level domains in lists of vertices.
1 parent e684eb5 commit 24bc96a

File tree

3 files changed

+231
-12
lines changed

3 files changed

+231
-12
lines changed

src/main/java/org/commoncrawl/webgraph/CountingMergedIntIterator.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ public int compareTo(QueuedIterator o) {
4343
}
4444
}
4545

46-
public static int EMPTY_INPUT_ITERATOR_VALUE = LazyIntIterators.EMPTY_ITERATOR.nextInt();
46+
public static int LAZY_INT_ITERATOR_EMPTY_VALUE = LazyIntIterators.EMPTY_ITERATOR.nextInt();
4747

4848
private final PriorityQueue<QueuedIterator> iters = new PriorityQueue<>();
4949
private int currentCount = 0;
@@ -54,7 +54,7 @@ public int compareTo(QueuedIterator o) {
5454
public CountingMergedIntIterator(LazyIntIterator... iterators) {
5555
for (final LazyIntIterator iter : iterators) {
5656
final QueuedIterator qiter = new QueuedIterator(iter);
57-
if (qiter.value != EMPTY_INPUT_ITERATOR_VALUE) {
57+
if (qiter.value != LAZY_INT_ITERATOR_EMPTY_VALUE) {
5858
iters.add(qiter);
5959
}
6060
}
@@ -93,7 +93,7 @@ public int nextInt() {
9393
while ((val = qiter.iter.nextInt()) == value) {
9494
count++;
9595
}
96-
if (val != EMPTY_INPUT_ITERATOR_VALUE) {
96+
if (val != LAZY_INT_ITERATOR_EMPTY_VALUE) {
9797
qiter.value = val;
9898
iters.add(qiter);
9999
}

src/main/java/org/commoncrawl/webgraph/explore/Graph.java

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,19 +7,32 @@
77
import java.io.IOException;
88
import java.nio.file.Files;
99
import java.nio.file.Paths;
10+
import java.util.AbstractMap.SimpleEntry;
11+
import java.util.Arrays;
12+
import java.util.Collections;
13+
import java.util.LinkedList;
14+
import java.util.List;
15+
import java.util.Map;
16+
import java.util.Map.Entry;
17+
import java.util.PrimitiveIterator;
18+
import java.util.stream.IntStream;
19+
import java.util.stream.Stream;
1020

1121
import org.commoncrawl.webgraph.CountingMergedIntIterator;
1222
import org.slf4j.Logger;
1323
import org.slf4j.LoggerFactory;
1424

1525
import it.unimi.dsi.fastutil.io.BinIO;
1626
import it.unimi.dsi.fastutil.longs.LongArrayList;
27+
import it.unimi.dsi.lang.MutableString;
1728
import it.unimi.dsi.sux4j.mph.GOV4Function;
1829
import it.unimi.dsi.util.FrontCodedStringList;
1930
import it.unimi.dsi.util.ImmutableExternalPrefixMap;
31+
import it.unimi.dsi.util.Interval;
2032
import it.unimi.dsi.util.ShiftAddXorSignedStringMap;
2133
import it.unimi.dsi.webgraph.ImmutableGraph;
2234
import it.unimi.dsi.webgraph.LazyIntIterator;
35+
import it.unimi.dsi.webgraph.LazyIntIterators;
2336

2437
/**
2538
* Holds webgraph-related data structures and access methods for graph
@@ -42,6 +55,8 @@ public class Graph {
4255
protected ShiftAddXorSignedStringMap vertexMapSmph;
4356
protected GOV4Function<String> vertexMapMph;
4457

58+
/**
 * Value returned by {@link LazyIntIterators#EMPTY_ITERATOR}; compared against
 * the result of {@code nextInt()} to detect the end of a lazy iterator.
 * Declared {@code final} because it is a constant and must never be reassigned.
 */
private static final int LAZY_INT_ITERATOR_EMPTY_VALUE = LazyIntIterators.EMPTY_ITERATOR.nextInt();
59+
4560
public Graph(String name) throws Exception {
4661
this.name = name;
4762
try {
@@ -113,6 +128,157 @@ public int indegree(String vertexLabel) {
113128
return graphT.outdegree((int) vertexLabelToId(vertexLabel));
114129
}
115130

131+
public int[] successors(long vertexId) {
132+
return graph.successorArray((int) vertexId);
133+
}
134+
135+
public int[] successors(String vertexLabel) {
136+
return graph.successorArray((int) vertexLabelToId(vertexLabel));
137+
}
138+
139+
public Stream<String> successorStream(String vertexLabel) {
140+
return successorStream(graph, vertexLabelToId(vertexLabel));
141+
}
142+
143+
public IntStream successorIntStream(String vertexLabel) {
144+
return successorIntStream(graph, vertexLabelToId(vertexLabel));
145+
}
146+
147+
public Stream<String> successorStream(String vertexLabel, String prefix) {
148+
return successorStream(graph, vertexLabelToId(vertexLabel), vertexMap.getInterval(prefix));
149+
}
150+
151+
public IntStream successorIntStream(String vertexLabel, String prefix) {
152+
return successorIntStream(graph, vertexLabelToId(vertexLabel), vertexMap.getInterval(prefix));
153+
}
154+
155+
public Stream<Entry<String, Long>> successorTopLevelDomainCounts(String vertexLabel) {
156+
return successorTopLevelDomainCounts(graph, vertexLabelToId(vertexLabel));
157+
}
158+
159+
public Stream<String> successorStream(ImmutableGraph graph, long vertexId) {
160+
return successorIntStream(graph, vertexId).mapToObj(i -> vertexIdToLabel(i));
161+
}
162+
163+
public IntStream successorIntStream(ImmutableGraph graph, long vertexId) {
164+
return Arrays.stream(graph.successorArray((int) vertexId));
165+
}
166+
167+
private Stream<String> successorStream(ImmutableGraph graph, long vertexId, Interval interval) {
168+
return successorIntStream(graph, vertexId, interval).mapToObj(i -> vertexIdToLabel(i));
169+
}
170+
171+
public IntStream successorIntStream(ImmutableGraph graph, long vertexId, Interval interval) {
172+
return Arrays.stream(graph.successorArray((int) vertexId)).filter(x -> (interval.compareTo(x) == 0));
173+
}
174+
175+
public Stream<String> successorTopLevelDomainStream(ImmutableGraph graph, long vertexId) {
176+
return Arrays.stream(graph.successorArray((int) vertexId)).mapToObj(i -> getTopLevelDomain(vertexIdToLabel(i)));
177+
}
178+
179+
public Stream<Entry<String, Long>> successorTopLevelDomainCounts(ImmutableGraph graph, long vertexId) {
180+
if (vertexMap != null) {
181+
/*
182+
* speed up if we have a prefix map, utilizing the fact that vertex labels are
183+
* lexicographically sorted by reversed domain name
184+
*/
185+
List<Entry<String, Long>> res = new LinkedList<>();
186+
LazyIntIterator iter = graph.successors((int) vertexId);
187+
int curr = iter.nextInt();
188+
while (curr != LAZY_INT_ITERATOR_EMPTY_VALUE) {
189+
final MutableString currLabel = vertexMap.list().get(curr);
190+
final int pos = currLabel.indexOf('.');
191+
final MutableString tldPrefix;
192+
final String tld;
193+
if (pos > -1 && (pos + 1) < currLabel.length()) {
194+
tldPrefix = currLabel.substring(0, pos + 1);
195+
tld = tldPrefix.substring(0, pos).toString();
196+
} else {
197+
tldPrefix = currLabel;
198+
tld = currLabel.toString();
199+
}
200+
long count = 1;
201+
final Interval interval = vertexMap.getInterval(tldPrefix);
202+
int next;
203+
while ((next = iter.nextInt()) != LAZY_INT_ITERATOR_EMPTY_VALUE) {
204+
if (next > interval.right) {
205+
break;
206+
}
207+
count++;
208+
}
209+
curr = next;
210+
res.add(new SimpleEntry<>(tld, count));
211+
}
212+
return res.stream().sorted(Collections.reverseOrder(Map.Entry.comparingByValue()));
213+
}
214+
return GraphExplorer.frequencies(successorTopLevelDomainStream(graph, vertexId));
215+
}
216+
217+
public Stream<Entry<String, Long>> topLevelDomainCounts(IntStream vertexIds) {
218+
if (vertexMap != null) {
219+
List<Entry<String, Long>> res = new LinkedList<>();
220+
PrimitiveIterator.OfInt iter = vertexIds.iterator();
221+
if (iter.hasNext()) {
222+
int curr = iter.nextInt();;
223+
do {
224+
final MutableString currLabel = vertexMap.list().get(curr);
225+
final int pos = currLabel.indexOf('.');
226+
final MutableString tldPrefix;
227+
final String tld;
228+
if (pos > -1 && (pos + 1) < currLabel.length()) {
229+
tldPrefix = currLabel.substring(0, pos + 1);
230+
tld = tldPrefix.substring(0, pos).toString();
231+
} else {
232+
tldPrefix = currLabel;
233+
tld = currLabel.toString();
234+
}
235+
long count = 1;
236+
final Interval interval = vertexMap.getInterval(tldPrefix);
237+
int next = -1;
238+
while (iter.hasNext()) {
239+
next = iter.nextInt();
240+
if (next > interval.right) {
241+
break;
242+
}
243+
count++;
244+
}
245+
curr = next;
246+
res.add(new SimpleEntry<>(tld, count));
247+
} while (curr > -1);
248+
}
249+
return res.stream().sorted(Collections.reverseOrder(Map.Entry.comparingByValue()));
250+
}
251+
return GraphExplorer.frequencies(vertexIds.mapToObj(i -> Graph.getTopLevelDomain(vertexIdToLabel(i))));
252+
}
253+
254+
public int[] predecessors(long vertexId) {
255+
return graphT.successorArray((int) vertexId);
256+
}
257+
258+
public int[] predecessors(String vertexLabel) {
259+
return graphT.successorArray((int) vertexLabelToId(vertexLabel));
260+
}
261+
262+
public Stream<String> predecessorStream(String vertexLabel) {
263+
return successorStream(graphT, vertexLabelToId(vertexLabel));
264+
}
265+
266+
public IntStream predecessorIntStream(String vertexLabel) {
267+
return successorIntStream(graphT, vertexLabelToId(vertexLabel));
268+
}
269+
270+
public Stream<String> predecessorStream(String vertexLabel, String prefix) {
271+
return successorStream(graphT, vertexLabelToId(vertexLabel), vertexMap.getInterval(prefix));
272+
}
273+
274+
public IntStream predecessorIntStream(String vertexLabel, String prefix) {
275+
return successorIntStream(graphT, vertexLabelToId(vertexLabel), vertexMap.getInterval(prefix));
276+
}
277+
278+
public Stream<Entry<String, Long>> predecessorTopLevelDomainCounts(String vertexLabel) {
279+
return successorTopLevelDomainCounts(graphT, vertexLabelToId(vertexLabel));
280+
}
281+
116282
public long[] sharedPredecessors(long[] vertices) {
117283
return sharedPredecessors(vertices, vertices.length, vertices.length);
118284
}
@@ -169,4 +335,12 @@ public long[] sharedSuccessors(ImmutableGraph graph, long[] vertices, int minSha
169335
res.trim();
170336
return res.elements();
171337
}
338+
339+
public static String getTopLevelDomain(String reversedDomainName) {
340+
int dot = reversedDomainName.indexOf('.');
341+
if (dot < reversedDomainName.length()) {
342+
return reversedDomainName.substring(0, dot);
343+
}
344+
return reversedDomainName;
345+
}
172346
}

src/main/java/org/commoncrawl/webgraph/explore/GraphExplorer.java

Lines changed: 54 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,11 @@
1010
import java.nio.file.Files;
1111
import java.nio.file.Paths;
1212
import java.util.Arrays;
13+
import java.util.Comparator;
14+
import java.util.Map.Entry;
15+
import java.util.function.Function;
16+
import java.util.stream.Collectors;
17+
import java.util.stream.IntStream;
1318
import java.util.stream.Stream;
1419

1520
import org.commoncrawl.webgraph.CountingMergedIntIterator;
@@ -46,11 +51,11 @@ public String toString() {
4651
}
4752

4853
public int outdegree() {
49-
return g.graph.outdegree((int) id);
54+
return g.outdegree((int) id);
5055
}
5156

5257
public int indegree() {
53-
return g.graphT.outdegree((int) id);
58+
return g.indegree((int) id);
5459
}
5560

5661
public int[] successors() {
@@ -89,7 +94,6 @@ public void setVertex(long vertexId) {
8994
v = getVertex(vertexId);
9095
}
9196

92-
9397
/* Reimplementation of commands provided by pywebgraph (cn, pwn, ls, sl) */
9498

9599
/**
@@ -117,7 +121,7 @@ public void cn(long vertexId) {
117121
*/
118122
public void pwn() {
119123
if (v == null) {
120-
throw new NullPointerException("Current orking node not set, use cn(...) to define the working node.");
124+
throw new NullPointerException("Current working node not set, use cn(...) to define the working node.");
121125
}
122126
print(v.toString());
123127
}
@@ -127,7 +131,7 @@ public void pwn() {
127131
*/
128132
public void ls() {
129133
if (v == null) {
130-
throw new NullPointerException("Current orking node not set, use cn(...) to define the working node.");
134+
throw new NullPointerException("Current working node not set, use cn(...) to define the working node.");
131135
}
132136
ls(v.id);
133137
}
@@ -155,7 +159,7 @@ public void ls(String vertexLabel) {
155159
*/
156160
public void sl() {
157161
if (v == null) {
158-
throw new NullPointerException("Current orking node not set, use cn(...) to define the working node.");
162+
throw new NullPointerException("Current working node not set, use cn(...) to define the working node.");
159163
}
160164
sl(v.id);
161165
}
@@ -178,7 +182,6 @@ public void sl(String vertexLabel) {
178182
sl(g.vertexLabelToId(vertexLabel));
179183
}
180184

181-
182185
/* Utilities */
183186

184187
public long[] loadVerticesFromFile(String fileName) {
@@ -195,7 +198,34 @@ public void saveVerticesToFile(long[] vertexIDs, String fileName) {
195198
StandardCharsets.UTF_8)) {
196199
Arrays.stream(vertexIDs).forEach(id -> out.println(g.vertexIdToLabel(id)));
197200
} catch (IOException e) {
198-
LOG.error("Failed to load vertices from file {}", fileName, e);
201+
LOG.error("Failed to write vertices to file {}", fileName, e);
202+
}
203+
}
204+
205+
public void saveVerticesToFile(int[] vertexIDs, String fileName) {
206+
saveVerticesToFile(Arrays.stream(vertexIDs), fileName);
207+
}
208+
209+
public void saveVerticesToFile(IntStream vertexIDs, String fileName) {
210+
try (PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(fileName)), false,
211+
StandardCharsets.UTF_8)) {
212+
vertexIDs.forEach(id -> out.println(g.vertexIdToLabel(id)));
213+
} catch (IOException e) {
214+
LOG.error("Failed to write vertices to file {}", fileName, e);
215+
}
216+
}
217+
218+
public void saveCountsToFile(Stream<Entry<String, Long>> counts, String fileName) {
219+
try (PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(fileName)), false,
220+
StandardCharsets.UTF_8)) {
221+
counts.forEach(c -> {
222+
out.print(c.getValue());
223+
out.print('\t');
224+
out.print(c.getKey());
225+
out.print('\n');
226+
});
227+
} catch (IOException e) {
228+
LOG.error("Failed to write counts to file {}", fileName, e);
199229
}
200230
}
201231

@@ -206,7 +236,7 @@ private void print(String s) {
206236
public void printVertices(LazyIntIterator it) {
207237
int next = it.nextInt();
208238
int i = 0;
209-
while (next != CountingMergedIntIterator.EMPTY_INPUT_ITERATOR_VALUE) {
239+
while (next != CountingMergedIntIterator.LAZY_INT_ITERATOR_EMPTY_VALUE) {
210240
print(String.format("%d: %s", i, (new Vertex(next)).toString()));
211241
next = it.nextInt();
212242
i++;
@@ -228,4 +258,19 @@ public void printVertices(int[] vertexIDs) {
228258
i++;
229259
}
230260
}
261+
262+
/**
263+
* Count strings in a stream. Sort the resulting string-count pairs by
264+
* decreasing count (frequency) and secondarily by string in lexicographic
265+
* order.
266+
*
267+
* @param strings stream of strings
268+
* @return stream of pairs {@code <string, count>}
269+
*/
270+
public static Stream<Entry<String, Long>> frequencies(Stream<String> strings) {
271+
final Comparator<Entry<String, Long>> comp = Comparator.comparingLong((Entry<String, Long> e) -> e.getValue())
272+
.reversed().thenComparing(Comparator.comparing((Entry<String, Long> e) -> e.getKey()));
273+
return strings.collect(Collectors.groupingBy(Function.identity(), Collectors.counting())).entrySet().stream()
274+
.sorted(comp);
275+
}
231276
}

0 commit comments

Comments
 (0)