7
7
import java .io .IOException ;
8
8
import java .nio .file .Files ;
9
9
import java .nio .file .Paths ;
10
+ import java .util .AbstractMap .SimpleEntry ;
11
+ import java .util .Arrays ;
12
+ import java .util .Collections ;
13
+ import java .util .LinkedList ;
14
+ import java .util .List ;
15
+ import java .util .Map ;
16
+ import java .util .Map .Entry ;
17
+ import java .util .PrimitiveIterator ;
18
+ import java .util .stream .IntStream ;
19
+ import java .util .stream .Stream ;
10
20
11
21
import org .commoncrawl .webgraph .CountingMergedIntIterator ;
12
22
import org .slf4j .Logger ;
13
23
import org .slf4j .LoggerFactory ;
14
24
15
25
import it .unimi .dsi .fastutil .io .BinIO ;
16
26
import it .unimi .dsi .fastutil .longs .LongArrayList ;
27
+ import it .unimi .dsi .lang .MutableString ;
17
28
import it .unimi .dsi .sux4j .mph .GOV4Function ;
18
29
import it .unimi .dsi .util .FrontCodedStringList ;
19
30
import it .unimi .dsi .util .ImmutableExternalPrefixMap ;
31
+ import it .unimi .dsi .util .Interval ;
20
32
import it .unimi .dsi .util .ShiftAddXorSignedStringMap ;
21
33
import it .unimi .dsi .webgraph .ImmutableGraph ;
22
34
import it .unimi .dsi .webgraph .LazyIntIterator ;
35
+ import it .unimi .dsi .webgraph .LazyIntIterators ;
23
36
24
37
/**
25
38
* Holds webgraph-related data structures and access methods for graph
@@ -42,6 +55,8 @@ public class Graph {
42
55
protected ShiftAddXorSignedStringMap vertexMapSmph ;
43
56
protected GOV4Function <String > vertexMapMph ;
44
57
58
+ private static int LAZY_INT_ITERATOR_EMPTY_VALUE = LazyIntIterators .EMPTY_ITERATOR .nextInt ();
59
+
45
60
public Graph (String name ) throws Exception {
46
61
this .name = name ;
47
62
try {
@@ -113,6 +128,157 @@ public int indegree(String vertexLabel) {
113
128
return graphT .outdegree ((int ) vertexLabelToId (vertexLabel ));
114
129
}
115
130
131
+ public int [] successors (long vertexId ) {
132
+ return graph .successorArray ((int ) vertexId );
133
+ }
134
+
135
+ public int [] successors (String vertexLabel ) {
136
+ return graph .successorArray ((int ) vertexLabelToId (vertexLabel ));
137
+ }
138
+
139
+ public Stream <String > successorStream (String vertexLabel ) {
140
+ return successorStream (graph , vertexLabelToId (vertexLabel ));
141
+ }
142
+
143
+ public IntStream successorIntStream (String vertexLabel ) {
144
+ return successorIntStream (graph , vertexLabelToId (vertexLabel ));
145
+ }
146
+
147
+ public Stream <String > successorStream (String vertexLabel , String prefix ) {
148
+ return successorStream (graph , vertexLabelToId (vertexLabel ), vertexMap .getInterval (prefix ));
149
+ }
150
+
151
+ public IntStream successorIntStream (String vertexLabel , String prefix ) {
152
+ return successorIntStream (graph , vertexLabelToId (vertexLabel ), vertexMap .getInterval (prefix ));
153
+ }
154
+
155
+ public Stream <Entry <String , Long >> successorTopLevelDomainCounts (String vertexLabel ) {
156
+ return successorTopLevelDomainCounts (graph , vertexLabelToId (vertexLabel ));
157
+ }
158
+
159
+ public Stream <String > successorStream (ImmutableGraph graph , long vertexId ) {
160
+ return successorIntStream (graph , vertexId ).mapToObj (i -> vertexIdToLabel (i ));
161
+ }
162
+
163
+ public IntStream successorIntStream (ImmutableGraph graph , long vertexId ) {
164
+ return Arrays .stream (graph .successorArray ((int ) vertexId ));
165
+ }
166
+
167
+ private Stream <String > successorStream (ImmutableGraph graph , long vertexId , Interval interval ) {
168
+ return successorIntStream (graph , vertexId , interval ).mapToObj (i -> vertexIdToLabel (i ));
169
+ }
170
+
171
+ public IntStream successorIntStream (ImmutableGraph graph , long vertexId , Interval interval ) {
172
+ return Arrays .stream (graph .successorArray ((int ) vertexId )).filter (x -> (interval .compareTo (x ) == 0 ));
173
+ }
174
+
175
+ public Stream <String > successorTopLevelDomainStream (ImmutableGraph graph , long vertexId ) {
176
+ return Arrays .stream (graph .successorArray ((int ) vertexId )).mapToObj (i -> getTopLevelDomain (vertexIdToLabel (i )));
177
+ }
178
+
179
+ public Stream <Entry <String , Long >> successorTopLevelDomainCounts (ImmutableGraph graph , long vertexId ) {
180
+ if (vertexMap != null ) {
181
+ /*
182
+ * speed up if we have a prefix map, utilizing the fact that vertex labels are
183
+ * lexicographically sorted by reversed domain name
184
+ */
185
+ List <Entry <String , Long >> res = new LinkedList <>();
186
+ LazyIntIterator iter = graph .successors ((int ) vertexId );
187
+ int curr = iter .nextInt ();
188
+ while (curr != LAZY_INT_ITERATOR_EMPTY_VALUE ) {
189
+ final MutableString currLabel = vertexMap .list ().get (curr );
190
+ final int pos = currLabel .indexOf ('.' );
191
+ final MutableString tldPrefix ;
192
+ final String tld ;
193
+ if (pos > -1 && (pos + 1 ) < currLabel .length ()) {
194
+ tldPrefix = currLabel .substring (0 , pos + 1 );
195
+ tld = tldPrefix .substring (0 , pos ).toString ();
196
+ } else {
197
+ tldPrefix = currLabel ;
198
+ tld = currLabel .toString ();
199
+ }
200
+ long count = 1 ;
201
+ final Interval interval = vertexMap .getInterval (tldPrefix );
202
+ int next ;
203
+ while ((next = iter .nextInt ()) != LAZY_INT_ITERATOR_EMPTY_VALUE ) {
204
+ if (next > interval .right ) {
205
+ break ;
206
+ }
207
+ count ++;
208
+ }
209
+ curr = next ;
210
+ res .add (new SimpleEntry <>(tld , count ));
211
+ }
212
+ return res .stream ().sorted (Collections .reverseOrder (Map .Entry .comparingByValue ()));
213
+ }
214
+ return GraphExplorer .frequencies (successorTopLevelDomainStream (graph , vertexId ));
215
+ }
216
+
217
+ public Stream <Entry <String , Long >> topLevelDomainCounts (IntStream vertexIds ) {
218
+ if (vertexMap != null ) {
219
+ List <Entry <String , Long >> res = new LinkedList <>();
220
+ PrimitiveIterator .OfInt iter = vertexIds .iterator ();
221
+ if (iter .hasNext ()) {
222
+ int curr = iter .nextInt ();;
223
+ do {
224
+ final MutableString currLabel = vertexMap .list ().get (curr );
225
+ final int pos = currLabel .indexOf ('.' );
226
+ final MutableString tldPrefix ;
227
+ final String tld ;
228
+ if (pos > -1 && (pos + 1 ) < currLabel .length ()) {
229
+ tldPrefix = currLabel .substring (0 , pos + 1 );
230
+ tld = tldPrefix .substring (0 , pos ).toString ();
231
+ } else {
232
+ tldPrefix = currLabel ;
233
+ tld = currLabel .toString ();
234
+ }
235
+ long count = 1 ;
236
+ final Interval interval = vertexMap .getInterval (tldPrefix );
237
+ int next = -1 ;
238
+ while (iter .hasNext ()) {
239
+ next = iter .nextInt ();
240
+ if (next > interval .right ) {
241
+ break ;
242
+ }
243
+ count ++;
244
+ }
245
+ curr = next ;
246
+ res .add (new SimpleEntry <>(tld , count ));
247
+ } while (curr > -1 );
248
+ }
249
+ return res .stream ().sorted (Collections .reverseOrder (Map .Entry .comparingByValue ()));
250
+ }
251
+ return GraphExplorer .frequencies (vertexIds .mapToObj (i -> Graph .getTopLevelDomain (vertexIdToLabel (i ))));
252
+ }
253
+
254
+ public int [] predecessors (long vertexId ) {
255
+ return graphT .successorArray ((int ) vertexId );
256
+ }
257
+
258
+ public int [] predecessors (String vertexLabel ) {
259
+ return graphT .successorArray ((int ) vertexLabelToId (vertexLabel ));
260
+ }
261
+
262
+ public Stream <String > predecessorStream (String vertexLabel ) {
263
+ return successorStream (graphT , vertexLabelToId (vertexLabel ));
264
+ }
265
+
266
+ public IntStream predecessorIntStream (String vertexLabel ) {
267
+ return successorIntStream (graphT , vertexLabelToId (vertexLabel ));
268
+ }
269
+
270
+ public Stream <String > predecessorStream (String vertexLabel , String prefix ) {
271
+ return successorStream (graphT , vertexLabelToId (vertexLabel ), vertexMap .getInterval (prefix ));
272
+ }
273
+
274
+ public IntStream predecessorIntStream (String vertexLabel , String prefix ) {
275
+ return successorIntStream (graphT , vertexLabelToId (vertexLabel ), vertexMap .getInterval (prefix ));
276
+ }
277
+
278
+ public Stream <Entry <String , Long >> predecessorTopLevelDomainCounts (String vertexLabel ) {
279
+ return successorTopLevelDomainCounts (graphT , vertexLabelToId (vertexLabel ));
280
+ }
281
+
116
282
public long [] sharedPredecessors (long [] vertices ) {
117
283
return sharedPredecessors (vertices , vertices .length , vertices .length );
118
284
}
@@ -169,4 +335,12 @@ public long[] sharedSuccessors(ImmutableGraph graph, long[] vertices, int minSha
169
335
res .trim ();
170
336
return res .elements ();
171
337
}
338
+
339
+ public static String getTopLevelDomain (String reversedDomainName ) {
340
+ int dot = reversedDomainName .indexOf ('.' );
341
+ if (dot < reversedDomainName .length ()) {
342
+ return reversedDomainName .substring (0 , dot );
343
+ }
344
+ return reversedDomainName ;
345
+ }
172
346
}
0 commit comments