77import java .io .IOException ;
88import java .nio .file .Files ;
99import java .nio .file .Paths ;
10+ import java .util .AbstractMap .SimpleEntry ;
11+ import java .util .Arrays ;
12+ import java .util .Collections ;
13+ import java .util .LinkedList ;
14+ import java .util .List ;
15+ import java .util .Map ;
16+ import java .util .Map .Entry ;
17+ import java .util .PrimitiveIterator ;
18+ import java .util .stream .IntStream ;
19+ import java .util .stream .Stream ;
1020
1121import org .commoncrawl .webgraph .CountingMergedIntIterator ;
1222import org .slf4j .Logger ;
1323import org .slf4j .LoggerFactory ;
1424
1525import it .unimi .dsi .fastutil .io .BinIO ;
1626import it .unimi .dsi .fastutil .longs .LongArrayList ;
27+ import it .unimi .dsi .lang .MutableString ;
1728import it .unimi .dsi .sux4j .mph .GOV4Function ;
1829import it .unimi .dsi .util .FrontCodedStringList ;
1930import it .unimi .dsi .util .ImmutableExternalPrefixMap ;
31+ import it .unimi .dsi .util .Interval ;
2032import it .unimi .dsi .util .ShiftAddXorSignedStringMap ;
2133import it .unimi .dsi .webgraph .ImmutableGraph ;
2234import it .unimi .dsi .webgraph .LazyIntIterator ;
35+ import it .unimi .dsi .webgraph .LazyIntIterators ;
2336
2437/**
2538 * Holds webgraph-related data structures and access methods for graph
@@ -42,6 +55,8 @@ public class Graph {
4255 protected ShiftAddXorSignedStringMap vertexMapSmph ;
4356 protected GOV4Function <String > vertexMapMph ;
4457
58+ private static int LAZY_INT_ITERATOR_EMPTY_VALUE = LazyIntIterators .EMPTY_ITERATOR .nextInt ();
59+
4560 public Graph (String name ) throws Exception {
4661 this .name = name ;
4762 try {
@@ -113,6 +128,157 @@ public int indegree(String vertexLabel) {
113128 return graphT .outdegree ((int ) vertexLabelToId (vertexLabel ));
114129 }
115130
131+ public int [] successors (long vertexId ) {
132+ return graph .successorArray ((int ) vertexId );
133+ }
134+
135+ public int [] successors (String vertexLabel ) {
136+ return graph .successorArray ((int ) vertexLabelToId (vertexLabel ));
137+ }
138+
139+ public Stream <String > successorStream (String vertexLabel ) {
140+ return successorStream (graph , vertexLabelToId (vertexLabel ));
141+ }
142+
143+ public IntStream successorIntStream (String vertexLabel ) {
144+ return successorIntStream (graph , vertexLabelToId (vertexLabel ));
145+ }
146+
147+ public Stream <String > successorStream (String vertexLabel , String prefix ) {
148+ return successorStream (graph , vertexLabelToId (vertexLabel ), vertexMap .getInterval (prefix ));
149+ }
150+
151+ public IntStream successorIntStream (String vertexLabel , String prefix ) {
152+ return successorIntStream (graph , vertexLabelToId (vertexLabel ), vertexMap .getInterval (prefix ));
153+ }
154+
155+ public Stream <Entry <String , Long >> successorTopLevelDomainCounts (String vertexLabel ) {
156+ return successorTopLevelDomainCounts (graph , vertexLabelToId (vertexLabel ));
157+ }
158+
159+ public Stream <String > successorStream (ImmutableGraph graph , long vertexId ) {
160+ return successorIntStream (graph , vertexId ).mapToObj (i -> vertexIdToLabel (i ));
161+ }
162+
163+ public IntStream successorIntStream (ImmutableGraph graph , long vertexId ) {
164+ return Arrays .stream (graph .successorArray ((int ) vertexId ));
165+ }
166+
167+ private Stream <String > successorStream (ImmutableGraph graph , long vertexId , Interval interval ) {
168+ return successorIntStream (graph , vertexId , interval ).mapToObj (i -> vertexIdToLabel (i ));
169+ }
170+
171+ public IntStream successorIntStream (ImmutableGraph graph , long vertexId , Interval interval ) {
172+ return Arrays .stream (graph .successorArray ((int ) vertexId )).filter (x -> (interval .compareTo (x ) == 0 ));
173+ }
174+
175+ public Stream <String > successorTopLevelDomainStream (ImmutableGraph graph , long vertexId ) {
176+ return Arrays .stream (graph .successorArray ((int ) vertexId )).mapToObj (i -> getTopLevelDomain (vertexIdToLabel (i )));
177+ }
178+
179+ public Stream <Entry <String , Long >> successorTopLevelDomainCounts (ImmutableGraph graph , long vertexId ) {
180+ if (vertexMap != null ) {
181+ /*
182+ * speed up if we have a prefix map, utilizing the fact that vertex labels are
183+ * lexicographically sorted by reversed domain name
184+ */
185+ List <Entry <String , Long >> res = new LinkedList <>();
186+ LazyIntIterator iter = graph .successors ((int ) vertexId );
187+ int curr = iter .nextInt ();
188+ while (curr != LAZY_INT_ITERATOR_EMPTY_VALUE ) {
189+ final MutableString currLabel = vertexMap .list ().get (curr );
190+ final int pos = currLabel .indexOf ('.' );
191+ final MutableString tldPrefix ;
192+ final String tld ;
193+ if (pos > -1 && (pos + 1 ) < currLabel .length ()) {
194+ tldPrefix = currLabel .substring (0 , pos + 1 );
195+ tld = tldPrefix .substring (0 , pos ).toString ();
196+ } else {
197+ tldPrefix = currLabel ;
198+ tld = currLabel .toString ();
199+ }
200+ long count = 1 ;
201+ final Interval interval = vertexMap .getInterval (tldPrefix );
202+ int next ;
203+ while ((next = iter .nextInt ()) != LAZY_INT_ITERATOR_EMPTY_VALUE ) {
204+ if (next > interval .right ) {
205+ break ;
206+ }
207+ count ++;
208+ }
209+ curr = next ;
210+ res .add (new SimpleEntry <>(tld , count ));
211+ }
212+ return res .stream ().sorted (Collections .reverseOrder (Map .Entry .comparingByValue ()));
213+ }
214+ return GraphExplorer .frequencies (successorTopLevelDomainStream (graph , vertexId ));
215+ }
216+
217+ public Stream <Entry <String , Long >> topLevelDomainCounts (IntStream vertexIds ) {
218+ if (vertexMap != null ) {
219+ List <Entry <String , Long >> res = new LinkedList <>();
220+ PrimitiveIterator .OfInt iter = vertexIds .iterator ();
221+ if (iter .hasNext ()) {
222+ int curr = iter .nextInt ();;
223+ do {
224+ final MutableString currLabel = vertexMap .list ().get (curr );
225+ final int pos = currLabel .indexOf ('.' );
226+ final MutableString tldPrefix ;
227+ final String tld ;
228+ if (pos > -1 && (pos + 1 ) < currLabel .length ()) {
229+ tldPrefix = currLabel .substring (0 , pos + 1 );
230+ tld = tldPrefix .substring (0 , pos ).toString ();
231+ } else {
232+ tldPrefix = currLabel ;
233+ tld = currLabel .toString ();
234+ }
235+ long count = 1 ;
236+ final Interval interval = vertexMap .getInterval (tldPrefix );
237+ int next = -1 ;
238+ while (iter .hasNext ()) {
239+ next = iter .nextInt ();
240+ if (next > interval .right ) {
241+ break ;
242+ }
243+ count ++;
244+ }
245+ curr = next ;
246+ res .add (new SimpleEntry <>(tld , count ));
247+ } while (curr > -1 );
248+ }
249+ return res .stream ().sorted (Collections .reverseOrder (Map .Entry .comparingByValue ()));
250+ }
251+ return GraphExplorer .frequencies (vertexIds .mapToObj (i -> Graph .getTopLevelDomain (vertexIdToLabel (i ))));
252+ }
253+
254+ public int [] predecessors (long vertexId ) {
255+ return graphT .successorArray ((int ) vertexId );
256+ }
257+
258+ public int [] predecessors (String vertexLabel ) {
259+ return graphT .successorArray ((int ) vertexLabelToId (vertexLabel ));
260+ }
261+
262+ public Stream <String > predecessorStream (String vertexLabel ) {
263+ return successorStream (graphT , vertexLabelToId (vertexLabel ));
264+ }
265+
266+ public IntStream predecessorIntStream (String vertexLabel ) {
267+ return successorIntStream (graphT , vertexLabelToId (vertexLabel ));
268+ }
269+
270+ public Stream <String > predecessorStream (String vertexLabel , String prefix ) {
271+ return successorStream (graphT , vertexLabelToId (vertexLabel ), vertexMap .getInterval (prefix ));
272+ }
273+
274+ public IntStream predecessorIntStream (String vertexLabel , String prefix ) {
275+ return successorIntStream (graphT , vertexLabelToId (vertexLabel ), vertexMap .getInterval (prefix ));
276+ }
277+
278+ public Stream <Entry <String , Long >> predecessorTopLevelDomainCounts (String vertexLabel ) {
279+ return successorTopLevelDomainCounts (graphT , vertexLabelToId (vertexLabel ));
280+ }
281+
116282 public long [] sharedPredecessors (long [] vertices ) {
117283 return sharedPredecessors (vertices , vertices .length , vertices .length );
118284 }
@@ -169,4 +335,12 @@ public long[] sharedSuccessors(ImmutableGraph graph, long[] vertices, int minSha
169335 res .trim ();
170336 return res .elements ();
171337 }
338+
339+ public static String getTopLevelDomain (String reversedDomainName ) {
340+ int dot = reversedDomainName .indexOf ('.' );
341+ if (dot < reversedDomainName .length ()) {
342+ return reversedDomainName .substring (0 , dot );
343+ }
344+ return reversedDomainName ;
345+ }
172346}
0 commit comments