Skip to content

Commit 12bedb2

Browse files
feat: tool and scripts to interactively explore webgraph
Add more utility methods to save data in files, to map host names to registered domains and to translate from/to reverse domain name notation.
1 parent 81424f2 commit 12bedb2

File tree

2 files changed

+76
-0
lines changed

2 files changed

+76
-0
lines changed

src/main/java/org/commoncrawl/webgraph/explore/Graph.java

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,11 @@
1919
import java.util.stream.Stream;
2020

2121
import org.commoncrawl.webgraph.CountingMergedIntIterator;
22+
import org.commoncrawl.webgraph.HostToDomainGraph;
2223
import org.slf4j.Logger;
2324
import org.slf4j.LoggerFactory;
2425

26+
import crawlercommons.domains.EffectiveTldFinder;
2527
import it.unimi.dsi.fastutil.io.BinIO;
2628
import it.unimi.dsi.fastutil.longs.LongArrayList;
2729
import it.unimi.dsi.lang.MutableString;
@@ -343,4 +345,59 @@ public static String getTopLevelDomain(String reversedDomainName) {
343345
}
344346
return reversedDomainName;
345347
}
348+
349+
/**
350+
* Get the registered domain for a host name based on the ICANN section of the
351+
* <a href="https://www.publicsuffix.org/">public suffix list</a>.
352+
*
353+
* @see EffectiveTldFinder
354+
*
355+
* @param hostName host name, e.g. <code>www.example.org.uk</code>
356+
* @param strict if true return null instead of <code>hostName</code> if no
357+
* valid public suffix is detected
358+
* @return the domain name below the public suffix, e.g.
359+
* <code>example.org.uk</code>
360+
*/
361+
public static String getRegisteredDomain(String hostName, boolean strict) {
362+
return EffectiveTldFinder.getAssignedDomain(hostName, strict, true);
363+
}
364+
365+
/**
366+
* Get the registered domain for a host name, both in
367+
* <a href= "https://en.wikipedia.org/wiki/Reverse_domain_name_notation">reverse
368+
* domain name notation</a>.
369+
*
370+
* @see #getRegisteredDomain(String, boolean)
371+
*
372+
* @param reversedHostName host name in reverse domain name notation, e.g.
373+
* <code>uk.ork.example.www</code>
374+
* @param strict if true return null instead of
375+
* <code>reversedHostName</code> if no valid public
376+
* suffix is detected
377+
* @return the domain name below the public suffix, e.g.
378+
* <code>uk.org.example</code> (in reverse domain name notation)
379+
*/
380+
public static String getRegisteredDomainReversed(String reversedHostName, boolean strict) {
381+
String hostName = reverseDomainName(reversedHostName);
382+
String domainName = getRegisteredDomain(hostName, strict);
383+
if (strict && domainName == null) {
384+
return null;
385+
} else if (hostName.equals(domainName)) {
386+
return reversedHostName;
387+
}
388+
return reverseDomainName(domainName);
389+
}
390+
391+
/**
392+
* Reverse or "unreverse" a host/domain name: <code>com.example.www</code> is
393+
* reversed to <code>www.example.com</code> and vice versa.
394+
*
395+
* @param domain name
396+
* @return domain name with <a href=
397+
* "https://en.wikipedia.org/wiki/Reverse_domain_name_notation">reverse
398+
* domain name notation</a> (un)applied
399+
*/
400+
private static String reverseDomainName(String reversedDomainName) {
401+
return HostToDomainGraph.reverseHost(reversedDomainName);
402+
}
346403
}

src/main/java/org/commoncrawl/webgraph/explore/GraphExplorer.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import java.util.function.Function;
1616
import java.util.stream.Collectors;
1717
import java.util.stream.IntStream;
18+
import java.util.stream.LongStream;
1819
import java.util.stream.Stream;
1920

2021
import org.commoncrawl.webgraph.CountingMergedIntIterator;
@@ -215,6 +216,24 @@ public void saveVerticesToFile(IntStream vertexIDs, String fileName) {
215216
}
216217
}
217218

219+
public void saveVerticesToFile(LongStream vertexIDs, String fileName) {
220+
try (PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(fileName)), false,
221+
StandardCharsets.UTF_8)) {
222+
vertexIDs.forEach(id -> out.println(g.vertexIdToLabel(id)));
223+
} catch (IOException e) {
224+
LOG.error("Failed to write vertices to file {}", fileName, e);
225+
}
226+
}
227+
228+
public void saveToFile(Stream<String> strings, String fileName) {
229+
try (PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(fileName)), false,
230+
StandardCharsets.UTF_8)) {
231+
strings.forEach(out::println);
232+
} catch (IOException e) {
233+
LOG.error("Failed to write strings to file {}", fileName, e);
234+
}
235+
}
236+
218237
public void saveCountsToFile(Stream<Entry<String, Long>> counts, String fileName) {
219238
try (PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(fileName)), false,
220239
StandardCharsets.UTF_8)) {

0 commit comments

Comments
 (0)