|
19 | 19 | import java.util.stream.Stream;
|
20 | 20 |
|
21 | 21 | import org.commoncrawl.webgraph.CountingMergedIntIterator;
|
| 22 | +import org.commoncrawl.webgraph.HostToDomainGraph; |
22 | 23 | import org.slf4j.Logger;
|
23 | 24 | import org.slf4j.LoggerFactory;
|
24 | 25 |
|
| 26 | +import crawlercommons.domains.EffectiveTldFinder; |
25 | 27 | import it.unimi.dsi.fastutil.io.BinIO;
|
26 | 28 | import it.unimi.dsi.fastutil.longs.LongArrayList;
|
27 | 29 | import it.unimi.dsi.lang.MutableString;
|
@@ -343,4 +345,59 @@ public static String getTopLevelDomain(String reversedDomainName) {
|
343 | 345 | }
|
344 | 346 | return reversedDomainName;
|
345 | 347 | }
|
| 348 | + |
| 349 | + /** |
| 350 | + * Get the registered domain for a host name based on the ICANN section of the |
| 351 | + * <a href="https://www.publicsuffix.org/">public suffix list</a>. |
| 352 | + * |
| 353 | + * @see EffectiveTldFinder |
| 354 | + * |
| 355 | + * @param hostName host name, e.g. <code>www.example.org.uk</code> |
| 356 | + * @param strict if true return null instead of <code>hostName</code> if no |
| 357 | + * valid public suffix is detected |
| 358 | + * @return the domain name below the public suffix, e.g. |
| 359 | + * <code>example.org.uk</code> |
| 360 | + */ |
| 361 | + public static String getRegisteredDomain(String hostName, boolean strict) { |
| 362 | + return EffectiveTldFinder.getAssignedDomain(hostName, strict, true); |
| 363 | + } |
| 364 | + |
| 365 | + /** |
| 366 | + * Get the registered domain for a host name, both in |
| 367 | + * <a href= "https://en.wikipedia.org/wiki/Reverse_domain_name_notation">reverse |
| 368 | + * domain name notation</a>. |
| 369 | + * |
| 370 | + * @see #getRegisteredDomain(String, boolean) |
| 371 | + * |
| 372 | + * @param reversedHostName host name in reverse domain name notation, e.g. |
| 373 | + * <code>uk.ork.example.www</code> |
| 374 | + * @param strict if true return null instead of |
| 375 | + * <code>reversedHostName</code> if no valid public |
| 376 | + * suffix is detected |
| 377 | + * @return the domain name below the public suffix, e.g. |
| 378 | + * <code>uk.org.example</code> (in reverse domain name notation) |
| 379 | + */ |
| 380 | + public static String getRegisteredDomainReversed(String reversedHostName, boolean strict) { |
| 381 | + String hostName = reverseDomainName(reversedHostName); |
| 382 | + String domainName = getRegisteredDomain(hostName, strict); |
| 383 | + if (strict && domainName == null) { |
| 384 | + return null; |
| 385 | + } else if (hostName.equals(domainName)) { |
| 386 | + return reversedHostName; |
| 387 | + } |
| 388 | + return reverseDomainName(domainName); |
| 389 | + } |
| 390 | + |
| 391 | + /** |
| 392 | + * Reverse or "unreverse" a host/domain name: <code>com.example.www</code> is |
| 393 | + * reversed to <code>www.example.com</code> and vice versa. |
| 394 | + * |
| 395 | + * @param domain name |
| 396 | + * @return domain name with <a href= |
| 397 | + * "https://en.wikipedia.org/wiki/Reverse_domain_name_notation">reverse |
| 398 | + * domain name notation</a> (un)applied |
| 399 | + */ |
| 400 | + private static String reverseDomainName(String reversedDomainName) { |
| 401 | + return HostToDomainGraph.reverseHost(reversedDomainName); |
| 402 | + } |
346 | 403 | }
|
0 commit comments