Skip to content

Commit 8cc0f4c

Browse files
feat: tool and scripts to interactively explore webgraph
Add script to extract top-10k vertices by indegree and outdegree.
1 parent 12bedb2 commit 8cc0f4c

File tree

1 file changed

+54
-0
lines changed

1 file changed

+54
-0
lines changed
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
#!/bin/bash
#
# Extract the vertices of a webgraph with the highest out- and indegrees.
#
# Usage: <this-script> <graph-name> [<type>]
#   graph-name  base name of the webgraph (without the file suffix .graph)
#   type        level of the graph aggregation: domain (default) or host

# Abort on the first failing command, and treat a pipeline as failed when any
# of its stages fails.
set -eo pipefail

NAME="$1"
TYPE="${2:-domain}"

# The graph name is mandatory: without it, print the usage to stderr and exit
# with a non-zero status.
if [[ -z "$NAME" ]]; then
  {
    printf 'Usage: %s <graph-name> [<type>]\n' "${0##*/}"
    printf '\tgraph-name\tbase name of the webgraph (without the file suffix .graph)\n'
    printf '\ttype\ttype (level) of the graph aggregation: domain (default) or host\n'
  } >&2
  exit 1
fi

# Helper script looked up in the same directory as this script. It is called
# with a class name followed by that class's arguments.
WG="$(dirname "$0")/run_webgraph.sh"

# Run the WebGraph Stats tool only when at least one of the two degree files
# is missing; an earlier run's results are reused otherwise.
# NOTE(review): --save-degrees is expected to write "$NAME.outdegrees" and
# "$NAME.indegrees" -- confirm against the WebGraph documentation.
if [[ ! -e "$NAME.outdegrees" || ! -e "$NAME.indegrees" ]]; then
  "$WG" it.unimi.dsi.webgraph.Stats --save-degrees "$NAME"
fi


# Build "$NAME-outdegrees-indegrees.txt.gz": one row per vertex holding the
# outdegree, the indegree and the vertex columns, separated by tabs.
#
# Vertex source depends on the aggregation level: a single
# "$NAME-vertices.txt.gz" for domain graphs, every file in vertices/ otherwise.
# The first column of the vertex listing is dropped (presumably the numeric
# vertex id -- TODO confirm); paste then glues the two degree files in front.
# NOTE(review): this assumes line N of the degree files and line N of the
# vertex listing describe the same vertex -- verify against the graph build.
if [[ "$TYPE" == "domain" ]]; then
  zcat "$NAME-vertices.txt.gz"
else
  zcat vertices/*.txt.gz
fi \
  | cut -f2- \
  | paste "$NAME.outdegrees" "$NAME.indegrees" - \
  | gzip >"$NAME-outdegrees-indegrees.txt.gz"


# Column header of the top lists (real tab characters). Domain-level graphs
# get one additional column labelled "numsubdomains".
HEADER=$'outdegree\tindegree\tname'
if [[ "$TYPE" == "domain" ]]; then
  HEADER=$'outdegree\tindegree\tname\tnumsubdomains'
fi

#######################################
# Write a top list: the header line followed by at most 10000 rows of the
# combined degree table whose value in the chosen column is greater than
# 1000, ordered by that column, largest first.
# Globals:   NAME (read), HEADER (read)
# Arguments: $1 - 1-based column to filter and sort on
#                 (1 = outdegree, 2 = indegree)
#            $2 - suffix of the output file, which is written to
#                 "$NAME-outdegrees-indegrees-<suffix>.txt.gz"
# Outputs:   the gzip-compressed list file; nothing on stdout
#######################################
write_top_list() {
  local field=$1
  local suffix=$2
  (
    printf '%s\n' "$HEADER"
    # pipefail is switched off inside this subshell only: head may stop
    # reading after 10000 lines, and an upstream stage terminated because of
    # that (presumably by SIGPIPE) must not be reported as a failure.
    set +o pipefail
    # The BEGIN block takes the column number off the argument list, so perl
    # still reads the table from stdin.
    zcat "$NAME-outdegrees-indegrees.txt.gz" \
      | perl -aF'\t' -lne 'BEGIN { $col = shift(@ARGV) - 1 } print if $F[$col] > 1000' "$field" \
      | sort -k"${field},${field}nr" \
      | head -10000
  ) | gzip >"$NAME-outdegrees-indegrees-$suffix.txt.gz"
}

# Top vertices by outdegree (column 1) and by indegree (column 2).
write_top_list 1 topout
write_top_list 2 topin


0 commit comments

Comments
 (0)