Skip to content

Commit 81424f2

Browse files
feat: tool and scripts to interactively explore webgraph
Update the scripts that download webgraphs and build the vertex map: support host-level webgraphs, which are shipped with a list of vertex files.
1 parent 1bfda67 commit 81424f2

File tree

2 files changed

+55
-6
lines changed

2 files changed

+55
-6
lines changed

src/script/webgraph_ranking/graph_explore_build_vertex_map.sh

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@ if ! shift 2; then
1111
echo "Build node indexes to interactively explore a Common Crawl webgraph."
1212
echo "The webgraph files are expected to be placed in the current directory."
1313
echo
14+
echo " <name> basename of the graph (without the .graph suffix)"
15+
echo " <vertices> vertices file name (including the file suffix)"
16+
echo " or directory containing the vertices files"
17+
echo
1418
exit 1
1519
fi
1620

@@ -98,8 +102,13 @@ if [ -e $NAME.iepm ]; then
98102
index_status
99103
exit 0
100104
fi
105+
CAT_VERTICES="zcat $VERTICES"
106+
if [ -d $VERTICES ]; then
107+
# host-level webgraph, multiple vertex files
108+
CAT_VERTICES="zcat $VERTICES/*.txt.gz"
109+
fi
101110
if (set -eo pipefail;
102-
zcat $VERTICES \
111+
eval $CAT_VERTICES \
103112
| cut -f2 \
104113
| "$WG" it.unimi.dsi.util.ImmutableExternalPrefixMap -b4Ki $NAME.iepm); then
105114
echo "immutable external prefix map successfully built: $NAME.iepm"

src/script/webgraph_ranking/graph_explore_download_webgraph.sh

Lines changed: 45 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,17 @@ export LC_ALL=C
1717

1818
BIN="$(dirname $0)"
1919

20+
USING_CURL=false
21+
USING_WGET=false
22+
if command -v curl &>/dev/null; then
23+
USING_CURL=true
24+
elif command -v wget &>/dev/null; then
25+
USING_WGET=true
26+
else
27+
echo "Either curl or wget are required for downloading" >&2
28+
exit 1
29+
fi
30+
2031
declare -A suffix_name_map
2132
suffix_name_map=(
2233
graph "webgraph / BVGraph"
@@ -33,6 +44,8 @@ function list_webgraph_files() {
3344
if [ -e $name.$suffix ]; then
3445
printf " .%-10s : %-20s (%s)\n" "$suffix" \
3546
"${suffix_name_map[$suffix]}" "$name.$suffix"
47+
elif [ -d "$name" ] && [[ "$suffix" =~ ^\*. ]]; then
48+
ls "$name"/* | sed 's/^/\t/'
3649
else
3750
echo -e "Missing $name.$suffix (${suffix_name_map[$suffix]})"
3851
ok=false
@@ -50,8 +63,20 @@ function download_file() {
5063
fi
5164
URL="https://data.commoncrawl.org/projects/hyperlinkgraph/$BASE_NAME/$GRAPH_AGGR_LEVEL/$FILE"
5265
echo "Downloading $URL"
53-
# wget --continue --timestamping "$URL"
54-
curl --silent --remote-time -o "$FILE" --time-cond "$FILE" --continue-at - "$URL"
66+
67+
if $USING_CURL; then
68+
69+
curl --silent --remote-time -o "$FILE" --time-cond "$FILE" --continue-at - "$URL"
70+
71+
elif $USING_WGET; then
72+
73+
if [ "$(dirname "$FILE")" == "." ]; then
74+
wget --continue --timestamping "$URL"
75+
else
76+
wget --continue --timestamping --directory-prefix="$(dirname "$FILE")" "$URL"
77+
fi
78+
79+
fi
5580
}
5681

5782
function download_files() {
@@ -63,18 +88,33 @@ function download_files() {
6388

6489

6590
BASE_NAME="${NAME%-domain}"
91+
BASE_NAME="${BASE_NAME%-host}"
6692
GRAPH_AGGR_LEVEL="${NAME##*-}"
6793

6894

6995
download_files "$NAME" graph properties stats
70-
download_files "$NAME-vertices" txt.gz
7196
download_files "$NAME-t" graph properties
7297

98+
if [ $GRAPH_AGGR_LEVEL == "domain" ]; then
99+
download_files "$NAME-vertices" txt.gz
100+
else
101+
download_files "$NAME-vertices" paths.gz
102+
zcat "$NAME-vertices".paths.gz \
103+
| while read path; do
104+
file=${path#projects/hyperlinkgraph/$BASE_NAME/$GRAPH_AGGR_LEVEL/}
105+
mkdir -p $(dirname "$file")
106+
download_file "$file"
107+
done
108+
fi
109+
73110
echo "Downloaded files"
74111
echo "- webgraph"
75112
list_webgraph_files $NAME graph properties stats
76113
echo "- webgraph (transpose)"
77114
list_webgraph_files $NAME-t graph properties
78115
echo "- webgraph vertices"
79-
list_webgraph_files $NAME-vertices txt.gz
80-
116+
if [ $GRAPH_AGGR_LEVEL == "domain" ]; then
117+
list_webgraph_files $NAME-vertices txt.gz
118+
else
119+
list_webgraph_files vertices "*.txt.gz"
120+
fi

0 commit comments

Comments
 (0)