@@ -17,6 +17,17 @@ export LC_ALL=C
17
17
18
18
# Directory containing this script. "$0" is quoted so a script path that
# contains whitespace is passed to dirname as a single argument.
BIN="$(dirname "$0")"

# Choose the download tool: curl is preferred, wget is the fallback.
# Exactly one of USING_CURL / USING_WGET ends up as "true". The flags are
# executed as commands by the download code (`if $USING_CURL; then ...`),
# so they must hold the literal words `true` or `false`.
USING_CURL=false
USING_WGET=false
# The redirection has to be attached to `command -v` itself. Writing
# `command -v curl & > /dev/null` would start the probe in the background
# and make the `if` test an empty redirection, which always succeeds.
if command -v curl >/dev/null 2>&1; then
    USING_CURL=true
elif command -v wget >/dev/null 2>&1; then
    USING_WGET=true
else
    # Neither tool is available: nothing can be downloaded, so abort.
    echo "Either curl or wget are required for downloading" >&2
    exit 1
fi

20
31
declare -A suffix_name_map
21
32
suffix_name_map=(
22
33
graph " webgraph / BVGraph"
@@ -33,6 +44,8 @@ function list_webgraph_files() {
33
44
if [ -e $name .$suffix ]; then
34
45
printf " .%-10s : %-20s (%s)\n" " $suffix " \
35
46
" ${suffix_name_map[$suffix]} " " $name .$suffix "
47
+ elif [ -d " $name " ] && [[ " $suffix " =~ ^\* . ]]; then
48
+ ls " $name " /* | sed ' s/^/\t/'
36
49
else
37
50
echo -e " Missing $name .$suffix (${suffix_name_map[$suffix]} )"
38
51
ok=false
@@ -50,8 +63,20 @@ function download_file() {
50
63
fi
51
64
URL=" https://data.commoncrawl.org/projects/hyperlinkgraph/$BASE_NAME /$GRAPH_AGGR_LEVEL /$FILE "
52
65
echo " Downloading $URL "
53
- # wget --continue --timestamping "$URL"
54
- curl --silent --remote-time -o " $FILE " --time-cond " $FILE " --continue-at - " $URL "
66
+
67
+ if $USING_CURL ; then
68
+
69
+ curl --silent --remote-time -o " $FILE " --time-cond " $FILE " --continue-at - " $URL "
70
+
71
+ elif $USING_WGET ; then
72
+
73
+ if [ " $( dirname " $FILE " ) " == " ." ]; then
74
+ wget --continue --timestamping " $URL "
75
+ else
76
+ wget --continue --timestamping --directory-prefix=" $( dirname " $FILE " ) " " $URL "
77
+ fi
78
+
79
+ fi
55
80
}
56
81
57
82
function download_files() {
@@ -63,18 +88,33 @@ function download_files() {
63
88
64
89
65
90
# Derive the graph base name and aggregation level from NAME:
# BASE_NAME is NAME without a trailing "-domain" or "-host", and
# GRAPH_AGGR_LEVEL is whatever follows the last "-" in NAME.
BASE_NAME="${NAME%-domain}"
BASE_NAME="${BASE_NAME%-host}"
GRAPH_AGGR_LEVEL="${NAME##*-}"

# Graph and transposed graph are fetched the same way for every level.
download_files "$NAME" graph properties stats
download_files "$NAME-t" graph properties

if [[ "$GRAPH_AGGR_LEVEL" == "domain" ]]; then
    # Domain level: the vertices are a single txt.gz file.
    download_files "$NAME-vertices" txt.gz
else
    # Other levels: the vertices come as a gzipped list of paths, each of
    # which is downloaded individually.
    download_files "$NAME-vertices" paths.gz
    # `gzip -dc` is equivalent to zcat but also works where zcat only
    # accepts .Z files. `IFS= read -r` keeps each path byte-for-byte.
    gzip -dc "$NAME-vertices.paths.gz" \
        | while IFS= read -r path; do
              [[ -n "$path" ]] || continue
              # Strip the remote directory prefix; the pattern is quoted
              # so the variables are matched literally, not as globs.
              file="${path#"projects/hyperlinkgraph/$BASE_NAME/$GRAPH_AGGR_LEVEL/"}"
              mkdir -p -- "$(dirname "$file")"
              download_file "$file"
          done
fi

# Report which of the expected files are present.
echo "Downloaded files"
echo "- webgraph"
list_webgraph_files "$NAME" graph properties stats
echo "- webgraph (transpose)"
list_webgraph_files "$NAME-t" graph properties
echo "- webgraph vertices"
if [[ "$GRAPH_AGGR_LEVEL" == "domain" ]]; then
    list_webgraph_files "$NAME-vertices" txt.gz
else
    # The individual vertex files are listed from the "vertices" directory;
    # the suffix is passed as a quoted pattern so it is not expanded here.
    list_webgraph_files vertices "*.txt.gz"
fi
0 commit comments