Skip to content

Instantly share code, notes, and snippets.

@phette23
Created October 20, 2023 15:40
Show Gist options
  • Save phette23/c14334c2aa34937562d9782557398841 to your computer and use it in GitHub Desktop.
Save phette23/c14334c2aa34937562d9782557398841 to your computer and use it in GitHub Desktop.
Download Archive-It WARCs to back them up
#!/usr/bin/env fish
# Back up the WARC files of an Archive-It collection.
# used for Art Practical site
#
# The first run asks the WASAPI endpoint for the collection's file list and
# writes one WARC URL per line to $URLSFILE. Every run then downloads up to
# $LIMIT files from the top of that list; after each successful download the
# URL is appended to $DONEFILE and removed from $URLSFILE. Rerun the script
# until $URLSFILE is empty.
#
# External tools used: curl, jq, wget, rename, sed, head, seq, wc.

# fill in credentials
# NOTE: these are passed to curl/wget as command-line arguments below, so they
# are visible in the process list while a transfer is running.
set USER username
set PASS password
set COLLECTION 15633
# destination files
set JSONFILE data.json
set URLSFILE urls.txt
set DONEFILE done.txt
# number of files to download, keep rerunning the script until it's done
set LIMIT 8

# download JSON data from WASAPI then write all WARC URLs to file
if test ! -f $JSONFILE
    # --fail makes curl exit non-zero on an HTTP error (e.g. bad credentials)
    # instead of saving the error response as if it were the file list.
    if not curl --fail -u $USER:$PASS "https://warcs.archive-it.org/wasapi/v1/webdata?collection=$COLLECTION" > $JSONFILE
        # Remove the partial file: the existence check above would otherwise
        # make every later run skip the request.
        rm -f $JSONFILE
        echo "Error fetching WASAPI data for collection $COLLECTION" >&2
        exit 1
    end
    # First location of every file entry, one per line.
    if not jq -r '.files[].locations[0]' $JSONFILE > $URLSFILE
        # Same reasoning as above: leave nothing behind so a rerun starts over.
        rm -f $JSONFILE $URLSFILE
        echo "Error extracting WARC URLs from the WASAPI response" >&2
        exit 1
    end
end

# go through them, save finished URLs to $DONEFILE
for INDEX in (seq 1 $LIMIT)
    set URL (head -n 1 $URLSFILE)
    # Nothing left to fetch: stop cleanly instead of calling wget without a URL.
    if test -z "$URL"
        echo "No URLs left in $URLSFILE"
        break
    end

    set_color --bold red
    echo "Downloading file $INDEX out of $LIMIT"
    echo -e $URL '\n'
    set_color normal

    if wget --http-user=$USER --http-password=$PASS --accept txt,gz $URL
        # "cut" first line of URLSFILE to DONEFILE
        echo $URL >> $DONEFILE

        # Strip everything from the first "?" through the trailing "tmp" from
        # the name of each *.tmp file in the current directory.
        # NOTE(review): presumably wget saves the download under a name that
        # carries the URL's query string plus ".tmp" - confirm.
        # Assigning the glob with `set` yields an empty list when nothing
        # matches, whereas a bare unmatched glob makes fish report an error.
        set TMPFILES *.tmp
        if set -q TMPFILES[1]
            rename -v 's/\?.*tmp//' $TMPFILES
        end

        # The attached-suffix form of -i is accepted by both GNU and BSD sed;
        # the detached form (-i '.bak') only works with BSD sed.
        if not sed -i.bak '1d' $URLSFILE
            # Without this the same URL would be downloaded again next round.
            echo "Error removing finished URL from $URLSFILE" >&2
            exit 1
        end
    else
        # Double quotes so that $URL is expanded; echo -e handles the \n.
        echo -e "Error downloading\n$URL" >&2
        exit 1
    end
end

set_color --bold red
echo -e "\nProgress:"
wc -l $URLSFILE
# $DONEFILE is only created by the first successful download.
test -f $DONEFILE && wc -l $DONEFILE
# Do not leave the terminal in bold red after the script ends.
set_color normal
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment