Created
October 20, 2023 15:40
-
-
Save phette23/c14334c2aa34937562d9782557398841 to your computer and use it in GitHub Desktop.
download Archive-It WARCs to backup
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env fish
# Download Archive-It WARC files for one collection as a local backup.
# Used for the Art Practical site.
#
# How it works:
#   1. On the first run, query the WASAPI endpoint for the collection and write
#      the first location of every listed file to $URLSFILE (one URL per line).
#   2. Each run downloads up to $LIMIT files from the top of $URLSFILE. After a
#      successful download the URL is appended to $DONEFILE and removed from
#      $URLSFILE, so rerunning the script resumes where the last run stopped.
#
# Requires: curl, jq, wget, sed, and a Perl-style `rename` command.

# fill in credentials
# NOTE(review): the password is passed to curl/wget on the command line, where
# other local users may be able to see it in the process list.
set USER username
set PASS password
set COLLECTION 15633
# destination files
set JSONFILE data.json
set URLSFILE urls.txt
set DONEFILE done.txt
# number of files to download per run; keep rerunning the script until it's done
set LIMIT 8

# download JSON data from WASAPI then write all WARC URLs to file
if test ! -f $JSONFILE
    # Fetch into a temporary name first: $JSONFILE doubles as the "already
    # fetched" marker, so a failed request must not leave it behind or every
    # later run would skip this step and work from a bad URL list.
    # NOTE(review): only a single response is requested; if WASAPI paginates
    # large collections, later pages are not fetched here -- confirm.
    if not curl --fail -u $USER:$PASS "https://warcs.archive-it.org/wasapi/v1/webdata?collection=$COLLECTION" > $JSONFILE.part
        rm -f $JSONFILE.part
        echo "Error fetching WASAPI data for collection $COLLECTION" >&2
        exit 1
    end
    if not jq -r '.files[].locations[0]' $JSONFILE.part > $URLSFILE
        rm -f $JSONFILE.part
        echo "Error extracting WARC URLs from WASAPI response" >&2
        exit 1
    end
    mv $JSONFILE.part $JSONFILE
end

# go through them, save finished URLs to $DONEFILE
for INDEX in (seq 1 $LIMIT)
    # The next URL to fetch is always the first line of $URLSFILE.
    set URL (head -n 1 $URLSFILE)
    if test -z "$URL"
        # Nothing left to download: stop cleanly instead of calling wget
        # without a URL and reporting a bogus error.
        echo "No URLs left in $URLSFILE"
        break
    end

    set_color --bold red
    echo "Downloading file $INDEX out of $LIMIT"
    echo -e $URL '\n'
    set_color normal

    if wget --http-user=$USER --http-password=$PASS --accept txt,gz $URL
        # "cut" first line of URLSFILE to DONEFILE
        echo $URL >> $DONEFILE
        # Remove everything from the first "?" through the trailing "tmp" in the
        # names of any *.tmp files in the current directory. The glob is
        # collected with `set` first because fish aborts a command whose
        # wildcard matches nothing.
        set TMPFILES *.tmp
        if test (count $TMPFILES) -gt 0
            rename -v 's/\?.*tmp//' $TMPFILES
        end
        # The attached-suffix form of -i is accepted by both BSD and GNU sed.
        sed -i.bak '1d' $URLSFILE
    else
        # Double quotes so that $URL is expanded; echo -e turns \n into a newline.
        echo -e "Error downloading\n$URL"
        exit 1
    end
end

set_color --bold red
echo -e "\nProgress:"
wc -l $URLSFILE && wc -l $DONEFILE
# Reset the colour so the bold red does not leak into the user's terminal.
set_color normal
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment