Skip to content

Instantly share code, notes, and snippets.

@DC3
Forked from oliveratgithub/curl-crawler.sh
Created November 4, 2021 22:32
Show Gist options
  • Save DC3/3b2fafc046f452a905a950fa3310d1d2 to your computer and use it in GitHub Desktop.
Save DC3/3b2fafc046f452a905a950fa3310d1d2 to your computer and use it in GitHub Desktop.
Unix Shell-Script to crawl a list of website URLs using curl
#!/bin/sh
# Crawl a list of website URLs with curl and write the results to a log file
# stored next to this script.
#
# Usage: <this-script> /path/to/urls.txt

# Timezone used for the timestamps.
# List of valid timezones: wikipedia.org/wiki/List_of_tz_database_time_zones
timezone="Europe/Zurich"

# File name of this script without any leading directories.
script="${0##*/}"

# Absolute directory containing this script. $0 is used instead of the
# bash-only BASH_SOURCE array so this also works when /bin/sh is not bash.
# The quoting keeps directories with spaces intact, and the empty CDPATH
# prevents cd from printing the directory it resolved.
rootdir=$(CDPATH='' cd -- "$(dirname -- "$0")" && pwd)

# The log file is named after the script and lives in the same directory.
logfile="$script.log"
log="$rootdir/$logfile"

# Start-up timestamp; it becomes part of the email subject below.
now=$(TZ=":$timezone" date)

# Uncomment 'mailto=' (remove #) to enable emailing the log upon completion
#mailto="you@example.com"
mailsubj="$script log from $now"
# Append one timestamped entry to the log file.
# Globals:
#   timezone (read)  - tz database name used for the timestamp
#   log      (read)  - path of the file the entry is appended to
#   now      (write) - refreshed with the current time on every call
# Arguments:
#   $1 - level tag written between square brackets, e.g. INFO
#   $2 - message text
# Outputs:
#   Appends "<timestamp> [<level>] <message>" to $log. When either argument
#   is missing or empty, prints a usage hint to stderr instead.
# Returns:
#   Does not return on misuse: exits the shell with status 2.
logging() {
  now=$(TZ=":$timezone" date)
  # Plain [ ] tests: the script's shebang is /bin/sh, where [[ ]] may not exist.
  if [ -z "${1:-}" ] || [ -z "${2:-}" ]; then
    # printf expands \n the same way in every shell, unlike echo.
    printf '%s [ERROR] Nothing to log. Use:\nlogging <level> <result>\n' "$now" >&2
    exit 2
  fi
  # Quote the target so a log path containing spaces is not word-split.
  printf '%s [%s] %s\n' "$now" "$1" "$2" >> "$log"
}
# Validate the command line: the first argument must be the path of a readable
# file containing the URLs to crawl.
if [ -z "${1:-}" ]; then
  # printf renders \n consistently across shells; diagnostics go to stderr.
  printf '%s [ERROR] Missing file input. Use:\n%s/%s /path/to/urls.txt\n' \
    "$now" "$rootdir" "$script" >&2
  exit 2
fi
input="$1"
# Fail early instead of logging an empty crawl when the list cannot be read.
if [ ! -r "$input" ]; then
  printf '%s [ERROR] Cannot read file: %s\n' "$now" "$input" >&2
  exit 2
fi
# Crawl every URL listed in $input (one per line). For each URL the log
# receives the final HTTP status code, the effective URL after redirects and
# the elapsed wall-clock time formatted as hours:minutes:seconds.
logging "INFO" "Reading file: $input"
# IFS= and -r keep each line verbatim; the extra -n test still processes a
# last line that has no trailing newline.
while IFS= read -r line || [ -n "$line" ]; do
  # Blank lines carry no URL; skip them rather than running curl without one.
  [ -n "$line" ] || continue
  logging "INFO" "Crawling URL: $line"
  curlstart=$(date +"%s")
  # curl parameters: -sS = silent but still show errors; -L = follow redirects;
  # -w = custom output format; -o = discard the response body.
  # The URL is quoted and placed after -- so it is passed as one literal
  # argument; stdin is detached so curl cannot consume the remaining URL list.
  curlresult=$(curl -sSL -w '%{http_code} %{url_effective}' -o /dev/null -- "$line" </dev/null)
  curldone=$(date +"%s")
  logging "INFO" "$curlresult"
  difftime=$((curldone - curlstart))
  # Minutes are taken modulo one hour; otherwise 3725 seconds shows as 1:62:5.
  logging "INFO" "Crawl-time: $((difftime / 3600)):$(( (difftime % 3600) / 60 )):$((difftime % 60))"
done < "$input"
logging "INFO" "Done reading file: $input"
# Email the finished log when a recipient has been configured via 'mailto'.
# Two separate tests replace the obsolescent and ambiguous '-a' operator.
if [ -n "${mailto:-}" ] && [ "$mailto" != " " ]; then
  logging "INFO" "Sending Email to: $mailto"
  # Using postfix mail command to email the logfile contents. The log is fed
  # through a quoted redirection so a log path containing spaces still works.
  # $mailto stays unquoted on purpose: a space-separated list of recipients
  # keeps expanding to separate arguments, as it did before.
  # shellcheck disable=SC2086
  mail -s "$mailsubj" $mailto < "$log"
fi
# Bare exit: the script ends with the status of the last command executed.
exit
https://www.apple.com/
https://wikipedia.org
https://swissmacuser.ch/
https://twitter.com/swissmacuser
# This is example output generated by curl-crawler
Sun Feb 19 21:56:07 CET 2017 [INFO] Reading file: ./urls.txt
Sun Feb 19 21:56:07 CET 2017 [INFO] Crawling URL: https://www.apple.com/
Sun Feb 19 21:56:07 CET 2017 [INFO] 200 https://www.apple.com/
Sun Feb 19 21:56:07 CET 2017 [INFO] Crawl-time: 0:0:0
Sun Feb 19 21:56:07 CET 2017 [INFO] Crawling URL: https://wikipedia.org
Sun Feb 19 21:56:07 CET 2017 [INFO] 200 https://www.wikipedia.org/
Sun Feb 19 21:56:07 CET 2017 [INFO] Crawl-time: 0:0:0
Sun Feb 19 21:56:07 CET 2017 [INFO] Crawling URL: https://swissmacuser.ch/
Sun Feb 19 21:56:08 CET 2017 [INFO] 200 https://swissmacuser.ch/
Sun Feb 19 21:56:08 CET 2017 [INFO] Crawl-time: 0:0:1
Sun Feb 19 21:56:08 CET 2017 [INFO] Crawling URL: https://twitter.com/swissmacuser
Sun Feb 19 21:56:09 CET 2017 [INFO] 200 https://twitter.com/swissmacuser
Sun Feb 19 21:56:09 CET 2017 [INFO] Crawl-time: 0:0:1
Sun Feb 19 21:56:09 CET 2017 [INFO] Done reading file: ./urls.txt
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment