-
-
Save DC3/3b2fafc046f452a905a950fa3310d1d2 to your computer and use it in GitHub Desktop.
Unix Shell-Script to crawl a list of website URLs using curl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
#
# curl-crawler: requests every URL listed in a text file with curl and appends
# the HTTP status code, the effective (post-redirect) URL and the elapsed time
# to a log file located next to this script.
#
# Usage: <this-script> /path/to/urls.txt

# Timezone used when rendering timestamps.
# List of valid timezones: wikipedia.org/wiki/List_of_tz_database_time_zones
timezone="Europe/Zurich"

# File name of this script without its directory part.
script="${0##*/}"

# Absolute directory containing this script. $0 is used instead of the
# bash-only ${BASH_SOURCE[0]} so this also works when /bin/sh is not bash.
rootdir=$(cd -- "$(dirname -- "$0")" && pwd)

# The log file lives next to the script and is named "<script>.log".
logfile="$script.log"
log="$rootdir/$logfile"

# Start timestamp, rendered in the configured timezone.
now=$(TZ=":$timezone" date)

# Uncomment 'mailto=' (remove #) to enable emailing the log upon completion
#mailto="you@example.com"
mailsubj="$script log from $now"
# logging <level> <result>
#
# Appends one line of the form "<timestamp> [<level>] <result>" to the file
# named by the global $log. The timestamp is rendered in the zone given by the
# global $timezone.
#
# Globals:   timezone (read), log (read), now (written: refreshed on each call)
# Arguments: $1 - log level label, e.g. INFO or ERROR
#            $2 - message text
# Exits:     terminates the calling shell with status 2 if either argument is
#            missing or empty.
logging() {
  now=$(TZ=":$timezone" date)
  # POSIX [ ] instead of [[ ]]: the script is run by /bin/sh.
  if [ -z "${1:-}" ] || [ -z "${2:-}" ]; then
    # printf expands \n portably (echo may print it literally); usage errors
    # are diagnostics, so they go to stderr.
    printf '%s [ERROR] Nothing to log. Use:\nlogging <level> <result>\n' "$now" >&2
    exit 2
  fi
  # Quote the target so a log path containing spaces still works, and use
  # printf so messages with backslashes or a leading dash are written verbatim.
  printf '%s [%s] %s\n' "$now" "$1" "$2" >> "$log"
}
# ---------------------------------------------------------------------------
# Main: validate the input file argument, crawl every URL in it, log the
# outcome of each request and optionally mail the log.
# ---------------------------------------------------------------------------

# The first argument is the path to a text file with one URL per line.
if [ -z "${1:-}" ]; then
  printf '%s [ERROR] Missing file input. Use:\n%s/%s /path/to/urls.txt\n' \
    "$now" "$rootdir" "$script" >&2
  exit 2
fi
input="$1"

# Fail early with a clear message instead of letting the read loop error out.
if [ ! -r "$input" ]; then
  printf '%s [ERROR] Cannot read file: %s\n' "$now" "$input" >&2
  exit 2
fi

logging "INFO" "Reading file: $input"

# The file is redirected into the loop (no "cat |" subshell). "read -r" keeps
# backslashes intact, and the "|| [ -n ... ]" clause processes a final line
# that has no trailing newline.
while IFS= read -r line || [ -n "$line" ]; do
  # Skip blank lines so curl is never invoked without a URL.
  [ -n "$line" ] || continue

  logging "INFO" "Crawling URL: $line"
  curlstart=$(date +"%s")
  # curl parameters: -sS = silent, but still show errors; -L = follow redirects;
  # -w = custom output format; -o = trash output.
  # "--" ends option parsing so a line starting with "-" is treated as a URL.
  # stdin comes from /dev/null so curl cannot consume the remaining URL list.
  curlresult=$(curl -sSL -w '%{http_code} %{url_effective}' -o /dev/null -- "$line" </dev/null)
  # Take the end timestamp right after curl so logging is not part of the timing.
  curldone=$(date +"%s")

  if [ -n "$curlresult" ]; then
    logging "INFO" "$curlresult"
  else
    # logging exits the script on an empty message, so record the failure
    # explicitly and continue with the next URL.
    logging "ERROR" "No result from curl for: $line"
  fi

  difftime=$((curldone - curlstart))
  # hours:minutes:seconds; minutes are reduced modulo one hour so that e.g.
  # 3700 seconds is reported as 1:1:40 rather than 1:61:40.
  logging "INFO" "Crawl-time: $((difftime / 3600)):$((difftime % 3600 / 60)):$((difftime % 60))"
done < "$input"

logging "INFO" "Done reading file: $input"

# Mail the log only when 'mailto' has been configured to something meaningful.
if [ -n "${mailto:-}" ] && [ "$mailto" != " " ]; then
  logging "INFO" "Sending Email to: $mailto"
  # Using postfix mail command to email the logfile contents
  mail -s "$mailsubj" "$mailto" < "$log"
fi
exit
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
https://www.apple.com/
https://wikipedia.org
https://swissmacuser.ch/
https://twitter.com/swissmacuser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This is an example output generated by curl-crawler
Sun Feb 19 21:56:07 CET 2017 [INFO] Reading file: ./urls.txt
Sun Feb 19 21:56:07 CET 2017 [INFO] Crawling URL: https://www.apple.com/
Sun Feb 19 21:56:07 CET 2017 [INFO] 200 https://www.apple.com/
Sun Feb 19 21:56:07 CET 2017 [INFO] Crawl-time: 0:0:0
Sun Feb 19 21:56:07 CET 2017 [INFO] Crawling URL: https://wikipedia.org
Sun Feb 19 21:56:07 CET 2017 [INFO] 200 https://www.wikipedia.org/
Sun Feb 19 21:56:07 CET 2017 [INFO] Crawl-time: 0:0:0
Sun Feb 19 21:56:07 CET 2017 [INFO] Crawling URL: https://swissmacuser.ch/
Sun Feb 19 21:56:08 CET 2017 [INFO] 200 https://swissmacuser.ch/
Sun Feb 19 21:56:08 CET 2017 [INFO] Crawl-time: 0:0:1
Sun Feb 19 21:56:08 CET 2017 [INFO] Crawling URL: https://twitter.com/swissmacuser
Sun Feb 19 21:56:09 CET 2017 [INFO] 200 https://twitter.com/swissmacuser
Sun Feb 19 21:56:09 CET 2017 [INFO] Crawl-time: 0:0:1
Sun Feb 19 21:56:09 CET 2017 [INFO] Done reading file: ./urls.txt
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment