Created
January 2, 2020 05:35
-
-
Save alphamarket/42cadc3b8b82700252208f1c7c5e82a8 to your computer and use it in GitHub Desktop.
Spider script that crawls a site to test for broken links
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
ENV['TZ'] = 'UTC'

require 'rubygems'

require 'fileutils'
require 'logger'
require 'net/http'
require 'set'
require 'uri'

require 'active_support/core_ext/object/blank'
require 'byebug'
require 'nokogiri'
# Kept for backward compatibility: after this line `p` is an alias of `puts`,
# so it no longer prints the `inspect` form of its arguments.
alias p puts

# Parses an HTML string with Nokogiri.
#
# @param i [String, nil] raw markup; nil (for example the body of a response
#   that carries no content) is treated as an empty document instead of
#   raising NoMethodError on `strip`
# @return the document object returned by `Nokogiri::HTML`
def h(i)
  Nokogiri::HTML(i.to_s.strip)
end

# Write progress markers to the terminal immediately instead of buffering them.
$stdout.sync = true
# Crawls a site starting from the URL given as the first command-line
# argument, following only root-relative links ("/path") on the same
# scheme/host/port, and records the outcome of every request:
#   access.log  - URLs that answered 2xx
#   errors.log  - redirects (3xx), other non-2xx answers and request failures
#   ignored.log - hrefs that were not followed (external, relative, "//...")
start_url = ARGV.first.to_s
abort "usage: #{$PROGRAM_NAME} <start-url>" if start_url.empty?

visited = []            # every URL actually requested, in request order
error_counter = 0
redirect_counter = 0
success_counter = 0
urls = [start_url]      # LIFO work list of URLs still to fetch
seen = Set[start_url]   # every URL ever queued, so nothing is fetched twice

# Start each run with fresh logs. Done in-process (not via a shell `rm`) so the
# files are guaranteed to be gone before the loggers below open them.
FileUtils.rm_f(%w[errors.log access.log ignored.log])
error = Logger.new('errors.log')
access = Logger.new('access.log')
ignored = Logger.new('ignored.log')

while (current = urls.pop)
  visited.push(current)

  begin
    uri = URI.parse(current)
    response = Net::HTTP.get_response(uri)
  rescue StandardError => e
    # A malformed href or a network failure counts as a broken link; it must
    # not abort the rest of the crawl.
    error_counter += 1
    print "\e[31mE\e[m"
    error.error "\e[31m[E#{e.class}]\e[m > #{current} (#{e.message})"
    next
  end

  case response.code.to_i / 100
  when 2
    success_counter += 1
    print "\e[32m.\e[m"
    access.info current

    base = "#{uri.scheme}://#{uri.host}:#{uri.port}"
    # `to_s` guards against responses without a body.
    h(response.body.to_s).css('a').each do |anchor|
      href = anchor[:href]
      next if href.blank?

      # Only root-relative links are followed; "//host/..." is protocol-relative
      # and therefore potentially external.
      unless href.start_with?('/') && !href.start_with?('//')
        ignored.info href
        next
      end

      # The fragment is never sent to the server, so "/page#a" and "/page#b"
      # are the same resource.
      target = "#{base}#{href.sub(/#.*\z/m, '')}"
      urls.push(target) if seen.add?(target)
    end
  when 3
    redirect_counter += 1
    print "\e[33mR\e[m"
    error.info "\e[33m[R#{response.code}]\e[m > #{current}"
  else
    error_counter += 1
    print "\e[31mE\e[m"
    error.error "\e[31m[E#{response.code}]\e[m > #{current}"
  end
end

puts "\n\nprocessed: #{visited.count} links, \e[31merrors: #{error_counter}\e[m, " \
     "\e[33mredirects: #{redirect_counter}\e[m, \e[32msuccess: #{success_counter}\e[m"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment