Skip to content

Instantly share code, notes, and snippets.

@mislav
Last active May 20, 2020 13:48
Show Gist options
  • Save mislav/3596663 to your computer and use it in GitHub Desktop.
Save mislav/3596663 to your computer and use it in GitHub Desktop.
RESOLVE SHORT URLS before storing. Short URLs are for microblogging; you should never actually keep them around.
require 'net/http'
# WARNING do not use this; it works but is very limited
# Naive single-hop resolver, kept only as a counter-example.
#
# url - String or URI to look up.
#
# Issues one GET request. Returns the value of the Location header when the
# server answers with status 301; otherwise returns the given URL as a String.
def resolve(url)
  response = Net::HTTP.get_response(URI(url))
  return url.to_s unless response.code == '301'

  response['location']
end
# Why the above method sucks:
# - doesn't handle multiple redirects
# - uses HTTP GET instead of HEAD (slower, wasted bandwidth)
# - no handling of HTTP errors or Ruby exceptions
# - no HTTPS support
# - no strict timeouts (lookups can block for too long)
# Author: Mislav Marohnić
# License: MIT http://mislav.mit-license.org
require 'uri'
require 'net/https'
# Public: Service that resolves URLs to their final destination.
#
# Examples
#
# res = UrlResolver::resolve(url)
#
# if res.dead?
# abort "dead link"
# elsif res.failed?
# abort res.failed_reason
# elsif res.changed?
# puts "-> #{res.final_url} (#{res.num_redirects} redirects)"
# else
# warn "URL is direct"
# p res.response_code
# p res.response_headers
# end
class UrlResolver
  # Internal: HTTP status codes that are followed as redirects. Shorteners and
  # misconfigured servers frequently answer with something other than 301.
  REDIRECT_CODES = [301, 302, 303, 307, 308].freeze

  # Public: Resolve a URL
  #
  # url          - String or URI
  # http_adapter - a HTTP adapter to make requests with
  #                (default: HttpAdapter.new)
  #
  # Returns a Resolution.
  def self.resolve(url, http_adapter = HttpAdapter.new)
    new(url, http_adapter).resolve
  end

  attr_reader :url, :http_adapter

  # Public: Set up a resolver for a single URL.
  #
  # url          - String or URI; Strings are parsed into a URI.
  # http_adapter - object responding to get_connection, create_request and
  #                perform_request (see HttpAdapter).
  def initialize(url, http_adapter)
    @url = normalize_url(url)
    @http_adapter = http_adapter
  end

  # Public: Perform URL resolution
  #
  # limit - Integer representing the maximum number of redirects
  #         (default: 5)
  #
  # All exceptions (StandardError) are caught and available as
  # Resolution#error.
  #
  # Returns a Resolution.
  def resolve(limit = 5)
    resolution = Resolution.new url
    begin
      resolve_url(url, limit) do |new_url, response|
        resolution.final_url = new_url
        resolution.response = response
        resolution.num_requests += 1
      end
    rescue => error
      # Keep the last response only when the error itself carries one
      # (HttpError, TooManyRedirects); otherwise it would be stale.
      resolution.response = nil unless error.respond_to? :response
      resolution.error = error
    end
    resolution
  end

  # Public: The result of a URL resolution.
  class Resolution
    attr_reader :original_url
    attr_accessor :final_url, :num_requests, :response, :error

    # url - the URI that resolution started from.
    def initialize(url)
      @final_url = @original_url = url
      @num_requests = 0
      @response = @error = nil
    end

    # Public: Number of redirects that were followed. Never negative, even
    # when the very first request failed before any response was received.
    def num_redirects
      [num_requests - 1, 0].max
    end

    # Public: Integer status code of the last response, or 500 when there is
    # no response available.
    def response_code
      response ? response.code.to_i : 500
    end

    # Public: Hash of headers of the last response; empty Hash when there is
    # no response available.
    def response_headers
      response ? response.to_hash : {}
    end

    # Public: Truthy (the error object) when resolution failed, nil otherwise.
    def failed?
      error
    end

    # Public: Message of the error that made resolution fail. Only call this
    # when failed? is truthy.
    def failed_reason
      error.message
    end

    # Public: true when the final URL differs from the original one.
    def changed?
      original_url != final_url
    end

    # Public: true when resolution ended in an error that reports the
    # destination as not found (see HttpError#not_found?).
    def dead?
      error.respond_to?(:not_found?) && error.not_found?
    end
  end

  # Public: Raised when more redirects than the allowed limit are encountered.
  class TooManyRedirects < StandardError
    attr_reader :response

    # msg      - String error message.
    # response - the last redirect response received.
    def initialize(msg, response)
      super(msg)
      @response = response
    end
  end

  # Public: Raised when the server answers with a 4xx/5xx status, or with a
  # redirect that carries no Location header.
  class HttpError < StandardError
    attr_reader :request_url, :response

    # msg         - String error message.
    # request_url - the URI that was requested.
    # response    - the response that triggered the error.
    def initialize(msg, request_url, response)
      super(msg)
      @request_url = request_url
      @response = response
    end

    # Public: true for status 404 (Not Found) and 410 (Gone).
    def not_found?
      [404, 410].include? response_code
    end

    # Public: Integer status code of the response.
    def response_code
      response.code.to_i
    end
  end

  # Internal: Coerce the argument into a URI. Objects that already respond to
  # `host` are returned as-is.
  def normalize_url(url)
    url.respond_to?(:host) ? url : URI(url.to_s)
  end

  # Internal: Request `url` and recursively follow redirects.
  #
  # url     - URI to request.
  # limit   - Integer number of redirects that may still be followed.
  # referer - URI of the page that redirected here (default: nil).
  # block   - called with (url, response) after every request.
  #
  # Returns the final URI.
  # Raises HttpError on 4xx/5xx responses and on redirects without a
  #   Location header.
  # Raises TooManyRedirects when a redirect arrives and limit is exhausted.
  def resolve_url(url, limit, referer = nil, &block)
    response = request url, referer
    yield url, response if block_given?

    case response.code.to_i
    when 400...600
      raise HttpError.new(
        "server returned #{response.code} #{response.message}",
        url, response)
    when *REDIRECT_CODES
      raise TooManyRedirects.new("redirect limit exceeded", response) if limit < 1
      resolve_url(redirect_target(url, response), limit - 1, url, &block)
    else
      url
    end
  end

  # Internal: Perform a single request through the HTTP adapter.
  #
  # url     - URI to request.
  # referer - URI sent as the Referer header when given (default: nil).
  #
  # Returns whatever the adapter's perform_request returns.
  def request(url, referer = nil)
    connection = http_adapter.get_connection(url)
    headers = referer ? {'referer' => referer.to_s} : {}
    request = http_adapter.create_request(url, headers)
    http_adapter.perform_request(connection, request)
  end

  # Internal: Work out where a redirect response points to.
  #
  # url      - URI that was requested.
  # response - redirect response received for it.
  #
  # The Location value is resolved against `url`, so relative and
  # scheme-relative locations work as well as absolute ones.
  #
  # Returns a URI.
  # Raises HttpError when the Location header is missing or blank.
  def redirect_target(url, response)
    location = response['location'].to_s.strip
    if location.empty?
      raise HttpError.new(
        "server returned #{response.code} without a Location header",
        url, response)
    end
    normalize_url URI.join(url.to_s, location)
  end
  private :redirect_target

  # Internal: HTTP adapter for Net::HTTP to use for URL resolution.
  class HttpAdapter
    # Build (but do not open) a Net::HTTP connection for the URL's host and
    # port. HTTPS URLs get peer verification against the default cert store.
    def get_connection(url)
      http = Net::HTTP.new url.host, url.port
      http.use_ssl = url.scheme == 'https'
      if http.use_ssl?
        http.verify_mode = OpenSSL::SSL::VERIFY_PEER
        http.cert_store = cert_store
      end
      # Strict timeouts (seconds) so that lookups cannot block for long.
      http.open_timeout = 1.5
      http.read_timeout = 2
      http
    end

    # Certificate store initialized with OpenSSL's default paths.
    def cert_store
      store = OpenSSL::X509::Store.new
      store.set_default_paths
      store
    end

    # Build a HEAD request for the URL's path and query with given headers.
    def create_request(url, headers = {})
      Net::HTTP::Head.new url.request_uri, headers
    end

    # Open the connection, send the request and return the response.
    def perform_request(connection, request)
      connection.start do |http|
        http.request request
      end
    end
  end
end
### END implementation; begin tests ###
# Self-test: runs only when this file is executed directly.
if $0 == __FILE__
  require 'test/unit'

  # Exercises UrlResolver against a scripted HTTP adapter; no network access.
  class UrlResolverTest < Test::Unit::TestCase
    def setup
      @http = TestHttpAdapter.new
    end

    # Resolve `url` using the scripted adapter prepared in setup.
    def resolve url
      UrlResolver.new(url, @http).resolve
    end

    # HTTP adapter that answers requests from a queue of expectations
    # instead of performing them.
    class TestHttpAdapter < UrlResolver::HttpAdapter
      def initialize
        super
        @expectations = []
      end

      # Queue a handler for the next request. The handler is called with
      # (connection, request) and must return the response.
      #
      # The block is captured explicitly: `Proc.new` without a block no
      # longer captures the caller's block on Ruby 3.0+.
      def expect_request expectation = nil, &block
        handler = expectation || block
        raise ArgumentError, "expectation or block required" unless handler
        @expectations << handler
      end

      # Answer with the oldest handler; the last one is kept and reused for
      # any further requests (needed by the endless redirect test).
      def perform_request connection, request
        response = @expectations.first.call(connection, request)
        @expectations.shift if @expectations.size > 1
        response
      end
    end

    # Minimal stand-in for a Net::HTTP response.
    MockResponse = Struct.new(:code, :message, :headers) do
      def [](name) headers[name] end
      def to_hash() headers end
    end

    # Build a MockResponse with String code; 4xx codes get the message
    # 'Not Found', everything else 'OK'.
    def mock_response code, headers = {}
      message = (400...500) === code ? 'Not Found' : 'OK'
      MockResponse.new(code.to_s, message, headers)
    end

    def test_direct_url
      @http.expect_request do |http, request|
        assert_equal 'disney.com', http.address
        assert !http.use_ssl?
        assert_equal '/pluto', request.path
        mock_response 200
      end
      resolution = resolve 'http://disney.com/pluto'
      assert !resolution.failed?, "expected not to have failed"
      assert !resolution.changed?, "expected not to have changed"
      assert resolution.final_url.respond_to?(:host)
      assert_equal 'http://disney.com/pluto', resolution.final_url.to_s
      assert_equal 0, resolution.num_redirects
    end

    def test_failed_resolve
      @http.expect_request do |http, request|
        raise "boom!"
      end
      resolution = resolve 'http://disney.com'
      assert resolution.failed?
      assert_equal "boom!", resolution.failed_reason
    end

    def test_endless_redirect
      @http.expect_request do |http, request|
        mock_response 301, 'location' => 'http://disney.com'
      end
      resolution = resolve 'http://t.co/short'
      assert resolution.failed?, "expected to have failed"
      assert_equal "redirect limit exceeded", resolution.failed_reason
      assert_equal 5, resolution.num_redirects
      assert_equal 'http://disney.com', resolution.final_url.to_s
    end

    def test_normal_redirect
      @http.expect_request do |http, request|
        assert_equal 't.co', http.address
        mock_response 301, 'location' => 'http://disney.com/pluto'
      end
      @http.expect_request do |http, request|
        assert_equal 'disney.com', http.address
        assert_equal '/pluto', request.path
        mock_response 200
      end
      resolution = resolve 'http://t.co/short'
      assert !resolution.failed?, "expected not to have failed"
      assert resolution.changed?, "expected to have changed"
      assert_equal 'http://disney.com/pluto', resolution.final_url.to_s
      assert_equal 1, resolution.num_redirects
    end

    def test_redirect_to_dead_url
      @http.expect_request do |http, request|
        mock_response 301, 'location' => 'http://disney.com/pluto'
      end
      @http.expect_request do |http, request|
        mock_response 404
      end
      resolution = resolve 'http://t.co/short'
      assert resolution.failed?, "expected to have failed"
      assert resolution.dead?, "expected to be dead"
      assert_equal "server returned 404 Not Found", resolution.failed_reason
      assert resolution.changed?, "expected to have changed"
      assert_equal 'http://disney.com/pluto', resolution.final_url.to_s
      assert_equal 1, resolution.num_redirects
    end

    def test_multiple_redirects
      @http.expect_request do |http, request|
        mock_response 301, 'location' => 'http://disney.com/pluto'
      end
      @http.expect_request do |http, request|
        assert_equal '/pluto', request.path
        mock_response 301, 'location' => 'http://disney.com'
      end
      @http.expect_request do |http, request|
        assert_equal '/', request.path
        mock_response 200
      end
      resolution = resolve 'http://t.co/short'
      assert !resolution.failed?, "expected not to have failed"
      assert resolution.changed?, "expected to have changed"
      assert_equal 'http://disney.com', resolution.final_url.to_s
      assert_equal 2, resolution.num_redirects
    end

    def test_ssl
      @http.expect_request do |http, request|
        mock_response 301, 'location' => 'https://disney.com/pluto'
      end
      @http.expect_request do |http, request|
        assert http.use_ssl?, "expected to use SSL"
        assert_equal 443, http.port
        assert_equal '/pluto', request.path
        mock_response 200
      end
      resolution = resolve 'http://t.co/short'
      assert_equal 'https://disney.com/pluto', resolution.final_url.to_s
    end

    def test_referer
      @http.expect_request do |http, request|
        assert_nil request['referer'], "expected blank referer"
        mock_response 301, 'location' => 'http://disney.com/pluto'
      end
      @http.expect_request do |http, request|
        assert_equal 'http://t.co/short', request['referer']
        mock_response 200
      end
      resolution = resolve 'http://t.co/short'
      assert_equal 'http://disney.com/pluto', resolution.final_url.to_s
    end

    def test_response
      @http.expect_request do |http, request|
        mock_response 200, 'ETag' => 'hi!'
      end
      resolution = resolve 'http://disney.com/pluto'
      assert_equal '200', resolution.response.code
      assert_equal 200, resolution.response_code
      assert_equal({'ETag' => 'hi!'}, resolution.response_headers)
    end

    def test_failed_response
      @http.expect_request do |http, request|
        mock_response 503
      end
      resolution = resolve 'http://disney.com/pluto'
      assert_equal '503', resolution.response.code
      assert_equal 503, resolution.response_code
    end

    def test_exception_response
      @http.expect_request do |http, request|
        mock_response 301, 'location' => 'http://disney.com'
      end
      @http.expect_request do |http, request|
        raise "boom!"
      end
      resolution = resolve 'http://disney.com/pluto'
      assert_equal 500, resolution.response_code
      assert_equal({}, resolution.response_headers)
      assert_nil resolution.response
    end
  end
end
@datenimperator
Copy link

Should it just track redirects w/ code 301, or maybe 302, 303, 307, 308 also? There are so many misconfigured web servers sending false redirect codes…

@magedmakled
Copy link

👍 Thanks

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment