Last active
May 20, 2020 13:48
-
-
Save mislav/3596663 to your computer and use it in GitHub Desktop.
RESOLVE SHORT URLS before storing. Short URLs are for microblogging; you should never actually keep them around.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'net/http' | |
# WARNING: do not use this; it works but is very limited.
# Naive single-hop resolver, kept only as a counter-example.
#
# url - String or URI to look up.
#
# Issues one GET request. Returns the raw Location header value when the
# server answers with a 301; otherwise returns the given URL as a String.
def resolve(url)
  response = Net::HTTP.get_response(URI(url))
  return response['location'] if response.code == '301'

  url.to_s
end
# Why the above method is inadequate:
# - doesn't handle multiple redirects
# - uses HTTP GET instead of HEAD (slower, wastes bandwidth)
# - no handling of HTTP errors or Ruby exceptions
# - no HTTPS support
# - no strict timeouts (lookups can block for too long)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Author: Mislav Marohnić | |
# License: MIT http://mislav.mit-license.org | |
require 'uri' | |
require 'net/https' | |
# Public: Service that resolves URLs to their final destination.
#
# Examples
#
#   res = UrlResolver.resolve(url)
#
#   if res.dead?
#     abort "dead link"
#   elsif res.failed?
#     abort res.failed_reason
#   elsif res.changed?
#     puts "-> #{res.final_url} (#{res.num_redirects} redirects)"
#   else
#     warn "URL is direct"
#     p res.response_code
#     p res.response_headers
#   end
class UrlResolver
  # Internal: HTTP status codes that are followed as redirects. Only the
  # permanent redirects (301 and 308) are followed; any other non-error
  # status, including 302/303/307, ends the resolution at the current URL.
  REDIRECT_CODES = [301, 308].freeze

  # Public: Resolve a URL
  #
  # url          - String or URI
  # http_adapter - a HTTP adapter to make requests with
  #                (default: HttpAdapter.new)
  #
  # Returns a Resolution.
  def self.resolve(url, http_adapter = HttpAdapter.new)
    new(url, http_adapter).resolve
  end

  attr_reader :url, :http_adapter

  # Public: Initialize a resolver.
  #
  # url          - String or URI; Strings are parsed with URI().
  # http_adapter - object responding to get_connection, create_request and
  #                perform_request (see HttpAdapter).
  def initialize(url, http_adapter)
    @url = normalize_url(url)
    @http_adapter = http_adapter
  end

  # Public: Perform URL resolution
  #
  # limit - Integer representing the maximum number of redirects
  #         (default: 5)
  #
  # All exceptions are caught and available as Resolution#error.
  #
  # Returns a Resolution.
  def resolve(limit = 5)
    resolution = Resolution.new(url)
    begin
      resolve_url(url, limit) do |new_url, response|
        resolution.final_url = new_url
        resolution.response = response
        resolution.num_requests += 1
      end
    rescue => error
      # Keep the last response only when the error itself carries one;
      # otherwise the stored response belongs to an earlier hop.
      resolution.response = nil unless error.respond_to?(:response)
      resolution.error = error
    end
    resolution
  end

  # Public: The result of a URL resolution.
  class Resolution
    attr_reader :original_url
    attr_accessor :final_url, :num_requests, :response, :error

    # url - the normalized URL the resolution started from.
    def initialize(url)
      @final_url = @original_url = url
      @num_requests = 0
      @response = @error = nil
    end

    # Public: Number of redirects followed.
    #
    # Returns an Integer that is never negative, even when the very first
    # request failed before any response was recorded.
    def num_redirects
      [num_requests - 1, 0].max
    end

    # Public: Returns the Integer status code of the last response, or 500
    # when no response is available.
    def response_code
      response ? response.code.to_i : 500
    end

    # Public: Returns the headers of the last response as a Hash, or an
    # empty Hash when no response is available.
    def response_headers
      response ? response.to_hash : {}
    end

    # Public: Returns true if an error was recorded, false otherwise.
    def failed?
      !error.nil?
    end

    # Public: Returns the String message of the recorded error, or nil when
    # the resolution did not fail.
    def failed_reason
      error && error.message
    end

    # Public: Returns true if the final URL differs from the original.
    def changed?
      original_url != final_url
    end

    # Public: Returns true if the recorded error reports the resource as
    # not found (see HttpError#not_found?), false otherwise.
    def dead?
      error.respond_to?(:not_found?) && error.not_found?
    end
  end

  # Public: Raised when the redirect limit is exhausted.
  class TooManyRedirects < StandardError
    attr_reader :response

    def initialize(msg, response)
      super(msg)
      @response = response
    end
  end

  # Public: Raised for 4xx/5xx responses and for redirects that lack a
  # Location header.
  class HttpError < StandardError
    attr_reader :request_url, :response

    def initialize(msg, request_url, response)
      super(msg)
      @request_url = request_url
      @response = response
    end

    # Public: Returns true for 404 and 410 responses.
    def not_found?
      [404, 410].include?(response_code)
    end

    # Public: Returns the Integer status code of the failed response.
    def response_code
      response.code.to_i
    end
  end

  # Internal: Returns url unchanged if it already responds to #host,
  # otherwise parses its String form with URI().
  def normalize_url(url)
    url.respond_to?(:host) ? url : URI(url.to_s)
  end

  # Internal: Request url and follow redirects recursively.
  #
  # url     - URI to request.
  # limit   - Integer number of redirects still allowed.
  # referer - URI of the previous hop, sent as the Referer header (optional).
  # block   - called with (url, response) after every request.
  #
  # Returns the final URI.
  # Raises HttpError on a 4xx/5xx response or a redirect without Location.
  # Raises TooManyRedirects when a redirect arrives and limit is below 1.
  def resolve_url(url, limit, referer = nil, &block)
    response = request(url, referer)
    yield url, response if block_given?

    case response.code.to_i
    when 400...600
      raise HttpError.new(
        "server returned #{response.code} #{response.message}",
        url, response)
    when *REDIRECT_CODES
      raise TooManyRedirects.new("redirect limit exceeded", response) if limit < 1
      resolve_url(redirect_target(url, response), limit - 1, url, &block)
    else
      url
    end
  end

  # Internal: Compute the absolute URI a redirect response points to.
  #
  # The Location header may be a relative reference, so it is resolved
  # against the URL that was just requested.
  #
  # Returns a URI.
  # Raises HttpError when the response has no usable Location header.
  def redirect_target(url, response)
    location = response['location'].to_s.strip
    if location.empty?
      raise HttpError.new(
        "server returned #{response.code} without a Location header",
        url, response)
    end
    URI.join(url.to_s, location)
  end

  # Internal: Perform a single request through the HTTP adapter.
  #
  # Returns whatever the adapter's perform_request returns.
  def request(url, referer = nil)
    connection = http_adapter.get_connection(url)
    headers = referer ? {'referer' => referer.to_s} : {}
    request = http_adapter.create_request(url, headers)
    http_adapter.perform_request(connection, request)
  end

  # Internal: HTTP adapter for Net::HTTP to use for URL resolution.
  class HttpAdapter
    # Seconds to wait for the connection to open.
    OPEN_TIMEOUT = 1.5
    # Seconds to wait for a response to be read.
    READ_TIMEOUT = 2

    # Build an unstarted Net::HTTP connection for url, with peer
    # verification enabled for https URLs and strict timeouts.
    def get_connection(url)
      http = Net::HTTP.new(url.host, url.port)
      http.use_ssl = url.scheme == 'https'
      if http.use_ssl?
        http.verify_mode = OpenSSL::SSL::VERIFY_PEER
        http.cert_store = cert_store
      end
      http.open_timeout = OPEN_TIMEOUT
      http.read_timeout = READ_TIMEOUT
      http
    end

    # Returns an OpenSSL::X509::Store loaded with the default CA paths.
    def cert_store
      store = OpenSSL::X509::Store.new
      store.set_default_paths
      store
    end

    # Build a HEAD request for url carrying the given headers.
    def create_request(url, headers = {})
      Net::HTTP::Head.new(url.request_uri, headers)
    end

    # Start the connection, send the request and return the response.
    def perform_request(connection, request)
      connection.start { |http| http.request(request) }
    end
  end
end
### END implementation; begin tests ### | |
# Self-test: runs only when this file is executed directly.
if $0 == __FILE__
  require 'test/unit'

  class UrlResolverTest < Test::Unit::TestCase
    def setup
      @http = TestHttpAdapter.new
    end

    # Resolve url through the scripted adapter created in setup.
    def resolve(url)
      UrlResolver.new(url, @http).resolve
    end

    # HTTP adapter that answers requests from a queue of scripted
    # expectations instead of touching the network.
    class TestHttpAdapter < UrlResolver::HttpAdapter
      def initialize
        super
        @expectations = []
      end

      # Queue a handler for the next request. Accepts either a callable or a
      # block; each handler receives (connection, request) and returns the
      # response. The block is captured explicitly because a bare `Proc.new`
      # without a block raises on Ruby 3.0 and later.
      def expect_request(expectation = nil, &block)
        handler = expectation || block
        raise ArgumentError, 'expectation or block required' unless handler
        @expectations << handler
      end

      # Answer with the oldest queued handler; the last handler is kept so it
      # keeps answering any further requests.
      def perform_request(connection, request)
        response = @expectations.first.call(connection, request)
        @expectations.shift if @expectations.size > 1
        response
      end
    end

    # Minimal stand-in for the parts of a response the resolver reads.
    MockResponse = Struct.new(:code, :message, :headers) do
      def [](name)
        headers[name]
      end

      def to_hash
        headers
      end
    end

    # Build a MockResponse with the given Integer code and headers.
    def mock_response(code, headers = {})
      message = (400...500).cover?(code) ? 'Not Found' : 'OK'
      MockResponse.new(code.to_s, message, headers)
    end

    def test_direct_url
      @http.expect_request do |http, request|
        assert_equal 'disney.com', http.address
        assert !http.use_ssl?
        assert_equal '/pluto', request.path
        mock_response 200
      end

      resolution = resolve 'http://disney.com/pluto'
      assert !resolution.failed?, "expected not to have failed"
      assert !resolution.changed?, "expected not to have changed"
      assert resolution.final_url.respond_to?(:host)
      assert_equal 'http://disney.com/pluto', resolution.final_url.to_s
      assert_equal 0, resolution.num_redirects
    end

    def test_failed_resolve
      @http.expect_request do |http, request|
        raise "boom!"
      end

      resolution = resolve 'http://disney.com'
      assert resolution.failed?
      assert_equal "boom!", resolution.failed_reason
    end

    def test_endless_redirect
      @http.expect_request do |http, request|
        mock_response 301, 'location' => 'http://disney.com'
      end

      resolution = resolve 'http://t.co/short'
      assert resolution.failed?, "expected to have failed"
      assert_equal "redirect limit exceeded", resolution.failed_reason
      assert_equal 5, resolution.num_redirects
      assert_equal 'http://disney.com', resolution.final_url.to_s
    end

    def test_normal_redirect
      @http.expect_request do |http, request|
        assert_equal 't.co', http.address
        mock_response 301, 'location' => 'http://disney.com/pluto'
      end
      @http.expect_request do |http, request|
        assert_equal 'disney.com', http.address
        assert_equal '/pluto', request.path
        mock_response 200
      end

      resolution = resolve 'http://t.co/short'
      assert !resolution.failed?, "expected not to have failed"
      assert resolution.changed?, "expected to have changed"
      assert_equal 'http://disney.com/pluto', resolution.final_url.to_s
      assert_equal 1, resolution.num_redirects
    end

    def test_redirect_to_dead_url
      @http.expect_request do |http, request|
        mock_response 301, 'location' => 'http://disney.com/pluto'
      end
      @http.expect_request do |http, request|
        mock_response 404
      end

      resolution = resolve 'http://t.co/short'
      assert resolution.failed?, "expected to have failed"
      assert resolution.dead?, "expected to be dead"
      assert_equal "server returned 404 Not Found", resolution.failed_reason
      assert resolution.changed?, "expected to have changed"
      assert_equal 'http://disney.com/pluto', resolution.final_url.to_s
      assert_equal 1, resolution.num_redirects
    end

    def test_multiple_redirects
      @http.expect_request do |http, request|
        mock_response 301, 'location' => 'http://disney.com/pluto'
      end
      @http.expect_request do |http, request|
        assert_equal '/pluto', request.path
        mock_response 301, 'location' => 'http://disney.com'
      end
      @http.expect_request do |http, request|
        assert_equal '/', request.path
        mock_response 200
      end

      resolution = resolve 'http://t.co/short'
      assert !resolution.failed?, "expected not to have failed"
      assert resolution.changed?, "expected to have changed"
      assert_equal 'http://disney.com', resolution.final_url.to_s
      assert_equal 2, resolution.num_redirects
    end

    def test_ssl
      @http.expect_request do |http, request|
        mock_response 301, 'location' => 'https://disney.com/pluto'
      end
      @http.expect_request do |http, request|
        assert http.use_ssl?, "expected to use SSL"
        assert_equal 443, http.port
        assert_equal '/pluto', request.path
        mock_response 200
      end

      resolution = resolve 'http://t.co/short'
      assert_equal 'https://disney.com/pluto', resolution.final_url.to_s
    end

    def test_referer
      @http.expect_request do |http, request|
        assert_nil request['referer'], "expected blank referer"
        mock_response 301, 'location' => 'http://disney.com/pluto'
      end
      @http.expect_request do |http, request|
        assert_equal 'http://t.co/short', request['referer']
        mock_response 200
      end

      resolution = resolve 'http://t.co/short'
      assert_equal 'http://disney.com/pluto', resolution.final_url.to_s
    end

    def test_response
      @http.expect_request do |http, request|
        mock_response 200, 'ETag' => 'hi!'
      end

      resolution = resolve 'http://disney.com/pluto'
      assert_equal '200', resolution.response.code
      assert_equal 200, resolution.response_code
      assert_equal({'ETag' => 'hi!'}, resolution.response_headers)
    end

    def test_failed_response
      @http.expect_request do |http, request|
        mock_response 503
      end

      resolution = resolve 'http://disney.com/pluto'
      assert_equal '503', resolution.response.code
      assert_equal 503, resolution.response_code
    end

    def test_exception_response
      @http.expect_request do |http, request|
        mock_response 301, 'location' => 'http://disney.com'
      end
      @http.expect_request do |http, request|
        raise "boom!"
      end

      resolution = resolve 'http://disney.com/pluto'
      assert_equal 500, resolution.response_code
      assert_equal({}, resolution.response_headers)
      assert_nil resolution.response
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Should it just track redirects w/ code 301, or maybe 302, 303, 307, 308 also? There are so many misconfigured web servers sending false redirect codes…