Created
June 21, 2010 02:06
-
-
Save afiore/446302 to your computer and use it in GitHub Desktop.
Save a webpage (and its linked assets) into a single HTML file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env php | |
<?php | |
/*** | |
* PageSnap | |
* | |
* Script for archiving an HTML page (and associated media assets) into a single file. | |
* | |
* Requires: | |
* | |
* - PhpQuery: http://code.google.com/p/phpquery | |
* - Parallel Curl: http://github.com/petewarden/parallelcurl | |
*/ | |
require 'parallelcurl.php'; | |
require 'phpQuery.php'; | |
/**
 * Archives a web page into a single HTML file by replacing the URLs of its
 * linked assets (stylesheets, scripts, images, ...) with base64 data URLs.
 *
 * Usage: construct with the page URL, then call fetch().
 */
class PageSnap {
    /** URL of the page being archived. */
    public $url = null;

    /** Name of the output file; only its basename is used when saving. */
    public $filename = null;

    /** phpQuery document built from the downloaded page. */
    private $doc = null;

    // Associative array mapping each absolute asset URL to the list of
    // DOM elements that reference it.
    private $asset_urls = array();

    // Set (url => true) of asset URLs that have already been inlined.
    private $fetched_urls = array();

    /** ParallelCurl instance used to download the assets concurrently. */
    private $multi_curl = null;

    /** CSS selectors of the elements whose href/src attributes are inlined. */
    public $selectors = array(
        'link[href]',
        'script[src]',
        'img[src]',
        'object[src]'
        //TODO: iframe frame
    );

    /**
     * Downloads and parses the page.
     *
     * @param string      $url      URL of the page to archive.
     * @param string|null $filename Output file name. When omitted it is derived
     *                              from the URL: scheme and leading "www." are
     *                              removed and ".html" is appended.
     * @throws RuntimeException when the page cannot be downloaded.
     */
    function __construct($url, $filename = null) {
        $this->url = $url;

        if ($filename) {
            $this->filename = $filename;
        } else {
            // Trailing slashes are dropped so that basename() of the result
            // is not reduced to a bare ".html".
            $this->filename = preg_replace('@^https?://(www\.)?@i', '', rtrim($url, '/')) . '.html';
        }

        $html = file_get_contents($url);
        if ($html === false) {
            throw new RuntimeException("Unable to download {$url}");
        }
        $this->doc = phpQuery::newDocument($html);

        $this->multi_curl = new ParallelCurl(10, array(
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_USERAGENT => "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1",
        ));
    }

    /**
     * Parallel Curl callback: replaces the href/src attributes of the given
     * element(s) with a base64 data URL built from the response.
     *
     * @param string           $response_body Body of the downloaded asset.
     * @param string           $url           URL that was requested.
     * @param resource         $ch            Curl handle of the finished request.
     * @param DOMElement|array $element       Element, or list of elements,
     *                                        referencing the asset.
     */
    function set_data_url($response_body, $url, $ch, $element) {
        if (isset($this->fetched_urls[$url])) {
            return;
        }
        file_put_contents('php://stderr', "fetching {$url}\n");

        // Leave the original reference untouched when the download failed,
        // rather than embedding an error page or an empty payload.
        $status = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        if ($response_body === false || $response_body === null || $response_body === '' || $status >= 400) {
            file_put_contents('php://stderr', "skipping {$url} (download failed)\n");
            return;
        }

        // Keep only the media type, dropping parameters such as "; charset=...".
        $content_type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
        $mime = $content_type
            ? trim(preg_replace('/;.*$/', '', $content_type))
            : 'application/octet-stream';
        $data_url = 'data:' . $mime . ';base64,' . base64_encode($response_body);

        $elements = is_array($element) ? $element : array($element);
        foreach ($elements as $node) {
            foreach (array('href', 'src') as $attr) {
                if ($node->hasAttribute($attr)) {
                    pq($node)->attr($attr, $data_url);
                }
            }
        }
        $this->fetched_urls[$url] = true;
    }

    /**
     * Resolve a (possibly relative) asset URL against the URL of the page.
     *
     * Absolute URLs are returned unchanged, protocol-relative URLs ("//host/x")
     * inherit the page scheme, root-relative URLs ("/x") are attached to the
     * page origin, and anything else is resolved against the page directory.
     * Dot segments ("./", "../") are left in place.
     *
     * Based on code by Stefano Faenza:
     * http://www.stefanoforenza.com/how-to-build-an-absolute-url-in-php/
     *
     * @param string $u Asset URL as found in the document.
     * @param string $p URL of the page containing the reference.
     * @return string Absolute URL of the asset.
     */
    private function absolute_url($u, $p) {
        $u = trim($u);
        $url = parse_url($u);
        $page = parse_url($p);
        if (!is_array($url)) {
            $url = array();
        }
        if (!is_array($page)) {
            $page = array();
        }

        $scheme = isset($page['scheme']) ? $page['scheme'] : 'http';

        // A host component means the URL is absolute or protocol-relative.
        if (isset($url['host'])) {
            return isset($url['scheme']) ? $u : $scheme . ':' . $u;
        }

        $origin = $scheme . '://' . (isset($page['host']) ? $page['host'] : '');
        if (isset($page['port'])) {
            $origin .= ':' . $page['port'];
        }

        // Root-relative: attach directly to the page origin.
        if (strpos($u, '/') === 0) {
            return $origin . $u;
        }

        // Relative: prepend the directory part of the page path
        // (everything up to and including its last slash).
        $directory = '/';
        if (isset($page['path'])) {
            $last_slash = strrpos($page['path'], '/');
            if ($last_slash !== false) {
                $directory = substr($page['path'], 0, $last_slash + 1);
            }
        }
        return $origin . $directory . $u;
    }

    /**
     * Maps asset URLs to the DOM elements that reference them.
     *
     * Rebuilds $this->asset_urls from scratch. Elements carrying a rel
     * attribute are only considered when it denotes a stylesheet or an icon;
     * empty references and existing data: URLs are ignored.
     *
     * @param string $url URL of the page, used to resolve relative references.
     */
    private function get_urls($url) {
        $this->asset_urls = array();
        $wanted_rels = array('shortcut icon', 'icon', 'stylesheet');

        foreach ($this->selectors as $selector) {
            foreach ($this->doc->find($selector) as $element) {
                // Move on to the next element if the link is neither an icon nor a stylesheet.
                $rel = strtolower(trim($element->getAttribute('rel')));
                if ($rel !== '' && !in_array($rel, $wanted_rels)) {
                    continue;
                }
                foreach (array('src', 'href') as $attr) {
                    if (!$element->hasAttribute($attr)) {
                        continue;
                    }
                    $reference = trim($element->getAttribute($attr));
                    // Nothing to download for empty or already inlined references.
                    if ($reference === '' || stripos($reference, 'data:') === 0) {
                        continue;
                    }
                    $absolute_url = $this->absolute_url($reference, $url);
                    $this->asset_urls[$absolute_url][] = $element;
                }
            }
        }
    }

    /**
     * Updates the DOM with the data URLs of its linked assets and saves the
     * document in the current working directory.
     *
     * @param string|null $url Base URL used to resolve relative references;
     *                         defaults to the URL given to the constructor.
     * @throws RuntimeException when the output file cannot be written.
     */
    function fetch($url = null) {
        $url or $url = $this->url;
        $this->get_urls($url);

        foreach ($this->asset_urls as $u => $elements) {
            if (isset($this->fetched_urls[$u])) {
                continue;
            }
            $this->multi_curl->startRequest($u, array($this, 'set_data_url'), $elements);
        }
        $this->multi_curl->finishAllRequests();

        //TODO: gzip file
        $target = basename($this->filename);
        if (file_put_contents($target, $this->doc->html()) === false) {
            throw new RuntimeException("Unable to write {$target}");
        }
    }
}
// Command line entry point: php pagesnap.php <url> [output-file]
if (!isset($argv[1]) || $argv[1] === '') {
    $script = isset($argv[0]) ? $argv[0] : 'pagesnap';
    file_put_contents('php://stderr', "Usage: {$script} <url> [output-file]\n");
    exit(1);
}
try {
    // The output file name is optional; PageSnap derives one from the URL when it is null.
    $p = new PageSnap($argv[1], isset($argv[2]) ? $argv[2] : null);
    $p->fetch();
} catch (Exception $e) {
    // Report the failure on stderr and signal it through the exit status.
    file_put_contents('php://stderr', $e->getMessage() . "\n");
    exit(1);
}
?> |
I would love a web service that ran this. Great idea!
Can you work on a solution for CSS files that @import other CSS files? Is there any workaround for that?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Although this script is itself smart, a better name would make it even better!