Created
March 27, 2010 12:02
-
-
Save mro/345983 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# | |
# Scrape the Bayern 2 programme page.
# | |
require 'time' | |
# sudo gem install scrapi | |
# | |
# http://exceptionz.wordpress.com/2009/11/03/scrapi-on-snow-leopard/ | |
require 'scrapi' | |
require 'cgi' | |
require 'sqlite3' | |
# Helpers shared by the scraper: request/parser options and HTML entity
# clean-up for scraped text.
class Tools
  # Named entities that CGI.unescapeHTML may leave untouched, mapped to the
  # literal character each one stands for.
  # NOTE(review): the entity names were reconstructed from the replacement
  # characters; confirm the list against the pages actually scraped.
  EXTRA_ENTITIES = {
    '&nbsp;'  => ' ',
    '&apos;'  => "'",
    '&ndash;' => '–',
    '&lsquo;' => '‘',
    '&rsquo;' => '’',
    '&ldquo;' => '“',
    '&rdquo;' => '”',
    '&bdquo;' => '„',
    '&hellip;' => '…',
    '&euro;'  => '€',
    '&Delta;' => 'Δ'
  }
  # Single pattern matching any of the entities above.
  ENTITY_PATTERN = Regexp.union(*EXTRA_ENTITIES.keys)

  # Options hash handed to the scraper: a browser-like user agent plus the
  # HTML parser settings (latin1 in, utf8 out, XHTML output).
  # NOTE(review): :true / :false are symbols, not booleans; presumably the
  # parser stringifies them - confirm before changing.
  def self.scrape_options
    {
      :user_agent => 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_7; de-de) AppleWebKit/530.19.2 (KHTML, like Gecko) Version/4.0.2 Safari/530.19',
      :parser_options => {
        "input-encoding" => :latin1,
        "output-encoding" => :utf8,
        'preserve-entities' => :false,
        "quote-marks" => :true,
        'output-xhtml' => :true
      }
    }
  end

  # Decodes HTML entities in +html+.
  #
  # First applies CGI.unescapeHTML, then replaces the named entities listed
  # in EXTRA_ENTITIES. Returns nil when +html+ is nil, otherwise a new String.
  def self.unescapeHTML html
    return html if html.nil?
    decoded = CGI.unescapeHTML html
    decoded.gsub(ENTITY_PATTERN) { |entity| EXTRA_ENTITIES[entity] }
  end
end
# Scraper for the Bayern 2 radio programme page plus a small SQLite cache
# of the scraped broadcasts (table "programm").
class Bayern2
  # Seconds per day, used for day arithmetic on Time values.
  ONE_DAY = 24 * 60 * 60

  # Turns one scraped table row into a broadcast hash.
  #
  # raw - object responding to start, title, href and description.
  # now - Time the programme page was requested for.
  #
  # Returns a Hash with :title and :description (nil when empty), :href when
  # the row has a link, and :dtstart when a "HH:MM ... Uhr" time was found.
  # A row without a recognisable time is reported on stderr and gets no
  # :dtstart.
  def self.clean raw, now
    ret = {}
    m = /([0-9]{2}:[0-9]{2}).*Uhr/.match( Tools.unescapeHTML( raw.start ).to_s )
    if m.nil?
      $stderr.puts "mismatch: #{raw.start}"
    else
      # Before 05:00 the page fetched is the previous day's (see
      # programm_uri_for_date), so dates are based on that page's day.
      page_date = now
      page_date = page_date - ONE_DAY if now.hour < 5
      # Entries before close-down (05:00) belong to the following calendar day.
      page_date = page_date + ONE_DAY if m[1] < '05:00'
      ret[:dtstart] = Time.parse "#{page_date.strftime '%Y-%m-%dT'}#{m[1]}:00"
    end
    # Strip the session id so that links stay stable between scrapes.
    ret[:href] = "http://www.br-online.de#{raw.href.gsub(/;jsessionid=[A-Z0-9]+/,'')}" if raw.href
    ret[:title] = Tools.unescapeHTML raw.title
    text = Tools.unescapeHTML(raw.description.to_s)
    # Flatten line breaks, turn <br> into newlines and drop <p> tags.
    text = text.gsub("\n"," ").gsub(/ *<br *\/?> */,"\n").gsub(/ *<\/?p> */,'')
    ret[:description] = (text == '' ? nil : text)
    ret
  end

  # URI of the audio stream to record.
  def self.stream_uri
    URI.parse 'http://gffstream.ic.llnwd.net/stream/gffstream_w11a'
  end

  # URI of the programme page covering +date+ (a Time). Before 05:00
  # (close-down) the previous day's page is the relevant one.
  def self.programm_uri_for_date date
    date = date - ONE_DAY if date.hour < 5
    URI.parse "http://www.br-online.de/br/jsp/global/funktion/programmvorschau/programmfahne.jsp?programm=B2&datum=#{date.strftime '%d.%m.%Y'}"
  end

  # Scrapes the programme page for +now+ and returns the broadcasts as an
  # Array of hashes in page order. Each broadcast's :dtend is the :dtstart
  # of its successor, so the last one has no :dtend. Rows without a
  # parseable start time are skipped.
  #
  # Raises RuntimeError when start times are not strictly increasing.
  def self.programm now=Time.now
    # TODO limit access: 2009-01-31 < t < Time.now + 24h
    b2_sendung = Scraper.define do
      attr_accessor :start, :title, :href, :description
      process "th", :start => :text
      process "td > h4", :title => :text
      process "td > h4 > a", :href => '@href'
      process "td > p", :description => :element
      result :start, :title, :href, :description
    end
    b2_programm = Scraper.define do
      attr_accessor :sendungen
      array :sendungen
      process "html body div#Inhalt tbody tr", :sendungen => b2_sendung
      result :sendungen
    end
    ret = []
    prev = nil
    b2_programm.scrape(Bayern2.programm_uri_for_date(now), Tools.scrape_options).each do |raw|
      current = Bayern2.clean(raw, now)
      # Row without a start time: already reported by clean, cannot be ordered.
      next if current[:dtstart].nil?
      if prev
        raise "dtend >= dtstart: #{prev[:dtstart]} >= #{current[:dtstart]}: #{current[:title]}" if prev[:dtstart] >= current[:dtstart]
        prev[:dtend] = current[:dtstart]
      end
      ret << (prev = current)
    end
    ret
  end

  # Returns the scraped broadcast running at +now+, or nil when none matches.
  # Broadcasts without a known end (the last one on the page) never match.
  def self.sendung now=Time.now
    Bayern2.programm(now).each do |s|
      next if s[:dtend].nil?
      return s if s[:dtstart] <= now && now < s[:dtend]
    end
    nil
  end

  # Opens bayern2.sqlite next to this file, creating the programm table and
  # its unique (dtstart, dtend) index if missing. The caller must close the
  # returned database.
  def self.open_db
    db = SQLite3::Database.new( "#{File.expand_path(File.dirname(__FILE__))}/bayern2.sqlite" )
    db.execute <<-SQL
      CREATE TABLE IF NOT EXISTS programm (
        dtstart VARCHAR(28),
        dtend VARCHAR(28),
        title TEXT,
        description TEXT,
        href TEXT
      )
    SQL
    db.execute 'CREATE UNIQUE INDEX IF NOT EXISTS programm_idx ON programm ( dtstart, dtend )'
    db
  end

  # Yields an open database and closes it afterwards, even when the block
  # raises. Returns the block's value.
  def self.with_db
    db = Bayern2.open_db
    begin
      yield db
    ensure
      db.close
    end
  end
  private_class_method :with_db

  # Converts a result row (rowid, dtstart, dtend, title, description, href)
  # into a broadcast hash; NULL columns are left out.
  def self.refill_into_hash rr
    ret = { :rowid => rr[0].to_i }
    ret[:dtstart] = Time.parse(rr[1]) unless rr[1].nil?
    ret[:dtend] = Time.parse(rr[2]) unless rr[2].nil?
    ret[:title] = rr[3] unless rr[3].nil?
    ret[:description] = rr[4] unless rr[4].nil?
    ret[:href] = rr[5] unless rr[5].nil?
    ret
  end

  # Formats a Time the way it is stored in the database; nil stays nil.
  def self.iso t
    return nil if t.nil?
    t.strftime '%Y-%m-%dT%H:%M:%S%z'
  end

  # public
  # Makes sure the database holds a broadcast covering +now+; scrapes and
  # stores the programme page otherwise.
  # NOTE(review): only SQLite3::SQLException is rescued here; other SQLite3
  # errors propagate - confirm that is intended.
  def self.scrape now=Time.now
    with_db do |db|
      stamp = Bayern2.iso(now)
      # look up if we have to scrape at all?
      hits = db.execute( "SELECT rowid FROM programm WHERE dtstart <= ? AND ? < dtend", stamp, stamp )
      if hits.length == 0
        $stderr.puts "scraping #{now}..."
        begin
          db.transaction do |txn|
            Bayern2.programm(now).each do |s|
              txn.execute( "INSERT INTO programm (dtstart,dtend,title,description,href) VALUES (?,?,?,?,?)", iso(s[:dtstart]), iso(s[:dtend]), s[:title], s[:description], s[:href])
            end
            sleep 0.5
          end
        rescue SQLite3::SQLException => e
          $stderr.puts "#{e}"
        end
      end
    end
    nil
  end

  # public
  # Looks up broadcasts by rowid. +argv+ is an Array holding at most one id.
  # Returns an Array of broadcast hashes (empty for nil/empty input).
  #
  # Raises RuntimeError when more than one id is given.
  def self.findBroadcastByRowIds argv
    return [] if argv.nil? || argv.length < 1
    raise "Currently only one id allowed." if argv.length > 1
    with_db do |db|
      rows = db.execute( "SELECT rowid,dtstart,dtend,title,description,href FROM programm WHERE rowid = ?", argv[0] )
      rows.map { |rr| Bayern2.refill_into_hash(rr) }
    end
  end

  # public
  # Returns the earliest stored broadcast whose title matches the SQL LIKE
  # pattern +title+ and which ends after +now+, or nil when there is none.
  def self.findNextBroadcastByTitleLike title, now = Time.now
    with_db do |db|
      rows = db.execute( "SELECT rowid,dtstart,dtend,title,description,href FROM programm WHERE title like ? AND dtend > ? ORDER BY dtstart ASC LIMIT 1", title, Bayern2.iso(now) )
      rows.empty? ? nil : Bayern2.refill_into_hash(rows[0])
    end
  end

  # public
  # Returns the stored broadcasts with dtstart <= start and stop < dtend,
  # ordered by start time.
  def self.findBroadcastsInTimeInterval start = Time.now, stop = start
    with_db do |db|
      rows = db.execute( "SELECT rowid,dtstart,dtend,title,description,href FROM programm WHERE dtstart <= ? AND ? < dtend ORDER BY dtstart ASC", Bayern2.iso(start), Bayern2.iso(stop) )
      rows.map { |rr| Bayern2.refill_into_hash(rr) }
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby -rubygems | |
require "#{File.dirname __FILE__}/../programm.rb" | |
require 'rexml/document' | |
# refactor to remove | |
# Collects track hashes into a <stream><track>...</track></stream> document
# and prints it to the most recently used destination on flush.
class XmlWriter
  # The REXML document built so far; nil until the first track is written.
  attr_reader :xml

  def initialize
    @xml = nil
    @root = nil
    @dst = nil
  end

  # Appends one <track> whose child elements are the keys of +hash+.
  # Time values are written as '%Y-%m-%dT%H:%M:%S%z', everything else via
  # to_s. Hashes with a nil or empty :title are ignored.
  #
  # dst - IO-like object that flush will print to.
  def write hash, dst=$stdout
    return if hash[:title].nil? || hash[:title] == ''
    @xml ||= REXML::Document.new
    @root ||= REXML::Element.new('stream', @xml)
    @dst = dst
    track = REXML::Element.new 'track', @root
    hash.each do |key, value|
      elem = REXML::Element.new key.to_s, track
      elem.text = value.kind_of?(Time) ? value.strftime('%Y-%m-%dT%H:%M:%S%z') : value.to_s
    end
  end

  # Prints the document (if any) to the destination and flushes it.
  # Does nothing when no track has been written yet.
  def flush
    return if @dst.nil?
    @dst.puts @xml unless @xml.nil?
    @dst.flush
  end
end
# Writes the track description XML for one or more broadcasts to +dst+.
#
# dst        - IO-like destination.
# recordings - a Time (all stored broadcasts running at that time are
#              looked up), a single broadcast Hash, or a list of broadcast
#              hashes.
def write_recording_xml dst=$stdout, recordings=Time.now
  writer = XmlWriter.new
  recordings = Bayern2.findBroadcastsInTimeInterval(recordings) if recordings.kind_of? Time
  recordings = [ recordings ] if recordings.kind_of? Hash
  recordings.each do |bc|
    track = {
      :start => bc[:dtstart],
      :stop => bc[:dtend],
      :stream_url => Bayern2.stream_uri,
      :program_url => Bayern2.programm_uri_for_date(bc[:dtstart]),
      :album => 'B2 Zündfunk',
      :artist => 'B2 Zündfunk',
      :title => "#{bc[:title]}: #{bc[:description]}"
    }
    writer.write track, dst
  end
  writer.flush
end
# Returns the larger of +a+ and +b+ (+b+ when they compare equal).
def max a, b
  a > b ? a : b
end
# Returns the smaller of +a+ and +b+ (+a+ when they compare equal).
def min a, b
  a > b ? b : a
end
# Metadata feeder mode: started by the recording step below via
# streamripper's -E option with a broadcast rowid as argument.
# http://code.google.com/p/xstreamripper/source/browse/trunk/streamripper/fetch_external_metadata.pl
if ARGV[0] == '-fetch_external_metadata_for_broadcast_rowids'
  ARGV.shift # drop the mode switch, the remaining arguments are rowids
  broadcasts = Bayern2.findBroadcastByRowIds ARGV
  if broadcasts.length == 0
    $stderr.puts "I need rowids!"
    exit 1
  end
  if broadcasts.length > 1
    $stderr.puts "Sorry, but currently I only support one rowid."
    exit 2
  end
  bc = broadcasts[0]
  # Endless loop: the process is expected to be terminated from outside.
  loop do
    dt = bc[:dtstart].to_f - Time.now.to_f - 0 # wait until n sec before broadcast start
    if dt <= 0
      dt = bc[:dtend].to_f - Time.now.to_f + 5 # record until n sec after broadcast end
      if dt > 0
        puts "TITLE=recording\nARTIST=Bayern2\nALBUM=Zuendfunk\n."
      else
        puts "TITLE=suffix\nARTIST=Bayern2\nALBUM=Zuendfunk\n."
        dt = 1e3
      end
    end
    # Before the broadcast starts nothing is written on purpose:
    # streamripper seems to write a prefix once anyway, so the meta-data
    # must not switch before recording is actually wanted.
    $stdout.flush
    pause = max(0.01, min(10, dt) ) # poll at least every 10 seconds
    $stderr.puts "\t\tsleep for dt=#{dt} until #{Time.now + pause}"
    $stderr.flush
    sleep pause
  end
end
# Recording mode (no command line arguments): find the next Zündfunk
# broadcast, wait for it, record it with streamripper and write its track
# description.
#
# Exit codes: 0 nothing to do, 3 broadcast without end, 4 broadcast without
# start, 5 streamripper failed, 6 broadcast already over.
if ARGV[0].nil?
  Bayern2.scrape Time.now # dummy lookup to force a scrape
  # When does the next Zündfunk start - or is one already running?
  bc = Bayern2.findNextBroadcastByTitleLike '%Zündfunk%', Time.now
  exit 0 if bc.nil?
  exit 3 if bc[:dtend].nil?
  exit 4 if bc[:dtstart].nil?
  # Too far ahead (more than 6 hours): leave it to a later run.
  exit 0 if bc[:dtstart].to_f - Time.now.to_f > 6*60*60
  # wait until 60 sec before the broadcast
  dt = bc[:dtstart].to_f - Time.now.to_f - 60
  $stderr.puts "waiting until #{bc[:dtstart] - 60}"
  sleep dt if dt > 0
  # start streamripper - the rowid is handed over as parameter;
  # record until 120 sec after the broadcast end
  dt = (bc[:dtend].to_f - Time.now.to_f + 120).to_i
  exit 6 if dt <= 0
  dst_dir = File.dirname(__FILE__)
  # find new (max+1) id to use and build a filename as streamripper would
  maxid = 0
  Dir.entries(dst_dir).each do |file|
    # non-greedy prefix match:
    m = /.*?([0-9]+)-([0-9]{4})_([0-9]{2})_([0-9]{2})_([0-9]{2})_([0-9]{2})_([0-9]{2})\.mp3/.match file
    next if m.nil?
    current_id = m[1].to_i
    maxid = current_id if current_id > maxid
  end
  dst_file = "#{maxid + 1}-#{bc[:dtend].strftime '%Y_%m_%d_%H_%M_%S'}.mp3"
  dst_path = "#{dst_dir}/#{dst_file}"
  # clean incomplete (directory quoted like the paths in the command below)
  system "rm '#{dst_dir}'/incomplete/*"
  cmd = "/home/username/bin/streamripper '#{Bayern2.stream_uri}' -t -l #{dt} -u 'Mozilla' -s -d '#{dst_dir}' -a '#{dst_file}' -E '#{File.expand_path(__FILE__)} -fetch_external_metadata_for_broadcast_rowids #{bc[:rowid]}' 1> /dev/null"
  $stderr.puts cmd
  if system(cmd)
    $stderr.puts 'ripping ok!'
    # aggregate recordings
    if Dir[dst_path].length == 1
      # regular recording file written, incomplete mustn't contain recordings.
      if Dir["#{dst_dir}/incomplete/*recording.mp3"].length > 0
        $stderr.puts "ERROR: OMG, found incomplete/*recording.mp3 AND #{dst_dir}/#{dst_file}"
      end
      # otherwise we're fine - the complete recording is in place.
    else
      # no 'regular' recording, so we aggregate all recordings from incomplete
      # 'cat' is brute force, but works out fine.
      system "cat '#{dst_dir}'/incomplete/*recording.mp3 > '#{dst_path}'"
    end
    dst = "#{dst_path}.xml"
    File.open( dst, "w" ) { |f| write_recording_xml( f, bc ) }
    $stderr.puts "wrote track description #{dst}"
  else
    $stderr.puts "ripping failed #{$?}"
    exit 5
  end
end
################################################################################# | |
################################################################################# | |
################################################################################# | |
# ugly mess, but working:
# - build podcast.rss and gzip it
# Adds the entries of +hash+ to the REXML element +parent+.
#
# hash       - keys become element (or attribute) names. String, Numeric
#              and Time values become text; Hash values become nested
#              elements; other values yield an empty element.
# parent     - REXML element to add to.
# attributes - when true the entries are added as attributes of +parent+
#              instead of child elements.
# date_fmt   - strftime format for Time values, also used for nested hashes.
#
# Nested hashes under 'enclosure' and 'itunes:image' are written as
# attributes of that element.
def hash_to_xml hash, parent, attributes=false, date_fmt='%d %b %Y %H:%M:%S %z' # RFC 822 date-time
  hash.each do |key, value|
    name = key.to_s
    txt = case value
          when String then value
          when Numeric then value.to_s
          when Time then value.strftime(date_fmt)
          end
    if attributes
      parent.add_attribute name, txt
    else
      elem = REXML::Element.new( name, parent )
      if !txt.nil?
        elem.text = txt
      elsif value.kind_of? Hash
        use_atts = 'enclosure' == name || 'itunes:image' == name
        hash_to_xml( value, elem, use_atts, date_fmt )
      end
    end
  end
end
# Builds the podcast feed document.
#
# hash  - channel-level entries, written via hash_to_xml.
# items - list of episode hashes, each written as an <item>.
#
# Returns the REXML::Document holding the <rss> tree.
def create_feed hash, items
  xml = REXML::Document.new
  xml << REXML::XMLDecl.new( '1.0', 'utf-8', 'yes' )
  rss = REXML::Element.new('rss', xml)
  rss.add_attribute 'xmlns:itunes', 'http://www.itunes.com/dtds/podcast-1.0.dtd'
  rss.add_attribute 'version', '2.0'
  channel = REXML::Element.new('channel', rss)
  hash_to_xml hash, channel
  # One timestamp for both dates so that they cannot differ.
  built_at = Time.now
  meta = { :lastBuildDate => built_at, :pubDate => built_at, :generator => 'streamripper to rss ruby script (C) Marcus Rohrmoser 2010' }
  hash_to_xml meta, channel
  # Fixed category: Arts > Literature.
  cats = REXML::Element.new('itunes:category', channel)
  cats.add_attribute 'text', 'Arts'
  REXML::Element.new('itunes:category', cats).add_attribute 'text', 'Literature'
  items.each { |episode| hash_to_item episode, channel }
  xml
end
# Appends a new <item> element to +channel+ and fills it from the
# +episode+ hash via hash_to_xml.
def hash_to_item episode, channel
  item = REXML::Element.new 'item', channel
  hash_to_xml episode, item
end
# Formats a duration given in seconds (Integer or Float, fractions are
# dropped) as "H:MM:SS"; hours are not padded.
def seconds_to_rss_duration seconds
  total = seconds.to_i
  format '%d:%02d:%02d', total / 3600, total / 60 % 60, total % 60
end
# Reads a track description as written by XmlWriter.
#
# file - XML source accepted by REXML::Document.new (IO or String).
#
# Returns a Hash for the first /stream/track element, keyed by child
# element name as Symbol; 'start' and 'stop' are parsed into Time, all
# other values are Strings. Returns nil when there is no track.
def load_recording file
  record = REXML::Document.new file
  record.each_element('/stream/track') do |track|
    meta = {}
    track.each_element('*') do |elem|
      val = elem.text.to_s
      val = Time.parse val if elem.name == 'start' || elem.name == 'stop'
      meta[ elem.name.intern ] = val
    end
    $stderr.puts "loaded #{file}"
    # Only the first track is of interest.
    return meta
  end
  nil
end
##################################################################### | |
## build the rss feed | |
##################################################################### | |
# Builds the podcast feed for all recordings found in +dst_dir+.
#
# dst_dir - directory holding "<...>YYYY_MM_DD_HH_MM_SS.mp3" files, each
#           with a "<file>.xml" track description next to it.
# channel - channel-level entries handed on to create_feed.
#
# Recordings whose description is missing or lacks start/stop are reported
# on stderr and left out. Returns the document built by create_feed.
def build_rss dst_dir, channel
  dst_dir = File.expand_path dst_dir
  items = []
  Dir.entries(dst_dir).sort.each do |file|
    # mp3 files as spit out from streamripper
    next if /.*([0-9]{4}_[0-9]{2}_[0-9]{2}_[0-9]{2}_[0-9]{2}_[0-9]{2})\.mp3$/.match(file).nil?
    mp3 = "#{dst_dir}/#{file}"
    sidecar = "#{mp3}.xml"
    unless File.file? sidecar
      $stderr.puts "skipping #{mp3}: no track description #{sidecar}"
      next
    end
    # Public URL: everything up to and including ".../recorder/" is replaced.
    url = mp3.gsub(/.*\/recorder\//, "http://podcasts.example.com/")
    meta = nil
    File.open(sidecar, 'r' ) { |f| meta = load_recording f }
    unless meta.kind_of?(Hash) && meta[:start].kind_of?(Time) && meta[:stop].kind_of?(Time)
      $stderr.puts "skipping #{mp3}: incomplete track description #{sidecar}"
      next
    end
    seconds = meta[:stop] - meta[:start]
    items << {
      :description => meta[:title],
      :enclosure => { :url => url, :length => File.stat(mp3).size, :type => 'audio/mpeg' },
      :guid => url,
      :pubDate => meta[:start],
      'itunes:explicit' => 'clean',
      'itunes:duration' => seconds_to_rss_duration(seconds),
      :title => meta[:start].strftime('%A, %d. %B %Y'),
      'itunes:author' => 'Zündfunk', # overwrite id3 author causing iTunes encoding issues
      'itunes:keywords' => 'Bayern2,Zündfunk,Jugend,Pop'
    }
    $stderr.puts "analyzed #{sidecar}"
  end
  create_feed(channel, items)
end
# Channel-level metadata of the podcast feed.
channel = {
  :title => 'B2 Zündfunk',
  :language => 'de',
  'itunes:explicit' => 'clean',
  :description => 'Radiomitschnitt Bayern 2 Werktags 19:00 - 20:20.',
  :link => 'http://www.br-online.de/bayern2/zuendfunk/index.xml',
  'itunes:subtitle' => 'Die Jugendwelle im BR.',
  # 'itunes:category' => { :text => 'Society & Culture' },
  'itunes:summary' => 'zeitgenössische akustische Popkultur.',
  'itunes:author' => 'Zündfunk',
  'itunes:owner' => {
    'itunes:name' => 'My Name',
    # Placeholder address - replace with the real owner e-mail.
    'itunes:email' => 'my.name@example.com'
  },
  'itunes:image' => {
    :href => 'http://www.br-online.de/content/cms/Universalseite/2008/03/09/cumulus/BR-online-Publikation--229136-20081103160106.jpg'
  },
  :image => {
    :url => 'http://www.br-online.de/content/cms/Universalseite/2008/03/09/cumulus/BR-online-Publikation--229136-20081103160106.jpg',
    :title => 'B2 Zündfunk',
    :link => 'http://www.br-online.de/bayern2/zuendfunk/index.xml'
  }
}
# Write podcast.rss next to this script and a gzipped copy (podcast.rssz).
rss = build_rss File.dirname(__FILE__), channel
dst = "#{File.dirname(__FILE__)}/podcast.rss"
File.open(dst, "w") { |f| rss.write f }
# Paths are quoted so that directories containing spaces work.
if system "gzip --best < '#{dst}' > '#{dst}z'"
  $stderr.puts "wrote podcast #{dst}z"
else
  $stderr.puts "gzip failed for #{dst}: #{$?}"
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment