Skip to content

Instantly share code, notes, and snippets.

@fernandog
Created July 11, 2017 14:01
Show Gist options
  • Save fernandog/4d79edcd72d2071dd4db2d3d2987ce69 to your computer and use it in GitHub Desktop.
Save fernandog/4d79edcd72d2071dd4db2d3d2987ce69 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup, FeatureNotFound
import urllib, sys
from lxml import etree
# Diagnostic script: report the interpreter and lxml/libxml2/libxslt versions,
# then fetch a saved HTML page and run a CSS selector over it with lxml.
from contextlib import closing

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

# (label, value) pairs printed as an aligned two-column table.
for label, version in (
    ('Python', sys.version_info),
    ('lxml.etree', etree.LXML_VERSION),
    ('libxml used', etree.LIBXML_VERSION),
    ('libxml compiled', etree.LIBXML_COMPILED_VERSION),
    ('libxslt used', etree.LIBXSLT_VERSION),
    ('libxslt compiled', etree.LIBXSLT_COMPILED_VERSION),
):
    print("%-20s: %s" % (label, version))

# closing() guarantees the HTTP response is released even if read() raises;
# it is used instead of a bare ``with`` because Python 2 responses are not
# context managers.
with closing(urlopen('https://gist.githubusercontent.com/fernandog/0087279c50576d1182b4eea9b80f4325/raw/3d28cd038dbae3ec296433063d021b6b3b4b811b/addic7ed.html')) as response:
    html = response.read()

soup = BeautifulSoup(html, 'lxml')
# Print the first link under ``td.version > h3`` whose href starts with
# "/show/". print() with a single argument behaves the same on Python 2 and 3.
print(soup.select('td.version > h3 > a[href^="/show/"]')[0])
class ParserBeautifulSoup(BeautifulSoup):
    """A ``BeautifulSoup`` that is built with the first usable parser.

    :param markup: markup passed through to ``BeautifulSoup``.
    :param parsers: iterable of parser names, tried in order.
    :param kwargs: extra keyword arguments forwarded to ``BeautifulSoup``;
        ``features`` and ``builder`` are rejected.
    :raises ValueError: if `parsers` contains a feature name instead of a
        parser name, or if ``features``/``builder`` is passed in `kwargs`.
    :raises FeatureNotFound: if none of the requested parsers can be used.

    """

    def __init__(self, markup, parsers, **kwargs):
        # Materialize once: `parsers` is walked twice below (validation, then
        # selection), which would silently exhaust a one-shot iterable.
        parsers = list(parsers)

        # reject features
        if set(parsers).intersection({'fast', 'permissive', 'strict', 'xml', 'html', 'html5'}):
            raise ValueError('Features not allowed, only parser names')

        # reject some kwargs
        if 'features' in kwargs:
            raise ValueError('Cannot use features kwarg')
        if 'builder' in kwargs:
            raise ValueError('Cannot use builder kwarg')

        # pick the first parser available
        for parser in parsers:
            try:
                super(ParserBeautifulSoup, self).__init__(markup, parser, **kwargs)
            except FeatureNotFound:
                # this parser is not available, try the next one
                continue
            else:
                return

        raise FeatureNotFound('None of the requested parsers is available: %r' % (parsers,))
# Run the lookup again through ParserBeautifulSoup restricted to the 'lxml'
# parser, using the same selector as the plain BeautifulSoup lookup earlier
# in the script.
soup = ParserBeautifulSoup(html, ['lxml'])
# print() with a single argument behaves the same on Python 2 and 3.
print(soup.select('td.version > h3 > a[href^="/show/"]')[0])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment