Created
February 3, 2021 16:07
-
-
Save frobnitzem/5a0784ae255ff6dadb59e5454588bf3a to your computer and use it in GitHub Desktop.
Parse an HTML table into json.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import json | |
from html.parser import HTMLParser | |
# Void elements: HTML tags that never take a closing tag, so the parsers
# below must not treat them as opening a new nesting level.
voids = {
    'area', 'base', 'br', 'col',
    'command', 'embed', 'hr', 'img',
    'input', 'keygen', 'link', 'meta',
    'param', 'source', 'track', 'wbr',
}
# This parser lists out tag paths present in the data | |
# Use it to determine what HTML tags to consider | |
# as internal nodes and leaf nodes. | |
class GetElems(HTMLParser):
    """Collect every dotted tag path that occurs in the HTML fed to it.

    After ``feed()``, ``elems`` holds one string per distinct nesting path,
    outermost tag first (for example ``'html.body.h1'``).  Void elements are
    recorded but never stay open.
    """

    def __init__(self):
        super().__init__()
        # Distinct dotted paths seen so far.
        self.elems = set()
        # Tags that are currently open, outermost first.
        self.path = []

    def handle_starttag(self, tag, attrs):
        """Record the path of ``tag`` and keep it open unless it is void."""
        self.elems.add('.'.join(self.path + [tag]))
        if tag not in voids:
            self.path.append(tag)

    def handle_endtag(self, tag):
        """Close ``tag`` together with anything left open inside it.

        End tags for void elements and end tags with no matching open tag
        are ignored, so malformed input can neither raise on an empty path
        nor shift the paths of the elements that follow.
        """
        if tag in voids:
            return
        # Search from the innermost open tag outwards for the one being closed.
        for depth in range(len(self.path) - 1, -1, -1):
            if self.path[depth] == tag:
                del self.path[depth:]
                return
# This class creates a "collapsed" HTML hierarchy. | |
# | |
# Internal nodes have the type: | |
# { 'child-tag 1': [type 1 children], 'child-tag-2': [type 2 children]} | |
# | |
# Leaf nodes have the type: | |
# [ "string1", "string2", ... ] | |
# | |
# >>> p = ElemParser(['div'], ['p']) | |
# >>> p.feed('<div><p>Test</p> <p>Parse me!</p></div>') | |
# >>> p.doc | |
# | |
# { 'div': [{'p': [["Test"], ["Parse me!"]]}] }
# | |
# obviously, 'elems' and 'leaves' should be disjoint | |
# sets of HTML tag names - not including voids. | |
class ElemParser(HTMLParser):
    """Build a "collapsed" nested structure from selected HTML tags.

    Tags named in ``elems`` become internal nodes: a dict mapping each child
    tag name to the list of children with that tag.  Tags named in ``leaves``
    become leaf nodes: a list of the text chunks found inside them.  All
    other tags are skipped, but still counted for nesting depth.

    ``elems`` and ``leaves`` are lower-cased because ``HTMLParser`` reports
    tag names in lower case.  A tag present in both sets is treated as a
    leaf.  Void elements are never matched.

    >>> p = ElemParser(['div'], ['p'])
    >>> p.feed('<div><p>Test</p> <p>Parse me!</p></div>')
    >>> p.doc
    {'div': [{'p': [['Test'], ['Parse me!']]}]}
    """

    def __init__(self, elems, leaves):
        super().__init__()
        # Root of the collapsed document; filled in while parsing.
        self.doc = {}
        self.elems = {name.lower() for name in elems}
        self.leaves = {name.lower() for name in leaves}
        # Parse stack of (depth, tag, node); the right-most entry is the
        # node currently being filled.  The first entry is the root.
        self.loc = [(0, '', self.doc)]
        # Number of non-void start tags currently open, tracked or not.
        self.level = 0
        # 0 = parsing an internal node, 1 = parsing a leaf.
        self.state = 0

    def handle_starttag(self, tag, attrs):
        """Open a new node for ``tag`` if it is tracked and not inside a leaf."""
        if tag in voids:
            return
        self.level += 1
        if self.state != 0:
            # Inside a leaf: nested tags only affect the depth count.
            return
        is_leaf = tag in self.leaves
        if not is_leaf and tag not in self.elems:
            return
        _, _, parent = self.loc[-1]
        siblings = parent.setdefault(tag, [])
        if is_leaf:
            node = []
            self.state = 1
        else:
            node = {}
        siblings.append(node)
        self.loc.append((self.level, tag, node))

    def handle_endtag(self, tag):
        """Close the current node when ``tag`` ends at that node's depth.

        Raises:
            ValueError: if the end tag sits at the depth of the current node
                but carries a different name.
        """
        if tag in voids:
            return
        self.level -= 1
        depth, open_tag, _ = self.loc[-1]
        if depth != self.level + 1:
            # This end tag belongs to an untracked element.
            return
        if open_tag != tag:
            raise ValueError(f"Start/end tag mismatch expected {open_tag}, found {tag}")
        # Pop the current doc. element.
        self.loc.pop()
        if open_tag in self.leaves:
            self.state = 0

    def handle_data(self, data):
        """Append text to the current leaf; text outside a leaf is dropped."""
        if self.state != 1:
            return
        _, _, leaf = self.loc[-1]
        leaf.append(data)
def test():
    """Run the built-in demo: list tag paths for two samples, then dump one as JSON."""
    page = ('<html><head><title>Test</title></head>'
            '<body><h1>Parse me!</h1></body></html>')
    fragment = '<div><p>Test</p> <p>Parse me!</p></div>'
    for sample in (page, fragment):
        print_elems(sample)
    print_json(fragment, ['div'], ['p'])
def print_elems(html):
    """Print the set of dotted tag paths that GetElems finds in ``html``."""
    collector = GetElems()
    collector.feed(html)
    found = collector.elems
    print(found)
def print_json(html, elems, leaves):
    """Parse ``html`` with ElemParser and print the result as indented JSON.

    ``elems`` and ``leaves`` are passed straight through to ElemParser.
    """
    builder = ElemParser(elems, leaves)
    builder.feed(html)
    rendered = json.dumps(builder.doc, indent=4)
    print(rendered)
if __name__ == "__main__":
    import sys

    # With no input file on the command line, run the built-in demo.
    if len(sys.argv) == 1:
        test()
        # sys.exit rather than the bare exit() helper, which only exists
        # when the site module has been loaded.
        sys.exit(0)

    with open(sys.argv[1], encoding='utf-8') as f:
        html = f.read()
    # Step 1 (optional): call print_elems(html) to see which tag paths
    # are present in the input.
    # Step 2: this worked for my table
    print_json(html, ['tbody', 'thead', 'tr'], ['td', 'th'])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment