Last active
May 18, 2018 12:20
-
-
Save adifahmi/0db3811720acea2f7ce4fdf4687a6c24 to your computer and use it in GitHub Desktop.
Python HTML tag checker: it checks whether tags are properly closed. Inspired by https://github.com/ryanpcmcquen/unclosedTagFinder
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import argparse | |
import urllib.parse | |
import urllib.request | |
# Regular expressions used to pick tags out of the input markup.
# They are raw strings so that the backslash in `\!` stays a literal
# backslash; in a plain string literal `\!` is an invalid escape sequence,
# which newer Python versions warn about.

# Any `<...>` span whose first character after `<` is not `!`
# (so `<!-- ... -->` and `<!DOCTYPE ...>` are skipped).
htmlRegex = r'<[^\!][^>]*>'
# Matches `<` or `</` that is NOT followed by one of the listed
# void-element names.
voidElementsRegex = r'</?(?!area|base|br|col|embed|hr|img|input|keygen|link|menuitem|meta)'
# `<` followed by any character except `/`.
openingTagRegex = r'<[^/]'
# `</`.
closingTagRegex = r'</'

# Command-line interface: the markup to check is passed with -i/--input.
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input', help='HTML string to check')
def get_tag_list(html):
    """Return every substring of ``html`` matched by ``htmlRegex``, in order.

    The pattern is applied with the IGNORECASE and MULTILINE flags.
    """
    pattern = re.compile(htmlRegex, flags=re.IGNORECASE | re.MULTILINE)
    return pattern.findall(html)
def get_opening_tag_list(tag_list):
    """Return the items of ``tag_list`` that start with ``openingTagRegex``.

    The relative order of the tags is preserved.
    """
    starts_like_opening = re.compile(openingTagRegex).match
    return [tag for tag in tag_list if starts_like_opening(tag)]
def get_closing_tag_list(tag_list):
    """Return the items of ``tag_list`` that start with ``closingTagRegex``.

    The relative order of the tags is preserved.
    """
    starts_like_closing = re.compile(closingTagRegex).match
    return [tag for tag in tag_list if starts_like_closing(tag)]
def clean_html(raw_html):
    """Return the bare tag name contained in ``raw_html``.

    Only the first token of the tag is kept, so attributes do not leak into
    the result: ``'<a href="x">'`` gives ``'a'`` and ``'</div>'`` gives
    ``'div'``. Non-word characters inside that token are removed, as before.

    Args:
        raw_html: a single tag such as ``'<div class="x">'`` or ``'</div>'``.

    Returns:
        The tag name, or an empty string when ``raw_html`` contains no word
        character.
    """
    # The name starts at the first word character and runs until whitespace
    # or the closing `>`; everything after that is attributes.
    match = re.search(r'\w[^\s>]*', raw_html)
    if match is None:
        return ''
    # Drop leftovers such as the `/` of a self-closing `<br/>`.
    return re.sub(r'\W+', '', match.group())
def clean_list(the_list):
    """Return a new list holding ``clean_html`` of every item in ``the_list``.

    The input list is not modified, so the caller's raw tags stay available
    after this call.
    """
    return [clean_html(item) for item in the_list]
# Quick sanity check: are there as many opening tags as closing tags?
def is_match_count(opening_tag_list, closing_tag_list):
    """Return True when both lists have the same length, False otherwise."""
    return len(opening_tag_list) == len(closing_tag_list)
# Checks that every tag is closed by a tag of the same name,
# e.g. `head` must be closed by `head`.
def is_mismatch(opening_tag_list, closing_tag_list):
    """Return True when the tags pair up and False when they are mismatched.

    Note the inverted naming: a ``False`` result means a mismatch was found.

    The comparison is positional: the cleaned opening tags are compared with
    the cleaned closing tags read back to front. Sibling elements such as
    ``<a></a><b></b>`` are therefore reported as mismatched.
    """
    opening_names = clean_list(opening_tag_list)
    # Closing tags are read in reverse, since the innermost tag closes first.
    closing_names_backwards = clean_list(closing_tag_list)[::-1]
    return opening_names == closing_names_backwards
def is_tag_completed():
    """Check the markup passed via ``-i/--input`` and print the verdict.

    Tags that ``voidElementsRegex`` does not match (``<br>``, ``<img>``, ...)
    are left out of the check. The remaining tags must contain as many
    opening as closing tags, and every closing tag must close the most
    recently opened tag.

    Prints ``TRUE`` on success; otherwise prints the reason
    (``MISMATCHED TAGS COUNT`` or ``MISMATCHED TAGS``) followed by ``FALSE``.

    Returns:
        True when all tags are properly closed, False otherwise.

    Raises:
        SystemExit: via ``parser.error`` when no input was given.
    """
    args = parser.parse_args()
    html = args.input
    if html is None:
        # Without this guard re.findall fails with an unhelpful TypeError.
        parser.error('no input given, pass the HTML with -i/--input')

    # Keep only the tags that are not void elements; those never take a
    # closing tag and would otherwise be counted as unclosed.
    tag_list = [
        tag for tag in get_tag_list(html)
        if re.match(voidElementsRegex, tag, flags=re.I)
    ]
    opening_tag_list = get_opening_tag_list(tag_list)
    closing_tag_list = get_closing_tag_list(tag_list)

    if not is_match_count(opening_tag_list, closing_tag_list):
        print("MISMATCHED TAGS COUNT")
        print("FALSE")
        return False
    if not _is_properly_nested(tag_list):
        print("MISMATCHED TAGS")
        print("FALSE")
        return False
    print("TRUE")
    return True


def _is_properly_nested(tag_list):
    """Return True when each closing tag closes the most recently opened tag."""
    # Walking the tags in document order with a stack keeps the interleaving
    # of opening and closing tags, so siblings like <a></a><b></b> pass.
    open_names = []
    for tag in tag_list:
        name = clean_html(tag)
        if re.match(closingTagRegex, tag):
            if not open_names or open_names.pop() != name:
                return False
        else:
            open_names.append(name)
    # Anything left on the stack was opened but never closed.
    return not open_names
# Run the check only when executed as a script, not when imported.
if __name__ == "__main__":
    is_tag_completed()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
How to use (Python 3):
python parser.py -i '<div>Hello<b>World</b></div>'
python parser.py -i '<div>Hello<b>World</div></b>'
python parser.py -i '<div>Hello<b>World</div>'