# shurane/parse.py

## parse.py
import re

# countries.sql https://gist.github.com/adhipg/1600028
# countries table https://riptutorial.com/sql/example/9933/countries-table

# Sample inputs for the tokenizer, grouped by the feature they exercise.
statements = [
    # star selections, with and without a terminator or table prefix
    "SELECT * FROM countries",
    "SELECT * FROM countries;",
    "SELECT a.* FROM countries",
    # column lists, with and without a space after each comma
    "SELECT capital FROM countries",
    "SELECT capital, phonecode FROM countries",
    "SELECT capital, phonecode, currency FROM countries",
    "SELECT capital,phonecode FROM countries",
    "SELECT capital,phonecode,currency FROM countries",
    # WHERE clauses: numeric, single-quoted and double-quoted operands
    "SELECT * FROM countries WHERE phonecode=1",
    "SELECT * FROM countries WHERE 1=1",
    "SELECT * FROM countries WHERE phonecode=1 AND currency='USD'",
    'SELECT * FROM countries WHERE phonecode=1 AND currency="USD"',
    # several statements in one string
    "SELECT capital FROM countries; SELECT currency FROM countries",
    "SELECT capital FROM countries; SELECT currency FROM countries;",
]

# for reference, look at https://docs.python.org/3/library/re.html#writing-a-tokenizer
def tokenize(s):
    """Split a SQL-like string into ``(kind, value, cursor)`` tokens.

    This is a generator. Whitespace is skipped; every other match is
    yielded as a tuple of:

    * ``kind``   -- one of ``STAR``, ``KEYWORD``, ``COMPARISON``, ``ID``, ``END``
    * ``value``  -- the matched text (an ``ID`` has its trailing comma removed)
    * ``cursor`` -- the 0-based offset in ``s`` where the match starts

    Raises:
        RuntimeError: at the first character that no token pattern accepts.
    """
    keywords = ["SELECT", "FROM", "WHERE", "AND", "OR"]
    token_specs = [
        ("STAR",        r"\*|\w+\.\*"),
        # \b on both sides so a keyword only matches as a whole word;
        # otherwise an identifier such as ORIGIN is split into OR + IGIN.
        ("KEYWORD",     r"\b(?:" + "|".join(keywords) + r")\b"),
        ("COMPARISON",  r"\w+=(\d+|\"\w+\"|'\w+')"),
        ("ID",          r"\w+,?"),
        ("END",         r";"),
        ("SKIP",        r"[ \t\n]+"),
        # Catch-all, must stay last: any single character nothing else took.
        ("MISMATCH",    r"."),
    ]
    # One alternation of named groups; the name of the group that matched
    # (mo.lastgroup) is the token kind. Order matters: earlier specs win.
    tok_regex = "|".join(f"(?P<{token}>{regex})" for token, regex in token_specs)

    for mo in re.finditer(tok_regex, s):
        kind = mo.lastgroup
        value = mo.group()
        cursor = mo.start()

        if kind == "ID":
            # is there a better way to ignore the commas? maybe during the tokenize step? or a different kind called IDCOMMA?
            value = value.rstrip(",")
        elif kind == "SKIP":
            continue
        elif kind == "MISMATCH":
            raise RuntimeError(f"{value!r} unexpected on character {cursor}")

        yield (kind, value, cursor)

# Demo driver: tokenize each sample statement fully, then print its tokens.
for stmt in statements:
    print([*tokenize(stmt)])
	import re

	# countries.sql https://gist.github.com/adhipg/1600028
	# countries table https://riptutorial.com/sql/example/9933/countries-table

	# Sample inputs for the tokenizer, grouped by the feature they exercise.
	statements = [
	    # star selections, with and without a terminator or table prefix
	    "SELECT * FROM countries",
	    "SELECT * FROM countries;",
	    "SELECT a.* FROM countries",
	    # column lists, with and without a space after each comma
	    "SELECT capital FROM countries",
	    "SELECT capital, phonecode FROM countries",
	    "SELECT capital, phonecode, currency FROM countries",
	    "SELECT capital,phonecode FROM countries",
	    "SELECT capital,phonecode,currency FROM countries",
	    # WHERE clauses: numeric, single-quoted and double-quoted operands
	    "SELECT * FROM countries WHERE phonecode=1",
	    "SELECT * FROM countries WHERE 1=1",
	    "SELECT * FROM countries WHERE phonecode=1 AND currency='USD'",
	    'SELECT * FROM countries WHERE phonecode=1 AND currency="USD"',
	    # several statements in one string
	    "SELECT capital FROM countries; SELECT currency FROM countries",
	    "SELECT capital FROM countries; SELECT currency FROM countries;",
	]

	# for reference, look at https://docs.python.org/3/library/re.html#writing-a-tokenizer
	def tokenize(s):
	    """Split a SQL-like string into ``(kind, value, cursor)`` tokens.

	    This is a generator. Whitespace is skipped; every other match is
	    yielded as a tuple of:

	    * ``kind``   -- one of ``STAR``, ``KEYWORD``, ``COMPARISON``, ``ID``, ``END``
	    * ``value``  -- the matched text (an ``ID`` has its trailing comma removed)
	    * ``cursor`` -- the 0-based offset in ``s`` where the match starts

	    Raises:
	        RuntimeError: at the first character that no token pattern accepts.
	    """
	    keywords = ["SELECT", "FROM", "WHERE", "AND", "OR"]
	    token_specs = [
	        # A bare "*" or a table-qualified "name.*".
	        ("STAR", r"\*|\w+\.\*"),
	        # \b on both sides so a keyword only matches as a whole word;
	        # otherwise an identifier such as ORIGIN is split into OR + IGIN.
	        ("KEYWORD", r"\b(?:" + "|".join(keywords) + r")\b"),
	        ("COMPARISON", r"\w+=(\d+|\"\w+\"|'\w+')"),
	        ("ID", r"\w+,?"),
	        ("END", r";"),
	        ("SKIP", r"[ \t\n]+"),
	        # Catch-all, must stay last: any single character nothing else took.
	        ("MISMATCH", r"."),
	    ]
	    # One alternation of named groups; the name of the group that matched
	    # (mo.lastgroup) is the token kind. Order matters: earlier specs win.
	    tok_regex = "|".join(f"(?P<{token}>{regex})" for token, regex in token_specs)

	    for mo in re.finditer(tok_regex, s):
	        kind = mo.lastgroup
	        value = mo.group()
	        cursor = mo.start()

	        if kind == "ID":
	            # is there a better way to ignore the commas? maybe during the tokenize step? or a different kind called IDCOMMA?
	            value = value.rstrip(",")
	        elif kind == "SKIP":
	            continue
	        elif kind == "MISMATCH":
	            raise RuntimeError(f"{value!r} unexpected on character {cursor}")

	        yield (kind, value, cursor)

	# Demo driver: print the token list produced for each sample statement.
	for stmt in statements:
	    print(list(tokenize(stmt)))