Created
January 12, 2022 05:13
-
-
Save shurane/f2a182f2c82e62b2816fd90199b2f88d to your computer and use it in GitHub Desktop.
sql grammar
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# countries.sql https://gist.github.com/adhipg/1600028 | |
# countries table https://riptutorial.com/sql/example/9933/countries-table | |
statements = [ "SELECT * FROM countries", | |
"SELECT * FROM countries;", | |
"SELECT a.* FROM countries", | |
"SELECT capital FROM countries", | |
"SELECT capital, phonecode FROM countries", | |
"SELECT capital, phonecode, currency FROM countries", | |
"SELECT capital,phonecode FROM countries", | |
"SELECT capital,phonecode,currency FROM countries", | |
"SELECT * FROM countries WHERE phonecode=1", | |
"SELECT * FROM countries WHERE 1=1", | |
"SELECT * FROM countries WHERE phonecode=1 AND currency='USD'", | |
"SELECT * FROM countries WHERE phonecode=1 AND currency=\"USD\"", | |
"SELECT capital FROM countries; SELECT currency FROM countries", | |
"SELECT capital FROM countries; SELECT currency FROM countries;" | |
] | |
# for reference, look at https://docs.python.org/3/library/re.html#writing-a-tokenizer | |
def tokenize(s): | |
keywords = ["SELECT", "FROM", "WHERE", "AND", "OR"] | |
token_specs = [ | |
("STAR", r"\*|\w+\.\*"), | |
("KEYWORD", r"|".join(keywords)), | |
("COMPARISON", r"\w+=(\d+|\"\w+\"|'\w+')"), | |
("ID", r"\w+,?"), | |
("END", r";"), | |
("SKIP", r"[ \t\n]+"), | |
("MISMATCH", r"."), | |
] | |
# (?P<name>...) is a named group, very interesting | |
tok_regex = "|".join(f"(?P<{token}>{regex})" for token, regex in token_specs) | |
for mo in re.finditer(tok_regex, s): | |
kind = mo.lastgroup | |
value = mo.group() | |
cursor = mo.start() | |
if kind == "ID": | |
# is there a better way to ignore the commas? maybe during the tokenize step? or a different kind called IDCOMMA? | |
value = value.rstrip(",") | |
elif kind == "SKIP": | |
continue | |
elif kind == "MISMATCH": | |
raise RuntimeError(f"{value!r} unexpected on character {cursor}") | |
yield (kind, value, cursor) | |
for stmt in statements: | |
print(list(tokenize(stmt))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment