@shurane · Created January 12, 2022
sql grammar
import re
# countries.sql https://gist.github.com/adhipg/1600028
# countries table https://riptutorial.com/sql/example/9933/countries-table
statements = [ "SELECT * FROM countries",
"SELECT * FROM countries;",
"SELECT a.* FROM countries",
"SELECT capital FROM countries",
"SELECT capital, phonecode FROM countries",
"SELECT capital, phonecode, currency FROM countries",
"SELECT capital,phonecode FROM countries",
"SELECT capital,phonecode,currency FROM countries",
"SELECT * FROM countries WHERE phonecode=1",
"SELECT * FROM countries WHERE 1=1",
"SELECT * FROM countries WHERE phonecode=1 AND currency='USD'",
"SELECT * FROM countries WHERE phonecode=1 AND currency=\"USD\"",
"SELECT capital FROM countries; SELECT currency FROM countries",
"SELECT capital FROM countries; SELECT currency FROM countries;"
]
# for reference, look at https://docs.python.org/3/library/re.html#writing-a-tokenizer
def tokenize(s):
    keywords = ["SELECT", "FROM", "WHERE", "AND", "OR"]
    token_specs = [
        ("STAR", r"\*|\w+\.\*"),
        # \b word boundaries so a column like ORDINAL isn't split into OR + DINAL
        ("KEYWORD", r"\b(?:" + "|".join(keywords) + r")\b"),
        ("COMPARISON", r"\w+=(\d+|\"\w+\"|'\w+')"),
        ("ID", r"\w+,?"),
        ("END", r";"),
        ("SKIP", r"[ \t\n]+"),
        ("MISMATCH", r"."),
    ]
    # (?P<name>...) is a named group, very interesting -- mo.lastgroup below
    # reports which named group actually matched
    tok_regex = "|".join(f"(?P<{token}>{regex})" for token, regex in token_specs)
    for mo in re.finditer(tok_regex, s):
        kind = mo.lastgroup
        value = mo.group()
        cursor = mo.start()
        if kind == "ID":
            # is there a better way to ignore the commas? maybe during the tokenize step? or a different kind called IDCOMMA?
            value = value.rstrip(",")
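            # (one untested alternative: narrow ID to r"\w+" and add a
            #  ("COMMA", r",") spec before MISMATCH, then skip COMMA tokens
            #  the same way SKIP is skipped below)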
elif kind == "SKIP":
continue
elif kind == "MISMATCH":
raise RuntimeError(f"{value!r} unexpected on character {cursor}")
yield (kind, value, cursor)
for stmt in statements:
    print(list(tokenize(stmt)))
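
# For reference, tracing the specs above by hand (not captured program output),
# the first statement should tokenize to:
#   [('KEYWORD', 'SELECT', 0), ('STAR', '*', 7), ('KEYWORD', 'FROM', 9), ('ID', 'countries', 14)]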