Last active
May 20, 2022 20:16
-
-
Save dabrahams/56669d75236d16cd72fd4d9a1077a5db to your computer and use it in GitHub Desktop.
Quick/Dirty Chart Parser.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// A simple bottom-up chart parser. | |
/// | |
/// - The chart is a lookup table of sets of partial parses (a.k.a. dotted | |
/// rules), indexed by (predicted-symbol, source-location) pairs. | |
/// | |
/// - A partial parse is a triple (B, R, N) where: | |
/// - B is the position in the token stream where the partially-parsed input begins. | |
/// - R is the partially-recognized BNF rule. | |
/// - N is the number of RHS symbols of R that have been recognized. | |
/// | |
/// - A chart location L is described by a pair (S, E) where, for all partial | |
/// parses P = (B, R, N) at L | |
/// - E is the position in the token stream where the input corresponding to P ends. | |
/// - The Nth RHS symbol of R is S | |
/// | |
struct ChartParser { | |
/// Terminals and nonterminals are identified by a symbol ID. | |
typealias SymbolID = Int | |
/// The symbols of all rules, stored end-to-end. | |
typealias RuleStore = [SymbolID] | |
/// The RHS symbols of a parse rule followed by the rule's LHS symbol. | |
typealias Rule = RuleStore.SubSequence | |
/// Range of positions constituting a rule's RHS in `ruleStore`. | |
typealias RuleID = Range<RuleStore.Index> | |
/// A position in the input. | |
typealias SourcePosition = Int | |
/// A range of positions in the input. | |
typealias SourceRegion = Range<SourcePosition> | |
/// A location in the parse chart. | |
struct ChartLocation: Hashable { | |
let expecting: SymbolID | |
let at: SourcePosition | |
} | |
/// A parse rule being matched. | |
struct PartialParse { | |
/// The positions in ruleStore of yet-to-be recognized RHS symbols. | |
var expected: Range<RuleStore.Index> | |
/// The position in the token stream where the partially-parsed input begins. | |
var start: SourcePosition | |
/// True iff the entire RHS has been recognized. | |
var isComplete: Bool { expected.isEmpty } | |
} | |
/// Storage for all the rules. | |
var ruleStore: RuleStore = [] | |
/// Rules indexed by their first RHS symbol. | |
var rulesByRHSStart: [SymbolID: [RuleID]] = [:] | |
/// All the partial parses so far. | |
var chart: [ChartLocation: [PartialParse]] = [:] | |
} | |
/// Initialization and algorithm. | |
extension ChartParser { | |
/// Creates a parser for the given grammar. | |
init<Grammar: Collection, RHS: Collection>(_ grammar: Grammar) | |
where Grammar.Element == (lhs: SymbolID, rhs: RHS), | |
RHS.Element == SymbolID | |
{ | |
for r in grammar { | |
let start = ruleStore.count | |
ruleStore.append(contentsOf: r.rhs) | |
rulesByRHSStart[r.rhs.first!, default: []].append(start..<ruleStore.count) | |
ruleStore.append(r.lhs) | |
} | |
} | |
/// Parses the sequence of symbols in `source`. | |
mutating func parse<S: Collection>(_ source: S) where S.Element == SymbolID { | |
print("parse:", source.map(symbolName)) | |
// Recognize each token over its range in the source. | |
for (i, s) in source.enumerated() { | |
recognize(s, spanningInput: i ..< i + 1) | |
} | |
} | |
/// Recognizes the expected (post-dot) symbol of `r`, ending at `l`. | |
private mutating func advance(_ r: PartialParse, to l: SourcePosition) { | |
print("\(r.start..<l): advance\t", ruleString(r.expected), "to", l) | |
var p = r | |
_ = p.expected.popFirst() | |
if p.isComplete { | |
let lhs = ruleStore[p.expected.upperBound] | |
recognize(lhs, spanningInput: p.start..<l) | |
} | |
else { | |
let postdot = ruleStore[p.expected.first!] | |
chart[ChartLocation(expecting: postdot, at: l), default: []].append(p) | |
} | |
} | |
/// Recognizes `s` over `g` in the input. | |
private mutating func recognize(_ s: SymbolID, spanningInput g: SourceRegion) { | |
print("\(g): recognize\t", symbolName(s)) | |
// Advance all rules that are expecting s at g.lowerBound. | |
if let advancingRules = chart[ChartLocation(expecting: s, at: g.lowerBound)] { | |
for p in advancingRules { | |
advance(p, to: g.upperBound) | |
} | |
} | |
// Initiate all rules that start by expecting s. | |
for r in rulesByRHSStart[s, default: []] { | |
print("\(g): initiate\t", ruleString(r)) | |
advance(PartialParse(expected: r, start: g.lowerBound), to: g.upperBound) | |
} | |
} | |
} | |
/// ------ Testing and diagnostics ------ | |
/// Diagnostic output | |
extension ChartParser { | |
/// Returns the name of symbol s | |
func symbolName(_ s: SymbolID) -> String { | |
"\(Symbol.names[s])" | |
} | |
func ruleString(_ r: RuleID) -> String { | |
let r1 = rulesByRHSStart.values.lazy.joined().first { $0.upperBound == r.upperBound }! | |
let head = "\(symbolName(ruleStore[r.upperBound])) => " | |
let rhsPrefix = r1 == r ? "" | |
: (ruleStore[r1.lowerBound..<r.lowerBound].map(symbolName) + ["."]) | |
.joined(separator: " ") | |
let tail = ruleStore[r].lazy.map(symbolName).joined(separator: " ") | |
return head + rhsPrefix + tail | |
} | |
} | |
/// Grammar definition | |
enum Symbol: Int, CaseIterable { | |
case LPAREN, RPAREN, NUMBER, PLUS, MINUS, TIMES, DIVIDE | |
case term, factor, expr | |
static let names = Array(allCases) | |
} | |
/// A tiny DSL for defining grammar rules. | |
infix operator => : AssignmentPrecedence | |
infix operator ~: MultiplicationPrecedence | |
func ~(a: Symbol, b: Symbol) -> [Symbol] { [a, b] } | |
func ~(a: [Symbol], b: Symbol) -> [Symbol] { a + [b] } | |
func =>(lhs: Symbol, rhs: Symbol) -> [Symbol] { [lhs, rhs] } | |
func =>(lhs: Symbol, rhs: [Symbol]) -> [Symbol] { [lhs] + rhs } | |
let grammar: [[Symbol]] = [ | |
.expr => .term, | |
.expr => .term ~ .PLUS ~ .term, | |
.expr => .term ~ .MINUS ~ .term, | |
.term => .factor, | |
.term => .factor ~ .TIMES ~ .factor, | |
.term => .factor ~ .DIVIDE ~ .factor, | |
.factor => .NUMBER, | |
.factor => .LPAREN ~ .expr ~ .RPAREN, | |
.factor => .PLUS ~ .factor, | |
.factor => .MINUS ~ .factor, | |
] | |
var parser = ChartParser( | |
grammar.lazy.map { | |
(lhs: $0.first!.rawValue, rhs: $0.dropFirst().lazy.map { $0.rawValue }) | |
} | |
) | |
let input: [Symbol] = [ | |
.MINUS, .LPAREN, .NUMBER, .PLUS, .NUMBER, .TIMES, .MINUS, .NUMBER, .RPAREN, | |
.DIVIDE, .NUMBER, | |
.PLUS, .NUMBER] | |
parser.parse(input.lazy.map {$0.rawValue}) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment