// Permissive tokenizer grammar (PEG.js / Peggy).
//
// Parsing an input yields a flat array of token objects of the form
// `{ type, content }`, where `content` is the exact text matched by the rule.
// Nothing is dropped: whitespace is emitted as a token too, so concatenating
// every `content` in order reproduces the original input.

// Start rule: zero or more tokens (an empty input yields an empty array).
expression
  = token*

// PEG choice is ordered: the first alternative that matches wins. The specific
// token kinds are therefore tried before the catch-all `generic_token`, which
// only consumes text that none of the earlier rules can start on (for example
// a run beginning with an uppercase letter or with punctuation such as `=`).
token
  = number
  / variable
  / _
  / open_paren
  / close_paren
  / plus
  / minus
  / multiply
  / divide
  / generic_token

// Unsigned integer or decimal literal. At least one digit is required on both
// sides of the decimal point, so "1." and ".5" are not matched as numbers.
number
  = [0-9]+ ("." [0-9]+)? { return { type: "number", content: text() } }

// A run of lowercase ASCII letters.
variable
  = [a-z]+ { return { type: "variable", content: text() } }

// A run of whitespace. Tabs and line breaks are accepted alongside plain
// spaces; previously only U+0020 was recognised, so a single tab or newline
// anywhere in the input made the whole parse fail.
_
  = [ \t\r\n]+ { return { type: "whitespace", content: text() } }

open_paren
  = "(" { return { type: "openParen", content: text() } }

close_paren
  = ")" { return { type: "closeParen", content: text() } }

plus
  = "+" { return { type: "plus", content: text() } }

minus
  = "-" { return { type: "minus", content: text() } }

multiply
  = "*" { return { type: "multiply", content: text() } }

divide
  = "/" { return { type: "divide", content: text() } }

// Catch-all for any other run of printable characters.
//
// \u00A0-\uFFFF covers the Unicode BMP from U+00A0 upwards, _including_
// surrogate code units and unassigned code points, which aren't parse-able
// naively. A more precise approach would be to define all valid Unicode ranges
// exactly, but for permissive parsing we don't mind the lack of precision.
generic_token
  = [a-zA-Z0-9\u00A0-\uFFFF"'`_\-.=><:,*;!\[\]?$%|/\\@#&~^+{}]+ { return { type: "generic", content: text() } }