Browse Source

feat(perf): Add Unicode support to SQL parser (#59115)

Non-ASCII characters in SQL queries are rare, but they do occur. Closes JAVASCRIPT-2NQW again.
George Gritsouk 1 year ago
parent
commit
8833e624d3

+ 3 - 1
static/app/views/starfish/utils/sqlish/SQLishParser.spec.tsx

@@ -18,7 +18,9 @@ describe('SQLishParser', function () {
       'columns AS `tags[column]`', // ClickHouse backticks
       'SELECT * FROM #temp', // Temporary tables
       '# Fetches', // Comments
-      '\r\n', // Windows newlinse
+      '\r\n', // Windows newlines
+      '✌🏻', // Emoji
+      'ă', // Unicode
       'SELECT id, nam*', // Truncation
       'AND created >= :c1', // PHP-Style I
       'LIMIT $2', // PHP-style II

+ 5 - 1
static/app/views/starfish/utils/sqlish/sqlish.pegjs

@@ -35,5 +35,9 @@ CollapsedColumns
 Whitespace
   = Whitespace:[\n\t\r ]+ { return { type: 'Whitespace', content: Whitespace.join("") } }
 
+// \u00A0-\uFFFF is the Unicode BMP above ASCII and the C1 controls,
+// _including_ surrogate code units (so emoji match one UTF-16 half at a
+// time) and unassigned code points. Defining every valid Unicode range would
+// be more precise, but for permissive parsing we don't mind the imprecision.
 GenericToken
-  = GenericToken:[a-zA-Z0-9"'`_\-.=><:,*;!\[\]?$%|/\\@#&~^+{}]+ { return { type: 'GenericToken', content: GenericToken.join('') } }
+  = GenericToken:[a-zA-Z0-9\u00A0-\uFFFF"'`_\-.=><:,*;!\[\]?$%|/\\@#&~^+{}]+ { return { type: 'GenericToken', content: GenericToken.join('') } }