Просмотр исходного кода

Revert "Revert "ref: fix invalid escapes to avoid DeprecationWarning (#34279)" (#34347)" (#34367)

* Revert "Revert "ref: fix invalid escapes to avoid DeprecationWarning (#34279)" (#34347)"

This reverts commit d294775458e911b6c84eeaa9002cd8fc99ebd79f.

* fix quoted strings which contain embedded newlines
asottile-sentry 2 года назад
Родитель
Коммит
2ca13701bb
2 измененных файла: 46 добавлено и 50 удалено
  1. 16 14
      src/sentry/utils/strings.py
  2. 30 36
      tests/sentry/utils/test_strings.py

+ 16 - 14
src/sentry/utils/strings.py

@@ -1,3 +1,4 @@
+import ast
 import base64
 import codecs
 import re
@@ -27,23 +28,24 @@ _lone_surrogate = re.compile(
 """
 )
 
-
-def unicode_escape_recovery_handler(err):
-    try:
-        value = err.object[err.start : err.end].decode("utf-8")
-    except UnicodeError:
-        value = ""
-    return value, err.end
-
-
-codecs.register_error("unicode-escape-recovery", unicode_escape_recovery_handler)
+INVALID_ESCAPE = re.compile(
+    r"""
+(?<!\\)              # no backslash behind
+((?:\\\\)*\\)        # odd number of backslashes
+(?!x[0-9a-fA-F]{2})  # char escape: \x__
+(?!u[0-9a-fA-F]{4})  # char escape: \u____
+(?!U[0-9a-fA-F]{8})  # char escape: \U________
+(?![0-7]{1,3})       # octal escape: \_, \__, \___
+(?![\\'"abfnrtv])    # other escapes: https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
+""",
+    re.VERBOSE,
+)
 
 
-def unescape_string(value):
+def unescape_string(value: str):
     """Unescapes a backslash escaped string."""
-    return value.encode("ascii", "backslashreplace").decode(
-        "unicode-escape", "unicode-escape-recovery"
-    )
+    value = INVALID_ESCAPE.sub(r"\1\\", value)
+    return ast.literal_eval(f'"""{value}"""')
 
 
 def strip_lone_surrogates(string):

+ 30 - 36
tests/sentry/utils/test_strings.py

@@ -1,5 +1,7 @@
 import functools
 
+import pytest
+
 from sentry.utils.strings import (
     codec_lookup,
     is_valid_dot_atom,
@@ -15,42 +17,34 @@ ZWSP = "\u200b"  # zero width space
 SHY = "\u00ad"  # soft hyphen
 
 
-def test_unescape_string():
-    # For raw string literals, python escapes any backslash,
-    # regardless if it's part of a recognized escape sequence or not.
-    value = r"\x80"
-    assert r"\x80" == "\\x80"
-
-    # We want to unescape that.
-    assert unescape_string(value) == "\x80"
-    assert r"\x80" != "\x80"
-
-    # For string literals, python leaves recognized escape sequences alone,
-    # and we should as well.
-    assert unescape_string("\x80") == "\x80"
-
-    # Essentially, we want the resulting str to
-    # have the same number of backslashes as the raw string.
-    assert unescape_string(r"\\x80") == "\\x80"
-    assert unescape_string(r"\\\x80") == "\\\x80"
-    assert unescape_string(r"\\\\x80") == "\\\\x80"
-
-    # Now for a real world example.
-    # If we specify this value as a string literal, we'll get a DeprecationWarning
-    # because \* is not a recognized escape sequence.
-    # This raw string literal reflects what was read off disk from our grouping
-    # enhancement config text files, before they were corrected to be \\**.
-    value = r"C:/WINDOWS/system32/DriverStore\**"
-    assert value == "C:/WINDOWS/system32/DriverStore\\**"
-
-    # This string should remain unchanged after unescape_string,
-    # because there are no recognized escape sequences to unescape.
-    # From 3.6 to 3.8 a DeprecationWarning which we suppress will
-    # be emitted during .decode("unicode-escape", "unicode-escape-recovery"),
-    # because \* isn't a recognized escape sequence.
-    # We just want this to be a reminder if the warning is upgraded to a
-    # behavior change in 3.9+.
-    assert unescape_string(value) == "C:/WINDOWS/system32/DriverStore\\**"
+@pytest.mark.parametrize(
+    ("s", "expected"),
+    (
+        # the literal \x escape sequence is converted to the character
+        (r"\x80", "\x80"),
+        # the result should have the same number of backslashes as the raw string
+        (r"\\x80", "\\x80"),
+        (r"\\\x80", "\\\x80"),
+        (r"\\\\x80", "\\\\x80"),
+        # this string has an invalid escape sequence: \*
+        (r"C:/WINDOWS/system32/DriverStore\**", "C:/WINDOWS/system32/DriverStore\\**"),
+        # this string has an unterminated invalid escape sequence: \x
+        (r"\x", "\\x"),
+        (r"\\\x", "\\\\x"),
+        # decodes character escapes
+        (r"\t", "\t"),
+        (r"\0", "\0"),
+        (r"\11", "\11"),
+        (r"\111", "\111"),
+        (r"\u2603", "☃"),
+        (r"\U0001f643", "🙃"),
+        # probably a mistake in the configuration but it allows quoted strings
+        # with embedded newlines
+        ("hello\nworld", "hello\nworld"),
+    ),
+)
+def test_unescape_string(s, expected):
+    assert unescape_string(s) == expected
 
 
 def test_codec_lookup():