2 года назад · 2ca13701bb
--- a/src/sentry/utils/strings.py
+++ b/src/sentry/utils/strings.py
@@ -1,3 +1,4 @@
 
				+import ast
			
 
				 import base64
			
 
				 import codecs
			
 
				 import re
			
@@ -27,23 +28,24 @@ _lone_surrogate = re.compile(
 
				 """
			
 
				 )
			
 
				 
			
 
				-
			
 
				-def unicode_escape_recovery_handler(err):
			
 
				-    try:
			
 
				-        value = err.object[err.start : err.end].decode("utf-8")
			
 
				-    except UnicodeError:
			
 
				-        value = ""
			
 
				-    return value, err.end
			
 
				-
			
 
				-
			
 
				-codecs.register_error("unicode-escape-recovery", unicode_escape_recovery_handler)
			
 
				+# Matches a run of an odd number of backslashes whose final backslash does
			
 
				+# not begin one of the escape sequences allowed by the lookaheads below.
			
 
				+# Group 1 captures the whole run, so a substitution can re-emit it and
			
 
				+# append to it.  `\N{...}` and backslash-newline are not on the allow-list,
			
 
				+# so a backslash in front of them matches as well.
			
 
				+INVALID_ESCAPE = re.compile(
			
 
				+    r"""
			
 
				+(?<!\\)              # no backslash behind
			
 
				+((?:\\\\)*\\)        # odd number of backslashes
			
 
				+(?!x[0-9a-fA-F]{2})  # char escape: \x__
			
 
				+(?!u[0-9a-fA-F]{4})  # char escape: \u____
			
 
				+(?!U[0-9a-fA-F]{8})  # char escape: \U________
			
 
				+(?![0-7]{1,3})       # octal escape: \_, \__, \___
			
 
				+(?![\\'"abfnrtv])    # other escapes: https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
			
 
				+""",
			
 
				+    re.VERBOSE,
			
 
				+)
			
 
				 
			
 
				 
			
 
				-def unescape_string(value):
			
 
				-    """Unescapes a backslash escaped string."""
			
 
				-    return value.encode("ascii", "backslashreplace").decode(
			
 
				-        "unicode-escape", "unicode-escape-recovery"
			
 
				-    )
			
 
				+def unescape_string(value: str) -> str:
			
 
				+    """Unescape a backslash-escaped string.
			
 
				+
			
 
				+    Valid Python escape sequences in ``value`` (as allowed by
			
 
				+    ``INVALID_ESCAPE``) are decoded.  A backslash that does not start one
			
 
				+    is kept as a literal backslash.  All other characters -- including
			
 
				+    double quotes, carriage returns and NUL -- are returned unchanged.
			
 
				+
			
 
				+    :param value: the escaped text.
			
 
				+    :returns: the unescaped text.
			
 
				+    """
			
 
				+    # Double the final backslash of every invalid escape so the parser reads
			
 
				+    # it as a literal backslash instead of rejecting the sequence.
			
 
				+    value = INVALID_ESCAPE.sub(r"\1\\", value)
			
 
				+    # A double quote preceded by an even number of backslashes is unescaped
			
 
				+    # and could terminate the triple-quoted literal built below (a trailing
			
 
				+    # `"` or an embedded `"""`), so put a backslash in front of each one.
			
 
				+    value = re.sub(r'(?<!\\)((?:\\\\)*)"', r'\1\\"', value)
			
 
				+    # The parser turns "\r" into "\n" and rejects NUL in source text.  After
			
 
				+    # the first step neither can directly follow an odd run of backslashes,
			
 
				+    # so spelling them as escapes is safe and makes them round-trip.
			
 
				+    value = value.replace("\r", "\\r").replace("\0", "\\x00")
			
 
				+    return ast.literal_eval(f'"""{value}"""')
			
 
				 
			
 
				 
			
 
				 def strip_lone_surrogates(string):
			
--- a/tests/sentry/utils/test_strings.py
+++ b/tests/sentry/utils/test_strings.py
@@ -1,5 +1,7 @@
 
				 import functools
			
 
				 
			
 
				+import pytest
			
 
				+
			
 
				 from sentry.utils.strings import (
			
 
				     codec_lookup,
			
 
				     is_valid_dot_atom,
			
@@ -15,42 +17,34 @@ ZWSP = "\u200b"  # zero width space
 
				 SHY = "\u00ad"  # soft hyphen
			
 
				 
			
 
				 
			
 
				-def test_unescape_string():
			
 
				-    # For raw string literals, python escapes any backslash,
			
 
				-    # regardless if it's part of a recognized escape sequence or not.
			
 
				-    value = r"\x80"
			
 
				-    assert r"\x80" == "\\x80"
			
 
				-
			
 
				-    # We want to unescape that.
			
 
				-    assert unescape_string(value) == "\x80"
			
 
				-    assert r"\x80" != "\x80"
			
 
				-
			
 
				-    # For string literals, python leaves recognized escape sequences alone,
			
 
				-    # and we should as well.
			
 
				-    assert unescape_string("\x80") == "\x80"
			
 
				-
			
 
				-    # Essentially, we want the resulting str to
			
 
				-    # have the same number of backslashes as the raw string.
			
 
				-    assert unescape_string(r"\\x80") == "\\x80"
			
 
				-    assert unescape_string(r"\\\x80") == "\\\x80"
			
 
				-    assert unescape_string(r"\\\\x80") == "\\\\x80"
			
 
				-
			
 
				-    # Now for a real world example.
			
 
				-    # If we specify this value as a string literal, we'll get a DeprecationWarning
			
 
				-    # because \* is not a recognized escape sequence.
			
 
				-    # This raw string literal reflects what was read off disk from our grouping
			
 
				-    # enhancement config text files, before they were corrected to be \\**.
			
 
				-    value = r"C:/WINDOWS/system32/DriverStore\**"
			
 
				-    assert value == "C:/WINDOWS/system32/DriverStore\\**"
			
 
				-
			
 
				-    # This string should remain unchanged after unescape_string,
			
 
				-    # because there are no recognized escape sequences to unescape.
			
 
				-    # From 3.6 to 3.8 a DeprecationWarning which we suppress will
			
 
				-    # be emitted during .decode("unicode-escape", "unicode-escape-recovery"),
			
 
				-    # because \* isn't a recognized escape sequence.
			
 
				-    # We just want this to be a reminder if the warning is upgraded to a
			
 
				-    # behavior change in 3.9+.
			
 
				-    assert unescape_string(value) == "C:/WINDOWS/system32/DriverStore\\**"
			
 
				+@pytest.mark.parametrize(
			
 
				+    ("s", "expected"),
			
 
				+    (
			
 
				+        # the literal \x escape sequence is converted to the character
			
 
				+        (r"\x80", "\x80"),
			
 
				+        # each pair of backslashes collapses to one; \x80 is only decoded
			
 
				+        # when an odd number of backslashes precedes the x
			
 
				+        (r"\\x80", "\\x80"),
			
 
				+        (r"\\\x80", "\\\x80"),
			
 
				+        (r"\\\\x80", "\\\\x80"),
			
 
				+        # this string has an invalid escape sequence: \*
			
 
				+        (r"C:/WINDOWS/system32/DriverStore\**", "C:/WINDOWS/system32/DriverStore\\**"),
			
 
				+        # \x without its two hex digits is invalid: the backslash is kept
			
 
				+        (r"\x", "\\x"),
			
 
				+        (r"\\\x", "\\\\x"),
			
 
				+        # decodes character escapes
			
 
				+        (r"\t", "\t"),
			
 
				+        (r"\0", "\0"),
			
 
				+        (r"\11", "\11"),
			
 
				+        (r"\111", "\111"),
			
 
				+        (r"\u2603", "☃"),
			
 
				+        (r"\U0001f643", "🙃"),
			
 
				+        # probably a mistake in the configuration but it allows quoted strings
			
 
				+        # with embedded newlines
			
 
				+        ("hello\nworld", "hello\nworld"),
			
 
				+    ),
			
 
				+)
			
 
				+def test_unescape_string(s, expected):
			
 
				+    """Check that ``unescape_string(s)`` equals ``expected`` for each case."""
			
 
				+    assert unescape_string(s) == expected
			
 
				 
			
 
				 
			
 
				 def test_codec_lookup():