Browse Source

feat(grouping): Improve JavaScript grouping (#13121)

Armin Ronacher 5 years ago
parent
commit
a8ecedb852

+ 1 - 1
requirements-base.txt

@@ -56,7 +56,7 @@ redis>=2.10.3,<2.10.6
 requests-oauthlib==0.3.3
 requests[security]>=2.20.0,<2.21.0
 selenium==3.141.0
-semaphore>=0.4.32,<0.5.0
+semaphore>=0.4.34,<0.5.0
 sentry-sdk>=0.7.14
 setproctitle>=1.1.7,<1.2.0
 simplejson>=3.2.0,<3.9.0

+ 4 - 1
src/sentry/event_manager.py

@@ -343,6 +343,7 @@ class EventManager(object):
         key=None,
         content_encoding=None,
         is_renormalize=False,
+        remove_other=None
     ):
         self._data = _decode_event(data, content_encoding=content_encoding)
         self.version = version
@@ -355,6 +356,7 @@ class EventManager(object):
         self._auth = auth
         self._key = key
         self._is_renormalize = is_renormalize
+        self._remove_other = remove_other
         self._normalized = False
 
     def process_csp_report(self):
@@ -445,7 +447,8 @@ class EventManager(object):
             max_secs_in_future=MAX_SECS_IN_FUTURE,
             max_secs_in_past=MAX_SECS_IN_PAST,
             enable_trimming=True,
-            is_renormalize=self._is_renormalize
+            is_renormalize=self._is_renormalize,
+            remove_other=self._remove_other,
         )
 
         self._data = CanonicalKeyDict(

+ 27 - 2
src/sentry/grouping/strategies/configurations.py

@@ -102,9 +102,34 @@ register_strategy_config(
     changelog='''
         * messages are now preprocessed to increase change of grouping together
         * exceptions without stacktraces are now grouped by a trimmed message
+    '''
+)
 
-        *This algorithm is currently work in progress and will continue to
-        evolve based on feedback*
+register_strategy_config(
+    id='newstyle:2019-05-08',
+    strategies=[
+        'expect-ct:v1',
+        'expect-staple:v1',
+        'hpkp:v1',
+        'csp:v1',
+        'threads:v1',
+        'stacktrace:v1',
+        'chained-exception:v1',
+        'template:v1',
+        'message:v2',
+    ],
+    delegates=[
+        'frame:v3',
+        'stacktrace:v1',
+        'single-exception:v2',
+    ],
+    changelog='''
+        * context lines are honored again for platforms with reliable source
+          code information (JavaScript, Python, PHP and Ruby)
+        * JavaScript stacktraces are better deduplicated across browser
+          versions.
+        * JavaScript stacktraces involving source maps are likely to group
+          better.
     '''
 )
 

+ 4 - 18
src/sentry/grouping/strategies/legacy.py

@@ -5,7 +5,8 @@ import posixpath
 
 from sentry.grouping.component import GroupingComponent
 from sentry.grouping.strategies.base import strategy
-from sentry.grouping.strategies.utils import remove_non_stacktrace_variants
+from sentry.grouping.strategies.utils import remove_non_stacktrace_variants, \
+    has_url_origin
 
 
 _ruby_anon_func = re.compile(r'_\d{2,}')
@@ -55,21 +56,6 @@ RECURSION_COMPARISON_FIELDS = [
 ]
 
 
-def is_url_legacy(filename):
-    return filename.startswith(('file:', 'http:', 'https:', 'applewebdata:'))
-
-
-def is_url_frame_legacy(frame):
-    if not frame.abs_path:
-        return False
-    # URLs can be generated such that they are:
-    #   blob:http://example.com/7f7aaadf-a006-4217-9ed5-5fbf8585c6c0
-    # https://developer.mozilla.org/en-US/docs/Web/API/URL/createObjectURL
-    if frame.abs_path.startswith('blob:'):
-        return True
-    return is_url_legacy(frame.abs_path)
-
-
 def is_unhashable_module_legacy(frame, platform):
     # Fix for the case where module is a partial copy of the URL
     # and should not be hashed
@@ -285,7 +271,7 @@ def frame_legacy(frame, event, **meta):
         contributes = False
         hint = 'native code indicated by filename'
     elif frame.filename:
-        if is_url_frame_legacy(frame):
+        if has_url_origin(frame.abs_path):
             filename_component.update(
                 contributes=False,
                 values=[frame.filename],
@@ -341,7 +327,7 @@ def frame_legacy(frame, event, **meta):
     if frame.context_line is not None:
         if len(frame.context_line) > 120:
             context_line_component.update(hint='discarded because line too long')
-        elif is_url_frame_legacy(frame) and not func:
+        elif has_url_origin(frame.abs_path) and not func:
             context_line_component.update(hint='discarded because from URL origin')
         else:
             context_line_component.update(values=[frame.context_line])

+ 117 - 22
src/sentry/grouping/strategies/newstyle.py

@@ -5,7 +5,8 @@ import re
 
 from sentry.grouping.component import GroupingComponent
 from sentry.grouping.strategies.base import strategy
-from sentry.grouping.strategies.utils import remove_non_stacktrace_variants
+from sentry.grouping.strategies.utils import remove_non_stacktrace_variants, \
+    has_url_origin
 from sentry.grouping.strategies.message import trim_message_for_grouping
 from sentry.stacktraces.platform import get_behavior_family_for_platform
 
@@ -50,13 +51,6 @@ RECURSION_COMPARISON_FIELDS = [
 ]
 
 
-def abs_path_is_url_v1(abs_path):
-    if not abs_path:
-        return False
-    return abs_path.startswith((
-        'blob:', 'file:', 'http:', 'https:', 'applewebdata:'))
-
-
 def is_recursion_v1(frame1, frame2):
     "Returns a boolean indicating whether frames are recursive calls."
     for field in RECURSION_COMPARISON_FIELDS:
@@ -66,7 +60,8 @@ def is_recursion_v1(frame1, frame2):
     return True
 
 
-def get_filename_component_v1(abs_path, filename, platform):
+def get_filename_component(abs_path, filename, platform,
+                           allow_file_origin=False):
     """Attempt to normalize filenames by detecing special filenames and by
     using the basename only.
     """
@@ -81,7 +76,7 @@ def get_filename_component_v1(abs_path, filename, platform):
         values=[filename],
     )
 
-    if abs_path_is_url_v1(abs_path):
+    if has_url_origin(abs_path, allow_file_origin=allow_file_origin):
         filename_component.update(
             contributes=False,
             hint='ignored because frame points to a URL',
@@ -151,7 +146,8 @@ def get_module_component_v1(abs_path, module, platform):
 
 
 def get_function_component(function, platform, legacy_function_logic,
-                           raw_function=None):
+                           sourcemap_used=False, context_line_available=False,
+                           raw_function=None, javascript_fuzzing=False):
     """
     Attempt to normalize functions by removing common platform outliers.
 
@@ -165,6 +161,7 @@ def get_function_component(function, platform, legacy_function_logic,
     or a trimmed version (of the truncated one) for native.
     """
     from sentry.stacktraces.functions import trim_function_name
+    behavior_family = get_behavior_family_for_platform(platform)
 
     if legacy_function_logic:
         func = raw_function or function
@@ -209,7 +206,7 @@ def get_function_component(function, platform, legacy_function_logic,
                 hint='ignored lambda function'
             )
 
-    elif get_behavior_family_for_platform(platform) == 'native':
+    elif behavior_family == 'native':
         if func in ('<redacted>', '<unknown>'):
             function_component.update(
                 contributes=False,
@@ -223,6 +220,27 @@ def get_function_component(function, platform, legacy_function_logic,
                     hint='isolated function'
                 )
 
+    elif javascript_fuzzing and behavior_family == 'javascript':
+        # This changes Object.foo or Foo.foo into foo so that we can
+        # resolve some common cross browser differences
+        new_function = func.rsplit('.', 1)[-1]
+        if new_function != func:
+            function_component.update(
+                values=[new_function],
+                hint='trimmed javascript function'
+            )
+
+        # if a sourcemap was used for this frame and we know that we can
+        # use the context line information we no longer want to use the
+        # function name.  The reason for this is that function names in
+        # sourcemaps are unreliable by the nature of sourcemaps and thus a
+        # bad indicator for grouping.
+        if sourcemap_used and context_line_available:
+            function_component.update(
+                contributes=False,
+                hint='ignored because sourcemap used and context line available'
+            )
+
     return function_component
 
 
@@ -246,14 +264,58 @@ def frame_v2(frame, event, **meta):
                                legacy_function_logic=False)
 
 
-def get_frame_component(frame, event, meta, legacy_function_logic=False):
+@strategy(
+    id='frame:v3',
+    interfaces=['frame'],
+    variants=['!system', 'app'],
+)
+def frame_v3(frame, event, **meta):
+    platform = frame.platform or event.platform
+    # These are platforms that we know have always source available and
+    # where the source is of good quality for grouping.  For javascript
+    # this assumes that we have sourcemaps available.
+    good_source = platform in ('javascript', 'node', 'python', 'php', 'ruby')
+    return get_frame_component(frame, event, meta,
+                               legacy_function_logic=False,
+                               use_contextline=good_source,
+                               javascript_fuzzing=True)
+
+
+def get_contextline_component(frame, platform):
+    """Returns a contextline component.  The caller's responsibility is to
+    make sure context lines are only used for platforms where we trust the
+    quality of the sourcecode.  It does however protect against some bad
+    JavaScript environments based on origin checks.
+    """
+    component = GroupingComponent(id='context-line')
+
+    if not frame.context_line:
+        return component
+
+    line = ' '.join(frame.context_line.expandtabs(2).split())
+    if line:
+        if len(frame.context_line) > 120:
+            component.update(hint='discarded because line too long')
+        elif get_behavior_family_for_platform(platform) == 'javascript' \
+                and has_url_origin(frame.abs_path, allow_file_origin=True):
+            component.update(hint='discarded because from URL origin')
+        else:
+            component.update(values=[line])
+
+    return component
+
+
+def get_frame_component(frame, event, meta, legacy_function_logic=False,
+                        use_contextline=False,
+                        javascript_fuzzing=False):
     platform = frame.platform or event.platform
 
     # Safari throws [native code] frames in for calls like ``forEach``
     # whereas Chrome ignores these. Let's remove it from the hashing algo
     # so that they're more likely to group together
-    filename_component = get_filename_component_v1(
-        frame.abs_path, frame.filename, platform)
+    filename_component = get_filename_component(
+        frame.abs_path, frame.filename, platform,
+        allow_file_origin=javascript_fuzzing)
 
     # if we have a module we use that for grouping.  This will always
     # take precedence over the filename if it contributes
@@ -265,22 +327,55 @@ def get_frame_component(frame, event, meta, legacy_function_logic=False):
             hint='module takes precedence'
         )
 
+    context_line_component = None
+
+    # If we are allowed to use the contextline we add it now.
+    if use_contextline:
+        context_line_component = get_contextline_component(frame, platform)
+
     function_component = get_function_component(
         function=frame.function,
         raw_function=frame.raw_function,
         platform=platform,
-        legacy_function_logic=legacy_function_logic
+        sourcemap_used=frame.data and frame.data.get('sourcemap') is not None,
+        context_line_available=context_line_component and context_line_component.contributes,
+        legacy_function_logic=legacy_function_logic,
+        javascript_fuzzing=javascript_fuzzing,
     )
 
-    return GroupingComponent(
+    values = [
+        module_component,
+        filename_component,
+        function_component,
+    ]
+    if context_line_component is not None:
+        values.append(context_line_component)
+
+    rv = GroupingComponent(
         id='frame',
-        values=[
-            module_component,
-            filename_component,
-            function_component,
-        ],
+        values=values,
     )
 
+    # if we are in javascript fuzzing mode we want to disregard some
+    # frames consistently.  These force common bad stacktraces together
+    # to have a common hash at the cost of maybe skipping over frames that
+    # would otherwise be useful.
+    if javascript_fuzzing \
+       and get_behavior_family_for_platform(platform) == 'javascript':
+        func = frame.raw_function or frame.function
+        if func:
+            func = func.rsplit('.', 1)[-1]
+        if func in (None, '?', '<anonymous function>', '<anonymous>',
+                    'Anonymous function', 'eval') or \
+           func.endswith('/<') or \
+           frame.abs_path in ('[native code]', 'native code', 'eval code', '<anonymous>'):
+            rv.update(
+                contributes=False,
+                hint='ignored low quality javascript frame'
+            )
+
+    return rv
+
 
 @strategy(
     id='stacktrace:v1',

+ 13 - 0
src/sentry/grouping/strategies/utils.py

@@ -44,3 +44,16 @@ def remove_non_stacktrace_variants(variants):
             )
 
     return variants
+
+
+def has_url_origin(path, allow_file_origin=False):
+    # URLs can be generated such that they are:
+    #   blob:http://example.com/7f7aaadf-a006-4217-9ed5-5fbf8585c6c0
+    # https://developer.mozilla.org/en-US/docs/Web/API/URL/createObjectURL
+    if not path:
+        return False
+    if path.startswith(('http:', 'https:', 'applewebdata:', 'blob:')):
+        return True
+    if path.startswith('file:'):
+        return not allow_file_origin
+    return False

+ 2 - 1
tests/sentry/grouping/grouping_inputs/frame-ignores-module-if-page-url-2.json

@@ -4,7 +4,8 @@
       {
         "abs_path": "https://sentry.io/foo/bar/baz",
         "module": "foo/bar/baz",
-        "filename": "foo.py"
+        "filename": "foo.py",
+        "function": "a"
       }
     ]
   },

+ 98 - 0
tests/sentry/grouping/grouping_inputs/javascript-xbrowser-chrome.json

@@ -0,0 +1,98 @@
+{
+  "culprit": "Object.aha(/tmp/test.html)",
+  "event_id": "6b8b408a6efa4d95955e51f2b3c9a168",
+  "platform": "javascript",
+  "logger": "",
+  "fingerprint": [
+    "{{ default }}"
+  ],
+  "exception": {
+    "values": [
+      {
+        "stacktrace": {
+          "frames": [
+            {
+              "abs_path": "file:///tmp/test.html",
+              "filename": "/tmp/test.html",
+              "lineno": 49,
+              "colno": 19,
+              "in_app": false
+            },
+            {
+              "function": "Foo.testMethod",
+              "abs_path": "file:///tmp/test.html",
+              "filename": "/tmp/test.html",
+              "lineno": 43,
+              "colno": 7,
+              "in_app": false
+            },
+            {
+              "function": "aha",
+              "abs_path": "file:///tmp/test.html",
+              "filename": "/tmp/test.html",
+              "lineno": 38,
+              "colno": 5,
+              "in_app": false
+            },
+            {
+              "function": "eval",
+              "abs_path": "file:///tmp/test.html",
+              "filename": "/tmp/test.html",
+              "lineno": 1,
+              "colno": 1,
+              "in_app": false
+            },
+            {
+              "function": "test",
+              "abs_path": "file:///tmp/test.html",
+              "filename": "/tmp/test.html",
+              "lineno": 32,
+              "colno": 23,
+              "in_app": false
+            },
+            {
+              "function": "Array.map",
+              "abs_path": "<anonymous>",
+              "in_app": false,
+              "filename": "<anonymous>"
+            },
+            {
+              "abs_path": "file:///tmp/test.html",
+              "filename": "/tmp/test.html",
+              "lineno": 33,
+              "colno": 17,
+              "in_app": false
+            },
+            {
+              "function": "Object.callback",
+              "abs_path": "file:///tmp/test.html",
+              "filename": "/tmp/test.html",
+              "lineno": 24,
+              "colno": 7,
+              "in_app": false
+            },
+            {
+              "function": "callAnotherThing",
+              "abs_path": "file:///tmp/test.html",
+              "filename": "/tmp/test.html",
+              "lineno": 19,
+              "colno": 16,
+              "in_app": false
+            },
+            {
+              "function": "Object.aha",
+              "abs_path": "file:///tmp/test.html",
+              "filename": "/tmp/test.html",
+              "lineno": 18,
+              "colno": 13,
+              "in_app": false
+            }
+          ]
+        },
+        "type": "Error",
+        "value": "bad"
+      }
+    ]
+  },
+  "release": null
+}

+ 100 - 0
tests/sentry/grouping/grouping_inputs/javascript-xbrowser-edge.json

@@ -0,0 +1,100 @@
+{
+  "culprit": "aha(/Home/Desktop/test.html)",
+  "event_id": "e58aee200c8a486385bf5e1346b44865",
+  "platform": "javascript",
+  "logger": "",
+  "fingerprint": [
+    "{{ default }}"
+  ],
+  "exception": {
+    "values": [
+      {
+        "stacktrace": {
+          "frames": [
+            {
+              "function": "Anonymous function",
+              "abs_path": "file://mac/Home/Desktop/test.html",
+              "filename": "/Home/Desktop/test.html",
+              "lineno": 49,
+              "colno": 8,
+              "in_app": false
+            },
+            {
+              "function": "Foo.prototype.testMethod",
+              "abs_path": "file://mac/Home/Desktop/test.html",
+              "filename": "/Home/Desktop/test.html",
+              "lineno": 43,
+              "colno": 7,
+              "in_app": false
+            },
+            {
+              "function": "aha",
+              "abs_path": "file://mac/Home/Desktop/test.html",
+              "filename": "/Home/Desktop/test.html",
+              "lineno": 38,
+              "colno": 5,
+              "in_app": false
+            },
+            {
+              "function": "eval code",
+              "abs_path": "eval code",
+              "filename": "eval code",
+              "lineno": 1,
+              "colno": 1,
+              "in_app": false
+            },
+            {
+              "function": "test",
+              "abs_path": "file://mac/Home/Desktop/test.html",
+              "filename": "/Home/Desktop/test.html",
+              "lineno": 32,
+              "colno": 5,
+              "in_app": false
+            },
+            {
+              "function": "Array.prototype.map",
+              "abs_path": "native code",
+              "in_app": false,
+              "filename": "native code"
+            },
+            {
+              "function": "Anonymous function",
+              "abs_path": "file://mac/Home/Desktop/test.html",
+              "filename": "/Home/Desktop/test.html",
+              "lineno": 33,
+              "colno": 7,
+              "in_app": false
+            },
+            {
+              "function": "callback",
+              "abs_path": "file://mac/Home/Desktop/test.html",
+              "filename": "/Home/Desktop/test.html",
+              "lineno": 24,
+              "colno": 7,
+              "in_app": false
+            },
+            {
+              "function": "callAnotherThing",
+              "abs_path": "file://mac/Home/Desktop/test.html",
+              "filename": "/Home/Desktop/test.html",
+              "lineno": 17,
+              "colno": 6,
+              "in_app": false
+            },
+            {
+              "function": "aha",
+              "abs_path": "file://mac/Home/Desktop/test.html",
+              "filename": "/Home/Desktop/test.html",
+              "lineno": 18,
+              "colno": 7,
+              "in_app": false
+            }
+          ]
+        },
+        "type": "Error",
+        "value": "bad"
+      }
+    ]
+  },
+  "release": null
+}

+ 92 - 0
tests/sentry/grouping/grouping_inputs/javascript-xbrowser-firefox.json

@@ -0,0 +1,92 @@
+{
+  "culprit": "aha(/private/tmp/test.html)",
+  "event_id": "52aeea2ece28426596fa82799418215c",
+  "platform": "javascript",
+  "logger": "",
+  "fingerprint": [
+    "{{ default }}"
+  ],
+  "exception": {
+    "values": [
+      {
+        "stacktrace": {
+          "frames": [
+            {
+              "abs_path": "file:///private/tmp/test.html",
+              "filename": "/private/tmp/test.html",
+              "lineno": 49,
+              "colno": 19,
+              "in_app": false
+            },
+            {
+              "function": "testMethod",
+              "abs_path": "file:///private/tmp/test.html",
+              "filename": "/private/tmp/test.html",
+              "lineno": 43,
+              "colno": 7,
+              "in_app": false
+            },
+            {
+              "function": "aha",
+              "abs_path": "file:///private/tmp/test.html",
+              "filename": "/private/tmp/test.html",
+              "lineno": 38,
+              "colno": 5,
+              "in_app": false
+            },
+            {
+              "abs_path": "file:///private/tmp/test.html",
+              "filename": "/private/tmp/test.html",
+              "lineno": 1,
+              "colno": 1,
+              "in_app": false
+            },
+            {
+              "function": "test",
+              "abs_path": "file:///private/tmp/test.html",
+              "filename": "/private/tmp/test.html",
+              "lineno": 32,
+              "colno": 23,
+              "in_app": false
+            },
+            {
+              "function": "test/<",
+              "abs_path": "file:///private/tmp/test.html",
+              "filename": "/private/tmp/test.html",
+              "lineno": 33,
+              "colno": 17,
+              "in_app": false
+            },
+            {
+              "function": "callback",
+              "abs_path": "file:///private/tmp/test.html",
+              "filename": "/private/tmp/test.html",
+              "lineno": 24,
+              "colno": 7,
+              "in_app": false
+            },
+            {
+              "function": "callAnotherThing",
+              "abs_path": "file:///private/tmp/test.html",
+              "filename": "/private/tmp/test.html",
+              "lineno": 19,
+              "colno": 16,
+              "in_app": false
+            },
+            {
+              "function": "aha",
+              "abs_path": "file:///private/tmp/test.html",
+              "filename": "/private/tmp/test.html",
+              "lineno": 18,
+              "colno": 13,
+              "in_app": false
+            }
+          ]
+        },
+        "type": "Error",
+        "value": "bad"
+      }
+    ]
+  },
+  "release": null
+}

Some files were not shown because too many files changed in this diff