Просмотр исходного кода

ref(snuba): Use issue API for Snuba search (#10368)

Brett Hoerner 6 лет назад
Родитель
Сommit
759e2905f3

+ 1 - 1
src/sentry/options/defaults.py

@@ -135,7 +135,7 @@ register('vsts.client-id', flags=FLAG_PRIORITIZE_DISK)
 register('vsts.client-secret', flags=FLAG_PRIORITIZE_DISK)
 
 # Snuba
-register('snuba.use_group_id_column', default=False)
+register('snuba.use_group_id_column', default=True)
 register('snuba.search.max-pre-snuba-candidates', default=500)
 register('snuba.search.chunk-growth-rate', default=1.5)
 register('snuba.search.max-chunk-size', default=2000)

+ 52 - 82
src/sentry/search/snuba/backend.py

@@ -6,7 +6,6 @@ import logging
 import math
 import pytz
 import time
-from collections import defaultdict
 from datetime import timedelta, datetime
 
 from django.utils import timezone
@@ -14,7 +13,7 @@ from django.utils import timezone
 from sentry import options
 from sentry.api.paginator import SequencePaginator, Paginator
 from sentry.event_manager import ALLOWED_FUTURE_DELTA
-from sentry.models import Release, Group, GroupEnvironment, GroupHash
+from sentry.models import Release, Group, GroupEnvironment
 from sentry.search.django import backend as ds
 from sentry.utils import snuba, metrics
 from sentry.utils.dates import to_timestamp
@@ -37,14 +36,14 @@ def snuba_str_to_datetime(d):
 
 
 def calculate_priority_cursor(data):
-    times_seen = sum(data['times_seen'])
-    last_seen = max(int(to_timestamp(snuba_str_to_datetime(d)) * 1000) for d in data['last_seen'])
+    times_seen = data['times_seen']
+    last_seen = int(to_timestamp(snuba_str_to_datetime(data['last_seen'])) * 1000)
     return ((math.log(times_seen) * 600) + last_seen)
 
 
-def _datetime_cursor_calculator(field, fn):
+def _datetime_cursor_calculator(field):
     def calculate(data):
-        datetime = fn(snuba_str_to_datetime(d) for d in data[field])
+        datetime = snuba_str_to_datetime(data[field])
         return int(to_timestamp(datetime) * 1000)
 
     return calculate
@@ -60,13 +59,13 @@ sort_strategies = {
         'priority', ['last_seen', 'times_seen'], calculate_priority_cursor,
     ),
     'date': (
-        'last_seen', [], _datetime_cursor_calculator('last_seen', max),
+        'last_seen', [], _datetime_cursor_calculator('last_seen'),
     ),
     'new': (
-        'first_seen', [], _datetime_cursor_calculator('first_seen', min),
+        'first_seen', [], _datetime_cursor_calculator('first_seen'),
     ),
     'freq': (
-        'times_seen', [], lambda data: sum(data['times_seen']),
+        'times_seen', [], lambda data: data['times_seen'],
     ),
 }
 
@@ -221,29 +220,25 @@ class SnubaSearchBackend(ds.DjangoSearchBackend):
                 parameters,
             )
 
-        # maximum number of GroupHashes to send down to Snuba,
-        # if more GroupHash candidates are found, a "bare" Snuba
+        # maximum number of Group IDs to send down to Snuba,
+        # if more Group ID candidates are found, a "bare" Snuba
         # search is performed and the result groups are then
         # post-filtered via queries to the Sentry DB
         max_pre_snuba_candidates = options.get('snuba.search.max-pre-snuba-candidates')
 
         # pre-filter query
-        candidate_hashes = None
+        candidate_ids = None
         if max_pre_snuba_candidates and limit <= max_pre_snuba_candidates:
-            candidate_hashes = dict(
-                GroupHash.objects.filter(
-                    group__in=group_queryset
-                ).values_list(
-                    'hash', 'group_id'
-                )[:max_pre_snuba_candidates + 1]
+            candidate_ids = list(
+                group_queryset.values_list('id', flat=True)[:max_pre_snuba_candidates + 1]
             )
-            metrics.timing('snuba.search.num_candidates', len(candidate_hashes))
+            metrics.timing('snuba.search.num_candidates', len(candidate_ids))
 
-            if not candidate_hashes:
+            if not candidate_ids:
                 # no matches could possibly be found from this point on
                 metrics.incr('snuba.search.no_candidates')
                 return Paginator(Group.objects.none()).get_result()
-            elif len(candidate_hashes) > max_pre_snuba_candidates:
+            elif len(candidate_ids) > max_pre_snuba_candidates:
                 # If the pre-filter query didn't include anything to significantly
                 # filter down the number of results (from 'first_release', 'query',
                 # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
@@ -254,7 +249,7 @@ class SnubaSearchBackend(ds.DjangoSearchBackend):
                 # this queryset to the results from Snuba, which we call
                 # post-filtering.
                 metrics.incr('snuba.search.too_many_candidates')
-                candidate_hashes = None
+                candidate_ids = None
 
         sort, extra_aggregations, score_fn = sort_strategies[sort_by]
 
@@ -277,7 +272,7 @@ class SnubaSearchBackend(ds.DjangoSearchBackend):
         # to answer the query (or hit the end of possible results). We do
         # this because a common case for search is to return 100 groups
         # sorted by `last_seen`, and we want to avoid returning all of
-        # a project's hashes and then post-sorting them all in Postgres
+        # a project's groups and then post-sorting them all in Postgres
         # when typically the first N results will do.
         while (time.time() - time_start) < max_time:
             num_chunks += 1
@@ -285,8 +280,8 @@ class SnubaSearchBackend(ds.DjangoSearchBackend):
             # grow the chunk size on each iteration to account for huge projects
             # and weird queries, up to a max size
             chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
-            # but if we have candidate_hashes always query for at least that many items
-            chunk_limit = max(chunk_limit, len(candidate_hashes) if candidate_hashes else 0)
+            # but if we have candidate_ids always query for at least that many items
+            chunk_limit = max(chunk_limit, len(candidate_ids) if candidate_ids else 0)
 
             # {group_id: group_score, ...}
             snuba_groups, more_results = snuba_search(
@@ -298,7 +293,7 @@ class SnubaSearchBackend(ds.DjangoSearchBackend):
                 sort=sort,
                 extra_aggregations=extra_aggregations,
                 score_fn=score_fn,
-                candidate_hashes=candidate_hashes,
+                candidate_ids=candidate_ids,
                 limit=chunk_limit,
                 offset=offset,
                 **parameters
@@ -309,7 +304,7 @@ class SnubaSearchBackend(ds.DjangoSearchBackend):
             if not snuba_groups:
                 break
 
-            if candidate_hashes:
+            if candidate_ids:
                 # pre-filtered candidates were passed down to Snuba,
                 # so we're finished with filtering and these are the
                 # only results
@@ -350,7 +345,7 @@ class SnubaSearchBackend(ds.DjangoSearchBackend):
             # something other than partially leak internal paginator logic up to here.
             # Or make separate Paginator implementation just for Snuba search?
             if cursor is not None \
-                    and not candidate_hashes \
+                    and not candidate_ids \
                     and more_results:
                 if cursor.is_prev and min_score < cursor.value:
                     continue
@@ -367,7 +362,7 @@ class SnubaSearchBackend(ds.DjangoSearchBackend):
             # * we started with Postgres candidates and so only do one Snuba query max
             # * the paginator is returning enough results to satisfy the query (>= the limit)
             # * there are no more groups in Snuba to post-filter
-            if candidate_hashes \
+            if candidate_ids \
                     or len(paginator_results.results) >= limit \
                     or not more_results:
                 break
@@ -381,7 +376,7 @@ class SnubaSearchBackend(ds.DjangoSearchBackend):
 
 
 def snuba_search(project_id, environment_id, tags, start, end,
-                 sort, extra_aggregations, score_fn, candidate_hashes,
+                 sort, extra_aggregations, score_fn, candidate_ids,
                  limit, offset, **parameters):
     """
     This function doesn't strictly benefit from or require being pulled out of the main
@@ -402,8 +397,8 @@ def snuba_search(project_id, environment_id, tags, start, end,
     if environment_id is not None:
         filters['environment'] = [environment_id]
 
-    if candidate_hashes is not None:
-        filters['primary_hash'] = candidate_hashes.keys()
+    if candidate_ids is not None:
+        filters['issue'] = candidate_ids
 
     having = SnubaConditionBuilder({
         'age_from': ScalarCondition('first_seen', '>'),
@@ -434,17 +429,17 @@ def snuba_search(project_id, environment_id, tags, start, end,
     for alias in required_aggregations:
         aggregations.append(aggregation_defs[alias] + [alias])
 
-    # {hash -> {<agg_alias> -> <agg_value>,
-    #           <agg_alias> -> <agg_value>,
-    #           ...},
+    # {group_id -> {<agg_alias> -> <agg_value>,
+    #               <agg_alias> -> <agg_value>,
+    #               ...},
     #  ...}
     # _OR_ if there's only one <agg_alias> in use
-    # {hash -> <agg_value>,
+    # {group_id -> <agg_value>,
     #  ...}
     snuba_results = snuba.query(
         start=start,
         end=end,
-        groupby=['primary_hash'],
+        groupby=['issue'],
         conditions=conditions,
         having=having,
         filter_keys=filters,
@@ -454,57 +449,32 @@ def snuba_search(project_id, environment_id, tags, start, end,
         limit=limit + 1,
         offset=offset,
     )
-    metrics.timing('snuba.search.num_result_hashes', len(snuba_results.keys()))
+    metrics.timing('snuba.search.num_result_groups', len(snuba_results.keys()))
     more_results = len(snuba_results) == limit + 1
 
-    # {hash -> group_id, ...}
-    if candidate_hashes is not None:
-        # any hash coming back had to come from our candidate set
-        hash_to_group = candidate_hashes
-    else:
-        hash_to_group = dict(
-            GroupHash.objects.filter(
-                project_id=project_id,
-                hash__in=snuba_results.keys()
-            ).values_list(
-                'hash', 'group_id'
-            )
-        )
-
-    # {group_id -> {field1: [...all values from field1 for all hashes...],
-    #               field2: [...all values from field2 for all hashes...]
+    # {group_id -> {field1: value1,
+    #               field2: value2,
     #               ...}
     #  ...}
     group_data = {}
-    for hash, obj in snuba_results.items():
-        if hash in hash_to_group:
-            group_id = hash_to_group[hash]
-
-            if group_id not in group_data:
-                group_data[group_id] = defaultdict(list)
-
-            dest = group_data[group_id]
-
-            # NOTE: The Snuba utility code is trying to be helpful by collapsing
-            # results with only one aggregate down to the single value. It's a
-            # bit of a hack that we then immediately undo that work here, but
-            # many other callers get value out of that functionality. If we see
-            # this pattern again we should either add an option to opt-out of
-            # the 'help' here or remove it from the Snuba code altogether.
-            if len(required_aggregations) == 1:
-                alias = list(required_aggregations)[0]
-                dest[alias].append(obj)
-            else:
-                for k, v in obj.items():
-                    dest[k].append(v)
+    for group_id, obj in snuba_results.items():
+        if group_id not in group_data:
+            group_data[group_id] = {}
+
+        dest = group_data[group_id]
+
+        # NOTE: The Snuba utility code is trying to be helpful by collapsing
+        # results with only one aggregate down to the single value. It's a
+        # bit of a hack that we then immediately undo that work here, but
+        # many other callers get value out of that functionality. If we see
+        # this pattern again we should either add an option to opt-out of
+        # the 'help' here or remove it from the Snuba code altogether.
+        if len(required_aggregations) == 1:
+            alias = list(required_aggregations)[0]
+            dest[alias] = obj
         else:
-            logger.warning(
-                'search.hash_not_found',
-                extra={
-                    'project_id': project_id,
-                    'hash': hash,
-                },
-            )
+            for k, v in obj.items():
+                dest[k] = v
 
     return (
         list(

+ 14 - 2
tests/snuba/search/test_backend.py

@@ -565,6 +565,8 @@ class SnubaSearchTest(SnubaTestCase):
             }
         )
 
+        self.group1.update(last_seen=self.group1.last_seen + timedelta(days=1))
+
         results = self.backend.query(
             self.project,
             environment=self.environments['production'],
@@ -575,10 +577,20 @@ class SnubaSearchTest(SnubaTestCase):
 
         results = self.backend.query(
             self.project,
+            date_to=self.group1.last_seen + timedelta(days=1),
             environment=self.environments['development'],
             last_seen_from=self.group1.last_seen,
             last_seen_from_inclusive=False,
         )
+        assert set(results) == set()
+
+        results = self.backend.query(
+            self.project,
+            date_to=self.group1.last_seen + timedelta(days=1),
+            environment=self.environments['development'],
+            last_seen_from=self.group1.last_seen,
+            last_seen_from_inclusive=True,
+        )
         assert set(results) == set([self.group1])
 
     def test_date_filter(self):
@@ -796,10 +808,10 @@ class SnubaSearchTest(SnubaTestCase):
             'end': Any(datetime),
             'filter_keys': {
                 'project_id': [self.project.id],
-                'primary_hash': [u'513772ee53011ad9f4dc374b2d34d0e9']
+                'issue': [self.group1.id]
             },
             'referrer': 'search',
-            'groupby': ['primary_hash'],
+            'groupby': ['issue'],
             'conditions': [],
             'limit': limit,
             'offset': 0,

+ 2 - 2
tests/snuba/tagstore/test_tagstore_backend.py

@@ -42,7 +42,7 @@ class TagStorageTest(SnubaTestCase):
         data = json.dumps([{
             'event_id': six.text_type(r) * 32,
             'primary_hash': hash1,
-            'group_id': int(hash1[:16], 16),
+            'group_id': self.proj1group1.id,
             'project_id': self.proj1.id,
             'message': 'message 1',
             'platform': 'python',
@@ -64,7 +64,7 @@ class TagStorageTest(SnubaTestCase):
         } for r in [1, 2]] + [{
             'event_id': '3' * 32,
             'primary_hash': hash2,
-            'group_id': int(hash2[:16], 16),
+            'group_id': self.proj1group2.id,
             'project_id': self.proj1.id,
             'message': 'message 2',
             'platform': 'python',

+ 7 - 0
tests/snuba/test_snuba.py

@@ -5,6 +5,7 @@ import pytest
 import time
 import uuid
 
+from sentry import options
 from sentry.models import GroupHash, GroupHashTombstone
 from sentry.testutils import SnubaTestCase
 from sentry.utils import snuba
@@ -86,6 +87,12 @@ class SnubaTest(SnubaTestCase):
         })
 
     def test_project_issues_with_tombstones(self):
+        # Nothing to be done if we're using `group_id`.
+        # When this option is the default we can remove
+        # this test.
+        if options.get('snuba.use_group_id_column'):
+            return
+
         base_time = datetime.utcnow()
         hash = 'a' * 32
 

+ 1 - 1
tests/snuba/tsdb/test_tsdb_backend.py

@@ -109,7 +109,7 @@ class SnubaTSDBTest(TestCase):
         data = json.dumps([{
             'event_id': (six.text_type(r) * 32)[:32],
             'primary_hash': [hash1, hash2][(r // 600) % 2],  # Switch every 10 mins
-            'group_id': int([hash1, hash2][(r // 600) % 2][:16], 16),
+            'group_id': [self.proj1group1.id, self.proj1group2.id][(r // 600) % 2],
             'project_id': self.proj1.id,
             'message': 'message 1',
             'platform': 'python',