6 лет назад · 759e2905f3
--- a/src/sentry/options/defaults.py
+++ b/src/sentry/options/defaults.py
@@ -135,7 +135,7 @@ register('vsts.client-id', flags=FLAG_PRIORITIZE_DISK)
 
				 register('vsts.client-secret', flags=FLAG_PRIORITIZE_DISK)
			
 
				 
			
 
				 # Snuba
			
 
				-register('snuba.use_group_id_column', default=False)
			
 
				+register('snuba.use_group_id_column', default=True)
			
 
				 register('snuba.search.max-pre-snuba-candidates', default=500)
			
 
				 register('snuba.search.chunk-growth-rate', default=1.5)
			
 
				 register('snuba.search.max-chunk-size', default=2000)
			
--- a/src/sentry/search/snuba/backend.py
+++ b/src/sentry/search/snuba/backend.py
@@ -6,7 +6,6 @@ import logging
 
				 import math
			
 
				 import pytz
			
 
				 import time
			
 
				-from collections import defaultdict
			
 
				 from datetime import timedelta, datetime
			
 
				 
			
 
				 from django.utils import timezone
			
@@ -14,7 +13,7 @@ from django.utils import timezone
 
				 from sentry import options
			
 
				 from sentry.api.paginator import SequencePaginator, Paginator
			
 
				 from sentry.event_manager import ALLOWED_FUTURE_DELTA
			
 
				-from sentry.models import Release, Group, GroupEnvironment, GroupHash
			
 
				+from sentry.models import Release, Group, GroupEnvironment
			
 
				 from sentry.search.django import backend as ds
			
 
				 from sentry.utils import snuba, metrics
			
 
				 from sentry.utils.dates import to_timestamp
			
@@ -37,14 +36,14 @@ def snuba_str_to_datetime(d):
 
				 
			
 
				 
			
 
				 def calculate_priority_cursor(data):
			
 
				-    times_seen = sum(data['times_seen'])
			
 
				-    last_seen = max(int(to_timestamp(snuba_str_to_datetime(d)) * 1000) for d in data['last_seen'])
			
 
				+    times_seen = data['times_seen']
			
 
				+    last_seen = int(to_timestamp(snuba_str_to_datetime(data['last_seen'])) * 1000)
			
 
				     return ((math.log(times_seen) * 600) + last_seen)
			
 
				 
			
 
				 
			
 
				-def _datetime_cursor_calculator(field, fn):
			
 
				+def _datetime_cursor_calculator(field):
			
 
				     def calculate(data):
			
 
				-        datetime = fn(snuba_str_to_datetime(d) for d in data[field])
			
 
				+        datetime = snuba_str_to_datetime(data[field])
			
 
				         return int(to_timestamp(datetime) * 1000)
			
 
				 
			
 
				     return calculate
			
@@ -60,13 +59,13 @@ sort_strategies = {
 
				         'priority', ['last_seen', 'times_seen'], calculate_priority_cursor,
			
 
				     ),
			
 
				     'date': (
			
 
				-        'last_seen', [], _datetime_cursor_calculator('last_seen', max),
			
 
				+        'last_seen', [], _datetime_cursor_calculator('last_seen'),
			
 
				     ),
			
 
				     'new': (
			
 
				-        'first_seen', [], _datetime_cursor_calculator('first_seen', min),
			
 
				+        'first_seen', [], _datetime_cursor_calculator('first_seen'),
			
 
				     ),
			
 
				     'freq': (
			
 
				-        'times_seen', [], lambda data: sum(data['times_seen']),
			
 
				+        'times_seen', [], lambda data: data['times_seen'],
			
 
				     ),
			
 
				 }
			
 
				 
			
@@ -221,29 +220,25 @@ class SnubaSearchBackend(ds.DjangoSearchBackend):
 
				                 parameters,
			
 
				             )
			
 
				 
			
 
				-        # maximum number of GroupHashes to send down to Snuba,
			
 
				-        # if more GroupHash candidates are found, a "bare" Snuba
			
 
				+        # maximum number of Group IDs to send down to Snuba,
			
 
				+        # if more Group ID candidates are found, a "bare" Snuba
			
 
				         # search is performed and the result groups are then
			
 
				         # post-filtered via queries to the Sentry DB
			
 
				         max_pre_snuba_candidates = options.get('snuba.search.max-pre-snuba-candidates')
			
 
				 
			
 
				         # pre-filter query
			
 
				-        candidate_hashes = None
			
 
				+        candidate_ids = None
			
 
				         if max_pre_snuba_candidates and limit <= max_pre_snuba_candidates:
			
 
				-            candidate_hashes = dict(
			
 
				-                GroupHash.objects.filter(
			
 
				-                    group__in=group_queryset
			
 
				-                ).values_list(
			
 
				-                    'hash', 'group_id'
			
 
				-                )[:max_pre_snuba_candidates + 1]
			
 
				+            candidate_ids = list(
			
 
				+                group_queryset.values_list('id', flat=True)[:max_pre_snuba_candidates + 1]
			
 
				             )
			
 
				-            metrics.timing('snuba.search.num_candidates', len(candidate_hashes))
			
 
				+            metrics.timing('snuba.search.num_candidates', len(candidate_ids))
			
 
				 
			
 
				-            if not candidate_hashes:
			
 
				+            if not candidate_ids:
			
 
				                 # no matches could possibly be found from this point on
			
 
				                 metrics.incr('snuba.search.no_candidates')
			
 
				                 return Paginator(Group.objects.none()).get_result()
			
 
				-            elif len(candidate_hashes) > max_pre_snuba_candidates:
			
 
				+            elif len(candidate_ids) > max_pre_snuba_candidates:
			
 
				                 # If the pre-filter query didn't include anything to significantly
			
 
				                 # filter down the number of results (from 'first_release', 'query',
			
 
				                 # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
			
@@ -254,7 +249,7 @@ class SnubaSearchBackend(ds.DjangoSearchBackend):
 
				                 # this queryset to the results from Snuba, which we call
			
 
				                 # post-filtering.
			
 
				                 metrics.incr('snuba.search.too_many_candidates')
			
 
				-                candidate_hashes = None
			
 
				+                candidate_ids = None
			
 
				 
			
 
				         sort, extra_aggregations, score_fn = sort_strategies[sort_by]
			
 
				 
			
@@ -277,7 +272,7 @@ class SnubaSearchBackend(ds.DjangoSearchBackend):
 
				         # to answer the query (or hit the end of possible results). We do
			
 
				         # this because a common case for search is to return 100 groups
			
 
				         # sorted by `last_seen`, and we want to avoid returning all of
			
 
				-        # a project's hashes and then post-sorting them all in Postgres
			
 
				+        # a project's groups and then post-sorting them all in Postgres
			
 
				         # when typically the first N results will do.
			
 
				         while (time.time() - time_start) < max_time:
			
 
				             num_chunks += 1
			
@@ -285,8 +280,8 @@ class SnubaSearchBackend(ds.DjangoSearchBackend):
 
				             # grow the chunk size on each iteration to account for huge projects
			
 
				             # and weird queries, up to a max size
			
 
				             chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
			
 
				-            # but if we have candidate_hashes always query for at least that many items
			
 
				-            chunk_limit = max(chunk_limit, len(candidate_hashes) if candidate_hashes else 0)
			
 
				+            # but if we have candidate_ids always query for at least that many items
			
 
				+            chunk_limit = max(chunk_limit, len(candidate_ids) if candidate_ids else 0)
			
 
				 
			
 
				             # {group_id: group_score, ...}
			
 
				             snuba_groups, more_results = snuba_search(
			
@@ -298,7 +293,7 @@ class SnubaSearchBackend(ds.DjangoSearchBackend):
 
				                 sort=sort,
			
 
				                 extra_aggregations=extra_aggregations,
			
 
				                 score_fn=score_fn,
			
 
				-                candidate_hashes=candidate_hashes,
			
 
				+                candidate_ids=candidate_ids,
			
 
				                 limit=chunk_limit,
			
 
				                 offset=offset,
			
 
				                 **parameters
			
@@ -309,7 +304,7 @@ class SnubaSearchBackend(ds.DjangoSearchBackend):
 
				             if not snuba_groups:
			
 
				                 break
			
 
				 
			
 
				-            if candidate_hashes:
			
 
				+            if candidate_ids:
			
 
				                 # pre-filtered candidates were passed down to Snuba,
			
 
				                 # so we're finished with filtering and these are the
			
 
				                 # only results
			
@@ -350,7 +345,7 @@ class SnubaSearchBackend(ds.DjangoSearchBackend):
 
				             # something other than partially leak internal paginator logic up to here.
			
 
				             # Or make separate Paginator implementation just for Snuba search?
			
 
				             if cursor is not None \
			
 
				-                    and not candidate_hashes \
			
 
				+                    and not candidate_ids \
			
 
				                     and more_results:
			
 
				                 if cursor.is_prev and min_score < cursor.value:
			
 
				                     continue
			
@@ -367,7 +362,7 @@ class SnubaSearchBackend(ds.DjangoSearchBackend):
 
				             # * we started with Postgres candidates and so only do one Snuba query max
			
 
				             # * the paginator is returning enough results to satisfy the query (>= the limit)
			
 
				             # * there are no more groups in Snuba to post-filter
			
 
				-            if candidate_hashes \
			
 
				+            if candidate_ids \
			
 
				                     or len(paginator_results.results) >= limit \
			
 
				                     or not more_results:
			
 
				                 break
			
@@ -381,7 +376,7 @@ class SnubaSearchBackend(ds.DjangoSearchBackend):
 
				 
			
 
				 
			
 
				 def snuba_search(project_id, environment_id, tags, start, end,
			
 
				-                 sort, extra_aggregations, score_fn, candidate_hashes,
			
 
				+                 sort, extra_aggregations, score_fn, candidate_ids,
			
 
				                  limit, offset, **parameters):
			
 
				     """
			
 
				     This function doesn't strictly benefit from or require being pulled out of the main
			
@@ -402,8 +397,8 @@ def snuba_search(project_id, environment_id, tags, start, end,
 
				     if environment_id is not None:
			
 
				         filters['environment'] = [environment_id]
			
 
				 
			
 
				-    if candidate_hashes is not None:
			
 
				-        filters['primary_hash'] = candidate_hashes.keys()
			
 
				+    if candidate_ids is not None:
			
 
				+        filters['issue'] = candidate_ids
			
 
				 
			
 
				     having = SnubaConditionBuilder({
			
 
				         'age_from': ScalarCondition('first_seen', '>'),
			
@@ -434,17 +429,17 @@ def snuba_search(project_id, environment_id, tags, start, end,
 
				     for alias in required_aggregations:
			
 
				         aggregations.append(aggregation_defs[alias] + [alias])
			
 
				 
			
 
				-    # {hash -> {<agg_alias> -> <agg_value>,
			
 
				-    #           <agg_alias> -> <agg_value>,
			
 
				-    #           ...},
			
 
				+    # {group_id -> {<agg_alias> -> <agg_value>,
			
 
				+    #               <agg_alias> -> <agg_value>,
			
 
				+    #               ...},
			
 
				     #  ...}
			
 
				     # _OR_ if there's only one <agg_alias> in use
			
 
				-    # {hash -> <agg_value>,
			
 
				+    # {group_id -> <agg_value>,
			
 
				     #  ...}
			
 
				     snuba_results = snuba.query(
			
 
				         start=start,
			
 
				         end=end,
			
 
				-        groupby=['primary_hash'],
			
 
				+        groupby=['issue'],
			
 
				         conditions=conditions,
			
 
				         having=having,
			
 
				         filter_keys=filters,
			
@@ -454,57 +449,32 @@ def snuba_search(project_id, environment_id, tags, start, end,
 
				         limit=limit + 1,
			
 
				         offset=offset,
			
 
				     )
			
 
				-    metrics.timing('snuba.search.num_result_hashes', len(snuba_results.keys()))
			
 
				+    metrics.timing('snuba.search.num_result_groups', len(snuba_results.keys()))
			
 
				     more_results = len(snuba_results) == limit + 1
			
 
				 
			
 
				-    # {hash -> group_id, ...}
			
 
				-    if candidate_hashes is not None:
			
 
				-        # any hash coming back had to come from our candidate set
			
 
				-        hash_to_group = candidate_hashes
			
 
				-    else:
			
 
				-        hash_to_group = dict(
			
 
				-            GroupHash.objects.filter(
			
 
				-                project_id=project_id,
			
 
				-                hash__in=snuba_results.keys()
			
 
				-            ).values_list(
			
 
				-                'hash', 'group_id'
			
 
				-            )
			
 
				-        )
			
 
				-
			
 
				-    # {group_id -> {field1: [...all values from field1 for all hashes...],
			
 
				-    #               field2: [...all values from field2 for all hashes...]
			
 
				+    # {group_id -> {field1: value1,
			
 
				+    #               field2: value2,
			
 
				     #               ...}
			
 
				     #  ...}
			
 
				     group_data = {}
			
 
				-    for hash, obj in snuba_results.items():
			
 
				-        if hash in hash_to_group:
			
 
				-            group_id = hash_to_group[hash]
			
 
				-
			
 
				-            if group_id not in group_data:
			
 
				-                group_data[group_id] = defaultdict(list)
			
 
				-
			
 
				-            dest = group_data[group_id]
			
 
				-
			
 
				-            # NOTE: The Snuba utility code is trying to be helpful by collapsing
			
 
				-            # results with only one aggregate down to the single value. It's a
			
 
				-            # bit of a hack that we then immediately undo that work here, but
			
 
				-            # many other callers get value out of that functionality. If we see
			
 
				-            # this pattern again we should either add an option to opt-out of
			
 
				-            # the 'help' here or remove it from the Snuba code altogether.
			
 
				-            if len(required_aggregations) == 1:
			
 
				-                alias = list(required_aggregations)[0]
			
 
				-                dest[alias].append(obj)
			
 
				-            else:
			
 
				-                for k, v in obj.items():
			
 
				-                    dest[k].append(v)
			
 
				+    for group_id, obj in snuba_results.items():
			
 
				+        if group_id not in group_data:
			
 
				+            group_data[group_id] = {}
			
 
				+
			
 
				+        dest = group_data[group_id]
			
 
				+
			
 
				+        # NOTE: The Snuba utility code is trying to be helpful by collapsing
			
 
				+        # results with only one aggregate down to the single value. It's a
			
 
				+        # bit of a hack that we then immediately undo that work here, but
			
 
				+        # many other callers get value out of that functionality. If we see
			
 
				+        # this pattern again we should either add an option to opt-out of
			
 
				+        # the 'help' here or remove it from the Snuba code altogether.
			
 
				+        if len(required_aggregations) == 1:
			
 
				+            alias = list(required_aggregations)[0]
			
 
				+            dest[alias] = obj
			
 
				         else:
			
 
				-            logger.warning(
			
 
				-                'search.hash_not_found',
			
 
				-                extra={
			
 
				-                    'project_id': project_id,
			
 
				-                    'hash': hash,
			
 
				-                },
			
 
				-            )
			
 
				+            for k, v in obj.items():
			
 
				+                dest[k] = v
			
 
				 
			
 
				     return (
			
 
				         list(
			
--- a/tests/snuba/search/test_backend.py
+++ b/tests/snuba/search/test_backend.py
@@ -565,6 +565,8 @@ class SnubaSearchTest(SnubaTestCase):
 
				             }
			
 
				         )
			
 
				 
			
 
				+        self.group1.update(last_seen=self.group1.last_seen + timedelta(days=1))
			
 
				+
			
 
				         results = self.backend.query(
			
 
				             self.project,
			
 
				             environment=self.environments['production'],
			
@@ -575,10 +577,20 @@ class SnubaSearchTest(SnubaTestCase):
 
				 
			
 
				         results = self.backend.query(
			
 
				             self.project,
			
 
				+            date_to=self.group1.last_seen + timedelta(days=1),
			
 
				             environment=self.environments['development'],
			
 
				             last_seen_from=self.group1.last_seen,
			
 
				             last_seen_from_inclusive=False,
			
 
				         )
			
 
				+        assert set(results) == set()
			
 
				+
			
 
				+        results = self.backend.query(
			
 
				+            self.project,
			
 
				+            date_to=self.group1.last_seen + timedelta(days=1),
			
 
				+            environment=self.environments['development'],
			
 
				+            last_seen_from=self.group1.last_seen,
			
 
				+            last_seen_from_inclusive=True,
			
 
				+        )
			
 
				         assert set(results) == set([self.group1])
			
 
				 
			
 
				     def test_date_filter(self):
			
@@ -796,10 +808,10 @@ class SnubaSearchTest(SnubaTestCase):
 
				             'end': Any(datetime),
			
 
				             'filter_keys': {
			
 
				                 'project_id': [self.project.id],
			
 
				-                'primary_hash': [u'513772ee53011ad9f4dc374b2d34d0e9']
			
 
				+                'issue': [self.group1.id]
			
 
				             },
			
 
				             'referrer': 'search',
			
 
				-            'groupby': ['primary_hash'],
			
 
				+            'groupby': ['issue'],
			
 
				             'conditions': [],
			
 
				             'limit': limit,
			
 
				             'offset': 0,
			
--- a/tests/snuba/tagstore/test_tagstore_backend.py
+++ b/tests/snuba/tagstore/test_tagstore_backend.py
@@ -42,7 +42,7 @@ class TagStorageTest(SnubaTestCase):
 
				         data = json.dumps([{
			
 
				             'event_id': six.text_type(r) * 32,
			
 
				             'primary_hash': hash1,
			
 
				-            'group_id': int(hash1[:16], 16),
			
 
				+            'group_id': self.proj1group1.id,
			
 
				             'project_id': self.proj1.id,
			
 
				             'message': 'message 1',
			
 
				             'platform': 'python',
			
@@ -64,7 +64,7 @@ class TagStorageTest(SnubaTestCase):
 
				         } for r in [1, 2]] + [{
			
 
				             'event_id': '3' * 32,
			
 
				             'primary_hash': hash2,
			
 
				-            'group_id': int(hash2[:16], 16),
			
 
				+            'group_id': self.proj1group2.id,
			
 
				             'project_id': self.proj1.id,
			
 
				             'message': 'message 2',
			
 
				             'platform': 'python',
			
--- a/tests/snuba/test_snuba.py
+++ b/tests/snuba/test_snuba.py
@@ -5,6 +5,7 @@ import pytest
 
				 import time
			
 
				 import uuid
			
 
				 
			
 
				+from sentry import options
			
 
				 from sentry.models import GroupHash, GroupHashTombstone
			
 
				 from sentry.testutils import SnubaTestCase
			
 
				 from sentry.utils import snuba
			
@@ -86,6 +87,12 @@ class SnubaTest(SnubaTestCase):
 
				         })
			
 
				 
			
 
				     def test_project_issues_with_tombstones(self):
			
 
				+        # Nothing to be done if we're using `group_id`.
			
 
				+        # When this option is the default we can remove
			
 
				+        # this test.
			
 
				+        if options.get('snuba.use_group_id_column'):
			
 
				+            return
			
 
				+
			
 
				         base_time = datetime.utcnow()
			
 
				         hash = 'a' * 32
			
 
				 
			
--- a/tests/snuba/tsdb/test_tsdb_backend.py
+++ b/tests/snuba/tsdb/test_tsdb_backend.py
@@ -109,7 +109,7 @@ class SnubaTSDBTest(TestCase):
 
				         data = json.dumps([{
			
 
				             'event_id': (six.text_type(r) * 32)[:32],
			
 
				             'primary_hash': [hash1, hash2][(r // 600) % 2],  # Switch every 10 mins
			
 
				-            'group_id': int([hash1, hash2][(r // 600) % 2][:16], 16),
			
 
				+            'group_id': [self.proj1group1.id, self.proj1group2.id][(r // 600) % 2],
			
 
				             'project_id': self.proj1.id,
			
 
				             'message': 'message 1',
			
 
				             'platform': 'python',