
perf(similarity): use multithreaded pool to do nodestore calls concurrently (#73394)

Use a thread pool for Bigtable reads, chunking the node keys into batches of 5
and running up to 5 threads concurrently.
Josh Ferge, 4 days ago
commit ba7fcabdee
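
The pattern described above — splitting the nodestore keys into batches of 5 and fetching each batch on a worker thread — looks roughly like the following standalone sketch of the code added to utils.py below (fetch_in_chunks and fetch_chunk are illustrative names; fetch_chunk stands in for the retried nodestore.backend.get_multi call):

from concurrent.futures import ThreadPoolExecutor, as_completed

CHUNK_SIZE = 5    # node keys per Bigtable read
MAX_WORKERS = 5   # concurrent nodestore calls

def fetch_in_chunks(node_keys, fetch_chunk):
    # Split the key list into fixed-size batches.
    chunks = [node_keys[i : i + CHUNK_SIZE] for i in range(0, len(node_keys), CHUNK_SIZE)]

    results = {}
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # One future per batch; merge each batch's dict as it completes.
        futures = [executor.submit(fetch_chunk, chunk) for chunk in chunks]
        for future in as_completed(futures):
            results.update(future.result())
    return results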

+ 6 - 0
src/sentry/options/defaults.py

@@ -2603,3 +2603,9 @@ register(
     default=0.0,
     flags=FLAG_AUTOMATOR_MODIFIABLE,
 )
+
+register(
+    "similarity.backfill_nodestore_use_multithread",
+    default=False,
+    flags=FLAG_AUTOMATOR_MODIFIABLE,
+)
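
The option defaults to False, so the threaded path is opt-in. At runtime the backfill reads it with options.get (see utils.py below); in tests it is flipped with the override_options helper, as the new test in this commit does. A minimal sketch (the import path and test name here are assumptions for illustration):

from sentry.testutils.helpers import override_options

@override_options({"similarity.backfill_nodestore_use_multithread": True})
def test_backfill_uses_threaded_nodestore_reads(self):
    # With the option overridden to True, lookup_group_data_stacktrace_bulk
    # dispatches to make_nodestore_call_multithreaded instead of making a
    # single get_multi call for all node keys.
    ...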

+ 45 - 19
src/sentry/tasks/embeddings_grouping/utils.py

@@ -1,5 +1,6 @@
 import logging
 import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import asdict
 from datetime import UTC, datetime, timedelta
 from typing import Any, TypedDict
@@ -11,7 +12,7 @@ from redis.client import StrictRedis
 from rediscluster import RedisCluster
 from snuba_sdk import Column, Condition, Entity, Limit, Op, Query, Request
 
-from sentry import features, nodestore
+from sentry import features, nodestore, options
 from sentry.conf.server import SEER_SIMILARITY_MODEL_VERSION
 from sentry.eventstore.models import Event
 from sentry.grouping.grouping_info import get_grouping_info
@@ -377,26 +378,11 @@ def update_groups(project, seer_response, group_id_batch_filtered, group_hashes_
     )
 
 
-@metrics.wraps(f"{BACKFILL_NAME}.lookup_event_bulk", sample_rate=1.0)
-@sentry_sdk.tracing.trace
-def lookup_group_data_stacktrace_bulk(
-    project: Project, rows: list[GroupEventRow]
-) -> dict[int, Event]:
-    project_id = project.id
-    node_id_to_group_data = {
-        Event.generate_node_id(project_id, event_id=row["event_id"]): (
-            row["event_id"],
-            row["group_id"],
-        )
-        for row in rows
-    }
-
-    groups_to_event = {}
-
+def _make_nodestore_call(project, node_keys):
     try:
         bulk_data = _retry_operation(
             nodestore.backend.get_multi,
-            list(node_id_to_group_data.keys()),
+            node_keys,
             retries=3,
             delay=2,
         )
@@ -404,7 +390,7 @@ def lookup_group_data_stacktrace_bulk(
         extra = {
             "organization_id": project.organization.id,
             "project_id": project.id,
-            "group_data": json.dumps(rows),
+            "node_keys": json.dumps(node_keys),
             "error": e.message,
         }
         logger.exception(
@@ -413,6 +399,46 @@ def lookup_group_data_stacktrace_bulk(
         )
         raise
 
+    return bulk_data
+
+
+def make_nodestore_call_multithreaded(project, node_keys):
+    def process_chunk(chunk):
+        return _make_nodestore_call(project, chunk)
+
+    chunk_size = 5
+    chunks = [node_keys[i : i + chunk_size] for i in range(0, len(node_keys), chunk_size)]
+
+    bulk_data = {}
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        future_to_chunk = {executor.submit(process_chunk, chunk): chunk for chunk in chunks}
+        for future in as_completed(future_to_chunk):
+            bulk_data.update(future.result())
+
+    return bulk_data
+
+
+@metrics.wraps(f"{BACKFILL_NAME}.lookup_event_bulk", sample_rate=1.0)
+@sentry_sdk.tracing.trace
+def lookup_group_data_stacktrace_bulk(
+    project: Project, rows: list[GroupEventRow]
+) -> dict[int, Event]:
+    project_id = project.id
+    node_id_to_group_data = {
+        Event.generate_node_id(project_id, event_id=row["event_id"]): (
+            row["event_id"],
+            row["group_id"],
+        )
+        for row in rows
+    }
+
+    groups_to_event = {}
+
+    if options.get("similarity.backfill_nodestore_use_multithread"):
+        bulk_data = make_nodestore_call_multithreaded(project, list(node_id_to_group_data.keys()))
+    else:
+        bulk_data = _make_nodestore_call(project, list(node_id_to_group_data.keys()))
+
     for node_id, data in bulk_data.items():
         if node_id in node_id_to_group_data:
             event_id, group_id = (
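
Note that _make_nodestore_call delegates retrying to the existing _retry_operation helper, whose implementation is not part of this diff. Judging from the call site (retries=3, delay=2), it retries the wrapped nodestore call a fixed number of times with a fixed delay between attempts; a plausible shape, not the actual implementation:

import time

def _retry_operation(operation, *args, retries, delay, **kwargs):
    # Call `operation`, retrying up to `retries` additional times and
    # sleeping `delay` seconds between attempts.
    for attempt in range(retries + 1):
        try:
            return operation(*args, **kwargs)
        except Exception:
            if attempt == retries:
                raise
            time.sleep(delay)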

+ 29 - 0
tests/sentry/tasks/test_backfill_seer_grouping_records.py

@@ -196,6 +196,35 @@ class TestBackfillSeerGroupingRecords(SnubaTestCase, TestCase):
             "backfill_grouping_records._lookup_event_bulk.hit_ratio", 100, sample_rate=1.0
         )
 
+    @patch("sentry.tasks.embeddings_grouping.utils.metrics")
+    @override_options({"similarity.backfill_nodestore_use_multithread": True})
+    def test_lookup_group_data_stacktrace_bulk_success_multithread(self, mock_metrics):
+        """Test successful bulk group data and stacktrace lookup"""
+        rows, events = self.bulk_rows, self.bulk_events
+        nodestore_results, _ = get_events_from_nodestore(
+            self.project, rows, self.group_hashes.keys()
+        )
+
+        expected_group_data = [
+            CreateGroupingRecordData(
+                group_id=event.group.id,
+                hash=self.group_hashes[event.group.id],
+                project_id=self.project.id,
+                message=event.title,
+                exception_type=get_path(event.data, "exception", "values", -1, "type"),
+            )
+            for event in events
+        ]
+        expected_stacktraces = [
+            f'Error{i}: error with value\n  File "function_{i}.py", function function_{i}'
+            for i in range(5)
+        ]
+        assert nodestore_results["data"] == expected_group_data
+        assert nodestore_results["stacktrace_list"] == expected_stacktraces
+        mock_metrics.gauge.assert_called_with(
+            "backfill_grouping_records._lookup_event_bulk.hit_ratio", 100, sample_rate=1.0
+        )
+
     @patch("time.sleep", return_value=None)
     @patch("sentry.tasks.embeddings_grouping.utils.logger")
     @patch("sentry.nodestore.backend.get_multi")