
chore(spans): Multiprocess strategy for perf issue detection (#67619)

- Multiprocess strategy for performance issue detection
- Move segment fetching and producing out of the multiprocessing pool
into its own step, so the pooled function always does the same work
on each span.
- Also make the buffer window configurable so we can tune it as we
roll this out.
- Read project_id from the message header so we can discard spans faster
(https://github.com/getsentry/relay/pull/3320)
Shruthi, 11 months ago
Parent commit: 06cc5b8754

+ 1 - 0
src/sentry/consumers/__init__.py

@@ -356,6 +356,7 @@ KAFKA_CONSUMERS: Mapping[str, ConsumerDefinition] = {
     "detect-performance-issues": {
         "topic": Topic.BUFFERED_SEGMENTS,
         "strategy_factory": "sentry.spans.consumers.detect_performance_issues.factory.DetectPerformanceIssuesStrategyFactory",
+        "click_options": multiprocessing_options(default_max_batch_size=100),
     },
     **settings.SENTRY_KAFKA_CONSUMERS,
 }
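
multiprocessing_options attaches the standard pool-related CLI flags to this consumer; at runtime they are forwarded as keyword arguments to the strategy factory. A hedged equivalent by hand, using the same values the updated test further below passes (in production the values come from the click options):

    from sentry.spans.consumers.detect_performance_issues.factory import (
        DetectPerformanceIssuesStrategyFactory,
    )

    # Values mirror tests/sentry/spans/consumers/detect_performance_issues/
    # test_factory.py; only max_batch_size=100 is a committed default.
    factory = DetectPerformanceIssuesStrategyFactory(
        max_batch_size=1,
        max_batch_time=1,
        num_processes=2,
        input_block_size=1,
        output_block_size=1,
    )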

+ 6 - 0
src/sentry/options/defaults.py

@@ -2220,6 +2220,12 @@ register(
     default=[],
     flags=FLAG_PRIORITIZE_DISK | FLAG_AUTOMATOR_MODIFIABLE,
 )
+register(
+    "standalone-spans.buffer-window.seconds",
+    type=Int,
+    default=120,  # 2 minutes
+    flags=FLAG_PRIORITIZE_DISK | FLAG_AUTOMATOR_MODIFIABLE,
+)
 
 # Deobfuscate profiles using Symbolicator
 register(
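
Being a registered, automator-modifiable option, the window is read at call time and can be pinned in tests. A small sketch using the override helper the tests below already import:

    from sentry import options
    from sentry.testutils.helpers.options import override_options

    assert options.get("standalone-spans.buffer-window.seconds") == 120

    # Shrink the window in a test to exercise pruning without waiting:
    with override_options({"standalone-spans.buffer-window.seconds": 1}):
        ...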

+ 4 - 2
src/sentry/spans/buffer/redis.py

@@ -3,10 +3,10 @@ from __future__ import annotations
 from django.conf import settings
 from sentry_redis_tools.clients import RedisCluster, StrictRedis
 
+from sentry import options
 from sentry.utils import json, redis
 
 SEGMENT_TTL = 5 * 60  # 5 min TTL in seconds
-TWO_MINUTES = 2 * 60  # 2 min delay in seconds
 
 
 def get_redis_client() -> RedisCluster | StrictRedis:
@@ -80,11 +80,13 @@ class RedisSpansBuffer:
         key = get_unprocessed_segments_key(partition)
         results = self.client.lrange(key, 0, -1) or []
 
+        buffer_window = options.get("standalone-spans.buffer-window.seconds")
+
         ltrim_index = 0
         segment_keys = []
         for result in results:
             segment_timestamp, segment_key = json.loads(result)
-            if now - segment_timestamp < TWO_MINUTES:
+            if now - segment_timestamp < buffer_window:
                 break
 
             ltrim_index += 1
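
The loop relies on the bucket being ordered by insertion time: scan from the oldest entry, stop at the first one still inside the window, and trim the consumed prefix. A standalone sketch of the pattern (the LTRIM at the end is assumed from the ltrim_index bookkeeping and the method's prune-bucket name):

    import time

    from sentry.utils import json

    def prune_expired(client, key: str, buffer_window: int) -> list[str]:
        now = int(time.time())
        results = client.lrange(key, 0, -1) or []

        ltrim_index = 0
        segment_keys = []
        for result in results:
            segment_timestamp, segment_key = json.loads(result)
            if now - segment_timestamp < buffer_window:
                # Entries are ordered by insertion time, so everything
                # after this one is still inside the window; stop scanning.
                break
            ltrim_index += 1
            segment_keys.append(segment_key)

        # Drop the consumed prefix so the bucket only holds in-window entries.
        client.ltrim(key, ltrim_index, -1)
        return segment_keys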

+ 22 - 2
src/sentry/spans/consumers/detect_performance_issues/factory.py

@@ -5,13 +5,13 @@ from typing import Any
 from arroyo.backends.kafka.consumer import KafkaPayload
 from arroyo.processing.strategies.abstract import ProcessingStrategy, ProcessingStrategyFactory
 from arroyo.processing.strategies.commit import CommitOffsets
-from arroyo.processing.strategies.run_task import RunTask
 from arroyo.types import BrokerValue, Commit, Message, Partition
 from sentry_kafka_schemas import get_codec
 from sentry_kafka_schemas.codecs import Codec, ValidationError
 from sentry_kafka_schemas.schema_types.buffered_segments_v1 import BufferedSegment
 
 from sentry.spans.consumers.detect_performance_issues.message import process_segment
+from sentry.utils.arroyo import MultiprocessingPool, RunTaskWithMultiprocessing
 
 BUFFERED_SEGMENT_SCHEMA: Codec[BufferedSegment] = get_codec("buffered-segments")
 
@@ -42,12 +42,32 @@ def _process_message(message: Message[KafkaPayload]):
 
 
 class DetectPerformanceIssuesStrategyFactory(ProcessingStrategyFactory[KafkaPayload]):
+    def __init__(
+        self,
+        max_batch_size: int,
+        max_batch_time: int,
+        num_processes: int,
+        input_block_size: int | None,
+        output_block_size: int | None,
+    ):
+        super().__init__()
+        self.max_batch_size = max_batch_size
+        self.max_batch_time = max_batch_time
+        self.input_block_size = input_block_size
+        self.output_block_size = output_block_size
+        self.pool = MultiprocessingPool(num_processes)
+
     def create_with_partitions(
         self,
         commit: Commit,
         partitions: Mapping[Partition, int],
     ) -> ProcessingStrategy[KafkaPayload]:
-        return RunTask(
+        return RunTaskWithMultiprocessing(
             function=_process_message,
             next_step=CommitOffsets(commit),
+            max_batch_size=self.max_batch_size,
+            max_batch_time=self.max_batch_time,
+            pool=self.pool,
+            input_block_size=self.input_block_size,
+            output_block_size=self.output_block_size,
         )
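
_process_message itself is unchanged by this hunk; for reference, a hedged sketch of what the pooled function does, reconstructed from the imports and the BUFFERED_SEGMENT_SCHEMA codec shown above (the exact committed body may differ):

    import logging

    from arroyo.backends.kafka.consumer import KafkaPayload
    from arroyo.types import Message
    from sentry_kafka_schemas.codecs import ValidationError

    logger = logging.getLogger(__name__)

    def _process_message_sketch(message: Message[KafkaPayload]) -> None:
        try:
            segment = BUFFERED_SEGMENT_SCHEMA.decode(message.payload.value)
        except ValidationError:
            logger.exception("Failed to decode segment payload")
            return
        # Hand the segment's spans to performance issue detection.
        process_segment(segment["spans"])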

+ 51 - 12
src/sentry/spans/consumers/process/factory.py

@@ -1,8 +1,10 @@
+import dataclasses
 import logging
 from collections.abc import Mapping
 from typing import Any
 
-from arroyo.backends.kafka.consumer import KafkaPayload
+from arroyo.backends.kafka.consumer import Headers, KafkaPayload
+from arroyo.processing.strategies import RunTask
 from arroyo.processing.strategies.abstract import ProcessingStrategy, ProcessingStrategyFactory
 from arroyo.processing.strategies.commit import CommitOffsets
 from arroyo.types import BrokerValue, Commit, Message, Partition
@@ -18,29 +20,50 @@ from sentry.utils.arroyo import MultiprocessingPool, RunTaskWithMultiprocessing
 logger = logging.getLogger(__name__)
 SPAN_SCHEMA: Codec[SpanEvent] = get_codec("snuba-spans")
 
-PROCESS_SEGMENT_DELAY = 2 * 60  # 2 minutes
 BATCH_SIZE = 100
 
 
+@dataclasses.dataclass
+class ProduceSegmentContext:
+    should_process_segments: bool
+    timestamp: int
+    partition: int
+
+
+def get_project_id(headers: Headers) -> int | None:
+    for k, v in headers:
+        if k == "project_id":
+            return int(v.decode("utf-8"))
+
+    return None
+
+
 def _deserialize_span(value: bytes) -> Mapping[str, Any]:
     return SPAN_SCHEMA.decode(value)
 
 
-def process_message(message: Message[KafkaPayload]):
+def process_message(message: Message[KafkaPayload]) -> ProduceSegmentContext | None:
     if not options.get("standalone-spans.process-spans-consumer.enable"):
-        return
+        return None
+
+    try:
+        project_id = get_project_id(message.payload.headers)
+    except Exception:
+        logger.exception("Failed to parse span message header")
+        return None
+
+    if project_id is None or project_id not in options.get(
+        "standalone-spans.process-spans-consumer.project-allowlist"
+    ):
+        return None
 
     assert isinstance(message.value, BrokerValue)
     try:
         span = _deserialize_span(message.payload.value)
         segment_id = span["segment_id"]
-        project_id = span["project_id"]
     except Exception:
         logger.exception("Failed to process span payload")
-        return
-
-    if project_id not in options.get("standalone-spans.process-spans-consumer.project-allowlist"):
-        return
+        return None
 
     timestamp = int(message.value.timestamp.timestamp())
     partition = message.value.partition.index
@@ -51,8 +74,22 @@ def process_message(message: Message[KafkaPayload]):
         project_id, segment_id, timestamp, partition, message.payload.value
     )
 
-    if should_process_segments:
-        keys = client.get_unprocessed_segments_and_prune_bucket(timestamp, partition)
+    return ProduceSegmentContext(
+        should_process_segments=should_process_segments, timestamp=timestamp, partition=partition
+    )
+
+
+def produce_segment(message: Message[ProduceSegmentContext | None]):
+    if message.payload is None:
+        return
+
+    context: ProduceSegmentContext = message.payload
+    if context.should_process_segments:
+        client = RedisSpansBuffer()
+
+        keys = client.get_unprocessed_segments_and_prune_bucket(
+            context.timestamp, context.partition
+        )
         # With pipelining, the Redis server is forced to queue replies in
         # memory, so we batch the keys we fetch.
         for i in range(0, len(keys), BATCH_SIZE):
@@ -83,9 +120,11 @@ class ProcessSpansStrategyFactory(ProcessingStrategyFactory[KafkaPayload]):
         commit: Commit,
         partitions: Mapping[Partition, int],
     ) -> ProcessingStrategy[KafkaPayload]:
+        next_step = RunTask(function=produce_segment, next_step=CommitOffsets(commit))
+
         return RunTaskWithMultiprocessing(
             function=process_message,
-            next_step=CommitOffsets(commit),
+            next_step=next_step,
             max_batch_size=self.max_batch_size,
             max_batch_time=self.max_batch_time,
             pool=self.pool,
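
The header fast-path can be sanity-checked in isolation: Relay now writes project_id as a Kafka header (linked PR above), so a span can be discarded before its payload is ever deserialized. A quick sketch against get_project_id:

    from arroyo.backends.kafka.consumer import KafkaPayload

    from sentry.spans.consumers.process.factory import get_project_id

    payload = KafkaPayload(b"key", b"<span payload>", [("project_id", b"1")])
    assert get_project_id(payload.headers) == 1

    # No header means no fast-path: the caller treats None as "discard".
    assert get_project_id([]) is None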

+ 7 - 1
tests/sentry/spans/consumers/detect_performance_issues/test_factory.py

@@ -26,7 +26,13 @@ def build_mock_message(data, topic=None):
 def test_segment_deserialized_correctly(mock_process_segment):
     topic = ArroyoTopic(get_topic_definition(Topic.BUFFERED_SEGMENTS)["real_topic_name"])
     partition = Partition(topic, 0)
-    strategy = DetectPerformanceIssuesStrategyFactory().create_with_partitions(
+    strategy = DetectPerformanceIssuesStrategyFactory(
+        num_processes=2,
+        input_block_size=1,
+        max_batch_size=1,
+        max_batch_time=1,
+        output_block_size=1,
+    ).create_with_partitions(
         commit=mock.Mock(),
         partitions={},
     )

+ 47 - 7
tests/sentry/spans/consumers/process/test_factory.py

@@ -11,6 +11,7 @@ from sentry.spans.buffer.redis import get_redis_client
 from sentry.spans.consumers.detect_performance_issues.factory import BUFFERED_SEGMENT_SCHEMA
 from sentry.spans.consumers.process.factory import ProcessSpansStrategyFactory
 from sentry.testutils.helpers.options import override_options
+from sentry.testutils.pytest.fixtures import django_db_all
 from sentry.utils import json
 from sentry.utils.arroyo_producer import SingletonProducer
 from sentry.utils.kafka_config import get_topic_definition
@@ -88,7 +89,13 @@ def test_consumer_pushes_to_redis():
     strategy.submit(
         Message(
             BrokerValue(
-                KafkaPayload(b"key", message1.value().encode("utf-8"), []),
+                KafkaPayload(
+                    b"key",
+                    message1.value().encode("utf-8"),
+                    [
+                        ("project_id", b"1"),
+                    ],
+                ),
                 partition,
                 1,
                 datetime.now(),
@@ -102,7 +109,13 @@ def test_consumer_pushes_to_redis():
     strategy.submit(
         Message(
             BrokerValue(
-                KafkaPayload(b"key", message2.value().encode("utf-8"), []),
+                KafkaPayload(
+                    b"key",
+                    message2.value().encode("utf-8"),
+                    [
+                        ("project_id", b"1"),
+                    ],
+                ),
                 partition,
                 1,
                 datetime.now(),
@@ -120,6 +133,7 @@ def test_consumer_pushes_to_redis():
     ]
 
 
+@django_db_all
 @override_options(
     {
         "standalone-spans.process-spans-consumer.enable": True,
@@ -142,7 +156,13 @@ def test_produces_valid_segment_to_kafka(mock_produce):
     strategy.submit(
         Message(
             BrokerValue(
-                KafkaPayload(b"key", message1.value().encode("utf-8"), []),
+                KafkaPayload(
+                    b"key",
+                    message1.value().encode("utf-8"),
+                    [
+                        ("project_id", b"1"),
+                    ],
+                ),
                 partition,
                 1,
                 datetime.now() - timedelta(minutes=3),
@@ -156,7 +176,13 @@ def test_produces_valid_segment_to_kafka(mock_produce):
     strategy.submit(
         Message(
             BrokerValue(
-                KafkaPayload(b"key", message2.value().encode("utf-8"), []),
+                KafkaPayload(
+                    b"key",
+                    message2.value().encode("utf-8"),
+                    [
+                        ("project_id", b"1"),
+                    ],
+                ),
                 partition,
                 1,
                 datetime.now(),
@@ -190,7 +216,13 @@ def test_option_disabled(mock_buffer):
     strategy.submit(
         Message(
             BrokerValue(
-                KafkaPayload(b"key", message.value().encode("utf-8"), []),
+                KafkaPayload(
+                    b"key",
+                    message.value().encode("utf-8"),
+                    [
+                        ("project_id", b"1"),
+                    ],
+                ),
                 partition,
                 1,
                 datetime.now(),
@@ -207,7 +239,9 @@ def test_option_disabled(mock_buffer):
 @override_options(
     {
         "standalone-spans.process-spans-consumer.enable": True,
-        "standalone-spans.process-spans-consumer.project-allowlist": [],
+        "standalone-spans.process-spans-consumer.project-allowlist": [
+            ("project_id", b"1"),
+        ],
     }
 )
 @mock.patch("sentry.spans.consumers.process.factory.RedisSpansBuffer")
@@ -225,7 +259,13 @@ def test_option_project_rollout(mock_buffer):
     strategy.submit(
         Message(
             BrokerValue(
-                KafkaPayload(b"key", message.value().encode("utf-8"), []),
+                KafkaPayload(
+                    b"key",
+                    message.value().encode("utf-8"),
+                    [
+                        ("project_id", b"1"),
+                    ],
+                ),
                 partition,
                 1,
                 datetime.now(),