2 лет назад · c86a9a318e
--- a/src/sentry/eventstream/kafka/consumer_strategy.py
+++ b/src/sentry/eventstream/kafka/consumer_strategy.py
@@ -1,32 +1,31 @@
 
				 import logging
			
 
				-import time
			
 
				-from collections import deque
			
 
				-from concurrent.futures import Future, ThreadPoolExecutor
			
 
				-from typing import Any, Deque, Mapping, Optional, Tuple
			
 
				+from typing import Any, Mapping, Optional
			
 
				 
			
 
				 from arroyo.backends.kafka.consumer import KafkaPayload
			
 
				-from arroyo.processing.strategies import ProcessingStrategy, ProcessingStrategyFactory
			
 
				-from arroyo.processing.strategies.abstract import MessageRejected
			
 
				+from arroyo.processing.strategies import (
			
 
				+    CommitOffsets,
			
 
				+    ProcessingStrategy,
			
 
				+    ProcessingStrategyFactory,
			
 
				+    RunTaskInThreads,
			
 
				+)
			
 
				 from arroyo.types import Commit, Message, Partition
			
 
				 
			
 
				 from sentry import options
			
 
				 from sentry.eventstream.base import GroupStates
			
 
				 from sentry.eventstream.kafka.postprocessworker import _record_metrics, _sampled_eventstream_timer
			
 
				-from sentry.eventstream.kafka.postprocessworker import (
			
 
				-    dispatch_post_process_group_task as _dispatch_post_process_group_task,
			
 
				-)
			
 
				 from sentry.eventstream.kafka.protocol import (
			
 
				     get_task_kwargs_for_message,
			
 
				     get_task_kwargs_for_message_from_headers,
			
 
				 )
			
 
				+from sentry.tasks.post_process import post_process_group
			
 
				 from sentry.utils import metrics
			
 
				+from sentry.utils.cache import cache_key_for_event
			
 
				 
			
 
				 _DURATION_METRIC = "eventstream.duration"
			
 
				 
			
 
				 logger = logging.getLogger(__name__)
			
 
				 
			
 
				 
			
 
				-# For testing. Function will eventually move here when postprocessworker is removed.
			
 
				 def dispatch_post_process_group_task(
			
 
				     event_id: str,
			
 
				     project_id: int,
			
@@ -40,19 +39,24 @@ def dispatch_post_process_group_task(
 
				     group_states: Optional[GroupStates] = None,
			
 
				     occurrence_id: Optional[str] = None,
			
 
				 ) -> None:
			
 
				-    _dispatch_post_process_group_task(
			
 
				-        event_id,
			
 
				-        project_id,
			
 
				-        group_id,
			
 
				-        is_new,
			
 
				-        is_regression,
			
 
				-        is_new_group_environment,
			
 
				-        primary_hash,
			
 
				-        queue,
			
 
				-        skip_consume,
			
 
				-        group_states,
			
 
				-        occurrence_id=occurrence_id,
			
 
				-    )
			
 
				+    if skip_consume:
			
 
				+        logger.info("post_process.skip.raw_event", extra={"event_id": event_id})
			
 
				+    else:
			
 
				+        cache_key = cache_key_for_event({"project": project_id, "event_id": event_id})
			
 
				+
			
 
				+        post_process_group.apply_async(
			
 
				+            kwargs={
			
 
				+                "is_new": is_new,
			
 
				+                "is_regression": is_regression,
			
 
				+                "is_new_group_environment": is_new_group_environment,
			
 
				+                "primary_hash": primary_hash,
			
 
				+                "cache_key": cache_key,
			
 
				+                "group_id": group_id,
			
 
				+                "group_states": group_states,
			
 
				+                "occurrence_id": occurrence_id,
			
 
				+            },
			
 
				+            queue=queue,
			
 
				+        )
			
 
				 
			
 
				 
			
 
				 def _get_task_kwargs(message: Message[KafkaPayload]) -> Optional[Mapping[str, Any]]:
			
@@ -81,64 +85,6 @@ def _get_task_kwargs_and_dispatch(message: Message[KafkaPayload]) -> None:
 
				     dispatch_post_process_group_task(**task_kwargs)
			
 
				 
			
 
				 
			
 
				-class DispatchTask(ProcessingStrategy[KafkaPayload]):
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        concurrency: int,
			
 
				-        max_pending_futures: int,
			
 
				-        commit: Commit,
			
 
				-    ) -> None:
			
 
				-        self.__executor = ThreadPoolExecutor(max_workers=concurrency)
			
 
				-        self.__futures: Deque[Tuple[Message[KafkaPayload], Future[None]]] = deque()
			
 
				-        self.__max_pending_futures = max_pending_futures
			
 
				-        self.__commit = commit
			
 
				-        self.__closed = False
			
 
				-
			
 
				-    def submit(self, message: Message[KafkaPayload]) -> None:
			
 
				-        assert not self.__closed
			
 
				-        # The list of pending futures is too long, tell the stream processor to slow down
			
 
				-        if len(self.__futures) > self.__max_pending_futures:
			
 
				-            raise MessageRejected
			
 
				-
			
 
				-        self.__futures.append(
			
 
				-            (message, self.__executor.submit(_get_task_kwargs_and_dispatch, message))
			
 
				-        )
			
 
				-
			
 
				-    def poll(self) -> None:
			
 
				-        # Remove completed futures in order
			
 
				-        while self.__futures and self.__futures[0][1].done():
			
 
				-            message, _ = self.__futures.popleft()
			
 
				-
			
 
				-            self.__commit(message.committable)
			
 
				-
			
 
				-    def join(self, timeout: Optional[float] = None) -> None:
			
 
				-        start = time.time()
			
 
				-
			
 
				-        # Commit all pending offsets
			
 
				-        self.__commit({}, force=True)
			
 
				-
			
 
				-        while self.__futures:
			
 
				-            remaining = timeout - (time.time() - start) if timeout is not None else None
			
 
				-            if remaining is not None and remaining <= 0:
			
 
				-                logger.warning(f"Timed out with {len(self.__futures)} futures in queue")
			
 
				-                break
			
 
				-
			
 
				-            message, future = self.__futures.popleft()
			
 
				-
			
 
				-            future.result(remaining)
			
 
				-
			
 
				-            self.__commit(message.committable, force=True)
			
 
				-
			
 
				-        self.__executor.shutdown()
			
 
				-
			
 
				-    def close(self) -> None:
			
 
				-        self.__closed = True
			
 
				-
			
 
				-    def terminate(self) -> None:
			
 
				-        self.__closed = True
			
 
				-        self.__executor.shutdown()
			
 
				-
			
 
				-
			
 
				 class PostProcessForwarderStrategyFactory(ProcessingStrategyFactory[KafkaPayload]):
			
 
				     def __init__(
			
 
				         self,
			
@@ -153,4 +99,9 @@ class PostProcessForwarderStrategyFactory(ProcessingStrategyFactory[KafkaPayload
 
				         commit: Commit,
			
 
				         partitions: Mapping[Partition, int],
			
 
				     ) -> ProcessingStrategy[KafkaPayload]:
			
 
				-        return DispatchTask(self.__concurrency, self.__max_pending_futures, commit)
			
 
				+        return RunTaskInThreads(
			
 
				+            _get_task_kwargs_and_dispatch,
			
 
				+            self.__concurrency,
			
 
				+            self.__max_pending_futures,
			
 
				+            CommitOffsets(commit),
			
 
				+        )
			
--- a/src/sentry/eventstream/kafka/postprocessworker.py
+++ b/src/sentry/eventstream/kafka/postprocessworker.py
@@ -10,14 +10,11 @@ from threading import Lock
 
				 from typing import Any, Generator, Mapping, MutableMapping, Optional, Tuple
			
 
				 
			
 
				 from sentry import options
			
 
				-from sentry.eventstream.base import GroupStates
			
 
				 from sentry.eventstream.kafka.protocol import (
			
 
				     get_task_kwargs_for_message,
			
 
				     get_task_kwargs_for_message_from_headers,
			
 
				 )
			
 
				-from sentry.tasks.post_process import post_process_group
			
 
				 from sentry.utils import metrics
			
 
				-from sentry.utils.cache import cache_key_for_event
			
 
				 
			
 
				 logger = logging.getLogger(__name__)
			
 
				 
			
@@ -87,45 +84,3 @@ def _record_metrics(partition: int, task_kwargs: Mapping[str, Any]) -> None:
 
				                 tags={"partition": partition, "type": event_type},
			
 
				                 sample_rate=1,
			
 
				             )
			
 
				-
			
 
				-
			
 
				-def dispatch_post_process_group_task(
			
 
				-    event_id: str,
			
 
				-    project_id: int,
			
 
				-    group_id: Optional[int],
			
 
				-    is_new: bool,
			
 
				-    is_regression: Optional[bool],
			
 
				-    is_new_group_environment: bool,
			
 
				-    primary_hash: Optional[str],
			
 
				-    queue: str,
			
 
				-    skip_consume: bool = False,
			
 
				-    group_states: Optional[GroupStates] = None,
			
 
				-    occurrence_id: Optional[str] = None,
			
 
				-) -> None:
			
 
				-    if skip_consume:
			
 
				-        logger.info("post_process.skip.raw_event", extra={"event_id": event_id})
			
 
				-    else:
			
 
				-        cache_key = cache_key_for_event({"project": project_id, "event_id": event_id})
			
 
				-
			
 
				-        post_process_group.apply_async(
			
 
				-            kwargs={
			
 
				-                "is_new": is_new,
			
 
				-                "is_regression": is_regression,
			
 
				-                "is_new_group_environment": is_new_group_environment,
			
 
				-                "primary_hash": primary_hash,
			
 
				-                "cache_key": cache_key,
			
 
				-                "group_id": group_id,
			
 
				-                "group_states": group_states,
			
 
				-                "occurrence_id": occurrence_id,
			
 
				-            },
			
 
				-            queue=queue,
			
 
				-        )
			
 
				-
			
 
				-
			
 
				-def _get_task_kwargs_and_dispatch(message: Message) -> None:
			
 
				-    task_kwargs = _get_task_kwargs(message)
			
 
				-    if not task_kwargs:
			
 
				-        return None
			
 
				-
			
 
				-    _record_metrics(message.partition(), task_kwargs)
			
 
				-    dispatch_post_process_group_task(**task_kwargs)