Browse Source

fix(post-process-group): retry retrieving events from eventstore up to 3 times (#50210)

Eventstore occasionally returns None for an event. It is not yet clear
whether it's a timing issue or whether eventstore actually doesn't have
the event. This PR retries retrieving the event with an exponential
backoff delay and returns early in post_process_group if event is None.

Also retries on the `ServiceUnavailable` exception raised when Bigtable
is temporarily unavailable.

Resolves SENTRY-ZEB, SENTRY-11G0
Gilbert Szeto 1 year ago
parent
commit
ccfb372121
1 changed files with 32 additions and 3 deletions
  1. 32 3
      src/sentry/tasks/post_process.py

+ 32 - 3
src/sentry/tasks/post_process.py

@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, List, Mapping, Optional, Sequence, Tuple, Type
 import sentry_sdk
 from django.conf import settings
 from django.utils import timezone
+from google.api_core.exceptions import ServiceUnavailable
 
 from sentry import features
 from sentry.exceptions import PluginError
@@ -20,6 +21,7 @@ from sentry.utils.cache import cache
 from sentry.utils.event_frames import get_sdk_name
 from sentry.utils.locking import UnableToAcquireLock
 from sentry.utils.locking.manager import LockManager
+from sentry.utils.retries import ConditionalRetryPolicy, exponential_delay
 from sentry.utils.safe import safe_execute
 from sentry.utils.sdk import bind_organization_context, set_current_event_project
 from sentry.utils.services import build_instance_from_options
@@ -388,6 +390,20 @@ def fetch_buffered_group_stats(group):
     group.times_seen_pending = result["times_seen"]
 
 
+MAX_FETCH_ATTEMPTS = 3
+
+
+def should_retry_fetch(attempt: int, e: Exception) -> bool:
+    from sentry.issues.occurrence_consumer import EventLookupError
+
+    return not attempt > MAX_FETCH_ATTEMPTS and (
+        isinstance(e, ServiceUnavailable) or isinstance(e, EventLookupError)
+    )
+
+
+fetch_retry_policy = ConditionalRetryPolicy(should_retry_fetch, exponential_delay(1.00))
+
+
 @instrumented_task(
     name="sentry.tasks.post_process.post_process_group",
     time_limit=120,
@@ -418,6 +434,7 @@ def post_process_group(
         from sentry.ingest.transaction_clusterer.datasource.redis import (
             record_transaction_name as record_transaction_name_for_clustering,
         )
+        from sentry.issues.occurrence_consumer import EventLookupError
         from sentry.models import Organization, Project
         from sentry.reprocessing2 import is_reprocessed_event
 
@@ -468,9 +485,21 @@ def post_process_group(
                 return
             # Issue platform events don't use `event_processing_store`. Fetch from eventstore
             # instead.
-            event = eventstore.get_event_by_id(
-                project_id, occurrence.event_id, group_id=group_id, skip_transaction_groupevent=True
-            )
+
+            def get_event_raise_exception() -> Event:
+                retrieved = eventstore.get_event_by_id(
+                    project_id,
+                    occurrence.event_id,
+                    group_id=group_id,
+                    skip_transaction_groupevent=True,
+                )
+                if retrieved is None:
+                    raise EventLookupError(
+                        f"failed to retrieve event(project_id={project_id}, event_id={occurrence.event_id}, group_id={group_id}) from eventstore"
+                    )
+                return retrieved
+
+            event = fetch_retry_policy(get_event_raise_exception)
 
         set_current_event_project(event.project_id)