Просмотр исходного кода

feat(ingest) Tracing for ingest consumers (#18479)

* feat(ingest) Tracing for ingest consumers

* feat(sdk): Allow to mark sdk scope as unsafe

Add function to mark current sdk scope as unsafe to prevent walking the
stack when deciding if current event is safe.

* fix: Avoid nested transactions in ingest consumers tracing
Michal Kuffa 4 лет назад
Родитель
Сommit
cda9d2a0a0

+ 3 - 0
src/sentry/conf/server.py

@@ -963,6 +963,9 @@ SENTRY_PROCESS_EVENT_APM_SAMPLING = 0
 # sample rate for the relay projectconfig endpoint
 SENTRY_RELAY_ENDPOINT_APM_SAMPLING = 0
 
+# sample rate for ingest consumer processing functions
+SENTRY_INGEST_CONSUMER_APM_SAMPLING = 0
+
 # ----
 # end APM config
 # ----

+ 48 - 7
src/sentry/ingest/ingest_consumer.py

@@ -1,5 +1,7 @@
 from __future__ import absolute_import
 
+import random
+import functools
 import atexit
 import logging
 import msgpack
@@ -8,14 +10,18 @@ from six import BytesIO
 import multiprocessing.dummy
 import multiprocessing as _multiprocessing
 
+from django.conf import settings
 from django.core.cache import cache
 
+import sentry_sdk
+
 from sentry import eventstore, features, options
 from sentry.cache import default_cache
 from sentry.models import Project, File, EventAttachment
 from sentry.signals import event_accepted
 from sentry.tasks.store import preprocess_event
 from sentry.utils import json, metrics
+from sentry.utils.sdk import mark_scope_as_unsafe
 from sentry.utils.dates import to_datetime
 from sentry.utils.cache import cache_key_for_event
 from sentry.utils.kafka import create_batching_kafka_consumer
@@ -41,6 +47,11 @@ class IngestConsumerWorker(AbstractBatchWorker):
         return message
 
     def flush_batch(self, batch):
+        mark_scope_as_unsafe()
+        with metrics.timer("ingest_consumer.flush_batch"):
+            return self._flush_batch(batch)
+
+    def _flush_batch(self, batch):
         attachment_chunks = []
         other_messages = []
         transactions = []
@@ -84,22 +95,39 @@ class IngestConsumerWorker(AbstractBatchWorker):
         if other_messages:
             with metrics.timer("ingest_consumer.process_other_messages_batch"):
                 for _ in self.pool.imap_unordered(
-                    lambda args: args[0](args[1], projects=projects), other_messages, chunksize=100
+                    lambda args: args[0](args[1], projects=projects), other_messages, chunksize=100,
                 ):
                     pass
 
         if transactions:
-            process_transactions_batch(transactions, projects)
+            with metrics.timer("ingest_consumer.process_transactions"):
+                process_transactions_batch(transactions, projects)
 
     def shutdown(self):
         pass
 
 
+def trace_func(**span_kwargs):
+    def wrapper(f):
+        @functools.wraps(f)
+        def inner(*args, **kwargs):
+            span_kwargs["sampled"] = random.random() < getattr(
+                settings, "SENTRY_INGEST_CONSUMER_APM_SAMPLING", 0
+            )
+            with sentry_sdk.start_span(**span_kwargs):
+                return f(*args, **kwargs)
+
+        return inner
+
+    return wrapper
+
+
+@trace_func(transaction="ingest_consumer.process_transactions_batch")
 @metrics.wraps("ingest_consumer.process_transactions_batch")
 def process_transactions_batch(messages, projects):
     if options.get("store.transactions-celery") is True:
         for message in messages:
-            process_event(message, projects)
+            _do_process_event(message, projects)
         return
 
     jobs = []
@@ -119,7 +147,7 @@ def process_transactions_batch(messages, projects):
 
 
 @metrics.wraps("ingest_consumer.process_event")
-def process_event(message, projects):
+def _do_process_event(message, projects):
     payload = message["payload"]
     start_time = float(message["start_time"])
     event_id = message["event_id"]
@@ -177,9 +205,14 @@ def process_event(message, projects):
     # Preprocess this event, which spawns either process_event or
     # save_event. Pass data explicitly to avoid fetching it again from the
     # cache.
-    preprocess_event(
-        cache_key=cache_key, data=data, start_time=start_time, event_id=event_id, project=project
-    )
+    with sentry_sdk.start_span(op="ingest_consumer.process_event.preprocess_event"):
+        preprocess_event(
+            cache_key=cache_key,
+            data=data,
+            start_time=start_time,
+            event_id=event_id,
+            project=project,
+        )
 
     # remember for an 1 hour that we saved this event (deduplication protection)
     cache.set(deduplication_key, "", CACHE_TIMEOUT)
@@ -188,6 +221,12 @@ def process_event(message, projects):
     event_accepted.send_robust(ip=remote_addr, data=data, project=project, sender=process_event)
 
 
+@trace_func(transaction="ingest_consumer.process_event")
+def process_event(message, projects):
+    return _do_process_event(message, projects)
+
+
+@trace_func(transaction="ingest_consumer.process_attachment_chunk")
 @metrics.wraps("ingest_consumer.process_attachment_chunk")
 def process_attachment_chunk(message, projects):
     payload = message["payload"]
@@ -201,6 +240,7 @@ def process_attachment_chunk(message, projects):
     )
 
 
+@trace_func(transaction="ingest_consumer.process_individual_attachment")
 @metrics.wraps("ingest_consumer.process_individual_attachment")
 def process_individual_attachment(message, projects):
     event_id = message["event_id"]
@@ -256,6 +296,7 @@ def process_individual_attachment(message, projects):
     attachment.delete()
 
 
+@trace_func(transaction="ingest_consumer.process_userreport")
 @metrics.wraps("ingest_consumer.process_userreport")
 def process_userreport(message, projects):
     project_id = int(message["project_id"])

+ 20 - 1
src/sentry/utils/sdk.py

@@ -24,6 +24,8 @@ UNSAFE_FILES = (
     "outcomes_consumer.py",
 )
 
+UNSAFE_TAG = "_unsafe"
+
 # Reexport sentry_sdk just in case we ever have to write another shim like we
 # did for raven
 from sentry_sdk import configure_scope, push_scope, capture_message, capture_exception  # NOQA
@@ -36,6 +38,11 @@ def is_current_event_safe():
     """
 
     with configure_scope() as scope:
+
+        # Scope was explicitly marked as unsafe
+        if scope._tags.get(UNSAFE_TAG):
+            return False
+
         project_id = scope._tags.get("project")
 
         if project_id and project_id == settings.SENTRY_PROJECT:
@@ -48,6 +55,17 @@ def is_current_event_safe():
     return True
 
 
+def mark_scope_as_unsafe():
+    """
+    Set the unsafe tag on the SDK scope for outgoing crashe and transactions.
+
+    Marking a scope explicitly as unsafe allows the recursion breaker to
+    decide early, before walking the stack and checking for unsafe files.
+    """
+    with configure_scope() as scope:
+        scope.set_tag(UNSAFE_TAG, True)
+
+
 def set_current_project(project_id):
     """
     Set the current project on the SDK scope for outgoing crash reports.
@@ -154,7 +172,8 @@ def configure_sdk():
                 relay_transport.capture_event(event)
             else:
                 metrics.incr("internal.uncaptured.events.relay", skip_internal=False)
-                sdk_logger.warn("internal-error.unsafe-stacktrace.relay")
+                if event.get("type") != "transaction":
+                    sdk_logger.warn("internal-error.unsafe-stacktrace.relay")
 
     sentry_sdk.init(
         transport=capture_event,

+ 17 - 12
tests/relay_integration/test_sdk.py

@@ -1,13 +1,14 @@
 from __future__ import absolute_import, print_function
+import uuid
 
 import pytest
 
-from sentry_sdk import Hub, last_event_id
+import sentry_sdk
+from sentry_sdk import Hub, push_scope
 
 from django.conf import settings
 from sentry.utils.sdk import configure_sdk, bind_organization_context
 from sentry.utils.compat import mock
-from sentry.app import raven
 
 from sentry import eventstore
 from sentry.testutils import assert_mock_called_once_with_partial
@@ -24,37 +25,39 @@ def post_event_with_sdk(settings, relay_server, wait_for_ingest_consumer):
     wait_for_ingest_consumer = wait_for_ingest_consumer(settings)
 
     def inner(*args, **kwargs):
-        event_id = raven.captureMessage(*args, **kwargs)
+        event_id = sentry_sdk.capture_event(*args, **kwargs)
         Hub.current.client.flush()
 
-        return wait_for_ingest_consumer(
-            lambda: eventstore.get_event_by_id(settings.SENTRY_PROJECT, event_id)
-        )
+        with push_scope():
+            return wait_for_ingest_consumer(
+                lambda: eventstore.get_event_by_id(settings.SENTRY_PROJECT, event_id)
+            )
 
     return inner
 
 
 @pytest.mark.django_db
 def test_simple(post_event_with_sdk):
-    event = post_event_with_sdk("internal client test")
+    event = post_event_with_sdk({"message": "internal client test"})
 
     assert event
     assert event.data["project"] == settings.SENTRY_PROJECT
-    assert event.data["event_id"] == last_event_id()
     assert event.data["logentry"]["formatted"] == "internal client test"
 
 
 @pytest.mark.django_db
-def test_recursion_breaker(post_event_with_sdk):
+def test_recursion_breaker(settings, post_event_with_sdk):
     # If this test terminates at all then we avoided recursion.
+    settings.SENTRY_INGEST_CONSUMER_APM_SAMPLING = 1.0
+    event_id = uuid.uuid4().hex
     with mock.patch(
         "sentry.event_manager.EventManager.save", side_effect=ValueError("oh no!")
     ) as save:
         with pytest.raises(ValueError):
-            post_event_with_sdk("internal client test")
+            post_event_with_sdk({"message": "internal client test", "event_id": event_id})
 
     assert_mock_called_once_with_partial(
-        save, settings.SENTRY_PROJECT, cache_key=u"e:{}:1".format(last_event_id())
+        save, settings.SENTRY_PROJECT, cache_key=u"e:{}:1".format(event_id)
     )
 
 
@@ -63,7 +66,9 @@ def test_encoding(post_event_with_sdk):
     class NotJSONSerializable:
         pass
 
-    event = post_event_with_sdk("check the req", extra={"request": NotJSONSerializable()})
+    with push_scope() as scope:
+        scope.set_extra("request", NotJSONSerializable())
+        event = post_event_with_sdk({"message": "check the req"})
 
     assert event.data["project"] == settings.SENTRY_PROJECT
     assert event.data["logentry"]["formatted"] == "check the req"