Browse Source

feat(uptime): Create issue occurrence when we get failures from the uptime checker (#72793)

This creates an issue occurrence for the `UptimeDomainCheckFailure`
issue platform type whenever the uptime checker reports a failure. It
also adds some test framework helpers to support future tests.
Dan Fuller 9 months ago
parent
commit
fdd453c292

+ 1 - 1
requirements-base.txt

@@ -62,7 +62,7 @@ rfc3339-validator>=0.1.2
 rfc3986-validator>=0.1.1
 # [end] jsonschema format validators
 sentry-arroyo>=2.16.5
-sentry-kafka-schemas>=0.1.92
+sentry-kafka-schemas>=0.1.93
 sentry-ophio==0.2.7
 sentry-redis-tools>=0.1.7
 sentry-relay>=0.8.67

+ 1 - 1
requirements-dev-frozen.txt

@@ -181,7 +181,7 @@ sentry-cli==2.32.0
 sentry-devenv==1.6.2
 sentry-forked-django-stubs==5.0.2.post4
 sentry-forked-djangorestframework-stubs==3.15.0.post1
-sentry-kafka-schemas==0.1.92
+sentry-kafka-schemas==0.1.93
 sentry-ophio==0.2.7
 sentry-redis-tools==0.1.7
 sentry-relay==0.8.67

+ 1 - 1
requirements-frozen.txt

@@ -121,7 +121,7 @@ rpds-py==0.15.2
 rsa==4.8
 s3transfer==0.10.0
 sentry-arroyo==2.16.5
-sentry-kafka-schemas==0.1.92
+sentry-kafka-schemas==0.1.93
 sentry-ophio==0.2.7
 sentry-redis-tools==0.1.7
 sentry-relay==0.8.67

+ 3 - 0
src/sentry/conf/server.py

@@ -3435,6 +3435,9 @@ SEER_GROUPING_RECORDS_DELETE_URL = (
     f"/{SEER_SIMILARITY_MODEL_VERSION}/issues/similar-issues/grouping-record/delete"
 )
 
+# TODO: Remove this soon. This is only a temporary way to configure which project uptime
+# issue occurrences are created in, until that is implemented properly.
+UPTIME_POC_PROJECT_ID = 1
+
 
 # Devserver configuration overrides.
 ngrok_host = os.environ.get("SENTRY_DEVSERVER_NGROK")

+ 22 - 0
src/sentry/testutils/cases.py

@@ -7,6 +7,7 @@ import os.path
 import random
 import re
 import time
+import uuid
 from collections.abc import Mapping, Sequence
 from contextlib import contextmanager
 from datetime import UTC, datetime, timedelta
@@ -41,6 +42,12 @@ from rest_framework import status
 from rest_framework.request import Request
 from rest_framework.response import Response
 from rest_framework.test import APITestCase as BaseAPITestCase
+from sentry_kafka_schemas.schema_types.uptime_results_v1 import (
+    CHECKSTATUS_FAILURE,
+    CHECKSTATUSREASONTYPE_TIMEOUT,
+    REQUESTTYPE_HEAD,
+    CheckResult,
+)
 from sentry_relay.consts import SPAN_STATUS_NAME_TO_CODE
 from snuba_sdk import Granularity, Limit, Offset
 from snuba_sdk.conditions import BooleanCondition, Condition, ConditionGroup
@@ -3195,6 +3202,21 @@ class MonitorIngestTestCase(MonitorTestCase):
         self.token = self.create_internal_integration_token(install=app, user=self.user)
 
 
+class UptimeTestCase(TestCase):
+    """Test case base class with helpers for building uptime check results."""
+
+    def create_uptime_result(self) -> CheckResult:
+        """
+        Build a failed check result (timeout reason, HEAD request, HTTP 500).
+
+        The guid, subscription id and trace id are fresh random hex UUIDs, and
+        both check times are stamped with the current time.
+        """
+        guid = uuid.uuid4().hex
+        subscription_id = uuid.uuid4().hex
+        trace_id = uuid.uuid4().hex
+        scheduled_at = datetime.now().timestamp()
+        checked_at = datetime.now().timestamp()
+
+        check_result: CheckResult = {
+            "guid": guid,
+            "subscription_id": subscription_id,
+            "status": CHECKSTATUS_FAILURE,
+            "status_reason": {
+                "type": CHECKSTATUSREASONTYPE_TIMEOUT,
+                "description": "it timed out",
+            },
+            "trace_id": trace_id,
+            "scheduled_check_time": scheduled_at,
+            "actual_check_time": checked_at,
+            "duration_ms": 100,
+            "request_info": {
+                "request_type": REQUESTTYPE_HEAD,
+                "http_status_code": 500,
+            },
+        }
+        return check_result
+
+
 class IntegratedApiTestCase(BaseTestCase):
     def should_call_api_without_proxying(self) -> bool:
         return not IntegrationProxyClient.determine_whether_should_proxy_to_control()

+ 4 - 1
src/sentry/uptime/consumers/results_consumer.py

@@ -9,9 +9,10 @@ from arroyo.processing.strategies.commit import CommitOffsets
 from arroyo.processing.strategies.run_task import RunTask
 from arroyo.types import BrokerValue, Commit, FilteredPayload, Message, Partition
 from sentry_kafka_schemas.codecs import Codec
-from sentry_kafka_schemas.schema_types.uptime_results_v1 import CheckResult
+from sentry_kafka_schemas.schema_types.uptime_results_v1 import CHECKSTATUS_FAILURE, CheckResult
 
 from sentry.conf.types.kafka_definition import Topic, get_topic_codec
+from sentry.uptime.issue_platform import create_issue_platform_occurrence
 
 logger = logging.getLogger(__name__)
 
@@ -24,6 +25,8 @@ def process_result(message: Message[KafkaPayload | FilteredPayload]):
 
     try:
         result: CheckResult = UPTIME_RESULTS_CODEC.decode(message.payload.value)
+        if result["status"] == CHECKSTATUS_FAILURE:
+            create_issue_platform_occurrence(result)
 
         # XXX(epurkhiser): This consumer literally does nothing except log right now
         logger.info("process_result", extra=result)

+ 90 - 0
src/sentry/uptime/issue_platform.py

@@ -0,0 +1,90 @@
+from __future__ import annotations
+
+import uuid
+from datetime import datetime, timezone
+
+from django.conf import settings
+from sentry_kafka_schemas.schema_types.uptime_results_v1 import CheckResult
+
+from sentry.issues.grouptype import UptimeDomainCheckFailure
+from sentry.issues.issue_occurrence import IssueEvidence, IssueOccurrence
+from sentry.issues.producer import PayloadType, produce_occurrence_to_kafka
+
+
+def create_issue_platform_occurrence(result: CheckResult) -> None:
+    """
+    Turn an uptime check result into an issue occurrence and produce it.
+
+    The occurrence and its accompanying event data are both built from
+    `result` and handed to `produce_occurrence_to_kafka`.
+    """
+    issue_occurrence = build_occurrence_from_result(result)
+    event_payload = build_event_data_for_occurrence(result, issue_occurrence)
+    produce_occurrence_to_kafka(
+        payload_type=PayloadType.OCCURRENCE,
+        occurrence=issue_occurrence,
+        event_data=event_payload,
+    )
+
+
+def build_occurrence_from_result(result: CheckResult) -> IssueOccurrence:
+    """
+    Build the `UptimeDomainCheckFailure` issue occurrence for a check result.
+
+    The evidence always contains the failure reason (marked important) and the
+    check duration. The request method and HTTP status code are added when the
+    result carries request info.
+
+    Raises:
+        ValueError: if the result has no status reason.
+    """
+    status_reason = result["status_reason"]
+    if not status_reason:
+        # Raise explicitly instead of using `assert`: asserts are stripped when
+        # Python runs with -O, which would defer the failure to the lookups below.
+        raise ValueError("Uptime check result is missing a status_reason")
+
+    failure_reason = f'{status_reason["type"]} - {status_reason["description"]}'
+    evidence_display: list[IssueEvidence] = [
+        IssueEvidence(
+            name="Failure reason",
+            value=failure_reason,
+            important=True,
+        ),
+        IssueEvidence(
+            name="Duration",
+            value=str(result["duration_ms"]),
+            important=False,
+        ),
+    ]
+
+    request_info = result["request_info"]
+    if request_info:
+        evidence_display.extend(
+            [
+                IssueEvidence(
+                    name="Method",
+                    value=request_info["request_type"],
+                    important=False,
+                ),
+                IssueEvidence(
+                    name="Status Code",
+                    value=str(request_info["http_status_code"]),
+                    important=False,
+                ),
+            ]
+        )
+
+    return IssueOccurrence(
+        id=uuid.uuid4().hex,
+        resource_id=None,
+        project_id=settings.UPTIME_POC_PROJECT_ID,  # TODO: Get this from the subscription or subscription like thing
+        event_id=uuid.uuid4().hex,
+        fingerprint=[
+            result["subscription_id"]
+        ],  # TODO: Should be the specific monitor id related to the subscription
+        type=UptimeDomainCheckFailure,
+        issue_title="Uptime Check Failed for https://sentry.io",  # TODO: Get this from the uptime check details
+        subtitle="Your monitored domain is down",
+        evidence_display=evidence_display,
+        evidence_data={},
+        culprit="",  # TODO: The url?
+        detection_time=datetime.now(timezone.utc),
+        level="error",
+    )
+
+
+def build_event_data_for_occurrence(result: CheckResult, occurrence: IssueOccurrence):
+    """
+    Build the event data dict that is produced alongside `occurrence`.
+
+    The event id, fingerprint, project id and timestamp are copied from the
+    occurrence; the received time, subscription tag and trace id come from
+    the check result.
+    """
+    # We set "received" to the time that the check was performed. The epoch
+    # value is converted to a timezone-aware UTC datetime so that it does not
+    # depend on the server's local timezone and matches the UTC-aware
+    # detection time used for "timestamp".
+    check_performed_at = datetime.fromtimestamp(result["actual_check_time"], tz=timezone.utc)
+    return {
+        "environment": "prod",  # TODO: Include the environment here when we have it
+        "event_id": occurrence.event_id,
+        "fingerprint": occurrence.fingerprint,
+        "platform": "other",
+        "project_id": occurrence.project_id,
+        "received": check_performed_at,
+        "sdk": None,
+        "tags": {
+            "subscription_id": result["subscription_id"],
+        },
+        "timestamp": occurrence.detection_time.isoformat(),
+        "contexts": {"trace": {"trace_id": result["trace_id"], "span_id": None}},
+    }

+ 0 - 0
tests/sentry/uptime/__init__.py


+ 0 - 0
tests/sentry/uptime/consumers/__init__.py


+ 37 - 0
tests/sentry/uptime/consumers/test_results_consumers.py

@@ -0,0 +1,37 @@
+from datetime import datetime
+from hashlib import md5
+
+from arroyo import Message, Topic
+from arroyo.backends.kafka import KafkaPayload
+from arroyo.types import BrokerValue, Partition
+from django.test import override_settings
+
+from sentry.issues.grouptype import UptimeDomainCheckFailure
+from sentry.models.group import Group
+from sentry.testutils.cases import UptimeTestCase
+from sentry.uptime.consumers.results_consumer import UPTIME_RESULTS_CODEC, process_result
+
+
+class ProcessResultTest(UptimeTestCase):
+    """Tests for `process_result`."""
+
+    def test(self):
+        """Processing a failed check result creates an uptime failure group."""
+        result = self.create_uptime_result()
+
+        # Wrap the encoded result in a Kafka message for the consumer.
+        payload = KafkaPayload(None, UPTIME_RESULTS_CODEC.encode(result), [])
+        partition = Partition(Topic("test"), 1)
+        message = Message(BrokerValue(payload, partition, 1, datetime.now()))
+
+        project = self.project
+        # TODO: Remove this once we have a subscription
+        with override_settings(UPTIME_POC_PROJECT_ID=project.id), self.feature(
+            UptimeDomainCheckFailure.build_ingest_feature_name()
+        ):
+            process_result(message)
+
+        # The group is looked up by the md5 of the subscription id fingerprint.
+        fingerprint_bytes = result["subscription_id"].encode("utf-8")
+        expected_hash = md5(fingerprint_bytes).hexdigest()
+
+        group = Group.objects.get(grouphash__hash=expected_hash)
+        assert group.issue_type == UptimeDomainCheckFailure

Some files were not shown because too many files changed in this diff