
feat(hc): Publish UserIP logs to kafka from REGION silos (#38988)

Enable a feature-flag-backed ability to publish UserIP entries through a Kafka producer to the control silo, which consumes and persists them.
Zach Collins, 2 years ago
commit f94d5af03e
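
At a high level, this change routes UserIP.log through a producer that either publishes to the new region-to-control Kafka topic or falls back to a direct database write. A condensed, non-verbatim sketch of the producer-side path introduced below (the user id and ip address are example values):

    from django.utils import timezone

    from sentry.region_to_control.messages import UserIpEvent
    from sentry.region_to_control.producer import produce_user_ip

    # Example values only; in the real path UserIP.log builds this from the request user.
    event = UserIpEvent(user_id=1, ip_address="203.0.113.5", last_seen=timezone.now())

    # Publishes to KAFKA_REGION_TO_CONTROL when running as a REGION silo (or in MONOLITH
    # mode with the hc.region-to-control.monolith-publish option enabled); otherwise it
    # writes the UserIP row directly via create_or_update.
    produce_user_ip(event)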

+ 27 - 0
.github/workflows/backend.yml

@@ -322,6 +322,32 @@ jobs:
         run: |
           make test-plugins
 
+  region-to-control:
+    if: needs.files-changed.outputs.backend == 'true'
+    needs: files-changed
+    name: region-to-control test
+    runs-on: ubuntu-20.04
+    timeout-minutes: 20
+    steps:
+      - uses: actions/checkout@7884fcad6b5d53d10323aee724dc68d8b9096a2e  # v2
+        with:
+          # Avoid codecov error message related to SHA resolution:
+          # https://github.com/codecov/codecov-bash/blob/7100762afbc822b91806a6574658129fe0d23a7d/codecov#L891
+          fetch-depth: '2'
+
+      - name: Setup sentry env
+        uses: ./.github/actions/setup-sentry
+        id: setup
+        with:
+          kafka: true
+
+      - name: Run test
+        run: |
+          make test-region-to-control-integration
+
+      - name: Handle artifacts
+        uses: ./.github/actions/artifacts
+
   relay:
     if: needs.files-changed.outputs.backend == 'true'
     needs: files-changed
@@ -470,6 +496,7 @@ jobs:
         migration,
         plugins,
         relay,
+        region-to-control,
         snuba,
         symbolicator,
         typing,

+ 8 - 0
Makefile

@@ -130,6 +130,7 @@ test-python-ci:
 		--ignore tests/sentry/snuba \
 		--ignore tests/sentry/search/events \
 		--ignore tests/sentry/ingest/ingest_consumer/test_ingest_consumer_kafka.py \
+		--ignore tests/sentry/region_to_control/test_region_to_control_kafka.py \
 		--cov . --cov-report="xml:.artifacts/python.coverage.xml"
 	@echo ""
 
@@ -172,6 +173,13 @@ test-plugins:
 	pytest tests/sentry_plugins -vv --cov . --cov-report="xml:.artifacts/plugins.coverage.xml"
 	@echo ""
 
+test-region-to-control-integration:
+	@echo "--> Running Region to Control consumer integration tests"
+	pytest \
+		tests/sentry/region_to_control/test_region_to_control_kafka.py \
+		-vv --cov . --cov-report="xml:.artifacts/region-to-control.coverage.xml"
+	@echo ""
+
 test-relay-integration:
 	@echo "--> Running Relay integration tests"
 	pytest \

+ 3 - 0
src/sentry/conf/server.py

@@ -2457,6 +2457,7 @@ KAFKA_PROFILES = "profiles"
 KAFKA_INGEST_PERFORMANCE_METRICS = "ingest-performance-metrics"
 KAFKA_SNUBA_GENERIC_METRICS = "snuba-generic-metrics"
 KAFKA_INGEST_REPLAYS_RECORDINGS = "ingest-replay-recordings"
+KAFKA_REGION_TO_CONTROL = "region-to-control"
 
 # topic for testing multiple indexer backends in parallel
 # in production. So far just testing backends for the perf data,
@@ -2504,6 +2505,8 @@ KAFKA_TOPICS = {
     KAFKA_INGEST_REPLAYS_RECORDINGS: {"cluster": "default"},
     # Metrics Testing Topics
     KAFKA_SNUBA_GENERICS_METRICS_CS: {"cluster": "default"},
+    # Region to Control Silo messaging - e.g. UserIp and AuditLog
+    KAFKA_REGION_TO_CONTROL: {"cluster": "default"},
 }
 
 

+ 24 - 12
src/sentry/models/userip.py

@@ -4,6 +4,8 @@ from django.db import models
 from django.utils import timezone
 
 from sentry.db.models import FlexibleForeignKey, Model, all_silo_model, sane_repr
+from sentry.models import User
+from sentry.region_to_control.messages import UserIpEvent
 from sentry.utils.geo import geo_by_addr
 
 
@@ -26,21 +28,31 @@ class UserIP(Model):
     __repr__ = sane_repr("user_id", "ip_address")
 
     @classmethod
-    def log(cls, user, ip_address):
+    def log(cls, user: User, ip_address: str):
         # Only log once every 5 minutes for the same user/ip_address pair
         # since this is hit pretty frequently by all API calls in the UI, etc.
         cache_key = f"userip.log:{user.id}:{ip_address}"
-        if cache.get(cache_key):
-            return
+        if not cache.get(cache_key):
+            _perform_log(user, ip_address)
+            cache.set(cache_key, 1, 300)
 
-        try:
-            geo = geo_by_addr(ip_address)
-        except Exception:
-            geo = None
 
-        values = {"last_seen": timezone.now()}
-        if geo:
-            values.update({"country_code": geo["country_code"], "region_code": geo["region"]})
+def _perform_log(user: User, ip_address: str):
+    from sentry.region_to_control.producer import produce_user_ip
 
-        UserIP.objects.create_or_update(user=user, ip_address=ip_address, values=values)
-        cache.set(cache_key, 1, 300)
+    try:
+        geo = geo_by_addr(ip_address)
+    except Exception:
+        geo = None
+
+    event = UserIpEvent(
+        user_id=user.id,
+        ip_address=ip_address,
+        last_seen=timezone.now(),
+    )
+
+    if geo:
+        event.country_code = geo["country_code"]
+        event.region_code = geo["region"]
+
+    produce_user_ip(event)

+ 4 - 0
src/sentry/options/defaults.py

@@ -533,3 +533,7 @@ register("performance.issues.n_plus_one_db.problem-creation", default=0.0)
 # System-wide options for default performance detection settings for any org opted into the performance-issues-ingest feature. Meant for rollout.
 register("performance.issues.n_plus_one_db.count_threshold", default=5)
 register("performance.issues.n_plus_one_db.duration_threshold", default=100.0)
+
+# Region-to-control silo multi-region mode controls.
+# It is safe to switch this on or off freely; it exists purely for testing features ahead of the hybrid cloud rollout.
+register("hc.region-to-control.monolith-publish", default=False)

+ 0 - 0
src/sentry/region_to_control/__init__.py


+ 113 - 0
src/sentry/region_to_control/consumer.py

@@ -0,0 +1,113 @@
+import signal
+from typing import Callable, Generic, Mapping, Optional, Sequence, TypeVar
+
+from arroyo import Partition, Topic
+from arroyo.backends.kafka import KafkaConsumer, KafkaPayload
+from arroyo.processing import StreamProcessor
+from arroyo.processing.strategies import ProcessingStrategy, ProcessingStrategyFactory
+from arroyo.processing.strategies.batching import AbstractBatchWorker, BatchProcessingStrategy
+from arroyo.types import Message, Position
+from django.conf import settings
+
+from sentry.models import UserIP
+from sentry.utils import json, metrics
+
+from ..utils.kafka_config import get_kafka_consumer_cluster_options
+from .messages import RegionToControlMessage
+
+
+def get_region_to_control_consumer(
+    group_id: Optional[str] = None,
+    auto_offset_reset="earliest",
+    max_batch_size=100,
+    max_batch_time=1000,
+    **opts,
+) -> StreamProcessor[KafkaPayload]:
+    cluster_name = settings.KAFKA_TOPICS[settings.KAFKA_REGION_TO_CONTROL]["cluster"]
+    consumer = KafkaConsumer(
+        get_kafka_consumer_cluster_options(
+            cluster_name,
+            override_params={
+                "auto.offset.reset": auto_offset_reset,
+                "enable.auto.commit": "false",
+                "enable.auto.offset.store": "false",
+                "group.id": group_id,
+            },
+        )
+    )
+
+    processor = StreamProcessor(
+        consumer=consumer,
+        topic=Topic(settings.KAFKA_REGION_TO_CONTROL),
+        processor_factory=ProcessorFactory(
+            lambda commit: BatchProcessingStrategy(
+                commit,
+                worker=RegionToControlConsumerWorker(),
+                max_batch_size=max_batch_size,
+                max_batch_time=max_batch_time,
+            )
+        ),
+    )
+
+    def handler(*args) -> None:
+        processor.signal_shutdown()
+
+    signal.signal(signal.SIGINT, handler)
+    signal.signal(signal.SIGTERM, handler)
+
+    return processor
+
+
+class RegionToControlConsumerWorker(AbstractBatchWorker[KafkaPayload, RegionToControlMessage]):
+    def process_message(self, message: Message[KafkaPayload]) -> RegionToControlMessage:
+        raw = json.loads(message.payload.value.decode("utf8"))
+        return RegionToControlMessage.from_payload(raw)
+
+    def flush_batch(self, batch: Sequence[RegionToControlMessage]):
+        with metrics.timer("region_to_control.consumer.flush_batch"):
+            return self._flush_batch(batch)
+
+    def _flush_batch(self, batch: Sequence[RegionToControlMessage]):
+        for row in batch:
+            if row.user_ip_event:
+                updated, created = UserIP.objects.create_or_update(
+                    user_id=row.user_ip_event.user_id,
+                    ip_address=row.user_ip_event.ip_address,
+                    values=dict(
+                        last_seen=row.user_ip_event.last_seen,
+                        country_code=row.user_ip_event.country_code,
+                        region_code=row.user_ip_event.region_code,
+                    ),
+                )
+
+                if created:
+                    metrics.incr("region_to_control.consumer.user_ip_event.created")
+                elif updated:
+                    metrics.incr("region_to_control.consumer.user_ip_event.updated", amount=updated)
+                else:
+                    # This happens when there is an integrity error adding the UserIP -- such as when the user is deleted,
+                    # or the ip address fails db validation.  This is expected and not an error condition
+                    # in low quantities.
+                    metrics.incr("region_to_control.consumer.user_ip_event.stale_event")
+
+
+ProcessorT = TypeVar("ProcessorT", bound=ProcessingStrategy[KafkaPayload])
+Commit = Callable[[Mapping[Partition, Position]], None]
+
+
+class ProcessorFactory(ProcessingStrategyFactory[KafkaPayload], Generic[ProcessorT]):
+    """
+    Generic processor factory that defers to a callable.
+    """
+
+    constructor: Callable[[Commit], ProcessorT]
+
+    def __init__(self, constructor: Callable[[Commit], ProcessorT]):
+        self.constructor = constructor
+
+    def create_with_partitions(
+        self,
+        commit: Commit,
+        partitions: Mapping[Partition, int],
+    ) -> ProcessingStrategy[KafkaPayload]:
+        return self.constructor(commit)
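
The devserver starts this consumer via the "region_to_control" daemon registered in devserver.py below; a rough sketch of running it programmatically, assuming arroyo's blocking StreamProcessor.run() loop (the group id is an example value):

    from sentry.region_to_control.consumer import get_region_to_control_consumer

    processor = get_region_to_control_consumer(group_id="region-to-control")
    processor.run()  # blocks until the SIGINT/SIGTERM handlers above call signal_shutdown()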

+ 60 - 0
src/sentry/region_to_control/messages.py

@@ -0,0 +1,60 @@
+import dataclasses
+import datetime
+from typing import Optional, Union
+
+from sentry.utils import json
+
+
+@dataclasses.dataclass
+class UserIpEvent:
+    user_id: int
+    ip_address: str
+    last_seen: datetime.datetime
+    country_code: Optional[str] = None
+    region_code: Optional[str] = None
+
+
+@dataclasses.dataclass
+class NormalizedUserIpEvent(UserIpEvent):
+    user_id: int = -1
+    ip_address: str = "127.0.0.1"
+    last_seen: datetime.datetime = datetime.datetime(2000, 1, 1)
+
+
+def discard_extra_fields(Dc, payload):
+    message_field: dataclasses.Field
+    kwds = {}
+    for message_field in dataclasses.fields(Dc):
+        if message_field.name in payload:
+            kwds[message_field.name] = payload[message_field.name]
+
+    return Dc(**kwds)
+
+
+@dataclasses.dataclass
+class RegionToControlMessage:
+    user_ip_event: Optional[UserIpEvent] = dataclasses.field(
+        default=None, metadata=dict(constructor=NormalizedUserIpEvent)
+    )
+
+    @staticmethod
+    def from_payload(payload: Union[str, bytes, dict]):
+        # Perform any upgrading that may be necessary for migrations, since the region silo could in theory send messages
+        # with schemas older than the current one.  test_region_to_control_consumer has historical payload
+        # regression tests, which you should add to as well.
+        if not isinstance(payload, dict):
+            payload = json.loads(payload)
+
+        message_field: dataclasses.Field
+        result = RegionToControlMessage()
+        for message_field in dataclasses.fields(RegionToControlMessage):
+            if message_field.name in payload:
+                setattr(
+                    result,
+                    message_field.name,
+                    discard_extra_fields(
+                        message_field.metadata["constructor"], payload[message_field.name]
+                    ),
+                )
+
+        return result
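
Because from_payload tolerates unknown keys (via discard_extra_fields) and NormalizedUserIpEvent supplies defaults for missing ones, older and newer schemas can interoperate. A minimal sketch of the round trip; the "new_field" key is hypothetical and only illustrates that unrecognized fields are dropped rather than raising a TypeError:

    from sentry.region_to_control.messages import RegionToControlMessage

    payload = {
        "user_ip_event": {
            "user_id": 1,
            "ip_address": "10.0.0.1",
            "last_seen": "2022-09-19T00:00:00",
            "new_field": "ignored",  # hypothetical extra key, discarded by discard_extra_fields
        }
    }

    message = RegionToControlMessage.from_payload(payload)
    assert message.user_ip_event.user_id == 1
    assert not hasattr(message.user_ip_event, "new_field")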

+ 51 - 0
src/sentry/region_to_control/producer.py

@@ -0,0 +1,51 @@
+import dataclasses
+from typing import Optional
+
+from django.conf import settings
+
+from sentry.models import UserIP
+from sentry.region_to_control.messages import RegionToControlMessage, UserIpEvent
+from sentry.silo import SiloMode
+from sentry.utils import json, kafka_config
+from sentry.utils.pubsub import KafkaPublisher
+
+
+def produce_user_ip(event: UserIpEvent):
+    if _should_produce_to_kafka():
+        get_user_ip_publisher().publish(
+            settings.KAFKA_REGION_TO_CONTROL,
+            json.dumps(dataclasses.asdict(RegionToControlMessage(user_ip_event=event))),
+        )
+    else:
+        UserIP.objects.create_or_update(
+            user_id=event.user_id, ip_address=event.ip_address, values=dataclasses.asdict(event)
+        )
+
+
+_user_ip_publisher: Optional[KafkaPublisher] = None
+
+
+def get_user_ip_publisher() -> KafkaPublisher:
+    """
+    Locks in the configuration for a KafkaPublisher on first usage.  KafkaPublishers are thread safe in practice:
+    the underlying librdkafka client, which backs the Python client, processes messages in its own thread + queue
+    at that level.
+    """
+    global _user_ip_publisher
+    if _user_ip_publisher is None:
+        config = settings.KAFKA_TOPICS.get(settings.KAFKA_REGION_TO_CONTROL)
+        _user_ip_publisher = KafkaPublisher(
+            kafka_config.get_kafka_producer_cluster_options(config["cluster"])
+        )
+    return _user_ip_publisher
+
+
+def _should_produce_to_kafka():
+    from sentry import options
+
+    mode = SiloMode.get_current_mode()
+    is_region = mode == SiloMode.REGION
+    is_mono_with_producer = mode == SiloMode.MONOLITH and options.get(
+        "hc.region-to-control.monolith-publish"
+    )
+    return is_region or is_mono_with_producer

+ 1 - 4
src/sentry/runner/commands/devserver.py

@@ -24,6 +24,7 @@ _DEFAULT_DAEMONS = {
         "--commit-batch-timeout-ms=1000",
     ],
     "ingest": ["sentry", "run", "ingest-consumer", "--all-consumer-types"],
+    "region_to_control": ["sentry", "run", "region-to-control-consumer", "--region-name", "_local"],
     "server": ["sentry", "run", "web"],
     "storybook": ["yarn", "storybook"],
     "subscription-consumer": [
@@ -119,7 +120,6 @@ def devserver(
     bind: str | None,
 ) -> NoReturn:
     "Starts a lightweight web server for development."
-
     if ingest:
         # Ingest requires kafka+zookeeper to be running.
         # They're too heavyweight to startup on-demand with devserver.
@@ -235,9 +235,6 @@ and run `sentry devservices up kafka zookeeper`.
             }
         )
 
-    if ingest:
-        settings.SENTRY_USE_RELAY = True
-
     os.environ["SENTRY_USE_RELAY"] = "1" if settings.SENTRY_USE_RELAY else ""
 
     if workers:

Some files were not shown because too many files changed in this diff