3 months ago · 46db7b398c
--- a/migrations_lockfile.txt
+++ b/migrations_lockfile.txt
@@ -10,7 +10,7 @@ hybridcloud: 0016_add_control_cacheversion
 
				 nodestore: 0002_nodestore_no_dictfield
			
 
				 remote_subscriptions: 0003_drop_remote_subscription
			
 
				 replays: 0004_index_together
			
 
				-sentry: 0790_delete_dashboard_perms_col
			
 
				+sentry: 0791_add_hashing_metadata_to_grouphash_metadata
			
 
				 social_auth: 0002_default_auto_field
			
 
				 uptime: 0018_add_trace_sampling_field_to_uptime
			
 
				 workflow_engine: 0012_data_source_type_change
			
--- a/src/sentry/migrations/0791_add_hashing_metadata_to_grouphash_metadata.py
+++ b/src/sentry/migrations/0791_add_hashing_metadata_to_grouphash_metadata.py
@@ -0,0 +1,34 @@
 
				+# Generated by Django 5.1.1 on 2024-11-14 22:09
			
 
				+
			
 
				+from django.db import migrations
			
 
				+
			
 
				+import sentry.db.models.fields.jsonfield
			
 
				+from sentry.new_migrations.migrations import CheckedMigration
			
 
				+
			
 
				+
			
 
				+class Migration(CheckedMigration):
			
 
				+    # This flag is used to mark that a migration shouldn't be automatically run in production.
			
 
				+    # This should only be used for operations where it's safe to run the migration after your
			
 
				+    # code has deployed. So this should not be used for most operations that alter the schema
			
 
				+    # of a table.
			
 
				+    # Here are some things that make sense to mark as post deployment:
			
 
				+    # - Large data migrations. Typically we want these to be run manually so that they can be
			
 
				+    #   monitored and not block the deploy for a long period of time while they run.
			
 
				+    # - Adding indexes to large tables. Since this can take a long time, we'd generally prefer to
			
 
				+    #   run this outside deployments so that we don't block them. Note that while adding an index
			
 
				+    #   is a schema change, it's completely safe to run the operation after the code has deployed.
			
 
				+    # Once deployed, run these manually via: https://develop.sentry.dev/database-migrations/#migration-deployment
			
 
				+
			
 
				+    is_post_deployment = False
			
 
				+
			
 
				+    dependencies = [
			
 
				+        ("sentry", "0790_delete_dashboard_perms_col"),
			
 
				+    ]
			
 
				+
			
 
				+    operations = [
			
 
				+        migrations.AddField(
			
 
				+            model_name="grouphashmetadata",
			
 
				+            name="hashing_metadata",
			
 
				+            field=sentry.db.models.fields.jsonfield.JSONField(null=True),
			
 
				+        ),
			
 
				+    ]
			
--- a/src/sentry/models/grouphashmetadata.py
+++ b/src/sentry/models/grouphashmetadata.py
@@ -5,6 +5,8 @@ from sentry.backup.scopes import RelocationScope
 
				 from sentry.db.models import Model, region_silo_model
			
 
				 from sentry.db.models.base import sane_repr
			
 
				 from sentry.db.models.fields.foreignkey import FlexibleForeignKey
			
 
				+from sentry.db.models.fields.jsonfield import JSONField
			
 
				+from sentry.types.grouphash_metadata import HashingMetadata
			
 
				 
			
 
				 
			
 
				 # The overall grouping method used
			
@@ -56,6 +58,12 @@ class GroupHashMetadata(Model):
 
				     latest_grouping_config = models.CharField(null=True)
			
 
				     # The primary grouping method (message, stacktrace, fingerprint, etc.)
			
 
				     hash_basis = models.CharField(choices=HashBasis, null=True)
			
 
				+    # Metadata about the inputs to the hashing process and the hashing process itself (what
			
 
				+    # fingerprinting rules were matched? did we parameterize the message? etc.). For the specific
			
 
				+    # data stored, see the class definitions of the `HashingMetadata` subtypes.
			
 
				+    hashing_metadata: models.Field[HashingMetadata | None, HashingMetadata | None] = JSONField(
			
 
				+        null=True
			
 
				+    )
			
 
				 
			
 
				     # SEER
			
 
				 
			
--- a/src/sentry/types/grouphash_metadata.py
+++ b/src/sentry/types/grouphash_metadata.py
@@ -0,0 +1,159 @@
 
				+from __future__ import annotations
			
 
				+
			
 
				+from typing import NotRequired, TypedDict
			
 
				+
			
 
				+# NOTE: The structure in these metadata types is intentionally flat, to make it easier to query in
			
 
				+# Redash or BigQuery, and they are all merged into a single flat JSON blob (which is then stored in
			
 
				+# `GroupHashMetadata.hashing_metadata`). Therefore, if entries are added, they should be namespaced
			
 
				+# according to their corresponding hash basis (so, for example, `fingerprint_source` and
			
 
				+# `message_source`, rather than just `source`), both for clarity and to avoid collisions.
			
 
				+
			
 
				+
			
 
				+class FingerprintHashingMetadata(TypedDict):
			
 
				+    """
			
 
				+    Fingerprint data, gathered both during stand-alone custom/built-in fingerprinting and hybrid
			
 
				+    fingerprinting involving message, stacktrace, security, or template hashing
			
 
				+    """
			
 
				+
			
 
				+    # The fingerprint value
			
 
				+    fingerprint: str
			
 
				+    # Either "client", "server_builtin_rule", or "server_custom_rule". (We don't have a "none of the
			
 
				+    # above" option here because we only record fingerprint metadata in cases where there's some
			
 
				+    # sort of custom fingerprint.)
			
 
				+    fingerprint_source: str
			
 
				+    # The fingerprint value set in the SDK, if anything other than ["{{ default }}"]. Note that just
			
 
				+    # because this is set doesn't mean we necessarily used it for grouping, since server-side rules
			
 
				+    # take precedence over client fingerprints. See `fingerprint_source` above.
			
 
				+    client_fingerprint: NotRequired[str]
			
 
				+    # The server-side rule applied, if any
			
 
				+    matched_fingerprinting_rule: NotRequired[str]
			
 
				+    # Whether or not a hybrid fingerprint (one involving both the signal value `{{ default }}` and a
			
 
				+    # custom value) was used. In that case, we group as we normally would, but then split the events
			
 
				+    # into more granular groups based on the custom value.
			
 
				+    is_hybrid_fingerprint: bool
			
 
				+
			
 
				+
			
 
				+class MessageHashingMetadata(TypedDict):
			
 
				+    """
			
 
				+    Data gathered when an event is grouped by log message or error type and value
			
 
				+    """
			
 
				+
			
 
				+    # Either "message" (from "message" or "logentry") or "exception" (error type and value, in cases
			
 
				+    # where there's no stacktrace)
			
 
				+    message_source: str
			
 
				+    # Whether we've done any parameterization of the message, such as replacing a number with "<int>"
			
 
				+    message_parameterized: bool
			
 
				+
			
 
				+
			
 
				+class SaltedMessageHashingMetadata(MessageHashingMetadata, FingerprintHashingMetadata):
			
 
				+    """
			
 
				+    Data from message-based hybrid fingerprinting
			
 
				+    """
			
 
				+
			
 
				+    pass
			
 
				+
			
 
				+
			
 
				+class StacktraceHashingMetadata(TypedDict):
			
 
				+    """
			
 
				+    Data gathered when an event is grouped based on a stacktrace found in an exception, a thread, or
			
 
				+    directly in the event
			
 
				+    """
			
 
				+
			
 
				+    # Either "in-app" or "system"
			
 
				+    stacktrace_type: str
			
 
				+    # Where in the event data the stacktrace was found - either "exception", "thread", or
			
 
				+    # "top-level"
			
 
				+    stacktrace_location: str
			
 
				+    # The number of stacktraces used for grouping (will be more than 1 in cases of chained
			
 
				+    # exceptions)
			
 
				+    num_stacktraces: int
			
 
				+
			
 
				+
			
 
				+class SaltedStacktraceHashingMetadata(StacktraceHashingMetadata, FingerprintHashingMetadata):
			
 
				+    """
			
 
				+    Data from stacktrace-based hybrid fingerprinting
			
 
				+    """
			
 
				+
			
 
				+    pass
			
 
				+
			
 
				+
			
 
				+class SecurityHashingMetadata(TypedDict):
			
 
				+    """
			
 
				+    Data gathered when grouping browser-based security (Content Security Policy, Certificate
			
 
				+    Transparency, Online Certificate Status Protocol Stapling, or HTTP Public Key Pinning) reports
			
 
				+    """
			
 
				+
			
 
				+    # Either "csp", "expect-ct", "expect-staple", or "hpkp"
			
 
				+    security_report_type: str
			
 
				+    # Domain name of the blocked address
			
 
				+    blocked_host: str
			
 
				+    # The CSP directive which was violated
			
 
				+    csp_directive: NotRequired[str]
			
 
				+    # In the case of a local `script-src` violation, whether it's an `unsafe-inline` or an
			
 
				+    # `unsafe-eval` violation
			
 
				+    csp_script_violation: NotRequired[str]
			
 
				+
			
 
				+
			
 
				+class SaltedSecurityHashingMetadata(SecurityHashingMetadata, FingerprintHashingMetadata):
			
 
				+    """
			
 
				+    Data from security-report-based hybrid fingerprinting
			
 
				+    """
			
 
				+
			
 
				+    pass
			
 
				+
			
 
				+
			
 
				+class TemplateHashingMetadata(TypedDict):
			
 
				+    """
			
 
				+    Data gathered when grouping errors generated by Django templates
			
 
				+    """
			
 
				+
			
 
				+    # The name of the template with the invalid template variable
			
 
				+    template_name: NotRequired[str]
			
 
				+    # The text of the line in the template containing the invalid variable
			
 
				+    template_context_line: NotRequired[str]
			
 
				+
			
 
				+
			
 
				+class SaltedTemplateHashingMetadata(TemplateHashingMetadata, FingerprintHashingMetadata):
			
 
				+    """
			
 
				+    Data from template-based hybrid fingerprinting
			
 
				+    """
			
 
				+
			
 
				+    pass
			
 
				+
			
 
				+
			
 
				+class ChecksumHashingMetadata(TypedDict):
			
 
				+    """
			
 
				+    Data gathered when legacy checksum grouping (wherein a hash is provided directly in the event)
			
 
				+    is used
			
 
				+    """
			
 
				+
			
 
				+    # The checksum used for grouping
			
 
				+    checksum: str
			
 
				+    # The incoming checksum value, if it was something other than a 32-digit hex value and we
			
 
				+    # therefore had to hash it before using it
			
 
				+    raw_checksum: NotRequired[str]
			
 
				+
			
 
				+
			
 
				+class FallbackHashingMetadata(TypedDict):
			
 
				+    """
			
 
				+    Data gathered when no other grouping method produces results
			
 
				+    """
			
 
				+
			
 
				+    # Whether we landed in the fallback because of a lack of data, because we had a stacktrace but
			
 
				+    # all frames were ignored, or some other reason
			
 
				+    fallback_reason: str
			
 
				+
			
 
				+
			
 
				+HashingMetadata = (
			
 
				+    FingerprintHashingMetadata
			
 
				+    | MessageHashingMetadata
			
 
				+    | SaltedMessageHashingMetadata
			
 
				+    | StacktraceHashingMetadata
			
 
				+    | SaltedStacktraceHashingMetadata
			
 
				+    | SecurityHashingMetadata
			
 
				+    | SaltedSecurityHashingMetadata
			
 
				+    | TemplateHashingMetadata
			
 
				+    | SaltedTemplateHashingMetadata
			
 
				+    | ChecksumHashingMetadata
			
 
				+    | FallbackHashingMetadata
			
 
				+)