
Tag SQL optimization

David Burke · 2 years ago · commit ed005c36b2

+ 20 - 0
events/tests/tests.py

@@ -205,6 +205,26 @@ class EventStoreTestCase(APITestCase):
         res = self.client.post(self.url, data, format="json")
         self.assertEqual(res.status_code, 200)
 
+    def test_store_somewhat_large_data(self):
+        """
+        This test is expected to exceed the 1MB limit of a Postgres tsvector
+        only when two events exist for one issue.
+        """
+        with open("events/test_data/py_hi_event.json") as json_file:
+            data = json.load(json_file)
+
+        data["platform"] = " ".join([str(random.random()) for _ in range(30000)])
+        res = self.client.post(self.url, data, format="json")
+        self.assertEqual(res.status_code, 200)
+        data["event_id"] = "6600a066e64b4caf8ed7ec5af64ac4be"
+        data["platform"] = " ".join([str(random.random()) for _ in range(30000)])
+        res = self.client.post(self.url, data, format="json")
+        self.assertEqual(res.status_code, 200)
+        self.assertTrue(
+            Issue.objects.first().search_vector,
+            "tsvector is expected",
+        )
+
     @patch("events.views.logger")
     def test_invalid_event(self, mock_logger):
         with open("events/test_data/py_hi_event.json") as json_file:
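For background: Postgres caps a single tsvector value at 1MB, which is the limit this test probes. A rough, illustrative way to see how close stored vectors get to that cap (table and column names taken from the update_issue_index procedure further down; this query is not part of the commit):

    -- Illustrative inspection query, not part of the commit:
    SELECT id,
           length(search_vector)         AS lexeme_count,
           pg_column_size(search_vector) AS stored_bytes
    FROM issues_issue
    ORDER BY pg_column_size(search_vector) DESC
    LIMIT 5;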

+ 3 - 23
issues/migrations/0005_issue_tags.py

@@ -6,27 +6,6 @@ from .sql.triggers import UPDATE_ISSUE_TRIGGER
 from .sql.functions import GENERATE_ISSUE_TSVECTOR
 
 
-def forwards_func(apps, schema_editor):
-    Issue = apps.get_model("issues", "Issue")
-    for issue in Issue.objects.all()[:5000]:
-        tags = (
-            issue.event_set.all()
-            .order_by("tags")
-            .values_list("tags", flat=True)
-            .distinct()
-        )
-        super_dict = collections.defaultdict(set)
-        for tag in tags:
-            for key, value in tag.items():
-                super_dict[key].add(value)
-        issue.tags = {k: list(v) for k, v in super_dict.items()}
-        issue.save(update_fields=["tags"])
-
-
-def noop(*args, **kargs):
-    pass
-
-
 class Migration(migrations.Migration):
 
     dependencies = [
@@ -35,9 +14,10 @@ class Migration(migrations.Migration):
 
     operations = [
         migrations.AddField(
-            model_name="issue", name="tags", field=models.JSONField(default=dict),
+            model_name="issue",
+            name="tags",
+            field=models.JSONField(default=dict),
         ),
         migrations.RunSQL(UPDATE_ISSUE_TRIGGER, UPDATE_ISSUE_TRIGGER),
         migrations.RunSQL(GENERATE_ISSUE_TSVECTOR, GENERATE_ISSUE_TSVECTOR),
-        migrations.RunPython(forwards_func, noop),
     ]

+ 2 - 3
issues/migrations/0006_auto_20220120_0040.py

@@ -6,8 +6,7 @@ from django.db import migrations
 class Migration(migrations.Migration):
 
     dependencies = [
-        ('issues', '0005_issue_tags'),
+        ("issues", "0005_issue_tags"),
     ]
 
-    operations = [
-    ]
+    operations = []

+ 1 - 2
issues/migrations/0008_auto_20221114_2029.py

@@ -1,7 +1,6 @@
 # Generated by Django 4.1.3 on 2022-11-14 20:29
 
 from django.db import migrations
-from .sql.functions import UPDATE_ISSUE_INDEX
 
 
 class Migration(migrations.Migration):
@@ -10,4 +9,4 @@ class Migration(migrations.Migration):
         ("issues", "0007_auto_20220715_2048"),
     ]
 
-    operations = [migrations.RunSQL(UPDATE_ISSUE_INDEX)]
+    operations = []

+ 16 - 0
issues/migrations/0010_auto_20221202_0110.py

@@ -0,0 +1,16 @@
+# Generated by Django 4.1.3 on 2022-12-02 01:10
+
+from django.db import migrations
+from .sql.functions import UPDATE_ISSUE_INDEX, JSONB_RECURSIVE_MERGE
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("issues", "0009_alter_issue_id"),
+    ]
+
+    operations = [
+        migrations.RunSQL(JSONB_RECURSIVE_MERGE),
+        migrations.RunSQL(UPDATE_ISSUE_INDEX),
+    ]
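The ordering of the two RunSQL operations looks deliberate: the reworked update_issue_index procedure (below) calls jsonb_merge_deep, so the merge function has to be installed first. An illustrative post-migration sanity check that all three routines exist (not part of the commit):

    -- Illustrative check, not part of the commit:
    SELECT proname FROM pg_proc
    WHERE proname IN ('remove_dupes', 'jsonb_merge_deep', 'update_issue_index');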

+ 53 - 14
issues/migrations/sql/functions.py

@@ -8,6 +8,41 @@ END;
 $$ LANGUAGE plpgsql;;
 """
 
+# https://stackoverflow.com/a/42998229/443457
+JSONB_RECURSIVE_MERGE = """
+create or replace function remove_dupes(p_array jsonb)
+  returns jsonb
+as
+$$
+select jsonb_agg(distinct e)
+from jsonb_array_elements(p_array) as t(e);
+$$
+language sql;
+
+create or replace function jsonb_merge_deep(jsonb, jsonb)
+  returns jsonb
+  language sql
+  immutable
+as $func$
+  select case jsonb_typeof($1)
+    when 'object' then case jsonb_typeof($2)
+      when 'object' then (
+        select    jsonb_object_agg(k, case
+                    when e2.v is null then e1.v
+                    when e1.v is null then e2.v
+                    else jsonb_merge_deep(e1.v, e2.v)
+                  end)
+        from      jsonb_each($1) e1(k, v)
+        full join jsonb_each($2) e2(k, v) using (k)
+      )
+      else $2
+    end
+    when 'array' then remove_dupes($1 || $2)
+    else $2
+  end
+$func$;
+"""
+
 UPDATE_ISSUE_INDEX = """
 DROP PROCEDURE IF EXISTS update_issue_index;
 CREATE OR REPLACE PROCEDURE update_issue_index(update_issue_id integer)
@@ -16,35 +51,39 @@ AS $$
 WITH event_agg as (
     SELECT COUNT(events_event.event_id) as new_count,
     MAX(events_event.created) as new_last_seen,
-    MAX(events_event.level) as new_level
+    MAX(events_event.level) as new_level,
+    (
+      SELECT jsonb_object_agg(y.key, y.values) as new_tags FROM (
+        SELECT (a).key, array_agg(distinct(a).value) as values
+        FROM (
+          SELECT each(events_event.tags) as a
+          FROM events_event
+          LEFT JOIN issues_issue ON issues_issue.id = events_event.issue_id
+          WHERE events_event.issue_id=update_issue_id
+          AND events_event.created > issues_issue.last_seen
+          ) t GROUP by key
+      ) y
+    )
     FROM events_event
     LEFT JOIN issues_issue ON issues_issue.id = events_event.issue_id
     WHERE events_event.issue_id=update_issue_id
     AND events_event.created > issues_issue.last_seen
 ), event_vector as (
-    SELECT strip(COALESCE(generate_issue_tsvector(data), '') || COALESCE(issues_issue.search_vector, '')) as vector
+    SELECT strip(COALESCE(generate_issue_tsvector(data), '')) as vector
     FROM events_event
     LEFT JOIN issues_issue on issues_issue.id = events_event.issue_id
     WHERE events_event.issue_id=update_issue_id
+    AND events_event.created > issues_issue.last_seen
     limit 1
-), event_tags as (
-  SELECT jsonb_object_agg(y.key, y.values) as new_tags FROM (
-    SELECT (a).key, array_agg(distinct(a).value) as values
-    FROM (
-      SELECT each(tags) as a
-      FROM events_event
-      WHERE events_event.issue_id=update_issue_id
-    ) t GROUP by key
-  ) y
 )
 UPDATE issues_issue
 SET
   count = event_agg.new_count + issues_issue.count,
   last_seen = GREATEST(event_agg.new_last_seen, issues_issue.last_seen),
   level = GREATEST(event_agg.new_level, issues_issue.level),
-  search_vector = CASE WHEN search_vector is null or length(search_vector) < 100000 THEN event_vector.vector ELSE search_vector END,
-  tags = CASE WHEN event_Tags.new_tags is not null THEN event_tags.new_tags ELSE tags END
-FROM event_agg, event_vector, event_tags
+  search_vector = CASE WHEN length(event_vector.vector) < 10000 THEN event_vector.vector || COALESCE(search_vector, '') ELSE search_vector END,
+  tags = CASE WHEN event_agg.new_tags is not null THEN jsonb_merge_deep(event_agg.new_tags, tags) ELSE tags END
+FROM event_agg, event_vector
 WHERE issues_issue.id = update_issue_id;
 $$;
 """

+ 0 - 1
issues/migrations/sql/triggers.py

@@ -1,7 +1,6 @@
 # No longer used
 UPDATE_ISSUE_TRIGGER = """
 DROP TRIGGER IF EXISTS event_issue_update on events_event;
-DROP FUNCTION IF EXISTS update_issue;
 """
 
 INCREMENT_PROJECT_COUNTER_TRIGGER = """

+ 10 - 1
issues/tests/test_search_filter.py

@@ -15,7 +15,7 @@ class FilterTestCase(GlitchTipTestCase):
         self.url = reverse("issue-list")
 
     def test_filter_by_date(self):
-        """ A user should be able to filter by start and end datetimes. """
+        """A user should be able to filter by start and end datetimes."""
         with freeze_time(timezone.datetime(1999, 1, 1)):
             event1 = baker.make("events.Event", issue__project=self.project)
         with freeze_time(timezone.datetime(2010, 1, 1)):
@@ -238,3 +238,12 @@ class SearchTestCase(GlitchTipTestCase):
         self.assertContains(res, "matchingEventId")
         self.assertContains(res, event2.event_id.hex)
         self.assertEqual(res.headers.get("X-Sentry-Direct-Hit"), "1")
+
+        event3 = baker.make(
+            "events.Event", issue=event.issue, data={"name": "plum sauce"}
+        )
+        update_search_index_all_issues()
+        res = self.client.get(self.url + '?query=is:unresolved "plum sauce"')
+        self.assertContains(res, event3.issue.title)
+        res = self.client.get(self.url + '?query=is:unresolved "apple sauce"')
+        self.assertContains(res, event.issue.title)
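The new assertions exercise quoted-phrase search against the maintained vector. Roughly the kind of match the endpoint ends up running against issues_issue (illustrative only; the real SQL is generated by Django's full-text search machinery, and plainto_tsquery is an assumption here):

    -- Illustrative sketch, assuming plainto_tsquery-style matching:
    SELECT id, title
    FROM issues_issue
    WHERE search_vector @@ plainto_tsquery('english', 'plum sauce');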