Browse Source

feat(sessions): Enforce stricter rules on query interval (#24264)

This rejects intervals that would lead to weird rounding, and will throw clear error
messages instead of changing the interval in non-obvious ways.
Moves the date-range related code to the sessions API and prepares the code for minute-resolution sessions.
Arpad Borsos 4 years ago
parent
commit
99d3a6ba1e

+ 2 - 1
src/sentry/api/endpoints/organization_sessions.py

@@ -8,6 +8,7 @@ import sentry_sdk
 from sentry.api.bases import OrganizationEventsEndpointBase, NoProjects
 from sentry.snuba.sessions_v2 import (
     InvalidField,
+    InvalidParams,
     QueryDefinition,
     run_sessions_query,
     massage_sessions_result,
@@ -44,5 +45,5 @@ class OrganizationSessionsEndpoint(OrganizationEventsEndpointBase):
             # TODO: this context manager should be decoupled from `OrganizationEventsEndpointBase`?
             with super().handle_query_errors():
                 yield
-        except (InvalidField, NoProjects) as error:
+        except (InvalidField, InvalidParams, NoProjects) as error:
             raise ParseError(detail=str(error))

+ 1 - 50
src/sentry/api/utils.py

@@ -1,11 +1,9 @@
 from datetime import timedelta
 
-import math
 from django.utils import timezone
 
 from sentry.search.utils import parse_datetime_string, InvalidQuery
-from sentry.utils.dates import parse_stats_period, to_timestamp, to_datetime
-from sentry.constants import MAX_ROLLUP_POINTS
+from sentry.utils.dates import parse_stats_period
 
 MAX_STATS_PERIOD = timedelta(days=90)
 
@@ -81,50 +79,3 @@ def get_date_range_from_params(params, optional=False):
         raise InvalidParams("start must be before end")
 
     return start, end
-
-
-def get_date_range_rollup_from_params(
-    params,
-    minimum_interval="1h",
-    default_interval="",
-    round_range=False,
-    max_points=MAX_ROLLUP_POINTS,
-):
-    """
-    Similar to `get_date_range_from_params`, but this also parses and validates
-    an `interval`, as `get_rollup_from_request` would do.
-
-    This also optionally rounds the returned range to the given `interval`.
-    The rounding uses integer arithmetic on unix timestamps, so might yield
-    unexpected results when the interval is > 1d.
-    """
-    minimum_interval = parse_stats_period(minimum_interval).total_seconds()
-    interval = parse_stats_period(params.get("interval", default_interval))
-    interval = minimum_interval if interval is None else interval.total_seconds()
-    if interval <= 0:
-        raise InvalidParams("Interval cannot result in a zero duration.")
-
-    # round the interval up to the minimum
-    interval = int(minimum_interval * math.ceil(interval / minimum_interval))
-
-    start, end = get_date_range_from_params(params)
-    date_range = end - start
-
-    # round the range up to a multiple of the interval
-    if round_range:
-        date_range = timedelta(
-            seconds=int(interval * math.ceil(date_range.total_seconds() / interval))
-        )
-
-    if date_range.total_seconds() / interval > max_points:
-        raise InvalidParams(
-            "Your interval and date range would create too many results. "
-            "Use a larger interval, or a smaller date range."
-        )
-
-    if round_range:
-        end_ts = int(interval * math.ceil(to_timestamp(end) / interval))
-        end = to_datetime(end_ts)
-        start = end - date_range
-
-    return start, end, interval

+ 82 - 5
src/sentry/snuba/sessions_v2.py

@@ -1,10 +1,11 @@
-from datetime import datetime
+from datetime import datetime, timedelta
 import itertools
 import math
+import pytz
 
 from sentry.api.event_search import get_filter
-from sentry.api.utils import get_date_range_rollup_from_params
-from sentry.utils.dates import to_timestamp
+from sentry.api.utils import get_date_range_from_params
+from sentry.utils.dates import parse_stats_period, to_timestamp, to_datetime
 from sentry.utils.snuba import Dataset, raw_query, resolve_condition
 
 """
@@ -226,7 +227,8 @@ class QueryDefinition:
     `fields` and `groupby` definitions as [`ColumnDefinition`] objects.
     """
 
-    def __init__(self, query, params):
+    # XXX: Do *not* enable `allow_minute_resolution` yet outside of unit tests!
+    def __init__(self, query, params, allow_minute_resolution=False):
         self.query = query.get("query", "")
         raw_fields = query.getlist("field", [])
         raw_groupby = query.getlist("groupBy", [])
@@ -246,7 +248,7 @@ class QueryDefinition:
                 raise InvalidField(f'Invalid groupBy: "{key}"')
             self.groupby.append(GROUPBY_MAP[key])
 
-        start, end, rollup = get_date_range_rollup_from_params(query, "1h", round_range=True)
+        start, end, rollup = _get_constrained_date_range(query, allow_minute_resolution)
         self.rollup = rollup
         self.start = start
         self.end = end
@@ -277,6 +279,81 @@ class QueryDefinition:
         self.filter_keys = snuba_filter.filter_keys
 
 
+MAX_POINTS = 1000
+ONE_DAY = timedelta(days=1).total_seconds()
+ONE_HOUR = timedelta(hours=1).total_seconds()
+ONE_MINUTE = timedelta(minutes=1).total_seconds()
+
+
+class InvalidParams(Exception):
+    pass
+
+
+def _get_constrained_date_range(params, allow_minute_resolution=False):
+    interval = parse_stats_period(params.get("interval", "1h"))
+    interval = int(3600 if interval is None else interval.total_seconds())
+
+    smallest_interval = ONE_MINUTE if allow_minute_resolution else ONE_HOUR
+    if interval % smallest_interval != 0 or interval < smallest_interval:
+        interval_str = "one minute" if allow_minute_resolution else "one hour"
+        raise InvalidParams(
+            f"The interval has to be a multiple of the minimum interval of {interval_str}."
+        )
+
+    if interval > ONE_DAY:
+        raise InvalidParams("The interval has to be less than one day.")
+
+    if ONE_DAY % interval != 0:
+        raise InvalidParams("The interval should divide one day without a remainder.")
+
+    using_minute_resolution = interval % ONE_HOUR != 0
+
+    start, end = get_date_range_from_params(params)
+
+    # if `end` is explicitly given, we add a second to it, so it is treated as
+    # inclusive. the rounding logic down below will take care of the rest.
+    if params.get("end"):
+        end += timedelta(seconds=1)
+
+    date_range = end - start
+    # round the range up to a multiple of the interval.
+    # the minimum is 1h so the "totals" will not go out of sync, as they will
+    # use the materialized storage due to no grouping on the `started` column.
+    rounding_interval = int(math.ceil(interval / ONE_HOUR) * ONE_HOUR)
+    date_range = timedelta(
+        seconds=int(rounding_interval * math.ceil(date_range.total_seconds() / rounding_interval))
+    )
+
+    if using_minute_resolution:
+        if date_range.total_seconds() > 6 * ONE_HOUR:
+            raise InvalidParams(
+                "The time-range when using one-minute resolution intervals is restricted to 6 hours."
+            )
+        if (datetime.now(tz=pytz.utc) - start).total_seconds() > 30 * ONE_DAY:
+            raise InvalidParams(
+                "The time-range when using one-minute resolution intervals is restricted to the last 30 days."
+            )
+
+    if date_range.total_seconds() / interval > MAX_POINTS:
+        raise InvalidParams(
+            "Your interval and date range would create too many results. "
+            "Use a larger interval, or a smaller date range."
+        )
+
+    end_ts = int(rounding_interval * math.ceil(to_timestamp(end) / rounding_interval))
+    end = to_datetime(end_ts)
+    # when expanding the rounding interval, we would adjust the end time too far
+    # to the future, in which case the start time would not actually contain our
+    # desired date range. adjust for this by extend the time by another interval.
+    # for example, when "45m" means the range from 08:49:00-09:34:00, our rounding
+    # has to go from 08:00:00 to 10:00:00.
+    if rounding_interval > interval and (end - date_range) > start:
+        date_range += timedelta(seconds=rounding_interval)
+    start = end - date_range
+
+    return start, end, interval
+
+
 TS_COL = "bucketed_started"
 
 

+ 0 - 45
tests/sentry/api/test_utils.py

@@ -5,7 +5,6 @@ from freezegun import freeze_time
 
 from sentry.api.utils import (
     get_date_range_from_params,
-    get_date_range_rollup_from_params,
     InvalidParams,
     MAX_STATS_PERIOD,
 )
@@ -62,47 +61,3 @@ class GetDateRangeFromParamsTest(TestCase):
 
         with self.assertRaises(InvalidParams):
             start, end = get_date_range_from_params({"statsPeriodStart": "14d"})
-
-
-class GetDateRangeRollupFromParamsTest(TestCase):
-    def test_intervals(self):
-        # defaults to 1h
-        start, end, interval = get_date_range_rollup_from_params({})
-        assert interval == 3600
-
-        # rounds up to a multiple of the minimum
-        start, end, interval = get_date_range_rollup_from_params(
-            {"statsPeriod": "14h", "interval": "8m"}, minimum_interval="5m"
-        )
-        assert interval == 600
-
-    @freeze_time("2018-12-11 03:21:34")
-    def test_round_range(self):
-        start, end, interval = get_date_range_rollup_from_params(
-            {"statsPeriod": "2d"}, round_range=True
-        )
-        assert start == datetime.datetime(2018, 12, 9, 4, tzinfo=timezone.utc)
-        assert end == datetime.datetime(2018, 12, 11, 4, tzinfo=timezone.utc)
-
-        start, end, interval = get_date_range_rollup_from_params(
-            {"statsPeriod": "2d", "interval": "1d"}, round_range=True
-        )
-        assert start == datetime.datetime(2018, 12, 10, tzinfo=timezone.utc)
-        assert end == datetime.datetime(2018, 12, 12, tzinfo=timezone.utc)
-
-    def test_invalid_interval(self):
-        with self.assertRaises(InvalidParams):
-            start, end, interval = get_date_range_rollup_from_params({"interval": "0d"})
-        with self.assertRaises(InvalidParams):
-            # defaults stats period is 90d
-            start, end, interval = get_date_range_rollup_from_params(
-                {"interval": "1d"}, max_points=80
-            )
-
-    def test_round_exact(self):
-        start, end, interval = get_date_range_rollup_from_params(
-            {"start": "2021-01-12T04:06:16", "end": "2021-01-17T08:26:13", "interval": "1d"},
-            round_range=True,
-        )
-        assert start == datetime.datetime(2021, 1, 12, tzinfo=timezone.utc)
-        assert end == datetime.datetime(2021, 1, 18, tzinfo=timezone.utc)

+ 19 - 5
tests/snuba/api/endpoints/test_organization_sessions.py

@@ -122,13 +122,19 @@ class OrganizationSessionsEndpointTest(APITestCase, SnubaTestCase):
         assert response.data == {"detail": 'Invalid groupBy: "envriomnent"'}
 
     def test_invalid_query(self):
-        response = self.do_request({"field": ["sum(session)"], "query": ["foo:bar"]})
+        response = self.do_request(
+            {"statsPeriod": "1d", "field": ["sum(session)"], "query": ["foo:bar"]}
+        )
 
         assert response.status_code == 400, response.content
         assert response.data == {"detail": 'Invalid query field: "foo"'}
 
         response = self.do_request(
-            {"field": ["sum(session)"], "query": ["release:foo-bar@1.2.3 (123)"]}
+            {
+                "statsPeriod": "1d",
+                "field": ["sum(session)"],
+                "query": ["release:foo-bar@1.2.3 (123)"],
+            }
         )
 
         assert response.status_code == 400, response.content
@@ -137,13 +143,14 @@ class OrganizationSessionsEndpointTest(APITestCase, SnubaTestCase):
         assert response.data == {"detail": 'Invalid query field: "message"'}
 
     def test_too_many_points(self):
-        # TODO: looks like this is well within the range of valid points
-        return
         # default statsPeriod is 90d
         response = self.do_request({"field": ["sum(session)"], "interval": "1h"})
 
         assert response.status_code == 400, response.content
-        assert response.data == {}
+        assert response.data == {
+            "detail": "Your interval and date range would create too many results. "
+            "Use a larger interval, or a smaller date range."
+        }
 
     @freeze_time("2021-01-14T12:27:28.303Z")
     def test_timeseries_interval(self):
@@ -205,7 +212,14 @@ class OrganizationSessionsEndpointTest(APITestCase, SnubaTestCase):
         response = self.do_request(
             {"project": [-1], "statsPeriod": "2h", "interval": "5m", "field": ["sum(session)"]}
         )
+        assert response.status_code == 400, response.content
+        assert response.data == {
+            "detail": "The interval has to be a multiple of the minimum interval of one hour."
+        }
 
+        response = self.do_request(
+            {"project": [-1], "statsPeriod": "2h", "interval": "1h", "field": ["sum(session)"]}
+        )
         assert response.status_code == 200, response.content
         assert result_sorted(response.data) == {
             "query": "",

+ 143 - 2
tests/snuba/sessions/test_sessions_v2.py

@@ -1,5 +1,8 @@
 import math
+import pytest
+import pytz
 
+from datetime import datetime
 from freezegun import freeze_time
 from django.http import QueryDict
 
@@ -8,11 +11,13 @@ from sentry.snuba.sessions_v2 import (
     QueryDefinition,
     massage_sessions_result,
     _get_timestamps,
+    InvalidParams,
+    _get_constrained_date_range,
 )
 
 
-def _make_query(qs):
-    return QueryDefinition(QueryDict(qs), {})
+def _make_query(qs, allow_minute_resolution=True):
+    return QueryDefinition(QueryDict(qs), {}, allow_minute_resolution)
 
 
 def result_sorted(result):
@@ -25,6 +30,85 @@ def result_sorted(result):
     return result
 
 
+@freeze_time("2018-12-11 03:21:34")
+def test_round_range():
+    start, end, interval = _get_constrained_date_range({"statsPeriod": "2d"})
+    assert start == datetime(2018, 12, 9, 4, tzinfo=pytz.utc)
+    assert end == datetime(2018, 12, 11, 4, tzinfo=pytz.utc)
+
+    start, end, interval = _get_constrained_date_range({"statsPeriod": "2d", "interval": "1d"})
+    assert start == datetime(2018, 12, 10, tzinfo=pytz.utc)
+    assert end == datetime(2018, 12, 12, tzinfo=pytz.utc)
+
+
+def test_invalid_interval():
+    with pytest.raises(InvalidParams):
+        start, end, interval = _get_constrained_date_range({"interval": "0d"})
+
+
+def test_round_exact():
+    start, end, interval = _get_constrained_date_range(
+        {"start": "2021-01-12T04:06:16", "end": "2021-01-17T08:26:13", "interval": "1d"},
+    )
+    assert start == datetime(2021, 1, 12, tzinfo=pytz.utc)
+    assert end == datetime(2021, 1, 18, tzinfo=pytz.utc)
+
+
+def test_inclusive_end():
+    start, end, interval = _get_constrained_date_range(
+        {"start": "2021-02-24T00:00:00", "end": "2021-02-25T00:00:00", "interval": "1h"},
+    )
+    assert start == datetime(2021, 2, 24, tzinfo=pytz.utc)
+    assert end == datetime(2021, 2, 25, 1, tzinfo=pytz.utc)
+
+
+@freeze_time("2021-03-05T11:14:17.105Z")
+def test_interval_restrictions():
+    # making sure intervals are cleanly divisible
+    with pytest.raises(InvalidParams, match="The interval has to be less than one day."):
+        _make_query("statsPeriod=4d&interval=2d&field=sum(session)")
+    with pytest.raises(
+        InvalidParams, match="The interval should divide one day without a remainder."
+    ):
+        _make_query("statsPeriod=6h&interval=59m&field=sum(session)")
+    with pytest.raises(
+        InvalidParams, match="The interval should divide one day without a remainder."
+    ):
+        _make_query("statsPeriod=4d&interval=5h&field=sum(session)")
+
+    _make_query("statsPeriod=6h&interval=90m&field=sum(session)")
+    with pytest.raises(
+        InvalidParams,
+        match="The interval has to be a multiple of the minimum interval of one hour.",
+    ):
+        _make_query("statsPeriod=6h&interval=90m&field=sum(session)", False)
+
+    with pytest.raises(
+        InvalidParams,
+        match="The interval has to be a multiple of the minimum interval of one minute.",
+    ):
+        _make_query("statsPeriod=1h&interval=90s&field=sum(session)")
+
+    # restrictions for minute resolution time range
+    with pytest.raises(
+        InvalidParams,
+        match="The time-range when using one-minute resolution intervals is restricted to 6 hours.",
+    ):
+        _make_query("statsPeriod=7h&interval=15m&field=sum(session)")
+    with pytest.raises(
+        InvalidParams,
+        match="The time-range when using one-minute resolution intervals is restricted to the last 30 days.",
+    ):
+        _make_query(
+            "start=2021-01-05T11:14:17&end=2021-01-05T12:14:17&interval=15m&field=sum(session)"
+        )
+
+    with pytest.raises(
+        InvalidParams, match="Your interval and date range would create too many results."
+    ):
+        _make_query("statsPeriod=90d&interval=1h&field=sum(session)")
+
+
 @freeze_time("2020-12-18T11:14:17.105Z")
 def test_timestamps():
     query = _make_query("statsPeriod=1d&interval=12h&field=sum(session)")
@@ -35,6 +119,63 @@ def test_timestamps():
     assert actual_timestamps == expected_timestamps
 
 
+@freeze_time("2021-03-08T09:34:00.000Z")
+def test_hourly_rounded_start():
+    query = _make_query("statsPeriod=30m&interval=1m&field=sum(session)")
+
+    actual_timestamps = _get_timestamps(query)
+
+    assert actual_timestamps[0] == "2021-03-08T09:00:00Z"
+    assert len(actual_timestamps) == 60
+
+    # in this case "45m" means from 08:49:00-09:34:00, but since we round start/end
+    # to hours, we extend the start time to 08:00:00.
+    query = _make_query("statsPeriod=45m&interval=1m&field=sum(session)")
+
+    actual_timestamps = _get_timestamps(query)
+
+    assert actual_timestamps[0] == "2021-03-08T08:00:00Z"
+    assert len(actual_timestamps) == 120
+
+
+def test_rounded_end():
+    query = _make_query(
+        "field=sum(session)&interval=1h&start=2021-02-24T00:00:00Z&end=2021-02-25T00:00:00Z"
+    )
+
+    expected_timestamps = [
+        "2021-02-24T00:00:00Z",
+        "2021-02-24T01:00:00Z",
+        "2021-02-24T02:00:00Z",
+        "2021-02-24T03:00:00Z",
+        "2021-02-24T04:00:00Z",
+        "2021-02-24T05:00:00Z",
+        "2021-02-24T06:00:00Z",
+        "2021-02-24T07:00:00Z",
+        "2021-02-24T08:00:00Z",
+        "2021-02-24T09:00:00Z",
+        "2021-02-24T10:00:00Z",
+        "2021-02-24T11:00:00Z",
+        "2021-02-24T12:00:00Z",
+        "2021-02-24T13:00:00Z",
+        "2021-02-24T14:00:00Z",
+        "2021-02-24T15:00:00Z",
+        "2021-02-24T16:00:00Z",
+        "2021-02-24T17:00:00Z",
+        "2021-02-24T18:00:00Z",
+        "2021-02-24T19:00:00Z",
+        "2021-02-24T20:00:00Z",
+        "2021-02-24T21:00:00Z",
+        "2021-02-24T22:00:00Z",
+        "2021-02-24T23:00:00Z",
+        "2021-02-25T00:00:00Z",
+    ]
+    actual_timestamps = _get_timestamps(query)
+
+    assert len(actual_timestamps) == 25
+    assert actual_timestamps == expected_timestamps
+
+
 def test_simple_query():
     query = _make_query("statsPeriod=1d&interval=12h&field=sum(session)")