# test_tsdb_backend.py
  1. from datetime import datetime, timedelta
  2. from unittest.mock import patch
  3. import pytz
  4. from sentry.models import Environment, Group, GroupRelease, Release
  5. from sentry.testutils import SnubaTestCase, TestCase
  6. from sentry.testutils.helpers.datetime import iso_format
  7. from sentry.tsdb.base import TSDBModel
  8. from sentry.tsdb.snuba import SnubaTSDB
  9. from sentry.utils.dates import to_datetime, to_timestamp
  10. def timestamp(d):
  11. t = int(to_timestamp(d))
  12. return t - (t % 3600)
  13. def has_shape(data, shape, allow_empty=False):
  14. """
  15. Determine if a data object has the provided shape
  16. At any level, the object in `data` and in `shape` must have the same type.
  17. A dict is the same shape if all its keys and values have the same shape as the
  18. key/value in `shape`. The number of keys/values is not relevant.
  19. A list is the same shape if all its items have the same shape as the value
  20. in `shape`
  21. A tuple is the same shape if it has the same length as `shape` and all the
  22. values have the same shape as the corresponding value in `shape`
  23. Any other object simply has to have the same type.
  24. If `allow_empty` is set, lists and dicts in `data` will pass even if they are empty.
  25. """
  26. if not isinstance(data, type(shape)):
  27. return False
  28. if isinstance(data, dict):
  29. return (
  30. (allow_empty or len(data) > 0)
  31. and all(has_shape(k, list(shape.keys())[0]) for k in data.keys())
  32. and all(has_shape(v, list(shape.values())[0]) for v in data.values())
  33. )
  34. elif isinstance(data, list):
  35. return (allow_empty or len(data) > 0) and all(has_shape(v, shape[0]) for v in data)
  36. elif isinstance(data, tuple):
  37. return len(data) == len(shape) and all(
  38. has_shape(data[i], shape[i]) for i in range(len(data))
  39. )
  40. else:
  41. return True
  42. class SnubaTSDBTest(TestCase, SnubaTestCase):
  43. def setUp(self):
  44. super().setUp()
  45. self.db = SnubaTSDB()
  46. self.now = (datetime.utcnow() - timedelta(hours=4)).replace(
  47. hour=0, minute=0, second=0, microsecond=0, tzinfo=pytz.UTC
  48. )
  49. self.proj1 = self.create_project()
  50. env1 = "test"
  51. env2 = "dev"
  52. defaultenv = ""
  53. release1 = "1" * 10
  54. release2 = "2" * 10
  55. self.release1 = Release.objects.create(
  56. organization_id=self.organization.id, version=release1, date_added=self.now
  57. )
  58. self.release1.add_project(self.proj1)
  59. self.release2 = Release.objects.create(
  60. organization_id=self.organization.id, version=release2, date_added=self.now
  61. )
  62. self.release2.add_project(self.proj1)
  63. for r in range(0, 14400, 600): # Every 10 min for 4 hours
  64. self.store_event(
  65. data={
  66. "event_id": (str(r) * 32)[:32],
  67. "message": "message 1",
  68. "platform": "python",
  69. "fingerprint": [["group-1"], ["group-2"]][
  70. (r // 600) % 2
  71. ], # Switch every 10 mins
  72. "timestamp": iso_format(self.now + timedelta(seconds=r)),
  73. "tags": {
  74. "foo": "bar",
  75. "baz": "quux",
  76. # Switch every 2 hours
  77. "environment": [env1, None][(r // 7200) % 3],
  78. "sentry:user": f"id:user{r // 3300}",
  79. },
  80. "user": {
  81. # change every 55 min so some hours have 1 user, some have 2
  82. "id": f"user{r // 3300}",
  83. "email": f"user{r}@sentry.io",
  84. },
  85. "release": str(r // 3600) * 10, # 1 per hour,
  86. },
  87. project_id=self.proj1.id,
  88. )
  89. groups = Group.objects.filter(project=self.proj1).order_by("id")
  90. self.proj1group1 = groups[0]
  91. self.proj1group2 = groups[1]
  92. self.env1 = Environment.objects.get(name=env1)
  93. self.env2 = self.create_environment(name=env2) # No events
  94. self.defaultenv = Environment.objects.get(name=defaultenv)
  95. self.group1release1env1 = GroupRelease.objects.get(
  96. project_id=self.proj1.id,
  97. group_id=self.proj1group1.id,
  98. release_id=self.release1.id,
  99. environment=env1,
  100. )
  101. self.group1release2env1 = GroupRelease.objects.create(
  102. project_id=self.proj1.id,
  103. group_id=self.proj1group1.id,
  104. release_id=self.release2.id,
  105. environment=env1,
  106. )
  107. self.group2release1env1 = GroupRelease.objects.get(
  108. project_id=self.proj1.id,
  109. group_id=self.proj1group2.id,
  110. release_id=self.release1.id,
  111. environment=env1,
  112. )
  113. def test_range_groups(self):
  114. dts = [self.now + timedelta(hours=i) for i in range(4)]
  115. assert self.db.get_range(
  116. TSDBModel.group, [self.proj1group1.id], dts[0], dts[-1], rollup=3600
  117. ) == {
  118. self.proj1group1.id: [
  119. (timestamp(dts[0]), 3),
  120. (timestamp(dts[1]), 3),
  121. (timestamp(dts[2]), 3),
  122. (timestamp(dts[3]), 3),
  123. ]
  124. }
  125. # Multiple groups
  126. assert self.db.get_range(
  127. TSDBModel.group,
  128. [self.proj1group1.id, self.proj1group2.id],
  129. dts[0],
  130. dts[-1],
  131. rollup=3600,
  132. ) == {
  133. self.proj1group1.id: [
  134. (timestamp(dts[0]), 3),
  135. (timestamp(dts[1]), 3),
  136. (timestamp(dts[2]), 3),
  137. (timestamp(dts[3]), 3),
  138. ],
  139. self.proj1group2.id: [
  140. (timestamp(dts[0]), 3),
  141. (timestamp(dts[1]), 3),
  142. (timestamp(dts[2]), 3),
  143. (timestamp(dts[3]), 3),
  144. ],
  145. }
  146. assert self.db.get_range(TSDBModel.group, [], dts[0], dts[-1], rollup=3600) == {}
  147. def test_range_releases(self):
  148. dts = [self.now + timedelta(hours=i) for i in range(4)]
  149. assert self.db.get_range(
  150. TSDBModel.release, [self.release1.id], dts[0], dts[-1], rollup=3600
  151. ) == {
  152. self.release1.id: [
  153. (timestamp(dts[0]), 0),
  154. (timestamp(dts[1]), 6),
  155. (timestamp(dts[2]), 0),
  156. (timestamp(dts[3]), 0),
  157. ]
  158. }
  159. def test_range_project(self):
  160. dts = [self.now + timedelta(hours=i) for i in range(4)]
  161. assert self.db.get_range(
  162. TSDBModel.project, [self.proj1.id], dts[0], dts[-1], rollup=3600
  163. ) == {
  164. self.proj1.id: [
  165. (timestamp(dts[0]), 6),
  166. (timestamp(dts[1]), 6),
  167. (timestamp(dts[2]), 6),
  168. (timestamp(dts[3]), 6),
  169. ]
  170. }
  171. def test_range_environment_filter(self):
  172. dts = [self.now + timedelta(hours=i) for i in range(4)]
  173. assert self.db.get_range(
  174. TSDBModel.project,
  175. [self.proj1.id],
  176. dts[0],
  177. dts[-1],
  178. rollup=3600,
  179. environment_ids=[self.env1.id],
  180. ) == {
  181. self.proj1.id: [
  182. (timestamp(dts[0]), 6),
  183. (timestamp(dts[1]), 6),
  184. (timestamp(dts[2]), 0),
  185. (timestamp(dts[3]), 0),
  186. ]
  187. }
  188. # No events submitted for env2
  189. assert self.db.get_range(
  190. TSDBModel.project,
  191. [self.proj1.id],
  192. dts[0],
  193. dts[-1],
  194. rollup=3600,
  195. environment_ids=[self.env2.id],
  196. ) == {
  197. self.proj1.id: [
  198. (timestamp(dts[0]), 0),
  199. (timestamp(dts[1]), 0),
  200. (timestamp(dts[2]), 0),
  201. (timestamp(dts[3]), 0),
  202. ]
  203. }
  204. # Events submitted with no environment should match default environment
  205. assert self.db.get_range(
  206. TSDBModel.project,
  207. [self.proj1.id],
  208. dts[0],
  209. dts[-1],
  210. rollup=3600,
  211. environment_ids=[self.defaultenv.id],
  212. ) == {
  213. self.proj1.id: [
  214. (timestamp(dts[0]), 0),
  215. (timestamp(dts[1]), 0),
  216. (timestamp(dts[2]), 6),
  217. (timestamp(dts[3]), 6),
  218. ]
  219. }
  220. def test_range_rollups(self):
  221. # Daily
  222. daystart = self.now.replace(hour=0) # day buckets start on day boundaries
  223. dts = [daystart + timedelta(days=i) for i in range(2)]
  224. assert self.db.get_range(
  225. TSDBModel.project, [self.proj1.id], dts[0], dts[-1], rollup=86400
  226. ) == {self.proj1.id: [(timestamp(dts[0]), 24), (timestamp(dts[1]), 0)]}
  227. # Minutely
  228. dts = [self.now + timedelta(minutes=i) for i in range(120)]
  229. # Expect every 10th minute to have a 1, else 0
  230. expected = [(to_timestamp(d), 1 if i % 10 == 0 else 0) for i, d in enumerate(dts)]
  231. assert self.db.get_range(
  232. TSDBModel.project, [self.proj1.id], dts[0], dts[-1], rollup=60
  233. ) == {self.proj1.id: expected}
  234. def test_distinct_counts_series_users(self):
  235. dts = [self.now + timedelta(hours=i) for i in range(4)]
  236. assert self.db.get_distinct_counts_series(
  237. TSDBModel.users_affected_by_group, [self.proj1group1.id], dts[0], dts[-1], rollup=3600
  238. ) == {
  239. self.proj1group1.id: [
  240. (timestamp(dts[0]), 1),
  241. (timestamp(dts[1]), 1),
  242. (timestamp(dts[2]), 1),
  243. (timestamp(dts[3]), 2),
  244. ]
  245. }
  246. dts = [self.now + timedelta(hours=i) for i in range(4)]
  247. assert self.db.get_distinct_counts_series(
  248. TSDBModel.users_affected_by_project, [self.proj1.id], dts[0], dts[-1], rollup=3600
  249. ) == {
  250. self.proj1.id: [
  251. (timestamp(dts[0]), 1),
  252. (timestamp(dts[1]), 2),
  253. (timestamp(dts[2]), 2),
  254. (timestamp(dts[3]), 2),
  255. ]
  256. }
  257. assert (
  258. self.db.get_distinct_counts_series(
  259. TSDBModel.users_affected_by_group, [], dts[0], dts[-1], rollup=3600
  260. )
  261. == {}
  262. )
  263. def get_distinct_counts_totals_users(self):
  264. assert self.db.get_distinct_counts_totals(
  265. TSDBModel.users_affected_by_group,
  266. [self.proj1group1.id],
  267. self.now,
  268. self.now + timedelta(hours=4),
  269. rollup=3600,
  270. ) == {
  271. self.proj1group1.id: 2 # 2 unique users overall
  272. }
  273. assert self.db.get_distinct_counts_totals(
  274. TSDBModel.users_affected_by_group,
  275. [self.proj1group1.id],
  276. self.now,
  277. self.now,
  278. rollup=3600,
  279. ) == {
  280. self.proj1group1.id: 1 # Only 1 unique user in the first hour
  281. }
  282. assert self.db.get_distinct_counts_totals(
  283. TSDBModel.users_affected_by_project,
  284. [self.proj1.id],
  285. self.now,
  286. self.now + timedelta(hours=4),
  287. rollup=3600,
  288. ) == {self.proj1.id: 2}
  289. assert (
  290. self.db.get_distinct_counts_totals(
  291. TSDBModel.users_affected_by_group,
  292. [],
  293. self.now,
  294. self.now + timedelta(hours=4),
  295. rollup=3600,
  296. )
  297. == {}
  298. )
  299. def test_most_frequent(self):
  300. assert self.db.get_most_frequent(
  301. TSDBModel.frequent_issues_by_project,
  302. [self.proj1.id],
  303. self.now,
  304. self.now + timedelta(hours=4),
  305. rollup=3600,
  306. ) in [
  307. {self.proj1.id: [(self.proj1group1.id, 2.0), (self.proj1group2.id, 1.0)]},
  308. {self.proj1.id: [(self.proj1group2.id, 2.0), (self.proj1group1.id, 1.0)]},
  309. ] # Both issues equally frequent
  310. assert (
  311. self.db.get_most_frequent(
  312. TSDBModel.frequent_issues_by_project,
  313. [],
  314. self.now,
  315. self.now + timedelta(hours=4),
  316. rollup=3600,
  317. )
  318. == {}
  319. )
  320. def test_frequency_series(self):
  321. dts = [self.now + timedelta(hours=i) for i in range(4)]
  322. assert self.db.get_frequency_series(
  323. TSDBModel.frequent_releases_by_group,
  324. {
  325. self.proj1group1.id: (self.group1release1env1.id, self.group1release2env1.id),
  326. self.proj1group2.id: (self.group2release1env1.id,),
  327. },
  328. dts[0],
  329. dts[-1],
  330. rollup=3600,
  331. ) == {
  332. self.proj1group1.id: [
  333. (timestamp(dts[0]), {self.group1release1env1.id: 0, self.group1release2env1.id: 0}),
  334. (timestamp(dts[1]), {self.group1release1env1.id: 3, self.group1release2env1.id: 0}),
  335. (timestamp(dts[2]), {self.group1release1env1.id: 0, self.group1release2env1.id: 3}),
  336. (timestamp(dts[3]), {self.group1release1env1.id: 0, self.group1release2env1.id: 0}),
  337. ],
  338. self.proj1group2.id: [
  339. (timestamp(dts[0]), {self.group2release1env1.id: 0}),
  340. (timestamp(dts[1]), {self.group2release1env1.id: 3}),
  341. (timestamp(dts[2]), {self.group2release1env1.id: 0}),
  342. (timestamp(dts[3]), {self.group2release1env1.id: 0}),
  343. ],
  344. }
  345. assert (
  346. self.db.get_frequency_series(
  347. TSDBModel.frequent_releases_by_group, {}, dts[0], dts[-1], rollup=3600
  348. )
  349. == {}
  350. )
  351. def test_result_shape(self):
  352. """
  353. Tests that the results from the different TSDB methods have the
  354. expected format.
  355. """
  356. project_id = self.proj1.id
  357. dts = [self.now + timedelta(hours=i) for i in range(4)]
  358. results = self.db.get_most_frequent(
  359. TSDBModel.frequent_issues_by_project, [project_id], dts[0], dts[0]
  360. )
  361. assert has_shape(results, {1: [(1, 1.0)]})
  362. results = self.db.get_most_frequent_series(
  363. TSDBModel.frequent_issues_by_project, [project_id], dts[0], dts[0]
  364. )
  365. assert has_shape(results, {1: [(1, {1: 1.0})]})
  366. items = {
  367. # {project_id: (issue_id, issue_id, ...)}
  368. project_id: (self.proj1group1.id, self.proj1group2.id)
  369. }
  370. results = self.db.get_frequency_series(
  371. TSDBModel.frequent_issues_by_project, items, dts[0], dts[-1]
  372. )
  373. assert has_shape(results, {1: [(1, {1: 1})]})
  374. results = self.db.get_frequency_totals(
  375. TSDBModel.frequent_issues_by_project, items, dts[0], dts[-1]
  376. )
  377. assert has_shape(results, {1: {1: 1}})
  378. results = self.db.get_range(TSDBModel.project, [project_id], dts[0], dts[-1])
  379. assert has_shape(results, {1: [(1, 1)]})
  380. results = self.db.get_distinct_counts_series(
  381. TSDBModel.users_affected_by_project, [project_id], dts[0], dts[-1]
  382. )
  383. assert has_shape(results, {1: [(1, 1)]})
  384. results = self.db.get_distinct_counts_totals(
  385. TSDBModel.users_affected_by_project, [project_id], dts[0], dts[-1]
  386. )
  387. assert has_shape(results, {1: 1})
  388. results = self.db.get_distinct_counts_union(
  389. TSDBModel.users_affected_by_project, [project_id], dts[0], dts[-1]
  390. )
  391. assert has_shape(results, 1)
  392. def test_calculated_limit(self):
  393. with patch("sentry.tsdb.snuba.snuba") as snuba:
  394. # 24h test
  395. rollup = 3600
  396. end = self.now
  397. start = end + timedelta(days=-1, seconds=rollup)
  398. self.db.get_data(TSDBModel.group, [1, 2, 3, 4, 5], start, end, rollup=rollup)
  399. assert snuba.query.call_args[1]["limit"] == 120
  400. # 14 day test
  401. rollup = 86400
  402. start = end + timedelta(days=-14, seconds=rollup)
  403. self.db.get_data(TSDBModel.group, [1, 2, 3, 4, 5], start, end, rollup=rollup)
  404. assert snuba.query.call_args[1]["limit"] == 70
  405. # 1h test
  406. rollup = 3600
  407. end = self.now
  408. start = end + timedelta(hours=-1, seconds=rollup)
  409. self.db.get_data(TSDBModel.group, [1, 2, 3, 4, 5], start, end, rollup=rollup)
  410. assert snuba.query.call_args[1]["limit"] == 5
  411. class AddJitterToSeriesTest(TestCase):
  412. def setUp(self):
  413. self.db = SnubaTSDB()
  414. def run_test(self, end, interval, jitter, expected_start, expected_end):
  415. end = end.replace(tzinfo=pytz.UTC)
  416. start = end - interval
  417. rollup, rollup_series = self.db.get_optimal_rollup_series(start, end)
  418. series = self.db._add_jitter_to_series(rollup_series, start, rollup, jitter)
  419. assert to_datetime(series[0]) == expected_start.replace(tzinfo=pytz.UTC)
  420. assert to_datetime(series[-1]) == expected_end.replace(tzinfo=pytz.UTC)
  421. def test(self):
  422. self.run_test(
  423. end=datetime(2022, 5, 18, 10, 23, 4),
  424. interval=timedelta(hours=1),
  425. jitter=5,
  426. expected_start=datetime(2022, 5, 18, 9, 22, 55),
  427. expected_end=datetime(2022, 5, 18, 10, 22, 55),
  428. )
  429. self.run_test(
  430. end=datetime(2022, 5, 18, 10, 23, 8),
  431. interval=timedelta(hours=1),
  432. jitter=5,
  433. expected_start=datetime(2022, 5, 18, 9, 23, 5),
  434. expected_end=datetime(2022, 5, 18, 10, 23, 5),
  435. )
  436. # Jitter should be the same
  437. self.run_test(
  438. end=datetime(2022, 5, 18, 10, 23, 8),
  439. interval=timedelta(hours=1),
  440. jitter=55,
  441. expected_start=datetime(2022, 5, 18, 9, 23, 5),
  442. expected_end=datetime(2022, 5, 18, 10, 23, 5),
  443. )
  444. self.run_test(
  445. end=datetime(2022, 5, 18, 22, 33, 2),
  446. interval=timedelta(minutes=1),
  447. jitter=3,
  448. expected_start=datetime(2022, 5, 18, 22, 31, 53),
  449. expected_end=datetime(2022, 5, 18, 22, 32, 53),
  450. )
  451. def test_empty_series(self):
  452. assert self.db._add_jitter_to_series([], datetime(2022, 5, 18, 10, 23, 4), 60, 127) == []
  453. assert self.db._add_jitter_to_series([], datetime(2022, 5, 18, 10, 23, 4), 60, None) == []