test_tsdb_backend.py

from __future__ import absolute_import

from datetime import datetime, timedelta

import pytz
import six

from sentry.utils.compat.mock import patch

from sentry.models import Environment, Group, GroupRelease, Release
from sentry.tsdb.base import TSDBModel
from sentry.tsdb.snuba import SnubaTSDB
from sentry.testutils import TestCase, SnubaTestCase
from sentry.testutils.helpers.datetime import iso_format
from sentry.utils.dates import to_timestamp
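

# Expected rollup bucket keys in the assertions below are unix timestamps
# truncated down to the start of their hour; timestamp() builds them.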
def timestamp(d):
    t = int(to_timestamp(d))
    return t - (t % 3600)


def has_shape(data, shape, allow_empty=False):
    """
    Determine if a data object has the provided shape

    At any level, the object in `data` and in `shape` must have the same type.
    A dict is the same shape if all its keys and values have the same shape as the
    key/value in `shape`. The number of keys/values is not relevant.
    A list is the same shape if all its items have the same shape as the value
    in `shape`.
    A tuple is the same shape if it has the same length as `shape` and all the
    values have the same shape as the corresponding value in `shape`.
    Any other object simply has to have the same type.
    If `allow_empty` is set, lists and dicts in `data` will pass even if they are empty.
    """
    if not isinstance(data, type(shape)):
        return False

    if isinstance(data, dict):
        return (
            (allow_empty or len(data) > 0)
            and all(has_shape(k, list(shape.keys())[0]) for k in data.keys())
            and all(has_shape(v, list(shape.values())[0]) for v in data.values())
        )
    elif isinstance(data, list):
        return (allow_empty or len(data) > 0) and all(has_shape(v, shape[0]) for v in data)
    elif isinstance(data, tuple):
        return len(data) == len(shape) and all(
            has_shape(data[i], shape[i]) for i in range(len(data))
        )
    else:
        return True
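
# Illustration only (not part of the test suite): shape values act as type
# templates rather than exact values, e.g.
#   has_shape({"a": [1, 2]}, {"x": [0]})  -> True   (dict of str -> list of int)
#   has_shape({"a": []}, {"x": [0]})      -> False  (empty nested list fails the shape check)
#   has_shape((1, "x"), (0, ""))          -> True   (tuples are matched element-wise)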


class SnubaTSDBTest(TestCase, SnubaTestCase):
    def setUp(self):
        super(SnubaTSDBTest, self).setUp()

        self.db = SnubaTSDB()
        self.now = (datetime.utcnow() - timedelta(hours=4)).replace(
            hour=0, minute=0, second=0, microsecond=0, tzinfo=pytz.UTC
        )

        self.proj1 = self.create_project()
        env1 = "test"
        env2 = "dev"
        defaultenv = ""

        release1 = "1" * 10
        release2 = "2" * 10

        self.release1 = Release.objects.create(
            organization_id=self.organization.id, version=release1, date_added=self.now
        )
        self.release1.add_project(self.proj1)
        self.release2 = Release.objects.create(
            organization_id=self.organization.id, version=release2, date_added=self.now
        )
        self.release2.add_project(self.proj1)
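
        # Data written below (all derived from the loop that follows): one event
        # every 10 minutes for 4 hours, alternating between two groups on each
        # event, tagged with env1 for the first 2 hours and no environment after,
        # with a new user roughly every 55 minutes and a new release every hour.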
        for r in range(0, 14400, 600):  # Every 10 min for 4 hours
            self.store_event(
                data={
                    "event_id": (six.text_type(r) * 32)[:32],
                    "message": "message 1",
                    "platform": "python",
                    "fingerprint": [["group-1"], ["group-2"]][
                        (r // 600) % 2
                    ],  # Switch every 10 mins
                    "timestamp": iso_format(self.now + timedelta(seconds=r)),
                    "tags": {
                        "foo": "bar",
                        "baz": "quux",
                        # Switch every 2 hours
                        "environment": [env1, None][(r // 7200) % 3],
                        "sentry:user": u"id:user{}".format(r // 3300),
                    },
                    "user": {
                        # change every 55 min so some hours have 1 user, some have 2
                        "id": u"user{}".format(r // 3300),
                        "email": u"user{}@sentry.io".format(r),
                    },
                    "release": six.text_type(r // 3600) * 10,  # 1 per hour
                },
                project_id=self.proj1.id,
            )

        groups = Group.objects.filter(project=self.proj1).order_by("id")
        self.proj1group1 = groups[0]
        self.proj1group2 = groups[1]

        self.env1 = Environment.objects.get(name=env1)
        self.env2 = self.create_environment(name=env2)  # No events
        self.defaultenv = Environment.objects.get(name=defaultenv)

        self.group1release1env1 = GroupRelease.objects.get(
            project_id=self.proj1.id,
            group_id=self.proj1group1.id,
            release_id=self.release1.id,
            environment=env1,
        )

        self.group1release2env1 = GroupRelease.objects.create(
            project_id=self.proj1.id,
            group_id=self.proj1group1.id,
            release_id=self.release2.id,
            environment=env1,
        )

        self.group2release1env1 = GroupRelease.objects.get(
            project_id=self.proj1.id,
            group_id=self.proj1group2.id,
            release_id=self.release1.id,
            environment=env1,
        )

    def test_range_groups(self):
        dts = [self.now + timedelta(hours=i) for i in range(4)]
        assert self.db.get_range(
            TSDBModel.group, [self.proj1group1.id], dts[0], dts[-1], rollup=3600
        ) == {
            self.proj1group1.id: [
                (timestamp(dts[0]), 3),
                (timestamp(dts[1]), 3),
                (timestamp(dts[2]), 3),
                (timestamp(dts[3]), 3),
            ]
        }

        # Multiple groups
        assert self.db.get_range(
            TSDBModel.group,
            [self.proj1group1.id, self.proj1group2.id],
            dts[0],
            dts[-1],
            rollup=3600,
        ) == {
            self.proj1group1.id: [
                (timestamp(dts[0]), 3),
                (timestamp(dts[1]), 3),
                (timestamp(dts[2]), 3),
                (timestamp(dts[3]), 3),
            ],
            self.proj1group2.id: [
                (timestamp(dts[0]), 3),
                (timestamp(dts[1]), 3),
                (timestamp(dts[2]), 3),
                (timestamp(dts[3]), 3),
            ],
        }

        assert self.db.get_range(TSDBModel.group, [], dts[0], dts[-1], rollup=3600) == {}

    def test_range_releases(self):
        dts = [self.now + timedelta(hours=i) for i in range(4)]
        assert self.db.get_range(
            TSDBModel.release, [self.release1.id], dts[0], dts[-1], rollup=3600
        ) == {
            self.release1.id: [
                (timestamp(dts[0]), 0),
                (timestamp(dts[1]), 6),
                (timestamp(dts[2]), 0),
                (timestamp(dts[3]), 0),
            ]
        }

    def test_range_project(self):
        dts = [self.now + timedelta(hours=i) for i in range(4)]
        assert self.db.get_range(
            TSDBModel.project, [self.proj1.id], dts[0], dts[-1], rollup=3600
        ) == {
            self.proj1.id: [
                (timestamp(dts[0]), 6),
                (timestamp(dts[1]), 6),
                (timestamp(dts[2]), 6),
                (timestamp(dts[3]), 6),
            ]
        }

    def test_range_environment_filter(self):
        dts = [self.now + timedelta(hours=i) for i in range(4)]
        assert self.db.get_range(
            TSDBModel.project,
            [self.proj1.id],
            dts[0],
            dts[-1],
            rollup=3600,
            environment_ids=[self.env1.id],
        ) == {
            self.proj1.id: [
                (timestamp(dts[0]), 6),
                (timestamp(dts[1]), 6),
                (timestamp(dts[2]), 0),
                (timestamp(dts[3]), 0),
            ]
        }

        # No events submitted for env2
        assert self.db.get_range(
            TSDBModel.project,
            [self.proj1.id],
            dts[0],
            dts[-1],
            rollup=3600,
            environment_ids=[self.env2.id],
        ) == {
            self.proj1.id: [
                (timestamp(dts[0]), 0),
                (timestamp(dts[1]), 0),
                (timestamp(dts[2]), 0),
                (timestamp(dts[3]), 0),
            ]
        }

        # Events submitted with no environment should match default environment
        assert self.db.get_range(
            TSDBModel.project,
            [self.proj1.id],
            dts[0],
            dts[-1],
            rollup=3600,
            environment_ids=[self.defaultenv.id],
        ) == {
            self.proj1.id: [
                (timestamp(dts[0]), 0),
                (timestamp(dts[1]), 0),
                (timestamp(dts[2]), 6),
                (timestamp(dts[3]), 6),
            ]
        }

    def test_range_rollups(self):
        # Daily
        daystart = self.now.replace(hour=0)  # day buckets start on day boundaries
        dts = [daystart + timedelta(days=i) for i in range(2)]
        assert self.db.get_range(
            TSDBModel.project, [self.proj1.id], dts[0], dts[-1], rollup=86400
        ) == {self.proj1.id: [(timestamp(dts[0]), 24), (timestamp(dts[1]), 0)]}

        # Minutely
        dts = [self.now + timedelta(minutes=i) for i in range(120)]
        # Expect every 10th minute to have a 1, else 0
        expected = [(to_timestamp(d), 1 if i % 10 == 0 else 0) for i, d in enumerate(dts)]
        assert self.db.get_range(
            TSDBModel.project, [self.proj1.id], dts[0], dts[-1], rollup=60
        ) == {self.proj1.id: expected}

    def test_distinct_counts_series_users(self):
        dts = [self.now + timedelta(hours=i) for i in range(4)]
        assert self.db.get_distinct_counts_series(
            TSDBModel.users_affected_by_group, [self.proj1group1.id], dts[0], dts[-1], rollup=3600
        ) == {
            self.proj1group1.id: [
                (timestamp(dts[0]), 1),
                (timestamp(dts[1]), 1),
                (timestamp(dts[2]), 1),
                (timestamp(dts[3]), 2),
            ]
        }

        dts = [self.now + timedelta(hours=i) for i in range(4)]
        assert self.db.get_distinct_counts_series(
            TSDBModel.users_affected_by_project, [self.proj1.id], dts[0], dts[-1], rollup=3600
        ) == {
            self.proj1.id: [
                (timestamp(dts[0]), 1),
                (timestamp(dts[1]), 2),
                (timestamp(dts[2]), 2),
                (timestamp(dts[3]), 2),
            ]
        }

        assert (
            self.db.get_distinct_counts_series(
                TSDBModel.users_affected_by_group, [], dts[0], dts[-1], rollup=3600
            )
            == {}
        )

    def get_distinct_counts_totals_users(self):
        assert self.db.get_distinct_counts_totals(
            TSDBModel.users_affected_by_group,
            [self.proj1group1.id],
            self.now,
            self.now + timedelta(hours=4),
            rollup=3600,
        ) == {
            self.proj1group1.id: 2  # 2 unique users overall
        }

        assert self.db.get_distinct_counts_totals(
            TSDBModel.users_affected_by_group,
            [self.proj1group1.id],
            self.now,
            self.now,
            rollup=3600,
        ) == {
            self.proj1group1.id: 1  # Only 1 unique user in the first hour
        }

        assert self.db.get_distinct_counts_totals(
            TSDBModel.users_affected_by_project,
            [self.proj1.id],
            self.now,
            self.now + timedelta(hours=4),
            rollup=3600,
        ) == {self.proj1.id: 2}

        assert (
            self.db.get_distinct_counts_totals(
                TSDBModel.users_affected_by_group,
                [],
                self.now,
                self.now + timedelta(hours=4),
                rollup=3600,
            )
            == {}
        )

    def test_most_frequent(self):
        assert self.db.get_most_frequent(
            TSDBModel.frequent_issues_by_project,
            [self.proj1.id],
            self.now,
            self.now + timedelta(hours=4),
            rollup=3600,
        ) in [
            {self.proj1.id: [(self.proj1group1.id, 2.0), (self.proj1group2.id, 1.0)]},
            {self.proj1.id: [(self.proj1group2.id, 2.0), (self.proj1group1.id, 1.0)]},
        ]  # Both issues equally frequent

        assert (
            self.db.get_most_frequent(
                TSDBModel.frequent_issues_by_project,
                [],
                self.now,
                self.now + timedelta(hours=4),
                rollup=3600,
            )
            == {}
        )

    def test_frequency_series(self):
        dts = [self.now + timedelta(hours=i) for i in range(4)]
        assert self.db.get_frequency_series(
            TSDBModel.frequent_releases_by_group,
            {
                self.proj1group1.id: (self.group1release1env1.id, self.group1release2env1.id),
                self.proj1group2.id: (self.group2release1env1.id,),
            },
            dts[0],
            dts[-1],
            rollup=3600,
        ) == {
            self.proj1group1.id: [
                (timestamp(dts[0]), {self.group1release1env1.id: 0, self.group1release2env1.id: 0}),
                (timestamp(dts[1]), {self.group1release1env1.id: 3, self.group1release2env1.id: 0}),
                (timestamp(dts[2]), {self.group1release1env1.id: 0, self.group1release2env1.id: 3}),
                (timestamp(dts[3]), {self.group1release1env1.id: 0, self.group1release2env1.id: 0}),
            ],
            self.proj1group2.id: [
                (timestamp(dts[0]), {self.group2release1env1.id: 0}),
                (timestamp(dts[1]), {self.group2release1env1.id: 3}),
                (timestamp(dts[2]), {self.group2release1env1.id: 0}),
                (timestamp(dts[3]), {self.group2release1env1.id: 0}),
            ],
        }

        assert (
            self.db.get_frequency_series(
                TSDBModel.frequent_releases_by_group, {}, dts[0], dts[-1], rollup=3600
            )
            == {}
        )

    def test_result_shape(self):
        """
        Tests that the results from the different TSDB methods have the
        expected format.
        """
        project_id = self.proj1.id
        dts = [self.now + timedelta(hours=i) for i in range(4)]

        results = self.db.get_most_frequent(
            TSDBModel.frequent_issues_by_project, [project_id], dts[0], dts[0]
        )
        assert has_shape(results, {1: [(1, 1.0)]})

        results = self.db.get_most_frequent_series(
            TSDBModel.frequent_issues_by_project, [project_id], dts[0], dts[0]
        )
        assert has_shape(results, {1: [(1, {1: 1.0})]})

        items = {
            # {project_id: (issue_id, issue_id, ...)}
            project_id: (self.proj1group1.id, self.proj1group2.id)
        }
        results = self.db.get_frequency_series(
            TSDBModel.frequent_issues_by_project, items, dts[0], dts[-1]
        )
        assert has_shape(results, {1: [(1, {1: 1})]})

        results = self.db.get_frequency_totals(
            TSDBModel.frequent_issues_by_project, items, dts[0], dts[-1]
        )
        assert has_shape(results, {1: {1: 1}})

        results = self.db.get_range(TSDBModel.project, [project_id], dts[0], dts[-1])
        assert has_shape(results, {1: [(1, 1)]})

        results = self.db.get_distinct_counts_series(
            TSDBModel.users_affected_by_project, [project_id], dts[0], dts[-1]
        )
        assert has_shape(results, {1: [(1, 1)]})

        results = self.db.get_distinct_counts_totals(
            TSDBModel.users_affected_by_project, [project_id], dts[0], dts[-1]
        )
        assert has_shape(results, {1: 1})

        results = self.db.get_distinct_counts_union(
            TSDBModel.users_affected_by_project, [project_id], dts[0], dts[-1]
        )
        assert has_shape(results, 1)

    def test_calculated_limit(self):
        with patch("sentry.tsdb.snuba.snuba") as snuba:
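            # The expected limits below appear to be (number of rollup buckets
            # between start and end, endpoints inclusive) * (number of group ids):
            # 24 * 5 = 120, 14 * 5 = 70, and 1 * 5 = 5.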
            # 24h test
            rollup = 3600
            end = self.now
            start = end + timedelta(days=-1, seconds=rollup)
            self.db.get_data(TSDBModel.group, [1, 2, 3, 4, 5], start, end, rollup=rollup)
            assert snuba.query.call_args[1]["limit"] == 120

            # 14 day test
            rollup = 86400
            start = end + timedelta(days=-14, seconds=rollup)
            self.db.get_data(TSDBModel.group, [1, 2, 3, 4, 5], start, end, rollup=rollup)
            assert snuba.query.call_args[1]["limit"] == 70

            # 1h test
            rollup = 3600
            end = self.now
            start = end + timedelta(hours=-1, seconds=rollup)
            self.db.get_data(TSDBModel.group, [1, 2, 3, 4, 5], start, end, rollup=rollup)
            assert snuba.query.call_args[1]["limit"] == 5