test_tsdb_backend.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454
  1. from __future__ import absolute_import
  2. import calendar
  3. from datetime import datetime, timedelta
  4. import json
  5. import pytz
  6. import requests
  7. import six
  8. from django.conf import settings
  9. from sentry.models import GroupHash, GroupRelease, Release
  10. from sentry.tsdb.base import TSDBModel
  11. from sentry.tsdb.snuba import SnubaTSDB
  12. from sentry.testutils import TestCase
  13. from sentry.utils.dates import to_timestamp
  14. def timestamp(d):
  15. t = int(to_timestamp(d))
  16. return t - (t % 3600)
  17. def has_shape(data, shape, allow_empty=False):
  18. """
  19. Determine if a data object has the provided shape
  20. At any level, the object in `data` and in `shape` must have the same type.
  21. A dict is the same shape if all its keys and values have the same shape as the
  22. key/value in `shape`. The number of keys/values is not relevant.
  23. A list is the same shape if all its items have the same shape as the value
  24. in `shape`
  25. A tuple is the same shape if it has the same length as `shape` and all the
  26. values have the same shape as the corresponding value in `shape`
  27. Any other object simply has to have the same type.
  28. If `allow_empty` is set, lists and dicts in `data` will pass even if they are empty.
  29. """
  30. if not isinstance(data, type(shape)):
  31. return False
  32. if isinstance(data, dict):
  33. return (allow_empty or len(data) > 0) and\
  34. all(has_shape(k, shape.keys()[0]) for k in data.keys()) and\
  35. all(has_shape(v, shape.values()[0]) for v in data.values())
  36. elif isinstance(data, list):
  37. return (allow_empty or len(data) > 0) and\
  38. all(has_shape(v, shape[0]) for v in data)
  39. elif isinstance(data, tuple):
  40. return len(data) == len(shape) and all(
  41. has_shape(data[i], shape[i]) for i in range(len(data)))
  42. else:
  43. return True
  44. class SnubaTSDBTest(TestCase):
  45. def setUp(self):
  46. assert requests.post(settings.SENTRY_SNUBA + '/tests/drop').status_code == 200
  47. self.db = SnubaTSDB()
  48. self.now = datetime.utcnow().replace(
  49. hour=0,
  50. minute=0,
  51. second=0,
  52. microsecond=0,
  53. tzinfo=pytz.UTC
  54. )
  55. self.proj1 = self.create_project()
  56. self.proj1env1 = self.create_environment(project=self.proj1, name='test')
  57. self.proj1env2 = self.create_environment(project=self.proj1, name='dev')
  58. self.proj1defaultenv = self.create_environment(project=self.proj1, name='')
  59. self.proj1group1 = self.create_group(self.proj1)
  60. self.proj1group2 = self.create_group(self.proj1)
  61. hash1 = '1' * 32
  62. hash2 = '2' * 32
  63. GroupHash.objects.create(project=self.proj1, group=self.proj1group1, hash=hash1)
  64. GroupHash.objects.create(project=self.proj1, group=self.proj1group2, hash=hash2)
  65. self.release1 = Release.objects.create(
  66. organization_id=self.organization.id,
  67. version='1' * 10,
  68. date_added=self.now,
  69. )
  70. self.release1.add_project(self.proj1)
  71. self.release2 = Release.objects.create(
  72. organization_id=self.organization.id,
  73. version='2' * 10,
  74. date_added=self.now,
  75. )
  76. self.release2.add_project(self.proj1)
  77. self.group1release1 = GroupRelease.objects.create(
  78. project_id=self.proj1.id,
  79. group_id=self.proj1group1.id,
  80. release_id=self.release1.id
  81. )
  82. self.group1release2 = GroupRelease.objects.create(
  83. project_id=self.proj1.id,
  84. group_id=self.proj1group1.id,
  85. release_id=self.release2.id
  86. )
  87. self.group2release1 = GroupRelease.objects.create(
  88. project_id=self.proj1.id,
  89. group_id=self.proj1group2.id,
  90. release_id=self.release1.id
  91. )
  92. data = json.dumps([{
  93. 'event_id': (six.text_type(r) * 32)[:32],
  94. 'primary_hash': [hash1, hash2][(r // 600) % 2], # Switch every 10 mins
  95. 'group_id': [self.proj1group1.id, self.proj1group2.id][(r // 600) % 2],
  96. 'project_id': self.proj1.id,
  97. 'message': 'message 1',
  98. 'platform': 'python',
  99. 'datetime': (self.now + timedelta(seconds=r)).strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
  100. 'data': {
  101. 'received': calendar.timegm(self.now.timetuple()) + r,
  102. 'tags': {
  103. 'foo': 'bar',
  104. 'baz': 'quux',
  105. # Switch every 2 hours
  106. 'environment': [self.proj1env1.name, None][(r // 7200) % 2],
  107. 'sentry:user': u'id:user{}'.format(r // 3300),
  108. 'sentry:release': six.text_type(r // 3600) * 10, # 1 per hour
  109. },
  110. 'sentry.interfaces.User': {
  111. # change every 55 min so some hours have 1 user, some have 2
  112. 'id': u"user{}".format(r // 3300),
  113. 'email': u"user{}@sentry.io".format(r)
  114. }
  115. },
  116. } for r in range(0, 14400, 600)]) # Every 10 min for 4 hours
  117. assert requests.post(settings.SENTRY_SNUBA + '/tests/insert', data=data).status_code == 200
  118. # snuba trims query windows based on first_seen/last_seen, so these need to be correct-ish
  119. self.proj1group1.first_seen = self.now
  120. self.proj1group1.last_seen = self.now + timedelta(seconds=14400)
  121. self.proj1group1.save()
  122. self.proj1group2.first_seen = self.now
  123. self.proj1group2.last_seen = self.now + timedelta(seconds=14400)
  124. self.proj1group2.save()
  125. def test_range_groups(self):
  126. dts = [self.now + timedelta(hours=i) for i in range(4)]
  127. assert self.db.get_range(
  128. TSDBModel.group,
  129. [self.proj1group1.id],
  130. dts[0], dts[-1],
  131. rollup=3600
  132. ) == {
  133. self.proj1group1.id: [
  134. (timestamp(dts[0]), 3),
  135. (timestamp(dts[1]), 3),
  136. (timestamp(dts[2]), 3),
  137. (timestamp(dts[3]), 3),
  138. ],
  139. }
  140. # Multiple groups
  141. assert self.db.get_range(
  142. TSDBModel.group,
  143. [self.proj1group1.id, self.proj1group2.id],
  144. dts[0], dts[-1],
  145. rollup=3600
  146. ) == {
  147. self.proj1group1.id: [
  148. (timestamp(dts[0]), 3),
  149. (timestamp(dts[1]), 3),
  150. (timestamp(dts[2]), 3),
  151. (timestamp(dts[3]), 3),
  152. ],
  153. self.proj1group2.id: [
  154. (timestamp(dts[0]), 3),
  155. (timestamp(dts[1]), 3),
  156. (timestamp(dts[2]), 3),
  157. (timestamp(dts[3]), 3),
  158. ],
  159. }
  160. def test_range_releases(self):
  161. dts = [self.now + timedelta(hours=i) for i in range(4)]
  162. assert self.db.get_range(
  163. TSDBModel.release,
  164. [self.release1.id],
  165. dts[0], dts[-1],
  166. rollup=3600
  167. ) == {
  168. self.release1.id: [
  169. (timestamp(dts[0]), 0),
  170. (timestamp(dts[1]), 6),
  171. (timestamp(dts[2]), 0),
  172. (timestamp(dts[3]), 0),
  173. ]
  174. }
  175. def test_range_project(self):
  176. dts = [self.now + timedelta(hours=i) for i in range(4)]
  177. assert self.db.get_range(
  178. TSDBModel.project,
  179. [self.proj1.id],
  180. dts[0], dts[-1],
  181. rollup=3600
  182. ) == {
  183. self.proj1.id: [
  184. (timestamp(dts[0]), 6),
  185. (timestamp(dts[1]), 6),
  186. (timestamp(dts[2]), 6),
  187. (timestamp(dts[3]), 6),
  188. ]
  189. }
  190. def test_range_environment_filter(self):
  191. dts = [self.now + timedelta(hours=i) for i in range(4)]
  192. assert self.db.get_range(
  193. TSDBModel.project,
  194. [self.proj1.id],
  195. dts[0], dts[-1],
  196. rollup=3600,
  197. environment_id=self.proj1env1.id
  198. ) == {
  199. self.proj1.id: [
  200. (timestamp(dts[0]), 6),
  201. (timestamp(dts[1]), 6),
  202. (timestamp(dts[2]), 0),
  203. (timestamp(dts[3]), 0),
  204. ]
  205. }
  206. # No events submitted for env2
  207. assert self.db.get_range(
  208. TSDBModel.project,
  209. [self.proj1.id],
  210. dts[0], dts[-1],
  211. rollup=3600,
  212. environment_id=self.proj1env2.id
  213. ) == {
  214. self.proj1.id: [
  215. (timestamp(dts[0]), 0),
  216. (timestamp(dts[1]), 0),
  217. (timestamp(dts[2]), 0),
  218. (timestamp(dts[3]), 0),
  219. ]
  220. }
  221. # Events submitted with no environment should match default environment
  222. assert self.db.get_range(
  223. TSDBModel.project,
  224. [self.proj1.id],
  225. dts[0], dts[-1],
  226. rollup=3600,
  227. environment_id=self.proj1defaultenv.id
  228. ) == {
  229. self.proj1.id: [
  230. (timestamp(dts[0]), 0),
  231. (timestamp(dts[1]), 0),
  232. (timestamp(dts[2]), 6),
  233. (timestamp(dts[3]), 6),
  234. ]
  235. }
  236. def test_range_rollups(self):
  237. # Daily
  238. daystart = self.now.replace(hour=0) # day buckets start on day boundaries
  239. dts = [daystart + timedelta(days=i) for i in range(2)]
  240. assert self.db.get_range(
  241. TSDBModel.project,
  242. [self.proj1.id],
  243. dts[0], dts[-1],
  244. rollup=86400
  245. ) == {
  246. self.proj1.id: [
  247. (timestamp(dts[0]), 24),
  248. (timestamp(dts[1]), 0)
  249. ]
  250. }
  251. # Minutely
  252. dts = [self.now + timedelta(minutes=i) for i in range(120)]
  253. # Expect every 10th minute to have a 1, else 0
  254. expected = [(to_timestamp(d), int(i % 10 == 0)) for i, d in enumerate(dts)]
  255. assert self.db.get_range(
  256. TSDBModel.project,
  257. [self.proj1.id],
  258. dts[0], dts[-1],
  259. rollup=60
  260. ) == {
  261. self.proj1.id: expected
  262. }
  263. def test_distinct_counts_series_users(self):
  264. dts = [self.now + timedelta(hours=i) for i in range(4)]
  265. assert self.db.get_distinct_counts_series(
  266. TSDBModel.users_affected_by_group,
  267. [self.proj1group1.id],
  268. dts[0], dts[-1],
  269. rollup=3600
  270. ) == {
  271. self.proj1group1.id: [
  272. (timestamp(dts[0]), 1),
  273. (timestamp(dts[1]), 1),
  274. (timestamp(dts[2]), 1),
  275. (timestamp(dts[3]), 2),
  276. ],
  277. }
  278. dts = [self.now + timedelta(hours=i) for i in range(4)]
  279. assert self.db.get_distinct_counts_series(
  280. TSDBModel.users_affected_by_project,
  281. [self.proj1.id],
  282. dts[0], dts[-1],
  283. rollup=3600
  284. ) == {
  285. self.proj1.id: [
  286. (timestamp(dts[0]), 1),
  287. (timestamp(dts[1]), 2),
  288. (timestamp(dts[2]), 2),
  289. (timestamp(dts[3]), 2),
  290. ],
  291. }
  292. def get_distinct_counts_totals_users(self):
  293. assert self.db.get_distinct_counts_totals(
  294. TSDBModel.users_affected_by_group,
  295. [self.proj1group1.id],
  296. self.now,
  297. self.now + timedelta(hours=4),
  298. rollup=3600
  299. ) == {
  300. self.proj1group1.id: 2, # 2 unique users overall
  301. }
  302. assert self.db.get_distinct_counts_totals(
  303. TSDBModel.users_affected_by_group,
  304. [self.proj1group1.id],
  305. self.now,
  306. self.now,
  307. rollup=3600
  308. ) == {
  309. self.proj1group1.id: 1, # Only 1 unique user in the first hour
  310. }
  311. assert self.db.get_distinct_counts_totals(
  312. TSDBModel.users_affected_by_project,
  313. [self.proj1.id],
  314. self.now,
  315. self.now + timedelta(hours=4),
  316. rollup=3600
  317. ) == {
  318. self.proj1.id: 2,
  319. }
  320. def test_most_frequent(self):
  321. assert self.db.get_most_frequent(
  322. TSDBModel.frequent_issues_by_project,
  323. [self.proj1.id],
  324. self.now,
  325. self.now + timedelta(hours=4),
  326. rollup=3600,
  327. ) == {
  328. self.proj1.id: [
  329. (self.proj1group1.id, 2.0),
  330. (self.proj1group2.id, 1.0),
  331. ],
  332. }
  333. def test_frequency_series(self):
  334. dts = [self.now + timedelta(hours=i) for i in range(4)]
  335. assert self.db.get_frequency_series(
  336. TSDBModel.frequent_releases_by_group,
  337. {
  338. self.proj1group1.id: (self.group1release1.id, self.group1release2.id, ),
  339. self.proj1group2.id: (self.group2release1.id, )
  340. },
  341. dts[0], dts[-1],
  342. rollup=3600,
  343. ) == {
  344. self.proj1group1.id: [
  345. (timestamp(dts[0]), {
  346. self.group1release1.id: 0,
  347. self.group1release2.id: 0,
  348. }),
  349. (timestamp(dts[1]), {
  350. self.group1release1.id: 3,
  351. self.group1release2.id: 0,
  352. }),
  353. (timestamp(dts[2]), {
  354. self.group1release1.id: 0,
  355. self.group1release2.id: 3,
  356. }),
  357. (timestamp(dts[3]), {
  358. self.group1release1.id: 0,
  359. self.group1release2.id: 0,
  360. }),
  361. ],
  362. self.proj1group2.id: [
  363. (timestamp(dts[0]), {
  364. self.group2release1.id: 0,
  365. }),
  366. (timestamp(dts[1]), {
  367. self.group2release1.id: 3,
  368. }),
  369. (timestamp(dts[2]), {
  370. self.group2release1.id: 0,
  371. }),
  372. (timestamp(dts[3]), {
  373. self.group2release1.id: 0,
  374. }),
  375. ],
  376. }
  377. def test_result_shape(self):
  378. """
  379. Tests that the results from the different TSDB methods have the
  380. expected format.
  381. """
  382. project_id = self.proj1.id
  383. dts = [self.now + timedelta(hours=i) for i in range(4)]
  384. results = self.db.get_most_frequent(TSDBModel.frequent_issues_by_project,
  385. [project_id], dts[0], dts[0])
  386. assert has_shape(results, {1: [(1, 1.0)]})
  387. results = self.db.get_most_frequent_series(TSDBModel.frequent_issues_by_project,
  388. [project_id], dts[0], dts[0])
  389. assert has_shape(results, {1: [(1, {1: 1.0})]})
  390. items = {
  391. # {project_id: (issue_id, issue_id, ...)}
  392. project_id: (self.proj1group1.id, self.proj1group2.id)
  393. }
  394. results = self.db.get_frequency_series(TSDBModel.frequent_issues_by_project,
  395. items, dts[0], dts[-1])
  396. assert has_shape(results, {1: [(1, {1: 1})]})
  397. results = self.db.get_frequency_totals(TSDBModel.frequent_issues_by_project,
  398. items, dts[0], dts[-1])
  399. assert has_shape(results, {1: {1: 1}})
  400. results = self.db.get_range(TSDBModel.project, [project_id], dts[0], dts[-1])
  401. assert has_shape(results, {1: [(1, 1)]})
  402. results = self.db.get_distinct_counts_series(TSDBModel.users_affected_by_project,
  403. [project_id], dts[0], dts[-1])
  404. assert has_shape(results, {1: [(1, 1)]})
  405. results = self.db.get_distinct_counts_totals(TSDBModel.users_affected_by_project,
  406. [project_id], dts[0], dts[-1])
  407. assert has_shape(results, {1: 1})
  408. results = self.db.get_distinct_counts_union(TSDBModel.users_affected_by_project,
  409. [project_id], dts[0], dts[-1])
  410. assert has_shape(results, 1)