from __future__ import absolute_import

import functools
import hashlib
import itertools
import logging
import uuid
from collections import OrderedDict
from datetime import datetime, timedelta

import pytz
from mock import patch

from sentry import eventstream
from sentry.app import tsdb
from sentry.models import Environment, Event, Group, GroupHash, GroupRelease, Release, UserReport
from sentry.similarity import features, _make_index_backend
from sentry.tasks.unmerge import (
    get_caches,
    get_event_user_from_interface,
    get_fingerprint,
    get_group_backfill_attributes,
    get_group_creation_attributes,
    unmerge,
)
from sentry.testutils import SnubaTestCase, TestCase
from sentry.utils.dates import to_timestamp
from sentry.utils import redis
from sentry.testutils.helpers.datetime import before_now, iso_format
from sentry.tasks.merge import merge_groups
from sentry.tagstore.snuba.backend import SnubaTagStorage
from six.moves import xrange

# Use the default redis client as a cluster client in the similarity index
index = _make_index_backend(redis.clusters.get("default").get_local_client(0))


def test_get_fingerprint():
    assert (
        get_fingerprint(Event(data={"logentry": {"message": "Hello world"}}))
        == hashlib.md5("Hello world").hexdigest()
    )

    assert (
        get_fingerprint(
            Event(data={"fingerprint": ["Not hello world"], "logentry": {"message": "Hello world"}})
        )
        == hashlib.md5("Not hello world").hexdigest()
    )


@patch("sentry.similarity.features.index", new=index)
class UnmergeTestCase(TestCase, SnubaTestCase):
    def test_get_group_creation_attributes(self):
        now = datetime(2017, 5, 3, 6, 6, 6, tzinfo=pytz.utc)
        events = [
            Event(
                platform="javascript",
                message="Hello from JavaScript",
                datetime=now,
                data={
                    "type": "default",
                    "metadata": {},
                    "tags": [["level", "info"], ["logger", "javascript"]],
                },
            ),
            Event(
                platform="python",
                message="Hello from Python",
                datetime=now - timedelta(hours=1),
                data={
                    "type": "default",
                    "metadata": {},
                    "tags": [["level", "error"], ["logger", "python"]],
                },
            ),
            Event(
                platform="java",
                message="Hello from Java",
                datetime=now - timedelta(hours=2),
                data={
                    "type": "default",
                    "metadata": {},
                    "tags": [["level", "debug"], ["logger", "java"]],
                },
            ),
        ]

        assert get_group_creation_attributes(get_caches(), events) == {
            "active_at": now - timedelta(hours=2),
            "first_seen": now - timedelta(hours=2),
            "last_seen": now,
            "platform": "java",
            "message": "Hello from JavaScript",
            "level": logging.INFO,
            "score": Group.calculate_score(3, now),
            "logger": "java",
            "times_seen": 3,
            "first_release": None,
            "culprit": "",
            "data": {"type": "default", "last_received": to_timestamp(now), "metadata": {}},
        }

    def test_get_group_backfill_attributes(self):
        now = datetime(2017, 5, 3, 6, 6, 6, tzinfo=pytz.utc)

        assert get_group_backfill_attributes(
            get_caches(),
            Group(
                active_at=now,
                first_seen=now,
                last_seen=now,
                platform="javascript",
                message="Hello from JavaScript",
                level=logging.INFO,
                score=Group.calculate_score(3, now),
                logger="javascript",
                times_seen=1,
                first_release=None,
                culprit="",
                data={"type": "default", "last_received": to_timestamp(now), "metadata": {}},
            ),
            [
                Event(
                    platform="python",
                    message="Hello from Python",
                    datetime=now - timedelta(hours=1),
                    data={
                        "type": "default",
                        "metadata": {},
                        "tags": [["level", "error"], ["logger", "python"]],
                    },
                ),
                Event(
                    platform="java",
                    message="Hello from Java",
                    datetime=now - timedelta(hours=2),
                    data={
                        "type": "default",
                        "metadata": {},
                        "tags": [["level", "debug"], ["logger", "java"]],
                    },
                ),
            ],
        ) == {
            "active_at": now - timedelta(hours=2),
            "first_seen": now - timedelta(hours=2),
            "platform": "java",
            "score": Group.calculate_score(3, now),
            "logger": "java",
            "times_seen": 3,
            "first_release": None,
        }

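    # End-to-end exercise of the unmerge task: events for three fingerprints are
    # stored, the first group is merged into the second, and the first
    # fingerprint is then unmerged into the third group. Group attributes,
    # hashes, user reports, releases, tags, tsdb time series and similarity
    # features are verified afterwards.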
    def test_unmerge(self):
        tagstore = SnubaTagStorage()  # Snuba is not the default tag storage for tests yet

        now = before_now(seconds=20).replace(microsecond=0, tzinfo=pytz.utc)

        def time_from_now(offset=0):
            return now + timedelta(seconds=offset)

        project = self.create_project()

        sequence = itertools.count(0)
        tag_values = itertools.cycle(["red", "green", "blue"])
        user_values = itertools.cycle([{"id": 1}, {"id": 2}])

        def create_message_event(template, parameters, environment, release, fingerprint="group1"):
            i = next(sequence)

            event_id = uuid.UUID(fields=(i, 0x0, 0x1000, 0x80, 0x80, 0x808080808080)).hex

            tags = [["color", next(tag_values)]]

            if release:
                tags.append(["sentry:release", release])

            event = self.store_event(
                data={
                    "event_id": event_id,
                    "message": template % parameters,
                    "type": "default",
                    "user": next(user_values),
                    "tags": tags,
                    "fingerprint": [fingerprint],
                    "timestamp": iso_format(now + timedelta(seconds=i)),
                    "environment": environment,
                    "release": release,
                },
                project_id=project.id,
            )

            UserReport.objects.create(
                project_id=project.id,
                group_id=event.group.id,
                event_id=event_id,
                name="Log Hat",
                email="ceo@corptron.com",
                comments="Quack",
            )

            features.record([event])

            return event
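
        # Store three batches of events: ten with the default "group1"
        # fingerprint, six with "group2", and a single "group3" event, keyed
        # by their computed fingerprint hashes.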
        events = OrderedDict()

        for event in (
            create_message_event(
                "This is message #%s.", i, environment="production", release="version"
            )
            for i in xrange(10)
        ):
            events.setdefault(get_fingerprint(event), []).append(event)

        for event in (
            create_message_event(
                "This is message #%s!",
                i,
                environment="production",
                release="version2",
                fingerprint="group2",
            )
            for i in xrange(10, 16)
        ):
            events.setdefault(get_fingerprint(event), []).append(event)

        event = create_message_event(
            "This is message #%s!",
            17,
            environment="staging",
            release="version3",
            fingerprint="group3",
        )
        events.setdefault(get_fingerprint(event), []).append(event)

        merge_source, source, destination = list(Group.objects.all())

        assert len(events) == 3
        assert sum(map(len, events.values())) == 17

        production_environment = Environment.objects.get(
            organization_id=project.organization_id, name="production"
        )
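
        # Merge the first group into the second; "source" now owns the events
        # from both the "group1" and "group2" fingerprints.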
        with self.tasks():
            eventstream_state = eventstream.start_merge(project.id, [merge_source.id], source.id)
            merge_groups.delay([merge_source.id], source.id)
            eventstream.end_merge(eventstream_state)

        assert set(
            [
                (gtv.value, gtv.times_seen)
                for gtv in tagstore.get_group_tag_values(
                    project.id, source.id, production_environment.id, "color"
                )
            ]
        ) == set([("red", 6), ("green", 5), ("blue", 5)])

        similar_items = features.compare(source)
        assert len(similar_items) == 2
        assert similar_items[0][0] == source.id
        assert similar_items[0][1]["message:message:character-shingles"] == 1.0
        assert similar_items[1][0] == destination.id
        assert similar_items[1][1]["message:message:character-shingles"] < 1.0
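
        # Unmerge the events carrying the first fingerprint out of "source"
        # and into the pre-existing "destination" group.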
        with self.tasks():
            eventstream_state = eventstream.start_unmerge(
                project.id, [events.keys()[0]], source.id, destination.id
            )
            unmerge.delay(
                project.id, source.id, destination.id, [events.keys()[0]], None, batch_size=5
            )
            eventstream.end_unmerge(eventstream_state)

        assert (
            list(
                Group.objects.filter(id=merge_source.id).values_list(
                    "times_seen", "first_seen", "last_seen"
                )
            )
            == []
        )

        assert list(
            Group.objects.filter(id=source.id).values_list("times_seen", "first_seen", "last_seen")
        ) == [(6, time_from_now(10), time_from_now(15))]

        assert list(
            Group.objects.filter(id=destination.id).values_list(
                "times_seen", "first_seen", "last_seen"
            )
        ) == [(11, time_from_now(0), time_from_now(16))]

        assert source.id != destination.id
        assert source.project == destination.project

        destination_event_ids = map(lambda event: event.event_id, events.values()[1])

        assert set(
            UserReport.objects.filter(group_id=source.id).values_list("event_id", flat=True)
        ) == set(destination_event_ids)

        assert set(
            GroupHash.objects.filter(group_id=source.id).values_list("hash", flat=True)
        ) == set([events.keys()[0], events.keys()[1]])

        assert set(
            GroupRelease.objects.filter(group_id=source.id).values_list(
                "environment", "first_seen", "last_seen"
            )
        ) == set([(u"production", time_from_now(10), time_from_now(15))])

        assert set(
            [
                (gtv.value, gtv.times_seen)
                for gtv in tagstore.get_group_tag_values(
                    project.id, destination.id, production_environment.id, "color"
                )
            ]
        ) == set([(u"red", 4), (u"green", 3), (u"blue", 3)])

        destination_event_ids = map(
            lambda event: event.event_id, events.values()[0] + events.values()[2]
        )

        assert set(
            UserReport.objects.filter(group_id=destination.id).values_list("event_id", flat=True)
        ) == set(destination_event_ids)

        assert set(
            GroupHash.objects.filter(group_id=destination.id).values_list("hash", flat=True)
        ) == set([events.keys()[2]])

        assert set(
            GroupRelease.objects.filter(group_id=destination.id).values_list(
                "environment", "first_seen", "last_seen"
            )
        ) == set(
            [
                ("production", time_from_now(0), time_from_now(9)),
                ("staging", time_from_now(16), time_from_now(16)),
            ]
        )

        assert set(
            [
                (gtk.value, gtk.times_seen)
                for gtk in tagstore.get_group_tag_values(
                    project.id, destination.id, production_environment.id, "color"
                )
            ]
        ) == set([("red", 4), ("blue", 3), ("green", 3)])
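
        # Event counts should have been redistributed in tsdb: the source group
        # keeps only the "group2" events, while the destination absorbs the
        # unmerged "group1" events on top of its own.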
        rollup_duration = 3600

        time_series = tsdb.get_range(
            tsdb.models.group,
            [source.id, destination.id],
            now - timedelta(seconds=rollup_duration),
            time_from_now(17),
            rollup_duration,
        )

        environment_time_series = tsdb.get_range(
            tsdb.models.group,
            [source.id, destination.id],
            now - timedelta(seconds=rollup_duration),
            time_from_now(17),
            rollup_duration,
            environment_ids=[production_environment.id],
        )

        def get_expected_series_values(rollup, events, function=None):
            # Fold each event into its rollup bucket using the provided
            # aggregation function (a plain count by default).
            if function is None:

                def function(aggregate, event):
                    return (aggregate if aggregate is not None else 0) + 1

            expected = {}
            for event in events:
                k = float((to_timestamp(event.datetime) // rollup) * rollup)
                expected[k] = function(expected.get(k), event)

            return expected

        def assert_series_contains(expected, actual, default=0):
            # Every expected bucket must match, and any extra buckets in the
            # actual series must hold the default (empty) value.
            actual = dict(actual)

            for key, value in expected.items():
                assert actual.get(key, 0) == value

            for key in set(actual.keys()) - set(expected.keys()):
                assert actual.get(key, 0) == default

        assert_series_contains(
            get_expected_series_values(rollup_duration, events.values()[1]),
            time_series[source.id],
            0,
        )

        assert_series_contains(
            get_expected_series_values(rollup_duration, events.values()[0] + events.values()[2]),
            time_series[destination.id],
            0,
        )

        assert_series_contains(
            get_expected_series_values(rollup_duration, events.values()[1]),
            environment_time_series[source.id],
            0,
        )

        assert_series_contains(
            get_expected_series_values(
                rollup_duration, events.values()[0][:-1] + events.values()[2]
            ),
            environment_time_series[destination.id],
            0,
        )
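
        # Distinct user counts (users affected per group) should follow the
        # same split, both overall and for the production environment.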
        time_series = tsdb.get_distinct_counts_series(
            tsdb.models.users_affected_by_group,
            [source.id, destination.id],
            now - timedelta(seconds=rollup_duration),
            time_from_now(17),
            rollup_duration,
        )

        environment_time_series = tsdb.get_distinct_counts_series(
            tsdb.models.users_affected_by_group,
            [source.id, destination.id],
            now - timedelta(seconds=rollup_duration),
            time_from_now(17),
            rollup_duration,
            environment_id=production_environment.id,
        )

        def collect_by_user_tag(aggregate, event):
            aggregate = aggregate if aggregate is not None else set()
            aggregate.add(get_event_user_from_interface(event.data["user"]).tag_value)
            return aggregate

        for series in [time_series, environment_time_series]:
            assert_series_contains(
                {
                    timestamp: len(values)
                    for timestamp, values in get_expected_series_values(
                        rollup_duration, events.values()[1], collect_by_user_tag
                    ).items()
                },
                series[source.id],
            )

            assert_series_contains(
                {
                    timestamp: len(values)
                    for timestamp, values in get_expected_series_values(
                        rollup_duration,
                        events.values()[0] + events.values()[2],
                        collect_by_user_tag,
                    ).items()
                },
                time_series[destination.id],
            )
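
        # Per-release frequency counters: expected values are keyed by the
        # GroupRelease rows that now belong to each group.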
        def strip_zeroes(data):
            # Drop empty buckets so only non-zero frequencies are compared.
            for group_id, series in data.items():
                for _, values in series:
                    for key, val in list(values.items()):
                        if val == 0:
                            values.pop(key)

            return data

        def collect_by_release(group, aggregate, event):
            # Count events per GroupRelease id for the given group.
            aggregate = aggregate if aggregate is not None else {}
            release = event.get_tag("sentry:release")
            if not release:
                return aggregate
            release = GroupRelease.objects.get(
                group_id=group.id,
                environment=event.data["environment"],
                release_id=Release.objects.get(
                    organization_id=project.organization_id, version=release
                ).id,
            ).id
            aggregate[release] = aggregate.get(release, 0) + 1
            return aggregate

        items = {}
        for i in [source.id, destination.id]:
            items[i] = list(GroupRelease.objects.filter(group_id=i).values_list("id", flat=True))

        time_series = strip_zeroes(
            tsdb.get_frequency_series(
                tsdb.models.frequent_releases_by_group,
                items,
                now - timedelta(seconds=rollup_duration),
                time_from_now(17),
                rollup_duration,
            )
        )

        assert_series_contains(
            get_expected_series_values(
                rollup_duration, events.values()[1], functools.partial(collect_by_release, source)
            ),
            time_series[source.id],
            {},
        )

        assert_series_contains(
            get_expected_series_values(
                rollup_duration,
                events.values()[0] + events.values()[2],
                functools.partial(collect_by_release, destination),
            ),
            time_series[destination.id],
            {},
        )
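
        # Per-environment frequency counters for each group.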
        items = {}
        for i in [source.id, destination.id]:
            items[i] = list(Environment.objects.all().values_list("id", flat=True))

        time_series = strip_zeroes(
            tsdb.get_frequency_series(
                tsdb.models.frequent_environments_by_group,
                items,
                now - timedelta(seconds=rollup_duration),
                time_from_now(17),
                rollup_duration,
            )
        )

        def collect_by_environment(aggregate, event):
            aggregate = aggregate if aggregate is not None else {}
            environment = Environment.objects.get(
                organization_id=project.organization_id, name=event.data["environment"]
            ).id
            aggregate[environment] = aggregate.get(environment, 0) + 1
            return aggregate

        assert_series_contains(
            get_expected_series_values(rollup_duration, events.values()[1], collect_by_environment),
            time_series[source.id],
            {},
        )

        assert_series_contains(
            get_expected_series_values(
                rollup_duration, events.values()[0] + events.values()[2], collect_by_environment
            ),
            time_series[destination.id],
            {},
        )
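
        # After the unmerge, each group should be most similar to itself, with
        # the other group matching at less than 1.0 on message shingles.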
        source_similar_items = features.compare(source)
        assert source_similar_items[0] == (
            source.id,
            {
                "exception:message:character-shingles": None,
                "exception:stacktrace:application-chunks": None,
                "exception:stacktrace:pairs": None,
                "message:message:character-shingles": 1.0,
            },
        )
        assert source_similar_items[1][0] == destination.id
        assert source_similar_items[1][1]["message:message:character-shingles"] < 1.0

        destination_similar_items = features.compare(destination)
        assert destination_similar_items[0] == (
            destination.id,
            {
                "exception:message:character-shingles": None,
                "exception:stacktrace:application-chunks": None,
                "exception:stacktrace:pairs": None,
                "message:message:character-shingles": 1.0,
            },
        )
        assert destination_similar_items[1][0] == source.id
        assert destination_similar_items[1][1]["message:message:character-shingles"] < 1.0