# scrape-trace
  1. #!/usr/bin/env python
  2. from sentry.runner import configure
  3. configure()
  4. import argparse
  5. import pathlib
  6. from concurrent.futures import ThreadPoolExecutor
  7. from os import environ, listdir, system
  8. from os.path import join
  9. from uuid import uuid4
  10. import sentry_sdk
  11. import urllib3
  12. from sentry.event_manager import EventManager
  13. from sentry.utils import json
  14. if "SCRAPE_AUTH_TOKEN" not in environ:
  15. raise Exception(
  16. "Please set your sentry auth token in the environment variable SCRAPE_AUTH_TOKEN"
  17. )
  18. AUTH_TOKEN = environ["SCRAPE_AUTH_TOKEN"]
  19. http = urllib3.PoolManager()
  20. BASE_URL = "https://sentry.sentry.io/api/0/"
  21. progress = 0
  22. total_events = 0
  23. def save_event(root_dir, project_name, event_id):
  24. """Get the event json from prod, then save it to a local file"""
  25. r = http.request(
  26. "GET",
  27. f"{BASE_URL}projects/sentry/{project_name}/events/{event_id}/json/",
  28. headers={"Authorization": f"Bearer {AUTH_TOKEN}"},
  29. )
  30. assert r.status == 200, "Request for event json failed"
  31. with open(f"{root_dir}/{event_id}.json", "w") as f:
  32. f.write(r.data.decode("utf-8"))
  33. global progress
  34. progress += 1
  35. system("clear")
  36. print(f"{progress}/{total_events} events loaded for current page") # NOQA
  37. def scrape(trace_id, directory):
  38. """Go to discover and get all the event ids for a given trace id"""
  39. for offset in range(50):
  40. print("retrieving events") # NOQA
  41. r = http.request(
  42. "GET",
  43. f"{BASE_URL}organizations/sentry/events/",
  44. fields=[
  45. ("query", f"trace:{trace_id}"),
  46. ("field", "id"),
  47. ("field", "project.name"),
  48. ("cursor", f"0:{offset * 100}:0"),
  49. ("per_page", "100"),
  50. ("project", "-1"),
  51. ("statsPeriod", "8h"),
  52. ],
  53. headers={"Authorization": f"Bearer {AUTH_TOKEN}"},
  54. )
  55. assert r.status == 200, "Request for event list failed"
  56. content = json.loads(r.data)
  57. rows = content.get("data", [])
  58. if len(rows) == 0:
  59. return
  60. global total_events
  61. global progress
  62. total_events = len(rows)
  63. progress = 0
  64. with ThreadPoolExecutor(10) as executor:
  65. for row in rows:
  66. executor.submit(save_event, directory, row["project.name"], row["id"])
  67. # There isn't another page, early exit
  68. if len(rows) < 100:
  69. return
  70. def load(directory):
  71. files = listdir(directory)
  72. global total_events
  73. global progress
  74. total_events = len(files)
  75. progress = 0
  76. # start a transaction so we don't spam the output with errors
  77. with sentry_sdk.start_transaction(name="load transactions"):
  78. with ThreadPoolExecutor(10) as executor:
  79. for file in files:
  80. if file.endswith("json"):
  81. executor.submit(load_file, directory, file)
  82. def load_file(root_dir, file):
  83. with open(join(root_dir, file)) as f:
  84. data = json.load(f)
  85. manager = EventManager(data)
  86. manager.normalize()
  87. if "contexts" not in data:
  88. return
  89. spans = [process_span(span, data) for span in data.get("spans", [])]
  90. spans.append(process_event(data))
  91. http.request(
  92. "POST",
  93. "http://127.0.0.1:1218/tests/entities/spans/insert",
  94. body=json.dumps(spans),
  95. )
  96. manager.save(1)
  97. global progress
  98. progress += 1
  99. system("clear")
  100. print(f"{progress}/{total_events} events loaded into Sentry") # NOQA
  101. def process_span(span, event):
  102. span["duration_ms"] = int((span["timestamp"] - span["start_timestamp"]) * 1000)
  103. span["received"] = span["timestamp"]
  104. span["start_timestamp_ms"] = int(span["start_timestamp"] * 1000)
  105. span["exclusive_time_ms"] = int(span["exclusive_time"] * 1000)
  106. span["is_segment"] = False
  107. span["segment_id"] = str(int(event["event_id"], 16))[:16]
  108. span["event_id"] = event["event_id"]
  109. # Put everything in project 1 even though they're different projects in prod
  110. span["project_id"] = 1
  111. span["tags"] = {}
  112. return span
  113. def process_event(event):
  114. """Convert the scraped event into span data"""
  115. trace_context = event["contexts"]["trace"]
  116. start_ts = event["start_timestamp"]
  117. end_ts = event["timestamp"]
  118. base_span = {
  119. "description": event["transaction"],
  120. "duration_ms": int(end_ts - start_ts),
  121. "exclusive_time_ms": int(end_ts - start_ts),
  122. "group_raw": uuid4().hex[:16],
  123. "is_segment": False,
  124. "measurements": {},
  125. "organization_id": 1,
  126. "parent_span_id": trace_context.get("parent_span_id", "0" * 12),
  127. "profile_id": uuid4().hex,
  128. "project_id": 1,
  129. "received": start_ts,
  130. "retention_days": 90,
  131. "event_id": event["event_id"],
  132. "sentry_tags": {},
  133. "span_id": trace_context["span_id"],
  134. "start_timestamp_ms": int(start_ts * 1000),
  135. "tags": {},
  136. "trace_id": trace_context["trace_id"],
  137. }
  138. if "parent_span_id" in trace_context:
  139. base_span["parent_span_id"] = trace_context["parent_span_id"]
  140. else:
  141. del base_span["parent_span_id"]
  142. return base_span
  143. if __name__ == "__main__":
  144. parser = argparse.ArgumentParser(description="Scrape data from production and load it locally")
  145. parser.add_argument("trace_id", type=str, help="The trace id to scrape or load")
  146. parser.add_argument(
  147. "--scrape",
  148. action=argparse.BooleanOptionalAction,
  149. help="For the given trace id, scrape the events into a local folder",
  150. )
  151. parser.add_argument(
  152. "--load",
  153. action=argparse.BooleanOptionalAction,
  154. help="For the given trace id, load the events into your local clickhouse",
  155. )
  156. args = parser.parse_args()
  157. directory = f"trace/{args.trace_id}"
  158. pathlib.Path(directory).mkdir(parents=True, exist_ok=True)
  159. if args.scrape:
  160. scrape(args.trace_id, directory)
  161. if args.load:
  162. load(directory)