stackcollapse-perf-sched.awk 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228
  1. #!/usr/bin/awk -f
  2. #
  3. # This program generates collapsed off-cpu stacks fit for use by flamegraph.pl
  4. # from scheduler data collected via perf_events.
  5. #
  6. # Outputs the cumulative time off cpu in us for each distinct stack observed.
  7. #
  8. # Some awk variables further control behavior:
  9. #
  10. # record_tid If truthy, causes all stack traces to include the
  11. # command and LWP id.
  12. #
  13. # record_wake_stack If truthy, stacks include the frames from the wakeup
  14. # event in addition to the sleep event.
  15. # See http://www.brendangregg.com/FlameGraphs/offcpuflamegraphs.html#Wakeup
  16. #
  17. # recurse If truthy, attempt to recursively identify and
  18. # visualize the full wakeup stack chain.
  19. # See http://www.brendangregg.com/FlameGraphs/offcpuflamegraphs.html#ChainGraph
  20. #
  21. # Note that this is only an approximation, as only the
  22. # last sleep event is recorded (e.g. if a thread slept
  23. # multiple times before waking another thread, only the
  24. # last sleep event is used). Implies record_wake_stack=1
  25. #
  26. # To set any of these variables from the command line, run via:
  27. #
  28. # stackcollapse-perf-sched.awk -v recurse=1
  29. #
  30. # == Important warning ==
  31. #
  32. # WARNING: tracing all scheduler events is very high overhead in perf. Even
  33. # more alarmingly, there appear to be bugs in perf that prevent it from reliably
  34. # getting consistent traces (even with large trace buffers), causing it to
  35. # produce empty perf.data files with error messages of the form:
  36. #
  37. # 0x952790 [0x736d]: failed to process type: 3410
  38. #
  39. # This failure is not determinisitic, so re-executing perf record will
  40. # eventually succeed.
  41. #
  42. # == Usage ==
  43. #
  44. # First, record data via perf_events:
  45. #
  46. # sudo perf record -g -e 'sched:sched_switch' \
  47. # -e 'sched:sched_stat_sleep' -e 'sched:sched_stat_blocked' \
  48. # -p <pid> -o perf.data -- sleep 1
  49. #
  50. # Then post process with this script:
  51. #
  52. # sudo perf script -f time,comm,pid,tid,event,ip,sym,dso,trace -i perf.data | \
  53. # stackcollapse-perf-sched.awk -v recurse=1 | \
  54. # flamegraph.pl --color=io --countname=us >out.svg
  55. #
  56. #
  57. # CDDL HEADER START
  58. #
  59. # The contents of this file are subject to the terms of the
  60. # Common Development and Distribution License (the "License").
  61. # You may not use this file except in compliance with the License.
  62. #
  63. # You can obtain a copy of the license at docs/cddl1.txt or
  64. # http://opensource.org/licenses/CDDL-1.0.
  65. # See the License for the specific language governing permissions
  66. # and limitations under the License.
  67. #
  68. # When distributing Covered Code, include this CDDL HEADER in each
  69. # file and include the License file at docs/cddl1.txt.
  70. # If applicable, add the following below this CDDL HEADER, with the
  71. # fields enclosed by brackets "[]" replaced with your own identifying
  72. # information: Portions Copyright [yyyy] [name of copyright owner]
  73. #
  74. # CDDL HEADER END
  75. #
  76. #
  77. # Copyright (c) 2015 by MemSQL. All rights reserved.
  78. #
  79. #
  80. # Match a perf captured variable, returning just the contents. For example, for
  81. # the following line, get_perf_captured_variable("pid") would return "27235":
  82. #
  83. # swapper 0 [006] 708189.626415: sched:sched_stat_sleep: comm=memsqld pid=27235 delay=100078421 [ns
  84. #
  85. function get_perf_captured_variable(variable)
  86. {
  87. match($0, variable "=[^[:space:]]+")
  88. return substr($0, RSTART + length(variable) + 1,
  89. RLENGTH - length(variable) - 1)
  90. }
  91. #
  92. # The timestamp is the first field that ends in a colon, e.g.:
  93. #
  94. # swapper 0 [006] 708189.626415: sched:sched_stat_sleep: comm=memsqld pid=27235 delay=100078421 [ns
  95. #
  96. # or
  97. #
  98. # swapper 0/0 708189.626415: sched:sched_stat_sleep: comm=memsqld pid=27235 delay=100078421 [ns]
  99. #
  100. function get_perf_timestamp()
  101. {
  102. match($0, " [^ :]+:")
  103. return substr($0, RSTART + 1, RLENGTH - 2)
  104. }
  105. !/^#/ && /sched:sched_switch/ {
  106. switchcommand = get_perf_captured_variable("comm")
  107. switchpid = get_perf_captured_variable("prev_pid")
  108. switchtime=get_perf_timestamp()
  109. switchstack=""
  110. }
  111. #
  112. # Strip the function name from a stack entry
  113. #
  114. # Stack entry is expected to be formatted like:
  115. # c60849 MyClass::Foo(unsigned long) (/home/areece/a.out)
  116. #
  117. function get_function_name()
  118. {
  119. # We start from 2 since we don't need the hex offset.
  120. # We stop at NF - 1 since we don't need the library path.
  121. funcname = $2
  122. for (i = 3; i <= NF - 1; i++) {
  123. funcname = funcname " " $i
  124. }
  125. return funcname
  126. }
  127. (switchpid != 0 && /^[[:space:]]/) {
  128. if (switchstack == "") {
  129. switchstack = get_function_name()
  130. } else {
  131. switchstack = get_function_name() ";" switchstack
  132. }
  133. }
  134. (switchpid != 0 && /^$/) {
  135. switch_stacks[switchpid] = switchstack
  136. delete last_switch_stacks[switchpid]
  137. switch_time[switchpid] = switchtime
  138. switchpid=0
  139. switchcommand=""
  140. switchstack=""
  141. }
  142. !/^#/ && (/sched:sched_stat_sleep/ || /sched:sched_stat_blocked/) {
  143. wakecommand=$1
  144. wakepid=$2
  145. waketime=get_perf_timestamp()
  146. stat_next_command = get_perf_captured_variable("comm")
  147. stat_next_pid = get_perf_captured_variable("pid")
  148. stat_delay_ns = int(get_perf_captured_variable("delay"))
  149. wakestack=""
  150. }
  151. (stat_next_pid != 0 && /^[[:space:]]/) {
  152. if (wakestack == "") {
  153. wakestack = get_function_name()
  154. } else {
  155. # We build the wakestack in reverse order.
  156. wakestack = wakestack ";" get_function_name()
  157. }
  158. }
  159. (stat_next_pid != 0 && /^$/) {
  160. #
  161. # For some reason, perf appears to output duplicate
  162. # sched:sched_stat_sleep and sched:sched_stat_blocked events. We only
  163. # handle the first event.
  164. #
  165. if (stat_next_pid in switch_stacks) {
  166. last_wake_time[stat_next_pid] = waketime
  167. stack = switch_stacks[stat_next_pid]
  168. if (recurse || record_wake_stack) {
  169. stack = stack ";" wakestack
  170. if (record_tid) {
  171. stack = stack ";" wakecommand "-" wakepid
  172. } else {
  173. stack = stack ";" wakecommand
  174. }
  175. }
  176. if (recurse) {
  177. if (last_wake_time[wakepid] > last_switch_time[stat_next_pid]) {
  178. stack = stack ";-;" last_switch_stacks[wakepid]
  179. }
  180. last_switch_stacks[stat_next_pid] = stack
  181. }
  182. delete switch_stacks[stat_next_pid]
  183. if (record_tid) {
  184. stack_times[stat_next_command "-" stat_next_pid ";" stack] += stat_delay_ns
  185. } else {
  186. stack_times[stat_next_command ";" stack] += stat_delay_ns
  187. }
  188. }
  189. wakecommand=""
  190. wakepid=0
  191. stat_next_pid=0
  192. stat_next_command=""
  193. stat_delay_ms=0
  194. }
  195. END {
  196. for (stack in stack_times) {
  197. if (int(stack_times[stack] / 1000) > 0) {
  198. print stack, int(stack_times[stack] / 1000)
  199. }
  200. }
  201. }