metrics.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410
  1. package stats
  2. import (
  3. "log"
  4. "net"
  5. "net/http"
  6. "os"
  7. "strconv"
  8. "strings"
  9. "time"
  10. "github.com/prometheus/client_golang/prometheus"
  11. "github.com/prometheus/client_golang/prometheus/collectors"
  12. "github.com/prometheus/client_golang/prometheus/promhttp"
  13. "github.com/prometheus/client_golang/prometheus/push"
  14. "github.com/seaweedfs/seaweedfs/weed/glog"
  15. )
  16. // Readonly volume types
  17. const (
  18. Namespace = "SeaweedFS"
  19. IsReadOnly = "IsReadOnly"
  20. NoWriteOrDelete = "noWriteOrDelete"
  21. NoWriteCanDelete = "noWriteCanDelete"
  22. IsDiskSpaceLow = "isDiskSpaceLow"
  23. )
  24. var readOnlyVolumeTypes = [4]string{IsReadOnly, NoWriteOrDelete, NoWriteCanDelete, IsDiskSpaceLow}
  25. var (
  26. Gather = prometheus.NewRegistry()
  27. MasterClientConnectCounter = prometheus.NewCounterVec(
  28. prometheus.CounterOpts{
  29. Namespace: Namespace,
  30. Subsystem: "wdclient",
  31. Name: "connect_updates",
  32. Help: "Counter of master client leader updates.",
  33. }, []string{"type"})
  34. MasterRaftIsleader = prometheus.NewGauge(
  35. prometheus.GaugeOpts{
  36. Namespace: Namespace,
  37. Subsystem: "master",
  38. Name: "is_leader",
  39. Help: "is leader",
  40. })
  41. MasterAdminLock = prometheus.NewGaugeVec(
  42. prometheus.GaugeOpts{
  43. Namespace: Namespace,
  44. Subsystem: "master",
  45. Name: "admin_lock",
  46. Help: "admin lock",
  47. }, []string{"client"})
  48. MasterReceivedHeartbeatCounter = prometheus.NewCounterVec(
  49. prometheus.CounterOpts{
  50. Namespace: Namespace,
  51. Subsystem: "master",
  52. Name: "received_heartbeats",
  53. Help: "Counter of master received heartbeat.",
  54. }, []string{"type"})
  55. MasterReplicaPlacementMismatch = prometheus.NewGaugeVec(
  56. prometheus.GaugeOpts{
  57. Namespace: Namespace,
  58. Subsystem: "master",
  59. Name: "replica_placement_mismatch",
  60. Help: "replica placement mismatch",
  61. }, []string{"collection", "id"})
  62. MasterVolumeLayoutWritable = prometheus.NewGaugeVec(
  63. prometheus.GaugeOpts{
  64. Namespace: Namespace,
  65. Subsystem: "master",
  66. Name: "volume_layout_writable",
  67. Help: "Number of writable volumes in volume layouts",
  68. }, []string{"collection", "disk", "rp", "ttl"})
  69. MasterVolumeLayoutCrowded = prometheus.NewGaugeVec(
  70. prometheus.GaugeOpts{
  71. Namespace: Namespace,
  72. Subsystem: "master",
  73. Name: "volume_layout_crowded",
  74. Help: "Number of crowded volumes in volume layouts",
  75. }, []string{"collection", "disk", "rp", "ttl"})
  76. MasterPickForWriteErrorCounter = prometheus.NewCounter(
  77. prometheus.CounterOpts{
  78. Namespace: Namespace,
  79. Subsystem: "master",
  80. Name: "pick_for_write_error",
  81. Help: "Counter of master pick for write error",
  82. })
  83. MasterBroadcastToFullErrorCounter = prometheus.NewCounter(
  84. prometheus.CounterOpts{
  85. Namespace: Namespace,
  86. Subsystem: "master",
  87. Name: "broadcast_to_full",
  88. Help: "Counter of master broadcast send to full message channel err",
  89. })
  90. MasterLeaderChangeCounter = prometheus.NewCounterVec(
  91. prometheus.CounterOpts{
  92. Namespace: Namespace,
  93. Subsystem: "master",
  94. Name: "leader_changes",
  95. Help: "Counter of master leader changes.",
  96. }, []string{"type"})
  97. FilerRequestCounter = prometheus.NewCounterVec(
  98. prometheus.CounterOpts{
  99. Namespace: Namespace,
  100. Subsystem: "filer",
  101. Name: "request_total",
  102. Help: "Counter of filer requests.",
  103. }, []string{"type", "code"})
  104. FilerHandlerCounter = prometheus.NewCounterVec(
  105. prometheus.CounterOpts{
  106. Namespace: Namespace,
  107. Subsystem: "filer",
  108. Name: "handler_total",
  109. Help: "Counter of filer handlers.",
  110. }, []string{"type"})
  111. FilerRequestHistogram = prometheus.NewHistogramVec(
  112. prometheus.HistogramOpts{
  113. Namespace: Namespace,
  114. Subsystem: "filer",
  115. Name: "request_seconds",
  116. Help: "Bucketed histogram of filer request processing time.",
  117. Buckets: prometheus.ExponentialBuckets(0.0001, 2, 24),
  118. }, []string{"type"})
  119. FilerInFlightRequestsGauge = prometheus.NewGaugeVec(
  120. prometheus.GaugeOpts{
  121. Namespace: Namespace,
  122. Subsystem: "filer",
  123. Name: "in_flight_requests",
  124. Help: "Current number of in-flight requests being handled by filer.",
  125. }, []string{"type"})
  126. FilerServerLastSendTsOfSubscribeGauge = prometheus.NewGaugeVec(
  127. prometheus.GaugeOpts{
  128. Namespace: Namespace,
  129. Subsystem: "filer",
  130. Name: "last_send_timestamp_of_subscribe",
  131. Help: "The last send timestamp of the filer subscription.",
  132. }, []string{"sourceFiler", "clientName", "path"})
  133. FilerStoreCounter = prometheus.NewCounterVec(
  134. prometheus.CounterOpts{
  135. Namespace: Namespace,
  136. Subsystem: "filerStore",
  137. Name: "request_total",
  138. Help: "Counter of filer store requests.",
  139. }, []string{"store", "type"})
  140. FilerStoreHistogram = prometheus.NewHistogramVec(
  141. prometheus.HistogramOpts{
  142. Namespace: Namespace,
  143. Subsystem: "filerStore",
  144. Name: "request_seconds",
  145. Help: "Bucketed histogram of filer store request processing time.",
  146. Buckets: prometheus.ExponentialBuckets(0.0001, 2, 24),
  147. }, []string{"store", "type"})
  148. FilerSyncOffsetGauge = prometheus.NewGaugeVec(
  149. prometheus.GaugeOpts{
  150. Namespace: Namespace,
  151. Subsystem: "filerSync",
  152. Name: "sync_offset",
  153. Help: "The offset of the filer synchronization service.",
  154. }, []string{"sourceFiler", "targetFiler", "clientName", "path"})
  155. VolumeServerRequestCounter = prometheus.NewCounterVec(
  156. prometheus.CounterOpts{
  157. Namespace: Namespace,
  158. Subsystem: "volumeServer",
  159. Name: "request_total",
  160. Help: "Counter of volume server requests.",
  161. }, []string{"type", "code"})
  162. VolumeServerHandlerCounter = prometheus.NewCounterVec(
  163. prometheus.CounterOpts{
  164. Namespace: Namespace,
  165. Subsystem: "volumeServer",
  166. Name: "handler_total",
  167. Help: "Counter of volume server handlers.",
  168. }, []string{"type"})
  169. VolumeServerVacuumingCompactCounter = prometheus.NewCounterVec(
  170. prometheus.CounterOpts{
  171. Namespace: Namespace,
  172. Subsystem: "volumeServer",
  173. Name: "vacuuming_compact_count",
  174. Help: "Counter of volume vacuuming Compact counter",
  175. }, []string{"success"})
  176. VolumeServerVacuumingCommitCounter = prometheus.NewCounterVec(
  177. prometheus.CounterOpts{
  178. Namespace: Namespace,
  179. Subsystem: "volumeServer",
  180. Name: "vacuuming_commit_count",
  181. Help: "Counter of volume vacuuming commit counter",
  182. }, []string{"success"})
  183. VolumeServerVacuumingHistogram = prometheus.NewHistogramVec(
  184. prometheus.HistogramOpts{
  185. Namespace: Namespace,
  186. Subsystem: "volumeServer",
  187. Name: "vacuuming_seconds",
  188. Help: "Bucketed histogram of volume server vacuuming processing time.",
  189. Buckets: prometheus.ExponentialBuckets(0.0001, 2, 24),
  190. }, []string{"type"})
  191. VolumeServerRequestHistogram = prometheus.NewHistogramVec(
  192. prometheus.HistogramOpts{
  193. Namespace: Namespace,
  194. Subsystem: "volumeServer",
  195. Name: "request_seconds",
  196. Help: "Bucketed histogram of volume server request processing time.",
  197. Buckets: prometheus.ExponentialBuckets(0.0001, 2, 24),
  198. }, []string{"type"})
  199. VolumeServerInFlightRequestsGauge = prometheus.NewGaugeVec(
  200. prometheus.GaugeOpts{
  201. Namespace: Namespace,
  202. Subsystem: "volumeServer",
  203. Name: "in_flight_requests",
  204. Help: "Current number of in-flight requests being handled by volume server.",
  205. }, []string{"type"})
  206. VolumeServerVolumeGauge = prometheus.NewGaugeVec(
  207. prometheus.GaugeOpts{
  208. Namespace: Namespace,
  209. Subsystem: "volumeServer",
  210. Name: "volumes",
  211. Help: "Number of volumes or shards.",
  212. }, []string{"collection", "type"})
  213. VolumeServerReadOnlyVolumeGauge = prometheus.NewGaugeVec(
  214. prometheus.GaugeOpts{
  215. Namespace: Namespace,
  216. Subsystem: "volumeServer",
  217. Name: "read_only_volumes",
  218. Help: "Number of read only volumes.",
  219. }, []string{"collection", "type"})
  220. VolumeServerMaxVolumeCounter = prometheus.NewGauge(
  221. prometheus.GaugeOpts{
  222. Namespace: Namespace,
  223. Subsystem: "volumeServer",
  224. Name: "max_volumes",
  225. Help: "Maximum number of volumes.",
  226. })
  227. VolumeServerDiskSizeGauge = prometheus.NewGaugeVec(
  228. prometheus.GaugeOpts{
  229. Namespace: Namespace,
  230. Subsystem: "volumeServer",
  231. Name: "total_disk_size",
  232. Help: "Actual disk size used by volumes.",
  233. }, []string{"collection", "type"})
  234. VolumeServerResourceGauge = prometheus.NewGaugeVec(
  235. prometheus.GaugeOpts{
  236. Namespace: Namespace,
  237. Subsystem: "volumeServer",
  238. Name: "resource",
  239. Help: "Resource usage",
  240. }, []string{"name", "type"})
  241. S3RequestCounter = prometheus.NewCounterVec(
  242. prometheus.CounterOpts{
  243. Namespace: Namespace,
  244. Subsystem: "s3",
  245. Name: "request_total",
  246. Help: "Counter of s3 requests.",
  247. }, []string{"type", "code", "bucket"})
  248. S3HandlerCounter = prometheus.NewCounterVec(
  249. prometheus.CounterOpts{
  250. Namespace: Namespace,
  251. Subsystem: "s3",
  252. Name: "handler_total",
  253. Help: "Counter of s3 server handlers.",
  254. }, []string{"type"})
  255. S3RequestHistogram = prometheus.NewHistogramVec(
  256. prometheus.HistogramOpts{
  257. Namespace: Namespace,
  258. Subsystem: "s3",
  259. Name: "request_seconds",
  260. Help: "Bucketed histogram of s3 request processing time.",
  261. Buckets: prometheus.ExponentialBuckets(0.0001, 2, 24),
  262. }, []string{"type", "bucket"})
  263. S3TimeToFirstByteHistogram = prometheus.NewHistogramVec(
  264. prometheus.HistogramOpts{
  265. Namespace: Namespace,
  266. Subsystem: "s3",
  267. Name: "time_to_first_byte_millisecond",
  268. Help: "Bucketed histogram of s3 time to first byte request processing time.",
  269. Buckets: prometheus.ExponentialBuckets(0.001, 2, 27),
  270. }, []string{"type", "bucket"})
  271. S3InFlightRequestsGauge = prometheus.NewGaugeVec(
  272. prometheus.GaugeOpts{
  273. Namespace: Namespace,
  274. Subsystem: "s3",
  275. Name: "in_flight_requests",
  276. Help: "Current number of in-flight requests being handled by s3.",
  277. }, []string{"type"})
  278. )
  279. func init() {
  280. Gather.MustRegister(MasterClientConnectCounter)
  281. Gather.MustRegister(MasterRaftIsleader)
  282. Gather.MustRegister(MasterAdminLock)
  283. Gather.MustRegister(MasterReceivedHeartbeatCounter)
  284. Gather.MustRegister(MasterLeaderChangeCounter)
  285. Gather.MustRegister(MasterReplicaPlacementMismatch)
  286. Gather.MustRegister(MasterVolumeLayoutWritable)
  287. Gather.MustRegister(MasterVolumeLayoutCrowded)
  288. Gather.MustRegister(MasterBroadcastToFullErrorCounter)
  289. Gather.MustRegister(FilerRequestCounter)
  290. Gather.MustRegister(FilerHandlerCounter)
  291. Gather.MustRegister(FilerRequestHistogram)
  292. Gather.MustRegister(FilerInFlightRequestsGauge)
  293. Gather.MustRegister(FilerStoreCounter)
  294. Gather.MustRegister(FilerStoreHistogram)
  295. Gather.MustRegister(FilerSyncOffsetGauge)
  296. Gather.MustRegister(FilerServerLastSendTsOfSubscribeGauge)
  297. Gather.MustRegister(collectors.NewGoCollector())
  298. Gather.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}))
  299. Gather.MustRegister(VolumeServerRequestCounter)
  300. Gather.MustRegister(VolumeServerHandlerCounter)
  301. Gather.MustRegister(VolumeServerRequestHistogram)
  302. Gather.MustRegister(VolumeServerInFlightRequestsGauge)
  303. Gather.MustRegister(VolumeServerVacuumingCompactCounter)
  304. Gather.MustRegister(VolumeServerVacuumingCommitCounter)
  305. Gather.MustRegister(VolumeServerVacuumingHistogram)
  306. Gather.MustRegister(VolumeServerVolumeGauge)
  307. Gather.MustRegister(VolumeServerMaxVolumeCounter)
  308. Gather.MustRegister(VolumeServerReadOnlyVolumeGauge)
  309. Gather.MustRegister(VolumeServerDiskSizeGauge)
  310. Gather.MustRegister(VolumeServerResourceGauge)
  311. Gather.MustRegister(S3RequestCounter)
  312. Gather.MustRegister(S3HandlerCounter)
  313. Gather.MustRegister(S3RequestHistogram)
  314. Gather.MustRegister(S3InFlightRequestsGauge)
  315. Gather.MustRegister(S3TimeToFirstByteHistogram)
  316. }
  317. func LoopPushingMetric(name, instance, addr string, intervalSeconds int) {
  318. if addr == "" || intervalSeconds == 0 {
  319. return
  320. }
  321. glog.V(0).Infof("%s server sends metrics to %s every %d seconds", name, addr, intervalSeconds)
  322. pusher := push.New(addr, name).Gatherer(Gather).Grouping("instance", instance)
  323. for {
  324. err := pusher.Push()
  325. if err != nil && !strings.HasPrefix(err.Error(), "unexpected status code 200") {
  326. glog.V(0).Infof("could not push metrics to prometheus push gateway %s: %v", addr, err)
  327. }
  328. if intervalSeconds <= 0 {
  329. intervalSeconds = 15
  330. }
  331. time.Sleep(time.Duration(intervalSeconds) * time.Second)
  332. }
  333. }
  334. func JoinHostPort(host string, port int) string {
  335. portStr := strconv.Itoa(port)
  336. if strings.HasPrefix(host, "[") && strings.HasSuffix(host, "]") {
  337. return host + ":" + portStr
  338. }
  339. return net.JoinHostPort(host, portStr)
  340. }
  341. func StartMetricsServer(ip string, port int) {
  342. if port == 0 {
  343. return
  344. }
  345. http.Handle("/metrics", promhttp.HandlerFor(Gather, promhttp.HandlerOpts{}))
  346. log.Fatal(http.ListenAndServe(JoinHostPort(ip, port), nil))
  347. }
  348. func SourceName(port uint32) string {
  349. hostname, err := os.Hostname()
  350. if err != nil {
  351. return "unknown"
  352. }
  353. return net.JoinHostPort(hostname, strconv.Itoa(int(port)))
  354. }
  355. // todo - can be changed to DeletePartialMatch when https://github.com/prometheus/client_golang/pull/1013 gets released
  356. func DeleteCollectionMetrics(collection string) {
  357. VolumeServerDiskSizeGauge.DeleteLabelValues(collection, "normal")
  358. for _, volume_type := range readOnlyVolumeTypes {
  359. VolumeServerReadOnlyVolumeGauge.DeleteLabelValues(collection, volume_type)
  360. }
  361. VolumeServerVolumeGauge.DeleteLabelValues(collection, "volume")
  362. }