metrics.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454
  1. package stats
  2. import (
  3. "log"
  4. "net"
  5. "net/http"
  6. "os"
  7. "strconv"
  8. "strings"
  9. "sync"
  10. "time"
  11. "github.com/prometheus/client_golang/prometheus"
  12. "github.com/prometheus/client_golang/prometheus/collectors"
  13. "github.com/prometheus/client_golang/prometheus/promhttp"
  14. "github.com/prometheus/client_golang/prometheus/push"
  15. "github.com/seaweedfs/seaweedfs/weed/glog"
  16. )
  // Namespace is the Prometheus namespace shared by every metric declared in
  // this file.
  const Namespace = "SeaweedFS"

  // Read-only volume type names, collected in readOnlyVolumeTypes.
  const (
  	IsReadOnly       = "IsReadOnly"
  	NoWriteOrDelete  = "noWriteOrDelete"
  	NoWriteCanDelete = "noWriteCanDelete"
  	IsDiskSpaceLow   = "isDiskSpaceLow"
  )

  // bucketAtiveTTL is how long a bucket may go without a recorded activity
  // before bucketMetricTTLControl drops its per-bucket S3 series; the same
  // duration is used as the pause between two sweeps.
  const bucketAtiveTTL = 10 * time.Minute
  26. var readOnlyVolumeTypes = [4]string{IsReadOnly, NoWriteOrDelete, NoWriteCanDelete, IsDiskSpaceLow}
  27. var bucketLastActiveTsNs map[string]int64 = map[string]int64{}
  28. var bucketLastActiveLock sync.Mutex
  29. var (
  30. Gather = prometheus.NewRegistry()
  31. MasterClientConnectCounter = prometheus.NewCounterVec(
  32. prometheus.CounterOpts{
  33. Namespace: Namespace,
  34. Subsystem: "wdclient",
  35. Name: "connect_updates",
  36. Help: "Counter of master client leader updates.",
  37. }, []string{"type"})
  38. MasterRaftIsleader = prometheus.NewGauge(
  39. prometheus.GaugeOpts{
  40. Namespace: Namespace,
  41. Subsystem: "master",
  42. Name: "is_leader",
  43. Help: "is leader",
  44. })
  45. MasterAdminLock = prometheus.NewGaugeVec(
  46. prometheus.GaugeOpts{
  47. Namespace: Namespace,
  48. Subsystem: "master",
  49. Name: "admin_lock",
  50. Help: "admin lock",
  51. }, []string{"client"})
  52. MasterReceivedHeartbeatCounter = prometheus.NewCounterVec(
  53. prometheus.CounterOpts{
  54. Namespace: Namespace,
  55. Subsystem: "master",
  56. Name: "received_heartbeats",
  57. Help: "Counter of master received heartbeat.",
  58. }, []string{"type"})
  59. MasterReplicaPlacementMismatch = prometheus.NewGaugeVec(
  60. prometheus.GaugeOpts{
  61. Namespace: Namespace,
  62. Subsystem: "master",
  63. Name: "replica_placement_mismatch",
  64. Help: "replica placement mismatch",
  65. }, []string{"collection", "id"})
  66. MasterVolumeLayoutWritable = prometheus.NewGaugeVec(
  67. prometheus.GaugeOpts{
  68. Namespace: Namespace,
  69. Subsystem: "master",
  70. Name: "volume_layout_writable",
  71. Help: "Number of writable volumes in volume layouts",
  72. }, []string{"collection", "disk", "rp", "ttl"})
  73. MasterVolumeLayoutCrowded = prometheus.NewGaugeVec(
  74. prometheus.GaugeOpts{
  75. Namespace: Namespace,
  76. Subsystem: "master",
  77. Name: "volume_layout_crowded",
  78. Help: "Number of crowded volumes in volume layouts",
  79. }, []string{"collection", "disk", "rp", "ttl"})
  80. MasterPickForWriteErrorCounter = prometheus.NewCounter(
  81. prometheus.CounterOpts{
  82. Namespace: Namespace,
  83. Subsystem: "master",
  84. Name: "pick_for_write_error",
  85. Help: "Counter of master pick for write error",
  86. })
  87. MasterBroadcastToFullErrorCounter = prometheus.NewCounter(
  88. prometheus.CounterOpts{
  89. Namespace: Namespace,
  90. Subsystem: "master",
  91. Name: "broadcast_to_full",
  92. Help: "Counter of master broadcast send to full message channel err",
  93. })
  94. MasterLeaderChangeCounter = prometheus.NewCounterVec(
  95. prometheus.CounterOpts{
  96. Namespace: Namespace,
  97. Subsystem: "master",
  98. Name: "leader_changes",
  99. Help: "Counter of master leader changes.",
  100. }, []string{"type"})
  101. FilerRequestCounter = prometheus.NewCounterVec(
  102. prometheus.CounterOpts{
  103. Namespace: Namespace,
  104. Subsystem: "filer",
  105. Name: "request_total",
  106. Help: "Counter of filer requests.",
  107. }, []string{"type", "code"})
  108. FilerHandlerCounter = prometheus.NewCounterVec(
  109. prometheus.CounterOpts{
  110. Namespace: Namespace,
  111. Subsystem: "filer",
  112. Name: "handler_total",
  113. Help: "Counter of filer handlers.",
  114. }, []string{"type"})
  115. FilerRequestHistogram = prometheus.NewHistogramVec(
  116. prometheus.HistogramOpts{
  117. Namespace: Namespace,
  118. Subsystem: "filer",
  119. Name: "request_seconds",
  120. Help: "Bucketed histogram of filer request processing time.",
  121. Buckets: prometheus.ExponentialBuckets(0.0001, 2, 24),
  122. }, []string{"type"})
  123. FilerInFlightRequestsGauge = prometheus.NewGaugeVec(
  124. prometheus.GaugeOpts{
  125. Namespace: Namespace,
  126. Subsystem: "filer",
  127. Name: "in_flight_requests",
  128. Help: "Current number of in-flight requests being handled by filer.",
  129. }, []string{"type"})
  130. FilerServerLastSendTsOfSubscribeGauge = prometheus.NewGaugeVec(
  131. prometheus.GaugeOpts{
  132. Namespace: Namespace,
  133. Subsystem: "filer",
  134. Name: "last_send_timestamp_of_subscribe",
  135. Help: "The last send timestamp of the filer subscription.",
  136. }, []string{"sourceFiler", "clientName", "path"})
  137. FilerStoreCounter = prometheus.NewCounterVec(
  138. prometheus.CounterOpts{
  139. Namespace: Namespace,
  140. Subsystem: "filerStore",
  141. Name: "request_total",
  142. Help: "Counter of filer store requests.",
  143. }, []string{"store", "type"})
  144. FilerStoreHistogram = prometheus.NewHistogramVec(
  145. prometheus.HistogramOpts{
  146. Namespace: Namespace,
  147. Subsystem: "filerStore",
  148. Name: "request_seconds",
  149. Help: "Bucketed histogram of filer store request processing time.",
  150. Buckets: prometheus.ExponentialBuckets(0.0001, 2, 24),
  151. }, []string{"store", "type"})
  152. FilerSyncOffsetGauge = prometheus.NewGaugeVec(
  153. prometheus.GaugeOpts{
  154. Namespace: Namespace,
  155. Subsystem: "filerSync",
  156. Name: "sync_offset",
  157. Help: "The offset of the filer synchronization service.",
  158. }, []string{"sourceFiler", "targetFiler", "clientName", "path"})
  159. VolumeServerRequestCounter = prometheus.NewCounterVec(
  160. prometheus.CounterOpts{
  161. Namespace: Namespace,
  162. Subsystem: "volumeServer",
  163. Name: "request_total",
  164. Help: "Counter of volume server requests.",
  165. }, []string{"type", "code"})
  166. VolumeServerHandlerCounter = prometheus.NewCounterVec(
  167. prometheus.CounterOpts{
  168. Namespace: Namespace,
  169. Subsystem: "volumeServer",
  170. Name: "handler_total",
  171. Help: "Counter of volume server handlers.",
  172. }, []string{"type"})
  173. VolumeServerVacuumingCompactCounter = prometheus.NewCounterVec(
  174. prometheus.CounterOpts{
  175. Namespace: Namespace,
  176. Subsystem: "volumeServer",
  177. Name: "vacuuming_compact_count",
  178. Help: "Counter of volume vacuuming Compact counter",
  179. }, []string{"success"})
  180. VolumeServerVacuumingCommitCounter = prometheus.NewCounterVec(
  181. prometheus.CounterOpts{
  182. Namespace: Namespace,
  183. Subsystem: "volumeServer",
  184. Name: "vacuuming_commit_count",
  185. Help: "Counter of volume vacuuming commit counter",
  186. }, []string{"success"})
  187. VolumeServerVacuumingHistogram = prometheus.NewHistogramVec(
  188. prometheus.HistogramOpts{
  189. Namespace: Namespace,
  190. Subsystem: "volumeServer",
  191. Name: "vacuuming_seconds",
  192. Help: "Bucketed histogram of volume server vacuuming processing time.",
  193. Buckets: prometheus.ExponentialBuckets(0.0001, 2, 24),
  194. }, []string{"type"})
  195. VolumeServerRequestHistogram = prometheus.NewHistogramVec(
  196. prometheus.HistogramOpts{
  197. Namespace: Namespace,
  198. Subsystem: "volumeServer",
  199. Name: "request_seconds",
  200. Help: "Bucketed histogram of volume server request processing time.",
  201. Buckets: prometheus.ExponentialBuckets(0.0001, 2, 24),
  202. }, []string{"type"})
  203. VolumeServerInFlightRequestsGauge = prometheus.NewGaugeVec(
  204. prometheus.GaugeOpts{
  205. Namespace: Namespace,
  206. Subsystem: "volumeServer",
  207. Name: "in_flight_requests",
  208. Help: "Current number of in-flight requests being handled by volume server.",
  209. }, []string{"type"})
  210. VolumeServerVolumeGauge = prometheus.NewGaugeVec(
  211. prometheus.GaugeOpts{
  212. Namespace: Namespace,
  213. Subsystem: "volumeServer",
  214. Name: "volumes",
  215. Help: "Number of volumes or shards.",
  216. }, []string{"collection", "type"})
  217. VolumeServerReadOnlyVolumeGauge = prometheus.NewGaugeVec(
  218. prometheus.GaugeOpts{
  219. Namespace: Namespace,
  220. Subsystem: "volumeServer",
  221. Name: "read_only_volumes",
  222. Help: "Number of read only volumes.",
  223. }, []string{"collection", "type"})
  224. VolumeServerMaxVolumeCounter = prometheus.NewGauge(
  225. prometheus.GaugeOpts{
  226. Namespace: Namespace,
  227. Subsystem: "volumeServer",
  228. Name: "max_volumes",
  229. Help: "Maximum number of volumes.",
  230. })
  231. VolumeServerDiskSizeGauge = prometheus.NewGaugeVec(
  232. prometheus.GaugeOpts{
  233. Namespace: Namespace,
  234. Subsystem: "volumeServer",
  235. Name: "total_disk_size",
  236. Help: "Actual disk size used by volumes.",
  237. }, []string{"collection", "type"})
  238. VolumeServerResourceGauge = prometheus.NewGaugeVec(
  239. prometheus.GaugeOpts{
  240. Namespace: Namespace,
  241. Subsystem: "volumeServer",
  242. Name: "resource",
  243. Help: "Resource usage",
  244. }, []string{"name", "type"})
  245. S3RequestCounter = prometheus.NewCounterVec(
  246. prometheus.CounterOpts{
  247. Namespace: Namespace,
  248. Subsystem: "s3",
  249. Name: "request_total",
  250. Help: "Counter of s3 requests.",
  251. }, []string{"type", "code", "bucket"})
  252. S3HandlerCounter = prometheus.NewCounterVec(
  253. prometheus.CounterOpts{
  254. Namespace: Namespace,
  255. Subsystem: "s3",
  256. Name: "handler_total",
  257. Help: "Counter of s3 server handlers.",
  258. }, []string{"type"})
  259. S3RequestHistogram = prometheus.NewHistogramVec(
  260. prometheus.HistogramOpts{
  261. Namespace: Namespace,
  262. Subsystem: "s3",
  263. Name: "request_seconds",
  264. Help: "Bucketed histogram of s3 request processing time.",
  265. Buckets: prometheus.ExponentialBuckets(0.0001, 2, 24),
  266. }, []string{"type", "bucket"})
  267. S3TimeToFirstByteHistogram = prometheus.NewHistogramVec(
  268. prometheus.HistogramOpts{
  269. Namespace: Namespace,
  270. Subsystem: "s3",
  271. Name: "time_to_first_byte_millisecond",
  272. Help: "Bucketed histogram of s3 time to first byte request processing time.",
  273. Buckets: prometheus.ExponentialBuckets(0.001, 2, 27),
  274. }, []string{"type", "bucket"})
  275. S3InFlightRequestsGauge = prometheus.NewGaugeVec(
  276. prometheus.GaugeOpts{
  277. Namespace: Namespace,
  278. Subsystem: "s3",
  279. Name: "in_flight_requests",
  280. Help: "Current number of in-flight requests being handled by s3.",
  281. }, []string{"type"})
  282. )
  283. func init() {
  284. Gather.MustRegister(MasterClientConnectCounter)
  285. Gather.MustRegister(MasterRaftIsleader)
  286. Gather.MustRegister(MasterAdminLock)
  287. Gather.MustRegister(MasterReceivedHeartbeatCounter)
  288. Gather.MustRegister(MasterLeaderChangeCounter)
  289. Gather.MustRegister(MasterReplicaPlacementMismatch)
  290. Gather.MustRegister(MasterVolumeLayoutWritable)
  291. Gather.MustRegister(MasterVolumeLayoutCrowded)
  292. Gather.MustRegister(MasterPickForWriteErrorCounter)
  293. Gather.MustRegister(MasterBroadcastToFullErrorCounter)
  294. Gather.MustRegister(FilerRequestCounter)
  295. Gather.MustRegister(FilerHandlerCounter)
  296. Gather.MustRegister(FilerRequestHistogram)
  297. Gather.MustRegister(FilerInFlightRequestsGauge)
  298. Gather.MustRegister(FilerStoreCounter)
  299. Gather.MustRegister(FilerStoreHistogram)
  300. Gather.MustRegister(FilerSyncOffsetGauge)
  301. Gather.MustRegister(FilerServerLastSendTsOfSubscribeGauge)
  302. Gather.MustRegister(collectors.NewGoCollector())
  303. Gather.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}))
  304. Gather.MustRegister(VolumeServerRequestCounter)
  305. Gather.MustRegister(VolumeServerHandlerCounter)
  306. Gather.MustRegister(VolumeServerRequestHistogram)
  307. Gather.MustRegister(VolumeServerInFlightRequestsGauge)
  308. Gather.MustRegister(VolumeServerVacuumingCompactCounter)
  309. Gather.MustRegister(VolumeServerVacuumingCommitCounter)
  310. Gather.MustRegister(VolumeServerVacuumingHistogram)
  311. Gather.MustRegister(VolumeServerVolumeGauge)
  312. Gather.MustRegister(VolumeServerMaxVolumeCounter)
  313. Gather.MustRegister(VolumeServerReadOnlyVolumeGauge)
  314. Gather.MustRegister(VolumeServerDiskSizeGauge)
  315. Gather.MustRegister(VolumeServerResourceGauge)
  316. Gather.MustRegister(S3RequestCounter)
  317. Gather.MustRegister(S3HandlerCounter)
  318. Gather.MustRegister(S3RequestHistogram)
  319. Gather.MustRegister(S3InFlightRequestsGauge)
  320. Gather.MustRegister(S3TimeToFirstByteHistogram)
  321. go bucketMetricTTLControl()
  322. }
  323. func LoopPushingMetric(name, instance, addr string, intervalSeconds int) {
  324. if addr == "" || intervalSeconds == 0 {
  325. return
  326. }
  327. glog.V(0).Infof("%s server sends metrics to %s every %d seconds", name, addr, intervalSeconds)
  328. pusher := push.New(addr, name).Gatherer(Gather).Grouping("instance", instance)
  329. for {
  330. err := pusher.Push()
  331. if err != nil && !strings.HasPrefix(err.Error(), "unexpected status code 200") {
  332. glog.V(0).Infof("could not push metrics to prometheus push gateway %s: %v", addr, err)
  333. }
  334. if intervalSeconds <= 0 {
  335. intervalSeconds = 15
  336. }
  337. time.Sleep(time.Duration(intervalSeconds) * time.Second)
  338. }
  339. }
  // JoinHostPort combines host and port into a "host:port" address.
  //
  // A host that is already wrapped in square brackets (for example "[::1]")
  // is used verbatim and only ":port" is appended; any other host is handed
  // to net.JoinHostPort, which adds brackets itself when the host needs them.
  func JoinHostPort(host string, port int) string {
  	p := strconv.Itoa(port)

  	alreadyBracketed := strings.HasPrefix(host, "[") && strings.HasSuffix(host, "]")
  	if !alreadyBracketed {
  		return net.JoinHostPort(host, p)
  	}
  	return host + ":" + p
  }
  347. func StartMetricsServer(ip string, port int) {
  348. if port == 0 {
  349. return
  350. }
  351. http.Handle("/metrics", promhttp.HandlerFor(Gather, promhttp.HandlerOpts{}))
  352. log.Fatal(http.ListenAndServe(JoinHostPort(ip, port), nil))
  353. }
  // SourceName returns "<hostname>:<port>" for the local machine, using the
  // name reported by os.Hostname. If the hostname cannot be determined it
  // returns the literal "unknown" (without a port).
  func SourceName(port uint32) string {
  	if host, err := os.Hostname(); err == nil {
  		return net.JoinHostPort(host, strconv.Itoa(int(port)))
  	}
  	return "unknown"
  }
  361. func RecordBucketActiveTime(bucket string) {
  362. bucketLastActiveLock.Lock()
  363. bucketLastActiveTsNs[bucket] = time.Now().UnixNano()
  364. bucketLastActiveLock.Unlock()
  365. }
  366. func DeleteCollectionMetrics(collection string) {
  367. labels := prometheus.Labels{"collection": collection}
  368. c := MasterReplicaPlacementMismatch.DeletePartialMatch(labels)
  369. c += MasterVolumeLayoutWritable.DeletePartialMatch(labels)
  370. c += MasterVolumeLayoutCrowded.DeletePartialMatch(labels)
  371. c += VolumeServerDiskSizeGauge.DeletePartialMatch(labels)
  372. c += VolumeServerVolumeGauge.DeletePartialMatch(labels)
  373. c += VolumeServerReadOnlyVolumeGauge.DeletePartialMatch(labels)
  374. glog.V(0).Infof("delete collection metrics, %s: %d", collection, c)
  375. }
  376. func bucketMetricTTLControl() {
  377. ttlNs := bucketAtiveTTL.Nanoseconds()
  378. for {
  379. now := time.Now().UnixNano()
  380. bucketLastActiveLock.Lock()
  381. for bucket, ts := range bucketLastActiveTsNs {
  382. if (now - ts) > ttlNs {
  383. delete(bucketLastActiveTsNs, bucket)
  384. labels := prometheus.Labels{"bucket": bucket}
  385. c := S3RequestCounter.DeletePartialMatch(labels)
  386. c += S3RequestHistogram.DeletePartialMatch(labels)
  387. c += S3TimeToFirstByteHistogram.DeletePartialMatch(labels)
  388. glog.V(0).Infof("delete inactive bucket metrics, %s: %d", bucket, c)
  389. }
  390. }
  391. bucketLastActiveLock.Unlock()
  392. time.Sleep(bucketAtiveTTL)
  393. }
  394. }