command_cluster_check.go 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261
  1. package shell
  2. import (
  3. "context"
  4. "flag"
  5. "fmt"
  6. "io"
  7. "github.com/seaweedfs/seaweedfs/weed/cluster"
  8. "github.com/seaweedfs/seaweedfs/weed/pb"
  9. "github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
  10. "github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
  11. "github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
  12. )
  13. func init() {
  14. Commands = append(Commands, &commandClusterCheck{})
  15. }
// commandClusterCheck implements the "cluster.check" shell command, which
// verifies network connectivity between the cluster components (masters,
// volume servers, and filers) by pinging each pair in both directions.
type commandClusterCheck struct {
}
// Name returns the shell command name used to invoke this check.
func (c *commandClusterCheck) Name() string {
	return "cluster.check"
}
// Help returns the usage text shown by the shell for this command.
// The raw string is emitted verbatim to the user.
func (c *commandClusterCheck) Help() string {
	return `check current cluster network connectivity
cluster.check
`
}
  26. func (c *commandClusterCheck) Do(args []string, commandEnv *CommandEnv, writer io.Writer) (err error) {
  27. clusterPsCommand := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
  28. if err = clusterPsCommand.Parse(args); err != nil {
  29. return nil
  30. }
  31. // collect topology information
  32. topologyInfo, volumeSizeLimitMb, err := collectTopologyInfo(commandEnv, 0)
  33. if err != nil {
  34. return err
  35. }
  36. fmt.Fprintf(writer, "Topology volumeSizeLimit:%d MB%s\n", volumeSizeLimitMb, diskInfosToString(topologyInfo.DiskInfos))
  37. if len(topologyInfo.DiskInfos) == 0 {
  38. return fmt.Errorf("no disk type defined")
  39. }
  40. for diskType, diskInfo := range topologyInfo.DiskInfos {
  41. if diskInfo.MaxVolumeCount == 0 {
  42. return fmt.Errorf("no volume available for \"%s\" disk type", diskType)
  43. }
  44. }
  45. // collect filers
  46. var filers []pb.ServerAddress
  47. err = commandEnv.MasterClient.WithClient(false, func(client master_pb.SeaweedClient) error {
  48. resp, err := client.ListClusterNodes(context.Background(), &master_pb.ListClusterNodesRequest{
  49. ClientType: cluster.FilerType,
  50. FilerGroup: *commandEnv.option.FilerGroup,
  51. })
  52. for _, node := range resp.ClusterNodes {
  53. filers = append(filers, pb.ServerAddress(node.Address))
  54. }
  55. return err
  56. })
  57. if err != nil {
  58. return
  59. }
  60. fmt.Fprintf(writer, "the cluster has %d filers: %+v\n", len(filers), filers)
  61. if len(filers) > 0 {
  62. genericDiskInfo, genericDiskInfoOk := topologyInfo.DiskInfos[""]
  63. hddDiskInfo, hddDiskInfoOk := topologyInfo.DiskInfos["hdd"]
  64. if !genericDiskInfoOk && !hddDiskInfoOk {
  65. return fmt.Errorf("filer metadata logs need generic or hdd disk type to be defined")
  66. }
  67. if (genericDiskInfoOk && genericDiskInfo.MaxVolumeCount == 0) || (hddDiskInfoOk && hddDiskInfo.MaxVolumeCount == 0) {
  68. return fmt.Errorf("filer metadata logs need generic or hdd volumes to be available")
  69. }
  70. }
  71. // collect volume servers
  72. var volumeServers []pb.ServerAddress
  73. t, _, err := collectTopologyInfo(commandEnv, 0)
  74. if err != nil {
  75. return err
  76. }
  77. for _, dc := range t.DataCenterInfos {
  78. for _, r := range dc.RackInfos {
  79. for _, dn := range r.DataNodeInfos {
  80. volumeServers = append(volumeServers, pb.NewServerAddressFromDataNode(dn))
  81. }
  82. }
  83. }
  84. fmt.Fprintf(writer, "the cluster has %d volume servers: %+v\n", len(volumeServers), volumeServers)
  85. // collect all masters
  86. var masters []pb.ServerAddress
  87. masters = append(masters, commandEnv.MasterClient.GetMasters()...)
  88. // check from master to volume servers
  89. for _, master := range masters {
  90. for _, volumeServer := range volumeServers {
  91. fmt.Fprintf(writer, "checking master %s to volume server %s ... ", string(master), string(volumeServer))
  92. err := pb.WithMasterClient(false, master, commandEnv.option.GrpcDialOption, false, func(client master_pb.SeaweedClient) error {
  93. pong, err := client.Ping(context.Background(), &master_pb.PingRequest{
  94. Target: string(volumeServer),
  95. TargetType: cluster.VolumeServerType,
  96. })
  97. if err == nil {
  98. printTiming(writer, pong.StartTimeNs, pong.RemoteTimeNs, pong.StopTimeNs)
  99. }
  100. return err
  101. })
  102. if err != nil {
  103. fmt.Fprintf(writer, "%v\n", err)
  104. }
  105. }
  106. }
  107. // check between masters
  108. for _, sourceMaster := range masters {
  109. for _, targetMaster := range masters {
  110. if sourceMaster == targetMaster {
  111. continue
  112. }
  113. fmt.Fprintf(writer, "checking master %s to %s ... ", string(sourceMaster), string(targetMaster))
  114. err := pb.WithMasterClient(false, sourceMaster, commandEnv.option.GrpcDialOption, false, func(client master_pb.SeaweedClient) error {
  115. pong, err := client.Ping(context.Background(), &master_pb.PingRequest{
  116. Target: string(targetMaster),
  117. TargetType: cluster.MasterType,
  118. })
  119. if err == nil {
  120. printTiming(writer, pong.StartTimeNs, pong.RemoteTimeNs, pong.StopTimeNs)
  121. }
  122. return err
  123. })
  124. if err != nil {
  125. fmt.Fprintf(writer, "%v\n", err)
  126. }
  127. }
  128. }
  129. // check from volume servers to masters
  130. for _, volumeServer := range volumeServers {
  131. for _, master := range masters {
  132. fmt.Fprintf(writer, "checking volume server %s to master %s ... ", string(volumeServer), string(master))
  133. err := pb.WithVolumeServerClient(false, volumeServer, commandEnv.option.GrpcDialOption, func(client volume_server_pb.VolumeServerClient) error {
  134. pong, err := client.Ping(context.Background(), &volume_server_pb.PingRequest{
  135. Target: string(master),
  136. TargetType: cluster.MasterType,
  137. })
  138. if err == nil {
  139. printTiming(writer, pong.StartTimeNs, pong.RemoteTimeNs, pong.StopTimeNs)
  140. }
  141. return err
  142. })
  143. if err != nil {
  144. fmt.Fprintf(writer, "%v\n", err)
  145. }
  146. }
  147. }
  148. // check from filers to masters
  149. for _, filer := range filers {
  150. for _, master := range masters {
  151. fmt.Fprintf(writer, "checking filer %s to master %s ... ", string(filer), string(master))
  152. err := pb.WithFilerClient(false, 0, filer, commandEnv.option.GrpcDialOption, func(client filer_pb.SeaweedFilerClient) error {
  153. pong, err := client.Ping(context.Background(), &filer_pb.PingRequest{
  154. Target: string(master),
  155. TargetType: cluster.MasterType,
  156. })
  157. if err == nil {
  158. printTiming(writer, pong.StartTimeNs, pong.RemoteTimeNs, pong.StopTimeNs)
  159. }
  160. return err
  161. })
  162. if err != nil {
  163. fmt.Fprintf(writer, "%v\n", err)
  164. }
  165. }
  166. }
  167. // check from filers to volume servers
  168. for _, filer := range filers {
  169. for _, volumeServer := range volumeServers {
  170. fmt.Fprintf(writer, "checking filer %s to volume server %s ... ", string(filer), string(volumeServer))
  171. err := pb.WithFilerClient(false, 0, filer, commandEnv.option.GrpcDialOption, func(client filer_pb.SeaweedFilerClient) error {
  172. pong, err := client.Ping(context.Background(), &filer_pb.PingRequest{
  173. Target: string(volumeServer),
  174. TargetType: cluster.VolumeServerType,
  175. })
  176. if err == nil {
  177. printTiming(writer, pong.StartTimeNs, pong.RemoteTimeNs, pong.StopTimeNs)
  178. }
  179. return err
  180. })
  181. if err != nil {
  182. fmt.Fprintf(writer, "%v\n", err)
  183. }
  184. }
  185. }
  186. // check between volume servers
  187. for _, sourceVolumeServer := range volumeServers {
  188. for _, targetVolumeServer := range volumeServers {
  189. if sourceVolumeServer == targetVolumeServer {
  190. continue
  191. }
  192. fmt.Fprintf(writer, "checking volume server %s to %s ... ", string(sourceVolumeServer), string(targetVolumeServer))
  193. err := pb.WithVolumeServerClient(false, sourceVolumeServer, commandEnv.option.GrpcDialOption, func(client volume_server_pb.VolumeServerClient) error {
  194. pong, err := client.Ping(context.Background(), &volume_server_pb.PingRequest{
  195. Target: string(targetVolumeServer),
  196. TargetType: cluster.VolumeServerType,
  197. })
  198. if err == nil {
  199. printTiming(writer, pong.StartTimeNs, pong.RemoteTimeNs, pong.StopTimeNs)
  200. }
  201. return err
  202. })
  203. if err != nil {
  204. fmt.Fprintf(writer, "%v\n", err)
  205. }
  206. }
  207. }
  208. // check between filers, and need to connect to itself
  209. for _, sourceFiler := range filers {
  210. for _, targetFiler := range filers {
  211. fmt.Fprintf(writer, "checking filer %s to %s ... ", string(sourceFiler), string(targetFiler))
  212. err := pb.WithFilerClient(false, 0, sourceFiler, commandEnv.option.GrpcDialOption, func(client filer_pb.SeaweedFilerClient) error {
  213. pong, err := client.Ping(context.Background(), &filer_pb.PingRequest{
  214. Target: string(targetFiler),
  215. TargetType: cluster.FilerType,
  216. })
  217. if err == nil {
  218. printTiming(writer, pong.StartTimeNs, pong.RemoteTimeNs, pong.StopTimeNs)
  219. }
  220. return err
  221. })
  222. if err != nil {
  223. fmt.Fprintf(writer, "%v\n", err)
  224. }
  225. }
  226. }
  227. return nil
  228. }
  229. func printTiming(writer io.Writer, startNs, remoteNs, stopNs int64) {
  230. roundTripTimeMs := float32(stopNs-startNs) / 1000000
  231. deltaTimeMs := float32(remoteNs-(startNs+stopNs)/2) / 1000000
  232. fmt.Fprintf(writer, "ok round trip %.3fms clock delta %.3fms\n", roundTripTimeMs, deltaTimeMs)
  233. }