command_volume_check_disk.go

package shell

import (
	"bytes"
	"context"
	"flag"
	"fmt"
	"io"
	"math"
	"net/http"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/operation"
	"github.com/seaweedfs/seaweedfs/weed/pb"
	"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
	"github.com/seaweedfs/seaweedfs/weed/storage/needle_map"
	"golang.org/x/exp/slices"
	"google.golang.org/grpc"
)

func init() {
	Commands = append(Commands, &commandVolumeCheckDisk{})
}
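
// commandVolumeCheckDisk registers the volume.check.disk shell command; env and
// syncDeletions are set in Do and reused by the helper methods below.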
type commandVolumeCheckDisk struct {
	env           *CommandEnv
	syncDeletions *bool
}

func (c *commandVolumeCheckDisk) Name() string {
	return "volume.check.disk"
}

func (c *commandVolumeCheckDisk) Help() string {
	return `check all replicated volumes to find and fix inconsistencies. It is optional and resource intensive.

	How it works:

	find all volumes that are replicated
	  for each volume id, if there are more than 2 replicas, find the pair with the largest 2 file counts.
	  for the pair of volumes A and B
	    append entries in A but not in B to B
	    append entries in B but not in A to A

`
}
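
// Do parses the flags, collects the cluster topology, and checks each
// replicated volume pairwise. A sketch of a typical invocation from weed shell
// (lock/unlock are the shell's exclusive-lock commands, required before
// commands that may modify state):
//
//	lock
//	volume.check.disk -v -force
//	unlock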
func (c *commandVolumeCheckDisk) Do(args []string, commandEnv *CommandEnv, writer io.Writer) (err error) {

	fsckCommand := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
	slowMode := fsckCommand.Bool("slow", false, "slow mode checks all replicas even when file counts are the same")
	verbose := fsckCommand.Bool("v", false, "verbose mode")
	volumeId := fsckCommand.Uint("volumeId", 0, "the volume id")
	applyChanges := fsckCommand.Bool("force", false, "apply the fix")
	c.syncDeletions = fsckCommand.Bool("syncDeleted", false, "also sync deletions when applying the fix")
	nonRepairThreshold := fsckCommand.Float64("nonRepairThreshold", 0.3, "repair only when the fraction of missing keys does not exceed this limit")
	if err = fsckCommand.Parse(args); err != nil {
		return nil
	}
	infoAboutSimulationMode(writer, *applyChanges, "-force")

	if err = commandEnv.confirmIsLocked(args); err != nil {
		return
	}

	c.env = commandEnv

	// collect topology information
	topologyInfo, _, err := collectTopologyInfo(commandEnv, 0)
	if err != nil {
		return err
	}
	volumeReplicas, _ := collectVolumeReplicaLocations(topologyInfo)

	// pick one pair of volume replicas at a time
	fileCount := func(replica *VolumeReplica) uint64 {
		return replica.info.FileCount - replica.info.DeleteCount
	}

	for _, replicas := range volumeReplicas {
		if *volumeId > 0 && replicas[0].info.Id != uint32(*volumeId) {
			continue
		}
		// sort replicas by live file count, descending; compare explicitly
		// instead of casting an unsigned difference to int
		slices.SortFunc(replicas, func(a, b *VolumeReplica) int {
			switch {
			case fileCount(a) > fileCount(b):
				return -1
			case fileCount(a) < fileCount(b):
				return 1
			}
			return 0
		})
		for len(replicas) >= 2 {
			a, b := replicas[0], replicas[1]
			if !*slowMode {
				if fileCount(a) == fileCount(b) {
					replicas = replicas[1:]
					continue
				}
			}
			if a.info.ReadOnly || b.info.ReadOnly {
				fmt.Fprintf(writer, "skipping readonly volume %d on %s and %s\n", a.info.Id, a.location.dataNode.Id, b.location.dataNode.Id)
				replicas = replicas[1:]
				continue
			}
			if err := c.syncTwoReplicas(a, b, *applyChanges, *nonRepairThreshold, *verbose, writer); err != nil {
				fmt.Fprintf(writer, "sync volume %d on %s and %s: %v\n", a.info.Id, a.location.dataNode.Id, b.location.dataNode.Id, err)
			}
			// drop the replica with the largest file count and sweep the rest
			replicas = replicas[1:]
		}
	}
	return nil
}
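
// syncTwoReplicas repeatedly reconciles replicas a and b until neither side
// reports changes, so entries copied in one pass are re-verified in the next.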
func (c *commandVolumeCheckDisk) syncTwoReplicas(a *VolumeReplica, b *VolumeReplica, applyChanges bool, nonRepairThreshold float64, verbose bool, writer io.Writer) (err error) {
	aHasChanges, bHasChanges := true, true
	for aHasChanges || bHasChanges {
		if aHasChanges, bHasChanges, err = c.checkBoth(a, b, applyChanges, nonRepairThreshold, verbose, writer); err != nil {
			return err
		}
	}
	return nil
}
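
// checkBoth loads the needle index of each replica into an in-memory map and
// repairs each side against the other. The cutoff timestamp is taken before
// reading the indexes, so entries appended after the check started are not
// mistaken for missing ones.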
func (c *commandVolumeCheckDisk) checkBoth(a *VolumeReplica, b *VolumeReplica, applyChanges bool, nonRepairThreshold float64, verbose bool, writer io.Writer) (aHasChanges bool, bHasChanges bool, err error) {
	aDB, bDB := needle_map.NewMemDb(), needle_map.NewMemDb()
	defer func() {
		aDB.Close()
		bDB.Close()
	}()

	// read index databases
	readIndexDbCutoffFrom := uint64(time.Now().UnixNano())
	if err = c.readIndexDatabase(aDB, a.info.Collection, a.info.Id, pb.NewServerAddressFromDataNode(a.location.dataNode), verbose, writer); err != nil {
		return true, true, fmt.Errorf("readIndexDatabase %s volume %d: %v", a.location.dataNode, a.info.Id, err)
	}
	if err := c.readIndexDatabase(bDB, b.info.Collection, b.info.Id, pb.NewServerAddressFromDataNode(b.location.dataNode), verbose, writer); err != nil {
		return true, true, fmt.Errorf("readIndexDatabase %s volume %d: %v", b.location.dataNode, b.info.Id, err)
	}

	// find and make up the differences in both directions
	if aHasChanges, err = c.doVolumeCheckDisk(bDB, aDB, b, a, verbose, writer, applyChanges, nonRepairThreshold, readIndexDbCutoffFrom); err != nil {
		return true, true, fmt.Errorf("doVolumeCheckDisk source:%s target:%s volume %d: %v", b.location.dataNode.Id, a.location.dataNode.Id, b.info.Id, err)
	}
	if bHasChanges, err = c.doVolumeCheckDisk(aDB, bDB, a, b, verbose, writer, applyChanges, nonRepairThreshold, readIndexDbCutoffFrom); err != nil {
		return true, true, fmt.Errorf("doVolumeCheckDisk source:%s target:%s volume %d: %v", a.location.dataNode.Id, b.location.dataNode.Id, a.info.Id, err)
	}
	return
}
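
// doVolumeCheckDisk walks the minuend index in descending key order and, for
// every live needle the subtrahend lacks, copies the needle blob from the
// source replica to the target replica. Needles appended after cutoffFromAtNs
// are skipped, and the repair aborts if too large a fraction of keys is missing.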
func (c *commandVolumeCheckDisk) doVolumeCheckDisk(minuend, subtrahend *needle_map.MemDb, source, target *VolumeReplica, verbose bool, writer io.Writer, applyChanges bool, nonRepairThreshold float64, cutoffFromAtNs uint64) (hasChanges bool, err error) {

	// find missing keys
	// hash join, can be more efficient
	var missingNeedles []needle_map.NeedleValue
	var partiallyDeletedNeedles []needle_map.NeedleValue
	var counter int
	doCutoffOfLastNeedle := true
	minuend.DescendingVisit(func(minuendValue needle_map.NeedleValue) error {
		counter++
		if subtrahendValue, found := subtrahend.Get(minuendValue.Key); !found {
			if minuendValue.Size.IsDeleted() {
				return nil
			}
			if doCutoffOfLastNeedle {
				if needleMeta, err := readNeedleMeta(c.env.option.GrpcDialOption, pb.NewServerAddressFromDataNode(source.location.dataNode), source.info.Id, minuendValue); err == nil {
					// needles appended after the cutoff time are not considered missing yet
					if needleMeta.AppendAtNs > cutoffFromAtNs {
						return nil
					}
					// this needle predates the cutoff; no need to check append times again
					doCutoffOfLastNeedle = false
				}
			}
			missingNeedles = append(missingNeedles, minuendValue)
		} else {
			if minuendValue.Size.IsDeleted() && !subtrahendValue.Size.IsDeleted() {
				partiallyDeletedNeedles = append(partiallyDeletedNeedles, minuendValue)
			}
			if doCutoffOfLastNeedle {
				doCutoffOfLastNeedle = false
			}
		}
		return nil
	})

	fmt.Fprintf(writer, "volume %d %s has %d entries, %s missed %d and partially deleted %d entries\n",
		source.info.Id, source.location.dataNode.Id, counter, target.location.dataNode.Id, len(missingNeedles), len(partiallyDeletedNeedles))

	if counter == 0 || (len(missingNeedles) == 0 && len(partiallyDeletedNeedles) == 0) {
		return false, nil
	}

	missingNeedlesFraction := float64(len(missingNeedles)) / float64(counter)
	if missingNeedlesFraction > nonRepairThreshold {
		return false, fmt.Errorf(
			"failed to start repair volume %d, percentage of missing keys is greater than the threshold: %.2f > %.2f",
			source.info.Id, missingNeedlesFraction, nonRepairThreshold)
	}

	for _, needleValue := range missingNeedles {
		needleBlob, err := readSourceNeedleBlob(c.env.option.GrpcDialOption, pb.NewServerAddressFromDataNode(source.location.dataNode), source.info.Id, needleValue)
		if err != nil {
			return hasChanges, err
		}
		// in simulation mode, stop after verifying the source blob is readable
		if !applyChanges {
			continue
		}
		if verbose {
			fmt.Fprintf(writer, "read %s %s => %s\n", needleValue.Key.FileId(source.info.Id), source.location.dataNode.Id, target.location.dataNode.Id)
		}
		hasChanges = true
		if err = c.writeNeedleBlobToTarget(pb.NewServerAddressFromDataNode(target.location.dataNode), source.info.Id, needleValue, needleBlob); err != nil {
			return hasChanges, err
		}
	}

	if *c.syncDeletions && len(partiallyDeletedNeedles) > 0 {
		var fidList []string
		for _, needleValue := range partiallyDeletedNeedles {
			fidList = append(fidList, needleValue.Key.FileId(source.info.Id))
			if verbose {
				fmt.Fprintf(writer, "delete %s %s => %s\n", needleValue.Key.FileId(source.info.Id), source.location.dataNode.Id, target.location.dataNode.Id)
			}
		}
		deleteResults, deleteErr := operation.DeleteFilesAtOneVolumeServer(
			pb.NewServerAddressFromDataNode(target.location.dataNode),
			c.env.option.GrpcDialOption, fidList, false)
		if deleteErr != nil {
			return hasChanges, deleteErr
		}
		for _, deleteResult := range deleteResults {
			if deleteResult.Status == http.StatusAccepted {
				// at least one deletion went through; report the change so the
				// caller re-checks both replicas
				hasChanges = true
				return
			}
		}
	}
	return
}
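
// readSourceNeedleBlob fetches one needle's raw bytes from the source volume
// server at the offset and size recorded in its index entry.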
func readSourceNeedleBlob(grpcDialOption grpc.DialOption, sourceVolumeServer pb.ServerAddress, volumeId uint32, needleValue needle_map.NeedleValue) (needleBlob []byte, err error) {
	err = operation.WithVolumeServerClient(false, sourceVolumeServer, grpcDialOption, func(client volume_server_pb.VolumeServerClient) error {
		resp, err := client.ReadNeedleBlob(context.Background(), &volume_server_pb.ReadNeedleBlobRequest{
			VolumeId: volumeId,
			Offset:   needleValue.Offset.ToActualOffset(),
			Size:     int32(needleValue.Size),
		})
		if err != nil {
			return err
		}
		needleBlob = resp.NeedleBlob
		return nil
	})
	return
}
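
// writeNeedleBlobToTarget writes a needle blob, read from the source replica,
// into the same volume on the target volume server.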
func (c *commandVolumeCheckDisk) writeNeedleBlobToTarget(targetVolumeServer pb.ServerAddress, volumeId uint32, needleValue needle_map.NeedleValue, needleBlob []byte) error {
	return operation.WithVolumeServerClient(false, targetVolumeServer, c.env.option.GrpcDialOption, func(client volume_server_pb.VolumeServerClient) error {
		_, err := client.WriteNeedleBlob(context.Background(), &volume_server_pb.WriteNeedleBlobRequest{
			VolumeId:   volumeId,
			NeedleId:   uint64(needleValue.Key),
			Size:       int32(needleValue.Size),
			NeedleBlob: needleBlob,
		})
		return err
	})
}
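
// readIndexDatabase copies a volume's .idx file into memory and loads it into db.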
func (c *commandVolumeCheckDisk) readIndexDatabase(db *needle_map.MemDb, collection string, volumeId uint32, volumeServer pb.ServerAddress, verbose bool, writer io.Writer) error {
	var buf bytes.Buffer
	if err := c.copyVolumeIndexFile(collection, volumeId, volumeServer, &buf, verbose, writer); err != nil {
		return err
	}

	if verbose {
		fmt.Fprintf(writer, "load collection %s volume %d index size %d from %s ...\n", collection, volumeId, buf.Len(), volumeServer)
	}
	return db.LoadFilterFromReaderAt(bytes.NewReader(buf.Bytes()), true, false)
}
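
// copyVolumeIndexFile streams the volume's .idx file from the volume server into buf.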
func (c *commandVolumeCheckDisk) copyVolumeIndexFile(collection string, volumeId uint32, volumeServer pb.ServerAddress, buf *bytes.Buffer, verbose bool, writer io.Writer) error {
	return operation.WithVolumeServerClient(true, volumeServer, c.env.option.GrpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
		ext := ".idx"
		copyFileClient, err := volumeServerClient.CopyFile(context.Background(), &volume_server_pb.CopyFileRequest{
			VolumeId:                 volumeId,
			Ext:                      ext,
			CompactionRevision:       math.MaxUint32,
			StopOffset:               math.MaxInt64,
			Collection:               collection,
			IsEcVolume:               false,
			IgnoreSourceFileNotFound: false,
		})
		if err != nil {
			return fmt.Errorf("failed to start copying volume %d%s: %v", volumeId, ext, err)
		}
		err = writeToBuffer(copyFileClient, buf)
		if err != nil {
			return fmt.Errorf("failed to copy %d%s from %s: %v", volumeId, ext, volumeServer, err)
		}
		return nil
	})
}
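
// writeToBuffer drains the CopyFile stream into buf until EOF.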
func writeToBuffer(client volume_server_pb.VolumeServer_CopyFileClient, buf *bytes.Buffer) error {
	for {
		resp, receiveErr := client.Recv()
		if receiveErr == io.EOF {
			break
		}
		if receiveErr != nil {
			return fmt.Errorf("receiving: %v", receiveErr)
		}
		buf.Write(resp.FileContent)
	}
	return nil
}