123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617 |
- package shell
- import (
- "context"
- "flag"
- "fmt"
- "io"
- "path/filepath"
- "strconv"
- "time"
- "github.com/seaweedfs/seaweedfs/weed/pb"
- "github.com/seaweedfs/seaweedfs/weed/storage/needle"
- "github.com/seaweedfs/seaweedfs/weed/storage/needle_map"
- "github.com/seaweedfs/seaweedfs/weed/storage/types"
- "golang.org/x/exp/slices"
- "google.golang.org/grpc"
- "github.com/seaweedfs/seaweedfs/weed/operation"
- "github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
- "github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
- "github.com/seaweedfs/seaweedfs/weed/storage/super_block"
- )
- func init() {
- Commands = append(Commands, &commandVolumeFixReplication{})
- }
- type commandVolumeFixReplication struct {
- collectionPattern *string
- }
- func (c *commandVolumeFixReplication) Name() string {
- return "volume.fix.replication"
- }
- func (c *commandVolumeFixReplication) Help() string {
- return `add or remove replicas to volumes that are missing replicas or over-replicated
- This command finds all over-replicated volumes. If found, it will purge the oldest copies and stop.
- This command also finds all under-replicated volumes, and finds volume servers with free slots.
- If the free slots satisfy the replication requirement, the volume content is copied over and mounted.
- volume.fix.replication -n # do not take action
- volume.fix.replication # actually deleting or copying the volume files and mount the volume
- volume.fix.replication -collectionPattern=important* # fix any collections with prefix "important"
- Note:
- * each time this will only add back one replica for each volume id that is under replicated.
- If there are multiple replicas are missing, e.g. replica count is > 2, you may need to run this multiple times.
- * do not run this too quickly within seconds, since the new volume replica may take a few seconds
- to register itself to the master.
- `
- }
- func (c *commandVolumeFixReplication) Do(args []string, commandEnv *CommandEnv, writer io.Writer) (err error) {
- volFixReplicationCommand := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
- c.collectionPattern = volFixReplicationCommand.String("collectionPattern", "", "match with wildcard characters '*' and '?'")
- skipChange := volFixReplicationCommand.Bool("n", false, "skip the changes")
- doDelete := volFixReplicationCommand.Bool("doDelete", true, "Also delete over-replicated volumes besides fixing under-replication")
- doCheck := volFixReplicationCommand.Bool("doCheck", true, "Also check synchronization before deleting")
- retryCount := volFixReplicationCommand.Int("retry", 5, "how many times to retry")
- volumesPerStep := volFixReplicationCommand.Int("volumesPerStep", 0, "how many volumes to fix in one cycle")
- if err = volFixReplicationCommand.Parse(args); err != nil {
- return nil
- }
- if err = commandEnv.confirmIsLocked(args); err != nil {
- return
- }
- takeAction := !*skipChange
- underReplicatedVolumeIdsCount := 1
- for underReplicatedVolumeIdsCount > 0 {
- fixedVolumeReplicas := map[string]int{}
- // collect topology information
- topologyInfo, _, err := collectTopologyInfo(commandEnv, 15*time.Second)
- if err != nil {
- return err
- }
- // find all volumes that needs replication
- // collect all data nodes
- volumeReplicas, allLocations := collectVolumeReplicaLocations(topologyInfo)
- if len(allLocations) == 0 {
- return fmt.Errorf("no data nodes at all")
- }
- // find all under replicated volumes
- var underReplicatedVolumeIds, overReplicatedVolumeIds, misplacedVolumeIds []uint32
- for vid, replicas := range volumeReplicas {
- replica := replicas[0]
- replicaPlacement, _ := super_block.NewReplicaPlacementFromByte(byte(replica.info.ReplicaPlacement))
- switch {
- case replicaPlacement.GetCopyCount() > len(replicas):
- underReplicatedVolumeIds = append(underReplicatedVolumeIds, vid)
- case isMisplaced(replicas, replicaPlacement):
- misplacedVolumeIds = append(misplacedVolumeIds, vid)
- fmt.Fprintf(writer, "volume %d replication %s is not well placed %s\n", replica.info.Id, replicaPlacement, replica.location.dataNode.Id)
- case replicaPlacement.GetCopyCount() < len(replicas):
- overReplicatedVolumeIds = append(overReplicatedVolumeIds, vid)
- fmt.Fprintf(writer, "volume %d replication %s, but over replicated %+d\n", replica.info.Id, replicaPlacement, len(replicas))
- }
- }
- if !commandEnv.isLocked() {
- return fmt.Errorf("lock is lost")
- }
- if len(overReplicatedVolumeIds) > 0 && *doDelete {
- if err := c.deleteOneVolume(commandEnv, writer, takeAction, *doCheck, overReplicatedVolumeIds, volumeReplicas, allLocations, pickOneReplicaToDelete); err != nil {
- return err
- }
- }
- if len(misplacedVolumeIds) > 0 && *doDelete {
- if err := c.deleteOneVolume(commandEnv, writer, takeAction, *doCheck, misplacedVolumeIds, volumeReplicas, allLocations, pickOneMisplacedVolume); err != nil {
- return err
- }
- }
- underReplicatedVolumeIdsCount = len(underReplicatedVolumeIds)
- if underReplicatedVolumeIdsCount > 0 {
- // find the most under populated data nodes
- fixedVolumeReplicas, err = c.fixUnderReplicatedVolumes(commandEnv, writer, takeAction, underReplicatedVolumeIds, volumeReplicas, allLocations, *retryCount, *volumesPerStep)
- if err != nil {
- return err
- }
- }
- if *skipChange {
- break
- }
- // check that the topology has been updated
- if len(fixedVolumeReplicas) > 0 {
- fixedVolumes := make([]string, 0, len(fixedVolumeReplicas))
- for k, _ := range fixedVolumeReplicas {
- fixedVolumes = append(fixedVolumes, k)
- }
- volumeIdLocations, err := lookupVolumeIds(commandEnv, fixedVolumes)
- if err != nil {
- return err
- }
- for _, volumeIdLocation := range volumeIdLocations {
- volumeId := volumeIdLocation.VolumeOrFileId
- volumeIdLocationCount := len(volumeIdLocation.Locations)
- i := 0
- for fixedVolumeReplicas[volumeId] >= volumeIdLocationCount {
- fmt.Fprintf(writer, "the number of locations for volume %s has not increased yet, let's wait\n", volumeId)
- time.Sleep(time.Duration(i+1) * time.Second * 7)
- volumeLocIds, err := lookupVolumeIds(commandEnv, []string{volumeId})
- if err != nil {
- return err
- }
- volumeIdLocationCount = len(volumeLocIds[0].Locations)
- if *retryCount <= i {
- return fmt.Errorf("replicas volume %s mismatch in topology", volumeId)
- }
- i += 1
- }
- }
- }
- }
- return nil
- }
- func collectVolumeReplicaLocations(topologyInfo *master_pb.TopologyInfo) (map[uint32][]*VolumeReplica, []location) {
- volumeReplicas := make(map[uint32][]*VolumeReplica)
- var allLocations []location
- eachDataNode(topologyInfo, func(dc string, rack RackId, dn *master_pb.DataNodeInfo) {
- loc := newLocation(dc, string(rack), dn)
- for _, diskInfo := range dn.DiskInfos {
- for _, v := range diskInfo.VolumeInfos {
- volumeReplicas[v.Id] = append(volumeReplicas[v.Id], &VolumeReplica{
- location: &loc,
- info: v,
- })
- }
- }
- allLocations = append(allLocations, loc)
- })
- return volumeReplicas, allLocations
- }
- type SelectOneVolumeFunc func(replicas []*VolumeReplica, replicaPlacement *super_block.ReplicaPlacement) *VolumeReplica
- func checkOneVolume(a *VolumeReplica, b *VolumeReplica, writer io.Writer, grpcDialOption grpc.DialOption) (err error) {
- aDB, bDB := needle_map.NewMemDb(), needle_map.NewMemDb()
- defer func() {
- aDB.Close()
- bDB.Close()
- }()
- // read index db
- readIndexDbCutoffFrom := uint64(time.Now().UnixNano())
- if err = readIndexDatabase(aDB, a.info.Collection, a.info.Id, pb.NewServerAddressFromDataNode(a.location.dataNode), false, writer, grpcDialOption); err != nil {
- return fmt.Errorf("readIndexDatabase %s volume %d: %v", a.location.dataNode, a.info.Id, err)
- }
- if err := readIndexDatabase(bDB, b.info.Collection, b.info.Id, pb.NewServerAddressFromDataNode(b.location.dataNode), false, writer, grpcDialOption); err != nil {
- return fmt.Errorf("readIndexDatabase %s volume %d: %v", b.location.dataNode, b.info.Id, err)
- }
- if _, err = doVolumeCheckDisk(aDB, bDB, a, b, false, writer, true, false, float64(1), readIndexDbCutoffFrom, grpcDialOption); err != nil {
- return fmt.Errorf("doVolumeCheckDisk source:%s target:%s volume %d: %v", a.location.dataNode.Id, b.location.dataNode.Id, a.info.Id, err)
- }
- return
- }
- func (c *commandVolumeFixReplication) deleteOneVolume(commandEnv *CommandEnv, writer io.Writer, takeAction bool, doCheck bool, overReplicatedVolumeIds []uint32, volumeReplicas map[uint32][]*VolumeReplica, allLocations []location, selectOneVolumeFn SelectOneVolumeFunc) error {
- for _, vid := range overReplicatedVolumeIds {
- replicas := volumeReplicas[vid]
- replicaPlacement, _ := super_block.NewReplicaPlacementFromByte(byte(replicas[0].info.ReplicaPlacement))
- replica := selectOneVolumeFn(replicas, replicaPlacement)
- // check collection name pattern
- if *c.collectionPattern != "" {
- matched, err := filepath.Match(*c.collectionPattern, replica.info.Collection)
- if err != nil {
- return fmt.Errorf("match pattern %s with collection %s: %v", *c.collectionPattern, replica.info.Collection, err)
- }
- if !matched {
- break
- }
- }
- collectionIsMismatch := false
- for _, volumeReplica := range replicas {
- if volumeReplica.info.Collection != replica.info.Collection {
- fmt.Fprintf(writer, "skip delete volume %d as collection %s is mismatch: %s\n", replica.info.Id, replica.info.Collection, volumeReplica.info.Collection)
- collectionIsMismatch = true
- }
- }
- if collectionIsMismatch {
- continue
- }
- fmt.Fprintf(writer, "deleting volume %d from %s ...\n", replica.info.Id, replica.location.dataNode.Id)
- if !takeAction {
- break
- }
- if doCheck {
- for _, replicaB := range replicas {
- if replicaB.location.dataNode == replica.location.dataNode {
- continue
- }
- if err := checkOneVolume(replica, replicaB, writer, commandEnv.option.GrpcDialOption); err != nil {
- return fmt.Errorf("sync volume %d on %s and %s: %v\n", replica.info.Id, replica.location.dataNode.Id, replicaB.location.dataNode.Id, err)
- }
- }
- }
- if err := deleteVolume(commandEnv.option.GrpcDialOption, needle.VolumeId(replica.info.Id),
- pb.NewServerAddressFromDataNode(replica.location.dataNode), false); err != nil {
- return fmt.Errorf("deleting volume %d from %s : %v", replica.info.Id, replica.location.dataNode.Id, err)
- }
- }
- return nil
- }
- func (c *commandVolumeFixReplication) fixUnderReplicatedVolumes(commandEnv *CommandEnv, writer io.Writer, takeAction bool, underReplicatedVolumeIds []uint32, volumeReplicas map[uint32][]*VolumeReplica, allLocations []location, retryCount int, volumesPerStep int) (fixedVolumes map[string]int, err error) {
- fixedVolumes = map[string]int{}
- if len(underReplicatedVolumeIds) > volumesPerStep && volumesPerStep > 0 {
- underReplicatedVolumeIds = underReplicatedVolumeIds[0:volumesPerStep]
- }
- for _, vid := range underReplicatedVolumeIds {
- for i := 0; i < retryCount+1; i++ {
- if err = c.fixOneUnderReplicatedVolume(commandEnv, writer, takeAction, volumeReplicas, vid, allLocations); err == nil {
- if takeAction {
- fixedVolumes[strconv.FormatUint(uint64(vid), 10)] = len(volumeReplicas[vid])
- }
- break
- } else {
- fmt.Fprintf(writer, "fixing under replicated volume %d: %v\n", vid, err)
- }
- }
- }
- return fixedVolumes, nil
- }
- func (c *commandVolumeFixReplication) fixOneUnderReplicatedVolume(commandEnv *CommandEnv, writer io.Writer, takeAction bool, volumeReplicas map[uint32][]*VolumeReplica, vid uint32, allLocations []location) error {
- replicas := volumeReplicas[vid]
- replica := pickOneReplicaToCopyFrom(replicas)
- replicaPlacement, _ := super_block.NewReplicaPlacementFromByte(byte(replica.info.ReplicaPlacement))
- foundNewLocation := false
- hasSkippedCollection := false
- keepDataNodesSorted(allLocations, types.ToDiskType(replica.info.DiskType))
- fn := capacityByFreeVolumeCount(types.ToDiskType(replica.info.DiskType))
- for _, dst := range allLocations {
- // check whether data nodes satisfy the constraints
- if fn(dst.dataNode) > 0 && satisfyReplicaPlacement(replicaPlacement, replicas, dst) {
- // check collection name pattern
- if *c.collectionPattern != "" {
- matched, err := filepath.Match(*c.collectionPattern, replica.info.Collection)
- if err != nil {
- return fmt.Errorf("match pattern %s with collection %s: %v", *c.collectionPattern, replica.info.Collection, err)
- }
- if !matched {
- hasSkippedCollection = true
- break
- }
- }
- // ask the volume server to replicate the volume
- foundNewLocation = true
- fmt.Fprintf(writer, "replicating volume %d %s from %s to dataNode %s ...\n", replica.info.Id, replicaPlacement, replica.location.dataNode.Id, dst.dataNode.Id)
- if !takeAction {
- // adjust volume count
- addVolumeCount(dst.dataNode.DiskInfos[replica.info.DiskType], 1)
- break
- }
- err := operation.WithVolumeServerClient(false, pb.NewServerAddressFromDataNode(dst.dataNode), commandEnv.option.GrpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
- stream, replicateErr := volumeServerClient.VolumeCopy(context.Background(), &volume_server_pb.VolumeCopyRequest{
- VolumeId: replica.info.Id,
- SourceDataNode: string(pb.NewServerAddressFromDataNode(replica.location.dataNode)),
- })
- if replicateErr != nil {
- return fmt.Errorf("copying from %s => %s : %v", replica.location.dataNode.Id, dst.dataNode.Id, replicateErr)
- }
- for {
- resp, recvErr := stream.Recv()
- if recvErr != nil {
- if recvErr == io.EOF {
- break
- } else {
- return recvErr
- }
- }
- if resp.ProcessedBytes > 0 {
- fmt.Fprintf(writer, "volume %d processed %d bytes\n", replica.info.Id, resp.ProcessedBytes)
- }
- }
- return nil
- })
- if err != nil {
- return err
- }
- // adjust volume count
- addVolumeCount(dst.dataNode.DiskInfos[replica.info.DiskType], 1)
- break
- }
- }
- if !foundNewLocation && !hasSkippedCollection {
- fmt.Fprintf(writer, "failed to place volume %d replica as %s, existing:%+v\n", replica.info.Id, replicaPlacement, len(replicas))
- }
- return nil
- }
- func addVolumeCount(info *master_pb.DiskInfo, count int) {
- if info == nil {
- return
- }
- info.VolumeCount += int64(count)
- info.FreeVolumeCount -= int64(count)
- }
- func keepDataNodesSorted(dataNodes []location, diskType types.DiskType) {
- fn := capacityByFreeVolumeCount(diskType)
- slices.SortFunc(dataNodes, func(a, b location) int {
- return int(fn(b.dataNode) - fn(a.dataNode))
- })
- }
- /*
- if on an existing data node {
- return false
- }
- if different from existing dcs {
- if lack on different dcs {
- return true
- }else{
- return false
- }
- }
- if not on primary dc {
- return false
- }
- if different from existing racks {
- if lack on different racks {
- return true
- }else{
- return false
- }
- }
- if not on primary rack {
- return false
- }
- if lacks on same rack {
- return true
- } else {
- return false
- }
- */
- func satisfyReplicaPlacement(replicaPlacement *super_block.ReplicaPlacement, replicas []*VolumeReplica, possibleLocation location) bool {
- existingDataCenters, _, existingDataNodes := countReplicas(replicas)
- if _, found := existingDataNodes[possibleLocation.String()]; found {
- // avoid duplicated volume on the same data node
- return false
- }
- primaryDataCenters, _ := findTopKeys(existingDataCenters)
- // ensure data center count is within limit
- if _, found := existingDataCenters[possibleLocation.DataCenter()]; !found {
- // different from existing dcs
- if len(existingDataCenters) < replicaPlacement.DiffDataCenterCount+1 {
- // lack on different dcs
- return true
- } else {
- // adding this would go over the different dcs limit
- return false
- }
- }
- // now this is same as one of the existing data center
- if !isAmong(possibleLocation.DataCenter(), primaryDataCenters) {
- // not on one of the primary dcs
- return false
- }
- // now this is one of the primary dcs
- primaryDcRacks := make(map[string]int)
- for _, replica := range replicas {
- if replica.location.DataCenter() != possibleLocation.DataCenter() {
- continue
- }
- primaryDcRacks[replica.location.Rack()] += 1
- }
- primaryRacks, _ := findTopKeys(primaryDcRacks)
- sameRackCount := primaryDcRacks[possibleLocation.Rack()]
- // ensure rack count is within limit
- if _, found := primaryDcRacks[possibleLocation.Rack()]; !found {
- // different from existing racks
- if len(primaryDcRacks) < replicaPlacement.DiffRackCount+1 {
- // lack on different racks
- return true
- } else {
- // adding this would go over the different racks limit
- return false
- }
- }
- // now this is same as one of the existing racks
- if !isAmong(possibleLocation.Rack(), primaryRacks) {
- // not on the primary rack
- return false
- }
- // now this is on the primary rack
- // different from existing data nodes
- if sameRackCount < replicaPlacement.SameRackCount+1 {
- // lack on same rack
- return true
- } else {
- // adding this would go over the same data node limit
- return false
- }
- }
- func findTopKeys(m map[string]int) (topKeys []string, max int) {
- for k, c := range m {
- if max < c {
- topKeys = topKeys[:0]
- topKeys = append(topKeys, k)
- max = c
- } else if max == c {
- topKeys = append(topKeys, k)
- }
- }
- return
- }
- func isAmong(key string, keys []string) bool {
- for _, k := range keys {
- if k == key {
- return true
- }
- }
- return false
- }
- type VolumeReplica struct {
- location *location
- info *master_pb.VolumeInformationMessage
- }
- type location struct {
- dc string
- rack string
- dataNode *master_pb.DataNodeInfo
- }
- func newLocation(dc, rack string, dataNode *master_pb.DataNodeInfo) location {
- return location{
- dc: dc,
- rack: rack,
- dataNode: dataNode,
- }
- }
- func (l location) String() string {
- return fmt.Sprintf("%s %s %s", l.dc, l.rack, l.dataNode.Id)
- }
- func (l location) Rack() string {
- return fmt.Sprintf("%s %s", l.dc, l.rack)
- }
- func (l location) DataCenter() string {
- return l.dc
- }
- func pickOneReplicaToCopyFrom(replicas []*VolumeReplica) *VolumeReplica {
- mostRecent := replicas[0]
- for _, replica := range replicas {
- if replica.info.ModifiedAtSecond > mostRecent.info.ModifiedAtSecond {
- mostRecent = replica
- }
- }
- return mostRecent
- }
- func countReplicas(replicas []*VolumeReplica) (diffDc, diffRack, diffNode map[string]int) {
- diffDc = make(map[string]int)
- diffRack = make(map[string]int)
- diffNode = make(map[string]int)
- for _, replica := range replicas {
- diffDc[replica.location.DataCenter()] += 1
- diffRack[replica.location.Rack()] += 1
- diffNode[replica.location.String()] += 1
- }
- return
- }
- func pickOneReplicaToDelete(replicas []*VolumeReplica, replicaPlacement *super_block.ReplicaPlacement) *VolumeReplica {
- slices.SortFunc(replicas, func(a, b *VolumeReplica) int {
- if a.info.Size != b.info.Size {
- return int(a.info.Size - b.info.Size)
- }
- if a.info.ModifiedAtSecond != b.info.ModifiedAtSecond {
- return int(a.info.ModifiedAtSecond - b.info.ModifiedAtSecond)
- }
- if a.info.CompactRevision != b.info.CompactRevision {
- return int(a.info.CompactRevision - b.info.CompactRevision)
- }
- return 0
- })
- return replicas[0]
- }
- // check and fix misplaced volumes
- func isMisplaced(replicas []*VolumeReplica, replicaPlacement *super_block.ReplicaPlacement) bool {
- for i := 0; i < len(replicas); i++ {
- others := otherThan(replicas, i)
- if !satisfyReplicaPlacement(replicaPlacement, others, *replicas[i].location) {
- return true
- }
- }
- return false
- }
- func otherThan(replicas []*VolumeReplica, index int) (others []*VolumeReplica) {
- for i := 0; i < len(replicas); i++ {
- if index != i {
- others = append(others, replicas[i])
- }
- }
- return
- }
- func pickOneMisplacedVolume(replicas []*VolumeReplica, replicaPlacement *super_block.ReplicaPlacement) (toDelete *VolumeReplica) {
- var deletionCandidates []*VolumeReplica
- for i := 0; i < len(replicas); i++ {
- others := otherThan(replicas, i)
- if !isMisplaced(others, replicaPlacement) {
- deletionCandidates = append(deletionCandidates, replicas[i])
- }
- }
- if len(deletionCandidates) > 0 {
- return pickOneReplicaToDelete(deletionCandidates, replicaPlacement)
- }
- return pickOneReplicaToDelete(replicas, replicaPlacement)
- }
|