command_ec_balance.go 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519
  1. package shell
  2. import (
  3. "flag"
  4. "fmt"
  5. "io"
  6. "sort"
  7. "github.com/chrislusf/seaweedfs/weed/storage/erasure_coding"
  8. "github.com/chrislusf/seaweedfs/weed/storage/needle"
  9. )
  10. func init() {
  11. Commands = append(Commands, &commandEcBalance{})
  12. }
  13. type commandEcBalance struct {
  14. }
  15. func (c *commandEcBalance) Name() string {
  16. return "ec.balance"
  17. }
  18. func (c *commandEcBalance) Help() string {
  19. return `balance all ec shards among all racks and volume servers
  20. ec.balance [-c EACH_COLLECTION|<collection_name>] [-force] [-dataCenter <data_center>]
  21. Algorithm:
  22. For each type of volume server (different max volume count limit){
  23. for each collection:
  24. balanceEcVolumes(collectionName)
  25. for each rack:
  26. balanceEcRack(rack)
  27. }
  28. func balanceEcVolumes(collectionName){
  29. for each volume:
  30. doDeduplicateEcShards(volumeId)
  31. tracks rack~shardCount mapping
  32. for each volume:
  33. doBalanceEcShardsAcrossRacks(volumeId)
  34. for each volume:
  35. doBalanceEcShardsWithinRacks(volumeId)
  36. }
  37. // spread ec shards into more racks
  38. func doBalanceEcShardsAcrossRacks(volumeId){
  39. tracks rack~volumeIdShardCount mapping
  40. averageShardsPerEcRack = totalShardNumber / numRacks // totalShardNumber is 14 for now, later could varies for each dc
  41. ecShardsToMove = select overflown ec shards from racks with ec shard counts > averageShardsPerEcRack
  42. for each ecShardsToMove {
  43. destRack = pickOneRack(rack~shardCount, rack~volumeIdShardCount, averageShardsPerEcRack)
  44. destVolumeServers = volume servers on the destRack
  45. pickOneEcNodeAndMoveOneShard(destVolumeServers)
  46. }
  47. }
  48. func doBalanceEcShardsWithinRacks(volumeId){
  49. racks = collect all racks that the volume id is on
  50. for rack, shards := range racks
  51. doBalanceEcShardsWithinOneRack(volumeId, shards, rack)
  52. }
  53. // move ec shards
  54. func doBalanceEcShardsWithinOneRack(volumeId, shards, rackId){
  55. tracks volumeServer~volumeIdShardCount mapping
  56. averageShardCount = len(shards) / numVolumeServers
  57. volumeServersOverAverage = volume servers with volumeId's ec shard counts > averageShardsPerEcRack
  58. ecShardsToMove = select overflown ec shards from volumeServersOverAverage
  59. for each ecShardsToMove {
  60. destVolumeServer = pickOneVolumeServer(volumeServer~shardCount, volumeServer~volumeIdShardCount, averageShardCount)
  61. pickOneEcNodeAndMoveOneShard(destVolumeServers)
  62. }
  63. }
  64. // move ec shards while keeping shard distribution for the same volume unchanged or more even
  65. func balanceEcRack(rack){
  66. averageShardCount = total shards / numVolumeServers
  67. for hasMovedOneEcShard {
  68. sort all volume servers ordered by the number of local ec shards
  69. pick the volume server A with the lowest number of ec shards x
  70. pick the volume server B with the highest number of ec shards y
  71. if y > averageShardCount and x +1 <= averageShardCount {
  72. if B has a ec shard with volume id v that A does not have {
  73. move one ec shard v from B to A
  74. hasMovedOneEcShard = true
  75. }
  76. }
  77. }
  78. }
  79. `
  80. }
  81. func (c *commandEcBalance) Do(args []string, commandEnv *CommandEnv, writer io.Writer) (err error) {
  82. if err = commandEnv.confirmIsLocked(); err != nil {
  83. return
  84. }
  85. balanceCommand := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
  86. collection := balanceCommand.String("collection", "EACH_COLLECTION", "collection name, or \"EACH_COLLECTION\" for each collection")
  87. dc := balanceCommand.String("dataCenter", "", "only apply the balancing for this dataCenter")
  88. applyBalancing := balanceCommand.Bool("force", false, "apply the balancing plan")
  89. if err = balanceCommand.Parse(args); err != nil {
  90. return nil
  91. }
  92. // collect all ec nodes
  93. allEcNodes, totalFreeEcSlots, err := collectEcNodes(commandEnv, *dc)
  94. if err != nil {
  95. return err
  96. }
  97. if totalFreeEcSlots < 1 {
  98. return fmt.Errorf("no free ec shard slots. only %d left", totalFreeEcSlots)
  99. }
  100. racks := collectRacks(allEcNodes)
  101. if *collection == "EACH_COLLECTION" {
  102. collections, err := ListCollectionNames(commandEnv, false, true)
  103. if err != nil {
  104. return err
  105. }
  106. fmt.Printf("balanceEcVolumes collections %+v\n", len(collections))
  107. for _, c := range collections {
  108. fmt.Printf("balanceEcVolumes collection %+v\n", c)
  109. if err = balanceEcVolumes(commandEnv, c, allEcNodes, racks, *applyBalancing); err != nil {
  110. return err
  111. }
  112. }
  113. } else {
  114. if err = balanceEcVolumes(commandEnv, *collection, allEcNodes, racks, *applyBalancing); err != nil {
  115. return err
  116. }
  117. }
  118. if err := balanceEcRacks(commandEnv, racks, *applyBalancing); err != nil {
  119. return fmt.Errorf("balance ec racks: %v", err)
  120. }
  121. return nil
  122. }
  123. func collectRacks(allEcNodes []*EcNode) map[RackId]*EcRack {
  124. // collect racks info
  125. racks := make(map[RackId]*EcRack)
  126. for _, ecNode := range allEcNodes {
  127. if racks[ecNode.rack] == nil {
  128. racks[ecNode.rack] = &EcRack{
  129. ecNodes: make(map[EcNodeId]*EcNode),
  130. }
  131. }
  132. racks[ecNode.rack].ecNodes[EcNodeId(ecNode.info.Id)] = ecNode
  133. racks[ecNode.rack].freeEcSlot += ecNode.freeEcSlot
  134. }
  135. return racks
  136. }
  137. func balanceEcVolumes(commandEnv *CommandEnv, collection string, allEcNodes []*EcNode, racks map[RackId]*EcRack, applyBalancing bool) error {
  138. fmt.Printf("balanceEcVolumes %s\n", collection)
  139. if err := deleteDuplicatedEcShards(commandEnv, allEcNodes, collection, applyBalancing); err != nil {
  140. return fmt.Errorf("delete duplicated collection %s ec shards: %v", collection, err)
  141. }
  142. if err := balanceEcShardsAcrossRacks(commandEnv, allEcNodes, racks, collection, applyBalancing); err != nil {
  143. return fmt.Errorf("balance across racks collection %s ec shards: %v", collection, err)
  144. }
  145. if err := balanceEcShardsWithinRacks(commandEnv, allEcNodes, racks, collection, applyBalancing); err != nil {
  146. return fmt.Errorf("balance across racks collection %s ec shards: %v", collection, err)
  147. }
  148. return nil
  149. }
  150. func deleteDuplicatedEcShards(commandEnv *CommandEnv, allEcNodes []*EcNode, collection string, applyBalancing bool) error {
  151. // vid => []ecNode
  152. vidLocations := collectVolumeIdToEcNodes(allEcNodes)
  153. // deduplicate ec shards
  154. for vid, locations := range vidLocations {
  155. if err := doDeduplicateEcShards(commandEnv, collection, vid, locations, applyBalancing); err != nil {
  156. return err
  157. }
  158. }
  159. return nil
  160. }
  161. func doDeduplicateEcShards(commandEnv *CommandEnv, collection string, vid needle.VolumeId, locations []*EcNode, applyBalancing bool) error {
  162. // check whether this volume has ecNodes that are over average
  163. shardToLocations := make([][]*EcNode, erasure_coding.TotalShardsCount)
  164. for _, ecNode := range locations {
  165. shardBits := findEcVolumeShards(ecNode, vid)
  166. for _, shardId := range shardBits.ShardIds() {
  167. shardToLocations[shardId] = append(shardToLocations[shardId], ecNode)
  168. }
  169. }
  170. for shardId, ecNodes := range shardToLocations {
  171. if len(ecNodes) <= 1 {
  172. continue
  173. }
  174. sortEcNodesByFreeslotsAscending(ecNodes)
  175. fmt.Printf("ec shard %d.%d has %d copies, keeping %v\n", vid, shardId, len(ecNodes), ecNodes[0].info.Id)
  176. if !applyBalancing {
  177. continue
  178. }
  179. duplicatedShardIds := []uint32{uint32(shardId)}
  180. for _, ecNode := range ecNodes[1:] {
  181. if err := unmountEcShards(commandEnv.option.GrpcDialOption, vid, ecNode.info.Id, duplicatedShardIds); err != nil {
  182. return err
  183. }
  184. if err := sourceServerDeleteEcShards(commandEnv.option.GrpcDialOption, collection, vid, ecNode.info.Id, duplicatedShardIds); err != nil {
  185. return err
  186. }
  187. ecNode.deleteEcVolumeShards(vid, duplicatedShardIds)
  188. }
  189. }
  190. return nil
  191. }
  192. func balanceEcShardsAcrossRacks(commandEnv *CommandEnv, allEcNodes []*EcNode, racks map[RackId]*EcRack, collection string, applyBalancing bool) error {
  193. // collect vid => []ecNode, since previous steps can change the locations
  194. vidLocations := collectVolumeIdToEcNodes(allEcNodes)
  195. // spread the ec shards evenly
  196. for vid, locations := range vidLocations {
  197. if err := doBalanceEcShardsAcrossRacks(commandEnv, collection, vid, locations, racks, applyBalancing); err != nil {
  198. return err
  199. }
  200. }
  201. return nil
  202. }
  203. func doBalanceEcShardsAcrossRacks(commandEnv *CommandEnv, collection string, vid needle.VolumeId, locations []*EcNode, racks map[RackId]*EcRack, applyBalancing bool) error {
  204. // calculate average number of shards an ec rack should have for one volume
  205. averageShardsPerEcRack := ceilDivide(erasure_coding.TotalShardsCount, len(racks))
  206. // see the volume's shards are in how many racks, and how many in each rack
  207. rackToShardCount := groupByCount(locations, func(ecNode *EcNode) (id string, count int) {
  208. shardBits := findEcVolumeShards(ecNode, vid)
  209. return string(ecNode.rack), shardBits.ShardIdCount()
  210. })
  211. rackEcNodesWithVid := groupBy(locations, func(ecNode *EcNode) string {
  212. return string(ecNode.rack)
  213. })
  214. // ecShardsToMove = select overflown ec shards from racks with ec shard counts > averageShardsPerEcRack
  215. ecShardsToMove := make(map[erasure_coding.ShardId]*EcNode)
  216. for rackId, count := range rackToShardCount {
  217. if count > averageShardsPerEcRack {
  218. possibleEcNodes := rackEcNodesWithVid[rackId]
  219. for shardId, ecNode := range pickNEcShardsToMoveFrom(possibleEcNodes, vid, count-averageShardsPerEcRack) {
  220. ecShardsToMove[shardId] = ecNode
  221. }
  222. }
  223. }
  224. for shardId, ecNode := range ecShardsToMove {
  225. rackId := pickOneRack(racks, rackToShardCount, averageShardsPerEcRack)
  226. if rackId == "" {
  227. fmt.Printf("ec shard %d.%d at %s can not find a destination rack\n", vid, shardId, ecNode.info.Id)
  228. continue
  229. }
  230. var possibleDestinationEcNodes []*EcNode
  231. for _, n := range racks[rackId].ecNodes {
  232. possibleDestinationEcNodes = append(possibleDestinationEcNodes, n)
  233. }
  234. err := pickOneEcNodeAndMoveOneShard(commandEnv, averageShardsPerEcRack, ecNode, collection, vid, shardId, possibleDestinationEcNodes, applyBalancing)
  235. if err != nil {
  236. return err
  237. }
  238. rackToShardCount[string(rackId)] += 1
  239. rackToShardCount[string(ecNode.rack)] -= 1
  240. racks[rackId].freeEcSlot -= 1
  241. racks[ecNode.rack].freeEcSlot += 1
  242. }
  243. return nil
  244. }
  245. func pickOneRack(rackToEcNodes map[RackId]*EcRack, rackToShardCount map[string]int, averageShardsPerEcRack int) RackId {
  246. // TODO later may need to add some randomness
  247. for rackId, rack := range rackToEcNodes {
  248. if rackToShardCount[string(rackId)] >= averageShardsPerEcRack {
  249. continue
  250. }
  251. if rack.freeEcSlot <= 0 {
  252. continue
  253. }
  254. return rackId
  255. }
  256. return ""
  257. }
  258. func balanceEcShardsWithinRacks(commandEnv *CommandEnv, allEcNodes []*EcNode, racks map[RackId]*EcRack, collection string, applyBalancing bool) error {
  259. // collect vid => []ecNode, since previous steps can change the locations
  260. vidLocations := collectVolumeIdToEcNodes(allEcNodes)
  261. // spread the ec shards evenly
  262. for vid, locations := range vidLocations {
  263. // see the volume's shards are in how many racks, and how many in each rack
  264. rackToShardCount := groupByCount(locations, func(ecNode *EcNode) (id string, count int) {
  265. shardBits := findEcVolumeShards(ecNode, vid)
  266. return string(ecNode.rack), shardBits.ShardIdCount()
  267. })
  268. rackEcNodesWithVid := groupBy(locations, func(ecNode *EcNode) string {
  269. return string(ecNode.rack)
  270. })
  271. for rackId, _ := range rackToShardCount {
  272. var possibleDestinationEcNodes []*EcNode
  273. for _, n := range racks[RackId(rackId)].ecNodes {
  274. possibleDestinationEcNodes = append(possibleDestinationEcNodes, n)
  275. }
  276. sourceEcNodes := rackEcNodesWithVid[rackId]
  277. averageShardsPerEcNode := ceilDivide(rackToShardCount[rackId], len(possibleDestinationEcNodes))
  278. if err := doBalanceEcShardsWithinOneRack(commandEnv, averageShardsPerEcNode, collection, vid, sourceEcNodes, possibleDestinationEcNodes, applyBalancing); err != nil {
  279. return err
  280. }
  281. }
  282. }
  283. return nil
  284. }
  285. func doBalanceEcShardsWithinOneRack(commandEnv *CommandEnv, averageShardsPerEcNode int, collection string, vid needle.VolumeId, existingLocations, possibleDestinationEcNodes []*EcNode, applyBalancing bool) error {
  286. for _, ecNode := range existingLocations {
  287. shardBits := findEcVolumeShards(ecNode, vid)
  288. overLimitCount := shardBits.ShardIdCount() - averageShardsPerEcNode
  289. for _, shardId := range shardBits.ShardIds() {
  290. if overLimitCount <= 0 {
  291. break
  292. }
  293. fmt.Printf("%s has %d overlimit, moving ec shard %d.%d\n", ecNode.info.Id, overLimitCount, vid, shardId)
  294. err := pickOneEcNodeAndMoveOneShard(commandEnv, averageShardsPerEcNode, ecNode, collection, vid, shardId, possibleDestinationEcNodes, applyBalancing)
  295. if err != nil {
  296. return err
  297. }
  298. overLimitCount--
  299. }
  300. }
  301. return nil
  302. }
  303. func balanceEcRacks(commandEnv *CommandEnv, racks map[RackId]*EcRack, applyBalancing bool) error {
  304. // balance one rack for all ec shards
  305. for _, ecRack := range racks {
  306. if err := doBalanceEcRack(commandEnv, ecRack, applyBalancing); err != nil {
  307. return err
  308. }
  309. }
  310. return nil
  311. }
  312. func doBalanceEcRack(commandEnv *CommandEnv, ecRack *EcRack, applyBalancing bool) error {
  313. if len(ecRack.ecNodes) <= 1 {
  314. return nil
  315. }
  316. var rackEcNodes []*EcNode
  317. for _, node := range ecRack.ecNodes {
  318. rackEcNodes = append(rackEcNodes, node)
  319. }
  320. ecNodeIdToShardCount := groupByCount(rackEcNodes, func(node *EcNode) (id string, count int) {
  321. for _, ecShardInfo := range node.info.EcShardInfos {
  322. count += erasure_coding.ShardBits(ecShardInfo.EcIndexBits).ShardIdCount()
  323. }
  324. return node.info.Id, count
  325. })
  326. var totalShardCount int
  327. for _, count := range ecNodeIdToShardCount {
  328. totalShardCount += count
  329. }
  330. averageShardCount := ceilDivide(totalShardCount, len(rackEcNodes))
  331. hasMove := true
  332. for hasMove {
  333. hasMove = false
  334. sort.Slice(rackEcNodes, func(i, j int) bool {
  335. return rackEcNodes[i].freeEcSlot > rackEcNodes[j].freeEcSlot
  336. })
  337. emptyNode, fullNode := rackEcNodes[0], rackEcNodes[len(rackEcNodes)-1]
  338. emptyNodeShardCount, fullNodeShardCount := ecNodeIdToShardCount[emptyNode.info.Id], ecNodeIdToShardCount[fullNode.info.Id]
  339. if fullNodeShardCount > averageShardCount && emptyNodeShardCount+1 <= averageShardCount {
  340. emptyNodeIds := make(map[uint32]bool)
  341. for _, shards := range emptyNode.info.EcShardInfos {
  342. emptyNodeIds[shards.Id] = true
  343. }
  344. for _, shards := range fullNode.info.EcShardInfos {
  345. if _, found := emptyNodeIds[shards.Id]; !found {
  346. for _, shardId := range erasure_coding.ShardBits(shards.EcIndexBits).ShardIds() {
  347. fmt.Printf("%s moves ec shards %d.%d to %s\n", fullNode.info.Id, shards.Id, shardId, emptyNode.info.Id)
  348. err := moveMountedShardToEcNode(commandEnv, fullNode, shards.Collection, needle.VolumeId(shards.Id), shardId, emptyNode, applyBalancing)
  349. if err != nil {
  350. return err
  351. }
  352. ecNodeIdToShardCount[emptyNode.info.Id]++
  353. ecNodeIdToShardCount[fullNode.info.Id]--
  354. hasMove = true
  355. break
  356. }
  357. break
  358. }
  359. }
  360. }
  361. }
  362. return nil
  363. }
  364. func pickOneEcNodeAndMoveOneShard(commandEnv *CommandEnv, averageShardsPerEcNode int, existingLocation *EcNode, collection string, vid needle.VolumeId, shardId erasure_coding.ShardId, possibleDestinationEcNodes []*EcNode, applyBalancing bool) error {
  365. sortEcNodesByFreeslotsDecending(possibleDestinationEcNodes)
  366. for _, destEcNode := range possibleDestinationEcNodes {
  367. if destEcNode.info.Id == existingLocation.info.Id {
  368. continue
  369. }
  370. if destEcNode.freeEcSlot <= 0 {
  371. continue
  372. }
  373. if findEcVolumeShards(destEcNode, vid).ShardIdCount() >= averageShardsPerEcNode {
  374. continue
  375. }
  376. fmt.Printf("%s moves ec shard %d.%d to %s\n", existingLocation.info.Id, vid, shardId, destEcNode.info.Id)
  377. err := moveMountedShardToEcNode(commandEnv, existingLocation, collection, vid, shardId, destEcNode, applyBalancing)
  378. if err != nil {
  379. return err
  380. }
  381. return nil
  382. }
  383. return nil
  384. }
  385. func pickNEcShardsToMoveFrom(ecNodes []*EcNode, vid needle.VolumeId, n int) map[erasure_coding.ShardId]*EcNode {
  386. picked := make(map[erasure_coding.ShardId]*EcNode)
  387. var candidateEcNodes []*CandidateEcNode
  388. for _, ecNode := range ecNodes {
  389. shardBits := findEcVolumeShards(ecNode, vid)
  390. if shardBits.ShardIdCount() > 0 {
  391. candidateEcNodes = append(candidateEcNodes, &CandidateEcNode{
  392. ecNode: ecNode,
  393. shardCount: shardBits.ShardIdCount(),
  394. })
  395. }
  396. }
  397. sort.Slice(candidateEcNodes, func(i, j int) bool {
  398. return candidateEcNodes[i].shardCount > candidateEcNodes[j].shardCount
  399. })
  400. for i := 0; i < n; i++ {
  401. selectedEcNodeIndex := -1
  402. for i, candidateEcNode := range candidateEcNodes {
  403. shardBits := findEcVolumeShards(candidateEcNode.ecNode, vid)
  404. if shardBits > 0 {
  405. selectedEcNodeIndex = i
  406. for _, shardId := range shardBits.ShardIds() {
  407. candidateEcNode.shardCount--
  408. picked[shardId] = candidateEcNode.ecNode
  409. candidateEcNode.ecNode.deleteEcVolumeShards(vid, []uint32{uint32(shardId)})
  410. break
  411. }
  412. break
  413. }
  414. }
  415. if selectedEcNodeIndex >= 0 {
  416. ensureSortedEcNodes(candidateEcNodes, selectedEcNodeIndex, func(i, j int) bool {
  417. return candidateEcNodes[i].shardCount > candidateEcNodes[j].shardCount
  418. })
  419. }
  420. }
  421. return picked
  422. }
  423. func collectVolumeIdToEcNodes(allEcNodes []*EcNode) map[needle.VolumeId][]*EcNode {
  424. vidLocations := make(map[needle.VolumeId][]*EcNode)
  425. for _, ecNode := range allEcNodes {
  426. for _, shardInfo := range ecNode.info.EcShardInfos {
  427. vidLocations[needle.VolumeId(shardInfo.Id)] = append(vidLocations[needle.VolumeId(shardInfo.Id)], ecNode)
  428. }
  429. }
  430. return vidLocations
  431. }