volume_layout.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576
  1. package topology
  2. import (
  3. "fmt"
  4. "github.com/seaweedfs/seaweedfs/weed/stats"
  5. "math/rand"
  6. "sync"
  7. "sync/atomic"
  8. "time"
  9. "github.com/seaweedfs/seaweedfs/weed/storage/types"
  10. "github.com/seaweedfs/seaweedfs/weed/glog"
  11. "github.com/seaweedfs/seaweedfs/weed/storage"
  12. "github.com/seaweedfs/seaweedfs/weed/storage/needle"
  13. "github.com/seaweedfs/seaweedfs/weed/storage/super_block"
  14. )
  15. type copyState int
  16. const (
  17. noCopies copyState = 0 + iota
  18. insufficientCopies
  19. enoughCopies
  20. )
  21. type volumeState string
  22. const (
  23. readOnlyState volumeState = "ReadOnly"
  24. oversizedState = "Oversized"
  25. crowdedState = "Crowded"
  26. noWritableVolumes = "No writable volumes"
  27. )
  28. type stateIndicator func(copyState) bool
  29. func ExistCopies() stateIndicator {
  30. return func(state copyState) bool { return state != noCopies }
  31. }
  32. func NoCopies() stateIndicator {
  33. return func(state copyState) bool { return state == noCopies }
  34. }
// volumesBinaryState tracks, per volume id, the replica locations relevant to
// one named condition (e.g. "ReadOnly", "Oversized"), so that condition can
// be answered per volume through the indicator function.
type volumesBinaryState struct {
	rp        *super_block.ReplicaPlacement // placement used to judge copy sufficiency
	name      volumeState                   // the name for volume state (eg. "Readonly", "Oversized")
	indicator stateIndicator                // indicate whether the volumes should be marked as `name`
	copyMap   map[needle.VolumeId]*VolumeLocationList // locations currently recorded for this state
}
  41. func NewVolumesBinaryState(name volumeState, rp *super_block.ReplicaPlacement, indicator stateIndicator) *volumesBinaryState {
  42. return &volumesBinaryState{
  43. rp: rp,
  44. name: name,
  45. indicator: indicator,
  46. copyMap: make(map[needle.VolumeId]*VolumeLocationList),
  47. }
  48. }
  49. func (v *volumesBinaryState) Dump() (res []uint32) {
  50. for vid, list := range v.copyMap {
  51. if v.indicator(v.copyState(list)) {
  52. res = append(res, uint32(vid))
  53. }
  54. }
  55. return
  56. }
  57. func (v *volumesBinaryState) IsTrue(vid needle.VolumeId) bool {
  58. list, _ := v.copyMap[vid]
  59. return v.indicator(v.copyState(list))
  60. }
  61. func (v *volumesBinaryState) Add(vid needle.VolumeId, dn *DataNode) {
  62. list, _ := v.copyMap[vid]
  63. if list != nil {
  64. list.Set(dn)
  65. return
  66. }
  67. list = NewVolumeLocationList()
  68. list.Set(dn)
  69. v.copyMap[vid] = list
  70. }
  71. func (v *volumesBinaryState) Remove(vid needle.VolumeId, dn *DataNode) {
  72. list, _ := v.copyMap[vid]
  73. if list != nil {
  74. list.Remove(dn)
  75. if list.Length() == 0 {
  76. delete(v.copyMap, vid)
  77. }
  78. }
  79. }
  80. func (v *volumesBinaryState) copyState(list *VolumeLocationList) copyState {
  81. if list == nil {
  82. return noCopies
  83. }
  84. if list.Length() < v.rp.GetCopyCount() {
  85. return insufficientCopies
  86. }
  87. return enoughCopies
  88. }
// mapping from volume to its locations, inverted from server to volume
type VolumeLayout struct {
	growRequest      atomic.Bool   // set while a volume-grow request is in flight
	lastGrowCount    atomic.Uint32 // last non-zero grow target that was recorded
	rp               *super_block.ReplicaPlacement
	ttl              *needle.TTL
	diskType         types.DiskType
	vid2location     map[needle.VolumeId]*VolumeLocationList
	writables        []needle.VolumeId // transient array of writable volume id
	crowded          map[needle.VolumeId]struct{} // writable volumes above the crowded threshold
	readonlyVolumes  *volumesBinaryState // readonly volumes
	oversizedVolumes *volumesBinaryState // oversized volumes
	vacuumedVolumes  map[needle.VolumeId]time.Time // last vacuum time per volume
	volumeSizeLimit  uint64
	replicationAsMin bool // treat rp's copy count as a minimum rather than an exact target
	accessLock       sync.RWMutex // guards the maps/slices above (atomics excluded)
}
// VolumeLayoutStats aggregates size and file counts over all volumes in a
// layout (see Stats: replicas are counted individually).
type VolumeLayoutStats struct {
	TotalSize uint64 // capacity: size limit per growable volume, actual size for read-only ones
	UsedSize  uint64 // bytes used, summed over every replica
	FileCount uint64
}
  111. func NewVolumeLayout(rp *super_block.ReplicaPlacement, ttl *needle.TTL, diskType types.DiskType, volumeSizeLimit uint64, replicationAsMin bool) *VolumeLayout {
  112. return &VolumeLayout{
  113. rp: rp,
  114. ttl: ttl,
  115. diskType: diskType,
  116. vid2location: make(map[needle.VolumeId]*VolumeLocationList),
  117. writables: *new([]needle.VolumeId),
  118. crowded: make(map[needle.VolumeId]struct{}),
  119. readonlyVolumes: NewVolumesBinaryState(readOnlyState, rp, ExistCopies()),
  120. oversizedVolumes: NewVolumesBinaryState(oversizedState, rp, ExistCopies()),
  121. vacuumedVolumes: make(map[needle.VolumeId]time.Time),
  122. volumeSizeLimit: volumeSizeLimit,
  123. replicationAsMin: replicationAsMin,
  124. }
  125. }
  126. func (vl *VolumeLayout) String() string {
  127. return fmt.Sprintf("rp:%v, ttl:%v, writables:%v, volumeSizeLimit:%v", vl.rp, vl.ttl, vl.writables, vl.volumeSizeLimit)
  128. }
// RegisterVolume adds (or refreshes) one replica of volume v on data node dn,
// then recomputes the volume's read-only / writable standing across all of
// its currently known replicas.
func (vl *VolumeLayout) RegisterVolume(v *storage.VolumeInfo, dn *DataNode) {
	vl.accessLock.Lock()
	defer vl.accessLock.Unlock()
	// Defers run LIFO, so this executes before Unlock: the oversized
	// bookkeeping still sees the layout under the write lock.
	defer vl.rememberOversizedVolume(v, dn)
	if _, ok := vl.vid2location[v.Id]; !ok {
		vl.vid2location[v.Id] = NewVolumeLocationList()
	}
	vl.vid2location[v.Id].Set(dn)
	// glog.V(4).Infof("volume %d added to %s len %d copy %d", v.Id, dn.Id(), vl.vid2location[v.Id].Length(), v.ReplicaPlacement.GetCopyCount())
	// Re-check every known replica (note: dn is intentionally shadowed by the
	// loop variable below). A single read-only or unreadable replica makes
	// the whole volume unwritable, and the scan stops there.
	for _, dn := range vl.vid2location[v.Id].list {
		if vInfo, err := dn.GetVolumesById(v.Id); err == nil {
			if vInfo.ReadOnly {
				glog.V(1).Infof("vid %d removed from writable", v.Id)
				vl.removeFromWritable(v.Id)
				vl.readonlyVolumes.Add(v.Id, dn)
				return
			} else {
				vl.readonlyVolumes.Remove(v.Id, dn)
			}
		} else {
			// Replica info unavailable: conservatively treat as unwritable.
			glog.V(1).Infof("vid %d removed from writable", v.Id)
			vl.removeFromWritable(v.Id)
			vl.readonlyVolumes.Remove(v.Id, dn)
			return
		}
	}
}
  156. func (vl *VolumeLayout) rememberOversizedVolume(v *storage.VolumeInfo, dn *DataNode) {
  157. if vl.isOversized(v) {
  158. vl.oversizedVolumes.Add(v.Id, dn)
  159. } else {
  160. vl.oversizedVolumes.Remove(v.Id, dn)
  161. }
  162. }
  163. func (vl *VolumeLayout) UnRegisterVolume(v *storage.VolumeInfo, dn *DataNode) {
  164. vl.accessLock.Lock()
  165. defer vl.accessLock.Unlock()
  166. // remove from vid2location map
  167. location, ok := vl.vid2location[v.Id]
  168. if !ok {
  169. return
  170. }
  171. if location.Remove(dn) {
  172. vl.readonlyVolumes.Remove(v.Id, dn)
  173. vl.oversizedVolumes.Remove(v.Id, dn)
  174. vl.ensureCorrectWritables(v.Id)
  175. if location.Length() == 0 {
  176. delete(vl.vid2location, v.Id)
  177. }
  178. }
  179. }
// EnsureCorrectWritables re-evaluates whether volume v belongs in the
// writable set, taking the layout lock.
func (vl *VolumeLayout) EnsureCorrectWritables(v *storage.VolumeInfo) {
	vl.accessLock.Lock()
	defer vl.accessLock.Unlock()
	vl.ensureCorrectWritables(v.Id)
}
  185. func (vl *VolumeLayout) ensureCorrectWritables(vid needle.VolumeId) {
  186. isEnoughCopies := vl.enoughCopies(vid)
  187. isAllWritable := vl.isAllWritable(vid)
  188. isOversizedVolume := vl.oversizedVolumes.IsTrue(vid)
  189. if isEnoughCopies && isAllWritable && !isOversizedVolume {
  190. vl.setVolumeWritable(vid)
  191. } else {
  192. if !isEnoughCopies {
  193. glog.V(0).Infof("volume %d does not have enough copies", vid)
  194. }
  195. if !isAllWritable {
  196. glog.V(0).Infof("volume %d are not all writable", vid)
  197. }
  198. if isOversizedVolume {
  199. glog.V(1).Infof("volume %d are oversized", vid)
  200. }
  201. glog.V(0).Infof("volume %d remove from writable", vid)
  202. vl.removeFromWritable(vid)
  203. }
  204. }
  205. func (vl *VolumeLayout) isAllWritable(vid needle.VolumeId) bool {
  206. if location, ok := vl.vid2location[vid]; ok {
  207. for _, dn := range location.list {
  208. if v, getError := dn.GetVolumesById(vid); getError == nil {
  209. if v.ReadOnly {
  210. return false
  211. }
  212. }
  213. }
  214. } else {
  215. return false
  216. }
  217. return true
  218. }
// isOversized reports whether the volume has reached the configured size limit.
func (vl *VolumeLayout) isOversized(v *storage.VolumeInfo) bool {
	return uint64(v.Size) >= vl.volumeSizeLimit
}
// isCrowdedVolume reports whether the volume has passed the grow-strategy
// threshold fraction of the size limit (close to full, but not necessarily
// oversized yet).
func (vl *VolumeLayout) isCrowdedVolume(v *storage.VolumeInfo) bool {
	return float64(v.Size) > float64(vl.volumeSizeLimit)*VolumeGrowStrategy.Threshold
}
  225. func (vl *VolumeLayout) isWritable(v *storage.VolumeInfo) bool {
  226. return !vl.isOversized(v) &&
  227. v.Version == needle.CurrentVersion &&
  228. !v.ReadOnly
  229. }
// isEmpty reports whether the layout currently tracks no volumes at all.
func (vl *VolumeLayout) isEmpty() bool {
	vl.accessLock.RLock()
	defer vl.accessLock.RUnlock()
	return len(vl.vid2location) == 0
}
  235. func (vl *VolumeLayout) Lookup(vid needle.VolumeId) []*DataNode {
  236. vl.accessLock.RLock()
  237. defer vl.accessLock.RUnlock()
  238. if location := vl.vid2location[vid]; location != nil {
  239. return location.list
  240. }
  241. return nil
  242. }
  243. func (vl *VolumeLayout) ListVolumeServers() (nodes []*DataNode) {
  244. vl.accessLock.RLock()
  245. defer vl.accessLock.RUnlock()
  246. for _, location := range vl.vid2location {
  247. nodes = append(nodes, location.list...)
  248. }
  249. return
  250. }
// PickForWrite picks a writable volume for `count` assignments, honoring
// optional DataCenter/Rack/DataNode affinity. It returns the chosen volume
// id, the requested count, a snapshot of the volume's locations, a
// shouldGrow hint for the caller, and an error when nothing matched.
func (vl *VolumeLayout) PickForWrite(count uint64, option *VolumeGrowOption) (vid needle.VolumeId, counter uint64, locationList *VolumeLocationList, shouldGrow bool, err error) {
	vl.accessLock.RLock()
	defer vl.accessLock.RUnlock()
	lenWriters := len(vl.writables)
	if lenWriters <= 0 {
		// No writable volume at all: tell the caller to grow.
		return 0, 0, nil, true, fmt.Errorf("%s", noWritableVolumes)
	}
	if option.DataCenter == "" && option.Rack == "" && option.DataNode == "" {
		// No placement affinity: pick a random writable volume.
		vid := vl.writables[rand.Intn(lenWriters)]
		locationList = vl.vid2location[vid]
		if locationList == nil || len(locationList.list) == 0 {
			return 0, 0, nil, false, fmt.Errorf("Strangely vid %s is on no machine!", vid.String())
		}
		// Copy() so the caller's snapshot is immune to later layout changes.
		return vid, count, locationList.Copy(), false, nil
	}
	// clone vl.writables
	writables := make([]needle.VolumeId, len(vl.writables))
	copy(writables, vl.writables)
	// randomize the writables
	rand.Shuffle(len(writables), func(i, j int) {
		writables[i], writables[j] = writables[j], writables[i]
	})
	// Accept the first writable volume that has a replica satisfying every
	// requested affinity constraint; the bare return uses the named results
	// assigned just above it.
	for _, writableVolumeId := range writables {
		volumeLocationList := vl.vid2location[writableVolumeId]
		for _, dn := range volumeLocationList.list {
			if option.DataCenter != "" && dn.GetDataCenter().Id() != NodeId(option.DataCenter) {
				continue
			}
			if option.Rack != "" && dn.GetRack().Id() != NodeId(option.Rack) {
				continue
			}
			if option.DataNode != "" && dn.Id() != NodeId(option.DataNode) {
				continue
			}
			vid, locationList, counter = writableVolumeId, volumeLocationList.Copy(), count
			return
		}
	}
	// Nothing matched the affinity constraints; suggest growing.
	return vid, count, locationList, true, fmt.Errorf("%s in DataCenter:%v Rack:%v DataNode:%v", noWritableVolumes, option.DataCenter, option.Rack, option.DataNode)
}
// HasGrowRequest reports whether a volume-grow request is currently in flight.
func (vl *VolumeLayout) HasGrowRequest() bool {
	return vl.growRequest.Load()
}
// AddGrowRequest marks a volume-grow request as in flight.
func (vl *VolumeLayout) AddGrowRequest() {
	vl.growRequest.Store(true)
}
// DoneGrowRequest clears the in-flight volume-grow marker.
func (vl *VolumeLayout) DoneGrowRequest() {
	vl.growRequest.Store(false)
}
// SetLastGrowCount remembers the most recent non-zero grow target.
// The load-compare-store is not an atomic compare-and-swap; it only skips a
// redundant write, which is fine for this advisory counter.
func (vl *VolumeLayout) SetLastGrowCount(count uint32) {
	if vl.lastGrowCount.Load() != count && count != 0 {
		vl.lastGrowCount.Store(count)
	}
}
// GetLastGrowCount returns the last recorded grow target.
func (vl *VolumeLayout) GetLastGrowCount() uint32 {
	return vl.lastGrowCount.Load()
}
// ShouldGrowVolumes reports whether this layout needs more volumes: true
// when crowded volumes are at least as numerous as writable ones. It also
// exports both counts as gauges for monitoring.
func (vl *VolumeLayout) ShouldGrowVolumes(collection string) bool {
	writable, crowded := vl.GetWritableVolumeCount()
	stats.MasterVolumeLayoutWritable.WithLabelValues(collection, vl.diskType.String(), vl.rp.String(), vl.ttl.String()).Set(float64(writable))
	stats.MasterVolumeLayoutCrowded.WithLabelValues(collection, vl.diskType.String(), vl.rp.String(), vl.ttl.String()).Set(float64(crowded))
	return writable <= crowded
}
  314. func (vl *VolumeLayout) ShouldGrowVolumesByDataNode(nodeType string, dataNode string) bool {
  315. vl.accessLock.RLock()
  316. writables := make([]needle.VolumeId, len(vl.writables))
  317. copy(writables, vl.writables)
  318. vl.accessLock.RUnlock()
  319. dataNodeId := NodeId(dataNode)
  320. for _, v := range writables {
  321. for _, dn := range vl.vid2location[v].list {
  322. dataNodeFound := false
  323. switch nodeType {
  324. case "DataCenter":
  325. dataNodeFound = dn.GetDataCenter().Id() == dataNodeId
  326. case "Rack":
  327. dataNodeFound = dn.GetRack().Id() == dataNodeId
  328. case "DataNode":
  329. dataNodeFound = dn.Id() == dataNodeId
  330. }
  331. if dataNodeFound {
  332. if info, err := dn.GetVolumesById(v); err == nil && !vl.isCrowdedVolume(&info) {
  333. return false
  334. }
  335. }
  336. }
  337. }
  338. return true
  339. }
// GetWritableVolumeCount returns the number of writable volumes and how many
// of those are currently marked crowded.
func (vl *VolumeLayout) GetWritableVolumeCount() (active, crowded int) {
	vl.accessLock.RLock()
	defer vl.accessLock.RUnlock()
	return len(vl.writables), len(vl.crowded)
}
  345. func (vl *VolumeLayout) removeFromWritable(vid needle.VolumeId) bool {
  346. toDeleteIndex := -1
  347. for k, id := range vl.writables {
  348. if id == vid {
  349. toDeleteIndex = k
  350. break
  351. }
  352. }
  353. if toDeleteIndex >= 0 {
  354. glog.V(0).Infoln("Volume", vid, "becomes unwritable")
  355. vl.writables = append(vl.writables[0:toDeleteIndex], vl.writables[toDeleteIndex+1:]...)
  356. vl.removeFromCrowded(vid)
  357. return true
  358. }
  359. return false
  360. }
  361. func (vl *VolumeLayout) setVolumeWritable(vid needle.VolumeId) bool {
  362. for _, v := range vl.writables {
  363. if v == vid {
  364. return false
  365. }
  366. }
  367. glog.V(0).Infoln("Volume", vid, "becomes writable")
  368. vl.writables = append(vl.writables, vid)
  369. return true
  370. }
// SetVolumeReadOnly records vid as read-only on dn and removes it from the
// writable set. Returns whether it was writable before; a volume unknown to
// this layout returns true.
func (vl *VolumeLayout) SetVolumeReadOnly(dn *DataNode, vid needle.VolumeId) bool {
	vl.accessLock.Lock()
	defer vl.accessLock.Unlock()
	if _, ok := vl.vid2location[vid]; ok {
		vl.readonlyVolumes.Add(vid, dn)
		return vl.removeFromWritable(vid)
	}
	return true
}
// SetVolumeWritable clears the read-only mark for vid on dn and, when the
// volume has enough replicas, returns it to the writable set.
func (vl *VolumeLayout) SetVolumeWritable(dn *DataNode, vid needle.VolumeId) bool {
	vl.accessLock.Lock()
	defer vl.accessLock.Unlock()
	if _, ok := vl.vid2location[vid]; ok {
		vl.readonlyVolumes.Remove(vid, dn)
	}
	// NOTE(review): enoughCopies calls Length() on vl.vid2location[vid]
	// even when the lookup above missed; this assumes VolumeLocationList
	// tolerates a nil receiver — confirm, or guard on the lookup result.
	if vl.enoughCopies(vid) {
		return vl.setVolumeWritable(vid)
	}
	return false
}
// SetVolumeUnavailable removes dn as a holder of vid; if that drops the
// replica count below the placement requirement, the volume is taken out of
// the writable set. Returns whether it was removed from writables.
func (vl *VolumeLayout) SetVolumeUnavailable(dn *DataNode, vid needle.VolumeId) bool {
	vl.accessLock.Lock()
	defer vl.accessLock.Unlock()
	if location, ok := vl.vid2location[vid]; ok {
		if location.Remove(dn) {
			vl.readonlyVolumes.Remove(vid, dn)
			vl.oversizedVolumes.Remove(vid, dn)
			if location.Length() < vl.rp.GetCopyCount() {
				glog.V(0).Infoln("Volume", vid, "has", location.Length(), "replica, less than required", vl.rp.GetCopyCount())
				return vl.removeFromWritable(vid)
			}
		}
	}
	return false
}
// SetVolumeAvailable records that dn again serves vid; the volume becomes
// writable when it is not read-only, not at capacity, and has enough
// replicas. Returns whether it was (re)added to the writable set.
func (vl *VolumeLayout) SetVolumeAvailable(dn *DataNode, vid needle.VolumeId, isReadOnly, isFullCapacity bool) bool {
	vl.accessLock.Lock()
	defer vl.accessLock.Unlock()
	vInfo, err := dn.GetVolumesById(vid)
	if err != nil {
		return false
	}
	// NOTE(review): assumes vid already exists in vid2location (as set up by
	// RegisterVolume); a missing entry would dereference nil here — confirm.
	vl.vid2location[vid].Set(dn)
	if vInfo.ReadOnly || isReadOnly || isFullCapacity {
		return false
	}
	if vl.enoughCopies(vid) {
		return vl.setVolumeWritable(vid)
	}
	return false
}
// enoughCopies reports whether vid has exactly the desired replica count, or
// at least that many when replicationAsMin is set. Caller must hold
// accessLock and ensure vid is tracked (the lookup result is dereferenced).
func (vl *VolumeLayout) enoughCopies(vid needle.VolumeId) bool {
	locations := vl.vid2location[vid].Length()
	desired := vl.rp.GetCopyCount()
	return locations == desired || (vl.replicationAsMin && locations > desired)
}
  427. func (vl *VolumeLayout) SetVolumeCapacityFull(vid needle.VolumeId) bool {
  428. vl.accessLock.Lock()
  429. defer vl.accessLock.Unlock()
  430. wasWritable := vl.removeFromWritable(vid)
  431. if wasWritable {
  432. glog.V(0).Infof("Volume %d reaches full capacity.", vid)
  433. }
  434. return wasWritable
  435. }
// removeFromCrowded drops vid from the crowded set. Caller must hold accessLock.
func (vl *VolumeLayout) removeFromCrowded(vid needle.VolumeId) {
	delete(vl.crowded, vid)
}
// setVolumeCrowded records vid as crowded, logging only on the first
// transition. Caller must hold accessLock (it mutates the crowded map).
func (vl *VolumeLayout) setVolumeCrowded(vid needle.VolumeId) {
	if _, ok := vl.crowded[vid]; !ok {
		vl.crowded[vid] = struct{}{}
		glog.V(0).Infoln("Volume", vid, "becomes crowded")
	}
}
  445. func (vl *VolumeLayout) SetVolumeCrowded(vid needle.VolumeId) {
  446. // since delete is guarded by accessLock.Lock(),
  447. // and is always called in sequential order,
  448. // RLock() should be safe enough
  449. vl.accessLock.RLock()
  450. defer vl.accessLock.RUnlock()
  451. vl.setVolumeCrowded(vid)
  452. }
// VolumeLayoutInfo is the JSON-serializable summary of a VolumeLayout.
type VolumeLayoutInfo struct {
	Replication string            `json:"replication"`
	TTL         string            `json:"ttl"`
	Writables   []needle.VolumeId `json:"writables"`
	Collection  string            `json:"collection"`
	DiskType    string            `json:"diskType"`
}
  460. func (vl *VolumeLayout) ToInfo() (info VolumeLayoutInfo) {
  461. info.Replication = vl.rp.String()
  462. info.TTL = vl.ttl.String()
  463. info.Writables = vl.writables
  464. info.DiskType = vl.diskType.ReadableString()
  465. //m["locations"] = vl.vid2location
  466. return
  467. }
// ToGrowOption builds the VolumeGrowOption matching this collection's layout
// parameters (placement, ttl, disk type).
func (vlc *VolumeLayoutCollection) ToGrowOption() (option *VolumeGrowOption) {
	return &VolumeGrowOption{
		Collection:       vlc.Collection,
		ReplicaPlacement: vlc.VolumeLayout.rp,
		Ttl:              vlc.VolumeLayout.ttl,
		DiskType:         vlc.VolumeLayout.diskType,
	}
}
// Stats sums usage across all volumes, counting every replica individually.
// TotalSize uses the actual size for read-only volumes (they cannot grow)
// and the configured size limit for growable ones.
func (vl *VolumeLayout) Stats() *VolumeLayoutStats {
	vl.accessLock.RLock()
	defer vl.accessLock.RUnlock()
	ret := &VolumeLayoutStats{}
	// Cutoff 60s in the past; presumably vll.Stats ignores replicas whose
	// last report is older than this — confirm against VolumeLocationList.
	freshThreshold := time.Now().Unix() - 60
	for vid, vll := range vl.vid2location {
		size, fileCount := vll.Stats(vid, freshThreshold)
		ret.FileCount += uint64(fileCount)
		ret.UsedSize += size * uint64(vll.Length())
		if vl.readonlyVolumes.IsTrue(vid) {
			ret.TotalSize += size * uint64(vll.Length())
		} else {
			ret.TotalSize += vl.volumeSizeLimit * uint64(vll.Length())
		}
	}
	return ret
}