topology.go 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372
  1. package topology
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "fmt"
  6. "math/rand"
  7. "sync"
  8. "time"
  9. "github.com/seaweedfs/seaweedfs/weed/pb"
  10. "github.com/seaweedfs/seaweedfs/weed/storage/types"
  11. backoff "github.com/cenkalti/backoff/v4"
  12. hashicorpRaft "github.com/hashicorp/raft"
  13. "github.com/seaweedfs/raft"
  14. "github.com/seaweedfs/seaweedfs/weed/glog"
  15. "github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
  16. "github.com/seaweedfs/seaweedfs/weed/sequence"
  17. "github.com/seaweedfs/seaweedfs/weed/storage"
  18. "github.com/seaweedfs/seaweedfs/weed/storage/needle"
  19. "github.com/seaweedfs/seaweedfs/weed/storage/super_block"
  20. "github.com/seaweedfs/seaweedfs/weed/util"
  21. )
  22. type Topology struct {
  23. vacuumLockCounter int64
  24. NodeImpl
  25. collectionMap *util.ConcurrentReadMap
  26. ecShardMap map[needle.VolumeId]*EcShardLocations
  27. ecShardMapLock sync.RWMutex
  28. pulse int64
  29. volumeSizeLimit uint64
  30. replicationAsMin bool
  31. isDisableVacuum bool
  32. Sequence sequence.Sequencer
  33. chanFullVolumes chan storage.VolumeInfo
  34. chanCrowdedVolumes chan storage.VolumeInfo
  35. Configuration *Configuration
  36. RaftServer raft.Server
  37. RaftServerAccessLock sync.RWMutex
  38. HashicorpRaft *hashicorpRaft.Raft
  39. UuidAccessLock sync.RWMutex
  40. UuidMap map[string][]string
  41. }
  42. func NewTopology(id string, seq sequence.Sequencer, volumeSizeLimit uint64, pulse int, replicationAsMin bool) *Topology {
  43. t := &Topology{}
  44. t.id = NodeId(id)
  45. t.nodeType = "Topology"
  46. t.NodeImpl.value = t
  47. t.diskUsages = newDiskUsages()
  48. t.children = make(map[NodeId]Node)
  49. t.collectionMap = util.NewConcurrentReadMap()
  50. t.ecShardMap = make(map[needle.VolumeId]*EcShardLocations)
  51. t.pulse = int64(pulse)
  52. t.volumeSizeLimit = volumeSizeLimit
  53. t.replicationAsMin = replicationAsMin
  54. t.Sequence = seq
  55. t.chanFullVolumes = make(chan storage.VolumeInfo)
  56. t.chanCrowdedVolumes = make(chan storage.VolumeInfo)
  57. t.Configuration = &Configuration{}
  58. return t
  59. }
  60. func (t *Topology) IsChildLocked() (bool, error) {
  61. if t.IsLocked() {
  62. return true, errors.New("topology is locked")
  63. }
  64. for _, dcNode := range t.Children() {
  65. if dcNode.IsLocked() {
  66. return true, fmt.Errorf("topology child %s is locked", dcNode.String())
  67. }
  68. for _, rackNode := range dcNode.Children() {
  69. if rackNode.IsLocked() {
  70. return true, fmt.Errorf("dc %s child %s is locked", dcNode.String(), rackNode.String())
  71. }
  72. for _, dataNode := range rackNode.Children() {
  73. if dataNode.IsLocked() {
  74. return true, fmt.Errorf("rack %s child %s is locked", rackNode.String(), dataNode.Id())
  75. }
  76. }
  77. }
  78. }
  79. return false, nil
  80. }
  81. func (t *Topology) IsLeader() bool {
  82. t.RaftServerAccessLock.RLock()
  83. defer t.RaftServerAccessLock.RUnlock()
  84. if t.RaftServer != nil {
  85. if t.RaftServer.State() == raft.Leader {
  86. return true
  87. }
  88. if leader, err := t.Leader(); err == nil {
  89. if pb.ServerAddress(t.RaftServer.Name()) == leader {
  90. return true
  91. }
  92. }
  93. } else if t.HashicorpRaft != nil {
  94. if t.HashicorpRaft.State() == hashicorpRaft.Leader {
  95. return true
  96. }
  97. }
  98. return false
  99. }
  100. func (t *Topology) Leader() (l pb.ServerAddress, err error) {
  101. exponentialBackoff := backoff.NewExponentialBackOff()
  102. exponentialBackoff.InitialInterval = 100 * time.Millisecond
  103. exponentialBackoff.MaxElapsedTime = 20 * time.Second
  104. leaderNotSelected := errors.New("leader not selected yet")
  105. l, err = backoff.RetryWithData(
  106. func() (l pb.ServerAddress, err error) {
  107. l, err = t.MaybeLeader()
  108. if err == nil && l == "" {
  109. err = leaderNotSelected
  110. }
  111. return l, err
  112. },
  113. exponentialBackoff)
  114. if err == leaderNotSelected {
  115. l = ""
  116. }
  117. return l, err
  118. }
  119. func (t *Topology) MaybeLeader() (l pb.ServerAddress, err error) {
  120. t.RaftServerAccessLock.RLock()
  121. defer t.RaftServerAccessLock.RUnlock()
  122. if t.RaftServer != nil {
  123. l = pb.ServerAddress(t.RaftServer.Leader())
  124. } else if t.HashicorpRaft != nil {
  125. l = pb.ServerAddress(t.HashicorpRaft.Leader())
  126. } else {
  127. err = errors.New("Raft Server not ready yet!")
  128. }
  129. return
  130. }
  131. func (t *Topology) Lookup(collection string, vid needle.VolumeId) (dataNodes []*DataNode) {
  132. // maybe an issue if lots of collections?
  133. if collection == "" {
  134. for _, c := range t.collectionMap.Items() {
  135. if list := c.(*Collection).Lookup(vid); list != nil {
  136. return list
  137. }
  138. }
  139. } else {
  140. if c, ok := t.collectionMap.Find(collection); ok {
  141. return c.(*Collection).Lookup(vid)
  142. }
  143. }
  144. if locations, found := t.LookupEcShards(vid); found {
  145. for _, loc := range locations.Locations {
  146. dataNodes = append(dataNodes, loc...)
  147. }
  148. return dataNodes
  149. }
  150. return nil
  151. }
  152. func (t *Topology) NextVolumeId() (needle.VolumeId, error) {
  153. vid := t.GetMaxVolumeId()
  154. next := vid.Next()
  155. t.RaftServerAccessLock.RLock()
  156. defer t.RaftServerAccessLock.RUnlock()
  157. if t.RaftServer != nil {
  158. if _, err := t.RaftServer.Do(NewMaxVolumeIdCommand(next)); err != nil {
  159. return 0, err
  160. }
  161. } else if t.HashicorpRaft != nil {
  162. b, err := json.Marshal(NewMaxVolumeIdCommand(next))
  163. if err != nil {
  164. return 0, fmt.Errorf("failed marshal NewMaxVolumeIdCommand: %+v", err)
  165. }
  166. if future := t.HashicorpRaft.Apply(b, time.Second); future.Error() != nil {
  167. return 0, future.Error()
  168. }
  169. }
  170. return next, nil
  171. }
  172. func (t *Topology) PickForWrite(count uint64, option *VolumeGrowOption) (string, uint64, *VolumeLocationList, error) {
  173. vid, count, datanodes, err := t.GetVolumeLayout(option.Collection, option.ReplicaPlacement, option.Ttl, option.DiskType).PickForWrite(count, option)
  174. if err != nil {
  175. return "", 0, nil, fmt.Errorf("failed to find writable volumes for collection:%s replication:%s ttl:%s error: %v", option.Collection, option.ReplicaPlacement.String(), option.Ttl.String(), err)
  176. }
  177. if datanodes.Length() == 0 {
  178. return "", 0, nil, fmt.Errorf("no writable volumes available for collection:%s replication:%s ttl:%s", option.Collection, option.ReplicaPlacement.String(), option.Ttl.String())
  179. }
  180. fileId := t.Sequence.NextFileId(count)
  181. return needle.NewFileId(*vid, fileId, rand.Uint32()).String(), count, datanodes, nil
  182. }
  183. func (t *Topology) GetVolumeLayout(collectionName string, rp *super_block.ReplicaPlacement, ttl *needle.TTL, diskType types.DiskType) *VolumeLayout {
  184. return t.collectionMap.Get(collectionName, func() interface{} {
  185. return NewCollection(collectionName, t.volumeSizeLimit, t.replicationAsMin)
  186. }).(*Collection).GetOrCreateVolumeLayout(rp, ttl, diskType)
  187. }
  188. func (t *Topology) ListCollections(includeNormalVolumes, includeEcVolumes bool) (ret []string) {
  189. mapOfCollections := make(map[string]bool)
  190. for _, c := range t.collectionMap.Items() {
  191. mapOfCollections[c.(*Collection).Name] = true
  192. }
  193. if includeEcVolumes {
  194. t.ecShardMapLock.RLock()
  195. for _, ecVolumeLocation := range t.ecShardMap {
  196. mapOfCollections[ecVolumeLocation.Collection] = true
  197. }
  198. t.ecShardMapLock.RUnlock()
  199. }
  200. for k := range mapOfCollections {
  201. ret = append(ret, k)
  202. }
  203. return ret
  204. }
  205. func (t *Topology) FindCollection(collectionName string) (*Collection, bool) {
  206. c, hasCollection := t.collectionMap.Find(collectionName)
  207. if !hasCollection {
  208. return nil, false
  209. }
  210. return c.(*Collection), hasCollection
  211. }
  212. func (t *Topology) DeleteCollection(collectionName string) {
  213. t.collectionMap.Delete(collectionName)
  214. }
  215. func (t *Topology) DeleteLayout(collectionName string, rp *super_block.ReplicaPlacement, ttl *needle.TTL, diskType types.DiskType) {
  216. collection, found := t.FindCollection(collectionName)
  217. if !found {
  218. return
  219. }
  220. collection.DeleteVolumeLayout(rp, ttl, diskType)
  221. if len(collection.storageType2VolumeLayout.Items()) == 0 {
  222. t.DeleteCollection(collectionName)
  223. }
  224. }
  225. func (t *Topology) RegisterVolumeLayout(v storage.VolumeInfo, dn *DataNode) {
  226. diskType := types.ToDiskType(v.DiskType)
  227. vl := t.GetVolumeLayout(v.Collection, v.ReplicaPlacement, v.Ttl, diskType)
  228. vl.RegisterVolume(&v, dn)
  229. vl.EnsureCorrectWritables(&v)
  230. }
  231. func (t *Topology) UnRegisterVolumeLayout(v storage.VolumeInfo, dn *DataNode) {
  232. glog.Infof("removing volume info: %+v from %v", v, dn.id)
  233. diskType := types.ToDiskType(v.DiskType)
  234. volumeLayout := t.GetVolumeLayout(v.Collection, v.ReplicaPlacement, v.Ttl, diskType)
  235. volumeLayout.UnRegisterVolume(&v, dn)
  236. if volumeLayout.isEmpty() {
  237. t.DeleteLayout(v.Collection, v.ReplicaPlacement, v.Ttl, diskType)
  238. }
  239. }
  240. func (t *Topology) GetOrCreateDataCenter(dcName string) *DataCenter {
  241. t.Lock()
  242. defer t.Unlock()
  243. for _, c := range t.children {
  244. dc := c.(*DataCenter)
  245. if string(dc.Id()) == dcName {
  246. return dc
  247. }
  248. }
  249. dc := NewDataCenter(dcName)
  250. t.doLinkChildNode(dc)
  251. return dc
  252. }
  253. func (t *Topology) SyncDataNodeRegistration(volumes []*master_pb.VolumeInformationMessage, dn *DataNode) (newVolumes, deletedVolumes []storage.VolumeInfo) {
  254. // convert into in memory struct storage.VolumeInfo
  255. var volumeInfos []storage.VolumeInfo
  256. for _, v := range volumes {
  257. if vi, err := storage.NewVolumeInfo(v); err == nil {
  258. volumeInfos = append(volumeInfos, vi)
  259. } else {
  260. glog.V(0).Infof("Fail to convert joined volume information: %v", err)
  261. }
  262. }
  263. // find out the delta volumes
  264. var changedVolumes []storage.VolumeInfo
  265. newVolumes, deletedVolumes, changedVolumes = dn.UpdateVolumes(volumeInfos)
  266. for _, v := range newVolumes {
  267. t.RegisterVolumeLayout(v, dn)
  268. }
  269. for _, v := range deletedVolumes {
  270. t.UnRegisterVolumeLayout(v, dn)
  271. }
  272. for _, v := range changedVolumes {
  273. diskType := types.ToDiskType(v.DiskType)
  274. vl := t.GetVolumeLayout(v.Collection, v.ReplicaPlacement, v.Ttl, diskType)
  275. vl.EnsureCorrectWritables(&v)
  276. }
  277. return
  278. }
  279. func (t *Topology) IncrementalSyncDataNodeRegistration(newVolumes, deletedVolumes []*master_pb.VolumeShortInformationMessage, dn *DataNode) {
  280. var newVis, oldVis []storage.VolumeInfo
  281. for _, v := range newVolumes {
  282. vi, err := storage.NewVolumeInfoFromShort(v)
  283. if err != nil {
  284. glog.V(0).Infof("NewVolumeInfoFromShort %v: %v", v, err)
  285. continue
  286. }
  287. newVis = append(newVis, vi)
  288. }
  289. for _, v := range deletedVolumes {
  290. vi, err := storage.NewVolumeInfoFromShort(v)
  291. if err != nil {
  292. glog.V(0).Infof("NewVolumeInfoFromShort %v: %v", v, err)
  293. continue
  294. }
  295. oldVis = append(oldVis, vi)
  296. }
  297. dn.DeltaUpdateVolumes(newVis, oldVis)
  298. for _, vi := range newVis {
  299. t.RegisterVolumeLayout(vi, dn)
  300. }
  301. for _, vi := range oldVis {
  302. t.UnRegisterVolumeLayout(vi, dn)
  303. }
  304. return
  305. }
  306. func (t *Topology) DataNodeRegistration(dcName, rackName string, dn *DataNode) {
  307. if dn.Parent() != nil {
  308. return
  309. }
  310. // registration to topo
  311. dc := t.GetOrCreateDataCenter(dcName)
  312. rack := dc.GetOrCreateRack(rackName)
  313. rack.LinkChildNode(dn)
  314. glog.Infof("[%s] reLink To topo ", dn.Id())
  315. }
  316. func (t *Topology) DisableVacuum() {
  317. glog.V(0).Infof("DisableVacuum")
  318. t.isDisableVacuum = true
  319. }
  320. func (t *Topology) EnableVacuum() {
  321. glog.V(0).Infof("EnableVacuum")
  322. t.isDisableVacuum = false
  323. }