volume_grpc_erasure_coding.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413
  1. package weed_server
  2. import (
  3. "context"
  4. "fmt"
  5. "io"
  6. "math"
  7. "os"
  8. "path"
  9. "strings"
  10. "github.com/chrislusf/seaweedfs/weed/glog"
  11. "github.com/chrislusf/seaweedfs/weed/operation"
  12. "github.com/chrislusf/seaweedfs/weed/pb"
  13. "github.com/chrislusf/seaweedfs/weed/pb/volume_server_pb"
  14. "github.com/chrislusf/seaweedfs/weed/storage"
  15. "github.com/chrislusf/seaweedfs/weed/storage/erasure_coding"
  16. "github.com/chrislusf/seaweedfs/weed/storage/needle"
  17. "github.com/chrislusf/seaweedfs/weed/storage/types"
  18. "github.com/chrislusf/seaweedfs/weed/storage/volume_info"
  19. "github.com/chrislusf/seaweedfs/weed/util"
  20. )
  21. /*
  22. Steps to apply erasure coding to .dat .idx files
  23. 0. ensure the volume is readonly
  24. 1. client call VolumeEcShardsGenerate to generate the .ecx and .ec00 ~ .ec13 files
  25. 2. client ask master for possible servers to hold the ec files
  26. 3. client call VolumeEcShardsCopy on above target servers to copy ec files from the source server
  27. 4. target servers report the new ec files to the master
  28. 5. master stores vid -> [14]*DataNode
6. client checks master. If all 14 slices are ready, delete the original .dat, .idx files
  30. */
  31. // VolumeEcShardsGenerate generates the .ecx and .ec00 ~ .ec13 files
  32. func (vs *VolumeServer) VolumeEcShardsGenerate(ctx context.Context, req *volume_server_pb.VolumeEcShardsGenerateRequest) (*volume_server_pb.VolumeEcShardsGenerateResponse, error) {
  33. glog.V(0).Infof("VolumeEcShardsGenerate: %v", req)
  34. v := vs.store.GetVolume(needle.VolumeId(req.VolumeId))
  35. if v == nil {
  36. return nil, fmt.Errorf("volume %d not found", req.VolumeId)
  37. }
  38. baseFileName := v.DataFileName()
  39. if v.Collection != req.Collection {
  40. return nil, fmt.Errorf("existing collection:%v unexpected input: %v", v.Collection, req.Collection)
  41. }
  42. shouldCleanup := true
  43. defer func() {
  44. if !shouldCleanup {
  45. return
  46. }
  47. for i := 0; i < erasure_coding.TotalShardsCount; i++ {
  48. os.Remove(fmt.Sprintf("%s.ec%2d", baseFileName, i))
  49. }
  50. os.Remove(v.IndexFileName() + ".ecx")
  51. }()
  52. // write .ec00 ~ .ec13 files
  53. if err := erasure_coding.WriteEcFiles(baseFileName); err != nil {
  54. return nil, fmt.Errorf("WriteEcFiles %s: %v", baseFileName, err)
  55. }
  56. // write .ecx file
  57. if err := erasure_coding.WriteSortedFileFromIdx(v.IndexFileName(), ".ecx"); err != nil {
  58. return nil, fmt.Errorf("WriteSortedFileFromIdx %s: %v", v.IndexFileName(), err)
  59. }
  60. // write .vif files
  61. if err := volume_info.SaveVolumeInfo(baseFileName+".vif", &volume_server_pb.VolumeInfo{Version: uint32(v.Version())}); err != nil {
  62. return nil, fmt.Errorf("WriteEcFiles %s: %v", baseFileName, err)
  63. }
  64. shouldCleanup = false
  65. return &volume_server_pb.VolumeEcShardsGenerateResponse{}, nil
  66. }
  67. // VolumeEcShardsRebuild generates the any of the missing .ec00 ~ .ec13 files
  68. func (vs *VolumeServer) VolumeEcShardsRebuild(ctx context.Context, req *volume_server_pb.VolumeEcShardsRebuildRequest) (*volume_server_pb.VolumeEcShardsRebuildResponse, error) {
  69. glog.V(0).Infof("VolumeEcShardsRebuild: %v", req)
  70. baseFileName := erasure_coding.EcShardBaseFileName(req.Collection, int(req.VolumeId))
  71. var rebuiltShardIds []uint32
  72. for _, location := range vs.store.Locations {
  73. if util.FileExists(path.Join(location.IdxDirectory, baseFileName+".ecx")) {
  74. // write .ec00 ~ .ec13 files
  75. dataBaseFileName := path.Join(location.Directory, baseFileName)
  76. if generatedShardIds, err := erasure_coding.RebuildEcFiles(dataBaseFileName); err != nil {
  77. return nil, fmt.Errorf("RebuildEcFiles %s: %v", dataBaseFileName, err)
  78. } else {
  79. rebuiltShardIds = generatedShardIds
  80. }
  81. indexBaseFileName := path.Join(location.IdxDirectory, baseFileName)
  82. if err := erasure_coding.RebuildEcxFile(indexBaseFileName); err != nil {
  83. return nil, fmt.Errorf("RebuildEcxFile %s: %v", dataBaseFileName, err)
  84. }
  85. break
  86. }
  87. }
  88. return &volume_server_pb.VolumeEcShardsRebuildResponse{
  89. RebuiltShardIds: rebuiltShardIds,
  90. }, nil
  91. }
// VolumeEcShardsCopy copy the .ecx and some ec data slices
// from req.SourceDataNode onto a local location with free space.
func (vs *VolumeServer) VolumeEcShardsCopy(ctx context.Context, req *volume_server_pb.VolumeEcShardsCopyRequest) (*volume_server_pb.VolumeEcShardsCopyResponse, error) {

	glog.V(0).Infof("VolumeEcShardsCopy: %v", req)

	// Pick a hard-drive location with free space to receive the copied files.
	location := vs.store.FindFreeLocation(types.HardDriveType)
	if location == nil {
		return nil, fmt.Errorf("no space left")
	}

	// Shard/.vif files live in the data directory; .ecx/.ecj in the index directory.
	dataBaseFileName := storage.VolumeFileName(location.Directory, req.Collection, int(req.VolumeId))
	indexBaseFileName := storage.VolumeFileName(location.IdxDirectory, req.Collection, int(req.VolumeId))

	err := operation.WithVolumeServerClient(true, pb.ServerAddress(req.SourceDataNode), vs.grpcDialOption, func(client volume_server_pb.VolumeServerClient) error {

		// copy ec data slices
		for _, shardId := range req.ShardIds {
			if _, err := vs.doCopyFile(client, true, req.Collection, req.VolumeId, math.MaxUint32, math.MaxInt64, dataBaseFileName, erasure_coding.ToExt(int(shardId)), false, false, nil); err != nil {
				return err
			}
		}

		if req.CopyEcxFile {

			// copy ecx file
			if _, err := vs.doCopyFile(client, true, req.Collection, req.VolumeId, math.MaxUint32, math.MaxInt64, indexBaseFileName, ".ecx", false, false, nil); err != nil {
				return err
			}
			// NOTE(review): returning here skips the .ecj and .vif copies even if
			// CopyEcjFile/CopyVifFile are also set — confirm callers never combine
			// CopyEcxFile with the other two flags.
			return nil
		}

		if req.CopyEcjFile {
			// copy ecj file
			if _, err := vs.doCopyFile(client, true, req.Collection, req.VolumeId, math.MaxUint32, math.MaxInt64, indexBaseFileName, ".ecj", true, true, nil); err != nil {
				return err
			}
		}

		if req.CopyVifFile {
			// copy vif file
			if _, err := vs.doCopyFile(client, true, req.Collection, req.VolumeId, math.MaxUint32, math.MaxInt64, dataBaseFileName, ".vif", false, true, nil); err != nil {
				return err
			}
		}

		return nil
	})
	if err != nil {
		return nil, fmt.Errorf("VolumeEcShardsCopy volume %d: %v", req.VolumeId, err)
	}

	return &volume_server_pb.VolumeEcShardsCopyResponse{}, nil
}
// VolumeEcShardsDelete local delete the .ecx and some ec data slices if not needed
// the shard should not be mounted before calling this.
func (vs *VolumeServer) VolumeEcShardsDelete(ctx context.Context, req *volume_server_pb.VolumeEcShardsDeleteRequest) (*volume_server_pb.VolumeEcShardsDeleteResponse, error) {

	bName := erasure_coding.EcShardBaseFileName(req.Collection, int(req.VolumeId))

	glog.V(0).Infof("ec volume %d shard delete %v", req.VolumeId, req.ShardIds)

	// Find the location holding this ec volume (identified by its .ecx file)
	// and remove the requested shard files there; only the first match is used.
	found := false
	var indexBaseFilename, dataBaseFilename string
	for _, location := range vs.store.Locations {
		if util.FileExists(path.Join(location.IdxDirectory, bName+".ecx")) {
			found = true
			indexBaseFilename = path.Join(location.IdxDirectory, bName)
			dataBaseFilename = path.Join(location.Directory, bName)
			for _, shardId := range req.ShardIds {
				// best-effort removal: a missing shard file is not an error
				os.Remove(dataBaseFilename + erasure_coding.ToExt(int(shardId)))
			}
			break
		}
	}

	if !found {
		// NOTE(review): returns a nil response with a nil error; gRPC handlers
		// conventionally return a non-nil empty response — confirm callers
		// tolerate a nil message here.
		return nil, nil
	}

	// check whether to delete the .ecx and .ecj file also
	hasEcxFile := false
	hasIdxFile := false
	existingShardCount := 0

	// Scan all locations (data and index directories may differ) to count the
	// remaining .ec* shard files and detect .ecx/.ecj or a plain .idx file.
	for _, location := range vs.store.Locations {
		fileInfos, err := os.ReadDir(location.Directory)
		if err != nil {
			// unreadable location: skip rather than fail the whole request
			continue
		}
		if location.IdxDirectory != location.Directory {
			idxFileInfos, err := os.ReadDir(location.IdxDirectory)
			if err != nil {
				continue
			}
			fileInfos = append(fileInfos, idxFileInfos...)
		}
		for _, fileInfo := range fileInfos {
			if fileInfo.Name() == bName+".ecx" || fileInfo.Name() == bName+".ecj" {
				hasEcxFile = true
				continue
			}
			if fileInfo.Name() == bName+".idx" {
				hasIdxFile = true
				continue
			}
			// counts the .ec00 ~ .ec13 shard files; the ".ec" prefix would also
			// match .ecx/.ecj, but those were consumed by the checks above
			if strings.HasPrefix(fileInfo.Name(), bName+".ec") {
				existingShardCount++
			}
		}
	}

	// Only remove the index files once no shard files remain anywhere locally.
	if hasEcxFile && existingShardCount == 0 {
		if err := os.Remove(indexBaseFilename + ".ecx"); err != nil {
			return nil, err
		}
		os.Remove(indexBaseFilename + ".ecj")
	}

	if !hasIdxFile {
		// .vif is used for ec volumes and normal volumes
		os.Remove(dataBaseFilename + ".vif")
	}

	return &volume_server_pb.VolumeEcShardsDeleteResponse{}, nil
}
  197. func (vs *VolumeServer) VolumeEcShardsMount(ctx context.Context, req *volume_server_pb.VolumeEcShardsMountRequest) (*volume_server_pb.VolumeEcShardsMountResponse, error) {
  198. glog.V(0).Infof("VolumeEcShardsMount: %v", req)
  199. for _, shardId := range req.ShardIds {
  200. err := vs.store.MountEcShards(req.Collection, needle.VolumeId(req.VolumeId), erasure_coding.ShardId(shardId))
  201. if err != nil {
  202. glog.Errorf("ec shard mount %v: %v", req, err)
  203. } else {
  204. glog.V(2).Infof("ec shard mount %v", req)
  205. }
  206. if err != nil {
  207. return nil, fmt.Errorf("mount %d.%d: %v", req.VolumeId, shardId, err)
  208. }
  209. }
  210. return &volume_server_pb.VolumeEcShardsMountResponse{}, nil
  211. }
  212. func (vs *VolumeServer) VolumeEcShardsUnmount(ctx context.Context, req *volume_server_pb.VolumeEcShardsUnmountRequest) (*volume_server_pb.VolumeEcShardsUnmountResponse, error) {
  213. glog.V(0).Infof("VolumeEcShardsUnmount: %v", req)
  214. for _, shardId := range req.ShardIds {
  215. err := vs.store.UnmountEcShards(needle.VolumeId(req.VolumeId), erasure_coding.ShardId(shardId))
  216. if err != nil {
  217. glog.Errorf("ec shard unmount %v: %v", req, err)
  218. } else {
  219. glog.V(2).Infof("ec shard unmount %v", req)
  220. }
  221. if err != nil {
  222. return nil, fmt.Errorf("unmount %d.%d: %v", req.VolumeId, shardId, err)
  223. }
  224. }
  225. return &volume_server_pb.VolumeEcShardsUnmountResponse{}, nil
  226. }
// VolumeEcShardRead streams req.Size bytes starting at req.Offset from one ec
// shard to the client, in chunks of at most BufferSizeLimit bytes. When
// req.FileKey is set and the needle is marked deleted in the .ecx index, a
// single IsDeleted response is sent instead.
func (vs *VolumeServer) VolumeEcShardRead(req *volume_server_pb.VolumeEcShardReadRequest, stream volume_server_pb.VolumeServer_VolumeEcShardReadServer) error {

	ecVolume, found := vs.store.FindEcVolume(needle.VolumeId(req.VolumeId))
	if !found {
		return fmt.Errorf("VolumeEcShardRead not found ec volume id %d", req.VolumeId)
	}
	ecShard, found := ecVolume.FindEcVolumeShard(erasure_coding.ShardId(req.ShardId))
	if !found {
		return fmt.Errorf("not found ec shard %d.%d", req.VolumeId, req.ShardId)
	}

	// Short-circuit with a tombstone response when the requested needle is deleted.
	if req.FileKey != 0 {
		_, size, _ := ecVolume.FindNeedleFromEcx(types.Uint64ToNeedleId(req.FileKey))
		if size.IsDeleted() {
			return stream.Send(&volume_server_pb.VolumeEcShardReadResponse{
				IsDeleted: true,
			})
		}
	}

	// Cap the chunk buffer at BufferSizeLimit regardless of the requested size.
	bufSize := req.Size
	if bufSize > BufferSizeLimit {
		bufSize = BufferSizeLimit
	}
	buffer := make([]byte, bufSize)

	startOffset, bytesToRead := req.Offset, req.Size

	for bytesToRead > 0 {
		// min of bytesToRead and bufSize
		bufferSize := bufSize
		if bufferSize > bytesToRead {
			bufferSize = bytesToRead
		}
		bytesread, err := ecShard.ReadAt(buffer[0:bufferSize], startOffset)

		// println("read", ecShard.FileName(), "startOffset", startOffset, bytesread, "bytes, with target", bufferSize)
		// A read may return data AND an error (e.g. io.EOF); send the data first,
		// then decide what the error means.
		if bytesread > 0 {

			// defensive clamp: never send more than the client still expects
			if int64(bytesread) > bytesToRead {
				bytesread = int(bytesToRead)
			}
			err = stream.Send(&volume_server_pb.VolumeEcShardReadResponse{
				Data: buffer[:bytesread],
			})
			if err != nil {
				// println("sending", bytesread, "bytes err", err.Error())
				return err
			}

			startOffset += int64(bytesread)
			bytesToRead -= int64(bytesread)

		}

		if err != nil {
			// io.EOF ends the stream normally; anything else is a real failure.
			// Note: a Send error above also lands here via the reused err variable.
			if err != io.EOF {
				return err
			}
			return nil
		}

	}

	return nil

}
  281. func (vs *VolumeServer) VolumeEcBlobDelete(ctx context.Context, req *volume_server_pb.VolumeEcBlobDeleteRequest) (*volume_server_pb.VolumeEcBlobDeleteResponse, error) {
  282. glog.V(0).Infof("VolumeEcBlobDelete: %v", req)
  283. resp := &volume_server_pb.VolumeEcBlobDeleteResponse{}
  284. for _, location := range vs.store.Locations {
  285. if localEcVolume, found := location.FindEcVolume(needle.VolumeId(req.VolumeId)); found {
  286. _, size, _, err := localEcVolume.LocateEcShardNeedle(types.NeedleId(req.FileKey), needle.Version(req.Version))
  287. if err != nil {
  288. return nil, fmt.Errorf("locate in local ec volume: %v", err)
  289. }
  290. if size.IsDeleted() {
  291. return resp, nil
  292. }
  293. err = localEcVolume.DeleteNeedleFromEcx(types.NeedleId(req.FileKey))
  294. if err != nil {
  295. return nil, err
  296. }
  297. break
  298. }
  299. }
  300. return resp, nil
  301. }
  302. // VolumeEcShardsToVolume generates the .idx, .dat files from .ecx, .ecj and .ec01 ~ .ec14 files
  303. func (vs *VolumeServer) VolumeEcShardsToVolume(ctx context.Context, req *volume_server_pb.VolumeEcShardsToVolumeRequest) (*volume_server_pb.VolumeEcShardsToVolumeResponse, error) {
  304. glog.V(0).Infof("VolumeEcShardsToVolume: %v", req)
  305. v, found := vs.store.FindEcVolume(needle.VolumeId(req.VolumeId))
  306. if !found {
  307. return nil, fmt.Errorf("ec volume %d not found", req.VolumeId)
  308. }
  309. if v.Collection != req.Collection {
  310. return nil, fmt.Errorf("existing collection:%v unexpected input: %v", v.Collection, req.Collection)
  311. }
  312. dataBaseFileName, indexBaseFileName := v.DataBaseFileName(), v.IndexBaseFileName()
  313. // calculate .dat file size
  314. datFileSize, err := erasure_coding.FindDatFileSize(dataBaseFileName, indexBaseFileName)
  315. if err != nil {
  316. return nil, fmt.Errorf("FindDatFileSize %s: %v", dataBaseFileName, err)
  317. }
  318. // write .dat file from .ec00 ~ .ec09 files
  319. if err := erasure_coding.WriteDatFile(dataBaseFileName, datFileSize); err != nil {
  320. return nil, fmt.Errorf("WriteEcFiles %s: %v", dataBaseFileName, err)
  321. }
  322. // write .idx file from .ecx and .ecj files
  323. if err := erasure_coding.WriteIdxFileFromEcIndex(indexBaseFileName); err != nil {
  324. return nil, fmt.Errorf("WriteIdxFileFromEcIndex %s: %v", v.IndexBaseFileName(), err)
  325. }
  326. return &volume_server_pb.VolumeEcShardsToVolumeResponse{}, nil
  327. }