volume_vacuum.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447
  1. package storage
  2. import (
  3. "fmt"
  4. "os"
  5. "runtime"
  6. "time"
  7. "github.com/chrislusf/seaweedfs/weed/glog"
  8. "github.com/chrislusf/seaweedfs/weed/stats"
  9. "github.com/chrislusf/seaweedfs/weed/storage/backend"
  10. idx2 "github.com/chrislusf/seaweedfs/weed/storage/idx"
  11. "github.com/chrislusf/seaweedfs/weed/storage/needle"
  12. "github.com/chrislusf/seaweedfs/weed/storage/needle_map"
  13. "github.com/chrislusf/seaweedfs/weed/storage/super_block"
  14. . "github.com/chrislusf/seaweedfs/weed/storage/types"
  15. "github.com/chrislusf/seaweedfs/weed/util"
  16. )
  17. func (v *Volume) garbageLevel() float64 {
  18. if v.ContentSize() == 0 {
  19. return 0
  20. }
  21. deletedSize := v.DeletedSize()
  22. fileSize := v.ContentSize()
  23. if v.DeletedCount() > 0 && v.DeletedSize() == 0 {
  24. // this happens for .sdx converted back to normal .idx
  25. // where deleted entry size is missing
  26. datFileSize, _, _ := v.FileStat()
  27. deletedSize = datFileSize - fileSize - super_block.SuperBlockSize
  28. fileSize = datFileSize
  29. }
  30. return float64(deletedSize) / float64(fileSize)
  31. }
  32. // compact a volume based on deletions in .dat files
  33. func (v *Volume) Compact(preallocate int64, compactionBytePerSecond int64) error {
  34. if v.MemoryMapMaxSizeMb != 0 { //it makes no sense to compact in memory
  35. return nil
  36. }
  37. glog.V(3).Infof("Compacting volume %d ...", v.Id)
  38. //no need to lock for copy on write
  39. //v.accessLock.Lock()
  40. //defer v.accessLock.Unlock()
  41. //glog.V(3).Infof("Got Compaction lock...")
  42. v.isCompacting = true
  43. defer func() {
  44. v.isCompacting = false
  45. }()
  46. filePath := v.FileName()
  47. v.lastCompactIndexOffset = v.IndexFileSize()
  48. v.lastCompactRevision = v.SuperBlock.CompactionRevision
  49. glog.V(3).Infof("creating copies for volume %d ,last offset %d...", v.Id, v.lastCompactIndexOffset)
  50. if err := v.DataBackend.Sync(); err != nil {
  51. glog.V(0).Infof("compact fail to sync volume %d", v.Id)
  52. }
  53. if err := v.nm.Sync(); err != nil {
  54. glog.V(0).Infof("compact fail to sync volume idx %d", v.Id)
  55. }
  56. return v.copyDataAndGenerateIndexFile(filePath+".cpd", filePath+".cpx", preallocate, compactionBytePerSecond)
  57. }
  58. // compact a volume based on deletions in .idx files
  59. func (v *Volume) Compact2(preallocate int64, compactionBytePerSecond int64) error {
  60. if v.MemoryMapMaxSizeMb != 0 { //it makes no sense to compact in memory
  61. return nil
  62. }
  63. glog.V(3).Infof("Compact2 volume %d ...", v.Id)
  64. v.isCompacting = true
  65. defer func() {
  66. v.isCompacting = false
  67. }()
  68. filePath := v.FileName()
  69. v.lastCompactIndexOffset = v.IndexFileSize()
  70. v.lastCompactRevision = v.SuperBlock.CompactionRevision
  71. glog.V(3).Infof("creating copies for volume %d ...", v.Id)
  72. if err := v.DataBackend.Sync(); err != nil {
  73. glog.V(0).Infof("compact2 fail to sync volume dat %d: %v", v.Id, err)
  74. }
  75. if err := v.nm.Sync(); err != nil {
  76. glog.V(0).Infof("compact2 fail to sync volume idx %d: %v", v.Id, err)
  77. }
  78. return copyDataBasedOnIndexFile(filePath+".dat", filePath+".idx", filePath+".cpd", filePath+".cpx", v.SuperBlock, v.Version(), preallocate, compactionBytePerSecond)
  79. }
  80. func (v *Volume) CommitCompact() error {
  81. if v.MemoryMapMaxSizeMb != 0 { //it makes no sense to compact in memory
  82. return nil
  83. }
  84. glog.V(0).Infof("Committing volume %d vacuuming...", v.Id)
  85. v.isCompacting = true
  86. defer func() {
  87. v.isCompacting = false
  88. }()
  89. v.dataFileAccessLock.Lock()
  90. defer v.dataFileAccessLock.Unlock()
  91. glog.V(3).Infof("Got volume %d committing lock...", v.Id)
  92. v.nm.Close()
  93. if v.DataBackend != nil {
  94. if err := v.DataBackend.Close(); err != nil {
  95. glog.V(0).Infof("fail to close volume %d", v.Id)
  96. }
  97. }
  98. v.DataBackend = nil
  99. stats.VolumeServerVolumeCounter.WithLabelValues(v.Collection, "volume").Dec()
  100. var e error
  101. if e = v.makeupDiff(v.FileName()+".cpd", v.FileName()+".cpx", v.FileName()+".dat", v.FileName()+".idx"); e != nil {
  102. glog.V(0).Infof("makeupDiff in CommitCompact volume %d failed %v", v.Id, e)
  103. e = os.Remove(v.FileName() + ".cpd")
  104. if e != nil {
  105. return e
  106. }
  107. e = os.Remove(v.FileName() + ".cpx")
  108. if e != nil {
  109. return e
  110. }
  111. } else {
  112. if runtime.GOOS == "windows" {
  113. e = os.RemoveAll(v.FileName() + ".dat")
  114. if e != nil {
  115. return e
  116. }
  117. e = os.RemoveAll(v.FileName() + ".idx")
  118. if e != nil {
  119. return e
  120. }
  121. }
  122. var e error
  123. if e = os.Rename(v.FileName()+".cpd", v.FileName()+".dat"); e != nil {
  124. return fmt.Errorf("rename %s: %v", v.FileName()+".cpd", e)
  125. }
  126. if e = os.Rename(v.FileName()+".cpx", v.FileName()+".idx"); e != nil {
  127. return fmt.Errorf("rename %s: %v", v.FileName()+".cpx", e)
  128. }
  129. }
  130. //glog.V(3).Infof("Pretending to be vacuuming...")
  131. //time.Sleep(20 * time.Second)
  132. os.RemoveAll(v.FileName() + ".ldb")
  133. glog.V(3).Infof("Loading volume %d commit file...", v.Id)
  134. if e = v.load(true, false, v.needleMapKind, 0); e != nil {
  135. return e
  136. }
  137. return nil
  138. }
  139. func (v *Volume) cleanupCompact() error {
  140. glog.V(0).Infof("Cleaning up volume %d vacuuming...", v.Id)
  141. e1 := os.Remove(v.FileName() + ".cpd")
  142. e2 := os.Remove(v.FileName() + ".cpx")
  143. if e1 != nil {
  144. return e1
  145. }
  146. if e2 != nil {
  147. return e2
  148. }
  149. return nil
  150. }
  151. func fetchCompactRevisionFromDatFile(datBackend backend.BackendStorageFile) (compactRevision uint16, err error) {
  152. superBlock, err := super_block.ReadSuperBlock(datBackend)
  153. if err != nil {
  154. return 0, err
  155. }
  156. return superBlock.CompactionRevision, nil
  157. }
  158. // if old .dat and .idx files are updated, this func tries to apply the same changes to new files accordingly
  159. func (v *Volume) makeupDiff(newDatFileName, newIdxFileName, oldDatFileName, oldIdxFileName string) (err error) {
  160. var indexSize int64
  161. oldIdxFile, err := os.Open(oldIdxFileName)
  162. defer oldIdxFile.Close()
  163. oldDatFile, err := os.Open(oldDatFileName)
  164. oldDatBackend := backend.NewDiskFile(oldDatFile)
  165. defer oldDatBackend.Close()
  166. // skip if the old .idx file has not changed
  167. if indexSize, err = verifyIndexFileIntegrity(oldIdxFile); err != nil {
  168. return fmt.Errorf("verifyIndexFileIntegrity %s failed: %v", oldIdxFileName, err)
  169. }
  170. if indexSize == 0 || uint64(indexSize) <= v.lastCompactIndexOffset {
  171. return nil
  172. }
  173. // fail if the old .dat file has changed to a new revision
  174. oldDatCompactRevision, err := fetchCompactRevisionFromDatFile(oldDatBackend)
  175. if err != nil {
  176. return fmt.Errorf("fetchCompactRevisionFromDatFile src %s failed: %v", oldDatFile.Name(), err)
  177. }
  178. if oldDatCompactRevision != v.lastCompactRevision {
  179. return fmt.Errorf("current old dat file's compact revision %d is not the expected one %d", oldDatCompactRevision, v.lastCompactRevision)
  180. }
  181. type keyField struct {
  182. offset Offset
  183. size uint32
  184. }
  185. incrementedHasUpdatedIndexEntry := make(map[NeedleId]keyField)
  186. for idxOffset := indexSize - NeedleMapEntrySize; uint64(idxOffset) >= v.lastCompactIndexOffset; idxOffset -= NeedleMapEntrySize {
  187. var IdxEntry []byte
  188. if IdxEntry, err = readIndexEntryAtOffset(oldIdxFile, idxOffset); err != nil {
  189. return fmt.Errorf("readIndexEntry %s at offset %d failed: %v", oldIdxFileName, idxOffset, err)
  190. }
  191. key, offset, size := idx2.IdxFileEntry(IdxEntry)
  192. glog.V(4).Infof("key %d offset %d size %d", key, offset, size)
  193. if _, found := incrementedHasUpdatedIndexEntry[key]; !found {
  194. incrementedHasUpdatedIndexEntry[key] = keyField{
  195. offset: offset,
  196. size: size,
  197. }
  198. }
  199. }
  200. // no updates during commit step
  201. if len(incrementedHasUpdatedIndexEntry) == 0 {
  202. return nil
  203. }
  204. // deal with updates during commit step
  205. var (
  206. dst, idx *os.File
  207. )
  208. if dst, err = os.OpenFile(newDatFileName, os.O_RDWR, 0644); err != nil {
  209. return fmt.Errorf("open dat file %s failed: %v", newDatFileName, err)
  210. }
  211. dstDatBackend := backend.NewDiskFile(dst)
  212. defer dstDatBackend.Close()
  213. if idx, err = os.OpenFile(newIdxFileName, os.O_RDWR, 0644); err != nil {
  214. return fmt.Errorf("open idx file %s failed: %v", newIdxFileName, err)
  215. }
  216. defer idx.Close()
  217. var newDatCompactRevision uint16
  218. newDatCompactRevision, err = fetchCompactRevisionFromDatFile(dstDatBackend)
  219. if err != nil {
  220. return fmt.Errorf("fetchCompactRevisionFromDatFile dst %s failed: %v", dst.Name(), err)
  221. }
  222. if oldDatCompactRevision+1 != newDatCompactRevision {
  223. return fmt.Errorf("oldDatFile %s 's compact revision is %d while newDatFile %s 's compact revision is %d", oldDatFileName, oldDatCompactRevision, newDatFileName, newDatCompactRevision)
  224. }
  225. for key, increIdxEntry := range incrementedHasUpdatedIndexEntry {
  226. idxEntryBytes := needle_map.ToBytes(key, increIdxEntry.offset, increIdxEntry.size)
  227. var offset int64
  228. if offset, err = dst.Seek(0, 2); err != nil {
  229. glog.V(0).Infof("failed to seek the end of file: %v", err)
  230. return
  231. }
  232. //ensure file writing starting from aligned positions
  233. if offset%NeedlePaddingSize != 0 {
  234. offset = offset + (NeedlePaddingSize - offset%NeedlePaddingSize)
  235. if offset, err = dst.Seek(offset, 0); err != nil {
  236. glog.V(0).Infof("failed to align in datafile %s: %v", dst.Name(), err)
  237. return
  238. }
  239. }
  240. //updated needle
  241. if !increIdxEntry.offset.IsZero() && increIdxEntry.size != 0 && increIdxEntry.size != TombstoneFileSize {
  242. //even the needle cache in memory is hit, the need_bytes is correct
  243. glog.V(4).Infof("file %d offset %d size %d", key, increIdxEntry.offset.ToAcutalOffset(), increIdxEntry.size)
  244. var needleBytes []byte
  245. needleBytes, err = needle.ReadNeedleBlob(oldDatBackend, increIdxEntry.offset.ToAcutalOffset(), increIdxEntry.size, v.Version())
  246. if err != nil {
  247. return fmt.Errorf("ReadNeedleBlob %s key %d offset %d size %d failed: %v", oldDatFile.Name(), key, increIdxEntry.offset.ToAcutalOffset(), increIdxEntry.size, err)
  248. }
  249. dst.Write(needleBytes)
  250. util.Uint32toBytes(idxEntryBytes[8:12], uint32(offset/NeedlePaddingSize))
  251. } else { //deleted needle
  252. //fakeDelNeedle 's default Data field is nil
  253. fakeDelNeedle := new(needle.Needle)
  254. fakeDelNeedle.Id = key
  255. fakeDelNeedle.Cookie = 0x12345678
  256. fakeDelNeedle.AppendAtNs = uint64(time.Now().UnixNano())
  257. _, _, _, err = fakeDelNeedle.Append(dstDatBackend, v.Version())
  258. if err != nil {
  259. return fmt.Errorf("append deleted %d failed: %v", key, err)
  260. }
  261. util.Uint32toBytes(idxEntryBytes[8:12], uint32(0))
  262. }
  263. if _, err := idx.Seek(0, 2); err != nil {
  264. return fmt.Errorf("cannot seek end of indexfile %s: %v",
  265. newIdxFileName, err)
  266. }
  267. _, err = idx.Write(idxEntryBytes)
  268. }
  269. return nil
  270. }
// VolumeFileScanner4Vacuum is the scanner passed to ScanVolumeFile by
// copyDataAndGenerateIndexFile. It copies the super block and the needles it
// decides to keep into dstBackend and records their new offsets in nm.
type VolumeFileScanner4Vacuum struct {
	// version is taken from the super block in VisitSuperBlock and used to
	// compute each needle's on-disk size.
	version needle.Version
	// v is the volume being compacted; its needle map and Ttl are consulted
	// for every visited needle.
	v *Volume
	// dstBackend receives the rewritten super block and the kept needles.
	dstBackend backend.BackendStorageFile
	// nm collects the id -> new offset/size entries for the kept needles.
	nm *needle_map.MemDb
	// newOffset is the offset in dstBackend recorded for the next kept
	// needle; it starts at the super block size.
	newOffset int64
	// now is compared against LastModified plus the TTL to drop expired needles.
	now uint64
	// writeThrottler is asked to slow down after each needle is written.
	writeThrottler *util.WriteThrottler
}
  280. func (scanner *VolumeFileScanner4Vacuum) VisitSuperBlock(superBlock super_block.SuperBlock) error {
  281. scanner.version = superBlock.Version
  282. superBlock.CompactionRevision++
  283. _, err := scanner.dstBackend.WriteAt(superBlock.Bytes(), 0)
  284. scanner.newOffset = int64(superBlock.BlockSize())
  285. return err
  286. }
// ReadNeedleBody always returns true.
// NOTE(review): presumably this tells the volume scan to read each needle's
// body as well as its header — confirm against ScanVolumeFile.
func (scanner *VolumeFileScanner4Vacuum) ReadNeedleBody() bool {
	return true
}
  290. func (scanner *VolumeFileScanner4Vacuum) VisitNeedle(n *needle.Needle, offset int64, needleHeader, needleBody []byte) error {
  291. if n.HasTtl() && scanner.now >= n.LastModified+uint64(scanner.v.Ttl.Minutes()*60) {
  292. return nil
  293. }
  294. nv, ok := scanner.v.nm.Get(n.Id)
  295. glog.V(4).Infoln("needle expected offset ", offset, "ok", ok, "nv", nv)
  296. if ok && nv.Offset.ToAcutalOffset() == offset && nv.Size > 0 && nv.Size != TombstoneFileSize {
  297. if err := scanner.nm.Set(n.Id, ToOffset(scanner.newOffset), n.Size); err != nil {
  298. return fmt.Errorf("cannot put needle: %s", err)
  299. }
  300. if _, _, _, err := n.Append(scanner.dstBackend, scanner.v.Version()); err != nil {
  301. return fmt.Errorf("cannot append needle: %s", err)
  302. }
  303. delta := n.DiskSize(scanner.version)
  304. scanner.newOffset += delta
  305. scanner.writeThrottler.MaybeSlowdown(delta)
  306. glog.V(4).Infoln("saving key", n.Id, "volume offset", offset, "=>", scanner.newOffset, "data_size", n.Size)
  307. }
  308. return nil
  309. }
  310. func (v *Volume) copyDataAndGenerateIndexFile(dstName, idxName string, preallocate int64, compactionBytePerSecond int64) (err error) {
  311. var (
  312. dst backend.BackendStorageFile
  313. )
  314. if dst, err = backend.CreateVolumeFile(dstName, preallocate, 0); err != nil {
  315. return
  316. }
  317. defer dst.Close()
  318. nm := needle_map.NewMemDb()
  319. defer nm.Close()
  320. scanner := &VolumeFileScanner4Vacuum{
  321. v: v,
  322. now: uint64(time.Now().Unix()),
  323. nm: nm,
  324. dstBackend: dst,
  325. writeThrottler: util.NewWriteThrottler(compactionBytePerSecond),
  326. }
  327. err = ScanVolumeFile(v.dir, v.Collection, v.Id, v.needleMapKind, scanner)
  328. if err != nil {
  329. return nil
  330. }
  331. err = nm.SaveToIdx(idxName)
  332. return
  333. }
  334. func copyDataBasedOnIndexFile(srcDatName, srcIdxName, dstDatName, datIdxName string, sb super_block.SuperBlock, version needle.Version, preallocate int64, compactionBytePerSecond int64) (err error) {
  335. var (
  336. srcDatBackend, dstDatBackend backend.BackendStorageFile
  337. dataFile *os.File
  338. )
  339. if dstDatBackend, err = backend.CreateVolumeFile(dstDatName, preallocate, 0); err != nil {
  340. return
  341. }
  342. defer dstDatBackend.Close()
  343. oldNm := needle_map.NewMemDb()
  344. defer oldNm.Close()
  345. newNm := needle_map.NewMemDb()
  346. defer newNm.Close()
  347. if err = oldNm.LoadFromIdx(srcIdxName); err != nil {
  348. return
  349. }
  350. if dataFile, err = os.Open(srcDatName); err != nil {
  351. return err
  352. }
  353. srcDatBackend = backend.NewDiskFile(dataFile)
  354. defer srcDatBackend.Close()
  355. now := uint64(time.Now().Unix())
  356. sb.CompactionRevision++
  357. dstDatBackend.WriteAt(sb.Bytes(), 0)
  358. newOffset := int64(sb.BlockSize())
  359. writeThrottler := util.NewWriteThrottler(compactionBytePerSecond)
  360. oldNm.AscendingVisit(func(value needle_map.NeedleValue) error {
  361. offset, size := value.Offset, value.Size
  362. if offset.IsZero() || size == TombstoneFileSize {
  363. return nil
  364. }
  365. n := new(needle.Needle)
  366. err := n.ReadData(srcDatBackend, offset.ToAcutalOffset(), size, version)
  367. if err != nil {
  368. return nil
  369. }
  370. if n.HasTtl() && now >= n.LastModified+uint64(sb.Ttl.Minutes()*60) {
  371. return nil
  372. }
  373. if err = newNm.Set(n.Id, ToOffset(newOffset), n.Size); err != nil {
  374. return fmt.Errorf("cannot put needle: %s", err)
  375. }
  376. if _, _, _, err = n.Append(dstDatBackend, sb.Version); err != nil {
  377. return fmt.Errorf("cannot append needle: %s", err)
  378. }
  379. delta := n.DiskSize(version)
  380. newOffset += delta
  381. writeThrottler.MaybeSlowdown(delta)
  382. glog.V(4).Infoln("saving key", n.Id, "volume offset", offset, "=>", newOffset, "data_size", n.Size)
  383. return nil
  384. })
  385. newNm.SaveToIdx(datIdxName)
  386. return
  387. }