remove_duplicate_fids.go 2.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697
  1. package main
  2. import (
  3. "flag"
  4. "fmt"
  5. "os"
  6. "path/filepath"
  7. "github.com/seaweedfs/seaweedfs/weed/glog"
  8. "github.com/seaweedfs/seaweedfs/weed/storage"
  9. "github.com/seaweedfs/seaweedfs/weed/storage/backend"
  10. "github.com/seaweedfs/seaweedfs/weed/storage/needle"
  11. "github.com/seaweedfs/seaweedfs/weed/storage/super_block"
  12. util_http "github.com/seaweedfs/seaweedfs/weed/util/http"
  13. )
  14. var (
  15. volumePath = flag.String("dir", "/tmp", "data directory to store files")
  16. volumeCollection = flag.String("collection", "", "the volume collection name")
  17. volumeId = flag.Int("volumeId", -1, "a volume id. The volume should already exist in the dir. The volume index file should not exist.")
  18. )
  19. func Checksum(n *needle.Needle) string {
  20. return fmt.Sprintf("%s%x", n.Id, n.Cookie)
  21. }
  22. type VolumeFileScanner4SeeDat struct {
  23. version needle.Version
  24. block super_block.SuperBlock
  25. dir string
  26. hashes map[string]bool
  27. dat *os.File
  28. datBackend backend.BackendStorageFile
  29. }
  30. func (scanner *VolumeFileScanner4SeeDat) VisitSuperBlock(superBlock super_block.SuperBlock) error {
  31. scanner.version = superBlock.Version
  32. scanner.block = superBlock
  33. return nil
  34. }
  35. func (scanner *VolumeFileScanner4SeeDat) ReadNeedleBody() bool {
  36. return true
  37. }
  38. func (scanner *VolumeFileScanner4SeeDat) VisitNeedle(n *needle.Needle, offset int64, needleHeader, needleBody []byte) error {
  39. if scanner.datBackend == nil {
  40. newFileName := filepath.Join(*volumePath, "dat_fixed")
  41. newDatFile, err := os.Create(newFileName)
  42. if err != nil {
  43. glog.Fatalf("Write New Volume Data %v", err)
  44. }
  45. scanner.datBackend = backend.NewDiskFile(newDatFile)
  46. scanner.datBackend.WriteAt(scanner.block.Bytes(), 0)
  47. }
  48. checksum := Checksum(n)
  49. if scanner.hashes[checksum] {
  50. glog.V(0).Infof("duplicate checksum:%s fid:%d,%s%x @ offset:%d", checksum, *volumeId, n.Id, n.Cookie, offset)
  51. return nil
  52. }
  53. scanner.hashes[checksum] = true
  54. _, s, _, e := n.Append(scanner.datBackend, scanner.version)
  55. fmt.Printf("size %d error %v\n", s, e)
  56. return nil
  57. }
  58. func main() {
  59. flag.Parse()
  60. util_http.InitGlobalHttpClient()
  61. vid := needle.VolumeId(*volumeId)
  62. outpath, _ := filepath.Abs(filepath.Dir(os.Args[0]))
  63. scanner := &VolumeFileScanner4SeeDat{
  64. dir: filepath.Join(outpath, "out"),
  65. hashes: map[string]bool{},
  66. }
  67. if _, err := os.Stat(scanner.dir); err != nil {
  68. if err := os.MkdirAll(scanner.dir, os.ModePerm); err != nil {
  69. glog.Fatalf("could not create output dir : %s", err)
  70. }
  71. }
  72. err := storage.ScanVolumeFile(*volumePath, *volumeCollection, vid, storage.NeedleMapInMemory, scanner)
  73. if err != nil {
  74. glog.Fatalf("Reading Volume File [ERROR] %s\n", err)
  75. }
  76. }