#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace { class TStringArrayOutput: public IOutputStream { public: TStringArrayOutput(IOutputStream* slave, size_t stride) : Slave(*slave) , Stride(stride) { Buf.reserve(stride); } void DoFinish() override { WriteBuf(); Flush(); } void DoWrite(const void* data, size_t len) override { for (const char* p = (const char*)data; len > 0; ++p, --len) { Buf.append(*p); if (Buf.size() == Stride) WriteBuf(); } } private: void WriteBuf() { Slave << '"' << Buf << "\",\n"sv; Buf.clear(); } private: IOutputStream& Slave; const size_t Stride; TString Buf; }; class THexOutput: public IOutputStream { public: inline THexOutput(IOutputStream* slave) : Slave_(slave) { } ~THexOutput() override { } inline IOutputStream* Slave() const noexcept { return Slave_; } private: void DoFinish() override { Slave_->Write('\n'); Slave_->Flush(); } void DoWrite(const void* data, size_t len) override { const char* b = (const char*)data; while (len) { const unsigned char c = *b; char buf[12]; char* tmp = buf; if (Count_ % Columns == 0) { *tmp++ = ' '; *tmp++ = ' '; *tmp++ = ' '; *tmp++ = ' '; } if (Count_ && Count_ % Columns != 0) { *tmp++ = ','; *tmp++ = ' '; } *tmp++ = '0'; *tmp++ = 'x'; tmp = HexEncode(&c, 1, tmp); if ((Count_ % Columns) == (Columns - 1)) { *tmp++ = ','; *tmp++ = '\n'; } Slave_->Write(buf, tmp - buf); --len; ++b; ++Count_; } } private: // width in source chars static const size_t Columns = 10; ui64 Count_ = 0; IOutputStream* Slave_ = nullptr; }; struct TYasmOutput: public IOutputStream { inline TYasmOutput(IOutputStream* out, const TString& base) : Out_(out) , Base_(base) { *Out_ << "global " << Base_ << "\n"; *Out_ << "global " << Base_ << "Size\n\nSECTION .rodata\n\n"; *Out_ << Base_ << ":\n"; } ~TYasmOutput() override { } void DoFinish() override { *Out_ << Base_ << "Size:\ndd " << Count_ << '\n'; *Out_ << "%ifidn __OUTPUT_FORMAT__,elf64\n"; *Out_ << "size " << Base_ << " " << Count_ << "\n"; *Out_ << "size " << Base_ << "Size 4\n"; *Out_ << "%endif\n"; } void DoWrite(const void* data, size_t len) override { Count_ += len; const unsigned char* p = (const unsigned char*)data; while (len) { const size_t step = Min(len, 100); *Out_ << "db " << (int)*p++; for (size_t i = 1; i < step; ++i) { *Out_ << ',' << (int)*p++; } *Out_ << '\n'; len -= step; } } IOutputStream* Out_ = nullptr; const TString Base_; ui64 Count_ = 0; }; struct TCOutput: public THexOutput { inline TCOutput(IOutputStream* out, const TString& base) : THexOutput(out) , B(base) { *Slave() << "static_assert(sizeof(unsigned int) == 4, \"ups, unsupported platform\");\n\nextern \"C\" {\nextern const unsigned char " << B << "[] = {\n"; } ~TCOutput() override { } void DoFinish() override { *Slave() << "\n};\nextern const unsigned int " << B << "Size = sizeof(" << B << ") / sizeof(" << B << "[0]);\n}\n"; } const TString B; }; struct TCStringOutput: public IOutputStream { inline TCStringOutput(IOutputStream* out, const TString& base) : O(out) , B(base) { *O << "static_assert(sizeof(unsigned int) == 4, \"ups, unsupported platform\");\n\nextern \"C\" {\nextern const unsigned char " << B << "[] = \n"; } ~TCStringOutput() override { } void DoWrite(const void* data, size_t len) override { *O << TString((const char*)data, len).Quote() << '\n'; } void DoFinish() override { //*O << ";\nextern const unsigned char* " << B << " = (const unsigned char*)" << B << "Array;\n"; *O << ";\nextern const unsigned int " << B << "Size = sizeof(" << B << ") / sizeof(" << B << "[0]) - 1;\n}\n"; } IOutputStream* O = nullptr; const TString B; }; struct TMyFileComparator { bool operator()(const TString& fname1, const TString& fname2) const { if (fname1 == fname2) { return false; } if (const auto* savedResultPtr = SavedResults.FindPtr(std::make_pair(fname1, fname2))) { return *savedResultPtr < 0; } TMemoryMap mmap1(fname1, TMemoryMap::oRdOnly); TMemoryMap mmap2(fname2, TMemoryMap::oRdOnly); mmap1.SetSequential(); mmap2.SetSequential(); Y_ASSERT(mmap1.Length() == mmap2.Length()); TMemoryMap::TMapResult mapResult1 = mmap1.Map(0, mmap1.Length()); TMemoryMap::TMapResult mapResult2 = mmap2.Map(0, mmap2.Length()); Y_ASSERT(mapResult1.MappedSize() == mapResult2.MappedSize()); int res = memcmp(mapResult1.MappedData(), mapResult2.MappedData(), mapResult1.MappedSize()); mmap1.Unmap(mapResult1); mmap2.Unmap(mapResult2); SavedResults[std::make_pair(fname1, fname2)] = res; SavedResults[std::make_pair(fname2, fname1)] = -res; return res < 0; } mutable THashMap, int> SavedResults; }; struct TDuplicatesMap { void Add(const TString& fname, const TString& rname) { Y_ENSURE(!InitialFillingDone); FileNames.push_back(fname); FileNameToRecordName[fname] = rname; } void Finish() { Y_ENSURE(!InitialFillingDone); InitialFillingDone = true; TMap> bySize; for (const TString& fname: FileNames) { TFile file(fname, OpenExisting | RdOnly); bySize[file.GetLength()].push_back(fname); } for (const auto& bySizeElement: bySize) { if (bySizeElement.second.size() > 1) { TMap, TMyFileComparator> byContents; for (const TString& fname: bySizeElement.second) { byContents[fname].push_back(fname); } for (const auto& byContentsElement: byContents) { if (byContentsElement.second.size() > 1) { const TString& rootName = byContentsElement.second.front(); const TString& rootRecordName = FileNameToRecordName[rootName]; for (const TString& fname: byContentsElement.second) { if (fname != rootName) { Synonyms[FileNameToRecordName[fname]] = rootRecordName; } } } } } } FileNames.clear(); FileNameToRecordName.clear(); } bool InitialFillingDone = false; TVector FileNames; THashMap FileNameToRecordName; THashMap Synonyms; }; struct TDeduplicationArchiveWriter { TDeduplicationArchiveWriter(const TDuplicatesMap& duplicatesMap, IOutputStream* out, bool compress) : DuplicatesMap(duplicatesMap) , Writer(out, compress) {} void Finish() { Writer.Finish(); } const TDuplicatesMap& DuplicatesMap; TArchiveWriter Writer; }; } static inline TAutoPtr OpenOutput(const TString& url) { if (url.empty()) { return new TBuffered(8192, Duplicate(1)); } else { return new TBuffered(8192, url); } } static inline bool IsDelim(char ch) noexcept { return ch == '/' || ch == '\\'; } static inline TString GetFile(const TString& s) { const char* e = s.end(); const char* b = s.begin(); const char* c = e - 1; while (c != b && !IsDelim(*c)) { --c; } if (c != e && IsDelim(*c)) { ++c; } return TString(c, e - c); } static inline TString Fix(TString f) { if (!f.empty() && IsDelim(f[f.size() - 1])) { f.pop_back(); } return f; } static bool Quiet = false; static inline void Append(IOutputStream& w, const TString& fname, const TString& rname) { TMappedFileInput in(fname); if (!Quiet) { Cerr << "--> " << rname << Endl; } TransferData((IInputStream*)&in, &w); } static inline void Append(TDuplicatesMap& w, const TString& fname, const TString& rname) { w.Add(fname, rname); } static inline void Append(TDeduplicationArchiveWriter& w, const TString& fname, const TString& rname) { if (!Quiet) { Cerr << "--> " << rname << Endl; } if (const TString* rootRecordName = w.DuplicatesMap.Synonyms.FindPtr(rname)) { w.Writer.AddSynonym(*rootRecordName, rname); } else { TMappedFileInput in(fname); w.Writer.Add(rname, &in); } } namespace { struct TRec { bool Recursive = false; TString Key; TString Path; TString Prefix; TRec() = default; inline void Fix() { ::Fix(Path); ::Fix(Prefix); } template inline void Recurse(T& w) const { if (IsDir(Path)) { DoRecurse(w, "/"); } else { Append(w, Path, Key.size() ? Key : Prefix + "/" + GetFile(Path)); } } template inline void DoRecurse(T& w, const TString& off) const { { TFileList fl; const char* name; const TString p = Path + off; fl.Fill(p, true); while ((name = fl.Next())) { const TString fname = p + name; const TString rname = Prefix + off + name; Append(w, fname, rname); } } if (Recursive) { TDirsList dl; const char* name; const TString p = Path + off; dl.Fill(p, true); while ((name = dl.Next())) { if (strcmp(name, ".") && strcmp(name, "..")) { DoRecurse(w, off + name + "/"); } } } } }; } static TString CutFirstSlash(const TString& fileName) { if (fileName[0] == '/') { return fileName.substr(1); } else { return fileName; } } struct TMappingReader { TMemoryMap Map; TBlob Blob; TArchiveReader Reader; TMappingReader(const TString& archive) : Map(archive) , Blob(TBlob::FromMemoryMapSingleThreaded(Map, 0, Map.Length())) , Reader(Blob) { } }; static void UnpackArchive(const TString& archive, const TFsPath& dir = TFsPath()) { TMappingReader mappingReader(archive); const TArchiveReader& reader = mappingReader.Reader; const size_t count = reader.Count(); for (size_t i = 0; i < count; ++i) { const TString key = reader.KeyByIndex(i); const TString fileName = CutFirstSlash(key); if (!Quiet) { Cerr << archive << " --> " << fileName << Endl; } const TFsPath path(dir / fileName); path.Parent().MkDirs(); TAutoPtr in = reader.ObjectByKey(key); TFixedBufferFileOutput out(path); TransferData(in.Get(), &out); out.Finish(); } } static void ListArchive(const TString& archive, bool cutSlash) { TMappingReader mappingReader(archive); const TArchiveReader& reader = mappingReader.Reader; const size_t count = reader.Count(); for (size_t i = 0; i < count; ++i) { const TString key = reader.KeyByIndex(i); TString fileName = key; if (cutSlash) { fileName = CutFirstSlash(key); } Cout << fileName << Endl; } } static void ListArchiveMd5(const TString& archive, bool cutSlash) { TMappingReader mappingReader(archive); const TArchiveReader& reader = mappingReader.Reader; const size_t count = reader.Count(); for (size_t i = 0; i < count; ++i) { const TString key = reader.KeyByIndex(i); TString fileName = key; if (cutSlash) { fileName = CutFirstSlash(key); } char md5buf[33]; Cout << fileName << '\t' << MD5::Stream(reader.ObjectByKey(key).Get(), md5buf) << Endl; } } int main(int argc, char** argv) { NLastGetopt::TOpts opts; opts.AddHelpOption('?'); opts.SetTitle( "Archiver\n" "Docs: https://wiki.yandex-team.ru/Development/Poisk/arcadia/tools/archiver" ); bool hexdump = false; opts.AddLongOption('x', "hexdump", "Produce hexdump") .NoArgument() .Optional() .StoreValue(&hexdump, true); size_t stride = 0; opts.AddLongOption('s', "segments", "Produce segmented C strings array of given size") .RequiredArgument("") .Optional() .DefaultValue("0") .StoreResult(&stride); bool cat = false; opts.AddLongOption('c', "cat", "Do not store keys (file names), just cat uncompressed files") .NoArgument() .Optional() .StoreValue(&cat, true); bool doNotZip = false; opts.AddLongOption('p', "plain", "Do not use compression") .NoArgument() .Optional() .StoreValue(&doNotZip, true); bool deduplicate = false; opts.AddLongOption("deduplicate", "Turn on file-wise deduplication") .NoArgument() .Optional() .StoreValue(&deduplicate, true); bool unpack = false; opts.AddLongOption('u', "unpack", "Unpack archive into current directory") .NoArgument() .Optional() .StoreValue(&unpack, true); bool list = false; opts.AddLongOption('l', "list", "List files in archive") .NoArgument() .Optional() .StoreValue(&list, true); bool cutSlash = true; opts.AddLongOption("as-is", "somewhy slash is cutted by default in list; with this option key will be shown as-is") .NoArgument() .Optional() .StoreValue(&cutSlash, false); bool listMd5 = false; opts.AddLongOption('m', "md5", "List files in archive with MD5 sums") .NoArgument() .Optional() .StoreValue(&listMd5, true); bool recursive = false; opts.AddLongOption('r', "recursive", "Read all files under each directory, recursively") .NoArgument() .Optional() .StoreValue(&recursive, true); Quiet = false; opts.AddLongOption('q', "quiet", "Do not output progress to stderr") .NoArgument() .Optional() .StoreValue(&Quiet, true); TString prepend; opts.AddLongOption('z', "prepend", "Prepend string to output") .RequiredArgument("") .StoreResult(&prepend); TString append; opts.AddLongOption('a', "append", "Append string to output") .RequiredArgument("") .StoreResult(&append); TString outputf; opts.AddLongOption('o', "output", "Output to file instead stdout") .RequiredArgument("") .StoreResult(&outputf); TString unpackDir; opts.AddLongOption('d', "unpackdir", "Unpack destination directory") .RequiredArgument("") .DefaultValue(".") .StoreResult(&unpackDir); TString yasmBase; opts.AddLongOption('A', "yasm", "Output dump is yasm format") .RequiredArgument("") .StoreResult(&yasmBase); TString cppBase; opts.AddLongOption('C', "cpp", "Output dump is C/C++ format") .RequiredArgument("") .StoreResult(&cppBase); TString forceKeys; opts.AddLongOption('k', "keys", "Set explicit list of keys for elements") .RequiredArgument("") .StoreResult(&forceKeys); opts.SetFreeArgDefaultTitle(""); opts.SetFreeArgsMin(1); NLastGetopt::TOptsParseResult optsRes(&opts, argc, argv); SubstGlobal(append, "\\n", "\n"); SubstGlobal(prepend, "\\n", "\n"); TVector recs; const auto& files = optsRes.GetFreeArgs(); TVector keys; if (forceKeys.size()) StringSplitter(forceKeys).Split(':').SkipEmpty().Collect(&keys); if (keys.size() && keys.size() != files.size()) { Cerr << "Invalid number of keys=" << keys.size() << " (!= number of files=" << files.size() << ")" << Endl; return 1; } for (size_t i = 0; i < files.size(); ++i) { const auto& path = files[i]; size_t off = 0; #ifdef _win_ if (path[0] > 0 && isalpha(path[0]) && path[1] == ':') off = 2; // skip drive letter ("d:") #endif // _win_ const size_t pos = path.find(':', off); TRec cur; cur.Path = path.substr(0, pos); if (pos != TString::npos) cur.Prefix = path.substr(pos + 1); if (keys.size()) cur.Key = keys[i]; cur.Recursive = recursive; cur.Fix(); recs.push_back(cur); } try { if (listMd5) { for (const auto& rec: recs) { ListArchiveMd5(rec.Path, cutSlash); } } else if (list) { for (const auto& rec: recs) { ListArchive(rec.Path, cutSlash); } } else if (unpack) { const TFsPath dir(unpackDir); for (const auto& rec: recs) { UnpackArchive(rec.Path, dir); } } else { TAutoPtr outf(OpenOutput(outputf)); IOutputStream* out = outf.Get(); THolder hexout; if (hexdump) { hexout.Reset(new THexOutput(out)); out = hexout.Get(); } else if (stride) { hexout.Reset(new TStringArrayOutput(out, stride)); out = hexout.Get(); } else if (yasmBase) { hexout.Reset(new TYasmOutput(out, yasmBase)); out = hexout.Get(); } else if (cppBase) { hexout.Reset(new TCStringOutput(out, cppBase)); out = hexout.Get(); } outf->Write(prepend.data(), prepend.size()); if (cat) { for (const auto& rec: recs) { rec.Recurse(*out); } } else { TDuplicatesMap duplicatesMap; if (deduplicate) { for (const auto& rec: recs) { rec.Recurse(duplicatesMap); } } duplicatesMap.Finish(); TDeduplicationArchiveWriter w(duplicatesMap, out, !doNotZip); for (const auto& rec: recs) { rec.Recurse(w); } w.Finish(); } try { out->Finish(); } catch (...) { } outf->Write(append.data(), append.size()); } } catch (...) { Cerr << CurrentExceptionMessage() << Endl; return 1; } return 0; }