Browse Source

KIKIMR-19469: Restart vdisk button

robdrynkin 1 year ago
parent
commit
a8f808af42

+ 5 - 0
.mapping.json

@@ -3449,6 +3449,11 @@
   "ydb/core/blobstorage/ut_blobstorage/ut_scrub/CMakeLists.linux-x86_64.txt":"",
   "ydb/core/blobstorage/ut_blobstorage/ut_scrub/CMakeLists.txt":"",
   "ydb/core/blobstorage/ut_blobstorage/ut_scrub/CMakeLists.windows-x86_64.txt":"",
+  "ydb/core/blobstorage/ut_blobstorage/ut_vdisk_restart/CMakeLists.darwin-x86_64.txt":"",
+  "ydb/core/blobstorage/ut_blobstorage/ut_vdisk_restart/CMakeLists.linux-aarch64.txt":"",
+  "ydb/core/blobstorage/ut_blobstorage/ut_vdisk_restart/CMakeLists.linux-x86_64.txt":"",
+  "ydb/core/blobstorage/ut_blobstorage/ut_vdisk_restart/CMakeLists.txt":"",
+  "ydb/core/blobstorage/ut_blobstorage/ut_vdisk_restart/CMakeLists.windows-x86_64.txt":"",
   "ydb/core/blobstorage/ut_group/CMakeLists.darwin-x86_64.txt":"",
   "ydb/core/blobstorage/ut_group/CMakeLists.linux-aarch64.txt":"",
   "ydb/core/blobstorage/ut_group/CMakeLists.linux-x86_64.txt":"",

+ 2 - 0
ydb/core/base/blobstorage.h

@@ -851,6 +851,7 @@ struct TEvBlobStorage {
         // node controller internal messages
         EvRegisterNodeRetry = EvPut + 14 * 512,
         EvAskRestartPDisk,
+        EvAskRestartVDisk,
         EvRestartPDisk,
         EvRestartPDiskResult,
         EvNodeWardenQueryGroupInfo,
@@ -2354,6 +2355,7 @@ struct TEvBlobStorage {
     struct TEvBunchOfEvents;
 
     struct TEvAskRestartPDisk;
+    struct TEvAskRestartVDisk;
     struct TEvRestartPDisk;
     struct TEvRestartPDiskResult;
 };

+ 13 - 0
ydb/core/blobstorage/base/blobstorage_events.h

@@ -431,6 +431,19 @@ namespace NKikimr {
         {}
     };
 
+    // Local event that requests a restart of a single VDisk. It carries the id
+    // of the PDisk the VDisk is looked up under and the id of the VDisk itself.
+    struct TEvBlobStorage::TEvAskRestartVDisk : TEventLocal<TEvAskRestartVDisk, EvAskRestartVDisk> {
+        const ui32 PDiskId;      // PDisk under which the target VDisk is searched
+        const TVDiskID VDiskId;  // VDisk that should be restarted
+
+        TEvAskRestartVDisk(const ui32 pDiskId, const TVDiskID& vDiskId)
+            : PDiskId(pDiskId)
+            , VDiskId(vDiskId)
+        {}
+    };
+
     struct TEvBlobStorage::TEvRestartPDisk : TEventLocal<TEvRestartPDisk, EvRestartPDisk> {
         const ui32 PDiskId;
         const NPDisk::TMainKey MainKey;

+ 1 - 1
ydb/core/blobstorage/nodewarden/blobstorage_node_warden_ut.cpp

@@ -807,7 +807,7 @@ Y_UNIT_TEST_SUITE(TBlobStorageWardenTest) {
         Setup(runtime, "", nullptr);
         auto edge = runtime.AllocateEdgeActor(0);
         TActorId nodeWarden = MakeBlobStorageNodeWardenID(edge.NodeId());
-        THttpRequest HttpRequest;
+        THttpRequestMock HttpRequest;
         NMonitoring::TMonService2HttpRequest monService2HttpRequest(nullptr, &HttpRequest, nullptr, nullptr, path,
                 nullptr);
         runtime.Send(new IEventHandle(nodeWarden, edge, new NMon::TEvHttpInfo(monService2HttpRequest)), 0);

+ 2 - 0
ydb/core/blobstorage/nodewarden/node_warden_impl.h

@@ -464,6 +464,7 @@ namespace NKikimr::NStorage {
 
         void Handle(TEvBlobStorage::TEvDropDonor::TPtr ev);
         void Handle(TEvBlobStorage::TEvAskRestartPDisk::TPtr ev);
+        void Handle(TEvBlobStorage::TEvAskRestartVDisk::TPtr ev);
         void Handle(TEvBlobStorage::TEvRestartPDiskResult::TPtr ev);
 
         void FillInVDiskStatus(google::protobuf::RepeatedPtrField<NKikimrBlobStorage::TVDiskStatus> *pb, bool initial);
@@ -535,6 +536,7 @@ namespace NKikimr::NStorage {
                 hFunc(TEvStatusUpdate, Handle);
                 hFunc(TEvBlobStorage::TEvDropDonor, Handle);
                 hFunc(TEvBlobStorage::TEvAskRestartPDisk, Handle);
+                hFunc(TEvBlobStorage::TEvAskRestartVDisk, Handle);
                 hFunc(TEvBlobStorage::TEvRestartPDiskResult, Handle);
 
                 hFunc(TEvGroupStatReport, Handle);

+ 18 - 1
ydb/core/blobstorage/nodewarden/node_warden_vdisk.cpp

@@ -6,6 +6,8 @@
 
 namespace NKikimr::NStorage {
 
+    constexpr TDuration PDISK_CONFIDENCE_DELAY = TDuration::Seconds(15); // PDisk confidence delay: passed to StartLocalVDiskActor when a VDisk actor is started again right after PoisonLocalVDisk
+
     void TNodeWarden::DestroyLocalVDisk(TVDiskRecord& vdisk) {
         STLOG(PRI_INFO, BS_NODE, NW35, "DestroyLocalVDisk", (VDiskId, vdisk.GetVDiskId()), (VSlotId, vdisk.GetVSlotId()));
         Y_VERIFY(!vdisk.RuntimeData);
@@ -267,7 +269,7 @@ namespace NKikimr::NStorage {
             StartLocalVDiskActor(record, TDuration::Zero());
         } else if (record.RuntimeData->DonorMode < record.Config.HasDonorMode() || record.RuntimeData->ReadOnly != record.Config.GetReadOnly()) {
             PoisonLocalVDisk(record);
-            StartLocalVDiskActor(record, TDuration::Seconds(15) /* PDisk confidence delay */);
+            StartLocalVDiskActor(record, PDISK_CONFIDENCE_DELAY);
         }
     }
 
@@ -285,6 +287,21 @@ namespace NKikimr::NStorage {
         }
     }
 
+    // Restarts the local VDisk named by the request: the VDisk records of the
+    // requested PDisk on this node are scanned, and the first one whose VDisk id
+    // matches is poisoned and started again after PDISK_CONFIDENCE_DELAY.
+    // A request that matches no local VDisk is ignored.
+    void TNodeWarden::Handle(TEvBlobStorage::TEvAskRestartVDisk::TPtr ev) {
+        const auto *msg = ev->Get();
+        const auto selfNodeId = SelfId().NodeId();  // Skeleton and NodeWarden are on the same node
+        const TVSlotId firstSlot(selfNodeId, msg->PDiskId, 0);
+
+        // Walk the ordered map from the first possible slot of this PDisk and
+        // stop as soon as the keys leave the (node, pdisk) range.
+        auto it = LocalVDisks.lower_bound(firstSlot);
+        while (it != LocalVDisks.end() && it->first.NodeId == selfNodeId && it->first.PDiskId == msg->PDiskId) {
+            auto& candidate = it->second;
+            if (candidate.GetVDiskId() == msg->VDiskId) {
+                PoisonLocalVDisk(candidate);
+                StartLocalVDiskActor(candidate, PDISK_CONFIDENCE_DELAY);
+                return;
+            }
+            ++it;
+        }
+    }
+
     void TNodeWarden::Handle(TEvBlobStorage::TEvDropDonor::TPtr ev) {
         auto *msg = ev->Get();
         const TVSlotId vslotId(msg->NodeId, msg->PDiskId, msg->VSlotId);

+ 3 - 3
ydb/core/blobstorage/pdisk/blobstorage_pdisk_ut_actions.h

@@ -1301,7 +1301,7 @@ public:
 };
 
 class TTestHttpInfo : public TBaseTest {
-    THttpRequest HttpRequest;
+    THttpRequestMock HttpRequest;
     NMonitoring::TMonService2HttpRequest MonService2HttpRequest;
 
     void TestFSM(const TActorContext &ctx);
@@ -1313,7 +1313,7 @@ public:
 };
 
 class TTestHttpInfoFileDoesntExist : public TBaseTest {
-    THttpRequest HttpRequest;
+    THttpRequestMock HttpRequest;
     NMonitoring::TMonService2HttpRequest MonService2HttpRequest;
 
     void TestFSM(const TActorContext &ctx);
@@ -1326,7 +1326,7 @@ public:
 
 class TTestBootingState : public TBaseTest {
     const ui32 HttpRequestsCount = 1000;
-    THttpRequest HttpRequest;
+    THttpRequestMock HttpRequest;
     NMonitoring::TMonService2HttpRequest MonService2HttpRequest;
     bool EvYardAnswered = false;
     ui32 AnsweredRequests = 0;

+ 4 - 3
ydb/core/blobstorage/pdisk/blobstorage_pdisk_ut_http_request.h

@@ -3,18 +3,19 @@
 
 namespace NKikimr {
 
-struct THttpRequest : NMonitoring::IHttpRequest {
+struct THttpRequestMock : NMonitoring::IHttpRequest {
     TCgiParameters CgiParameters;
     THttpHeaders HttpHeaders;
+    TString Path;
 
-    ~THttpRequest() {}
+    ~THttpRequestMock() {}
 
     const char* GetURI() const override {
         return "";
     }
 
     const char* GetPath() const override {
-        return "";
+        return Path.c_str();
     }
 
     const TCgiParameters& GetParams() const override {

+ 124 - 2
ydb/core/blobstorage/pdisk/mock/pdisk_mock.cpp

@@ -45,6 +45,7 @@ struct TPDiskMockState::TImpl {
     TIntervalSet<ui64> Corrupted;
     NPDisk::TStatusFlags StatusFlags;
     THashSet<TVDiskID> ReadOnlyVDisks;
+    TString StateErrorReason;
 
     TImpl(ui32 nodeId, ui32 pdiskId, ui64 pdiskGuid, ui64 size, ui32 chunkSize)
         : NodeId(nodeId)
@@ -221,6 +222,15 @@ struct TPDiskMockState::TImpl {
         return ChunkSize;
     }
 
+    // Returns a copy of the first registered owner whose VDisk id has the same
+    // GroupID and VDisk fields as vDiskId, or Nothing() when no owner matches.
+    // NOTE(review): only GroupID and VDisk are compared here - confirm that
+    // these two fields are sufficient to identify the VDisk.
+    TMaybe<TOwner> GetOwner(const TVDiskID& vDiskId) const {
+        for (const auto& [id, candidate] : Owners) {
+            const auto& candidateId = candidate.VDiskId;
+            const bool sameDisk = candidateId.GroupID == vDiskId.GroupID && candidateId.VDisk == vDiskId.VDisk;
+            if (!sameDisk) {
+                continue;
+            }
+            return candidate;
+        }
+        return Nothing();
+    }
+
     TIntervalSet<i64> GetWrittenAreas(ui32 chunkIdx) const {
         TIntervalSet<i64> res;
         for (auto& [ownerId, owner] : Owners) {
@@ -308,12 +318,23 @@ void TPDiskMockState::SetReadOnly(const TVDiskID& vDiskId, bool isReadOnly) {
     Impl->SetReadOnly(vDiskId, isReadOnly);
 }
 
+TString& TPDiskMockState::GetStateErrorReason() { // accessor for the error-reason string kept in the implementation object
+    return Impl->StateErrorReason; // returned by non-const reference, so callers can read and modify it
+}
+
 TPDiskMockState::TPtr TPDiskMockState::Snapshot() { // returns a new state object built from a copy of this one's TImpl
     auto res = MakeIntrusive<TPDiskMockState>(std::make_unique<TImpl>(*Impl)); // copy-construct the implementation
     res->Impl->AdjustRefs(); // invoked on the copy only; NOTE(review): presumably repairs internal references after copying - confirm in TImpl
     return res;
 }
 
+// Looks up the owner registered for vDiskId and returns its OwnerRound;
+// yields Nothing() when the implementation knows no such owner.
+TMaybe<NPDisk::TOwnerRound> TPDiskMockState::GetOwnerRound(const TVDiskID& vDiskId) const {
+    const auto found = Impl->GetOwner(vDiskId);
+    if (!found) {
+        return Nothing();
+    }
+    return found->OwnerRound;
+}
+
 class TPDiskMockActor : public TActorBootstrapped<TPDiskMockActor> {
     enum {
         EvResume = EventSpaceBegin(TEvents::ES_PRIVATE),
@@ -339,7 +360,7 @@ public:
     }
 
     void Bootstrap() {
-        Become(&TThis::StateFunc);
+        Become(&TThis::StateNormal);
         ReportMetrics();
     }
 
@@ -814,7 +835,89 @@ public:
         return Impl.StatusFlags;
     }
 
-    STRICT_STFUNC(StateFunc,
+    void ErrorHandle(NPDisk::TEvYardInit::TPtr &ev) {
+        Send(ev->Sender, new NPDisk::TEvYardInitResult(NKikimrProto::CORRUPTED, State->GetStateErrorReason()));
+    }
+
+    void ErrorHandle(NPDisk::TEvCheckSpace::TPtr &ev) {
+        Send(ev->Sender, new NPDisk::TEvCheckSpaceResult(NKikimrProto::CORRUPTED, 0, 0, 0, 0, 0, State->GetStateErrorReason()));
+    }
+
+    void ErrorHandle(NPDisk::TEvLog::TPtr &ev) {
+        const NPDisk::TEvLog &evLog = *ev->Get();
+        THolder<NPDisk::TEvLogResult> result(new NPDisk::TEvLogResult(NKikimrProto::CORRUPTED, 0, State->GetStateErrorReason()));
+        result->Results.push_back(NPDisk::TEvLogResult::TRecord(evLog.Lsn, evLog.Cookie));
+        Send(ev->Sender, result.Release());
+    }
+
+    void ErrorHandle(NPDisk::TEvMultiLog::TPtr &ev) {
+        const NPDisk::TEvMultiLog &evMultiLog = *ev->Get();
+        THolder<NPDisk::TEvLogResult> result(new NPDisk::TEvLogResult(NKikimrProto::CORRUPTED, 0, State->GetStateErrorReason()));
+        for (auto &log : evMultiLog.Logs) {
+            result->Results.push_back(NPDisk::TEvLogResult::TRecord(log->Lsn, log->Cookie));
+        }
+        Send(ev->Sender, result.Release());
+    }
+
+    void ErrorHandle(NPDisk::TEvReadLog::TPtr &ev) {
+        const NPDisk::TEvReadLog &evReadLog = *ev->Get();
+        THolder<NPDisk::TEvReadLogResult> result(new NPDisk::TEvReadLogResult(
+            NKikimrProto::CORRUPTED, evReadLog.Position, evReadLog.Position, true, 0, State->GetStateErrorReason(), evReadLog.Owner));
+        Send(ev->Sender, result.Release());
+    }
+
+    void ErrorHandle(NPDisk::TEvChunkWrite::TPtr &ev) {
+        const NPDisk::TEvChunkWrite &evChunkWrite = *ev->Get();
+        Send(ev->Sender, new NPDisk::TEvChunkWriteResult(NKikimrProto::CORRUPTED,
+            evChunkWrite.ChunkIdx, evChunkWrite.Cookie, 0, State->GetStateErrorReason()));
+    }
+
+    void ErrorHandle(NPDisk::TEvChunkRead::TPtr &ev) {
+        const NPDisk::TEvChunkRead &evChunkRead = *ev->Get();
+        THolder<NPDisk::TEvChunkReadResult> result = MakeHolder<NPDisk::TEvChunkReadResult>(NKikimrProto::CORRUPTED,
+            evChunkRead.ChunkIdx, evChunkRead.Offset, evChunkRead.Cookie, 0, "PDisk is in error state");
+        Send(ev->Sender, result.Release());
+    }
+
+    void ErrorHandle(NPDisk::TEvHarakiri::TPtr &ev) {
+        Send(ev->Sender, new NPDisk::TEvHarakiriResult(NKikimrProto::CORRUPTED, 0, State->GetStateErrorReason()));
+    }
+
+    void ErrorHandle(NPDisk::TEvSlay::TPtr &ev) {
+        const NPDisk::TEvSlay &evSlay = *ev->Get();
+        Send(ev->Sender, new NPDisk::TEvSlayResult(NKikimrProto::CORRUPTED, 0,
+                    evSlay.VDiskId, evSlay.SlayOwnerRound, evSlay.PDiskId, evSlay.VSlotId, State->GetStateErrorReason()));
+    }
+
+    void ErrorHandle(NPDisk::TEvChunkReserve::TPtr &ev) {
+        Send(ev->Sender, new NPDisk::TEvChunkReserveResult(NKikimrProto::CORRUPTED, 0, State->GetStateErrorReason()));
+    }
+
+    void ErrorHandle(NPDisk::TEvChunkForget::TPtr &ev) {
+        Send(ev->Sender, new NPDisk::TEvChunkForgetResult(NKikimrProto::CORRUPTED, 0, State->GetStateErrorReason()));
+    }
+
+    void ErrorHandle(NPDisk::TEvYardControl::TPtr &ev) {
+        const NPDisk::TEvYardControl &evControl = *ev->Get();
+        Send(ev->Sender, new NPDisk::TEvYardControlResult(NKikimrProto::CORRUPTED, evControl.Cookie, State->GetStateErrorReason()));
+    }
+
+    void ErrorHandle(NPDisk::TEvAskForCutLog::TPtr &ev) { // error-state overload for TEvAskForCutLog: sends no reply
+        // Just ignore the event, can't send cut log in this state.
+        Y_UNUSED(ev); // the event is deliberately left untouched
+    }
+
+    void HandleMoveToErrorState() { // registered for EvBecomeError in StateNormal
+        Impl.StateErrorReason = "Some error reason"; // fixed placeholder text stored as the error reason
+        Become(&TThis::StateError); // from now on events are dispatched by StateError
+    }
+
+    // Handler for EvBecomeNormal (registered in StateError): empties the stored
+    // error reason and switches event dispatch back to StateNormal.
+    void HandleMoveToNormalState() {
+        Impl.StateErrorReason.clear();
+        Become(&TThis::StateNormal);
+    }
+
+    STRICT_STFUNC(StateNormal,
         hFunc(NPDisk::TEvYardInit, Handle);
         hFunc(NPDisk::TEvLog, Handle);
         hFunc(NPDisk::TEvChunkForget, Handle);
@@ -829,6 +932,25 @@ public:
         hFunc(NPDisk::TEvHarakiri, Handle);
         hFunc(NPDisk::TEvConfigureScheduler, Handle);
         cFunc(TEvents::TSystem::Wakeup, ReportMetrics);
+
+        cFunc(EvBecomeError, HandleMoveToErrorState);
+    )
+
+    STRICT_STFUNC(StateError,
+        hFunc(NPDisk::TEvYardInit, ErrorHandle);
+        hFunc(NPDisk::TEvCheckSpace, ErrorHandle);
+        hFunc(NPDisk::TEvLog, ErrorHandle);
+        hFunc(NPDisk::TEvMultiLog, ErrorHandle);
+        hFunc(NPDisk::TEvReadLog, ErrorHandle);
+        hFunc(NPDisk::TEvChunkWrite, ErrorHandle);
+        hFunc(NPDisk::TEvChunkRead, ErrorHandle);
+        hFunc(NPDisk::TEvHarakiri, ErrorHandle);
+        hFunc(NPDisk::TEvSlay, ErrorHandle);
+        hFunc(NPDisk::TEvChunkReserve, ErrorHandle);
+        hFunc(NPDisk::TEvChunkForget, ErrorHandle);
+
+        cFunc(TEvents::TSystem::Wakeup, ReportMetrics);
+        cFunc(EvBecomeNormal, HandleMoveToNormalState);
     )
 };
 

+ 7 - 0
ydb/core/blobstorage/pdisk/mock/pdisk_mock.h

@@ -7,6 +7,11 @@
 
 namespace NKikimr {
 
+    enum EPDiskMockEvents { // control events for the PDisk mock actor; ids are numbered right after TEvBlobStorage::EvEnd
+        EvBecomeError = TEvBlobStorage::EvEnd + 1, // handled in the mock's normal state: switches it to the error state
+        EvBecomeNormal // handled in the mock's error state: switches it back to the normal state
+    };
+
     class TPDiskMockState : public TThrRefBase {
         struct TImpl;
         std::unique_ptr<TImpl> Impl;
@@ -22,11 +27,13 @@ namespace NKikimr {
 
         void SetCorruptedArea(ui32 chunkIdx, ui32 begin, ui32 end, bool enabled);
         std::set<ui32> GetChunks();
+        TMaybe<NPDisk::TOwnerRound> GetOwnerRound(const TVDiskID& vDiskId) const;
         ui32 GetChunkSize() const;
         TIntervalSet<i64> GetWrittenAreas(ui32 chunkIdx) const;
         void TrimQuery();
         void SetStatusFlags(NKikimrBlobStorage::TPDiskSpaceColor::E spaceColor);
         void SetStatusFlags(NPDisk::TStatusFlags flags);
+        TString& GetStateErrorReason();
 
         TPtr Snapshot(); // create a copy of PDisk whole state
 

Some files were not shown because too many files changed in this diff