Browse Source

KIKIMR-19304 add sorting before truncation

andrew-rykov 1 year ago
parent
commit
a11ce9b2b8

+ 39 - 90
ydb/core/health_check/health_check.cpp

@@ -515,7 +515,6 @@ public:
     TTabletRequestsState TabletRequests;
 
     TDuration Timeout = TDuration::MilliSeconds(10000);
-    ui32 ChildrenRecordsLimit = 0;
     static constexpr TStringBuf STATIC_STORAGE_POOL_NAME = "static";
 
     bool IsSpecificDatabaseFilter() {
@@ -527,7 +526,6 @@ public:
         if (Request->Request.operation_params().has_operation_timeout()) {
             Timeout = GetDuration(Request->Request.operation_params().operation_timeout());
         }
-        ChildrenRecordsLimit = Request->Request.records_limit();
         TIntrusivePtr<TDomainsInfo> domains = AppData()->DomainsInfo;
         TIntrusivePtr<TDomainsInfo::TDomain> domain = domains->Domains.begin()->second;
         DomainPath = "/" + domain->Name;
@@ -1990,80 +1988,6 @@ public:
         record.IssueLog.set_listed(value);
     }
 
-    void RemoveRecordsAboveLimit(TMergeIssuesContext& context, TList<TSelfCheckContext::TIssueRecord>& records) {
-        records.sort([](TSelfCheckContext::TIssueRecord& a, TSelfCheckContext::TIssueRecord& b) { return b.IssueLog.status() < a.IssueLog.status(); }); 
-        ui32 commonListed = 0;
-        for (auto it = records.begin(); it != records.end(); it++) {
-            if (commonListed == ChildrenRecordsLimit) {
-                auto removeIt = it;
-                it--;
-                SetIssueCount(*it, GetIssueCount(*it) + GetIssueCount(*removeIt));
-
-                auto reasons = removeIt->IssueLog.reason();
-                for (auto reasonIt = reasons.begin(); reasonIt != reasons.end(); reasonIt++) {
-                    context.removeIssuesIds.insert(*reasonIt);
-                }
-                context.removeIssuesIds.insert(removeIt->IssueLog.id());
-                records.erase(removeIt);
-            } else if (commonListed + GetIssueListed(*it) > ChildrenRecordsLimit) {
-                auto aboveLimit = commonListed + GetIssueListed(*it) - ChildrenRecordsLimit;
-                SetIssueListed(*it, GetIssueListed(*it) - aboveLimit);
-
-                switch (it->Tag) {
-                    case ETags::GroupState: {
-                        auto groupIds = it->IssueLog.mutable_location()->mutable_storage()->mutable_pool()->mutable_group()->mutable_id();
-                        while (aboveLimit > 0) {
-                            groupIds->RemoveLast();
-                            aboveLimit--;
-                        }
-                        break;
-                    }
-                    case ETags::VDiskState: {
-                        auto vdiscIds = it->IssueLog.mutable_location()->mutable_storage()->mutable_pool()->mutable_group()->mutable_vdisk()->mutable_id();
-                        while (aboveLimit > 0) {
-                            vdiscIds->RemoveLast();
-                            aboveLimit--;
-                        }
-                        break;
-                    }
-                    case ETags::PDiskState: {
-                        auto pdiscs = it->IssueLog.mutable_location()->mutable_storage()->mutable_pool()->mutable_group()->mutable_vdisk()->mutable_pdisk();
-                        while (aboveLimit > 0) {
-                            pdiscs->RemoveLast();
-                            aboveLimit--;
-                        }
-                        break;
-                    }
-                    default: {}
-                }
-                commonListed = ChildrenRecordsLimit;
-            } else {
-                commonListed += GetIssueListed(*it);
-            }
-        }
-    }
-
-    void RemoveRecordsAboveLimit(TMergeIssuesContext& context, ETags levelTag) {
-        auto& records = context.GetRecords(levelTag);
-        if (records.size() > 0) {
-            RemoveRecordsAboveLimit(context, records);
-        }
-    }
-
-    void RemoveRecordsAboveLimit(TMergeIssuesContext& context, ETags levelTag, ETags upperTag) {
-        auto& levelRecords = context.GetRecords(levelTag);
-        auto& upperRecords = context.GetRecords(upperTag);
-
-        TList<TSelfCheckResult::TIssueRecord> handled;
-        for (auto it = upperRecords.begin(); it != upperRecords.end(); it++) {
-            auto children = FindChildrenRecords(levelRecords, *it);
-
-            RemoveRecordsAboveLimit(context, *children);
-            handled.splice(handled.end(), *children);
-        }
-        levelRecords.splice(levelRecords.end(), handled);
-    }
-
     void FillGroupStatus(TGroupId groupId, const NKikimrWhiteboard::TBSGroupStateInfo& groupInfo, Ydb::Monitoring::StorageGroupStatus& storageGroupStatus, TSelfCheckContext context) {
         if (context.Location.mutable_storage()->mutable_pool()->mutable_group()->mutable_id()->empty()) {
             context.Location.mutable_storage()->mutable_pool()->mutable_group()->add_id();
@@ -2158,11 +2082,6 @@ public:
             MergeLevelRecords(mergeContext, ETags::VDiskState, ETags::GroupState);
             MergeLevelRecords(mergeContext, ETags::PDiskState, ETags::VDiskState);
         }
-        if (ChildrenRecordsLimit != 0) {
-            RemoveRecordsAboveLimit(mergeContext, ETags::PDiskState, ETags::VDiskState);
-            RemoveRecordsAboveLimit(mergeContext, ETags::VDiskState, ETags::GroupState);
-            RemoveRecordsAboveLimit(mergeContext, ETags::GroupState);
-        }
         mergeContext.FillRecords(records);
     }
 
@@ -2405,6 +2324,43 @@ public:
         context.FillSelfCheckResult();
     }
 
+    bool TruncateIssuesWithStatusWhileBeyondLimit(Ydb::Monitoring::SelfCheckResult& result, ui64 byteLimit, Ydb::Monitoring::StatusFlag::Status status) {
+        auto byteResult = result.ByteSizeLong();
+        if (byteResult <= byteLimit) {
+            return true;
+        }
+
+        int totalIssues = result.issue_log_size();
+        TVector<int> indexesToRemove;
+        for (int i = 0; i < totalIssues && byteResult > byteLimit; ++i) {
+            if (result.issue_log(i).status() == status) {
+                byteResult -= result.issue_log(i).ByteSizeLong();
+                indexesToRemove.push_back(i);
+            }
+        }
+
+        for (auto it = indexesToRemove.rbegin(); it != indexesToRemove.rend(); ++it) {
+            result.mutable_issue_log()->SwapElements(*it, result.issue_log_size() - 1);
+            result.mutable_issue_log()->RemoveLast();
+        }
+
+        return byteResult <= byteLimit;
+    }
+
+    void TruncateIssuesIfBeyondLimit(Ydb::Monitoring::SelfCheckResult& result, ui64 byteLimit) {
+        auto truncateStatusPriority = {
+            Ydb::Monitoring::StatusFlag::BLUE, 
+            Ydb::Monitoring::StatusFlag::YELLOW, 
+            Ydb::Monitoring::StatusFlag::ORANGE, 
+            Ydb::Monitoring::StatusFlag::RED
+        };
+        for (Ydb::Monitoring::StatusFlag::Status truncateStatus: truncateStatusPriority) {
+            if (TruncateIssuesWithStatusWhileBeyondLimit(result, byteLimit, truncateStatus)) {
+                break;
+            }
+        }
+    }
+
     void ReplyAndPassAway() {
         THolder<TEvSelfCheckResult> response = MakeHolder<TEvSelfCheckResult>();
         Ydb::Monitoring::SelfCheckResult& result = response->Result;
@@ -2423,15 +2379,8 @@ public:
 
         auto byteSize = result.ByteSizeLong();
         auto byteLimit = 50_MB - 1_KB; // 1_KB - for HEALTHCHECK STATUS issue going last
-        if (byteSize > byteLimit) {
-            do {
-                ui32 total = result.issue_log_size();
-                ui32 to_remove = total / 20;
-                for (ui32 i = 0; i < to_remove; ++i) {
-                    result.mutable_issue_log()->RemoveLast();
-                }
-            } while (result.ByteSizeLong() > byteLimit); 
-        }
+        TruncateIssuesIfBeyondLimit(result, byteLimit);
+
         if (byteSize > 30_MB) {
             auto* issue = result.add_issue_log();
             issue->set_type("HEALTHCHECK_STATUS");

+ 10 - 24
ydb/core/health_check/health_check_ut.cpp

@@ -182,7 +182,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
         }
     }
 
-    Ydb::Monitoring::SelfCheckResult RequestHc(int const groupNumber, int const vdiscPerGroupNumber, bool const isMergeRecords = false, int const recordsLimit = 0) {
+    Ydb::Monitoring::SelfCheckResult RequestHc(int const groupNumber, int const vdiscPerGroupNumber, bool const isMergeRecords = false) {
         TPortManager tp;
         ui16 port = tp.GetPort(2134);
         ui16 grpcPort = tp.GetPort(2135);
@@ -228,12 +228,11 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
 
         auto *request = new NHealthCheck::TEvSelfCheckRequest;
         request->Request.set_merge_records(isMergeRecords);
-        request->Request.set_records_limit(recordsLimit);
         runtime.Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, request, 0));
         return runtime.GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
     }
 
-    void CheckHcResult(Ydb::Monitoring::SelfCheckResult& result, int const groupNumber, int const vdiscPerGroupNumber, bool const isMergeRecords = false, int const recordsLimit = 0) {
+    void CheckHcResult(Ydb::Monitoring::SelfCheckResult& result, int const groupNumber, int const vdiscPerGroupNumber, bool const isMergeRecords = false) {
         int groupIssuesCount = 0;
         int groupIssuesNumber = !isMergeRecords ? groupNumber : 1;
         for (const auto& issue_log : result.Getissue_log()) {
@@ -243,9 +242,8 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
                     UNIT_ASSERT_VALUES_EQUAL(issue_log.listed(), 0);
                     UNIT_ASSERT_VALUES_EQUAL(issue_log.count(), 0);
                 } else {
-                    int groupListed = recordsLimit == 0 ? groupNumber : std::min<int>(groupNumber, recordsLimit);
-                    UNIT_ASSERT_VALUES_EQUAL(issue_log.location().storage().pool().group().id_size(), groupListed);
-                    UNIT_ASSERT_VALUES_EQUAL(issue_log.listed(), groupListed);
+                    UNIT_ASSERT_VALUES_EQUAL(issue_log.location().storage().pool().group().id_size(), groupNumber);
+                    UNIT_ASSERT_VALUES_EQUAL(issue_log.listed(), groupNumber);
                     UNIT_ASSERT_VALUES_EQUAL(issue_log.count(), groupNumber);
                 }
                 groupIssuesCount++;
@@ -264,9 +262,9 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
         UNIT_ASSERT_VALUES_EQUAL(issueVdiscCount, issueVdiscNumber);
     }
 
-    void ListingTest(int const groupNumber, int const vdiscPerGroupNumber, bool const isMergeRecords = false, int const recordsLimit = 0) {
-        auto result = RequestHc(groupNumber, vdiscPerGroupNumber, isMergeRecords, recordsLimit);
-        CheckHcResult(result, groupNumber, vdiscPerGroupNumber, isMergeRecords, recordsLimit);
+    void ListingTest(int const groupNumber, int const vdiscPerGroupNumber, bool const isMergeRecords = false) {
+        auto result = RequestHc(groupNumber, vdiscPerGroupNumber, isMergeRecords);
+        CheckHcResult(result, groupNumber, vdiscPerGroupNumber, isMergeRecords);
     }
 
     Ydb::Monitoring::SelfCheckResult RequestHcWithVdisks(TString erasurespecies, const TVector<Ydb::Monitoring::StatusFlag::Status>& vdiskStatuses) {
@@ -320,7 +318,6 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
 
         auto *request = new NHealthCheck::TEvSelfCheckRequest;
         request->Request.set_merge_records(true);
-        request->Request.set_records_limit(10);
 
         runtime.Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, request, 0));
         return runtime.GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
@@ -418,18 +415,6 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
         ListingTest(100, 100, true);
     }
 
-    Y_UNIT_TEST(Issues100GroupsMergingAndLimiting) {
-        ListingTest(100, 1, true, 10);
-    }
-
-    Y_UNIT_TEST(Issues100VCardMergingAndLimiting) {
-        ListingTest(1, 100, true, 10);
-    }
-
-    Y_UNIT_TEST(Issues100Groups100VCardMergingAndLimiting) {
-        ListingTest(100, 100, true, 10);
-    }
-
     Y_UNIT_TEST(NoneRedGroupWhenRedVdisk) {
         auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::NONE, {Ydb::Monitoring::StatusFlag::RED});
         CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 1);
@@ -555,16 +540,17 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
         UNIT_ASSERT_VALUES_EQUAL(issuesCount, total);
     }
 
-    Y_UNIT_TEST(ProtobufUnderLimit50Mb) {
+    Y_UNIT_TEST(ProtobufUnderLimitFor5000Groups) {
         auto result = RequestHc(3000, 1);
         CheckHcProtobufSize(result, Ydb::Monitoring::StatusFlag::RED, 0);
         CheckHcProtobufSize(result, Ydb::Monitoring::StatusFlag::ORANGE, 0);
         CheckHcProtobufSize(result, Ydb::Monitoring::StatusFlag::YELLOW, 0);
     }
 
-    Y_UNIT_TEST(ProtobufBeyondLimit50Mb) {
+    Y_UNIT_TEST(ProtobufUnderLimitFor350000Groups) {
         auto result = RequestHc(350000, 1);
         CheckHcProtobufSize(result, Ydb::Monitoring::StatusFlag::RED, 1);
+        UNIT_ASSERT_LT(result.ByteSizeLong(), 50_MB);
     }
 }
 

+ 0 - 2
ydb/core/viewer/json_healthcheck.h

@@ -72,7 +72,6 @@ public:
         request->Request.set_return_verbose_status(FromStringWithDefault<bool>(params.Get("verbose"), false));
         request->Request.set_maximum_level(FromStringWithDefault<ui32>(params.Get("max_level"), 0));
         request->Request.set_merge_records(FromStringWithDefault<bool>(params.Get("merge_records"), false));
-        request->Request.set_records_limit(FromStringWithDefault<ui32>(params.Get("records_limit"), 0));
         SetDuration(TDuration::MilliSeconds(Timeout), *request->Request.mutable_operation_params()->mutable_operation_timeout());
         if (params.Has("min_status")) {
             Ydb::Monitoring::StatusFlag::Status minStatus;
@@ -210,7 +209,6 @@ struct TJsonRequestParameters<TJsonHealthCheck> {
                       {"name":"tenant","in":"query","description":"path to database","required":false,"type":"string"},
                       {"name":"verbose","in":"query","description":"return verbose status","required":false,"type":"boolean"},
                       {"name":"merge_records","in":"query","description":"merge records","required":false,"type":"boolean"},
-                      {"name":"records_limit","in":"query","description":"children records limit","required":false,"type":"integer"},
                       {"name":"max_level","in":"query","description":"max depth of issues to return","required":false,"type":"integer"},
                       {"name":"min_status","in":"query","description":"min status of issues to return","required":false,"type":"string"},
                       {"name":"format","in":"query","description":"format of reply","required":false,"type":"string"}])___";

+ 0 - 1
ydb/public/api/protos/ydb_monitoring.proto

@@ -29,7 +29,6 @@ message SelfCheckRequest {
     uint32 maximum_level = 4; // maximum level of issues to return
     bool do_not_cache = 5; // by default database health state is taken from metadata cache; this option can be used to force bypassing that cache
     bool merge_records = 6; // combine similar records with similar status, message and level into one issue
-    uint32 records_limit = 7; // limit the number of records that have one parent record, default - without limit
 }
 
 message SelfCheckResponse {