|
@@ -2,6 +2,8 @@
|
|
|
#include "mkql_match_recognize_matched_vars.h"
|
|
|
#include "mkql_match_recognize_measure_arg.h"
|
|
|
#include "mkql_match_recognize_nfa.h"
|
|
|
+#include "mkql_match_recognize_save_load.h"
|
|
|
+
|
|
|
#include <ydb/library/yql/core/sql_types/match_recognize.h>
|
|
|
#include <ydb/library/yql/minikql/computation/mkql_computation_node_impl.h>
|
|
|
#include <ydb/library/yql/minikql/computation/mkql_computation_node_holders.h>
|
|
@@ -20,6 +22,8 @@ namespace NMatchRecognize {
|
|
|
enum class EOutputColumnSource {PartitionKey, Measure};
|
|
|
using TOutputColumnOrder = std::vector<std::pair<EOutputColumnSource, size_t>, TMKQLAllocator<std::pair<EOutputColumnSource, size_t>>>;
|
|
|
|
|
|
+constexpr ui32 StateVersion = 1;
|
|
|
+
|
|
|
using namespace NYql::NMatchRecognize;
|
|
|
|
|
|
struct TMatchRecognizeProcessorParameters {
|
|
@@ -43,7 +47,17 @@ class TBackTrackingMatchRecognize {
|
|
|
public:
|
|
|
//TODO(YQL-16486): create a tree for backtracking(replace var names with indexes)
|
|
|
|
|
|
- struct TPatternConfiguration {};
|
|
|
+ struct TPatternConfiguration {
|
|
|
+ void Save(TOutputSerializer& /*serializer*/) const {
|
|
|
+ }
|
|
|
+
|
|
|
+ void Load(TInputSerializer& /*serializer*/) {
|
|
|
+ }
|
|
|
+
|
|
|
+ friend bool operator==(const TPatternConfiguration&, const TPatternConfiguration&) {
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+ };
|
|
|
|
|
|
struct TPatternConfigurationBuilder {
|
|
|
using TPatternConfigurationPtr = std::shared_ptr<TPatternConfiguration>;
|
|
@@ -124,6 +138,15 @@ public:
|
|
|
}
|
|
|
return not Matches.empty();
|
|
|
}
|
|
|
+
|
|
|
+ void Save(TOutputSerializer& /*serializer*/) const {
|
|
|
+ // Not used in not streaming mode.
|
|
|
+ }
|
|
|
+
|
|
|
+ void Load(TInputSerializer& /*serializer*/) {
|
|
|
+ // Not used in not streaming mode.
|
|
|
+ }
|
|
|
+
|
|
|
private:
|
|
|
const NUdf::TUnboxedValue PartitionKey;
|
|
|
const TMatchRecognizeProcessorParameters& Parameters;
|
|
@@ -137,8 +160,8 @@ private:
|
|
|
class TStreamingMatchRecognize {
|
|
|
using TPartitionList = TSparseList;
|
|
|
using TRange = TPartitionList::TRange;
|
|
|
- using TMatchedVars = TMatchedVars<TRange>;
|
|
|
public:
|
|
|
+ using TPatternConfiguration = TNfaTransitionGraph;
|
|
|
using TPatternConfigurationBuilder = TNfaTransitionGraphBuilder;
|
|
|
TStreamingMatchRecognize(
|
|
|
NUdf::TUnboxedValue&& partitionKey,
|
|
@@ -157,12 +180,18 @@ public:
|
|
|
Parameters.InputDataArg->SetValue(ctx, ctx.HolderFactory.Create<TListValue<TSparseList>>(Rows));
|
|
|
Parameters.CurrentRowIndexArg->SetValue(ctx, NUdf::TUnboxedValuePod(Rows.Size()));
|
|
|
Nfa.ProcessRow(Rows.Append(std::move(row)), ctx);
|
|
|
+ return HasMatched();
|
|
|
+ }
|
|
|
+
|
|
|
+ bool HasMatched() const {
|
|
|
return Nfa.HasMatched();
|
|
|
}
|
|
|
+
|
|
|
NUdf::TUnboxedValue GetOutputIfReady(TComputationContext& ctx) {
|
|
|
auto match = Nfa.GetMatched();
|
|
|
- if (!match.has_value())
|
|
|
+ if (!match.has_value()) {
|
|
|
return NUdf::TUnboxedValue{};
|
|
|
+ }
|
|
|
Parameters.MatchedVarsArg->SetValue(ctx, ctx.HolderFactory.Create<TMatchedVarsValue<TRange>>(ctx.HolderFactory, match.value()));
|
|
|
Parameters.MeasureInputDataArg->SetValue(ctx, ctx.HolderFactory.Create<TMeasureInputDataValue>(
|
|
|
ctx.HolderFactory.Create<TListValue<TSparseList>>(Rows),
|
|
@@ -189,6 +218,21 @@ public:
|
|
|
Y_UNUSED(ctx);
|
|
|
return false;
|
|
|
}
|
|
|
+
|
|
|
+ void Save(TOutputSerializer& serializer) const {
|
|
|
+ // PartitionKey saved in TStateForInterleavedPartitions as key.
|
|
|
+ Rows.Save(serializer);
|
|
|
+ Nfa.Save(serializer);
|
|
|
+ serializer.Write(MatchNumber);
|
|
|
+ }
|
|
|
+
|
|
|
+ void Load(TInputSerializer& serializer) {
|
|
|
+ // PartitionKey passed in contructor.
|
|
|
+ Rows.Load(serializer);
|
|
|
+ Nfa.Load(serializer);
|
|
|
+ MatchNumber = serializer.Read<ui64>();
|
|
|
+ }
|
|
|
+
|
|
|
private:
|
|
|
const NUdf::TUnboxedValue PartitionKey;
|
|
|
const TMatchRecognizeProcessorParameters& Parameters;
|
|
@@ -205,12 +249,15 @@ class TStateForNonInterleavedPartitions
|
|
|
using TRowPatternConfigurationBuilder = typename Algo::TPatternConfigurationBuilder;
|
|
|
public:
|
|
|
TStateForNonInterleavedPartitions(
|
|
|
- TMemoryUsageInfo* memInfo,
|
|
|
- IComputationExternalNode* inputRowArg,
|
|
|
- IComputationNode* partitionKey,
|
|
|
- TType* partitionKeyType,
|
|
|
- const TMatchRecognizeProcessorParameters& parameters,
|
|
|
- const TContainerCacheOnContext& cache
|
|
|
+ TMemoryUsageInfo* memInfo,
|
|
|
+ IComputationExternalNode* inputRowArg,
|
|
|
+ IComputationNode* partitionKey,
|
|
|
+ TType* partitionKeyType,
|
|
|
+ const TMatchRecognizeProcessorParameters& parameters,
|
|
|
+ const TContainerCacheOnContext& cache,
|
|
|
+ TComputationContext &ctx,
|
|
|
+ TType* rowType,
|
|
|
+ const TMutableObjectOverBoxedValue<TValuePackerBoxed>& rowPacker
|
|
|
)
|
|
|
: TComputationValue<TStateForNonInterleavedPartitions>(memInfo)
|
|
|
, InputRowArg(inputRowArg)
|
|
@@ -220,7 +267,54 @@ public:
|
|
|
, RowPatternConfiguration(TRowPatternConfigurationBuilder::Create(parameters.Pattern, parameters.VarNamesLookup))
|
|
|
, Cache(cache)
|
|
|
, Terminating(false)
|
|
|
+ , SerializerContext(ctx, rowType, rowPacker)
|
|
|
{}
|
|
|
+
|
|
|
+ NUdf::TUnboxedValue Save() const override {
|
|
|
+ TOutputSerializer serializer(SerializerContext);
|
|
|
+ serializer.Write(StateVersion);
|
|
|
+ serializer.Write(CurPartitionPackedKey);
|
|
|
+ bool isValid = static_cast<bool>(PartitionHandler);
|
|
|
+ serializer.Write(isValid);
|
|
|
+ if (isValid) {
|
|
|
+ PartitionHandler->Save(serializer);
|
|
|
+ }
|
|
|
+ isValid = static_cast<bool>(DelayedRow);
|
|
|
+ serializer.Write(isValid);
|
|
|
+ if (isValid) {
|
|
|
+ serializer.Write(DelayedRow);
|
|
|
+ }
|
|
|
+ RowPatternConfiguration->Save(serializer);
|
|
|
+ return serializer.MakeString();
|
|
|
+ }
|
|
|
+
|
|
|
+ void Load(const NUdf::TStringRef& state) override {
|
|
|
+ TInputSerializer serializer(SerializerContext, state);
|
|
|
+ const auto stateVersion = serializer.Read<decltype(StateVersion)>();
|
|
|
+ if (stateVersion == 1) {
|
|
|
+ serializer.Read(CurPartitionPackedKey);
|
|
|
+ bool validPartitionHandler = serializer.Read<bool>();
|
|
|
+ if (validPartitionHandler) {
|
|
|
+ NUdf::TUnboxedValue key = PartitionKeyPacker.Unpack(CurPartitionPackedKey, SerializerContext.Ctx.HolderFactory);
|
|
|
+ PartitionHandler.reset(new Algo(
|
|
|
+ std::move(key),
|
|
|
+ Parameters,
|
|
|
+ RowPatternConfiguration,
|
|
|
+ Cache
|
|
|
+ ));
|
|
|
+ PartitionHandler->Load(serializer);
|
|
|
+ }
|
|
|
+ bool validDelayedRow = serializer.Read<bool>();
|
|
|
+ if (validDelayedRow) {
|
|
|
+ DelayedRow = serializer.Read<NUdf::TUnboxedValue>();
|
|
|
+ }
|
|
|
+ auto restoredRowPatternConfiguration = std::make_shared<typename Algo::TPatternConfiguration>();
|
|
|
+ restoredRowPatternConfiguration->Load(serializer);
|
|
|
+ MKQL_ENSURE(*restoredRowPatternConfiguration == *RowPatternConfiguration, "Restored and current RowPatternConfiguration is different");
|
|
|
+ }
|
|
|
+ MKQL_ENSURE(serializer.Empty(), "State is corrupted");
|
|
|
+ }
|
|
|
+
|
|
|
bool ProcessInputRow(NUdf::TUnboxedValue&& row, TComputationContext& ctx) {
|
|
|
MKQL_ENSURE(not DelayedRow, "Internal logic error"); //we're finalizing previous partition
|
|
|
InputRowArg->SetValue(ctx, NUdf::TUnboxedValue(row));
|
|
@@ -288,6 +382,7 @@ private:
|
|
|
const TContainerCacheOnContext& Cache;
|
|
|
NUdf::TUnboxedValue DelayedRow;
|
|
|
bool Terminating;
|
|
|
+ TSerializerContext SerializerContext;
|
|
|
};
|
|
|
|
|
|
class TStateForInterleavedPartitions
|
|
@@ -302,7 +397,10 @@ public:
|
|
|
IComputationNode* partitionKey,
|
|
|
TType* partitionKeyType,
|
|
|
const TMatchRecognizeProcessorParameters& parameters,
|
|
|
- const TContainerCacheOnContext& cache
|
|
|
+ const TContainerCacheOnContext& cache,
|
|
|
+ TComputationContext &ctx,
|
|
|
+ TType* rowType,
|
|
|
+ const TMutableObjectOverBoxedValue<TValuePackerBoxed>& rowPacker
|
|
|
)
|
|
|
: TComputationValue<TStateForInterleavedPartitions>(memInfo)
|
|
|
, InputRowArg(inputRowArg)
|
|
@@ -311,9 +409,59 @@ public:
|
|
|
, Parameters(parameters)
|
|
|
, NfaTransitionGraph(TNfaTransitionGraphBuilder::Create(parameters.Pattern, parameters.VarNamesLookup))
|
|
|
, Cache(cache)
|
|
|
-{
|
|
|
-}
|
|
|
- bool ProcessInputRow(NUdf::TUnboxedValue&& row, TComputationContext& ctx) {
|
|
|
+ , SerializerContext(ctx, rowType, rowPacker)
|
|
|
+ {}
|
|
|
+
|
|
|
+ NUdf::TUnboxedValue Save() const override {
|
|
|
+ TOutputSerializer serializer(SerializerContext);
|
|
|
+ serializer.Write(StateVersion);
|
|
|
+ serializer.Write(Partitions.size());
|
|
|
+
|
|
|
+ for (const auto& [key, state] : Partitions) {
|
|
|
+ serializer.Write(key);
|
|
|
+ state->Save(serializer);
|
|
|
+ }
|
|
|
+ // HasReadyOutput is not packed because when loading we can recalculate HasReadyOutput from Partitions.
|
|
|
+ serializer.Write(Terminating);
|
|
|
+ NfaTransitionGraph->Save(serializer);
|
|
|
+ return serializer.MakeString();
|
|
|
+ }
|
|
|
+
|
|
|
+ void Load(const NUdf::TStringRef& state) override {
|
|
|
+ TInputSerializer serializer(SerializerContext, state);
|
|
|
+ const auto stateVersion = serializer.Read<decltype(StateVersion)>();
|
|
|
+ if (stateVersion == 1) {
|
|
|
+ Partitions.clear();
|
|
|
+ auto partitionsCount = serializer.Read<TPartitionMap::size_type>();
|
|
|
+ Partitions.reserve(partitionsCount);
|
|
|
+ for (size_t i = 0; i < partitionsCount; ++i) {
|
|
|
+ auto packedKey = serializer.Read<TPartitionMap::key_type, std::string_view>();
|
|
|
+ NUdf::TUnboxedValue key = PartitionKeyPacker.Unpack(packedKey, SerializerContext.Ctx.HolderFactory);
|
|
|
+ auto pair = Partitions.emplace(
|
|
|
+ packedKey,
|
|
|
+ std::make_unique<TStreamingMatchRecognize>(
|
|
|
+ std::move(key),
|
|
|
+ Parameters,
|
|
|
+ NfaTransitionGraph,
|
|
|
+ Cache));
|
|
|
+ pair.first->second->Load(serializer);
|
|
|
+ }
|
|
|
+
|
|
|
+ for (auto it = Partitions.begin(); it != Partitions.end(); ++it) {
|
|
|
+ if (it->second->HasMatched()) {
|
|
|
+ HasReadyOutput.push(it);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ serializer.Read(Terminating);
|
|
|
+ auto restoredTransitionGraph = std::make_shared<TNfaTransitionGraph>();
|
|
|
+ restoredTransitionGraph->Load(serializer);
|
|
|
+ MKQL_ENSURE(NfaTransitionGraph, "Empty NfaTransitionGraph");
|
|
|
+ MKQL_ENSURE(*restoredTransitionGraph == *NfaTransitionGraph, "Restored and current NfaTransitionGraph is different");
|
|
|
+ }
|
|
|
+ MKQL_ENSURE(serializer.Empty(), "State is corrupted");
|
|
|
+ }
|
|
|
+
|
|
|
+ bool ProcessInputRow(NUdf::TUnboxedValue&& row, TComputationContext& ctx) {
|
|
|
auto partition = GetPartitionHandler(row, ctx);
|
|
|
if (partition->second->ProcessInputRow(std::move(row), ctx)) {
|
|
|
HasReadyOutput.push(partition);
|
|
@@ -375,17 +523,19 @@ private:
|
|
|
const TMatchRecognizeProcessorParameters& Parameters;
|
|
|
const TNfaTransitionGraph::TPtr NfaTransitionGraph;
|
|
|
const TContainerCacheOnContext& Cache;
|
|
|
+ TSerializerContext SerializerContext;
|
|
|
};
|
|
|
|
|
|
template<class State>
|
|
|
-class TMatchRecognizeWrapper : public TStatefulFlowComputationNode<TMatchRecognizeWrapper<State>> {
|
|
|
- using TBaseComputation = TStatefulFlowComputationNode<TMatchRecognizeWrapper<State>>;
|
|
|
+class TMatchRecognizeWrapper : public TStatefulFlowComputationNode<TMatchRecognizeWrapper<State>, true> {
|
|
|
+ using TBaseComputation = TStatefulFlowComputationNode<TMatchRecognizeWrapper<State>, true>;
|
|
|
public:
|
|
|
TMatchRecognizeWrapper(TComputationMutables &mutables, EValueRepresentation kind, IComputationNode *inputFlow,
|
|
|
IComputationExternalNode *inputRowArg,
|
|
|
IComputationNode *partitionKey,
|
|
|
TType* partitionKeyType,
|
|
|
- const TMatchRecognizeProcessorParameters& parameters
|
|
|
+ const TMatchRecognizeProcessorParameters& parameters,
|
|
|
+ TType* rowType
|
|
|
)
|
|
|
:TBaseComputation(mutables, inputFlow, kind, EValueRepresentation::Embedded)
|
|
|
, InputFlow(inputFlow)
|
|
@@ -394,17 +544,36 @@ public:
|
|
|
, PartitionKeyType(partitionKeyType)
|
|
|
, Parameters(parameters)
|
|
|
, Cache(mutables)
|
|
|
+ , RowType(rowType)
|
|
|
+ , RowPacker(mutables)
|
|
|
{}
|
|
|
|
|
|
NUdf::TUnboxedValue DoCalculate(NUdf::TUnboxedValue &stateValue, TComputationContext &ctx) const {
|
|
|
if (stateValue.IsInvalid()) {
|
|
|
stateValue = ctx.HolderFactory.Create<State>(
|
|
|
- InputRowArg,
|
|
|
- PartitionKey,
|
|
|
- PartitionKeyType,
|
|
|
- Parameters,
|
|
|
- Cache
|
|
|
+ InputRowArg,
|
|
|
+ PartitionKey,
|
|
|
+ PartitionKeyType,
|
|
|
+ Parameters,
|
|
|
+ Cache,
|
|
|
+ ctx,
|
|
|
+ RowType,
|
|
|
+ RowPacker
|
|
|
+ );
|
|
|
+ } else if (stateValue.HasValue() && !stateValue.IsBoxed()) {
|
|
|
+ // Load from saved state.
|
|
|
+ NUdf::TUnboxedValue state = ctx.HolderFactory.Create<State>(
|
|
|
+ InputRowArg,
|
|
|
+ PartitionKey,
|
|
|
+ PartitionKeyType,
|
|
|
+ Parameters,
|
|
|
+ Cache,
|
|
|
+ ctx,
|
|
|
+ RowType,
|
|
|
+ RowPacker
|
|
|
);
|
|
|
+ state.Load(stateValue.AsStringRef());
|
|
|
+ stateValue = state;
|
|
|
}
|
|
|
auto state = static_cast<State*>(stateValue.AsBoxed().Get());
|
|
|
while (true) {
|
|
@@ -446,8 +615,9 @@ private:
|
|
|
IComputationNode* const PartitionKey;
|
|
|
TType* const PartitionKeyType;
|
|
|
const TMatchRecognizeProcessorParameters Parameters;
|
|
|
- TNfaTransitionGraph::TPtr NfaTransitionGraph;
|
|
|
const TContainerCacheOnContext Cache;
|
|
|
+ TType* const RowType;
|
|
|
+ TMutableObjectOverBoxedValue<TValuePackerBoxed> RowPacker;
|
|
|
};
|
|
|
|
|
|
TOutputColumnOrder GetOutputColumnOrder(TRuntimeNode partitionKyeColumnsIndexes, TRuntimeNode measureColumnsIndexes) {
|
|
@@ -550,7 +720,6 @@ std::pair<TUnboxedValueVector, THashMap<TString, size_t>> ConvertListOfStrings(c
|
|
|
} //namespace NMatchRecognize
|
|
|
|
|
|
|
|
|
-
|
|
|
IComputationNode* WrapMatchRecognizeCore(TCallable& callable, const TComputationNodeFactoryContext& ctx) {
|
|
|
using namespace NMatchRecognize;
|
|
|
size_t inputIndex = 0;
|
|
@@ -579,6 +748,7 @@ IComputationNode* WrapMatchRecognizeCore(TCallable& callable, const TComputation
|
|
|
MKQL_ENSURE(callable.GetInputsCount() == inputIndex, "Wrong input count");
|
|
|
|
|
|
const auto& [vars, varsLookup] = ConvertListOfStrings(varNames);
|
|
|
+ auto* rowType = AS_TYPE(TStructType, AS_TYPE(TFlowType, inputFlow.GetStaticType())->GetItemType());
|
|
|
|
|
|
const auto parameters = TMatchRecognizeProcessorParameters {
|
|
|
static_cast<IComputationExternalNode*>(LocateNode(ctx.NodeLocator, *inputDataArg.GetNode()))
|
|
@@ -604,6 +774,7 @@ IComputationNode* WrapMatchRecognizeCore(TCallable& callable, const TComputation
|
|
|
, LocateNode(ctx.NodeLocator, *partitionKeySelector.GetNode())
|
|
|
, partitionKeySelector.GetStaticType()
|
|
|
, std::move(parameters)
|
|
|
+ , rowType
|
|
|
);
|
|
|
} else {
|
|
|
const bool useNfaForTables = true; //TODO(YQL-16486) get this flag from an optimizer
|
|
@@ -615,6 +786,7 @@ IComputationNode* WrapMatchRecognizeCore(TCallable& callable, const TComputation
|
|
|
, LocateNode(ctx.NodeLocator, *partitionKeySelector.GetNode())
|
|
|
, partitionKeySelector.GetStaticType()
|
|
|
, std::move(parameters)
|
|
|
+ , rowType
|
|
|
);
|
|
|
} else {
|
|
|
return new TMatchRecognizeWrapper<TStateForNonInterleavedPartitions<TBackTrackingMatchRecognize>>(ctx.Mutables
|
|
@@ -624,6 +796,7 @@ IComputationNode* WrapMatchRecognizeCore(TCallable& callable, const TComputation
|
|
|
, LocateNode(ctx.NodeLocator, *partitionKeySelector.GetNode())
|
|
|
, partitionKeySelector.GetStaticType()
|
|
|
, std::move(parameters)
|
|
|
+ , rowType
|
|
|
);
|
|
|
}
|
|
|
}
|