pool.cpp 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
  1. #include "pool.h"
  2. #include <util/string/cast.h>
  3. #include <util/stream/file.h>
  4. TInstance TInstance::FromFeaturesString(const TString& featuresString) {
  5. TInstance instance;
  6. TStringBuf featuresStringBuf(featuresString);
  7. featuresStringBuf.NextTok('\t'); // query id
  8. instance.Goal = FromString(featuresStringBuf.NextTok('\t'));
  9. featuresStringBuf.NextTok('\t'); // url
  10. instance.Weight = FromString(featuresStringBuf.NextTok('\t'));
  11. while (featuresStringBuf) {
  12. instance.Features.push_back(FromString(featuresStringBuf.NextTok('\t')));
  13. }
  14. return instance;
  15. }
  16. TPool::TCVIterator::TCVIterator(const TPool& parentPool, const size_t foldsCount, const EIteratorType iteratorType)
  17. : ParentPool(parentPool)
  18. , FoldsCount(foldsCount)
  19. , IteratorType(iteratorType)
  20. , InstanceFoldNumbers(ParentPool.size())
  21. {
  22. }
  23. void TPool::TCVIterator::ResetShuffle() {
  24. TVector<size_t> instanceNumbers(ParentPool.size());
  25. for (size_t instanceNumber = 0; instanceNumber < ParentPool.size(); ++instanceNumber) {
  26. instanceNumbers[instanceNumber] = instanceNumber;
  27. }
  28. Shuffle(instanceNumbers.begin(), instanceNumbers.end(), RandomGenerator);
  29. for (size_t instancePosition = 0; instancePosition < ParentPool.size(); ++instancePosition) {
  30. InstanceFoldNumbers[instanceNumbers[instancePosition]] = instancePosition % FoldsCount;
  31. }
  32. Current = InstanceFoldNumbers.begin();
  33. }
  34. void TPool::TCVIterator::SetTestFold(const size_t testFoldNumber) {
  35. TestFoldNumber = testFoldNumber;
  36. Current = InstanceFoldNumbers.begin();
  37. Advance();
  38. }
  39. bool TPool::TCVIterator::IsValid() const {
  40. return Current != InstanceFoldNumbers.end();
  41. }
  42. const TInstance& TPool::TCVIterator::operator*() const {
  43. return ParentPool[Current - InstanceFoldNumbers.begin()];
  44. }
  45. const TInstance* TPool::TCVIterator::operator->() const {
  46. return &ParentPool[Current - InstanceFoldNumbers.begin()];
  47. }
  48. TPool::TCVIterator& TPool::TCVIterator::operator++() {
  49. Advance();
  50. return *this;
  51. }
  52. void TPool::TCVIterator::Advance() {
  53. while (IsValid()) {
  54. ++Current;
  55. if (IsValid() && TakeCurrent()) {
  56. break;
  57. }
  58. }
  59. }
  60. bool TPool::TCVIterator::TakeCurrent() const {
  61. switch (IteratorType) {
  62. case LearnIterator:
  63. return *Current != TestFoldNumber;
  64. case TestIterator:
  65. return *Current == TestFoldNumber;
  66. }
  67. return false;
  68. }
  69. void TPool::ReadFromFeatures(const TString& featuresPath) {
  70. TFileInput featuresIn(featuresPath);
  71. TString featuresString;
  72. while (featuresIn.ReadLine(featuresString)) {
  73. this->push_back(TInstance::FromFeaturesString(featuresString));
  74. }
  75. }
  76. TPool::TCVIterator TPool::CrossValidationIterator(const size_t foldsCount, const EIteratorType iteratorType) const {
  77. return TPool::TCVIterator(*this, foldsCount, iteratorType);
  78. }
  79. TPool TPool::InjurePool(const double injureFactor, const double injureOffset) const {
  80. TPool injuredPool(*this);
  81. for (TInstance& instance : injuredPool) {
  82. for (double& feature : instance.Features) {
  83. feature = feature * injureFactor + injureOffset;
  84. }
  85. instance.Goal = instance.Goal * injureFactor + injureOffset;
  86. }
  87. return injuredPool;
  88. }