atomizer.h 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200
  1. #pragma once
  #include <library/cpp/containers/str_map/str_map.h>
  #include <util/generic/vector.h>
  #include <util/generic/utility.h>
  #include <utility>
  #include <climits>
  #include <cstdio>
  #include <cstdlib>
  #include <cstring>
  7. template <class HashFcn = THash<const char*>, class EqualTo = TEqualTo<const char*>>
  8. class atomizer;
  9. template <class T, class HashFcn = THash<const char*>, class EqualTo = TEqualTo<const char*>>
  10. class super_atomizer;
  11. template <class HashFcn, class EqualTo>
  12. class atomizer: public string_hash<ui32, HashFcn, EqualTo> {
  13. private:
  14. TVector<const char*> order;
  15. public:
  16. using iterator = typename string_hash<ui32, HashFcn, EqualTo>::iterator;
  17. using const_iterator = typename string_hash<ui32, HashFcn, EqualTo>::const_iterator;
  18. using value_type = typename string_hash<ui32, HashFcn, EqualTo>::value_type;
  19. using size_type = typename string_hash<ui32, HashFcn, EqualTo>::size_type;
  20. using pool_size_type = typename string_hash<ui32, HashFcn, EqualTo>::pool_size_type;
  21. using string_hash<ui32, HashFcn, EqualTo>::pool;
  22. using string_hash<ui32, HashFcn, EqualTo>::size;
  23. using string_hash<ui32, HashFcn, EqualTo>::find;
  24. using string_hash<ui32, HashFcn, EqualTo>::end;
  25. using string_hash<ui32, HashFcn, EqualTo>::insert_copy;
  26. using string_hash<ui32, HashFcn, EqualTo>::clear_hash;
  27. atomizer() {
  28. order.reserve(HASH_SIZE_DEFAULT);
  29. }
  30. atomizer(size_type hash_size, pool_size_type pool_size)
  31. : string_hash<ui32, HashFcn, EqualTo>(hash_size, pool_size)
  32. {
  33. order.reserve(hash_size);
  34. }
  35. ~atomizer() = default;
  36. ui32 string_to_atom(const char* key) {
  37. const char* old_begin = pool.Begin();
  38. const char* old_end = pool.End();
  39. std::pair<iterator, bool> ins = insert_copy(key, ui32(size() + 1));
  40. if (ins.second) { // new?
  41. if (pool.Begin() != old_begin) // repoint?
  42. for (TVector<const char*>::iterator ptr = order.begin(); ptr != order.end(); ++ptr)
  43. if (old_begin <= *ptr && *ptr < old_end) // from old pool?
  44. *ptr += pool.Begin() - old_begin;
  45. order.push_back((*ins.first).first); // copy of 'key'
  46. }
  47. return (ui32)(*ins.first).second;
  48. }
  49. ui32 perm_string_to_atom(const char* key) {
  50. value_type val(key, ui32(size() + 1));
  51. std::pair<iterator, bool> ins = this->insert(val);
  52. if (ins.second)
  53. order.push_back((*ins.first).first); // == copy of 'key'
  54. return (ui32)(*ins.first).second; // == size()+1
  55. }
  56. ui32 find_atom(const char* key) const {
  57. const_iterator it = find(key);
  58. if (it == end())
  59. return 0; // INVALID_ATOM
  60. else
  61. return (ui32)(*it).second;
  62. }
  63. const char* get_atom_name(ui32 atom) const {
  64. if (atom && atom <= size())
  65. return order[atom - 1];
  66. return nullptr;
  67. }
  68. void clear_atomizer() {
  69. clear_hash();
  70. order.clear();
  71. }
  72. void SaveC2N(FILE* f) const { // we write sorted file
  73. for (ui32 i = 0; i < order.size(); i++)
  74. if (order[i])
  75. fprintf(f, "%d\t%s\n", i + 1, order[i]);
  76. }
  77. void LoadC2N(FILE* f) { // but can read unsorted one
  78. long k, km = 0;
  79. char buf[1000];
  80. char* s;
  81. while (fgets(buf, 1000, f)) {
  82. k = strtol(buf, &s, 10);
  83. char* endl = strchr(s, '\n');
  84. if (endl)
  85. *endl = 0;
  86. if (k > 0 && k != LONG_MAX) {
  87. km = Max(km, k);
  88. insert_copy(++s, ui32(k));
  89. }
  90. }
  91. order.resize(km);
  92. memset(&order[0], 0, order.size()); // if some atoms are absent
  93. for (const_iterator I = this->begin(); I != end(); ++I)
  94. order[(*I).second - 1] = (*I).first;
  95. }
  96. };
  97. template <class T, class HashFcn, class EqualTo>
  98. class super_atomizer: public string_hash<ui32, HashFcn, EqualTo> {
  99. private:
  100. using TOrder = TVector<std::pair<const char*, T>>;
  101. TOrder order;
  102. public:
  103. using iterator = typename string_hash<ui32, HashFcn, EqualTo>::iterator;
  104. using const_iterator = typename string_hash<ui32, HashFcn, EqualTo>::const_iterator;
  105. using value_type = typename string_hash<ui32, HashFcn, EqualTo>::value_type;
  106. using size_type = typename string_hash<ui32, HashFcn, EqualTo>::size_type;
  107. using pool_size_type = typename string_hash<ui32, HashFcn, EqualTo>::pool_size_type;
  108. using o_iterator = typename TOrder::iterator;
  109. using o_const_iterator = typename TOrder::const_iterator;
  110. using o_value_type = typename TOrder::value_type;
  111. using string_hash<ui32, HashFcn, EqualTo>::pool;
  112. using string_hash<ui32, HashFcn, EqualTo>::size;
  113. using string_hash<ui32, HashFcn, EqualTo>::find;
  114. using string_hash<ui32, HashFcn, EqualTo>::end;
  115. using string_hash<ui32, HashFcn, EqualTo>::insert_copy;
  116. using string_hash<ui32, HashFcn, EqualTo>::clear_hash;
  117. super_atomizer() {
  118. order.reserve(HASH_SIZE_DEFAULT);
  119. }
  120. super_atomizer(size_type hash_size, pool_size_type pool_size)
  121. : string_hash<ui32, HashFcn, EqualTo>(hash_size, pool_size)
  122. {
  123. order.reserve(hash_size);
  124. }
  125. ~super_atomizer() = default;
  126. ui32 string_to_atom(const char* key, const T* atom_data = NULL) {
  127. const char* old_begin = pool.Begin();
  128. const char* old_end = pool.End();
  129. std::pair<iterator, bool> ins = insert_copy(key, ui32(size() + 1));
  130. if (ins.second) { // new?
  131. if (pool.Begin() != old_begin) // repoint?
  132. for (typename TOrder::iterator ptr = order.begin(); ptr != order.end(); ++ptr)
  133. if (old_begin <= (*ptr).first && (*ptr).first < old_end) // from old pool?
  134. (*ptr).first += pool.Begin() - old_begin;
  135. order.push_back(std::pair<const char*, T>((*ins.first).first, atom_data ? *atom_data : T()));
  136. }
  137. return (*ins.first).second;
  138. }
  139. ui32 perm_string_to_atom(const char* key, const T* atom_data = NULL) {
  140. value_type val(key, ui32(size() + 1));
  141. std::pair<iterator, bool> ins = this->insert(val);
  142. if (ins.second)
  143. order.push_back(std::pair<const char*, T>((*ins.first).first, atom_data ? *atom_data : T()));
  144. return (*ins.first).second; // == size()+1
  145. }
  146. ui32 find_atom(const char* key) const {
  147. const_iterator it = find(key);
  148. if (it == end())
  149. return 0; // INVALID_ATOM
  150. else
  151. return (*it).second;
  152. }
  153. const char* get_atom_name(ui32 atom) const {
  154. if (atom && atom <= size())
  155. return order[atom - 1].first;
  156. return nullptr;
  157. }
  158. const T* get_atom_data(ui32 atom) const {
  159. if (atom && atom <= size())
  160. return &order[atom - 1].second;
  161. return NULL;
  162. }
  163. T* get_atom_data(ui32 atom) {
  164. if (atom && atom <= size())
  165. return &order[atom - 1].second;
  166. return NULL;
  167. }
  168. o_iterator o_begin() {
  169. return order.begin();
  170. }
  171. o_iterator o_end() {
  172. return order.end();
  173. }
  174. o_const_iterator o_begin() const {
  175. return order.begin();
  176. }
  177. o_const_iterator o_end() const {
  178. return order.end();
  179. }
  180. void clear_atomizer() {
  181. clear_hash();
  182. order.clear();
  183. }
  184. };