index_field.h 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232
  1. /*
  2. * Tencent is pleased to support the open source community by making wwsearch
  3. * available.
  4. *
  5. * Copyright (C) 2018-present Tencent. All Rights Reserved.
  6. *
  7. * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  8. * use this file except in compliance with the License. You may obtain a copy of
  9. * the License at
  10. *
  11. * https://opensource.org/licenses/Apache-2.0
  12. *
  13. * Unless required by applicable law or agreed to in writing, software
  14. * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  15. * WARRANTIES OF ANY KIND, either express or implied. See the License for the
  16. * specific language governing permissions and limitations under the License.
  17. */
  18. #pragma once
  19. #include <sys/epoll.h>
  20. #include <sys/mman.h>
  21. #include <sys/poll.h>
  22. #include <sys/prctl.h>
  23. #include <sys/select.h>
  24. #include <sys/socket.h>
  25. #include <sys/stat.h>
  26. #include <sys/syscall.h>
  27. #include <sys/time.h>
  28. #include <sys/types.h>
  29. #include <arpa/inet.h>
  30. #include <netinet/in.h>
  31. #include <netinet/tcp.h>
  32. #include <assert.h>
  33. #include <errno.h>
  34. #include <fcntl.h>
  35. #include <pthread.h>
  36. #include <sched.h>
  37. #include <signal.h>
  38. #include <stdarg.h>
  39. #include <stdint.h>
  40. #include <stdio.h>
  41. #include <stdlib.h>
  42. #include <string.h>
  43. #include <time.h>
  44. #include <unistd.h>
  45. #include <fstream>
  46. #include <iostream>
  47. #include <list>
  48. #include <map>
  49. #include <queue>
  50. #include <set>
  51. #include <sstream>
  52. #include <stack>
  53. #include <string>
  54. #include <vector>
  55. #include "serialize.h"
  56. #include "storage_type.h"
  57. #include "search_store.pb.h"
  58. namespace wwsearch {
  // Value type carried by an IndexField.
  // Notice: part of the user-facing API -- callers use these enumerators
  // directly when building a document to index.
  enum kIndexFieldType {
    kIndexFieldUnknowType = 0x00,  // zero value; name suggests "type not set"
    kUint32IndexField = 0x01,
    kUint64IndexField = 0x02,
    kStringIndexField = 0x03,
    kMaxIndexFieldType = 0xFF  // largest enumerator value
  };
  // Single-bit masks describing per-field indexing options. Each enumerator
  // occupies its own bit, so values can be OR-ed together; all six fit in one
  // byte.
  // NOTE(review): the meaning of each option is inferred from its name only --
  // confirm against the code that consumes these flags.
  enum kIndexFieldFlag {
    kTokenizeFieldFlag = 0x01,           // bit 0
    kStoreFieldFlag = 0x02,              // bit 1
    kDocValueFieldFlag = 0x04,           // bit 2
    kSuffixBuildFlag = 0x08,             // bit 3
    kInvertIndexFieldFlag = 0x10,        // bit 4
    kNotStoreInvertTermFieldFlag = 0x20  // bit 5
  };
  // Holds the option flags of one index field in a single byte.
  //
  // The Set*/query member pairs are defined out of line.
  // NOTE(review): they presumably set/test the matching kIndexFieldFlag bit in
  // flag_ -- confirm in the implementation file.
  class IndexFieldFlag {
   public:
    // Starts with no flag set.
    IndexFieldFlag() : flag_(0) {}

    // Copying transfers the raw flag byte. Both copy operations are spelled
    // out: the class declares a destructor, so relying on the implicitly
    // generated copy assignment is deprecated, and IndexField::SetMeta
    // assigns IndexFieldFlag objects.
    IndexFieldFlag(const IndexFieldFlag& o) = default;
    IndexFieldFlag& operator=(const IndexFieldFlag& o) = default;

    virtual ~IndexFieldFlag();

    void SetTokenize();
    bool Tokenize() const;

    void SetStoredField();
    bool StoredField() const;

    void SetDocValue();
    bool DocValue() const;

    void SetSuffixBuild();
    bool SuffixBuild() const;

    void SetInvertIndex();
    bool InvertIndex() const;

    void SetNotStoreInvertTerm();
    bool NotStoreInvertTerm() const;

    // Raw access to the underlying flag byte.
    inline unsigned char Flag() const { return this->flag_; }
    inline void SetFlag(unsigned char flag) { this->flag_ = flag; }

   private:
    unsigned char flag_;
  };
  99. // IndexField Construct:
  100. // field_id,flag,field_type,Length+value or value
  101. class IndexField : public SerializeAble {
  102. friend class IndexField;
  103. private:
  104. FieldID field_id_;
  105. IndexFieldFlag field_flag_;
  106. kIndexFieldType field_type_;
  107. uint64_t numeric_value_;
  108. std::string string_value_;
  109. // If we set this to true,wwsearch will not use internal
  110. // tokenizer to segment text.User must set terms.
  111. bool use_outer_segment_terms_; // user have segment terms
  112. std::set<std::string> terms_; // in dictionary sorted order
  113. uint32_t suffix_len_;
  114. // Only for filter
  115. // Only work for uint64_t
  116. std::vector<uint64_t> numeric_list_;
  117. public:
  118. IndexField();
  119. virtual ~IndexField();
  120. IndexField(const IndexField&) = delete;
  121. IndexField& operator=(const IndexField&) = delete;
  122. void CopyFrom(const IndexField& obj);
  123. inline IndexFieldFlag& Flag() { return this->field_flag_; }
  124. inline void SetMeta(const FieldID& field_id, const IndexFieldFlag& flag) {
  125. this->field_id_ = field_id;
  126. this->field_flag_ = flag;
  127. }
  128. void SetUint32(uint32_t value);
  129. void SetUint64(uint64_t value);
  130. void SetString(const std::string& value);
  131. void SetNumericList(const std::vector<uint64_t>& numeric_list);
  132. const std::vector<uint64_t>& NumericList() const {
  133. return this->numeric_list_;
  134. }
  135. // Attention: be carefully
  136. void SetSegmentTerms(std::set<std::string>& terms) {
  137. this->use_outer_segment_terms_ = true;
  138. this->terms_ = terms;
  139. }
  140. bool UseOuterSegmentTerms() { return this->use_outer_segment_terms_; }
  141. void SetSuffixLen(uint32_t suffix_len);
  142. uint32_t SuffixLen() const { return this->suffix_len_; }
  143. virtual bool SerializeToBytes(std::string& buffer, int flag);
  144. virtual bool DeSerializeFromByte(const char* buffer, uint32_t buffer_len);
  145. inline std::set<std::string>& Terms() { return this->terms_; }
  146. inline FieldID ID() const { return this->field_id_; }
  147. inline const kIndexFieldType& FieldType() const { return this->field_type_; }
  148. inline const std::string& StringValue() const { return this->string_value_; }
  149. inline uint64_t NumericValue() const { return this->numeric_value_; }
  150. // 0->only stored field
  151. // 1->only doc value
  152. bool EncodeToStoreField(lsmsearch::StoreIndexField* field, int flag = 0);
  153. bool DecodeFromStoreField(const lsmsearch::StoreIndexField* field);
  154. /*
  155. {
  156. Document[0]
  157. - field=0,flag=1,type=2,term_size=3,value=5/str_value=one two three
  158. - field=0,flag=1,type=2,term_size=3,value=5/str_value=one two three
  159. }
  160. */
  161. void PrintToString(std::string& str) const {
  162. char* buffer = new char[256];
  163. snprintf(buffer, 256, " - field=%u, flag=%d, type=%d, terms_size=%u, ",
  164. field_id_, field_flag_.Flag(), field_type_, terms_.size());
  165. str.append(buffer);
  166. if (field_flag_.StoredField()) {
  167. if (field_type_ == kStringIndexField) {
  168. str.append("str_value=").append(string_value_).append("\n");
  169. } else {
  170. snprintf(buffer, 256, "value:%llu \n", numeric_value_);
  171. str.append(buffer);
  172. }
  173. } else {
  174. str.append("\n");
  175. }
  176. delete buffer;
  177. }
  178. private:
  179. };
  180. } // namespace wwsearch