filter.h 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355
  1. /*
  2. * Tencent is pleased to support the open source community by making wwsearch
  3. * available.
  4. *
  5. * Copyright (C) 2018-present Tencent. All Rights Reserved.
  6. *
  7. * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  8. * use this file except in compliance with the License. You may obtain a copy of
  9. * the License at
  10. *
  11. * https://opensource.org/licenses/Apache-2.0
  12. *
  13. * Unless required by applicable law or agreed to in writing, software
  14. * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  15. * WARRANTIES OF ANY KIND, either express or implied. See the License for the
  16. * specific language governing permissions and limitations under the License.
  17. */
  18. #pragma once
  19. #include <algorithm>
  20. #include <sstream>
  21. #include "header.h"
  22. #include "index_field.h"
  23. #include "logger.h"
  24. #include "utils.h"
  25. namespace wwsearch {
  26. // Basic filter api
  27. class Filter {
  28. protected:
  29. IndexField field_;
  30. public:
  31. virtual ~Filter() {}
  32. // Must Get it and set
  33. IndexField *GetField() { return &field_; }
  34. // match this filter ?
  35. virtual bool Match(const IndexField *field) = 0;
  36. inline FieldID GetFieldID() { return field_.ID(); }
  37. virtual std::string PrintReadableStr() {
  38. return std::string("not implementation");
  39. };
  40. // If field match uint32 or uint64 field.
  41. static bool CheckFieldTypeNumeric(const IndexField &field) {
  42. if (field.FieldType() == kUint32IndexField ||
  43. field.FieldType() == kUint64IndexField) {
  44. return true;
  45. }
  46. return false;
  47. };
  48. // If field match string field.
  49. static bool CheckFieldTypeString(const IndexField &field) {
  50. if (field.FieldType() == kStringIndexField) {
  51. return true;
  52. }
  53. return false;
  54. };
  55. private:
  56. };
  57. // Equal api ,spport numeric and string.
  58. class EqualFilter : public Filter {
  59. private:
  60. public:
  61. virtual ~EqualFilter() {}
  62. virtual bool Match(const IndexField *field) override {
  63. // if no field stored,just filter it,because we do not know
  64. if (nullptr == field) return false;
  65. assert(field->ID() == field_.ID());
  66. if (CheckFieldTypeNumeric(*field) && CheckFieldTypeNumeric(field_)) {
  67. return field_.NumericValue() == field->NumericValue();
  68. } else if (CheckFieldTypeString(*field) && CheckFieldTypeString(field_)) {
  69. return field_.StringValue() == field->StringValue();
  70. }
  71. // type not match.skip
  72. return false;
  73. }
  74. virtual std::string PrintReadableStr() {
  75. char buffer[64];
  76. snprintf(buffer, sizeof(buffer), "field=%u equal [%llu]", field_.ID(),
  77. field_.NumericValue());
  78. return std::string(buffer);
  79. }
  80. private:
  81. };
  82. // Not equal filter,support numeric and string field.
  83. class NotEqualFilter : public Filter {
  84. private:
  85. public:
  86. virtual ~NotEqualFilter() {}
  87. virtual bool Match(const IndexField *field) override {
  88. // if no field stored,just filter it,because we do not know
  89. if (nullptr == field) return false;
  90. assert(field->ID() == field_.ID());
  91. if (CheckFieldTypeNumeric(*field) && CheckFieldTypeNumeric(field_)) {
  92. return field_.NumericValue() != field->NumericValue();
  93. } else if (CheckFieldTypeString(*field) && CheckFieldTypeString(field_)) {
  94. return field_.StringValue() != field->StringValue();
  95. }
  96. // type not match,skip
  97. return false;
  98. }
  99. virtual std::string PrintReadableStr() {
  100. char buffer[64];
  101. snprintf(buffer, sizeof(buffer), "field=%u not equal [%llu]", field_.ID(),
  102. field_.NumericValue());
  103. return std::string(buffer);
  104. }
  105. private:
  106. };
  107. // Range field, only support numeric field.
  108. class RangeFilter : public Filter {
  109. private:
  110. uint64_t begin_;
  111. uint64_t end_;
  112. public:
  113. RangeFilter(uint64_t begin, uint64_t end) : begin_(begin), end_(end) {}
  114. virtual ~RangeFilter() {}
  115. virtual bool Match(const IndexField *field) override {
  116. // if no field stored,just filter it,because we do not know
  117. if (nullptr == field) return false;
  118. assert(field->ID() == field_.ID());
  119. if (!CheckFieldTypeNumeric(*field) /*|| !CheckFieldTypeNumeric(field_)*/) {
  120. return false;
  121. }
  122. return field->NumericValue() >= begin_ && field->NumericValue() <= end_;
  123. }
  124. virtual std::string PrintReadableStr() {
  125. char buffer[64];
  126. snprintf(buffer, sizeof(buffer), "field=%u in range [%llu,%llu]",
  127. field_.ID(), begin_, end_);
  128. return std::string(buffer);
  129. }
  130. private:
  131. };
  132. // If one value match the field?
  133. class InFilter : public Filter {
  134. private:
  135. public:
  136. virtual ~InFilter() {}
  137. virtual bool Match(const IndexField *field) override {
  138. // if no field stored,just filter it,because we do not know
  139. if (nullptr == field) return false;
  140. assert(field->ID() == field_.ID());
  141. // must be string
  142. if (field_.FieldType() == kStringIndexField &&
  143. field->FieldType() == kStringIndexField) {
  144. SearchLogDebug("doc field=%s, query filter=%s",
  145. field_.StringValue().c_str(),
  146. field->StringValue().c_str());
  147. return field->StringValue().find(field_.StringValue()) !=
  148. std::string::npos;
  149. } else {
  150. return false;
  151. }
  152. }
  153. virtual std::string PrintReadableStr() {
  154. char buffer[64];
  155. snprintf(buffer, sizeof(buffer), "field=%u in [%s]", field_.ID(),
  156. field_.StringValue().c_str());
  157. return std::string(buffer);
  158. }
  159. private:
  160. };
  161. // If no one value match the field?
  162. class NotInFilter : public Filter {
  163. private:
  164. public:
  165. virtual ~NotInFilter() {}
  166. virtual bool Match(const IndexField *field) override {
  167. // if no field stored,just filter it,because we do not know
  168. if (nullptr == field) return false;
  169. assert(field->ID() == field_.ID());
  170. // must be string
  171. if (field_.FieldType() == kStringIndexField &&
  172. field->FieldType() == kStringIndexField) {
  173. return field->StringValue().find(field_.StringValue()) ==
  174. std::string::npos;
  175. } else {
  176. return false;
  177. }
  178. }
  179. virtual std::string PrintReadableStr() {
  180. char buffer[64];
  181. snprintf(buffer, sizeof(buffer), "field=%u not in [%s]", field_.ID(),
  182. field_.StringValue().c_str());
  183. return std::string(buffer);
  184. }
  185. private:
  186. };
  187. // If one numeric value match the field?
  188. class InNumericListFilter : public Filter {
  189. private:
  190. public:
  191. virtual ~InNumericListFilter() {}
  192. virtual bool Match(const IndexField *field) override {
  193. if (nullptr == field) return false;
  194. assert(field->ID() == field_.ID());
  195. if (!CheckFieldTypeNumeric(*field) || !CheckFieldTypeNumeric(field_)) {
  196. return false;
  197. }
  198. const std::vector<uint64_t> &numeric_list = field_.NumericList();
  199. uint64_t numeric_value = field->NumericValue();
  200. auto iter =
  201. std::find(numeric_list.begin(), numeric_list.end(), numeric_value);
  202. return iter != numeric_list.end();
  203. }
  204. virtual std::string PrintReadableStr() {
  205. char buffer[128];
  206. snprintf(buffer, sizeof(buffer), "field=%u in [%s]", field_.ID(),
  207. JoinContainerToString(field_.NumericList(), ";").c_str());
  208. return std::string(buffer);
  209. }
  210. private:
  211. };
  212. // If no one value match the field?
  213. class NotInNumericListFilter : public Filter {
  214. private:
  215. public:
  216. virtual ~NotInNumericListFilter() {}
  217. virtual bool Match(const IndexField *field) override {
  218. if (nullptr == field) return false;
  219. assert(field->ID() == field_.ID());
  220. if (!CheckFieldTypeNumeric(*field) || !CheckFieldTypeNumeric(field_)) {
  221. return false;
  222. }
  223. const std::vector<uint64_t> &numeric_list = field_.NumericList();
  224. uint64_t numeric_value = field->NumericValue();
  225. auto iter =
  226. std::find(numeric_list.begin(), numeric_list.end(), numeric_value);
  227. return iter == numeric_list.end();
  228. }
  229. virtual std::string PrintReadableStr() {
  230. char buffer[128];
  231. snprintf(buffer, sizeof(buffer), "field=%u in [%s]", field_.ID(),
  232. JoinContainerToString(field_.NumericList(), ";").c_str());
  233. return std::string(buffer);
  234. }
  235. private:
  236. };
  237. // If some string match the field?
  238. class MatchStringListFilter : public Filter {
  239. private:
  240. std::vector<std::string> string_value_list_;
  241. bool revert_;
  242. uint32_t min_should_match_filter_values_num_;
  243. public:
  244. MatchStringListFilter(
  245. const std::vector<std::string> &string_list, bool revert = false,
  246. uint32_t min_should_match_filter_values_num = 0xFFFFFFFF)
  247. : string_value_list_(string_list),
  248. revert_(revert),
  249. min_should_match_filter_values_num_(
  250. min_should_match_filter_values_num) {
  251. min_should_match_filter_values_num_ =
  252. (min_should_match_filter_values_num_ > string_value_list_.size())
  253. ? string_value_list_.size()
  254. : min_should_match_filter_values_num_;
  255. }
  256. virtual ~MatchStringListFilter() {}
  257. virtual bool Match(const IndexField *field) override {
  258. if (nullptr == field) return false;
  259. assert(field->ID() == field_.ID());
  260. if (field->FieldType() != kStringIndexField) {
  261. return false;
  262. }
  263. bool match = InnerMatch(field);
  264. return revert_ ? !match : match;
  265. }
  266. virtual std::string PrintReadableStr() {
  267. char buffer[128];
  268. snprintf(buffer, sizeof(buffer), "MatchStringListFilter: size=%u revert=%d",
  269. string_value_list_.size(), revert_);
  270. return std::string(buffer);
  271. }
  272. private:
  273. std::string ParseStringValue(const std::string &field_str_val,
  274. const std::string &string_value) {
  275. if (field_str_val.size() != 4 || string_value.size() % 4 != 0) {
  276. return std::string("WTF???? string value size not 4*x\n");
  277. }
  278. std::ostringstream o;
  279. uint32_t hash_val = *(uint32_t *)field_str_val.c_str();
  280. hash_val = ntohl(hash_val);
  281. o << "field hash_val = " << hash_val << ", ";
  282. for (int i = 0; i < string_value.size(); i += 4) {
  283. std::string sub = string_value.substr(i, 4);
  284. hash_val = *(uint32_t *)sub.c_str();
  285. hash_val = ntohl(hash_val);
  286. o << hash_val << " ";
  287. }
  288. return o.str();
  289. }
  290. bool InnerMatch(const IndexField *field) {
  291. uint32_t match_count = 0;
  292. for (size_t i = 0; i < string_value_list_.size(); i++) {
  293. if (field->StringValue().find(string_value_list_[i]) !=
  294. std::string::npos) {
  295. match_count++;
  296. }
  297. }
  298. return match_count >= min_should_match_filter_values_num_;
  299. }
  300. };
  301. } // namespace wwsearch