collector_top.h 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153
  1. /*
  2. * Tencent is pleased to support the open source community by making wwsearch
  3. * available.
  4. *
  5. * Copyright (C) 2018-present Tencent. All Rights Reserved.
  6. *
  7. * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  8. * use this file except in compliance with the License. You may obtain a copy of
  9. * the License at
  10. *
  11. * https://opensource.org/licenses/Apache-2.0
  12. *
  13. * Unless required by applicable law or agreed to in writing, software
  14. * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  15. * WARRANTIES OF ANY KIND, either express or implied. See the License for the
  16. * specific language governing permissions and limitations under the License.
  17. */
  18. #pragma once
  19. #include "collector.h"
  20. #include "filter.h"
  21. #include "post_scorer.h"
  22. #include "sorter.h"
  23. #include "tracer.h"
  24. namespace wwsearch {
  25. class Searcher;
  26. class SearchContext;
  27. using PriorityQueue =
  28. std::priority_queue<Document *, std::vector<Document *>, Sorter>;
  29. using ScorePriorityQueue =
  30. std::priority_queue<Document *, std::vector<Document *>, PostScorer>;
  31. /* Notice : Core class for collect & filter & sort the suitable doc list.
  32. * Main method is `InnerPurge()`. There are three periods:
  33. * 1. Collect : Get doc list from Scorer, all doc id will be passed to
  34. * TopNCollector
  35. * by `void Collect(DocumentID doc, int field_id)`.
  36. *
  37. * 2. Filter : Base class `Filter` has a method `bool Match(const IndexField
  38. * *field)`,
  39. * iterator all vecter<Filter*> to filter all match doc list.
  40. * Filter is based on DocValue column family which store in db.
  41. *
  42. * 3. PostScorer : Score the doc list by ScoreStrategy's list, including
  43. * Complete keywords constructed from query request.
  44. *
  45. * 4. Sort : use PriorityQueue to sort the doc list.
  46. * PriorityQueue with a Sorter compare function, support compare base on
  47. * numeric.
  48. */
  49. class TopNCollector : public Collector {
  50. private:
  51. PriorityQueue topN_docs_;
  52. ScorePriorityQueue score_priority_queue_;
  53. std::vector<SortCondition *> *sorter_;
  54. std::vector<std::shared_ptr<ScoreStrategy>> *score_strategy_list_;
  55. size_t max_score_doc_num_;
  56. size_t score_doc_count_;
  57. bool use_score_strategy_;
  58. std::vector<Document *> buffer_docs_;
  59. TableID table_;
  60. size_t top_n_;
  61. size_t offset_;
  62. size_t limit_;
  63. Searcher *searcher_;
  64. SearchContext *search_context_;
  65. std::vector<Filter *> *filter_;
  66. uint32_t inner_purge_total_docs_count_;
  67. uint32_t min_match_filter_num_;
  68. bool quick_top_n;
  69. wwsearch::SearchTracer *tracer_;
  70. uint32_t *get_match_total_cnt_;
  71. public:
  72. TopNCollector(
  73. TableID table, size_t offset, size_t limit, Searcher *searcher,
  74. SearchContext *search_context, std::vector<Filter *> *filter = nullptr,
  75. std::vector<SortCondition *> *sorter = nullptr,
  76. std::vector<std::shared_ptr<ScoreStrategy>> *score_strategy_list =
  77. nullptr,
  78. size_t max_score_doc_num = SIZE_MAX, uint32_t min_match_filter_num = 0,
  79. wwsearch::SearchTracer *tracer = nullptr,
  80. uint32_t *get_match_total_cnt = nullptr)
  81. : topN_docs_(Sorter(sorter)),
  82. score_priority_queue_(PostScorer()),
  83. sorter_(sorter),
  84. score_strategy_list_(score_strategy_list),
  85. max_score_doc_num_(std::max(offset + limit, max_score_doc_num)),
  86. score_doc_count_(0),
  87. use_score_strategy_(score_strategy_list != nullptr &&
  88. score_strategy_list->size() > 0),
  89. table_(table),
  90. top_n_(offset + limit),
  91. offset_(offset),
  92. limit_(limit),
  93. searcher_(searcher),
  94. search_context_(search_context),
  95. filter_(filter),
  96. inner_purge_total_docs_count_(0),
  97. min_match_filter_num_(min_match_filter_num),
  98. quick_top_n(false),
  99. tracer_(tracer),
  100. get_match_total_cnt_(get_match_total_cnt) {
  101. if (nullptr != filter) {
  102. if (0 == min_match_filter_num_ ||
  103. min_match_filter_num_ > filter->size()) {
  104. min_match_filter_num_ = filter->size();
  105. }
  106. }
  107. quick_top_n = (nullptr == sorter);
  108. if (use_score_strategy_) {
  109. assert(max_score_doc_num_ >= top_n_);
  110. }
  111. }
  112. virtual ~TopNCollector() {
  113. for (auto document : buffer_docs_) {
  114. if (nullptr != document) delete document;
  115. }
  116. buffer_docs_.clear();
  117. while (score_priority_queue_.size() > 0) {
  118. delete score_priority_queue_.top();
  119. score_priority_queue_.pop();
  120. }
  121. while (topN_docs_.size() > 0) {
  122. delete topN_docs_.top();
  123. topN_docs_.pop();
  124. }
  125. }
  126. virtual void Collect(DocumentID doc, int field_id) override;
  127. virtual bool Enough() override;
  128. virtual void Finish() override;
  129. virtual void GetAndClearMatchDocs(std::list<DocumentID> &docs) override;
  130. private:
  131. inline void InnerPurge();
  132. bool MatchFilter(Document *document);
  133. void HandleScorePriorityQueue();
  134. };
  135. } // namespace wwsearch