tokenizer_mmseg.h 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113
  1. /*
  2. * Tencent is pleased to support the open source community by making wwsearch
  3. * available.
  4. *
  5. * Copyright (C) 2018-present Tencent. All Rights Reserved.
  6. *
  7. * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  8. * use this file except in compliance with the License. You may obtain a copy of
  9. * the License at
  10. *
  11. * https://opensource.org/licenses/Apache-2.0
  12. *
  13. * Unless required by applicable law or agreed to in writing, software
  14. * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  15. * WARRANTIES OF ANY KIND, either express or implied. See the License for the
  16. * specific language governing permissions and limitations under the License.
  17. */
  18. #pragma once
  19. #include "header.h"
  20. #include "tokenizer.h"
  21. #ifdef WIN32
  22. #include "bsd_getopt_win.h"
  23. #else
  24. #include "bsd_getopt.h"
  25. #endif
  26. #include "Segmenter.h"
  27. #include "SegmenterManager.h"
  28. #include "SynonymsDict.h"
  29. #include "ThesaurusDict.h"
  30. #include "UnigramCorpusReader.h"
  31. #include "UnigramDict.h"
  32. #include "csr_utils.h"
  33. namespace wwsearch {
  34. class HashLockSupplier {
  35. public:
  36. virtual ~HashLockSupplier() {}
  37. virtual void Lock(uint32_t hash) = 0;
  38. virtual void unlock(uint32_t hash) = 0;
  39. };
  40. class ThreadHashLock : public HashLockSupplier {
  41. private:
  42. std::mutex *locks_;
  43. uint32_t locks_num_;
  44. public:
  45. ThreadHashLock(uint32_t locks_num) {
  46. this->locks_num_ = locks_num;
  47. this->locks_ = new std::mutex[locks_num_];
  48. }
  49. virtual ~ThreadHashLock() {
  50. if (nullptr != locks_) {
  51. delete locks_;
  52. locks_ = nullptr;
  53. }
  54. }
  55. virtual void Lock(uint32_t hash) { locks_[hash % locks_num_].lock(); }
  56. virtual void unlock(uint32_t hash) { locks_[hash % locks_num_].unlock(); }
  57. private:
  58. };
  59. class TokenizerMMSEG : public Tokenizer {
  60. private:
  61. std::vector<css::SegmenterManager *> managers_;
  62. size_t manager_num_;
  63. HashLockSupplier *hash_lock_;
  64. std::atomic<std::uint64_t> seq_;
  65. public:
  66. // Note:
  67. // file: dict_path/uni.lib must exist
  68. // file: dict_path/mmseg.ini must exist
  69. TokenizerMMSEG(const char *dict_path, size_t segmenter_num = 5000) {
  70. hash_lock_ = new ThreadHashLock(segmenter_num);
  71. manager_num_ = segmenter_num;
  72. assert(manager_num_ > 0);
  73. for (size_t i = 0; i < manager_num_; i++) {
  74. css::SegmenterManager *seg = new css::SegmenterManager();
  75. assert(0 == seg->init(dict_path));
  76. managers_.push_back(seg);
  77. }
  78. }
  79. virtual ~TokenizerMMSEG() {
  80. for (auto seg : managers_) delete seg;
  81. delete hash_lock_;
  82. }
  83. virtual bool Do(wwsearch::Document &document) override;
  84. virtual bool BuildTerms(const char *buffer, size_t buffer_size,
  85. std::set<std::string> &terms,
  86. bool no_covert_to_lower_case = false) override;
  87. inline bool IsSkipChar(char c) { return c == ' ' || c == '\t' || c == '\r'; }
  88. virtual void ToLowerCase(std::string &old) override;
  89. private:
  90. };
  91. } // namespace wwsearch
  92. namespace wwsearch {}