// tokenize_unit.cpp
/*
 * Tencent is pleased to support the open source community by making wwsearch
 * available.
 *
 * Copyright (C) 2018-present Tencent. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * https://opensource.org/licenses/Apache-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 */
#include <gtest/gtest.h>

#include <cstdint>
#include <set>
#include <string>
#include <vector>

#include "checked.h"
#include "include/index_wrapper.h"
#include "include/tokenizer_mmseg.h"
#include "include/utf8_suffixbuilder.h"
#include "unchecked.h"
#include "unittest_util.h"
  25. namespace wwsearch {
  26. class TokenizeTest : public ::testing::Test {
  27. public:
  28. static DefaultIndexWrapper *index;
  29. static uint64_t document_id;
  30. wwsearch::TableID table;
  31. std::vector<DocumentUpdater *> documents;
  32. public:
  33. TokenizeTest() {
  34. table.business_type = 1;
  35. table.partition_set = 1;
  36. }
  37. static void SetUpTestCase() {
  38. index = new DefaultIndexWrapper();
  39. index->DBParams().path =
  40. std::string("/tmp/unit_") + std::string("tokenize");
  41. index->Config().SetLogLevel(g_debug ? wwsearch::kSearchLogLevelDebug
  42. : wwsearch::kSearchLogLevelError);
  43. auto status = index->Open(g_use_rocksdb, g_use_compression);
  44. ASSERT_TRUE(status.GetCode() == 0);
  45. }
  46. static void TearDownTestCase() {
  47. if (index != nullptr) {
  48. auto status = index->vdb_->DropDB();
  49. EXPECT_EQ(0, status.GetCode());
  50. delete index;
  51. index = nullptr;
  52. }
  53. }
  54. virtual void SetUp() override { table.partition_set++; }
  55. virtual void TearDown() override {
  56. for (auto du : documents) {
  57. delete du;
  58. }
  59. documents.clear();
  60. }
  61. uint64_t GetDocumentID() { return document_id++; }
  62. private:
  63. };
  64. DefaultIndexWrapper *TokenizeTest::index = nullptr;
  65. DocumentID TokenizeTest::document_id = 1;
  66. TEST_F(TokenizeTest, AddOneDocument) {
  67. const char *chinese = "我是中国人,我爱中国。。。This is english。我爱中国";
  68. const char *dict = ".";
  69. SearchLogDebug("dict read check %s/uni.lib\n", dict);
  70. SearchLogDebug("str:%s\n", chinese);
  71. wwsearch::TokenizerMMSEG tokenize(dict, 2);
  72. auto document = TestUtil::NewDocument(GetDocumentID(), chinese, 1, 1, 2);
  73. tokenize.Do(document->New());
  74. auto field = document->New().FindField(1);
  75. for (auto term : field->Terms()) {
  76. SearchLogDebug("term:%s\n", term.c_str());
  77. }
  78. }
  79. TEST_F(TokenizeTest, AddOneDocumentAndMatch) {
  80. const std::string doc_text{
  81. " \303\250\306\222\302\241\303\244\302\270\342\200\223\303\246\302\235"
  82. "\302\260 "
  83. "\303\246\305\240\342\202\254\303\246\305\223\302\257\303\245\302\267\302"
  84. "\245\303\247\302\250\342\200\271\303\251\306\222\302\250\303\251\342\200"
  85. "\241\342\200\241\303\250\302\264\302\255\303\247\342\200\235\302\263\303"
  86. "\250\302\257\302\267 "
  87. "\303\250\306\222\302\241\303\244\302\270\342\200\223\303\246\302\235\302"
  88. "\260\303\247\305\241\342\200\236\303\246\305\240\342\202\254\303\246\305"
  89. "\223\302\257\303\245\302\267\302\245\303\247\302\250\342\200\271\303\251"
  90. "\306\222\302\250\303\251\342\200\241\342\200\241\303\250\302\264\302\255"
  91. "\303\247\342\200\235\302\263\303\250\302\257\302\267 "
  92. "\303\250\306\222\302\241\303\244\302\270\342\200\223\303\246\302\235\302"
  93. "\260\'s\303\246\305\240\342\202\254\303\246\305\223\302\257\303\245\302"
  94. "\267\302\245\303\247\302\250\342\200\271\303\251\306\222\302\250\303\251"
  95. "\342\200\241\342\200\241\303\250\302\264\302\255\303\247\342\200\235\302"
  96. "\263\303\250\302\257\302\267 "
  97. "\303\251\305\223\342\202\254\303\246\302\261\342\200\232\303\251\306\222"
  98. "\302\250\303\251\342\200\224\302\250 "
  99. "\303\245\302\267\302\245\303\247\302\250\342\200\271\303\251\306\222\302"
  100. "\250 "
  101. "\303\247\342\200\235\302\263\303\250\302\257\302\267\303\246\342\200\224"
  102. "\302\245\303\246\305\223\305\270 "
  103. "\303\251\342\200\241\342\200\241\303\250\302\264\302\255\303\246\313\234"
  104. "\305\275\303\247\302\273\342\200\240 "
  105. "\303\251\302\242\342\200\236\303\247\302\256\342\200\224\303\245\302\215"
  106. "\342\200\242\303\245\302\217\302\267 YS-XM-201812-020 "
  107. "\303\245\305\276\342\200\271\303\245\302\217\302\267\303\246\313\206\342"
  108. "\200\223\303\250\302\247\342\200\236\303\246\302\240\302\274 "
  109. "\303\246\342\204\242\302\256\303\251\342\202\254\305\241\303\247\302\201"
  110. "\302\257\303\245\302\270\302\246\303\251\342\200\241\342\200\241\303\250"
  111. "\302\264\302\255 "
  112. "\303\245\302\220\313\206\303\245\302\220\305\222\303\245\302\217\302\267"
  113. " HT-2018-12-020 "
  114. "\303\246\342\200\242\302\260\303\251\342\200\241\302\217 "
  115. "\303\245\302\215\342\200\242\303\244\302\273\302\267 "
  116. "\303\251\342\200\241\342\200\230\303\251\302\242\302\235 "
  117. "\303\247\342\200\235\302\250\303\251\342\202\254\342\200\235 "
  118. "971\303\246\342\204\242\302\256\303\251\342\202\254\305\241\303\247\302"
  119. "\201\302\257\303\245\302\270\302\246\303\251\342\200\241\342\200\241\303"
  120. "\250\302\264\302\255\303\244\302\270\342\200\271\303\245\302\215\342\200"
  121. "\242 "
  122. "\303\245\313\206\302\260\303\250\302\264\302\247\303\246\342\200\224\302"
  123. "\245\303\246\305\223\305\270 "
  124. "\303\245\302\272\342\200\234\303\245\302\255\313\234\303\246\342\200\242"
  125. "\302\260\303\251\342\200\241\302\217 "
  126. "\303\247\342\200\235\302\263\303\250\302\257\302\267\303\244\302\272\302"
  127. "\272 "
  128. "\303\250\306\222\302\241\303\244\302\270\342\200\223\303\246\302\235\302"
  129. "\260 \303\251\342\204\242\342\200\236\303\244\302\273\302\266"};
  130. const char *dict = ".";
  131. wwsearch::TokenizerMMSEG tokenize(dict, 2);
  132. auto document = TestUtil::NewDocument(GetDocumentID(), doc_text, 1, 1, 2);
  133. tokenize.Do(document->New());
  134. auto field = document->New().FindField(1);
  135. for (auto term : field->Terms()) {
  136. SearchLogDebug("term:%s\n", term.c_str());
  137. }
  138. {
  139. const std::string match_txt{"HT2018-12-020"};
  140. std::set<std::string> terms;
  141. EXPECT_TRUE(
  142. tokenize.BuildTerms(match_txt.c_str(), match_txt.size(), terms));
  143. SearchLogDebug("Search for match_txt (%s):\n", match_txt.c_str());
  144. for (const auto &term : terms) {
  145. SearchLogDebug("term:%s\n", term.c_str());
  146. }
  147. }
  148. {
  149. const std::string match_txt{"HT2018-12-020971"};
  150. std::set<std::string> terms;
  151. EXPECT_TRUE(
  152. tokenize.BuildTerms(match_txt.c_str(), match_txt.size(), terms));
  153. SearchLogDebug("Search for match_txt (%s):\n", match_txt.c_str());
  154. for (const auto &term : terms) {
  155. SearchLogDebug("term:%s\n", term.c_str());
  156. }
  157. }
  158. {
  159. const std::string match_txt{"12mm厚"};
  160. std::set<std::string> terms;
  161. EXPECT_TRUE(
  162. tokenize.BuildTerms(match_txt.c_str(), match_txt.size(), terms));
  163. SearchLogDebug("Search for match_txt (%s):\n", match_txt.c_str());
  164. for (const auto &term : terms) {
  165. SearchLogDebug("term:%s\n", term.c_str());
  166. }
  167. }
  168. {
  169. const std::string match_txt{"保存时间"};
  170. std::set<std::string> terms;
  171. EXPECT_TRUE(
  172. tokenize.BuildTerms(match_txt.c_str(), match_txt.size(), terms));
  173. SearchLogDebug("Search for match_txt (%s):\n", match_txt.c_str());
  174. for (const auto &term : terms) {
  175. SearchLogDebug("term:%s\n", term.c_str());
  176. }
  177. }
  178. {
  179. const std::string match_txt{"回库报告"};
  180. std::set<std::string> terms;
  181. EXPECT_TRUE(
  182. tokenize.BuildTerms(match_txt.c_str(), match_txt.size(), terms));
  183. SearchLogDebug("Search for match_txt (%s):\n", match_txt.c_str());
  184. for (const auto &term : terms) {
  185. SearchLogDebug("term:%s\n", term.c_str());
  186. }
  187. }
  188. {
  189. const std::string match_txt{"出库报告"};
  190. std::set<std::string> terms;
  191. EXPECT_TRUE(
  192. tokenize.BuildTerms(match_txt.c_str(), match_txt.size(), terms));
  193. SearchLogDebug("Search for match_txt (%s):\n", match_txt.c_str());
  194. for (const auto &term : terms) {
  195. SearchLogDebug("term:%s\n", term.c_str());
  196. }
  197. }
  198. {
  199. const std::string match_txt{"入库报告"};
  200. std::set<std::string> terms;
  201. EXPECT_TRUE(
  202. tokenize.BuildTerms(match_txt.c_str(), match_txt.size(), terms));
  203. SearchLogDebug("Search for match_txt (%s):\n", match_txt.c_str());
  204. for (const auto &term : terms) {
  205. SearchLogDebug("term:%s\n", term.c_str());
  206. }
  207. }
  208. {
  209. const std::string match_txt{"进库报告"};
  210. std::set<std::string> terms;
  211. EXPECT_TRUE(
  212. tokenize.BuildTerms(match_txt.c_str(), match_txt.size(), terms));
  213. SearchLogDebug("Search for match_txt (%s):\n", match_txt.c_str());
  214. for (const auto &term : terms) {
  215. SearchLogDebug("term:%s\n", term.c_str());
  216. }
  217. }
  218. }
  219. TEST_F(TokenizeTest, UTF8) {
  220. std::string chinese("我是中国人,我爱中国,....。。。。");
  221. bool error = false;
  222. char *it = (char *)chinese.c_str();
  223. char *begin = it;
  224. SearchLogDebug("chinese:%s\n", chinese.c_str());
  225. while (!error) {
  226. try {
  227. utf8::next(it, (char *)(begin + chinese.size()));
  228. } catch (...) {
  229. error = true;
  230. break;
  231. }
  232. SearchLogDebug(":%s\n", it);
  233. }
  234. }
  235. TEST(UTF8SuffixBuilderTest, UTF8SuffixBuilder) {
  236. std::string keywords{"helloworld"};
  237. UTF8SuffixBuilder builder(keywords.c_str(), keywords.size(), 5);
  238. SearchLogDebug("UTF8SuffixBuilder keywords(%s)output :\n", keywords.c_str());
  239. do {
  240. std::string term(builder.Term(), builder.TermSize());
  241. SearchLogDebug("%s,", term.c_str());
  242. } while (builder.Next());
  243. SearchLogDebug("\n");
  244. }
  245. } // namespace wwsearch