// tokenize_unit.cpp
/*
 * Tencent is pleased to support the open source community by making wwsearch
 * available.
 *
 * Copyright (C) 2018-present Tencent. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * https://opensource.org/licenses/Apache-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 */
#include <gtest/gtest.h>

#include <cstdint>
#include <set>
#include <string>
#include <vector>

#include "checked.h"
#include "include/index_wrapper.h"
#include "include/tokenizer_mmseg.h"
#include "include/utf8_suffixbuilder.h"
#include "unchecked.h"
#include "unittest_util.h"
  25. namespace wwsearch {
  26. class TokenizeTest : public ::testing::Test {
  27. public:
  28. static DefaultIndexWrapper *index;
  29. static uint64_t document_id;
  30. wwsearch::TableID table;
  31. std::vector<DocumentUpdater *> documents;
  32. public:
  33. TokenizeTest() {
  34. table.business_type = 1;
  35. table.partition_set = 1;
  36. }
  37. static void SetUpTestCase() {
  38. index = new DefaultIndexWrapper();
  39. index->DBParams().path =
  40. std::string("/tmp/unit_") + std::string("tokenize");
  41. index->Config().SetLogLevel(g_debug ? wwsearch::kSearchLogLevelDebug
  42. : wwsearch::kSearchLogLevelError);
  43. auto status = index->Open(g_use_rocksdb, g_use_compression);
  44. ASSERT_TRUE(status.GetCode() == 0);
  45. }
  46. static void TearDownTestCase() {
  47. if (index != nullptr) {
  48. auto status = index->vdb_->DropDB();
  49. EXPECT_EQ(0, status.GetCode());
  50. delete index;
  51. index = nullptr;
  52. }
  53. }
  54. virtual void SetUp() override { table.partition_set++; }
  55. virtual void TearDown() override {
  56. for (auto du : documents) {
  57. delete du;
  58. }
  59. documents.clear();
  60. }
  61. uint64_t GetDocumentID() { return document_id++; }
  62. private:
  63. };
  64. DefaultIndexWrapper *TokenizeTest::index = nullptr;
  65. DocumentID TokenizeTest::document_id = 1;
  66. TEST_F(TokenizeTest, AddOneDocument) {
  67. const char *chinese = "我是中国人,我爱中国。。。This is english。我爱中国";
  68. const char *dict = ".";
  69. SearchLogDebug("dict read check %s/uni.lib\n", dict);
  70. SearchLogDebug("str:%s\n", chinese);
  71. wwsearch::TokenizerMMSEG tokenize(dict, 2);
  72. auto document = TestUtil::NewDocument(GetDocumentID(), chinese, 1, 1, 2);
  73. tokenize.Do(document->New());
  74. auto field = document->New().FindField(1);
  75. for (auto term : field->Terms()) {
  76. SearchLogDebug("term:%s\n", term.c_str());
  77. }
  78. }
  79. TEST_F(TokenizeTest, AddOneDocumentAndMatch) {
  80. const std::string doc_text{
  81. " \303\250\306\222\302\241\303\244\302\270\342\200\223\303\246\302\235"
  82. "\302\260 "
  83. "\303\246\305\240\342\202\254\303\246\305\223\302\257\303\245\302\267\302"
  84. "\245\303\247\302\250\342\200\271\303\251\306\222\302\250\303\251\342\200"
  85. "\241\342\200\241\303\250\302\264\302\255\303\247\342\200\235\302\263\303"
  86. "\250\302\257\302\267 "
  87. "\303\250\306\222\302\241\303\244\302\270\342\200\223\303\246\302\235\302"
  88. "\260\303\247\305\241\342\200\236\303\246\305\240\342\202\254\303\246\305"
  89. "\223\302\257\303\245\302\267\302\245\303\247\302\250\342\200\271\303\251"
  90. "\306\222\302\250\303\251\342\200\241\342\200\241\303\250\302\264\302\255"
  91. "\303\247\342\200\235\302\263\303\250\302\257\302\267 "
  92. "\303\250\306\222\302\241\303\244\302\270\342\200\223\303\246\302\235\302"
  93. "\260\'s\303\246\305\240\342\202\254\303\246\305\223\302\257\303\245\302"
  94. "\267\302\245\303\247\302\250\342\200\271\303\251\306\222\302\250\303\251"
  95. "\342\200\241\342\200\241\303\250\302\264\302\255\303\247\342\200\235\302"
  96. "\263\303\250\302\257\302\267 "
  97. "\303\251\305\223\342\202\254\303\246\302\261\342\200\232\303\251\306\222"
  98. "\302\250\303\251\342\200\224\302\250 "
  99. "\303\245\302\267\302\245\303\247\302\250\342\200\271\303\251\306\222\302"
  100. "\250 "
  101. "\303\247\342\200\235\302\263\303\250\302\257\302\267\303\246\342\200\224"
  102. "\302\245\303\246\305\223\305\270 "
  103. "\303\251\342\200\241\342\200\241\303\250\302\264\302\255\303\246\313\234"
  104. "\305\275\303\247\302\273\342\200\240 "
  105. "\303\251\302\242\342\200\236\303\247\302\256\342\200\224\303\245\302\215"
  106. "\342\200\242\303\245\302\217\302\267 YS-XM-201812-020 "
  107. "\303\245\305\276\342\200\271\303\245\302\217\302\267\303\246\313\206\342"
  108. "\200\223\303\250\302\247\342\200\236\303\246\302\240\302\274 "
  109. "\303\246\342\204\242\302\256\303\251\342\202\254\305\241\303\247\302\201"
  110. "\302\257\303\245\302\270\302\246\303\251\342\200\241\342\200\241\303\250"
  111. "\302\264\302\255 "
  112. "\303\245\302\220\313\206\303\245\302\220\305\222\303\245\302\217\302\267"
  113. " HT-2018-12-020 "
  114. "\303\246\342\200\242\302\260\303\251\342\200\241\302\217 "
  115. "\303\245\302\215\342\200\242\303\244\302\273\302\267 "
  116. "\303\251\342\200\241\342\200\230\303\251\302\242\302\235 "
  117. "\303\247\342\200\235\302\250\303\251\342\202\254\342\200\235 "
  118. "971\303\246\342\204\242\302\256\303\251\342\202\254\305\241\303\247\302"
  119. "\201\302\257\303\245\302\270\302\246\303\251\342\200\241\342\200\241\303"
  120. "\250\302\264\302\255\303\244\302\270\342\200\271\303\245\302\215\342\200"
  121. "\242 "
  122. "\303\245\313\206\302\260\303\250\302\264\302\247\303\246\342\200\224\302"
  123. "\245\303\246\305\223\305\270 "
  124. "\303\245\302\272\342\200\234\303\245\302\255\313\234\303\246\342\200\242"
  125. "\302\260\303\251\342\200\241\302\217 "
  126. "\303\247\342\200\235\302\263\303\250\302\257\302\267\303\244\302\272\302"
  127. "\272 "
  128. "\303\250\306\222\302\241\303\244\302\270\342\200\223\303\246\302\235\302"
  129. "\260 \303\251\342\204\242\342\200\236\303\244\302\273\302\266"};
  130. const char *dict = ".";
  131. wwsearch::TokenizerMMSEG tokenize(dict, 2);
  132. auto document = TestUtil::NewDocument(GetDocumentID(), doc_text, 1, 1, 2);
  133. tokenize.Do(document->New());
  134. auto field = document->New().FindField(1);
  135. for (auto term : field->Terms()) {
  136. SearchLogDebug("term:%s\n", term.c_str());
  137. }
  138. {
  139. const std::string match_txt{"HT2018-12-020"};
  140. std::set<std::string> terms;
  141. EXPECT_TRUE(
  142. tokenize.BuildTerms(match_txt.c_str(), match_txt.size(), terms));
  143. SearchLogDebug("Search for match_txt (%s):\n", match_txt.c_str());
  144. for (const auto &term : terms) {
  145. SearchLogDebug("term:%s\n", term.c_str());
  146. }
  147. }
  148. {
  149. const std::string match_txt{"HT2018-12-020971"};
  150. std::set<std::string> terms;
  151. EXPECT_TRUE(
  152. tokenize.BuildTerms(match_txt.c_str(), match_txt.size(), terms));
  153. SearchLogDebug("Search for match_txt (%s):\n", match_txt.c_str());
  154. for (const auto &term : terms) {
  155. SearchLogDebug("term:%s\n", term.c_str());
  156. }
  157. }
  158. {
  159. const std::string match_txt{"12mm厚"};
  160. std::set<std::string> terms;
  161. EXPECT_TRUE(
  162. tokenize.BuildTerms(match_txt.c_str(), match_txt.size(), terms));
  163. SearchLogDebug("Search for match_txt (%s):\n", match_txt.c_str());
  164. for (const auto &term : terms) {
  165. SearchLogDebug("term:%s\n", term.c_str());
  166. }
  167. }
  168. {
  169. const std::string match_txt{"保存时间"};
  170. std::set<std::string> terms;
  171. EXPECT_TRUE(
  172. tokenize.BuildTerms(match_txt.c_str(), match_txt.size(), terms));
  173. SearchLogDebug("Search for match_txt (%s):\n", match_txt.c_str());
  174. for (const auto &term : terms) {
  175. SearchLogDebug("term:%s\n", term.c_str());
  176. }
  177. }
  178. {
  179. const std::string match_txt{"回库报告"};
  180. std::set<std::string> terms;
  181. EXPECT_TRUE(
  182. tokenize.BuildTerms(match_txt.c_str(), match_txt.size(), terms));
  183. SearchLogDebug("Search for match_txt (%s):\n", match_txt.c_str());
  184. for (const auto &term : terms) {
  185. SearchLogDebug("term:%s\n", term.c_str());
  186. }
  187. }
  188. {
  189. const std::string match_txt{"出库报告"};
  190. std::set<std::string> terms;
  191. EXPECT_TRUE(
  192. tokenize.BuildTerms(match_txt.c_str(), match_txt.size(), terms));
  193. SearchLogDebug("Search for match_txt (%s):\n", match_txt.c_str());
  194. for (const auto &term : terms) {
  195. SearchLogDebug("term:%s\n", term.c_str());
  196. }
  197. }
  198. {
  199. const std::string match_txt{"入库报告"};
  200. std::set<std::string> terms;
  201. EXPECT_TRUE(
  202. tokenize.BuildTerms(match_txt.c_str(), match_txt.size(), terms));
  203. SearchLogDebug("Search for match_txt (%s):\n", match_txt.c_str());
  204. for (const auto &term : terms) {
  205. SearchLogDebug("term:%s\n", term.c_str());
  206. }
  207. }
  208. {
  209. const std::string match_txt{"进库报告"};
  210. std::set<std::string> terms;
  211. EXPECT_TRUE(
  212. tokenize.BuildTerms(match_txt.c_str(), match_txt.size(), terms));
  213. SearchLogDebug("Search for match_txt (%s):\n", match_txt.c_str());
  214. for (const auto &term : terms) {
  215. SearchLogDebug("term:%s\n", term.c_str());
  216. }
  217. }
  218. }
  219. TEST_F(TokenizeTest, UTF8) {
  220. std::string chinese("我是中国人,我爱中国,....。。。。");
  221. bool error = false;
  222. char *it = (char *)chinese.c_str();
  223. char *begin = it;
  224. SearchLogDebug("chinese:%s\n", chinese.c_str());
  225. while (!error) {
  226. try {
  227. utf8::next(it, (char *)(begin + chinese.size()));
  228. } catch (...) {
  229. error = true;
  230. break;
  231. }
  232. SearchLogDebug(":%s\n", it);
  233. }
  234. }
  235. TEST(UTF8SuffixBuilderTest, UTF8SuffixBuilder) {
  236. std::string keywords{"helloworld"};
  237. UTF8SuffixBuilder builder(keywords.c_str(), keywords.size(), 5);
  238. SearchLogDebug("UTF8SuffixBuilder keywords(%s)output :\n", keywords.c_str());
  239. do {
  240. std::string term(builder.Term(), builder.TermSize());
  241. SearchLogDebug("%s,", term.c_str());
  242. } while (builder.Next());
  243. SearchLogDebug("\n");
  244. }
  245. } // namespace wwsearch