tokenizer.h 1.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142
  1. /*
  2. * Tencent is pleased to support the open source community by making wwsearch
  3. * available.
  4. *
  5. * Copyright (C) 2018-present Tencent. All Rights Reserved.
  6. *
  7. * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  8. * use this file except in compliance with the License. You may obtain a copy of
  9. * the License at
  10. *
  11. * https://opensource.org/licenses/Apache-2.0
  12. *
  13. * Unless required by applicable law or agreed to in writing, software
  14. * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  15. * WARRANTIES OF ANY KIND, either express or implied. See the License for the
  16. * specific language governing permissions and limitations under the License.
  17. */
  18. #pragma once
  19. #include "document.h"
  20. #include "header.h"
  21. namespace wwsearch {
  22. class Tokenizer {
  23. private:
  24. public:
  25. virtual ~Tokenizer() {}
  26. // Do works for tokenize document for index.
  27. virtual bool Do(wwsearch::Document &document) = 0;
  28. // BuildTerms works for tokenize terms for query.
  29. virtual bool BuildTerms(const char *buffer, size_t buffer_size,
  30. std::set<std::string> &terms,
  31. bool no_covert_to_lower_case = false) = 0;
  32. virtual void ToLowerCase(std::string &old) = 0;
  33. private:
  34. };
  35. } // namespace wwsearch