SegmentPkg.h 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  1. /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
  2. /* ***** BEGIN LICENSE BLOCK *****
  3. * Version: GPL 2.0
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License. You should have
  7. * received a copy of the GPL license along with this program; if you
  8. * did not, you can find it at http://www.gnu.org/
  9. *
  10. * Software distributed under the License is distributed on an "AS IS" basis,
  11. * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  12. * for the specific language governing rights and limitations under the
  13. * License.
  14. *
  15. * The Original Code is Coreseek.com code.
  16. *
  17. * Copyright (C) 2007-2008. All Rights Reserved.
  18. *
  19. * Author:
  20. * Li monan <li.monan@gmail.com>
  21. *
  22. * ***** END LICENSE BLOCK ***** */
  23. #ifndef css_SegmentPkg_h
  24. #define css_SegmentPkg_h
  25. #include <vector>
  26. #define HAVE_ATEXIT
  27. #include "Singleton.h"
  28. #include "csr_typedefs.h"
  29. namespace css {
  30. /*
  31. - find char-class
  32. - tolower case(optional, used in search.).
  33. */
  34. class ChineseCharTaggerImpl {
  35. public:
  36. ChineseCharTaggerImpl() { init(); }
  37. ~ChineseCharTaggerImpl() {
  38. for (int i = 1; i < 256; i++) {
  39. if (i == 0x23) continue;
  40. if (index_map[i]) delete[] index_map[i];
  41. }
  42. };
  43. u2 tagUnicode(u2 iCode, u1 length);
  44. protected:
  45. void init();
  46. // We reduced the map. only number-char page is exist
  47. // char cjk_map[20736]; // 256*(9f-4e) = 21k
  48. u1* index_map[256];
  49. u1 ansi_map[256];
  50. // char sym_map[512]; // 0x3000 - 0x303F && 0xFF??
  51. };
  52. typedef CSR_Singleton<ChineseCharTaggerImpl> ChineseCharTagger;
  53. #include "tolowercase.h"
  54. /*To lower
  55. */
  56. class ToLowerImpl {
  57. public:
  58. ToLowerImpl(){};
  59. inline u2 toLower(u2 k) {
  60. u1 idx = k >> 8;
  61. u2 iCode = k;
  62. if (table_index[idx]) iCode = table_index[idx][k & 0xFF];
  63. if (iCode) return iCode;
  64. return k;
  65. }
  66. };
  67. typedef CSR_Singleton<ToLowerImpl> ToLower;
  68. class SegmentPkg {
  69. public:
  70. SegmentPkg();
  71. ~SegmentPkg();
  72. void init();
  73. public:
  74. const char* m_buf; // make the hole object less than 64k
  75. u1* m_tag;
  76. int m_length; // used length
  77. u1 m_Own;
  78. int m_size; // total length
  79. int m_used;
  80. u1 m_remains_bytes;
  81. std::vector<int> m_wTagList; // the seps position.
  82. ChineseCharTaggerImpl* m_tagger;
  83. public:
  84. /**
  85. @return 0, appended.
  86. @return -1, too large
  87. NOTE: a newly created pkg always return 0. except not enough memory.(throw
  88. std::bad_alloc)
  89. */
  90. int feedData(const char* buf, int length);
  91. int tagData(const char* buf, int length);
  92. void setSize(int length);
  93. public:
  94. /**
  95. * read UTF-8 input can tagger the char-pos in tag array. tag length must equal
  96. * or larger than buf.
  97. * we assume buf is end with '\0'
  98. * and this function will changed m_wTagList as a side effect.
  99. * @return, the data remains untagged. must less than 3.
  100. */
  101. int tagData(const char* buf, u1* tag, int length = 0, int offset = 0);
  102. protected:
  103. const static int DEFAULT_PACKAGE_LENGTH = 65400;
  104. };
  105. } /* End of namespace css */
  106. #endif