Segmenter.h 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176
  1. /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
  2. /* ***** BEGIN LICENSE BLOCK *****
  3. * Version: GPL 2.0
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License. You should have
  7. * received a copy of the GPL license along with this program; if you
  8. * did not, you can find it at http://www.gnu.org/
  9. *
  10. * Software distributed under the License is distributed on an "AS IS" basis,
  11. * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  12. * for the specific language governing rights and limitations under the
  13. * License.
  14. *
  15. * The Original Code is Coreseek.com code.
  16. *
  17. * Copyright (C) 2007-2008. All Rights Reserved.
  18. *
  19. * Author:
  20. * Li monan <li.monan@gmail.com>
  21. *
  22. * ***** END LICENSE BLOCK ***** */
  23. #ifndef css_Segmenter_h
  24. #define css_Segmenter_h
  #include <algorithm>
  #include <cmath>
  #include <fstream>
  #include <iostream>
  #include <vector>
  #ifdef WIN32
  #include <hash_map>
  #else
  #include <ext/hash_map>
  #endif

  #include "SegmentPkg.h"
  #include "UnigramDict.h"
  #include "SynonymsDict.h"
  #include "csr_typedefs.h"
  #include "freelist.h"
  #include "mmthunk.h"
  #include "UnigramCorpusReader.h"
  #include "ThesaurusDict.h"
  42. namespace css {
  43. using namespace CRFPP;
  44. #ifdef WIN32
  45. using namespace stdext;
  46. #else
  47. using namespace __gnu_cxx;
  48. #endif
  /* Dictionary encoding switch; set to 1 here.
   * NOTE(review): presumably selects UTF-8 handling for the CRF dictionary --
   * confirm against the code that tests this macro. */
  #define CRFDICT_UTF8 1

  /*
   * Reserved tokens. Each token has three parallel macros:
   *   <NAME>         the token text,
   *   <NAME>_ID      its numeric id (0..3, in the order listed below),
   *   <NAME>_LENGTH  the byte length of the token text (equals strlen(<NAME>)).
   * Keep the three in sync when changing any of them.
   */

  /* Begin marker. */
  #define BEGIN_TOKEN "b##b"
  #define BEGIN_TOKEN_ID 0
  #define BEGIN_TOKEN_LENGTH 4

  /* End marker. */
  #define END_TOKEN "e##e"
  #define END_TOKEN_ID 1
  #define END_TOKEN_LENGTH 4

  /* Number placeholder. */
  #define NUMBER_TOKEN "M"
  #define NUMBER_TOKEN_ID 2
  #define NUMBER_TOKEN_LENGTH 1

  /* ASCII placeholder. */
  #define ASCII_TOKEN "E"
  #define ASCII_TOKEN_ID 3
  #define ASCII_TOKEN_LENGTH 1
  /**
   * Base functor used to abstract the n-gram smoothing algorithm.
   * Design only; not used yet.
   *
   * @tparam FType floating-point type of the smoothing factor and of the
   *               returned cost.
   */
  template <typename FType>
  struct NgramSmoother {
    /** Frequency normaliser used by the smoothing formula. */
    static const int MAX_FREQUENCE = 2079997;

    /**
     * Computes the negative natural logarithm of
     *
     *   Smoothing * (1 + L) / (MAX_FREQUENCE + 80000)
     *     + (1 - Smoothing) * ((1 - d) * Bi / (1 + L) + d),   d = 1 / MAX_FREQUENCE
     *
     * @param L         count used in both terms of the formula
     *                  (presumably the left word's frequency -- confirm).
     * @param R         accepted for interface symmetry but not used by the formula
     *                  (presumably the right word's frequency -- confirm).
     * @param Bi        count used in the second term
     *                  (presumably the bigram frequency -- confirm).
     * @param Smoothing interpolation weight between the two terms.
     * @return the cost, converted to FType. No guard is applied for L == -1,
     *         where the second term divides by zero.
     */
    FType operator()(int L, int R, int Bi, FType Smoothing) const {
      (void)R;  // intentionally unused, see above
      const double floor_prob = 1.0 / MAX_FREQUENCE;
      return -std::log(
          Smoothing * (1 + L) / (MAX_FREQUENCE + 80000) +
          (1 - Smoothing) * ((1 - floor_prob) * Bi / (1 + L) + floor_prob));
    }
  };
  /**
  Bit flag format:
  The bit flag is used in char-type tagging. size = sizeof(char).
  x1 x2 x3 x4 x5 x6 x7
  x1 x2, the UTF-8 char's position token
  1 1, the next 2 (or 4) chars are the token length (UTF-8 data length).
  0 0, only the current char
  0 1, the next char
  1 0, the next 2 chars
  1 1, more than 3 chars; read the next 2 bytes. This means a token cannot be
  larger than 64k.
  ------
  [0-80], the standard ASCII chars,
  tag-set:
  m: number
  e: non-CJK char, e.g. English, pinyin
  t: time, e.g. era names, sexagenary-cycle (stem-branch) terms, etc. (once
     recognized here, these are only added to the OOV list and do not take
     part in the actual segmentation)
  c: CJK char.
  s: symbol, e.g. @
  w: sentence separator.
  x: unknown char.
  */
  98. class Segmenter_ConfigObj {
  99. public:
  100. u1 merge_number_and_ascii;
  101. u1 seperate_number_ascii;
  102. // TODO: compress_space is still unsupported, for spaces can be handled in
  103. // stopword list.
  104. u1 compress_space;
  105. u1 number_and_ascii_joint[512];
  106. Segmenter_ConfigObj()
  107. : merge_number_and_ascii(0), seperate_number_ascii(0), compress_space(0) {
  108. number_and_ascii_joint[0] = 0;
  109. }
  110. };
  111. class Segmenter {
  112. public:
  113. /**
  114. * @return 0
  115. */
  116. void setBuffer(u1* buf, u4 length);
  117. const u1* peekToken(u2& aLen, u2& aSymLen, u2 n = 0);
  118. void popToken(u2 len, u2 n = 0);
  119. void segNgram(int n) { m_ngram = n; }
  120. int getOffset();
  121. u1 isSentenceEnd();
  122. int isKeyWord(u1* buf, u4 length);
  123. int getWordWeight(u1* buf, u4 length);
  124. const char* thesaurus(const char* key, u2 key_len);
  125. Segmenter();
  126. ~Segmenter();
  127. protected:
  128. const u1* peekKwToken(u2& aLen, u2& aSymLen);
  129. void popKwToken(u2 len);
  130. public:
  131. static int toLowerCpy(const u1* src, u1* det, u2 det_size);
  132. protected:
  133. int m_begin_id;
  134. int m_end_id;
  135. int m_begin_count;
  136. int m_end_count;
  137. int m_ngram;
  138. ChineseCharTaggerImpl* m_tagger;
  139. MMThunk m_thunk;
  140. // static ToLowerImpl* m_lower;
  141. public:
  142. UnigramDict* m_unidict;
  143. UnigramDict* m_kwdict;
  144. UnigramDict* m_weightdict;
  145. SynonymsDict* m_symdict;
  146. ThesaurusDict* m_thesaurus;
  147. Segmenter_ConfigObj* m_config;
  148. // mmseg used.
  149. u1* m_buffer_begin;
  150. u1* m_buffer_ptr;
  151. u1* m_buffer_chunk_begin;
  152. u1* m_buffer_end;
  153. };
  154. } /* End of namespace css */
  155. #endif