mmthunk.h 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229
  1. /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
  2. /* ***** BEGIN LICENSE BLOCK *****
  3. * Version: GPL 2.0
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License. You should have
  7. * received a copy of the GPL license along with this program; if you
  8. * did not, you can find it at http://www.gnu.org/
  9. *
  10. * Software distributed under the License is distributed on an "AS IS" basis,
  11. * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  12. * for the specific language governing rights and limitations under the
  13. * License.
  14. *
  15. * The Original Code is Coreseek.com code.
  16. *
  17. * Copyright (C) 2007-2008. All Rights Reserved.
  18. *
  19. * Author:
  20. * Li monan <li.monan@gmail.com>
  21. *
  22. * ***** END LICENSE BLOCK ***** */
  23. #ifndef _MM_THUNK_H_
  24. #define _MM_THUNK_H_
  25. #include <vector>
  26. #include <math.h>
  27. #include "UnigramDict.h"
  28. #include "freelist.h"
  29. #define CHUNK_BUFFER_SIZE 1024
  30. #define CHUNK_DEBUG 0
  31. namespace css {
  32. class Chunk {
  33. public:
  34. Chunk() : m_free_score(0.0), total_length(0) {}
  35. float m_free_score;
  36. int total_length;
  37. std::vector<u2> tokens;
  38. std::vector<u2> freqs;
  39. inline void pushToken(u2 len, u2 freq) {
  40. #if CHUNK_DEBUG
  41. printf("pt:%d, %d;\t", len, freq);
  42. #endif
  43. tokens.push_back(len);
  44. total_length += len;
  45. freqs.push_back(freq);
  46. // m_free_score += log((float)freq) * 100;
  47. }
  48. inline float get_free() {
  49. // m_free_score
  50. float score = 0.0;
  51. std::vector<u2>::iterator it;
  52. float freq = 0;
  53. for (it = freqs.begin(); it < freqs.end(); it++) {
  54. freq = ((float)*it) + 1;
  55. score += log(freq) * 100;
  56. }
  57. return score;
  58. }
  59. inline float get_avl() {
  60. float avg = (float)1.0 * total_length / tokens.size();
  61. return avg;
  62. }
  63. inline float get_avg() {
  64. float avg = (float)1.0 * total_length / tokens.size();
  65. std::vector<u2>::iterator it;
  66. float total = 0;
  67. for (it = tokens.begin(); it < tokens.end(); it++) {
  68. float diff = ((*it) - avg);
  69. total += diff * diff;
  70. }
  71. return (float)1.0 * total / (tokens.size() - 1);
  72. }
  73. inline void popup() {
  74. if (tokens.size()) {
  75. total_length -= tokens[tokens.size() - 1];
  76. tokens.pop_back();
  77. freqs.pop_back();
  78. }
  79. }
  80. inline void reset() {
  81. tokens.clear();
  82. freqs.clear();
  83. total_length = 0;
  84. }
  85. };
  86. class ChunkQueue {
  87. public:
  88. ChunkQueue() : max_length(0){};
  89. public:
  90. void push(Chunk& ck) {
  91. if (ck.total_length < max_length) return; // rule:1
  92. if (ck.total_length > max_length) {
  93. max_length = ck.total_length;
  94. m_chunks.clear();
  95. }
  96. m_chunks.push_back(ck);
  97. };
  98. u2 getToken() {
  99. size_t num_chunk = m_chunks.size();
  100. if (!num_chunk) return 0;
  101. if (num_chunk == 1) return m_chunks[0].tokens[0];
  102. // debug use->dump chunk
  103. #if CHUNK_DEBUG
  104. for (size_t i = 0; i < num_chunk; i++) {
  105. for (size_t j = 0; j < m_chunks[i].tokens.size(); j++)
  106. printf("%d,", m_chunks[i].tokens[j]);
  107. printf("\n");
  108. }
  109. #endif
  110. // do filter
  111. // apply rule 2
  112. float avg_length = 0;
  113. u4 remains[256]; // m_chunks.size can not larger than 256;
  114. u4* k_ptr = remains;
  115. for (size_t i = 0; i < m_chunks.size(); i++) {
  116. float avl = m_chunks[i].get_avl();
  117. if (avl > avg_length) {
  118. avg_length = avl;
  119. k_ptr = remains;
  120. *k_ptr = (u4)i;
  121. k_ptr++;
  122. } else if (avl == avg_length) {
  123. *k_ptr = (u4)i;
  124. k_ptr++;
  125. }
  126. }
  127. if ((k_ptr - remains) == 1)
  128. return m_chunks[remains[0]].tokens[0]; // match by rule2
  129. // apply rule 3
  130. u4 remains_r3[256];
  131. u4* k_ptr_r3 = remains_r3;
  132. avg_length = 1024 * 64; // an unreachable avg
  133. for (size_t i = 0; i < k_ptr - remains; i++) {
  134. float avg = m_chunks[remains[i]].get_avg();
  135. if (avg < avg_length) {
  136. avg_length = avg;
  137. k_ptr_r3 = remains_r3;
  138. *k_ptr_r3 = (u4)remains[i]; //*k_ptr_r3 = (u4)i;
  139. k_ptr_r3++;
  140. } else if (avg == avg_length) {
  141. *k_ptr_r3 = (u4)i;
  142. k_ptr_r3++;
  143. }
  144. }
  145. if ((k_ptr_r3 - remains_r3) == 1)
  146. return m_chunks[remains_r3[0]].tokens[0]; // match by rule3 min
  147. // avg_length
  148. // apply r4 max freedom
  149. float max_score = 0.0;
  150. size_t idx = -1;
  151. for (size_t i = 0; i < k_ptr_r3 - remains_r3; i++) {
  152. float score = m_chunks[remains_r3[i]].get_free();
  153. if (score > max_score) {
  154. max_score = score;
  155. idx = remains_r3[i];
  156. }
  157. }
  158. return m_chunks[idx].tokens[0];
  159. // return 0;
  160. };
  161. inline void reset() {
  162. m_chunks.clear();
  163. max_length = 0;
  164. };
  165. protected:
  166. std::vector<Chunk> m_chunks;
  167. i4 max_length;
  168. };
  169. class item_info {
  170. public:
  171. item_info()
  172. : // length(0),
  173. freq(0){};
  174. public:
  175. // u4 length;
  176. u4 freq;
  177. std::vector<u2> items;
  178. };
  179. class MMThunk {
  180. public:
  181. MMThunk() : base_offset(0), m_max_length(-1), m_length(0) {
  182. memset(m_charinfos, 0, sizeof(item_info*) * CHUNK_BUFFER_SIZE);
  183. memset(m_kwinfos, 0, sizeof(item_info*) * CHUNK_BUFFER_SIZE);
  184. item_list.set_size(CHUNK_BUFFER_SIZE * 2);
  185. };
  186. ~MMThunk(){};
  187. void setItems(i4 idx, u2 rs_count, UnigramDict::result_pair_type* results);
  188. void setKwItems(i4 idx, u2 rs_count, UnigramDict::result_pair_type* results);
  189. void advance(u2 step) { base_offset += step; };
  190. // peek the current token
  191. u1* peekToken(u2& length);
  192. u2 popupToken();
  193. u1* peekKwToken(u2& pos, u2& length);
  194. u2 popupKwToken();
  195. int Tokenize();
  196. void pushToken(u2 aSize, i4 base);
  197. void reset();
  198. u4 length() { return m_length; };
  199. protected:
  200. u2 base_offset;
  201. CRFPP::FreeList<item_info> item_list;
  202. item_info* m_charinfos[CHUNK_BUFFER_SIZE];
  203. std::vector<u2> tokens;
  204. item_info* m_kwinfos[CHUNK_BUFFER_SIZE];
  205. i4 m_kw_pos;
  206. i4 m_kw_ipos;
  207. i4 m_max_length;
  208. u4 m_length;
  209. ChunkQueue m_queue;
  210. protected:
  211. void pushChunk(Chunk& ck);
  212. };
  213. }
  214. #endif