darts.h 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593
  1. /*
  2. Darts -- Double-ARray Trie System
  3. $Id: darts.h.in 1575 2007-01-27 13:24:27Z taku $;
  4. Copyright(C) 2001-2007 Taku Kudo <taku@chasen.org>
  5. */
  6. #ifndef DARTS_H_
  7. #define DARTS_H_
  8. #define DARTS_VERSION "0.31"
  9. #include <vector>
  10. #include <cstring>
  11. #include <cstdio>
  12. #include "csr_typedefs.h"
  13. #ifdef HAVE_ZLIB_H
  14. namespace zlib {
  15. #include <zlib.h>
  16. }
  17. #define SH(p) \
  18. ((unsigned short)(unsigned char)((p)[0]) | \
  19. ((unsigned short)(unsigned char)((p)[1]) << 8))
  20. #define LG(p) ((unsigned long)(SH(p)) | ((unsigned long)(SH((p)+2)) << 16))
  21. #endif
  22. namespace Darts {
  23. template <class T>
  24. inline T _max(T x, T y) {
  25. return (x > y) ? x : y;
  26. }
  27. template <class T>
  28. inline T *_resize(T *ptr, size_t n, size_t l, T v) {
  29. T *tmp = new T[l];
  30. for (size_t i = 0; i < n; ++i) tmp[i] = ptr[i];
  31. for (size_t i = n; i < l; ++i) tmp[i] = v;
  32. delete[] ptr;
  33. return tmp;
  34. }
  35. template <class T>
  36. class Length {
  37. public:
  38. size_t operator()(const T *key) const {
  39. size_t i;
  40. for (i = 0; key[i] != (T)0; ++i) {
  41. }
  42. return i;
  43. }
  44. };
  45. template <>
  46. class Length<char> {
  47. public:
  48. size_t operator()(const char *key) const { return std::strlen(key); }
  49. };
  50. template <class node_type_, class node_u_type_, class array_type_,
  51. class array_u_type_, class length_func_ = Length<node_type_> >
  52. class DoubleArrayImpl {
  53. public:
  54. struct unit_t {
  55. array_type_ base;
  56. array_u_type_ check;
  57. };
  58. private:
  59. struct node_t {
  60. array_u_type_ code;
  61. size_t depth;
  62. size_t left;
  63. size_t right;
  64. };
  65. unit_t *array_;
  66. unsigned char *used_;
  67. size_t size_;
  68. size_t alloc_size_;
  69. node_type_ **key_;
  70. size_t key_size_;
  71. size_t *length_;
  72. array_type_ *value_;
  73. size_t progress_;
  74. size_t next_check_pos_;
  75. bool no_delete_;
  76. int error_;
  77. int (*progress_func_)(size_t, size_t);
  78. size_t resize(const size_t new_size) {
  79. unit_t tmp;
  80. tmp.base = 0;
  81. tmp.check = 0;
  82. array_ = _resize(array_, alloc_size_, new_size, tmp);
  83. used_ =
  84. _resize(used_, alloc_size_, new_size, static_cast<unsigned char>(0));
  85. alloc_size_ = new_size;
  86. return new_size;
  87. }
  88. size_t fetch(const node_t &parent, std::vector<node_t> &siblings) {
  89. if (error_ < 0) return 0;
  90. array_u_type_ prev = 0;
  91. for (size_t i = parent.left; i < parent.right; ++i) {
  92. if ((length_ ? length_[i] : length_func_()(key_[i])) < parent.depth)
  93. continue;
  94. const node_u_type_ *tmp = reinterpret_cast<node_u_type_ *>(key_[i]);
  95. array_u_type_ cur = 0;
  96. if ((length_ ? length_[i] : length_func_()(key_[i])) != parent.depth)
  97. cur = (array_u_type_)tmp[parent.depth] + 1;
  98. if (prev > cur) {
  99. error_ = -3;
  100. return 0;
  101. }
  102. if (cur != prev || siblings.empty()) {
  103. node_t tmp_node;
  104. tmp_node.depth = parent.depth + 1;
  105. tmp_node.code = cur;
  106. tmp_node.left = i;
  107. if (!siblings.empty()) siblings[siblings.size() - 1].right = i;
  108. siblings.push_back(tmp_node);
  109. }
  110. prev = cur;
  111. }
  112. if (!siblings.empty()) siblings[siblings.size() - 1].right = parent.right;
  113. return siblings.size();
  114. }
  115. size_t insert(const std::vector<node_t> &siblings) {
  116. if (error_ < 0) return 0;
  117. size_t begin = 0;
  118. size_t pos = _max((size_t)siblings[0].code + 1, next_check_pos_) - 1;
  119. size_t nonzero_num = 0;
  120. int first = 0;
  121. if (alloc_size_ <= pos) resize(pos + 1);
  122. while (true) {
  123. next:
  124. ++pos;
  125. if (alloc_size_ <= pos) resize(pos + 1);
  126. if (array_[pos].check) {
  127. ++nonzero_num;
  128. continue;
  129. } else if (!first) {
  130. next_check_pos_ = pos;
  131. first = 1;
  132. }
  133. begin = pos - siblings[0].code;
  134. if (alloc_size_ <= (begin + siblings[siblings.size() - 1].code)) {
  135. size_t new_size = static_cast<size_t>(
  136. alloc_size_ * _max(1.05, 1.0 * key_size_ / progress_));
  137. if (!new_size) new_size = alloc_size_ * 1.0 * key_size_; //????
  138. resize(new_size);
  139. }
  140. if (used_[begin]) continue;
  141. for (size_t i = 1; i < siblings.size(); ++i)
  142. if (array_[begin + siblings[i].code].check != 0) goto next;
  143. break;
  144. }
  145. // -- Simple heuristics --
  146. // if the percentage of non-empty contents in check between the index
  147. // 'next_check_pos' and 'check' is greater than some constant
  148. // value(e.g. 0.9),
  149. // new 'next_check_pos' index is written by 'check'.
  150. if (1.0 * nonzero_num / (pos - next_check_pos_ + 1) >= 0.95)
  151. next_check_pos_ = pos;
  152. used_[begin] = 1;
  153. size_ = _max(size_, begin + static_cast<size_t>(
  154. siblings[siblings.size() - 1].code + 1));
  155. for (size_t i = 0; i < siblings.size(); ++i)
  156. array_[begin + siblings[i].code].check = begin;
  157. for (size_t i = 0; i < siblings.size(); ++i) {
  158. std::vector<node_t> new_siblings;
  159. if (!fetch(siblings[i], new_siblings)) {
  160. array_[begin + siblings[i].code].base =
  161. value_ ? static_cast<array_type_>(-value_[siblings[i].left] - 1)
  162. : static_cast<array_type_>(-1 * siblings[i].left - 1);
  163. if (value_ && (array_type_)(-value_[siblings[i].left] - 1) >= 0) {
  164. error_ = -2;
  165. return 0;
  166. }
  167. ++progress_;
  168. if (progress_func_) (*progress_func_)(progress_, key_size_);
  169. } else {
  170. size_t h = insert(new_siblings);
  171. array_[begin + siblings[i].code].base = h;
  172. }
  173. }
  174. return begin;
  175. }
  176. public:
  177. typedef array_type_ value_type;
  178. typedef node_type_ key_type;
  179. typedef array_type_ result_type; // for compatibility
  180. struct result_pair_type {
  181. value_type value;
  182. size_t length;
  183. value_type pos;
  184. };
  185. explicit DoubleArrayImpl()
  186. : array_(0),
  187. used_(0),
  188. size_(0),
  189. alloc_size_(0),
  190. no_delete_(0),
  191. error_(0) {}
  192. ~DoubleArrayImpl() { clear(); }
  193. void set_result(value_type &x, value_type r, size_t) { x = r; }
  194. void set_result(result_pair_type &x, value_type r, size_t l) {
  195. x.value = r;
  196. x.length = l;
  197. x.pos = 0;
  198. }
  199. void set_result(result_pair_type &x, value_type r, size_t l,
  200. array_u_type_ p) {
  201. x.value = r;
  202. x.length = l;
  203. x.pos = p;
  204. }
  205. void set_array(void *ptr, size_t size = 0) {
  206. clear();
  207. array_ = reinterpret_cast<unit_t *>(ptr);
  208. no_delete_ = true;
  209. size_ = size;
  210. }
  211. const void *array() const {
  212. return const_cast<const void *>(reinterpret_cast<void *>(array_));
  213. }
  214. void clear() {
  215. if (!no_delete_) delete[] array_;
  216. delete[] used_;
  217. array_ = 0;
  218. used_ = 0;
  219. alloc_size_ = 0;
  220. size_ = 0;
  221. no_delete_ = false;
  222. }
  223. size_t unit_size() const { return sizeof(unit_t); }
  224. size_t size() const { return size_; }
  225. size_t total_size() const { return size_ * sizeof(unit_t); }
  226. size_t nonzero_size() const {
  227. size_t result = 0;
  228. for (size_t i = 0; i < size_; ++i)
  229. if (array_[i].check) ++result;
  230. return result;
  231. }
  232. int build(size_t key_size, key_type **key, size_t *length = 0,
  233. value_type *value = 0, int (*progress_func)(size_t, size_t) = 0) {
  234. if (!key_size || !key) return 0;
  235. progress_func_ = progress_func;
  236. key_ = key;
  237. length_ = length;
  238. key_size_ = key_size;
  239. value_ = value;
  240. progress_ = 0;
  241. // lower down the value to build very large(1450K word) ucs2 darts.
  242. resize(key_size_ * 64); // LMN: This line is the reason why word-item can
  243. // not larger than 255
  244. array_[0].base = 1;
  245. next_check_pos_ = 0;
  246. node_t root_node;
  247. root_node.left = 0;
  248. root_node.right = key_size;
  249. root_node.depth = 0;
  250. std::vector<node_t> siblings;
  251. fetch(root_node, siblings);
  252. insert(siblings);
  253. size_ += (1 << 8 * sizeof(key_type)) + 1;
  254. if (size_ >= alloc_size_) resize(size_);
  255. delete[] used_;
  256. used_ = 0;
  257. return error_;
  258. }
  259. int open(const char *file, const char *mode = "rb", size_t offset = 0,
  260. size_t size = 0) {
  261. std::FILE *fp = std::fopen(file, mode);
  262. if (!fp) return -1;
  263. if (std::fseek(fp, (long)offset, SEEK_SET) != 0) {
  264. std::fclose(fp);
  265. return -1;
  266. }
  267. if (!size) {
  268. if (std::fseek(fp, 0L, SEEK_END) != 0) {
  269. std::fclose(fp);
  270. return -1;
  271. }
  272. size = std::ftell(fp);
  273. if (std::fseek(fp, (long)offset, SEEK_SET) != 0) {
  274. std::fclose(fp);
  275. return -1;
  276. }
  277. }
  278. clear();
  279. size_ = size;
  280. size_ /= sizeof(unit_t);
  281. array_ = new unit_t[size_];
  282. if (size_ != std::fread(reinterpret_cast<unit_t *>(array_), sizeof(unit_t),
  283. size_, fp)) {
  284. std::fclose(fp);
  285. return -1;
  286. }
  287. std::fclose(fp);
  288. return 0;
  289. }
  290. int save(const char *file, const char *mode = "wb", size_t offset = 0) {
  291. if (!size_) return -1;
  292. std::FILE *fp = std::fopen(file, mode);
  293. if (!fp) return -1;
  294. if (size_ != std::fwrite(reinterpret_cast<unit_t *>(array_), sizeof(unit_t),
  295. size_, fp))
  296. return -1;
  297. std::fclose(fp);
  298. return 0;
  299. }
  300. #ifdef HAVE_ZLIB_H
  301. int gzopen(const char *file, const char *mode = "rb", size_t offset = 0,
  302. size_t size = 0) {
  303. std::FILE *fp = std::fopen(file, mode);
  304. if (!fp) return -1;
  305. clear();
  306. size_ = size;
  307. if (!size_) {
  308. if (-1L != static_cast<long>(std::fseek(fp, -8, SEEK_END))) {
  309. char buf[8];
  310. if (std::fread(static_cast<char *>(buf), 1, 8, fp) != sizeof(buf)) {
  311. std::fclose(fp);
  312. return -1;
  313. }
  314. size_ = LG(buf + 4);
  315. size_ /= sizeof(unit_t);
  316. }
  317. }
  318. std::fclose(fp);
  319. if (!size_) return -1;
  320. zlib::gzFile gzfp = zlib::gzopen(file, mode);
  321. if (!gzfp) return -1;
  322. array_ = new unit_t[size_];
  323. if (zlib::gzseek(gzfp, offset, SEEK_SET) != 0) return -1;
  324. zlib::gzread(gzfp, reinterpret_cast<unit_t *>(array_),
  325. sizeof(unit_t) * size_);
  326. zlib::gzclose(gzfp);
  327. return 0;
  328. }
  329. int gzsave(const char *file, const char *mode = "wb", size_t offset = 0) {
  330. zlib::gzFile gzfp = zlib::gzopen(file, mode);
  331. if (!gzfp) return -1;
  332. zlib::gzwrite(gzfp, reinterpret_cast<unit_t *>(array_),
  333. sizeof(unit_t) * size_);
  334. zlib::gzclose(gzfp);
  335. return 0;
  336. }
  337. #endif
  338. template <class T>
  339. inline void exactMatchSearch(const key_type *key, T &result, size_t len = 0,
  340. size_t node_pos = 0) {
  341. result = exactMatchSearch<T>(key, len, node_pos);
  342. return;
  343. }
  344. template <class T>
  345. inline T exactMatchSearch(const key_type *key, size_t len = 0,
  346. size_t node_pos = 0) {
  347. if (!len) len = length_func_()(key);
  348. T result;
  349. set_result(result, -1, 0);
  350. register array_type_ b = array_[node_pos].base;
  351. register array_u_type_ p;
  352. for (register size_t i = 0; i < len; ++i) {
  353. p = b + (node_u_type_)(key[i]) + 1;
  354. if (static_cast<array_u_type_>(b) == array_[p].check)
  355. b = array_[p].base;
  356. else
  357. return result;
  358. }
  359. p = b;
  360. array_type_ n = array_[p].base;
  361. if (static_cast<array_u_type_>(b) == array_[p].check && n < 0)
  362. set_result(result, -n - 1, len, p);
  363. else
  364. set_result(result, -n - 1, len, -1 * p);
  365. return result;
  366. }
  367. template <class T>
  368. size_t commonPrefixSearch(const key_type *key, T *result, size_t result_len,
  369. size_t len = 0, size_t node_pos = 0) {
  370. return commonPrefixSearch(key, 0xFFFF, result, result_len, len, node_pos);
  371. }
  372. template <class T>
  373. size_t commonPrefixSearch(const key_type *key, u4 flag, T *result,
  374. size_t result_len, size_t len = 0,
  375. size_t node_pos = 0) {
  376. if (!len) len = length_func_()(key);
  377. register array_type_ b = array_[node_pos].base;
  378. register size_t num = 0;
  379. register array_type_ n;
  380. register array_u_type_ p;
  381. for (register size_t i = 0; i < len; ++i) {
  382. p = b; // + 0;
  383. n = array_[p].base;
  384. if ((array_u_type_)b == array_[p].check && n < 0) {
  385. // result[num] = -n-1;
  386. // found a sub word
  387. if (num < result_len) set_result(result[num], -n - 1, i, p);
  388. ++num;
  389. }
  390. p = b + (node_u_type_)(key[i] & flag) + 1;
  391. if ((array_u_type_)b == array_[p].check)
  392. b = array_[p].base;
  393. else {
  394. // found a mismatch
  395. return num;
  396. }
  397. }
  398. p = b;
  399. n = array_[p].base;
  400. // total-string match.
  401. if ((array_u_type_)b == array_[p].check && n < 0) {
  402. if (num < result_len) set_result(result[num], -n - 1, len, p);
  403. ++num;
  404. } else {
  405. // all prev-string match. but not term. pos < 0, this is a prefix
  406. if (num < result_len) set_result(result[num], -n - 1, len, -1 * p);
  407. ++num;
  408. }
  409. return num;
  410. }
  411. template <class T>
  412. size_t commonPrefixSearch(const u4 *key, u4 flag, T *result,
  413. size_t result_len, size_t len = 0,
  414. size_t node_pos = 0) {
  415. if (!len) return 0;
  416. register array_type_ b = array_[node_pos].base;
  417. register size_t num = 0;
  418. register array_type_ n;
  419. register array_u_type_ p;
  420. for (register size_t i = 0; i < len; ++i) {
  421. p = b; // + 0;
  422. n = array_[p].base;
  423. if ((array_u_type_)b == array_[p].check && n < 0) {
  424. // result[num] = -n-1;
  425. // found a sub word
  426. if (num < result_len) set_result(result[num], -n - 1, i, p);
  427. ++num;
  428. }
  429. p = b + (node_u_type_)(key[i] & flag) + 1;
  430. if ((array_u_type_)b == array_[p].check)
  431. b = array_[p].base;
  432. else {
  433. // found a mismatch
  434. return num;
  435. }
  436. }
  437. p = b;
  438. n = array_[p].base;
  439. // total-string match.
  440. if ((array_u_type_)b == array_[p].check && n < 0) {
  441. if (num < result_len) set_result(result[num], -n - 1, len, p);
  442. ++num;
  443. } else {
  444. // all prev-string match. but not term. pos < 0, this is a prefix
  445. if (num < result_len) set_result(result[num], -n - 1, len, -1 * p);
  446. ++num;
  447. }
  448. return num;
  449. }
  450. value_type traverse(const key_type *key, size_t &node_pos, size_t &key_pos,
  451. size_t len = 0) {
  452. if (!len) len = length_func_()(key);
  453. register array_type_ b = array_[node_pos].base;
  454. register array_u_type_ p;
  455. for (; key_pos < len; ++key_pos) {
  456. p = b + (node_u_type_)(key[key_pos]) + 1;
  457. if (static_cast<array_u_type_>(b) == array_[p].check) {
  458. node_pos = p;
  459. b = array_[p].base;
  460. } else {
  461. return -2; // no node
  462. }
  463. }
  464. p = b;
  465. array_type_ n = array_[p].base;
  466. if (static_cast<array_u_type_>(b) == array_[p].check && n < 0)
  467. return -n - 1;
  468. return -1; // found, but no value
  469. }
  470. };
  471. #if 4 == 2
  472. typedef Darts::DoubleArrayImpl<char, unsigned char, short, unsigned short>
  473. DoubleArray;
  474. #define DARTS_ARRAY_SIZE_IS_DEFINED 1
  475. #endif
  476. #if 4 == 4 && !defined(DARTS_ARRAY_SIZE_IS_DEFINED)
  477. typedef Darts::DoubleArrayImpl<char, unsigned char, int, unsigned int>
  478. DoubleArray;
  479. #define DARTS_ARRAY_SIZE_IS_DEFINED 1
  480. #endif
  481. #if 4 == 4 && !defined(DARTS_ARRAY_SIZE_IS_DEFINED)
  482. typedef Darts::DoubleArrayImpl<char, unsigned char, long, unsigned long>
  483. DoubleArray;
  484. #define DARTS_ARRAY_SIZE_IS_DEFINED 1
  485. #endif
  486. #if 4 == 8 && !defined(DARTS_ARRAY_SIZE_IS_DEFINED)
  487. typedef Darts::DoubleArrayImpl<char, unsigned char, long long,
  488. unsigned long long> DoubleArray;
  489. #endif
  490. typedef Darts::DoubleArrayImpl<unsigned short, unsigned short, int,
  491. unsigned int> DoubleArrayUcs2;
  492. }
  493. #endif