/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ /* ***** BEGIN LICENSE BLOCK ***** * Version: GPL 2.0 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License. You should have * received a copy of the GPL license along with this program; if you * did not, you can find it at http://www.gnu.org/ * * Software distributed under the License is distributed on an "AS IS" basis, * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License * for the specific language governing rights and limitations under the * License. * * The Original Code is Coreseek.com code. * * Copyright (C) 2007-2008. All Rights Reserved. * * Author: * Li monan * * ***** END LICENSE BLOCK ***** */ #ifndef css_SegmentPkg_h #define css_SegmentPkg_h #include #define HAVE_ATEXIT #include "Singleton.h" #include "csr_typedefs.h" namespace css { /* - find char-class - tolower case(optional, used in search.). */ class ChineseCharTaggerImpl { public: ChineseCharTaggerImpl() { init(); } ~ChineseCharTaggerImpl() { for (int i = 1; i < 256; i++) { if (i == 0x23) continue; if (index_map[i]) delete[] index_map[i]; } }; u2 tagUnicode(u2 iCode, u1 length); protected: void init(); // We reduced the map. only number-char page is exist // char cjk_map[20736]; // 256*(9f-4e) = 21k u1* index_map[256]; u1 ansi_map[256]; // char sym_map[512]; // 0x3000 - 0x303F && 0xFF?? }; typedef CSR_Singleton ChineseCharTagger; #include "tolowercase.h" /*To lower */ class ToLowerImpl { public: ToLowerImpl(){}; inline u2 toLower(u2 k) { u1 idx = k >> 8; u2 iCode = k; if (table_index[idx]) iCode = table_index[idx][k & 0xFF]; if (iCode) return iCode; return k; } }; typedef CSR_Singleton ToLower; class SegmentPkg { public: SegmentPkg(); ~SegmentPkg(); void init(); public: const char* m_buf; // make the hole object less than 64k u1* m_tag; int m_length; // used length u1 m_Own; int m_size; // total length int m_used; u1 m_remains_bytes; std::vector m_wTagList; // the seps position. ChineseCharTaggerImpl* m_tagger; public: /** @return 0, appended. @return -1, too large NOTE: a newly created pkg always return 0. except not enough memory.(throw std::bad_alloc) */ int feedData(const char* buf, int length); int tagData(const char* buf, int length); void setSize(int length); public: /** * read UTF-8 input can tagger the char-pos in tag array. tag length must equal * or larger than buf. * we assume buf is end with '\0' * and this function will changed m_wTagList as a side effect. * @return, the data remains untagged. must less than 3. */ int tagData(const char* buf, u1* tag, int length = 0, int offset = 0); protected: const static int DEFAULT_PACKAGE_LENGTH = 65400; }; } /* End of namespace css */ #endif