/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ /* ***** BEGIN LICENSE BLOCK ***** * Version: GPL 2.0 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License. You should have * received a copy of the GPL license along with this program; if you * did not, you can find it at http://www.gnu.org/ * * Software distributed under the License is distributed on an "AS IS" basis, * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License * for the specific language governing rights and limitations under the * License. * * The Original Code is Coreseek.com code. * * Copyright (C) 2007-2008. All Rights Reserved. * * Author: * Li monan * * ***** END LICENSE BLOCK ***** */ #ifndef css_Segmenter_h #define css_Segmenter_h #include #ifdef WIN32 #include #else #include #endif #include "SegmentPkg.h" #include "UnigramDict.h" #include "SynonymsDict.h" #include "csr_typedefs.h" #include "freelist.h" #include "mmthunk.h" #include #include #include #include "UnigramCorpusReader.h" #include "ThesaurusDict.h" namespace css { using namespace CRFPP; #ifdef WIN32 using namespace stdext; #else using namespace __gnu_cxx; #endif #define CRFDICT_UTF8 1 #define BEGIN_TOKEN "b##b" #define END_TOKEN "e##e" #define NUMBER_TOKEN "M" #define ASCII_TOKEN "E" #define BEGIN_TOKEN_ID 0 #define END_TOKEN_ID 1 #define NUMBER_TOKEN_ID 2 #define ASCII_TOKEN_ID 3 #define BEGIN_TOKEN_LENGTH 4 #define END_TOKEN_LENGTH 4 #define NUMBER_TOKEN_LENGTH 1 #define ASCII_TOKEN_LENGTH 1 /* base functor, used to abstract n-gram smoothing algorithm Design only. not used yet. */ template struct NgramSmoother { FType operator()(int L, int R, int Bi, FType Smoothing) const { double dTemp = 1.0 / MAX_FREQUENCE; return (-1) * log(Smoothing * (1 + L) / (MAX_FREQUENCE + 80000) + (1 - Smoothing) * ((1 - dTemp) * Bi / (1 + L) + dTemp)); return 0; } const static int MAX_FREQUENCE = 2079997; }; /** Bit flag format: Bit flag is used in char-type tagging. size = sizeof(char). x1 x2 x3 x4 x5 x6 x7 x1 x2, the utf-8 char's position token 1 1, the next 2(or 4) char is token-length. (utf-8 data length) 0 0, only current char 0 1, next char 1 0, next 2 char 1 1, more than 3 char, read next 2 byte. this limited a token can not larger than 64k. ------ [0-80], the standard ascii char, tag-set: m: number e: non CJK char, e.g. English pinyin t: time. 年号 干支等(此处识别出后,仅加入 oov ,不参与实际分词) c: CJK char. s: Symbol e.g. @ w: Sentence seperator. x: unknown char. */ class Segmenter_ConfigObj { public: u1 merge_number_and_ascii; u1 seperate_number_ascii; // TODO: compress_space is still unsupported, for spaces can be handled in // stopword list. u1 compress_space; u1 number_and_ascii_joint[512]; Segmenter_ConfigObj() : merge_number_and_ascii(0), seperate_number_ascii(0), compress_space(0) { number_and_ascii_joint[0] = 0; } }; class Segmenter { public: /** * @return 0 */ void setBuffer(u1* buf, u4 length); const u1* peekToken(u2& aLen, u2& aSymLen, u2 n = 0); void popToken(u2 len, u2 n = 0); void segNgram(int n) { m_ngram = n; } int getOffset(); u1 isSentenceEnd(); int isKeyWord(u1* buf, u4 length); int getWordWeight(u1* buf, u4 length); const char* thesaurus(const char* key, u2 key_len); Segmenter(); ~Segmenter(); protected: const u1* peekKwToken(u2& aLen, u2& aSymLen); void popKwToken(u2 len); public: static int toLowerCpy(const u1* src, u1* det, u2 det_size); protected: int m_begin_id; int m_end_id; int m_begin_count; int m_end_count; int m_ngram; ChineseCharTaggerImpl* m_tagger; MMThunk m_thunk; // static ToLowerImpl* m_lower; public: UnigramDict* m_unidict; UnigramDict* m_kwdict; UnigramDict* m_weightdict; SynonymsDict* m_symdict; ThesaurusDict* m_thesaurus; Segmenter_ConfigObj* m_config; // mmseg used. u1* m_buffer_begin; u1* m_buffer_ptr; u1* m_buffer_chunk_begin; u1* m_buffer_end; }; } /* End of namespace css */ #endif