123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176 |
- /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
- /* ***** BEGIN LICENSE BLOCK *****
- * Version: GPL 2.0
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License. You should have
- * received a copy of the GPL license along with this program; if you
- * did not, you can find it at http://www.gnu.org/
- *
- * Software distributed under the License is distributed on an "AS IS" basis,
- * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
- * for the specific language governing rights and limitations under the
- * License.
- *
- * The Original Code is Coreseek.com code.
- *
- * Copyright (C) 2007-2008. All Rights Reserved.
- *
- * Author:
- * Li monan <li.monan@gmail.com>
- *
- * ***** END LICENSE BLOCK ***** */
- #ifndef css_Segmenter_h
- #define css_Segmenter_h
#include <vector>
#ifdef WIN32
#include <hash_map>
#else
#include <ext/hash_map>
#endif
#include "SegmentPkg.h"
#include "UnigramDict.h"
#include "SynonymsDict.h"
#include "csr_typedefs.h"
#include "freelist.h"
#include "mmthunk.h"
#include <algorithm>
#include <cmath>
#include <fstream>
#include <iostream>
#include "UnigramCorpusReader.h"
#include "ThesaurusDict.h"
- namespace css {
- using namespace CRFPP;
- #ifdef WIN32
- using namespace stdext;
- #else
- using namespace __gnu_cxx;
- #endif
// NOTE(review): presumably selects UTF-8 handling for the CRF dictionary --
// confirm at the point of use; nothing in this header reads it.
#define CRFDICT_UTF8 1

// Special tokens. Each one has a literal text, a numeric id and a length;
// every *_LENGTH value equals the character count of the matching literal.

// Begin marker.
#define BEGIN_TOKEN "b##b"
#define BEGIN_TOKEN_ID 0
#define BEGIN_TOKEN_LENGTH 4

// End marker.
#define END_TOKEN "e##e"
#define END_TOKEN_ID 1
#define END_TOKEN_LENGTH 4

// Number placeholder.
#define NUMBER_TOKEN "M"
#define NUMBER_TOKEN_ID 2
#define NUMBER_TOKEN_LENGTH 1

// ASCII placeholder.
#define ASCII_TOKEN "E"
#define ASCII_TOKEN_ID 3
#define ASCII_TOKEN_LENGTH 1
/*
  Base functor, used to abstract the n-gram smoothing algorithm.
  Design only; not used yet.

  operator() returns the negated natural logarithm of a linear interpolation
  of two terms, weighted by Smoothing and (1 - Smoothing):

      Smoothing       * (1 + L) / (MAX_FREQUENCE + 80000)
    + (1 - Smoothing) * ((1 - d) * Bi / (1 + L) + d)      with d = 1 / MAX_FREQUENCE

  NOTE(review): L, R and Bi look like the left-word, right-word and bigram
  frequencies -- confirm with the intended caller. R is accepted but never
  read. The divisor (1 + L) is zero when L == -1.
*/
template <typename FType>
struct NgramSmoother {
  FType operator()(int L, int R, int Bi, FType Smoothing) const {
    (void)R;  // kept for interface compatibility; not part of the formula
    const double dTemp = 1.0 / MAX_FREQUENCE;
    // std::log (from <cmath>) instead of an unqualified log: inside a template
    // the unqualified name only resolves if some earlier header happened to
    // declare it.
    return static_cast<FType>(
        -std::log(Smoothing * (1 + L) / (MAX_FREQUENCE + 80000) +
                  (1 - Smoothing) * ((1 - dTemp) * Bi / (1 + L) + dTemp)));
  }
  const static int MAX_FREQUENCE = 2079997;
};
/**
Bit flag format:
Bit flag is used in char-type tagging. size = sizeof(char).
x1 x2 x3 x4 x5 x6 x7
x1 x2, the utf-8 char's position token
1 1, the next 2 (or 4) chars are the token length (utf-8 data length).
0 0, only the current char
0 1, the next char
1 0, the next 2 chars
1 1, more than 3 chars; read the next 2 bytes. This limits a token to no more
than 64k.
------
[0-80], the standard ascii char,
tag-set:
m: number
e: non-CJK char, e.g. English pinyin
t: time, e.g. era names, sexagenary-cycle (stem-branch) terms, etc. (once
   recognized here they are only added to the OOV list and do not take part
   in the actual segmentation)
c: CJK char.
s: Symbol, e.g. @
w: Sentence separator.
x: unknown char.
*/
- class Segmenter_ConfigObj {
- public:
- u1 merge_number_and_ascii;
- u1 seperate_number_ascii;
- // TODO: compress_space is still unsupported, for spaces can be handled in
- // stopword list.
- u1 compress_space;
- u1 number_and_ascii_joint[512];
- Segmenter_ConfigObj()
- : merge_number_and_ascii(0), seperate_number_ascii(0), compress_space(0) {
- number_and_ascii_joint[0] = 0;
- }
- };
- class Segmenter {
- public:
- /**
- * @return 0
- */
- void setBuffer(u1* buf, u4 length);
- const u1* peekToken(u2& aLen, u2& aSymLen, u2 n = 0);
- void popToken(u2 len, u2 n = 0);
- void segNgram(int n) { m_ngram = n; }
- int getOffset();
- u1 isSentenceEnd();
- int isKeyWord(u1* buf, u4 length);
- int getWordWeight(u1* buf, u4 length);
- const char* thesaurus(const char* key, u2 key_len);
- Segmenter();
- ~Segmenter();
- protected:
- const u1* peekKwToken(u2& aLen, u2& aSymLen);
- void popKwToken(u2 len);
- public:
- static int toLowerCpy(const u1* src, u1* det, u2 det_size);
- protected:
- int m_begin_id;
- int m_end_id;
- int m_begin_count;
- int m_end_count;
- int m_ngram;
- ChineseCharTaggerImpl* m_tagger;
- MMThunk m_thunk;
- // static ToLowerImpl* m_lower;
- public:
- UnigramDict* m_unidict;
- UnigramDict* m_kwdict;
- UnigramDict* m_weightdict;
- SynonymsDict* m_symdict;
- ThesaurusDict* m_thesaurus;
- Segmenter_ConfigObj* m_config;
- // mmseg used.
- u1* m_buffer_begin;
- u1* m_buffer_ptr;
- u1* m_buffer_chunk_begin;
- u1* m_buffer_end;
- };
- } /* End of namespace css */
- #endif
|