2 years ago · 8d33bf2ef4
--- a/src/comm/segment/bmm_segment.cc
+++ b/src/comm/segment/bmm_segment.cc
@@ -8,7 +8,8 @@ BmmSegment::~BmmSegment()
 
				 {
			
 
				 }
			
 
				 
			
 
				-void BmmSegment::ConcreteSplit(iutf8string& phrase, uint32_t appid, vector<string>& bmm_list){
			
 
				+void BmmSegment::ConcreteSplit(const string& str, uint32_t appid, vector<string>& bmm_list){
			
 
				+	iutf8string phrase(str);
			
 
				     int maxlen = MAX_WORD_LEN;
			
 
				     int len_phrase = phrase.length();
			
 
				     int i = len_phrase, j = 0;
			
--- a/src/comm/segment/bmm_segment.h
+++ b/src/comm/segment/bmm_segment.h
@@ -27,7 +27,7 @@ private:
 
				 public:
			
 
				     BmmSegment();
			
 
				     ~BmmSegment();
			
 
				-    virtual void ConcreteSplit(iutf8string& phrase, uint32_t appid, vector<string>& vec);
			
 
				+    virtual void ConcreteSplit(const string& str, uint32_t appid, vector<string>& vec);
			
 
				 };
			
 
				 
			
 
				 
			
--- a/src/comm/segment/custom_segment.cc
+++ b/src/comm/segment/custom_segment.cc
@@ -44,9 +44,9 @@ bool CustomSegment::Init(string word_path, string train_path){
 
				     return true;
			
 
				 }
			
 
				 
			
 
				-void CustomSegment::ConcreteSplit(iutf8string& phrase, uint32_t appid, vector<string>& vec){
			
 
				+void CustomSegment::ConcreteSplit(const string& str, uint32_t appid, vector<string>& vec){
			
 
				     char res[100] = {'\0'};
			
 
				-    word_split_func_(phrase.stlstring().c_str(), res, 100);
			
 
				+    word_split_func_(str.c_str(), res, 100);
			
 
				     string tmp = "";
			
 
				     for(int i = 0; i < strlen(res); i++){
			
 
				         if(res[i] != ' '){
			
--- a/src/comm/segment/custom_segment.h
+++ b/src/comm/segment/custom_segment.h
@@ -29,7 +29,7 @@ public:
 
				     CustomSegment();
			
 
				     ~CustomSegment();
			
 
				     virtual bool Init(string word_path, string train_path);
			
 
				-    virtual void ConcreteSplit(iutf8string& phrase, uint32_t appid, vector<string>& vec);
			
 
				+    virtual void ConcreteSplit(const string& str, uint32_t appid, vector<string>& vec);
			
 
				 private:
			
 
				     CConfig* cache_config_;
			
 
				     split_interface word_split_func_;
			
--- a/src/comm/segment/dag_segment.cc
+++ b/src/comm/segment/dag_segment.cc
@@ -10,7 +10,8 @@ DagSegment::~DagSegment()
 
				 {
			
 
				 }
			
 
				 
			
 
				-void DagSegment::ConcreteSplit(iutf8string& sentence, uint32_t appid, vector<string>& vec){
			
 
				+void DagSegment::ConcreteSplit(const string& str, uint32_t appid, vector<string>& vec){
			
 
				+	iutf8string sentence(str);
			
 
				     map<uint32_t, vector<uint32_t> > dag_map;
			
 
				     getDag(sentence, appid, dag_map);
			
 
				     map<uint32_t, RouteValue> route;
			
--- a/src/comm/segment/dag_segment.h
+++ b/src/comm/segment/dag_segment.h
@@ -34,7 +34,7 @@ class DagSegment: public Segment
 
				 public:
			
 
				     DagSegment();
			
 
				     ~DagSegment();
			
 
				-    virtual void ConcreteSplit(iutf8string& phrase, uint32_t appid, vector<string>& vec);
			
 
				+    virtual void ConcreteSplit(const string& str, uint32_t appid, vector<string>& vec);
			
 
				 private:
			
 
				     void getDag(iutf8string& sentence, uint32_t appid, map<uint32_t, vector<uint32_t> >& dag_map);
			
 
				     void calc(iutf8string& sentence, const map<uint32_t, vector<uint32_t> >& dag_map, map<uint32_t, RouteValue>& route, uint32_t appid);
			
--- a/src/comm/segment/fmm_segment.cc
+++ b/src/comm/segment/fmm_segment.cc
@@ -8,7 +8,8 @@ FmmSegment::~FmmSegment()
 
				 {
			
 
				 }
			
 
				 
			
 
				-void FmmSegment::ConcreteSplit(iutf8string& phrase, uint32_t appid, vector<string>& fmm_list){
			
 
				+void FmmSegment::ConcreteSplit(const string& str, uint32_t appid, vector<string>& fmm_list){
			
 
				+	iutf8string phrase(str);
			
 
				     int maxlen = MAX_WORD_LEN;
			
 
				     int len_phrase = phrase.length();
			
 
				     int i = 0, j = 0;
			
--- a/src/comm/segment/fmm_segment.h
+++ b/src/comm/segment/fmm_segment.h
@@ -27,7 +27,7 @@ private:
 
				 public:
			
 
				     FmmSegment();
			
 
				     ~FmmSegment();
			
 
				-    virtual void ConcreteSplit(iutf8string& phrase, uint32_t appid, vector<string>& vec);
			
 
				+    virtual void ConcreteSplit(const string& str, uint32_t appid, vector<string>& vec);
			
 
				 };
			
 
				 
			
 
				 
			
--- a/src/comm/segment/ngram_segment.cc
+++ b/src/comm/segment/ngram_segment.cc
@@ -9,11 +9,11 @@ NgramSegment::~NgramSegment()
 
				 {
			
 
				 }
			
 
				 
			
 
				-void NgramSegment::ConcreteSplit(iutf8string& utf8_str, uint32_t appid, vector<string>& parse_list){
			
 
				+void NgramSegment::ConcreteSplit(const string& str, uint32_t appid, vector<string>& parse_list){
			
 
				     vector<string> parse_list1;
			
 
				     vector<string> parse_list2;
			
 
				-    fmm(utf8_str, appid, parse_list1);
			
 
				-    bmm(utf8_str, appid, parse_list2);
			
 
				+    fmm(str, appid, parse_list1);
			
 
				+    bmm(str, appid, parse_list2);
			
 
				     parse_list1.insert(parse_list1.begin(), "<BEG>");
			
 
				     parse_list1.push_back("<END>");
			
 
				     parse_list2.insert(parse_list2.begin(), "<BEG>");
			
@@ -101,7 +101,8 @@ void NgramSegment::ConcreteSplit(iutf8string& utf8_str, uint32_t appid, vector<s
 
				     return;
			
 
				 }
			
 
				 
			
 
				-void NgramSegment::fmm(iutf8string& phrase, uint32_t appid, vector<string>& fmm_list) {
			
 
				+void NgramSegment::fmm(const string& str, uint32_t appid, vector<string>& fmm_list) {
			
 
				+	iutf8string phrase(str);
			
 
				     int maxlen = MAX_WORD_LEN;
			
 
				     int len_phrase = phrase.length();
			
 
				     int i = 0, j = 0;
			
@@ -129,7 +130,8 @@ void NgramSegment::fmm(iutf8string& phrase, uint32_t appid, vector<string>& fmm_
 
				     return;
			
 
				 }
			
 
				 
			
 
				-void NgramSegment::bmm(iutf8string& phrase, uint32_t appid, vector<string>& bmm_list) {
			
 
				+void NgramSegment::bmm(const string& str, uint32_t appid, vector<string>& bmm_list) {
			
 
				+	iutf8string phrase(str);
			
 
				     int maxlen = MAX_WORD_LEN;
			
 
				     int len_phrase = phrase.length();
			
 
				     int i = len_phrase, j = 0;
			
--- a/src/comm/segment/ngram_segment.h
+++ b/src/comm/segment/ngram_segment.h
@@ -25,10 +25,10 @@ class NgramSegment: public Segment
 
				 public:
			
 
				     NgramSegment();
			
 
				     ~NgramSegment();
			
 
				-    virtual void ConcreteSplit(iutf8string& phrase, uint32_t appid, vector<string>& vec);
			
 
				+    virtual void ConcreteSplit(const string& str, uint32_t appid, vector<string>& vec);
			
 
				 private:
			
 
				-    void fmm(iutf8string& phrase, uint32_t appid, vector<string>& vec);
			
 
				-    void bmm(iutf8string& phrase, uint32_t appid, vector<string>& vec);
			
 
				+    void fmm(const string& str, uint32_t appid, vector<string>& vec);
			
 
				+    void bmm(const string& str, uint32_t appid, vector<string>& vec);
			
 
				     double calSegProbability(const vector<string>& vec);
			
 
				     bool getWordInfo(string word, uint32_t appid, WordInfo& word_info);
			
 
				 };
			
--- a/src/comm/segment/segment.cc
+++ b/src/comm/segment/segment.cc
@@ -89,7 +89,8 @@ bool Segment::Init(string word_path, string train_path){
 
				     return true;
			
 
				 }
			
 
				 
			
 
				-void Segment::Split(iutf8string& phrase, uint32_t appid, vector<string>& new_res_all, bool hmm_flag){
			
 
				+void Segment::Split(const string& str, uint32_t appid, vector<string>& new_res_all, bool hmm_flag){
			
 
				+	iutf8string phrase(str);
			
 
				     vector<string> sen_list;
			
 
				     set<string> special_set;  // 记录英文和数字字符串
			
 
				     string tmp_words = "";
			
@@ -137,9 +138,8 @@ void Segment::Split(iutf8string& phrase, uint32_t appid, vector<string>& new_res
 
				     for (int i = 0; i < (int)sen_list.size(); i++) {
			
 
				         // special_set中保存了连续的字母数字串，不需要进行分词
			
 
				         if (special_set.find(sen_list[i]) == special_set.end() && punct_set_.find(sen_list[i]) == punct_set_.end()) {
			
 
				-            iutf8string utf8_str(sen_list[i]);
			
 
				             vector<string> parse_list;
			
 
				-            ConcreteSplit(utf8_str, appid, parse_list);
			
 
				+            ConcreteSplit(sen_list[i], appid, parse_list);
			
 
				             res_all.insert(res_all.end(), parse_list.begin(), parse_list.end());
			
 
				         }else { // 英文或数字需要放入到res_all，标点符号不需要
			
 
				             if(punct_set_.find(sen_list[i]) == punct_set_.end()){
			
@@ -196,10 +196,10 @@ void Segment::dealByHmmMgr(uint32_t appid, const vector<string>& res_all, vector
 
				     }
			
 
				 }
			
 
				 
			
 
				-void Segment::CutForSearch(iutf8string& phrase, uint32_t appid, vector<vector<string> >& search_res_all) {
			
 
				+void Segment::CutForSearch(const string& str, uint32_t appid, vector<vector<string> >& search_res_all) {
			
 
				     // 搜索引擎模式
			
 
				     vector<string> new_res_all;
			
 
				-    Split(phrase, appid, new_res_all);
			
 
				+    Split(str, appid, new_res_all);
			
 
				     for (size_t i = 0; i < new_res_all.size(); i++) {
			
 
				         vector<string> vec;
			
 
				         iutf8string utf8_str(new_res_all[i]);
			
@@ -238,7 +238,8 @@ bool Segment::isAllAlphaOrDigit(string str) {
 
				     return flag;
			
 
				 }
			
 
				 
			
 
				-void Segment::CutNgram(iutf8string& phrase, vector<string>& search_res, uint32_t n) {
			
 
				+void Segment::CutNgram(const string& str, vector<string>& search_res, uint32_t n) {
			
 
				+	iutf8string phrase(str);
			
 
				     uint32_t N = (n > (uint32_t)phrase.length()) ? (uint32_t)phrase.length() : n;
			
 
				     for (size_t i = 1; i <= N; i++) {
			
 
				         for (size_t j = 0; j < (size_t)phrase.length() - i + 1; j++) {
			
--- a/src/comm/segment/segment.h
+++ b/src/comm/segment/segment.h
@@ -40,10 +40,10 @@ public:
 
				     Segment();
			
 
				     virtual ~Segment();
			
 
				     virtual bool Init(string word_path, string train_path);
			
 
				-    void CutForSearch(iutf8string& phrase, uint32_t appid, vector<vector<string> >& search_res_all);
			
 
				-    void CutNgram(iutf8string& phrase, vector<string>& search_res, uint32_t n);
			
 
				-    void Split(iutf8string& phrase, uint32_t appid, vector<string>& vec, bool hmm_flag = false);
			
 
				-    virtual void ConcreteSplit(iutf8string& phrase, uint32_t appid, vector<string>& vec) = 0;
			
 
				+    void CutForSearch(const string& str, uint32_t appid, vector<vector<string> >& search_res_all);
			
 
				+    void CutNgram(const string& str, vector<string>& search_res, uint32_t n);
			
 
				+    void Split(const string& str, uint32_t appid, vector<string>& vec, bool hmm_flag = false);
			
 
				+    virtual void ConcreteSplit(const string& str, uint32_t appid, vector<string>& vec) = 0;
			
 
				 
			
 
				 protected:
			
 
				     bool isAllAlphaOrDigit(string str);