Browse Source

Optimize Split parameters: take const string& instead of iutf8string&

shzhulin3 2 years ago
parent
commit
8d33bf2ef4

+ 2 - 1
src/comm/segment/bmm_segment.cc

@@ -8,7 +8,8 @@ BmmSegment::~BmmSegment()
 {
 }
 
-void BmmSegment::ConcreteSplit(iutf8string& phrase, uint32_t appid, vector<string>& bmm_list){
+void BmmSegment::ConcreteSplit(const string& str, uint32_t appid, vector<string>& bmm_list){
+	iutf8string phrase(str);
     int maxlen = MAX_WORD_LEN;
     int len_phrase = phrase.length();
     int i = len_phrase, j = 0;

+ 1 - 1
src/comm/segment/bmm_segment.h

@@ -27,7 +27,7 @@ private:
 public:
     BmmSegment();
     ~BmmSegment();
-    virtual void ConcreteSplit(iutf8string& phrase, uint32_t appid, vector<string>& vec);
+    virtual void ConcreteSplit(const string& str, uint32_t appid, vector<string>& vec);
 };
 
 

+ 2 - 2
src/comm/segment/custom_segment.cc

@@ -44,9 +44,9 @@ bool CustomSegment::Init(string word_path, string train_path){
     return true;
 }
 
-void CustomSegment::ConcreteSplit(iutf8string& phrase, uint32_t appid, vector<string>& vec){
+void CustomSegment::ConcreteSplit(const string& str, uint32_t appid, vector<string>& vec){
     char res[100] = {'\0'};
-    word_split_func_(phrase.stlstring().c_str(), res, 100);
+    word_split_func_(str.c_str(), res, 100);
     string tmp = "";
     for(int i = 0; i < strlen(res); i++){
         if(res[i] != ' '){

+ 1 - 1
src/comm/segment/custom_segment.h

@@ -29,7 +29,7 @@ public:
     CustomSegment();
     ~CustomSegment();
     virtual bool Init(string word_path, string train_path);
-    virtual void ConcreteSplit(iutf8string& phrase, uint32_t appid, vector<string>& vec);
+    virtual void ConcreteSplit(const string& str, uint32_t appid, vector<string>& vec);
 private:
     CConfig* cache_config_;
     split_interface word_split_func_;

+ 2 - 1
src/comm/segment/dag_segment.cc

@@ -10,7 +10,8 @@ DagSegment::~DagSegment()
 {
 }
 
-void DagSegment::ConcreteSplit(iutf8string& sentence, uint32_t appid, vector<string>& vec){
+void DagSegment::ConcreteSplit(const string& str, uint32_t appid, vector<string>& vec){
+	iutf8string sentence(str);
     map<uint32_t, vector<uint32_t> > dag_map;
     getDag(sentence, appid, dag_map);
     map<uint32_t, RouteValue> route;

+ 1 - 1
src/comm/segment/dag_segment.h

@@ -34,7 +34,7 @@ class DagSegment: public Segment
 public:
     DagSegment();
     ~DagSegment();
-    virtual void ConcreteSplit(iutf8string& phrase, uint32_t appid, vector<string>& vec);
+    virtual void ConcreteSplit(const string& str, uint32_t appid, vector<string>& vec);
 private:
     void getDag(iutf8string& sentence, uint32_t appid, map<uint32_t, vector<uint32_t> >& dag_map);
     void calc(iutf8string& sentence, const map<uint32_t, vector<uint32_t> >& dag_map, map<uint32_t, RouteValue>& route, uint32_t appid);

+ 2 - 1
src/comm/segment/fmm_segment.cc

@@ -8,7 +8,8 @@ FmmSegment::~FmmSegment()
 {
 }
 
-void FmmSegment::ConcreteSplit(iutf8string& phrase, uint32_t appid, vector<string>& fmm_list){
+void FmmSegment::ConcreteSplit(const string& str, uint32_t appid, vector<string>& fmm_list){
+	iutf8string phrase(str);
     int maxlen = MAX_WORD_LEN;
     int len_phrase = phrase.length();
     int i = 0, j = 0;

+ 1 - 1
src/comm/segment/fmm_segment.h

@@ -27,7 +27,7 @@ private:
 public:
     FmmSegment();
     ~FmmSegment();
-    virtual void ConcreteSplit(iutf8string& phrase, uint32_t appid, vector<string>& vec);
+    virtual void ConcreteSplit(const string& str, uint32_t appid, vector<string>& vec);
 };
 
 

+ 7 - 5
src/comm/segment/ngram_segment.cc

@@ -9,11 +9,11 @@ NgramSegment::~NgramSegment()
 {
 }
 
-void NgramSegment::ConcreteSplit(iutf8string& utf8_str, uint32_t appid, vector<string>& parse_list){
+void NgramSegment::ConcreteSplit(const string& str, uint32_t appid, vector<string>& parse_list){
     vector<string> parse_list1;
     vector<string> parse_list2;
-    fmm(utf8_str, appid, parse_list1);
-    bmm(utf8_str, appid, parse_list2);
+    fmm(str, appid, parse_list1);
+    bmm(str, appid, parse_list2);
     parse_list1.insert(parse_list1.begin(), "<BEG>");
     parse_list1.push_back("<END>");
     parse_list2.insert(parse_list2.begin(), "<BEG>");
@@ -101,7 +101,8 @@ void NgramSegment::ConcreteSplit(iutf8string& utf8_str, uint32_t appid, vector<s
     return;
 }
 
-void NgramSegment::fmm(iutf8string& phrase, uint32_t appid, vector<string>& fmm_list) {
+void NgramSegment::fmm(const string& str, uint32_t appid, vector<string>& fmm_list) {
+	iutf8string phrase(str);
     int maxlen = MAX_WORD_LEN;
     int len_phrase = phrase.length();
     int i = 0, j = 0;
@@ -129,7 +130,8 @@ void NgramSegment::fmm(iutf8string& phrase, uint32_t appid, vector<string>& fmm_
     return;
 }
 
-void NgramSegment::bmm(iutf8string& phrase, uint32_t appid, vector<string>& bmm_list) {
+void NgramSegment::bmm(const string& str, uint32_t appid, vector<string>& bmm_list) {
+	iutf8string phrase(str);
     int maxlen = MAX_WORD_LEN;
     int len_phrase = phrase.length();
     int i = len_phrase, j = 0;

+ 3 - 3
src/comm/segment/ngram_segment.h

@@ -25,10 +25,10 @@ class NgramSegment: public Segment
 public:
     NgramSegment();
     ~NgramSegment();
-    virtual void ConcreteSplit(iutf8string& phrase, uint32_t appid, vector<string>& vec);
+    virtual void ConcreteSplit(const string& str, uint32_t appid, vector<string>& vec);
 private:
-    void fmm(iutf8string& phrase, uint32_t appid, vector<string>& vec);
-    void bmm(iutf8string& phrase, uint32_t appid, vector<string>& vec);
+    void fmm(const string& str, uint32_t appid, vector<string>& vec);
+    void bmm(const string& str, uint32_t appid, vector<string>& vec);
     double calSegProbability(const vector<string>& vec);
     bool getWordInfo(string word, uint32_t appid, WordInfo& word_info);
 };

+ 7 - 6
src/comm/segment/segment.cc

@@ -89,7 +89,8 @@ bool Segment::Init(string word_path, string train_path){
     return true;
 }
 
-void Segment::Split(iutf8string& phrase, uint32_t appid, vector<string>& new_res_all, bool hmm_flag){
+void Segment::Split(const string& str, uint32_t appid, vector<string>& new_res_all, bool hmm_flag){
+	iutf8string phrase(str);
     vector<string> sen_list;
     set<string> special_set;  // 记录英文和数字字符串
     string tmp_words = "";
@@ -137,9 +138,8 @@ void Segment::Split(iutf8string& phrase, uint32_t appid, vector<string>& new_res
     for (int i = 0; i < (int)sen_list.size(); i++) {
         // special_set中保存了连续的字母数字串,不需要进行分词
         if (special_set.find(sen_list[i]) == special_set.end() && punct_set_.find(sen_list[i]) == punct_set_.end()) {
-            iutf8string utf8_str(sen_list[i]);
             vector<string> parse_list;
-            ConcreteSplit(utf8_str, appid, parse_list);
+            ConcreteSplit(sen_list[i], appid, parse_list);
             res_all.insert(res_all.end(), parse_list.begin(), parse_list.end());
         }else { // 英文或数字需要放入到res_all,标点符号不需要
             if(punct_set_.find(sen_list[i]) == punct_set_.end()){
@@ -196,10 +196,10 @@ void Segment::dealByHmmMgr(uint32_t appid, const vector<string>& res_all, vector
     }
 }
 
-void Segment::CutForSearch(iutf8string& phrase, uint32_t appid, vector<vector<string> >& search_res_all) {
+void Segment::CutForSearch(const string& str, uint32_t appid, vector<vector<string> >& search_res_all) {
     // 搜索引擎模式
     vector<string> new_res_all;
-    Split(phrase, appid, new_res_all);
+    Split(str, appid, new_res_all);
     for (size_t i = 0; i < new_res_all.size(); i++) {
         vector<string> vec;
         iutf8string utf8_str(new_res_all[i]);
@@ -238,7 +238,8 @@ bool Segment::isAllAlphaOrDigit(string str) {
     return flag;
 }
 
-void Segment::CutNgram(iutf8string& phrase, vector<string>& search_res, uint32_t n) {
+void Segment::CutNgram(const string& str, vector<string>& search_res, uint32_t n) {
+	iutf8string phrase(str);
     uint32_t N = (n > (uint32_t)phrase.length()) ? (uint32_t)phrase.length() : n;
     for (size_t i = 1; i <= N; i++) {
         for (size_t j = 0; j < (size_t)phrase.length() - i + 1; j++) {

+ 4 - 4
src/comm/segment/segment.h

@@ -40,10 +40,10 @@ public:
     Segment();
     virtual ~Segment();
     virtual bool Init(string word_path, string train_path);
-    void CutForSearch(iutf8string& phrase, uint32_t appid, vector<vector<string> >& search_res_all);
-    void CutNgram(iutf8string& phrase, vector<string>& search_res, uint32_t n);
-    void Split(iutf8string& phrase, uint32_t appid, vector<string>& vec, bool hmm_flag = false);
-    virtual void ConcreteSplit(iutf8string& phrase, uint32_t appid, vector<string>& vec) = 0;
+    void CutForSearch(const string& str, uint32_t appid, vector<vector<string> >& search_res_all);
+    void CutNgram(const string& str, vector<string>& search_res, uint32_t n);
+    void Split(const string& str, uint32_t appid, vector<string>& vec, bool hmm_flag = false);
+    virtual void ConcreteSplit(const string& str, uint32_t appid, vector<string>& vec) = 0;
 
 protected:
     bool isAllAlphaOrDigit(string str);