/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: GPL 2.0
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License. You should have
* received a copy of the GPL license along with this program; if you
* did not, you can find it at http://www.gnu.org/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Coreseek.com code.
*
* Copyright (C) 2007-2008. All Rights Reserved.
*
* Author:
* Li monan
*
* ***** END LICENSE BLOCK ***** */
#ifndef _MM_THUNK_H_
#define _MM_THUNK_H_
#include <math.h>
#include <stdio.h>
#include <string.h>

#include <vector>

#include "UnigramDict.h"
#include "freelist.h"
#define CHUNK_BUFFER_SIZE 1024
#define CHUNK_DEBUG 0
namespace css {
class Chunk {
public:
Chunk() : m_free_score(0.0), total_length(0) {}
float m_free_score;
int total_length;
std::vector tokens;
std::vector freqs;
inline void pushToken(u2 len, u2 freq) {
#if CHUNK_DEBUG
printf("pt:%d, %d;\t", len, freq);
#endif
tokens.push_back(len);
total_length += len;
freqs.push_back(freq);
// m_free_score += log((float)freq) * 100;
}
inline float get_free() {
// m_free_score
float score = 0.0;
std::vector::iterator it;
float freq = 0;
for (it = freqs.begin(); it < freqs.end(); it++) {
freq = ((float)*it) + 1;
score += log(freq) * 100;
}
return score;
}
inline float get_avl() {
float avg = (float)1.0 * total_length / tokens.size();
return avg;
}
inline float get_avg() {
float avg = (float)1.0 * total_length / tokens.size();
std::vector::iterator it;
float total = 0;
for (it = tokens.begin(); it < tokens.end(); it++) {
float diff = ((*it) - avg);
total += diff * diff;
}
return (float)1.0 * total / (tokens.size() - 1);
}
inline void popup() {
if (tokens.size()) {
total_length -= tokens[tokens.size() - 1];
tokens.pop_back();
freqs.pop_back();
}
}
inline void reset() {
tokens.clear();
freqs.clear();
total_length = 0;
}
};
class ChunkQueue {
public:
ChunkQueue() : max_length(0){};
public:
void push(Chunk& ck) {
if (ck.total_length < max_length) return; // rule:1
if (ck.total_length > max_length) {
max_length = ck.total_length;
m_chunks.clear();
}
m_chunks.push_back(ck);
};
u2 getToken() {
size_t num_chunk = m_chunks.size();
if (!num_chunk) return 0;
if (num_chunk == 1) return m_chunks[0].tokens[0];
// debug use->dump chunk
#if CHUNK_DEBUG
for (size_t i = 0; i < num_chunk; i++) {
for (size_t j = 0; j < m_chunks[i].tokens.size(); j++)
printf("%d,", m_chunks[i].tokens[j]);
printf("\n");
}
#endif
// do filter
// apply rule 2
float avg_length = 0;
u4 remains[256]; // m_chunks.size can not larger than 256;
u4* k_ptr = remains;
for (size_t i = 0; i < m_chunks.size(); i++) {
float avl = m_chunks[i].get_avl();
if (avl > avg_length) {
avg_length = avl;
k_ptr = remains;
*k_ptr = (u4)i;
k_ptr++;
} else if (avl == avg_length) {
*k_ptr = (u4)i;
k_ptr++;
}
}
if ((k_ptr - remains) == 1)
return m_chunks[remains[0]].tokens[0]; // match by rule2
// apply rule 3
u4 remains_r3[256];
u4* k_ptr_r3 = remains_r3;
avg_length = 1024 * 64; // an unreachable avg
for (size_t i = 0; i < k_ptr - remains; i++) {
float avg = m_chunks[remains[i]].get_avg();
if (avg < avg_length) {
avg_length = avg;
k_ptr_r3 = remains_r3;
*k_ptr_r3 = (u4)remains[i]; //*k_ptr_r3 = (u4)i;
k_ptr_r3++;
} else if (avg == avg_length) {
*k_ptr_r3 = (u4)i;
k_ptr_r3++;
}
}
if ((k_ptr_r3 - remains_r3) == 1)
return m_chunks[remains_r3[0]].tokens[0]; // match by rule3 min
// avg_length
// apply r4 max freedom
float max_score = 0.0;
size_t idx = -1;
for (size_t i = 0; i < k_ptr_r3 - remains_r3; i++) {
float score = m_chunks[remains_r3[i]].get_free();
if (score > max_score) {
max_score = score;
idx = remains_r3[i];
}
}
return m_chunks[idx].tokens[0];
// return 0;
};
inline void reset() {
m_chunks.clear();
max_length = 0;
};
protected:
std::vector m_chunks;
i4 max_length;
};
// Small record holding a frequency value and a list of items; MMThunk keeps
// arrays of pointers to these.
class item_info {
 public:
  // Starts with a zero frequency and an empty item list.
  item_info() : freq(0) {}

  // A `u4 length` member used to sit here; it is disabled in the source.
  u4 freq;
  // NOTE(review): this declaration carries no element type in the text under
  // review, so it cannot compile as written. The intended type is not
  // visible from this header - confirm against the users of `items`.
  std::vector items;
};
class MMThunk {
public:
MMThunk() : base_offset(0), m_max_length(-1), m_length(0) {
memset(m_charinfos, 0, sizeof(item_info*) * CHUNK_BUFFER_SIZE);
memset(m_kwinfos, 0, sizeof(item_info*) * CHUNK_BUFFER_SIZE);
item_list.set_size(CHUNK_BUFFER_SIZE * 2);
};
~MMThunk(){};
void setItems(i4 idx, u2 rs_count, UnigramDict::result_pair_type* results);
void setKwItems(i4 idx, u2 rs_count, UnigramDict::result_pair_type* results);
void advance(u2 step) { base_offset += step; };
// peek the current token
u1* peekToken(u2& length);
u2 popupToken();
u1* peekKwToken(u2& pos, u2& length);
u2 popupKwToken();
int Tokenize();
void pushToken(u2 aSize, i4 base);
void reset();
u4 length() { return m_length; };
protected:
u2 base_offset;
CRFPP::FreeList item_list;
item_info* m_charinfos[CHUNK_BUFFER_SIZE];
std::vector tokens;
item_info* m_kwinfos[CHUNK_BUFFER_SIZE];
i4 m_kw_pos;
i4 m_kw_ipos;
i4 m_max_length;
u4 m_length;
ChunkQueue m_queue;
protected:
void pushChunk(Chunk& ck);
};
}
#endif