Browse Source

feat: cache core.

yangshuang70 2 years ago
parent
commit
6bf5325319
48 changed files with 18621 additions and 0 deletions
  1. 186 0
      src/core/chain/barrier_ask_answer_chain.cc
  2. 102 0
      src/core/chain/barrier_ask_answer_chain.h
  3. 31 0
      src/core/chain/black_hole_ask_chain.cc
  4. 42 0
      src/core/chain/black_hole_ask_chain.h
  5. 28 0
      src/core/chain/buffer_bypass_answer_chain.cc
  6. 32 0
      src/core/chain/buffer_bypass_answer_chain.h
  7. 47 0
      src/core/chain/buffer_bypass_ask_chain.cc
  8. 42 0
      src/core/chain/buffer_bypass_ask_chain.h
  9. 42 0
      src/core/chain/buffer_process_answer_chain.cc
  10. 73 0
      src/core/chain/buffer_process_answer_chain.h
  11. 3962 0
      src/core/chain/buffer_process_ask_chain.cc
  12. 524 0
      src/core/chain/buffer_process_ask_chain.h
  13. 76 0
      src/core/chain/job_procedure.cc
  14. 138 0
      src/core/chain/system_command_ask_chain.cc
  15. 59 0
      src/core/chain/system_command_ask_chain.h
  16. 68 0
      src/core/hotbk/hb_feature.cc
  17. 77 0
      src/core/hotbk/hb_feature.h
  18. 214 0
      src/core/hotbk/hb_log.cc
  19. 62 0
      src/core/hotbk/hb_log.h
  20. 191 0
      src/core/hotbk/hot_backup_ask_chain.cc
  21. 58 0
      src/core/hotbk/hot_backup_ask_chain.h
  22. 163 0
      src/core/mem/feature.cc
  23. 88 0
      src/core/mem/feature.h
  24. 51 0
      src/core/mem/fence_queue.h
  25. 127 0
      src/core/mem/mallocator.h
  26. 1559 0
      src/core/mem/pt_malloc.cc
  27. 395 0
      src/core/mem/pt_malloc.h
  28. 19 0
      src/core/mem/sys_malloc.cc
  29. 191 0
      src/core/mem/sys_malloc.h
  30. 20 0
      src/core/misc/dtc_code.h
  31. 699 0
      src/core/misc/main_supply.cc
  32. 87 0
      src/core/misc/main_supply.h
  33. 288 0
      src/core/misc/mysql_error.h
  34. 35 0
      src/core/misc/purge_processor.h
  35. 40 0
      src/core/misc/reader_interface.h
  36. 1179 0
      src/core/raw/raw_data.cc
  37. 470 0
      src/core/raw/raw_data.h
  38. 1181 0
      src/core/raw/raw_data_process.cc
  39. 126 0
      src/core/raw/raw_data_process.h
  40. 98 0
      src/core/task/task_pendlist.cc
  41. 63 0
      src/core/task/task_pendlist.h
  42. 1726 0
      src/core/tree/t_tree.cc
  43. 354 0
      src/core/tree/t_tree.h
  44. 2028 0
      src/core/tree/tree_data.cc
  45. 574 0
      src/core/tree/tree_data.h
  46. 81 0
      src/core/tree/tree_data_keycmp.h
  47. 728 0
      src/core/tree/tree_data_process.cc
  48. 197 0
      src/core/tree/tree_data_process.h

+ 186 - 0
src/core/chain/barrier_ask_answer_chain.cc

@@ -0,0 +1,186 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <stdio.h>
+
+#include <fence_queue.h>
+#include <barrier_ask_answer_chain.h>
+#include <poll/poller_base.h>
+
+#include "log/log.h"
+
+//-------------------------------------------------------------------------
+/*
+ * Build a barrier stage bound to poller `o`.
+ *   max         - limit on the number of barriers in use at once
+ *   maxkeycount - limit on the number of jobs queued behind one key
+ *   place       - IN_FRONT or IN_BACK; selects which statistics counters
+ *                 this instance reports into
+ */
+BarrierAskAnswerChain::BarrierAskAnswerChain(PollerBase *o, int max,
+					     int maxkeycount,
+					     E_BARRIER_UNIT_PLACE place)
+	: JobAskInterface<DTCJobOperation>(o), count(0), max_barrier(max),
+	  max_key_count_(maxkeycount), main_chain(o)
+{
+	// Start with an empty free list and empty hash buckets.
+	free_list.InitList();
+	for (int slot = 0; slot < BARRIER_HASH_MAX; ++slot)
+		hash_slot_[slot].InitList();
+
+	// Bind the statistics counters that match the placement of this unit.
+	switch (place) {
+	case IN_FRONT:
+		stat_barrier_count = g_stat_mgr.get_stat_int_counter(
+			DTC_FRONT_BARRIER_COUNT);
+		stat_barrier_max_task = g_stat_mgr.get_stat_int_counter(
+			DTC_FRONT_BARRIER_MAX_TASK);
+		break;
+	case IN_BACK:
+		stat_barrier_count =
+			g_stat_mgr.get_stat_int_counter(DTC_BACK_BARRIER_COUNT);
+		stat_barrier_max_task = g_stat_mgr.get_stat_int_counter(
+			DTC_BACK_BARRIER_MAX_TASK);
+		break;
+	default:
+		// NOTE(review): the counters stay unbound on this path yet are
+		// still assigned below -- confirm StatCounter tolerates that.
+		log4cplus_error("bad place value %d", place);
+		break;
+	}
+
+	// Both statistics are reset to zero whichever branch was taken.
+	stat_barrier_count = 0;
+	stat_barrier_max_task = 0;
+}
+
+/*
+ * Delete every BarrierQueue still linked on the free list or in any hash
+ * bucket.
+ */
+BarrierAskAnswerChain::~BarrierAskAnswerChain()
+{
+	// NOTE(review): each loop only terminates if deleting a queue also
+	// unlinks it from its list -- confirm in BarrierQueue/ListObject.
+	while (!free_list.ListEmpty())
+		delete static_cast<BarrierQueue *>(free_list.ListNext());
+
+	for (int slot = 0; slot < BARRIER_HASH_MAX; ++slot) {
+		ListObject<BarrierQueue> &bucket = hash_slot_[slot];
+		while (!bucket.ListEmpty())
+			delete static_cast<BarrierQueue *>(bucket.ListNext());
+	}
+}
+
+/*
+ * Look up the barrier queue registered for `key`.
+ * Walks the hash bucket selected by key2idx(key) and returns the first queue
+ * whose key() equals `key`, or NULL when the bucket holds no such queue.
+ */
+BarrierQueue *BarrierAskAnswerChain::get_barrier(unsigned long key)
+{
+	ListObject<BarrierQueue> *const head = &hash_slot_[key2idx(key)];
+	ListObject<BarrierQueue> *node = head->ListNext();
+
+	// The bucket is circular: arriving back at the head ends the walk.
+	while (node != head) {
+		BarrierQueue *candidate = node->ListOwner();
+		if (candidate->key() == key)
+			return candidate;
+		node = node->ListNext();
+	}
+
+	return NULL;
+}
+
+/*
+ * Return the first barrier queue stored in hash bucket `idx`.
+ * Returns NULL when `idx` is out of range or when the bucket is empty.
+ */
+BarrierQueue *BarrierAskAnswerChain::get_barrier_by_idx(unsigned long idx)
+{
+	if (idx >= BARRIER_HASH_MAX)
+		return NULL;
+
+	ListObject<BarrierQueue> *head = &hash_slot_[idx];
+
+	// In an empty bucket ListNext() leads straight back to the head
+	// sentinel (get_barrier() ends its walk on exactly that condition).
+	// The sentinel is not a BarrierQueue, so it must not be handed to
+	// ListOwner().
+	if (head->ListEmpty())
+		return NULL;
+
+	return head->ListNext()->ListOwner();
+}
+
+/*
+ * Retire a barrier: move it onto the free list, decrement the barrier
+ * count and publish the new count to the statistics counter.
+ */
+void BarrierAskAnswerChain::attach_free_barrier(BarrierQueue *barrier)
+{
+	barrier->ListMove(free_list);
+
+	--count;
+	stat_barrier_count = count;
+}
+
+/*
+ * Request-side entry point.
+ * System commands (except Migrate) and batch requests are forwarded
+ * immediately. Every other job is serialized by its barrier key: the job that
+ * creates a barrier is forwarded right away, later jobs for the same key are
+ * queued behind it, and jobs are answered with EC_SERVER_BUSY when either the
+ * per-key limit or the barrier limit is reached.
+ */
+void BarrierAskAnswerChain::job_ask_procedure(DTCJobOperation *job_operation)
+{
+	log4cplus_debug("enter job_ask_procedure");
+
+	// The Migrate command already had its PackedKey and hash computed in
+	// PrepareRequest, so it has to queue together with ordinary jobs; all
+	// other system commands skip the barrier.
+	const bool skips_barrier =
+		(job_operation->request_code() ==
+			 DRequest::TYPE_SYSTEM_COMMAND &&
+		 job_operation->requestInfo.admin_code() !=
+			 DRequest::SystemCommand::Migrate) ||
+		job_operation->is_batch_request();
+	if (skips_barrier) {
+		chain_request(job_operation);
+		return;
+	}
+
+	const unsigned long key = job_operation->barrier_key();
+	BarrierQueue *barrier = get_barrier(key);
+
+	if (barrier == NULL) {
+		if (count >= max_barrier) {
+			log4cplus_warning("too many barriers, count=%d", count);
+			job_operation->set_error(-EC_SERVER_BUSY, __FUNCTION__,
+						 "too many barriers");
+			job_operation->turn_around_job_answer();
+		} else {
+			// Recycle a retired queue when one is available.
+			if (free_list.ListEmpty())
+				barrier = new BarrierQueue(&task_queue_allocator);
+			else
+				barrier = free_list.NextOwner();
+
+			barrier->set_key(key);
+			barrier->list_move_tail(hash_slot_[key2idx(key)]);
+			barrier->Push(job_operation);
+			++count;
+			stat_barrier_count = count; // barrier number
+
+			// The job that opens the barrier proceeds immediately.
+			chain_request(job_operation);
+		}
+	} else if (barrier->Count() < max_key_count_) {
+		// Wait behind the job currently at the head of the barrier.
+		barrier->Push(job_operation);
+		if (barrier->Count() > stat_barrier_max_task) // max key number
+			stat_barrier_max_task = barrier->Count();
+	} else {
+		log4cplus_warning(
+			"barrier[%s]: overload max key count %d bars %d",
+			owner->Name(), max_key_count_, count);
+		job_operation->set_error(-EC_SERVER_BUSY, __FUNCTION__,
+					 "too many request blocked at key");
+		job_operation->turn_around_job_answer();
+	}
+
+	log4cplus_debug("leave job_ask_procedure");
+}
+
+/*
+ * Reply-side entry point.
+ * Jobs that skipped the barrier on the way in (system commands other than
+ * Migrate, batch requests) are passed straight back. For all other jobs the
+ * finished job is popped from the head of its barrier; the next queued job,
+ * if any, is released, otherwise the barrier is retired. The reply is always
+ * passed back to the previous stage at the end.
+ */
+void BarrierAskAnswerChain::job_answer_procedure(DTCJobOperation *job_operation)
+{
+	const bool skipped_barrier =
+		(job_operation->request_code() ==
+			 DRequest::TYPE_SYSTEM_COMMAND &&
+		 job_operation->requestInfo.admin_code() !=
+			 DRequest::SystemCommand::Migrate) ||
+		job_operation->is_batch_request();
+	if (skipped_barrier) {
+		job_operation->turn_around_job_answer();
+		return;
+	}
+
+	const unsigned long key = job_operation->barrier_key();
+	BarrierQueue *barrier = get_barrier(key);
+
+	if (barrier == NULL) {
+		log4cplus_error("return job not in barrier, key=%lu", key);
+	} else if (barrier->Front() != job_operation) {
+		log4cplus_error("return job not barrier header, key=%lu", key);
+	} else {
+		if (barrier->Count() == stat_barrier_max_task) // max key number
+			stat_barrier_max_task--;
+		barrier->Pop();
+
+		DTCJobOperation *successor = barrier->Front();
+		if (successor != NULL)
+			queue_request(successor);
+		else
+			attach_free_barrier(barrier);
+	}
+
+	job_operation->turn_around_job_answer();
+}

+ 102 - 0
src/core/chain/barrier_ask_answer_chain.h

@@ -0,0 +1,102 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef __BARRIER_ASK_ANSWER_CHAIN__
+#define __BARRIER_ASK_ANSWER_CHAIN__
+
+#include <stdint.h>
+#include <list/list.h>
+#include "task/task_request.h"
+#include "timer/timer_list.h"
+#include "fence_queue.h"
+#include "stat_dtc.h"
+
+/*
+ * Number of hash buckets used to index barriers by key.
+ * The value is parenthesized so that expressions such as
+ * `key % BARRIER_HASH_MAX` are not evaluated as `(key % 1024) * 8`.
+ */
+#define BARRIER_HASH_MAX (1024 * 8)
+
+class DTCJobOperation;
+class PollerBase;
+class BarrierAskAnswerChain;
+
+/*
+ * Chain stage that serializes jobs sharing the same barrier key.
+ * It sits on both the request path (JobAskInterface) and the reply path
+ * (JobAnswerInterface): requests are queued per key in BarrierQueue objects
+ * kept in a fixed-size hash table, and replies release the next queued job.
+ */
+class BarrierAskAnswerChain : public JobAskInterface<DTCJobOperation>,
+			      public JobAnswerInterface<DTCJobOperation> {
+    public:
+	// Selects which set of statistics counters the instance reports into.
+	enum E_BARRIER_UNIT_PLACE { IN_FRONT, IN_BACK };
+	BarrierAskAnswerChain(PollerBase *, int max, int maxkeycount,
+			      E_BARRIER_UNIT_PLACE place);
+	~BarrierAskAnswerChain();
+
+	virtual void job_ask_procedure(DTCJobOperation *);
+	virtual void job_answer_procedure(DTCJobOperation *);
+
+	// Register this stage for the reply, then pass the job to the next
+	// stage via a direct call.
+	void chain_request(DTCJobOperation *p)
+	{
+		p->push_reply_dispatcher(this);
+		main_chain.job_ask_procedure(p);
+	}
+
+	// Register this stage for the reply, then pass the job to the next
+	// stage through main_chain.indirect_notify().
+	void queue_request(DTCJobOperation *p)
+	{
+		p->push_reply_dispatcher(this);
+		main_chain.indirect_notify(p);
+	}
+
+	PollerBase *owner_thread(void) const
+	{
+		return owner;
+	}
+	// Move a barrier to the free list and decrement the barrier count.
+	void attach_free_barrier(BarrierQueue *);
+	// Maximum number of jobs allowed to queue behind one key.
+	int max_count_by_key(void) const
+	{
+		return max_key_count_;
+	}
+	void register_next_chain(JobAskInterface<DTCJobOperation> *p)
+	{
+		main_chain.register_next_chain(p);
+	}
+	// Number of barriers currently in use.
+	int barrier_count() const
+	{
+		return count;
+	}
+	ChainJoint<DTCJobOperation> *get_main_chain()
+	{
+		return &main_chain;
+	}
+
+    protected:
+	int count; // barriers currently linked into hash_slot_
+	LinkQueue<DTCJobOperation *>::allocator task_queue_allocator;
+	ListObject<BarrierQueue> free_list; // retired queues kept for reuse
+	ListObject<BarrierQueue> hash_slot_[BARRIER_HASH_MAX];
+	int max_barrier; // limit on `count`
+
+	BarrierQueue *get_barrier(unsigned long key);
+	BarrierQueue *get_barrier_by_idx(unsigned long idx);
+	// Map a barrier key to its bucket index in hash_slot_.
+	int key2idx(unsigned long key)
+	{
+		// Keep the macro parenthesized here: if it expands to a bare
+		// `1024 * 8`, then `key % BARRIER_HASH_MAX` is evaluated as
+		// `(key % 1024) * 8` and only every eighth bucket is used.
+		return key % (BARRIER_HASH_MAX);
+	}
+
+    private:
+	int max_key_count_;
+
+	ChainJoint<DTCJobOperation> main_chain;
+
+	//stat
+	StatCounter stat_barrier_count;
+	StatCounter stat_barrier_max_task;
+};
+
+#endif

+ 31 - 0
src/core/chain/black_hole_ask_chain.cc

@@ -0,0 +1,31 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <black_hole_ask_chain.h>
+
+// Nothing is released explicitly in the destructor body.
+BlackHoleAskChain::~BlackHoleAskChain()
+{
+}
+
+/*
+ * Mark the job as a black-hole job and send it straight back on the reply
+ * path; the job is not passed on to main_chain here.
+ */
+void BlackHoleAskChain::job_ask_procedure(DTCJobOperation *job_operation)
+{
+	log4cplus_debug("enter job_ask_procedure");
+
+	DTCJobOperation &job = *job_operation;
+
+	// Presetting affected_rows == 0 is obsolete; the BlackHole flag is
+	// used instead.
+	job.mark_as_black_hole();
+	job.turn_around_job_answer();
+
+	log4cplus_debug("leave job_ask_procedure");
+}

+ 42 - 0
src/core/chain/black_hole_ask_chain.h

@@ -0,0 +1,42 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef __BLACK_HOLE_ASK_CHAIN__
+#define __BLACK_HOLE_ASK_CHAIN__
+
+#include <task/task_request.h>
+
+/*
+ * Request-path stage whose job_ask_procedure() is private and reached only
+ * through the JobAskInterface base. It owns a ChainJoint so that a next
+ * stage can be registered.
+ */
+class BlackHoleAskChain : public JobAskInterface<DTCJobOperation> {
+    public:
+	BlackHoleAskChain(PollerBase *o)
+		: JobAskInterface<DTCJobOperation>(o), main_chain(o)
+	{
+	}
+	virtual ~BlackHoleAskChain(void);
+
+	// Forward the registration to the embedded chain joint.
+	void register_next_chain(JobAskInterface<DTCJobOperation> *p)
+	{
+		main_chain.register_next_chain(p);
+	}
+
+	// Expose the embedded chain joint.
+	ChainJoint<DTCJobOperation> *get_main_chain()
+	{
+		return &main_chain;
+	}
+
+    private:
+	ChainJoint<DTCJobOperation> main_chain;
+
+	virtual void job_ask_procedure(DTCJobOperation *);
+};
+
+#endif

+ 28 - 0
src/core/chain/buffer_bypass_answer_chain.cc

@@ -0,0 +1,28 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <buffer_bypass_answer_chain.h>
+
+// Nothing is released explicitly in the destructor body.
+BufferBypassAnswerChain::~BufferBypassAnswerChain()
+{
+}
+
+/*
+ * Reply handler for pass-through jobs: when the job carries a result, pass
+ * that whole result through, then hand the reply to the previous stage.
+ */
+void BufferBypassAnswerChain::job_answer_procedure(DTCJobOperation *job)
+{
+	DTCJobOperation &reply = *job;
+
+	if (reply.result) {
+		reply.pass_all_result(reply.result);
+	}
+
+	reply.turn_around_job_answer();
+}

+ 32 - 0
src/core/chain/buffer_bypass_answer_chain.h

@@ -0,0 +1,32 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef __BUFFER_BYPASS_ANSWER_CHAIN__
+#define __BUFFER_BYPASS_ANSWER_CHAIN__
+
+#include <task/task_request.h>
+
+/*
+ * Reply-path handler with no data members; its behaviour lives entirely in
+ * job_answer_procedure().
+ */
+class BufferBypassAnswerChain : public JobAnswerInterface<DTCJobOperation> {
+    public:
+	BufferBypassAnswerChain() {}
+	virtual ~BufferBypassAnswerChain(void);
+
+	// Invoked with the job when its reply reaches this handler.
+	virtual void job_answer_procedure(DTCJobOperation *job);
+};
+
+#endif

+ 47 - 0
src/core/chain/buffer_bypass_ask_chain.cc

@@ -0,0 +1,47 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <buffer_bypass_ask_chain.h>
+#include <buffer_bypass_answer_chain.h>
+
+static BufferBypassAnswerChain g_buffer_bypass_answer_instance; // file-local reply handler shared by every job that job_ask_procedure() passes through
+
+// Nothing is released explicitly in the destructor body.
+BufferBypassAskChain::~BufferBypassAskChain()
+{
+}
+
+/*
+ * Request-side entry point of the bypass stage.
+ * Batch requests are answered immediately. A count-only request that also
+ * carries a limit is rejected with EC_BAD_COMMAND. Every other job is marked
+ * pass-through, gets the shared bypass reply handler registered, and is
+ * handed to the next stage.
+ */
+void BufferBypassAskChain::job_ask_procedure(DTCJobOperation *job_operation)
+{
+	if (job_operation->is_batch_request()) {
+		job_operation->turn_around_job_answer();
+		return;
+	}
+
+	if (job_operation->count_only()) {
+		// A limit makes no sense when no fields are requested.
+		const bool has_limit =
+			job_operation->requestInfo.limit_start() ||
+			job_operation->requestInfo.limit_count();
+		if (has_limit) {
+			job_operation->set_error(
+				-EC_BAD_COMMAND, "BufferBypass",
+				"There's nothing to limit because no fields required");
+			job_operation->turn_around_job_answer();
+			return;
+		}
+	}
+
+	job_operation->mark_as_pass_thru();
+	job_operation->push_reply_dispatcher(&g_buffer_bypass_answer_instance);
+	main_chain.job_ask_procedure(job_operation);
+}

+ 42 - 0
src/core/chain/buffer_bypass_ask_chain.h

@@ -0,0 +1,42 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef __BUFFER_BYPASS_ASK_CHAIN__
+#define __BUFFER_BYPASS_ASK_CHAIN__
+
+#include <task/task_request.h>
+
+/*
+ * Request-path stage whose job_ask_procedure() is private and reached only
+ * through the JobAskInterface base. It owns a ChainJoint so that a next
+ * stage can be registered.
+ */
+class BufferBypassAskChain : public JobAskInterface<DTCJobOperation> {
+    public:
+	BufferBypassAskChain(PollerBase *o)
+		: JobAskInterface<DTCJobOperation>(o), main_chain(o)
+	{
+	}
+	virtual ~BufferBypassAskChain(void);
+
+	// Forward the registration to the embedded chain joint.
+	void register_next_chain(JobAskInterface<DTCJobOperation> *p)
+	{
+		main_chain.register_next_chain(p);
+	}
+
+	// Expose the embedded chain joint.
+	ChainJoint<DTCJobOperation> *get_main_chain()
+	{
+		return &main_chain;
+	}
+
+    private:
+	ChainJoint<DTCJobOperation> main_chain;
+
+	virtual void job_ask_procedure(DTCJobOperation *);
+};
+
+#endif

+ 42 - 0
src/core/chain/buffer_process_answer_chain.cc

@@ -0,0 +1,42 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <endian.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <string.h>
+
+#include "packet/packet.h"
+#include "log/log.h"
+#include "buffer_process_answer_chain.h"
+#include "buffer_flush.h"
+#include "mysql_error.h"
+#include "sys_malloc.h"
+#include "data_chunk.h"
+#include "raw_data_process.h"
+#include "key/key_route_ask_chain.h"
+#include "buffer_remoteLog.h"
+#include "hotback_task.h"
+#include "tree_data_process.h"
+DTC_USING_NAMESPACE;
+
+/*
+ * Forward the reply to the BufferProcessAskChain that owns this handler.
+ */
+void BufferProcessAnswerChain::job_answer_procedure(
+	DTCJobOperation *job_operation)
+{
+	BufferProcessAskChain *const target = buffer_reply_notify_owner_;
+
+	target->job_answer_procedure(job_operation);
+}

+ 73 - 0
src/core/chain/buffer_process_answer_chain.h

@@ -0,0 +1,73 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef __BUFFER_PROCESS_ANSWER_CHAIN__
+#define __BUFFER_PROCESS_ANSWER_CHAIN__
+
+#include <sys/mman.h>
+#include <time.h>
+
+#include "protocol.h"
+#include "value.h"
+#include "field/field.h"
+#include "section.h"
+#include "table/table_def.h"
+#include "task/task_request.h"
+#include "list/list.h"
+#include "fence_queue.h"
+#include "buffer_pond.h"
+#include "poll/poller_base.h"
+#include "config/dbconfig.h"
+#include "queue/lqueue.h"
+#include "stat_dtc.h"
+#include "data_process.h"
+#include "empty_filter.h"
+#include "namespace.h"
+#include "task_pendlist.h"
+#include "data_chunk.h"
+#include "hb_log.h"
+#include "lru_bit.h"
+#include "hb_feature.h"
+#include "blacklist/blacklist_unit.h"
+#include "expire_time.h"
+
+DTC_BEGIN_NAMESPACE
+
+class DTCFlushRequest;
+class BufferProcessAskChain;
+class DTCTableDefinition;
+class TaskPendingList;
+
+/*
+ * Reply-path adapter: holds a pointer to a BufferProcessAskChain and
+ * declares job_answer_procedure(), which is defined out of line.
+ */
+class BufferProcessAnswerChain : public JobAnswerInterface<DTCJobOperation> {
+    public:
+	// `buffer_process` is stored as-is; no ownership is taken here.
+	BufferProcessAnswerChain(BufferProcessAskChain *buffer_process)
+		: buffer_reply_notify_owner_(buffer_process)
+	{
+	}
+
+	virtual ~BufferProcessAnswerChain() {}
+
+	virtual void job_answer_procedure(DTCJobOperation *);
+
+    private:
+	BufferProcessAskChain *buffer_reply_notify_owner_;
+};
+
+DTC_END_NAMESPACE
+
+#endif

+ 3962 - 0
src/core/chain/buffer_process_ask_chain.cc

@@ -0,0 +1,3962 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <endian.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <string.h>
+
+#include "packet/packet.h"
+#include "log/log.h"
+#include "buffer_process_ask_chain.h"
+#include "buffer_flush.h"
+#include "mysql_error.h"
+#include "sys_malloc.h"
+#include "data_chunk.h"
+#include "raw_data_process.h"
+#include "key/key_route_ask_chain.h"
+#include "buffer_remoteLog.h"
+#include "hotback_task.h"
+#include "tree_data_process.h"
+DTC_USING_NAMESPACE;
+
+extern DTCTableDefinition *g_table_def[];
+extern KeyRouteAskChain *g_key_route_ask_instance;
+extern int g_hash_changing;
+extern int g_target_new_hash;
+extern DTCConfig *g_dtc_config;
+extern int collect_load_config(DbConfig *dbconfig);
+extern DbConfig *dbConfig;
+
+#if __WORDSIZE == 64
+#define UINT64FMT_T "%lu" /* 64-bit word size: unsigned long conversion */
+#else
+#define UINT64FMT_T "%llu" /* any other word size: unsigned long long conversion */
+#endif /* __WORDSIZE == 64; presumably the printf format for uint64_t values -- confirm at use sites */
+
+/*
+ * Locate the cache node for the job's key and record the outcome in the
+ * current transaction state.
+ * Returns (and stores in node_status) DTC_CODE_NODE_EMPTY when the key is
+ * flagged in the empty-node filter, DTC_CODE_NODE_NOTFOUND when no node
+ * exists, and DTC_CODE_NODE_HIT otherwise. On a hit, key_dirty, old_rows and
+ * node_empty are refreshed from the node.
+ */
+inline int BufferProcessAskChain::transaction_find_node(DTCJobOperation &job)
+{
+	// The transaction state has already been cleared/zeroed.
+	key = job.packed_key();
+
+	// Keys flagged in the filter are reported as empty without a lookup.
+	if (empty_node_filter_ != NULL &&
+	    empty_node_filter_->ISSET(job.int_key())) {
+		//Cache.cache_purge(key);
+		cache_transaction_node = Node();
+		return node_status = DTC_CODE_NODE_EMPTY;
+	}
+
+	// g_target_new_hash selects which of the two hash tables is "new".
+	const int new_hash = g_target_new_hash ? 1 : 0;
+
+	if (g_hash_changing) {
+		// While the hash is changing, look in the old table first and
+		// migrate a hit; fall back to the new table otherwise.
+		const int old_hash = g_target_new_hash ? 0 : 1;
+
+		cache_transaction_node = cache_.cache_find(key, old_hash);
+		if (!!cache_transaction_node) {
+			cache_.move_to_new_hash(key, cache_transaction_node);
+		} else {
+			cache_transaction_node =
+				cache_.cache_find(key, new_hash);
+			if (!cache_transaction_node)
+				return node_status = DTC_CODE_NODE_NOTFOUND;
+		}
+	} else {
+		cache_transaction_node = cache_.cache_find(key, new_hash);
+		if (!cache_transaction_node)
+			return node_status = DTC_CODE_NODE_NOTFOUND;
+	}
+
+	key_dirty = cache_transaction_node.is_dirty();
+	old_rows = cache_.node_rows_count(cache_transaction_node);
+	// Remembered so the empty-node count can be decreased later.
+	node_empty = key_dirty == 0 && old_rows == 0;
+	return node_status = DTC_CODE_NODE_HIT;
+}
+
+/*
+ * Record how the LRU position of the transaction node should change.
+ * Does nothing when the key was already dirty. For a synchronous update the
+ * requested `level` is only remembered in lru_update. For an asynchronous
+ * update the node is marked dirty and moved to the dirty LRU immediately.
+ */
+inline void BufferProcessAskChain::transaction_update_lru(bool async, int level)
+{
+	if (key_dirty)
+		return;
+
+	if (!async) {
+		lru_update = level;
+		return;
+	}
+
+	// The LRU is adjusted right here, so the node is no longer a fresh
+	// node in the EmptyButInCleanList state: clear node_empty as well.
+	cache_transaction_node.set_dirty();
+	cache_.inc_dirty_node(1);
+	cache_.remove_from_lru(cache_transaction_node);
+	cache_.insert_to_dirty_lru(cache_transaction_node);
+	if (node_empty != 0) {
+		// empty to non-empty
+		cache_.dec_empty_node();
+		node_empty = 0;
+	}
+	lru_update = LRU_NONE;
+}
+
+/*
+ * Finish the current cache transaction.
+ * For a clean node whose LRU level exceeds lru_update_level_, or whose
+ * empty/non-empty state changed, the node is re-linked into the empty or
+ * clean LRU and the empty-node counter is adjusted. The transaction state is
+ * then released via CacheTransaction::Free().
+ */
+void BufferProcessAskChain::transaction_end(void)
+{
+	const bool clean_node = !!cache_transaction_node && !key_dirty &&
+				!cache_transaction_node.is_dirty();
+
+	if (clean_node) {
+		const int rows_now =
+			cache_.node_rows_count(cache_transaction_node);
+		const int empty_now = rows_now == 0;
+
+		if (lru_update > lru_update_level_ ||
+		    empty_now != node_empty) {
+			// Either way the node leaves its current LRU list.
+			cache_.remove_from_lru(cache_transaction_node);
+
+			if (rows_now == 0) {
+				cache_.insert_to_empty_lru(
+					cache_transaction_node);
+				if (node_empty == 0) {
+					// non-empty to empty
+					cache_.inc_empty_node();
+					node_empty = 1;
+				}
+				// Cache.DumpEmptyNodeList();
+			} else {
+				cache_.insert_to_clean_lru(
+					cache_transaction_node);
+				if (node_empty != 0) {
+					// empty to non-empty
+					cache_.dec_empty_node();
+					node_empty = 0;
+				}
+			}
+		}
+	}
+
+	CacheTransaction::Free();
+}
+
+/*
+ * Emit a hot-backup record of type SYNC_LRU for `key`, carrying no value.
+ * Returns 0 when hot-backup logging is switched off or the job was
+ * dispatched, -1 when the job object could not be created.
+ */
+int BufferProcessAskChain::write_lru_hotbackup_log(const char *key)
+{
+	log4cplus_debug("write_lru_hotbackup_log begin");
+	if (!log_hotbackup_key_switch_)
+		return 0;
+
+	log4cplus_debug("write_lru_hotbackup_log new job");
+	DTCJobOperation *hb_job = new DTCJobOperation;
+	// NOTE(review): a plain `new` reports failure by throwing, so this
+	// branch looks unreachable unless DTCJobOperation supplies its own
+	// operator new -- confirm.
+	if (hb_job == NULL) {
+		log4cplus_error(
+			"cannot write_hotbackup_log row, new job error, possible memory exhausted\n");
+		return -1;
+	}
+
+	hb_job->set_request_type(TaskTypeWriteLruHbLog);
+
+	HotBackTask &task = hb_job->get_hot_back_task();
+	task.set_type(DTCHotBackup::SYNC_LRU);
+	task.set_flag(DTCHotBackup::NON_VALUE);
+	task.set_value(NULL, 0);
+
+	DTCValue packed = table_define_infomation_->packed_key(key);
+	task.set_packed_key(packed.bin.ptr, packed.bin.len);
+	log4cplus_debug(" packed key len:%d, key len:%d,  key :%s",
+			packed.bin.len, *(unsigned char *)packed.bin.ptr,
+			packed.bin.ptr + 1);
+
+	dispatch_hot_back_task(hb_job);
+	return 0;
+}
+
+/*
+ * Emit a hot-backup record of type `iType` for `key`.
+ * The chunk bytes are attached only when uiNodeSize is non-zero and either
+ * the record is a SYNC_COLEXPAND_CMD or the chunk is at most 100 bytes;
+ * otherwise the record is flagged NON_VALUE.
+ * Returns 0 when logging is switched off or the job was dispatched, -1 when
+ * the job object could not be created.
+ */
+int BufferProcessAskChain::write_hotbackup_log(const char *key, char *pstChunk,
+					       unsigned int uiNodeSize,
+					       int iType)
+{
+	if (!log_hotbackup_key_switch_)
+		return 0;
+
+	DTCJobOperation *hb_job = new DTCJobOperation;
+	// NOTE(review): a plain `new` reports failure by throwing, so this
+	// branch looks unreachable unless DTCJobOperation supplies its own
+	// operator new -- confirm.
+	if (hb_job == NULL) {
+		log4cplus_error(
+			"cannot write_hotbackup_log row, new job error, possible memory exhausted\n");
+		return -1;
+	}
+
+	hb_job->set_request_type(TaskTypeWriteHbLog);
+
+	HotBackTask &task = hb_job->get_hot_back_task();
+	task.set_type(iType);
+
+	// Column-expand commands store `key` directly; every other record
+	// type goes through the table definition's packed_key().
+	const bool is_colexpand = iType == DTCHotBackup::SYNC_COLEXPAND_CMD;
+	DTCValue packed;
+	if (is_colexpand)
+		packed.Set(key);
+	else
+		packed = table_define_infomation_->packed_key(key);
+	task.set_packed_key(packed.bin.ptr, packed.bin.len);
+	log4cplus_debug(" packed key len:%d, key len:%d,  key :%s",
+			packed.bin.len, *(unsigned char *)packed.bin.ptr,
+			packed.bin.ptr + 1);
+
+	const bool attach_value =
+		uiNodeSize > 0 && (is_colexpand || uiNodeSize <= 100);
+	if (attach_value) {
+		task.set_flag(DTCHotBackup::HAS_VALUE);
+		task.set_value(pstChunk, uiNodeSize);
+	} else {
+		task.set_flag(DTCHotBackup::NON_VALUE);
+		task.set_value(NULL, 0);
+	}
+	dispatch_hot_back_task(hb_job);
+
+	return 0;
+}
+
+/*
+ * Node-based overload: resolve the node's data chunk (if the node is valid
+ * and has a data handle) and delegate to the chunk-based overload. With no
+ * chunk, a NULL pointer and size 0 are passed on.
+ */
+int BufferProcessAskChain::write_hotbackup_log(const char *key, Node &node,
+					       int iType)
+{
+	if (!log_hotbackup_key_switch_)
+		return 0;
+
+	DataChunk *chunk = NULL;
+	unsigned int chunk_bytes = 0;
+
+	const bool has_chunk = !!node && node.vd_handle() != INVALID_HANDLE;
+	if (has_chunk) {
+		chunk = (DataChunk *)PtMalloc::instance()->handle_to_ptr(
+			node.vd_handle());
+		chunk_bytes = chunk->node_size();
+	}
+
+	return write_hotbackup_log(key, (char *)chunk, chunk_bytes, iType);
+}
+
+/*
+ * Job-based overload: log under the job's packed key.
+ */
+inline int BufferProcessAskChain::write_hotbackup_log(DTCJobOperation &job,
+						      Node &node, int iType)
+{
+	const int rc = write_hotbackup_log(job.packed_key(), node, iType);
+	return rc;
+}
+
+/*
+ * Called for a node that is being purged.
+ * If it is the node of the current transaction, the transaction forgets it
+ * (adjusting the empty-node counter when it was counted as empty). A
+ * SYNC_PURGE hot-backup record is then written for the key.
+ */
+void BufferProcessAskChain::purge_node_processor(const char *key, Node node)
+{
+	if (!node)
+		return;
+
+	const bool is_transaction_node = node == cache_transaction_node;
+	if (is_transaction_node) {
+		if (node_empty) {
+			// An empty node is purged: decrease the empty counter.
+			cache_.dec_empty_node();
+			node_empty = 0;
+		}
+		cache_transaction_node = Node::Empty();
+	}
+
+	const int rc = write_hotbackup_log(key, node, DTCHotBackup::SYNC_PURGE);
+	if (rc != 0)
+		log4cplus_error("hb: log purge key failed");
+}
+
+/*
+ * Construct the buffer-processing stage.
+ *   p    - poller passed to the base class and to the three chain joints
+ *   tdef - table definition used by this stage
+ *   um   - stored in async_server_
+ * All tunables start from the defaults in the initializer list; the
+ * statistics counters are bound from g_stat_mgr and the two expire limits
+ * are read from the "cache" section of g_dtc_config.
+ *
+ * NOTE(review): data_process_ is read in set_insert_order() but is not
+ * initialized here -- confirm it is set before that call.
+ */
+BufferProcessAskChain::BufferProcessAskChain(PollerBase *p,
+					     DTCTableDefinition *tdef,
+					     EUpdateMode um)
+	: JobAskInterface<DTCJobOperation>(p),
+	  main_chain(p),
+	  remote_chain(p),
+	  hotbackup_chain(p),
+	  cache_reply_(this),
+	  table_define_infomation_(tdef),
+	  cache_(this),
+	  dtc_mode_(DTC_MODE_DATABASE_ADDITION),
+	  full_mode_(false),
+	  m_bReplaceEmpty(false),
+	  lru_update_level_(0),
+	  async_server_(um),
+	  update_mode_(MODE_SYNC),
+	  insert_mode_(MODE_SYNC),
+	  memory_dirty_(false),
+	  insert_order_(INSERT_ORDER_LAST),
+	  node_size_limit_(0),
+	  node_rows_limit_(0),
+	  node_empty_limit_(0),
+	  // Flush
+	  flush_reply_(this),
+	  flush_timer_(NULL),
+	  current_pend_flush_request_(0),
+	  pend_flush_request_(0),
+	  max_flush_request_(1),
+	  marker_interval_(300),
+	  min_dirty_time_(3600),
+	  max_dirty_time_(43200),
+	  empty_node_filter_(NULL),
+	  // Hot Backup
+	  log_hotbackup_key_switch_(false),
+	  hotbackup_lru_feature_(NULL),
+	  // BlackList
+	  black_list_(0),
+	  blacklist_timer_(0)
+{
+	memset((char *)&cache_info_, 0, sizeof(cache_info_));
+
+	// Per-operation request/hit counters.
+	stat_get_count_ = g_stat_mgr.get_stat_int_counter(DTC_GET_COUNT);
+	stat_get_hits_ = g_stat_mgr.get_stat_int_counter(DTC_GET_HITS);
+	stat_insert_count_ = g_stat_mgr.get_stat_int_counter(DTC_INSERT_COUNT);
+	stat_insert_hits_ = g_stat_mgr.get_stat_int_counter(DTC_INSERT_HITS);
+	stat_update_count_ = g_stat_mgr.get_stat_int_counter(DTC_UPDATE_COUNT);
+	stat_update_hits_ = g_stat_mgr.get_stat_int_counter(DTC_UPDATE_HITS);
+	stat_delete_count_ = g_stat_mgr.get_stat_int_counter(DTC_DELETE_COUNT);
+	stat_delete_hits_ = g_stat_mgr.get_stat_int_counter(DTC_DELETE_HITS);
+	stat_purge_count_ = g_stat_mgr.get_stat_int_counter(DTC_PURGE_COUNT);
+
+	// Drop and flush counters.
+	stat_drop_count_ = g_stat_mgr.get_stat_int_counter(DTC_DROP_COUNT);
+	stat_drop_rows_ = g_stat_mgr.get_stat_int_counter(DTC_DROP_ROWS);
+	stat_flush_count_ = g_stat_mgr.get_stat_int_counter(DTC_FLUSH_COUNT);
+	stat_flush_rows_ = g_stat_mgr.get_stat_int_counter(DTC_FLUSH_ROWS);
+	// statIncSyncStep = g_stat_mgr.get_sample(HBP_INC_SYNC_STEP);
+
+	stat_maxflush_request_ =
+		g_stat_mgr.get_stat_int_counter(DTC_MAX_FLUSH_REQ);
+	stat_currentFlush_request_ =
+		g_stat_mgr.get_stat_int_counter(DTC_CURR_FLUSH_REQ);
+
+	stat_oldestdirty_time_ =
+		g_stat_mgr.get_stat_int_counter(DTC_OLDEST_DIRTY_TIME);
+	stat_asyncflush_count_ =
+		g_stat_mgr.get_stat_int_counter(DTC_ASYNC_FLUSH_COUNT);
+
+	// Expire counters.
+	stat_expire_count_ =
+		g_stat_mgr.get_stat_int_counter(DTC_KEY_EXPIRE_USER_COUNT);
+	stat_buffer_process_expire_count_ =
+		g_stat_mgr.get_stat_int_counter(CACHE_EXPIRE_REQ);
+
+	// Expire limits from the "cache" config section, with defaults of
+	// 100 and 3600 * 24 * 30 respectively.
+	max_expire_count_ =
+		g_dtc_config->get_int_val("cache", "max_expire_count_", 100);
+	max_expire_time_ = g_dtc_config->get_int_val(
+		"cache", "max_expire_time_", 3600 * 24 * 30);
+}
+
+// Destroy the chain and release the empty-node filter, if one was created.
+BufferProcessAskChain::~BufferProcessAskChain()
+{
+	// deleting a null pointer is a no-op, so no explicit guard is required
+	delete empty_node_filter_;
+}
+
+// Select the server-side insert ordering.
+// INSERT_ORDER_PURGE is refused (returns -1) when the chain runs in cache-only
+// mode or when cache_info_.sync_update is 0. Otherwise the value is stored,
+// forwarded to the data processor if one exists, and 0 is returned.
+int BufferProcessAskChain::set_insert_order(int o)
+{
+	if (o == INSERT_ORDER_PURGE) {
+		if (dtc_mode_ == DTC_MODE_CACHE_ONLY) {
+			log4cplus_error(
+				"NoDB server don't support TABLE_DEFINE.ServerOrderInsert = purge");
+			return -1;
+		}
+		if (cache_info_.sync_update == 0) {
+			log4cplus_error(
+				"AsyncUpdate server don't support TABLE_DEFINE.ServerOrderInsert = purge");
+			return -1;
+		}
+	}
+
+	insert_order_ = o;
+	if (data_process_ != NULL)
+		data_process_->set_insert_order(o);
+	return 0;
+}
+
+// Switch the chain into cache-only (NoDB) mode with full_mode_ enabled.
+// Returns DTC_CODE_FAILED, leaving both mode flags untouched, when purge
+// insert ordering is configured or the table has an auto_increment field.
+int BufferProcessAskChain::enable_no_db_mode(void)
+{
+	const bool purge_ordering = (insert_order_ == INSERT_ORDER_PURGE);
+	if (purge_ordering) {
+		log4cplus_error(
+			"NoDB server don't support TABLE_DEFINE.ServerOrderInsert = purge");
+		return DTC_CODE_FAILED;
+	}
+
+	if (table_define_infomation_->has_auto_increment()) {
+		log4cplus_error(
+			"NoDB server don't support auto_increment field");
+		return DTC_CODE_FAILED;
+	}
+
+	full_mode_ = true;
+	dtc_mode_ = DTC_MODE_CACHE_ONLY;
+	return DTC_CODE_SUCCESS;
+}
+
+// Store the LRU update level: the argument is first capped at LRU_WRITE and
+// then raised to 0 if it is negative. Always returns 0.
+int BufferProcessAskChain::disable_lru_update(int level)
+{
+	const int capped = (level > LRU_WRITE) ? LRU_WRITE : level;
+	lru_update_level_ = (capped < 0) ? 0 : capped;
+	return 0;
+}
+
+// Normalise the argument to a strict boolean and store it in async_log_:
+// any non-zero value sets the flag, zero clears it. Always returns 0.
+int BufferProcessAskChain::disable_async_log(int disable)
+{
+	async_log_ = (disable != 0);
+	return 0;
+}
+
+// Initialise cache_info_ from the table's key format plus the requested
+// cache size and cache version. Always returns DTC_CODE_SUCCESS.
+int BufferProcessAskChain::set_buffer_size_and_version(
+	unsigned long cache_size, unsigned int cache_version)
+{
+	cache_info_.init(table_define_infomation_->key_format(),
+			 cache_size, cache_version);
+	return DTC_CODE_SUCCESS;
+}
+
+/*
+ * Function	: open_init_buffer
+ * Description	: Open the cache and set up everything that hangs off it:
+ *		  update mode, optional empty-node filter, data processor,
+ *		  optional hot-backup feature, delay-purge task, blacklist,
+ *		  key-expire task and the empty-node list policy.
+ * Input	: key_name				ipc key of the shared memory
+ *		  enable_empty_filter			non-zero sets cache_info_.empty_filter
+ *		  enable_auto_clean_dirty_buffer	non-zero sets cache_info_.auto_delete_dirty_shm
+ * Output	:
+ * Return	: 0 on success, -1 on failure
+ */
+int BufferProcessAskChain::open_init_buffer(int key_name,
+					    int enable_empty_filter,
+					    int enable_auto_clean_dirty_buffer)
+{
+	cache_info_.key_size = table_define_infomation_->key_format();
+	cache_info_.ipc_mem_key = key_name;
+	cache_info_.sync_update = !async_server_;
+	cache_info_.empty_filter = enable_empty_filter ? 1 : 0;
+	cache_info_.auto_delete_dirty_shm =
+		enable_auto_clean_dirty_buffer ? 1 : 0;
+	cache_info_.force_update_table_conf =
+		g_dtc_config->get_int_val("cache", "ForceUpdateTableConf", 0);
+
+	log4cplus_debug(
+		"cache_info: \n\tshmkey[%d] \n\tshmsize[" UINT64FMT
+		"] \n\tkeysize[%u]"
+		"\n\tversion[%u] \n\tsyncUpdate[%u] \n\treadonly[%u]"
+		"\n\tcreateonly[%u] \n\tempytfilter[%u] \n\tautodeletedirtysharememory[%u]",
+		cache_info_.ipc_mem_key, cache_info_.ipc_mem_size,
+		cache_info_.key_size, cache_info_.version,
+		cache_info_.sync_update, cache_info_.read_only,
+		cache_info_.create_only, cache_info_.empty_filter,
+		cache_info_.auto_delete_dirty_shm);
+
+	if (cache_.cache_open(&cache_info_)) {
+		log4cplus_error("%s", cache_.error());
+		return -1;
+	}
+
+	log4cplus_info("Current cache_ memory format is V%d\n",
+		       cache_info_.version);
+
+	// 1 when the dirty LRU list is empty (memory is clean), 0 otherwise
+	int iMemSyncUpdate = cache_.dirty_lru_empty() ? 1 : 0;
+	/*
+	 * 1. sync dtc + dirty mem, SYNC + memory_dirty_
+	 * 2. sync dtc + clean mem, SYNC + !memory_dirty_
+	 * 3. async dtc + dirty mem/clean mem: ASYNC
+	 *    disable ASYNC <--> FLUSH switch, so FLUSH never happen forever
+	 *    update_mode_ == async_server_
+	 */
+	switch (async_server_ * 0x10000 + iMemSyncUpdate) {
+	// sync dtcd + async mem
+	case 0x00000:
+		memory_dirty_ = true;
+		update_mode_ = MODE_SYNC;
+		break;
+	// sync dtcd + sync mem
+	case 0x00001:
+		update_mode_ = MODE_SYNC;
+		break;
+	// async dtcd + async mem
+	case 0x10000:
+		update_mode_ = MODE_ASYNC;
+		break;
+	// async dtcd + sync mem
+	case 0x10001:
+		update_mode_ = MODE_ASYNC;
+		break;
+	default:
+		update_mode_ = cache_info_.sync_update ? MODE_SYNC : MODE_ASYNC;
+	}
+	// async inserts are only enabled for tables without an auto_increment field
+	if (table_define_infomation_->has_auto_increment() == 0 &&
+	    update_mode_ == MODE_ASYNC)
+		insert_mode_ = MODE_ASYNC;
+	log4cplus_info("Cache Update Mode: %s",
+		       update_mode_ == MODE_SYNC ?
+			       "SYNC" :
+			       update_mode_ == MODE_ASYNC ?
+			       "ASYNC" :
+			       update_mode_ == MODE_FLUSH ? "FLUSH" : "<BAD>");
+	// empty-node filter: attach only when the cache exposes the feature
+	const FEATURE_INFO_T *pstFeature;
+	pstFeature = cache_.query_feature_by_id(EMPTY_FILTER);
+	if (pstFeature != NULL) {
+		NEW(EmptyNodeFilter, empty_node_filter_);
+		if (empty_node_filter_ == NULL) {
+			log4cplus_error("new %s error: %m", "EmptyNodeFilter");
+			return -1;
+		}
+		if (empty_node_filter_->do_attach(pstFeature->fi_handle) != 0) {
+			log4cplus_error("EmptyNodeFilter attach error: %s",
+					empty_node_filter_->error());
+			return -1;
+		}
+	}
+	MallocBase *pstMalloc = PtMalloc::instance();
+	UpdateMode stUpdateMod = { async_server_, update_mode_, insert_mode_,
+				   insert_order_ };
+	// tables with index fields use the tree processor, all others the raw-data processor
+	if (table_define_infomation_->index_fields() > 0) {
+		log4cplus_debug("tree index enable, index field num[%d]",
+				table_define_infomation_->index_fields());
+		data_process_ =
+			new TreeDataProcess(pstMalloc, table_define_infomation_,
+					    &cache_, &stUpdateMod);
+		if (data_process_ == NULL) {
+			log4cplus_error("create TreeDataProcess error: %m");
+			return -1;
+		}
+	} else {
+		log4cplus_debug("%s", "use raw-data mode");
+		data_process_ =
+			new RawDataProcess(pstMalloc, table_define_infomation_,
+					   &cache_, &stUpdateMod);
+		if (data_process_ == NULL) {
+			log4cplus_error("create RawDataProcess error: %m");
+			return -1;
+		}
+		((RawDataProcess *)data_process_)
+			->set_limit_node_size(node_size_limit_);
+	}
+	// sync update mode forces async_log_ on
+	if (update_mode_ == MODE_SYNC) {
+		async_log_ = 1;
+	}
+	// hot-backup feature: attach only when the cache exposes it
+	pstFeature = cache_.query_feature_by_id(HOT_BACKUP);
+	if (pstFeature != NULL) {
+		NEW(HBFeature, hotbackup_lru_feature_);
+		if (hotbackup_lru_feature_ == NULL) {
+			log4cplus_error("new hot-backup feature error: %m");
+			return -1;
+		}
+		if (hotbackup_lru_feature_->attach(pstFeature->fi_handle) !=
+		    0) {
+			log4cplus_error("hot-backup feature attach error: %s",
+					hotbackup_lru_feature_->error());
+			return -1;
+		}
+
+		if (hotbackup_lru_feature_->master_uptime() != 0) {
+			// turn on logging of changed keys
+			log_hotbackup_key_switch_ = true;
+		}
+	}
+	// Hot Backup
+	// DelayPurge
+	cache_.start_delay_purge_task(
+		owner->get_timer_list_by_m_seconds(10 /*10 ms*/));
+
+	// Blacklist
+	// 10 min sched
+	blacklist_timer_ = owner->get_timer_list(10 * 60);
+	NEW(BlackListUnit(blacklist_timer_), black_list_);
+	if (NULL == black_list_ ||
+	    black_list_->init_blacklist(
+		    100000, table_define_infomation_->key_format())) {
+		log4cplus_error("init black_list failed");
+		return -1;
+	}
+	black_list_->start_blacklist_expired_task();
+	// Blacklist
+	// key expiry is only available in cache-only mode; a table with an
+	// expire-time field in any other mode is rejected
+	if (table_define_infomation_->expire_time_field_id() != -1) {
+		if (dtc_mode_ == DTC_MODE_CACHE_ONLY) {
+			key_expire_timer_ = owner->get_timer_list_by_m_seconds(
+				1000 /* 1s */);
+			NEW(ExpireTime(key_expire_timer_, &cache_,
+				       data_process_, table_define_infomation_,
+				       max_expire_count_),
+			    key_expire);
+			if (key_expire == NULL) {
+				log4cplus_error("init key expire time failed");
+				return -1;
+			}
+			key_expire->start_key_expired_task();
+		} else {
+			log4cplus_error("db mode do not support expire time");
+			return -1;
+		}
+	}
+	// Empty Node list
+	if (full_mode_ == true) {
+		// nodb Mode has not empty nodes,
+		node_empty_limit_ = 0;
+		// prune all present empty nodes
+		cache_.prune_empty_node_list();
+	} else if (node_empty_limit_) {
+		// Enable Empty Node Limitation
+		cache_.set_empty_node_limit(node_empty_limit_);
+		// re-counting empty node count
+		cache_.init_empty_node_list();
+		// upgrade from old memory
+		cache_.upgrade_empty_node_list();
+		// shrinking empty list
+		cache_.shrink_empty_node_list();
+	} else {
+		// move all empty node to clean list
+		cache_.merge_empty_node_list();
+	}
+
+	// Empty Node list
+	return 0;
+}
+
+// Allocate a fresh cache node for the current transaction key.
+// At most two allocation attempts are made; after a failed attempt the cache
+// is asked to purge, and retrying stops if that call returns non-zero.
+// Returns true with cache_transaction_node holding the new node (its
+// vd_handle reset to INVALID_HANDLE), or false when no node could be obtained.
+bool BufferProcessAskChain::insert_empty_node(void)
+{
+	int attempt = 0;
+	while (attempt < 2) {
+		cache_transaction_node = cache_.cache_allocation(key);
+		if (!!cache_transaction_node)
+			break;
+		if (cache_.try_purge_size(1, cache_transaction_node) != 0)
+			break;
+		++attempt;
+	}
+
+	if (!cache_transaction_node) {
+		log4cplus_debug("alloc cache node error");
+		return false;
+	}
+
+	// the node was just created: it has no data yet and it was not in the
+	// empty list before this transaction
+	cache_transaction_node.vd_handle() = INVALID_HANDLE;
+	node_empty = 0;
+	return true;
+}
+
+// Make sure the transaction node holds at least one row by storing a row of
+// default values when the node is missing or currently has zero rows.
+// Returns DTC_CODE_BUFFER_SUCCESS when the node already has rows or the
+// default row was stored; DTC_CODE_BUFFER_ERROR (with the error set on job)
+// when node allocation, raw-data setup, row insertion or the replace fails.
+BufferResult BufferProcessAskChain::insert_default_row(DTCJobOperation &job)
+{
+	int iRet;
+	log4cplus_debug("%s", "insert default start!");
+	if (!cache_transaction_node) {
+		// no node for this key yet: allocate one
+		if (insert_empty_node() == false) {
+			log4cplus_warning("alloc cache node error");
+			job.set_error(-EIO, CACHE_SVC,
+				      "alloc cache node error");
+			return DTC_CODE_BUFFER_ERROR;
+		}
+		if (empty_node_filter_)
+			empty_node_filter_->CLR(job.int_key());
+	} else {
+		// an existing node that already has rows needs no default row
+		uint32_t uiTotalRows =
+			((DataChunk *)(PtMalloc::instance()->handle_to_ptr(
+				 cache_transaction_node.vd_handle())))
+				->total_rows();
+		if (uiTotalRows != 0)
+			return DTC_CODE_BUFFER_SUCCESS;
+	}
+	RowValue stRowValue(job.table_definition());
+	stRowValue.default_value();
+	RawData stDataRows(&g_stSysMalloc, 1);
+	iRet = stDataRows.do_init(key);
+	if (iRet != 0) {
+		log4cplus_warning("raw data init error: %d, %s", iRet,
+				  stDataRows.get_err_msg());
+		job.set_error(-ENOMEM, CACHE_SVC, "new raw-data error");
+		cache_.purge_node_and_data(key, cache_transaction_node);
+		return DTC_CODE_BUFFER_ERROR;
+	}
+	// a failed insert must not be carried into do_replace_all(), otherwise
+	// the node would be replaced without the default row and still report
+	// success; handle it like the other failure paths in this function
+	iRet = stDataRows.insert_row(stRowValue, false, false);
+	if (iRet != 0) {
+		log4cplus_warning("raw data insert row error: %d, %s", iRet,
+				  stDataRows.get_err_msg());
+		job.set_error(-ENOMEM, CACHE_SVC, "insert default row error");
+		cache_.purge_node_and_data(key, cache_transaction_node);
+		return DTC_CODE_BUFFER_ERROR;
+	}
+	iRet = data_process_->do_replace_all(&cache_transaction_node,
+					     &stDataRows);
+	if (iRet != 0) {
+		log4cplus_debug("replace data error: %d, %s", iRet,
+				stDataRows.get_err_msg());
+		job.set_error(-ENOMEM, CACHE_SVC, "replace data error");
+		// mark the key for the blacklist
+		job.push_black_list_size(stDataRows.data_size());
+		cache_.purge_node_and_data(key, cache_transaction_node);
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	// NOTE(review): the node is purged here but success is still returned,
+	// which matches the same check in buffer_replace_rows -- confirm intent
+	if (cache_transaction_node.vd_handle() == INVALID_HANDLE) {
+		log4cplus_error("BUG: node[%u] vdhandle=0",
+				cache_transaction_node.node_id());
+		cache_.purge_node(job.packed_key(), cache_transaction_node);
+	}
+
+	return DTC_CODE_BUFFER_SUCCESS;
+}
+
+/*
+ * Function	: buffer_get_data
+ * Description	: handle a get request against the cache
+ * Input	: job			request information
+ * Output	: job			response information
+ * Return	: DTC_CODE_BUFFER_SUCCESS when answered from the cache,
+ *		  DTC_CODE_BUFFER_GOTO_NEXT_CHAIN on a miss outside full mode,
+ *		  DTC_CODE_BUFFER_ERROR when do_get() fails; in cache-only mode
+ *		  the result of check_and_expire() may be returned instead
+ */
+BufferResult BufferProcessAskChain::buffer_get_data(DTCJobOperation &job)
+{
+	int iRet;
+
+	log4cplus_debug("buffer_get_data start ");
+	transaction_find_node(job);
+	switch (node_status) {
+	case DTC_CODE_NODE_NOTFOUND:
+		if (full_mode_ == false) {
+			if (job.flag_no_cache() != 0)
+				job.mark_as_pass_thru();
+			return DTC_CODE_BUFFER_GOTO_NEXT_CHAIN;
+		}
+		--stat_get_hits_; // FullCache Missing treat as miss
+			// FullCache Mode: treat as empty & fallthrough
+	case DTC_CODE_NODE_EMPTY:
+		++stat_get_hits_; // cancels the decrement above when reached by fallthrough
+		// empty node found: build the result directly
+		log4cplus_debug("found Empty-Node[%u], response directed",
+				job.int_key());
+		job.prepare_result();
+		job.set_total_rows(0);
+		job.set_result_hit_flag(HIT_SUCCESS);
+		return DTC_CODE_BUFFER_SUCCESS;
+	}
+
+	// any other node status continues here
+	if (dtc_mode_ == DTC_MODE_CACHE_ONLY) {
+		// anything other than GOTO_NEXT_CHAIN from check_and_expire() ends the request here
+		BufferResult cacheRet = check_and_expire(job);
+		if (cacheRet != DTC_CODE_BUFFER_GOTO_NEXT_CHAIN)
+			return cacheRet;
+	}
+	++stat_get_hits_;
+	log4cplus_debug("[%s:%d]cache hit ", __FILE__, __LINE__);
+
+	transaction_update_lru(false, LRU_READ);
+	iRet = data_process_->do_get(job, &cache_transaction_node);
+	if (iRet != 0) {
+		log4cplus_error("do_get() failed");
+		job.set_error_dup(-EIO, CACHE_SVC,
+				  data_process_->get_err_msg());
+		return DTC_CODE_BUFFER_ERROR;
+	}
+	log4cplus_debug(" lru_update_level_:%d,LRU_READ:%d", lru_update_level_,
+			LRU_READ);
+	// Hot Backup
+	if (lru_update_level_ < LRU_READ &&
+	    write_lru_hotbackup_log(job.packed_key())) {
+		// to keep the error from spreading, still answer the client with success
+		log4cplus_error("hb: log lru key failed");
+	}
+	// Hot Backup
+	job.set_result_hit_flag(HIT_SUCCESS);
+	return DTC_CODE_BUFFER_SUCCESS;
+}
+
+/*
+ * Function	: buffer_batch_get_data
+ * Description	: handle a batch get request, one batch cursor at a time
+ * Input	: job			request information
+ * Output	: job			response information
+ * Return	: DTC_CODE_BUFFER_SUCCESS once every cursor was visited,
+ *		  DTC_CODE_BUFFER_ERROR as soon as do_get() fails for a hit
+ */
+BufferResult BufferProcessAskChain::buffer_batch_get_data(DTCJobOperation &job)
+{
+	log4cplus_debug("buffer_batch_get_data start ");
+	job.prepare_result_no_limit();
+
+	int cursor = 0;
+	while (job.set_batch_cursor(cursor) >= 0) {
+		++stat_get_count_;
+		job.set_result_hit_flag(HIT_INIT);
+		transaction_find_node(job);
+
+		if (node_status == DTC_CODE_NODE_EMPTY) {
+			++stat_get_hits_;
+			job.done_batch_cursor(cursor);
+			log4cplus_debug("[%s:%d]cache empty ", __FILE__,
+					__LINE__);
+		} else if (node_status == DTC_CODE_NODE_NOTFOUND) {
+			// a miss only completes the cursor in full mode
+			if (full_mode_)
+				job.done_batch_cursor(cursor);
+			log4cplus_debug("[%s:%d]cache miss ", __FILE__,
+					__LINE__);
+		} else if (node_status == DTC_CODE_NODE_HIT) {
+			++stat_get_hits_;
+			log4cplus_debug("[%s:%d]cache hit ", __FILE__,
+					__LINE__);
+
+			transaction_update_lru(false, LRU_BATCH);
+			const int get_ret = data_process_->do_get(
+				job, &cache_transaction_node);
+			if (get_ret != 0) {
+				log4cplus_error("do_get() failed");
+				job.set_error_dup(-EIO, CACHE_SVC,
+						  data_process_->get_err_msg());
+				return DTC_CODE_BUFFER_ERROR;
+			}
+			job.done_batch_cursor(cursor);
+			// Hot Backup
+			if (lru_update_level_ < LRU_BATCH &&
+			    write_lru_hotbackup_log(job.packed_key())) {
+				// to keep the error from spreading, still answer the client with success
+				log4cplus_error("hb: log lru key failed");
+			}
+		}
+
+		transaction_end();
+		cursor++;
+	}
+	return DTC_CODE_BUFFER_SUCCESS;
+}
+
+/*
+ * Function	: buffer_get_rb
+ * Description	: handle the helper's get read-back task by appending
+ *		  job.result to the job's own result
+ * Input	: job			request information
+ * Output	: job			response information
+ * Return	: DTC_CODE_BUFFER_SUCCESS, or DTC_CODE_BUFFER_ERROR when
+ *		  append_result() returns a negative value
+ */
+BufferResult BufferProcessAskChain::buffer_get_rb(DTCJobOperation &job)
+{
+	log4cplus_debug("buffer_get_rb start ");
+
+	job.prepare_result();
+	const int append_ret = job.append_result(job.result);
+	if (append_ret >= 0) {
+		log4cplus_debug("buffer_get_rb success");
+		return DTC_CODE_BUFFER_SUCCESS;
+	}
+
+	log4cplus_info("job append_result error: %d", append_ret);
+	job.set_error(append_ret, CACHE_SVC, "append_result() error");
+	return DTC_CODE_BUFFER_ERROR;
+}
+
+// After the helper's GET comes back, refresh the in-memory data with job.result.
+// Returns DTC_CODE_BUFFER_ERROR only when the replace fails in cache-only mode;
+// every other path, including a failed node allocation, returns
+// DTC_CODE_BUFFER_SUCCESS.
+BufferResult BufferProcessAskChain::buffer_replace_result(DTCJobOperation &job)
+{
+	int iRet;
+	int oldRows = 0;
+	log4cplus_debug("cache replace all start!");
+	transaction_find_node(job);
+
+	// when the database returned zero rows:
+	// 1. set the bit in the empty-node filter
+	// 2. purge the key and return; no rows are stored for it
+	if (empty_node_filter_ != NULL) {
+		if ((job.result == NULL || job.result->total_rows() == 0)) {
+			log4cplus_debug("SET Empty-Node[%u]", job.int_key());
+			empty_node_filter_->SET(job.int_key());
+			cache_.cache_purge(key);
+			return DTC_CODE_BUFFER_SUCCESS;
+		} else {
+			empty_node_filter_->CLR(job.int_key());
+		}
+	}
+	if (!cache_transaction_node) {
+		// allocation failure is not reported as an error: the data is just not cached
+		if (insert_empty_node() == false)
+			return DTC_CODE_BUFFER_SUCCESS;
+	} else {
+		oldRows = cache_.node_rows_count(cache_transaction_node);
+	}
+	// remember the id now; the node may be purged before it is logged
+	unsigned int uiNodeID = cache_transaction_node.node_id();
+	iRet = data_process_->do_replace_all(job, &cache_transaction_node);
+	if (iRet != 0 || cache_transaction_node.vd_handle() == INVALID_HANDLE) {
+		if (dtc_mode_ == DTC_MODE_CACHE_ONLY) {
+			// UNREACHABLE
+			log4cplus_info("cache replace data error: %d. node: %u",
+				       iRet, uiNodeID);
+			job.set_error_dup(-EIO, CACHE_SVC,
+					  data_process_->get_err_msg());
+			return DTC_CODE_BUFFER_ERROR;
+		}
+		log4cplus_debug("cache replace data error: %d. purge node: %u",
+				iRet, uiNodeID);
+		cache_.purge_node_and_data(key, cache_transaction_node);
+		// NOTE(review): the dirty-row counter is reduced by oldRows here while the
+		// total-row counter is left alone -- confirm this is the intended statistic
+		cache_.inc_dirty_row(0 - oldRows);
+		return DTC_CODE_BUFFER_SUCCESS;
+	}
+	cache_.inc_total_row(data_process_->get_increase_row_count());
+
+	transaction_update_lru(false, LRU_READ);
+	// the hot-backup LRU log is skipped when the node had no rows before and has none now
+	if (oldRows != 0 ||
+	    cache_.node_rows_count(cache_transaction_node) != 0) {
+		// Hot Backup
+		if (lru_update_level_ < LRU_READ &&
+		    write_lru_hotbackup_log(job.packed_key())) {
+			// to keep the error from spreading, still answer the client with success
+			log4cplus_error("hb: log lru key failed");
+		}
+		// Hot Backup
+	}
+
+	log4cplus_debug("buffer_replace_result success! ");
+
+	return DTC_CODE_BUFFER_SUCCESS;
+}
+
+/*
+ * Function	: buffer_flush_data_before_delete
+ * Description	: flush the dirty rows of the job's node and move the node
+ *		  to the clean LRU list; does nothing when the node is
+ *		  missing or already clean
+ * Input	: job			request information
+ * Output	: job			response information
+ * Return	: DTC_CODE_BUFFER_SUCCESS, or DTC_CODE_BUFFER_ERROR when the
+ *		  flush request cannot be created or do_flush() fails
+ */
+BufferResult
+BufferProcessAskChain::buffer_flush_data_before_delete(DTCJobOperation &job)
+{
+	log4cplus_debug("%s", "flush start!");
+	transaction_find_node(job);
+	if (!cache_transaction_node || !(cache_transaction_node.is_dirty())) {
+		log4cplus_debug(
+			"node is null or node is clean,return DTC_CODE_BUFFER_SUCCESS");
+		return DTC_CODE_BUFFER_SUCCESS;
+	}
+	// start at zero so the value is defined even if do_flush() bails out early
+	unsigned int affected_count = 0;
+	// init
+	key_dirty = cache_transaction_node.is_dirty();
+	DTCFlushRequest *flushReq = new DTCFlushRequest(this, key);
+	if (flushReq == NULL) {
+		log4cplus_error("new DTCFlushRequest error: %m");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+	const int iRet = data_process_->do_flush(
+		flushReq, &cache_transaction_node, affected_count);
+	// Dispose of the request on every path, the way buffer_flush_data_timer
+	// does: delete it when nothing was flushed, otherwise commit it and
+	// account for the dirty rows. Previously a failing do_flush() returned
+	// without doing either and leaked the request.
+	if (affected_count == 0) {
+		delete flushReq;
+	} else {
+		commit_flush_request(flushReq, NULL);
+		cache_.inc_dirty_row(
+			data_process_->get_increase_dirty_row_count());
+	}
+	if (iRet != 0) {
+		log4cplus_error("do_flush error:%d", iRet);
+		return DTC_CODE_BUFFER_ERROR;
+	}
+	// the node is clean now: fix the dirty-node count and move it to the clean LRU
+	if (key_dirty)
+		cache_.inc_dirty_node(-1);
+	cache_transaction_node.clr_dirty();
+	cache_.remove_from_lru(cache_transaction_node);
+	cache_.insert_to_clean_lru(cache_transaction_node);
+	// flush statistics are only counted when something was flushed
+	if (affected_count != 0) {
+		++stat_flush_count_;
+		stat_flush_rows_ += affected_count;
+	}
+	return DTC_CODE_BUFFER_SUCCESS;
+}
+
+/*
+ * Function	: buffer_flush_data
+ * Description	: handle a flush request for the job's node; a missing or
+ *		  clean node is reported as success without any work
+ * Input	: job			request information
+ * Output	: job			response information
+ * Return	: the result of the node-level buffer_flush_data(); the flush
+ *		  counters are only updated for DTC_CODE_BUFFER_SUCCESS
+ */
+BufferResult BufferProcessAskChain::buffer_flush_data(DTCJobOperation &job)
+{
+	log4cplus_debug("%s", "flush start!");
+	transaction_find_node(job);
+
+	const bool needs_flush = !!cache_transaction_node &&
+				 cache_transaction_node.is_dirty();
+	if (!needs_flush)
+		return DTC_CODE_BUFFER_SUCCESS;
+
+	unsigned int affected_count;
+	const BufferResult flush_result =
+		buffer_flush_data(cache_transaction_node, &job, affected_count);
+	if (flush_result != DTC_CODE_BUFFER_SUCCESS)
+		return flush_result;
+
+	++stat_flush_count_;
+	stat_flush_rows_ += affected_count;
+	return flush_result;
+}
+
+// Called by "flush next node": flush one node outside of a job transaction.
+// Return codes:
+//   -1  the flush request could not be created
+//   -2  affected_count is 0 and do_flush() returned a negative value
+//   -5  affected_count is non-zero and do_flush() returned non-zero
+//    1  affected_count is 0, node moved to the clean LRU
+//    2  affected_count is non-zero, request committed, node moved to the clean LRU
+int BufferProcessAskChain::buffer_flush_data_timer(Node &node,
+						   unsigned int &affected_count)
+{
+	int result = 0;
+	// init: set up the transaction state by hand for this node
+	transaction_begin(NULL);
+	key_dirty = node.is_dirty();
+	DataChunk *chunk = (DataChunk *)(PtMalloc::instance()->handle_to_ptr(
+		node.vd_handle()));
+	key = chunk->key();
+
+	DTCFlushRequest *request = new DTCFlushRequest(this, key);
+	if (request == NULL) {
+		log4cplus_error("new DTCFlushRequest error: %m");
+		result = -1;
+	} else {
+		const int flush_ret =
+			data_process_->do_flush(request, &node, affected_count);
+		bool flushed_ok;
+		if (affected_count == 0) {
+			// nothing to send: the request is not needed any more
+			delete request;
+			flushed_ok = !(flush_ret < 0);
+			result = flushed_ok ? 1 : -2;
+		} else {
+			commit_flush_request(request, NULL);
+			cache_.inc_dirty_row(
+				data_process_->get_increase_dirty_row_count());
+			flushed_ok = (flush_ret == 0);
+			result = flushed_ok ? 2 : -5;
+		}
+		if (flushed_ok) {
+			// the node is clean now: move it to the clean LRU list
+			if (key_dirty)
+				cache_.inc_dirty_node(-1);
+			node.clr_dirty();
+			cache_.remove_from_lru(node);
+			cache_.insert_to_clean_lru(node);
+		}
+	}
+
+	// clear init
+	CacheTransaction::Free();
+	return result;
+}
+/*
+ * Function	: buffer_flush_data
+ * Description	: flush one node and mark it clean
+ * Input	: node			node to flush
+ *		  pstTask		job that receives errors and is handed to
+ *					commit_flush_request(); may be NULL
+ * Output	: affected_count	filled in by do_flush()
+ * Return	: DTC_CODE_BUFFER_SUCCESS when no flush request is outstanding,
+ *		  DTC_CODE_BUFFER_UNFINISHED when the committed request has a
+ *		  non-zero numReq, DTC_CODE_BUFFER_ERROR on failure
+ */
+BufferResult
+BufferProcessAskChain::buffer_flush_data(Node &node, DTCJobOperation *pstTask,
+					 unsigned int &affected_count)
+{
+	int iRet;
+	// could called by flush timer event, no transactionFindNode called there, can't trust KeyDirty, recal it
+	key_dirty = node.is_dirty();
+	log4cplus_debug("%s", "flush node start!");
+	int flushCnt = 0;
+	DTCFlushRequest *flushReq = NULL;
+	// a flush request is only created in DTC_MODE_DATABASE_ADDITION mode
+	if (dtc_mode_ == DTC_MODE_DATABASE_ADDITION) {
+		flushReq = new DTCFlushRequest(this, key);
+		if (flushReq == NULL) {
+			log4cplus_error("new DTCFlushRequest error: %m");
+			if (pstTask != NULL)
+				pstTask->set_error(-ENOMEM, CACHE_SVC,
+						   "new DTCFlushRequest error");
+			return DTC_CODE_BUFFER_ERROR;
+		}
+	}
+	iRet = data_process_->do_flush(flushReq, &node, affected_count);
+	// NOTE(review): iRet is only examined when a flush request exists; in other
+	// modes a do_flush() failure is not reported -- confirm this is intended
+	if (flushReq) {
+		// read numReq before the request is handed over to commit_flush_request()
+		flushCnt = flushReq->numReq;
+		commit_flush_request(flushReq, pstTask);
+		if (iRet != 0) {
+			log4cplus_error("do_flush() failed while flush data");
+			if (pstTask != NULL)
+				pstTask->set_error_dup(
+					-EIO, CACHE_SVC,
+					data_process_->get_err_msg());
+
+			return DTC_CODE_BUFFER_ERROR;
+		}
+	}
+	cache_.inc_dirty_row(data_process_->get_increase_dirty_row_count());
+	if (key_dirty)
+		cache_.inc_dirty_node(-1);
+	node.clr_dirty();
+	key_dirty = 0;
+	transaction_update_lru(false, LRU_ALWAYS);
+	log4cplus_debug("buffer_flush_data success");
+	if (flushCnt == 0)
+		return DTC_CODE_BUFFER_SUCCESS;
+	else
+		return DTC_CODE_BUFFER_UNFINISHED;
+}
+
+/*
+ * Function	: buffer_purge_data
+ * Description	: handle a purge request: flush the node first when it is
+ *		  dirty and update_mode_ is non-zero, then drop it from the cache
+ * Input	: job			request information
+ * Output	: job			response information
+ * Return	: DTC_CODE_BUFFER_SUCCESS when there was nothing to purge or the
+ *		  node was purged without a pending flush; otherwise the result
+ *		  of buffer_flush_data() (the node is only purged after a flush
+ *		  when that result is DTC_CODE_BUFFER_UNFINISHED)
+ */
+BufferResult BufferProcessAskChain::buffer_purge_data(DTCJobOperation &job)
+{
+	transaction_find_node(job);
+	switch (node_status) {
+	case DTC_CODE_NODE_EMPTY:
+		// guard the filter like every other user in this file does
+		if (empty_node_filter_)
+			empty_node_filter_->CLR(job.int_key());
+		return DTC_CODE_BUFFER_SUCCESS;
+
+	case DTC_CODE_NODE_NOTFOUND:
+		return DTC_CODE_BUFFER_SUCCESS;
+
+	case DTC_CODE_NODE_HIT:
+		break;
+	}
+	BufferResult iRet = DTC_CODE_BUFFER_SUCCESS;
+	if (update_mode_ && cache_transaction_node.is_dirty()) {
+		unsigned int affected_count;
+		iRet = buffer_flush_data(cache_transaction_node, &job,
+					 affected_count);
+		if (iRet != DTC_CODE_BUFFER_UNFINISHED)
+			return iRet;
+	}
+	// read the row count once; it feeds both the drop and the total-row statistics
+	const uint32_t purged_rows =
+		((DataChunk *)(PtMalloc::instance()->handle_to_ptr(
+			 cache_transaction_node.vd_handle())))
+			->total_rows();
+	++stat_drop_count_;
+	stat_drop_rows_ += purged_rows;
+	cache_.inc_total_row(0LL - purged_rows);
+	// remember the id now; it is only used for the failure log below
+	unsigned int uiNodeID = cache_transaction_node.node_id();
+	if (cache_.cache_purge(key) != 0) {
+		log4cplus_error("PANIC: purge node[id=%u] fail", uiNodeID);
+	}
+	return iRet;
+}
+
+/*
+ * Function	: buffer_update_rows
+ * Description	: apply the helper's update job to the cached node
+ * Input	: job			request information
+ *		  async, setrows	forwarded to do_update()
+ * Output	: job			response information
+ * Return	: DTC_CODE_BUFFER_SUCCESS on success, and also when a failed
+ *		  synchronous non-black-hole update is resolved by purging the
+ *		  node; DTC_CODE_BUFFER_ERROR otherwise
+ */
+BufferResult BufferProcessAskChain::buffer_update_rows(DTCJobOperation &job,
+						       bool async, bool setrows)
+{
+	log4cplus_debug("cache update data start! ");
+	if (m_bReplaceEmpty == true) {
+		const BufferResult prepared = insert_default_row(job);
+		if (prepared != DTC_CODE_BUFFER_SUCCESS)
+			return prepared;
+	}
+
+	const int rows_before = cache_.node_rows_count(cache_transaction_node);
+	const int update_ret = data_process_->do_update(
+		job, &cache_transaction_node, log_rows, async, setrows);
+	if (update_ret != 0) {
+		if (!async && !job.flag_black_hole()) {
+			// a synchronous update drops the node instead of failing the job
+			cache_.purge_node_and_data(key, cache_transaction_node);
+			cache_.inc_total_row(0LL - rows_before);
+			return DTC_CODE_BUFFER_SUCCESS;
+		}
+		log4cplus_warning("do_update() failed: %d,%s", update_ret,
+				  data_process_->get_err_msg());
+		job.set_error_dup(-EIO, CACHE_SVC,
+				  data_process_->get_err_msg());
+		transaction_update_lru(async, LRU_ALWAYS);
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	// if update volatile field,node won't be dirty
+	bool mark_dirty = false;
+	if (job.resultInfo.affected_rows() > 0 && job.request_operation() &&
+	    job.request_operation()->has_type_commit()) {
+		// has core field modified
+		mark_dirty = async;
+	}
+	transaction_update_lru(mark_dirty, LRU_WRITE);
+	cache_.inc_dirty_row(data_process_->get_increase_dirty_row_count());
+
+	// Hot Backup
+	// only write log if some non-volatile field got updated
+	// or cache miss and m_bReplaceEmpty is set (equiv insert(default)+update)
+	const bool log_needed = node_status != DTC_CODE_NODE_HIT ||
+				(job.request_operation() &&
+				 job.request_operation()->has_type_commit());
+	if (log_needed && write_hotbackup_log(job, cache_transaction_node,
+					      DTCHotBackup::SYNC_UPDATE)) {
+		// to keep the error from spreading, still answer the client with success
+		log4cplus_error("hb: log update key failed");
+	}
+	return DTC_CODE_BUFFER_SUCCESS;
+}
+
+// Apply a replace job to the cached node. buffer_replace_rows don't allow empty node.
+// Returns DTC_CODE_BUFFER_ERROR only when do_replace() fails for an async or
+// black-hole job; a failed synchronous replace returns DTC_CODE_BUFFER_SUCCESS.
+BufferResult BufferProcessAskChain::buffer_replace_rows(DTCJobOperation &job,
+							bool async,
+							bool setrows)
+{
+	log4cplus_debug("cache replace rows start!");
+	const int rows_before = cache_.node_rows_count(cache_transaction_node);
+	const int replace_ret = data_process_->do_replace(
+		job, &cache_transaction_node, log_rows, async, setrows);
+	if (replace_ret != 0) {
+		// a node that was clean before the transaction is simply dropped
+		if (!key_dirty && !job.flag_black_hole()) {
+			cache_.purge_node_and_data(key, cache_transaction_node);
+			cache_.inc_total_row(0LL - rows_before);
+		}
+		// a synchronous replace command still returns success
+		if (!async && !job.flag_black_hole())
+			return DTC_CODE_BUFFER_SUCCESS;
+		log4cplus_error("cache replace rows error: %d,%s", replace_ret,
+				data_process_->get_err_msg());
+		job.set_error(-EIO, CACHE_SVC, "do_replace_all() error");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	cache_.inc_total_row(data_process_->get_increase_row_count());
+	cache_.inc_dirty_row(data_process_->get_increase_dirty_row_count());
+	transaction_update_lru(async, LRU_WRITE);
+
+	// Hot Backup
+	if (write_hotbackup_log(job, cache_transaction_node,
+				DTCHotBackup::SYNC_UPDATE)) {
+		// to keep the error from spreading, still answer the client with success
+		log4cplus_error("hb: log update key failed");
+	}
+	log4cplus_debug("buffer_replace_rows success! ");
+
+	if (cache_transaction_node.vd_handle() != INVALID_HANDLE)
+		return DTC_CODE_BUFFER_SUCCESS;
+
+	// a node without a data handle after a successful replace is purged
+	log4cplus_error("BUG: node[%u] vdhandle=0",
+			cache_transaction_node.node_id());
+	cache_.purge_node(job.packed_key(), cache_transaction_node);
+	cache_.inc_total_row(0LL - rows_before);
+	return DTC_CODE_BUFFER_SUCCESS;
+}
+
+/*
+ * Function	: buffer_insert_row
+ * Description	: apply the helper's insert job to the cached node, creating
+ *		  the node (with empty raw data) first when it does not exist
+ * Input	: job			request information
+ *		  async, setrows	forwarded to do_append()
+ * Output	: job			response information
+ * Return	: DTC_CODE_BUFFER_SUCCESS on success, and also when a failure is
+ *		  resolved by leaving the key uncached or purging the node;
+ *		  DTC_CODE_BUFFER_ERROR otherwise (error is set on job)
+ */
+BufferResult BufferProcessAskChain::buffer_insert_row(DTCJobOperation &job,
+						      bool async, bool setrows)
+{
+	int iRet;
+	// true when the node had to be created by this call
+	bool emptyFlag = false;
+	if (!cache_transaction_node) {
+		emptyFlag = true;
+		if (insert_empty_node() == false) {
+			if (async == true || job.flag_black_hole()) {
+				job.set_error(
+					-EIO, CACHE_SVC,
+					"allocate_node Error while insert row");
+				return DTC_CODE_BUFFER_ERROR;
+			}
+			// a synchronous insert simply stays uncached
+			return DTC_CODE_BUFFER_SUCCESS;
+		}
+		RawData stDataRows(&g_stSysMalloc, 1);
+		iRet = stDataRows.do_init(key);
+		if (iRet != 0) {
+			log4cplus_warning("raw data init error: %d, %s", iRet,
+					  stDataRows.get_err_msg());
+			job.set_error(-ENOMEM, CACHE_SVC, "new raw-data error");
+			cache_.purge_node_and_data(key, cache_transaction_node);
+			return DTC_CODE_BUFFER_ERROR;
+		}
+		iRet = data_process_->do_replace_all(&cache_transaction_node,
+						     &stDataRows);
+		if (iRet != 0) {
+			// report the step that actually failed; this path used to
+			// repeat the "raw data init" / "new raw-data" messages of
+			// the do_init() failure above
+			log4cplus_warning("replace data error: %d, %s", iRet,
+					  stDataRows.get_err_msg());
+			job.set_error(-ENOMEM, CACHE_SVC, "replace data error");
+			cache_.purge_node_and_data(key, cache_transaction_node);
+			return DTC_CODE_BUFFER_ERROR;
+		}
+		if (empty_node_filter_)
+			empty_node_filter_->CLR(job.int_key());
+	}
+	int oldRows = cache_.node_rows_count(cache_transaction_node);
+	iRet = data_process_->do_append(job, &cache_transaction_node, log_rows,
+					async, setrows);
+	if (iRet == -1062) {
+		// -1062 from do_append() is reported as a duplicate unique key
+		job.set_error(-ER_DUP_ENTRY, CACHE_SVC,
+			      "duplicate unique key detected");
+		return DTC_CODE_BUFFER_ERROR;
+	} else if (iRet != 0) {
+		if ((async == false && !job.flag_black_hole()) || emptyFlag) {
+			// synchronous inserts and freshly created nodes are purged
+			// instead of failing the job
+			log4cplus_debug("do_append() failed, purge now [%d %s]",
+					iRet, data_process_->get_err_msg());
+			cache_.inc_total_row(0LL - oldRows);
+			cache_.purge_node_and_data(key, cache_transaction_node);
+			return DTC_CODE_BUFFER_SUCCESS;
+		} else {
+			log4cplus_error("do_append() failed while update data");
+			job.set_error_dup(-EIO, CACHE_SVC,
+					  data_process_->get_err_msg());
+			return DTC_CODE_BUFFER_ERROR;
+		}
+	}
+	transaction_update_lru(async, LRU_WRITE);
+	cache_.inc_total_row(data_process_->get_increase_row_count());
+	// the dirty-row counter is only updated for async inserts
+	if (async == true)
+		cache_.inc_dirty_row(
+			data_process_->get_increase_dirty_row_count());
+	// Hot Backup
+	if (write_hotbackup_log(job, cache_transaction_node,
+				DTCHotBackup::SYNC_INSERT)) {
+		// to keep the error from spreading, still answer the client with success
+		log4cplus_error("hb: log update key failed");
+	}
+	// Hot Backup
+	log4cplus_debug("buffer_insert_row success");
+	return DTC_CODE_BUFFER_SUCCESS;
+}
+
+/*
+ * Function	: buffer_delete_rows
+ * Description	: handle a delete request against the cached node; the whole
+ *		  node is purged when the job covers all rows or when the
+ *		  delete leaves the node with zero rows
+ * Input	: job			request information
+ * Output	: job			response information
+ * Return	: DTC_CODE_BUFFER_SUCCESS, or DTC_CODE_BUFFER_ERROR when
+ *		  do_delete() fails
+ */
+BufferResult BufferProcessAskChain::buffer_delete_rows(DTCJobOperation &job)
+{
+	int iRet;
+	log4cplus_debug("buffer_delete_rows start! ");
+	uint32_t oldRows = cache_.node_rows_count(cache_transaction_node);
+	int all_row_delete = job.all_rows();
+	// without a delete condition the whole node is removed
+	if (job.all_rows() != 0) {
+	empty: // also entered by goto below when a conditional delete empties the node
+		if (lossy_mode_ || job.flag_black_hole()) {
+			job.resultInfo.set_affected_rows(oldRows);
+		}
+		// row cnt statistic dec by 1
+		cache_.inc_total_row(0LL - oldRows);
+		// dirty node cnt staticstic dec by 1
+		if (key_dirty) {
+			cache_.inc_dirty_node(-1);
+		}
+		// dirty row cnt statistic dec, if count dirty row error, let statistic wrong with it
+		// NOTE(review): the comment above says "dec" but a positive count is passed to
+		// inc_dirty_row() in the all-rows branch -- confirm the intended sign
+		if (all_row_delete) {
+			int old_dirty_rows = data_process_->get_dirty_row_count(
+				job, &cache_transaction_node);
+			if (old_dirty_rows > 0)
+				cache_.inc_dirty_row(old_dirty_rows);
+		} else {
+			cache_.inc_dirty_row(
+				data_process_->get_increase_dirty_row_count());
+		}
+		cache_.purge_node_and_data(key, cache_transaction_node);
+		if (empty_node_filter_)
+			empty_node_filter_->SET(job.int_key());
+		// Hot Backup
+		Node stEmpytNode;
+		if (write_hotbackup_log(job, stEmpytNode,
+					DTCHotBackup::SYNC_PURGE))
+		//		if(hbLog.write_update_key(job.packed_key(), DTCHotBackup::SYNC_UPDATE))
+		{
+			// to keep the error from spreading, still answer the client with success
+			log4cplus_error("hb: log update key failed");
+		}
+		// Hot Backup
+
+		return DTC_CODE_BUFFER_SUCCESS;
+	}
+	// delete error handle is too simple, statistic can not trust if error happen here
+	iRet = data_process_->do_delete(job, &cache_transaction_node, log_rows);
+	if (iRet != 0) {
+		log4cplus_error("do_delete() failed: %d,%s", iRet,
+				data_process_->get_err_msg());
+		job.set_error_dup(-EIO, CACHE_SVC,
+				  data_process_->get_err_msg());
+		// only a node that was clean before the transaction is purged on failure
+		if (!key_dirty) {
+			cache_.inc_total_row(0LL - oldRows);
+			cache_.purge_node_and_data(key, cache_transaction_node);
+		}
+		return DTC_CODE_BUFFER_ERROR;
+	}
+	// Delete to empty
+	uint32_t uiTotalRows =
+		((DataChunk *)(PtMalloc::instance()->handle_to_ptr(
+			 cache_transaction_node.vd_handle())))
+			->total_rows();
+	if (uiTotalRows == 0)
+		goto empty; // jumps back into the whole-node path with all_row_delete == 0
+
+	cache_.inc_dirty_row(data_process_->get_increase_dirty_row_count());
+	cache_.inc_total_row(data_process_->get_increase_row_count());
+
+	transaction_update_lru(false, LRU_WRITE);
+	// Hot Backup
+	if (write_hotbackup_log(job, cache_transaction_node,
+				DTCHotBackup::SYNC_DELETE)) {
+		// to keep the error from spreading, still answer the client with success
+		log4cplus_error("hb: log update key failed");
+	}
+	// Hot Backup
+	return DTC_CODE_BUFFER_SUCCESS;
+}
+
+/*
+ * Decide whether one more row may be inserted into the node currently held
+ * in cache_transaction_node.
+ *
+ * Records an error on the job and returns DTC_CODE_BUFFER_ERROR when either
+ *   - the key is configured as a unique field and the node already has rows, or
+ *   - node_rows_limit_ is positive and the node has reached that many rows.
+ * Returns DTC_CODE_BUFFER_SUCCESS otherwise.
+ */
+BufferResult BufferProcessAskChain::check_allowed_insert(DTCJobOperation &job)
+{
+	const int existing_rows = cache_.node_rows_count(cache_transaction_node);
+	BufferResult verdict = DTC_CODE_BUFFER_SUCCESS;
+
+	if (table_define_infomation_->key_as_uniq_field() &&
+	    existing_rows != 0) {
+		// a unique key can own at most one row
+		job.set_error(-ER_DUP_ENTRY, CACHE_SVC,
+			      "duplicate unique key detected");
+		verdict = DTC_CODE_BUFFER_ERROR;
+	} else if (node_rows_limit_ > 0 &&
+		   existing_rows >= node_rows_limit_) {
+		// the per-node row quota is already used up
+		job.set_error(
+			-EC_NOT_ALLOWED_INSERT, __FUNCTION__,
+			"rows exceed limit, not allowed insert any more data");
+		verdict = DTC_CODE_BUFFER_ERROR;
+	}
+	return verdict;
+}
+
+/*
+ * Pre-check for a synchronous insert before the job is passed further down
+ * the chain.
+ *
+ * Inserts are rejected outright in replace-empty mode. When the key is a
+ * unique field or a per-node row limit is configured, the node is looked up
+ * and, on a hit, check_allowed_insert() is consulted. If nothing objects the
+ * function returns DTC_CODE_BUFFER_GOTO_NEXT_CHAIN.
+ */
+BufferResult
+BufferProcessAskChain::buffer_sync_insert_precheck(DTCJobOperation &job)
+{
+	log4cplus_debug("%s", "buffer_sync_insert begin");
+
+	// insert is not supported in replace-empty mode
+	if (m_bReplaceEmpty == true) {
+		job.set_error(
+			-EC_BAD_COMMAND, CACHE_SVC,
+			"insert cmd from client, not support under replace mode");
+		log4cplus_info(
+			"insert cmd from client, not support under replace mode");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	const bool lookup_required =
+		table_define_infomation_->key_as_uniq_field() ||
+		node_rows_limit_ > 0;
+	if (!lookup_required)
+		return DTC_CODE_BUFFER_GOTO_NEXT_CHAIN;
+
+	transaction_find_node(job);
+	if (node_status != DTC_CODE_NODE_HIT)
+		return DTC_CODE_BUFFER_GOTO_NEXT_CHAIN;
+
+	// only a cached node can violate the uniqueness / row-limit rules
+	if (check_allowed_insert(job) == DTC_CODE_BUFFER_ERROR)
+		return DTC_CODE_BUFFER_ERROR;
+	return DTC_CODE_BUFFER_GOTO_NEXT_CHAIN;
+}
+
+/*
+ * Apply a synchronous insert to the cache.
+ *
+ * Rejected in replace-empty mode. If the job carries a positive insert id
+ * the packed key is refreshed from it first. A key that is not cached is left
+ * alone (DTC_CODE_BUFFER_SUCCESS); for an empty or hit node the row is
+ * inserted through buffer_insert_row(). In lossy mode the job's error and
+ * affected-row count are reset before inserting.
+ */
+BufferResult BufferProcessAskChain::buffer_sync_insert(DTCJobOperation &job)
+{
+	log4cplus_debug("%s", "buffer_sync_insert begin");
+
+	// insert is not supported in replace-empty mode
+	if (m_bReplaceEmpty == true) {
+		job.set_error(
+			-EC_BAD_COMMAND, CACHE_SVC,
+			"insert cmd from client, not support under replace mode");
+		log4cplus_info(
+			"insert cmd from client, not support under replace mode");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	// when the auto-increment field is the key, the key must follow the new id
+	if (job.resultInfo.insert_id() > 0)
+		job.update_packed_key(job.resultInfo.insert_id());
+
+	transaction_find_node(job);
+
+	// a missing node is a no-op
+	if (node_status == DTC_CODE_NODE_NOTFOUND)
+		return DTC_CODE_BUFFER_SUCCESS;
+
+	const bool node_cached = (node_status == DTC_CODE_NODE_EMPTY ||
+				  node_status == DTC_CODE_NODE_HIT);
+	if (node_cached && lossy_mode_) {
+		job.set_error(0, NULL, NULL);
+		job.resultInfo.set_affected_rows(0);
+	}
+
+	return buffer_insert_row(job, false /* async */,
+				 lossy_mode_ /* setrows */);
+}
+
+/*
+ * Apply a synchronous update to the cache.
+ *
+ * Nothing is done when the job has no update operation. Outside lossy mode,
+ * an affected-row count of zero also ends processing, unless the update has
+ * no commit-type field or its condition contains a timestamp-type field; in
+ * those two cases the count is ignored and the cache sets the row count
+ * itself. A missing node is a no-op, an empty node is only updated in
+ * replace-empty mode, and a hit node is updated via buffer_update_rows().
+ */
+BufferResult BufferProcessAskChain::buffer_sync_update(DTCJobOperation &job)
+{
+	bool setrows = lossy_mode_;
+	log4cplus_debug("%s", "buffer_sync_update begin");
+
+	// no field needs updating
+	if (job.request_operation() == NULL)
+		return DTC_CODE_BUFFER_SUCCESS;
+
+	if (!setrows && job.resultInfo.affected_rows() == 0) {
+		// pure volatile update: ignore upstream affected-rows
+		const bool pure_volatile =
+			(job.request_operation()->has_type_commit() == 0);
+		// update based on timestamp fields: ignore upstream affected-rows
+		const bool timestamp_based =
+			!pure_volatile && job.request_condition() &&
+			job.request_condition()->has_type_timestamp();
+
+		if (!pure_volatile && !timestamp_based) {
+			log4cplus_debug("%s", "helper's affected rows is zero");
+			// the helper updated nothing, so there is nothing to mirror
+			return DTC_CODE_BUFFER_SUCCESS;
+		}
+		setrows = true;
+	}
+
+	transaction_find_node(job);
+
+	// a missing node is always a no-op
+	if (node_status == DTC_CODE_NODE_NOTFOUND)
+		return DTC_CODE_BUFFER_SUCCESS;
+
+	if (node_status == DTC_CODE_NODE_EMPTY) {
+		// an empty node is a no-op too, except in replace-empty mode
+		if (!(m_bReplaceEmpty == true)) {
+			if (lossy_mode_) {
+				job.set_error(0, NULL, NULL);
+				job.resultInfo.set_affected_rows(0);
+			}
+			return DTC_CODE_BUFFER_SUCCESS;
+		}
+	} else if (node_status == DTC_CODE_NODE_HIT) {
+		if (lossy_mode_) {
+			job.set_error(0, NULL, NULL);
+			job.resultInfo.set_affected_rows(0);
+		}
+	}
+
+	return buffer_update_rows(job, false /*Async*/, setrows);
+}
+
+/*
+ * Apply a synchronous replace to the cache.
+ *
+ * Outside lossy mode an affected-row count of zero ends processing. A
+ * missing node is a no-op, an empty node receives the row through
+ * buffer_insert_row(), and a hit node is rewritten through
+ * buffer_replace_rows(). In lossy mode the job's error and affected-row
+ * count are reset before the cache is touched.
+ */
+BufferResult BufferProcessAskChain::buffer_sync_replace(DTCJobOperation &job)
+{
+	const int setrows = lossy_mode_;
+	log4cplus_debug("%s", "buffer_sync_replace begin");
+
+	// the helper changed nothing, so there is nothing to mirror
+	if (lossy_mode_ == false && job.resultInfo.affected_rows() == 0) {
+		log4cplus_debug("%s", "helper's affected rows is zero");
+		return DTC_CODE_BUFFER_SUCCESS;
+	}
+
+	transaction_find_node(job);
+
+	if (node_status == DTC_CODE_NODE_NOTFOUND)
+		return DTC_CODE_BUFFER_SUCCESS;
+
+	const bool node_empty = (node_status == DTC_CODE_NODE_EMPTY);
+	const bool node_hit = (node_status == DTC_CODE_NODE_HIT);
+	if ((node_empty || node_hit) && lossy_mode_) {
+		job.set_error(0, NULL, NULL);
+		job.resultInfo.set_affected_rows(0);
+	}
+
+	// empty node: insert; anything else: replace
+	if (node_empty)
+		return buffer_insert_row(job, false, setrows);
+	return buffer_replace_rows(job, false, lossy_mode_);
+}
+
+/*
+ * Apply a synchronous delete to the cache.
+ *
+ * The affected-row count is deliberately not inspected. Missing and empty
+ * nodes are no-ops (in lossy mode an empty node additionally resets the
+ * job's error and affected-row count); any other state goes on to
+ * buffer_delete_rows().
+ */
+BufferResult BufferProcessAskChain::buffer_sync_delete(DTCJobOperation &job)
+{
+	log4cplus_debug("%s", "buffer_sync_delete begin");
+
+	transaction_find_node(job);
+
+	if (node_status == DTC_CODE_NODE_NOTFOUND)
+		return DTC_CODE_BUFFER_SUCCESS;
+
+	if (node_status == DTC_CODE_NODE_EMPTY) {
+		if (lossy_mode_) {
+			job.set_error(0, NULL, NULL);
+			job.resultInfo.set_affected_rows(0);
+		}
+		return DTC_CODE_BUFFER_SUCCESS;
+	}
+
+	return buffer_delete_rows(job);
+}
+
+/*
+ * Insert handler used when the cache itself holds the data (also called for
+ * black-hole jobs from reply_connector_answer()).
+ *
+ * Rejected in replace-empty mode. A hit node is first run through
+ * check_and_expire(): if that purged the node it is treated as not found,
+ * otherwise check_allowed_insert() must agree. The key expire time is then
+ * refreshed from the request and the row is inserted.
+ */
+BufferResult BufferProcessAskChain::buffer_nodb_insert(DTCJobOperation &job)
+{
+	log4cplus_debug("%s", "buffer_asyn_prepare_insert begin");
+
+	// insert is not supported in replace-empty mode
+	if (m_bReplaceEmpty == true) {
+		job.set_error(
+			-EC_BAD_COMMAND, CACHE_SVC,
+			"insert cmd from client, not support under replace mode");
+		log4cplus_info(
+			"insert cmd from client, not support under replace mode");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	transaction_find_node(job);
+	if (node_status == DTC_CODE_NODE_HIT) {
+		const BufferResult expire_ret = check_and_expire(job);
+		if (expire_ret == DTC_CODE_BUFFER_ERROR)
+			return expire_ret;
+
+		if (expire_ret == DTC_CODE_BUFFER_SUCCESS) {
+			// the node was expired and purged: continue as a miss
+			node_status = DTC_CODE_NODE_NOTFOUND;
+			cache_transaction_node = Node();
+		} else if (check_allowed_insert(job) ==
+			   DTC_CODE_BUFFER_ERROR) {
+			// still a live hit, and the insert is not permitted
+			return DTC_CODE_BUFFER_ERROR;
+		}
+	}
+
+	// update key expire time
+	const bool expire_time_bad =
+		job.request_operation() &&
+		job.update_key_expire_time(max_expire_time_) != 0;
+	if (expire_time_bad) {
+		job.set_error(-EC_BAD_INVALID_FIELD, CACHE_SVC,
+			      "key expire time illegal");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	return buffer_insert_row(job, false /* async */, true /* setrows */);
+}
+
+/*
+ * Update handler used when the cache itself holds the data.
+ *
+ * Missing and empty nodes are no-ops unless replace-empty mode is on. The
+ * node is then run through check_and_expire(); any result other than
+ * DTC_CODE_BUFFER_GOTO_NEXT_CHAIN is returned as is. Finally the key expire
+ * time is refreshed from the request and the rows are updated.
+ */
+BufferResult BufferProcessAskChain::buffer_nodb_update(DTCJobOperation &job)
+{
+	log4cplus_debug("%s", "buffer_fullmode_prepare_update begin");
+
+	transaction_find_node(job);
+
+	const bool no_data = (node_status == DTC_CODE_NODE_NOTFOUND ||
+			      node_status == DTC_CODE_NODE_EMPTY);
+	if (no_data && !(m_bReplaceEmpty == true))
+		return DTC_CODE_BUFFER_SUCCESS;
+
+	const BufferResult expire_ret = check_and_expire(job);
+	if (expire_ret != DTC_CODE_BUFFER_GOTO_NEXT_CHAIN)
+		return expire_ret;
+
+	// update key expire time
+	const bool expire_time_bad =
+		job.request_operation() &&
+		job.update_key_expire_time(max_expire_time_) != 0;
+	if (expire_time_bad) {
+		job.set_error(-EC_BAD_INVALID_FIELD, CACHE_SVC,
+			      "key expire time illegal");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	return buffer_update_rows(job, false /*Async*/, true /*setrows*/);
+}
+
+/*
+ * Replace handler used when the cache itself holds the data.
+ *
+ * After refreshing the key expire time from the request, missing and empty
+ * nodes receive the row via buffer_insert_row(). A hit node is checked with
+ * check_and_expire(): an error is passed back, an expired (purged) node is
+ * handled like a miss, and a live node is rewritten via
+ * buffer_replace_rows().
+ */
+BufferResult BufferProcessAskChain::buffer_nodb_replace(DTCJobOperation &job)
+{
+	log4cplus_debug("%s", "buffer_asyn_prepare_replace begin");
+
+	transaction_find_node(job);
+
+	// update key expire time
+	const bool expire_time_bad =
+		job.request_operation() &&
+		job.update_key_expire_time(max_expire_time_) != 0;
+	if (expire_time_bad) {
+		job.set_error(-EC_BAD_INVALID_FIELD, CACHE_SVC,
+			      "key expire time illegal");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	// missing & empty: plain insert
+	if (node_status == DTC_CODE_NODE_EMPTY ||
+	    node_status == DTC_CODE_NODE_NOTFOUND)
+		return buffer_insert_row(job, false, true /* setrows */);
+
+	switch (check_and_expire(job)) {
+	case DTC_CODE_BUFFER_ERROR:
+		return DTC_CODE_BUFFER_ERROR;
+	case DTC_CODE_BUFFER_SUCCESS:
+		// the old node was expired and purged: start from scratch
+		node_status = DTC_CODE_NODE_NOTFOUND;
+		cache_transaction_node = Node();
+		return buffer_insert_row(job, false, true /* setrows */);
+	default:
+		break;
+	}
+
+	return buffer_replace_rows(job, false, true);
+}
+
+/*
+ * Delete handler used when the cache itself holds the data: missing and
+ * empty nodes are no-ops, everything else goes to buffer_delete_rows().
+ */
+BufferResult BufferProcessAskChain::buffer_nodb_delete(DTCJobOperation &job)
+{
+	log4cplus_debug("%s", "buffer_fullmode_delete begin");
+
+	transaction_find_node(job);
+
+	const bool nothing_cached = (node_status == DTC_CODE_NODE_NOTFOUND ||
+				     node_status == DTC_CODE_NODE_EMPTY);
+	if (nothing_cached)
+		return DTC_CODE_BUFFER_SUCCESS;
+
+	return buffer_delete_rows(job);
+}
+
+/*
+ * Insert handler for asynchronous update mode.
+ *
+ * Rejected in replace-empty mode. The job is handed to the next chain
+ * (DTC_CODE_BUFFER_GOTO_NEXT_CHAIN) when
+ *   - the node is not cached and either full mode is off or the update mode
+ *     is MODE_FLUSH,
+ *   - the node is empty and the update mode is MODE_FLUSH, or
+ *   - the node is a hit, the update mode is MODE_FLUSH and the node is clean.
+ * A hit node must also pass check_allowed_insert(). In every remaining case
+ * the insert-hit counter is bumped and the row is inserted asynchronously.
+ */
+BufferResult BufferProcessAskChain::buffer_async_insert(DTCJobOperation &job)
+{
+	log4cplus_debug("%s", "buffer_async_insert begin");
+
+	// insert is not supported in replace-empty mode
+	if (m_bReplaceEmpty == true) {
+		job.set_error(
+			-EC_BAD_COMMAND, CACHE_SVC,
+			"insert cmd from client, not support under replace mode");
+		log4cplus_info(
+			"insert cmd from client, not support under replace mode");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	transaction_find_node(job);
+	const bool flush_mode = (update_mode_ == MODE_FLUSH);
+
+	if (node_status == DTC_CODE_NODE_NOTFOUND) {
+		if (full_mode_ == false || flush_mode)
+			return DTC_CODE_BUFFER_GOTO_NEXT_CHAIN;
+	} else if (node_status == DTC_CODE_NODE_EMPTY) {
+		if (flush_mode)
+			return DTC_CODE_BUFFER_GOTO_NEXT_CHAIN;
+	} else if (node_status == DTC_CODE_NODE_HIT) {
+		if (check_allowed_insert(job) == DTC_CODE_BUFFER_ERROR)
+			return DTC_CODE_BUFFER_ERROR;
+		if (flush_mode && !(cache_transaction_node.is_dirty()))
+			return DTC_CODE_BUFFER_GOTO_NEXT_CHAIN;
+	}
+
+	log4cplus_debug("%s", "buffer_async_insert data begin");
+	// count inserts that are served by the cache
+	++stat_insert_hits_;
+
+	return buffer_insert_row(job, true /* async */, true /* setrows */);
+}
+
+/*
+ * Update handler for asynchronous update mode.
+ *
+ * A node that is not cached goes to the next chain when full mode is off;
+ * otherwise it is treated like an empty node. Empty nodes are no-ops unless
+ * replace-empty mode is on, in which case MODE_FLUSH forwards the job and
+ * other modes update the cache. A hit node is forwarded only when the update
+ * mode is MODE_FLUSH and the node is clean. All remaining cases bump the
+ * update-hit counter and update the rows asynchronously.
+ */
+BufferResult BufferProcessAskChain::buffer_async_update(DTCJobOperation &job)
+{
+	log4cplus_debug("%s", "buffer_asyn_update begin");
+
+	transaction_find_node(job);
+	const bool flush_mode = (update_mode_ == MODE_FLUSH);
+
+	if (node_status == DTC_CODE_NODE_NOTFOUND && full_mode_ == false)
+		return DTC_CODE_BUFFER_GOTO_NEXT_CHAIN;
+
+	if (node_status == DTC_CODE_NODE_NOTFOUND ||
+	    node_status == DTC_CODE_NODE_EMPTY) {
+		if (!(m_bReplaceEmpty == true))
+			return DTC_CODE_BUFFER_SUCCESS;
+		if (flush_mode)
+			return DTC_CODE_BUFFER_GOTO_NEXT_CHAIN;
+	} else if (node_status == DTC_CODE_NODE_HIT) {
+		if (flush_mode && !(cache_transaction_node.is_dirty()))
+			return DTC_CODE_BUFFER_GOTO_NEXT_CHAIN;
+	}
+
+	log4cplus_debug("%s", "buffer_async_update update data begin");
+	// count updates that are served by the cache
+	++stat_update_hits_;
+	return buffer_update_rows(job, true /*Async*/, true /*setrows*/);
+}
+
+/*
+ * Replace handler for asynchronous update mode.
+ *
+ * Not cached: forwarded to the next chain unless full mode is on, the update
+ *             mode is not MODE_FLUSH and the key is a unique field; in that
+ *             case the row is inserted asynchronously.
+ * Empty     : forwarded in MODE_FLUSH, otherwise inserted asynchronously.
+ * Hit       : forwarded when in MODE_FLUSH with a clean node, otherwise the
+ *             rows are replaced asynchronously.
+ */
+BufferResult BufferProcessAskChain::buffer_async_replace(DTCJobOperation &job)
+{
+	log4cplus_debug("%s", "buffer_asyn_prepare_replace begin");
+
+	transaction_find_node(job);
+	const bool flush_mode = (update_mode_ == MODE_FLUSH);
+
+	if (node_status == DTC_CODE_NODE_NOTFOUND) {
+		if (full_mode_ == false || flush_mode ||
+		    table_define_infomation_->key_as_uniq_field() == false)
+			return DTC_CODE_BUFFER_GOTO_NEXT_CHAIN;
+		return buffer_insert_row(job, true, true);
+	}
+
+	if (node_status == DTC_CODE_NODE_EMPTY) {
+		if (flush_mode)
+			return DTC_CODE_BUFFER_GOTO_NEXT_CHAIN;
+		return buffer_insert_row(job, true, true);
+	}
+
+	if (node_status == DTC_CODE_NODE_HIT && flush_mode &&
+	    !(cache_transaction_node.is_dirty()))
+		return DTC_CODE_BUFFER_GOTO_NEXT_CHAIN;
+
+	return buffer_replace_rows(job, true, true);
+}
+
+/*
+ * Function    : deal_single_database_addition_ask
+ * Description : Dispatch one incoming (non-batch) job by its request code.
+ *               job_ask_procedure() calls it when dtc_mode_ is
+ *               DTC_MODE_DATABASE_ADDITION.
+ * Input       : job - the request to process
+ * Output      : job - errors and results are recorded on the job
+ * Return      : a BufferResult code, not 0 / -1:
+ *               DTC_CODE_BUFFER_ERROR for rejected or unknown commands,
+ *               DTC_CODE_BUFFER_GOTO_NEXT_CHAIN when the job is to be passed
+ *               on by the caller, DTC_CODE_BUFFER_SUCCESS for a Flush when
+ *               update_mode_ is zero, otherwise whatever the delegated
+ *               buffer_* handler returns.
+ */
+BufferResult
+BufferProcessAskChain::deal_single_database_addition_ask(DTCJobOperation &job)
+{
+	job.renew_timestamp();
+	error_message_[0] = 0;
+	job.field_type(0);
+	// fetch the command code
+	int iCmd = job.request_code();
+	log4cplus_debug(
+		"BufferProcessAskChain::deal_single_database_addition_ask cmd is %d ",
+		iCmd);
+	switch (iCmd) {
+	case DRequest::Get:
+		// set hit flag init status
+		job.set_result_hit_flag(HIT_INIT);
+		if (job.count_only() && (job.requestInfo.limit_start() ||
+					 job.requestInfo.limit_count())) {
+			job.set_error(
+				-EC_BAD_COMMAND, CACHE_SVC,
+				"There's nothing to limit because no fields required");
+			return DTC_CODE_BUFFER_ERROR;
+		}
+		// if the key is blacklisted, purge the current node and go pass-through
+		if (black_list_->in_blacklist(job.packed_key())) {
+			/* 
+				 * In theory a blacklisted node can never be in the cache;
+				 * purge it up front anyway to guard against anomalies.
+				 */
+			log4cplus_debug(
+				"blacklist hit, passthough to datasource");
+			buffer_purge_data(job);
+			job.mark_as_pass_thru();
+			return DTC_CODE_BUFFER_GOTO_NEXT_CHAIN;
+		}
+		log4cplus_debug("blacklist miss, normal process");
+		++stat_get_count_;
+		return buffer_get_data(job);
+	case DRequest::Insert:
+		++stat_insert_count_;
+		if (update_mode_ == MODE_ASYNC && insert_mode_ != MODE_SYNC)
+			return buffer_async_insert(job);
+		// the job will be submitted to the helper (after the precheck)
+		return buffer_sync_insert_precheck(job);
+	case DRequest::Update:
+		++stat_update_count_;
+		if (update_mode_)
+			return buffer_async_update(job);
+		// the job will be submitted to the helper
+		return DTC_CODE_BUFFER_GOTO_NEXT_CHAIN;
+		// when the client sends Delete: drop the data in the cache and submit to the helper
+		// for now the async cache does not support the Delete operation
+	case DRequest::Delete:
+		if (update_mode_ != MODE_SYNC) {
+			if (job.request_condition() &&
+			    job.request_condition()->has_type_rw()) {
+				job.set_error(
+					-EC_BAD_ASYNC_CMD, CACHE_SVC,
+					"Delete base non ReadOnly fields");
+				return DTC_CODE_BUFFER_ERROR;
+			}
+			// flush before an asynchronous delete
+			BufferResult iRet = DTC_CODE_BUFFER_SUCCESS;
+			iRet = buffer_flush_data_before_delete(job);
+			if (iRet == DTC_CODE_BUFFER_ERROR)
+				return iRet;
+		}
+		// delete goes straight to the DB; the original logic is left unchanged
+		++stat_delete_count_;
+		// the job will be submitted to the helper
+		return DTC_CODE_BUFFER_GOTO_NEXT_CHAIN;
+	case DRequest::Purge:
+		// remove the cached data of the given key
+		++stat_purge_count_;
+		return buffer_purge_data(job);
+	case DRequest::Flush:
+		if (update_mode_)
+			// flush the cached data of the given key
+			return buffer_flush_data(job);
+		else
+			return DTC_CODE_BUFFER_SUCCESS;
+	case DRequest::Replace:
+		// (original note) evicted data is not processed
+		++stat_update_count_;
+		// replace is only allowed when the key fields are part of the unique fields
+		if (!(job.table_definition()->key_part_of_uniq_field()) ||
+		    job.table_definition()->has_auto_increment()) {
+			job.set_error(
+				-EC_BAD_COMMAND, CACHE_SVC,
+				"replace cmd require key fields part of uniq-fields and no auto-increment field");
+			return DTC_CODE_BUFFER_ERROR;
+		}
+		if (update_mode_)
+			return buffer_async_replace(job);
+		// the job will be submitted to the helper
+		return DTC_CODE_BUFFER_GOTO_NEXT_CHAIN;
+	case DRequest::TYPE_SYSTEM_COMMAND:
+		return buffer_process_admin(job);
+	default:
+		job.set_error(-EC_BAD_COMMAND, CACHE_SVC,
+			      "invalid cmd from client");
+		log4cplus_info("invalid cmd[%d] from client", iCmd);
+		break;
+	}
+	return DTC_CODE_BUFFER_ERROR;
+}
+
+/*
+ * Function    : deal_batch_database_addition_ask
+ * Description : Handle an incoming batch job.
+ *               When node_empty_limit_ is set, a batch whose size times ten
+ *               exceeds that limit is rejected. A Get batch is served by
+ *               buffer_batch_get_data(); every other command is reported as
+ *               success so that it can fall back to split mode.
+ * Input       : job - the batch request
+ * Output      : job - errors and results are recorded on the job
+ * Return      : DTC_CODE_BUFFER_ERROR when the batch is too large, the
+ *               result of buffer_batch_get_data() for Get, otherwise
+ *               DTC_CODE_BUFFER_SUCCESS.
+ */
+BufferResult
+BufferProcessAskChain::deal_batch_database_addition_ask(DTCJobOperation &job)
+{
+	job.renew_timestamp();
+	error_message_[0] = 0;
+
+	// fetch the command code
+	const int command = job.request_code();
+
+	if (node_empty_limit_) {
+		const int batch_size = job.get_batch_size();
+		if (batch_size * 10 > node_empty_limit_) {
+			job.set_error(-EC_TOO_MANY_KEY_VALUE, __FUNCTION__,
+				      "batch count exceed LimitEmptyNodes/10");
+			return DTC_CODE_BUFFER_ERROR;
+		}
+	}
+
+	if (command == DRequest::Get)
+		return buffer_batch_get_data(job);
+
+	// unknown command is treated as OK, fallback to split mode
+	return DTC_CODE_BUFFER_SUCCESS;
+}
+
+/*
+ * Function    : reply_connector_answer
+ * Description : Handle a job that came back from the helper and bring the
+ *               cache in line with it, dispatching on the request code.
+ * Input       : job - the answered request
+ * Output      : job - errors and results are recorded on the job
+ * Return      : a BufferResult code: DTC_CODE_BUFFER_ERROR for commands that
+ *               are not accepted here, otherwise DTC_CODE_BUFFER_SUCCESS or
+ *               the result of the delegated buffer_* handler.
+ */
+BufferResult BufferProcessAskChain::reply_connector_answer(DTCJobOperation &job)
+{
+	job.renew_timestamp();
+	error_message_[0] = '\0';
+	int iLimit = 0;
+
+	int iCmd = job.request_code();
+	switch (iCmd) {
+	// must have been a cache miss: replace the whole result into the cache
+	case DRequest::Get:
+		if (job.flag_pass_thru()) {
+			if (job.result)
+				job.pass_all_result(job.result);
+			return DTC_CODE_BUFFER_SUCCESS;
+		}
+
+		// ATTN: if failed, node always purged
+		if (job.result &&
+		    ((node_size_limit_ > 0 &&
+		      job.result->data_len() >= node_size_limit_) ||
+		     (node_rows_limit_ > 0 &&
+		      job.result->total_rows() >= node_rows_limit_))) {
+			log4cplus_error(
+				"key[%d] rows[%d] size[%d] exceed limit",
+				job.int_key(), job.result->total_rows(),
+				job.result->data_len());
+			iLimit = 1;
+		}
+		// don't add empty node if job back from blackhole
+		if (!iLimit && !job.flag_black_hole())
+			buffer_replace_result(job);
+
+		return buffer_get_rb(job);
+		// without a read-back this must be multi-row: the new data is
+		// appended after the existing data
+	case DRequest::Insert:
+		if (job.flag_black_hole())
+			return buffer_nodb_insert(job);
+		if (insert_order_ == INSERT_ORDER_PURGE) {
+			buffer_purge_data(job);
+			return DTC_CODE_BUFFER_SUCCESS;
+		}
+		return buffer_sync_insert(job);
+	case DRequest::Update:
+		if (job.flag_black_hole())
+			return buffer_nodb_update(job);
+
+		if (insert_order_ == INSERT_ORDER_PURGE &&
+		    job.resultInfo.affected_rows() > 0) {
+			buffer_purge_data(job);
+			return DTC_CODE_BUFFER_SUCCESS;
+		}
+		return buffer_sync_update(job);
+	case DRequest::Delete:
+		if (job.flag_black_hole())
+			return buffer_nodb_delete(job);
+		return buffer_sync_delete(job);
+	case DRequest::Replace:
+		if (job.flag_black_hole())
+			return buffer_nodb_replace(job);
+		return buffer_sync_replace(job);
+	case DRequest::TYPE_SYSTEM_COMMAND:
+		if (job.requestInfo.admin_code() ==
+		    DRequest::SystemCommand::Migrate) {
+			// NOTE(review): assumes a Migrate reply always carries a
+			// condition whose field 0 is the key - neither pointer is
+			// checked here; confirm against the sender.
+			const DTCFieldValue *condition =
+				job.request_condition();
+			const DTCValue *key = condition->field_value(0);
+			Node node =
+				cache_.cache_find_auto_chose_hash(key->bin.ptr);
+			int rows = cache_.node_rows_count(node);
+			log4cplus_debug("migrate replay ,row %d", rows);
+			// the key has moved away: take its rows out of the
+			// statistics and drop the node together with its data
+			cache_.inc_total_row(0LL - rows);
+			cache_.purge_node_and_data(key->bin.ptr, node);
+			log4cplus_debug("should purgenode everything");
+			g_key_route_ask_instance->key_migrated(key->bin.ptr);
+			delete (job.request_operation());
+			job.set_request_operation(NULL);
+			return DTC_CODE_BUFFER_SUCCESS;
+		}
+		if (job.requestInfo.admin_code() ==
+			    DRequest::SystemCommand::MigrateDB ||
+		    job.requestInfo.admin_code() ==
+			    DRequest::SystemCommand::MigrateDBSwitch) {
+			return DTC_CODE_BUFFER_SUCCESS;
+		}
+		// Any other admin code is not expected back from the helper.
+		// Return right here: without this return control used to fall
+		// through into the Replicate case and run replicate handling on
+		// a command that had just been rejected.
+		job.set_error(-EC_BAD_COMMAND, CACHE_SVC,
+			      "invalid cmd from helper");
+		return DTC_CODE_BUFFER_ERROR;
+	case DRequest::Replicate:
+		// handle master/slave synchronisation
+		return buffer_process_replicate(job);
+	default:
+		job.set_error(-EC_BAD_COMMAND, CACHE_SVC,
+			      "invalid cmd from helper");
+	}
+
+	return DTC_CODE_BUFFER_ERROR;
+}
+
+BufferResult
+BufferProcessAskChain::deal_single_cache_only_ask(DTCJobOperation &job)
+{
+	/*
+	 * Dispatch one incoming (non-batch) job by its request code;
+	 * job_ask_procedure() calls this when dtc_mode_ is DTC_MODE_CACHE_ONLY.
+	 * Returns DTC_CODE_BUFFER_ERROR for rejected or unknown commands,
+	 * DTC_CODE_BUFFER_SUCCESS for Flush, otherwise the result of the
+	 * delegated buffer_* handler.
+	 *
+	 * nodb mode always blackhole-d
+	 */
+	job.mark_as_black_hole();
+	job.renew_timestamp();
+	error_message_[0] = 0;
+	// fetch the command code
+	int iCmd = job.request_code();
+	switch (iCmd) {
+	case DRequest::Get:
+		if (job.count_only() && (job.requestInfo.limit_start() ||
+					 job.requestInfo.limit_count())) {
+			job.set_error(
+				-EC_BAD_COMMAND, CACHE_SVC,
+				"There's nothing to limit because no fields required");
+			return DTC_CODE_BUFFER_ERROR;
+		}
+		++stat_get_count_;
+		job.set_result_hit_flag(HIT_INIT);
+		return buffer_get_data(job);
+	case DRequest::Insert:
+		++stat_insert_count_;
+		return buffer_nodb_insert(job);
+	case DRequest::Update:
+		++stat_update_count_;
+		return buffer_nodb_update(job);
+	case DRequest::Delete:
+		++stat_delete_count_;
+		return buffer_nodb_delete(job);
+	case DRequest::Purge:
+		// remove the cached data of the given key
+		++stat_purge_count_;
+		return buffer_purge_data(job);
+	case DRequest::Flush:
+		return DTC_CODE_BUFFER_SUCCESS;
+		// (original note) evicted data is not processed
+	case DRequest::Replace:
+		++stat_update_count_;
+		// replace is only allowed when the key fields are part of the unique fields
+		if (!(job.table_definition()->key_part_of_uniq_field()) ||
+		    job.table_definition()->has_auto_increment()) {
+			job.set_error(
+				-EC_BAD_COMMAND, CACHE_SVC,
+				"replace cmd require key fields part of uniq-fields and no auto-increment field");
+			return DTC_CODE_BUFFER_ERROR;
+		}
+		return buffer_nodb_replace(job);
+	case DRequest::TYPE_SYSTEM_COMMAND:
+		return buffer_process_admin(job);
+	default:
+		job.set_error(-EC_BAD_COMMAND, CACHE_SVC,
+			      "invalid cmd from client");
+		log4cplus_info("invalid cmd[%d] from client", iCmd);
+		break;
+	}
+	return DTC_CODE_BUFFER_ERROR;
+}
+
+/*
+ * When the DTC back end is a single-node embedded persistent engine such as
+ * RocksDB, master/slave synchronisation has to pull the full data set from
+ * the storage side. This function takes the storage engine's answer and
+ * turns it into the reply for the hot-backup synchronisation peer.
+ * Note: the current cache is not modified in any way.
+ *
+ * An answer without rows marks the end of the full synchronisation stage and
+ * is reported as -EC_FULL_SYNC_COMPLETE with DTC_CODE_BUFFER_ERROR.
+ */
+BufferResult
+BufferProcessAskChain::buffer_process_replicate(DTCJobOperation &job)
+{
+	log4cplus_info("do cache process replicate start!");
+	// switch back the tabledef
+	job.set_request_code(DRequest::TYPE_SYSTEM_COMMAND);
+
+	// zero rows from the database means the full sync is over
+	if ((job.result == NULL || job.result->total_rows() == 0)) {
+		log4cplus_info("full replicate stage finished! key:[%u]",
+			       job.int_key());
+		job.set_table_definition(job.get_replicate_table());
+		job.set_error(-EC_FULL_SYNC_COMPLETE,
+			      "buffer_process_replicate",
+			      "full sync finished!");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	// build the reply
+	RowValue reply_row(job.get_replicate_table());
+	RawData scratch(&g_stSysMalloc, 1);
+	job.prepare_result_no_limit();
+
+	ResultSet *const fetched_rows = job.result;
+	if (fetched_rows != NULL) {
+		for (int idx = 0; idx < fetched_rows->total_rows(); ++idx) {
+			RowValue *current = fetched_rows->_fetch_row();
+			if (current != NULL) {
+				// set the key
+				job.set_request_key(current->field_value(0));
+				job.build_packed_key();
+				reply_row[2] = (*current)[0];
+				// only bring back the key list
+				job.append_row(&reply_row);
+				scratch.destory();
+			} else {
+				log4cplus_info("%s!",
+					       "call FetchRow func error");
+				scratch.destory();
+				// hotback can not handle error exception now, just continue
+				log4cplus_error(
+					"replicate: get data from storage failed!");
+			}
+		}
+	}
+
+	log4cplus_info("do cache process replicate finished! ");
+	job.set_table_definition(job.get_replicate_table());
+	return DTC_CODE_BUFFER_SUCCESS;
+}
+
+/*
+ * Handle the answer to a flush request: a Replace answer needs no further
+ * work and is reported as success; any other command is rejected with
+ * -EC_BAD_COMMAND.
+ */
+BufferResult BufferProcessAskChain::reply_flush_answer(DTCJobOperation &job)
+{
+	error_message_[0] = '\0';
+
+	const int command = job.request_code();
+	if (command == DRequest::Replace)
+		return DTC_CODE_BUFFER_SUCCESS;
+
+	job.set_error(-EC_BAD_COMMAND, CACHE_SVC, "invalid cmd from helper");
+	return DTC_CODE_BUFFER_ERROR;
+}
+
+/*
+ * React to a job that came back with a negative result code.
+ *
+ * In lossy mode a duplicate-entry answer to an Insert or Update is not
+ * trusted: the timestamp is renewed and the change is applied to the cache
+ * through buffer_sync_insert() / buffer_sync_update(), whose result is
+ * returned. For results whose outcome is unknown (-EC_UPSTREAM_ERROR,
+ * -CR_SERVER_LOST, and for Update also a duplicate entry outside lossy mode)
+ * the cached data is purged when the update mode is MODE_SYNC. Every other
+ * path returns DTC_CODE_BUFFER_ERROR.
+ */
+BufferResult BufferProcessAskChain::deal_flush_exeption(DTCJobOperation &job)
+{
+	error_message_[0] = '\0';
+
+	const int command = job.request_code();
+
+	if (command == DRequest::Insert && lossy_mode_ == true &&
+	    job.result_code() == -ER_DUP_ENTRY) {
+		// upstream is un-trusted
+		job.renew_timestamp();
+		return buffer_sync_insert(job);
+	}
+
+	if (command == DRequest::Insert || command == DRequest::Delete) {
+		const int failure = job.result_code();
+		if (failure == -EC_UPSTREAM_ERROR ||
+		    failure == -CR_SERVER_LOST) {
+			if (update_mode_ == MODE_SYNC) {
+				log4cplus_info(
+					"SQL do_execute result unknown, purge data");
+				buffer_purge_data(job);
+			} else {
+				log4cplus_error(
+					"SQL do_execute result unknown, data may be corrupted");
+			}
+		}
+	} else if (command == DRequest::Update) {
+		const int failure = job.result_code();
+		if (failure == -ER_DUP_ENTRY && lossy_mode_ == true) {
+			// upstream is un-trusted
+			job.renew_timestamp();
+			return buffer_sync_update(job);
+		}
+		const bool outcome_unknown = (failure == -ER_DUP_ENTRY ||
+					      failure == -EC_UPSTREAM_ERROR ||
+					      failure == -CR_SERVER_LOST);
+		// otherwise it must be a cache miss
+		if (outcome_unknown && update_mode_ == MODE_SYNC) {
+			log4cplus_info(
+				"SQL do_execute result unknown, purge data");
+			buffer_purge_data(job);
+		}
+	}
+
+	return DTC_CODE_BUFFER_ERROR;
+}
+
+/*
+ * Check whether the node held in cache_transaction_node has expired and, if
+ * so, purge it.
+ *
+ * Returns
+ *   DTC_CODE_BUFFER_ERROR           - the expire time could not be read; the
+ *                                     error is recorded on the job
+ *   DTC_CODE_BUFFER_SUCCESS         - the node had expired and was purged
+ *                                     (for a Get the job additionally gets
+ *                                     an empty result prepared)
+ *   DTC_CODE_BUFFER_GOTO_NEXT_CHAIN - the node is still valid
+ *
+ * An expire time of 0 is treated as "never expires".
+ */
+BufferResult BufferProcessAskChain::check_and_expire(DTCJobOperation &job)
+{
+	// start from 0 ("never expires") so the value is defined even if
+	// get_expire_time() reports success without writing it
+	uint32_t expire = 0;
+	int iRet = data_process_->get_expire_time(
+		job.table_definition(), &cache_transaction_node, expire);
+	if (iRet != 0) {
+		log4cplus_error("get_expire_time failed");
+		job.set_error_dup(-EIO, CACHE_SVC,
+				  data_process_->get_err_msg());
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	if (expire == 0)
+		return DTC_CODE_BUFFER_GOTO_NEXT_CHAIN;
+
+	// the clock is only consulted for nodes that carry an expire time
+	const uint32_t now = (uint32_t)time(NULL);
+	if (expire > now)
+		return DTC_CODE_BUFFER_GOTO_NEXT_CHAIN;
+
+	// expired
+	++stat_expire_count_;
+	// expire and now are unsigned, so they are printed with %u
+	log4cplus_debug(
+		"key: %u expired, purge current key when update, expire time: %u, current time: %u",
+		job.int_key(), expire, now);
+	if (job.request_code() == DRequest::Get) {
+		job.prepare_result();
+		job.set_total_rows(0);
+	}
+	// the purged rows no longer count towards the total
+	cache_.inc_total_row(
+		0LL - cache_.node_rows_count(cache_transaction_node));
+	if (cache_.cache_purge(key) != 0)
+		log4cplus_error("PANIC: purge node[id=%u] fail",
+				cache_transaction_node.node_id());
+	return DTC_CODE_BUFFER_SUCCESS;
+}
+
+/*
+ * Entry point for a job arriving on the ask path.
+ *
+ * An already expired job is answered with -EC_TASK_TIMEOUT at once. Otherwise
+ * the job is processed inside transaction_begin() / transaction_end():
+ *   - a job that already carries a negative result code is answered as is;
+ *   - a batch request goes through deal_batch_database_addition_ask();
+ *   - a single request goes through deal_single_database_addition_ask() or
+ *     deal_single_cache_only_ask(), depending on dtc_mode_.
+ * Depending on the handler's verdict the job is answered, handed to
+ * main_chain or remote_chain, or left pending. Finally the cache is told to
+ * run its delayed purge.
+ */
+void BufferProcessAskChain::job_ask_procedure(DTCJobOperation *job_operation)
+{
+	log4cplus_debug("enter job_ask_procedure");
+	table_define_infomation_ =
+		TableDefinitionManager::instance()->get_cur_table_def();
+
+	const uint64_t current_time = GET_TIMESTAMP() / 1000;
+	if (job_operation->is_expired(current_time)) {
+		log4cplus_debug(
+			"job time out, throw it for availability, now is [%lld] expire is [%lld]",
+			(long long)current_time,
+			(long long)job_operation->get_expire_time());
+		stat_buffer_process_expire_count_++;
+		job_operation->set_error(-EC_TASK_TIMEOUT,
+					 "buffer_process_unit", "job time out");
+		job_operation->turn_around_job_answer();
+		return;
+	}
+
+	transaction_begin(job_operation);
+
+	if (job_operation->result_code() < 0) {
+		/* mark as hit if result done */
+		job_operation->mark_as_hit();
+		job_operation->turn_around_job_answer();
+	} else if (job_operation->is_batch_request()) {
+		const BufferResult batch_verdict =
+			deal_batch_database_addition_ask(*job_operation);
+		// Anything but success ends as a server error; an explicit
+		// ERROR verdict keeps an error code the handler already set.
+		if (batch_verdict != DTC_CODE_BUFFER_SUCCESS &&
+		    (batch_verdict != DTC_CODE_BUFFER_ERROR ||
+		     job_operation->result_code() >= 0))
+			job_operation->set_error(-EC_SERVER_ERROR,
+						 "buffer_process_unit",
+						 last_error_message());
+		/* mark as hit if result done */
+		job_operation->mark_as_hit();
+		job_operation->turn_around_job_answer();
+	} else if (dtc_mode_ == DTC_MODE_DATABASE_ADDITION) {
+		const BufferResult verdict =
+			deal_single_database_addition_ask(*job_operation);
+		switch (verdict) {
+		case DTC_CODE_BUFFER_GOTO_NEXT_CHAIN:
+			log4cplus_debug("push job to next-unit");
+			job_operation->push_reply_dispatcher(&cache_reply_);
+
+			main_chain.job_ask_procedure(job_operation);
+			break;
+		case DTC_CODE_BUFFER_TO_REMOTE_TARGET:
+			// migrate command, to remote dtc target
+			job_operation->push_reply_dispatcher(&cache_reply_);
+			remote_chain.job_ask_procedure(job_operation);
+			break;
+		case DTC_CODE_BUFFER_UNFINISHED:
+			break;
+		case DTC_CODE_BUFFER_TO_HOTBACKUP_TARGET:
+			log4cplus_debug("push job to hotback-up thread");
+			break;
+		default:
+			if (!job_operation->flag_black_hole()) {
+				// add to black list.
+				const unsigned blacklist_size =
+					job_operation->pop_black_list_size();
+				if (blacklist_size > 0) {
+					log4cplus_debug(
+						"add to blacklist, key=%d size=%u",
+						job_operation->int_key(),
+						blacklist_size);
+					black_list_->add_blacklist(
+						job_operation->packed_key(),
+						blacklist_size);
+				}
+			}
+			/* FALLTHROUGH */
+		case DTC_CODE_BUFFER_ERROR:
+			if (job_operation->result_code() >= 0)
+				job_operation->set_error(-EC_SERVER_ERROR,
+							 "buffer_process",
+							 last_error_message());
+			/* FALLTHROUGH */
+		case DTC_CODE_BUFFER_SUCCESS:
+			/* mark as hit if result done */
+			job_operation->mark_as_hit();
+			job_operation->turn_around_job_answer();
+			break;
+		}
+	} else if (dtc_mode_ == DTC_MODE_CACHE_ONLY) {
+		const BufferResult verdict =
+			deal_single_cache_only_ask(*job_operation);
+		switch (verdict) {
+		case DTC_CODE_BUFFER_TO_REMOTE_TARGET:
+			// migrate command, to remote dtc target
+			job_operation->push_reply_dispatcher(&cache_reply_);
+			remote_chain.job_ask_procedure(job_operation);
+			break;
+		case DTC_CODE_BUFFER_UNFINISHED:
+			break;
+		case DTC_CODE_BUFFER_TO_HOTBACKUP_TARGET:
+			log4cplus_debug("push job to hotback thread");
+			break;
+		default:
+		case DTC_CODE_BUFFER_ERROR:
+			if (job_operation->result_code() >= 0)
+				job_operation->set_error(-EC_SERVER_ERROR,
+							 "buffer_process_unit",
+							 last_error_message());
+			/* FALLTHROUGH */
+		case DTC_CODE_BUFFER_GOTO_NEXT_CHAIN:
+		case DTC_CODE_BUFFER_SUCCESS:
+			/* mark as hit if result done */
+			job_operation->mark_as_hit();
+			job_operation->turn_around_job_answer();
+			break;
+		}
+	} else {
+		log4cplus_error("dtc mode error: %d", dtc_mode_);
+	}
+
+	transaction_end();
+
+	// delay purge.
+	cache_.delay_purge_notify();
+	log4cplus_debug("leave job_ask_procedure");
+}
+
+/*
+ * Entry point for a job coming back on the answer path.
+ *
+ * The reply to a helper reload-config job is simply deleted. For every other
+ * job, inside transaction_begin() / transaction_end():
+ *   - a negative result code is handed to deal_flush_exeption();
+ *   - a positive one is only logged;
+ *   - if the result code is non-negative afterwards, the cache is updated
+ *     through reply_connector_answer(), and a failure there that left no
+ *     error on the job is reported as -EC_SERVER_ERROR;
+ *   - unless the job is a black-hole job, a pending blacklist size is moved
+ *     into the blacklist.
+ * The job is then answered and the cache is told to run its delayed purge.
+ */
+void BufferProcessAskChain::job_answer_procedure(DTCJobOperation *job_operation)
+{
+	const bool reload_config_reply =
+		DRequest::ReloadConfig == job_operation->request_code() &&
+		TaskTypeHelperReloadConfig == job_operation->request_type();
+	if (reload_config_reply) {
+		/* delete job only */
+		log4cplus_debug("reload config job reply ,just delete job");
+		delete job_operation;
+		return;
+	}
+
+	transaction_begin(job_operation);
+
+	const int upstream_code = job_operation->result_code();
+	if (upstream_code < 0) {
+		deal_flush_exeption(*job_operation);
+	} else if (upstream_code > 0) {
+		log4cplus_info("result_code() > 0: from %s msg %s",
+			       job_operation->resultInfo.error_from(),
+			       job_operation->resultInfo.error_message());
+	}
+
+	// the result code is read again: the exception handling above may
+	// have changed it
+	if (job_operation->result_code() >= 0) {
+		const BufferResult reply_verdict =
+			reply_connector_answer(*job_operation);
+		if (reply_verdict != DTC_CODE_BUFFER_SUCCESS &&
+		    job_operation->result_code() >= 0)
+			job_operation->set_error(-EC_SERVER_ERROR,
+						 "reply_connector_answer",
+						 last_error_message());
+	}
+
+	if (!job_operation->flag_black_hole()) {
+		// add to black list.
+		const unsigned blacklist_size =
+			job_operation->pop_black_list_size();
+		if (blacklist_size > 0) {
+			log4cplus_debug("add to blacklist, key=%d size=%u",
+					job_operation->int_key(),
+					blacklist_size);
+			black_list_->add_blacklist(job_operation->packed_key(),
+						   blacklist_size);
+		}
+	}
+
+	job_operation->turn_around_job_answer();
+
+	transaction_end();
+
+	// delay purge.
+	cache_.delay_purge_notify();
+}
+
+/*
+ * Return the current time rounded down to a multiple of marker_interval_.
+ */
+MARKER_STAMP BufferProcessAskChain::calculate_current_marker()
+{
+	const time_t current = time(NULL);
+
+	return current - (current % marker_interval_);
+}
+
+/*
+ * Currently a no-op: forwarding the drop count to the cache
+ * (Cache.set_drop_count(c)) is disabled.
+ */
+void BufferProcessAskChain::set_drop_count(int c)
+{
+	(void)c;
+}
+
+/*
+ * Compares the allocator's user-allocated size with 90% of its total size.
+ *
+ * The estimation of the maximum node / row counts that used to hang off
+ * these comparisons is disabled, so at present the function only queries
+ * PtMalloc and stores nothing.
+ */
+void BufferProcessAskChain::get_dirty_stat()
+{
+	const double rate = 0.9;
+
+	const bool above_rate = PtMalloc::instance()->user_alloc_size() >=
+				PtMalloc::instance()->total_size() * rate;
+	if (above_rate) {
+		// disabled: take the used node / row totals as the maximum
+		return;
+	}
+
+	const bool has_user_alloc =
+		PtMalloc::instance()->user_alloc_size() > 0;
+	// disabled: scale the used node / row totals by
+	// total_size * rate / user_alloc_size when something is allocated,
+	// otherwise report zero
+	(void)has_user_alloc;
+}
+
+/*
+ * Configure the dirty-node flush machinery.
+ *
+ * intvl   - ignored: the marker interval is forced to 1 second
+ * mreq    - maximum number of flush requests, clamped to [1, 10000]
+ * mintime - minimum dirty time, raised to at least 10
+ * maxtime - maximum dirty time, set to 2 * mintime when it is not
+ *           greater than mintime
+ *
+ * Does nothing when the cache version is below 4. The flush timer is
+ * attached in async mode, or in sync mode while memory is dirty.
+ */
+void BufferProcessAskChain::set_flush_parameter(int intvl, int mreq,
+						int mintime, int maxtime)
+{
+	/* requires a v4 cache */
+	if (cache_.get_cache_info()->version < 4)
+		return;
+
+	/* the former clamp of intvl to [60, 43200] is disabled:
+	 * the marker time interval was changed to 1 second */
+	intvl = 1;
+	marker_interval_ = intvl;
+
+	/* 1. make sure at least one time marker exists
+	 * 2. init first marker time and last marker time */
+	Node first_marker = cache_.first_time_marker();
+	if (!first_marker)
+		cache_.insert_time_marker(calculate_current_marker());
+	cache_.first_time_marker_time();
+	cache_.last_time_marker_time();
+
+	if (mreq <= 0)
+		mreq = 1;
+	else if (mreq > 10000)
+		mreq = 10000;
+
+	if (mintime < 10)
+		mintime = 10;
+	if (maxtime <= mintime)
+		maxtime = mintime * 2;
+
+	max_flush_request_ = mreq;
+	min_dirty_time_ = mintime;
+	max_dirty_time_ = maxtime;
+
+	/* attach the timer only in async mode, or in sync mode with dirty
+	 * memory */
+	const bool need_timer =
+		update_mode_ == MODE_ASYNC ||
+		(update_mode_ == MODE_SYNC && memory_dirty_ == true);
+	if (need_timer) {
+		/* check for expired dirty nodes every second */
+		flush_timer_ = owner->get_timer_list(1);
+		attach_timer(flush_timer_);
+	}
+}
+
+/*
+ * Commit a flush request and remember the job waiting on it.
+ *
+ * A request with numReq == 0 is deleted immediately; otherwise the count
+ * of pending flush requests is incremented. Always returns 0.
+ */
+int BufferProcessAskChain::commit_flush_request(DTCFlushRequest *req,
+						DTCJobOperation *callbackTask)
+{
+	req->wait = callbackTask;
+
+	const bool nothing_to_flush = (req->numReq == 0);
+	if (nothing_to_flush) {
+		delete req;
+	} else {
+		current_pend_flush_request_++;
+	}
+
+	stat_currentFlush_request_ = current_pend_flush_request_;
+	return 0;
+}
+
+/*
+ * Finish a flush request: delete it, decrement the pending count,
+ * recompute the flush speed and start another flush while the pending
+ * count is below the allowed number.
+ */
+void BufferProcessAskChain::complete_flush_request(DTCFlushRequest *req)
+{
+	delete req;
+
+	current_pend_flush_request_--;
+	stat_currentFlush_request_ = current_pend_flush_request_;
+
+	/* 0: not called from the flush timer */
+	calculate_flush_speed(0);
+
+	const bool has_capacity =
+		current_pend_flush_request_ < pend_flush_request_;
+	if (has_capacity)
+		flush_next_node();
+}
+
+/*
+ * Flush timer callback: insert a time marker for the current interval if
+ * needed, recompute the flush speed, issue flush requests up to the
+ * allowed number, and re-attach the timer when flushing must continue.
+ */
+void BufferProcessAskChain::job_timer_procedure(void)
+{
+	log4cplus_debug("enter timer procedure");
+
+	const MARKER_STAMP current_marker = calculate_current_marker();
+	if (cache_.first_time_marker_time() != current_marker)
+		cache_.insert_time_marker(current_marker);
+
+	calculate_flush_speed(1);
+
+	/* flush_next_node() returns 0 when one flush request was created;
+	 * keep going in that case. Any other value means no request was
+	 * created, so stop and wait for the next timer event instead of
+	 * spinning here (e.g. no dirty node exists in async mode). */
+	while (current_pend_flush_request_ < pend_flush_request_) {
+		if (flush_next_node() != 0)
+			break;
+	}
+
+	/* sync mode with dirty memory, or async mode: re-attach the timer */
+	const bool sync_still_dirty =
+		(update_mode_ == MODE_SYNC && memory_dirty_ == true);
+	if (sync_still_dirty || update_mode_ == MODE_ASYNC)
+		attach_timer(flush_timer_);
+
+	log4cplus_debug("leave timer procedure");
+}
+
+/*
+ * Tell whether a real dirty node sits at the tail of the dirty LRU list.
+ *
+ * One time marker at the tail is skipped. Returns 1 when the node found
+ * there is neither a time marker nor the list head, 0 otherwise.
+ */
+int BufferProcessAskChain::oldest_dirty_node_alarm()
+{
+	Node head = cache_.dirty_lru_head();
+	Node candidate = head.Prev();
+
+	if (cache_.is_time_marker(candidate)) {
+		/* look one node past the tail marker */
+		candidate = candidate.Prev();
+		if (cache_.is_time_marker(candidate))
+			return 0;
+	}
+
+	if (candidate == head)
+		return 0;
+	return 1;
+}
+
+/*
+ * Recompute pend_flush_request_, the allowed number of flush requests.
+ *
+ * Sync mode: 0 when memory is clean, max_flush_request_ when it is dirty.
+ * Otherwise the value depends only on the span between the first and the
+ * last time marker:
+ *   - span >= max_dirty_time_: flush at full speed (and log when called
+ *     from the flush timer and a dirty node is at the list tail);
+ *   - span >= min_dirty_time_: scale linearly between 1 and
+ *     max_flush_request_, never lowering the current value;
+ *   - below min_dirty_time_: 0.
+ *
+ * is_flush_timer - non-zero when called from the flush timer
+ */
+void BufferProcessAskChain::calculate_flush_speed(int is_flush_timer)
+{
+	delete_tail_time_markers();
+
+	/* both marker times are initialised, so no extra test is needed */
+	const unsigned int first_marker_time = cache_.first_time_marker_time();
+	const unsigned int last_marker_time = cache_.last_time_marker_time();
+	const int dirty_span = first_marker_time - last_marker_time;
+
+	if (update_mode_ == MODE_SYNC) {
+		/* started in sync mode with dirty memory: flush as fast as
+		 * possible; clean memory needs no flushing */
+		if (memory_dirty_ == false)
+			pend_flush_request_ = 0;
+		else
+			pend_flush_request_ = max_flush_request_;
+	} else if (dirty_span >= max_dirty_time_) {
+		pend_flush_request_ = max_flush_request_;
+		if (oldest_dirty_node_alarm() && is_flush_timer) {
+			log4cplus_info(
+				"oldest dirty node exist time > max dirty time");
+		}
+	} else if (dirty_span >= min_dirty_time_) {
+		const int scaled =
+			1 + (dirty_span - min_dirty_time_) *
+					(max_flush_request_ - 1) /
+					(max_dirty_time_ - min_dirty_time_);
+		if (scaled > pend_flush_request_)
+			pend_flush_request_ = scaled;
+	} else {
+		pend_flush_request_ = 0;
+	}
+
+	if (pend_flush_request_ > max_flush_request_)
+		pend_flush_request_ = max_flush_request_;
+
+	stat_maxflush_request_ = pend_flush_request_;
+	stat_oldestdirty_time_ = dirty_span;
+}
+
+/* Walk the dirty LRU list backwards from its head and flush one node.
+ *
+ * Time markers met on the way are removed from the list, except the one
+ * whose time equals cache_.first_time_marker_time().
+ *
+ * return -1: reached the time marker equal to the first time marker
+ *            (and not in sync mode with dirty memory)
+ * return  1: sync mode with dirty memory: nothing left to flush, timer
+ *            disabled and memory_dirty_ cleared
+ * return  2: reached the list head in any other mode
+ * return -2: buffer_flush_data_timer() returned -1, -2, -3 or 1,
+ *            i.e. no flush request was created
+ * return  0: one flush request created
+ * */
+int BufferProcessAskChain::flush_next_node(void)
+{
+	unsigned int affected_count = 0;
+	MARKER_STAMP stamp;
+	static MARKER_STAMP last_rm_stamp; // largest stamp of a removed marker; not read for any decision here
+
+	Node stHead = cache_.dirty_lru_head();
+	Node stNode = stHead;
+	Node stPreNode = stNode.Prev();
+
+	/*case 1: delete consecutive time markers until
+     *        a normal node or the head node is met (go on below), or
+     *        the marker equal to the first time marker is met*/
+	while (1) {
+		stNode = stPreNode;
+		stPreNode = stNode.Prev(); // fetched before stNode may be removed below
+
+		if (!cache_.is_time_marker(stNode))
+			break;
+
+		if (cache_.first_time_marker_time() == stNode.Time()) {
+			if (update_mode_ == MODE_SYNC &&
+			    memory_dirty_ == true) {
+				/* delete this time marker, then flush every node up to the head */
+				cache_.remove_time_marker(stNode);
+				stNode = stPreNode;
+				stPreNode = stNode.Prev();
+				while (stNode != stHead) {
+					buffer_flush_data_timer(stNode,
+								affected_count);
+					stNode = stPreNode;
+					stPreNode = stNode.Prev();
+				}
+
+				disable_timer();
+				memory_dirty_ = false;
+				log4cplus_info("mem clean now for sync cache");
+				return 1;
+			}
+			return -1;
+		}
+
+		stamp = stNode.Time();
+		if (stamp > last_rm_stamp) {
+			last_rm_stamp = stamp;
+		}
+
+		log4cplus_debug("remove time marker in dirty lru, time %u",
+				stNode.Time());
+		cache_.remove_time_marker(stNode);
+	}
+
+	/*case 2: this is the head node; clear the memory-dirty state if necessary and return (marked "should not happen" by the original author)*/
+	if (stNode == stHead) {
+		if (update_mode_ == MODE_SYNC && memory_dirty_ == true) {
+			disable_timer();
+			memory_dirty_ = false;
+			log4cplus_info("mem clean now for sync cache");
+			return 1;
+		} else {
+			return 2;
+		}
+	}
+
+	/*case 3: this is a normal node, flush it.
+     * 	  return -2 if no flush request was added to the cache process
+     * */
+	int iRet = buffer_flush_data_timer(stNode, affected_count);
+	if (iRet == -1 || iRet == -2 || iRet == -3 || iRet == 1) {
+		return -2;
+	}
+
+	return 0;
+}
+
+/*
+ * Remove redundant time markers at the tail of the dirty LRU list.
+ *
+ * Walking backwards from the head, a marker is removed while the node
+ * before it is a marker too. The walk stops at the head, at a node whose
+ * time equals the first time marker time, or at the first node that does
+ * not satisfy the marker/marker condition.
+ */
+void BufferProcessAskChain::delete_tail_time_markers()
+{
+	Node head = cache_.dirty_lru_head();
+	Node current = head.Prev();
+
+	for (;;) {
+		/* fetched first: current may be removed further down */
+		Node previous = current.Prev();
+
+		if (current == head)
+			break;
+		if (cache_.first_time_marker_time() == current.Time())
+			break;
+		if (!cache_.is_time_marker(current) ||
+		    !cache_.is_time_marker(previous))
+			break;
+
+		cache_.remove_time_marker(current);
+		current = previous;
+	}
+}
+
+/*
+ * Dispatch an administrative command to its handler.
+ *
+ * QueryServerInfo, LogoutHB and GetUpdateKey are rejected with -EBADRQC
+ * while the hot-backup feature object does not exist. MigrateDB and
+ * MigrateDBSwitch are refused when update_mode() or is_mem_dirty() is
+ * set, otherwise they are passed to the next chain. An unknown command
+ * code sets -EBADRQC on the job.
+ */
+BufferResult BufferProcessAskChain::buffer_process_admin(DTCJobOperation &Job)
+{
+	const int admin_code = Job.requestInfo.admin_code();
+
+	log4cplus_debug("BufferProcess::buffer_process_admin admin_code is %d ",
+			admin_code);
+
+	/* commands that need an existing hot-backup feature */
+	switch (admin_code) {
+	case DRequest::SystemCommand::QueryServerInfo:
+	case DRequest::SystemCommand::LogoutHB:
+	case DRequest::SystemCommand::GetUpdateKey:
+		if (hotbackup_lru_feature_ == NULL) {
+			/* hot-backup has not been started yet */
+			Job.set_error(-EBADRQC, CACHE_SVC,
+				      "hot-backup not active yet");
+			return DTC_CODE_BUFFER_ERROR;
+		}
+		break;
+	default:
+		break;
+	}
+
+	switch (admin_code) {
+	case DRequest::SystemCommand::QueryServerInfo:
+		return buffer_query_serverinfo(Job);
+	case DRequest::SystemCommand::RegisterHB:
+		return buffer_register_hb(Job);
+	case DRequest::SystemCommand::LogoutHB:
+		return buffer_logout_hb(Job);
+	case DRequest::SystemCommand::GetKeyList:
+		return buffer_get_key_list(Job);
+	case DRequest::SystemCommand::GetUpdateKey:
+		return buffer_get_update_key(Job);
+	case DRequest::SystemCommand::GetRawData:
+		return buffer_get_raw_data(Job);
+	case DRequest::SystemCommand::ReplaceRawData:
+		return buffer_replace_raw_data(Job);
+	case DRequest::SystemCommand::AdjustLRU:
+		return buffer_adjust_lru(Job);
+	case DRequest::SystemCommand::VerifyHBT:
+		return buffer_verify_hbt(Job);
+	case DRequest::SystemCommand::GetHBTime:
+		return buffer_get_hbt(Job);
+	case DRequest::SystemCommand::kNodeHandleChange:
+		return buffer_nodehandlechange(Job);
+	case DRequest::SystemCommand::Migrate:
+		return buffer_migrate(Job);
+	case DRequest::SystemCommand::ClearCache:
+		return buffer_clear_cache(Job);
+
+	case DRequest::SystemCommand::MigrateDB:
+	case DRequest::SystemCommand::MigrateDBSwitch:
+		if (update_mode() || is_mem_dirty()) {
+			log4cplus_error("try to migrate when cache is async");
+			Job.set_error(-EC_SERVER_ERROR, "cache process",
+				      "try to migrate when cache is async");
+			return DTC_CODE_BUFFER_ERROR;
+		}
+		return DTC_CODE_BUFFER_GOTO_NEXT_CHAIN;
+
+	case DRequest::SystemCommand::ColExpandStatus:
+		return buffer_check_expand_status(Job);
+	case DRequest::SystemCommand::col_expand:
+		return buffer_column_expand(Job);
+	case DRequest::SystemCommand::ColExpandDone:
+		return buffer_column_expand_done(Job);
+	case DRequest::SystemCommand::ColExpandKey:
+		return buffer_column_expand_key(Job);
+
+	default:
+		break;
+	}
+
+	/* unknown command.
+	 * NOTE(review): the error is stored on the job but the return value
+	 * is still SUCCESS - confirm that callers rely on the job's result
+	 * code here. */
+	Job.set_error(-EBADRQC, CACHE_SVC,
+		      "invalid admin cmd code from client");
+	log4cplus_info("invalid admin cmd code[%d] from client", admin_code);
+
+	return DTC_CODE_BUFFER_SUCCESS;
+}
+
+/*
+ * Check whether a column expansion with the table definition carried in
+ * field 3 of the request row may start.
+ *
+ * Fails when update_mode() or is_mem_dirty() is set, when the definition
+ * cannot be parsed, when an expansion is already running
+ * (check_expand_status() == -1) or when check_expand_status() == -2
+ * (logged as "column expand not enabled").
+ *
+ * The parsed definition is only used for comparison here, so it is
+ * released on every path, including the successful one.
+ */
+BufferResult
+BufferProcessAskChain::buffer_check_expand_status(DTCJobOperation &Job)
+{
+	if (update_mode() || is_mem_dirty()) {
+		Job.set_error(-EC_SERVER_ERROR, "cache process",
+			      "try to column expand when cache is async");
+		log4cplus_error("try to column expand when cache is async");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	// get table.conf
+	RowValue stRow(Job.table_definition());
+	Job.update_row(stRow);
+	log4cplus_debug("value[len: %d]", stRow[3].bin.len);
+
+	// parse table.conf to tabledef
+	// release t by DEC_DELETE, not delete
+	DTCTableDefinition *t;
+	if (stRow[3].bin.ptr == NULL ||
+	    (t = TableDefinitionManager::instance()->load_buffered_table(
+		     stRow[3].bin.ptr)) == NULL) {
+		log4cplus_error("expand column with illegal table.yaml");
+		Job.set_error(-EC_SERVER_ERROR, "cache process table.yaml",
+			      "table.yaml illegal");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	const int ret = cache_.check_expand_status();
+	if (ret == -1) {
+		// an expansion is running: compare with its table definition
+		if (t->is_same_table(TableDefinitionManager::instance()
+					     ->get_new_table_def())) {
+			log4cplus_info(
+				"expand same column while expanding, canceled");
+			Job.set_error(
+				-EC_ERR_COL_EXPAND_DUPLICATE, "cache process",
+				"expand same column while expanding, canceled");
+		} else {
+			log4cplus_error(
+				"new expanding job while expand, canceled");
+			Job.set_error(
+				-EC_ERR_COL_EXPANDING, "cache process",
+				"new expanding job while expand, canceled");
+		}
+		// release t
+		DEC_DELETE(t);
+		return DTC_CODE_BUFFER_ERROR;
+	}
+	if (ret == -2) {
+		log4cplus_error("column expand not enabled");
+		Job.set_error(-EC_SERVER_ERROR, "cache process",
+			      "column expand not enabled");
+		DEC_DELETE(t);
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	// t is not handed to anybody on success: release it here as well
+	DEC_DELETE(t);
+
+	log4cplus_debug("buffer_check_expand_status ok");
+	return DTC_CODE_BUFFER_SUCCESS;
+}
+
+/*
+ * Start a column expansion with the table definition carried in field 3
+ * of the request row.
+ *
+ * The request is refused when the definition cannot be parsed, when an
+ * expansion is already running, when check_expand_status() returns -2,
+ * when the definition equals the current one, when the table index of the
+ * definition manager differs from the one in shared memory, or when
+ * try_col_expand() fails. On success the parsed definition is handed to
+ * the definition manager as the new table definition.
+ */
+BufferResult BufferProcessAskChain::buffer_column_expand(DTCJobOperation &Job)
+{
+	// fetch table.conf from the request row
+	RowValue row(Job.table_definition());
+	Job.update_row(row);
+	log4cplus_debug("value[len: %d]", row[3].bin.len);
+
+	// parse table.conf into a table definition;
+	// it must be released with DEC_DELETE, not delete
+	DTCTableDefinition *new_def;
+	if (row[3].bin.ptr == NULL ||
+	    (new_def = TableDefinitionManager::instance()->load_buffered_table(
+		     row[3].bin.ptr)) == NULL) {
+		log4cplus_error("expand column with illegal table.yaml");
+		Job.set_error(-EC_SERVER_ERROR, "cache process",
+			      "table.yaml illegal");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	int rc = cache_.check_expand_status();
+	if (rc == -1) {
+		// already expanding: same definition or a different one?
+		const bool same_as_running = new_def->is_same_table(
+			TableDefinitionManager::instance()->get_new_table_def());
+		if (same_as_running) {
+			log4cplus_info(
+				"expand same column while expanding, canceled");
+			Job.set_error(
+				-EC_ERR_COL_EXPAND_DUPLICATE, "cache process",
+				"expand same column while expanding, canceled");
+		} else {
+			log4cplus_error(
+				"new expanding job while expand, canceled");
+			Job.set_error(
+				-EC_ERR_COL_EXPANDING, "cache process",
+				"new expanding job while expand, canceled");
+		}
+		DEC_DELETE(new_def);
+		return DTC_CODE_BUFFER_ERROR;
+	}
+	if (rc == -2) {
+		log4cplus_error("column expand not enabled");
+		Job.set_error(-EC_SERVER_ERROR, "cache process",
+			      "column expand not enabled");
+		DEC_DELETE(new_def);
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	if (new_def->is_same_table(
+		    TableDefinitionManager::instance()->get_cur_table_def())) {
+		log4cplus_info("expand same column, canceled");
+		Job.set_error(-EC_ERR_COL_EXPAND_DUPLICATE, "cache process",
+			      "expand same column, canceled");
+		DEC_DELETE(new_def);
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	// the manager and the shared memory must agree on the table index
+	if (TableDefinitionManager::instance()->get_cur_table_idx() !=
+	    cache_.shm_table_idx()) {
+		log4cplus_error(
+			"tabledefmanager's idx and shm's are different, need restart");
+		Job.set_error(-EC_SERVER_ERROR, "cache process",
+			      "tabledefmanager's idx and shm's are different");
+		DEC_DELETE(new_def);
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	// copy table.conf to shm
+	rc = cache_.try_col_expand(row[3].bin.ptr, row[3].bin.len);
+	if (rc != 0) {
+		log4cplus_error("try col expand error, ret: %d", rc);
+		Job.set_error(-EC_SERVER_ERROR, "cache process",
+			      "try col expand error");
+		DEC_DELETE(new_def);
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	// set the new table for the definition manager
+	TableDefinitionManager::instance()->set_new_table_def(
+		new_def, (cache_.shm_table_idx() + 1));
+	TableDefinitionManager::instance()->renew_table_file_def(
+		row[3].bin.ptr, row[3].bin.len);
+	TableDefinitionManager::instance()->save_db_config();
+	cache_.col_expand(row[3].bin.ptr, row[3].bin.len);
+
+	// cache-only mode: write the command to the hot-backup log
+	if (dtc_mode_ == DTC_MODE_CACHE_ONLY)
+		write_hotbackup_log(_DTC_HB_COL_EXPAND_, row[3].bin.ptr,
+				    row[3].bin.len,
+				    DTCHotBackup::SYNC_COLEXPAND_CMD);
+
+	log4cplus_debug("buffer_column_expand ok");
+	return DTC_CODE_BUFFER_SUCCESS;
+}
+
+/*
+ * Finish a column expansion with the table definition carried in field 3
+ * of the request row.
+ *
+ * The request is refused when the definition cannot be parsed, when
+ * check_expand_status() returns -2 or 0 (no expansion running), or when
+ * the definition differs from the one being expanded. In
+ * DTC_MODE_DATABASE_ADDITION the configuration is reloaded through
+ * collect_load_config() and a reload-config job is sent down main_chain.
+ *
+ * The parsed definition is only compared against the manager's
+ * definitions, so it is released with DEC_DELETE on every path once the
+ * comparison is done.
+ */
+BufferResult
+BufferProcessAskChain::buffer_column_expand_done(DTCJobOperation &Job)
+{
+	int ret = 0;
+	// get table.conf
+	RowValue stRow(Job.table_definition());
+	Job.update_row(stRow);
+	log4cplus_debug("value[len: %d]", stRow[3].bin.len);
+	DTCTableDefinition *t;
+	// parse table.conf to tabledef
+	// release t by DEC_DELETE, not delete
+	if (stRow[3].bin.ptr == NULL ||
+	    (t = TableDefinitionManager::instance()->load_buffered_table(
+		     stRow[3].bin.ptr)) == NULL) {
+		log4cplus_error("expand column with illegal table.yaml");
+		Job.set_error(-EC_SERVER_ERROR, "cache process",
+			      "table.yaml illegal");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+	if ((ret = cache_.check_expand_status()) == -2) {
+		log4cplus_error(
+			"expand done when not expand job begin or feature not enabled");
+		Job.set_error(-EC_SERVER_ERROR, "cache process",
+			      "expand done when not expand job begin");
+		DEC_DELETE(t);
+		return DTC_CODE_BUFFER_ERROR;
+	} else if (ret == 0) {
+		// no expansion running: compare with the current definition
+		if (t->is_same_table(TableDefinitionManager::instance()
+					     ->get_cur_table_def())) {
+			log4cplus_info(
+				"expand done same column while expanding not start, canceled");
+			Job.set_error(
+				-EC_ERR_COL_EXPAND_DONE_DUPLICATE,
+				"cache process",
+				"expand same column while expanding not start, canceled");
+		} else {
+			log4cplus_error(
+				"new expand done job while expanding not start, canceled");
+			Job.set_error(
+				-EC_ERR_COL_EXPAND_DONE_DISTINCT,
+				"cache process",
+				"new expanding job while expanding not start, canceled");
+		}
+		DEC_DELETE(t);
+		return DTC_CODE_BUFFER_ERROR;
+	} else {
+		// expansion running: it must be the same definition
+		if (!t->is_same_table(TableDefinitionManager::instance()
+					      ->get_new_table_def())) {
+			log4cplus_error(
+				"new expand done job while expanding, canceled");
+			Job.set_error(
+				-EC_ERR_COL_EXPAND_DONE_DISTINCT,
+				"cache process",
+				"new expanding job done while expanding, canceled");
+			DEC_DELETE(t);
+			return DTC_CODE_BUFFER_ERROR;
+		}
+	}
+
+	// t was only needed for the comparisons above
+	DEC_DELETE(t);
+
+	// with a backing data source: reload the configuration into the helper
+	if (dtc_mode_ == DTC_MODE_DATABASE_ADDITION) {
+		char *buf = stRow[3].bin.ptr;
+		const size_t bufSize = strlen(buf) + 1;
+		char *bufLocal = (char *)MALLOC(bufSize);
+		if (bufLocal == NULL) {
+			log4cplus_error("malloc error: %m");
+			Job.set_error(-ENOMEM, CACHE_SVC, "malloc error");
+			return DTC_CODE_BUFFER_ERROR;
+		}
+		// private, NUL-terminated copy handed to the parser
+		memcpy(bufLocal, buf, bufSize);
+		DbConfig *dbconfig = DbConfig::load_buffered(bufLocal);
+		FREE(bufLocal);
+		if (!dbconfig) {
+			log4cplus_error(
+				"reload dbconfig for collect failed, canceled");
+			Job.set_error(
+				-EC_ERR_COL_EXPAND_DONE_DISTINCT,
+				"cache process",
+				"reload dbconfig for collect failed, canceled");
+			return DTC_CODE_BUFFER_ERROR;
+		}
+		// NOTE(review): ownership of dbconfig after a failed
+		// collect_load_config() is not visible here - confirm.
+		if (collect_load_config(dbconfig)) {
+			log4cplus_error(
+				"reload config to collect failed, canceled");
+			Job.set_error(
+				-EC_ERR_COL_EXPAND_DONE_DISTINCT,
+				"cache process",
+				"reload config to collect failed, canceled");
+			return DTC_CODE_BUFFER_ERROR;
+		}
+	}
+
+	TableDefinitionManager::instance()->renew_cur_table_def();
+	TableDefinitionManager::instance()->save_new_table_conf();
+	DTCColExpand::instance()->expand_done();
+
+	if (dtc_mode_ == DTC_MODE_CACHE_ONLY)
+		write_hotbackup_log(_DTC_HB_COL_EXPAND_DONE_, stRow[3].bin.ptr,
+				    stRow[3].bin.len,
+				    DTCHotBackup::SYNC_COLEXPAND_CMD);
+	log4cplus_debug("buffer_column_expand_done ok");
+
+	// with a backing data source: tell the work helper to reload its
+	// configuration file
+	if (dtc_mode_ == DTC_MODE_DATABASE_ADDITION) {
+		DTCJobOperation *pJob = new DTCJobOperation(
+			TableDefinitionManager::instance()->get_cur_table_def());
+		if (NULL == pJob) {
+			log4cplus_error(
+				"cannot notify work helper reload config, new job error, possible memory exhausted!");
+		} else {
+			log4cplus_error(
+				"notify work helper reload config start!");
+			pJob->set_request_type(TaskTypeHelperReloadConfig);
+			pJob->set_request_code(DRequest::ReloadConfig);
+			pJob->push_reply_dispatcher(&cache_reply_);
+
+			main_chain.job_ask_procedure(pJob);
+		}
+	}
+	return DTC_CODE_BUFFER_SUCCESS;
+}
+
+/*
+ * Expand the cached node of one key while a column expansion is running.
+ *
+ * Requires check_expand_status() == -1 and a request condition whose
+ * first field has id 2. A key that is not cached is not an error. In
+ * cache-only mode the expanded key is written to the hot-backup log.
+ */
+BufferResult
+BufferProcessAskChain::buffer_column_expand_key(DTCJobOperation &Job)
+{
+	const bool expanding = (cache_.check_expand_status() == -1);
+	if (!expanding) {
+		log4cplus_error(
+			"expand one when not expand job begin or feature not enabled");
+		Job.set_error(-EC_ERR_COL_NOT_EXPANDING, "cache process",
+			      "expand one when not expand job begin");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	const DTCFieldValue *cond = Job.request_condition();
+
+	// TODO this may need fix, as we do not check whether this field is key
+	const bool has_key_field = cond != NULL && cond->num_fields() >= 1 &&
+				   cond->field_id(0) == 2;
+	if (!has_key_field) {
+		Job.set_error(-EC_ERR_COL_NO_KEY, "cache process",
+			      "no key value append for col expand");
+		log4cplus_error("no key value append for col expand");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	const DTCValue *key_value = cond->field_value(0);
+	Node node = cache_.cache_find_auto_chose_hash(key_value->bin.ptr);
+	if (!node) {
+		log4cplus_info("key not exist for col expand");
+		return DTC_CODE_BUFFER_SUCCESS;
+	}
+
+	const int rc = data_process_->expand_node(Job, &node);
+	if (rc != 0) {
+		if (rc == -4) {
+			/* -4 is reported as "no memory" */
+			Job.set_error(-EC_ERR_COL_EXPAND_NO_MEM,
+				      "cache process",
+				      data_process_->get_err_msg());
+			log4cplus_error("no mem to expand for key, %s",
+					data_process_->get_err_msg());
+		} else {
+			Job.set_error(-EC_SERVER_ERROR, "cache process",
+				      data_process_->get_err_msg());
+			log4cplus_error("expand key error: %s",
+					data_process_->get_err_msg());
+		}
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	// hot-backup log for cache-only (no database) mode
+	if (dtc_mode_ == DTC_MODE_CACHE_ONLY)
+		write_hotbackup_log(key_value->bin.ptr, NULL, 0,
+				    DTCHotBackup::SYNC_COLEXPAND);
+
+	log4cplus_debug("buffer_column_expand_key ok");
+	return DTC_CODE_BUFFER_SUCCESS;
+}
+
+/*
+ * Register a hot-backup peer.
+ *
+ * Creates and initialises the hot-backup feature on first use (retrying
+ * the initialisation once after try_purge_size() when it fails with
+ * -ENOMEM), sets the master uptime if it is still 0, switches on the
+ * changed-key log, stores both timestamps in the job and dispatches the
+ * job as TaskTypeRegisterHbLog.
+ *
+ * When initialising or adding the feature fails, the object is deleted
+ * and the member pointer is reset to NULL. Otherwise a later call would
+ * see a non-NULL pointer, skip the initialisation and use a feature
+ * object that was never set up.
+ */
+BufferResult BufferProcessAskChain::buffer_register_hb(DTCJobOperation &Job)
+{
+	if (hotbackup_lru_feature_ == NULL) {
+		// the hot-backup feature is not active in shared memory yet
+		NEW(HBFeature, hotbackup_lru_feature_);
+		if (hotbackup_lru_feature_ == NULL) {
+			log4cplus_error("new hot-backup feature error: %m");
+			Job.set_error(-EC_SERVER_ERROR, "buffer_register_hb",
+				      "new hot-backup feature fail");
+			return DTC_CODE_BUFFER_ERROR;
+		}
+		int iRet = hotbackup_lru_feature_->init(time(NULL));
+		if (iRet == -ENOMEM) {
+			// make room and try once more
+			Node stNode;
+			if (cache_.try_purge_size(1, stNode) == 0)
+				iRet = hotbackup_lru_feature_->init(time(NULL));
+		}
+		if (iRet != 0) {
+			log4cplus_error("init hot-backup feature error: %d",
+					iRet);
+			Job.set_error(-EC_SERVER_ERROR, "buffer_register_hb",
+				      "init hot-backup feature fail");
+			// do not keep a half-initialised feature around
+			delete hotbackup_lru_feature_;
+			hotbackup_lru_feature_ = NULL;
+			return DTC_CODE_BUFFER_ERROR;
+		}
+		iRet = cache_.add_feature(HOT_BACKUP,
+					  hotbackup_lru_feature_->get_handle());
+		if (iRet != 0) {
+			log4cplus_error("add hot-backup feature error: %d",
+					iRet);
+			Job.set_error(-EC_SERVER_ERROR, "buffer_register_hb",
+				      "add hot-backup feature fail");
+			// the feature was not registered: retry from scratch
+			// on the next request
+			delete hotbackup_lru_feature_;
+			hotbackup_lru_feature_ = NULL;
+			return DTC_CODE_BUFFER_ERROR;
+		}
+	}
+	if (hotbackup_lru_feature_->master_uptime() == 0)
+		hotbackup_lru_feature_->master_uptime() = time(NULL);
+
+	// switch on the changed-key log
+	log_hotbackup_key_switch_ = true;
+
+	int64_t hb_timestamp = hotbackup_lru_feature_->master_uptime();
+	Job.versionInfo.set_master_hb_timestamp(hb_timestamp);
+	Job.versionInfo.set_slave_hb_timestamp(
+		hotbackup_lru_feature_->slave_uptime());
+
+	Job.set_request_type(TaskTypeRegisterHbLog);
+	dispatch_hot_back_task(&Job);
+	return DTC_CODE_BUFFER_TO_HOTBACKUP_TARGET;
+}
+
+/*
+ * Hot-backup logout: nothing is done, the request always succeeds.
+ */
+BufferResult BufferProcessAskChain::buffer_logout_hb(DTCJobOperation &Job)
+{
+	const BufferResult result = DTC_CODE_BUFFER_SUCCESS;
+
+	return result;
+}
+
+/*
+ * Walk the cache nodes with ids in [limit_start, limit_start + limit_count)
+ * and append one row (packed key in field 2, raw node data in field 3) per
+ * usable node to the job's result.
+ *
+ * With a database behind the cache and dstype == 2 (rocksdb) the job is
+ * turned into a Replicate read and passed to the next chain instead.
+ * A start id beyond the largest node id yields -EC_FULL_SYNC_COMPLETE.
+ *
+ * The end of the id range is computed in 64 bits: in 32 bits
+ * limit_start + limit_count can wrap around, which made the loop scan
+ * nothing and return an empty result.
+ */
+BufferResult BufferProcessAskChain::buffer_get_key_list(DTCJobOperation &Job)
+{
+	uint32_t lst, lcnt;
+	lst = Job.requestInfo.limit_start();
+	lcnt = Job.requestInfo.limit_count();
+
+	log4cplus_debug("buffer_get_key_list start, limit[%u %u]", lst, lcnt);
+
+	// if the storage is Rocksdb, do replicate through it directly in full sync stage,
+	// just dispatch the job to helper unit
+	if (dtc_mode_ == DTC_MODE_DATABASE_ADDITION &&
+	    dbConfig->dstype == 2 /* rocksdb */) {
+		log4cplus_info("proc local replicate!");
+		Job.set_request_code(DRequest::Replicate);
+		// Job.SetRequestType(TaskTypeHelperReplicate);
+		Job.set_request_type(TaskTypeRead);
+
+		// the hot-backup request has a different table definition than
+		// a normal query, so switch table definitions while querying
+		// the storage
+		DTCTableDefinition *repTab = Job.table_definition();
+
+		Job.set_table_definition(
+			TableDefinitionManager::instance()->get_cur_table_def());
+		Job.set_replicate_table(repTab);
+
+		return DTC_CODE_BUFFER_GOTO_NEXT_CHAIN;
+	}
+
+	// every node has been visited
+	if (lst > cache_.max_node_id()) {
+		Job.set_error(-EC_FULL_SYNC_COMPLETE, "buffer_get_key_list",
+			      "node id is overflow");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	Job.prepare_result_no_limit();
+
+	RowValue r(Job.table_definition());
+	RawData rawdata(&g_stSysMalloc, 1);
+
+	// 64-bit bound: lst + lcnt must not wrap
+	const uint64_t end_id = static_cast<uint64_t>(lst) + lcnt;
+
+	for (uint64_t id = lst; id < end_id; ++id) {
+		if (id < cache_.get_min_valid_node_id())
+			continue;
+		if (id > cache_.max_node_id())
+			break;
+
+		// id <= max_node_id() here, so it fits the node id type again
+		const unsigned i = static_cast<unsigned>(id);
+
+		// look the node up
+		Node node = I_SEARCH(i);
+		if (!node)
+			continue;
+		if (node.not_in_lru_list())
+			continue;
+		if (cache_.is_time_marker(node))
+			continue;
+
+		// decode the key
+		DataChunk *keyptr = M_POINTER(DataChunk, node.vd_handle());
+
+		// send the packed key
+		r[2] = TableDefinitionManager::instance()
+			       ->get_cur_table_def()
+			       ->packed_key(keyptr->key());
+
+		// decode the value; nodes that cannot be read are skipped
+		if (data_process_->get_node_all_rows_count(&node, &rawdata)) {
+			rawdata.destory();
+			continue;
+		}
+
+		r[3].Set((char *)(rawdata.get_addr()),
+			 (int)(rawdata.data_size()));
+
+		Job.append_row(&r);
+
+		rawdata.destory();
+	}
+
+	return DTC_CODE_BUFFER_SUCCESS;
+}
+
+/*
+ * Hot backup pulls updated keys or LRU changes. When there is none the
+ * request is suspended until
+ * 1. it times out, or
+ * 2. a key is updated or the LRU changes.
+ *
+ * Here the job is only marked as TaskTypeReadHbLog and dispatched to the
+ * hot-backup target.
+ */
+BufferResult BufferProcessAskChain::buffer_get_update_key(DTCJobOperation &Job)
+{
+	log4cplus_debug("buffer_get_update_key start");
+
+	DTCJobOperation *const hb_job = &Job;
+	hb_job->set_request_type(TaskTypeReadHbLog);
+	dispatch_hot_back_task(hb_job);
+
+	return DTC_CODE_BUFFER_TO_HOTBACKUP_TARGET;
+}
+
+/*
+ * Return the raw node data of every key listed in the request condition.
+ *
+ * One row is appended per key: field 2 holds the key, field 1 is
+ * HAS_VALUE with the raw data in field 3, or KEY_NOEXIST when the key is
+ * not cached. Fails with -EIO when the node data cannot be read.
+ *
+ * A request without a condition is handled like one with zero keys
+ * instead of dereferencing a NULL pointer.
+ */
+BufferResult BufferProcessAskChain::buffer_get_raw_data(DTCJobOperation &Job)
+{
+	int iRet;
+
+	const DTCFieldValue *condition = Job.request_condition();
+	const DTCValue *key;
+
+	log4cplus_debug("buffer_get_raw_data start ");
+
+	RowValue stRow(Job.table_definition()); // one row of data
+	RawData stNodeData(&g_stSysMalloc, 1);
+
+	Job.prepare_result_no_limit();
+
+	// no condition at all means no keys
+	const int key_count = (condition != NULL) ? condition->num_fields() : 0;
+
+	for (int i = 0; i < key_count; i++) {
+		key = condition->field_value(i);
+		stRow[1].u64 = DTCHotBackup::HAS_VALUE; // a value field is attached
+		stRow[2].Set(key->bin.ptr, key->bin.len);
+
+		Node stNode = cache_.cache_find_auto_chose_hash(key->bin.ptr);
+		if (!stNode) {
+			// the master has no data for this key
+			stRow[1].u64 = DTCHotBackup::KEY_NOEXIST;
+			stRow[3].Set(0);
+			Job.append_row(&stRow);
+			continue;
+		}
+
+		iRet = data_process_->get_node_all_rows_count(&stNode,
+							      &stNodeData);
+		if (iRet != 0) {
+			log4cplus_error("get raw-data failed");
+			Job.set_error_dup(-EIO, CACHE_SVC,
+					  data_process_->get_err_msg());
+			return DTC_CODE_BUFFER_ERROR;
+		}
+		stRow[3].Set((char *)(stNodeData.get_addr()),
+			     (int)(stNodeData.data_size()));
+
+		Job.append_row(&stRow); // add the current row to the job
+		stNodeData.destory();
+	}
+
+	return DTC_CODE_BUFFER_SUCCESS;
+}
+
+/*
+ * Replace the cached data of one key with raw node data sent by the peer.
+ *
+ * The key is the first field of the request condition; flags are in
+ * field 1 and the raw data in field 3 of the update row. With KEY_NOEXIST
+ * or EMPTY_NODE set the key is purged instead. Otherwise the raw data is
+ * copied, parsed, checked against the key and stored in a (possibly newly
+ * allocated) node.
+ *
+ * Changes against the previous version:
+ *  - a missing condition is reported as -EC_KEY_NEEDED instead of
+ *    dereferencing NULL, and the message no longer comes from an
+ *    unrelated get_err_msg();
+ *  - the purge branch only counts rows when the node exists;
+ *  - a NULL or empty value is rejected before it is copied;
+ *  - the binary keys are logged with an explicit length.
+ */
+BufferResult
+BufferProcessAskChain::buffer_replace_raw_data(DTCJobOperation &Job)
+{
+	log4cplus_debug("buffer_replace_raw_data start ");
+
+	int iRet;
+
+	const DTCFieldValue *condition = Job.request_condition();
+	const DTCValue *key;
+
+	RowValue stRow(Job.table_definition()); // one row of data
+	RawData stNodeData(&g_stSysMalloc, 1);
+	if (condition == NULL || condition->num_fields() < 1) {
+		log4cplus_debug("%s", "replace raw data need key");
+		Job.set_error(-EC_KEY_NEEDED, CACHE_SVC,
+			      "replace raw data need key");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	key = condition->field_value(0);
+	stRow[2].Set(key->bin.ptr, key->bin.len);
+	Job.update_row(stRow); // fetch the data
+
+	log4cplus_debug("value[len: %d]", stRow[3].bin.len);
+
+	// adjust the empty-node filter of the slave
+	if (stRow[1].u64 & DTCHotBackup::EMPTY_NODE && empty_node_filter_) {
+		empty_node_filter_->SET(*(unsigned int *)(key->bin.ptr));
+	}
+
+	// the key does not exist on the master, or is an empty node: purge it
+	if (stRow[1].u64 & DTCHotBackup::KEY_NOEXIST ||
+	    stRow[1].u64 & DTCHotBackup::EMPTY_NODE) {
+		log4cplus_debug("purge slave data");
+		Node stNode = cache_.cache_find_auto_chose_hash(key->bin.ptr);
+		if (!(!stNode)) {
+			// only a cached key contributes rows to the total
+			int rows = cache_.node_rows_count(stNode);
+			log4cplus_debug("migrate replay ,row %d", rows);
+			cache_.inc_total_row(0LL - rows);
+		}
+		cache_.cache_purge(key->bin.ptr);
+		return DTC_CODE_BUFFER_SUCCESS;
+	}
+
+	// a value is required from here on
+	if (stRow[3].bin.ptr == NULL || stRow[3].bin.len <= 0) {
+		log4cplus_error("replace raw data without value");
+		Job.set_error(-EC_BAD_RAW_DATA, CACHE_SVC, "bad raw data");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	// parse into raw data
+	ALLOC_HANDLE_T hData = g_stSysMalloc.Malloc(stRow[3].bin.len);
+	if (hData == INVALID_HANDLE) {
+		log4cplus_error("malloc error: %m");
+		Job.set_error(-ENOMEM, CACHE_SVC, "malloc error");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	memcpy(g_stSysMalloc.handle_to_ptr(hData), stRow[3].bin.ptr,
+	       stRow[3].bin.len);
+
+	// NOTE(review): whether hData is still owned by this function when
+	// do_attach() fails is not visible here - confirm it is not leaked.
+	if ((iRet = stNodeData.do_attach(
+		     hData, 0, table_define_infomation_->key_format())) != 0) {
+		log4cplus_error("parse raw-data error: %d, %s", iRet,
+				stNodeData.get_err_msg());
+		Job.set_error(-EC_BAD_RAW_DATA, CACHE_SVC, "bad raw data");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	// check that the packed key matches
+	DTCValue packed_key = TableDefinitionManager::instance()
+				      ->get_cur_table_def()
+				      ->packed_key(stNodeData.key());
+	if (packed_key.bin.len != key->bin.len ||
+	    memcmp(packed_key.bin.ptr, key->bin.ptr, key->bin.len)) {
+		log4cplus_error(
+			"packed key miss match, key size=%d, packed key size=%d",
+			key->bin.len, packed_key.bin.len);
+		// the keys are binary: bound the output by their lengths
+		log4cplus_error(
+			"packed key miss match, packed_key %.*s,key %.*s",
+			(int)packed_key.bin.len, packed_key.bin.ptr,
+			(int)key->bin.len, key->bin.ptr);
+		Job.set_error(-EC_BAD_RAW_DATA, CACHE_SVC,
+			      "packed key miss match");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	// find or allocate the node
+	unsigned int uiNodeID;
+	Node stNode = cache_.cache_find_auto_chose_hash(key->bin.ptr);
+
+	if (!stNode) {
+		// second attempt after purging, if the first allocation fails
+		for (int i = 0; i < 2; i++) {
+			stNode = cache_.cache_allocation(key->bin.ptr);
+			if (!(!stNode))
+				break;
+			if (cache_.try_purge_size(1, stNode) != 0)
+				break;
+		}
+		if (!stNode) {
+			log4cplus_error("alloc cache node error");
+			Job.set_error(-EIO, CACHE_SVC,
+				      "alloc cache node error");
+			return DTC_CODE_BUFFER_ERROR;
+		}
+		stNode.vd_handle() = INVALID_HANDLE;
+	} else {
+		cache_.remove_from_lru(stNode);
+		cache_.insert_to_clean_lru(stNode);
+	}
+
+	uiNodeID = stNode.node_id();
+
+	// replace the data
+	iRet = data_process_->do_replace_all(&stNode, &stNodeData);
+	if (iRet != 0) {
+		if (dtc_mode_ == DTC_MODE_CACHE_ONLY) {
+			/* FIXME: no backup db, can't purge data, no recover solution yet */
+			log4cplus_error("cache replace raw data error: %d, %s",
+					iRet, data_process_->get_err_msg());
+			Job.set_error(-EIO, CACHE_SVC,
+				      "ReplaceRawData() error");
+			return DTC_CODE_BUFFER_ERROR;
+		} else {
+			log4cplus_error(
+				"cache replace raw data error: %d, %s. purge node: %u",
+				iRet, data_process_->get_err_msg(), uiNodeID);
+			cache_.purge_node_and_data(key->bin.ptr, stNode);
+			return DTC_CODE_BUFFER_SUCCESS;
+		}
+	}
+
+	cache_.inc_total_row(data_process_->get_increase_row_count());
+
+	log4cplus_debug("buffer_replace_raw_data success! ");
+
+	return DTC_CODE_BUFFER_SUCCESS;
+}
+
+/*
+ * Move the node of every key listed in the request condition to the clean
+ * LRU list.
+ *
+ * While g_hash_changing is set the key is looked up in the old hash first
+ * (and moved to the new hash when found there), then in the new hash.
+ * Fails with -EC_KEY_NOTEXIST at the first key that is not cached.
+ *
+ * A request without a condition is handled like one with zero keys
+ * instead of dereferencing a NULL pointer; the unused row object of the
+ * previous version is gone.
+ */
+BufferResult BufferProcessAskChain::buffer_adjust_lru(DTCJobOperation &Job)
+{
+	const DTCFieldValue *condition = Job.request_condition();
+	const DTCValue *key;
+
+	log4cplus_debug("buffer_adjust_lru start ");
+
+	// no condition at all means no keys
+	const int key_count = (condition != NULL) ? condition->num_fields() : 0;
+
+	for (int i = 0; i < key_count; i++) {
+		key = condition->field_value(i);
+
+		Node stNode;
+		if (g_hash_changing) {
+			const int newhash = g_target_new_hash ? 1 : 0;
+			const int oldhash = g_target_new_hash ? 0 : 1;
+
+			stNode = cache_.cache_find(key->bin.ptr, oldhash);
+			if (!stNode) {
+				stNode = cache_.cache_find(key->bin.ptr,
+							   newhash);
+			} else {
+				cache_.move_to_new_hash(key->bin.ptr, stNode);
+			}
+		} else {
+			stNode = cache_.cache_find(key->bin.ptr,
+						   g_target_new_hash ? 1 : 0);
+		}
+		if (!stNode) {
+			// a missing key aborts the request (it is not skipped)
+			Job.set_error(-EC_KEY_NOTEXIST, CACHE_SVC,
+				      "key not exist");
+			return DTC_CODE_BUFFER_ERROR;
+		}
+		cache_.remove_from_lru(stNode);
+		cache_.insert_to_clean_lru(stNode);
+	}
+
+	return DTC_CODE_BUFFER_SUCCESS;
+}
+
+/*
+ * Verify the hot-backup timestamp sent by the master.
+ *
+ * Creates and initialises the hot-backup feature on first use (retrying
+ * the initialisation once after try_purge_size() when it fails with
+ * -ENOMEM). A slave uptime of 0 is set to the master's timestamp; any
+ * other value must equal it, otherwise the job fails with
+ * -EC_ERR_SYNC_STAGE.
+ *
+ * When initialising or adding the feature fails, the object is deleted
+ * and the member pointer is reset to NULL. Otherwise a later call would
+ * see a non-NULL pointer, skip the initialisation and use a feature
+ * object that was never set up. The error source now names this function
+ * instead of buffer_register_hb.
+ */
+BufferResult BufferProcessAskChain::buffer_verify_hbt(DTCJobOperation &Job)
+{
+	log4cplus_debug("buffer_verify_hbt start ");
+
+	if (hotbackup_lru_feature_ == NULL) {
+		// the hot-backup feature is not active in shared memory yet
+		NEW(HBFeature, hotbackup_lru_feature_);
+		if (hotbackup_lru_feature_ == NULL) {
+			log4cplus_error("new hot-backup feature error: %m");
+			Job.set_error(-EC_SERVER_ERROR, "buffer_verify_hbt",
+				      "new hot-backup feature fail");
+			return DTC_CODE_BUFFER_ERROR;
+		}
+		int iRet = hotbackup_lru_feature_->init(0);
+		if (iRet == -ENOMEM) {
+			// make room and try once more
+			Node stNode;
+			if (cache_.try_purge_size(1, stNode) == 0)
+				iRet = hotbackup_lru_feature_->init(0);
+		}
+		if (iRet != 0) {
+			log4cplus_error("init hot-backup feature error: %d",
+					iRet);
+			Job.set_error(-EC_SERVER_ERROR, "buffer_verify_hbt",
+				      "init hot-backup feature fail");
+			// do not keep a half-initialised feature around
+			delete hotbackup_lru_feature_;
+			hotbackup_lru_feature_ = NULL;
+			return DTC_CODE_BUFFER_ERROR;
+		}
+		iRet = cache_.add_feature(HOT_BACKUP,
+					  hotbackup_lru_feature_->get_handle());
+		if (iRet != 0) {
+			log4cplus_error("add hot-backup feature error: %d",
+					iRet);
+			Job.set_error(-EC_SERVER_ERROR, "buffer_verify_hbt",
+				      "add hot-backup feature fail");
+			// the feature was not registered: retry from scratch
+			// on the next request
+			delete hotbackup_lru_feature_;
+			hotbackup_lru_feature_ = NULL;
+			return DTC_CODE_BUFFER_ERROR;
+		}
+	}
+
+	int64_t master_timestamp = Job.versionInfo.master_hb_timestamp();
+	if (hotbackup_lru_feature_->slave_uptime() == 0) {
+		// first contact: adopt the master's timestamp
+		hotbackup_lru_feature_->slave_uptime() = master_timestamp;
+	} else if (hotbackup_lru_feature_->slave_uptime() != master_timestamp) {
+		log4cplus_error(
+			"hot backup timestamp incorrect, master[%lld], this slave[%lld]",
+			(long long)master_timestamp,
+			(long long)(hotbackup_lru_feature_->slave_uptime()));
+		Job.set_error(-EC_ERR_SYNC_STAGE, "buffer_verify_hbt",
+			      "verify hot backup timestamp fail");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	return DTC_CODE_BUFFER_SUCCESS;
+}
+
+/*
+ * Report the master/slave hot-backup timestamps in the job's version info.
+ * Both are reported as 0 while the hot-backup feature has not been created.
+ */
+BufferResult BufferProcessAskChain::buffer_get_hbt(DTCJobOperation &Job)
+{
+	log4cplus_debug("buffer_get_hbt start ");
+
+	HBFeature *const feature = hotbackup_lru_feature_;
+	if (feature != NULL) {
+		Job.versionInfo.set_master_hb_timestamp(
+			feature->master_uptime());
+		Job.versionInfo.set_slave_hb_timestamp(feature->slave_uptime());
+	} else {
+		/* hot-backup feature not yet activated in shared memory */
+		Job.versionInfo.set_master_hb_timestamp(0);
+		Job.versionInfo.set_slave_hb_timestamp(0);
+	}
+
+	log4cplus_debug("master-up-time: %lld, slave-up-time: %lld",
+			(long long)(Job.versionInfo.master_hb_timestamp()),
+			(long long)(Job.versionInfo.slave_hb_timestamp()));
+
+	return DTC_CODE_BUFFER_SUCCESS;
+}
+
+/*
+ * Forward a server-info query to the hot-backup chain.
+ *
+ * The request type is rewritten to TaskTypeQueryHbLogInfo and the job is
+ * handed to dispatch_hot_back_task(); the return code tells the caller the
+ * job has been passed to the hot-backup target.
+ */
+BufferResult
+BufferProcessAskChain::buffer_query_serverinfo(DTCJobOperation &Job)
+{
+	log4cplus_debug("buffer_query_serverinfo start");
+	Job.set_request_type(TaskTypeQueryHbLogInfo);
+	dispatch_hot_back_task(&Job);
+	return DTC_CODE_BUFFER_TO_HOTBACKUP_TARGET;
+}
+
+/*
+ * Memory tidy: move the raw-data chunk of one node to a newly allocated
+ * chunk. Finished in one cache process cycle.
+ *
+ * Steps: packed key -> node -> node handle -> raw data -> private copy,
+ * then the old chunk is destroyed, a new chunk is allocated, the private
+ * copy is written into it and the node is pointed at the new handle.
+ *
+ * Returns DTC_CODE_BUFFER_SUCCESS on success; on any failure an error is
+ * set on the job and DTC_CODE_BUFFER_ERROR is returned.
+ */
+BufferResult
+BufferProcessAskChain::buffer_nodehandlechange(DTCJobOperation &Job)
+{
+	log4cplus_debug("buffer_nodehandlechange start ");
+
+	const DTCFieldValue *condition = Job.request_condition();
+
+	/* the key must be validated before field_value(0) is dereferenced */
+	if (condition == NULL || condition->num_fields() < 1 ||
+	    condition->field_value(0) == NULL) {
+		log4cplus_debug("%s", "nodehandlechange need key");
+		Job.set_error_dup(-EC_KEY_NEEDED, CACHE_SVC,
+				  data_process_->get_err_msg());
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	const DTCValue *key = condition->field_value(0);
+	Node node;
+	MEM_HANDLE_T node_handle;
+	RawData node_raw_data(PtMalloc::instance(), 0);
+	/* no need of private raw data, just for copy */
+	char *private_buff = NULL;
+	int buff_len;
+	MEM_HANDLE_T new_node_handle;
+
+	/* packed key -> node id -> node handle -> node raw data -> private buff*/
+	int newhash, oldhash;
+	if (g_hash_changing) {
+		if (g_target_new_hash) {
+			oldhash = 0;
+			newhash = 1;
+		} else {
+			oldhash = 1;
+			newhash = 0;
+		}
+		node = cache_.cache_find(key->bin.ptr, oldhash);
+		if (!node) {
+			node = cache_.cache_find(key->bin.ptr, newhash);
+		} else {
+			cache_.move_to_new_hash(key->bin.ptr, node);
+		}
+	} else {
+		if (g_target_new_hash) {
+			node = cache_.cache_find(key->bin.ptr, 1);
+		} else {
+			node = cache_.cache_find(key->bin.ptr, 0);
+		}
+	}
+
+	if (!node) {
+		log4cplus_debug("%s", "key not exist for defragmentation");
+		Job.set_error(-ER_KEY_NOT_FOUND, CACHE_SVC, "node not found");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	node_handle = node.vd_handle();
+	if (node_handle == INVALID_HANDLE) {
+		Job.set_error(-EC_BAD_RAW_DATA, CACHE_SVC, "chunk not exist");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	/* refuse to copy from a chunk that could not be attached */
+	int attach_ret = node_raw_data.do_attach(
+		node_handle, table_define_infomation_->key_fields() - 1,
+		table_define_infomation_->key_format());
+	if (attach_ret != 0) {
+		log4cplus_error("parse raw-data error: %d, %s", attach_ret,
+				node_raw_data.get_err_msg());
+		Job.set_error(-EC_BAD_RAW_DATA, CACHE_SVC, "bad raw data");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	if ((private_buff = (char *)MALLOC(node_raw_data.data_size())) ==
+	    NULL) {
+		log4cplus_error("no mem");
+		Job.set_error(-ENOMEM, CACHE_SVC, "malloc error");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	memcpy(private_buff, node_raw_data.get_addr(),
+	       node_raw_data.data_size());
+	buff_len = node_raw_data.data_size();
+	if (node_raw_data.destory()) {
+		log4cplus_error("node raw data detroy error");
+		Job.set_error(-ENOMEM, CACHE_SVC, "free error");
+		FREE_IF(private_buff);
+		return DTC_CODE_BUFFER_ERROR;
+	}
+	log4cplus_debug("old node handle: " UINT64FMT_T ", raw data size %d",
+			node_handle, buff_len);
+
+	/* new chunk */
+	/* new node handle -> new node handle ptr <- node raw data ptr*/
+	new_node_handle = PtMalloc::instance()->Malloc(buff_len);
+	log4cplus_debug("new node handle: " UINT64FMT_T, new_node_handle);
+
+	if (new_node_handle == INVALID_HANDLE) {
+		log4cplus_error("malloc error: %m");
+		Job.set_error(-ENOMEM, CACHE_SVC, "malloc error");
+		/*
+		 * The old chunk has already been destroyed above; do not leave
+		 * the node referring to it.
+		 * NOTE(review): row/dirty accounting for this node is not
+		 * adjusted here - confirm whether the node should be purged.
+		 */
+		node.vd_handle() = INVALID_HANDLE;
+		FREE_IF(private_buff);
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	memcpy(PtMalloc::instance()->handle_to_ptr(new_node_handle),
+	       private_buff, buff_len);
+
+	/* free node raw data, set node handle */
+	node.vd_handle() = new_node_handle;
+	FREE_IF(private_buff);
+
+	log4cplus_debug("buffer_nodehandlechange success! ");
+	return DTC_CODE_BUFFER_SUCCESS;
+}
+
+/*
+ * Handle a migrate command for one key.
+ *
+ * Two request origins are distinguished by the low byte of the first
+ * update-info field:
+ *  - DTCMigrate::FROM_SERVER: the request was sent by another DTC and
+ *    carries the node's raw data; the data is parsed, checked against the
+ *    packed key and written into the local cache (or the key is purged when
+ *    the sender reports it as missing/empty).
+ *  - anything else: the request comes from the api/tool; the node's raw
+ *    data is read from the local cache and a replace request is built for
+ *    the remote DTC.
+ *
+ * Returns DTC_CODE_BUFFER_SUCCESS when the job was completed locally,
+ * DTC_CODE_BUFFER_TO_REMOTE_TARGET when the rebuilt request should be sent
+ * to the remote DTC, and DTC_CODE_BUFFER_ERROR (with an error set on the
+ * job) on failure.
+ */
+BufferResult BufferProcessAskChain::buffer_migrate(DTCJobOperation &Job)
+{
+	if (g_key_route_ask_instance == 0) {
+		log4cplus_error("not support migrate cmd @ bypass mode");
+		Job.set_error(-EC_SERVER_ERROR, "buffer_migrate",
+			      "Not Support @ Bypass Mode");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+	int iRet;
+
+	const DTCFieldValue *ui = Job.request_operation();
+	const DTCValue key = TableDefinitionManager::instance()
+				     ->get_cur_table_def()
+				     ->packed_key(Job.packed_key());
+	if (key.bin.ptr == 0 || key.bin.len <= 0) {
+		Job.set_error(-EC_KEY_NEEDED, "buffer_migrate",
+			      "need set migrate key");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	log4cplus_debug("cache_cache_migrate start ");
+
+	// one row of data; shared by both request origins below
+	RowValue stRow(Job.table_definition());
+	RawData stNodeData(&g_stSysMalloc, 1);
+
+	Node stNode = cache_.cache_find_auto_chose_hash(key.bin.ptr);
+
+	// an update-info field means the request was sent by another DTC
+	int flag = 0;
+	if (ui && ui->field_value(0)) {
+		flag = ui->field_value(0)->s64;
+	}
+	if ((flag & 0xFF) == DTCMigrate::FROM_SERVER) {
+		log4cplus_debug("this migrate cmd is from DTC");
+		stRow[2].Set(key.bin.ptr, key.bin.len);
+		Job.update_row(stRow); // fetch the data
+
+		log4cplus_debug("value[len: %d]", stRow[3].bin.len);
+
+		// key does not exist on the master, or is an empty node: purge cache
+		if (stRow[1].u64 & DTCHotBackup::KEY_NOEXIST ||
+		    stRow[1].u64 & DTCHotBackup::EMPTY_NODE) {
+			log4cplus_debug("purge slave data");
+			cache_.cache_purge(key.bin.ptr);
+			return DTC_CODE_BUFFER_SUCCESS;
+		}
+
+		// parse the value into raw data
+		ALLOC_HANDLE_T hData = g_stSysMalloc.Malloc(stRow[3].bin.len);
+		if (hData == INVALID_HANDLE) {
+			log4cplus_error("malloc error: %m");
+			Job.set_error(-ENOMEM, CACHE_SVC, "malloc error");
+			return DTC_CODE_BUFFER_ERROR;
+		}
+
+		memcpy(g_stSysMalloc.handle_to_ptr(hData), stRow[3].bin.ptr,
+		       stRow[3].bin.len);
+
+		// NOTE(review): hData ownership on attach failure is not visible
+		// here - confirm whether it must be freed on this error path
+		if ((iRet = stNodeData.do_attach(
+			     hData, 0,
+			     table_define_infomation_->key_format())) != 0) {
+			log4cplus_error("parse raw-data error: %d, %s", iRet,
+					stNodeData.get_err_msg());
+			Job.set_error(-EC_BAD_RAW_DATA, CACHE_SVC,
+				      "bad raw data");
+			return DTC_CODE_BUFFER_ERROR;
+		}
+
+		// check that the packed key matches
+		DTCValue packed_key = TableDefinitionManager::instance()
+					      ->get_cur_table_def()
+					      ->packed_key(stNodeData.key());
+		if (packed_key.bin.len != key.bin.len ||
+		    memcmp(packed_key.bin.ptr, key.bin.ptr, key.bin.len)) {
+			log4cplus_error(
+				"packed key miss match, key size=%d, packed key size=%d",
+				key.bin.len, packed_key.bin.len);
+
+			Job.set_error(-EC_BAD_RAW_DATA, CACHE_SVC,
+				      "packed key miss match");
+			return DTC_CODE_BUFFER_ERROR;
+		}
+
+		// find or allocate the node
+		unsigned int uiNodeID;
+
+		if (!stNode) {
+			// try once, purge on failure, then try once more
+			for (int i = 0; i < 2; i++) {
+				stNode = cache_.cache_allocation(key.bin.ptr);
+				if (!(!stNode))
+					break;
+				if (cache_.try_purge_size(1, stNode) != 0)
+					break;
+			}
+			if (!stNode) {
+				log4cplus_error("alloc cache node error");
+				Job.set_error(-EIO, CACHE_SVC,
+					      "alloc cache node error");
+				return DTC_CODE_BUFFER_ERROR;
+			}
+			stNode.vd_handle() = INVALID_HANDLE;
+		} else {
+			cache_.remove_from_lru(stNode);
+			cache_.insert_to_clean_lru(stNode);
+		}
+		if ((flag >> 8) & 0xFF) // the sender marked the node dirty
+		{
+			cache_.remove_from_lru(stNode);
+			cache_.insert_to_dirty_lru(stNode);
+		}
+
+		uiNodeID = stNode.node_id();
+
+		// replace the node's data
+		iRet = data_process_->do_replace_all(&stNode, &stNodeData);
+		if (iRet != 0) {
+			if (dtc_mode_ == DTC_MODE_CACHE_ONLY) {
+				/* FIXME: no backup db, can't purge data, no recover solution yet */
+				log4cplus_error(
+					"cache replace raw data error: %d, %s",
+					iRet, data_process_->get_err_msg());
+				Job.set_error(-EIO, CACHE_SVC,
+					      "ReplaceRawData() error");
+				return DTC_CODE_BUFFER_ERROR;
+			} else {
+				log4cplus_error(
+					"cache replace raw data error: %d, %s. purge node: %u",
+					iRet, data_process_->get_err_msg(),
+					uiNodeID);
+				cache_.purge_node_and_data(key.bin.ptr, stNode);
+				return DTC_CODE_BUFFER_SUCCESS;
+			}
+		}
+		if (write_hotbackup_log(key.bin.ptr, stNode,
+					DTCHotBackup::SYNC_UPDATE)) {
+			log4cplus_error(
+				"buffer_migrate: log update key failed");
+		}
+		cache_.inc_total_row(data_process_->get_increase_row_count());
+
+		Job.prepare_result_no_limit();
+
+		return DTC_CODE_BUFFER_SUCCESS;
+	}
+
+	log4cplus_debug("this migrate cmd is from api");
+	// the request comes from the tool; build a request for the other dtc
+
+	if (!stNode) {
+		Job.set_error(-EC_KEY_NOTEXIST, "buffer_migrate",
+			      "this key not found in cache");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+	// fetch the node's raw-data and build a replace request for the backend helper
+	iRet = data_process_->get_node_all_rows_count(&stNode, &stNodeData);
+	if (iRet != 0) {
+		log4cplus_error("get raw-data failed");
+		Job.set_error_dup(-EIO, CACHE_SVC,
+				  data_process_->get_err_msg());
+		return DTC_CODE_BUFFER_ERROR;
+	}
+
+	// Allocate the value buffer before the update-info object, so that a
+	// failed allocation cannot leak the update-info object.
+	FREE_IF(Job.migratebuf);
+	Job.migratebuf = (char *)calloc(1, stNodeData.data_size());
+	if (Job.migratebuf == NULL) {
+		log4cplus_error("create buffer failed");
+		Job.set_error(-EIO, CACHE_SVC,
+			      "migrate:get raw data,create buffer failed");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+	memcpy(Job.migratebuf, (char *)(stNodeData.get_addr()),
+	       (int)(stNodeData.data_size()));
+
+	DTCFieldValue *uitmp = new DTCFieldValue(4);
+	if (uitmp == NULL) {
+		Job.set_error(-EIO, CACHE_SVC,
+			      "migrate:new DTCFieldValue error");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+	//id0 {"type", DField::Unsigned, 4, DTCValue::Make(0), 0}
+	// lowest byte of type: whether the request comes from another dtc or the api
+	// second-lowest byte: whether the node is dirty
+	uitmp->add_value(0, DField::Set, DField::Unsigned,
+			 DTCValue::Make(DTCMigrate::FROM_SERVER |
+					(stNode.is_dirty() << 8)));
+
+	//id1 {"flag", DField::Unsigned, 1, DTCValue::Make(0), 0},
+	uitmp->add_value(1, DField::Set, DField::Unsigned,
+			 DTCValue::Make(DTCHotBackup::HAS_VALUE));
+	//id2 {"key", DField::Binary, 255, DTCValue::Make(0), 0},
+
+	//id3 {"value", DField::Binary, MAXPACKETSIZE, DTCValue::Make(0), 0},
+	uitmp->add_value(3, DField::Set, DField::Binary,
+			 DTCValue::Make(Job.migratebuf,
+					stNodeData.data_size()));
+	Job.set_request_operation(uitmp);
+	g_key_route_ask_instance->key_migrating(stNodeData.key());
+
+	return DTC_CODE_BUFFER_TO_REMOTE_TARGET;
+}
+
+/*
+ * Clear the whole cache and rebuild it.
+ *
+ * Refused unless the update mode is MODE_SYNC. The hot-backup up-times are
+ * saved before the cache is recreated and restored afterwards, the
+ * hot-backup feature is re-initialised and re-registered, and a SYNC_CLEAR
+ * record is written to the hot-backup log.
+ *
+ * The process exits when clear_create() returns -1, or when the hot-backup
+ * feature cannot be re-initialised or re-registered.
+ */
+BufferResult BufferProcessAskChain::buffer_clear_cache(DTCJobOperation &Job)
+{
+	if (update_mode_ != MODE_SYNC) {
+		log4cplus_error("try to clear cache for async mode, abort...");
+		Job.set_error(-EC_SERVER_ERROR, "buffer_clear_cache",
+			      "can not clear cache for aync mode, abort");
+		return DTC_CODE_BUFFER_ERROR;
+	}
+	// clean and rebuild
+	int64_t mu = 0, su = 0;
+	if (hotbackup_lru_feature_ != NULL) {
+		mu = hotbackup_lru_feature_->master_uptime();
+		su = hotbackup_lru_feature_->slave_uptime();
+	}
+	// table.conf in shm is set in clear_create
+	int ret = cache_.clear_create();
+	if (ret < 0) {
+		log4cplus_error("clear and create cache error: %s",
+				cache_.error());
+		if (ret == -1) {
+			log4cplus_error("fault error, exit...");
+			exit(-1);
+		}
+		if (ret == -2) {
+			log4cplus_error("error, abort...");
+			Job.set_error(-EC_SERVER_ERROR, "buffer_clear_cache",
+				      "clear cache_ error, abort");
+			return DTC_CODE_BUFFER_ERROR;
+		}
+	}
+	data_process_->change_mallocator(PtMalloc::instance());
+	// setup hotbackup
+	if (hotbackup_lru_feature_ != NULL) {
+		hotbackup_lru_feature_->detach();
+		// memory was just cleared, so no purge-and-retry is attempted;
+		// any failure is still fatal, as the feature would be unusable
+		int iRet = hotbackup_lru_feature_->init(0);
+		if (iRet != 0) {
+			log4cplus_error("init hot-backup feature error: %d",
+					iRet);
+			exit(-1);
+		}
+		iRet = cache_.add_feature(
+			HOT_BACKUP, hotbackup_lru_feature_->get_handle());
+		if (iRet != 0) {
+			log4cplus_error("add hot-backup feature error: %d",
+					iRet);
+			exit(-1);
+		}
+		hotbackup_lru_feature_->master_uptime() = mu;
+		hotbackup_lru_feature_->slave_uptime() = su;
+	}
+	// hotbackup: record the clear with a zeroed key and an empty node
+	char buf[16];
+	memset(buf, 0, sizeof(buf));
+	Node node;
+	if (write_hotbackup_log(buf, node, DTCHotBackup::SYNC_CLEAR))
+		log4cplus_error("hb: log clear cache error");
+
+	return DTC_CODE_BUFFER_SUCCESS;
+}

+ 524 - 0
src/core/chain/buffer_process_ask_chain.h

@@ -0,0 +1,524 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef __BUFFER_PROCESS_ASK_CHAIN__
+#define __BUFFER_PROCESS_ASK_CHAIN__
+
+#include <sys/mman.h>
+#include <time.h>
+
+#include "protocol.h"
+#include "value.h"
+#include "field/field.h"
+#include "section.h"
+#include "table/table_def.h"
+#include "task/task_request.h"
+#include "list/list.h"
+#include "fence_queue.h"
+#include "buffer_pond.h"
+#include "poll/poller_base.h"
+#include "config/dbconfig.h"
+#include "queue/lqueue.h"
+#include "stat_dtc.h"
+#include "data_process.h"
+#include "empty_filter.h"
+#include "namespace.h"
+#include "task_pendlist.h"
+#include "data_chunk.h"
+#include "hb_log.h"
+#include "lru_bit.h"
+#include "hb_feature.h"
+#include "blacklist/blacklist_unit.h"
+#include "expire_time.h"
+#include "buffer_process_answer_chain.h"
+
+DTC_BEGIN_NAMESPACE
+
+class DTCFlushRequest;
+class BufferProcessAskChain;
+class DTCTableDefinition;
+class TaskPendingList;
+// Result codes returned by the buffer_* handlers of BufferProcessAskChain.
+enum BufferResult {
+	DTC_CODE_BUFFER_ERROR = -1,
+	DTC_CODE_BUFFER_SUCCESS = 0,
+	DTC_CODE_BUFFER_GOTO_NEXT_CHAIN = 1, // transmit job to connector.
+	DTC_CODE_BUFFER_UNFINISHED = 2, // waiting for flush module process.
+	DTC_CODE_BUFFER_TO_REMOTE_TARGET = 3, // transmit job to remote dtc.
+	DTC_CODE_BUFFER_TO_HOTBACKUP_TARGET =
+		4 // transmit job to hot back-up progress.
+};
+typedef unsigned int MARKER_STAMP;
+
+// Reply handler for flush jobs; keeps a pointer to the
+// BufferProcessAskChain that owns it.
+class FlushReplyNotify : public JobAnswerInterface<DTCJobOperation> {
+    private:
+	// chain that created this notifier
+	BufferProcessAskChain *flush_reply_notify_owner_;
+
+    public:
+	FlushReplyNotify(BufferProcessAskChain *buffer_process)
+		: flush_reply_notify_owner_(buffer_process)
+	{
+	}
+
+	virtual ~FlushReplyNotify()
+	{
+	}
+	// invoked with the answered flush job
+	virtual void job_answer_procedure(DTCJobOperation *);
+};
+
+// Reply handler for jobs that were dispatched to the hot-backup chain.
+class HotBackReplay : public JobAnswerInterface<DTCJobOperation> {
+    public:
+	HotBackReplay()
+	{
+	}
+	virtual ~HotBackReplay()
+	{
+	}
+	// invoked with the answered hot-backup job
+	virtual void job_answer_procedure(DTCJobOperation *job);
+};
+
+// LRU update levels (presumably compared against lru_update_level_ - TODO confirm).
+enum { LRU_NONE = 0,
+       LRU_BATCH,
+       LRU_READ,
+       LRU_WRITE,
+       LRU_ALWAYS = 999,
+};
+
+// Node lookup outcomes: not found, found but empty, found with data.
+enum { DTC_CODE_NODE_NOTFOUND, DTC_CODE_NODE_EMPTY, DTC_CODE_NODE_HIT };
+
+// Per-job working state used while one job is processed against the cache.
+struct CacheTransaction {
+	// job being processed
+	DTCJobOperation *current_task;
+	const char *key;
+	Node cache_transaction_node;
+	int old_rows;
+	uint8_t node_status;
+	uint8_t key_dirty;
+	uint8_t node_empty;
+	uint8_t lru_update;
+	// OLD ASYNC TRANSACTION LOG
+	int log_type;
+	// OLD ASYNC TRANSACTION LOG
+	RawData *log_rows;
+
+	// Zero the whole structure, then remember the job being processed.
+	void do_init(DTCJobOperation *job)
+	{
+		memset(this, 0, sizeof(CacheTransaction));
+		current_task = job;
+	}
+
+	// Reset every field except current_task; the node becomes Node::Empty().
+	void Free(void)
+	{
+		log_rows = NULL;
+		log_type = 0;
+
+		key = NULL;
+		cache_transaction_node = Node::Empty();
+		node_status = 0;
+		key_dirty = 0;
+		old_rows = 0;
+		node_empty = 0;
+		lru_update = 0;
+	}
+};
+
+// Cache-processing stage of the job chain.
+//
+// Receives jobs through job_ask_procedure(), serves them from the cache
+// (cache_ / data_process_) and can pass them on through three outgoing
+// chains: main_chain (next stage), remote_chain (remote dtc, used for
+// migrate) and hotbackup_chain (hot-backup log). It also carries the
+// per-job transaction state inherited from CacheTransaction.
+class BufferProcessAskChain : public JobAskInterface<DTCJobOperation>,
+			      private TimerObject,
+			      public PurgeNodeProcessor,
+			      public CacheTransaction {
+	// base members
+    protected:
+	// cache chain control
+	ChainJoint<DTCJobOperation> main_chain;
+	// send command to remote dtc for migrate.
+	ChainJoint<DTCJobOperation> remote_chain;
+	// hblog job output
+	ChainJoint<DTCJobOperation> hotbackup_chain;
+
+	BufferProcessAnswerChain cache_reply_;
+
+	// table info
+	DTCTableDefinition *table_define_infomation_;
+	// cache memory management
+	BufferPond cache_;
+	DataProcess *data_process_;
+	BlockProperties cache_info_;
+
+	DTC_MODE dtc_mode_;
+	// full cache
+	bool full_mode_;
+	bool lossy_mode_;
+	// treat empty key as default value, flat bitmap emulation
+	bool m_bReplaceEmpty;
+	// lru update level
+	int lru_update_level_;
+	// working mode
+	EUpdateMode async_server_;
+	EUpdateMode update_mode_;
+	EUpdateMode insert_mode_;
+	// indicate mem dirty when start with sync dtc
+	bool memory_dirty_;
+	// server side sorting
+	unsigned char insert_order_;
+
+	// cache protection
+	// node size limit
+	int node_size_limit_;
+	// node rows limit
+	int node_rows_limit_;
+	// empty nodes limit
+	int node_empty_limit_;
+
+	// generated error message
+	char error_message_[256];
+
+	int max_expire_count_;
+	int max_expire_time_;
+
+    protected:
+	// stat subsystem
+	StatCounter stat_get_count_;
+	StatCounter stat_get_hits_;
+	StatCounter stat_insert_count_;
+	StatCounter stat_insert_hits_;
+	StatCounter stat_update_count_;
+	StatCounter stat_update_hits_;
+	StatCounter stat_delete_count_;
+	StatCounter stat_delete_hits_;
+	StatCounter stat_purge_count_;
+
+	StatCounter stat_drop_count_;
+	StatCounter stat_drop_rows_;
+	StatCounter stat_flush_count_;
+	StatCounter stat_flush_rows_;
+	StatSample stat_incsync_step_;
+
+	StatCounter stat_maxflush_request_;
+	StatCounter stat_currentFlush_request_;
+	StatCounter stat_oldestdirty_time_;
+	StatCounter stat_asyncflush_count_;
+
+	StatCounter stat_expire_count_;
+	StatCounter stat_buffer_process_expire_count_;
+
+    protected:
+	// async flush members
+	FlushReplyNotify flush_reply_;
+	TimerList *flush_timer_;
+	// current pending node
+	volatile int current_pend_flush_request_;
+	// pending node limit
+	volatile int pend_flush_request_;
+	// max speed
+	volatile unsigned short max_flush_request_;
+	volatile unsigned short marker_interval_;
+	volatile int min_dirty_time_;
+	volatile int max_dirty_time_;
+	// async log writer
+	int async_log_;
+	// empty node filter.
+	EmptyNodeFilter *empty_node_filter_;
+	// Hot Backup
+	// record update key.
+	bool log_hotbackup_key_switch_;
+	// record lru change.
+	HBFeature *hotbackup_lru_feature_;
+	// BlackList
+	BlackListUnit *black_list_;
+	TimerList *blacklist_timer_;
+	// key expiry
+	ExpireTime *key_expire;
+	TimerList *key_expire_timer_;
+	// reply handler pushed onto jobs sent to the hot-backup chain
+	HotBackReplay hotback_reply_;
+
+    private:
+	// level 1 processing
+	// GET entrance
+	BufferResult buffer_get_data(DTCJobOperation &job);
+	// GET batch entrance
+	BufferResult buffer_batch_get_data(DTCJobOperation &job);
+	// GET response, DB --> cache
+	BufferResult buffer_replace_result(DTCJobOperation &job);
+	// GET response, DB --> client
+	BufferResult buffer_get_rb(DTCJobOperation &job);
+
+	// implementation some admin/purge/flush function
+	BufferResult buffer_process_admin(DTCJobOperation &job);
+	BufferResult buffer_purge_data(DTCJobOperation &job);
+	BufferResult buffer_flush_data(DTCJobOperation &job);
+	BufferResult buffer_flush_data_before_delete(DTCJobOperation &job);
+	int buffer_flush_data_timer(Node &node, unsigned int &affected_count);
+	BufferResult buffer_flush_data(Node &node, DTCJobOperation *pstTask,
+				       unsigned int &affected_count);
+
+	// sync mode operation, called by reply
+	BufferResult buffer_sync_insert_precheck(DTCJobOperation &job);
+	BufferResult buffer_sync_insert(DTCJobOperation &job);
+	BufferResult buffer_sync_update(DTCJobOperation &job);
+	BufferResult buffer_sync_replace(DTCJobOperation &job);
+	BufferResult buffer_sync_delete(DTCJobOperation &job);
+
+	// async mode operation, called by entrance
+	BufferResult buffer_async_insert(DTCJobOperation &job);
+	BufferResult buffer_async_update(DTCJobOperation &job);
+	BufferResult buffer_async_replace(DTCJobOperation &job);
+
+	// fullcache mode operation, called by entrance
+	BufferResult buffer_nodb_insert(DTCJobOperation &job);
+	BufferResult buffer_nodb_update(DTCJobOperation &job);
+	BufferResult buffer_nodb_replace(DTCJobOperation &job);
+	BufferResult buffer_nodb_delete(DTCJobOperation &job);
+
+	// level 2 operation
+	// level 2: INSERT with async compatible, create node & clear empty filter
+	BufferResult buffer_insert_row(DTCJobOperation &job, bool async,
+				       bool setrows);
+	// level 2: UPDATE with async compatible, accept empty node only if EmptyAsDefault
+	BufferResult buffer_update_rows(DTCJobOperation &job, bool async,
+					bool setrows);
+	// level 2: REPLACE with async compatible, don't allow empty node
+	BufferResult buffer_replace_rows(DTCJobOperation &job, bool async,
+					 bool setrows);
+	// level 2: DELETE has no async mode, don't allow empty node
+	BufferResult buffer_delete_rows(DTCJobOperation &job);
+
+	// very low level
+	// empty node insert default value to cache memory.
+	// auto clear empty filter
+	BufferResult insert_default_row(DTCJobOperation &job);
+	bool insert_empty_node(void);
+
+	// hot back-up
+	BufferResult buffer_register_hb(DTCJobOperation &job);
+	BufferResult buffer_logout_hb(DTCJobOperation &job);
+	BufferResult buffer_get_key_list(DTCJobOperation &job);
+	BufferResult buffer_get_update_key(DTCJobOperation &job);
+	BufferResult buffer_get_raw_data(DTCJobOperation &job);
+	BufferResult buffer_replace_raw_data(DTCJobOperation &job);
+	BufferResult buffer_adjust_lru(DTCJobOperation &job);
+	BufferResult buffer_verify_hbt(DTCJobOperation &job);
+	BufferResult buffer_get_hbt(DTCJobOperation &job);
+
+	//memory tidy
+	BufferResult buffer_nodehandlechange(DTCJobOperation &job);
+
+	// column expand related
+	BufferResult buffer_check_expand_status(DTCJobOperation &job);
+	BufferResult buffer_column_expand(DTCJobOperation &job);
+	BufferResult buffer_column_expand_done(DTCJobOperation &job);
+	BufferResult buffer_column_expand_key(DTCJobOperation &job);
+
+	// migrate
+	BufferResult buffer_migrate(DTCJobOperation &job);
+
+	// clear cache(only support nodb mode)
+	BufferResult buffer_clear_cache(DTCJobOperation &job);
+
+	/* we can still purge clean node if hit ratio is ok */
+	BufferResult cache_purgeforhit(DTCJobOperation &job);
+
+	//rows limit
+	BufferResult check_allowed_insert(DTCJobOperation &job);
+
+	BufferResult buffer_query_serverinfo(DTCJobOperation &job);
+
+	// master-slave copy
+	BufferResult buffer_process_replicate(DTCJobOperation &job);
+
+	// hot back-up log
+	int write_hotbackup_log(const char *key, char *pstChunk,
+				unsigned int uiNodeSize, int iType);
+	int write_hotbackup_log(const char *key, Node &node, int iType);
+	int write_hotbackup_log(DTCJobOperation &job, Node &node, int iType);
+	int write_lru_hotbackup_log(const char *key);
+
+    public:
+	virtual void purge_node_processor(const char *key, Node node);
+
+	//inc flush job stat(created by flush dirty node function)
+	void inc_async_flush_stat()
+	{
+		stat_asyncflush_count_++;
+	}
+
+    private:
+	virtual void job_ask_procedure(DTCJobOperation *);
+	void job_answer_procedure(DTCJobOperation *);
+	// flush internal
+	virtual void job_timer_procedure(void);
+
+	int flush_next_node(void);
+	void delete_tail_time_markers();
+	void get_dirty_stat();
+	void calculate_flush_speed(int is_flush_timer);
+	MARKER_STAMP calculate_current_marker();
+
+	// not copyable: copy constructor and assignment are declared private
+	BufferProcessAskChain(const BufferProcessAskChain &robj);
+	BufferProcessAskChain &operator=(const BufferProcessAskChain &robj);
+
+    public:
+	BufferProcessAskChain(PollerBase *, DTCTableDefinition *,
+			      EUpdateMode async);
+	~BufferProcessAskChain(void);
+
+	const DTCTableDefinition *table_definition(void) const
+	{
+		return table_define_infomation_;
+	}
+	// last generated error message, or "unknown error" when none is set
+	const char *last_error_message(void) const
+	{
+		return error_message_[0] ? error_message_ : "unknown error";
+	}
+
+	void set_limit_node_size(int node_size)
+	{
+		node_size_limit_ = node_size;
+	}
+
+	/* 0 =  no limit */
+	void set_limit_node_rows(int rows)
+	{
+		node_rows_limit_ = rows < 0 ? 0 : rows;
+		return;
+	}
+
+	/*
+		 * 0 = no limit,
+		 * 1-999: invalid, use 1000 instead
+		 * 1000-1G: max empty node count
+		 * >1G: invalid, no limit
+		 */
+	void set_limit_empty_nodes(int nodes)
+	{
+		node_empty_limit_ = nodes <= 0 ? 0 :
+						 nodes < 1000 ?
+						 1000 :
+						 nodes > (1 << 30) ? 0 : nodes;
+		return;
+	}
+
+	void disable_auto_purge(void)
+	{
+		cache_.disable_try_purge();
+	}
+
+	void set_date_expire_alert_time(int time)
+	{
+		cache_.set_date_expire_alert_time(time);
+	}
+
+	int set_buffer_size_and_version(unsigned long cache_size,
+					unsigned int cache_version);
+	int open_init_buffer(int key_name, int enable_empty_filter,
+			     int enable_auto_clean_dirty_buffer);
+
+	int update_mode(void) const
+	{
+		return update_mode_;
+	}
+	int enable_no_db_mode(void);
+	// any non-zero value enables lossy mode
+	void enable_lossy_data_source(int v)
+	{
+		lossy_mode_ = v == 0 ? false : true;
+	}
+	int disable_lru_update(int);
+	int disable_async_log(int);
+
+	//DTC MODE: database in addition.
+	BufferResult deal_single_database_addition_ask(DTCJobOperation &job);
+	BufferResult deal_batch_database_addition_ask(DTCJobOperation &job);
+	BufferResult reply_connector_answer(DTCJobOperation &job);
+
+	//DTC MODE: cache only.
+	BufferResult deal_single_cache_only_ask(DTCJobOperation &job);
+
+	//Flush
+	BufferResult reply_flush_answer(DTCJobOperation &job);
+	BufferResult deal_flush_exeption(DTCJobOperation &job);
+
+	void print_row(const RowValue *r);
+	int set_insert_order(int o);
+	void set_replace_empty(bool v)
+	{
+		m_bReplaceEmpty = v;
+	}
+
+	// stage relate
+	void register_next_chain(JobAskInterface<DTCJobOperation> *p)
+	{
+		main_chain.register_next_chain(p);
+	}
+	void bind_dispatcher_remote(JobAskInterface<DTCJobOperation> *p)
+	{
+		remote_chain.register_next_chain(p);
+	}
+	void bind_hb_log_dispatcher(JobAskInterface<DTCJobOperation> *p)
+	{
+		hotbackup_chain.register_next_chain(p);
+	}
+
+	ChainJoint<DTCJobOperation> *get_main_chain()
+	{
+		return &main_chain;
+	}
+	ChainJoint<DTCJobOperation> *get_remote_chain()
+	{
+		return &remote_chain;
+	}
+	ChainJoint<DTCJobOperation> *get_hotbackup_chain()
+	{
+		return &hotbackup_chain;
+	}
+
+	// flush api
+	void set_flush_parameter(int, int, int, int);
+	void set_drop_count(int); // to be remove
+	int commit_flush_request(DTCFlushRequest *, DTCJobOperation *);
+	void complete_flush_request(DTCFlushRequest *);
+	// register flush_reply_ as reply handler, then notify the main chain
+	void push_flush_queue(DTCJobOperation *p)
+	{
+		p->push_reply_dispatcher(&flush_reply_);
+		main_chain.indirect_notify(p);
+	}
+	inline bool is_mem_dirty()
+	{
+		return memory_dirty_;
+	}
+	int oldest_dirty_node_alarm();
+
+	// expire
+	BufferResult check_and_expire(DTCJobOperation &job);
+
+	friend class TaskPendingList;
+	friend class BufferProcessAnswerChain;
+
+    public:
+	// transaction implementation
+	inline void transaction_begin(DTCJobOperation *job)
+	{
+		CacheTransaction::do_init(job);
+	}
+	void transaction_end(void);
+	inline int transaction_find_node(DTCJobOperation &job);
+	inline void transaction_update_lru(bool async, int type);
+	// register hotback_reply_ as reply handler, then pass the job to the
+	// hot-backup chain
+	void dispatch_hot_back_task(DTCJobOperation *job)
+	{
+		job->push_reply_dispatcher(&hotback_reply_);
+		hotbackup_chain.job_ask_procedure(job);
+	}
+};
+
+DTC_END_NAMESPACE
+
+#endif

+ 76 - 0
src/core/chain/job_procedure.cc

@@ -0,0 +1,76 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include "log/log.h"
+#include "buffer_process_ask_chain.h"
+#include <daemon/daemon.h>
+#include "buffer_remoteLog.h"
+
+/*
+ * Handle the answer of a job that went through the hot-backup chain.
+ *
+ * Failures are logged (timeouts and sync-stage errors at debug level,
+ * everything else as errors). Write-type jobs are deleted here; all other
+ * jobs are turned around so the answer goes back to the client.
+ */
+void HotBackReplay::job_answer_procedure(DTCJobOperation *job_operation)
+{
+	log4cplus_debug("job_answer_procedure, request type %d",
+			job_operation->request_type());
+
+	const int result = job_operation->result_code();
+	if (result != 0) {
+		const bool expected_failure = (result == -ETIMEDOUT) ||
+					      (result == -EC_INC_SYNC_STAGE) ||
+					      (result == -EC_FULL_SYNC_STAGE);
+		if (expected_failure) {
+			log4cplus_debug(
+				"hotback job , normal fail: from %s msg %s, request type %d",
+				job_operation->resultInfo.error_from(),
+				job_operation->resultInfo.error_message(),
+				job_operation->request_type());
+		} else {
+			log4cplus_error(
+				"hotback job fail: from %s msg %s, request type %d",
+				job_operation->resultInfo.error_from(),
+				job_operation->resultInfo.error_message(),
+				job_operation->request_type());
+		}
+	}
+
+	const bool is_write_job =
+		(job_operation->request_type() == TaskTypeWriteHbLog) ||
+		(job_operation->request_type() == TaskTypeWriteLruHbLog);
+	if (!is_write_job) {
+		log4cplus_debug("read hotback job ,reply to client");
+		job_operation->turn_around_job_answer();
+		return;
+	}
+
+	/* write jobs have no client waiting: only delete the job */
+	log4cplus_debug("write hotback job reply ,just delete job");
+	delete job_operation;
+}
+
+/*
+ * Handle the answer of a flush job on behalf of the owning chain.
+ *
+ * A negative result code goes to deal_flush_exeption(); a positive one is
+ * only logged. If the result code is still non-negative afterwards,
+ * reply_flush_answer() is run and a failure there is recorded on the job.
+ * The job is then turned around and the transaction is closed.
+ */
+void FlushReplyNotify::job_answer_procedure(DTCJobOperation *job_operation)
+{
+	BufferProcessAskChain *const owner = flush_reply_notify_owner_;
+	owner->transaction_begin(job_operation);
+
+	const int initial_code = job_operation->result_code();
+	if (initial_code < 0) {
+		owner->deal_flush_exeption(*job_operation);
+	} else if (initial_code > 0) {
+		log4cplus_info("result_code() > 0: from %s msg %s",
+			       job_operation->resultInfo.error_from(),
+			       job_operation->resultInfo.error_message());
+	}
+
+	/* the result code is read again: the handler above may have changed it */
+	if (job_operation->result_code() >= 0) {
+		const bool answered =
+			owner->reply_flush_answer(*job_operation) ==
+			DTC_CODE_BUFFER_SUCCESS;
+		if (!answered && job_operation->result_code() >= 0) {
+			job_operation->set_error(-EC_SERVER_ERROR,
+						 "reply_flush_answer",
+						 owner->last_error_message());
+		}
+	}
+
+	job_operation->turn_around_job_answer();
+	owner->transaction_end();
+}

+ 138 - 0
src/core/chain/system_command_ask_chain.cc

@@ -0,0 +1,138 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <system_command_ask_chain.h>
+#include <log/log.h>
+#include "protocol.h"
+
+/* Shared instance pointer; NULL until get_instance(PollerBase *) creates the object. */
+SystemCommandAskChain *SystemCommandAskChain::system_command_instance = NULL;
+
+/*
+ * Builds the chain node on poller `o`. The server starts in read/write mode
+ * (read_only_ = 0) and the SERVER_READONLY stat counter is set to match.
+ */
+SystemCommandAskChain::SystemCommandAskChain(PollerBase *o)
+	: JobAskInterface<DTCJobOperation>(o), main_chain(o)
+{
+	atomic8_set(&read_only_, 0);
+	stat_read_only = g_stat_mgr.get_stat_int_counter(SERVER_READONLY);
+	/* publish the initial flag value as 0/1 */
+	stat_read_only.set((0 == atomic8_read(&read_only_)) ? 0 : 1);
+}
+
+/* Destructor: no explicit cleanup is performed here. */
+SystemCommandAskChain::~SystemCommandAskChain(void)
+{
+}
+
+/*
+ * Returns the shared instance, constructing it on poller `o` the first
+ * time it is requested.
+ */
+SystemCommandAskChain *SystemCommandAskChain::get_instance(PollerBase *o)
+{
+	if (system_command_instance != NULL)
+		return system_command_instance;
+
+	NEW(SystemCommandAskChain(o), system_command_instance);
+	return system_command_instance;
+}
+
+/* Returns the shared instance without creating it; NULL if none exists yet. */
+SystemCommandAskChain *SystemCommandAskChain::get_instance()
+{
+	return system_command_instance;
+}
+
+/* True when the read-only flag is currently non-zero. */
+bool SystemCommandAskChain::is_read_only()
+{
+	const bool read_only_now = (atomic8_read(&read_only_) != 0);
+	return read_only_now;
+}
+/*
+ * Fills a zeroed DTCServerInfo with the 10-second interval values of the
+ * DTC_DATA_SIZE and DTC_CACHE_SIZE stats and attaches it to the job result.
+ */
+void SystemCommandAskChain::query_mem_info(DTCJobOperation *job_operation)
+{
+	struct DTCServerInfo s_info;
+	memset(&s_info, 0x00, sizeof(s_info));
+
+	s_info.version = 0x1;
+	s_info.datasize = g_stat_mgr.get_interval_10s_stat_value(DTC_DATA_SIZE);
+	s_info.memsize = g_stat_mgr.get_interval_10s_stat_value(DTC_CACHE_SIZE);
+	log4cplus_debug("Memory info is: memsize is %lu , datasize is %lu",
+			s_info.memsize, s_info.datasize);
+	job_operation->resultInfo.set_server_info(&s_info);
+}
+/*
+ * Executes an admin command carried by the job (set read-only, set
+ * read/write, query memory info). Unknown codes set -EC_REQUEST_ABORTED.
+ * The job is always turned around at the end.
+ */
+void SystemCommandAskChain::deal_server_admin(DTCJobOperation *job_operation)
+{
+	switch (job_operation->requestInfo.admin_code()) {
+	case DRequest::SystemCommand::SET_READONLY: {
+		/* flip the flag and mirror it in the stat counter */
+		atomic8_set(&read_only_, 1);
+		stat_read_only.set(1);
+		log4cplus_info("set server status to readonly.");
+		break;
+	}
+	case DRequest::SystemCommand::SET_READWRITE: {
+		atomic8_set(&read_only_, 0);
+		stat_read_only.set(0);
+		log4cplus_info("set server status to read/write.");
+		break;
+	}
+	case DRequest::SystemCommand::QUERY_MEM_INFO: {
+		log4cplus_debug("query meminfo.");
+		query_mem_info(job_operation);
+		break;
+	}
+
+	default: {
+		/* any other admin code is rejected with an error on the job */
+		log4cplus_debug("unknow cmd: %d",
+				job_operation->requestInfo.admin_code());
+		job_operation->set_error(-EC_REQUEST_ABORTED, "RequestControl",
+					 "Unknown svrAdmin command.");
+		break;
+	}
+	}
+
+	job_operation->turn_around_job_answer();
+}
+
+/*
+ * Entry point of this chain node.
+ *  - System commands SET_READONLY / SET_READWRITE / QUERY_MEM_INFO are
+ *    handled locally by deal_server_admin().
+ *  - Every other system command is forwarded down the main chain.
+ *  - For ordinary requests, while the server is read-only, anything other
+ *    than DRequest::Get is rejected with -EC_SERVER_READONLY.
+ *  - Everything else is forwarded down the main chain.
+ */
+void SystemCommandAskChain::job_ask_procedure(DTCJobOperation *job_operation)
+{
+	log4cplus_debug("enter job_ask_procedure");
+	log4cplus_debug("Cmd is %d, AdminCmd is %u",
+			job_operation->request_code(),
+			job_operation->requestInfo.admin_code());
+	/* handle ServerAdmin commands */
+	if (DRequest::TYPE_SYSTEM_COMMAND == job_operation->request_code()) {
+		switch (job_operation->requestInfo.admin_code()) {
+		case DRequest::SystemCommand::SET_READONLY:
+		case DRequest::SystemCommand::SET_READWRITE:
+		case DRequest::SystemCommand::QUERY_MEM_INFO:
+			deal_server_admin(job_operation);
+			return;
+
+			/* allow all other admin codes to pass through */
+		default: {
+			log4cplus_debug(
+				"job_ask_procedure admincmd,  next process ");
+
+			main_chain.job_ask_procedure(job_operation);
+			return;
+		}
+		}
+	}
+
+	/* while the server is read-only, reject every non-query request */
+	if (0 != atomic8_read(&read_only_)) {
+		if (DRequest::Get != job_operation->request_code()) {
+			log4cplus_info(
+				"server is readonly, reject write operation");
+			job_operation->set_error(-EC_SERVER_READONLY,
+						 "RequestControl",
+						 "Server is readonly.");
+			job_operation->turn_around_job_answer();
+			return;
+		}
+	}
+
+	main_chain.job_ask_procedure(job_operation);
+	/* exit trace: the original logged "enter" here by mistake */
+	log4cplus_debug("leave job_ask_procedure");
+}

+ 59 - 0
src/core/chain/system_command_ask_chain.h

@@ -0,0 +1,59 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef __SYSTEM_COMMAND_ASK_CHAIN__
+#define __SYSTEM_COMMAND_ASK_CHAIN__
+
+#include <task/task_request.h>
+#include <stat_dtc.h>
+
+/*
+ * Chain node that intercepts server-admin commands and enforces the
+ * read-only switch before passing jobs on to the main chain. Access goes
+ * through a single shared instance obtained with get_instance().
+ */
+class SystemCommandAskChain : public JobAskInterface<DTCJobOperation> {
+    protected:
+	static SystemCommandAskChain *system_command_instance;
+	SystemCommandAskChain(PollerBase *o);
+
+    public:
+	// Returns the instance; constructs a new one first if none exists yet.
+	static SystemCommandAskChain *get_instance(PollerBase *o);
+	// Only returns the instance; yields NULL if it has not been constructed.
+	static SystemCommandAskChain *get_instance();
+	virtual ~SystemCommandAskChain(void);
+	// Registers the node that follows this one on the main chain.
+	void register_next_chain(JobAskInterface<DTCJobOperation> *p)
+	{
+		main_chain.register_next_chain(p);
+	}
+	ChainJoint<DTCJobOperation> *get_main_chain()
+	{
+		return &main_chain;
+	}
+	bool is_read_only();
+
+    private:
+	ChainJoint<DTCJobOperation> main_chain;
+	// Whether the server is in read-only state (non-zero = read-only).
+	atomic8_t read_only_;
+	// Stat counter that mirrors the read-only state.
+	StatCounter stat_read_only;
+
+    private:
+	virtual void job_ask_procedure(DTCJobOperation *);
+
+	// Handles server-admin commands.
+	void deal_server_admin(DTCJobOperation *job_operation);
+	void query_mem_info(DTCJobOperation *job_operation);
+};
+
+#endif

+ 68 - 0
src/core/hotbk/hb_feature.cc

@@ -0,0 +1,68 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <string.h>
+#include <stdio.h>
+#include <errno.h>
+#include "hb_feature.h"
+#include "global.h"
+
+DTC_USING_NAMESPACE
+
+/* Starts detached: no info pointer, invalid handle, empty error text. */
+HBFeature::HBFeature() : hb_info_(NULL), handle_(INVALID_HANDLE)
+{
+	memset(errmsg_, 0, sizeof(errmsg_));
+}
+
+/* Destructor: the memory behind handle_ is not released here. */
+HBFeature::~HBFeature()
+{
+}
+
+/*
+ * Allocates a zeroed HB_FEATURE_INFO_T through M_CALLOC and records the
+ * master up time; the slave up time starts at 0.
+ * Returns DTC_CODE_SUCCESS, or -ENOMEM (with errmsg_ filled) when the
+ * allocation yields INVALID_HANDLE.
+ */
+int HBFeature::init(time_t tMasterUptime)
+{
+	handle_ = M_CALLOC(sizeof(HB_FEATURE_INFO_T));
+	if (INVALID_HANDLE == handle_) {
+		snprintf(errmsg_, sizeof(errmsg_), "init hb_feature fail, %s",
+			 M_ERROR());
+		return -ENOMEM;
+	}
+
+	/* translate the handle into a usable pointer */
+	hb_info_ = M_POINTER(HB_FEATURE_INFO_T, handle_);
+	hb_info_->master_up_time = tMasterUptime;
+	hb_info_->slave_up_time = 0;
+
+	return DTC_CODE_SUCCESS;
+}
+
+/*
+ * Binds this object to an already existing info block identified by
+ * `handle`. Returns DTC_CODE_SUCCESS, or DTC_CODE_FAILED (with errmsg_
+ * filled) when the handle is INVALID_HANDLE.
+ */
+int HBFeature::attach(MEM_HANDLE_T handle)
+{
+	if (handle != INVALID_HANDLE) {
+		handle_ = handle;
+		hb_info_ = M_POINTER(HB_FEATURE_INFO_T, handle_);
+		return DTC_CODE_SUCCESS;
+	}
+
+	snprintf(errmsg_, sizeof(errmsg_),
+		 "attach hb feature failed, memory handle = 0");
+	return DTC_CODE_FAILED;
+}
+
+/* Forgets the bound info block; nothing is freed. */
+void HBFeature::detach(void)
+{
+	hb_info_ = NULL;
+	handle_ = INVALID_HANDLE;
+}

+ 77 - 0
src/core/hotbk/hb_feature.h

@@ -0,0 +1,77 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef __DTC_HB_FEATURE_H
+#define __DTC_HB_FEATURE_H
+
+#include <stdio.h>
+#include <stdint.h>
+#include <time.h>
+
+#include "namespace.h"
+#include "algorithm/singleton.h"
+#include "global.h"
+
+/* Hot-backup info block: up-time stamps of master and slave. */
+struct hb_feature_info {
+	int64_t master_up_time;
+	int64_t slave_up_time;
+};
+typedef struct hb_feature_info HB_FEATURE_INFO_T;
+
+/*
+ * Accessor for the hot-backup info block (HB_FEATURE_INFO_T) that lives
+ * behind a memory handle. Use init() to allocate a new block or attach()
+ * to bind to an existing one before reading the up-time fields.
+ */
+class HBFeature {
+    public:
+	HBFeature();
+	~HBFeature();
+
+	/* shared instance managed by Singleton<HBFeature> */
+	static HBFeature *instance()
+	{
+		return Singleton<HBFeature>::instance();
+	}
+	static void destory()
+	{
+		Singleton<HBFeature>::destory();
+	}
+
+	int init(time_t tMasterUptime);
+	int attach(MEM_HANDLE_T handle);
+	void detach(void);
+
+	/* text of the last error recorded by init()/attach() */
+	const char *error() const
+	{
+		return errmsg_;
+	}
+
+	MEM_HANDLE_T get_handle() const
+	{
+		return handle_;
+	}
+
+	/*
+	 * Mutable references into the info block. hb_info_ is dereferenced
+	 * without a check, so these must not be called before init()/attach()
+	 * or after detach().
+	 */
+	int64_t &master_uptime()
+	{
+		return hb_info_->master_up_time;
+	}
+	int64_t &slave_uptime()
+	{
+		return hb_info_->slave_up_time;
+	}
+
+    private:
+	HB_FEATURE_INFO_T *hb_info_;
+	MEM_HANDLE_T handle_;
+	char errmsg_[256];
+};
+
+#endif

+ 214 - 0
src/core/hotbk/hb_log.cc

@@ -0,0 +1,214 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include "hb_log.h"
+#include "global.h"
+#include "table/hotbackup_table_def.h"
+
+/* Stores the table definition used for row encoding; reader/writer are created in init(). */
+HBLog::HBLog(DTCTableDefinition *tbl)
+	: tabledef_(tbl), log_writer_(0), log_reader_(0)
+{
+}
+
+/* Releases the binlog writer and reader through the DELETE macro. */
+HBLog::~HBLog()
+{
+	DELETE(log_writer_);
+	DELETE(log_reader_);
+}
+
+/*
+ * Creates the binlog writer and reader and initialises both on the same
+ * path/prefix. Returns DTC_CODE_SUCCESS on success, DTC_CODE_FAILED when
+ * the writer fails and -2 when the reader fails.
+ * NOTE(review): calling init() twice would overwrite the previous
+ * writer/reader pointers without releasing them - confirm it is called once.
+ */
+int HBLog::init(const char *path, const char *prefix, uint64_t total,
+		off_t max_size)
+{
+	log_writer_ = new BinlogWriter;
+	log_reader_ = new BinlogReader;
+
+	if (log_writer_->init(path, prefix, total, max_size)) {
+		log4cplus_error("init log_writer failed");
+		return DTC_CODE_FAILED;
+	}
+
+	if (log_reader_->init(path, prefix)) {
+		log4cplus_error("init log_reader failed");
+		return -2;
+	}
+
+	return DTC_CODE_SUCCESS;
+}
+
+/*
+ * Encodes the job's hot-backup task (type, flag, packed key, value) as one
+ * row and appends it to the binlog.
+ * Returns the writer's Commit() result, or DTC_CODE_FAILED when the
+ * temporary RawData cannot be created/initialised or the packed key is
+ * empty. The temporary RawData is released on every exit path.
+ */
+int HBLog::write_update_log(DTCJobOperation &job)
+{
+	RawData *raw_data;
+	NEW(RawData(&g_stSysMalloc, 1), raw_data);
+
+	if (!raw_data) {
+		log4cplus_error("raw_data is null");
+		return DTC_CODE_FAILED;
+	}
+
+	HotBackTask &hotbacktask = job.get_hot_back_task();
+	int type = hotbacktask.get_type();
+	/* the task type doubles as the key of the temporary raw data */
+	if (raw_data->init(0, tabledef_->key_size(), (const char *)&type, 0, -1,
+			   -1, 0)) {
+		DELETE(raw_data);
+		return DTC_CODE_FAILED;
+	}
+	DTCValue key;
+	DTCValue value;
+	if (0 == hotbacktask.get_packed_key_len()) {
+		log4cplus_error("packedkey len is  zero");
+		/* fix: this early return used to leak raw_data */
+		DELETE(raw_data);
+		return DTC_CODE_FAILED;
+	}
+	key.Set(hotbacktask.get_packed_key(), hotbacktask.get_packed_key_len());
+
+	/* an absent value is stored as 0 */
+	if (0 == hotbacktask.get_value_len()) {
+		value.Set(0);
+	} else {
+		value.Set(hotbacktask.get_value(), hotbacktask.get_value_len());
+	}
+
+	RowValue row(tabledef_);
+	row[0].u64 = type;
+	row[1].u64 = hotbacktask.get_flag();
+	row[2] = key;
+	row[3] = value;
+	log4cplus_debug(" tye is %d, flag %d", type, hotbacktask.get_flag());
+	raw_data->insert_row(row, false, false);
+	log_writer_->insert_header(type, 0, 1);
+	log_writer_->append_body(raw_data->get_addr(), raw_data->data_size());
+	DELETE(raw_data);
+
+	/* key still refers to the task's packed key, not to raw_data */
+	log4cplus_debug(" packed key len:%d,key len:%d, key :%s", key.bin.len,
+			*(unsigned char *)key.bin.ptr, key.bin.ptr + 1);
+	return log_writer_->Commit();
+}
+
+/*
+ * Encodes the job's hot-backup task (type, flag, packed key, value 0) as
+ * one row and appends it to the binlog under the BINLOG_LRU header.
+ * Returns the writer's Commit() result, or DTC_CODE_FAILED when the
+ * temporary RawData cannot be created/initialised or the packed key is
+ * empty. The temporary RawData is released on every exit path.
+ */
+int HBLog::write_lru_hb_log(DTCJobOperation &job)
+{
+	RawData *raw_data;
+	NEW(RawData(&g_stSysMalloc, 1), raw_data);
+
+	if (!raw_data) {
+		log4cplus_error("raw_data is null");
+		return DTC_CODE_FAILED;
+	}
+
+	HotBackTask &hotbacktask = job.get_hot_back_task();
+	int type = hotbacktask.get_type();
+	/* the task type doubles as the key of the temporary raw data */
+	if (raw_data->init(0, tabledef_->key_size(), (const char *)&type, 0, -1,
+			   -1, 0)) {
+		DELETE(raw_data);
+		return DTC_CODE_FAILED;
+	}
+	DTCValue key;
+	if (0 == hotbacktask.get_packed_key_len()) {
+		log4cplus_error("packedkey len is  zero");
+		/* fix: this early return used to leak raw_data */
+		DELETE(raw_data);
+		return DTC_CODE_FAILED;
+	}
+	key.Set(hotbacktask.get_packed_key(), hotbacktask.get_packed_key_len());
+
+	RowValue row(tabledef_);
+	row[0].u64 = type;
+	row[1].u64 = hotbacktask.get_flag();
+	row[2] = key;
+	/* LRU records carry no value */
+	row[3].Set(0);
+	log4cplus_debug(" type is %d, flag %d", type, hotbacktask.get_flag());
+	raw_data->insert_row(row, false, false);
+	log_writer_->insert_header(BINLOG_LRU, 0, 1);
+	log_writer_->append_body(raw_data->get_addr(), raw_data->data_size());
+	DELETE(raw_data);
+
+	/* key still refers to the task's packed key, not to raw_data */
+	log4cplus_debug(
+		" write lru hotback log, packed key len:%d,key len:%d, key :%s",
+		key.bin.len, *(unsigned char *)key.bin.ptr, key.bin.ptr + 1);
+	return log_writer_->Commit();
+}
+
+/* Positions the binlog reader at journal id `v`; returns the reader's result. */
+int HBLog::Seek(const JournalID &v)
+{
+	return log_reader_->Seek(v);
+}
+
+/*
+ * Pulls updated keys in batch: reads up to `limit` binlog records from the
+ * reader, decodes the rows of each record and appends them to `job`.
+ * Returns the number of records read, or DTC_CODE_FAILED when a record
+ * cannot be allocated for, validated or attached.
+ */
+int HBLog::task_append_all_rows(DTCJobOperation &job, int limit)
+{
+	int count;
+	for (count = 0; count < limit; ++count) {
+		/* no pending log record (Read() returned non-zero) */
+		if (log_reader_->Read())
+			break;
+
+		RawData *raw_data;
+
+		NEW(RawData(&g_stSysMalloc, 0), raw_data);
+
+		if (!raw_data) {
+			log4cplus_error("allocate rawdata mem failed");
+			return DTC_CODE_FAILED;
+		}
+
+		/* validate the record size before attaching to it */
+		if (raw_data->check_size(g_stSysMalloc.get_handle(
+						 log_reader_->record_pointer()),
+					 0, tabledef_->key_size(),
+					 log_reader_->record_length(0)) < 0) {
+			log4cplus_error("raw data broken: wrong size");
+			DELETE(raw_data);
+			return DTC_CODE_FAILED;
+		}
+
+		/* attach raw data read from one binlog */
+		if (raw_data->do_attach(g_stSysMalloc.get_handle(
+						log_reader_->record_pointer()),
+					0, tabledef_->key_size())) {
+			log4cplus_error("attach rawdata mem failed");
+
+			DELETE(raw_data);
+			return DTC_CODE_FAILED;
+		}
+
+		RowValue r(tabledef_);
+		/* column 0 is taken from the raw data key */
+		r[0].u64 = *(unsigned *)raw_data->key();
+
+		unsigned char flag = 0;
+		/* decode until decode_row() reports non-zero */
+		while (raw_data->decode_row(r, flag) == 0) {
+			log4cplus_debug("type: " UINT64FMT ", flag: " UINT64FMT
+					", key:%s, value :%s",
+					r[0].u64, r[1].u64, r[2].bin.ptr,
+					r[3].bin.ptr);
+			log4cplus_debug("binlog-type: %d",
+					log_reader_->binlog_type());
+
+			job.append_row(&r);
+		}
+
+		DELETE(raw_data);
+	}
+
+	return count;
+}
+
+/* Journal id reported by the binlog reader's query_id(). */
+JournalID HBLog::get_reader_jid(void)
+{
+	return log_reader_->query_id();
+}
+
+/* Journal id reported by the binlog writer's query_id(). */
+JournalID HBLog::get_writer_jid(void)
+{
+	return log_writer_->query_id();
+}

+ 62 - 0
src/core/hotbk/hb_log.h

@@ -0,0 +1,62 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef __DTC_HB_LOG_H
+#define __DTC_HB_LOG_H
+
+#include "log/logger.h"
+#include "journal_id.h"
+#include "task/task_request.h"
+#include "field/field.h"
+#include "raw_data.h"
+#include "table/hotbackup_table_def.h"
+#include "sys_malloc.h"
+#include "table/table_def.h"
+
+class BinlogWriter;
+class BinlogReader;
+
+/*
+ * Hot-backup log: writes update/LRU records through a BinlogWriter and
+ * reads them back through a BinlogReader, using a table definition to
+ * encode and decode rows.
+ */
+class HBLog {
+    public:
+	// Takes the table definition used for encoding/decoding.
+	HBLog(DTCTableDefinition *tbl);
+	~HBLog();
+
+	int init(const char *path, const char *prefix, uint64_t total,
+		 off_t max_size);
+	int Seek(const JournalID &);
+
+	JournalID get_reader_jid(void);
+	JournalID get_writer_jid(void);
+
+	// Without value: only writes the updated key.
+	// NOTE(review): no definition of write_update_key is visible in hb_log.cc - confirm it exists.
+	int write_update_key(DTCValue key, int type);
+
+	// Encodes multiple log records into the TaskRequest.
+	int task_append_all_rows(DTCJobOperation &, int limit);
+
+	// Provided for LRUBitUnit to record LRU changes.
+	int write_lru_hb_log(DTCJobOperation &job);
+	int write_update_log(DTCJobOperation &job);
+	int write_update_key(DTCValue key, DTCValue v, int type);
+
+    private:
+	DTCTableDefinition *tabledef_;
+	BinlogWriter *log_writer_;
+	BinlogReader *log_reader_;
+};
+
+#endif

+ 191 - 0
src/core/hotbk/hot_backup_ask_chain.cc

@@ -0,0 +1,191 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include "hot_backup_ask_chain.h"
+#include "poll/poller_base.h"
+#include "task/task_request.h"
+#include "log/log.h"
+#include "hotback_task.h"
+
+extern DTCTableDefinition *g_table_def[];
+
+/*
+ * Builds the hot-backup chain node on poller `o`; the hot-backup log uses
+ * the table definition supplied by TableDefinitionManager.
+ */
+HotBackupAskChain::HotBackupAskChain(PollerBase *o)
+	: JobAskInterface<DTCJobOperation>(o), ownerThread_(o), main_chain(o),
+	  taskPendList_(this),
+	  hbLog_(TableDefinitionManager::instance()->get_hot_backup_table_def())
+{
+}
+
+/* Destructor: no explicit cleanup; members release themselves. */
+HotBackupAskChain::~HotBackupAskChain()
+{
+}
+/*
+ * Dispatches a hot-backup job by request type to the matching handler.
+ * Unknown types are answered with -EBADRQC. A handler result of
+ * HB_PROCESS_PENDING leaves the job unanswered; every other result turns
+ * the job around immediately.
+ */
+void HotBackupAskChain::job_ask_procedure(DTCJobOperation *job_operation)
+{
+	log4cplus_debug("enter job_ask_procedure");
+	log4cplus_debug("request type is %d ", job_operation->request_type());
+
+	THBResult outcome = HB_PROCESS_ERROR;
+	switch (job_operation->request_type()) {
+	case TaskTypeWriteHbLog:
+		outcome = write_hb_log_process(*job_operation);
+		break;
+	case TaskTypeReadHbLog:
+		outcome = read_hb_log_process(*job_operation);
+		break;
+	case TaskTypeWriteLruHbLog:
+		outcome = write_lru_hb_log_process(*job_operation);
+		break;
+	case TaskTypeRegisterHbLog:
+		outcome = register_hb_log_process(*job_operation);
+		break;
+	case TaskTypeQueryHbLogInfo:
+		outcome = query_hb_log_info_process(*job_operation);
+		break;
+	default: {
+		/* unsupported command: answer with an error right away */
+		job_operation->set_error(-EBADRQC, "hb process",
+					 "invalid hb cmd code");
+		log4cplus_info("invalid hb cmd code[%d]",
+			       job_operation->request_type());
+		job_operation->turn_around_job_answer();
+		return;
+	}
+	}
+
+	if (outcome == HB_PROCESS_PENDING) {
+		/* the job stays parked; it is answered later */
+		log4cplus_debug("hb job is pending ");
+		return;
+	}
+
+	log4cplus_debug("hb job reply");
+	job_operation->turn_around_job_answer();
+	log4cplus_debug("leave job_ask_procedure");
+}
+
+/*
+ * Initialises the hot-backup log under "../log/hblog" with prefix "hblog".
+ * Returns true on success, false (after logging) when the log init fails.
+ */
+bool HotBackupAskChain::do_init(uint64_t total, off_t max_size)
+{
+	log4cplus_debug("total: %lu, max_size: %ld", total, max_size);
+
+	const int init_rc = hbLog_.init("../log/hblog", "hblog", total, max_size);
+	if (init_rc == 0)
+		return true;
+
+	log4cplus_error("hotback process for hblog init failed");
+	return false;
+}
+
+/*
+ * Writes the job's update record to the hot-backup log. On success the
+ * pending list is woken up; on failure -EC_ERR_HOTBACK_WRITEUPDATE is set.
+ */
+THBResult HotBackupAskChain::write_hb_log_process(DTCJobOperation &job)
+{
+	if (hbLog_.write_update_log(job) == 0) {
+		taskPendList_.Wakeup();
+		return HB_PROCESS_OK;
+	}
+
+	job.set_error(-EC_ERR_HOTBACK_WRITEUPDATE, "HBProcess",
+		      "write_hb_log_process fail");
+	return HB_PROCESS_ERROR;
+}
+
+/*
+ * Writes the job's LRU record to the hot-backup log; sets
+ * -EC_ERR_HOTBACK_WRITELRU on the job when the write fails.
+ */
+THBResult HotBackupAskChain::write_lru_hb_log_process(DTCJobOperation &job)
+{
+	if (hbLog_.write_lru_hb_log(job) == 0)
+		return HB_PROCESS_OK;
+
+	job.set_error(-EC_ERR_HOTBACK_WRITELRU, "HBProcess",
+		      "write_lru_hb_log_process fail");
+	return HB_PROCESS_ERROR;
+}
+
+/*
+ * Serves a read request against the hot-backup log starting at the journal
+ * id carried by the job.
+ *  - If that id is not behind the writer's id, the job is parked on the
+ *    pending list and HB_PROCESS_PENDING is returned.
+ *  - Otherwise up to limit_count() records are appended to the job and its
+ *    hot-backup id is advanced to the reader's position.
+ */
+THBResult HotBackupAskChain::read_hb_log_process(DTCJobOperation &job)
+{
+	log4cplus_debug("read Hb log begin ");
+	JournalID hb_jid = job.versionInfo.hot_backup_id();
+	JournalID write_jid = hbLog_.get_writer_jid();
+
+	/* nothing new to read yet: wait until a write wakes the pending list */
+	if (hb_jid.GE(write_jid)) {
+		taskPendList_.add2_list(&job);
+		return HB_PROCESS_PENDING;
+	}
+
+	if (hbLog_.Seek(hb_jid)) {
+		job.set_error(-EC_BAD_HOTBACKUP_JID, "HBProcess",
+			      "read_hb_log_process jid overflow");
+		return HB_PROCESS_ERROR;
+	}
+
+	job.prepare_result_no_limit();
+
+	int count =
+		hbLog_.task_append_all_rows(job, job.requestInfo.limit_count());
+	/* a non-negative count is sampled as a statistic; negative means decode failure */
+	if (count >= 0) {
+		statIncSyncStep_.push(count);
+	} else {
+		job.set_error(-EC_ERROR_BASE, "HBProcess",
+			      "read_hb_log_process,decode binlog error");
+		return HB_PROCESS_ERROR;
+	}
+
+	job.versionInfo.set_hot_backup_id((uint64_t)hbLog_.get_reader_jid());
+	return HB_PROCESS_OK;
+}
+/*
+ * Handles a register request by telling the client which sync stage
+ * applies. The stage is always reported through the job's error code, so
+ * every path returns HB_PROCESS_ERROR:
+ *  - zero client journal id      -> -EC_FULL_SYNC_STAGE, id set to the master's
+ *  - client id seekable in log   -> -EC_INC_SYNC_STAGE, id left at the client's
+ *  - client id not seekable      -> -EC_ERR_SYNC_STAGE, id reset to 0
+ */
+THBResult HotBackupAskChain::register_hb_log_process(DTCJobOperation &job)
+{
+	JournalID client_jid = job.versionInfo.hot_backup_id();
+	JournalID master_jid = hbLog_.get_writer_jid();
+	log4cplus_info(
+		"hb register, client[serial=%u, offset=%u], master[serial=%u, offset=%u]",
+		client_jid.serial, client_jid.offset, master_jid.serial,
+		master_jid.offset);
+
+	/* full sync */
+	if (client_jid.Zero()) {
+		log4cplus_info("full-sync stage.");
+		job.versionInfo.set_hot_backup_id((uint64_t)master_jid);
+		job.set_error(-EC_FULL_SYNC_STAGE, "HBProcess",
+			      "Register,full-sync stage");
+		return HB_PROCESS_ERROR;
+	}
+
+	/* error: the client's position cannot be found in the log */
+	if (hbLog_.Seek(client_jid) != 0) {
+		log4cplus_info("err-sync stage.");
+		job.versionInfo.set_hot_backup_id((uint64_t)0);
+		job.set_error(-EC_ERR_SYNC_STAGE, "HBProcess",
+			      "register, err-sync stage");
+		return HB_PROCESS_ERROR;
+	}
+
+	/* incremental sync */
+	log4cplus_info("inc-sync stage.");
+	job.versionInfo.set_hot_backup_id((uint64_t)client_jid);
+	job.set_error(-EC_INC_SYNC_STAGE, "HBProcess",
+		      "register, inc-sync stage");
+	return HB_PROCESS_ERROR;
+}
+/*
+ * Reports the writer's current journal position (serial and offset) in a
+ * zeroed DTCServerInfo attached to the job result.
+ */
+THBResult HotBackupAskChain::query_hb_log_info_process(DTCJobOperation &job)
+{
+	struct DTCServerInfo s_info;
+	memset(&s_info, 0x00, sizeof(s_info));
+	s_info.version = 0x1;
+
+	JournalID jid = hbLog_.get_writer_jid();
+	s_info.binlog_id = jid.Serial();
+	s_info.binlog_off = jid.get_offset();
+	job.resultInfo.set_server_info(&s_info);
+	return HB_PROCESS_OK;
+}

+ 58 - 0
src/core/hotbk/hot_backup_ask_chain.h

@@ -0,0 +1,58 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef __HOT_BACKUP_ASK_CHAIN__
+#define __HOT_BACKUP_ASK_CHAIN__
+
+#include "request/request_base.h"
+#include "hb_log.h"
+#include "task_pendlist.h"
+#include "stat_manager.h"
+#include <map>
+
+class PollerBase;
+class DTCJobOperation;
+/* Outcome of a hot-backup handler; PENDING means the job is not answered yet. */
+enum THBResult {
+	HB_PROCESS_ERROR = -1,
+	HB_PROCESS_OK = 0,
+	HB_PROCESS_PENDING = 2,
+};
+
+/*
+ * Chain node that serves hot-backup requests (write/read/LRU-write/
+ * register/query) on top of an HBLog.
+ */
+class HotBackupAskChain : public JobAskInterface<DTCJobOperation> {
+    public:
+	HotBackupAskChain(PollerBase *o);
+	virtual ~HotBackupAskChain();
+
+	virtual void job_ask_procedure(DTCJobOperation *job_operation);
+	/* initialises the underlying hot-backup log; false on failure */
+	bool do_init(uint64_t total, off_t max_size);
+
+    private:
+	/*concrete hb operation*/
+	THBResult write_hb_log_process(DTCJobOperation &job);
+	THBResult read_hb_log_process(DTCJobOperation &job);
+	THBResult write_lru_hb_log_process(DTCJobOperation &job);
+	THBResult register_hb_log_process(DTCJobOperation &job);
+	THBResult query_hb_log_info_process(DTCJobOperation &job);
+
+    private:
+	PollerBase *ownerThread_;
+	ChainJoint<DTCJobOperation> main_chain;
+	/* read jobs parked until new log data is written */
+	TaskPendingList taskPendList_;
+	HBLog hbLog_;
+	/* receives the record count of each successful read */
+	StatSample statIncSyncStep_;
+};
+
+#endif

+ 163 - 0
src/core/mem/feature.cc

@@ -0,0 +1,163 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <stdio.h>
+#include <string.h>
+#include "algorithm/singleton.h"
+#include "feature.h"
+#include "global.h"
+
+DTC_USING_NAMESPACE
+
+/* Shared Feature object managed by Singleton<Feature>. */
+Feature *Feature::instance()
+{
+	return Singleton<Feature>::instance();
+}
+
+/* Tears down the shared Feature object via Singleton<Feature>. */
+void Feature::destroy()
+{
+	Singleton<Feature>::destory();
+}
+
+/* Starts detached (no base info) with an empty error message. */
+Feature::Feature() : _baseInfo(NULL)
+{
+	memset(errmsg_, 0, sizeof(errmsg_));
+}
+
+/* Destructor: the feature table memory is not released here. */
+Feature::~Feature()
+{
+}
+/* Looks up the slot whose id equals fi->fi_id and overwrites it with *fi.
+ * Returns 0 on success, -1 when fi is NULL, -2 when no such slot exists.
+ */
+int Feature::modify_feature(FEATURE_INFO_T *fi)
+{
+	if (fi == NULL)
+		return -1;
+
+	FEATURE_INFO_T *slot = get_feature_by_id(fi->fi_id);
+	if (slot != NULL) {
+		*slot = *fi;
+		return 0;
+	}
+
+	snprintf(errmsg_, sizeof(errmsg_), "not found feature[%d]",
+		 fi->fi_id);
+	return -2;
+}
+/* Looks up the slot whose id equals fi->fi_id and clears it, which makes
+ * it a free slot again (id 0). Returns 0 on success, -1 when fi is NULL,
+ * -2 when no such slot exists.
+ */
+int Feature::delete_feature(FEATURE_INFO_T *fi)
+{
+	if (fi == NULL)
+		return -1;
+
+	FEATURE_INFO_T *slot = get_feature_by_id(fi->fi_id);
+	if (slot == NULL) {
+		snprintf(errmsg_, sizeof(errmsg_), "not found feature[%d]",
+			 fi->fi_id);
+		return -2;
+	}
+
+	/* reset every field of the slot */
+	slot->fi_id = 0;
+	slot->fi_attr = 0;
+	slot->fi_handle = INVALID_HANDLE;
+	return 0;
+}
+/* Finds a free slot (id 0) and stores the given id, attribute and handle.
+ * Returns 0 on success, -1 when the handle is invalid, -2 when no free
+ * slot is left.
+ */
+int Feature::add_feature(const uint32_t id, const MEM_HANDLE_T v,
+			 const uint32_t attr)
+{
+	if (INVALID_HANDLE == v) {
+		snprintf(errmsg_, sizeof(errmsg_), "handle is invalid");
+		return -1;
+	}
+
+	//find freespace
+	FEATURE_INFO_T *p = get_feature_by_id(0);
+	if (!p) {
+		snprintf(errmsg_, sizeof(errmsg_),
+			 "have no free space to add a new feature");
+		return -2;
+	}
+
+	p->fi_id = id;
+	p->fi_attr = attr;
+	p->fi_handle = v;
+
+	return 0;
+}
+/* Maps a feature id to its slot.
+ * 1. fd == 0: returns the first free slot.
+ * 2. otherwise: returns the slot whose id equals fd.
+ * Returns NULL when not attached or when nothing matches.
+ */
+FEATURE_INFO_T *Feature::get_feature_by_id(const uint32_t fd)
+{
+	if (_baseInfo == NULL)
+		return NULL;
+
+	/* linear scan; an empty table simply skips the loop */
+	for (uint32_t idx = 0; idx < _baseInfo->bi_total; ++idx) {
+		FEATURE_INFO_T *slot = &_baseInfo->bi_features[idx];
+		if (slot->fi_id == fd)
+			return slot;
+	}
+
+	return NULL;
+}
+/* 1. Creates `num` empty feature slots.
+ * 2. Initialises the header (base info) with the slot count.
+ * Returns 0 on success, -1 (with errmsg_ filled) when allocation fails.
+ */
+int Feature::do_init(const uint32_t num)
+{
+	/* header plus num slots in one zero-filled allocation */
+	size_t size = sizeof(FEATURE_INFO_T);
+	size *= num;
+	size += sizeof(BASE_INFO_T);
+
+	/* NOTE(review): size is a size_t; confirm M_CALLOC accepts that width without truncation */
+	MEM_HANDLE_T v = M_CALLOC(size);
+	if (INVALID_HANDLE == v) {
+		snprintf(errmsg_, sizeof(errmsg_), "init features failed, %s",
+			 M_ERROR());
+		return -1;
+	}
+
+	_baseInfo = M_POINTER(BASE_INFO_T, v);
+	_baseInfo->bi_total = num;
+
+	return 0;
+}
+/* The feature table already exists: point the header at the memory behind
+ * `handle`. Returns 0 on success, -1 when the handle is invalid.
+ */
+int Feature::do_attach(MEM_HANDLE_T handle)
+{
+	if (handle != INVALID_HANDLE) {
+		_baseInfo = M_POINTER(BASE_INFO_T, handle);
+		return 0;
+	}
+
+	snprintf(errmsg_, sizeof(errmsg_),
+		 "attach features failed, memory handle=0");
+	return -1;
+}
+
+/* Drops the pointer to the feature table; nothing is freed. Always returns 0. */
+int Feature::do_detach(void)
+{
+	_baseInfo = NULL;
+	return 0;
+}

+ 88 - 0
src/core/mem/feature.h

@@ -0,0 +1,88 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef __DTC_FEATURE_H
+#define __DTC_FEATURE_H
+
+#include "namespace.h"
+#include "global.h"
+
+DTC_BEGIN_NAMESPACE
+
+// Feature types. Ids start at 10; id 0 is used by Feature to mark a free slot.
+enum feature_id {
+	NODE_GROUP = 10, //DTC begin feature id
+	NODE_INDEX,
+	HASH_BUCKET,
+	TABLE_INFO,
+	EMPTY_FILTER,
+	HOT_BACKUP,
+	COL_EXPAND,
+};
+typedef enum feature_id FEATURE_ID_T;
+
+// One slot of the feature table.
+struct feature_info {
+	uint32_t fi_id; // feature id
+	uint32_t fi_attr; // feature attribute
+	MEM_HANDLE_T fi_handle; // feature handler
+};
+typedef struct feature_info FEATURE_INFO_T;
+
+// Header of the feature table, followed in memory by bi_total slots.
+struct base_info {
+	uint32_t bi_total; // total features
+	FEATURE_INFO_T bi_features[0]; // trailing slot array (zero-length array member)
+};
+typedef struct base_info BASE_INFO_T;
+
+/*
+ * Accessor for the feature table (a BASE_INFO_T header plus slots) that
+ * lives behind a memory handle. Slots are matched by id; id 0 marks a
+ * free slot.
+ */
+class Feature {
+    public:
+	static Feature *instance();
+	static void destroy();
+
+	/* handle of the currently bound feature table */
+	MEM_HANDLE_T get_handle() const
+	{
+		return M_HANDLE(_baseInfo);
+	}
+	/* text of the last recorded error */
+	const char *error() const
+	{
+		return errmsg_;
+	}
+
+	int modify_feature(FEATURE_INFO_T *fi);
+	int delete_feature(FEATURE_INFO_T *fi);
+	int add_feature(const uint32_t id, const MEM_HANDLE_T v,
+			const uint32_t attr = 0);
+	FEATURE_INFO_T *get_feature_by_id(const uint32_t id);
+
+	// Creates the physical memory and formats it.
+	int do_init(const uint32_t num = MIN_FEATURES);
+	// Binds to existing physical memory.
+	int do_attach(MEM_HANDLE_T handle);
+	// Detaches from the physical memory.
+	int do_detach(void);
+
+    public:
+	Feature();
+	~Feature();
+
+    private:
+	BASE_INFO_T *_baseInfo;
+	char errmsg_[256];
+};
+
+DTC_END_NAMESPACE
+
+#endif

+ 51 - 0
src/core/mem/fence_queue.h

@@ -0,0 +1,51 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef __BARRIER_QUEUE_H__
+#define __BARRIER_QUEUE_H__
+
+#include <list/list.h>
+#include <queue/lqueue.h>
+
+class DTCJobOperation;
+class BarrierAskAnswerChain;
+class BarrierQueue;
+
+/*
+ * A queue of DTCJobOperation pointers tagged with a key. It is both a list
+ * node (ListObject) and a LinkQueue; BarrierAskAnswerChain is a friend.
+ */
+class BarrierQueue : public ListObject<BarrierQueue>,
+		     public LinkQueue<DTCJobOperation *> {
+    public:
+	friend class BarrierAskAnswerChain;
+
+	/* `a` is handed to the LinkQueue base; the key starts at 0 */
+	BarrierQueue(LinkQueue<DTCJobOperation *>::allocator *a = NULL)
+		: LinkQueue<DTCJobOperation *>(a), key_(0)
+	{
+	}
+	~BarrierQueue()
+	{
+	}
+
+	/* key this queue is tagged with */
+	unsigned long key() const
+	{
+		return key_;
+	}
+	void set_key(unsigned long k)
+	{
+		key_ = k;
+	}
+
+    private:
+	unsigned long key_;
+};
+
+#endif

+ 127 - 0
src/core/mem/mallocator.h

@@ -0,0 +1,127 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef MALLOCATOR_H
+#define MALLOCATOR_H
+
+#include <stdint.h>
+#include <stdlib.h>
+#include "namespace.h"
+
+DTC_BEGIN_NAMESPACE
+
+#define ALLOC_SIZE_T uint32_t
+#define ALLOC_HANDLE_T uint64_t
+#define INTER_SIZE_T uint64_t
+#define INTER_HANDLE_T uint64_t
+
+#define INVALID_HANDLE 0ULL
+
+#define SIZE_SZ (sizeof(ALLOC_SIZE_T))
+#define MALLOC_ALIGNMENT (2 * SIZE_SZ)
+#define MALLOC_ALIGN_MASK (MALLOC_ALIGNMENT - 1)
+#define MAX_ALLOC_SIZE (((ALLOC_SIZE_T)-1) & ~MALLOC_ALIGN_MASK)
+
+/*
+ * Abstract allocator interface. Blocks are named by ALLOC_HANDLE_T handles;
+ * INVALID_HANDLE is the "no block" / failure value.
+ */
+class MallocBase {
+    public:
+	MallocBase() {}
+	virtual ~MallocBase() {}
+
+	/* Typed convenience wrapper around handle_to_ptr(). */
+	template <class T>
+	T *Pointer(ALLOC_HANDLE_T hHandle)
+	{
+		return static_cast<T *>(handle_to_ptr(hHandle));
+	}
+
+	/*
+	 * Handle associated with address p.
+	 * NOTE(review): how this differs from ptr_to_handle() is not visible
+	 * in this header -- confirm against the implementations.
+	 */
+	virtual ALLOC_HANDLE_T get_handle(void *p) = 0;
+
+	/* Text describing the last error reported by the allocator. */
+	virtual const char *get_err_msg() = 0;
+
+	/*
+	 * Allocate memory.
+	 *   tSize   number of bytes to allocate
+	 * Returns the handle of the block, INVALID_HANDLE on failure.
+	 */
+	virtual ALLOC_HANDLE_T Malloc(ALLOC_SIZE_T tSize) = 0;
+
+	/*
+	 * Allocate memory and initialise it to 0.
+	 *   tSize   number of bytes to allocate
+	 * Returns the handle of the block, INVALID_HANDLE on failure.
+	 */
+	virtual ALLOC_HANDLE_T Calloc(ALLOC_SIZE_T tSize) = 0;
+
+	/*
+	 * Re-allocate memory.
+	 *   hHandle handle of the old block
+	 *   tSize   new size in bytes
+	 * Returns the handle of the block, INVALID_HANDLE on failure
+	 * (the old block is not released when the call fails).
+	 */
+	virtual ALLOC_HANDLE_T ReAlloc(ALLOC_HANDLE_T hHandle,
+				       ALLOC_SIZE_T tSize) = 0;
+
+	/*
+	 * Release memory.
+	 *   hHandle handle of the block
+	 * Returns 0 on success, non-zero on failure.
+	 */
+	virtual int Free(ALLOC_HANDLE_T hHandle) = 0;
+
+	/*
+	 * Query the size of a block.
+	 *   hHandle handle of the block
+	 * Returns the size of the memory.
+	 */
+	virtual ALLOC_SIZE_T chunk_size(ALLOC_HANDLE_T hHandle) = 0;
+
+	/*
+	 * Convert a handle into a memory address.
+	 * Returns the address, or NULL when the handle is invalid.
+	 */
+	virtual void *handle_to_ptr(ALLOC_HANDLE_T hHandle) = 0;
+
+	/*
+	 * Convert a memory address into a handle.
+	 * Returns the handle, or INVALID_HANDLE when the address is invalid.
+	 */
+	virtual ALLOC_HANDLE_T ptr_to_handle(void *p) = 0;
+
+	/*
+	 * NOTE(review): undocumented in the original; by its name, the size
+	 * involved in destroying the given block -- confirm.
+	 */
+	virtual ALLOC_SIZE_T ask_for_destroy_size(ALLOC_HANDLE_T hHandl) = 0;
+
+	/*
+	 * Check whether a handle is valid.
+	 * Returns 0 when valid, -1 when invalid.
+	 */
+	virtual int handle_is_valid(ALLOC_HANDLE_T mem_handle) = 0;
+};
+
+DTC_END_NAMESPACE
+
+#endif

+ 1559 - 0
src/core/mem/pt_malloc.cc

@@ -0,0 +1,1559 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "log/log.h"
+#include "pt_malloc.h"
+#include "algorithm/singleton.h"
+
+DTC_USING_NAMESPACE
+
+/*
+ * Conversion from malloc headers to user pointers, and back.
+ * The user area starts 2 * sizeof(ALLOC_SIZE_T) bytes after the chunk start;
+ * the same offset applies to handles. Every macro argument is parenthesised
+ * so that compound expressions (e.g. "base + off") expand correctly.
+ */
+#define chunk2mem(h) ((void *)(((char *)(h)) + 2 * sizeof(ALLOC_SIZE_T)))
+#define mem2chunk(h) ((void *)(((char *)(h)) - 2 * sizeof(ALLOC_SIZE_T)))
+#define chunkhandle2memhandle(handle) ((handle) + 2 * sizeof(ALLOC_SIZE_T))
+#define memhandle2chunkhandle(handle) ((handle) - 2 * sizeof(ALLOC_SIZE_T))
+/*
+ * Chunk size -> user-visible size. With BIN_MEM_CHECK two size words are
+ * subtracted, otherwise one. The "checked" variant yields 0 instead of
+ * wrapping around when the chunk size is not larger than that overhead.
+ */
+#if BIN_MEM_CHECK
+#define chunksize2memsize(size) ((size) - 2 * sizeof(ALLOC_SIZE_T))
+#define checked_chunksize2memsize(size)                                        \
+	((size) > 2 * sizeof(ALLOC_SIZE_T) ?                                   \
+		 ((size) - 2 * sizeof(ALLOC_SIZE_T)) :                         \
+		 0)
+#else
+#define chunksize2memsize(size) ((size) - sizeof(ALLOC_SIZE_T))
+#define checked_chunksize2memsize(size)                                        \
+	((size) > sizeof(ALLOC_SIZE_T) ? ((size) - sizeof(ALLOC_SIZE_T)) : 0)
+#endif
+
+/* Check if m has acceptable alignment */
+
+#define aligned_OK(m) (((unsigned long)(m)&MALLOC_ALIGN_MASK) == 0)
+
+#define misaligned_chunk(h)                                                    \
+	((MALLOC_ALIGNMENT == 2 * SIZE_SZ ? (h) : chunkhandle2memhandle(h)) &  \
+	 MALLOC_ALIGN_MASK)
+
+/*
+   Check if a request is so large that it would wrap around zero when
+   padded and aligned. To simplify some other code, the bound is made
+   low enough so that adding MINSIZE will also not wrap around zero.
+*/
+
+#define REQUEST_OUT_OF_RANGE(req)                                              \
+	((unsigned long)(req) >= (unsigned long)(ALLOC_SIZE_T)(-2 * MINSIZE))
+
+/* pad request bytes into a usable size -- internal version */
+#if BIN_MEM_CHECK
+#define request2size(req)                                                      \
+	(((req) + 2 * SIZE_SZ + MALLOC_ALIGN_MASK < MINSIZE) ?                 \
+		 MINSIZE :                                                     \
+		 ((req) + 2 * SIZE_SZ + MALLOC_ALIGN_MASK) &                   \
+			 ~MALLOC_ALIGN_MASK)
+#else
+#define request2size(req)                                                      \
+	(((req) + SIZE_SZ + MALLOC_ALIGN_MASK < MINSIZE) ?                     \
+		 MINSIZE :                                                     \
+		 ((req) + SIZE_SZ + MALLOC_ALIGN_MASK) & ~MALLOC_ALIGN_MASK)
+#endif
+
+/*  Same, except also perform argument check */
+
+#define checked_request2size(req, sz)                                          \
+	if (REQUEST_OUT_OF_RANGE(req)) {                                       \
+		return (INVALID_HANDLE);                                       \
+	}                                                                      \
+	(sz) = request2size(req);
+
+/*
+  --------------- Physical chunk operations ---------------
+*/
+/* size field is or'ed with PREV_INUSE when previous adjacent chunk in use */
+#define PREV_INUSE 0x1
+#define RESERVE_BITS (0x2 | 0x4)
+/*
+  Bits to mask off when extracting size
+*/
+#define SIZE_BITS (PREV_INUSE | RESERVE_BITS)
+
+/* Get size, ignoring use bits */
+#define CHUNK_SIZE(p) ((p)->m_tSize & ~(SIZE_BITS))
+#define REAL_SIZE(sz) ((sz) & ~(SIZE_BITS))
+
+/* extract inuse bit of previous chunk */
+#define prev_inuse(p) ((p)->m_tSize & PREV_INUSE)
+/*
+  The *_at_offset helpers address the chunk header located "offset" bytes
+  after p. Both arguments are parenthesised so that compound expressions
+  are cast and added as a whole.
+*/
+#define inuse_bit_at_offset(p, offset)                                         \
+	(((MallocChunk *)(((char *)(p)) + (offset)))->m_tSize & PREV_INUSE)
+#define set_inuse_bit_at_offset(p, s)                                          \
+	(((MallocChunk *)(((char *)(p)) + (s)))->m_tSize |= PREV_INUSE)
+#define clear_inuse_bit_at_offset(p, s)                                        \
+	(((MallocChunk *)(((char *)(p)) + (s)))->m_tSize &= ~(PREV_INUSE))
+/* store a new size while keeping the flag bits already present */
+#define set_size_at_offset(p, offset, size)                                    \
+	(((MallocChunk *)(((char *)(p)) + (offset)))->m_tSize =                \
+		 REAL_SIZE(size) |                                             \
+		 (((MallocChunk *)(((char *)(p)) + (offset)))->m_tSize &       \
+		  SIZE_BITS))
+#define set_presize_at_offset(p, offset, size)                                 \
+	(((MallocChunk *)(((char *)(p)) + (offset)))->m_tPreSize =             \
+		 REAL_SIZE(size))
+
+/* sizes below MIN_LARGE_SIZE are served from the small bins */
+#define in_smallbin_range(sz)                                                  \
+	((unsigned long)(sz) < (unsigned long)MIN_LARGE_SIZE)
+
+/* small bins are spaced 8 bytes apart */
+#define smallbin_index(sz) (((unsigned)(sz)) >> 3)
+
+/* large bins: progressively coarser size classes, capped at bin 126 */
+#define largebin_index(sz)                                                     \
+	(((((unsigned long)(sz)) >> 6) <= 32) ?                                \
+		 56 + (((unsigned long)(sz)) >> 6) :                           \
+		 ((((unsigned long)(sz)) >> 9) <= 20) ?                        \
+		 91 + (((unsigned long)(sz)) >> 9) :                           \
+		 ((((unsigned long)(sz)) >> 12) <= 10) ?                       \
+		 110 + (((unsigned long)(sz)) >> 12) :                         \
+		 ((((unsigned long)(sz)) >> 15) <= 4) ?                        \
+		 119 + (((unsigned long)(sz)) >> 15) :                         \
+		 ((((unsigned long)(sz)) >> 18) <= 2) ?                        \
+		 124 + (((unsigned long)(sz)) >> 18) :                         \
+		 126)
+
+#define bin_index(sz)                                                          \
+	((in_smallbin_range(sz)) ? smallbin_index(sz) : largebin_index(sz))
+
+#define NFASTBINS NSMALLBINS
+#define FAST_MAX_SIZE MIN_LARGE_SIZE
+#define fastbin_index(sz) smallbin_index(sz)
+
+/*
+  True when the chunk that starts at "chunk" and spans "sz" bytes ends exactly
+  at the current top. Uses m_pBaseAddr / m_pstHead of the calling scope.
+  Arguments are parenthesised so compound expressions expand correctly.
+*/
+#define AT_TOP(chunk, sz)                                                      \
+	(((char *)(chunk)) + (sz) ==                                           \
+	 ((char *)m_pBaseAddr) + m_pstHead->m_hTop)
+
+/* True when size + add (computed in INTER_SIZE_T) does not exceed MAX_ALLOC_SIZE */
+#define CAN_COMBILE(size, add)                                                 \
+	((INTER_SIZE_T)(size) + (add) <= (INTER_SIZE_T)MAX_ALLOC_SIZE)
+
+/*
+ * Build an allocator that is not yet bound to any memory region
+ * (do_init() or do_attach() sets the pointers) and fetch the statistics
+ * items it updates from g_stat_mgr.
+ */
+PtMalloc::PtMalloc()
+{
+	/* no region, no header, no bin tables yet */
+	m_pBaseAddr = NULL;
+	m_pstHead = NULL;
+	m_ptBin = m_ptFastBin = m_ptUnsortedBin = NULL;
+
+	minChunkSize = MINSIZE;
+	memset(err_message_, 0, sizeof(err_message_));
+
+	/* statistics items, looked up in the same order as before */
+	statChunkTotal = g_stat_mgr.get_stat_int_counter(DTC_CHUNK_TOTAL);
+	statDataSize = g_stat_mgr.get_stat_iterm(DTC_DATA_SIZE);
+	statMemoryTop = g_stat_mgr.get_stat_iterm(DTC_MEMORY_TOP);
+	statAverageDataSizeRecently =
+		g_stat_mgr.get_stat_iterm(DATA_SIZE_AVG_RECENT);
+
+	/* accumulators that start from zero */
+	statTmpDataSizeRecently = 0;
+	statTmpDataAllocCountRecently = 0;
+}
+
+PtMalloc::~PtMalloc() /* Empty body: nothing is released or unmapped here. */
+{
+}
+
+PtMalloc *PtMalloc::instance() /* Returns whatever Singleton<PtMalloc>::instance() provides. */
+{
+	return Singleton<PtMalloc>::instance();
+}
+
+void PtMalloc::destroy() /* Forwards to the Singleton helper. */
+{
+	Singleton<PtMalloc>::destory(); /* "destory" is the spelling used by the Singleton API */
+}
+/* Initialise the signature field of the header with the DTC signature words. */
+void PtMalloc::init_sign()
+{
+	static const unsigned int kSignWords[14] = {
+		DTC_SIGN_0, DTC_SIGN_1, DTC_SIGN_2, DTC_SIGN_3,
+		DTC_SIGN_4, DTC_SIGN_5, DTC_SIGN_6, DTC_SIGN_7,
+		DTC_SIGN_8, DTC_SIGN_9, DTC_SIGN_A, DTC_SIGN_B,
+		DTC_SIGN_C, DTC_SIGN_D
+	};
+
+	/* the byte count is taken from the destination field, not the table */
+	memcpy(m_pstHead->m_auiSign, kSignWords, sizeof(m_pstHead->m_auiSign));
+}
+
+#if __WORDSIZE == 64 /* printf conversion used below for INTER_SIZE_T / handle values */
+#define UINT64FMT_T "%lu"
+#else
+#define UINT64FMT_T "%llu"
+#endif
+/*
+ * Initialise the cache header on a fresh memory region.
+ *   pAddr  start address of the cache
+ *   tSize  total size of the cache in bytes
+ * Returns 0 on success, -1 (with err_message_ set) when tSize is too small
+ * to hold the header, the bin tables, the reserve zone and one MINSIZE chunk.
+ */
+int PtMalloc::do_init(void *pAddr, INTER_SIZE_T tSize)
+{
+	/* header followed by NBINS + NFASTBINS + 1 (unsorted) bin slots */
+	const size_t tablesEnd =
+		sizeof(MemHead) + sizeof(CBin) * (NBINS + NFASTBINS + 1);
+
+	if (tSize < tablesEnd + DTC_RESERVE_SIZE + MINSIZE) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "invalid size[" UINT64FMT_T "]", tSize);
+		return (-1);
+	}
+
+	/* bind to the region and start from an all-zero, signed header */
+	m_pBaseAddr = pAddr;
+	m_pstHead = (MemHead *)m_pBaseAddr;
+	memset(m_pstHead, 0, sizeof(MemHead));
+	init_sign();
+	m_pstHead->m_ushVer = DTC_VER_MIN;
+	m_pstHead->m_ushHeadSize = sizeof(MemHead);
+	m_pstHead->m_tSize = tSize;
+	m_pstHead->m_tUserAllocChunkCnt = 0;
+
+	/* reserve zone: first aligned offset after the bin tables */
+	m_pstHead->m_hReserveZone = tablesEnd;
+	m_pstHead->m_hReserveZone =
+		(m_pstHead->m_hReserveZone + MALLOC_ALIGN_MASK) &
+		~MALLOC_ALIGN_MASK;
+
+	/* chunk area: first aligned offset after the reserve zone */
+	m_pstHead->m_hBottom = (m_pstHead->m_hReserveZone + DTC_RESERVE_SIZE +
+				MALLOC_ALIGN_MASK) &
+			       ~MALLOC_ALIGN_MASK;
+	m_pstHead->m_hTop = m_pstHead->m_hBottom;
+	m_pstHead->m_tUserAllocSize = m_pstHead->m_hBottom;
+	statMemoryTop = m_pstHead->m_hTop;
+
+	if (tSize > m_pstHead->m_hTop + MINSIZE)
+		m_pstHead->m_tLastFreeChunkSize =
+			tSize - m_pstHead->m_hTop - MINSIZE;
+	else
+		m_pstHead->m_tLastFreeChunkSize = 0;
+
+	m_pstHead->m_ushBinCnt = NBINS;
+	m_pstHead->m_ushFastBinCnt = NFASTBINS;
+	memset(m_pstHead->m_auiBinBitMap, 0, sizeof(m_pstHead->m_auiBinBitMap));
+
+	/* the three bin tables lie back to back right after the header */
+	m_ptBin = (CBin *)(((char *)m_pBaseAddr) + sizeof(MemHead));
+	m_ptFastBin = m_ptBin + NBINS;
+	m_ptUnsortedBin = m_ptFastBin + NFASTBINS;
+
+	/* empty every slot: regular bins, fast bins and the unsorted bin */
+	CBin *slot = m_ptBin;
+	for (int left = NBINS + NFASTBINS + 1; left > 0; --left, ++slot) {
+		slot->m_hPreChunk = INVALID_HANDLE;
+		slot->m_hNextChunk = INVALID_HANDLE;
+	}
+
+	/* boundary header at the top: size 0, "previous in use" set */
+	MallocChunk *topChunk =
+		(MallocChunk *)handle_to_ptr(m_pstHead->m_hTop);
+	topChunk->m_tPreSize = 0;
+	topChunk->m_tSize = PREV_INUSE;
+
+	// init stat
+	statChunkTotal = m_pstHead->m_tUserAllocChunkCnt;
+	statDataSize = m_pstHead->m_tUserAllocSize;
+
+	return (0);
+}
+/*
+ * Check the version recorded in the cache header.
+ * Returns 1 when the first two signature words do not match, the stored
+ * version when it is 2, 3 or 4, and 0 (with err_message_ set) otherwise.
+ */
+int PtMalloc::detect_version()
+{
+	const bool signMatches = m_pstHead->m_auiSign[0] == DTC_SIGN_0 &&
+				 m_pstHead->m_auiSign[1] == DTC_SIGN_1;
+	if (!signMatches)
+		return 1;
+
+	switch (m_pstHead->m_ushVer) {
+	case 2:
+	case 3:
+	case 4:
+		return m_pstHead->m_ushVer;
+	default:
+		break;
+	}
+
+	snprintf(err_message_, sizeof(err_message_),
+		 "unknown version signature %u", m_pstHead->m_ushVer);
+	return (0);
+}
+/*
+ * Report the consistency flag stored in the cache header.
+ * Original note: when dtc starts and loads the cache, any operation that
+ * needs to write the cache first marks it inconsistent, so that after a
+ * crash the restarted dtc does not use corrupted memory without checking.
+ */
+int PtMalloc::share_memory_integrity()
+{
+	const int integrity = (int)m_pstHead->m_shmIntegrity;
+	return integrity;
+}
+
+/* Store the consistency flag in the header: 1 for any non-zero flags, else 0. */
+void PtMalloc::set_share_memory_integrity(const int flags)
+{
+	m_pstHead->m_shmIntegrity = flags ? 1 : 0;
+}
+/*
+ * Attach to an already existing IPC shared memory region that dtc uses as
+ * its cache, and verify that the header found there is sane.
+ *   pAddr  start address of the region
+ *   tSize  size of the region in bytes
+ * Returns 0 on success; on failure err_message_ is set and the result is
+ *   -1  tSize too small for header + bin tables + one MINSIZE chunk
+ *   -2  header version is not DTC_VER_MIN
+ *   -3  size recorded in the header differs from tSize
+ *   -4  top offset recorded in the header is outside the region
+ *   -5  pAddr is NULL
+ * As before, m_pBaseAddr / m_pstHead stay pointing at pAddr when one of the
+ * header checks (-2, -3, -4) fails.
+ */
+int PtMalloc::do_attach(void *pAddr, INTER_SIZE_T tSize)
+{
+	/* the header is read below, so a NULL region must be rejected first */
+	if (pAddr == NULL) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "invalid address");
+		return (-5);
+	}
+
+	if (tSize < sizeof(MemHead) + sizeof(CBin) * (NBINS + NFASTBINS + 1) +
+			    MINSIZE) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "invalid size[" UINT64FMT_T "]", tSize);
+		return (-1);
+	}
+
+	m_pBaseAddr = pAddr;
+	m_pstHead = (MemHead *)m_pBaseAddr;
+	if (detect_version() != DTC_VER_MIN) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "Unsupported preferred version %u",
+			 m_pstHead->m_ushVer);
+		return (-2);
+	}
+
+	if (m_pstHead->m_tSize != tSize) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "invalid argument");
+		return (-3);
+	}
+	if (m_pstHead->m_hTop >= m_pstHead->m_tSize) {
+		/* it is the top offset that is being validated here */
+		snprintf(err_message_, sizeof(err_message_),
+			 "memory corruption-invalid top value");
+		return (-4);
+	}
+	m_ptBin = (CBin *)(((char *)m_pBaseAddr) + sizeof(MemHead));
+	m_ptFastBin = m_ptBin + NBINS;
+	m_ptUnsortedBin = m_ptFastBin + NFASTBINS;
+
+	// init stat (memory top included, as do_init() does)
+	statChunkTotal = m_pstHead->m_tUserAllocChunkCnt;
+	statDataSize = m_pstHead->m_tUserAllocSize;
+	statMemoryTop = m_pstHead->m_hTop;
+
+	return (0);
+}
+
+ALLOC_HANDLE_T PtMalloc::get_reserve_zone() /* Offset of the reserve zone as recorded in the header by do_init(). */
+{
+	return m_pstHead->m_hReserveZone;
+}
+/*
+ * hHandle is the user handle of a chunk.
+ * Returns the size of the user-usable space of that chunk, or 0 (with
+ * err_message_ set) when the handle lies outside (bottom, top) or the chunk
+ * fails check_inuse_chunk().
+ */
+ALLOC_SIZE_T PtMalloc::chunk_size(ALLOC_HANDLE_T hHandle)
+{
+	const bool insideArena = hHandle > m_pstHead->m_hBottom &&
+				 hHandle < m_pstHead->m_hTop;
+	if (!insideArena) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "[chunk_size]-invalid handle");
+		return (0);
+	}
+
+	/* step back from the user area to the chunk header */
+	MallocChunk *header =
+		(MallocChunk *)mem2chunk(handle_to_ptr(hHandle));
+	if (check_inuse_chunk(header) != 0) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "[chunk_size]-invalid chunk");
+		return (0);
+	}
+
+	return chunksize2memsize(CHUNK_SIZE(header));
+}
+/*
+ * Mark the head chunk of the given bin as in use and unlink it from the bin.
+ * Returns the chunk (header address), or NULL when the bin is empty.
+ */
+void *PtMalloc::bin_malloc(CBin &ptBin)
+{
+	const INTER_HANDLE_T hFirst = ptBin.m_hNextChunk;
+	if (hFirst == INVALID_HANDLE)
+		return (NULL);
+
+	MallocChunk *first = (MallocChunk *)handle_to_ptr(hFirst);
+	/* the in-use flag of a chunk lives in the header that follows it */
+	set_inuse_bit_at_offset(first, REAL_SIZE(first->m_tSize));
+	unlink_bin(ptBin, hFirst);
+
+	return first;
+}
+/*
+ * Check every bin: small & large bins, fast bins and the unsorted bin.
+ * Each bin's chain is followed through m_hNextChunk to its last chunk, and
+ * that last handle must equal the bin's m_hPreChunk (an empty bin must have
+ * both set to INVALID_HANDLE).
+ * Returns 0 when all bins pass; -1 / -2 / -3 (with err_message_ set) for a
+ * bad regular bin / fast bin / unsorted bin.
+ */
+int PtMalloc::check_bin()
+{
+	INTER_HANDLE_T hTail;
+	MallocChunk *node;
+
+	/* small & large bins */
+	for (int idx = 0; idx < NBINS; ++idx) {
+		hTail = m_ptBin[idx].m_hNextChunk;
+		if (hTail != INVALID_HANDLE) {
+			for (;;) {
+				node = (MallocChunk *)handle_to_ptr(hTail);
+				if (node->m_hNextChunk == INVALID_HANDLE)
+					break;
+				hTail = node->m_hNextChunk;
+			}
+		}
+		if (m_ptBin[idx].m_hPreChunk != hTail) {
+			snprintf(err_message_, sizeof(err_message_),
+				 "bad bin[%d]", idx);
+			return (-1);
+		}
+	}
+
+	/* fast bins */
+	for (int idx = 0; idx < NFASTBINS; ++idx) {
+		hTail = m_ptFastBin[idx].m_hNextChunk;
+		if (hTail != INVALID_HANDLE) {
+			for (;;) {
+				node = (MallocChunk *)handle_to_ptr(hTail);
+				if (node->m_hNextChunk == INVALID_HANDLE)
+					break;
+				hTail = node->m_hNextChunk;
+			}
+		}
+		if (m_ptFastBin[idx].m_hPreChunk != hTail) {
+			snprintf(err_message_, sizeof(err_message_),
+				 "bad fast-bin[%d]", idx);
+			return (-2);
+		}
+	}
+
+	/* unsorted bin */
+	hTail = m_ptUnsortedBin[0].m_hNextChunk;
+	if (hTail != INVALID_HANDLE) {
+		for (;;) {
+			node = (MallocChunk *)handle_to_ptr(hTail);
+			if (node->m_hNextChunk == INVALID_HANDLE)
+				break;
+			hTail = node->m_hNextChunk;
+		}
+	}
+	if (m_ptUnsortedBin[0].m_hPreChunk != hTail) {
+#if __WORDSIZE == 64
+		snprintf(err_message_, sizeof(err_message_),
+			 "bad unsorted-bin[%d] %lu!=%lu", 0,
+			 m_ptUnsortedBin[0].m_hPreChunk, hTail);
+#else
+		snprintf(err_message_, sizeof(err_message_),
+			 "bad unsorted-bin[%d] %llu!=%llu", 0,
+			 m_ptUnsortedBin[0].m_hPreChunk, hTail);
+#endif
+		return (-3);
+	}
+
+	return (0);
+}
+/*
+ * Verify the consistency of the chunk chain (only built with BIN_MEM_CHECK).
+ * Starting at the top and walking towards the bottom, one chunk at a time,
+ * the size of each chunk must equal the m_tPreSize recorded in the chunk
+ * that follows it.
+ * Returns 0 when consistent, -1 on a size mismatch, -2 when a recorded
+ * m_tPreSize is larger than the current handle.
+ */
+#if BIN_MEM_CHECK
+int PtMalloc::check_mem()
+{
+	/* size the current chunk must have, as announced by its successor */
+	ALLOC_SIZE_T tExpected = 0;
+
+	for (INTER_HANDLE_T hCur = m_pstHead->m_hTop;
+	     hCur > m_pstHead->m_hBottom; hCur -= tExpected) {
+		MallocChunk *cur = (MallocChunk *)handle_to_ptr(hCur);
+
+		if (CHUNK_SIZE(cur) != tExpected) {
+#if __WORDSIZE == 64
+			snprintf(err_message_, sizeof(err_message_),
+				 "bad memory1 handle[%lu]", hCur);
+#else
+			snprintf(err_message_, sizeof(err_message_),
+				 "bad memory1 handle[%llu]", hCur);
+#endif
+			return (-1);
+		}
+
+		tExpected = cur->m_tPreSize;
+		if (hCur < tExpected) {
+#if __WORDSIZE == 64
+			snprintf(err_message_, sizeof(err_message_),
+				 "bad memory handle[%lu]", hCur);
+#else
+			snprintf(err_message_, sizeof(err_message_),
+				 "bad memory handle[%llu]", hCur);
+#endif
+			return (-2);
+		}
+	}
+
+	return (0);
+}
+#endif
+/*
+ * Take a free chunk for tSize from one of the fast bins.
+ * The bin is located the same way as for the small bins
+ * (fastbin_index is defined as smallbin_index).
+ */
+void *PtMalloc::fast_malloc(ALLOC_SIZE_T tSize)
+{
+	CBin &bin = m_ptFastBin[fastbin_index(tSize)];
+	return bin_malloc(bin);
+}
+/*
+ * Take a free chunk for tSize from the matching small bin.
+ * Returns the chunk, or NULL when that bin is empty; the bin's bitmap bit is
+ * cleared whenever the bin is empty after the attempt.
+ */
+void *PtMalloc::small_bin_malloc(ALLOC_SIZE_T tSize)
+{
+	const unsigned int uiIdx = smallbin_index(tSize);
+	void *const chunk = bin_malloc(m_ptBin[uiIdx]);
+
+	if (empty_bin(uiIdx))
+		clear_bin_bit_map(uiIdx);
+
+	return (chunk);
+}
+/* Release the free chunks held in every bin of the fast bins. */
+/* For each chunk: try to merge it with the previous and next chunks in memory.
+ * A neighbour that is merged is unlinked from its regular bin and the merged
+ * chunk is marked in use again. The result is finally linked into the
+ * unsorted bin, or given back to the top when it ends exactly at the top.
+ * Does nothing unless MALLOC_FLAG_FAST is set; always returns 0. */
+int PtMalloc::free_fast()
+{
+	if (!(m_pstHead->m_uiFlags & MALLOC_FLAG_FAST)) // no fast chunk
+		return (0);
+
+	for (int i = 0; i < NFASTBINS; i++) {
+		if (m_ptFastBin[i].m_hNextChunk != INVALID_HANDLE) {
+			MallocChunk *pstChunk;
+			//			MallocChunk* pstPreChunk;
+			MallocChunk *pstNextChunk;
+			ALLOC_SIZE_T tSize;
+			ALLOC_SIZE_T tPreSize;
+			//			ALLOC_SIZE_T tNextSize;
+			unsigned int uiBinIdx;
+
+			do { // free fast-chunk & put it into unsorted chunk list
+				pstChunk = (MallocChunk *)handle_to_ptr(
+					m_ptFastBin[i].m_hNextChunk);
+				unlink_bin(m_ptFastBin[i],
+					   m_ptFastBin[i].m_hNextChunk); // always removes the current head of this fast bin
+
+				tSize = CHUNK_SIZE(pstChunk);
+				if (!prev_inuse(pstChunk) &&
+				    CAN_COMBILE(tSize, pstChunk->m_tPreSize)) { // previous chunk is free and the sum stays within MAX_ALLOC_SIZE
+					tPreSize = pstChunk->m_tPreSize;
+					tSize += tPreSize;
+					pstChunk =
+						(MallocChunk
+							 *)(((char *)pstChunk) -
+							    tPreSize); // pstChunk now points at the previous chunk (start of the merged chunk)
+
+					uiBinIdx = bin_index(tPreSize);
+					unlink_bin(m_ptBin[uiBinIdx],
+						   ptr_to_handle(pstChunk));
+					if (empty_bin(uiBinIdx))
+						clear_bin_bit_map(uiBinIdx);
+					set_inuse_bit_at_offset(pstChunk,
+								tSize);
+				}
+
+				if (!AT_TOP(pstChunk, tSize)) { // there is a following chunk to look at
+					pstNextChunk =
+						(MallocChunk
+							 *)(((char *)pstChunk) +
+							    tSize);
+					ALLOC_SIZE_T tNextSize =
+						CHUNK_SIZE(pstNextChunk);
+					uiBinIdx = bin_index(tNextSize);
+					if (!inuse_bit_at_offset(pstNextChunk,
+								 tNextSize) &&
+					    CAN_COMBILE(tSize, tNextSize)) { // next chunk is free and the sum stays within MAX_ALLOC_SIZE
+						tSize += tNextSize;
+						unlink_bin(
+							m_ptBin[uiBinIdx],
+							ptr_to_handle(
+								pstNextChunk));
+						if (empty_bin(uiBinIdx))
+							clear_bin_bit_map(
+								uiBinIdx);
+						set_inuse_bit_at_offset(
+							pstChunk, tSize);
+					} else {
+						//						clear_inuse_bit_at_offset(pstNextChunk, 0);
+					}
+				}
+
+				if (m_pstHead->m_tLastFreeChunkSize <
+				    REAL_SIZE(tSize)) // keep the largest size produced so far
+					m_pstHead->m_tLastFreeChunkSize =
+						REAL_SIZE(tSize);
+				pstChunk->m_tSize =
+					REAL_SIZE(tSize) |
+					(pstChunk->m_tSize & SIZE_BITS); // new size, flag bits preserved
+				if (AT_TOP(pstChunk, tSize)) {
+					// combine into bottom
+					m_pstHead->m_hTop -= tSize;
+					statMemoryTop = m_pstHead->m_hTop;
+					//					clear_inuse_bit_at_offset(pstChunk, 0);
+				} else {
+					link_bin(m_ptUnsortedBin[0],
+						 ptr_to_handle(pstChunk));
+				}
+				pstNextChunk =
+					(MallocChunk *)(((char *)pstChunk) +
+							REAL_SIZE(tSize));
+				pstNextChunk->m_tPreSize = REAL_SIZE(tSize); // record the (merged) size in the header that follows
+
+			} while (m_ptFastBin[i].m_hNextChunk != INVALID_HANDLE);
+		}
+	}
+
+	m_pstHead->m_uiFlags &= ~MALLOC_FLAG_FAST;
+
+	return (0);
+}
+/*
+ * Carve a chunk of tSize bytes out of the space above the top mark.
+ * Returns the user pointer of the new chunk, or NULL (err_message_ set to
+ * "out of memory") when top + tSize + MINSIZE would reach the arena size.
+ */
+void *PtMalloc::top_alloc(ALLOC_SIZE_T tSize)
+{
+	if (m_pstHead->m_hTop + tSize + MINSIZE >= m_pstHead->m_tSize) {
+		snprintf(err_message_, sizeof(err_message_), "out of memory");
+		return (NULL);
+	}
+
+	/* the header at the old top becomes the header of the new chunk */
+	MallocChunk *fresh = (MallocChunk *)handle_to_ptr(m_pstHead->m_hTop);
+	fresh->m_tSize = (fresh->m_tSize & SIZE_BITS) | REAL_SIZE(tSize);
+
+	/* write a new boundary header right behind the new chunk */
+	MallocChunk *boundary = (MallocChunk *)(((char *)fresh) + tSize);
+	boundary->m_tPreSize = REAL_SIZE(tSize);
+	boundary->m_tSize = PREV_INUSE;
+
+	m_pstHead->m_hTop += tSize;
+	statMemoryTop = m_pstHead->m_hTop;
+
+	void *header = (void *)fresh;
+	return chunk2mem(header);
+}
+/*
+ * Unlink the chunk designated by hHandle from the given bin.
+ * Returns 0 on success, -1 for INVALID_HANDLE, -2 (err_message_ set) when
+ * the bin has no head or no tail.
+ */
+int PtMalloc::unlink_bin(CBin &stBin, INTER_HANDLE_T hHandle)
+{
+	if (hHandle == INVALID_HANDLE)
+		return (-1);
+
+	if (stBin.m_hNextChunk == INVALID_HANDLE ||
+	    stBin.m_hPreChunk == INVALID_HANDLE) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "unlink-bin: bad bin!");
+		return (-2);
+	}
+
+	MallocChunk *victim = (MallocChunk *)handle_to_ptr(hHandle);
+	const INTER_HANDLE_T hBefore = victim->m_hPreChunk;
+	const INTER_HANDLE_T hAfter = victim->m_hNextChunk;
+
+	/* forward link: the bin head or the predecessor now skips the victim */
+	if (hBefore == INVALID_HANDLE) {
+		//remove head
+		stBin.m_hNextChunk = hAfter;
+	} else {
+		((MallocChunk *)handle_to_ptr(hBefore))->m_hNextChunk = hAfter;
+	}
+
+	/* backward link: the bin tail or the successor now skips the victim */
+	if (hAfter == INVALID_HANDLE) {
+		stBin.m_hPreChunk = hBefore;
+	} else {
+		((MallocChunk *)handle_to_ptr(hAfter))->m_hPreChunk = hBefore;
+	}
+
+	return (0);
+}
+/*
+ * Insert the chunk designated by hHandle at the head of the given bin.
+ * Returns 0 on success, -1 for INVALID_HANDLE, -2 / -3 (err_message_ set)
+ * when the bin's head and tail disagree about the bin being empty.
+ */
+int PtMalloc::link_bin(CBin &stBin, INTER_HANDLE_T hHandle)
+{
+	if (hHandle == INVALID_HANDLE)
+		return (-1);
+
+	MallocChunk *incoming = (MallocChunk *)handle_to_ptr(hHandle);
+	const INTER_HANDLE_T hOldHead = stBin.m_hNextChunk;
+
+	incoming->m_hNextChunk = hOldHead;
+	incoming->m_hPreChunk = INVALID_HANDLE;
+
+	if (hOldHead == INVALID_HANDLE) {
+		/* empty bin: the tail must be unset too */
+		if (stBin.m_hPreChunk != INVALID_HANDLE) {
+			snprintf(err_message_, sizeof(err_message_),
+				 "link-bin: bad bin");
+			return (-3);
+		}
+		stBin.m_hPreChunk = hHandle;
+	} else {
+		/* the old head is re-pointed before the tail is checked */
+		MallocChunk *oldHead = (MallocChunk *)handle_to_ptr(hOldHead);
+		oldHead->m_hPreChunk = hHandle;
+		if (stBin.m_hPreChunk == INVALID_HANDLE) {
+			snprintf(err_message_, sizeof(err_message_),
+				 "link-bin: bad bin");
+			return (-2);
+		}
+	}
+	stBin.m_hNextChunk = hHandle;
+
+	return (0);
+}
+/* Find a suitable position in the bin and insert the chunk designated by hHandle there. */
+/* The position is searched from the tail of the bin towards the head: the chunk is placed
+ * right after the first chunk met whose m_tSize is not smaller than the new size; when no
+ * such chunk exists it becomes the new head (via link_bin).
+ * Returns 0 on success, -1 for INVALID_HANDLE or an inconsistent bin. */
+int PtMalloc::link_sorted_bin(CBin &stBin, INTER_HANDLE_T hHandle,
+			      ALLOC_SIZE_T tSize)
+{
+	MallocChunk *pstChunk;
+	MallocChunk *pstNextChunk;
+
+	if (hHandle == INVALID_HANDLE)
+		return (-1);
+
+	pstChunk = (MallocChunk *)handle_to_ptr(hHandle);
+	pstChunk->m_hNextChunk = INVALID_HANDLE;
+	pstChunk->m_hPreChunk = INVALID_HANDLE;
+
+	if (stBin.m_hNextChunk == INVALID_HANDLE) { // empty bin
+		pstChunk->m_hPreChunk = INVALID_HANDLE;
+		pstChunk->m_hNextChunk = INVALID_HANDLE;
+		stBin.m_hNextChunk = hHandle;
+		stBin.m_hPreChunk = hHandle;
+	} else {
+		INTER_HANDLE_T hPre;
+		hPre = stBin.m_hPreChunk; // start at the tail
+		tSize = REAL_SIZE(tSize) | PREV_INUSE; // PREV_INUSE is or'ed in; the comparison below uses the raw m_tSize (flag bits included)
+		MallocChunk *pstPreChunk = 0;
+		while (hPre != INVALID_HANDLE) {
+			pstPreChunk = (MallocChunk *)handle_to_ptr(hPre);
+			if (tSize <= pstPreChunk->m_tSize)
+				break;
+			hPre = pstPreChunk->m_hPreChunk;
+		}
+		if (hPre == INVALID_HANDLE) {
+			if (stBin.m_hPreChunk == INVALID_HANDLE) {
+				// empty list
+				snprintf(err_message_, sizeof(err_message_),
+					 "memory corruction");
+				return (-1);
+			}
+
+			// place chunk at list head
+			link_bin(stBin, hHandle);
+		} else {
+			pstChunk->m_hPreChunk = hPre; // splice in right after pstPreChunk
+			pstChunk->m_hNextChunk = pstPreChunk->m_hNextChunk;
+			pstPreChunk->m_hNextChunk = hHandle;
+			if (pstChunk->m_hNextChunk != INVALID_HANDLE) {
+				pstNextChunk = (MallocChunk *)handle_to_ptr(
+					pstChunk->m_hNextChunk);
+				pstNextChunk->m_hPreChunk =
+					ptr_to_handle(pstChunk);
+			} else {
+				// list tail
+				stBin.m_hPreChunk = hHandle;
+			}
+		}
+	}
+
+	return (0);
+}
+/*
+ * Main logic for allocating a chunk that satisfies tSize (used by Malloc()
+ * and inter_re_alloc()).
+ *
+ * tSize is the caller's request in bytes. It is first padded to an internal
+ * chunk size; an out-of-range request makes the function return
+ * INVALID_HANDLE immediately. The search order is:
+ *   1. the exact small bin, when the padded size is in small-bin range;
+ *   2. for large sizes, the matching large bin (at most 99 chunks examined);
+ *   3. the next non-empty bin of larger index, located with the bin bitmap;
+ *   4. fresh memory taken from the top (top_alloc).
+ * A chunk taken from a bin is split when the remainder is at least
+ * get_min_chunk_size(); the remainder is handed back through inter_free().
+ *
+ * Returns the user handle of the chunk. For the top_alloc() fallback the
+ * result of ptr_to_handle() on its return value is passed through as is.
+ */
+ALLOC_HANDLE_T PtMalloc::inter_malloc(ALLOC_SIZE_T tSize)
+{
+	void *p;
+
+	/* may return INVALID_HANDLE from this function */
+	checked_request2size(tSize, tSize);
+
+	/* no more use fast bin
+	if(tSize < FAST_MAX_SIZE){
+		p = fast_malloc(tSize);
+		if(p != NULL)
+			return ptr_to_handle(chunk2mem(p));
+	}
+	*/
+
+	if (in_smallbin_range(tSize)) {
+		p = small_bin_malloc(tSize);
+		if (p != NULL)
+			return ptr_to_handle(chunk2mem(p));
+	}
+
+	for (;;) {
+		MallocChunk *pstChunk = NULL;
+		MallocChunk *pstNextChunk = NULL;
+
+		unsigned int uiBinIdx = bin_index(tSize);
+		if (!in_smallbin_range(tSize)) {
+			INTER_HANDLE_T v = m_ptBin[uiBinIdx].m_hNextChunk;
+			unsigned int try_search_count = 0;
+
+			/* search each bin at most 100 times; on failure move on to the next bin */
+			while (v != INVALID_HANDLE &&
+			       ++try_search_count < 100) {
+				pstChunk = (MallocChunk *)handle_to_ptr(v);
+				if (CHUNK_SIZE(pstChunk) >= tSize)
+					break;
+
+				v = pstChunk->m_hNextChunk;
+			}
+
+			if (!(v != INVALID_HANDLE && try_search_count < 100))
+				goto SEARCH_NEXT_BIN;
+
+			ALLOC_SIZE_T tRemainSize;
+			tRemainSize = CHUNK_SIZE(pstChunk) - tSize;
+			// unlink
+			unlink_bin(m_ptBin[uiBinIdx], ptr_to_handle(pstChunk));
+			if (empty_bin(uiBinIdx))
+				clear_bin_bit_map(uiBinIdx);
+
+			if (tRemainSize < get_min_chunk_size()) {
+				/* remainder too small to split off: hand out the whole chunk */
+				set_inuse_bit_at_offset(pstChunk,
+							CHUNK_SIZE(pstChunk));
+			} else {
+				/* split: front part is returned, the tail becomes its own
+				   in-use chunk and is released through inter_free() */
+				pstChunk->m_tSize =
+					tSize | (pstChunk->m_tSize & SIZE_BITS);
+				pstNextChunk =
+					(MallocChunk *)(((char *)pstChunk) +
+							tSize);
+				pstNextChunk->m_tSize = tRemainSize;
+				pstNextChunk->m_tPreSize = tSize;
+				set_inuse_bit_at_offset(pstNextChunk, 0);
+				((MallocChunk *)(((char *)pstChunk) + tSize +
+						 tRemainSize))
+					->m_tPreSize = tRemainSize;
+				set_inuse_bit_at_offset(pstNextChunk,
+							tRemainSize);
+				ALLOC_SIZE_T user_size;
+				inter_free(chunkhandle2memhandle(
+						   ptr_to_handle(pstNextChunk)),
+					   user_size);
+			}
+
+			p = (void *)pstChunk;
+			return ptr_to_handle(chunk2mem(p));
+		}
+
+		/*
+		   do_search for a chunk by scanning bins, starting with next largest
+		   bin. This search is strictly by best-fit; i.e., the smallest
+		   (with ties going to approximately the least recently used) chunk
+		   that fits is selected.
+		   */
+	SEARCH_NEXT_BIN:
+		uiBinIdx++;
+		unsigned int uiBitMapIdx = uiBinIdx / 32;
+		/* number of 32-bit words in the bitmap; sizeof() alone is a byte
+		   count and would let the scan run past the end of the array */
+		const unsigned int uiBitMapCnt =
+			sizeof(m_pstHead->m_auiBinBitMap) /
+			sizeof(m_pstHead->m_auiBinBitMap[0]);
+		if (uiBitMapIdx < uiBitMapCnt &&
+		    m_pstHead->m_auiBinBitMap[uiBitMapIdx] == 0) {
+			/* whole word empty: skip ahead word by word */
+			uiBitMapIdx++;
+			uiBinIdx = uiBitMapIdx * 32;
+			while (uiBitMapIdx < uiBitMapCnt &&
+			       m_pstHead->m_auiBinBitMap[uiBitMapIdx] == 0) {
+				uiBitMapIdx++;
+				uiBinIdx += 32;
+			}
+		}
+		while (uiBinIdx < NBINS &&
+		       m_ptBin[uiBinIdx].m_hNextChunk == INVALID_HANDLE)
+			uiBinIdx++;
+
+		if (uiBinIdx >= NBINS) {
+			goto MALLOC_BOTTOM;
+		}
+
+		/* walk from the tail of the bin until a chunk is large enough */
+		INTER_HANDLE_T hPre;
+		hPre = m_ptBin[uiBinIdx].m_hPreChunk;
+		do {
+			pstChunk = (MallocChunk *)handle_to_ptr(hPre);
+			hPre = pstChunk->m_hPreChunk;
+		} while (CHUNK_SIZE(pstChunk) < tSize);
+		ALLOC_SIZE_T tRemainSize;
+		tRemainSize = CHUNK_SIZE(pstChunk) - tSize;
+		// unlink
+		unlink_bin(m_ptBin[uiBinIdx], ptr_to_handle(pstChunk));
+		if (empty_bin(uiBinIdx))
+			clear_bin_bit_map(uiBinIdx);
+
+		if (tRemainSize < get_min_chunk_size()) {
+			set_inuse_bit_at_offset(pstChunk, CHUNK_SIZE(pstChunk));
+		} else {
+			/* disable unsorted bins */
+			pstChunk->m_tSize =
+				tSize | (pstChunk->m_tSize & SIZE_BITS);
+			pstNextChunk =
+				(MallocChunk *)(((char *)pstChunk) + tSize);
+			pstNextChunk->m_tSize = tRemainSize;
+			pstNextChunk->m_tPreSize = tSize;
+			set_inuse_bit_at_offset(pstNextChunk, 0);
+			((MallocChunk *)(((char *)pstChunk) + tSize +
+					 tRemainSize))
+				->m_tPreSize = tRemainSize;
+			set_inuse_bit_at_offset(pstNextChunk, tRemainSize);
+			ALLOC_SIZE_T user_size;
+			inter_free(chunkhandle2memhandle(
+					   ptr_to_handle(pstNextChunk)),
+				   user_size);
+		}
+
+		p = (void *)pstChunk;
+		return ptr_to_handle(chunk2mem(p));
+	}
+
+MALLOC_BOTTOM:
+	return ptr_to_handle(top_alloc(tSize));
+}
+/*
+ * Wrapper around inter_malloc(): performs a simple check of the result and,
+ * on success, updates the allocation counters in the header and the stats.
+ * Returns the user handle, or INVALID_HANDLE when the allocation failed.
+ */
+ALLOC_HANDLE_T PtMalloc::Malloc(ALLOC_SIZE_T tSize)
+{
+	m_pstHead->m_tLastFreeChunkSize = 0;
+
+	const ALLOC_HANDLE_T hResult = inter_malloc(tSize);
+	if (hResult == INVALID_HANDLE)
+		return (hResult);
+
+	/* account for the whole chunk, not just the requested bytes */
+	MallocChunk *header =
+		(MallocChunk *)mem2chunk(handle_to_ptr(hResult));
+	m_pstHead->m_tUserAllocSize += CHUNK_SIZE(header);
+	m_pstHead->m_tUserAllocChunkCnt++;
+	++statChunkTotal;
+	statDataSize = m_pstHead->m_tUserAllocSize;
+	add_alloc_size_to_stat(tSize);
+
+	return (hResult);
+}
+/*
+ * Like Malloc(), and additionally clears the first tSize bytes of the user
+ * part of the returned chunk.
+ */
+ALLOC_HANDLE_T PtMalloc::Calloc(ALLOC_SIZE_T tSize)
+{
+	const ALLOC_HANDLE_T hResult = Malloc(tSize);
+	if (hResult == INVALID_HANDLE)
+		return hResult;
+
+	memset(Pointer<char>(hResult), 0x00, tSize);
+	return hResult;
+}
+
+/*
+ * Returns 0 when the given chunk is in use and its neighbours look sane.
+ * Otherwise err_message_ is set and the result is
+ *   -1  the chunk is not marked in use
+ *   -2  the previous chunk is free but lies below the bottom or its size
+ *       differs from this chunk's m_tPreSize
+ *   -3  the following chunk is not at the top and is smaller than MINSIZE
+ */
+int PtMalloc::check_inuse_chunk(MallocChunk *pstChunk)
+{
+	const ALLOC_SIZE_T tSelfSize = CHUNK_SIZE(pstChunk);
+
+	if (!inuse_bit_at_offset(pstChunk, tSelfSize)) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "chunk not inuse!");
+		return (-1);
+	}
+
+	if (!prev_inuse(pstChunk)) {
+		MallocChunk *before =
+			(MallocChunk *)(((char *)pstChunk) -
+					pstChunk->m_tPreSize);
+		if (ptr_to_handle(before) < m_pstHead->m_hBottom ||
+		    CHUNK_SIZE(before) != pstChunk->m_tPreSize) {
+			snprintf(err_message_, sizeof(err_message_),
+				 "invalid pre-chunk size!");
+			return (-2);
+		}
+	}
+
+	MallocChunk *after = (MallocChunk *)(((char *)pstChunk) + tSelfSize);
+	if (!AT_TOP(after, 0) && CHUNK_SIZE(after) < MINSIZE) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "invalid next chunk!");
+		return (-3);
+	}
+
+	return (0);
+}
+/*
+ * Core logic of realloc.
+ *
+ * hHandle      user-memory handle of the block to resize; INVALID_HANDLE
+ *              turns the call into inter_malloc(tSize).
+ * tSize        requested user size; 0 frees the block and returns
+ *              INVALID_HANDLE.
+ * tOldMemSize  out: reset to 0 on entry; set to the old chunk size once the
+ *              old chunk has passed validation (in the tSize == 0 path it is
+ *              whatever inter_free() reports).
+ *
+ * Returns the user-memory handle of the resized block, or INVALID_HANDLE on
+ * failure (err_message_ is filled in).
+ *
+ * Strategy, in order: keep the chunk if it is already large enough, grow
+ * into top, absorb a free next chunk, merge with a free previous chunk
+ * (moving the data), and finally allocate + copy + free. A sufficiently
+ * large remainder is split off and released.
+ */
+ALLOC_HANDLE_T PtMalloc::inter_re_alloc(ALLOC_HANDLE_T hHandle,
+					ALLOC_SIZE_T tSize,
+					ALLOC_SIZE_T &tOldMemSize)
+{
+	INTER_HANDLE_T hNewHandle;
+	INTER_SIZE_T tNewSize;
+	MallocChunk *pstChunk;
+
+	// keep the caller's original request; tSize is rewritten below
+	ALLOC_SIZE_T tUserReqSize = tSize;
+
+	tOldMemSize = 0;
+	if (hHandle == INVALID_HANDLE) {
+		//		return inter_malloc(tSize - MALLOC_ALIGN_MASK);
+		return inter_malloc(tSize);
+	}
+
+	if (tSize == 0) {
+		// NOTE(review): the result of inter_free() is ignored here, so a
+		// failed free is indistinguishable from a successful one - confirm
+		inter_free(hHandle, tOldMemSize);
+		return (INVALID_HANDLE);
+	}
+
+	// presumably converts the request into a chunk size (macro defined
+	// outside this view) - TODO confirm
+	checked_request2size(tSize, tSize);
+
+	if (hHandle >= m_pstHead->m_hTop || hHandle <= m_pstHead->m_hBottom) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "realloc-invalid handle");
+		return (INVALID_HANDLE);
+	}
+
+	ALLOC_SIZE_T tOldSize;
+	pstChunk = (MallocChunk *)mem2chunk(handle_to_ptr(hHandle));
+	tOldSize = CHUNK_SIZE(pstChunk);
+	// from here on hHandle refers to the chunk header, not the user memory
+	hHandle = ptr_to_handle((void *)pstChunk);
+	if (hHandle + tOldSize > m_pstHead->m_hTop) {
+#if __WORDSIZE == 64
+		snprintf(err_message_, sizeof(err_message_),
+			 "realloc-invalid handle: %lu, size: %u", hHandle,
+			 tOldSize);
+#else
+		snprintf(err_message_, sizeof(err_message_),
+			 "realloc-invalid handle: %llu, size: %u", hHandle,
+			 tOldSize);
+#endif
+		return (INVALID_HANDLE);
+	}
+
+	if (misaligned_chunk(hHandle)) {
+#if __WORDSIZE == 64
+		snprintf(err_message_, sizeof(err_message_),
+			 "realloc-invalid handle: %lu, size: %u", hHandle,
+			 tOldSize);
+#else
+		snprintf(err_message_, sizeof(err_message_),
+			 "realloc-invalid handle: %llu, size: %u", hHandle,
+			 tOldSize);
+#endif
+		return (INVALID_HANDLE);
+	}
+
+	if (tOldSize < MINSIZE) {
+#if __WORDSIZE == 64
+		snprintf(err_message_, sizeof(err_message_),
+			 "realloc-invalid old-size: %lu, size: %u", hHandle,
+			 tOldSize);
+#else
+		snprintf(err_message_, sizeof(err_message_),
+			 "realloc-invalid old-size: %llu, size: %u", hHandle,
+			 tOldSize);
+#endif
+		return (INVALID_HANDLE);
+	}
+
+	if (check_inuse_chunk(pstChunk) != 0) {
+#if __WORDSIZE == 64
+		snprintf(err_message_, sizeof(err_message_),
+			 "realloc-invalid chunk: %lu, size: %u", hHandle,
+			 tOldSize);
+#else
+		snprintf(err_message_, sizeof(err_message_),
+			 "realloc-invalid chunk: %llu, size: %u", hHandle,
+			 tOldSize);
+#endif
+		return (INVALID_HANDLE);
+	}
+	// the old chunk is valid: report its size to the caller
+	tOldMemSize = tOldSize;
+
+	// remembered so the header of the resulting chunk can be rebuilt at the
+	// end; both are replaced when merging with the previous chunk
+	int iPreInUse = prev_inuse(pstChunk);
+	ALLOC_SIZE_T tPreSize = pstChunk->m_tPreSize;
+
+	MallocChunk *pstTmp;
+	MallocChunk *pstNextChunk;
+	pstNextChunk =
+		(MallocChunk *)(((char *)pstChunk) + CHUNK_SIZE(pstChunk));
+
+	if (tOldSize >= tSize) {
+		// already large enough: keep the chunk, maybe split it below
+		hNewHandle = hHandle;
+		tNewSize = tOldSize;
+	} else {
+		/* Try to expand forward into top */
+		if (AT_TOP(pstChunk, tOldSize) &&
+		    m_pstHead->m_hTop + (tSize - tOldSize) + MINSIZE <
+			    m_pstHead->m_tSize) {
+			pstChunk->m_tSize = REAL_SIZE(tSize) |
+					    (pstChunk->m_tSize & SIZE_BITS);
+			// write a new top marker right after the grown chunk
+			pstNextChunk = (MallocChunk *)handle_to_ptr(
+				m_pstHead->m_hTop + (tSize - tOldSize));
+			pstNextChunk->m_tPreSize = REAL_SIZE(tSize);
+			pstNextChunk->m_tSize = PREV_INUSE;
+
+			m_pstHead->m_hTop += (tSize - tOldSize);
+			statMemoryTop = m_pstHead->m_hTop;
+			return ptr_to_handle(chunk2mem(pstChunk));
+		} else if (!AT_TOP(pstChunk, tOldSize) &&
+			   !inuse_bit_at_offset(pstNextChunk,
+						CHUNK_SIZE(pstNextChunk)) &&
+			   ((INTER_SIZE_T)tOldSize +
+			    CHUNK_SIZE(pstNextChunk)) >= tSize) {
+			// the next chunk is free and large enough: absorb it
+			hNewHandle = hHandle;
+			tNewSize = (INTER_SIZE_T)tOldSize +
+				   CHUNK_SIZE(pstNextChunk);
+			unlink_bin(m_ptBin[bin_index(CHUNK_SIZE(pstNextChunk))],
+				   ptr_to_handle(pstNextChunk));
+		}
+		/* ada: defrag */
+		// NOTE(review): unlike the branch above, this sum is not widened
+		// to INTER_SIZE_T before the comparison - confirm it cannot wrap
+		else if (!prev_inuse(pstChunk) &&
+			 (tOldSize + pstChunk->m_tPreSize) >= tSize) {
+			// the previous chunk is free: merge backwards and slide
+			// the user data down to the start of the merged chunk
+			pstTmp = (MallocChunk *)(((char *)pstChunk) -
+						 pstChunk->m_tPreSize);
+			iPreInUse = prev_inuse(pstTmp);
+			tPreSize = pstTmp->m_tPreSize;
+			// copy & move
+			hNewHandle = hHandle - pstChunk->m_tPreSize;
+			tNewSize =
+				(INTER_SIZE_T)tOldSize + pstChunk->m_tPreSize;
+			unlink_bin(m_ptBin[bin_index(pstChunk->m_tPreSize)],
+				   hNewHandle);
+			// copy user data
+			// (memmove: source and destination may overlap)
+			memmove(chunk2mem(handle_to_ptr(hNewHandle)),
+				chunk2mem(handle_to_ptr(hHandle)),
+				chunksize2memsize(tOldSize));
+		} else {
+			// alloc , copy & free
+			hNewHandle = inter_malloc(tUserReqSize);
+			if (hNewHandle == INVALID_HANDLE) {
+				snprintf(err_message_, sizeof(err_message_),
+					 "realloc-out of memory");
+				return (INVALID_HANDLE);
+			}
+			pstTmp = (MallocChunk *)mem2chunk(
+				handle_to_ptr(hNewHandle));
+			hNewHandle = ptr_to_handle(pstTmp);
+			tNewSize = CHUNK_SIZE(pstTmp);
+			// copy user data
+			memcpy(chunk2mem(pstTmp),
+			       chunk2mem(handle_to_ptr(hHandle)),
+			       chunksize2memsize(tOldSize));
+			ALLOC_SIZE_T tTmpSize;
+			inter_free(chunkhandle2memhandle(hHandle), tTmpSize);
+			return chunkhandle2memhandle(hNewHandle);
+		}
+	}
+
+	// reached by the "keep", "absorb next" and "merge previous" paths:
+	// hNewHandle / tNewSize describe the chunk that now holds the data
+	assert(tNewSize >= tSize);
+	MallocChunk *pstNewChunk;
+	pstNewChunk = (MallocChunk *)handle_to_ptr(hNewHandle);
+	INTER_SIZE_T tRemainderSize = tNewSize - tSize;
+	if (tRemainderSize >= get_min_chunk_size()) {
+		// split
+		MallocChunk *pstRemainChunk;
+		pstRemainChunk = (MallocChunk *)(((char *)pstNewChunk) + tSize);
+		//	ALLOC_SIZE_T tPreChunkSize = tSize;
+		// a remainder above MAX_ALLOC_SIZE is carved into several
+		// chunks; each piece is released through inter_free()
+		do {
+			ALLOC_SIZE_T tThisChunkSize;
+			if (tRemainderSize > MAX_ALLOC_SIZE) {
+				if (tRemainderSize - MAX_ALLOC_SIZE >= MINSIZE)
+					tThisChunkSize =
+						REAL_SIZE(MAX_ALLOC_SIZE);
+				else
+					tThisChunkSize = REAL_SIZE(
+						tRemainderSize - MINSIZE);
+			} else {
+				tThisChunkSize = tRemainderSize;
+			}
+			pstRemainChunk->m_tSize =
+				REAL_SIZE(tThisChunkSize) | PREV_INUSE;
+
+			// next chunk
+			pstNextChunk =
+				(MallocChunk *)(((char *)pstRemainChunk) +
+						REAL_SIZE(tThisChunkSize));
+			pstNextChunk->m_tPreSize = REAL_SIZE(tThisChunkSize);
+			pstNextChunk->m_tSize |= PREV_INUSE;
+			/* Mark remainder as inuse so free() won't complain */
+			set_inuse_bit_at_offset(pstRemainChunk, tThisChunkSize);
+			ALLOC_SIZE_T tTmpSize;
+			inter_free(ptr_to_handle(chunk2mem(pstRemainChunk)),
+				   tTmpSize);
+
+			//		tPreChunkSize = tThisChunkSize;
+			tRemainderSize -= tThisChunkSize;
+			pstRemainChunk =
+				(MallocChunk *)(((char *)pstRemainChunk) +
+						REAL_SIZE(tThisChunkSize));
+		} while (tRemainderSize > 0);
+
+		tNewSize = tSize;
+	} else {
+		// next chunk
+		// remainder too small to split: keep the whole chunk and mark
+		// it in use in the following chunk's header
+		pstNextChunk = (MallocChunk *)(((char *)pstNewChunk) +
+					       REAL_SIZE(tNewSize));
+		pstNextChunk->m_tSize |= PREV_INUSE;
+	}
+	// rebuild the header of the resulting chunk
+	pstNewChunk->m_tSize = REAL_SIZE(tNewSize);
+	if (iPreInUse)
+		pstNewChunk->m_tSize |= PREV_INUSE;
+	pstNewChunk->m_tPreSize = tPreSize;
+
+	return ptr_to_handle(chunk2mem(pstNewChunk));
+}
+/*
+ * Public realloc entry point: wraps inter_re_alloc() and keeps the
+ * allocation statistics in step with the outcome.
+ *   - success: the new chunk size is added and the old one subtracted; the
+ *     chunk counters only grow when hHandle was INVALID_HANDLE (i.e. the
+ *     call acted as a plain allocation).
+ *   - INVALID_HANDLE returned with tSize == 0: treated as a free, so the old
+ *     size and one chunk are subtracted.
+ *   - any other failure: statistics are left untouched.
+ */
+ALLOC_HANDLE_T PtMalloc::ReAlloc(ALLOC_HANDLE_T hHandle, ALLOC_SIZE_T tSize)
+{
+	ALLOC_SIZE_T tPrevChunkSize;
+
+	m_pstHead->m_tLastFreeChunkSize = 0;
+	const ALLOC_HANDLE_T hResult =
+		inter_re_alloc(hHandle, tSize, tPrevChunkSize);
+
+	if (hResult == INVALID_HANDLE) {
+		if (tSize == 0) {
+			/* realloc to zero size behaves like a free */
+			m_pstHead->m_tUserAllocSize -= tPrevChunkSize;
+			m_pstHead->m_tUserAllocChunkCnt--;
+			--statChunkTotal;
+			statDataSize = m_pstHead->m_tUserAllocSize;
+		}
+		return hResult;
+	}
+
+	MallocChunk *pstResized =
+		(MallocChunk *)mem2chunk(handle_to_ptr(hResult));
+	m_pstHead->m_tUserAllocSize += CHUNK_SIZE(pstResized);
+	m_pstHead->m_tUserAllocSize -= tPrevChunkSize;
+
+	const bool bWasPlainAlloc = (hHandle == INVALID_HANDLE);
+	if (bWasPlainAlloc) {
+		m_pstHead->m_tUserAllocChunkCnt++;
+		++statChunkTotal;
+	}
+	add_alloc_size_to_stat(tSize);
+	statDataSize = m_pstHead->m_tUserAllocSize;
+
+	return hResult;
+}
+/*
+ * Core logic of free.
+ *
+ * hHandle   user-memory handle of the block to release; INVALID_HANDLE is a
+ *           no-op that returns 0.
+ * tMemSize  out: reset to 0 on entry, then set to the chunk size as soon as
+ *           the chunk header has been read (i.e. before the remaining
+ *           validation steps run).
+ *
+ * Returns 0 on success, otherwise a negative code with err_message_ set:
+ *   -1  handle lies beyond the total memory size
+ *   -2  chunk end lies beyond the total memory size
+ *   -3  chunk is not marked in use
+ *   -4  the chunk following the (possibly merged) chunk is below MINSIZE
+ *   -5  chunk handle is misaligned
+ *   -6  check_inuse_chunk() rejected the chunk
+ * (-5 and -6 used to return INVALID_HANDLE, a handle constant rather than an
+ * error code, from this int-returning function.)
+ *
+ * The chunk is merged with free neighbours where CAN_COMBILE allows it, then
+ * either folded back into top or linked into a small/large bin.
+ */
+int PtMalloc::inter_free(ALLOC_HANDLE_T hHandle, ALLOC_SIZE_T &tMemSize)
+{
+	tMemSize = 0;
+	if (hHandle == INVALID_HANDLE)
+		return (0);
+
+	if (hHandle >= m_pstHead->m_tSize) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "free-invalid handle");
+		return (-1);
+	}
+
+	//	log4cplus_error("FREE: %lu", hHandle);
+
+	MallocChunk *pstChunk;
+	ALLOC_SIZE_T tSize;
+	pstChunk = (MallocChunk *)mem2chunk(handle_to_ptr(hHandle));
+	tSize = CHUNK_SIZE(pstChunk);
+	tMemSize = tSize;
+	// from here on hHandle refers to the chunk header, not the user memory
+	hHandle = ptr_to_handle((void *)pstChunk);
+	if (hHandle + tSize >= m_pstHead->m_tSize) {
+#if __WORDSIZE == 64
+		snprintf(err_message_, sizeof(err_message_),
+			 "free-invalid handle: %lu, size: %u", hHandle, tSize);
+#else
+		snprintf(err_message_, sizeof(err_message_),
+			 "free-invalid handle: %llu, size: %u", hHandle, tSize);
+#endif
+		return (-2);
+	}
+
+	if (!inuse_bit_at_offset(pstChunk, tSize)) {
+#if __WORDSIZE == 64
+		snprintf(
+			err_message_, sizeof(err_message_),
+			"free-memory[handle %lu, size: %u, top: %lu] not in use",
+			hHandle, tSize, m_pstHead->m_hTop);
+#else
+		snprintf(
+			err_message_, sizeof(err_message_),
+			"free-memory[handle %llu, size: %u, top: %llu] not in use",
+			hHandle, tSize, m_pstHead->m_hTop);
+#endif
+		return (-3);
+	}
+
+	if (misaligned_chunk(hHandle)) {
+#if __WORDSIZE == 64
+		snprintf(err_message_, sizeof(err_message_),
+			 "free-invalid handle: %lu, size: %u", hHandle, tSize);
+#else
+		snprintf(err_message_, sizeof(err_message_),
+			 "free-invalid handle: %llu, size: %u", hHandle, tSize);
+#endif
+		// a real error code: callers test the result against 0
+		return (-5);
+	}
+
+	if (check_inuse_chunk(pstChunk) != 0) {
+#if __WORDSIZE == 64
+		snprintf(err_message_, sizeof(err_message_),
+			 "free-invalid chunk: %lu, size: %u", hHandle, tSize);
+#else
+		snprintf(err_message_, sizeof(err_message_),
+			 "free-invalid chunk: %llu, size: %u", hHandle, tSize);
+#endif
+		// a real error code: callers test the result against 0
+		return (-6);
+	}
+
+	unsigned int uiBinIdx;
+	MallocChunk *pstNextChunk;
+
+	// merge with the previous chunk when it is free and the sizes combine
+	if (!prev_inuse(pstChunk) && CAN_COMBILE(tSize, pstChunk->m_tPreSize)) {
+		tSize += pstChunk->m_tPreSize;
+		hHandle -= pstChunk->m_tPreSize;
+		uiBinIdx = bin_index(pstChunk->m_tPreSize);
+		pstChunk = (MallocChunk *)(((char *)pstChunk) -
+					   pstChunk->m_tPreSize);
+		// unlink
+		unlink_bin(m_ptBin[uiBinIdx], ptr_to_handle(pstChunk));
+		if (empty_bin(uiBinIdx))
+			clear_bin_bit_map(uiBinIdx);
+		set_size_at_offset(pstChunk, 0, tSize);
+		set_presize_at_offset(pstChunk, tSize, tSize);
+	}
+
+	// merge with the next chunk unless the chunk already ends at top
+	if ((hHandle + tSize) != m_pstHead->m_hTop) {
+		pstNextChunk = (MallocChunk *)handle_to_ptr(hHandle + tSize);
+		if (CHUNK_SIZE(pstNextChunk) < MINSIZE) {
+			snprintf(err_message_, sizeof(err_message_),
+				 "free-invalid handle: " UINT64FMT_T
+				 ", size: %u",
+				 hHandle, tSize);
+			return (-4);
+		}
+		if (!inuse_bit_at_offset(pstNextChunk,
+					 REAL_SIZE(pstNextChunk->m_tSize)) &&
+		    CAN_COMBILE(tSize, CHUNK_SIZE(pstNextChunk))) {
+			tSize += CHUNK_SIZE(pstNextChunk);
+			uiBinIdx = bin_index(CHUNK_SIZE(pstNextChunk));
+			// unlink
+			unlink_bin(m_ptBin[uiBinIdx],
+				   ptr_to_handle(pstNextChunk));
+			if (empty_bin(uiBinIdx))
+				clear_bin_bit_map(uiBinIdx);
+			set_size_at_offset(pstChunk, 0, tSize);
+			set_presize_at_offset(pstChunk, tSize, tSize);
+		}
+	}
+
+	// write the final size into this chunk and the following chunk's header
+	set_size_at_offset(pstChunk, 0, tSize);
+	set_presize_at_offset(pstChunk, tSize, tSize);
+	set_inuse_bit_at_offset(pstChunk, tSize);
+
+	// remember the largest chunk produced since the tracker was last reset
+	if (m_pstHead->m_tLastFreeChunkSize < tSize)
+		m_pstHead->m_tLastFreeChunkSize = tSize;
+
+	// chunk ends at top: lower top instead of linking the chunk into a bin
+	if ((hHandle + tSize) == m_pstHead->m_hTop) {
+		m_pstHead->m_hTop -= tSize;
+		statMemoryTop = m_pstHead->m_hTop;
+		pstChunk->m_tSize = PREV_INUSE;
+		if (m_pstHead->m_tSize > (m_pstHead->m_hTop + MINSIZE) &&
+		    m_pstHead->m_tLastFreeChunkSize <
+			    m_pstHead->m_tSize - m_pstHead->m_hTop - MINSIZE)
+			m_pstHead->m_tLastFreeChunkSize = m_pstHead->m_tSize -
+							  m_pstHead->m_hTop -
+							  MINSIZE;
+		return (0);
+	}
+
+	clear_inuse_bit_at_offset(pstChunk, tSize);
+
+	// place chunk into bin
+	if (in_smallbin_range(tSize)) {
+		link_bin(m_ptBin[smallbin_index(tSize)],
+			 ptr_to_handle(pstChunk));
+		set_bin_bit_map(smallbin_index(tSize));
+	} else {
+#if 0
+		/* When a bin holds very many nodes, the sorted insert costs a lot of CPU time -- by ada */
+		int iIdx = largebin_index(tSize);
+		link_sorted_bin(m_ptBin[iIdx], ptr_to_handle(pstChunk), tSize);
+#endif
+		link_bin(m_ptBin[largebin_index(tSize)],
+			 ptr_to_handle(pstChunk));
+		set_bin_bit_map(largebin_index(tSize));
+	}
+	//#endif
+
+	return (0);
+}
+/*
+ * Public free entry point: wraps inter_free() and, when a chunk was really
+ * released, removes it from the allocation statistics.
+ *
+ * Returns the result of inter_free(): 0 on success, non-zero on failure.
+ * Freeing INVALID_HANDLE is a successful no-op; inter_free() releases
+ * nothing in that case, so the chunk counters are left untouched (they used
+ * to be decremented anyway).
+ */
+int PtMalloc::Free(ALLOC_HANDLE_T hHandle)
+{
+	int iRet;
+	ALLOC_SIZE_T tSize;
+
+	tSize = 0;
+	iRet = inter_free(hHandle, tSize);
+	if (iRet == 0 && hHandle != INVALID_HANDLE) {
+		m_pstHead->m_tUserAllocSize -= tSize;
+		m_pstHead->m_tUserAllocChunkCnt--;
+		--statChunkTotal;
+		statDataSize = m_pstHead->m_tUserAllocSize;
+	}
+
+	return (iRet);
+}
+/*
+ * Report how much free memory would become available if the chunk behind
+ * hHandle were freed. Nothing is modified apart from err_message_.
+ * Merging with the previous/next chunk can make the result larger than the
+ * chunk itself.
+ *
+ * Returns the size converted with chunksize2memsize(), or 0 when the handle
+ * fails validation (err_message_ is filled in).
+ */
+unsigned PtMalloc::ask_for_destroy_size(ALLOC_HANDLE_T hHandle)
+{
+	//	ALLOC_SIZE_T logic_size = 0;
+	/* all locals are declared up front, ahead of every goto ERROR */
+	ALLOC_SIZE_T physic_size = 0;
+	ALLOC_HANDLE_T physic_handle = 0;
+
+	MallocChunk *current_chunk = 0;
+	MallocChunk *next_chunk = 0;
+
+	if (INVALID_HANDLE == hHandle || hHandle >= m_pstHead->m_tSize)
+		goto ERROR;
+
+	/* physic pointer */
+	current_chunk = (MallocChunk *)mem2chunk(handle_to_ptr(hHandle));
+	physic_size = CHUNK_SIZE(current_chunk);
+	//	logic_size = chunksize2memsize(physic_size);
+	physic_handle = ptr_to_handle((void *)current_chunk);
+
+	/* start error check. */
+	/* overflow */
+	/* NOTE(review): inter_free() rejects ">=" here, this check only ">" - confirm which is intended */
+	if (physic_handle + physic_size > m_pstHead->m_tSize)
+		goto ERROR;
+
+	/* current chunk is not inuse */
+	if (!inuse_bit_at_offset(current_chunk, physic_size))
+		goto ERROR;
+
+	/* not aligned */
+	if (misaligned_chunk(physic_handle))
+		goto ERROR;
+
+	/* consistency checks against the neighbouring chunks */
+	if (0 != check_inuse_chunk(current_chunk))
+		goto ERROR;
+
+	/* try to combine with the previous chunk */
+	if (!prev_inuse(current_chunk) &&
+	    CAN_COMBILE(physic_size, current_chunk->m_tPreSize)) {
+		physic_size += current_chunk->m_tPreSize;
+
+		/* forward handle */
+		physic_handle -= current_chunk->m_tPreSize;
+		current_chunk = (MallocChunk *)((char *)current_chunk -
+						current_chunk->m_tPreSize);
+	}
+
+	/* try to combine with the next chunk */
+	if (physic_handle + physic_size != m_pstHead->m_hTop) {
+		next_chunk = (MallocChunk *)(handle_to_ptr(physic_handle +
+							   physic_size));
+		if (CHUNK_SIZE(next_chunk) < MINSIZE)
+			goto ERROR;
+
+		/* can combine */
+		if (!inuse_bit_at_offset(next_chunk, CHUNK_SIZE(next_chunk)) &&
+		    CAN_COMBILE(physic_size, CHUNK_SIZE(next_chunk))) {
+			physic_size += CHUNK_SIZE(next_chunk);
+		}
+	}
+
+	/* the chunk reaches the top boundary: it joins the unused space above top */
+	if (physic_handle + physic_size == m_pstHead->m_hTop) {
+		/* NOTE(review): computed from the head's INTER_SIZE_T fields but stored in ALLOC_SIZE_T; may truncate if that type is narrower - confirm */
+		ALLOC_SIZE_T physic_free = m_pstHead->m_tSize -
+					   m_pstHead->m_hTop - MINSIZE +
+					   physic_size;
+		/* keep the larger of the two values */
+		physic_size =
+			physic_size < physic_free ? physic_free : physic_size;
+	}
+
+	return chunksize2memsize(physic_size);
+
+ERROR:
+	snprintf(err_message_, sizeof(err_message_),
+		 "found invalid handle, can't destroy");
+	return 0;
+}
+
+/*
+ * Run free_fast() first (presumably releases chunks parked in the fast
+ * bins - confirm), then return the last-free chunk size recorded in the
+ * memory head, converted with chunksize2memsize().
+ */
+ALLOC_SIZE_T PtMalloc::last_free_size()
+{
+	free_fast();
+
+	const INTER_SIZE_T tRecordedChunkSize =
+		m_pstHead->m_tLastFreeChunkSize;
+	return chunksize2memsize(tRecordedChunkSize);
+}
+
+/**************************************************************************
+ * for test
+ * dump all bins and chunks
+ *************************************************************************/
+
+/*
+ * Debug helper: walk and print every bin list - the regular (small & large)
+ * bins, the fast bins and the unsorted bin.
+ *
+ * Each list is followed through m_hNextChunk until INVALID_HANDLE is
+ * reached; the last handle visited must equal the bin's m_hPreChunk.
+ *
+ * Returns 0 when all lists are consistent, -1 for a bad regular bin, -2 for
+ * a bad fast bin, -3 for a bad unsorted bin.
+ */
+int PtMalloc::dump_bins()
+{
+	int i;
+	int count;
+	uint64_t size;
+
+	INTER_HANDLE_T hHandle;
+	MallocChunk *pstChunk;
+	printf("dump bins\n");
+	for (i = 0; i < NBINS; i++) {
+		hHandle = m_ptBin[i].m_hNextChunk;
+		count = 0;
+		size = 0;
+		if (hHandle != INVALID_HANDLE) {
+			do {
+				pstChunk =
+					(MallocChunk *)handle_to_ptr(hHandle);
+				// hHandle stops on the last chunk of the list
+				if (pstChunk->m_hNextChunk != INVALID_HANDLE)
+					hHandle = pstChunk->m_hNextChunk;
+				size += CHUNK_SIZE(pstChunk);
+				++count;
+			} while (pstChunk->m_hNextChunk != INVALID_HANDLE);
+		}
+		if (m_ptBin[i].m_hPreChunk != hHandle) {
+			// newline added to match the other "bad ..." messages
+			printf("bad bin[%d]\n", i);
+			return (-1);
+		}
+		if (count) {
+#if __WORDSIZE == 64
+			printf("bins[%d] chunk num[%d] size[%lu]\n", i, count,
+			       size);
+#else
+			printf("bins[%d] chunk num[%d] size[%llu]\n", i, count,
+			       size);
+#endif
+		}
+	}
+
+	printf("dump fast bins\n");
+	for (i = 0; i < NFASTBINS; i++) {
+		hHandle = m_ptFastBin[i].m_hNextChunk;
+		count = 0;
+		if (hHandle != INVALID_HANDLE) {
+			do {
+				pstChunk =
+					(MallocChunk *)handle_to_ptr(hHandle);
+				if (pstChunk->m_hNextChunk != INVALID_HANDLE)
+					hHandle = pstChunk->m_hNextChunk;
+				++count;
+			} while (pstChunk->m_hNextChunk != INVALID_HANDLE);
+		}
+		if (m_ptFastBin[i].m_hPreChunk != hHandle) {
+			printf("bad fast-bin[%d]\n", i);
+			return (-2);
+		}
+		if (count) {
+			printf("fast bins[%d] chunk num[%d]\n", i, count);
+		}
+	}
+	printf("dump unsorted bins\n");
+	hHandle = m_ptUnsortedBin[0].m_hNextChunk;
+	count = 0;
+	if (hHandle != INVALID_HANDLE) {
+		do {
+			pstChunk = (MallocChunk *)handle_to_ptr(hHandle);
+			printf("%d\n", CHUNK_SIZE(pstChunk));
+			if (pstChunk->m_hNextChunk != INVALID_HANDLE)
+				hHandle = pstChunk->m_hNextChunk;
+			// count every chunk so the summary line below is
+			// correct (it used to always report 0)
+			++count;
+		} while (pstChunk->m_hNextChunk != INVALID_HANDLE);
+	}
+	if (m_ptUnsortedBin[0].m_hPreChunk != hHandle) {
+#if __WORDSIZE == 64
+		printf("bad unsorted-bin[%d] %lu!=%lu\n", 0,
+		       m_ptUnsortedBin[0].m_hPreChunk, hHandle);
+#else
+		printf("bad unsorted-bin[%d] %llu!=%llu\n", 0,
+		       m_ptUnsortedBin[0].m_hPreChunk, hHandle);
+#endif
+		return (-3);
+	}
+	printf("unsorted bins:chunk num[%d]\n", count);
+
+	return (0);
+}
+
+int PtMalloc::dump_mem()
+{
+	INTER_HANDLE_T hHandle;
+	MallocChunk *pstChunk;
+	//	ALLOC_SIZE_T tSize;
+
+	//	tSize = 0;
+	printf("dump_mem\n");
+	hHandle = m_pstHead->m_hBottom;
+	while (hHandle < m_pstHead->m_hTop) {
+		pstChunk = (MallocChunk *)handle_to_ptr(hHandle);
+		printf("%d\t\t%d\n", CHUNK_SIZE(pstChunk),
+		       prev_inuse(pstChunk));
+		hHandle += CHUNK_SIZE(pstChunk);
+	}
+
+	return (0);
+}

+ 395 - 0
src/core/mem/pt_malloc.h

@@ -0,0 +1,395 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef BIN_MALLOC_H
+#define BIN_MALLOC_H
+
+#include <stdint.h>
+#include <stdlib.h>
+#include "namespace.h"
+#include "mallocator.h"
+#include "log/log.h"
+#include "stat_dtc.h"
+
+DTC_BEGIN_NAMESPACE
+
+#define MALLOC_FLAG_FAST 0x1
+
+/*
+  This struct declaration is misleading (but accurate and necessary).
+  It declares a "view" into memory allowing access to necessary
+  fields at known offsets from a given base. See explanation below.
+*/
+
+/* Header view placed at the start of every chunk. */
+typedef struct {
+	ALLOC_SIZE_T m_tPreSize; /* Size of previous chunk (if free).  */
+	ALLOC_SIZE_T m_tSize; /* Size in bytes, including overhead. */
+
+	/* Bin list links, stored as handles rather than raw pointers. */
+	INTER_HANDLE_T m_hPreChunk; /* double links -- used only if free. */
+	INTER_HANDLE_T m_hNextChunk;
+} MallocChunk;
+
+/* Head of one bin list: handles of the chunks at the two ends of the list. */
+typedef struct {
+	INTER_HANDLE_T m_hPreChunk; /* compared against the last chunk reached by dump_bins() */
+	INTER_HANDLE_T m_hNextChunk; /* first chunk; INVALID_HANDLE when the bin is empty */
+} CBin;
+
+/* The smallest possible chunk */
+#define MIN_CHUNK_SIZE (sizeof(MallocChunk))
+
+/* The smallest size we can malloc is an aligned minimal chunk */
+/* (MIN_CHUNK_SIZE rounded up using MALLOC_ALIGN_MASK) */
+#define MINSIZE                                                                \
+	(unsigned long)(((MIN_CHUNK_SIZE + MALLOC_ALIGN_MASK) &                \
+			 ~MALLOC_ALIGN_MASK))
+
+/* Bin layout parameters */
+#define NBINS 128
+#define NSMALLBINS 64
+#define SMALLBIN_WIDTH 8
+#define MIN_LARGE_SIZE 512
+
+/* Signature words; presumably the expected content of MemHead::m_auiSign[14] - confirm against init_sign() */
+#define DTC_SIGN_0 0
+#define DTC_SIGN_1 0x4D635474U
+#define DTC_SIGN_2 1
+#define DTC_SIGN_3 0xFFFFFFFFU
+#define DTC_SIGN_4 0xFFFFFFFFU
+#define DTC_SIGN_5 0xFFFFFFFFU
+#define DTC_SIGN_6 4
+#define DTC_SIGN_7 0
+#define DTC_SIGN_8 16
+#define DTC_SIGN_9 0xFFFFFFFFU
+#define DTC_SIGN_A 0
+#define DTC_SIGN_B 0
+#define DTC_SIGN_C 0xFFFFFFFFU
+#define DTC_SIGN_D 0xFFFFFFFFU
+
+#define DTC_VER_MIN 4 // lowest DTC memory-format version this code understands
+
+/* Size of the zone reserved for the upper-layer application (4K) */
+#define DTC_RESERVE_SIZE (4 * 1024UL)
+
+#define EC_NO_MEM 2041 // 内存不足错误码
+#define EC_KEY_EXIST 2042
+#define EC_KEY_NOT_EXIST 2043
+#define MAXSTATCOUNT 10000 * 3600 * 12
+
+/* Control header of the managed memory block (accessed through PtMalloc::m_pstHead). */
+struct _MemHead {
+	uint32_t m_auiSign[14]; // memory format signature
+	unsigned short m_ushVer; // memory format version number
+	unsigned short m_ushHeadSize; // size of the header
+	INTER_SIZE_T m_tSize; // total size of the memory
+	INTER_SIZE_T m_tUserAllocSize; // memory currently allocated to the upper-layer application
+	INTER_SIZE_T m_tUserAllocChunkCnt; // number of chunks allocated to the upper-layer application
+	uint32_t m_uiFlags; // feature flags
+	INTER_HANDLE_T m_hBottom; // lowest address usable by the upper-layer application
+	INTER_HANDLE_T m_hReserveZone; // address reserved for the upper-layer application
+	INTER_HANDLE_T m_hTop; // highest address handed out so far
+	INTER_SIZE_T m_tLastFreeChunkSize; // size of the merged chunk produced by the most recent free
+	uint16_t m_ushBinCnt; // number of bins
+	uint16_t m_ushFastBinCnt; // number of fast bins
+	uint32_t m_auiBinBitMap[(NBINS - 1) / 32 + 1]; // bitmap of the bins
+	uint32_t m_shmIntegrity; // shared-memory integrity flag
+	char m_achReserv
+		[872]; // reserved (original note: pads CMemHead to 1008 bytes, reaching 4K together with the bins that follow)
+} __attribute__((__aligned__(4)));
+typedef struct _MemHead MemHead;
+
+/*
+ * Resolve `handle` through `mallocter.handle_to_ptr()` and store the result
+ * in `obj_ptr`, cast to obj_ptr's own type (uses the GNU `typeof` extension).
+ */
+#define GET_OBJ(mallocter, handle, obj_ptr)                                    \
+	do {                                                                   \
+		obj_ptr = (typeof(obj_ptr))mallocter.handle_to_ptr(handle);    \
+	} while (0)
+
+/*
+ * Bin-based allocator operating on one contiguous memory block. Blocks are
+ * addressed through handles, i.e. byte offsets from m_pBaseAddr (see
+ * handle_to_ptr() / ptr_to_handle()).
+ */
+class PtMalloc : public MallocBase {
+    private:
+	void *m_pBaseAddr; // base address that handles are offsets from
+	MemHead *m_pstHead; // control header of the managed memory
+	CBin *m_ptBin; // regular (small & large) bins
+	CBin *m_ptFastBin; // fast bins
+	CBin *m_ptUnsortedBin; // unsorted bin(s)
+	char err_message_[200]; // text of the most recent error
+
+	// stat
+	StatCounter statChunkTotal;
+	StatItem statDataSize;
+	StatItem statMemoryTop;
+
+	uint64_t statTmpDataSizeRecently; // sum of the sizes of recent allocations
+	uint64_t statTmpDataAllocCountRecently; // number of recent allocations
+	StatItem statAverageDataSizeRecently;
+	// Feed one allocation size into the running average; once the count
+	// exceeds MAXSTATCOUNT the counters restart and the average is reset
+	// to MINSIZE.
+	inline void add_alloc_size_to_stat(uint64_t size)
+	{
+		if (statTmpDataAllocCountRecently > MAXSTATCOUNT) {
+			statTmpDataSizeRecently = 0;
+			statTmpDataAllocCountRecently = 0;
+			statAverageDataSizeRecently = MINSIZE;
+		} else {
+			statTmpDataSizeRecently += size;
+			statTmpDataAllocCountRecently++;
+			statAverageDataSizeRecently =
+				statTmpDataSizeRecently /
+				statTmpDataAllocCountRecently;
+		}
+	}
+
+	// Minimum chunk size. The value 1 selects the adaptive mode handled in
+	// get_min_chunk_size().
+	unsigned int minChunkSize;
+	// Returns minChunkSize, except in adaptive mode (minChunkSize == 1):
+	// then statDataSize / statChunkTotal, or MINSIZE while statChunkTotal
+	// is <= 0.
+	inline unsigned int get_min_chunk_size(void)
+	{
+		return minChunkSize == 1 ?
+			       ((statChunkTotal <= 0) ?
+					MINSIZE :
+					statDataSize / statChunkTotal) :
+			       minChunkSize;
+	}
+
+    public:
+	// Set the minimum chunk size: 1 is kept as-is (adaptive mode), any
+	// other value is raised to at least MINSIZE.
+	void set_min_chunk_size(unsigned int size)
+	{
+		minChunkSize =
+			size == 1 ? 1 : (size < MINSIZE ? MINSIZE : size);
+	}
+
+    protected:
+	void init_sign();
+
+	void *bin_malloc(CBin &ptBin);
+	void *small_bin_malloc(ALLOC_SIZE_T tSize);
+	void *fast_malloc(ALLOC_SIZE_T tSize);
+	void *top_alloc(ALLOC_SIZE_T tSize);
+	int unlink_bin(CBin &stBin, INTER_HANDLE_T hHandle);
+	int link_bin(CBin &stBin, INTER_HANDLE_T hHandle);
+	int link_sorted_bin(CBin &stBin, INTER_HANDLE_T hHandle,
+			    ALLOC_SIZE_T tSize);
+	int check_inuse_chunk(MallocChunk *pstChunk);
+	int free_fast();
+
+	// Set the bitmap bit of bin uiBinIdx.
+	inline void set_bin_bit_map(unsigned int uiBinIdx)
+	{
+		m_pstHead->m_auiBinBitMap[uiBinIdx / 32] |=
+			(1UL << (uiBinIdx % 32));
+	}
+	// Clear the bitmap bit of bin uiBinIdx.
+	inline void clear_bin_bit_map(unsigned int uiBinIdx)
+	{
+		m_pstHead->m_auiBinBitMap[uiBinIdx / 32] &=
+			(~(1UL << (uiBinIdx % 32)));
+	}
+	// Non-zero when bin uiBinIdx has no first chunk.
+	inline int empty_bin(unsigned int uiBinIdx)
+	{
+		return (m_ptBin[uiBinIdx].m_hNextChunk == INVALID_HANDLE);
+	}
+
+	// Internal allocation primitives; the public Malloc/ReAlloc/Free
+	// wrappers maintain the statistics around them.
+	ALLOC_HANDLE_T inter_malloc(ALLOC_SIZE_T tSize);
+	ALLOC_HANDLE_T inter_re_alloc(ALLOC_HANDLE_T hHandle,
+				      ALLOC_SIZE_T tSize,
+				      ALLOC_SIZE_T &tOldMemSize);
+	int inter_free(ALLOC_HANDLE_T hHandle, ALLOC_SIZE_T &tMemSize);
+
+    public:
+	PtMalloc();
+	~PtMalloc();
+
+	static PtMalloc *instance();
+	static void destroy();
+
+	// Typed convenience wrapper around handle_to_ptr().
+	template <class T> T *Pointer(ALLOC_HANDLE_T hHandle)
+	{
+		return reinterpret_cast<T *>(handle_to_ptr(hHandle));
+	}
+
+	// Convert an address inside the managed memory into a handle.
+	ALLOC_HANDLE_T get_handle(void *p)
+	{
+		return ptr_to_handle(p);
+	}
+
+	// Text of the most recent error.
+	const char *get_err_msg()
+	{
+		return err_message_;
+	}
+	// Read-only access to the memory control header.
+	const MemHead *get_head_info() const
+	{
+		return m_pstHead;
+	}
+
+	/*************************************************
+	  Description:	format the memory block
+	  Input:		pAddr	address of the memory block
+				tSize	size of the memory block
+	  Return:		0 on success, non-zero on failure
+	*************************************************/
+	int do_init(void *pAddr, INTER_SIZE_T tSize);
+
+	/*************************************************
+	  Description:	attach to an already formatted memory block
+	  Input:		pAddr	address of the memory block
+				tSize	size of the memory block
+	  Return:		0 on success, non-zero on failure
+	*************************************************/
+	int do_attach(void *pAddr, INTER_SIZE_T tSize);
+
+	/*************************************************
+	  Description:	detect the DTC version of the memory block
+	  Input:		(none - the function takes no parameters)
+	  Output:
+	  Return:		0 on success, non-zero on failure
+	*************************************************/
+	int detect_version();
+
+	/* shared-memory integrity check interface */
+	int share_memory_integrity();
+	void set_share_memory_integrity(const int flag);
+
+	/*************************************************
+	  Description:	check that the internal bin structures are correct
+	  Input:
+	  Output:
+	  Return:		0 on success, non-zero on failure
+	*************************************************/
+	int check_bin();
+#if BIN_MEM_CHECK
+	int check_mem();
+#endif
+	int dump_bins();
+	int dump_mem();
+
+	/*************************************************
+	  Description:	allocate memory
+	  Input:		tSize	size to allocate
+	  Output:
+	  Return:		handle of the block, INVALID_HANDLE on failure
+	*************************************************/
+	ALLOC_HANDLE_T Malloc(ALLOC_SIZE_T tSize);
+
+	/*************************************************
+	  Description:	allocate memory and initialize it to 0
+	  Input:		tSize	size to allocate
+	  Output:
+	  Return:		handle of the block, INVALID_HANDLE on failure
+	*************************************************/
+	ALLOC_HANDLE_T Calloc(ALLOC_SIZE_T tSize);
+
+	/*************************************************
+	  Description:	reallocate memory
+	  Input:		hHandle	handle of the old block
+				tSize	new size to allocate
+	  Output:
+	  Return:		handle of the block, INVALID_HANDLE on failure
+				(the old block is not released on failure)
+	*************************************************/
+	ALLOC_HANDLE_T ReAlloc(ALLOC_HANDLE_T hHandle, ALLOC_SIZE_T tSize);
+
+	/*************************************************
+	  Description:	release memory
+	  Input:		hHandle	handle of the block
+	  Output:
+	  Return:		0 on success, non-zero on failure
+	*************************************************/
+	int Free(ALLOC_HANDLE_T hHandle);
+
+	/*************************************************
+	  Description:	how much free space releasing this block would yield
+	  Input:		hHandle	handle of the block
+	  Output:
+	  Return:		>0 on success, 0 on failure
+	*************************************************/
+	unsigned ask_for_destroy_size(ALLOC_HANDLE_T hHandle);
+
+	/*************************************************
+	  Description:	get the size of a memory block
+	  Input:		hHandle	handle of the block
+	  Output:
+	  Return:		size of the block
+	*************************************************/
+	ALLOC_SIZE_T chunk_size(ALLOC_HANDLE_T hHandle);
+
+	/*************************************************
+	  Description:	total size of the memory allocated to the user
+	  Input:
+	  Output:
+	  Return:		size in bytes
+	*************************************************/
+	INTER_SIZE_T user_alloc_size()
+	{
+		return m_pstHead->m_tUserAllocSize;
+	}
+
+	/*************************************************
+	  Description:	total size of the managed memory
+	  Input:
+	  Output:
+	  Return:		size in bytes
+	*************************************************/
+	INTER_SIZE_T total_size()
+	{
+		return m_pstHead->m_tSize;
+	}
+
+	/*************************************************
+	  Description:	size of the merged chunk produced by the most
+				recent free
+	  Input:
+	  Output:
+	  Return:		size in bytes
+	*************************************************/
+	ALLOC_SIZE_T last_free_size();
+
+	/*************************************************
+	  Description:	get the block reserved for the upper-layer
+				application (size DTC_RESERVE_SIZE = 4K)
+	  Input:
+	  Output:
+	  Return:		handle of the block
+	*************************************************/
+	ALLOC_HANDLE_T get_reserve_zone();
+
+	/*************************************************
+	  Description:	convert a handle into a memory address
+	  Input:		memory handle
+	  Output:
+	  Return:		memory address; NULL for INVALID_HANDLE (no
+				other validation is performed)
+	*************************************************/
+	inline void *handle_to_ptr(ALLOC_HANDLE_T hHandle)
+	{
+		if (hHandle == INVALID_HANDLE)
+			return (NULL);
+		return (void *)(((char *)m_pBaseAddr) + hHandle);
+	}
+
+	/*************************************************
+	  Description:	convert a memory address into a handle
+	  Input:		memory address
+	  Output:
+	  Return:		memory handle; INVALID_HANDLE when the address
+				lies outside [base, base + total size)
+	*************************************************/
+	inline ALLOC_HANDLE_T ptr_to_handle(void *p)
+	{
+		if ((char *)p < (char *)m_pBaseAddr ||
+		    (char *)p >= ((char *)m_pBaseAddr) + m_pstHead->m_tSize)
+			return INVALID_HANDLE;
+		return (ALLOC_HANDLE_T)(((char *)p) - ((char *)m_pBaseAddr));
+	}
+
+	/*************************************************
+	  Description:	check whether a handle is valid
+	  Input:		memory handle
+	  Output:
+	  Return:		0: valid; -1: invalid
+				(this implementation always returns 0)
+	*************************************************/
+	virtual int handle_is_valid(ALLOC_HANDLE_T mem_handle)
+	{
+		return 0;
+	}
+};
+
+DTC_END_NAMESPACE
+
+#endif

+ 19 - 0
src/core/mem/sys_malloc.cc

@@ -0,0 +1,19 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include "sys_malloc.h"
+
+/* File-scope SysMalloc instance defined by this translation unit. */
+SysMalloc g_stSysMalloc;

+ 191 - 0
src/core/mem/sys_malloc.h

@@ -0,0 +1,191 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef SYS_MALLOC_H
+#define SYS_MALLOC_H
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "namespace.h"
+#include "mallocator.h"
+
+DTC_BEGIN_NAMESPACE
+
+/*
+ * MallocBase implementation backed by the C heap (malloc/calloc/realloc/free).
+ *
+ * A handle is the numeric value of the user address. Every heap block starts
+ * with one ALLOC_SIZE_T holding the requested size, followed by the user area:
+ *
+ *     [ ALLOC_SIZE_T size ][ user data ... ]
+ *                          ^ address encoded by the handle
+ */
+class SysMalloc : public MallocBase {
+    private:
+	/* errno text of the most recent allocation failure ("" if none yet) */
+	char err_message_[200];
+
+	/*
+	 * Start of the heap block (size header included) behind a handle.
+	 * The handle must not be INVALID_HANDLE. Integer casts are used instead
+	 * of arithmetic on a null pointer, which is undefined behaviour.
+	 */
+	static char *block_of(ALLOC_HANDLE_T hHandle)
+	{
+		return reinterpret_cast<char *>(static_cast<uintptr_t>(
+			hHandle - sizeof(ALLOC_SIZE_T)));
+	}
+
+	/*
+	 * Common tail of Malloc/Calloc/ReAlloc: when the heap call failed
+	 * (block == NULL) record the errno text and return INVALID_HANDLE;
+	 * otherwise store tSize in the header and return the handle of the
+	 * user area that follows it.
+	 */
+	ALLOC_HANDLE_T publish_block(void *block, ALLOC_SIZE_T tSize)
+	{
+		if (block == NULL) {
+			/* "%m" expands to strerror(errno) (glibc extension) */
+			snprintf(err_message_, sizeof(err_message_), "%m");
+			return (INVALID_HANDLE);
+		}
+		*(ALLOC_SIZE_T *)block = tSize;
+		return get_handle((void *)((char *)block + sizeof(ALLOC_SIZE_T)));
+	}
+
+    public:
+	SysMalloc()
+	{
+		/* get_err_msg() must yield a valid C string before any failure */
+		err_message_[0] = '\0';
+	}
+	virtual ~SysMalloc()
+	{
+	}
+
+	/* Typed variant of handle_to_ptr(). */
+	template <class T> T *Pointer(ALLOC_HANDLE_T hHandle)
+	{
+		return reinterpret_cast<T *>(handle_to_ptr(hHandle));
+	}
+
+	/* Handle for an address: its numeric value. */
+	ALLOC_HANDLE_T get_handle(void *p)
+	{
+		return (ALLOC_HANDLE_T) reinterpret_cast<uintptr_t>(p);
+	}
+
+	/* Text recorded by the last failed Malloc/Calloc/ReAlloc; "" if none. */
+	const char *get_err_msg()
+	{
+		return err_message_;
+	}
+
+	/*************************************************
+	  Description:	Allocate memory.
+	  Input:		tSize		number of bytes to allocate
+	  Output:		
+	  Return:		handle of the block; INVALID_HANDLE on failure
+	*************************************************/
+	ALLOC_HANDLE_T Malloc(ALLOC_SIZE_T tSize)
+	{
+		return publish_block(malloc(sizeof(ALLOC_SIZE_T) + tSize),
+				     tSize);
+	}
+
+	/*************************************************
+	  Description:	Allocate memory and initialise it to zero.
+	  Input:		tSize		number of bytes to allocate
+	  Output:		
+	  Return:		handle of the block; INVALID_HANDLE on failure
+	*************************************************/
+	ALLOC_HANDLE_T Calloc(ALLOC_SIZE_T tSize)
+	{
+		return publish_block(calloc(1, sizeof(ALLOC_SIZE_T) + tSize),
+				     tSize);
+	}
+
+	/*************************************************
+	  Description:	Reallocate memory.
+	  Input:		hHandle	handle of the old block (INVALID_HANDLE
+					behaves like a fresh allocation)
+				tSize		new size in bytes; 0 frees the old block
+	  Output:		
+	  Return:		handle of the block; INVALID_HANDLE on failure (the
+				old block is not released on failure) or when
+				tSize is 0
+	*************************************************/
+	ALLOC_HANDLE_T ReAlloc(ALLOC_HANDLE_T hHandle, ALLOC_SIZE_T tSize)
+	{
+		char *old = NULL;
+		if (hHandle != INVALID_HANDLE)
+			old = block_of(hHandle);
+
+		if (tSize == 0) {
+			free(old);
+			return (INVALID_HANDLE);
+		}
+		return publish_block(realloc(old, sizeof(ALLOC_SIZE_T) + tSize),
+				     tSize);
+	}
+
+	/*************************************************
+	  Description:	Release memory.
+	  Input:		hHandle	memory handle (INVALID_HANDLE is a no-op)
+	  Output:		
+	  Return:		0 on success, non-zero on failure (always 0 here)
+	*************************************************/
+	int Free(ALLOC_HANDLE_T hHandle)
+	{
+		if (hHandle != INVALID_HANDLE)
+			free(block_of(hHandle));
+		return (0);
+	}
+
+	/*************************************************
+	  Description:	Get the size of a memory block.
+	  Input:		hHandle	memory handle
+	  Output:		
+	  Return:		size recorded at allocation time; 0 for
+				INVALID_HANDLE
+	*************************************************/
+	ALLOC_SIZE_T chunk_size(ALLOC_HANDLE_T hHandle)
+	{
+		if (hHandle == INVALID_HANDLE)
+			return (0);
+
+		return *(ALLOC_SIZE_T *)block_of(hHandle);
+	}
+
+	/*************************************************
+	  Description:	Convert a handle into a memory address.
+	  Input:		hHandle	memory handle
+	  Output:		
+	  Return:		memory address; NULL when the handle is
+				INVALID_HANDLE
+	*************************************************/
+	void *handle_to_ptr(ALLOC_HANDLE_T hHandle)
+	{
+		if (hHandle == INVALID_HANDLE)
+			return (NULL);
+		return reinterpret_cast<void *>(static_cast<uintptr_t>(hHandle));
+	}
+
+	/*************************************************
+	  Description:	Convert a memory address into a handle.
+	  Input:		p	memory address
+	  Output:		
+	  Return:		memory handle (numeric value of p; no range check
+				is performed)
+	*************************************************/
+	ALLOC_HANDLE_T ptr_to_handle(void *p)
+	{
+		return get_handle(p);
+	}
+
+	/* not implemented: always reports 0 */
+	ALLOC_SIZE_T ask_for_destroy_size(ALLOC_HANDLE_T hHandle)
+	{
+		return (ALLOC_SIZE_T)0;
+	}
+
+	/*************************************************
+	  Description:	Check whether a handle is valid.
+	  Input:		mem_handle	memory handle (not inspected here)
+	  Output:		
+	  Return:		0: valid; -1: invalid. This implementation always
+				returns 0.
+	*************************************************/
+	virtual int handle_is_valid(ALLOC_HANDLE_T mem_handle)
+	{
+		return 0;
+	}
+};
+
+extern SysMalloc g_stSysMalloc;
+
+DTC_END_NAMESPACE
+
+#endif

+ 20 - 0
src/core/misc/dtc_code.h

@@ -0,0 +1,20 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef __H_DTC_CODE_H__
+#define __H_DTC_CODE_H__
+
+#endif

+ 699 - 0
src/core/misc/main_supply.cc

@@ -0,0 +1,699 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include "main_supply.h"
+
+extern PollerBase *g_buffer_multi_thread;
+
+extern int init_plugin;
+extern int cache_key;
+extern int g_datasource_mode;
+extern int async_update;
+extern int g_target_new_hash;
+extern int g_hash_changing;
+extern int enable_plugin;
+extern PollerBase *g_datasource_thread;
+extern DataConnectorAskChain *g_data_connector_ask_instance;
+extern ListenerPool *listener;
+extern PluginManager *main_plugin_mgr;
+extern PollerBase *g_hot_backup_thread;
+extern DTCConfig *g_dtc_config;
+extern PollerBase *g_main_thread;
+extern PollerBase *g_remote_thread;
+extern RemoteDtcAskAnswerChain *g_remote_dtc_instance;
+extern ListenerPool *main_listener;
+
+extern BufferProcessAskChain *g_buffer_process_ask_instance;
+extern HotBackupAskChain *g_hot_backup_ask_instance;
+extern BarrierAskAnswerChain *g_buffer_barrier_instance;
+extern KeyRouteAskChain *g_key_route_ask_instance;
+extern BarrierAskAnswerChain *g_connector_barrier_instance;
+extern BufferBypassAskChain *g_buffer_bypass_ask_instance;
+extern AgentHubAskChain *g_agent_hub_ask_instance;
+extern JobHubAskChain *g_job_hub_ask_instance;
+extern BlackHoleAskChain *g_black_hole_ask_instance;
+
+extern void StopTaskExecutor(void);
+
+/*
+ * Obtain the PluginManager singleton and open it with the network mode read
+ * from [cache] PluginNetworkMode (default 0).
+ *
+ * init_plugin is set to 1 only when both steps succeed.
+ * Returns DTC_CODE_SUCCESS or DTC_CODE_FAILED.
+ */
+int plugin_start(void)
+{
+	/* stays 0 unless every step below succeeds */
+	init_plugin = 0;
+
+	main_plugin_mgr = PluginManager::instance();
+	if (!main_plugin_mgr) {
+		log4cplus_error("create PluginManager instance failed.");
+		return DTC_CODE_FAILED;
+	}
+
+	const int network_mode =
+		g_dtc_config->get_int_val("cache", "PluginNetworkMode", 0);
+	if (main_plugin_mgr->open(network_mode) != 0) {
+		log4cplus_error("init plugin manager failed.");
+		return DTC_CODE_FAILED;
+	}
+
+	init_plugin = 1;
+	return DTC_CODE_SUCCESS;
+}
+
+/*
+ * Close the plugin manager and destroy the PluginManager singleton.
+ *
+ * Safe to call when plugin_start() never obtained a manager: close() is
+ * skipped while main_plugin_mgr is NULL instead of dereferencing it.
+ * Always returns DTC_CODE_SUCCESS.
+ */
+int plugin_stop(void)
+{
+	if (main_plugin_mgr != NULL)
+		main_plugin_mgr->close();
+	PluginManager::destory();
+	main_plugin_mgr = NULL;
+	return DTC_CODE_SUCCESS;
+}
+
+/*
+ * Count how many descriptors in the range [0, 1000) are currently open in
+ * this process, by probing each one with fcntl(F_GETFL).
+ */
+int stat_open_fd()
+{
+	const int probe_limit = 1000;
+	int open_fds = 0;
+	int fd = 0;
+
+	while (fd < probe_limit) {
+		/* fcntl fails with -1 on a descriptor that is not open */
+		const bool is_open = (fcntl(fd, F_GETFL, 0) != -1);
+		if (is_open)
+			++open_fds;
+		++fd;
+	}
+	return open_fds;
+}
+
+/*
+ * Read the cache operating mode from the [cache] config section and store it
+ * in the globals g_datasource_mode, async_update and cache_key.
+ *
+ *   DisableDataSource  -> g_datasource_mode (must be one of the DTC_MODE_*
+ *                         values handled below)
+ *   DelayUpdate        -> async_update (0 or 1)
+ *   CacheShmKey        -> cache_key: unset => 0, a number => that key,
+ *                         "none" (case-insensitive) => 0, accepted only in
+ *                         DTC_MODE_DATABASE_ONLY
+ *
+ * Returns DTC_CODE_SUCCESS, or DTC_CODE_FAILED on any invalid value or
+ * invalid combination.
+ */
+int init_cache_mode()
+{
+	g_datasource_mode = g_dtc_config->get_int_val(
+		"cache", "DisableDataSource", DTC_MODE_CACHE_ONLY);
+	switch (g_datasource_mode) {
+	case DTC_MODE_DATABASE_ADDITION:
+		log4cplus_info("dtc datasource mode: %s(%d)",
+			       "DTC_MODE_DATABASE_ADDITION", g_datasource_mode);
+		break;
+	case DTC_MODE_CACHE_ONLY:
+		log4cplus_info("dtc datasource mode: %s(%d)",
+			       "DTC_MODE_CACHE_ONLY", g_datasource_mode);
+		break;
+	case DTC_MODE_DATABASE_ONLY:
+		log4cplus_info("dtc datasource mode: %s(%d)",
+			       "DTC_MODE_DATABASE_ONLY", g_datasource_mode);
+		break;
+	default:
+		log4cplus_error("datasource config error: %d",
+				g_datasource_mode);
+		return DTC_CODE_FAILED;
+	}
+
+	async_update = g_dtc_config->get_int_val("cache", "DelayUpdate", 0);
+	if (async_update < 0 || async_update > 1) {
+		log4cplus_error("Invalid DelayUpdate value");
+		return DTC_CODE_FAILED;
+	}
+
+	const char *keyStr = g_dtc_config->get_str_val("cache", "CacheShmKey");
+	if (keyStr == NULL) {
+		cache_key = 0;
+	} else if (!strcasecmp(keyStr, "none")) {
+		/*
+		 * "none" is only meaningful without a cache. It used to fall
+		 * through to the numeric check in DTC_MODE_DATABASE_ONLY and was
+		 * rejected as an invalid value there as well.
+		 */
+		if (g_datasource_mode != DTC_MODE_DATABASE_ONLY) {
+			log4cplus_error(
+				"Can not set DisableDataSource=(DTC_MODE_CACHE_ONLY|DTC_MODE_DATABASE_ADDITION) and CacheShmKey=NONE together.");
+			return DTC_CODE_FAILED;
+		}
+		cache_key = 0;
+	} else if (isdigit((unsigned char)keyStr[0])) {
+		/* base 0: accepts decimal, octal (0...) and hex (0x...) keys */
+		cache_key = strtol(keyStr, NULL, 0);
+	} else {
+		log4cplus_error("Invalid CacheShmKey value \"%s\"", keyStr);
+		return DTC_CODE_FAILED;
+	}
+
+	if (g_datasource_mode == DTC_MODE_DATABASE_ONLY && async_update) {
+		log4cplus_error(
+			"can't DelayUpdate when CacheShmKey set to NONE");
+		return DTC_CODE_FAILED;
+	}
+
+	if (g_datasource_mode != DTC_MODE_DATABASE_ONLY && cache_key == 0)
+		log4cplus_info("CacheShmKey not set, cache data is volatile");
+
+	if (g_datasource_mode == DTC_MODE_CACHE_ONLY)
+		log4cplus_info("disable data source, cache data is volatile");
+
+	return DTC_CODE_SUCCESS;
+}
+
+/*
+ * Create the main poller thread object ("dtc-thread-main") in g_main_thread
+ * and initialise it. Returns DTC_CODE_SUCCESS or DTC_CODE_FAILED.
+ */
+int init_main_chain_thread()
+{
+	g_main_thread = new PollerBase("dtc-thread-main");
+
+	const bool created = (g_main_thread != NULL);
+	if (!created || g_main_thread->initialize_thread() == DTC_CODE_FAILED)
+		return DTC_CODE_FAILED;
+
+	return DTC_CODE_SUCCESS;
+}
+
+/*
+ * Create the hot-backup poller thread ("dtc-thread-hotbackup") and the
+ * HotBackupAskChain bound to it, initialise the thread, then initialise the
+ * chain with the binlog limits read from [cache] BinlogTotalSize and
+ * BinlogOneSize.
+ *
+ * Returns DTC_CODE_SUCCESS or DTC_CODE_FAILED.
+ */
+int init_hotbackup_chain_thread()
+{
+	log4cplus_debug("StartHotbackThread begin");
+
+	g_hot_backup_thread = new PollerBase("dtc-thread-hotbackup");
+	g_hot_backup_ask_instance = new HotBackupAskChain(g_hot_backup_thread);
+	if (!g_hot_backup_thread || !g_hot_backup_ask_instance) {
+		log4cplus_error(
+			"hot backup thread or instance created failed.");
+		return DTC_CODE_FAILED;
+	}
+
+	if (g_hot_backup_thread->initialize_thread() == DTC_CODE_FAILED) {
+		log4cplus_error("init hotback thread fail");
+		return DTC_CODE_FAILED;
+	}
+
+	/* both sizes are read with 'M' as the size-unit argument */
+	const bool chain_ready =
+		g_hot_backup_ask_instance->do_init(
+			g_dtc_config->get_size_val("cache", "BinlogTotalSize",
+						   BINLOG_MAX_TOTAL_SIZE, 'M'),
+			g_dtc_config->get_size_val("cache", "BinlogOneSize",
+						   BINLOG_MAX_SIZE, 'M')) != -1;
+	if (!chain_ready) {
+		log4cplus_error("hotbackProcess init fail");
+		return DTC_CODE_FAILED;
+	}
+
+	log4cplus_debug("StartHotbackThread end");
+	return DTC_CODE_SUCCESS;
+}
+
+/*
+ * Create the dedicated cache poller thread ("dtc-multi-thread-cache") and the
+ * BufferProcessAskChain bound to it, configure the chain from the [cache]
+ * config section and open the cache memory.
+ *
+ * Returns DTC_CODE_SUCCESS, or DTC_CODE_FAILED at the first step that fails.
+ */
+int init_buffer_process_ask_chain_thread()
+{
+	/* routine progress message, so it is not reported at error level */
+	log4cplus_debug("init_buffer_process_ask_chain_thread start");
+	g_buffer_multi_thread = new PollerBase("dtc-multi-thread-cache");
+	g_buffer_process_ask_instance = new BufferProcessAskChain(
+		g_buffer_multi_thread,
+		TableDefinitionManager::instance()->get_cur_table_def(),
+		async_update ? MODE_ASYNC : MODE_SYNC);
+	g_buffer_process_ask_instance->set_limit_node_size(
+		g_dtc_config->get_int_val("cache", "LimitNodeSize",
+					  100 * 1024 * 1024));
+	g_buffer_process_ask_instance->set_limit_node_rows(
+		g_dtc_config->get_int_val("cache", "LimitNodeRows", 0));
+	g_buffer_process_ask_instance->set_limit_empty_nodes(
+		g_dtc_config->get_int_val("cache", "LimitEmptyNodes", 0));
+
+	if (g_buffer_multi_thread->initialize_thread() == DTC_CODE_FAILED) {
+		return DTC_CODE_FAILED;
+	}
+
+	/* cache size: must exceed 50M and fit the address space */
+	unsigned long long cache_size =
+		g_dtc_config->get_size_val("cache", "CacheMemorySize", 0, 'M');
+	if (cache_size <= (50ULL << 20)) // 50M
+	{
+		log4cplus_error("CacheMemorySize too small");
+		return DTC_CODE_FAILED;
+	} else if (sizeof(long) == 4 && cache_size >= 4000000000ULL) {
+		/*
+		 * Abort here: continuing would skip
+		 * set_buffer_size_and_version() and open the buffer without a
+		 * configured size.
+		 */
+		log4cplus_error("CacheMemorySize %llu too large", cache_size);
+		return DTC_CODE_FAILED;
+	} else if (g_buffer_process_ask_instance->set_buffer_size_and_version(
+			   cache_size,
+			   g_dtc_config->get_int_val("cache", "CacheShmVersion",
+						     4)) == DTC_CODE_FAILED) {
+		return DTC_CODE_FAILED;
+	}
+
+	/* disable async transaction log */
+	g_buffer_process_ask_instance->disable_async_log(1);
+
+	/* settings that only apply when there is no data source behind the cache */
+	int lruLevel =
+		g_dtc_config->get_int_val("cache", "disable_lru_update", 0);
+	if (g_datasource_mode == DTC_MODE_CACHE_ONLY) {
+		if (g_buffer_process_ask_instance->enable_no_db_mode() < 0) {
+			return DTC_CODE_FAILED;
+		}
+		if (g_dtc_config->get_int_val("cache", "disable_auto_purge",
+					      0) > 0) {
+			g_buffer_process_ask_instance->disable_auto_purge();
+			// lruLevel = 3; /* LRU_WRITE */
+		}
+		int autoPurgeAlertTime = g_dtc_config->get_int_val(
+			"cache", "AutoPurgeAlertTime", 0);
+		g_buffer_process_ask_instance->set_date_expire_alert_time(
+			autoPurgeAlertTime);
+		if (autoPurgeAlertTime > 0 &&
+		    TableDefinitionManager::instance()
+				    ->get_cur_table_def()
+				    ->lastcmod_field_id() <= 0) {
+			log4cplus_error(
+				"Can't start AutoPurgeAlert without lastcmod field");
+			return DTC_CODE_FAILED;
+		}
+	}
+	g_buffer_process_ask_instance->disable_lru_update(lruLevel);
+	g_buffer_process_ask_instance->enable_lossy_data_source(
+		g_dtc_config->get_int_val("cache", "LossyDataSource", 0));
+
+	/* delayed update is refused when no shm key is configured (cache_key == 0) */
+	if (async_update != MODE_SYNC && cache_key == 0) {
+		log4cplus_error(
+			"Anonymous shared memory don't support DelayUpdate");
+		return DTC_CODE_FAILED;
+	}
+
+	int iAutoDeleteDirtyShm = g_dtc_config->get_int_val(
+		"cache", "AutoDeleteDirtyShareMemory", 0);
+	/*disable empty node filter*/
+	if (g_buffer_process_ask_instance->open_init_buffer(
+		    cache_key, 0, iAutoDeleteDirtyShm) == DTC_CODE_FAILED) {
+		return DTC_CODE_FAILED;
+	}
+
+	if (g_buffer_process_ask_instance->update_mode() ||
+	    g_buffer_process_ask_instance->is_mem_dirty()) // asyncUpdate active
+	{
+		if (TableDefinitionManager::instance()
+			    ->get_cur_table_def()
+			    ->uniq_fields() < 1) {
+			log4cplus_error("DelayUpdate needs uniq-field(s)");
+			return DTC_CODE_FAILED;
+		}
+
+		if (g_datasource_mode == DTC_MODE_CACHE_ONLY) {
+			/* neither async mode nor dirty memory is allowed without a data source */
+			if (g_buffer_process_ask_instance->update_mode()) {
+				log4cplus_error(
+					"Can't start async mode when disableDataSource.");
+				return DTC_CODE_FAILED;
+			} else {
+				log4cplus_error(
+					"Can't start disableDataSource with shm dirty,please flush async shm to db first or delete shm");
+				return DTC_CODE_FAILED;
+			}
+		} else {
+			if ((TableDefinitionManager::instance()
+				     ->get_cur_table_def()
+				     ->compress_field_id() >= 0)) {
+				log4cplus_error(
+					"sorry,DTC just support compress in disableDataSource mode now.");
+				return DTC_CODE_FAILED;
+			}
+		}
+
+		/* marker is the only source of flush speed calculation, inc precision to 10 */
+		g_buffer_process_ask_instance->set_flush_parameter(
+			g_dtc_config->get_int_val("cache", "MarkerPrecision",
+						  10),
+			g_dtc_config->get_int_val("cache", "MaxFlushSpeed", 1),
+			g_dtc_config->get_int_val("cache", "MinDirtyTime",
+						  3600),
+			g_dtc_config->get_int_val("cache", "MaxDirtyTime",
+						  43200));
+
+		g_buffer_process_ask_instance->set_drop_count(
+			g_dtc_config->get_int_val("cache", "MaxDropCount",
+						  1000));
+	} else {
+		/* NOTE(review): assumes g_data_connector_ask_instance was created earlier - confirm */
+		if (g_datasource_mode == DTC_MODE_DATABASE_ADDITION)
+			g_data_connector_ask_instance->disable_commit_group();
+	}
+
+	if (g_buffer_process_ask_instance->set_insert_order(dbConfig->ordIns) <
+	    0)
+		return DTC_CODE_FAILED;
+
+	log4cplus_debug("init_buffer_process_ask_chain_thread end");
+
+	return DTC_CODE_SUCCESS;
+}
+
+/*
+ * Hand a reloaded database configuration to the data connector chain.
+ *
+ * Nothing is done (success) in DTC_MODE_CACHE_ONLY. Otherwise fails when the
+ * data connector chain does not exist, when dbconfig is NULL, or when
+ * renew_config() reports an error (non-zero).
+ */
+int collect_load_config(DbConfig *dbconfig)
+{
+	if (g_datasource_mode == DTC_MODE_CACHE_ONLY)
+		return DTC_CODE_SUCCESS;
+
+	if (g_data_connector_ask_instance == NULL)
+		return DTC_CODE_FAILED;
+
+	if (!dbconfig) {
+		log4cplus_error("dbconfig == NULL");
+		return DTC_CODE_FAILED;
+	}
+
+	if (!g_data_connector_ask_instance->renew_config(dbconfig))
+		return DTC_CODE_SUCCESS;
+
+	log4cplus_error("helperunit renew config error!");
+	return DTC_CODE_FAILED;
+}
+
+/*
+ * Create the remote poller thread ("dtc-thread-remote") and the
+ * RemoteDtcAskAnswerChain bound to it, initialise the thread, then attach
+ * the helper timer lists (timeout / connect / retry) taken from that thread.
+ *
+ * Returns DTC_CODE_SUCCESS or DTC_CODE_FAILED.
+ */
+int init_remote_dtc_chain_thread()
+{
+	log4cplus_debug("init_remote_dtc_chain_thread begin");
+
+	g_remote_thread = new PollerBase("dtc-thread-remote");
+	const int helpers_per_group =
+		g_dtc_config->get_int_val("cache", "HelperCountPerGroup", 16);
+	g_remote_dtc_instance =
+		new RemoteDtcAskAnswerChain(g_remote_thread, helpers_per_group);
+
+	if (g_remote_thread->initialize_thread() == DTC_CODE_FAILED) {
+		log4cplus_error("init remote thread error");
+		return DTC_CODE_FAILED;
+	}
+
+	/* helper timing settings from the [cache] section */
+	const int helper_timeout =
+		g_dtc_config->get_int_val("cache", "HelperTimeout", 30);
+	const int retry_timeout =
+		g_dtc_config->get_int_val("cache", "HelperRetryTimeout", 1);
+	const int connect_timeout =
+		g_dtc_config->get_int_val("cache", "HelperConnectTimeout", 10);
+
+	/* argument order: timeout list, connect list, retry list */
+	g_remote_dtc_instance->set_timer_handler(
+		g_remote_thread->get_timer_list(helper_timeout),
+		g_remote_thread->get_timer_list(connect_timeout),
+		g_remote_thread->get_timer_list(retry_timeout));
+
+	log4cplus_debug("init_remote_dtc_chain_thread end");
+	return DTC_CODE_SUCCESS;
+}
+
+/*
+ * Create the RemoteDtcAskAnswerChain on an existing poller thread and attach
+ * the helper timer lists (timeout / connect / retry) taken from that thread.
+ *
+ * thread: poller that owns the chain and its timers; it is dereferenced
+ *         without a NULL check.
+ * Always returns DTC_CODE_SUCCESS.
+ */
+int init_remote_dtc_chain(PollerBase *thread)
+{
+	log4cplus_debug("init_remote_dtc_chain begin");
+
+	const int helpers_per_group =
+		g_dtc_config->get_int_val("cache", "HelperCountPerGroup", 16);
+	g_remote_dtc_instance =
+		new RemoteDtcAskAnswerChain(thread, helpers_per_group);
+
+	/* helper timing settings from the [cache] section */
+	const int helper_timeout =
+		g_dtc_config->get_int_val("cache", "HelperTimeout", 30);
+	const int retry_timeout =
+		g_dtc_config->get_int_val("cache", "HelperRetryTimeout", 1);
+	const int connect_timeout =
+		g_dtc_config->get_int_val("cache", "HelperConnectTimeout", 10);
+
+	/* argument order: timeout list, connect list, retry list */
+	g_remote_dtc_instance->set_timer_handler(
+		thread->get_timer_list(helper_timeout),
+		thread->get_timer_list(connect_timeout),
+		thread->get_timer_list(retry_timeout));
+
+	log4cplus_debug("init_remote_dtc_chain end");
+	return DTC_CODE_SUCCESS;
+}
+/*
+ * Create the data-source poller thread ("dtc-thread-datasource"). In
+ * DTC_MODE_DATABASE_ADDITION also create the DataConnectorAskChain, load its
+ * configuration and attach the helper timer lists of the new thread.
+ *
+ * Returns DTC_CODE_SUCCESS or DTC_CODE_FAILED.
+ */
+int init_data_connector_chain_thread()
+{
+	log4cplus_debug("init_data_connector_chain_thread begin");
+
+	/* the connector chain only exists when a database backs the cache */
+	const bool with_database =
+		(g_datasource_mode == DTC_MODE_DATABASE_ADDITION);
+
+	if (with_database) {
+		g_data_connector_ask_instance = new DataConnectorAskChain();
+		if (g_data_connector_ask_instance->load_config(
+			    dbConfig, TableDefinitionManager::instance()
+					      ->get_cur_table_def()
+					      ->key_format()) == -1) {
+			return DTC_CODE_FAILED;
+		}
+	}
+
+	/* helper timing settings from the [cache] section */
+	const int helper_timeout =
+		g_dtc_config->get_int_val("cache", "HelperTimeout", 30);
+	const int retry_timeout =
+		g_dtc_config->get_int_val("cache", "HelperRetryTimeout", 1);
+	const int connect_timeout =
+		g_dtc_config->get_int_val("cache", "HelperConnectTimeout", 10);
+
+	g_datasource_thread = new PollerBase("dtc-thread-datasource");
+	if (g_datasource_thread->initialize_thread() == DTC_CODE_FAILED)
+		return DTC_CODE_FAILED;
+
+	if (with_database) {
+		/* argument order: timeout list, connect list, retry list */
+		g_data_connector_ask_instance->set_timer_handler(
+			g_datasource_thread->get_timer_list(helper_timeout),
+			g_datasource_thread->get_timer_list(connect_timeout),
+			g_datasource_thread->get_timer_list(retry_timeout));
+	}
+
+	log4cplus_debug("init_data_connector_chain_thread end");
+	return DTC_CODE_SUCCESS;
+}
+
+/*
+ * Create the BufferProcessAskChain on an existing poller thread, configure
+ * it from the [cache] config section and open the cache memory.
+ *
+ * thread: poller the chain is bound to.
+ * Returns DTC_CODE_SUCCESS, or DTC_CODE_FAILED at the first step that fails.
+ */
+int init_buffer_process_ask_chain(PollerBase *thread)
+{
+	/* routine progress message, so it is not reported at error level */
+	log4cplus_debug("init_buffer_process_ask_chain start");
+	g_buffer_process_ask_instance = new BufferProcessAskChain(
+		thread, TableDefinitionManager::instance()->get_cur_table_def(),
+		async_update ? MODE_ASYNC : MODE_SYNC);
+	g_buffer_process_ask_instance->set_limit_node_size(
+		g_dtc_config->get_int_val("cache", "LimitNodeSize",
+					  100 * 1024 * 1024));
+	g_buffer_process_ask_instance->set_limit_node_rows(
+		g_dtc_config->get_int_val("cache", "LimitNodeRows", 0));
+	g_buffer_process_ask_instance->set_limit_empty_nodes(
+		g_dtc_config->get_int_val("cache", "LimitEmptyNodes", 0));
+
+	/* cache size: must exceed 50M and fit the address space */
+	unsigned long long cache_size =
+		g_dtc_config->get_size_val("cache", "CacheMemorySize", 0, 'M');
+	if (cache_size <= (50ULL << 20)) // 50M
+	{
+		log4cplus_error("CacheMemorySize too small");
+		return DTC_CODE_FAILED;
+	} else if (sizeof(long) == 4 && cache_size >= 4000000000ULL) {
+		/*
+		 * Abort here: continuing would skip
+		 * set_buffer_size_and_version() and open the buffer without a
+		 * configured size.
+		 */
+		log4cplus_error("CacheMemorySize %llu too large", cache_size);
+		return DTC_CODE_FAILED;
+	} else if (g_buffer_process_ask_instance->set_buffer_size_and_version(
+			   cache_size,
+			   g_dtc_config->get_int_val("cache", "CacheShmVersion",
+						     4)) == DTC_CODE_FAILED) {
+		return DTC_CODE_FAILED;
+	}
+
+	/* disable async transaction log */
+	g_buffer_process_ask_instance->disable_async_log(1);
+
+	/* settings that only apply when there is no data source behind the cache */
+	int lruLevel =
+		g_dtc_config->get_int_val("cache", "disable_lru_update", 0);
+	if (g_datasource_mode == DTC_MODE_CACHE_ONLY) {
+		if (g_buffer_process_ask_instance->enable_no_db_mode() < 0) {
+			return DTC_CODE_FAILED;
+		}
+		if (g_dtc_config->get_int_val("cache", "disable_auto_purge",
+					      0) > 0) {
+			g_buffer_process_ask_instance->disable_auto_purge();
+			// lruLevel = 3; /* LRU_WRITE */
+		}
+		int autoPurgeAlertTime = g_dtc_config->get_int_val(
+			"cache", "AutoPurgeAlertTime", 0);
+		g_buffer_process_ask_instance->set_date_expire_alert_time(
+			autoPurgeAlertTime);
+		if (autoPurgeAlertTime > 0 &&
+		    TableDefinitionManager::instance()
+				    ->get_cur_table_def()
+				    ->lastcmod_field_id() <= 0) {
+			log4cplus_error(
+				"Can't start AutoPurgeAlert without lastcmod field");
+			return DTC_CODE_FAILED;
+		}
+	}
+	g_buffer_process_ask_instance->disable_lru_update(lruLevel);
+	g_buffer_process_ask_instance->enable_lossy_data_source(
+		g_dtc_config->get_int_val("cache", "LossyDataSource", 0));
+
+	/* delayed update is refused when no shm key is configured (cache_key == 0) */
+	if (async_update != MODE_SYNC && cache_key == 0) {
+		log4cplus_error(
+			"Anonymous shared memory don't support DelayUpdate");
+		return DTC_CODE_FAILED;
+	}
+
+	int iAutoDeleteDirtyShm = g_dtc_config->get_int_val(
+		"cache", "AutoDeleteDirtyShareMemory", 0);
+	/*disable empty node filter*/
+	if (g_buffer_process_ask_instance->open_init_buffer(
+		    cache_key, 0, iAutoDeleteDirtyShm) == DTC_CODE_FAILED) {
+		return DTC_CODE_FAILED;
+	}
+
+	if (g_buffer_process_ask_instance->update_mode() ||
+	    g_buffer_process_ask_instance->is_mem_dirty()) // asyncUpdate active
+	{
+		if (TableDefinitionManager::instance()
+			    ->get_cur_table_def()
+			    ->uniq_fields() < 1) {
+			log4cplus_error("DelayUpdate needs uniq-field(s)");
+			return DTC_CODE_FAILED;
+		}
+
+		switch (g_datasource_mode) {
+		case DTC_MODE_CACHE_ONLY:
+			/* neither async mode nor dirty memory is allowed without a data source */
+			if (g_buffer_process_ask_instance->update_mode()) {
+				log4cplus_error(
+					"Can't start async mode when disableDataSource.");
+				return DTC_CODE_FAILED;
+			} else {
+				log4cplus_error(
+					"Can't start disableDataSource with shm dirty,please flush async shm to db first or delete shm");
+				return DTC_CODE_FAILED;
+			}
+			break;
+		case DTC_MODE_DATABASE_ADDITION:
+			if ((TableDefinitionManager::instance()
+				     ->get_cur_table_def()
+				     ->compress_field_id() >= 0)) {
+				log4cplus_error(
+					"sorry,DTC just support compress in disableDataSource mode now.");
+				return DTC_CODE_FAILED;
+			}
+			break;
+		default:
+			log4cplus_error("datasource mode error:%d",
+					g_datasource_mode);
+			return DTC_CODE_FAILED;
+		}
+
+		/* marker is the only source of flush speed calculation, inc precision to 10 */
+		g_buffer_process_ask_instance->set_flush_parameter(
+			g_dtc_config->get_int_val("cache", "MarkerPrecision",
+						  10),
+			g_dtc_config->get_int_val("cache", "MaxFlushSpeed", 1),
+			g_dtc_config->get_int_val("cache", "MinDirtyTime",
+						  3600),
+			g_dtc_config->get_int_val("cache", "MaxDirtyTime",
+						  43200));
+
+		g_buffer_process_ask_instance->set_drop_count(
+			g_dtc_config->get_int_val("cache", "MaxDropCount",
+						  1000));
+	} else {
+		/* NOTE(review): assumes g_data_connector_ask_instance was created earlier - confirm */
+		if (g_datasource_mode == DTC_MODE_DATABASE_ADDITION)
+			g_data_connector_ask_instance->disable_commit_group();
+	}
+
+	if (g_buffer_process_ask_instance->set_insert_order(dbConfig->ordIns) <
+	    0)
+		return DTC_CODE_FAILED;
+
+	log4cplus_debug("init_buffer_process_ask_chain end");
+
+	return DTC_CODE_SUCCESS;
+}
+
+/*
+ * Create the DataConnectorAskChain, load its configuration, attach the
+ * helper timer lists of the given poller thread and attach the chain to that
+ * thread. In DTC_MODE_DATABASE_ONLY the commit group is disabled as well.
+ *
+ * thread: poller that supplies the timers and hosts the chain.
+ * Returns DTC_CODE_SUCCESS, or DTC_CODE_FAILED when load_config() fails.
+ */
+int init_data_connector_ask_chain(PollerBase *thread)
+{
+	log4cplus_debug("init_data_connector_ask_chain begin");
+
+	g_data_connector_ask_instance = new DataConnectorAskChain();
+	const bool config_loaded =
+		g_data_connector_ask_instance->load_config(
+			dbConfig, TableDefinitionManager::instance()
+					  ->get_cur_table_def()
+					  ->key_format()) != -1;
+	if (!config_loaded)
+		return DTC_CODE_FAILED;
+
+	/* helper timing settings from the [cache] section */
+	const int helper_timeout =
+		g_dtc_config->get_int_val("cache", "HelperTimeout", 30);
+	const int retry_timeout =
+		g_dtc_config->get_int_val("cache", "HelperRetryTimeout", 1);
+	const int connect_timeout =
+		g_dtc_config->get_int_val("cache", "HelperConnectTimeout", 10);
+
+	/* argument order: timeout list, connect list, retry list */
+	g_data_connector_ask_instance->set_timer_handler(
+		thread->get_timer_list(helper_timeout),
+		thread->get_timer_list(connect_timeout),
+		thread->get_timer_list(retry_timeout));
+
+	g_data_connector_ask_instance->do_attach(thread);
+	if (g_datasource_mode == DTC_MODE_DATABASE_ONLY)
+		g_data_connector_ask_instance->disable_commit_group();
+
+	log4cplus_debug("init_data_connector_ask_chain end");
+	return DTC_CODE_SUCCESS;
+}
+// Fetch basic settings from the [cache] config section and store them in the
+// corresponding globals; also creates the ../stat and ../data directories.
+// Always returns DTC_CODE_SUCCESS.
+int init_config_info()
+{
+	/* best effort: the results of mkdir are not checked */
+	mkdir("../stat", 0777);
+	mkdir("../data", 0777);
+
+	g_hash_changing = g_dtc_config->get_int_val("cache", "HashChanging", 0);
+	g_target_new_hash =
+		g_dtc_config->get_int_val("cache", "TargetNewHash", 0);
+
+	/*
+	 * Any value outside (1, 4096) is replaced by 1.
+	 * NOTE(review): values >= 4096 fall back to 1 rather than being capped
+	 * at the upper bound - confirm this is intended.
+	 */
+	DTCGlobal::pre_alloc_nodegroup_count =
+		g_dtc_config->get_int_val("cache", "PreAllocNGNum", 1024);
+	if (DTCGlobal::pre_alloc_nodegroup_count <= 1 ||
+	    DTCGlobal::pre_alloc_nodegroup_count >= (1 << 12)) {
+		DTCGlobal::pre_alloc_nodegroup_count = 1;
+	}
+
+	/* negative chunk sizes are treated as 0 */
+	DTCGlobal::min_chunk_size_ =
+		g_dtc_config->get_int_val("cache", "MinChunkSize", 0);
+	if (DTCGlobal::min_chunk_size_ < 0)
+		DTCGlobal::min_chunk_size_ = 0;
+
+	/* clamp to the range [0, 10000] */
+	DTCGlobal::pre_purge_nodes_ =
+		g_dtc_config->get_int_val("cache", "pre_purge_nodes", 0);
+	if (DTCGlobal::pre_purge_nodes_ < 0)
+		DTCGlobal::pre_purge_nodes_ = 0;
+	else if (DTCGlobal::pre_purge_nodes_ > 10000)
+		DTCGlobal::pre_purge_nodes_ = 10000;
+
+	RELATIVE_HOUR_CALCULATOR->set_base_hour(
+		g_dtc_config->get_int_val("cache", "RelativeYear", 2014));
+
+	/* the field count is logged as num_fields() + 1 */
+	log4cplus_info("Table %s: key/field# %d/%d, keysize %d",
+		       dbConfig->tblName,
+		       TableDefinitionManager::instance()
+			       ->get_cur_table_def()
+			       ->key_fields(),
+		       TableDefinitionManager::instance()
+				       ->get_cur_table_def()
+				       ->num_fields() +
+			       1,
+		       TableDefinitionManager::instance()
+			       ->get_cur_table_def()
+			       ->max_key_size());
+
+	return DTC_CODE_SUCCESS;
+}
+
+/*
+ * Tear down everything created by the init_* functions of this file.
+ *
+ * Order: stop plugins, release the main listener, interrupt every poller
+ * thread that exists, stop the task executor, release the chain instances,
+ * release the poller threads, stop the statistics background thread, then
+ * log the end banner and run the daemon cleanup.
+ *
+ * NOTE(review): DELETE is a project macro not visible here; presumably it
+ * deletes the object and resets the pointer - confirm.
+ */
+void free_all_resource()
+{
+	//stop plugin
+	/* only when plugins are enabled and plugin_start() completed */
+	if (enable_plugin && init_plugin) {
+		plugin_stop();
+	}
+
+	DELETE(main_listener);
+
+	/* interrupt each poller thread that was created */
+	if (g_buffer_multi_thread) {
+		g_buffer_multi_thread->interrupt();
+	}
+	if (g_hot_backup_thread) {
+		g_hot_backup_thread->interrupt();
+	}
+	if (g_datasource_thread) {
+		g_datasource_thread->interrupt();
+	}
+
+	if (g_remote_thread) {
+		g_remote_thread->interrupt();
+	}
+
+	if (g_main_thread) {
+		g_main_thread->interrupt();
+	}
+
+	StopTaskExecutor();
+
+	/* chain instances are released before the threads they were bound to */
+	DELETE(g_buffer_process_ask_instance);
+	DELETE(g_data_connector_ask_instance);
+	DELETE(g_buffer_barrier_instance);
+	DELETE(g_key_route_ask_instance);
+	DELETE(g_connector_barrier_instance);
+	DELETE(g_buffer_bypass_ask_instance);
+	DELETE(g_hot_backup_ask_instance);
+	DELETE(g_remote_dtc_instance);
+	DELETE(g_black_hole_ask_instance);
+	DELETE(g_agent_hub_ask_instance);
+	DELETE(g_job_hub_ask_instance);
+
+	/* poller thread objects */
+	DELETE(g_buffer_multi_thread);
+	DELETE(g_datasource_thread);
+	DELETE(g_remote_thread);
+	DELETE(g_main_thread);
+	DELETE(g_hot_backup_thread);
+	g_stat_mgr.stop_background_thread();
+	log4cplus_info("--------%s-v%s END!--------", project_name, version);
+	daemon_cleanup();
+#if MEMCHECK
+	/* memory-check builds only: report objects never deleted and allocation totals */
+	dump_non_delete();
+	log4cplus_debug("memory allocated %lu virtual %lu", count_alloc_size(),
+			count_virtual_size());
+#endif
+}

+ 87 - 0
src/core/misc/main_supply.h

@@ -0,0 +1,87 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef __MAIN_SUPPLY_H
+#define __MAIN_SUPPLY_H
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <stdio.h>
+
+#include <version.h>
+#include <table/table_def.h>
+#include <config/config.h>
+#include <poll/poller_base.h>
+#include <listener/listener_pool.h>
+#include <barrier_ask_answer_chain.h>
+#include <client/client_unit.h>
+#include <data_connector_ask_chain.h>
+#include <connector/connector_group.h>
+#include <buffer_process_ask_chain.h>
+#include <buffer_bypass_ask_chain.h>
+#include <daemons.h>
+#include <config/dbconfig.h>
+#include <log/log.h>
+#include <daemon/daemon.h>
+#include <pipetask.h>
+#include <mem_check.h>
+#include "socket/unix_socket.h"
+#include "stat_dtc.h"
+#include "system_command_ask_chain.h"
+#include "task/task_multi_unit.h"
+#include "black_hole_ask_chain.h"
+#include "container.h"
+#include "proc_title.h"
+#include "plugin/plugin_mgr.h"
+#include "dtc_global.h"
+#include "remote_dtc_ask_answer_chain.h"
+#include "key/key_route_ask_chain.h"
+#include "agent/agent_listen_pool.h"
+#include "agent/agent_unit.h"
+#include "version.h"
+#include "dtcutils.h"
+#include "algorithm/relative_hour_calculator.h"
+#include "buffer_remoteLog.h"
+#include "hot_backup_ask_chain.h"
+#include "logger.h"
+#include "data_process.h"
+#include "namespace.h"
+#include "global.h"
+
+DTC_BEGIN_NAMESPACE
+
+int plugin_start(void);
+int plugin_stop(void);
+int stat_open_fd(void);
+int init_cache_mode(void);
+int init_hotbackup_chain_thread(void);
+int init_main_chain_thread(void);
+int init_buffer_process_ask_chain_thread(void);
+int collect_load_config(DbConfig *dbconfig);
+int init_remote_dtc_chain_thread(void);
+int init_remote_dtc_chain(PollerBase *thread);
+int init_data_connector_chain_thread(void);
+int init_buffer_process_ask_chain(PollerBase *thread);
+int init_data_connector_ask_chain(PollerBase *thread);
+
+int init_remote_log_config();
+int init_config_info();
+void free_all_resource();
+
+DTC_END_NAMESPACE
+
+#endif

+ 288 - 0
src/core/misc/mysql_error.h

@@ -0,0 +1,288 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef __H_DTC_MYSQL_ERROR_H__
+#define __H_DTC_MYSQL_ERROR_H__
+/*
+ * Numeric MySQL error codes exposed as an anonymous enum.
+ *   ER_*  : codes 1000..1238
+ *   CR_*  : codes 1900..1928
+ * NOTE(review): upstream MySQL client (CR_*) codes start at 2000, while the
+ * values below start at 1900 -- presumably remapped on purpose by this
+ * project; confirm before comparing against raw libmysqlclient errno values.
+ */
+enum { ER_HASHCHK = 1000,
+       ER_NISAMCHK = 1001,
+       ER_NO = 1002,
+       ER_YES = 1003,
+       ER_CANT_CREATE_FILE = 1004,
+       ER_CANT_CREATE_TABLE = 1005,
+       ER_CANT_CREATE_DB = 1006,
+       ER_DB_CREATE_EXISTS = 1007,
+       ER_DB_DROP_EXISTS = 1008,
+       ER_DB_DROP_DELETE = 1009,
+       ER_DB_DROP_RMDIR = 1010,
+       ER_CANT_DELETE_FILE = 1011,
+       ER_CANT_FIND_SYSTEM_REC = 1012,
+       ER_CANT_GET_STAT = 1013,
+       ER_CANT_GET_WD = 1014,
+       ER_CANT_LOCK = 1015,
+       ER_CANT_OPEN_FILE = 1016,
+       ER_FILE_NOT_FOUND = 1017,
+       ER_CANT_READ_DIR = 1018,
+       ER_CANT_SET_WD = 1019,
+       ER_CHECKREAD = 1020,
+       ER_DISK_FULL = 1021,
+       ER_DUP_KEY = 1022,
+       ER_ERROR_ON_CLOSE = 1023,
+       ER_ERROR_ON_READ = 1024,
+       ER_ERROR_ON_RENAME = 1025,
+       ER_ERROR_ON_WRITE = 1026,
+       ER_FILE_USED = 1027,
+       ER_FILSORT_ABORT = 1028,
+       ER_FORM_NOT_FOUND = 1029,
+       ER_GET_ERRNO = 1030,
+       ER_ILLEGAL_HA = 1031,
+       ER_KEY_NOT_FOUND = 1032,
+       ER_NOT_FORM_FILE = 1033,
+       ER_NOT_KEYFILE = 1034,
+       ER_OLD_KEYFILE = 1035,
+       ER_OPEN_AS_READONLY = 1036,
+       ER_OUTOFMEMORY = 1037,
+       ER_OUT_OF_SORTMEMORY = 1038,
+       ER_UNEXPECTED_EOF = 1039,
+       ER_CON_COUNT_ERROR = 1040,
+       ER_OUT_OF_RESOURCES = 1041,
+       ER_BAD_HOST_ERROR = 1042,
+       ER_HANDSHAKE_ERROR = 1043,
+       ER_DBACCESS_DENIED_ERROR = 1044,
+       ER_ACCESS_DENIED_ERROR = 1045,
+       ER_NO_DB_ERROR = 1046,
+       ER_UNKNOWN_COM_ERROR = 1047,
+       ER_BAD_NULL_ERROR = 1048,
+       ER_BAD_DB_ERROR = 1049,
+       ER_TABLE_EXISTS_ERROR = 1050,
+       ER_BAD_TABLE_ERROR = 1051,
+       ER_NON_UNIQ_ERROR = 1052,
+       ER_SERVER_SHUTDOWN = 1053,
+       ER_BAD_FIELD_ERROR = 1054,
+       ER_WRONG_FIELD_WITH_GROUP = 1055,
+       ER_WRONG_GROUP_FIELD = 1056,
+       ER_WRONG_SUM_SELECT = 1057,
+       ER_WRONG_VALUE_COUNT = 1058,
+       ER_TOO_LONG_IDENT = 1059,
+       ER_DUP_FIELDNAME = 1060,
+       ER_DUP_KEYNAME = 1061,
+       ER_DUP_ENTRY = 1062,
+       ER_WRONG_FIELD_SPEC = 1063,
+       ER_PARSE_ERROR = 1064,
+       ER_EMPTY_QUERY = 1065,
+       ER_NONUNIQ_TABLE = 1066,
+       ER_INVALID_DEFAULT = 1067,
+       ER_MULTIPLE_PRI_KEY = 1068,
+       ER_TOO_MANY_KEYS = 1069,
+       ER_TOO_MANY_KEY_PARTS = 1070,
+       ER_TOO_LONG_KEY = 1071,
+       ER_KEY_COLUMN_DOES_NOT_EXITS = 1072,
+       ER_BLOB_USED_AS_KEY = 1073,
+       ER_TOO_BIG_FIELDLENGTH = 1074,
+       ER_WRONG_AUTO_KEY = 1075,
+       ER_READY = 1076,
+       ER_NORMAL_SHUTDOWN = 1077,
+       ER_GOT_SIGNAL = 1078,
+       ER_SHUTDOWN_COMPLETE = 1079,
+       ER_FORCING_CLOSE = 1080,
+       ER_IPSOCK_ERROR = 1081,
+       ER_NO_SUCH_INDEX = 1082,
+       ER_WRONG_FIELD_TERMINATORS = 1083,
+       ER_BLOBS_AND_NO_TERMINATED = 1084,
+       ER_TEXTFILE_NOT_READABLE = 1085,
+       ER_FILE_EXISTS_ERROR = 1086,
+       ER_LOAD_INFO = 1087,
+       ER_ALTER_INFO = 1088,
+       ER_WRONG_SUB_KEY = 1089,
+       ER_CANT_REMOVE_ALL_FIELDS = 1090,
+       ER_CANT_DROP_FIELD_OR_KEY = 1091,
+       ER_INSERT_INFO = 1092,
+       ER_INSERT_TABLE_USED = 1093,
+       ER_NO_SUCH_THREAD = 1094,
+       ER_KILL_DENIED_ERROR = 1095,
+       ER_NO_TABLES_USED = 1096,
+       ER_TOO_BIG_SET = 1097,
+       ER_NO_UNIQUE_LOGFILE = 1098,
+       ER_TABLE_NOT_LOCKED_FOR_WRITE = 1099,
+       ER_TABLE_NOT_LOCKED = 1100,
+       ER_BLOB_CANT_HAVE_DEFAULT = 1101,
+       ER_WRONG_DB_NAME = 1102,
+       ER_WRONG_TABLE_NAME = 1103,
+       ER_TOO_BIG_SELECT = 1104,
+       ER_UNKNOWN_ERROR = 1105,
+       ER_UNKNOWN_PROCEDURE = 1106,
+       ER_WRONG_PARAMCOUNT_TO_PROCEDURE = 1107,
+       ER_WRONG_PARAMETERS_TO_PROCEDURE = 1108,
+       ER_UNKNOWN_TABLE = 1109,
+       ER_FIELD_SPECIFIED_TWICE = 1110,
+       ER_INVALID_GROUP_FUNC_USE = 1111,
+       ER_UNSUPPORTED_EXTENSION = 1112,
+       ER_TABLE_MUST_HAVE_COLUMNS = 1113,
+       ER_RECORD_FILE_FULL = 1114,
+       ER_UNKNOWN_CHARACTER_SET = 1115,
+       ER_TOO_MANY_TABLES = 1116,
+       ER_TOO_MANY_FIELDS = 1117,
+       ER_TOO_BIG_ROWSIZE = 1118,
+       ER_STACK_OVERRUN = 1119,
+       ER_WRONG_OUTER_JOIN = 1120,
+       ER_NULL_COLUMN_IN_INDEX = 1121,
+       ER_CANT_FIND_UDF = 1122,
+       ER_CANT_INITIALIZE_UDF = 1123,
+       ER_UDF_NO_PATHS = 1124,
+       ER_UDF_EXISTS = 1125,
+       ER_CANT_OPEN_LIBRARY = 1126,
+       ER_CANT_FIND_DL_ENTRY = 1127,
+       ER_FUNCTION_NOT_DEFINED = 1128,
+       ER_HOST_IS_BLOCKED = 1129,
+       ER_HOST_NOT_PRIVILEGED = 1130,
+       ER_PASSWORD_ANONYMOUS_USER = 1131,
+       ER_PASSWORD_NOT_ALLOWED = 1132,
+       ER_PASSWORD_NO_MATCH = 1133,
+       ER_UPDATE_INFO = 1134,
+       ER_CANT_CREATE_THREAD = 1135,
+       ER_WRONG_VALUE_COUNT_ON_ROW = 1136,
+       ER_CANT_REOPEN_TABLE = 1137,
+       ER_INVALID_USE_OF_NULL = 1138,
+       ER_REGEXP_ERROR = 1139,
+       ER_MIX_OF_GROUP_FUNC_AND_FIELDS = 1140,
+       ER_NONEXISTING_GRANT = 1141,
+       ER_TABLEACCESS_DENIED_ERROR = 1142,
+       ER_COLUMNACCESS_DENIED_ERROR = 1143,
+       ER_ILLEGAL_GRANT_FOR_TABLE = 1144,
+       ER_GRANT_WRONG_HOST_OR_USER = 1145,
+       ER_NO_SUCH_TABLE = 1146,
+       ER_NONEXISTING_TABLE_GRANT = 1147,
+       ER_NOT_ALLOWED_COMMAND = 1148,
+       ER_SYNTAX_ERROR = 1149,
+       ER_DELAYED_CANT_CHANGE_LOCK = 1150,
+       ER_TOO_MANY_DELAYED_THREADS = 1151,
+       ER_ABORTING_CONNECTION = 1152,
+       ER_NET_PACKET_TOO_LARGE = 1153,
+       ER_NET_READ_ERROR_FROM_PIPE = 1154,
+       ER_NET_FCNTL_ERROR = 1155,
+       ER_NET_PACKETS_OUT_OF_ORDER = 1156,
+       ER_NET_UNCOMPRESS_ERROR = 1157,
+       ER_NET_READ_ERROR = 1158,
+       ER_NET_READ_INTERRUPTED = 1159,
+       ER_NET_ERROR_ON_WRITE = 1160,
+       ER_NET_WRITE_INTERRUPTED = 1161,
+       ER_TOO_LONG_STRING = 1162,
+       ER_TABLE_CANT_HANDLE_BLOB = 1163,
+       ER_TABLE_CANT_HANDLE_AUTO_INCREMENT = 1164,
+       ER_DELAYED_INSERT_TABLE_LOCKED = 1165,
+       ER_WRONG_COLUMN_NAME = 1166,
+       ER_WRONG_KEY_COLUMN = 1167,
+       ER_WRONG_MRG_TABLE = 1168,
+       ER_DUP_UNIQUE = 1169,
+       ER_BLOB_KEY_WITHOUT_LENGTH = 1170,
+       ER_PRIMARY_CANT_HAVE_NULL = 1171,
+       ER_TOO_MANY_ROWS = 1172,
+       ER_REQUIRES_PRIMARY_KEY = 1173,
+       ER_NO_RAID_COMPILED = 1174,
+       ER_UPDATE_WITHOUT_KEY_IN_SAFE_MODE = 1175,
+       ER_KEY_DOES_NOT_EXITS = 1176,
+       ER_CHECK_NO_SUCH_TABLE = 1177,
+       ER_CHECK_NOT_IMPLEMENTED = 1178,
+       ER_CANT_DO_THIS_DURING_AN_TRANSACTION = 1179,
+       ER_ERROR_DURING_COMMIT = 1180,
+       ER_ERROR_DURING_ROLLBACK = 1181,
+       ER_ERROR_DURING_FLUSH_LOGS = 1182,
+       ER_ERROR_DURING_CHECKPOINT = 1183,
+       ER_NEW_ABORTING_CONNECTION = 1184,
+       ER_DUMP_NOT_IMPLEMENTED = 1185,
+       ER_FLUSH_MASTER_BINLOG_CLOSED = 1186,
+       ER_INDEX_REBUILD = 1187,
+       ER_MASTER = 1188,
+       ER_MASTER_NET_READ = 1189,
+       ER_MASTER_NET_WRITE = 1190,
+       ER_FT_MATCHING_KEY_NOT_FOUND = 1191,
+       ER_LOCK_OR_ACTIVE_TRANSACTION = 1192,
+       ER_UNKNOWN_SYSTEM_VARIABLE = 1193,
+       ER_CRASHED_ON_USAGE = 1194,
+       ER_CRASHED_ON_REPAIR = 1195,
+       ER_WARNING_NOT_COMPLETE_ROLLBACK = 1196,
+       ER_TRANS_CACHE_FULL = 1197,
+       ER_SLAVE_MUST_STOP = 1198,
+       ER_SLAVE_NOT_RUNNING = 1199,
+       ER_BAD_SLAVE = 1200,
+       ER_MASTER_INFO = 1201,
+       ER_SLAVE_THREAD = 1202,
+       ER_TOO_MANY_USER_CONNECTIONS = 1203,
+       ER_SET_CONSTANTS_ONLY = 1204,
+       ER_LOCK_WAIT_TIMEOUT = 1205,
+       ER_LOCK_TABLE_FULL = 1206,
+       ER_READ_ONLY_TRANSACTION = 1207,
+       ER_DROP_DB_WITH_READ_LOCK = 1208,
+       ER_CREATE_DB_WITH_READ_LOCK = 1209,
+       ER_WRONG_ARGUMENTS = 1210,
+       ER_NO_PERMISSION_TO_CREATE_USER = 1211,
+       ER_UNION_TABLES_IN_DIFFERENT_DIR = 1212,
+       ER_LOCK_DEADLOCK = 1213,
+       ER_TABLE_CANT_HANDLE_FULLTEXT = 1214,
+       ER_CANNOT_ADD_FOREIGN = 1215,
+       ER_NO_REFERENCED_ROW = 1216,
+       ER_ROW_IS_REFERENCED = 1217,
+       ER_CONNECT_TO_MASTER = 1218,
+       ER_QUERY_ON_MASTER = 1219,
+       ER_ERROR_WHEN_EXECUTING_COMMAND = 1220,
+       ER_WRONG_USAGE = 1221,
+       ER_WRONG_NUMBER_OF_COLUMNS_IN_SELECT = 1222,
+       ER_CANT_UPDATE_WITH_READLOCK = 1223,
+       ER_MIXING_NOT_ALLOWED = 1224,
+       ER_DUP_ARGUMENT = 1225,
+       ER_USER_LIMIT_REACHED = 1226,
+       ER_SPECIFIC_ACCESS_DENIED_ERROR = 1227,
+       ER_LOCAL_VARIABLE = 1228,
+       ER_GLOBAL_VARIABLE = 1229,
+       ER_NO_DEFAULT = 1230,
+       ER_WRONG_VALUE_FOR_VAR = 1231,
+       ER_WRONG_TYPE_FOR_VAR = 1232,
+       ER_VAR_CANT_BE_READ = 1233,
+       ER_CANT_USE_OPTION_HERE = 1234,
+       ER_NOT_SUPPORTED_YET = 1235,
+       ER_MASTER_FATAL_ERROR_READING_BINLOG = 1236,
+       ER_SLAVE_IGNORED_TABLE = 1237,
+       ER_INCORRECT_GLOBAL_LOCAL_VAR = 1238,
+       /* CR_* block: note the jump from 1238 to 1900 */
+       CR_UNKNOWN_ERROR = 1900,
+       CR_SOCKET_CREATE_ERROR = 1901,
+       CR_CONNECTION_ERROR = 1902,
+       CR_CONN_HOST_ERROR = 1903,
+       CR_IPSOCK_ERROR = 1904,
+       CR_UNKNOWN_HOST = 1905,
+       CR_SERVER_GONE_ERROR = 1906,
+       CR_VERSION_ERROR = 1907,
+       CR_OUT_OF_MEMORY = 1908,
+       CR_WRONG_HOST_INFO = 1909,
+       CR_LOCALHOST_CONNECTION = 1910,
+       CR_TCP_CONNECTION = 1911,
+       CR_SERVER_HANDSHAKE_ERR = 1912,
+       CR_SERVER_LOST = 1913,
+       CR_COMMANDS_OUT_OF_SYNC = 1914,
+       CR_NAMEDPIPE_CONNECTION = 1915,
+       CR_NAMEDPIPEWAIT_ERROR = 1916,
+       CR_NAMEDPIPEOPEN_ERROR = 1917,
+       CR_NAMEDPIPESETSTATE_ERROR = 1918,
+       CR_CANT_READ_CHARSET = 1919,
+       CR_NET_PACKET_TOO_LARGE = 1920,
+       CR_EMBEDDED_CONNECTION = 1921,
+       CR_PROBE_SLAVE_STATUS = 1922,
+       CR_PROBE_SLAVE_HOSTS = 1923,
+       CR_PROBE_SLAVE_CONNECT = 1924,
+       CR_PROBE_MASTER_CONNECT = 1925,
+       CR_SSL_CONNECTION_ERROR = 1926,
+       CR_MALFORMED_PACKET = 1927,
+       CR_WRONG_LICENSE = 1928,
+};
+#endif

+ 35 - 0
src/core/misc/purge_processor.h

@@ -0,0 +1,35 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef __DTC_PURGE_NODE_PROCESSOR_H
+#define __DTC_PURGE_NODE_PROCESSOR_H
+
+#include <stddef.h>
+
+#include "node/node.h"
+
+DTC_BEGIN_NAMESPACE
+
+/*
+ * Abstract callback interface with a single hook, purge_node_processor().
+ * Implementers receive a key and a Node; who invokes the hook and when is
+ * not visible in this header.
+ */
+class PurgeNodeProcessor {
+    public:
+	PurgeNodeProcessor()
+	{
+	}
+	/* virtual so implementations can be destroyed through this interface */
+	virtual ~PurgeNodeProcessor()
+	{
+	}
+
+	/* key: key bytes of the node; node: passed by value */
+	virtual void purge_node_processor(const char *key, Node node) = 0;
+};
+
+DTC_END_NAMESPACE
+
+#endif

+ 40 - 0
src/core/misc/reader_interface.h

@@ -0,0 +1,40 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef __READER_INTERFACE_H
+#define __READER_INTERFACE_H
+
+#include "field/field.h"
+
+/*
+ * Abstract row reader: begin_read() (optional, defaults to success),
+ * then read_row() per row, then end(). err_msg() exposes the last error text.
+ * NOTE(review): the exact return-value contract of read_row()/end() is not
+ * visible here; check the implementations.
+ */
+class ReaderInterface {
+    public:
+	ReaderInterface() {}
+	virtual ~ReaderInterface() {}
+
+	/* human-readable description of the last error */
+	virtual const char *err_msg() = 0;
+
+	/* optional preparation step; the default implementation returns 0 */
+	virtual int begin_read() { return 0; }
+
+	/* fill `row` with the next row */
+	virtual int read_row(RowValue &row) = 0;
+
+	virtual int end() = 0;
+};
+
+#endif

+ 1179 - 0
src/core/raw/raw_data.cc

@@ -0,0 +1,1179 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "raw_data.h"
+#include "global.h"
+#include "algorithm/relative_hour_calculator.h"
+
+/*
+ * Branch-prediction hints.
+ * The compiler check uses __GNUC__ (the macro GCC/Clang actually define);
+ * the previous test on __GCC_MAJOR never matched, so the hints were silently
+ * compiled out. Other compilers fall back to the plain expression.
+ */
+#ifndef likely
+#if defined(__GNUC__) && __GNUC__ >= 3
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#else
+#define likely(x) (x)
+#define unlikely(x) (x)
+#endif
+#endif
+
+/*
+ * Cursor helpers for the raw chunk. All of them expect the enclosing
+ * function to provide p_content_, offset_, size_ and an ERROR_RET label,
+ * and jump to ERROR_RET when the access would run past size_.
+ * Macro arguments are parenthesized so expressions can be passed safely.
+ * NOTE(review): a negative `s`/`len` is converted to unsigned inside the
+ * bounds comparison and can slip through; callers should reject negative
+ * lengths before using these helpers.
+ */
+
+/* read a `t` at the cursor into x, then advance the cursor */
+#define GET_VALUE(x, t)                                                        \
+	do {                                                                   \
+		if (unlikely(offset_ + sizeof(t) > size_))                     \
+			goto ERROR_RET;                                        \
+		(x) = (typeof(x)) * (t *)(p_content_ + offset_);               \
+		offset_ += sizeof(t);                                          \
+	} while (0)
+
+/* read a `t` at an explicit offset into x; the cursor is not moved */
+#define GET_VALUE_AT_OFFSET(x, t, offset)                                      \
+	do {                                                                   \
+		if (unlikely((offset) + sizeof(t) > size_))                    \
+			goto ERROR_RET;                                        \
+		(x) = (typeof(x)) * (t *)(p_content_ + (offset));              \
+	} while (0)
+
+/* store x as a `t` at the cursor, then advance the cursor */
+#define SET_VALUE(x, t)                                                        \
+	do {                                                                   \
+		if (unlikely(offset_ + sizeof(t) > size_))                     \
+			goto ERROR_RET;                                        \
+		*(t *)(p_content_ + offset_) = (x);                            \
+		offset_ += sizeof(t);                                          \
+	} while (0)
+
+/* store x as a `t` at an explicit offset; the cursor is not moved */
+#define SET_VALUE_AT_OFFSET(x, t, offset)                                      \
+	do {                                                                   \
+		if (unlikely((offset) + sizeof(t) > size_))                    \
+			goto ERROR_RET;                                        \
+		*(t *)(p_content_ + (offset)) = (x);                           \
+	} while (0)
+
+/* store an int length prefix followed by `len` bytes copied from p */
+#define SET_BIN_VALUE(p, len)                                                  \
+	do {                                                                   \
+		if (unlikely(offset_ + sizeof(int) + (len) > size_))           \
+			goto ERROR_RET;                                        \
+		*(int *)(p_content_ + offset_) = (len);                        \
+		offset_ += sizeof(int);                                        \
+		if (likely((len) != 0))                                        \
+			memcpy(p_content_ + offset_, (p), (len));              \
+		offset_ += (len);                                              \
+	} while (0)
+
+/* verify that s more bytes are available at the cursor */
+#define CHECK_SIZE(s)                                                          \
+	do {                                                                   \
+		if (unlikely(offset_ + (s) > size_))                           \
+			goto ERROR_RET;                                        \
+	} while (0)
+
+/* advance the cursor by s bytes after checking that they are available */
+#define SKIP_SIZE(s)                                                           \
+	do {                                                                   \
+		if (unlikely(offset_ + (s) > size_))                           \
+			goto ERROR_RET;                                        \
+		offset_ += (s);                                                \
+	} while (0)
+/* Largest value of one unsigned byte.
+ * NOTE(review): "BTYE" looks like a typo for "BYTE"; the name is left alone
+ * because its uses are outside this part of the file. */
+const int BTYE_MAX_VALUE = 255;
+/*
+ * Build an unattached RawData.
+ * pstMalloc    : allocator used for every chunk operation of this object
+ * iAutoDestroy : when non-zero the destructor releases the chunk
+ * All ids start at -1 ("not set"), all offsets/counters at 0, and the handle
+ * is INVALID_HANDLE until init()/do_attach() succeeds.
+ */
+RawData::RawData(MallocBase *pstMalloc, int iAutoDestroy)
+{
+	data_size_ = 0;
+	row_count_ = 0;
+	key_size_ = 0;
+	m_iLAId = -1;
+	/* previously left uninitialized although get_lastcmod() compares against it */
+	m_iLCmodId = -1;
+	expire_id_ = -1;
+	table_index_ = -1;
+	key_start_ = 0;
+	data_start_ = 0;
+	offset_ = 0;
+	m_uiLAOffset = 0;
+	row_offset_ = 0;
+	get_request_count_offset_ = 0;
+	time_stamp_offset_ = 0;
+	get_request_count_ = 0;
+	create_time_ = 0;
+	latest_request_time_ = 0;
+	latest_update_time_ = 0;
+	key_index_ = -1;
+	p_content_ = NULL;
+	need_new_bufer_size = 0;
+	mallocator_ = pstMalloc;
+	handle_ = INVALID_HANDLE;
+	auto_destory_ = iAutoDestroy;
+	size_ = 0;
+	p_reference_ = NULL;
+	/* previously left uninitialized; set by do_init()/do_attach(handle) */
+	table_definition_ = NULL;
+	memset(err_message_, 0, sizeof(err_message_));
+}
+
+/*
+ * Release the chunk when auto-destroy was requested at construction, then
+ * clear the handle and cached chunk size in every case.
+ */
+RawData::~RawData()
+{
+	if (auto_destory_)
+		destory();
+
+	size_ = 0;
+	handle_ = INVALID_HANDLE;
+}
+
+/*
+ * Allocate a fresh chunk and write the raw-data header plus the key.
+ *
+ * uchKeyIdx  : index of the last key field
+ * iKeySize   : fixed key size, or 0 when the key is length-prefixed
+ *              (first byte of pchKey holds the payload length)
+ * pchKey     : key bytes
+ * uiDataSize : number of payload bytes to reserve after the header
+ * laId / expireId : field ids of the last-access and expire-time fields
+ * nodeIdx    : table index (0 or 1); -1 keeps the current table_index_
+ *
+ * Returns 0 on success, EC_NO_MEM when allocation fails (need_new_bufer_size
+ * holds the requested size), -100 on an invalid table index or header write
+ * failure. The table index is validated BEFORE allocating so that an invalid
+ * index no longer leaves a freshly allocated chunk attached to the object.
+ */
+int RawData::init(uint8_t uchKeyIdx, int iKeySize, const char *pchKey,
+		  ALLOC_SIZE_T uiDataSize, int laId, int expireId, int nodeIdx)
+{
+	/* stored key length: fixed size, or 1 length byte + payload */
+	int ks = iKeySize != 0 ? iKeySize : 1 + *(unsigned char *)pchKey;
+
+	/* |1 byte: type|4 bytes: data size|4 bytes: row count|1 byte: get count|
+	 * 2 bytes: last access time|2 bytes: last update time|
+	 * 2 bytes: create time|key| */
+	uiDataSize += 2 + sizeof(uint32_t) * 2 + sizeof(uint16_t) * 3 + ks;
+
+	handle_ = INVALID_HANDLE;
+	size_ = 0;
+
+	if (nodeIdx != -1) {
+		table_index_ = nodeIdx;
+	}
+	if (table_index_ != 0 && table_index_ != 1) {
+		snprintf(err_message_, sizeof(err_message_), "node idx error");
+		return -100;
+	}
+
+	handle_ = mallocator_->Malloc(uiDataSize);
+	if (handle_ == INVALID_HANDLE) {
+		snprintf(err_message_, sizeof(err_message_), "malloc error");
+		need_new_bufer_size = uiDataSize;
+		return (EC_NO_MEM);
+	}
+	size_ = mallocator_->chunk_size(handle_);
+
+	data_size_ = 2 + sizeof(uint32_t) * 2 + sizeof(uint16_t) * 3 + ks;
+	row_count_ = 0;
+	key_index_ = uchKeyIdx;
+	key_size_ = iKeySize;
+	m_iLAId = laId;
+	expire_id_ = expireId;
+
+	p_content_ = Pointer<char>();
+	offset_ = 0;
+	m_uiLAOffset = 0;
+
+	/* type byte: bit 7 carries the table index, low bits the data type */
+	SET_VALUE(((table_index_ << 7) & 0x80) + DATA_TYPE_RAW, unsigned char);
+	SET_VALUE(data_size_, uint32_t);
+	SET_VALUE(row_count_, uint32_t);
+
+	get_request_count_offset_ = offset_;
+	get_request_count_ = 1;
+	SET_VALUE(get_request_count_, uint8_t);
+	time_stamp_offset_ = offset_;
+	init_timp_stamp();
+	SKIP_SIZE(3 * sizeof(uint16_t));
+
+	/* ks equals iKeySize for fixed-size keys, so one copy covers both cases */
+	key_start_ = offset_;
+	memcpy(p_content_ + offset_, pchKey, ks);
+	offset_ += ks;
+
+	data_start_ = offset_;
+	row_offset_ = data_start_;
+
+	return (0);
+
+ERROR_RET:
+	snprintf(err_message_, sizeof(err_message_), "set value error");
+	return (-100);
+}
+
+/*
+ * Pick the table index from the column-expand state, look up the matching
+ * table definition and delegate to init() with the definition's key layout
+ * and field ids.
+ * While an expansion is in progress the index after the current one
+ * (modulo 2) is used, otherwise the current one.
+ * Returns -1 when the index or table definition is unusable, otherwise the
+ * result of init(). The error texts now say "init" instead of the
+ * copy-pasted "attach".
+ */
+int RawData::do_init(const char *pchKey, ALLOC_SIZE_T uiDataSize)
+{
+	if (DTCColExpand::instance()->is_expanding())
+		table_index_ =
+			(DTCColExpand::instance()->cur_table_idx() + 1) % 2;
+	else
+		table_index_ = DTCColExpand::instance()->cur_table_idx() % 2;
+
+	if (table_index_ != 0 && table_index_ != 1) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "init error, nodeIdx[%d] error", table_index_);
+		return -1;
+	}
+
+	table_definition_ =
+		TableDefinitionManager::instance()->get_table_def_by_idx(
+			table_index_);
+	if (table_definition_ == NULL) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "init error, tabledef[NULL]");
+		return -1;
+	}
+
+	return init(table_definition_->key_fields() - 1,
+		    table_definition_->key_format(), pchKey, uiDataSize,
+		    table_definition_->lastacc_field_id(),
+		    table_definition_->expire_time_field_id());
+}
+
+/*
+ * Attach to an existing chunk, deriving the table index from bit 7 of the
+ * chunk's first byte and the key layout / field ids from the matching table
+ * definition. Returns -1 when no usable table definition exists, otherwise
+ * the result of the detailed do_attach() overload.
+ */
+int RawData::do_attach(MEM_HANDLE_T hHandle)
+{
+	handle_ = hHandle;
+
+	const char *head = Pointer<char>();
+	table_index_ = (*head >> 7) & 0x01;
+	if (!(table_index_ == 0 || table_index_ == 1)) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "attach error, nodeIdx[%d] error", table_index_);
+		return -1;
+	}
+
+	table_definition_ =
+		TableDefinitionManager::instance()->get_table_def_by_idx(
+			table_index_);
+	if (!table_definition_) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "attach error, tabledef[NULL]");
+		return -1;
+	}
+
+	return do_attach(hHandle, table_definition_->key_fields() - 1,
+			 table_definition_->key_format(),
+			 table_definition_->lastacc_field_id(),
+			 table_definition_->lastcmod_field_id(),
+			 table_definition_->expire_time_field_id());
+}
+
+/*
+ * Attach to an existing chunk and parse its header.
+ * This function assumes the attached raw data is correctly formatted, which
+ * is not always the case, so every read is bounds-checked against the chunk
+ * size -- including the length byte of a length-prefixed key, which used to
+ * be read without a check.
+ *
+ * Returns 0 on success, -1 when the chunk size cannot be determined,
+ * -2 on a wrong data type, -3 when the stored data size exceeds the chunk,
+ * -100 when the header runs past the chunk.
+ */
+int RawData::do_attach(MEM_HANDLE_T hHandle, uint8_t uchKeyIdx, int iKeySize,
+		       int laid, int lcmodid, int expireid)
+{
+	int ks = 0;
+
+	size_ = mallocator_->chunk_size(hHandle);
+	if (unlikely(size_ == 0)) {
+		snprintf(err_message_, sizeof(err_message_), "attach error: %s",
+			 mallocator_->get_err_msg());
+		return (-1);
+	}
+	handle_ = hHandle;
+
+	p_content_ = Pointer<char>();
+	offset_ = 0;
+	m_uiLAOffset = 0;
+	unsigned char uchType;
+	GET_VALUE(uchType, unsigned char);
+	/* bit 7 is the table index; only the low 7 bits carry the type */
+	if (unlikely((uchType & 0x7f) != DATA_TYPE_RAW)) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "invalid data type: %u", uchType);
+		return (-2);
+	}
+
+	GET_VALUE(data_size_, uint32_t);
+	GET_VALUE(row_count_, uint32_t);
+	get_request_count_offset_ = offset_;
+	GET_VALUE(get_request_count_, uint8_t);
+	time_stamp_offset_ = offset_;
+	attach_time_stamp();
+	SKIP_SIZE(3 * sizeof(uint16_t));
+	if (unlikely(data_size_ > size_)) {
+		snprintf(
+			err_message_, sizeof(err_message_),
+			"raw-data handle[" UINT64FMT
+			"] data size[%u] error, large than chunk size[" UINT64FMT
+			"]",
+			hHandle, data_size_, size_);
+		return (-3);
+	}
+
+	key_index_ = uchKeyIdx;
+	key_start_ = offset_;
+	key_size_ = iKeySize;
+	m_iLAId = laid;
+	m_iLCmodId = lcmodid;
+	expire_id_ = expireid;
+
+	if (iKeySize != 0) {
+		ks = iKeySize;
+	} else {
+		/* make sure the length byte itself lies inside the chunk */
+		CHECK_SIZE(sizeof(unsigned char));
+		ks = 1 + *(unsigned char *)(p_content_ + key_start_);
+	}
+	SKIP_SIZE(ks);
+	data_start_ = offset_;
+	row_offset_ = data_start_;
+
+	return (0);
+
+ERROR_RET:
+	snprintf(err_message_, sizeof(err_message_), "get value error");
+	return (-100);
+}
+
+/*
+ * Free the attached chunk, if any, and reset handle and cached size.
+ * Returns the allocator's Free() result, or 0 when nothing was attached.
+ */
+int RawData::destory()
+{
+	int iRet = 0;
+
+	if (handle_ != INVALID_HANDLE) {
+		iRet = mallocator_->Free(handle_);
+		handle_ = INVALID_HANDLE;
+	}
+	size_ = 0;
+
+	return (iRet);
+}
+
+/*
+ * Attach to hHandle and verify that the data size stored in its header
+ * equals `size`.
+ * uchKeyIdx and iKeySize are accepted for interface compatibility but are
+ * not used.
+ *
+ * Fixes over the previous version:
+ *  - the type byte is masked with 0x7f like in do_attach(), so chunks whose
+ *    table-index bit (bit 7) is set are no longer rejected;
+ *  - a size mismatch reports the sizes instead of "invalid data type";
+ *  - a truncated header now leaves an error message as well.
+ *
+ * Returns 0 when the sizes match, -2 on a wrong data type, -1 otherwise.
+ */
+int RawData::check_size(MEM_HANDLE_T hHandle, uint8_t uchKeyIdx, int iKeySize,
+			int size)
+{
+	size_ = mallocator_->chunk_size(hHandle);
+	if (unlikely(size_ == 0)) {
+		snprintf(err_message_, sizeof(err_message_), "attach error: %s",
+			 mallocator_->get_err_msg());
+		return (-1);
+	}
+	handle_ = hHandle;
+
+	p_content_ = Pointer<char>();
+	offset_ = 0;
+	m_uiLAOffset = 0;
+	unsigned char uchType;
+	GET_VALUE(uchType, unsigned char);
+	if (unlikely((uchType & 0x7f) != DATA_TYPE_RAW)) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "invalid data type: %u", uchType);
+		return (-2);
+	}
+
+	GET_VALUE(data_size_, uint32_t);
+	if (data_size_ != (unsigned int)size) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "invalid data size: %u, expected: %d", data_size_,
+			 size);
+		return -1;
+	}
+
+	return 0;
+ERROR_RET:
+	snprintf(err_message_, sizeof(err_message_), "get value error");
+	return -1;
+}
+
+/*
+ * Reallocate the chunk to exactly data_size_ bytes and refresh the cached
+ * handle, chunk size and content pointer.
+ * Returns 0 on success; on failure returns EC_NO_MEM and records the
+ * requested size in need_new_bufer_size.
+ */
+int RawData::strip_mem()
+{
+	const ALLOC_HANDLE_T hShrunk = mallocator_->ReAlloc(handle_, data_size_);
+
+	if (hShrunk != INVALID_HANDLE) {
+		handle_ = hShrunk;
+		size_ = mallocator_->chunk_size(handle_);
+		p_content_ = Pointer<char>();
+		return (0);
+	}
+
+	snprintf(err_message_, sizeof(err_message_), "realloc error");
+	need_new_bufer_size = data_size_;
+	return (EC_NO_MEM);
+}
+
+/*
+ * Decode the row at the cursor into stRow.
+ *
+ * uchRowFlags : receives the row's flag byte
+ * iDecodeFlag : when PRE_DECODE_ROW is set the cursor and row_offset_ are
+ *               restored afterwards (peek), on success and on failure
+ *
+ * String/binary fields are not copied: bin.ptr points into the chunk.
+ * A negative stored length is now rejected; previously it wrapped in the
+ * unsigned bounds check and moved the cursor backwards.
+ *
+ * Returns 0 on success, -1 when not attached, -100 on malformed data.
+ */
+int RawData::decode_row(RowValue &stRow, unsigned char &uchRowFlags,
+			int iDecodeFlag)
+{
+	if (unlikely(handle_ == INVALID_HANDLE || p_content_ == NULL)) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "rawdata not init yet");
+		return (-1);
+	}
+
+	ALLOC_SIZE_T uiOldOffset = offset_;
+	ALLOC_SIZE_T uiOldRowOffset = row_offset_;
+	m_uiLAOffset = 0;
+	row_offset_ = offset_;
+	GET_VALUE(uchRowFlags, unsigned char);
+
+	/* copy one row of data, field by field after the key fields */
+	for (int j = key_index_ + 1; j <= stRow.num_fields(); j++) {
+		/* discarded fields are not stored in the row */
+		if (stRow.table_definition()->is_discard(j))
+			continue;
+		/* remember where the last-access field lives in the chunk */
+		if (j == m_iLAId)
+			m_uiLAOffset = offset_;
+		switch (stRow.field_type(j)) {
+		case DField::Signed:
+			if (unlikely(stRow.field_size(j) >
+				     (int)sizeof(int32_t))) {
+				GET_VALUE(stRow.field_value(j)->s64, int64_t);
+			} else {
+				GET_VALUE(stRow.field_value(j)->s64, int32_t);
+			}
+			break;
+
+		case DField::Unsigned:
+			if (unlikely(stRow.field_size(j) >
+				     (int)sizeof(uint32_t))) {
+				GET_VALUE(stRow.field_value(j)->u64, uint64_t);
+			} else {
+				GET_VALUE(stRow.field_value(j)->u64, uint32_t);
+			}
+			break;
+
+		case DField::Float: // floating point
+			if (likely(stRow.field_size(j) > (int)sizeof(float))) {
+				GET_VALUE(stRow.field_value(j)->flt, double);
+			} else {
+				GET_VALUE(stRow.field_value(j)->flt, float);
+			}
+			break;
+
+		case DField::String: // string
+		case DField::Binary: // binary data
+		default: {
+			GET_VALUE(stRow.field_value(j)->bin.len, int);
+			/* corrupt data: a negative length must not move the cursor back */
+			if (unlikely(stRow.field_value(j)->bin.len < 0))
+				goto ERROR_RET;
+			stRow.field_value(j)->bin.ptr = p_content_ + offset_;
+			SKIP_SIZE(stRow.field_value(j)->bin.len);
+			break;
+		}
+		} //end of switch
+	}
+
+	if (unlikely(iDecodeFlag & PRE_DECODE_ROW)) {
+		offset_ = uiOldOffset;
+		row_offset_ = uiOldRowOffset;
+	}
+
+	return (0);
+
+ERROR_RET:
+	if (unlikely(iDecodeFlag & PRE_DECODE_ROW)) {
+		offset_ = uiOldOffset;
+		row_offset_ = uiOldRowOffset;
+	}
+	snprintf(err_message_, sizeof(err_message_), "get value error");
+	return (-100);
+}
+
+/*
+ * Read the expire-time field of the row at the cursor.
+ *
+ * t      : not used; the member table_definition_ drives the field walk
+ *          (NOTE(review): confirm whether `t` was meant to be used instead)
+ * expire : receives the value, or 0 when the table has no expire field
+ *
+ * The cursor is advanced over the flag byte and the fields preceding the
+ * expire field and is not restored.
+ *
+ * Fixes over the previous version:
+ *  - discarded fields are skipped, matching decode_row()/get_lastcmod()
+ *    (they are not stored in the row, so walking over them shifted offsets);
+ *  - the 4-byte read is bounds-checked;
+ *  - negative string/binary lengths are rejected.
+ *
+ * Returns 0 on success, -1 when not attached, -100 on malformed data.
+ */
+int RawData::get_expire_time(DTCTableDefinition *t, uint32_t &expire)
+{
+	expire = 0;
+	if (unlikely(handle_ == INVALID_HANDLE || p_content_ == NULL)) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "rawdata not init yet");
+		return (-1);
+	}
+	if (expire_id_ == -1) {
+		expire = 0;
+		return 0;
+	}
+	SKIP_SIZE(sizeof(unsigned char)); // skip the row flag byte
+
+	for (int j = key_index_ + 1; j <= table_definition_->num_fields();
+	     j++) {
+		if (table_definition_->is_discard(j))
+			continue;
+		if (j == expire_id_) {
+			GET_VALUE_AT_OFFSET(expire, uint32_t, offset_);
+			break;
+		}
+
+		switch (table_definition_->field_type(j)) {
+		case DField::Unsigned:
+		case DField::Signed:
+			if (table_definition_->field_size(j) >
+			    (int)sizeof(int32_t))
+				SKIP_SIZE(sizeof(int64_t));
+			else
+				SKIP_SIZE(sizeof(int32_t));
+			break;
+
+		case DField::Float: // floating point
+			if (table_definition_->field_size(j) >
+			    (int)sizeof(float))
+				SKIP_SIZE(sizeof(double));
+			else
+				SKIP_SIZE(sizeof(float));
+			break;
+
+		case DField::String: // string
+		case DField::Binary: // binary data
+		default: {
+			int iLen = 0;
+			GET_VALUE(iLen, int);
+			if (unlikely(iLen < 0))
+				goto ERROR_RET;
+			SKIP_SIZE(iLen);
+			break;
+		}
+		} //end of switch
+	}
+	return 0;
+
+ERROR_RET:
+	snprintf(err_message_, sizeof(err_message_), "get expire error");
+	return (-100);
+}
+
+/*
+ * Walk the row at the cursor and return its last-modification field.
+ *
+ * lastcmod : receives the value of field m_iLCmodId, or 0 when that field is
+ *            not part of the row
+ *
+ * The whole row is walked, so on success the cursor ends up just past it;
+ * row_offset_ is set to the row's start.
+ * The 4-byte read is now bounds-checked and negative string/binary lengths
+ * are rejected.
+ *
+ * Returns 0 on success, -1 when not attached, -100 on malformed data.
+ */
+int RawData::get_lastcmod(uint32_t &lastcmod)
+{
+	lastcmod = 0;
+	if (unlikely(handle_ == INVALID_HANDLE || p_content_ == NULL)) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "rawdata not init yet");
+		return (-1);
+	}
+
+	row_offset_ = offset_;
+	SKIP_SIZE(sizeof(unsigned char)); // skip the row flag byte
+
+	for (int j = key_index_ + 1; j <= table_definition_->num_fields();
+	     j++) {
+		/* discarded fields are not stored in the row */
+		if (table_definition_->is_discard(j))
+			continue;
+		if (j == m_iLCmodId) {
+			GET_VALUE_AT_OFFSET(lastcmod, uint32_t, offset_);
+		}
+
+		switch (table_definition_->field_type(j)) {
+		case DField::Unsigned:
+		case DField::Signed:
+			if (table_definition_->field_size(j) >
+			    (int)sizeof(int32_t))
+				SKIP_SIZE(sizeof(int64_t));
+			else
+				SKIP_SIZE(sizeof(int32_t));
+			break;
+
+		case DField::Float: // floating point
+			if (table_definition_->field_size(j) >
+			    (int)sizeof(float))
+				SKIP_SIZE(sizeof(double));
+			else
+				SKIP_SIZE(sizeof(float));
+			break;
+
+		case DField::String: // string
+		case DField::Binary: // binary data
+		default: {
+			int iLen = 0;
+			GET_VALUE(iLen, int);
+			if (unlikely(iLen < 0))
+				goto ERROR_RET;
+			SKIP_SIZE(iLen);
+			break;
+		}
+		} //end of switch
+	}
+	return (0);
+
+ERROR_RET:
+	snprintf(err_message_, sizeof(err_message_), "get timecmod error");
+	return (-100);
+}
+
+/*
+ * Write data_size_ into the chunk header (4 bytes at offset 1, right after
+ * the type byte).
+ * Returns 0 on success, -100 when the header does not fit in the chunk.
+ * The success path used to fall through into ERROR_RET, so the function
+ * always overwrote err_message_ and returned -100.
+ */
+int RawData::set_data_size()
+{
+	SET_VALUE_AT_OFFSET(data_size_, uint32_t, 1);
+	return (0);
+
+ERROR_RET:
+	snprintf(err_message_, sizeof(err_message_), "set data size error");
+	return (-100);
+}
+
+/*
+ * Write row_count_ into the chunk header (4 bytes at offset 5, after the
+ * type byte and the data size).
+ * Returns 0 on success, -100 when the header does not fit in the chunk.
+ * The success path used to fall through into ERROR_RET, so the function
+ * always overwrote err_message_ and returned -100.
+ */
+int RawData::set_row_count()
+{
+	SET_VALUE_AT_OFFSET(row_count_, uint32_t, 5);
+	return (0);
+
+ERROR_RET:
+	snprintf(err_message_, sizeof(err_message_), "set row count error");
+	return (-100);
+}
+
+/*
+ * Make sure the chunk can hold data_size_ + expand_size bytes, reallocating
+ * when the current chunk is too small.
+ * Returns 0 on success, -1 when no chunk is attached, EC_NO_MEM when the
+ * reallocation fails (need_new_bufer_size then holds the requested size).
+ */
+int RawData::expand_chunk(ALLOC_SIZE_T expand_size)
+{
+	if (handle_ == INVALID_HANDLE) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "data not init yet");
+		return (-1);
+	}
+
+	/* current chunk is already large enough */
+	if (data_size_ + expand_size <= size_)
+		return (0);
+
+	ALLOC_HANDLE_T hGrown =
+		mallocator_->ReAlloc(handle_, data_size_ + expand_size);
+	if (hGrown == INVALID_HANDLE) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "realloc error[%s]", mallocator_->get_err_msg());
+		need_new_bufer_size = data_size_ + expand_size;
+		return (EC_NO_MEM);
+	}
+
+	/* the chunk may have moved: refresh handle, size and content pointer */
+	handle_ = hGrown;
+	size_ = mallocator_->chunk_size(handle_);
+	p_content_ = Pointer<char>();
+
+	return (0);
+}
+
+/*
+ * Grow the chunk to at least tSize bytes; a chunk that is already large
+ * enough is left untouched.
+ * Returns 0 on success, EC_NO_MEM when the reallocation fails
+ * (need_new_bufer_size then holds tSize).
+ */
+int RawData::re_alloc_chunk(ALLOC_SIZE_T tSize)
+{
+	/* nothing to do when the request fits the current chunk */
+	if (tSize <= size_)
+		return (0);
+
+	ALLOC_HANDLE_T hGrown = mallocator_->ReAlloc(handle_, tSize);
+	if (hGrown == INVALID_HANDLE) {
+		snprintf(err_message_, sizeof(err_message_), "realloc error");
+		need_new_bufer_size = tSize;
+		return (EC_NO_MEM);
+	}
+
+	handle_ = hGrown;
+	size_ = mallocator_->chunk_size(handle_);
+	p_content_ = Pointer<char>();
+
+	return (0);
+}
+
+/*
+ * Compute the encoded size of stRow: one flag byte plus every non-discarded
+ * field after index keyIdx.
+ * Integers take 4 or 8 bytes and floats 4 or 8 bytes depending on the
+ * declared field size; strings/binaries take an int length prefix plus
+ * their payload.
+ * keyIdx == -1 is only logged as an error; the computation still proceeds.
+ */
+ALLOC_SIZE_T RawData::calc_row_size(const RowValue &stRow, int keyIdx)
+{
+	if (keyIdx == -1)
+		log4cplus_error("RawData may not init yet...");
+
+	ALLOC_SIZE_T rowBytes = 1; // flag byte
+
+	for (int fieldId = keyIdx + 1; fieldId <= stRow.num_fields();
+	     fieldId++) {
+		if (stRow.table_definition()->is_discard(fieldId))
+			continue;
+
+		switch (stRow.field_type(fieldId)) {
+		case DField::Signed:
+		case DField::Unsigned:
+			if (unlikely(stRow.field_size(fieldId) >
+				     (int)sizeof(int32_t)))
+				rowBytes += sizeof(int64_t);
+			else
+				rowBytes += sizeof(int32_t);
+			break;
+
+		case DField::Float: // floating point
+			if (likely(stRow.field_size(fieldId) >
+				   (int)sizeof(float)))
+				rowBytes += sizeof(double);
+			else
+				rowBytes += sizeof(float);
+			break;
+
+		case DField::String: // string
+		case DField::Binary: // binary data
+		default:
+			rowBytes += sizeof(int);
+			rowBytes += stRow.field_value(fieldId)->bin.len;
+			break;
+		} //end of switch
+	}
+
+	/* a row made of the flag byte alone is unexpected; leave a trace */
+	if (rowBytes < 2)
+		log4cplus_info("key_index_:%d, stRow.num_fields():%d tSize:%d",
+			       keyIdx, stRow.num_fields(), rowBytes);
+
+	return (rowBytes);
+}
+
+/*
+ * Append one encoded row at the cursor (offset_).
+ *
+ * stRow     : row to encode; fields up to key_index_ and discarded fields
+ *             are not written
+ * uchOp     : flag byte stored in front of the row
+ * expendBuf : when true the chunk is grown first so the row is sure to fit
+ *
+ * On success data_size_ and row_count_ are updated and written back to the
+ * chunk header; the return values of set_data_size()/set_row_count() are
+ * ignored here.
+ * Returns 0 on success, the expand_chunk() error when growing fails, or
+ * -100 when a write would run past the chunk (bytes written before the
+ * failure are not rolled back).
+ */
+int RawData::encode_row(const RowValue &stRow, unsigned char uchOp,
+			bool expendBuf)
+{
+	int iRet;
+
+	ALLOC_SIZE_T tSize;
+	tSize = calc_row_size(stRow, key_index_);
+
+	if (unlikely(expendBuf)) {
+		iRet = expand_chunk(tSize);
+		if (unlikely(iRet != 0))
+			return (iRet);
+	}
+
+	SET_VALUE(uchOp, unsigned char);
+
+	for (int j = key_index_ + 1; j <= stRow.num_fields();
+	     j++) // copy one row of data
+	{
+		if (stRow.table_definition()->is_discard(j))
+			continue;
+		const DTCValue *const v = stRow.field_value(j);
+		switch (stRow.field_type(j)) {
+		case DField::Signed:
+			if (unlikely(stRow.field_size(j) >
+				     (int)sizeof(int32_t)))
+				SET_VALUE(v->s64, int64_t);
+			else
+				SET_VALUE(v->s64, int32_t);
+			break;
+
+		case DField::Unsigned:
+			if (unlikely(stRow.field_size(j) >
+				     (int)sizeof(uint32_t)))
+				SET_VALUE(v->u64, uint64_t);
+			else
+				SET_VALUE(v->u64, uint32_t);
+			break;
+
+		case DField::Float: // floating point
+			if (likely(stRow.field_size(j) > (int)sizeof(float)))
+				SET_VALUE(v->flt, double);
+			else
+				SET_VALUE(v->flt, float);
+			break;
+
+		case DField::String: // string
+		case DField::Binary: // binary data
+		default: {
+			SET_BIN_VALUE(v->bin.ptr, v->bin.len);
+			break;
+		}
+		} //end of switch
+	}
+
+	/* publish the new totals in the chunk header */
+	data_size_ += tSize;
+	set_data_size();
+	row_count_++;
+	set_row_count();
+
+	return 0;
+
+ERROR_RET:
+	snprintf(err_message_, sizeof(err_message_), "encode row error");
+	return (-100);
+}
+
+/*
+ * Append one row with flag byte uchOp; when byFirst is set and rows already
+ * exist, move the freshly appended row in front of them.
+ *
+ * Returns 0 on success, the encode_row() error code, or -ENOMEM when the
+ * temporary buffer used for the move cannot be allocated (the row has
+ * already been appended at the end in that case).
+ */
+int RawData::insert_row_flag(const RowValue &stRow, bool byFirst,
+			     unsigned char uchOp)
+{
+	const uint32_t prev_size = data_size_;
+
+	offset_ = data_size_; // encode at the end of the existing data
+	const int rc = encode_row(stRow, uchOp);
+	if (rc != 0)
+		return (rc);
+
+	const uint32_t added = data_size_ - prev_size;
+	const bool need_move = byFirst == true && added > 0 &&
+			       (prev_size - data_start_) > 0;
+	if (!need_move)
+		return (rc);
+
+	void *scratch = MALLOC(added);
+	if (scratch == NULL) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "malloc error: %m");
+		return (-ENOMEM);
+	}
+
+	char *rows_begin = p_content_ + data_start_;
+	// 1) stash the new row, 2) shift the old rows up, 3) put it first
+	memmove(scratch, p_content_ + prev_size, added);
+	memmove(rows_begin + added, rows_begin, prev_size - data_start_);
+	memcpy(rows_begin, scratch, added);
+	FREE(scratch);
+
+	return (rc);
+}
+
+/*
+ * Insert one row, flagged OPER_INSERT when isDirty is set and OPER_SELECT
+ * otherwise.  Delegates to insert_row_flag().
+ */
+int RawData::insert_row(const RowValue &stRow, bool byFirst, bool isDirty)
+{
+	unsigned char row_flag = OPER_SELECT;
+	if (isDirty)
+		row_flag = OPER_INSERT;
+
+	return insert_row_flag(stRow, byFirst, row_flag);
+}
+
+/*
+ * Append uiNRows rows from pstRow; when byFirst is set and rows already
+ * exist, move the whole new batch in front of them.
+ *
+ * The buffer is expanded once up front for the combined size of all rows,
+ * so that a later insert does not fail and require a rollback.
+ *
+ * Returns 0 on success, the expand_chunk()/encode_row() error code, or
+ * -ENOMEM when the temporary buffer for the move cannot be allocated.
+ */
+int RawData::insert_n_rows(unsigned int uiNRows, const RowValue *pstRow,
+			   bool byFirst, bool isDirty)
+{
+	ALLOC_SIZE_T total_bytes = 0;
+	for (unsigned int idx = 0; idx != uiNRows; ++idx)
+		total_bytes += calc_row_size(pstRow[idx], key_index_);
+
+	int rc = expand_chunk(total_bytes);
+	if (rc != 0)
+		return (rc);
+
+	const uint32_t prev_size = data_size_;
+	const unsigned char row_flag = isDirty ? OPER_INSERT : OPER_SELECT;
+
+	offset_ = data_size_;
+	for (unsigned int idx = 0; idx != uiNRows; ++idx) {
+		rc = encode_row(pstRow[idx], row_flag);
+		if (rc != 0)
+			return (rc);
+	}
+
+	const uint32_t added = data_size_ - prev_size;
+	const bool need_move = byFirst == true && added > 0 &&
+			       (prev_size - data_start_) > 0;
+	if (!need_move)
+		return (0);
+
+	void *scratch = MALLOC(added);
+	if (scratch == NULL) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "malloc error: %m");
+		return (-ENOMEM);
+	}
+
+	char *rows_begin = p_content_ + data_start_;
+	// stash the new rows, shift the old rows up, then place the new ones first
+	memmove(scratch, p_content_ + prev_size, added);
+	memmove(rows_begin + added, rows_begin, prev_size - data_start_);
+	memcpy(rows_begin, scratch, added);
+	FREE(scratch);
+
+	return (0);
+}
+
+/*
+ * Move offset_ from row_offset_ to the end of that row without decoding it.
+ * Only the field types/sizes of stRow are consulted.
+ *
+ * Returns 0 on success, -1 when no buffer is attached, -2 when row_offset_
+ * is not below data_size_, or -100 through the ERROR_RET label (presumably
+ * reached from the SKIP_SIZE/GET_VALUE macros, defined outside this view).
+ */
+int RawData::skip_row(const RowValue &stRow)
+{
+	if (p_content_ == NULL || handle_ == INVALID_HANDLE) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "rawdata not init yet");
+		return (-1);
+	}
+
+	offset_ = row_offset_;
+	if (!(offset_ < data_size_)) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "already at end of data");
+		return (-2);
+	}
+
+	SKIP_SIZE(sizeof(unsigned char)); // row flag byte
+
+	for (int j = key_index_ + 1; j <= stRow.num_fields(); ++j) {
+		// discarded fields are not stored, so there is nothing to skip
+		if (stRow.table_definition()->is_discard(j))
+			continue;
+
+		switch (stRow.field_type(j)) {
+		case DField::Signed:
+		case DField::Unsigned:
+			if (stRow.field_size(j) <= (int)sizeof(int32_t)) {
+				SKIP_SIZE(sizeof(int32_t));
+			} else {
+				SKIP_SIZE(sizeof(int64_t));
+			}
+			break;
+
+		case DField::Float:
+			if (stRow.field_size(j) <= (int)sizeof(float)) {
+				SKIP_SIZE(sizeof(float));
+			} else {
+				SKIP_SIZE(sizeof(double));
+			}
+			break;
+
+		case DField::String:
+		case DField::Binary:
+		default: {
+			// read the int length prefix, then skip the payload
+			int iLen;
+			GET_VALUE(iLen, int);
+			SKIP_SIZE(iLen);
+			break;
+		}
+		}
+	}
+
+	return (0);
+
+ERROR_RET:
+	snprintf(err_message_, sizeof(err_message_), "skip row error");
+	return (-100);
+}
+
+/*
+ * Replace the row that starts at row_offset_ with stRow.
+ *
+ * stRow   - new row contents
+ * isDirty - when true the row flag becomes OPER_UPDATE, otherwise the flag
+ *           currently stored in the row is kept
+ *
+ * If the new row is larger, the chunk is re-allocated first and the rows
+ * that follow are shifted up; otherwise the row is rewritten in place, the
+ * following rows are shifted down and the chunk is shrunk (best effort).
+ *
+ * Returns 0 on success.  On any failure offset_ is restored to the value it
+ * had on entry and one of these is returned: the skip_row() error code,
+ * EC_NO_MEM (ReAlloc failed; need_size() then reports the required size),
+ * -ENOMEM (temporary backup buffer), or -1 (encode_row() failed).
+ */
+int RawData::replace_cur_row(const RowValue &stRow, bool isDirty)
+{
+	int iRet = 0;
+	ALLOC_SIZE_T uiOldOffset;
+	ALLOC_SIZE_T uiNewRowSize;
+	ALLOC_SIZE_T uiCurRowSize;
+	ALLOC_SIZE_T uiNextRowsOffset;
+	ALLOC_SIZE_T uiNextRowsSize;
+
+	uiOldOffset = offset_;
+	// skip_row() leaves offset_ at the end of the current row
+	if ((iRet = skip_row(stRow)) != 0) {
+		goto ERROR_RET;
+	}
+
+	unsigned char uchRowFlag;
+	GET_VALUE_AT_OFFSET(uchRowFlag, unsigned char, row_offset_);
+	if (isDirty)
+		uchRowFlag = OPER_UPDATE;
+
+	uiNewRowSize = calc_row_size(stRow, key_index_);
+	uiCurRowSize = offset_ - row_offset_;
+	uiNextRowsOffset = offset_;
+	uiNextRowsSize = data_size_ - offset_;
+
+	if (uiNewRowSize > uiCurRowSize) {
+		// enlarge buffer
+		MEM_HANDLE_T hTmp = mallocator_->ReAlloc(
+			handle_, data_size_ + uiNewRowSize - uiCurRowSize);
+		if (hTmp == INVALID_HANDLE) {
+			snprintf(err_message_, sizeof(err_message_),
+				 "realloc error");
+			need_new_bufer_size =
+				data_size_ + uiNewRowSize - uiCurRowSize;
+			iRet = EC_NO_MEM;
+			goto ERROR_RET;
+		}
+		handle_ = hTmp;
+		size_ = mallocator_->chunk_size(handle_);
+		p_content_ = Pointer<char>();
+
+		// move data
+		if (uiNextRowsSize > 0)
+			memmove(p_content_ + uiNextRowsOffset +
+					(uiNewRowSize - uiCurRowSize),
+				p_content_ + uiNextRowsOffset, uiNextRowsSize);
+
+		// copy new row
+		offset_ = row_offset_;
+		iRet = encode_row(stRow, uchRowFlag, false);
+		if (iRet != 0) {
+			// undo the shift so the following rows are intact again
+			if (uiNextRowsSize > 0)
+				memmove(p_content_ + uiNextRowsOffset,
+					p_content_ + uiNextRowsOffset +
+						(uiNewRowSize - uiCurRowSize),
+					uiNextRowsSize);
+			iRet = -1;
+			goto ERROR_RET;
+		}
+
+		// encode_row() counted the new row on top of the old one
+		row_count_--;
+		data_size_ -= uiCurRowSize;
+	} else {
+		// back up old row
+		void *pTmpBuf = MALLOC(uiCurRowSize);
+		if (pTmpBuf == NULL) {
+			snprintf(err_message_, sizeof(err_message_),
+				 "malloc error: %m");
+			// leave through ERROR_RET so offset_ is restored like
+			// on every other failure path
+			iRet = -ENOMEM;
+			goto ERROR_RET;
+		}
+		memmove(pTmpBuf, p_content_ + row_offset_, uiCurRowSize);
+
+		// copy new row
+		offset_ = row_offset_;
+		iRet = encode_row(stRow, uchRowFlag, false);
+		if (iRet != 0) {
+			memmove(p_content_ + row_offset_, pTmpBuf,
+				uiCurRowSize);
+			FREE(pTmpBuf);
+			iRet = -1;
+			goto ERROR_RET;
+		}
+
+		// move data
+		if (uiNextRowsSize > 0 && offset_ != uiNextRowsOffset)
+			memmove(p_content_ + offset_,
+				p_content_ + uiNextRowsOffset, uiNextRowsSize);
+		FREE(pTmpBuf);
+
+		// shorten buffer: encode_row() has already added uiNewRowSize
+		// to data_size_, so the final size is data_size_ minus the
+		// old row.  A failed shrink is harmless and ignored.
+		MEM_HANDLE_T hTmp = mallocator_->ReAlloc(
+			handle_, data_size_ - uiCurRowSize);
+		if (hTmp != INVALID_HANDLE) {
+			handle_ = hTmp;
+			size_ = mallocator_->chunk_size(handle_);
+			p_content_ = Pointer<char>();
+		}
+
+		row_count_--;
+		data_size_ -= uiCurRowSize;
+	}
+
+	set_data_size();
+	set_row_count();
+
+	return (0);
+
+ERROR_RET:
+	offset_ = uiOldOffset;
+	return (iRet);
+}
+
+/*
+ * Remove the row that starts at row_offset_ by sliding all following rows
+ * down over it.  stRow is only passed to skip_row() to find the row's end.
+ *
+ * Returns 0 on success; on a skip_row() failure offset_ is restored and the
+ * skip_row() error code is returned.
+ */
+int RawData::delete_cur_row(const RowValue &stRow)
+{
+	const ALLOC_SIZE_T saved_offset = offset_;
+
+	const int rc = skip_row(stRow);
+	if (rc != 0) {
+		offset_ = saved_offset;
+		return (rc);
+	}
+
+	// offset_ now points just past the row being deleted
+	const ALLOC_SIZE_T tail_bytes = data_size_ - offset_;
+	memmove(p_content_ + row_offset_, p_content_ + offset_, tail_bytes);
+
+	data_size_ -= (offset_ - row_offset_);
+	--row_count_;
+	set_data_size();
+	set_row_count();
+
+	// the next row, if any, now starts where the deleted one did
+	offset_ = row_offset_;
+	return (rc);
+}
+
+/*
+ * Drop every row: data size and both cursors go back to data_start_, the
+ * row count becomes 0 and the remembered "needed size" is cleared.
+ * Always returns 0.
+ */
+int RawData::delete_all_rows()
+{
+	need_new_bufer_size = 0;
+	row_count_ = 0;
+
+	row_offset_ = data_start_;
+	data_size_ = data_start_;
+	offset_ = data_size_;
+
+	set_data_size();
+	set_row_count();
+
+	return (0);
+}
+
+/*
+ * Overwrite the flag byte of the row at row_offset_ with uchFlag.
+ * Returns 0 on success, -1 when row_offset_ is not below data_size_.
+ */
+int RawData::set_cur_row_flag(unsigned char uchFlag)
+{
+	if (row_offset_ < data_size_) {
+		unsigned char *flag_byte =
+			(unsigned char *)(p_content_ + row_offset_);
+		*flag_byte = uchFlag;
+		return (0);
+	}
+
+	snprintf(err_message_, sizeof(err_message_), "no more rows");
+	return (-1);
+}
+
+/*
+ * Copy the reference's current row (from its row_offset_ up to its offset_)
+ * to this object's offset_, after expanding the chunk by that many bytes.
+ * Returns 0 on success or the expand_chunk() error code.
+ */
+int RawData::copy_row()
+{
+	const ALLOC_SIZE_T row_bytes =
+		p_reference_->offset_ - p_reference_->row_offset_;
+
+	const int rc = expand_chunk(row_bytes);
+	if (rc != 0)
+		return (rc);
+
+	const char *src = p_reference_->p_content_ + p_reference_->row_offset_;
+	memcpy(p_content_ + offset_, src, row_bytes);
+
+	offset_ += row_bytes;
+	data_size_ += row_bytes;
+	++row_count_;
+
+	set_data_size();
+	set_row_count();
+
+	return (0);
+}
+
+/*
+ * Replace this object's content with a byte copy of the reference's data,
+ * then re-attach to the own handle so the cached fields are re-read.
+ * Returns 0 on success or the re_alloc_chunk()/do_attach() error code.
+ */
+int RawData::copy_all()
+{
+	const ALLOC_SIZE_T total_bytes = p_reference_->data_size_;
+
+	int rc = re_alloc_chunk(total_bytes);
+	if (rc == 0) {
+		memcpy(p_content_, p_reference_->p_content_, total_bytes);
+		rc = do_attach(handle_);
+	}
+
+	return (rc);
+}
+
+/*
+ * Append uiLen bytes of already encoded rows (uiNRows of them) at the end
+ * of the data, after expanding the chunk by uiLen.
+ * Returns 0 on success or the expand_chunk() error code.
+ */
+int RawData::append_n_records(unsigned int uiNRows, const char *pchData,
+			      const unsigned int uiLen)
+{
+	const int rc = expand_chunk(uiLen);
+	if (rc != 0)
+		return (rc);
+
+	char *tail = p_content_ + data_size_;
+	memcpy(tail, pchData, uiLen);
+
+	row_count_ += uiNRows;
+	data_size_ += uiLen;
+
+	set_data_size();
+	set_row_count();
+
+	return (0);
+}
+
+/*
+ * Set last-access, last-update and creation time to the current relative
+ * hour, both in the cached members and in the three consecutive uint16_t
+ * slots at time_stamp_offset_ inside the chunk.
+ *
+ * Does nothing when no buffer is attached or when the three slots would not
+ * fit inside the chunk.
+ */
+void RawData::init_timp_stamp()
+{
+	if (unlikely(NULL == p_content_)) {
+		return;
+	}
+
+	// Bounds-check the region that is actually written below
+	// (time_stamp_offset_, as attach_time_stamp() does), not the row
+	// cursor offset_.
+	if (unlikely(time_stamp_offset_ + 3 * sizeof(uint16_t) > size_)) {
+		return;
+	}
+	uint16_t dwCurHour = RELATIVE_HOUR_CALCULATOR->get_relative_hour();
+
+	latest_request_time_ = dwCurHour;
+	latest_update_time_ = dwCurHour;
+	create_time_ = dwCurHour;
+
+	// slot order in the chunk: last access, last update, creation
+	*(uint16_t *)(p_content_ + time_stamp_offset_) = dwCurHour;
+	*(uint16_t *)(p_content_ + time_stamp_offset_ + sizeof(uint16_t)) =
+		dwCurHour;
+	*(uint16_t *)(p_content_ + time_stamp_offset_ + 2 * sizeof(uint16_t)) =
+		dwCurHour;
+}
+
+/*
+ * Load last-access, last-update and creation time from the three uint16_t
+ * slots at time_stamp_offset_ into the cached members.  Does nothing when no
+ * buffer is attached or the slots would not fit inside the chunk.
+ */
+void RawData::attach_time_stamp()
+{
+	if (unlikely(NULL == p_content_ ||
+		     time_stamp_offset_ + 3 * sizeof(uint16_t) > size_))
+		return;
+
+	const uint16_t *stamps = (const uint16_t *)(p_content_ + time_stamp_offset_);
+	latest_request_time_ = stamps[0];
+	latest_update_time_ = stamps[1];
+	create_time_ = stamps[2];
+}
+/*
+ * Refresh the last-access time (first timestamp slot) with the current
+ * relative hour.  Does nothing when no buffer is attached or the slot would
+ * not fit inside the chunk.
+ */
+void RawData::update_last_access_time_by_hour()
+{
+	if (unlikely(NULL == p_content_ ||
+		     time_stamp_offset_ + sizeof(uint16_t) > size_))
+		return;
+
+	uint16_t *slot = (uint16_t *)(p_content_ + time_stamp_offset_);
+	latest_request_time_ = RELATIVE_HOUR_CALCULATOR->get_relative_hour();
+	*slot = latest_request_time_;
+}
+/*
+ * Refresh the last-update time (second timestamp slot) with the current
+ * relative hour.  Does nothing when no buffer is attached or the slot would
+ * not fit inside the chunk.
+ */
+void RawData::update_last_update_time_by_hour()
+{
+	if (unlikely(NULL == p_content_ ||
+		     time_stamp_offset_ + 2 * sizeof(uint16_t) > size_))
+		return;
+
+	uint16_t *slot = (uint16_t *)(p_content_ + time_stamp_offset_ +
+				      sizeof(uint16_t));
+	latest_update_time_ = RELATIVE_HOUR_CALCULATOR->get_relative_hour();
+	*slot = latest_update_time_;
+}
+// Cached creation time (a relative hour value), widened to 32 bits.
+uint32_t RawData::get_create_time_by_hour()
+{
+	const uint32_t created_at = create_time_;
+	return created_at;
+}
+// Cached last-access time (a relative hour value), widened to 32 bits.
+uint32_t RawData::get_last_access_time_by_hour()
+{
+	const uint32_t accessed_at = latest_request_time_;
+	return accessed_at;
+}
+
+// Cached last-update time (a relative hour value), widened to 32 bits.
+uint32_t RawData::get_last_update_time_by_hour()
+{
+	const uint32_t updated_at = latest_update_time_;
+	return updated_at;
+}
+// Cached get-request counter, widened to 32 bits.
+uint32_t RawData::get_select_op_count()
+{
+	const uint32_t select_count = get_request_count_;
+	return select_count;
+}
+
+/*
+ * Increment the get-request counter, both in the cached member and in the
+ * byte at get_request_count_offset_ inside the chunk.
+ *
+ * Does nothing when no buffer is attached, when the counter has already
+ * reached BTYE_MAX_VALUE, or when the counter byte would lie outside the
+ * chunk.
+ */
+void RawData::inc_select_count()
+{
+	// same guard as the timestamp updaters: never write through a NULL
+	// content pointer
+	if (unlikely(NULL == p_content_)) {
+		return;
+	}
+	if (unlikely(get_request_count_ >= BTYE_MAX_VALUE)) {
+		return;
+	}
+	if (unlikely(get_request_count_offset_ + sizeof(uint8_t) > size_)) {
+		return;
+	}
+	get_request_count_++;
+	*(uint8_t *)(p_content_ + get_request_count_offset_) =
+		get_request_count_;
+}
+
+// Table definition currently held by this object.
+DTCTableDefinition *RawData::get_node_table_def()
+{
+	DTCTableDefinition *const table_def = table_definition_;
+	return table_def;
+}

+ 470 - 0
src/core/raw/raw_data.h

@@ -0,0 +1,470 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef RAW_DATA_H
+#define RAW_DATA_H
+
+#include "mem/pt_malloc.h"
+#include "global.h"
+#include "field/field.h"
+#include "data/col_expand.h"
+#include "table/table_def_manager.h"
+#include "node/node.h"
+
+#define PRE_DECODE_ROW 1
+
+typedef enum _EnumDataType { // data-structure kind (see RawFormat::data_type_)
+	DATA_TYPE_RAW, // flat (raw) data layout
+	DATA_TYPE_TREE_ROOT, // root node of a tree
+	DATA_TYPE_TREE_NODE // node of a tree
+} EnumDataType;
+
+typedef enum _enum_oper_type_ { // per-row operation flag values
+	OPER_DIRTY = 0x02, // cover INSERT, DELETE, UPDATE (bit mask, tested with '&')
+	OPER_SELECT = 0x30,
+	OPER_INSERT_OLD = 0x31, // old stuff, same as SELECT aka useless
+	OPER_UPDATE = 0x32,
+	OPER_DELETE_NA = 0x33, // async DELETE require quite a lot change
+	OPER_FLUSH = 0x34, // useless too, same as SELECT
+	OPER_RESV1 = 0x35,
+	OPER_INSERT = 0x36,
+	OPER_RESV2 = 0x37,
+} TOperType;
+
+struct RawFormat { // packed header layout, followed by the key and the rows
+	unsigned char data_type_; // data type, an EnumDataType value
+	uint32_t data_size_; // total data size
+	uint32_t row_count_; // number of rows
+	uint8_t get_request_count_; // number of get requests
+	uint16_t latest_request_time_; // latest access time
+	uint16_t latest_update_time_; // latest update time
+	uint16_t create_time_; // creation time
+	char p_key_[0]; // key
+	char p_rows_data_[0]; // row data
+} __attribute__((packed));
+
+// Note: mutating operations may change the handle, so callers must re-check and re-save it.
+class RawData {
+    private:
+	char *p_content_; // note: the address may change because of realloc
+	uint32_t data_size_; // total data size, including data_type, data_size, rowcnt, key, rows, etc.
+	uint32_t row_count_;
+	uint8_t key_index_;
+	int key_size_; // key format, see key_format(): 0 = variable length, otherwise the fixed length
+	int m_iLAId;
+	int m_iLCmodId;
+	int expire_id_;
+	int table_index_;
+
+	ALLOC_SIZE_T key_start_; // offset of the key inside p_content_ (see key())
+	ALLOC_SIZE_T data_start_; // start offset of the rows (see data_start())
+	ALLOC_SIZE_T row_offset_; // offset where the current row begins
+	ALLOC_SIZE_T offset_; // current read/write position inside p_content_
+	ALLOC_SIZE_T m_uiLAOffset; // where update_lastacc() writes; 0 disables the write
+	int get_request_count_offset_;
+	int time_stamp_offset_;
+	uint8_t get_request_count_;
+	uint16_t latest_request_time_;
+	uint16_t latest_update_time_;
+	uint16_t create_time_;
+	ALLOC_SIZE_T need_new_bufer_size; // size required by the most recent failed allocation
+
+	MEM_HANDLE_T handle_;
+	uint64_t size_;
+	MallocBase *mallocator_;
+	int auto_destory_;
+
+	RawData *p_reference_; // source object for copy_row()/copy_all(), see set_refrence()
+	char err_message_[200];
+
+	DTCTableDefinition *table_definition_;
+
+    protected:
+	template <class T> T *Pointer(void) const
+	{
+		return reinterpret_cast<T *>(
+			mallocator_->handle_to_ptr(handle_));
+	}
+
+	int set_data_size();
+	int set_row_count();
+	int expand_chunk(ALLOC_SIZE_T expand_size);
+	int re_alloc_chunk(ALLOC_SIZE_T tSize);
+	int skip_row(const RowValue &stRow);
+	int encode_row(const RowValue &stRow, unsigned char uchOp,
+		       bool expendBuf = true);
+
+    public:
+	/*************************************************
+	  Description:    Constructor
+	  Input:          pstMalloc	memory allocator
+	                     iAutoDestroy	whether to free the memory automatically on destruction
+	  Output:         
+	  Return:         
+	*************************************************/
+	RawData(MallocBase *pstMalloc, int iAutoDestroy = 0);
+
+	~RawData();
+
+	void change_mallocator(MallocBase *pstMalloc)
+	{
+		mallocator_ = pstMalloc;
+	}
+
+	const char *get_err_msg()
+	{
+		return err_message_;
+	}
+
+	/*************************************************
+	  Description:	Allocate a new chunk of memory and initialize it
+	  Input:		 uchKeyIdx	index in the table of the field used as key
+				iKeySize	key format: 0 means variable length, non-zero is the fixed length
+				pchKey	the formatted key; for a variable-length key byte 0 holds the length
+				uiDataSize	size of the data, used to allocate a large enough chunk in one go. If set to 0, the chunk is enlarged by realloc when rows are inserted
+	  Output:		
+	  Return:		0 on success, non-zero on failure
+	*************************************************/
+	int init(uint8_t uchKeyIdx, int iKeySize, const char *pchKey,
+		 ALLOC_SIZE_T uiDataSize = 0, int laid = -1, int expireid = -1,
+		 int nodeIdx = -1);
+	int do_init(const char *pchKey, ALLOC_SIZE_T uiDataSize = 0);
+
+	/*************************************************
+	  Description:	Attach to a chunk of memory that is already formatted
+	  Input:		hHandle	handle of the memory
+				uchKeyIdx	index in the table of the field used as key
+				iKeySize	key format: 0 means variable length, non-zero is the fixed length
+	  Output:		
+	  Return:		0 on success, non-zero on failure
+	*************************************************/
+	int do_attach(MEM_HANDLE_T hHandle, uint8_t uchKeyIdx, int iKeySize,
+		      int laid = -1, int lastcmod = -1, int expireid = -1);
+	int do_attach(MEM_HANDLE_T hHandle);
+
+	/*************************************************
+	  Description:	Get the handle of the memory chunk
+	  Input:		
+	  Output:		
+	  Return:		The handle. Note: any mutating operation may change the handle, so re-check and re-save it
+	*************************************************/
+	MEM_HANDLE_T get_handle()
+	{
+		return handle_;
+	}
+
+	const char *get_addr() const
+	{
+		return p_content_;
+	}
+
+	/*************************************************
+	  Description:	Set a reference, used when calling CopyRow() or CopyAll()
+	  Input:		pstRef	pointer to the reference
+	  Output:		
+	  Return:		
+	*************************************************/
+	void set_refrence(RawData *pstRef)
+	{
+		p_reference_ = pstRef;
+	}
+
+	/*************************************************
+	  Description:	Size of all the memory, including key, rows, etc.
+	  Input:		
+	  Output:		
+	  Return:		size of all the memory
+	*************************************************/
+	uint32_t data_size() const
+	{
+		return data_size_;
+	}
+
+	/*************************************************
+	  Description:	Start offset of the rows
+	  Input:		
+	  Output:		
+	  Return:		start offset of the rows
+	*************************************************/
+	uint32_t data_start() const
+	{
+		return data_start_;
+	}
+
+	/*************************************************
+	  Description:	When a memory allocation failed, return the memory size that was needed
+	  Input:		
+	  Output:		
+	  Return:		the memory size that was needed
+	*************************************************/
+	ALLOC_SIZE_T need_size()
+	{
+		return need_new_bufer_size;
+	}
+
+	/*************************************************
+	  Description:	Compute the memory size needed to insert this row
+	  Input:		stRow	row data
+	  Output:		
+	  Return:		the memory size that is needed
+	*************************************************/
+	ALLOC_SIZE_T calc_row_size(const RowValue &stRow, int keyIndex);
+
+	/*************************************************
+	  Description:	Get the formatted key
+	  Input:		
+	  Output:		
+	  Return:		the formatted key
+	*************************************************/
+	const char *key() const
+	{
+		return p_content_ ? (p_content_ + key_start_) : NULL;
+	}
+	char *key()
+	{
+		return p_content_ ? (p_content_ + key_start_) : NULL;
+	}
+
+	/*************************************************
+	  Description:	Get the key format
+	  Input:		
+	  Output:		
+	  Return:		0 for a variable-length key, the fixed length for a fixed-length key
+	*************************************************/
+	int key_format() const
+	{
+		return key_size_;
+	}
+
+	/*************************************************
+	  Description:	Get the actual length of the key
+	  Input:		
+	  Output:		
+	  Return:		actual length of the key
+	*************************************************/
+	int key_size();
+
+	unsigned int total_rows() const
+	{
+		return row_count_;
+	}
+	void rewind(void)
+	{
+		offset_ = data_start_;
+		row_offset_ = data_start_;
+	}
+
+	/*************************************************
+	  Description:	Destroy and free the memory
+	  Input:		
+	  Output:		
+	  Return:		0 on success, non-zero on failure
+	*************************************************/
+	int destory();
+
+	/*************************************************
+	  Description:	Release surplus memory (usually called once after deleting some rows)
+	  Input:		
+	  Output:		
+	  Return:		0 on success, non-zero on failure
+	*************************************************/
+	int strip_mem();
+
+	/*************************************************
+	  Description:	Read one row of data
+	  Input:		
+	  Output:		stRow	receives the row data
+				uchRowFlags	row flags, e.g. whether the row is dirty
+				iDecodeFlag	whether this is only a pre-read that does not move the fetch_row pointer
+	  Return:		0 on success, non-zero on failure
+	*************************************************/
+	int decode_row(RowValue &stRow, unsigned char &uchRowFlags,
+		       int iDecodeFlag = 0);
+
+	/*************************************************
+	  Description:	Insert one row of data
+	  Input:		stRow	row data to insert
+	  Output:		
+				byFirst	whether to insert at the very front; by default the row is appended at the end
+				isDirty	whether the data is dirty
+	  Return:		0 on success, non-zero on failure
+	*************************************************/
+	int insert_row(const RowValue &stRow, bool byFirst, bool isDirty);
+
+	/*************************************************
+	  Description:	Insert one row of data
+	  Input:		stRow	row data to insert
+	  Output:		
+				byFirst	whether to insert at the very front; by default the row is appended at the end
+				uchOp	flag of the row
+	  Return:		0 on success, non-zero on failure
+	*************************************************/
+	int insert_row_flag(const RowValue &stRow, bool byFirst,
+			    unsigned char uchOp);
+
+	/*************************************************
+	  Description:	Insert several rows of data
+	  Input:		uiNRows	number of rows
+				stRow	row data to insert
+	  Output:		
+				byFirst	whether to insert at the very front; by default the rows are appended at the end
+				isDirty	whether the data is dirty
+	  Return:		0 on success, non-zero on failure
+	*************************************************/
+	int insert_n_rows(unsigned int uiNRows, const RowValue *pstRow,
+			  bool byFirst, bool isDirty);
+
+	/*************************************************
+	  Description:	Replace the current row with the given data
+	  Input:		stRow	new row data
+	  Output:		
+				isDirty	whether the data is dirty
+	  Return:		0 on success, non-zero on failure
+	*************************************************/
+	int replace_cur_row(const RowValue &stRow, bool isDirty);
+
+	/*************************************************
+	  Description:	Delete the current row
+	  Input:		stRow	only the row's field type information etc. is used; the actual data is not needed
+	  Output:		
+	  Return:		0 on success, non-zero on failure
+	*************************************************/
+	int delete_cur_row(const RowValue &stRow);
+
+	/*************************************************
+	  Description:	Delete all rows
+	  Input:		
+	  Output:		
+	  Return:		0 on success, non-zero on failure
+	*************************************************/
+	int delete_all_rows();
+
+	/*************************************************
+	  Description:	Set the flag of the current row
+	  Input:		uchFlag	flag of the row
+	  Output:		
+	  Return:		0 on success, non-zero on failure
+	*************************************************/
+	int set_cur_row_flag(unsigned char uchFlag);
+
+	/*************************************************
+	  Description:	Copy the current row from the reference to the end of the local buffer
+	  Input:		
+	  Output:		
+	  Return:		0 on success, non-zero on failure
+	*************************************************/
+	int copy_row();
+
+	/*************************************************
+	  Description:	Replace the local data with the data of the reference
+	  Input:		
+	  Output:		
+	  Return:		0 on success, non-zero on failure
+	*************************************************/
+	int copy_all();
+
+	/*************************************************
+	  Description:	Append N rows of already formatted data at the end
+	  Input:		
+	  Output:		
+	  Return:		0 on success, non-zero on failure
+	*************************************************/
+	int append_n_records(unsigned int uiNRows, const char *pchData,
+			     const unsigned int uiLen);
+
+	/*************************************************
+	  Description:	Update the last-access timestamp
+	  Input:	timestamp	
+	  Output:		
+	  Return:
+	*************************************************/
+	void update_lastacc(uint32_t now)
+	{
+		if (m_uiLAOffset > 0)
+			*(uint32_t *)(p_content_ + m_uiLAOffset) = now;
+	}
+	int get_expire_time(DTCTableDefinition *t, uint32_t &expire);
+	/*************************************************
+	  Description:	Get the last modification time
+	  Input:	timestamp	
+	  Output:		
+	  Return:
+	*************************************************/
+	int get_lastcmod(uint32_t &lastcmod);
+	int check_size(MEM_HANDLE_T hHandle, uint8_t uchKeyIdx, int iKeySize,
+		       int size);
+
+	/*************************************************
+	  Description:	Initialize the timestamps, which consist of three parts:
+	  last access time, last update time and creation time
+	  Input:	timestamp (number of hours counted from some absolute starting point)
+	  Although the name says Update, it is actually called only once
+	  tomchen
+	*************************************************/
+	void init_timp_stamp();
+	/*************************************************
+	  Description:	Update the last access time of the node
+	  Input:	timestamp (number of hours counted from some absolute starting point)
+	   tomchen
+	*************************************************/
+	void update_last_access_time_by_hour();
+	/*************************************************
+	  Description:	Update the last update time of the node
+	  Input:	timestamp (number of hours counted from some absolute starting point)
+	   tomchen
+	*************************************************/
+	void update_last_update_time_by_hour();
+	/*************************************************
+	  Description:	Increase the number of select requests received by the node
+	 tomchen
+	*************************************************/
+	void inc_select_count();
+	/*************************************************
+	  Description:	Get the creation time of the node
+	 tomchen
+	*************************************************/
+	uint32_t get_create_time_by_hour();
+	/*************************************************
+	  Description:	Get the last access time of the node
+	 tomchen
+	*************************************************/
+	uint32_t get_last_access_time_by_hour();
+	/*************************************************
+	  Description:	Get the last update time of the node
+	 tomchen
+	*************************************************/
+	uint32_t get_last_update_time_by_hour();
+	/*************************************************
+	  Description:	Get the number of select operations performed on the node
+	 tomchen
+	*************************************************/
+	uint32_t get_select_op_count();
+	/*************************************************
+	  Description:	Attach to the timestamps
+	 tomchen
+	*************************************************/
+	void attach_time_stamp();
+
+	DTCTableDefinition *get_node_table_def();
+};
+
+// Actual key length: the fixed length when key_size_ is positive, otherwise
+// one length byte plus the length stored in the first byte of the key.
+inline int RawData::key_size()
+{
+	if (key_size_ > 0)
+		return key_size_;
+
+	return (sizeof(char) + *(unsigned char *)key());
+}
+
+#endif

+ 1181 - 0
src/core/raw/raw_data_process.cc

@@ -0,0 +1,1181 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "raw_data_process.h"
+#include "global.h"
+#include "log/log.h"
+#include "sys_malloc.h"
+#include "task/task_pkey.h"
+#include "buffer_flush.h"
+#include "algorithm/relative_hour_calculator.h"
+
+DTC_USING_NAMESPACE
+
+/*
+ * Bind the processor to an allocator, a table definition and a buffer pond,
+ * copy the update mode and fetch the two history statistic samples.
+ */
+RawDataProcess::RawDataProcess(MallocBase *pstMalloc,
+			       DTCTableDefinition *p_table_definition_,
+			       BufferPond *pstPool,
+			       const UpdateMode *pstUpdateMode)
+	: raw_data_(pstMalloc), p_table_(p_table_definition_),
+	  p_mallocator_(pstMalloc), p_buffer_pond_(pstPool)
+{
+	nodeSizeLimit = 0;
+
+	// keep a private byte copy of the caller's update mode
+	memcpy(&update_mode_, pstUpdateMode, sizeof(update_mode_));
+
+	history_datasize = g_stat_mgr.get_sample(DATA_SIZE_HISTORY_STAT);
+	history_rowsize = g_stat_mgr.get_sample(ROW_SIZE_HISTORY_STAT);
+}
+
+RawDataProcess::~RawDataProcess()
+{ // intentionally empty: no explicit cleanup is performed here
+}
+
+/*
+ * Create a fresh raw-data chunk for ptrKey, store its handle in the node
+ * and, when affected_data is given, initialize that object for the same key.
+ *
+ * Returns DTC_CODE_SUCCESS, -1 when the node's raw data cannot be
+ * initialized, -2 when affected_data cannot be initialized.
+ */
+int RawDataProcess::init_data(Node *p_node, RawData *affected_data,
+			      const char *ptrKey)
+{
+	int rc = raw_data_.do_init(ptrKey, 0);
+	if (rc != 0) {
+		log4cplus_error("raw-data init error: %d,%s", rc,
+				raw_data_.get_err_msg());
+		return (-1);
+	}
+	p_node->vd_handle() = raw_data_.get_handle();
+
+	if (affected_data == NULL)
+		return DTC_CODE_SUCCESS;
+
+	rc = affected_data->do_init(ptrKey, 0);
+	if (rc != 0) {
+		log4cplus_error("raw-data init error: %d,%s", rc,
+				affected_data->get_err_msg());
+		return (-2);
+	}
+
+	return DTC_CODE_SUCCESS;
+}
+
+/*
+ * Attach raw_data_ to the chunk referenced by the node and, when
+ * affected_data is given, initialize that object with the node's key.
+ *
+ * Returns DTC_CODE_SUCCESS, -1 when attaching fails, -2 when affected_data
+ * cannot be initialized.
+ */
+int RawDataProcess::attach_data(Node *p_node, RawData *affected_data)
+{
+	int rc = raw_data_.do_attach(p_node->vd_handle());
+	if (rc != DTC_CODE_SUCCESS) {
+		log4cplus_error("raw-data attach[handle:" UINT64FMT
+				"] error: %d,%s",
+				p_node->vd_handle(), rc,
+				raw_data_.get_err_msg());
+		return (-1);
+	}
+
+	if (affected_data == NULL)
+		return DTC_CODE_SUCCESS;
+
+	rc = affected_data->do_init(raw_data_.key(), 0);
+	if (rc != DTC_CODE_SUCCESS) {
+		log4cplus_error("raw-data init error: %d,%s", rc,
+				affected_data->get_err_msg());
+		return (-2);
+	}
+
+	return DTC_CODE_SUCCESS;
+}
+
+/*
+ * Attach to the node's data and copy all of it into pstRows.
+ * The row counters of this processor are reset to 0 first.
+ *
+ * p_node  - node whose data is read
+ * pstRows - destination; must not be NULL
+ *
+ * Returns DTC_CODE_SUCCESS, -1 when attaching fails, -2 when pstRows is
+ * NULL or the copy fails.
+ */
+int RawDataProcess::get_node_all_rows_count(Node *p_node, RawData *pstRows)
+{
+	int iRet;
+
+	rows_count_ = 0;
+	dirty_rows_count_ = 0;
+
+	// attach_data() tolerates a NULL target, the copy below does not
+	if (pstRows == NULL) {
+		log4cplus_error("copy data error: destination raw-data is NULL");
+		return (-2);
+	}
+
+	iRet = attach_data(p_node, pstRows);
+	if (iRet != DTC_CODE_SUCCESS) {
+		log4cplus_error("attach data error: %d", iRet);
+		return (-1);
+	}
+
+	pstRows->set_refrence(&raw_data_);
+	// keep the result so the log reports the real error code
+	iRet = pstRows->copy_all();
+	if (iRet != 0) {
+		log4cplus_error("copy data error: %d,%s", iRet,
+				pstRows->get_err_msg());
+		return (-2);
+	}
+
+	return DTC_CODE_SUCCESS;
+}
+
+/*
+ * Re-encode the node's rows with the new table definition obtained from
+ * TableDefinitionManager and make the node point at the re-encoded chunk.
+ *
+ * Steps: decode every row with the node's own table definition, copy it
+ * into a row of the new definition (pre-filled with default values), build
+ * the result in a temporary system-malloc buffer, copy that buffer into a
+ * chunk from p_mallocator_ (purging once on EC_NO_MEM), then destroy the
+ * old chunk.
+ *
+ * Returns DTC_CODE_SUCCESS (also when the node already uses the new
+ * definition), -1 on attach/decode errors, -2 when the temporary buffer
+ * cannot be built, -3 when the final chunk cannot be allocated or filled.
+ */
+int RawDataProcess::expand_node(DTCJobOperation &job_op, Node *p_node)
+{
+	int iRet;
+	DTCTableDefinition *stpNodeTab, *stpTaskTab;
+	RowValue *stpNodeRow, *stpTaskRow;
+
+	// no need to check the expand status here, the caller already did
+
+	// save node to stack as new version
+	iRet = attach_data(p_node, NULL);
+	if (iRet != DTC_CODE_SUCCESS) {
+		log4cplus_error("attach data error: %d", iRet);
+		return -1;
+	}
+	unsigned int uiTotalRows = raw_data_.total_rows();
+	stpNodeTab = raw_data_.get_node_table_def();
+	stpTaskTab = TableDefinitionManager::instance()->get_new_table_def();
+	if (stpTaskTab == stpNodeTab) {
+		log4cplus_info(
+			"expand one node which is already new version, pay attention, treat as success");
+		return DTC_CODE_SUCCESS;
+	}
+	RowValue stNewRow(stpTaskTab);
+	RowValue stNewNodeRow(stpNodeTab);
+	stpTaskRow = &stNewRow;
+	stpNodeRow = &stNewNodeRow;
+	RawData stNewTmpRawData(&g_stSysMalloc, 1);
+	iRet = stNewTmpRawData.do_init(raw_data_.key(), raw_data_.data_size());
+	if (iRet != DTC_CODE_SUCCESS) {
+		log4cplus_error(
+			"init raw-data struct error, ret = %d, err = %s", iRet,
+			stNewTmpRawData.get_err_msg());
+		return -2;
+	}
+	for (unsigned int i = 0; i < uiTotalRows; ++i) {
+		unsigned char uchRowFlags;
+		// keep the result so the log reports the real error code
+		iRet = raw_data_.decode_row(*stpNodeRow, uchRowFlags, 0);
+		if (iRet != 0) {
+			log4cplus_error("raw-data decode row error: %d, %s",
+					iRet, raw_data_.get_err_msg());
+			return -1;
+		}
+		// start from defaults, then overlay the old row's values
+		stpTaskRow->default_value();
+		stpTaskRow->Copy(stpNodeRow);
+		iRet = stNewTmpRawData.insert_row(
+			*stpTaskRow,
+			update_mode_.m_uchInsertOrder ? true : false, false);
+		if (0 != iRet) {
+			log4cplus_error(
+				"insert row to raw-data error: ret = %d, err = %s",
+				iRet, stNewTmpRawData.get_err_msg());
+			return -2;
+		}
+	}
+
+	// allocate new with new version
+	// NOTE(review): the first attempt asks for data_size() while the retry
+	// asks for data_size() - data_start(); confirm which one is intended.
+	RawData stTmpRawData(p_mallocator_);
+	iRet = stTmpRawData.do_init(stNewTmpRawData.key(),
+				    stNewTmpRawData.data_size());
+	if (iRet == EC_NO_MEM) {
+		if (p_buffer_pond_->try_purge_size(stTmpRawData.need_size(),
+						   *p_node) == 0)
+			iRet = stTmpRawData.do_init(
+				stNewTmpRawData.key(),
+				stNewTmpRawData.data_size() -
+					stNewTmpRawData.data_start());
+	}
+
+	if (iRet != DTC_CODE_SUCCESS) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "raw-data init error: %s", stTmpRawData.get_err_msg());
+		stTmpRawData.destory();
+		return -3;
+	}
+
+	stTmpRawData.set_refrence(&stNewTmpRawData);
+	iRet = stTmpRawData.copy_all();
+	if (iRet != DTC_CODE_SUCCESS) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "raw-data init error: %s", stTmpRawData.get_err_msg());
+		stTmpRawData.destory();
+		return -3;
+	}
+
+	// purge old
+	raw_data_.destory();
+	p_node->vd_handle() = stTmpRawData.get_handle();
+	return DTC_CODE_SUCCESS;
+}
+
+/*
+ * Free the chunk referenced by the node, subtract its row count from
+ * rows_count_ and invalidate the node's handle.
+ * Returns DTC_CODE_SUCCESS, or DTC_CODE_FAILED when attaching fails.
+ */
+int RawDataProcess::destroy_data(Node *p_node)
+{
+	const int rc = raw_data_.do_attach(p_node->vd_handle());
+	if (rc != DTC_CODE_SUCCESS) {
+		log4cplus_error("raw-data attach error: %d,%s", rc,
+				raw_data_.get_err_msg());
+		return DTC_CODE_FAILED;
+	}
+
+	// account for the rows that disappear together with the chunk
+	rows_count_ += 0LL - raw_data_.total_rows();
+
+	raw_data_.destory();
+	p_node->vd_handle() = INVALID_HANDLE;
+
+	return DTC_CODE_SUCCESS;
+}
+
+/*
+ * Replace the node's data with a copy of new_data.
+ *
+ * A new chunk is allocated from p_mallocator_ (with one purge-and-retry on
+ * EC_NO_MEM), filled from new_data, and only then is the node's previous
+ * chunk destroyed and the handle switched.  rows_count_ ends up as the row
+ * delta; non-empty results are pushed to the history statistics.
+ *
+ * Returns DTC_CODE_SUCCESS, -2 when the new chunk cannot be initialized,
+ * -3 when the copy fails.
+ */
+int RawDataProcess::do_replace_all(Node *p_node, RawData *new_data)
+{
+	log4cplus_debug("do_replace_all start ");
+
+	rows_count_ = 0;
+	dirty_rows_count_ = 0;
+
+	RawData replacement(p_mallocator_);
+
+	int rc = replacement.do_init(new_data->key(),
+				     new_data->data_size() -
+					     new_data->data_start());
+	// out of memory: try to purge, then retry once
+	if (rc == EC_NO_MEM &&
+	    p_buffer_pond_->try_purge_size(replacement.need_size(),
+					   *p_node) == 0) {
+		rc = replacement.do_init(new_data->key(),
+					 new_data->data_size() -
+						 new_data->data_start());
+	}
+
+	if (rc != 0) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "raw-data init error: %s", replacement.get_err_msg());
+		replacement.destory();
+		return (-2);
+	}
+
+	replacement.set_refrence(new_data);
+	rc = replacement.copy_all();
+	if (rc != 0) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "raw-data init error: %s", replacement.get_err_msg());
+		replacement.destory();
+		return (-3);
+	}
+
+	// the new chunk is complete: drop the old one and switch the handle
+	if (p_node->vd_handle() != INVALID_HANDLE)
+		destroy_data(p_node);
+	p_node->vd_handle() = replacement.get_handle();
+	rows_count_ += new_data->total_rows();
+
+	if (replacement.total_rows() > 0) {
+		log4cplus_debug(
+			"do_replace_all,  stat history datasize, size is %u",
+			replacement.data_size());
+		history_datasize.push(replacement.data_size());
+		history_rowsize.push(replacement.total_rows());
+	}
+	return DTC_CODE_SUCCESS;
+}
+
+/*
+ * Attach the node's raw data and read its expire time into `expire`.
+ * Returns DTC_CODE_SUCCESS, or the error code of whichever step failed.
+ */
+int RawDataProcess::get_expire_time(DTCTableDefinition *t, Node *p_node,
+				    uint32_t &expire)
+{
+	const int attach_rc = attach_data(p_node, NULL);
+	if (attach_rc != DTC_CODE_SUCCESS) {
+		log4cplus_error("attach data error: %d", attach_rc);
+		return attach_rc;
+	}
+
+	const int expire_rc = raw_data_.get_expire_time(t, expire);
+	if (expire_rc == DTC_CODE_SUCCESS)
+		return DTC_CODE_SUCCESS;
+
+	log4cplus_error("raw data get expire time error: %d", expire_rc);
+	return expire_rc;
+}
+
+/*
+ * Switch this processor and its embedded raw_data_ to another allocator.
+ */
+void RawDataProcess::change_mallocator(MallocBase *pstMalloc)
+{
+	MallocBase *const previous = p_mallocator_;
+
+	log4cplus_debug("oring mallc: %p, new mallc: %p", previous,
+			pstMalloc);
+
+	p_mallocator_ = pstMalloc;
+	raw_data_.change_mallocator(pstMalloc);
+}
+
+/*
+ * Count the rows of p_node whose flags carry OPER_DIRTY.
+ *
+ * Returns the number of dirty rows, the attach_data() error code when the
+ * node cannot be attached, or -4 when a row cannot be decoded.  job_op is
+ * not used by this implementation.
+ */
+int RawDataProcess::get_dirty_row_count(DTCJobOperation &job_op, Node *p_node)
+{
+	int rc = attach_data(p_node, NULL);
+	if (rc != DTC_CODE_SUCCESS) {
+		log4cplus_error("attach data error: %d", rc);
+		return rc;
+	}
+
+	unsigned char row_flags;
+	const unsigned int total = raw_data_.total_rows();
+	DTCTableDefinition *node_table = raw_data_.get_node_table_def();
+	RowValue scratch(node_table);
+
+	int dirty = 0;
+	for (unsigned int remaining = total; remaining > 0; --remaining) {
+		rc = raw_data_.decode_row(scratch, row_flags, 0);
+		if (rc != 0) {
+			log4cplus_error("raw-data decode row error: %d,%s",
+					rc, raw_data_.get_err_msg());
+			return (-4);
+		}
+
+		dirty += (row_flags & OPER_DIRTY) ? 1 : 0;
+	}
+
+	return dirty;
+}
+
+/*
+ * Delete every row of p_node that matches job_op's condition.
+ *
+ * affected_data is always NULL (original author's note).  When it is
+ * non-NULL, each matching row is copied into it with copy_row() before the
+ * row is removed; a copy failure is logged but does not abort the delete.
+ *
+ * rows_count_ and dirty_rows_count_ are reset to 0 and then decremented
+ * once per deleted row (dirty_rows_count_ only for rows flagged OPER_DIRTY).
+ *
+ * Returns DTC_CODE_SUCCESS, the attach_data() error code, -4 when a row
+ * cannot be decoded, or -5 when delete_cur_row() fails.
+ */
+int RawDataProcess::do_delete(DTCJobOperation &job_op, Node *p_node,
+			      RawData *affected_data)
+{
+	int iRet;
+	DTCTableDefinition *stpNodeTab, *stpTaskTab;
+	RowValue *stpNodeRow, *stpTaskRow;
+
+	log4cplus_debug("do_delete start! ");
+
+	rows_count_ = 0;
+	dirty_rows_count_ = 0;
+
+	iRet = attach_data(p_node, affected_data);
+	if (iRet != DTC_CODE_SUCCESS) {
+		log4cplus_error("attach data error: %d", iRet);
+		return (iRet);
+	}
+
+	if (affected_data != NULL)
+		affected_data->set_refrence(&raw_data_);
+	/*
+	 * When the node and the job share one table definition a single
+	 * RowValue serves both roles; otherwise rows are decoded into the
+	 * node row and copied into the task row before being compared.
+	 */
+	stpNodeTab = raw_data_.get_node_table_def();
+	stpTaskTab = job_op.table_definition();
+	RowValue stNodeRow(stpNodeTab);
+	RowValue stTaskRow(stpTaskTab);
+	if (stpNodeTab == stpTaskTab) {
+		stpNodeRow = &stTaskRow;
+		stpTaskRow = &stTaskRow;
+	} else {
+		stpNodeRow = &stNodeRow;
+		stpTaskRow = &stTaskRow;
+	}
+
+	int iAffectRows = 0;
+	unsigned char uchRowFlags;
+	unsigned int uiTotalRows = raw_data_.total_rows();
+	for (unsigned int i = 0; i < uiTotalRows; i++) {
+		iRet = raw_data_.decode_row(*stpNodeRow, uchRowFlags, 0);
+		if (iRet != DTC_CODE_SUCCESS) {
+			log4cplus_error("raw-data decode row error: %d,%s",
+					iRet, raw_data_.get_err_msg());
+			return (-4);
+		}
+		if (stpNodeTab != stpTaskTab) {
+			stpTaskRow->Copy(stpNodeRow);
+		}
+		if (job_op.compare_row(*stpTaskRow) != 0) { // row matches the delete condition
+			if (affected_data != NULL) { // copy row
+				iRet = affected_data->copy_row();
+				if (iRet != 0) {
+					log4cplus_error(
+						"raw-data copy row error: %d,%s",
+						iRet,
+						affected_data->get_err_msg());
+				}
+			}
+			iRet = raw_data_.delete_cur_row(*stpNodeRow);
+			if (iRet != EC_NO_MEM) /* refresh the node handle unless the call ran out of memory */
+				p_node->vd_handle() = raw_data_.get_handle();
+			if (iRet != 0) {
+				log4cplus_error(
+					"raw-data delete row error: %d,%s",
+					iRet, raw_data_.get_err_msg());
+				return (-5);
+			}
+			iAffectRows++;
+			rows_count_--;
+			if (uchRowFlags & OPER_DIRTY)
+				dirty_rows_count_--;
+		}
+	}
+	if (iAffectRows > 0) {
+		/*
+		 * Only overwrite the affected-row count when none has been
+		 * recorded yet, or when the condition has a timestamp type.
+		 */
+		if (job_op.resultInfo.affected_rows() == 0 ||
+		    (job_op.request_condition() &&
+		     job_op.request_condition()->has_type_timestamp())) {
+			job_op.resultInfo.set_affected_rows(iAffectRows);
+		}
+		raw_data_.strip_mem();
+	}
+
+	if (raw_data_.total_rows() > 0) { /* statistics are sampled only while rows remain */
+		log4cplus_debug("stat history datasize, size is %u",
+				raw_data_.data_size());
+		history_datasize.push(raw_data_.data_size());
+		history_rowsize.push(raw_data_.total_rows());
+		raw_data_.update_last_access_time_by_hour();
+		raw_data_.update_last_update_time_by_hour();
+	}
+	return DTC_CODE_SUCCESS;
+}
+/*
+ * Read the rows of p_node that satisfy job_op's condition into the job's
+ * result set.
+ *
+ * When the job asks for all rows and either only wants a count or the row
+ * total is not in_range(), only the total row count is reported and no row
+ * is decoded.  Otherwise every row is decoded, compared against the job
+ * condition and, on a match, appended to the job.
+ *
+ * Returns DTC_CODE_SUCCESS, -1 when the node handle cannot be attached, or
+ * -2 when a row cannot be decoded.
+ */
+int RawDataProcess::do_get(DTCJobOperation &job_op, Node *p_node)
+{
+	int iRet;
+	DTCTableDefinition *stpNodeTab, *stpTaskTab;
+	RowValue *stpNodeRow, *stpTaskRow;
+
+	log4cplus_debug("do_get start! ");
+
+	rows_count_ = 0;
+	dirty_rows_count_ = 0;
+	int laid = job_op.flag_no_cache() ?
+			   -1 :
+			   job_op.table_definition()->lastacc_field_id(); /* -1 turns off the update_lastacc() call below */
+
+	iRet = raw_data_.do_attach(p_node->vd_handle());
+	if (iRet != 0) {
+		log4cplus_error("raw-data attach[handle:" UINT64FMT
+				"] error: %d,%s",
+				p_node->vd_handle(), iRet,
+				raw_data_.get_err_msg());
+		return (-1);
+	}
+
+	unsigned int uiTotalRows = raw_data_.total_rows();
+	job_op.prepare_result(); // prepare the result object to be returned
+	if (job_op.all_rows() &&
+	    (job_op.count_only() || !job_op.in_range((int)uiTotalRows, 0))) {
+		if (job_op.is_batch_request()) { /* batch requests accumulate the total across nodes */
+			if ((int)uiTotalRows > 0)
+				job_op.add_total_rows((int)uiTotalRows);
+		} else {
+			job_op.set_total_rows((int)uiTotalRows);
+		}
+	} else {
+		stpNodeTab = raw_data_.get_node_table_def();
+		stpTaskTab = job_op.table_definition();
+		RowValue stNodeRow(stpNodeTab);
+		RowValue stTaskRow(stpTaskTab);
+		if (stpNodeTab == stpTaskTab) {
+			stpNodeRow = &stTaskRow;
+			stpTaskRow = &stTaskRow;
+		} else {
+			stpNodeRow = &stNodeRow;
+			stpTaskRow = &stTaskRow;
+		}
+		unsigned char uchRowFlags;
+		for (unsigned int i = 0; i < uiTotalRows; i++) // copy the data row by row
+		{
+			job_op.update_key(
+				*stpNodeRow); // use stpNodeRow is fine, as just modify key field
+			if ((iRet = raw_data_.decode_row(
+				     *stpNodeRow, uchRowFlags, 0)) != 0) {
+				log4cplus_error(
+					"raw-data decode row error: %d,%s",
+					iRet, raw_data_.get_err_msg());
+				return (-2);
+			}
+			// this pointer compare is ok, as these two is both come from tabledefmanager. if they mean same, they are same object.
+			if (stpNodeTab != stpTaskTab) {
+				stpTaskRow->Copy(stpNodeRow);
+			}
+			if (job_op.compare_row(*stpTaskRow) ==
+			    0) // the row does not match the query condition
+				continue;
+
+			if (stpTaskTab->expire_time_field_id() > 0)
+				stpTaskRow->update_expire_time();
+			// append the current row to the task
+			if (job_op.append_row(stpTaskRow) > 0 && laid > 0) {
+				raw_data_.update_lastacc(job_op.Timestamp());
+			}
+			if (job_op.all_rows() && job_op.result_full()) { /* result is full: report the total and stop decoding */
+				job_op.set_total_rows((int)uiTotalRows);
+				break;
+			}
+		}
+	}
+	/* update the access time and the select-operation counter */
+	raw_data_.update_last_access_time_by_hour();
+	raw_data_.inc_select_count();
+	log4cplus_debug(
+		"node[id:%u] ,Get Count is %d, last_access_time is %d, create_time is %d",
+		p_node->node_id(), raw_data_.get_select_op_count(),
+		raw_data_.get_last_access_time_by_hour(),
+		raw_data_.get_create_time_by_hour());
+	return DTC_CODE_SUCCESS;
+}
+
+/*
+ * Append one row, built from the job's update fields, to p_node.
+ *
+ * affected_data is always NULL (original author's note); when non-NULL the
+ * new row is also inserted into it.
+ * isDirty marks the inserted row dirty and bumps dirty_rows_count_.
+ * setrows forces the job's affected-row count to 1 and enables the
+ * uniqueness scan against the rows already stored in the node.
+ *
+ * Returns DTC_CODE_SUCCESS, the attach_data() error code, -1062 on a
+ * duplicate key, -1 when an existing row cannot be decoded or the insert
+ * into affected_data fails, or -2 when the insert into the node fails.
+ */
+int RawDataProcess::do_append(DTCJobOperation &job_op, Node *p_node,
+			      RawData *affected_data, bool isDirty,
+			      bool setrows)
+{
+	int iRet;
+	DTCTableDefinition *stpNodeTab, *stpTaskTab;
+	RowValue *stpNodeRow, *stpTaskRow;
+
+	iRet = attach_data(p_node, affected_data);
+	if (iRet != DTC_CODE_SUCCESS) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "attach data error");
+		log4cplus_warning("attach data error: %d", iRet);
+		return (iRet);
+	}
+
+	/* build the row to insert: defaults first, then the job's fields */
+	stpNodeTab = raw_data_.get_node_table_def();
+	stpTaskTab = job_op.table_definition();
+	RowValue stTaskRow(stpTaskTab);
+	RowValue stNodeRow(stpNodeTab);
+	stpTaskRow = &stTaskRow;
+	stpTaskRow->default_value();
+	job_op.update_row(*stpTaskRow);
+
+	/* carry an insert id already recorded on the job into the row */
+	if (stpTaskTab->auto_increment_field_id() >= stpTaskTab->key_fields() &&
+	    job_op.resultInfo.insert_id()) {
+		const int iFieldID = stpTaskTab->auto_increment_field_id();
+		const uint64_t iVal = job_op.resultInfo.insert_id();
+		stpTaskRow->field_value(iFieldID)->Set(iVal);
+	}
+
+	if (stpNodeTab == stpTaskTab) {
+		stpNodeRow = stpTaskRow;
+	} else {
+		/* different table definitions: convert into the node layout */
+		stpNodeRow = &stNodeRow;
+		stpNodeRow->default_value();
+		stpNodeRow->Copy(stpTaskRow);
+	}
+
+	log4cplus_debug("do_append start! ");
+
+	rows_count_ = 0;
+	dirty_rows_count_ = 0;
+
+	unsigned int uiTotalRows = raw_data_.total_rows();
+	if (uiTotalRows > 0) {
+		if ((isDirty || setrows) &&
+		    job_op.table_definition()->key_as_uniq_field()) {
+			snprintf(err_message_, sizeof(err_message_),
+				 "duplicate key error");
+			return (-1062);
+		}
+		RowValue stOldRow(stpNodeTab); // one existing row
+		if (setrows &&
+		    job_op.table_definition()->key_part_of_uniq_field()) {
+			/* compare the new row with every stored row */
+			for (unsigned int i = 0; i < uiTotalRows; i++) {
+				unsigned char uchRowFlags;
+				/*
+				 * Keep the decode result in iRet so that the
+				 * error log below reports the real failure
+				 * code, not the stale attach_data() result.
+				 */
+				iRet = raw_data_.decode_row(stOldRow,
+							    uchRowFlags, 0);
+				if (iRet != 0) {
+					log4cplus_error(
+						"raw-data decode row error: %d,%s",
+						iRet, raw_data_.get_err_msg());
+					return (-1);
+				}
+
+				if (stpNodeRow->Compare(
+					    stOldRow,
+					    stpNodeTab->uniq_fields_list(),
+					    stpNodeTab->uniq_fields()) == 0) {
+					snprintf(err_message_,
+						 sizeof(err_message_),
+						 "duplicate key error");
+					return (-1062);
+				}
+			}
+		}
+	}
+
+	if (affected_data != NULL &&
+	    affected_data->insert_row(*stpNodeRow, false, isDirty) != 0) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "raw-data insert row error: %s",
+			 affected_data->get_err_msg());
+		return (-1);
+	}
+
+	/* insert the row (dirty or clean according to isDirty) */
+	iRet = raw_data_.insert_row(
+		*stpNodeRow, update_mode_.m_uchInsertOrder ? true : false,
+		isDirty);
+	if (iRet == EC_NO_MEM) {
+		/* out of memory: ask the pond to purge, then retry once */
+		if (p_buffer_pond_->try_purge_size(raw_data_.need_size(),
+						   *p_node) == 0)
+			iRet = raw_data_.insert_row(
+				*stpNodeRow,
+				update_mode_.m_uchInsertOrder ? true : false,
+				isDirty);
+	}
+	if (iRet != EC_NO_MEM)
+		p_node->vd_handle() = raw_data_.get_handle();
+	if (iRet != 0) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "raw-data insert row error: %s",
+			 raw_data_.get_err_msg());
+		/* record the size for the blacklist */
+		job_op.push_black_list_size(raw_data_.need_size());
+		return (-2);
+	}
+
+	if (job_op.resultInfo.affected_rows() == 0 || setrows == true)
+		job_op.resultInfo.set_affected_rows(1);
+	rows_count_++;
+	if (isDirty)
+		dirty_rows_count_++;
+	log4cplus_debug("stat history datasize, size is %u",
+			raw_data_.data_size());
+	history_datasize.push(raw_data_.data_size());
+	history_rowsize.push(raw_data_.total_rows());
+	raw_data_.update_last_access_time_by_hour();
+	raw_data_.update_last_update_time_by_hour();
+	log4cplus_debug(
+		"node[id:%u] ,Get Count is %d, create_time is %d, last_access_time is %d, last_update_time is %d ",
+		p_node->node_id(), raw_data_.get_select_op_count(),
+		raw_data_.get_create_time_by_hour(),
+		raw_data_.get_last_access_time_by_hour(),
+		raw_data_.get_last_update_time_by_hour());
+	return DTC_CODE_SUCCESS;
+}
+/*
+ * Rebuild p_node from the rows held in job_op.result.
+ *
+ * Any existing chunk of the node is destroyed first, an empty chunk is
+ * initialised for the job's packed key, and every row of the result set is
+ * inserted into it.  When a last-access field is configured (laid > 0),
+ * rows that match the job condition and fall inside the request's limit
+ * window get that field stamped with the job timestamp before insertion.
+ *
+ * Returns DTC_CODE_SUCCESS, -1 when the old data cannot be destroyed, -2
+ * when the new chunk cannot be initialised, -3 when a result row cannot be
+ * fetched, or -4 when a row cannot be inserted.
+ */
+int RawDataProcess::do_replace_all(DTCJobOperation &job_op, Node *p_node)
+{
+	log4cplus_debug("do_replace_all start! ");
+	DTCTableDefinition *stpNodeTab, *stpTaskTab;
+	RowValue *stpNodeRow;
+
+	int iRet;
+	int try_purge_count = 0;
+	uint64_t all_rows_size = 0;
+	int laid = job_op.flag_no_cache() || job_op.count_only() ?
+			   -1 :
+			   job_op.table_definition()->lastacc_field_id();
+	int matchedCount = 0;
+	int limitStart = 0;
+	int limitStop = 0x10000000;
+
+	stpTaskTab = job_op.table_definition();
+	if (DTCColExpand::instance()->is_expanding())
+		stpNodeTab =
+			TableDefinitionManager::instance()->get_new_table_def();
+	else
+		stpNodeTab =
+			TableDefinitionManager::instance()->get_cur_table_def();
+	RowValue stNodeRow(stpNodeTab);
+	stpNodeRow = &stNodeRow;
+	stpNodeRow->default_value();
+	/* derive the [limitStart, limitStop) window of matched rows that get stamped */
+	if (laid > 0 && job_op.requestInfo.limit_count() > 0) {
+		limitStart = job_op.requestInfo.limit_start();
+		if (job_op.requestInfo.limit_start() > 0x10000000) {
+			laid = -1;
+		} else if (job_op.requestInfo.limit_count() < 0x10000000) {
+			limitStop =
+				limitStart + job_op.requestInfo.limit_count();
+		}
+	}
+
+	rows_count_ = 0;
+	dirty_rows_count_ = 0;
+
+	if (p_node->vd_handle() != INVALID_HANDLE) {
+		iRet = destroy_data(p_node);
+		if (iRet != 0)
+			return (-1);
+	}
+
+	iRet = raw_data_.do_init(job_op.packed_key(), 0);
+	if (iRet == EC_NO_MEM) {
+		if (p_buffer_pond_->try_purge_size(raw_data_.need_size(),
+						   *p_node) == 0)
+			iRet = raw_data_.init(p_table_->key_fields() - 1,
+					      p_table_->key_format(),
+					      job_op.packed_key(), 0); /* NOTE(review): the retry calls init() with explicit
+					       * key parameters while the first attempt calls
+					       * do_init(); confirm both are equivalent here. */
+	}
+	if (iRet != EC_NO_MEM)
+		p_node->vd_handle() = raw_data_.get_handle();
+
+	if (iRet != 0) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "raw-data init error: %s", raw_data_.get_err_msg());
+		/* record the size for the blacklist */
+		job_op.push_black_list_size(raw_data_.need_size());
+		p_buffer_pond_->purge_node(job_op.packed_key(), *p_node);
+		return (-2);
+	}
+
+	if (job_op.result != NULL) {
+		ResultSet *pstResultSet = job_op.result;
+		for (int i = 0; i < pstResultSet->total_rows(); i++) {
+			RowValue *pstRow = pstResultSet->_fetch_row();
+			if (pstRow == NULL) {
+				log4cplus_debug("%s!",
+						"call fetch_row func error");
+				p_buffer_pond_->purge_node(job_op.packed_key(),
+							   *p_node);
+				raw_data_.destory();
+				return (-3);
+			}
+
+			if (laid > 0 && job_op.compare_row(*pstRow)) {
+				if (matchedCount >= limitStart &&
+				    matchedCount < limitStop) {
+					(*pstRow)[laid].s64 =
+						job_op.Timestamp();
+				}
+				matchedCount++;
+			}
+
+			if (stpTaskTab != stpNodeTab) {
+				stpNodeRow->Copy(pstRow);
+			} else {
+				stpNodeRow = pstRow;
+			}
+
+			/* insert the current row */
+			iRet = raw_data_.insert_row(*stpNodeRow, false, false);
+
+			/* if memory is insufficient, try to make room at most twice */
+			if (iRet == EC_NO_MEM) {
+				/* estimate the data size of the whole node */
+				all_rows_size = raw_data_.need_size() -
+						raw_data_.data_start();
+				all_rows_size *= pstResultSet->total_rows();
+				all_rows_size /= (i + 1);
+				all_rows_size += raw_data_.data_start();
+
+				if (try_purge_count >= 2) {
+					goto ERROR_PROCESS;
+				}
+
+				/* attempt counter */
+				++try_purge_count;
+				if (p_buffer_pond_->try_purge_size(
+					    (size_t)all_rows_size, *p_node) ==
+				    0)
+					iRet = raw_data_.insert_row(
+						*stpNodeRow, false, false);
+			}
+			if (iRet != EC_NO_MEM)
+				p_node->vd_handle() = raw_data_.get_handle();
+
+			/* the current row was handled successfully */
+			if (0 == iRet)
+				continue;
+		ERROR_PROCESS:
+			snprintf(
+				err_message_, sizeof(err_message_),
+				"raw-data insert row error: ret=%d,err=%s, cnt=%d",
+				iRet, raw_data_.get_err_msg(), try_purge_count);
+			/* record the size for the blacklist */
+			job_op.push_black_list_size(all_rows_size);
+			p_buffer_pond_->purge_node(job_op.packed_key(),
+						   *p_node);
+			raw_data_.destory();
+			return (-4);
+		}
+
+		rows_count_ += pstResultSet->total_rows();
+	}
+
+	raw_data_.update_last_access_time_by_hour();
+	raw_data_.update_last_update_time_by_hour();
+	log4cplus_debug(
+		"node[id:%u], handle[" UINT64FMT
+		"] ,data-size[%u],  Get Count is %d, create_time is %d, last_access_time is %d, Update time is %d",
+		p_node->node_id(), p_node->vd_handle(), raw_data_.data_size(),
+		raw_data_.get_select_op_count(),
+		raw_data_.get_create_time_by_hour(),
+		raw_data_.get_last_access_time_by_hour(),
+		raw_data_.get_last_update_time_by_hour());
+
+	history_datasize.push(raw_data_.data_size());
+	history_rowsize.push(raw_data_.total_rows());
+	return DTC_CODE_SUCCESS;
+}
+
+// The correct replace behavior:
+// 	If conflicting rows are found, delete them all
+// 	Insert the new row
+// 	Affected rows is the total of deleted and inserted rows
+// Implemented behavior:
+// 	The first conflicting row is overwritten in place and counts as 2
+// 	affected rows (1 delete + 1 insert)
+// 	Every further conflicting row is deleted and counts as 1
+// 	If no row conflicts, the new row is inserted and counts as 1
+//
+// When the node has no data yet a fresh chunk is created with init_data();
+// otherwise the existing chunk is attached.  async marks the written row
+// dirty.  The job's affected-row count is overwritten when async or setrows
+// is set; otherwise a mismatch with the helper's count is only logged.
+//
+// Returns DTC_CODE_SUCCESS, the init_data()/attach_data() error code, -1
+// when a row cannot be decoded, -2 when the copy into affected_data fails,
+// or -3 when the replace/delete/insert on the node fails.
+int RawDataProcess::do_replace(DTCJobOperation &job_op, Node *p_node,
+			       RawData *affected_data, bool async, bool setrows)
+{
+	int iRet;
+	DTCTableDefinition *stpNodeTab, *stpTaskTab;
+	RowValue *stpNodeRow, *stpTaskRow;
+
+	log4cplus_debug("do_replace start! ");
+
+	rows_count_ = 0;
+	dirty_rows_count_ = 0;
+
+	if (p_node->vd_handle() == INVALID_HANDLE) {
+		iRet = init_data(p_node, affected_data, job_op.packed_key());
+		if (iRet != DTC_CODE_SUCCESS) {
+			log4cplus_error("init data error: %d", iRet);
+			if (p_node->vd_handle() == INVALID_HANDLE)
+				p_buffer_pond_->purge_node(job_op.packed_key(),
+							   *p_node);
+			return (iRet);
+		}
+	} else {
+		iRet = attach_data(p_node, affected_data);
+		if (iRet != DTC_CODE_SUCCESS) {
+			log4cplus_error("attach data error: %d", iRet);
+			return (iRet);
+		}
+	}
+
+	unsigned char uchRowFlags;
+	uint64_t ullAffectedrows = 0;
+	unsigned int uiTotalRows = raw_data_.total_rows();
+	if (affected_data != NULL)
+		affected_data->set_refrence(&raw_data_);
+
+	stpNodeTab = raw_data_.get_node_table_def();
+	stpTaskTab = job_op.table_definition();
+	RowValue stNewRow(stpTaskTab);
+	RowValue stNewNodeRow(stpNodeTab);
+	stNewRow.default_value();
+	stpTaskRow = &stNewRow;
+	stpNodeRow = &stNewNodeRow;
+	job_op.update_row(*stpTaskRow); // build the replacement row
+	if (stpNodeTab != stpTaskTab)
+		stpNodeRow->Copy(stpTaskRow);
+	else
+		stpNodeRow = stpTaskRow;
+
+	RowValue stRow(stpNodeTab); // one stored row
+	for (unsigned int i = 0; i < uiTotalRows; i++) { // walk the stored rows
+		/*
+		 * Keep the decode result in iRet so that the error log reports
+		 * the real failure code, not the stale attach/init result.
+		 */
+		iRet = raw_data_.decode_row(stRow, uchRowFlags, 0);
+		if (iRet != 0) {
+			log4cplus_error("raw-data decode row error: %d,%s",
+					iRet, raw_data_.get_err_msg());
+			return (-1);
+		}
+
+		/* skip rows that do not conflict on the unique fields */
+		if (job_op.table_definition()->key_as_uniq_field() == false &&
+		    stNewRow.Compare(
+			    stRow,
+			    job_op.table_definition()->uniq_fields_list(),
+			    job_op.table_definition()->uniq_fields()) != 0)
+			continue;
+
+		const bool bFirstConflict = (ullAffectedrows == 0);
+		if (bFirstConflict) {
+			if (affected_data != NULL) {
+				iRet = affected_data->insert_row(*stpNodeRow,
+								 false, async);
+				if (iRet != 0) {
+					log4cplus_error(
+						"raw-data copy row error: %d,%s",
+						iRet,
+						affected_data->get_err_msg());
+					return (-2);
+				}
+			}
+
+			ullAffectedrows = 2;
+			iRet = raw_data_.replace_cur_row(*stpNodeRow,
+							 async); // write into the cache
+		} else {
+			/*
+			 * NOTE(review): rows removed here are not subtracted
+			 * from rows_count_, and the dirty accounting below is
+			 * applied to them as if they had been rewritten;
+			 * confirm whether the caller relies on that.
+			 */
+			ullAffectedrows++;
+			iRet = raw_data_.delete_cur_row(
+				*stpNodeRow); // remove from the cache
+		}
+		if (iRet == EC_NO_MEM) {
+			/*
+			 * Out of memory: purge, then retry the operation that
+			 * actually failed.  The retry used to be
+			 * replace_cur_row() unconditionally, which would
+			 * rewrite a row that was meant to be deleted.
+			 */
+			if (p_buffer_pond_->try_purge_size(
+				    raw_data_.need_size(), *p_node) == 0) {
+				if (bFirstConflict)
+					iRet = raw_data_.replace_cur_row(
+						*stpNodeRow, async);
+				else
+					iRet = raw_data_.delete_cur_row(
+						*stpNodeRow);
+			}
+		}
+		if (iRet != EC_NO_MEM)
+			p_node->vd_handle() = raw_data_.get_handle();
+		if (iRet != 0) {
+			snprintf(err_message_, sizeof(err_message_),
+				 "raw-data replace row error: %d, %s", iRet,
+				 raw_data_.get_err_msg());
+			/* record the size for the blacklist */
+			job_op.push_black_list_size(raw_data_.need_size());
+			return (-3);
+		}
+		if (uchRowFlags & OPER_DIRTY)
+			dirty_rows_count_--;
+		if (async)
+			dirty_rows_count_++;
+	}
+
+	if (ullAffectedrows == 0) { // no conflicting row found: insert one
+		iRet = raw_data_.insert_row(*stpNodeRow, false,
+					    async); // write into the cache
+		if (iRet == EC_NO_MEM) {
+			if (p_buffer_pond_->try_purge_size(
+				    raw_data_.need_size(), *p_node) == 0)
+				iRet = raw_data_.insert_row(*stpNodeRow, false,
+							    async);
+		}
+		if (iRet != EC_NO_MEM)
+			p_node->vd_handle() = raw_data_.get_handle();
+
+		if (iRet != 0) {
+			snprintf(err_message_, sizeof(err_message_),
+				 "raw-data replace row error: %d, %s", iRet,
+				 raw_data_.get_err_msg());
+			/* record the size for the blacklist */
+			job_op.push_black_list_size(raw_data_.need_size());
+			return (-3);
+		}
+		rows_count_++;
+		ullAffectedrows++;
+		if (async)
+			dirty_rows_count_++;
+	}
+
+	if (async == true || setrows == true) {
+		job_op.resultInfo.set_affected_rows(ullAffectedrows);
+	} else if (ullAffectedrows != job_op.resultInfo.affected_rows()) {
+		// the cache and the helper disagree on the number of rows
+		log4cplus_debug(
+			"unequal affected rows, cache[%lld], helper[%lld]",
+			(long long)ullAffectedrows,
+			(long long)job_op.resultInfo.affected_rows());
+	}
+
+	log4cplus_debug("stat history datasize, size is %u",
+			raw_data_.data_size());
+	history_datasize.push(raw_data_.data_size());
+	history_rowsize.push(raw_data_.total_rows());
+	raw_data_.update_last_access_time_by_hour();
+	raw_data_.update_last_update_time_by_hour();
+	log4cplus_debug(
+		"node[id:%u], create_time is %d, last_access_time is %d, Update Time is %d ",
+		p_node->node_id(), raw_data_.get_create_time_by_hour(),
+		raw_data_.get_last_access_time_by_hour(),
+		raw_data_.get_last_update_time_by_hour());
+	return DTC_CODE_SUCCESS;
+}
+
+/*
+ * Re-encode `value` into the caller-supplied private RawData `raw` and
+ * decode it back, so that a reallocation triggered by a later
+ * replace/update cannot leave `value` referring to stale pointers.
+ *
+ * Returns DTC_CODE_SUCCESS, -1 when `raw` cannot be initialised, -2 when
+ * the row cannot be inserted, or -3 when it cannot be decoded again.
+ */
+int RawDataProcess::encode_to_private_area(RawData &raw, RowValue &value,
+					   unsigned char value_flag)
+{
+	/* size the private chunk for exactly this one row */
+	int rc = raw.do_init(raw_data_.key(),
+			     raw.calc_row_size(value,
+					       p_table_->key_fields() - 1));
+	if (rc != DTC_CODE_SUCCESS) {
+		log4cplus_error("init raw-data struct error, ret=%d, err=%s",
+				rc, raw.get_err_msg());
+		return -1;
+	}
+
+	rc = raw.insert_row(value, false, false);
+	if (rc != DTC_CODE_SUCCESS) {
+		log4cplus_error("insert row to raw-data error: ret=%d, err=%s",
+				rc, raw.get_err_msg());
+		return -2;
+	}
+
+	/* go back to the first row before reading it again */
+	raw.rewind();
+	rc = raw.decode_row(value, value_flag, 0);
+	if (rc == DTC_CODE_SUCCESS)
+		return DTC_CODE_SUCCESS;
+
+	log4cplus_error("decode raw-data to row error: ret=%d, err=%s",
+			rc, raw.get_err_msg());
+	return -3;
+}
+
+/*
+ * Apply the job's update to every row of p_node that matches the job's
+ * condition.
+ *
+ * affected_data, when non-NULL, receives a copy of each updated row.
+ * async marks the rewritten rows dirty.  The job's affected-row count is
+ * overwritten when async or setrows is set; otherwise a mismatch with the
+ * helper's count is only logged.
+ *
+ * Returns DTC_CODE_SUCCESS, the attach_data() error code, -1 when a row
+ * cannot be decoded, -2 when the copy into affected_data fails, -3 when the
+ * row cannot be re-encoded into private memory, or -6 when the row cannot
+ * be written back to the node.
+ */
+int RawDataProcess::do_update(DTCJobOperation &job_op, Node *p_node,
+			      RawData *affected_data, bool async, bool setrows)
+{
+	int iRet;
+	DTCTableDefinition *stpNodeTab, *stpTaskTab;
+	RowValue *stpNodeRow, *stpTaskRow;
+
+	log4cplus_debug("do_update start! ");
+
+	rows_count_ = 0;
+	dirty_rows_count_ = 0;
+
+	iRet = attach_data(p_node, affected_data);
+	if (iRet != DTC_CODE_SUCCESS) {
+		log4cplus_error("attach data error: %d", iRet);
+		return (iRet);
+	}
+
+	unsigned char uchRowFlags;
+	uint64_t ullAffectedrows = 0;
+	unsigned int uiTotalRows = raw_data_.total_rows();
+	if (affected_data != NULL)
+		affected_data->set_refrence(&raw_data_);
+
+	/* one RowValue serves both roles when the table definitions match */
+	stpNodeTab = raw_data_.get_node_table_def();
+	stpTaskTab = job_op.table_definition();
+	RowValue stNewRow(stpTaskTab);
+	RowValue stNewNodeRow(stpNodeTab);
+	stpTaskRow = &stNewRow;
+	stpNodeRow = &stNewNodeRow;
+	if (stpNodeTab == stpTaskTab)
+		stpNodeRow = stpTaskRow;
+
+	for (unsigned int i = 0; i < uiTotalRows; i++) { // walk the stored rows
+		/*
+		 * Keep the decode result in iRet so that the error log reports
+		 * the real failure code, not the stale attach_data() result.
+		 */
+		iRet = raw_data_.decode_row(*stpNodeRow, uchRowFlags, 0);
+		if (iRet != 0) {
+			log4cplus_error("raw-data decode row error: %d,%s",
+					iRet, raw_data_.get_err_msg());
+			return (-1);
+		}
+
+		if (stpNodeTab != stpTaskTab)
+			stpTaskRow->Copy(stpNodeRow);
+
+		// the row does not match the query condition
+		if (job_op.compare_row(*stpTaskRow) == 0)
+			continue;
+
+		job_op.update_row(*stpTaskRow); // apply the modification
+		ullAffectedrows++;
+
+		if (stpNodeTab != stpTaskTab)
+			stpNodeRow->Copy(stpTaskRow);
+
+		if (affected_data != NULL) {
+			iRet = affected_data->insert_row(*stpNodeRow, false,
+							 async);
+			if (iRet != 0) {
+				log4cplus_error(
+					"raw-data copy row error: %d,%s", iRet,
+					affected_data->get_err_msg());
+				return (-2);
+			}
+		}
+
+		/*
+		 * Re-encode the row in private (system malloc) memory and
+		 * decode it from there, so that a reallocation caused by
+		 * replace_cur_row() cannot invalidate what the row refers to.
+		 * stTmpRows must stay alive until the row has been written.
+		 */
+		RawData stTmpRows(&g_stSysMalloc, 1);
+		if (encode_to_private_area(stTmpRows, *stpNodeRow,
+					   uchRowFlags)) {
+			log4cplus_error(
+				"encode rowvalue to private rawdata area failed");
+			return -3;
+		}
+
+		iRet = raw_data_.replace_cur_row(*stpNodeRow,
+						 async); // write into the cache
+		if (iRet == EC_NO_MEM) {
+			/* out of memory: purge, then retry once */
+			if (p_buffer_pond_->try_purge_size(
+				    raw_data_.need_size(), *p_node) == 0)
+				iRet = raw_data_.replace_cur_row(*stpNodeRow,
+								 async);
+		}
+		if (iRet != EC_NO_MEM)
+			p_node->vd_handle() = raw_data_.get_handle();
+		if (iRet != 0) {
+			snprintf(err_message_, sizeof(err_message_),
+				 "raw-data replace row error: %d, %s", iRet,
+				 raw_data_.get_err_msg());
+			/* record the size for the blacklist */
+			job_op.push_black_list_size(raw_data_.need_size());
+			return (-6);
+		}
+
+		if (uchRowFlags & OPER_DIRTY)
+			dirty_rows_count_--;
+		if (async)
+			dirty_rows_count_++;
+	}
+
+	if (async == true || setrows == true) {
+		job_op.resultInfo.set_affected_rows(ullAffectedrows);
+	} else if (ullAffectedrows != job_op.resultInfo.affected_rows()) {
+		// the cache and the helper disagree on the number of rows
+		log4cplus_debug(
+			"unequal affected rows, cache[%lld], helper[%lld]",
+			(long long)ullAffectedrows,
+			(long long)job_op.resultInfo.affected_rows());
+	}
+	log4cplus_debug("stat history datasize, size is %u",
+			raw_data_.data_size());
+	history_datasize.push(raw_data_.data_size());
+	history_rowsize.push(raw_data_.total_rows());
+	raw_data_.update_last_access_time_by_hour();
+	raw_data_.update_last_update_time_by_hour();
+	log4cplus_debug(
+		"node[id:%u], create_time is %d, last_access_time is %d, UpdateTime is %d",
+		p_node->node_id(), raw_data_.get_create_time_by_hour(),
+		raw_data_.get_last_access_time_by_hour(),
+		raw_data_.get_last_update_time_by_hour());
+	return DTC_CODE_SUCCESS;
+}
+
+/*
+ * Hand every dirty row of p_node to flush_req and clear its dirty flag.
+ *
+ * affected_count receives the number of rows flushed; dirty_rows_count_ is
+ * decremented once per flushed row.  A NULL flush_req still clears the
+ * flags without flushing anything.  The scan stops as soon as the node is
+ * no longer reported dirty.
+ *
+ * Returns DTC_CODE_SUCCESS, the attach_data() error code, -1 when a row
+ * cannot be decoded, or -2 when flush_row() fails.
+ */
+int RawDataProcess::do_flush(DTCFlushRequest *flush_req, Node *p_node,
+			     unsigned int &affected_count)
+{
+	int iRet;
+
+	log4cplus_debug("do_flush start! ");
+
+	rows_count_ = 0;
+	dirty_rows_count_ = 0;
+
+	iRet = attach_data(p_node, NULL);
+	if (iRet != DTC_CODE_SUCCESS) {
+		log4cplus_error("attach data error: %d", iRet);
+		return (iRet);
+	}
+
+	unsigned char uchRowFlags;
+	unsigned int uiTotalRows = raw_data_.total_rows();
+
+	affected_count = 0;
+	/*
+	 * Seed the row with the node's key fields.
+	 * NOTE(review): presumably decode_row() fills only the non-key
+	 * fields, which is why the key is unpacked here - confirm.
+	 * The array bound is a run-time value (compiler extension).
+	 */
+	DTCValue astKey[p_table_->key_fields()];
+	TaskPackedKey::unpack_key(p_table_, raw_data_.key(), astKey);
+	RowValue stRow(p_table_); // one row
+	for (int i = 0; i < p_table_->key_fields(); i++)
+		stRow[i] = astKey[i];
+
+	for (unsigned int i = 0; p_node->is_dirty() && i < uiTotalRows;
+	     i++) { // walk the stored rows
+		/*
+		 * Keep the decode result in iRet so that the error log reports
+		 * the real failure code, not the stale attach_data() result.
+		 */
+		iRet = raw_data_.decode_row(stRow, uchRowFlags, 0);
+		if (iRet != 0) {
+			log4cplus_error("raw-data decode row error: %d,%s",
+					iRet, raw_data_.get_err_msg());
+			return (-1);
+		}
+
+		/* clean rows need no flushing */
+		if (!(uchRowFlags & OPER_DIRTY))
+			continue;
+
+		if (flush_req && flush_req->flush_row(stRow) != 0) {
+			log4cplus_error("do_flush() invoke flushRow() failed.");
+			return (-2);
+		}
+		raw_data_.set_cur_row_flag(uchRowFlags & ~OPER_DIRTY);
+		dirty_rows_count_--;
+		affected_count++;
+	}
+
+	return DTC_CODE_SUCCESS;
+}
+
+/*
+ * Flush the node's dirty rows, then report all of its rows as removed by
+ * setting rows_count_ to the negated row total.
+ *
+ * Returns DTC_CODE_SUCCESS, or the do_flush() error code.
+ */
+int RawDataProcess::do_purge(DTCFlushRequest *flush_req, Node *p_node,
+			     unsigned int &affected_count)
+{
+	log4cplus_debug("do_purge start! ");
+
+	const int flush_rc = do_flush(flush_req, p_node, affected_count);
+	if (flush_rc == 0) {
+		rows_count_ = 0LL - raw_data_.total_rows();
+		return DTC_CODE_SUCCESS;
+	}
+
+	return (flush_rc);
+}

+ 126 - 0
src/core/raw/raw_data_process.h

@@ -0,0 +1,126 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef RAW_DATA_PROCESS_H
+#define RAW_DATA_PROCESS_H
+
+#include "buffer_def.h"
+#include "protocol.h"
+#include "value.h"
+#include "field/field.h"
+#include "section.h"
+#include "table/table_def.h"
+#include "task/task_request.h"
+#include "stat_dtc.h"
+#include "raw_data.h"
+#include "node.h"
+#include "data_process.h"
+#include "buffer_pond.h"
+#include "namespace.h"
+#include "stat_manager.h"
+
+DTC_BEGIN_NAMESPACE
+
+class DTCJobOperation;
+class DTCFlushRequest;
+
+/*
+ * DataProcess implementation that keeps the rows of a cache node in a
+ * RawData chunk.
+ *
+ * Each do_* operation works on the chunk referenced by the node's handle
+ * and leaves the row / dirty-row deltas it caused in rows_count_ and
+ * dirty_rows_count_, readable through get_increase_row_count() and
+ * get_increase_dirty_row_count().
+ */
+class RawDataProcess : public DataProcess {
+    private:
+	RawData raw_data_; // chunk the current operation is attached to
+	DTCTableDefinition *p_table_;
+	MallocBase *p_mallocator_; // allocator used for node chunks
+	BufferPond *p_buffer_pond_; // asked to purge when memory runs out
+	UpdateMode update_mode_;
+	int64_t rows_count_; // row delta of the last operation
+	int64_t dirty_rows_count_; // dirty-row delta of the last operation
+	char err_message_[200]; // text returned by get_err_msg()
+
+	unsigned int nodeSizeLimit; // -DEBUG-
+
+	/* sampled statistics of historical node data size and row count,
+	 * collected where node memory is managed so that the sampling points
+	 * stay in one place, modify by tomchen 2014.08.27 */
+	StatSample history_datasize;
+	StatSample history_rowsize;
+
+    protected:
+	int init_data(Node *p_node, RawData *affected_data, const char *ptrKey);
+	int attach_data(Node *p_node, RawData *affected_data);
+	// free the node's chunk and invalidate its handle
+	int destroy_data(Node *p_node);
+
+    private:
+	// re-encode a row into private memory and decode it from there
+	int encode_to_private_area(RawData &, RowValue &, unsigned char);
+
+    public:
+	RawDataProcess(MallocBase *pstMalloc,
+		       DTCTableDefinition *p_table_definition_,
+		       BufferPond *pstPool, const UpdateMode *pstUpdateMode);
+	~RawDataProcess();
+
+	void set_limit_node_size(int node_size)
+	{
+		nodeSizeLimit = node_size;
+	} // -DEBUG-
+
+	// text of the last error recorded by an operation
+	const char *get_err_msg()
+	{
+		return err_message_;
+	}
+	void set_insert_mode(EUpdateMode iMode)
+	{
+		update_mode_.m_iInsertMode = iMode;
+	}
+	void set_insert_order(int iOrder)
+	{
+		update_mode_.m_uchInsertOrder = iOrder;
+	}
+	void change_mallocator(MallocBase *pstMalloc);
+
+	// expire time for cache only dtc mode
+	int get_expire_time(DTCTableDefinition *t, Node *p_node,
+			    uint32_t &expire);
+	// count dirty row, cache process will use it when buffer_delete_rows in job->all_rows case
+	int get_dirty_row_count(DTCJobOperation &job_op, Node *p_node);
+	int64_t get_increase_row_count()
+	{
+		return rows_count_;
+	}
+	int64_t get_increase_dirty_row_count()
+	{
+		return dirty_rows_count_;
+	}
+	int get_node_all_rows_count(Node *p_node, RawData *pstRows);
+	int expand_node(DTCJobOperation &job_op, Node *p_node);
+
+	// rebuild the node from the rows in job_op's result set
+	int do_replace_all(DTCJobOperation &job_op, Node *p_node);
+	// replace the node content with a copy of new_data
+	int do_replace_all(Node *p_node, RawData *new_data);
+	int do_replace(DTCJobOperation &job_op, Node *p_node,
+		       RawData *affected_data, bool async,
+		       bool setrows = false);
+	int do_delete(DTCJobOperation &job_op, Node *p_node,
+		      RawData *affected_data);
+	int do_get(DTCJobOperation &job_op, Node *p_node);
+	// parameter names match the definition (setrows was declared as uniq)
+	int do_append(DTCJobOperation &job_op, Node *p_node,
+		      RawData *affected_data, bool isDirty, bool setrows);
+	int do_update(DTCJobOperation &job_op, Node *p_node,
+		      RawData *affected_data, bool async, bool setrows = false);
+	int do_flush(DTCFlushRequest *flush_req, Node *p_node,
+		     unsigned int &affected_count);
+	int do_purge(DTCFlushRequest *flush_req, Node *p_node,
+		     unsigned int &affected_count);
+};
+
+DTC_END_NAMESPACE
+
+#endif

+ 98 - 0
src/core/task/task_pendlist.cc

@@ -0,0 +1,98 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include "task_pendlist.h"
+#include "buffer_process_ask_chain.h"
+#include "log/log.h"
+
+DTC_USING_NAMESPACE
+
+/*
+ * Build a pending list owned by `o`.
+ *
+ * `to` is the pending timeout; the matching timer list is obtained from the
+ * owner's `owner` object via get_timer_list().
+ */
+TaskPendingList::TaskPendingList(JobAskInterface<DTCJobOperation> *o, int to)
+	: _timeout(to),
+	  _timelist(o->owner->get_timer_list(to)),
+	  _owner(o),
+	  _wakeup(0)
+{
+}
+
+/*
+ * Every job still parked on the list is failed with -ETIMEDOUT and sent back
+ * through turn_around_job_answer().
+ */
+TaskPendingList::~TaskPendingList()
+{
+	for (std::list<slot_t>::iterator cur = _pendlist.begin();
+	     cur != _pendlist.end(); ++cur) {
+		DTCJobOperation *job = cur->first;
+
+		// Bounce the request back to the client.
+		job->set_error(-ETIMEDOUT, __FUNCTION__, "object deconstruct");
+		job->turn_around_job_answer();
+	}
+}
+
+/*
+ * Park `job` on the pending list together with the current time().
+ * A NULL job is ignored. The timer is attached when the list goes from
+ * empty to non-empty.
+ */
+void TaskPendingList::add2_list(DTCJobOperation *job)
+{
+	if (!job)
+		return;
+
+	if (_pendlist.empty())
+		attach_timer(_timelist);
+
+	_pendlist.push_back(slot_t(job, time(NULL)));
+}
+
+/*
+ * Wake up every task currently pending in the queue: raise the wakeup flag
+ * and attach this object as a ready timer on the owner's `owner`. The flag is
+ * consumed by job_timer_procedure().
+ */
+void TaskPendingList::Wakeup(void)
+{
+	log4cplus_debug("TaskPendingList Wakeup");
+
+	// Mark that all tasks are to be woken up.
+	_wakeup = 1;
+	attach_ready_timer(_owner->owner);
+}
+
+/*
+ * Timer callback.
+ *
+ * With the wakeup flag set, every parked job is handed back to the owner's
+ * job_ask_procedure() and the flag is cleared. Otherwise jobs older than
+ * `_timeout` (measured with time()) are failed with -ETIMEDOUT, the rest are
+ * parked again, and the timer is re-armed if any job remains.
+ */
+void TaskPendingList::job_timer_procedure(void)
+{
+	log4cplus_debug("enter timer procedure");
+
+	// Take over the current queue; _pendlist is left empty.
+	std::list<slot_t> parked;
+	parked.swap(_pendlist);
+
+	typedef std::list<slot_t>::iterator slot_iter;
+
+	if (!_wakeup) {
+		const time_t now = time(NULL);
+
+		for (slot_iter cur = parked.begin(); cur != parked.end();
+		     ++cur) {
+			// Timeout handling.
+			if (cur->second + _timeout < now) {
+				cur->first->set_error(
+					-ETIMEDOUT, __FUNCTION__,
+					"pending job is timedout");
+				cur->first->turn_around_job_answer();
+				continue;
+			}
+			_pendlist.push_back(*cur);
+		}
+
+		if (!_pendlist.empty())
+			attach_timer(_timelist);
+	} else {
+		for (slot_iter cur = parked.begin(); cur != parked.end();
+		     ++cur)
+			_owner->job_ask_procedure(cur->first);
+
+		_wakeup = 0;
+	}
+
+	log4cplus_debug("leave timer procedure");
+}

+ 63 - 0
src/core/task/task_pendlist.h

@@ -0,0 +1,63 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef __TASK_REQUEST_PENDINGLIST_H
+#define __TASK_REQUEST_PENDINGLIST_H
+
+#include "timer/timer_list.h"
+#include "namespace.h"
+#include "task/task_request.h"
+#include <list>
+
+DTC_BEGIN_NAMESPACE
+/*
+ * Pending request list.
+ *
+ * A request that cannot be satisfied for the moment is parked here until
+ *     1. it times out, or
+ *     2. the condition is satisfied and it is woken up.
+ */
+// Forward declarations.
+// NOTE(review): "TaskReqeust" looks like a misspelling of "TaskRequest";
+// the name is not referenced anywhere in this header.
+class BufferProcessAskChain;
+class CacheBase;
+class TaskReqeust;
+class TimerObject;
+class TaskPendingList : private TimerObject {
+    public:
+	TaskPendingList(JobAskInterface<DTCJobOperation> *o, int timeout = 5);
+	~TaskPendingList();
+
+	// Add a job to the pending list.
+	void add2_list(DTCJobOperation *);
+	// Wake up all tasks in the queue.
+	void Wakeup(void);
+
+    private:
+	// One parked job plus the time() value recorded when it was queued.
+	typedef std::pair<DTCJobOperation *, time_t> slot_t;
+
+	// Timer callback inherited from TimerObject.
+	virtual void job_timer_procedure(void);
+
+	// Copying is forbidden: both members are private.
+	TaskPendingList(const TaskPendingList &);
+	const TaskPendingList &operator=(const TaskPendingList &);
+
+	int _timeout; // added to the queue time and compared against time()
+	TimerList *_timelist; // timer list obtained from the owner for _timeout
+	JobAskInterface<DTCJobOperation> *_owner;
+	int _wakeup; // non-zero once Wakeup() has been requested
+	std::list<slot_t> _pendlist;
+};
+
+DTC_END_NAMESPACE
+
+#endif

+ 1726 - 0
src/core/tree/t_tree.cc

@@ -0,0 +1,1726 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include "log/log.h"
+#include "t_tree.h"
+#include "value.h"
+#include "data_chunk.h"
+
+/*#ifndef MODU_TEST
+#include "tree_data.h"
+#endif*/
+
+/*
+ * GET_KEY(x, u, t): read one object of type `t` stored at byte address `u`
+ * and assign it to `x`, converted to the type of `x`.
+ *
+ * The bytes are copied with memcpy() instead of dereferencing `(t *)(u)`:
+ * KeyCompare() passes an address that is 10 bytes past the start of the
+ * chunk, so it is not guaranteed to be aligned for `t`.
+ */
+#define GET_KEY(x, u, t)                                                       \
+	do {                                                                   \
+		t get_key_tmp_;                                                \
+		memcpy(&get_key_tmp_, (u), sizeof(t));                         \
+		x = (__typeof__(x))get_key_tmp_;                               \
+	} while (0)
+
+/*
+ * Turn an order difference given as magnitude + sign into the value returned
+ * by KeyCompare(). The magnitude is capped at 0x7fffffff so the result keeps
+ * its sign when a caller stores it in an `int`.
+ */
+static inline int64_t key_diff_saturate(uint64_t magnitude, bool negative)
+{
+	const uint64_t limit = 0x7fffffffULL;
+
+	if (magnitude > limit)
+		magnitude = limit;
+	return negative ? -(int64_t)magnitude : (int64_t)magnitude;
+}
+
+/*
+ * Compare a search key with the index key stored in a chunk.
+ *
+ * pchKey     - points to a DTCValue holding the search key.
+ * pCmpCookie - points to a CmpCookie naming the table and the field index.
+ * stMalloc   - allocator used to turn hOtherKey into a pointer.
+ * hOtherKey  - handle of the chunk whose key is compared; the key is read at
+ *              2 * sizeof(unsigned char) + 2 * sizeof(uint32_t) bytes from
+ *              the start of the chunk.
+ *
+ * Returns 0 when the keys are equal, a negative value when the search key
+ * sorts first and a positive value when it sorts last. For numeric keys the
+ * value is (search key - stored key):
+ *   - Signed/Unsigned: the exact difference, capped to +/-0x7fffffff. The
+ *     subtraction can no longer overflow, and the result cannot turn into 0
+ *     or change sign when a caller keeps it in an `int`, as
+ *     _TtreeNode::do_insert() and the traverse functions in this file do.
+ *   - Float: differences inside (-0.0001, 0.0001) are equal; any other
+ *     difference smaller than 1 in magnitude is reported as +/-1 instead of
+ *     being truncated to 0 ("equal").
+ * Small differences keep their exact magnitude because traverse_forward()
+ * with iInclusion compares the result against a range.
+ * String keys are compared case-insensitively, Binary keys bytewise; a key
+ * that is a prefix of the other sorts first. Unknown field types compare
+ * equal.
+ */
+int64_t KeyCompare(const char *pchKey, void *pCmpCookie, MallocBase &stMalloc,
+		   ALLOC_HANDLE_T hOtherKey)
+{
+	const char *pOtherKey =
+		reinterpret_cast<char *>(stMalloc.handle_to_ptr(hOtherKey));
+	pOtherKey =
+		pOtherKey + sizeof(unsigned char) * 2 + 2 * sizeof(uint32_t);
+
+	CmpCookie *cookie = reinterpret_cast<CmpCookie *>(pCmpCookie);
+	const DTCTableDefinition *t_pstTab = cookie->p_table_;
+	const int idx = cookie->m_index_;
+	int field_type = t_pstTab->field_type(idx);
+
+	char *v = const_cast<char *>(pchKey);
+	DTCValue *value = reinterpret_cast<DTCValue *>(v);
+
+	switch (field_type) {
+	case DField::Signed: {
+		int64_t skey = value->s64;
+		int64_t sotherKey;
+		if (unlikely(t_pstTab->field_size(idx) >
+			     (int)sizeof(int32_t))) {
+			GET_KEY(sotherKey, pOtherKey, int64_t);
+		} else {
+			GET_KEY(sotherKey, pOtherKey, int32_t);
+		}
+		// Unsigned arithmetic gives the exact distance without
+		// signed overflow.
+		if (skey >= sotherKey)
+			return key_diff_saturate(
+				(uint64_t)skey - (uint64_t)sotherKey, false);
+		return key_diff_saturate((uint64_t)sotherKey - (uint64_t)skey,
+					 true);
+	}
+
+	case DField::Unsigned: {
+		uint64_t ukey = value->u64;
+		uint64_t uotherKey;
+		if (unlikely(t_pstTab->field_size(idx) >
+			     (int)sizeof(uint32_t))) {
+			GET_KEY(uotherKey, pOtherKey, uint64_t);
+		} else {
+			GET_KEY(uotherKey, pOtherKey, uint32_t);
+		}
+		if (ukey >= uotherKey)
+			return key_diff_saturate(ukey - uotherKey, false);
+		return key_diff_saturate(uotherKey - ukey, true);
+	}
+
+	case DField::Float: {
+		double dkey = value->flt;
+		double dotherKey;
+		if (likely(t_pstTab->field_size(idx) > (int)sizeof(float))) {
+			GET_KEY(dotherKey, pOtherKey, double);
+		} else {
+			GET_KEY(dotherKey, pOtherKey, float);
+		}
+		const double diff = dkey - dotherKey;
+		if (diff > -0.0001 && diff < 0.0001)
+			return 0;
+		if (diff >= 2147483647.0)
+			return 2147483647;
+		if (diff <= -2147483647.0)
+			return -2147483647;
+		const int64_t whole = (int64_t)diff;
+		// A fractional difference must not collapse to "equal".
+		if (whole == 0)
+			return diff > 0 ? 1 : -1;
+		return whole;
+	}
+
+	case DField::String:
+	case DField::Binary: {
+		const bool isText = (DField::String == field_type);
+		int keyLen = 0, tKeyLen = 0;
+		char *key = NULL;
+		if (isText) {
+			keyLen = value->str.len;
+			key = value->str.ptr;
+		} else {
+			keyLen = value->bin.len;
+			key = value->bin.ptr;
+		}
+
+		// The stored key is an int length followed by the bytes.
+		GET_KEY(tKeyLen, pOtherKey, int);
+		if (keyLen == 0 && tKeyLen == 0)
+			return 0;
+		if (keyLen == 0)
+			return -1;
+		if (tKeyLen == 0)
+			return 1;
+
+		pOtherKey = pOtherKey + sizeof(int);
+		int len = keyLen < tKeyLen ? keyLen : tKeyLen;
+		int res = isText ? strncasecmp(key, pOtherKey, len) :
+				   memcmp(key, pOtherKey, len);
+		if (res != 0 || keyLen == tKeyLen)
+			return res;
+		// Equal over the common prefix: the shorter key sorts first.
+		return keyLen > tKeyLen ? 1 : -1;
+	}
+
+	default:
+		return 0;
+	}
+	return 0;
+}
+
+/*
+ * Item visitor that collects record handles into a pResCookie.
+ *
+ * The record's row count is read from the chunk at offset
+ * sizeof(unsigned char) + sizeof(uint32_t). Unless the cookie's limit
+ * (need_find_node_count > 0) has already been reached by has_got_row_count,
+ * the handle is appended to cookie->p_handle and both counters are advanced.
+ * Always returns 0.
+ */
+int Visit(MallocBase &stMalloc, ALLOC_HANDLE_T &hRecord, void *pCookie)
+{
+	pResCookie *out = reinterpret_cast<pResCookie *>(pCookie);
+	const char *chunk =
+		reinterpret_cast<char *>(stMalloc.handle_to_ptr(hRecord));
+	const size_t rowCountOffset = sizeof(unsigned char) + sizeof(uint32_t);
+	uint32_t rowsInRecord = *(uint32_t *)(chunk + rowCountOffset);
+
+	const bool limited = out->need_find_node_count > 0;
+	if (limited && out->has_got_row_count >= out->need_find_node_count)
+		return 0;
+
+	(out->p_handle)[out->has_got_node_count] = hRecord;
+	out->has_got_node_count = out->has_got_node_count + 1;
+	out->has_got_row_count = out->has_got_row_count + rowsInRecord;
+	return 0;
+}
+
+/*
+ * Reset the node to an empty leaf: no children, balance 0, no items, and
+ * every item slot set to INVALID_HANDLE. Always returns 0.
+ */
+int _TtreeNode::do_init()
+{
+	m_chBalance = 0;
+	m_ushNItems = 0;
+	m_hLeft = INVALID_HANDLE;
+	m_hRight = INVALID_HANDLE;
+
+	int slot = 0;
+	while (slot < PAGE_SIZE) {
+		m_ahItems[slot] = INVALID_HANDLE;
+		slot++;
+	}
+	return 0;
+}
+
+/*
+ * Allocate a new tree node from stMalloc holding hRecord as its only item.
+ * Returns the node's handle, or INVALID_HANDLE when the allocation fails.
+ */
+ALLOC_HANDLE_T _TtreeNode::Alloc(MallocBase &stMalloc, ALLOC_HANDLE_T hRecord)
+{
+	ALLOC_HANDLE_T hFresh = stMalloc.Malloc(sizeof(TtreeNode));
+
+	if (hFresh != INVALID_HANDLE) {
+		TtreeNode *fresh = (TtreeNode *)stMalloc.handle_to_ptr(hFresh);
+		fresh->do_init();
+		fresh->m_ahItems[0] = hRecord;
+		fresh->m_ushNItems = 1;
+	}
+
+	return hFresh;
+}
+
+/*
+ * Rebuild a DTCValue search key from the index key stored in chunk hReInsert.
+ *
+ * The field type and size come from the table and index in the CmpCookie.
+ * Integer and float keys are widened to 64-bit / double; String and Binary
+ * keys are exposed through pch->bin as an int length followed by the bytes
+ * (the pointer refers to the chunk itself, nothing is copied).
+ * Other field types leave *pch untouched. Always returns 0.
+ */
+int convert_cvalue(MallocBase &stMalloc, DTCValue *pch, void *pCmpCookie,
+		   ALLOC_HANDLE_T hReInsert)
+{
+	CmpCookie *cmp = reinterpret_cast<CmpCookie *>(pCmpCookie);
+	const DTCTableDefinition *table = cmp->p_table_;
+	const int fieldIdx = cmp->m_index_;
+	int kind = table->field_type(fieldIdx);
+
+	DataChunk *chunk = (DataChunk *)stMalloc.handle_to_ptr(hReInsert);
+	char *rawKey = chunk->index_key();
+
+	switch (kind) {
+	case DField::Signed:
+		pch->s64 = unlikely(table->field_size(fieldIdx) >
+				    (int)sizeof(int32_t)) ?
+				   *(int64_t *)rawKey :
+				   (int64_t) * (int32_t *)rawKey;
+		break;
+
+	case DField::Unsigned:
+		pch->u64 = unlikely(table->field_size(fieldIdx) >
+				    (int)sizeof(uint32_t)) ?
+				   *(uint64_t *)rawKey :
+				   (uint64_t) * (uint32_t *)rawKey;
+		break;
+
+	case DField::Float:
+		pch->flt = likely(table->field_size(fieldIdx) >
+				  (int)sizeof(float)) ?
+				   *(double *)rawKey :
+				   (double)*(float *)rawKey;
+		break;
+
+	case DField::String:
+	case DField::Binary:
+		pch->bin.len = *((int *)rawKey);
+		pch->bin.ptr = rawKey + sizeof(int);
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Insert hRecord, whose key is pchKey, into the subtree rooted at hNode.
+ *
+ * stMalloc    - allocator that owns the tree nodes and records.
+ * hNode       - in/out: root of the subtree; replaced by the new root when a
+ *               rotation is performed.
+ * pchKey      - search key handed to pfComp.
+ * pCmpCookie  - opaque cookie handed to pfComp and convert_cvalue().
+ * pfComp      - key comparator.
+ * hRecord     - handle of the record to store.
+ * isAllocNode - set to true when a new tree node had to be allocated.
+ *
+ * Returns  1 when the node's balance factor moved from 0 to +/-1 (the caller
+ *            then adjusts its own balance),
+ *          0 when the record was stored without that change,
+ *         -1 when allocating a new node failed,
+ *         -2 when an item with an equal key was found.
+ *
+ * NOTE(review): the comparator result is kept in an `int`. KeyCompare() in
+ * this file returns int64_t, so a wider result would be narrowed here --
+ * confirm against the KeyComparator typedef.
+ */
+int _TtreeNode::do_insert(MallocBase &stMalloc, ALLOC_HANDLE_T &hNode,
+			  const char *pchKey, void *pCmpCookie,
+			  KeyComparator pfComp, ALLOC_HANDLE_T hRecord,
+			  bool &isAllocNode)
+{
+	TtreeNode *p_node;
+
+	GET_OBJ(stMalloc, hNode, p_node);
+	uint16_t ushNodeCnt = p_node->m_ushNItems;
+	int iDiff = pfComp(pchKey, pCmpCookie, stMalloc, p_node->m_ahItems[0]);
+
+	// Equal to the first item: duplicate key.
+	if (iDiff == 0) {
+		//		assert(0);
+		return (-2);
+	}
+
+	// Key is below the first item. (iDiff == 0 cannot occur past the
+	// check above, so the `iDiff == 0` sub-condition below is dead.)
+	if (iDiff <= 0) {
+		ALLOC_HANDLE_T hLeft = p_node->m_hLeft;
+		// No left child and room in this node: prepend here.
+		if ((hLeft == INVALID_HANDLE || iDiff == 0) &&
+		    p_node->m_ushNItems < PAGE_SIZE) {
+			for (uint32_t i = ushNodeCnt; i > 0; i--)
+				p_node->m_ahItems[i] = p_node->m_ahItems[i - 1];
+			p_node->m_ahItems[0] = hRecord;
+			p_node->m_ushNItems++;
+			return (0);
+		}
+		if (hLeft == INVALID_HANDLE) {
+			// Node is full and has no left child: start a new one.
+			hLeft = Alloc(stMalloc, hRecord);
+			if (hLeft == INVALID_HANDLE)
+				return (-1);
+			isAllocNode = true;
+			p_node->m_hLeft = hLeft;
+		} else {
+			// Descend; the child handle may change after a rotation.
+			ALLOC_HANDLE_T hChild = hLeft;
+			int iGrow =
+				do_insert(stMalloc, hChild, pchKey, pCmpCookie,
+					  pfComp, hRecord, isAllocNode);
+			if (iGrow < 0)
+				return iGrow;
+			if (hChild != hLeft) {
+				hLeft = hChild;
+				p_node->m_hLeft = hChild;
+			}
+			if (iGrow == 0)
+				return (0);
+		}
+		// The left side gained a level: update the balance factor
+		// and rotate when it was already left-heavy.
+		if (p_node->m_chBalance > 0) {
+			p_node->m_chBalance = 0;
+			return (0);
+		} else if (p_node->m_chBalance == 0) {
+			p_node->m_chBalance = -1;
+			return (1);
+		} else {
+			TtreeNode *pstLeft =
+				(TtreeNode *)stMalloc.handle_to_ptr(hLeft);
+			if (pstLeft->m_chBalance < 0) { // single LL turn
+				p_node->m_hLeft = pstLeft->m_hRight;
+				pstLeft->m_hRight = hNode;
+				p_node->m_chBalance = 0;
+				pstLeft->m_chBalance = 0;
+				hNode = hLeft;
+			} else { // double LR turn
+				ALLOC_HANDLE_T hRight = pstLeft->m_hRight;
+				TtreeNode *pstRight =
+					(TtreeNode *)stMalloc.handle_to_ptr(
+						hRight);
+				pstLeft->m_hRight = pstRight->m_hLeft;
+				pstRight->m_hLeft = hLeft;
+				p_node->m_hLeft = pstRight->m_hRight;
+				pstRight->m_hRight = hNode;
+				p_node->m_chBalance =
+					(pstRight->m_chBalance < 0) ? 1 : 0;
+				pstLeft->m_chBalance =
+					(pstRight->m_chBalance > 0) ? -1 : 0;
+				pstRight->m_chBalance = 0;
+				hNode = hRight;
+			}
+			return (0);
+		}
+	}
+
+	// Key is above the first item: compare with the last item.
+	iDiff = pfComp(pchKey, pCmpCookie, stMalloc,
+		       p_node->m_ahItems[ushNodeCnt - 1]);
+	if (iDiff == 0) {
+		//		assert(0);
+		return (-2);
+	}
+	// Key is above the last item: mirror image of the branch above.
+	if (iDiff >= 0) {
+		ALLOC_HANDLE_T hRight = p_node->m_hRight;
+		// No right child and room in this node: append here.
+		if ((hRight == INVALID_HANDLE || iDiff == 0) &&
+		    p_node->m_ushNItems < PAGE_SIZE) {
+			p_node->m_ahItems[ushNodeCnt] = hRecord;
+			p_node->m_ushNItems++;
+			return (0);
+		}
+		if (hRight == INVALID_HANDLE) {
+			hRight = Alloc(stMalloc, hRecord);
+			if (hRight == INVALID_HANDLE)
+				return (-1);
+			p_node->m_hRight = hRight;
+			isAllocNode = true;
+		} else {
+			ALLOC_HANDLE_T hChild = hRight;
+			int iGrow =
+				do_insert(stMalloc, hChild, pchKey, pCmpCookie,
+					  pfComp, hRecord, isAllocNode);
+			if (iGrow < 0)
+				return iGrow;
+			if (hChild != hRight) {
+				hRight = hChild;
+				p_node->m_hRight = hChild;
+			}
+			if (iGrow == 0)
+				return (0);
+		}
+		if (p_node->m_chBalance < 0) {
+			p_node->m_chBalance = 0;
+			return (0);
+		} else if (p_node->m_chBalance == 0) {
+			p_node->m_chBalance = 1;
+			return (1);
+		} else {
+			TtreeNode *pstRight =
+				(TtreeNode *)stMalloc.handle_to_ptr(hRight);
+			if (pstRight->m_chBalance > 0) { // single RR turn
+				p_node->m_hRight = pstRight->m_hLeft;
+				pstRight->m_hLeft = hNode;
+				p_node->m_chBalance = 0;
+				pstRight->m_chBalance = 0;
+				hNode = hRight;
+			} else { // double RL turn
+				ALLOC_HANDLE_T hLeft = pstRight->m_hLeft;
+				TtreeNode *pstLeft =
+					(TtreeNode *)stMalloc.handle_to_ptr(
+						hLeft);
+				pstRight->m_hLeft = pstLeft->m_hRight;
+				pstLeft->m_hRight = hRight;
+				p_node->m_hRight = pstLeft->m_hLeft;
+				pstLeft->m_hLeft = hNode;
+				p_node->m_chBalance =
+					(pstLeft->m_chBalance > 0) ? -1 : 0;
+				pstRight->m_chBalance =
+					(pstLeft->m_chBalance < 0) ? 1 : 0;
+				pstLeft->m_chBalance = 0;
+				hNode = hLeft;
+			}
+			return (0);
+		}
+	}
+
+	// Key lies strictly between the first and the last item of this
+	// node: binary-search the slot it belongs to.
+	int iLeft = 1;
+	int iRight = ushNodeCnt - 1;
+	while (iLeft < iRight) {
+		int i = (iLeft + iRight) >> 1;
+		iDiff = pfComp(pchKey, pCmpCookie, stMalloc,
+			       p_node->m_ahItems[i]);
+		if (iDiff == 0) {
+			//			assert(0);
+			return (-2);
+		}
+		if (iDiff > 0) {
+			iLeft = i + 1;
+		} else {
+			iRight = i;
+			if (iDiff == 0)
+				break;
+		}
+	}
+	// Insert before item[r]
+	if (p_node->m_ushNItems < PAGE_SIZE) {
+		for (int i = ushNodeCnt; i > iRight; i--)
+			p_node->m_ahItems[i] = p_node->m_ahItems[i - 1];
+		p_node->m_ahItems[iRight] = hRecord;
+		p_node->m_ushNItems++;
+		return (0);
+	} else {
+		// Node is full: push out the first item (balance >= 0) or the
+		// last item (balance < 0), store the new record in the freed
+		// slot, then re-insert the evicted item through this same
+		// node. The item array is backed up so it can be restored if
+		// the re-insert fails.
+		TtreeNode stBackup;
+		memcpy(&stBackup, p_node, sizeof(TtreeNode));
+		ALLOC_HANDLE_T hReInsert;
+		if (p_node->m_chBalance >= 0) {
+			hReInsert = p_node->m_ahItems[0];
+			for (int i = 1; i < iRight; i++)
+				p_node->m_ahItems[i - 1] = p_node->m_ahItems[i];
+			p_node->m_ahItems[iRight - 1] = hRecord;
+		} else {
+			hReInsert = p_node->m_ahItems[ushNodeCnt - 1];
+			for (int i = ushNodeCnt - 1; i > iRight; i--)
+				p_node->m_ahItems[i] = p_node->m_ahItems[i - 1];
+			p_node->m_ahItems[iRight] = hRecord;
+		}
+
+		// Rebuild a search key for the evicted record from its chunk.
+		DTCValue pch;
+		convert_cvalue(stMalloc, &pch, pCmpCookie, hReInsert);
+		int iRet =
+			do_insert(stMalloc, hNode, (const char *)(&pch),
+				  pCmpCookie, pfComp, hReInsert, isAllocNode);
+		if (iRet < 0) {
+			memcpy(p_node->m_ahItems, stBackup.m_ahItems,
+			       sizeof(p_node->m_ahItems));
+		}
+		return (iRet);
+	}
+}
+
+/*
+ * Remove the item whose key equals pchKey from the subtree rooted at hNode.
+ * Only the tree slot is removed; the record chunk itself is not freed here.
+ *
+ * stMalloc   - allocator that owns the tree nodes.
+ * hNode      - in/out: root of the subtree; replaced when the node is freed
+ *              or a rotation changes the root.
+ * pchKey     - search key handed to pfComp.
+ * pCmpCookie - opaque cookie handed to pfComp and convert_cvalue().
+ * pfComp     - key comparator.
+ * isFreeNode - out flag, see the NOTE below.
+ *
+ * Returns  1 when a node was freed and replaced by its child, or when the
+ *            balance helper reported 1 after a removal below,
+ *          0 when the item was removed without that,
+ *         -1 when the key was not found,
+ *         values below -1 are passed through from recursive calls.
+ *
+ * NOTE(review): isFreeNode is set only when a single-item node with two
+ * children is hit (and that node is not freed), while the two paths that
+ * really call stMalloc.Free(hNode) leave it untouched -- confirm the
+ * intended meaning with the callers.
+ */
+int _TtreeNode::Delete(MallocBase &stMalloc, ALLOC_HANDLE_T &hNode,
+		       const char *pchKey, void *pCmpCookie,
+		       KeyComparator pfComp, bool &isFreeNode)
+{
+	TtreeNode *p_node;
+	ALLOC_HANDLE_T hTmp;
+
+	GET_OBJ(stMalloc, hNode, p_node);
+	uint16_t ushNodeCnt = p_node->m_ushNItems;
+	int iDiff = pfComp(pchKey, pCmpCookie, stMalloc, p_node->m_ahItems[0]);
+
+	// Key is below the first item: try the left subtree first.
+	if (iDiff < 0) {
+		ALLOC_HANDLE_T hLeft = p_node->m_hLeft;
+		if (hLeft != INVALID_HANDLE) {
+			ALLOC_HANDLE_T hChild = hLeft;
+			int iRet = Delete(stMalloc, hChild, pchKey, pCmpCookie,
+					  pfComp, isFreeNode);
+			if (iRet < -1)
+				return (iRet);
+			if (hChild != hLeft) {
+				p_node->m_hLeft = hChild;
+			}
+			if (iRet > 0) {
+				return balance_left_branch(stMalloc, hNode);
+			} else if (iRet == 0) {
+				return (0);
+			}
+			// iRet == -1 (not found on the left): fall through
+			// and look in this node.
+		}
+		//		assert(iDiff == 0);
+	}
+
+	// Key is not above the last item: scan this node for an exact match.
+	iDiff = pfComp(pchKey, pCmpCookie, stMalloc,
+		       p_node->m_ahItems[ushNodeCnt - 1]);
+	if (iDiff <= 0) {
+		for (int i = 0; i < ushNodeCnt; i++) {
+			if (pfComp(pchKey, pCmpCookie, stMalloc,
+				   p_node->m_ahItems[i]) == 0) {
+				if (ushNodeCnt == 1) {
+					// Last item of the node: with at most
+					// one child the node is freed and
+					// replaced by that child.
+					if (p_node->m_hRight ==
+					    INVALID_HANDLE) {
+						hTmp = p_node->m_hLeft;
+						stMalloc.Free(hNode);
+						hNode = hTmp;
+						return (1);
+					} else if (p_node->m_hLeft ==
+						   INVALID_HANDLE) {
+						hTmp = p_node->m_hRight;
+						stMalloc.Free(hNode);
+						hNode = hTmp;
+						return (1);
+					}
+					isFreeNode = true;
+				}
+				ALLOC_HANDLE_T hLeft = p_node->m_hLeft;
+				ALLOC_HANDLE_T hRight = p_node->m_hRight;
+				// Node would drop below MIN_ITEMS: refill it
+				// with a neighbouring item from a subtree.
+				if (ushNodeCnt <= MIN_ITEMS) {
+					if (hLeft != INVALID_HANDLE &&
+					    p_node->m_chBalance <= 0) {
+						// Borrow the greatest item of
+						// the left subtree.
+						TtreeNode *pstLeft;
+						GET_OBJ(stMalloc, hLeft,
+							pstLeft);
+						while (pstLeft->m_hRight !=
+						       INVALID_HANDLE) {
+							GET_OBJ(stMalloc,
+								pstLeft->m_hRight,
+								pstLeft);
+						}
+						// Shift items [0, i) right by
+						// one, overwriting slot i.
+						while (--i >= 0) {
+							p_node->m_ahItems[i + 1] =
+								p_node->m_ahItems
+									[i];
+						}
+						p_node->m_ahItems[0] =
+							pstLeft->m_ahItems
+								[pstLeft->m_ushNItems -
+								 1];
+						// Delete the borrowed item from
+						// the left subtree by its key.
+						DTCValue pch;
+						convert_cvalue(
+							stMalloc, &pch,
+							pCmpCookie,
+							p_node->m_ahItems[0]);
+
+						ALLOC_HANDLE_T hChild = hLeft;
+						int iRet = Delete(
+							stMalloc, hChild,
+							(const char *)(&pch),
+							pCmpCookie, pfComp,
+							isFreeNode);
+						if (iRet < -1) {
+							return (iRet);
+						}
+						if (hChild != hLeft) {
+							p_node->m_hLeft =
+								hChild;
+						}
+						if (iRet > 0) {
+							iRet = balance_left_branch(
+								stMalloc,
+								hNode);
+						}
+						return (iRet);
+					} else if (p_node->m_hRight !=
+						   INVALID_HANDLE) {
+						// Borrow the smallest item of
+						// the right subtree.
+						TtreeNode *pstRight;
+						GET_OBJ(stMalloc, hRight,
+							pstRight);
+						while (pstRight->m_hLeft !=
+						       INVALID_HANDLE) {
+							GET_OBJ(stMalloc,
+								pstRight->m_hLeft,
+								pstRight);
+						}
+						// Shift items (i, n) left by
+						// one, overwriting slot i.
+						while (++i < ushNodeCnt) {
+							p_node->m_ahItems[i - 1] =
+								p_node->m_ahItems
+									[i];
+						}
+						p_node->m_ahItems[ushNodeCnt -
+								  1] =
+							pstRight->m_ahItems[0];
+						DTCValue pch;
+						convert_cvalue(
+							stMalloc, &pch,
+							pCmpCookie,
+							p_node->m_ahItems
+								[ushNodeCnt -
+								 1]);
+						ALLOC_HANDLE_T hChild = hRight;
+						int iRet = Delete(
+							stMalloc, hChild,
+							(const char *)(&pch),
+							pCmpCookie, pfComp,
+							isFreeNode);
+						if (iRet < -1) {
+							return (iRet);
+						}
+						if (hChild != hRight) {
+							p_node->m_hRight =
+								hChild;
+						}
+						if (iRet > 0) {
+							iRet = balance_right_branch(
+								stMalloc,
+								hNode);
+						}
+						return (iRet);
+					}
+				}
+
+				// Plain removal: close the gap at slot i.
+				while (++i < ushNodeCnt) {
+					p_node->m_ahItems[i - 1] =
+						p_node->m_ahItems[i];
+				}
+				p_node->m_ushNItems--;
+
+				return (0);
+			}
+		}
+	}
+
+	// Not found in this node: continue in the right subtree.
+	// NOTE(review): this test uses the literal 0 where the rest of the
+	// function compares against INVALID_HANDLE.
+	ALLOC_HANDLE_T hRight = p_node->m_hRight;
+	if (hRight != 0) {
+		ALLOC_HANDLE_T hChild = hRight;
+		int iRet = Delete(stMalloc, hChild, pchKey, pCmpCookie, pfComp,
+				  isFreeNode);
+		if (iRet < -1) {
+			return (iRet);
+		}
+		if (hChild != hRight) {
+			p_node->m_hRight = hChild;
+		}
+		if (iRet > 0) {
+			return balance_right_branch(stMalloc, hNode);
+		} else {
+			return iRet;
+		}
+	}
+
+	return -1;
+}
+
+/*
+ * Rebalance hNode after Delete() reported a positive result for its left
+ * subtree.
+ *
+ * The balance factor is moved one step to the right; when the node was
+ * already right-heavy a single RR or a double RL rotation is performed and
+ * hNode is replaced by the new subtree root.
+ * Returns 1 or 0; Delete() passes this value on to its caller.
+ */
+inline int _TtreeNode::balance_left_branch(MallocBase &stMalloc,
+					   ALLOC_HANDLE_T &hNode)
+{
+	TtreeNode *top;
+	GET_OBJ(stMalloc, hNode, top);
+
+	if (top->m_chBalance < 0) {
+		top->m_chBalance = 0;
+		return (1);
+	}
+	if (top->m_chBalance == 0) {
+		top->m_chBalance = 1;
+		return (0);
+	}
+
+	// Right-heavy already: a rotation is required.
+	ALLOC_HANDLE_T hHeavy = top->m_hRight;
+	TtreeNode *heavy;
+	GET_OBJ(stMalloc, hHeavy, heavy);
+
+	if (heavy->m_chBalance < 0) { // double RL turn
+		ALLOC_HANDLE_T hPivot = heavy->m_hLeft;
+		TtreeNode *pivot;
+		GET_OBJ(stMalloc, hPivot, pivot);
+
+		heavy->m_hLeft = pivot->m_hRight;
+		pivot->m_hRight = hHeavy;
+		top->m_hRight = pivot->m_hLeft;
+		pivot->m_hLeft = hNode;
+
+		if (pivot->m_chBalance > 0)
+			top->m_chBalance = -1;
+		else
+			top->m_chBalance = 0;
+		if (pivot->m_chBalance < 0)
+			heavy->m_chBalance = 1;
+		else
+			heavy->m_chBalance = 0;
+		pivot->m_chBalance = 0;
+
+		hNode = hPivot;
+		return 1;
+	}
+
+	// single RR turn
+	top->m_hRight = heavy->m_hLeft;
+	heavy->m_hLeft = hNode;
+	if (heavy->m_chBalance != 0) {
+		top->m_chBalance = 0;
+		heavy->m_chBalance = 0;
+		hNode = hHeavy;
+		return 1;
+	}
+	top->m_chBalance = 1;
+	heavy->m_chBalance = -1;
+	hNode = hHeavy;
+	return 0;
+}
+
+/*
+ * Rebalance hNode after Delete() reported a positive result for its right
+ * subtree.
+ *
+ * The balance factor is moved one step to the left; when the node was
+ * already left-heavy a single LL or a double LR rotation is performed and
+ * hNode is replaced by the new subtree root.
+ * Returns 1 or 0; Delete() passes this value on to its caller.
+ */
+inline int _TtreeNode::balance_right_branch(MallocBase &stMalloc,
+					    ALLOC_HANDLE_T &hNode)
+{
+	TtreeNode *top;
+	GET_OBJ(stMalloc, hNode, top);
+
+	if (top->m_chBalance > 0) {
+		top->m_chBalance = 0;
+		return (1);
+	}
+	if (top->m_chBalance == 0) {
+		top->m_chBalance = -1;
+		return (0);
+	}
+
+	// Left-heavy already: a rotation is required.
+	ALLOC_HANDLE_T hHeavy = top->m_hLeft;
+	TtreeNode *heavy;
+	GET_OBJ(stMalloc, hHeavy, heavy);
+
+	if (heavy->m_chBalance > 0) { // double LR turn
+		ALLOC_HANDLE_T hPivot = heavy->m_hRight;
+		TtreeNode *pivot;
+		GET_OBJ(stMalloc, hPivot, pivot);
+
+		heavy->m_hRight = pivot->m_hLeft;
+		pivot->m_hLeft = hHeavy;
+		top->m_hLeft = pivot->m_hRight;
+		pivot->m_hRight = hNode;
+
+		if (pivot->m_chBalance < 0)
+			top->m_chBalance = 1;
+		else
+			top->m_chBalance = 0;
+		if (pivot->m_chBalance > 0)
+			heavy->m_chBalance = -1;
+		else
+			heavy->m_chBalance = 0;
+		pivot->m_chBalance = 0;
+
+		hNode = hPivot;
+		return (1);
+	}
+
+	// single LL turn
+	top->m_hLeft = heavy->m_hRight;
+	heavy->m_hRight = hNode;
+	if (heavy->m_chBalance != 0) {
+		top->m_chBalance = 0;
+		heavy->m_chBalance = 0;
+		hNode = hHeavy;
+		return (1);
+	}
+	top->m_chBalance = -1;
+	heavy->m_chBalance = 1;
+	hNode = hHeavy;
+	return (0);
+}
+
+/*
+ * Sum of stMalloc.chunk_size() over every item stored in the subtree rooted
+ * at hNode plus every tree node of that subtree. Returns 0 for
+ * INVALID_HANDLE.
+ */
+unsigned _TtreeNode::ask_for_destroy_size(MallocBase &stMalloc,
+					  ALLOC_HANDLE_T hNode)
+{
+	if (hNode == INVALID_HANDLE)
+		return 0;
+
+	TtreeNode *cur;
+	GET_OBJ(stMalloc, hNode, cur);
+	ALLOC_HANDLE_T hLeftChild = cur->m_hLeft;
+	ALLOC_HANDLE_T hRightChild = cur->m_hRight;
+
+	unsigned total = 0;
+
+	// Records referenced by this node.
+	int slot = 0;
+	while (slot < cur->m_ushNItems) {
+		total += stMalloc.chunk_size(cur->m_ahItems[slot]);
+		slot++;
+	}
+
+	// The node itself, then both subtrees.
+	total += stMalloc.chunk_size(hNode);
+	total += ask_for_destroy_size(stMalloc, hLeftChild);
+	total += ask_for_destroy_size(stMalloc, hRightChild);
+
+	return total;
+}
+
+/*
+ * Free the whole subtree rooted at hNode: every item handle stored in a
+ * node, then the node itself, then both child subtrees. The child handles
+ * are read before the node is freed. Always returns 0.
+ */
+int _TtreeNode::destory(MallocBase &stMalloc, ALLOC_HANDLE_T hNode)
+{
+	if (hNode == INVALID_HANDLE)
+		return (0);
+
+	TtreeNode *cur;
+	GET_OBJ(stMalloc, hNode, cur);
+	ALLOC_HANDLE_T hLeftChild = cur->m_hLeft;
+	ALLOC_HANDLE_T hRightChild = cur->m_hRight;
+
+	int slot = 0;
+	while (slot < cur->m_ushNItems) {
+		stMalloc.Free(cur->m_ahItems[slot]);
+		slot++;
+	}
+	stMalloc.Free(hNode);
+
+	destory(stMalloc, hLeftChild);
+	destory(stMalloc, hRightChild);
+	return (0);
+}
+
+/*
+ * Look up pchKey in the subtree rooted at this node.
+ *
+ * On success phRecord points at the item slot holding the matching handle
+ * and 1 is returned; otherwise phRecord is NULL and 0 is returned.
+ * A key below the first item is searched in the left subtree, a key above
+ * the last item in the right subtree, anything in between by binary search
+ * inside this node.
+ */
+int _TtreeNode::do_find(MallocBase &stMalloc, const char *pchKey,
+			void *pCmpCookie, KeyComparator pfComp,
+			ALLOC_HANDLE_T *&phRecord)
+{
+	phRecord = NULL;
+	if (m_ushNItems == 0)
+		return (0);
+
+	int cmp = pfComp(pchKey, pCmpCookie, stMalloc, m_ahItems[0]);
+	if (cmp == 0) {
+		phRecord = &(m_ahItems[0]);
+		return (1);
+	}
+
+	if (cmp < 0) {
+		// Below this node's range.
+		if (m_hLeft == INVALID_HANDLE)
+			return (0);
+		TtreeNode *child;
+		GET_OBJ(stMalloc, m_hLeft, child);
+		return child->do_find(stMalloc, pchKey, pCmpCookie, pfComp,
+				      phRecord);
+	}
+
+	cmp = pfComp(pchKey, pCmpCookie, stMalloc, m_ahItems[m_ushNItems - 1]);
+	if (cmp == 0) {
+		phRecord = &(m_ahItems[m_ushNItems - 1]);
+		return (1);
+	}
+
+	if (cmp > 0) {
+		// Above this node's range.
+		if (m_hRight == INVALID_HANDLE)
+			return (0);
+		TtreeNode *child;
+		GET_OBJ(stMalloc, m_hRight, child);
+		return child->do_find(stMalloc, pchKey, pCmpCookie, pfComp,
+				      phRecord);
+	}
+
+	// Strictly between the first and the last item: bisect the rest.
+	int lo = 1;
+	int hi = m_ushNItems - 1;
+	while (lo < hi) {
+		int mid = (lo + hi) >> 1;
+		cmp = pfComp(pchKey, pCmpCookie, stMalloc, m_ahItems[mid]);
+		if (cmp == 0) {
+			phRecord = &(m_ahItems[mid]);
+			return (1);
+		}
+		if (cmp > 0)
+			lo = mid + 1;
+		else
+			hi = mid;
+	}
+	return (0);
+}
+
+/*
+ * Handle-returning wrapper around the slot-returning do_find().
+ * hRecord receives the matching record handle, or INVALID_HANDLE when the
+ * key is not found. Returns the result of the underlying lookup.
+ */
+int _TtreeNode::do_find(MallocBase &stMalloc, const char *pchKey,
+			void *pCmpCookie, KeyComparator pfComp,
+			ALLOC_HANDLE_T &hRecord)
+{
+	ALLOC_HANDLE_T *slot;
+
+	hRecord = INVALID_HANDLE;
+	const int found = do_find(stMalloc, pchKey, pCmpCookie, pfComp, slot);
+	if (found != 1 || slot == NULL)
+		return (found);
+
+	hRecord = *slot;
+	return (found);
+}
+
+/*
+ * Exhaustively search the subtree rooted at this node for the record handle
+ * hRecord (no key comparison). This node's items are checked first, then
+ * the right subtree, then the left one. An empty node returns 0 without
+ * looking at its children. Returns 1 when found, 0 otherwise.
+ */
+int _TtreeNode::find_handle(MallocBase &stMalloc, ALLOC_HANDLE_T hRecord)
+{
+	if (m_ushNItems == 0)
+		return (0);
+
+	int slot = 0;
+	while (slot < m_ushNItems) {
+		if (m_ahItems[slot] == hRecord)
+			return (1);
+		slot++;
+	}
+
+	TtreeNode *child;
+	if (m_hRight != INVALID_HANDLE) {
+		GET_OBJ(stMalloc, m_hRight, child);
+		if (child->find_handle(stMalloc, hRecord) == 1)
+			return (1);
+	}
+	if (m_hLeft != INVALID_HANDLE) {
+		GET_OBJ(stMalloc, m_hLeft, child);
+		if (child->find_handle(stMalloc, hRecord) == 1)
+			return (1);
+	}
+
+	return (0);
+}
+
+/*
+ * Locate the tree node responsible for pchKey.
+ *
+ * hNode receives this node's handle when the key falls inside
+ * [first item, last item], or when the key is below the first item and
+ * there is no left child. A key above the last item is looked up in the
+ * right subtree; without a right child nothing is found.
+ * Returns 1 when hNode was set, 0 otherwise (hNode is INVALID_HANDLE then).
+ */
+int _TtreeNode::find_node(MallocBase &stMalloc, const char *pchKey,
+			  void *pCmpCookie, KeyComparator pfComp,
+			  ALLOC_HANDLE_T &hNode)
+{
+	hNode = INVALID_HANDLE;
+	if (m_ushNItems == 0)
+		return (0);
+
+	int cmp = pfComp(pchKey, pCmpCookie, stMalloc, m_ahItems[0]);
+	if (cmp == 0) {
+		hNode = stMalloc.ptr_to_handle(this);
+		return (1);
+	}
+
+	if (cmp < 0) {
+		// Below the first item.
+		if (m_hLeft == INVALID_HANDLE) {
+			hNode = stMalloc.ptr_to_handle(this);
+			return (1);
+		}
+		TtreeNode *child;
+		GET_OBJ(stMalloc, m_hLeft, child);
+		return child->find_node(stMalloc, pchKey, pCmpCookie, pfComp,
+					hNode);
+	}
+
+	cmp = pfComp(pchKey, pCmpCookie, stMalloc, m_ahItems[m_ushNItems - 1]);
+	if (cmp <= 0) {
+		// Inside this node's key range.
+		hNode = stMalloc.ptr_to_handle(this);
+		return (1);
+	}
+
+	// Above the last item.
+	if (m_hRight == INVALID_HANDLE)
+		return (0);
+	TtreeNode *child;
+	GET_OBJ(stMalloc, m_hRight, child);
+	return child->find_node(stMalloc, pchKey, pCmpCookie, pfComp, hNode);
+}
+
+/*
+ * In-order walk of the whole subtree: left subtree, this node's items from
+ * first to last, right subtree. pfVisit is called for every item; the walk
+ * stops at the first non-zero result, which is returned. Returns 0 when
+ * every visit returned 0.
+ */
+int _TtreeNode::traverse_forward(MallocBase &stMalloc, ItemVisit pfVisit,
+				 void *pCookie)
+{
+	int rc;
+
+	if (m_hLeft != INVALID_HANDLE) {
+		TtreeNode *left = (TtreeNode *)stMalloc.handle_to_ptr(m_hLeft);
+		rc = left->traverse_forward(stMalloc, pfVisit, pCookie);
+		if (rc != 0)
+			return (rc);
+	}
+
+	int slot = 0;
+	while (slot < m_ushNItems) {
+		rc = pfVisit(stMalloc, m_ahItems[slot], pCookie);
+		if (rc != 0)
+			return (rc);
+		slot++;
+	}
+
+	if (m_hRight != INVALID_HANDLE) {
+		TtreeNode *right =
+			(TtreeNode *)stMalloc.handle_to_ptr(m_hRight);
+		rc = right->traverse_forward(stMalloc, pfVisit, pCookie);
+		if (rc != 0)
+			return (rc);
+	}
+
+	return (0);
+}
+
+/*
+ * Reverse in-order walk of the whole subtree: right subtree, this node's
+ * items from last to first, left subtree. The walk stops at the first
+ * non-zero pfVisit result, which is returned; otherwise 0 is returned.
+ */
+int _TtreeNode::traverse_backward(MallocBase &stMalloc, ItemVisit pfVisit,
+				  void *pCookie)
+{
+	int rc;
+
+	if (m_hRight != INVALID_HANDLE) {
+		TtreeNode *right =
+			(TtreeNode *)stMalloc.handle_to_ptr(m_hRight);
+		rc = right->traverse_backward(stMalloc, pfVisit, pCookie);
+		if (rc != 0)
+			return (rc);
+	}
+
+	int slot = m_ushNItems;
+	while (slot-- > 0) {
+		rc = pfVisit(stMalloc, m_ahItems[slot], pCookie);
+		if (rc != 0)
+			return (rc);
+	}
+
+	if (m_hLeft != INVALID_HANDLE) {
+		TtreeNode *left = (TtreeNode *)stMalloc.handle_to_ptr(m_hLeft);
+		rc = left->traverse_backward(stMalloc, pfVisit, pCookie);
+		if (rc != 0)
+			return (rc);
+	}
+
+	return (0);
+}
+
+/*
+ * Post-order walk: right subtree, then left subtree, then this node's items
+ * from last to first. The walk stops at the first non-zero pfVisit result,
+ * which is returned; otherwise 0 is returned.
+ */
+int _TtreeNode::post_order_traverse(MallocBase &stMalloc, ItemVisit pfVisit,
+				    void *pCookie)
+{
+	int rc;
+
+	if (m_hRight != INVALID_HANDLE) {
+		TtreeNode *right =
+			(TtreeNode *)stMalloc.handle_to_ptr(m_hRight);
+		rc = right->post_order_traverse(stMalloc, pfVisit, pCookie);
+		if (rc != 0)
+			return (rc);
+	}
+
+	if (m_hLeft != INVALID_HANDLE) {
+		TtreeNode *left = (TtreeNode *)stMalloc.handle_to_ptr(m_hLeft);
+		rc = left->post_order_traverse(stMalloc, pfVisit, pCookie);
+		if (rc != 0)
+			return (rc);
+	}
+
+	int slot = m_ushNItems;
+	while (slot-- > 0) {
+		rc = pfVisit(stMalloc, m_ahItems[slot], pCookie);
+		if (rc != 0)
+			return (rc);
+	}
+
+	return (0);
+}
+
+/*
+ * In-order walk limited to items whose comparator result d = pfComp(key,
+ * item) satisfies -iInclusion <= d <= 0.
+ *
+ * The left subtree is entered only when the key is below this node's first
+ * item. Scanning this node stops at the first item with d < -iInclusion;
+ * the right subtree is entered only when the scan was not cut short.
+ * The walk stops at the first non-zero pfVisit result, which is returned;
+ * otherwise 0 is returned.
+ */
+int _TtreeNode::traverse_forward(MallocBase &stMalloc, const char *pchKey,
+				 void *pCmpCookie, KeyComparator pfComp,
+				 int iInclusion, ItemVisit pfVisit,
+				 void *pCookie)
+{
+	int cmp;
+	int rc;
+
+	if (m_hLeft != INVALID_HANDLE) {
+		cmp = pfComp(pchKey, pCmpCookie, stMalloc, m_ahItems[0]);
+		if (cmp < 0) {
+			TtreeNode *left =
+				(TtreeNode *)stMalloc.handle_to_ptr(m_hLeft);
+			rc = left->traverse_forward(stMalloc, pchKey,
+						    pCmpCookie, pfComp,
+						    iInclusion, pfVisit,
+						    pCookie);
+			if (rc != 0)
+				return (rc);
+		}
+	}
+
+	// Stays at the item count unless the scan below is cut short.
+	int slot = m_ushNItems;
+	cmp = pfComp(pchKey, pCmpCookie, stMalloc, m_ahItems[m_ushNItems - 1]);
+	if (cmp <= 0) {
+		for (slot = 0; slot < m_ushNItems; slot++) {
+			cmp = pfComp(pchKey, pCmpCookie, stMalloc,
+				     m_ahItems[slot]);
+			// Past the upper end of the window: stop scanning.
+			if (cmp < 0 - iInclusion)
+				break;
+			if (cmp > 0)
+				continue;
+			rc = pfVisit(stMalloc, m_ahItems[slot], pCookie);
+			if (rc != 0)
+				return (rc);
+		}
+	}
+
+	if (slot >= m_ushNItems && m_hRight != INVALID_HANDLE) {
+		TtreeNode *right =
+			(TtreeNode *)stMalloc.handle_to_ptr(m_hRight);
+		rc = right->traverse_forward(stMalloc, pchKey, pCmpCookie,
+					     pfComp, iInclusion, pfVisit,
+					     pCookie);
+		if (rc != 0)
+			return (rc);
+	}
+
+	return (0);
+}
+
+/*
+ * In-order walk limited to the closed key range [pchKey, pchKey1]: an item
+ * is visited when pfComp(pchKey, item) <= 0 and pfComp(pchKey1, item) >= 0.
+ *
+ * The left subtree is entered only when pchKey is below this node's first
+ * item, the right subtree only when pchKey1 is not below its last item.
+ * The walk stops at the first non-zero pfVisit result, which is returned;
+ * otherwise 0 is returned.
+ */
+int _TtreeNode::traverse_forward(MallocBase &stMalloc, const char *pchKey,
+				 const char *pchKey1, void *pCmpCookie,
+				 KeyComparator pfComp, ItemVisit pfVisit,
+				 void *pCookie)
+{
+	int lowCmp;
+	int highCmp;
+	int rc;
+
+	if (m_hLeft != INVALID_HANDLE) {
+		lowCmp = pfComp(pchKey, pCmpCookie, stMalloc, m_ahItems[0]);
+		if (lowCmp < 0) {
+			TtreeNode *left =
+				(TtreeNode *)stMalloc.handle_to_ptr(m_hLeft);
+			rc = left->traverse_forward(stMalloc, pchKey, pchKey1,
+						    pCmpCookie, pfComp,
+						    pfVisit, pCookie);
+			if (rc != 0)
+				return (rc);
+		}
+	}
+
+	lowCmp = pfComp(pchKey, pCmpCookie, stMalloc,
+			m_ahItems[m_ushNItems - 1]);
+	highCmp = pfComp(pchKey1, pCmpCookie, stMalloc, m_ahItems[0]);
+	// Scan only when the range overlaps this node:
+	// key1 >= item[0] AND key <= item[n]
+	if (highCmp >= 0 && lowCmp <= 0) {
+		for (int slot = 0; slot < m_ushNItems; slot++) {
+			lowCmp = pfComp(pchKey, pCmpCookie, stMalloc,
+					m_ahItems[slot]);
+			if (lowCmp > 0)
+				continue;
+			highCmp = pfComp(pchKey1, pCmpCookie, stMalloc,
+					 m_ahItems[slot]);
+			if (highCmp < 0)
+				continue;
+			rc = pfVisit(stMalloc, m_ahItems[slot], pCookie);
+			if (rc != 0)
+				return (rc);
+		}
+	}
+
+	highCmp = pfComp(pchKey1, pCmpCookie, stMalloc,
+			 m_ahItems[m_ushNItems - 1]);
+	if (highCmp >= 0 && m_hRight != INVALID_HANDLE) {
+		TtreeNode *right =
+			(TtreeNode *)stMalloc.handle_to_ptr(m_hRight);
+		rc = right->traverse_forward(stMalloc, pchKey, pchKey1,
+					     pCmpCookie, pfComp, pfVisit,
+					     pCookie);
+		if (rc != 0)
+			return (rc);
+	}
+
+	return (0);
+}
+
+// Visits, in ascending order, every item of this subtree whose key is greater
+// than or equal to pchKey.
+//
+// Returns 0 when the walk completes, or the first non-zero value returned by
+// pfVisit (which aborts the walk immediately).
+int _TtreeNode::traverse_forward(MallocBase &stMalloc, const char *pchKey,
+				 void *pCmpCookie, KeyComparator pfComp,
+				 ItemVisit pfVisit, void *pCookie)
+{
+	// KeyComparator returns int64_t; storing it in an int could truncate a
+	// large difference to zero or to the opposite sign.
+	int64_t iDiff;
+	int iRet;
+
+	// Descend left only when pchKey sorts before our first item.
+	if (m_hLeft != INVALID_HANDLE) {
+		iDiff = pfComp(pchKey, pCmpCookie, stMalloc, m_ahItems[0]);
+		if (iDiff < 0) {
+			TtreeNode *pLeft =
+				(TtreeNode *)stMalloc.handle_to_ptr(m_hLeft);
+			iRet = pLeft->traverse_forward(stMalloc, pchKey,
+						       pCmpCookie, pfComp,
+						       pfVisit, pCookie);
+			if (iRet != 0)
+				return (iRet);
+		}
+	}
+
+	// Skip the scan entirely when even the last item is below pchKey.
+	iDiff = pfComp(pchKey, pCmpCookie, stMalloc,
+		       m_ahItems[m_ushNItems - 1]);
+	if (iDiff <= 0) {
+		for (int i = 0; i < m_ushNItems; i++) {
+			iDiff = pfComp(pchKey, pCmpCookie, stMalloc,
+				       m_ahItems[i]);
+			if (iDiff > 0)
+				continue; // item sorts before pchKey
+			iRet = pfVisit(stMalloc, m_ahItems[i], pCookie);
+			if (iRet != 0)
+				return (iRet);
+		}
+	}
+
+	// The right subtree is always walked; the callee filters its own items.
+	if (m_hRight != INVALID_HANDLE) {
+		TtreeNode *pRight =
+			(TtreeNode *)stMalloc.handle_to_ptr(m_hRight);
+		iRet = pRight->traverse_forward(stMalloc, pchKey, pCmpCookie,
+						pfComp, pfVisit, pCookie);
+		if (iRet != 0)
+			return (iRet);
+	}
+
+	return (0);
+}
+
+// Visits, in descending order, the items of this subtree whose keys lie in
+// the closed range [pchKey, pchKey1].
+//
+// Returns 0 when the walk completes, or the first non-zero value returned by
+// pfVisit (which aborts the walk immediately).
+int _TtreeNode::traverse_backward(MallocBase &stMalloc, const char *pchKey,
+				  const char *pchKey1, void *pCmpCookie,
+				  KeyComparator pfComp, ItemVisit pfVisit,
+				  void *pCookie)
+{
+	// KeyComparator returns int64_t; storing it in an int could truncate a
+	// large difference to zero or to the opposite sign.
+	int64_t iDiff;
+	int64_t iDiff1;
+	int iRet;
+
+	// Larger keys first: descend right when the upper bound sorts after
+	// our last item.
+	if (m_hRight != INVALID_HANDLE) {
+		iDiff1 = pfComp(pchKey1, pCmpCookie, stMalloc,
+				m_ahItems[m_ushNItems - 1]);
+		if (iDiff1 > 0) {
+			TtreeNode *pRight =
+				(TtreeNode *)stMalloc.handle_to_ptr(m_hRight);
+			iRet = pRight->traverse_backward(stMalloc, pchKey,
+							 pchKey1, pCmpCookie,
+							 pfComp, pfVisit,
+							 pCookie);
+			if (iRet != 0)
+				return (iRet);
+		}
+	}
+
+	// Scan this node only if its key span overlaps the range, i.e. unless
+	// key1 < item[0] or key > item[n - 1].
+	iDiff = pfComp(pchKey, pCmpCookie, stMalloc,
+		       m_ahItems[m_ushNItems - 1]);
+	iDiff1 = pfComp(pchKey1, pCmpCookie, stMalloc, m_ahItems[0]);
+	if (iDiff1 >= 0 && iDiff <= 0) {
+		for (int i = m_ushNItems; --i >= 0;) {
+			iDiff = pfComp(pchKey, pCmpCookie, stMalloc,
+				       m_ahItems[i]);
+			if (iDiff > 0)
+				continue; // item sorts before the lower bound
+			iDiff1 = pfComp(pchKey1, pCmpCookie, stMalloc,
+					m_ahItems[i]);
+			if (iDiff1 < 0)
+				continue; // item sorts after the upper bound
+			iRet = pfVisit(stMalloc, m_ahItems[i], pCookie);
+			if (iRet != 0)
+				return (iRet);
+		}
+	}
+
+	// Smaller keys last: descend left unless the lower bound sorts after
+	// our first item.
+	iDiff = pfComp(pchKey, pCmpCookie, stMalloc, m_ahItems[0]);
+	if (iDiff <= 0 && m_hLeft != INVALID_HANDLE) {
+		TtreeNode *pLeft = (TtreeNode *)stMalloc.handle_to_ptr(m_hLeft);
+		iRet = pLeft->traverse_backward(stMalloc, pchKey, pchKey1,
+						pCmpCookie, pfComp, pfVisit,
+						pCookie);
+		if (iRet != 0)
+			return (iRet);
+	}
+
+	return (0);
+}
+
+// Visits, in descending order, every item of this subtree whose key is less
+// than or equal to pchKey.
+//
+// Returns 0 when the walk completes, or the first non-zero value returned by
+// pfVisit (which aborts the walk immediately).
+int _TtreeNode::traverse_backward(MallocBase &stMalloc, const char *pchKey,
+				  void *pCmpCookie, KeyComparator pfComp,
+				  ItemVisit pfVisit, void *pCookie)
+{
+	// KeyComparator returns int64_t; storing it in an int could truncate a
+	// large difference to zero or to the opposite sign.
+	int64_t iDiff;
+	int iRet;
+
+	// Descend right only when pchKey sorts after our last item.
+	if (m_hRight != INVALID_HANDLE) {
+		iDiff = pfComp(pchKey, pCmpCookie, stMalloc,
+			       m_ahItems[m_ushNItems - 1]);
+		if (iDiff > 0) {
+			TtreeNode *pRight =
+				(TtreeNode *)stMalloc.handle_to_ptr(m_hRight);
+			iRet = pRight->traverse_backward(stMalloc, pchKey,
+							 pCmpCookie, pfComp,
+							 pfVisit, pCookie);
+			if (iRet != 0)
+				return (iRet);
+		}
+	}
+
+	// Skip the scan entirely when even the first item is above pchKey.
+	iDiff = pfComp(pchKey, pCmpCookie, stMalloc, m_ahItems[0]);
+	if (iDiff >= 0) {
+		for (int i = m_ushNItems; --i >= 0;) {
+			iDiff = pfComp(pchKey, pCmpCookie, stMalloc,
+				       m_ahItems[i]);
+			if (iDiff < 0)
+				continue; // item sorts after pchKey
+			iRet = pfVisit(stMalloc, m_ahItems[i], pCookie);
+			if (iRet != 0)
+				return (iRet);
+		}
+	}
+
+	// The left subtree is always walked; the callee filters its own items.
+	if (m_hLeft != INVALID_HANDLE) {
+		TtreeNode *pLeft = (TtreeNode *)stMalloc.handle_to_ptr(m_hLeft);
+		iRet = pLeft->traverse_backward(stMalloc, pchKey, pCmpCookie,
+						pfComp, pfVisit, pCookie);
+		if (iRet != 0)
+			return (iRet);
+	}
+
+	return (0);
+}
+
+// Post-order walk restricted to the closed key range [pchKey, pchKey1]: the
+// left subtree, then the right subtree, and finally this node's own items
+// (in ascending slot order).
+//
+// Returns 0 when the walk completes, or the first non-zero value returned by
+// pfVisit (which aborts the walk immediately).
+int _TtreeNode::post_order_traverse(MallocBase &stMalloc, const char *pchKey,
+				    const char *pchKey1, void *pCmpCookie,
+				    KeyComparator pfComp, ItemVisit pfVisit,
+				    void *pCookie)
+{
+	// KeyComparator returns int64_t; storing it in an int could truncate a
+	// large difference to zero or to the opposite sign.
+	int64_t iDiff;
+	int64_t iDiff1;
+	int iRet;
+
+	// Descend left only when the lower bound sorts before our first item.
+	if (m_hLeft != INVALID_HANDLE) {
+		iDiff = pfComp(pchKey, pCmpCookie, stMalloc, m_ahItems[0]);
+		if (iDiff < 0) {
+			TtreeNode *pLeft =
+				(TtreeNode *)stMalloc.handle_to_ptr(m_hLeft);
+			iRet = pLeft->post_order_traverse(stMalloc, pchKey,
+							  pchKey1, pCmpCookie,
+							  pfComp, pfVisit,
+							  pCookie);
+			if (iRet != 0)
+				return (iRet);
+		}
+	}
+
+	// Descend right unless the upper bound sorts before our last item.
+	iDiff1 = pfComp(pchKey1, pCmpCookie, stMalloc,
+			m_ahItems[m_ushNItems - 1]);
+	if (iDiff1 >= 0 && m_hRight != INVALID_HANDLE) {
+		TtreeNode *pRight =
+			(TtreeNode *)stMalloc.handle_to_ptr(m_hRight);
+		iRet = pRight->post_order_traverse(stMalloc, pchKey, pchKey1,
+						   pCmpCookie, pfComp, pfVisit,
+						   pCookie);
+		if (iRet != 0)
+			return (iRet);
+	}
+
+	// Children are done; now this node's items, provided its key span
+	// overlaps the range (not key1 < item[0], not key > item[n - 1]).
+	iDiff = pfComp(pchKey, pCmpCookie, stMalloc,
+		       m_ahItems[m_ushNItems - 1]);
+	iDiff1 = pfComp(pchKey1, pCmpCookie, stMalloc, m_ahItems[0]);
+	if (iDiff1 >= 0 && iDiff <= 0) {
+		for (int i = 0; i < m_ushNItems; i++) {
+			iDiff = pfComp(pchKey, pCmpCookie, stMalloc,
+				       m_ahItems[i]);
+			if (iDiff > 0)
+				continue; // item sorts before the lower bound
+			iDiff1 = pfComp(pchKey1, pCmpCookie, stMalloc,
+					m_ahItems[i]);
+			if (iDiff1 < 0)
+				continue; // item sorts after the upper bound
+			iRet = pfVisit(stMalloc, m_ahItems[i], pCookie);
+			if (iRet != 0)
+				return (iRet);
+		}
+	}
+
+	return (0);
+}
+
+// Post-order walk over the items whose key is greater than or equal to
+// pchKey: the left subtree, then the right subtree, then this node's items.
+//
+// Returns 0 when the walk completes, or the first non-zero value returned by
+// pfVisit (which aborts the walk immediately).
+int _TtreeNode::post_order_traverse_ge(MallocBase &stMalloc, const char *pchKey,
+				       void *pCmpCookie, KeyComparator pfComp,
+				       ItemVisit pfVisit, void *pCookie)
+{
+	// KeyComparator returns int64_t; storing it in an int could truncate a
+	// large difference to zero or to the opposite sign.
+	int64_t iDiff;
+	int iRet;
+
+	// Descend left only when pchKey sorts before our first item.
+	if (m_hLeft != INVALID_HANDLE) {
+		iDiff = pfComp(pchKey, pCmpCookie, stMalloc, m_ahItems[0]);
+		if (iDiff < 0) {
+			TtreeNode *pLeft =
+				(TtreeNode *)stMalloc.handle_to_ptr(m_hLeft);
+			iRet = pLeft->post_order_traverse_ge(stMalloc, pchKey,
+							     pCmpCookie, pfComp,
+							     pfVisit, pCookie);
+			if (iRet != 0)
+				return (iRet);
+		}
+	}
+
+	// The right subtree is always walked; the callee filters its own items.
+	if (m_hRight != INVALID_HANDLE) {
+		TtreeNode *pRight =
+			(TtreeNode *)stMalloc.handle_to_ptr(m_hRight);
+		iRet = pRight->post_order_traverse_ge(stMalloc, pchKey,
+						      pCmpCookie, pfComp,
+						      pfVisit, pCookie);
+		if (iRet != 0)
+			return (iRet);
+	}
+
+	// Finally this node's items; skipped when even the last one is below
+	// pchKey.
+	iDiff = pfComp(pchKey, pCmpCookie, stMalloc,
+		       m_ahItems[m_ushNItems - 1]);
+	if (iDiff <= 0) {
+		for (int i = 0; i < m_ushNItems; i++) {
+			iDiff = pfComp(pchKey, pCmpCookie, stMalloc,
+				       m_ahItems[i]);
+			if (iDiff > 0)
+				continue; // item sorts before pchKey
+			iRet = pfVisit(stMalloc, m_ahItems[i], pCookie);
+			if (iRet != 0)
+				return (iRet);
+		}
+	}
+
+	return (0);
+}
+
+// Post-order walk over the items whose key is less than or equal to pchKey:
+// the right subtree, then the left subtree, then this node's items (from the
+// last slot down to the first).
+//
+// Returns 0 when the walk completes, or the first non-zero value returned by
+// pfVisit (which aborts the walk immediately).
+int _TtreeNode::post_order_traverse_le(MallocBase &stMalloc, const char *pchKey,
+				       void *pCmpCookie, KeyComparator pfComp,
+				       ItemVisit pfVisit, void *pCookie)
+{
+	// KeyComparator returns int64_t; storing it in an int could truncate a
+	// large difference to zero or to the opposite sign.
+	int64_t iDiff;
+	int iRet;
+
+	// Descend right only when pchKey sorts after our last item.
+	if (m_hRight != INVALID_HANDLE) {
+		iDiff = pfComp(pchKey, pCmpCookie, stMalloc,
+			       m_ahItems[m_ushNItems - 1]);
+		if (iDiff > 0) {
+			TtreeNode *pRight =
+				(TtreeNode *)stMalloc.handle_to_ptr(m_hRight);
+			iRet = pRight->post_order_traverse_le(stMalloc, pchKey,
+							      pCmpCookie,
+							      pfComp, pfVisit,
+							      pCookie);
+			if (iRet != 0)
+				return (iRet);
+		}
+	}
+
+	// The left subtree is always walked; the callee filters its own items.
+	if (m_hLeft != INVALID_HANDLE) {
+		TtreeNode *pLeft = (TtreeNode *)stMalloc.handle_to_ptr(m_hLeft);
+		iRet = pLeft->post_order_traverse_le(stMalloc, pchKey,
+						     pCmpCookie, pfComp,
+						     pfVisit, pCookie);
+		if (iRet != 0)
+			return (iRet);
+	}
+
+	// Finally this node's items; skipped when even the first one is above
+	// pchKey.
+	iDiff = pfComp(pchKey, pCmpCookie, stMalloc, m_ahItems[0]);
+	if (iDiff >= 0) {
+		for (int i = m_ushNItems; --i >= 0;) {
+			iDiff = pfComp(pchKey, pCmpCookie, stMalloc,
+				       m_ahItems[i]);
+			if (iDiff < 0)
+				continue; // item sorts after pchKey
+			iRet = pfVisit(stMalloc, m_ahItems[i], pCookie);
+			if (iRet != 0)
+				return (iRet);
+		}
+	}
+
+	return (0);
+}
+
+// Creates an empty tree bound to the allocator stMalloc: there is no root
+// node yet and the error-message buffer holds an empty string.
+Ttree::Ttree(MallocBase &stMalloc)
+	: root_handle_(INVALID_HANDLE), m_stMalloc(stMalloc)
+{
+	err_message_[0] = '\0';
+}
+
+// The destructor body is empty, so destroying a Ttree object does not by
+// itself release any tree nodes.
+Ttree::~Ttree()
+{
+	// nothing to release here
+}
+
+// Returns the handle stored in slot 0 of the root node, or INVALID_HANDLE
+// when the tree has no root.
+ALLOC_HANDLE_T Ttree::first_node()
+{
+	if (root_handle_ != INVALID_HANDLE) {
+		TtreeNode *pRoot;
+		GET_OBJ(m_stMalloc, root_handle_, pRoot);
+		return pRoot->m_ahItems[0];
+	}
+
+	return INVALID_HANDLE;
+}
+
+// Inserts hRecord under pchKey.
+//
+// On an empty tree a fresh root node is allocated and isAllocNode is set to
+// true. Otherwise the work is delegated to TtreeNode::do_insert, which may
+// hand back a different root handle.
+//
+// Returns 0 on success, EC_KEY_EXIST when the node layer reports -2,
+// EC_NO_MEM when a node cannot be allocated, and -1 for any other negative
+// node-layer result. A description of the failure is left in err_message_.
+int Ttree::do_insert(const char *pchKey, void *pCmpCookie, KeyComparator pfComp,
+		     ALLOC_HANDLE_T hRecord, bool &isAllocNode)
+{
+	// Empty tree: the new record becomes the only item of a new root.
+	if (root_handle_ == INVALID_HANDLE) {
+		const ALLOC_HANDLE_T hNewRoot =
+			TtreeNode::Alloc(m_stMalloc, hRecord);
+		if (hNewRoot == INVALID_HANDLE) {
+			snprintf(err_message_, sizeof(err_message_),
+				 "alloc tree-node error: %s",
+				 m_stMalloc.get_err_msg());
+			return (EC_NO_MEM);
+		}
+		isAllocNode = true;
+		root_handle_ = hNewRoot;
+		return (0);
+	}
+
+	// The node layer updates hRoot in place if the root changes.
+	ALLOC_HANDLE_T hRoot = root_handle_;
+	const int iRet = TtreeNode::do_insert(m_stMalloc, hRoot, pchKey,
+					      pCmpCookie, pfComp, hRecord,
+					      isAllocNode);
+	if (iRet < 0) {
+		switch (iRet) {
+		case -2:
+			snprintf(err_message_, sizeof(err_message_),
+				 "key already exists.");
+			return (EC_KEY_EXIST);
+		case -1:
+			snprintf(err_message_, sizeof(err_message_),
+				 "alloc tree-node error: %s",
+				 m_stMalloc.get_err_msg());
+			return (EC_NO_MEM);
+		default:
+			snprintf(err_message_, sizeof(err_message_),
+				 "insert error");
+			return (-1);
+		}
+	}
+
+	if (hRoot != root_handle_)
+		root_handle_ = hRoot;
+
+	return (0);
+}
+
+// Removes pchKey from the tree by delegating to TtreeNode::Delete and adopts
+// the root handle it hands back.
+//
+// Returns 0 on success or when the tree is empty, and -1 when the node layer
+// reports a negative result (err_message_ then says "tree error" for -1 and
+// "internal error" for anything lower).
+int Ttree::Delete(const char *pchKey, void *pCmpCookie, KeyComparator pfComp,
+		  bool &isFreeNode)
+{
+	// Nothing to delete from an empty tree.
+	if (root_handle_ == INVALID_HANDLE)
+		return (0);
+
+	ALLOC_HANDLE_T hRoot = root_handle_;
+	const int iRet = TtreeNode::Delete(m_stMalloc, hRoot, pchKey,
+					   pCmpCookie, pfComp, isFreeNode);
+	if (iRet == -1) {
+		snprintf(err_message_, sizeof(err_message_), "tree error");
+		return (-1);
+	}
+	if (iRet < -1) {
+		snprintf(err_message_, sizeof(err_message_), "internal error");
+		return (-1);
+	}
+
+	if (hRoot != root_handle_)
+		root_handle_ = hRoot;
+
+	return (0);
+}
+
+// Forwards the lookup of hRecord to the root node's find_handle and returns
+// its result; an empty tree yields 0.
+int Ttree::find_handle(ALLOC_HANDLE_T hRecord)
+{
+	if (root_handle_ != INVALID_HANDLE) {
+		TtreeNode *pRoot;
+		GET_OBJ(m_stMalloc, root_handle_, pRoot);
+		return pRoot->find_handle(m_stMalloc, hRecord);
+	}
+
+	return (0);
+}
+
+// Looks pchKey up through the root node. hRecord is reset to INVALID_HANDLE
+// first, so it is left invalid when the tree is empty; in that case 0 is
+// returned. Otherwise the node-level do_find result is returned unchanged.
+int Ttree::do_find(const char *pchKey, void *pCmpCookie, KeyComparator pfComp,
+		   ALLOC_HANDLE_T &hRecord)
+{
+	hRecord = INVALID_HANDLE;
+
+	if (root_handle_ != INVALID_HANDLE) {
+		TtreeNode *pRoot;
+		GET_OBJ(m_stMalloc, root_handle_, pRoot);
+		return pRoot->do_find(m_stMalloc, pchKey, pCmpCookie, pfComp,
+				      hRecord);
+	}
+
+	return (0);
+}
+
+// Pointer-returning variant of the lookup. phRecord is reset to NULL first,
+// so it stays NULL when the tree is empty; in that case 0 is returned.
+// Otherwise the node-level do_find result is returned unchanged.
+int Ttree::do_find(const char *pchKey, void *pCmpCookie, KeyComparator pfComp,
+		   ALLOC_HANDLE_T *&phRecord)
+{
+	phRecord = NULL;
+
+	if (root_handle_ != INVALID_HANDLE) {
+		TtreeNode *pRoot;
+		GET_OBJ(m_stMalloc, root_handle_, pRoot);
+		return pRoot->do_find(m_stMalloc, pchKey, pCmpCookie, pfComp,
+				      phRecord);
+	}
+
+	return (0);
+}
+
+// Hands the current root to TtreeNode::destory and then marks the tree as
+// empty. The node-level result is not propagated: this always returns 0.
+int Ttree::destory()
+{
+	(void)TtreeNode::destory(m_stMalloc, root_handle_);
+	root_handle_ = INVALID_HANDLE;
+
+	return (0);
+}
+
+// Returns whatever TtreeNode::ask_for_destroy_size reports for the current
+// root handle; the tree itself is not modified here.
+unsigned Ttree::ask_for_destroy_size(void)
+{
+	const unsigned uSize =
+		TtreeNode::ask_for_destroy_size(m_stMalloc, root_handle_);
+	return uSize;
+}
+
+// Runs the root node's unfiltered traverse_forward with pfVisit / pCookie
+// and returns its result. An empty tree returns 0 without visiting anything.
+int Ttree::traverse_forward(ItemVisit pfVisit, void *pCookie)
+{
+	if (root_handle_ == INVALID_HANDLE)
+		return (0);
+
+	TtreeNode *pRoot;
+	GET_OBJ(m_stMalloc, root_handle_, pRoot);
+	return pRoot->traverse_forward(m_stMalloc, pfVisit, pCookie);
+}
+
+// Runs the root node's unfiltered traverse_backward with pfVisit / pCookie
+// and returns its result. An empty tree returns 0 without visiting anything.
+int Ttree::traverse_backward(ItemVisit pfVisit, void *pCookie)
+{
+	if (root_handle_ == INVALID_HANDLE)
+		return (0);
+
+	TtreeNode *pRoot;
+	GET_OBJ(m_stMalloc, root_handle_, pRoot);
+	return pRoot->traverse_backward(m_stMalloc, pfVisit, pCookie);
+}
+
+// Runs the root node's unfiltered post_order_traverse with pfVisit / pCookie
+// and returns its result. An empty tree returns 0 without visiting anything.
+int Ttree::post_order_traverse(ItemVisit pfVisit, void *pCookie)
+{
+	if (root_handle_ == INVALID_HANDLE)
+		return (0);
+
+	TtreeNode *pRoot;
+	GET_OBJ(m_stMalloc, root_handle_, pRoot);
+	return pRoot->post_order_traverse(m_stMalloc, pfVisit, pCookie);
+}
+
+// Ascending walk over the key window [pchKey, pchKey + iInclusion], started
+// at the root node. An empty tree returns 0 without visiting anything;
+// otherwise the node-level result is returned.
+//
+// NOTE(review): iInclusion is int64_t here, but the node-level overload
+// declares it as int, so the value is narrowed at the call below.
+int Ttree::traverse_forward(const char *pchKey, void *pCmpCookie,
+			    KeyComparator pfComp, int64_t iInclusion,
+			    ItemVisit pfVisit, void *pCookie)
+{
+	if (root_handle_ == INVALID_HANDLE)
+		return (0);
+
+	TtreeNode *pRoot;
+	GET_OBJ(m_stMalloc, root_handle_, pRoot);
+	return pRoot->traverse_forward(m_stMalloc, pchKey, pCmpCookie, pfComp,
+				       iInclusion, pfVisit, pCookie);
+}
+
+// Ascending walk over every item whose key is >= pchKey, started at the root
+// node. An empty tree returns 0 without visiting anything; otherwise the
+// node-level result is returned.
+int Ttree::traverse_forward(const char *pchKey, void *pCmpCookie,
+			    KeyComparator pfComp, ItemVisit pfVisit,
+			    void *pCookie)
+{
+	if (root_handle_ == INVALID_HANDLE)
+		return (0);
+
+	TtreeNode *pRoot;
+	GET_OBJ(m_stMalloc, root_handle_, pRoot);
+	return pRoot->traverse_forward(m_stMalloc, pchKey, pCmpCookie, pfComp,
+				       pfVisit, pCookie);
+}
+
+// Ascending walk over the closed key range [pchKey, pchKey1], started at the
+// root node. An empty tree returns 0 without visiting anything; otherwise
+// the node-level result is returned.
+int Ttree::traverse_forward(const char *pchKey, const char *pchKey1,
+			    void *pCmpCookie, KeyComparator pfComp,
+			    ItemVisit pfVisit, void *pCookie)
+{
+	if (root_handle_ == INVALID_HANDLE)
+		return (0);
+
+	TtreeNode *pRoot;
+	GET_OBJ(m_stMalloc, root_handle_, pRoot);
+	return pRoot->traverse_forward(m_stMalloc, pchKey, pchKey1, pCmpCookie,
+				       pfComp, pfVisit, pCookie);
+}
+
+// Descending walk over every item whose key is <= pchKey, started at the
+// root node. An empty tree returns 0 without visiting anything; otherwise
+// the node-level result is returned.
+int Ttree::traverse_backward(const char *pchKey, void *pCmpCookie,
+			     KeyComparator pfComp, ItemVisit pfVisit,
+			     void *pCookie)
+{
+	if (root_handle_ == INVALID_HANDLE)
+		return (0);
+
+	TtreeNode *pRoot;
+	GET_OBJ(m_stMalloc, root_handle_, pRoot);
+	return pRoot->traverse_backward(m_stMalloc, pchKey, pCmpCookie, pfComp,
+					pfVisit, pCookie);
+}
+
+// Descending walk over the closed key range [pchKey, pchKey1], started at
+// the root node. An empty tree returns 0 without visiting anything;
+// otherwise the node-level result is returned.
+int Ttree::traverse_backward(const char *pchKey, const char *pchKey1,
+			     void *pCmpCookie, KeyComparator pfComp,
+			     ItemVisit pfVisit, void *pCookie)
+{
+	if (root_handle_ == INVALID_HANDLE)
+		return (0);
+
+	TtreeNode *pRoot;
+	GET_OBJ(m_stMalloc, root_handle_, pRoot);
+	return pRoot->traverse_backward(m_stMalloc, pchKey, pchKey1,
+					pCmpCookie, pfComp, pfVisit, pCookie);
+}
+
+// Post-order walk (subtrees before the node's own items) over the closed key
+// range [pchKey, pchKey1], started at the root node. An empty tree returns 0
+// without visiting anything; otherwise the node-level result is returned.
+int Ttree::post_order_traverse(const char *pchKey, const char *pchKey1,
+			       void *pCmpCookie, KeyComparator pfComp,
+			       ItemVisit pfVisit, void *pCookie)
+{
+	if (root_handle_ == INVALID_HANDLE)
+		return (0);
+
+	TtreeNode *pRoot;
+	GET_OBJ(m_stMalloc, root_handle_, pRoot);
+	return pRoot->post_order_traverse(m_stMalloc, pchKey, pchKey1,
+					  pCmpCookie, pfComp, pfVisit,
+					  pCookie);
+}
+
+// Post-order walk over every item whose key is >= pchKey, started at the
+// root node. An empty tree returns 0 without visiting anything; otherwise
+// the node-level result is returned.
+int Ttree::post_order_traverse_ge(const char *pchKey, void *pCmpCookie,
+				  KeyComparator pfComp, ItemVisit pfVisit,
+				  void *pCookie)
+{
+	if (root_handle_ == INVALID_HANDLE)
+		return (0);
+
+	TtreeNode *pRoot;
+	GET_OBJ(m_stMalloc, root_handle_, pRoot);
+	return pRoot->post_order_traverse_ge(m_stMalloc, pchKey, pCmpCookie,
+					     pfComp, pfVisit, pCookie);
+}
+
+// Post-order walk over every item whose key is <= pchKey, started at the
+// root node. An empty tree returns 0 without visiting anything; otherwise
+// the node-level result is returned.
+int Ttree::post_order_traverse_le(const char *pchKey, void *pCmpCookie,
+				  KeyComparator pfComp, ItemVisit pfVisit,
+				  void *pCookie)
+{
+	if (root_handle_ == INVALID_HANDLE)
+		return (0);
+
+	TtreeNode *pRoot;
+	GET_OBJ(m_stMalloc, root_handle_, pRoot);
+	return pRoot->post_order_traverse_le(m_stMalloc, pchKey, pCmpCookie,
+					     pfComp, pfVisit, pCookie);
+}

+ 354 - 0
src/core/tree/t_tree.h

@@ -0,0 +1,354 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef T_TREE_H
+#define T_TREE_H
+
+#include <stdint.h>
+#include "mem/mallocator.h"
+
+int64_t KeyCompare(const char *pchKey, void *pCmpCookie, MallocBase &stMalloc,
+		   ALLOC_HANDLE_T hOtherKey);
+int Visit(MallocBase &stMalloc, ALLOC_HANDLE_T &hRecord, void *pCookie);
+
+typedef int64_t (*KeyComparator)(const char *pchKey, void *pCmpCookie,
+				 MallocBase &stMalloc,
+				 ALLOC_HANDLE_T hOtherKey);
+typedef int (*ItemVisit)(MallocBase &stMalloc, ALLOC_HANDLE_T &hRecord,
+			 void *pCookie);
+
+class Ttree { // tree facade: holds the root handle and forwards work to TtreeNode
+    protected:
+	ALLOC_HANDLE_T root_handle_; // handle of the root node, INVALID_HANDLE when the tree is empty
+	MallocBase &m_stMalloc; // allocator used to resolve handles and allocate nodes
+	char err_message_[100]; // text of the last error recorded by do_insert / Delete
+
+    public:
+	Ttree(MallocBase &stMalloc);
+	~Ttree();
+
+	const char *get_err_msg() // returns the internal error-message buffer
+	{
+		return err_message_;
+	}
+	const ALLOC_HANDLE_T Root() const // returns the current root handle
+	{
+		return root_handle_;
+	}
+	ALLOC_HANDLE_T first_node(); // first item handle of the root node, INVALID_HANDLE if empty
+
+	/*************************************************
+	  Description:	attach to a block of memory that has already been
+			formatted as a tree
+	  Input:		hRoot		handle of the existing root node
+	  Output:		
+	  Return:		
+	*************************************************/
+	void do_attach(ALLOC_HANDLE_T hRoot)
+	{
+		root_handle_ = hRoot;
+	}
+
+	/*************************************************
+	  Description:	insert a key into the tree; hRecord is the data that
+			belongs to the key (it contains the key itself)
+	  Input:		pchKey		key to insert
+				pCmpCookie	passed to the user-defined pfComp whenever the key is
+						compared against items in the tree
+				pfComp		user-defined key comparison function
+				hRecord		handle holding the key to insert together with its data
+	  Output:		isAllocNode	set to true when a new root node is allocated; also
+						passed on to the node-level insert
+	  Return:		0 on success, EC_NO_MEM when out of memory, EC_KEY_EXIST
+				when the key already exists, any other value is an error
+	*************************************************/
+	int do_insert(const char *pchKey, void *pCmpCookie,
+		      KeyComparator pfComp, ALLOC_HANDLE_T hRecord,
+		      bool &isAllocNode);
+
+	/*************************************************
+	  Description:	delete a key and its data from the tree (the memory
+			that belongs to the key itself is not released automatically)
+	  Input:		pchKey		key to delete
+				pCmpCookie	passed to the user-defined pfComp whenever the key is
+						compared against items in the tree
+				pfComp		user-defined key comparison function
+	  Output:		isFreeNode	passed on to the node-level delete
+	  Return:		0 on success, any other value is an error
+	*************************************************/
+	int Delete(const char *pchKey, void *pCmpCookie, KeyComparator pfComp,
+		   bool &isFreeNode);
+
+	int find_handle(ALLOC_HANDLE_T hRecord); // forwards to the root node's find_handle; 0 on an empty tree
+
+	/*************************************************
+	  Description:	look up the data that belongs to a key
+	  Input:		pchKey		key to look up
+				pCmpCookie	passed to the user-defined pfComp whenever the key is
+						compared against items in the tree
+				pfComp		user-defined key comparison function
+	  Output:		hRecord		handle holding the key that was found together with its data
+	  Return:		0 when nothing is found, 1 when the data is found
+	*************************************************/
+	int do_find(const char *pchKey, void *pCmpCookie, KeyComparator pfComp,
+		    ALLOC_HANDLE_T &hRecord);
+
+	/*************************************************
+	  Description:	look up the data that belongs to a key
+	  Input:		pchKey		key to look up
+				pCmpCookie	passed to the user-defined pfComp whenever the key is
+						compared against items in the tree
+				pfComp		user-defined key comparison function
+	  Output:		phRecord		pointer to the item slot inside the tree node
+	  Return:		0 when nothing is found, 1 when the data is found
+	*************************************************/
+	int do_find(const char *pchKey, void *pCmpCookie, KeyComparator pfComp,
+		    ALLOC_HANDLE_T *&phRecord);
+
+	/*************************************************
+	  Description:	destroy the whole tree and release the memory it uses
+	  Input:		
+	  Output:		
+	  Return:		0 on success, non-zero on failure
+	*************************************************/
+	int destory();
+
+	/*************************************************
+	  Description:	query how much memory would be freed by destroying
+			the whole tree
+	  Input:		
+	  Output:		
+	  Return:	 >0 on success, 0 on failure
+	*************************************************/
+	unsigned ask_for_destroy_size(void);
+
+	/*************************************************
+	  Description:	traverse the whole tree in ascending order
+	  Input:		pfVisit	user-defined function that visits each data record
+				pCookie	cookie argument handed to the user-defined function
+	  Output:		
+	  Return:		0 on success, any other value is an error
+	*************************************************/
+	int traverse_forward(ItemVisit pfVisit, void *pCookie);
+
+	/*************************************************
+	  Description:	traverse the whole tree in descending order
+	  Input:		pfVisit	user-defined function that visits each data record
+				pCookie	cookie argument handed to the user-defined function
+	  Output:		
+	  Return:		0 on success, any other value is an error
+	*************************************************/
+	int traverse_backward(ItemVisit pfVisit, void *pCookie);
+
+	/*************************************************
+	  Description:	traverse the whole tree in post-order
+	  Input:		pfVisit	user-defined function that visits each data record
+				pCookie	cookie argument handed to the user-defined function
+	  Output:		
+	  Return:		0 on success, any other value is an error
+	*************************************************/
+	int post_order_traverse(ItemVisit pfVisit, void *pCookie);
+
+	/*************************************************
+	  Description:	starting at the given key, traverse the tree in
+			ascending order over the range [key, key+iInclusion]
+	  Input:		pchKey		start key
+				pCmpCookie	passed to the user-defined pfComp whenever the key is
+						compared against items in the tree
+				pfComp		user-defined key comparison function
+				iInclusion		width of the key range
+				pfVisit		user-defined function that visits each data record
+				pCookie		cookie argument handed to the user-defined function
+	  Output:		
+	  Return:		0 on success, any other value is an error
+	*************************************************/
+	int traverse_forward(const char *pchKey, void *pCmpCookie,
+			     KeyComparator pfComp, int64_t iInclusion,
+			     ItemVisit pfVisit, void *pCookie);
+
+	/*************************************************
+	  Description:	starting at the given key, traverse the tree in
+			ascending order over the range [key, key1]
+	  Input:		pchKey		start key
+				pchKey1		end key
+				pCmpCookie	passed to the user-defined pfComp whenever the key is
+						compared against items in the tree
+				pfComp		user-defined key comparison function
+				pfVisit		user-defined function that visits each data record
+				pCookie		cookie argument handed to the user-defined function
+	  Output:		
+	  Return:		0 on success, any other value is an error
+	*************************************************/
+	int traverse_forward(const char *pchKey, const char *pchKey1,
+			     void *pCmpCookie, KeyComparator pfComp,
+			     ItemVisit pfVisit, void *pCookie);
+
+	/*************************************************
+	  Description:	starting at the given key, traverse the tree in
+			ascending order (visits every record greater than or equal to key)
+	  Input:		pchKey		start key
+				pCmpCookie	passed to the user-defined pfComp whenever the key is
+						compared against items in the tree
+				pfComp		user-defined key comparison function
+				pfVisit		user-defined function that visits each data record
+				pCookie		cookie argument handed to the user-defined function
+	  Output:		
+	  Return:		0 on success, any other value is an error
+	*************************************************/
+	int traverse_forward(const char *pchKey, void *pCmpCookie,
+			     KeyComparator pfComp, ItemVisit pfVisit,
+			     void *pCookie);
+
+	/*************************************************
+	  Description:	starting at the given key, traverse the tree in
+			descending order (visits every record less than or equal to key)
+	  Input:		pchKey		start key
+				pCmpCookie	passed to the user-defined pfComp whenever the key is
+						compared against items in the tree
+				pfComp		user-defined key comparison function
+				pfVisit		user-defined function that visits each data record
+				pCookie		cookie argument handed to the user-defined function
+	  Output:		
+	  Return:		0 on success, any other value is an error
+	*************************************************/
+	int traverse_backward(const char *pchKey, void *pCmpCookie,
+			      KeyComparator pfComp, ItemVisit pfVisit,
+			      void *pCookie);
+
+	/*************************************************
+	  Description:	traverse the tree in descending order over the range
+			[key, key1]
+	  Input:		pchKey		lower end of the range
+				pchKey1		upper end of the range
+				pCmpCookie	passed to the user-defined pfComp whenever the key is
+						compared against items in the tree
+				pfComp		user-defined key comparison function
+				pfVisit		user-defined function that visits each data record
+				pCookie		cookie argument handed to the user-defined function
+	  Output:		
+	  Return:		0 on success, any other value is an error
+	*************************************************/
+	int traverse_backward(const char *pchKey, const char *pchKey1,
+			      void *pCmpCookie, KeyComparator pfComp,
+			      ItemVisit pfVisit, void *pCookie);
+
+	/*************************************************
+	  Description:	starting at the given key, visit the left and right
+			subtrees first and the node itself afterwards, over the
+			range [key, key1]
+	  Input:		pchKey		start key
+				pchKey1		end key
+				pCmpCookie	passed to the user-defined pfComp whenever the key is
+						compared against items in the tree
+				pfComp		user-defined key comparison function
+				pfVisit		user-defined function that visits each data record
+				pCookie		cookie argument handed to the user-defined function
+	  Output:		
+	  Return:		0 on success, any other value is an error
+	*************************************************/
+	int post_order_traverse(const char *pchKey, const char *pchKey1,
+				void *pCmpCookie, KeyComparator pfComp,
+				ItemVisit pfVisit, void *pCookie);
+
+	/*************************************************
+	  Description:	starting at the given key, traverse the tree in
+			post-order (visits every record greater than or equal to key)
+	  Input:		pchKey		start key
+				pCmpCookie	passed to the user-defined pfComp whenever the key is
+						compared against items in the tree
+				pfComp		user-defined key comparison function
+				pfVisit		user-defined function that visits each data record
+				pCookie		cookie argument handed to the user-defined function
+	  Output:		
+	  Return:		0 on success, any other value is an error
+	*************************************************/
+	int post_order_traverse_ge(const char *pchKey, void *pCmpCookie,
+				   KeyComparator pfComp, ItemVisit pfVisit,
+				   void *pCookie);
+
+	/*************************************************
+	  Description:	starting at the given key, traverse the tree in
+			post-order (visits every record less than or equal to key)
+	  Input:		pchKey		start key
+				pCmpCookie	passed to the user-defined pfComp whenever the key is
+						compared against items in the tree
+				pfComp		user-defined key comparison function
+				pfVisit		user-defined function that visits each data record
+				pCookie		cookie argument handed to the user-defined function
+	  Output:		
+	  Return:		0 on success, any other value is an error
+	*************************************************/
+	int post_order_traverse_le(const char *pchKey, void *pCmpCookie,
+				   KeyComparator pfComp, ItemVisit pfVisit,
+				   void *pCookie);
+};
+
+/************************************************************
+  Description:    wraps the operations on a single T-tree node; intended
+                  for internal use by the T-tree only
+  Version:         DTC 3.0
+***********************************************************/
+struct _TtreeNode {
+	enum { PAGE_SIZE = 20, // how many records each node stores
+	       MIN_ITEMS =
+		       PAGE_SIZE - 2 // minimal number of items in internal node
+	};
+
+	ALLOC_HANDLE_T m_hLeft; // left child handle, INVALID_HANDLE when absent
+	ALLOC_HANDLE_T m_hRight; // right child handle, INVALID_HANDLE when absent
+	int8_t m_chBalance; // presumably the balance factor used by the balance_*_branch routines - TODO confirm
+	uint16_t m_ushNItems; // number of slots of m_ahItems currently in use
+	ALLOC_HANDLE_T m_ahItems[PAGE_SIZE]; // record handles held by this node
+
+	int do_init();
+	static ALLOC_HANDLE_T Alloc(MallocBase &stMalloc,
+				    ALLOC_HANDLE_T hRecord);
+	static int do_insert(MallocBase &stMalloc, ALLOC_HANDLE_T &hNode,
+			     const char *pchKey, void *pCmpCookie,
+			     KeyComparator pfComp, ALLOC_HANDLE_T hRecord,
+			     bool &isAllocNode);
+	static int Delete(MallocBase &stMalloc, ALLOC_HANDLE_T &hNode,
+			  const char *pchKey, void *pCmpCookie,
+			  KeyComparator pfComp, bool &isFreeNode);
+	static int balance_left_branch(MallocBase &stMalloc,
+				       ALLOC_HANDLE_T &hNode);
+	static int balance_right_branch(MallocBase &stMalloc,
+					ALLOC_HANDLE_T &hNode);
+	static int destory(MallocBase &stMalloc, ALLOC_HANDLE_T hNode);
+	static unsigned ask_for_destroy_size(MallocBase &,
+					     ALLOC_HANDLE_T hNode);
+
+	// Look up the given key. Returns 1 when found, otherwise 0.
+	int do_find(MallocBase &stMalloc, const char *pchKey, void *pCmpCookie,
+		    KeyComparator pfComp, ALLOC_HANDLE_T &hRecord);
+	int do_find(MallocBase &stMalloc, const char *pchKey, void *pCmpCookie,
+		    KeyComparator pfComp, ALLOC_HANDLE_T *&phRecord);
+	int find_handle(MallocBase &stMalloc, ALLOC_HANDLE_T hRecord);
+	// Given that a node holds the keys k1..kn, find the node for which k1 <= key <= kn.
+	int find_node(MallocBase &stMalloc, const char *pchKey,
+		      void *pCmpCookie, KeyComparator pfComp,
+		      ALLOC_HANDLE_T &hNode);
+	int traverse_forward(MallocBase &stMalloc, ItemVisit pfVisit,
+			     void *pCookie);
+	int traverse_backward(MallocBase &stMalloc, ItemVisit pfVisit,
+			      void *pCookie);
+	int post_order_traverse(MallocBase &stMalloc, ItemVisit pfVisit,
+				void *pCookie);
+	/* NOTE(review): iInclusion is int in the overload below, while
+	 * Ttree::traverse_forward takes it as int64_t and KeyComparator
+	 * returns int64_t; consider widening this parameter.
+	 */
+	int traverse_forward(MallocBase &stMalloc, const char *pchKey,
+			     void *pCmpCookie, KeyComparator pfComp,
+			     int iInclusion, ItemVisit pfVisit, void *pCookie);
+	int traverse_forward(MallocBase &stMalloc, const char *pchKey,
+			     void *pCmpCookie, KeyComparator pfComp,
+			     ItemVisit pfVisit, void *pCookie);
+	int traverse_forward(MallocBase &stMalloc, const char *pchKey,
+			     const char *pchKey1, void *pCmpCookie,
+			     KeyComparator pfComp, ItemVisit pfVisit,
+			     void *pCookie);
+
+	int traverse_backward(MallocBase &stMalloc, const char *pchKey,
+			      void *pCmpCookie, KeyComparator pfComp,
+			      ItemVisit pfVisit, void *pCookie);
+	int traverse_backward(MallocBase &stMalloc, const char *pchKey,
+			      const char *pchKey1, void *pCmpCookie,
+			      KeyComparator pfComp, ItemVisit pfVisit,
+			      void *pCookie);
+
+	int post_order_traverse(MallocBase &stMalloc, const char *pchKey,
+				const char *pchKey1, void *pCmpCookie,
+				KeyComparator pfComp, ItemVisit pfVisit,
+				void *pCookie);
+	int post_order_traverse_ge(MallocBase &stMalloc, const char *pchKey,
+				   void *pCmpCookie, KeyComparator pfComp,
+				   ItemVisit pfVisit, void *pCookie);
+	int post_order_traverse_le(MallocBase &stMalloc, const char *pchKey,
+				   void *pCmpCookie, KeyComparator pfComp,
+				   ItemVisit pfVisit, void *pCookie);
+} __attribute__((packed));
+typedef struct _TtreeNode TtreeNode;
+
+#endif

+ 2028 - 0
src/core/tree/tree_data.cc

@@ -0,0 +1,2028 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <vector>
+
+#include "tree_data.h"
+#include "global.h"
+#include "task/task_pkey.h"
+#include "buffer_flush.h"
+#include "algorithm/relative_hour_calculator.h"
+
+/*
+ * Branch-prediction hints for conditions.
+ *
+ * __GNUC__ is the macro GCC-compatible compilers predefine with their major
+ * version. The previously tested __GCC_MAJOR is not a predefined macro, so it
+ * evaluated to 0 inside #if and the hints always collapsed to the plain
+ * fallback below.
+ */
+#ifndef likely
+#if defined(__GNUC__) && __GNUC__ >= 3
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#else
+#define likely(x) (x)
+#define unlikely(x) (x)
+#endif
+#endif
+
+/*
+ * Reads one value of type t at p_content_ + offset_, converts it to the type
+ * of x, stores it in x and advances offset_ by sizeof(t).
+ * Jumps to the label ERROR_RET (which the enclosing function must define)
+ * when the read would run past size_.
+ */
+#define GET_TREE_VALUE(x, t)                                                   \
+	do {                                                                   \
+		if (unlikely(offset_ + sizeof(t) > size_))                     \
+			goto ERROR_RET;                                        \
+		x = (typeof(x)) * (t *)(p_content_ + offset_);                 \
+		offset_ += sizeof(t);                                          \
+	} while (0)
+
+/*
+ * Reads one value of type t at p_content_ + offset into x without touching
+ * offset_. Jumps to ERROR_RET when offset + sizeof(t) exceeds size_.
+ * The offset argument is expanded unparenthesized, so pass a simple
+ * expression.
+ */
+#define GET_TREE_VALUE_AT_OFFSET(x, t, offset)                                 \
+	do {                                                                   \
+		if (unlikely(offset + sizeof(t) > size_))                      \
+			goto ERROR_RET;                                        \
+		x = (typeof(x)) * (t *)(p_content_ + offset);                  \
+	} while (0)
+
+/*
+ * Writes x as a value of type t at p_content_ + offset without touching
+ * offset_. Jumps to ERROR_RET when offset + sizeof(t) exceeds size_.
+ * The offset argument is expanded unparenthesized, so pass a simple
+ * expression.
+ */
+#define SET_TREE_VALUE_AT_OFFSET(x, t, offset)                                 \
+	do {                                                                   \
+		if (unlikely(offset + sizeof(t) > size_))                      \
+			goto ERROR_RET;                                        \
+		*(t *)(p_content_ + offset) = x;                               \
+	} while (0)
+
+/*
+ * Writes x as a value of type t at p_content_ + offset_ and advances offset_
+ * by sizeof(t). Jumps to ERROR_RET when the write would run past size_.
+ */
+#define SET_TREE_VALUE(x, t)                                                   \
+	do {                                                                   \
+		if (unlikely(offset_ + sizeof(t) > size_))                     \
+			goto ERROR_RET;                                        \
+		*(t *)(p_content_ + offset_) = x;                              \
+		offset_ += sizeof(t);                                          \
+	} while (0)
+
+/*
+ * Writes a length-prefixed byte string at p_content_ + offset_: first len as
+ * an int, then len bytes copied from p (the copy is skipped when len is 0).
+ * offset_ is advanced past both parts. Jumps to ERROR_RET when the whole
+ * record would not fit inside size_.
+ * len is expanded several times, so it must be free of side effects.
+ */
+#define SET_TREE_BIN_VALUE(p, len)                                             \
+	do {                                                                   \
+		if (unlikely(offset_ + sizeof(int) + len > size_))             \
+			goto ERROR_RET;                                        \
+		*(int *)(p_content_ + offset_) = len;                          \
+		offset_ += sizeof(int);                                        \
+		if (likely(len != 0))                                          \
+			memcpy(p_content_ + offset_, p, len);                  \
+		offset_ += len;                                                \
+	} while (0)
+
+/*
+ * Advances offset_ by s bytes without reading them. Jumps to ERROR_RET when
+ * offset_ + s would exceed size_.
+ */
+#define SKIP_TREE_SIZE(s)                                                      \
+	do {                                                                   \
+		if (unlikely(offset_ + s > size_))                             \
+			goto ERROR_RET;                                        \
+		offset_ += s;                                                  \
+	} while (0)
+
+/*
+ * Creates an unattached TreeData that allocates through pstMalloc.
+ *
+ * pstMalloc is dereferenced to construct t_tree_, so it must not be NULL.
+ * Every cursor/handle member is put into its "nothing attached" state; the
+ * object becomes usable after do_init() or do_attach().
+ *
+ * p_content_, p_table_ and m_uiLAOffset are cleared here as well: other
+ * methods compare p_content_ / p_table_ against NULL, which is only
+ * meaningful when a fresh object starts out with NULL in them.
+ */
+TreeData::TreeData(MallocBase *pstMalloc) : t_tree_(*pstMalloc)
+{
+	p_tree_root_ = NULL;
+	p_table_ = NULL;
+	p_content_ = NULL;
+	index_depth_ = 0;
+	need_new_bufer_size = 0;
+	key_size_ = 0;
+	handle_ = INVALID_HANDLE;
+	table_index_ = -1;
+	size_ = 0;
+	_root_size = 0;
+	mallocator_ = pstMalloc;
+	memset(err_message_, 0, sizeof(err_message_));
+
+	key_index_ = -1;
+	expire_id_ = -1;
+	m_iLAId = -1;
+	m_iLCmodId = -1;
+	m_uiLAOffset = 0;
+
+	offset_ = 0;
+	row_offset_ = 0;
+	affected_rows_ = 0;
+
+	index_part_of_uniq_field_ = false;
+	p_record_ = INVALID_HANDLE;
+}
+
+/*
+ * Forgets the attached root chunk. Nothing is released here: no Free() is
+ * issued, only the cached size and handle are reset.
+ */
+TreeData::~TreeData()
+{
+	_root_size = 0;
+	handle_ = INVALID_HANDLE;
+}
+
+/*
+ * Allocates and initialises an empty tree root for the key pchKey.
+ *
+ * uchKeyIdx  index of the key field, stored in key_index_
+ * iKeySize   fixed key size in bytes; 0 means the key is length-prefixed and
+ *            its size is 1 + the value of its first byte
+ * pchKey     key bytes copied into the root chunk
+ * laId       stored in m_iLAId
+ * expireId   stored in expire_id_
+ * nodeIdx    table index to use; -1 keeps the current table_index_
+ *
+ * Returns 0 on success, -100 when the table index is neither 0 nor 1, and
+ * EC_NO_MEM when the root chunk cannot be allocated (need_new_bufer_size is
+ * then set to the requested size).
+ *
+ * The table index is resolved and validated before anything is allocated:
+ * the root's data_type_ byte encodes that index, so it has to be final when
+ * the byte is written, and a rejected index must not leave an orphaned chunk
+ * behind.
+ */
+int TreeData::do_init(uint8_t uchKeyIdx, int iKeySize, const char *pchKey,
+		      int laId, int expireId, int nodeIdx)
+{
+	handle_ = INVALID_HANDLE;
+	_root_size = 0;
+
+	m_uiLAOffset = 0;
+
+	key_size_ = iKeySize;
+	key_index_ = uchKeyIdx;
+	m_iLAId = laId;
+	expire_id_ = expireId;
+	if (nodeIdx != -1) {
+		table_index_ = nodeIdx;
+	}
+	if (table_index_ != 0 && table_index_ != 1) {
+		snprintf(err_message_, sizeof(err_message_), "node idx error");
+		return -100;
+	}
+
+	/* fixed-size key, or length byte + payload when iKeySize is 0 */
+	int ks = iKeySize != 0 ? iKeySize : 1 + *(unsigned char *)pchKey;
+	int uiDataSize = 2 + sizeof(uint32_t) * 4 + sizeof(uint16_t) * 3 +
+			 sizeof(MEM_HANDLE_T) + ks;
+
+	handle_ = mallocator_->Malloc(uiDataSize);
+	if (handle_ == INVALID_HANDLE) {
+		snprintf(err_message_, sizeof(err_message_), "malloc error");
+		need_new_bufer_size = uiDataSize;
+		return (EC_NO_MEM);
+	}
+	_root_size = mallocator_->chunk_size(handle_);
+
+	p_tree_root_ = Pointer<RootData>();
+	/* bit 7 carries the table index, the low bits the chunk type */
+	p_tree_root_->data_type_ =
+		((table_index_ << 7) & 0x80) + DATA_TYPE_TREE_ROOT;
+	p_tree_root_->tree_size_ = 0;
+	p_tree_root_->total_raw_size_ = 0;
+	p_tree_root_->node_count_ = 0;
+	p_tree_root_->row_count_ = 0;
+	p_tree_root_->root_handle_ = INVALID_HANDLE;
+
+	p_tree_root_->get_request_count_ = 1;
+
+	/* ks equals iKeySize whenever iKeySize is non-zero */
+	memcpy(p_tree_root_->p_key_, pchKey, ks);
+
+	t_tree_.do_attach(INVALID_HANDLE);
+
+	return (0);
+}
+
+/*
+ * Initialises a new tree root for pchKey using the table definition selected
+ * by DTCColExpand: the current table index, or the next one (modulo 2) while
+ * an expansion is in progress.
+ *
+ * Returns -1 when the index is not 0/1 or no table definition exists for it;
+ * otherwise the result of the six-argument do_init().
+ */
+int TreeData::do_init(const char *pchKey)
+{
+	table_index_ =
+		DTCColExpand::instance()->is_expanding() ?
+			(DTCColExpand::instance()->cur_table_idx() + 1) % 2 :
+			DTCColExpand::instance()->cur_table_idx() % 2;
+
+	const bool indexOk = (table_index_ == 0 || table_index_ == 1);
+	if (!indexOk) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "attach error, nodeIdx[%d] error", table_index_);
+		return -1;
+	}
+
+	p_table_ = TableDefinitionManager::instance()->get_table_def_by_idx(
+		table_index_);
+	if (NULL == p_table_) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "attach error, tabledef[NULL]");
+		return -1;
+	}
+
+	return do_init(p_table_->key_fields() - 1, p_table_->key_format(),
+		       pchKey, p_table_->lastacc_field_id(),
+		       p_table_->expire_time_field_id());
+}
+
+/*
+ * Attaches this object to an existing tree root chunk.
+ *
+ * Returns -1 when the allocator reports a zero chunk size for hHandle, -2
+ * when the chunk's type bits (low 7 bits of data_type_) are not
+ * DATA_TYPE_TREE_ROOT, and 0 after the key/field ids are recorded and the
+ * T-tree is attached to the stored root handle.
+ */
+int TreeData::do_attach(MEM_HANDLE_T hHandle, uint8_t uchKeyIdx, int iKeySize,
+			int laid, int lcmodid, int expireid)
+{
+	_root_size = mallocator_->chunk_size(hHandle);
+	if (unlikely(0 == _root_size)) {
+		snprintf(err_message_, sizeof(err_message_), "attach error: %s",
+			 mallocator_->get_err_msg());
+		return (-1);
+	}
+
+	handle_ = hHandle;
+	p_tree_root_ = Pointer<RootData>();
+
+	const unsigned char rootType = p_tree_root_->data_type_;
+	if (unlikely(DATA_TYPE_TREE_ROOT != (rootType & 0x7f))) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "invalid data type: %u", rootType);
+		return (-2);
+	}
+
+	key_index_ = uchKeyIdx;
+	key_size_ = iKeySize;
+	m_iLAId = laid;
+	m_iLCmodId = lcmodid;
+	expire_id_ = expireid;
+	m_uiLAOffset = 0;
+
+	t_tree_.do_attach(p_tree_root_->root_handle_);
+
+	return (0);
+}
+
+/*
+ * Attaches to the root chunk hHandle, taking the table index from bit 7 of
+ * the chunk's first byte and the key/field ids from the matching table
+ * definition.
+ *
+ * Returns -1 when the index check fails or no table definition exists for
+ * the index; otherwise the result of the six-argument do_attach().
+ */
+int TreeData::do_attach(MEM_HANDLE_T hHandle)
+{
+	handle_ = hHandle;
+
+	const char *firstByte = Pointer<char>();
+	table_index_ = (*firstByte >> 7) & 0x01;
+
+	const bool indexOk = (table_index_ == 0 || table_index_ == 1);
+	if (!indexOk) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "attach error, nodeIdx[%d] error", table_index_);
+		return -1;
+	}
+
+	p_table_ = TableDefinitionManager::instance()->get_table_def_by_idx(
+		table_index_);
+	if (NULL == p_table_) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "attach error, tabledef[NULL]");
+		return -1;
+	}
+
+	return do_attach(hHandle, p_table_->key_fields() - 1,
+			 p_table_->key_format(), p_table_->lastacc_field_id(),
+			 p_table_->lastcmod_field_id(),
+			 p_table_->expire_time_field_id());
+}
+
+/*
+ * Serialises one row at p_content_ + offset_ and advances offset_ past it.
+ *
+ * Layout written: the one-byte flag uchOp, then every non-discarded field
+ * from index 1 to num_fields() in order. Signed/unsigned fields are stored in
+ * 8 bytes when field_size() exceeds 4, otherwise 4 bytes; float fields as
+ * double when field_size() exceeds sizeof(float), otherwise float; all other
+ * types as an int length followed by the raw bytes.
+ *
+ * The caller must have set p_content_, offset_ and size_ beforehand.
+ * Returns 0 on success, or -100 (err_message_ set) when a write would run
+ * past size_.
+ */
+int TreeData::encode_tree_row(const RowValue &stRow, unsigned char uchOp)
+{
+	SET_TREE_VALUE(uchOp, unsigned char);
+	for (int j = 1; j <= stRow.num_fields(); j++) // copy one row of data
+	{
+		if (stRow.table_definition()->is_discard(j))
+			continue;
+		const DTCValue *const v = stRow.field_value(j);
+		switch (stRow.field_type(j)) {
+		case DField::Signed:
+			if (unlikely(stRow.field_size(j) >
+				     (int)sizeof(int32_t)))
+				SET_TREE_VALUE(v->s64, int64_t);
+			else
+				SET_TREE_VALUE(v->s64, int32_t);
+			break;
+
+		case DField::Unsigned:
+			if (unlikely(stRow.field_size(j) >
+				     (int)sizeof(uint32_t)))
+				SET_TREE_VALUE(v->u64, uint64_t);
+			else
+				SET_TREE_VALUE(v->u64, uint32_t);
+			break;
+
+		case DField::Float:
+			if (likely(stRow.field_size(j) > (int)sizeof(float)))
+				SET_TREE_VALUE(v->flt, double);
+			else
+				SET_TREE_VALUE(v->flt, float);
+			break;
+
+		case DField::String:
+		case DField::Binary:
+		default: {
+			SET_TREE_BIN_VALUE(v->bin.ptr, v->bin.len);
+			break;
+		}
+		} //end of switch
+	}
+
+	return 0;
+
+	/* target of the SET_TREE_* macros when the chunk is too small */
+ERROR_RET:
+	snprintf(err_message_, sizeof(err_message_), "encode row error");
+	return (-100);
+}
+
+/*
+ * Makes sure the chunk referenced by *pRecord can hold expand_size more
+ * bytes beyond the data size stored in its header (the uint32_t after the
+ * type byte). p_content_ and size_ must already describe that chunk.
+ *
+ * When the chunk is too small it is reallocated; *pRecord, size_, p_content_
+ * and the root's tree_size_ are updated to the new chunk.
+ *
+ * Returns 0 on success, -1 when pRecord is NULL, EC_NO_MEM when ReAlloc
+ * fails (need_new_bufer_size holds the size that was requested).
+ */
+int TreeData::expand_tree_chunk(MEM_HANDLE_T *pRecord, ALLOC_SIZE_T expand_size)
+{
+	if (NULL == pRecord) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "tree data not init yet");
+		return (-1);
+	}
+
+	const uint32_t usedBytes =
+		*(uint32_t *)(p_content_ + sizeof(unsigned char));
+
+	/* enough room already: nothing to do */
+	if (usedBytes + expand_size <= size_)
+		return (0);
+
+	ALLOC_HANDLE_T hGrown =
+		mallocator_->ReAlloc((*pRecord), usedBytes + expand_size);
+	if (INVALID_HANDLE == hGrown) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "realloc error[%s]", mallocator_->get_err_msg());
+		need_new_bufer_size = usedBytes + expand_size;
+		return (EC_NO_MEM);
+	}
+
+	/* swap the old chunk size for the new one in the tree accounting */
+	p_tree_root_->tree_size_ -= size_;
+	*pRecord = hGrown;
+	size_ = mallocator_->chunk_size(hGrown);
+	p_content_ = Pointer<char>(*pRecord);
+	p_tree_root_->tree_size_ += size_;
+
+	return (0);
+}
+
+/*
+ * Inserts the handle hRoot into the T-tree under the index value taken from
+ * stCondition[TTREE_INDEX_POS].
+ *
+ * Returns -100 when uchCondIdxCnt is not TTREE_INDEX_POS, otherwise the
+ * result of Ttree::do_insert(). When the insert succeeds and reports that a
+ * tree node was allocated, sizeof(TtreeNode) is added to tree_size_.
+ */
+int TreeData::insert_sub_tree(uint8_t uchCondIdxCnt,
+			      const RowValue &stCondition, KeyComparator pfComp,
+			      ALLOC_HANDLE_T hRoot)
+{
+	if (TTREE_INDEX_POS != uchCondIdxCnt) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "index field error");
+		return (-100);
+	}
+
+	bool nodeAllocated = false;
+	DTCValue idxValue = stCondition[TTREE_INDEX_POS];
+	CmpCookie cmpCookie(p_table_, uchCondIdxCnt);
+
+	const int rc = t_tree_.do_insert(reinterpret_cast<char *>(&idxValue),
+					 &cmpCookie, pfComp, hRoot,
+					 nodeAllocated);
+	if (nodeAllocated && rc == 0)
+		p_tree_root_->tree_size_ += sizeof(TtreeNode);
+
+	return rc;
+}
+
+/*
+ * Looks up the index value stCondition[TTREE_INDEX_POS] in the T-tree.
+ *
+ * Returns -100 when uchCondIdxCnt is not TTREE_INDEX_POS; otherwise forwards
+ * the result of Ttree::do_find(), which also fills hRecord.
+ */
+int TreeData::do_find(uint8_t uchCondIdxCnt, const RowValue &stCondition,
+		      KeyComparator pfComp, ALLOC_HANDLE_T *&hRecord)
+{
+	if (TTREE_INDEX_POS != uchCondIdxCnt) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "index field error");
+		return (-100);
+	}
+
+	DTCValue idxValue = stCondition[TTREE_INDEX_POS];
+	CmpCookie cmpCookie(p_table_, uchCondIdxCnt);
+
+	return t_tree_.do_find(reinterpret_cast<char *>(&idxValue), &cmpCookie,
+			       pfComp, hRecord);
+}
+
+/*
+ * Appends stRow, tagged with uchFlag, to the tree.
+ *
+ * If no chunk exists yet for the row's index value, a new chunk is allocated
+ * (type byte, uint32_t data size, uint32_t row count, then the row) and
+ * registered in the T-tree. Otherwise the existing chunk is grown if needed
+ * and the row is appended after its current data.
+ *
+ * On success the chunk header (data size, row count) and the root counters
+ * (root_handle_, row_count_, total_raw_size_) are updated.
+ *
+ * Returns 0 on success, -100 for an index-field or encoding error,
+ * EC_NO_MEM when allocation fails (need_new_bufer_size is set), or the
+ * non-zero result of the tree insert / chunk expansion.
+ *
+ * The scratch cursor (offset_, size_, p_content_) is cleared before
+ * returning on every path that goes through ERROR_INSERT_RET.
+ */
+int TreeData::insert_row_flag(const RowValue &stRow, KeyComparator pfComp,
+			      unsigned char uchFlag)
+{
+	int iRet;
+	uint32_t rowCnt = 0;
+	MEM_HANDLE_T *pRecord = NULL;
+	MEM_HANDLE_T hRecord = INVALID_HANDLE;
+	int trowSize = calc_tree_row_size(stRow, 0);
+	int tSize = 0;
+	offset_ = 0;
+
+	iRet = do_find(TTREE_INDEX_POS, stRow, pfComp, pRecord);
+	if (iRet == -100)
+		return iRet;
+	if (pRecord == NULL) {
+		/* no chunk for this index value yet: build a fresh one */
+		tSize = trowSize + sizeof(unsigned char) + sizeof(uint32_t) * 2;
+		hRecord = mallocator_->Malloc(tSize);
+		if (hRecord == INVALID_HANDLE) {
+			need_new_bufer_size = tSize;
+			snprintf(err_message_, sizeof(err_message_),
+				 "malloc error");
+			return (EC_NO_MEM);
+		}
+		size_ = mallocator_->chunk_size(hRecord);
+		p_content_ = Pointer<char>(hRecord);
+		*p_content_ = DATA_TYPE_TREE_NODE; //RawFormat->DataType
+		offset_ += sizeof(unsigned char);
+		*(uint32_t *)(p_content_ + offset_) = 0; //RawFormat->data_size
+		offset_ += sizeof(uint32_t);
+		*(uint32_t *)(p_content_ + offset_) = 0; //RawFormat->RowCount
+		offset_ += sizeof(uint32_t);
+
+		iRet = encode_tree_row(stRow, uchFlag);
+		if (iRet != 0) {
+			/*
+			 * The chunk is not referenced by the tree yet, so it
+			 * has to be released here or it is lost (same as the
+			 * insert_sub_tree failure path below).
+			 */
+			mallocator_->Free(hRecord);
+			goto ERROR_INSERT_RET;
+		}
+
+		iRet = insert_sub_tree(TTREE_INDEX_POS, stRow, pfComp, hRecord);
+		if (iRet != 0) {
+			snprintf(err_message_, sizeof(err_message_),
+				 "insert error");
+			need_new_bufer_size = sizeof(TtreeNode);
+			mallocator_->Free(hRecord);
+			goto ERROR_INSERT_RET;
+		}
+		p_tree_root_->tree_size_ += size_;
+		p_tree_root_->node_count_++;
+	} else {
+		/* chunk exists: grow it if necessary and append */
+		p_content_ = Pointer<char>(*pRecord);
+		size_ = mallocator_->chunk_size(*pRecord);
+		iRet = expand_tree_chunk(pRecord, trowSize);
+		if (iRet != 0) {
+			snprintf(err_message_, sizeof(err_message_),
+				 "expand tree chunk error");
+			return iRet;
+		}
+
+		offset_ = *(uint32_t *)(p_content_ +
+					sizeof(unsigned char)); //datasize
+
+		iRet = encode_tree_row(stRow, uchFlag);
+		if (iRet != 0) {
+			goto ERROR_INSERT_RET;
+		}
+	}
+
+	/* after every insert, refresh the chunk header and root counters */
+	rowCnt = *(uint32_t *)(p_content_ + sizeof(unsigned char) +
+			       sizeof(uint32_t));
+	*(uint32_t *)(p_content_ + sizeof(unsigned char)) = offset_;
+	*(uint32_t *)(p_content_ + sizeof(unsigned char) + sizeof(uint32_t)) =
+		rowCnt + 1;
+	p_tree_root_->root_handle_ = t_tree_.Root();
+	p_tree_root_->row_count_ += 1;
+	p_tree_root_->total_raw_size_ += trowSize;
+
+	/* reached on success as well: iRet is 0 in that case */
+ERROR_INSERT_RET:
+	offset_ = 0;
+	size_ = 0;
+	hRecord = INVALID_HANDLE;
+	p_content_ = NULL;
+
+	return (iRet);
+}
+
+/*
+ * Inserts stRow, flagging it OPER_INSERT when isDirty is set and OPER_SELECT
+ * otherwise. Returns whatever insert_row_flag() returns.
+ */
+int TreeData::insert_row(const RowValue &stRow, KeyComparator pfComp,
+			 bool isDirty)
+{
+	if (isDirty)
+		return insert_row_flag(stRow, pfComp, OPER_INSERT);
+
+	return insert_row_flag(stRow, pfComp, OPER_SELECT);
+}
+
+/*
+ * Reports tree_size_ plus the root chunk size.
+ * When nothing is attached (_root_size == 0) err_message_ is set and -1 is
+ * returned, which the unsigned return type turns into the largest unsigned
+ * value.
+ */
+unsigned TreeData::ask_for_destroy_size(void)
+{
+	if (likely(_root_size != 0))
+		return p_tree_root_->tree_size_ + _root_size;
+
+	snprintf(err_message_, sizeof(err_message_), "attach error: %s",
+		 mallocator_->get_err_msg());
+	return (-1);
+}
+
+/*
+ * Destroys the T-tree and frees the root chunk, leaving the object
+ * unattached. Returns -1 (err_message_ set) when nothing is attached,
+ * 0 otherwise.
+ */
+int TreeData::destory()
+{
+	const bool attached = (_root_size != 0);
+	if (unlikely(!attached)) {
+		snprintf(err_message_, sizeof(err_message_), "attach error: %s",
+			 mallocator_->get_err_msg());
+		return (-1);
+	}
+
+	t_tree_.destory();
+	mallocator_->Free(handle_);
+
+	_root_size = 0;
+	handle_ = INVALID_HANDLE;
+
+	return (0);
+}
+
+/*
+ * Copies every row stored in the tree into new_data.
+ *
+ * The tree is traversed forward to collect the chunk handles; each chunk's
+ * payload (everything after the type byte, data size and row count header)
+ * is appended to new_data together with its row count, and new_data is
+ * re-attached to its own handle at the end.
+ *
+ * Returns 1 when the tree has no nodes, -1 when the traversal fails, any
+ * non-zero result of append_n_records()/do_attach(), and 0 on success.
+ */
+int TreeData::copy_raw_all(RawData *new_data)
+{
+	int iRet;
+	uint32_t totalNodeCnt = p_tree_root_->node_count_;
+	if (totalNodeCnt == 0) {
+		return 1;
+	}
+
+	/*
+	 * Heap-backed handle buffer: a stack array sized by node_count_ is a
+	 * non-standard VLA and can exhaust the stack for large trees.
+	 */
+	std::vector<MEM_HANDLE_T> handles(totalNodeCnt);
+	pResCookie resCookie;
+	resCookie.p_handle = &handles[0];
+	/* the loop below is bounded by this counter, so start it from zero */
+	resCookie.has_got_node_count = 0;
+	resCookie.need_find_node_count = 0;
+	iRet = t_tree_.traverse_forward(Visit, &resCookie);
+	if (iRet != 0) {
+		snprintf(err_message_, sizeof(err_message_),
+			 " traverse tree-data rows error:%d", iRet);
+		return (-1);
+	}
+
+	/* chunk header: type byte + data size + row count */
+	ALLOC_SIZE_T headlen = sizeof(unsigned char) + sizeof(uint32_t) * 2;
+	for (uint32_t i = 0; i < resCookie.has_got_node_count; i++) {
+		char *pch = Pointer<char>(handles[i]);
+		ALLOC_SIZE_T dtsize =
+			*(uint32_t *)(pch + sizeof(unsigned char));
+
+		uint32_t rowcnt = *(uint32_t *)(pch + sizeof(unsigned char) +
+						sizeof(uint32_t));
+		iRet = new_data->append_n_records(rowcnt, pch + headlen,
+						  dtsize - headlen);
+		if (iRet != 0)
+			return iRet;
+	}
+	if ((iRet = new_data->do_attach(new_data->get_handle())) != 0)
+		return (iRet);
+
+	return 0;
+}
+
+/*
+ * Rebuilds the tree from the rows held in new_data: each row is decoded and
+ * inserted as a clean (non-dirty) row.
+ *
+ * Returns -1 when the table has no fields or a row cannot be decoded, and
+ * EC_NO_MEM when an insert runs out of memory; in that case the partially
+ * built tree is destroyed so it can be rebuilt from scratch later and
+ * need_new_bufer_size is set to the raw payload size. Any other insert_row()
+ * result is not inspected. Returns 0 otherwise.
+ */
+int TreeData::copy_tree_all(RawData *new_data)
+{
+	if (p_table_->num_fields() < 1) {
+		log4cplus_error("field nums is too short");
+		return -1;
+	}
+
+	const unsigned int rowTotal = new_data->total_rows();
+	if (0 == rowTotal)
+		return (0);
+
+	new_data->rewind();
+	RowValue rowBuf(p_table_);
+
+	unsigned int done = 0;
+	while (done < rowTotal) {
+		unsigned char rowFlags;
+		rowBuf.default_value();
+		if (new_data->decode_row(rowBuf, rowFlags, 0) != 0) {
+			log4cplus_error("raw-data decode row error: %s",
+					new_data->get_err_msg());
+			return (-1);
+		}
+
+		if (insert_row(rowBuf, KeyCompare, false) == EC_NO_MEM) {
+			/* drop the half-built tree so the next attempt starts clean */
+			need_new_bufer_size =
+				new_data->data_size() - new_data->data_start();
+			destroy_sub_tree();
+			return (EC_NO_MEM);
+		}
+
+		++done;
+	}
+
+	return (0);
+}
+
+/*
+ * Decodes the row located at p_content_ + offset_ into stRow and advances
+ * offset_ past it; this is the inverse of encode_tree_row().
+ *
+ * row_offset_ is set to the row's start offset, uchRowFlags receives the
+ * row's flag byte, and m_uiLAOffset records the offset of field m_iLAId when
+ * that field is encountered. String/binary fields are not copied: bin.ptr
+ * points straight into the chunk, so the decoded row is only valid while the
+ * chunk stays in place.
+ *
+ * iDecodeFlag is not referenced in this body.
+ * Returns 0 on success, or -100 (err_message_ set) when a read would run
+ * past size_.
+ */
+int TreeData::decode_tree_row(RowValue &stRow, unsigned char &uchRowFlags,
+			      int iDecodeFlag)
+{
+	row_offset_ = offset_;
+
+	GET_TREE_VALUE(uchRowFlags, unsigned char);
+	for (int j = 1; j <= stRow.num_fields(); j++) {
+		if (stRow.table_definition()->is_discard(j))
+			continue;
+		if (j == m_iLAId)
+			m_uiLAOffset = offset_;
+		switch (stRow.field_type(j)) {
+		case DField::Signed:
+			if (unlikely(stRow.field_size(j) >
+				     (int)sizeof(int32_t))) {
+				GET_TREE_VALUE(stRow.field_value(j)->s64,
+					       int64_t);
+			} else {
+				GET_TREE_VALUE(stRow.field_value(j)->s64,
+					       int32_t);
+			}
+			break;
+
+		case DField::Unsigned:
+			if (unlikely(stRow.field_size(j) >
+				     (int)sizeof(uint32_t))) {
+				GET_TREE_VALUE(stRow.field_value(j)->u64,
+					       uint64_t);
+			} else {
+				GET_TREE_VALUE(stRow.field_value(j)->u64,
+					       uint32_t);
+			}
+			break;
+
+		case DField::Float:
+			if (likely(stRow.field_size(j) > (int)sizeof(float))) {
+				GET_TREE_VALUE(stRow.field_value(j)->flt,
+					       double);
+			} else {
+				GET_TREE_VALUE(stRow.field_value(j)->flt,
+					       float);
+			}
+			break;
+
+		case DField::String:
+		case DField::Binary:
+		default: {
+			/* int length prefix, then the bytes referenced in place */
+			GET_TREE_VALUE(stRow.field_value(j)->bin.len, int);
+			stRow.field_value(j)->bin.ptr = p_content_ + offset_;
+			SKIP_TREE_SIZE((uint32_t)stRow.field_value(j)->bin.len);
+			break;
+		}
+		} //end of switch
+	}
+	return (0);
+
+	/* target of the GET_/SKIP_ macros when the chunk is too small */
+ERROR_RET:
+	snprintf(err_message_, sizeof(err_message_), "get value error");
+	return (-100);
+}
+
+/*
+ * Searches the tree for a stored row whose unique fields equal those of
+ * *stpNodeRow.
+ *
+ * When the index field (TTREE_INDEX_POS) is one of the table's unique
+ * fields, only the chunk for the row's index value is scanned; otherwise
+ * every chunk in the tree is scanned.
+ *
+ * On a match the function returns 0 with p_record_ set to the chunk handle
+ * and p_content_/size_/offset_/row_offset_ left positioned on that chunk
+ * (row_offset_ is the start of the matching row, offset_ just past it).
+ *
+ * Returns 1 when the tree is empty or nothing matches, -100 when the tree
+ * lookup fails, -1 when the traversal fails and -2 when a row cannot be
+ * decoded.
+ */
+int TreeData::compare_tree_data(RowValue *stpNodeRow)
+{
+	uint32_t nodeCnt = p_tree_root_->node_count_;
+	if (nodeCnt == 0) {
+		return 1;
+	}
+
+	/* the answer is cached in the member once it has been found true */
+	const uint8_t *ufli = p_table_->uniq_fields_list();
+	for (int i = 0;
+	     !index_part_of_uniq_field_ && i < p_table_->uniq_fields(); i++) {
+		if (ufli[i] == TTREE_INDEX_POS) {
+			index_part_of_uniq_field_ = true;
+			break;
+		}
+	}
+
+	if (index_part_of_uniq_field_) {
+		MEM_HANDLE_T *pRecord = NULL;
+		RowValue stOldRow(p_table_);
+		char *indexKey = reinterpret_cast<char *>(
+			stpNodeRow->field_value(TTREE_INDEX_POS));
+		CmpCookie cookie(p_table_, TTREE_INDEX_POS);
+		int iRet =
+			t_tree_.do_find(indexKey, &cookie, KeyCompare, pRecord);
+		if (iRet == -100)
+			return iRet;
+		if (pRecord != NULL) {
+			p_content_ = Pointer<char>(*pRecord);
+			uint32_t rows = *(uint32_t *)(p_content_ +
+						      sizeof(unsigned char) +
+						      sizeof(uint32_t));
+			offset_ = sizeof(unsigned char) + sizeof(uint32_t) * 2;
+			size_ = mallocator_->chunk_size(*pRecord);
+
+			for (uint32_t j = 0; j < rows; j++) {
+				stOldRow.default_value();
+				unsigned char uchRowFlags;
+				if (decode_tree_row(stOldRow, uchRowFlags, 0) !=
+				    0) {
+					return (-2);
+				}
+				if (stpNodeRow->Compare(
+					    stOldRow,
+					    p_table_->uniq_fields_list(),
+					    p_table_->uniq_fields()) == 0) {
+					p_record_ = *pRecord;
+					return 0;
+				}
+			}
+		}
+	} else {
+		/*
+		 * Heap-backed handle buffer: a stack array sized by
+		 * node_count_ is a non-standard VLA and can exhaust the stack
+		 * for large trees.
+		 */
+		std::vector<MEM_HANDLE_T> handles(nodeCnt);
+		pResCookie resCookie;
+		resCookie.p_handle = &handles[0];
+		/* the scan below is bounded by this counter; start from zero */
+		resCookie.has_got_node_count = 0;
+		resCookie.need_find_node_count = 0;
+		if (t_tree_.traverse_forward(Visit, &resCookie) != 0) {
+			snprintf(err_message_, sizeof(err_message_),
+				 " traverse tree-data rows error");
+			return (-1);
+		}
+
+		RowValue stOldRow(p_table_);
+		for (uint32_t i = 0; i < resCookie.has_got_node_count;
+		     i++) { // scan the rows of each chunk one by one
+			p_content_ = Pointer<char>(handles[i]);
+			uint32_t rows = *(uint32_t *)(p_content_ +
+						      sizeof(unsigned char) +
+						      sizeof(uint32_t));
+			offset_ = sizeof(unsigned char) + sizeof(uint32_t) * 2;
+			size_ = mallocator_->chunk_size(handles[i]);
+
+			for (uint32_t j = 0; j < rows; j++) {
+				stOldRow.default_value();
+				unsigned char uchRowFlags;
+				if (decode_tree_row(stOldRow, uchRowFlags, 0) !=
+				    0) {
+					return (-2);
+				}
+				if (stpNodeRow->Compare(
+					    stOldRow,
+					    p_table_->uniq_fields_list(),
+					    p_table_->uniq_fields()) == 0) {
+					p_record_ = handles[i];
+					return 0;
+				}
+			}
+		}
+	}
+
+	return 1;
+}
+
+/*
+ * Applies a replace request to the stored row that has the same unique
+ * fields as the row built from job_op.
+ *
+ * The new row is produced with job_op.update_row() on a default-valued row
+ * and looked up with compare_tree_data(). When a match is found:
+ *   - index value changed: the new row is inserted under the new index
+ *     value, the old row is deleted, RowFlag receives the old row's flags,
+ *     and the old chunk is removed from the tree and freed once it holds no
+ *     more rows;
+ *   - index value unchanged: the row is overwritten via replace_cur_row().
+ * affected_rows_ is then set to 2. When nothing matches, nothing is changed.
+ *
+ * Returns 0 normally; a negative compare_tree_data() result; EC_NO_MEM;
+ * -2 when the old row cannot be decoded; -4 when the tree node cannot be
+ * deleted; -6 when replace_cur_row() fails (the needed size is pushed to
+ * job_op's black list); or do_find()'s result when it is -100 or 0.
+ *
+ * NOTE(review): p_node, affected_data, async and setrows are not referenced
+ * in this body; the dirty flag passed down is the member m_async - confirm
+ * that callers set it beforehand.
+ */
+int TreeData::replace_tree_data(DTCJobOperation &job_op, Node *p_node,
+				RawData *affected_data, bool async,
+				unsigned char &RowFlag, bool setrows)
+{
+	int iRet;
+	unsigned int uiTotalRows = 0;
+	uint32_t iDelete = 0;
+	DTCTableDefinition *stpNodeTab, *stpTaskTab;
+	RowValue *stpNodeRow, *stpTaskRow;
+
+	stpNodeTab = p_table_;
+	stpTaskTab = job_op.table_definition();
+	RowValue stNewRow(stpTaskTab);
+	RowValue stNewNodeRow(stpNodeTab);
+	affected_rows_ = 0;
+
+	/* when both sides use the same table definition one row object is shared */
+	stpTaskRow = &stNewRow;
+	stpNodeRow = &stNewNodeRow;
+	if (stpNodeTab == stpTaskTab)
+		stpNodeRow = stpTaskRow;
+
+	stNewRow.default_value();
+	job_op.update_row(*stpTaskRow);
+
+	if (stpNodeTab != stpTaskTab)
+		stpNodeRow->Copy(stpTaskRow);
+	else
+		stpNodeRow = stpTaskRow;
+
+	iRet = compare_tree_data(stpNodeRow);
+	if (iRet < 0) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "compare tree data error:%d", iRet);
+		return iRet;
+	} else if (iRet == 0) {
+		DTCValue new_value = (*stpTaskRow)[TTREE_INDEX_POS];
+		char *NewIndex = reinterpret_cast<char *>(&new_value);
+		CmpCookie cookie(p_table_, TTREE_INDEX_POS);
+		if (KeyCompare(NewIndex, &cookie, *mallocator_, p_record_) !=
+		    0) // the index field changed
+		{
+			/* insert_row() clears the chunk cursor, so save and restore it */
+			char *tmp_pchContent = p_content_;
+			uint32_t tmp_size = size_;
+			ALLOC_SIZE_T tmp_uiOffset = offset_;
+			iRet = insert_row(*stpTaskRow, KeyCompare, m_async);
+			p_content_ = tmp_pchContent;
+			size_ = tmp_size;
+			offset_ = tmp_uiOffset;
+
+			if (iRet == EC_NO_MEM)
+				return iRet;
+			else if (iRet == 0) {
+				/* rewind to the start of the matched old row */
+				offset_ = row_offset_;
+				RowValue stOldRow(p_table_);
+				stOldRow.default_value();
+				unsigned char uchRowFlags;
+				if (decode_tree_row(stOldRow, uchRowFlags, 0) !=
+				    0) {
+					return (-2);
+				}
+				RowFlag = uchRowFlags;
+				uiTotalRows = get_row_count();
+				offset_ = row_offset_;
+				if (delete_cur_row(stOldRow) == 0)
+					iDelete++;
+
+				if (uiTotalRows > 0 && uiTotalRows == iDelete &&
+				    get_row_count() ==
+					    0) // every row in the chunk has been deleted
+				{
+					// delete the tree node
+					bool isFreeNode = false;
+					DTCValue value = (stOldRow)
+						[TTREE_INDEX_POS]; // index value of the row just deleted
+					char *indexKey =
+						reinterpret_cast<char *>(
+							&value);
+					CmpCookie cookie(p_table_,
+							 TTREE_INDEX_POS);
+					int iret = t_tree_.Delete(indexKey,
+								  &cookie,
+								  KeyCompare,
+								  isFreeNode);
+					if (iret != 0) {
+						snprintf(
+							err_message_,
+							sizeof(err_message_),
+							"delete stTree failed:%d",
+							iret);
+						return -4;
+					}
+					if (isFreeNode)
+						p_tree_root_->tree_size_ -=
+							sizeof(TtreeNode);
+					p_tree_root_->tree_size_ -= size_;
+					p_tree_root_->node_count_--;
+					p_tree_root_->root_handle_ =
+						t_tree_.Root();
+					// release the chunk handle
+					mallocator_->Free(p_record_);
+				}
+			}
+		} else // the index field is unchanged
+		{
+			MEM_HANDLE_T *pRawHandle = NULL;
+			/* note: this iRet shadows the function-level one */
+			int iRet = do_find(TTREE_INDEX_POS, *stpNodeRow,
+					   KeyCompare, pRawHandle);
+			if (iRet == -100 || iRet == 0)
+				return iRet;
+
+			iRet = replace_cur_row(*stpNodeRow, m_async,
+					       pRawHandle); // write into the cache
+			if (iRet == EC_NO_MEM) {
+				return iRet;
+			}
+			if (iRet != 0) {
+				/* record the size needed so the key can be black-listed */
+				job_op.push_black_list_size(need_size());
+				return (-6);
+			}
+		}
+		affected_rows_ = 2;
+	}
+	return 0;
+}
+
+/*
+ * Replace handling for the rows of a single chunk (hRecord).
+ *
+ * Every row of the chunk is decoded; rows for which job_op.compare_row()
+ * returns 0 are skipped. For the others:
+ *   - while affected_rows_ is still 0, the row is rewritten with
+ *     job_op.update_row(): moved to another chunk (insert + delete) when the
+ *     index value changed, otherwise overwritten via replace_cur_row();
+ *     affected_rows_ grows by 2;
+ *   - afterwards each further matching row is deleted and affected_rows_
+ *     grows by 1.
+ * When every row of the chunk ended up deleted, the chunk is removed from
+ * the tree and freed.
+ *
+ * Returns 0 normally; -1 when a row cannot be decoded; EC_NO_MEM; -4 when
+ * the tree node cannot be deleted; -6 when replace_cur_row() fails; or
+ * do_find()'s result when it is -100 or 0.
+ */
+int TreeData::replace_sub_raw_data(DTCJobOperation &job_op,
+				   MEM_HANDLE_T hRecord)
+{
+	DTCTableDefinition *stpNodeTab, *stpTaskTab;
+	RowValue *stpNodeRow, *stpTaskRow, *stpCurRow;
+
+	stpNodeTab = p_table_;
+	stpTaskTab = job_op.table_definition();
+	RowValue stNewRow(stpTaskTab);
+	RowValue stNewNodeRow(stpNodeTab);
+	RowValue stCurRow(stpNodeTab);
+
+	stpTaskRow = &stNewRow;
+	stpNodeRow = &stNewNodeRow;
+	stpCurRow = &stCurRow;
+	if (stpNodeTab == stpTaskTab)
+		stpNodeRow = stpTaskRow;
+
+	p_content_ = Pointer<char>(hRecord);
+	unsigned int uiTotalRows = get_row_count();
+	offset_ = sizeof(unsigned char) +
+		  sizeof(uint32_t) * 2; //offset DataType + data_size + RowCount
+	size_ = mallocator_->chunk_size(hRecord);
+
+	unsigned char uchRowFlags;
+	uint32_t iDelete = 0;
+	uint32_t iInsert = 0;
+	for (unsigned int i = 0; i < uiTotalRows; i++) {
+		if (decode_tree_row(*stpNodeRow, uchRowFlags, 0) != 0)
+			return (-1);
+
+		if (stpNodeTab != stpTaskTab)
+			stpTaskRow->Copy(stpNodeRow);
+
+		/* keep the stored row as it was before it gets modified */
+		stpCurRow->Copy(stpNodeRow);
+
+		// skip rows that do not match the query condition
+		if (job_op.compare_row(*stpTaskRow) == 0)
+			continue;
+
+		MEM_HANDLE_T *pRawHandle = NULL;
+		int iRet = do_find(TTREE_INDEX_POS, *stpCurRow, KeyCompare,
+				   pRawHandle);
+		if (iRet == -100 || iRet == 0)
+			return iRet;
+
+		job_op.update_row(*stpTaskRow); // apply the modification
+
+		if (stpNodeTab != stpTaskTab)
+			stpNodeRow->Copy(stpTaskRow);
+
+		if (affected_rows_ == 0) {
+			iRet = 0;
+			DTCValue new_value = (*stpTaskRow)[TTREE_INDEX_POS];
+			char *NewIndex = reinterpret_cast<char *>(&new_value);
+			CmpCookie cookie(p_table_, TTREE_INDEX_POS);
+
+			if (KeyCompare(NewIndex, &cookie, *mallocator_,
+				       hRecord) != 0) // the update changes the index field
+			{
+				/* insert_row() clears the chunk cursor, so save and restore it */
+				char *tmp_pchContent = p_content_;
+				uint32_t tmp_size = size_;
+				ALLOC_SIZE_T tmp_uiOffset = offset_;
+
+				iRet = insert_row(*stpTaskRow, KeyCompare,
+						  m_async);
+
+				p_content_ = tmp_pchContent;
+				size_ = tmp_size;
+				offset_ = tmp_uiOffset;
+				if (iRet == EC_NO_MEM) {
+					return iRet;
+				} else if (iRet == 0) {
+					iInsert++;
+					/* rewind to the start of the row just decoded */
+					offset_ = row_offset_;
+					if (delete_cur_row(*stpCurRow) == 0)
+						iDelete++;
+				}
+			} else {
+				iRet = replace_cur_row(*stpNodeRow, m_async,
+						       pRawHandle); // write into the cache
+				if (iRet == EC_NO_MEM) {
+					return iRet;
+				}
+				if (iRet != 0) {
+					/* record the size needed so the key can be black-listed */
+					job_op.push_black_list_size(
+						need_size());
+					return (-6);
+				}
+			}
+
+			affected_rows_ += 2;
+		} else {
+			if (delete_cur_row(*stpCurRow) == 0) {
+				iDelete++;
+				affected_rows_++;
+			}
+		}
+	}
+
+	if (uiTotalRows > 0 &&
+	    uiTotalRows - iDelete == 0) // every row in the chunk has been deleted
+	{
+		// delete the tree node
+		bool isFreeNode = false;
+		DTCValue value =
+			(*stpCurRow)[TTREE_INDEX_POS]; // last row visited by the loop
+		char *indexKey = reinterpret_cast<char *>(&value);
+		CmpCookie cookie(p_table_, TTREE_INDEX_POS);
+		int iret = t_tree_.Delete(indexKey, &cookie, KeyCompare,
+					  isFreeNode);
+		if (iret != 0) {
+			snprintf(err_message_, sizeof(err_message_),
+				 "delete stTree failed:%d", iret);
+			return -4;
+		}
+		if (isFreeNode)
+			p_tree_root_->tree_size_ -= sizeof(TtreeNode);
+		p_tree_root_->tree_size_ -= size_;
+		p_tree_root_->node_count_--;
+		p_tree_root_->root_handle_ = t_tree_.Root();
+		// release the chunk handle
+		mallocator_->Free(hRecord);
+	}
+
+	return 0;
+}
+
+/*
+ * Round-trips value through the private buffer raw: raw is initialised for
+ * this tree's key, value is inserted into it and then decoded back out of
+ * it. This keeps value from referencing stale pointers if a later
+ * replace/update reallocates the tree chunk.
+ *
+ * value_flag is taken by value, so the flag decoded here is not handed back
+ * to the caller.
+ * Returns 0 on success, -1 when raw cannot be initialised, -2 when the
+ * insert fails and -3 when the decode fails (each failure is logged).
+ */
+int TreeData::encode_to_private_area(RawData &raw, RowValue &value,
+				     unsigned char value_flag)
+{
+	int rc = raw.do_init(
+		key(), raw.calc_row_size(value, p_table_->key_fields() - 1));
+	if (rc != 0) {
+		log4cplus_error("init raw-data struct error, ret=%d, err=%s",
+				rc, raw.get_err_msg());
+		return -1;
+	}
+
+	rc = raw.insert_row(value, false, false);
+	if (rc != 0) {
+		log4cplus_error("insert row to raw-data error: ret=%d, err=%s",
+				rc, raw.get_err_msg());
+		return -2;
+	}
+
+	/* read the row back from the start of the private buffer */
+	raw.rewind();
+	rc = raw.decode_row(value, value_flag, 0);
+	if (rc != 0) {
+		log4cplus_error("decode raw-data to row error: ret=%d, err=%s",
+				rc, raw.get_err_msg());
+		return -3;
+	}
+
+	return 0;
+}
+
+/*
+ * Update handling for the rows of a single chunk (hRecord).
+ *
+ * Every row of the chunk is decoded; rows for which job_op.compare_row()
+ * returns 0 are skipped. Each remaining row is rewritten with
+ * job_op.update_row(): when the index value changed the new row is inserted
+ * under the new index value and the old one deleted, otherwise the row is
+ * first round-tripped through a private RawData buffer and then overwritten
+ * via replace_cur_row(). affected_rows_ and dirty_rows_count_ are adjusted
+ * per updated row. When every row of the chunk ended up deleted, the chunk
+ * is removed from the tree and freed.
+ *
+ * Returns 0 normally; -1 when a row cannot be decoded; -3 when the private
+ * encode fails; EC_NO_MEM; -4 when the tree node cannot be deleted; -6 when
+ * replace_cur_row() fails; -10 when the number of inserted and deleted rows
+ * differ; or do_find()'s result when it is -100 or 0.
+ */
+int TreeData::update_sub_raw_data(DTCJobOperation &job_op, MEM_HANDLE_T hRecord)
+{
+	DTCTableDefinition *stpNodeTab, *stpTaskTab;
+	RowValue *stpNodeRow, *stpTaskRow, *stpCurRow;
+
+	stpNodeTab = p_table_;
+	stpTaskTab = job_op.table_definition();
+	RowValue stNewRow(stpTaskTab);
+	RowValue stNewNodeRow(stpNodeTab);
+	RowValue stCurRow(stpNodeTab);
+
+	stpTaskRow = &stNewRow;
+	stpNodeRow = &stNewNodeRow;
+	stpCurRow = &stCurRow;
+	if (stpNodeTab == stpTaskTab)
+		stpNodeRow = stpTaskRow;
+
+	p_content_ = Pointer<char>(hRecord);
+	unsigned int uiTotalRows = get_row_count();
+	offset_ = sizeof(unsigned char) +
+		  sizeof(uint32_t) * 2; //offset DataType + data_size + RowCount
+	size_ = mallocator_->chunk_size(hRecord);
+
+	unsigned char uchRowFlags;
+	uint32_t iDelete = 0;
+	uint32_t iInsert = 0;
+	for (unsigned int i = 0; i < uiTotalRows; i++) {
+		if (decode_tree_row(*stpNodeRow, uchRowFlags, 0) != 0)
+			return (-1);
+
+		if (stpNodeTab != stpTaskTab)
+			stpTaskRow->Copy(stpNodeRow);
+
+		/* keep the stored row as it was before it gets modified */
+		stpCurRow->Copy(stpNodeRow);
+
+		// skip rows that do not match the query condition
+		if (job_op.compare_row(*stpTaskRow) == 0)
+			continue;
+
+		MEM_HANDLE_T *pRawHandle = NULL;
+		int iRet = do_find(TTREE_INDEX_POS, *stpCurRow, KeyCompare,
+				   pRawHandle);
+		if (iRet == -100 || iRet == 0)
+			return iRet;
+
+		job_op.update_row(*stpTaskRow); // apply the modification
+
+		if (stpNodeTab != stpTaskTab)
+			stpNodeRow->Copy(stpTaskRow);
+
+		iRet = 0;
+		DTCValue new_value = (*stpTaskRow)[TTREE_INDEX_POS];
+		char *NewIndex = reinterpret_cast<char *>(&new_value);
+		CmpCookie cookie(p_table_, TTREE_INDEX_POS);
+
+		if (KeyCompare(NewIndex, &cookie, *mallocator_, hRecord) !=
+		    0) // the update changes the index field
+		{
+			/* insert_row() clears the chunk cursor, so save and restore it */
+			char *tmp_pchContent = p_content_;
+			uint32_t tmp_size = size_;
+			ALLOC_SIZE_T tmp_uiOffset = offset_;
+
+			iRet = insert_row(*stpTaskRow, KeyCompare, m_async);
+
+			p_content_ = tmp_pchContent;
+			size_ = tmp_size;
+			offset_ = tmp_uiOffset;
+			if (iRet == EC_NO_MEM) {
+				return iRet;
+			} else if (iRet == 0) {
+				iInsert++;
+				/* rewind to the start of the row just decoded */
+				offset_ = row_offset_;
+				if (delete_cur_row(*stpCurRow) == 0)
+					iDelete++;
+			}
+		} else {
+			// decode the row inside a private buffer
+			RawData stTmpRows(&g_stSysMalloc, 1);
+			if (encode_to_private_area(stTmpRows, *stpNodeRow,
+						   uchRowFlags)) {
+				log4cplus_error(
+					"encode rowvalue to private rawdata area failed");
+				return -3;
+			}
+
+			iRet = replace_cur_row(*stpNodeRow, m_async,
+					       pRawHandle); // write into the cache
+			if (iRet == EC_NO_MEM) {
+				return iRet;
+			}
+			if (iRet != 0) {
+				/* record the size needed so the key can be black-listed */
+				job_op.push_black_list_size(need_size());
+				return (-6);
+			}
+		}
+
+		affected_rows_++;
+		/* the old row no longer counts as dirty; the new one does if async */
+		if (uchRowFlags & OPER_DIRTY)
+			dirty_rows_count_--;
+		if (m_async)
+			dirty_rows_count_++;
+	}
+
+	if (uiTotalRows > 0 &&
+	    uiTotalRows - iDelete == 0) // every row in the chunk has been deleted
+	{
+		// delete the tree node
+		bool isFreeNode = false;
+		DTCValue value =
+			(*stpCurRow)[TTREE_INDEX_POS]; // last row visited by the loop
+		char *indexKey = reinterpret_cast<char *>(&value);
+		CmpCookie cookie(p_table_, TTREE_INDEX_POS);
+		int iret = t_tree_.Delete(indexKey, &cookie, KeyCompare,
+					  isFreeNode);
+		if (iret != 0) {
+			snprintf(err_message_, sizeof(err_message_),
+				 "delete stTree failed:%d", iret);
+			return -4;
+		}
+		if (isFreeNode)
+			p_tree_root_->tree_size_ -= sizeof(TtreeNode);
+		p_tree_root_->tree_size_ -= size_;
+		p_tree_root_->node_count_--;
+		p_tree_root_->root_handle_ = t_tree_.Root();
+		// release the chunk handle
+		mallocator_->Free(hRecord);
+	}
+
+	/* every row moved to a new index value must also have left this chunk */
+	if (iInsert != iDelete) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "update index change error: insert:%d, delete:%d",
+			 iInsert, iDelete);
+		return (-10);
+	}
+
+	return 0;
+}
+
+/*
+ * Delete handling for the rows of a single chunk (hRecord).
+ *
+ * Every row of the chunk is decoded; rows for which job_op.compare_row()
+ * returns non-zero are deleted with delete_cur_row(), and rows_count_ /
+ * dirty_rows_count_ are decremented accordingly. When all rows of the chunk
+ * were deleted, the chunk is removed from the tree and freed.
+ *
+ * Returns 0 normally; -2 when a row cannot be decoded; -5 when a row cannot
+ * be deleted; -3 when more rows were deleted than the chunk held; -4 when
+ * the tree node cannot be deleted.
+ */
+int TreeData::delete_sub_raw_data(DTCJobOperation &job_op, MEM_HANDLE_T hRecord)
+{
+	int iRet;
+	DTCTableDefinition *stpNodeTab, *stpTaskTab;
+	RowValue *stpNodeRow, *stpTaskRow;
+
+	stpNodeTab = p_table_;
+	stpTaskTab = job_op.table_definition();
+	RowValue stNodeRow(stpNodeTab);
+	RowValue stTaskRow(stpTaskTab);
+	/* when both sides use the same table definition one row object is shared */
+	if (stpNodeTab == stpTaskTab) {
+		stpNodeRow = &stTaskRow;
+		stpTaskRow = &stTaskRow;
+	} else {
+		stpNodeRow = &stNodeRow;
+		stpTaskRow = &stTaskRow;
+	}
+
+	unsigned int iAffectRows = 0;
+	unsigned char uchRowFlags;
+
+	p_content_ = Pointer<char>(hRecord);
+	unsigned int uiTotalRows = get_row_count();
+	offset_ = sizeof(unsigned char) +
+		  sizeof(uint32_t) * 2; //offset DataType + data_size + RowCount
+	size_ = mallocator_->chunk_size(hRecord);
+
+	for (unsigned int i = 0; i < uiTotalRows; i++) {
+		if ((decode_tree_row(*stpNodeRow, uchRowFlags, 0)) != 0) {
+			return (-2);
+		}
+		if (stpNodeTab != stpTaskTab) {
+			stpTaskRow->Copy(stpNodeRow);
+		}
+		if (job_op.compare_row(*stpTaskRow) != 0) { // row matches the delete condition
+			iRet = delete_cur_row(*stpNodeRow);
+			if (iRet != 0) {
+				log4cplus_error(
+					"tree-data delete row error: %d", iRet);
+				return (-5);
+			}
+			iAffectRows++;
+			rows_count_--;
+			if (uchRowFlags & OPER_DIRTY)
+				dirty_rows_count_--;
+		}
+	}
+
+	if (iAffectRows > uiTotalRows)
+		return (-3);
+	else if (iAffectRows == uiTotalRows &&
+		 uiTotalRows > 0) // every row in the chunk has been deleted
+	{
+		// delete the tree node
+		bool isFreeNode = false;
+		DTCValue value =
+			(*stpNodeRow)[TTREE_INDEX_POS]; // last row visited by the loop
+		char *indexKey = reinterpret_cast<char *>(&value);
+		CmpCookie cookie(p_table_, TTREE_INDEX_POS);
+		int iret = t_tree_.Delete(indexKey, &cookie, KeyCompare,
+					  isFreeNode);
+		if (iret != 0) {
+			snprintf(err_message_, sizeof(err_message_),
+				 "delete stTree failed:%d\t%s", iret,
+				 t_tree_.get_err_msg());
+			return -4;
+		}
+		if (isFreeNode)
+			p_tree_root_->tree_size_ -= sizeof(TtreeNode);
+		p_tree_root_->tree_size_ -= size_;
+		p_tree_root_->node_count_--;
+		p_tree_root_->root_handle_ = t_tree_.Root();
+		// release the chunk handle
+		mallocator_->Free(hRecord);
+	}
+
+	return (0);
+}
+
+/*
+ * Advance offset_ from the start of the current row (row_offset_) to the end
+ * of that row without storing any field value.
+ *
+ * stRow - only its field metadata (type, size, discard flag) is consulted, for
+ *         the fields after key_index_.
+ *
+ * Returns 0 on success, -1 if p_content_ is NULL, -2 if row_offset_ is at or
+ * past get_data_size(), and -100 when control reaches ERROR_RET.
+ * NOTE(review): the *_TREE_* macros are defined outside this view; they
+ * presumably jump to ERROR_RET when a read would leave the buffer - confirm.
+ */int TreeData::skip_row(const RowValue &stRow)
+{
+	if (p_content_ == NULL) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "rawdata not init yet");
+		return (-1);
+	}
+
+	offset_ = row_offset_;
+	if (offset_ >= get_data_size()) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "already at end of data");
+		return (-2);
+	}
+
+	SKIP_TREE_SIZE(sizeof(unsigned char)); // flag
+
+	for (int j = key_index_ + 1; j <= stRow.num_fields();
+	     j++) // walk every field of one row
+	{
+		//id: bug fix skip discard
+		if (stRow.table_definition()->is_discard(j))
+			continue;
+		int temp = 0;
+		switch (stRow.field_type(j)) {
+		case DField::Unsigned:
+		case DField::Signed:
+			GET_TREE_VALUE_AT_OFFSET(temp, int, offset_); // temp is not used after this read
+
+			if (stRow.field_size(j) > (int)sizeof(int32_t))
+				SKIP_TREE_SIZE(sizeof(int64_t));
+			else
+				SKIP_TREE_SIZE(sizeof(int32_t));
+			;
+			break;
+
+		case DField::Float: // floating point
+			if (stRow.field_size(j) > (int)sizeof(float))
+				SKIP_TREE_SIZE(sizeof(double));
+			else
+				SKIP_TREE_SIZE(sizeof(float));
+			break;
+
+		case DField::String: // string
+		case DField::Binary: // binary data
+		default: {
+			int iLen;
+			GET_TREE_VALUE(iLen, int);
+			SKIP_TREE_SIZE(iLen);
+			break;
+		}
+		} //end of switch
+	}
+
+	return (0);
+
+ERROR_RET:
+	snprintf(err_message_, sizeof(err_message_), "skip row error");
+	return (-100);
+}
+
+/*
+ * Overwrite the row that starts at row_offset_ with the contents of stRow.
+ *
+ * stRow   - the new row values.
+ * isDirty - when true the row flag is set to OPER_UPDATE; otherwise the flag
+ *           already stored at row_offset_ is written back.
+ * hRecord - in/out handle of the record; replaced when ReAlloc() returns a
+ *           different handle.
+ *
+ * If the new encoding is larger than the current one the chunk is grown and
+ * the following rows are shifted up; otherwise the old row is backed up, the
+ * new row is encoded in place, the following rows are shifted down and the
+ * chunk is shrunk (a failed shrink is ignored). On success the record's data
+ * size and the root's total_raw_size_ / tree_size_ are updated.
+ *
+ * Returns 0 on success, the skip_row() error code if the current row cannot
+ * be measured, EC_NO_MEM if growing the chunk fails, -ENOMEM if the backup
+ * buffer cannot be allocated, and -1 if encode_tree_row() fails.
+ * need_new_bufer_size is set on both allocation failures.
+ */int TreeData::replace_cur_row(const RowValue &stRow, bool isDirty,
+			      MEM_HANDLE_T *hRecord)
+{
+	int iRet = 0;
+	ALLOC_SIZE_T uiOldOffset;
+	ALLOC_SIZE_T uiNextRowsSize;
+	ALLOC_SIZE_T uiNewRowSize = 0;
+	ALLOC_SIZE_T uiCurRowSize = 0;
+	ALLOC_SIZE_T uiNextRowsOffset;
+	ALLOC_SIZE_T uiDataSize = get_data_size();
+
+	uiOldOffset = offset_;
+	if ((iRet = skip_row(stRow)) != 0) {
+		goto ERROR_RET;
+	}
+
+	unsigned char uchRowFlag;
+	GET_TREE_VALUE_AT_OFFSET(uchRowFlag, unsigned char, row_offset_);
+	if (isDirty)
+		uchRowFlag = OPER_UPDATE;
+
+	uiNewRowSize = calc_tree_row_size(stRow, key_index_);
+	uiCurRowSize = offset_ - row_offset_; // skip_row() left offset_ at the end of the current row
+	uiNextRowsOffset = offset_;
+	uiNextRowsSize = uiDataSize - offset_;
+
+	if (uiNewRowSize > uiCurRowSize) {
+		// enlarge buffer
+		MEM_HANDLE_T hTmp = mallocator_->ReAlloc(
+			*hRecord, uiDataSize + uiNewRowSize - uiCurRowSize);
+		if (hTmp == INVALID_HANDLE) {
+			snprintf(err_message_, sizeof(err_message_),
+				 "realloc error");
+			need_new_bufer_size =
+				uiDataSize + uiNewRowSize - uiCurRowSize;
+			iRet = EC_NO_MEM;
+			goto ERROR_RET;
+		}
+		p_tree_root_->tree_size_ -= size_;
+		*hRecord = hTmp;
+		size_ = mallocator_->chunk_size(*hRecord);
+		p_tree_root_->tree_size_ += size_;
+		p_content_ = Pointer<char>(*hRecord);
+
+		// move data
+		if (uiNextRowsSize > 0)
+			memmove(p_content_ + uiNextRowsOffset +
+					(uiNewRowSize - uiCurRowSize),
+				p_content_ + uiNextRowsOffset, uiNextRowsSize);
+
+		// copy new row
+		offset_ = row_offset_;
+		iRet = encode_tree_row(stRow, uchRowFlag);
+		if (iRet != 0) {
+			if (uiNextRowsSize > 0) // undo the shift of the following rows
+				memmove(p_content_ + uiNextRowsOffset,
+					p_content_ + uiNextRowsOffset +
+						(uiNewRowSize - uiCurRowSize),
+					uiNextRowsSize);
+			iRet = -1;
+			goto ERROR_RET;
+		}
+	} else {
+		// back up old row
+		void *pTmpBuf = MALLOC(uiCurRowSize);
+		if (pTmpBuf == NULL) {
+			need_new_bufer_size = uiCurRowSize;
+			snprintf(err_message_, sizeof(err_message_),
+				 "malloc error: %m");
+			return (-ENOMEM); // NOTE(review): bypasses ERROR_RET, so offset_ stays where skip_row() left it
+		}
+		memmove(pTmpBuf, p_content_ + row_offset_, uiCurRowSize);
+
+		// copy new row
+		offset_ = row_offset_;
+		iRet = encode_tree_row(stRow, uchRowFlag);
+		if (iRet != 0) {
+			memmove(p_content_ + row_offset_, pTmpBuf,
+				uiCurRowSize);
+			FREE(pTmpBuf);
+			iRet = -1;
+			goto ERROR_RET;
+		}
+
+		// move data
+		if (uiNextRowsSize > 0 && offset_ != uiNextRowsOffset)
+			memmove(p_content_ + offset_,
+				p_content_ + uiNextRowsOffset, uiNextRowsSize);
+		FREE(pTmpBuf);
+
+		// shorten buffer
+		MEM_HANDLE_T hTmp = mallocator_->ReAlloc(
+			*hRecord, uiDataSize + uiNewRowSize - uiCurRowSize);
+		if (hTmp != INVALID_HANDLE) {
+			p_tree_root_->tree_size_ -= size_;
+			*hRecord = hTmp;
+			size_ = mallocator_->chunk_size(*hRecord);
+			p_tree_root_->tree_size_ += size_;
+			p_content_ = Pointer<char>(*hRecord);
+		}
+	}
+	set_data_size(uiDataSize - uiCurRowSize + uiNewRowSize);
+	p_tree_root_->total_raw_size_ += (uiNewRowSize - uiCurRowSize);
+
+ERROR_RET:
+	offset_ = uiOldOffset + uiNewRowSize - uiCurRowSize; // NOTE(review): also runs on failure paths, where the row was not replaced
+	return (iRet);
+}
+
+/*
+ * Remove the row that starts at row_offset_ from the current record.
+ *
+ * stRow - row object whose field metadata skip_row() uses to measure the row.
+ *
+ * The rows after the removed one are moved down over it, the record header
+ * (row count, data size) and the root statistics are adjusted, and offset_ is
+ * left at row_offset_ so that the next decode reads the row that moved in.
+ *
+ * Returns 0 on success. If skip_row() fails, its error code is returned and
+ * offset_ is restored to the value it had on entry.
+ */
+int TreeData::delete_cur_row(const RowValue &stRow)
+{
+	const ALLOC_SIZE_T savedOffset = offset_;
+
+	const int skipRet = skip_row(stRow);
+	if (skipRet != 0) {
+		log4cplus_error("skip error: %d,%s", skipRet, get_err_msg());
+		offset_ = savedOffset;
+		return (skipRet);
+	}
+
+	// skip_row() left offset_ at the first byte after the row to remove
+	const ALLOC_SIZE_T tailBytes = get_data_size() - offset_;
+	memmove(p_content_ + row_offset_, p_content_ + offset_, tailBytes);
+
+	// record header
+	set_row_count(get_row_count() - 1);
+	set_data_size(get_data_size() - (offset_ - row_offset_));
+
+	// root statistics
+	p_tree_root_->row_count_--;
+	p_tree_root_->total_raw_size_ -= (offset_ - row_offset_);
+
+	offset_ = row_offset_;
+	return (0);
+}
+
+/*
+ * Append to the job's result the rows of one record that match its condition.
+ *
+ * job_op  - the query; supplies the table definition, compare_row() and the
+ *           result set.
+ * hRecord - handle of the record chunk to read.
+ *
+ * Returns 0 at once if job_op.result_full() is already true. Otherwise each
+ * row is decoded, copied into a task-table row when the two table definitions
+ * differ, and appended when compare_row() is non-zero. Scanning stops early
+ * when job_op.all_rows() and job_op.result_full() both hold, after
+ * set_total_rows() is called with the record's row count.
+ *
+ * Returns 0 on success and -2 if a row cannot be decoded.
+ */int TreeData::get_sub_raw_data(DTCJobOperation &job_op, MEM_HANDLE_T hRecord)
+{
+	//	int laid = job_op.flag_no_cache() ? -1 : job_op.table_definition()->lastacc_field_id();
+
+	if (job_op.result_full())
+		return 0;
+
+	DTCTableDefinition *stpNodeTab, *stpTaskTab;
+	RowValue *stpNodeRow, *stpTaskRow;
+	stpNodeTab = p_table_;
+	stpTaskTab = job_op.table_definition();
+	RowValue stNodeRow(stpNodeTab);
+	RowValue stTaskRow(stpTaskTab);
+	if (stpNodeTab == stpTaskTab) {
+		stpNodeRow = &stTaskRow;
+		stpTaskRow = &stTaskRow;
+	} else {
+		stpNodeRow = &stNodeRow;
+		stpTaskRow = &stTaskRow;
+	}
+
+	p_content_ = Pointer<char>(hRecord);
+	uint32_t rows = get_row_count();
+	offset_ = sizeof(unsigned char) + sizeof(uint32_t) * 2;
+	size_ = mallocator_->chunk_size(hRecord);
+
+	unsigned char uchRowFlags;
+	for (unsigned int j = 0; j < rows; j++) {
+		job_op.update_key(
+			*stpNodeRow); // use stpNodeRow is fine, as just modify key field
+		if ((decode_tree_row(*stpNodeRow, uchRowFlags, 0)) != 0) {
+			return (-2);
+		}
+		// this pointer compare is ok, as these two is both come from tabledefmanager. if they mean same, they are same object.
+		if (stpNodeTab != stpTaskTab) {
+			stpTaskRow->Copy(stpNodeRow);
+		}
+		if (job_op.compare_row(*stpTaskRow) == 0) // row does not match the query condition
+			continue;
+
+		if (stpTaskTab->expire_time_field_id() > 0)
+			stpTaskRow->update_expire_time();
+		// append the current row to the task
+		job_op.append_row(stpTaskRow);
+
+		if (job_op.all_rows() && job_op.result_full()) {
+			job_op.set_total_rows((int)rows);
+			break;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Collect record handles with an unkeyed forward traversal of the tree and
+ * run subRowProc on each of them.
+ *
+ * job_op     - the request handed to subRowProc; when it selects all rows and
+ *              carries a positive limit count, the traversal is asked for
+ *              limit_start() + limit_count() nodes, otherwise for no limit (0).
+ * nodeCnt    - capacity of the handle buffer.
+ * isAsc      - true: process handles in traversal order; false: in reverse.
+ * subRowProc - member function applied to every collected handle.
+ *
+ * Returns 0 when every call returned 0, otherwise the first non-zero result.
+ * The return value of the traversal itself is not checked.
+ */
+int TreeData::get_sub_raw(DTCJobOperation &job_op, unsigned int nodeCnt,
+			  bool isAsc, SubRowProcess subRowProc)
+{
+	pResCookie visitState;
+	MEM_HANDLE_T handles[nodeCnt];
+	visitState.p_handle = handles;
+
+	// condition: ONLY `LIMIT` without `WHERE`
+	visitState.need_find_node_count = 0;
+	if (job_op.all_rows() && job_op.requestInfo.limit_count() > 0)
+		visitState.need_find_node_count =
+			job_op.requestInfo.limit_start() +
+			job_op.requestInfo.limit_count();
+
+	t_tree_.traverse_forward(Visit, &visitState);
+
+	// ascending walks the buffer front to back, descending back to front
+	const int visited = (int)visitState.has_got_node_count;
+	for (int step = 0; step < visited; step++) {
+		const int slot = isAsc ? step : visited - 1 - step;
+		const int procRet = (this->*subRowProc)(job_op, handles[slot]);
+		if (procRet != 0)
+			return procRet;
+	}
+
+	return 0;
+}
+
+/*
+ * Pick the records to process from the job's conditions on the index field
+ * (TTREE_INDEX_POS) and run subRowProc on each of them.
+ *
+ * job_op     - the request; request_condition() supplies the conditions.
+ * NodeCnt    - capacity used for the handle buffers filled by the traversals.
+ * subRowProc - member function applied to every selected record handle.
+ *
+ * Selection, in order of precedence:
+ *   - no index condition, or exactly one that is NE: every node (get_sub_raw);
+ *   - at least one EQ: the single record found for the first EQ value;
+ *   - only GT/GE: forward traversal starting from that value;
+ *   - only LT/LE: backward traversal starting from that value;
+ *   - both sides: forward traversal between the two values;
+ *   - anything else: every node (get_sub_raw).
+ * The processing order follows p_table_->is_desc_order(TTREE_INDEX_POS).
+ *
+ * Returns 0 on success, -1 if a keyed traversal fails, -100 if the EQ lookup
+ * returns -100, or the first non-zero value returned by subRowProc.
+ */int TreeData::match_index_condition(DTCJobOperation &job_op,
+				    unsigned int NodeCnt,
+				    SubRowProcess subRowProc)
+{
+	const DTCFieldValue *condition = job_op.request_condition();
+	int numfields = 0; // number of condition fields
+	bool isAsc = !(p_table_->is_desc_order(TTREE_INDEX_POS));
+
+	if (condition)
+		numfields = condition->num_fields();
+
+	int indexIdArr[numfields]; // sized larger than what is actually used
+	int indexCount = 0; // number of conditions on the index field
+	int firstEQIndex = -1; // condition index of the first EQ on the index field, -1 if none
+
+	for (int i = 0; i < numfields; i++) {
+		if (condition->field_id(i) == TTREE_INDEX_POS) {
+			if (firstEQIndex == -1 &&
+			    condition->field_operation(i) == DField::EQ)
+				firstEQIndex = i;
+			indexIdArr[indexCount++] = i;
+		}
+	}
+
+	if (indexCount == 0 ||
+	    (indexCount == 1 && condition->field_operation(indexIdArr[0]) ==
+					DField::NE)) { // no usable index condition: scan every node
+		int iret = get_sub_raw(job_op, NodeCnt, isAsc, subRowProc);
+		if (iret != 0)
+			return iret;
+	} else if (firstEQIndex != -1) // at least one EQ condition
+	{
+		MEM_HANDLE_T *pRecord = NULL;
+
+		char *indexKey = reinterpret_cast<char *>(
+			condition->field_value(firstEQIndex));
+		CmpCookie cookie(p_table_, TTREE_INDEX_POS);
+		int iRet =
+			t_tree_.do_find(indexKey, &cookie, KeyCompare, pRecord);
+		if (iRet == -100)
+			return iRet;
+		if (pRecord != NULL) {
+			iRet = (this->*subRowProc)(job_op, *pRecord);
+			if (iRet != 0)
+				return iRet;
+		}
+	} else {
+		int leftId = -1;
+		int rightId = -1;
+
+		for (int i = 0; i < indexCount; i++) {
+			switch (condition->field_operation(indexIdArr[i])) {
+			case DField::LT:
+			case DField::LE:
+				if (rightId == -1)
+					rightId = indexIdArr[i];
+				break;
+
+			case DField::GT:
+			case DField::GE:
+				if (leftId == -1)
+					leftId = indexIdArr[i];
+				break;
+
+			default:
+				break;
+			}
+		}
+
+		if (leftId != -1 && rightId == -1) //GE
+		{
+			pResCookie resCookie;
+			MEM_HANDLE_T pCookie[NodeCnt];
+			resCookie.p_handle = pCookie;
+			resCookie.need_find_node_count = 0;
+			char *indexKey = reinterpret_cast<char *>(
+				condition->field_value(leftId));
+			CmpCookie cookie(p_table_, TTREE_INDEX_POS);
+
+			if (t_tree_.traverse_forward(indexKey, &cookie,
+						     KeyCompare, Visit,
+						     &resCookie) != 0) {
+				snprintf(err_message_, sizeof(err_message_),
+					 " traverse tree-data rows error");
+				return (-1);
+			}
+
+			if (isAsc) {
+				for (int i = 0;
+				     i < (int)resCookie.has_got_node_count;
+				     i++) {
+					int iRet = (this->*subRowProc)(
+						job_op, pCookie[i]);
+					if (iRet != 0)
+						return iRet;
+				}
+			} else {
+				for (int i = (int)resCookie.has_got_node_count -
+					     1;
+				     i >= 0; i--) {
+					int iRet = (this->*subRowProc)(
+						job_op, pCookie[i]);
+					if (iRet != 0)
+						return iRet;
+				}
+			}
+		} else if (leftId == -1 && rightId != -1) //LE
+		{
+			pResCookie resCookie;
+			MEM_HANDLE_T pCookie[NodeCnt];
+			resCookie.p_handle = pCookie;
+			resCookie.need_find_node_count = NodeCnt;
+			char *indexKey = reinterpret_cast<char *>(
+				condition->field_value(rightId));
+			CmpCookie cookie(p_table_, TTREE_INDEX_POS);
+
+			if (t_tree_.traverse_backward(indexKey, &cookie,
+						      KeyCompare, Visit,
+						      &resCookie) != 0) {
+				snprintf(err_message_, sizeof(err_message_),
+					 " traverse tree-data rows error");
+				return (-1);
+			}
+
+			if (isAsc) { // handles were collected backwards, so ascending order reads the buffer in reverse
+				for (int i = (int)resCookie.has_got_node_count -
+					     1;
+				     i >= 0; i--) {
+					int iRet = (this->*subRowProc)(
+						job_op, pCookie[i]);
+					if (iRet != 0)
+						return iRet;
+				}
+			} else {
+				for (int i = 0;
+				     i < (int)resCookie.has_got_node_count;
+				     i++) {
+					int iRet = (this->*subRowProc)(
+						job_op, pCookie[i]);
+					if (iRet != 0)
+						return iRet;
+				}
+			}
+		} else if (leftId != -1 && rightId != -1) //range
+		{
+			pResCookie resCookie;
+			MEM_HANDLE_T pCookie[NodeCnt];
+			resCookie.p_handle = pCookie;
+			resCookie.need_find_node_count = 0;
+			char *beginKey = reinterpret_cast<char *>(
+				condition->field_value(leftId));
+			char *endKey = reinterpret_cast<char *>(
+				condition->field_value(rightId));
+			CmpCookie cookie(p_table_, TTREE_INDEX_POS);
+
+			if (t_tree_.traverse_forward(beginKey, endKey, &cookie,
+						     KeyCompare, Visit,
+						     &resCookie) != 0) {
+				snprintf(err_message_, sizeof(err_message_),
+					 " traverse tree-data rows error");
+				return (-1);
+			}
+
+			if (isAsc) {
+				for (int i = 0;
+				     i < (int)resCookie.has_got_node_count;
+				     i++) {
+					int iRet = (this->*subRowProc)(
+						job_op, pCookie[i]);
+					if (iRet != 0)
+						return iRet;
+				}
+			} else {
+				for (int i = (int)resCookie.has_got_node_count -
+					     1;
+				     i >= 0; i--) {
+					int iRet = (this->*subRowProc)(
+						job_op, pCookie[i]);
+					if (iRet != 0)
+						return iRet;
+				}
+			}
+		} else //may all NE, raw data process
+		{
+			int iret =
+				get_sub_raw(job_op, NodeCnt, isAsc, subRowProc);
+			if (iret != 0)
+				return iret;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Count the rows flagged OPER_DIRTY across every record of the tree.
+ *
+ * All record handles are collected with a forward traversal, then each record
+ * is decoded row by row. p_content_, offset_ and size_ are left pointing at
+ * the last record visited.
+ *
+ * Returns the number of dirty rows, or -1 if a row cannot be decoded.
+ */
+int TreeData::get_dirty_row_count()
+{
+	const unsigned int nodeTotal = p_tree_root_->node_count_;
+	int dirtyTotal = 0;
+
+	pResCookie visitState;
+	MEM_HANDLE_T handles[nodeTotal];
+	visitState.p_handle = handles;
+	visitState.need_find_node_count = 0;
+
+	RowValue scratchRow(p_table_);
+
+	t_tree_.traverse_forward(Visit, &visitState);
+
+	const int visited = (int)visitState.has_got_node_count;
+	for (int node = 0; node < visited; node++) {
+		// position the cursor on the first row of this record
+		p_content_ = Pointer<char>(handles[node]);
+		const uint32_t rowTotal = get_row_count();
+		offset_ = sizeof(unsigned char) + sizeof(uint32_t) * 2;
+		size_ = mallocator_->chunk_size(handles[node]);
+
+		unsigned char rowFlags;
+		for (unsigned int row = 0; row < rowTotal; row++) {
+			if (decode_tree_row(scratchRow, rowFlags, 0) != 0) {
+				log4cplus_error(
+					"subraw-data decode row error: %s",
+					get_err_msg());
+				return (-1);
+			}
+
+			if (rowFlags & OPER_DIRTY)
+				dirtyTotal++;
+		}
+	}
+
+	return dirtyTotal;
+}
+
+/*
+ * Hand every dirty row of the tree to flush_req and clear its dirty flag.
+ *
+ * flush_req      - receives each dirty row through flush_row(); may be NULL,
+ *                  in which case rows are only marked clean.
+ * p_node         - not used in this function.
+ * affected_count - out: number of rows whose dirty flag was cleared.
+ *
+ * rows_count_ and dirty_rows_count_ are reset to 0 on entry, and
+ * dirty_rows_count_ is decremented once per flushed row.
+ *
+ * Returns 0 on success, -1 if a row cannot be decoded, -2 if flush_row()
+ * fails.
+ */int TreeData::flush_tree_data(DTCFlushRequest *flush_req, Node *p_node,
+			      unsigned int &affected_count)
+{
+	unsigned int uiTotalNodes = p_tree_root_->node_count_;
+
+	affected_count = 0;
+	DTCValue astKey[p_table_->key_fields()];
+	TaskPackedKey::unpack_key(p_table_, key(), astKey);
+	RowValue stRow(p_table_); // one row of data
+	for (int i = 0; i < p_table_->key_fields(); i++)
+		stRow[i] = astKey[i];
+
+	rows_count_ = 0;
+	dirty_rows_count_ = 0;
+
+	pResCookie resCookie;
+	MEM_HANDLE_T pCookie[uiTotalNodes];
+	resCookie.p_handle = pCookie;
+	resCookie.need_find_node_count = 0;
+
+	t_tree_.traverse_forward(Visit, &resCookie);
+
+	for (int i = 0; i < (int)resCookie.has_got_node_count; i++) {
+		p_content_ = Pointer<char>(pCookie[i]);
+		uint32_t rows = get_row_count();
+		offset_ = sizeof(unsigned char) + sizeof(uint32_t) * 2;
+		size_ = mallocator_->chunk_size(pCookie[i]);
+
+		unsigned char uchRowFlags;
+		for (unsigned int j = 0; j < rows; j++) {
+			if (decode_tree_row(stRow, uchRowFlags, 0) != 0) {
+				log4cplus_error(
+					"subraw-data decode row error: %s",
+					get_err_msg());
+				return (-1);
+			}
+
+			if ((uchRowFlags & OPER_DIRTY) == false) // clean row, nothing to flush
+				continue;
+
+			if (flush_req && flush_req->flush_row(stRow) != 0) {
+				log4cplus_error(
+					"do_flush() invoke flushRow() failed.");
+				return (-2);
+			}
+			set_cur_row_flag(uchRowFlags & ~OPER_DIRTY);
+			dirty_rows_count_--;
+			affected_count++;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Answer a read request from the tree.
+ *
+ * Returns 0 straight away when the root records no rows. When the job selects
+ * all rows and either only wants a count or its range excludes the stored
+ * rows, only the total row count is reported (added for batch requests, set
+ * otherwise). In every other case the rows are fetched through
+ * match_index_condition() with get_sub_raw_data as the per-record handler.
+ *
+ * Returns 0 on success or the non-zero result of match_index_condition().
+ */
+int TreeData::get_tree_data(DTCJobOperation &job_op)
+{
+	const uint32_t storedRows = p_tree_root_->row_count_;
+	if (storedRows == 0)
+		return 0;
+
+	// prepare the result object
+	job_op.prepare_result();
+
+	const bool countOnlyAnswer =
+		job_op.all_rows() &&
+		(job_op.count_only() || !job_op.in_range((int)storedRows, 0));
+	if (!countOnlyAnswer)
+		return match_index_condition(job_op, p_tree_root_->node_count_,
+					     &TreeData::get_sub_raw_data);
+
+	if (!job_op.is_batch_request())
+		job_op.set_total_rows((int)storedRows);
+	else if ((int)storedRows > 0)
+		job_op.add_total_rows((int)storedRows);
+
+	return 0;
+}
+
+/*
+ * Apply an update request to the records selected by its index conditions.
+ *
+ * job_op        - the update request.
+ * p_node        - stored in m_pstNode for the per-record handler.
+ * affected_data - not used in this function.
+ * async         - stored in m_async for the per-record handler.
+ * setrows       - not used in this function.
+ *
+ * Returns 0 when the tree has no nodes, otherwise the result of
+ * match_index_condition() run with update_sub_raw_data.
+ */
+int TreeData::update_tree_data(DTCJobOperation &job_op, Node *p_node,
+			       RawData *affected_data, bool async, bool setrows)
+{
+	const uint32_t nodeTotal = p_tree_root_->node_count_;
+	if (nodeTotal != 0) {
+		m_pstNode = p_node;
+		m_async = async;
+		dirty_rows_count_ = 0;
+
+		return match_index_condition(job_op, nodeTotal,
+					     &TreeData::update_sub_raw_data);
+	}
+
+	return 0;
+}
+
+/*
+ * Apply a delete request to the records selected by its index conditions.
+ *
+ * Returns 0 straight away when the tree has no nodes. rows_count_ and
+ * dirty_rows_count_ are reset to 0 before any row is touched. When the job
+ * selects all rows and either only wants a count or its range excludes the
+ * counted value, only a total is reported and nothing is deleted; otherwise
+ * match_index_condition() runs with delete_sub_raw_data.
+ *
+ * NOTE(review): the value used for the range test and reported as the total
+ * is node_count_, whereas get_tree_data() uses row_count_ - confirm which one
+ * is intended here.
+ *
+ * Returns 0 on success or the non-zero result of match_index_condition().
+ */
+int TreeData::delete_tree_data(DTCJobOperation &job_op)
+{
+	const uint32_t nodeTotal = p_tree_root_->node_count_;
+	if (nodeTotal == 0)
+		return 0;
+
+	rows_count_ = 0;
+	dirty_rows_count_ = 0;
+
+	// prepare the result object
+	job_op.prepare_result();
+
+	const bool countOnlyAnswer =
+		job_op.all_rows() &&
+		(job_op.count_only() || !job_op.in_range((int)nodeTotal, 0));
+	if (!countOnlyAnswer)
+		return match_index_condition(job_op, nodeTotal,
+					     &TreeData::delete_sub_raw_data);
+
+	if (!job_op.is_batch_request())
+		job_op.set_total_rows((int)nodeTotal);
+	else if ((int)nodeTotal > 0)
+		job_op.add_total_rows((int)nodeTotal);
+
+	return 0;
+}
+
+/*
+ * Read the expire-time field from the data at t_tree_.first_node().
+ *
+ * t      - not used in this function.
+ * expire - out: set to 0 first, then to the uint32_t found at the position of
+ *          field expire_id_ when that field is reached.
+ *
+ * The fields after key_index_ are stepped over by type and size until
+ * expire_id_ is met. offset_, size_ and p_content_ are overwritten and left
+ * pointing into that first node.
+ *
+ * Returns 0 on success (also when expire_id_ is -1, leaving expire at 0), -1
+ * if handle_, the tree root or the first node is invalid, and -100 when
+ * control reaches ERROR_RET.
+ * NOTE(review): scanning starts at offset 0 and skips a single byte, while
+ * the other readers in this file start after a 9-byte record header - confirm
+ * what first_node() points at.
+ */int TreeData::get_expire_time(DTCTableDefinition *t, uint32_t &expire)
+{
+	expire = 0;
+	if (unlikely(handle_ == INVALID_HANDLE)) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "root tree data not init yet");
+		return (-1);
+	}
+	if (expire_id_ == -1) {
+		expire = 0;
+		return 0;
+	}
+
+	MEM_HANDLE_T root = get_tree_root();
+	if (unlikely(root == INVALID_HANDLE)) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "root tree data not init yet");
+		return (-1);
+	}
+
+	MEM_HANDLE_T firstHanle = t_tree_.first_node();
+	if (unlikely(firstHanle == INVALID_HANDLE)) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "root tree data not init yet");
+		return (-1);
+	}
+
+	offset_ = 0;
+	size_ = mallocator_->chunk_size(firstHanle);
+	p_content_ = Pointer<char>(firstHanle);
+
+	SKIP_TREE_SIZE(sizeof(unsigned char));
+
+	for (int j = key_index_ + 1; j <= p_table_->num_fields(); j++) {
+		if (j == expire_id_) {
+			expire = *((uint32_t *)(p_content_ + offset_));
+			break;
+		}
+
+		switch (p_table_->field_type(j)) {
+		case DField::Unsigned:
+		case DField::Signed:
+			if (p_table_->field_size(j) > (int)sizeof(int32_t))
+				SKIP_TREE_SIZE(sizeof(int64_t));
+			else
+				SKIP_TREE_SIZE(sizeof(int32_t));
+			;
+			break;
+
+		case DField::Float:
+			if (p_table_->field_size(j) > (int)sizeof(float))
+				SKIP_TREE_SIZE(sizeof(double));
+			else
+				SKIP_TREE_SIZE(sizeof(float));
+			break;
+
+		case DField::String:
+		case DField::Binary:
+		default:
+			uint32_t iLen = 0;
+			GET_TREE_VALUE(iLen, int);
+			SKIP_TREE_SIZE(iLen);
+			break;
+		} //end of switch
+	}
+	return 0;
+
+	offset_ = 0; // NOTE(review): unreachable - these three resets follow the unconditional return above
+	size_ = 0;
+	p_content_ = NULL;
+
+ERROR_RET:
+	snprintf(err_message_, sizeof(err_message_), "get expire error");
+	return (-100);
+}
+
+/*
+ * Compute how many bytes one row occupies once encoded into a record.
+ *
+ * stRow  - the row to measure.
+ * keyIdx - index of the key field; only the fields after it are counted.
+ *          A value of -1 is logged as an error but measuring still proceeds.
+ *
+ * The size is one flag byte plus, for every non-discarded field: 4 or 8 bytes
+ * for integers, 4 or 8 bytes for floats (chosen from the declared field
+ * size), and an int length prefix plus the payload for everything else.
+ *
+ * Returns the byte count; a result below 2 is logged at info level.
+ */
+ALLOC_SIZE_T TreeData::calc_tree_row_size(const RowValue &stRow, int keyIdx)
+{
+	if (keyIdx == -1)
+		log4cplus_error("TreeData may not init yet...");
+
+	ALLOC_SIZE_T rowBytes = 1; // flag
+
+	// add up the encoded size of each field of the row
+	for (int field = keyIdx + 1; field <= stRow.num_fields(); field++) {
+		if (stRow.table_definition()->is_discard(field))
+			continue;
+
+		switch (stRow.field_type(field)) {
+		case DField::Signed:
+		case DField::Unsigned:
+			if (unlikely(stRow.field_size(field) >
+				     (int)sizeof(int32_t)))
+				rowBytes += sizeof(int64_t);
+			else
+				rowBytes += sizeof(int32_t);
+			break;
+
+		case DField::Float: // floating point
+			if (likely(stRow.field_size(field) >
+				   (int)sizeof(float)))
+				rowBytes += sizeof(double);
+			else
+				rowBytes += sizeof(float);
+			break;
+
+		case DField::String: // string
+		case DField::Binary: // binary data
+		default:
+			// length prefix, then the payload itself
+			rowBytes += sizeof(int);
+			rowBytes += stRow.field_value(field)->bin.len;
+			break;
+		} //end of switch
+	}
+
+	if (rowBytes < 2)
+		log4cplus_info("key_index_:%d, stRow.num_fields():%d tSize:%d",
+			       keyIdx, stRow.num_fields(), rowBytes);
+
+	return (rowBytes);
+}
+
+/*
+ * Destroy the T-tree and reset the statistics kept in the root structure.
+ *
+ * Always returns 0.
+ */
+int TreeData::destroy_sub_tree()
+{
+	t_tree_.destory();
+
+	// the root now describes an empty tree
+	RootData *const root = p_tree_root_;
+	root->root_handle_ = INVALID_HANDLE;
+	root->node_count_ = 0;
+	root->row_count_ = 0;
+	root->tree_size_ = 0;
+	root->total_raw_size_ = 0;
+
+	return 0;
+}
+
+/*
+ * Read the row count stored in the header of the record p_content_ points at.
+ *
+ * The count is a uint32_t placed after the one-byte data type and the
+ * uint32_t data size, i.e. at byte offset 5 from the start of the record.
+ * That offset is not a multiple of 4, so the value is copied out with memcpy
+ * instead of being read through a casted uint32_t pointer.
+ *
+ * p_content_ must already point at a record; it is not checked here.
+ *
+ * Returns the stored row count.
+ */
+unsigned int TreeData::get_row_count()
+{
+	uint32_t rowCount;
+	memcpy(&rowCount,
+	       p_content_ + sizeof(unsigned char) + sizeof(uint32_t),
+	       sizeof(rowCount));
+	return rowCount;
+}
+
+/*
+ * Read the data size stored in the header of the record p_content_ points at.
+ *
+ * The size is a uint32_t placed right after the one-byte data type, i.e. at
+ * byte offset 1 from the start of the record. That offset is not a multiple
+ * of 4, so the value is copied out with memcpy instead of being read through
+ * a casted uint32_t pointer.
+ *
+ * p_content_ must already point at a record; it is not checked here.
+ *
+ * Returns the stored data size.
+ */
+unsigned int TreeData::get_data_size()
+{
+	uint32_t dataSize;
+	memcpy(&dataSize, p_content_ + sizeof(unsigned char),
+	       sizeof(dataSize));
+	return dataSize;
+}
+
+/*
+ * Store `count` as the row count in the header of the current record, i.e.
+ * as a uint32_t after the one-byte data type and the uint32_t data size.
+ *
+ * Returns 0 once the value has been written. Previously the success path fell
+ * through into ERROR_RET, so the function always overwrote err_message_ and
+ * returned -100.
+ * Returns -100 when control reaches ERROR_RET.
+ * NOTE(review): SET_TREE_VALUE_AT_OFFSET is defined outside this view; it
+ * presumably jumps to ERROR_RET when the write would leave the buffer -
+ * confirm.
+ */
+int TreeData::set_row_count(unsigned int count)
+{
+	SET_TREE_VALUE_AT_OFFSET(count, uint32_t,
+				 sizeof(unsigned char) + sizeof(uint32_t));
+	return (0);
+
+ERROR_RET:
+	snprintf(err_message_, sizeof(err_message_), "set data rowcount error");
+	return (-100);
+}
+
+/*
+ * Store `data_size` as the data size in the header of the current record,
+ * i.e. as a uint32_t right after the one-byte data type.
+ *
+ * Returns 0 once the value has been written. Previously the success path fell
+ * through into ERROR_RET, so the function always overwrote err_message_ and
+ * returned -100.
+ * Returns -100 when control reaches ERROR_RET.
+ * NOTE(review): SET_TREE_VALUE_AT_OFFSET is defined outside this view; it
+ * presumably jumps to ERROR_RET when the write would leave the buffer -
+ * confirm.
+ */
+int TreeData::set_data_size(unsigned int data_size)
+{
+	SET_TREE_VALUE_AT_OFFSET(data_size, uint32_t, sizeof(unsigned char));
+	return (0);
+
+ERROR_RET:
+	snprintf(err_message_, sizeof(err_message_), "set data size error");
+	return (-100);
+}
+
+/*
+ * Overwrite the flag byte of the current row, the byte at row_offset_.
+ *
+ * uchFlag - the new flag value.
+ *
+ * Returns 0 on success, or -1 with err_message_ set when row_offset_ is not
+ * below the record's data size.
+ */
+int TreeData::set_cur_row_flag(unsigned char uchFlag)
+{
+	if (row_offset_ < get_data_size()) {
+		unsigned char *flagByte =
+			(unsigned char *)(p_content_ + row_offset_);
+		*flagByte = uchFlag;
+		return (0);
+	}
+
+	snprintf(err_message_, sizeof(err_message_), "no more rows");
+	return (-1);
+}

+ 574 - 0
src/core/tree/tree_data.h

@@ -0,0 +1,574 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef TREE_DATA_H
+#define TREE_DATA_H
+
+#include "raw/raw_data.h"
+#include "t_tree.h"
+#include "protocol.h"
+#include "task/task_request.h"
+#include "value.h"
+#include "field/field.h"
+#include "section.h"
+#include "table/table_def.h"
+
+typedef enum _TreeCheckResult {
+	CHK_CONTINUE, // keep visiting this subtree
+	CHK_SKIP, // ignore this subtree and go on with the other nodes
+	CHK_STOP, // terminate the visiting loop
+	CHK_DESTROY // destroy this subtree
+} TreeCheckResult;
+
+#define TTREE_INDEX_POS 1
+
+/*
+ * Function-pointer types used by the tree code.
+ * CheckTreeFunc yields a TreeCheckResult; VisitRawData yields an int and
+ * receives the record handle and a row-count increment by reference.
+ * SubRowProcess points at a TreeData member function that takes a job and one
+ * record handle.
+ */typedef TreeCheckResult (*CheckTreeFunc)(MallocBase &stMalloc,
+					 uint8_t uchIndexCnt,
+					 uint8_t uchCurIdxCnt,
+					 const RowValue *pstIndexValue,
+					 const uint32_t uiTreeRowNum,
+					 void *pCookie);
+typedef int (*VisitRawData)(MallocBase &stMalloc, uint8_t uchIndexCnt,
+			    const RowValue *pstIndexValue,
+			    ALLOC_HANDLE_T &hHandle, int64_t &llRowNumInc,
+			    void *pCookie);
+class TreeData;
+typedef int (TreeData::*SubRowProcess)(DTCJobOperation &job_op,
+				       MEM_HANDLE_T hRecord);
+
+class DTCFlushRequest;
+
+/************************************************************
+  Description:    data structure of the t-tree root node
+  Version:         DTC 3.0
+***********************************************************/
+struct _RootData {
+	unsigned char data_type_;
+	uint32_t tree_size_;
+	uint32_t total_raw_size_; // sum of all RawData, excluding the header
+	uint32_t node_count_; // total number of nodes in the index T-tree
+	uint32_t row_count_; // total number of rows in the index T-tree
+	uint8_t get_request_count_;
+	uint16_t latest_request_time_;
+	uint16_t latest_update_time_;
+	uint16_t create_time_;
+	MEM_HANDLE_T root_handle_;
+	char p_key_[0];
+} __attribute__((packed));
+typedef struct _RootData RootData;
+
+class DTCTableDefinition;
+/*
+ * Pair of a table definition and a field index, filled in by the constructor.
+ */typedef struct _CmpCookie {
+	const DTCTableDefinition *p_table_;
+	uint8_t m_index_;
+	_CmpCookie(const DTCTableDefinition *p_table_definition_,
+		   uint8_t index_)
+	{
+		p_table_ = p_table_definition_;
+		m_index_ = index_;
+	}
+} CmpCookie;
+
+typedef struct _pCookie {
+	MEM_HANDLE_T *p_handle;
+	uint32_t has_got_node_count; // number of nodes visited so far
+	uint32_t need_find_node_count; // number of nodes to visit; 0 means no limit
+	uint32_t has_got_row_count; // number of data rows visited so far
+	_pCookie()
+		: p_handle(NULL), has_got_node_count(0),
+		  need_find_node_count(0), has_got_row_count(0)
+	{
+	}
+} pResCookie;
+
+typedef enum _CondType {
+	COND_VAL_SET, // query a specific list of values
+	COND_RANGE, // query value[0] ~ Key-value[0]<=value[1].s64
+	COND_GE, // query keys greater than or equal to value[0]
+	COND_LE, // query keys less than or equal to value[0]
+	COND_ALL // traverse all keys
+} CondType;
+
+typedef enum _Order {
+	ORDER_ASC, // ascending order
+	ORDER_DEC, // descending order
+	ORDER_POS, // post-order visit
+} Order;
+
+typedef struct {
+	unsigned char cond_type; // presumably a CondType value - TODO confirm
+	unsigned char ch_order; // presumably an Order value - TODO confirm
+	unsigned int value_num; // presumably the number of entries in p_value - TODO confirm
+	DTCValue *p_value;
+} TtreeCondition;
+
+class TreeData {
+    private:
+	RootData *p_tree_root_; // 注意:地址可能会因为realloc而改变
+	Ttree t_tree_;
+	DTCTableDefinition *p_table_;
+	uint8_t index_depth_;
+	int table_index_;
+	char err_message_[100];
+
+	ALLOC_SIZE_T need_new_bufer_size; // 最近一次分配内存失败需要的大小
+	uint64_t affected_rows_;
+
+	MEM_HANDLE_T handle_;
+	uint32_t size_;
+	uint32_t _root_size;
+	MallocBase *mallocator_;
+	Node *m_pstNode;
+	bool m_async;
+	int64_t rows_count_;
+	int64_t dirty_rows_count_;
+
+	int key_size_;
+	uint8_t key_index_;
+	int expire_id_;
+	int m_iLAId;
+	int m_iLCmodId;
+	ALLOC_SIZE_T m_uiLAOffset;
+
+	ALLOC_SIZE_T offset_;
+	ALLOC_SIZE_T row_offset_;
+	char *p_content_;
+
+	bool index_part_of_uniq_field_;
+	MEM_HANDLE_T p_record_;
+
+	/************************************************************
+	  Description:    递归查找数据的cookie参数
+	  Version:         DTC 3.0
+	***********************************************************/
+	typedef struct {
+		TreeData *m_pst_tree_;
+		uint8_t m_uch_cond_idx_cnt_;
+		uint8_t m_uch_cur_index_;
+		MEM_HANDLE_T m_h_handle_;
+		int64_t m_ll_affect_rows_;
+		const int *pi_inclusion_;
+		KeyComparator m_pf_comp_;
+		const RowValue *m_pst_cond_;
+		RowValue *m_pst_index_value_;
+		VisitRawData m_pf_visit_;
+		void *m_pCookie_;
+	} CIndexCookie;
+
+	typedef struct {
+		TreeData *m_pst_tree_;
+		uint8_t m_uch_cur_cond_;
+		MEM_HANDLE_T m_h_handle_;
+		int64_t m_ll_affect_rows_;
+		const TtreeCondition *m_pst_cond_;
+		KeyComparator m_pf_comp_;
+		RowValue *m_pst_index_value_;
+		CheckTreeFunc m_pf_check_;
+		VisitRawData m_pf_visit_;
+		void *m_p_cookie_;
+	} CSearchCookie;
+
+	int set_data_size(unsigned int data_size);
+	int set_row_count(unsigned int count);
+	unsigned int get_data_size();
+	unsigned int get_row_count();
+
+    protected:
+	template <class T> T *Pointer(void) const
+	{
+		return reinterpret_cast<T *>(
+			mallocator_->handle_to_ptr(handle_));
+	}
+
+	template <class T> T *Pointer(MEM_HANDLE_T handle) const
+	{
+		return reinterpret_cast<T *>(
+			mallocator_->handle_to_ptr(handle));
+	}
+
+	int encode_to_private_area(RawData &raw, RowValue &value,
+				   unsigned char value_flag);
+
+	inline int pack_key(const RowValue &stRow, uint8_t uchKeyIdx,
+			    int &iKeySize, char *&pchKey,
+			    unsigned char achKeyBuf[]);
+	inline int pack_key(const DTCValue *pstVal, uint8_t uchKeyIdx,
+			    int &iKeySize, char *&pchKey,
+			    unsigned char achKeyBuf[]);
+	inline int unpack_key(char *pchKey, uint8_t uchKeyIdx, RowValue &stRow);
+
+	int insert_sub_tree(uint8_t uchCurIndex, uint8_t uchCondIdxCnt,
+			    const RowValue &stCondition, KeyComparator pfComp,
+			    ALLOC_HANDLE_T hRoot);
+	int insert_sub_tree(uint8_t uchCondIdxCnt, const RowValue &stCondition,
+			    KeyComparator pfComp, ALLOC_HANDLE_T hRoot);
+	int insert_sub_tree(uint8_t uchCondIdxCnt, KeyComparator pfComp,
+			    ALLOC_HANDLE_T hRoot);
+	int insert_row_flag(uint8_t uchCurIndex, const RowValue &stRow,
+			    KeyComparator pfComp, unsigned char uchFlag);
+	int do_find(CIndexCookie *pstIdxCookie);
+	int do_find(uint8_t uchCondIdxCnt, const RowValue &stCondition,
+		    KeyComparator pfComp, ALLOC_HANDLE_T &hRecord);
+	int do_find(uint8_t uchCondIdxCnt, const RowValue &stCondition,
+		    KeyComparator pfComp, ALLOC_HANDLE_T *&hRecord);
+	static int search_visit(MallocBase &stMalloc, ALLOC_HANDLE_T &hRecord,
+				void *pCookie);
+	int do_search(CSearchCookie *pstSearchCookie);
+	int Delete(CIndexCookie *pstIdxCookie);
+	int Delete(uint8_t uchCondIdxCnt, const RowValue &stCondition,
+		   KeyComparator pfComp, ALLOC_HANDLE_T &hRecord);
+
+    public:
+	TreeData(MallocBase *pstMalloc);
+	~TreeData();
+
+	const char *get_err_msg()
+	{
+		return err_message_;
+	}
+	MEM_HANDLE_T get_handle()
+	{
+		return handle_;
+	}
+	int do_attach(MEM_HANDLE_T hHandle);
+	int do_attach(MEM_HANDLE_T hHandle, uint8_t uchKeyIdx, int iKeySize,
+		      int laid = -1, int lcmodid = -1, int expireid = -1);
+
+	const MEM_HANDLE_T get_tree_root() const
+	{
+		return t_tree_.Root();
+	}
+
+	/*************************************************
+	  Description:	新分配一块内存,并初始化
+	  Input:		 iKeySize	key的格式,0为变长,非0为定长长度
+				pchKey	为格式化后的key,变长key的第0字节为长度
+	  Output:		
+	  Return:		0为成功,非0失败
+	*************************************************/
+	int do_init(int iKeySize, const char *pchKey);
+	int do_init(uint8_t uchKeyIdx, int iKeySize, const char *pchKey,
+		    int laId = -1, int expireId = -1, int nodeIdx = -1);
+	int do_init(const char *pchKey);
+
+	const char *key() const
+	{
+		return p_tree_root_ ? p_tree_root_->p_key_ : NULL;
+	}
+	char *key()
+	{
+		return p_tree_root_ ? p_tree_root_->p_key_ : NULL;
+	}
+
+	unsigned int total_rows()
+	{
+		return p_tree_root_->row_count_;
+	}
+	uint64_t get_affectedrows()
+	{
+		return affected_rows_;
+	}
+	void set_affected_rows(int num)
+	{
+		affected_rows_ = num;
+	}
+
+	/*************************************************
+	  Description:	最近一次分配内存失败所需要的内存大小
+	  Input:		
+	  Output:		
+	  Return:		返回所需要的内存大小
+	*************************************************/
+	ALLOC_SIZE_T need_size()
+	{
+		return need_new_bufer_size;
+	}
+
+	/*************************************************
+	  Description:	Destroy the subtree at level uchLevel and every level below it
+	  Input:		uchLevel	level to destroy from; it should lie between 1 and uchIndexDepth
+	  Output:		
+	  Return:		0 on success, non-zero on failure
+	  NOTE(review):	the live declaration takes no uchLevel argument; the
+				parameter description above belongs to the commented-out overload.
+	*************************************************/
+	//	int destory(uint8_t uchLevel=1);
+	int destory();
+
+	/*************************************************
+	  Description:	Insert one row
+	  Input:		stRow	holds the values of the index field and the fields after it
+				pfComp	user-defined key comparison function
+				uchFlag	row flag
+	  Output:		
+	  Return:		0 on success, non-zero on failure
+	*************************************************/
+	int insert_row_flag(const RowValue &stRow, KeyComparator pfComp,
+			    unsigned char uchFlag);
+
+	/*************************************************
+	  Description:	Insert one row
+	  Input:		stRow	holds the values of the index field and the fields after it
+				pfComp	user-defined key comparison function
+				isDirty	whether the row is dirty data
+	  Output:		
+	  Return:		0 on success, non-zero on failure
+	*************************************************/
+	int insert_row(const RowValue &stRow, KeyComparator pfComp,
+		       bool isDirty);
+
+	/*************************************************
+	  Description:	Look up one row
+	  Input:		stCondition	holds the values of the index fields of every level
+				pfComp	user-defined key comparison function
+				
+	  Output:		hRecord	handle pointing to the CRawData that was found
+	  Return:		0 when nothing is found, 1 when data is found
+	*************************************************/
+	int do_find(const RowValue &stCondition, KeyComparator pfComp,
+		    ALLOC_HANDLE_T &hRecord);
+
+	/*************************************************
+	  Description:	Search by index conditions
+	  Input:		pstCond	an array whose size is exactly uchIndexDepth
+				pfComp	user-defined key comparison function
+				pfVisit	user-defined function that visits the data when a record is found
+				pCookie	cookie argument handed to the visit function
+	  Output:		
+	  Return:		0 on success, any other value is an error
+	  NOTE(review):	pfCheck was not described by the original comment.
+	*************************************************/
+	int do_search(const TtreeCondition *pstCond, KeyComparator pfComp,
+		      VisitRawData pfVisit, CheckTreeFunc pfCheck,
+		      void *pCookie);
+
+	/*************************************************
+	  Description:	Traverse all data in ascending order
+	  Input:		pfComp	user-defined key comparison function
+				pfVisit	user-defined function that visits the data when a record is found
+				pCookie	cookie argument handed to the visit function
+	  Output:		
+	  Return:		0 on success, any other value is an error
+	*************************************************/
+	int traverse_forward(KeyComparator pfComp, VisitRawData pfVisit,
+			     void *pCookie);
+
+	/*************************************************
+	  Description:	Delete every row (subtrees included) matching the given index values
+	  Input:		uchCondIdxCnt	number of condition indexes
+				stCondition		holds the values of the index fields of every level
+				pfComp		user-defined key comparison function
+				
+	  Output:		
+	  Return:		0 on success, any other value is an error
+	*************************************************/
+	int delete_sub_row(uint8_t uchCondIdxCnt, const RowValue &stCondition,
+			   KeyComparator pfComp);
+
+	/*************************************************
+	  Description:	Change the index value of one level to another value
+	  Input:		uchCondIdxCnt	number of condition indexes
+				stCondition		holds the values of the index fields of every level
+				pfComp		user-defined key comparison function
+				pstNewValue	new index value for the last condition field
+	  Output:		
+	  Return:		0 on success, any other value is an error
+	*************************************************/
+	int update_index(uint8_t uchCondIdxCnt, const RowValue &stCondition,
+			 KeyComparator pfComp, const DTCValue *pstNewValue);
+	unsigned ask_for_destroy_size(void);
+
+	DTCTableDefinition *get_node_table_def() /* Returns the p_table_ pointer held by this object. */
+	{
+		return p_table_;
+	}
+
+	void change_mallocator(MallocBase *pstMalloc) /* Replaces the mallocator_ pointer with pstMalloc. */
+	{
+		mallocator_ = pstMalloc;
+	}
+
+	int expand_tree_chunk(MEM_HANDLE_T *pRecord, ALLOC_SIZE_T expand_size);
+
+	/*************************************************
+	  Description:	destroy data in t-tree
+	  Output:		
+	*************************************************/
+	int destroy_sub_tree();
+
+	/*************************************************
+	  Description:	copy data from raw to t-tree
+	  Output:		
+	*************************************************/
+	int copy_tree_all(RawData *new_data);
+
+	/*************************************************
+	  Description:	copy data from t-tree to raw
+	  Output:		
+	*************************************************/
+	int copy_raw_all(RawData *new_data);
+
+	/*************************************************
+	  Description:	get tree data from t-tree
+	  Output:		
+	*************************************************/
+	int decode_tree_row(RowValue &stRow, unsigned char &uchRowFlags,
+			    int iDecodeFlag = 0);
+
+	/*************************************************
+	  Description:	set tree data from t-tree
+	  Output:		
+	*************************************************/
+	int encode_tree_row(const RowValue &stRow, unsigned char uchOp);
+
+	/*************************************************
+	  Description: compare row data value	
+	  Output:		
+	*************************************************/
+	int compare_tree_data(RowValue *stpNodeRow);
+
+	/*************************************************
+	  Description:	get data in t-tree
+	  Output:		
+	*************************************************/
+	int get_tree_data(DTCJobOperation &job_op);
+
+	/*************************************************
+	  Description:	flush data in t-tree
+	  Output:		
+	*************************************************/
+	int flush_tree_data(DTCFlushRequest *flush_req, Node *p_node,
+			    unsigned int &affected_count);
+
+	/*************************************************
+	  Description:	presumably deletes data in t-tree, going by the method
+				name (the original comment read "get data in t-tree",
+				which looks copy-pasted) - TODO confirm
+	  Output:		
+	*************************************************/
+	int delete_tree_data(DTCJobOperation &job_op);
+
+	/*************************************************
+	  Description:	Get the data of every Raw-type row in the T-tree
+	  Output:		
+	*************************************************/
+	int get_sub_raw_data(DTCJobOperation &job_op, MEM_HANDLE_T hRecord);
+
+	/*************************************************
+	  Description:	Delete Raw-type row data in the T-tree
+	  Output:		
+	*************************************************/
+	int delete_sub_raw_data(DTCJobOperation &job_op, MEM_HANDLE_T hRecord);
+
+	/*************************************************
+	  Description:	Update Raw-type row data in the T-tree
+	  Output:		
+	*************************************************/
+	int update_sub_raw_data(DTCJobOperation &job_op, MEM_HANDLE_T hRecord);
+
+	/*************************************************
+	  Description:	Replace Raw-type row data in the T-tree; the row is created when it does not exist
+	  Output:		
+	*************************************************/
+	int replace_sub_raw_data(DTCJobOperation &job_op, MEM_HANDLE_T hRecord);
+
+	/*************************************************
+	  Description:	Handle flat-type requests in the T-tree
+	  Output:		
+	*************************************************/
+	int get_sub_raw(DTCJobOperation &job_op, unsigned int nodeCnt,
+			bool isAsc, SubRowProcess subRowProc);
+
+	/*************************************************
+	  Description:	Match index conditions
+	  Output:		
+	*************************************************/
+	int match_index_condition(DTCJobOperation &job_op, unsigned int rowCnt,
+				  SubRowProcess subRowProc);
+
+	/*************************************************
+	  Description:	update data in t-tree
+	  Output:		
+	*************************************************/
+	int update_tree_data(DTCJobOperation &job_op, Node *p_node,
+			     RawData *affected_data, bool async, bool setrows);
+
+	/*************************************************
+	  Description:	replace data in t-tree
+	  Output:		
+	*************************************************/
+	int replace_tree_data(DTCJobOperation &job_op, Node *p_node,
+			      RawData *affected_data, bool async,
+			      unsigned char &RowFlag, bool setrows);
+
+	/*************************************************
+	  Description:	calculate row data size
+	  Output:		
+	*************************************************/
+	ALLOC_SIZE_T calc_tree_row_size(const RowValue &stRow, int keyIdx);
+
+	/*************************************************
+	  Description:	get expire time
+	  Output:		
+	*************************************************/
+	int get_expire_time(DTCTableDefinition *t, uint32_t &expire);
+
+	/*************************************************
+	  Description:	Replace the current row
+	  Input:		stRow	only the row's field-type information is used; actual data is not needed
+	  Output:		
+	  Return:		0 on success, non-zero on failure
+	*************************************************/
+	int replace_cur_row(const RowValue &stRow, bool isDirty,
+			    MEM_HANDLE_T *hRecord);
+
+	/*************************************************
+	  Description:	Delete the current row
+	  Input:		stRow	only the row's field-type information is used; actual data is not needed
+	  Output:		
+	  Return:		0 on success, non-zero on failure
+	*************************************************/
+	int delete_cur_row(const RowValue &stRow);
+
+	/*************************************************
+	  Description:	Skip to the next row
+	  Input:		stRow	only the row's field-type information is used; actual data is not needed
+	  Output:		m_uiOffset will point at the offset of the next row's data
+	  Return:		0 on success, non-zero on failure
+	*************************************************/
+	int skip_row(const RowValue &stRow);
+
+	/*************************************************
+    Description: Returns the current value of dirty_rows_count_.
+                 NOTE(review): by analogy with get_increase_row_count() this is
+                 presumably the dirty-row delta of the last operation - confirm.
+    Output: 
+    *************************************************/
+	int64_t get_increase_dirty_row_count()
+	{
+		return dirty_rows_count_;
+	}
+
+	/*************************************************
+	  Description:	Number of rows added by the current operation (may be negative)
+	  Input:		
+	  Output:		
+	  Return:		the row count (value of rows_count_)
+	*************************************************/
+	int64_t get_increase_row_count()
+	{
+		return rows_count_;
+	}
+
+	int set_cur_row_flag(unsigned char uchFlag); /* NOTE(review): presumably stores uchFlag on the current row; defined outside this view - confirm. */
+
+	int get_dirty_row_count(); /* NOTE(review): presumably counts rows flagged dirty; defined outside this view - confirm. */
+};
+
+#endif

+ 81 - 0
src/core/tree/tree_data_keycmp.h

@@ -0,0 +1,81 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <ctype.h>
+#include <string.h>
+
+/*
+ * Case-insensitive comparison of two NUL-terminated strings.
+ * Each byte is read as unsigned char and folded with toupper() before comparing.
+ * Returns 0 when equal, otherwise the difference of the first pair of folded
+ * bytes that differ.
+ */
+static inline int stricmp(const char *p, const char *q)
+{
+	const unsigned char *lhs = (const unsigned char *)p;
+	const unsigned char *rhs = (const unsigned char *)q;
+
+	for (;; ++lhs, ++rhs) {
+		const int folded_lhs = toupper(*lhs);
+		const int folded_rhs = toupper(*rhs);
+		if (folded_lhs != folded_rhs) {
+			return folded_lhs - folded_rhs;
+		}
+		/* Both bytes fold to the same value; stop once the terminator is hit. */
+		if (*lhs == '\0') {
+			return 0;
+		}
+	}
+}
+
+/*
+ * Case-insensitive comparison of at most n bytes of two NUL-terminated strings.
+ * Returns the difference of the first pair of toupper()-folded bytes that
+ * differ, or 0 when the first n bytes match or both strings end earlier.
+ */
+static inline int strincmp(const char *p, const char *q, size_t n)
+{
+	for (size_t idx = 0; idx < n; ++idx) {
+		const unsigned char lhs = (unsigned char)p[idx];
+		const unsigned char rhs = (unsigned char)q[idx];
+		const int delta = toupper(lhs) - toupper(rhs);
+		if (delta != 0) {
+			return delta;
+		}
+		/* Folded bytes are equal: a terminator here ends both strings. */
+		if (lhs == '\0') {
+			return 0;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Case-insensitive, locale-aware comparison: both strings are copied with
+ * every byte folded through toupper(), then the copies are compared with
+ * strcoll(). Returns whatever strcoll() returns for the folded copies.
+ *
+ * Strings shorter than 256 bytes are folded into a stack buffer; longer ones
+ * use a heap buffer. The buffer is owned by a small local helper whose
+ * destructor releases it, so the first buffer is no longer leaked when the
+ * second allocation throws. Lengths are tracked as size_t rather than int.
+ */
+static inline int stricoll(const char *p, const char *q)
+{
+	/* Upper-cased copy of one C string; not meant to be copied. */
+	struct UpperCopy {
+		char stack_buf[256];
+		char *data;
+
+		explicit UpperCopy(const char *src) : data(stack_buf)
+		{
+			const size_t len = strlen(src);
+			if (len >= sizeof(stack_buf)) {
+				data = new char[len + 1];
+			}
+			for (size_t i = 0; i < len; i++) {
+				/* Mask to 0..255 so toupper() never sees a negative char. */
+				data[i] = (char)toupper(src[i] & 0xFF);
+			}
+			data[len] = '\0';
+		}
+
+		~UpperCopy()
+		{
+			if (data != stack_buf) {
+				delete[] data;
+			}
+		}
+	};
+
+	const UpperCopy p_upper(p);
+	const UpperCopy q_upper(q);
+	return strcoll(p_upper.data, q_upper.data);
+}

+ 728 - 0
src/core/tree/tree_data_process.cc

@@ -0,0 +1,728 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "tree_data_process.h"
+#include "global.h"
+#include "log/log.h"
+#include "sys_malloc.h"
+
+DTC_USING_NAMESPACE
+
+/*
+ * Build a tree-data processor on top of the given allocator, table definition
+ * and buffer pond. *pstUpdateMode is copied by value into update_mode_.
+ *
+ * The row counters and the error-message buffer are plain members without
+ * default initializers, so they are set here; otherwise they would hold
+ * indeterminate values until the first operation assigns them.
+ */
+TreeDataProcess::TreeDataProcess(MallocBase *pstMalloc,
+				 DTCTableDefinition *p_table_definition_,
+				 BufferPond *pstPool,
+				 const UpdateMode *pstUpdateMode)
+	: m_stTreeData(pstMalloc), p_table_(p_table_definition_),
+	  p_mallocator_(pstMalloc), p_buffer_pond_(pstPool)
+{
+	memcpy(&update_mode_, pstUpdateMode, sizeof(update_mode_));
+	rows_count_ = 0;
+	dirty_rows_count_ = 0;
+	err_message_[0] = '\0';
+	nodeSizeLimit = 0;
+	history_rowsize = g_stat_mgr.get_sample(ROW_SIZE_HISTORY_STAT);
+}
+
+TreeDataProcess::~TreeDataProcess() /* Empty body: no explicit cleanup is performed here. */
+{
+}
+
+/*
+ * Attach to the tree data referenced by p_node and ask it for the expire time
+ * of table t, which is written to `expire`.
+ * Returns 0 on success, otherwise the non-zero code reported by do_attach()
+ * or by TreeData::get_expire_time().
+ */
+int TreeDataProcess::get_expire_time(DTCTableDefinition *t, Node *p_node,
+				     uint32_t &expire)
+{
+	const int attach_rc = m_stTreeData.do_attach(p_node->vd_handle());
+	if (attach_rc != 0) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "attach data error");
+		log4cplus_error("tree-data attach[handle:" UINT64FMT
+				"] error: %d,%s",
+				p_node->vd_handle(), attach_rc,
+				m_stTreeData.get_err_msg());
+		return attach_rc;
+	}
+
+	const int expire_rc = m_stTreeData.get_expire_time(t, expire);
+	if (expire_rc == 0)
+		return 0;
+
+	log4cplus_error("tree data get expire time error: %d", expire_rc);
+	return expire_rc;
+}
+
+/*
+ * Replace the whole content of p_node with the rows held in new_data.
+ *
+ * A temporary TreeData is initialized with new_data's key and filled through
+ * copy_tree_all(); each step is retried once after try_purge_size() when it
+ * reports EC_NO_MEM. Only after both steps succeed is the node's previous
+ * data destroyed and its handle switched to the new tree.
+ *
+ * Returns 0 on success and -2 when init or copy fails (err_message_ is set).
+ * A copy failure is now reported as a copy error; it used to reuse the
+ * "init error" text of the step before it.
+ */
+int TreeDataProcess::do_replace_all(Node *p_node, RawData *new_data)
+{
+	int iRet;
+
+	log4cplus_debug("Replace TreeData start ");
+
+	rows_count_ = 0;
+	dirty_rows_count_ = 0;
+
+	TreeData tmpTreeData(p_mallocator_);
+
+	iRet = tmpTreeData.do_init(new_data->key());
+	if (iRet == EC_NO_MEM) {
+		if (p_buffer_pond_->try_purge_size(tmpTreeData.need_size(),
+						   *p_node) == 0)
+			iRet = tmpTreeData.do_init(new_data->key());
+	}
+
+	if (iRet != 0) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "root-data init error: %s", tmpTreeData.get_err_msg());
+		tmpTreeData.destory();
+		return (-2);
+	}
+
+	iRet = tmpTreeData.copy_tree_all(new_data);
+	if (iRet == EC_NO_MEM) {
+		if (p_buffer_pond_->try_purge_size(tmpTreeData.need_size(),
+						   *p_node) == 0)
+			iRet = tmpTreeData.copy_tree_all(new_data);
+	}
+
+	if (iRet != 0) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "root-data copy error: %s", tmpTreeData.get_err_msg());
+		tmpTreeData.destory();
+		return (-2);
+	}
+
+	/* The new tree is complete: drop the old data, then publish the new handle. */
+	if (p_node->vd_handle() != INVALID_HANDLE)
+		destroy_data(p_node);
+	p_node->vd_handle() = tmpTreeData.get_handle();
+
+	if (tmpTreeData.total_rows() > 0) {
+		history_rowsize.push(tmpTreeData.total_rows());
+	}
+	return (0);
+}
+
+/*
+ * Append the row described by job_op to the tree data of p_node.
+ *
+ * The row is built in the job's table layout (defaults, then the job's update
+ * values, then the auto-increment insert id when applicable) and converted to
+ * the node's table layout when the two definitions differ. When the node
+ * already holds rows, unique-key constraints are checked first.
+ *
+ * Returns 0 on success, the do_attach()/compare_tree_data() code on those
+ * failures, -1062 on a duplicate key and -2 when the insert fails.
+ */
+int TreeDataProcess::do_append(DTCJobOperation &job_op, Node *p_node,
+			       RawData *affected_data, bool isDirty,
+			       bool setrows)
+{
+	int rc = m_stTreeData.do_attach(p_node->vd_handle());
+	if (rc != 0) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "attach data error");
+		log4cplus_error("tree-data attach[handle:" UINT64FMT
+				"] error: %d,%s",
+				p_node->vd_handle(), rc,
+				m_stTreeData.get_err_msg());
+		return (rc);
+	}
+
+	DTCTableDefinition *node_table = m_stTreeData.get_node_table_def();
+	DTCTableDefinition *task_table = job_op.table_definition();
+
+	RowValue task_row(task_table);
+	RowValue converted_row(node_table);
+	task_row.default_value();
+	job_op.update_row(task_row);
+
+	if (task_table->auto_increment_field_id() >= task_table->key_fields() &&
+	    job_op.resultInfo.insert_id()) {
+		const int auto_inc_field = task_table->auto_increment_field_id();
+		const uint64_t insert_id = job_op.resultInfo.insert_id();
+		task_row.field_value(auto_inc_field)->Set(insert_id);
+	}
+
+	/* Use the task row directly unless the node stores another table layout. */
+	RowValue *node_row = &task_row;
+	if (node_table != task_table) {
+		node_row = &converted_row;
+		node_row->default_value();
+		node_row->Copy(&task_row);
+	}
+
+	log4cplus_debug("AppendTreeData start! ");
+
+	rows_count_ = 0;
+	dirty_rows_count_ = 0;
+
+	const unsigned int existing_rows = m_stTreeData.total_rows();
+	if (existing_rows > 0) {
+		if ((isDirty || setrows) &&
+		    job_op.table_definition()->key_as_uniq_field()) {
+			snprintf(err_message_, sizeof(err_message_),
+				 "duplicate key error");
+			return (-1062);
+		}
+		if (setrows &&
+		    job_op.table_definition()->key_part_of_uniq_field()) {
+			rc = m_stTreeData.compare_tree_data(node_row);
+			if (rc < 0) {
+				log4cplus_error(
+					"tree-data decode row error: %d,%s",
+					rc, m_stTreeData.get_err_msg());
+				return rc;
+			}
+			if (rc == 0) {
+				snprintf(err_message_, sizeof(err_message_),
+					 "duplicate key error");
+				return (-1062);
+			}
+		}
+	}
+
+	/* Insert the row; on EC_NO_MEM purge once and retry. */
+	rc = m_stTreeData.insert_row(*node_row, KeyCompare, isDirty);
+	if (rc == EC_NO_MEM &&
+	    p_buffer_pond_->try_purge_size(m_stTreeData.need_size(),
+					   *p_node) == 0) {
+		rc = m_stTreeData.insert_row(*node_row, KeyCompare, isDirty);
+	}
+	if (rc != EC_NO_MEM)
+		p_node->vd_handle() = m_stTreeData.get_handle();
+	if (rc != 0) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "tree-data insert row error: %s,%d",
+			 m_stTreeData.get_err_msg(), rc);
+		/* Record the required size for the black list. */
+		job_op.push_black_list_size(m_stTreeData.need_size());
+		return (-2);
+	}
+
+	if (job_op.resultInfo.affected_rows() == 0 || setrows == true)
+		job_op.resultInfo.set_affected_rows(1);
+	rows_count_++;
+	if (isDirty)
+		dirty_rows_count_++;
+	history_rowsize.push(m_stTreeData.total_rows());
+	return (0);
+}
+
+/*
+ * Serve a read request: attach to the node's tree data and hand the job to
+ * TreeData::get_tree_data().
+ * Returns 0 on success, -1 when the attach fails, otherwise the code reported
+ * by get_tree_data().
+ */
+int TreeDataProcess::do_get(DTCJobOperation &job_op, Node *p_node)
+{
+	log4cplus_debug("Get TreeData start! ");
+
+	rows_count_ = 0;
+	dirty_rows_count_ = 0;
+
+	const int attach_rc = m_stTreeData.do_attach(p_node->vd_handle());
+	if (attach_rc != 0) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "attach data error");
+		log4cplus_error("tree-data attach[handle:" UINT64FMT
+				"] error: %d,%s",
+				p_node->vd_handle(), attach_rc,
+				m_stTreeData.get_err_msg());
+		return (-1);
+	}
+
+	const int get_rc = m_stTreeData.get_tree_data(job_op);
+	if (get_rc != 0) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "get tree data error");
+		log4cplus_error("tree-data get[handle:" UINT64FMT
+				"] error: %d,%s",
+				p_node->vd_handle(), get_rc,
+				m_stTreeData.get_err_msg());
+		return get_rc;
+	}
+
+	/* Trace the node id and the row count seen after the read. */
+	log4cplus_debug("node[id:%u] ,Get Count is %d", p_node->node_id(),
+			m_stTreeData.total_rows());
+	return (0);
+}
+
+int TreeDataProcess::expand_node(DTCJobOperation &job_op, Node *p_node) /* No-op here: both arguments are ignored and 0 is always returned. */
+{
+	return 0;
+}
+
+/*
+ * Attach to the node's tree data and return TreeData::get_dirty_row_count().
+ * Returns -1 when the attach fails (err_message_ is set and the failure logged).
+ */
+int TreeDataProcess::get_dirty_row_count(DTCJobOperation &job_op, Node *p_node)
+{
+	const int attach_rc = m_stTreeData.do_attach(p_node->vd_handle());
+	if (attach_rc == 0)
+		return m_stTreeData.get_dirty_row_count();
+
+	snprintf(err_message_, sizeof(err_message_), "attach data error");
+	log4cplus_error("tree-data attach[handle:" UINT64FMT
+			"] error: %d,%s",
+			p_node->vd_handle(), attach_rc,
+			m_stTreeData.get_err_msg());
+	return (-1);
+}
+
+/*
+ * Attach m_stTreeData to the node's data and, when affected_data is given,
+ * initialize it with the tree's key (second do_init argument is 0).
+ * Returns 0 on success, -1 when the attach fails, -2 when the init fails.
+ */
+int TreeDataProcess::attach_data(Node *p_node, RawData *affected_data)
+{
+	const int attach_rc = m_stTreeData.do_attach(p_node->vd_handle());
+	if (attach_rc != 0) {
+		log4cplus_error("tree-data attach[handle:" UINT64FMT
+				"] error: %d,%s",
+				p_node->vd_handle(), attach_rc,
+				m_stTreeData.get_err_msg());
+		return (-1);
+	}
+
+	/* Nothing more to prepare when the caller passes no result buffer. */
+	if (affected_data == NULL)
+		return (0);
+
+	const int init_rc = affected_data->do_init(m_stTreeData.key(), 0);
+	if (init_rc != 0) {
+		log4cplus_error("tree-data init error: %d,%s", init_rc,
+				affected_data->get_err_msg());
+		return (-2);
+	}
+
+	return (0);
+}
+
+/*
+ * Copy every row of the node's tree into pstRows: attach_data() prepares
+ * pstRows, then TreeData::copy_raw_all() fills it.
+ * Returns 0 on success, -1 when attach_data() fails, -2 when the copy fails.
+ */
+int TreeDataProcess::get_node_all_rows_count(Node *p_node, RawData *pstRows)
+{
+	rows_count_ = 0;
+	dirty_rows_count_ = 0;
+
+	const int attach_rc = attach_data(p_node, pstRows);
+	if (attach_rc != 0) {
+		log4cplus_error("attach data error: %d", attach_rc);
+		return (-1);
+	}
+
+	const int copy_rc = m_stTreeData.copy_raw_all(pstRows);
+	if (copy_rc == 0)
+		return (0);
+
+	log4cplus_error("copy data error: %d,%s", copy_rc,
+			m_stTreeData.get_err_msg());
+	return (-2);
+}
+
+/*
+ * Delete the rows selected by job_op from the tree data of p_node.
+ *
+ * The affected-row count is the drop in total_rows() across the delete; it is
+ * written to job_op.resultInfo only when the result has no count yet or the
+ * request condition carries a timestamp type. rows_count_ and
+ * dirty_rows_count_ then take the deltas reported by the tree.
+ *
+ * Returns 0 on success, -1 when the attach fails, otherwise the code from
+ * delete_tree_data().
+ *
+ * Changes from the previous version: the counters are reset on entry (as the
+ * sibling operations do) so a failed delete does not leave values from an
+ * earlier operation, and the failure messages now say "delete" rather than
+ * the copy-pasted "get".
+ */
+int TreeDataProcess::do_delete(DTCJobOperation &job_op, Node *p_node,
+			       RawData *affected_data)
+{
+	int iRet;
+	log4cplus_debug("Delete TreeData start! ");
+
+	rows_count_ = 0;
+	dirty_rows_count_ = 0;
+
+	iRet = m_stTreeData.do_attach(p_node->vd_handle());
+	if (iRet != 0) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "attach data error");
+		log4cplus_error("tree-data attach[handle:" UINT64FMT
+				"] error: %d,%s",
+				p_node->vd_handle(), iRet,
+				m_stTreeData.get_err_msg());
+		return (-1);
+	}
+
+	int start = m_stTreeData.total_rows();
+
+	iRet = m_stTreeData.delete_tree_data(job_op);
+	if (iRet != 0) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "delete tree data error");
+		log4cplus_error("tree-data delete[handle:" UINT64FMT
+				"] error: %d,%s",
+				p_node->vd_handle(), iRet,
+				m_stTreeData.get_err_msg());
+		return iRet;
+	}
+
+	int iAffectRows = start - m_stTreeData.total_rows();
+	if (iAffectRows > 0) {
+		if (job_op.resultInfo.affected_rows() == 0 ||
+		    (job_op.request_condition() &&
+		     job_op.request_condition()->has_type_timestamp())) {
+			job_op.resultInfo.set_affected_rows(iAffectRows);
+		}
+	}
+
+	rows_count_ = m_stTreeData.get_increase_row_count();
+	dirty_rows_count_ = m_stTreeData.get_increase_dirty_row_count();
+
+	log4cplus_debug("node[id:%u] ,Get Count is %d", p_node->node_id(),
+			m_stTreeData.total_rows());
+	return (0);
+}
+
+/*
+ * Rebuild the tree data of p_node from the result set carried by job_op.
+ *
+ * Steps: destroy the node's current data (if any), initialize a fresh tree
+ * for job_op.packed_key(), then insert every row of job_op.result. When the
+ * job's table differs from the node table (current or "new" definition,
+ * depending on DTCColExpand), each row is copied into the node layout first.
+ * For matching rows inside the limit window the last-access field (laid) is
+ * stamped with job_op.Timestamp().
+ *
+ * Memory pressure: an EC_NO_MEM from init is retried once after a purge; an
+ * EC_NO_MEM from insert_row is retried after a purge at most twice over the
+ * whole call.
+ *
+ * Returns 0 on success, -1 when the old data cannot be destroyed, -2 when the
+ * init fails, -3 when a row cannot be fetched, -4 when an insert fails.
+ *
+ * Changes from the previous version:
+ *  - the init retry uses the same do_init(packed_key) call as the first
+ *    attempt (it used a different overload only on the retry path);
+ *  - the insert-failure path pushes m_stTreeData.need_size() to the black
+ *    list, as the init-failure path does; it used to push a local that was
+ *    never updated and therefore always 0.
+ */
+int TreeDataProcess::do_replace_all(DTCJobOperation &job_op, Node *p_node)
+{
+	log4cplus_debug("do_replace_all start! ");
+	DTCTableDefinition *stpNodeTab, *stpTaskTab;
+	RowValue *stpNodeRow;
+
+	int iRet;
+	int try_purge_count = 0;
+	int laid = job_op.flag_no_cache() || job_op.count_only() ?
+			   -1 :
+			   job_op.table_definition()->lastacc_field_id();
+	int matchedCount = 0;
+	int limitStart = 0;
+	int limitStop = 0x10000000;
+
+	stpTaskTab = job_op.table_definition();
+	if (DTCColExpand::instance()->is_expanding())
+		stpNodeTab =
+			TableDefinitionManager::instance()->get_new_table_def();
+	else
+		stpNodeTab =
+			TableDefinitionManager::instance()->get_cur_table_def();
+	RowValue stNodeRow(stpNodeTab);
+	stpNodeRow = &stNodeRow;
+	stpNodeRow->default_value();
+
+	if (laid > 0 && job_op.requestInfo.limit_count() > 0) {
+		limitStart = job_op.requestInfo.limit_start();
+		if (job_op.requestInfo.limit_start() > 0x10000000) {
+			laid = -1;
+		} else if (job_op.requestInfo.limit_count() < 0x10000000) {
+			limitStop =
+				limitStart + job_op.requestInfo.limit_count();
+		}
+	}
+
+	rows_count_ = 0;
+	dirty_rows_count_ = 0;
+
+	if (p_node->vd_handle() != INVALID_HANDLE) {
+		iRet = destroy_data(p_node);
+		if (iRet != 0)
+			return (-1);
+	}
+
+	iRet = m_stTreeData.do_init(job_op.packed_key());
+	if (iRet == EC_NO_MEM) {
+		if (p_buffer_pond_->try_purge_size(m_stTreeData.need_size(),
+						   *p_node) == 0)
+			iRet = m_stTreeData.do_init(job_op.packed_key());
+	}
+	if (iRet != EC_NO_MEM)
+		p_node->vd_handle() = m_stTreeData.get_handle();
+
+	if (iRet != 0) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "raw-data init error: %s", m_stTreeData.get_err_msg());
+		/* Record the required size for the black list. */
+		job_op.push_black_list_size(m_stTreeData.need_size());
+		p_buffer_pond_->purge_node(job_op.packed_key(), *p_node);
+		return (-2);
+	}
+
+	if (job_op.result != NULL) {
+		ResultSet *pstResultSet = job_op.result;
+		for (int i = 0; i < pstResultSet->total_rows(); i++) {
+			RowValue *pstRow = pstResultSet->_fetch_row();
+			if (pstRow == NULL) {
+				log4cplus_debug("%s!",
+						"call fetch_row func error");
+				p_buffer_pond_->purge_node(job_op.packed_key(),
+							   *p_node);
+				m_stTreeData.destory();
+				return (-3);
+			}
+
+			if (laid > 0 && job_op.compare_row(*pstRow)) {
+				if (matchedCount >= limitStart &&
+				    matchedCount < limitStop) {
+					(*pstRow)[laid].s64 =
+						job_op.Timestamp();
+				}
+				matchedCount++;
+			}
+
+			if (stpTaskTab != stpNodeTab) {
+				stpNodeRow->Copy(pstRow);
+			} else {
+				stpNodeRow = pstRow;
+			}
+
+			/* Insert the current row. */
+			iRet = m_stTreeData.insert_row(*stpNodeRow, KeyCompare,
+						       false);
+
+			/*
+			 * Out of memory: purge and retry, but only while fewer
+			 * than two purge attempts have been spent in this call.
+			 * Once the budget is exhausted iRet stays EC_NO_MEM and
+			 * falls through to the failure handling below.
+			 */
+			if (iRet == EC_NO_MEM && try_purge_count < 2) {
+				++try_purge_count;
+				if (p_buffer_pond_->try_purge_size(
+					    m_stTreeData.need_size(),
+					    *p_node) == 0)
+					iRet = m_stTreeData.insert_row(
+						*stpNodeRow, KeyCompare, false);
+			}
+			if (iRet != EC_NO_MEM)
+				p_node->vd_handle() = m_stTreeData.get_handle();
+
+			/* Current row inserted successfully. */
+			if (0 == iRet)
+				continue;
+
+			snprintf(
+				err_message_, sizeof(err_message_),
+				"raw-data insert row error: ret=%d,err=%s, cnt=%d",
+				iRet, m_stTreeData.get_err_msg(),
+				try_purge_count);
+			/* Record the required size for the black list. */
+			job_op.push_black_list_size(m_stTreeData.need_size());
+			p_buffer_pond_->purge_node(job_op.packed_key(),
+						   *p_node);
+			m_stTreeData.destory();
+			return (-4);
+		}
+
+		rows_count_ += pstResultSet->total_rows();
+	}
+
+	history_rowsize.push(m_stTreeData.total_rows());
+
+	return (0);
+}
+
+/*
+ * Replace the row(s) selected by job_op inside the tree data of p_node; when
+ * the tree reports zero affected rows the replacement row is inserted instead.
+ *
+ * EC_NO_MEM from replace_tree_data() or insert_row() is retried once after
+ * try_purge_size(). dirty_rows_count_ is decremented when the replaced row
+ * carried OPER_DIRTY and incremented for async writes; rows_count_ is
+ * incremented on the insert path.
+ *
+ * Returns 0 on success, -1 when p_node is NULL, the do_attach() or
+ * replace_tree_data() code on those failures, and -3 when the insert fails.
+ *
+ * Changes from the previous version:
+ *  - a NULL p_node is rejected up front. The old code routed NULL into a
+ *    branch that immediately dereferenced it (*p_node, p_node->vd_handle()),
+ *    so that branch could never run without undefined behaviour;
+ *  - uchRowFlags starts at 0 so the OPER_DIRTY test never reads an
+ *    indeterminate value if replace_tree_data() leaves it untouched.
+ */
+int TreeDataProcess::do_replace(DTCJobOperation &job_op, Node *p_node,
+				RawData *affected_data, bool async,
+				bool setrows = false)
+{
+	int iRet;
+	log4cplus_debug("Replace TreeData start! ");
+
+	rows_count_ = 0;
+	dirty_rows_count_ = 0;
+
+	if (p_node == NULL) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "replace tree data error: null node");
+		log4cplus_error("replace tree data error: null node");
+		return (-1);
+	}
+
+	iRet = m_stTreeData.do_attach(p_node->vd_handle());
+	if (iRet != 0) {
+		log4cplus_error("attach tree data error: %d", iRet);
+		return (iRet);
+	}
+
+	unsigned char uchRowFlags = 0;
+	iRet = m_stTreeData.replace_tree_data(job_op, p_node, affected_data,
+					      async, uchRowFlags, setrows);
+	if (iRet == EC_NO_MEM) {
+		if (p_buffer_pond_->try_purge_size(m_stTreeData.need_size(),
+						   *p_node) == 0)
+			iRet = m_stTreeData.replace_tree_data(
+				job_op, p_node, affected_data, async,
+				uchRowFlags, setrows);
+	}
+
+	if (iRet != 0) {
+		log4cplus_error("tree-data replace[handle:" UINT64FMT
+				"] error: %d,%s",
+				p_node->vd_handle(), iRet,
+				m_stTreeData.get_err_msg());
+		return iRet;
+	}
+
+	if (uchRowFlags & OPER_DIRTY)
+		dirty_rows_count_--;
+	if (async)
+		dirty_rows_count_++;
+
+	uint64_t ullAffectedRows = m_stTreeData.get_affectedrows();
+	if (ullAffectedRows == 0) /* nothing replaced: insert the row */
+	{
+		DTCTableDefinition *stpTaskTab;
+		RowValue *stpNewRow;
+		stpTaskTab = job_op.table_definition();
+		RowValue stNewRow(stpTaskTab);
+		stNewRow.default_value();
+		stpNewRow = &stNewRow;
+		job_op.update_row(*stpNewRow); /* fetch the row to replace with */
+		iRet = m_stTreeData.insert_row(*stpNewRow, KeyCompare,
+					       async); /* add it to the cache */
+		if (iRet == EC_NO_MEM) {
+			if (p_buffer_pond_->try_purge_size(
+				    m_stTreeData.need_size(), *p_node) == 0)
+				iRet = m_stTreeData.insert_row(
+					*stpNewRow, KeyCompare, async);
+		}
+		if (iRet != EC_NO_MEM)
+			p_node->vd_handle() = m_stTreeData.get_handle();
+
+		if (iRet != 0) {
+			snprintf(err_message_, sizeof(err_message_),
+				 "raw-data replace row error: %d, %s", iRet,
+				 m_stTreeData.get_err_msg());
+			/* Record the required size for the black list. */
+			job_op.push_black_list_size(m_stTreeData.need_size());
+			return (-3);
+		}
+		rows_count_++;
+		ullAffectedRows++;
+		if (async)
+			dirty_rows_count_++;
+	}
+	if (async == true || setrows == true) {
+		job_op.resultInfo.set_affected_rows(ullAffectedRows);
+	} else if (ullAffectedRows != job_op.resultInfo.affected_rows()) {
+		/* The cache and the helper disagree on the number of updated rows. */
+		log4cplus_debug(
+			"unequal affected rows, cache[%lld], helper[%lld]",
+			(long long)ullAffectedRows,
+			(long long)job_op.resultInfo.affected_rows());
+	}
+
+	return 0;
+}
+
+/*
+ * Apply the update described by job_op to the tree data of p_node.
+ * The tree's affected-row counter is cleared first; an EC_NO_MEM from
+ * update_tree_data() is retried once after try_purge_size().
+ * For async or setrows requests the cache's affected-row count is written to
+ * job_op.resultInfo; otherwise a mismatch with the existing count is only
+ * logged at debug level.
+ * Returns 0 on success, otherwise the do_attach()/update_tree_data() code.
+ */
+int TreeDataProcess::do_update(DTCJobOperation &job_op, Node *p_node,
+			       RawData *affected_data, bool async,
+			       bool setrows = false)
+{
+	log4cplus_debug("Update TreeData start! ");
+
+	rows_count_ = 0;
+	dirty_rows_count_ = 0;
+
+	int rc = m_stTreeData.do_attach(p_node->vd_handle());
+	if (rc != 0) {
+		log4cplus_error("attach tree data error: %d", rc);
+		return (rc);
+	}
+
+	m_stTreeData.set_affected_rows(0);
+
+	rc = m_stTreeData.update_tree_data(job_op, p_node, affected_data,
+					   async, setrows);
+	if (rc == EC_NO_MEM &&
+	    p_buffer_pond_->try_purge_size(m_stTreeData.need_size(),
+					   *p_node) == 0) {
+		rc = m_stTreeData.update_tree_data(job_op, p_node,
+						   affected_data, async,
+						   setrows);
+	}
+
+	if (rc != 0) {
+		log4cplus_error("tree-data update[handle:" UINT64FMT
+				"] error: %d,%s",
+				p_node->vd_handle(), rc,
+				m_stTreeData.get_err_msg());
+		return rc;
+	}
+
+	const uint64_t cache_affected = m_stTreeData.get_affectedrows();
+	dirty_rows_count_ = m_stTreeData.get_increase_dirty_row_count();
+
+	if (async || setrows) {
+		job_op.resultInfo.set_affected_rows(cache_affected);
+		return (0);
+	}
+
+	if (cache_affected != job_op.resultInfo.affected_rows()) {
+		/* The cache and the helper disagree on the number of updated rows. */
+		log4cplus_debug(
+			"unequal affected rows, cache[%lld], helper[%lld]",
+			(long long)cache_affected,
+			(long long)job_op.resultInfo.affected_rows());
+	}
+
+	return (0);
+}
+
+/*
+ * Flush the tree data of p_node through TreeData::flush_tree_data(), which
+ * reports the number of affected rows in affected_count.
+ * dirty_rows_count_ afterwards holds the dirty-row delta reported by the tree.
+ * Returns 0 on success, -1 when the attach fails, otherwise the flush code.
+ */
+int TreeDataProcess::do_flush(DTCFlushRequest *flush_req, Node *p_node,
+			      unsigned int &affected_count)
+{
+	log4cplus_debug("do_flush start! ");
+
+	rows_count_ = 0;
+	dirty_rows_count_ = 0;
+
+	const int attach_rc = m_stTreeData.do_attach(p_node->vd_handle());
+	if (attach_rc != 0) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "attach data error");
+		log4cplus_error("tree-data attach[handle:" UINT64FMT
+				"] error: %d,%s",
+				p_node->vd_handle(), attach_rc,
+				m_stTreeData.get_err_msg());
+		return (-1);
+	}
+
+	const int flush_rc =
+		m_stTreeData.flush_tree_data(flush_req, p_node, affected_count);
+	if (flush_rc != 0) {
+		snprintf(err_message_, sizeof(err_message_),
+			 "flush tree data error");
+		log4cplus_error("tree-data flush[handle:" UINT64FMT
+				"] error: %d,%s",
+				p_node->vd_handle(), flush_rc,
+				m_stTreeData.get_err_msg());
+		return flush_rc;
+	}
+
+	dirty_rows_count_ = m_stTreeData.get_increase_dirty_row_count();
+
+	return (0);
+}
+
+/*
+ * Purge a node: run do_flush() and, when it succeeds, set rows_count_ to the
+ * negated total row count of the tree.
+ * Returns 0 on success, otherwise the code returned by do_flush().
+ */
+int TreeDataProcess::do_purge(DTCFlushRequest *flush_req, Node *p_node,
+			      unsigned int &affected_count)
+{
+	log4cplus_debug("do_purge start! ");
+
+	const int flush_rc = do_flush(flush_req, p_node, affected_count);
+	if (flush_rc == 0) {
+		/* 0LL forces the subtraction into signed 64-bit arithmetic. */
+		rows_count_ = 0LL - m_stTreeData.total_rows();
+		return 0;
+	}
+
+	return (flush_rc);
+}
+
+/*
+ * Release the tree data referenced by p_node and mark the node's handle
+ * INVALID_HANDLE. A node whose handle is already invalid is left untouched.
+ * Always returns 0; the result of do_attach() is not checked.
+ */
+int TreeDataProcess::destroy_data(Node *p_node)
+{
+	if (p_node->vd_handle() != INVALID_HANDLE) {
+		/* Work on a scratch TreeData so m_stTreeData stays as it is. */
+		TreeData doomed(p_mallocator_);
+		doomed.do_attach(p_node->vd_handle());
+		doomed.destory();
+		p_node->vd_handle() = INVALID_HANDLE;
+	}
+	return 0;
+}

+ 197 - 0
src/core/tree/tree_data_process.h

@@ -0,0 +1,197 @@
+/*
+* Copyright [2021] JD.com, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef TREE_DATA_PROCESS_H
+#define TREE_DATA_PROCESS_H
+
+#include "buffer_def.h"
+#include "protocol.h"
+#include "value.h"
+#include "field/field.h"
+#include "section.h"
+#include "table/table_def.h"
+#include "task/task_request.h"
+#include "stat_dtc.h"
+#include "tree_data.h"
+#include "node.h"
+#include "data_process.h"
+#include "buffer_pond.h"
+#include "namespace.h"
+#include "stat_manager.h"
+#include "data_chunk.h"
+
+DTC_BEGIN_NAMESPACE
+
+class DTCJobOperation;
+class DTCFlushRequest;
+
+/*
+ * DataProcess implementation for nodes whose rows are kept in a tree
+ * (t-tree) structure.
+ *
+ * Most operations are defined out of line; where the definition is not
+ * visible from this header the comments below only restate what the
+ * signature shows and are marked NOTE(review).
+ */
+class TreeDataProcess : public DataProcess {
+    private:
+	// Working tree object; attached to a node's handle before use.
+	TreeData m_stTreeData;
+	DTCTableDefinition *p_table_;
+	// Allocator handed to TreeData instances.
+	MallocBase *p_mallocator_;
+	BufferPond *p_buffer_pond_;
+	UpdateMode update_mode_;
+	// Row-count delta, exposed through get_increase_row_count().
+	int64_t rows_count_;
+	// Dirty-row-count delta, exposed through
+	// get_increase_dirty_row_count().
+	int64_t dirty_rows_count_;
+	// Text returned by get_err_msg().
+	char err_message_[200];
+
+	unsigned int nodeSizeLimit; // -DEBUG-
+
+	// NOTE(review): presumably statistics samples for data size and
+	// row size - confirm against the constructor.
+	StatSample history_datasize;
+	StatSample history_rowsize;
+
+    protected:
+	// NOTE(review): defined out of line; presumably attaches the tree
+	// data of p_node and prepares affected_data - confirm.
+	int attach_data(Node *p_node, RawData *affected_data);
+
+    public:
+	/*
+	 * Switch this processor and its working tree to another allocator.
+	 * The old and new allocator pointers are written to the debug log.
+	 */
+	void change_mallocator(MallocBase *pstMalloc)
+	{
+		log4cplus_debug("oring mallc: %p, new mallc: %p", p_mallocator_,
+				pstMalloc);
+		p_mallocator_ = pstMalloc;
+		m_stTreeData.change_mallocator(pstMalloc);
+	}
+
+	TreeDataProcess(MallocBase *pstMalloc,
+			DTCTableDefinition *p_table_definition_,
+			BufferPond *pstPool, const UpdateMode *pstUpdateMode);
+	~TreeDataProcess();
+
+	/* Return the internal error-message buffer. */
+	const char *get_err_msg() { return err_message_; }
+
+	/* Intentionally empty: the argument is ignored. */
+	void set_insert_mode(EUpdateMode iMode) {}
+
+	/* Intentionally empty: the argument is ignored. */
+	void set_insert_order(int iOrder) {}
+
+	/*
+	 * Get the expire time of p_node into `expire`.
+	 * NOTE(review): defined out of line; return convention not
+	 * visible here.
+	 */
+	int get_expire_time(DTCTableDefinition *t, Node *p_node,
+			    uint32_t &expire);
+
+	/*
+	 * NOTE(review): defined out of line; presumably grows the storage
+	 * of p_node for the given job - confirm.
+	 */
+	int expand_node(DTCJobOperation &job_op, Node *p_node);
+
+	/*
+	 * NOTE(review): defined out of line; presumably reports the number
+	 * of dirty rows of p_node - confirm.
+	 */
+	int get_dirty_row_count(DTCJobOperation &job_op, Node *p_node);
+
+	/* Return the current row-count delta (rows_count_). */
+	int64_t get_increase_row_count() { return rows_count_; }
+
+	/* Return the current dirty-row-count delta (dirty_rows_count_). */
+	int64_t get_increase_dirty_row_count() { return dirty_rows_count_; }
+
+	/*
+	 * NOTE(review): defined out of line; presumably counts the rows of
+	 * p_node, with pstRows used as output - confirm.
+	 */
+	int get_node_all_rows_count(Node *p_node, RawData *pstRows);
+
+	/*
+	 * NOTE(review): defined out of line; presumably deletes the rows
+	 * selected by job_op from p_node - confirm.
+	 */
+	int do_delete(DTCJobOperation &job_op, Node *p_node,
+		      RawData *affected_data);
+
+	/*
+	 * NOTE(review): defined out of line; job-driven overload of
+	 * do_replace_all - confirm semantics.
+	 */
+	int do_replace_all(DTCJobOperation &job_op, Node *p_node);
+
+	/*
+	 * NOTE(review): defined out of line; presumably replaces rows of
+	 * p_node according to job_op - confirm the meaning of
+	 * async / setrows.
+	 */
+	int do_replace(DTCJobOperation &job_op, Node *p_node,
+		       RawData *affected_data, bool async, bool setrows);
+
+	/*
+	 * NOTE(review): defined out of line; presumably updates rows of
+	 * p_node according to job_op - confirm the meaning of
+	 * async / setrows.
+	 */
+	int do_update(DTCJobOperation &job_op, Node *p_node,
+		      RawData *affected_data, bool async, bool setrows);
+
+	/*
+	 * Flush the tree data of p_node.
+	 * NOTE(review): defined out of line; return convention not
+	 * visible from this declaration.
+	 */
+	int do_flush(DTCFlushRequest *flush_req, Node *p_node,
+		     unsigned int &affected_count);
+
+	/*
+	 * Purge the tree data of p_node.
+	 * NOTE(review): defined out of line; return convention not
+	 * visible from this declaration.
+	 */
+	int do_purge(DTCFlushRequest *flush_req, Node *p_node,
+		     unsigned int &affected_count);
+
+	/*
+	 * Append data in the t-tree.
+	 * NOTE(review): defined out of line; confirm the meaning of
+	 * isDirty / setrows.
+	 */
+	int do_append(DTCJobOperation &job_op, Node *p_node,
+		      RawData *affected_data, bool isDirty, bool setrows);
+
+	/* Replace data in the t-tree with new_data. */
+	int do_replace_all(Node *p_node, RawData *new_data);
+
+	/* Get data from the t-tree for the given job. */
+	int do_get(DTCJobOperation &job_op, Node *p_node);
+
+	/* Destroy the t-tree that belongs to p_node. */
+	int destroy_data(Node *p_node);
+};
+
+DTC_END_NAMESPACE
+
+#endif