rpc_trace_filter.cc 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526
  1. /*
  2. Copyright (c) 2023 Sogou, Inc.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. #include <stdio.h>
  14. #include <limits.h>
  15. #include "workflow/WFTask.h"
  16. #include "workflow/HttpUtil.h"
  17. #include "rpc_trace_filter.h"
  18. #include "opentelemetry_trace.pb.h"
  19. namespace srpc
  20. {
  21. using namespace opentelemetry::proto::trace::v1;
  22. using namespace opentelemetry::proto::common::v1;
  23. using namespace opentelemetry::proto::resource::v1;
  // OpenTelemetry attribute key under which the value stored in
  // SRPC_COMPONENT is exported (srpc.component -> rpc.system).
  static constexpr const char *SRPC_COMPONENT_OTEL_STR = "rpc.system";
  25. static InstrumentationLibrarySpans *
  26. rpc_span_fill_pb_request(const RPCModuleData& data,
  27. const std::unordered_map<std::string, std::string>& attributes,
  28. TracesData *req)
  29. {
  30. ResourceSpans *rs = req->add_resource_spans();
  31. InstrumentationLibrarySpans *spans = rs->add_instrumentation_library_spans();
  32. Resource *resource = rs->mutable_resource();
  33. KeyValue *attribute;
  34. AnyValue *value;
  35. auto iter = data.find(OTLP_METHOD_NAME);
  36. if (iter != data.end())
  37. {
  38. attribute = resource->add_attributes();
  39. attribute->set_key(OTLP_METHOD_NAME);
  40. value = attribute->mutable_value();
  41. value->set_string_value(iter->second);
  42. }
  43. for (const auto& attr : attributes)
  44. {
  45. KeyValue *attribute = resource->add_attributes();
  46. attribute->set_key(attr.first);
  47. AnyValue *value = attribute->mutable_value();
  48. value->set_string_value(attr.second);
  49. }
  50. // if attributes also set service.name, data takes precedence
  51. iter = data.find(OTLP_SERVICE_NAME);
  52. if (iter != data.end())
  53. {
  54. attribute = resource->add_attributes();
  55. attribute->set_key(OTLP_SERVICE_NAME);
  56. value = attribute->mutable_value();
  57. value->set_string_value(iter->second);
  58. }
  59. return spans;
  60. }
  61. static void rpc_span_fill_pb_span(RPCModuleData& data,
  62. const std::unordered_map<std::string, std::string>& attributes,
  63. InstrumentationLibrarySpans *spans)
  64. {
  65. Span *span = spans->add_spans();
  66. Status *status = span->mutable_status();
  67. KeyValue *attribute;
  68. AnyValue *attr_value;
  69. for (const auto& attr : attributes)
  70. {
  71. attribute = span->add_attributes();
  72. attribute->set_key(attr.first);
  73. attr_value = attribute->mutable_value();
  74. attr_value->set_string_value(attr.second);
  75. }
  76. span->set_span_id(data[SRPC_SPAN_ID].c_str(), SRPC_SPANID_SIZE);
  77. span->set_trace_id(data[SRPC_TRACE_ID].c_str(), SRPC_TRACEID_SIZE);
  78. // name is required and specified in OpenTelemetry semantic conventions.
  79. if (data.find(OTLP_METHOD_NAME) != data.end())
  80. {
  81. span->set_name(data[OTLP_METHOD_NAME]); // for RPC
  82. attribute= span->add_attributes();
  83. attribute->set_key(SRPC_COMPONENT_OTEL_STR); // srpc.component -> rpc.system
  84. attr_value = attribute->mutable_value();
  85. attr_value->set_string_value(data[SRPC_COMPONENT]);
  86. }
  87. else
  88. span->set_name(data[SRPC_HTTP_METHOD]); // for HTTP
  89. // refer to : trace/semantic_conventions/http/#status
  90. int http_status_code = 0;
  91. auto iter = data.find(SRPC_HTTP_STATUS_CODE);
  92. if (iter != data.end())
  93. http_status_code = atoi(iter->second.c_str());
  94. for (const auto& kv : data)
  95. {
  96. const std::string& key = kv.first;
  97. const std::string& val = kv.second;
  98. if (key.compare(SRPC_PARENT_SPAN_ID) == 0)
  99. {
  100. span->set_parent_span_id(val);
  101. }
  102. else if (key.compare(SRPC_SPAN_KIND) == 0)
  103. {
  104. if (val.compare(SRPC_SPAN_KIND_CLIENT) == 0)
  105. {
  106. span->set_kind(Span_SpanKind_SPAN_KIND_CLIENT);
  107. if (http_status_code >= 400)
  108. status->set_code(Status_StatusCode_STATUS_CODE_ERROR);
  109. }
  110. else if (val.compare(SRPC_SPAN_KIND_SERVER) == 0)
  111. {
  112. span->set_kind(Span_SpanKind_SPAN_KIND_SERVER);
  113. if (http_status_code >= 500)
  114. status->set_code(Status_StatusCode_STATUS_CODE_ERROR);
  115. }
  116. }
  117. else if (key.compare(SRPC_START_TIMESTAMP) == 0)
  118. {
  119. span->set_start_time_unix_nano(atoll(data[SRPC_START_TIMESTAMP].data()));
  120. }
  121. else if (key.compare(SRPC_FINISH_TIMESTAMP) == 0)
  122. {
  123. span->set_end_time_unix_nano(atoll(data[SRPC_FINISH_TIMESTAMP].data()));
  124. }
  125. else if (key.compare(SRPC_STATE) == 0)
  126. {
  127. int state = atoi(val.c_str());
  128. if (state == RPCStatusOK)
  129. status->set_code(Status_StatusCode_STATUS_CODE_OK);
  130. else
  131. status->set_code(Status_StatusCode_STATUS_CODE_ERROR);
  132. }
  133. else if (key.compare(WF_TASK_STATE) == 0)
  134. {
  135. int state = atoi(val.c_str());
  136. if (state == WFT_STATE_SUCCESS)
  137. status->set_code(Status_StatusCode_STATUS_CODE_OK);
  138. else
  139. status->set_code(Status_StatusCode_STATUS_CODE_ERROR);
  140. }
  141. else if (key.compare(0, 5, "srpc.") != 0)
  142. {
  143. attribute= span->add_attributes();
  144. attribute->set_key(key);
  145. attr_value = attribute->mutable_value();
  146. size_t len = key.length();
  147. if ((len > 4 && key.substr(len - 4).compare("port") == 0) ||
  148. (len > 5 && key.substr(len - 5).compare("count") == 0) ||
  149. (len > 6 && key.substr(len - 6).compare("length") == 0) ||
  150. key.compare(SRPC_HTTP_STATUS_CODE)== 0)
  151. {
  152. attr_value->set_int_value(atoi(val.c_str()));
  153. }
  154. else
  155. {
  156. attr_value->set_string_value(val);
  157. }
  158. }
  159. }
  160. }
  161. static size_t rpc_span_log_format(RPCModuleData& data, char *str, size_t len)
  162. {
  163. const uint64_t *trace_id = (const uint64_t *)data[SRPC_TRACE_ID].c_str();
  164. const uint64_t *span_id = (const uint64_t *)data[SRPC_SPAN_ID].c_str();
  165. char trace_id_buf[SRPC_TRACEID_SIZE * 2 + 1];
  166. char span_id_buf[SRPC_SPANID_SIZE * 2 + 1];
  167. TRACE_ID_BIN_TO_HEX(trace_id, trace_id_buf);
  168. SPAN_ID_BIN_TO_HEX(span_id, span_id_buf);
  169. size_t ret = snprintf(str, len, "trace_id: %s span_id: %s",
  170. trace_id_buf, span_id_buf);
  171. for (const auto& iter : data)
  172. {
  173. if (strcmp(iter.first.c_str(), SRPC_TRACE_ID) == 0 ||
  174. strcmp(iter.first.c_str(), SRPC_SPAN_ID) == 0 ||
  175. strcmp(iter.first.c_str(), SRPC_FINISH_TIMESTAMP) == 0 ||
  176. strcmp(iter.first.c_str(), SRPC_DURATION) == 0)
  177. {
  178. continue;
  179. }
  180. if (strcmp(iter.first.c_str(), SRPC_PARENT_SPAN_ID) == 0)
  181. {
  182. char parent_span_id_buf[SRPC_SPANID_SIZE * 2 + 1];
  183. span_id = (const uint64_t *)iter.second.c_str();
  184. SPAN_ID_BIN_TO_HEX(span_id, parent_span_id_buf);
  185. ret += snprintf(str + ret, len - ret, " parent_span_id: %s",
  186. parent_span_id_buf);
  187. }
  188. else if (strcmp(iter.first.c_str(), SRPC_START_TIMESTAMP) == 0)
  189. {
  190. ret += snprintf(str + ret, len - ret,
  191. " start_time: %s finish_time: %s duration: %s(ns)",
  192. data[SRPC_START_TIMESTAMP].c_str(),
  193. data[SRPC_FINISH_TIMESTAMP].c_str(),
  194. data[SRPC_DURATION].c_str());
  195. }
  196. else if (strcmp(iter.first.c_str(), SRPC_STATE) == 0 ||
  197. strcmp(iter.first.c_str(), SRPC_ERROR) == 0)
  198. {
  199. ret += snprintf(str + ret, len - ret, " %s: %s",
  200. iter.first.c_str(), iter.second.c_str());
  201. }
  202. /*
  203. else if (strcmp(it.first.c_str(), SRPC_SPAN_LOG) == 0)
  204. {
  205. ret += snprintf(str + ret, len - ret,
  206. "\n%s trace_id: %s span_id: %s"
  207. " timestamp: %s %s",
  208. "[ANNOTATION]",
  209. trace_id_buf,
  210. span_id_buf,
  211. it.first.c_str() + strlen(SRPC_SPAN_LOG) + 1,
  212. it.second.c_str());
  213. }
  214. */
  215. else
  216. {
  217. const char *key = iter.first.c_str();
  218. if (iter.first.compare(0, 5, "srpc.") == 0)
  219. key += 5;
  220. ret += snprintf(str + ret, len - ret, " %s: %s",
  221. key, iter.second.c_str());
  222. }
  223. }
  224. return ret;
  225. }
  226. bool RPCTraceFilterPolicy::collect(RPCModuleData& span)
  227. {
  228. if (span.find(SRPC_TRACE_ID) == span.end())
  229. return false;
  230. long long timestamp = GET_CURRENT_MS();
  231. if (timestamp < this->last_collect_timestamp + this->stat_interval &&
  232. this->spans_interval_count < this->spans_per_interval &&
  233. this->spans_second_count < this->spans_per_sec)
  234. {
  235. this->spans_interval_count++;
  236. this->spans_second_count++;
  237. return true;
  238. }
  239. else if (timestamp >= this->last_collect_timestamp + this->stat_interval &&
  240. this->spans_per_sec)
  241. {
  242. this->spans_interval_count = 0;
  243. if (timestamp / 1000 > this->last_collect_timestamp / 1000) // next second
  244. this->spans_second_count = 0;
  245. this->last_collect_timestamp = timestamp;
  246. if (this->spans_second_count < this->spans_per_sec)
  247. {
  248. this->spans_second_count++;
  249. this->spans_interval_count++;
  250. return true;
  251. }
  252. }
  253. return false;
  254. }
  255. bool RPCTraceFilterPolicy::report(size_t count)
  256. {
  257. long long timestamp = GET_CURRENT_MS();
  258. if (this->last_report_timestamp == 0)
  259. this->last_report_timestamp = timestamp;
  260. if (timestamp > this->last_report_timestamp + (long long)this->report_interval ||
  261. count >= this->report_threshold)
  262. {
  263. this->last_report_timestamp = timestamp;
  264. return true;
  265. }
  266. return false;
  267. }
  268. void RPCTraceLogTask::dispatch()
  269. {
  270. char str[SPAN_LOG_MAX_LENGTH];
  271. rpc_span_log_format(this->span, str, SPAN_LOG_MAX_LENGTH);
  272. fprintf(stderr, "[SPAN_LOG] %s\n", str);
  273. this->subtask_done();
  274. }
  275. SubTask *RPCTraceRedis::create(RPCModuleData& span)
  276. {
  277. auto iter = span.find(SRPC_TRACE_ID);
  278. if (iter == span.end())
  279. return WFTaskFactory::create_empty_task();
  280. auto *task = WFTaskFactory::create_redis_task(this->redis_url,
  281. this->retry_max,
  282. nullptr);
  283. protocol::RedisRequest *req = task->get_req();
  284. char value[SPAN_LOG_MAX_LENGTH];
  285. value[0] = '0';
  286. rpc_span_log_format(span, value, SPAN_LOG_MAX_LENGTH);
  287. req->set_request("SET", { span[SRPC_TRACE_ID], value} );
  288. return task;
  289. }
  290. RPCTraceOpenTelemetry::RPCTraceOpenTelemetry(const std::string& url) :
  291. RPCFilter(RPCModuleTypeTrace),
  292. url(url + OTLP_TRACES_PATH),
  293. redirect_max(OTLP_HTTP_REDIRECT_MAX),
  294. retry_max(OTLP_HTTP_RETRY_MAX),
  295. filter_policy(SPANS_PER_SECOND_DEFAULT,
  296. RPC_REPORT_THREHOLD_DEFAULT,
  297. RPC_REPORT_INTERVAL_DEFAULT),
  298. report_status(false),
  299. report_span_count(0)
  300. {
  301. this->report_req = new TracesData;
  302. }
  303. RPCTraceOpenTelemetry::RPCTraceOpenTelemetry(const std::string& url,
  304. const std::string& path) :
  305. RPCFilter(RPCModuleTypeTrace),
  306. url(url + path),
  307. redirect_max(OTLP_HTTP_REDIRECT_MAX),
  308. retry_max(OTLP_HTTP_RETRY_MAX),
  309. filter_policy(SPANS_PER_SECOND_DEFAULT,
  310. RPC_REPORT_THREHOLD_DEFAULT,
  311. RPC_REPORT_INTERVAL_DEFAULT),
  312. report_status(false),
  313. report_span_count(0)
  314. {
  315. this->report_req = new TracesData;
  316. }
  317. RPCTraceOpenTelemetry::RPCTraceOpenTelemetry(const std::string& url,
  318. const std::string& path,
  319. int redirect_max,
  320. int retry_max,
  321. size_t spans_per_second,
  322. size_t report_threshold,
  323. size_t report_interval) :
  324. RPCFilter(RPCModuleTypeTrace),
  325. url(url + path),
  326. redirect_max(redirect_max),
  327. retry_max(retry_max),
  328. filter_policy(spans_per_second, report_threshold, report_interval),
  329. report_status(false),
  330. report_span_count(0)
  331. {
  332. this->report_req = new TracesData;
  333. }
  334. RPCTraceOpenTelemetry::~RPCTraceOpenTelemetry()
  335. {
  336. delete this->report_req;
  337. }
  338. SubTask *RPCTraceOpenTelemetry::create(RPCModuleData& span)
  339. {
  340. std::string *output = new std::string;
  341. SubTask *next = NULL;
  342. TracesData *req = (TracesData *)this->report_req;
  343. this->mutex.lock();
  344. if (!this->report_status)
  345. next = WFTaskFactory::create_empty_task();
  346. else
  347. {
  348. // fprintf(stderr, "[Trace info to report]\n%s\n\n", req->DebugString().c_str());
  349. req->SerializeToString(output);
  350. this->report_status = false;
  351. this->report_span_count = 0;
  352. req->clear_resource_spans();
  353. this->report_map.clear();
  354. }
  355. this->mutex.unlock();
  356. if (next)
  357. return next;
  358. WFHttpTask *task = WFTaskFactory::create_http_task(this->url,
  359. this->redirect_max,
  360. this->retry_max,
  361. [](WFHttpTask *task) {
  362. /*
  363. protocol::HttpResponse *resp = task->get_resp();
  364. fprintf(stderr, "[Trace report callback] state=%d error=%d\n",
  365. task->get_state(), task->get_error());
  366. if (task->get_state() == WFT_STATE_SUCCESS)
  367. {
  368. fprintf(stderr, "%s %s %s\r\n", resp->get_http_version(),
  369. resp->get_status_code(), resp->get_reason_phrase());
  370. }
  371. */
  372. delete (std::string *)task->user_data;
  373. });
  374. protocol::HttpRequest *http_req = task->get_req();
  375. http_req->set_method(HttpMethodPost);
  376. http_req->add_header_pair("Content-Type", "application/x-protobuf");
  377. task->user_data = output;
  378. http_req->append_output_body_nocopy(output->c_str(), output->length());
  379. return task;
  380. }
  381. void RPCTraceOpenTelemetry::add_attributes(const std::string& key,
  382. const std::string& value)
  383. {
  384. this->mutex.lock();
  385. this->attributes.insert(std::make_pair(key, value));
  386. this->mutex.unlock();
  387. }
  388. void RPCTraceOpenTelemetry::add_span_attributes(const std::string& key,
  389. const std::string& value)
  390. {
  391. this->mutex.lock();
  392. this->span_attributes.insert(std::make_pair(key, value));
  393. this->mutex.unlock();
  394. }
  395. size_t RPCTraceOpenTelemetry::clear_attributes()
  396. {
  397. size_t ret;
  398. this->mutex.lock();
  399. ret = this->attributes.size();
  400. this->attributes.clear();
  401. this->mutex.unlock();
  402. return ret;
  403. }
  404. size_t RPCTraceOpenTelemetry::clear_span_attributes()
  405. {
  406. size_t ret;
  407. this->mutex.lock();
  408. ret = this->span_attributes.size();
  409. this->span_attributes.clear();
  410. this->mutex.unlock();
  411. return ret;
  412. }
  413. bool RPCTraceOpenTelemetry::filter(RPCModuleData& data)
  414. {
  415. std::unordered_map<std::string, google::protobuf::Message *>::iterator it;
  416. InstrumentationLibrarySpans *spans;
  417. std::string service_name;
  418. bool ret;
  419. auto iter = data.find(OTLP_SERVICE_NAME);
  420. if (iter != data.end())
  421. {
  422. service_name = iter->second;
  423. }
  424. else // for HTTP
  425. {
  426. service_name = data[SRPC_COMPONENT] + std::string(".") +
  427. data[SRPC_HTTP_SCHEME];
  428. if (data.find(SRPC_SPAN_KIND_CLIENT) != data.end())
  429. service_name += ".client";
  430. else
  431. service_name += ".server";
  432. }
  433. this->mutex.lock();
  434. if (this->filter_policy.collect(data))
  435. {
  436. ++this->report_span_count;
  437. it = this->report_map.find(service_name);
  438. if (it == this->report_map.end())
  439. {
  440. spans = rpc_span_fill_pb_request(data, this->attributes,
  441. (TracesData *)this->report_req);
  442. this->report_map.insert({service_name, spans});
  443. }
  444. else
  445. spans = (InstrumentationLibrarySpans *)it->second;
  446. rpc_span_fill_pb_span(data, this->span_attributes, spans);
  447. }
  448. ret = this->filter_policy.report(this->report_span_count);
  449. if (ret)
  450. this->report_status = true;
  451. this->mutex.unlock();
  452. return ret;
  453. }
  454. } // end namespace srpc