uri.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517
  1. // Licensed to the Apache Software Foundation (ASF) under one
  2. // or more contributor license agreements. See the NOTICE file
  3. // distributed with this work for additional information
  4. // regarding copyright ownership. The ASF licenses this file
  5. // to you under the Apache License, Version 2.0 (the
  6. // "License"); you may not use this file except in compliance
  7. // with the License. You may obtain a copy of the License at
  8. //
  9. // http://www.apache.org/licenses/LICENSE-2.0
  10. //
  11. // Unless required by applicable law or agreed to in writing,
  12. // software distributed under the License is distributed on an
  13. // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  14. // KIND, either express or implied. See the License for the
  15. // specific language governing permissions and limitations
  16. // under the License.
  17. #include <ctype.h> // isalnum
  18. #include <unordered_set>
  19. #include "brpc/log.h"
  20. #include "brpc/details/http_parser.h" // http_parser_parse_url
  21. #include "brpc/uri.h" // URI
  22. namespace brpc {
  23. URI::URI()
  24. : _port(-1)
  25. , _query_was_modified(false)
  26. , _initialized_query_map(false)
  27. {}
  28. URI::~URI() {
  29. }
  30. void URI::Clear() {
  31. _st.reset();
  32. _port = -1;
  33. _query_was_modified = false;
  34. _initialized_query_map = false;
  35. _host.clear();
  36. _path.clear();
  37. _user_info.clear();
  38. _fragment.clear();
  39. _scheme.clear();
  40. _query.clear();
  41. _query_map.clear();
  42. }
  43. void URI::Swap(URI &rhs) {
  44. _st.swap(rhs._st);
  45. std::swap(_port, rhs._port);
  46. std::swap(_query_was_modified, rhs._query_was_modified);
  47. std::swap(_initialized_query_map, rhs._initialized_query_map);
  48. _host.swap(rhs._host);
  49. _path.swap(rhs._path);
  50. _user_info.swap(rhs._user_info);
  51. _fragment.swap(rhs._fragment);
  52. _scheme.swap(rhs._scheme);
  53. _query.swap(rhs._query);
  54. _query_map.swap(rhs._query_map);
  55. }
  56. // Parse queries, which is case-sensitive
  57. static void ParseQueries(URI::QueryMap& query_map, const std::string &query) {
  58. query_map.clear();
  59. if (query.empty()) {
  60. return;
  61. }
  62. for (QuerySplitter sp(query.c_str()); sp; ++sp) {
  63. if (!sp.key().empty()) {
  64. if (!query_map.initialized()) {
  65. query_map.init(URI::QUERY_MAP_INITIAL_BUCKET);
  66. }
  67. std::string key(sp.key().data(), sp.key().size());
  68. std::string value(sp.value().data(), sp.value().size());
  69. query_map[key] = value;
  70. }
  71. }
  72. }
  73. inline const char* SplitHostAndPort(const char* host_begin,
  74. const char* host_end,
  75. int* port) {
  76. uint64_t port_raw = 0;
  77. uint64_t multiply = 1;
  78. for (const char* q = host_end - 1; q > host_begin; --q) {
  79. if (*q >= '0' && *q <= '9') {
  80. port_raw += (*q - '0') * multiply;
  81. multiply *= 10;
  82. } else if (*q == ':') {
  83. *port = static_cast<int>(port_raw);
  84. return q;
  85. } else {
  86. break;
  87. }
  88. }
  89. *port = -1;
  90. return host_end;
  91. }
  92. // valid characters in URL
  93. // https://datatracker.ietf.org/doc/html/rfc3986#section-2.1
  94. // https://datatracker.ietf.org/doc/html/rfc3986#section-2.3
  95. // https://datatracker.ietf.org/doc/html/rfc3986#section-2.4
  96. // space is not allowed by rfc3986, but allowed by brpc
  97. static bool is_valid_char(const char* p) {
  98. static const std::unordered_set<char> other_valid_char = {
  99. ':', '/', '?', '#', '[', ']', '@', '!', '$', '&',
  100. '\'', '(', ')'/ '*', '+', ',', ';', '='/ '-', '.',
  101. '_', '~', '%', ' '
  102. };
  103. return (isalnum(*p) || other_valid_char.find(*p) != other_valid_char.end());
  104. }
  105. static bool is_all_spaces(const char* p) {
  106. for (; *p == ' '; ++p) {}
  107. return !*p;
  108. }
// Per-character actions used by the URL scanning loops in this file.
// CONTINUE: the character needs no attention, keep scanning.
const char URI_PARSE_CONTINUE = 0;
// CHECK: the character (' ', ':' or '@' in the table below) needs a closer
// look by the scanning loop.
const char URI_PARSE_CHECK = 1;
// BREAK: the character ('\0', '#', '/' or '?' in the table below) stops the
// scan.
const char URI_PARSE_BREAK = 2;
// 256-entry action table covering character codes -128..127. The first
// entry corresponds to code -128; the comment at the start of each row gives
// the code of that row's first entry.
static const char g_url_parsing_fast_action_map_raw[] = {
0/*-128*/, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0/*-118*/, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0/*-108*/, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0/*-98*/, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0/*-88*/, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0/*-78*/, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0/*-68*/, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0/*-58*/, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0/*-48*/, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0/*-38*/, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0/*-28*/, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0/*-18*/, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0/*-8*/, 0, 0, 0, 0, 0, 0, 0, URI_PARSE_BREAK/*\0*/, 0,
0/*2*/, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0/*12*/, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0/*22*/, 0, 0, 0, 0, 0, 0, 0, 0, 0,
URI_PARSE_CHECK/* */, 0, 0, URI_PARSE_BREAK/*#*/, 0, 0, 0, 0, 0, 0,
0/*42*/, 0, 0, 0, 0, URI_PARSE_BREAK/*/*/, 0, 0, 0, 0,
0/*52*/, 0, 0, 0, 0, 0, URI_PARSE_CHECK/*:*/, 0, 0, 0,
0/*62*/, URI_PARSE_BREAK/*?*/, URI_PARSE_CHECK/*@*/, 0, 0, 0, 0, 0, 0, 0,
0/*72*/, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0/*82*/, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0/*92*/, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0/*102*/, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0/*112*/, 0, 0, 0, 0, 0
};
// View of the table biased by 128 so that it can be indexed directly with a
// character code. Valid indexes are -128..127 only: an index above 127
// (e.g. a byte >= 0x80 read through an unsigned char type) is out of bounds.
static const char* const g_url_parsing_fast_action_map =
g_url_parsing_fast_action_map_raw + 128;
  142. // This implementation is faster than http_parser_parse_url() and allows
  143. // ignoring of scheme("http://")
  144. int URI::SetHttpURL(const char* url) {
  145. Clear();
  146. const char* p = url;
  147. // skip heading blanks
  148. if (*p == ' ') {
  149. for (++p; *p == ' '; ++p) {}
  150. }
  151. const char* start = p;
  152. // Find end of host, locate scheme and user_info during the searching
  153. bool need_scheme = true;
  154. bool need_user_info = true;
  155. for (; true; ++p) {
  156. const char action = g_url_parsing_fast_action_map[(int)*p];
  157. if (action == URI_PARSE_CONTINUE) {
  158. continue;
  159. }
  160. if (action == URI_PARSE_BREAK) {
  161. break;
  162. }
  163. if (!is_valid_char(p)) {
  164. _st.set_error(EINVAL, "invalid character in url");
  165. return -1;
  166. } else if (*p == ':') {
  167. if (p[1] == '/' && p[2] == '/' && need_scheme) {
  168. need_scheme = false;
  169. _scheme.assign(start, p - start);
  170. p += 2;
  171. start = p + 1;
  172. }
  173. } else if (*p == '@') {
  174. if (need_user_info) {
  175. need_user_info = false;
  176. _user_info.assign(start, p - start);
  177. start = p + 1;
  178. }
  179. } else if (*p == ' ') {
  180. if (!is_all_spaces(p + 1)) {
  181. _st.set_error(EINVAL, "Invalid space in url");
  182. return -1;
  183. }
  184. break;
  185. }
  186. }
  187. const char* host_end = SplitHostAndPort(start, p, &_port);
  188. _host.assign(start, host_end - start);
  189. if (*p == '/') {
  190. start = p; //slash pointed by p is counted into _path
  191. ++p;
  192. for (; *p && *p != '?' && *p != '#'; ++p) {
  193. if (*p == ' ') {
  194. if (!is_all_spaces(p + 1)) {
  195. _st.set_error(EINVAL, "Invalid space in path");
  196. return -1;
  197. }
  198. break;
  199. }
  200. }
  201. _path.assign(start, p - start);
  202. }
  203. if (*p == '?') {
  204. start = ++p;
  205. for (; *p && *p != '#'; ++p) {
  206. if (*p == ' ') {
  207. if (!is_all_spaces(p + 1)) {
  208. _st.set_error(EINVAL, "Invalid space in query");
  209. return -1;
  210. }
  211. break;
  212. }
  213. }
  214. _query.assign(start, p - start);
  215. }
  216. if (*p == '#') {
  217. start = ++p;
  218. for (; *p; ++p) {
  219. if (*p == ' ') {
  220. if (!is_all_spaces(p + 1)) {
  221. _st.set_error(EINVAL, "Invalid space in fragment");
  222. return -1;
  223. }
  224. break;
  225. }
  226. }
  227. _fragment.assign(start, p - start);
  228. }
  229. return 0;
  230. }
  231. int ParseURL(const char* url,
  232. std::string* scheme_out, std::string* host_out, int* port_out) {
  233. const char* p = url;
  234. // skip heading blanks
  235. if (*p == ' ') {
  236. for (++p; *p == ' '; ++p) {}
  237. }
  238. const char* start = p;
  239. // Find end of host, locate scheme and user_info during the searching
  240. bool need_scheme = true;
  241. bool need_user_info = true;
  242. for (; true; ++p) {
  243. const char action = g_url_parsing_fast_action_map[(int)*p];
  244. if (action == URI_PARSE_CONTINUE) {
  245. continue;
  246. }
  247. if (action == URI_PARSE_BREAK) {
  248. break;
  249. }
  250. if (*p == ':') {
  251. if (p[1] == '/' && p[2] == '/' && need_scheme) {
  252. need_scheme = false;
  253. if (scheme_out) {
  254. scheme_out->assign(start, p - start);
  255. }
  256. p += 2;
  257. start = p + 1;
  258. }
  259. } else if (*p == '@') {
  260. if (need_user_info) {
  261. need_user_info = false;
  262. start = p + 1;
  263. }
  264. } else if (*p == ' ') {
  265. if (!is_all_spaces(p + 1)) {
  266. LOG(ERROR) << "Invalid space in url=`" << url << '\'';
  267. return -1;
  268. }
  269. break;
  270. }
  271. }
  272. int port = -1;
  273. const char* host_end = SplitHostAndPort(start, p, &port);
  274. if (host_out) {
  275. host_out->assign(start, host_end - start);
  276. }
  277. if (port_out) {
  278. *port_out = port;
  279. }
  280. return 0;
  281. }
  282. void URI::Print(std::ostream& os) const {
  283. if (!_host.empty()) {
  284. if (!_scheme.empty()) {
  285. os << _scheme << "://";
  286. } else {
  287. os << "http://";
  288. }
  289. // user_info is passed by Authorization
  290. os << _host;
  291. if (_port >= 0) {
  292. os << ':' << _port;
  293. }
  294. }
  295. PrintWithoutHost(os);
  296. }
  297. void URI::PrintWithoutHost(std::ostream& os) const {
  298. if (_path.empty()) {
  299. // According to rfc2616#section-5.1.2, the absolute path
  300. // cannot be empty; if none is present in the original URI, it MUST
  301. // be given as "/" (the server root).
  302. os << '/';
  303. } else {
  304. os << _path;
  305. }
  306. if (_initialized_query_map && _query_was_modified) {
  307. bool is_first = true;
  308. for (QueryIterator it = QueryBegin(); it != QueryEnd(); ++it) {
  309. if (is_first) {
  310. is_first = false;
  311. os << '?';
  312. } else {
  313. os << '&';
  314. }
  315. os << it->first;
  316. if (!it->second.empty()) {
  317. os << '=' << it->second;
  318. }
  319. }
  320. } else if (!_query.empty()) {
  321. os << '?' << _query;
  322. }
  323. if (!_fragment.empty()) {
  324. os << '#' << _fragment;
  325. }
  326. }
  327. void URI::InitializeQueryMap() const {
  328. if (!_query_map.initialized()) {
  329. CHECK_EQ(0, _query_map.init(QUERY_MAP_INITIAL_BUCKET));
  330. }
  331. ParseQueries(_query_map, _query);
  332. _query_was_modified = false;
  333. _initialized_query_map = true;
  334. }
  335. void URI::AppendQueryString(std::string* query, bool append_question_mark) const {
  336. if (_query_map.empty()) {
  337. return;
  338. }
  339. if (append_question_mark) {
  340. query->push_back('?');
  341. }
  342. QueryIterator it = QueryBegin();
  343. query->append(it->first);
  344. if (!it->second.empty()) {
  345. query->push_back('=');
  346. query->append(it->second);
  347. }
  348. ++it;
  349. for (; it != QueryEnd(); ++it) {
  350. query->push_back('&');
  351. query->append(it->first);
  352. if (!it->second.empty()) {
  353. query->push_back('=');
  354. query->append(it->second);
  355. }
  356. }
  357. }
  358. void URI::GenerateH2Path(std::string* h2_path) const {
  359. h2_path->reserve(_path.size() + _query.size() + _fragment.size() + 3);
  360. h2_path->clear();
  361. if (_path.empty()) {
  362. h2_path->push_back('/');
  363. } else {
  364. h2_path->append(_path);
  365. }
  366. if (_initialized_query_map && _query_was_modified) {
  367. AppendQueryString(h2_path, true);
  368. } else if (!_query.empty()) {
  369. h2_path->push_back('?');
  370. h2_path->append(_query);
  371. }
  372. if (!_fragment.empty()) {
  373. h2_path->push_back('#');
  374. h2_path->append(_fragment);
  375. }
  376. }
  377. void URI::SetHostAndPort(const std::string& host) {
  378. const char* const host_begin = host.c_str();
  379. const char* host_end =
  380. SplitHostAndPort(host_begin, host_begin + host.size(), &_port);
  381. _host.assign(host_begin, host_end - host_begin);
  382. }
  383. void URI::SetH2Path(const char* h2_path) {
  384. _path.clear();
  385. _query.clear();
  386. _fragment.clear();
  387. _query_was_modified = false;
  388. _initialized_query_map = false;
  389. _query_map.clear();
  390. const char* p = h2_path;
  391. const char* start = p;
  392. for (; *p && *p != '?' && *p != '#'; ++p) {}
  393. _path.assign(start, p - start);
  394. if (*p == '?') {
  395. start = ++p;
  396. for (; *p && *p != '#'; ++p) {}
  397. _query.assign(start, p - start);
  398. }
  399. if (*p == '#') {
  400. start = ++p;
  401. for (; *p; ++p) {}
  402. _fragment.assign(start, p - start);
  403. }
  404. }
  405. QueryRemover::QueryRemover(const std::string* str)
  406. : _query(str)
  407. , _qs(str->data(), str->data() + str->size())
  408. , _iterated_len(0)
  409. , _removed_current_key_value(false)
  410. , _ever_removed(false) {
  411. }
  412. QueryRemover& QueryRemover::operator++() {
  413. if (!_qs) {
  414. return *this;
  415. }
  416. if (!_ever_removed) {
  417. _qs.operator++();
  418. return *this;
  419. }
  420. if (!_removed_current_key_value) {
  421. _modified_query.resize(_iterated_len);
  422. if (!_modified_query.empty()) {
  423. _modified_query.push_back('&');
  424. _iterated_len += 1;
  425. }
  426. _modified_query.append(key_and_value().data(), key_and_value().length());
  427. _iterated_len += key_and_value().length();
  428. } else {
  429. _removed_current_key_value = false;
  430. }
  431. _qs.operator++();
  432. return *this;
  433. }
  434. QueryRemover QueryRemover::operator++(int) {
  435. QueryRemover tmp = *this;
  436. operator++();
  437. return tmp;
  438. }
  439. void QueryRemover::remove_current_key_and_value() {
  440. _removed_current_key_value = true;
  441. if (!_ever_removed) {
  442. _ever_removed = true;
  443. size_t offset = key().data() - _query->data();
  444. size_t len = offset - ((offset > 0 && (*_query)[offset - 1] == '&')? 1: 0);
  445. _modified_query.append(_query->data(), len);
  446. _iterated_len += len;
  447. }
  448. return;
  449. }
  450. std::string QueryRemover::modified_query() {
  451. if (!_ever_removed) {
  452. return *_query;
  453. }
  454. size_t offset = key().data() - _query->data();
  455. // find out where the remaining string starts
  456. if (_removed_current_key_value) {
  457. size_t size = key_and_value().length();
  458. while (offset + size < _query->size() && (*_query)[offset + size] == '&') {
  459. // ingore unnecessary '&'
  460. size += 1;
  461. }
  462. offset += size;
  463. }
  464. _modified_query.resize(_iterated_len);
  465. if (offset < _query->size()) {
  466. if (!_modified_query.empty()) {
  467. _modified_query.push_back('&');
  468. }
  469. _modified_query.append(*_query, offset, std::string::npos);
  470. }
  471. return _modified_query;
  472. }
  473. void append_query(std::string *query_string,
  474. const butil::StringPiece& key,
  475. const butil::StringPiece& value) {
  476. if (!query_string->empty() && butil::back_char(*query_string) != '?') {
  477. query_string->push_back('&');
  478. }
  479. query_string->append(key.data(), key.size());
  480. query_string->push_back('=');
  481. query_string->append(value.data(), value.size());
  482. }
  483. } // namespace brpc