tc_xml.cpp 18 KB


  1. /**
  2. * Tencent is pleased to support the open source community by making Tars available.
  3. *
  4. * Copyright (C) 2016 THL A29 Limited, a Tencent company. All rights reserved.
  5. *
  6. * Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  7. * in compliance with the License. You may obtain a copy of the License at
  8. *
  9. * https://opensource.org/licenses/BSD-3-Clause
  10. *
  11. * Unless required by applicable law or agreed to in writing, software distributed
  12. * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  13. * CONDITIONS OF ANY KIND, either express or implied. See the License for the
  14. * specific language governing permissions and limitations under the License.
  15. */
  16. #include "util/tc_xml.h"
  17. #include "util/tc_common.h"
  18. #include <math.h>
  19. #include <sstream>
  20. #include <iostream>
  21. #include <iomanip>
  22. namespace tars
  23. {
  // Helper macros for the parser below. They expect a variable named `reader`
  // (offering get()/skip()) to be in scope at the point of use. They expand to
  // complete statements, so call sites may or may not add a trailing ';'.

  // Advances the reader past consecutive whitespace characters.
  #define FILTER_SPACE while(isspace((int)reader.get())) {reader.skip();}
  // Advances the reader past the characters of a node name. The byte is
  // converted to unsigned char before indexing: with a signed `char`, bytes
  // >= 0x80 (e.g. UTF-8 encoded names) would otherwise yield a negative index.
  #define FILTER_NODENAME while(lookup_node_name[(unsigned char)reader.get()]) {reader.skip();}
  // Raises a TC_Xml_Exception carrying the given message.
  #define XML_PARSE_ERROR(what) { throw TC_Xml_Exception(what); }

  // Node name lookup: 1 for every byte that may appear in a node name,
  // 0 for the terminators (space \n \r \t / > ? \0).
  const unsigned char lookup_node_name[256] =
  {
      // 0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
         0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,  // 0
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 1
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,  // 2
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,  // 3
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 4
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 5
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 6
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 7
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 8
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 9
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // A
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // B
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // C
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // D
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // E
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1   // F
  };
  48. //// Text (i.e. PCDATA) (anything but < \0)
  49. //const unsigned char lookup_text[256] =
  50. //{
  51. // // 0 1 2 3 4 5 6 7 8 9 A B C D E F
  52. // 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0
  53. // 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1
  54. // 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2
  55. // 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, // 3
  56. // 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4
  57. // 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5
  58. // 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6
  59. // 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7
  60. // 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8
  61. // 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9
  62. // 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A
  63. // 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B
  64. // 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C
  65. // 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D
  66. // 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E
  67. // 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F
  68. //};
  // Character-data lookup used when whitespace normalization is off:
  // 1 marks a byte that can be copied verbatim, 0 marks the three bytes that
  // need attention -- '\0' (0x00), '&' (0x26) and '<' (0x3C).
  const unsigned char lookup_text_pure_no_ws[256] =
  {
      /* 0x00 */ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      /* 0x20 */ 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
      /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      /* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      /* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      /* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      /* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      /* 0xE0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      /* 0xF0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
  };
  // Digit values for numeric character references, indexed by byte:
  // '0'-'9' map to 0-9, 'A'-'F' and 'a'-'f' map to 10-15, and every other
  // byte maps to 255, which marks the end of the digit run.
  const unsigned char lookup_digits[256] =
  {
      /* 0x00 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      /* 0x10 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      /* 0x20 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      /* 0x30 */   0,   1,   2,   3,   4,   5,   6,   7,   8,   9, 255, 255, 255, 255, 255, 255,
      /* 0x40 */ 255,  10,  11,  12,  13,  14,  15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      /* 0x50 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      /* 0x60 */ 255,  10,  11,  12,  13,  14,  15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      /* 0x70 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      /* 0x80 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      /* 0x90 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      /* 0xA0 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      /* 0xB0 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      /* 0xC0 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      /* 0xD0 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      /* 0xE0 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      /* 0xF0 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
  };
  112. XmlValuePtr TC_Xml::getValue(const string& str)
  113. {
  114. BufferXmlReader reader;
  115. XmlValueObjPtr p = new XmlValueObj();
  116. // Parse BOM, if any
  117. reader.setBuffer(str.c_str(), str.length());
  118. if ((unsigned char)reader.get(0) == 0xEF &&
  119. (unsigned char)reader.get(1) == 0xBB &&
  120. (unsigned char)reader.get(2) == 0xBF)
  121. {
  122. reader._pos += 3; // Skup utf-8 bom
  123. }
  124. FILTER_SPACE;
  125. // Parse and append new child
  126. if (reader.get() == '<' && reader.get(1) == '?')
  127. {
  128. reader.skip(2);
  129. ignoreDeclaration(reader);
  130. }
  131. FILTER_SPACE;
  132. return getNode(reader);
  133. }
  134. XmlValuePtr TC_Xml::getValue(const vector<char>& buf)
  135. {
  136. BufferXmlReader reader;
  137. XmlValueObjPtr p = new XmlValueObj();
  138. // Parse BOM, if any
  139. reader.setBuffer(buf);
  140. if ((unsigned char)reader.get(0) == 0xEF &&
  141. (unsigned char)reader.get(1) == 0xBB &&
  142. (unsigned char)reader.get(2) == 0xBF)
  143. {
  144. reader._pos += 3; // Skup utf-8 bom
  145. }
  146. FILTER_SPACE;
  147. // Parse and append new child
  148. if (reader.get() == '<' && reader.get(1) == '?')
  149. {
  150. reader.skip(2);
  151. ignoreDeclaration(reader);
  152. }
  153. FILTER_SPACE;
  154. return getNode(reader);
  155. }
  156. XmlValuePtr TC_Xml::getNode(BufferXmlReader& reader, const string& nodename)
  157. {
  158. XmlValueObjPtr p = new XmlValueObj();
  159. while(1)
  160. {
  161. // 开始符号
  162. if (!reader.expect('<'))
  163. {
  164. return getValue(reader);
  165. }
  166. // 判断是否是注释
  167. if (ignoreComment(reader))
  168. {
  169. continue;
  170. }
  171. // CDATA
  172. if (reader.get(0) == '!' && reader.get(1) == '[' && reader.get(2) == 'C' && reader.get(3) == 'D' &&
  173. reader.get(4) == 'A' && reader.get(5) == 'T' && reader.get(6) == 'A' && reader.get(7) == '[')
  174. {
  175. reader.skip(8);
  176. return getCdata(reader);
  177. }
  178. // 判断是否是node结束符
  179. if (isEndNode(reader, nodename))
  180. {
  181. break;
  182. }
  183. // 取node名称
  184. FILTER_SPACE;
  185. size_t pos = reader.pos();
  186. FILTER_NODENAME;
  187. string name = string(reader._buf + pos, reader.pos() - pos);
  188. while(reader.read() != '>');
  189. if (reader.get(-2) == '/')
  190. {
  191. XmlValuePtr s = new XmlValueString();
  192. insertArray(name, s, p);
  193. FILTER_SPACE;
  194. continue;
  195. }
  196. // 取Node内容
  197. FILTER_SPACE;
  198. XmlValuePtr q = getNode(reader, name);
  199. insertArray(name, q, p);
  200. // 遇到尾部就退出
  201. FILTER_SPACE;
  202. if (reader.hasEnd())
  203. {
  204. break;
  205. }
  206. }
  207. if (p->value.size() == 0) // node内容为空直接返回空字符对象
  208. {
  209. XmlValuePtr ss = new XmlValueString();
  210. return ss;
  211. }
  212. return p;
  213. }
  214. bool TC_Xml::isEndNode(BufferXmlReader& reader, const string& nodename)
  215. {
  216. if (reader.get() == '/' && reader.get(1) != '>' && !nodename.empty())
  217. {
  218. size_t lastpos = reader.pos();
  219. size_t start = 0;
  220. while (isspace(reader.get(start++)));
  221. size_t end = start;
  222. while(lookup_node_name[(int)reader.get(++end)]);
  223. string backname = string(reader._buf + lastpos + start, end - start);
  224. if (backname == nodename)
  225. {
  226. while(reader.read() != '>');
  227. return true;
  228. }
  229. }
  230. return false;
  231. }
  232. XmlValueStringPtr TC_Xml::getCdata(BufferXmlReader& reader)
  233. {
  234. size_t pos = reader.pos();
  235. XmlValueStringPtr p = new XmlValueString(true);
  236. while (reader.get() != ']' || reader.get(1) != ']' || reader.get(2) != '>')
  237. {
  238. reader.skip(1);
  239. }
  240. p->value = string(reader._buf + pos, reader.pos() - pos);
  241. reader.skip(3);
  242. while(reader.read() != '>');
  243. return p;
  244. }
  245. XmlValueStringPtr TC_Xml::getValue(BufferXmlReader& reader)
  246. {
  247. XmlValueStringPtr p = new XmlValueString();
  248. FILTER_SPACE
  249. while (lookup_text_pure_no_ws[(int)reader.get()])
  250. {
  251. if (reader.get() != '&')
  252. {
  253. p->value.append(1, reader.read());
  254. }
  255. if (reader.get(1) == 'g' && reader.get(2) == 't' && reader.get(3) == ';')
  256. {
  257. p->value.append(1, '>');
  258. reader.skip(4);
  259. continue;
  260. }
  261. if (reader.get(1) == 'l' && reader.get(2) == 't' && reader.get(3) == ';')
  262. {
  263. p->value.append(1, '<');
  264. reader.skip(4);
  265. continue;
  266. }
  267. if (reader.get(1) == 'a' && reader.get(2) == 'm' && reader.get(3) == 'p' && reader.get(4) == ';')
  268. {
  269. p->value.append(1, '&');
  270. reader.skip(5);
  271. continue;
  272. }
  273. if (reader.get(1) == 'a' && reader.get(2) == 'p' && reader.get(3) == 'o' && reader.get(4) == 's' && reader.get(5) == ';')
  274. {
  275. p->value.append(1, '\'');
  276. reader.skip(6);
  277. continue;
  278. }
  279. if (reader.get(1) == 'q' && reader.get(2) == 'u' && reader.get(3) == 'o' && reader.get(4) == 's' && reader.get(5) == ';')
  280. {
  281. p->value.append(1, '"');
  282. reader.skip(6);
  283. continue;
  284. }
  285. // 中文转码
  286. if (reader.get(1) == '#')
  287. {
  288. unsigned long code = 0;
  289. if (reader.get(2) == 'x')
  290. {
  291. reader.skip(3);
  292. unsigned char digit = reader.get();
  293. while (lookup_digits[digit] != 0xFF)
  294. {
  295. code = code * 16 + digit;
  296. digit = (unsigned char)reader.read();
  297. }
  298. }
  299. else
  300. {
  301. reader.skip(2);
  302. unsigned char digit = reader.get();
  303. while (lookup_digits[digit] != 0xFF)
  304. {
  305. code = code * 10 + digit;
  306. digit = (unsigned char)reader.read();
  307. }
  308. }
  309. if (reader.read() != ';')
  310. {
  311. XML_PARSE_ERROR("expected ;");
  312. }
  313. // Insert UTF8 sequence
  314. char text[8] = {0};
  315. if (code < 0x80) // 1 byte sequence
  316. {
  317. text[0] = static_cast<char>(code);
  318. }
  319. else if (code < 0x800) // 2 byte sequence
  320. {
  321. text[1] = static_cast<char>((code | 0x80) & 0xBF); code >>= 6;
  322. text[0] = static_cast<char>(code | 0xc0);
  323. }
  324. else if (code < 0x10000) // 3 byte sequence
  325. {
  326. text[2] = static_cast<char>((code | 0x80) & 0xBF); code >>= 6;
  327. text[1] = static_cast<char>((code | 0x80) & 0xBF); code >>= 6;
  328. text[0] = static_cast<char>(code | 0xE0);
  329. }
  330. else if (code < 0x110000) // 4 byte sequence
  331. {
  332. text[3] = static_cast<unsigned char>((code | 0x80) & 0xBF); code >>= 6;
  333. text[2] = static_cast<unsigned char>((code | 0x80) & 0xBF); code >>= 6;
  334. text[1] = static_cast<unsigned char>((code | 0x80) & 0xBF); code >>= 6;
  335. text[0] = static_cast<unsigned char>(code | 0xF0);
  336. }
  337. else // Invalid, only codes up to 0x10FFFF are allowed in Unicode
  338. {
  339. XML_PARSE_ERROR("invalid numeric character entity");
  340. }
  341. p->value.append(text);
  342. }
  343. }
  344. while(reader.read() != '>');
  345. return p;
  346. }
  347. void TC_Xml::insertArray(const string& name, XmlValuePtr& v, XmlValueObjPtr& p)
  348. {
  349. if (p->value.find(name) == p->value.end())
  350. {
  351. p->value[name] = v;
  352. }
  353. else if (p->value[name]->getType() == v->getType())
  354. {
  355. XmlValueArrayPtr array = new XmlValueArray();
  356. array->push_back(p->value[name]);
  357. array->push_back(v);
  358. p->value[name] = array;
  359. }
  360. else if (p->value[name]->getType() == eXmlTypeArray)
  361. {
  362. XmlValueArrayPtr array = XmlValueArrayPtr::dynamicCast(p->value[name]);
  363. if (array->value.size() > 0 && array->value[0]->getType() == v->getType())
  364. {
  365. array->push_back(v);
  366. p->value[name] = array;
  367. }
  368. }
  369. }
  370. bool TC_Xml::ignoreComment(BufferXmlReader& reader)
  371. {
  372. if (reader.get() == '!' && reader.get(1) == '-' && reader.get(2) == '-')
  373. {
  374. reader.skip(3);
  375. while (1)
  376. {
  377. if (reader.read() == '-' && reader.read() == '-' && reader.read() == '>')
  378. {
  379. return true;
  380. }
  381. }
  382. }
  383. return false;
  384. }
  385. void TC_Xml::ignoreDeclaration(BufferXmlReader& reader)
  386. {
  387. if ((reader.get(0) == 'x' || reader.get(0) == 'X') &&
  388. (reader.get(1) == 'm' || reader.get(1) == 'M') &&
  389. (reader.get(2) == 'l' || reader.get(2) == 'L') &&
  390. isspace(reader.get(3)))
  391. {
  392. // '<?xml ' - xml declaration, ignore it
  393. reader.skip(4);
  394. while (1)
  395. {
  396. if (reader.read() == '?' && reader.read() == '>')
  397. {
  398. return;
  399. }
  400. }
  401. }
  402. XML_PARSE_ERROR("unexpected xml head")
  403. }
  404. string TC_Xml::writeValue(const XmlValuePtr & p, bool bHead)
  405. {
  406. ostringstream os;
  407. os << (bHead ? "<?xml version='1.0' encoding='utf-8'?>" : "");
  408. if(!p || p->getType() != eXmlTypeObj)
  409. {
  410. return os.str();
  411. }
  412. writeObj(os, XmlValueObjPtr::dynamicCast(p));
  413. return os.str();
  414. }
  415. void TC_Xml::writeValue(const XmlValuePtr& p, vector<char>& buf, bool bHead)
  416. {
  417. if(!p || p->getType() != eXmlTypeObj)
  418. {
  419. return;
  420. }
  421. ostringstream os;
  422. os << (bHead ? "<?xml version='1.0' encoding='utf-8'?>" : "");
  423. writeObj(os, XmlValueObjPtr::dynamicCast(p));
  424. string s = os.str();
  425. buf.assign(s.begin(), s.end());
  426. }
  427. void TC_Xml::writeObj(std::ostream& os, const XmlValuePtr& p)
  428. {
  429. if (p->getType() != eXmlTypeObj)
  430. {
  431. XML_PARSE_ERROR("not support but xmlobj")
  432. }
  433. os << "\n";
  434. XmlValueObjPtr q = XmlValueObjPtr::dynamicCast(p);
  435. for (map<string, XmlValuePtr>::const_iterator it = q->value.begin(); it != q->value.end(); it++)
  436. {
  437. switch (it->second->getType())
  438. {
  439. case eXmlTypeString:
  440. os << "<" << it->first << ">";
  441. writeString(os, it->second);
  442. os << "</" << it->first << ">\n";
  443. break;
  444. case eXmlTypeArray:
  445. writeArray(os, it->first, it->second);
  446. break;
  447. case eXmlTypeObj:
  448. default:
  449. os << "<" << it->first << ">";
  450. writeObj(os, it->second);
  451. os << "</" << it->first << ">\n";
  452. }
  453. }
  454. }
  455. void TC_Xml::writeString(std::ostream& os, const XmlValuePtr& p)
  456. {
  457. XmlValueStringPtr q = XmlValueStringPtr::dynamicCast(p);
  458. if (q->cdata)
  459. {
  460. os << "<![CDATA[" << q->value << "]]>";
  461. return;
  462. }
  463. writeEChar(os, q->value);
  464. }
  465. void TC_Xml::writeArray(std::ostream& os, const string& name, const XmlValuePtr& p)
  466. {
  467. XmlValueArrayPtr q = XmlValueArrayPtr::dynamicCast(p);
  468. for (size_t i = 0; i < q->value.size(); i++)
  469. {
  470. os << "<" << name << ">";
  471. if (q->value[i]->getType() == eXmlTypeString)
  472. {
  473. writeString(os, q->value[i]);
  474. }
  475. else
  476. {
  477. writeObj(os, q->value[i]);
  478. }
  479. os << "</" << name << ">\r\n";
  480. }
  481. }
  482. void TC_Xml::writeEChar(std::ostream& os, const string& data)
  483. {
  484. string s(data);
  485. s = TC_Common::replace(s, "<", "&lt;");
  486. s = TC_Common::replace(s, ">", "&lt;");
  487. s = TC_Common::replace(s, "\'", "&apos;");
  488. s = TC_Common::replace(s, "\"", "&quot;");
  489. os << s;
  490. }
  491. //Xml里面定义的空白字符
  492. bool TC_Xml::isspace(char c)
  493. {
  494. if(c == ' ' || c == '\t' || c == '\r' || c == '\n')
  495. return true;
  496. return false;
  497. }
  498. }