tc_encoder.cpp 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285
  1. /**
  2. * Tencent is pleased to support the open source community by making Tars available.
  3. *
  4. * Copyright (C) 2016 THL A29 Limited, a Tencent company. All rights reserved.
  5. *
  6. * Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  7. * in compliance with the License. You may obtain a copy of the License at
  8. *
  9. * https://opensource.org/licenses/BSD-3-Clause
  10. *
  11. * Unless required by applicable law or agreed to in writing, software distributed
  12. * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  13. * CONDITIONS OF ANY KIND, either express or implied. See the License for the
  14. * specific language governing permissions and limitations under the License.
  15. */
  16. #include "util/tc_platform.h"
  17. #if TARGET_PLATFORM_LINUX || TARGET_PLATFORM_IOS
  18. #include <iconv.h>
  19. #endif
  20. #include <string.h>
  21. #include "util/tc_encoder.h"
  22. #include <iostream>
  23. namespace tars
  24. {
  25. #if TARGET_PLATFORM_WINDOWS
  26. #include <windows.h>
  27. void TC_Encoder::gbk2utf8(const string &sIn, vector<string> &vtStr,int mode)
  28. {
  29. string sOut;
  30. for(string::size_type pos = 0; pos < sIn.length(); ++pos)
  31. {
  32. if((unsigned char)sIn[pos] < 0x80)
  33. {
  34. //单字节(GBK: 0x00-0x7F)
  35. sOut += sIn[pos];
  36. }
  37. else
  38. {
  39. //双字节
  40. char pIn[128] = "\0";
  41. strncpy(pIn, sIn.c_str() + pos, 2);
  42. sOut = gbk2utf8(pIn);
  43. ++pos;
  44. }
  45. vtStr.push_back(sOut);
  46. }
  47. }
  48. std::string TC_Encoder::gbk2utf8(const std::string &strGbk,int mode)
  49. {
  50. string outUtf8 = "";
  51. int n = MultiByteToWideChar(CP_ACP, 0, strGbk.c_str(), -1, NULL, 0);
  52. WCHAR *str1 = new WCHAR[n];
  53. MultiByteToWideChar(CP_ACP, 0, strGbk.c_str(), -1,str1, n);
  54. n = WideCharToMultiByte(CP_UTF8, 0, str1, -1, NULL, 0, NULL, NULL);
  55. char *str2 = new char[n];
  56. WideCharToMultiByte(CP_UTF8, 0, str1, -1, str2, n, NULL, NULL);
  57. outUtf8 = str2;
  58. delete[]str1;
  59. str1 = NULL;
  60. delete[]str2;
  61. str2 = NULL;
  62. return outUtf8;
  63. }
  64. std::string TC_Encoder::utf82gbk(const std::string &strUtf8,int mode)
  65. {
  66. string outGBK = "";
  67. int n = MultiByteToWideChar(CP_UTF8, 0, strUtf8.c_str(), -1, NULL, 0);
  68. WCHAR *str1 = new WCHAR[n];
  69. MultiByteToWideChar(CP_UTF8, 0, strUtf8.c_str(), -1, str1, n);
  70. n = WideCharToMultiByte(CP_ACP, 0, str1, -1, NULL, 0, NULL, NULL);
  71. char *str2 = new char[n];
  72. WideCharToMultiByte(CP_ACP, 0, str1, -1, str2, n, NULL, NULL);
  73. outGBK = str2;
  74. delete[] str1;
  75. str1 = NULL;
  76. delete[] str2;
  77. str2 = NULL;
  78. return outGBK;
  79. }
  80. #else
  81. string TC_Encoder::gbk2utf8(const string &sIn,int mode)
  82. {
  83. iconv_t cd;
  84. switch(mode){
  85. case TC_Encoder::ICONV_TRANSLIT:
  86. cd = iconv_open("UTF-8//TRANSLIT", "GBK");
  87. break;
  88. case TC_Encoder::ICONV_IGNORE:
  89. cd = iconv_open("UTF-8//IGNORE", "GBK");
  90. break;
  91. default:
  92. cd = iconv_open("UTF-8", "GBK");
  93. break;
  94. }
  95. if (cd == (iconv_t)-1){
  96. THROW_EXCEPTION_SYSCODE(TC_Encoder_Exception, "[TC_Encoder::gbk2utf8] iconv_open error");
  97. }
  98. string sOut;
  99. size_t bufsize = sIn.size()*2+1;
  100. char* buf = new char[bufsize];
  101. char* pOut = buf;
  102. size_t isize = sIn.length();
  103. size_t osize = bufsize;
  104. char* pIn = (char*)sIn.c_str();
  105. size_t ret = iconv(cd, &pIn, &isize, &pOut, &osize);
  106. if((size_t)-1 == ret && TC_Encoder::ICONV_NORMAL == mode){
  107. iconv_close(cd);
  108. delete []buf;
  109. THROW_EXCEPTION_SYSCODE(TC_Encoder_Exception, "[TC_Encoder::gbk2utf8] iconv error");
  110. return sOut;
  111. }
  112. iconv_close(cd);
  113. buf[bufsize-osize]=0;
  114. sOut.assign(buf);
  115. delete []buf;
  116. return sOut;
  117. }
  118. void TC_Encoder::gbk2utf8(const string &sIn, vector<string> &vtStr,int mode){
  119. string out = TC_Encoder::gbk2utf8(sIn,mode);
  120. for (size_t i = 0; i < out.size();) {
  121. unsigned char uc = out[i];
  122. if (uc >= 0xF0 && (i + 4) <= out.size()) {
  123. vtStr.push_back(out.substr(i, 4));
  124. i += 4;
  125. continue;
  126. }
  127. if (uc >= 0xE0 && (i + 3) <= out.size()) {
  128. vtStr.push_back(out.substr(i,3));
  129. i += 3;
  130. continue;
  131. }
  132. if (uc >= 0xC0 && (i + 2) <= out.size()) {
  133. vtStr.push_back(out.substr(i,2));
  134. i += 2;
  135. continue;
  136. }
  137. if ((i + 1) <= out.size()) {
  138. vtStr.push_back(out.substr(i,1));
  139. i++;
  140. continue;
  141. }
  142. else {
  143. THROW_EXCEPTION_SYSCODE(TC_Encoder_Exception, "[TC_Encoder::gbk2utf8] invalid utf8 string | conversion error");
  144. }
  145. }
  146. }
  147. void TC_Encoder::utf82gbk(char *sOut, int &iMaxOutLen, const char *sIn, int iInLen,int mode)
  148. {
  149. iconv_t cd;
  150. switch(mode){
  151. case TC_Encoder::ICONV_TRANSLIT:
  152. cd = iconv_open("GBK//TRANSLIT", "UTF-8");
  153. break;
  154. case TC_Encoder::ICONV_IGNORE:
  155. cd = iconv_open("GBK//IGNORE", "UTF-8");
  156. break;
  157. default:
  158. cd = iconv_open("GBK", "UTF-8");
  159. break;
  160. }
  161. if (cd == (iconv_t)-1){
  162. THROW_EXCEPTION_SYSCODE(TC_Encoder_Exception, "[TC_Encoder::gbk2utf8] iconv_open error");
  163. }
  164. char * pIn = (char*)sIn;
  165. size_t sizeLeftLen = iMaxOutLen;
  166. size_t sizeInLen = iInLen;
  167. char* pOut = sOut;
  168. size_t ret = iconv(cd, &pIn, &sizeInLen, (char **)&sOut, &sizeLeftLen);
  169. if (ret == (size_t) - 1 && TC_Encoder::ICONV_NORMAL == mode){
  170. iconv_close(cd);
  171. THROW_EXCEPTION_SYSCODE(TC_Encoder_Exception, "[TC_Encoder::utf82gbk] iconv error");
  172. return;
  173. }
  174. iconv_close(cd);
  175. pOut[iMaxOutLen - (int)sizeLeftLen] = '\0';
  176. iMaxOutLen = iMaxOutLen - (int)sizeLeftLen;
  177. }
  178. string TC_Encoder::utf82gbk(const string &sIn,int mode)
  179. {
  180. if(sIn.length() == 0)
  181. {
  182. return "";
  183. }
  184. string sOut;
  185. int iLen = sIn.length() * 2 + 1;
  186. char *pOut = new char[iLen];
  187. try
  188. {
  189. utf82gbk(pOut, iLen, sIn.c_str(), sIn.length(),mode);
  190. }
  191. catch (TC_Encoder_Exception& e)
  192. {
  193. delete[] pOut;
  194. throw e;
  195. }
  196. sOut.assign(pOut, iLen);
  197. delete[] pOut;
  198. return sOut;
  199. }
  200. #endif
  /**
   * Escape a string by expanding two characters into two-character sequences:
   *   f -> t u   (with the defaults: \n -> \r\0)
   *   t -> t t   (with the defaults: \r -> \r\r)
   * Every other character is copied unchanged. When f and t are the same
   * character the first rule wins.
   *
   * @param str input string
   * @param f   character to be replaced
   * @param t   character that introduces both two-character sequences
   * @param u   character written after t to stand for f
   * @return the escaped copy of str
   */
  string TC_Encoder::transTo(const string& str, char f /*='\n'*/, char t /*= '\r'*/, char u /*= '\0'*/)
  {
      string ret;
      // Appending to a fresh string is linear; inserting into a copy of the
      // input shifted the whole tail once per escaped character.
      ret.reserve(str.length());

      for (string::size_type i = 0; i < str.length(); ++i)
      {
          const char c = str[i];
          if (c == f)
          {
              ret += t;
              ret += u;
          }
          else if (c == t)
          {
              ret += t;
              ret += t;
          }
          else
          {
              ret += c;
          }
      }
      return ret;
  }
  /**
   * Collapse two-character sequences introduced by t back into one character:
   *   t u -> f   (with the defaults: \r\0 -> \n)
   *   t t -> t   (with the defaults: \r\r -> \r)
   * Every other character is copied unchanged.
   *
   * Input that does not follow that scheme is handled as follows:
   *   - t followed by any character other than u yields t, and the following
   *     character is consumed;
   *   - a t that is the very last character is copied as-is.
   *
   * @param str escaped input
   * @param f   character produced for the sequence t u
   * @param t   character that introduces a two-character sequence
   * @param u   character that, after t, stands for f
   * @return the unescaped copy of str
   */
  string TC_Encoder::transFrom(const string& str, char f /*= '\n'*/, char t /*= '\r'*/, char u /*= '\0'*/)
  {
      string ret;
      ret.reserve(str.length());

      const string::size_type n = str.length();
      for (string::size_type i = 0; i < n; ++i)
      {
          const char c = str[i];
          if (c != t)
          {
              ret += c;
              continue;
          }

          if (i + 1 >= n)
          {
              // Dangling t at the end: there is no second character to read, so
              // keep it and stop instead of stepping past the end of the input.
              ret += t;
              break;
          }

          ++i; // consume the second character of the sequence
          ret += (str[i] == u) ? f : t;
      }
      return ret;
  }
  244. }