Unicode.cpp 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162
  1. /***************************************************************************
  2. *
  3. * Project _____ __ ____ _ _
  4. * ( _ ) /__\ (_ _)_| |_ _| |_
  5. * )(_)( /(__)\ )( (_ _)(_ _)
  6. * (_____)(__)(__)(__) |_| |_|
  7. *
  8. *
  9. * Copyright 2018-present, Leonid Stryzhevskyi <lganzzzo@gmail.com>
  10. *
  11. * Licensed under the Apache License, Version 2.0 (the "License");
  12. * you may not use this file except in compliance with the License.
  13. * You may obtain a copy of the License at
  14. *
  15. * http://www.apache.org/licenses/LICENSE-2.0
  16. *
  17. * Unless required by applicable law or agreed to in writing, software
  18. * distributed under the License is distributed on an "AS IS" BASIS,
  19. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  20. * See the License for the specific language governing permissions and
  21. * limitations under the License.
  22. *
  23. ***************************************************************************/
  24. #include "Unicode.hpp"
  25. #if defined(WIN32) || defined(_WIN32)
  26. #include <winsock2.h>
  27. #else
  28. #include <arpa/inet.h>
  29. #endif
  30. namespace oatpp { namespace encoding {
  31. v_buff_size Unicode::getUtf8CharSequenceLength(v_char8 firstByte) {
  32. if(firstByte < 128){
  33. return 1;
  34. }
  35. if((firstByte | 192) != firstByte){
  36. return 0;
  37. }
  38. if((firstByte | 32) != firstByte){
  39. return 2;
  40. } else if((firstByte | 16) != firstByte){
  41. return 3;
  42. } else if((firstByte | 8) != firstByte){
  43. return 4;
  44. } else if((firstByte | 4) != firstByte){
  45. return 5;
  46. } else if((firstByte | 2) != firstByte){
  47. return 6;
  48. } else {
  49. return 0;
  50. }
  51. }
  52. v_buff_size Unicode::getUtf8CharSequenceLengthForCode(v_uint32 code){
  53. if(code < 128) {
  54. return 1;
  55. } else if(code < 0x00000800){
  56. return 2;
  57. } else if(code < 0x00010000){
  58. return 3;
  59. } else if(code < 0x00200000){
  60. return 4;
  61. } else if(code < 0x04000000){
  62. return 5;
  63. } else {
  64. return 6;
  65. }
  66. }
  67. v_int32 Unicode::encodeUtf8Char(const char* sequence, v_buff_size& length){
  68. v_char8 byte = sequence[0];
  69. if(byte > 127){
  70. v_int32 code;
  71. if((byte | 32) != byte){
  72. length = 2;
  73. code = ((31 & byte) << 6) | (sequence[1] & 63);
  74. return code;
  75. } else if((byte | 16) != byte){
  76. code = (15 & byte) << 12;
  77. length = 3;
  78. } else if((byte | 8) != byte){
  79. length = 4;
  80. v_int32 value = *((p_int32)sequence);
  81. code = ((7 & byte) << 18) |
  82. (((value >> 24) & 0xFF) & 63) |
  83. (((value >> 16) & 0xFF) & 63) << 6 |
  84. (((value >> 8) & 0xFF) & 63) << 12;
  85. return code;
  86. } else if((byte | 4) != byte){
  87. code = (3 & byte) << 24;
  88. length = 5;
  89. } else if((byte | 2) != byte){
  90. code = (1 & byte) << 30;
  91. length = 6;
  92. } else {
  93. return -1;
  94. }
  95. v_char8 bitIndex = 0;
  96. for(v_buff_size i = length; i > 1; i--){
  97. code |= (sequence[i - 1] & 63) << bitIndex;
  98. bitIndex += 6;
  99. }
  100. return code;
  101. } else {
  102. length = 1;
  103. return byte;
  104. }
  105. }
  106. v_buff_size Unicode::decodeUtf8Char(v_int32 code, p_char8 buffer) {
  107. if(code >= 0x00000080 && code < 0x00000800){
  108. *((p_int16) buffer) = htons(((((code >> 6) & 31) | 192) << 8) | ((code & 63) | 128));
  109. return 2;
  110. } else if(code >= 0x00000800 && code < 0x00010000){
  111. *((p_int16) buffer) = htons((((( code >> 12 ) & 15) | 224) << 8) |
  112. (((code >> 6 ) & 63) | 128));
  113. buffer[2] = (code & 63) | 128;
  114. return 3;
  115. } else if(code >= 0x00010000 && code < 0x00200000){
  116. *((p_int32) buffer) = htonl(((((code >> 18 ) & 7) | 240) << 24) |
  117. ((((code >> 12 ) & 63) | 128) << 16) |
  118. ((((code >> 6 ) & 63) | 128) << 8) |
  119. (( code & 63) | 128) );
  120. return 4;
  121. } else if(code >= 0x00200000 && code < 0x04000000){
  122. *((p_int32) buffer) = htonl(((((code >> 24 ) & 3) | 248) << 24) |
  123. ((((code >> 18 ) & 63) | 128) << 16) |
  124. ((((code >> 12 ) & 63) | 128) << 8) |
  125. (((code >> 6 ) & 63) | 128));
  126. buffer[4] = (code & 63) | 128;
  127. return 5;
  128. } else if(code >= 0x04000000){
  129. *((p_int32) buffer) = htonl(((((code >> 30 ) & 1) | 252) << 24) |
  130. ((((code >> 24 ) & 63) | 128) << 16) |
  131. ((((code >> 18 ) & 63) | 128) << 8) |
  132. (((code >> 12 ) & 63) | 128));
  133. *((p_int16) &buffer[4]) = htons(((((code >> 6 ) & 63) | 128) << 8) | (code & 63));
  134. return 6;
  135. }
  136. buffer[0] = v_char8(code);
  137. return 1;
  138. }
  139. void Unicode::codeToUtf16SurrogatePair(v_int32 code, v_int16& high, v_int16& low){
  140. code -= 0x010000;
  141. high = 0xD800 + ((code >> 10) & 1023);
  142. low = 0xDC00 + (code & 1023);
  143. }
  144. v_int32 Unicode::utf16SurrogatePairToCode(v_int16 high, v_int16 low){
  145. return (((low - 0xDC00) & 1023) | (((high - 0xD800) & 1023) << 10)) + 0x010000;
  146. }
  147. }}