Utf8_16.h 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146
  1. // Utf8_16.h
  2. // Copyright (C) 2002 Scott Kirkwood
  3. //
  4. // Permission to use, copy, modify, distribute and sell this code
  5. // and its documentation for any purpose is hereby granted without fee,
  6. // provided that the above copyright notice appear in all copies or
  7. // any derived copies. Scott Kirkwood makes no representations
  8. // about the suitability of this software for any purpose.
  9. // It is provided "as is" without express or implied warranty.
  10. //
  11. // Notes: Used the UTF information I found at:
  12. // http://www.cl.cam.ac.uk/~mgk25/unicode.html
  13. ////////////////////////////////////////////////////////////////////////////////
  14. #ifndef _UTF8_16_H_
  15. #define _UTF8_16_H_
  16. #include <stdio.h>
  17. #include <assert.h>
  18. #include "csr_typedefs.h"
  19. #ifdef _MSC_VER
  20. #pragma warning(disable : 4514) // unreferenced inline function has been removed
  21. #endif
  22. namespace csr {
  23. int csrUTF8Encode(u1* pBuf, int iCode); // forward ref for GCC
  24. int csrUTF8DecodeLength(const u1* pBuf);
  25. int csrUTF8Decode(const u1* pBuf);
  26. int csrUTF8Decode(const u1* pBuf, u2& length);
  27. int csrUTF8StringLength(const u1* pBuf);
  // Shared base of the UTF-8/UTF-16 helper classes: provides the code-unit
  // typedefs, the list of recognised encodings and the k_Boms table
  // declaration.
  class Utf8_16 {
  public:
      typedef unsigned short utf16;  // 16 bits
      typedef unsigned char utf8;    // 8 bits
      typedef unsigned char ubyte;

      // eLast is not an encoding; it is the enumerator count and sizes the
      // first dimension of k_Boms.
      enum encodingType {
          eUnknown = 0,
          eUtf16BigEndian,
          eUtf16LittleEndian,  // Default on Windows
          eUtf8,
          eLast
      };

      // One 3-byte row per encodingType value; the definition is not in
      // this header.
      // NOTE(review): presumably the byte-order marks -- confirm at the
      // definition.
      static const utf8 k_Boms[eLast][3];
  };
  42. // Reads UTF-16 and outputs UTF-8
  43. class Utf16_Iter : public Utf8_16 {
  44. public:
  45. Utf16_Iter();
  46. void reset();
  47. void set(const ubyte* pBuf, size_t nLen, encodingType eEncoding);
  48. utf8 get() const { return m_nCur; }
  49. void operator++();
  50. operator bool() { return m_pRead <= m_pEnd; }
  51. protected:
  52. void toStart(); // Put to start state, swap bytes if necessary
  53. enum eState { eStart, e2Bytes2, e3Bytes2, e3Bytes3 };
  54. protected:
  55. encodingType m_eEncoding;
  56. eState m_eState;
  57. utf8 m_nCur;
  58. utf16 m_nCur16;
  59. const ubyte* m_pBuf;
  60. const ubyte* m_pRead;
  61. const ubyte* m_pEnd;
  62. };
  63. // Reads UTF-8 and outputs UTF-16
  64. class Utf8_Iter : public Utf8_16 {
  65. public:
  66. Utf8_Iter();
  67. void reset();
  68. void set(const ubyte* pBuf, size_t nLen, encodingType eEncoding);
  69. #ifdef _DEBUG
  70. utf16 get() const;
  71. #else
  72. utf16 get() const { return m_nCur; }
  73. #endif
  74. bool canGet() const { return m_eState == eStart; }
  75. void operator++();
  76. operator bool() { return m_pRead <= m_pEnd; }
  77. protected:
  78. void swap();
  79. void toStart(); // Put to start state, swap bytes if necessary
  80. enum eState { eStart, e2Bytes_Byte2, e3Bytes_Byte2, e3Bytes_Byte3 };
  81. protected:
  82. encodingType m_eEncoding;
  83. eState m_eState;
  84. utf16 m_nCur;
  85. const ubyte* m_pBuf;
  86. const ubyte* m_pRead;
  87. const ubyte* m_pEnd;
  88. };
  89. // Reads UTF16 and outputs UTF8
  90. class Utf8_16_Read : public Utf8_16 {
  91. public:
  92. Utf8_16_Read();
  93. ~Utf8_16_Read();
  94. size_t convert(char* buf, size_t len);
  95. char* getNewBuf() { return reinterpret_cast<char*>(m_pNewBuf); }
  96. encodingType getEncoding() const { return m_eEncoding; }
  97. protected:
  98. int determineEncoding();
  99. private:
  100. encodingType m_eEncoding;
  101. ubyte* m_pBuf;
  102. ubyte* m_pNewBuf;
  103. size_t m_nBufSize;
  104. bool m_bFirstRead;
  105. size_t m_nLen;
  106. Utf16_Iter m_Iter16;
  107. };
  108. // Read in a UTF-8 buffer and write out to UTF-16 or UTF-8
  109. class Utf8_16_Write : public Utf8_16 {
  110. public:
  111. Utf8_16_Write();
  112. ~Utf8_16_Write();
  113. void setEncoding(encodingType eType);
  114. FILE* fopen(const char* _name, const char* _type);
  115. size_t fwrite(const void* p, size_t _size);
  116. void fclose();
  117. protected:
  118. encodingType m_eEncoding;
  119. FILE* m_pFile;
  120. utf16* m_pBuf;
  121. size_t m_nBufSize;
  122. bool m_bFirstWrite;
  123. };
  124. }; // end of namespace csr
  125. #endif