用户注册



邮箱:

密码:

用户登录


邮箱:

密码:
记住登录一个月忘记密码?

发表随想


还能输入:200字
云代码 - c++代码库

Utf8转Utf16

2021-01-03 作者: 云代码会员举报

[c++]代码库

#include "Utf8_16.h"
 
// Byte-order-mark table. Rows are looked up with UniMode values elsewhere in
// this file (k_Boms[uniUTF8], k_Boms[uni16BE], k_Boms[uni16LE]). Every row has
// three entries; the two-byte UTF-16 marks are padded with a trailing 0x00 and
// callers only read/write their first two bytes.
const Utf8_16::utf8 Utf8_16::k_Boms[][3] = {
    {0x00, 0x00, 0x00},  // Unknown
    {0xEF, 0xBB, 0xBF},  // UTF8
    {0xFE, 0xFF, 0x00},  // Big endian
    {0xFF, 0xFE, 0x00},  // Little endian
};
 
 
// ==================================================================
 
// Creates a reader with no output buffer; the encoding defaults to plain
// 8-bit until the first convert() call inspects the data.
Utf8_16_Read::Utf8_16_Read()
{
    // Output buffer bookkeeping: nothing allocated, nothing produced yet.
    m_pNewBuf           = NULL;
    m_nNewBufSize       = 0;
    m_nAllocatedBufSize = 0;

    // Encoding detection is deferred to the first convert() call.
    m_bFirstRead        = true;
    m_eEncoding         = uni8Bit;
}
 
// Releases the conversion buffer. Only the UTF-16 branches of convert()
// allocate m_pNewBuf; for every other encoding it just points into the
// caller's buffer and therefore must not be deleted here.
Utf8_16_Read::~Utf8_16_Read()
{
    switch (m_eEncoding)
    {
        case uni16BE:
        case uni16LE:
        case uni16BE_NoBOM:
        case uni16LE_NoBOM:
            delete [] m_pNewBuf;
            m_pNewBuf = NULL;
            break;

        default:
            // Buffer (if any) is borrowed from the caller.
            break;
    }
}
 
// Returned value :
// 0 : utf8
// 1 : 7bits
// 2 : 8bits
u78 Utf8_16_Read::utf8_7bits_8bits()
{
    int rv = 1;
    int ASCII7only = 1;
    utf8 *sx    = (utf8 *)m_pBuf;
    utf8 *endx  = sx + m_nLen;
 
    while (sx<endx)
    {
        if (*sx == '\0')
        {                                           // For detection, we'll say that NUL means not UTF8
            ASCII7only = 0;
            rv = 0;
            break;
        }
        else if ((*sx & 0x80) == 0x0)
        {           // 0nnnnnnn If the byte's first hex code begins with 0-7, it is an ASCII character.
            ++sx;
        }
        else if ((*sx & (0x80+0x40)) == 0x80)
        {                                             // 10nnnnnn 8 through B cannot be first hex codes
            ASCII7only=0;
            rv=0;
            break;
        }
        else if ((*sx & (0x80+0x40+0x20)) == (0x80+0x40))
        {                     // 110xxxvv 10nnnnnn, 11 bit character
            ASCII7only=0;
            if (std::distance(sx, endx) < 2) {
                rv=0; break;
            }
            if ( (sx[1]&(0x80+0x40)) != 0x80) {
                rv=0; break;
            }
            sx+=2;
        }
        else if ((*sx & (0x80+0x40+0x20+0x10)) == (0x80+0x40+0x20))
        {                               // 1110qqqq 10xxxxvv 10nnnnnn, 16 bit character
            ASCII7only=0;
            if (std::distance(sx, endx) < 3) {
                rv=0; break;
            }
            if ((sx[1]&(0x80+0x40)) != 0x80 || (sx[2]&(0x80+0x40)) != 0x80) {
                rv=0; break;
            }
            sx+=3;
        }
        else if ((*sx & (0x80+0x40+0x20+0x10+0x8)) == (0x80+0x40+0x20+0x10))
        {                               // 11110qqq 10xxxxvv 10nnnnnn 10mmmmmm, 21 bit character
            ASCII7only=0;
            if (std::distance(sx, endx) < 4) {
                rv=0; break;
            }
            if ((sx[1]&(0x80+0x40)) != 0x80 || (sx[2]&(0x80+0x40)) != 0x80 || (sx[3]&(0x80+0x40)) != 0x80) {
                rv=0; break;
            }
            sx+=4;
        }
        else
        {
            ASCII7only=0;
            rv=0;
            break;
        }
    }
    if (ASCII7only)
        return ascii7bits;
    if (rv)
        return utf8NoBOM;
    return ascii8bits;
}
 
size_t Utf8_16_Read::convert(char* buf, size_t len)
{
    // bugfix by Jens Lorenz
    static  size_t nSkip = 0;
 
    m_pBuf = (ubyte*)buf;
    m_nLen = len;
    m_nNewBufSize = 0;
 
    if (m_bFirstRead == true)
    {
        determineEncoding();
        nSkip = m_nSkip;
        m_bFirstRead = false;
    }
 
    switch (m_eEncoding)
    {
        case uni7Bit:
        case uni8Bit:
        case uniCookie: {
            // Do nothing, pass through
            m_nAllocatedBufSize = 0;
            m_pNewBuf = m_pBuf;
            m_nNewBufSize = len;
            break;
        }
        case uniUTF8: {
            // Pass through after BOM
            m_nAllocatedBufSize = 0;
            m_pNewBuf = m_pBuf + nSkip;
            m_nNewBufSize = len - nSkip;
            break;
        }   
        case uni16BE_NoBOM:
        case uni16LE_NoBOM:
        case uni16BE:
        case uni16LE: {
            size_t newSize = len + len / 2 + 1;
             
            if (m_nAllocatedBufSize != newSize)
            {
                if (m_pNewBuf)
                    delete [] m_pNewBuf;
                m_pNewBuf  = NULL;
                m_pNewBuf  = new ubyte[newSize];
                m_nAllocatedBufSize = newSize;
            }
             
            ubyte* pCur = m_pNewBuf;
             
            m_Iter16.set(m_pBuf + nSkip, len - nSkip, m_eEncoding);
 
            for (; m_Iter16; ++m_Iter16)
            {
                *pCur++ = m_Iter16.get();
            }
            m_nNewBufSize = pCur - m_pNewBuf;
            break;
        }
        default:
            break;
    }
 
    // necessary for second calls and more
    nSkip = 0;
 
    return m_nNewBufSize;
}
 
 
void Utf8_16_Read::determineEncoding()
{
    INT uniTest = IS_TEXT_UNICODE_STATISTICS;
    m_eEncoding = uni8Bit;
    m_nSkip = 0;
 
    // detect UTF-16 big-endian with BOM
    if (m_nLen > 1 && m_pBuf[0] == k_Boms[uni16BE][0] && m_pBuf[1] == k_Boms[uni16BE][1])
    {
        m_eEncoding = uni16BE;
        m_nSkip = 2;
    }
    // detect UTF-16 little-endian with BOM
    else if (m_nLen > 1 && m_pBuf[0] == k_Boms[uni16LE][0] && m_pBuf[1] == k_Boms[uni16LE][1])
    {
        m_eEncoding = uni16LE;
        m_nSkip = 2;
    }
    // detect UTF-8 with BOM
    else if (m_nLen > 2 && m_pBuf[0] == k_Boms[uniUTF8][0] &&
        m_pBuf[1] == k_Boms[uniUTF8][1] && m_pBuf[2] == k_Boms[uniUTF8][2])
    {
        m_eEncoding = uniUTF8;
        m_nSkip = 3;
    }
    // try to detect UTF-16 little-endian without BOM
    else if (m_nLen > 1 && m_nLen % 2 == 0 && m_pBuf[0] != NULL && m_pBuf[1] == NULL && IsTextUnicode(m_pBuf, static_cast<int32_t>(m_nLen), &uniTest))
    {
        m_eEncoding = uni16LE_NoBOM;
        m_nSkip = 0;
    }
    /* UTF-16 big-endian without BOM detection is taken away scince this detection is very week
    // try to detect UTF-16 big-endian without BOM
    else if (m_nLen > 1 && m_pBuf[0] == NULL && m_pBuf[1] != NULL)
    {
        m_eEncoding = uni16BE_NoBOM;
        m_nSkip = 0;
    }
    */
    else
    {
        u78 detectedEncoding = utf8_7bits_8bits();
        if (detectedEncoding == utf8NoBOM)
            m_eEncoding = uniCookie;
        else if (detectedEncoding == ascii7bits)
            m_eEncoding = uni7Bit;
        else //(detectedEncoding == ascii8bits)
            m_eEncoding = uni8Bit;
        m_nSkip = 0;
    }
}
 
UniMode Utf8_16_Read::determineEncoding(const unsigned char *buf, size_t bufLen)
{
    // detect UTF-16 big-endian with BOM
    if (bufLen > 1 && buf[0] == k_Boms[uni16BE][0] && buf[1] == k_Boms[uni16BE][1])
    {
        return uni16BE;
    }
     
    // detect UTF-16 little-endian with BOM
    if (bufLen > 1 && buf[0] == k_Boms[uni16LE][0] && buf[1] == k_Boms[uni16LE][1])
    {
        return uni16LE;
    }
     
    // detect UTF-8 with BOM
    if (bufLen > 2 && buf[0] == k_Boms[uniUTF8][0] &&
        buf[1] == k_Boms[uniUTF8][1] && buf[2] == k_Boms[uniUTF8][2])
    {
        return uniUTF8;
    }
 
    return uni8Bit;
}
 
 
// ==================================================================
 
// Creates a writer with no file attached and no conversion buffer; output
// defaults to plain 8-bit until setEncoding() is called.
Utf8_16_Write::Utf8_16_Write()
{
    // No file, no buffer.
    m_pFile       = NULL;
    m_pNewBuf     = NULL;
    m_nBufSize    = 0;

    // The BOM (if the encoding has one) goes out with the first write.
    m_bFirstWrite = true;
    m_eEncoding   = uni8Bit;
}
 
// Releases the conversion buffer and closes the file by delegating to the
// member fclose().
Utf8_16_Write::~Utf8_16_Write()
{
    this->fclose();
}
 
FILE * Utf8_16_Write::fopen(const TCHAR *_name, const TCHAR *_type)
{
    m_pFile = ::generic_fopen(_name, _type);
 
    m_bFirstWrite = true;
 
    return m_pFile;
}
 
size_t Utf8_16_Write::fwrite(const void* p, size_t _size)
{
    // no file open
    if (!m_pFile)
    {
        return 0;
    }
 
    size_t  ret = 0;
     
    if (m_bFirstWrite)
    {
        switch (m_eEncoding)
        {
            case uniUTF8: {
                ::fwrite(k_Boms[m_eEncoding], 3, 1, m_pFile);
                break;
            }   
            case uni16BE:
            case uni16LE:
                ::fwrite(k_Boms[m_eEncoding], 2, 1, m_pFile);
                break;
            default:
                // nothing to do
                break;
        }
        m_bFirstWrite = false;
    }
     
    switch (m_eEncoding)
    {
        case uni7Bit:
        case uni8Bit:
        case uniCookie:
        case uniUTF8: {
            // Normal write
            ret = ::fwrite(p, _size, 1, m_pFile);
            break;
        }
        case uni16BE_NoBOM:
        case uni16LE_NoBOM:
        case uni16BE:
        case uni16LE: {
            static const int bufSize = 64*1024;
            utf16 buf[bufSize];
             
            Utf8_Iter iter8;
            iter8.set(static_cast<const ubyte*>(p), _size, m_eEncoding);
             
            int bufIndex = 0;
            while (iter8) {
                if (iter8.canGet()) {
                    buf[bufIndex++] = iter8.get();
                }
                ++iter8;
                if (bufIndex == bufSize || !iter8) {
                    if (!::fwrite(buf, bufIndex*sizeof(utf16), 1, m_pFile)) return 0;
                    bufIndex = 0;
                }
            }
            ret = 1;
            break;
        }   
        default:
            break;
    }
     
    return ret;
}
 
 
size_t Utf8_16_Write::convert(char* p, size_t _size)
{
    if (m_pNewBuf)
    {
        delete [] m_pNewBuf;
    }
 
    switch (m_eEncoding)
    {
        case uni7Bit:
        case uni8Bit:
        case uniCookie: {
            // Normal write
            m_nBufSize = _size;
            m_pNewBuf = (ubyte*)new ubyte[m_nBufSize];
            memcpy(m_pNewBuf, p, _size);
            break;
        }
        case uniUTF8: {
            m_nBufSize = _size + 3;
            m_pNewBuf = (ubyte*)new ubyte[m_nBufSize];
            memcpy(m_pNewBuf, k_Boms[m_eEncoding], 3);
            memcpy(&m_pNewBuf[3], p, _size);
            break;
        }
        case uni16BE_NoBOM:
        case uni16LE_NoBOM:
        case uni16BE:
        case uni16LE:
        {
            utf16* pCur = NULL;
             
            if (m_eEncoding == uni16BE || m_eEncoding == uni16LE) {
                // Write the BOM
                m_pNewBuf = (ubyte*)new ubyte[sizeof(utf16) * (_size + 1)];
                memcpy(m_pNewBuf, k_Boms[m_eEncoding], 2);
                pCur = (utf16*)&m_pNewBuf[2];
            } else {
                m_pNewBuf = (ubyte*)new ubyte[sizeof(utf16) * _size];
                pCur = (utf16*)m_pNewBuf;
            }
 
            Utf8_Iter iter8;
            iter8.set(reinterpret_cast<const ubyte*>(p), _size, m_eEncoding);
             
            for (; iter8; ++iter8) {
                if (iter8.canGet()) {
                    *pCur++ = iter8.get();
                }
            }
            m_nBufSize = (const char*)pCur - (const char*)m_pNewBuf;
            break;
        }
        default:
            break;
    }
     
    return m_nBufSize;
}
 
 
// Selects the output encoding used by subsequent fwrite() / convert() calls.
//
//   eType : encoding to write in
void Utf8_16_Write::setEncoding(UniMode eType)
{
    this->m_eEncoding = eType;
}
 
 
void Utf8_16_Write::fclose()
{
    if (m_pNewBuf)
        delete [] m_pNewBuf;
 
    if (m_pFile)
        ::fclose(m_pFile);
}
 
 
//=================================================================
// Creates an iterator that is not attached to any buffer; all state comes
// from reset().
Utf8_Iter::Utf8_Iter()
{
    this->reset();
}
 
void Utf8_Iter::reset()
{
    m_pBuf = NULL;
    m_pRead = NULL;
    m_pEnd = NULL;
    m_eState = eStart;
    m_nCur = 0;
    m_eEncoding = uni8Bit;
}
 
void Utf8_Iter::set(const ubyte* pBuf, size_t nLen, UniMode eEncoding)
{
    m_pBuf      = pBuf;
    m_pRead     = pBuf;
    m_pEnd      = pBuf + nLen;
    m_eEncoding = eEncoding;
    operator++();
    // Note: m_eState, m_nCur not set
}
 
// Go to the next byte.
void Utf8_Iter::operator++()
{
    switch (m_eState)
    {
        case eStart:
            if (*m_pRead < 0x80) {
                m_nCur = *m_pRead;
                toStart();
            } else if (*m_pRead < 0xE0) {
                m_nCur = static_cast<utf16>((0x1F & *m_pRead) << 6);
                m_eState = e2Bytes_Byte2;
            } else {
                m_nCur = static_cast<utf16>((0xF & *m_pRead) << 12);
                m_eState = e3Bytes_Byte2;
            }
            break;
        case e2Bytes_Byte2:
        case e3Bytes_Byte3:
            m_nCur |= static_cast<utf8>(0x3F & *m_pRead);
            toStart();
            break;
        case e3Bytes_Byte2:
            m_nCur |= static_cast<utf16>((0x3F & *m_pRead) << 6);
            m_eState = e3Bytes_Byte3;
            break;
    }
    ++m_pRead;
}
 
void Utf8_Iter::toStart()
{
    m_eState = eStart;
    if (m_eEncoding == uni16BE || m_eEncoding == uni16BE_NoBOM)
    {
        swap();
    }
}
 
void Utf8_Iter::swap()
{
    utf8* p = reinterpret_cast<utf8*>(&m_nCur);
    utf8 swapbyte = *p;
    *p = *(p + 1);
    *(p + 1) = swapbyte;
}
 
//==================================================
// Creates an iterator that is not attached to any buffer; all state comes
// from reset().
Utf16_Iter::Utf16_Iter()
{
    this->reset();
}
 
void Utf16_Iter::reset()
{
    m_pBuf = NULL;
    m_pRead = NULL;
    m_pEnd = NULL;
    m_eState = eStart;
    m_nCur = 0;
    m_nCur16 = 0;
    m_eEncoding = uni8Bit;
}
 
void Utf16_Iter::set(const ubyte* pBuf, size_t nLen, UniMode eEncoding)
{
    m_pBuf = pBuf;
    m_pRead = pBuf;
    m_pEnd = pBuf + nLen;
    m_eEncoding = eEncoding;
    m_eState = eStart;
    operator++();
    // Note: m_eState, m_nCur, m_nCur16 not reinitalized.
}
 
// Goes to the next byte.
// Not the next symbol which you might expect.
// This way we can continue from a partial buffer that doesn't align
void Utf16_Iter::operator++()
{
    switch (m_eState)
    {
        case eStart:
            if (m_eEncoding == uni16LE || m_eEncoding == uni16LE_NoBOM)
            {
                m_nCur16 = *m_pRead++;
                m_nCur16 |= static_cast<utf16>(*m_pRead << 8);
            }
            else //(m_eEncoding == uni16BE || m_eEncoding == uni16BE_NoBOM)
            {
                m_nCur16 = static_cast<utf16>(*m_pRead++ << 8);
                m_nCur16 |= *m_pRead;
            }
            ++m_pRead;
             
            if (m_nCur16 < 0x80) {
                m_nCur = static_cast<ubyte>(m_nCur16 & 0xFF);
                m_eState = eStart;
            } else if (m_nCur16 < 0x800) {
                m_nCur = static_cast<ubyte>(0xC0 | m_nCur16 >> 6);
                m_eState = e2Bytes2;
            } else {
                m_nCur = static_cast<ubyte>(0xE0 | m_nCur16 >> 12);
                m_eState = e3Bytes2;
            }
            break;
        case e2Bytes2:
        case e3Bytes3:
            m_nCur = static_cast<ubyte>(0x80 | m_nCur16 & 0x3F);
            m_eState = eStart;
            break;
        case e3Bytes2:
            m_nCur = static_cast<ubyte>(0x80 | ((m_nCur16 >> 6) & 0x3F));
            m_eState = e3Bytes3;
            break;
    }
}


网友评论    (发表评论)


发表评论:

评论须知:

  • 1、评论每次加2分,每天上限为30;
  • 2、请文明用语,共同创建干净的技术交流环境;
  • 3、若被发现提交非法信息,评论将会被删除,并且给予扣分处理,严重者给予封号处理;
  • 4、请勿发布广告信息或其他无关评论,否则将会删除评论并扣分,严重者给予封号处理。


扫码下载

加载中,请稍后...

输入口令后可复制整站源码

加载中,请稍后...