2022年4月7日 星期四

[C++]UTF8轉wide character

 先將中文轉為 UTF-8,再轉換為 wide character;透過此方式即可將文字轉為數字傳送出去,

再透過解譯數字長度,將數字組回文字

---------------------------------------------------------------------

宣告以下資訊:

---.h

/*
 * Indexed by the first byte of a UTF-8 sequence; yields how many further
 * bytes belong to that sequence. Bytes 0x00-0xBF (ASCII and continuation
 * bytes) map to 0.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00-0x0F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10-0x1F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20-0x2F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30-0x3F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40-0x4F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50-0x5F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60-0x6F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70-0x7F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80-0x8F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90-0x9F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0-0xAF */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0-0xBF */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0-0xCF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0-0xDF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0-0xEF */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0-0xFF */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

/*
 * Indexed by the number of trailing bytes in a UTF-8 sequence. Each entry is
 * the value contributed by the marker bits (the lead-byte prefix and the 0x80
 * of every continuation byte) when the raw bytes are accumulated with 6-bit
 * shifts; subtracting it from that accumulated sum leaves the code point.
 * The last entry is the sum reduced modulo 2^32.
 */
static const unsigned int offsetsFromUTF8[6] = {
    0x00000000UL,   /* 0 trailing bytes: nothing to remove              */
    0x00003080UL,   /* 1: (0xC0 << 6)  + 0x80                           */
    0x000E2080UL,   /* 2: (0xE0 << 12) + (0x80 << 6) + 0x80             */
    0x03C82080UL,   /* 3: (0xF0 << 18) + (0x80 << 12) + ... + 0x80      */
    0xFA082080UL,   /* 4: (0xF8 << 24) + (0x80 << 18) + ... + 0x80      */
    0x82082080UL    /* 5: (0xFC << 30) + (0x80 << 24) + ... + 0x80      */
};

/*
 * Decode the UTF-8 bytes in src into code point values stored in dest.
 * sz is the capacity of dest counted in elements; a terminating 0 element is
 * stored after the decoded values. srcsz is the number of bytes in src, or -1
 * when src is NUL-terminated. Returns the number of values stored, not
 * counting the terminator.
 */
int u8_toucs(unsigned int* dest, int sz, char* src, int srcsz);
/*
 * Encode the code point values in src as UTF-8 bytes in dest.
 * sz is the capacity of dest in bytes. srcsz is the number of elements in
 * src; a negative srcsz means src ends at the first 0 element. A '\0' is
 * appended after the last encoded value when a byte is left for it.
 * Returns the number of src elements consumed.
 */
int u8_toutf8(char* dest, int sz, unsigned int* src, int srcsz);
/*
 * NOTE(review): no definition accompanies this declaration here. Presumably
 * it encodes the single code point ch into dest and returns the byte count --
 * confirm against the implementation.
 */
int u8_wc_toutf8(char* dest, unsigned int ch);

-----.cpp

int u8_toucs(unsigned int* dest, int sz, char* src, int srcsz){
    unsigned int ch;
    char* src_end = src + srcsz;
    int nb;
    int i = 0;

    while (i < sz - 1) {
        nb = trailingBytesForUTF8[(unsigned char)*src];
        if (srcsz == -1) {
            if (*src == 0)
                goto done_toucs;
        }
        else {
            if (src + nb >= src_end)
                goto done_toucs;
        }
        ch = 0;
        switch (nb) {
            /* these fall through deliberately */
        case 3: ch += (unsigned char)*src++; ch <<= 6;
        case 2: ch += (unsigned char)*src++; ch <<= 6;
        case 1: ch += (unsigned char)*src++; ch <<= 6;
        case 0: ch += (unsigned char)*src++;
        }
        ch -= offsetsFromUTF8[nb];
        dest[i++] = ch;
    }
done_toucs:
    dest[i] = 0;
    return i;
}

/*
 * Encode an array of code point values as UTF-8.
 *
 *   dest   receives the UTF-8 bytes
 *   sz     capacity of dest in bytes
 *   src    code point values
 *   srcsz  number of elements in src; a negative value means src ends at
 *          the first 0 element
 *
 * Returns the number of src elements consumed. Encoding stops at the first
 * value whose encoding no longer fits in dest. Values of 0x110000 and above
 * are consumed without producing output. A '\0' is stored after the encoded
 * bytes whenever at least one byte of dest is still free, including when
 * encoding stopped early; the terminator is omitted only if dest is
 * completely full.
 */
int u8_toutf8(char* dest, int sz, unsigned int* src, int srcsz)
{
    int i = 0;
    int room = (sz > 0) ? sz : 0;   /* bytes still free in dest */

    while (srcsz < 0 ? src[i] != 0 : i < srcsz) {
        unsigned int ch = src[i];
        int need;

        if (ch < 0x80)
            need = 1;
        else if (ch < 0x800)
            need = 2;
        else if (ch < 0x10000)
            need = 3;
        else if (ch < 0x110000)
            need = 4;
        else
            need = 0;               /* outside the Unicode range: skipped */

        /* Never emit a partial sequence. */
        if (need > room)
            break;

        switch (need) {
        case 1:
            *dest++ = (char)ch;
            break;
        case 2:
            *dest++ = (char)((ch >> 6) | 0xC0);
            *dest++ = (char)((ch & 0x3F) | 0x80);
            break;
        case 3:
            *dest++ = (char)((ch >> 12) | 0xE0);
            *dest++ = (char)(((ch >> 6) & 0x3F) | 0x80);
            *dest++ = (char)((ch & 0x3F) | 0x80);
            break;
        case 4:
            *dest++ = (char)((ch >> 18) | 0xF0);
            *dest++ = (char)(((ch >> 12) & 0x3F) | 0x80);
            *dest++ = (char)(((ch >> 6) & 0x3F) | 0x80);
            *dest++ = (char)((ch & 0x3F) | 0x80);
            break;
        default:
            break;
        }
        room -= need;
        i++;
    }

    /* Terminate on every exit path as long as a byte is left. */
    if (room > 0)
        *dest = '\0';
    return i;
}

-----------------------------------------------

Sample: 範例

char result_str2[100] = {};
uint32_t b_ucs[100] = {}; // plenty of space
int b_chars = 0;

char cbuffdata[100] = u8"哈摟哈!";
uint32_t utf_len2 = strlen(cbuffdata);
// UTF-8 bytes -> code points. The second argument is the capacity of b_ucs
// counted in elements (not bytes); the call returns how many code points
// were decoded.
b_chars = u8_toucs(b_ucs, (int)(sizeof(b_ucs) / sizeof(b_ucs[0])), cbuffdata, utf_len2);
// The conversion to numeric code points is complete at this point.
// Code points -> UTF-8. The source length is the number of code points
// (b_chars), not the UTF-8 byte length of the original text.
b_chars = u8_toutf8(result_str2, (int)sizeof(result_str2), b_ucs, b_chars);
// This step converts the values back to UTF-8.


----可再搭配其他轉換方式將UTF8轉成ansi顯示
    // Optional follow-up: pass the UTF-8 bytes in result_str2 to
    // ConvertUTF8toANSI, which is defined elsewhere (not shown here).
    // Presumably it stores the ANSI-encoded text in csbuf -- confirm against
    // that function.
    CString csbuf;
    ConvertUTF8toANSI(result_str2, &csbuf);
// (link to this function)

-----------------------------------------------------------------------

參考:https://www.cprogramming.com/tutorial/unicode.html

沒有留言:

張貼留言

[SQL]顯示千分位與小數顯示

  CONVERT ( data_type [ ( length ) ] , expression [ , style ] ) CONVERT style參數說明 1  (expression為 money 或 smallmoney型別): 0 : 預設,保留小數位後兩位,並四捨...