w3m

Unnamed repository; edit this file to name it for gitweb.
git clone https://logand.com/git/w3m.git/
Log | Files | Refs | README

gb18030.c (9567B)


      1 
      2 #include "wc.h"
      3 #include "gb18030.h"
      4 #include "search.h"
      5 #include "wtf.h"
      6 #ifdef USE_UNICODE
      7 #include "ucs.h"
      8 #endif
      9 #include "map/gb18030_ucs.map"
     10 
     11 #define C0 WC_GB18030_MAP_C0
     12 #define GL WC_GB18030_MAP_GL
     13 #define C1 WC_GB18030_MAP_C1
     14 #define LB WC_GB18030_MAP_LB
     15 #define UB WC_GB18030_MAP_UB
     16 #define L4 WC_GB18030_MAP_L4
     17 
     18 wc_uint8 WC_GB18030_MAP[ 0x100 ] = {
     19     C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0,
     20     C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0,
     21     GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL,
     22     L4, L4, L4, L4, L4, L4, L4, L4, L4, L4, GL, GL, GL, GL, GL, GL,
     23     LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB,
     24     LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB,
     25     LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB,
     26     LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, C0,
     27 
     28     LB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB,
     29     UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB,
     30     UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB,
     31     UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB,
     32     UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB,
     33     UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB,
     34     UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB,
     35     UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, C1,
     36 };
     37 
     38 wc_wchar_t
     39 wc_gbk_ext_to_cs128w(wc_wchar_t cc)
     40 {
     41     cc.code = WC_GBK_N(cc.code);
     42     if (cc.code < 0x4000)
     43 	cc.ccs = WC_CCS_GBK_EXT_1;
     44     else {
     45 	cc.ccs = WC_CCS_GBK_EXT_2;
     46 	cc.code -= 0x4000;
     47     }
     48     cc.code = WC_N_CS128W(cc.code);
     49     return cc;
     50 }
     51 
     52 wc_wchar_t
     53 wc_cs128w_to_gbk_ext(wc_wchar_t cc)
     54 {
     55     cc.code = WC_CS128W_N(cc.code);
     56     if (cc.ccs == WC_CCS_GBK_EXT_2)
     57 	cc.code += 0x4000;
     58     cc.ccs = WC_CCS_GBK_EXT;
     59     cc.code = WC_N_GBK(cc.code);
     60     return cc;
     61 }
     62 
     63 static wc_ccs
     64 wc_gbk_or_gbk_ext(wc_uint16 code) {
     65     return wc_map3_range_search(code,
     66         gbk_ext_ucs_map, N_gbk_ext_ucs_map)
     67         ? WC_CCS_GBK_EXT : WC_CCS_GBK;
     68 }
     69 
     70 #ifdef USE_UNICODE
     71 wc_uint32
     72 wc_gb18030_to_ucs(wc_wchar_t cc)
     73 {
     74     wc_map3 *map;
     75 
     76     switch (WC_CCS_SET(cc.ccs)) {
     77     case WC_CCS_GBK_EXT_1:
     78     case WC_CCS_GBK_EXT_2:
     79 	cc = wc_cs128w_to_gbk_ext(cc);
     80     case WC_CCS_GBK_EXT:
     81 	map = wc_map3_range_search((wc_uint16)cc.code,
     82 		gbk_ext_ucs_map, N_gbk_ext_ucs_map);
     83 	if (map)
     84 	    return map->code3 + WC_GBK_N(cc.code) - WC_GBK_N(map->code2);
     85 	return WC_C_UCS4_ERROR;
     86     case WC_CCS_GB18030:
     87 	break;
     88     default:
     89 	return wc_any_to_ucs(cc);
     90     }
     91     if (cc.code >= WC_C_GB18030_UCS2 && cc.code <= WC_C_GB18030_UCS2_END) {
     92 	int i, min = 0, max = N_ucs_gb18030_map - 1;
     93 
     94 	cc.code = WC_GB18030_N(cc.code) - WC_GB18030_N(WC_C_GB18030_UCS2);
     95 	if (cc.code >= ucs_gb18030_map[max].code3)
     96 	    i = max;
     97 	else {
     98 	    while(1) {
     99 		i = (min + max) / 2;
    100 		if (min == max)
    101 		    break;
    102 		if (cc.code < ucs_gb18030_map[i].code3)
    103 		    max = i - 1;
    104 		else if (cc.code >= ucs_gb18030_map[i+1].code3)
    105 		    min = i + 1;
    106 		else
    107 		    break;
    108 	    }
    109 	}
    110 	return ucs_gb18030_map[i].code + cc.code - ucs_gb18030_map[i].code3;
    111     }
    112     if (cc.code >= WC_C_GB18030_UCS4 && cc.code <= WC_C_GB18030_UCS4_END)
    113 	return WC_GB18030_N(cc.code) - WC_GB18030_N(WC_C_GB18030_UCS4)
    114 		+ 0x10000;
    115     return WC_C_UCS4_ERROR;
    116 }
    117 
    118 wc_wchar_t
    119 wc_ucs_to_gb18030(wc_uint32 ucs)
    120 {
    121     wc_wchar_t cc;
    122     wc_map3 *map;
    123 
    124     if (ucs <= WC_C_UCS2_END) {
    125 	map = wc_map3_range_search((wc_uint16)ucs,
    126 		ucs_gbk_ext_map, N_ucs_gbk_ext_map);
    127 	if (map) {
    128 	    cc.code = WC_GBK_N(map->code3) + ucs - map->code;
    129 	    cc.code = WC_N_GBK(cc.code);
    130 	    cc.ccs = WC_CCS_GBK_EXT;
    131 	    return cc;
    132 	}
    133 	map = wc_map3_range_search((wc_uint16)ucs,
    134 		ucs_gb18030_map, N_ucs_gb18030_map);
    135 	if (map) {
    136 	    cc.code = map->code3 + ucs - map->code + WC_GB18030_N(WC_C_GB18030_UCS2);
    137 	    cc.code = WC_N_GB18030(cc.code);
    138 	    if (WcOption.gb18030_as_ucs)
    139 		cc.ccs = WC_CCS_GB18030 | (wc_ucs_to_ccs(ucs) & ~WC_CCS_A_SET);
    140 	    else
    141 		cc.ccs = WC_CCS_GB18030_W;
    142 	    return cc;
    143 	}
    144     } else if (ucs <= WC_C_UNICODE_END) {
    145 	cc.code = ucs - 0x10000 + WC_GB18030_N(WC_C_GB18030_UCS4);
    146 	cc.code = WC_N_GB18030(cc.code);
    147 	if (WcOption.gb18030_as_ucs)
    148 	    cc.ccs = WC_CCS_GB18030 | (wc_ucs_to_ccs(ucs) & ~WC_CCS_A_SET);
    149 	else
    150 	    cc.ccs = WC_CCS_GB18030_W;
    151 	return cc;
    152     }
    153     cc.ccs = WC_CCS_UNKNOWN;
    154     return cc;
    155 }
    156 #endif
    157 
    158 Str
    159 wc_conv_from_gb18030(Str is, wc_ces ces)
    160 {
    161     Str os;
    162     wc_uchar *sp = (wc_uchar *)is->ptr;
    163     wc_uchar *ep = sp + is->length;
    164     wc_uchar *p;
    165     int state = WC_GB18030_NOSTATE;
    166     wc_uint32 gbk;
    167     wc_wchar_t cc;
    168 #ifdef USE_UNICODE
    169     wc_uint32 ucs;
    170 #endif
    171 
    172     for (p = sp; p < ep && *p < 0x80; p++) 
    173 	;
    174     if (p == ep)
    175 	return is;
    176     os = Strnew_size(is->length);
    177     if (p > sp)
    178 	Strcat_charp_n(os, (char *)is->ptr, (int)(p - sp));
    179 
    180     for (; p < ep; p++) {
    181 	switch (state) {
    182 	case WC_GB18030_NOSTATE:
    183 	    switch (WC_GB18030_MAP[*p]) {
    184 	    case UB:
    185 		state = WC_GB18030_MBYTE1;
    186 		break;
    187 	    case C1:
    188 		wtf_push_unknown(os, p, 1);
    189 		break;
    190 	    default:
    191 		Strcat_char(os, (char)*p);
    192 		break;
    193 	    }
    194 	    break;
    195 	case WC_GB18030_MBYTE1:
    196 	    if (WC_GB18030_MAP[*p] & LB) {
    197 		gbk = ((wc_uint32)*(p-1) << 8) | *p;
    198 		if (wc_gbk_or_gbk_ext(gbk) == WC_CCS_GBK_EXT)
    199 		    wtf_push(os, WC_CCS_GBK_EXT, gbk);
    200 		else if (*(p-1) >= 0xA1 && *p >= 0xA1)
    201 		    wtf_push(os, wc_gb2312_or_gbk(gbk), gbk);
    202 		else
    203 		    wtf_push(os, WC_CCS_GBK, gbk);
    204 	    } else if (WC_GB18030_MAP[*p] == L4) {
    205 		state = WC_GB18030_MBYTE2;
    206 		break;
    207 	    } else
    208 		wtf_push_unknown(os, p-1, 2);
    209 	    state = WC_GB18030_NOSTATE;
    210 	    break;
    211 	case WC_GB18030_MBYTE2:
    212 	    if (WC_GB18030_MAP[*p] == UB) {
    213 		state = WC_GB18030_MBYTE3;
    214 		break;
    215 	    } else
    216 		wtf_push_unknown(os, p-2, 3);
    217 	    state = WC_GB18030_NOSTATE;
    218 	    break;
    219 	case WC_GB18030_MBYTE3:
    220 	    if (WC_GB18030_MAP[*p] == L4) {
    221 		cc.ccs = WC_CCS_GB18030_W;
    222 		cc.code = ((wc_uint32)*(p-3) << 24)
    223 		        | ((wc_uint32)*(p-2) << 16)
    224 		        | ((wc_uint32)*(p-1) << 8)
    225 		        | *p;
    226 #ifdef USE_UNICODE
    227 		if (WcOption.gb18030_as_ucs &&
    228 		    (ucs = wc_gb18030_to_ucs(cc)) != WC_C_UCS4_ERROR)
    229 		    wtf_push(os, WC_CCS_GB18030 | (wc_ucs_to_ccs(ucs) & ~WC_CCS_A_SET), cc.code);
    230 		else
    231 #endif
    232 		    wtf_push(os, cc.ccs, cc.code);
    233 	    } else
    234 		wtf_push_unknown(os, p-3, 4);
    235 	    state = WC_GB18030_NOSTATE;
    236 	    break;
    237 	}
    238     }
    239     switch (state) {
    240     case WC_GB18030_MBYTE1:
    241 	wtf_push_unknown(os, p-1, 1);
    242 	break;
    243     case WC_GB18030_MBYTE2:
    244 	wtf_push_unknown(os, p-2, 2);
    245 	break;
    246     case WC_GB18030_MBYTE3:
    247 	wtf_push_unknown(os, p-3, 3);
    248 	break;
    249     }
    250     return os;
    251 }
    252 
    253 void
    254 wc_push_to_gb18030(Str os, wc_wchar_t cc, wc_status *st)
    255 {
    256   while (1) {
    257     switch (WC_CCS_SET(cc.ccs)) {
    258     case WC_CCS_US_ASCII:
    259 	Strcat_char(os, (char)cc.code);
    260 	return;
    261     case WC_CCS_GB_2312:
    262 	Strcat_char(os, (char)((cc.code >> 8) | 0x80));
    263 	Strcat_char(os, (char)((cc.code & 0xff) | 0x80));
    264 	return;
    265     case WC_CCS_GBK_1:
    266     case WC_CCS_GBK_2:
    267 	cc = wc_cs128w_to_gbk(cc);
    268     case WC_CCS_GBK:
    269 	Strcat_char(os, (char)(cc.code >> 8));
    270 	Strcat_char(os, (char)(cc.code & 0xff));
    271 	return;
    272     case WC_CCS_GBK_EXT_1:
    273     case WC_CCS_GBK_EXT_2:
    274 	cc = wc_cs128w_to_gbk(cc);
    275     case WC_CCS_GBK_EXT:
    276 	Strcat_char(os, (char)(cc.code >> 8));
    277 	Strcat_char(os, (char)(cc.code & 0xff));
    278 	return;
    279     case WC_CCS_GB18030:
    280 	Strcat_char(os, (char)((cc.code >> 24) & 0xff));
    281 	Strcat_char(os, (char)((cc.code >> 16) & 0xff));
    282 	Strcat_char(os, (char)((cc.code >> 8)  & 0xff));
    283 	Strcat_char(os, (char)(cc.code & 0xff));
    284 	return;
    285     case WC_CCS_UNKNOWN_W:
    286 	if (!WcOption.no_replace)
    287 	    Strcat_charp(os, WC_REPLACE_W);
    288 	return;
    289     case WC_CCS_UNKNOWN:
    290 	if (!WcOption.no_replace)
    291 	    Strcat_charp(os, WC_REPLACE);
    292 	return;
    293     default:
    294 #ifdef USE_UNICODE
    295 	if (WcOption.ucs_conv)
    296 	    cc = wc_any_to_any_ces(cc, st);
    297 	else
    298 #endif
    299 	    cc.ccs = WC_CCS_IS_WIDE(cc.ccs) ? WC_CCS_UNKNOWN_W : WC_CCS_UNKNOWN;
    300 	continue;
    301     }
    302   }
    303 }
    304 
    305 Str
    306 wc_char_conv_from_gb18030(wc_uchar c, wc_status *st)
    307 {
    308     static Str os;
    309     static wc_uchar gb[4];
    310     wc_uint32 gbk;
    311     wc_wchar_t cc;
    312 #ifdef USE_UNICODE
    313     wc_uint32 ucs;
    314 #endif
    315 
    316     if (st->state == -1) {
    317 	st->state = WC_GB18030_NOSTATE;
    318 	os = Strnew_size(8);
    319     }
    320 
    321     switch (st->state) {
    322     case WC_GB18030_NOSTATE:
    323 	switch (WC_GB18030_MAP[c]) {
    324 	case UB:
    325 	    gb[0] = c;
    326 	    st->state = WC_GB18030_MBYTE1;
    327 	    return NULL;
    328 	case C1:
    329 	    break;
    330 	default:
    331 	    Strcat_char(os, (char)c);
    332 	    break;
    333 	}
    334 	break;
    335     case WC_GB18030_MBYTE1:
    336 	if (WC_GB18030_MAP[c] & LB) {
    337 	    gbk = ((wc_uint32)gb[0] << 8) | c;
    338 	    if (wc_gbk_or_gbk_ext(gbk) == WC_CCS_GBK_EXT)
    339 		wtf_push(os, WC_CCS_GBK_EXT, gbk);
    340 	    else if (gb[0] >= 0xA1 && c >= 0xA1)
    341 		wtf_push(os, wc_gb2312_or_gbk(gbk), gbk);
    342 	    else
    343 		wtf_push(os, WC_CCS_GBK, gbk);
    344 	} else if (WC_GB18030_MAP[c] == L4) {
    345 	    gb[1] = c;
    346 	    st->state = WC_GB18030_MBYTE2;
    347 	    return NULL;
    348 	}
    349 	break;
    350     case WC_GB18030_MBYTE2:
    351 	if (WC_GB18030_MAP[c] == UB) {
    352 	    gb[2] = c;
    353 	    st->state = WC_GB18030_MBYTE3;
    354 	    return NULL;
    355 	}
    356 	break;
    357     case WC_GB18030_MBYTE3:
    358 	if (WC_GB18030_MAP[c] == L4) {
    359 	    cc.ccs = WC_CCS_GB18030_W;
    360 	    cc.code = ((wc_uint32)gb[0] << 24)
    361 		    | ((wc_uint32)gb[1] << 16)
    362 		    | ((wc_uint32)gb[2] << 8)
    363 		    | c;
    364 #ifdef USE_UNICODE
    365 	    if (WcOption.gb18030_as_ucs &&
    366 		(ucs = wc_gb18030_to_ucs(cc)) != WC_C_UCS4_ERROR)
    367 		wtf_push(os, WC_CCS_GB18030 | (wc_ucs_to_ccs(ucs) & ~WC_CCS_A_SET), cc.code);
    368 	    else
    369 #endif
    370 	        wtf_push(os, cc.ccs, cc.code);
    371 	}
    372 	break;
    373     }
    374     st->state = -1;
    375     return os;
    376 }