w3m

Unnamed repository; edit this file to name it for gitweb.
git clone https://logand.com/git/w3m.git/
Log | Files | Refs | README

detect.c (13154B)


      1 
      2 #include "wc.h"
      3 #include "iso2022.h"
      4 #include "sjis.h"
      5 #include "big5.h"
      6 #include "hz.h"
      7 #include "viet.h"
      8 #ifdef USE_UNICODE
      9 #include "utf8.h"
     10 #include "utf7.h"
     11 #endif
     12 
     13 wc_uint8 WC_DETECT_MAP[ 0x100 ] = {
     14     0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 
     15     0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 
     16     0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 
     17     0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 
     18     0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 
     19     0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 
     20     0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 
     21     0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 
     22     1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1, 
     23     1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1, 
     24     1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1, 
     25     1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1, 
     26     1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1, 
     27     1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1, 
     28     1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1, 
     29     1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1, 
     30 };
     31 
     32 #define DETECT_NORMAL	0
     33 #define DETECT_POSSIBLE	1
     34 #define DETECT_OK	2
     35 #define DETECT_BROKEN	4
     36 #define DETECT_ERROR	8
     37 #define SET_DETECT(x,y) ((x) |= (y))
     38 #define SET_BROKEN_ERROR(x) ((x) = ((x) & DETECT_BROKEN) ? DETECT_ERROR : ((x) | DETECT_BROKEN))
     39 
     40 void
     41 wc_create_detect_map(wc_ces ces, wc_bool esc)
     42 {
     43     static wc_ces detect_ces = WC_CES_US_ASCII;
     44     int i;
     45 
     46     if (ces != detect_ces) {
     47 	if (ces & WC_CES_T_VIET) {
     48 	    wc_uint8 *map = NULL;
     49 	    switch (ces) {
     50 	    case WC_CES_TCVN_5712:
     51 		map = wc_c0_tcvn57122_map;
     52 		break;
     53 	    case WC_CES_VISCII_11:
     54 		map = wc_c0_viscii112_map;
     55 		break;
     56 	    case WC_CES_VPS:
     57 		map = wc_c0_vps2_map;
     58 		break;
     59 	    }
     60 	    for (i = 0; i < 0x20; i++)
     61 		WC_DETECT_MAP[i] = map[i] ? 1 : 0;
     62 	} else {
     63 	    for (i = 0; i < 0x20; i++)
     64 		WC_DETECT_MAP[i] = 0;
     65 	    WC_DETECT_MAP[WC_C_HZ_TILDA] = (ces == WC_CES_HZ_GB_2312) ? 1 : 0;
     66 #ifdef USE_UNICODE
     67 	    WC_DETECT_MAP[WC_C_UTF7_PLUS] = (ces == WC_CES_UTF_7) ? 1 : 0;
     68 #endif
     69 	}
     70 	detect_ces = ces;
     71     }
     72     WC_DETECT_MAP[WC_C_ESC] = (esc || (ces & WC_CES_T_ISO_2022)) ? 1 : 0;
     73     return;
     74 }
     75 
     76 wc_ces
     77 wc_auto_detect(char *is, size_t len, wc_ces hint)
     78 {
     79     wc_uchar *p = (wc_uchar *)is;
     80     wc_uchar *ep = p + len;
     81     wc_uchar *q;
     82     wc_ces euc = 0, priv = 0;
     83     wc_status st;
     84     int euc_state = 0, sjis_state = 0, big5_state = 0, hz_state = 0;
     85     int iso_detect = DETECT_ERROR, euc_detect = DETECT_ERROR,
     86 	sjis_detect = DETECT_ERROR, big5_detect = DETECT_ERROR,
     87 	hz_detect = DETECT_ERROR, latin_detect = DETECT_ERROR,
     88 	priv_detect = DETECT_ERROR;
     89     int possible = 0;
     90     wc_bool iso2022jp2 = WC_FALSE, iso2022jp3 = WC_FALSE,
     91 	iso2022cn = WC_FALSE, iso2022kr = WC_FALSE, ok = WC_FALSE;
     92 #ifdef USE_UNICODE
     93     int utf8_state = 0;
     94     int utf8_detect = DETECT_ERROR;
     95     int utf8_next = 0;
     96 #endif
     97 
     98     wc_create_detect_map(hint, WC_TRUE);
     99     for (; p < ep && ! WC_DETECT_MAP[*p]; p++)
    100 	;
    101     if (p == ep)
    102 	return hint;
    103 
    104     switch (hint) {
    105     case WC_CES_ISO_2022_JP:
    106     case WC_CES_ISO_2022_JP_2:
    107     case WC_CES_ISO_2022_JP_3:
    108     case WC_CES_EUC_JP:
    109     case WC_CES_SHIFT_JIS:
    110     case WC_CES_SHIFT_JISX0213:
    111 	euc = WC_CES_EUC_JP;
    112 	euc_state = WC_EUC_NOSTATE;
    113 	sjis_state = WC_SJIS_NOSTATE;
    114 	iso_detect = euc_detect = sjis_detect = DETECT_NORMAL;
    115 	possible = 3;
    116 	break;
    117     case WC_CES_ISO_2022_CN:
    118     case WC_CES_EUC_CN:
    119 	euc = WC_CES_EUC_CN;
    120 	euc_state = WC_EUC_NOSTATE;
    121 	big5_state = WC_BIG5_NOSTATE;
    122 	iso_detect = euc_detect = big5_detect = DETECT_NORMAL;
    123 	possible = 3;
    124 	break;
    125     case WC_CES_EUC_TW:
    126     case WC_CES_BIG5:
    127 	euc = WC_CES_EUC_TW;
    128 	euc_state = WC_EUC_NOSTATE;
    129 	big5_state = WC_BIG5_NOSTATE;
    130 	iso_detect = euc_detect = big5_detect = DETECT_NORMAL;
    131 	possible = 3;
    132 	break;
    133     case WC_CES_HZ_GB_2312:
    134 	euc = WC_CES_EUC_CN;
    135 	euc_state = WC_EUC_NOSTATE;
    136 	hz_state = WC_HZ_NOSTATE;
    137 	iso_detect = euc_detect = big5_detect = hz_detect = DETECT_NORMAL;
    138 	possible = 4;
    139 	break;
    140     case WC_CES_ISO_2022_KR:
    141     case WC_CES_EUC_KR:
    142 	euc = WC_CES_EUC_KR;
    143 	euc_state = WC_EUC_NOSTATE;
    144 	iso_detect = euc_detect = DETECT_NORMAL;
    145 	possible = 3;
    146 	break;
    147 #ifdef USE_UNICODE
    148     case WC_CES_UTF_8:
    149 	iso_detect = DETECT_NORMAL;
    150 	possible = 1;
    151 	break;
    152 #endif
    153     case WC_CES_US_ASCII:
    154 	iso_detect = latin_detect = DETECT_NORMAL;
    155 	possible = 2;
    156 	break;
    157     default:
    158 	if (hint & WC_CES_T_ISO_8859) {
    159 	    iso_detect = latin_detect = DETECT_NORMAL;
    160 	    possible = 2;
    161 	} else {
    162 	    iso_detect = priv_detect = DETECT_NORMAL;
    163 	    priv = hint;	/* for TVCN, VISCII, VPS */
    164 	    possible = 2;
    165 	}
    166 	break;
    167     }
    168 #ifdef USE_UNICODE
    169     if (priv_detect == DETECT_ERROR) {
    170 	utf8_detect = DETECT_NORMAL;
    171 	possible++;
    172     }
    173 #endif
    174 
    175     wc_input_init(WC_CES_US_ASCII, &st);
    176 
    177     for (; p < ep; p++) {
    178 	if (possible == 0 || (possible == 1 && ok))
    179 	    break;
    180 	if (iso_detect != DETECT_ERROR) {
    181 	    switch (*p) {
    182 	    case WC_C_ESC:
    183 		if (*(p+1) == WC_C_MBCS) {
    184 		    q = p;
    185 		    if (! wc_parse_iso2022_esc(&q, &st))
    186 			break;
    187 		    if (st.design[0] == WC_CCS_JIS_C_6226 ||
    188 			st.design[0] == WC_CCS_JIS_X_0208)
    189 			;
    190 		    else if (st.design[0] == WC_CCS_JIS_X_0213_1 ||
    191 			     st.design[0] == WC_CCS_JIS_X_0213_2)
    192 			iso2022jp3 = WC_TRUE;
    193 		    else if (WC_CCS_TYPE(st.design[0]) == WC_CCS_A_CS94W)
    194 			iso2022jp2 = WC_TRUE;
    195 		    if (st.design[1] == WC_CCS_KS_X_1001)
    196 			iso2022kr = WC_TRUE;
    197 		    else if (st.design[1] == WC_CCS_GB_2312 ||
    198 			     st.design[1] == WC_CCS_ISO_IR_165 ||
    199 			     st.design[1] == WC_CCS_CNS_11643_1)
    200 			iso2022cn = WC_TRUE;
    201 		    if (WC_CCS_TYPE(st.design[2]) == WC_CCS_A_CS94W ||
    202 			WC_CCS_TYPE(st.design[3]) == WC_CCS_A_CS94W)
    203 			iso2022cn = WC_TRUE;
    204 		} else if (*(p+1) == WC_C_G2_CS96) {
    205 		    q = p;
    206 		    if (! wc_parse_iso2022_esc(&q, &st))
    207 			break;
    208 		    if (WC_CCS_TYPE(st.design[2]) == WC_CCS_A_CS96)
    209 			iso2022jp2 = WC_TRUE;
    210 		} else if (*(p+1) == WC_C_CSWSR) {
    211 		    q = p;
    212 		    if (! wc_parse_iso2022_esc(&q, &st))
    213 			break;
    214 		    possible = 0;
    215 		    iso_detect = DETECT_BROKEN;
    216 		    continue;
    217 		}
    218 		iso_detect = DETECT_OK;
    219 		ok = WC_TRUE;
    220 		break;
    221 	    case WC_C_SI:
    222 	    case WC_C_SO:
    223 		iso_detect = DETECT_OK;
    224 		ok = WC_TRUE;
    225 		iso2022cn = WC_TRUE;
    226 		iso2022kr = WC_TRUE;
    227 		break;
    228 	    default:
    229 		if (*p & 0x80) {
    230 		    iso_detect = DETECT_ERROR;
    231 		    possible--;
    232 		}
    233 		break;
    234 	    }
    235 	}
    236 	if (euc_detect != DETECT_ERROR) {
    237 	    switch (euc_state) {
    238 	    case WC_EUC_NOSTATE:
    239 		switch (WC_ISO_MAP[*p]) {
    240 		case WC_ISO_MAP_GR:
    241 		    euc_state = WC_EUC_MBYTE1;
    242 		    break;
    243 		case WC_ISO_MAP_SS2:
    244 		    if (euc == WC_CES_EUC_JP)
    245 			euc_state = WC_EUC_MBYTE1;
    246 		    else if (euc == WC_CES_EUC_TW)
    247 			euc_state = WC_EUC_TW_SS2;
    248 		    else
    249 			euc_detect = DETECT_ERROR;
    250 		    break;
    251 		case WC_ISO_MAP_SS3:
    252 		    if (euc == WC_CES_EUC_JP &&
    253 			WC_ISO_MAP[*(p+1)] == WC_ISO_MAP_GR)
    254 			;
    255 		    else
    256 			euc_detect = DETECT_ERROR;
    257 		    break;
    258 		case WC_ISO_MAP_C1:
    259 		case WC_ISO_MAP_GR96:
    260 		    euc_detect = DETECT_ERROR;
    261 		    break;
    262 		}
    263 		break;
    264 	    case WC_EUC_MBYTE1:
    265 		if (WC_ISO_MAP[*p] == WC_ISO_MAP_GR) {
    266 		    SET_DETECT(euc_detect, DETECT_OK);
    267 		    ok = WC_TRUE;
    268 		} else
    269 		    SET_BROKEN_ERROR(euc_detect);
    270 		euc_state = WC_EUC_NOSTATE;
    271 		break;
    272 	    case WC_EUC_TW_SS2:
    273 		if (!( 0xa0 <= *p && *p <= 0xb0) ||
    274 		    WC_ISO_MAP[*(p+1)] != WC_ISO_MAP_GR)
    275 		    euc_detect = DETECT_ERROR;
    276 		euc_state = WC_EUC_NOSTATE;
    277 		break;
    278 	    }
    279 	    if (euc_detect == DETECT_ERROR)
    280 		possible--;
    281 	}
    282 	if (sjis_detect != DETECT_ERROR) {
    283 	    switch (sjis_state) {
    284 	    case WC_SJIS_NOSTATE:
    285 		switch (WC_SJIS_MAP[*p]) {
    286 		case WC_SJIS_MAP_SL:
    287 		case WC_SJIS_MAP_SH:
    288 		    sjis_state = WC_SJIS_SHIFT_L;
    289 		    break;
    290 		case WC_SJIS_MAP_SK:
    291 		    SET_DETECT(sjis_detect, DETECT_POSSIBLE);
    292 		    break;
    293 		case WC_SJIS_MAP_SX:
    294 		    if (WcOption.use_jisx0213) {
    295 			sjis_state = WC_SJIS_SHIFT_X;
    296 			break;
    297 		    }
    298 		case WC_SJIS_MAP_80:
    299 		case WC_SJIS_MAP_A0:
    300 		case WC_SJIS_MAP_C1:
    301 		    sjis_detect = DETECT_ERROR;
    302 		    break;
    303 		}
    304 		break;
    305 	    case WC_SJIS_SHIFT_L:
    306 		if (WC_SJIS_MAP[*p] & WC_SJIS_MAP_LB) {
    307 		    SET_DETECT(sjis_detect, DETECT_OK);
    308 		    ok = WC_TRUE;
    309 		} else
    310 		    SET_BROKEN_ERROR(sjis_detect);
    311 		sjis_state = WC_SJIS_NOSTATE;
    312 		break;
    313 	    case WC_SJIS_SHIFT_X:
    314 		if (WC_SJIS_MAP[*p] & WC_SJIS_MAP_LB)
    315 		    SET_DETECT(sjis_detect, DETECT_POSSIBLE);
    316 		else
    317 		    sjis_detect = DETECT_ERROR;
    318 		sjis_state = WC_SJIS_NOSTATE;
    319 		break;
    320 	    }
    321 	    if (sjis_detect == DETECT_ERROR)
    322 		possible--;
    323 	}
    324 	if (big5_detect != DETECT_ERROR) {
    325 	    switch (big5_state) {
    326 	    case WC_BIG5_NOSTATE:
    327 		switch (WC_BIG5_MAP[*p]) {
    328 		case WC_BIG5_MAP_UB:
    329 		    big5_state = WC_BIG5_MBYTE1;
    330 		    break;
    331 		case WC_BIG5_MAP_C1:
    332 		    big5_detect = DETECT_ERROR;
    333 		    break;
    334 		}
    335 		break;
    336 	    case WC_BIG5_MBYTE1:
    337 		if (WC_BIG5_MAP[*p] & WC_BIG5_MAP_LB) {
    338 		    SET_DETECT(big5_detect, DETECT_OK);
    339 		    ok = WC_TRUE;
    340 		} else
    341 		    SET_BROKEN_ERROR(big5_detect);
    342 		big5_state = WC_BIG5_NOSTATE;
    343 		break;
    344 	    }
    345 	    if (big5_detect == DETECT_ERROR)
    346 		possible--;
    347 	}
    348 	if (hz_detect != DETECT_ERROR) {
    349 	  if (*p & 0x80) {
    350 		hz_detect = DETECT_ERROR;
    351 		possible--;
    352 	  } else {
    353 	    switch (hz_state) {
    354 	    case WC_HZ_NOSTATE:
    355 		if (*p == WC_C_HZ_TILDA)
    356 		    hz_state = WC_HZ_TILDA;
    357 		break;
    358 	    case WC_HZ_TILDA:
    359 		if (*p == WC_C_HZ_SI)
    360 		    hz_state = WC_HZ_MBYTE;
    361 		else
    362 		    hz_state = WC_HZ_NOSTATE;
    363 		break;
    364 	    case WC_HZ_TILDA_MB:
    365 		if (*p == WC_C_HZ_SO)
    366 		    hz_state = WC_HZ_NOSTATE;
    367 		else
    368 		    hz_state = WC_HZ_MBYTE;
    369 		break;
    370 	    case WC_HZ_MBYTE:
    371 		if (*p == WC_C_HZ_TILDA)
    372 		    hz_state = WC_HZ_TILDA_MB;
    373 		else
    374 		    hz_state = WC_HZ_MBYTE1;
    375 		break;
    376 	    case WC_HZ_MBYTE1:
    377 		hz_detect = DETECT_OK;
    378 		ok = WC_TRUE;
    379 		hz_state = WC_HZ_NOSTATE;
    380 		break;
    381 	    }
    382 	  }
    383 	}
    384 	if (latin_detect != DETECT_ERROR) {
    385 	    switch (WC_ISO_MAP[*p] & WC_ISO_MAP_CG) {
    386 	    case WC_ISO_MAP_GR:
    387 	    case WC_ISO_MAP_GR96:
    388 		SET_DETECT(latin_detect, DETECT_OK);
    389 		ok = WC_TRUE;
    390 		break;
    391 	    case WC_ISO_MAP_C1:
    392 		latin_detect = DETECT_ERROR;
    393 		break;
    394 	    }
    395 	    if (latin_detect == DETECT_ERROR)
    396 		possible--;
    397 	}
    398 	if (priv_detect != DETECT_ERROR) {
    399 	    if (*p != WC_C_ESC && WC_DETECT_MAP[*p]) {
    400 		SET_DETECT(priv_detect, DETECT_OK);
    401 		ok = WC_TRUE;
    402 	    }
    403 /*
    404 	    if (priv_detect == DETECT_ERROR)
    405 		possible--;
    406 */
    407 	}
    408 #ifdef USE_UNICODE
    409 	if (utf8_detect != DETECT_ERROR) {
    410 	    switch (utf8_state) {
    411 	    case WC_UTF8_NOSTATE:
    412 		switch (utf8_next = WC_UTF8_MAP[*p]) {
    413 		case 1:
    414 		case 8:
    415 		    break;
    416 		case 0:
    417 		case 7:
    418 		    utf8_detect = DETECT_ERROR;
    419 		    break;
    420 		default:
    421 		    utf8_next--;
    422 		    utf8_state = WC_UTF8_NEXT;
    423 		    break;
    424 		}
    425 		break;
    426 	    case WC_UTF8_NEXT:
    427 		if (WC_UTF8_MAP[*p]) {
    428 		    utf8_detect = DETECT_ERROR;
    429 		    utf8_state = WC_UTF8_NOSTATE;
    430 		    break;
    431 		}
    432 		utf8_next--;
    433 		if (! utf8_next) {
    434 		    SET_DETECT(utf8_detect, DETECT_OK);
    435 		    ok = WC_TRUE;
    436 		    utf8_state = WC_UTF8_NOSTATE;
    437 		}
    438 		break;
    439 	    }
    440 	    if (utf8_detect == DETECT_ERROR)
    441 		possible--;
    442 	}
    443 #endif
    444     }
    445 
    446     if (iso_detect != DETECT_ERROR) {
    447 	if (iso_detect == DETECT_NORMAL) {
    448 	   if (hz_detect == DETECT_OK)
    449 		return WC_CES_HZ_GB_2312;
    450 	   if (priv_detect == DETECT_OK)
    451 		return priv;
    452 	   return WC_CES_US_ASCII;
    453 	}
    454 	switch (euc) {
    455 	case WC_CES_EUC_CN:
    456 	case WC_CES_EUC_TW:
    457 	    if (iso2022cn)
    458 		return WC_CES_ISO_2022_CN;
    459 	    break;
    460 	case WC_CES_EUC_KR:
    461 	    if (iso2022kr)
    462 		return WC_CES_ISO_2022_KR;
    463 	    break;
    464 	}
    465 	if (iso2022jp3)
    466 	    return WC_CES_ISO_2022_JP_3;
    467 	if (iso2022jp2)
    468 	    return WC_CES_ISO_2022_JP_2;
    469 	if (iso2022cn)
    470 	    return WC_CES_ISO_2022_CN;
    471 	if (iso2022kr)
    472 	    return WC_CES_ISO_2022_KR;
    473 	return WC_CES_ISO_2022_JP;
    474     }
    475     switch (hint) {
    476     case WC_CES_ISO_2022_JP:
    477     case WC_CES_ISO_2022_JP_2:
    478     case WC_CES_ISO_2022_JP_3:
    479     case WC_CES_ISO_2022_KR:
    480     case WC_CES_ISO_2022_CN:
    481 	break;
    482     case WC_CES_EUC_JP:
    483     case WC_CES_EUC_CN:
    484     case WC_CES_EUC_TW:
    485     case WC_CES_EUC_KR:
    486 	if (euc_detect != DETECT_ERROR)
    487 	    return hint;
    488 	break;
    489     case WC_CES_SHIFT_JIS:
    490     case WC_CES_SHIFT_JISX0213:
    491 	if (sjis_detect != DETECT_ERROR)
    492 	    return hint;
    493 	break;
    494     case WC_CES_BIG5:
    495 	if (big5_detect != DETECT_ERROR)
    496 	    return hint;
    497 	break;
    498 #ifdef USE_UNICODE
    499     case WC_CES_UTF_8:
    500 	return hint;
    501 #endif
    502     case WC_CES_US_ASCII:
    503 #ifdef USE_UNICODE
    504 	if (utf8_detect != DETECT_ERROR)
    505 	    return hint;
    506 #endif
    507 	if (latin_detect != DETECT_ERROR)
    508 	    return WC_CES_ISO_8859_1;
    509 	return hint;
    510     default:
    511 	if (latin_detect != DETECT_ERROR)
    512 	    return hint;
    513 	if (priv_detect != DETECT_ERROR)
    514 	    return hint;
    515 #ifdef USE_UNICODE
    516 	if (utf8_detect != DETECT_ERROR)
    517 	    return WC_CES_UTF_8;
    518 #endif
    519 	return hint;
    520     }
    521     if (euc_detect == DETECT_OK)
    522 	return euc;
    523     if (sjis_detect == DETECT_OK)
    524 	return WC_CES_SHIFT_JIS;
    525     if (big5_detect == DETECT_OK)
    526 	return WC_CES_BIG5;
    527 #ifdef USE_UNICODE
    528     if (utf8_detect == DETECT_OK)
    529 	return WC_CES_UTF_8;
    530     if (sjis_detect & DETECT_POSSIBLE)
    531 	return WC_CES_SHIFT_JIS;
    532 #endif
    533     if (euc_detect != DETECT_ERROR)
    534 	return euc;
    535     if (sjis_detect != DETECT_ERROR)
    536 	return WC_CES_SHIFT_JIS;
    537     if (big5_detect != DETECT_ERROR)
    538 	return WC_CES_BIG5;
    539 #ifdef USE_UNICODE
    540     if (utf8_detect != DETECT_ERROR)
    541 	return WC_CES_UTF_8;
    542 #endif
    543     return hint;
    544 }