w3m

Unnamed repository; edit this file to name it for gitweb.
git clone https://logand.com/git/w3m.git/
Log | Files | Refs | README

charset.c (11311B)


      1 
      2 #include <stdlib.h>
      3 #include <ctype.h>
      4 #include <gc.h>
      5 #define New_N(type,n) ((type*)GC_MALLOC((n)*sizeof(type)))
      6 
      7 #include "wc.h"
      8 
      9 #ifdef HAVE_LANGINFO_CODESET
     10 #include <langinfo.h>
     11 #endif
     12 
     13 wc_locale WcLocale = 0;
     14 
     15 static struct {
     16   char *lang;
     17   wc_ces ces;
     18 } lang_ces_table[] = {
     19   { "cs", WC_CES_ISO_8859_2 },	/* cs_CZ */
     20   { "el", WC_CES_ISO_8859_7 },	/* el_GR */
     21   { "iw", WC_CES_ISO_8859_8 },	/* iw_IL */
     22   { "ja", WC_CES_EUC_JP },	/* ja_JP */
     23   { "ko", WC_CES_EUC_KR },	/* ko_KR */
     24   { "hu", WC_CES_ISO_8859_2 },	/* hu_HU */
     25   { "pl", WC_CES_ISO_8859_2 },	/* pl_PL */
     26   { "ro", WC_CES_ISO_8859_2 },	/* ro_RO */
     27   { "ru", WC_CES_ISO_8859_5 },	/* ru_SU */
     28   { "sk", WC_CES_ISO_8859_2 },	/* sk_SK */
     29   { "sl", WC_CES_ISO_8859_2 },	/* sl_CS */
     30   { "tr", WC_CES_ISO_8859_9 },	/* tr_TR */
     31   { "zh", WC_CES_EUC_CN },	/* zh_CN */
     32   { NULL, 0 }
     33 };
     34 
     35 static wc_ces
     36 wc_codepage(int n)
     37 {
     38 	switch (n) {
     39 	case 437: return WC_CES_CP437;
     40 	case 737: return WC_CES_CP737;
     41 	case 775: return WC_CES_CP775;
     42 	case 850: return WC_CES_CP850;
     43 	case 852: return WC_CES_CP852;
     44 	case 855: return WC_CES_CP855;
     45 	case 856: return WC_CES_CP856;
     46 	case 857: return WC_CES_CP857;
     47 	case 860: return WC_CES_CP860;
     48 	case 861: return WC_CES_CP861;
     49 	case 862: return WC_CES_CP862;
     50 	case 863: return WC_CES_CP863;
     51 	case 864: return WC_CES_CP864;
     52 	case 865: return WC_CES_CP865;
     53 	case 866: return WC_CES_CP866;
     54 	case 869: return WC_CES_CP869;
     55 	case 874: return WC_CES_CP874;
     56 	case 932: return WC_CES_CP932;		/* CP932 = Shift_JIS */
     57 	case 936: return WC_CES_CP936;		/* CP936 = GBK > EUC_CN */
     58 	case 943: return WC_CES_CP943;		/* CP943 = Shift_JIS */
     59 	case 949: return WC_CES_CP949;		/* CP949 = UHC > EUC_KR */
     60 	case 950: return WC_CES_CP950;		/* CP950 = Big5 */
     61 	case 1006: return WC_CES_CP1006;
     62 	case 1250: return WC_CES_CP1250;
     63 	case 1251: return WC_CES_CP1251;
     64 	case 1252: return WC_CES_CP1252;
     65 	case 1253: return WC_CES_CP1253;
     66 	case 1254: return WC_CES_CP1254;
     67 	case 1255: return WC_CES_CP1255;
     68 	case 1256: return WC_CES_CP1256;
     69 	case 1257: return WC_CES_CP1257;
     70 	case 1258: return WC_CES_CP1258;
     71 	}
     72 	return 0;
     73 }
     74 
     75 wc_ces
     76 wc_guess_charset(char *charset, wc_ces orig)
     77 {
     78     wc_ces guess;
     79 
     80     if (charset == NULL || *charset == '\0')
     81 	return orig;
     82     guess = wc_charset_to_ces(charset);
     83     return guess ? guess : orig;
     84 }
     85 
     86 wc_ces
     87 wc_guess_charset_short(char *charset, wc_ces orig)
     88 {
     89     wc_ces guess;
     90 
     91     if (charset == NULL || *charset == '\0')
     92 	return orig;
     93     guess = wc_charset_short_to_ces(charset);
     94     return guess ? guess : orig;
     95 }
     96 
     97 wc_ces
     98 wc_guess_locale_charset(char *locale, wc_ces orig)
     99 {
    100     wc_ces guess;
    101 
    102     if (locale == NULL || *locale == '\0')
    103 	return orig;
    104     guess = wc_locale_to_ces(locale);
    105     return guess ? guess : orig;
    106 }
    107 
    108 wc_ces
    109 wc_charset_to_ces(char *charset)
    110 {
    111     char *p = charset;
    112     char buf[16];
    113     int n;
    114 
    115     if (tolower(*p) == 'x' && *(p+1) == '-')
    116 	p += 2;
    117     for (n = 0; *p && n < 15; p++) {
    118 	if ((unsigned char)*p > 0x20 && *p != '_' && *p != '-')
    119 	    buf[n++] = tolower(*p);
    120     }
    121     buf[n] = 0;
    122     p = buf;
    123     switch (*p) {
    124     case 'e':
    125 	if (! strncmp(p, "euc", 3)) {
    126 	    p += 3;
    127 	    switch (*p) {
    128 	    case 'j': return WC_CES_EUC_JP;
    129 	    case 'c': return WC_CES_EUC_CN;
    130 	    case 't': return WC_CES_EUC_TW;
    131 	    case 'k': return WC_CES_EUC_KR;
    132 	    }
    133 	    switch (WcLocale) {
    134 	    case WC_LOCALE_JA_JP: return WC_CES_EUC_JP;
    135 	    case WC_LOCALE_ZH_CN: return WC_CES_EUC_CN;
    136 	    case WC_LOCALE_ZH_TW: return WC_CES_EUC_TW;
    137 	    case WC_LOCALE_ZH_HK: return WC_CES_EUC_CN;
    138 	    case WC_LOCALE_KO_KR: return WC_CES_EUC_KR;
    139 	    }
    140 	    return WC_CES_EUC_JP;
    141         }
    142 	break;
    143     case 'i':
    144 	if (! strncmp(p, "iso2022", 7)) {
    145 	    p += 7;
    146 	    switch (*p) {
    147 	    case 'j':
    148 		if (! strncmp(p, "jp2", 3))
    149 		    return WC_CES_ISO_2022_JP_2;
    150 		if (! strncmp(p, "jp3", 3))
    151 		    return WC_CES_ISO_2022_JP_3;
    152 		return WC_CES_ISO_2022_JP;
    153 	    case 'c': return WC_CES_ISO_2022_CN;
    154 	    case 'k': return WC_CES_ISO_2022_KR;
    155 	    }
    156 	    return WC_CES_ISO_2022_JP;
    157 	} else if (! strncmp(p, "iso8859", 7)) {
    158 	    n = atoi(p + 7);
    159 	    if (n >= 1 && n <= 16 && n != 12)
    160 		return (WC_CES_E_ISO_8859 | n);
    161 	    return WC_CES_ISO_8859_1;
    162 	} else if (! strncmp(p, "ibm", 3)) {
    163 	    p += 3;
    164 	    if (*p >= '1' && *p <= '9')
    165 	    	return wc_codepage(atoi(p));
    166 	    return wc_charset_to_ces(p);
    167 	}
    168 	break;
    169     case 'j':
    170 	if (! strncmp(p, "johab", 5))
    171 	    return WC_CES_JOHAB;
    172 	if (! strncmp(p, "jis", 3))
    173 	    return WC_CES_ISO_2022_JP;
    174 	break;
    175     case 's':
    176 	if (! strncmp(p, "shiftjisx0213", 13) ||
    177 	    ! strncmp(p, "sjisx0213", 9))
    178 	    return WC_CES_SHIFT_JISX0213;
    179 	if (! strncmp(p, "shiftjis", 8) ||
    180 	    ! strncmp(p, "sjis", 4))
    181 	    return WC_CES_SHIFT_JIS;
    182 	break;
    183     case 'p':
    184 	if (! strncmp(p, "pck", 3))
    185 	    return WC_CES_SHIFT_JIS;
    186 	break;
    187     case 'g':
    188 	if (! strncmp(p, "gb18030", 7) ||
    189 	    ! strncmp(p, "gbk2k", 5))
    190 	    return WC_CES_GB18030;
    191 	if (! strncmp(p, "gbk", 3))
    192 	    return WC_CES_GBK;
    193 	if (! strncmp(p, "gb2312", 6))
    194 	    return WC_CES_EUC_CN;
    195 	break;
    196     case 'b':
    197 	if (! strncmp(p, "big5hkscs", 9))
    198 	    return WC_CES_HKSCS;
    199 	if (! strncmp(p, "big5", 4))
    200 	    return WC_CES_BIG5;
    201 	break;
    202     case 'h':
    203 	if (! strncmp(p, "hz", 2))
    204 	    return WC_CES_HZ_GB_2312;
    205 	if (! strncmp(p, "hkscs", 5))
    206 	    return WC_CES_HKSCS;
    207 	break;
    208     case 'k':
    209 	if (! strncmp(p, "koi8r", 5))
    210 	    return WC_CES_KOI8_R;
    211 	if (! strncmp(p, "koi8u", 5))
    212 	    return WC_CES_KOI8_U;
    213 	if (! strncmp(p, "ksx1001", 7))
    214 	    return WC_CES_EUC_KR;
    215 	if (! strncmp(p, "ksc5601", 7))
    216 	    return WC_CES_EUC_KR;
    217 	break;
    218     case 't':
    219 	if (! strncmp(p, "tis620", 6))
    220 	    return WC_CES_TIS_620;
    221 	if (! strncmp(p, "tcvn", 4))
    222 	    return WC_CES_TCVN_5712;
    223 	break;
    224     case 'n':
    225 	if (! strncmp(p, "next", 4))
    226 	    return WC_CES_NEXTSTEP;
    227 	break;
    228     case 'v':
    229 	if (! strncmp(p, "viet", 4)) {
    230 	    p += 4;
    231 	    if (! strncmp(p, "tcvn", 4))
    232 		return WC_CES_TCVN_5712;
    233 	}
    234 	if (! strncmp(p, "viscii", 6))
    235 	    return WC_CES_VISCII_11;
    236 	if (! strncmp(p, "vps", 3))
    237 	    return WC_CES_VPS;
    238 	break;
    239     case 'u':
    240 #ifdef USE_UNICODE
    241 	if (! strncmp(p, "utf8", 4))
    242 	    return WC_CES_UTF_8;
    243 	if (! strncmp(p, "utf7", 4))
    244 	    return WC_CES_UTF_7;
    245 #endif
    246 	if (! strncmp(p, "uhc", 3))
    247 	    return WC_CES_UHC;
    248 	if (! strncmp(p, "ujis", 4))
    249 	    return WC_CES_EUC_JP;
    250 	if (! strncmp(p, "usascii", 7))
    251 	    return WC_CES_US_ASCII;
    252 	break;
    253     case 'a':
    254 	if (! strncmp(p, "ascii", 5))
    255 	    return WC_CES_US_ASCII;
    256 	break;
    257     case 'c':
    258 	if (! strncmp(p, "cngb", 4))
    259 	    return WC_CES_EUC_CN;
    260 	if (*(p+1) != 'p')
    261 	    break;
    262 	p += 2;
    263 	if (*p >= '1' &&  *p <= '9')
    264 	    return wc_codepage(atoi(p));
    265 	break;
    266     case 'w':
    267 	if (strncmp(p, "windows", 7))
    268 	    break;
    269 	p += 7;
    270 	if (! strncmp(p, "31j", 3))
    271 	    return WC_CES_CP932;
    272 	if (*p >= '1' &&  *p <= '9')
    273 	    return wc_codepage(atoi(p));
    274 	break;
    275     }
    276     return 0;
    277 }
    278 
    279 wc_ces
    280 wc_charset_short_to_ces(char *charset)
    281 {
    282     char *p = charset;
    283     char buf[16];
    284     wc_ces ces;
    285     int n;
    286 
    287     ces = wc_charset_to_ces(charset);
    288     if (ces)
    289 	return ces;
    290 
    291     for (n = 0; *p && n < 15; p++) {
    292 	if ((unsigned char)*p > 0x20 && *p != '_' && *p != '-')
    293 	    buf[n++] = tolower(*p);
    294     }
    295     buf[n] = 0;
    296     p = buf;
    297     switch (*p) {
    298     case 'e':
    299 	switch (*(p+1)) {
    300 	case 'j': return WC_CES_EUC_JP;
    301 	case 'c': return WC_CES_EUC_CN;
    302 	case 't': return WC_CES_EUC_TW;
    303 	case 'k': return WC_CES_EUC_KR;
    304 	}
    305 	return WC_CES_EUC_JP;
    306     case 'j':
    307 	p++;
    308 	if (*p == 'o')
    309 	    return WC_CES_JOHAB;
    310 	if (*p == 'p')
    311 	   p++;
    312 	if (*p == '2')
    313 	   return WC_CES_ISO_2022_JP_2;
    314 	if (*p == '3')
    315 	   return WC_CES_ISO_2022_JP_3;
    316 	return WC_CES_ISO_2022_JP;
    317     case 's':
    318 	return WC_CES_SHIFT_JIS;
    319     case 'g':
    320 	return WC_CES_EUC_CN;
    321     case 'b':
    322 	return WC_CES_BIG5;
    323     case 'h':
    324 	if (*(p+1) == 'k')
    325 	    return WC_CES_HKSCS;
    326 	return WC_CES_HZ_GB_2312;
    327     case 'k':
    328 	if (*(p+1) == 'o')
    329 	    return WC_CES_KOI8_R;
    330 	return WC_CES_ISO_2022_KR;
    331     case 'l':
    332 	n = atoi(p + 1);
    333 	if (n >= 1 && n <= 16 && n != 12)
    334 	    return (WC_CES_E_ISO_8859 | n);
    335 	return WC_CES_ISO_8859_1;
    336     case 't':
    337 	if (*(p+1) == 'c')
    338 	    return WC_CES_TCVN_5712;
    339 	return WC_CES_TIS_620;
    340     case 'n':
    341 	return WC_CES_NEXTSTEP;
    342     case 'v':
    343 	if (*(p+1) == 'p')
    344 	    return WC_CES_VPS;
    345 	return WC_CES_VISCII_11;
    346 #ifdef USE_UNICODE
    347     case 'u':
    348 	if (*(p+1) == '7')
    349 	    return WC_CES_UTF_7;
    350 	return WC_CES_UTF_8;
    351 #endif
    352     case 'a':
    353 	return WC_CES_US_ASCII;
    354     case 'c':
    355 	return WC_CES_ISO_2022_CN;
    356     case 'w':
    357 	p++;
    358 	if (*p >= '1' &&  *p <= '9')
    359 	    return wc_codepage(atoi(p));
    360 	break;
    361     case 'r':
    362 	return WC_CES_RAW;
    363     }
    364     return 0;
    365 }
    366 
    367 wc_ces
    368 wc_locale_to_ces(char *locale)
    369 {
    370     char *p = locale;
    371     char buf[8];
    372     int n;
    373 
    374     if (*p == 'C' && *(p+1) == '\0')
    375 	return WC_CES_US_ASCII;
    376 #ifdef HAVE_LANGINFO_CODESET
    377     {
    378 	char *cs = nl_langinfo(CODESET);
    379 	if (cs && strcmp(cs, "US-ASCII"))
    380 	    return wc_charset_to_ces(cs);
    381     }
    382 #endif
    383     for (n = 0; *p && *p != '.' && n < 7; p++) {
    384 	if ((unsigned char)*p > 0x20)
    385 	    buf[n++] = tolower(*p);
    386     }
    387     buf[n] = 0;
    388     if (*p == '.') {
    389 	p++;
    390 	if (! strcasecmp(p, "euc")) {
    391 	    switch (buf[0]) {
    392 	    case 'j':
    393 		WcLocale = WC_LOCALE_JA_JP;
    394 		break;
    395 	    case 'k':
    396 		WcLocale = WC_LOCALE_KO_KR;
    397 		break;
    398 	    case 'z':
    399 	        if (!strcmp(buf, "zh_tw"))
    400 		    WcLocale = WC_LOCALE_ZH_TW;
    401 	        else if (!strcmp(buf, "zh_hk"))
    402 		    WcLocale = WC_LOCALE_ZH_HK;
    403 		else
    404 		    WcLocale = WC_LOCALE_ZH_CN;
    405 		break;
    406 	    default:
    407 		WcLocale = 0;
    408 		break;
    409 	    }
    410 	}
    411 	return wc_charset_to_ces(p);
    412     }
    413 
    414     if (!strcmp(buf, "japanese"))
    415 	return WC_CES_SHIFT_JIS;
    416     if (!strcmp(buf, "zh_tw") ||
    417 	!strcmp(buf, "zh_hk"))
    418 	return WC_CES_BIG5;
    419     for (n = 0; lang_ces_table[n].lang; n++) {
    420 	if (!strncmp(buf, lang_ces_table[n].lang, 2))
    421 	    return lang_ces_table[n].ces;
    422     }
    423     return WC_CES_ISO_8859_1;
    424 }
    425 
    426 char *
    427 wc_ces_to_charset(wc_ces ces)
    428 {
    429     if (ces == WC_CES_WTF)
    430 	return "WTF";
    431     return WcCesInfo[WC_CES_INDEX(ces)].name;
    432 }
    433 
    434 char *
    435 wc_ces_to_charset_desc(wc_ces ces)
    436 {
    437     if (ces == WC_CES_WTF)
    438 	return "W3M Transfer Format";
    439     return WcCesInfo[WC_CES_INDEX(ces)].desc;
    440 }
    441 
    442 wc_ces
    443 wc_guess_8bit_charset(wc_ces orig)
    444 {
    445     switch (orig) {
    446     case WC_CES_ISO_2022_JP:
    447     case WC_CES_ISO_2022_JP_2:
    448     case WC_CES_ISO_2022_JP_3:
    449 	return WC_CES_EUC_JP;
    450     case WC_CES_ISO_2022_KR:
    451 	return WC_CES_EUC_KR;
    452     case WC_CES_ISO_2022_CN:
    453     case WC_CES_HZ_GB_2312:
    454 	return WC_CES_EUC_CN;
    455     case WC_CES_US_ASCII:
    456 	return WC_CES_ISO_8859_1;
    457     }
    458     return orig;
    459 }
    460 
    461 wc_bool
    462 wc_check_ces(wc_ces ces)
    463 {
    464     size_t i = WC_CES_INDEX(ces);
    465 
    466     return (i <= WC_CES_END && WcCesInfo[i].id == ces);
    467 }
    468 
    469 static int
    470 wc_ces_list_cmp(const void *a, const void *b)
    471 {
    472     return strcasecmp(((wc_ces_list *)a)->desc, ((wc_ces_list *)b)->desc);
    473 }
    474 
    475 static wc_ces_list *list = NULL;
    476 
    477 wc_ces_list *
    478 wc_get_ces_list(void)
    479 {
    480     wc_ces_info *info;
    481     size_t n;
    482 
    483     if (list)
    484 	return list;
    485     for (info = WcCesInfo, n = 0; info->id; info++) {
    486 	if (info->name != NULL)
    487 	    n++;
    488     }
    489     list = New_N(wc_ces_list, n + 1);
    490     for (info = WcCesInfo, n = 0; info->id; info++) {
    491 	if (info->name != NULL) {
    492 	    list[n].id = info->id;
    493 	    list[n].name = info->name;
    494 	    list[n].desc = info->desc;
    495 	    n++;
    496 	}
    497     }
    498     list[n].id = 0;
    499     list[n].name = NULL;
    500     list[n].desc = NULL;
    501     qsort(list, n, sizeof(wc_ces_list), wc_ces_list_cmp);
    502     return list;
    503 }