1
Fork 0
mirror of git://git.sv.gnu.org/emacs.git synced 2026-01-03 18:41:25 -08:00

(detect_coding_iso2022): New arg latin_extra_code_state. Allow Latin
extra codes only when *latin_extra_code_state is nonzero.
(detect_coding_mask): If there is a null byte, detect the encoding as
UTF-16 or binary.  If there is a Latin extra code, detect the encoding
as ISO-2022 only when no other proper encoding is found.
This commit is contained in:
Kenichi Handa 2008-01-09 06:05:23 +00:00
parent ca8dfeda7c
commit 36a04480a5

View file

@ -1406,12 +1406,17 @@ enum iso_code_class_type iso_code_class[256];
CODING_CATEGORY_MASK_ISO_7_ELSE CODING_CATEGORY_MASK_ISO_7_ELSE
CODING_CATEGORY_MASK_ISO_8_ELSE CODING_CATEGORY_MASK_ISO_8_ELSE
are set. If a code which should never appear in ISO2022 is found, are set. If a code which should never appear in ISO2022 is found,
returns 0. */ returns 0.
If *latin_extra_code_state is zero and Latin extra codes are found,
set *latin_extra_code_state to 1 and return 0. If it is nonzero,
accept Latin extra codes. */
static int static int
detect_coding_iso2022 (src, src_end, multibytep) detect_coding_iso2022 (src, src_end, multibytep, latin_extra_code_state)
unsigned char *src, *src_end; unsigned char *src, *src_end;
int multibytep; int multibytep;
int *latin_extra_code_state;
{ {
int mask = CODING_CATEGORY_MASK_ISO; int mask = CODING_CATEGORY_MASK_ISO;
int mask_found = 0; int mask_found = 0;
@ -1574,6 +1579,11 @@ detect_coding_iso2022 (src, src_end, multibytep)
if (VECTORP (Vlatin_extra_code_table) if (VECTORP (Vlatin_extra_code_table)
&& !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
{ {
if (! *latin_extra_code_state)
{
*latin_extra_code_state = 1;
return 0;
}
if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
& CODING_FLAG_ISO_LATIN_EXTRA) & CODING_FLAG_ISO_LATIN_EXTRA)
newmask |= CODING_CATEGORY_MASK_ISO_8_1; newmask |= CODING_CATEGORY_MASK_ISO_8_1;
@ -1600,6 +1610,11 @@ detect_coding_iso2022 (src, src_end, multibytep)
{ {
int newmask = 0; int newmask = 0;
if (! *latin_extra_code_state)
{
*latin_extra_code_state = 1;
return 0;
}
if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
& CODING_FLAG_ISO_LATIN_EXTRA) & CODING_FLAG_ISO_LATIN_EXTRA)
newmask |= CODING_CATEGORY_MASK_ISO_8_1; newmask |= CODING_CATEGORY_MASK_ISO_8_1;
@ -4127,6 +4142,8 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
unsigned char *src = source, *src_end = source + src_bytes; unsigned char *src = source, *src_end = source + src_bytes;
unsigned int mask, utf16_examined_p, iso2022_examined_p; unsigned int mask, utf16_examined_p, iso2022_examined_p;
int i; int i;
int null_byte_found;
int latin_extra_code_state = 1;
/* At first, skip all ASCII characters and control characters except /* At first, skip all ASCII characters and control characters except
for three ISO2022 specific control characters. */ for three ISO2022 specific control characters. */
@ -4135,21 +4152,32 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
ascii_skip_code[ISO_CODE_ESC] = 0; ascii_skip_code[ISO_CODE_ESC] = 0;
label_loop_detect_coding: label_loop_detect_coding:
while (src < src_end && ascii_skip_code[*src]) src++; null_byte_found = 0;
while (src < src_end && ascii_skip_code[*src])
null_byte_found |= (! *src++);
if (! null_byte_found)
{
unsigned char *p = src + 1;
while (p < src_end)
null_byte_found |= (! *p++);
}
*skip = src - source; *skip = src - source;
if (src >= src_end) if (src >= src_end)
/* We found nothing other than ASCII. There's nothing to do. */ /* We found nothing other than ASCII (and NULL byte). There's
nothing to do. */
return 0; return 0;
c = *src; c = *src;
/* The text seems to be encoded in some multilingual coding system. /* The text seems to be encoded in some multilingual coding system.
Now, try to find in which coding system the text is encoded. */ Now, try to find in which coding system the text is encoded. */
if (c < 0x80) if (! null_byte_found && c < 0x80)
{ {
/* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */ /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
/* C is an ISO2022 specific control code of C0. */ /* C is an ISO2022 specific control code of C0. */
mask = detect_coding_iso2022 (src, src_end, multibytep); latin_extra_code_state = 1;
mask = detect_coding_iso2022 (src, src_end, multibytep,
&latin_extra_code_state);
if (mask == 0) if (mask == 0)
{ {
/* No valid ISO2022 code follows C. Try again. */ /* No valid ISO2022 code follows C. Try again. */
@ -4177,21 +4205,27 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
if (multibytep && c == LEADING_CODE_8_BIT_CONTROL) if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
c = src[1] - 0x20; c = src[1] - 0x20;
if (c < 0xA0) if (null_byte_found)
{
try = (CODING_CATEGORY_MASK_UTF_16_BE
| CODING_CATEGORY_MASK_UTF_16_LE);
}
else if (c < 0xA0)
{ {
/* C is the first byte of SJIS character code, /* C is the first byte of SJIS character code,
or a leading-code of Emacs' internal format (emacs-mule), or a leading-code of Emacs' internal format (emacs-mule),
or the first byte of UTF-16. */ or the first byte of UTF-16. */
try = (CODING_CATEGORY_MASK_SJIS try = (CODING_CATEGORY_MASK_SJIS
| CODING_CATEGORY_MASK_EMACS_MULE | CODING_CATEGORY_MASK_EMACS_MULE
| CODING_CATEGORY_MASK_UTF_16_BE | CODING_CATEGORY_MASK_UTF_16_BE
| CODING_CATEGORY_MASK_UTF_16_LE); | CODING_CATEGORY_MASK_UTF_16_LE);
/* Or, if C is a special latin extra code, /* Or, if C is a special latin extra code,
or is an ISO2022 specific control code of C1 (SS2 or SS3), or is an ISO2022 specific control code of C1 (SS2 or SS3),
or is an ISO2022 control-sequence-introducer (CSI), or is an ISO2022 control-sequence-introducer (CSI),
we should also consider the possibility of ISO2022 codings. */ we should also consider the possibility of ISO2022 codings. */
if ((VECTORP (Vlatin_extra_code_table) if ((latin_extra_code_state
&& VECTORP (Vlatin_extra_code_table)
&& !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
|| (c == ISO_CODE_SS2 || c == ISO_CODE_SS3) || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
|| (c == ISO_CODE_CSI || (c == ISO_CODE_CSI
@ -4201,7 +4235,7 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
&& src + 1 < src_end && src + 1 < src_end
&& src[1] == ']'))))) && src[1] == ']')))))
try |= (CODING_CATEGORY_MASK_ISO_8_ELSE try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
| CODING_CATEGORY_MASK_ISO_8BIT); | CODING_CATEGORY_MASK_ISO_8BIT);
} }
else else
/* C is a character of ISO2022 in graphic plane right, /* C is a character of ISO2022 in graphic plane right,
@ -4209,29 +4243,36 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
or the first byte of BIG5's 2-byte code, or the first byte of BIG5's 2-byte code,
or the first byte of UTF-8/16. */ or the first byte of UTF-8/16. */
try = (CODING_CATEGORY_MASK_ISO_8_ELSE try = (CODING_CATEGORY_MASK_ISO_8_ELSE
| CODING_CATEGORY_MASK_ISO_8BIT | CODING_CATEGORY_MASK_ISO_8BIT
| CODING_CATEGORY_MASK_SJIS | CODING_CATEGORY_MASK_SJIS
| CODING_CATEGORY_MASK_BIG5 | CODING_CATEGORY_MASK_BIG5
| CODING_CATEGORY_MASK_UTF_8 | CODING_CATEGORY_MASK_UTF_8
| CODING_CATEGORY_MASK_UTF_16_BE | CODING_CATEGORY_MASK_UTF_16_BE
| CODING_CATEGORY_MASK_UTF_16_LE); | CODING_CATEGORY_MASK_UTF_16_LE);
/* Or, we may have to consider the possibility of CCL. */ /* Or, we may have to consider the possibility of CCL. */
if (coding_system_table[CODING_CATEGORY_IDX_CCL] if (! null_byte_found
&& coding_system_table[CODING_CATEGORY_IDX_CCL]
&& (coding_system_table[CODING_CATEGORY_IDX_CCL] && (coding_system_table[CODING_CATEGORY_IDX_CCL]
->spec.ccl.valid_codes)[c]) ->spec.ccl.valid_codes)[c])
try |= CODING_CATEGORY_MASK_CCL; try |= CODING_CATEGORY_MASK_CCL;
mask = 0; mask = 0;
utf16_examined_p = iso2022_examined_p = 0;
if (priorities) if (priorities)
{ {
/* At first try detection with Latin extra codes not-allowed.
If no proper coding system is found because of Latin extra
codes, try detection with Latin extra codes allowed. */
latin_extra_code_state = 0;
label_retry:
utf16_examined_p = iso2022_examined_p = 0;
for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++) for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
{ {
if (!iso2022_examined_p if (!iso2022_examined_p
&& (priorities[i] & try & CODING_CATEGORY_MASK_ISO)) && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
{ {
mask |= detect_coding_iso2022 (src, src_end, multibytep); mask |= detect_coding_iso2022 (src, src_end, multibytep,
&latin_extra_code_state);
iso2022_examined_p = 1; iso2022_examined_p = 1;
} }
else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS) else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
@ -4252,16 +4293,40 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL) else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
mask |= detect_coding_ccl (src, src_end, multibytep); mask |= detect_coding_ccl (src, src_end, multibytep);
else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT) else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
mask |= CODING_CATEGORY_MASK_RAW_TEXT; {
if (latin_extra_code_state == 1)
{
/* Detection of ISO-2022 based coding system
failed because of Latin extra codes. Before
falling back to raw-text, try again with
Latin extra codes allowed. */
latin_extra_code_state = 2;
try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE
| CODING_CATEGORY_MASK_ISO_8BIT);
goto label_retry;
}
mask |= CODING_CATEGORY_MASK_RAW_TEXT;
}
else if (priorities[i] & CODING_CATEGORY_MASK_BINARY) else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
mask |= CODING_CATEGORY_MASK_BINARY; {
if (latin_extra_code_state == 1)
{
/* See the above comment. */
latin_extra_code_state = 2;
try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE
| CODING_CATEGORY_MASK_ISO_8BIT);
goto label_retry;
}
mask |= CODING_CATEGORY_MASK_BINARY;
}
if (mask & priorities[i]) if (mask & priorities[i])
return priorities[i]; return priorities[i];
} }
return CODING_CATEGORY_MASK_RAW_TEXT; return CODING_CATEGORY_MASK_RAW_TEXT;
} }
if (try & CODING_CATEGORY_MASK_ISO) if (try & CODING_CATEGORY_MASK_ISO)
mask |= detect_coding_iso2022 (src, src_end, multibytep); mask |= detect_coding_iso2022 (src, src_end, multibytep,
&latin_extra_code_state);
if (try & CODING_CATEGORY_MASK_SJIS) if (try & CODING_CATEGORY_MASK_SJIS)
mask |= detect_coding_sjis (src, src_end, multibytep); mask |= detect_coding_sjis (src, src_end, multibytep);
if (try & CODING_CATEGORY_MASK_BIG5) if (try & CODING_CATEGORY_MASK_BIG5)