mirror of
git://git.sv.gnu.org/emacs.git
synced 2026-04-28 01:00:52 -07:00
(detect_coding_iso2022): New arg
latin_extra_code_state. Allow Latin extra codes only when *latin_extra_code_state is nonzero. (detect_coding_mask): If there is a NULL byte, detect the encoding as UTF-16 or binary. If Latin extra codes exist, detect the encoding as ISO-2022 only when there's no other proper encoding is found.
This commit is contained in:
parent
69c43ff730
commit
5a7a1dde2c
2 changed files with 103 additions and 24 deletions
|
|
@ -1,3 +1,13 @@
|
|||
2008-01-19 Kenichi Handa <handa@m17n.org>
|
||||
|
||||
* coding.c (detect_coding_iso2022): New arg
|
||||
latin_extra_code_state. Allow Latin extra codes only
|
||||
when *latin_extra_code_state is nonzero.
|
||||
(detect_coding_mask): If there is a NULL byte, detect the encoding
|
||||
as UTF-16 or binary. If Latin extra codes exist, detect the
|
||||
encoding as ISO-2022 only when there's no other proper encoding is
|
||||
found.
|
||||
|
||||
2008-01-17 Jason Rumney <jasonr@gnu.org>
|
||||
|
||||
* xterm.c (handle_one_xevent): Revert to counting chars not bytes.
|
||||
|
|
|
|||
117
src/coding.c
117
src/coding.c
|
|
@ -1410,12 +1410,17 @@ enum iso_code_class_type iso_code_class[256];
|
|||
CODING_CATEGORY_MASK_ISO_7_ELSE
|
||||
CODING_CATEGORY_MASK_ISO_8_ELSE
|
||||
are set. If a code which should never appear in ISO2022 is found,
|
||||
returns 0. */
|
||||
returns 0.
|
||||
|
||||
If *latin_extra_code_state is zero and Latin extra codes are found,
|
||||
set *latin_extra_code_state to 1 and return 0. If it is nonzero,
|
||||
accept Latin extra codes. */
|
||||
|
||||
static int
|
||||
detect_coding_iso2022 (src, src_end, multibytep)
|
||||
detect_coding_iso2022 (src, src_end, multibytep, latin_extra_code_state)
|
||||
unsigned char *src, *src_end;
|
||||
int multibytep;
|
||||
int *latin_extra_code_state;
|
||||
{
|
||||
int mask = CODING_CATEGORY_MASK_ISO;
|
||||
int mask_found = 0;
|
||||
|
|
@ -1578,6 +1583,11 @@ detect_coding_iso2022 (src, src_end, multibytep)
|
|||
if (VECTORP (Vlatin_extra_code_table)
|
||||
&& !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
|
||||
{
|
||||
if (! *latin_extra_code_state)
|
||||
{
|
||||
*latin_extra_code_state = 1;
|
||||
return 0;
|
||||
}
|
||||
if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
|
||||
& CODING_FLAG_ISO_LATIN_EXTRA)
|
||||
newmask |= CODING_CATEGORY_MASK_ISO_8_1;
|
||||
|
|
@ -1604,6 +1614,11 @@ detect_coding_iso2022 (src, src_end, multibytep)
|
|||
{
|
||||
int newmask = 0;
|
||||
|
||||
if (! *latin_extra_code_state)
|
||||
{
|
||||
*latin_extra_code_state = 1;
|
||||
return 0;
|
||||
}
|
||||
if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
|
||||
& CODING_FLAG_ISO_LATIN_EXTRA)
|
||||
newmask |= CODING_CATEGORY_MASK_ISO_8_1;
|
||||
|
|
@ -4131,6 +4146,8 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
|
|||
unsigned char *src = source, *src_end = source + src_bytes;
|
||||
unsigned int mask, utf16_examined_p, iso2022_examined_p;
|
||||
int i;
|
||||
int null_byte_found;
|
||||
int latin_extra_code_state = 1;
|
||||
|
||||
/* At first, skip all ASCII characters and control characters except
|
||||
for three ISO2022 specific control characters. */
|
||||
|
|
@ -4139,21 +4156,36 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
|
|||
ascii_skip_code[ISO_CODE_ESC] = 0;
|
||||
|
||||
label_loop_detect_coding:
|
||||
while (src < src_end && ascii_skip_code[*src]) src++;
|
||||
null_byte_found = 0;
|
||||
/* We stop this loop before the last byte because it may be a NULL
|
||||
anchor byte. */
|
||||
while (src < src_end - 1 && ascii_skip_code[*src])
|
||||
null_byte_found |= (! *src++);
|
||||
if (ascii_skip_code[*src])
|
||||
src++;
|
||||
else if (! null_byte_found)
|
||||
{
|
||||
unsigned char *p = src + 1;
|
||||
while (p < src_end - 1)
|
||||
null_byte_found |= (! *p++);
|
||||
}
|
||||
*skip = src - source;
|
||||
|
||||
if (src >= src_end)
|
||||
/* We found nothing other than ASCII. There's nothing to do. */
|
||||
/* We found nothing other than ASCII (and NULL byte). There's
|
||||
nothing to do. */
|
||||
return 0;
|
||||
|
||||
c = *src;
|
||||
/* The text seems to be encoded in some multilingual coding system.
|
||||
Now, try to find in which coding system the text is encoded. */
|
||||
if (c < 0x80)
|
||||
if (! null_byte_found && c < 0x80)
|
||||
{
|
||||
/* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
|
||||
/* C is an ISO2022 specific control code of C0. */
|
||||
mask = detect_coding_iso2022 (src, src_end, multibytep);
|
||||
latin_extra_code_state = 1;
|
||||
mask = detect_coding_iso2022 (src, src_end, multibytep,
|
||||
&latin_extra_code_state);
|
||||
if (mask == 0)
|
||||
{
|
||||
/* No valid ISO2022 code follows C. Try again. */
|
||||
|
|
@ -4181,21 +4213,27 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
|
|||
if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
|
||||
c = src[1] - 0x20;
|
||||
|
||||
if (c < 0xA0)
|
||||
if (null_byte_found)
|
||||
{
|
||||
try = (CODING_CATEGORY_MASK_UTF_16_BE
|
||||
| CODING_CATEGORY_MASK_UTF_16_LE);
|
||||
}
|
||||
else if (c < 0xA0)
|
||||
{
|
||||
/* C is the first byte of SJIS character code,
|
||||
or a leading-code of Emacs' internal format (emacs-mule),
|
||||
or the first byte of UTF-16. */
|
||||
try = (CODING_CATEGORY_MASK_SJIS
|
||||
| CODING_CATEGORY_MASK_EMACS_MULE
|
||||
| CODING_CATEGORY_MASK_UTF_16_BE
|
||||
| CODING_CATEGORY_MASK_UTF_16_LE);
|
||||
| CODING_CATEGORY_MASK_EMACS_MULE
|
||||
| CODING_CATEGORY_MASK_UTF_16_BE
|
||||
| CODING_CATEGORY_MASK_UTF_16_LE);
|
||||
|
||||
/* Or, if C is a special latin extra code,
|
||||
or is an ISO2022 specific control code of C1 (SS2 or SS3),
|
||||
or is an ISO2022 control-sequence-introducer (CSI),
|
||||
we should also consider the possibility of ISO2022 codings. */
|
||||
if ((VECTORP (Vlatin_extra_code_table)
|
||||
if ((latin_extra_code_state
|
||||
&& VECTORP (Vlatin_extra_code_table)
|
||||
&& !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
|
||||
|| (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
|
||||
|| (c == ISO_CODE_CSI
|
||||
|
|
@ -4205,7 +4243,7 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
|
|||
&& src + 1 < src_end
|
||||
&& src[1] == ']')))))
|
||||
try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
|
||||
| CODING_CATEGORY_MASK_ISO_8BIT);
|
||||
| CODING_CATEGORY_MASK_ISO_8BIT);
|
||||
}
|
||||
else
|
||||
/* C is a character of ISO2022 in graphic plane right,
|
||||
|
|
@ -4213,29 +4251,36 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
|
|||
or the first byte of BIG5's 2-byte code,
|
||||
or the first byte of UTF-8/16. */
|
||||
try = (CODING_CATEGORY_MASK_ISO_8_ELSE
|
||||
| CODING_CATEGORY_MASK_ISO_8BIT
|
||||
| CODING_CATEGORY_MASK_SJIS
|
||||
| CODING_CATEGORY_MASK_BIG5
|
||||
| CODING_CATEGORY_MASK_UTF_8
|
||||
| CODING_CATEGORY_MASK_UTF_16_BE
|
||||
| CODING_CATEGORY_MASK_UTF_16_LE);
|
||||
| CODING_CATEGORY_MASK_ISO_8BIT
|
||||
| CODING_CATEGORY_MASK_SJIS
|
||||
| CODING_CATEGORY_MASK_BIG5
|
||||
| CODING_CATEGORY_MASK_UTF_8
|
||||
| CODING_CATEGORY_MASK_UTF_16_BE
|
||||
| CODING_CATEGORY_MASK_UTF_16_LE);
|
||||
|
||||
/* Or, we may have to consider the possibility of CCL. */
|
||||
if (coding_system_table[CODING_CATEGORY_IDX_CCL]
|
||||
if (! null_byte_found
|
||||
&& coding_system_table[CODING_CATEGORY_IDX_CCL]
|
||||
&& (coding_system_table[CODING_CATEGORY_IDX_CCL]
|
||||
->spec.ccl.valid_codes)[c])
|
||||
try |= CODING_CATEGORY_MASK_CCL;
|
||||
|
||||
mask = 0;
|
||||
utf16_examined_p = iso2022_examined_p = 0;
|
||||
if (priorities)
|
||||
{
|
||||
/* At first try detection with Latin extra codes not-allowed.
|
||||
If no proper coding system is found because of Latin extra
|
||||
codes, try detection with Latin extra codes allowed. */
|
||||
latin_extra_code_state = 0;
|
||||
label_retry:
|
||||
utf16_examined_p = iso2022_examined_p = 0;
|
||||
for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
|
||||
{
|
||||
if (!iso2022_examined_p
|
||||
&& (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
|
||||
{
|
||||
mask |= detect_coding_iso2022 (src, src_end, multibytep);
|
||||
mask |= detect_coding_iso2022 (src, src_end, multibytep,
|
||||
&latin_extra_code_state);
|
||||
iso2022_examined_p = 1;
|
||||
}
|
||||
else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
|
||||
|
|
@ -4256,16 +4301,40 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
|
|||
else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
|
||||
mask |= detect_coding_ccl (src, src_end, multibytep);
|
||||
else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
|
||||
mask |= CODING_CATEGORY_MASK_RAW_TEXT;
|
||||
{
|
||||
if (latin_extra_code_state == 1)
|
||||
{
|
||||
/* Detection of ISO-2022 based coding system
|
||||
failed because of Latin extra codes. Before
|
||||
falling back to raw-text, try again with
|
||||
Latin extra codes allowed. */
|
||||
latin_extra_code_state = 2;
|
||||
try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE
|
||||
| CODING_CATEGORY_MASK_ISO_8BIT);
|
||||
goto label_retry;
|
||||
}
|
||||
mask |= CODING_CATEGORY_MASK_RAW_TEXT;
|
||||
}
|
||||
else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
|
||||
mask |= CODING_CATEGORY_MASK_BINARY;
|
||||
{
|
||||
if (latin_extra_code_state == 1)
|
||||
{
|
||||
/* See the above comment. */
|
||||
latin_extra_code_state = 2;
|
||||
try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE
|
||||
| CODING_CATEGORY_MASK_ISO_8BIT);
|
||||
goto label_retry;
|
||||
}
|
||||
mask |= CODING_CATEGORY_MASK_BINARY;
|
||||
}
|
||||
if (mask & priorities[i])
|
||||
return priorities[i];
|
||||
}
|
||||
return CODING_CATEGORY_MASK_RAW_TEXT;
|
||||
}
|
||||
if (try & CODING_CATEGORY_MASK_ISO)
|
||||
mask |= detect_coding_iso2022 (src, src_end, multibytep);
|
||||
mask |= detect_coding_iso2022 (src, src_end, multibytep,
|
||||
&latin_extra_code_state);
|
||||
if (try & CODING_CATEGORY_MASK_SJIS)
|
||||
mask |= detect_coding_sjis (src, src_end, multibytep);
|
||||
if (try & CODING_CATEGORY_MASK_BIG5)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue