[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Emacs-diffs] /srv/bzr/emacs/trunk r112229: Optimize the code for reading UTF-8 files.
From: |
K. Handa |
Subject: |
[Emacs-diffs] /srv/bzr/emacs/trunk r112229: Optimize the code for reading UTF-8 files. |
Date: |
Fri, 05 Apr 2013 23:19:51 +0900 |
User-agent: |
Bazaar (2.5.0) |
------------------------------------------------------------
revno: 112229 [merge]
committer: K. Handa <address@hidden>
branch nick: trunk
timestamp: Fri 2013-04-05 23:19:51 +0900
message:
Optimize the code for reading UTF-8 files.
modified:
src/ChangeLog
src/coding.c
src/insdel.c
=== modified file 'src/ChangeLog'
--- a/src/ChangeLog 2013-04-05 14:07:02 +0000
+++ b/src/ChangeLog 2013-04-05 14:17:55 +0000
@@ -1,3 +1,23 @@
+2013-04-03 Kenichi Handa <address@hidden>
+
+ The following changes are to optimize the code for reading UTF-8
+ files.
+
+ * coding.c (check_ascii): Renamed from detect_ascii. Return value
+ changed. Check EOL format. Do not call adjust_coding_eol_type
+ here.
+ (check_utf_8): New function.
+ (adjust_coding_eol_type): Do nothing if already adjusted.
+ (detect_coding): Compare the return value of check_ascii with
+ coding->src_bytes. Call adjust_coding_eol_type if necessary.
+ (decode_coding_gap): Optimize for valid UTF-8.
+
+2013-03-21 Kenichi Handa <address@hidden>
+
+ * coding.c (syms_of_coding): Cancel previous change.
+
+ * insdel.c (insert_from_gap): Fix previous change.
+
2013-04-05 Dmitry Antipov <address@hidden>
Consistently use platform-specific function to detect window system.
@@ -484,7 +504,7 @@
* coding.c (decode_coding_gap): Fix typo caught by static checking.
-2013-03-15 handa <address@hidden>
+2013-03-15 Kenichi Handa <address@hidden>
* insdel.c (insert_from_gap): New arg text_at_gap_tail.
(adjust_after_replace): Make it back to static. Delete the third
=== modified file 'src/coding.c'
--- a/src/coding.c 2013-03-20 08:08:34 +0000
+++ b/src/coding.c 2013-04-05 14:08:56 +0000
@@ -6072,17 +6072,18 @@
#define EOL_SEEN_CRLF 4
-static Lisp_Object adjust_coding_eol_type (struct coding_system *coding, int eol_seen);
-
-
-/* Return true iff all the source bytes are ASCII.
+static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
+ int eol_seen);
+
+
+/* Return the number of ASCII characters at the head of the source.
By side effects, set coding->head_ascii and coding->eol_seen. The
value of coding->eol_seen is "logical or" of EOL_SEEN_LF,
EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when
all the source bytes are ASCII. */
-static bool
-detect_ascii (struct coding_system *coding)
+static int
+check_ascii (struct coding_system *coding)
{
const unsigned char *src, *end;
Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
@@ -6096,21 +6097,20 @@
src = coding->source;
end = src + coding->src_bytes;
- if (inhibit_eol_conversion)
+ if (inhibit_eol_conversion
+ || eol_seen != EOL_SEEN_NONE)
{
/* We don't have to check EOL format. */
while (src < end && !( *src & 0x80)) src++;
- eol_seen = EOL_SEEN_LF;
- adjust_coding_eol_type (coding, eol_seen);
- }
- else if (eol_seen != EOL_SEEN_NONE)
- {
- /* We don't have to check EOL format either. */
- while (src < end && !(*src & 0x80)) src++;
+ if (inhibit_eol_conversion)
+ {
+ eol_seen = EOL_SEEN_LF;
+ adjust_coding_eol_type (coding, eol_seen);
+ }
}
else
{
- end--; /* We look ahead one byte. */
+ end--; /* We look ahead one byte for "CR LF". */
while (src < end)
{
int c = *src;
@@ -6118,6 +6118,69 @@
if (c & 0x80)
break;
src++;
+ if (c == '\r')
+ {
+ if (*src == '\n')
+ {
+ eol_seen |= EOL_SEEN_CRLF;
+ src++;
+ }
+ else
+ eol_seen |= EOL_SEEN_CR;
+ }
+ else if (c == '\n')
+ eol_seen |= EOL_SEEN_LF;
+ }
+ if (src == end)
+ {
+ int c = *src;
+
+ /* All bytes but the last one C are ASCII. */
+ if (! (c & 0x80))
+ {
+ if (c == '\r')
+ eol_seen |= EOL_SEEN_CR;
+ else if (c == '\n')
+ eol_seen |= EOL_SEEN_LF;
+ src++;
+ }
+ }
+ }
+ coding->head_ascii = src - coding->source;
+ coding->eol_seen = eol_seen;
+ return (coding->head_ascii);
+}
+
+
+/* Return the number of characters at the source if all the bytes are
+ valid UTF-8 (of Unicode range). Otherwise, return -1. By side
+ effects, update coding->eol_seen. The value of coding->eol_seen is
+ "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
+ the value is reliable only when all the source bytes are valid
+ UTF-8. */
+
+static int
+check_utf_8 (struct coding_system *coding)
+{
+ const unsigned char *src, *end;
+ int eol_seen = coding->eol_seen;
+ int nchars = coding->head_ascii;
+
+ if (coding->head_ascii < 0)
+ check_ascii (coding);
+ else
+ coding_set_source (coding);
+ src = coding->source + coding->head_ascii;
+ /* We look ahead one byte for CR LF. */
+ end = coding->source + coding->src_bytes - 1;
+
+ while (src < end)
+ {
+ int c = *src;
+
+ if (UTF_8_1_OCTET_P (*src))
+ {
+ src++;
if (c < 0x20)
{
if (c == '\r')
@@ -6126,6 +6189,7 @@
{
eol_seen |= EOL_SEEN_CRLF;
src++;
+ nchars++;
}
else
eol_seen |= EOL_SEEN_CR;
@@ -6134,27 +6198,58 @@
eol_seen |= EOL_SEEN_LF;
}
}
- if (src > end)
- /* The last two bytes are CR LF, which means that we have
- scanned all bytes. */
- end++;
- else if (src == end)
- {
- end++;
- if (! (*src & 0x80))
- {
- if (*src == '\r')
- eol_seen |= EOL_SEEN_CR;
- else if (*src == '\n')
- eol_seen |= EOL_SEEN_LF;
- src++;
- }
- }
- adjust_coding_eol_type (coding, eol_seen);
- }
- coding->head_ascii = src - coding->source;
+ else if (UTF_8_2_OCTET_LEADING_P (c))
+ {
+ if (c < 0xC2 /* overlong sequence */
+ || src + 1 >= end
+ || ! UTF_8_EXTRA_OCTET_P (src[1]))
+ return -1;
+ src += 2;
+ }
+ else if (UTF_8_3_OCTET_LEADING_P (c))
+ {
+ if (src + 2 >= end
+ || ! (UTF_8_EXTRA_OCTET_P (src[1])
+ && UTF_8_EXTRA_OCTET_P (src[2])))
+ return -1;
+ c = (((c & 0xF) << 12)
+ | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
+ if (c < 0x800 /* overlong sequence */
+ || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
+ return -1;
+ src += 3;
+ }
+ else if (UTF_8_4_OCTET_LEADING_P (c))
+ {
+ if (src + 3 >= end
+ || ! (UTF_8_EXTRA_OCTET_P (src[1])
+ && UTF_8_EXTRA_OCTET_P (src[2])
+ && UTF_8_EXTRA_OCTET_P (src[3])))
+ return -1;
+ c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
+ | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
+ if (c < 0x10000 /* overlong sequence */
+ || c >= 0x110000) /* non-Unicode character */
+ return -1;
+ src += 4;
+ }
+ else
+ return -1;
+ nchars++;
+ }
+
+ if (src == end)
+ {
+ if (! UTF_8_1_OCTET_P (*src))
+ return -1;
+ nchars++;
+ if (*src == '\r')
+ eol_seen |= EOL_SEEN_CR;
+ else if (*src == '\n')
+ eol_seen |= EOL_SEEN_LF;
+ }
coding->eol_seen = eol_seen;
- return (src == end);
+ return nchars;
}
@@ -6269,6 +6364,9 @@
Lisp_Object eol_type;
eol_type = CODING_ID_EOL_TYPE (coding->id);
+ if (! VECTORP (eol_type))
+ /* Already adjusted. */
+ return eol_type;
if (eol_seen & EOL_SEEN_LF)
{
coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
@@ -6360,7 +6458,8 @@
{
coding->eol_seen |= EOL_SEEN_CRLF;
src++;
- coding->head_ascii++;
+ if (! eight_bit_found)
+ coding->head_ascii++;
}
else
coding->eol_seen |= EOL_SEEN_CR;
@@ -6461,9 +6560,14 @@
coding_systems
= AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
detect_info.found = detect_info.rejected = 0;
- if (detect_ascii (coding))
+ if (check_ascii (coding) == coding->src_bytes)
{
+ int head_ascii = coding->head_ascii;
+
+ if (coding->eol_seen != EOL_SEEN_NONE)
+ adjust_coding_eol_type (coding, coding->eol_seen);
setup_coding_system (XCDR (coding_systems), coding);
+ coding->head_ascii = head_ascii;
}
else
{
@@ -7620,15 +7724,27 @@
if (CODING_REQUIRE_DETECTION (coding))
detect_coding (coding);
attrs = CODING_ID_ATTRS (coding->id);
- if (! disable_ascii_optimization)
+ if (! disable_ascii_optimization
+ && ! coding->src_multibyte
+ && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
+ && NILP (CODING_ATTR_POST_READ (attrs))
+ && NILP (get_translation_table (attrs, 0, NULL)))
{
- if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
- && NILP (CODING_ATTR_POST_READ (attrs))
- && NILP (get_translation_table (attrs, 0, NULL))
- && (coding->head_ascii >= 0 /* We've already called detect_coding */
- ? coding->head_ascii == bytes
- : detect_ascii (coding)))
- {
+ chars = coding->head_ascii;
+ if (chars < 0)
+ chars = check_ascii (coding);
+ if (chars != bytes)
+ {
+ if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8))
+ chars = check_utf_8 (coding);
+ else
+ chars = -1;
+ }
+ if (chars >= 0)
+ {
+ if (coding->eol_seen != EOL_SEEN_NONE)
+ adjust_coding_eol_type (coding, coding->eol_seen);
+
if (coding->eol_seen == EOL_SEEN_CR)
{
unsigned char *src_end = GAP_END_ADDR;
@@ -7645,6 +7761,7 @@
unsigned char *src = GAP_END_ADDR;
unsigned char *src_beg = src - coding->src_bytes;
unsigned char *dst = src;
+ ptrdiff_t diff;
while (src_beg < src)
{
@@ -7652,10 +7769,13 @@
if (*src == '\n')
src--;
}
- bytes -= dst - src;
+ diff = dst - src;
+ bytes -= diff;
+ chars -= diff;
}
- coding->produced_char = coding->produced = bytes;
- insert_from_gap (bytes, bytes, 1);
+ coding->produced = bytes;
+ coding->produced_char = chars;
+ insert_from_gap (chars, bytes, 1);
return;
}
}
@@ -10877,7 +10997,7 @@
DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
doc: /* If non-nil, Emacs does not optimize code decoder for
ASCII files.
Internal use only. Removed after the experimental optimizer gets stable. */);
- disable_ascii_optimization = 1;
+ disable_ascii_optimization = 0;
DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
doc: /* Char table for translating self-inserting characters.
=== modified file 'src/insdel.c'
--- a/src/insdel.c 2013-03-28 14:04:49 +0000
+++ b/src/insdel.c 2013-04-05 14:17:55 +0000
@@ -983,6 +983,9 @@
void
insert_from_gap (ptrdiff_t nchars, ptrdiff_t nbytes, bool text_at_gap_tail)
{
+ int ins_charpos = GPT;
+ int ins_bytepos = GPT_BYTE;
+
if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
nchars = nbytes;
@@ -1003,18 +1006,18 @@
eassert (GPT <= GPT_BYTE);
- adjust_overlays_for_insert (GPT - nchars, nchars);
- adjust_markers_for_insert (GPT - nchars, GPT_BYTE - nbytes,
- GPT, GPT_BYTE, 0);
+ adjust_overlays_for_insert (ins_charpos, nchars);
+ adjust_markers_for_insert (ins_charpos, ins_bytepos,
+ ins_charpos + nchars, ins_bytepos + nbytes, 0);
if (buffer_intervals (current_buffer))
{
- offset_intervals (current_buffer, GPT - nchars, nchars);
- graft_intervals_into_buffer (NULL, GPT - nchars, nchars,
+ offset_intervals (current_buffer, ins_charpos, nchars);
+ graft_intervals_into_buffer (NULL, ins_charpos, nchars,
current_buffer, 0);
}
- if (! text_at_gap_tail && GPT - nchars < PT)
+ if (ins_charpos < PT)
adjust_point (nchars, nbytes);
check_markers ();
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [Emacs-diffs] /srv/bzr/emacs/trunk r112229: Optimize the code for reading UTF-8 files.,
K. Handa <=