[Qemacs-commit] qemacs charset.c

qemacs-commit
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemacs-commit] qemacs charset.c

From:	Charlie Gordon
Subject:	[Qemacs-commit] qemacs charset.c
Date:	Thu, 06 Feb 2014 00:19:39 +0000
CVSROOT:        /sources/qemacs
Module name:    qemacs
Changes by:     Charlie Gordon <chqrlie>        14/02/06 00:19:39

Modified files:
        .              : charset.c 

Log message:
        fix some problems with dos eol_type.
        
        * skip \n if EOL_DOS in get_pos and goto_line charset methods
        * detect eol_type automatically for non 8 bit charsets

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/qemacs/charset.c?cvsroot=qemacs&r1=1.30&r2=1.31

Patches:
Index: charset.c
===================================================================
RCS file: /sources/qemacs/qemacs/charset.c,v
retrieving revision 1.30
retrieving revision 1.31
diff -u -b -r1.30 -r1.31
--- charset.c   5 Feb 2014 00:56:49 -0000       1.30
+++ charset.c   6 Feb 2014 00:19:39 -0000       1.31
@@ -442,7 +442,7 @@
                                  int *line_ptr, int *col_ptr)
 {
     const uint16_t *p, *p1, *lp;
-    uint16_t nl;
+    uint16_t nl, lf;
     union { uint16_t n; char c[2]; } u;
     int line, col;
 
@@ -452,10 +452,21 @@
     u.n = 0;
     u.c[s->charset == &charset_ucs2be] = s->eol_char;
     nl = u.n;
+    u.c[s->charset == &charset_ucs2be] = '\n';
+    lf = u.n;
+
+    if (s->eol_type == EOL_DOS && p < p1 && *p == lf) {
+        /* Skip \n at start of buffer.
+         * Should check for pending skip state */
+        p++;
+        lp++;
+    }
 
     /* XXX: should handle surrogates */
     while (p < p1) {
         if (*p++ == nl) {
+            if (s->eol_type == EOL_DOS && p < p1 && *p == lf)
+                p++;
             lp = p;
             line++;
         }
@@ -469,7 +480,7 @@
                                   const u8 *buf, int size, int nlines)
 {
     const uint16_t *p, *p1, *lp;
-    uint16_t nl;
+    uint16_t nl, lf;
     union { uint16_t n; char c[2]; } u;
 
     lp = p = (const uint16_t *)buf;
@@ -477,10 +488,21 @@
     u.n = 0;
     u.c[s->charset == &charset_ucs2be] = s->eol_char;
     nl = u.n;
+    u.c[s->charset == &charset_ucs2be] = '\n';
+    lf = u.n;
+
+    if (s->eol_type == EOL_DOS && p < p1 && *p == lf) {
+        /* Skip \n at start of buffer.
+         * Should check for pending skip state */
+        p++;
+        lp++;
+    }
 
     while (nlines > 0 && p < p1) {
         while (p < p1) {
             if (*p++ == nl) {
+                if (s->eol_type == EOL_DOS && p < p1 && *p == lf)
+                    p++;
                 lp = p;
                 nlines--;
                 break;
@@ -617,7 +639,7 @@
                                  int *line_ptr, int *col_ptr)
 {
     const uint32_t *p, *p1, *lp;
-    uint32_t nl;
+    uint32_t nl, lf;
     union { uint32_t n; char c[4]; } u;
     int line, col;
 
@@ -627,9 +649,20 @@
     u.n = 0;
     u.c[(s->charset == &charset_ucs4be) * 3] = s->eol_char;
     nl = u.n;
+    u.c[(s->charset == &charset_ucs4be) * 3] = '\n';
+    lf = u.n;
+
+    if (s->eol_type == EOL_DOS && p < p1 && *p == lf) {
+        /* Skip \n at start of buffer.
+         * Should check for pending skip state */
+        p++;
+        lp++;
+    }
 
     while (p < p1) {
         if (*p++ == nl) {
+            if (s->eol_type == EOL_DOS && p < p1 && *p == lf)
+                p++;
             lp = p;
             line++;
         }
@@ -643,7 +676,7 @@
                                   const u8 *buf, int size, int nlines)
 {
     const uint32_t *p, *p1, *lp;
-    uint32_t nl;
+    uint32_t nl, lf;
     union { uint32_t n; char c[4]; } u;
 
     lp = p = (const uint32_t *)buf;
@@ -651,10 +684,21 @@
     u.n = 0;
     u.c[(s->charset == &charset_ucs4be) * 3] = s->eol_char;
     nl = u.n;
+    u.c[(s->charset == &charset_ucs4be) * 3] = '\n';
+    lf = u.n;
+
+    if (s->eol_type == EOL_DOS && p < p1 && *p == lf) {
+        /* Skip \n at start of buffer.
+         * Should check for pending skip state */
+        p++;
+        lp++;
+    }
 
     while (nlines > 0 && p < p1) {
         while (p < p1) {
             if (*p++ == nl) {
+                if (s->eol_type == EOL_DOS && p < p1 && *p == lf)
+                    p++;
                 lp = p;
                 nlines--;
                 break;
@@ -857,26 +901,34 @@
     memset(s, 0, sizeof(CharsetDecodeState));
 }
 
-/* detect the charset. Actually only UTF8 is detected */
-QECharset *detect_charset(const u8 *buf, int size, EOLType *eol_typep)
-{
-    int i, l, c, has_utf8;
+/* detect the end of line type. */
+void detect_eol_type_8bit(const u8 *buf, int size,
+                          QECharset *charset, EOLType *eol_typep)
+{
+    const u8 *p, *p1;
+    int c, eol_bits;
+    EOLType eol_type;
 
-    if (eol_typep) {
-        /* XXX: delay test after charset match */
-        /* XXX: only works for 8 bit charsets */
-        int eol_bits = 0;
-        for (i = 0; i < size - 1; i++) {
-            c = buf[i++];
+    if (!eol_typep)
+        return;
+
+    eol_type = *eol_typep;
+
+    p = buf;
+    p1 = p + size - 1;
+
+    eol_bits = 0;
+    while (p < p1) {
+        c = *p++;
             if (c == '\r') {
-                if (buf[i] == '\n') {
+            if (*p == '\n') {
+                p++;
                     eol_bits |= 1 << EOL_DOS;
-                    i++;
                 } else {
                     eol_bits |= 1 << EOL_MAC;
                 }
             } else
-            if (buf[i] == '\n') {
+        if (c == '\n') {
                 eol_bits |= 1 << EOL_UNIX;
             }
         }
@@ -885,20 +937,142 @@
             /* no change, keep default value */
             break;
         case 1 << EOL_UNIX:
-            *eol_typep = EOL_UNIX;
+            eol_type = EOL_UNIX;
             break;
         case 1 << EOL_DOS:
-            *eol_typep = EOL_DOS;
+            eol_type = EOL_DOS;
             break;
         case 1 << EOL_MAC:
-            *eol_typep = EOL_MAC;
+            eol_type = EOL_MAC;
             break;
         default:
             /* A mixture of different styles, binary / unix */
-            *eol_typep = EOL_UNIX;
+            eol_type = EOL_UNIX;
             break;
         }
+    *eol_typep = eol_type;
+}
+
+void detect_eol_type_16bit(const u8 *buf, int size,
+                           QECharset *charset, EOLType *eol_typep)
+{
+    const uint16_t *p, *p1;
+    uint16_t cr, lf;
+    union { uint16_t n; char c[2]; } u;
+    int c, eol_bits;
+    EOLType eol_type;
+
+    if (!eol_typep)
+        return;
+
+    eol_type = *eol_typep;
+
+    p = (const uint16_t *)buf;
+    p1 = p + (size >> 1) - 1;
+    u.n = 0;
+    u.c[charset == &charset_ucs2be] = '\r';
+    cr = u.n;
+    u.c[charset == &charset_ucs2be] = '\n';
+    lf = u.n;
+
+    eol_bits = 0;
+    while (p < p1) {
+        c = *p++;
+        if (c == cr) {
+            if (*p == lf) {
+                p++;
+                eol_bits |= 1 << EOL_DOS;
+            } else {
+                eol_bits |= 1 << EOL_MAC;
+            }
+        } else
+        if (c == lf) {
+            eol_bits |= 1 << EOL_UNIX;
     }
+    }
+    switch (eol_bits) {
+        case 0:
+            /* no change, keep default value */
+            break;
+        case 1 << EOL_UNIX:
+            eol_type = EOL_UNIX;
+            break;
+        case 1 << EOL_DOS:
+            eol_type = EOL_DOS;
+            break;
+        case 1 << EOL_MAC:
+            eol_type = EOL_MAC;
+            break;
+        default:
+            /* A mixture of different styles, binary / unix */
+            eol_type = EOL_UNIX;
+            break;
+    }
+    *eol_typep = eol_type;
+}
+
+void detect_eol_type_32bit(const u8 *buf, int size,
+                           QECharset *charset, EOLType *eol_typep)
+{
+    const uint32_t *p, *p1;
+    uint16_t cr, lf;
+    union { uint32_t n; char c[4]; } u;
+    int c, eol_bits;
+    EOLType eol_type;
+
+    if (!eol_typep)
+        return;
+
+    eol_type = *eol_typep;
+
+    p = (const uint32_t *)buf;
+    p1 = p + (size >> 2) - 1;
+    u.n = 0;
+    u.c[(charset == &charset_ucs4be) * 3] = '\r';
+    cr = u.n;
+    u.c[(charset == &charset_ucs4be) * 3] = '\n';
+    lf = u.n;
+
+    eol_bits = 0;
+    while (p < p1) {
+        c = *p++;
+        if (c == cr) {
+            if (*p == lf) {
+                p++;
+                eol_bits |= 1 << EOL_DOS;
+            } else {
+                eol_bits |= 1 << EOL_MAC;
+            }
+        } else
+        if (c == lf) {
+            eol_bits |= 1 << EOL_UNIX;
+        }
+    }
+    switch (eol_bits) {
+        case 0:
+            /* no change, keep default value */
+            break;
+        case 1 << EOL_UNIX:
+            eol_type = EOL_UNIX;
+            break;
+        case 1 << EOL_DOS:
+            eol_type = EOL_DOS;
+            break;
+        case 1 << EOL_MAC:
+            eol_type = EOL_MAC;
+            break;
+        default:
+            /* A mixture of different styles, binary / unix */
+            eol_type = EOL_UNIX;
+            break;
+    }
+    *eol_typep = eol_type;
+}
+
+/* detect the charset. Actually only UTF8 is detected */
+QECharset *detect_charset(const u8 *buf, int size, EOLType *eol_typep)
+{
+    int i, l, c, has_utf8;
 
     has_utf8 = 0;
     for (i = 0; i < size;) {
@@ -919,24 +1093,33 @@
             l--;
         }
     }
-    if (has_utf8)
+    if (has_utf8) {
+        detect_eol_type_8bit(buf, size, &charset_utf8, eol_typep);
         return &charset_utf8;
+    }
 
     /* Check for zwnbsp BOM: files starting with zero-width
      * no-break space as a byte-order mark (BOM) will be detected
      * as ucs2 or ucs4 encoded.
      */
     if (size >= 2 && buf[0] == 0xff && buf[1] == 0xfe) {
-        if (size >= 4 && buf[2] == 0 && buf[3] == 0)
+        if (size >= 4 && buf[2] == 0 && buf[3] == 0) {
+            detect_eol_type_32bit(buf, size, &charset_ucs4le, eol_typep);
             return &charset_ucs4le;
+        } else {
+            detect_eol_type_16bit(buf, size, &charset_ucs2le, eol_typep);
         return &charset_ucs2le;
     }
+    }
 
-    if (size >= 2 && buf[0] == 0xfe && buf[1] == 0xff)
+    if (size >= 2 && buf[0] == 0xfe && buf[1] == 0xff) {
+        detect_eol_type_16bit(buf, size, &charset_ucs2be, eol_typep);
         return &charset_ucs2be;
+    }
 
     if (size >= 4
     &&  buf[0] == 0 && buf[1] == 0 && buf[2] == 0xfe && buf[3] == 0xff) {
+        detect_eol_type_32bit(buf, size, &charset_ucs4be, eol_typep);
         return &charset_ucs4be;
     }
 
@@ -950,17 +1133,21 @@
             if (buf[i] > maxc[i & 3])
                 maxc[i & 3] = buf[i];
         }
-        if (maxc[0] > 'a' && maxc[1] < 0x2f && maxc[2] > 'a' && maxc[3] < 0x2f)
+        if (maxc[0] > 'a' && maxc[1] < 0x2f && maxc[2] > 'a' && maxc[3] < 0x2f) {
+            detect_eol_type_16bit(buf, size, &charset_ucs2le, eol_typep);
             return &charset_ucs2le;
-
-        if (maxc[1] > 'a' && maxc[0] < 0x2f && maxc[3] > 'a' && maxc[2] < 0x2f)
+        }
+        if (maxc[1] > 'a' && maxc[0] < 0x2f && maxc[3] > 'a' && maxc[2] < 0x2f) {
+            detect_eol_type_16bit(buf, size, &charset_ucs2be, eol_typep);
             return &charset_ucs2be;
     }
+    }
 #endif
     /* Should detect iso-2220-jp upon \033$@ and \033$B, but jis
      * support is not selected in tiny build
      */
     /* CG: should use a state variable for default charset */
+    detect_eol_type_8bit(buf, size, &charset_8859_1, eol_typep);
     return &charset_8859_1;
 }
 
@@ -1028,11 +1215,20 @@
     p1 = p + size;
     nl = s->eol_char;
 
+    if (s->eol_type == EOL_DOS && p < p1 && *p == '\n') {
+        /* Skip \n at start of buffer.
+         * Should check for pending skip state */
+        p++;
+        lp++;
+    }
+
     for (;;) {
         p = memchr(p, nl, p1 - p);
         if (!p)
             break;
         p++;
+        if (s->eol_type == EOL_DOS && p < p1 && *p == '\n')
+            p++;
         lp = p;
         line++;
     }
@@ -1051,11 +1247,20 @@
     p1 = p + size;
     nl = s->eol_char;
 
+    if (s->eol_type == EOL_DOS && p < p1 && *p == '\n') {
+        /* Skip \n at start of buffer.
+         * Should check for pending skip state */
+        p++;
+        lp++;
+    }
+
     while (nlines > 0) {
         p = memchr(p, nl, p1 - p);
         if (!p)
             break;
         p++;
+        if (s->eol_type == EOL_DOS && p < p1 && *p == '\n')
+            p++;
         lp = p;
         nlines--;
     }
[Prev in Thread]
Current Thread
[Next in Thread]
[Qemacs-commit] qemacs charset.c, Charlie Gordon <=
Prev by Date: [Qemacs-commit] qemacs buffer.c charset.c dired.c extras.c orgm...
Next by Date: [Qemacs-commit] qemacs clang.c latex-mode.c makemode.c perl.c s...
Previous by thread: [Qemacs-commit] qemacs buffer.c charset.c dired.c extras.c orgm...
Next by thread: [Qemacs-commit] qemacs clang.c latex-mode.c makemode.c perl.c s...
Index(es):
- Date
- Thread