[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemacs-commit] qemacs buffer.c charset.c charsetmore.c qe.h qe...
From: Charlie Gordon
Subject: [Qemacs-commit] qemacs buffer.c charset.c charsetmore.c qe.h qe...
Date: Mon, 10 Feb 2014 20:29:27 +0000
CVSROOT: /sources/qemacs
Module name: qemacs
Changes by: Charlie Gordon <chqrlie> 14/02/10 20:29:27
Modified files:
. : buffer.c charset.c charsetmore.c qe.h qe.c
qeconfig.h
Log message:
Improve charset detection and handling
* add charset_raw for binary files
* improve charset detection for ambiguous cases
* add do_show_coding_system()
* add do_set_auto_coding() to (re)select the best coding system
* handle BOM mark: display as \ufeff and ignore for syntax coloring
CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/qemacs/buffer.c?cvsroot=qemacs&r1=1.72&r2=1.73
http://cvs.savannah.gnu.org/viewcvs/qemacs/charset.c?cvsroot=qemacs&r1=1.31&r2=1.32
http://cvs.savannah.gnu.org/viewcvs/qemacs/charsetmore.c?cvsroot=qemacs&r1=1.16&r2=1.17
http://cvs.savannah.gnu.org/viewcvs/qemacs/qe.h?cvsroot=qemacs&r1=1.139&r2=1.140
http://cvs.savannah.gnu.org/viewcvs/qemacs/qe.c?cvsroot=qemacs&r1=1.148&r2=1.149
http://cvs.savannah.gnu.org/viewcvs/qemacs/qeconfig.h?cvsroot=qemacs&r1=1.45&r2=1.46
Patches:
Index: buffer.c
===================================================================
RCS file: /sources/qemacs/qemacs/buffer.c,v
retrieving revision 1.72
retrieving revision 1.73
diff -u -b -r1.72 -r1.73
--- buffer.c 7 Feb 2014 15:56:15 -0000 1.72
+++ buffer.c 10 Feb 2014 20:29:26 -0000 1.73
@@ -498,6 +498,9 @@
if (flags & BF_UTF8) {
eb_set_charset(b, &charset_utf8, b->eol_type);
+ } else
+ if (flags & BF_RAW) {
+ eb_set_charset(b, &charset_raw, EOL_UNIX);
} else {
/* CG: default charset should be selectable */
eb_set_charset(b, &charset_8859_1, b->eol_type);
Index: charset.c
===================================================================
RCS file: /sources/qemacs/qemacs/charset.c,v
retrieving revision 1.31
retrieving revision 1.32
diff -u -b -r1.31 -r1.32
--- charset.c 6 Feb 2014 00:19:39 -0000 1.31
+++ charset.c 10 Feb 2014 20:29:26 -0000 1.32
@@ -99,6 +99,37 @@
};
/********************************************************/
+/* raw */
+
+static void decode_raw_init(CharsetDecodeState *s)
+{
+ s->table = table_idem;
+}
+
+static u8 *encode_raw(__unused__ QECharset *charset, u8 *p, int c)
+{
+ if (c <= 0xff) {
+ *p++ = c;
+ return p;
+ } else {
+ return NULL;
+ }
+}
+
+QECharset charset_raw = {
+ "raw",
+ "binary|none",
+ decode_raw_init,
+ decode_8bit,
+ encode_raw,
+ charset_get_pos_8bit,
+ charset_get_chars_8bit,
+ charset_goto_char_8bit,
+ charset_goto_line_8bit,
+ 1, 0, 0, 10, 0, 0, NULL, NULL,
+};
+
+/********************************************************/
/* 8859-1 */
static void decode_8859_1_init(CharsetDecodeState *s)
@@ -1072,7 +1103,7 @@
/* detect the charset. Actually only UTF8 is detected */
QECharset *detect_charset(const u8 *buf, int size, EOLType *eol_typep)
{
- int i, l, c, has_utf8;
+ int i, l, c, has_utf8, has_binary;
has_utf8 = 0;
for (i = 0; i < size;) {
@@ -1145,10 +1176,41 @@
#endif
/* Should detect iso-2220-jp upon \033$@ and \033$B, but jis
* support is not selected in tiny build
+ * XXX: should use charset probe functions.
*/
- /* CG: should use a state variable for default charset */
- detect_eol_type_8bit(buf, size, &charset_8859_1, eol_typep);
+
+ has_binary = 0;
+ {
+ static const uint32_t magic = (1 << '\b') | (1 << '\t') | (1 << '\f') |
+ (1 << '\n') | (1 << '\r') | (1 << '\033') |
+ (1 << 0x0e) | (1 << 0x0f) | (1 << 0x1f);
+
+ for (i = 0; i < size; i++) {
+ c = buf[i];
+ if (c < 32 && !(magic & (1 << c)))
+ has_binary += 1;
+ }
+ }
+ if (has_binary) {
+ *eol_typep = EOL_UNIX;
+ return &charset_raw;
+ }
+
+ detect_eol_type_8bit(buf, size, &charset_raw, eol_typep);
+
+ if (*eol_typep == EOL_DOS) {
+ /* XXX: default DOS files to Latin1, should be selectable */
return &charset_8859_1;
+ }
+#ifndef CONFIG_TINY
+ if (*eol_typep == EOL_MAC) {
+ /* XXX: default MAC files to Mac_roman, should be selectable */
+ /* XXX: should use probe functions */
+ return &charset_mac_roman;
+ }
+#endif
+ /* XXX: should use a state variable for default charset */
+ return &charset_utf8;
}
/********************************************************/
@@ -1347,6 +1409,7 @@
for (i = 0xc0; i < 0xfe; i++)
table_utf8[i] = ESCAPE_CHAR;
+ qe_register_charset(&charset_raw);
qe_register_charset(&charset_8859_1);
qe_register_charset(&charset_vt100);
qe_register_charset(&charset_7bit);
Index: charsetmore.c
===================================================================
RCS file: /sources/qemacs/qemacs/charsetmore.c,v
retrieving revision 1.16
retrieving revision 1.17
diff -u -b -r1.16 -r1.17
--- charsetmore.c 24 Jan 2014 01:22:23 -0000 1.16
+++ charsetmore.c 10 Feb 2014 20:29:26 -0000 1.17
@@ -1159,7 +1159,7 @@
0x00af, 0x02d8, 0x02d9, 0x02da, 0x00b8, 0x02dd, 0x02db, 0x02c7,
};
-static QECharset charset_mac_roman = {
+QECharset charset_mac_roman = {
"mac-roman",
"x-mac|mac",
decode_8bit_init,
Index: qe.h
===================================================================
RCS file: /sources/qemacs/qemacs/qe.h,v
retrieving revision 1.139
retrieving revision 1.140
diff -u -b -r1.139 -r1.140
--- qe.h 10 Feb 2014 20:10:32 -0000 1.139
+++ qe.h 10 Feb 2014 20:29:26 -0000 1.140
@@ -506,10 +506,14 @@
};
extern QECharset *first_charset;
-extern QECharset charset_utf8, charset_8859_1; /* predefined charsets */
+/* predefined charsets */
+extern QECharset charset_raw;
+extern QECharset charset_8859_1;
+extern QECharset charset_utf8;
extern QECharset charset_vt100; /* used for the tty output */
extern QECharset charset_ucs2le, charset_ucs2be;
extern QECharset charset_ucs4le, charset_ucs4be;
+extern QECharset charset_mac_roman;
typedef enum EOLType {
EOL_UNIX = 0,
@@ -1760,6 +1764,8 @@
void do_exchange_point_and_mark(EditState *s);
QECharset *read_charset(EditState *s, const char *charset_str,
EOLType *eol_typep);
+void do_show_coding_system(EditState *s);
+void do_set_auto_coding(EditState *s, int verbose);
void do_set_buffer_file_coding_system(EditState *s, const char *charset_str);
void do_convert_buffer_file_coding_system(EditState *s,
const char *charset_str);
Index: qe.c
===================================================================
RCS file: /sources/qemacs/qemacs/qe.c,v
retrieving revision 1.148
retrieving revision 1.149
diff -u -b -r1.148 -r1.149
--- qe.c 10 Feb 2014 20:10:32 -0000 1.148
+++ qe.c 10 Feb 2014 20:29:26 -0000 1.149
@@ -1875,6 +1875,31 @@
return charset;
}
+void do_show_coding_system(EditState *s)
+{
+ put_status(s, "Buffer charset is now %s%s", s->b->charset->name,
+ s->b->eol_type == EOL_DOS ? "-dos" :
+ s->b->eol_type == EOL_MAC ? "-mac" : "-unix");
+}
+
+void do_set_auto_coding(EditState *s, int verbose)
+{
+ u8 buf[4097];
+ int buf_size;
+ EditBuffer *b = s->b;
+ EOLType eol_type = b->eol_type;
+ QECharset *charset;
+
+ buf_size = eb_read(b, 0, buf, sizeof(buf));
+ eol_type = b->eol_type;
+ /* XXX: detect_charset returns a default charset */
+ charset = detect_charset(buf, buf_size, &eol_type);
+ eb_set_charset(b, charset, eol_type);
+ if (verbose) {
+ do_show_coding_system(s);
+ }
+}
+
void do_set_buffer_file_coding_system(EditState *s, const char *charset_str)
{
QECharset *charset;
@@ -1885,7 +1910,7 @@
if (!charset)
return;
eb_set_charset(s->b, charset, eol_type);
- put_status(s, "Charset is now %s for this buffer", s->b->charset->name);
+ do_show_coding_system(s);
}
/* convert the charset of a buffer to another charset */
@@ -3242,7 +3267,7 @@
int generic_get_colorized_line(EditState *s, unsigned int *buf, int buf_size,
int *offsetp, int line_num)
{
- int len, l, line, col, offset;
+ int len, l, line, col, offset, bom;
int colorize_state;
/* invalidate cache if needed */
@@ -3272,7 +3297,8 @@
for (l = s->colorize_nb_valid_lines; l <= line_num; l++) {
len = eb_get_line(s->b, buf, buf_size, &offset);
- s->colorize_func(buf, len, &colorize_state, 1);
+ bom = (len > 0 && buf[0] == 0xFEFF);
+ s->colorize_func(buf + bom, len - bom, &colorize_state, 1);
s->colorize_states[l] = colorize_state;
}
}
@@ -3280,7 +3306,8 @@
/* compute line color */
colorize_state = s->colorize_states[line_num];
len = eb_get_line(s->b, buf, buf_size, offsetp);
- s->colorize_func(buf, len, &colorize_state, 0);
+ bom = (len > 0 && buf[0] == 0xFEFF);
+ s->colorize_func(buf + bom, len - bom, &colorize_state, 0);
/* XXX: if state is same as previous, minimize invalid region? */
s->colorize_states[line_num + 1] = colorize_state;
@@ -3504,7 +3531,8 @@
/* currently, we cannot display these chars */
display_printf(ds, offset0, offset, "\\U%08x", c);
} else
- if (c >= 256 && s->qe_state->show_unicode == 1) {
+ if (c >= 256 && (s->qe_state->show_unicode == 1 || c == 0xfeff)) {
+ /* Display BOM as \uFEFF to make it explicit */
display_printf(ds, offset0, offset, "\\u%04x", c);
} else {
display_char_bidir(ds, offset0, offset, embedding_level, c);
Index: qeconfig.h
===================================================================
RCS file: /sources/qemacs/qemacs/qeconfig.h,v
retrieving revision 1.45
retrieving revision 1.46
diff -u -b -r1.45 -r1.46
--- qeconfig.h 10 Feb 2014 20:10:32 -0000 1.45
+++ qeconfig.h 10 Feb 2014 20:29:26 -0000 1.46
@@ -381,6 +381,8 @@
CMD2( KEY_NONE, KEY_NONE,
"set-mode", do_set_mode, ESs,
"s{Set mode: }[mode]")
+ CMD1( KEY_NONE, KEY_NONE,
+ "set-auto-coding", do_set_auto_coding, 1)
/* tab & indent */
CMD2( KEY_NONE, KEY_NONE,
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [Qemacs-commit] qemacs buffer.c charset.c charsetmore.c qe.h qe...,
Charlie Gordon <=