groff
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Groff] Re: man page encoding


From: Werner LEMBERG
Subject: Re: [Groff] Re: man page encoding
Date: Thu, 07 Jul 2005 11:54:40 +0200 (CEST)

> Thanks, then let's go for the proposed
>    .\" t  -*- coding: EUC-JP -*-

What's the `t'?  BTW, attached is the file gpreconv, which was
written five years ago by someone whose name I no longer remember (sigh --
since then I've been thinking about UTF-8 support without actually starting
the implementation on the input side).  Something like this
should become part of groff as soon as it supports Unicode on the
input side.


    Werner
#define I18N

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>

#ifdef I18N
#include <locale.h>
#include <langinfo.h>
#include <iconv.h>
#endif /* I18N */

/* Size (including the terminating NUL) of the fixed buffers that hold a
 * parsed tag variable, a tag value, or a copied encoding name. */
#define MAX_VAR_LEN 100

/* Encoding-tag detection in the input buffer. */
char *check_encoding_tag(char *);
char *check_encoding_tag_parse_tag(char *, char **, char **);
/* Reads a whole stream into one NUL-terminated buffer. */
char *read_file (FILE *);
/* Encoding-name translation (table lookups). */
char *emacs2mime(char *);
char *mime2locale(char *);
/* Converters: take (input buffer, encoding name) and write UTF-8 to stdout. */
void conversion_latin1 (char *, char *);
void conversion_utf8 (char *, char *);
void conversion_cp1047 (char *, char *);
void conversion_iconv (char *, char *);
/* Prints the usage message; the argument is the default encoding name. */
void help (char *);

/* One entry of an encoding-name translation table: `from' is the name
 * that is looked up, `to' is the name returned for it.  The tables in
 * this file end with an entry whose `from' is NULL. */
typedef struct {
  char * from;
  char * to;
} encname_t;

/* Translation table used by emacs2mime(): `from' is an Emacs coding
 * system name (matched case-insensitively), `to' is the name that
 * emacs2mime() returns for it.  The {NULL, NULL} entry ends the table.
 *
 * Many entries translate to the empty string; emacs2mime() returns that
 * empty string as-is.
 * NOTE(review): presumably "" marks coding systems with no known MIME
 * equivalent -- confirm how callers are expected to treat it.
 * NOTE(review): "vietnamese-vscii" maps to "VISCII" while "vscii" maps
 * to "VSCII"; confirm which target is intended.
 */
encname_t emacs_to_mime[] = {
  {"alternativnyj",                   ""},
  {"big5",                            "Big5"},
  {"chinese-big5",                    "Big5"},
  {"chinese-euc",                     "EUC-CN"},
  {"chinese-hz",                      "HZ-GB-2312"},
  {"chinese-iso-7bit",                ""},
  {"chinese-iso-8bit",                ""},
  {"chinese-iso-8bit-with-esc",       ""},
  {"cn-big5",                         "Big5"},
  {"cn-gb-2312",                      "GB2312"},
  {"compound-text",                   ""},
  {"ctext",                           ""},
  {"cyrillic-alternativnyj",          ""},
  {"cyrillic-iso-8bit",               "ISO-8859-5"},
  {"cyrillic-iso-8bit-with-esc",      ""},
  {"cyrillic-koi8",                   "KOI8-R"},
  {"dos",                             ""},
  {"emacs-mule",                      ""},
  {"euc-china",                       "EUC-CN"},
  {"euc-cn",                          "EUC-CN"},
  {"euc-japan",                       "EUC-JP"},
  {"euc-jisx0213",                    "EUC-JP"},
  {"euc-jisx0213-with-esc",           "EUC-JP"},
  {"euc-jp",                          "EUC-JP"},
  {"euc-korea",                       "EUC-KR"},
  {"euc-kr",                          "EUC-KR"},
  {"gb2312",                          "GB2312"},
  {"greek-iso-8bit",                  "ISO-8859-7"},
  {"greek-iso-8bit-with-esc",         "ISO-8859-7"},
  {"hebrew-iso-8bit",                 "ISO-8859-8"},
  {"hebrew-iso-8bit-with-esc",        "ISO-8859-8"},
  {"hz",                              "HZ-GB-2312"},
  {"hz-gb-2312",                      "HZ-GB-2312"},
  {"in-is13194-devanagari",           ""},
  {"in-is13194-devanagari-with-esc",  ""},
  {"iso-2022-7",                      ""},
  {"iso-2022-7bit",                   ""},
  {"iso-2022-7bit-lock",              ""},
  {"iso-2022-7bit-lock-ss2",          ""},
  {"iso-2022-7bit-ss2",               ""},
  {"iso-2022-8",                      ""},
  {"iso-2022-8bit",                   ""},
  {"iso-2022-8bit-lock"    ,          ""},
  {"iso-2022-8bit-lock-ss2",          ""},
  {"iso-2022-8bit-ss2",               ""},
  {"iso-2022-cjk",                    ""},
  {"iso-2022-cn",                     "ISO-2022-CN"},
  {"iso-2022-cn-ext",                 "ISO-2022-CN-EXT"},
  {"iso-2022-int-1",                  ""},
  {"iso-2022-jp",                     "ISO-2022-JP"},
  {"iso-2022-jp-1978-irv",            "ISO-2022-JP"},
  {"iso-2022-jp-2",                   "ISO-2022-JP-2"},
  {"iso-2022-jp-3",                   ""},
  {"iso-2022-jp-3-compatible",        ""},
  {"iso-2022-jp-3-strict",            ""},
  {"iso-2022-kr",                     "ISO-2022-KR"},
  {"iso-2022-lock",                   ""},
  {"iso-8859-1",                      "ISO-8859-1"},
  {"iso-8859-2",                      "ISO-8859-2"},
  {"iso-8859-3",                      "ISO-8859-3"},
  {"iso-8859-4",                      "ISO-8859-4"},
  {"iso-8859-5",                      "ISO-8859-5"},
  {"iso-8859-6",                      "ISO-8859-6"},
  {"iso-8859-7",                      "ISO-8859-7"},
  {"iso-8859-8",                      "ISO-8859-8"},
  {"iso-8859-9",                      "ISO-8859-9"},
  {"iso-latin-1",                     "ISO-8859-1"},
  {"iso-latin-2",                     "ISO-8859-2"},
  {"iso-latin-3",                     "ISO-8859-3"},
  {"iso-latin-4",                     "ISO-8859-4"},
  {"iso-latin-5",                     "ISO-8859-9"},
  {"iso-safe",                        ""},
  {"japanese-iso-7bit-1978-irv",      "ISO-2022-JP"},
  {"japanese-iso-8bit",               ""},
  {"japanese-iso-8bit-with-esc",      ""},
  {"japanese-euc",                    "EUC-JP"},
  {"japanese-shift-jis",              "Shift_JIS"},
  {"japanese-shift-jisx0213",         ""},
  {"junet",                           "ISO-2022-JP"},
  {"koi8",                            "KOI8-R"},
  {"koi8-r",                          "KOI8-R"},
  {"korean-euc",                      "EUC-KR"},
  {"korean-iso-7bit-lock",            "ISO-2022-KR"},
  {"korean-iso-8bit",                 ""},
  {"korean-iso-8bit-with-esc",        ""},
  {"lao",                             ""},
  {"lao-with-esc",                    ""},
  {"latin-1",                         "ISO-8859-1"},
  {"latin-2",                         "ISO-8859-2"},
  {"latin-3",                         "ISO-8859-3"},
  {"latin-4",                         "ISO-8859-4"},
  {"latin-5",                         "ISO-8859-9"},
  {"mac",                             ""},
  {"old-jis",                         "ISO-2022-JP"},
  {"raw-text",                        ""},
  {"shift_jis",                       "Shift_JIS"},
  {"shift_jisx0213",                  "Shift_JIS"},
  {"sjis",                            "Shift_JIS"},
  {"th-tis620",                       "TIS-620"},
  {"thai-tis620",                     "TIS-620"},
  {"tibetan",                         ""},
  {"tis-620",                         "TIS-620"},
  {"tis620",                          "TIS-620"},
  {"us-ascii",                        "US-ASCII"},
  {"utf-16-be",                       "UTF-16BE"},
  {"utf-16-be-no-signature",          "UTF-16BE"},
  {"utf-16-le",                       "UTF-16LE"},
  {"utf-16-le-no-signature",          "UTF-16LE"},
  {"utf-7",                           "UTF-7"},
  {"utf-7-safe",                      "UTF-7"},
  {"utf-8",                           "UTF-8"},
  {"utf-8-ws",                        "UTF-8"},
  {"vietnamese-viqr",                 "VIQR"},
  {"vietnamese-viscii",               "VISCII"},
  {"vietnamese-vscii",                "VISCII"},
  {"viqr",                            "VIQR"},
  {"viscii",                          "VISCII"},
  {"vscii",                           "VSCII"},
  {"x-ctext",                         ""},
  {NULL,                              NULL}
};

/* Translation table used by mime2locale().  It currently holds only the
 * {NULL, NULL} terminator, so mime2locale() returns every name unchanged. */
encname_t mime_to_locale[] = {
  {NULL,                              NULL}
};

/*  ---------------------------------------------------------
 *  program entry point
 *
 *  Reads the input (named file or stdin) as a whole, decides on
 *  its encoding (-e option, then an encoding tag in the input,
 *  then the locale default) and writes the UTF-8 conversion to
 *  stdout.  Diagnostics go to stderr so that they never end up
 *  in the converted output.  Returns 0 on success, 1 on error.
 *  ---------------------------------------------------------
 */
int main(int argc, char **argv)
{
  char *encoding = NULL, *default_encoding, *inbuf;
  FILE *fp;
#ifdef I18N
  char *locale;
#endif

  /* determine the default encoding.  This part has to be located
   * before getopt() since the help message shows the default
   * encoding.
   */
#ifdef I18N
  setlocale(LC_ALL, "");
  locale = setlocale(LC_CTYPE, NULL);
  if (!locale || !strcmp(locale, "C") || !strcmp(locale, "POSIX")) {
    default_encoding = "latin1";
  } else {
    default_encoding = nl_langinfo(CODESET);
    if (!default_encoding) default_encoding = "latin1";
  }
#else
  default_encoding = "latin1";
#endif

  /* parse the command option */
  for (;;) {
    int opt = getopt(argc, argv, "e:h");
    if (opt == -1) break;
    switch (opt) {
    case 'e':
      /* optarg points into argv, which outlives every use of
       * `encoding', so no copy is needed. */
      encoding = optarg;
      break;
    case 'h':
      help(default_encoding);
      return 0;
    default:
      return 1;
    }
  }

  /* read a source */
  if (optind < argc) {
    fp = fopen(argv[optind], "r");
    if (!fp) {
      fprintf(stderr, "Cannot open %s\n", argv[optind]);
      return 1;
    }
    inbuf = read_file(fp);
    fclose(fp);
  } else {
    inbuf = read_file(stdin);
  }

  /* finally determine the encoding */
  if (encoding == NULL) {
    encoding = check_encoding_tag(inbuf);
    if (encoding == NULL) {
      encoding = default_encoding;
    }
  }

  /* translate from MIME & Emacs encoding names to locale encoding names */
  encoding = emacs2mime(encoding);
  encoding = mime2locale(encoding);

  /* call converter (converters write to stdout) */
  if (!strcasecmp(encoding, "latin1")) {
    conversion_latin1(inbuf, encoding);
  } else if (!strcasecmp(encoding, "utf8")) {
    conversion_utf8(inbuf, encoding);
  } else if (!strcasecmp(encoding, "cp1047")) {
    conversion_cp1047(inbuf, encoding);
  } else {
#ifdef I18N
    conversion_iconv(inbuf, encoding);
#else
    fprintf(stderr, "Conversion from %s to UTF-8 is not supported.\n",
            encoding);
    return 1;
#endif
  }

  /* report output failures (e.g. full disk, closed pipe) via the
   * exit status instead of silently dropping data */
  if (fflush(stdout) == EOF || ferror(stdout)) {
    fprintf(stderr, "Write error.\n");
    return 1;
  }
  return 0;
}

/*  ---------------------------------------------------------
 *  print help message
 *
 *  Writes the usage text to stdout.  default_encoding is the
 *  encoding name shown as the default.
 *  ---------------------------------------------------------
 */
void help(char *default_encoding)
{
  /* The build variant is chosen here rather than with #ifdef inside
   * the printf() argument list: printf may be a function-like macro,
   * and preprocessor directives inside macro arguments are undefined
   * behaviour. */
#ifdef I18N
  const char *variant = "internationalized version";
#else
  const char *variant = "non-internationalized version";
#endif

  printf(
         "Preprocessor for Groff system (%s)\n"
         "Usage: gpreconv [option] [input file]\n"
         "  -e encoding     specify encoding\n"
         "  -h              this message\n"
         "The default encoding is \"%s\".\n",
         variant,
         default_encoding);
}

/*  ---------------------------------------------------------
 *  read input file
 *  The file has to be read as a whole before conversion
 *  since the encoding may be stateful like the ISO-2022 series.
 *
 *  Returns a heap buffer holding everything read from fp,
 *  followed by a terminating NUL.  On allocation failure or a
 *  read error a message is written to stderr and the program
 *  exits with status 1.
 *  ---------------------------------------------------------
 */
char *read_file (FILE *fp)
{
#define READBUF_SIZE 32768
  char *buf = NULL;
  size_t capacity = 0;   /* bytes available for data (NUL not counted) */
  size_t used = 0;       /* bytes read so far */

  for (;;) {
    if (used == capacity) {
      char *grown;
      capacity += READBUF_SIZE;
      /* one extra byte is kept for the terminating NUL */
      grown = realloc(buf, capacity + 1);
      if (!grown) {
        fprintf(stderr, "Unable to allocate memory.\n");
        exit(1);
      }
      buf = grown;
    }
    used += fread(buf + used, 1, capacity - used, fp);
    if (ferror(fp)) {
      fprintf(stderr, "Read error.\n");
      exit(1);
    }
    if (feof(fp)) break;
  }
  buf[used] = 0;
  return buf;
}

/*  ---------------------------------------------------------
 *  check encoding tag in the read buffer
 *
 *  Scans the leading lines of inbuf that start with the roff
 *  comment introducer `.\"' for a tag of the form
 *      -*- variable: value; variable: value -*-
 *  and returns the value of the `coding' variable (the variable
 *  name is compared case-insensitively), or NULL if no such
 *  variable is found.  The returned pointer is the value buffer
 *  handed back by check_encoding_tag_parse_tag().
 *
 *  inbuf is modified temporarily while scanning; every byte
 *  that is overwritten is restored before the function returns.
 *  ---------------------------------------------------------
 */
char *check_encoding_tag(char *inbuf)
{
  char *p, *lineend, *d1, *d2, *variable, *value;
  /* visit consecutive comment lines; stop at the first line that does
   * not begin with `.\"' */
  for (p=inbuf ; !strncmp(p, ".\\\"", 3) ; p = lineend + 1) {
    /* a final line without a newline is not examined */
    if ((lineend = strchr(p, '\n')) == NULL) break;
    /* cut the buffer at the end of this line so the searches below
     * cannot run into the following lines */
    *lineend = 0;
    /* d1: opening `-*-', d2: closing `-*-' (d2 is only set, and only
     * read, when d1 was found) */
    d1 = strstr(p, "-*-"); if (d1) d2 = strstr(d1+3, "-*-");
    *lineend = '\n';
    if (!d1 || !d2) continue;
    /* terminate the tag body at the closing delimiter and step over
     * the opening one */
    *d2 = 0; d1+=3;
    /* walk the `variable: value' pairs between the delimiters */
    while(*d1) {
      d1 = check_encoding_tag_parse_tag(d1, &variable, &value);
      if (!strcasecmp(variable, "coding")) {
        *d2 = '-';  /* restore the closing delimiter */
        return value;
      }
    }
    *d2 = '-';  /* restore the closing delimiter */
  }
  return NULL;
}

/* Copy characters from src to dst until a NUL, a character contained in
 * `stops', or MAX_VAR_LEN-1 copied characters, whichever comes first.
 * dst is always NUL-terminated.  Returns the position in src where
 * copying stopped. */
static char *parse_tag_copy_token(char *src, char *dst, const char *stops)
{
  int n;

  for (n = 0; n < MAX_VAR_LEN - 1; n++, src++) {
    if (*src == 0 || strchr(stops, *src) != NULL)
      break;
    dst[n] = *src;
  }
  dst[n] = 0;
  return src;
}

/* Parse one `variable: value' item of an encoding tag starting at d1.
 *
 * *variable and *value are set to static buffers holding the parsed
 * name and value (each truncated to MAX_VAR_LEN-1 characters; the value
 * is empty when the item has no `:').  The buffers are overwritten by
 * the next call.
 *
 * Returns the position just past the `;' that ends the item, or the
 * position of the terminating NUL if the string ends first. */
char *check_encoding_tag_parse_tag(char *d1, char **variable, char **value)
{
  static char name_buf[MAX_VAR_LEN], value_buf[MAX_VAR_LEN];

  *variable = name_buf;
  *value = value_buf;
  value_buf[0] = 0;

  /* variable name: skip leading blanks, copy, then move on to the
   * separator that follows it */
  d1 += strspn(d1, " \t");
  d1 = parse_tag_copy_token(d1, name_buf, ";: \t");
  d1 += strcspn(d1, ":;");

  /* no `:' means the item has no value: step over a `;', stay on NUL */
  if (*d1 != ':')
    return *d1 ? d1 + 1 : d1;

  /* value: skip the `:' and leading blanks, copy, then move on to the
   * end of the item */
  d1++;
  d1 += strspn(d1, " \t");
  d1 = parse_tag_copy_token(d1, value_buf, "; \t");
  d1 += strcspn(d1, ";");

  return *d1 ? d1 + 1 : d1;
}

/*  ---------------------------------------------------------
 *  convert encoding name from emacs to mime
 *
 *  A trailing end-of-line suffix (`-dos', `-mac' or `-unix',
 *  case-insensitive) is removed first, then the name is looked
 *  up case-insensitively in emacs_to_mime.  Returns the table's
 *  replacement (which may be the empty string) or, if the name
 *  is not in the table, a static copy of the name without the
 *  suffix, truncated to MAX_VAR_LEN-1 characters.  The static
 *  copy is overwritten by the next call.
 *  ---------------------------------------------------------
 */
char *emacs2mime(char *emacs_encoding)
{
  static const char *const eol_suffixes[] = { "-dos", "-mac", "-unix" };
  static char emacs_enc[MAX_VAR_LEN];
  size_t emacs_enc_len, i;
  encname_t *table;

  strncpy(emacs_enc, emacs_encoding, MAX_VAR_LEN-1);
  emacs_enc[MAX_VAR_LEN-1] = 0;
  emacs_enc_len = strlen(emacs_enc);

  for (i = 0; i < sizeof eol_suffixes / sizeof eol_suffixes[0]; i++) {
    size_t suffix_len = strlen(eol_suffixes[i]);
    /* the length check keeps the comparison inside the buffer for
     * names shorter than the suffix (e.g. "hz") */
    if (emacs_enc_len >= suffix_len &&
        !strcasecmp(emacs_enc + emacs_enc_len - suffix_len,
                    eol_suffixes[i])) {
      emacs_enc[emacs_enc_len - suffix_len] = 0;
      break;  /* at most one suffix is removed */
    }
  }

  for (table = emacs_to_mime; table->from; table++) {
    if (!strcasecmp(emacs_enc, table->from)) return table->to;
  }
  return emacs_enc;
}

/*  ---------------------------------------------------------
 *  convert encoding name from mime to locale
 *
 *  Looks mime_encoding up case-insensitively in mime_to_locale
 *  and returns the replacement name; a name that is not in the
 *  table is returned unchanged.
 *  ---------------------------------------------------------
 */
char *mime2locale(char *mime_encoding)
{
  encname_t *entry = mime_to_locale;

  while (entry->from != NULL) {
    if (strcasecmp(mime_encoding, entry->from) == 0)
      return entry->to;
    entry++;
  }
  return mime_encoding;
}

/*  ---------------------------------------------------------
 *  conversion functions
 *  ---------------------------------------------------------
 */

/* conversion from ISO-8859-1 (aka Latin-1) to UTF-8
 *
 * Writes the NUL-terminated string inbuf to stdout.  Bytes below 0x80
 * are copied as they are; every other byte becomes the two-byte UTF-8
 * sequence for the code point of the same value.  `encoding' is not
 * used. */
void conversion_latin1 (char *inbuf, char *encoding)
{
  const unsigned char *p;

  (void)encoding;
  /* read the input as unsigned bytes so that values >= 0x80 are not
   * sign-extended; the cast is required because char * and
   * unsigned char * are not assignment-compatible */
  for (p = (const unsigned char *)inbuf; *p; p++) {
    if (*p < 0x80) {
      putchar(*p);
    } else {
      putchar(0xc0 | (*p >> 6));    /* lead byte: top two bits */
      putchar(0x80 | (*p & 0x3f));  /* continuation: low six bits */
    }
  }
}

/* conversion from UTF-8 to UTF-8: the NUL-terminated input is written
 * to stdout unchanged.  `encoding' is not used. */
void conversion_utf8 (char *inbuf, char *encoding)
{
  (void)encoding;
  fputs(inbuf, stdout);
}

/* conversion from CP1047 (EBCDIC) to UTF-8 */
/* the table is made from /font/devcp1047/R.proto in groff 1.16 */
/*
 * Writes the NUL-terminated string inbuf to stdout.  Each input byte
 * is mapped through the table below to a Latin-1 code, which is then
 * written as UTF-8.  A table entry of 0 means "no mapping": the input
 * byte is passed through as it is.
 * NOTE(review): the entries for 0x00-0x3f (EBCDIC control codes) and
 * 0xff are all 0, so those bytes are passed through untranslated --
 * confirm whether a real control-code mapping is wanted.
 * `encoding' is not used.
 */
void conversion_cp1047 (char *inbuf, char *encoding)
{
  static const unsigned char cp1047[] = {
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 0x00-0x07 */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 0x08-0x0f */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 0x10-0x17 */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 0x18-0x1f */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 0x20-0x27 */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 0x28-0x2f */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 0x30-0x37 */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 0x38-0x3f */
    /* 0x40 is the EBCDIC space and 0x41 the no-break space; without
     * entries they would be passed through as '@' and 'A' */
    ' ' , 0xa0, 0xe2, 0xe4, 0xe0, 0xe1, 0xe3, 0xe5,  /* 0x40-0x47 */
    0xe7, 0xf1, 0xa2, '.' , '<' , '(' , '+' , '|' ,  /* 0x48-0x4f */
    /* 0x51 is e-acute (0xe9), the lower-case partner of 0x71 (0xc9) */
    '&' , 0xe9, 0xea, 0xeb, 0xe8, 0xed, 0xee, 0xef,  /* 0x50-0x57 */
    0xec, 0xdf, '!' , '$' , '*' , ')' , ';' , '^' ,  /* 0x58-0x5f */
    '-' , '/' , 0xc2, 0xc4, 0xc0, 0xc1, 0xc3, 0xc5,  /* 0x60-0x67 */
    0xc7, 0xd1, 0xa6, ',' , '%' , '_' , '>' , '?' ,  /* 0x68-0x6f */
    0xf8, 0xc9, 0xca, 0xcb, 0xc8, 0xcd, 0xce, 0xcf,  /* 0x70-0x77 */
    0xcc, '`' , ':' , '#' , '@' , '\'', '=' , '\"',  /* 0x78-0x7f */
    0xd8, 'a' , 'b' , 'c' , 'd' , 'e' , 'f' , 'g' ,  /* 0x80-0x87 */
    'h' , 'i' , 0xab, 0xbb, 0xf0, 0xfd, 0xfe, 0xb1,  /* 0x88-0x8f */
    0xb0, 'j' , 'k' , 'l' , 'm' , 'n' , 'o' , 'p' ,  /* 0x90-0x97 */
    'q' , 'r' , 0xaa, 0xba, 0xe6, 0xb8, 0xc6, 0xa4,  /* 0x98-0x9f */
    0xb5, '~' , 's' , 't' , 'u' , 'v' , 'w' , 'x' ,  /* 0xa0-0xa7 */
    'y' , 'z' , 0xa1, 0xbf, 0xd0, '[' , 0xde, 0xae,  /* 0xa8-0xaf */
    0xac, 0xa3, 0xa5, 0xb7, 0xa9, 0xa7, 0xb6, 0xbc,  /* 0xb0-0xb7 */
    0xbd, 0xbe, 0xdd, 0xa8, 0xaf, ']' , 0xb4, 0xd7,  /* 0xb8-0xbf */
    '{' , 'A' , 'B' , 'C' , 'D' , 'E' , 'F' , 'G' ,  /* 0xc0-0xc7 */
    'H' , 'I' , 0xad, 0xf4, 0xf6, 0xf2, 0xf3, 0xf5,  /* 0xc8-0xcf */
    '}' , 'J' , 'K' , 'L' , 'M' , 'N' , 'O' , 'P' ,  /* 0xd0-0xd7 */
    'Q' , 'R' , 0xb9, 0xfb, 0xfc, 0xf9, 0xfa, 0xff,  /* 0xd8-0xdf */
    '\\', 0xf7, 'S' , 'T' , 'U' , 'V' , 'W' , 'X' ,  /* 0xe0-0xe7 */
    'Y' , 'Z' , 0xb2, 0xd4, 0xd6, 0xd2, 0xd3, 0xd5,  /* 0xe8-0xef */
    '0' , '1' , '2' , '3' , '4' , '5' , '6' , '7' ,  /* 0xf0-0xf7 */
    '8' , '9' , 0xb3, 0xdb, 0xdc, 0xd9, 0xda, 0x00   /* 0xf8-0xff */
  };
  const unsigned char *p;
  unsigned char c;

  (void)encoding;
  /* the cast is required because char * and unsigned char * are not
   * assignment-compatible */
  for (p = (const unsigned char *)inbuf; *p; p++) {
    c = cp1047[*p];
    if (c == 0) c = *p; /* fail safe */
    if (c < 0x80) {
      putchar(c);
    } else {
      putchar(0xc0 | (c >> 6));    /* lead byte: top two bits */
      putchar(0x80 | (c & 0x3f));  /* continuation: low six bits */
    }
  }
}

/* locale-sensible conversion */
#ifdef I18N
/* Convert the NUL-terminated string inbuf from `encoding' to UTF-8 with
 * iconv() and write the result to stdout.
 *
 * The whole input is converted into one growing heap buffer before
 * anything is written.  If the encoding is not supported, the input
 * contains an invalid or incomplete sequence, memory runs out, or
 * iconv() fails in any other way, a message is written to stderr and
 * the program exits with status 1. */
void conversion_iconv (char *inbuf, char *encoding)
{
#define OUTBUF_SIZE 32768
#define OUTBUF_LIMIT 10
  char *outbuf = NULL, *outbuf_top = NULL;
  size_t bufsize = 0;
  size_t inbytesleft, outbytesleft = 0;
  int need_room = 0;     /* set when iconv() reported E2BIG */
  iconv_t handle;

  handle = iconv_open("UTF-8", encoding);
  if (handle == (iconv_t)-1) {
    if (errno == EINVAL) {
      fprintf(stderr, "Conversion from %s to UTF-8 is not supported.\n",
              encoding);
      exit(1);
    }
    fprintf(stderr, "iconv_open failed!\n");
    exit(1);
  }

  /* The terminating NUL is not converted; the amount of output is
   * taken from the output pointer instead. */
  inbytesleft = strlen(inbuf);
  while (inbytesleft > 0) {
    if (need_room || outbytesleft < OUTBUF_LIMIT) {
      size_t used = bufsize - outbytesleft;
      char *grown = realloc(outbuf_top, bufsize + OUTBUF_SIZE);
      if (!grown) {
        fprintf(stderr, "Unable to allocate memory.\n");
        exit(1);
      }
      outbuf_top = grown;
      /* realloc may have moved the buffer: re-derive the write position */
      outbuf = outbuf_top + used;
      bufsize += OUTBUF_SIZE;
      outbytesleft += OUTBUF_SIZE;
      need_room = 0;
    }
    if (iconv(handle, &inbuf, &inbytesleft, &outbuf, &outbytesleft)
        == (size_t)-1) {
      if (errno == E2BIG) {
        /* output buffer full: enlarge it and resume */
        need_room = 1;
        continue;
      }
      if (errno == EINVAL || errno == EILSEQ) {
        fprintf(stderr, "Invalid character.\n");
        exit(1);
      }
      /* any other error would otherwise repeat forever */
      fprintf(stderr, "iconv failed!\n");
      exit(1);
    }
  }

  if (outbuf_top != NULL)
    fwrite(outbuf_top, 1, (size_t)(outbuf - outbuf_top), stdout);
  free(outbuf_top);
  iconv_close(handle);
}
#endif

reply via email to

[Prev in Thread] Current Thread [Next in Thread]