/*
 *
 * Program to convert files between ASCII and UTF8, using the
 * &#-escapes from XML to escape non-ASCII characters.
 *
 * Usage:
 *
 *   xml2asc
 * or
 *   asc2xml
 *
 * Both forms read from stdin and write to stdout. The first form
 * converts from UTF8 (with or without &#-escapes) to ASCII, inserting
 * &#-escapes for all non-ASCII characters. The second form converts
 * from ASCII (with or without &#-escapes) to UTF8, removing all
 * &#-escapes, except those representing ASCII characters.
 *
 * If invoked under any other name, the action is the same as xml2asc.
 *
 * Version: $Revision: 1.3 $ ($Date: 1998/01/19 18:58:17 $)
 * Author: Bert Bos
 *
 * Copyright © World Wide Web Consortium, (Massachusetts Institute of
 * Technology, Institut National de Recherche en Informatique et en
 * Automatique, Keio University). All Rights Reserved.
 *
 * Please read http://www.w3.org/Consortium/Legal/copyright-software.html
 *
 **/
#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Code point substituted for input that cannot be decoded (U+FFFD) */
enum { REPLACEMENT_CHAR = 0xFFFD };


/*
 * putUTF8 -- write a character to stdout in UTF8 encoding
 *
 * Values up to 0x7F (negative values included) are not written as a raw
 * byte but as a decimal "&#N;" escape, so that escapes for ASCII
 * characters survive the conversion. Values up to 0x7FFFFFFF are written
 * as a 2- to 6-byte sequence; anything larger is written as a decimal
 * escape as well.
 */
static void putUTF8(long c)
{
  if (c <= 0x7F) {                      /* Leave ASCII encoded */
    printf("&#%ld;", c);
  } else if (c <= 0x07FF) {             /* 110xxxxx 10xxxxxx */
    putchar((int)(0xC0 | (c >> 6)));
    putchar((int)(0x80 | (c & 0x3F)));
  } else if (c <= 0xFFFF) {             /* 1110xxxx + 2 */
    putchar((int)(0xE0 | (c >> 12)));
    putchar((int)(0x80 | ((c >> 6) & 0x3F)));
    putchar((int)(0x80 | (c & 0x3F)));
  } else if (c <= 0x1FFFFF) {           /* 11110xxx + 3 */
    putchar((int)(0xF0 | (c >> 18)));
    putchar((int)(0x80 | ((c >> 12) & 0x3F)));
    putchar((int)(0x80 | ((c >> 6) & 0x3F)));
    putchar((int)(0x80 | (c & 0x3F)));
  } else if (c <= 0x3FFFFFF) {          /* 111110xx + 4 */
    putchar((int)(0xF8 | (c >> 24)));
    putchar((int)(0x80 | ((c >> 18) & 0x3F)));
    putchar((int)(0x80 | ((c >> 12) & 0x3F)));
    putchar((int)(0x80 | ((c >> 6) & 0x3F)));
    putchar((int)(0x80 | (c & 0x3F)));
  } else if (c <= 0x7FFFFFFF) {         /* 1111110x + 5 */
    putchar((int)(0xFC | (c >> 30)));
    putchar((int)(0x80 | ((c >> 24) & 0x3F)));
    putchar((int)(0x80 | ((c >> 18) & 0x3F)));
    putchar((int)(0x80 | ((c >> 12) & 0x3F)));
    putchar((int)(0x80 | ((c >> 6) & 0x3F)));
    putchar((int)(0x80 | (c & 0x3F)));
  } else {                              /* Not a valid character... */
    printf("&#%ld;", c);
  }
}


/* digit_value -- numeric value of a (hexa)decimal digit; assumes ASCII */
static int digit_value(int c)
{
  if (c <= '9') {
    return c - '0';
  }
  if (c <= 'F') {
    return c - 'A' + 10;
  }
  return c - 'a' + 10;
}


/*
 * convert_char_ref -- convert the rest of a character reference
 *
 * Called when "&#" has just been read from stdin. Reads the optional 'x'
 * and the digits, and writes the character with putUTF8(). A ';' after
 * the digits is consumed; any other character is pushed back so that the
 * caller processes it normally. If there are no digits at all, the text
 * read so far ("&#" or "&#x") is copied to stdout unchanged. A number
 * too large for a long is written as U+FFFD.
 */
static void convert_char_ref(void)
{
  long n = 0;
  int base = 10;
  int ndigits = 0;
  int overflow = 0;
  int c, d;

  c = getchar();
  if (c == 'x') {                       /* '&#x' + hexadecimal */
    base = 16;
    c = getchar();
  }

  /* c comes from getchar(), so it is a valid argument for <ctype.h> */
  while (base == 16 ? isxdigit(c) : isdigit(c)) {
    d = digit_value(c);
    if (n > (LONG_MAX - d) / base) {    /* base * n + d would overflow */
      overflow = 1;
    } else {
      n = base * n + d;
    }
    ndigits++;
    c = getchar();
  }

  if (ndigits == 0) {                   /* Not a character reference */
    fputs(base == 16 ? "&#x" : "&#", stdout);
  } else if (overflow) {
    putUTF8(REPLACEMENT_CHAR);
  } else {
    putUTF8(n);
  }

  /* Only a ';' that ends a real reference belongs to the reference */
  if (c != EOF && !(ndigits > 0 && c == ';')) {
    ungetc(c, stdin);
  }
}


/*
 * asc2xml -- copy stdin to stdout, converting ASCII XML to UTF8 XML
 *
 * Everything is copied unchanged, except for numeric character
 * references ("&#N;" and "&#xH;"), which are handed to putUTF8().
 */
static void asc2xml(void)
{
  int c;

  while ((c = getchar()) != EOF) {
    if (c != '&') {                     /* Normal ASCII char */
      putchar(c);
    } else if ((c = getchar()) == '#') {
      convert_char_ref();
    } else {                            /* '&' not followed by '#' */
      putchar('&');
      if (c != EOF) {
        ungetc(c, stdin);               /* May itself be a '&' */
      }
    }
  }
}


/*
 * getUTF8 -- read a UTF8 encoded character from stdin
 *
 * Returns the decoded value, or EOF at end of input. Sequences of up to
 * 6 bytes (values up to 0x7FFFFFFF) are accepted. Returns U+FFFD for a
 * byte that cannot start a sequence and for a sequence that is cut short
 * by EOF or by a byte that is not of the form 10xxxxxx; such a byte is
 * pushed back and starts the next character. Overlong encodings and
 * surrogates are not rejected.
 */
static long getUTF8(void)
{
  long c;
  int b, extra;

  if ((b = getchar()) == EOF) {         /* EOF */
    return EOF;
  }
  if (b <= 0x7F) {                      /* ASCII */
    return b;
  }

  if ((b & 0xE0) == 0xC0) {             /* 110xxxxx + 1 */
    c = b & 0x1F;
    extra = 1;
  } else if ((b & 0xF0) == 0xE0) {      /* 1110xxxx + 2 */
    c = b & 0x0F;
    extra = 2;
  } else if ((b & 0xF8) == 0xF0) {      /* 11110xxx + 3 */
    c = b & 0x07;
    extra = 3;
  } else if ((b & 0xFC) == 0xF8) {      /* 111110xx + 4 */
    c = b & 0x03;
    extra = 4;
  } else if ((b & 0xFE) == 0xFC) {      /* 1111110x + 5 */
    c = b & 0x01;
    extra = 5;
  } else {                              /* Stray 10xxxxxx, 0xFE or 0xFF */
    return REPLACEMENT_CHAR;
  }

  /* Shifting the long accumulator cannot overflow: at most 31 bits */
  while (extra-- > 0) {
    b = getchar();
    if (b == EOF) {                     /* Truncated sequence */
      return REPLACEMENT_CHAR;
    }
    if ((b & 0xC0) != 0x80) {           /* Not a continuation byte */
      ungetc(b, stdin);
      return REPLACEMENT_CHAR;
    }
    c = (c << 6) | (b & 0x3F);
  }
  return c;
}
/*
 * xml2asc -- copy stdin to stdout, converting UTF8 XML to ASCII XML
 *
 * Characters up to 127 are copied as they are, all others are written
 * as decimal "&#N;" escapes.
 */
static void xml2asc(void)
{
  long c;

  while ((c = getUTF8()) != EOF) {
    if (c <= 127) {
      putchar((int)c);
    } else {
      printf("&#%ld;", c);
    }
  }
}


/*
 * Print usage message, then exit
 *
 * progname is the name to show in the message; if it is NULL, "xml2asc"
 * is shown instead. Exits with status 1.
 */
static void usage(char *progname)
{
  /* The program is a filter: it takes no arguments, only redirections */
  fprintf(stderr, "Usage: %s <infile >outfile\n",
          progname != NULL ? progname : "xml2asc");
  exit(1);
}


/*
 * main -- main body
 *
 * Accepts no command line arguments. Runs asc2xml() if the name under
 * which the program was invoked ends in "asc2xml", otherwise xml2asc().
 */
int main(int argc, char *argv[])
{
  static const char asc2xml_name[] = "asc2xml";
  const size_t suffix_len = sizeof asc2xml_name - 1;
  size_t len;

  if (argc != 1) {
    usage(argc > 0 ? argv[0] : NULL);   /* argv[0] may be NULL if argc is 0 */
  }

  /* Only look at the tail of argv[0] if it is long enough to have one */
  len = strlen(argv[0]);
  if (len >= suffix_len
      && strcmp(argv[0] + len - suffix_len, asc2xml_name) == 0) {
    asc2xml();
  } else {
    xml2asc();
  }
  return 0;
}