/*
 * tabtocsv -- convert tab-separated values to comma-separated values
 *
 * Copyright © 2014 World Wide Web Consortium
 * See http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231
 *
 * Author: Bert Bos
 * Created: 3 April 2014
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <unistd.h>
#include <getopt.h>
#include <err.h>

#define VERSION "0.1"

/* Long command line options. getopt_long() requires the array to end
 * with an element that is all zeros. */
static struct option longopts[] = {
  {"crlf", no_argument, NULL, 'c'},
  {"help", no_argument, NULL, 'h'},
  {"rfc4180", no_argument, NULL, 'r'},
  {"separator", required_argument, NULL, 's'},
  {"version", no_argument, NULL, 'v'},
  {NULL, 0, NULL, 0}
};

static bool strict = false;	/* --rfc4180: refuse control characters */
static bool crlf = false;	/* --crlf: force CR-LF line endings */
static char sep = ',';		/* --separator: output field separator */

#define out(c) putchar((c))
#define outs(s) fputs((s), stdout)


/* version -- print program version and exit */
static void version(const char *prog)
{
  printf("%s %s\n", prog, VERSION);
  exit(0);
}


/* usage -- print usage message on stderr and exit */
static void usage(const char *prog)
{
  fprintf(stderr,
	  "Usage: %s [-h] [-v] [-r] [-s char] [-c] [file [file...]]\n",
	  prog);
  exit(1);
}


/* help -- print explanation of command line options and exit */
static void help(const char *prog)
{
  printf("Usage: %s [options] [--] [file [file...]]\n"
	 "Convert tab-separated files to comma-separated.\n"
	 "Version: %s\n"
	 "Options:\n"
	 "  -r, --rfc4180\n"
	 "      Strict conformance to RFC 4180: refuse control characters\n"
	 "      (ASCII 0-31), use a comma (,) as separator and CR-LF as line\n"
	 "      ending. Implies --crlf. Incompatible with --separator.\n"
	 "  -c, --crlf\n"
	 "      Force line endings to be CR-LF. Default is to keep the\n"
	 "      line endings of the input.\n"
	 "  -s, --separator=<char>\n"
	 "      The character to insert between fields. Default is a comma (,).\n"
	 "  -h, --help\n"
	 "      Print this help text and exit.\n"
	 "  -v, --version\n"
	 "      Print the program version number and exit.\n"
	 "The arguments are files with lines of tab-separated fields.\n"
	 "Without arguments, or if the argument is -, the lines are read\n"
	 "from standard input.\n",
	 prog, VERSION);
  exit(0);
}


/* end_line -- write the line ending for an input line that ended in LF */
static void end_line(void)
{
  if (crlf) out('\r');
  out('\n');
}


/* bare_cr -- write the line ending for an input CR not followed by LF */
static void bare_cr(void)
{
  out('\r');
  if (crlf) out('\n');
}


/* process -- read tab-separated file f and write comma-separated to stdout
 *
 * f is the open input stream and name is the name used for it in
 * error messages. Every non-empty field is written between double
 * quotes, with double quotes inside the field doubled. The escapes
 * \t, \r, \n and \\ in the input are replaced by the character they
 * stand for; a backslash before anything else is copied as is.
 * Exits with an error on a read error, or in --rfc4180 mode on a
 * control character or a \t escape.
 */
static void process(FILE *f, const char *name)
{
  /* START = at the start of a line, TAB = just after a separator,
   * CR = a CR was read but its line ending is not yet written,
   * IN_FIELD = inside a field (opening quote written),
   * BACKSLASH = inside a field, just after a backslash. */
  enum {START, TAB, CR, IN_FIELD, BACKSLASH} state = START;
  int c;

  /* Remove the BOM (EF BB BF), if any. If only the first one or two
   * bytes match, they are ordinary data that starts the first field,
   * so the opening quote of that field has to be written as well. */
  if ((c = fgetc(f)) != 0xEF) {
    ungetc(c, f);		/* No effect if c is EOF */
  } else if ((c = fgetc(f)) != 0xBB) {
    out('"');
    out(0xEF);
    state = IN_FIELD;
    ungetc(c, f);
  } else if ((c = fgetc(f)) != 0xBF) {
    out('"');
    out(0xEF);
    out(0xBB);
    state = IN_FIELD;
    ungetc(c, f);
  }

  while ((c = fgetc(f)) != EOF) {

    if (strict && c < 0x20 && c != '\r' && c != '\n' && c != '\t')
      errx(1, "Control characters in input are not allowed in --rfc4180 mode");

    switch (state) {
    case CR:
      if (c == '\n') {		/* CR-LF in the input is always kept */
	outs("\r\n");
	state = START;
	break;
      }
      /* A CR that is not followed by LF is a line ending by itself.
       * Write it and treat c as the first character of a new line. */
      bare_cr();
      /* fallthrough */
    case START:
    case TAB:			/* c may be the start of a field */
      switch (c) {
      case '\\': out('"'); state = BACKSLASH; break;
      case '\t': out(sep); state = TAB; break; /* Empty field */
      case '"': outs("\"\"\""); state = IN_FIELD; break;
      case '\r': state = CR; break;
      case '\n': end_line(); state = START; break;
      default: out('"'); out(c); state = IN_FIELD; break;
      }
      break;
    case IN_FIELD:
      switch (c) {
      case '\\': state = BACKSLASH; break;
      case '\t': out('"'); out(sep); state = TAB; break;
      case '"': outs("\"\""); break;
      case '\r': out('"'); state = CR; break;
      case '\n': out('"'); end_line(); state = START; break;
      default: out(c); break;
      }
      break;
    case BACKSLASH:
      switch (c) {
      case 't':
	if (strict) errx(1, "Tab (\\t) is not allowed in --rfc4180 mode");
	out('\t');
	state = IN_FIELD;
	break;
      case 'r': out('\r'); state = IN_FIELD; break;
      case 'n': out('\n'); state = IN_FIELD; break;
      case '\\': out('\\'); state = IN_FIELD; break;
      case '"': outs("\\\"\""); state = IN_FIELD; break;
      /* A backslash at the end of a field is a literal backslash */
      case '\n': outs("\\\""); end_line(); state = START; break;
      case '\r': outs("\\\""); state = CR; break;
      case '\t': outs("\\\""); out(sep); state = TAB; break;
      default: out('\\'); out(c); state = IN_FIELD; break;
      }
      break;
    }
  }

  if (ferror(f)) err(1, "%s", name);

  /* The input ended; finish whatever was in progress */
  switch (state) {
  case START: break;
  case TAB: if (crlf) outs("\r\n"); break;
  case CR: bare_cr(); break;
  case IN_FIELD: out('"'); if (crlf) outs("\r\n"); break;
  case BACKSLASH:		/* Literal backslash, then close the field */
    outs("\\\"");
    if (crlf) outs("\r\n");
    break;
  }
}


/* main -- parse the command line and convert each of the input files */
int main(int argc, char *argv[])
{
  FILE *f;
  int i, c;

  while ((c = getopt_long(argc, argv, ":chrs:v", longopts, NULL)) != -1) {
    switch (c) {
    case 'c': crlf = true; break;
    case 'h': help(argv[0]); break;
    case 'r': strict = true; crlf = true; break;
    case 's':
      /* The separator must be exactly one character */
      if (!*optarg || optarg[1]) usage(argv[0]);
      sep = *optarg;
      break;
    case 'v': version(argv[0]); break;
    default: usage(argv[0]);
    }
  }

  if (strict && sep != ',')
    errx(1, "--rfc4180 and --separator cannot be used together.");
  if (sep == '"' || sep == '\r' || sep == '\n')
    errx(1, "Cannot use this separator.");

  if (optind == argc) {
    process(stdin, "<stdin>");
  } else {
    for (i = optind; i < argc; i++) {
      if (argv[i][0] == '-' && !argv[i][1]) {
	process(stdin, "<stdin>");
      } else if (!(f = fopen(argv[i], "r"))) {
	err(1, "%s", argv[i]);
      } else {
	process(f, argv[i]);
	fclose(f);
      }
    }
  }

  /* Do not report success if the output could not be written */
  if (fflush(stdout) == EOF || ferror(stdout))
    errx(1, "Error writing to standard output");

  return 0;
}