/* $Id: url.l,v 1.3 1994/05/11 22:37:02 connolly Exp $ */
/* Lexical analyzer for URIs (Universal Resource Identifiers) */

%{
#include "y.tab.h"

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *unquote(const char *text);

/* Character I/O is supplied by the host program instead of lex's defaults. */
extern int lex_input();
extern int lex_unput();

#undef input
#undef unput
#define input() lex_input()
#define unput(c) lex_unput(c)

/* The POSIX portable filename character set is
 *   a-z, A-Z, 0-9, -, _, and .
 * names can't start with -
 */
%}

/* NMCHAR	[-*\.0-9@A-Z_a-z] */

/* not reserved, national, or punctuation, or escape char */
/* NOTE(review): a negated class also matches newline in most lex
 * implementations, so line breaks end up inside WORD tokens -- confirm
 * that this is intended. */
NMCHAR	[^=;/#?: {}|\[\]\\^~<>%]

/* @@ plus and colon are context-sensitive... crud...
 * I think regexps are the way to go. */

/* Hex digits of a %XX escape, in either case.  The class must stay in sync
 * with hex_digit() below: anything matched here is handed to hex(), which
 * aborts on a character it cannot decode. */
HEX	[0-9A-Fa-f]

%%

({NMCHAR}|%{HEX}{HEX})+	{
			  /* yylval.string is a fresh heap copy with escapes
			   * decoded; whoever consumes the token owns it. */
			  yylval.string = unquote(yytext);
			  return WORD;
			}

":"			{ return ':'; }
";"			{ return ';'; }
"/"			{ return '/'; }
"="			{ return '='; }
"?"			{ return '?'; }
"#"			{ return '#'; }

.			{ printf("bad char: '%c'\n", yytext[0]); }

%%

/* Value of a single hexadecimal digit (upper or lower case),
 * or -1 when c is not a hexadecimal digit. */
static int
hex_digit(int c)
{
  if (c >= '0' && c <= '9') {
    return c - '0';
  }
  if (c >= 'A' && c <= 'F') {
    return c - 'A' + 10;
  }
  if (c >= 'a' && c <= 'f') {
    return c - 'a' + 10;
  }
  return -1;
}

/* Combine two hexadecimal digit characters into the byte value they encode;
 * c1 is the high nibble, c2 the low nibble.  Aborts if either character is
 * not a hexadecimal digit. */
static int
hex(int c1, int c2)
{
  int high = hex_digit(c1);
  int low = hex_digit(c2);

  if (high < 0 || low < 0) {
    abort();
  }
  return high * 16 + low;
}

/* Return a newly allocated copy of text in which every %XX escape has been
 * replaced by the byte it encodes.  The result comes from strdup(), so it is
 * released with free().
 *
 * Decoding never lengthens the string (three input characters become one
 * output byte), so a buffer the size of the input is always large enough.
 * An escape of %00 stores a NUL byte, which makes the result look shorter
 * to any code that treats it as a C string.
 *
 * Aborts if memory cannot be allocated or an escape is malformed. */
static char *
unquote(const char *text)
{
  char *ret;
  const char *src;
  char *dst;

  assert(text);

  ret = strdup(text);
  if (ret == NULL) {
    /* Explicit check so the failure is still caught when NDEBUG is set. */
    abort();
  }

  src = text;
  dst = ret;
  while (*src) {
    if (*src == '%') {
      /* The scanner rule only accepts '%' followed by two HEX characters. */
      assert(src[1] && src[2]);
      *dst++ = (char) hex(src[1], src[2]);
      src += 3;
    } else {
      *dst++ = *src++;
    }
  }
  *dst = 0;

  return ret;
}