/* $Id: url.l,v 1.3 1994/05/11 22:37:02 connolly Exp $ */
/* Lexical analyzer for URIs (Universal Resource Identifiers) */

%{
#include "y.tab.h"
#include <stdlib.h>
#include <string.h>
#include <assert.h>
/* Forward declaration: unquote() is defined in the user-code section at
 * the end of this file and is called from the WORD rule action. */
static char *unquote(const char*);

/* Character-level read / push-back hooks.  They are defined outside this
 * file; the empty parameter lists leave their arguments unspecified. */
extern int lex_input();
extern int lex_unput();

/* Drop any existing input()/unput() macros and route both through the
 * hooks declared above.
 * NOTE(review): this follows the classic lex convention of overridable
 * input()/unput() macros; flex reads its input through YY_INPUT instead,
 * so confirm which generator this scanner is built with. */
#undef input
#undef unput

#define input() lex_input()
#define unput(c) lex_unput(c)

/* The POSIX portable filename character set is
 * a-z, A-Z, 0-9, -, _, and .
 * names can't start with -
 */
%}


/* Earlier, stricter name-character set, kept for reference:
   NMCHAR	[-*\.0-9@A-Z_a-z] */

/* Name character: anything that is not reserved, national, punctuation,
   a space, or the escape character (percent sign).
   NOTE(review): because this is a negated class it also matches tab and
   newline -- confirm that is intended. */
NMCHAR	[^=;/#?: {}|\[\]\\^~<>%]

/* @@ plus and colon are context-sensitive... crud... I think
   regexps are the way to go. */

/* One hex digit of a %XX escape.  Every character matched here is handed
   to hex(), which calls abort() on anything it cannot decode, so this
   class must never be wider than the digits hex() accepts.  It used to be
   [0-9A-Z], which let an escape such as %GZ terminate the process. */
HEX	[0-9A-F]

%%

({NMCHAR}|%{HEX}{HEX})+	{ /* A run of name characters and %XX escapes forms one WORD.
			   * unquote() returns a newly allocated copy of the match
			   * with the escapes decoded; that copy is handed to the
			   * parser through yylval.string.
			   * NOTE(review): who frees the copy is not visible in this
			   * file -- confirm the parser releases it. */
			  yylval.string = unquote(yytext); return WORD; }
":"			{ /* each punctuation character below is returned as its own character code */ return ':'; }
";"			{ return ';'; }
"/"			{ return '/'; }
"="			{ return '='; }
"?"			{ return '?'; }
"#"			{ return '#'; }
.			{ /* any other single character: report it on stdout; no token is returned for it */ printf("bad char: '%c'\n", yytext[0]); }

%%

/*
 * Convert a single hexadecimal digit character to its numeric value.
 *
 * c: the digit character; '0'-'9', 'A'-'F' and 'a'-'f' are accepted.
 *
 * Returns the value 0..15.  Any other character is treated as a fatal
 * error and the process is terminated with abort().
 */
static int
hex_digit(int c)
{
  if(c >= '0' && c <= '9'){
    return c - '0';
  }else if(c >= 'A' && c <= 'F'){
    return c - 'A' + 10;
  }else if(c >= 'a' && c <= 'f'){
    return c - 'a' + 10;
  }

  /* Not a hex digit: callers are expected to have validated the input. */
  abort();
}

/*
 * Decode the two hex digit characters of a %XX escape into a byte value.
 *
 * c1: the high-order digit character.
 * c2: the low-order digit character.
 *
 * Returns a value in the range 0..255.  Upper- and lower-case digits are
 * treated alike.  If either argument is not a hex digit the process is
 * terminated with abort().
 */
static int
hex(int c1, int c2)
{
  return hex_digit(c1) * 16 + hex_digit(c2);
}

/*
 * Return a decoded copy of text in which every %XX escape has been
 * replaced by the single byte whose value is given by the hex digits XX.
 * All other characters are copied unchanged.
 *
 * text: NUL-terminated string to decode; must not be NULL.
 *
 * Returns a buffer obtained from malloc(); the caller owns it and should
 * release it with free().  The function never returns NULL: running out
 * of memory, or finding a '%' that is not followed by two characters, is
 * treated as fatal and ends the process with abort().  hex() likewise
 * aborts when the two characters are not hex digits.
 *
 * Note that an escape decoding to zero (%00) stores a NUL byte in the
 * middle of the result, so code that treats the result as a C string will
 * see it end there.
 */
static char*
unquote(const char *text)
{
  char *ret;

  assert(text);

  /* Each escape shrinks three characters to one, so the decoded form can
   * never be longer than the input. */
  ret = malloc(strlen(text) + 1);

  /* Checked explicitly rather than with assert() so that the check is
   * still present when the file is compiled with NDEBUG. */
  if(!ret){
    abort();
  }

  {
    const char *p = text;
    char *q = ret;

    while(*p){
      if(*p == '%'){
        /* Make sure both digits exist before reading them; otherwise
         * p[2] could lie beyond the terminating NUL. */
        if(!p[1] || !p[2]){
          abort();
        }
        *q++ = (char)hex(p[1], p[2]);
        p += 3;
      }else{
        *q++ = *p++;
      }
    }

    *q = 0;
  }

  return ret;
}