# RubyRDF / RDF Parser. Based on code contributed from
# RDF4R RDF Parser, Copyright © 2002 Brandt Kurowski (brandt@kurowski.net)
# packaged as part of RubyRDF, see http://www.w3.org/2001/12/rubyrdf/intro.html
# All Rights Reserved. This work is distributed under the W3C® Software 
# License [1] in the hope that it will be useful, but WITHOUT ANY 
# WARRANTY; without even the implied warranty of MERCHANTABILITY or 
# FITNESS FOR A PARTICULAR PURPOSE. 
# [1] http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231

require 'uri'

# An RDF/XML parser
# $Id: RDFParser.rb,v 1.15 2003/04/06 16:48:37 danbri Exp $
# maintainer: danbri@w3.org
# public bugs/suggestions: www-archive+rubyrdf@w3.org
# see http://www.w3.org/2001/12/rubyrdf/pack/intro.html
#
# notes:
# this is the RDF4R Expat-based RDF parser, but with the most obvious 
# expatisms removed, so we don't depend on expat, saxdriver etc.
# in fact I changed very little, except the subclassing
# which used to be "class XMLParser < XML::Parser", and we now no 
# longer require 'saxdriver'. 
#
# We subclass this class as REXRDFParser, and 
# mix in this functionality to XMLParser, which subclasses the Expat lib.
# 
# There may be a better way to do this.
# --danbri


# XML name: will this regex do the trick?
# /^[A-Za-z_:][\w\.\-:]*$/

module RDF4R
  module Driver

    module RDFParser

      # An XML Element #################################################
      #
      # One element of the document being parsed. The constructor splits the
      # element name into namespace and local name (via the parser's
      # parse_name) and then sorts the attributes into
      #   * xml:lang and the "default namespace is RDF" flag (handleXMLattrs)
      #   * the RDF syntax attributes about / ID / resource / nodeID /
      #     bagID / parseType (handleRDFattrs)
      #   * everything else, kept in #attributes keyed by namespace + localname.
      #
      # #resource, #property, #li_count and #shortcircuit are written later by
      # the parser as it decides what role the element plays.
      class Element

        attr_accessor :resource, :property, :li_count
        attr_accessor :shortcircuit
        # NOTE: :prefix is exposed but nothing in this class assigns it.
        attr_reader :parent, :attributes, :ns, :localname, :prefix, :name
        attr_reader :about, :id, :parseType, :in_rdf, :xml_lang, :rdf_resource
        attr_accessor :nodeid
        #fixme: danbri: hmm why is :id read only? nodeid should be same?

        # parser - object answering parse_name, base_uri, ids and consumer
        # parent - the enclosing Element, or nil for the outermost element
        # name   - element name in the form parse_name understands
        # attrs  - attribute name => value pairs (anything whose #each
        #          yields such pairs)
        #
        # Raises RuntimeError if the RDF attributes are malformed or mutually
        # exclusive (see handleRDFattrs).
        def initialize(parser, parent, name, attrs)
          @parser = parser
          @parent = parent
          #STDERR.puts "ELEMENT init: name=#{name}"
          @ns, @localname = @parser.parse_name(name)
          @name = @ns + @localname
          @attributes = {}
          @li_count = 0
          @shortcircuit = false
          handleXMLattrs(attrs)
          handleRDFattrs(attrs)
        end

        # Picks up xml:lang and works out whether RDF is the default
        # namespace here, either because this element declares it or because
        # the parent element already had it.
        def handleXMLattrs(attrs)
          attrs.each do |att_name, att_value|
            att_ns, att_ln = @parser.parse_name(att_name)
            if att_ln == 'xmlns' and att_value == RDF_NS
              # we need to know if the default namespace is
              # rdf, because RDF M&S 1.0 relies on unprefixed attributes
              @in_rdf = true
            elsif att_ns == XML_NS and att_ln == 'lang'
              @xml_lang = att_value
            end
          end
          if @in_rdf.nil?
            @in_rdf = @parent.nil? ? false : @parent.in_rdf
          end
        end

        # from #rdfig
        # eikeon> danbri: depends if you want to handle unicode...
        # here is what rdflib uses: http://rdflib.net/cvs/rdflib/rdflib/syntax/xml_names.py?rev=1.1&content-type=text/vnd.viewcvs-markup
        #
        # Validates the value of rdf:ID, rdf:nodeID and rdf:bagID.
        # Returns true for an acceptable name, raises RuntimeError otherwise.
        #
        # The pattern is anchored with \A and \z so the *whole* value has to
        # match; with ^ and $ a value whose first line merely looked like a
        # name (e.g. "ok\n<bad>") was accepted.
        def check_name(name)
          return true if name =~ /\A[A-Za-z_][\w\.\-]*\z/
          #return true if name =~ /^[A-Za-z_:][\w\.\-:]*$/
          raise("name '#{name}' didn't match XML name syntax constraints")
        end # for things like value of rdf:ID and rdf:bagID, also element names
            # FIXME: status of ':' unclear. maybe we need two different checks

        # Interprets the RDF syntax attributes. An attribute counts as RDF
        # syntax when it is in the RDF namespace, or when RDF is the default
        # namespace (@in_rdf); anything unrecognised is stored in @attributes.
        #
        # Raises RuntimeError for a repeated rdf:ID, an invalid name, an empty
        # rdf:resource with an empty base, the unimplemented aboutEach /
        # aboutEachPrefix, or a forbidden combination of about / ID / nodeID /
        # resource.
        def handleRDFattrs(attrs)
          attrs.each do |att_name, att_value|
            att_ns, att_ln = @parser.parse_name(att_name)
            if att_ns == RDF_NS or @in_rdf
              case att_ln
              when 'about'
                # FIXME only in 'resource' state
                begin
                  @about = @parser.base_uri + att_value
                rescue URI::InvalidURIError
                  # The value is not something the URI library will parse
                  # (typically non-ASCII characters). Keep it verbatim:
                  # test: http://www.w3.org/2000/10/rdf-tests/rdfcore/rdfms-difference-between-ID-and-about/test3.rdf
                  # "Non-ASCII characters in URIs are not converted."
                  # Rescuing by class rather than by message text keeps this
                  # working whatever wording the URI library uses.
                  @about = att_value
                rescue StandardError => e
                  raise "Parser error #{e} parser base uri was: '#{@parser.base_uri}'"
                end

              when 'ID'
                check_name att_value
                # FIXME only in 'resource' state
                if @parser.ids.include? att_value
                  raise "same ID (#{att_value}) cannot be used more than once"
                end
                # FIXME: xmlbase context should be noticed at this point
                @id = att_value
                @parser.ids << @id
                @parser.consumer.notify_id(@id)

              when 'resource'
                # FIXME only in 'property' state
                r = URI.parse(att_value)
                #STDERR.puts("Testin URI att_value=#{att_value} rel: #{r} base: #{@parser.base_uri} ")

                if !(att_value=='' && @parser.base_uri=='')
                  @rdf_resource = r.relative? ? @parser.base_uri + r : r
                else
                  # we got an empty resource=  and have no base uri
                  raise "(#{att_ln}) / parser base: '#{@parser.base_uri}' seems we got a resource='' empty and no base uri is set."
                end

              when 'nodeID'
                check_name att_value
                @nodeid = att_value
              when 'aboutEach'
                raise "aboutEach not implemented"
              when 'aboutEachPrefix'
                raise "aboutEachPrefix not implemented"
              when 'bagID'
                check_name att_value
                # validated and stored, but not used further in this class
                @bag_id = att_value
              when 'parseType'
                @parseType = att_value
              else
                @attributes[att_ns + att_ln] = att_value
              end
            else
              @attributes[att_ns + att_ln] = att_value
            end
          end

          # Mutually exclusive attribute combinations.
          unless @id.nil? or @about.nil?
            raise "can't have both 'ID' and 'about' attributes"
          end
          unless @nodeid.nil? or @about.nil?
            raise "can't have both 'nodeID' and 'about' attributes"
          end
          unless @nodeid.nil? or @id.nil?
            raise "can't have both 'nodeID' and 'ID' attributes"
          end
          # The rdf:resource attribute lives in @rdf_resource; @resource is
          # the node assigned later by the parser and is still nil here.
          unless @nodeid.nil? or @rdf_resource.nil?
            raise "Can't have both 'nodeID' and 'resource' attributes"
          end
        end
      end #Element
    
      ############################################################

      # RDFParser module definition follows (constants, state and SAX handlers):

      # Convenience entry point: parse +source+ as RDF/XML against +base+
      # and return whatever +consumer.models+ yields afterwards.
      #
      # source   - handed unchanged to the parser's #parse
      # base     - base URI; a URI::Generic is used as is, anything else
      #            goes through URI.parse
      # consumer - receiver of the parse events
      #
      # On RDFParserError the location of the failure is written to $stderr
      # and the same exception is re-raised.
      #
      # NOTE(review): RDFParser is declared as a module in this file, so
      # RDFParser.new, #parse and RDFParserError have to come from elsewhere
      # -- confirm against the concrete parser classes.
      def RDFParser.process(source, base, consumer)
        parser = RDFParser.new(nil, ';')
        parser.base_uri = base.kind_of?(URI::Generic) ? base : URI.parse(base)
        parser.consumer = consumer
        begin
          parser.parse(source)
        rescue RDFParserError
          $stderr.puts %Q{in #{source.path}} if source.kind_of? File
          # Only report a line number when the source has one (a String does
          # not); failing here would replace the parser error with a
          # NoMethodError.
          $stderr.puts %Q{at #{source.lineno}} if source.respond_to?(:lineno)
          raise
        end
        parser.consumer.models
      end

      # Namespace URIs, and the RDF vocabulary names the parser compares
      # element names against.
      XML_NS = 'http://www.w3.org/XML/1998/namespace'
      RDF_NS = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
      RDF_DESCRIPTION = "#{RDF_NS}Description"
      RDF_TYPE = "#{RDF_NS}type"
      RDF_LI = "#{RDF_NS}li"
      # Elements with one of these names open the 'container' state.
      CONTAINER_NAMES = %w[Seq Bag Alt].map { |term| "#{RDF_NS}#{term}" }

      # Parser state that Element and client code read or set.
      attr_accessor :ns,                            # added by danbri; read by #transition
                    :nodeids,                       # added by danbri; rdf:nodeID label => resource
                    :consumer,                      # receives start_model / statement / ... calls
                    :base_uri,                      # base for resolving rdf:about, rdf:ID, rdf:resource
                    :we_want_models_for_everything, # flag consulted by #transition in 'ready' state
                    :ids                            # rdf:ID values seen so far

      # Sets up an idle parser: nothing open, no IDs or node labels seen,
      # state stack holding just 'ready'. Both arguments are accepted and
      # ignored by this method.
      def initialize(encoding = nil, nssep = nil)
        @we_want_models_for_everything = false
        @states = %w[ready]
        @nodeids = Hash.new
        @ids = Array.new
        @elements = Array.new
      end

      # Returns the consumer resource that +element+ denotes. Checked in
      # this order:
      #   rdf:about  -> resource for that URI
      #   rdf:ID     -> resource for base URI + '#' + id
      #   rdf:nodeID -> one anonymous resource per label, remembered in
      #                 @nodeids (incomplete rdf:nodeID support)
      #   otherwise  -> a fresh anonymous resource
      def get_resource(element)
        if !element.about.nil?
          @consumer.resource(element.about)
        elsif !element.id.nil?
          @consumer.resource(@base_uri + ('#' + element.id))
        elsif element.nodeid.nil?
          @consumer.resource
        else
          label = element.nodeid
          @nodeids[label] = @consumer.resource if @nodeids[label].nil?
          @nodeids[label]
        end
      end


      # Works out which state +element+ opens, based on the state currently
      # on top of @states, and performs the bookkeeping for it: starting a
      # model, assigning element.resource or element.property, and emitting
      # the statement that links a nested node to its enclosing property.
      #
      # Returns the new state name ('ready', 'rdf', 'resource', 'container'
      # or 'property'). The state is not pushed here.
      #
      # Raises RuntimeError for an unexpected rdf:* element outside rdf:RDF,
      # for an element inside literal content, and for an unknown state.
      def transition(element)

        case @states.last

        when 'ready'
          # we're outside any RDF model
          if element.ns == RDF_NS or element.in_rdf
            if element.localname == 'RDF'
              # it's rdf:RDF
              @consumer.start_model
              @consumer.notify_base(@base_uri)
              new_state = 'rdf'
            elsif element.localname == 'Description'
              # it's rdf:Description outside of rdf:RDF, so we create
              # a new model and jump right into describing the resource.
              # The node is stored on the element, where the 'resource'
              # state expects to find it for the child properties.
              @consumer.start_model
              @consumer.notify_base(@base_uri)
              element.resource = get_resource(element)
              new_state = 'resource'
            else
              # it's rdf:* outside of rdf:RDF: an error, or a model?
              raise %Q{i'm confused by rdf:#{element.localname}}
            end
          elsif ns and ns.length > 0 and @we_want_models_for_everything
            # NOTE(review): +ns+ is the parser's own accessor here, not
            # element.ns -- the comment below suggests the element's
            # namespace was meant. Left unchanged; confirm before relying
            # on we_want_models_for_everything.
            # it's *:* outside of rdf:RDF: a model!
            new_state = 'resource'
            @consumer.start_model
            @consumer.notify_base(@base_uri)
            element.resource = get_resource(element)
            raise "do you really want me to handle this?"
          else
            # it's * outside of rdf:RDF: stay in ready state.
            new_state = 'ready'
          end

        when 'rdf'
          # any element in rdf:RDF is a resource
          if CONTAINER_NAMES.include?(element.name)
            new_state = 'container'
          else
            new_state = 'resource'
          end
          element.resource = get_resource(element)

        when 'container'
          # Children of a container are properties. rdf:li is numbered
          # rdf:_1, rdf:_2, ... in document order; anything else keeps its
          # own name, because containers match typed nodes and so can have
          # arbitrary properties (including explicit rdf:_n). See
          # http://www.w3.org/2000/10/rdf-tests/rdfcore/rdf-containers-syntax-vs-schema/test006.rdf
          new_state = 'property'
          if element.name == RDF_LI
            element.parent.li_count += 1
            element.property = @consumer.resource(RDF_NS + '_' + element.parent.li_count.to_s)
          else
            element.property = @consumer.resource(element.name)
          end

        when 'resource'
          # any element in a resource is a property
          element.property = @consumer.resource(element.name)
          new_state = 'property'

        when 'property'
          # any element in a property is a resource
          if CONTAINER_NAMES.include?(element.name)
            new_state = 'container'
          else
            new_state = 'resource'
          end
          element.resource = get_resource(element)
          # link it to the node that owns the enclosing property
          @consumer.statement(
            element.parent.parent.resource,
            element.parent.property,
            element.resource
          )

        when 'literal'
          raise %Q{can't have element (#{element.name}) in literal}
        else
          raise %Q{internal error: unanticipated state (#{@states.last})}
        end
        return new_state

      end

      # XML SAX functions:

      # SAX start-tag handler. Builds an Element for the tag, asks
      # #transition which state it opens, emits the statements the tag
      # implies, then pushes the element and its state.
      #
      # name  - element name in the form parse_name understands
      # attrs - attribute name => value pairs
      #
      # Statements emitted, in order:
      #   * rdf:type for a node element other than rdf:Description
      #   * parent --property--> rdf:resource target
      #   * parent --property--> rdf:nodeID node
      #   * parent --property--> new node, for parseType="Resource"
      #   * one literal statement per non-RDF attribute ("abbreviated syntax")
      #
      # Raises RuntimeError for parseType="Literal" (not supported) and
      # whatever Element.new or #transition raise.
      def startElement(name, attrs)
        # if we just entered an element when we thought we had a
        # literal, but all we've collected is whitespace, then we
        # ignore it as insignificant whitespace between elements.
        # \A and \z make this a test of the whole string; with ^ and $ any
        # text containing one blank line counted as "only whitespace" and
        # was dropped without a word.
        @states.pop if @states.last == 'literal' and @string =~ /\A\s*\z/
        element = Element.new(self, @elements.last, name, attrs)
        new_state = transition(element)

        # if we're entering a new resource, we state its type
        if new_state == 'resource' or new_state == 'container'
          @consumer.statement(
            element.resource,
            @consumer.resource(RDF_TYPE),
            @consumer.resource(element.name)
          ) unless element.name == RDF_DESCRIPTION
          # rdf:Description is used for untyped nodes
        end

        if new_state == 'property' and not element.rdf_resource.nil?
          # handle rdf:resource on a property
          element.resource = @consumer.resource(element.rdf_resource)
          @consumer.statement(
            element.parent.resource,
            element.property,
            element.resource
          )
        end
        # FIXME look at RDF section 6 for handing property+idAttr+parseType crap

        # Handle rdf:nodeID on a property
        if new_state == 'property' and not element.nodeid.nil?
          element.resource = get_resource(element)
          @consumer.statement(
            element.parent.resource,
            element.property,
            element.resource
          )
        end

        if element.parseType == 'Resource'
          # the property element doubles as a node: its children are
          # properties of a new resource
          element.resource = get_resource(element)
          @consumer.statement(
            element.parent.resource,
            element.property,
            element.resource
          )
          element.shortcircuit = true
          new_state = 'resource'
        elsif element.parseType == 'Literal'
          raise %Q{sorry, i don't do parseType='Literal' yet.}
        end

        if new_state == 'resource' or (new_state == 'property' and not element.resource.nil?)
          # take care of "abbreviated syntax"
          element.attributes.each do |attr_name, attr_value|
            @consumer.statement(
              element.resource,
              @consumer.resource(attr_name),
              @consumer.literal(attr_value)
            )
          end
        end
        @elements.push element
        @states.push new_state
      end

      # SAX end-tag handler. Pops the element and the state on top of the
      # stacks and finishes whatever that state was collecting:
      #   'literal' - emit the collected text as a literal statement, then
      #               pop once more to drop the property state underneath
      #   'rdf'     - tell the consumer the model is complete
      #   'resource', 'container', 'property', 'ready' - nothing to do
      # Any other state raises RuntimeError.
      def endElement(name)
        closing = @elements.pop
        finished = @states.pop
        #puts "State: #{finished}" #debug
        if finished == 'literal'
          unless @string.nil?
            # with parseType="Resource" the property element is itself the subject
            subject = closing.shortcircuit ? closing.resource : closing.parent.resource
            predicate = closing.property
            object = @consumer.literal(@string, closing.xml_lang)
            @consumer.statement(subject, predicate, object)
          end
          @states.pop # get rid of element state ('literal' state is an "extra")
        elsif finished == 'rdf'
          @consumer.end_model
        elsif !%w[resource container property ready].include?(finished)
          raise %Q{ending element in unknown state "#{finished}"!}
        end
      end

      # SAX character-data handler.
      #   in 'literal'  - append to the text collected so far
      #   in 'property' - start collecting: push the extra 'literal' state
      #   in 'ready'    - outside RDF, ignore
      #   elsewhere     - whitespace is ignored, anything else raises
      #                   RuntimeError
      def character(text)
        current = @states.last
        if current == 'literal'
          @string += text
        elsif current == 'property'
          @states.push 'literal'
          @string = text
        elsif current == 'ready'
          nil
        elsif text.strip.length > 0
          raise %Q{where am i? "#{text.strip}" in #{current}}
        end
      end

      # Splits a name of the form "namespace;localname" at the first ';'.
      # Returns [namespace, localname]; the namespace is "" when the name
      # has no ';' part, and the localname is nil when nothing matches
      # (e.g. an empty name).
      def parse_name(name)
        parts = /^(([^;]*);)?(.+)$/.match(name)
        return ['', nil] if parts.nil?
        [parts[2] || '', parts[3]]
      end
    end
  end
end

