#!/usr/bin/ruby
#
# An RDFa parser in Ruby
#
# Ben Adida
# 2008-05-07
#
# Excuse the Ruby-beginner's lack of Ruby-ness. The point of using Ruby here is
# to use the HTML5 library in a new environment where I am forced to
# do a cleanroom implementation of the specification.
#
# I am sure I will look back on this code and cry.
#

require 'rubygems'
require 'html5'
require 'html5/treebuilders'
require 'uri'

# A URI reference that serializes in angle-bracket form.
class Resource
  # Accepts either (uri) or (relative_uri, base).
  #
  # With exactly one argument the value is stored untouched; with any other
  # argument count the first argument is resolved against the second using
  # URI.join.
  def initialize(*args)
    reference, base = args
    @uri = args.size == 1 ? reference : URI.join(base, reference)
  end

  # Returns the stored URI wrapped in "<" and ">".
  def to_s
    '<' + @uri.to_s + '>'
  end
end


# A compact URI ("prefix:reference") resolved against a prefix => namespace
# mapping. The reserved prefix "_" produces a blank node instead of a URI.
class CURIE
  attr_reader :mapping, :prefix, :suffix

  # curie        - the "prefix:reference" string
  # uri_mappings - Hash from prefix String to namespace URI String
  #
  # An unmapped prefix makes the namespace lookup return nil, so the
  # concatenation below raises NoMethodError.
  def initialize(curie, uri_mappings)
    # Split on the FIRST colon only: the reference part may itself contain
    # colons ("ex:a:b"), and a limit of 2 also keeps an empty reference
    # ("ex:") as "" instead of dropping it to nil.
    @prefix, @suffix = curie.split(":", 2)

    # consider the bnode situation
    if @prefix == "_"
      # we force a non-nil name, otherwise it generates a new name
      @bnode = BNode.new(@suffix || "")
    else
      @mapping = uri_mappings[@prefix]
      @uri = Resource.new(@mapping + @suffix)
    end
  end

  # Returns the BNode for a "_:" CURIE, otherwise the expanded Resource.
  def to_uri
    @bnode || @uri
  end

  # Returns the compact form: the blank node label, or "prefix:suffix".
  def to_curie
    if @bnode
      @bnode.to_s
    else
      "#{@prefix}:#{@suffix}"
    end
  end

  # String form of whatever to_uri returns.
  def to_s
    to_uri.to_s
  end
end

# The rdf:type predicate, resolved once at load time.
RDF_TYPE = CURIE.new('rdf:type', { 'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' })

# Bare keywords that link_type_to_curie accepts as reserved link types.
LINK_TYPES = %w[
  alternate appendix bookmark cite chapter contents copyright first glossary
  help icon index last license meta next p3pv1 prev role section stylesheet subsection
  start top up
]

# Default prefix mapping: the empty prefix expands into the XHTML vocabulary.
XH_MAPPING = { '' => 'http://www.w3.org/1999/xhtml/vocab#' }

# Maps a reserved @rel/@rev keyword onto a CURIE in the XHTML vocabulary.
# Returns nil when the value is not listed in LINK_TYPES.
def link_type_to_curie(value)
  return nil unless LINK_TYPES.include?(value)

  CURIE.new(":#{value}", XH_MAPPING)
end
  
# Parses a whitespace-separated list of CURIEs (and, optionally, reserved
# link types) into an array of CURIE objects.
#
# value           - the raw attribute string, or nil
# uri_mappings    - Hash from prefix to namespace URI, handed to CURIE.new
# with_link_types - when true, tokens without a colon are looked up with
#                   link_type_to_curie; when false such tokens are ignored
#
# Returns an Array (empty when value is nil or nothing was recognized).
def parse_curies(value, uri_mappings, with_link_types=false)
  curie_array = []
  return curie_array unless value

  value.split(' ').each do |token|
    if token.include?(":")
      curie_array << CURIE.new(token, uri_mappings)
    elsif with_link_types
      # Look up the individual token, not the whole attribute value;
      # otherwise any multi-valued @rel/@rev never matches a link type.
      link_type_curie = link_type_to_curie(token)
      curie_array << link_type_curie if link_type_curie
    end
  end

  curie_array
end

# An attribute value that is either a plain URI reference or a safe CURIE
# written in square brackets ("[prefix:reference]").
class URIorSafeCURIE
  # the URI or CURIE
  attr_reader :value

  # value        - the raw attribute string
  # uri_mappings - prefix mapping used when the value is a bracketed CURIE
  # base         - base URI used when the value is a plain URI reference
  def initialize(value, uri_mappings, base)
    @value =
      if value[0, 1] == '['
        # drop the first and last character (the brackets)
        CURIE.new(value[1, value.length - 2], uri_mappings)
      else
        Resource.new(value, base)
      end
  end

  # Returns the wrapped Resource itself, or asks the wrapped object for its
  # URI when it is anything else.
  def to_uri
    return @value if @value.instance_of?(Resource)

    @value.to_uri
  end
end

# A blank node. Ids come from one counter shared by all instances; nodes
# created with the same name share the same id.
class BNode
  @@count = 0
  @@mappings = {}

  # name - optional label; a falsy name always allocates a fresh id, while a
  #        given name reuses the id already recorded for it, if any.
  def initialize(name = nil)
    @id = name ? (@@mappings[name] ||= next_id) : next_id
  end

  # Returns the node label, "_:bn" followed by the numeric id.
  def to_s
    "_:bn" + @id.to_s
  end

  private

  # Hands out the current counter value and advances the counter.
  def next_id
    allocated = @@count
    @@count += 1
    allocated
  end
end

# An RDF literal: a lexical value with an optional datatype or language tag.
class Literal
  attr_reader :value, :type, :lang

  # value - the lexical form (String)
  # type  - datatype, rendered after "^^" via its to_s; nil for none
  # lang  - language tag, rendered after "@"; only used when type is nil
  def initialize(value, type, lang)
    @value = value
    @type = type
    @lang = lang
  end

  # Serializes the literal as a quoted string followed by "^^type" or
  # "@lang". A value containing a newline is wrapped in triple quotes.
  def to_s
    # Escape backslashes as well as double quotes: leaving a backslash bare
    # would make it read as the start of an escape sequence in the output.
    # The block form avoids gsub's own backslash handling in replacements.
    escaped = @value.gsub(/[\\"]/) { |char| "\\#{char}" }
    result = "\"#{escaped}\""

    # check if newline
    result = "\"\"#{result}\"\"" if result.include?("\n")

    if @type
      result += "^^#{@type}"
    elsif @lang
      result += "@#{@lang}"
    end
    result
  end

  # Returns the raw, unescaped value.
  def to_xml_s
    @value
  end
end

# A new RDFaParser MUST be instantiated for every parsed document: the
# document and its base URI are supplied only through the constructor.
# Call #parse on the instance to obtain the serialized triples.
class RDFaParser

  # The Recursive Baggage: the state handed from an element to its children
  # while the document is traversed.
  class EvaluationContext
    attr_accessor :base, :parent_subject, :parent_object,
                  :uri_mappings, :incomplete_triples, :language

    # base - the base URI of the document; also used as the initial
    #        parent subject
    def initialize(base)
      # Initialize the evaluation context, [5.1]
      @base = base
      @parent_subject = @base
      @parent_object = nil
      # Take a private copy so that in-place changes to this context's
      # mappings can never alter the shared XH_MAPPING constant.
      @uri_mappings = XH_MAPPING.dup
      @incomplete_triples = []
      @language = nil
    end

    # Invoked by clone/dup: gives the copy its own mapping hash and its own
    # incomplete-triple list instead of sharing them with the source.
    def initialize_copy(from)
      @uri_mappings = from.uri_mappings.clone
      @incomplete_triples = from.incomplete_triples.clone
    end
  end

  # Builds a parser for one document.
  #
  # file - the input handed to the HTML5 parser
  # base - the base URI to use for the document
  def initialize(file, base)
    @base = base

    # the HTML5 parser is asked to build an hpricot tree
    # http://code.whytheluckystiff.net/hpricot/
    tree_builder = HTML5::TreeBuilders['hpricot']
    @doc = HTML5::HTMLParser.new(:tree => tree_builder).parse(file)
  end
  
  # Processes the document and returns the collected triples as text: one
  # "subject predicate object ." line per triple, joined with newlines.
  def parse
    # every run starts from an empty triple store
    @triplestore = []

    parse_whole_document(@doc, @base)

    statements = @triplestore.map do |triple|
      triple.values_at(:s, :p, :o).map { |term| term.to_s }.push(".").join(" ")
    end
    statements.join("\n")
  end
  
  protected
  
  # Records one statement in the triple store; the object may be a literal,
  # a URI or a bnode.
  def add_triple(subject, predicate, object)
    statement = { :s => subject, :p => predicate, :o => object }
    @triplestore.push(statement)
  end
  
  # Parsing an RDFa document (this is *not* the recursive method).
  #
  # doc  - the parsed document; queried with #at and #root
  # base - the base URI to use unless the document overrides it
  #
  # A <base> element inside <head> replaces the given base, but only when it
  # actually carries an href; a missing <head>, a missing <base> or a <base>
  # without href all leave the caller's base untouched.
  def parse_whole_document(doc, base)
    # find if the document has a base element
    head = doc.at('head')
    base_el = head && head.at('base')
    href = base_el && base_el.attributes['href']
    base = href if href

    # initialize the evaluation context with the appropriate base
    evaluation_context = EvaluationContext.new(base)

    traverse(doc.root, evaluation_context)
  end
  
  # Extract the XMLNS mappings from an element.
  #
  # element - any object whose #attributes yields (name, value) pairs
  #
  # Returns a Hash from prefix to namespace URI built from the element's
  # "xmlns:<prefix>" attributes. The bare "xmlns" attribute (default
  # namespace) is ignored, as is any other attribute.
  def extract_mappings(element)
    mappings = {}

    element.attributes.each do |attr_name, attr_value|
      # Require the colon: comparing only the first five characters would
      # also accept names such as "xmlnsfoo" and invent a prefix from them.
      next unless attr_name[0, 6] == "xmlns:"

      prefix = attr_name[6, attr_name.length - 6]
      # "xmlns:" with nothing after the colon declares no prefix
      next if prefix.empty?

      mappings[prefix] = attr_value
    end

    mappings
  end

  # The recursive helper function: processes one element and then its
  # element children. The bracketed step numbers in the comments below refer
  # to the processing sequence of the RDFa specification this file implements.
  #
  # element            - the node being processed; read through #attributes,
  #                      #name, #children, #inner_text and #inner_html
  # evaluation_context - the EvaluationContext inherited from the parent
  #
  # Side effects: emits triples through add_triple, then calls itself on every
  # child whose class is Hpricot::Elem, unless an XML literal was produced
  # for this element (in which case the children are not visited).
  def traverse(element, evaluation_context)
    # local variables [5.5 Step 1]
    recurse = true
    skip = false
    new_subject = nil
    current_object_resource = nil
    # work on a copy so that merging this element's prefixes below does not
    # change the mappings held by the inherited context
    uri_mappings = evaluation_context.uri_mappings.clone
    incomplete_triples = []
    language = evaluation_context.language
    
    # shortcut
    attrs = element.attributes

    # SPEC CONFUSION: not sure what to initialize this value to
    current_object_literal = nil

    # XMLNS mappings [5.5 Step 2]
    uri_mappings.merge!(extract_mappings(element))
    
    # Language information [5.5 Step 3]
    # an xml:lang on this element overrides the inherited language
    language = attrs['xml:lang'] || language
    
    # rels and revs
    # (third argument true: bare reserved link types are accepted here too)
    rels = parse_curies(attrs['rel'], uri_mappings, true)
    revs = parse_curies(attrs['rev'], uri_mappings, true)
    # true when at least one @rel or @rev value was recognized
    valid_rel_or_rev = (rels and rels.length > 0) || (revs and revs.length > 0)
    
    if not valid_rel_or_rev
      # Establishing a new subject if no valid rel/rev [5.5 Step 4]
      # first attribute present wins, in the order about, src, resource, href
      if attrs['about']
        new_subject = URIorSafeCURIE.new(attrs['about'], uri_mappings, evaluation_context.base).to_uri
      elsif attrs['src']
        new_subject = Resource.new(attrs['src'], evaluation_context.base)
      elsif attrs['resource']
        new_subject = URIorSafeCURIE.new(attrs['resource'], uri_mappings, evaluation_context.base).to_uri
      elsif attrs['href']
        new_subject = Resource.new(attrs['href'], evaluation_context.base)
      end

      # SPEC CONFUSION: not sure what "If no URI is provided by a resource attribute" means, I assume 
      # it means that new_subject is still null
      if not new_subject
        if element.name == 'head' or element.name =='body'
          # head and body stand for the document itself ("" resolved against base)
          new_subject = Resource.new("", evaluation_context.base)
        elsif element.attributes['typeof']
          new_subject = BNode.new
        else
          # if it's null, it's null and nothing changes
          new_subject = evaluation_context.parent_object
          # an element with neither a subject of its own nor a @property is
          # skipped: its children reuse the inherited context (see Step 11)
          if not attrs['property']
	    skip = true
	  end
        end
      end

    else
      # Establish both new subject and current object resource [5.5 Step 5]
      
      # with a valid rel/rev only about and src can set the subject;
      # resource and href set the object further down
      if attrs['about']
        new_subject = URIorSafeCURIE.new(attrs['about'], uri_mappings, evaluation_context.base).to_uri
      elsif attrs['src']
        new_subject = Resource.new(attrs['src'], evaluation_context.base)
      end
      
      # If no URI is provided then the first match from the following rules will apply
      if not new_subject
        if element.name == 'head' or element.name =='body'
          new_subject = Resource.new("", evaluation_context.base)
        elsif element.attributes['typeof']
          new_subject = BNode.new
        else
          # if it's null, it's null and nothing changes
          new_subject = evaluation_context.parent_object
          # no skip flag set this time
        end
      end
      
      if attrs['resource']
        current_object_resource = URIorSafeCURIE.new(attrs['resource'], uri_mappings, evaluation_context.base).to_uri
      elsif attrs['href']
        current_object_resource = Resource.new(attrs['href'], evaluation_context.base)
      end

    end
    
    # Process @typeof if there is a subject [Step 6]
    if new_subject and attrs['typeof']
      # link types are not accepted in @typeof (third argument false)
      types = parse_curies(attrs['typeof'], uri_mappings, false)
      types.each do |one_type|
        add_triple(new_subject, RDF_TYPE, one_type.to_uri)
      end
    end
    
    # Generate triples with given object [Step 7]
    if current_object_resource
      rels.each do |rel|
        add_triple(new_subject, rel, current_object_resource)
      end
      
      # a @rev triple points the other way: object resource -> subject
      revs.each do |rev|
        add_triple(current_object_resource, rev, new_subject)
      end
    else
      # Incomplete triples and bnode creation [Step 8]
      # no explicit object: a fresh bnode becomes the object resource, and the
      # predicates are queued for completion while processing the children
      if valid_rel_or_rev
	current_object_resource = BNode.new
      end
      
      rels.each do |rel|
        # SPEC CONFUSION: we don't store the subject here?
        incomplete_triples << {:predicate => rel, :direction => :forward}
      end
      
      revs.each do |rev|
        # SPEC CONFUSION: we don't store the object here?
        incomplete_triples << {:predicate => rev, :direction => :reverse}
      end

    end
    
    # Establish current object literal [Step 9]
    if attrs['property']
      properties = parse_curies(attrs['property'], uri_mappings, false)

      # get the literal datatype
      type = attrs['datatype']
      # distinct classes of the child nodes, e.g. [Hpricot::Text] when the
      # element contains only text
      children_node_types = element.children.collect{|c| c.class}.uniq
      
      xml_literal = Resource.new("http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral")
      # the following 3 IF clauses should be mutually exclusive. Written as is to prevent extensive indentation.
      
      # SPEC CONFUSION: have to special case XML Literal, not clear right away.
      # SPEC CONFUSION: specify that the conditions are in order of priority
      if type and (type != '') and (CURIE.new(type,uri_mappings).to_uri.to_s != xml_literal.to_s)
        # typed literal
        # (@content, when present, takes precedence over the element text)
        current_object_literal = Literal.new(attrs['content'] || element.inner_text, CURIE.new(type, uri_mappings).to_uri, language)
      elsif attrs['content'] or (children_node_types == [Hpricot::Text]) or (element.children.length == 0) or (type == '')
        # plain literal
        current_object_literal = Literal.new(attrs['content'] || element.inner_text, nil, language)
      elsif (children_node_types != [Hpricot::Text]) and (type == nil or CURIE.new(type, uri_mappings).to_uri.to_s == xml_literal.to_s)
        # XML Literal
        # SPEC CONFUSION: what is the associativity of 'and' and 'or'?
        # SPEC CONFUSION: does it have to be "rdf:XMLLiteral", or can it be another prefix? Maybe write the whole URI.
        current_object_literal = Literal.new(element.inner_html, xml_literal, language)
        # the markup has been captured as the literal, so the children are
        # not processed
        recurse = false
      end
      
      # add each property
      # NOTE(review): there is no guard on new_subject here, so a nil subject
      # (no subject found and no parent object) still produces a triple.
      properties.each do |property|
        add_triple(new_subject, property, current_object_literal)
      end
      
      # SPEC CONFUSION: "the triple has been created" ==> there may be more than one
      # set the recurse flag above in the IF about xmlliteral, as it is the only place that can happen
    end
    
    # Complete the incomplete triples from the evaluation context [Step 10]
    # (triples queued by an ancestor are connected to this element's subject)
    if not skip and new_subject
      evaluation_context.incomplete_triples.each do |trip|
        if trip[:direction] == :forward
          add_triple(evaluation_context.parent_subject, trip[:predicate], new_subject)
        elsif trip[:direction] == :reverse
          add_triple(new_subject, trip[:predicate], evaluation_context.parent_subject)
        end
      end
    end
    
    # Create a new evaluation context and proceed recursively [Step 11]
    if recurse
      # SPEC CONFUSION: new evaluation context for each child? Probably not necessary,
      # but maybe needs to be pointed out?

      if skip
	# skipped element: children see the inherited context, updated only
	# with this element's language and prefix mappings
	new_ec = evaluation_context.clone
	new_ec.language = language
	new_ec.uri_mappings = uri_mappings
      else
   	# create a new evaluation context
      	new_ec = EvaluationContext.new(evaluation_context.base)
      	new_ec.parent_subject = new_subject || evaluation_context.parent_subject
      	new_ec.parent_object = current_object_resource || new_subject || evaluation_context.parent_subject
      	new_ec.uri_mappings = uri_mappings
      	new_ec.incomplete_triples = incomplete_triples
      	new_ec.language = language
      end
      
      # the same context object is shared by all children of this element
      element.children.each do |child|
        # recurse only if it's an element
        if child.class != Hpricot::Elem then
          next
        end

        self.traverse(child, new_ec)
        
      end
    end
  end
  
end

