#!/usr/bin/env ruby
#
# $Id: scutter.rb,v 1.6 2003/04/20 23:15:53 danbri Exp $
# author: Dan Brickley <danbri@w3.org>
# 
# This is a basic RDF harvester that traverses rdfs:seeAlso links
# and calls application-specific handlers before and after it 
# retrieves each RDF document. It comes with some demo code blocks that
# collect photo information, look out for mentions of airports, and 
# generate basic logging records.
#
# TODO list:
#
# some possible starting points:
#   http://rdfweb.org/people/danbri/rdfweb/danbri-foaf.rdf
#   http://www.perceive.net/xml/googlescutter.rdf
#   http://www.perceive.net/xml/googlescutterNoChatlogs.rdf
#
# nearby:
#   wordnet/ruby hacking, 
#   http://fireball.danbri.org/people/danbri/2002/07/xmlns-wordnet/dantest.rb

require 'basicrdf'

module RDF4R
  # A basic RDF harvester: it fetches an RDF document, collects the objects of
  # its rdfs:seeAlso statements, and keeps fetching the linked documents until
  # no unvisited URI is left.
  #
  # Application code hooks in through three lists of callables:
  #   inithandlers  - called as handler.call(scutter) before each fetch
  #   pagehandlers  - called as handler.call(scutter, page) after a non-empty
  #                   page has been fetched and its links queued
  #   errorhandlers - called as handler.call(message) when a fetch fails; the
  #                   list is also handed to Loader.get_rdf_from_uri
  class SimpleScutter
    # Namespace prefix of the RDF Schema vocabulary (rdfs:seeAlso lives here).
    RDFS_NS = 'http://www.w3.org/2000/01/rdf-schema#'.freeze

    attr_accessor :start, :seen, :seealso, :out, :seenpic, :debug,
                  :uri, :outfile, :left, :pagehandlers, :inithandlers, :errorhandlers

    # start_uri - first URI to fetch; nil or '' means "queue nothing", more
    #             URIs can be queued by pushing onto #left before calling #run.
    # outfile   - output filename; only stored here, this class does not
    #             write to it itself.
    def initialize(start_uri='', outfile="_report.html")
      @left = []                 # stack of URIs still to be fetched
      @uri = start_uri           # todo: should allow for a list?
      # A nil start URI would later be popped as '' and fetched, so it is
      # filtered out together with the empty string.
      @left.push(@uri) unless @uri.nil? || @uri == ''
      @debug = true
      @pagehandlers = []
      @inithandlers = []
      @errorhandlers = []
      @seen = Hash.new(0)        # counter for whether a rdf uri has been seen
      @seealso = Hash.new(0)     # all the seealso uris we've seen, counted
      @outfile = outfile         # output filename
    end

    # Crawl until the queue (#left) is empty. URIs are taken from the end of
    # the queue, so the most recently discovered link is fetched first.
    def run
      until left.empty?
        @uri = left.pop.to_s
        inithandlers.each { |handler| handler.call(self) }

        seen[uri] += 1           # increment per-uri encounter
        page = fetch_page(uri)
        # skip to next URI on failure (already reported) or empty graph
        next if page.nil? || page.size == 0

        queue_see_also(page)
        pagehandlers.each { |handler| handler.call(self, page) }
      end
      puts "RDF crawl complete. Exiting!" if @debug
    end

    private

    # Fetch the RDF at +target+. Returns the loaded page, or nil after
    # reporting the failure to every errorhandler.
    def fetch_page(target)
      # note api change: we don't pass in errorhandlers any more
      # in rubyrdf. maybe we should?
      page = Loader.get_rdf_from_uri(target, target, errorhandlers)
      # Message format kept from the original; $! is normally nil here, so
      # the text usually starts with a blank.
      raise "#{$!} (rdfget returned nil)" if page.nil?
      page
    rescue
      err_msg = "FAILED URI: '#{target}' type: #{target.class} MSG: #{$!}"
      errorhandlers.each { |handler| handler.call err_msg }
      nil
    end

    # Count every not-yet-visited rdfs:seeAlso target of +page+ in #seealso
    # and queue it once. URIs already waiting in #left are kept, including
    # ones that were queued by the caller rather than discovered via seeAlso.
    def queue_see_also(page)
      links = page.ask(Statement.new(nil, RDFS_NS + 'seeAlso', nil))
      links.objects.each do |object|
        next if object.nil?      # must be tested before to_s turns nil into ''
        link = object.to_s
        next if link.empty? || seen[link] != 0
        seealso[link] += 1
        left.push(link) unless left.include?(link)
      end
      # Drop queued URIs that have been visited in the meantime so that no
      # document is fetched again.
      left.delete_if { |queued| seen[queued.to_s] != 0 }
    end
  end
end

