#!/usr/bin/env ruby
	
require 'basicrdf'

# basic etag support
# see also ../robotexclusion/
# and other hacks for web harvesting

# Note/TODO: not every resource comes with an ETag (especially directory
# listings); that case still needs testing.

# A web document identified by a URI, fetched over HTTP with basic ETag
# support (conditional GET via the If-None-Match request header).
#
# Attributes:
#   uri      - the document URI (trailing newline removed on construction)
#   uri_hash - result of hashcodeIntFromString(uri); that helper is defined
#              outside this class (presumably by the 'basicrdf' library)
#   state    - neither read nor written by this class
#   headers  - '' until content has been fetched; afterwards the response
#              of the last fetch that returned content, indexable by header
#              name (e.g. headers['ETag'])
#   content  - body of the last fetch that returned content
#   fetched  - true if the last get returned fresh content, false if the
#              server answered 304 Not Modified
#   log      - human-readable trace of the most recent get
class WebDoc

  # ETag most recently *sent* for each URI (nil when none was sent).
  # Shared by all instances.
  @@etags={}

  USER_AGENT = 'RubyRDF;http://www.w3.org/2001/12/rubyrdf/intro.html'

  # Captures: optional scheme, authority (host[:port]), optional path.
  URI_PARTS = %r{([A-Za-z][A-Za-z0-9+.-]*)?://([^/]+)(/.*)?$}

  attr_accessor :uri, :uri_hash, :state, :headers, :content, :fetched, :log

  # Returns the fetched content, or '' when nothing has been fetched yet
  # (to_s must always return a String).
  def to_s
    @content.to_s
  end

  # u - the URI of the document; a trailing newline is stripped.
  def initialize(u)
    @log=''
    @uri=u.chomp
    @headers=''
    @uri_hash=hashcodeIntFromString @uri
  end

  # Conditionally get the document, using the last known ETag unless one
  # is explicitly supplied in opts ('etag' or :etag).
  #
  # The caller's hash is not modified.  Returns whatever #get returns.
  def cget(opts={})
    request = opts.dup
    unless request['etag'] || request[:etag]
      # headers is '' before the first fetch; ''['ETag'] is nil, which
      # simply means "no conditional header".
      request['etag'] = headers && headers['ETag']
    end
    get(request)
  end

  # Fetch the document with an HTTP GET.
  #
  # opts - optional hash; 'etag' (or :etag) is sent as If-None-Match.
  #
  # Returns the response body on a 2xx response (also stored in #content,
  # with #headers updated and #fetched set to true).  Returns '' on a
  # 304 Not Modified; #content and #headers are then left untouched so the
  # known ETag survives, and #fetched is set to false.
  #
  # Raises RuntimeError if the URI has no recognisable host, if the request
  # fails, or if the server answers with any other status.
  def get(opts={})
    # FIXME: we don't use base_uri yet.
    etag = opts['etag'] || opts[:etag]
    trace = ["URI is #{uri}"]
    trace << "Sending etag: #{etag}" if etag
    trace << "uri hash: #{uri_hash}" if uri_hash
    @@etags[uri]=etag

    scheme, host, port, path = split_uri
    #todo: gz
    request_headers = { 'User-agent' => USER_AGENT }
    request_headers['If-None-Match']=etag if etag
    trace << "Sending headers: #{request_headers.inspect}"

    begin
      resp = http_get(host, port, path, request_headers, scheme == 'https')
    rescue StandardError => e
      raise "rdfget: HTTP GET on '#{uri}'.  error:#{e}"
    end
    # The response object itself carries the headers (the old
    # resp.header accessor just returns the response).
    trace << "HEAD: #{resp.inspect} "

    code = resp.code.to_s
    data = ''
    if code == '304'
      trace << 'Got a 304 (from ETag?)'
      @fetched=false
    elsif code.start_with?('2')
      data = resp.body.to_s
      @headers=resp
      @content=data
      @fetched=true
    else
      # Current Net::HTTP does not raise on error statuses; do it here so
      # callers still see a failure as an exception.
      raise "rdfget: HTTP GET on '#{uri}'.  error:#{code} #{resp.message}"
    end
    data
  ensure
    # Keep the trace of this request available through #log, even when
    # the request failed.
    @log = trace.join("\n") if trace
  end

  private

  # Split #uri into [scheme, host, port, path].  The scheme is lower-cased
  # ('' when absent), the port defaults to 443 for https and 80 otherwise,
  # and an empty path becomes '/'.
  def split_uri
    parts = URI_PARTS.match(uri.to_s)
    raise "rdfget: cannot find a host in URI '#{uri}'" unless parts

    scheme = parts[1].to_s.downcase
    host = parts[2]
    port = nil
    if (with_port = /\A(.+?):(\d+)\z/.match(host))
      host = with_port[1]
      port = with_port[2].to_i
    end
    port ||= (scheme == 'https' ? 443 : 80)
    path = parts[3].to_s
    path = '/' if path.empty?
    [scheme, host, port, path]
  end

  # Perform the actual network request and return the response object.
  def http_get(host, port, path, request_headers, use_ssl)
    require 'net/http'
    http = Net::HTTP.new(host, port)
    http.use_ssl = true if use_ssl
    http.open_timeout = 10
    http.read_timeout = 60
    http.get(path, request_headers)
  end
end

####################

# Self-test / demo of conditional GET: fetch once, repeat with the ETag
# that came back (the server should answer 304), then send a bogus ETag
# (the server should return the content again).
#
# Guarded so the network requests only happen when this file is executed
# directly, not when it is loaded with require.
if __FILE__ == $0
  page='http://www.w3.org/2001/12/rubyrdf/pack/tests/robotexclusion/t5.txt'
  doc=WebDoc.new(page)

  puts "1st get (should fetch):"
  doc.cget
  puts "Fetched: #{doc.fetched}"

  puts "2nd get (shouldn't fetch; etag is #{doc.headers['ETag']}):"
  c=doc.cget
  puts "Fetched: #{doc.fetched}"
  puts "Content was: '#{c}'"

  puts "3rd get (should fetch; etag is #{doc.headers['ETag']}):"
  doc.cget('etag'=>'bogus')
  puts "Fetched: #{doc.fetched}"
end

