#!/usr/bin/env ruby

require 'net/http'
require 'basicrdf'

# Basic ETag support: a tiny wrapper around an HTTP resource that remembers
# the response headers of its last successful fetch so that later requests
# can be made conditional (If-None-Match).
#
# See also ../robotexclusion/ and other hacks for web harvesting.
#
# NOTE/TODO: not all resources come with an ETag (esp. directory listings);
# when none is known, cget simply performs an unconditional GET.
class WebDoc
  # ETag most recently *sent* for each URI (nil for an unconditional GET).
  # Nothing in this file reads it back; kept in case other code does.
  @@etags = {}

  attr_accessor :uri, :uri_hash, :state, :headers, :content, :fetched, :log

  # u:: the URI of the document, as a String; a trailing newline is stripped.
  def initialize(u)
    @log = ''
    @uri = u.chomp
    # Only ever indexed with ['ETag']; replaced by the Net::HTTPResponse of
    # the last successful fetch, which answers the same lookup.
    @headers = {}
    # hashcodeIntFromString is not defined in this file -- presumably it
    # comes from basicrdf (TODO confirm).
    @uri_hash = hashcodeIntFromString @uri
  end

  # The most recently fetched body, or '' if nothing has been fetched yet.
  def to_s
    @content.to_s
  end

  # Conditional GET using the last known ETag, unless opts['etag'] supplies
  # one explicitly. Same return value and side effects as #get.
  def cget(opts = {})
    # merge rather than assign, so the caller's hash is left untouched
    opts = opts.merge('etag' => headers['ETag']) unless opts['etag']
    get(opts)
  end

  # Perform an HTTP GET on #uri.
  #
  # opts['etag']:: when present, sent as If-None-Match.
  #
  # Returns the response body on a 2xx response, or '' when the server
  # answers 304 Not Modified. Side effects:
  # * @log is reset and filled with a trace of this request;
  # * on 2xx, @headers, @content and @fetched (true) are updated;
  # * on 304, @fetched is set to false and @headers/@content are kept, so
  #   the known ETag and cached body survive.
  #
  # Raises RuntimeError for transport failures and for any other status.
  def get(opts = {})
    etag = opts['etag']
    @@etags[uri] = etag

    # FIXME: we don't use base_uri yet.
    @log = "URI is #{uri}\n"
    @log << "Sending etag: #{etag}\n" if etag
    @log << "uri hash: #{uri_hash}\n" if uri_hash

    # TODO: gz
    request_headers = { 'User-agent' => 'RubyRDF;http://www.w3.org/2001/12/rubyrdf/intro.html' }
    request_headers['If-None-Match'] = etag if etag
    @log << "Sending headers: #{request_headers.inspect}\n"

    begin
      resp = http_get(request_headers)
    rescue StandardError => e
      raise "rdfget: HTTP GET on '#{uri}'. \nerror:#{e}"
    end
    @log << "HEAD: #{resp.code} #{resp.to_hash.inspect}\n"

    # Net::HTTP#get does not raise on non-2xx statuses, so dispatch on the
    # response class instead of pattern-matching an exception message.
    case resp
    when Net::HTTPNotModified
      @log << "Got a 304 (from ETag?)\n"
      @fetched = false
      ''
    when Net::HTTPSuccess
      @headers = resp
      @content = resp.body.to_s # body is nil for e.g. 204 No Content
      @fetched = true
      @content
    else
      raise "rdfget: HTTP GET on '#{uri}'. \nerror:#{resp.code} #{resp.message.inspect}"
    end
  end

  private

  # Issue the GET with the given request headers and return the
  # Net::HTTPResponse. Honours an explicit port and https; the fragment,
  # if any, is not sent to the server.
  def http_get(request_headers)
    target = URI.parse(uri)
    raise ArgumentError, 'not an HTTP(S) URI' unless target.is_a?(URI::HTTP)

    http = Net::HTTP.new(target.host, target.port)
    http.use_ssl = target.scheme == 'https'
    http.open_timeout = 10
    http.read_timeout = 60
    http.get(target.request_uri, request_headers)
  end
end

####################
# Manual smoke test. Runs only when this file is executed directly, so that
# requiring it for WebDoc does not trigger network traffic.

if __FILE__ == $PROGRAM_NAME
  page = 'http://www.w3.org/2001/12/rubyrdf/pack/tests/robotexclusion/t5.txt'
  doc = WebDoc.new(page)

  puts "1st get (should fetch):"
  doc.cget
  puts "Fetched: #{doc.fetched}"

  puts "2nd get (shouldn't fetch; etag is #{doc.headers['ETag']}):"
  c = doc.cget
  puts "Fetched: #{doc.fetched}"
  puts "Content was: '#{c}'"

  puts "3rd get (should fetch; etag is #{doc.headers['ETag']}):"
  doc.cget('etag' => 'bogus')
  puts "Fetched: #{doc.fetched}"
end