#!/usr/bin/env ruby
# Author:: Dan Brickley
# License:: W3C Software License

# towards support of: http://www.robotstxt.org/wc/norobots.html

# TODO:
# multiple User-agent lines in the same record are not supported yet.
# add a test for multiple '*' records.

require 'test/unit'

class TC_RobotExclusion < Test::Unit::TestCase

  attr_accessor :files, :data, :verbose

  NUMFILES=4

  def setup
    @files=['t1.txt','t2.txt','t3.txt','t4.txt']    
    @data=Hash.new
    @verbose=true
    @files.each do |file|
      # STDERR.puts "Loading file: #{file}"
      @data[file]=File.read(file)   # File.read closes the file handle for us
    end
  end

  def test_loaded
    assert_not_nil(@data, "Got some data")
    assert_equal(NUMFILES, @data.keys.size, "Check number of test files loaded.")
  end

  def test_t1
    n='t1'
    t=@data[n+'.txt']
    rp=RobotFileParser.new(t,n)
    STDERR.puts rp if @verbose
    assert rp.can_fetch('anybot','/blah'), "t1: anybot ok for /blah"
    assert !rp.can_fetch('anybot','/tmp/'), "t1: anybot not ok for /tmp/"
    assert rp.can_fetch('anybot','/tmp'), "t1: anybot ok for /tmp"
    assert !rp.can_fetch('anybot','/foo.html'), "t1: anybot not ok for /foo.html"
  end

  def test_t2
    n='t2'
    t=@data[n+'.txt']
    rp=RobotFileParser.new(t,n)
    STDERR.puts rp if @verbose
    assert rp.can_fetch('anybot','/blah'), "t2: anybot ok for /blah"
    assert rp.can_fetch('anybot','/tmp/'), "t2: anybot ok for /tmp/"
    assert !rp.can_fetch('anybot','/cyberworld/map/'), "t2: anybot not ok for /cyberworld/map/"
    assert rp.can_fetch('cybermapper','/cyberworld/map/'), "t2: cybermapper ok for /cyberworld/map/"
    assert rp.can_fetch('cyberMAPper','/cyberworld/map/'), "t2: cyberMAPper ok for /cyberworld/map/ (case-insensitive agent match)"
  end

  def test_t3
    n='t3'
    t=@data[n+'.txt']
    rp=RobotFileParser.new(t,n)
    STDERR.puts rp if @verbose
    assert !rp.can_fetch('anybot','/cyberworld/map/'), "t3: anybot not ok for /cyberworld/map/"
    assert !rp.can_fetch('anybot','/tmp/'), "t3: anybot not ok for /tmp/"
    assert !rp.can_fetch('cybermapper','/tmp/'), "t3: cybermapper not ok for /tmp/"
    assert !rp.can_fetch('FoafBot','/people/'), "t3: FoafBot not ok for /people/"
  end

  def test_t4
    n='t4'
    t=@data[n+'.txt']
    rp=RobotFileParser.new(t,n)
    STDERR.puts rp if @verbose
    # no assertions yet: this test just checks that t4.txt parses without raising
  end

  def test_rdfwebsite
    t=<<-DATA
User-agent: *
Disallow: /cgi-bin/
Disallow: /rdf-viz/
Disallow: /wordnet/
Disallow: /rweb/
Disallow: /viewcvs/
DATA
    # todo, HTTP GET this (see the fetch_robot_rules sketch after the RobotFileParser class)
    rp=RobotFileParser.new(t,'rdfweb site')
    STDERR.puts rp if @verbose

    assert !rp.can_fetch('myscutter', '/cgi-bin/foo'), 
	"rdfweb robots.txt refuses access to /cgi-bin/ for *"

    assert rp.can_fetch('myscutter', '/cgi-bar/foo'), 
	"rdfweb robots.txt allows access to /cgi-bar/ for *"

  end




  def test_w3csite
    t=<<-DATA
#
# robots.txt for http://www.w3.org/
#

# For use by search.w3.org
User-agent: W3Crobot/1
Disallow: /Out-Of-Date

# AltaVista Search
User-agent: AltaVista Intranet V2.0 W3C Webreq 
Disallow: /Out-Of-Date

# exclude some access-controlled areas
User-agent: *
Disallow: /Team
Disallow: /Project
Disallow: /Systems
Disallow: /Web
Disallow: /History
Disallow: /Out-Of-Date
Disallow: /2002/02/mid
Disallow: /People/all/
DATA
    # todo, HTTP GET this
    rp=RobotFileParser.new(t,'copy of w3c site robots.txt')
    STDERR.puts rp if @verbose

    assert !rp.can_fetch('myscutter', '/Team'), 
	"w3c robots.txt refuses access to /Team/ for *"

    assert !rp.can_fetch('W3Crobot/1', '/Out-Of-Date'), 
	"w3c robots.txt refused access to /Out-of-Date/ for W3Crobot/1"

    assert rp.can_fetch('W3Crobot/1', '/Team'), 
	"w3c robots.txt allows access to /Team for W3Crobot/1"

  end
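
  # An extra, illustrative test using inline data only (no fixture file).
  # 'examplebot' and 'otherbot' are made-up agent names; this just sketches
  # the fall-through to the '*' record when no agent-specific record matches.
  def test_inline_example
    t=<<-DATA
User-agent: examplebot
Disallow: /private/

User-agent: *
Disallow: /
DATA
    rp=RobotFileParser.new(t,'inline example')
    STDERR.puts rp if @verbose
    assert !rp.can_fetch('examplebot', '/private/stuff'), "inline: examplebot not ok for /private/stuff"
    assert rp.can_fetch('examplebot', '/public/stuff'), "inline: examplebot ok for /public/stuff (its own record wins over '*')"
    assert !rp.can_fetch('otherbot', '/anything'), "inline: otherbot falls back to the '*' record, which disallows everything"
  end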

  ## TODO:
  ## more tests to exercise looser user-agent name matching (substring, ignore version, etc.)





end

#######################################################################
#
# would-be library code. (httputils.rb ?)
# todo: Also dig out my etags stuff, and package them.
#
# in python, this is 'robotparser'

class RobotFileParser

  def to_s
     return "RULESET #{@name} with rules #{rules.inspect}"
  end

  attr_accessor :raw, :ruletext, :rules, :name, :url
 
  # read a robots.txt URI (from :url)
  def read
    if !url
      raise "no url specified for this RobotFileParser"
    end
    raise "read() method unimplemented. pass text and name to new() for now"
  end
 
  def initialize(text='',name='')
    @verbose=true
    @raw=text
    @name=name
    @rules=[]
    text=text.gsub(/^#.*$\n/,"")     # strip full-line comments
    @ruletext=text.split(/\n\n+/)    # records are separated by blank lines
    @ruletext.each do |rule|
      r=RobotRule.new(rule)
      rules.push(r)
      # STDERR.puts "Rule: \n\n#{r.inspect}\n\n"
    end 
  end

  # Can 'agent' fetch the resource at 'path' under this ruleset?
  #
  def can_fetch(agent,path)

    # we assume rules aren't ordered (check)
    excluded=false
    STDERR.puts "\n\nChecking ruleset #{self.name} if agent: #{agent} ok for  #{path}" if  @verbose

    # let's get the right set of rules first
    myrules=[]
    mentionedme=false    
    rules.each do |r|
      if r.useragent.downcase == agent.downcase
        # todo: liberal match, substring, ignore version etc.
        myrules.push(r) 
        mentionedme=true
      end
    end
    rules.each do |r|
      if r.useragent=="*"
        myrules.push(r) if !mentionedme # only use the generic '*' rule if nothing matched for me
      end
    end

    STDERR.puts "\nWORKING ruleset: \n#{myrules.inspect}\n\n" if @verbose
    
    # then check them against the requested path
    myrules.each do |r|
      STDERR.puts "looking for #{path} in matched item: #{r.disallow.inspect}" if @verbose
      r.disallow.each do |privpath|
        STDERR.puts "check: is #{path} beneath #{privpath}"
        if lmatch(path, privpath) 
          excluded=true
          STDERR.puts "LMATCH Refused access! setting excluded=true (matched #{path} in #{privpath}" if @verbose
        end
      end
#      if (r.disallow.member?(path))
#          puts "Refused access! setting excluded=true."
#          excluded=true
#        end
    end
    STDERR.puts "WELCOME: #{!excluded} in #{name}\n\n"
    return !excluded # if !excluded, can_fetch...
  end

end
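
# Hypothetical convenience helper, not part of the original API: a minimal
# sketch of what RobotFileParser#read might eventually do, assuming the
# robots.txt lives at /robots.txt on the given site. Uses only net/http and
# uri from the standard library.
require 'net/http'
require 'uri'

def fetch_robot_rules(site_url)
  uri = URI.parse(site_url)
  uri.path = '/robots.txt'
  body = Net::HTTP.get(uri)           # response body as a String
  RobotFileParser.new(body, site_url)
end

# e.g. rp = fetch_robot_rules('http://www.w3.org/') ; rp.can_fetch('mybot', '/Team')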

# is this super-verbose? I suspect so...
# true if 'main' starts with 'part' (i.e. 'part' matches 'main' anchored at the start)
def lmatch(main,part)
  re=Regexp.new('^'+Regexp.escape(part))  # escape: Disallow paths are literal prefixes, not regexps
  return !main.scan(re).empty?
end


# A RobotRule has one useragent (which may be '*') and multiple "disallows".
#
class RobotRule

  attr_accessor :text, :useragent, :disallow
 
  def initialize(data='')
    @text=data.clone
    @useragent=''
    dis=[]
    text.split(/\n/).each do |line| 
      # STDERR.puts "CHECKING LINE: #{line}\n"
      if (line =~ /^User-agent: (.*)$/)
        @useragent=$1.strip  # strip trailing whitespace (see the AltaVista line in the w3.org data)
      end
      if (line =~ /^Disallow: (.*)$/)
        path=$1
        path.gsub!(/\s#(.*)/, "")   # drop any trailing comment
        dis.push(path)
        # STDERR.puts "Storing dis: #{path}\n"
      end
    end
    @disallow=dis
  end
end
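
# Illustrative sketch only, not used by the tests above ('examplebot' is a
# made-up agent name): one blank-line-separated record parsed into a RobotRule.
example_rule = RobotRule.new("User-agent: examplebot\nDisallow: /tmp/\nDisallow: /private/")
# example_rule.useragent  # => "examplebot"
# example_rule.disallow   # => ["/tmp/", "/private/"]
STDERR.puts "example rule: #{example_rule.inspect}" if $DEBUG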

class RobotExclusionError < StandardError   # StandardError, not Exception, so a bare rescue catches it

  # anything to add?

end
