#!/usr/bin/env ruby
# Author:: Dan Brickley
# License:: W3C Software License
# towards support of: http://www.robotstxt.org/wc/norobots.html
# TODO:
#  multiple user-agents on same Rule not supported yet.
#  test for multiple '*' records.

require 'test/unit'

# Exercises RobotFileParser#can_fetch against four fixture files
# (t1.txt .. t4.txt) and two robots.txt samples embedded below.
class TC_RobotExclusion < Test::Unit::TestCase
  attr_accessor :files, :data, :verbose

  NUMFILES=4

  # Reads every fixture file into @data, keyed by file name.
  # Paths are resolved against the current working directory.
  def setup
    @files = ['t1.txt', 't2.txt', 't3.txt', 't4.txt']
    @data = {}
    @verbose = true
    @files.each do |file|
      # STDERR.puts "Loading file: #{file}"
      # File.read closes the handle once the content has been read.
      @data[file] = File.read(file)
    end
  end

  def test_loaded
    assert(!@data.nil?, "Got some data")
    assert_equal(NUMFILES, @data.keys.size, "Check number of test fields loaded.")
  end

  def test_t1
    rp = parser_for('t1')
    assert(rp.can_fetch('anybot', '/blah'), "t1: anybot ok for /blah")
    assert(!rp.can_fetch('anybot', '/tmp/'), "t1: anybot not ok for /tmp/")
    assert(rp.can_fetch('anybot', '/tmp'), "t1: anybot ok for /tmp")
    assert(!rp.can_fetch('anybot', '/foo.html'), "t1: anybot not ok for /foo.html")
  end

  def test_t2
    rp = parser_for('t2')
    assert(rp.can_fetch('anybot', '/blah'), "t2: anybot ok for /blah")
    assert(rp.can_fetch('anybot', '/tmp/'), "t2: anybot ok for /tmp/")
    assert(!rp.can_fetch('anybot', '/cyberworld/map/'), "t2: anybot not ok for /cyberworld/map/")
    assert(rp.can_fetch('cybermapper', '/cyberworld/map/'), "t2: cybermapper ok for /cyberworld/map/")
    assert(rp.can_fetch('cyberMAPper', '/cyberworld/map/'), "t2: cyberMApper ok for /cyberworld/map/")
  end

  def test_t3
    rp = parser_for('t3')
    assert(!rp.can_fetch('anybot', '/cyberworld/map/'), "t3: anybot not ok for /cyberworld/map/")
    assert(!rp.can_fetch('anybot', '/tmp/'), "t3: anybot not ok for /tmp/")
    assert(!rp.can_fetch('cybermapper', '/tmp/'), "t3: cybermapper not ok for /tmp/")
    assert(!rp.can_fetch('FoafBot', '/people/'), "t3: foafbot not ok for /people/")
  end

  # Only checks that t4.txt can be parsed and printed; no access assertions yet.
  def test_t4
    parser_for('t4')
  end

  def test_rdfwebsite
    t=<<-DATA
User-agent: *
Disallow: /cgi-bin/
Disallow: /rdf-viz/
Disallow: /wordnet/
Disallow: /rweb/
Disallow: /viewcvs/
    DATA
    # todo, HTTP GET this
    rp = RobotFileParser.new(t, 'rdfweb site')
    STDERR.puts rp if @verbose
    assert(!rp.can_fetch('myscutter', '/cgi-bin/foo'), "rdfweb robots.txt refuses access to /cgi-bin/ for *")
    assert(rp.can_fetch('myscutter', '/cgi-bar/foo'), "rdfweb robots.txt allows access to /cgi-bar/ for *")
  end

  def test_w3csite
    t=<<-DATA
#
# robots.txt for http://www.w3.org/
#

# For use by search.w3.org
User-agent: W3Crobot/1
Disallow: /Out-Of-Date

# AltaVista Search
User-agent: AltaVista Intranet V2.0 W3C Webreq
Disallow: /Out-Of-Date

# exclude some access-controlled areas
User-agent: *
Disallow: /Team
Disallow: /Project
Disallow: /Systems
Disallow: /Web
Disallow: /History
Disallow: /Out-Of-Date
Disallow: /2002/02/mid
Disallow: /People/all/
    DATA
    # todo, HTTP GET this
    rp = RobotFileParser.new(t, 'copy of w3c site robots.txt')
    STDERR.puts rp if @verbose
    assert(!rp.can_fetch('myscutter', '/Team'), "w3c robots.txt refuses access to /Team/ for *")
    assert(!rp.can_fetch('W3Crobot/1', '/Out-Of-Date'), "w3c robots.txt refused access to /Out-of-Date/ for W3Crobot/1")
    assert(rp.can_fetch('W3Crobot/1', '/Team'), "w3c robots.txt allows access to /Team for W3Crobot/1")
  end

  ## TODO:
  ## more tests to exercise looser name matching against

  private

  # Builds a parser from the fixture "<n>.txt" loaded by setup, using n as
  # the ruleset name, and prints it to STDERR when @verbose is set.
  def parser_for(n)
    rp = RobotFileParser.new(@data[n + '.txt'], n)
    STDERR.puts rp if @verbose
    rp
  end
end

#######################################################################
#
# would-be library code. (httputils.rb ?)
# todo: Also dig out my etags stuff, and package them.
#
# in python, this is 'robotparser'

# Parses the text of a robots.txt file into RobotRule records and answers
# whether a given user agent may fetch a given path.
class RobotFileParser
  # raw:: the text passed to new(), unmodified
  # ruletext:: the comment-stripped text, one string per record
  # rules:: one RobotRule per entry of ruletext
  # name:: label used in to_s and in debug output
  # url:: not used yet (see read)
  # verbose:: when true, can_fetch traces its decisions on STDERR
  attr_accessor :raw, :ruletext, :rules, :name, :url, :verbose

  def to_s
    "RULESET #{@name} with rules #{rules.inspect}"
  end

  # read a robots.txt URI (from :url)
  #
  # Not implemented: always raises a RuntimeError. The missing-url check
  # comes first so that it is actually reachable.
  def read
    raise "no url specified for this RobotFileParser" unless url
    raise "read() method unimplemented. pass text and name to new() for now"
  end

  # text:: full robots.txt content
  # name:: free-form label for this ruleset
  #
  # Lines starting with '#' are dropped, then the text is split into records
  # on blank lines; each record becomes one RobotRule.
  def initialize(text = '', name = '')
    @verbose = true
    @raw = text
    @name = name
    # Normalise CRLF / CR line endings so blank-line detection works.
    normalized = text.gsub(/\r\n?/, "\n")
    # The optional \n also removes a comment on an unterminated last line.
    without_comments = normalized.gsub(/^#.*$\n?/, '')
    # A separator line may contain spaces or tabs and still count as blank.
    @ruletext = without_comments.split(/\n(?:[ \t]*\n)+/)
    @rules = @ruletext.map { |rule| RobotRule.new(rule) }
    # STDERR.puts "Rule: \n\n#{r.inspect}\n\n"
  end

  # Does this ruleset can_fetch 'agent' accessing resource 'path'?
  #
  # Returns true unless some Disallow entry of the applicable rules is a
  # prefix of path. Every entry is checked (no early exit) so that the
  # verbose trace lists all of them.
  def can_fetch(agent, path)
    # we assume rules aren't ordered (check)
    STDERR.puts "\n\nChecking ruleset #{name} if agent: #{agent} ok for #{path}" if @verbose

    # lets get the right set of rules first
    myrules = rules_for(agent)
    STDERR.puts "\nWORKING ruleset: \n#{myrules.inspect}\n\n" if @verbose

    # then check them
    excluded = false
    myrules.each do |r|
      STDERR.puts "looking for #{path} in matched item: #{r.disallow.inspect}" if @verbose
      r.disallow.each do |privpath|
        STDERR.puts "check: is #{path} beneath #{privpath}" if @verbose
        next unless lmatch(path, privpath)

        excluded = true
        STDERR.puts "LMATCH Refused access! setting excluded=true (matched #{path} in #{privpath}" if @verbose
      end
    end

    STDERR.puts "WELCOME: #{!excluded} in #{name}\n\n" if @verbose
    !excluded # if !excluded, can_fetch...
  end

  private

  # Rules whose user agent equals 'agent' ignoring case; when there are none,
  # the generic '*' rules instead (only note generic rule if nothing 4 me).
  # todo: liberal match, substring, ignore version etc.
  def rules_for(agent)
    wanted = agent.downcase
    specific = rules.select { |r| r.useragent.downcase == wanted }
    return specific unless specific.empty?

    rules.select { |r| r.useragent == "*" }
  end
end

# is this super-verbose? I suspect so...
# true if 'part' matches within 'main' anchored at start
#
# Plain prefix comparison: 'part' is taken literally, so characters such as
# '.', '?' or '(' in a path are not interpreted as regular-expression syntax.
def lmatch(main, part)
  main.start_with?(part)
end

# A RobotRule has one useragent (which may be '*') and multiple "disallows".
#
# Built from the text of a single robots.txt record. Field names are matched
# case-insensitively and may be indented. Values have a trailing "# comment"
# and surrounding whitespace (including a stray "\r") removed. If a record
# has several User-agent lines, the last one wins.
class RobotRule
  # text:: copy of the record text given to new()
  # useragent:: value of the User-agent line, '' when there is none
  # disallow:: array of non-empty Disallow path prefixes, in file order
  attr_accessor :text, :useragent, :disallow

  USER_AGENT_LINE = /\A\s*User-agent:\s*(.*)\z/i
  DISALLOW_LINE = /\A\s*Disallow:\s*(.*)\z/i
  # A '#' at the start of the value or preceded by whitespace opens a comment.
  TRAILING_COMMENT = /(?:\A|\s)#.*\z/

  def initialize(data = '')
    @text = data.clone
    @useragent = ''
    @disallow = []
    @text.split(/\r?\n/).each do |line|
      # STDERR.puts "CHECKING LINE: #{line}\n"
      if (found = USER_AGENT_LINE.match(line))
        @useragent = clean_value(found[1])
      elsif (found = DISALLOW_LINE.match(line))
        path = clean_value(found[1])
        # An empty Disallow value means "nothing is disallowed". Storing ''
        # would act as a prefix of every path and block the whole site.
        @disallow.push(path) unless path.empty?
        # STDERR.puts "Storing dis: #{path}\n"
      end
    end
  end

  private

  # Strips a trailing comment and surrounding whitespace from a field value.
  def clean_value(raw)
    raw.sub(TRAILING_COMMENT, '').strip
  end
end

# Derives from StandardError so that a plain `rescue` can handle it.
class RobotExclusionError < StandardError
  # anything to add?
end