#!/usr/bin/env ruby
#
# $Id: webdata,v 1.2 2003/07/13 12:09:42 danbri Exp $
# A sample scutter implementation, using the framework
# defined in RDF4R/scutter.rb
#
# by Dan Brickley
# $Id: webdata,v 1.2 2003/07/13 12:09:42 danbri Exp $
#
# TODO: this is more a demo app than a test. Write some tests and
# turn the rest into a general RDF utility, e.g. /usr/bin/webdata
# ...move to ../examples/ ?
#

require 'getoptlong'

# NOTE(review): this file reached review with its line breaks collapsed and
# with a span elided at this point -- only the fragment below survives:
#
#   help_long=<...
#
# The elided span presumably holds the help text, the `require` lines for the
# RDF4R scutter framework, and the first part of `go` (the creation of the
# crawler `ayf`, the `page_summary` handler and the `air` namespace prefix
# that the code below refers to). Restore those from the original file; they
# are NOT reconstructed here, so `go` cannot run as it stands.

# Placeholder printed instead of the real database password.
PASSWORD_MASK = '********'

# Returns a copy of +opts+ that is safe to print: the value of '--dbi-pass',
# when present, is replaced by PASSWORD_MASK. +opts+ itself is not modified.
def printable_options(opts)
  return opts unless opts.key?('--dbi-pass')

  opts.merge('--dbi-pass' => PASSWORD_MASK)
end

# Wires up the crawler's handlers and the SQL sink, then starts the crawl.
#
# NOTE(review): this `def` line is reconstructed from the call site at the
# bottom of the file (`go(start_uri, o)`) and from the body's use of `o`; the
# original header sits in the elided span -- confirm against the original.
#
# start_uri:: value of --scutter; not used in the visible part of the body,
#             presumably consumed by the elided crawler setup (TODO confirm).
# o::         Hash of option name => argument string, as built at the bottom
#             of this file.
def go(start_uri, o)
  # Page handler: asks the page for statements matching a pattern in which
  # only the middle term (air + "iata") is bound, and prints every object of
  # the result.
  airports = lambda do |crawler, page|
    #page.reg_xmlns 'http://www.megginson.com/exp/ns/airports#', 'air'
    rs = page.ask Statement.new(nil, air + "iata", nil)
    rs.objects.each do |a|
      a.graph = page
      puts "AIRPORT: #{a} got airport code in #{crawler.uri})"
    end
  end

  # stats to be output at start of each loop
  #
  loopstats = lambda do |s|
    puts "INIT: s.left.size=#{s.left.size} s.seen.size=#{s.seen.size} current: #{s.uri}"
  end

  # Error handler: prints whatever it is handed.
  error_logger = lambda { |e| puts "ERROR: #{e}" }

  #trying to find src of memory bloat: (on windows anyway)
  #objstats=nil
  #ayf.pagehandlers.push Proc.new {|c,page|
  #  objstats=Hash.new(0) # empty and recount each loop
  #  ObjectSpace.each_object{|x|
  #    objstats[x.class.name]=objstats[x.class.name]+1
  #  }
  #  puts "ObjectStats: #{objstats.inspect}\n"
  #}

  # puts "OPTIONS are: #{o.inspect}"
  #o['--dbi-user']='foooo'
  # puts "USER was: #{o['--dbi-user']}"

  # Database settings: command-line values win over the built-in defaults.
  # (Descriptive names also avoid shadowing Kernel#p with a local called `p`.)
  db_driver = o['--dbi-driver'] || 'DBI:Pg:scutter1'
  db_user   = o['--dbi-user']   || 'danbri'
  db_pass   = o['--dbi-pass']   || ''

  # Never echo the real password: show a mask when one is set, nothing when
  # it is empty.
  shown_pass = db_pass.to_s.empty? ? '' : PASSWORD_MASK
  puts "\n\nDB config: -n=#{db_driver} -u=#{db_user} -p=#{shown_pass} " \
       "\n\nopts:#{printable_options(o).inspect}\n\n"

  sink = SimpleSQLGraph.new('dbi_driver' => db_driver,
                            'dbi_user'   => db_user,
                            'dbi_pass'   => db_pass)
  sink.verbosity = 5

  # Interpolation instead of String#+ so that a non-String return value from
  # the sink cannot raise TypeError; the output is unchanged for Strings.
  puts "DESTROYING DATABASE:#{sink.destroydb}" if o['--destroydb']

  # Page handler: stores the page in the SQL sink under the crawler's current
  # URI, then runs the sink's defrag and super-property steps and prints its
  # status.
  datasink = lambda do |crawler, page|
    puts "Storing data from #{crawler.uri}"
    puts "SQL log: #{sink.store_graph(page, crawler.uri)}"
    puts "Smushing..."
    puts sink.defrag
    sink.addAllSuperProperties #todo
    puts "SQLdb: status: #{sink.status}"
  end

  # register some handlers:
  ayf.pagehandlers.push page_summary, airports, datasink
  ayf.inithandlers.push loopstats
  ayf.errorhandlers.push error_logger

  ayf.run # set crawler running!
end

#################################################################

# No space between `new` and `(`: with a space, Ruby 1.9+ rejects the
# parenthesised multi-argument list as a syntax error.
parser = GetoptLong.new(
  ['--check',      '-c', GetoptLong::REQUIRED_ARGUMENT],
  ['--destroydb',  '-x', GetoptLong::NO_ARGUMENT],
  ['--test',       '-t', GetoptLong::NO_ARGUMENT],
  ['--collect',    '-f', GetoptLong::OPTIONAL_ARGUMENT],
  ['--scutter',    '-s', GetoptLong::REQUIRED_ARGUMENT],
  ['--dbi-driver', '-n', GetoptLong::REQUIRED_ARGUMENT],
  ['--dbi-user',   '-u', GetoptLong::REQUIRED_ARGUMENT],
  ['--dbi-pass',   '-p', GetoptLong::REQUIRED_ARGUMENT]
)

#start_uri = 'http://rdfweb.org/people/danbri/rdfweb/danbri-foaf.rdf'

# Collect every option under its canonical long name. Flags that take no
# argument arrive as the empty string, which is truthy in Ruby, so tests such
# as `if o['--destroydb']` in `go` still work.
options = {}
parser.each_option do |name, arg|
  value = arg.chomp
  shown = name == '--dbi-pass' ? PASSWORD_MASK : value
  puts "#{name} -> #{shown}"
  options[name] = value
end

puts "INITIAL options: #{printable_options(options).inspect}"

start_uri = options['--scutter']
go(start_uri, options)

# ./ayftest.rb --scutter=http://rdfweb.org/people/danbri/rdfweb/danbri-foaf.rdf -n DBI:Pg:scutter1 --dbi-user=danbri
# select distinct o, count(o) from simple where o like 'http://xmlns.com/wordnet/1.6/%' group by o ORDER BY count(o) desc;