#!/usr/bin/env ruby
#
# scutter: an RDF harvesting, indexing and query answering gadget
# author: danbri@w3.org
#
# Usage: 
#	./scutter --scutter
#	./scutter --server
# ...to harvest, index data, and then run an rdf query server for it.

# needs ./webcache/_nt and ./webcache sub-directories, for now

# eg see http://starway.heddley.com/~edmundd/plan.rdf

# GPG/PGP integration:
# in webcache/ try:  grep WOT *meta | sort | uniq
# todo: feed these uris to GPG
# example...
#
# given an assurance and local copy of the RDF it allegedly signs:
#
# GET http://rdfweb.org/people/danbri/rdfweb/webwho.xrdf.asc > _tmpsig; gpg --verify _tmpsig  rdf-1679583185.rdf
# gpg: Warning: using insecure memory!
# gpg: Signature made Sat 06 Jul 2002 02:37:02 PM GMT using DSA key ID 73228FE4


## Notes

## re-org needed: parsing local should be distinct from storing. Example:
#u = 'http://www.ilrt.bristol.ac.uk/discovery/rdf/resources/rss.rdf'
#nt_cache = "#{base}webcache/_nt/rdf-#{ scutter_remote(u) }.nt"
#graph = Loader.ntfile2graph( nt_cache )
#puts graph.toNtriples


## HTTP Proxy:
## - scutter is now proxy-aware
##
## HTTP_PROXY=http://cache-edi.cableinet.co.uk:8080 GET http://rdfweb.org/people/danbri/rdfweb/danbri-biblio.rdf
## ...tests timeliness of proxy.
##
## Data QA:
##
## Here's a *nasty* hack:
##	grep genid webcache/*rdf | awk -F: '{ printf "%s\n",$1 }' | sort | uniq |  \
##		perl -e 'while (<>) { $_ =~ s/\.rdf\s*//g; print `cat "$_.meta" | grep baseuri` };'
##
## ...quick report on source URIs that contain the string 'genid' (ie. likely out of date)
##
## I18N:
##
## These tools _should_ be i18n friendly. 
## For testing, try http://www.kanzaki.com/info/rss.rdf 

##
## TODO:
##
## - Needs to know about disk/directories etc., complain or create dirs instead of fail.
## - todo: investigate non-SOAP server options (webrick?)


require '/home/danbri/s-rubyrdf/basicrdf'
require '/home/danbri/s-rubyrdf/squish/squish'
require 'dbi'
require 'net/http'
require 'sha1'
require 'getoptlong'
require 'shellwords'
require 'soap/standaloneServer'
	

## CONFIG
##
## -- find a better home! see also Proxy settings
##


  
###########################################################################################	


# SOAP endpoint exposing a single operation, 'squish', which runs a Squish
# RDF query against the triples database and returns the result rows.
class App < SOAP::StandaloneServer
  def initialize(*arg)
    super(*arg)
  end

  # Registers the 'squish' operation with its two SOAP parameters.
  # (The file used to define methodDef twice; only this later, two-parameter
  # registration was ever in effect, so it is the one kept.)
  def methodDef
    addMethod(self, 'squish', 'querystring', 'querylanguage')
  end

  # Runs a Squish query and returns its rows.
  #
  # querystring   - Squish query text; it is not modified.
  # querylanguage - accepted for the SOAP signature; not used by this method.
  # datasrc       - accepted with a default; not used by this method.
  #
  # Returns an Array holding the +values+ of each result row. Each row and
  # the final array are also echoed to stdout.
  def squish(querystring, querylanguage, datasrc = 'default')
    ## CONFIG INFO (TODO: move this elsewhere)
    ##
    dbname = 'rdfweb1'                  # database name
    dbi_driver = 'DBI:Pg:' + dbname     # DBI driver
    dbi_user = 'danbri'                 # user
    dbi_pass = ''                       # password

    # next bit should be conditional on rdfauthor as client
    # Work on a copy so the caller's string is left untouched.
    text = querystring.gsub(/\s+WHERE/, ", WHERE ") #baaad! fix the damn parser!
    text = text + "  USING dc for http://purl.org/dc/elements/1.1/ "

    total = ResultSet.new
    query = SquishQuery.new.parseFromText text

    # Kept from the original: these calls run before every query.
    service = DBIDataService.new(dbi_driver, dbi_user, dbi_pass)
    service.defrag
    service.addAllSuperProperties

    DBI.connect(dbi_driver, dbi_user, dbi_pass) do |dbh|
      dbh.select_all(query.toSQLQuery) do |row|
        # clone: the row hash is copied before being stored
        total.push ResultRow.new(row.to_h.clone)
      end
    end

    ret = []
    total.each do |row|
      ret.push row.values
      row.values.each_key do |field| puts "\t#{field}: #{row.values[field]} \n" end
      puts "\n\n"
    end
    puts "results are: #{ret.inspect} \n\n\n"
    ret
  end
end

# Creates (or truncates) "<script name>.log" in the current directory,
# writes a two-line header followed by a blank line, and returns the
# still-open File so the caller can keep logging to it.
def getWireDumpLogFile
  log_name = "#{File.basename($0)}.log"
  log = File.open(log_name, 'w')
  log << "File: #{ log_name } - rdf query server logs.\n"
  log << "Date: #{ Time.now }\n\n"
  log
end
 
    



###########################################################################################	

# given a local RDF file (cached, in effect), parse and load it
# todo: * pass in datasource info
# Converts a cached RDF/XML document to N-Triples with the external
# `rdfdump` tool and reloads its triples into the database.
#
# file      - cache key; the document is read from
#             "#{cache_dir}webcache/rdf-#{file}.rdf"
# base_uri  - base URI handed to rdfdump
# cache_dir - directory prefix (including trailing slash) that holds
#             webcache/ and webcache/_nt/
#
# Returns the graph produced by Loader.ntfile2graph, or nil when rdfdump
# printed any diagnostics (in which case the database is left untouched).
def scutter_local(file, base_uri = '', cache_dir = './')
  ## CONFIG INFO (TODO: move this elsewhere)
  ##
  dbname = 'rdfweb1'                  # database name
  dbi_driver = 'DBI:Pg:' + dbname     # DBI driver
  dbi_user = 'danbri'                 # user
  dbi_pass = ''                       # password

  rdf_cache = "#{cache_dir}webcache/rdf-#{file}.rdf"
  nt_cache = "#{cache_dir}webcache/_nt/rdf-#{file}.nt"

  puts "scutter_local: file=#{file} with base=#{base_uri} :"

  # base_uri can come from crawled documents, so every argument is
  # shell-escaped. "2>&1 >" sends the parser's diagnostics to us and its
  # N-Triples output to the cache file; with a plain ">" redirect the
  # captured text was always empty and parse errors were never noticed.
  command = [
    'rdfdump -q -r -o ntriples',
    Shellwords.escape("file:#{rdf_cache}"),
    Shellwords.escape(base_uri.to_s),
    '2>&1 >',
    Shellwords.escape(nt_cache)
  ].join(' ')
  pmsg = `#{command}`
  puts pmsg

  puts "N-Triples cache: #{nt_cache}"

  unless pmsg == ''
    puts "Error parsing: #{pmsg}"
    return nil
  end

  graph = Loader.ntfile2graph(nt_cache)
  sql_inserts = graph.toSQLInserts("uri=#{file}")

  if sql_inserts.empty?
    puts "skipping reload (no triples)."
    return graph
  end

  puts "updating query server."
  DBI.connect(dbi_driver, dbi_user, dbi_pass) do |dbh|
    # clean out last triples from this src
    # TODO: this risky? make sure won't accidentially zap the db.
    puts "-  #dbi.do delete from triples where assertid = 'uri=#{file}';"
    begin
      dbh.do "delete from triples where assertid = 'uri=#{file}';"
    rescue
      puts "DBI: Error in sql delete, msg: #{$!}"
    end
    puts "+"

    sql_inserts.each do |sql_insert|
      begin
        print '.'
        dbh.do sql_insert
      rescue
        # Deliberately silent: duplicate inserts into unique fields are
        # expected and reporting each one would be extremely verbose.
      end
    end
    puts
  end

  graph
end

# Fetches +uri+ over HTTP and stores the body in the web cache.
#
# uri       - absolute URI of the RDF document
# base      - base URI recorded in the .meta file (defaults to uri)
# cache_dir - directory prefix (including trailing slash) holding webcache/
# proxy     - when true the request goes through the hard-coded HTTP proxy
#
# Writes "#{cache_dir}webcache/rdf-<hash>.rdf" and a matching ".meta" file,
# where <hash> is hashcodeIntFromString(uri). A gzip-encoded body is also
# kept as "rdf-<hash>.rdf.gz" and stored decompressed in the .rdf file.
# Any error is reported on stdout and swallowed.
#
# Returns the hash, whether or not the fetch succeeded.
def scutter_remote(uri, base = uri, cache_dir = './', proxy = true)
  puts "Scuttering remote: #{uri}"

  #### PROXY SETTINGS
  # TODO: Make this configurable elsewhere.
  proxy_addr = 'cache-edi.cableinet.co.uk'
  proxy_port = 8080

  #######################################

  uri_hash = hashcodeIntFromString(uri)
  uri =~ /:\/\/([^\/]+)(\/.*)$/
  host, resource = $1, $2
  puts "Host: #{host} Resource: #{resource} "

  # Truncate on open so a shorter rewrite never leaves stale bytes behind.
  write_flags = File::CREAT | File::WRONLY | File::TRUNC

  begin
    puts "Getting: #{resource}"
    raise ArgumentError, "cannot find host and resource in URI" if host.nil?

    headers = { 'Accept' => 'application/rdf+xml' }
    resp = nil
    if proxy
      Net::HTTP::Proxy(proxy_addr, proxy_port).start(host) do |http|
        resp = http.get(resource, headers)
        puts "Proxied GET."
      end
    else
      puts "Un-Proxied GET."
      resp = Net::HTTP.new(host).get(resource, headers)
    end
    data = resp.body

    # Checked for both proxied and direct requests.
    gzipped = (resp['Content-encoding'].to_s =~ /gzip/) ? true : false

    puts "Storing in webcache URI: #{uri} as #{uri_hash} .rdf / .meta"
    # delete (todo: rcs/cvs archive) previous cached data
    rdf_fn = "#{cache_dir}webcache/rdf-#{uri_hash}.rdf"
    File.delete(rdf_fn) if File.exist?(rdf_fn)

    # store current data
    if gzipped
      require 'zlib'  # special handling of gzipped content

      gz_fn = rdf_fn + ".gz"
      File.open(gz_fn, write_flags, 0644) { |cf| cf.write data }
      unzipped = Zlib::GzipReader.open(gz_fn) { |gz| gz.read }
      File.open(rdf_fn, write_flags, 0644) { |cf| cf.write unzipped }
    else
      File.open(rdf_fn, write_flags, 0644) { |cf| cf.write data }
    end

    puts "Stored RDF"

    File.open("#{cache_dir}webcache/rdf-#{uri_hash}.meta", write_flags, 0644) do |mf|
      puts "Opened RDF .meta file"

      mf.puts "#baseuri: #{base} "
      mf.puts "#uri: #{uri}"
      mf.puts "#Last-Visit: <notrecorded>" #+Date::today::to_s
      # todo: use .nt or .rdf for this. Investigate soap/date clash.
    end
  rescue
    puts "Scutter: #Error with URI #{uri} msg: #{$!}"
  end

  uri_hash
end


# RAA example: converts every raa-dump/*.xml file to RDF with the external
# `sabcmd` XSLT processor and loads the result through scutter_local.
#
# The RDF is written to "./webcache/rdf-<name>.rdf", which is the path
# scutter_local reads for cache key <name>. A file is skipped when sabcmd
# prints any diagnostics.
#
# Returns the array of paths matched by the glob.
def raa_load
  cache_dir = './'
  Dir['raa-dump/*.xml'].each do |path|
    file = File.basename(path)
    target = "#{cache_dir}webcache/rdf-#{file}.rdf"

    # "2>&1 >" captures sabcmd's diagnostics here while the generated RDF
    # goes to the target; file names are shell-escaped.
    sb = `sabcmd soap2rdf.xsl #{Shellwords.escape(path)} 2>&1 > #{Shellwords.escape(target)}`
    # todo: add .meta files

    if sb =~ /\w/
      puts "Skipping #{file} due to XSLT / filename error"
    else
      scutter_local(file, 'http://www.ruby-lang.org/xmlns/raa/test1-ns#', cache_dir)
    end
  end
end

####


# Crawls RDF documents starting from +todo+, following rdfs:seeAlso links.
#
# todo      - Array of seed URIs; newly discovered documents are appended
#             to this same array, which doubles as the work list.
# cache_dir - directory prefix (including trailing slash) holding webcache/
# crawl     - when true each URI is fetched with scutter_remote; when
#             false the copy already in the web cache is re-parsed.
# proxy     - forwarded to scutter_remote.
#
# For each document, wot:assurance objects are appended to its .meta file.
# Returns +todo+.
def scutter(todo = ['http://rdfweb.org/people/danbri/rdfweb/webwho.xrdf'], cache_dir = './', crawl = true, proxy = true)
  rdfs = 'http://www.w3.org/2000/01/rdf-schema#'
  wot = 'http://xmlns.com/wot/0.1/'

  # Seeds count as already queued so a seeAlso pointing back at one does
  # not schedule it a second time.
  done = {}
  todo.each { |seed| done[seed.to_s] = 1 }

  # Array#each re-checks the length on every step, so URIs pushed onto
  # todo inside the loop are visited too.
  todo.each do |uri|
    uri_hash = hashcodeIntFromString(uri)

    # Arguments are passed positionally as (uri, base, cache_dir, proxy);
    # previously proxy landed in the base slot. Without crawling, the
    # cache key alone is enough for scutter_local to find the cached copy.
    fetched = crawl ? scutter_remote(uri, uri, cache_dir, proxy) : uri_hash
    next if fetched.nil?

    loaded = scutter_local(fetched, uri, cache_dir)
    next if loaded.nil?   # parse failed; nothing to inspect

    seeAlso = loaded.ask(Statement.new(nil, rdfs + 'seeAlso', nil)).objects
    puts "SeeAlso: #{seeAlso.inspect} " unless seeAlso.empty?
    seeAlso.each do |doc|
      puts "Scutter: adding to TODO list: #{doc} "
      unless done[doc.to_s]
        todo.push doc.to_s
        done[doc.to_s] = 1
      end
    end

    # look for signatures
    puts "WOT: looking for <#{uri}> <#{wot + 'assurance'}> <?>"
    assurances = loaded.ask(Statement.new(uri, wot + 'assurance', nil)).objects
    next if assurances.empty?

    puts "WOT assurances: #{assurances.inspect} "
    meta_fn = "#{cache_dir}webcache/rdf-#{uri_hash}.meta"
    File.open(meta_fn, File::WRONLY | File::APPEND | File::CREAT, 0644) do |mf|
      assurances.each do |sig|
        puts "Scutter: invoking GPG : #{sig} "
        mf.puts "WOT-Assurance: #{sig.inspect} "
        # gpg --quiet --verify sigfile contentfile # do here or elsewhere?
      end
    end
  end
end

# Command-line handling. '--noproxy' and '--raa' are handled below, so they
# are declared here too; undeclared options are rejected by GetoptLong.
opt = GetoptLong.new(
    ['--import','-i', GetoptLong::NO_ARGUMENT],
    ['--check', '-c', GetoptLong::REQUIRED_ARGUMENT],
    ['--test','-t', GetoptLong::NO_ARGUMENT],
    ['--verbose','-v', GetoptLong::NO_ARGUMENT],
    ['--collect','-f', GetoptLong::OPTIONAL_ARGUMENT],
    ['--scutter','-h', GetoptLong::OPTIONAL_ARGUMENT],
    ['--nocache','-n', GetoptLong::OPTIONAL_ARGUMENT],
    ['--nocrawl','-1', GetoptLong::OPTIONAL_ARGUMENT],
    ['--noproxy', GetoptLong::NO_ARGUMENT],
    ['--raa', GetoptLong::NO_ARGUMENT],
    ['--server','-s', GetoptLong::OPTIONAL_ARGUMENT],
    ['--diskdb','-d', GetoptLong::REQUIRED_ARGUMENT],
    ['--lowercase-domains','-l', GetoptLong::NO_ARGUMENT],
    ['--realwhitelist','-r', GetoptLong::REQUIRED_ARGUMENT],
    ['--export', '-x', GetoptLong::OPTIONAL_ARGUMENT]
)

proxy = true
crawl = true

# Pass 1: read every option. Flags take effect wherever they appear on the
# command line; actions are only recorded, in the order given.
actions = []
opt.each_option do |name, arg|
  case name
  when '--noproxy'
    proxy = false
  when '--nocrawl'
    crawl = false
  when '--scutter', '--raa', '--server'
    actions.push [name, arg]
  end
end

# Pass 2: run the recorded actions with the final flag values.
actions.each do |name, arg|
  case name

  when '--scutter'
    todolist = []
    if arg =~ /\w/
      puts "...."
      puts "scuttering: #{arg}"
      todolist.push arg
    else
      todolist.push 'http://rdfweb.org/people/danbri/rdfweb/webwho.xrdf'
    end

    scutter(todolist, './', crawl, proxy)

    #  scutter [ ' http://jibbering.com/rdfsvg/1025083477768.rdf' ]

  when '--raa'
    puts raa_load

  when '--server'
    SquishProtocolNS = "http://rdfweb.org/RDF/RDFWeb/SOAPDemo"
    rdfq_port = 8082
    server = App.new('SOAPDemo', SquishProtocolNS, '0.0.0.0', rdfq_port)
    server.setLog('_rdfq.log', 0, 0)
    # Report the port number rather than the server object.
    puts "Running RDF Query SOAP Service on port: #{rdfq_port} "
    server.start
  end
end
