#!/usr/bin/env ruby # # Ruby-RDF # RDF Query support # $Id: squish.rb,v 1.10 2003/05/06 02:39:19 danbri Exp $ =begin This is a basic library to support RDF query applications in Ruby. It complements (but doesn't yet depend on) Ruby-RDF. It provides a SquishQuery class, representing simple "SQL-ish" queries against RDF datasets. A goal is to allow for multiple textual representations of the same logical query. The toAlgae() method, for example, shows how a Squish query can be exported in Algae syntax. The class also provides a toSQLQuery() method which will generate an SQL query suitable for use against a PostgreSQL database that stores RDF in a certain way. The Ruby-RDF's Graph.toSQL() method provides a corresponding mechanism for turning RDF graphs into such databases. acknowledgements: The SQL functionality is a pretty literal translation of Libby Miller's original Java version, which in turn was heavily inspired by Matt Biddulph's PHP implementation. It is currently one large method; I might reorganise it once the tests are in place. The Squish parser too is pretty rough, but worked well enough to get the SQL code working. Squish is based on Guha's original RDF query language (see RDFdb), but has a cuter name and a few new features (USING clauses, for example). The tests will have to replace documentation, for now. The hashcode function was written by Damian Steer, based on the Stanford RDF API SHA1 code. Note that this code is pretty undocumented and messy in its first release, but it WorksForMe. There are some test files in samples/. Investigate the runtests.rb script (or the last few lines of squish.rb) to see how it can be used. Dan Brickley useful refs: http://www.io.com/~jimm/downloads/rubytalk/talk.html http://www.daml.org/listarchive/joint-committee/0856.html http://www.hpl.hp.com/semweb/rdql.html http://www.hpl.hp.com/semweb/rdql-grammar.html Random Notes: a fresh query object always needed (since state isn't blanked down). So we see a lot of this sort of thing: query = SquishQuery.new() =end # # A class representing a simple, SQL-ish RDF query # =begin SquishQuery highlights: query = SquishQuery.new() query.parseFromText(squish_text) puts "Parsed and reprinted: "+query.toSquish puts "Variables: "+ query.all_vars.keys puts "Export as Algae: "+ query.toAlgae puts "Squish2SQL: "+ query.toSQLQuery =end class SquishQuery # incl parsed query state attr_accessor :select_args, :from_args, :using_args, :where_args, :clauses, :xmlns, :full_clauses, :all_vars #, :select_arglist ## Hmm, don't understand why needed a method for an array field ## todo: read docs on attr_accessor def sels return @select_arglist end def initialize @clauses = [] @full_clauses = [] @xmlns = {} @loginfo = [] @verboselog = [] @all_vars = {} # todo: find out where/how one declares fields of a class! # seems to be no point in doing so, since it's all so runtimy # but I want to document them somewhere. Maybe just via attr_accessor? # @select_args @from_args="" # @using_args # @where_args # state used in WHILE productions # @lastpred # @lastsub # @lastobj end def loginfo (production, content, nextdata) logmsg = "LOGINFO: ['#{production}'] got content: '#{content}'\n" # @loginfo.push(logmsg) # @verboselog.push logmsg + " remains: #{nextdata}\n\n" end def logwarning (msg) @warnings.push(msg) puts "PARSER WARNING: #{msg}\n" end def parseFromText (text) text.gsub!(/\n/," ") text.gsub!(/\r/," ") parse = select_keyword(text) extractAllVars expandAllNamespaces return self end # select_keyword: # entry point # # -> select_arglist # def select_keyword (text) if text =~ /^\s*SELECT\s+(.*)/is nextdata = $1 # puts "Found select keyword. Next is: #{nextdata}\n" loginfo('select_keyword', 'SELECT', nextdata) return(select_arglist(nextdata)) end puts "Error: Expected first chars to be 'SELECT ' GOT: '#{text}'" end ################ # select_arglist: # def select_arglist (text) ### # # The SELECT arg list can be terminated with an (optional) FROM clause if text =~ /^\s*(.*?)\s*(\s+FROM\s+.*)/si # TODO: test case that'll fail with a from: URI scheme. # this is all pretty ugly. content = $1 # content.gsub!(/\s*$/, "") ## todo: shouldn't have to strip ws here nextdata = $2 @select_args = content loginfo('select_arglist (found FROM) ', content, nextdata) rawsel = content rawsel.gsub!(/\\?/,"") rawsel.gsub!(/,\s*$/,"") vars = rawsel.split(/,\s*/) # ws mandatory? # puts ("SELS-with-from: #{vars}\n") @select_arglist = vars; return from_keyword(nextdata) # move to from_keyword production ### # # Else it should be terminated with a WHERE clause elsif text =~ /^\s*(.*)\s+WHERE\s+(.*)/is # puts "select_arglist: no from, got WHERE!\n" content = $1 nextdata = $2 @select_args = content loginfo('select_arglist (omitted FROM) ', content, nextdata) rawsel = content # Should be separate function rawsel.gsub!(/\\?/,"") vars = rawsel.split(/,\s*/) #puts ("SELS-without-from: #{vars.inspect}\n") @select_arglist = vars; if where_lpar(nextdata) #puts "Processed all where clauses OK, nothing left to do."# todo end # this is a *mess*! # if @remainder =~ /\S/ # return using_keyword ( nextdata ) # end return true # Done! end # neither ended in FROM or WHERE puts "Error: Expected arg list for SELECT to end with FROM or WHERE. GOT: #{text}" end # the optional FROM keyword # -> where_lpar (todo: to where_keyword) # def from_keyword (text) #puts "DEBUG from_keyword: #{text}" if text =~ /^\s*FROM\s+(.*?)(\s+WHERE\s+.*)/i content = $1 nextdata = $2 loginfo('from_keyword, next is where_lpar', content, nextdata) @from_args = content nextdata.gsub!(/WHERE\s+/i,"") ## TODO: make a node for WHERE_KEYWORD return where_lpar(nextdata) end #puts "Expecting (optional) FROM; didn't find it." return false end # using_keyword: # final clause (optional), deals with namespace expansions # def using_keyword (text) # puts "Using-keyword got: #{text} \n" if text =~ /\s*USING+(.*)/i nextdata = $1 loginfo('using_keyword', 'USING', nextdata) # puts "using_keyword: #{nextdata} \n" return using_arglist(nextdata) end puts "Expecting 'USING' keyword, found: #{text} " return false end def using_arglist(text) if text =~ /\s*(.*)$/ content = $1 loginfo('using_arglist', content, '[end]' ) @using_args = content usedPrefixes(content) # extract #puts "using_args: #{content} \n" return true end puts "Expected arguments to USING, found nothing.\n" return false end #### the 'WHERE' clause # where_lpar: '(' # where clause, left paren # # -> deal with each pred_expr or drop out of WHERE via using_keyword # def where_lpar (text) if text =~ /^\s*\(\s*(.*)/ nextdata = $1 loginfo('where_lpar', ' ( ', nextdata ) return pred_expr( nextdata ) end if text =~ /^\s*USING/i return using_keyword(text) end raise " squish parser error. Expected: '(' or 'USING' Got: #{text}" # note: test1-bogusclause.squish tries 'ABUSING' clause @remainder=text return false; end # pred_expr # where clause, predicate expression # -> sub_expr # def pred_expr (text) if (text =~ /\s*(\S+)\s+(.*)/) content = $1 @lastpred = content nextdata = $2 loginfo('pred_lpar', content, nextdata ) return sub_expr(nextdata) end puts "Error: pred_expr didn't find expected content \n" return false end # sub_expr # where clause, subject expression # -> obj_expr # def sub_expr (text) if (text =~ /\s*(\S+)\s+(.*)\s*/) content = $1 @lastsub = content nextdata = $2 loginfo('sub_expr', content, nextdata ) return obj_expr(nextdata) end puts "Error: sub_expr didn't find expected content in '#{text}'\n" end # obj_expr # where clause, object expression # -> where_rpar # def obj_expr (text) if (text =~ /(\S+)\s*(\).*)/) content = $1 # puts("object: "+content+"\n") @lastobj=content nextdata = $2 loginfo('obj_expr', content, nextdata ) return where_rpar(nextdata) end puts "Error: obj_expr didn't find expected content in '#{text} \n" end # where_rpar: # where clause, right paren # -> where_lpar # def where_rpar (text) # puts "Does text '#{text} match ')'... \n" if text =~ /^\)\s*(.*)\s*/ nextdata = $1 # puts "rpar: next prod: where_lpar #{$1} \n" loginfo('where_rpar', ' ) ', nextdata ) if (nextdata =~ /\S/ ) clause = "#{@lastsub} -- #{@lastpred} --> #{@lastobj}\n" qt = [ @lastpred, @lastsub, @lastobj ] self.clauses.push( qt ) ## puts ("DEBUG: #{clauses.inspect} \n") ## TODO: Store this in SquishQuery return where_lpar( nextdata ) end puts "[finished with the entire WHERE clause]\n\n" return true # we're done with this where # should flush temporary state, store goodstuff etc # Note: qnames need expanding later end puts "Error: Expected ')'" return false; end # output requested variables as contents of a Squish SELECT clause # # eg: "?x, ?y, ?z" # # todo: refactor to hide the * case from apps that want a clean list # of vars, ie. sharecode w/ algae function. self.sels is bad data. # def toVarQList s=[] if self.sels[0] =~ /^\*/ return "*" # special case for 'SELECT *' (ie. select all named vars) end self.sels.each {|q| s.push("?#{q}") } return s.join(", ") end def toSquish sq = "SELECT #{toVarQList} \n" if from_args =~ /\S/ sq += "FROM #{from_args} \n" end sq += "WHERE \n" cl= clauses.each { |qt| sq += " ( #{qt[0]} #{qt[1]} #{qt[2]} ) \n" #todo: commas? whitespace? } sq += "USING #{using_args} \n" return sq end # expand namespaces and store in expanded_clauses # def expandAllNamespaces sq="" full_clauses=[] # reset state. wise? cl= clauses.each { |qt| p = expns(qt[0]) s = expns(qt[1]) o = expns(qt[2]) sq += " ( #{p} #{s} #{o} ) " # puts ("Epanding: #{sq} \n") self.full_clauses.push([p,s,o] ) } return ("Expanded: "+sq) end def expns (text) # possibly qualified expression, eg dc::foo if text =~ /(\w+)::(.*)/ if ($1 != nil) ns = xmlns[$1] if (ns == nil) raise "squish parser error: undeclared namespace #{$1}\n" else text = xmlns[$1] + $2 end end # puts "Expns: Got #{$1} and #{$2} -> #{text} \n" end return text end # update self.all_vars based on variables named in self.clauses # def extractAllVars ## todo: reset state first? same issue re xmlns / USING... clauses.each { |qt| parts = qt[0..3] parts.each { |term| if term =~ /^\?(.*)/ l = self.all_vars[$1] # look for list of clauses using this variable if l == nil self.all_vars[$1] = [qt] else self.all_vars[$1].push(qt) end end } } end # Output in Algae syntax (URI for spec?) # todo: look at commas, throw exception for '*' or workaround as below # - for this, need an query.allvarnames() method # - warn if there's a FROM clause, or figure out Algae syntax # - find out Algae notation for using ns prefixes def toAlgae a = "(ask '(\n" cl= full_clauses.each { |qt| a += " ( #{qt[0]} #{qt[1]} #{qt[2]} ) \n" } vars = toVarQList vout=[] if vars =~ /\*/ # warnings.push "Algae doesn't support * selector. (todo) Default is collect all vars" all_vars.keys.each {|q| vout.push("?#{q}") } else vars.each {|q| vout.push("#{q}") } end a += " ) collect '( #{vout.join(', ')} )\n)\n" ## lose the commas? return a end def usedPrefixes (text) nslist = text.split(/\s+/) state='prefix' #Todo: investigate ruby constants mechanism content="" nslist.each { |item| if state =~ /prefix/ content=item state='for' elsif state =~ /for/ state = 'uri' elsif state =~ /uri/ self.xmlns[content]=item content="" state = 'prefix' elsif puts "TODO: parse USING clause properly!\n" end } return xmlns end ################################################################################ # # Squish2SQL facilities # # Convert a Squish query to SQL for our store. # # example: # # SquishQuery.new.parseFromText("SELECT ?res, ?val, WHERE (http://purl.foo/ ?res ?val) USING foo for bar").toSQLQuery # # => "SELECT DISTINCT b1.value AS res, b2.value AS val FROM triples a1, resources b1, resources b2 WHERE b1.keyhash=a1.subject AND b2.keyhash=a1.object AND a1.predicate = '4700643'" def toSQLQuery (opts={}) sql="" sqlVariableNamesA = [] sqlVariableNamesB = [] sqlVariableMatchAB = {} realToSqlVarname_A={} id_a_clause=1 # counter # RDBMS table and field names p_field='predicate' s_field='subject' o_field='object' main_table = 'triples' lookup_table = 'resources' # for storing generated WHERE Clause fragments, two categories: where_triples=[] # triples, ie. "a" (assertions) where_lookup=[] # libby's "b" or 'resources' table clauses.each { |clause| p,s,o = clause[0..2] p=expns(p) s=expns(s) o=expns(o) p_bound = false s_bound = false o_bound = false p.gsub!(/^\?/,"") # this by reference, changes the contents of clauses # TODO: this will trip us up. Fix! decide on whether has ? or not s.gsub!(/^\?/,"") o.gsub!(/^\?/, "") #puts("CLAUSE: #{p} ; #{s} ; #{o} \n") all_vars.keys.each { |varname| # puts "Scanning for varname: #{varname.inspect} sub=#{s.inspect} id: #{id_a_clause}\n" if s.eql? varname realToSqlVarname_A[varname] = "a#{id_a_clause}.#{s_field}" s_bound = true # hmm, back to front? end if p.eql? varname realToSqlVarname_A[varname]= "a#{id_a_clause}.#{p_field}" p_bound= true end if o.eql? varname realToSqlVarname_A[varname]= "a#{id_a_clause}.#{o_field}" o_bound = true end } sh1_sub = hashcodeIntFromString(s) # wasteful, not always needed (see below) sh1_pred = hashcodeIntFromString(p) sh1_obj = hashcodeIntFromString(o) # puts "\n\nSHA-Triple: s=#{s} (#{sh1_sub}) p=#{p} (#{sh1_pred}) o=#{o} (#{sh1_obj}) \n" # puts "Var bindings: #{s_bound} #{p_bound} #{o_bound} \n" if !s_bound where_triples.push "a#{id_a_clause}.#{s_field} = '#{sh1_sub}'" #puts "DEBUG: s = #{sh1_sub} \n" end if !p_bound where_triples.push "a#{id_a_clause}.#{p_field} = '#{sh1_pred}'" #puts "DEBUG: p = #{sh1_pred} \n" end if !o_bound where_triples.push "a#{id_a_clause}.#{o_field} = '#{sh1_obj}'" #puts "DEBUG: o = #{sh1_obj} \n" end id_a_clause += 1 } # end big loop thru clauses #puts "VARNAMES: #{realToSqlVarname_A.inspect}\n" #puts "Got constraints: \n\n #{where_triples.inspect} \n" sqlVarnames=[] # todo: describe this all_vars.keys.each do |variableNameToMatch| sqlVariableNamesA = [] #puts "\nScanning for clauses that use variable: #{variableNameToMatch} \n\n" cl_idx=1 clauses.each do |clause| # puts "Clause: #{clause.inspect} \n" p,s,o = clause[0..2] p_bound = false s_bound = false o_bound = false if s.eql? variableNameToMatch sqlVariableNamesA.push("a#{cl_idx}.#{s_field}") #puts "Matched! (subject)\n" end if p.eql? variableNameToMatch sqlVariableNamesA.push("a#{cl_idx}.#{p_field}") #puts "Matched! (predicate)\n" end if o.eql? variableNameToMatch sqlVariableNamesA.push("a#{cl_idx}.#{o_field}") #puts "Matched! (object) \n" end cl_idx= cl_idx+1 #++ didn't. end if sqlVariableNamesA.size > 1 sqlVarnames.push(sqlVariableNamesA) end end #puts "Current varname equalities for where_triples constraints: #{sqlVarnames.inspect}\n\n" # # this writes "a1.subject = a2.predicate" constraints. # note that there is (I think) some redundancy in this method. (@@check java code) sqlVarnames.each { |bindings| j = 0 bindings.each { |part| if (j+1 ( ( 1 << 31 ) -1 ) ) return ( r - ( 1 << 32 ) ) end return r end end # end of SquishQuery class definition ################################################################################ ## ## Basic Squish implementation (over RDF graph) # query a graph using squish (returns ResultSet) # def SquishQuery.ask(query,db) tables=[] query.full_clauses.each do |clause| vars={} p,s,o=clause[0..2] if p.gsub!(/^\?/,"") vars['p']=p p=nil end if s.gsub!(/^\?/,"") vars['s']=s s=nil end if o.gsub!(/^\?/,"") vars['o']=o o=nil end bindings = ResultSet.new a = db.ask(Statement.new(s,p,o)).statements.each do |s| # puts "Vars are: #{vars.values.join ' ' } " # todo: store vars vb={} vb[vars['s']] = s.subject.to_s if (vars['s'] != nil) vb[vars['p']] = s.predicate.to_s if (vars['p'] != nil) vb[vars['o']] = s.object.to_s if (vars['o'] != nil) bindings.push(ResultRow.new(vb)) # puts "vars.values: #{vars.values}\n" #hmm: bindings.vars=vars.values end tables.push bindings end htmllog = "" current = tables.shift # todo: if no matches, this would be returned. begin while (!tables.empty?) # puts "Calling chooseTable with current vars: #{current.vars}" nx = chooseTable(tables, current.vars) joinvars = current.vars & nx.vars htmllog += "

Matching next constraint: vars: #{joinvars.inspect}

" htmllog += "

Current matches:

" + current.html_report old=current new = current.match nx, joinvars current = new htmllog += "

New matches:

"+current.html_report end rescue if (!tables.empty?) # puts "query failed - unbound tables" tables.each do |t| # puts " todo: #{t} tables still to do: #{t.vars.inspect} " end current = ResultSet.new else # puts "Checked all tables." end # todo: should we check that tables is now empty? end current.debug_report += htmllog # todo: how to make sure all constraints met? todo... answers=ResultSet.new answers.debug_report=current.debug_report dup={} current.each do |row| r='' row.values.each_key do |col| r += col.to_s + row.values[col] end if (!dup[r.hash]) dup[r.hash] =1 answers.push row end end return answers end def SquishQuery.chooseTable (tables, vars) # puts "Choosing amongst tables to match vars: #{vars} \n" maybe=[] if (vars!=nil) vars.each do |v| # puts "trying for a table that knows about '#{v}' " tables.each do |t| if t.vars.member? v # puts "Got a match: #{v}" maybe.push t end end end else # raise "*** warning: chooseTable called with nil vars. (do we care?)" end # puts "Possibilities: #{maybe.size} \n" if maybe.size > 0 tables.delete(maybe[0]) ## puts "Chose table w/ #{maybe[0].vars} " return maybe[0] else raise "No plausible table" end end ################################################################################ ## ## Sample Web service plugins: Google/backlinks # # todo: a general API for such things would be nice. This is rather arbitrary. def SquishQuery.googleBacklinks (uri = ARGV[0], varname='page', nohost=nil) require 'webfetcher' r = ResultSet.new nohost=nil # TODO: it doesn't work yet, so ignore... uri = 'http://www.w3.org/2001/sw/' if (uri == nil) uri.gsub!(/http:\/\//,"") maxhits=100 #todo: nohost = "+-site:#{nohost}" if (nohost != nil) uri = "#{uri}#{nohost}&num=#{maxhits}" pc = WebFetcher::Page.url('http://www.google.com/search?q=link:'+uri).links pc.each do |link| doc="#{link.to_s}" unless (doc =~ /google/i) # puts "Storing: #{doc} " r.push ResultRow.new( {varname=> doc} ) end end return r end ################################################################################ ## ## DBI storage / query facilities # # haven't modeled this particularly well. # #require 'dbi' class ResultSet < Array attr_accessor :query , :debug_report#, :vars def initialize (query=nil) @query=query @debug_report='

Debugging info...

' end def vars= (v) puts "***WARNING: can't set vars() on ResultSet. Ignoring data: #{v} Real vars: #{self.vars} \n" end def html_report (report='') report += "\n" report += "\n\n" self.vars.each do |v| report += " " end report += "\n" self.each do |row| report += "\n" row.values.each_key do |col| report += "\n" end report += "\n" end report += "
#{v}
#{row.values[col]}
\n\n" return report end def vars if self.size>0 ## puts "!!! rs.vars: #{self[0].values.keys} " return self[0].values.keys else ## puts "!!! rs.vars: empty list. " return [] end end # match this result set against another, returning a 3rd # examine their associated queries to determine column to join on # # - tests should make sure that calling result2.match(result1) gets same results def match(results2=nil, joinvars=nil) results1=self results3 = ResultSet.new() if (joinvars==nil) joinvars = results2.query.sels & results1.query.sels end self.debug_report += "\n

Graph matcher called with tables:\n

" results3.debug_report += "
" #joinvars.each do |j| puts "matching on common field(s) jv: #{j} " end results2.each do |r2| results1.each do |r1| reject=false r1.values.each do |col,val| next unless joinvars.member? col if r2.values[col] == val #puts "Considering: #{col} -> #{val}" else # puts "Rejecting!" reject=true # a required column doesn't match break # we could be more efficient...? # also, if results from graph sorted # predictably, this might all be much faster end end if (!reject) # merge and store # todo: add .toHTML() to resultset and code # to make sure ResultSets are grids not wierd trees # puts "matched row! Storing #{r1.inspect} \n" r3 = r2.clone; r1.values.each do |col,val| r3.values[col]=val end results3.push r3 ## TODO: we currently allow duplicates. This is bad. ## Not clear how to avoid this problem. ## results3.vars = results1.vars & results2.vars end end end return results3 end # end match end class ResultRow attr_accessor :values def method_missing (methid) str = methid.id2name return values[str] end def initialize (row) @values=row # a hash #puts "NEW ROW: #{row.inspect} \n" #row.each do |k| # puts "K: #{k} V:#{row['loc']}" #end end end ############################################################################# # An SQL-based RDF data service # class SimpleSQLGraph require 'dbi' attr_accessor :opts, :verbosity def initialize (o={}) # puts "Initializing SimpleSQL store with opts: #{o.inspect}" @opts=o @verbosity=0 @verbosity=o['verbosity'] if o['verbosity'] end #not used yet def connect (d,u,p) c= DBI.connect(opts['dbi_driver'],opts['dbi_user'],opts['dbp_pass']) #puts "Connected: #{c} " return true end # return pairs of node content and node id that match some substring # def substrings (q=nil) matches={} DBI.connect(opts['dbi_driver'],opts['dbi_user'],opts['dbp_pass']) do | dbh | sql = "select keyhash, value from resources where lower(value) like '%#{q}%';" dbh.select_all( sql ) do | row | matches[ row['keyhash']] = row['value'] end end return matches end # TODO: - extend for RDF Core WG's datatyping idioms # - integrate more fully with query processor def findPropertySubstring (q=nil, propuri=nil) matches={} DBI.connect(opts['dbi_driver'],opts['dbi_user'],opts['dbp_pass']) do | dbh | sql = "SELECT r1.keyhash AS keyhash, r3.value AS res, r1.value AS val FROM resources r1, resources r2, resources r3, triples WHERE lower(r1.value) LIKE '%#{q}%' AND triples.object = r1.keyhash AND triples.subject = r3.keyhash AND triples.predicate = r2.keyhash AND r2.value='#{propuri}';" dbh.select_all( sql ) do | row | matches[ row['res']] = row['val'] end end return matches ## todo: we should be able to return a ResultRow end def defrag (idprops=[]) # Smushing: defragment the stored graph # TODO: OWL support #build-in some knowledge of some useful props... foaf='http://xmlns.com/foaf/0.1/' fp=[foaf+'mbox', foaf+'homepage',foaf+'mbox_sha1sum',foaf+'dnaChecksum'] idprops=idprops+fp sameIndividual={} DBI.connect(opts['dbi_driver'],opts['dbi_user'],opts['dbp_pass']) do | dbh | idprops.each do |idprop| STDERR.puts "defragmenting graph on owl:InverseFunctionalProperty: #{idprop}" if verbosity>3 q1 = "select ?resource, ?value, WHERE (#{idprop} ?resource ?value) USING foo for bar" query = SquishQuery.new sqlq = query.parseFromText(q1).toSQLQuery dbh.select_all(sqlq) do | row | res = row['resource'] val = row['value'] STDERR.puts "#{res} #{idprop} #{val} . " if verbosity>3 #nt-ish eq = sameIndividual[idprop+" "+val]; if(eq != nil) eq[res] = 1 else sameIndividual[ idprop+" "+val ] = { res => '1' } # TODO: this is confusing. rewrite! # sameIndividual[idprop+" "+val][res]='1' end end end sameIndividual.each_key do |nameset| idset = sameIndividual[nameset].keys; STDERR.puts "HELP: #{sameIndividual[nameset].inspect}" if verbosity>20 STDERR.puts "Property/Value: #{nameset} \nIdentifies: #{sameIndividual[nameset].keys.join(" ")}\n\n" if verbosity>3 new = idset.shift; # randomly picked; should take care to value URIs over genids, common terms over obscure ones STDERR.puts "Picked canonical ID: #{new} from idset of #{idset.size}" if verbosity>5 # if we have two URIs, should add URIs in as properties # rather than lose them. idset.each do |name| puts "SQL-SMUSH: update triples SET subject='#{hashcodeIntFromString new}' WHERE subject='#{hashcodeIntFromString name}';" if verbosity>3 begin dbh.do "update triples SET subject='#{hashcodeIntFromString new}' WHERE subject='#{hashcodeIntFromString name}';" rescue DBI::ProgrammingError raise "Unknown DBI error during subject smush: #{$!.to_s}" if !($!.to_s =~ /duplicate key/) end begin dbh.do "update triples SET predicate='#{hashcodeIntFromString new}' WHERE predicate='#{hashcodeIntFromString name}';" rescue DBI::ProgrammingError raise "Unknown DBI error during predicate smush: #{$!.to_s}" if !($!.to_s =~ /duplicate key/) end begin dbh.do "update triples SET object='#{hashcodeIntFromString new}' WHERE object='#{hashcodeIntFromString name}';" rescue DBI::ProgrammingError raise "Unknown DBI error during object smush: #{$!.to_s}" if !($!.to_s =~ /duplicate key/) end # puts "SMUSH last error was: $!" end end end return self end # status: not working yet. # def addAllSuperProperties subprop='http://www.w3.org/2000/01/rdf-schema#subPropertyOf'## @@todo: define elsewhere superProperty={} q1 = "select ?resource, ?value, WHERE (#{subprop} ?resource ?value) USING foo for bar" query = SquishQuery.new() newtriples=[] reshash={} # todo: fix to respect transitive hierachy. currently works on layer only. # FIXME: see SimpleSQLGraph code, merge. Allow for multiple SQL reprs. DBI.connect(opts['dbi_driver'],opts['dbi_user'],opts['dbp_pass'] ) do | dbh | dbh.select_all(query.parseFromText(q1).toSQLQuery ) do | row | superProperty[row['resource']] = row['value'] end # todo: should add the implied superProperty relations to superProperty superProperty.each_key do |prop| #puts " #{prop} is a sub of the superProperty: #{superProperty[prop]} \n\n" # DesignIssue: should we persevere with doing this over the RDF interface? # q1 = "SELECT ?resource, ?value, WHERE (#{prop} ?resource ?value) USING foo for bar" # squish TODO: remove requirement on USING field, and ',' bogosity. query = SquishQuery.new() sq= query.parseFromText(q1).toSQLQuery puts "superProperty: asking db: #{q1} \n #{sq}\n\n " if verbosity>2 dbh.select_all(sq) do | row | #puts "ADDING: #{row['resource']} -- #{superProperty[prop]} -> #{row['value']}" val = row['value'] #TODO: we need to know type, ie. whether a literal or res, genid... # this isn't right...! if (val =~ /\s/) val = "\""+val+"\"" end #puts "VALUE: #{val} " newtriples.push([ hashcodeIntFromString(row['resource']), hashcodeIntFromString( superProperty[prop] ), hashcodeIntFromString( val ) ]) reshash[row['resource']]= hashcodeIntFromString(row['resource']) reshash[row['value']]= hashcodeIntFromString(row['value']) reshash[superProperty[prop]]= hashcodeIntFromString(superProperty[prop]) end end # delete old cache'd triples (WARNING: be careful with DELETE WHEREs!) dbh.do "DELETE FROM triples WHERE assertid='SUPERPROP_CACHE'; " cache_id='SUPERPROP_CACHE' # todo: store these up as one batch for faster loading newtriples.each do |t| begin dbh.do "insert into triples values ('#{t[0]}','#{t[1]}','#{t[2]}', '#{cache_id}' ); " rescue Exception puts "DEBUG: Smush caught db storage gripe: '#{$!}'" # OK, just duplicates. TODO: check exact error # ERROR: Cannot insert a duplicate key into unique index onepso (DBI::ProgrammingError) end #puts "insert into triples values ('#{t[0]}','#{t[1]}','#{t[2]}');" # todo: isresource column shouldn't be empty. write a test that barfs. end reshash.each_key do |r| #debug: make sure resources table is populated. #todo: this shouldn't be needed. We're hashing literals incorrectly :( begin # dbh.do "insert into resources values ('#{reshash[r]}', '#{r}' ); " # puts "insert into resources values ('#{reshash[r]}', '#{r}' ); " rescue # puts "Caught error (duplicate resource info)" end end end return self end # basic RDF API with an SQL store, using a 'simple' # view over p/s/o tables. # TODOs: # bNodes, literal-vs-resources etc ## RDF API over SQL store (simple schema) def ask(st) #puts "Query: #{st.inspect} stype:#{st.subject.class}" st=Statement.new if st==nil #shorthand. do we want this? #2^3 2*2*2=8 combinations # oxo if((st.subject==nil) & (st.predicate!=nil) & (st.object==nil)) return sql2graph "SELECT distinct p,s,o FROM simple WHERE p='#{st.predicate.to_s}';" end # oxx if((st.subject==nil) & (st.predicate!=nil) & (st.object!=nil)) return sql2graph "SELECT distinct p,s,o FROM simple WHERE p='#{st.predicate.to_s}' \ AND o='#{st.object.to_s}';" end # xxx if((st.subject!=nil)&(st.predicate!=nil)&(st.object!=nil)) return sql2graph "SELECT distinct p,s,o FROM simple WHERE \ s='#{st.subejct.to_s}' \ AND p='#{st.predicate.to_s}' \ AND o='#{st.object.to_s}'; " end # ooo if((st.subject==nil)&(st.predicate==nil)&(st.object==nil)) return sql2graph "SELECT distinct p,s,o FROM simple"; end # xoo if((st.subject!=nil)&(st.predicate==nil)&(st.object==nil)) return sql2graph "SELECT distinct p,s,o FROM simple WHERE s='{st.subject.to_s}'"; end # xxo if((st.subject!=nil)&(st.predicate!=nil)&(st.object==nil)) return sql2graph "SELECT distinct p,s,o FROM simple WHERE s='#{st.subject.to_s}' \ AND p='#{st.predicate.to_s}'"; end # xox if((st.subject!=nil)&(st.predicate==nil)&(st.object!=nil)) return sql2graph "SELECT p,s,o FROM simple WHERE \ s='#{st.subejct.to_s}' \ AND o='#{st.object.to_s}'; " end # oox if((st.subject==nil) & (st.predicate==nil) & (st.object!=nil)) return sql2graph "SELECT p,s,o FROM simple WHERE o='#{st.object.to_s}';" end raise "unimplemented API query: #{st.inspect}" #shouldn't get here end def store_graph(data,src='',log='') #TODO: see if this works with MySQL too. sql_inserts=data.toSQLInserts src begin if !sql_inserts.empty? log += "updating query server.\n" DBI.connect( opts['dbi_driver'], opts['dbi_user'], opts['dbi_pass']) do | dbh | log+= "- # delete from triples where assertid = '#{src}';\n" begin dbh.do "delete from triples where assertid = '#{src}';" rescue log += "DBI: Error in sql delete, msg: #{$!}\n" end log += "+" sql_inserts.each do |sql_insert| begin log += '+' dbh.do sql_insert # puts "DEBUG: #{sql_insert} \n" rescue # puts "DBI: Error in sql insert #{src} sql: #{sql_insert} msg: #{$!}" #verbose re dups end end log += "\n" end else log+= "skipping reload (no triples).\n\n" end rescue Exception puts "Caught error: #{$!}" end return log end def status r='' DBI.connect(opts['dbi_driver'], opts['dbi_user'], opts['dbi_pass']) do |dbh| s1='select count(distinct assertid) from triples;' dbh.select_all(s1) do |row| r+="Number of distinct graphs: #{row.inspect}\n" end s1='select count(subject) from triples;' dbh.select_all(s1) do |row| r+="Number of distinct attributed triples: #{row.inspect}\n" end if verbosity>50 s2='select distinct p from simple;' r+="All predicates: " dbh.select_all(s2) do |row| r+="p= #{row['p']}\n" end end end #TODO: # see http://esw.w3.org/topic/RubyRdfDatabaseSetup #SELECT triples.subject FROM triples LEFT OUTER JOIN resources ON triples.subject=resources.keyhash WHERE resources.keyhash IS NULL # #SELECT triples.predicate FROM triples LEFT OUTER JOIN resources ON triples.predicate=resources.keyhash WHERE resources.keyhash IS NULL # #SELECT triples.object FROM triples LEFT OUTER JOIN resources ON triples.predicate=resources.keyhash WHERE resources.keyhash IS NULL return r end def destroydb DBI.connect(opts['dbi_driver'], opts['dbi_user'], opts['dbi_pass']) do |dbh| STDERR.puts "Destroying database (#{opts.inspect}): deleted from triples and resources." if verbosity>0 dbh.do "delete from triples; " dbh.do "delete from resources; " end '' end def sql2graph(sql) raise "No SQL query provided" if ((sql==nil) or (sql=="")) data=Graph.new([]) puts "SQL: #{sql}" if verbosity>0 DBI.connect( opts['dbi_driver'], opts['dbi_user'], opts['dbi_pass']) do | dbh | dbh.select_all(sql) do |row| s=Statement.new(row['s'],row['p'],row['o']); data.tell s end return data end end end