#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""
Merge the Yahoo and Sindice results.

The simple process is:

- read the leading entries of both result lists (see ``take_top``);
- walk both lists in parallel, position by position: if the Sindice entry
  at that position also appears in the Yahoo list, add it to the final
  list; then if the Yahoo entry at that position also appears in the
  Sindice list, add that to the final list, too (each URI is added once);
- stop after the larger of the two limits in ``tops`` has been walked.

The merged list is written to ``result.csv`` and ``result.xml`` in the
current directory.
"""
import os
import sys
from xml.sax.saxutils import escape, quoteattr

SINDICE = 'sindice'
YAHOO = 'yahoo'

# Number of leading positions considered per source; ``take_top`` reads
# twice as many entries so that matches can be found further down the
# other list.
tops = {SINDICE: 12, YAHOO: 10}

f_csv = "result.csv"
f_xml = "result.xml"


def take_top(origin):
    """
    Return the leading entries of one result set.

    Reads ``<origin>/result.csv`` (relative to the current directory) and
    returns a list of ``(uri, rank)`` tuples, both as strings exactly as
    found in the file, holding at most ``2 * tops[origin]`` entries.

    Blank lines are skipped.  Each line is split on its *last* comma, so
    a URI that itself contains commas is kept intact.

    Raises ``KeyError`` if ``origin`` is not a key of ``tops`` and
    ``ValueError`` if a non-blank line contains no comma at all.
    """
    limit = 2 * tops[origin]
    entries = []
    with open(os.path.join(origin, 'result.csv')) as f:
        # Iterate lazily: only the first ``limit`` entries are needed.
        for line in f:
            line = line.strip()
            if not line:
                continue
            uri, rank = line.rsplit(',', 1)
            entries.append((uri, rank))
            if len(entries) >= limit:
                break
    return entries


def merge(sindice, yahoo):
    """
    Return the entries that appear in both result lists.

    ``sindice`` and ``yahoo`` are sequences of ``(uri, rank)`` entries.
    The result is a list of ``(uri, sindice_rank, yahoo_rank)`` tuples.

    Both lists are walked in parallel over the first
    ``max(tops[SINDICE], tops[YAHOO])`` positions.  At each position the
    Sindice entry is considered first, then the Yahoo entry; an entry is
    added when its URI occurs anywhere in the other list and has not been
    added already.  When a URI occurs several times in the other list,
    the rank of its first occurrence is used.
    """
    def first_ranks(results):
        # Map each URI to the rank of its first occurrence.
        ranks = {}
        for entry in results:
            ranks.setdefault(entry[0], entry[1])
        return ranks

    sindice_rank = first_ranks(sindice)
    yahoo_rank = first_ranks(yahoo)

    merged = []
    seen = set()
    # NOTE(review): both lists are walked up to the larger of the two
    # limits, so Yahoo positions beyond tops[YAHOO] are considered as
    # well -- confirm this is intended.
    for i in range(max(tops[SINDICE], tops[YAHOO])):
        # Check if the sindice entry appears on the yahoo side; if so,
        # add it to the results.
        if i < len(sindice):
            uri = sindice[i][0]
            if uri in yahoo_rank and uri not in seen:
                seen.add(uri)
                merged.append((uri, sindice[i][1], yahoo_rank[uri]))
        # Same check the other way round.
        if i < len(yahoo):
            uri = yahoo[i][0]
            if uri in sindice_rank and uri not in seen:
                seen.add(uri)
                merged.append((uri, sindice_rank[uri], yahoo[i][1]))
    return merged


def dump_data(retval):
    """
    Write the merged results to ``f_csv`` and ``f_xml``.

    ``retval`` is a list of ``(uri, sindice_rank, yahoo_rank)`` tuples as
    returned by ``merge``.  The CSV file gets one ``uri,sindice,yahoo``
    line per entry; the XML file gets one element line per entry,
    numbered from 1, with all values XML-escaped.
    """
    with open(f_csv, 'w') as d_out:
        for (uri, s, y) in retval:
            d_out.write("%s,%s,%s\n" % (uri, s, y))

    # NOTE(review): the previous format string had four placeholders for
    # five values and raised TypeError; its markup appears to have been
    # lost.  The element and attribute names below are a reconstruction
    # (the text content is unchanged) -- confirm against whatever
    # consumes result.xml.
    row = (' <result>%s.<uri href=%s>%s</uri>'
           '<sindice>%s</sindice><yahoo>%s</yahoo></result>\n')
    with open(f_xml, 'w') as d_out:
        for i, (uri, s, y) in enumerate(retval, 1):
            d_out.write(row % (i,
                               quoteattr('%s' % uri),
                               escape('%s' % uri),
                               escape('%s' % s),
                               escape('%s' % y)))


def main():
    """Read both result sets, merge them and write the output files."""
    sindice = take_top(SINDICE)
    yahoo = take_top(YAHOO)
    merged = merge(sindice, yahoo)
    dump_data(merged)


if __name__ == '__main__':
    main()