""" take a directory full of statements like this :traffic [ :month "2008-10"; :numPosts 35 ] . and add together all the numPosts values that are for the same forum+month. The output looks like the input, but with no repeated forum+month pairs. We also write out a pickle file with a dict like {(forumUri1, '2008-10') : 65, ...} """ import os, pickle from rdflib.Graph import Graph from rdflib import BNode, Literal from ns import BB count = {} # (forum, yearmonth) : count dirname = "forumTrafficSummaries" for i, filename in enumerate(os.listdir(dirname)): g = Graph() g.parse(os.path.join(dirname, filename), format='nt') for forum, month, num in g.query("SELECT ?forum ?month ?num WHERE { ?forum bb:traffic [ bb:month ?month; bb:numPosts ?num ] }", initNs=dict(bb=BB)): key = (forum, month) count[key] = count.get(key, 0) + int(num) if i % 50 == 0: print "%s files read" % i pickle.dump(count, open("forumCounts.pickle", "w"), protocol=-1) out = Graph() for (forum, month), num in count.items(): n = BNode() out.add((forum, BB['traffic'], n)) out.add((n, BB['month'], month)) out.add((n, BB['numPosts'], Literal(num))) out.serialize("forumCounts.nt", format='nt')