"""
Take a directory full of statements like this:
<forum> :traffic [ :month "2008-10"; :numPosts 35 ] .

and add together all the numPosts values that are for the same
forum+month. The output looks like the input, but with no repeated
forum+month pairs.

We also write out a pickle file with a dict like
  {(forumUri1, '2008-10') : 65, ...}
"""
import os, pickle
from rdflib.Graph import Graph
from rdflib import BNode, Literal
from ns import BB

# Running totals, keyed by (forum, yearmonth) -> summed numPosts.
count = {}

dirname = "forumTrafficSummaries"

# Pulls every (forum, month, numPosts) triple-pattern match out of one file.
traffic_query = "SELECT ?forum ?month ?num WHERE { ?forum bb:traffic [ bb:month ?month; bb:numPosts ?num ] }"

for files_seen, name in enumerate(os.listdir(dirname)):
    # Each file is loaded into its own fresh graph and parsed as 'nt'.
    summary = Graph()
    summary.parse(os.path.join(dirname, name), format='nt')

    rows = summary.query(traffic_query, initNs=dict(bb=BB))
    for forum, month, num in rows:
        pair = (forum, month)
        posts = int(num)
        if pair in count:
            count[pair] += posts
        else:
            count[pair] = posts

    # Progress line whenever the zero-based file index is a multiple of 50.
    # Parenthesised so it parses as either a print statement or a call.
    if not files_seen % 50:
        print("%s files read" % files_seen)

# Save the {(forum, month): total} dict for later consumers.
# protocol=-1 selects the highest (binary) pickle protocol, so the file
# must be opened in binary mode; text mode can corrupt the stream on
# platforms that translate newlines. try/finally guarantees the handle is
# flushed and closed even if pickling fails.
pickle_file = open("forumCounts.pickle", "wb")
try:
    pickle.dump(count, pickle_file, protocol=-1)
finally:
    pickle_file.close()

# Rebuild the summary graph: one blank node per (forum, month) key,
# carrying the month and the summed post count, then write it as 'nt'.
out = Graph()
for pair in count:
    forum, month = pair
    traffic_node = BNode()
    for triple in (
        (forum, BB['traffic'], traffic_node),
        (traffic_node, BB['month'], month),
        (traffic_node, BB['numPosts'], Literal(count[pair])),
    ):
        out.add(triple)
out.serialize("forumCounts.nt", format='nt')