""" read thread files like 2008/threads/000/001/http%3A%2F%2Fboards.ie%2Fvbulletin%2Fsioc.php%3Fsioc_type%3Dthread%26sioc_id%3D2055208846 to estimate how busy forums are over time. To avoid reading all the post files and their dates, I'll pretend that all the posts in a thread happened on the same day. Then we quantize to months, so it's probably good enough. Output statements are like this: :traffic [ :month "2008-10"; :numPosts 35 ] . you should be able to sum all the numPosts numbers for a given forum+month. the key part of the data we're looking at is like this: a :Thread; dc:title "happy new year guys"; dcterms:created "2008-01-01T00:24:53"; :container_of , , , ; :has_parent ; :link ; :num_views "108" . """ from __future__ import division import os, sys, time from rdflib.Graph import Graph from rdflib import FileInputSource, RDF, BNode, Literal from ns import SIOC, DCTERMS, BB def yearMonth(dateTime): return int(dateTime[:4]), int(dateTime[5:7]) def addForumCounts(graph, counts): """add this graph's data into the counts structure""" for thread in g.subjects(RDF.type, SIOC['Thread']): forum = g.value(thread, SIOC['has_parent']) forumCounts = counts.setdefault(forum, {}) y, m = yearMonth(g.value(thread, DCTERMS['created'])) numPosts = len(list(g.objects(thread, SIOC['container_of']))) forumCounts[(y,m)] = forumCounts.get((y,m), 0) + numPosts counts = {} # forum : { (year,month) : count } dirname = sys.argv[1] # "data/2008/threads/000/001" bytesRead = 0 startTime = time.time() for i, filename in enumerate(os.listdir(dirname)): fullFilename = os.path.join(dirname, filename) bytesRead += os.stat(fullFilename).st_size g = Graph() try: g.parse(FileInputSource(open(fullFilename)), format='xml') except: print "reading", fullFilename raise addForumCounts(g, counts) if i % 50 == 0: sys.stdout.write('.') sys.stdout.flush() print out = Graph() for forum, forumCounts in counts.iteritems(): for month, num in forumCounts.iteritems(): n = BNode() out.add((forum, BB['traffic'], n)) out.add((n, BB['month'], Literal("%04d-%02d" % month))) out.add((n, BB['numPosts'], Literal(num))) out.serialize("forumTrafficSummaries/%s" % dirname.replace('/','_'), format='nt') elapsed = time.time() - startTime print "%s xml bytes in %s sec, %s bps" % (bytesRead, elapsed, bytesRead / elapsed)