import boto
import re
import os
import StringIO
import zipfile

from boto.s3.key import Key

# bucket: wikipedia-delete-2012-05
# algorithm:
#   get each wtarchive*.zip, get the list of articles first,
#   check if there are any we need, then download them
url = "http://archive.org/download/wikipedia-delete-2012-05/wtarchive300512081506.zip/enwikipediaorg_w-20120530-wikidump/enwikipediaorg_w-20120530-titles.txt"
# http://ia601203.us.archive.org/zipview.php?zip=/24/items/wikipedia-delete-2012-05/wtarchive300512081506.zip&file=enwikipediaorg_w-20120530-wikidump/enwikipediaorg_w-20120530-titles.tx

# archive.org exposes an S3-compatible API; connect over plain HTTP.
conn = boto.connect_s3(host='s3.us.archive.org', is_secure=False)
buckets = conn.get_all_buckets()


def process_zip(fname, data):
    """List the members of a zip archive held in memory as a string."""
    output = StringIO.StringIO()
    output.write(data)
    zf = zipfile.ZipFile(output, mode='r')
    for zi in zf.infolist():
        print "%s %s" % (fname, zi.filename)
        # d = zf.open(zi)
        # self.indexfile = d
        # self.readindex(fname, zi.filename, position, block)


# make sure the download directory exists before writing into it
if not os.path.isdir("data"):
    os.makedirs("data")

for b in buckets:
    if re.search(r'wikipedia-delete-(\d\d\d\d)-(\d\d)', b.name):
        print "found %s" % b.name
        store = b
        print store
        for k in store.get_all_keys():
            print k
            rkey = Key(store)
            rkey.key = k.name
            print "name %s type %s" % (rkey.name, rkey.content_type)
            match = re.search(r'(wtarchive\d+\.zip)$', rkey.name)
            if match:
                zipfilename = match.group(1)
                print "match %s" % zipfilename
                # data = rkey.get_contents_as_string()
                outfilename = "data/old%s" % zipfilename
                rkey.get_contents_to_filename(outfilename)
                print outfilename
                os.system("unzip %s -d data/" % outfilename)
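
# --- sketch, not part of the original script --------------------------------
# The algorithm comment above says to read the titles list first and only keep
# archives that contain something we need, but the loop currently unzips every
# matching archive. A minimal way to add that check against the zip already on
# disk might look like the helper below. The WANTED_TITLES set and the
# "-titles.txt" member naming are assumptions based on the example URL above,
# not confirmed for every wtarchive*.zip.
def zip_contains_wanted_titles(zip_path, wanted_titles):
    """Return True if the archive's *-titles.txt lists any wanted title."""
    zf = zipfile.ZipFile(zip_path, mode='r')
    try:
        for member in zf.namelist():
            if member.endswith("-titles.txt"):
                # the titles file is one article title per line
                for line in zf.open(member):
                    if line.strip() in wanted_titles:
                        return True
        return False
    finally:
        zf.close()

# Possible use, right after get_contents_to_filename(outfilename):
#   WANTED_TITLES = set(["Some deleted article"])   # hypothetical titles
#   if zip_contains_wanted_titles(outfilename, WANTED_TITLES):
#       os.system("unzip %s -d data/" % outfilename)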