import sys import re import MySQLdb connection = MySQLdb.connect(host="localhost", \ user="plusmobile", passwd="reify-plusmobile", \ db="plusmobile") cursor = connection.cursor() tagList = [] splitter = re.compile(r',|\.|\||\-| |:|\|\:|\/|\&|\'|\@|\>|\<') filter = re.compile(r'mobile|widget|the|la|le|de|of|this|in|on|for|an|m|y|el|at|us|you|my|to|by|not|com|www|rss|with|from|go|org|up|to|get|do|du|that|boobs|porn|liciousporn|fucking|pussy') def getTagsFromDB(dbcursor): print "getting tags...", sys.stdout.flush() count = 0 dbcursor.execute("select text, description, tags from pages where type='starterPack'") while(1): row = dbcursor.fetchone() if row: words= row[0]+row[1]+row[2] for word in splitter.split(words.lower()): if word and not filter.match(word) and len(word)>1: tagList.append(word) else: break count+= 1 if count % 50 == 0: print count,".. ", sys.stdout.flush() #if count == 50: break print "done." sys.stdout.flush() def countTags(): T= {} for tag in tagList: T[tag] = T.get(tag,0)+ 1 return T def sortTags(T): return sorted(T.items(), lambda k,v: cmp(k[1], v[1]), reverse=True) # # T is a list of (Tag, frequency) # def writeTagFiles(T): group = [0, 20, 50, 100] files = ["tags0.txt", "tags1.txt", "tags2.txt", "tags3.txt"] size = 3 g = group[size] f = open(files[size], "w") print "**GROUP "+ str(size) for k in T: if int(k[1]) < g: f.close() size= size -1 f = open(files[size], "w") print "\n**GROUP "+ str(size) g = group[size] f.write("%s "% k[0]) f.close() def writeTagPythonLists(T): group = [0, 20, 50, 100] size = 3 g = group[size] f = open("tagList.py", "w") f.write("#\n# Auto-generated\n#\n\n") f.write("tag%d = [" % size) firstchar = 1 for k in T: if int(k[1]) < g: f.write("]\n") firstchar=1 size = size - 1 f.write("\ntag%d = [" % size) g = group[size] if firstchar: count = 0 f.write("'%s'"% k[0]) firstchar=0 else: count+= 1 f.write(", '%s'"% k[0]) if count > 1000: break f.write("]\n") f.close() # # T is a list of (Tag, frequency) # def printTagCloud(T): group = [0, 20, 50, 100] size = 3 g = group[size] print g print "**********************************GROUP "+ str(size) for k in T: if int(k[1]) < g: size= size -1 print "\n**********************************GROUP "+ str(size) g = group[size] print "[",k[0],"]", getTagsFromDB(cursor) T= countTags() T = sortTags(T) #printTagCloud(T) writeTagFiles(T) writeTagPythonLists(T)