Talk:Machine Learning: Difference between revisions
Jump to navigation
Jump to search
(utility code to retrieve & decompress archived nb-discuss) |
(some more glue code to slurp nb-discuss content) |
||
Line 5: | Line 5: | ||
== python to download and decompress nb-discuss archive == | == python to download and decompress nb-discuss archive == | ||
<pre> | <pre> | ||
import re | |||
from itertools import chain, islice | |||
from StringIO import StringIO | from StringIO import StringIO | ||
from gzip import GzipFile | from gzip import GzipFile | ||
from time import gmtime | from time import gmtime | ||
from urllib import urlopen | from urllib import urlopen | ||
from contextlib import closing | |||
def decompress_from_url(u):
    """Fetch the gzip file at URL *u* and return its decompressed contents."""
    # Pull the whole response into memory first, then gunzip the buffer.
    # (Python 2: urllib.urlopen and StringIO.StringIO.)
    with closing(urlopen(u)) as response:
        raw = response.read()
    with closing(StringIO(raw)) as buf:
        with GzipFile(fileobj=buf) as unzipped:
            return unzipped.read()
def discuss_gz_url(m, y): | def discuss_gz_url(m, y): | ||
Line 40: | Line 39: | ||
mm = range(11, 12 + 1) # start with November 2007 | mm = range(11, 12 + 1) # start with November 2007 | ||
elif y == now.tm_year: | elif y == now.tm_year: | ||
mm = range(1, now.tm_mon + 1) # end with current month | mm = range(1, now.tm_mon + 1) # end with current month | ||
else: | else: | ||
mm = range(1, 13) | mm = range(1, 13) | ||
Line 59: | Line 58: | ||
for s in spew(): | for s in spew(): | ||
f.write(s) | f.write(s) | ||
def from_at_pattern():
    """Compile the regex for a pipermail mbox separator line.

    Matches lines such as:
        'From jacob at appelbaum.net Tue Nov 20 20:20:07 2007'
    (pipermail rewrites '@' as ' at ' in the From line).
    """
    # Assembled from adjacent raw-string fragments instead of join()s;
    # the resulting pattern text is identical to the pieces it documents.
    pattern = (r'^From\s+\S+\s+at\s+\S+'        # 'From <user> at <host>'
               r'\s+\w{3}\s+\w{3}\s+\d+'        # weekday, month, day
               r'\s+\d{2}:\d{2}:\d{2} \d{4}$')  # hh:mm:ss yyyy
    return re.compile(pattern)
def msglists(s, fromp=from_at_pattern()):
    """Split mailbox text *s* into messages; yield each as a list of lines.

    A new message begins at every line matching *fromp* (the mbox
    'From ... at ...' separator).  Any lines before the first separator
    are yielded as a leading (header-less) chunk.
    """
    current = []
    for line in s.splitlines():
        # Flush the accumulated message when a new separator starts one.
        if fromp.match(line) and current:
            yield current
            current = []
        current.append(line)
    if current:
        yield current
def msg2dict(msg, fromp=None):
    """Parse one message (a list of lines) into a dict.

    Headers up to and including 'Message-ID' become key/value entries;
    the remaining non-blank lines become d['contents'] (a list of lines).
    Malformed input is recorded under d['bogus'] instead of raising.

    :param msg: list of strings, first line expected to match *fromp*
    :param fromp: compiled separator regex; defaults to from_at_pattern()
    :return: dict of headers plus 'contents', or {'bogus': ...} on bad input
    """
    if fromp is None:
        # Evaluated lazily (not as a def-time default) so the compiled
        # default pattern is resolved at call time.
        fromp = from_at_pattern()
    d = dict()
    if not msg or not fromp.match(msg[0]):
        d['bogus'] = msg
        return d
    stack = islice(msg, 1, None)
    for s in stack:
        t = s.split(':', 1)
        if len(t) == 2:
            k, v = t
            d[k] = v.strip()
            if k == 'Message-ID':
                # Message-ID is the last header we keep; body follows.
                break
        else:
            # Non "Key: value" line where a header was expected.
            d['bogus'] = s
            break
    # Skip blank lines separating headers from the body.
    # next(stack, None) instead of Py2-only stack.next(): a header-only
    # message used to leak StopIteration out of this plain function,
    # silently terminating caller loops like dicterator() (and an error
    # under PEP 479 / Python 3).  Now it yields empty contents instead.
    s = next(stack, None)
    while s is not None and not s:
        s = next(stack, None)
    if s is None:
        d['contents'] = []
    else:
        d['contents'] = list(chain((s,), stack))
    return d
def msg2smtp(msg):
    """Reduce one message (list of lines) to from/date/subject/message fields.

    Returns a dict with 'fromline', 'dateline', 'subjectline' (when the
    corresponding header is present and non-empty) and 'messageline' (the
    body text up to the pipermail HTML-attachment delimiter).  Bogus or
    body-less messages get only the header fields.
    """
    msgd = msg2dict(msg)
    smtp = dict()
    for header, key in (('From', 'fromline'),
                        ('Date', 'dateline'),
                        ('Subject', 'subjectline')):
        value = msgd.get(header)
        if value:
            smtp[key] = value
    if 'bogus' in msgd or 'contents' not in msgd:
        return smtp
    # Everything before pipermail's attachment delimiter is the body.
    htmldelim = '-------------- next part --------------'
    body = []
    for line in msgd['contents']:
        if line == htmldelim:
            break
        body.append(line)
    smtp['messageline'] = '\n'.join(body)
    return smtp
def dicterator(s):
    """Iterate over mailbox text *s*, producing one msg2dict() per message."""
    return (msg2dict(m) for m in msglists(s))
def smtperator(s):
    """Iterate over mailbox text *s*, producing one msg2smtp() per message."""
    return (msg2smtp(m) for m in msglists(s))
</pre> | </pre> | ||
== Word parsing python script == | == Word parsing python script == |
Revision as of 19:16, 2 March 2014
Feb. 27, 2014
Folks met and hacked on the Noisebridge discuss mailing list. We created a 102 MB text dump and a Python script to parse it (File:Py-piper-parser.txt). We wrote pseudocode to implement a naive Bayes filter to protect the world from trolls, and will implement it soon.
python to download and decompress nb-discuss archive
import re from itertools import chain, islice from StringIO import StringIO from gzip import GzipFile from time import gmtime from urllib import urlopen from contextlib import closing def decompress_from_url(u): with closing(urlopen(u)) as f: with closing(StringIO(f.read())) as fs: with GzipFile(fileobj = fs) as g: return g.read() def discuss_gz_url(m, y): if m < 1 or m > 12: return None if y < 2007: return None now = gmtime() if (y > now.tm_year) or (y == now.tm_year and m > now.tm_mon): return None mm = ('January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December') nb_pre = 'https://www.noisebridge.net/pipermail/noisebridge-discuss/' nb_post = '.txt.gz' s = '-'.join((str(y), mm[m-1])) return ''.join((nb_pre, s, nb_post)) def all_discuss_gz_urls(): now = gmtime() for y in range(2007, now.tm_year + 1): if y == 2007: mm = range(11, 12 + 1) # start with November 2007 elif y == now.tm_year: mm = range(1, now.tm_mon + 1) # end with current month else: mm = range(1, 13) for m in mm: yield discuss_gz_url(m, y) def discuss_a_month(month, year): u = discuss_gz_url(month, year) s = decompress_from_url(u) return s def spew(): for u in all_discuss_gz_urls(): yield decompress_from_url(u) def dump_uncompressed(filename="nb_wtf.txt"): with open(filename, "w") as f: for s in spew(): f.write(s) def from_at_pattern(): # ... 
and so it begins: # 'From jacob at appelbaum.net Tue Nov 20 20:20:07 2007' # -> r'^From \S+ at \S+\s+\w{3}\s+\w{3}\s+\d+\s+\d{2}:\d{2}:\d{2} \d{4}$' # (return compiled regex roughly equivalent to the above) space = r'\s+' chars = r'\S+' from_at = space.join(('From', chars, 'at', chars)) datestr = space.join((r'\w{3}', r'\w{3}', r'\d+')) timestamp = r'\d{2}:\d{2}:\d{2} \d{4}' anchor = lambda s: ''.join(('^', s, '$')) return re.compile(anchor(space.join((from_at, datestr, timestamp)))) def msglists(s, fromp=from_at_pattern()): # yield list of strings for each msg in string s msg = [] for r in s.splitlines(): if fromp.match(r): if msg: yield msg msg = [] msg.append(r) if msg: yield msg def msg2dict(msg, fromp=from_at_pattern()): # msg is list of strings # return dict with headers, contents, etc d = dict() if not msg or not fromp.match(msg[0]): d['bogus'] = msg return d stack = islice(msg, 1, None) for s in stack: t = s.split(':', 1) if len(t) == 2: k, v = t d[k] = v.strip() if k == 'Message-ID': break else: d['bogus'] = s break # skip any leading blank lines s = stack.next() while not s: s = stack.next() d['contents'] = list(chain((s,), stack)) return d def msg2smtp(msg): smtp = dict() msgd = msg2dict(msg) q = (('From', 'fromline'), ('Date', 'dateline'), ('Subject', 'subjectline')) for t in q: k, j = t s = msgd.get(k) if s: smtp[j] = s if ('bogus' in msgd) or not ('contents' in msgd): return smtp message = '' htmldelim = '-------------- next part --------------' for s in msgd['contents']: if s == htmldelim: break if message: message += '\n' message += s smtp['messageline'] = message return smtp def dicterator(s): for msg in msglists(s): yield msg2dict(msg) def smtperator(s): for msg in msglists(s): yield msg2smtp(msg)
Word parsing python script
The function 'get_words' takes a list of dictionaries of emails. For each message, it yields the list of words in that message:
def get_words(lst):
    """For each email dict in *lst*, yield the list of whitespace-split
    words of its 'messageline' text."""
    for email in lst:
        yield email['messageline'].split()
There are plans to improve this by using NLTK.[1]