Talk:Machine Learning: Difference between revisions

From Noisebridge
Jump to navigation Jump to search
(utility code to retrieve & decompress archived nb-discuss)
(some more glue code to slurp nb-discuss content)
Line 5: Line 5:
== python to download and decompress nb-discuss archive ==
== python to download and decompress nb-discuss archive ==
<pre>
<pre>
import re
from itertools import chain, islice
from StringIO import StringIO
from StringIO import StringIO
from gzip import GzipFile
from gzip import GzipFile
from time import gmtime
from time import gmtime
from urllib import urlopen
from urllib import urlopen
from contextlib import closing


def decompress_from_url(u):
def decompress_from_url(u):
   # return GzipFile(fileobj = StringIO(urlopen(u).read())).read()
   with closing(urlopen(u)) as f:
  f = urlopen(u)
    with closing(StringIO(f.read())) as fs:
  fs = StringIO(f.read())
      with GzipFile(fileobj = fs) as g:
  g = GzipFile(fileobj = fs)
        return g.read()
  s = g.read()
  for x in (f, fs, g):
    x.close()
  return s


def discuss_gz_url(m, y):
def discuss_gz_url(m, y):
Line 40: Line 39:
       mm = range(11, 12 + 1)  # start with November 2007
       mm = range(11, 12 + 1)  # start with November 2007
     elif y == now.tm_year:
     elif y == now.tm_year:
       mm = range(1, now.tm_mon + 1)  # end with current month  
       mm = range(1, now.tm_mon + 1)  # end with current month
     else:
     else:
       mm = range(1, 13)
       mm = range(1, 13)
Line 59: Line 58:
     for s in spew():
     for s in spew():
       f.write(s)
       f.write(s)
def from_at_pattern():
  # ... and so it begins:
  # 'From jacob at appelbaum.net  Tue Nov 20 20:20:07 2007'
  # -> r'^From \S+ at \S+\s+\w{3}\s+\w{3}\s+\d+\s+\d{2}:\d{2}:\d{2} \d{4}$'
  # (return compiled regex roughly equivalent to the above)
  space = r'\s+'
  chars = r'\S+'
  from_at = space.join(('From', chars, 'at', chars))
  datestr = space.join((r'\w{3}', r'\w{3}', r'\d+'))
  timestamp = r'\d{2}:\d{2}:\d{2} \d{4}'
  anchor = lambda s: ''.join(('^', s, '$'))
  return re.compile(anchor(space.join((from_at, datestr, timestamp))))
def msglists(s, fromp=from_at_pattern()):
  # yield list of strings for each msg in string s
  msg = []
  for r in s.splitlines():
    if fromp.match(r):
      if msg:
        yield msg
        msg = []
    msg.append(r)
  if msg:
    yield msg
def msg2dict(msg, fromp=from_at_pattern()):
  # msg is list of strings
  # return dict with headers, contents, etc
  d = dict()
  if not msg or not fromp.match(msg[0]):
    d['bogus'] = msg
    return d
  stack = islice(msg, 1, None)
  for s in stack:
    t = s.split(':', 1)
    if len(t) == 2:
      k, v = t
      d[k] = v.strip()
      if k == 'Message-ID':
        break
    else:
      d['bogus'] = s
      break
  # skip any leading blank lines
  s = stack.next()
  while not s:
    s = stack.next()
  d['contents'] = list(chain((s,), stack))
  return d
def msg2smtp(msg):
  smtp = dict()
  msgd = msg2dict(msg)
  q = (('From', 'fromline'), ('Date', 'dateline'), ('Subject', 'subjectline'))
  for t in q:
    k, j = t
    s = msgd.get(k)
    if s:
      smtp[j] = s
  if ('bogus' in msgd) or not ('contents' in msgd):
    return smtp
  message = ''
  htmldelim = '-------------- next part --------------'
  for s in msgd['contents']:
    if s == htmldelim:
      break
    if message:
      message += '\n'
    message += s
  smtp['messageline'] = message
  return smtp
def dicterator(s):
  for msg in msglists(s):
    yield msg2dict(msg)
def smtperator(s):
  for msg in msglists(s):
    yield msg2smtp(msg)


</pre>
</pre>


== Word parsing python script ==
== Word parsing python script ==

Revision as of 19:16, 2 March 2014

Feb. 27, 2014

Folks met and hacked on the noisebridge discuss mailing list. We created a 102MB text dump, and a python script to parse it, File:Py-piper-parser.txt. We wrote pseudo code to implement a Naive Bayesian filter to protect the world from trolls. Will implement soon.

python to download and decompress nb-discuss archive

import re
from itertools import chain, islice
from StringIO import StringIO
from gzip import GzipFile
from time import gmtime
from urllib import urlopen
from contextlib import closing

def decompress_from_url(u):
  """Fetch URL u and return its gzip-decompressed payload as a string."""
  # One with-statement, three managers: response, in-memory buffer, gzip reader.
  with closing(urlopen(u)) as resp, \
       closing(StringIO(resp.read())) as buf, \
       GzipFile(fileobj=buf) as gz:
    return gz.read()

def discuss_gz_url(m, y):
  """Return the pipermail .txt.gz URL for month m (1-12) of year y.

  Returns None when m is out of range, y predates 2007, or the
  requested month is in the future (relative to UTC now).
  """
  if not 1 <= m <= 12:
    return None
  if y < 2007:
    return None
  now = gmtime()
  # Lexicographic tuple compare == "year later, or same year and month later".
  if (y, m) > (now.tm_year, now.tm_mon):
    return None
  month_names = ('January', 'February', 'March', 'April', 'May', 'June',
                 'July', 'August', 'September', 'October', 'November',
                 'December')
  base = 'https://www.noisebridge.net/pipermail/noisebridge-discuss/'
  return base + str(y) + '-' + month_names[m - 1] + '.txt.gz'

def all_discuss_gz_urls():
  """Yield one archive URL per month, November 2007 through the current month (UTC)."""
  now = gmtime()
  for y in range(2007, now.tm_year + 1):
    if y == 2007:
      first, last = 11, 12  # the archive starts in November 2007
    else:
      first = 1
      last = now.tm_mon if y == now.tm_year else 12
    for m in range(first, last + 1):
      yield discuss_gz_url(m, y)

def discuss_a_month(month, year):
  """Download and return the decompressed archive text for one month."""
  return decompress_from_url(discuss_gz_url(month, year))

def spew():
  """Return an iterator over the decompressed text of every monthly archive, oldest first."""
  return (decompress_from_url(u) for u in all_discuss_gz_urls())

def dump_uncompressed(filename="nb_wtf.txt"):
  with open(filename, "w") as f:
    for s in spew():
      f.write(s)

def from_at_pattern():
  """Compile the regex matching pipermail mbox separator lines, e.g.

      'From jacob at appelbaum.net  Tue Nov 20 20:20:07 2007'

  The pattern string is identical to the one the original glue code
  assembled piecewise.
  """
  return re.compile(
      r'^From\s+\S+\s+at\s+\S+'        # 'From <user> at <host>'
      r'\s+\w{3}\s+\w{3}\s+\d+'        # weekday, month, day-of-month
      r'\s+\d{2}:\d{2}:\d{2} \d{4}$'   # HH:MM:SS YYYY
  )

def msglists(s, fromp=from_at_pattern()):
  """Split mbox text s into messages; yield each message as a list of lines.

  A new message begins at every line matching fromp (the mbox 'From '
  separator, compiled once at def time).  Any lines preceding the first
  separator are yielded as their own chunk.
  """
  current = []
  for line in s.splitlines():
    # Flush the accumulated message when a new separator line appears.
    if fromp.match(line) and current:
      yield current
      current = []
    current.append(line)
  if current:
    yield current

def msg2dict(msg, fromp=from_at_pattern()):
  """Parse one message (a list of lines, as produced by msglists).

  Returns a dict mapping header names to values for the headers up to
  and including Message-ID, plus 'contents' (the body lines, leading
  blanks skipped).  A message that does not start with a 'From '
  separator line is returned whole under 'bogus'; a malformed header
  line is recorded under 'bogus' and stops header parsing.
  """
  d = dict()
  if not msg or not fromp.match(msg[0]):
    d['bogus'] = msg
    return d
  rest = islice(msg, 1, None)  # everything after the 'From ' line
  for line in rest:
    t = line.split(':', 1)
    if len(t) == 2:
      k, v = t
      d[k] = v.strip()
      if k == 'Message-ID':
        break  # headers of interest end at Message-ID
    else:
      # not a 'Header: value' line -- flag it and stop header parsing
      d['bogus'] = line
      break
  # Skip blank lines between headers and body.  Use the next() builtin
  # with a sentinel: the old `stack.next()` was Python-2-only and raised
  # StopIteration on body-less messages, silently truncating the callers'
  # generator loops (and a RuntimeError under PEP 479 on Python 3).
  line = next(rest, None)
  while line == '':
    line = next(rest, None)
  if line is None:
    d['contents'] = []  # message had no body at all
  else:
    d['contents'] = list(chain((line,), rest))
  return d

def msg2smtp(msg):
  """Reduce one message (list of lines) to a small smtp-style dict.

  Copies the From/Date/Subject headers (when present and non-empty) to
  fromline/dateline/subjectline, then joins the body lines -- up to the
  pipermail attachment delimiter -- into 'messageline'.
  """
  msgd = msg2dict(msg)
  smtp = dict()
  for header, key in (('From', 'fromline'),
                      ('Date', 'dateline'),
                      ('Subject', 'subjectline')):
    value = msgd.get(header)
    if value:
      smtp[key] = value
  if 'bogus' in msgd or 'contents' not in msgd:
    return smtp
  delim = '-------------- next part --------------'
  body = []
  for line in msgd['contents']:
    if line == delim:
      break  # drop the HTML/attachment part pipermail appends
    # Mirror the original accumulator: blank lines before any text
    # contribute nothing; after the first non-blank line every line counts.
    if line or body:
      body.append(line)
  smtp['messageline'] = '\n'.join(body)
  return smtp

def dicterator(s):
  """Return an iterator of msg2dict results, one per message in mbox text s."""
  return (msg2dict(m) for m in msglists(s))

def smtperator(s):
  """Return an iterator of msg2smtp results, one per message in mbox text s."""
  return (msg2smtp(m) for m in msglists(s))

Word parsing python script

Function 'get_words' takes a list of email dictionaries. For each message, it yields the list of words in that message's text:

 def get_words(lst):
   """For each email dict in lst, yield its 'messageline' split into words."""
   for email in lst:
     yield email['messageline'].split()

Plans to improve by using nltk[1]