Editing
Talk:Machine Learning
(section)
Jump to navigation
Jump to search
Warning:
You are not logged in. Your IP address will be publicly visible if you make any edits. If you
log in
or
create an account
, your edits will be attributed to your username, along with other benefits.
Anti-spam check. Do
not
fill this in!
# == python to download and decompress nb-discuss archive ==
# <pre>
"""Download, decompress and parse the noisebridge-discuss pipermail archive.

The archive is published as one gzip-compressed mbox-style text file per
month.  This module builds the monthly URLs, fetches and decompresses them,
and splits the resulting text into per-message dictionaries.
"""
import io
import re
from contextlib import closing
from gzip import GzipFile
from time import gmtime

try:  # Python 2 module names, as originally written
    from StringIO import StringIO  # noqa: F401 (kept importable for callers)
    from urllib import urlopen
except ImportError:  # Python 3 equivalents
    from io import StringIO  # noqa: F401
    from urllib.request import urlopen

ARCHIVE_BASE_URL = 'https://www.noisebridge.net/pipermail/'
# (year, month) of the oldest monthly archive file.
FIRST_ARCHIVE = (2007, 11)
MONTH_NAMES = ('January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November',
               'December')
# Shared default cache for compiled_pattern().
_PATTERN_CACHE = {}


def decompress_url(u):
    """Fetch the gzip file at URL *u* and return its decompressed text.

    The payload is decoded as UTF-8 with undecodable bytes replaced, so the
    result can be fed straight to the str-based parsers below.
    """
    with closing(urlopen(u)) as response:
        compressed = response.read()
    with GzipFile(fileobj=io.BytesIO(compressed)) as gz:
        return gz.read().decode('utf-8', 'replace')


def date_in_discuss(m, y):
    """Return True if month *m* of year *y* lies within the archive's range.

    The range runs from November 2007 up to and including the current
    (UTC) month.
    """
    if not 1 <= m <= 12:
        return False
    now = gmtime()
    return FIRST_ARCHIVE <= (y, m) <= (now.tm_year, now.tm_mon)


def datestr(m, y):
    """Return 'YYYY-MonthName' for month *m* (1-12), or None if out of range."""
    # Explicit range check: m == 0 or a negative m would otherwise index the
    # tuple from the end and silently return the wrong month.
    if not 1 <= m <= 12:
        return None
    return '-'.join((str(y), MONTH_NAMES[m - 1]))


def nb_gz_url(m, y, listname='noisebridge-discuss'):
    """Return the archive URL for month *m* of year *y*, or None if invalid."""
    if not date_in_discuss(m, y):
        return None
    return ''.join((ARCHIVE_BASE_URL, listname, '/', datestr(m, y), '.txt.gz'))


def all_nb_gz_urls():
    """Yield every monthly archive URL from the first archive to this month."""
    now = gmtime()
    last = (now.tm_year, now.tm_mon)
    y, m = FIRST_ARCHIVE
    while (y, m) <= last:
        yield nb_gz_url(m, y)
        if m < 12:
            m += 1
        else:
            m = 1
            y += 1


def get_month(month, year):
    """Download and return the decompressed archive text for one month.

    Raises ValueError if the requested month is outside the archive's range.
    """
    u = nb_gz_url(month, year)
    if u is None:
        raise ValueError('no archive for month %r of year %r' % (month, year))
    return decompress_url(u)


def spew():
    """Yield the decompressed text of every monthly archive, oldest first."""
    for u in all_nb_gz_urls():
        yield decompress_url(u)


def dump_uncompressed(filename='nb_wtf.txt'):
    """Write the whole decompressed archive to *filename* (UTF-8 text)."""
    with io.open(filename, 'w', encoding='utf-8') as f:
        for s in spew():
            f.write(s)


def compiled_pattern(key, cache=None):
    """Return the compiled regex named *key* ('msg_start' or 'msg_stop').

    Patterns are built once and memoised in *cache* (a module-level dict
    when not supplied).  Returns None for an unknown key.
    """
    if cache is None:
        cache = _PATTERN_CACHE
    try:
        return cache[key]
    except KeyError:
        pass
    builders = {'msg_start': msg_start_pattern, 'msg_stop': msg_stop_pattern}
    builder = builders.get(key)
    if builder is None:
        return None
    cache[key] = builder()
    return cache[key]


def msg_start_pattern():
    """Return the compiled regex matching an mbox 'From ' separator line.

    Example line:
        'From jacob at appelbaum.net  Tue Nov 20 20:20:07 2007'
    Resulting pattern:
        r'^From .*\\s+\\w{3}\\s+\\w{3}\\s+\\d+\\s+\\d{2}:\\d{2}:\\d{2} \\d{4}$'
    """
    space = r'\s+'
    stamp = space.join((r'\w{3}', r'\w{3}', r'\d+',
                        r'\d{2}:\d{2}:\d{2} \d{4}'))
    pattern = ''.join(('^', 'From .*', space, stamp, '$'))
    return re.compile(pattern)


def msg_stop_pattern():
    """Return the compiled regex matching a line that ends a message body.

    Two delimiters are recognised: the attachment marker and the
    list-footer rule.
    """
    def anchor(s):
        return ''.join(('^', s, '$'))

    htmldelim = anchor('-------------- next part --------------')
    listdelim = anchor('_______________________________________________')
    return re.compile('|'.join((htmldelim, listdelim)))


def msglists(s):
    """Yield one list of lines per message found in archive text *s*.

    Any lines preceding the first 'From ' separator are yielded as their
    own (headerless) chunk.
    """
    msg = []
    start = compiled_pattern('msg_start')
    for line in s.splitlines():
        if start.match(line):
            if msg:
                yield msg
            msg = []
        msg.append(line)
    if msg:
        yield msg


def _parse_headers(lines):
    """Consume header lines from iterator *lines* and return them as a dict.

    Parsing stops after the 'Message-ID' header or at the first empty line,
    whichever comes first; the empty line itself is consumed.
    """
    header_list = []
    for line in lines:
        if not line:
            # An empty line ends the header section even when the message
            # has no Message-ID; otherwise the body would be swallowed.
            break
        name, sep, value = line.partition(':')
        if line[0] in ' \t' or not sep:
            # Folded continuation (or a line with no colon): belongs to
            # the previous header.
            if header_list:
                header_list[-1][1] += line
            else:
                header_list.append(['bogus_header', line])
            continue
        header_list.append([name, value.strip()])
        if name == 'Message-ID':
            break
    return dict(header_list)


def _split_body(lines):
    """Split the remaining *lines* into (contents, cruft) lists.

    Leading empty lines are skipped.  Everything from the first stop
    delimiter onwards is cruft; the delimiter line is its first element.
    """
    stop = compiled_pattern('msg_stop')
    contents = []
    cruft = []
    for line in lines:
        if not line and not contents:
            continue  # blank separator line(s) before the body starts
        if stop.match(line):
            cruft.append(line)
            cruft.extend(rest.rstrip() for rest in lines)
            break
        contents.append(line.rstrip())
    return contents, cruft


def msg2dict(msg):
    """Convert one message (a list of lines) into a dict.

    Returns {'bogus': msg} if *msg* does not begin with a 'From ' separator.
    Otherwise the dict has 'fromkey' (the separator line), 'headers' (dict),
    'contents' (list of body lines) and, if a stop delimiter was found,
    'cruft' (the delimiter and all lines after it).
    """
    d = dict()
    start = compiled_pattern('msg_start')
    if not (msg and start.match(msg[0])):
        d['bogus'] = msg
        return d
    lines = iter(msg)
    d['fromkey'] = next(lines)
    d['headers'] = _parse_headers(lines)
    contents, cruft = _split_body(lines)
    d['contents'] = contents
    if cruft:
        d['cruft'] = cruft
    return d


def msg2smtp(msg):
    """Convert one message (a list of lines) into a flat dict of mail fields.

    Keys are 'fromline', 'dateline', 'subjectline' (each present only when
    the matching header exists) and 'messageline' (the body joined with
    newlines).  A notice is printed for each missing header.
    """
    smtp = dict()
    msgd = msg2dict(msg)
    # 'bogus' chunks have neither headers nor contents; fall back to empty
    # values instead of raising KeyError.
    headers = msgd.get('headers', {})
    wanted = (('From', 'fromline'),
              ('Date', 'dateline'),
              ('Subject', 'subjectline'))
    for header, field in wanted:
        try:
            smtp[field] = headers[header]
        except KeyError:
            print('header not found:  %s' % field)
    smtp['messageline'] = '\n'.join(msgd.get('contents', ()))
    return smtp


def dicterator(s):
    """Yield msg2dict() of every message in archive text *s*."""
    for msg in msglists(s):
        yield msg2dict(msg)


def smtperator(s):
    """Yield msg2smtp() of every message in archive text *s*."""
    for msg in msglists(s):
        yield msg2smtp(msg)
# </pre>
Summary:
Please note that all contributions to Noisebridge are considered to be released under the Creative Commons Attribution-NonCommercial-ShareAlike (see
Noisebridge:Copyrights
for details). If you do not want your writing to be edited mercilessly and redistributed at will, then do not submit it here.
You are also promising us that you wrote this yourself, or copied it from a public domain or similar free resource.
Do not submit copyrighted work without permission!
To protect the wiki against automated edit spam, we kindly ask you to solve the following CAPTCHA:
Cancel
Editing help
(opens in new window)
Navigation menu
Personal tools
Not logged in
Talk
Contributions
Log in
Request account
Namespaces
Page
Discussion
English
Views
Read
Edit
Add topic
View history
More
Search
Dig in!
Noisebridge
- Status: MOVED
- Donate
- ABOUT
- Accessibility
- Vision
- Blog
Manual
MANUAL
Visitors
Participation
Community Standards
Channels
Operations
Events
EVENTS
Guilds
GUILDS
- Meta
- Electronics
- Fabrication
- Games
- Music
- Library
- Neuro
- Philosophy
- Funding
- Art
- Crypto
- Documentation/Wiki
Wiki
Recent Changes
Random Page
Help
Categories
(Edit)
Tools
What links here
Related changes
Special pages
Page information