#!/usr/bin/env python
'''
Takes a text dump from a pipermail archive and pickles a list of
dictionaries, one per message:

    [{'fromline': ' ', 'dateline': ' ', 'subjectline': ' ', 'messageline': ' '},
     ...,
     {lastmessage}]
'''

import pickle

data_file = '/.../nb.txt'
output_file = 'data/discuss.p'
htmldelim = '-------------- next part --------------'


def _header_value(line, marker):
    '''Return the stripped text that follows the last `marker` in `line`.'''
    return line.split(marker)[-1].strip()


def parse_messages(lines):
    '''
    Split an iterable of archive lines into per-message dictionaries.

    lines -- any iterable of text lines (an open file works), each normally
             ending in a newline.

    Returns a list of dicts.  Every dict has 'fromline' and 'messageline';
    'dateline' and 'subjectline' are present only when the matching header
    was seen for that message.  'messageline' is the concatenation of the
    raw lines collected after the 'Message-ID:' header.

    A record is closed by the next 'From:' header, by the attachment
    delimiter (`htmldelim`), or by the end of the input.
    '''
    messages = []
    current = {}
    body = []            # body lines, joined once when the record is closed
    in_body = False      # True once 'Message-ID:' has been seen

    for line in lines:
        # Lines containing '>' are treated as quoted text and never as
        # headers or delimiters.
        quoted = '>' in line

        if htmldelim in line and not quoted:
            # Everything after the delimiter is attachment text: close the
            # record here and ignore lines until the next 'From:' header.
            current['messageline'] = ''.join(body)
            if 'fromline' in current:
                messages.append(current)
            current, body, in_body = {}, [], False
            continue

        if 'From:' in line and not quoted:
            # A 'From:' header while a record is already open means the
            # previous message ended without a delimiter.
            if 'fromline' in current:
                current['messageline'] = ''.join(body)
                messages.append(current)
                current, body, in_body = {}, [], False
            current['fromline'] = _header_value(line, 'From:')
            continue

        if 'Date:' in line and not quoted:
            # Keep only the first date seen for a message; later matches
            # are consumed without being stored or added to the body.
            if 'dateline' not in current:
                current['dateline'] = _header_value(line, 'Date:')
            continue

        if 'Subject:' in line and not quoted:
            current['subjectline'] = _header_value(line, 'Subject:')
            continue

        # No '>' test here: the original did not apply one to this header.
        if 'Message-ID:' in line:
            in_body = True
            continue

        if in_body:
            body.append(line)

    # Flush the record still open at end of input; previously the last
    # message was lost unless a delimiter followed it.
    if 'fromline' in current:
        current['messageline'] = ''.join(body)
        messages.append(current)

    return messages


def main():
    '''Parse `data_file` and pickle the resulting list to `output_file`.'''
    with open(data_file, 'r') as f:
        message_container = parse_messages(f)

    # Use a context manager so the pickle file is flushed and closed.
    with open(output_file, 'wb') as out:
        pickle.dump(message_container, out)


if __name__ == '__main__':
    main()