Integrating email and forum software

2010/12/19

Exporting a mailman archive to MySQL

Filed under: mailman, python — andrewsinlondon @ 16:34

I live!

Here’s a Python 3 script to export a mailman archive into a useful format in MySQL.

This is a big step in migrating from mailman to A.N.Other web-based forum (Buddypress, Drupal, whatever).

Set your MySQL login details about halfway down, on lines 168-169 (highlighted below)

#!/usr/bin/env python3

"""mbox_to_mysql: Import messages from mbox files to a mysql database """

# Mbox message handing is being done by mailbox module, which is a python core module.
#   However, mbox FILE handling is done internally in this file, because the
#   mailbox module does far more than we need, and takes up a lot of time to do it.
#
# Database handling is done through mysql.connector, which is not a core module;
#   Get it from https://launchpad.net/myconnpy

__author__ = "Andrew Smith"
__license__ = "GNU General Public License version 2"
__date__="December 2010"
__version__="0.4"

try:
  import hashlib
  from html.parser import HTMLParser
  from mailbox import mboxMessage
  import mysql.connector
  import os
  import re
  import sys
  import time
except ImportError:
  print ("Failed while importing modules")
  exit(-2)


#===================================================================


class Striptags(HTMLParser):
  '''Create a descendant class of the standard library's html parser, and
     as tags are found, strip them out.
     Replace block-level tags with two line breaks, and <br> tags with
     a single line break'''

  def __init__(self):
    HTMLParser.__init__(self)
    self.__text = []

  def handle_data(self, data):
    text = data.strip()
    if len(text) &gt; 0:
        self.__text.append(re.sub('[ \t\r\n]+', ' ', text) + ' ')

  def handle_starttag(self, tag, attrs):
    if tag in ('p','div','pre'):
        self.__text.append('\n\n')
    elif tag == 'br':
        self.__text.append('\n')

  def handle_startendtag(self, tag, attrs):
    if tag == 'br':
        self.__text.append('\n\n')

  def text(self):
    return ''.join(self.__text).strip()


#===================================================================


def apsmbox(filename):
  ''' generator that opens an mbox file, and yields one message for each iteration'''
  inFile=True
  with open(filename, 'r', errors='replace', encoding='utf-8') as mboxfile: # , newline='' 
    buffer=[mboxfile.readline()]
    while inFile:
      line =mboxfile.readline()
      if not line: inFile=False
      if inFile and not line.startswith('From '):
        buffer.append(line)
      else:
        from_line = buffer[0].replace('\n', '')
        string=''.join(buffer[1:])  # .replace(os.linesep, '\n')
        msg = mboxMessage(string)
        msg.set_from(from_line[5:])
        if inFile: buffer=[line]
        yield msg


#===================================================================


def usage():
  print ("mbox_mysql.py: Import messages from mbox files to a mysql database\n")
  print ("%s file.mbox Database_Name Table_Name" % sys.argv[0])
  print ("Example: ./%s mailmanlist.mbox from_mailman\n" % sys.argv[0])
  exit(0)


#===================================================================


def encodeattr(msg,fld):
  '''Takes a message and a field and encodes the value of msg[fld] as an md5.
     Returns an empty string if field is empty'''
  t=msg.__getitem__(fld)
  if t is None:return ''
  return hashlib.md5(t.encode('utf-8')).hexdigest()


#===================================================================

def clean_body(body_in):
  ''' parse body_in, strip html, strip quoted text from end of message'''

  new_body = []
  buffer   = []
  body     = body_in.splitlines()

  # Get rid of quoted lines at end
  for line in body:
    buffer.append(line)
    tst = line.lstrip()
    if tst and not tst.startswith('&gt;') and not tst.endswith('wrote:'):
      new_body.extend(buffer)
      buffer=[]

  alltext = '\n'.join(new_body)
  if alltext.lower().find('&lt;html') == -1:
    return alltext

  try:
    parser = Striptags()
    parser.feed( alltext )
    parser.close()
    return parser.text()
  except Exception as e:
    return re.sub('(]*&gt;)+','\n',alltext)
  


#===================================================================


def main():

  starttime = time.time()

  # first some sanity tests on the command-line arguments
  #sys.argv = ['mbox_to_mysql','list1.mbox','mailman','lists',] # !@!@! APS here for testing purposes only - comment out for live run

  if len(sys.argv) != 4:
    usage()
    exit(-2)
  mbox = sys.argv[1]
  db_name = sys.argv[2]
  table_name = sys.argv[3]

  if mbox[-5:] == '.mbox':
    source=mbox[:-5].lower().replace('/','').replace('-','')
  else:
    print ("Please give an mbox file.\nQuitting...")
    exit(-2)


  # mysql table creation string
  # if you change the string, you'll have to change the related entries on the mysql queries below

  create_table = """
  CREATE TABLE %s (
  id INT NOT NULL AUTO_INCREMENT,
  sent DATETIME,
  messageid CHAR(32),
  inreplyto CHAR(32),
  subject VARCHAR(200),
  sender VARCHAR(100),
  list VARCHAR(30),
  body LONGTEXT,
  PRIMARY KEY(id),
  UNIQUE KEY(messageid)
  );
  """

  oConfig={
    'host': 'localhost',
    'user': 'username',                           # YOUR MYSQL USERNAME HERE
    'password': 'password',                     # YOUR MYSQL PASSWORD HERE
    'charset': 'utf8',
    'use_unicode': True,
    'get_warnings': True,
    'port': 3306,
    }

  try:
    connection = mysql.connector.Connect(**oConfig)
  except (mysql.connector.errors.OperationalError,mysql.connector.errors.ProgrammingError,TypeError) as e:
    print ("Failed to connect to db. Check the connection config. Quitting...\n")
    exit(-2)
  except (mysql.connector.errors.InterfaceError) as e:
    print ("Failed to connect to db. Check MySQL is running. Quitting...\n")
    exit(-2)

  db = connection.cursor()

  # try to create database: if it already exists, use it

  try:
    db.execute("create database %s",(db_name,))
  except (mysql.connector.errors.OperationalError) as e:
    print ("Can't create database, seems that database %s already exists. We'll use it then " % db_name)

  db.execute("use %s",(db_name,))

  # try to create table: if it already exists, use it

  try:
    db.execute('DROP TABLE IF EXISTS %s',(table_name,))
    db.execute(create_table,(table_name,))
    print ("Successfully created table %s" % table_name)
  except (mysql.connector.errors.DatabaseError) as e:
    print ("Can't create table, seems that table %s exists. We'll use it then " % table_name)
  except (mysql.connector.errors.InterfaceError) as e:
    print ("Error when creating table")

  warnings = db.fetchwarnings()
  if warnings:
    print(warnings)

  # cycle over all the messages in the mbox file

  counter = 0
  for message in apsmbox(mbox):

    # extract from the message just the headings we want, and the body text

    fromline = message.get_from()

    # the msg object is used to store the headings we want

    msg={}
    msg['Subject']    = message.__getitem__('Subject')
    msg['Message-ID'] = encodeattr(message,'Message-ID')
    msg['In-Reply-To']= encodeattr(message,'In-Reply-To')
    #msg['References'] = message.get_all('References')
    #print("\n"+ '_'*45 + "\n %s" % fromline)
    print("%s" % fromline)

    breaker=fromline.find(' ')
    msg['From']=fromline[0:breaker]
    msg['Date']=time.strftime('%Y-%m-%d %H:%M:%S',time.strptime(fromline[breaker+1:])) #breaker+24

    # go find the very first message part - this should be the plaintext part, if present

    msgtext=message
    while msgtext.is_multipart and msgtext.is_multipart():
      msgtext = msgtext.get_payload(0)
    msgtext = clean_body(msgtext.get_payload())

    # got all the data we need - now stick it into the database

    try:
      db.execute("Insert into %s (id,sent,messageid,inreplyto,subject,sender,list,body) values(NULL,'%s','%s','%s','%s','%s','%s','%s')", \
      (table_name,msg['Date'],msg['Message-ID'],msg['In-Reply-To'],msg['Subject'],msg['From'],source,msgtext,))
    except Exception as e:
      warnings = db.fetchwarnings()
      if warnings:print(warnings)
      print("%s" % table_name)
      print("%s" % msg['Date'])
      print("%s" % msg['Message-ID'])
      print("%s" % msg['In-Reply-To'])
      print("%s" % msg['Subject'])
      print("%s" % msg['From'])
      print("%s" % source)
      print("%s" % msgtext)

    counter = counter+1

  connection.commit()

  # finished, so now report summary information about what we've done

  print ('*'*45 + "\nCreated table %s and inserted %s emails\n" % (table_name, counter))

  db.close()
  connection.close()
  print (starttime)
  print (time.time())
  print('time was %f seconds' % (time.time()-starttime))
  exit(1)

#===================================================================

if __name__ == '__main__':
   main()
About these ads

2 Comments »

  1. Hi

    Could we use this script to populate a wordpress based website, given a specific category ?
    Will posts will be such a questions and the answers will be populated in the comments ?

    Thank you

    Comment by Rad — 2011/02/12 @ 15:41

  2. This script could form the core of doing that – but it’s not enough on its own, it would need a bit of work to get it to do that.

    Comment by andrewsinlondon — 2011/02/12 @ 15:47


RSS feed for comments on this post. TrackBack URI

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

The WordPress Classic Theme Blog at WordPress.com.

Follow

Get every new post delivered to your Inbox.

%d bloggers like this: