torsten: utils/testing remove_kolab_doubles.py,NONE,1.1

cvs at intevation.de cvs at intevation.de
Fri Mar 24 11:39:08 CET 2006


Author: torsten

Update of /kolabrepository/utils/testing
In directory doto:/tmp/cvs-serv20028

Added Files:
	remove_kolab_doubles.py 
Log Message:
Added a short script for "removing" duplicated appointment in a mailfolder.


--- NEW FILE: remove_kolab_doubles.py ---
#!/usr/bin/env python
"""Tries to remove Kolab Event files doubled in a very special way.

Will search the current working directory for files like [1-9][0-9]*.
WARNING: This is a script for a one time situation! Only run under supervision!

Usage: remove_kolab_doubles.py -d movetodirectory [--dry-run]

Options:
    -h / --help     Print this message and exit.

    -d directory
    --directory=directory
        Doubles will be moved to this directory.
        If directory does not exist, it will be created.

Roughly tested with Python2.1 on woody (needs python2.1-xml)
and a backported python2.1-email package.
"""

#initial 20060323/24 Bernhard <bernhard at intevation.de>
#
# This program is free software under the GNU GPL (>=v2)
# Read the file COPYING coming with the software for details.

# Slice the CVS "$Revision$" keyword expansion: drops the leading
# "$Revision:" and the trailing "$".
__version__="$Revision: 1.1 $"[10:-1]

import sys
import os
import getopt
import errno
import mimetypes
import email

import re
import md5

import xml.dom.minidom


# Module-wide output verbosity, read by the functions below.
# verbose = 0 (only summary messages), 1 (diagnostics per file)
#           2 (verbose infos on per file operations)
#           3 (debug per file operations)
verbose = 1
def usage(code, msg=''):
    """Write the module help text to stderr and terminate the program.

    code -- exit status handed to sys.exit().
    msg  -- optional extra message, written on its own line after the
            help text when it is non-empty.
    """
    err = sys.stderr
    err.write("%s\n" % (__doc__,))
    if msg:
        # msg may be a plain string or an exception instance; "%s"
        # stringifies either one.
        err.write("%s\n" % (msg,))
    sys.exit(code)

def main():
    """Find doubled Kolab event files in the cwd and move the doubles away.

    Parses the command line (see the module docstring), scans the
    current working directory for message files named "<number>." and
    groups the Kolab events found in them by an MD5 checksum of their
    XML with the <uid> and <last-modification-date> elements removed.
    Of each group with more than one file, all but the highest numbered
    file are moved into the directory given with -d; with --dry-run the
    files are only listed.

    Exits through usage() on bad options or when -d is missing.
    Errors while reading, parsing, hashing or moving a file propagate
    to the caller.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hd:', ['help', 'directory=', 'dry-run'])
    except getopt.error, msg:
        usage(1, msg)

    targetdir = None
    dryrun = 0
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-d', '--directory'):
            targetdir = arg
        elif opt == '--dry-run':
            dryrun = 1

    if not targetdir:
        usage(1, "Need a directory to move files to.\n")

    if not dryrun:
        try:
            os.mkdir(targetdir)
        except OSError, e:
            # Ignore directory exists error
            if e.errno != errno.EEXIST: raise

    # Read all names from the current directory and keep only those
    # that look like "<number>." .
    namepattern = re.compile("^[1-9][0-9]*\\.$")
    filelist = filter(namepattern.match, os.listdir("."))

    print "Scanning %d files:" % len(filelist)

    # Build a dictionary with the object checksum as key and the list
    # of file names that have this checksum as value.
    # E.g. for one entry: { "h3u5o6uid": ["19374.", "19375.", "19376."] }
    compareheap = {}

    count = 0
    for filename in filelist:
        count += 1
        # One progress dot per 100 files.
        if count % 100 == 0:
            sys.stdout.write(".")
            sys.stdout.flush()

        if verbose > 1: print 'Dealing with "%s"' % filename
        fp = open(filename)
        try:
            # Close the file even when parsing the message fails.
            msg = email.message_from_file(fp)
        finally:
            fp.close()

        if "X-Kolab-Type" not in msg:
            if verbose > 0:
                sys.stderr.write(
                    "File %s not X-Kolab-Type, ignoring.\n" % filename)
            continue

        try:
            kolabeventxml = extractkolabeventxml(msg)
        except:
            # Name the offending file, then let the error propagate.
            sys.stderr.write("Something wrong with file %s!\n" % filename)
            raise

        # Messages without an event part are skipped silently.
        if not kolabeventxml: continue

        # We want to find files that are the same but differ only
        # in the <uid> and <last-modification-date> elements,
        # so we clean those elements from the xml for comparison.
        comparexml = cookcomparexml(kolabeventxml)

        try:
            # md5 only eats regular strings; cookcomparexml hands back
            # the cleaned xml already encoded as UTF-8.
            checksum = md5.new(comparexml).digest()
        except:
            sys.stderr.write("Trouble hashing file %s!\n" % filename)
            raise

        compareheap.setdefault(checksum, []).append(filename)
    print "completed." # scanning

    print "Found %d Kolab Event Objects.\n" % len(compareheap)

    if dryrun:
        print "Dry-run enabled. WOULD move away:"
    else:
        print "Moving away:"

    for filenamelist in compareheap.values():
        if len(filenamelist) < 2:
            # No doubles for this object.
            continue

        if verbose > 1: print filenamelist
        # Sort by numeric value so the highest numbered file is last.
        filenamelist.sort(comparecyrusmsgnames)

        # Move away the lower numbered ones, keep the last one.
        for filename in filenamelist[:-1]:
            target = os.path.join(targetdir, filename)
            if os.path.exists(target):
                # os.rename() may silently replace an existing file;
                # never destroy a file that was moved away earlier.
                sys.stderr.write(
                    'Not moving "%s": "%s" already exists.\n'
                    % (filename, target))
                continue
            print '"%s"' % filename,
            if not dryrun:
                os.rename(filename, target)

    print "Done."

def comparecyrusmsgnames(a,b):
    """Order two cyrus msg file names by their number, e.g. '121.' > '20.' .

    The last character of each name (the dot) is cut off and the rest
    is converted with int().  Returns -1, 0 or 1 in the manner of a
    list.sort() comparison function.
    """
    first = int(a[:-1])
    second = int(b[:-1])
    # Sign of (first - second): at most one of the comparisons is true.
    return (first > second) - (first < second)


def extractkolabeventxml(msg):
    """Returns string containing the x-vnd.kolab.event part of an emailobject.

    msg is walked with msg.walk(); the decoded payload of the single
    part with content type application/x-vnd.kolab.event is returned.
    Returns None if no such part is found.

    Will raise IndexError, if two mime parts of application/x-vnd.kolab.event
    are found.
    """

    kolabxmlpart = None
    for part in msg.walk():
        if part.get_content_type() != 'application/x-vnd.kolab.event':
            continue
        # Compare against None explicitly: the truth value of a message
        # object is derived from its header count, not from its presence.
        if kolabxmlpart is not None:
            sys.stderr.write("Arg, found second kolabxml part!\n")
            raise IndexError(
                "more than one application/x-vnd.kolab.event part")
        kolabxmlpart = part

    if kolabxmlpart is None:
        return None

    # decode=1 undoes the Content-Transfer-Encoding of the part.
    kolabxml = kolabxmlpart.get_payload(decode=1)

    # write() rather than the print statement: same output, and the
    # function stays parseable by both Python 2 and Python 3.
    if verbose > 3: sys.stdout.write("%s\n" % (kolabxml,))

    return kolabxml

def cookcomparexml(kolabxml):
    """Remove special elements from a kolabxml string.

    Parses kolabxml, removes every direct child element of the root
    element that is named 'uid' or 'last-modification-date' and returns
    the remaining document serialised and encoded as UTF-8.

    Parse errors raised by xml.dom.minidom propagate to the caller.
    """

    dom = xml.dom.minidom.parseString(kolabxml)
    try:
        # documentElement is the root element even when a comment or
        # doctype precedes it; firstChild would be that other node.
        event = dom.documentElement
        # Loop over a copy: removeChild() shrinks the live childNodes
        # list, and looping over that list directly would skip the
        # sibling that follows each removed node.
        for child in list(event.childNodes):
            if child.nodeType != dom.ELEMENT_NODE:
                continue
            if child.tagName in ('uid', 'last-modification-date'):
                # The detached subtree is out of reach of dom.unlink(),
                # so break its reference cycles right away.
                event.removeChild(child).unlink()

        cleanedxml = dom.toxml().encode("utf-8", "replace")
    finally:
        # Free the DOM tree even if serialising failed.
        dom.unlink()

    # write() rather than the print statement: same output, and the
    # function stays parseable by both Python 2 and Python 3.
    if verbose > 2: sys.stdout.write("%s\n" % (cleanedxml,))

    return cleanedxml

# Run main() only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()





More information about the commits mailing list