torsten: utils/testing remove_kolab_doubles.py,NONE,1.1
cvs at intevation.de
cvs at intevation.de
Fri Mar 24 11:39:08 CET 2006
Author: torsten
Update of /kolabrepository/utils/testing
In directory doto:/tmp/cvs-serv20028
Added Files:
remove_kolab_doubles.py
Log Message:
Added a short script for "removing" duplicated appointment in a mailfolder.
--- NEW FILE: remove_kolab_doubles.py ---
#!/usr/bin/env python
"""Tries to remove Kolab Event files doubled in a very special way.
Will search the current working directory for files like [1-9][0-9]*.
WARNING: This is a script for a one time situation! Only run under supervision!
Usage: remove_kolab_doubles.py -d movetodirectory [--dry-run]
Options:
-h / --help Print this message and exit.
-d directory
--directory=directory
Doubles will be moved to this directory.
If directory does not exist, it will be created.
Roughly tested with Python2.1 on woody (needs python2.1-xml)
and a backported python2.1-email package.
"""
#initial 20060323/24 Bernhard <bernhard at intevation.de>
#
# This program is free software under the GNU GPL (>=v2)
# Read the file COPYING coming with the software for details.
__version__="$Revision: 1.1 $"[10:-1]
import sys
import os
import getopt
import errno
import mimetypes
import email
import re
import md5
import xml.dom.minidom
# verbose = 0 (only summary messages), 1 (diagnostics per file)
# 2 (verbose infos on per file operations)
# 3 (debug per file operations)
verbose = 1
def usage(code, msg=''):
    """Write the module help text to stderr and terminate the script.

    code -- exit status handed to sys.exit().
    msg  -- optional extra text (or exception object) that is written on
            its own line after the help text; skipped when empty.
    """
    lines = [__doc__]
    if msg:
        lines.append(msg)
    for line in lines:
        # "%s" applies str(), so an exception object passed as msg works too.
        sys.stderr.write("%s\n" % (line,))
    sys.exit(code)
def main():
try:
opts, args = getopt.getopt(sys.argv[1:], 'hd:', ['help', 'directory=', 'dry-run'])
except getopt.error, msg:
usage(1, msg)
dir = None
dryrun = 0
for opt, arg in opts:
if opt in ('-h', '--help'):
usage(0)
elif opt in ('-d', '--directory'):
dir = arg
elif opt == '--dry-run':
dryrun=1
if not dir:
usage(1,"Need a directory to move files to.\n")
if not dryrun:
try:
os.mkdir(dir)
except OSError, e:
# Ignore directory exists error
if e.errno <> errno.EEXIST: raise
# read all files from standard directory and only
matchobject = re.compile("^[1-9][0-9]*\\.$")
allfiles = os.listdir(".")
filelist = filter(matchobject.match, allfiles)
#filelist=["19367.", "19374.", "19375.", "19376." ]
print "Scanning %d files:" % len(filelist)
# let us put build a dictionary with the object checksum as key
# and a list of filesnames that have the same checksum
# E.g. for one entry: { "h3u5o6uid", ["19374.", "19375.", "19376." ] }
compareheap = { }
count = 0
for file in filelist:
count += 1
if (count % 100 == 0):
sys.stdout.write(".")
sys.stdout.flush()
if verbose > 1: print 'Dealing with "%s"' % file
fp = open(file)
msg = email.message_from_file(fp)
fp.close()
if not "X-Kolab-Type" in msg:
if verbose > 0:
sys.stderr.write(
"File %s not X-Kolab-Type, ignoring.\n" % file)
continue
try:
kolabeventxml=extractkolabeventxml(msg)
except:
sys.stderr.write("Something wrong with file %s!\n" % file)
raise
if not kolabeventxml: continue
# We want to find files that are the same but differ only
# in the <uid> and <last-modifucation-dat> elements
# so we clean those elements from the xml for comparison.
comparexml = cookcomparexml(kolabeventxml)
try:
# md5 only eat regular strings, but cookcomparexml returns unicode
hash = md5.new(comparexml).digest()
except:
sys.stderr.write("Trouble hashing file %s!\n" % file)
raise
if not compareheap.has_key(hash):
compareheap[hash] = [ file ]
else:
compareheap[hash].append(file)
print "completed." # scanning
print "Found %d Kolab Event Objects.\n" % len(compareheap)
if dryrun:
print "Dry-run enabled. WOULD move away:"
else:
print "Moving away:"
for object in compareheap.keys():
filenamelist= compareheap[object]
if len(filenamelist) > 1:
# we have found similiar files
# one to take
if verbose > 1: print filenamelist
# sort it after the numeric value
filenamelist.sort(comparecyrusmsgnames)
# move away the lower numbered ones
for file in filenamelist[:-1]:
print '"%s"' % file,
if not dryrun:
os.rename(file, os.path.join(dir,file))
print "Done."
def comparecyrusmsgnames(a,b):
    """cmp-style ordering of cyrus message file names by their number.

    Both names are expected to look like '121.'; the last character is
    dropped and the rest is compared as an integer, so '121.' > '20.'.
    Returns -1, 0 or 1.
    """
    first, second = int(a[:-1]), int(b[:-1])
    # difference of the two comparison results yields -1, 0 or 1
    return (first > second) - (first < second)
def extractkolabeventxml(msg):
    """Fetch the decoded x-vnd.kolab.event payload of an email object.

    All MIME parts of msg are inspected.  Returns the decoded payload of
    the application/x-vnd.kolab.event part, or None if msg has no such part.

    Raises IndexError (after a note on stderr) if more than one part of
    that type is found.
    """
    eventparts = [part for part in msg.walk()
                  if part.get_content_type() == 'application/x-vnd.kolab.event']

    if len(eventparts) > 1:
        sys.stderr.write("Arg, found second kolabxml part!\n")
        raise IndexError

    if not eventparts:
        return None

    kolabxml = eventparts[0].get_payload(decode=1)
    if verbose > 3: print(kolabxml)
    return kolabxml
def cookcomparexml(kolabxml):
    """Return kolabxml without the elements that differ between doubles.

    The <uid> and <last-modification-date> elements that are direct
    children of the document element are removed, so that two copies of
    the same event serialise to the same text.

    kolabxml -- string holding one Kolab XML document.

    Returns the cleaned document serialised by minidom and encoded as
    UTF-8.  Errors from xml.dom.minidom.parseString() are passed on.
    """
    dom = xml.dom.minidom.parseString(kolabxml)
    try:
        # documentElement instead of firstChild: a comment or doctype in
        # front of the root element must not be mistaken for the event.
        event = dom.documentElement

        # Loop over a copy: childNodes is live, and removing a node while
        # iterating over it would skip the sibling that follows directly
        # (e.g. <last-modification-date> right after <uid>).
        for child in list(event.childNodes):
            if child.nodeType != child.ELEMENT_NODE:
                continue
            if child.tagName in ('uid', 'last-modification-date'):
                event.removeChild(child)

        cleanedxml = dom.toxml().encode("utf-8", "replace")
    finally:
        # break the reference cycles of the DOM tree, also on errors
        dom.unlink()

    if verbose > 2: print(cleanedxml)
    return cleanedxml
# Run the clean-up only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
More information about the commits
mailing list