Friday, February 20, 2009

Using Python to reformat the xml within .odt files

A quick Python 2.x script to copy, unzip, and uniformly reformat the XML of an .odt file. It adds indentation and line breaks. This is useful for debugging my invoicing script, which meddles with the XML files, because it makes the files diff-able, easier to read, and easier to search.

#!/usr/bin/env python
import os
import shutil
import xml.etree.ElementTree as ET
import zipfile
# Path of the .odt file that the script below copies, unzips and reformats.
odt_path_and_file = 'path/to/file.odt'

# This function was copied from http://effbot.org/zone/element-lib.htm
def indent(elem, level=0):
    """Pretty-print an element tree in place by rewriting its whitespace.

    Walks ``elem`` recursively and sets the ``text`` and ``tail`` attributes
    so that, once serialized, every element sits on its own line indented by
    two spaces per nesting level.  Only ``text``/``tail`` values that are
    missing or whitespace-only are replaced; real character data is left
    untouched.

    Args:
        elem: The element to indent (modified in place).
        level: Nesting depth of ``elem``; 0 for the root.

    Returns:
        None.
    """
    pad = "\n" + "  " * level

    def is_blank(value):
        # True for None, "" and whitespace-only strings.
        return not value or not value.strip()

    children = list(elem)

    if not children:
        # A leaf only needs its tail fixed, and the root (level 0) is
        # deliberately left alone when it has no children.
        if level and is_blank(elem.tail):
            elem.tail = pad
        return

    # Text before the first child: break the line and indent one level deeper.
    if is_blank(elem.text):
        elem.text = pad + "  "
    if is_blank(elem.tail):
        elem.tail = pad

    for child in children:
        indent(child, level + 1)

    # The last child's tail precedes this element's closing tag, so it must
    # drop back to this element's own indentation.
    last_child = children[-1]
    if is_blank(last_child.tail):
        last_child.tail = pad

# Unpack a copy of the .odt into Desktop/<name> (relative to the current
# directory) and pretty-print every top-level XML file found inside it.
# The original document is only read, never modified.
odt_filename = os.path.basename(odt_path_and_file)

# Remove the '.odt' suffix exactly.  str.rstrip('.odt') strips a *set of
# characters*, not a suffix, and would turn 'report.odt' into 'repor'.
if odt_filename.endswith('.odt'):
    working_name = odt_filename[:-len('.odt')]
else:
    working_name = odt_filename
folder_name = 'Desktop/' + working_name

# Use the standard library instead of os.popen('rm ...'), etc.: os.popen does
# not wait for the command to finish, so the shell steps could race each
# other, and concatenated shell strings break on paths containing spaces.
if os.path.isdir(folder_name):
    shutil.rmtree(folder_name)  # Delete any old working files
os.makedirs(folder_name)
shutil.copy(odt_path_and_file, folder_name)

# An .odt file is a zip archive; extract it next to the copied document.
with zipfile.ZipFile(os.path.join(folder_name, odt_filename)) as archive:
    archive.extractall(folder_name)

# Only the XML files directly inside the working folder are reformatted
# (sorted so the processing order is stable between runs).
file_list = sorted(
    name for name in os.listdir(folder_name)
    if name.endswith('.xml') and os.path.isfile(os.path.join(folder_name, name))
)
for xml_name in file_list:
    xml_path = folder_name + '/' + xml_name
    print ('Parsing ' + xml_path)
    tree = ET.parse(xml_path)
    indent(tree.getroot())
    tree.write(xml_path)
    print ('Completed ' + xml_name)

No comments: