root/OOo/ooo2dbk/trunk/ooo2dbk

Revision 51878, 31.6 kB (checked in by madarche, 1 year ago)

- Fixed references in the comments and messages to the config.xml file, whose name is now a constant that may change.

  • Property svn:eol-style set to native
  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1 #!/usr/bin/python
2 # (C) Copyright 2003-2007 Nuxeo SAS <http://nuxeo.com>
3 # (C) Copyright 2002 Eric Bellot <ebellot@netcourrier.com>
4 #
5 # Authors:
6 # M.-A. Darche (Nuxeo)
7 # Ruslan Spivak (Nuxeo)
8 # Eric Bellot <ebellot@netcourrier.com>
9 # Laurent Godard (lgodard@indesko.com)
10 #
11 # This program is free software; you can redistribute it and/or modify
12 # it under the terms of the GNU General Public License version 2 as published
13 # by the Free Software Foundation.
14 #
15 # This program is distributed in the hope that it will be useful,
16 # but WITHOUT ANY WARRANTY; without even the implied warranty of
17 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18 # GNU General Public License for more details.
19 #
20 # You should have received a copy of the GNU General Public License
21 # along with this program; if not, write to the Free Software
22 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23 # 02111-1307, USA.
24 #
25 # See ``COPYING`` for more information
26 #
27 # $Id$
28
29 import zipfile
30 import os, os.path, sys
31 from string import join, split, find
32 import codecs
33 from xml.dom import minidom
34 import time, base64
35 import re
36 import xml.sax
37 import shutil
38 import locale
39 from optparse import OptionParser
40
# Version string shown by the --version command line option
VERSION = '2.0'

# Base name of the program configuration file. It is looked up in /etc and
# then in the directory of the ooo2dbk executable.
CONF_FILE_NAME = 'ooo2dbk.xml'

# OpenOffice.org canonical XML files
OOO_META_FILE_NAME = 'meta.xml'
OOO_STYLES_FILE_NAME = 'styles.xml'
OOO_CONTENT_FILE_NAME = 'content.xml'

# Suffix appended to the generated DocBook XML file name when no explicit
# DocBook file name is given
DOCBOOK_FILE_SUFFIX = '.docb.xml'

# ZIP entries paths are stored in "code page 437" encoding (cp437).
# One cannot use UTF-8 for the ZIP entries paths.
# Read [ 878120 ] Zipfile archive name can't be unicode
# https://sourceforge.net/tracker/?func=detail&atid=105470&aid=878120&group_id=5470
ZIP_FILE_ENCODING = 'cp437'

# Initialization (attempt to remove some global statements)
# Module-level defaults for values that several functions read as globals
oooVersion = 'ooo1'
docbookXSL = None
verbose = True
zipfile_target = None
docbook_top_element = 'book'
process_ole_objects = False
65
def execArgs():
    """Analyze command line arguments and launch the conversion.

    Prints the help and exits with status 2 when the program is called
    without any argument. Reports a usage error through the option parser
    unless exactly one positional argument (the OpenOffice.org file) is
    given. Every option is then handed over to convert().
    """
    usage = "usage: %prog [options] openoffice.org-file"
    parser = OptionParser(usage=usage, version="%prog " + VERSION)

    parser.add_option('-z', '--zipfile',
                      action='store',
                      dest='zipfile_target',
                      type='string',
                      metavar='FILE',
                      default=None,
                      help="Use FILE as the filename for "
                      "the generated ZIP archive")

    parser.add_option('-d', '--dbkfile',
                      action='store',
                      dest='dbkfile',
                      type='string',
                      metavar='FILE',
                      default=None,
                      help="Use FILE as the filename for "
                      "the generated DocBook XML file. "
                      "This option has no effect if the "
                      "-z/--zipfile option is used.")

    parser.add_option('-b', '--book',
                      action='store_true',
                      dest='book',
                      default=False,
                      help="Produce a DocBook XML book. "
                      "This is the default.")

    parser.add_option('-a', '--article',
                      action='store_true',
                      dest='article',
                      default=False,
                      help="Produce a DocBook XML article.")

    parser.add_option('--ole',
                      action='store_true',
                      dest='ole',
                      default=False,
                      help="Include potential OLE objects as images in the "
                      "resulting DocBook XML document. This option needs that "
                      "a listening OpenOffice.org instance be running.")

    parser.add_option('-c', '--config',
                      action='store',
                      dest='config',
                      type='string',
                      metavar='FILE',
                      default=None,
                      help="Use FILE as the file path for the program configuration file. "
                      "Defaults to the global /etc/%s configuration file or to the "
                      "%s configuration file in the ooo2dbk executable directory."
                      % (CONF_FILE_NAME, CONF_FILE_NAME))

    parser.add_option('-x', '--xslt',
                      action='store',
                      dest='xslt',
                      type='string',
                      metavar='FILE',
                      default=None,
                      help="Use FILE as the file path for the XSLT stylesheet. "
                      "Defaults to the ooo2dbk ooo2dbk.xsl stylesheet.")

    parser.add_option('-m', '--cmdxslt',
                      action='store',
                      dest='cmdxslt',
                      type='string',
                      metavar='NAME',
                      default=None,
                      help="Use command NAME as the XSLT processor. "
                      "Available command names are defined in the "
                      "ooo2dbk configuration file. "
                      "Defaults to xsltproc.")

    # Note the inverted logic: giving -f stores False, and that value is
    # passed to convert() as "deltemp" (delete the temporary file or not).
    parser.add_option('-f', '--flatxml',
                      action='store_false',
                      dest='flatxml',
                      default=True,
                      help="Preserve the intermediate OpenOffice.org "
                      "XML file (global.xml)")

    parser.add_option('-v', '--verbose',
                      action='store_true',
                      dest='verbose',
                      default=False,
                      help="Print additional information to stdout "
                      "when running conversion")

    if len(sys.argv) < 2:
        parser.print_help()
        # Command line syntax errors return the error code "2"
        sys.exit(2)

    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("incorrect number of arguments")

    ooo_file_path = args[0]

    # "book" is the default and also wins when both -b and -a are given.
    if options.article and not options.book:
        top_element = 'article'
    else:
        top_element = 'book'

    # The top element is passed to the stylesheet as a name/value pair.
    xslParams = ['topElementName', top_element]

    # The values are handed to convert() explicitly; assigning them to
    # local names here would not touch the module globals of the same name.
    convert(ooo_file_path,
            conf_file_path=options.config,
            command=options.cmdxslt,
            xslt_file_path=options.xslt,
            xslParams=xslParams,
            docbook_file_path=options.dbkfile,
            deltemp=options.flatxml,
            verbose=options.verbose,
            zipfile_target=options.zipfile_target,
            docbook_top_element=top_element,
            process_ole_objects=options.ole,
            docbookXSL=None,
            )
196
197 # ---------
198 # Utilities
199 # ---------
200
def getModulePath():
    """Return the directory that contains the ooo2dbk executable.

    The location is taken from the file name recorded in the code object
    of execArgs(), made absolute.
    """
    scriptFile = execArgs.func_code.co_filename
    return os.path.dirname(os.path.abspath(scriptFile))
207
def fileExist(file):
    """Tell whether `file` is the path of an existing regular file.

    An empty file name is considered a fatal error: a message is printed
    and the program exits with status 1.
    """
    if file == '':
        print("Bad filename: " + " " + file)
        sys.exit(1)
    return os.path.isfile(file)
214
def writeFile(file, strContent):
    """Write `strContent` to the file at path `file`, replacing its content.

    The file is opened in text write mode and is closed even when the
    write fails.
    """
    b = open(file, 'w')
    try:
        b.write(strContent)
    finally:
        b.close()
219
220
def verifSys():
    """Identify the current system.

    Sets two module globals: `currentSys` ('windows' when sys.platform is
    'win32', 'unix' otherwise) and `preferred_encoding` (the encoding
    reported by the locale module).
    """
    global preferred_encoding, currentSys
    currentSys = {'win32': 'windows'}.get(sys.platform, 'unix')
    preferred_encoding = locale.getpreferredencoding()
232
233
def verifPath(path):
    """Check the syntax of `path` for the current system and return it.

    The pattern used depends on the module global `currentSys` set by
    verifSys(): the Windows pattern when it is 'windows', the Unix pattern
    otherwise. When the path does not match, a message is printed and the
    program exits with status 1.
    """
    if currentSys == 'windows':
        pattern = r"^(([a-zA-Z]:\\)?|(\.\.\\)*)([^\?:/\*\"<>\|]+[^\s\?:/\*\"<>\|]\\)*[^\?:/\*\"<>\|]+(\.[a-zA-Z0-9]+)?$"
    else:
        pattern = r"^(~|(\.\./)*)?([^\\\?:\*\"<>\|]+[^\\\s\?:\*\"<>\|]/)*[^\\\?:\*\"<>\|]+(\.[a-zA-Z0-9]+)*$"
    matched = re.match(pattern, path)
    if matched is None:
        print("Bad path :\n" + path)
        sys.exit(1)
    return matched.group()
249
250
def createDirectory(path):
    """Create the directory `path` together with any missing parent.

    Nothing is done when `path` is empty or already is a directory.
    Relative and absolute paths are both supported. Building the path
    component by component failed on absolute Unix paths, whose first
    component is the empty string.
    """
    if path == '' or os.path.isdir(path):
        return
    try:
        os.makedirs(path)
    except OSError:
        # Only propagate the error when the directory really is missing,
        # so that a directory created in the meantime is not a failure.
        if not os.path.isdir(path):
            raise
266
267 # --------
268 # SETTINGS
269 # --------
270
def getXSLfile(oooVersion):
    """Return the path of the XSLT stylesheet configured for `oooVersion`.

    The stylesheet file name is read from the 'xslt-stylesheet' element of
    the CONF_FILE_NAME file whose name is `oooVersion`, and is joined to
    the directory of the ooo2dbk executable. The program exits with
    status 1 when no file name is configured.
    """
    # Using the XSLT stylesheets specified in the CONF_FILE_NAME file
    stylesheet_file_name = getConfigValue('xslt-stylesheet',
                                          'stylesheetPath',
                                          name=oooVersion)
    # The check must come before verifPath(), which cannot handle a
    # missing value and never returns None itself.
    if not stylesheet_file_name:
        print("Bad filename %s for 'xslt-stylesheet' %s in '%s'" % (
            stylesheet_file_name,
            oooVersion,
            CONF_FILE_NAME,
            ))
        sys.exit(1)
    stylesheet_file_name = verifPath(stylesheet_file_name)

    return os.path.join(getModulePath(), stylesheet_file_name)
288
def setConfFileSettings(conf_file_path=None):
    """Load the configuration file and publish its settings as globals.

    The configuration file is chosen with these priorities:
      1- the `conf_file_path` argument (-c file.xml)
      2- /etc/ooo2dbk.xml
      3- ooo2dbk.xml in the directory of the ooo2dbk executable

    Each child element of the document element is stored in the global
    `configElts` list as a (element name, attributes dictionary) tuple.
    The images, OpenOffice.org server, OLE and Python settings are then
    read from it. The program exits with status 1 when one of the two
    images settings is missing or is not purely alphanumeric.
    """
    global configXML
    global configElts, imgRelDir, imgRootName
    global oooserver_host, oooserver_port
    global ole_img_format, ole2img_script_path, ooopython_path

    if conf_file_path is not None:
        configXML = conf_file_path
    else:
        conf_file_path_global = os.path.join('/etc', CONF_FILE_NAME)
        if os.path.isfile(conf_file_path_global):
            configXML = conf_file_path_global
        else:
            configXML = os.path.join(getModulePath(), CONF_FILE_NAME)

    configDocElt = minidom.parse(configXML).documentElement
    configElts = []
    for node in configDocElt.childNodes:
        if node.nodeType == node.ELEMENT_NODE:
            configElts.append((node.nodeName, dict(node.attributes.items())))

    alphanum = r"^[a-zA-Z0-9]+$"

    # Images relative directory.
    # getConfigValue() returns None for a missing setting, which must not
    # reach re.match().
    imgRelDir = getConfigValue('images', 'imagesRelativeDirectory')
    if imgRelDir is None or re.match(alphanum, imgRelDir) is None:
        msg = ("Only one depth relative directory (no '%s') "
               "and only alphanum chars for 'imagesRelativeDirectory' in '%s'\n"
               "Actual name is : '%s'"
               % (os.sep, CONF_FILE_NAME, imgRelDir))
        print(msg)
        sys.exit(1)
    # Images root name
    imgRootName = getConfigValue('images', 'imageNameRoot')
    if imgRootName is None or re.match(alphanum, imgRootName) is None:
        print("Only alphanum chars for 'imageNameRoot' in '%s'" % CONF_FILE_NAME)
        print("%s %s" % ("Actual name is :", imgRootName))
        sys.exit(1)

    oooserver_host = getConfigValue('oooserver', 'host')
    oooserver_port = getConfigValue('oooserver', 'port')
    ole_img_format = getConfigValue('ole', 'imgFormat')
    ole2img_script_path = getConfigValue('ole', 'scriptPath')
    ooopython_path = getConfigValue('ooopython', 'path')
350
351
def getConfigValue(element, attribute, name=''):
    """
    Return from the CONF_FILE_NAME file the value of the specified attribute
    ('command', 'param-syntax', etc.) for the specified element type
    ('xslt-command', 'xslt-stylesheet', etc.) with its 'name' attribute having
    the name value.

    When `name` is empty, the first element of the given type in document
    order is the default one. Elements without a 'name' attribute never
    match a non-empty `name`, and a missing attribute counts as no value.

    Without a value, None is returned when `name` is empty; otherwise a
    message is printed and the program exits with status 1.
    """
    value = ''
    for eltName, eltAttributes in configElts:
        if eltName != element:
            continue
        if name != '' and eltAttributes.get('name') != name:
            continue
        # The first matching element in document order wins.
        value = eltAttributes.get(attribute, '')
        break
    if value != '':
        return value
    if name != '':
        print("There isn't any value for this parameter. "
              "There should be an error in your %s." % CONF_FILE_NAME)
        sys.exit(1)
    return None
379
380
def setUserSettings(ooofile, docbook, command, imagesrew, deltemp, dtd,
                    xslt_file_path, xslParams, verbose):
    """Turn the user-provided options into the globals used by the conversion.

    Sets the source file (docOOoSXW), the DocBook target (docbookXML), the
    intermediate file (globalXML), the images directory (imgAbsDir), the
    image rewriting flag (rewriteImg), the XSLT command template
    (XSLCmdTemplate), the DTD identifiers (dtdPublic, dtdSystem) and the
    formatted XSLT parameters (XSLParams).

    The program exits with status 1 when the OpenOffice.org file does not
    exist. The destination directory is created when missing.
    """
    global docOOoSXW, docbookXML, globalXML
    global imgRelDir, imgAbsDir, rewriteImg
    global XSLCmdTemplate, dtdPublic, dtdSystem, XSLParams

    # OpenOffice.org filename
    ooofile = verifPath(ooofile)
    if not fileExist(ooofile):
        errorMsg = ("\n>>  ERROR : Incorrect OpenOffice.org file : \n>>  "
                    + ooofile + "\n")
        print(errorMsg)
        sys.exit(1)
    docOOoSXW = ooofile

    # DocBook filename
    if docbook is not None:
        docbook = verifPath(docbook)
        path = os.path.split(docbook)[0]
        docbookXML = docbook
    else:
        # The DocBook file, the intermediate file and the images go in the
        # directory from which the program was launched.
        path = os.getcwd()
        rootName = os.path.splitext(os.path.basename(docOOoSXW))[0]
        docbookXML = os.path.join(path, rootName) + DOCBOOK_FILE_SUFFIX
        # Replace spaces in Writer document name with '_'
        docbookXML = re.sub(r'\s', '_', docbookXML)

    # Destination directory
    if path != '' and not os.path.isdir(path):
        createDirectory(path)

    # Temporary files names
    if deltemp == 0:
        globalXML = os.path.join(path, 'global.xml')
    else:
        import tempfile
        tempfile.tempdir = path
        globalXML = tempfile.mktemp('g.xml')

    # Images Directory
    imgAbsDir = os.path.join(toUnicode(path), imgRelDir)

    # Force image rewriting (0|1)
    rewriteImg = imagesrew

    # XSL processor command
    if command is not None:
        XSLCmdTemplate = getConfigValue('xslt-command', 'command', command)
    else:
        XSLCmdTemplate = getConfigValue('xslt-command', 'command')

    # DTD
    if dtd is not None:
        dtdPublic = getConfigValue('dtd', 'doctype-public', dtd)
        dtdSystem = getConfigValue('dtd', 'doctype-system', dtd)
    else:
        dtdPublic = getConfigValue('dtd', 'doctype-public')
        dtdSystem = getConfigValue('dtd', 'doctype-system')

    # XSLT stylesheet
    # NOTE(review): docbookXSL is not declared global here, so this only
    # binds a local name and has no effect outside this function. Confirm
    # whether the module global was meant to be set.
    if xslt_file_path is not None:
        docbookXSL = xslt_file_path

    # XSLT Params
    # param_syntax stays None without parameters, so that the verbose
    # output below never reads an unbound name.
    param_syntax = None
    if xslParams is not None:
        if command is not None:
            param_syntax = getConfigValue('xslt-command', 'param-syntax',
                                          command)
        else:
            param_syntax = getConfigValue('xslt-command', 'param-syntax')
        # Retrieve the XSLT params and set them according to the param syntax.
        # This is done because XSLT processors have different command line
        # options.
        XSLParams = ("%s" % (param_syntax)) % tuple(xslParams)
    else:
        XSLParams = ' '
    if verbose:
        print("       - xslParams = %s" % xslParams)
        print("       - param_syntax = %s" % param_syntax)
        print("       - XSLParams = %s" % XSLParams)
466
467
def initializeSets(ooo_file_path, docbook, command, imagesrew, deltemp, dtd,
                   conf_file_path, xslt_file_path, xslParams, verbose):
    """Run the three initialization steps in order.

    First the system is identified, then the configuration file is loaded,
    and finally the user options are applied.
    """
    verifSys()
    setConfFileSettings(conf_file_path)
    setUserSettings(
        ooo_file_path, docbook, command, imagesrew, deltemp, dtd,
        xslt_file_path, xslParams, verbose,
    )
474
475 # --------------------
476 # Conversion functions
477 # --------------------
478
def extractOooArchive(docOOoSXW, XMLFile):
    """Generic XML files extraction.

    Return the content of the `XMLFile` entry of the `docOOoSXW` archive,
    as returned by ZipFile.read(). None is returned when the file is not
    of the ZIP format or when the archive has no such entry. The archive
    is closed in every case.
    """
    # Checking that the OOo file is truly of the ZIP format
    if not zipfile.is_zipfile(docOOoSXW):
        return None
    zip_file = zipfile.ZipFile(docOOoSXW, 'r')
    try:
        # Checking that the requested entry is truly present
        if XMLFile in zip_file.namelist():
            return zip_file.read(XMLFile)
        return None
    finally:
        zip_file.close()
496
def listChildNodes(docOOoSXW, XMLFile, ooo_file_path, verbose):
    """Extract and parse Zip XML files for concat.

    Return the list of the element nodes that are direct children of the
    root element of the `XMLFile` entry of the `docOOoSXW` archive.

    For the content file, the images are processed too: the image globals
    are reset, the images directory is created when missing, the OLE
    objects are exported by an external command when the module global
    `process_ole_objects` is true, and the image links are rewritten by
    replaceImageNode().

    The program exits with status 1 when the entry cannot be read.
    """
    global dictImg, myZip, numImg, dictNamespace

    # Extract and parse XML file
    strXML = extractOooArchive(docOOoSXW, XMLFile)
    if strXML is None:
        print("Cannot read '%s' from '%s'" % (XMLFile, docOOoSXW))
        sys.exit(1)
    rootNode = minidom.parseString(strXML).documentElement
    vChildNodes = rootNode.childNodes

    # Images treatment
    if XMLFile == OOO_CONTENT_FILE_NAME:
        numImg = 0
        dictImg = {}
        dictNamespace = {}
        myZip = zipfile.ZipFile(docOOoSXW, 'r')
        try:
            # Creating the directory where the images will be dropped.
            # The exported OLE images go in this directory too.
            if not os.path.isdir(imgAbsDir):
                os.mkdir(imgAbsDir)
            if process_ole_objects:
                cmd = (('%s %s --target "%s" '
                        '--oooserverhost %s --oooserverport %s '
                        '--format %s "%s"')
                       % (
                    ooopython_path,
                    ole2img_script_path,
                    imgAbsDir,
                    oooserver_host, oooserver_port,
                    ole_img_format, ooo_file_path))
                if verbose:
                    print(cmd)
                os.system(cmd)
            replaceImageNode(vChildNodes)
        finally:
            # The archive must not stay open when the extraction fails.
            myZip.close()

    # Extract all root element's children
    return [node for node in vChildNodes
            if node.nodeType == node.ELEMENT_NODE]
538
539
def replaceImageNode(vChildNodes):
    """Replace the incorporated images links by the new images links
    and extract and copy all incorporated images.

    Every 'draw:image' node whose link contains 'Pictures/' is processed.
    The image is copied from the archive (global `myZip`) to the images
    directory under a generated name (image root name followed by a
    3-digit counter and the original extension), and the link is replaced
    by the relative path of the copy. Other links are left untouched.

    XXX: Why renaming images (apart from making their path relative)?
    Please add comment if you know.
    """
    global numImg
    for node in vChildNodes:
        if node.nodeName == 'draw:image':
            hRefValue = node.attributes['xlink:href'].value
            if 'Pictures/' in hRefValue:
                nameImgOld = os.path.split(hRefValue)[1]

                # dictImg maps each original image name to its new link,
                # so that an image used several times in the document is
                # only extracted once and always gets the same link.
                if nameImgOld not in dictImg:
                    extImg = os.path.splitext(nameImgOld)[1]
                    numImg += 1
                    nameImgNew = imgRootName + "%03i" % numImg + extImg
                    hrefImgNew = os.path.join(imgRelDir, nameImgNew)
                    pathImgNew = os.path.join(imgAbsDir, nameImgNew)
                    if hRefValue.startswith('#'):
                        # OOo 1
                        pathImgZip = hRefValue[1:]
                    else:
                        # OOo 2
                        pathImgZip = hRefValue
                    zipImg = myZip.read(pathImgZip)
                    if os.path.isfile(pathImgNew) and rewriteImg:
                        os.remove(pathImgNew)
                    if not os.path.isfile(pathImgNew):
                        imgNew = open(pathImgNew, 'wb')
                        try:
                            imgNew.write(zipImg)
                        finally:
                            imgNew.close()
                    dictImg[nameImgOld] = hrefImgNew
                node.attributes['xlink:href'].value = dictImg[nameImgOld]

        # Images can be nested at any depth, so the descendants of every
        # node are visited too.
        if node.hasChildNodes():
            replaceImageNode(node.childNodes)
584
585
def getGlobalRootHead(sourcefile, XMLFile):
    """Return the head and foot of the global XML file and the OOo version.

    The `XMLFile` entry of the archive is parsed and the value of the
    'xmlns:office' attribute of its root element tells the version:
    'ooo1' for OpenOffice.org 1.x, 'ooo2' for OpenOffice.org 2.x
    (OpenDocument). The returned tuple is
    (globalRootHead, globalRootFoot, oooVersion).

    The program exits with status 1 for any other namespace value.

    NOTE(review): `sourcefile` is not used; the archive path is read from
    the module global `docOOoSXW`. Confirm which one is intended.
    """
    strXML = extractOooArchive(docOOoSXW, XMLFile)
    rootNode = minidom.parseString(strXML).documentElement
    # getAttribute() returns an empty string for a missing attribute,
    # which then ends in the error branch below.
    officeNamespace = rootNode.getAttribute('xmlns:office')

    if officeNamespace == 'http://openoffice.org/2000/office':
        # OpenOffice.org 1.x
        oooVersion = 'ooo1'
        globalRootHead = """\
<?xml version="1.0" encoding="UTF-8"?>

<office:document xmlns:office="http://openoffice.org/2000/office"
                 xmlns:style="http://openoffice.org/2000/style"
                 xmlns:text="http://openoffice.org/2000/text"
                 xmlns:table="http://openoffice.org/2000/table"
                 xmlns:draw="http://openoffice.org/2000/drawing"
                 xmlns:fo="http://www.w3.org/1999/XSL/Format"
                 xmlns:xlink="http://www.w3.org/1999/xlink"
                 xmlns:number="http://openoffice.org/2000/datastyle"
                 xmlns:svg="http://www.w3.org/2000/svg"
                 xmlns:chart="http://openoffice.org/2000/chart"
                 xmlns:dr3d="http://openoffice.org/2000/dr3d"
                 xmlns:math="http://www.w3.org/1998/Math/MathML"
                 xmlns:form="http://openoffice.org/2000/form"
                 xmlns:script="http://openoffice.org/2000/script"
                 xmlns:dc="http://purl.org/dc/elements/1.1/"
                 xmlns:meta="http://openoffice.org/2000/meta"
                 office:class="text"
                 office:version="1.0">
"""
    elif officeNamespace == 'urn:oasis:names:tc:opendocument:xmlns:office:1.0':
        # OpenOffice.org 2.x - OpenDocument
        oooVersion = 'ooo2'
        globalRootHead = """\
<?xml version="1.0" encoding="UTF-8"?>

<office:document
                 xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
                 xmlns:style="urn:oasis:names:tc:opendocument:xmlns:style:1.0"
                 xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0"
                 xmlns:table="urn:oasis:names:tc:opendocument:xmlns:table:1.0"
                 xmlns:draw="urn:oasis:names:tc:opendocument:xmlns:drawing:1.0"
                 xmlns:fo="urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0"
                 xmlns:xlink="http://www.w3.org/1999/xlink"
                 xmlns:dc="http://purl.org/dc/elements/1.1/"
                 xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
                 xmlns:number="urn:oasis:names:tc:opendocument:xmlns:datastyle:1.0"
                 xmlns:svg="urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0"
                 xmlns:chart="urn:oasis:names:tc:opendocument:xmlns:chart:1.0"
                 xmlns:dr3d="urn:oasis:names:tc:opendocument:xmlns:dr3d:1.0"
                 xmlns:math="http://www.w3.org/1998/Math/MathML"
                 xmlns:form="urn:oasis:names:tc:opendocument:xmlns:form:1.0"
                 xmlns:script="urn:oasis:names:tc:opendocument:xmlns:script:1.0"
                 xmlns:ooo="http://openoffice.org/2004/office"
                 xmlns:ooow="http://openoffice.org/2004/writer"
                 xmlns:oooc="http://openoffice.org/2004/calc"
                 xmlns:dom="http://www.w3.org/2001/xml-events"
                 xmlns:xforms="http://www.w3.org/2002/xforms"
                 xmlns:xsd="http://www.w3.org/2001/XMLSchema"
                 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                 office:version="1.0">
"""
    else:
        print("Unsupported OpenOffice.org document: "
              "unknown office namespace '%s'" % officeNamespace)
        sys.exit(1)

    globalRootFoot = """\
</office:document>
"""

    return globalRootHead, globalRootFoot, oooVersion
660
def createGlobalXML(globalFile, ooo_file_path, verbose):
    """
    Create a global XML file by concatenating the different XML files
    contained within a .sxw OOo file (meta.xml, styles.xml, content.xml).

    The result is written to `globalFile` in UTF-8. The detected
    OpenOffice.org version ('ooo1' or 'ooo2') is returned.
    """
    # First let's delete any previous images directory, because if we don't
    # delete it there might be a previous directory with content in it and we
    # don't want to get this unrequested content in a generated archive.
    if os.path.exists(imgAbsDir):
        shutil.rmtree(imgAbsDir)

    globalRootHead, globalRootFoot, oooVersion = getGlobalRootHead(
        docOOoSXW, OOO_META_FILE_NAME)

    globalRoot = minidom.parseString(
        globalRootHead + globalRootFoot).documentElement

    metaListElts = listChildNodes(docOOoSXW, OOO_META_FILE_NAME,
                                  ooo_file_path, verbose)
    stylesListElts = listChildNodes(docOOoSXW, OOO_STYLES_FILE_NAME,
                                    ooo_file_path, verbose)
    contentListElts = listChildNodes(docOOoSXW, OOO_CONTENT_FILE_NAME,
                                     ooo_file_path, verbose)
    for node in metaListElts + stylesListElts + contentListElts:
        globalRoot.appendChild(node)

    # The first line of the serialization is the root start tag; it is
    # replaced by the original head, which also carries the XML declaration.
    bodyLines = globalRoot.toxml().split('\n')[1:]
    strXMLNS = u"\n".join([globalRootHead, u"\n".join(bodyLines)])

    fileXML = codecs.open(globalFile, 'w', 'utf-8')
    try:
        fileXML.write(strXMLNS)
    finally:
        fileXML.close()

    return oooVersion
692
693
def tempFilesDelete(deltemp):
    """Remove the intermediate global XML file when `deltemp` equals 1.

    The path of the file is read from the module global `globalXML`.
    """
    if deltemp != 1:
        return
    os.remove(globalXML)
697
698
def getXsltCommand(input_file_path, output_file_path, stylesheet, verbose):
    """Return the actual XSLT processing command.

    The command is built from the module global `XSLCmdTemplate`, in which
    these placeholders are replaced:
      %o  the output file path, between double quotes
      %i  the input file path, between double quotes
      %s  the stylesheet, between double quotes
      %p  the DTD public identifier (global dtdPublic)
      %y  the DTD system identifier (global dtdSystem)
      %v  the XSLT parameters (global XSLParams)

    Every occurrence of a placeholder is replaced, in a single pass, so
    that a substituted value is never expanded again. The command is
    printed when `verbose` is true and is returned encoded in the
    preferred encoding.
    """
    # The values are computed on demand, only for the placeholders that
    # are really present in the template.
    # Note that the file paths have to be protected by "" in case they
    # contain special characters such as spaces.
    substitutions = {
        '%o': lambda: '"%s"' % toUnicode(output_file_path),
        '%i': lambda: '"%s"' % toUnicode(input_file_path),
        '%s': lambda: '"%s"' % stylesheet,
        '%p': lambda: '%s' % dtdPublic,
        '%y': lambda: '%s' % dtdSystem,
        '%v': lambda: '%s' % XSLParams,
    }
    cmd = re.sub(r'%[oispyv]',
                 lambda match: substitutions[match.group(0)](),
                 XSLCmdTemplate)
    if verbose:
        print(cmd)
    return cmd.encode(preferred_encoding)
731
732
def toUnicode(s):
    """Return `s` as a unicode string.

    A byte string is decoded with the preferred encoding (module global
    `preferred_encoding`). A value that already is a unicode string is
    returned unchanged, instead of raising a TypeError.
    """
    if isinstance(s, type(u'')):
        return s
    return s.decode(preferred_encoding)
735
736
def o2dConvert(input, output, stylesheet, verbose):
    """Generic conversion.

    Run the XSLT command built by getXsltCommand() for the given input
    file, output file and stylesheet. A message is written to stderr when
    the command does not return a zero status; the conversion is not
    interrupted.
    """
    status = os.system(getXsltCommand(input, output, stylesheet, verbose))
    if status != 0:
        sys.stderr.write("XSLT processor exited with status %s\n" % status)
744
745
746 # -------------
747 # User commands
748 # -------------
749
def createDocbookArchive(zipfile_target):
    """Pack the DocBook XML file and its images in a ZIP archive.

    The archive is created in the directory of `zipfile_target`, and is
    named after `zipfile_target` with its extension replaced by '.zip'.
    All the entries are stored under a top directory having the name of
    the archive without extension.

    Once the archive is written, the DocBook XML file (global docbookXML)
    and the images directory (global imgAbsDir) are removed.
    """
    arch_dest_dir = os.path.dirname(zipfile_target)
    arch_top_dir = os.path.splitext(os.path.basename(zipfile_target))[0]
    arch_path = os.path.join(arch_dest_dir, arch_top_dir + '.zip')

    arch = zipfile.ZipFile(arch_path, 'w', zipfile.ZIP_DEFLATED)
    try:
        docbook_path_in_arch = os.path.join(arch_top_dir,
                                            os.path.basename(docbookXML))
        # ZIP entries paths are stored in "code page 437" encoding (cp437).
        # One cannot use UTF-8 for the ZIP entries paths.
        arch.write(docbookXML,
                   toUnicode(docbook_path_in_arch).encode(ZIP_FILE_ENCODING))
        # Adding in the arch the images contained in the original OOo arch
        # NOTE(review): the directory name in the archive is hard-coded to
        # 'images', while the links use the configured images relative
        # directory. Confirm that both always agree.
        if os.path.exists(imgAbsDir):
            for img_name in os.listdir(imgAbsDir):
                img_path = os.path.join(imgAbsDir, img_name)
                img_path_in_arch = os.path.join(arch_top_dir, 'images',
                                                img_name)
                arch.write(img_path,
                           img_path_in_arch.encode(ZIP_FILE_ENCODING))
    finally:
        # The archive is closed even when adding an entry fails.
        arch.close()

    # Remove created DocBook XML and subobjects, if any
    os.remove(docbookXML)
    if os.path.exists(imgAbsDir):
        shutil.rmtree(imgAbsDir)
780
781
782 def convert(ooo_file_path,
783             command=None,
784             docbook_file_path=None,
785             imagesrew=1,
786             deltemp=1,
787             dtd=None,
788             conf_file_path=None,
789             xslt_file_path=None,
790             xslParams=None,
791             verbose=False,
792             zipfile_target=False,
793             docbook_top_element='book',
794             process_ole_objects=False,
795             docbookXSL=None,
796             ):
797     """Convert OpenOffice.org Writer file to DocBook XML.
798     """
799     startTime = time.time()
800
801     if verbose:
802         print "   1 - Command line options"
803         print "       - OOo2DBK config file : %s" % conf_file_path
804         print "       - OpenOffice.org file : %s" % ooo_file_path
805         print "       - DocBook file        : %s" % docbook_file_path
806         print "       - top element is      : %s" % docbook_top_element
807         print "       - process OLE objects : %s" % process_ole_objects
808
809     initializeSets(ooo_file_path, docbook_file_path, command, imagesrew,
810                    deltemp, dtd, conf_file_path, xslt_file_path, xslParams,
811                    verbose)
812
813     ooo_file_path = toUnicode(ooo_file_path)
814     if docbook_file_path is not None:
815         docbook_file_path = toUnicode(docbook_file_path)
816     if xslt_file_path is not None:
817         xslt_file_path = toUnicode(xslt_file_path)
818
819     endTime = time.time()
820     duration = round(endTime - startTime, 2)
821
822     if verbose:
823         print "       ==>", duration, "sec.\n"
824         print "   2 - Unzip and concat OpenOffice.org XML files"
825
826     startTime = time.time()
827
828     oooVersion = createGlobalXML(globalXML, ooo_file_path, verbose)
829
830     endTime = time.time()
831     duration = round(endTime - startTime, 2)
832
833     if verbose:
834         print "       - Detected file format: %s" % (oooVersion)
835         print "       ==>", duration, "sec.\n"
836         print "   3 - Initialization (configuration file and computed options)"
837
838     if docbookXSL is None:
839         # Get XSLT file to use from configuration file
840         docbookXSL = getXSLfile(oooVersion)
841
842     if verbose:
843         global configXML
844         print "       - preferred encoding       : %s" % preferred_encoding
845         print "       - OOo2DBK config file      : %s" % configXML
846         print "       - XSLT file                : %s" % docbookXSL
847         print "       - OpenOffice.org file path : %s" % docOOoSXW
848         print "       - DocBook file path        : %s" % docbookXML
849         if process_ole_objects:
850             print "       - oooserver host            : %s" % oooserver_host
851             print "       - oooserver port            : %s" % oooserver_port
852             print "       - exported OLE image format : %s" % ole_img_format
853             print "       - OOo Python path: %s" % ooopython_path
854         print "\n   4 - DocBook file creation"
855
856     startTime = time.time()
857     o2dConvert(globalXML, docbookXML, docbookXSL, verbose)
858
859     tempFilesDelete(deltemp)
860     endTime = time.time()
861     duration = round(endTime - startTime, 2)
862     if verbose:
863         print "       ==>", duration, "sec.\n"
864         print "Conversion completed\n"
865
866     if zipfile_target:
867         createDocbookArchive(zipfile_target)
868         if verbose:
869             print "Zip archive created\n"
870
# Shell conversion: when this file is run as a script (and not imported),
# hand over to execArgs()
if __name__ == '__main__':
    execArgs()
Note: See TracBrowser for help on using the browser.