root/Zope2/PortalTransforms/trunk/transforms/word_to_text.py

Revision 50502, 1.0 kB (checked in by madarche, 3 years ago)

- Fixed #1760 : faster scrubHTML if lxml is present on the system.
- Added the word_to_text transform.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
Line 
1 # $Id$
2
3 import os
4 import sys
5 import re
6 from Products.PortalTransforms.interfaces import itransform
7 from Products.PortalTransforms.transforms import office_wvware
8 from Products.PortalTransforms.libtransforms.utils import getBodyText
9
# Matches any run of one or more whitespace characters; document.text()
# substitutes each match with a single space.
_re_compactwhites = re.compile(r'\s+')
11
class document(office_wvware.document):
    """Word document wrapper that exposes its converted output as text.

    Only ``text`` is defined here; everything else is inherited from
    ``office_wvware.document``.
    """

    def text(self):
        """Return the text of the generated HTML file, whitespace-compacted.

        Opens ``<self.tmpdir>/<self.__name__>.html``, hands the open file to
        ``getBodyText`` and replaces every run of whitespace in the result
        with a single space.

        NOTE(review): the HTML file is presumably written by the inherited
        ``convert()``; callers must run it first -- confirm against
        ``office_wvware.document``.
        """
        htmlpath = os.path.join(self.tmpdir, self.__name__ + '.html')
        htmlfile = open(htmlpath)
        # Close the handle explicitly: the caller removes tmpdir right after
        # this method returns, and a still-open file would otherwise only be
        # released whenever the garbage collector gets to it.
        # try/finally is used instead of ``with`` so the block needs no newer
        # syntax than the rest of this module.
        try:
            text = getBodyText(htmlfile)
        finally:
            htmlfile.close()
        return _re_compactwhites.sub(' ', text)
19
20
class word_to_text:
    """Transform that turns ``application/msword`` data into ``text/plain``."""
    __implements__ = itransform

    __name__ = 'word_to_text'
    inputs   = ('application/msword',)
    output  = 'text/plain'

    def name(self):
        """Return the name of this transform (``__name__``)."""
        return self.__name__

    def convert(self, data, cache, **kwargs):
        """Convert Word ``data`` to plain text and store it in ``cache``.

        ``kwargs['filename']``, when present and non-empty, supplies the
        original file name (only its basename is used); otherwise
        ``'unknown.doc'`` is used.

        The extracted text is stored with ``cache.setData`` and ``cache``
        itself is returned.  Any exception raised by the conversion
        propagates to the caller after the temporary directory has been
        cleaned up.
        """
        orig_file = os.path.basename((kwargs.get('filename') or 'unknown.doc'))

        doc = document(orig_file, data)
        # Always remove the document's temporary directory, even when the
        # conversion or the text extraction fails; otherwise every failed
        # conversion would leave a directory behind.
        try:
            doc.convert()
            text = doc.text()
        finally:
            doc.cleanDir(doc.tmpdir)

        cache.setData(text)
        return cache
42
def register():
    """Create and return a new ``word_to_text`` transform instance."""
    transform = word_to_text()
    return transform
Note: See TracBrowser for help on using the browser.