|
Revision 50502, 1.0 kB
(checked in by madarche, 3 years ago)
|
- Fixed #1760: scrubHTML is faster if lxml is present on the system.
- Added the word_to_text transform.
|
- Property svn:eol-style set to
native
- Property svn:keywords set to
Id
|
| Line | |
|---|
| 1 |
# $Id$ |
|---|
| 2 |
|
|---|
| 3 |
import os |
|---|
| 4 |
import sys |
|---|
| 5 |
import re |
|---|
| 6 |
from Products.PortalTransforms.interfaces import itransform |
|---|
| 7 |
from Products.PortalTransforms.transforms import office_wvware |
|---|
| 8 |
from Products.PortalTransforms.libtransforms.utils import getBodyText |
|---|
| 9 |
|
|---|
| 10 |
_re_compactwhites = re.compile(r'\s+') |
|---|
| 11 |
|
|---|
| 12 |
class document(office_wvware.document):
    """wvware-based document that exposes its converted output as text."""

    def text(self):
        """Return the body text of the converted HTML, whitespace-compacted.

        Reads ``<self.tmpdir>/<self.__name__>.html`` (presumably written by
        a prior ``convert()`` call -- verify against office_wvware), extracts
        its text with ``getBodyText`` and replaces every run of whitespace
        with a single space.
        """
        html_path = os.path.join(self.tmpdir, self.__name__ + '.html')
        htmlfile = open(html_path)
        # Close the handle even when getBodyText raises. try/finally is used
        # instead of ``with`` so the module stays valid on old interpreters.
        try:
            text = getBodyText(htmlfile)
        finally:
            htmlfile.close()
        return _re_compactwhites.sub(' ', text)
|---|
| 19 |
|
|---|
| 20 |
|
|---|
| 21 |
class word_to_text:
    """Transform converting 'application/msword' data to 'text/plain'."""

    __implements__ = itransform

    __name__ = 'word_to_text'
    inputs = ('application/msword',)
    output = 'text/plain'

    def name(self):
        """Return the name of this transform."""
        return self.__name__

    def convert(self, data, cache, **kwargs):
        """Convert Word ``data`` to text and store the result in ``cache``.

        ``kwargs['filename']`` may carry the original file name; only its
        base name is used, and 'unknown.doc' is substituted when it is
        missing or empty.

        Returns ``cache`` after calling ``cache.setData`` with the text.
        """
        orig_file = os.path.basename(kwargs.get('filename') or 'unknown.doc')

        doc = document(orig_file, data)
        # Always remove the working directory, even if the conversion or
        # the text extraction fails; otherwise it would be left behind.
        try:
            doc.convert()
            text = doc.text()
        finally:
            doc.cleanDir(doc.tmpdir)

        cache.setData(text)
        return cache
|---|
| 42 |
|
|---|
| 43 |
def register():
    """Create and return a new word_to_text transform instance."""
    transform = word_to_text()
    return transform
|---|