root/CPS3/products/CPSUtil/trunk/text.py

Revision 52633, 5.3 kB (checked in by madarche, 1 year ago)

- Factorized the toLatin9 method in CPSUtil.text.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
Line 
1 # -*- coding: ISO-8859-15 -*-
2 # (C) Copyright 2005-2008 Nuxeo SAS <http://nuxeo.com>
3 # Authors:
4 # M.-A. Darche <madarche@nuxeo.com>
5 #
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License version 2 as published
8 # by the Free Software Foundation.
9 #
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 # GNU General Public License for more details.
14 #
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
18 # 02111-1307, USA.
19 #
20 # $Id$
21 """Utility functions for manipulating text.
22 """
23
24 import string, codecs
25
26 from AccessControl import ModuleSecurityInfo
27
28 ACCENTED_CHARS_TRANSLATIONS = string.maketrans(
29     r"""ÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöøùúûüýÿ""",
30     r"""AAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy""")
31
# Declare toAscii public so that restricted code is allowed to import it
# from this module.
ModuleSecurityInfo('Products.CPSUtil.text').declarePublic('toAscii')
def toAscii(s):
    """Return `s` with its accented and special characters turned into ASCII.

    A unicode `s` is first encoded to ISO-8859-15, the characters that do not
    exist in that charset being replaced by '?'. Accented letters are then
    folded through ACCENTED_CHARS_TRANSLATIONS, and finally the ligatures and
    the sharp s are spelled out with two letters.

    >>> toAscii('caf\xe9')
    'cafe'
    >>> toAscii(u'caf\xe9-\u1234')
    'cafe-?'
    """
    if isinstance(s, unicode):
        s = s.encode('iso-8859-15', 'replace')
    s = s.translate(ACCENTED_CHARS_TRANSLATIONS)
    # One character becomes two here, which a translation table cannot do.
    two_letter_spellings = (
        ('Æ', 'AE'),
        ('æ', 'ae'),
        ('Œ', 'OE'),
        ('œ', 'oe'),
        ('ß', 'ss'),
    )
    for special_char, spelling in two_letter_spellings:
        s = s.replace(special_char, spelling)
    return s
51
# Declare toLatin9 public so that restricted code is allowed to import it
# from this module.
ModuleSecurityInfo('Products.CPSUtil.text').declarePublic('toLatin9')
def toLatin9(obj):
    """Encode unicode text to ISO-8859-15 (latin9).

    - A unicode object is returned encoded.
    - For a dictionary, every unicode value is replaced in place by its
      encoded form and the dictionary itself is returned; keys and values of
      other types are left untouched.
    - Any other object is returned unchanged.
    """
    if isinstance(obj, unicode):
        return _unicodeToLatin9(obj)
    if isinstance(obj, dict):
        for key, value in obj.items():
            if isinstance(value, unicode):
                obj[key] = _unicodeToLatin9(value)
    return obj
63
64 def _unicodeToLatin9(s):
65     if s is None:
66         return None
67     else:
68         # Replace RIGHT SINGLE QUOTATION MARK (unicode only)
69         # by the APOSTROPHE (ascii and latin1).
70         # cf. http://www.cl.cam.ac.uk/~mgk25/ucs/quotes.html
71         s = s.replace(u'\u2019', u'\u0027')
72         #&#8217;
73         return s.encode('iso-8859-15', 'ignore')
74
# Declare truncateText public so that restricted code is allowed to import it
# from this module.
ModuleSecurityInfo('Products.CPSUtil.text').declarePublic('truncateText')
def truncateText(text, size=25):
    """Shorten `text` to at most `size` characters by cutting out its middle.

    The removed middle part is replaced by '...', the same number of
    characters being kept on both sides:

    >>> truncateText('abcdefghijkl', 9)
    'abc...jkl'

    `text` is returned unchanged when it is None or when it already fits in
    `size` characters. When `size` is too small to hold at least one character
    on each side of the '...', the text is simply cut after `size` characters.
    """
    if text is None or len(text) <= size:
        return text
    # Explicit floor division: the result is used as a slice bound.
    mid_size = (size - 3) // 2
    if mid_size <= 0:
        # No room for "head...tail". Slicing with a zero mid_size would be
        # wrong anyway, since text[-0:] is the whole text.
        return text[:max(size, 0)]
    return text[:mid_size] + '...' + text[-mid_size:]
83
84
# This table gives rough latin9 equivalents for Unicode characters coming from
# the MS Windows western charset (cp1252) that cannot be directly encoded to
# latin9.
# see also http://openweb.eu.org/articles/caracteres_illegaux/

win2latin9_approx = { # the codes in the comments below are cp1252 codes
u'\u201a' : u',',    # 0x82 lower single quote
u'\u201e' : u'"',    # 0x84 lower double quote (german?)
u'\u02c6' : u'^',    # 0x88 small upper ^
u'\u2039' : u'<',    # 0x8b small <
u'\u2018' : u'`',    # 0x91 single curly backquote
u'\u2019' : u"'",    # 0x92 single curly quote
u'\u201c' : u'"',    # 0x93 double curly backquote
u'\u201d' : u'"',    # 0x94 double curly quote
# The en dash must become a real hyphen: u'\xad' is SOFT HYPHEN in latin9,
# which is normally not displayed at all.
u'\u2013' : u'-',    # 0x96 small dash
u'\u2014' : u'-',    # 0x97 dash
u'\u02dc' : u'~',    # 0x98 upper tilda
u'\u203a' : u'>',    # 0x9b small >
u'\xb4'   : u"'",    # 0xb4 almost horizontal single quote
u'\u2026' : u'...',  # 0x85 dots in one char
u'\u2022' : u'.',    # 0x95 bullet
}
107
def winToLatin9_errors(exc):
    """Encoding error handler approximating characters that latin9 lacks.

    This is mostly useful for Unicode objects obtained from MS Windows
    Western Europe strings (codec identifier 'cp1252').

    The handler is registered under the 'latin9_fallback' name at import time
    of the present module.

    `exc` is the encoding error raised by the codec. The return value is the
    pair expected from an error handler: the replacement text and the position
    at which encoding should resume.

    An example going all the way from a Windows string

    >>> wintext = 'L\x92apostrophe est jolie \x85'
    >>> unitext = wintext.decode('cp1252')
    >>> unitext
    u'L\u2019apostrophe est jolie \u2026'
    >>> unitext.encode('iso-8859-15', 'latin9_fallback')
    "L'apostrophe est jolie ..."

    >>> u'L\u2019apostrophe'.encode('iso-8859-15', 'latin9_fallback')
    "L'apostrophe"

    >>> u'1 maps to 3\u2026 See ?'.encode('iso-8859-15', 'latin9_fallback')
    '1 maps to 3... See ?'

    If we can't find an approximate equivalent, we fallback to
    xmlcharrefreplace, that all modern browsers can handle:

    >>> u'\u2032'.encode('iso-8859-15', 'latin9_fallback')
    '&#8242;'


    xmlcharrefreplace is applied to a whole block of characters that cannot
    be encoded to latin9 as soon as one character of the block cannot be
    approximated.
    >>> u'ab\u2032\u2026cd\u2014'.encode('iso-8859-15', 'latin9_fallback')
    'ab&#8242;&#8230;cd-'

    See http://docs.python.org/lib/module-codecs.html#l2h-984 for more on
    Unicode.encode error handlers.
    """
    # exc.args[1] is the string being encoded, and start:end delimits the
    # run of characters that the codec could not encode.
    rejected_chars = exc.args[1][exc.start:exc.end]
    try:
        approximations = [win2latin9_approx[char] for char in rejected_chars]
    except KeyError:
        # A single unknown character hands the whole run over to the
        # standard handler.
        return codecs.lookup_error('xmlcharrefreplace')(exc)
    # One character may have become several ones, hence the join.
    return u''.join(approximations), exc.end
156
157
158
## Register the fallback under the 'latin9_fallback' name, so that this name
## can be given as the error handling scheme of an encode() call.
codecs.register_error('latin9_fallback', winToLatin9_errors)
Note: See TracBrowser for help on using the browser.