Skip to content
Snippets Groups Projects
Commit d838e449 authored by Roman Imankulov's avatar Roman Imankulov
Browse files

Ensure babel i18n extractor works properly with non-ascii input

If mako templates contain something like "_('Köln')", the babel extractor converts
it to pure ASCII, so the resulting .po file contains "K\xf6ln". Not all
translation tools and translations are ready for this kind of escape sequence.

Babel allows message ids to be non-ascii, the plugin just has to return Unicode
objects instead of ASCII strings (and that's exactly how Babel built-in Python
and JavaScript extractors work).

This fix ensures mako extractor doesn't escape non-ascii symbols, works well
both for Unicode and non-unicode input (there is a test for cp1251 encoding),
and also provides a workaround for babel charset detector python-babel/babel#274.
parent 72e95faf
No related branches found
No related tags found
No related merge requests found
...@@ -16,6 +16,7 @@ class MessageExtractor(object): ...@@ -16,6 +16,7 @@ class MessageExtractor(object):
def extract_nodes(self, nodes): def extract_nodes(self, nodes):
translator_comments = [] translator_comments = []
in_translator_comments = False in_translator_comments = False
input_encoding = self.config['encoding'] or 'ascii'
comment_tags = list( comment_tags = list(
filter(None, re.split(r'\s+', self.config['comment-tags']))) filter(None, re.split(r'\s+', self.config['comment-tags'])))
...@@ -76,13 +77,18 @@ class MessageExtractor(object): ...@@ -76,13 +77,18 @@ class MessageExtractor(object):
comment[1] for comment in translator_comments] comment[1] for comment in translator_comments]
if isinstance(code, compat.text_type): if isinstance(code, compat.text_type):
code = code.encode('ascii', 'backslashreplace') code = code.encode(input_encoding, 'backslashreplace')
used_translator_comments = False used_translator_comments = False
code = compat.byte_buffer(code) # We add extra newline to work around a pybabel bug
# (see python-babel/babel#274, parse_encoding dies if the first
# input string of the input is non-ascii)
# Also, because we added it, we have to subtract one from
# node.lineno
code = compat.byte_buffer(compat.b('\n') + code)
for message in self.process_python( for message in self.process_python(
code, node.lineno, translator_strings): code, node.lineno - 1, translator_strings):
yield message yield message
used_translator_comments = True used_translator_comments = True
......
...@@ -78,3 +78,16 @@ class ExtractMakoTestCase(TemplateTest): ...@@ -78,3 +78,16 @@ class ExtractMakoTestCase(TemplateTest):
(99, '_', 'No action at a distance.', []), (99, '_', 'No action at a distance.', []),
] ]
self.assertEqual(expected, messages) self.assertEqual(expected, messages)
@skip()
def test_extract_utf8(self):
mako_tmpl = open(os.path.join(template_base, 'gettext_utf8.mako'), 'rb')
message = next(extract(mako_tmpl, {'_', None}, [], {'encoding': 'utf-8'}))
assert message == (1, '_', u'K\xf6ln', [])
@skip()
def test_extract_cp1251(self):
mako_tmpl = open(os.path.join(template_base, 'gettext_cp1251.mako'), 'rb')
message = next(extract(mako_tmpl, {'_', None}, [], {'encoding': 'cp1251'}))
# "test" in Russian. File encoding is cp1251 (aka "windows-1251")
assert message == (1, '_', u'\u0442\u0435\u0441\u0442', [])
${_("")}
${_("Köln")}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment