From 43c1a9744765eebfb9eaf9113336d552cfc9096b Mon Sep 17 00:00:00 2001 From: Toshio Kuratomi Date: Mon, 30 Mar 2015 19:19:34 -0700 Subject: [PATCH] Various unicode and backslash escape cleanups * Do backslash escape parsing in parse_kv() [was being done in the copy module purely for newlines in the copy module's content param before] * Make parse_kv always return unicode * Add bandaid to transform args to unicode until we can fix things calling parse_kv to always send it unicode. * Make split_args deal with unicode internally. Warning, no bandaid for things calling split_args without giving it unicode (shouldn't matter as dealt with str internally before) * Fix copy and unarchive action plugins to not use setdefaultencoding * Remove escaping from copy (it was broken and made content into latin-1 sometimes). Escaping is now in parse_kv. * Expect that content is now a unicode string so transform to bytes just before writing to the file. * Add initial unittests for split_args and parse_kv. 4 failing tests, because split_args is injecting extra newlines. 
--- v2/ansible/parsing/splitter.py | 42 +++++++--- v2/ansible/plugins/action/copy.py | 28 +------ v2/ansible/plugins/action/unarchive.py | 8 +- v2/test/parsing/test_splitter.py | 109 +++++++++++++++++++++++++ 4 files changed, 143 insertions(+), 44 deletions(-) create mode 100644 v2/test/parsing/test_splitter.py diff --git a/v2/ansible/parsing/splitter.py b/v2/ansible/parsing/splitter.py index 9705baf169..4af1c7b171 100644 --- a/v2/ansible/parsing/splitter.py +++ b/v2/ansible/parsing/splitter.py @@ -19,6 +19,27 @@ from __future__ import (absolute_import, division, print_function) __metaclass__ = type +import re +import codecs + +# Decode escapes adapted from rspeer's answer here: +# http://stackoverflow.com/questions/4020539/process-escape-sequences-in-a-string-in-python +_HEXCHAR = '[a-fA-F0-9]' +_ESCAPE_SEQUENCE_RE = re.compile(r''' + ( \\U{0} # 8-digit hex escapes + | \\u{1} # 4-digit hex escapes + | \\x{2} # 2-digit hex escapes + | \\[0-7]{{1,3}} # Octal escapes + | \\N\{{[^}}]+\}} # Unicode characters by name + | \\[\\'"abfnrtv] # Single-character escapes + )'''.format(_HEXCHAR*8, _HEXCHAR*4, _HEXCHAR*2), re.UNICODE | re.VERBOSE) + +def _decode_escapes(s): + def decode_match(match): + return codecs.decode(match.group(0), 'unicode-escape') + + return _ESCAPE_SEQUENCE_RE.sub(decode_match, s) + def parse_kv(args, check_raw=False): ''' Convert a string of key/value items to a dict. If any free-form params @@ -27,6 +48,10 @@ def parse_kv(args, check_raw=False): they will simply be ignored. 
''' + ### FIXME: args should already be a unicode string + from ansible.utils.unicode import to_unicode + args = to_unicode(args, nonstring='passthru') + options = {} if args is not None: try: @@ -39,6 +64,7 @@ def parse_kv(args, check_raw=False): raw_params = [] for x in vargs: + x = _decode_escapes(x) if "=" in x: pos = 0 try: @@ -72,7 +98,7 @@ def parse_kv(args, check_raw=False): # recombine the free-form params, if any were found, and assign # them to a special option for use later by the shell/command module if len(raw_params) > 0: - options['_raw_params'] = ' '.join(raw_params) + options[u'_raw_params'] = ' '.join(raw_params) return options @@ -126,17 +152,11 @@ def split_args(args): ''' # the list of params parsed out of the arg string - # this is going to be the result value when we are donei + # this is going to be the result value when we are done params = [] - # here we encode the args, so we have a uniform charset to - # work with, and split on white space + # Initial split on white space args = args.strip() - try: - args = args.encode('utf-8') - do_decode = True - except UnicodeDecodeError: - do_decode = False items = args.strip().split('\n') # iterate over the tokens, and reassemble any that may have been @@ -242,10 +262,6 @@ def split_args(args): if print_depth or block_depth or comment_depth or inside_quotes: raise Exception("error while splitting arguments, either an unbalanced jinja2 block or quotes") - # finally, we decode each param back to the unicode it was in the arg string - if do_decode: - params = [x.decode('utf-8') for x in params] - return params def is_quoted(data): diff --git a/v2/ansible/plugins/action/copy.py b/v2/ansible/plugins/action/copy.py index 09990743bb..89c2fde7b3 100644 --- a/v2/ansible/plugins/action/copy.py +++ b/v2/ansible/plugins/action/copy.py @@ -30,18 +30,7 @@ from ansible import constants as C from ansible.plugins.action import ActionBase from ansible.utils.boolean import boolean from ansible.utils.hashing import 
checksum - -### FIXME: Find a different way to fix 3518 as sys.defaultencoding() breaks -# the python interpreter in subtle ways. It appears that this does not fix -# 3518 anyway (using binary files via lookup(). Instead, it tries to fix -# utf-8 strings in the content parameter. That should be fixable by properly -# encoding or decoding the value before we write it to a file. -# -## fixes https://github.com/ansible/ansible/issues/3518 -# http://mypy.pythonblogs.com/12_mypy/archive/1253_workaround_for_python_bug_ascii_codec_cant_encode_character_uxa0_in_position_111_ordinal_not_in_range128.html -#import sys -#reload(sys) -#sys.setdefaultencoding("utf8") +from ansible.utils.unicode import to_bytes class ActionModule(ActionBase): @@ -55,16 +44,6 @@ class ActionModule(ActionBase): raw = boolean(self._task.args.get('raw', 'no')) force = boolean(self._task.args.get('force', 'yes')) - # content with newlines is going to be escaped to safely load in yaml - # now we need to unescape it so that the newlines are evaluated properly - # when writing the file to disk - if content: - if isinstance(content, unicode): - try: - content = content.decode('unicode-escape') - except UnicodeDecodeError: - pass - # FIXME: first available file needs to be reworked somehow... #if (source is None and content is None and not 'first_available_file' in inject) or dest is None: # result=dict(failed=True, msg="src (or content) and dest are required") @@ -86,7 +65,7 @@ class ActionModule(ActionBase): try: # If content comes to us as a dict it should be decoded json. # We need to encode it back into a string to write it out. 
- if type(content) is dict: + if isinstance(content, dict): content_tempfile = self._create_content_tempfile(json.dumps(content)) else: content_tempfile = self._create_content_tempfile(content) @@ -316,7 +295,8 @@ class ActionModule(ActionBase): def _create_content_tempfile(self, content): ''' Create a tempfile containing defined content ''' fd, content_tempfile = tempfile.mkstemp() - f = os.fdopen(fd, 'w') + f = os.fdopen(fd, 'wb') + content = to_bytes(content) try: f.write(content) except Exception, err: diff --git a/v2/ansible/plugins/action/unarchive.py b/v2/ansible/plugins/action/unarchive.py index f99d7e28e6..1b6cb354f0 100644 --- a/v2/ansible/plugins/action/unarchive.py +++ b/v2/ansible/plugins/action/unarchive.py @@ -17,16 +17,10 @@ # along with Ansible. If not, see . import os +import pipes from ansible.plugins.action import ActionBase -## fixes https://github.com/ansible/ansible/issues/3518 -# http://mypy.pythonblogs.com/12_mypy/archive/1253_workaround_for_python_bug_ascii_codec_cant_encode_character_uxa0_in_position_111_ordinal_not_in_range128.html -import sys -reload(sys) -sys.setdefaultencoding("utf8") -import pipes - class ActionModule(ActionBase): diff --git a/v2/test/parsing/test_splitter.py b/v2/test/parsing/test_splitter.py new file mode 100644 index 0000000000..fc2c05d36f --- /dev/null +++ b/v2/test/parsing/test_splitter.py @@ -0,0 +1,109 @@ +# coding: utf-8 +# (c) 2015, Toshio Kuratomi +# +# This file is part of Ansible +# +# Ansible is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Ansible is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with Ansible. If not, see . + +# Make coding more python3-ish +from __future__ import (absolute_import, division, print_function) +__metaclass__ = type + +from nose import tools +from ansible.compat.tests import unittest + +from ansible.parsing.splitter import split_args, parse_kv + + +# Tests using nose's test generators cannot use unittest base class. +# http://nose.readthedocs.org/en/latest/writing_tests.html#test-generators +class TestSplitter_Gen: + SPLIT_DATA = ( + (u'a', + [u'a'], + {u'_raw_params': u'a'}), + (u'a=b', + [u'a=b'], + {u'a': u'b'}), + (u'a="foo bar"', + [u'a="foo bar"'], + {u'a': u'foo bar'}), + (u'"foo bar baz"', + [u'"foo bar baz"'], + {u'_raw_params': '"foo bar baz"'}), + (u'foo bar baz', + [u'foo', u'bar', u'baz'], + {u'_raw_params': u'foo bar baz'}), + (u'a=b c="foo bar"', + [u'a=b', u'c="foo bar"'], + {u'a': u'b', u'c': u'foo bar'}), + (u'a="echo \\"hello world\\"" b=bar', + [u'a="echo \\"hello world\\""', u'b=bar'], + {u'a': u'echo "hello world"', u'b': u'bar'}), + (u'a="multi\nline"', + [u'a="multi\nline"'], + {u'a': u'multi\nline'}), + (u'a="blank\n\nline"', + [u'a="blank\n\nline"'], + {u'a': u'blank\n\nline'}), + (u'a="blank\n\n\nlines"', + [u'a="blank\n\n\nlines"'], + {u'a': u'blank\n\n\nlines'}), + (u'a="a long\nmessage\\\nabout a thing\n"', + [u'a="a long\nmessage\\\nabout a thing\n"'], + {u'a': u'a long\nmessage\\\nabout a thing\n'}), + (u'a="multiline\nmessage1\\\n" b="multiline\nmessage2\\\n"', + [u'a="multiline\nmessage1\\\n"', u'b="multiline\nmessage2\\\n"'], + {u'a': 'multiline\nmessage1\\\n', u'b': u'multiline\nmessage2\\\n'}), + (u'a={{jinja}}', + [u'a={{jinja}}'], + {u'a': u'{{jinja}}'}), + (u'a={{ jinja }}', + [u'a={{ jinja }}'], + {u'a': u'{{ jinja }}'}), + (u'a="{{jinja}}"', + [u'a="{{jinja}}"'], + {u'a': u'{{jinja}}'}), + (u'a={{ jinja }}{{jinja2}}', + [u'a={{ jinja }}{{jinja2}}'], + {u'a': u'{{ jinja }}{{jinja2}}'}), + (u'a="{{ jinja 
}}{{jinja2}}"', + [u'a="{{ jinja }}{{jinja2}}"'], + {u'a': u'{{ jinja }}{{jinja2}}'}), + (u'a={{jinja}} b={{jinja2}}', + [u'a={{jinja}}', u'b={{jinja2}}'], + {u'a': u'{{jinja}}', u'b': u'{{jinja2}}'}), + (u'a="café eñyei"', + [u'a="café eñyei"'], + {u'a': u'café eñyei'}), + (u'a=café b=eñyei', + [u'a=café', u'b=eñyei'], + {u'a': u'café', u'b': u'eñyei'}), + ) + + def check_split_args(self, args, expected): + tools.eq_(split_args(args), expected) + + def test_split_args(self): + for datapoint in self.SPLIT_DATA: + yield self.check_split_args, datapoint[0], datapoint[1] + + def check_parse_kv(self, args, expected): + tools.eq_(parse_kv(args), expected) + + def test_parse_kv(self): + for datapoint in self.SPLIT_DATA: + try: + yield self.check_parse_kv, datapoint[0], datapoint[2] + except: pass