Various unicode and backslash escape cleanups

* Do backslash escape parsing in parse_kv() [was being done in the copy module purely for newlines in the copy module's content param before] * Make parse_kv always return unicode * Add bandaid to transform args to unicode until we can fix things calling parse_kv to always send it unicode. * Make split_args deal with unicode internally. Warning, no bandaid for things calling split_args without giving it unicode (shouldn't matter as dealt with str internally before) * Fix copy and unarchive action plugins to not use setdefaultencoding * Remove escaping from copy (it was broken and made content into latin-1 sometimes). escaping is now in parse_kv. * Expect that content is now a unicode string so transform to bytes just before writing to the file. * Add initial unittests for split_args and parse_kv. 4 failing tests.because split_args is injecting extra newlines.
2025-09-30 05:23:21 -07:00 · 2015-03-30 19:19:34 -07:00 · 2015-03-30 19:19:34 -07:00 · 43c1a97447
commit 43c1a97447
parent 1cc2135a0d
4 changed files with 143 additions and 44 deletions
--- a/v2/ansible/parsing/splitter.py
+++ b/v2/ansible/parsing/splitter.py
@ -19,6 +19,27 @@
 from __future__ import (absolute_import, division, print_function)
 __metaclass__ = type
 import re
 import codecs
 # Decode escapes adapted from rspeer's answer here:
 # http://stackoverflow.com/questions/4020539/process-escape-sequences-in-a-string-in-python
 _HEXCHAR = '[a-fA-F0-9]'
 _ESCAPE_SEQUENCE_RE = re.compile(r'''
    ( \\U{0}           # 8-digit hex escapes
    | \\u{1}           # 4-digit hex escapes
    | \\x{2}           # 2-digit hex escapes
    | \\[0-7]{{1,3}}   # Octal escapes
    | \\N\{{[^}}]+\}}  # Unicode characters by name
    | \\[\\'"abfnrtv]  # Single-character escapes
    )'''.format(_HEXCHAR*8, _HEXCHAR*4, _HEXCHAR*2), re.UNICODE | re.VERBOSE)
 def _decode_escapes(s):
    def decode_match(match):
        return codecs.decode(match.group(0), 'unicode-escape')
    return _ESCAPE_SEQUENCE_RE.sub(decode_match, s)
 def parse_kv(args, check_raw=False):
    '''
    Convert a string of key/value items to a dict. If any free-form params
@ -27,6 +48,10 @@ def parse_kv(args, check_raw=False):
    they will simply be ignored.
    '''
    ### FIXME: args should already be a unicode string
    from ansible.utils.unicode import to_unicode
    args = to_unicode(args, nonstring='passthru')
    options = {}
    if args is not None:
        try:
@ -39,6 +64,7 @@ def parse_kv(args, check_raw=False):
        raw_params = []
        for x in vargs:
            x = _decode_escapes(x)
            if "=" in x:
                pos = 0
                try:
@ -72,7 +98,7 @@ def parse_kv(args, check_raw=False):
        # recombine the free-form params, if any were found, and assign
        # them to a special option for use later by the shell/command module
        if len(raw_params) > 0:
-            options['_raw_params'] = ' '.join(raw_params)
+            options[u'_raw_params'] = ' '.join(raw_params)
    return options
@ -126,17 +152,11 @@ def split_args(args):
    '''
    # the list of params parsed out of the arg string
-    # this is going to be the result value when we are donei
+    # this is going to be the result value when we are done
    params = []
-    # here we encode the args, so we have a uniform charset to
+    # Initial split on white space
    # work with, and split on white space
    args = args.strip()
    try:
        args = args.encode('utf-8')
        do_decode = True
    except UnicodeDecodeError:
        do_decode = False
    items = args.strip().split('\n')
    # iterate over the tokens, and reassemble any that may have been
@ -242,10 +262,6 @@ def split_args(args):
    if print_depth or block_depth or comment_depth or inside_quotes:
        raise Exception("error while splitting arguments, either an unbalanced jinja2 block or quotes")
    # finally, we decode each param back to the unicode it was in the arg string
    if do_decode:
        params = [x.decode('utf-8') for x in params]
    return params
 def is_quoted(data):
--- a/v2/ansible/plugins/action/copy.py
+++ b/v2/ansible/plugins/action/copy.py
@ -30,18 +30,7 @@ from ansible import constants as C
 from ansible.plugins.action import ActionBase
 from ansible.utils.boolean import boolean
 from ansible.utils.hashing import checksum
-
+from ansible.utils.unicode import to_bytes
 ### FIXME: Find a different way to fix 3518 as sys.defaultencoding() breaks
 # the python interpreter in subtle ways.  It appears that this does not fix
 # 3518 anyway (using binary files via lookup().  Instead, it tries to fix
 # utf-8 strings in the content parameter.  That should be fixable by properly
 # encoding or decoding the value before we write it to a file.
 #
 ## fixes https://github.com/ansible/ansible/issues/3518
 # http://mypy.pythonblogs.com/12_mypy/archive/1253_workaround_for_python_bug_ascii_codec_cant_encode_character_uxa0_in_position_111_ordinal_not_in_range128.html
 #import sys
 #reload(sys)
 #sys.setdefaultencoding("utf8")
 class ActionModule(ActionBase):
@ -55,16 +44,6 @@ class ActionModule(ActionBase):
        raw     = boolean(self._task.args.get('raw', 'no'))
        force   = boolean(self._task.args.get('force', 'yes'))
        # content with newlines is going to be escaped to safely load in yaml
        # now we need to unescape it so that the newlines are evaluated properly
        # when writing the file to disk
        if content:
            if isinstance(content, unicode):
                try:
                    content = content.decode('unicode-escape')
                except UnicodeDecodeError:
                    pass
        # FIXME: first available file needs to be reworked somehow...
        #if (source is None and content is None and not 'first_available_file' in inject) or dest is None:
        #    result=dict(failed=True, msg="src (or content) and dest are required")
@ -86,7 +65,7 @@ class ActionModule(ActionBase):
            try:
                # If content comes to us as a dict it should be decoded json.
                # We need to encode it back into a string to write it out.
-                if type(content) is dict:
+                if isinstance(content, dict):
                    content_tempfile = self._create_content_tempfile(json.dumps(content))
                else:
                    content_tempfile = self._create_content_tempfile(content)
@ -316,7 +295,8 @@ class ActionModule(ActionBase):
    def _create_content_tempfile(self, content):
        ''' Create a tempfile containing defined content '''
        fd, content_tempfile = tempfile.mkstemp()
-        f = os.fdopen(fd, 'w')
+        f = os.fdopen(fd, 'wb')
        content = to_bytes(content)
        try:
            f.write(content)
        except Exception, err:
--- a/v2/ansible/plugins/action/unarchive.py
+++ b/v2/ansible/plugins/action/unarchive.py
@ -17,16 +17,10 @@
 # along with Ansible.  If not, see <http://www.gnu.org/licenses/>.
 import os
 import pipes
 from ansible.plugins.action import ActionBase
 ## fixes https://github.com/ansible/ansible/issues/3518
 # http://mypy.pythonblogs.com/12_mypy/archive/1253_workaround_for_python_bug_ascii_codec_cant_encode_character_uxa0_in_position_111_ordinal_not_in_range128.html
 import sys
 reload(sys)
 sys.setdefaultencoding("utf8")
 import pipes
 class ActionModule(ActionBase):
--- a/v2/test/parsing/test_splitter.py
+++ b/v2/test/parsing/test_splitter.py
@ -0,0 +1,109 @@
 # coding: utf-8
 # (c) 2015, Toshio Kuratomi <tkuratomi@ansible.com>
 #
 # This file is part of Ansible
 #
 # Ansible is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # Ansible is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with Ansible.  If not, see <http://www.gnu.org/licenses/>.
 # Make coding more python3-ish
 from __future__ import (absolute_import, division, print_function)
 __metaclass__ = type
 from nose import tools
 from ansible.compat.tests import unittest
 from ansible.parsing.splitter import split_args, parse_kv
 # Tests using nose's test generators cannot use unittest base class.
 # http://nose.readthedocs.org/en/latest/writing_tests.html#test-generators
 class TestSplitter_Gen:
    SPLIT_DATA = (
            (u'a',
                [u'a'],
                {u'_raw_params': u'a'}),
            (u'a=b',
                [u'a=b'],
                {u'a': u'b'}),
            (u'a="foo bar"',
                [u'a="foo bar"'],
                {u'a': u'foo bar'}),
            (u'"foo bar baz"',
                [u'"foo bar baz"'],
                {u'_raw_params': '"foo bar baz"'}),
            (u'foo bar baz',
                [u'foo', u'bar', u'baz'],
                {u'_raw_params': u'foo bar baz'}),
            (u'a=b c="foo bar"',
                [u'a=b', u'c="foo bar"'],
                {u'a': u'b', u'c': u'foo bar'}),
            (u'a="echo \\"hello world\\"" b=bar',
                [u'a="echo \\"hello world\\""', u'b=bar'],
                {u'a': u'echo "hello world"', u'b': u'bar'}),
            (u'a="multi\nline"',
                [u'a="multi\nline"'],
                {u'a': u'multi\nline'}),
            (u'a="blank\n\nline"',
                [u'a="blank\n\nline"'],
                {u'a': u'blank\n\nline'}),
            (u'a="blank\n\n\nlines"',
                [u'a="blank\n\n\nlines"'],
                {u'a': u'blank\n\n\nlines'}),
            (u'a="a long\nmessage\\\nabout a thing\n"',
                [u'a="a long\nmessage\\\nabout a thing\n"'],
                {u'a': u'a long\nmessage\\\nabout a thing\n'}),
            (u'a="multiline\nmessage1\\\n" b="multiline\nmessage2\\\n"',
                [u'a="multiline\nmessage1\\\n"', u'b="multiline\nmessage2\\\n"'],
                {u'a': 'multiline\nmessage1\\\n', u'b': u'multiline\nmessage2\\\n'}),
            (u'a={{jinja}}',
                [u'a={{jinja}}'],
                {u'a': u'{{jinja}}'}),
            (u'a={{ jinja }}',
                [u'a={{ jinja }}'],
                {u'a': u'{{ jinja }}'}),
            (u'a="{{jinja}}"',
                [u'a="{{jinja}}"'],
                {u'a': u'{{jinja}}'}),
            (u'a={{ jinja }}{{jinja2}}',
                [u'a={{ jinja }}{{jinja2}}'],
                {u'a': u'{{ jinja }}{{jinja2}}'}),
            (u'a="{{ jinja }}{{jinja2}}"',
                [u'a="{{ jinja }}{{jinja2}}"'],
                {u'a': u'{{ jinja }}{{jinja2}}'}),
            (u'a={{jinja}} b={{jinja2}}',
                [u'a={{jinja}}', u'b={{jinja2}}'],
                {u'a': u'{{jinja}}', u'b': u'{{jinja2}}'}),
            (u'a="café eñyei"',
                [u'a="café eñyei"'],
                {u'a': u'café eñyei'}),
            (u'a=café b=eñyei',
                [u'a=café', u'b=eñyei'],
                {u'a': u'café', u'b': u'eñyei'}),
            )
    def check_split_args(self, args, expected):
        tools.eq_(split_args(args), expected)
    def test_split_args(self):
        for datapoint in self.SPLIT_DATA:
            yield self.check_split_args, datapoint[0], datapoint[1]
    def check_parse_kv(self, args, expected):
        tools.eq_(parse_kv(args), expected)
    def test_parse_kv(self):
        for datapoint in self.SPLIT_DATA:
            try:
                yield self.check_parse_kv, datapoint[0], datapoint[2]
            except: pass