Various unicode and backslash escape cleanups

* Do backslash escape parsing in parse_kv() [was being done in the copy module purely for newlines in the copy module's content param before] * Make parse_kv always return unicode * Add bandaid to transform args to unicode until we can fix things calling parse_kv to always send it unicode. * Make split_args deal with unicode internally. Warning, no bandaid for things calling split_args without giving it unicode (shouldn't matter as dealt with str internally before) * Fix copy and unarchive action plugins to not use setdefaultencoding * Remove escaping from copy (it was broken and made content into latin-1 sometimes). Escaping is now in parse_kv. * Expect that content is now a unicode string so transform to bytes just before writing to the file. * Add initial unit tests for split_args and parse_kv. 4 failing tests because split_args is injecting extra newlines.
2025-07-23 05:10:22 -07:00 · 2015-03-30 19:19:34 -07:00 · 2015-03-30 19:19:34 -07:00 · 43c1a97447
commit 43c1a97447
parent 1cc2135a0d
4 changed files with 143 additions and 44 deletions
--- a/v2/ansible/parsing/splitter.py
+++ b/v2/ansible/parsing/splitter.py
@ -19,6 +19,27 @@
 from __future__ import (absolute_import, division, print_function)
 __metaclass__ = type

+import re
+import codecs
+
+# Decode escapes adapted from rspeer's answer here:
+# http://stackoverflow.com/questions/4020539/process-escape-sequences-in-a-string-in-python
+_HEXCHAR = '[a-fA-F0-9]'
+_ESCAPE_SEQUENCE_RE = re.compile(r'''
+    ( \\U{0}           # 8-digit hex escapes
+    | \\u{1}           # 4-digit hex escapes
+    | \\x{2}           # 2-digit hex escapes
+    | \\[0-7]{{1,3}}   # Octal escapes
+    | \\N\{{[^}}]+\}}  # Unicode characters by name
+    | \\[\\'"abfnrtv]  # Single-character escapes
+    )'''.format(_HEXCHAR*8, _HEXCHAR*4, _HEXCHAR*2), re.UNICODE | re.VERBOSE)
+
+def _decode_escapes(s):
+    def decode_match(match):
+        return codecs.decode(match.group(0), 'unicode-escape')
+
+    return _ESCAPE_SEQUENCE_RE.sub(decode_match, s)
+
 def parse_kv(args, check_raw=False):
    '''
    Convert a string of key/value items to a dict. If any free-form params
@ -27,6 +48,10 @@ def parse_kv(args, check_raw=False):
    they will simply be ignored.
    '''

+    ### FIXME: args should already be a unicode string
+    from ansible.utils.unicode import to_unicode
+    args = to_unicode(args, nonstring='passthru')
+
    options = {}
    if args is not None:
        try:
@ -39,6 +64,7 @@ def parse_kv(args, check_raw=False):

        raw_params = []
        for x in vargs:
+            x = _decode_escapes(x)
            if "=" in x:
                pos = 0
                try:
@ -72,7 +98,7 @@ def parse_kv(args, check_raw=False):
        # recombine the free-form params, if any were found, and assign
        # them to a special option for use later by the shell/command module
        if len(raw_params) > 0:
-            options['_raw_params'] = ' '.join(raw_params)
+            options[u'_raw_params'] = ' '.join(raw_params)

    return options

@ -126,17 +152,11 @@ def split_args(args):
    '''

    # the list of params parsed out of the arg string
-    # this is going to be the result value when we are donei
+    # this is going to be the result value when we are done
    params = []

-    # here we encode the args, so we have a uniform charset to
-    # work with, and split on white space
+    # Initial split on white space
    args = args.strip()
-    try:
-        args = args.encode('utf-8')
-        do_decode = True
-    except UnicodeDecodeError:
-        do_decode = False
    items = args.strip().split('\n')

    # iterate over the tokens, and reassemble any that may have been
@ -242,10 +262,6 @@ def split_args(args):
    if print_depth or block_depth or comment_depth or inside_quotes:
        raise Exception("error while splitting arguments, either an unbalanced jinja2 block or quotes")

-    # finally, we decode each param back to the unicode it was in the arg string
-    if do_decode:
-        params = [x.decode('utf-8') for x in params]
-
    return params

 def is_quoted(data):