Various unicode and backslash escape cleanups

* Do backslash escape parsing in parse_kv() [previously this was done in
  the copy module, purely for newlines in its content param]
* Make parse_kv always return unicode
* Add bandaid to transform args to unicode until we can fix things
  calling parse_kv to always send it unicode.
* Make split_args deal with unicode internally.  Warning, no bandaid for
  things calling split_args without giving it unicode (shouldn't matter
  as dealt with str internally before)
* Fix copy and unarchive action plugins to not use setdefaultencoding
* Remove escaping from copy (it was broken and made content into latin-1
  sometimes). escaping is now in parse_kv.
* Expect that content is now a unicode string so transform to bytes just
  before writing to the file.
* Add initial unittests for split_args and parse_kv.  4 failing tests
  because split_args is injecting extra newlines.
This commit is contained in:
Toshio Kuratomi 2015-03-30 19:19:34 -07:00
commit 43c1a97447
4 changed files with 143 additions and 44 deletions

View file

@ -19,6 +19,27 @@
from __future__ import (absolute_import, division, print_function)
__metaclass__ = type
import re
import codecs
# Decode escapes adapted from rspeer's answer here:
# http://stackoverflow.com/questions/4020539/process-escape-sequences-in-a-string-in-python
_HEXCHAR = '[a-fA-F0-9]'
_ESCAPE_SEQUENCE_RE = re.compile(r'''
( \\U{0} # 8-digit hex escapes
| \\u{1} # 4-digit hex escapes
| \\x{2} # 2-digit hex escapes
| \\[0-7]{{1,3}} # Octal escapes
| \\N\{{[^}}]+\}} # Unicode characters by name
| \\[\\'"abfnrtv] # Single-character escapes
)'''.format(_HEXCHAR*8, _HEXCHAR*4, _HEXCHAR*2), re.UNICODE | re.VERBOSE)
def _decode_escapes(s):
def decode_match(match):
return codecs.decode(match.group(0), 'unicode-escape')
return _ESCAPE_SEQUENCE_RE.sub(decode_match, s)
def parse_kv(args, check_raw=False):
'''
Convert a string of key/value items to a dict. If any free-form params
@ -27,6 +48,10 @@ def parse_kv(args, check_raw=False):
they will simply be ignored.
'''
### FIXME: args should already be a unicode string
from ansible.utils.unicode import to_unicode
args = to_unicode(args, nonstring='passthru')
options = {}
if args is not None:
try:
@ -39,6 +64,7 @@ def parse_kv(args, check_raw=False):
raw_params = []
for x in vargs:
x = _decode_escapes(x)
if "=" in x:
pos = 0
try:
@ -72,7 +98,7 @@ def parse_kv(args, check_raw=False):
# recombine the free-form params, if any were found, and assign
# them to a special option for use later by the shell/command module
if len(raw_params) > 0:
options['_raw_params'] = ' '.join(raw_params)
options[u'_raw_params'] = ' '.join(raw_params)
return options
@ -126,17 +152,11 @@ def split_args(args):
'''
# the list of params parsed out of the arg string
# this is going to be the result value when we are donei
# this is going to be the result value when we are done
params = []
# here we encode the args, so we have a uniform charset to
# work with, and split on white space
# Initial split on white space
args = args.strip()
try:
args = args.encode('utf-8')
do_decode = True
except UnicodeDecodeError:
do_decode = False
items = args.strip().split('\n')
# iterate over the tokens, and reassemble any that may have been
@ -242,10 +262,6 @@ def split_args(args):
if print_depth or block_depth or comment_depth or inside_quotes:
raise Exception("error while splitting arguments, either an unbalanced jinja2 block or quotes")
# finally, we decode each param back to the unicode it was in the arg string
if do_decode:
params = [x.decode('utf-8') for x in params]
return params
def is_quoted(data):

View file

@ -30,18 +30,7 @@ from ansible import constants as C
from ansible.plugins.action import ActionBase
from ansible.utils.boolean import boolean
from ansible.utils.hashing import checksum
### FIXME: Find a different way to fix 3518 as sys.defaultencoding() breaks
# the python interpreter in subtle ways. It appears that this does not fix
# 3518 anyway (using binary files via lookup(). Instead, it tries to fix
# utf-8 strings in the content parameter. That should be fixable by properly
# encoding or decoding the value before we write it to a file.
#
## fixes https://github.com/ansible/ansible/issues/3518
# http://mypy.pythonblogs.com/12_mypy/archive/1253_workaround_for_python_bug_ascii_codec_cant_encode_character_uxa0_in_position_111_ordinal_not_in_range128.html
#import sys
#reload(sys)
#sys.setdefaultencoding("utf8")
from ansible.utils.unicode import to_bytes
class ActionModule(ActionBase):
@ -55,16 +44,6 @@ class ActionModule(ActionBase):
raw = boolean(self._task.args.get('raw', 'no'))
force = boolean(self._task.args.get('force', 'yes'))
# content with newlines is going to be escaped to safely load in yaml
# now we need to unescape it so that the newlines are evaluated properly
# when writing the file to disk
if content:
if isinstance(content, unicode):
try:
content = content.decode('unicode-escape')
except UnicodeDecodeError:
pass
# FIXME: first available file needs to be reworked somehow...
#if (source is None and content is None and not 'first_available_file' in inject) or dest is None:
# result=dict(failed=True, msg="src (or content) and dest are required")
@ -86,7 +65,7 @@ class ActionModule(ActionBase):
try:
# If content comes to us as a dict it should be decoded json.
# We need to encode it back into a string to write it out.
if type(content) is dict:
if isinstance(content, dict):
content_tempfile = self._create_content_tempfile(json.dumps(content))
else:
content_tempfile = self._create_content_tempfile(content)
@ -316,7 +295,8 @@ class ActionModule(ActionBase):
def _create_content_tempfile(self, content):
''' Create a tempfile containing defined content '''
fd, content_tempfile = tempfile.mkstemp()
f = os.fdopen(fd, 'w')
f = os.fdopen(fd, 'wb')
content = to_bytes(content)
try:
f.write(content)
except Exception, err:

View file

@ -17,16 +17,10 @@
# along with Ansible. If not, see <http://www.gnu.org/licenses/>.
import os
import pipes
from ansible.plugins.action import ActionBase
## fixes https://github.com/ansible/ansible/issues/3518
# http://mypy.pythonblogs.com/12_mypy/archive/1253_workaround_for_python_bug_ascii_codec_cant_encode_character_uxa0_in_position_111_ordinal_not_in_range128.html
import sys
reload(sys)
sys.setdefaultencoding("utf8")
import pipes
class ActionModule(ActionBase):