From 9e3848e1a4d0067b8179914eae63910a7a1868fa Mon Sep 17 00:00:00 2001 From: Christof Schulze Date: Sat, 15 Feb 2020 22:32:24 +0100 Subject: [PATCH] Cleanup Python 2 workarounds closing #2 --- parsing/__init__.py | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/parsing/__init__.py b/parsing/__init__.py index 2a4007c..85731b7 100644 --- a/parsing/__init__.py +++ b/parsing/__init__.py @@ -20,8 +20,11 @@ BOOLEANS_TRUE = frozenset(('y', 'yes', 'on', '1', 'true', 't', 1, 1.0, True)) BOOLEANS_FALSE = frozenset(('n', 'no', 'off', '0', 'false', 'f', 0, 0.0, False)) BOOLEANS = BOOLEANS_TRUE.union(BOOLEANS_FALSE) +# The error handler to use if the byte string is not decodable using the specified encoding. +decoding_error = 'surrogateescape' -def to_text(obj, encoding='utf-8', nonstring: str='simplerepr'): + +def to_text(obj, encoding='utf-8', nonstring: str = 'simplerepr'): """ Make sure that a string is a text string @@ -49,13 +52,10 @@ def to_text(obj, encoding='utf-8', nonstring: str='simplerepr'): if isinstance(obj, str): return obj - # The error handler to use if the byte string is not decodable using the specified encoding. - errors = 'surrogateescape' - if isinstance(obj, bytes): # Note: We don't need special handling for surrogateescape because # all bytes will either be made into surrogates or are valid to decode. - return obj.decode(encoding, errors) + return obj.decode(encoding, decoding_error) # Note: We do these last even though we have to call to_text again on the # value because we're optimizing the common case @@ -78,7 +78,7 @@ def to_text(obj, encoding='utf-8', nonstring: str='simplerepr'): else: raise TypeError('Invalid value %s for to_text\'s nonstring parameter' % nonstring) - return to_text(value, encoding, errors) + return to_text(value, encoding, decoding_error) def boolean(value): @@ -114,11 +114,11 @@ def to_bytes(obj, encoding='utf-8', nonstring='simplerepr'): :empty: Return an empty byte string :passthru: Return the object passed in :strict: Raise a :exc:`TypeError` - :return: Typically this returns a byte string. If a nonstring object is + :return: Typically this returns a byte string. If a nonstring object is passed in this may be a different type depending on the strategy specified by nonstring. This will never return a text string. .. note:: If passed a byte string, this function does not check that the - string is valid in the specified encoding. If it's important that the + string is valid in the specified encoding. If it's important that the byte string is in the specified encoding do:: encoded_string = to_bytes(to_text(input_string, 'latin-1'), 'utf-8') """ @@ -126,26 +126,13 @@ def to_bytes(obj, encoding='utf-8', nonstring='simplerepr'): if isinstance(obj, bytes): return obj - # The error handler to use if the byte string is not decodable using the specified encoding. - errors = 'surrogateescape' - # We're given a text string # If it has surrogates, we know because it will decode - original_errors = errors - if isinstance(obj, str): try: # Try this first as it's the fastest - return obj.encode(encoding, errors) + return obj.encode(encoding) except UnicodeEncodeError: - if original_errors in (None, 'surrogate_then_replace'): - # We should only reach this if encoding was non-utf8 original_errors was - # surrogate_then_escape and errors was surrogateescape - - # Slow but works - return_string = obj.encode('utf-8', 'surrogateescape') - # return_string = return_string.decode('utf-8', 'replace') - # return return_string.encode(encoding, 'replace') raise # Note: We do these last even though we have to call to_bytes again on the @@ -168,7 +155,7 @@ def to_bytes(obj, encoding='utf-8', nonstring='simplerepr'): else: raise TypeError('Invalid value %s for to_bytes\' nonstring parameter' % nonstring) - return to_bytes(value, encoding, errors) + return to_bytes(value, encoding) def unquote(data): -- GitLab