From 7700b3b4e39e82e85d7fdc870d548e567069d25b Mon Sep 17 00:00:00 2001
From: Ryan Lovelett
Date: Mon, 28 Dec 2015 10:13:48 -0500
Subject: [PATCH 1/7] [gyb] Use lambda function to work around PEP 3114

[PEP 3114](https://www.python.org/dev/peps/pep-3114/) renamed
`iterator.next()` (Python 2) to `iterator.__next__()` (Python 3). The
recommended way to make code work in both Python 2 and 3 is to call the
global `next` function.

To use this recommended global `next` function, this patch uses a lambda
function to supply `tokenize.generate_tokens` with a callable, as it had
previously. This should be functionally equivalent to the old code with
the added benefit of working on both Python 2 and 3.
---
 utils/gyb.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/utils/gyb.py b/utils/gyb.py
index 0d123175c4c48..9b023160d4960 100755
--- a/utils/gyb.py
+++ b/utils/gyb.py
@@ -304,7 +304,7 @@ def splitGybLines(sourceLines):
     dedents = 0
     try:
         for tokenKind, tokenText, tokenStart, (tokenEndLine, tokenEndCol), lineText \
-                in tokenize.generate_tokens(sourceLines.__iter__().next):
+                in tokenize.generate_tokens(lambda i = iter(sourceLines): next(i)):
 
             if tokenKind in (tokenize.COMMENT, tokenize.ENDMARKER):
                 continue
@@ -347,7 +347,7 @@ def codeStartsWithDedentKeyword(sourceLines):
     """
     tokenText = None
     for tokenKind, tokenText, _, _, _ \
-            in tokenize.generate_tokens(sourceLines.__iter__().next):
+            in tokenize.generate_tokens(lambda i = iter(sourceLines): next(i)):
 
         if tokenKind != tokenize.COMMENT and tokenText.strip() != '':
            break

From 86650bbb6d53ccdb47107295c28a4615b7509259 Mon Sep 17 00:00:00 2001
From: Ryan Lovelett
Date: Mon, 28 Dec 2015 15:34:30 -0500
Subject: [PATCH 2/7] [gyb] Python 2 and 3 compatible StringIO import

The `StringIO` and `cStringIO` modules are gone in Python 3. The
recommendation is to import the `io` module on Python 3. Therefore, this
patch first attempts to import the Python 2 module, `cStringIO`, and if
that fails then attempts to import the Python 3 module, `io`.

**NOTE**: There are still other Python 3.x fixes necessary to make `gyb`
run on a Python 3.x interpreter. This is just one small incremental patch
on the way there.
---
 utils/gyb.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/utils/gyb.py b/utils/gyb.py
index 9b023160d4960..0dfd9b31b140c 100755
--- a/utils/gyb.py
+++ b/utils/gyb.py
@@ -5,7 +5,10 @@
 from __future__ import print_function

 import re
-from cStringIO import StringIO
+try:
+    from cStringIO import StringIO
+except ImportError:
+    from io import StringIO
 import tokenize
 import textwrap
 from bisect import bisect

From c677844ba4f8ffaede13a651f32ee5ab08b307ed Mon Sep 17 00:00:00 2001
From: Ryan Lovelett
Date: Mon, 28 Dec 2015 15:54:55 -0500
Subject: [PATCH 3/7] [gyb] Provide Python 2 and 3 compatible exception syntax

In Python 2 the syntax to catch exceptions was:

    except (Exception1, Exception2), target:

In Python 3 the syntax to catch exceptions is:

    except (Exception1, Exception2) as target:

This newer Python 3 syntax is also available in 2.6 and 2.7, and is
therefore preferred for compatibility reasons. Additionally, the target
can no longer be a tuple.

This patch refactors the exception handling code to use the newer
Python 3 exception-catch syntax.
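A minimal sketch of the portable pattern (an illustrative snippet, not
code taken from gyb itself):

    import tokenize

    try:
        # TokenError carries (message, errorPos) in its args tuple.
        raise tokenize.TokenError('EOF in multi-line statement', (1, 0))
    except tokenize.TokenError as error:
        # Unpacking a tuple directly in the except clause is a syntax
        # error on Python 3, so unpack from `args` instead.
        (message, errorPos) = error.args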
---
 utils/gyb.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/utils/gyb.py b/utils/gyb.py
index 0dfd9b31b140c..ba0d76ce1f1a6 100755
--- a/utils/gyb.py
+++ b/utils/gyb.py
@@ -138,7 +138,8 @@ def tokenizePythonToUnmatchedCloseCurly(sourceText, start, lineStarts):
             if nesting < 0:
                 return tokenPosToIndex(tokenStart, start, lineStarts)

-    except tokenize.TokenError, (message, errorPos):
+    except tokenize.TokenError as error:
+        (message, errorPos) = error.args
         return tokenPosToIndex(errorPos, start, lineStarts)

     return len(sourceText)
@@ -327,7 +328,7 @@ def splitGybLines(sourceLines):

             lastTokenText,lastTokenKind = tokenText,tokenKind

-    except tokenize.TokenError, (message, errorPos):
+    except tokenize.TokenError:
         return [] # Let the later compile() call report the error

     if lastTokenText == ':':

From 7dbb4127f55022bca7b191d448652b5decf8626e Mon Sep 17 00:00:00 2001
From: Ryan Lovelett
Date: Mon, 28 Dec 2015 16:01:15 -0500
Subject: [PATCH 4/7] [gyb] Force Unicode strings in Python 2

All strings are sequences of Unicode characters in Python 3. This is
entirely different from Python 2, whose strings were sequences of
bytes, although Python 2 does have the concept of Unicode strings.

This patch changes the file-reading code to use the codecs module so
that Python 2 also reads each file into a Unicode string. From there
the strings behave equivalently on 2 and 3. The rest of the patch
updates the code to work natively with Unicode strings.

To test the class `GraphemeClusterBreakPropertyTable`:

    $ python2 utils/gyb --test \
        -DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \
        -DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \
        -DCMAKE_SIZEOF_VOID_P=8 \
        -o /tmp/UnicodeExtendedGraphemeClusters.cpp.2.7.tmp \
        ./stdlib/public/stubs/UnicodeExtendedGraphemeClusters.cpp.gyb
    $ python3 utils/gyb --test \
        -DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \
        -DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \
        -DCMAKE_SIZEOF_VOID_P=8 \
        -o /tmp/UnicodeExtendedGraphemeClusters.cpp.3.5.tmp \
        ./stdlib/public/stubs/UnicodeExtendedGraphemeClusters.cpp.gyb
    $ diff -u /tmp/UnicodeExtendedGraphemeClusters.cpp.2.7.tmp \
        /tmp/UnicodeExtendedGraphemeClusters.cpp.3.5.tmp

To test the method `get_grapheme_cluster_break_tests_as_UTF8`:

    $ python2 utils/gyb --test \
        -DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \
        -DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \
        -DCMAKE_SIZEOF_VOID_P=8 \
        -o /tmp/UnicodeGraphemeBreakTest.cpp.2.7.tmp \
        ./unittests/Basic/UnicodeGraphemeBreakTest.cpp.gyb
    $ python3 utils/gyb --test \
        -DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \
        -DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \
        -DCMAKE_SIZEOF_VOID_P=8 \
        -o /tmp/UnicodeGraphemeBreakTest.cpp.3.5.tmp \
        ./unittests/Basic/UnicodeGraphemeBreakTest.cpp.gyb
    $ diff -u /tmp/UnicodeGraphemeBreakTest.cpp.2.7.tmp \
        /tmp/UnicodeGraphemeBreakTest.cpp.3.5.tmp
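The reading pattern this patch adopts reduces to a sketch like the
following (the file name here is a stand-in):

    import codecs
    import sys

    # codecs.open decodes to Unicode strings on both Python 2 and 3,
    # unlike open(name, 'rb'), which yields byte strings on Python 2.
    with codecs.open('GraphemeBreakProperty.txt',
                     encoding=sys.getfilesystemencoding(),
                     errors='strict') as f:
        for line in f:
            pass  # each `line` is a Unicode string on both interpreters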
---
 lib/ClangImporter/SortedCFDatabase.def.gyb |  4 +++-
 utils/GYBUnicodeDataUtils.py               | 20 +++++++++++---------
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/lib/ClangImporter/SortedCFDatabase.def.gyb b/lib/ClangImporter/SortedCFDatabase.def.gyb
index 0cfa84e9c9f05..73d74ca26fdbd 100644
--- a/lib/ClangImporter/SortedCFDatabase.def.gyb
+++ b/lib/ClangImporter/SortedCFDatabase.def.gyb
@@ -17,6 +17,8 @@
 %{
 import re
+import sys
+import codecs

 prologueLines = ""
 epilogueLines = ""
@@ -26,7 +28,7 @@ epilogueLines = ""
 lineForName = {}

 # Load the data file.
-with open(CFDatabaseFile, 'rb') as f:
+with codecs.open(CFDatabaseFile, encoding=sys.getfilesystemencoding(), errors='strict') as f:
   for line in f:
     # Pass through preprocessor directives literally.
     # Assume that they all fall into either a strict prologue or epilogue.
diff --git a/utils/GYBUnicodeDataUtils.py b/utils/GYBUnicodeDataUtils.py
index a4f76c65cc7a6..caadac6fe1b20 100644
--- a/utils/GYBUnicodeDataUtils.py
+++ b/utils/GYBUnicodeDataUtils.py
@@ -11,6 +11,8 @@
 ##===----------------------------------------------------------------------===##

 import re
+import sys
+import codecs

 class UnicodeProperty(object):
     """Abstract base class for Unicode properties."""
@@ -68,7 +70,7 @@ def __init__(self, grapheme_break_property_file_name):
             self.symbolic_values[v] = k

         # Load the data file.
-        with open(grapheme_break_property_file_name, 'rb') as f:
+        with codecs.open(grapheme_break_property_file_name, encoding=sys.getfilesystemencoding(), errors='strict') as f:
             for line in f:
                 # Strip comments.
                 line = re.sub('#.*', '', line)
@@ -514,9 +516,9 @@ def _convert_line(line):

             # Match a list of code points.
             for token in line.split(" "):
-                if token == "÷":
+                if token == u"÷":
                     boundaries += [ curr_bytes ]
-                elif token == "×":
+                elif token == u"×":
                     pass
                 else:
                     code_point = int(token, 16)
@@ -529,21 +531,21 @@ def _convert_line(line):
                     # and test separately that we handle ill-formed UTF-8 sequences.
                     if code_point >= 0xd800 and code_point <= 0xdfff:
                         code_point = 0x200b
-                    code_point = ('\U%(cp)08x' % { 'cp': code_point }).decode('unicode_escape')
-                    as_UTF8_bytes = code_point.encode('utf8')
-                    as_UTF8_escaped = ''.join(['\\x%(byte)02x' % { 'byte': ord(byte) } for byte in as_UTF8_bytes])
+                    code_point = (b'\U%(cp)08x' % { b'cp': code_point }).decode('unicode_escape', 'strict')
+                    as_UTF8_bytes = bytearray(code_point.encode('utf8', 'strict'))
+                    as_UTF8_escaped = ''.join(['\\x%(byte)02x' % { 'byte': byte } for byte in as_UTF8_bytes])
                     test += as_UTF8_escaped
                     curr_bytes += len(as_UTF8_bytes)

             return (test, boundaries)

         # Self-test.
-        assert(_convert_line('÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [ 0, 5, 8 ]))
-        assert(_convert_line('÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [ 0, 3 ]))
+        assert(_convert_line(u'÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [ 0, 5, 8 ]))
+        assert(_convert_line(u'÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [ 0, 3 ]))

         result = []

-        with open(grapheme_break_test_file_name, 'rb') as f:
+        with codecs.open(grapheme_break_test_file_name, encoding=sys.getfilesystemencoding(), errors='strict') as f:
             for line in f:
                 test = _convert_line(line)
                 if test:

From c8e74d1ba19da3510742ae17fdf3665cde0143a8 Mon Sep 17 00:00:00 2001
From: Ryan Lovelett
Date: Mon, 28 Dec 2015 16:01:15 -0500
Subject: [PATCH 5/7] [gyb] Work around PEP 3106 for Python 3 compatibility

PEP 3106 [1] changed the behavior of the dictionary `items` method. In
Python 2, `items` builds a real list of tuples, while `iteritems`
returns a generator. PEP 3106 makes Python 3's `items` method
equivalent to Python 2's `iteritems` and removes `iteritems` from
Python 3 entirely.

This patch switches the code to use `items`, which exists on both
interpreters. This could have a negative impact on Python 2's
performance because it now causes the list of dictionary tuples to be
built in memory.
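A small sketch of the portable idiom (illustrative values only, not the
table from GYBUnicodeDataUtils.py):

    numeric_value_table = {'Other': 0, 'CR': 1, 'LF': 2}

    # `items()` exists on both interpreters: a concrete list of tuples
    # on Python 2, a lazy view (like the old `iteritems()`) on Python 3.
    symbolic_values = [None] * (max(numeric_value_table.values()) + 1)
    for k, v in numeric_value_table.items():
        symbolic_values[v] = k
    # symbolic_values is now ['Other', 'CR', 'LF']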
[1] https://www.python.org/dev/peps/pep-3106/
---
 utils/GYBUnicodeDataUtils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/GYBUnicodeDataUtils.py b/utils/GYBUnicodeDataUtils.py
index caadac6fe1b20..587263702e638 100644
--- a/utils/GYBUnicodeDataUtils.py
+++ b/utils/GYBUnicodeDataUtils.py
@@ -66,7 +66,7 @@ def __init__(self, grapheme_break_property_file_name):
         # values to symbolic values.
         self.symbolic_values = \
             [ None ] * (max(self.numeric_value_table.values()) + 1)
-        for k,v in self.numeric_value_table.iteritems():
+        for k,v in self.numeric_value_table.items():
             self.symbolic_values[v] = k

         # Load the data file.

From 360c2b2bbcbeeb9f5967e7583294c1f6970015a6 Mon Sep 17 00:00:00 2001
From: Ryan Lovelett
Date: Wed, 30 Dec 2015 11:10:52 -0500
Subject: [PATCH 6/7] [gyb] Convert map object to list object

Python 2's map function [1] returns a list by default, while Python 3's
map function [2] returns an iterator (a map object). The former is
subscriptable; the latter is not.

This patch explicitly converts the result of some map operations to a
list so that they have the same intended behaviour on both Python 2
and 3.

[1] https://docs.python.org/2/library/functions.html#map
[2] https://docs.python.org/3/library/functions.html#map
---
 utils/GYBUnicodeDataUtils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/utils/GYBUnicodeDataUtils.py b/utils/GYBUnicodeDataUtils.py
index 587263702e638..338185af5c8f0 100644
--- a/utils/GYBUnicodeDataUtils.py
+++ b/utils/GYBUnicodeDataUtils.py
@@ -331,7 +331,10 @@ def map_index(idx):
             else:
                 return idx

-        return map(map_index, indexes)
+        # NOTE: Python 2's `map` function returns a list, where Python 3's
+        # `map` function returns an iterator. To work around this, the
+        # result of the `map` is explicitly converted to a `list`.
+        return list(map(map_index, indexes))

     # If self.BMP_data contains identical data blocks, keep the first one,
     # remove duplicates and change the indexes in self.BMP_lookup to point to

From 8f223005d7c950c71f35283e0cd99a9ab49d67bf Mon Sep 17 00:00:00 2001
From: Ryan Lovelett
Date: Wed, 30 Dec 2015 13:32:37 -0500
Subject: [PATCH 7/7] [gyb] Popen explicit string instead of byte sequence

On Python 3, `Popen` puts a byte sequence on stdout by default.
However, passing the constructor the argument `universal_newlines=True`
forces `Popen` to put a string on stdout instead. This was not a
problem on Python 2 because the Python 2 regex engine seemed to work
fine on byte sequences, where Python 3's does not. By explicitly
converting everything to a string, the same behavior is now seen on
Python 2 and 3.

See: https://docs.python.org/2/library/subprocess.html#frequently-used-arguments
See: https://docs.python.org/3/library/subprocess.html#frequently-used-arguments
---
 utils/line-directive | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/utils/line-directive b/utils/line-directive
index 7f147f815063d..2abcbbda6d42f 100755
--- a/utils/line-directive
+++ b/utils/line-directive
@@ -71,7 +71,10 @@ def run():
     sources = sys.argv[1:dashes]

     command = subprocess.Popen(
-        sys.argv[dashes + 1:], stderr = subprocess.STDOUT, stdout = subprocess.PIPE
+        sys.argv[dashes + 1:],
+        stderr = subprocess.STDOUT,
+        stdout = subprocess.PIPE,
+        universal_newlines = True
     )

     error_pattern = re.compile(
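A self-contained sketch of the behavior change described in the final
patch (the echoed text is a placeholder, not the command line-directive
actually wraps):

    import subprocess

    # universal_newlines=True makes stdout a text string on Python 3
    # (it is already one on Python 2), so re patterns built from str
    # match the output on both interpreters.
    command = subprocess.Popen(
        ['echo', 'hello'],
        stderr=subprocess.STDOUT,
        stdout=subprocess.PIPE,
        universal_newlines=True,
    )
    output = command.stdout.read()  # str, not bytes, on Python 3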