From 7700b3b4e39e82e85d7fdc870d548e567069d25b Mon Sep 17 00:00:00 2001
From: Ryan Lovelett
Date: Mon, 28 Dec 2015 10:13:48 -0500
Subject: [PATCH 1/7] [gyb] Use lambda function to work around PEP 3114

[PEP 3114](https://www.python.org/dev/peps/pep-3114/) renamed
`iterator.next()` (Python 2) to `iterator.__next__()` (Python 3). The
recommended way to make code work in both Python 2 and 3 is to call the
global `next` function.

To use this recommended global `next` function, this patch uses a lambda
function to supply `tokenize.generate_tokens` with a callable, as it had
previously. This should be functionally equivalent to the old code with
the added benefit of working on both Python 2 and 3.
---
 utils/gyb.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/utils/gyb.py b/utils/gyb.py
index 0d123175c4c48..9b023160d4960 100755
--- a/utils/gyb.py
+++ b/utils/gyb.py
@@ -304,7 +304,7 @@ def splitGybLines(sourceLines):
     dedents = 0
     try:
         for tokenKind, tokenText, tokenStart, (tokenEndLine, tokenEndCol), lineText \
-                in tokenize.generate_tokens(sourceLines.__iter__().next):
+                in tokenize.generate_tokens(lambda i = iter(sourceLines): next(i)):
 
             if tokenKind in (tokenize.COMMENT, tokenize.ENDMARKER):
                 continue
@@ -347,7 +347,7 @@ def codeStartsWithDedentKeyword(sourceLines):
     """
     tokenText = None
     for tokenKind, tokenText, _, _, _ \
-            in tokenize.generate_tokens(sourceLines.__iter__().next):
+            in tokenize.generate_tokens(lambda i = iter(sourceLines): next(i)):
 
         if tokenKind != tokenize.COMMENT and tokenText.strip() != '':
            break

From 86650bbb6d53ccdb47107295c28a4615b7509259 Mon Sep 17 00:00:00 2001
From: Ryan Lovelett
Date: Mon, 28 Dec 2015 15:34:30 -0500
Subject: [PATCH 2/7] [gyb] Python 2 and 3 compatible StringIO import

The `StringIO` and `cStringIO` modules are gone in Python 3. The
recommendation is to import the `io` module on Python 3. Therefore, this
patch first attempts to import the Python 2 module, `cStringIO`, and if
that fails then attempts to import the Python 3 module, `io`.

**NOTE**: There are still other Python 3.x fixes necessary to make `gyb`
run on a Python 3.x interpreter. This is just one small incremental patch
on the way there.
---
 utils/gyb.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/utils/gyb.py b/utils/gyb.py
index 9b023160d4960..0dfd9b31b140c 100755
--- a/utils/gyb.py
+++ b/utils/gyb.py
@@ -5,7 +5,10 @@
 from __future__ import print_function

 import re
-from cStringIO import StringIO
+try:
+    from cStringIO import StringIO
+except ImportError:
+    from io import StringIO
 import tokenize
 import textwrap
 from bisect import bisect

From c677844ba4f8ffaede13a651f32ee5ab08b307ed Mon Sep 17 00:00:00 2001
From: Ryan Lovelett
Date: Mon, 28 Dec 2015 15:54:55 -0500
Subject: [PATCH 3/7] [gyb] Provide Python 2 and 3 compatible exception syntax

In Python 2 the syntax to catch exceptions was:

    except (Exception1, Exception2), target:

In Python 3 the syntax to catch exceptions is:

    except (Exception1, Exception2) as target:

This newer Python 3 syntax is also available in 2.6 and 2.7, and is
therefore preferred for compatibility reasons. Additionally, the target
can no longer be a tuple.

This patch refactors the exception handling code to use the newer
Python 3 exception-catch syntax.
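A minimal sketch of the portable pattern (an illustrative snippet, not
code taken from gyb itself):

    import tokenize

    try:
        # TokenError carries (message, errorPos) in its args tuple.
        raise tokenize.TokenError('EOF in multi-line statement', (1, 0))
    except tokenize.TokenError as error:
        # Unpacking a tuple directly in the except clause is a syntax
        # error on Python 3, so unpack from `args` instead.
        (message, errorPos) = error.args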
---
 utils/gyb.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/utils/gyb.py b/utils/gyb.py
index 0dfd9b31b140c..ba0d76ce1f1a6 100755
--- a/utils/gyb.py
+++ b/utils/gyb.py
@@ -138,7 +138,8 @@ def tokenizePythonToUnmatchedCloseCurly(sourceText, start, lineStarts):
             if nesting < 0:
                 return tokenPosToIndex(tokenStart, start, lineStarts)

-    except tokenize.TokenError, (message, errorPos):
+    except tokenize.TokenError as error:
+        (message, errorPos) = error.args
         return tokenPosToIndex(errorPos, start, lineStarts)

     return len(sourceText)
@@ -327,7 +328,7 @@ def splitGybLines(sourceLines):

             lastTokenText,lastTokenKind = tokenText,tokenKind

-    except tokenize.TokenError, (message, errorPos):
+    except tokenize.TokenError:
         return [] # Let the later compile() call report the error

     if lastTokenText == ':':

From 7dbb4127f55022bca7b191d448652b5decf8626e Mon Sep 17 00:00:00 2001
From: Ryan Lovelett
Date: Mon, 28 Dec 2015 16:01:15 -0500
Subject: [PATCH 4/7] [gyb] Force Unicode strings in Python 2

All strings are sequences of Unicode characters in Python 3. This is
entirely different from Python 2, whose strings were sequences of
bytes, although Python 2 does have the concept of Unicode strings.

This patch changes the file-reading code to use the codecs module so
that Python 2 also reads each file into a Unicode string. From there
the strings behave equivalently on 2 and 3. The rest of the patch
updates the code to work natively with Unicode strings.

To test the class `GraphemeClusterBreakPropertyTable`:

    $ python2 utils/gyb --test \
        -DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \
        -DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \
        -DCMAKE_SIZEOF_VOID_P=8 \
        -o /tmp/UnicodeExtendedGraphemeClusters.cpp.2.7.tmp \
        ./stdlib/public/stubs/UnicodeExtendedGraphemeClusters.cpp.gyb
    $ python3 utils/gyb --test \
        -DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \
        -DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \
        -DCMAKE_SIZEOF_VOID_P=8 \
        -o /tmp/UnicodeExtendedGraphemeClusters.cpp.3.5.tmp \
        ./stdlib/public/stubs/UnicodeExtendedGraphemeClusters.cpp.gyb
    $ diff -u /tmp/UnicodeExtendedGraphemeClusters.cpp.2.7.tmp \
        /tmp/UnicodeExtendedGraphemeClusters.cpp.3.5.tmp

To test the method `get_grapheme_cluster_break_tests_as_UTF8`:

    $ python2 utils/gyb --test \
        -DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \
        -DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \
        -DCMAKE_SIZEOF_VOID_P=8 \
        -o /tmp/UnicodeGraphemeBreakTest.cpp.2.7.tmp \
        ./unittests/Basic/UnicodeGraphemeBreakTest.cpp.gyb
    $ python3 utils/gyb --test \
        -DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \
        -DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \
        -DCMAKE_SIZEOF_VOID_P=8 \
        -o /tmp/UnicodeGraphemeBreakTest.cpp.3.5.tmp \
        ./unittests/Basic/UnicodeGraphemeBreakTest.cpp.gyb
    $ diff -u /tmp/UnicodeGraphemeBreakTest.cpp.2.7.tmp \
        /tmp/UnicodeGraphemeBreakTest.cpp.3.5.tmp
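The reading pattern this patch adopts reduces to a sketch like the
following (the file name here is a stand-in):

    import codecs
    import sys

    # codecs.open decodes to Unicode strings on both Python 2 and 3,
    # unlike open(name, 'rb'), which yields byte strings on Python 2.
    with codecs.open('GraphemeBreakProperty.txt',
                     encoding=sys.getfilesystemencoding(),
                     errors='strict') as f:
        for line in f:
            pass  # each `line` is a Unicode string on both interpreters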
---
 lib/ClangImporter/SortedCFDatabase.def.gyb |  4 +++-
 utils/GYBUnicodeDataUtils.py               | 20 +++++++++++---------
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/lib/ClangImporter/SortedCFDatabase.def.gyb b/lib/ClangImporter/SortedCFDatabase.def.gyb
index 0cfa84e9c9f05..73d74ca26fdbd 100644
--- a/lib/ClangImporter/SortedCFDatabase.def.gyb
+++ b/lib/ClangImporter/SortedCFDatabase.def.gyb
@@ -17,6 +17,8 @@
 %{
 import re
+import sys
+import codecs

 prologueLines = ""
 epilogueLines = ""
@@ -26,7 +28,7 @@ epilogueLines = ""
 lineForName = {}

 # Load the data file.
-with open(CFDatabaseFile, 'rb') as f:
+with codecs.open(CFDatabaseFile, encoding=sys.getfilesystemencoding(), errors='strict') as f:
   for line in f:
     # Pass through preprocessor directives literally.
     # Assume that they all fall into either a strict prologue or epilogue.
diff --git a/utils/GYBUnicodeDataUtils.py b/utils/GYBUnicodeDataUtils.py
index a4f76c65cc7a6..caadac6fe1b20 100644
--- a/utils/GYBUnicodeDataUtils.py
+++ b/utils/GYBUnicodeDataUtils.py
@@ -11,6 +11,8 @@
 ##===----------------------------------------------------------------------===##

 import re
+import sys
+import codecs

 class UnicodeProperty(object):
     """Abstract base class for Unicode properties."""
@@ -68,7 +70,7 @@ def __init__(self, grapheme_break_property_file_name):
             self.symbolic_values[v] = k

         # Load the data file.
-        with open(grapheme_break_property_file_name, 'rb') as f:
+        with codecs.open(grapheme_break_property_file_name, encoding=sys.getfilesystemencoding(), errors='strict') as f:
             for line in f:
                 # Strip comments.
                 line = re.sub('#.*', '', line)
@@ -514,9 +516,9 @@ def _convert_line(line):

             # Match a list of code points.
             for token in line.split(" "):
-                if token == "÷":
+                if token == u"÷":
                     boundaries += [ curr_bytes ]
-                elif token == "×":
+                elif token == u"×":
                     pass
                 else:
                     code_point = int(token, 16)
@@ -529,21 +531,21 @@ def _convert_line(line):
                     # and test separately that we handle ill-formed UTF-8 sequences.
                     if code_point >= 0xd800 and code_point <= 0xdfff:
                         code_point = 0x200b
-                    code_point = ('\U%(cp)08x' % { 'cp': code_point }).decode('unicode_escape')
-                    as_UTF8_bytes = code_point.encode('utf8')
-                    as_UTF8_escaped = ''.join(['\\x%(byte)02x' % { 'byte': ord(byte) } for byte in as_UTF8_bytes])
+                    code_point = (b'\U%(cp)08x' % { b'cp': code_point }).decode('unicode_escape', 'strict')
+                    as_UTF8_bytes = bytearray(code_point.encode('utf8', 'strict'))
+                    as_UTF8_escaped = ''.join(['\\x%(byte)02x' % { 'byte': byte } for byte in as_UTF8_bytes])
                     test += as_UTF8_escaped
                     curr_bytes += len(as_UTF8_bytes)

             return (test, boundaries)

         # Self-test.
-        assert(_convert_line('÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [ 0, 5, 8 ]))
-        assert(_convert_line('÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [ 0, 3 ]))
+        assert(_convert_line(u'÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [ 0, 5, 8 ]))
+        assert(_convert_line(u'÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [ 0, 3 ]))

         result = []

-        with open(grapheme_break_test_file_name, 'rb') as f:
+        with codecs.open(grapheme_break_test_file_name, encoding=sys.getfilesystemencoding(), errors='strict') as f:
             for line in f:
                 test = _convert_line(line)
                 if test:

From c8e74d1ba19da3510742ae17fdf3665cde0143a8 Mon Sep 17 00:00:00 2001
From: Ryan Lovelett
Date: Mon, 28 Dec 2015 16:01:15 -0500
Subject: [PATCH 5/7] [gyb] Work around PEP 3106 for Python 3 compatibility

PEP 3106 [1] changed the behavior of the dictionary `items` method. In
Python 2, `items` builds a real list of tuples, while `iteritems`
returns a generator. PEP 3106 makes Python 3's `items` method
equivalent to Python 2's `iteritems` and removes `iteritems` from
Python 3 entirely.

This patch switches the code to use `items`, which exists on both
interpreters. This could have a negative impact on Python 2's
performance because it now causes the list of dictionary tuples to be
built in memory.
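A small sketch of the portable idiom (illustrative values only, not the
table from GYBUnicodeDataUtils.py):

    numeric_value_table = {'Other': 0, 'CR': 1, 'LF': 2}

    # `items()` exists on both interpreters: a concrete list of tuples
    # on Python 2, a lazy view (like the old `iteritems()`) on Python 3.
    symbolic_values = [None] * (max(numeric_value_table.values()) + 1)
    for k, v in numeric_value_table.items():
        symbolic_values[v] = k
    # symbolic_values is now ['Other', 'CR', 'LF']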
[1] https://www.python.org/dev/peps/pep-3106/
---
 utils/GYBUnicodeDataUtils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/GYBUnicodeDataUtils.py b/utils/GYBUnicodeDataUtils.py
index caadac6fe1b20..587263702e638 100644
--- a/utils/GYBUnicodeDataUtils.py
+++ b/utils/GYBUnicodeDataUtils.py
@@ -66,7 +66,7 @@ def __init__(self, grapheme_break_property_file_name):
         # values to symbolic values.
         self.symbolic_values = \
             [ None ] * (max(self.numeric_value_table.values()) + 1)
-        for k,v in self.numeric_value_table.iteritems():
+        for k,v in self.numeric_value_table.items():
             self.symbolic_values[v] = k

         # Load the data file.

From 360c2b2bbcbeeb9f5967e7583294c1f6970015a6 Mon Sep 17 00:00:00 2001
From: Ryan Lovelett
Date: Wed, 30 Dec 2015 11:10:52 -0500
Subject: [PATCH 6/7] [gyb] Convert map object to list object

Python 2's map function [1] returns a list by default, while Python 3's
map function [2] returns an iterator (a map object). The former is
subscriptable; the latter is not.

This patch explicitly converts the result of some map operations to a
list so that they have the same intended behaviour on both Python 2
and 3.

[1] https://docs.python.org/2/library/functions.html#map
[2] https://docs.python.org/3/library/functions.html#map
---
 utils/GYBUnicodeDataUtils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/utils/GYBUnicodeDataUtils.py b/utils/GYBUnicodeDataUtils.py
index 587263702e638..338185af5c8f0 100644
--- a/utils/GYBUnicodeDataUtils.py
+++ b/utils/GYBUnicodeDataUtils.py
@@ -331,7 +331,10 @@ def map_index(idx):
             else:
                 return idx

-        return map(map_index, indexes)
+        # NOTE: Python 2's `map` function returns a list, where Python 3's
+        # `map` function returns an iterator. To work around this, the
+        # result of the `map` is explicitly converted to a `list`.
+        return list(map(map_index, indexes))

     # If self.BMP_data contains identical data blocks, keep the first one,
     # remove duplicates and change the indexes in self.BMP_lookup to point to

From 8f223005d7c950c71f35283e0cd99a9ab49d67bf Mon Sep 17 00:00:00 2001
From: Ryan Lovelett
Date: Wed, 30 Dec 2015 13:32:37 -0500
Subject: [PATCH 7/7] [gyb] Popen explicit string instead of byte sequence

On Python 3, `Popen` puts a byte sequence on stdout by default.
However, passing the constructor the argument `universal_newlines=True`
forces `Popen` to put a string on stdout instead. This was not a
problem on Python 2 because the Python 2 regex engine seemed to work
fine on byte sequences, where Python 3's does not. By explicitly
converting everything to a string, the same behavior is now seen on
Python 2 and 3.

See: https://docs.python.org/2/library/subprocess.html#frequently-used-arguments
See: https://docs.python.org/3/library/subprocess.html#frequently-used-arguments
---
 utils/line-directive | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/utils/line-directive b/utils/line-directive
index 7f147f815063d..2abcbbda6d42f 100755
--- a/utils/line-directive
+++ b/utils/line-directive
@@ -71,7 +71,10 @@ def run():
     sources = sys.argv[1:dashes]

     command = subprocess.Popen(
-        sys.argv[dashes + 1:], stderr = subprocess.STDOUT, stdout = subprocess.PIPE
+        sys.argv[dashes + 1:],
+        stderr = subprocess.STDOUT,
+        stdout = subprocess.PIPE,
+        universal_newlines = True
     )

     error_pattern = re.compile(
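A self-contained sketch of the behavior change described in the final
patch (the echoed text is a placeholder, not the command line-directive
actually wraps):

    import subprocess

    # universal_newlines=True makes stdout a text string on Python 3
    # (it is already one on Python 2), so re patterns built from str
    # match the output on both interpreters.
    command = subprocess.Popen(
        ['echo', 'hello'],
        stderr=subprocess.STDOUT,
        stdout=subprocess.PIPE,
        universal_newlines=True,
    )
    output = command.stdout.read()  # str, not bytes, on Python 3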