Skip to content

Commit

Permalink
[gyb] Force Unicode strings in Python 2
Browse files Browse the repository at this point in the history
All strings are sequences of Unicode characters in Python 3. This is
entirely different from Python 2, whose strings were sequences of
bytes. However, Python 2 does have the concept of Unicode strings. This
patch changes the file reader to use the codecs module on Python 2 to
properly read the file contents into unicode strings. From there the
strings are meant to be equivalent on Python 2 and 3. The rest of the
patch just updates the code to work natively with unicode strings.

To test the class `GraphemeClusterBreakPropertyTable`:

    $ python2 utils/gyb --test \
    -DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \
    -DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \
    -DCMAKE_SIZEOF_VOID_P=8 \
    -o /tmp/UnicodeExtendedGraphemeClusters.cpp.2.7.tmp \
    ./stdlib/public/stubs/UnicodeExtendedGraphemeClusters.cpp.gyb

    $ python3 utils/gyb --test \
    -DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \
    -DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \
    -DCMAKE_SIZEOF_VOID_P=8 \
    -o /tmp/UnicodeExtendedGraphemeClusters.cpp.3.5.tmp \
    ./stdlib/public/stubs/UnicodeExtendedGraphemeClusters.cpp.gyb

    $ diff -u /tmp/UnicodeExtendedGraphemeClusters.cpp.2.7.tmp \
    /tmp/UnicodeExtendedGraphemeClusters.cpp.3.5.tmp

To test the method `get_grapheme_cluster_break_tests_as_UTF8`:

    $ python2 utils/gyb --test \
    -DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \
    -DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \
    -DCMAKE_SIZEOF_VOID_P=8 \
    -o /tmp/UnicodeGraphemeBreakTest.cpp.2.7.tmp \
    ./unittests/Basic/UnicodeGraphemeBreakTest.cpp.gyb

    $ python3 utils/gyb --test \
    -DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \
    -DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \
    -DCMAKE_SIZEOF_VOID_P=8 \
    -o /tmp/UnicodeGraphemeBreakTest.cpp.3.5.tmp \
    ./unittests/Basic/UnicodeGraphemeBreakTest.cpp.gyb

    $ diff -u /tmp/UnicodeGraphemeBreakTest.cpp.2.7.tmp \
    /tmp/UnicodeGraphemeBreakTest.cpp.3.5.tmp
  • Loading branch information
RLovelett committed Dec 31, 2015
1 parent c677844 commit 7dbb412
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 10 deletions.
4 changes: 3 additions & 1 deletion lib/ClangImporter/SortedCFDatabase.def.gyb
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
%{

import re
import sys
import codecs

prologueLines = ""
epilogueLines = ""
Expand All @@ -26,7 +28,7 @@ epilogueLines = ""
lineForName = {}

# Load the data file.
with open(CFDatabaseFile, 'rb') as f:
with codecs.open(CFDatabaseFile, encoding=sys.getfilesystemencoding(), errors='strict') as f:
for line in f:
# Pass through preprocessor directives literally.
# Assume that they all fall into either a strict prologue or epilogue.
Expand Down
20 changes: 11 additions & 9 deletions utils/GYBUnicodeDataUtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
##===----------------------------------------------------------------------===##

import re
import sys
import codecs

class UnicodeProperty(object):
"""Abstract base class for Unicode properties."""
Expand Down Expand Up @@ -68,7 +70,7 @@ def __init__(self, grapheme_break_property_file_name):
self.symbolic_values[v] = k

# Load the data file.
with open(grapheme_break_property_file_name, 'rb') as f:
with codecs.open(grapheme_break_property_file_name, encoding=sys.getfilesystemencoding(), errors='strict') as f:
for line in f:
# Strip comments.
line = re.sub('#.*', '', line)
Expand Down Expand Up @@ -514,9 +516,9 @@ def _convert_line(line):

# Match a list of code points.
for token in line.split(" "):
if token == "÷":
if token == u"÷":
boundaries += [ curr_bytes ]
elif token == "×":
elif token == u"×":
pass
else:
code_point = int(token, 16)
Expand All @@ -529,21 +531,21 @@ def _convert_line(line):
# and test separately that we handle ill-formed UTF-8 sequences.
if code_point >= 0xd800 and code_point <= 0xdfff:
code_point = 0x200b
code_point = ('\U%(cp)08x' % { 'cp': code_point }).decode('unicode_escape')
as_UTF8_bytes = code_point.encode('utf8')
as_UTF8_escaped = ''.join(['\\x%(byte)02x' % { 'byte': ord(byte) } for byte in as_UTF8_bytes])
code_point = (b'\U%(cp)08x' % { b'cp': code_point }).decode('unicode_escape', 'strict')
as_UTF8_bytes = bytearray(code_point.encode('utf8', 'strict'))
as_UTF8_escaped = ''.join(['\\x%(byte)02x' % { 'byte': byte } for byte in as_UTF8_bytes])
test += as_UTF8_escaped
curr_bytes += len(as_UTF8_bytes)

return (test, boundaries)

# Self-test.
assert(_convert_line('÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [ 0, 5, 8 ]))
assert(_convert_line('÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [ 0, 3 ]))
assert(_convert_line(u'÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [ 0, 5, 8 ]))
assert(_convert_line(u'÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [ 0, 3 ]))

result = []

with open(grapheme_break_test_file_name, 'rb') as f:
with codecs.open(grapheme_break_test_file_name, encoding=sys.getfilesystemencoding(), errors='strict') as f:
for line in f:
test = _convert_line(line)
if test:
Expand Down

0 comments on commit 7dbb412

Please sign in to comment.