Skip to content

Commit

Permalink
[lit] Parse all strings as UTF-8 rather than ASCII.
Browse files Browse the repository at this point in the history
As far as I can tell UTF-8 has been supported since the beginning of Python's
codec support, and it's the de facto standard for text these days, at least
for primarily-English text. This allows us to put Unicode into lit RUN lines.

rdar://problem/18311663

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@217688 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
jrose-apple committed Sep 12, 2014
1 parent 81d53b7 commit c919e57
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 15 deletions.
3 changes: 3 additions & 0 deletions test/Other/lit-unicode.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
REQUIRES: shell
RUN: echo "ようこそ" | FileCheck %s
CHECK: {{^}}ようこそ{{$}}
8 changes: 4 additions & 4 deletions utils/lit/lit/ProgressBar.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
import sys, re, time

def to_bytes(str):
# Encode to Latin1 to get binary data.
return str.encode('ISO-8859-1')
# Encode to UTF-8 to get binary data.
return str.encode('utf-8')

class TerminalController:
"""
Expand Down Expand Up @@ -136,7 +136,7 @@ def __init__(self, term_stream=sys.stdout):

def _tparm(self, arg, index):
import curses
return curses.tparm(to_bytes(arg), index).decode('ascii') or ''
return curses.tparm(to_bytes(arg), index).decode('utf-8') or ''

def _tigetstr(self, cap_name):
# String capabilities can include "delays" of the form "$<2>".
Expand All @@ -147,7 +147,7 @@ def _tigetstr(self, cap_name):
if cap is None:
cap = ''
else:
cap = cap.decode('ascii')
cap = cap.decode('utf-8')
return re.sub(r'\$<\d+>[/*]?', '', cap)

def render(self, template):
Expand Down
26 changes: 18 additions & 8 deletions utils/lit/lit/TestRunner.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,11 @@ def executeShCmd(cmd, cfg, cwd, results):
f.seek(0, 0)
procData[i] = (procData[i][0], f.read())

def to_string(bytes):
# Normalize already-decoded text to the native `str` type.
# Despite the parameter name, the visible call sites pass the result of
# `.decode('utf-8')`, not raw bytes. A value that is already a `str` is
# returned unchanged; anything else (presumably a Python 2 `unicode`
# object -- confirm against callers) is re-encoded as UTF-8.
if isinstance(bytes, str):
return bytes
return bytes.encode('utf-8')

exitCode = None
for i,(out,err) in enumerate(procData):
res = procs[i].wait()
Expand All @@ -201,11 +206,11 @@ def executeShCmd(cmd, cfg, cwd, results):

# Ensure the resulting output is always of string type.
try:
out = str(out.decode('ascii'))
out = to_string(out.decode('utf-8'))
except:
out = str(out)
try:
err = str(err.decode('ascii'))
err = to_string(err.decode('utf-8'))
except:
err = str(err)

Expand Down Expand Up @@ -314,13 +319,18 @@ def parseIntegratedTestScriptCommands(source_path):
# Python2 and bytes in Python3.
#
# Once we find a match, we do require each script line to be decodable to
# ascii, so we convert the outputs to ascii before returning. This way the
# UTF-8, so we convert the outputs to UTF-8 before returning. This way the
# remaining code can work with "strings" agnostic of the executing Python
# version.

def to_bytes(str):
# Encode to Latin1 to get binary data.
return str.encode('ISO-8859-1')
# Encode to UTF-8 to get binary data.
return str.encode('utf-8')
def to_string(bytes):
# Normalize already-decoded text to the native `str` type, so the yielded
# keyword and line are regular strings rather than a separate text type.
# A value that is already a `str` is returned unchanged; anything else
# (presumably a Python 2 `unicode` object -- confirm against callers) is
# passed to the sibling to_bytes() helper to be encoded.
if isinstance(bytes, str):
return bytes
return to_bytes(bytes)

keywords = ('RUN:', 'XFAIL:', 'REQUIRES:', 'END.')
keywords_re = re.compile(
to_bytes("(%s)(.*)\n" % ("|".join(k for k in keywords),)))
Expand All @@ -341,13 +351,13 @@ def to_bytes(str):
match_position)
last_match_position = match_position

# Convert the keyword and line to ascii strings and yield the
# Convert the keyword and line to UTF-8 strings and yield the
# command. Note that we take care to return regular strings in
# Python 2, to avoid other code having to differentiate between the
# str and unicode types.
keyword,ln = match.groups()
yield (line_number, str(keyword[:-1].decode('ascii')),
str(ln.decode('ascii')))
yield (line_number, to_string(keyword[:-1].decode('utf-8')),
to_string(ln.decode('utf-8')))
finally:
f.close()

Expand Down
2 changes: 1 addition & 1 deletion utils/lit/lit/formats/googletest.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def getGTestTests(self, path, litConfig, localConfig):
try:
lines = lit.util.capture([path, '--gtest_list_tests'],
env=localConfig.environment)
lines = lines.decode('ascii')
lines = lines.decode('utf-8')
if kIsWindows:
lines = lines.replace('\r', '')
lines = lines.split('\n')
Expand Down
9 changes: 7 additions & 2 deletions utils/lit/lit/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,13 +156,18 @@ def executeCommand(command, cwd=None, env=None):
if exitCode == -signal.SIGINT:
raise KeyboardInterrupt

def to_string(bytes):
# Normalize already-decoded command output to the native `str` type.
# Despite the parameter name, the visible call sites pass the result of
# `.decode('utf-8')`, not raw bytes. A value that is already a `str` is
# returned unchanged; anything else (presumably a Python 2 `unicode`
# object -- confirm against callers) is re-encoded as UTF-8.
if isinstance(bytes, str):
return bytes
return bytes.encode('utf-8')

# Ensure the resulting output is always of string type.
try:
out = str(out.decode('ascii'))
out = to_string(out.decode('utf-8'))
except:
out = str(out)
try:
err = str(err.decode('ascii'))
err = to_string(err.decode('utf-8'))
except:
err = str(err)

Expand Down

0 comments on commit c919e57

Please sign in to comment.