[lit] Parse all strings as UTF-8 rather than ASCII.

As far as I can tell UTF-8 has been supported since the beginning of Python's codec support, and it's the de facto standard for text these days, at least for primarily-English text. This allows us to put Unicode into lit RUN lines.

rdar://problem/18311663

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@217688 91177308-0d34-0410-b5e6-96231b3b80d8
stryku · Sep 12, 2014 · c919e57 · c919e57
1 parent 81d53b7
commit c919e57
Show file tree

Hide file tree

Showing 5 changed files with 33 additions and 15 deletions.
diff --git a/test/Other/lit-unicode.txt b/test/Other/lit-unicode.txt
@@ -0,0 +1,3 @@
+REQUIRES: shell
+RUN: echo "ようこそ" | FileCheck %s
+CHECK: {{^}}ようこそ{{$}}
diff --git a/utils/lit/lit/ProgressBar.py b/utils/lit/lit/ProgressBar.py
@@ -6,8 +6,8 @@
 import sys, re, time
 
 def to_bytes(str):
-    # Encode to Latin1 to get binary data.
-    return str.encode('ISO-8859-1')
+    # Encode to UTF-8 to get binary data.
+    return str.encode('utf-8')
 
 class TerminalController:
     """
@@ -136,7 +136,7 @@ def __init__(self, term_stream=sys.stdout):
 
     def _tparm(self, arg, index):
         import curses
-        return curses.tparm(to_bytes(arg), index).decode('ascii') or ''
+        return curses.tparm(to_bytes(arg), index).decode('utf-8') or ''
 
     def _tigetstr(self, cap_name):
         # String capabilities can include "delays" of the form "$<2>".
@@ -147,7 +147,7 @@ def _tigetstr(self, cap_name):
         if cap is None:
             cap = ''
         else:
-            cap = cap.decode('ascii')
+            cap = cap.decode('utf-8')
         return re.sub(r'\$<\d+>[/*]?', '', cap)
 
     def render(self, template):

diff --git a/utils/lit/lit/TestRunner.py b/utils/lit/lit/TestRunner.py
@@ -192,6 +192,11 @@ def executeShCmd(cmd, cfg, cwd, results):
         f.seek(0, 0)
         procData[i] = (procData[i][0], f.read())
 
+    def to_string(bytes):
+        if isinstance(bytes, str):
+            return bytes
+        return bytes.encode('utf-8')
+
     exitCode = None
     for i,(out,err) in enumerate(procData):
         res = procs[i].wait()
@@ -201,11 +206,11 @@ def executeShCmd(cmd, cfg, cwd, results):
 
         # Ensure the resulting output is always of string type.
         try:
-            out = str(out.decode('ascii'))
+            out = to_string(out.decode('utf-8'))
         except:
             out = str(out)
         try:
-            err = str(err.decode('ascii'))
+            err = to_string(err.decode('utf-8'))
         except:
             err = str(err)
 
@@ -314,13 +319,18 @@ def parseIntegratedTestScriptCommands(source_path):
     # Python2 and bytes in Python3.
     #
     # Once we find a match, we do require each script line to be decodable to
-    # ascii, so we convert the outputs to ascii before returning. This way the
+    # UTF-8, so we convert the outputs to UTF-8 before returning. This way the
     # remaining code can work with "strings" agnostic of the executing Python
     # version.
 
     def to_bytes(str):
-        # Encode to Latin1 to get binary data.
-        return str.encode('ISO-8859-1')
+        # Encode to UTF-8 to get binary data.
+        return str.encode('utf-8')
+    def to_string(bytes):
+        if isinstance(bytes, str):
+            return bytes
+        return to_bytes(bytes)
+
     keywords = ('RUN:', 'XFAIL:', 'REQUIRES:', 'END.')
     keywords_re = re.compile(
         to_bytes("(%s)(.*)\n" % ("|".join(k for k in keywords),)))
@@ -341,13 +351,13 @@ def to_bytes(str):
                                       match_position)
             last_match_position = match_position
 
-            # Convert the keyword and line to ascii strings and yield the
+            # Convert the keyword and line to UTF-8 strings and yield the
             # command. Note that we take care to return regular strings in
             # Python 2, to avoid other code having to differentiate between the
             # str and unicode types.
             keyword,ln = match.groups()
-            yield (line_number, str(keyword[:-1].decode('ascii')),
-                   str(ln.decode('ascii')))
+            yield (line_number, to_string(keyword[:-1].decode('utf-8')),
+                   to_string(ln.decode('utf-8')))
     finally:
         f.close()
 

diff --git a/utils/lit/lit/formats/googletest.py b/utils/lit/lit/formats/googletest.py
@@ -31,7 +31,7 @@ def getGTestTests(self, path, litConfig, localConfig):
         try:
             lines = lit.util.capture([path, '--gtest_list_tests'],
                                      env=localConfig.environment)
-            lines = lines.decode('ascii')
+            lines = lines.decode('utf-8')
             if kIsWindows:
               lines = lines.replace('\r', '')
             lines = lines.split('\n')

diff --git a/utils/lit/lit/util.py b/utils/lit/lit/util.py
@@ -156,13 +156,18 @@ def executeCommand(command, cwd=None, env=None):
     if exitCode == -signal.SIGINT:
         raise KeyboardInterrupt
 
+    def to_string(bytes):
+        if isinstance(bytes, str):
+            return bytes
+        return bytes.encode('utf-8')
+
     # Ensure the resulting output is always of string type.
     try:
-        out = str(out.decode('ascii'))
+        out = to_string(out.decode('utf-8'))
     except:
         out = str(out)
     try:
-        err = str(err.decode('ascii'))
+        err = to_string(err.decode('utf-8'))
     except:
         err = str(err)