forked from swiftlang/swift
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Unicode.cpp
113 lines (93 loc) · 3.67 KB
/
Unicode.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
//===--- Unicode.cpp - Unicode utilities ----------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See http://swift.org/LICENSE.txt for license information
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
#include "swift/Basic/Unicode.h"
#include "llvm/Support/ConvertUTF.h"
using namespace swift;
/// Returns the prefix of \p S that forms its first extended grapheme cluster,
/// following the segmentation rules of Unicode Standard Annex #29.
///
/// An empty input yields an empty StringRef.  If not even one scalar can be
/// decoded from \p S, the whole of \p S is returned unchanged.
StringRef swift::unicode::extractFirstExtendedGraphemeCluster(StringRef S) {
  if (S.empty())
    return StringRef();

  const UTF8 *const Begin = reinterpret_cast<const UTF8 *>(S.data());
  const UTF8 *const End = Begin + S.size();
  const UTF8 *Cursor = Begin;

  // One-element output window: each conversion call below is allowed to
  // produce at most a single scalar, so the cursor advances scalar by scalar.
  UTF32 Scalar;
  UTF32 *Out = &Scalar;
  ConvertUTF8toUTF32(&Cursor, End, &Out, &Scalar + 1, lenientConversion);
  if (Out == &Scalar) {
    // Nothing was decoded: the input ends in an ill-formed subsequence.
    return S;
  }

  GraphemeClusterBreakProperty PrevGCB =
      getGraphemeClusterBreakProperty(Scalar);
  for (;;) {
    // Byte offset just past the scalars accepted into the cluster so far.
    const size_t ClusterEnd = Cursor - Begin;

    // Some scalars always end a cluster, whatever follows them.
    if (isExtendedGraphemeClusterBoundaryAfter(PrevGCB))
      return S.slice(0, ClusterEnd);

    // Decode the following scalar, reusing the same slot; the previous
    // scalar's value is no longer needed, only its break property.
    Out = &Scalar;
    ConvertUTF8toUTF32(&Cursor, End, &Out, &Scalar + 1, lenientConversion);
    if (Out == &Scalar) {
      // Either the input is exhausted or its tail is ill-formed; the cluster
      // ends with what has been accepted so far.
      return S.slice(0, ClusterEnd);
    }

    const GraphemeClusterBreakProperty NextGCB =
        getGraphemeClusterBreakProperty(Scalar);
    if (isExtendedGraphemeClusterBoundary(PrevGCB, NextGCB))
      return S.slice(0, ClusterEnd);

    // No boundary between the pair: the new scalar joins the cluster.
    PrevGCB = NextGCB;
  }
}
/// Decodes the leading Unicode scalar of the UTF-8 string \p S.
///
/// \param S       the UTF-8 text to inspect.
/// \param Scalar  receives the first decoded scalar; left untouched when
///                nothing could be decoded.
/// \returns true only when a scalar was decoded and that scalar spans all of
///          \p S.  When \p S holds more than one scalar, \p Scalar is still
///          set to the first one but the result is false.
static bool extractFirstUnicodeScalarImpl(StringRef S, unsigned &Scalar) {
  if (S.empty())
    return false;

  const UTF8 *const Begin = reinterpret_cast<const UTF8 *>(S.data());
  const UTF8 *const End = Begin + S.size();
  const UTF8 *Cursor = Begin;

  // The output window holds exactly one scalar, so the conversion stops after
  // the first one.
  UTF32 Decoded;
  UTF32 *Out = &Decoded;
  ConvertUTF8toUTF32(&Cursor, End, &Out, &Decoded + 1, lenientConversion);

  const bool WroteScalar = Out != &Decoded;
  if (!WroteScalar) {
    // The source string contains an ill-formed subsequence at the end.
    return false;
  }

  Scalar = Decoded;
  // A single scalar means the conversion consumed every byte of the input.
  return Cursor == End;
}
/// Returns true when \p S decodes to one Unicode scalar that covers the
/// entire string; the decoded value itself is discarded.
bool swift::unicode::isSingleUnicodeScalar(StringRef S) {
  unsigned Ignored;
  const bool IsSingle = extractFirstUnicodeScalarImpl(S, Ignored);
  return IsSingle;
}
/// Returns the Unicode scalar encoded by \p S.
///
/// Precondition: \p S consists of exactly one Unicode scalar (checked by an
/// assertion).  When assertions are compiled out and the precondition is
/// violated, the first scalar of \p S is returned if one could be decoded,
/// and 0 otherwise.
unsigned swift::unicode::extractFirstUnicodeScalar(StringRef S) {
  // Initialized so that a build without assertions never returns an
  // indeterminate value: the helper leaves its output untouched when the
  // input is empty or nothing could be decoded.
  unsigned Scalar = 0;
  bool Result = extractFirstUnicodeScalarImpl(S, Scalar);
  assert(Result && "string does not consist of one Unicode scalar");
  (void)Result;
  return Scalar;
}
/// Returns the number of UTF-16 code units needed to represent the UTF-8
/// string \p Str.
///
/// The length is obtained by actually transcoding \p Str into a scratch
/// buffer with strict conversion; an assertion checks that the conversion
/// succeeded.
uint64_t swift::unicode::getUTF16Length(StringRef Str) {
  // Scratch output buffer, sized one element larger than the input byte
  // count.  The conversion itself is limited to Str.size() output units.
  SmallVector<UTF16, 128> Scratch(Str.size() + 1);

  const UTF8 *Src = reinterpret_cast<const UTF8 *>(Str.data());
  const UTF8 *const SrcEnd = Src + Str.size();
  UTF16 *const DstBegin = &Scratch[0];
  UTF16 *Dst = DstBegin;

  ConversionResult Result = ConvertUTF8toUTF16(
      &Src, SrcEnd, &Dst, DstBegin + Str.size(), strictConversion);
  assert(Result == conversionOK &&
         "UTF-8 encoded string cannot be converted into UTF-16 encoding");
  (void)Result;

  // Distance the output cursor travelled == number of UTF-16 code units
  // written.
  return Dst - DstBegin;
}