Auto merge of rust-lang#27012 - pornel:master, r=Gankro

Fixes rust-lang#26689. This PR tries to clarify uses of "character" where it means "code point" or "UTF-8 sequence", which are almost, but not quite, the same. Edge cases were added to some examples to demonstrate this. However, I've kept the term "code point" instead of "Unicode scalar value", because in UTF-8 they're the same, and "code point" is more widely known.
vhbit · Jul 26, 2015 · 6232f95 · 6232f95
2 parents cf7e825 + c20e3fc
commit 6232f95
Showing 1 changed file with 71 additions and 47 deletions.
diff --git a/src/libcollections/str.rs b/src/libcollections/str.rs
@@ -500,7 +500,7 @@ impl str {
     ///
     /// # Unsafety
     ///
-    /// Caller must check both UTF-8 character boundaries and the boundaries
+    /// Caller must check both UTF-8 sequence boundaries and the boundaries
     /// of the entire slice as
     /// well.
     ///
@@ -526,15 +526,16 @@ impl str {
         core_str::StrExt::slice_mut_unchecked(self, begin, end)
     }
 
-    /// Returns a slice of the string from the character range [`begin`..`end`).
+    /// Returns a slice of the string from the range [`begin`..`end`) where indices
+    /// are counted in code points.
     ///
     /// That is, start at the `begin`-th code point of the string and continue
     /// to the `end`-th code point. This does not detect or handle edge cases
-    /// such as leaving a combining character as the first code point of the
+    /// such as leaving a combining character as the first `char` of the
     /// string.
     ///
     /// Due to the design of UTF-8, this operation is `O(end)`. Use slicing
-    /// syntax if you want to use byte indices rather than codepoint indices.
+    /// syntax if you want to use `O(1)` byte indices instead.
     ///
     /// # Panics
     ///
@@ -556,26 +557,26 @@ impl str {
         core_str::StrExt::slice_chars(self, begin, end)
     }
 
-    /// Given a byte position, return the next char and its index.
+    /// Given a byte position, return the next code point and its index.
     ///
-    /// This can be used to iterate over the Unicode characters of a string.
+    /// This can be used to iterate over the Unicode code points of a string.
     ///
     /// # Panics
     ///
     /// If `i` is greater than or equal to the length of the string.
-    /// If `i` is not the index of the beginning of a valid UTF-8 character.
+    /// If `i` is not the index of the beginning of a valid UTF-8 sequence.
     ///
     /// # Examples
     ///
-    /// This example manually iterates through the characters of a string;
+    /// This example manually iterates through the code points of a string;
     /// this should normally be
     /// done by `.chars()` or `.char_indices()`.
     ///
     /// ```
     /// # #![feature(str_char, core)]
     /// use std::str::CharRange;
     ///
-    /// let s = "中华Việt Nam";
+    /// let s = "中华Việt Nam";
     /// let mut i = 0;
     /// while i < s.len() {
     ///     let CharRange {ch, next} = s.char_range_at(i);
@@ -591,12 +592,14 @@ impl str {
     /// 3: 华
     /// 6: V
     /// 7: i
-    /// 8: ệ
-    /// 11: t
-    /// 12:
-    /// 13: N
-    /// 14: a
-    /// 15: m
+    /// 8: e
+    /// 9: ̣
+    /// 11: ̂
+    /// 13: t
+    /// 14:
+    /// 15: N
+    /// 16: a
+    /// 17: m
     /// ```
     #[unstable(feature = "str_char",
                reason = "often replaced by char_indices, this method may \
@@ -608,26 +611,29 @@ impl str {
 
     /// Given a byte position, return the previous `char` and its position.
     ///
-    /// This function can be used to iterate over a Unicode string in reverse.
+    /// This function can be used to iterate over Unicode code points in reverse.
+    ///
+    /// Note that Unicode has many features, such as combining marks, ligatures,
+    /// and direction marks, that need to be taken into account to correctly reverse a string.
     ///
     /// Returns 0 for next index if called on start index 0.
     ///
     /// # Panics
     ///
     /// If `i` is greater than the length of the string.
-    /// If `i` is not an index following a valid UTF-8 character.
+    /// If `i` is not an index following a valid UTF-8 sequence.
     ///
     /// # Examples
     ///
-    /// This example manually iterates through the characters of a string;
+    /// This example manually iterates through the code points of a string;
     /// this should normally be
     /// done by `.chars().rev()` or `.char_indices()`.
     ///
     /// ```
     /// # #![feature(str_char, core)]
     /// use std::str::CharRange;
     ///
-    /// let s = "中华Việt Nam";
+    /// let s = "中华Việt Nam";
     /// let mut i = s.len();
     /// while i > 0 {
     ///     let CharRange {ch, next} = s.char_range_at_reverse(i);
@@ -639,12 +645,14 @@ impl str {
     /// This outputs:
     ///
     /// ```text
-    /// 16: m
-    /// 15: a
-    /// 14: N
-    /// 13:
-    /// 12: t
-    /// 11: ệ
+    /// 18: m
+    /// 17: a
+    /// 16: N
+    /// 15:
+    /// 14: t
+    /// 13: ̂
+    /// 11: ̣
+    /// 9: e
     /// 8: i
     /// 7: V
     /// 6: 华
@@ -663,7 +671,7 @@ impl str {
     /// # Panics
     ///
     /// If `i` is greater than or equal to the length of the string.
-    /// If `i` is not the index of the beginning of a valid UTF-8 character.
+    /// If `i` is not the index of the beginning of a valid UTF-8 sequence.
     ///
     /// # Examples
     ///
@@ -672,6 +680,7 @@ impl str {
     /// let s = "abπc";
     /// assert_eq!(s.char_at(1), 'b');
     /// assert_eq!(s.char_at(2), 'π');
+    /// assert_eq!(s.char_at(4), 'c');
     /// ```
     #[unstable(feature = "str_char",
                reason = "frequently replaced by the chars() iterator, this \
@@ -689,7 +698,7 @@ impl str {
     /// # Panics
     ///
     /// If `i` is greater than the length of the string.
-    /// If `i` is not an index following a valid UTF-8 character.
+    /// If `i` is not an index following a valid UTF-8 sequence.
     ///
     /// # Examples
     ///
@@ -698,6 +707,7 @@ impl str {
     /// let s = "abπc";
     /// assert_eq!(s.char_at_reverse(1), 'a');
     /// assert_eq!(s.char_at_reverse(2), 'b');
+    /// assert_eq!(s.char_at_reverse(4), 'π');
     /// ```
     #[unstable(feature = "str_char",
                reason = "see char_at for more details, but reverse semantics \
@@ -707,28 +717,30 @@ impl str {
         core_str::StrExt::char_at_reverse(self, i)
     }
 
-    /// Retrieves the first character from a `&str` and returns it.
+    /// Retrieves the first code point from a `&str` and returns it.
+    ///
+    /// Note that a single Unicode character (grapheme cluster)
+    /// can be composed of multiple `char`s.
     ///
     /// This does not allocate a new string; instead, it returns a slice that
-    /// points one character
-    /// beyond the character that was shifted.
+    /// points one code point beyond the code point that was shifted.
     ///
-    /// If the slice does not contain any characters, None is returned instead.
+    /// `None` is returned if the slice is empty.
     ///
     /// # Examples
     ///
     /// ```
     /// # #![feature(str_char)]
-    /// let s = "Löwe 老虎 Léopard";
+    /// let s = "Łódź"; // \u{141}o\u{301}dz\u{301}
     /// let (c, s1) = s.slice_shift_char().unwrap();
     ///
-    /// assert_eq!(c, 'L');
-    /// assert_eq!(s1, "öwe 老虎 Léopard");
+    /// assert_eq!(c, 'Ł');
+    /// assert_eq!(s1, "ódź");
     ///
     /// let (c, s2) = s1.slice_shift_char().unwrap();
     ///
-    /// assert_eq!(c, 'ö');
-    /// assert_eq!(s2, "we 老虎 Léopard");
+    /// assert_eq!(c, 'o');
+    /// assert_eq!(s2, "\u{301}dz\u{301}");
     /// ```
     #[unstable(feature = "str_char",
                reason = "awaiting conventions about shifting and slices and \
@@ -741,14 +753,14 @@ impl str {
     /// Divide one string slice into two at an index.
     ///
     /// The index `mid` is a byte offset from the start of the string
-    /// that must be on a character boundary.
+    /// that must be on a `char` boundary.
     ///
     /// Return slices `&self[..mid]` and `&self[mid..]`.
     ///
     /// # Panics
     ///
-    /// Panics if `mid` is beyond the last character of the string,
-    /// or if it is not on a character boundary.
+    /// Panics if `mid` is beyond the last code point of the string,
+    /// or if it is not on a `char` boundary.
     ///
     /// # Examples
     /// ```
@@ -773,27 +785,39 @@ impl str {
         core_str::StrExt::split_at_mut(self, mid)
     }
 
-    /// An iterator over the codepoints of `self`.
+    /// An iterator over the code points of `self`.
+    ///
+    /// In Unicode, the relationship between code points and characters is complex.
+    /// A single character may be composed of multiple code points
+    /// (e.g. diacritical marks added to a letter), and a single code point
+    /// (e.g. Hangul syllable) may contain multiple characters.
+    ///
+    /// For iteration over human-readable characters a grapheme cluster iterator
+    /// may be more appropriate. See the [unicode-segmentation crate][1].
+    ///
+    /// [1]: https://crates.io/crates/unicode-segmentation
     ///
     /// # Examples
     ///
     /// ```
-    /// let v: Vec<char> = "abc åäö".chars().collect();
+    /// let v: Vec<char> = "ASCII żółć 🇨🇭 한".chars().collect();
     ///
-    /// assert_eq!(v, ['a', 'b', 'c', ' ', 'å', 'ä', 'ö']);
+    /// assert_eq!(v, ['A', 'S', 'C', 'I', 'I', ' ',
+    ///     'z', '\u{307}', 'o', '\u{301}', 'ł', 'c', '\u{301}', ' ',
+    ///     '\u{1f1e8}', '\u{1f1ed}', ' ', '한']);
     /// ```
     #[stable(feature = "rust1", since = "1.0.0")]
     pub fn chars(&self) -> Chars {
         core_str::StrExt::chars(self)
     }
 
-    /// An iterator over the characters of `self` and their byte offsets.
+    /// An iterator over the `char`s of `self` and their byte offsets.
     ///
     /// # Examples
     ///
     /// ```
-    /// let v: Vec<(usize, char)> = "abc".char_indices().collect();
-    /// let b = vec![(0, 'a'), (1, 'b'), (2, 'c')];
+    /// let v: Vec<(usize, char)> = "A🇨🇭".char_indices().collect();
+    /// let b = vec![(0, 'A'), (1, '\u{1f1e8}'), (5, '\u{1f1ed}')];
     ///
     /// assert_eq!(v, b);
     /// ```
@@ -822,7 +846,7 @@ impl str {
     /// # Examples
     ///
     /// ```
-    /// let some_words = " Mary   had\ta little  \n\t lamb";
+    /// let some_words = " Mary   had\ta\u{2009}little  \n\t lamb";
     /// let v: Vec<&str> = some_words.split_whitespace().collect();
     ///
     /// assert_eq!(v, ["Mary", "had", "a", "little", "lamb"]);
@@ -840,7 +864,7 @@ impl str {
     /// ```
     /// # #![feature(str_words)]
     /// # #![allow(deprecated)]
-    /// let some_words = " Mary   had\ta little  \n\t lamb";
+    /// let some_words = " Mary   had\ta\u{2009}little  \n\t lamb";
     /// let v: Vec<&str> = some_words.words().collect();
     ///
     /// assert_eq!(v, ["Mary", "had", "a", "little", "lamb"]);