rust-lang · zachs18 · Dec 21, 2024 · Dec 21, 2024 · Dec 21, 2024 · Dec 21, 2024
diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs
@@ -350,6 +350,46 @@ use crate::vec::Vec;
 ///
 /// Here, there's no need to allocate more memory inside the loop.
 ///
+/// # Invariant
+///
+/// Rust libraries may assume that `String`s are always valid UTF-8, [just like `str`s](str#invariant).
+///
+/// Constructing a non-UTF-8 `String` is not immediate undefined behavior, but any function
+/// called on a `String` may assume that it is valid UTF-8, which means that a non-UTF-8 `String`
+/// can lead to undefined behavior down the road.
+///
+/// As an exception, some functions explicitly allow invalid UTF-8, and will not immediately cause undefined
+/// behavior if called on a `String` containing invalid UTF-8. In general, all of `String`'s associated
+/// functions other than those listed here should be assumed to require their input be valid UTF-8.
+/// Note that calling one of these functions on a `String` containing invalid UTF-8, may result in the return value
+/// also containing invalid UTF-8, if relevant.
+///
+/// * `String::as_bytes`
+/// * `String::as_bytes_mut`
+/// * `String::as_str`
+/// * `String::as_mut_str`
+/// * `String::as_ptr`
+/// * `String::as_mut_ptr`
+/// * `String::as_mut_vec`
+/// * `String::capacity`
+/// * `String::len`
+/// * `String::clear`
+/// * `<String as Drop>::drop` (i.e. dropping a `String` that contains invalid UTF-8 does not alone cause UB)
+/// * `String::leak`
+/// * `String::into_boxed_str`
+/// * `String::reserve`
+/// * `String::reserve_exact`
+/// * `String::try_reserve`
+/// * `String::try_reserve_exact`
+/// * `<String as Deref>::deref`
+/// * `<String as DerefMut>::deref_mut`
+/// * `<String as AsRef<str>>::as_ref`
+/// * `<String as AsMut<str>>::as_mut`
+/// * `<String as Borrow<str>>::borrow`
+/// * `<String as BorrowMut<str>>::borrow_mut`
+/// * `<String as Clone>::clone`
+/// * `<String as Clone>::clone_from`
+///
 /// [str]: prim@str "str"
 /// [`str`]: prim@str "str"
 /// [`&str`]: prim@str "&str"
@@ -1023,7 +1063,7 @@ impl String {
     /// This function is unsafe because it does not check that the bytes passed
     /// to it are valid UTF-8. If this constraint is violated, it may cause
     /// memory unsafety issues with future users of the `String`, as the rest of
-    /// the standard library assumes that `String`s are valid UTF-8.
+    /// the standard library [assumes that `String`s are valid UTF-8](String#invariant).
     ///
     /// # Examples
     ///
@@ -1775,8 +1815,8 @@ impl String {
     /// This function is unsafe because the returned `&mut Vec` allows writing
     /// bytes which are not valid UTF-8. If this constraint is violated, using
     /// the original `String` after dropping the `&mut Vec` may violate memory
-    /// safety, as the rest of the standard library assumes that `String`s are
-    /// valid UTF-8.
+    /// safety, as the rest of the standard library [assumes that `String`s are
+    /// valid UTF-8](String#invariant).
     ///
     /// # Examples
     ///

diff --git a/library/core/src/primitive_docs.rs b/library/core/src/primitive_docs.rs
@@ -1015,6 +1015,29 @@ mod prim_slice {}
 /// Constructing a non-UTF-8 string slice is not immediate undefined behavior, but any function
 /// called on a string slice may assume that it is valid UTF-8, which means that a non-UTF-8 string
 /// slice can lead to undefined behavior down the road.
+///
+/// As an exception, some functions explicitly allow invalid UTF-8, and will not immediately cause undefined
+/// behavior if called on a `str` containing invalid UTF-8. In general, all of `str`'s associated
+/// functions other than those listed here should be assumed to require their input be valid UTF-8.
+/// Note that calling one of these functions on a `str` containing invalid UTF-8, may result in the return value
+/// also containing invalid UTF-8, if relevant.
+///
+/// * `str::as_bytes`
+/// * `str::as_bytes_mut`
+/// * `str::as_str`
+/// * `str::as_ptr`
+/// * `str::as_mut_ptr`
+/// * `str::bytes`
+/// * `str::into_string`
+/// * `str::into_boxed_bytes`
+/// * `str::len`
+/// * `<str as AsRef<[u8]>>::as_ref`
+/// * `<str as AsRef<str>>::as_ref`
+/// * `<str as AsMut<str>>::as_mut`
+/// * `<str as Borrow<str>>::borrow`
+/// * `<str as BorrowMut<str>>::borrow_mut`
+/// * `<Box<str> as Clone>::clone`
+/// * `<Box<str> as Clone>::clone_from`
 #[stable(feature = "rust1", since = "1.0.0")]
 mod prim_str {}
 

diff --git a/library/core/src/str/converts.rs b/library/core/src/str/converts.rs
@@ -2,7 +2,7 @@
 
 use super::Utf8Error;
 use super::validations::run_utf8_validation;
-use crate::{mem, ptr};
+use crate::ptr;
 
 /// Converts a slice of bytes to a string slice.
 ///
@@ -145,7 +145,10 @@ pub const fn from_utf8_mut(v: &mut [u8]) -> Result<&mut str, Utf8Error> {
 ///
 /// # Safety
 ///
-/// The bytes passed in must be valid UTF-8.
+/// This function is unsafe because it does not check that the bytes passed
+/// to it are valid UTF-8. If this constraint is violated, it may cause
+/// memory unsafety issues with future users of the `str`, as the rest of
+/// the standard library [assumes that `str`s are valid UTF-8](str#invariant).
 ///
 /// # Examples
 ///
@@ -169,9 +172,11 @@ pub const fn from_utf8_mut(v: &mut [u8]) -> Result<&mut str, Utf8Error> {
 #[rustc_const_stable(feature = "const_str_from_utf8_unchecked", since = "1.55.0")]
 #[rustc_diagnostic_item = "str_from_utf8_unchecked"]
 pub const unsafe fn from_utf8_unchecked(v: &[u8]) -> &str {
-    // SAFETY: the caller must guarantee that the bytes `v` are valid UTF-8.
-    // Also relies on `&str` and `&[u8]` having the same layout.
-    unsafe { mem::transmute(v) }
+    // SAFETY: the pointer dereference is safe because that pointer
+    // comes from a reference which is guaranteed to be valid for reads.
+    // If the input bytes are not valid UTF-8, then the returned `&str` will
+    // have invalid UTF-8, which is unsafe but not immediate UB.
+    unsafe { &*(v as *const [u8] as *const str) }
 }
 
 /// Converts a slice of bytes to a string slice without checking
@@ -197,10 +202,10 @@ pub const unsafe fn from_utf8_unchecked(v: &[u8]) -> &str {
 #[rustc_const_stable(feature = "const_str_from_utf8_unchecked_mut", since = "1.83.0")]
 #[rustc_diagnostic_item = "str_from_utf8_unchecked_mut"]
 pub const unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str {
-    // SAFETY: the caller must guarantee that the bytes `v`
-    // are valid UTF-8, thus the cast to `*mut str` is safe.
-    // Also, the pointer dereference is safe because that pointer
+    // SAFETY: the pointer dereference is safe because that pointer
     // comes from a reference which is guaranteed to be valid for writes.
+    // If the input bytes are not valid UTF-8, then the returned `&mut str` will
+    // have invalid UTF-8, which is unsafe but not immediate UB.
     unsafe { &mut *(v as *mut [u8] as *mut str) }
 }
 

diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs
@@ -309,10 +309,11 @@ impl str {
     ///
     /// # Safety
     ///
-    /// The caller must ensure that the content of the slice is valid UTF-8
-    /// before the borrow ends and the underlying `str` is used.
-    ///
-    /// Use of a `str` whose contents are not valid UTF-8 is undefined behavior.
+    /// This function is unsafe because the returned `&mut [u8]` allows writing
+    /// bytes which are not valid UTF-8. If this constraint is violated, using
+    /// the original `str` after the `&mut [u8]` borrow expires may violate memory
+    /// safety, as the rest of the standard library [assumes that `str`s are
+    /// valid UTF-8](str#invariant).
     ///
     /// # Examples
     ///