java_string/
cesu8.rs

1use std::borrow::Cow;
2
3use crate::validations::{utf8_char_width, CONT_MASK, TAG_CONT};
4use crate::{JavaStr, JavaString, Utf8Error};
5
6impl JavaStr {
7    /// Converts from Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format to a `Cow<JavaStr>`.
8    ///
9    /// ```
10    /// # use std::borrow::Cow;
11    /// # use java_string::{JavaCodePoint, JavaStr, JavaString};
12    ///
13    /// let result = JavaStr::from_modified_utf8("Hello World!".as_bytes()).unwrap();
14    /// assert!(matches!(result, Cow::Borrowed(_)));
15    /// assert_eq!(JavaStr::from_str("Hello World!"), result);
16    ///
17    /// let result = JavaStr::from_modified_utf8(&[
18    ///     0x61, 0x62, 0x63, 0xc0, 0x80, 0xe2, 0x84, 0x9d, 0xed, 0xa0, 0xbd, 0xed, 0xb2, 0xa3, 0xed,
19    ///     0xa0, 0x80,
20    /// ])
21    /// .unwrap();
22    /// assert!(matches!(result, Cow::Owned(_)));
23    /// let mut expected = JavaString::from("abc\0ℝ💣");
24    /// expected.push_java(JavaCodePoint::from_u32(0xd800).unwrap());
25    /// assert_eq!(expected, result);
26    ///
27    /// let result = JavaStr::from_modified_utf8(&[0xed]);
28    /// assert!(result.is_err());
29    /// ```
30    #[inline]
31    pub fn from_modified_utf8(bytes: &[u8]) -> Result<Cow<JavaStr>, Utf8Error> {
32        match JavaStr::from_full_utf8(bytes) {
33            Ok(str) => Ok(Cow::Borrowed(str)),
34            Err(_) => JavaString::from_modified_utf8_internal(bytes).map(Cow::Owned),
35        }
36    }
37
38    /// Converts to Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format.
39    ///
40    /// ```
41    /// # use std::borrow::Cow;
42    /// # use java_string::{JavaCodePoint, JavaStr, JavaString};
43    ///
44    /// let result = JavaStr::from_str("Hello World!").to_modified_utf8();
45    /// assert!(matches!(result, Cow::Borrowed(_)));
46    /// assert_eq!(result, &b"Hello World!"[..]);
47    ///
48    /// let mut str = JavaString::from("abc\0ℝ💣");
49    /// str.push_java(JavaCodePoint::from_u32(0xd800).unwrap());
50    /// let result = str.to_modified_utf8();
51    /// let expected = [
52    ///     0x61, 0x62, 0x63, 0xc0, 0x80, 0xe2, 0x84, 0x9d, 0xed, 0xa0, 0xbd, 0xed, 0xb2, 0xa3, 0xed,
53    ///     0xa0, 0x80,
54    /// ];
55    /// assert!(matches!(result, Cow::Owned(_)));
56    /// assert_eq!(result, &expected[..]);
57    /// ```
58    #[inline]
59    #[must_use]
60    pub fn to_modified_utf8(&self) -> Cow<[u8]> {
61        if is_valid_cesu8(self) {
62            Cow::Borrowed(self.as_bytes())
63        } else {
64            Cow::Owned(self.to_modified_utf8_internal())
65        }
66    }
67
68    #[inline]
69    fn to_modified_utf8_internal(&self) -> Vec<u8> {
70        let bytes = self.as_bytes();
71        let mut encoded = Vec::with_capacity((bytes.len() + bytes.len()) >> 2);
72        let mut i = 0;
73        while i < bytes.len() {
74            let b = bytes[i];
75            if b == 0 {
76                encoded.extend([0xc0, 0x80]);
77                i += 1;
78            } else if b < 128 {
79                // Pass ASCII through quickly.
80                encoded.push(b);
81                i += 1;
82            } else {
83                // Figure out how many bytes we need for this character.
84                let w = utf8_char_width(b);
85                let char_bytes = unsafe {
86                    // SAFETY: input must be valid semi UTF-8, so there must be at least w more
87                    // bytes from i
88                    bytes.get_unchecked(i..i + w)
89                };
90                if w != 4 {
91                    // Pass through short UTF-8 sequences unmodified.
92                    encoded.extend(char_bytes.iter().copied())
93                } else {
94                    // Encode 4-byte sequences as 6 bytes
95                    let s = unsafe {
96                        // SAFETY: input is valid semi UTF-8
97                        JavaStr::from_semi_utf8_unchecked(char_bytes)
98                    };
99                    let c = unsafe {
100                        // SAFETY: s contains a single char of width 4
101                        s.chars().next().unwrap_unchecked().as_u32() - 0x10000
102                    };
103                    let s = [((c >> 10) as u16) | 0xd800, ((c & 0x3ff) as u16) | 0xdc00];
104                    encoded.extend(enc_surrogate(s[0]));
105                    encoded.extend(enc_surrogate(s[1]));
106                }
107                i += w;
108            }
109        }
110        encoded
111    }
112}
113
114impl JavaString {
115    /// Converts from Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format to a `JavaString`.
116    ///
117    /// See [`JavaStr::from_modified_utf8`].
118    #[inline]
119    pub fn from_modified_utf8(bytes: Vec<u8>) -> Result<JavaString, Utf8Error> {
120        match JavaString::from_full_utf8(bytes) {
121            Ok(str) => Ok(str),
122            Err(err) => JavaString::from_modified_utf8_internal(&err.bytes),
123        }
124    }
125
126    fn from_modified_utf8_internal(slice: &[u8]) -> Result<JavaString, Utf8Error> {
127        let mut offset = 0;
128        let mut decoded = Vec::with_capacity(slice.len() + 1);
129
130        while let Some(&first) = slice.get(offset) {
131            let old_offset = offset;
132            offset += 1;
133
134            macro_rules! err {
135                ($error_len:expr) => {
136                    return Err(Utf8Error {
137                        valid_up_to: old_offset,
138                        error_len: $error_len,
139                    })
140                };
141            }
142
143            macro_rules! next {
144                () => {{
145                    if let Some(&b) = slice.get(offset) {
146                        offset += 1;
147                        b
148                    } else {
149                        err!(None)
150                    }
151                }};
152            }
153
154            macro_rules! next_cont {
155                ($error_len:expr) => {{
156                    let byte = next!();
157                    if (byte) & !CONT_MASK == TAG_CONT {
158                        byte
159                    } else {
160                        err!($error_len)
161                    }
162                }};
163            }
164
165            if first == 0 {
166                // modified UTF-8 should never contain \0 directly.
167                err!(Some(1));
168            } else if first < 128 {
169                // Pass ASCII through directly.
170                decoded.push(first);
171            } else if first == 0xc0 {
172                // modified UTF-8 encoding of null character
173                match next!() {
174                    0x80 => decoded.push(0),
175                    _ => err!(Some(1)),
176                }
177            } else {
178                let w = utf8_char_width(first);
179                let second = next_cont!(Some(1));
180                match w {
181                    // Two-byte sequences can be used directly.
182                    2 => {
183                        decoded.extend([first, second]);
184                    }
185                    3 => {
186                        let third = next_cont!(Some(2));
187                        #[allow(clippy::unnested_or_patterns)] // Justification: readability
188                        match (first, second) {
189                            // These are valid UTF-8, so pass them through.
190                            (0xe0, 0xa0..=0xbf)
191                            | (0xe1..=0xec, 0x80..=0xbf)
192                            | (0xed, 0x80..=0x9f)
193                            | (0xee..=0xef, 0x80..=0xbf)
194                            // Second half of a surrogate pair without a preceding first half, also pass this through.
195                            | (0xed, 0xb0..=0xbf)
196                            => decoded.extend([first, second, third]),
197                            // First half of a surrogate pair
198                            (0xed, 0xa0..=0xaf) => {
199                                // Peek ahead and try to pair the first half of surrogate pair with
200                                // second.
201                                match &slice[offset..] {
202                                    [0xed, fifth @ 0xb0..=0xbf, sixth, ..]
203                                    if *sixth & !CONT_MASK == TAG_CONT =>
204                                        {
205                                            let s = dec_surrogates(second, third, *fifth, *sixth);
206                                            decoded.extend(s);
207                                            offset += 3;
208                                        }
209                                    _ => {
210                                        // No second half, append the first half directly.
211                                        decoded.extend([first, second, third]);
212                                    }
213                                }
214                            }
215                            _ => err!(Some(1)),
216                        }
217                    }
218                    _ => err!(Some(1)), // modified UTF-8 doesn't allow width 4
219                }
220            }
221        }
222
223        unsafe {
224            // SAFETY: we built a semi UTF-8 encoded string
225            Ok(JavaString::from_semi_utf8_unchecked(decoded))
226        }
227    }
228
229    /// Converts to Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format.
230    ///
231    /// See [`JavaStr::to_modified_utf8`].
232    #[inline]
233    #[must_use]
234    pub fn into_modified_utf8(self) -> Vec<u8> {
235        if is_valid_cesu8(&self) {
236            self.into_bytes()
237        } else {
238            self.to_modified_utf8_internal()
239        }
240    }
241}
242
243#[inline]
244fn dec_surrogate(second: u8, third: u8) -> u32 {
245    0xd000 | (u32::from(second & CONT_MASK) << 6) | u32::from(third & CONT_MASK)
246}
247
248#[inline]
249fn dec_surrogates(second: u8, third: u8, fifth: u8, sixth: u8) -> [u8; 4] {
250    // Convert to a 32-bit code point.
251    let s1 = dec_surrogate(second, third);
252    let s2 = dec_surrogate(fifth, sixth);
253    let c = 0x10000 + (((s1 - 0xd800) << 10) | (s2 - 0xdc00));
254    assert!((0x010000..=0x10ffff).contains(&c));
255
256    // Convert to UTF-8.
257    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
258    [
259        0b1111_0000_u8 | ((c & 0b1_1100_0000_0000_0000_0000) >> 18) as u8,
260        TAG_CONT | ((c & 0b0_0011_1111_0000_0000_0000) >> 12) as u8,
261        TAG_CONT | ((c & 0b0_0000_0000_1111_1100_0000) >> 6) as u8,
262        TAG_CONT | (c & 0b0_0000_0000_0000_0011_1111) as u8,
263    ]
264}
265
266#[inline]
267fn is_valid_cesu8(text: &JavaStr) -> bool {
268    text.bytes()
269        .all(|b| b != 0 && ((b & !CONT_MASK) == TAG_CONT || utf8_char_width(b) <= 3))
270}
271
272#[inline]
273fn enc_surrogate(surrogate: u16) -> [u8; 3] {
274    // 1110xxxx 10xxxxxx 10xxxxxx
275    [
276        0b11100000 | ((surrogate & 0b11110000_00000000) >> 12) as u8,
277        TAG_CONT | ((surrogate & 0b00001111_11000000) >> 6) as u8,
278        TAG_CONT | (surrogate & 0b00000000_00111111) as u8,
279    ]
280}