java_string/cesu8.rs
1use std::borrow::Cow;
2
3use crate::validations::{utf8_char_width, CONT_MASK, TAG_CONT};
4use crate::{JavaStr, JavaString, Utf8Error};
5
6impl JavaStr {
7 /// Converts from Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format to a `Cow<JavaStr>`.
8 ///
9 /// ```
10 /// # use std::borrow::Cow;
11 /// # use java_string::{JavaCodePoint, JavaStr, JavaString};
12 ///
13 /// let result = JavaStr::from_modified_utf8("Hello World!".as_bytes()).unwrap();
14 /// assert!(matches!(result, Cow::Borrowed(_)));
15 /// assert_eq!(JavaStr::from_str("Hello World!"), result);
16 ///
17 /// let result = JavaStr::from_modified_utf8(&[
18 /// 0x61, 0x62, 0x63, 0xc0, 0x80, 0xe2, 0x84, 0x9d, 0xed, 0xa0, 0xbd, 0xed, 0xb2, 0xa3, 0xed,
19 /// 0xa0, 0x80,
20 /// ])
21 /// .unwrap();
22 /// assert!(matches!(result, Cow::Owned(_)));
23 /// let mut expected = JavaString::from("abc\0ℝ💣");
24 /// expected.push_java(JavaCodePoint::from_u32(0xd800).unwrap());
25 /// assert_eq!(expected, result);
26 ///
27 /// let result = JavaStr::from_modified_utf8(&[0xed]);
28 /// assert!(result.is_err());
29 /// ```
30 #[inline]
31 pub fn from_modified_utf8(bytes: &[u8]) -> Result<Cow<JavaStr>, Utf8Error> {
32 match JavaStr::from_full_utf8(bytes) {
33 Ok(str) => Ok(Cow::Borrowed(str)),
34 Err(_) => JavaString::from_modified_utf8_internal(bytes).map(Cow::Owned),
35 }
36 }
37
38 /// Converts to Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format.
39 ///
40 /// ```
41 /// # use std::borrow::Cow;
42 /// # use java_string::{JavaCodePoint, JavaStr, JavaString};
43 ///
44 /// let result = JavaStr::from_str("Hello World!").to_modified_utf8();
45 /// assert!(matches!(result, Cow::Borrowed(_)));
46 /// assert_eq!(result, &b"Hello World!"[..]);
47 ///
48 /// let mut str = JavaString::from("abc\0ℝ💣");
49 /// str.push_java(JavaCodePoint::from_u32(0xd800).unwrap());
50 /// let result = str.to_modified_utf8();
51 /// let expected = [
52 /// 0x61, 0x62, 0x63, 0xc0, 0x80, 0xe2, 0x84, 0x9d, 0xed, 0xa0, 0xbd, 0xed, 0xb2, 0xa3, 0xed,
53 /// 0xa0, 0x80,
54 /// ];
55 /// assert!(matches!(result, Cow::Owned(_)));
56 /// assert_eq!(result, &expected[..]);
57 /// ```
58 #[inline]
59 #[must_use]
60 pub fn to_modified_utf8(&self) -> Cow<[u8]> {
61 if is_valid_cesu8(self) {
62 Cow::Borrowed(self.as_bytes())
63 } else {
64 Cow::Owned(self.to_modified_utf8_internal())
65 }
66 }
67
68 #[inline]
69 fn to_modified_utf8_internal(&self) -> Vec<u8> {
70 let bytes = self.as_bytes();
71 let mut encoded = Vec::with_capacity((bytes.len() + bytes.len()) >> 2);
72 let mut i = 0;
73 while i < bytes.len() {
74 let b = bytes[i];
75 if b == 0 {
76 encoded.extend([0xc0, 0x80]);
77 i += 1;
78 } else if b < 128 {
79 // Pass ASCII through quickly.
80 encoded.push(b);
81 i += 1;
82 } else {
83 // Figure out how many bytes we need for this character.
84 let w = utf8_char_width(b);
85 let char_bytes = unsafe {
86 // SAFETY: input must be valid semi UTF-8, so there must be at least w more
87 // bytes from i
88 bytes.get_unchecked(i..i + w)
89 };
90 if w != 4 {
91 // Pass through short UTF-8 sequences unmodified.
92 encoded.extend(char_bytes.iter().copied())
93 } else {
94 // Encode 4-byte sequences as 6 bytes
95 let s = unsafe {
96 // SAFETY: input is valid semi UTF-8
97 JavaStr::from_semi_utf8_unchecked(char_bytes)
98 };
99 let c = unsafe {
100 // SAFETY: s contains a single char of width 4
101 s.chars().next().unwrap_unchecked().as_u32() - 0x10000
102 };
103 let s = [((c >> 10) as u16) | 0xd800, ((c & 0x3ff) as u16) | 0xdc00];
104 encoded.extend(enc_surrogate(s[0]));
105 encoded.extend(enc_surrogate(s[1]));
106 }
107 i += w;
108 }
109 }
110 encoded
111 }
112}
113
114impl JavaString {
115 /// Converts from Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format to a `JavaString`.
116 ///
117 /// See [`JavaStr::from_modified_utf8`].
118 #[inline]
119 pub fn from_modified_utf8(bytes: Vec<u8>) -> Result<JavaString, Utf8Error> {
120 match JavaString::from_full_utf8(bytes) {
121 Ok(str) => Ok(str),
122 Err(err) => JavaString::from_modified_utf8_internal(&err.bytes),
123 }
124 }
125
126 fn from_modified_utf8_internal(slice: &[u8]) -> Result<JavaString, Utf8Error> {
127 let mut offset = 0;
128 let mut decoded = Vec::with_capacity(slice.len() + 1);
129
130 while let Some(&first) = slice.get(offset) {
131 let old_offset = offset;
132 offset += 1;
133
134 macro_rules! err {
135 ($error_len:expr) => {
136 return Err(Utf8Error {
137 valid_up_to: old_offset,
138 error_len: $error_len,
139 })
140 };
141 }
142
143 macro_rules! next {
144 () => {{
145 if let Some(&b) = slice.get(offset) {
146 offset += 1;
147 b
148 } else {
149 err!(None)
150 }
151 }};
152 }
153
154 macro_rules! next_cont {
155 ($error_len:expr) => {{
156 let byte = next!();
157 if (byte) & !CONT_MASK == TAG_CONT {
158 byte
159 } else {
160 err!($error_len)
161 }
162 }};
163 }
164
165 if first == 0 {
166 // modified UTF-8 should never contain \0 directly.
167 err!(Some(1));
168 } else if first < 128 {
169 // Pass ASCII through directly.
170 decoded.push(first);
171 } else if first == 0xc0 {
172 // modified UTF-8 encoding of null character
173 match next!() {
174 0x80 => decoded.push(0),
175 _ => err!(Some(1)),
176 }
177 } else {
178 let w = utf8_char_width(first);
179 let second = next_cont!(Some(1));
180 match w {
181 // Two-byte sequences can be used directly.
182 2 => {
183 decoded.extend([first, second]);
184 }
185 3 => {
186 let third = next_cont!(Some(2));
187 #[allow(clippy::unnested_or_patterns)] // Justification: readability
188 match (first, second) {
189 // These are valid UTF-8, so pass them through.
190 (0xe0, 0xa0..=0xbf)
191 | (0xe1..=0xec, 0x80..=0xbf)
192 | (0xed, 0x80..=0x9f)
193 | (0xee..=0xef, 0x80..=0xbf)
194 // Second half of a surrogate pair without a preceding first half, also pass this through.
195 | (0xed, 0xb0..=0xbf)
196 => decoded.extend([first, second, third]),
197 // First half of a surrogate pair
198 (0xed, 0xa0..=0xaf) => {
199 // Peek ahead and try to pair the first half of surrogate pair with
200 // second.
201 match &slice[offset..] {
202 [0xed, fifth @ 0xb0..=0xbf, sixth, ..]
203 if *sixth & !CONT_MASK == TAG_CONT =>
204 {
205 let s = dec_surrogates(second, third, *fifth, *sixth);
206 decoded.extend(s);
207 offset += 3;
208 }
209 _ => {
210 // No second half, append the first half directly.
211 decoded.extend([first, second, third]);
212 }
213 }
214 }
215 _ => err!(Some(1)),
216 }
217 }
218 _ => err!(Some(1)), // modified UTF-8 doesn't allow width 4
219 }
220 }
221 }
222
223 unsafe {
224 // SAFETY: we built a semi UTF-8 encoded string
225 Ok(JavaString::from_semi_utf8_unchecked(decoded))
226 }
227 }
228
229 /// Converts to Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format.
230 ///
231 /// See [`JavaStr::to_modified_utf8`].
232 #[inline]
233 #[must_use]
234 pub fn into_modified_utf8(self) -> Vec<u8> {
235 if is_valid_cesu8(&self) {
236 self.into_bytes()
237 } else {
238 self.to_modified_utf8_internal()
239 }
240 }
241}
242
243#[inline]
244fn dec_surrogate(second: u8, third: u8) -> u32 {
245 0xd000 | (u32::from(second & CONT_MASK) << 6) | u32::from(third & CONT_MASK)
246}
247
248#[inline]
249fn dec_surrogates(second: u8, third: u8, fifth: u8, sixth: u8) -> [u8; 4] {
250 // Convert to a 32-bit code point.
251 let s1 = dec_surrogate(second, third);
252 let s2 = dec_surrogate(fifth, sixth);
253 let c = 0x10000 + (((s1 - 0xd800) << 10) | (s2 - 0xdc00));
254 assert!((0x010000..=0x10ffff).contains(&c));
255
256 // Convert to UTF-8.
257 // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
258 [
259 0b1111_0000_u8 | ((c & 0b1_1100_0000_0000_0000_0000) >> 18) as u8,
260 TAG_CONT | ((c & 0b0_0011_1111_0000_0000_0000) >> 12) as u8,
261 TAG_CONT | ((c & 0b0_0000_0000_1111_1100_0000) >> 6) as u8,
262 TAG_CONT | (c & 0b0_0000_0000_0000_0011_1111) as u8,
263 ]
264}
265
266#[inline]
267fn is_valid_cesu8(text: &JavaStr) -> bool {
268 text.bytes()
269 .all(|b| b != 0 && ((b & !CONT_MASK) == TAG_CONT || utf8_char_width(b) <= 3))
270}
271
272#[inline]
273fn enc_surrogate(surrogate: u16) -> [u8; 3] {
274 // 1110xxxx 10xxxxxx 10xxxxxx
275 [
276 0b11100000 | ((surrogate & 0b11110000_00000000) >> 12) as u8,
277 TAG_CONT | ((surrogate & 0b00001111_11000000) >> 6) as u8,
278 TAG_CONT | (surrogate & 0b00000000_00111111) as u8,
279 ]
280}