valence_nbt/binary/
modified_utf8.rs

1//! Utilities for working with Java's "Modified UTF-8" character encoding.
2//!
3//! For more information, refer to [Wikipedia].
4//!
5//! [Wikipedia]: https://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8
6
7use std::io;
8use std::io::Write;
9use std::str::from_utf8_unchecked;
10
11use byteorder::{BigEndian, WriteBytesExt};
12
13pub(crate) fn write_modified_utf8(mut writer: impl Write, text: &str) -> io::Result<()> {
14    let bytes = text.as_bytes();
15    let mut i = 0;
16
17    while i < bytes.len() {
18        match bytes[i] {
19            0 => {
20                writer.write_u16::<BigEndian>(0xc080)?;
21                i += 1;
22            }
23            b @ 1..=127 => {
24                writer.write_u8(b)?;
25                i += 1;
26            }
27            b => {
28                let w = utf8_char_width(b);
29                debug_assert!(w <= 4);
30                debug_assert!(i + w <= bytes.len());
31
32                if w != 4 {
33                    writer.write_all(&bytes[i..i + w])?;
34                } else {
35                    let s = unsafe { from_utf8_unchecked(&bytes[i..i + w]) };
36                    let c = s.chars().next().unwrap() as u32 - 0x10000;
37
38                    let s0 = ((c >> 10) as u16) | 0xd800;
39                    let s1 = ((c & 0x3ff) as u16) | 0xdc00;
40
41                    writer.write_all(encode_surrogate(s0).as_slice())?;
42                    writer.write_all(encode_surrogate(s1).as_slice())?;
43                }
44                i += w;
45            }
46        }
47    }
48
49    Ok(())
50}
51
/// Returns how many bytes a UTF-8 sequence occupies, judged solely by its
/// first byte.
///
/// The result is `0` for every byte that cannot start a sequence:
/// `0x80..=0xC1` (continuation bytes plus `0xC0` and `0xC1`) and
/// `0xF5..=0xFF`.
const fn utf8_char_width(first_byte: u8) -> usize {
    match first_byte {
        0x00..=0x7f => 1,
        0xc2..=0xdf => 2,
        0xe0..=0xef => 3,
        0xf0..=0xf4 => 4,
        _ => 0,
    }
}
67
/// Encodes a single UTF-16 surrogate as three bytes using the UTF-8 bit
/// layout `1110xxxx 10xxxxxx 10xxxxxx`.
///
/// `surrogate` is expected to lie in `0xD800..=0xDFFF`; this is verified only
/// when debug assertions are enabled.
fn encode_surrogate(surrogate: u16) -> [u8; 3] {
    debug_assert!((0xd800..=0xdfff).contains(&surrogate));

    const LEAD_TAG: u8 = 0b1110_0000;
    const CONT_TAG: u8 = 0b1000_0000;
    const SIX_BITS: u16 = 0b0011_1111;

    // Top four bits go into the lead byte, the remaining twelve are split
    // evenly across the two continuation bytes.
    let top = (surrogate >> 12) as u8;
    let middle = ((surrogate >> 6) & SIX_BITS) as u8;
    let bottom = (surrogate & SIX_BITS) as u8;

    [LEAD_TAG | top, CONT_TAG | middle, CONT_TAG | bottom]
}
78
79pub(crate) fn encoded_len(bytes: &[u8]) -> usize {
80    let mut n = 0;
81    let mut i = 0;
82
83    while i < bytes.len() {
84        match bytes[i] {
85            // Fast path for ASCII here makes a huge difference in benchmarks.
86            1..=127 => {
87                n += 1;
88                i += 1;
89            }
90            0 => {
91                n += 2;
92                i += 1;
93            }
94            b => {
95                let w = utf8_char_width(b);
96
97                if w == 4 {
98                    n += 6;
99                } else {
100                    n += w;
101                }
102
103                i += w;
104            }
105        }
106    }
107
108    n
109}
110
/// Checks that `write_modified_utf8` produces the same bytes as the `cesu8`
/// crate's Java encoder and that `encoded_len` reports the same length.
#[cfg(test)]
#[test]
fn equivalence() {
    // ASCII with a NUL, characters outside the BMP mixed with a three-byte
    // one, and two-byte Latin-1 characters.
    const SAMPLES: [&str; 3] = [
        "Mary had a little lamb\0",
        "🤡💩👻💀☠👽👾🤖🎃😺😸😹😻😼😽🙀😿😾",
        "ÅÆÇÈØõ÷£¥ý",
    ];

    for &sample in &SAMPLES {
        let expected = cesu8::to_java_cesu8(sample);

        let mut actual = vec![];
        write_modified_utf8(&mut actual, sample).unwrap();

        assert_eq!(expected, actual);
        assert_eq!(expected.len(), encoded_len(sample.as_bytes()));
    }
}