1use std::io;
8use std::io::Write;
9use std::str::from_utf8_unchecked;
10
11use byteorder::{BigEndian, WriteBytesExt};
12
13pub(crate) fn write_modified_utf8(mut writer: impl Write, text: &str) -> io::Result<()> {
14 let bytes = text.as_bytes();
15 let mut i = 0;
16
17 while i < bytes.len() {
18 match bytes[i] {
19 0 => {
20 writer.write_u16::<BigEndian>(0xc080)?;
21 i += 1;
22 }
23 b @ 1..=127 => {
24 writer.write_u8(b)?;
25 i += 1;
26 }
27 b => {
28 let w = utf8_char_width(b);
29 debug_assert!(w <= 4);
30 debug_assert!(i + w <= bytes.len());
31
32 if w != 4 {
33 writer.write_all(&bytes[i..i + w])?;
34 } else {
35 let s = unsafe { from_utf8_unchecked(&bytes[i..i + w]) };
36 let c = s.chars().next().unwrap() as u32 - 0x10000;
37
38 let s0 = ((c >> 10) as u16) | 0xd800;
39 let s1 = ((c & 0x3ff) as u16) | 0xdc00;
40
41 writer.write_all(encode_surrogate(s0).as_slice())?;
42 writer.write_all(encode_surrogate(s1).as_slice())?;
43 }
44 i += w;
45 }
46 }
47 }
48
49 Ok(())
50}
51
/// Returns the length in bytes of the UTF-8 sequence introduced by
/// `first_byte`, or `0` when the byte cannot start a sequence.
///
/// * `0x00..=0x7F` -> 1 (ASCII)
/// * `0xC2..=0xDF` -> 2
/// * `0xE0..=0xEF` -> 3
/// * `0xF0..=0xF4` -> 4
/// * everything else (continuation bytes `0x80..=0xBF`, the overlong leads
///   `0xC0`/`0xC1`, and `0xF5..=0xFF`) -> 0
const fn utf8_char_width(first_byte: u8) -> usize {
    match first_byte {
        0x00..=0x7f => 1,
        0xc2..=0xdf => 2,
        0xe0..=0xef => 3,
        0xf0..=0xf4 => 4,
        _ => 0,
    }
}
67
/// Encodes one UTF-16 surrogate code unit as a three-byte UTF-8-style
/// sequence (`1110xxxx 10xxxxxx 10xxxxxx`).
///
/// `surrogate` is expected to lie in `0xD800..=0xDFFF`; this is checked only
/// in builds with debug assertions enabled.
fn encode_surrogate(surrogate: u16) -> [u8; 3] {
    debug_assert!((0xd800..=0xdfff).contains(&surrogate));

    // Tag bits of a three-byte lead byte and of a continuation byte.
    const LEAD_TAG: u8 = 0b1110_0000;
    const CONT_TAG: u8 = 0b1000_0000;
    // A continuation byte carries six payload bits.
    const CONT_MASK: u16 = 0b0011_1111;

    let top = (surrogate >> 12) as u8;
    let middle = ((surrogate >> 6) & CONT_MASK) as u8;
    let bottom = (surrogate & CONT_MASK) as u8;

    [LEAD_TAG | top, CONT_TAG | middle, CONT_TAG | bottom]
}
78
/// Returns how many bytes `bytes` occupies once re-encoded as modified
/// UTF-8, without producing the encoded output.
///
/// `bytes` is expected to be valid UTF-8 (for example `str::as_bytes`). For
/// such input the result equals the number of bytes `write_modified_utf8`
/// writes for the same text:
///
/// * a NUL byte grows from one byte to two;
/// * a four-byte sequence grows to six bytes (two three-byte surrogates);
/// * everything else keeps its length.
///
/// Each byte is classified on its own, so the function terminates for any
/// input. Bytes that cannot appear in valid UTF-8 simply count as one byte.
pub(crate) fn encoded_len(bytes: &[u8]) -> usize {
    bytes
        .iter()
        .map(|&byte| match byte {
            // NUL is written as the two-byte sequence 0xC0 0x80.
            0 => 2,
            // Lead byte of a four-byte sequence. The whole sequence becomes
            // six bytes; its three continuation bytes each contribute one
            // below, so the lead byte accounts for the remaining three.
            0xf0..=0xf4 => 3,
            // ASCII, continuation bytes, and the lead bytes of two- and
            // three-byte sequences are copied through unchanged.
            _ => 1,
        })
        .sum()
}
110
/// Checks that `write_modified_utf8` and `encoded_len` agree with the
/// `cesu8` crate's Java-flavoured encoder on ASCII with an embedded NUL,
/// on emoji, and on Latin-1 range characters.
#[cfg(test)]
#[test]
fn equivalence() {
    const SAMPLES: [&str; 3] = [
        "Mary had a little lamb\0",
        "🤡💩👻💀☠👽👾🤖🎃😺😸😹😻😼😽🙀😿😾",
        "ÅÆÇÈØõ÷£¥ý",
    ];

    for &sample in SAMPLES.iter() {
        // Reference encoding from the `cesu8` crate.
        let expected = cesu8::to_java_cesu8(sample);

        let mut actual = vec![];
        write_modified_utf8(&mut actual, sample).unwrap();

        assert_eq!(expected, actual);
        assert_eq!(expected.len(), encoded_len(sample.as_bytes()));
    }
}