1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
//! Utilities for working with Java's "Modified UTF-8" character encoding.
//!
//! For more information, refer to [Wikipedia].
//!
//! [Wikipedia]: https://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8

use std::io;
use std::io::Write;
use std::str::from_utf8_unchecked;

use byteorder::{BigEndian, WriteBytesExt};

/// Writes `text` to `writer` in Java's "Modified UTF-8" encoding.
///
/// The output differs from the plain UTF-8 bytes of `text` in two ways:
///
/// - A NUL byte is written as the two-byte sequence `0xC0 0x80`.
/// - A character whose UTF-8 form is four bytes long is split into a UTF-16
///   surrogate pair, and each surrogate is written as its own three-byte
///   sequence (six bytes in total).
///
/// Every other byte is copied through unchanged. Consecutive unchanged bytes
/// are handed to the writer in a single `write_all` call rather than one call
/// per byte, which keeps the number of writer calls low for mostly-ASCII text.
///
/// # Errors
///
/// Returns the first error reported by `writer`.
pub(crate) fn write_modified_utf8(mut writer: impl Write, text: &str) -> io::Result<()> {
    let bytes = text.as_bytes();
    // `bytes[run_start..i]` is the pending run of bytes that are encoded
    // identically in both formats and have not been written yet.
    let mut run_start = 0;
    let mut i = 0;

    while i < bytes.len() {
        match bytes[i] {
            0 => {
                // Flush what came before, then emit the overlong NUL encoding.
                writer.write_all(&bytes[run_start..i])?;
                writer.write_u16::<BigEndian>(0xc080)?;
                i += 1;
                run_start = i;
            }
            // Plain ASCII stays in the pending run.
            1..=127 => i += 1,
            b => {
                let w = utf8_char_width(b);
                // `text` is a `&str`, so `b` is the lead byte of a complete
                // multi-byte sequence.
                debug_assert!((2..=4).contains(&w));
                debug_assert!(i + w <= bytes.len());

                if w == 4 {
                    writer.write_all(&bytes[run_start..i])?;

                    // SAFETY: `bytes` comes from a `&str` and `i` sits on the
                    // lead byte of a four-byte sequence, so `bytes[i..i + w]`
                    // is exactly one well-formed UTF-8 encoded character.
                    let s = unsafe { from_utf8_unchecked(&bytes[i..i + w]) };
                    let c = s
                        .chars()
                        .next()
                        .expect("a four-byte UTF-8 sequence decodes to one char")
                        as u32
                        - 0x10000;

                    // Split the 20-bit offset into a high and a low surrogate.
                    let s0 = ((c >> 10) as u16) | 0xd800;
                    let s1 = ((c & 0x3ff) as u16) | 0xdc00;

                    writer.write_all(encode_surrogate(s0).as_slice())?;
                    writer.write_all(encode_surrogate(s1).as_slice())?;

                    run_start = i + w;
                }
                // Two- and three-byte sequences stay in the pending run.
                i += w;
            }
        }
    }

    // Flush the trailing run (a no-op when it is empty).
    writer.write_all(&bytes[run_start..])
}

/// Returns the total length in bytes of the UTF-8 sequence introduced by
/// `first_byte`.
///
/// Returns `0` when `first_byte` cannot start a sequence: `0x80..=0xC1`
/// (continuation bytes and the overlong two-byte leads) and `0xF5..=0xFF`.
const fn utf8_char_width(first_byte: u8) -> usize {
    match first_byte {
        // Single-byte (ASCII) range.
        0x00..=0x7f => 1,
        // Lead bytes of two-, three- and four-byte sequences.
        0xc2..=0xdf => 2,
        0xe0..=0xef => 3,
        0xf0..=0xf4 => 4,
        // Anything else is not a valid first byte.
        _ => 0,
    }
}

/// Encodes a single UTF-16 surrogate code unit as a three-byte sequence using
/// the UTF-8 three-byte bit layout (`1110xxxx 10xxxxxx 10xxxxxx`).
///
/// In debug builds this asserts that `surrogate` lies in `0xD800..=0xDFFF`.
fn encode_surrogate(surrogate: u16) -> [u8; 3] {
    debug_assert!((0xd800..=0xdfff).contains(&surrogate));

    const LEAD_TAG: u8 = 0xe0;
    const CONT_TAG: u8 = 0x80;

    // Top 4 bits, middle 6 bits and bottom 6 bits of the 16-bit value.
    let high = (surrogate >> 12) as u8;
    let middle = ((surrogate >> 6) & 0x3f) as u8;
    let low = (surrogate & 0x3f) as u8;

    [LEAD_TAG | high, CONT_TAG | middle, CONT_TAG | low]
}

/// Returns the number of bytes `bytes` occupies once encoded as Modified
/// UTF-8, without producing the encoded output.
///
/// `bytes` is expected to hold UTF-8 text. Each NUL byte counts as two bytes,
/// each four-byte sequence counts as six bytes (two three-byte surrogates),
/// and everything else counts as its own length.
///
/// The result is only meaningful for well-formed UTF-8. A byte that cannot
/// start a UTF-8 sequence is counted as one byte and skipped, so the scan
/// always terminates even on malformed input.
pub(crate) fn encoded_len(bytes: &[u8]) -> usize {
    let mut n = 0;
    let mut i = 0;

    while i < bytes.len() {
        match bytes[i] {
            // Fast path for ASCII here makes a huge difference in benchmarks.
            1..=127 => {
                n += 1;
                i += 1;
            }
            0 => {
                n += 2;
                i += 1;
            }
            b => match utf8_char_width(b) {
                // Not a valid lead byte: a width of zero would otherwise
                // leave `i` unchanged and spin forever.
                0 => {
                    n += 1;
                    i += 1;
                }
                // Supplementary characters become a six-byte surrogate pair.
                4 => {
                    n += 6;
                    i += 4;
                }
                w => {
                    n += w;
                    i += w;
                }
            },
        }
    }

    n
}

/// Checks that our encoder and length calculation agree with the `cesu8`
/// crate's Java CESU-8 output for ASCII with a NUL, supplementary-plane
/// emoji, and Latin-1 range characters.
#[cfg(test)]
#[test]
fn equivalence() {
    let samples = [
        "Mary had a little lamb\0",
        "🤡💩👻💀☠👽👾🤖🎃😺😸😹😻😼😽🙀😿😾",
        "ÅÆÇÈØõ÷£¥ý",
    ];

    for &sample in &samples {
        let expected = cesu8::to_java_cesu8(sample);

        let mut actual = Vec::new();
        write_modified_utf8(&mut actual, sample).unwrap();

        assert_eq!(expected, actual);
        assert_eq!(expected.len(), encoded_len(sample.as_bytes()));
    }
}