// ucs2/lib.rs
//! Utility functions for the UCS-2 character encoding.

#![no_std]
#![deny(missing_docs)]
#![deny(clippy::all)]

mod macros;

/// These need to be public for the `ucs2_cstr!` macro, but are not
/// intended to be called directly.
#[doc(hidden)]
pub use macros::{str_num_ucs2_chars, str_to_ucs2};

use bit_field::BitField;
use core::fmt::{self, Display, Formatter};

/// Errors that the encoding and decoding functions of this crate can report.
#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub enum Error {
    /// The output buffer has no room left for the next character.
    BufferOverflow,
    /// The input contains a character that UCS-2 cannot represent.
    MultiByte,
}

impl Display for Error {
    /// Writes a short, human-readable description of the error.
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        // Pick the message first so there is a single write call.
        let message = match *self {
            Self::BufferOverflow => "output buffer is too small",
            Self::MultiByte => {
                "input contains a character which cannot be represented in UCS-2"
            }
        };
        f.write_str(message)
    }
}

/// Crate-local shorthand for a `core::result::Result` whose error type is [`Error`].
type Result<T> = core::result::Result<T, Error>;

/// Value returned by `ucs2_from_utf8_at_offset`: one decoded character
/// together with the size of its UTF-8 encoding.
struct Ucs2CharFromUtf8 {
    /// The decoded UCS-2 character (a single 16-bit value).
    val: u16,
    /// Number of bytes the character occupies in the UTF-8 input (1, 2 or 3).
    /// Callers advance their offset by this amount to reach the next character.
    num_bytes: u8,
}

/// Get a UCS-2 character from a UTF-8 byte slice at the given offset.
///
/// # Safety
///
/// The input `bytes` must be valid UTF-8.
const unsafe fn ucs2_from_utf8_at_offset(bytes: &[u8], offset: usize) -> Result<Ucs2CharFromUtf8> {
    let len = bytes.len();
    let ch;
    let ch_len;

    if bytes[offset] & 0b1000_0000 == 0b0000_0000 {
        ch = bytes[offset] as u16;
        ch_len = 1;
    } else if bytes[offset] & 0b1110_0000 == 0b1100_0000 {
        // 2 byte codepoint
        if offset + 1 >= len {
            // safe: len is the length of bytes,
            // and bytes is a direct view into the
            // buffer of input, which in order to be a valid
            // utf-8 string _must_ contain `i + 1`.
            unsafe { core::hint::unreachable_unchecked() }
        }

        let a = (bytes[offset] & 0b0001_1111) as u16;
        let b = (bytes[offset + 1] & 0b0011_1111) as u16;
        ch = a << 6 | b;
        ch_len = 2;
    } else if bytes[offset] & 0b1111_0000 == 0b1110_0000 {
        // 3 byte codepoint
        if offset + 2 >= len || offset + 1 >= len {
            // safe: impossible utf-8 string.
            unsafe { core::hint::unreachable_unchecked() }
        }

        let a = (bytes[offset] & 0b0000_1111) as u16;
        let b = (bytes[offset + 1] & 0b0011_1111) as u16;
        let c = (bytes[offset + 2] & 0b0011_1111) as u16;
        ch = a << 12 | b << 6 | c;
        ch_len = 3;
    } else if bytes[offset] & 0b1111_0000 == 0b1111_0000 {
        return Err(Error::MultiByte); // UTF-16
    } else {
        // safe: impossible utf-8 string.
        unsafe { core::hint::unreachable_unchecked() }
    }

    Ok(Ucs2CharFromUtf8 {
        val: ch,
        num_bytes: ch_len,
    })
}

/// Encodes an input UTF-8 string into a UCS-2 string.
///
/// The returned `usize` represents the length of the returned buffer,
/// measured in 2-byte characters.
pub fn encode(input: &str, buffer: &mut [u16]) -> Result<usize> {
    let buffer_size = buffer.len();
    let mut i = 0;

    encode_with(input, |ch| {
        if i >= buffer_size {
            Err(Error::BufferOverflow)
        } else {
            buffer[i] = ch;
            i += 1;
            Ok(())
        }
    })?;

    Ok(i)
}

/// Encode UTF-8 string to UCS-2 with a custom callback function.
///
/// `output` is invoked once per encoded character, in input order.
///
/// # Errors
///
/// Stops at the first failure: either the error produced while converting a
/// character, or the error returned by `output`. Characters handed to `output`
/// before that point are not rolled back.
pub fn encode_with<F>(input: &str, mut output: F) -> Result<()>
where
    F: FnMut(u16) -> Result<()>,
{
    let utf8 = input.as_bytes();
    let mut offset = 0;

    loop {
        if offset >= utf8.len() {
            return Ok(());
        }

        // SAFETY: `utf8` comes from a `&str`, so it is valid UTF-8, and `offset`
        // only ever advances by whole characters starting from zero.
        let decoded = unsafe { ucs2_from_utf8_at_offset(utf8, offset) }?;
        offset += usize::from(decoded.num_bytes);
        output(decoded.val)?;
    }
}

/// Decode UCS-2 string to UTF-8 with a custom callback function.
///
/// `output` is a function which receives every decoded character.
/// Due to the nature of UCS-2, the function can receive an UTF-8 character
/// of up to three bytes, for every input character.
pub fn decode_with<F>(input: &[u16], mut output: F) -> Result<usize>
where
    F: FnMut(&[u8]) -> Result<()>,
{
    let mut written = 0;

    for ch in input.iter() {
        /*
         * We need to find how many bytes of UTF-8 this UCS-2 code-point needs. Because UCS-2 can only encode
         * the Basic Multilingual Plane, a maximum of three bytes are needed.
         */
        if (0x000..0x0080).contains(ch) {
            output(&[*ch as u8])?;

            written += 1;
        } else if (0x0080..0x0800).contains(ch) {
            let first = 0b1100_0000 + ch.get_bits(6..11) as u8;
            let last = 0b1000_0000 + ch.get_bits(0..6) as u8;

            output(&[first, last])?;

            written += 2;
        } else {
            let first = 0b1110_0000 + ch.get_bits(12..16) as u8;
            let mid = 0b1000_0000 + ch.get_bits(6..12) as u8;
            let last = 0b1000_0000 + ch.get_bits(0..6) as u8;

            output(&[first, mid, last])?;

            written += 3;
        }
    }

    Ok(written)
}

/// Decode an input UCS-2 string into a UTF-8 string.
///
/// The returned `usize` represents the length of the returned buffer,
/// in bytes. Due to the nature of UCS-2, the output buffer could end up with
/// three bytes for every character in the input buffer.
pub fn decode(input: &[u16], output: &mut [u8]) -> Result<usize> {
    let buffer_size = output.len();
    let mut i = 0;

    decode_with(input, |bytes| {
        if bytes.len() == 1 {
            // Can be encoded in a single byte
            if i >= buffer_size {
                return Err(Error::BufferOverflow);
            }

            output[i] = bytes[0];

            i += 1;
        } else if bytes.len() == 2 {
            // Can be encoded two bytes
            if i + 1 >= buffer_size {
                return Err(Error::BufferOverflow);
            }

            output[i] = bytes[0];
            output[i + 1] = bytes[1];

            i += 2;
        } else if bytes.len() == 3 {
            // Can be encoded three bytes
            if i + 2 >= buffer_size {
                return Err(Error::BufferOverflow);
            }

            output[i] = bytes[0];
            output[i + 1] = bytes[1];
            output[i + 2] = bytes[2];

            i += 3;
        } else {
            unreachable!("More than three bytes per UCS-2 character.");
        }

        Ok(())
    })
}