// ucs2/lib.rs
//! Utility functions for the UCS-2 character encoding.
#![no_std]
#![deny(missing_docs)]
#![deny(clippy::all)]
mod macros;
/// These need to be public for the `ucs2_cstr!` macro, but are not
/// intended to be called directly.
#[doc(hidden)]
pub use macros::{str_num_ucs2_chars, str_to_ucs2};
use bit_field::BitField;
use core::fmt::{self, Display, Formatter};
/// Possible errors returned by the API.
#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub enum Error {
/// Not enough space left in the output buffer.
///
/// Returned by `encode` and `decode` when the caller-supplied buffer
/// cannot hold the next converted character.
BufferOverflow,
/// Input contained a character which cannot be represented in UCS-2.
///
/// Reported while encoding when the UTF-8 input contains a four-byte
/// sequence, i.e. a code point outside the Basic Multilingual Plane.
MultiByte,
}
impl Display for Error {
    /// Writes a short, human-readable description of the error.
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        // Pick the message first so there is a single write at the end.
        let message = match self {
            Self::BufferOverflow => "output buffer is too small",
            Self::MultiByte => "input contains a character which cannot be represented in UCS-2",
        };
        f.write_str(message)
    }
}
/// Crate-local shorthand for a `core::result::Result` whose error type is [`Error`].
type Result<T> = core::result::Result<T, Error>;
/// Value returned by `ucs2_from_utf8_at_offset`.
///
/// Pairs the decoded character with the width of its UTF-8 encoding, which
/// the caller uses to advance to the start of the next character.
struct Ucs2CharFromUtf8 {
/// UCS-2 character.
val: u16,
/// Number of bytes needed to encode the character in UTF-8.
///
/// `ucs2_from_utf8_at_offset` sets this to 1, 2 or 3.
num_bytes: u8,
}
/// Get a UCS-2 character from a UTF-8 byte slice at the given offset.
///
/// # Safety
///
/// The input `bytes` must be valid UTF-8.
const unsafe fn ucs2_from_utf8_at_offset(bytes: &[u8], offset: usize) -> Result<Ucs2CharFromUtf8> {
let len = bytes.len();
let ch;
let ch_len;
if bytes[offset] & 0b1000_0000 == 0b0000_0000 {
ch = bytes[offset] as u16;
ch_len = 1;
} else if bytes[offset] & 0b1110_0000 == 0b1100_0000 {
// 2 byte codepoint
if offset + 1 >= len {
// safe: len is the length of bytes,
// and bytes is a direct view into the
// buffer of input, which in order to be a valid
// utf-8 string _must_ contain `i + 1`.
unsafe { core::hint::unreachable_unchecked() }
}
let a = (bytes[offset] & 0b0001_1111) as u16;
let b = (bytes[offset + 1] & 0b0011_1111) as u16;
ch = a << 6 | b;
ch_len = 2;
} else if bytes[offset] & 0b1111_0000 == 0b1110_0000 {
// 3 byte codepoint
if offset + 2 >= len || offset + 1 >= len {
// safe: impossible utf-8 string.
unsafe { core::hint::unreachable_unchecked() }
}
let a = (bytes[offset] & 0b0000_1111) as u16;
let b = (bytes[offset + 1] & 0b0011_1111) as u16;
let c = (bytes[offset + 2] & 0b0011_1111) as u16;
ch = a << 12 | b << 6 | c;
ch_len = 3;
} else if bytes[offset] & 0b1111_0000 == 0b1111_0000 {
return Err(Error::MultiByte); // UTF-16
} else {
// safe: impossible utf-8 string.
unsafe { core::hint::unreachable_unchecked() }
}
Ok(Ucs2CharFromUtf8 {
val: ch,
num_bytes: ch_len,
})
}
/// Encodes an input UTF-8 string into a UCS-2 string.
///
/// The returned `usize` represents the length of the returned buffer,
/// measured in 2-byte characters.
pub fn encode(input: &str, buffer: &mut [u16]) -> Result<usize> {
let buffer_size = buffer.len();
let mut i = 0;
encode_with(input, |ch| {
if i >= buffer_size {
Err(Error::BufferOverflow)
} else {
buffer[i] = ch;
i += 1;
Ok(())
}
})?;
Ok(i)
}
/// Encode UTF-8 string to UCS-2 with a custom callback function.
///
/// `output` is a function which receives every encoded character.
pub fn encode_with<F>(input: &str, mut output: F) -> Result<()>
where
F: FnMut(u16) -> Result<()>,
{
let bytes = input.as_bytes();
let len = bytes.len();
let mut i = 0;
while i < len {
// SAFETY: `bytes` is valid UTF-8.
let ch = unsafe { ucs2_from_utf8_at_offset(bytes, i) }?;
i += usize::from(ch.num_bytes);
output(ch.val)?;
}
Ok(())
}
/// Decode UCS-2 string to UTF-8 with a custom callback function.
///
/// `output` is a function which receives every decoded character.
/// Due to the nature of UCS-2, the function can receive an UTF-8 character
/// of up to three bytes, for every input character.
pub fn decode_with<F>(input: &[u16], mut output: F) -> Result<usize>
where
F: FnMut(&[u8]) -> Result<()>,
{
let mut written = 0;
for ch in input.iter() {
/*
* We need to find how many bytes of UTF-8 this UCS-2 code-point needs. Because UCS-2 can only encode
* the Basic Multilingual Plane, a maximum of three bytes are needed.
*/
if (0x000..0x0080).contains(ch) {
output(&[*ch as u8])?;
written += 1;
} else if (0x0080..0x0800).contains(ch) {
let first = 0b1100_0000 + ch.get_bits(6..11) as u8;
let last = 0b1000_0000 + ch.get_bits(0..6) as u8;
output(&[first, last])?;
written += 2;
} else {
let first = 0b1110_0000 + ch.get_bits(12..16) as u8;
let mid = 0b1000_0000 + ch.get_bits(6..12) as u8;
let last = 0b1000_0000 + ch.get_bits(0..6) as u8;
output(&[first, mid, last])?;
written += 3;
}
}
Ok(written)
}
/// Converts the UCS-2 string `input` to UTF-8, storing the result in `output`.
///
/// Returns the number of bytes written to the start of `output`. Each input
/// character may need up to three bytes, so a buffer of `3 * input.len()`
/// bytes is always large enough.
///
/// # Errors
///
/// [`Error::BufferOverflow`] if `output` cannot hold the complete encoding
/// of the next character. Characters converted before the overflow remain
/// in `output`; the character that did not fit is not written at all.
pub fn decode(input: &[u16], output: &mut [u8]) -> Result<usize> {
    let capacity = output.len();
    let mut cursor = 0;

    decode_with(input, |utf8| {
        let width = utf8.len();
        if !(1..=3).contains(&width) {
            unreachable!("More than three bytes per UCS-2 character.");
        }

        // The whole character must fit; partial characters are never written.
        let end = cursor + width;
        if end > capacity {
            return Err(Error::BufferOverflow);
        }

        output[cursor..end].copy_from_slice(utf8);
        cursor = end;
        Ok(())
    })
}