ucs2/
macros.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
use crate::{ucs2_from_utf8_at_offset, Error};

/// Determine how many UCS-2 characters are needed to represent `s`.
///
/// # Errors
///
/// Returns the error produced by `ucs2_from_utf8_at_offset` for the
/// first character of `s` that cannot be encoded in UCS-2.
pub const fn str_num_ucs2_chars(s: &str) -> Result<usize, Error> {
    let utf8 = s.as_bytes();

    // Byte position of the next character to decode.
    let mut pos = 0;
    // Number of characters decoded so far.
    let mut count = 0;

    loop {
        // All input consumed: every character was representable.
        if pos >= utf8.len() {
            return Ok(count);
        }

        // SAFETY: `utf8` is the byte view of a `&str`, so it is valid UTF-8.
        match unsafe { ucs2_from_utf8_at_offset(utf8, pos) } {
            Err(err) => return Err(err),
            Ok(ch) => {
                // Step over however many input bytes this character used.
                pos += ch.num_bytes as usize;
                count += 1;
            }
        }
    }
}

/// Convert a `str` into a null-terminated UCS-2 character array.
///
/// `N` must be exactly the number of UCS-2 characters in `s` plus one
/// for the trailing null character; [`str_num_ucs2_chars`] can be used
/// to compute the character count.
///
/// # Errors
///
/// Returns the error produced by `ucs2_from_utf8_at_offset` if a
/// character of `s` cannot be encoded in UCS-2.
///
/// # Panics
///
/// * Panics with "interior null character" if `s` contains a null
///   character.
/// * Panics with "incorrect array length" if `N` is not the number of
///   converted characters plus one, whether `N` is too small or too
///   large.
pub const fn str_to_ucs2<const N: usize>(s: &str) -> Result<[u16; N], Error> {
    let bytes = s.as_bytes();
    let len = bytes.len();

    // Zero-initialized, so the element following the last converted
    // character already holds the trailing null.
    let mut output = [0; N];

    let mut output_offset = 0;
    let mut input_offset = 0;
    while input_offset < len {
        // SAFETY: `bytes` comes from a `&str`, so it is valid UTF-8.
        match unsafe { ucs2_from_utf8_at_offset(bytes, input_offset) } {
            Ok(ch) => {
                if ch.val == 0 {
                    panic!("interior null character");
                }

                // Guard the write explicitly: without this, an array
                // that is too small would fail with a generic
                // index-out-of-bounds panic instead of the same message
                // used by the length check below.
                if output_offset >= N {
                    panic!("incorrect array length");
                }

                output[output_offset] = ch.val;
                output_offset += 1;
                input_offset += ch.num_bytes as usize;
            }
            Err(err) => {
                return Err(err);
            }
        }
    }

    // The output array must be one bigger than the converted string,
    // to leave room for the trailing null character.
    if output_offset + 1 != N {
        panic!("incorrect array length");
    }

    Ok(output)
}

/// Build a null-terminated UCS-2 array from a string literal at compile
/// time.
///
/// The macro expands to a `[u16; N]` value, where `N` is the number of
/// UCS-2 characters in the literal plus one for the trailing null
/// character. All of the work is done in `const` items, so the result
/// can be used in a `const` item; borrow it as `&[u16]` to avoid
/// spelling out `N`.
///
/// Compilation fails if the literal contains a character that cannot be
/// represented in UCS-2 or an interior null character.
///
/// # Example
///
/// ```
/// use ucs2::ucs2_cstr;
///
/// const S: &[u16] = &ucs2_cstr!("abc");
/// assert_eq!(S, [97, 98, 99, 0]);
/// ```
#[macro_export]
macro_rules! ucs2_cstr {
    ($s:literal) => {{
        // Both steps are `const` items so that any failure is reported
        // while compiling rather than at run time.

        // Array length: one element per character plus the trailing
        // null character.
        const LEN_WITH_NULL: usize = match $crate::str_num_ucs2_chars($s) {
            Err(_) => panic!("input contains a character which cannot be represented in UCS-2"),
            Ok(num_chars) => num_chars + 1,
        };

        const ENCODED: [u16; LEN_WITH_NULL] = match $crate::str_to_ucs2($s) {
            // `str_num_ucs2_chars` has already accepted the string, so
            // the conversion cannot report an error here.
            Err(_) => unreachable!(),
            Ok(array) => array,
        };
        ENCODED
    }};
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Character counts for input of increasing UTF-8 width, plus
    /// rejection of characters that need four UTF-8 bytes.
    #[test]
    fn test_str_num_chars() {
        // Several of the sample strings are taken from
        // https://www.kermitproject.org/utf8.html.

        // ASCII: one UTF-8 byte per character.
        let ascii = "abc";
        assert_eq!(str_num_ucs2_chars(ascii), Ok(3));

        // Greek letters: two UTF-8 bytes each.
        let greek = "Τη γλώσσα μου έδωσαν ελληνική";
        assert_eq!(str_num_ucs2_chars(greek), Ok(29));

        // Georgian letters: three UTF-8 bytes each.
        let georgian = "ვეპხის ტყაოსანი შოთა რუსთაველი";
        assert_eq!(str_num_ucs2_chars(georgian), Ok(30));

        // Emoji: four UTF-8 bytes each, which is reported as an error.
        let emoji = "😎🔥";
        assert_eq!(str_num_ucs2_chars(emoji), Err(Error::MultiByte));
    }

    /// The macro yields the UCS-2 code units followed by a trailing null.
    #[test]
    fn test_ucs2_cstr() {
        let encoded = ucs2_cstr!("abc");
        assert_eq!(encoded, [97, 98, 99, 0]);
    }
}