From 23f516f3d940294a4d1e475449bdc12889c1b3d8 Mon Sep 17 00:00:00 2001 From: Mike Dilger Date: Wed, 13 Dec 2023 17:17:50 +1300 Subject: [PATCH] mod types/parse/utf8: UTF-8 encoding/decoding: This helps avoid a separate validation pass that happens when converting bytes to &str We just work with bytes directly now --- src/error.rs | 4 + src/types/mod.rs | 2 + src/types/parse/mod.rs | 1 + src/types/parse/utf8.rs | 165 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 172 insertions(+) create mode 100644 src/types/parse/mod.rs create mode 100644 src/types/parse/utf8.rs diff --git a/src/error.rs b/src/error.rs index b3c1a57..003879d 100644 --- a/src/error.rs +++ b/src/error.rs @@ -22,4 +22,8 @@ pub enum Error { // I/O Error #[error("I/O: {0}")] Io(#[from] std::io::Error), + + // UTF-8 + #[error("UTF-8 error")] + Utf8Error, } diff --git a/src/types/mod.rs b/src/types/mod.rs index 23016d5..70bac05 100644 --- a/src/types/mod.rs +++ b/src/types/mod.rs @@ -7,6 +7,8 @@ pub use id::Id; mod kind; pub use kind::Kind; +pub mod parse; + mod pubkey; pub use pubkey::Pubkey; diff --git a/src/types/parse/mod.rs b/src/types/parse/mod.rs new file mode 100644 index 0000000..ec02418 --- /dev/null +++ b/src/types/parse/mod.rs @@ -0,0 +1 @@ +pub mod utf8; diff --git a/src/types/parse/utf8.rs b/src/types/parse/utf8.rs new file mode 100644 index 0000000..6b7704a --- /dev/null +++ b/src/types/parse/utf8.rs @@ -0,0 +1,165 @@ +use crate::Error; + +// Reads the next code point if UTF-8, and returns it along with the number of bytes +// that make it up. 
+pub fn next_code_point(input: &[u8]) -> Result, Error> { + let len = input.len(); + if len < 1 { + return Ok(None); + } + + // Decode UTF-8 + let x = input[0]; + if x < 128 { + return Ok(Some((x as u32, 1))); + } + + // Multibyte case follows + // Decode from a byte combination out of: [[[x y] z] w] + let init = utf8_first_byte(x, 2); + if len < 2 { + return Err(Error::Utf8Error); + } + let y = input[1]; + let mut ch = utf8_acc_cont_byte(init, y); + if x >= 0xE0 { + // [[x y z] w] case + // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid + if len < 3 { + return Err(Error::Utf8Error); + } + let z = input[2]; + let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z); + ch = init << 12 | y_z; + if x >= 0xF0 { + // [x y z w] case + // use only the lower 3 bits of `init` + if len < 4 { + return Err(Error::Utf8Error); + } + let w = input[3]; + ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); + Ok(Some((ch, 4))) + } else { + Ok(Some((ch, 3))) + } + } else { + Ok(Some((ch, 2))) + } +} + +pub fn encode_utf8(code: u32, dst: &mut [u8]) -> Result { + // UTF-8 ranges and tags for encoding characters + const TAG_CONT: u8 = 0b1000_0000; + const TAG_TWO_B: u8 = 0b1100_0000; + const TAG_THREE_B: u8 = 0b1110_0000; + const TAG_FOUR_B: u8 = 0b1111_0000; + const MAX_ONE_B: u32 = 0x80; + const MAX_TWO_B: u32 = 0x800; + const MAX_THREE_B: u32 = 0x10000; + + let len = unsafe { + if code < MAX_ONE_B && !dst.is_empty() { + *dst.get_unchecked_mut(0) = code as u8; + 1 + } else if code < MAX_TWO_B && dst.len() >= 2 { + *dst.get_unchecked_mut(0) = (code >> 6 & 0x1F) as u8 | TAG_TWO_B; + *dst.get_unchecked_mut(1) = (code & 0x3F) as u8 | TAG_CONT; + 2 + } else if code < MAX_THREE_B && dst.len() >= 3 { + *dst.get_unchecked_mut(0) = (code >> 12 & 0x0F) as u8 | TAG_THREE_B; + *dst.get_unchecked_mut(1) = (code >> 6 & 0x3F) as u8 | TAG_CONT; + *dst.get_unchecked_mut(2) = (code & 0x3F) as u8 | TAG_CONT; + 3 + } else if dst.len() >= 4 { + *dst.get_unchecked_mut(0) = (code >> 18 & 
0x07) as u8 | TAG_FOUR_B; + *dst.get_unchecked_mut(1) = (code >> 12 & 0x3F) as u8 | TAG_CONT; + *dst.get_unchecked_mut(2) = (code >> 6 & 0x3F) as u8 | TAG_CONT; + *dst.get_unchecked_mut(3) = (code & 0x3F) as u8 | TAG_CONT; + 4 + } else { + return Err(Error::BufferTooSmall); + } + }; + Ok(len) +} + +/// Returns the initial codepoint accumulator for the first byte. +/// The first byte is special, only want bottom 5 bits for width 2, 4 bits +/// for width 3, and 3 bits for width 4. +#[inline] +const fn utf8_first_byte(byte: u8, width: u32) -> u32 { + (byte & (0x7F >> width)) as u32 +} + +/// Returns the value of `ch` updated with continuation byte `byte`. +#[inline] +const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 { + (ch << 6) | (byte & CONT_MASK) as u32 +} + +/// Mask of the value bits of a continuation byte. +const CONT_MASK: u8 = 0b0011_1111; + +#[cfg(test)] +mod test { + use super::{encode_utf8, next_code_point}; + + #[test] + fn test_next_code_point() { + let (codepoint, size) = next_code_point(r#"𝄞"#.as_bytes()).unwrap().unwrap(); + assert_eq!(codepoint, 119070); + assert_eq!(size, 4); + + let (codepoint, size) = next_code_point(r#"†"#.as_bytes()).unwrap().unwrap(); + assert_eq!(codepoint, 0x2020); + assert_eq!(size, 3); + + // four codepoints + let s = [ + 0x61, 0xE0, 0xA4, 0xA8, 0xE0, 0xA4, 0xBF, 0xE4, 0xBA, 0x9C, 0xF0, 0x90, 0x82, 0x83, + ]; + let (codepoint, size) = next_code_point(s.as_slice()).unwrap().unwrap(); + assert_eq!(codepoint, 0x61); + assert_eq!(size, 1); + + let mut start = size; + let (codepoint, size) = next_code_point(&s[start..]).unwrap().unwrap(); + assert_eq!(codepoint, 0x928); + assert_eq!(size, 3); + + start += size; + let (codepoint, size) = next_code_point(&s[start..]).unwrap().unwrap(); + assert_eq!(codepoint, 0x93F); + assert_eq!(size, 3); + + start += size; + let (codepoint, size) = next_code_point(&s[start..]).unwrap().unwrap(); + assert_eq!(codepoint, 0x4E9C); + assert_eq!(size, 3); + + start += size; + let (codepoint, 
size) = next_code_point(&s[start..]).unwrap().unwrap(); + assert_eq!(codepoint, 0x10083); + assert_eq!(size, 4); + + assert_eq!(next_code_point(&s[0..0]).unwrap(), None); + } + + #[test] + fn test_encode_utf8() { + let mut buffer: Vec = vec![0, 0, 0, 0]; + + assert_eq!(encode_utf8(0x69, &mut buffer).unwrap(), 1); + assert_eq!(buffer[0], 0x69); + + assert_eq!(encode_utf8(0xEC, &mut buffer).unwrap(), 2); + assert_eq!(&buffer[0..2], &[0xC3, 0xAC]); + + assert_eq!(encode_utf8(0x5450, &mut buffer).unwrap(), 3); + assert_eq!(&buffer[0..3], &[0xE5, 0x91, 0x90]); + + assert_eq!(encode_utf8(0x2825F, &mut buffer).unwrap(), 4); + assert_eq!(&buffer[..], &[0xF0, 0xA8, 0x89, 0x9F]); + } +}