From 23f516f3d940294a4d1e475449bdc12889c1b3d8 Mon Sep 17 00:00:00 2001
From: Mike Dilger <mike@mikedilger.com>
Date: Wed, 13 Dec 2023 17:17:50 +1300
Subject: [PATCH] mod types/parse/utf8: UTF-8 encoding/decoding:

This helps avoid a separate validation pass that happens when converting bytes to &str
We just work with bytes directly now
---
 src/error.rs            |   4 +
 src/types/mod.rs        |   2 +
 src/types/parse/mod.rs  |   1 +
 src/types/parse/utf8.rs | 165 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 172 insertions(+)
 create mode 100644 src/types/parse/mod.rs
 create mode 100644 src/types/parse/utf8.rs

diff --git a/src/error.rs b/src/error.rs
index b3c1a57..003879d 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -22,4 +22,8 @@ pub enum Error {
     // I/O Error
     #[error("I/O: {0}")]
     Io(#[from] std::io::Error),
+
+    // UTF-8
+    #[error("UTF-8 error")]
+    Utf8Error,
 }
diff --git a/src/types/mod.rs b/src/types/mod.rs
index 23016d5..70bac05 100644
--- a/src/types/mod.rs
+++ b/src/types/mod.rs
@@ -7,6 +7,8 @@ pub use id::Id;
 mod kind;
 pub use kind::Kind;
 
+pub mod parse;
+
 mod pubkey;
 pub use pubkey::Pubkey;
 
diff --git a/src/types/parse/mod.rs b/src/types/parse/mod.rs
new file mode 100644
index 0000000..ec02418
--- /dev/null
+++ b/src/types/parse/mod.rs
@@ -0,0 +1 @@
+pub mod utf8;
diff --git a/src/types/parse/utf8.rs b/src/types/parse/utf8.rs
new file mode 100644
index 0000000..6b7704a
--- /dev/null
+++ b/src/types/parse/utf8.rs
@@ -0,0 +1,165 @@
+use crate::Error;
+
+// Reads the next code point if UTF-8, and returns it along with the number of characters
+// that make it up.
+pub fn next_code_point(input: &[u8]) -> Result<Option<(u32, usize)>, Error> {
+    let len = input.len();
+    if len < 1 {
+        return Ok(None);
+    }
+
+    // Decode UTF-8
+    let x = input[0];
+    if x < 128 {
+        return Ok(Some((x as u32, 1)));
+    }
+
+    // Multibyte case follows
+    // Decode from a byte combination out of: [[[x y] z] w]
+    let init = utf8_first_byte(x, 2);
+    if len < 2 {
+        return Err(Error::Utf8Error);
+    }
+    let y = input[1];
+    let mut ch = utf8_acc_cont_byte(init, y);
+    if x >= 0xE0 {
+        // [[x y z] w] case
+        // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
+        if len < 3 {
+            return Err(Error::Utf8Error);
+        }
+        let z = input[2];
+        let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
+        ch = init << 12 | y_z;
+        if x >= 0xF0 {
+            // [x y z w] case
+            // use only the lower 3 bits of `init`
+            if len < 4 {
+                return Err(Error::Utf8Error);
+            }
+            let w = input[3];
+            ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
+            Ok(Some((ch, 4)))
+        } else {
+            Ok(Some((ch, 3)))
+        }
+    } else {
+        Ok(Some((ch, 2)))
+    }
+}
+
+pub fn encode_utf8(code: u32, dst: &mut [u8]) -> Result<usize, Error> {
+    // UTF-8 ranges and tags for encoding characters
+    const TAG_CONT: u8 = 0b1000_0000;
+    const TAG_TWO_B: u8 = 0b1100_0000;
+    const TAG_THREE_B: u8 = 0b1110_0000;
+    const TAG_FOUR_B: u8 = 0b1111_0000;
+    const MAX_ONE_B: u32 = 0x80;
+    const MAX_TWO_B: u32 = 0x800;
+    const MAX_THREE_B: u32 = 0x10000;
+
+    let len = unsafe {
+        if code < MAX_ONE_B && !dst.is_empty() {
+            *dst.get_unchecked_mut(0) = code as u8;
+            1
+        } else if code < MAX_TWO_B && dst.len() >= 2 {
+            *dst.get_unchecked_mut(0) = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
+            *dst.get_unchecked_mut(1) = (code & 0x3F) as u8 | TAG_CONT;
+            2
+        } else if code < MAX_THREE_B && dst.len() >= 3 {
+            *dst.get_unchecked_mut(0) = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
+            *dst.get_unchecked_mut(1) = (code >> 6 & 0x3F) as u8 | TAG_CONT;
+            *dst.get_unchecked_mut(2) = (code & 0x3F) as u8 | TAG_CONT;
+            3
+        } else if dst.len() >= 4 {
+            *dst.get_unchecked_mut(0) = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
+            *dst.get_unchecked_mut(1) = (code >> 12 & 0x3F) as u8 | TAG_CONT;
+            *dst.get_unchecked_mut(2) = (code >> 6 & 0x3F) as u8 | TAG_CONT;
+            *dst.get_unchecked_mut(3) = (code & 0x3F) as u8 | TAG_CONT;
+            4
+        } else {
+            return Err(Error::BufferTooSmall);
+        }
+    };
+    Ok(len)
+}
+
+/// Returns the initial codepoint accumulator for the first byte.
+/// The first byte is special, only want bottom 5 bits for width 2, 4 bits
+/// for width 3, and 3 bits for width 4.
+#[inline]
+const fn utf8_first_byte(byte: u8, width: u32) -> u32 {
+    (byte & (0x7F >> width)) as u32
+}
+
+/// Returns the value of `ch` updated with continuation byte `byte`.
+#[inline]
+const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
+    (ch << 6) | (byte & CONT_MASK) as u32
+}
+
+/// Mask of the value bits of a continuation byte.
+const CONT_MASK: u8 = 0b0011_1111;
+
+#[cfg(test)]
+mod test {
+    use super::{encode_utf8, next_code_point};
+
+    #[test]
+    fn test_next_code_point() {
+        let (codepoint, size) = next_code_point(r#"𝄞"#.as_bytes()).unwrap().unwrap();
+        assert_eq!(codepoint, 119070);
+        assert_eq!(size, 4);
+
+        let (codepoint, size) = next_code_point(r#"†"#.as_bytes()).unwrap().unwrap();
+        assert_eq!(codepoint, 0x2020);
+        assert_eq!(size, 3);
+
+        // four codepoints
+        let s = [
+            0x61, 0xE0, 0xA4, 0xA8, 0xE0, 0xA4, 0xBF, 0xE4, 0xBA, 0x9C, 0xF0, 0x90, 0x82, 0x83,
+        ];
+        let (codepoint, size) = next_code_point(s.as_slice()).unwrap().unwrap();
+        assert_eq!(codepoint, 0x61);
+        assert_eq!(size, 1);
+
+        let mut start = size;
+        let (codepoint, size) = next_code_point(&s[start..]).unwrap().unwrap();
+        assert_eq!(codepoint, 0x928);
+        assert_eq!(size, 3);
+
+        start += size;
+        let (codepoint, size) = next_code_point(&s[start..]).unwrap().unwrap();
+        assert_eq!(codepoint, 0x93F);
+        assert_eq!(size, 3);
+
+        start += size;
+        let (codepoint, size) = next_code_point(&s[start..]).unwrap().unwrap();
+        assert_eq!(codepoint, 0x4E9C);
+        assert_eq!(size, 3);
+
+        start += size;
+        let (codepoint, size) = next_code_point(&s[start..]).unwrap().unwrap();
+        assert_eq!(codepoint, 0x10083);
+        assert_eq!(size, 4);
+
+        assert_eq!(next_code_point(&s[0..0]).unwrap(), None);
+    }
+
+    #[test]
+    fn test_encode_utf8() {
+        let mut buffer: Vec<u8> = vec![0, 0, 0, 0];
+
+        assert_eq!(encode_utf8(0x69, &mut buffer).unwrap(), 1);
+        assert_eq!(buffer[0], 0x69);
+
+        assert_eq!(encode_utf8(0xEC, &mut buffer).unwrap(), 2);
+        assert_eq!(&buffer[0..2], &[0xC3, 0xAC]);
+
+        assert_eq!(encode_utf8(0x5450, &mut buffer).unwrap(), 3);
+        assert_eq!(&buffer[0..3], &[0xE5, 0x91, 0x90]);
+
+        assert_eq!(encode_utf8(0x2825F, &mut buffer).unwrap(), 4);
+        assert_eq!(&buffer[..], &[0xF0, 0xA8, 0x89, 0x9F]);
+    }
+}