Diffstat (limited to 'src/utf8.rs')
-rw-r--r-- src/utf8.rs 108
1 files changed, 108 insertions, 0 deletions
diff --git a/src/utf8.rs b/src/utf8.rs
new file mode 100644
index 0000000..0a3ffc8
--- /dev/null
+++ b/src/utf8.rs
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software
+ * and associated documentation files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+// See https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for more information on how this works.
+
+// Decoder states are stored pre-multiplied by 12 (the number of character classes), so a
+// state value is directly the offset of its row inside the transition part of `UTF8_TABLE`.
+// The table has 8 rows (offsets 0..=84); `ER` and `RW` lie past the last row and are only
+// ever produced as results, never used as a row offset.
+
+/// Decoder ground state.
+/// No sequence is in progress; reaching it after a byte means a full character was decoded.
+const OK: u8 = 0;
+/// Decoder error state.
+/// Produced when a byte that cannot start a sequence is seen in the ground state.
+const ER: u8 = 96;
+/// Decoder error state. Offending byte should be passed in again ("rewind")
+/// Produced when a byte that is not a valid continuation arrives in the middle of a sequence.
+const RW: u8 = 108;
+
+/// Combined lookup table for the UTF-8 decoding automaton.
+///
+/// Layout:
+/// * entries `0..256` map an input byte to one of 12 character classes (`0..=11`);
+/// * entries `256..352` are the transition table: 8 rows (one per non-error state) of
+///   12 columns (one per character class), indexed as `256 + state + class`, where
+///   `state` is already a multiple of 12.
+///
+/// Character classes, as assigned by the first 256 entries:
+/// * `0`  - `0x00..=0x7F` (ASCII)
+/// * `1`  - `0x80..=0x8F` (continuation)
+/// * `9`  - `0x90..=0x9F` (continuation)
+/// * `7`  - `0xA0..=0xBF` (continuation)
+/// * `8`  - `0xC0`, `0xC1`, `0xF5..=0xFF` (never accepted in any state)
+/// * `2`  - `0xC2..=0xDF` (lead of a 2-byte sequence)
+/// * `10` - `0xE0` (3-byte lead, second byte restricted to `0xA0..=0xBF`)
+/// * `3`  - `0xE1..=0xEC`, `0xEE`, `0xEF` (lead of a 3-byte sequence)
+/// * `4`  - `0xED` (3-byte lead, second byte restricted to `0x80..=0x9F`)
+/// * `11` - `0xF0` (4-byte lead, second byte restricted to `0x90..=0xBF`)
+/// * `6`  - `0xF1..=0xF3` (lead of a 4-byte sequence)
+/// * `5`  - `0xF4` (4-byte lead, second byte restricted to `0x80..=0x8F`)
+const UTF8_TABLE: [u8; 256+96] = [
+ // Maps bytes to character classes
+ // (16 bytes per row; the trailing comment gives the first byte value of the row)
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x00
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x80
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 0x90
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 0xA0
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 0xB0
+ 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0
+ 10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, // 0xE0
+ 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, // 0xF0
+
+ // Maps state (row) + character class (column) to next state
+ // (the number in each trailing comment is the row's state value, i.e. its offset here)
+ OK, ER, 12, 24, 48, 84, 72, ER, ER, ER, 36, 60, // 0 - OK
+ RW, OK, RW, RW, RW, RW, RW, OK, RW, OK, RW, RW, // 12 - 1 byte needed
+ RW, 12, RW, RW, RW, RW, RW, 12, RW, 12, RW, RW, // 24 - 2 bytes needed
+ RW, RW, RW, RW, RW, RW, RW, 12, RW, RW, RW, RW, // 36 - 2 bytes needed, E0 lead
+ RW, 12, RW, RW, RW, RW, RW, RW, RW, 12, RW, RW, // 48 - 2 bytes needed, ED lead
+ RW, RW, RW, RW, RW, RW, RW, 24, RW, 24, RW, RW, // 60 - 3 bytes needed, F0 lead
+ RW, 24, RW, RW, RW, RW, RW, 24, RW, 24, RW, RW, // 72 - 3 bytes needed
+ RW, 24, RW, RW, RW, RW, RW, RW, RW, RW, RW, RW, // 84 - 3 bytes needed, F4 lead
+];
+
+/// Incremental UTF-8 decoder that consumes input one byte at a time.
+///
+/// The value produced by `Default` has `state == 0`, i.e. it starts in the ground state
+/// and is ready to accept the first byte of a sequence.
+#[derive(Copy, Clone, Debug, Default)]
+pub struct UTF8Decoder {
+ // Bits of the code point accumulated so far; overwritten by the first byte of each sequence.
+ code_point: u32,
+ // Current automaton state: a row offset into the transition part of `UTF8_TABLE`.
+ state: u8
+}
+
+/// Outcome of feeding a single byte to `UTF8Decoder::decode_byte`.
+#[derive(Copy, Clone, Eq, PartialEq, Debug)]
+pub enum DecodeState {
+ // The byte completed a sequence; carries the decoded character.
+ Done(char),
+ // The byte was accepted, but more bytes are needed to finish the current sequence.
+ Continue,
+ // The byte was rejected while no sequence was in progress; the decoder has been reset.
+ Error,
+ // The byte was rejected in the middle of a sequence; the decoder has been reset and the
+ // same byte should be passed in again.
+ Rewind
+}
+
+impl UTF8Decoder {
+    /// Returns the decoder to the ground state, abandoning any partially decoded sequence.
+    ///
+    /// `code_point` is deliberately left untouched: `decode_byte` overwrites it with the
+    /// first byte it receives while in the ground state.
+    #[inline]
+    pub fn reset(&mut self) {
+        self.state = OK;
+    }
+
+    /// Feeds a single input byte to the decoder and reports what happened.
+    ///
+    /// Returns:
+    /// * `DecodeState::Done(c)` - `byte` completed a sequence that decodes to `c`.
+    /// * `DecodeState::Continue` - `byte` was accepted; more bytes are required.
+    /// * `DecodeState::Error` - `byte` cannot start a sequence; the decoder was reset.
+    /// * `DecodeState::Rewind` - `byte` is not a valid continuation of the sequence in
+    ///   progress; the decoder was reset and the same byte should be passed in again.
+    pub fn decode_byte(&mut self, byte: u8) -> DecodeState {
+        let class = UTF8_TABLE[byte as usize];
+
+        self.code_point = if self.state == OK {
+            // The character class values for leading bytes simultaneously form a bitmask:
+            // shifting 0xFF right by the class keeps exactly the payload bits of the lead.
+            // For class 0 (ASCII) this is a no-op.
+            // For classes > 7 this is 0 (invalid bytes and the E0/F0 leads, whose payload
+            // bits are all zero anyway).
+            (0xFF >> class) & byte as u32
+        } else {
+            // Standard continuation byte extraction: append the low six bits.
+            // It's okay if this is gibberish due to invalid input: errors reset the state
+            // to OK, and code_point is overwritten by the next input byte.
+            (self.code_point << 6) | (byte as u32 & 0x3F)
+        };
+
+        let index = 256 + self.state as usize + class as usize;
+        debug_assert!(index < UTF8_TABLE.len());
+        // SAFETY: `state` is private and is only ever 0 (from `Default`/`reset`) or a value
+        // read from the transition table. `ER` and `RW` are replaced by `OK` below before
+        // this function returns, so on entry `state` is one of the row offsets 0..=84.
+        // `class` comes from the first 256 table entries, all of which are <= 11.
+        // Hence `index <= 256 + 84 + 11 = 351`, which is below the table length of 352.
+        self.state = unsafe { *UTF8_TABLE.get_unchecked(index) };
+
+        match self.state {
+            OK => {
+                // SAFETY: the automaton only returns to `OK` after a well-formed sequence;
+                // the table rejects overlong forms, surrogates (ED A0..BF) and values
+                // above U+10FFFF (F4 90.. and F5..), so `code_point` is a valid scalar.
+                DecodeState::Done(unsafe { std::char::from_u32_unchecked(self.code_point) })
+            }
+            ER => {
+                self.reset();
+                DecodeState::Error
+            }
+            RW => {
+                self.reset();
+                DecodeState::Rewind
+            }
+            _ => DecodeState::Continue,
+        }
+    }
+}