Unnamed repository; edit this file 'description' to name the repository.
Diffstat (limited to 'crates/intern/src/symbol.rs')
-rw-r--r--crates/intern/src/symbol.rs360
1 files changed, 360 insertions, 0 deletions
diff --git a/crates/intern/src/symbol.rs b/crates/intern/src/symbol.rs
new file mode 100644
index 0000000000..ef76192ba8
--- /dev/null
+++ b/crates/intern/src/symbol.rs
@@ -0,0 +1,360 @@
+//! Attempt at flexible symbol interning, allowing to intern and free strings at runtime while also
+//! supporting compile time declaration of symbols that will never be freed.
+
+use std::{
+ borrow::Borrow,
+ fmt,
+ hash::{BuildHasherDefault, Hash, Hasher},
+ mem::{self, ManuallyDrop},
+ ptr::NonNull,
+ sync::OnceLock,
+};
+
+use dashmap::{DashMap, SharedValue};
+use hashbrown::{hash_map::RawEntryMut, HashMap};
+use rustc_hash::FxHasher;
+use sptr::Strict;
+use triomphe::Arc;
+
+pub mod symbols;
+
+// some asserts for layout compatibility
+const _: () = assert!(std::mem::size_of::<Box<str>>() == std::mem::size_of::<&str>());
+const _: () = assert!(std::mem::align_of::<Box<str>>() == std::mem::align_of::<&str>());
+
+const _: () = assert!(std::mem::size_of::<Arc<Box<str>>>() == std::mem::size_of::<&&str>());
+const _: () = assert!(std::mem::align_of::<Arc<Box<str>>>() == std::mem::align_of::<&&str>());
+
+const _: () =
+ assert!(std::mem::size_of::<*const *const str>() == std::mem::size_of::<TaggedArcPtr>());
+const _: () =
+ assert!(std::mem::align_of::<*const *const str>() == std::mem::align_of::<TaggedArcPtr>());
+
+const _: () = assert!(std::mem::size_of::<Arc<Box<str>>>() == std::mem::size_of::<TaggedArcPtr>());
+const _: () =
+ assert!(std::mem::align_of::<Arc<Box<str>>>() == std::mem::align_of::<TaggedArcPtr>());
+
+/// A pointer that points to a pointer to a `str`, it may be backed as a `&'static &'static str` or
+/// `Arc<Box<str>>` but its size is that of a thin pointer. The active variant is encoded as a tag
+/// in the LSB of the alignment niche.
+// Note, Ideally this would encode a `ThinArc<str>` and `ThinRef<str>`/`ThinConstPtr<str>` instead of the double indirection.
+#[derive(PartialEq, Eq, Hash, Copy, Clone, Debug)]
+struct TaggedArcPtr {
+ packed: NonNull<*const str>,
+}
+
+unsafe impl Send for TaggedArcPtr {}
+unsafe impl Sync for TaggedArcPtr {}
+
+impl TaggedArcPtr {
+ const BOOL_BITS: usize = true as usize;
+
+ const fn non_arc(r: &'static &'static str) -> Self {
+ assert!(
+ mem::align_of::<&'static &'static str>().trailing_zeros() as usize > Self::BOOL_BITS
+ );
+ // SAFETY: The pointer is non-null as it is derived from a reference
+ // Ideally we would call out to `pack_arc` but for a `false` tag, unfortunately the
+ // packing stuff requires reading out the pointer to an integer which is not supported
+ // in const contexts, so here we make use of the fact that for the non-arc version the
+ // tag is false (0) and thus does not need touching the actual pointer value.ext)
+
+ let packed =
+ unsafe { NonNull::new_unchecked((r as *const &str).cast::<*const str>().cast_mut()) };
+ Self { packed }
+ }
+
+ fn arc(arc: Arc<Box<str>>) -> Self {
+ assert!(
+ mem::align_of::<&'static &'static str>().trailing_zeros() as usize > Self::BOOL_BITS
+ );
+ Self {
+ packed: Self::pack_arc(
+ // Safety: `Arc::into_raw` always returns a non null pointer
+ unsafe { NonNull::new_unchecked(Arc::into_raw(arc).cast_mut().cast()) },
+ ),
+ }
+ }
+
+ /// Retrieves the tag.
+ #[inline]
+ pub(crate) fn try_as_arc_owned(self) -> Option<ManuallyDrop<Arc<Box<str>>>> {
+ // Unpack the tag from the alignment niche
+ let tag = Strict::addr(self.packed.as_ptr()) & Self::BOOL_BITS;
+ if tag != 0 {
+ // Safety: We checked that the tag is non-zero -> true, so we are pointing to the data offset of an `Arc`
+ Some(ManuallyDrop::new(unsafe {
+ Arc::from_raw(self.pointer().as_ptr().cast::<Box<str>>())
+ }))
+ } else {
+ None
+ }
+ }
+
+ #[inline]
+ fn pack_arc(ptr: NonNull<*const str>) -> NonNull<*const str> {
+ let packed_tag = true as usize;
+
+ // can't use this strict provenance stuff here due to trait methods not being const
+ // unsafe {
+ // // Safety: The pointer is derived from a non-null
+ // NonNull::new_unchecked(Strict::map_addr(ptr.as_ptr(), |addr| {
+ // // Safety:
+ // // - The pointer is `NonNull` => it's address is `NonZero<usize>`
+ // // - `P::BITS` least significant bits are always zero (`Pointer` contract)
+ // // - `T::BITS <= P::BITS` (from `Self::ASSERTION`)
+ // //
+ // // Thus `addr >> T::BITS` is guaranteed to be non-zero.
+ // //
+ // // `{non_zero} | packed_tag` can't make the value zero.
+
+ // (addr >> Self::BOOL_BITS) | packed_tag
+ // }))
+ // }
+ // so what follows is roughly what the above looks like but inlined
+
+ let self_addr = ptr.as_ptr() as *const *const str as usize;
+ let addr = self_addr | packed_tag;
+ let dest_addr = addr as isize;
+ let offset = dest_addr.wrapping_sub(self_addr as isize);
+
+ // SAFETY: The resulting pointer is guaranteed to be NonNull as we only modify the niche bytes
+ unsafe { NonNull::new_unchecked(ptr.as_ptr().cast::<u8>().wrapping_offset(offset).cast()) }
+ }
+
+ #[inline]
+ pub(crate) fn pointer(self) -> NonNull<*const str> {
+ // SAFETY: The resulting pointer is guaranteed to be NonNull as we only modify the niche bytes
+ unsafe {
+ NonNull::new_unchecked(Strict::map_addr(self.packed.as_ptr(), |addr| {
+ addr & !Self::BOOL_BITS
+ }))
+ }
+ }
+
+ #[inline]
+ pub(crate) fn as_str(&self) -> &str {
+ // SAFETY: We always point to a pointer to a str no matter what variant is active
+ unsafe { *self.pointer().as_ptr().cast::<&str>() }
+ }
+}
+
+#[derive(PartialEq, Eq, Hash)]
+pub struct Symbol {
+ repr: TaggedArcPtr,
+}
+
+impl fmt::Debug for Symbol {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ self.as_str().fmt(f)
+ }
+}
+
+const _: () = assert!(std::mem::size_of::<Symbol>() == std::mem::size_of::<NonNull<()>>());
+const _: () = assert!(std::mem::align_of::<Symbol>() == std::mem::align_of::<NonNull<()>>());
+
+static MAP: OnceLock<DashMap<SymbolProxy, (), BuildHasherDefault<FxHasher>>> = OnceLock::new();
+
+impl Symbol {
+ pub fn intern(s: &str) -> Self {
+ let (mut shard, hash) = Self::select_shard(s);
+ // Atomically,
+ // - check if `obj` is already in the map
+ // - if so, copy out its entry, conditionally bumping the backing Arc and return it
+ // - if not, put it into a box and then into an Arc, insert it, bump the ref-count and return the copy
+ // This needs to be atomic (locking the shard) to avoid races with other thread, which could
+ // insert the same object between us looking it up and inserting it.
+ match shard.raw_entry_mut().from_key_hashed_nocheck(hash, s) {
+ RawEntryMut::Occupied(occ) => Self { repr: increase_arc_refcount(occ.key().0) },
+ RawEntryMut::Vacant(vac) => Self {
+ repr: increase_arc_refcount(
+ vac.insert_hashed_nocheck(
+ hash,
+ SymbolProxy(TaggedArcPtr::arc(Arc::new(Box::<str>::from(s)))),
+ SharedValue::new(()),
+ )
+ .0
+ .0,
+ ),
+ },
+ }
+ }
+
+ pub fn integer(i: usize) -> Self {
+ match i {
+ 0 => symbols::INTEGER_0.clone(),
+ 1 => symbols::INTEGER_1.clone(),
+ 2 => symbols::INTEGER_2.clone(),
+ 3 => symbols::INTEGER_3.clone(),
+ 4 => symbols::INTEGER_4.clone(),
+ 5 => symbols::INTEGER_5.clone(),
+ 6 => symbols::INTEGER_6.clone(),
+ 7 => symbols::INTEGER_7.clone(),
+ 8 => symbols::INTEGER_8.clone(),
+ 9 => symbols::INTEGER_9.clone(),
+ 10 => symbols::INTEGER_10.clone(),
+ 11 => symbols::INTEGER_11.clone(),
+ 12 => symbols::INTEGER_12.clone(),
+ 13 => symbols::INTEGER_13.clone(),
+ 14 => symbols::INTEGER_14.clone(),
+ 15 => symbols::INTEGER_15.clone(),
+ i => Symbol::intern(&format!("{i}")),
+ }
+ }
+
+ pub fn empty() -> Self {
+ symbols::__empty.clone()
+ }
+
+ pub fn as_str(&self) -> &str {
+ self.repr.as_str()
+ }
+
+ #[inline]
+ fn select_shard(
+ s: &str,
+ ) -> (
+ dashmap::RwLockWriteGuard<
+ 'static,
+ HashMap<SymbolProxy, SharedValue<()>, BuildHasherDefault<FxHasher>>,
+ >,
+ u64,
+ ) {
+ let storage = MAP.get_or_init(symbols::prefill);
+ let hash = {
+ let mut hasher = std::hash::BuildHasher::build_hasher(storage.hasher());
+ s.hash(&mut hasher);
+ hasher.finish()
+ };
+ let shard_idx = storage.determine_shard(hash as usize);
+ let shard = &storage.shards()[shard_idx];
+ (shard.write(), hash)
+ }
+
+ #[cold]
+ fn drop_slow(arc: &Arc<Box<str>>) {
+ let (mut shard, hash) = Self::select_shard(arc);
+
+ match Arc::count(arc) {
+ 0 => unreachable!(),
+ 1 => unreachable!(),
+ 2 => (),
+ _ => {
+ // Another thread has interned another copy
+ return;
+ }
+ }
+
+ ManuallyDrop::into_inner(
+ match shard.raw_entry_mut().from_key_hashed_nocheck::<str>(hash, arc.as_ref()) {
+ RawEntryMut::Occupied(occ) => occ.remove_entry(),
+ RawEntryMut::Vacant(_) => unreachable!(),
+ }
+ .0
+ .0
+ .try_as_arc_owned()
+ .unwrap(),
+ );
+ debug_assert_eq!(Arc::count(arc), 1);
+
+ // Shrink the backing storage if the shard is less than 50% occupied.
+ if shard.len() * 2 < shard.capacity() {
+ shard.shrink_to_fit();
+ }
+ }
+}
+
+impl Drop for Symbol {
+ #[inline]
+ fn drop(&mut self) {
+ let Some(arc) = self.repr.try_as_arc_owned() else {
+ return;
+ };
+ // When the last `Ref` is dropped, remove the object from the global map.
+ if Arc::count(&arc) == 2 {
+ // Only `self` and the global map point to the object.
+
+ Self::drop_slow(&arc);
+ }
+ // decrement the ref count
+ ManuallyDrop::into_inner(arc);
+ }
+}
+
+impl Clone for Symbol {
+ fn clone(&self) -> Self {
+ Self { repr: increase_arc_refcount(self.repr) }
+ }
+}
+
+fn increase_arc_refcount(repr: TaggedArcPtr) -> TaggedArcPtr {
+ let Some(arc) = repr.try_as_arc_owned() else {
+ return repr;
+ };
+ // increase the ref count
+ mem::forget(Arc::clone(&arc));
+ repr
+}
+
+impl fmt::Display for Symbol {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ self.as_str().fmt(f)
+ }
+}
+
+// only exists so we can use `from_key_hashed_nocheck` with a &str
+#[derive(Debug, PartialEq, Eq)]
+struct SymbolProxy(TaggedArcPtr);
+
+impl Hash for SymbolProxy {
+ fn hash<H: Hasher>(&self, state: &mut H) {
+ self.0.as_str().hash(state);
+ }
+}
+
+impl Borrow<str> for SymbolProxy {
+ fn borrow(&self) -> &str {
+ self.0.as_str()
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn smoke_test() {
+ Symbol::intern("isize");
+ let base_len = MAP.get().unwrap().len();
+ let hello = Symbol::intern("hello");
+ let world = Symbol::intern("world");
+ let more_worlds = world.clone();
+ let bang = Symbol::intern("!");
+ let q = Symbol::intern("?");
+ assert_eq!(MAP.get().unwrap().len(), base_len + 4);
+ let bang2 = Symbol::intern("!");
+ assert_eq!(MAP.get().unwrap().len(), base_len + 4);
+ drop(bang2);
+ assert_eq!(MAP.get().unwrap().len(), base_len + 4);
+ drop(q);
+ assert_eq!(MAP.get().unwrap().len(), base_len + 3);
+ let default = Symbol::intern("default");
+ let many_worlds = world.clone();
+ assert_eq!(MAP.get().unwrap().len(), base_len + 3);
+ assert_eq!(
+ "hello default world!",
+ format!("{} {} {}{}", hello.as_str(), default.as_str(), world.as_str(), bang.as_str())
+ );
+ drop(default);
+ assert_eq!(
+ "hello world!",
+ format!("{} {}{}", hello.as_str(), world.as_str(), bang.as_str())
+ );
+ drop(many_worlds);
+ drop(more_worlds);
+ drop(hello);
+ drop(world);
+ drop(bang);
+ assert_eq!(MAP.get().unwrap().len(), base_len);
+ }
+}