Unnamed repository; edit this file 'description' to name the repository.
Diffstat (limited to 'helix-view/src/handlers/word_index.rs')
| -rw-r--r-- | helix-view/src/handlers/word_index.rs | 509 |
1 files changed, 509 insertions, 0 deletions
diff --git a/helix-view/src/handlers/word_index.rs b/helix-view/src/handlers/word_index.rs new file mode 100644 index 00000000..9c3f8338 --- /dev/null +++ b/helix-view/src/handlers/word_index.rs @@ -0,0 +1,509 @@ +//! Indexing of words from open buffers. +//! +//! This provides an eventually consistent set of words used in any open buffers. This set is +//! later used for lexical completion. + +use std::{borrow::Cow, collections::HashMap, iter, mem, sync::Arc, time::Duration}; + +use helix_core::{ + chars::char_is_word, fuzzy::fuzzy_match, movement, ChangeSet, Range, Rope, RopeSlice, +}; +use helix_event::{register_hook, AsyncHook}; +use helix_stdx::rope::RopeSliceExt as _; +use parking_lot::RwLock; +use tokio::{sync::mpsc, time::Instant}; + +use crate::{ + events::{ConfigDidChange, DocumentDidChange, DocumentDidClose, DocumentDidOpen}, + DocumentId, +}; + +use super::Handlers; + +#[derive(Debug)] +struct Change { + old_text: Rope, + text: Rope, + changes: ChangeSet, +} + +#[derive(Debug)] +enum Event { + Insert(Rope), + Update(DocumentId, Change), + Delete(DocumentId, Rope), + /// Clear the entire word index. + /// This is used to clear memory when the feature is turned off. + Clear, +} + +#[derive(Debug)] +pub struct Handler { + pub(super) index: WordIndex, + /// A sender into an async hook which debounces updates to the index. + hook: mpsc::Sender<Event>, + /// A sender to a tokio task which coordinates the indexing of documents. + /// + /// See [WordIndex::run]. A supervisor-like task is in charge of spawning tasks to update the + /// index. This ensures that consecutive edits to a document trigger the correct order of + /// insertions and deletions into the word set. + coordinator: mpsc::UnboundedSender<Event>, +} + +impl Handler { + pub fn spawn() -> Self { + let index = WordIndex::default(); + let (tx, rx) = mpsc::unbounded_channel(); + tokio::spawn(index.clone().run(rx)); + Self { + hook: Hook { + changes: HashMap::default(), + coordinator: tx.clone(), + } + .spawn(), + index, + coordinator: tx, + } + } +} + +#[derive(Debug)] +struct Hook { + changes: HashMap<DocumentId, Change>, + coordinator: mpsc::UnboundedSender<Event>, +} + +const DEBOUNCE: Duration = Duration::from_secs(1); + +impl AsyncHook for Hook { + type Event = Event; + + fn handle_event(&mut self, event: Self::Event, timeout: Option<Instant>) -> Option<Instant> { + match event { + Event::Insert(_) => unreachable!("inserts are sent to the worker directly"), + Event::Update(doc, change) => { + if let Some(pending_change) = self.changes.get_mut(&doc) { + // If there is already a change waiting for this document, merge the two + // changes together by composing the changesets and saving the new `text`. + pending_change.changes = + mem::take(&mut pending_change.changes).compose(change.changes); + pending_change.text = change.text; + Some(Instant::now() + DEBOUNCE) + } else if !is_changeset_significant(&change.changes) { + // If the changeset is fairly large, debounce before updating the index. + self.changes.insert(doc, change); + Some(Instant::now() + DEBOUNCE) + } else { + // Otherwise if the change is small, queue the update to the index immediately. + self.coordinator.send(Event::Update(doc, change)).unwrap(); + timeout + } + } + Event::Delete(doc, text) => { + // If there are pending changes that haven't been indexed since the last debounce, + // forget them and delete the old text. + if let Some(change) = self.changes.remove(&doc) { + self.coordinator + .send(Event::Delete(doc, change.old_text)) + .unwrap(); + } else { + self.coordinator.send(Event::Delete(doc, text)).unwrap(); + } + timeout + } + Event::Clear => unreachable!("clear is sent to the worker directly"), + } + } + + fn finish_debounce(&mut self) { + for (doc, change) in self.changes.drain() { + self.coordinator.send(Event::Update(doc, change)).unwrap(); + } + } +} + +/// Minimum number of grapheme clusters required to include a word in the index +const MIN_WORD_GRAPHEMES: usize = 3; +/// Maximum word length allowed (in chars) +const MAX_WORD_LEN: usize = 50; + +type Word = kstring::KString; + +#[derive(Debug, Default)] +struct WordIndexInner { + /// Reference counted storage for words. + /// + /// Words are very likely to be reused many times. Instead of storing duplicates we keep a + /// reference count of times a word is used. When the reference count drops to zero the word + /// is removed from the index. + words: HashMap<Word, u32>, +} + +impl WordIndexInner { + fn words(&self) -> impl Iterator<Item = &Word> { + self.words.keys() + } + + fn insert(&mut self, word: RopeSlice) { + let word: Cow<str> = word.into(); + if let Some(rc) = self.words.get_mut(word.as_ref()) { + *rc = rc.saturating_add(1); + } else { + let word = match word { + Cow::Owned(s) => Word::from_string(s), + Cow::Borrowed(s) => Word::from_ref(s), + }; + self.words.insert(word, 1); + } + } + + fn remove(&mut self, word: RopeSlice) { + let word: Cow<str> = word.into(); + match self.words.get_mut(word.as_ref()) { + Some(1) => { + self.words.remove(word.as_ref()); + } + Some(n) => *n -= 1, + None => (), + } + } + + fn clear(&mut self) { + std::mem::take(&mut self.words); + } +} + +#[derive(Debug, Default, Clone)] +pub struct WordIndex { + inner: Arc<RwLock<WordIndexInner>>, +} + +impl WordIndex { + pub fn matches(&self, pattern: &str) -> Vec<String> { + let inner = self.inner.read(); + let mut matches = fuzzy_match(pattern, inner.words(), false); + matches.sort_unstable_by_key(|(_, score)| *score); + matches + .into_iter() + .map(|(word, _)| word.to_string()) + .collect() + } + + fn add_document(&self, text: &Rope) { + let mut inner = self.inner.write(); + for word in words(text.slice(..)) { + inner.insert(word); + } + } + + fn update_document(&self, old_text: &Rope, text: &Rope, changes: &ChangeSet) { + let mut inner = self.inner.write(); + for (old_window, new_window) in changed_windows(old_text.slice(..), text.slice(..), changes) + { + for word in words(new_window) { + inner.insert(word); + } + for word in words(old_window) { + inner.remove(word); + } + } + } + + fn remove_document(&self, text: &Rope) { + let mut inner = self.inner.write(); + for word in words(text.slice(..)) { + inner.remove(word); + } + } + + fn clear(&self) { + let mut inner = self.inner.write(); + inner.clear(); + } + + /// Coordinate the indexing of documents. + /// + /// This task wraps a MPSC queue and spawns blocking tasks which update the index. Updates + /// are applied one-by-one to ensure that changes to the index are **serialized**: + /// updates to each document must be applied in-order. + async fn run(self, mut events: mpsc::UnboundedReceiver<Event>) { + while let Some(event) = events.recv().await { + let this = self.clone(); + tokio::task::spawn_blocking(move || match event { + Event::Insert(text) => { + this.add_document(&text); + } + Event::Update( + _doc, + Change { + old_text, + text, + changes, + .. + }, + ) => { + this.update_document(&old_text, &text, &changes); + } + Event::Delete(_doc, text) => { + this.remove_document(&text); + } + Event::Clear => { + this.clear(); + } + }) + .await + .unwrap(); + } + } +} + +fn words(text: RopeSlice) -> impl Iterator<Item = RopeSlice> { + let mut cursor = Range::point(0); + if text + .get_char(cursor.anchor) + .is_some_and(|ch| !ch.is_whitespace()) + { + let cursor_word_end = movement::move_next_word_end(text, cursor, 1); + if cursor_word_end.anchor == 0 { + cursor = cursor_word_end; + } + } + + iter::from_fn(move || { + while cursor.head <= text.len_chars() { + let mut word = None; + if text + .slice(..cursor.head) + .graphemes_rev() + .take(MIN_WORD_GRAPHEMES) + .take_while(|g| g.chars().all(char_is_word)) + .count() + == MIN_WORD_GRAPHEMES + { + cursor.anchor += text + .chars_at(cursor.anchor) + .take_while(|&c| !char_is_word(c)) + .count(); + let slice = cursor.slice(text); + if slice.len_chars() <= MAX_WORD_LEN { + word = Some(slice); + } + } + let head = cursor.head; + cursor = movement::move_next_word_end(text, cursor, 1); + if cursor.head == head { + cursor.head = usize::MAX; + } + if word.is_some() { + return word; + } + } + None + }) +} + +/// Finds areas of the old and new texts around each operation in `changes`. +/// +/// The window is larger than the changed area and can encompass multiple insert/delete operations +/// if they are grouped closely together. +/// +/// The ranges of the old and new text should usually be of different sizes. For example a +/// deletion of "foo" surrounded by large retain sections would give a longer window into the +/// `old_text` and shorter window of `new_text`. Vice-versa for an insertion. A full replacement +/// of a word though would give two slices of the same size. +fn changed_windows<'a>( + old_text: RopeSlice<'a>, + new_text: RopeSlice<'a>, + changes: &'a ChangeSet, +) -> impl Iterator<Item = (RopeSlice<'a>, RopeSlice<'a>)> { + use helix_core::Operation::*; + + let mut operations = changes.changes().iter().peekable(); + let mut old_pos = 0; + let mut new_pos = 0; + iter::from_fn(move || loop { + let operation = operations.next()?; + let old_start = old_pos; + let new_start = new_pos; + let len = operation.len_chars(); + match operation { + Retain(_) => { + old_pos += len; + new_pos += len; + continue; + } + Insert(_) => new_pos += len, + Delete(_) => old_pos += len, + } + + // Scan ahead until a `Retain` is found which would end a window. + while let Some(o) = operations.next_if(|op| !matches!(op, Retain(n) if *n > MAX_WORD_LEN)) { + let len = o.len_chars(); + match o { + Retain(_) => { + old_pos += len; + new_pos += len; + } + Delete(_) => old_pos += len, + Insert(_) => new_pos += len, + } + } + + let old_window = old_start.saturating_sub(MAX_WORD_LEN) + ..(old_pos + MAX_WORD_LEN).min(old_text.len_chars()); + let new_window = new_start.saturating_sub(MAX_WORD_LEN) + ..(new_pos + MAX_WORD_LEN).min(new_text.len_chars()); + + return Some((old_text.slice(old_window), new_text.slice(new_window))); + }) +} + +/// Estimates whether a changeset is significant or small. +fn is_changeset_significant(changes: &ChangeSet) -> bool { + use helix_core::Operation::*; + + let mut diff = 0; + for operation in changes.changes() { + match operation { + Retain(_) => continue, + Delete(_) | Insert(_) => diff += operation.len_chars(), + } + } + + // This is arbitrary and could be tuned further: + diff > 1_000 +} + +pub(crate) fn register_hooks(handlers: &Handlers) { + let coordinator = handlers.word_index.coordinator.clone(); + register_hook!(move |event: &mut DocumentDidOpen<'_>| { + let doc = doc!(event.editor, &event.doc); + if doc.word_completion_enabled() { + coordinator.send(Event::Insert(doc.text().clone())).unwrap(); + } + Ok(()) + }); + + let tx = handlers.word_index.hook.clone(); + register_hook!(move |event: &mut DocumentDidChange<'_>| { + if !event.ghost_transaction && event.doc.word_completion_enabled() { + helix_event::send_blocking( + &tx, + Event::Update( + event.doc.id(), + Change { + old_text: event.old_text.clone(), + text: event.doc.text().clone(), + changes: event.changes.clone(), + }, + ), + ); + } + Ok(()) + }); + + let tx = handlers.word_index.hook.clone(); + register_hook!(move |event: &mut DocumentDidClose<'_>| { + if event.doc.word_completion_enabled() { + helix_event::send_blocking( + &tx, + Event::Delete(event.doc.id(), event.doc.text().clone()), + ); + } + Ok(()) + }); + + let coordinator = handlers.word_index.coordinator.clone(); + register_hook!(move |event: &mut ConfigDidChange<'_>| { + // The feature has been turned off. Clear the index and reclaim any used memory. + if event.old.word_completion.enable && !event.new.word_completion.enable { + coordinator.send(Event::Clear).unwrap(); + } + + // The feature has been turned on. Index open documents. + if !event.old.word_completion.enable && event.new.word_completion.enable { + for doc in event.editor.documents() { + if doc.word_completion_enabled() { + coordinator.send(Event::Insert(doc.text().clone())).unwrap(); + } + } + } + + Ok(()) + }); +} + +#[cfg(test)] +mod tests { + use std::collections::HashSet; + + use super::*; + use helix_core::diff::compare_ropes; + + impl WordIndex { + fn words(&self) -> HashSet<String> { + let inner = self.inner.read(); + inner.words().map(|w| w.to_string()).collect() + } + } + + #[track_caller] + fn assert_words<I: ToString, T: IntoIterator<Item = I>>(text: &str, expected: T) { + let text = Rope::from_str(text); + let index = WordIndex::default(); + index.add_document(&text); + let actual = index.words(); + let expected: HashSet<_> = expected.into_iter().map(|i| i.to_string()).collect(); + assert_eq!(expected, actual); + } + + #[test] + fn parse() { + assert_words("one two three", ["one", "two", "three"]); + assert_words("a foo c", ["foo"]); + } + + #[track_caller] + fn assert_diff<S, R, I>(before: &str, after: &str, expect_removed: R, expect_inserted: I) + where + S: ToString, + R: IntoIterator<Item = S>, + I: IntoIterator<Item = S>, + { + let before = Rope::from_str(before); + let after = Rope::from_str(after); + let diff = compare_ropes(&before, &after); + let expect_removed: HashSet<_> = + expect_removed.into_iter().map(|i| i.to_string()).collect(); + let expect_inserted: HashSet<_> = + expect_inserted.into_iter().map(|i| i.to_string()).collect(); + + let index = WordIndex::default(); + index.add_document(&before); + let words_before = index.words(); + index.update_document(&before, &after, diff.changes()); + let words_after = index.words(); + + let actual_removed = words_before.difference(&words_after).cloned().collect(); + let actual_inserted = words_after.difference(&words_before).cloned().collect(); + + eprintln!("\"{before}\" {words_before:?} => \"{after}\" {words_after:?}"); + assert_eq!( + expect_removed, actual_removed, + "expected {expect_removed:?} to be removed, instead {actual_removed:?} was" + ); + assert_eq!( + expect_inserted, actual_inserted, + "expected {expect_inserted:?} to be inserted, instead {actual_inserted:?} was" + ); + } + + #[test] + fn diff() { + assert_diff("one two three", "one five three", ["two"], ["five"]); + assert_diff("one two three", "one to three", ["two"], []); + assert_diff("one two three", "one three", ["two"], []); + assert_diff("one two three", "one t{o three", ["two"], []); + assert_diff("one foo three", "one fooo three", ["foo"], ["fooo"]); + } +} |