Unnamed repository; edit this file 'description' to name the repository.
Diffstat (limited to 'helix-view/src/handlers/word_index.rs')
-rw-r--r--helix-view/src/handlers/word_index.rs509
1 file changed, 509 insertions, 0 deletions
diff --git a/helix-view/src/handlers/word_index.rs b/helix-view/src/handlers/word_index.rs
new file mode 100644
index 00000000..9c3f8338
--- /dev/null
+++ b/helix-view/src/handlers/word_index.rs
@@ -0,0 +1,509 @@
+//! Indexing of words from open buffers.
+//!
+//! This provides an eventually consistent set of words used in any open buffers. This set is
+//! later used for lexical completion.
+
+use std::{borrow::Cow, collections::HashMap, iter, mem, sync::Arc, time::Duration};
+
+use helix_core::{
+ chars::char_is_word, fuzzy::fuzzy_match, movement, ChangeSet, Range, Rope, RopeSlice,
+};
+use helix_event::{register_hook, AsyncHook};
+use helix_stdx::rope::RopeSliceExt as _;
+use parking_lot::RwLock;
+use tokio::{sync::mpsc, time::Instant};
+
+use crate::{
+ events::{ConfigDidChange, DocumentDidChange, DocumentDidClose, DocumentDidOpen},
+ DocumentId,
+};
+
+use super::Handlers;
+
/// A pending modification to a single document's text.
#[derive(Debug)]
struct Change {
    /// The document's text before the change was applied.
    old_text: Rope,
    /// The document's text after the change was applied.
    text: Rope,
    /// The edits transforming `old_text` into `text`.
    changes: ChangeSet,
}
+
/// Messages sent to the debouncing hook and to the coordinator task.
#[derive(Debug)]
enum Event {
    /// Index all words of a newly opened document.
    Insert(Rope),
    /// Apply a (possibly merged) change to a document's indexed words.
    Update(DocumentId, Change),
    /// Remove a closed document's words from the index.
    Delete(DocumentId, Rope),
    /// Clear the entire word index.
    /// This is used to clear memory when the feature is turned off.
    Clear,
}
+
/// Owner of the word index and of the channels used to keep it up to date.
#[derive(Debug)]
pub struct Handler {
    /// Shared, cheaply cloneable handle to the word set.
    pub(super) index: WordIndex,
    /// A sender into an async hook which debounces updates to the index.
    hook: mpsc::Sender<Event>,
    /// A sender to a tokio task which coordinates the indexing of documents.
    ///
    /// See [WordIndex::run]. A supervisor-like task is in charge of spawning tasks to update the
    /// index. This ensures that consecutive edits to a document trigger the correct order of
    /// insertions and deletions into the word set.
    coordinator: mpsc::UnboundedSender<Event>,
}
+
+impl Handler {
+ pub fn spawn() -> Self {
+ let index = WordIndex::default();
+ let (tx, rx) = mpsc::unbounded_channel();
+ tokio::spawn(index.clone().run(rx));
+ Self {
+ hook: Hook {
+ changes: HashMap::default(),
+ coordinator: tx.clone(),
+ }
+ .spawn(),
+ index,
+ coordinator: tx,
+ }
+ }
+}
+
/// Debouncing layer between document-change events and the coordinator task.
#[derive(Debug)]
struct Hook {
    /// Pending change per document, merged together until the debounce expires.
    changes: HashMap<DocumentId, Change>,
    /// Channel into the coordinator task which applies events to the index.
    coordinator: mpsc::UnboundedSender<Event>,
}

/// How long to wait after a significant change before reindexing the document.
const DEBOUNCE: Duration = Duration::from_secs(1);
+
+impl AsyncHook for Hook {
+ type Event = Event;
+
+ fn handle_event(&mut self, event: Self::Event, timeout: Option<Instant>) -> Option<Instant> {
+ match event {
+ Event::Insert(_) => unreachable!("inserts are sent to the worker directly"),
+ Event::Update(doc, change) => {
+ if let Some(pending_change) = self.changes.get_mut(&doc) {
+ // If there is already a change waiting for this document, merge the two
+ // changes together by composing the changesets and saving the new `text`.
+ pending_change.changes =
+ mem::take(&mut pending_change.changes).compose(change.changes);
+ pending_change.text = change.text;
+ Some(Instant::now() + DEBOUNCE)
+ } else if !is_changeset_significant(&change.changes) {
+ // If the changeset is fairly large, debounce before updating the index.
+ self.changes.insert(doc, change);
+ Some(Instant::now() + DEBOUNCE)
+ } else {
+ // Otherwise if the change is small, queue the update to the index immediately.
+ self.coordinator.send(Event::Update(doc, change)).unwrap();
+ timeout
+ }
+ }
+ Event::Delete(doc, text) => {
+ // If there are pending changes that haven't been indexed since the last debounce,
+ // forget them and delete the old text.
+ if let Some(change) = self.changes.remove(&doc) {
+ self.coordinator
+ .send(Event::Delete(doc, change.old_text))
+ .unwrap();
+ } else {
+ self.coordinator.send(Event::Delete(doc, text)).unwrap();
+ }
+ timeout
+ }
+ Event::Clear => unreachable!("clear is sent to the worker directly"),
+ }
+ }
+
+ fn finish_debounce(&mut self) {
+ for (doc, change) in self.changes.drain() {
+ self.coordinator.send(Event::Update(doc, change)).unwrap();
+ }
+ }
+}
+
/// Minimum number of grapheme clusters required to include a word in the index
const MIN_WORD_GRAPHEMES: usize = 3;
/// Maximum word length allowed (in chars)
const MAX_WORD_LEN: usize = 50;

/// Compact string type used as the key of the word set.
type Word = kstring::KString;
+
#[derive(Debug, Default)]
struct WordIndexInner {
    /// Reference counted storage for words.
    ///
    /// Words are very likely to be reused many times. Instead of storing duplicates we keep a
    /// reference count of times a word is used. When the reference count drops to zero the word
    /// is removed from the index.
    ///
    /// The value is the number of currently indexed occurrences of the key.
    words: HashMap<Word, u32>,
}
+
+impl WordIndexInner {
+ fn words(&self) -> impl Iterator<Item = &Word> {
+ self.words.keys()
+ }
+
+ fn insert(&mut self, word: RopeSlice) {
+ let word: Cow<str> = word.into();
+ if let Some(rc) = self.words.get_mut(word.as_ref()) {
+ *rc = rc.saturating_add(1);
+ } else {
+ let word = match word {
+ Cow::Owned(s) => Word::from_string(s),
+ Cow::Borrowed(s) => Word::from_ref(s),
+ };
+ self.words.insert(word, 1);
+ }
+ }
+
+ fn remove(&mut self, word: RopeSlice) {
+ let word: Cow<str> = word.into();
+ match self.words.get_mut(word.as_ref()) {
+ Some(1) => {
+ self.words.remove(word.as_ref());
+ }
+ Some(n) => *n -= 1,
+ None => (),
+ }
+ }
+
+ fn clear(&mut self) {
+ std::mem::take(&mut self.words);
+ }
+}
+
/// Thread-safe, cheaply cloneable handle to the shared word set.
#[derive(Debug, Default, Clone)]
pub struct WordIndex {
    inner: Arc<RwLock<WordIndexInner>>,
}
+
+impl WordIndex {
+ pub fn matches(&self, pattern: &str) -> Vec<String> {
+ let inner = self.inner.read();
+ let mut matches = fuzzy_match(pattern, inner.words(), false);
+ matches.sort_unstable_by_key(|(_, score)| *score);
+ matches
+ .into_iter()
+ .map(|(word, _)| word.to_string())
+ .collect()
+ }
+
+ fn add_document(&self, text: &Rope) {
+ let mut inner = self.inner.write();
+ for word in words(text.slice(..)) {
+ inner.insert(word);
+ }
+ }
+
+ fn update_document(&self, old_text: &Rope, text: &Rope, changes: &ChangeSet) {
+ let mut inner = self.inner.write();
+ for (old_window, new_window) in changed_windows(old_text.slice(..), text.slice(..), changes)
+ {
+ for word in words(new_window) {
+ inner.insert(word);
+ }
+ for word in words(old_window) {
+ inner.remove(word);
+ }
+ }
+ }
+
+ fn remove_document(&self, text: &Rope) {
+ let mut inner = self.inner.write();
+ for word in words(text.slice(..)) {
+ inner.remove(word);
+ }
+ }
+
+ fn clear(&self) {
+ let mut inner = self.inner.write();
+ inner.clear();
+ }
+
+ /// Coordinate the indexing of documents.
+ ///
+ /// This task wraps a MPSC queue and spawns blocking tasks which update the index. Updates
+ /// are applied one-by-one to ensure that changes to the index are **serialized**:
+ /// updates to each document must be applied in-order.
+ async fn run(self, mut events: mpsc::UnboundedReceiver<Event>) {
+ while let Some(event) = events.recv().await {
+ let this = self.clone();
+ tokio::task::spawn_blocking(move || match event {
+ Event::Insert(text) => {
+ this.add_document(&text);
+ }
+ Event::Update(
+ _doc,
+ Change {
+ old_text,
+ text,
+ changes,
+ ..
+ },
+ ) => {
+ this.update_document(&old_text, &text, &changes);
+ }
+ Event::Delete(_doc, text) => {
+ this.remove_document(&text);
+ }
+ Event::Clear => {
+ this.clear();
+ }
+ })
+ .await
+ .unwrap();
+ }
+ }
+}
+
/// Iterates over the words of `text`.
///
/// A range is yielded as a word when it ends in at least [MIN_WORD_GRAPHEMES] consecutive
/// word-character graphemes and spans at most [MAX_WORD_LEN] chars.
fn words(text: RopeSlice) -> impl Iterator<Item = RopeSlice> {
    let mut cursor = Range::point(0);
    // If the text starts with a non-whitespace character, widen the cursor over that first
    // word so the loop below does not skip it. NOTE(review): the `anchor == 0` check
    // presumably rejects moves that jumped past leading separators — confirm against
    // `move_next_word_end` semantics.
    if text
        .get_char(cursor.anchor)
        .is_some_and(|ch| !ch.is_whitespace())
    {
        let cursor_word_end = movement::move_next_word_end(text, cursor, 1);
        if cursor_word_end.anchor == 0 {
            cursor = cursor_word_end;
        }
    }

    iter::from_fn(move || {
        while cursor.head <= text.len_chars() {
            let mut word = None;
            // The range only counts as a word when the MIN_WORD_GRAPHEMES graphemes
            // immediately before `head` are all word characters.
            if text
                .slice(..cursor.head)
                .graphemes_rev()
                .take(MIN_WORD_GRAPHEMES)
                .take_while(|g| g.chars().all(char_is_word))
                .count()
                == MIN_WORD_GRAPHEMES
            {
                // Trim non-word characters (whitespace, punctuation) off the front of
                // the range before slicing out the word.
                cursor.anchor += text
                    .chars_at(cursor.anchor)
                    .take_while(|&c| !char_is_word(c))
                    .count();
                let slice = cursor.slice(text);
                // Discard pathologically long words.
                if slice.len_chars() <= MAX_WORD_LEN {
                    word = Some(slice);
                }
            }
            let head = cursor.head;
            cursor = movement::move_next_word_end(text, cursor, 1);
            if cursor.head == head {
                // No progress means we reached the end of the text; force the loop
                // condition false so the iterator terminates.
                cursor.head = usize::MAX;
            }
            if word.is_some() {
                return word;
            }
        }
        None
    })
}
+
/// Finds areas of the old and new texts around each operation in `changes`.
///
/// The window is larger than the changed area and can encompass multiple insert/delete operations
/// if they are grouped closely together.
///
/// The ranges of the old and new text should usually be of different sizes. For example a
/// deletion of "foo" surrounded by large retain sections would give a longer window into the
/// `old_text` and shorter window of `new_text`. Vice-versa for an insertion. A full replacement
/// of a word though would give two slices of the same size.
fn changed_windows<'a>(
    old_text: RopeSlice<'a>,
    new_text: RopeSlice<'a>,
    changes: &'a ChangeSet,
) -> impl Iterator<Item = (RopeSlice<'a>, RopeSlice<'a>)> {
    use helix_core::Operation::*;

    let mut operations = changes.changes().iter().peekable();
    // Char positions into the old and new texts, advanced in lockstep while walking
    // the operations.
    let mut old_pos = 0;
    let mut new_pos = 0;
    iter::from_fn(move || loop {
        let operation = operations.next()?;
        let old_start = old_pos;
        let new_start = new_pos;
        let len = operation.len_chars();
        match operation {
            // Retained text outside a window is unchanged: skip it and keep looking for
            // the next insert/delete, which starts a new window.
            Retain(_) => {
                old_pos += len;
                new_pos += len;
                continue;
            }
            Insert(_) => new_pos += len,
            Delete(_) => old_pos += len,
        }

        // Scan ahead until a `Retain` is found which would end a window.
        // Short retains (at most MAX_WORD_LEN chars) are absorbed into the current
        // window so that closely grouped edits are handled together.
        while let Some(o) = operations.next_if(|op| !matches!(op, Retain(n) if *n > MAX_WORD_LEN)) {
            let len = o.len_chars();
            match o {
                Retain(_) => {
                    old_pos += len;
                    new_pos += len;
                }
                Delete(_) => old_pos += len,
                Insert(_) => new_pos += len,
            }
        }

        // Pad both windows by MAX_WORD_LEN (clamped to the text bounds) so any word
        // overlapping the edit boundary is contained in full.
        let old_window = old_start.saturating_sub(MAX_WORD_LEN)
            ..(old_pos + MAX_WORD_LEN).min(old_text.len_chars());
        let new_window = new_start.saturating_sub(MAX_WORD_LEN)
            ..(new_pos + MAX_WORD_LEN).min(new_text.len_chars());

        return Some((old_text.slice(old_window), new_text.slice(new_window)));
    })
}
+
+/// Estimates whether a changeset is significant or small.
+fn is_changeset_significant(changes: &ChangeSet) -> bool {
+ use helix_core::Operation::*;
+
+ let mut diff = 0;
+ for operation in changes.changes() {
+ match operation {
+ Retain(_) => continue,
+ Delete(_) | Insert(_) => diff += operation.len_chars(),
+ }
+ }
+
+ // This is arbitrary and could be tuned further:
+ diff > 1_000
+}
+
+pub(crate) fn register_hooks(handlers: &Handlers) {
+ let coordinator = handlers.word_index.coordinator.clone();
+ register_hook!(move |event: &mut DocumentDidOpen<'_>| {
+ let doc = doc!(event.editor, &event.doc);
+ if doc.word_completion_enabled() {
+ coordinator.send(Event::Insert(doc.text().clone())).unwrap();
+ }
+ Ok(())
+ });
+
+ let tx = handlers.word_index.hook.clone();
+ register_hook!(move |event: &mut DocumentDidChange<'_>| {
+ if !event.ghost_transaction && event.doc.word_completion_enabled() {
+ helix_event::send_blocking(
+ &tx,
+ Event::Update(
+ event.doc.id(),
+ Change {
+ old_text: event.old_text.clone(),
+ text: event.doc.text().clone(),
+ changes: event.changes.clone(),
+ },
+ ),
+ );
+ }
+ Ok(())
+ });
+
+ let tx = handlers.word_index.hook.clone();
+ register_hook!(move |event: &mut DocumentDidClose<'_>| {
+ if event.doc.word_completion_enabled() {
+ helix_event::send_blocking(
+ &tx,
+ Event::Delete(event.doc.id(), event.doc.text().clone()),
+ );
+ }
+ Ok(())
+ });
+
+ let coordinator = handlers.word_index.coordinator.clone();
+ register_hook!(move |event: &mut ConfigDidChange<'_>| {
+ // The feature has been turned off. Clear the index and reclaim any used memory.
+ if event.old.word_completion.enable && !event.new.word_completion.enable {
+ coordinator.send(Event::Clear).unwrap();
+ }
+
+ // The feature has been turned on. Index open documents.
+ if !event.old.word_completion.enable && event.new.word_completion.enable {
+ for doc in event.editor.documents() {
+ if doc.word_completion_enabled() {
+ coordinator.send(Event::Insert(doc.text().clone())).unwrap();
+ }
+ }
+ }
+
+ Ok(())
+ });
+}
+
#[cfg(test)]
mod tests {
    use std::collections::HashSet;

    use super::*;
    use helix_core::diff::compare_ropes;

    impl WordIndex {
        /// Test helper: snapshot of every word currently in the index.
        fn words(&self) -> HashSet<String> {
            let inner = self.inner.read();
            inner.words().map(|w| w.to_string()).collect()
        }
    }

    /// Asserts that indexing `text` from scratch yields exactly the `expected` words.
    #[track_caller]
    fn assert_words<I: ToString, T: IntoIterator<Item = I>>(text: &str, expected: T) {
        let text = Rope::from_str(text);
        let index = WordIndex::default();
        index.add_document(&text);
        let actual = index.words();
        let expected: HashSet<_> = expected.into_iter().map(|i| i.to_string()).collect();
        assert_eq!(expected, actual);
    }

    #[test]
    fn parse() {
        assert_words("one two three", ["one", "two", "three"]);
        // Words shorter than MIN_WORD_GRAPHEMES ("a", "c") are not indexed.
        assert_words("a foo c", ["foo"]);
    }

    /// Asserts that applying the diff between `before` and `after` to an index built
    /// from `before` removes exactly `expect_removed` and inserts exactly
    /// `expect_inserted`.
    #[track_caller]
    fn assert_diff<S, R, I>(before: &str, after: &str, expect_removed: R, expect_inserted: I)
    where
        S: ToString,
        R: IntoIterator<Item = S>,
        I: IntoIterator<Item = S>,
    {
        let before = Rope::from_str(before);
        let after = Rope::from_str(after);
        let diff = compare_ropes(&before, &after);
        let expect_removed: HashSet<_> =
            expect_removed.into_iter().map(|i| i.to_string()).collect();
        let expect_inserted: HashSet<_> =
            expect_inserted.into_iter().map(|i| i.to_string()).collect();

        let index = WordIndex::default();
        index.add_document(&before);
        let words_before = index.words();
        index.update_document(&before, &after, diff.changes());
        let words_after = index.words();

        // Removed/inserted words are derived from the set difference before vs. after.
        let actual_removed = words_before.difference(&words_after).cloned().collect();
        let actual_inserted = words_after.difference(&words_before).cloned().collect();

        eprintln!("\"{before}\" {words_before:?} => \"{after}\" {words_after:?}");
        assert_eq!(
            expect_removed, actual_removed,
            "expected {expect_removed:?} to be removed, instead {actual_removed:?} was"
        );
        assert_eq!(
            expect_inserted, actual_inserted,
            "expected {expect_inserted:?} to be inserted, instead {actual_inserted:?} was"
        );
    }

    #[test]
    fn diff() {
        assert_diff("one two three", "one five three", ["two"], ["five"]);
        assert_diff("one two three", "one to three", ["two"], []);
        assert_diff("one two three", "one three", ["two"], []);
        assert_diff("one two three", "one t{o three", ["two"], []);
        assert_diff("one foo three", "one fooo three", ["foo"], ["fooo"]);
    }
}