helix

Unnamed repository; edit this file 'description' to name the repository.

Clone

HTTPS

SSH

stdx: Add floor/ceil char boundary functions to RopeSliceExt

These functions mimic `str::floor_char_boundary` and `str::ceil_char_boundary` (currently unstable under `round_char_boundary`). They're useful for correcting a byte index which may not lie on a character boundary. For example you might limit a search within a slice to some fixed number of bytes. The fixed number might not lie on a boundary though so it needs to be corrected to either the earlier (floor) or later (ceil) boundary.

Browse Source

Michael Davis 2025-01-26

parent aac0ce5 · commit 23b424a

Diffstat

-rw-r--r--

helix-stdx/src/rope.rs

1 files changed, 75 insertions, 0 deletions

diff --git a/helix-stdx/src/rope.rs b/helix-stdx/src/rope.rs
index 089c3678..0db9bf98 100644
--- a/helix-stdx/src/rope.rs
+++ b/helix-stdx/src/rope.rs

@@ -34,6 +34,42 @@ pub trait RopeSliceExt<'a>: Sized {

/// }

/// ```

fn byte_to_next_char(self, byte_idx: usize) -> usize;

+ /// Finds the closest byte index not exceeding `byte_idx` which lies on a character boundary.

+ ///

+ /// If `byte_idx` already lies on a character boundary then it is returned as-is. When

+ /// `byte_idx` lies between two character boundaries, this function returns the byte index of

+ /// the lesser / earlier / left-hand-side boundary.

+ ///

+ /// # Example

+ ///

+ /// ```

+ /// # use ropey::RopeSlice;

+ /// # use helix_stdx::rope::RopeSliceExt;

+ /// let text = RopeSlice::from("⌚"); // three bytes: e2 8c 9a

+ /// assert_eq!(text.floor_char_boundary(0), 0);

+ /// assert_eq!(text.floor_char_boundary(1), 0);

+ /// assert_eq!(text.floor_char_boundary(2), 0);

+ /// assert_eq!(text.floor_char_boundary(3), 3);

+ /// ```

+ fn floor_char_boundary(self, byte_idx: usize) -> usize;

+ /// Finds the closest byte index not below `byte_idx` which lies on a character boundary.

+ ///

+ /// If `byte_idx` already lies on a character boundary then it is returned as-is. When

+ /// `byte_idx` lies between two character boundaries, this function returns the byte index of

+ /// the greater / later / right-hand-side boundary.

+ ///

+ /// # Example

+ ///

+ /// ```

+ /// # use ropey::RopeSlice;

+ /// # use helix_stdx::rope::RopeSliceExt;

+ /// let text = RopeSlice::from("⌚"); // three bytes: e2 8c 9a

+ /// assert_eq!(text.ceil_char_boundary(0), 0);

+ /// assert_eq!(text.ceil_char_boundary(1), 3);

+ /// assert_eq!(text.ceil_char_boundary(2), 3);

+ /// assert_eq!(text.ceil_char_boundary(3), 3);

+ /// ```

+ fn ceil_char_boundary(self, byte_idx: usize) -> usize;

}

impl<'a> RopeSliceExt<'a> for RopeSlice<'a> {

@@ -103,6 +139,35 @@ impl<'a> RopeSliceExt<'a> for RopeSlice<'a> {

is_utf8_char_boundary(chunk.as_bytes().get(byte_idx).copied().unwrap_or(0));

chunk_char_off + byte_to_char_idx(chunk, byte_idx) + !is_char_boundary as usize

}

+ // These two are adapted from std's `round_char_boundary` functions:

+ fn floor_char_boundary(self, byte_idx: usize) -> usize {

+ if byte_idx >= self.len_bytes() {

+ self.len_bytes()

+ } else {

+ let offset = self

+ .bytes_at(byte_idx + 1)

+ .reversed()

+ .take(4)

+ .position(is_utf8_char_boundary)

+ // A char can only be four bytes long so we are guaranteed to find a boundary.

+ .unwrap();

+ byte_idx - offset

+ }

+ fn ceil_char_boundary(self, byte_idx: usize) -> usize {

+ if byte_idx > self.len_bytes() {

+ self.len_bytes()

+ } else {

+ let upper_bound = self.len_bytes().min(byte_idx + 4);

+ self.bytes_at(byte_idx)

+ .position(is_utf8_char_boundary)

+ .map_or(upper_bound, |pos| pos + byte_idx)

+ }

}

// copied from std

@@ -147,4 +212,14 @@ mod tests {

fn ends_with() {

assert!(RopeSlice::from("asdf").ends_with("f"));

}

+ #[test]

+ fn floor_ceil_char_boundary() {

+ let ascii = RopeSlice::from("ascii");

+ // When the given index lies on a character boundary, the index should not change.

+ for byte_idx in 0..=ascii.len_bytes() {

+ assert_eq!(ascii.floor_char_boundary(byte_idx), byte_idx);

+ assert_eq!(ascii.ceil_char_boundary(byte_idx), byte_idx);

+ }

}