my fork of dmp
Cleanups
Anubhab Bandyopadhyay 2025-01-08
parent e1ba219 · commit 15666a8
-rw-r--r--src/dmp.rs91
-rw-r--r--src/traits.rs120
2 files changed, 103 insertions, 108 deletions
diff --git a/src/dmp.rs b/src/dmp.rs
index fef0ebb..223e232 100644
--- a/src/dmp.rs
+++ b/src/dmp.rs
@@ -449,10 +449,10 @@ impl DiffMatchPatch {
// Quick line-level diff on both strings, then rediff the parts for greater accuracy
// This speedup can produce non-minimal diffs
- fn line_mode<'a, T: DType>(
+ fn line_mode<T: DType>(
&self,
- old: &'a [T],
- new: &'a [T],
+ old: &[T],
+ new: &[T],
deadline: Option<Time>,
) -> Result<Vec<Diff<T>>, crate::errors::Error> {
let mut diffs = {
@@ -530,10 +530,10 @@ impl DiffMatchPatch {
Ok(diffs)
}
- pub(crate) fn diff_lines<'a>(
+ pub(crate) fn diff_lines(
&self,
- old: &'a [usize],
- new: &'a [usize],
+ old: &[usize],
+ new: &[usize],
deadline: Option<Time>,
) -> Result<Vec<Diff<usize>>, crate::errors::Error> {
if old == new {
@@ -586,10 +586,10 @@ impl DiffMatchPatch {
Ok(diffs)
}
- fn compute_lines<'a>(
+ fn compute_lines(
&self,
- old: &'a [usize],
- new: &'a [usize],
+ old: &[usize],
+ new: &[usize],
deadline: Option<Time>,
) -> Result<Vec<Diff<usize>>, crate::errors::Error> {
// returning all of the new part
@@ -662,31 +662,38 @@ impl DiffMatchPatch {
// Find the 'middle snake' of a diff, split the problem in two
// and return the recursively constructed diff.
// See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
- pub fn bisect<'a, T: DType>(
+ pub fn bisect<T: DType>(
&self,
- old: &'a [T],
- new: &'a [T],
+ old: &[T],
+ new: &[T],
deadline: Option<Time>,
) -> Result<Vec<Diff<T>>, crate::errors::Error> {
- let old_len = old.len() as isize;
- let new_len = new.len() as isize;
-
- let v_offset = (old_len + new_len + 1) >> 1; // same as `max_d`, (/ 2)
- let v_len = v_offset << 1; // (* 2)
-
- let mut v = vec![-1_isize; (v_len << 1) as usize];
- let (v1, v2) = v.split_at_mut(v_len as usize);
-
- {
- let v_trg = v_offset as usize + 1;
+ // Micro optimization:
+ // Do all setup before casting to isize
+ let mut v;
+ let (v_offset, v_len, v1, v2, old_len, new_len) = {
+ let v_offset = (old.len() + new.len() + 1) / 2;
+ let v_len = v_offset * 2;
+
+ v = vec![-1_isize; v_len * 2];
+ let (v1, v2) = v.split_at_mut(v_len);
+ let v_trg = v_offset + 1;
if v_trg < v1.len() {
v1[v_trg] = 0;
}
-
if v_trg < v2.len() {
v2[v_trg] = 0;
}
- }
+
+ (
+ v_offset as isize,
+ v_len as isize,
+ v1,
+ v2,
+ old.len() as isize,
+ new.len() as isize,
+ )
+ };
let delta = old_len - new_len;
// If the total number of characters is odd, then the front path will collide
@@ -856,9 +863,9 @@ impl DiffMatchPatch {
Ok(vec![Diff::delete(old), Diff::insert(new)])
}
- fn bisect_fwd_path_i<'a, T: DType>(
- old: &'a [T],
- new: &'a [T],
+ fn bisect_fwd_path_i<T: DType>(
+ old: &[T],
+ new: &[T],
x1: usize,
y1: usize,
) -> (usize, usize) {
@@ -877,9 +884,9 @@ impl DiffMatchPatch {
(x1, y1)
}
- fn bisect_rev_path_i<'a, T: DType>(
- old: &'a [T],
- new: &'a [T],
+ fn bisect_rev_path_i<T: DType>(
+ old: &[T],
+ new: &[T],
x2: usize,
y2: usize,
) -> (usize, usize) {
@@ -1300,11 +1307,7 @@ impl DiffMatchPatch {
// Scores range from 6 (best) to 0 (worst)
fn cleanup_semantic_score<T: DType>(one: &[T], two: &[T]) -> u8 {
let (char1, char2) = if let (Some(&char1), Some(&char2)) = (one.last(), two.first()) {
- if let (Some(c1), Some(c2)) = (char1.as_char(), char2.as_char()) {
- (c1, c2)
- } else {
- return 6;
- }
+ (char1, char2)
} else {
return 6;
};
@@ -1318,8 +1321,8 @@ impl DiffMatchPatch {
let whitespace_1 = char1.is_whitespace();
let whitespace_2 = char2.is_whitespace();
- let linebreak_1 = whitespace_1 && (char1 == '\n' || char1 == '\r');
- let linebreak_2 = whitespace_2 && (char2 == '\n' || char2 == '\r');
+ let linebreak_1 = whitespace_1 && (char1.is_newline() || char1.is_carriage());
+ let linebreak_2 = whitespace_2 && (char2.is_newline() || char2.is_carriage());
let blankline_1 = linebreak_1 && T::is_linebreak_end(one);
let blankline_2 = linebreak_2 && T::is_linebreak_start(two);
@@ -1330,13 +1333,13 @@ impl DiffMatchPatch {
} else if linebreak_1 || linebreak_2 {
// Four points for line breaks.
4
- } else if !char1.is_alphanumeric() && !whitespace_1 && whitespace_2 {
+ } else if !char1.is_alphanum() && !whitespace_1 && whitespace_2 {
// Three points for end of sentences.
3
} else if whitespace_1 || whitespace_2 {
// 2 for whitespace
2
- } else if !char1.is_alphanumeric() || !char2.is_alphanumeric() {
+ } else if !char1.is_alphanum() || !char2.is_alphanum() {
// 1 for not alphanumeric
1
} else {
@@ -1871,12 +1874,10 @@ impl DiffMatchPatch {
let mut linehash: HashMap<&'a [T], usize> = HashMap::new();
// Allocate 2/3rds of the UTF16::MAX (65535) value space for text1, the rest for text2.
- let mut maxlines = 40000;
- let chars_old = Self::lines_to_chars_internal(old, &mut lines, &mut linehash, maxlines);
+ let chars_old = Self::lines_to_chars_internal(old, &mut lines, &mut linehash, 40000);
// This basically represents the U16::MAX value
- maxlines = 65535;
- let chars_new = Self::lines_to_chars_internal(new, &mut lines, &mut linehash, maxlines);
+ let chars_new = Self::lines_to_chars_internal(new, &mut lines, &mut linehash, 65535);
LineToChars {
chars_old,
@@ -1898,7 +1899,7 @@ impl DiffMatchPatch {
let mut broke = false;
let mut cursor = 0;
- text.split_inclusive(|u| *u == T::from_char('\n'))
+ text.split_inclusive(|u| u.is_newline())
.enumerate()
.take(take)
.for_each(|(idx, line)| {
diff --git a/src/traits.rs b/src/traits.rs
index 90f9078..02d0ee2 100644
--- a/src/traits.rs
+++ b/src/traits.rs
@@ -26,7 +26,6 @@ const ENCODE_SET: &AsciiSet = &CONTROLS
.add(b'|');
pub trait DType: Copy + Ord + Eq + Hash {
- // fn differ(dmp: &DiffMatchPatch, txt_old: &str, txt_new: &str) -> Result<Vec<Diff<Self>>, crate::errors::Error>;
fn bisect_split(
dmp: &DiffMatchPatch,
old: &[Self],
@@ -34,13 +33,30 @@ pub trait DType: Copy + Ord + Eq + Hash {
x: usize,
y: usize,
deadline: Option<Time>,
- ) -> Result<Vec<Diff<Self>>, crate::errors::Error>;
+ ) -> Result<Vec<Diff<Self>>, crate::errors::Error> {
+ let (old_a, new_a, old_b, new_b) = if x <= old.len() && y <= new.len() {
+ (&old[..x], &new[..y], &old[x..], &new[y..])
+ } else {
+ return Err(crate::errors::Error::InvalidInput);
+ };
+
+ // Compute both diffs serially.
+ let mut diffs_a = dmp.diff_internal(old_a, new_a, false, deadline)?;
+ diffs_a.append(&mut dmp.diff_internal(old_b, new_b, false, deadline)?);
+
+ Ok(diffs_a)
+ }
fn from_char(c: char) -> Self;
fn as_char(&self) -> Option<char>;
fn from_str(str: &str) -> Vec<Self>;
fn to_string(data: &[Self]) -> Result<String, crate::Error>;
+ fn is_whitespace(self) -> bool { unimplemented!() }
+ fn is_newline(self) -> bool { unimplemented!() }
+ fn is_carriage(self) -> bool { unimplemented!() }
+ fn is_alphanum(self) -> bool { unimplemented!() }
+
fn is_linebreak_end(input: &[Self]) -> bool;
fn is_linebreak_start(input: &[Self]) -> bool;
@@ -53,27 +69,6 @@ pub trait DType: Copy + Ord + Eq + Hash {
}
impl DType for u8 {
- fn bisect_split(
- dmp: &DiffMatchPatch,
- old: &[u8],
- new: &[u8],
- x: usize,
- y: usize,
- deadline: Option<Time>,
- ) -> Result<Vec<Diff<u8>>, crate::errors::Error> {
- let old_a = &old[..x];
- let new_a = &new[..y];
-
- let old_b = &old[x..];
- let new_b = &new[y..];
-
- // Compute both diffs serially.
- let mut diffs_a = dmp.diff_internal(old_a, new_a, false, deadline)?;
- diffs_a.append(&mut dmp.diff_internal(old_b, new_b, false, deadline)?);
-
- Ok(diffs_a)
- }
-
fn from_char(c: char) -> Self {
c as u8
}
@@ -86,19 +81,32 @@ impl DType for u8 {
str.as_bytes().to_vec()
}
- #[inline]
fn to_string(data: &[Self]) -> Result<String, crate::Error> {
std::str::from_utf8(data)
.map_err(|_| crate::Error::Utf8Error)
.map(|s| s.to_string())
}
- #[inline]
+ fn is_whitespace(self) -> bool {
+ char::is_whitespace(self.into())
+ }
+
+ fn is_newline(self) -> bool {
+ char::is_newline(self.into())
+ }
+
+ fn is_carriage(self) -> bool {
+ self == b'\r'
+ }
+
+ fn is_alphanum(self) -> bool {
+ char::is_alphanumeric(self.into())
+ }
+
fn is_linebreak_end(input: &[Self]) -> bool {
input.ends_with(b"\n\n") || input.ends_with(b"\n\r\n")
}
- #[inline]
fn is_linebreak_start(input: &[Self]) -> bool {
input.starts_with(b"\r\n\n")
|| input.starts_with(b"\r\n\r\n")
@@ -106,7 +114,7 @@ impl DType for u8 {
|| input.starts_with(b"\n\n")
}
- #[inline]
+
fn percent_encode(input: &[Self]) -> Vec<Self> {
percent_encoding::percent_encode(input, ENCODE_SET)
.collect::<String>()
@@ -114,12 +122,11 @@ impl DType for u8 {
.to_vec()
}
- #[inline]
+
fn percent_decode(input: &[Self]) -> Vec<Self> {
percent_decode(input).collect()
}
- #[inline]
fn humanize(diffs: &mut Vec<Diff<Self>>) -> Result<(), crate::Error> {
let mut idx = 0_usize;
let mut err_prefix = vec![];
@@ -214,27 +221,6 @@ impl DType for u8 {
}
impl DType for char {
- fn bisect_split(
- dmp: &DiffMatchPatch,
- old: &[char],
- new: &[char],
- x: usize,
- y: usize,
- deadline: Option<Time>,
- ) -> Result<Vec<Diff<char>>, crate::errors::Error> {
- let old_a = &old[..x];
- let new_a = &new[..y];
-
- let old_b = &old[x..];
- let new_b = &new[y..];
-
- // Compute both diffs serially.
- let mut diffs_a = dmp.diff_internal(old_a, new_a, false, deadline)?;
- diffs_a.append(&mut dmp.diff_internal(old_b, new_b, false, deadline)?);
-
- Ok(diffs_a)
- }
-
fn from_char(c: char) -> Self {
c
}
@@ -247,17 +233,30 @@ impl DType for char {
str.chars().collect::<Vec<_>>()
}
- #[inline]
fn to_string(data: &[Self]) -> Result<String, crate::Error> {
Ok(data.iter().collect::<String>())
}
- #[inline]
+ fn is_whitespace(self) -> bool {
+ char::is_whitespace(self)
+ }
+
+ fn is_newline(self) -> bool {
+ self == '\n'
+ }
+
+ fn is_carriage(self) -> bool {
+ self == '\r'
+ }
+
+ fn is_alphanum(self) -> bool {
+ self.is_alphanumeric()
+ }
+
fn is_linebreak_end(input: &[Self]) -> bool {
input.ends_with(&['\n', '\n']) || input.ends_with(&['\n', '\r', '\n'])
}
- #[inline]
fn is_linebreak_start(input: &[Self]) -> bool {
input.starts_with(&['\r', '\n', '\n'])
|| input.starts_with(&['\r', '\n', '\r', '\n'])
@@ -265,7 +264,6 @@ impl DType for char {
|| input.starts_with(&['\n', '\n'])
}
- #[inline]
fn percent_encode(input: &[Self]) -> Vec<Self> {
let d = input
.iter()
@@ -283,7 +281,6 @@ impl DType for char {
Self::from_str(&encoded)
}
- #[inline]
fn percent_decode(input: &[Self]) -> Vec<Self> {
let ip = input.iter().collect::<String>();
percent_decode(ip.as_bytes())
@@ -303,11 +300,11 @@ impl DType for usize {
y: usize,
deadline: Option<Time>,
) -> Result<Vec<Diff<usize>>, crate::errors::Error> {
- let old_a = &old[..x];
- let new_a = &new[..y];
-
- let old_b = &old[x..];
- let new_b = &new[y..];
+ let (old_a, new_a, old_b, new_b) = if x <= old.len() && y <= new.len() {
+ (&old[..x], &new[..y], &old[x..], &new[y..])
+ } else {
+ return Err(crate::errors::Error::InvalidInput);
+ };
// Compute both diffs serially.
let mut diffs_a = dmp.diff_lines(old_a, new_a, deadline)?;
@@ -336,17 +333,14 @@ impl DType for usize {
unimplemented!()
}
- #[inline]
fn is_linebreak_start(_: &[Self]) -> bool {
unimplemented!()
}
- #[inline]
fn percent_encode(_: &[Self]) -> Vec<Self> {
unimplemented!()
}
- #[inline]
fn percent_decode(_: &[Self]) -> Vec<Self> {
unimplemented!()
}