my fork of dmp
Cleanup
Anubhab Bandyopadhyay 2025-01-02
parent 8431bbf · commit a56e49a
-rw-r--r--README.md10
-rw-r--r--src/dmp.rs277
2 files changed, 157 insertions, 130 deletions
diff --git a/README.md b/README.md
index 2b651dc..179dbca 100644
--- a/README.md
+++ b/README.md
@@ -180,11 +180,11 @@ Benchmarks are maintained [diff-match-patch-bench repository](https://github.com
| Lang. | Library | Diff Avg. | Patch Avg. | Bencher | Mode | Correct |
|:-------:|:----------------------------------------------------------------------------------------:|:---------:|:----------:|:----------:|:-----------:|:-------:|
-| `rust` | [diff_match_patch v0.1.1](https://crates.io/crates/diff_match_patch)[^2] | 56.618 ms | 10.596 ms | Criterion | - | ✅ |
-| `rust` | [dmp v0.2.0](https://crates.io/crates/dmp) | 56.609 ms | 14.654 ms | Criterion | - | ✅ |
-| `rust` | [diff-match-patch-rs](https://github.com/AnubhabB/diff-match-patch-rs.git)<sup>our</sup> | 55.149 ms | 631.13 µs | Criterion | `Efficient`| ✅ |
-| `rust` | [diff-match-patch-rs](https://github.com/AnubhabB/diff-match-patch-rs.git)<sup>our</sup> | 55.275 ms | 1.1703 ms | Criterion | `Compat` | ✅ |
-| `go` | [go-diff](https://github.com/sergi/go-diff) | 50.31 ms | 135.2 ms | go test | - | ✅ |
+| `rust` | [diff_match_patch v0.1.1](https://crates.io/crates/diff_match_patch)[^2] | 56.618 ms | 9.00 ms | Criterion | - | ✅ |
+| `rust` | [dmp v0.2.0](https://crates.io/crates/dmp) | 56.609 ms | 12.25 ms | Criterion | - | ✅ |
+| `rust` | [diff-match-patch-rs](https://github.com/AnubhabB/diff-match-patch-rs.git)<sup>our</sup> | 34.660 ms | 533.17 µs | Criterion | `Efficient`| ✅ |
+| `rust` | [diff-match-patch-rs](https://github.com/AnubhabB/diff-match-patch-rs.git)<sup>our</sup> | 35.095 ms | 993.90 µs | Criterion | `Compat` | ✅ |
+| `go` | [go-diff](https://github.com/sergi/go-diff) | 40.31 ms | 135.2 ms | go test | - | ✅ |
| `node` | [diff-match-patch](https://www.npmjs.com/package/diff-match-patch)[^1] | 246.90 ms | 1.07 ms | tinybench | - | ❌ |
| `python`| [diff-match-patch](https://pypi.org/project/diff-match-patch/) | 1.01 s | 0.25 ms | timeit | - | ✅ |
diff --git a/src/dmp.rs b/src/dmp.rs
index 3d4463c..e1905ec 100644
--- a/src/dmp.rs
+++ b/src/dmp.rs
@@ -56,25 +56,27 @@ impl<T: DType> Diff<T> {
Self(op, data.to_vec())
}
- /// helper functions to create ops
+ /// helper functions to create op::Delete
pub fn delete(data: &[T]) -> Self {
Self::new(Ops::Delete, data)
}
+ /// helper functions to create op::Insert
pub fn insert(data: &[T]) -> Self {
Self::new(Ops::Insert, data)
}
+ /// helper functions to create op::Equal
pub fn equal(data: &[T]) -> Self {
Self::new(Ops::Equal, data)
}
- // returns the operation of the current diff
+ /// returns the operation of the current diff
pub fn op(&self) -> Ops {
self.0
}
- // returns the bytes of the data
+ /// returns slice of the data
pub fn data(&self) -> &[T] {
&self.1[..]
}
@@ -263,26 +265,42 @@ impl DiffMatchPatch {
}
// Trim common prefix
- let common_prefix = Self::common_prefix(old_t, new_t, false);
- let common_suffix =
- Self::common_prefix(&old_t[common_prefix..], &new_t[common_prefix..], true);
-
- let mut diffs = self.compute(
- &old_t[common_prefix..old_t.len() - common_suffix],
- &new_t[common_prefix..new_t.len() - common_suffix],
- linemode,
- deadline,
- )?;
+ let (common_prefix, common_suffix) = Self::common_prefix_suffix(old_t, new_t);
+
+ let (old_uniq_end, new_uniq_end) =
+ (old_t.len() - common_suffix, new_t.len() - common_suffix);
+
+ // Some elaborate checks to avoid bound checks
+ // slice_start_index_len_fail
+ // slice_end_index_len_fail
+ // slice_index_order_fail
+ let mut diffs = if common_prefix <= old_t.len()
+ && common_prefix <= new_t.len()
+ && old_uniq_end <= old_t.len()
+ && new_uniq_end <= new_t.len()
+ && common_prefix <= old_uniq_end
+ && common_prefix <= new_uniq_end
+ {
+ self.compute(
+ &old_t[common_prefix..old_uniq_end],
+ &new_t[common_prefix..new_uniq_end],
+ linemode,
+ deadline,
+ )?
+ } else {
+ // This is part of the bound check optimization! Should never fail!
+ return Err(crate::errors::Error::InvalidInput);
+ };
// Restore the prefix and suffix.
- if common_prefix > 0 {
+ if common_prefix > 0 && common_prefix <= old_t.len() {
let mut d = vec![Diff::equal(&old_t[..common_prefix])];
d.append(&mut diffs);
diffs = d;
}
- if common_suffix > 0 {
- diffs.push(Diff::new(Ops::Equal, &new_t[new_t.len() - common_suffix..]));
+ if common_suffix > 0 && new_t.len() >= common_suffix && new_uniq_end < new_t.len() {
+ diffs.push(Diff::new(Ops::Equal, &new_t[new_uniq_end..]));
}
Self::cleanup_merge(&mut diffs);
@@ -314,11 +332,7 @@ impl DiffMatchPatch {
};
// found a subsequence which contains the short text
- if let Some(idx) = long
- .windows(short.len())
- .step_by(1)
- .position(|k| k == short)
- {
+ if let Some(idx) = long.windows(short.len()).position(|k| k == short) {
// Shorter text is inside the longer text (speedup).
let op = if old_gt_new { Ops::Delete } else { Ops::Insert };
let diffs = vec![
@@ -388,11 +402,9 @@ impl DiffMatchPatch {
}
// First check if the second quarter is the seed for a half-match.
- // let hm1 = Self::diff_half_match_i(long, short, (long.len() as f32 / 4.).ceil() as usize);
- let hm1 = Self::half_match_i(long, short, long.len() >> 2);
+ let hm1 = Self::half_match_i(long, short, (long.len() + 3) >> 2);
// Check again based on the third quarter.
- // let hm2 = Self::diff_half_match_i(long, short, (long.len() as f32 / 2.).ceil() as usize);
- let hm2 = Self::half_match_i(long, short, long.len() >> 1);
+ let hm2 = Self::half_match_i(long, short, (long.len() + 1) >> 1);
if hm1.is_none() && hm2.is_none() {
return None;
@@ -458,7 +470,7 @@ impl DiffMatchPatch {
// Add a dummy entry at the end.
diffs.push(Diff::equal(&[]));
-
+
let mut pointer = 0_usize;
// count of bytes inserted
@@ -487,7 +499,9 @@ impl DiffMatchPatch {
let idxstart = pointer - delete_n - insert_n;
let idxend = idxstart + delete_n + insert_n;
- diffs.drain(idxstart..idxend);
+ if idxstart <= idxend {
+ diffs.drain(idxstart..idxend);
+ }
pointer = idxstart;
@@ -539,21 +553,32 @@ impl DiffMatchPatch {
}
// Trim common prefix
- let common_prefix = Self::common_prefix(old, new, false);
- let common_suffix = Self::common_prefix(&old[common_prefix..], &new[common_prefix..], true);
-
- let mut diffs = self.compute_lines(
- &old[common_prefix..old.len() - common_suffix],
- &new[common_prefix..new.len() - common_suffix],
- deadline,
- )?;
+ let (common_prefix, common_suffix) = Self::common_prefix_suffix(old, new);
+
+ let (old_uniq_end, new_uniq_end) = (old.len() - common_suffix, new.len() - common_suffix);
+ let mut diffs = if common_prefix <= old.len()
+ && common_prefix <= new.len()
+ && old_uniq_end <= old.len()
+ && new_uniq_end <= new.len()
+ && common_prefix <= old_uniq_end
+ && common_prefix <= new_uniq_end
+ {
+ self.compute_lines(
+ &old[common_prefix..old_uniq_end],
+ &new[common_prefix..new_uniq_end],
+ deadline,
+ )?
+ } else {
+ // This is part of the bound check optimization! Should never fail!
+ return Err(crate::errors::Error::InvalidInput);
+ };
- if common_prefix > 0 {
+ if common_prefix > 0 && common_prefix <= old.len() {
diffs.insert(0, Diff::equal(&old[..common_prefix]));
}
- if common_suffix > 0 {
- diffs.push(Diff::new(Ops::Equal, &new[new.len() - common_suffix..]));
+ if common_suffix > 0 && new.len() >= common_suffix && new_uniq_end < new.len() {
+ diffs.push(Diff::new(Ops::Equal, &new[new_uniq_end..]));
}
Self::cleanup_merge(&mut diffs);
@@ -584,11 +609,7 @@ impl DiffMatchPatch {
};
// found a subsequence which contains the short text
- if let Some(idx) = long
- .windows(short.len())
- .step_by(1)
- .position(|k| k == short)
- {
+ if let Some(idx) = long.windows(short.len()).position(|k| k == short) {
// Shorter text is inside the longer text (speedup).
let op = if old_gt_new { Ops::Delete } else { Ops::Insert };
let diffs = vec![
@@ -872,16 +893,20 @@ impl DiffMatchPatch {
// Does a substring of shorttext exist within longtext such that the substring
// is at least half the length of longtext?
// idx Start index of quarter length substring within longtext.
- #[inline]
fn half_match_i<'a, T: DType>(
long: &'a [T],
short: &'a [T],
idx: usize,
) -> Option<HalfMatch<'a, T>> {
// Start with a 1/4 length substring at position i as a seed.
-
- let seed = &long[idx..idx + (long.len() >> 2)];
- let seedleen = seed.len();
+ let seed = {
+ let end = idx + (long.len() >> 2);
+ if idx < long.len() && end < long.len() {
+ &long[idx..end]
+ } else {
+ return None;
+ }
+ };
let mut j = 0;
let mut best_common: &[T] = &[];
@@ -890,27 +915,47 @@ impl DiffMatchPatch {
let mut best_short_a: &[T] = &[];
let mut best_short_b: &[T] = &[];
- while let Some(pos) = &short[j..]
- .windows(seedleen)
- .step_by(1)
- .position(|p| p == seed)
- {
- j += *pos;
+ while j < short.len() {
+ if let Some(pos) = &short[j..].windows(seed.len()).position(|p| p == seed) {
+ j += *pos;
- let prefix_len = Self::common_prefix(&long[idx..], &short[j..], false);
- let suffix_len = Self::common_prefix(&long[..idx], &short[..j], true);
+ let (prefix_len, suffix_len) = if idx < long.len() && j < short.len() {
+ (
+ Self::common_prefix(&long[idx..], &short[j..], false),
+ Self::common_prefix(&long[..idx], &short[..j], true),
+ )
+ } else {
+ // this bound check shouldn't fail .. since it has, no point going further
+ return None;
+ };
- if best_common.len() < suffix_len + prefix_len {
- best_common = &short[j - suffix_len..j + prefix_len];
+ if best_common.len() < suffix_len + prefix_len {
+ let j_min_suf = j - suffix_len;
+ let j_pls_prf = j + prefix_len;
+ let i_min_suf = idx - suffix_len;
+ let i_pls_prf = idx + prefix_len;
+
+ // Optimizing for bound checks
+ if j_min_suf < j_pls_prf
+ && j_min_suf <= short.len()
+ && j_pls_prf <= short.len()
+ && i_min_suf <= long.len()
+ && i_pls_prf <= long.len()
+ {
+ best_common = &short[j_min_suf..j_pls_prf];
- best_long_a = &long[..idx - suffix_len];
- best_long_b = &long[idx + prefix_len..];
+ best_long_a = &long[..i_min_suf];
+ best_long_b = &long[i_pls_prf..];
- best_short_a = &short[..j - suffix_len];
- best_short_b = &short[j + prefix_len..];
- }
+ best_short_a = &short[..j_min_suf];
+ best_short_b = &short[j_pls_prf..];
+ }
+ }
- j += 1;
+ j += 1;
+ } else {
+ break;
+ }
}
if (best_common.len() << 1) >= long.len() {
@@ -930,8 +975,6 @@ impl DiffMatchPatch {
// We are doing a binary search here, and I've observed similar performance as noted by https://neil.fraser.name/news/2007/10/09/
// Some benchmark code can be found in benches/prefix.rs
// Reverse prefix is suffix
- // TODO: investigate this further
- #[inline]
fn common_prefix<T: DType>(lhs: &[T], rhs: &[T], reverse: bool) -> usize {
if lhs.is_empty()
|| rhs.is_empty()
@@ -957,7 +1000,10 @@ impl DiffMatchPatch {
)
};
- if lhs[lhsrange] == rhs[rhsrange] {
+ if lhsrange.end <= lhs.len()
+ && rhsrange.end <= rhs.len()
+ && lhs[lhsrange] == rhs[rhsrange]
+ {
pointmin = pointmid;
pointstart = pointmin;
} else {
@@ -970,6 +1016,19 @@ impl DiffMatchPatch {
pointmid
}
+ fn common_prefix_suffix<T: DType>(lhs: &[T], rhs: &[T]) -> (usize, usize) {
+ let common_prefix = Self::common_prefix(lhs, rhs, false);
+ let common_suffix = {
+ if common_prefix < lhs.len() && common_prefix < rhs.len() {
+ Self::common_prefix(&lhs[common_prefix..], &rhs[common_prefix..], true)
+ } else {
+ 0
+ }
+ };
+
+ (common_prefix, common_suffix)
+ }
+
fn common_overlap<T: DType>(lhs: &[T], rhs: &[T]) -> usize {
if lhs.is_empty() || rhs.is_empty() {
return 0;
@@ -993,11 +1052,7 @@ impl DiffMatchPatch {
let mut len = 1;
let mut best = 0;
- while let Some(found) = r
- .windows(len)
- .step_by(1)
- .position(|p| p == &l[l.len() - len..])
- {
+ while let Some(found) = r.windows(len).position(|p| p == &l[l.len() - len..]) {
len += found;
if found == 0 || l[l.len() - len..] == r[..len] {
best = len;
@@ -1009,7 +1064,6 @@ impl DiffMatchPatch {
}
// Reduce the number of edits by eliminating semantically trivial equalities
- #[inline]
fn cleanup_semantic<T: DType>(diffs: &mut Vec<Diff<T>>) {
let mut changes = false;
@@ -1026,9 +1080,7 @@ impl DiffMatchPatch {
let mut insert_len_post = 0_usize;
let mut delete_len_post = 0_usize;
- let mut difflen = diffs.len();
-
- while pointer < difflen {
+ while pointer < diffs.len() {
let mut diff_mod = false;
if diffs[pointer].op() == Ops::Equal {
equalities.push(pointer);
@@ -1061,8 +1113,6 @@ impl DiffMatchPatch {
diffs.insert(last, Diff::delete(last_eq));
// Change the other copy to insert
diffs[last + 1].0 = Ops::Insert;
- // change diff length
- difflen = diffs.len();
// Throw away the equality we just deleted.
equalities.pop();
@@ -1100,8 +1150,6 @@ impl DiffMatchPatch {
Self::cleanup_semantic_lossless(diffs);
- difflen = diffs.len();
-
// Find any overlaps between deletions and insertions.
// e.g: <del>abcxxx</del><ins>xxxdef</ins>
// -> <del>abc</del>xxx<ins>def</ins>
@@ -1109,7 +1157,7 @@ impl DiffMatchPatch {
// -> <ins>def</ins>xxx<del>abc</del>
// Only extract an overlap if it is as big as the edit ahead or behind it.
pointer = 1;
- while difflen > 0 && pointer < difflen {
+ while !diffs.is_empty() && pointer < diffs.len() {
if diffs[pointer - 1].op() == Ops::Delete && diffs[pointer].op() == Ops::Insert {
let delete = diffs[pointer - 1].data().to_vec();
let insert = diffs[pointer].data().to_vec();
@@ -1127,7 +1175,7 @@ impl DiffMatchPatch {
diffs.insert(pointer, Diff::equal(&insert[..overlap_1]));
diffs[pointer - 1].1 = delete[..delete.len() - overlap_1].to_vec();
diffs[pointer + 1].1 = insert[overlap_1..].to_vec();
- difflen = diffs.len();
+
pointer += 1;
} else if overlap_2 >= delete_thres || overlap_2 >= insert_thres {
// Reverse overlap
@@ -1136,7 +1184,6 @@ impl DiffMatchPatch {
diffs[pointer - 1] = Diff::insert(&insert[..insert.len() - overlap_2]);
diffs[pointer + 1] = Diff::delete(&delete[overlap_2..]);
- difflen = diffs.len();
pointer += 1;
}
pointer += 1;
@@ -1147,19 +1194,13 @@ impl DiffMatchPatch {
// Look for single edits surrounded on both sides by equalities
// e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
- #[inline]
fn cleanup_semantic_lossless<T: DType>(diffs: &mut Vec<Diff<T>>) {
let mut pointer = 1_usize;
- let mut difflen = diffs.len();
// Intentionally ignore the first and last element (don't need checking).
- while difflen > 0 && pointer < difflen - 1 {
+ while !diffs.is_empty() && (1..diffs.len() - 1).contains(&pointer) {
// an edit surrounded by equalities
if diffs[pointer - 1].op() == Ops::Equal && diffs[pointer + 1].op() == Ops::Equal {
- // let mut equality_prev = diffs[pointer - 1].data().to_vec();
- // let mut edit = diffs[pointer].data().to_vec();
- // let mut equality_next = diffs[pointer + 1].data().to_vec();
-
// Shift the edit as far left as possible
let (mut equality_prev, mut edit, mut equality_next) = {
let commonlen =
@@ -1194,7 +1235,7 @@ impl DiffMatchPatch {
let mut best_score = Self::cleanup_semantic_score(&equality_prev[..], &edit[..])
+ Self::cleanup_semantic_score(&edit[..], &equality_next[..]);
- while !edit.is_empty() && !equality_next.is_empty() && edit[0] == equality_next[0] {
+ while !edit.is_empty() && edit.first() == equality_next.first() {
equality_prev.push(edit[0]);
edit.remove(0);
edit.push(equality_next[0]);
@@ -1220,7 +1261,6 @@ impl DiffMatchPatch {
} else {
diffs.remove(pointer - 1);
pointer -= 1;
- difflen = diffs.len();
}
diffs[pointer].1.clone_from(&best_edit);
@@ -1230,7 +1270,6 @@ impl DiffMatchPatch {
} else {
diffs.remove(pointer + 1);
pointer -= 1;
- difflen = diffs.len();
}
}
}
@@ -1242,7 +1281,6 @@ impl DiffMatchPatch {
// Given two strings, compute a score representing whether the internal
// boundary falls on logical boundaries
// Scores range from 6 (best) to 0 (worst)
- #[inline]
fn cleanup_semantic_score<T: DType>(one: &[T], two: &[T]) -> u8 {
let (char1, char2) = if let (Some(&char1), Some(&char2)) = (one.last(), two.first()) {
if let (Some(c1), Some(c2)) = (char1.as_char(), char2.as_char()) {
@@ -1294,9 +1332,7 @@ impl DiffMatchPatch {
fn cleanup_merge<T: DType>(diffs: &mut Vec<Diff<T>>) {
Self::cleanup_merge_first(diffs);
- let changes = Self::cleanup_merge_second(diffs);
-
- if changes {
+ if Self::cleanup_merge_second(diffs) {
Self::cleanup_merge(diffs);
}
}
@@ -1328,7 +1364,7 @@ impl DiffMatchPatch {
pointer += 1;
}
Ops::Equal => {
- Self::cleanup_perge_proc_equal(
+ Self::cleanup_merge_proc_equal(
diffs,
insert_n,
delete_n,
@@ -1412,7 +1448,8 @@ impl DiffMatchPatch {
changes
}
- fn cleanup_perge_proc_equal<T: DType>(
+ // Handling `Equality` in the merge process
+ fn cleanup_merge_proc_equal<T: DType>(
diffs: &mut Vec<Diff<T>>,
insert_n: usize,
delete_n: usize,
@@ -1487,7 +1524,6 @@ impl DiffMatchPatch {
}
// Reduce the number of edits by eliminating operationally trivial equalities.
- #[inline]
fn cleanup_efficiency<T: DType>(&self, diffs: &mut Vec<Diff<T>>) {
if diffs.is_empty() {
return;
@@ -1549,11 +1585,14 @@ impl DiffMatchPatch {
&& pre_ins as u8 + pre_del as u8 + post_ins as u8 + post_del as u8 == 3)
{
// Duplicate record.
- let item = equalities.pop().unwrap();
- // change the second copy (after the insert in next line) to Insert
- diffs[item].0 = Ops::Insert;
- // add an item
- diffs.insert(item, Diff::delete(le));
+ if let Some(item) = equalities.pop() {
+ if (0..diffs.len()).contains(&item) {
+ // change the second copy (after the insert in next line) to Insert
+ diffs[item].0 = Ops::Insert;
+ // add an item
+ diffs.insert(item, Diff::delete(le));
+ }
+ }
last_eq = None;
@@ -1575,8 +1614,7 @@ impl DiffMatchPatch {
changes = true;
continue;
};
- // pointer = equalitiesLength > 0 ?
- // equalities[equalitiesLength - 1] : -1;
+
post_ins = false;
post_del = false;
}
@@ -1816,13 +1854,11 @@ impl DiffMatchPatch {
let mut linehash: HashMap<&'a [T], usize> = HashMap::new();
// Allocate 2/3rds of the UTF16::MAX (65535) value space for text1, the rest for text2.
- // let mut maxlines = 5;
let mut maxlines = 40000;
let chars_old = Self::lines_to_chars_internal(old, &mut lines, &mut linehash, maxlines);
// This basically represents the U16::MAX value
maxlines = 65535;
- // maxlines = 7;
let chars_new = Self::lines_to_chars_internal(new, &mut lines, &mut linehash, maxlines);
LineToChars {
@@ -1832,7 +1868,6 @@ impl DiffMatchPatch {
}
}
- #[inline]
fn lines_to_chars_internal<'a, T: DType>(
text: &'a [T],
array: &mut Vec<&'a [T]>,
@@ -1841,7 +1876,6 @@ impl DiffMatchPatch {
) -> Vec<usize> {
let take = maxlines - array.len();
- // let mut lines = ;
let mut charlist = Vec::with_capacity(take);
let mut broke = false;
@@ -1867,7 +1901,7 @@ impl DiffMatchPatch {
});
// We broke at max lines, so we'll account for the remaining text
- if broke {
+ if broke && cursor <= text.len() {
let line = &text[cursor..];
let e = hash.entry(line).or_insert(array.len());
// Fresh insert
@@ -1881,25 +1915,20 @@ impl DiffMatchPatch {
charlist
}
- #[inline]
fn chars_to_lines<T: DType>(diffs: &[Diff<usize>], lines: &[&[T]]) -> Vec<Diff<T>> {
diffs
.iter()
.map(|d| {
- let t = d
- .data()
- .iter()
- .map(|&d| {
- *lines.get(d).unwrap() // investigate
- })
- .collect::<Vec<_>>()
- .concat();
+ let mut line = vec![];
+ d.data().iter().for_each(|d| {
+ if let Some(l) = lines.get(*d) {
+ line.append(&mut l.to_vec())
+ }
+ });
- Diff::new(d.op(), &t)
+ Diff::new(d.op(), &line)
})
.collect::<Vec<_>>()
-
- // res
}
}
@@ -1942,7 +1971,6 @@ impl DiffMatchPatch {
// Is there a nearby exact match? (speedup)
if let Some(best_loc) = text
.windows(pattern.len())
- .step_by(1)
.skip(loc)
.position(|p| p == pattern)
.map(|pos| pos + loc)
@@ -1955,7 +1983,6 @@ impl DiffMatchPatch {
// best_loc = text.lastIndexOf(pattern, loc + pattern.length);
if let Some(best_loc_rev) = text
.windows(pattern.len())
- .step_by(1)
.skip(loc)
.rev()
.position(|p| p == pattern)