Cleanup - dmp - my fork of dmp

|:-------:|:----------------------------------------------------------------------------------------:|:---------:|:----------:|:----------:|:-----------:|:-------:|

-| `rust` | [diff_match_patch v0.1.1](https://crates.io/crates/diff_match_patch)[^2] | 56.618 ms | 10.596 ms | Criterion | - | ✅ |

-| `rust` | [dmp v0.2.0](https://crates.io/crates/dmp) | 56.609 ms | 14.654 ms | Criterion | - | ✅ |

-| `rust` | [diff-match-patch-rs](https://github.com/AnubhabB/diff-match-patch-rs.git)our | 55.149 ms | 631.13 µs | Criterion | `Efficient`| ✅ |

-| `rust` | [diff-match-patch-rs](https://github.com/AnubhabB/diff-match-patch-rs.git)our | 55.275 ms | 1.1703 ms | Criterion | `Compat` | ✅ |

-| `go` | [go-diff](https://github.com/sergi/go-diff) | 50.31 ms | 135.2 ms | go test | - | ✅ |

+| `rust` | [diff_match_patch v0.1.1](https://crates.io/crates/diff_match_patch)[^2] | 56.618 ms | 9.00 ms | Criterion | - | ✅ |

+| `rust` | [dmp v0.2.0](https://crates.io/crates/dmp) | 56.609 ms | 12.25 ms | Criterion | - | ✅ |

+| `rust` | [diff-match-patch-rs](https://github.com/AnubhabB/diff-match-patch-rs.git)our | 34.660 ms | 533.17 µs | Criterion | `Efficient`| ✅ |

+| `rust` | [diff-match-patch-rs](https://github.com/AnubhabB/diff-match-patch-rs.git)our | 35.095 ms | 993.90 µs | Criterion | `Compat` | ✅ |

+| `go` | [go-diff](https://github.com/sergi/go-diff) | 40.31 ms | 135.2 ms | go test | - | ✅ |

| `node` | [diff-match-patch](https://www.npmjs.com/package/diff-match-patch)[^1] | 246.90 ms | 1.07 ms | tinybench | - | ❌ |

| `python`| [diff-match-patch](https://pypi.org/project/diff-match-patch/) | 1.01 s | 0.25 ms | timeit | - | ✅ |

diff --git a/src/dmp.rs b/src/dmp.rs
index 3d4463c..e1905ec 100644
--- a/src/dmp.rs
+++ b/src/dmp.rs

@@ -56,25 +56,27 @@ impl<T: DType> Diff<T> {

Self(op, data.to_vec())

}

- /// helper functions to create ops

+ /// helper functions to create op::Delete

pub fn delete(data: &[T]) -> Self {

Self::new(Ops::Delete, data)

}

+ /// helper functions to create op::Insert

pub fn insert(data: &[T]) -> Self {

Self::new(Ops::Insert, data)

}

+ /// helper functions to create op::Equal

pub fn equal(data: &[T]) -> Self {

Self::new(Ops::Equal, data)

}

- // returns the operation of the current diff

+ /// returns the operation of the current diff

pub fn op(&self) -> Ops {

self.0

}

- // returns the bytes of the data

+ /// returns slice of the data

pub fn data(&self) -> &[T] {

&self.1[..]

}

@@ -263,26 +265,42 @@ impl DiffMatchPatch {

}

// Trim common prefix

- let common_prefix = Self::common_prefix(old_t, new_t, false);

- let common_suffix =

- Self::common_prefix(&old_t[common_prefix..], &new_t[common_prefix..], true);

- let mut diffs = self.compute(

- &old_t[common_prefix..old_t.len() - common_suffix],

- &new_t[common_prefix..new_t.len() - common_suffix],

- linemode,

- deadline,

- )?;

+ let (common_prefix, common_suffix) = Self::common_prefix_suffix(old_t, new_t);

+ let (old_uniq_end, new_uniq_end) =

+ (old_t.len() - common_suffix, new_t.len() - common_suffix);

+ // Some elaborate checks to avoid bound checks

+ // slice_start_index_len_fail

+ // slice_end_index_len_fail

+ // slice_index_order_fail

+ let mut diffs = if common_prefix <= old_t.len()

+ && common_prefix <= new_t.len()

+ && old_uniq_end <= old_t.len()

+ && new_uniq_end <= new_t.len()

+ && common_prefix <= old_uniq_end

+ && common_prefix <= new_uniq_end

+ {

+ self.compute(

+ &old_t[common_prefix..old_uniq_end],

+ &new_t[common_prefix..new_uniq_end],

+ linemode,

+ deadline,

+ )?

+ } else {

+ // This is part of the bound check optimization! Should never fail!

+ return Err(crate::errors::Error::InvalidInput);

+ };

// Restore the prefix and suffix.

- if common_prefix > 0 {

+ if common_prefix > 0 && common_prefix <= old_t.len() {

let mut d = vec![Diff::equal(&old_t[..common_prefix])];

d.append(&mut diffs);

diffs = d;

}

- if common_suffix > 0 {

- diffs.push(Diff::new(Ops::Equal, &new_t[new_t.len() - common_suffix..]));

+ if common_suffix > 0 && new_t.len() >= common_suffix && new_uniq_end < new_t.len() {

+ diffs.push(Diff::new(Ops::Equal, &new_t[new_uniq_end..]));

}

Self::cleanup_merge(&mut diffs);

@@ -314,11 +332,7 @@ impl DiffMatchPatch {

};

// found a subsequence which contains the short text

- if let Some(idx) = long

- .windows(short.len())

- .step_by(1)

- .position(|k| k == short)

- {

+ if let Some(idx) = long.windows(short.len()).position(|k| k == short) {

// Shorter text is inside the longer text (speedup).

let op = if old_gt_new { Ops::Delete } else { Ops::Insert };

let diffs = vec![

@@ -388,11 +402,9 @@ impl DiffMatchPatch {

}

// First check if the second quarter is the seed for a half-match.

- // let hm1 = Self::diff_half_match_i(long, short, (long.len() as f32 / 4.).ceil() as usize);

- let hm1 = Self::half_match_i(long, short, long.len() >> 2);

+ let hm1 = Self::half_match_i(long, short, (long.len() + 3) >> 2);

// Check again based on the third quarter.

- // let hm2 = Self::diff_half_match_i(long, short, (long.len() as f32 / 2.).ceil() as usize);

- let hm2 = Self::half_match_i(long, short, long.len() >> 1);

+ let hm2 = Self::half_match_i(long, short, (long.len() + 1) >> 1);

if hm1.is_none() && hm2.is_none() {

return None;

@@ -458,7 +470,7 @@ impl DiffMatchPatch {

// Add a dummy entry at the end.

diffs.push(Diff::equal(&[]));

let mut pointer = 0_usize;

// count of bytes inserted

@@ -487,7 +499,9 @@ impl DiffMatchPatch {

let idxstart = pointer - delete_n - insert_n;

let idxend = idxstart + delete_n + insert_n;

- diffs.drain(idxstart..idxend);

+ if idxstart <= idxend {

+ diffs.drain(idxstart..idxend);

+ }

pointer = idxstart;

@@ -539,21 +553,32 @@ impl DiffMatchPatch {

}

// Trim common prefix

- let common_prefix = Self::common_prefix(old, new, false);

- let common_suffix = Self::common_prefix(&old[common_prefix..], &new[common_prefix..], true);

- let mut diffs = self.compute_lines(

- &old[common_prefix..old.len() - common_suffix],

- &new[common_prefix..new.len() - common_suffix],

- deadline,

- )?;

+ let (common_prefix, common_suffix) = Self::common_prefix_suffix(old, new);

+ let (old_uniq_end, new_uniq_end) = (old.len() - common_suffix, new.len() - common_suffix);

+ let mut diffs = if common_prefix <= old.len()

+ && common_prefix <= new.len()

+ && old_uniq_end <= old.len()

+ && new_uniq_end <= new.len()

+ && common_prefix <= old_uniq_end

+ && common_prefix <= new_uniq_end

+ {

+ self.compute_lines(

+ &old[common_prefix..old_uniq_end],

+ &new[common_prefix..new_uniq_end],

+ deadline,

+ )?

+ } else {

+ // This is part of the bound check optimization! Should never fail!

+ return Err(crate::errors::Error::InvalidInput);

+ };

- if common_prefix > 0 {

+ if common_prefix > 0 && common_prefix <= old.len() {

diffs.insert(0, Diff::equal(&old[..common_prefix]));

}

- if common_suffix > 0 {

- diffs.push(Diff::new(Ops::Equal, &new[new.len() - common_suffix..]));

+ if common_suffix > 0 && new.len() >= common_suffix && new_uniq_end < new.len() {

+ diffs.push(Diff::new(Ops::Equal, &new[new_uniq_end..]));

}

Self::cleanup_merge(&mut diffs);

@@ -584,11 +609,7 @@ impl DiffMatchPatch {

};

// found a subsequence which contains the short text

- if let Some(idx) = long

- .windows(short.len())

- .step_by(1)

- .position(|k| k == short)

- {

+ if let Some(idx) = long.windows(short.len()).position(|k| k == short) {

// Shorter text is inside the longer text (speedup).

let op = if old_gt_new { Ops::Delete } else { Ops::Insert };

let diffs = vec![

@@ -872,16 +893,20 @@ impl DiffMatchPatch {

// Does a substring of shorttext exist within longtext such that the substring

// is at least half the length of longtext?

// idx Start index of quarter length substring within longtext.

- #[inline]

fn half_match_i<'a, T: DType>(

long: &'a [T],

short: &'a [T],

idx: usize,

) -> Option<HalfMatch<'a, T>> {

// Start with a 1/4 length substring at position i as a seed.

- let seed = &long[idx..idx + (long.len() >> 2)];

- let seedleen = seed.len();

+ let seed = {

+ let end = idx + (long.len() >> 2);

+ if idx < long.len() && end < long.len() {

+ &long[idx..end]

+ } else {

+ return None;

+ }

+ };

let mut j = 0;

let mut best_common: &[T] = &[];

@@ -890,27 +915,47 @@ impl DiffMatchPatch {

let mut best_short_a: &[T] = &[];

let mut best_short_b: &[T] = &[];

- while let Some(pos) = &short[j..]

- .windows(seedleen)

- .step_by(1)

- .position(|p| p == seed)

- {

- j += *pos;

+ while j < short.len() {

+ if let Some(pos) = &short[j..].windows(seed.len()).position(|p| p == seed) {

+ j += *pos;

- let prefix_len = Self::common_prefix(&long[idx..], &short[j..], false);

- let suffix_len = Self::common_prefix(&long[..idx], &short[..j], true);

+ let (prefix_len, suffix_len) = if idx < long.len() && j < short.len() {

+ (

+ Self::common_prefix(&long[idx..], &short[j..], false),

+ Self::common_prefix(&long[..idx], &short[..j], true),

+ )

+ } else {

+ // this bound check shouldn't fail .. since it has, no point going further

+ return None;

+ };

- if best_common.len() < suffix_len + prefix_len {

- best_common = &short[j - suffix_len..j + prefix_len];

+ if best_common.len() < suffix_len + prefix_len {

+ let j_min_suf = j - suffix_len;

+ let j_pls_prf = j + prefix_len;

+ let i_min_suf = idx - suffix_len;

+ let i_pls_prf = idx + prefix_len;

+ // Optimizing for bound checks

+ if j_min_suf < j_pls_prf

+ && j_min_suf <= short.len()

+ && j_pls_prf <= short.len()

+ && i_min_suf <= long.len()

+ && i_pls_prf <= long.len()

+ {

+ best_common = &short[j_min_suf..j_pls_prf];

- best_long_a = &long[..idx - suffix_len];

- best_long_b = &long[idx + prefix_len..];

+ best_long_a = &long[..i_min_suf];

+ best_long_b = &long[i_pls_prf..];

- best_short_a = &short[..j - suffix_len];

- best_short_b = &short[j + prefix_len..];

- }

+ best_short_a = &short[..j_min_suf];

+ best_short_b = &short[j_pls_prf..];

+ }

- j += 1;

+ j += 1;

+ } else {

+ break;

+ }

}

if (best_common.len() << 1) >= long.len() {

@@ -930,8 +975,6 @@ impl DiffMatchPatch {

// We are doing a binary search here, and I've observed similar performance as noted by https://neil.fraser.name/news/2007/10/09/

// Some benchmark code can be found in benches/prefix.rs

// Reverse prefix is suffix

- // TODO: investigate this further

- #[inline]

fn common_prefix<T: DType>(lhs: &[T], rhs: &[T], reverse: bool) -> usize {

if lhs.is_empty()

|| rhs.is_empty()

@@ -957,7 +1000,10 @@ impl DiffMatchPatch {

)

};

- if lhs[lhsrange] == rhs[rhsrange] {

+ if lhsrange.end <= lhs.len()

+ && rhsrange.end <= rhs.len()

+ && lhs[lhsrange] == rhs[rhsrange]

+ {

pointmin = pointmid;

pointstart = pointmin;

} else {

@@ -970,6 +1016,19 @@ impl DiffMatchPatch {

pointmid

}

+ fn common_prefix_suffix<T: DType>(lhs: &[T], rhs: &[T]) -> (usize, usize) {

+ let common_prefix = Self::common_prefix(lhs, rhs, false);

+ let common_suffix = {

+ if common_prefix < lhs.len() && common_prefix < rhs.len() {

+ Self::common_prefix(&lhs[common_prefix..], &rhs[common_prefix..], true)

+ } else {

+ 0

+ }

+ };

+ (common_prefix, common_suffix)

+ }

fn common_overlap<T: DType>(lhs: &[T], rhs: &[T]) -> usize {

if lhs.is_empty() || rhs.is_empty() {

return 0;

@@ -993,11 +1052,7 @@ impl DiffMatchPatch {

let mut len = 1;

let mut best = 0;

- while let Some(found) = r

- .windows(len)

- .step_by(1)

- .position(|p| p == &l[l.len() - len..])

- {

+ while let Some(found) = r.windows(len).position(|p| p == &l[l.len() - len..]) {

len += found;

if found == 0 || l[l.len() - len..] == r[..len] {

best = len;

@@ -1009,7 +1064,6 @@ impl DiffMatchPatch {

}

// Reduce the number of edits by eliminating semantically trivial equalities

- #[inline]

fn cleanup_semantic<T: DType>(diffs: &mut Vec<Diff<T>>) {

let mut changes = false;

@@ -1026,9 +1080,7 @@ impl DiffMatchPatch {

let mut insert_len_post = 0_usize;

let mut delete_len_post = 0_usize;

- let mut difflen = diffs.len();

- while pointer < difflen {

+ while pointer < diffs.len() {

let mut diff_mod = false;

if diffs[pointer].op() == Ops::Equal {

equalities.push(pointer);

@@ -1061,8 +1113,6 @@ impl DiffMatchPatch {

diffs.insert(last, Diff::delete(last_eq));

// Change the other copy to insert

diffs[last + 1].0 = Ops::Insert;

- // change diff length

- difflen = diffs.len();

// Throw away the equality we just deleted.

equalities.pop();

@@ -1100,8 +1150,6 @@ impl DiffMatchPatch {

Self::cleanup_semantic_lossless(diffs);

- difflen = diffs.len();

// Find any overlaps between deletions and insertions.

// e.g: <del>abcxxx</del><ins>xxxdef</ins>

// -> <del>abc</del>xxx<ins>def</ins>

@@ -1109,7 +1157,7 @@ impl DiffMatchPatch {

// -> <ins>def</ins>xxx<del>abc</del>

// Only extract an overlap if it is as big as the edit ahead or behind it.

pointer = 1;

- while difflen > 0 && pointer < difflen {

+ while !diffs.is_empty() && pointer < diffs.len() {

if diffs[pointer - 1].op() == Ops::Delete && diffs[pointer].op() == Ops::Insert {

let delete = diffs[pointer - 1].data().to_vec();

let insert = diffs[pointer].data().to_vec();

@@ -1127,7 +1175,7 @@ impl DiffMatchPatch {

diffs.insert(pointer, Diff::equal(&insert[..overlap_1]));

diffs[pointer - 1].1 = delete[..delete.len() - overlap_1].to_vec();

diffs[pointer + 1].1 = insert[overlap_1..].to_vec();

- difflen = diffs.len();

pointer += 1;

} else if overlap_2 >= delete_thres || overlap_2 >= insert_thres {

// Reverse overlap

@@ -1136,7 +1184,6 @@ impl DiffMatchPatch {

diffs[pointer - 1] = Diff::insert(&insert[..insert.len() - overlap_2]);

diffs[pointer + 1] = Diff::delete(&delete[overlap_2..]);

- difflen = diffs.len();

pointer += 1;

}

pointer += 1;

@@ -1147,19 +1194,13 @@ impl DiffMatchPatch {

// Look for single edits surrounded on both sides by equalities

// e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.

- #[inline]

fn cleanup_semantic_lossless<T: DType>(diffs: &mut Vec<Diff<T>>) {

let mut pointer = 1_usize;

- let mut difflen = diffs.len();

// Intentionally ignore the first and last element (don't need checking).

- while difflen > 0 && pointer < difflen - 1 {

+ while !diffs.is_empty() && (1..diffs.len() - 1).contains(&pointer) {

// an edit surrounded by equalities

if diffs[pointer - 1].op() == Ops::Equal && diffs[pointer + 1].op() == Ops::Equal {

- // let mut equality_prev = diffs[pointer - 1].data().to_vec();

- // let mut edit = diffs[pointer].data().to_vec();

- // let mut equality_next = diffs[pointer + 1].data().to_vec();

// Shift the edit as far left as possible

let (mut equality_prev, mut edit, mut equality_next) = {

let commonlen =

@@ -1194,7 +1235,7 @@ impl DiffMatchPatch {

let mut best_score = Self::cleanup_semantic_score(&equality_prev[..], &edit[..])

+ Self::cleanup_semantic_score(&edit[..], &equality_next[..]);

- while !edit.is_empty() && !equality_next.is_empty() && edit[0] == equality_next[0] {

+ while !edit.is_empty() && edit.first() == equality_next.first() {

equality_prev.push(edit[0]);

edit.remove(0);

edit.push(equality_next[0]);

@@ -1220,7 +1261,6 @@ impl DiffMatchPatch {

} else {

diffs.remove(pointer - 1);

pointer -= 1;

- difflen = diffs.len();

}

diffs[pointer].1.clone_from(&best_edit);

@@ -1230,7 +1270,6 @@ impl DiffMatchPatch {

} else {

diffs.remove(pointer + 1);

pointer -= 1;

- difflen = diffs.len();

}

@@ -1242,7 +1281,6 @@ impl DiffMatchPatch {

// Given two strings, compute a score representing whether the internal

// boundary falls on logical boundaries

// Scores range from 6 (best) to 0 (worst)

- #[inline]

fn cleanup_semantic_score<T: DType>(one: &[T], two: &[T]) -> u8 {

let (char1, char2) = if let (Some(&char1), Some(&char2)) = (one.last(), two.first()) {

if let (Some(c1), Some(c2)) = (char1.as_char(), char2.as_char()) {

@@ -1294,9 +1332,7 @@ impl DiffMatchPatch {

fn cleanup_merge<T: DType>(diffs: &mut Vec<Diff<T>>) {

Self::cleanup_merge_first(diffs);

- let changes = Self::cleanup_merge_second(diffs);

- if changes {

+ if Self::cleanup_merge_second(diffs) {

Self::cleanup_merge(diffs);

}

@@ -1328,7 +1364,7 @@ impl DiffMatchPatch {

pointer += 1;

}

Ops::Equal => {

- Self::cleanup_perge_proc_equal(

+ Self::cleanup_merge_proc_equal(

diffs,

insert_n,

delete_n,

@@ -1412,7 +1448,8 @@ impl DiffMatchPatch {

changes

}

- fn cleanup_perge_proc_equal<T: DType>(

+ // Handling `Equality` in the merge process

+ fn cleanup_merge_proc_equal<T: DType>(

diffs: &mut Vec<Diff<T>>,

insert_n: usize,

delete_n: usize,

@@ -1487,7 +1524,6 @@ impl DiffMatchPatch {

}

// Reduce the number of edits by eliminating operationally trivial equalities.

- #[inline]

fn cleanup_efficiency<T: DType>(&self, diffs: &mut Vec<Diff<T>>) {

if diffs.is_empty() {

return;

@@ -1549,11 +1585,14 @@ impl DiffMatchPatch {

&& pre_ins as u8 + pre_del as u8 + post_ins as u8 + post_del as u8 == 3)

{

// Duplicate record.

- let item = equalities.pop().unwrap();

- // change the second copy (after the insert in next line) to Insert

- diffs[item].0 = Ops::Insert;

- // add an item

- diffs.insert(item, Diff::delete(le));

+ if let Some(item) = equalities.pop() {

+ if (0..diffs.len()).contains(&item) {

+ // change the second copy (after the insert in next line) to Insert

+ diffs[item].0 = Ops::Insert;

+ // add an item

+ diffs.insert(item, Diff::delete(le));

+ }

last_eq = None;

@@ -1575,8 +1614,7 @@ impl DiffMatchPatch {

changes = true;

continue;

};

- // pointer = equalitiesLength > 0 ?

- // equalities[equalitiesLength - 1] : -1;

post_ins = false;

post_del = false;

}

@@ -1816,13 +1854,11 @@ impl DiffMatchPatch {

let mut linehash: HashMap<&'a [T], usize> = HashMap::new();

// Allocate 2/3rds of the UTF16::MAX (65535) value space for text1, the rest for text2.

- // let mut maxlines = 5;

let mut maxlines = 40000;

let chars_old = Self::lines_to_chars_internal(old, &mut lines, &mut linehash, maxlines);

// This basically represents the U16::MAX value

maxlines = 65535;

- // maxlines = 7;

let chars_new = Self::lines_to_chars_internal(new, &mut lines, &mut linehash, maxlines);

LineToChars {

@@ -1832,7 +1868,6 @@ impl DiffMatchPatch {

}

- #[inline]

fn lines_to_chars_internal<'a, T: DType>(

text: &'a [T],

array: &mut Vec<&'a [T]>,

@@ -1841,7 +1876,6 @@ impl DiffMatchPatch {

) -> Vec<usize> {

let take = maxlines - array.len();

- // let mut lines = ;

let mut charlist = Vec::with_capacity(take);

let mut broke = false;

@@ -1867,7 +1901,7 @@ impl DiffMatchPatch {

});

// We broke at max lines, so we'll account for the remaining text

- if broke {

+ if broke && cursor <= text.len() {

let line = &text[cursor..];

let e = hash.entry(line).or_insert(array.len());

// Fresh insert

@@ -1881,25 +1915,20 @@ impl DiffMatchPatch {

charlist

}

- #[inline]

fn chars_to_lines<T: DType>(diffs: &[Diff<usize>], lines: &[&[T]]) -> Vec<Diff<T>> {

diffs

.iter()

.map(|d| {

- let t = d

- .data()

- .iter()

- .map(|&d| {

- *lines.get(d).unwrap() // investigate

- })

- .collect::<Vec<_>>()

- .concat();

+ let mut line = vec![];

+ d.data().iter().for_each(|d| {

+ if let Some(l) = lines.get(*d) {

+ line.append(&mut l.to_vec())

+ }

+ });

- Diff::new(d.op(), &t)

+ Diff::new(d.op(), &line)

})

.collect::<Vec<_>>()

- // res

}

@@ -1942,7 +1971,6 @@ impl DiffMatchPatch {

// Is there a nearby exact match? (speedup)

if let Some(best_loc) = text

.windows(pattern.len())

- .step_by(1)

.skip(loc)

.position(|p| p == pattern)

.map(|pos| pos + loc)

@@ -1955,7 +1983,6 @@ impl DiffMatchPatch {

// best_loc = text.lastIndexOf(pattern, loc + pattern.length);

if let Some(best_loc_rev) = text

.windows(pattern.len())

- .step_by(1)

.skip(loc)

.rev()

.position(|p| p == pattern)