my fork of dmp
Cleanup
| -rw-r--r-- | README.md | 10 | ||||
| -rw-r--r-- | src/dmp.rs | 277 |
2 files changed, 157 insertions, 130 deletions
@@ -180,11 +180,11 @@ Benchmarks are maintained [diff-match-patch-bench repository](https://github.com | Lang. | Library | Diff Avg. | Patch Avg. | Bencher | Mode | Correct | |:-------:|:----------------------------------------------------------------------------------------:|:---------:|:----------:|:----------:|:-----------:|:-------:| -| `rust` | [diff_match_patch v0.1.1](https://crates.io/crates/diff_match_patch)[^2] | 56.618 ms | 10.596 ms | Criterion | - | ✅ | -| `rust` | [dmp v0.2.0](https://crates.io/crates/dmp) | 56.609 ms | 14.654 ms | Criterion | - | ✅ | -| `rust` | [diff-match-patch-rs](https://github.com/AnubhabB/diff-match-patch-rs.git)<sup>our</sup> | 55.149 ms | 631.13 µs | Criterion | `Efficient`| ✅ | -| `rust` | [diff-match-patch-rs](https://github.com/AnubhabB/diff-match-patch-rs.git)<sup>our</sup> | 55.275 ms | 1.1703 ms | Criterion | `Compat` | ✅ | -| `go` | [go-diff](https://github.com/sergi/go-diff) | 50.31 ms | 135.2 ms | go test | - | ✅ | +| `rust` | [diff_match_patch v0.1.1](https://crates.io/crates/diff_match_patch)[^2] | 56.618 ms | 9.00 ms | Criterion | - | ✅ | +| `rust` | [dmp v0.2.0](https://crates.io/crates/dmp) | 56.609 ms | 12.25 ms | Criterion | - | ✅ | +| `rust` | [diff-match-patch-rs](https://github.com/AnubhabB/diff-match-patch-rs.git)<sup>our</sup> | 34.660 ms | 533.17 µs | Criterion | `Efficient`| ✅ | +| `rust` | [diff-match-patch-rs](https://github.com/AnubhabB/diff-match-patch-rs.git)<sup>our</sup> | 35.095 ms | 993.90 µs | Criterion | `Compat` | ✅ | +| `go` | [go-diff](https://github.com/sergi/go-diff) | 40.31 ms | 135.2 ms | go test | - | ✅ | | `node` | [diff-match-patch](https://www.npmjs.com/package/diff-match-patch)[^1] | 246.90 ms | 1.07 ms | tinybench | - | ❌ | | `python`| [diff-match-patch](https://pypi.org/project/diff-match-patch/) | 1.01 s | 0.25 ms | timeit | - | ✅ | @@ -56,25 +56,27 @@ impl<T: DType> Diff<T> { Self(op, data.to_vec()) } - /// helper functions to create ops + /// helper functions to create op::Delete pub fn delete(data: &[T]) -> Self { Self::new(Ops::Delete, data) } + /// helper functions to create op::Insert pub fn insert(data: &[T]) -> Self { Self::new(Ops::Insert, data) } + /// helper functions to create op::Equal pub fn equal(data: &[T]) -> Self { Self::new(Ops::Equal, data) } - // returns the operation of the current diff + /// returns the operation of the current diff pub fn op(&self) -> Ops { self.0 } - // returns the bytes of the data + /// returns slice of the data pub fn data(&self) -> &[T] { &self.1[..] } @@ -263,26 +265,42 @@ impl DiffMatchPatch { } // Trim common prefix - let common_prefix = Self::common_prefix(old_t, new_t, false); - let common_suffix = - Self::common_prefix(&old_t[common_prefix..], &new_t[common_prefix..], true); - - let mut diffs = self.compute( - &old_t[common_prefix..old_t.len() - common_suffix], - &new_t[common_prefix..new_t.len() - common_suffix], - linemode, - deadline, - )?; + let (common_prefix, common_suffix) = Self::common_prefix_suffix(old_t, new_t); + + let (old_uniq_end, new_uniq_end) = + (old_t.len() - common_suffix, new_t.len() - common_suffix); + + // Some elaborate checks to avoid bound checks + // slice_start_index_len_fail + // slice_end_index_len_fail + // slice_index_order_fail + let mut diffs = if common_prefix <= old_t.len() + && common_prefix <= new_t.len() + && old_uniq_end <= old_t.len() + && new_uniq_end <= new_t.len() + && common_prefix <= old_uniq_end + && common_prefix <= new_uniq_end + { + self.compute( + &old_t[common_prefix..old_uniq_end], + &new_t[common_prefix..new_uniq_end], + linemode, + deadline, + )? + } else { + // This is part of the bound check optimization! Should never fail! + return Err(crate::errors::Error::InvalidInput); + }; // Restore the prefix and suffix. - if common_prefix > 0 { + if common_prefix > 0 && common_prefix <= old_t.len() { let mut d = vec![Diff::equal(&old_t[..common_prefix])]; d.append(&mut diffs); diffs = d; } - if common_suffix > 0 { - diffs.push(Diff::new(Ops::Equal, &new_t[new_t.len() - common_suffix..])); + if common_suffix > 0 && new_t.len() >= common_suffix && new_uniq_end < new_t.len() { + diffs.push(Diff::new(Ops::Equal, &new_t[new_uniq_end..])); } Self::cleanup_merge(&mut diffs); @@ -314,11 +332,7 @@ impl DiffMatchPatch { }; // found a subsequence which contains the short text - if let Some(idx) = long - .windows(short.len()) - .step_by(1) - .position(|k| k == short) - { + if let Some(idx) = long.windows(short.len()).position(|k| k == short) { // Shorter text is inside the longer text (speedup). let op = if old_gt_new { Ops::Delete } else { Ops::Insert }; let diffs = vec![ @@ -388,11 +402,9 @@ impl DiffMatchPatch { } // First check if the second quarter is the seed for a half-match. - // let hm1 = Self::diff_half_match_i(long, short, (long.len() as f32 / 4.).ceil() as usize); - let hm1 = Self::half_match_i(long, short, long.len() >> 2); + let hm1 = Self::half_match_i(long, short, (long.len() + 3) >> 2); // Check again based on the third quarter. - // let hm2 = Self::diff_half_match_i(long, short, (long.len() as f32 / 2.).ceil() as usize); - let hm2 = Self::half_match_i(long, short, long.len() >> 1); + let hm2 = Self::half_match_i(long, short, (long.len() + 1) >> 1); if hm1.is_none() && hm2.is_none() { return None; @@ -458,7 +470,7 @@ impl DiffMatchPatch { // Add a dummy entry at the end. diffs.push(Diff::equal(&[])); - + let mut pointer = 0_usize; // count of bytes inserted @@ -487,7 +499,9 @@ impl DiffMatchPatch { let idxstart = pointer - delete_n - insert_n; let idxend = idxstart + delete_n + insert_n; - diffs.drain(idxstart..idxend); + if idxstart <= idxend { + diffs.drain(idxstart..idxend); + } pointer = idxstart; @@ -539,21 +553,32 @@ impl DiffMatchPatch { } // Trim common prefix - let common_prefix = Self::common_prefix(old, new, false); - let common_suffix = Self::common_prefix(&old[common_prefix..], &new[common_prefix..], true); - - let mut diffs = self.compute_lines( - &old[common_prefix..old.len() - common_suffix], - &new[common_prefix..new.len() - common_suffix], - deadline, - )?; + let (common_prefix, common_suffix) = Self::common_prefix_suffix(old, new); + + let (old_uniq_end, new_uniq_end) = (old.len() - common_suffix, new.len() - common_suffix); + let mut diffs = if common_prefix <= old.len() + && common_prefix <= new.len() + && old_uniq_end <= old.len() + && new_uniq_end <= new.len() + && common_prefix <= old_uniq_end + && common_prefix <= new_uniq_end + { + self.compute_lines( + &old[common_prefix..old_uniq_end], + &new[common_prefix..new_uniq_end], + deadline, + )? + } else { + // This is part of the bound check optimization! Should never fail! + return Err(crate::errors::Error::InvalidInput); + }; - if common_prefix > 0 { + if common_prefix > 0 && common_prefix <= old.len() { diffs.insert(0, Diff::equal(&old[..common_prefix])); } - if common_suffix > 0 { - diffs.push(Diff::new(Ops::Equal, &new[new.len() - common_suffix..])); + if common_suffix > 0 && new.len() >= common_suffix && new_uniq_end < new.len() { + diffs.push(Diff::new(Ops::Equal, &new[new_uniq_end..])); } Self::cleanup_merge(&mut diffs); @@ -584,11 +609,7 @@ impl DiffMatchPatch { }; // found a subsequence which contains the short text - if let Some(idx) = long - .windows(short.len()) - .step_by(1) - .position(|k| k == short) - { + if let Some(idx) = long.windows(short.len()).position(|k| k == short) { // Shorter text is inside the longer text (speedup). let op = if old_gt_new { Ops::Delete } else { Ops::Insert }; let diffs = vec![ @@ -872,16 +893,20 @@ impl DiffMatchPatch { // Does a substring of shorttext exist within longtext such that the substring // is at least half the length of longtext? // idx Start index of quarter length substring within longtext. - #[inline] fn half_match_i<'a, T: DType>( long: &'a [T], short: &'a [T], idx: usize, ) -> Option<HalfMatch<'a, T>> { // Start with a 1/4 length substring at position i as a seed. - - let seed = &long[idx..idx + (long.len() >> 2)]; - let seedleen = seed.len(); + let seed = { + let end = idx + (long.len() >> 2); + if idx < long.len() && end < long.len() { + &long[idx..end] + } else { + return None; + } + }; let mut j = 0; let mut best_common: &[T] = &[]; @@ -890,27 +915,47 @@ impl DiffMatchPatch { let mut best_short_a: &[T] = &[]; let mut best_short_b: &[T] = &[]; - while let Some(pos) = &short[j..] - .windows(seedleen) - .step_by(1) - .position(|p| p == seed) - { - j += *pos; + while j < short.len() { + if let Some(pos) = &short[j..].windows(seed.len()).position(|p| p == seed) { + j += *pos; - let prefix_len = Self::common_prefix(&long[idx..], &short[j..], false); - let suffix_len = Self::common_prefix(&long[..idx], &short[..j], true); + let (prefix_len, suffix_len) = if idx < long.len() && j < short.len() { + ( + Self::common_prefix(&long[idx..], &short[j..], false), + Self::common_prefix(&long[..idx], &short[..j], true), + ) + } else { + // this bound check shouldn't fail .. since it has, no point going further + return None; + }; - if best_common.len() < suffix_len + prefix_len { - best_common = &short[j - suffix_len..j + prefix_len]; + if best_common.len() < suffix_len + prefix_len { + let j_min_suf = j - suffix_len; + let j_pls_prf = j + prefix_len; + let i_min_suf = idx - suffix_len; + let i_pls_prf = idx + prefix_len; + + // Optimizing for bound checks + if j_min_suf < j_pls_prf + && j_min_suf <= short.len() + && j_pls_prf <= short.len() + && i_min_suf <= long.len() + && i_pls_prf <= long.len() + { + best_common = &short[j_min_suf..j_pls_prf]; - best_long_a = &long[..idx - suffix_len]; - best_long_b = &long[idx + prefix_len..]; + best_long_a = &long[..i_min_suf]; + best_long_b = &long[i_pls_prf..]; - best_short_a = &short[..j - suffix_len]; - best_short_b = &short[j + prefix_len..]; - } + best_short_a = &short[..j_min_suf]; + best_short_b = &short[j_pls_prf..]; + } + } - j += 1; + j += 1; + } else { + break; + } } if (best_common.len() << 1) >= long.len() { @@ -930,8 +975,6 @@ impl DiffMatchPatch { // We are doing a binary search here, and I've observed similar performance as noted by https://neil.fraser.name/news/2007/10/09/ // Some benchmark code can be found in benches/prefix.rs // Reverse prefix is suffix - // TODO: investigate this further - #[inline] fn common_prefix<T: DType>(lhs: &[T], rhs: &[T], reverse: bool) -> usize { if lhs.is_empty() || rhs.is_empty() @@ -957,7 +1000,10 @@ impl DiffMatchPatch { ) }; - if lhs[lhsrange] == rhs[rhsrange] { + if lhsrange.end <= lhs.len() + && rhsrange.end <= rhs.len() + && lhs[lhsrange] == rhs[rhsrange] + { pointmin = pointmid; pointstart = pointmin; } else { @@ -970,6 +1016,19 @@ impl DiffMatchPatch { pointmid } + fn common_prefix_suffix<T: DType>(lhs: &[T], rhs: &[T]) -> (usize, usize) { + let common_prefix = Self::common_prefix(lhs, rhs, false); + let common_suffix = { + if common_prefix < lhs.len() && common_prefix < rhs.len() { + Self::common_prefix(&lhs[common_prefix..], &rhs[common_prefix..], true) + } else { + 0 + } + }; + + (common_prefix, common_suffix) + } + fn common_overlap<T: DType>(lhs: &[T], rhs: &[T]) -> usize { if lhs.is_empty() || rhs.is_empty() { return 0; @@ -993,11 +1052,7 @@ impl DiffMatchPatch { let mut len = 1; let mut best = 0; - while let Some(found) = r - .windows(len) - .step_by(1) - .position(|p| p == &l[l.len() - len..]) - { + while let Some(found) = r.windows(len).position(|p| p == &l[l.len() - len..]) { len += found; if found == 0 || l[l.len() - len..] == r[..len] { best = len; @@ -1009,7 +1064,6 @@ impl DiffMatchPatch { } // Reduce the number of edits by eliminating semantically trivial equalities - #[inline] fn cleanup_semantic<T: DType>(diffs: &mut Vec<Diff<T>>) { let mut changes = false; @@ -1026,9 +1080,7 @@ impl DiffMatchPatch { let mut insert_len_post = 0_usize; let mut delete_len_post = 0_usize; - let mut difflen = diffs.len(); - - while pointer < difflen { + while pointer < diffs.len() { let mut diff_mod = false; if diffs[pointer].op() == Ops::Equal { equalities.push(pointer); @@ -1061,8 +1113,6 @@ impl DiffMatchPatch { diffs.insert(last, Diff::delete(last_eq)); // Change the other copy to insert diffs[last + 1].0 = Ops::Insert; - // change diff length - difflen = diffs.len(); // Throw away the equality we just deleted. equalities.pop(); @@ -1100,8 +1150,6 @@ impl DiffMatchPatch { Self::cleanup_semantic_lossless(diffs); - difflen = diffs.len(); - // Find any overlaps between deletions and insertions. // e.g: <del>abcxxx</del><ins>xxxdef</ins> // -> <del>abc</del>xxx<ins>def</ins> @@ -1109,7 +1157,7 @@ impl DiffMatchPatch { // -> <ins>def</ins>xxx<del>abc</del> // Only extract an overlap if it is as big as the edit ahead or behind it. pointer = 1; - while difflen > 0 && pointer < difflen { + while !diffs.is_empty() && pointer < diffs.len() { if diffs[pointer - 1].op() == Ops::Delete && diffs[pointer].op() == Ops::Insert { let delete = diffs[pointer - 1].data().to_vec(); let insert = diffs[pointer].data().to_vec(); @@ -1127,7 +1175,7 @@ impl DiffMatchPatch { diffs.insert(pointer, Diff::equal(&insert[..overlap_1])); diffs[pointer - 1].1 = delete[..delete.len() - overlap_1].to_vec(); diffs[pointer + 1].1 = insert[overlap_1..].to_vec(); - difflen = diffs.len(); + pointer += 1; } else if overlap_2 >= delete_thres || overlap_2 >= insert_thres { // Reverse overlap @@ -1136,7 +1184,6 @@ impl DiffMatchPatch { diffs[pointer - 1] = Diff::insert(&insert[..insert.len() - overlap_2]); diffs[pointer + 1] = Diff::delete(&delete[overlap_2..]); - difflen = diffs.len(); pointer += 1; } pointer += 1; @@ -1147,19 +1194,13 @@ impl DiffMatchPatch { // Look for single edits surrounded on both sides by equalities // e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came. - #[inline] fn cleanup_semantic_lossless<T: DType>(diffs: &mut Vec<Diff<T>>) { let mut pointer = 1_usize; - let mut difflen = diffs.len(); // Intentionally ignore the first and last element (don't need checking). - while difflen > 0 && pointer < difflen - 1 { + while !diffs.is_empty() && (1..diffs.len() - 1).contains(&pointer) { // an edit surrounded by equalities if diffs[pointer - 1].op() == Ops::Equal && diffs[pointer + 1].op() == Ops::Equal { - // let mut equality_prev = diffs[pointer - 1].data().to_vec(); - // let mut edit = diffs[pointer].data().to_vec(); - // let mut equality_next = diffs[pointer + 1].data().to_vec(); - // Shift the edit as far left as possible let (mut equality_prev, mut edit, mut equality_next) = { let commonlen = @@ -1194,7 +1235,7 @@ impl DiffMatchPatch { let mut best_score = Self::cleanup_semantic_score(&equality_prev[..], &edit[..]) + Self::cleanup_semantic_score(&edit[..], &equality_next[..]); - while !edit.is_empty() && !equality_next.is_empty() && edit[0] == equality_next[0] { + while !edit.is_empty() && edit.first() == equality_next.first() { equality_prev.push(edit[0]); edit.remove(0); edit.push(equality_next[0]); @@ -1220,7 +1261,6 @@ impl DiffMatchPatch { } else { diffs.remove(pointer - 1); pointer -= 1; - difflen = diffs.len(); } diffs[pointer].1.clone_from(&best_edit); @@ -1230,7 +1270,6 @@ impl DiffMatchPatch { } else { diffs.remove(pointer + 1); pointer -= 1; - difflen = diffs.len(); } } } @@ -1242,7 +1281,6 @@ impl DiffMatchPatch { // Given two strings, compute a score representing whether the internal // boundary falls on logical boundaries // Scores range from 6 (best) to 0 (worst) - #[inline] fn cleanup_semantic_score<T: DType>(one: &[T], two: &[T]) -> u8 { let (char1, char2) = if let (Some(&char1), Some(&char2)) = (one.last(), two.first()) { if let (Some(c1), Some(c2)) = (char1.as_char(), char2.as_char()) { @@ -1294,9 +1332,7 @@ impl DiffMatchPatch { fn cleanup_merge<T: DType>(diffs: &mut Vec<Diff<T>>) { Self::cleanup_merge_first(diffs); - let changes = Self::cleanup_merge_second(diffs); - - if changes { + if Self::cleanup_merge_second(diffs) { Self::cleanup_merge(diffs); } } @@ -1328,7 +1364,7 @@ impl DiffMatchPatch { pointer += 1; } Ops::Equal => { - Self::cleanup_perge_proc_equal( + Self::cleanup_merge_proc_equal( diffs, insert_n, delete_n, @@ -1412,7 +1448,8 @@ impl DiffMatchPatch { changes } - fn cleanup_perge_proc_equal<T: DType>( + // Handling `Equality` in the merge process + fn cleanup_merge_proc_equal<T: DType>( diffs: &mut Vec<Diff<T>>, insert_n: usize, delete_n: usize, @@ -1487,7 +1524,6 @@ impl DiffMatchPatch { } // Reduce the number of edits by eliminating operationally trivial equalities. - #[inline] fn cleanup_efficiency<T: DType>(&self, diffs: &mut Vec<Diff<T>>) { if diffs.is_empty() { return; @@ -1549,11 +1585,14 @@ impl DiffMatchPatch { && pre_ins as u8 + pre_del as u8 + post_ins as u8 + post_del as u8 == 3) { // Duplicate record. - let item = equalities.pop().unwrap(); - // change the second copy (after the insert in next line) to Insert - diffs[item].0 = Ops::Insert; - // add an item - diffs.insert(item, Diff::delete(le)); + if let Some(item) = equalities.pop() { + if (0..diffs.len()).contains(&item) { + // change the second copy (after the insert in next line) to Insert + diffs[item].0 = Ops::Insert; + // add an item + diffs.insert(item, Diff::delete(le)); + } + } last_eq = None; @@ -1575,8 +1614,7 @@ impl DiffMatchPatch { changes = true; continue; }; - // pointer = equalitiesLength > 0 ? - // equalities[equalitiesLength - 1] : -1; + post_ins = false; post_del = false; } @@ -1816,13 +1854,11 @@ impl DiffMatchPatch { let mut linehash: HashMap<&'a [T], usize> = HashMap::new(); // Allocate 2/3rds of the UTF16::MAX (65535) value space for text1, the rest for text2. - // let mut maxlines = 5; let mut maxlines = 40000; let chars_old = Self::lines_to_chars_internal(old, &mut lines, &mut linehash, maxlines); // This basically represents the U16::MAX value maxlines = 65535; - // maxlines = 7; let chars_new = Self::lines_to_chars_internal(new, &mut lines, &mut linehash, maxlines); LineToChars { @@ -1832,7 +1868,6 @@ impl DiffMatchPatch { } } - #[inline] fn lines_to_chars_internal<'a, T: DType>( text: &'a [T], array: &mut Vec<&'a [T]>, @@ -1841,7 +1876,6 @@ impl DiffMatchPatch { ) -> Vec<usize> { let take = maxlines - array.len(); - // let mut lines = ; let mut charlist = Vec::with_capacity(take); let mut broke = false; @@ -1867,7 +1901,7 @@ impl DiffMatchPatch { }); // We broke at max lines, so we'll account for the remaining text - if broke { + if broke && cursor <= text.len() { let line = &text[cursor..]; let e = hash.entry(line).or_insert(array.len()); // Fresh insert @@ -1881,25 +1915,20 @@ impl DiffMatchPatch { charlist } - #[inline] fn chars_to_lines<T: DType>(diffs: &[Diff<usize>], lines: &[&[T]]) -> Vec<Diff<T>> { diffs .iter() .map(|d| { - let t = d - .data() - .iter() - .map(|&d| { - *lines.get(d).unwrap() // investigate - }) - .collect::<Vec<_>>() - .concat(); + let mut line = vec![]; + d.data().iter().for_each(|d| { + if let Some(l) = lines.get(*d) { + line.append(&mut l.to_vec()) + } + }); - Diff::new(d.op(), &t) + Diff::new(d.op(), &line) }) .collect::<Vec<_>>() - - // res } } @@ -1942,7 +1971,6 @@ impl DiffMatchPatch { // Is there a nearby exact match? (speedup) if let Some(best_loc) = text .windows(pattern.len()) - .step_by(1) .skip(loc) .position(|p| p == pattern) .map(|pos| pos + loc) @@ -1955,7 +1983,6 @@ impl DiffMatchPatch { // best_loc = text.lastIndexOf(pattern, loc + pattern.length); if let Some(best_loc_rev) = text .windows(pattern.len()) - .step_by(1) .skip(loc) .rev() .position(|p| p == pattern) |