my fork of dmp
Diffstat (limited to 'src/dmp.rs')
-rw-r--r--src/dmp.rs326
1 files changed, 284 insertions, 42 deletions
diff --git a/src/dmp.rs b/src/dmp.rs
index 41a263e..5d98030 100644
--- a/src/dmp.rs
+++ b/src/dmp.rs
@@ -64,7 +64,7 @@ pub struct DiffMatchPatch {
// the contents have to match the expected contents. (0.0 = perfection,
// 1.0 = very loose). Note that Match_Threshold controls how closely the
// end points of a delete need to match.
- // float Patch_DeleteThreshold;
+ delete_threshold: f32,
// Chunk size for context length.
patch_margin: u16,
@@ -81,6 +81,7 @@ impl Default for DiffMatchPatch {
match_distance: 1000,
match_max_bits: 32,
patch_margin: 4,
+ delete_threshold: 0.5
}
}
}
@@ -115,6 +116,11 @@ impl DiffMatchPatch {
self.patch_margin
}
+ // returns the configured patch delete threshold
+ fn delete_threshold(&self) -> f32 {
+ self.delete_threshold
+ }
+
// returns the configured max_bits
fn match_max_bits(&self) -> usize {
self.match_max_bits
@@ -1131,6 +1137,47 @@ impl DiffMatchPatch {
todo!()
}
+ fn x_index(diffs: &[Diff], loc: usize) -> usize {
+ let mut char1 = 0;
+ let mut char2 = 0;
+
+ let mut last_char1 = 0;
+ let mut last_char2 = 0;
+
+ let mut last_diff = None;
+
+ for diff in diffs.iter() {
+ if diff.0 != Ops::Insert {
+ // Equality or deletion
+ char1 += diff.1.len();
+ }
+
+ if diff.0 != Ops::Delete {
+ // Equality or insertion
+ char2 += diff.1.len();
+ }
+
+ if char1 > loc {
+ // overshot location
+ last_diff = Some(diff);
+ break;
+ }
+
+ last_char1 = char1;
+ last_char2 = char2;
+ }
+
+ if let Some(ld) = last_diff {
+ if ld.0 == Ops::Delete {
+ // The location was deleted.
+ return last_char2;
+ }
+ }
+
+ // Add the remaining character length.
+ last_char2 + (loc - last_char1)
+ }
+
fn diff_text_old(diffs: &[Diff]) -> Vec<u8> {
diffs
.iter()
@@ -1390,6 +1437,11 @@ impl DiffMatchPatch {
// Match methods
impl DiffMatchPatch {
fn match_internal(&self, text: &[u8], pattern: &[u8], loc: usize) -> Option<usize> {
+ println!(
+ "match_internal: {} {} {loc}",
+ std::str::from_utf8(text).unwrap(),
+ std::str::from_utf8(pattern).unwrap()
+ );
// Check for null inputs.
// Nothing to match.
if text.is_empty() {
@@ -1398,16 +1450,18 @@ impl DiffMatchPatch {
// loc = Math.max(0, Math.min(loc, text.length));
let loc = loc.min(text.len());
-
+ println!("match_internal: loc[{loc}]");
if text == pattern {
// Shortcut (potentially not guaranteed by the algorithm)
Some(0)
} else if &text[loc .. (loc + pattern.len()).min(text.len())] == pattern {
// Perfect match at the perfect spot! (Includes case of null pattern)
+ println!("match_internal: is perfect match");
Some(loc)
} else {
// Do a fuzzy compare.
// return this.match_bitap_(text, pattern, loc);
+ println!("match_internal: Fuzzy match");
self.match_bitap(text, pattern, loc)
}
}
@@ -1418,10 +1472,10 @@ impl DiffMatchPatch {
}
let alphabet = Self::match_alphabet(pattern);
- println!("Alphabet: {:?}", alphabet.iter().map(|(&a, v)| (a as char, v)).collect::<HashMap<_, _>>());
+
// Highest score beyond which we give up.
let mut score_thres = self.match_threshold();
- println!("Score thres init: {score_thres}");
+
// Is there a nearby exact match? (speedup)
if let Some(best_loc) = text.windows(pattern.len()).step_by(1).skip(loc).position(|p| p == pattern).map(|pos| pos + loc) {
score_thres = self.bitap_score(
@@ -1430,7 +1484,7 @@ impl DiffMatchPatch {
0,
best_loc
).min(score_thres);
- println!("Score thres fwd: {score_thres} best_loc[{best_loc}]");
+
// What about in the other direction? (speedup)
// best_loc = text.lastIndexOf(pattern, loc + pattern.length);
if let Some(best_loc_rev) = text
@@ -1441,13 +1495,12 @@ impl DiffMatchPatch {
.position(|p| p == pattern)
.map(|pos| text.len() - pos - pattern.len()) {
score_thres = self.bitap_score(loc, pattern.len(), 0, best_loc_rev);
- println!("Score thres rev: {score_thres} best_loc_rev[{best_loc_rev}]");
}
}
// Initialise the bit arrays.
let matchmask = 1 << (pattern.len() - 1);
- println!("Matchmask: {matchmask}");
+
// var matchmask = 1 << (pattern.length - 1);
let mut best_loc = None;
@@ -1462,7 +1515,6 @@ impl DiffMatchPatch {
// this error level.
bin_min = 0;
bin_mid = bin_max;
- println!("Loop at {d}: bin_min[{bin_min}] bin_mid[{bin_mid}] bin_max[{bin_max}]");
while bin_min < bin_mid {
let score = self.bitap_score(loc, pattern.len(), d, loc + bin_mid);
@@ -1473,14 +1525,12 @@ impl DiffMatchPatch {
}
bin_mid = (bin_max - bin_min) / 2 + bin_min;
- println!("While: score[{score}] bin_min[{bin_min}] bin_mid[{bin_mid}] bin_max[{bin_max}]");
}
// Use the result from this iteration as the maximum for the next.
bin_max = bin_mid;
let mut start = if loc > bin_mid { 1.max(loc - bin_mid + 1) } else { 1 };
let finish = (loc + bin_mid).min(text.len()) + pattern.len();
- println!("Start[{start}] Finish[{finish}]");
let mut rd = vec![0; finish + 2];
rd[finish + 1] = (1 << d) - 1;
@@ -1493,7 +1543,7 @@ impl DiffMatchPatch {
} else {
alphabet.get(&text[j - 1]).map_or(0, |&v| v)
};
- println!("Loop 2: {j} {char_match} last_rd[{}]", last_rd.len());
+
rd[j] = if d == 0 {
// first pass: exact match
((rd[j + 1] << 1) | 1) & char_match
@@ -1503,21 +1553,20 @@ impl DiffMatchPatch {
(((last_rd[j + 1] | last_rd[j]) << 1) | 1_usize) |
last_rd[j + 1]
};
- println!("Loop 2: rd[{j}] -> {}", rd[j]);
+
if (rd[j] & matchmask) != 0 {
let score = self.bitap_score(loc, pattern.len(), d, j - 1);
// This match will almost certainly be better than any existing
// match. But check anyway.
- println!("Loop 2: inner score [{score}] score_thres[{score_thres}]");
+ println!("match_bitap @{j}: score[{score}] score_thres[{score_thres}]");
if score <= score_thres {
score_thres = score;
let bst_loc = j - 1;
- println!("Loop 2: best_loc[{bst_loc}] orig_loc[{loc}]");
+
best_loc = Some(bst_loc);
if bst_loc > loc {
// When passing loc, don't exceed our current distance from loc.
start = 1.max(if loc > bst_loc { loc - bst_loc } else { 0 });
- println!("Start: {start}");
} else {
// Already passed loc, downhill from here on in.
break;
@@ -1857,6 +1906,14 @@ impl DiffMatchPatch {
return (source.to_vec(), vec![]);
}
+ let deadline = if let Some(i) = Instant::now().checked_add(
+ Duration::from_millis(self.timeout())
+ ) {
+ i
+ } else {
+ todo!()
+ };
+
// To avoid changes to original patches!!
let mut patches = patches.clone();
@@ -1870,16 +1927,17 @@ impl DiffMatchPatch {
// 20, but the first patch was found at 12, delta is 2 and the second patch
// has an effective expected position of 22.
let mut delta = 0;
- let mut results = vec![];
+ let mut results = vec![false; patches.len()];
patches.iter().enumerate().for_each(|(x, p)| {
let expected_loc = p.start2 + delta;
let txt_old = Self::diff_text_old(&p.diffs);
-
+ println!("patch_apply [{x}]: expected_loc[{expected_loc}]");
let (start_loc, end_loc) = if txt_old.len() > self.match_max_bits() {
// patch_splitMax will only provide an oversized pattern in the case of
// a monster delete.
if let Some(sl) = self.match_internal(&source, &txt_old[.. self.match_max_bits()], expected_loc) {
+ println!("patch_apply > max_bits: after match main with text[{}] txt_old[{}] start_loc[{sl}]", std::str::from_utf8(&source).unwrap(), std::str::from_utf8(&txt_old[.. self.match_max_bits()]).unwrap());
let el = self.match_internal(&source, &txt_old[txt_old.len() - self.match_max_bits() ..], expected_loc + txt_old.len() - self.match_max_bits());
if el.is_none() || Some(sl) >= el {
@@ -1892,26 +1950,70 @@ impl DiffMatchPatch {
(None, None)
}
} else {
+ println!(
+ "patch_apply <= max_bits: before match_main with text[{}] txt_old[{}] start_loc[-1] expected_loc[{expected_loc}]",
+ std::str::from_utf8(&source).unwrap(),
+ std::str::from_utf8(&txt_old).unwrap(),
+ );
(self.match_internal(&source, &txt_old, expected_loc), None)
};
+ println!("patch_apply [{x}]: {start_loc:?} {end_loc:?}");
+
if let Some(sl) = start_loc {
// Found a match. :)
results[x] = true;
delta = sl - expected_loc;
+ println!("patch_apply [{x}] delta[{delta}]");
let txt_new = if let Some(el) = end_loc {
// safeMid(text, start_loc, end_loc + Match_MaxBits - start_loc);
- &source[sl .. el + self.match_max_bits() - sl]
+ &source[sl .. el + self.match_max_bits()]
} else {
- &source[sl .. txt_old.len()]
+ &source[sl .. sl + txt_old.len()]
};
+ println!("patch_apply [{x}]: txt2[{}]", std::str::from_utf8(txt_new).unwrap());
if txt_old == txt_new {
+ println!("patch_apply [{x}] - perfect match");
// Perfect match, just shove the replacement text in.
source = [&source[..sl], &Self::diff_text_new(&p.diffs), &source[sl + txt_old.len() ..]].concat();
} else {
- todo!("anubhab")
+ // Imperfect match. Run a diff to get a framework of equivalent indices.
+ let mut diffs = self.diff_internal(&txt_old, txt_new, false, deadline);
+ println!("patch_apply: [{x}]: imperfect match. Diffs[{}]", diffs.len());
+ if txt_old.len() > self.match_max_bits() && (Self::diff_levenshtein(&diffs) as f32 / txt_old.len() as f32) > self.delete_threshold() {
+ // The end points match, but the content is unacceptably bad.
+ results[x] = false;
+ } else {
+ Self::cleanup_semantic_lossless(&mut diffs);
+ println!("patch_apply: [{x}] after lossless: diffs[{}]", diffs.len());
+ let mut idx1 = 0;
+ diffs.iter()
+ .for_each(|diff| {
+ if diff.0 != Ops::Equal {
+ let idx2 = Self::x_index(&diffs, idx1);
+ if diff.0 == Ops::Insert {
+ // Insertion
+ source = [
+ &source[.. sl + idx2],
+ &diff.1,
+ &source[sl + idx2 ..]
+ ].concat();
+ } else if diff.0 == Ops::Delete {
+ // Deletion
+ source = [
+ &source[.. sl + idx2],
+ &source[sl + Self::x_index(&diffs, idx1 + diff.1.len()) ..]
+ ].concat();
+ }
+ }
+
+ if diff.0 != Ops::Delete {
+ idx1 += diff.1.len()
+ }
+ });
+ }
}
} else {
// No match found. :(
@@ -1920,7 +2022,10 @@ impl DiffMatchPatch {
delta -= p.length2 - p.length1;
}
});
- todo!()
+
+ // Strip the padding off.
+ source = source[null_pad.len() .. source.len() - null_pad.len()].to_vec();
+ (source, results)
}
fn patch_add_padding(&self, patches: &mut Patches) -> Vec<u8> {
@@ -2005,8 +2110,8 @@ impl DiffMatchPatch {
/// Returns:
/// Vec of changes (Diff).
pub fn diff_main(&self, old: &str, new: &str) -> Vec<Diff> {
- let deadline =
- if let Some(i) = Instant::now().checked_add(Duration::from_millis(self.timeout())) {
+ let deadline = if let Some(i) = Instant::now()
+ .checked_add(Duration::from_millis(self.timeout())) {
i
} else {
todo!()
@@ -2027,8 +2132,28 @@ impl DiffMatchPatch {
todo!()
}
- pub fn diff_levenshtein(diffs: &[Diff]) -> u32 {
- todo!()
+ pub fn diff_levenshtein(diffs: &[Diff]) -> usize {
+ let mut levenshtein = 0;
+ let mut insert = 0;
+ let mut delete = 0;
+
+ diffs.iter()
+ .for_each(|diff| {
+ match diff.0 {
+ Ops::Insert => insert += diff.1.len(),
+ Ops::Delete => delete += diff.1.len(),
+ Ops::Equal => {
+ // A deletion and an insertion is one substitution.
+ levenshtein += insert.max(delete);
+ insert = 0;
+ delete = 0;
+ }
+ }
+ });
+
+ levenshtein += insert.max(delete);
+
+ levenshtein
}
/// Takes a diff array and returns a pretty HTML sequence. This function is mainly intended as an example from which to write ones own display functions.
@@ -2179,10 +2304,7 @@ mod tests {
// 'testDiffCleanupEfficiency',
// 'testDiffPrettyHtml',
// 'testDiffDelta',
- // 'testDiffXIndex',
- // 'testDiffLevenshtein',
// 'testPatchObj',
- // 'testPatchApply'
// ];
#[test]
@@ -2820,6 +2942,48 @@ mod tests {
}
#[test]
+ fn test_diff_levenshtein() {
+ let diffs = vec![
+ Diff::delete(b"abc"),
+ Diff::insert(b"1234"),
+ Diff::equal(b"xyz")
+ ];
+ assert_eq!(4, DiffMatchPatch::diff_levenshtein(&diffs));
+
+ let diffs = vec![
+ Diff::equal(b"xyz"),
+ Diff::delete(b"abc"),
+ Diff::insert(b"1234")
+ ];
+ assert_eq!(4, DiffMatchPatch::diff_levenshtein(&diffs));
+
+ let diffs = vec![
+ Diff::delete(b"abc"),
+ Diff::equal(b"xyz"),
+ Diff::insert(b"1234")
+ ];
+ assert_eq!(7, DiffMatchPatch::diff_levenshtein(&diffs));
+ }
+
+ #[test]
+ fn test_diff_x_index() {
+ // Translate a location in text1 to text2.
+ let diffs = vec![
+ Diff::delete(b"a"),
+ Diff::insert(b"1234"),
+ Diff::equal(b"xyz")
+ ];
+ assert_eq!(5, DiffMatchPatch::x_index(&diffs, 2));
+
+ let diffs = vec![
+ Diff::equal(b"a"),
+ Diff::delete(b"1234"),
+ Diff::equal(b"xyz")
+ ];
+ assert_eq!(1, DiffMatchPatch::x_index(&diffs, 3));
+ }
+
+ #[test]
fn test_diff_common_overlap() {
// Detect any suffix/prefix overlap.
// Null case
@@ -3112,20 +3276,6 @@ mod tests {
}
#[test]
- fn test_serde() {
- // let diffs = vec![
- // Diff::delete(b"a"),
- // Diff::insert("\u{0680}".as_bytes()),
- // Diff::equal(b"x"),
- // Diff::delete(b"\t"),
- // Diff::insert(b"\0")
- // ];
-
- // let serialized = serde_json::to_string(&diffs).unwrap();
- // !("{serialized}");
- }
-
- #[test]
fn test_patch_add_context() {
let dmp = DiffMatchPatch::default();
@@ -3411,6 +3561,98 @@ mod tests {
}
#[test]
+ fn test_patch_apply() {
+ // checklines: Some(true),
+ // timeout: Some(1000),
+ // match_threshold: 0.5,
+ // match_distance: 1000,
+ // match_max_bits: 32,
+ // patch_margin: 4,
+ // delete_threshold: 0.5
+
+ let mut dmp = DiffMatchPatch::default();
+
+ let patches = dmp.patch_make(PatchInput::Texts("", ""));
+ let (txt, results) = dmp.patch_apply_internal(&patches, b"Hello world.");
+ assert_eq!(format!("{}\t{}", std::str::from_utf8(&txt).unwrap(), results.len()), "Hello world.\t0");
+
+ let patches = dmp.patch_make(PatchInput::Texts("The quick brown fox jumps over the lazy dog.", "That quick brown fox jumped over a lazy dog."));
+
+ // Exact match
+ // assert_eq!((b"That quick brown fox jumped over a lazy dog.".to_vec(), vec![true, true]), dmp.patch_apply_internal(&patches, b"The quick brown fox jumps over the lazy dog."));
+
+ // Partial match
+ assert_eq!((b"That quick red rabbit jumped over a tired tiger.".to_vec(), vec![true, true]), dmp.patch_apply_internal(&patches, b"The quick red rabbit jumps over the tired tiger."));
+ // results = dmp.patch_apply(patches, "");
+ // boolArray = results.second;
+ // resultStr = results.first + "\t" + (boolArray[0] ? "true" : "false") + "\t" + (boolArray[1] ? "true" : "false");
+ // assertEquals("patch_apply: Partial match.", "\ttrue\ttrue", resultStr);
+
+ // results = dmp.patch_apply(patches, "I am the very model of a modern major general.");
+ // boolArray = results.second;
+ // resultStr = results.first + "\t" + (boolArray[0] ? "true" : "false") + "\t" + (boolArray[1] ? "true" : "false");
+ // assertEquals("patch_apply: Failed match.", "I am the very model of a modern major general.\tfalse\tfalse", resultStr);
+
+ // patches = dmp.patch_make("x1234567890123456789012345678901234567890123456789012345678901234567890y", "xabcy");
+ // results = dmp.patch_apply(patches, "x123456789012345678901234567890-----++++++++++-----123456789012345678901234567890y");
+ // boolArray = results.second;
+ // resultStr = results.first + "\t" + (boolArray[0] ? "true" : "false") + "\t" + (boolArray[1] ? "true" : "false");
+ // assertEquals("patch_apply: Big delete, small change.", "xabcy\ttrue\ttrue", resultStr);
+
+ // patches = dmp.patch_make("x1234567890123456789012345678901234567890123456789012345678901234567890y", "xabcy");
+ // results = dmp.patch_apply(patches, "x12345678901234567890---------------++++++++++---------------12345678901234567890y");
+ // boolArray = results.second;
+ // resultStr = results.first + "\t" + (boolArray[0] ? "true" : "false") + "\t" + (boolArray[1] ? "true" : "false");
+ // assertEquals("patch_apply: Big delete, large change 1.", "xabc12345678901234567890---------------++++++++++---------------12345678901234567890y\tfalse\ttrue", resultStr);
+
+ // dmp.Patch_DeleteThreshold = 0.6f;
+ // patches = dmp.patch_make("x1234567890123456789012345678901234567890123456789012345678901234567890y", "xabcy");
+ // results = dmp.patch_apply(patches, "x12345678901234567890---------------++++++++++---------------12345678901234567890y");
+ // boolArray = results.second;
+ // resultStr = results.first + "\t" + (boolArray[0] ? "true" : "false") + "\t" + (boolArray[1] ? "true" : "false");
+ // assertEquals("patch_apply: Big delete, large change 2.", "xabcy\ttrue\ttrue", resultStr);
+ // dmp.Patch_DeleteThreshold = 0.5f;
+
+ // dmp.Match_Threshold = 0.0f;
+ // dmp.Match_Distance = 0;
+ // patches = dmp.patch_make("abcdefghijklmnopqrstuvwxyz--------------------1234567890", "abcXXXXXXXXXXdefghijklmnopqrstuvwxyz--------------------1234567YYYYYYYYYY890");
+ // results = dmp.patch_apply(patches, "ABCDEFGHIJKLMNOPQRSTUVWXYZ--------------------1234567890");
+ // boolArray = results.second;
+ // resultStr = results.first + "\t" + (boolArray[0] ? "true" : "false") + "\t" + (boolArray[1] ? "true" : "false");
+ // assertEquals("patch_apply: Compensate for failed patch.", "ABCDEFGHIJKLMNOPQRSTUVWXYZ--------------------1234567YYYYYYYYYY890\tfalse\ttrue", resultStr);
+ // dmp.Match_Threshold = 0.5f;
+ // dmp.Match_Distance = 1000;
+
+ // patches = dmp.patch_make("", "test");
+ // QString patchStr = dmp.patch_toText(patches);
+ // dmp.patch_apply(patches, "");
+ // assertEquals("patch_apply: No side effects.", patchStr, dmp.patch_toText(patches));
+
+ // patches = dmp.patch_make("The quick brown fox jumps over the lazy dog.", "Woof");
+ // patchStr = dmp.patch_toText(patches);
+ // dmp.patch_apply(patches, "The quick brown fox jumps over the lazy dog.");
+ // assertEquals("patch_apply: No side effects with major delete.", patchStr, dmp.patch_toText(patches));
+
+ // patches = dmp.patch_make("", "test");
+ // results = dmp.patch_apply(patches, "");
+ // boolArray = results.second;
+ // resultStr = results.first + "\t" + (boolArray[0] ? "true" : "false");
+ // assertEquals("patch_apply: Edge exact match.", "test\ttrue", resultStr);
+
+ // patches = dmp.patch_make("XY", "XtestY");
+ // results = dmp.patch_apply(patches, "XY");
+ // boolArray = results.second;
+ // resultStr = results.first + "\t" + (boolArray[0] ? "true" : "false");
+ // assertEquals("patch_apply: Near edge exact match.", "XtestY\ttrue", resultStr);
+
+ // patches = dmp.patch_make("y", "y123");
+ // results = dmp.patch_apply(patches, "x");
+ // boolArray = results.second;
+ // resultStr = results.first + "\t" + (boolArray[0] ? "true" : "false");
+ // assertEquals("patch_apply: Edge partial match.", "x123\ttrue", resultStr);
+ }
+
+ #[test]
fn test_match_alphabet() {
// Initialise the bitmasks for Bitap.
// Unique.