my fork of dmp
Diffstat (limited to 'src/dmp.rs')
| -rw-r--r-- | src/dmp.rs | 393 |
1 files changed, 385 insertions, 8 deletions
@@ -1,3 +1,5 @@ +use std::{time::{Duration, Instant}, u32}; + /** * The data structure representing a diff is an array of tuples: @@ -6,9 +8,10 @@ */ /// Enum representing the different ops of diff -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +#[repr(i8)] pub enum Ops { - Delete, + Delete = -1, Insert, Equal } @@ -22,22 +25,174 @@ pub struct Diff(Ops, String); impl Diff { /// Create a new diff object - pub fn new(op: Ops, text: &str) -> Self { - Self(op, text.to_string()) + pub fn new(op: Ops, text: &[u8]) -> Self { + Self(op, String::from_utf8(text.to_vec()).unwrap()) } } +pub struct Patch {} + +pub type Patches = Vec<Patch>; + pub struct DiffMatchPatch { - checklines: Option<bool> + /// a speedup flag, If present and false, then don't run + /// a line-level diff first to identify the changed areas. + /// Defaults to true, which does a faster, slightly less optimal diff. + checklines: Option<bool>, + /// A default timeout in num seconds, defaults to 1 + timeout: Option<u64> +} + +impl Default for DiffMatchPatch { + fn default() -> Self { + Self { + checklines: Some(true), + timeout: Some(1) + } + } } impl DiffMatchPatch { fn checklines(&self) -> bool { self.checklines.map_or(true, |c| c) } + + // returns the configured timeout, defaults to `1`, None or `0` would mean infinite timeout + fn timeout(&self) -> u64 { + self.timeout.map_or(u64::MAX, |tout| if tout > 0 { tout } else { u64::MAX }) + } + + + fn diff_compute(&self, old: &[u8], new: &[u8]) -> Vec<Diff> { + // returning all of the new part + if old.is_empty() { + return vec![Diff::new(Ops::Insert, new)] + } + + // return everything deleted + if new.is_empty() { + return vec![Diff::new(Ops::Delete, old)] + } + + let (long, short, old_gt_new) = if old.len() > new.len() { (old, new, true) } else { (new, old, false) }; + + let idx = long.windows(short.len()).step_by(1).position(|k| k == short); + // found a subsequence which contains the short text + if let Some(idx) = idx { + // Shorter text is inside the longer text (speedup). + let op = if old_gt_new { Ops::Delete } else { Ops::Insert }; + let diffs = vec![ + Diff::new(op, &long[0 .. idx]), + Diff::new(Ops::Equal, short), + Diff::new(op, &long[idx .. short.len()]) + ]; + + return diffs; + } + + if short.len() == 1 { + // After previous case, this can't be an equality + return vec![Diff::new(Ops::Delete, old), Diff::new(Ops::Insert, new)]; + } + + // Check if the problem can be split in two + if let Some(half_match) = self.diff_half_match(old, new) { + + } + // // Check to see if the problem can be split in two. + // var hm = this.diff_halfMatch_(text1, text2); + // if (hm) { + // // A half-match was found, sort out the return data. + // var text1_a = hm[0]; + // var text1_b = hm[1]; + // var text2_a = hm[2]; + // var text2_b = hm[3]; + // var mid_common = hm[4]; + // // Send both pairs off for separate processing. + // var diffs_a = this.diff_main(text1_a, text2_a, checklines, deadline); + // var diffs_b = this.diff_main(text1_b, text2_b, checklines, deadline); + // // Merge the results. + // return diffs_a.concat([new diff_match_patch.Diff(DIFF_EQUAL, mid_common)], + // diffs_b); + // } + + // if (checklines && text1.length > 100 && text2.length > 100) { + // return this.diff_lineMode_(text1, text2, deadline); + // } + + // return this.diff_bisect_(text1, text2, deadline); + + todo!() + } + + fn diff_half_match(&self, old: &[u8], new: &[u8]) -> Option<()> { + // Don't risk returning a suboptimal diff when we have unlimited time + if self.timeout() == u64::MAX { + return None; + } + + let (long, short, old_gt_new) = if old.len() > new.len() { (old, new, true) } else { (new, old, false) }; + // pointless - two small for this algo + if long.len() < 4 || short.len() * 2 < long.len() { + return None; + } + + + None + } + + // returns the number of bytes common in both the str - this is the position in bytes not chars, [0 .. n] is your prefix + // We are doing a binary search here, and I've observed similar performance as noted by https://neil.fraser.name/news/2007/10/09/ + // Some benchmark code can be found in benches/prefix.rs + // Reverse prefix is suffix + // TODO: investigate this further + fn common_prefix(lhs: &[u8], rhs: &[u8], reverse: bool) -> usize { + if lhs.is_empty() || rhs.is_empty() || + (!reverse && (lhs.first() != rhs.first())) || + (reverse && (lhs.last() != rhs.last())) { + return 0; + } + + + let mut pointmin = 0; + let mut pointmax = lhs.len().min(rhs.len()); + let mut pointmid = pointmax; + + let mut pointstart = 0; + + while pointmin < pointmid { + let (lhsrange, rhsrange) = if !reverse { + (pointstart .. pointmid, pointstart .. pointmid) + } else { + (lhs.len() - pointmid .. lhs.len() - pointstart, rhs.len() - pointmid .. rhs.len() - pointstart) + }; + + if lhs[lhsrange] == rhs[rhsrange] { + pointmin = pointmid; + pointstart = pointmin; + } else { + pointmax = pointmid; + } + + pointmid = (pointmax - pointmin) / 2 + pointmin; + } + + pointmid + } + + } impl DiffMatchPatch { + /// Find the differences between two texts. Simplifies the problem by stripping any common prefix or suffix off the texts before diffing. + /// Args: + /// old: Old string to be diffed. + /// new: New string to be diffed. + /// deadline: Optional time when the diff should be complete by. Used + /// internally for recursive calls. Users should set DiffTimeout instead. + /// + /// Returns: + /// Vec of changes (Diff). pub fn diff_main(&self, old: &str, new: &str) -> Vec<Diff> { // First, check if lhs and rhs are equal if old == new { @@ -45,18 +200,240 @@ impl DiffMatchPatch { return Vec::new(); } - return vec![Diff::new(Ops::Equal, old)]; + return vec![Diff::new(Ops::Equal, old.as_bytes())]; } if old.is_empty() { - return vec![Diff::new(Ops::Insert, new)] + return vec![Diff::new(Ops::Insert, new.as_bytes())] } if new.is_empty() { - return vec![Diff::new(Ops::Delete, old)] + return vec![Diff::new(Ops::Delete, old.as_bytes())] } + let deadline = Instant::now().checked_add(Duration::from_secs(self.timeout())).unwrap(); + + let old_bytes = old.as_bytes(); + let new_bytes = new.as_bytes(); + + // Trim common prefix + let common_prefix = Self::common_prefix(old_bytes, new_bytes, false); + let common_suffix = Self::common_prefix(&old_bytes[common_prefix..], &new_bytes[common_prefix..], true); + + let diffs = self.diff_compute(&old_bytes[common_prefix .. old_bytes.len() - common_suffix], &new_bytes[common_prefix .. new_bytes.len() - common_suffix]); + + todo!() + } + + pub fn diff_cleanup_semantic(diffs: &mut [Diff]) { + todo!() + } + + pub fn diff_cleanup_efficiency(diffs: &mut [Diff]) { + todo!() + } + + pub fn diff_levenshtein(diffs: &[Diff]) -> u32 { + todo!() + } + + pub fn diff_pretty_html(diffs: &[Diff]) -> String { + todo!() + } + + pub fn match_main(text: &str, pattern: &str, loc: ()) -> () { + todo!() + } + + pub fn patch_make_text_text(text1: &str, text2: &str) -> Patches { + todo!() + } + + pub fn patch_make_diff(diffs: &[Diff]) -> Patches { + todo!() + } + pub fn patch_make_text_diff(text1: &str, diffs: &[Diff]) -> Patches { todo!() } + + pub fn patch_to_text(patches: Patches) -> String { + todo!() + } + + pub fn patch_from_text(text: &str) -> Patches { + todo!() + } + + pub fn patch_apply(patches: &[Patch], text: &str) -> (String, ()) { + todo!() + } +} + + +#[cfg(test)] +mod tests { + use crate::dmp::{Diff, Ops}; + + use super::DiffMatchPatch; + + // const tests = [ + // 'testDiffIsDestructurable', // TODO + // 'testDiffCommonOverlap', + // 'testDiffHalfMatch', + // 'testDiffLinesToChars', + // 'testDiffCharsToLines', + // 'testDiffCleanupMerge', + // 'testDiffCleanupSemanticLossless', + // 'testDiffCleanupSemantic', + // 'testDiffCleanupEfficiency', + // 'testDiffPrettyHtml', + // 'testDiffText', + // 'testDiffDelta', + // 'testDiffXIndex', + // 'testDiffLevenshtein', + // 'testDiffBisect', + // 'testMatchAlphabet', + // 'testMatchBitap', + // 'testMatchMain', + // 'testPatchObj', + // 'testPatchFromText', + // 'testPatchToText', + // 'testPatchAddContext', + // 'testPatchMake', + // 'testPatchSplitMax', + // 'testPatchAddPadding', + // 'testPatchApply' + // ]; + + + #[test] + fn test_diff_is_destructurable() { + + } + + #[test] + fn test_prefix() { + // Detect any common prefix. + // Null case. + assert_eq!(0, DiffMatchPatch::common_prefix("abc".as_bytes(), "xyz".as_bytes(), false)); + + // Non-null case. + assert_eq!(4, DiffMatchPatch::common_prefix("1234abcdef".as_bytes(), "1234xyz".as_bytes(), false)); + + // Whole case. + assert_eq!(4, DiffMatchPatch::common_prefix("1234".as_bytes(), "1234xyz".as_bytes(), false)); + } + + #[test] + fn test_suffix() { + // Detect any common suffix. + // Null case. + assert_eq!(0, DiffMatchPatch::common_prefix("abc".as_bytes(), "xyz".as_bytes(), true)); + + // Non-null case. + assert_eq!(4, DiffMatchPatch::common_prefix("abcdef1234".as_bytes(), "xyz1234".as_bytes(), true)); + + // Whole case. + assert_eq!(4, DiffMatchPatch::common_prefix("1234".as_bytes(), "xyz1234".as_bytes(), true)); + } + + #[test] + fn test_diff_main() { + let dmp = DiffMatchPatch::default(); + + // Perform a trivial diff. + // Null case. + assert!(dmp.diff_main("", "").is_empty()); + + // Equality + assert_eq!(vec![Diff::new(Ops::Equal, "abc".as_bytes())], dmp.diff_main("abc", "abc")); + + // Simple insert + assert_eq!( + vec![Diff::new(Ops::Equal, "ab".as_bytes()), Diff::new(Ops::Insert, "123".as_bytes()), Diff::new(Ops::Equal, "c".as_bytes())], + dmp.diff_main("abc", "ab123c") + ); + + // Simple delete + assert_eq!( + vec![Diff::new(Ops::Equal, "a".as_bytes()), Diff::new(Ops::Delete, "123".as_bytes()), Diff::new(Ops::Equal, "bc".as_bytes())], + dmp.diff_main("a123bc", "abc") + ); + + + +// // Two insertions. +// assertEquivalent([[DIFF_EQUAL, 'a'], [DIFF_INSERT, '123'], [DIFF_EQUAL, 'b'], [DIFF_INSERT, '456'], [DIFF_EQUAL, 'c']], dmp.diff_main('abc', 'a123b456c', false)); + +// // Two deletions. +// assertEquivalent([[DIFF_EQUAL, 'a'], [DIFF_DELETE, '123'], [DIFF_EQUAL, 'b'], [DIFF_DELETE, '456'], [DIFF_EQUAL, 'c']], dmp.diff_main('a123b456c', 'abc', false)); + +// // Perform a real diff. +// // Switch off the timeout. +// dmp.Diff_Timeout = 0; +// // Simple cases. +// assertEquivalent([[DIFF_DELETE, 'a'], [DIFF_INSERT, 'b']], dmp.diff_main('a', 'b', false)); + +// assertEquivalent([[DIFF_DELETE, 'Apple'], [DIFF_INSERT, 'Banana'], [DIFF_EQUAL, 's are a'], [DIFF_INSERT, 'lso'], [DIFF_EQUAL, ' fruit.']], dmp.diff_main('Apples are a fruit.', 'Bananas are also fruit.', false)); + +// assertEquivalent([[DIFF_DELETE, 'a'], [DIFF_INSERT, '\u0680'], [DIFF_EQUAL, 'x'], [DIFF_DELETE, '\t'], [DIFF_INSERT, '\0']], dmp.diff_main('ax\t', '\u0680x\0', false)); + +// // Overlaps. +// assertEquivalent([[DIFF_DELETE, '1'], [DIFF_EQUAL, 'a'], [DIFF_DELETE, 'y'], [DIFF_EQUAL, 'b'], [DIFF_DELETE, '2'], [DIFF_INSERT, 'xab']], dmp.diff_main('1ayb2', 'abxab', false)); + +// assertEquivalent([[DIFF_INSERT, 'xaxcx'], [DIFF_EQUAL, 'abc'], [DIFF_DELETE, 'y']], dmp.diff_main('abcy', 'xaxcxabc', false)); + +// assertEquivalent([[DIFF_DELETE, 'ABCD'], [DIFF_EQUAL, 'a'], [DIFF_DELETE, '='], [DIFF_INSERT, '-'], [DIFF_EQUAL, 'bcd'], [DIFF_DELETE, '='], [DIFF_INSERT, '-'], [DIFF_EQUAL, 'efghijklmnopqrs'], [DIFF_DELETE, 'EFGHIJKLMNOefg']], dmp.diff_main('ABCDa=bcd=efghijklmnopqrsEFGHIJKLMNOefg', 'a-bcd-efghijklmnopqrs', false)); + +// // Large equality. +// assertEquivalent([[DIFF_INSERT, ' '], [DIFF_EQUAL, 'a'], [DIFF_INSERT, 'nd'], [DIFF_EQUAL, ' [[Pennsylvania]]'], [DIFF_DELETE, ' and [[New']], dmp.diff_main('a [[Pennsylvania]] and [[New', ' and [[Pennsylvania]]', false)); + +// // Timeout. +// dmp.Diff_Timeout = 0.1; // 100ms +// var a = '`Twas brillig, and the slithy toves\nDid gyre and gimble in the wabe:\nAll mimsy were the borogoves,\nAnd the mome raths outgrabe.\n'; +// var b = 'I am the very model of a modern major general,\nI\'ve information vegetable, animal, and mineral,\nI know the kings of England, and I quote the fights historical,\nFrom Marathon to Waterloo, in order categorical.\n'; +// // Increase the text lengths by 1024 times to ensure a timeout. +// for (var i = 0; i < 10; i++) { +// a += a; +// b += b; +// } +// var startTime = (new Date()).getTime(); +// dmp.diff_main(a, b); +// var endTime = (new Date()).getTime(); +// // Test that we took at least the timeout period. +// assertTrue(dmp.Diff_Timeout * 1000 <= endTime - startTime); +// // Test that we didn't take forever (be forgiving). +// // Theoretically this test could fail very occasionally if the +// // OS task swaps or locks up for a second at the wrong moment. +// assertTrue(dmp.Diff_Timeout * 1000 * 2 > endTime - startTime); +// dmp.Diff_Timeout = 0; + +// // Test the linemode speedup. +// // Must be long to pass the 100 char cutoff. +// // Simple line-mode. +// a = '1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n'; +// b = 'abcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\n'; +// assertEquivalent(dmp.diff_main(a, b, false), dmp.diff_main(a, b, true)); + +// // Single line-mode. +// a = '1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890'; +// b = 'abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghij'; +// assertEquivalent(dmp.diff_main(a, b, false), dmp.diff_main(a, b, true)); + +// // Overlap line-mode. +// a = '1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n'; +// b = 'abcdefghij\n1234567890\n1234567890\n1234567890\nabcdefghij\n1234567890\n1234567890\n1234567890\nabcdefghij\n1234567890\n1234567890\n1234567890\nabcdefghij\n'; +// var texts_linemode = diff_rebuildtexts(dmp.diff_main(a, b, true)); +// var texts_textmode = diff_rebuildtexts(dmp.diff_main(a, b, false)); +// assertEquivalent(texts_textmode, texts_linemode); + +// // Test null inputs. +// try { +// dmp.diff_main(null, null); +// assertEquals(Error, null); +// } catch (e) { +// // Exception expected. +// } + } }
\ No newline at end of file |