WIP: add context and patch stuff - dmp

my fork of dmp

master 1Branch 0Tags

Clone

HTTPS

SSH

Open with VS Code

WIP: add context and patch stuff

Browse Source

Anubhab Bandyopadhyay 2024-08-09

parent 07bd29e · commit 8a3b5c5

-rw-r--r--

-rw-r--r--

-rw-r--r--

497

3 files changed, 491 insertions, 22 deletions

diff --git a/Cargo.lock b/Cargo.lock
index 24fe235..b61c72e 100644
--- a/Cargo.lock
+++ b/Cargo.lock

@@ -171,6 +171,7 @@ name = "diff-match-patch-rs"

version = "0.1.0"

dependencies = [

"criterion",

+ "percent-encoding",

"serde",

"serde_json",

"serde_repr",

@@ -273,6 +274,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"

checksum = "b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9"

[[package]]

+name = "percent-encoding"

+version = "2.3.1"

+source = "registry+https://github.com/rust-lang/crates.io-index"

+checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"

+[[package]]

name = "plotters"

version = "0.3.6"

source = "registry+https://github.com/rust-lang/crates.io-index"

diff --git a/Cargo.toml b/Cargo.toml
index c1c4fe0..7e7e880 100644
--- a/Cargo.toml
+++ b/Cargo.toml

@@ -4,8 +4,13 @@ version = "0.1.0"

edition = "2021"

[dependencies]

-serde = { version = "1", features = ["derive"] }

-serde_repr = { version = "0" }

+percent-encoding = "2"

+serde = { version = "1", features = ["derive"], optional = true }

+serde_repr = { version = "0", optional = true }

+[features]

+serde = ["dep:serde", "dep:serde_repr"]

+default = ["serde"]

[dev-dependencies]

criterion = "0"

diff --git a/src/dmp.rs b/src/dmp.rs
index 647ae22..7c997a2 100644
--- a/src/dmp.rs
+++ b/src/dmp.rs

@@ -1,7 +1,8 @@

use std::{

- collections::HashMap, time::{Duration, Instant}

+ collections::HashMap, fmt::Display, time::{Duration, Instant}

};

+use percent_encoding::{percent_decode, percent_encode, NON_ALPHANUMERIC};

use serde::{Deserialize, Serialize};

use serde_repr::{Deserialize_repr, Serialize_repr};

@@ -18,7 +19,7 @@ pub enum Ops {

/// (Ops::Delete, String::new("Hello")) means delete `Hello`

/// (Ops::Insert, String::new("Goodbye")) means add `Goodbye`

/// (Ops::Equal, String::new("World")) means keep world

-#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]

+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]

pub struct Diff(

Ops,

Vec<u8>

@@ -45,17 +46,6 @@ impl Diff {

}

-pub struct Patch {}

-pub enum PatchInput<'a> {

- Texts(&'a [u8], &'a [u8]),

- Diffs(&'a [Diff]),

- TextDiffs(&'a [u8], &'a [Diff])

-pub type Patches = Vec<Patch>;

pub struct DiffMatchPatch {

/// a speedup flag, If present and false, then don't run

/// a line-level diff first to identify the changed areas.

@@ -63,6 +53,20 @@ pub struct DiffMatchPatch {

checklines: Option<bool>,

/// A default timeout in num milliseconda, defaults to 1000 (1 second)

timeout: Option<u64>,

+ // How far to search for a match (0 = exact location, 1000+ = broad match).

+ // A match this many characters away from the expected location will add

+ // 1.0 to the score (0.0 is a perfect match).

+ // int Match_Distance;

+ // When deleting a large block of text (over ~64 characters), how close does

+ // the contents have to match the expected contents. (0.0 = perfection,

+ // 1.0 = very loose). Note that Match_Threshold controls how closely the

+ // end points of a delete need to match.

+ // float Patch_DeleteThreshold;

+ // Chunk size for context length.

+ patch_margin: usize,

+ // The number of bits in an int.

+ match_max_bits: usize,

}

impl Default for DiffMatchPatch {

@@ -70,6 +74,8 @@ impl Default for DiffMatchPatch {

Self {

checklines: Some(true),

timeout: Some(1000),

+ patch_margin: 4,

+ match_max_bits: 32

}

@@ -94,6 +100,16 @@ impl DiffMatchPatch {

.map_or(31536000_u64, |tout| if tout > 0 { tout } else { u64::MAX })

}

+ // returns the current patch margin

+ fn patch_margin(&self) -> usize {

+ self.patch_margin

+ }

+ // returns the configured max_bits

+ fn match_max_bits(&self) -> usize {

+ self.match_max_bits

+ }

fn diff_internal<'a>(

&self,

old_bytes: &'a [u8],

@@ -1107,6 +1123,10 @@ impl DiffMatchPatch {

Self::cleanup_merge(diffs);

}

+ fn cleanup_efficiency(diffs: &mut Vec<Diff>) {

+ todo!()

+ }

}

#[derive(Debug, Eq, PartialEq)]

@@ -1170,7 +1190,6 @@ impl DiffMatchPatch {

}

- //

if broke.is_some() {

let line = &text[cursor..];

let e = hash.entry(line).or_insert(array.len());

@@ -1202,6 +1221,311 @@ impl DiffMatchPatch {

}

+// Patch Methods

+#[derive(Debug, Default)]

+pub struct Patch {

+ diffs: Vec<Diff>,

+ start1: usize,

+ start2: usize,

+ length1: usize,

+ length2: usize

+impl Display for Patch {

+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {

+ let coord1 = if self.length1 == 0 {

+ format!("{},0", self.start1)

+ } else {

+ format!(

+ "{}{}",

+ self.start1 + 1,

+ if self.length1 != 1 {

+ format!(",{}", self.length1)

+ } else {

+ String::new()

+ }

+ )

+ };

+ let coord2 = if self.length2 == 0 {

+ format!("{},0", self.start2)

+ } else {

+ format!(

+ "{}{}",

+ self.start2 + 1,

+ if self.length2 != 1 {

+ format!(",{}", self.length2)

+ } else {

+ String::new()

+ }

+ )

+ };

+ let mut segments = vec!["@@ -".to_string(), coord1, " +".to_string(), coord2 , " @@\n".to_string()];

+ self.diffs.iter()

+ .for_each(|diff| {

+ let sign = match diff.0 {

+ Ops::Insert => '+',

+ Ops::Delete => '-',

+ Ops::Equal => ' '

+ };

+ segments.push(

+ String::from_utf8([&[sign as u8], &diff.1[..], &[b'\n']].concat()).unwrap()

+ )

+ });

+ write!(f, "{}", segments.join(""))

+ }

+pub enum PatchInput<'a> {

+ Texts(&'a str, &'a str),

+ Diffs(&'a [Diff]),

+ TextDiffs(&'a str, &'a [Diff])

+pub type Patches = Vec<Patch>;

+impl DiffMatchPatch {

+ fn parse_patch_header(s: &[u8]) -> Option<(usize, Option<usize>, usize, Option<usize>)> {

+ let mut section = Vec::with_capacity(64);

+ let mut current_sect = 0;

+ let mut old_line = 0;

+ let mut old_cols = None;

+ let mut new_line = 0;

+ let mut new_cols = None;

+ for &c in s.iter() {

+ if c == b' ' {

+ match current_sect {

+ 0 => {

+ if &section != b"@@" {

+ return None;

+ }

+ 1 => {

+ if section.is_empty() {

+ return None;

+ }

+ let splits = section[1..].split(|&p| p == b',').collect::<Vec<_>>();

+ let ol = splits.first()?;

+ old_line = std::str::from_utf8(ol).ok()?.parse::<usize>().ok()?;

+ if let Some(&oc) = splits.get(1) {

+ old_cols = Some(std::str::from_utf8(oc).ok()?.parse::<usize>().ok()?);

+ }

+ 2 => {

+ let splits = section[

+ if *section.first()? == b'+' { 1 } else { 0 }

+ ..

+ ].split(|&p| p == b',').collect::<Vec<_>>();

+ let nl = splits.first()?;

+ new_line = std::str::from_utf8(nl).ok()?.parse::<usize>().ok()?;

+ if let Some(&nc) = splits.get(1) {

+ new_cols = Some(std::str::from_utf8(nc).ok()?.parse::<usize>().ok()?);

+ }

+ _ => {

+ // invalid pattern

+ return None;

+ }

+ section = Vec::with_capacity(64);

+ current_sect += 1;

+ continue;

+ }

+ if current_sect == 1 && section.is_empty() && c != b'-' {

+ return None;

+ }

+ section.push(c);

+ }

+ if &section != b"@@" {

+ return None;

+ }

+ Some(( old_line, old_cols, new_line, new_cols ))

+ // let old_line_col = parts.next()?;

+ // // let old_line_col_parts = old_line_col.splitn(2, |&b| b == b',');

+ // let old_line = old_line_col.iter()

+ // .skip_while(|&&b| b == b'-')

+ // .copied()

+ // .collect::<Vec<u8>>()

+ // .parse::<usize>().ok()?;

+ // // let old_line = old_line_col.next()?.trim_start_matches(b"-").parse::<usize>().ok()?;

+ // let old_col = if let Some(old_col) = old_line_col_parts.next() {

+ // Some(old_col.parse::<usize>().ok()?)

+ // } else {

+ // None

+ // };

+ // let infix = parts.next()?;

+ // if infix != b" +@" {

+ // return None;

+ // }

+ // let new_line_col = parts.next()?;

+ // let new_line_col_parts = new_line_col.splitn(2, |&b| b == b',');

+ // let new_line = new_line_col_parts.next()?.trim_start_matches(b"+").parse::<usize>().ok()?;

+ // let new_col = if let Some(new_col) = new_line_col_parts.next() {

+ // Some(new_col.parse::<usize>().ok()?)

+ // } else {

+ // None

+ // };

+ // let suffix = parts.next()?;

+ // if suffix != b" @@\n" {

+ // return None;

+ // }

+ // Some((old_line, old_col, new_line, new_col))

+ }

+ fn patch_make_internal(&self, txt: &[u8], diffs: &[Diff]) -> Result<Patches, ()> {

+ // The null cases

+ if txt.is_empty() {

+ todo!();

+ // return Err(())

+ }

+ // No diffs -> no patches

+ if diffs.is_empty() {

+ return Ok(Vec::new());

+ }

+ let mut patch = Patch::default();

+ let mut char_n1 = 0;

+ let mut char_n2 = 0;

+ let mut prepatch: Vec<u8> = vec![];

+ let mut postpatch: Vec<u8> = vec![];

+ diffs.iter().enumerate().for_each(|(idx, diff)| {

+ // a new patch starts here

+ if patch.diffs.is_empty() && diff.0 != Ops::Equal {

+ patch.start1 = char_n1;

+ patch.start2 = char_n2;

+ }

+ match diff.0 {

+ Ops::Insert => {

+ patch.length2 += diff.1.len();

+ postpatch = [&postpatch[..char_n2], &diff.1, &postpatch[char_n2..]].concat();

+ patch.diffs.push(diff.clone());

+ }

+ Ops::Delete => {

+ patch.length2 += diff.1.len();

+ postpatch = [&postpatch[..char_n2], &postpatch[char_n2 + diff.1.len()..]].concat();

+ patch.diffs.push(diff.clone());

+ }

+ Ops::Equal => {

+ if diff.1.len() <= 2 * self.patch_margin() && !patch.diffs.is_empty() && diffs.len() != idx + 1 {

+ // Small equality inside a patch.

+ patch.length1 += diff.1.len();

+ patch.length2 += diff.1.len();

+ patch.diffs.push(diff.clone());

+ } else if diff.1.len() >= 2 * self.patch_margin() && !patch.diffs.is_empty() {

+ // Time for a new patch.

+ self.patch_add_context(&mut patch, &prepatch);

+ // if (patchDiffLength) {

+ // this.patch_addContext_(patch, prepatch_text);

+ // patches.push(patch);

+ // patch = new diff_match_patch.patch_obj();

+ // patchDiffLength = 0;

+ // // Unlike Unidiff, our patch lists have a rolling context.

+ // // https://github.com/google/diff-match-patch/wiki/Unidiff

+ // // Update prepatch text & pos to reflect the application of the

+ // // just completed patch.

+ // prepatch_text = postpatch_text;

+ // char_count1 = char_count2;

+ // }

+ }

+ // if (aDiff.text.length() <= 2 * Patch_Margin

+ // && !patch.diffs.isEmpty() && !(aDiff == diffs.back())) {

+ // // Small equality inside a patch.

+ // patch.diffs.append(aDiff);

+ // patch.length1 += aDiff.text.length();

+ // patch.length2 += aDiff.text.length();

+ // }

+ // if (aDiff.text.length() >= 2 * Patch_Margin) {

+ // // Time for a new patch.

+ // if (!patch.diffs.isEmpty()) {

+ // patch_addContext(patch, prepatch_text);

+ // patches.append(patch);

+ // patch = Patch();

+ // // Unlike Unidiff, our patch lists have a rolling context.

+ // // http://code.google.com/p/google-diff-match-patch/wiki/Unidiff

+ // // Update prepatch text & pos to reflect the application of the

+ // // just completed patch.

+ // prepatch_text = postpatch_text;

+ // char_count1 = char_count2;

+ // }

+ }

+ });

+ todo!()

+ }

+ fn patch_add_context(&self, patch: &mut Patch, text: &[u8]) {

+ if text.is_empty() {

+ return;

+ }

+ let mut pattern = &text[patch.start2 .. patch.start2 + patch.length1];

+ let mut padding = 0;

+ // Look for the first and last matches of pattern in text. If two different

+ // matches are found, increase the pattern length.

+ while text.windows(pattern.len()).position(|p| p == pattern) !=

+ text.windows(pattern.len()).rev().position(|p| p == pattern) &&

+ pattern.len() < self.match_max_bits() - self.patch_margin() * 2 {

+ padding += self.patch_margin();

+ pattern = &text[patch.start2 - padding .. patch.start2 + patch.length1 + padding];

+ }

+ // while (text.indexOf(pattern) != text.lastIndexOf(pattern) &&

+ // pattern.length < this.Match_MaxBits - this.Patch_Margin -

+ // this.Patch_Margin) {

+ // padding += this.Patch_Margin;

+ // pattern = text.substring(patch.start2 - padding,

+ // patch.start2 + patch.length1 + padding);

+ // }

+ // Add one chunk for good luck.

+ padding += self.patch_margin();

+ // Add prefix

+ let prefix = &text[patch.start2 - padding .. patch.start2];

+ if !prefix.is_empty() {

+ patch.diffs.insert(0, Diff::equal(prefix));

+ }

+ // Add the suffix

+ let suffix = &text[patch.start2 + patch.length1 .. patch.start2 + patch.length1 + padding];

+ if !suffix.is_empty() {

+ patch.diffs.push(Diff::equal(suffix));

+ }

+ // Roll back the start points.

+ patch.start1 -= prefix.len();

+ patch.start2 -= prefix.len();

+ // extend the lengths

+ patch.length1 += prefix.len() + suffix.len();

+ patch.length2 += prefix.len() + suffix.len();

+ }

+// Exposed APIs

impl DiffMatchPatch {

/// Find the differences between two texts. Simplifies the problem by stripping any common prefix or suffix off the texts before diffing.

/// Args:

@@ -1212,7 +1536,7 @@ impl DiffMatchPatch {

///

/// Returns:

/// Vec of changes (Diff).

- pub fn diff_main<'a>(&self, old: &'a str, new: &'a str) -> Vec<Diff> {

+ pub fn diff_main(&self, old: &str, new: &str) -> Vec<Diff> {

let deadline = if let Some(i) = Instant::now()

.checked_add(Duration::from_millis(self.timeout())) {

@@ -1249,10 +1573,24 @@ impl DiffMatchPatch {

}

pub fn patch_make(input: PatchInput) -> Patches {

- todo!()

- }

+ let mut diff_input;

+ let (txt, diffs) = match input {

+ // No diffs provided, lets make our own

+ PatchInput::Texts(txt1, txt2 ) => {

+ let dmp = DiffMatchPatch::default();

+ diff_input = dmp.diff_main(txt1, txt2);

+ Self::cleanup_semantic(&mut diff_input);

+ (txt1.as_bytes(), &diff_input[..])

+ }

+ PatchInput::Diffs(diffs) => {

+ // No origin string provided, compute our own.

+ todo!()

+ }

+ PatchInput::TextDiffs(txt, diffs) => (txt.as_bytes(), diffs)

- pub fn patch_make_text_diff(text1: &str, diffs: &[Diff]) -> Patches {

+ };

todo!()

}

@@ -1261,7 +1599,91 @@ impl DiffMatchPatch {

}

pub fn patch_from_text(text: &str) -> Patches {

- todo!()

+ if text.is_empty() {

+ return vec![];

+ }

+ let mut text = text.as_bytes().split(|&p| p == b'\n').collect::<Vec<_>>();

+ let mut patches = vec![];

+ while !text.is_empty() {

+ let (old_line, old_cols, new_line, new_cols) = if let Some(p) = Self::parse_patch_header(text.first().unwrap()) {

+ p

+ } else {

+ todo!("return error")

+ };

+ let mut patch = Patch { start1: old_line, start2: new_line, ..Default::default() };

+ if let Some(old_cols) = old_cols {

+ if old_cols != 0 {

+ patch.start1 -= 1;

+ patch.length1 = old_cols;

+ }

+ } else {

+ patch.start1 -= 1;

+ patch.length1 = 1;

+ }

+ if let Some(new_cols) = new_cols {

+ if new_cols != 0 {

+ patch.start2 -= 1;

+ patch.length2 = new_cols;

+ }

+ } else {

+ patch.start2 -= 1;

+ patch.length2 = 1;

+ }

+ text.remove(0);

+ while !text.is_empty() {

+ let txt = if let Some(&s) = text.first() {

+ if !s.is_empty() {

+ s

+ } else {

+ text.remove(0);

+ continue;

+ }

+ } else {

+ text.remove(0);

+ continue;

+ };

+ let sign = if let Some(&sign) = txt.first() {

+ sign

+ } else {

+ unreachable!("Already checked for emptyness!");

+ };

+ let line = percent_decode(&txt[1..]).collect::<Vec<_>>();

+ match sign {

+ b'-' => {

+ patch.diffs.push(Diff::delete(&line));

+ }

+ b'+' => {

+ patch.diffs.push(Diff::insert(&line));

+ }

+ b' ' => {

+ patch.diffs.push(Diff::equal(&line));

+ }

+ b'@' => {

+ // next patch, break

+ break;

+ }

+ _ => {

+ todo!("throw error, invalid encoding")

+ }

+ text.remove(0);

+ }

+ patches.push(patch);

+ };

+ patches

}

pub fn patch_apply(patches: &[Patch], text: &str) -> (String, ()) {

@@ -1291,7 +1713,6 @@ mod tests {

// 'testPatchObj',

// 'testPatchFromText',

// 'testPatchToText',

- // 'testPatchAddContext',

// 'testPatchMake',

// 'testPatchSplitMax',

// 'testPatchAddPadding',

@@ -2232,4 +2653,40 @@ mod tests {

let serialized = serde_json::to_string(&diffs).unwrap();

println!("{serialized}");

}

+ #[test]

+ fn test_patch_add_context() {

+ let dmp = DiffMatchPatch::default();

+ let mut ps = DiffMatchPatch::patch_from_text("@@ -21,4 +21,10 @@\n-jump\n+somersault\n");

+ let p = ps.first_mut().unwrap();

+ dmp.patch_add_context(p, b"The quick brown fox jumps over the lazy dog.");

+ assert_eq!("@@ -17,12 +17,18 @@\n fox \n-jump\n+somersault\n s ov\n", p.to_string());

+ // Same, but not enough trailing context.

+ // Fails

+ let mut ps = DiffMatchPatch::patch_from_text("@@ -21,4 +21,10 @@\n-jump\n+somersault\n");

+ let p = ps.first_mut().unwrap();

+ dmp.patch_add_context(p, b"The quick brown fox jumps.");

+ assert_eq!("@@ -17,10 +17,16 @@\n fox \n-jump\n+somersault\n s.\n", p.to_string());

+ // // Same, but not enough leading context.

+ // p = dmp.patch_fromText('@@ -3 +3,2 @@\n-e\n+at\n')[0];

+ // dmp.patch_addContext_(p, 'The quick brown fox jumps.');

+ // assertEquals('@@ -1,7 +1,8 @@\n Th\n-e\n+at\n qui\n', p.toString());

+ // // Same, but with ambiguity.

+ // p = dmp.patch_fromText('@@ -3 +3,2 @@\n-e\n+at\n')[0];

+ // dmp.patch_addContext_(p, 'The quick brown fox jumps. The quick brown fox crashes.');

+ // assertEquals('@@ -1,27 +1,28 @@\n Th\n-e\n+at\n quick brown fox jumps. \n', p.toString());

+ }

+ #[test]

+ fn test_parse_patch_header() {

+ assert_eq!(Some((21, Some(4), 21, Some(10))), DiffMatchPatch::parse_patch_header("@@ -21,4 +21,10 @@".as_bytes()));

+ assert_eq!(Some((3, None, 3, Some(2))), DiffMatchPatch::parse_patch_header("@@ -3 +3,2 @@".as_bytes()));

+ // Bad cases

+ assert!(DiffMatchPatch::parse_patch_header("@@ +3,2 @@".as_bytes()).is_none());

+ assert!(DiffMatchPatch::parse_patch_header("@@ 2046 +3,2 @@".as_bytes()).is_none());

+ }

}