my fork of dmp
Finished with pretty html api
| -rw-r--r-- | src/dmp.rs | 264 | ||||
| -rw-r--r-- | src/errors.rs | 1 |
2 files changed, 219 insertions, 46 deletions
@@ -961,7 +961,6 @@ impl DiffMatchPatch { let delete = diffs[pointer - 1].data().to_vec(); let insert = diffs[pointer].data().to_vec(); - let delete_thres = delete.len() / 2 + delete.len() % 2; let insert_thres = insert.len() / 2 + insert.len() % 2; @@ -2469,47 +2468,143 @@ impl DiffMatchPatch { } /// Takes a diff array and returns a pretty HTML sequence. This function is mainly intended as an example from which to write ones own display functions. - pub fn diff_pretty_html(diffs: &[Diff<u8>]) -> String { - let html = diffs.iter().enumerate() - .map(|(idx, diff)| { - let txt = match str::from_utf8(diff.data()) { - Ok(txt) => { - txt.replace("&", "&").replace("<", "<") - .replace(">", ">").replace("\n", "¶<br>") - }, - Err(e) => { - println!("{e:?}"); - // finding previous of same type - let mut prev = idx; - while prev > 0 && diffs[prev].op() != diff.op() { - prev -= 1; + pub fn diff_pretty_html(diffs: &[Diff<u8>]) -> Result<String, crate::errors::Error> { + let mut diffs = diffs.to_vec(); + DiffMatchPatch::cleanup_semantic(&mut diffs); + + // let mut err_idx = None; + // let mut error_bytes = vec![]; + + let mut idx = 0_usize; + let mut err_prefix = vec![]; + + let mut err_start = None; + + // First pass, we'll chomp of errors in the diffs? + // The pattern we have seen is that + while idx < diffs.len() { + let diff = &mut diffs[idx]; + + // println!("[{idx}] {:?}: {:?}", diff.op(), diff.data()); + + if let Err(e) = str::from_utf8(diff.data()) { + // println!("{e:?} ------------ ErrStart[{err_start:?}]"); + + // Errors can come in 2 forms + // 1. error at the end of bytes - we'll keep prefixing the error bytes to all non equalities that follow + // 2. error at the begining of bytes - this one is tricky - we'll need to figure out the suffix at which the rest of the string is valid + if e.error_len().is_none() && err_start.is_none() { + err_start = Some(idx); + + if diff.op() == Ops::Equal { + err_prefix = diff.data()[e.valid_up_to()..].to_vec(); + diff.1 = if e.valid_up_to() > 0 { + diff.data()[..e.valid_up_to()].to_vec() + } else { + vec![] + }; + // println!("Err prefix: {:?} @ Index[{idx}]", err_prefix); + idx += 1; + continue; } - - println!("Prev: {:?}", diffs[prev]); - println!("{:?}", diff.data()); - - if idx < diffs.len() - 2 { - let mut next = idx + 1; - while next < diffs.len() - 1 && diffs[next].op() != diff.op() { - next += 1; + } + + if let Some(err_start_idx) = err_start { + // For insert and delete add the prefix collected earlier (end error bytes) + if diff.op() == Ops::Delete || diff.op() == Ops::Insert { + diff.1 = [&err_prefix, diff.data()].concat(); + // println!("{:?} After update prefix: {:?}", diff.op(), diff.data()); + } else { + if let Some(err_len) = e.error_len() { + // Iteratively figure out at what point does the error go away if at-all + let mut suffix = diff.data()[..err_len].to_vec(); + let mut data = diff.data()[err_len..].to_vec(); + + while let Err(e) = std::str::from_utf8(&data) { + if e.error_len().is_none() { + break; + } + + // should never panic cos empty data is also a valid utf8 + let first_byte = data.remove(0); + suffix.push(first_byte); + } + + // here, we have a suffix to be added to all previous cases and a data that might be good string or error at the end of bytes + // which is a separate cycle + + // println!("Err suffix: {suffix:?}"); + // Let's add the suffix to all the intermediate steps + diff.1 = data.to_vec(); + // println!("Current diff after update: {:?}", diff.data().to_vec()); + diffs + .iter_mut() + .take(idx) + .skip(err_start_idx) + .for_each(|d| { + if d.op() == Ops::Equal { + return; + } + d.1 = [d.data(), &suffix[..]].concat(); + // println!("[{:?}] After update suffix: {:?}", d.op(), d.data()); + }); + + // An equality within edits, lets seek the next one and update this suffix too + if data.is_empty() { + if idx < diffs.len() - 1 && diffs[idx + 1].op() != Ops::Equal { + diffs[idx + 1].1 = + [&err_prefix[..], &suffix, diffs[idx + 1].data()].concat(); + // println!("[{:?}] After update trivial suffix + prefix: {:?}", diffs[idx + 1].op(), diffs[idx + 1].data()); + } + + diffs.remove(idx); + } } - println!("Next: {:?}", diffs[next]); + // Move back to where all of this started + idx = err_start_idx; + err_start = None; + err_prefix = vec![]; + // println!("<<<<<<<<<<<<<<<<<<<<<<<<< Move back {idx}"); + continue; } - "error".to_string() - } - }; - - match diff.op() { - Ops::Insert => format!("<ins style=\"background:#e6ffe6;\">{txt}</ins>"), - Ops::Delete => format!("<del style=\"background:#ffe6e6;\">{txt}</del>"), - Ops::Equal => format!("<span>{txt}</span>") + } } - }) - .collect::<Vec<_>>() - .join(""); + idx += 1; + } + + let mut is_err = false; + let html = diffs + .iter() + .map(|diff| { + let txt = match str::from_utf8(diff.data()) { + Ok(txt) => txt + .replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace("\n", "¶<br>"), + Err(e) => { + eprintln!("{e:?}"); + is_err = true; + "error".to_string() + } + }; + + match diff.op() { + Ops::Insert => format!("<ins style=\"background:#e6ffe6;\">{txt}</ins>"), + Ops::Delete => format!("<del style=\"background:#ffe6e6;\">{txt}</del>"), + Ops::Equal => format!("<span>{txt}</span>"), + } + }) + .collect::<Vec<_>>() + .join(""); - html + if !is_err { + Ok(html) + } else { + Err(crate::errors::Error::HtmlWithError(html)) + } + // Ok(html) } pub fn match_main(&self, text: &str, pattern: &str, loc: usize) -> Option<usize> { @@ -3389,18 +3484,95 @@ mod tests { } #[test] - fn test_diff_pretty_html() { - // let diffs = [Diff::equal(b"a\n"), Diff::delete(b"<B>b</B>"), Diff::insert(b"c&d")]; - // assert_eq!("<span>a¶<br></span><del style=\"background:#ffe6e6;\"><B>b</B></del><ins style=\"background:#e6ffe6;\">c&d</ins>", DiffMatchPatch::diff_pretty_html(&diffs)); - + fn test_diff_pretty_html() -> Result<(), crate::errors::Error> { + // Basic + let diffs = [ + Diff::equal(b"a\n"), + Diff::delete(b"<B>b</B>"), + Diff::insert(b"c&d"), + ]; + assert_eq!("<span>a¶<br></span><del style=\"background:#ffe6e6;\"><B>b</B></del><ins style=\"background:#e6ffe6;\">c&d</ins>", DiffMatchPatch::diff_pretty_html(&diffs)?); + + // Monkey busiess around Emoticons and extended utf-8 π€ͺπ€©π€ + // This gave me a lot of heart-burn let dmp = DiffMatchPatch::default(); - let old = std::fs::read_to_string("testdata/txt_old.txt").unwrap(); - let new = std::fs::read_to_string("testdata/txt_new.txt").unwrap(); - let mut diffs = dmp.diff_main(&old, &new).unwrap(); - // DiffMatchPatch::cleanup_semantic(&mut diffs); - std::fs::write("testdata/diff.html", DiffMatchPatch::diff_pretty_html(&diffs)).unwrap(); - // println!("{}", ); + // Case 1. Two similar emoticons + // In bytes representation, these would have the last u8 different + // Which means the the diff should an equality block of 3 bytes folloed by insert and delete + let old = "π€ͺ"; // [240, 159, 164, 170] + let new = "π€"; // [240, 159, 164, 148] + let diffs = dmp.diff_main(old, new)?; + assert_eq!( + "<span></span><del style=\"background:#ffe6e6;\">π€ͺ</del><ins style=\"background:#e6ffe6;\">π€</ins>", + DiffMatchPatch::diff_pretty_html(&diffs)? + ); + + // Now Case 1. but with some text before and after + let old = "I'm puzzledπ€ͺ or am I?"; + let new = "I'm puzzledπ€ or thinking I guess!"; + let diffs = dmp.diff_main(old, new)?; + assert_eq!( + "<span>I'm puzzled</span><del style=\"background:#ffe6e6;\">π€ͺ</del><ins style=\"background:#e6ffe6;\">π€</ins><span> or </span><del style=\"background:#ffe6e6;\">am I?</del><ins style=\"background:#e6ffe6;\">thinking I guess!</ins>", + DiffMatchPatch::diff_pretty_html(&diffs)? + ); + + // Case 2. Emoticons with the third position different + let old = "π"; // [240, 159, 141, 138] + let new = "π"; // [240, 159, 140, 138] + let diffs = dmp.diff_main(old, new)?; + assert_eq!( + "<span></span><del style=\"background:#ffe6e6;\">π</del><ins style=\"background:#e6ffe6;\">π</ins>", + DiffMatchPatch::diff_pretty_html(&diffs)? + ); + + // Now Case 2. but with some text, lets complicate this + let old = "π, aah orange is the new black!"; // [240, 159, 141, 138] + let new = "Aah orange!πis the new π"; // [240, 159, 140, 138] + let diffs = dmp.diff_main(old, new)?; + assert_eq!( + "<del style=\"background:#ffe6e6;\">π, a</del><ins style=\"background:#e6ffe6;\">A</ins><span>ah orange</span><del style=\"background:#ffe6e6;\"> </del><ins style=\"background:#e6ffe6;\">!π</ins><span>is the new </span><del style=\"background:#ffe6e6;\">black!</del><ins style=\"background:#e6ffe6;\">π</ins>", + DiffMatchPatch::diff_pretty_html(&diffs)? + ); + + // Case 3. with second and third different, but lets complicate this with an equality + let old = "π "; // [240, 160, 140, 138] + let new = "π "; // [240, 150, 160, 138] + let diffs = dmp.diff_main(old, new)?; + assert_eq!( + "<span></span><ins style=\"background:#e6ffe6;\">π </ins><del style=\"background:#ffe6e6;\">π </del>", + DiffMatchPatch::diff_pretty_html(&diffs)? + ); + + // Case 3. but let there be a swap + let old = "π "; // [240, 158, 160, 132] + let new = std::str::from_utf8(&[240, 160, 158, 132]).unwrap(); // basically an undefined element `π `. Should still work + let diffs = dmp.diff_main(old, new)?; + assert_eq!( + "<span></span><del style=\"background:#ffe6e6;\">π </del><ins style=\"background:#e6ffe6;\">π </ins>", + DiffMatchPatch::diff_pretty_html(&diffs)? + ); + + // Case 4. swap at the last 2 positions + let old = "π"; // [240, 159, 141, 140] -- FINALLY A BANANA + let new = "π"; // [240, 159, 140, 141] -- interesting revelation - last 2 bytes swapped and π becomes π. Guess the world is going `Bananas!!` + let diffs = dmp.diff_main(old, new)?; + assert_eq!( + "<span></span><del style=\"background:#ffe6e6;\">π</del><ins style=\"background:#e6ffe6;\">π</ins>", + DiffMatchPatch::diff_pretty_html(&diffs)? + ); + + // Let's do this with a slightly longish string + let old = "Now, let's explore some emotional extremes π.\nWe've got your ecstatic face π€©, your devastated face π, and your utterly confused face π€―. But that's not all! π€ We've also got some subtle emotions like π, π, and π."; + let new = "Let's start with some basics π.\nWe've got your standard smiley face π, your sad face βΉοΈ, and your angry face π . But wait, there's more! π€© We've also got some more complex emotions like π, π€€, and π. And let's not forget about the classics: π, π, and π."; + let diffs = dmp.diff_main(old, new)?; + + assert_eq!( + "<del style=\"background:#ffe6e6;\">Now, let's explore some emotional extreme</del><ins style=\"background:#e6ffe6;\">Let's start with some basic</ins><span>s </span><del style=\"background:#ffe6e6;\">π</del><ins style=\"background:#e6ffe6;\">π</ins><span>.¶<br>We've got your </span><del style=\"background:#ffe6e6;\">ec</del><span>sta</span><del style=\"background:#ffe6e6;\">tic</del><ins style=\"background:#e6ffe6;\">ndard smiley</ins><span> face </span><del style=\"background:#ffe6e6;\">π€©</del><ins style=\"background:#e6ffe6;\">π</ins><span>, your </span><del style=\"background:#ffe6e6;\">devastate</del><ins style=\"background:#e6ffe6;\">sa</ins><span>d face </span><del style=\"background:#ffe6e6;\">π</del><ins style=\"background:#e6ffe6;\">βΉοΈ</ins><span>, and your </span><del style=\"background:#ffe6e6;\">utterly confused</del><ins style=\"background:#e6ffe6;\">angry</ins><span> face </span><del style=\"background:#ffe6e6;\">π€―</del><ins style=\"background:#e6ffe6;\">π </ins><span>. But </span><del style=\"background:#ffe6e6;\">that's not all</del><ins style=\"background:#e6ffe6;\">wait, there's more</ins><span>! </span><del style=\"background:#ffe6e6;\">π€</del><ins style=\"background:#e6ffe6;\">π€©</ins><span> We've also got some </span><del style=\"background:#ffe6e6;\">subt</del><ins style=\"background:#e6ffe6;\">more comp</ins><span>le</span><ins style=\"background:#e6ffe6;\">x</ins><span> emotions like </span><del style=\"background:#ffe6e6;\">π</del><ins style=\"background:#e6ffe6;\">π, π€€, and π. And let's not forget about the classics: π</ins><span>, </span><del style=\"background:#ffe6e6;\">π</del><ins style=\"background:#e6ffe6;\">π</ins><span>, and </span><del style=\"background:#ffe6e6;\">π</del><ins style=\"background:#e6ffe6;\">π</ins><span>.</span>", + DiffMatchPatch::diff_pretty_html(&diffs)? + ); + + Ok(()) } #[test] diff --git a/src/errors.rs b/src/errors.rs index 048e56a..4562239 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -2,4 +2,5 @@ pub enum Error { InvalidInput, Utf8Error, + HtmlWithError(String), } |