my fork of dmp
Diffstat (limited to 'src/traits.rs')
| -rw-r--r-- | src/traits.rs | 102 |
1 files changed, 101 insertions, 1 deletions
diff --git a/src/traits.rs b/src/traits.rs index 3818837..f02c63c 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -3,7 +3,10 @@ use std::hash::Hash; use chrono::NaiveTime; use percent_encoding::{percent_decode, AsciiSet, CONTROLS}; -use crate::dmp::{Diff, DiffMatchPatch}; +use crate::{ + dmp::{Diff, DiffMatchPatch}, + Ops, +}; pub type Efficient = u8; pub type Compat = char; @@ -44,6 +47,10 @@ pub trait DType: Copy + Ord + Eq + Hash { fn percent_encode(input: &[Self]) -> Vec<Self>; fn percent_decode(input: &[Self]) -> Vec<Self>; + + fn humanize(_diffs: &mut Vec<Diff<Self>>) -> Result<(), crate::Error> { + Ok(()) + } } impl DType for u8 { @@ -112,6 +119,99 @@ impl DType for u8 { fn percent_decode(input: &[Self]) -> Vec<Self> { percent_decode(input).collect() } + + #[inline] + fn humanize(diffs: &mut Vec<Diff<Self>>) -> Result<(), crate::Error> { + let mut idx = 0_usize; + let mut err_prefix = vec![]; + + let mut err_start = None; + + // First pass, we'll chomp off errors in the diffs? + // The pattern we have seen is that + while idx < diffs.len() { + let diff = &mut diffs[idx]; + + if let Err(e) = std::str::from_utf8(diff.data()) { + // Errors can come in 2 forms + // 1. error at the end of bytes - we'll keep prefixing the error bytes to all non-equalities that follow + // 2. 
error at the beginning of bytes - this one is tricky - we'll need to figure out the suffix at which the rest of the string is valid + if e.error_len().is_none() && err_start.is_none() { + err_start = Some(idx); + + if diff.op() == Ops::Equal { + err_prefix = diff.data()[e.valid_up_to()..].to_vec(); + diff.1 = if e.valid_up_to() > 0 { + diff.data()[..e.valid_up_to()].to_vec() + } else { + vec![] + }; + + idx += 1; + continue; + } + } + + if let Some(err_start_idx) = err_start { + // For insert and delete, add the prefix collected earlier (end error bytes) + if diff.op() == Ops::Delete || diff.op() == Ops::Insert { + diff.1 = [&err_prefix, diff.data()].concat(); + } else { + if let Some(err_len) = e.error_len() { + // Iteratively figure out at what point the error goes away, if at all + let mut suffix = diff.data()[..err_len].to_vec(); + let mut data = diff.data()[err_len..].to_vec(); + + while let Err(e) = std::str::from_utf8(&data) { + if e.error_len().is_none() { + break; + } + + // should never panic because empty data is also valid UTF-8 + let first_byte = data.remove(0); + suffix.push(first_byte); + } + + // here, we have a suffix to be added to all previous cases and data that might be a good string or have an error at the end of bytes + // which is a separate cycle + + // Let's add the suffix to all the intermediate steps + diff.1 = data.to_vec(); + diffs + .iter_mut() + .take(idx) + .skip(err_start_idx) + .for_each(|d| { + if d.op() == Ops::Equal { + return; + } + d.1 = [d.data(), &suffix[..]].concat(); + }); + + // An equality within edits, let's seek the next one and update this suffix too + if data.is_empty() { + if idx < diffs.len() - 1 && diffs[idx + 1].op() != Ops::Equal { + diffs[idx + 1].1 = + [&err_prefix[..], &suffix, diffs[idx + 1].data()].concat(); + } + + diffs.remove(idx); + } + } + + // Move back to where all of this started + idx = err_start_idx; + err_start = None; + err_prefix = vec![]; + continue; + } + } + } + idx += 1; + } + + Ok(()) + } } impl 
DType for char { |