e
init
| -rwxr-xr-x | .gitignore | 1 | ||||
| -rwxr-xr-x | Cargo.lock | 41 | ||||
| -rwxr-xr-x | Cargo.toml | 28 | ||||
| -rwxr-xr-x | LICENSE | 20 | ||||
| -rwxr-xr-x | README.md | 53 | ||||
| -rwxr-xr-x | benches/iai.rs | 25 | ||||
| -rwxr-xr-x | benches/iai_simd.rs | 45 | ||||
| -rwxr-xr-x | src/color.rs | 109 | ||||
| -rwxr-xr-x | src/color/serial.rs | 49 | ||||
| -rwxr-xr-x | src/color/simd.rs | 73 | ||||
| -rwxr-xr-x | src/iter.rs | 258 | ||||
| -rwxr-xr-x | src/lib.rs | 205 | ||||
| -rwxr-xr-x | src/test.rs | 49 | ||||
| -rwxr-xr-x | src/traits.rs | 36 |
14 files changed, 992 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100755 index 0000000..2f7896d --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +target/ diff --git a/Cargo.lock b/Cargo.lock new file mode 100755 index 0000000..6b0a465 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,41 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "iai" +version = "0.1.1" +source = "git+https://github.com/bend-n/iai#30f77d9d165f8d2e1fcea8f87bd01cc03d821ae6" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "imgref" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90d944e334f00f4449c9640b440a171f816be0152305c12ef90424fc35fd035c" + +[[package]] +name = "imgref-iter" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58112733c0ee178b66dceba49e0445d0b293268719de579d2c7dd4c2290976aa" +dependencies = [ + "imgref", +] + +[[package]] +name = "slur" +version = "0.1.0" +dependencies = [ + "iai", + "imgref", + "imgref-iter", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100755 index 0000000..d580fa8 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = 'slur' +version = '0.1.0' +authors = ['LoganDark'] +edition = '2021' +description = 'A fast, iterative, correct approach to Stackblur, resulting in a very smooth and high-quality output, with no edge bleeding' +readme = 'README.md' +repository = 'https://github.com/bend-n/slur' +license = 'MIT' +keywords = ['stackblur', 'blur', 'gaussian'] +categories = ['algorithms', 'graphics', 'rendering'] + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +imgref = '^1.9.2' +imgref-iter = { version = '~0.4.0', features = ["simd"] } + +[dev-dependencies] +iai = { git = "https://github.com/bend-n/iai" } + +[[bench]] +name = 'iai' +harness = false + +[[bench]] +name = 'iai_simd' +harness = false @@ -0,0 +1,20 @@ +MIT License + +Copyright © 2022-present LoganDark + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md new file mode 100755 index 0000000..f1c4f08 --- /dev/null +++ b/README.md @@ -0,0 +1,53 @@ +A fast, iterative, correct approach to Stackblur, resulting in a very smooth and +high-quality output, with no edge bleeding. + +This crate implements a tweaked version of the Stackblur algorithm requiring +`radius * 2 + 2` elements of space rather than `radius * 2 + 1`, which is a +small tradeoff for much-increased visual quality. + +The algorithm is exposed as an iterator (`StackBlur`) that can wrap any other +iterator that yields elements of `StackBlurrable`. The `StackBlur` will then +yield elements blurred by the specified radius. + +## Benefits of this crate + +Stackblur is essentially constant-time. Regardless of the radius, it always +performs only 1 scan over the input iterator and outputs exactly the same amount +of elements. + +Additionally, it produces results that are comparable to slow and expensive +Gaussian blurs. As opposed to box blur which uses a basic rolling average, +Stackblur uses a weighted average where each output pixel is affected more +strongly by the inputs that were closest to it. + +Despite that, Stackblur does not perform much worse compared to naive box blurs, +and is quite cheap compared to full Gaussian blurs, at least for the CPU. The +implementation in this crate will most likely beat most unoptimized blurs you +can find on crates.io, as well as some optimized ones, and it is extremely +flexible and generic. + +For a full explanation of the improvements made to the Stackblur algorithm, see +the `iter` module. + +https://user-images.githubusercontent.com/4723091/173788732-2e3e125e-f7b3-4e0f-8582-cc2c148ba437.mp4 + +*(In the above video, `stackblur-iter` is the centered blur, whereas the +full-width one is another `stackblur` crate.)* + +## Comparison to the `stackblur` crate + +`stackblur` suffers from edge bleeding and flexibility problems. For +example, it can only operate on buffers of 32-bit integers, and expects them +to be packed linear ARGB pixels. Additionally, it cannot operate on a 2D +subslice of a buffer (like `imgref` allows for this crate), and it does not +offer any streaming iterators or documentation. And it also only supports +a blur radius of up to 255. + +## Usage + +Aside from `StackBlurrable` and `StackBlur` which host their own documentation, +there are helper functions like `blur` and `blur_argb` that can be used to +interact with 2D image buffers, due to the fact that doing so manually involves +unsafe code (if you want no-copy). + +See the [full documentation](https://docs.rs/stackblur-iter) for more. diff --git a/benches/iai.rs b/benches/iai.rs new file mode 100755 index 0000000..d28f6bc --- /dev/null +++ b/benches/iai.rs @@ -0,0 +1,25 @@ +use imgref::{Img, ImgRefMut}; + +const WIDTH: usize = 640; +const HEIGHT: usize = 480; + +static mut BUFFER: [u32; WIDTH * HEIGHT] = [0; WIDTH * HEIGHT]; + +#[inline(always)] +fn img() -> ImgRefMut<'static, u32> { + Img::new(unsafe { &mut BUFFER[..] }, WIDTH, HEIGHT) +} + +fn blur_argb_16() { + slur::blur_argb(&mut img(), 16) +} +fn blur_argb_128() { + slur::blur_argb(&mut img(), 128) +} +fn blur_argb_1024() { + slur::blur_argb(&mut img(), 1024) +} + +// parallel versions are non-deterministic + +iai::main!(blur_argb_16, blur_argb_128, blur_argb_1024,); diff --git a/benches/iai_simd.rs b/benches/iai_simd.rs new file mode 100755 index 0000000..c3a20c9 --- /dev/null +++ b/benches/iai_simd.rs @@ -0,0 +1,45 @@ +use imgref::{Img, ImgRefMut}; + +const WIDTH: usize = 640; +const HEIGHT: usize = 480; + +static mut BUFFER: [u32; WIDTH * HEIGHT] = [0; WIDTH * HEIGHT]; + +#[inline(always)] +fn img() -> ImgRefMut<'static, u32> { + Img::new(unsafe { &mut BUFFER[..] }, WIDTH, HEIGHT) +} + +use slur::simd_blur_argb; + +fn blur_argb_u32x1() { + simd_blur_argb::<1>(&mut img(), 16) +} +fn blur_argb_u32x2() { + simd_blur_argb::<2>(&mut img(), 16) +} +fn blur_argb_u32x4() { + simd_blur_argb::<4>(&mut img(), 16) +} +fn blur_argb_u32x8() { + simd_blur_argb::<8>(&mut img(), 16) +} +fn blur_argb_u32x16() { + simd_blur_argb::<16>(&mut img(), 16) +} +fn blur_argb_u32x32() { + simd_blur_argb::<32>(&mut img(), 16) +} +fn blur_argb_u32x64() { + simd_blur_argb::<64>(&mut img(), 16) +} + +iai::main!( + blur_argb_u32x1, + blur_argb_u32x2, + blur_argb_u32x4, + blur_argb_u32x8, + blur_argb_u32x16, + blur_argb_u32x32, + blur_argb_u32x64, +); diff --git a/src/color.rs b/src/color.rs new file mode 100755 index 0000000..96efe3b --- /dev/null +++ b/src/color.rs @@ -0,0 +1,109 @@ +use crate::StackBlurrable; +use std::ops::{Add, AddAssign, Div, Mul, Sub, SubAssign}; + +mod serial; +mod simd; + +pub use serial::BlurU32; +pub use simd::u32xN; + +use std::simd::{LaneCount, Simd, SupportedLaneCount}; + +#[repr(transparent)] +#[derive(Copy, Clone, Eq, PartialEq, Debug, Default)] +pub struct Argb<T: StackBlurrable>([T; 4]); + +impl From<u32> for Argb<BlurU32> { + fn from(argb: u32) -> Self { + let [a, r, g, b] = argb.to_be_bytes(); + let cvt = |i| BlurU32(i as u32); + Self([cvt(a), cvt(r), cvt(g), cvt(b)]) + } +} + +impl From<Argb<BlurU32>> for u32 { + fn from(Argb([a, r, g, b]): Argb<BlurU32>) -> Self { + let cvt = |i: BlurU32| i.0 as u8; + u32::from_be_bytes([cvt(a), cvt(r), cvt(g), cvt(b)]) + } +} + +impl<const N: usize> From<[u32; N]> for Argb<u32xN<N>> +where + LaneCount<N>: SupportedLaneCount, +{ + fn from(values: [u32; N]) -> Self { + let arrs: [[u8; 4]; N] = values.map(u32::to_be_bytes); + Self([ + u32xN(Simd::from_array(arrs.map(|a| a[0] as u32))), + u32xN(Simd::from_array(arrs.map(|a| a[1] as u32))), + u32xN(Simd::from_array(arrs.map(|a| a[2] as u32))), + u32xN(Simd::from_array(arrs.map(|a| a[3] as u32))), + ]) + } +} + +impl<const N: usize> From<Argb<u32xN<N>>> for [u32; N] +where + LaneCount<N>: SupportedLaneCount, +{ + fn from(value: Argb<u32xN<N>>) -> Self { + let [a, r, g, b] = value.0.map(|i| i.0.to_array()); + std::array::from_fn(|i| u32::from_be_bytes([a[i], r[i], g[i], b[i]].map(|x| x as u8))) + } +} + +impl<T: StackBlurrable> Add for Argb<T> { + type Output = Self; + + fn add(mut self, rhs: Self) -> Self::Output { + self += rhs; + self + } +} + +impl<T: StackBlurrable> Sub for Argb<T> { + type Output = Self; + + fn sub(mut self, rhs: Self) -> Self::Output { + self -= rhs; + self + } +} + +impl<T: StackBlurrable> AddAssign for Argb<T> { + fn add_assign(&mut self, rhs: Self) { + let [a, r, g, b] = rhs.0; + self.0[0] += a; + self.0[1] += r; + self.0[2] += g; + self.0[3] += b; + } +} + +impl<T: StackBlurrable> SubAssign for Argb<T> { + fn sub_assign(&mut self, rhs: Self) { + let [a, r, g, b] = rhs.0; + self.0[0] -= a; + self.0[1] -= r; + self.0[2] -= g; + self.0[3] -= b; + } +} + +impl<T: StackBlurrable> Mul<usize> for Argb<T> { + type Output = Self; + + fn mul(self, rhs: usize) -> Self::Output { + let [a, r, g, b] = self.0; + Self([a * rhs, r * rhs, g * rhs, b * rhs]) + } +} + +impl<T: StackBlurrable> Div<usize> for Argb<T> { + type Output = Self; + fn div(self, rhs: usize) -> Self::Output { + let [a, r, g, b] = self.0; + Self([a / rhs, r / rhs, g / rhs, b / rhs]) + } +} diff --git a/src/color/serial.rs b/src/color/serial.rs new file mode 100755 index 0000000..e353055 --- /dev/null +++ b/src/color/serial.rs @@ -0,0 +1,49 @@ +use std::ops::{Add, AddAssign, Div, Mul, Sub, SubAssign}; + +#[derive(Copy, Clone, Eq, PartialEq, Debug, Default)] +pub struct BlurU32(pub u32); + +impl Add for BlurU32 { + type Output = Self; + + fn add(self, rhs: Self) -> Self::Output { + Self(self.0.wrapping_add(rhs.0)) + } +} + +impl Sub for BlurU32 { + type Output = Self; + + fn sub(self, rhs: Self) -> Self::Output { + Self(self.0.wrapping_sub(rhs.0)) + } +} + +impl AddAssign for BlurU32 { + fn add_assign(&mut self, rhs: Self) { + self.0 = self.0.wrapping_add(rhs.0); + } +} + +impl SubAssign for BlurU32 { + fn sub_assign(&mut self, rhs: Self) { + self.0 = self.0.wrapping_sub(rhs.0); + } +} + +impl Mul<usize> for BlurU32 { + type Output = Self; + + fn mul(self, rhs: usize) -> Self::Output { + Self(self.0.wrapping_mul(rhs as u32)) + } +} + +impl Div<usize> for BlurU32 { + type Output = Self; + + #[inline] + fn div(self, rhs: usize) -> Self::Output { + Self(self.0.wrapping_div(rhs as u32)) + } +} diff --git a/src/color/simd.rs b/src/color/simd.rs new file mode 100755 index 0000000..3b0e593 --- /dev/null +++ b/src/color/simd.rs @@ -0,0 +1,73 @@ +use std::ops::{Add, AddAssign, Div, Mul, Sub, SubAssign}; + +pub use std::simd::{LaneCount, Simd, SupportedLaneCount}; + +#[derive(Copy, Clone, Eq, PartialEq, Debug, Default)] +#[allow(non_camel_case_types)] +pub struct u32xN<const N: usize>(pub Simd<u32, N>) +where + LaneCount<N>: SupportedLaneCount; + +impl<const N: usize> Add for u32xN<N> +where + LaneCount<N>: SupportedLaneCount, +{ + type Output = Self; + + fn add(self, rhs: Self) -> Self::Output { + Self(self.0 + rhs.0) + } +} + +impl<const N: usize> Sub for u32xN<N> +where + LaneCount<N>: SupportedLaneCount, +{ + type Output = Self; + + fn sub(self, rhs: Self) -> Self::Output { + Self(self.0 - rhs.0) + } +} + +impl<const N: usize> AddAssign for u32xN<N> +where + LaneCount<N>: SupportedLaneCount, +{ + fn add_assign(&mut self, rhs: Self) { + self.0 += rhs.0; + } +} + +impl<const N: usize> SubAssign for u32xN<N> +where + LaneCount<N>: SupportedLaneCount, +{ + fn sub_assign(&mut self, rhs: Self) { + self.0 -= rhs.0; + } +} + +impl<const N: usize> Mul<usize> for u32xN<N> +where + LaneCount<N>: SupportedLaneCount, +{ + type Output = Self; + + fn mul(self, rhs: usize) -> Self::Output { + Self(self.0 * Simd::<u32, N>::splat(rhs as u32)) + } +} + +impl<const N: usize> Div<usize> for u32xN<N> +where + LaneCount<N>: SupportedLaneCount, +{ + type Output = Self; + + fn div(self, rhs: usize) -> Self::Output { + Self(Simd::<u32, N>::from_array( + self.0.to_array().map(|e| (e as usize / rhs) as u32), + )) + } +} diff --git a/src/iter.rs b/src/iter.rs new file mode 100755 index 0000000..7fc7e38 --- /dev/null +++ b/src/iter.rs @@ -0,0 +1,258 @@ +//! This module implements the [`StackBlur`] struct and the improved version of +//! the Stackblur algorithm. +//! +//! ## The improved Stackblur algorithm +//! +//! As previously stated, this crate implements a modified version of Stackblur, +//! and understanding the original algorithm is key to understanding the +//! improvements that have been made to it. +//! +//! The original Stackblur is essentially a weighted box-blur. Instead of taking +//! the average of a flat array of pixels, like this: +//! +//! ```text +//! ( 00 + 01 + 02 + 03 + 04 + 05 + 06 ) / 7 +//! ``` +//! +//! it takes a weighted average of the pixels: +//! +//! ```text +//! ( 03 + +//! 02 + 03 + 04 + +//! 01 + 02 + 03 + 04 + 05 + +//! 00 + 01 + 02 + 03 + 04 + 05 + 06 ) / 16 +//! ``` +//! +//! This is a rough approximation of a Gaussian blur, and in fact it's already +//! most of the way there to being a complete algorithm on its own. You can just +//! multiply each pixel by something like `radius + 1 - center_dist` when you +//! make your sum, then divide by `radius * (radius + 2) + 1`. +//! +//! But there are two problems with this: +//! +//! 1. That would be *O*(n(m * 2 + 1)²) complexity, where `n` is the number of +//! pixels in the image, and `m` is the radius of the blur. This is basically +//! just as expensive as running a proper convolution filter. +//! +//! 2. How do we handle pixels off the edge of the image? +//! +//! I'm scared of #1 so I'm going to handle #2 first. What most implementations +//! choose to do is just repeat the edge of the image: +//! +//! ```text +//! ( 00 + +//! 00 + 00 + 01 + +//! 00 + 00 + 00 + 01 + 02 + +//! 00 + 00 + 00 + 00 + 01 + 02 + 03 ) / 16 +//! ``` +//! +//! But this creates even more problems, because the edge of the blur will be +//! quite biased towards the pixels that aren't even in the image. This is known +//! as "edge bleeding", where a single pixel at the edge of a blur can cause +//! very large and ugly artifacts to show up. +//! +//! The solution, of course, is to not calculate the denominator using that +//! equation from earlier, and instead incrementally update the denominator as +//! pixels are scanned, allowing you to sum a varying number of pixels: +//! +//! ```text +//! ( 00 + +//! 00 + 01 + +//! 00 + 01 + 02 + +//! 00 + 01 + 02 + 03 ) / 10 +//! +//! ( 01 + +//! 00 + 01 + 02 + +//! 00 + 01 + 02 + 03 + +//! 00 + 01 + 02 + 03 + 04 ) / 13 +//! +//! ( 02 + +//! 01 + 02 + 03 + +//! 00 + 01 + 02 + 03 + 04 + +//! 00 + 01 + 02 + 03 + 04 + 05 ) / 15 +//! +//! ( 03 + +//! 02 + 03 + 04 + +//! 01 + 02 + 03 + 04 + 05 + +//! 00 + 01 + 02 + 03 + 04 + 05 + 06 ) / 16 +//! ``` +//! +//! This is one of the improvements made to the Stackblur algorithm that is +//! implemented by this crate. +//! +//! Now for #1 - the complexity problem. It is possible to make a streaming +//! Stackblur that does not need to read any pixels out of order or more than +//! once, or even know how long the input is. In fact, that's exactly what this +//! crate does. +//! +//! First you fill up the cache with `radius + 1` pixels in order to be able to +//! make the first sum, then you start producing results until you run out of +//! pixels, then you produce the last `radius` results using what you have in +//! cache. I don't have the skill to explain the algorithm in full detail, but +//! it's open-source so you can look at it if you want. +//! +//! In this crate the "cache" is not actually the actual heap of values, as that +//! would be too slow. Instead, the "cache" is a list of changes to make to the +//! rate of change of the sum, and the denominator is updated incrementally in +//! response to the number of leading and trailing values changing. +//! +//! The reason the whole rate of change thing exists is so that adding to the +//! sum and registering it in the cache becomes *O*(1) instead of *O*(2n+1) +//! (where `n` is the radius). It's basically the most important thing that +//! makes the algorithm constant-time. + +use std::collections::VecDeque; + +use crate::traits::StackBlurrable; + +/// An iterator that implements an improved Stackblur algorithm. +/// +/// For any [`StackBlurrable`] element `T` and any iterator `I` over items of +/// type `T`, [`StackBlur`] will yield the same amount of elements as `I`, +/// blurred together according to the improved Stackblur algorithm as described +/// by the [`iter`][crate::iter] module documentation. +/// +/// ## Usage +/// +/// [`StackBlur`] just wraps another iterator with a radius and a cache (called +/// `ops` by [`StackBlur::new`]): +/// +/// ```compile_fail +/// # use std::collections::VecDeque; +/// # use std::num::Wrapping; +/// # use stackblur_iter::iter::StackBlur; +/// # +/// let arr = [255u8, 0, 0, 0, 127, 0, 0, 0, 255u8]; +/// let iter = arr.iter().copied().map(|i| Wrapping(i as usize)); +/// let blur = StackBlur::new(iter, 2, VecDeque::new()); +/// ``` +/// +/// That example unfortunately doesn't compile because `Wrapping<usize>` doesn't +/// implement `Mul<usize>` and `Div<usize>`, which are required for the +/// averaging that [`StackBlur`] performs. It's recommended to create a newtype +/// wrapper around whatever you plan on blurring, and implement all the traits +/// required by [`StackBlurrable`]. +/// +/// A [`StackBlur`] always yields exactly as many items as its inner iterator +/// does. Additionally, a non-fused iterator which repeats will cause the +/// [`StackBlur`] to repeat as well. +pub struct StackBlur<'a, T: StackBlurrable, I: Iterator<Item = T>> { + iter: I, + radius: usize, + sum: T, + rate: T, + dnom: usize, + ops: &'a mut VecDeque<T>, + leading: usize, + trailing: usize, + done: bool, + first: bool, +} + +impl<'a, T: StackBlurrable, I: Iterator<Item = T>> StackBlur<'a, T, I> { + /// Creates a new [`StackBlur`] from the provided iterator, radius, and + /// [`VecDeque`]. + /// + /// The iterator is not advanced until a call to [`StackBlur::next`]. + pub fn new(iter: I, radius: usize, ops: &'a mut VecDeque<T>) -> Self { + Self { + iter, + radius, + sum: T::default(), + rate: T::default(), + dnom: 0, + leading: 0, + trailing: 0, + ops, + done: true, + first: true, + } + } + + fn init(&mut self) { + self.done = false; + + for sub in 0..=self.radius { + let Some(item) = self.iter.next() else { break }; + + if sub == 0 { + let start = self.radius + 1; + let needed = start * 2 + 2; + self.ops.reserve(needed.saturating_sub(self.ops.capacity())); + self.ops + .iter_mut() + .take(start) + .for_each(|place| *place = T::default()); + self.ops.resize_with(start, T::default); + + self.sum = T::default(); + self.rate = T::default(); + self.dnom = 0; + self.leading = 0; + self.trailing = 0; + } + + let mul = self.radius + 1 - sub; + self.sum += item * mul; + self.rate += item; + self.dnom += mul; + + if self.dnom > mul { + self.trailing += 1; + } + + self.ops[sub] -= item * 2; + self.ops.push_back(item); + } + + if self.dnom == 0 { + self.done = true; + } + } +} + +impl<T: StackBlurrable, I: Iterator<Item = T>> Iterator for StackBlur<'_, T, I> { + type Item = T; + + #[inline] + fn next(&mut self) -> Option<Self::Item> { + if self.done { + self.init(); + + if !std::mem::replace(&mut self.first, false) { + return None; + } + } + + let result = self.sum / self.dnom; + + self.rate += self.ops.pop_front().unwrap(); + self.sum += self.rate; + + if self.leading < self.radius { + self.leading += 1; + self.dnom += self.radius + 1 - self.leading; + } + + if self.radius == 0 || self.trailing == self.radius { + if let Some(item) = self.iter.next() { + self.sum += item; + self.rate += item; + self.ops[self.radius] -= item * 2; + self.ops.push_back(item); + } else if self.radius > 0 { + self.dnom -= self.radius + 1 - self.trailing; + self.trailing -= 1; + } else { + self.done = true; + } + } else if self.trailing > 0 { + self.dnom -= self.radius + 1 - self.trailing; + self.trailing -= 1; + } else if self.trailing == 0 { + self.done = true; + } + + Some(result) + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100755 index 0000000..6cf1b17 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,205 @@ +//! A fast, iterative, correct approach to Stackblur, resulting in a very smooth +//! and high-quality output, with no edge bleeding. +//! +//! This crate implements a tweaked version of the Stackblur algorithm requiring +//! `radius * 2 + 2` elements of space rather than `radius * 2 + 1`, which is a +//! small tradeoff for much-increased visual quality. +//! +//! The algorithm is exposed as an iterator ([`StackBlur`]) that can wrap any +//! other iterator that yields elements of [`StackBlurrable`]. The [`StackBlur`] +//! will then yield elements blurred by the specified radius. +//! +//! ## Benefits of this crate +//! +//! Stackblur is essentially constant-time. Regardless of the radius, it always +//! performs only 1 scan over the input iterator and outputs exactly the same +//! amount of elements. +//! +//! Additionally, it produces results that are comparable to slow and expensive +//! Gaussian blurs. As opposed to box blur which uses a basic rolling average, +//! Stackblur uses a weighted average where each output pixel is affected more +//! strongly by the inputs that were closest to it. +//! +//! Despite that, Stackblur does not perform much worse compared to naive box +//! blurs, and is quite cheap compared to full Gaussian blurs, at least for the +//! CPU. The implementation in this crate will most likely beat most unoptimized +//! blurs you can find on crates.io, as well as some optimized ones, and it is +//! extremely flexible and generic. +//! +//! For a full explanation of the improvements made to the Stackblur algorithm, +//! see the [`iter`] module. +//! +//! ## Comparison to the `stackblur` crate +//! +//! `stackblur` suffers from edge bleeding and flexibility problems. For +//! example, it can only operate on buffers of 32-bit integers, and expects them +//! to be packed linear ARGB pixels. Additionally, it cannot operate on a 2D +//! subslice of a buffer (like `imgref` allows for this crate), and it does not +//! offer any streaming iterators or documentation. And it also only supports +//! a blur radius of up to 255. +//! +//! ## Usage +//! +//! Aside from [`StackBlurrable`] and [`StackBlur`] which host their own +//! documentation, there are helper functions like [`blur`] and [`blur_argb`] +//! that can be used to interact with 2D image buffers, due to the fact that +//! doing so manually involves unsafe code (if you want no-copy). + +#![feature(portable_simd, stmt_expr_attributes)] +#![cfg_attr(test, feature(test))] + +use std::collections::VecDeque; +use std::simd::{LaneCount, SupportedLaneCount}; + +pub extern crate imgref; + +use imgref::ImgRefMut; + +#[cfg(test)] +mod test; + +pub mod color; +pub mod iter; +pub mod traits; + +use color::Argb; +use iter::StackBlur; +use traits::StackBlurrable; + +/// Blurs a buffer, assuming one element per pixel. +/// +/// The provided closures are used to convert from the buffer's native pixel +/// format to [`StackBlurrable`] values that can be consumed by [`StackBlur`]. +pub fn blur<T, B: StackBlurrable>( + buffer: &mut ImgRefMut<T>, + radius: usize, + mut to_blurrable: impl FnMut(&T) -> B, + mut to_pixel: impl FnMut(B) -> T, +) { + use imgref_iter::iter::{IterWindows, IterWindowsPtrMut}; + use imgref_iter::traits::{ImgIter, ImgIterMut, ImgIterPtrMut}; + + let mut ops = VecDeque::new(); + + // This is needed to avoid Undefined Behavior. Writing to the rows of the + // must be done before constructing the columns iterators, because otherwise + // the writes would invalidate their borrows. However I don't want to + // duplicate this loop, so make it a closure. + let mut blur_windows = |writer: IterWindowsPtrMut<T>, reader: IterWindows<T>| { + for (write, read) in writer.zip(reader) { + let mut blur = StackBlur::new(read.map(&mut to_blurrable), radius, &mut ops); + write.for_each(|place| unsafe { *place = to_pixel(blur.next().unwrap()) }); + } + }; + + let buffer_ptr = buffer.as_mut_ptr(); + blur_windows( + unsafe { buffer_ptr.iter_rows_ptr_mut() }, + buffer.iter_rows(), + ); + blur_windows( + unsafe { buffer_ptr.iter_cols_ptr_mut() }, + buffer.iter_cols(), + ); +} + +/// Blurs a buffer with SIMD, assuming one element per pixel. +/// +/// The provided closures are used to convert from the buffer's native pixel +/// format to [`StackBlurrable`] values that can be consumed by [`StackBlur`]. +pub fn simd_blur<T, Bsimd: StackBlurrable, Bsingle: StackBlurrable, const LANES: usize>( + buffer: &mut ImgRefMut<T>, + radius: usize, + mut to_blurrable_simd: impl FnMut([&T; LANES]) -> Bsimd, + mut to_pixel_simd: impl FnMut(Bsimd) -> [T; LANES], + mut to_blurrable_single: impl FnMut(&T) -> Bsingle, + mut to_pixel_single: impl FnMut(Bsingle) -> T, +) where + LaneCount<LANES>: SupportedLaneCount, +{ + #[cfg(not(doc))] + use imgref_iter::iter::{ + SimdIterWindow, SimdIterWindowPtrMut, SimdIterWindows, SimdIterWindowsPtrMut, + }; + #[cfg(not(doc))] + use imgref_iter::traits::{ImgIterMut, ImgSimdIter, ImgSimdIterPtrMut}; + + let mut ops_simd = VecDeque::new(); + let mut ops_single = VecDeque::new(); + + let mut simd_blur_windows = + |writer: SimdIterWindowsPtrMut<T, LANES>, + reader: SimdIterWindows<T, LANES>, + mut ops_simd: VecDeque<Bsimd>, + mut ops_single: VecDeque<Bsingle>| { + for (write, read) in writer.zip(reader) { + match (write, read) { + (SimdIterWindowPtrMut::Simd(write), SimdIterWindow::Simd(read)) => { + let mut blur = + StackBlur::new(read.map(&mut to_blurrable_simd), radius, &mut ops_simd); + write.for_each(|place| { + place + .into_iter() + .zip(to_pixel_simd(blur.next().unwrap())) + .for_each(|(place, pixel)| unsafe { *place = pixel }); + }); + } + + (SimdIterWindowPtrMut::Single(write), SimdIterWindow::Single(read)) => { + let mut blur = StackBlur::new( + read.map(&mut to_blurrable_single), + radius, + &mut ops_single, + ); + write.for_each(|place| unsafe { + *place = to_pixel_single(blur.next().unwrap()); + }); + } + + _ => unreachable!(), + } + } + + (ops_simd, ops_single) + }; + + let buffer_ptr = buffer.as_mut_ptr(); + (ops_simd, ops_single) = simd_blur_windows( + unsafe { buffer_ptr.simd_iter_rows_ptr_mut::<LANES>() }, + buffer.simd_iter_rows::<LANES>(), + ops_simd, + ops_single, + ); + simd_blur_windows( + unsafe { buffer_ptr.simd_iter_cols_ptr_mut::<LANES>() }, + buffer.simd_iter_cols::<LANES>(), + ops_simd, + ops_single, + ); +} + +/// Blurs a buffer of 32-bit packed ARGB pixels (0xAARRGGBB). +/// +/// This is a version of [`blur`] with pre-filled conversion routines that +/// provide good results for blur radii <= 4096. Larger radii may overflow. +pub fn blur_argb(buffer: &mut ImgRefMut<u32>, radius: usize) { + blur(buffer, radius, |i| Argb::from(*i), Argb::into); +} + +/// Blurs a buffer of 32-bit packed ARGB pixels (0xAARRGGBB) with SIMD. +/// +/// This is a version of [`simd_blur`] with pre-filled conversion routines that +/// provide good results for blur radii <= 4096. Larger radii may overflow. +pub fn simd_blur_argb<const LANES: usize>(buffer: &mut ImgRefMut<u32>, radius: usize) +where + LaneCount<LANES>: SupportedLaneCount, +{ + simd_blur( + buffer, + radius, + |i: [&u32; LANES]| Argb::from(i.map(u32::clone)), + Argb::into, + |i| Argb::from(*i), + Argb::into, + ); +} diff --git a/src/test.rs b/src/test.rs new file mode 100755 index 0000000..2993e93 --- /dev/null +++ b/src/test.rs @@ -0,0 +1,49 @@ +extern crate test; + +use imgref::ImgVec; +use test::Bencher; + +const WIDTH: usize = 640; +const HEIGHT: usize = 480; + +#[bench] +#[inline(never)] +fn blur_argb_16(bencher: &mut Bencher) { + let mut buf = ImgVec::new(vec![0; WIDTH * HEIGHT], WIDTH, HEIGHT); + bencher.iter(|| crate::blur_argb(&mut buf.as_mut(), 16)); +} + +#[bench] +#[inline(never)] +fn blur_argb_128(bencher: &mut Bencher) { + let mut buf = ImgVec::new(vec![0; WIDTH * HEIGHT], WIDTH, HEIGHT); + bencher.iter(|| crate::blur_argb(&mut buf.as_mut(), 128)); +} + +#[bench] +#[inline(never)] +fn blur_argb_1024(bencher: &mut Bencher) { + let mut buf = ImgVec::new(vec![0; WIDTH * HEIGHT], WIDTH, HEIGHT); + bencher.iter(|| crate::blur_argb(&mut buf.as_mut(), 1024)); +} + +#[bench] +#[inline(never)] +fn simd_blur_argb_16(bencher: &mut Bencher) { + let mut buf = ImgVec::new(vec![0; WIDTH * HEIGHT], WIDTH, HEIGHT); + bencher.iter(|| crate::simd_blur_argb::<8>(&mut buf.as_mut(), 16)); +} + +#[bench] +#[inline(never)] +fn simd_blur_argb_128(bencher: &mut Bencher) { + let mut buf = ImgVec::new(vec![0; WIDTH * HEIGHT], WIDTH, HEIGHT); + bencher.iter(|| crate::simd_blur_argb::<8>(&mut buf.as_mut(), 128)); +} + +#[bench] +#[inline(never)] +fn simd_blur_argb_1024(bencher: &mut Bencher) { + let mut buf = ImgVec::new(vec![0; WIDTH * HEIGHT], WIDTH, HEIGHT); + bencher.iter(|| crate::simd_blur_argb::<8>(&mut buf.as_mut(), 1024)); +} diff --git a/src/traits.rs b/src/traits.rs new file mode 100755 index 0000000..852b1d0 --- /dev/null +++ b/src/traits.rs @@ -0,0 +1,36 @@ +//! The home of [`StackBlurrable`]. + +use std::ops::{Add, AddAssign, Div, Mul, SubAssign}; + +/// The trait for types which can be blurred by [`StackBlur`][crate::StackBlur]. +/// +/// This trait is auto-implemented for all types that satisfy its requirements. +/// +/// Types that wish to implement this trait should be signed or use explicitly +/// wrapping arithmetic. +/// +/// They should have a significantly higher precision than the pixel format that +/// they represent, as they may be multiplied by hundreds or thousands before +/// being divided. They should also ideally be `Copy` so that cloning is cheap. +pub trait StackBlurrable: + Default + + Copy + + Add<Output = Self> + + AddAssign + + SubAssign + + Mul<usize, Output = Self> + + Div<usize, Output = Self> +{ +} + +impl< + T: Default + + Copy + + Add<Output = T> + + AddAssign + + SubAssign + + Mul<usize, Output = T> + + Div<usize, Output = T>, + > StackBlurrable for T +{ +} |