init - slur

master 1Branch 0Tags

Clone

HTTPS

SSH

Open with VS Code

init

Browse Source

bendn 2023-11-23

commit 97ba835

-rwxr-xr-x

-rwxr-xr-x

-rwxr-xr-x

-rwxr-xr-x

-rwxr-xr-x

-rwxr-xr-x

-rwxr-xr-x

-rwxr-xr-x

109

-rwxr-xr-x

src/color/serial.rs

-rwxr-xr-x

src/color/simd.rs

-rwxr-xr-x

src/iter.rs

258

-rwxr-xr-x

src/lib.rs

205

-rwxr-xr-x

src/test.rs

-rwxr-xr-x

src/traits.rs

14 files changed, 992 insertions, 0 deletions

diff --git a/.gitignore b/.gitignore
new file mode 100755
index 0000000..2f7896d
--- /dev/null
+++ b/.gitignore

@@ -0,0 +1 @@

+target/

diff --git a/Cargo.lock b/Cargo.lock
new file mode 100755
index 0000000..6b0a465
--- /dev/null
+++ b/Cargo.lock

@@ -0,0 +1,41 @@

+# This file is automatically @generated by Cargo.

+# It is not intended for manual editing.

+version = 3

+[[package]]

+name = "cfg-if"

+version = "1.0.0"

+source = "registry+https://github.com/rust-lang/crates.io-index"

+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"

+[[package]]

+name = "iai"

+version = "0.1.1"

+source = "git+https://github.com/bend-n/iai#30f77d9d165f8d2e1fcea8f87bd01cc03d821ae6"

+dependencies = [

+ "cfg-if",

+[[package]]

+name = "imgref"

+version = "1.10.0"

+source = "registry+https://github.com/rust-lang/crates.io-index"

+checksum = "90d944e334f00f4449c9640b440a171f816be0152305c12ef90424fc35fd035c"

+[[package]]

+name = "imgref-iter"

+version = "0.4.0"

+source = "registry+https://github.com/rust-lang/crates.io-index"

+checksum = "58112733c0ee178b66dceba49e0445d0b293268719de579d2c7dd4c2290976aa"

+dependencies = [

+ "imgref",

+[[package]]

+name = "slur"

+version = "0.1.0"

+dependencies = [

+ "iai",

+ "imgref",

+ "imgref-iter",

diff --git a/Cargo.toml b/Cargo.toml
new file mode 100755
index 0000000..d580fa8
--- /dev/null
+++ b/Cargo.toml

@@ -0,0 +1,28 @@

+[package]

+name = 'slur'

+version = '0.1.0'

+authors = ['LoganDark']

+edition = '2021'

+description = 'A fast, iterative, correct approach to Stackblur, resulting in a very smooth and high-quality output, with no edge bleeding'

+readme = 'README.md'

+repository = 'https://github.com/bend-n/slur'

+license = 'MIT'

+keywords = ['stackblur', 'blur', 'gaussian']

+categories = ['algorithms', 'graphics', 'rendering']

+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

+[dependencies]

+imgref = '^1.9.2'

+imgref-iter = { version = '~0.4.0', features = ["simd"] }

+[dev-dependencies]

+iai = { git = "https://github.com/bend-n/iai" }

+[[bench]]

+name = 'iai'

+harness = false

+[[bench]]

+name = 'iai_simd'

+harness = false

diff --git a/LICENSE b/LICENSE
new file mode 100755
index 0000000..c11de7a
--- /dev/null
+++ b/LICENSE

@@ -0,0 +1,20 @@

+MIT License

+Permission is hereby granted, free of charge, to any person obtaining a copy of

+this software and associated documentation files (the "Software"), to deal in

+the Software without restriction, including without limitation the rights to

+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of

+the Software, and to permit persons to whom the Software is furnished to do so,

+subject to the following conditions:

+The above copyright notice and this permission notice shall be included in all

+copies or substantial portions of the Software.

+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS

+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR

+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER

+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN

+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

diff --git a/README.md b/README.md
new file mode 100755
index 0000000..f1c4f08
--- /dev/null
+++ b/README.md

@@ -0,0 +1,53 @@

+A fast, iterative, correct approach to Stackblur, resulting in a very smooth and

+high-quality output, with no edge bleeding.

+This crate implements a tweaked version of the Stackblur algorithm requiring

+`radius * 2 + 2` elements of space rather than `radius * 2 + 1`, which is a

+small tradeoff for much-increased visual quality.

+The algorithm is exposed as an iterator (`StackBlur`) that can wrap any other

+iterator that yields elements of `StackBlurrable`. The `StackBlur` will then

+yield elements blurred by the specified radius.

+## Benefits of this crate

+Stackblur is essentially constant-time. Regardless of the radius, it always

+performs only 1 scan over the input iterator and outputs exactly the same amount

+of elements.

+Additionally, it produces results that are comparable to slow and expensive

+Gaussian blurs. As opposed to box blur which uses a basic rolling average,

+Stackblur uses a weighted average where each output pixel is affected more

+strongly by the inputs that were closest to it.

+Despite that, Stackblur does not perform much worse compared to naive box blurs,

+and is quite cheap compared to full Gaussian blurs, at least for the CPU. The

+implementation in this crate will most likely beat most unoptimized blurs you

+can find on crates.io, as well as some optimized ones, and it is extremely

+flexible and generic.

+For a full explanation of the improvements made to the Stackblur algorithm, see

+the `iter` module.

+https://user-images.githubusercontent.com/4723091/173788732-2e3e125e-f7b3-4e0f-8582-cc2c148ba437.mp4

+*(In the above video, `stackblur-iter` is the centered blur, whereas the

+full-width one is another `stackblur` crate.)*

+## Comparison to the `stackblur` crate

+`stackblur` suffers from edge bleeding and flexibility problems. For

+example, it can only operate on buffers of 32-bit integers, and expects them

+to be packed linear ARGB pixels. Additionally, it cannot operate on a 2D

+subslice of a buffer (like `imgref` allows for this crate), and it does not

+offer any streaming iterators or documentation. And it also only supports

+a blur radius of up to 255.

+## Usage

+Aside from `StackBlurrable` and `StackBlur` which host their own documentation,

+there are helper functions like `blur` and `blur_argb` that can be used to

+interact with 2D image buffers, due to the fact that doing so manually involves

+unsafe code (if you want no-copy).

+See the [full documentation](https://docs.rs/stackblur-iter) for more.

diff --git a/benches/iai.rs b/benches/iai.rs
new file mode 100755
index 0000000..d28f6bc
--- /dev/null
+++ b/benches/iai.rs

@@ -0,0 +1,25 @@

+use imgref::{Img, ImgRefMut};

+const WIDTH: usize = 640;

+const HEIGHT: usize = 480;

+static mut BUFFER: [u32; WIDTH * HEIGHT] = [0; WIDTH * HEIGHT];

+#[inline(always)]

+fn img() -> ImgRefMut<'static, u32> {

+ Img::new(unsafe { &mut BUFFER[..] }, WIDTH, HEIGHT)

+fn blur_argb_16() {

+ slur::blur_argb(&mut img(), 16)

+fn blur_argb_128() {

+ slur::blur_argb(&mut img(), 128)

+fn blur_argb_1024() {

+ slur::blur_argb(&mut img(), 1024)

+// parallel versions are non-deterministic

+iai::main!(blur_argb_16, blur_argb_128, blur_argb_1024,);

diff --git a/benches/iai_simd.rs b/benches/iai_simd.rs
new file mode 100755
index 0000000..c3a20c9
--- /dev/null
+++ b/benches/iai_simd.rs

@@ -0,0 +1,45 @@

+use imgref::{Img, ImgRefMut};

+const WIDTH: usize = 640;

+const HEIGHT: usize = 480;

+static mut BUFFER: [u32; WIDTH * HEIGHT] = [0; WIDTH * HEIGHT];

+#[inline(always)]

+fn img() -> ImgRefMut<'static, u32> {

+ Img::new(unsafe { &mut BUFFER[..] }, WIDTH, HEIGHT)

+use slur::simd_blur_argb;

+fn blur_argb_u32x1() {

+ simd_blur_argb::<1>(&mut img(), 16)

+fn blur_argb_u32x2() {

+ simd_blur_argb::<2>(&mut img(), 16)

+fn blur_argb_u32x4() {

+ simd_blur_argb::<4>(&mut img(), 16)

+fn blur_argb_u32x8() {

+ simd_blur_argb::<8>(&mut img(), 16)

+fn blur_argb_u32x16() {

+ simd_blur_argb::<16>(&mut img(), 16)

+fn blur_argb_u32x32() {

+ simd_blur_argb::<32>(&mut img(), 16)

+fn blur_argb_u32x64() {

+ simd_blur_argb::<64>(&mut img(), 16)

+iai::main!(

+ blur_argb_u32x1,

+ blur_argb_u32x2,

+ blur_argb_u32x4,

+ blur_argb_u32x8,

+ blur_argb_u32x16,

+ blur_argb_u32x32,

+ blur_argb_u32x64,

+);

diff --git a/src/color.rs b/src/color.rs
new file mode 100755
index 0000000..96efe3b
--- /dev/null
+++ b/src/color.rs

@@ -0,0 +1,109 @@

+use crate::StackBlurrable;

+use std::ops::{Add, AddAssign, Div, Mul, Sub, SubAssign};

+mod serial;

+mod simd;

+pub use serial::BlurU32;

+pub use simd::u32xN;

+use std::simd::{LaneCount, Simd, SupportedLaneCount};

+#[repr(transparent)]

+#[derive(Copy, Clone, Eq, PartialEq, Debug, Default)]

+pub struct Argb<T: StackBlurrable>([T; 4]);

+impl From<u32> for Argb<BlurU32> {

+ fn from(argb: u32) -> Self {

+ let [a, r, g, b] = argb.to_be_bytes();

+ let cvt = |i| BlurU32(i as u32);

+ Self([cvt(a), cvt(r), cvt(g), cvt(b)])

+ }

+impl From<Argb<BlurU32>> for u32 {

+ fn from(Argb([a, r, g, b]): Argb<BlurU32>) -> Self {

+ let cvt = |i: BlurU32| i.0 as u8;

+ u32::from_be_bytes([cvt(a), cvt(r), cvt(g), cvt(b)])

+ }

+impl<const N: usize> From<[u32; N]> for Argb<u32xN<N>>

+where

+ LaneCount<N>: SupportedLaneCount,

+ fn from(values: [u32; N]) -> Self {

+ let arrs: [[u8; 4]; N] = values.map(u32::to_be_bytes);

+ Self([

+ u32xN(Simd::from_array(arrs.map(|a| a[0] as u32))),

+ u32xN(Simd::from_array(arrs.map(|a| a[1] as u32))),

+ u32xN(Simd::from_array(arrs.map(|a| a[2] as u32))),

+ u32xN(Simd::from_array(arrs.map(|a| a[3] as u32))),

+ ])

+ }

+impl<const N: usize> From<Argb<u32xN<N>>> for [u32; N]

+where

+ LaneCount<N>: SupportedLaneCount,

+ fn from(value: Argb<u32xN<N>>) -> Self {

+ let [a, r, g, b] = value.0.map(|i| i.0.to_array());

+ std::array::from_fn(|i| u32::from_be_bytes([a[i], r[i], g[i], b[i]].map(|x| x as u8)))

+ }

+impl<T: StackBlurrable> Add for Argb<T> {

+ type Output = Self;

+ fn add(mut self, rhs: Self) -> Self::Output {

+ self += rhs;

+ self

+ }

+impl<T: StackBlurrable> Sub for Argb<T> {

+ type Output = Self;

+ fn sub(mut self, rhs: Self) -> Self::Output {

+ self -= rhs;

+ self

+ }

+impl<T: StackBlurrable> AddAssign for Argb<T> {

+ fn add_assign(&mut self, rhs: Self) {

+ let [a, r, g, b] = rhs.0;

+ self.0[0] += a;

+ self.0[1] += r;

+ self.0[2] += g;

+ self.0[3] += b;

+ }

+impl<T: StackBlurrable> SubAssign for Argb<T> {

+ fn sub_assign(&mut self, rhs: Self) {

+ let [a, r, g, b] = rhs.0;

+ self.0[0] -= a;

+ self.0[1] -= r;

+ self.0[2] -= g;

+ self.0[3] -= b;

+ }

+impl<T: StackBlurrable> Mul<usize> for Argb<T> {

+ type Output = Self;

+ fn mul(self, rhs: usize) -> Self::Output {

+ let [a, r, g, b] = self.0;

+ Self([a * rhs, r * rhs, g * rhs, b * rhs])

+ }

+impl<T: StackBlurrable> Div<usize> for Argb<T> {

+ type Output = Self;

+ fn div(self, rhs: usize) -> Self::Output {

+ let [a, r, g, b] = self.0;

+ Self([a / rhs, r / rhs, g / rhs, b / rhs])

+ }

diff --git a/src/color/serial.rs b/src/color/serial.rs
new file mode 100755
index 0000000..e353055
--- /dev/null
+++ b/src/color/serial.rs

@@ -0,0 +1,49 @@

+use std::ops::{Add, AddAssign, Div, Mul, Sub, SubAssign};

+#[derive(Copy, Clone, Eq, PartialEq, Debug, Default)]

+pub struct BlurU32(pub u32);

+impl Add for BlurU32 {

+ type Output = Self;

+ fn add(self, rhs: Self) -> Self::Output {

+ Self(self.0.wrapping_add(rhs.0))

+ }

+impl Sub for BlurU32 {

+ type Output = Self;

+ fn sub(self, rhs: Self) -> Self::Output {

+ Self(self.0.wrapping_sub(rhs.0))

+ }

+impl AddAssign for BlurU32 {

+ fn add_assign(&mut self, rhs: Self) {

+ self.0 = self.0.wrapping_add(rhs.0);

+ }

+impl SubAssign for BlurU32 {

+ fn sub_assign(&mut self, rhs: Self) {

+ self.0 = self.0.wrapping_sub(rhs.0);

+ }

+impl Mul<usize> for BlurU32 {

+ type Output = Self;

+ fn mul(self, rhs: usize) -> Self::Output {

+ Self(self.0.wrapping_mul(rhs as u32))

+ }

+impl Div<usize> for BlurU32 {

+ type Output = Self;

+ #[inline]

+ fn div(self, rhs: usize) -> Self::Output {

+ Self(self.0.wrapping_div(rhs as u32))

+ }

diff --git a/src/color/simd.rs b/src/color/simd.rs
new file mode 100755
index 0000000..3b0e593
--- /dev/null
+++ b/src/color/simd.rs

@@ -0,0 +1,73 @@

+use std::ops::{Add, AddAssign, Div, Mul, Sub, SubAssign};

+pub use std::simd::{LaneCount, Simd, SupportedLaneCount};

+#[derive(Copy, Clone, Eq, PartialEq, Debug, Default)]

+#[allow(non_camel_case_types)]

+pub struct u32xN<const N: usize>(pub Simd<u32, N>)

+where

+ LaneCount<N>: SupportedLaneCount;

+impl<const N: usize> Add for u32xN<N>

+where

+ LaneCount<N>: SupportedLaneCount,

+ type Output = Self;

+ fn add(self, rhs: Self) -> Self::Output {

+ Self(self.0 + rhs.0)

+ }

+impl<const N: usize> Sub for u32xN<N>

+where

+ LaneCount<N>: SupportedLaneCount,

+ type Output = Self;

+ fn sub(self, rhs: Self) -> Self::Output {

+ Self(self.0 - rhs.0)

+ }

+impl<const N: usize> AddAssign for u32xN<N>

+where

+ LaneCount<N>: SupportedLaneCount,

+ fn add_assign(&mut self, rhs: Self) {

+ self.0 += rhs.0;

+ }

+impl<const N: usize> SubAssign for u32xN<N>

+where

+ LaneCount<N>: SupportedLaneCount,

+ fn sub_assign(&mut self, rhs: Self) {

+ self.0 -= rhs.0;

+ }

+impl<const N: usize> Mul<usize> for u32xN<N>

+where

+ LaneCount<N>: SupportedLaneCount,

+ type Output = Self;

+ fn mul(self, rhs: usize) -> Self::Output {

+ Self(self.0 * Simd::<u32, N>::splat(rhs as u32))

+ }

+impl<const N: usize> Div<usize> for u32xN<N>

+where

+ LaneCount<N>: SupportedLaneCount,

+ type Output = Self;

+ fn div(self, rhs: usize) -> Self::Output {

+ Self(Simd::<u32, N>::from_array(

+ self.0.to_array().map(|e| (e as usize / rhs) as u32),

+ ))

+ }

diff --git a/src/iter.rs b/src/iter.rs
new file mode 100755
index 0000000..7fc7e38
--- /dev/null
+++ b/src/iter.rs

@@ -0,0 +1,258 @@

+//! This module implements the [`StackBlur`] struct and the improved version of

+//! the Stackblur algorithm.

+//!

+//! ## The improved Stackblur algorithm

+//!

+//! As previously stated, this crate implements a modified version of Stackblur,

+//! and understanding the original algorithm is key to understanding the

+//! improvements that have been made to it.

+//!

+//! The original Stackblur is essentially a weighted box-blur. Instead of taking

+//! the average of a flat array of pixels, like this:

+//!

+//! ```text

+//! ( 00 + 01 + 02 + 03 + 04 + 05 + 06 ) / 7

+//! ```

+//!

+//! it takes a weighted average of the pixels:

+//!

+//! ```text

+//! ( 03 +

+//! 02 + 03 + 04 +

+//! 01 + 02 + 03 + 04 + 05 +

+//! 00 + 01 + 02 + 03 + 04 + 05 + 06 ) / 16

+//! ```

+//!

+//! This is a rough approximation of a Gaussian blur, and in fact it's already

+//! most of the way there to being a complete algorithm on its own. You can just

+//! multiply each pixel by something like `radius + 1 - center_dist` when you

+//! make your sum, then divide by `radius * (radius + 2) + 1`.

+//!

+//! But there are two problems with this:

+//!

+//! 1. That would be *O*(n(m * 2 + 1)²) complexity, where `n` is the number of

+//! pixels in the image, and `m` is the radius of the blur. This is basically

+//! just as expensive as running a proper convolution filter.

+//!

+//! 2. How do we handle pixels off the edge of the image?

+//!

+//! I'm scared of #1 so I'm going to handle #2 first. What most implementations

+//! choose to do is just repeat the edge of the image:

+//!

+//! ```text

+//! ( 00 +

+//! 00 + 00 + 01 +

+//! 00 + 00 + 00 + 01 + 02 +

+//! 00 + 00 + 00 + 00 + 01 + 02 + 03 ) / 16

+//! ```

+//!

+//! But this creates even more problems, because the edge of the blur will be

+//! quite biased towards the pixels that aren't even in the image. This is known

+//! as "edge bleeding", where a single pixel at the edge of a blur can cause

+//! very large and ugly artifacts to show up.

+//!

+//! The solution, of course, is to not calculate the denominator using that

+//! equation from earlier, and instead incrementally update the denominator as

+//! pixels are scanned, allowing you to sum a varying number of pixels:

+//!

+//! ```text

+//! ( 00 +

+//! 00 + 01 +

+//! 00 + 01 + 02 +

+//! 00 + 01 + 02 + 03 ) / 10

+//!

+//! ( 01 +

+//! 00 + 01 + 02 +

+//! 00 + 01 + 02 + 03 +

+//! 00 + 01 + 02 + 03 + 04 ) / 13

+//!

+//! ( 02 +

+//! 01 + 02 + 03 +

+//! 00 + 01 + 02 + 03 + 04 +

+//! 00 + 01 + 02 + 03 + 04 + 05 ) / 15

+//!

+//! ( 03 +

+//! 02 + 03 + 04 +

+//! 01 + 02 + 03 + 04 + 05 +

+//! 00 + 01 + 02 + 03 + 04 + 05 + 06 ) / 16

+//! ```

+//!

+//! This is one of the improvements made to the Stackblur algorithm that is

+//! implemented by this crate.

+//!

+//! Now for #1 - the complexity problem. It is possible to make a streaming

+//! Stackblur that does not need to read any pixels out of order or more than

+//! once, or even know how long the input is. In fact, that's exactly what this

+//! crate does.

+//!

+//! First you fill up the cache with `radius + 1` pixels in order to be able to

+//! make the first sum, then you start producing results until you run out of

+//! pixels, then you produce the last `radius` results using what you have in

+//! cache. I don't have the skill to explain the algorithm in full detail, but

+//! it's open-source so you can look at it if you want.

+//!

+//! In this crate the "cache" is not actually the actual heap of values, as that

+//! would be too slow. Instead, the "cache" is a list of changes to make to the

+//! rate of change of the sum, and the denominator is updated incrementally in

+//! response to the number of leading and trailing values changing.

+//!

+//! The reason the whole rate of change thing exists is so that adding to the

+//! sum and registering it in the cache becomes *O*(1) instead of *O*(2n+1)

+//! (where `n` is the radius). It's basically the most important thing that

+//! makes the algorithm constant-time.

+use std::collections::VecDeque;

+use crate::traits::StackBlurrable;

+/// An iterator that implements an improved Stackblur algorithm.

+///

+/// For any [`StackBlurrable`] element `T` and any iterator `I` over items of

+/// type `T`, [`StackBlur`] will yield the same amount of elements as `I`,

+/// blurred together according to the improved Stackblur algorithm as described

+/// by the [`iter`][crate::iter] module documentation.

+///

+/// ## Usage

+///

+/// [`StackBlur`] just wraps another iterator with a radius and a cache (called

+/// `ops` by [`StackBlur::new`]):

+///

+/// ```compile_fail

+/// # use std::collections::VecDeque;

+/// # use std::num::Wrapping;

+/// # use stackblur_iter::iter::StackBlur;

+/// #

+/// let arr = [255u8, 0, 0, 0, 127, 0, 0, 0, 255u8];

+/// let iter = arr.iter().copied().map(|i| Wrapping(i as usize));

+/// let blur = StackBlur::new(iter, 2, VecDeque::new());

+/// ```

+///

+/// That example unfortunately doesn't compile because `Wrapping<usize>` doesn't

+/// implement `Mul<usize>` and `Div<usize>`, which are required for the

+/// averaging that [`StackBlur`] performs. It's recommended to create a newtype

+/// wrapper around whatever you plan on blurring, and implement all the traits

+/// required by [`StackBlurrable`].

+///

+/// A [`StackBlur`] always yields exactly as many items as its inner iterator

+/// does. Additionally, a non-fused iterator which repeats will cause the

+/// [`StackBlur`] to repeat as well.

+pub struct StackBlur<'a, T: StackBlurrable, I: Iterator<Item = T>> {

+ iter: I,

+ radius: usize,

+ sum: T,

+ rate: T,

+ dnom: usize,

+ ops: &'a mut VecDeque<T>,

+ leading: usize,

+ trailing: usize,

+ done: bool,

+ first: bool,

+impl<'a, T: StackBlurrable, I: Iterator<Item = T>> StackBlur<'a, T, I> {

+ /// Creates a new [`StackBlur`] from the provided iterator, radius, and

+ /// [`VecDeque`].

+ ///

+ /// The iterator is not advanced until a call to [`StackBlur::next`].

+ pub fn new(iter: I, radius: usize, ops: &'a mut VecDeque<T>) -> Self {

+ Self {

+ iter,

+ radius,

+ sum: T::default(),

+ rate: T::default(),

+ dnom: 0,

+ leading: 0,

+ trailing: 0,

+ ops,

+ done: true,

+ first: true,

+ }

+ fn init(&mut self) {

+ self.done = false;

+ for sub in 0..=self.radius {

+ let Some(item) = self.iter.next() else { break };

+ if sub == 0 {

+ let start = self.radius + 1;

+ let needed = start * 2 + 2;

+ self.ops.reserve(needed.saturating_sub(self.ops.capacity()));

+ self.ops

+ .iter_mut()

+ .take(start)

+ .for_each(|place| *place = T::default());

+ self.ops.resize_with(start, T::default);

+ self.sum = T::default();

+ self.rate = T::default();

+ self.dnom = 0;

+ self.leading = 0;

+ self.trailing = 0;

+ }

+ let mul = self.radius + 1 - sub;

+ self.sum += item * mul;

+ self.rate += item;

+ self.dnom += mul;

+ if self.dnom > mul {

+ self.trailing += 1;

+ }

+ self.ops[sub] -= item * 2;

+ self.ops.push_back(item);

+ }

+ if self.dnom == 0 {

+ self.done = true;

+ }

+impl<T: StackBlurrable, I: Iterator<Item = T>> Iterator for StackBlur<'_, T, I> {

+ type Item = T;

+ #[inline]

+ fn next(&mut self) -> Option<Self::Item> {

+ if self.done {

+ self.init();

+ if !std::mem::replace(&mut self.first, false) {

+ return None;

+ }

+ let result = self.sum / self.dnom;

+ self.rate += self.ops.pop_front().unwrap();

+ self.sum += self.rate;

+ if self.leading < self.radius {

+ self.leading += 1;

+ self.dnom += self.radius + 1 - self.leading;

+ }

+ if self.radius == 0 || self.trailing == self.radius {

+ if let Some(item) = self.iter.next() {

+ self.sum += item;

+ self.rate += item;

+ self.ops[self.radius] -= item * 2;

+ self.ops.push_back(item);

+ } else if self.radius > 0 {

+ self.dnom -= self.radius + 1 - self.trailing;

+ self.trailing -= 1;

+ } else {

+ self.done = true;

+ }

+ } else if self.trailing > 0 {

+ self.dnom -= self.radius + 1 - self.trailing;

+ self.trailing -= 1;

+ } else if self.trailing == 0 {

+ self.done = true;

+ }

+ Some(result)

+ }

diff --git a/src/lib.rs b/src/lib.rs
new file mode 100755
index 0000000..6cf1b17
--- /dev/null
+++ b/src/lib.rs

@@ -0,0 +1,205 @@

+//! A fast, iterative, correct approach to Stackblur, resulting in a very smooth

+//! and high-quality output, with no edge bleeding.

+//!

+//! This crate implements a tweaked version of the Stackblur algorithm requiring

+//! `radius * 2 + 2` elements of space rather than `radius * 2 + 1`, which is a

+//! small tradeoff for much-increased visual quality.

+//!

+//! The algorithm is exposed as an iterator ([`StackBlur`]) that can wrap any

+//! other iterator that yields elements of [`StackBlurrable`]. The [`StackBlur`]

+//! will then yield elements blurred by the specified radius.

+//!

+//! ## Benefits of this crate

+//!

+//! Stackblur is essentially constant-time. Regardless of the radius, it always

+//! performs only 1 scan over the input iterator and outputs exactly the same

+//! amount of elements.

+//!

+//! Additionally, it produces results that are comparable to slow and expensive

+//! Gaussian blurs. As opposed to box blur which uses a basic rolling average,

+//! Stackblur uses a weighted average where each output pixel is affected more

+//! strongly by the inputs that were closest to it.

+//!

+//! Despite that, Stackblur does not perform much worse compared to naive box

+//! blurs, and is quite cheap compared to full Gaussian blurs, at least for the

+//! CPU. The implementation in this crate will most likely beat most unoptimized

+//! blurs you can find on crates.io, as well as some optimized ones, and it is

+//! extremely flexible and generic.

+//!

+//! For a full explanation of the improvements made to the Stackblur algorithm,

+//! see the [`iter`] module.

+//!

+//! ## Comparison to the `stackblur` crate

+//!

+//! `stackblur` suffers from edge bleeding and flexibility problems. For

+//! example, it can only operate on buffers of 32-bit integers, and expects them

+//! to be packed linear ARGB pixels. Additionally, it cannot operate on a 2D

+//! subslice of a buffer (like `imgref` allows for this crate), and it does not

+//! offer any streaming iterators or documentation. And it also only supports

+//! a blur radius of up to 255.

+//!

+//! ## Usage

+//!

+//! Aside from [`StackBlurrable`] and [`StackBlur`] which host their own

+//! documentation, there are helper functions like [`blur`] and [`blur_argb`]

+//! that can be used to interact with 2D image buffers, due to the fact that

+//! doing so manually involves unsafe code (if you want no-copy).

+#![feature(portable_simd, stmt_expr_attributes)]

+#![cfg_attr(test, feature(test))]

+use std::collections::VecDeque;

+use std::simd::{LaneCount, SupportedLaneCount};

+pub extern crate imgref;

+use imgref::ImgRefMut;

+#[cfg(test)]

+mod test;

+pub mod color;

+pub mod iter;

+pub mod traits;

+use color::Argb;

+use iter::StackBlur;

+use traits::StackBlurrable;

+/// Blurs a buffer, assuming one element per pixel.

+///

+/// The provided closures are used to convert from the buffer's native pixel

+/// format to [`StackBlurrable`] values that can be consumed by [`StackBlur`].

+pub fn blur<T, B: StackBlurrable>(

+ buffer: &mut ImgRefMut<T>,

+ radius: usize,

+ mut to_blurrable: impl FnMut(&T) -> B,

+ mut to_pixel: impl FnMut(B) -> T,

+) {

+ use imgref_iter::iter::{IterWindows, IterWindowsPtrMut};

+ use imgref_iter::traits::{ImgIter, ImgIterMut, ImgIterPtrMut};

+ let mut ops = VecDeque::new();

+ // This is needed to avoid Undefined Behavior. Writing to the rows of the

+ // must be done before constructing the columns iterators, because otherwise

+ // the writes would invalidate their borrows. However I don't want to

+ // duplicate this loop, so make it a closure.

+ let mut blur_windows = |writer: IterWindowsPtrMut<T>, reader: IterWindows<T>| {

+ for (write, read) in writer.zip(reader) {

+ let mut blur = StackBlur::new(read.map(&mut to_blurrable), radius, &mut ops);

+ write.for_each(|place| unsafe { *place = to_pixel(blur.next().unwrap()) });

+ }

+ };

+ let buffer_ptr = buffer.as_mut_ptr();

+ blur_windows(

+ unsafe { buffer_ptr.iter_rows_ptr_mut() },

+ buffer.iter_rows(),

+ );

+ blur_windows(

+ unsafe { buffer_ptr.iter_cols_ptr_mut() },

+ buffer.iter_cols(),

+ );

+/// Blurs a buffer with SIMD, assuming one element per pixel.

+///

+/// The provided closures are used to convert from the buffer's native pixel

+/// format to [`StackBlurrable`] values that can be consumed by [`StackBlur`].

+pub fn simd_blur<T, Bsimd: StackBlurrable, Bsingle: StackBlurrable, const LANES: usize>(

+ buffer: &mut ImgRefMut<T>,

+ radius: usize,

+ mut to_blurrable_simd: impl FnMut([&T; LANES]) -> Bsimd,

+ mut to_pixel_simd: impl FnMut(Bsimd) -> [T; LANES],

+ mut to_blurrable_single: impl FnMut(&T) -> Bsingle,

+ mut to_pixel_single: impl FnMut(Bsingle) -> T,

+) where

+ LaneCount<LANES>: SupportedLaneCount,

+ #[cfg(not(doc))]

+ use imgref_iter::iter::{

+ SimdIterWindow, SimdIterWindowPtrMut, SimdIterWindows, SimdIterWindowsPtrMut,

+ };

+ #[cfg(not(doc))]

+ use imgref_iter::traits::{ImgIterMut, ImgSimdIter, ImgSimdIterPtrMut};

+ let mut ops_simd = VecDeque::new();

+ let mut ops_single = VecDeque::new();

+ let mut simd_blur_windows =

+ |writer: SimdIterWindowsPtrMut<T, LANES>,

+ reader: SimdIterWindows<T, LANES>,

+ mut ops_simd: VecDeque<Bsimd>,

+ mut ops_single: VecDeque<Bsingle>| {

+ for (write, read) in writer.zip(reader) {

+ match (write, read) {

+ (SimdIterWindowPtrMut::Simd(write), SimdIterWindow::Simd(read)) => {

+ let mut blur =

+ StackBlur::new(read.map(&mut to_blurrable_simd), radius, &mut ops_simd);

+ write.for_each(|place| {

+ place

+ .into_iter()

+ .zip(to_pixel_simd(blur.next().unwrap()))

+ .for_each(|(place, pixel)| unsafe { *place = pixel });

+ });

+ }

+ (SimdIterWindowPtrMut::Single(write), SimdIterWindow::Single(read)) => {

+ let mut blur = StackBlur::new(

+ read.map(&mut to_blurrable_single),

+ radius,

+ &mut ops_single,

+ );

+ write.for_each(|place| unsafe {

+ *place = to_pixel_single(blur.next().unwrap());

+ });

+ }

+ _ => unreachable!(),

+ }

+ (ops_simd, ops_single)

+ };

+ let buffer_ptr = buffer.as_mut_ptr();

+ (ops_simd, ops_single) = simd_blur_windows(

+ unsafe { buffer_ptr.simd_iter_rows_ptr_mut::<LANES>() },

+ buffer.simd_iter_rows::<LANES>(),

+ ops_simd,

+ ops_single,

+ );

+ simd_blur_windows(

+ unsafe { buffer_ptr.simd_iter_cols_ptr_mut::<LANES>() },

+ buffer.simd_iter_cols::<LANES>(),

+ ops_simd,

+ ops_single,

+ );

+/// Blurs a buffer of 32-bit packed ARGB pixels (0xAARRGGBB).

+///

+/// This is a version of [`blur`] with pre-filled conversion routines that

+/// provide good results for blur radii <= 4096. Larger radii may overflow.

+pub fn blur_argb(buffer: &mut ImgRefMut<u32>, radius: usize) {

+ blur(buffer, radius, |i| Argb::from(*i), Argb::into);

+/// Blurs a buffer of 32-bit packed ARGB pixels (0xAARRGGBB) with SIMD.

+///

+/// This is a version of [`simd_blur`] with pre-filled conversion routines that

+/// provide good results for blur radii <= 4096. Larger radii may overflow.

+pub fn simd_blur_argb<const LANES: usize>(buffer: &mut ImgRefMut<u32>, radius: usize)

+where

+ LaneCount<LANES>: SupportedLaneCount,

+ simd_blur(

+ buffer,

+ radius,

+ |i: [&u32; LANES]| Argb::from(i.map(u32::clone)),

+ Argb::into,

+ |i| Argb::from(*i),

+ Argb::into,

+ );

diff --git a/src/test.rs b/src/test.rs
new file mode 100755
index 0000000..2993e93
--- /dev/null
+++ b/src/test.rs

@@ -0,0 +1,49 @@

+extern crate test;

+use imgref::ImgVec;

+use test::Bencher;

+const WIDTH: usize = 640;

+const HEIGHT: usize = 480;

+#[bench]

+#[inline(never)]

+fn blur_argb_16(bencher: &mut Bencher) {

+ let mut buf = ImgVec::new(vec![0; WIDTH * HEIGHT], WIDTH, HEIGHT);

+ bencher.iter(|| crate::blur_argb(&mut buf.as_mut(), 16));

+#[bench]

+#[inline(never)]

+fn blur_argb_128(bencher: &mut Bencher) {

+ let mut buf = ImgVec::new(vec![0; WIDTH * HEIGHT], WIDTH, HEIGHT);

+ bencher.iter(|| crate::blur_argb(&mut buf.as_mut(), 128));

+#[bench]

+#[inline(never)]

+fn blur_argb_1024(bencher: &mut Bencher) {

+ let mut buf = ImgVec::new(vec![0; WIDTH * HEIGHT], WIDTH, HEIGHT);

+ bencher.iter(|| crate::blur_argb(&mut buf.as_mut(), 1024));

+#[bench]

+#[inline(never)]

+fn simd_blur_argb_16(bencher: &mut Bencher) {

+ let mut buf = ImgVec::new(vec![0; WIDTH * HEIGHT], WIDTH, HEIGHT);

+ bencher.iter(|| crate::simd_blur_argb::<8>(&mut buf.as_mut(), 16));

+#[bench]

+#[inline(never)]

+fn simd_blur_argb_128(bencher: &mut Bencher) {

+ let mut buf = ImgVec::new(vec![0; WIDTH * HEIGHT], WIDTH, HEIGHT);

+ bencher.iter(|| crate::simd_blur_argb::<8>(&mut buf.as_mut(), 128));

+#[bench]

+#[inline(never)]

+fn simd_blur_argb_1024(bencher: &mut Bencher) {

+ let mut buf = ImgVec::new(vec![0; WIDTH * HEIGHT], WIDTH, HEIGHT);

+ bencher.iter(|| crate::simd_blur_argb::<8>(&mut buf.as_mut(), 1024));

diff --git a/src/traits.rs b/src/traits.rs
new file mode 100755
index 0000000..852b1d0
--- /dev/null
+++ b/src/traits.rs

@@ -0,0 +1,36 @@

+//! The home of [`StackBlurrable`].

+use std::ops::{Add, AddAssign, Div, Mul, SubAssign};

+/// The trait for types which can be blurred by [`StackBlur`][crate::StackBlur].

+///

+/// This trait is auto-implemented for all types that satisfy its requirements.

+///

+/// Types that wish to implement this trait should be signed or use explicitly

+/// wrapping arithmetic.

+///

+/// They should have a significantly higher precision than the pixel format that

+/// they represent, as they may be multiplied by hundreds or thousands before

+/// being divided. They should also ideally be `Copy` so that cloning is cheap.

+pub trait StackBlurrable:

+ Default

+ + Copy

+ + Add<Output = Self>

+ + AddAssign

+ + SubAssign

+ + Mul<usize, Output = Self>

+ + Div<usize, Output = Self>

+impl<

+ T: Default

+ + Copy

+ + Add<Output = T>

+ + AddAssign

+ + SubAssign

+ + Mul<usize, Output = T>

+ + Div<usize, Output = T>,

+ > StackBlurrable for T