e
bendn 2023-11-23
commit 97ba835
-rwxr-xr-x.gitignore1
-rwxr-xr-xCargo.lock41
-rwxr-xr-xCargo.toml28
-rwxr-xr-xLICENSE20
-rwxr-xr-xREADME.md53
-rwxr-xr-xbenches/iai.rs25
-rwxr-xr-xbenches/iai_simd.rs45
-rwxr-xr-xsrc/color.rs109
-rwxr-xr-xsrc/color/serial.rs49
-rwxr-xr-xsrc/color/simd.rs73
-rwxr-xr-xsrc/iter.rs258
-rwxr-xr-xsrc/lib.rs205
-rwxr-xr-xsrc/test.rs49
-rwxr-xr-xsrc/traits.rs36
14 files changed, 992 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100755
index 0000000..2f7896d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+target/
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100755
index 0000000..6b0a465
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,41 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "iai"
+version = "0.1.1"
+source = "git+https://github.com/bend-n/iai#30f77d9d165f8d2e1fcea8f87bd01cc03d821ae6"
+dependencies = [
+ "cfg-if",
+]
+
+[[package]]
+name = "imgref"
+version = "1.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "90d944e334f00f4449c9640b440a171f816be0152305c12ef90424fc35fd035c"
+
+[[package]]
+name = "imgref-iter"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "58112733c0ee178b66dceba49e0445d0b293268719de579d2c7dd4c2290976aa"
+dependencies = [
+ "imgref",
+]
+
+[[package]]
+name = "slur"
+version = "0.1.0"
+dependencies = [
+ "iai",
+ "imgref",
+ "imgref-iter",
+]
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100755
index 0000000..d580fa8
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,28 @@
+[package]
+name = 'slur'
+version = '0.1.0'
+authors = ['LoganDark']
+edition = '2021'
+description = 'A fast, iterative, correct approach to Stackblur, resulting in a very smooth and high-quality output, with no edge bleeding'
+readme = 'README.md'
+repository = 'https://github.com/bend-n/slur'
+license = 'MIT'
+keywords = ['stackblur', 'blur', 'gaussian']
+categories = ['algorithms', 'graphics', 'rendering']
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+imgref = '^1.9.2'
+imgref-iter = { version = '~0.4.0', features = ["simd"] }
+
+[dev-dependencies]
+iai = { git = "https://github.com/bend-n/iai" }
+
+[[bench]]
+name = 'iai'
+harness = false
+
+[[bench]]
+name = 'iai_simd'
+harness = false
diff --git a/LICENSE b/LICENSE
new file mode 100755
index 0000000..c11de7a
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,20 @@
+MIT License
+
+Copyright © 2022-present LoganDark
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100755
index 0000000..f1c4f08
--- /dev/null
+++ b/README.md
@@ -0,0 +1,53 @@
+A fast, iterative, correct approach to Stackblur, resulting in a very smooth and
+high-quality output, with no edge bleeding.
+
+This crate implements a tweaked version of the Stackblur algorithm requiring
+`radius * 2 + 2` elements of space rather than `radius * 2 + 1`, which is a
+small tradeoff for much-increased visual quality.
+
+The algorithm is exposed as an iterator (`StackBlur`) that can wrap any other
+iterator that yields elements of `StackBlurrable`. The `StackBlur` will then
+yield elements blurred by the specified radius.
+
+## Benefits of this crate
+
+Stackblur is essentially constant-time. Regardless of the radius, it always
+performs only 1 scan over the input iterator and outputs exactly the same amount
+of elements.
+
+Additionally, it produces results that are comparable to slow and expensive
+Gaussian blurs. As opposed to box blur which uses a basic rolling average,
+Stackblur uses a weighted average where each output pixel is affected more
+strongly by the inputs that were closest to it.
+
+Despite that, Stackblur does not perform much worse compared to naive box blurs,
+and is quite cheap compared to full Gaussian blurs, at least for the CPU. The
+implementation in this crate will most likely beat most unoptimized blurs you
+can find on crates.io, as well as some optimized ones, and it is extremely
+flexible and generic.
+
+For a full explanation of the improvements made to the Stackblur algorithm, see
+the `iter` module.
+
+https://user-images.githubusercontent.com/4723091/173788732-2e3e125e-f7b3-4e0f-8582-cc2c148ba437.mp4
+
+*(In the above video, `stackblur-iter` is the centered blur, whereas the
+full-width one is another `stackblur` crate.)*
+
+## Comparison to the `stackblur` crate
+
+`stackblur` suffers from edge bleeding and flexibility problems. For
+example, it can only operate on buffers of 32-bit integers, and expects them
+to be packed linear ARGB pixels. Additionally, it cannot operate on a 2D
+subslice of a buffer (like `imgref` allows for this crate), and it does not
+offer any streaming iterators or documentation. And it also only supports
+a blur radius of up to 255.
+
+## Usage
+
+Aside from `StackBlurrable` and `StackBlur` which host their own documentation,
+there are helper functions like `blur` and `blur_argb` that can be used to
+interact with 2D image buffers, due to the fact that doing so manually involves
+unsafe code (if you want no-copy).
+
+See the [full documentation](https://docs.rs/stackblur-iter) for more.
diff --git a/benches/iai.rs b/benches/iai.rs
new file mode 100755
index 0000000..d28f6bc
--- /dev/null
+++ b/benches/iai.rs
@@ -0,0 +1,25 @@
+use imgref::{Img, ImgRefMut};
+
+const WIDTH: usize = 640;
+const HEIGHT: usize = 480;
+
+static mut BUFFER: [u32; WIDTH * HEIGHT] = [0; WIDTH * HEIGHT];
+
+#[inline(always)]
+fn img() -> ImgRefMut<'static, u32> {
+ Img::new(unsafe { &mut BUFFER[..] }, WIDTH, HEIGHT)
+}
+
+fn blur_argb_16() {
+ slur::blur_argb(&mut img(), 16)
+}
+fn blur_argb_128() {
+ slur::blur_argb(&mut img(), 128)
+}
+fn blur_argb_1024() {
+ slur::blur_argb(&mut img(), 1024)
+}
+
+// parallel versions are non-deterministic
+
+iai::main!(blur_argb_16, blur_argb_128, blur_argb_1024,);
diff --git a/benches/iai_simd.rs b/benches/iai_simd.rs
new file mode 100755
index 0000000..c3a20c9
--- /dev/null
+++ b/benches/iai_simd.rs
@@ -0,0 +1,45 @@
+use imgref::{Img, ImgRefMut};
+
+const WIDTH: usize = 640;
+const HEIGHT: usize = 480;
+
+static mut BUFFER: [u32; WIDTH * HEIGHT] = [0; WIDTH * HEIGHT];
+
+#[inline(always)]
+fn img() -> ImgRefMut<'static, u32> {
+ Img::new(unsafe { &mut BUFFER[..] }, WIDTH, HEIGHT)
+}
+
+use slur::simd_blur_argb;
+
+fn blur_argb_u32x1() {
+ simd_blur_argb::<1>(&mut img(), 16)
+}
+fn blur_argb_u32x2() {
+ simd_blur_argb::<2>(&mut img(), 16)
+}
+fn blur_argb_u32x4() {
+ simd_blur_argb::<4>(&mut img(), 16)
+}
+fn blur_argb_u32x8() {
+ simd_blur_argb::<8>(&mut img(), 16)
+}
+fn blur_argb_u32x16() {
+ simd_blur_argb::<16>(&mut img(), 16)
+}
+fn blur_argb_u32x32() {
+ simd_blur_argb::<32>(&mut img(), 16)
+}
+fn blur_argb_u32x64() {
+ simd_blur_argb::<64>(&mut img(), 16)
+}
+
+iai::main!(
+ blur_argb_u32x1,
+ blur_argb_u32x2,
+ blur_argb_u32x4,
+ blur_argb_u32x8,
+ blur_argb_u32x16,
+ blur_argb_u32x32,
+ blur_argb_u32x64,
+);
diff --git a/src/color.rs b/src/color.rs
new file mode 100755
index 0000000..96efe3b
--- /dev/null
+++ b/src/color.rs
@@ -0,0 +1,109 @@
+use crate::StackBlurrable;
+use std::ops::{Add, AddAssign, Div, Mul, Sub, SubAssign};
+
+mod serial;
+mod simd;
+
+pub use serial::BlurU32;
+pub use simd::u32xN;
+
+use std::simd::{LaneCount, Simd, SupportedLaneCount};
+
+#[repr(transparent)]
+#[derive(Copy, Clone, Eq, PartialEq, Debug, Default)]
+pub struct Argb<T: StackBlurrable>([T; 4]);
+
+impl From<u32> for Argb<BlurU32> {
+ fn from(argb: u32) -> Self {
+ let [a, r, g, b] = argb.to_be_bytes();
+ let cvt = |i| BlurU32(i as u32);
+ Self([cvt(a), cvt(r), cvt(g), cvt(b)])
+ }
+}
+
+impl From<Argb<BlurU32>> for u32 {
+ fn from(Argb([a, r, g, b]): Argb<BlurU32>) -> Self {
+ let cvt = |i: BlurU32| i.0 as u8;
+ u32::from_be_bytes([cvt(a), cvt(r), cvt(g), cvt(b)])
+ }
+}
+
+impl<const N: usize> From<[u32; N]> for Argb<u32xN<N>>
+where
+ LaneCount<N>: SupportedLaneCount,
+{
+ fn from(values: [u32; N]) -> Self {
+ let arrs: [[u8; 4]; N] = values.map(u32::to_be_bytes);
+ Self([
+ u32xN(Simd::from_array(arrs.map(|a| a[0] as u32))),
+ u32xN(Simd::from_array(arrs.map(|a| a[1] as u32))),
+ u32xN(Simd::from_array(arrs.map(|a| a[2] as u32))),
+ u32xN(Simd::from_array(arrs.map(|a| a[3] as u32))),
+ ])
+ }
+}
+
+impl<const N: usize> From<Argb<u32xN<N>>> for [u32; N]
+where
+ LaneCount<N>: SupportedLaneCount,
+{
+ fn from(value: Argb<u32xN<N>>) -> Self {
+ let [a, r, g, b] = value.0.map(|i| i.0.to_array());
+ std::array::from_fn(|i| u32::from_be_bytes([a[i], r[i], g[i], b[i]].map(|x| x as u8)))
+ }
+}
+
+impl<T: StackBlurrable> Add for Argb<T> {
+ type Output = Self;
+
+ fn add(mut self, rhs: Self) -> Self::Output {
+ self += rhs;
+ self
+ }
+}
+
+impl<T: StackBlurrable> Sub for Argb<T> {
+ type Output = Self;
+
+ fn sub(mut self, rhs: Self) -> Self::Output {
+ self -= rhs;
+ self
+ }
+}
+
+impl<T: StackBlurrable> AddAssign for Argb<T> {
+ fn add_assign(&mut self, rhs: Self) {
+ let [a, r, g, b] = rhs.0;
+ self.0[0] += a;
+ self.0[1] += r;
+ self.0[2] += g;
+ self.0[3] += b;
+ }
+}
+
+impl<T: StackBlurrable> SubAssign for Argb<T> {
+ fn sub_assign(&mut self, rhs: Self) {
+ let [a, r, g, b] = rhs.0;
+ self.0[0] -= a;
+ self.0[1] -= r;
+ self.0[2] -= g;
+ self.0[3] -= b;
+ }
+}
+
+impl<T: StackBlurrable> Mul<usize> for Argb<T> {
+ type Output = Self;
+
+ fn mul(self, rhs: usize) -> Self::Output {
+ let [a, r, g, b] = self.0;
+ Self([a * rhs, r * rhs, g * rhs, b * rhs])
+ }
+}
+
+impl<T: StackBlurrable> Div<usize> for Argb<T> {
+ type Output = Self;
+ fn div(self, rhs: usize) -> Self::Output {
+ let [a, r, g, b] = self.0;
+ Self([a / rhs, r / rhs, g / rhs, b / rhs])
+ }
+}
diff --git a/src/color/serial.rs b/src/color/serial.rs
new file mode 100755
index 0000000..e353055
--- /dev/null
+++ b/src/color/serial.rs
@@ -0,0 +1,49 @@
+use std::ops::{Add, AddAssign, Div, Mul, Sub, SubAssign};
+
+#[derive(Copy, Clone, Eq, PartialEq, Debug, Default)]
+pub struct BlurU32(pub u32);
+
+impl Add for BlurU32 {
+ type Output = Self;
+
+ fn add(self, rhs: Self) -> Self::Output {
+ Self(self.0.wrapping_add(rhs.0))
+ }
+}
+
+impl Sub for BlurU32 {
+ type Output = Self;
+
+ fn sub(self, rhs: Self) -> Self::Output {
+ Self(self.0.wrapping_sub(rhs.0))
+ }
+}
+
+impl AddAssign for BlurU32 {
+ fn add_assign(&mut self, rhs: Self) {
+ self.0 = self.0.wrapping_add(rhs.0);
+ }
+}
+
+impl SubAssign for BlurU32 {
+ fn sub_assign(&mut self, rhs: Self) {
+ self.0 = self.0.wrapping_sub(rhs.0);
+ }
+}
+
+impl Mul<usize> for BlurU32 {
+ type Output = Self;
+
+ fn mul(self, rhs: usize) -> Self::Output {
+ Self(self.0.wrapping_mul(rhs as u32))
+ }
+}
+
+impl Div<usize> for BlurU32 {
+ type Output = Self;
+
+ #[inline]
+ fn div(self, rhs: usize) -> Self::Output {
+ Self(self.0.wrapping_div(rhs as u32))
+ }
+}
diff --git a/src/color/simd.rs b/src/color/simd.rs
new file mode 100755
index 0000000..3b0e593
--- /dev/null
+++ b/src/color/simd.rs
@@ -0,0 +1,73 @@
+use std::ops::{Add, AddAssign, Div, Mul, Sub, SubAssign};
+
+pub use std::simd::{LaneCount, Simd, SupportedLaneCount};
+
+#[derive(Copy, Clone, Eq, PartialEq, Debug, Default)]
+#[allow(non_camel_case_types)]
+pub struct u32xN<const N: usize>(pub Simd<u32, N>)
+where
+ LaneCount<N>: SupportedLaneCount;
+
+impl<const N: usize> Add for u32xN<N>
+where
+ LaneCount<N>: SupportedLaneCount,
+{
+ type Output = Self;
+
+ fn add(self, rhs: Self) -> Self::Output {
+ Self(self.0 + rhs.0)
+ }
+}
+
+impl<const N: usize> Sub for u32xN<N>
+where
+ LaneCount<N>: SupportedLaneCount,
+{
+ type Output = Self;
+
+ fn sub(self, rhs: Self) -> Self::Output {
+ Self(self.0 - rhs.0)
+ }
+}
+
+impl<const N: usize> AddAssign for u32xN<N>
+where
+ LaneCount<N>: SupportedLaneCount,
+{
+ fn add_assign(&mut self, rhs: Self) {
+ self.0 += rhs.0;
+ }
+}
+
+impl<const N: usize> SubAssign for u32xN<N>
+where
+ LaneCount<N>: SupportedLaneCount,
+{
+ fn sub_assign(&mut self, rhs: Self) {
+ self.0 -= rhs.0;
+ }
+}
+
+impl<const N: usize> Mul<usize> for u32xN<N>
+where
+ LaneCount<N>: SupportedLaneCount,
+{
+ type Output = Self;
+
+ fn mul(self, rhs: usize) -> Self::Output {
+ Self(self.0 * Simd::<u32, N>::splat(rhs as u32))
+ }
+}
+
+impl<const N: usize> Div<usize> for u32xN<N>
+where
+ LaneCount<N>: SupportedLaneCount,
+{
+ type Output = Self;
+
+ fn div(self, rhs: usize) -> Self::Output {
+ Self(Simd::<u32, N>::from_array(
+ self.0.to_array().map(|e| (e as usize / rhs) as u32),
+ ))
+ }
+}
diff --git a/src/iter.rs b/src/iter.rs
new file mode 100755
index 0000000..7fc7e38
--- /dev/null
+++ b/src/iter.rs
@@ -0,0 +1,258 @@
+//! This module implements the [`StackBlur`] struct and the improved version of
+//! the Stackblur algorithm.
+//!
+//! ## The improved Stackblur algorithm
+//!
+//! As previously stated, this crate implements a modified version of Stackblur,
+//! and understanding the original algorithm is key to understanding the
+//! improvements that have been made to it.
+//!
+//! The original Stackblur is essentially a weighted box-blur. Instead of taking
+//! the average of a flat array of pixels, like this:
+//!
+//! ```text
+//! ( 00 + 01 + 02 + 03 + 04 + 05 + 06 ) / 7
+//! ```
+//!
+//! it takes a weighted average of the pixels:
+//!
+//! ```text
+//! ( 03 +
+//! 02 + 03 + 04 +
+//! 01 + 02 + 03 + 04 + 05 +
+//! 00 + 01 + 02 + 03 + 04 + 05 + 06 ) / 16
+//! ```
+//!
+//! This is a rough approximation of a Gaussian blur, and in fact it's already
+//! most of the way there to being a complete algorithm on its own. You can just
+//! multiply each pixel by something like `radius + 1 - center_dist` when you
+//! make your sum, then divide by `radius * (radius + 2) + 1`.
+//!
+//! But there are two problems with this:
+//!
+//! 1. That would be *O*(n(m * 2 + 1)²) complexity, where `n` is the number of
+//! pixels in the image, and `m` is the radius of the blur. This is basically
+//! just as expensive as running a proper convolution filter.
+//!
+//! 2. How do we handle pixels off the edge of the image?
+//!
+//! I'm scared of #1 so I'm going to handle #2 first. What most implementations
+//! choose to do is just repeat the edge of the image:
+//!
+//! ```text
+//! ( 00 +
+//! 00 + 00 + 01 +
+//! 00 + 00 + 00 + 01 + 02 +
+//! 00 + 00 + 00 + 00 + 01 + 02 + 03 ) / 16
+//! ```
+//!
+//! But this creates even more problems, because the edge of the blur will be
+//! quite biased towards the pixels that aren't even in the image. This is known
+//! as "edge bleeding", where a single pixel at the edge of a blur can cause
+//! very large and ugly artifacts to show up.
+//!
+//! The solution, of course, is to not calculate the denominator using that
+//! equation from earlier, and instead incrementally update the denominator as
+//! pixels are scanned, allowing you to sum a varying number of pixels:
+//!
+//! ```text
+//! ( 00 +
+//! 00 + 01 +
+//! 00 + 01 + 02 +
+//! 00 + 01 + 02 + 03 ) / 10
+//!
+//! ( 01 +
+//! 00 + 01 + 02 +
+//! 00 + 01 + 02 + 03 +
+//! 00 + 01 + 02 + 03 + 04 ) / 13
+//!
+//! ( 02 +
+//! 01 + 02 + 03 +
+//! 00 + 01 + 02 + 03 + 04 +
+//! 00 + 01 + 02 + 03 + 04 + 05 ) / 15
+//!
+//! ( 03 +
+//! 02 + 03 + 04 +
+//! 01 + 02 + 03 + 04 + 05 +
+//! 00 + 01 + 02 + 03 + 04 + 05 + 06 ) / 16
+//! ```
+//!
+//! This is one of the improvements made to the Stackblur algorithm that is
+//! implemented by this crate.
+//!
+//! Now for #1 - the complexity problem. It is possible to make a streaming
+//! Stackblur that does not need to read any pixels out of order or more than
+//! once, or even know how long the input is. In fact, that's exactly what this
+//! crate does.
+//!
+//! First you fill up the cache with `radius + 1` pixels in order to be able to
+//! make the first sum, then you start producing results until you run out of
+//! pixels, then you produce the last `radius` results using what you have in
+//! cache. I don't have the skill to explain the algorithm in full detail, but
+//! it's open-source so you can look at it if you want.
+//!
+//! In this crate the "cache" is not actually the actual heap of values, as that
+//! would be too slow. Instead, the "cache" is a list of changes to make to the
+//! rate of change of the sum, and the denominator is updated incrementally in
+//! response to the number of leading and trailing values changing.
+//!
+//! The reason the whole rate of change thing exists is so that adding to the
+//! sum and registering it in the cache becomes *O*(1) instead of *O*(2n+1)
+//! (where `n` is the radius). It's basically the most important thing that
+//! makes the algorithm constant-time.
+
+use std::collections::VecDeque;
+
+use crate::traits::StackBlurrable;
+
+/// An iterator that implements an improved Stackblur algorithm.
+///
+/// For any [`StackBlurrable`] element `T` and any iterator `I` over items of
+/// type `T`, [`StackBlur`] will yield the same amount of elements as `I`,
+/// blurred together according to the improved Stackblur algorithm as described
+/// by the [`iter`][crate::iter] module documentation.
+///
+/// ## Usage
+///
+/// [`StackBlur`] just wraps another iterator with a radius and a cache (called
+/// `ops` by [`StackBlur::new`]):
+///
+/// ```compile_fail
+/// # use std::collections::VecDeque;
+/// # use std::num::Wrapping;
+/// # use stackblur_iter::iter::StackBlur;
+/// #
+/// let arr = [255u8, 0, 0, 0, 127, 0, 0, 0, 255u8];
+/// let iter = arr.iter().copied().map(|i| Wrapping(i as usize));
+/// let blur = StackBlur::new(iter, 2, VecDeque::new());
+/// ```
+///
+/// That example unfortunately doesn't compile because `Wrapping<usize>` doesn't
+/// implement `Mul<usize>` and `Div<usize>`, which are required for the
+/// averaging that [`StackBlur`] performs. It's recommended to create a newtype
+/// wrapper around whatever you plan on blurring, and implement all the traits
+/// required by [`StackBlurrable`].
+///
+/// A [`StackBlur`] always yields exactly as many items as its inner iterator
+/// does. Additionally, a non-fused iterator which repeats will cause the
+/// [`StackBlur`] to repeat as well.
+pub struct StackBlur<'a, T: StackBlurrable, I: Iterator<Item = T>> {
+ iter: I,
+ radius: usize,
+ sum: T,
+ rate: T,
+ dnom: usize,
+ ops: &'a mut VecDeque<T>,
+ leading: usize,
+ trailing: usize,
+ done: bool,
+ first: bool,
+}
+
+impl<'a, T: StackBlurrable, I: Iterator<Item = T>> StackBlur<'a, T, I> {
+ /// Creates a new [`StackBlur`] from the provided iterator, radius, and
+ /// [`VecDeque`].
+ ///
+ /// The iterator is not advanced until a call to [`StackBlur::next`].
+ pub fn new(iter: I, radius: usize, ops: &'a mut VecDeque<T>) -> Self {
+ Self {
+ iter,
+ radius,
+ sum: T::default(),
+ rate: T::default(),
+ dnom: 0,
+ leading: 0,
+ trailing: 0,
+ ops,
+ done: true,
+ first: true,
+ }
+ }
+
+ fn init(&mut self) {
+ self.done = false;
+
+ for sub in 0..=self.radius {
+ let Some(item) = self.iter.next() else { break };
+
+ if sub == 0 {
+ let start = self.radius + 1;
+ let needed = start * 2 + 2;
+ self.ops.reserve(needed.saturating_sub(self.ops.capacity()));
+ self.ops
+ .iter_mut()
+ .take(start)
+ .for_each(|place| *place = T::default());
+ self.ops.resize_with(start, T::default);
+
+ self.sum = T::default();
+ self.rate = T::default();
+ self.dnom = 0;
+ self.leading = 0;
+ self.trailing = 0;
+ }
+
+ let mul = self.radius + 1 - sub;
+ self.sum += item * mul;
+ self.rate += item;
+ self.dnom += mul;
+
+ if self.dnom > mul {
+ self.trailing += 1;
+ }
+
+ self.ops[sub] -= item * 2;
+ self.ops.push_back(item);
+ }
+
+ if self.dnom == 0 {
+ self.done = true;
+ }
+ }
+}
+
+impl<T: StackBlurrable, I: Iterator<Item = T>> Iterator for StackBlur<'_, T, I> {
+ type Item = T;
+
+ #[inline]
+ fn next(&mut self) -> Option<Self::Item> {
+ if self.done {
+ self.init();
+
+ if !std::mem::replace(&mut self.first, false) {
+ return None;
+ }
+ }
+
+ let result = self.sum / self.dnom;
+
+ self.rate += self.ops.pop_front().unwrap();
+ self.sum += self.rate;
+
+ if self.leading < self.radius {
+ self.leading += 1;
+ self.dnom += self.radius + 1 - self.leading;
+ }
+
+ if self.radius == 0 || self.trailing == self.radius {
+ if let Some(item) = self.iter.next() {
+ self.sum += item;
+ self.rate += item;
+ self.ops[self.radius] -= item * 2;
+ self.ops.push_back(item);
+ } else if self.radius > 0 {
+ self.dnom -= self.radius + 1 - self.trailing;
+ self.trailing -= 1;
+ } else {
+ self.done = true;
+ }
+ } else if self.trailing > 0 {
+ self.dnom -= self.radius + 1 - self.trailing;
+ self.trailing -= 1;
+ } else if self.trailing == 0 {
+ self.done = true;
+ }
+
+ Some(result)
+ }
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100755
index 0000000..6cf1b17
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,205 @@
+//! A fast, iterative, correct approach to Stackblur, resulting in a very smooth
+//! and high-quality output, with no edge bleeding.
+//!
+//! This crate implements a tweaked version of the Stackblur algorithm requiring
+//! `radius * 2 + 2` elements of space rather than `radius * 2 + 1`, which is a
+//! small tradeoff for much-increased visual quality.
+//!
+//! The algorithm is exposed as an iterator ([`StackBlur`]) that can wrap any
+//! other iterator that yields elements of [`StackBlurrable`]. The [`StackBlur`]
+//! will then yield elements blurred by the specified radius.
+//!
+//! ## Benefits of this crate
+//!
+//! Stackblur is essentially constant-time. Regardless of the radius, it always
+//! performs only 1 scan over the input iterator and outputs exactly the same
+//! amount of elements.
+//!
+//! Additionally, it produces results that are comparable to slow and expensive
+//! Gaussian blurs. As opposed to box blur which uses a basic rolling average,
+//! Stackblur uses a weighted average where each output pixel is affected more
+//! strongly by the inputs that were closest to it.
+//!
+//! Despite that, Stackblur does not perform much worse compared to naive box
+//! blurs, and is quite cheap compared to full Gaussian blurs, at least for the
+//! CPU. The implementation in this crate will most likely beat most unoptimized
+//! blurs you can find on crates.io, as well as some optimized ones, and it is
+//! extremely flexible and generic.
+//!
+//! For a full explanation of the improvements made to the Stackblur algorithm,
+//! see the [`iter`] module.
+//!
+//! ## Comparison to the `stackblur` crate
+//!
+//! `stackblur` suffers from edge bleeding and flexibility problems. For
+//! example, it can only operate on buffers of 32-bit integers, and expects them
+//! to be packed linear ARGB pixels. Additionally, it cannot operate on a 2D
+//! subslice of a buffer (like `imgref` allows for this crate), and it does not
+//! offer any streaming iterators or documentation. And it also only supports
+//! a blur radius of up to 255.
+//!
+//! ## Usage
+//!
+//! Aside from [`StackBlurrable`] and [`StackBlur`] which host their own
+//! documentation, there are helper functions like [`blur`] and [`blur_argb`]
+//! that can be used to interact with 2D image buffers, due to the fact that
+//! doing so manually involves unsafe code (if you want no-copy).
+
+#![feature(portable_simd, stmt_expr_attributes)]
+#![cfg_attr(test, feature(test))]
+
+use std::collections::VecDeque;
+use std::simd::{LaneCount, SupportedLaneCount};
+
+pub extern crate imgref;
+
+use imgref::ImgRefMut;
+
+#[cfg(test)]
+mod test;
+
+pub mod color;
+pub mod iter;
+pub mod traits;
+
+use color::Argb;
+use iter::StackBlur;
+use traits::StackBlurrable;
+
+/// Blurs a buffer, assuming one element per pixel.
+///
+/// The provided closures are used to convert from the buffer's native pixel
+/// format to [`StackBlurrable`] values that can be consumed by [`StackBlur`].
+pub fn blur<T, B: StackBlurrable>(
+ buffer: &mut ImgRefMut<T>,
+ radius: usize,
+ mut to_blurrable: impl FnMut(&T) -> B,
+ mut to_pixel: impl FnMut(B) -> T,
+) {
+ use imgref_iter::iter::{IterWindows, IterWindowsPtrMut};
+ use imgref_iter::traits::{ImgIter, ImgIterMut, ImgIterPtrMut};
+
+ let mut ops = VecDeque::new();
+
+ // This is needed to avoid Undefined Behavior. Writing to the rows of the
+ // must be done before constructing the columns iterators, because otherwise
+ // the writes would invalidate their borrows. However I don't want to
+ // duplicate this loop, so make it a closure.
+ let mut blur_windows = |writer: IterWindowsPtrMut<T>, reader: IterWindows<T>| {
+ for (write, read) in writer.zip(reader) {
+ let mut blur = StackBlur::new(read.map(&mut to_blurrable), radius, &mut ops);
+ write.for_each(|place| unsafe { *place = to_pixel(blur.next().unwrap()) });
+ }
+ };
+
+ let buffer_ptr = buffer.as_mut_ptr();
+ blur_windows(
+ unsafe { buffer_ptr.iter_rows_ptr_mut() },
+ buffer.iter_rows(),
+ );
+ blur_windows(
+ unsafe { buffer_ptr.iter_cols_ptr_mut() },
+ buffer.iter_cols(),
+ );
+}
+
+/// Blurs a buffer with SIMD, assuming one element per pixel.
+///
+/// The provided closures are used to convert from the buffer's native pixel
+/// format to [`StackBlurrable`] values that can be consumed by [`StackBlur`].
+pub fn simd_blur<T, Bsimd: StackBlurrable, Bsingle: StackBlurrable, const LANES: usize>(
+ buffer: &mut ImgRefMut<T>,
+ radius: usize,
+ mut to_blurrable_simd: impl FnMut([&T; LANES]) -> Bsimd,
+ mut to_pixel_simd: impl FnMut(Bsimd) -> [T; LANES],
+ mut to_blurrable_single: impl FnMut(&T) -> Bsingle,
+ mut to_pixel_single: impl FnMut(Bsingle) -> T,
+) where
+ LaneCount<LANES>: SupportedLaneCount,
+{
+ #[cfg(not(doc))]
+ use imgref_iter::iter::{
+ SimdIterWindow, SimdIterWindowPtrMut, SimdIterWindows, SimdIterWindowsPtrMut,
+ };
+ #[cfg(not(doc))]
+ use imgref_iter::traits::{ImgIterMut, ImgSimdIter, ImgSimdIterPtrMut};
+
+ let mut ops_simd = VecDeque::new();
+ let mut ops_single = VecDeque::new();
+
+ let mut simd_blur_windows =
+ |writer: SimdIterWindowsPtrMut<T, LANES>,
+ reader: SimdIterWindows<T, LANES>,
+ mut ops_simd: VecDeque<Bsimd>,
+ mut ops_single: VecDeque<Bsingle>| {
+ for (write, read) in writer.zip(reader) {
+ match (write, read) {
+ (SimdIterWindowPtrMut::Simd(write), SimdIterWindow::Simd(read)) => {
+ let mut blur =
+ StackBlur::new(read.map(&mut to_blurrable_simd), radius, &mut ops_simd);
+ write.for_each(|place| {
+ place
+ .into_iter()
+ .zip(to_pixel_simd(blur.next().unwrap()))
+ .for_each(|(place, pixel)| unsafe { *place = pixel });
+ });
+ }
+
+ (SimdIterWindowPtrMut::Single(write), SimdIterWindow::Single(read)) => {
+ let mut blur = StackBlur::new(
+ read.map(&mut to_blurrable_single),
+ radius,
+ &mut ops_single,
+ );
+ write.for_each(|place| unsafe {
+ *place = to_pixel_single(blur.next().unwrap());
+ });
+ }
+
+ _ => unreachable!(),
+ }
+ }
+
+ (ops_simd, ops_single)
+ };
+
+ let buffer_ptr = buffer.as_mut_ptr();
+ (ops_simd, ops_single) = simd_blur_windows(
+ unsafe { buffer_ptr.simd_iter_rows_ptr_mut::<LANES>() },
+ buffer.simd_iter_rows::<LANES>(),
+ ops_simd,
+ ops_single,
+ );
+ simd_blur_windows(
+ unsafe { buffer_ptr.simd_iter_cols_ptr_mut::<LANES>() },
+ buffer.simd_iter_cols::<LANES>(),
+ ops_simd,
+ ops_single,
+ );
+}
+
+/// Blurs a buffer of 32-bit packed ARGB pixels (0xAARRGGBB).
+///
+/// This is a version of [`blur`] with pre-filled conversion routines that
+/// provide good results for blur radii <= 4096. Larger radii may overflow.
+pub fn blur_argb(buffer: &mut ImgRefMut<u32>, radius: usize) {
+ blur(buffer, radius, |i| Argb::from(*i), Argb::into);
+}
+
+/// Blurs a buffer of 32-bit packed ARGB pixels (0xAARRGGBB) with SIMD.
+///
+/// This is a version of [`simd_blur`] with pre-filled conversion routines that
+/// provide good results for blur radii <= 4096. Larger radii may overflow.
+pub fn simd_blur_argb<const LANES: usize>(buffer: &mut ImgRefMut<u32>, radius: usize)
+where
+ LaneCount<LANES>: SupportedLaneCount,
+{
+ simd_blur(
+ buffer,
+ radius,
+ |i: [&u32; LANES]| Argb::from(i.map(u32::clone)),
+ Argb::into,
+ |i| Argb::from(*i),
+ Argb::into,
+ );
+}
diff --git a/src/test.rs b/src/test.rs
new file mode 100755
index 0000000..2993e93
--- /dev/null
+++ b/src/test.rs
@@ -0,0 +1,49 @@
+extern crate test;
+
+use imgref::ImgVec;
+use test::Bencher;
+
+const WIDTH: usize = 640;
+const HEIGHT: usize = 480;
+
+#[bench]
+#[inline(never)]
+fn blur_argb_16(bencher: &mut Bencher) {
+ let mut buf = ImgVec::new(vec![0; WIDTH * HEIGHT], WIDTH, HEIGHT);
+ bencher.iter(|| crate::blur_argb(&mut buf.as_mut(), 16));
+}
+
+#[bench]
+#[inline(never)]
+fn blur_argb_128(bencher: &mut Bencher) {
+ let mut buf = ImgVec::new(vec![0; WIDTH * HEIGHT], WIDTH, HEIGHT);
+ bencher.iter(|| crate::blur_argb(&mut buf.as_mut(), 128));
+}
+
+#[bench]
+#[inline(never)]
+fn blur_argb_1024(bencher: &mut Bencher) {
+ let mut buf = ImgVec::new(vec![0; WIDTH * HEIGHT], WIDTH, HEIGHT);
+ bencher.iter(|| crate::blur_argb(&mut buf.as_mut(), 1024));
+}
+
+#[bench]
+#[inline(never)]
+fn simd_blur_argb_16(bencher: &mut Bencher) {
+ let mut buf = ImgVec::new(vec![0; WIDTH * HEIGHT], WIDTH, HEIGHT);
+ bencher.iter(|| crate::simd_blur_argb::<8>(&mut buf.as_mut(), 16));
+}
+
+#[bench]
+#[inline(never)]
+fn simd_blur_argb_128(bencher: &mut Bencher) {
+ let mut buf = ImgVec::new(vec![0; WIDTH * HEIGHT], WIDTH, HEIGHT);
+ bencher.iter(|| crate::simd_blur_argb::<8>(&mut buf.as_mut(), 128));
+}
+
+#[bench]
+#[inline(never)]
+fn simd_blur_argb_1024(bencher: &mut Bencher) {
+ let mut buf = ImgVec::new(vec![0; WIDTH * HEIGHT], WIDTH, HEIGHT);
+ bencher.iter(|| crate::simd_blur_argb::<8>(&mut buf.as_mut(), 1024));
+}
diff --git a/src/traits.rs b/src/traits.rs
new file mode 100755
index 0000000..852b1d0
--- /dev/null
+++ b/src/traits.rs
@@ -0,0 +1,36 @@
+//! The home of [`StackBlurrable`].
+
+use std::ops::{Add, AddAssign, Div, Mul, SubAssign};
+
+/// The trait for types which can be blurred by [`StackBlur`][crate::StackBlur].
+///
+/// This trait is auto-implemented for all types that satisfy its requirements.
+///
+/// Types that wish to implement this trait should be signed or use explicitly
+/// wrapping arithmetic.
+///
+/// They should have a significantly higher precision than the pixel format that
+/// they represent, as they may be multiplied by hundreds or thousands before
+/// being divided. They should also ideally be `Copy` so that cloning is cheap.
+pub trait StackBlurrable:
+ Default
+ + Copy
+ + Add<Output = Self>
+ + AddAssign
+ + SubAssign
+ + Mul<usize, Output = Self>
+ + Div<usize, Output = Self>
+{
+}
+
+impl<
+ T: Default
+ + Copy
+ + Add<Output = T>
+ + AddAssign
+ + SubAssign
+ + Mul<usize, Output = T>
+ + Div<usize, Output = T>,
+ > StackBlurrable for T
+{
+}