heh
speedinate
| -rw-r--r-- | src/main.rs | 9 | ||||
| -rw-r--r-- | src/rah.rs | 247 |
2 files changed, 151 insertions, 105 deletions
diff --git a/src/main.rs b/src/main.rs index 3286a79..9e93ca1 100644 --- a/src/main.rs +++ b/src/main.rs @@ -76,6 +76,7 @@ use atools::prelude::*; #[implicit_fn::implicit_fn] pub unsafe fn p1(x: &'static [u8; ISIZE]) -> impl Debug { let i = x.chunked::<{ 141 + 1 }>(); + assert_eq!(i.len(), 142); let start = i.find(b'S'); util::memo_countg( @@ -97,10 +98,10 @@ pub unsafe fn p1(x: &'static [u8; ISIZE]) -> impl Debug { const ISIZE: usize = include_bytes!("inp.txt").len(); fn main() { use atools::prelude::*; - unsafe { println!("{:?}", p1(include_bytes!("inp.txt"))) }; - // unsafe { println!("{:?}", p1(include_str!("../1"))) }; - // unsafe { println!("{:?}", p1(include_str!("../2"))) }; - // unsafe { println!("{:?}", p1(include_str!("../3"))) }; + unsafe { println!("{:?}", rah::run(include_bytes!("inp.txt"))) }; + unsafe { println!("{:?}", rah::run(include_bytes!("../1"))) }; + unsafe { println!("{:?}", rah::run(include_bytes!("../2"))) }; + unsafe { println!("{:?}", rah::run(include_bytes!("../3"))) }; } #[bench] @@ -1,108 +1,153 @@ -#![feature(generic_const_exprs, portable_simd)] -use std::{ops::Range, simd::prelude::*}; +#![feature(generic_const_exprs, portable_simd, stmt_expr_attributes)] +use std::{hint::assert_unchecked, ops::Range, simd::prelude::*}; #[unsafe(no_mangle)] -pub unsafe fn run(x: &'static [u8]) -> u64 { - let width = 3712 + memchr::memchr(b'\n', x.get_unchecked(3712..)).unwrap_unchecked(); - let numbers = [ - x, - x.get_unchecked(width + 1..), - x.get_unchecked((width + 1) * 2..), - x.get_unchecked((width + 1) * 3..), - ]; - let mut tot = 0; - let ll = x.get_unchecked((width + 1) * 4..(width + 1) * 5 - 1); - let mut at = 0; - while let Some((index, mut mask)) = { - let sl = ll.get(at..).unwrap_unchecked(); - let x = u8x64::load_or(sl, Simd::splat(b' ')); - let mask = x.simd_ne(Simd::splat(b' ')).to_bitmask() as u64; - (mask != 0).then(|| { - ( - at + mask.trailing_zeros() as usize, - mask ^ (mask & mask.wrapping_neg()), - ) - }) - } { - let mut x = Simd::splat(0u16); - for i in 0..4 { - let c = u8x64::from_slice(numbers.get_unchecked(i).get_unchecked(index..index + 64)); - let m = c.simd_ne(Simd::splat(b' ')); - x = m - .cast::<i16>() - .select(x * Simd::splat(10) + (c - Simd::splat(b'0')).cast(), x); - } - let f = |ix: Range<usize>| { - // let iter1 = ix.clone().map(|i| { - // numbers - // .iter() - // .filter(|x| *x.get_unchecked(index + i) != b' ') - // .fold(0, |acc, x| { - // acc * 10 + (x.get_unchecked(index + i) - b'0') as u64 - // }) - // }); - // let mut tot_ = 0; - // match *ll.get_unchecked(index + ix.start) { - // b'+' => tot_ += iter1.clone().sum::<u64>(), - // b'*' => tot_ += iter1.clone().product::<u64>(), - // _ => panic!(), - // } - - // print!("{:?} = ", iter1.clone().collect::<Vec<_>>()); - // println!("{:?} {}", &x[ix.clone()], ll[index + ix.start] as char); - const IDX: u32x64 = u32x64::from_array([ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, - 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, - 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, - ]); - - let i = x - .as_array() - .get_unchecked(ix.clone()) - .iter() - .map(|&x| x as u64); - let x = match *ll.get_unchecked(index + ix.start) { - b'+' => { - let mask = IDX.simd_lt(Simd::splat(ix.end as _)) - & IDX.simd_ge(Simd::splat(ix.start as _)); - (mask.to_int().cast::<u32>() & x.cast::<u32>()).reduce_sum() as u64 - } - _ => i.product::<u64>(), - }; - // assert_eq!(x, tot_); - x - }; - if mask.leading_zeros() as usize == 64 { - return tot + f(0..ll.len() - index); - } - - let mut last = 0; - let mut hold: Option<Range<usize>> = None; +pub unsafe fn run(x: &[u8]) -> u64 { + let i = unsafe { x.as_chunks_unchecked::<{ 141 + 1 }>() }; + #[rustfmt::skip] + static LUT: [(usize, usize); 141] = [(0, 0), (0, 0), (70, 70), (0, 0), (69, 71), (0, 0), (68, 72), (0, 0), (67, 73), (0, 0), (66, 74), (0, 0), (65, 75), (0, 0), (64, 76), (0, 0), (63, 77), (0, 0), (62, 78), (0, 0), (61, 79), (0, 0), (60, 80), (0, 0), (59, 81), (0, 0), (58, 82), (0, 0), (57, 83), (0, 0), (56, 84), (0, 0), (55, 85), (0, 0), (54, 86), (0, 0), (53, 87), (0, 0), (52, 88), (0, 0), (51, 89), (0, 0), (50, 90), (0, 0), (49, 91), (0, 0), (48, 92), (0, 0), (47, 93), (0, 0), (46, 94), (0, 0), (45, 95), (0, 0), (44, 96), (0, 0), (43, 97), (0, 0), (42, 98), (0, 0), (41, 99), (0, 0), (40, 100), (0, 0), (39, 101), (0, 0), (38, 102), (0, 0), (37, 103), (0, 0), (36, 104), (0, 0), (35, 105), (0, 0), (34, 106), (0, 0), (33, 107), (0, 0), (32, 108), (0, 0), (31, 109), (0, 0), (30, 110), (0, 0), (29, 111), (0, 0), (28, 112), (0, 0), (27, 113), (0, 0), (26, 114), (0, 0), (25, 115), (0, 0), (24, 116), (0, 0), (23, 117), (0, 0), (22, 118), (0, 0), (21, 119), (0, 0), (20, 120), (0, 0), (19, 121), (0, 0), (18, 122), (0, 0), (17, 123), (0, 0), (16, 124), (0, 0), (15, 125), (0, 0), (14, 126), (0, 0), (13, 127), (0, 0), (12, 128), (0, 0), (11, 129), (0, 0), (10, 130), (0, 0), (9, 131), (0, 0), (8, 132), (0, 0), (7, 133), (0, 0), (6, 134), (0, 0), (5, 135), (0, 0), (4, 136), (0, 0), (3, 137), (0, 0), (2, 138), (0, 0), (1, 139)]; - while mask != 0 { - let next = mask.trailing_zeros() as usize; - mask ^= mask & mask.wrapping_neg(); - if *ll.get_unchecked(index + last) == b'+' { - if let Some(r) = &mut hold { - r.end = next; - } else { - hold = Some(last..next); - } - } else { - // * - if let Some(r) = hold.take() { - tot += f(r); - } - tot += f(last..next - 1); + for n in (1..141).rev().step_by(2) { + let lem = &i[n]; + println!("{n}: {}", lem.iter().filter(|x| **x == b'^').count()); + // assert_eq!((start, end), LUT[n]); + // assert_eq!(LUT[n], (start, end)); + // println!("({start}, {end}),"); + } + // println!("{LUT:?}"); + assert_unchecked(i.len() == 142); + let mut v = [1u64; 141]; + let split = Simd::splat(b'^'); + let mut f = #[inline(always)] + |(star, end): (usize, usize), line: &[u8; 142]| { + let mut f = #[inline(always)] + |mut mask: u64, o: usize| { + while mask != 0 { + let x = mask.trailing_zeros() as usize + o; + mask ^= mask & mask.wrapping_neg(); + *v.get_unchecked_mut(x) = *v.get_unchecked(x - 1) + *v.get_unchecked(x + 1); } - - last = next; + }; + // let (star, end) = LUT[ix]; + if (end - star) < 64 { + let mask = u8x64::load_or_default(&line[star..]) + .simd_eq(split) + .to_bitmask(); + f(mask, star); + } else if (end - star) < 128 { + let mask = u8x64::load_or_default(&line[star..]) + .simd_eq(split) + .to_bitmask(); + f(mask, star); + let mask = u8x64::load_or_default(&line[star + 64..]) + .simd_eq(split) + .to_bitmask(); + f(mask, star + 64); + } else { + let mask = u8x64::load_or_default(&line[star..]) + .simd_eq(split) + .to_bitmask(); + f(mask, star); + let mask = u8x64::load_or_default(&line[star + 64..]) + .simd_eq(split) + .to_bitmask(); + f(mask, star + 64); + let mask = u8x64::load_or_default(&line[star + 128..]) + .simd_eq(split) + .to_bitmask(); + f(mask, star + 128); } + // if ix > 65 { + // let mask = u8x64::load_or_default(&line[..39]) + // .simd_eq(split) + // .to_bitmask(); + // f(mask, 0); + // } + // let middle = u8x64::load_or_default(&line[39..]) + // .simd_eq(split) + // .to_bitmask(); + // f(middle, 39); + // if ix > 65 { + // let end = u8x64::load_or_default(&line[39 + 64..]) + // .simd_eq(split) + // .to_bitmask(); + // f(end, 39 + 64); + // } + }; - if let Some(r) = hold { - tot += f(r); - } + f(const { LUT[140] }, &i[140]); + f(const { LUT[138] }, &i[138]); + f(const { LUT[136] }, &i[136]); + f(const { LUT[134] }, &i[134]); + f(const { LUT[132] }, &i[132]); + f(const { LUT[130] }, &i[130]); + f(const { LUT[128] }, &i[128]); + f(const { LUT[126] }, &i[126]); + f(const { LUT[124] }, &i[124]); + f(const { LUT[122] }, &i[122]); + f(const { LUT[120] }, &i[120]); + f(const { LUT[118] }, &i[118]); + f(const { LUT[116] }, &i[116]); + f(const { LUT[114] }, &i[114]); + f(const { LUT[112] }, &i[112]); + f(const { LUT[110] }, &i[110]); + f(const { LUT[108] }, &i[108]); + f(const { LUT[106] }, &i[106]); + f(const { LUT[104] }, &i[104]); + f(const { LUT[102] }, &i[102]); + f(const { LUT[100] }, &i[100]); + f(const { LUT[98] }, &i[98]); + f(const { LUT[96] }, &i[96]); + f(const { LUT[94] }, &i[94]); + f(const { LUT[92] }, &i[92]); + f(const { LUT[90] }, &i[90]); + f(const { LUT[88] }, &i[88]); + f(const { LUT[86] }, &i[86]); + f(const { LUT[84] }, &i[84]); + f(const { LUT[82] }, &i[82]); + f(const { LUT[80] }, &i[80]); + f(const { LUT[78] }, &i[78]); + f(const { LUT[76] }, &i[76]); + f(const { LUT[74] }, &i[74]); + f(const { LUT[72] }, &i[72]); + f(const { LUT[70] }, &i[70]); + f(const { LUT[68] }, &i[68]); + f(const { LUT[66] }, &i[66]); + f(const { LUT[64] }, &i[64]); + f(const { LUT[62] }, &i[62]); + f(const { LUT[60] }, &i[60]); + f(const { LUT[58] }, &i[58]); + f(const { LUT[56] }, &i[56]); + f(const { LUT[54] }, &i[54]); + f(const { LUT[52] }, &i[52]); + f(const { LUT[50] }, &i[50]); + f(const { LUT[48] }, &i[48]); + f(const { LUT[46] }, &i[46]); + f(const { LUT[44] }, &i[44]); + f(const { LUT[42] }, &i[42]); + f(const { LUT[40] }, &i[40]); + f(const { LUT[38] }, &i[38]); + f(const { LUT[36] }, &i[36]); + f(const { LUT[34] }, &i[34]); + f(const { LUT[32] }, &i[32]); + f(const { LUT[30] }, &i[30]); + f(const { LUT[28] }, &i[28]); + f(const { LUT[26] }, &i[26]); + f(const { LUT[24] }, &i[24]); + f(const { LUT[22] }, &i[22]); + f(const { LUT[20] }, &i[20]); + f(const { LUT[18] }, &i[18]); + f(const { LUT[16] }, &i[16]); + f(const { LUT[14] }, &i[14]); + f(const { LUT[12] }, &i[12]); + f(const { LUT[10] }, &i[10]); + f(const { LUT[8] }, &i[8]); + f(const { LUT[6] }, &i[6]); + f(const { LUT[4] }, &i[4]); + f(const { LUT[2] }, &i[2]); + // for n in (1..141).rev().step_by(2) { + // println!("f({n}, &i[{n}]);"); + // f(n, &i[n]); + // } - at += last; - } - tot + *v.get_unchecked(70) } |