Unnamed repository; edit this file 'description' to name the repository.
Diffstat (limited to 'lib/line-index/src/lib.rs')
| -rw-r--r-- | lib/line-index/src/lib.rs | 218 |
1 files changed, 180 insertions, 38 deletions
diff --git a/lib/line-index/src/lib.rs b/lib/line-index/src/lib.rs index ad67d3f246..03371c9c87 100644 --- a/lib/line-index/src/lib.rs +++ b/lib/line-index/src/lib.rs @@ -94,44 +94,7 @@ pub struct LineIndex { impl LineIndex { /// Returns a `LineIndex` for the `text`. pub fn new(text: &str) -> LineIndex { - let mut newlines = Vec::<TextSize>::with_capacity(16); - let mut line_wide_chars = IntMap::<u32, Box<[WideChar]>>::default(); - - let mut wide_chars = Vec::<WideChar>::new(); - let mut cur_row = TextSize::from(0); - let mut cur_col = TextSize::from(0); - let mut line = 0u32; - - for c in text.chars() { - let c_len = TextSize::of(c); - cur_row += c_len; - if c == '\n' { - newlines.push(cur_row); - - // Save any wide characters seen in the previous line - if !wide_chars.is_empty() { - let cs = std::mem::take(&mut wide_chars).into_boxed_slice(); - line_wide_chars.insert(line, cs); - } - - // Prepare for processing the next line - cur_col = TextSize::from(0); - line += 1; - continue; - } - - if !c.is_ascii() { - wide_chars.push(WideChar { start: cur_col, end: cur_col + c_len }); - } - - cur_col += c_len; - } - - // Save any wide characters seen in the last line - if !wide_chars.is_empty() { - line_wide_chars.insert(line, wide_chars.into_boxed_slice()); - } - + let (newlines, line_wide_chars) = analyze_source_file(text); LineIndex { newlines: newlines.into_boxed_slice(), line_wide_chars, @@ -235,3 +198,182 @@ impl LineIndex { self.len } } + +/// This is adapted from the rustc_span crate, https://github.com/rust-lang/rust/blob/de59844c98f7925242a798a72c59dc3610dd0e2c/compiler/rustc_span/src/analyze_source_file.rs +fn analyze_source_file(src: &str) -> (Vec<TextSize>, IntMap<u32, Box<[WideChar]>>) { + assert!(src.len() < !0u32 as usize); + let mut lines = vec![]; + let mut line_wide_chars = IntMap::<u32, Vec<WideChar>>::default(); + + // Calls the right implementation, depending on hardware support available. + analyze_source_file_dispatch(src, &mut lines, &mut line_wide_chars); + + (lines, line_wide_chars.into_iter().map(|(k, v)| (k, v.into_boxed_slice())).collect()) +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn analyze_source_file_dispatch( + src: &str, + lines: &mut Vec<TextSize>, + multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>, +) { + if is_x86_feature_detected!("sse2") { + // SAFETY: SSE2 support was checked + unsafe { + analyze_source_file_sse2(src, lines, multi_byte_chars); + } + } else { + analyze_source_file_generic(src, src.len(), TextSize::from(0), lines, multi_byte_chars); + } +} + +/// Checks 16 byte chunks of text at a time. If the chunk contains +/// something other than printable ASCII characters and newlines, the +/// function falls back to the generic implementation. Otherwise it uses +/// SSE2 intrinsics to quickly find all newlines. +#[target_feature(enable = "sse2")] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +unsafe fn analyze_source_file_sse2( + src: &str, + lines: &mut Vec<TextSize>, + multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>, +) { + #[cfg(target_arch = "x86")] + use std::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use std::arch::x86_64::*; + + const CHUNK_SIZE: usize = 16; + + let src_bytes = src.as_bytes(); + + let chunk_count = src.len() / CHUNK_SIZE; + + // This variable keeps track of where we should start decoding a + // chunk. If a multi-byte character spans across chunk boundaries, + // we need to skip that part in the next chunk because we already + // handled it. + let mut intra_chunk_offset = 0; + + for chunk_index in 0..chunk_count { + let ptr = src_bytes.as_ptr() as *const __m128i; + // We don't know if the pointer is aligned to 16 bytes, so we + // use `loadu`, which supports unaligned loading. + let chunk = _mm_loadu_si128(ptr.add(chunk_index)); + + // For character in the chunk, see if its byte value is < 0, which + // indicates that it's part of a UTF-8 char. + let multibyte_test = _mm_cmplt_epi8(chunk, _mm_set1_epi8(0)); + // Create a bit mask from the comparison results. + let multibyte_mask = _mm_movemask_epi8(multibyte_test); + + // If the bit mask is all zero, we only have ASCII chars here: + if multibyte_mask == 0 { + assert!(intra_chunk_offset == 0); + + // Check for newlines in the chunk + let newlines_test = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8)); + let newlines_mask = _mm_movemask_epi8(newlines_test); + + if newlines_mask != 0 { + // All control characters are newlines, record them + let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32; + let output_offset = TextSize::from((chunk_index * CHUNK_SIZE + 1) as u32); + + loop { + let index = newlines_mask.trailing_zeros(); + + if index >= CHUNK_SIZE as u32 { + // We have arrived at the end of the chunk. + break; + } + + lines.push(TextSize::from(index) + output_offset); + + // Clear the bit, so we can find the next one. + newlines_mask &= (!1) << index; + } + } + continue; + } + + // The slow path. + // There are control chars in here, fallback to generic decoding. + let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset; + intra_chunk_offset = analyze_source_file_generic( + &src[scan_start..], + CHUNK_SIZE - intra_chunk_offset, + TextSize::from(scan_start as u32), + lines, + multi_byte_chars, + ); + } + + // There might still be a tail left to analyze + let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset; + if tail_start < src.len() { + analyze_source_file_generic( + &src[tail_start..], + src.len() - tail_start, + TextSize::from(tail_start as u32), + lines, + multi_byte_chars, + ); + } +} + +#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] +// The target (or compiler version) does not support SSE2 ... +fn analyze_source_file_dispatch( + src: &str, + lines: &mut Vec<TextSize>, + multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>, +) { + analyze_source_file_generic(src, src.len(), TextSize::from(0), lines, multi_byte_chars); +} + +// `scan_len` determines the number of bytes in `src` to scan. Note that the +// function can read past `scan_len` if a multi-byte character start within the +// range but extends past it. The overflow is returned by the function. +fn analyze_source_file_generic( + src: &str, + scan_len: usize, + output_offset: TextSize, + lines: &mut Vec<TextSize>, + multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>, +) -> usize { + assert!(src.len() >= scan_len); + let mut i = 0; + let src_bytes = src.as_bytes(); + + while i < scan_len { + let byte = unsafe { + // We verified that i < scan_len <= src.len() + *src_bytes.get_unchecked(i) + }; + + // How much to advance in order to get to the next UTF-8 char in the + // string. + let mut char_len = 1; + + if byte == b'\n' { + lines.push(TextSize::from(i as u32 + 1) + output_offset); + } else if byte >= 127 { + // The slow path: Just decode to `char`. + let c = src[i..].chars().next().unwrap(); + char_len = c.len_utf8(); + + let pos = TextSize::from(i as u32) + output_offset; + + if char_len > 1 { + assert!((2..=4).contains(&char_len)); + let mbc = WideChar { start: pos, end: pos + TextSize::from(char_len as u32) }; + multi_byte_chars.entry(lines.len() as u32).or_default().push(mbc); + } + } + + i += char_len; + } + + i - scan_len +} |