rust-analyzer

Unnamed repository; edit this file 'description' to name the repository.

master 2Branches 0Tags

Clone

HTTPS

SSH

Open with VS Code

Diffstat (limited to 'lib/line-index/src/lib.rs')

-rw-r--r--

lib/line-index/src/lib.rs

218

1 files changed, 180 insertions, 38 deletions

diff --git a/lib/line-index/src/lib.rs b/lib/line-index/src/lib.rs
index ad67d3f246..03371c9c87 100644
--- a/lib/line-index/src/lib.rs
+++ b/lib/line-index/src/lib.rs

@@ -94,44 +94,7 @@ pub struct LineIndex {

impl LineIndex {

/// Returns a `LineIndex` for the `text`.

pub fn new(text: &str) -> LineIndex {

- let mut newlines = Vec::<TextSize>::with_capacity(16);

- let mut line_wide_chars = IntMap::<u32, Box<[WideChar]>>::default();

- let mut wide_chars = Vec::<WideChar>::new();

- let mut cur_row = TextSize::from(0);

- let mut cur_col = TextSize::from(0);

- let mut line = 0u32;

- for c in text.chars() {

- let c_len = TextSize::of(c);

- cur_row += c_len;

- if c == '\n' {

- newlines.push(cur_row);

- // Save any wide characters seen in the previous line

- if !wide_chars.is_empty() {

- let cs = std::mem::take(&mut wide_chars).into_boxed_slice();

- line_wide_chars.insert(line, cs);

- }

- // Prepare for processing the next line

- cur_col = TextSize::from(0);

- line += 1;

- continue;

- }

- if !c.is_ascii() {

- wide_chars.push(WideChar { start: cur_col, end: cur_col + c_len });

- }

- cur_col += c_len;

- }

- // Save any wide characters seen in the last line

- if !wide_chars.is_empty() {

- line_wide_chars.insert(line, wide_chars.into_boxed_slice());

- }

+ let (newlines, line_wide_chars) = analyze_source_file(text);

LineIndex {

newlines: newlines.into_boxed_slice(),

line_wide_chars,

@@ -235,3 +198,182 @@ impl LineIndex {

self.len

}

+/// This is adapted from the rustc_span crate, https://github.com/rust-lang/rust/blob/de59844c98f7925242a798a72c59dc3610dd0e2c/compiler/rustc_span/src/analyze_source_file.rs

+fn analyze_source_file(src: &str) -> (Vec<TextSize>, IntMap<u32, Box<[WideChar]>>) {

+ assert!(src.len() < !0u32 as usize);

+ let mut lines = vec![];

+ let mut line_wide_chars = IntMap::<u32, Vec<WideChar>>::default();

+ // Calls the right implementation, depending on hardware support available.

+ analyze_source_file_dispatch(src, &mut lines, &mut line_wide_chars);

+ (lines, line_wide_chars.into_iter().map(|(k, v)| (k, v.into_boxed_slice())).collect())

+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]

+fn analyze_source_file_dispatch(

+ src: &str,

+ lines: &mut Vec<TextSize>,

+ multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>,

+) {

+ if is_x86_feature_detected!("sse2") {

+ // SAFETY: SSE2 support was checked

+ unsafe {

+ analyze_source_file_sse2(src, lines, multi_byte_chars);

+ }

+ } else {

+ analyze_source_file_generic(src, src.len(), TextSize::from(0), lines, multi_byte_chars);

+ }

+/// Checks 16 byte chunks of text at a time. If the chunk contains

+/// any non-ASCII byte, the function falls back to the generic

+/// implementation for that chunk. Otherwise it uses

+/// SSE2 intrinsics to quickly find all newlines.

+#[target_feature(enable = "sse2")]

+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]

+unsafe fn analyze_source_file_sse2(

+ src: &str,

+ lines: &mut Vec<TextSize>,

+ multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>,

+) {

+ #[cfg(target_arch = "x86")]

+ use std::arch::x86::*;

+ #[cfg(target_arch = "x86_64")]

+ use std::arch::x86_64::*;

+ const CHUNK_SIZE: usize = 16;

+ let src_bytes = src.as_bytes();

+ let chunk_count = src.len() / CHUNK_SIZE;

+ // This variable keeps track of where we should start decoding a

+ // chunk. If a multi-byte character spans across chunk boundaries,

+ // we need to skip that part in the next chunk because we already

+ // handled it.

+ let mut intra_chunk_offset = 0;

+ for chunk_index in 0..chunk_count {

+ let ptr = src_bytes.as_ptr() as *const __m128i;

+ // We don't know if the pointer is aligned to 16 bytes, so we

+ // use `loadu`, which supports unaligned loading.

+ let chunk = _mm_loadu_si128(ptr.add(chunk_index));

+ // For each byte in the chunk, see if its value (as a signed i8) is < 0,

+ // which indicates that it's part of a multi-byte UTF-8 char.

+ let multibyte_test = _mm_cmplt_epi8(chunk, _mm_set1_epi8(0));

+ // Create a bit mask from the comparison results.

+ let multibyte_mask = _mm_movemask_epi8(multibyte_test);

+ // If the bit mask is all zero, we only have ASCII chars here:

+ if multibyte_mask == 0 {

+ assert!(intra_chunk_offset == 0);

+ // Check for newlines in the chunk

+ let newlines_test = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8));

+ let newlines_mask = _mm_movemask_epi8(newlines_test);

+ if newlines_mask != 0 {

+ // The chunk is pure ASCII, so every set bit marks a newline; record them

+ let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32;

+ let output_offset = TextSize::from((chunk_index * CHUNK_SIZE + 1) as u32);

+ loop {

+ let index = newlines_mask.trailing_zeros();

+ if index >= CHUNK_SIZE as u32 {

+ // We have arrived at the end of the chunk.

+ break;

+ }

+ lines.push(TextSize::from(index) + output_offset);

+ // Clear the bit, so we can find the next one.

+ newlines_mask &= (!1) << index;

+ }

+ continue;

+ }

+ // The slow path.

+ // There are non-ASCII bytes in here, fall back to generic decoding.

+ let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;

+ intra_chunk_offset = analyze_source_file_generic(

+ &src[scan_start..],

+ CHUNK_SIZE - intra_chunk_offset,

+ TextSize::from(scan_start as u32),

+ lines,

+ multi_byte_chars,

+ );

+ }

+ // There might still be a tail left to analyze

+ let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset;

+ if tail_start < src.len() {

+ analyze_source_file_generic(

+ &src[tail_start..],

+ src.len() - tail_start,

+ TextSize::from(tail_start as u32),

+ lines,

+ multi_byte_chars,

+ );

+ }

+#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]

+// The target (or compiler version) does not support SSE2 ...

+fn analyze_source_file_dispatch(

+ src: &str,

+ lines: &mut Vec<TextSize>,

+ multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>,

+) {

+ analyze_source_file_generic(src, src.len(), TextSize::from(0), lines, multi_byte_chars);

+// `scan_len` determines the number of bytes in `src` to scan. Note that the

+// function can read past `scan_len` if a multi-byte character starts within the

+// range but extends past it. The overflow is returned by the function.

+fn analyze_source_file_generic(

+ src: &str,

+ scan_len: usize,

+ output_offset: TextSize,

+ lines: &mut Vec<TextSize>,

+ multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>,

+) -> usize {

+ assert!(src.len() >= scan_len);

+ let mut i = 0;

+ let src_bytes = src.as_bytes();

+ while i < scan_len {

+ let byte = unsafe {

+ // We verified that i < scan_len <= src.len()

+ *src_bytes.get_unchecked(i)

+ };

+ // How much to advance in order to get to the next UTF-8 char in the

+ // string.

+ let mut char_len = 1;

+ if byte == b'\n' {

+ lines.push(TextSize::from(i as u32 + 1) + output_offset);

+ } else if byte >= 127 {

+ // The slow path: Just decode to `char`.

+ let c = src[i..].chars().next().unwrap();

+ char_len = c.len_utf8();

+ let pos = TextSize::from(i as u32) + output_offset;

+ if char_len > 1 {

+ assert!((2..=4).contains(&char_len));

+ let mbc = WideChar { start: pos, end: pos + TextSize::from(char_len as u32) };

+ multi_byte_chars.entry(lines.len() as u32).or_default().push(mbc);

+ }

+ i += char_len;

+ }

+ i - scan_len