Unnamed repository; edit this file 'description' to name the repository.
Diffstat (limited to 'helix-syntax/src/lib.rs')
| -rw-r--r-- | helix-syntax/src/lib.rs | 355 |
1 files changed, 130 insertions, 225 deletions
diff --git a/helix-syntax/src/lib.rs b/helix-syntax/src/lib.rs index b7331a3a..593fe3de 100644 --- a/helix-syntax/src/lib.rs +++ b/helix-syntax/src/lib.rs @@ -1,33 +1,82 @@ use ::ropey::RopeSlice; -use ::tree_sitter::{Node, Parser, Point, Query, QueryCursor, Range, Tree}; -use slotmap::HopSlotMap; +use slotmap::{new_key_type, HopSlotMap}; use std::borrow::Cow; -use std::cell::RefCell; use std::hash::{Hash, Hasher}; use std::path::Path; use std::str; use std::sync::Arc; -use crate::injections_tree::LayerId; use crate::parse::LayerUpdateFlags; pub use crate::config::{read_query, HighlightConfiguration}; -pub use crate::ropey::RopeProvider; -pub use merge::merge; +use crate::tree_sitter::{SyntaxTree, SyntaxTreeNode}; pub use pretty_print::pretty_print_tree; pub use tree_cursor::TreeCursor; mod config; pub mod highlighter; -mod injections_tree; -mod merge; +pub mod highlighter2; mod parse; mod pretty_print; -mod ropey; +mod query_iter; +pub mod text_object; mod tree_cursor; pub mod tree_sitter; +new_key_type! { + /// The default slot map key type. + pub struct LayerId; +} + +/// The maximum number of in-progress matches a TS cursor can consider at once. +/// This is set to a constant in order to avoid performance problems for medium to large files. Set with `set_match_limit`. +/// Using such a limit means that we lose valid captures, so there is fundamentally a tradeoff here. +/// +/// +/// Old tree sitter versions used a limit of 32 by default until this limit was removed in version `0.19.5` (must now be set manually). +/// However, this causes performance issues for medium to large files. +/// In helix, this problem caused treesitter motions to take multiple seconds to complete in medium-sized rust files (3k loc). +/// +/// +/// Neovim also encountered this problem and reintroduced this limit after it was removed upstream +/// (see <https://github.com/neovim/neovim/issues/14897> and <https://github.com/neovim/neovim/pull/14915>). +/// The number used here is fundamentally a tradeoff between breaking some obscure edge cases and performance. +/// +/// +/// Neovim chose 64 for this value somewhat arbitrarily (<https://github.com/neovim/neovim/pull/18397>). +/// 64 is too low for some languages though. In particular, it breaks some highlighting for record fields in Erlang record definitions. +/// This number can be increased if new syntax highlight breakages are found, as long as the performance penalty is not too high. +pub const TREE_SITTER_MATCH_LIMIT: u32 = 256; + +// TODO(perf): replace std::ops::Range<usize> with helix_stdx::Range<u32> once added +type Range = std::ops::Range<usize>; + +/// The Tree siitter syntax tree for a single language. + +/// This is really multipe nested different syntax trees due to tree sitter +/// injections. A single syntax tree/parser is called layer. Each layer +/// is parsed as a single "file" by tree sitter. There can be multiple layers +/// for the same language. A layer corresponds to one of three things: +/// * the root layer +/// * a singular injection limited to a single node in it's parent layer +/// * Multiple injections (multiple disjoint nodes in parent layer) that are +/// parsed as tough they are a single uninterrupted file. +/// +/// An injection always refer to a single node into which another layer is +/// injected. As injections only correspond to syntax tree nodes injections in +/// the same layer do not intersect. However, the syntax tree in a an injected +/// layer can have nodes that intersect with nodes from the parent layer. For +/// example: +/// ``` +/// layer2: | Sibling A | Sibling B (layer3) | Sibling C | +/// layer1: | Sibling A (layer2) | Sibling B | Sibling C (layer2) | +/// ```` +/// In this case Sibling B really spans across a "GAP" in layer2. While the syntax +/// node can not be split up by tree sitter directly, we can treat Sibling B as two +/// seperate injections. That is done while parsing/running the query capture. As +/// a result the injections from a tree. Note that such other queries must account for +/// such multi injection nodes. #[derive(Debug)] pub struct Syntax { layers: HopSlotMap<LayerId, LanguageLayer>, @@ -41,16 +90,20 @@ impl Syntax { injection_callback: impl Fn(&InjectionLanguageMarker) -> Option<Arc<HighlightConfiguration>>, ) -> Option<Self> { let root_layer = LanguageLayer { - tree: None, + parse_tree: None, config, - depth: 0, flags: LayerUpdateFlags::empty(), - ranges: vec![Range { + ranges: vec![tree_sitter::Range { start_byte: 0, - end_byte: usize::MAX, - start_point: Point::new(0, 0), - end_point: Point::new(usize::MAX, usize::MAX), - }], + end_byte: u32::MAX, + start_point: tree_sitter::Point { row: 0, col: 0 }, + end_point: tree_sitter::Point { + row: u32::MAX, + col: u32::MAX, + }, + }] + .into_boxed_slice(), + injections: Box::new([]), parent: None, }; @@ -70,49 +123,75 @@ impl Syntax { Some(syntax) } - pub fn tree(&self) -> &Tree { + pub fn tree(&self) -> &SyntaxTree { self.layers[self.root].tree() } - pub fn tree_for_byte_range(&self, start: usize, end: usize) -> &Tree { - let mut container_id = self.root; - - for (layer_id, layer) in self.layers.iter() { - if layer.depth > self.layers[container_id].depth - && layer.contains_byte_range(start, end) - { - container_id = layer_id; - } - } - - self.layers[container_id].tree() + pub fn tree_for_byte_range(&self, start: usize, end: usize) -> &SyntaxTree { + let layer = self.layer_for_byte_range(start, end); + self.layers[layer].tree() } - pub fn named_descendant_for_byte_range(&self, start: usize, end: usize) -> Option<Node<'_>> { + pub fn named_descendant_for_byte_range( + &self, + start: usize, + end: usize, + ) -> Option<SyntaxTreeNode<'_>> { self.tree_for_byte_range(start, end) .root_node() .named_descendant_for_byte_range(start, end) } - pub fn descendant_for_byte_range(&self, start: usize, end: usize) -> Option<Node<'_>> { + pub fn descendant_for_byte_range( + &self, + start: usize, + end: usize, + ) -> Option<SyntaxTreeNode<'_>> { self.tree_for_byte_range(start, end) .root_node() .descendant_for_byte_range(start, end) } + pub fn layer_for_byte_range(&self, start: usize, end: usize) -> LayerId { + let mut cursor = self.root; + loop { + let layer = &self.layers[cursor]; + let Some(start_injection) = layer.injection_at_byte_idx(start) else { + break; + }; + let Some(end_injection) = layer.injection_at_byte_idx(end) else { + break; + }; + if start_injection.layer == end_injection.layer { + cursor = start_injection.layer; + } else { + break; + } + } + cursor + } + pub fn walk(&self) -> TreeCursor<'_> { TreeCursor::new(&self.layers, self.root) } } +#[derive(Debug, Clone)] +pub(crate) struct Injection { + pub byte_range: Range, + pub layer: LayerId, +} + #[derive(Debug)] pub struct LanguageLayer { - // mode - // grammar pub config: Arc<HighlightConfiguration>, - pub(crate) tree: Option<Tree>, - pub ranges: Vec<Range>, - pub depth: u32, + parse_tree: Option<SyntaxTree>, + ranges: Box<[tree_sitter::Range]>, + /// a list of **sorted** non-overlapping injection ranges. Note that + /// injection ranges are not relative to the start of this layer but the + /// start of the root layer + injections: Box<[Injection]>, + /// internal flags used during parsing to track incremental invalidation flags: LayerUpdateFlags, parent: Option<LayerId>, } @@ -123,8 +202,8 @@ pub struct LanguageLayer { /// state. impl PartialEq for LanguageLayer { fn eq(&self, other: &Self) -> bool { - self.depth == other.depth - && self.config.language == other.config.language + self.parent == other.parent + && self.config.grammar == other.config.grammar && self.ranges == other.ranges } } @@ -133,165 +212,27 @@ impl PartialEq for LanguageLayer { /// See its documentation for details. impl Hash for LanguageLayer { fn hash<H: Hasher>(&self, state: &mut H) { - self.depth.hash(state); - self.config.language.hash(state); + self.parent.hash(state); + self.config.grammar.hash(state); self.ranges.hash(state); } } impl LanguageLayer { - pub fn tree(&self) -> &Tree { + pub fn tree(&self) -> &SyntaxTree { // TODO: no unwrap - self.tree.as_ref().unwrap() - } - - /// Whether the layer contains the given byte range. - /// - /// If the layer has multiple ranges (i.e. combined injections), the - /// given range is considered contained if it is within the start and - /// end bytes of the first and last ranges **and** if the given range - /// starts or ends within any of the layer's ranges. - fn contains_byte_range(&self, start: usize, end: usize) -> bool { - let layer_start = self - .ranges - .first() - .expect("ranges should not be empty") - .start_byte; - let layer_end = self - .ranges - .last() - .expect("ranges should not be empty") - .end_byte; - - layer_start <= start - && layer_end >= end - && self.ranges.iter().any(|range| { - let byte_range = range.start_byte..range.end_byte; - byte_range.contains(&start) || byte_range.contains(&end) - }) - } -} - -#[derive(Debug, Clone)] -pub enum InjectionLanguageMarker<'a> { - Name(Cow<'a, str>), - Filename(Cow<'a, Path>), - Shebang(String), -} - -const SHEBANG: &str = r"#!\s*(?:\S*[/\\](?:env\s+(?:\-\S+\s+)*)?)?([^\s\.\d]+)"; - -#[derive(Debug)] -pub enum CapturedNode<'a> { - Single(Node<'a>), - /// Guaranteed to be not empty - Grouped(Vec<Node<'a>>), -} - -impl<'a> CapturedNode<'a> { - pub fn start_byte(&self) -> usize { - match self { - Self::Single(n) => n.start_byte(), - Self::Grouped(ns) => ns[0].start_byte(), - } - } - - pub fn end_byte(&self) -> usize { - match self { - Self::Single(n) => n.end_byte(), - Self::Grouped(ns) => ns.last().unwrap().end_byte(), - } + self.parse_tree.as_ref().unwrap() } - pub fn byte_range(&self) -> std::ops::Range<usize> { - self.start_byte()..self.end_byte() - } -} - -/// The maximum number of in-progress matches a TS cursor can consider at once. -/// This is set to a constant in order to avoid performance problems for medium to large files. Set with `set_match_limit`. -/// Using such a limit means that we lose valid captures, so there is fundamentally a tradeoff here. -/// -/// -/// Old tree sitter versions used a limit of 32 by default until this limit was removed in version `0.19.5` (must now be set manually). -/// However, this causes performance issues for medium to large files. -/// In helix, this problem caused treesitter motions to take multiple seconds to complete in medium-sized rust files (3k loc). -/// -/// -/// Neovim also encountered this problem and reintroduced this limit after it was removed upstream -/// (see <https://github.com/neovim/neovim/issues/14897> and <https://github.com/neovim/neovim/pull/14915>). -/// The number used here is fundamentally a tradeoff between breaking some obscure edge cases and performance. -/// -/// -/// Neovim chose 64 for this value somewhat arbitrarily (<https://github.com/neovim/neovim/pull/18397>). -/// 64 is too low for some languages though. In particular, it breaks some highlighting for record fields in Erlang record definitions. -/// This number can be increased if new syntax highlight breakages are found, as long as the performance penalty is not too high. -const TREE_SITTER_MATCH_LIMIT: u32 = 256; - -#[derive(Debug)] -pub struct TextObjectQuery { - pub query: Query, -} - -impl TextObjectQuery { - /// Run the query on the given node and return sub nodes which match given - /// capture ("function.inside", "class.around", etc). - /// - /// Captures may contain multiple nodes by using quantifiers (+, *, etc), - /// and support for this is partial and could use improvement. - /// - /// ```query - /// (comment)+ @capture - /// - /// ; OR - /// ( - /// (comment)* - /// . - /// (function) - /// ) @capture - /// ``` - pub fn capture_nodes<'a>( - &'a self, - capture_name: &str, - node: Node<'a>, - slice: RopeSlice<'a>, - cursor: &'a mut QueryCursor, - ) -> Option<impl Iterator<Item = CapturedNode<'a>>> { - self.capture_nodes_any(&[capture_name], node, slice, cursor) - } - - /// Find the first capture that exists out of all given `capture_names` - /// and return sub nodes that match this capture. - pub fn capture_nodes_any<'a>( - &'a self, - capture_names: &[&str], - node: Node<'a>, - slice: RopeSlice<'a>, - cursor: &'a mut QueryCursor, - ) -> Option<impl Iterator<Item = CapturedNode<'a>>> { - let capture_idx = capture_names - .iter() - .find_map(|cap| self.query.capture_index_for_name(cap))?; - - cursor.set_match_limit(TREE_SITTER_MATCH_LIMIT); - - let nodes = cursor - .captures(&self.query, node, RopeProvider(slice)) - .filter_map(move |(mat, _)| { - let nodes: Vec<_> = mat - .captures - .iter() - .filter_map(|cap| (cap.index == capture_idx).then_some(cap.node)) - .collect(); - - if nodes.len() > 1 { - Some(CapturedNode::Grouped(nodes)) - } else { - nodes.into_iter().map(CapturedNode::Single).next() - } - }); - - Some(nodes) + /// Returns the injection range **within this layers** that contains `idx`. + /// This function will not descend into nested injections + pub(crate) fn injection_at_byte_idx(&self, idx: usize) -> Option<&Injection> { + let i = self + .injections + .partition_point(|range| range.byte_range.start < idx); + self.injections + .get(i) + .filter(|injection| injection.byte_range.end > idx) } } @@ -304,42 +245,6 @@ pub enum Error { Unknown, } -#[derive(Clone)] -enum IncludedChildren { - None, - All, - Unnamed, -} - -impl Default for IncludedChildren { - fn default() -> Self { - Self::None - } -} - fn byte_range_to_str(range: std::ops::Range<usize>, source: RopeSlice) -> Cow<str> { Cow::from(source.byte_slice(range)) } - -struct TsParser { - parser: ::tree_sitter::Parser, - pub cursors: Vec<QueryCursor>, -} - -// could also just use a pool, or a single instance? -thread_local! { - static PARSER: RefCell<TsParser> = RefCell::new(TsParser { - parser: Parser::new(), - cursors: Vec::new(), - }) -} - -pub fn with_cursor<T>(f: impl FnOnce(&mut QueryCursor) -> T) -> T { - PARSER.with(|parser| { - let mut parser = parser.borrow_mut(); - let mut cursor = parser.cursors.pop().unwrap_or_default(); - let res = f(&mut cursor); - parser.cursors.push(cursor); - res - }) -} |