helix

Unnamed repository; edit this file 'description' to name the repository.

master 24Branches 30Tags

Clone

HTTPS

SSH

Open with VS Code

Diffstat (limited to 'helix-syntax/src/lib.rs')

-rw-r--r--

helix-syntax/src/lib.rs

355

1 files changed, 130 insertions, 225 deletions

diff --git a/helix-syntax/src/lib.rs b/helix-syntax/src/lib.rs
index b7331a3a..593fe3de 100644
--- a/helix-syntax/src/lib.rs
+++ b/helix-syntax/src/lib.rs

@@ -1,33 +1,82 @@

use ::ropey::RopeSlice;

-use ::tree_sitter::{Node, Parser, Point, Query, QueryCursor, Range, Tree};

-use slotmap::HopSlotMap;

+use slotmap::{new_key_type, HopSlotMap};

use std::borrow::Cow;

-use std::cell::RefCell;

use std::hash::{Hash, Hasher};

use std::path::Path;

use std::str;

use std::sync::Arc;

-use crate::injections_tree::LayerId;

use crate::parse::LayerUpdateFlags;

pub use crate::config::{read_query, HighlightConfiguration};

-pub use crate::ropey::RopeProvider;

-pub use merge::merge;

+use crate::tree_sitter::{SyntaxTree, SyntaxTreeNode};

pub use pretty_print::pretty_print_tree;

pub use tree_cursor::TreeCursor;

mod config;

pub mod highlighter;

-mod injections_tree;

-mod merge;

+pub mod highlighter2;

mod parse;

mod pretty_print;

-mod ropey;

+mod query_iter;

+pub mod text_object;

mod tree_cursor;

pub mod tree_sitter;

+new_key_type! {

+ /// The default slot map key type.

+ pub struct LayerId;

+/// The maximum number of in-progress matches a TS cursor can consider at once.

+/// This is set to a constant in order to avoid performance problems for medium to large files. Set with `set_match_limit`.

+/// Using such a limit means that we lose valid captures, so there is fundamentally a tradeoff here.

+///

+/// Old tree sitter versions used a limit of 32 by default until this limit was removed in version `0.19.5` (must now be set manually).

+/// However, this causes performance issues for medium to large files.

+/// In helix, this problem caused treesitter motions to take multiple seconds to complete in medium-sized rust files (3k loc).

+///

+/// Neovim also encountered this problem and reintroduced this limit after it was removed upstream

+/// (see <https://github.com/neovim/neovim/issues/14897> and <https://github.com/neovim/neovim/pull/14915>).

+/// The number used here is fundamentally a tradeoff between breaking some obscure edge cases and performance.

+///

+/// Neovim chose 64 for this value somewhat arbitrarily (<https://github.com/neovim/neovim/pull/18397>).

+/// 64 is too low for some languages though. In particular, it breaks some highlighting for record fields in Erlang record definitions.

+/// This number can be increased if new syntax highlight breakages are found, as long as the performance penalty is not too high.

+pub const TREE_SITTER_MATCH_LIMIT: u32 = 256;

+// TODO(perf): replace std::ops::Range<usize> with helix_stdx::Range<u32> once added

+type Range = std::ops::Range<usize>;

+/// The Tree siitter syntax tree for a single language.

+/// This is really multipe nested different syntax trees due to tree sitter

+/// injections. A single syntax tree/parser is called layer. Each layer

+/// is parsed as a single "file" by tree sitter. There can be multiple layers

+/// for the same language. A layer corresponds to one of three things:

+/// * the root layer

+/// * a singular injection limited to a single node in it's parent layer

+/// * Multiple injections (multiple disjoint nodes in parent layer) that are

+/// parsed as tough they are a single uninterrupted file.

+///

+/// An injection always refer to a single node into which another layer is

+/// injected. As injections only correspond to syntax tree nodes injections in

+/// the same layer do not intersect. However, the syntax tree in a an injected

+/// layer can have nodes that intersect with nodes from the parent layer. For

+/// example:

+/// ```

+/// layer2: | Sibling A | Sibling B (layer3) | Sibling C |

+/// layer1: | Sibling A (layer2) | Sibling B | Sibling C (layer2) |

+/// ````

+/// In this case Sibling B really spans across a "GAP" in layer2. While the syntax

+/// node can not be split up by tree sitter directly, we can treat Sibling B as two

+/// seperate injections. That is done while parsing/running the query capture. As

+/// a result the injections from a tree. Note that such other queries must account for

+/// such multi injection nodes.

#[derive(Debug)]

pub struct Syntax {

layers: HopSlotMap<LayerId, LanguageLayer>,

@@ -41,16 +90,20 @@ impl Syntax {

injection_callback: impl Fn(&InjectionLanguageMarker) -> Option<Arc<HighlightConfiguration>>,

) -> Option<Self> {

let root_layer = LanguageLayer {

- tree: None,

+ parse_tree: None,

config,

- depth: 0,

flags: LayerUpdateFlags::empty(),

- ranges: vec![Range {

+ ranges: vec![tree_sitter::Range {

start_byte: 0,

- end_byte: usize::MAX,

- start_point: Point::new(0, 0),

- end_point: Point::new(usize::MAX, usize::MAX),

- }],

+ end_byte: u32::MAX,

+ start_point: tree_sitter::Point { row: 0, col: 0 },

+ end_point: tree_sitter::Point {

+ row: u32::MAX,

+ col: u32::MAX,

+ },

+ }]

+ .into_boxed_slice(),

+ injections: Box::new([]),

parent: None,

};

@@ -70,49 +123,75 @@ impl Syntax {

Some(syntax)

}

- pub fn tree(&self) -> &Tree {

+ pub fn tree(&self) -> &SyntaxTree {

self.layers[self.root].tree()

}

- pub fn tree_for_byte_range(&self, start: usize, end: usize) -> &Tree {

- let mut container_id = self.root;

- for (layer_id, layer) in self.layers.iter() {

- if layer.depth > self.layers[container_id].depth

- && layer.contains_byte_range(start, end)

- {

- container_id = layer_id;

- }

- self.layers[container_id].tree()

+ pub fn tree_for_byte_range(&self, start: usize, end: usize) -> &SyntaxTree {

+ let layer = self.layer_for_byte_range(start, end);

+ self.layers[layer].tree()

}

- pub fn named_descendant_for_byte_range(&self, start: usize, end: usize) -> Option<Node<'_>> {

+ pub fn named_descendant_for_byte_range(

+ &self,

+ start: usize,

+ end: usize,

+ ) -> Option<SyntaxTreeNode<'_>> {

self.tree_for_byte_range(start, end)

.root_node()

.named_descendant_for_byte_range(start, end)

}

- pub fn descendant_for_byte_range(&self, start: usize, end: usize) -> Option<Node<'_>> {

+ pub fn descendant_for_byte_range(

+ &self,

+ start: usize,

+ end: usize,

+ ) -> Option<SyntaxTreeNode<'_>> {

self.tree_for_byte_range(start, end)

.root_node()

.descendant_for_byte_range(start, end)

}

+ pub fn layer_for_byte_range(&self, start: usize, end: usize) -> LayerId {

+ let mut cursor = self.root;

+ loop {

+ let layer = &self.layers[cursor];

+ let Some(start_injection) = layer.injection_at_byte_idx(start) else {

+ break;

+ };

+ let Some(end_injection) = layer.injection_at_byte_idx(end) else {

+ break;

+ };

+ if start_injection.layer == end_injection.layer {

+ cursor = start_injection.layer;

+ } else {

+ break;

+ }

+ cursor

+ }

pub fn walk(&self) -> TreeCursor<'_> {

TreeCursor::new(&self.layers, self.root)

}

+#[derive(Debug, Clone)]

+pub(crate) struct Injection {

+ pub byte_range: Range,

+ pub layer: LayerId,

#[derive(Debug)]

pub struct LanguageLayer {

- // mode

- // grammar

pub config: Arc<HighlightConfiguration>,

- pub(crate) tree: Option<Tree>,

- pub ranges: Vec<Range>,

- pub depth: u32,

+ parse_tree: Option<SyntaxTree>,

+ ranges: Box<[tree_sitter::Range]>,

+ /// a list of **sorted** non-overlapping injection ranges. Note that

+ /// injection ranges are not relative to the start of this layer but the

+ /// start of the root layer

+ injections: Box<[Injection]>,

+ /// internal flags used during parsing to track incremental invalidation

flags: LayerUpdateFlags,

parent: Option<LayerId>,

}

@@ -123,8 +202,8 @@ pub struct LanguageLayer {

/// state.

impl PartialEq for LanguageLayer {

fn eq(&self, other: &Self) -> bool {

- self.depth == other.depth

- && self.config.language == other.config.language

+ self.parent == other.parent

+ && self.config.grammar == other.config.grammar

&& self.ranges == other.ranges

}

@@ -133,165 +212,27 @@ impl PartialEq for LanguageLayer {

/// See its documentation for details.

impl Hash for LanguageLayer {

fn hash<H: Hasher>(&self, state: &mut H) {

- self.depth.hash(state);

- self.config.language.hash(state);

+ self.parent.hash(state);

+ self.config.grammar.hash(state);

self.ranges.hash(state);

}

impl LanguageLayer {

- pub fn tree(&self) -> &Tree {

+ pub fn tree(&self) -> &SyntaxTree {

// TODO: no unwrap

- self.tree.as_ref().unwrap()

- }

- /// Whether the layer contains the given byte range.

- ///

- /// If the layer has multiple ranges (i.e. combined injections), the

- /// given range is considered contained if it is within the start and

- /// end bytes of the first and last ranges **and** if the given range

- /// starts or ends within any of the layer's ranges.

- fn contains_byte_range(&self, start: usize, end: usize) -> bool {

- let layer_start = self

- .ranges

- .first()

- .expect("ranges should not be empty")

- .start_byte;

- let layer_end = self

- .ranges

- .last()

- .expect("ranges should not be empty")

- .end_byte;

- layer_start <= start

- && layer_end >= end

- && self.ranges.iter().any(|range| {

- let byte_range = range.start_byte..range.end_byte;

- byte_range.contains(&start) || byte_range.contains(&end)

- })

- }

-#[derive(Debug, Clone)]

-pub enum InjectionLanguageMarker<'a> {

- Name(Cow<'a, str>),

- Filename(Cow<'a, Path>),

- Shebang(String),

-const SHEBANG: &str = r"#!\s*(?:\S*[/\\](?:env\s+(?:\-\S+\s+)*)?)?([^\s\.\d]+)";

-#[derive(Debug)]

-pub enum CapturedNode<'a> {

- Single(Node<'a>),

- /// Guaranteed to be not empty

- Grouped(Vec<Node<'a>>),

-impl<'a> CapturedNode<'a> {

- pub fn start_byte(&self) -> usize {

- match self {

- Self::Single(n) => n.start_byte(),

- Self::Grouped(ns) => ns[0].start_byte(),

- }

- pub fn end_byte(&self) -> usize {

- match self {

- Self::Single(n) => n.end_byte(),

- Self::Grouped(ns) => ns.last().unwrap().end_byte(),

- }

+ self.parse_tree.as_ref().unwrap()

}

- pub fn byte_range(&self) -> std::ops::Range<usize> {

- self.start_byte()..self.end_byte()

- }

-/// The maximum number of in-progress matches a TS cursor can consider at once.

-/// This is set to a constant in order to avoid performance problems for medium to large files. Set with `set_match_limit`.

-/// Using such a limit means that we lose valid captures, so there is fundamentally a tradeoff here.

-///

-/// Old tree sitter versions used a limit of 32 by default until this limit was removed in version `0.19.5` (must now be set manually).

-/// However, this causes performance issues for medium to large files.

-/// In helix, this problem caused treesitter motions to take multiple seconds to complete in medium-sized rust files (3k loc).

-///

-/// Neovim also encountered this problem and reintroduced this limit after it was removed upstream

-/// (see <https://github.com/neovim/neovim/issues/14897> and <https://github.com/neovim/neovim/pull/14915>).

-/// The number used here is fundamentally a tradeoff between breaking some obscure edge cases and performance.

-///

-/// Neovim chose 64 for this value somewhat arbitrarily (<https://github.com/neovim/neovim/pull/18397>).

-/// 64 is too low for some languages though. In particular, it breaks some highlighting for record fields in Erlang record definitions.

-/// This number can be increased if new syntax highlight breakages are found, as long as the performance penalty is not too high.

-const TREE_SITTER_MATCH_LIMIT: u32 = 256;

-#[derive(Debug)]

-pub struct TextObjectQuery {

- pub query: Query,

-impl TextObjectQuery {

- /// Run the query on the given node and return sub nodes which match given

- /// capture ("function.inside", "class.around", etc).

- ///

- /// Captures may contain multiple nodes by using quantifiers (+, *, etc),

- /// and support for this is partial and could use improvement.

- ///

- /// ```query

- /// (comment)+ @capture

- ///

- /// ; OR

- /// (

- /// (comment)*

- /// .

- /// (function)

- /// ) @capture

- /// ```

- pub fn capture_nodes<'a>(

- &'a self,

- capture_name: &str,

- node: Node<'a>,

- slice: RopeSlice<'a>,

- cursor: &'a mut QueryCursor,

- ) -> Option<impl Iterator<Item = CapturedNode<'a>>> {

- self.capture_nodes_any(&[capture_name], node, slice, cursor)

- }

- /// Find the first capture that exists out of all given `capture_names`

- /// and return sub nodes that match this capture.

- pub fn capture_nodes_any<'a>(

- &'a self,

- capture_names: &[&str],

- node: Node<'a>,

- slice: RopeSlice<'a>,

- cursor: &'a mut QueryCursor,

- ) -> Option<impl Iterator<Item = CapturedNode<'a>>> {

- let capture_idx = capture_names

- .iter()

- .find_map(|cap| self.query.capture_index_for_name(cap))?;

- cursor.set_match_limit(TREE_SITTER_MATCH_LIMIT);

- let nodes = cursor

- .captures(&self.query, node, RopeProvider(slice))

- .filter_map(move |(mat, _)| {

- let nodes: Vec<_> = mat

- .captures

- .iter()

- .filter_map(|cap| (cap.index == capture_idx).then_some(cap.node))

- .collect();

- if nodes.len() > 1 {

- Some(CapturedNode::Grouped(nodes))

- } else {

- nodes.into_iter().map(CapturedNode::Single).next()

- }

- });

- Some(nodes)

+ /// Returns the injection range **within this layers** that contains `idx`.

+ /// This function will not descend into nested injections

+ pub(crate) fn injection_at_byte_idx(&self, idx: usize) -> Option<&Injection> {

+ let i = self

+ .injections

+ .partition_point(|range| range.byte_range.start < idx);

+ self.injections

+ .get(i)

+ .filter(|injection| injection.byte_range.end > idx)

}

@@ -304,42 +245,6 @@ pub enum Error {

Unknown,

}

-#[derive(Clone)]

-enum IncludedChildren {

- None,

- All,

- Unnamed,

-impl Default for IncludedChildren {

- fn default() -> Self {

- Self::None

- }

fn byte_range_to_str(range: std::ops::Range<usize>, source: RopeSlice) -> Cow<str> {

Cow::from(source.byte_slice(range))

}

-struct TsParser {

- parser: ::tree_sitter::Parser,

- pub cursors: Vec<QueryCursor>,

-// could also just use a pool, or a single instance?

-thread_local! {

- static PARSER: RefCell<TsParser> = RefCell::new(TsParser {

- parser: Parser::new(),

- cursors: Vec::new(),

- })

-pub fn with_cursor<T>(f: impl FnOnce(&mut QueryCursor) -> T) -> T {

- PARSER.with(|parser| {

- let mut parser = parser.borrow_mut();

- let mut cursor = parser.cursors.pop().unwrap_or_default();

- let res = f(&mut cursor);

- parser.cursors.push(cursor);

- res

- })