Unnamed repository; edit this file 'description' to name the repository.
Diffstat (limited to 'helix-syntax/src/lib.rs')
-rw-r--r--helix-syntax/src/lib.rs342
1 files changed, 342 insertions, 0 deletions
diff --git a/helix-syntax/src/lib.rs b/helix-syntax/src/lib.rs
new file mode 100644
index 00000000..04a2d27d
--- /dev/null
+++ b/helix-syntax/src/lib.rs
@@ -0,0 +1,342 @@
+use ::ropey::RopeSlice;
+use slotmap::{DefaultKey as LayerId, HopSlotMap};
+use tree_sitter::{Node, Parser, Point, Query, QueryCursor, Range, Tree};
+
+use std::borrow::Cow;
+use std::cell::RefCell;
+use std::hash::{Hash, Hasher};
+use std::path::Path;
+use std::str;
+use std::sync::Arc;
+
+use crate::parse::LayerUpdateFlags;
+
+pub use crate::config::{read_query, HighlightConfiguration};
+pub use crate::ropey::RopeProvider;
+pub use merge::merge;
+pub use pretty_print::pretty_print_tree;
+pub use tree_cursor::TreeCursor;
+
+mod config;
+pub mod highlighter;
+mod merge;
+mod parse;
+mod pretty_print;
+mod ropey;
+mod tree_cursor;
+
+/// Syntax state of a single document: a collection of language layers with
+/// one designated root layer.
+#[derive(Debug)]
+pub struct Syntax {
+    /// Every language layer of the document, addressed by its `LayerId`.
+    layers: HopSlotMap<LayerId, LanguageLayer>,
+    /// Key of the root layer inside `layers`.
+    root: LayerId,
+}
+
+impl Syntax {
+    /// Creates the syntax state for `source`, using `config` for the root layer.
+    ///
+    /// The root layer starts without a tree, at depth 0, without a parent, and
+    /// with a single range running from byte 0 to `usize::MAX`. The state is
+    /// then brought up to date through `update`, which is handed `source`, an
+    /// empty `Vec` and `injection_callback`.
+    ///
+    /// Returns `None` (after logging the error) when that update fails.
+    pub fn new(
+        source: RopeSlice,
+        config: Arc<HighlightConfiguration>,
+        injection_callback: impl Fn(&InjectionLanguageMarker) -> Option<Arc<HighlightConfiguration>>,
+    ) -> Option<Self> {
+        // The root layer is not limited to any part of the document.
+        let everything = Range {
+            start_byte: 0,
+            end_byte: usize::MAX,
+            start_point: Point::new(0, 0),
+            end_point: Point::new(usize::MAX, usize::MAX),
+        };
+
+        // track scope_descriptor: a Vec of scopes for item in tree
+
+        let mut layers = HopSlotMap::default();
+        let root = layers.insert(LanguageLayer {
+            tree: None,
+            config,
+            depth: 0,
+            flags: LayerUpdateFlags::empty(),
+            ranges: vec![everything],
+            parent: None,
+        });
+
+        let mut syntax = Self { root, layers };
+
+        let res = syntax.update(source, Vec::new(), injection_callback);
+
+        if res.is_ok() {
+            Some(syntax)
+        } else {
+            log::error!("TS parser failed, disabling TS for the current buffer: {res:?}");
+            None
+        }
+    }
+
+    /// Returns the tree of the root layer.
+    ///
+    /// Panics if the root layer has no tree (see [`LanguageLayer::tree`]).
+    pub fn tree(&self) -> &Tree {
+        let root_layer = &self.layers[self.root];
+        root_layer.tree()
+    }
+
+    /// Returns the tree of the deepest layer that contains the byte range
+    /// `start..end`, falling back to the root layer's tree.
+    ///
+    /// Layers are visited in slot map iteration order; a layer only replaces
+    /// the current candidate when its depth is strictly greater.
+    pub fn tree_for_byte_range(&self, start: usize, end: usize) -> &Tree {
+        let deepest = self
+            .layers
+            .iter()
+            .fold(self.root, |candidate, (layer_id, layer)| {
+                let is_deeper = layer.depth > self.layers[candidate].depth;
+                if is_deeper && layer.contains_byte_range(start, end) {
+                    layer_id
+                } else {
+                    candidate
+                }
+            });
+
+        self.layers[deepest].tree()
+    }
+
+    /// Looks up the named descendant for `start..end` in the tree chosen by
+    /// [`Syntax::tree_for_byte_range`].
+    pub fn named_descendant_for_byte_range(&self, start: usize, end: usize) -> Option<Node<'_>> {
+        let tree = self.tree_for_byte_range(start, end);
+        tree.root_node().named_descendant_for_byte_range(start, end)
+    }
+
+    /// Looks up the descendant (named or not) for `start..end` in the tree
+    /// chosen by [`Syntax::tree_for_byte_range`].
+    pub fn descendant_for_byte_range(&self, start: usize, end: usize) -> Option<Node<'_>> {
+        let tree = self.tree_for_byte_range(start, end);
+        tree.root_node().descendant_for_byte_range(start, end)
+    }
+
+    /// Creates a [`TreeCursor`] over this syntax's layers, starting from the
+    /// root layer.
+    pub fn walk(&self) -> TreeCursor<'_> {
+        TreeCursor::new(&self.layers, self.root)
+    }
+}
+
+/// One language layer of a document: a highlight configuration together with
+/// the ranges it applies to and, once parsed, its syntax tree.
+#[derive(Debug)]
+pub struct LanguageLayer {
+    // mode
+    // grammar
+    /// Highlight configuration of this layer's language.
+    pub config: Arc<HighlightConfiguration>,
+    /// Parsed tree of the layer; `None` until the layer has been parsed.
+    pub(crate) tree: Option<Tree>,
+    /// Document ranges covered by this layer. A layer may have several ranges
+    /// (i.e. combined injections).
+    pub ranges: Vec<Range>,
+    /// Nesting depth of the layer; the root layer is created with depth 0.
+    pub depth: u32,
+    /// Update bookkeeping flags (initially empty for the root layer).
+    flags: LayerUpdateFlags,
+    /// Key of the parent layer; `None` for the root layer.
+    parent: Option<LayerId>,
+}
+
+/// Two layers compare equal when they are theoretically identical, meaning
+/// they highlight the same text ranges with the same language at the same
+/// depth.
+///
+/// The internal tree-sitter state (the parsed tree) is deliberately not
+/// compared.
+impl PartialEq for LanguageLayer {
+    fn eq(&self, other: &Self) -> bool {
+        if self.depth != other.depth {
+            return false;
+        }
+        if self.config.language != other.config.language {
+            return false;
+        }
+        self.ranges == other.ranges
+    }
+}
+
+/// Hashes exactly the fields compared by the `PartialEq` implementation for
+/// `LanguageLayer` (depth, language and ranges), in that order.
+impl Hash for LanguageLayer {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        let Self {
+            depth,
+            config,
+            ranges,
+            ..
+        } = self;
+
+        depth.hash(state);
+        config.language.hash(state);
+        ranges.hash(state);
+    }
+}
+
+impl LanguageLayer {
+    /// Returns the parsed tree of this layer.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the layer has no tree yet.
+    pub fn tree(&self) -> &Tree {
+        // TODO: no unwrap
+        let tree = self.tree.as_ref();
+        tree.unwrap()
+    }
+
+    /// Whether the layer contains the byte range `start..end`.
+    ///
+    /// For a layer with several ranges (i.e. combined injections) the given
+    /// range counts as contained when both hold:
+    ///
+    /// * it lies between the start byte of the first range and the end byte
+    ///   of the last range, **and**
+    /// * its start or its end falls inside one of the layer's ranges.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the layer has no ranges.
+    fn contains_byte_range(&self, start: usize, end: usize) -> bool {
+        let first = self.ranges.first().expect("ranges should not be empty");
+        let last = self.ranges.last().expect("ranges should not be empty");
+
+        // Reject anything reaching outside the overall span of the layer.
+        if start < first.start_byte || end > last.end_byte {
+            return false;
+        }
+
+        // Each range is half-open: its end byte is not part of it.
+        let within = |range: &Range, byte: usize| range.start_byte <= byte && byte < range.end_byte;
+
+        self.ranges
+            .iter()
+            .any(|range| within(range, start) || within(range, end))
+    }
+}
+
+/// Describes which language an injection asks for. A reference to it is
+/// handed to the injection callback (see `Syntax::new`), which answers with
+/// the matching highlight configuration, if any.
+#[derive(Debug, Clone)]
+pub enum InjectionLanguageMarker<'a> {
+    /// The language is identified by a name.
+    Name(Cow<'a, str>),
+    /// The language is identified by a file path.
+    Filename(Cow<'a, Path>),
+    /// The language is identified by a shebang.
+    Shebang(String),
+}
+
+/// Regular expression source for shebang lines.
+///
+/// After `#!` and optional whitespace it skips an optional path prefix ending
+/// in `/` or `\` (optionally followed by `env` and its `-flag` arguments).
+/// Capture group 1 is the following run of characters that are neither
+/// whitespace, `.` nor digits.
+const SHEBANG: &str = r"#!\s*(?:\S*[/\\](?:env\s+(?:\-\S+\s+)*)?)?([^\s\.\d]+)";
+
+/// The node or nodes a query match captured under one capture name.
+#[derive(Debug)]
+pub enum CapturedNode<'a> {
+    /// Exactly one node was captured.
+    Single(Node<'a>),
+    /// Several nodes were captured together.
+    ///
+    /// Guaranteed to be not empty
+    Grouped(Vec<Node<'a>>),
+}
+
+impl<'a> CapturedNode<'a> {
+    /// Start byte of the capture: the start of the single node, or of the
+    /// first node of a group.
+    pub fn start_byte(&self) -> usize {
+        let first = match self {
+            Self::Single(node) => node,
+            Self::Grouped(nodes) => &nodes[0],
+        };
+        first.start_byte()
+    }
+
+    /// End byte of the capture: the end of the single node, or of the last
+    /// node of a group.
+    pub fn end_byte(&self) -> usize {
+        let last = match self {
+            Self::Single(node) => node,
+            Self::Grouped(nodes) => nodes.last().unwrap(),
+        };
+        last.end_byte()
+    }
+
+    /// Byte range running from [`Self::start_byte`] to [`Self::end_byte`].
+    pub fn byte_range(&self) -> std::ops::Range<usize> {
+        let start = self.start_byte();
+        let end = self.end_byte();
+        start..end
+    }
+}
+
+/// Upper bound on the number of in-progress matches a tree-sitter query
+/// cursor tracks at once; applied through `set_match_limit`.
+///
+/// The bound is fixed to avoid performance problems in medium to large files.
+/// It also means that valid captures can be lost, so the value is
+/// fundamentally a tradeoff between performance and breaking some obscure
+/// edge cases.
+///
+/// Background:
+///
+/// * Old tree-sitter versions used a default limit of 32. The default was
+///   removed in version `0.19.5`, so the limit must now be set manually.
+/// * Without a limit, helix's tree-sitter motions took multiple seconds to
+///   complete in medium-sized rust files (3k loc).
+/// * Neovim ran into the same problem and reintroduced the limit after it was
+///   removed upstream (see <https://github.com/neovim/neovim/issues/14897>
+///   and <https://github.com/neovim/neovim/pull/14915>).
+/// * Neovim chose 64 somewhat arbitrarily
+///   (<https://github.com/neovim/neovim/pull/18397>). 64 is too low for some
+///   languages though: in particular it breaks some highlighting for record
+///   fields in Erlang record definitions.
+///
+/// The number can be increased if new syntax highlight breakages are found,
+/// as long as the performance penalty is not too high.
+const TREE_SITTER_MATCH_LIMIT: u32 = 256;
+
+/// Wrapper around the tree-sitter query used to look up text object captures
+/// ("function.inside", "class.around", etc).
+#[derive(Debug)]
+pub struct TextObjectQuery {
+    /// The underlying tree-sitter query.
+    pub query: Query,
+}
+
+impl TextObjectQuery {
+    /// Runs the query against `node` and yields the sub nodes captured under
+    /// `capture_name` ("function.inside", "class.around", etc).
+    ///
+    /// A capture may cover several nodes when the query uses quantifiers
+    /// (+, *, etc); support for this is partial and could use improvement.
+    ///
+    /// ```query
+    /// (comment)+ @capture
+    ///
+    /// ; OR
+    /// (
+    ///   (comment)*
+    ///   .
+    ///   (function)
+    /// ) @capture
+    /// ```
+    ///
+    /// Returns `None` when the query has no capture called `capture_name`.
+    pub fn capture_nodes<'a>(
+        &'a self,
+        capture_name: &str,
+        node: Node<'a>,
+        slice: RopeSlice<'a>,
+        cursor: &'a mut QueryCursor,
+    ) -> Option<impl Iterator<Item = CapturedNode<'a>>> {
+        let names = [capture_name];
+        self.capture_nodes_any(&names, node, slice, cursor)
+    }
+
+    /// Picks the first entry of `capture_names` that exists in the query and
+    /// yields the sub nodes captured under it.
+    ///
+    /// Returns `None` when none of the names is a capture of the query. The
+    /// cursor's match limit is set to `TREE_SITTER_MATCH_LIMIT` before the
+    /// query runs.
+    pub fn capture_nodes_any<'a>(
+        &'a self,
+        capture_names: &[&str],
+        node: Node<'a>,
+        slice: RopeSlice<'a>,
+        cursor: &'a mut QueryCursor,
+    ) -> Option<impl Iterator<Item = CapturedNode<'a>>> {
+        let wanted = capture_names
+            .iter()
+            .find_map(|name| self.query.capture_index_for_name(name))?;
+
+        cursor.set_match_limit(TREE_SITTER_MATCH_LIMIT);
+
+        let captures = cursor.captures(&self.query, node, RopeProvider(slice));
+
+        Some(captures.filter_map(move |(mat, _)| {
+            // Keep only the nodes of this match that belong to the wanted capture.
+            let mut hits: Vec<_> = mat
+                .captures
+                .iter()
+                .filter(|cap| cap.index == wanted)
+                .map(|cap| cap.node)
+                .collect();
+
+            match hits.len() {
+                0 => None,
+                1 => hits.pop().map(CapturedNode::Single),
+                _ => Some(CapturedNode::Grouped(hits)),
+            }
+        }))
+    }
+}
+
+/// The reason why syntax highlighting failed.
+///
+/// The variants carry no payload; the variant itself is the whole
+/// description of the failure.
+#[derive(Debug, PartialEq, Eq)]
+pub enum Error {
+    Cancelled,
+    InvalidLanguage,
+    InvalidRanges,
+    Unknown,
+}
+
+/// Selects which children are included: none of them, all of them, or only
+/// the unnamed ones.
+///
+/// The default is [`IncludedChildren::None`]. `Default` is derived through
+/// the `#[default]` attribute rather than written by hand.
+#[derive(Clone, Default)]
+enum IncludedChildren {
+    /// No children are included (the default).
+    #[default]
+    None,
+    /// All children are included.
+    All,
+    /// Only unnamed children are included.
+    Unnamed,
+}
+
+/// Returns the text of `source` within the byte `range` as a `Cow<str>`.
+fn byte_range_to_str(range: std::ops::Range<usize>, source: RopeSlice) -> Cow<str> {
+    let text = source.byte_slice(range);
+    text.into()
+}
+
+/// Per-thread tree-sitter state: a parser plus a pool of query cursors that
+/// are reused between calls (see `with_cursor`).
+struct TsParser {
+    /// The tree-sitter parser instance.
+    parser: tree_sitter::Parser,
+    /// Idle query cursors available for reuse.
+    pub cursors: Vec<QueryCursor>,
+}
+
+// could also just use a pool, or a single instance?
+thread_local! {
+    /// Thread-local parser state; starts with a fresh parser and an empty
+    /// cursor pool.
+    static PARSER: RefCell<TsParser> = RefCell::new(TsParser {
+        cursors: Vec::new(),
+        parser: Parser::new(),
+    });
+}
+
+/// Runs `f` with a query cursor taken from the thread-local pool and returns
+/// whatever `f` returns.
+///
+/// A pooled cursor is reused when one is available, otherwise a new one is
+/// created. The cursor is put back into the pool once `f` has returned.
+///
+/// The thread-local `RefCell` is only borrowed while the cursor is taken out
+/// and while it is put back, never while `f` runs. `f` may therefore call
+/// `with_cursor` again (each nested call gets its own cursor) without hitting
+/// an "already borrowed" panic.
+pub fn with_cursor<T>(f: impl FnOnce(&mut QueryCursor) -> T) -> T {
+    // Short borrow #1: check a cursor out of the pool.
+    let mut cursor = PARSER
+        .with(|parser| parser.borrow_mut().cursors.pop())
+        .unwrap_or_else(QueryCursor::new);
+
+    let res = f(&mut cursor);
+
+    // Short borrow #2: hand the cursor back for later reuse.
+    PARSER.with(|parser| parser.borrow_mut().cursors.push(cursor));
+    res
+}