Unnamed repository; edit this file 'description' to name the repository.
Diffstat (limited to 'helix-syntax/src/config.rs')
| -rw-r--r-- | helix-syntax/src/config.rs | 346 |
1 files changed, 346 insertions, 0 deletions
diff --git a/helix-syntax/src/config.rs b/helix-syntax/src/config.rs new file mode 100644 index 00000000..35774fdd --- /dev/null +++ b/helix-syntax/src/config.rs @@ -0,0 +1,346 @@ +use std::borrow::Cow; +use std::path::Path; +use std::sync::Arc; + +use crate::tree_sitter::query::{Capture, Pattern, QueryStr, UserPredicate}; +use crate::tree_sitter::{query, Grammar, Query, QueryMatch, SyntaxTreeNode}; +use arc_swap::ArcSwap; +use helix_stdx::rope::{self, RopeSliceExt}; +use once_cell::sync::Lazy; +use regex::Regex; +use ropey::RopeSlice; + +use crate::byte_range_to_str; +use crate::highlighter::Highlight; + +/// Contains the data needed to highlight code written in a particular language. +/// +/// This struct is immutable and can be shared between threads. +#[derive(Debug)] +pub struct HighlightConfiguration { + pub grammar: Grammar, + pub query: Query, + pub(crate) injections_query: Query, + pub(crate) combined_injections_patterns: Vec<Pattern>, + first_highlights_pattern: Pattern, + pub(crate) highlight_indices: ArcSwap<Vec<Highlight>>, + pub(crate) non_local_variable_patterns: Vec<bool>, + pub(crate) injection_content_capture: Option<Capture>, + pub(crate) injection_language_capture: Option<Capture>, + pub(crate) injection_filename_capture: Option<Capture>, + pub(crate) injection_shebang_capture: Option<Capture>, + pub(crate) local_scope_capture: Option<Capture>, + pub(crate) local_def_capture: Option<Capture>, + pub(crate) local_def_value_capture: Option<Capture>, + pub(crate) local_ref_capture: Option<Capture>, +} + +impl HighlightConfiguration { + /// Creates a `HighlightConfiguration` for a given `Grammar` and set of highlighting + /// queries. + /// + /// # Parameters + /// + /// * `language` - The Tree-sitter `Grammar` that should be used for parsing. + /// * `highlights_query` - A string containing tree patterns for syntax highlighting. This + /// should be non-empty, otherwise no syntax highlights will be added. + /// * `injections_query` - A string containing tree patterns for injecting other languages + /// into the document. This can be empty if no injections are desired. + /// * `locals_query` - A string containing tree patterns for tracking local variable + /// definitions and references. This can be empty if local variable tracking is not needed. + /// + /// Returns a `HighlightConfiguration` that can then be used with the `highlight` method. + pub fn new( + grammar: Grammar, + path: impl AsRef<Path>, + highlights_query: &str, + injection_query: &str, + locals_query: &str, + ) -> Result<Self, query::ParseError> { + // Concatenate the query strings, keeping track of the start offset of each section. + let mut query_source = String::new(); + query_source.push_str(locals_query); + let highlights_query_offset = query_source.len(); + query_source.push_str(highlights_query); + + let mut non_local_variable_patterns = Vec::with_capacity(32); + // Construct a single query by concatenating the three query strings, but record the + // range of pattern indices that belong to each individual string. + let query = Query::new(grammar, &query_source, path, |pattern, predicate| { + match predicate { + UserPredicate::IsPropertySet { + negate: true, + key: "local", + val: None, + } => { + if non_local_variable_patterns.len() < pattern.idx() { + non_local_variable_patterns.resize(pattern.idx(), false) + } + non_local_variable_patterns[pattern.idx()] = true; + } + predicate => { + return Err(format!("unsupported predicate {predicate}").into()); + } + } + Ok(()) + })?; + + let mut combined_injections_patterns = Vec::new(); + let injections_query = Query::new(grammar, injection_query, path, |pattern, predicate| { + match predicate { + UserPredicate::SetProperty { + key: "injection.combined", + val: None, + } => combined_injections_patterns.push(pattern), + predicate => { + return Err(format!("unsupported predicate {predicate}").into()); + } + } + Ok(()) + })?; + + let first_highlights_pattern = query + .patterns() + .find(|pattern| query.start_byte_for_pattern(*pattern) >= highlights_query_offset) + .unwrap_or(Pattern::SENTINEL); + + let injection_content_capture = query.get_capture("injection.content"); + let injection_language_capture = query.get_capture("injection.language"); + let injection_filename_capture = query.get_capture("injection.filename"); + let injection_shebang_capture = query.get_capture("injection.shebang"); + let local_def_capture = query.get_capture("local.definition"); + let local_def_value_capture = query.get_capture("local.definition-value"); + let local_ref_capture = query.get_capture("local.reference"); + let local_scope_capture = query.get_capture("local.scope"); + + let highlight_indices = + ArcSwap::from_pointee(vec![Highlight::NONE; query.num_captures() as usize]); + Ok(Self { + grammar, + query, + injections_query, + combined_injections_patterns, + first_highlights_pattern, + highlight_indices, + non_local_variable_patterns, + injection_content_capture, + injection_language_capture, + injection_filename_capture, + injection_shebang_capture, + local_scope_capture, + local_def_capture, + local_def_value_capture, + local_ref_capture, + }) + } + + /// Set the list of recognized highlight names. + /// + /// Tree-sitter syntax-highlighting queries specify highlights in the form of dot-separated + /// highlight names like `punctuation.bracket` and `function.method.builtin`. Consumers of + /// these queries can choose to recognize highlights with different levels of specificity. + /// For example, the string `function.builtin` will match against `function.builtin.constructor` + /// but will not match `function.method.builtin` and `function.method`. + /// + /// When highlighting, results are returned as `Highlight` values, which contain the index + /// of the matched highlight this list of highlight names. + pub fn configure(&self, recognized_names: &[String]) { + let mut capture_parts = Vec::new(); + let indices: Vec<_> = self + .query + .captures() + .map(move |(_, capture_name)| { + capture_parts.clear(); + capture_parts.extend(capture_name.split('.')); + + let mut best_index = u32::MAX; + let mut best_match_len = 0; + for (i, recognized_name) in recognized_names.iter().enumerate() { + let mut len = 0; + let mut matches = true; + for (i, part) in recognized_name.split('.').enumerate() { + match capture_parts.get(i) { + Some(capture_part) if *capture_part == part => len += 1, + _ => { + matches = false; + break; + } + } + } + if matches && len > best_match_len { + best_index = i as u32; + best_match_len = len; + } + } + Highlight(best_index) + }) + .collect(); + + self.highlight_indices.store(Arc::new(indices)); + } + + fn injection_pair<'a>( + &self, + query_match: &QueryMatch<'a, 'a>, + source: RopeSlice<'a>, + ) -> ( + Option<InjectionLanguageMarker<'a>>, + Option<SyntaxTreeNode<'a>>, + ) { + let mut injection_capture = None; + let mut content_node = None; + + for matched_node in query_match.matched_nodes() { + let capture = Some(matched_node.capture); + if capture == self.injection_language_capture { + let name = byte_range_to_str(matched_node.syntax_node.byte_range(), source); + injection_capture = Some(InjectionLanguageMarker::Name(name)); + } else if capture == self.injection_filename_capture { + let name = byte_range_to_str(matched_node.syntax_node.byte_range(), source); + let path = Path::new(name.as_ref()).to_path_buf(); + injection_capture = Some(InjectionLanguageMarker::Filename(path.into())); + } else if capture == self.injection_shebang_capture { + let node_slice = source.byte_slice(matched_node.syntax_node.byte_range()); + + // some languages allow space and newlines before the actual string content + // so a shebang could be on either the first or second line + let lines = if let Ok(end) = node_slice.try_line_to_byte(2) { + node_slice.byte_slice(..end) + } else { + node_slice + }; + + injection_capture = SHEBANG_REGEX + .captures_iter(lines.regex_input()) + .map(|cap| { + let cap = lines.byte_slice(cap.get_group(1).unwrap().range()); + InjectionLanguageMarker::Shebang(cap.into()) + }) + .next() + } else if capture == self.injection_content_capture { + content_node = Some(matched_node.syntax_node.clone()); + } + } + (injection_capture, content_node) + } + + pub(super) fn injection_for_match<'a>( + &self, + query: &'a Query, + query_match: &QueryMatch<'a, 'a>, + source: RopeSlice<'a>, + ) -> ( + Option<InjectionLanguageMarker<'a>>, + Option<SyntaxTreeNode<'a>>, + IncludedChildren, + ) { + let (mut injection_capture, content_node) = self.injection_pair(query_match, source); + + let mut included_children = IncludedChildren::default(); + for prop in query.property_settings(query_match.pattern_index) { + match prop.key.as_ref() { + // In addition to specifying the language name via the text of a + // captured node, it can also be hard-coded via a `#set!` predicate + // that sets the injection.language key. + "injection.language" if injection_capture.is_none() => { + injection_capture = prop + .value + .as_ref() + .map(|s| InjectionLanguageMarker::Name(s.as_ref().into())); + } + + // By default, injections do not include the *children* of an + // `injection.content` node - only the ranges that belong to the + // node itself. This can be changed using a `#set!` predicate that + // sets the `injection.include-children` key. + "injection.include-children" => included_children = IncludedChildren::All, + + // Some queries might only exclude named children but include unnamed + // children in their `injection.content` node. This can be enabled using + // a `#set!` predicate that sets the `injection.include-unnamed-children` key. + "injection.include-unnamed-children" => { + included_children = IncludedChildren::Unnamed + } + _ => {} + } + } + + (injection_capture, content_node, included_children) + } + + // pub fn load_query( + // &self, + // language: &str, + // filename: &str, + // read_query_text: impl FnMut(&str, &str) -> String, + // ) -> Result<Option<Query>, QueryError> { + // let query_text = read_query(language, filename, read_query_text); + // if query_text.is_empty() { + // return Ok(None); + // } + + // Query::new(&self.grammar, &query_text, ).map(Some) + // } +} + +/// reads a query by invoking `read_query_text`, handeles any `inherits` directives +pub fn read_query( + language: &str, + filename: &str, + mut read_query_text: impl FnMut(&str, &str) -> String, +) -> String { + fn read_query_impl( + language: &str, + filename: &str, + read_query_text: &mut impl FnMut(&str, &str) -> String, + ) -> String { + static INHERITS_REGEX: Lazy<Regex> = + Lazy::new(|| Regex::new(r";+\s*inherits\s*:?\s*([a-z_,()-]+)\s*").unwrap()); + + let query = read_query_text(language, filename); + + // replaces all "; inherits <language>(,<language>)*" with the queries of the given language(s) + INHERITS_REGEX + .replace_all(&query, |captures: ®ex::Captures| { + captures[1] + .split(',') + .map(|language| { + format!( + "\n{}\n", + read_query_impl(language, filename, &mut *read_query_text) + ) + }) + .collect::<String>() + }) + .to_string() + } + read_query_impl(language, filename, &mut read_query_text) +} + +const SHEBANG: &str = r"#!\s*(?:\S*[/\\](?:env\s+(?:\-\S+\s+)*)?)?([^\s\.\d]+)"; +static SHEBANG_REGEX: Lazy<rope::Regex> = Lazy::new(|| rope::Regex::new(SHEBANG).unwrap()); + +struct InjectionSettings { + include_children: IncludedChildren, + language: Option<QueryStr>, +} + +#[derive(Debug, Clone)] +pub enum InjectionLanguageMarker<'a> { + Name(Cow<'a, str>), + Filename(Cow<'a, Path>), + Shebang(String), +} + +#[derive(Clone)] +enum IncludedChildren { + None, + All, + Unnamed, +} + +impl Default for IncludedChildren { + fn default() -> Self { + Self::None + } +} |