use std::borrow::Cow; use std::path::Path; use std::sync::Arc; use crate::tree_sitter::query::{Capture, Pattern, QueryStr, UserPredicate}; use crate::tree_sitter::{query, Grammar, Query, QueryMatch, SyntaxTreeNode}; use arc_swap::ArcSwap; use helix_stdx::rope::{self, RopeSliceExt}; use once_cell::sync::Lazy; use regex::Regex; use ropey::RopeSlice; use crate::byte_range_to_str; use crate::highlighter::Highlight; /// Contains the data needed to highlight code written in a particular language. /// /// This struct is immutable and can be shared between threads. #[derive(Debug)] pub struct HighlightConfiguration { pub grammar: Grammar, pub query: Query, pub(crate) injections_query: Query, pub(crate) combined_injections_patterns: Vec, first_highlights_pattern: Pattern, pub(crate) highlight_indices: ArcSwap>, pub(crate) non_local_variable_patterns: Vec, pub(crate) injection_content_capture: Option, pub(crate) injection_language_capture: Option, pub(crate) injection_filename_capture: Option, pub(crate) injection_shebang_capture: Option, pub(crate) local_scope_capture: Option, pub(crate) local_def_capture: Option, pub(crate) local_def_value_capture: Option, pub(crate) local_ref_capture: Option, } impl HighlightConfiguration { /// Creates a `HighlightConfiguration` for a given `Grammar` and set of highlighting /// queries. /// /// # Parameters /// /// * `language` - The Tree-sitter `Grammar` that should be used for parsing. /// * `highlights_query` - A string containing tree patterns for syntax highlighting. This /// should be non-empty, otherwise no syntax highlights will be added. /// * `injections_query` - A string containing tree patterns for injecting other languages /// into the document. This can be empty if no injections are desired. /// * `locals_query` - A string containing tree patterns for tracking local variable /// definitions and references. This can be empty if local variable tracking is not needed. /// /// Returns a `HighlightConfiguration` that can then be used with the `highlight` method. pub fn new( grammar: Grammar, path: impl AsRef, highlights_query: &str, injection_query: &str, locals_query: &str, ) -> Result { // Concatenate the query strings, keeping track of the start offset of each section. let mut query_source = String::new(); query_source.push_str(locals_query); let highlights_query_offset = query_source.len(); query_source.push_str(highlights_query); let mut non_local_variable_patterns = Vec::with_capacity(32); // Construct a single query by concatenating the three query strings, but record the // range of pattern indices that belong to each individual string. let query = Query::new(grammar, &query_source, path, |pattern, predicate| { match predicate { UserPredicate::IsPropertySet { negate: true, key: "local", val: None, } => { if non_local_variable_patterns.len() < pattern.idx() { non_local_variable_patterns.resize(pattern.idx(), false) } non_local_variable_patterns[pattern.idx()] = true; } predicate => { return Err(format!("unsupported predicate {predicate}").into()); } } Ok(()) })?; let mut combined_injections_patterns = Vec::new(); let injections_query = Query::new(grammar, injection_query, path, |pattern, predicate| { match predicate { UserPredicate::SetProperty { key: "injection.combined", val: None, } => combined_injections_patterns.push(pattern), predicate => { return Err(format!("unsupported predicate {predicate}").into()); } } Ok(()) })?; let first_highlights_pattern = query .patterns() .find(|pattern| query.start_byte_for_pattern(*pattern) >= highlights_query_offset) .unwrap_or(Pattern::SENTINEL); let injection_content_capture = query.get_capture("injection.content"); let injection_language_capture = query.get_capture("injection.language"); let injection_filename_capture = query.get_capture("injection.filename"); let injection_shebang_capture = query.get_capture("injection.shebang"); let local_def_capture = query.get_capture("local.definition"); let local_def_value_capture = query.get_capture("local.definition-value"); let local_ref_capture = query.get_capture("local.reference"); let local_scope_capture = query.get_capture("local.scope"); let highlight_indices = ArcSwap::from_pointee(vec![Highlight::NONE; query.num_captures() as usize]); Ok(Self { grammar, query, injections_query, combined_injections_patterns, first_highlights_pattern, highlight_indices, non_local_variable_patterns, injection_content_capture, injection_language_capture, injection_filename_capture, injection_shebang_capture, local_scope_capture, local_def_capture, local_def_value_capture, local_ref_capture, }) } /// Set the list of recognized highlight names. /// /// Tree-sitter syntax-highlighting queries specify highlights in the form of dot-separated /// highlight names like `punctuation.bracket` and `function.method.builtin`. Consumers of /// these queries can choose to recognize highlights with different levels of specificity. /// For example, the string `function.builtin` will match against `function.builtin.constructor` /// but will not match `function.method.builtin` and `function.method`. /// /// When highlighting, results are returned as `Highlight` values, which contain the index /// of the matched highlight this list of highlight names. pub fn configure(&self, recognized_names: &[String]) { let mut capture_parts = Vec::new(); let indices: Vec<_> = self .query .captures() .map(move |(_, capture_name)| { capture_parts.clear(); capture_parts.extend(capture_name.split('.')); let mut best_index = u32::MAX; let mut best_match_len = 0; for (i, recognized_name) in recognized_names.iter().enumerate() { let mut len = 0; let mut matches = true; for (i, part) in recognized_name.split('.').enumerate() { match capture_parts.get(i) { Some(capture_part) if *capture_part == part => len += 1, _ => { matches = false; break; } } } if matches && len > best_match_len { best_index = i as u32; best_match_len = len; } } Highlight(best_index) }) .collect(); self.highlight_indices.store(Arc::new(indices)); } fn injection_pair<'a>( &self, query_match: &QueryMatch<'a, 'a>, source: RopeSlice<'a>, ) -> ( Option>, Option>, ) { let mut injection_capture = None; let mut content_node = None; for matched_node in query_match.matched_nodes() { let capture = Some(matched_node.capture); if capture == self.injection_language_capture { let name = byte_range_to_str(matched_node.syntax_node.byte_range(), source); injection_capture = Some(InjectionLanguageMarker::Name(name)); } else if capture == self.injection_filename_capture { let name = byte_range_to_str(matched_node.syntax_node.byte_range(), source); let path = Path::new(name.as_ref()).to_path_buf(); injection_capture = Some(InjectionLanguageMarker::Filename(path.into())); } else if capture == self.injection_shebang_capture { let node_slice = source.byte_slice(matched_node.syntax_node.byte_range()); // some languages allow space and newlines before the actual string content // so a shebang could be on either the first or second line let lines = if let Ok(end) = node_slice.try_line_to_byte(2) { node_slice.byte_slice(..end) } else { node_slice }; injection_capture = SHEBANG_REGEX .captures_iter(lines.regex_input()) .map(|cap| { let cap = lines.byte_slice(cap.get_group(1).unwrap().range()); InjectionLanguageMarker::Shebang(cap.into()) }) .next() } else if capture == self.injection_content_capture { content_node = Some(matched_node.syntax_node.clone()); } } (injection_capture, content_node) } pub(super) fn injection_for_match<'a>( &self, query: &'a Query, query_match: &QueryMatch<'a, 'a>, source: RopeSlice<'a>, ) -> ( Option>, Option>, IncludedChildren, ) { let (mut injection_capture, content_node) = self.injection_pair(query_match, source); let mut included_children = IncludedChildren::default(); for prop in query.property_settings(query_match.pattern_index) { match prop.key.as_ref() { // In addition to specifying the language name via the text of a // captured node, it can also be hard-coded via a `#set!` predicate // that sets the injection.language key. "injection.language" if injection_capture.is_none() => { injection_capture = prop .value .as_ref() .map(|s| InjectionLanguageMarker::Name(s.as_ref().into())); } // By default, injections do not include the *children* of an // `injection.content` node - only the ranges that belong to the // node itself. This can be changed using a `#set!` predicate that // sets the `injection.include-children` key. "injection.include-children" => included_children = IncludedChildren::All, // Some queries might only exclude named children but include unnamed // children in their `injection.content` node. This can be enabled using // a `#set!` predicate that sets the `injection.include-unnamed-children` key. "injection.include-unnamed-children" => { included_children = IncludedChildren::Unnamed } _ => {} } } (injection_capture, content_node, included_children) } // pub fn load_query( // &self, // language: &str, // filename: &str, // read_query_text: impl FnMut(&str, &str) -> String, // ) -> Result, QueryError> { // let query_text = read_query(language, filename, read_query_text); // if query_text.is_empty() { // return Ok(None); // } // Query::new(&self.grammar, &query_text, ).map(Some) // } } /// reads a query by invoking `read_query_text`, handeles any `inherits` directives pub fn read_query( language: &str, filename: &str, mut read_query_text: impl FnMut(&str, &str) -> String, ) -> String { fn read_query_impl( language: &str, filename: &str, read_query_text: &mut impl FnMut(&str, &str) -> String, ) -> String { static INHERITS_REGEX: Lazy = Lazy::new(|| Regex::new(r";+\s*inherits\s*:?\s*([a-z_,()-]+)\s*").unwrap()); let query = read_query_text(language, filename); // replaces all "; inherits (,)*" with the queries of the given language(s) INHERITS_REGEX .replace_all(&query, |captures: ®ex::Captures| { captures[1] .split(',') .map(|language| { format!( "\n{}\n", read_query_impl(language, filename, &mut *read_query_text) ) }) .collect::() }) .to_string() } read_query_impl(language, filename, &mut read_query_text) } const SHEBANG: &str = r"#!\s*(?:\S*[/\\](?:env\s+(?:\-\S+\s+)*)?)?([^\s\.\d]+)"; static SHEBANG_REGEX: Lazy = Lazy::new(|| rope::Regex::new(SHEBANG).unwrap()); struct InjectionSettings { include_children: IncludedChildren, language: Option, } #[derive(Debug, Clone)] pub enum InjectionLanguageMarker<'a> { Name(Cow<'a, str>), Filename(Cow<'a, Path>), Shebang(String), } #[derive(Clone)] enum IncludedChildren { None, All, Unnamed, } impl Default for IncludedChildren { fn default() -> Self { Self::None } }