emacs-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

feature/tree-sitter c2ecb08775: Lazily compile tree-sitter query


From: Yuan Fu
Subject: feature/tree-sitter c2ecb08775: Lazily compile tree-sitter query
Date: Tue, 11 Oct 2022 14:14:42 -0400 (EDT)

branch: feature/tree-sitter
commit c2ecb08775dc24618de507d2d1ce0f9b0debe17e
Author: Yuan Fu <casouri@gmail.com>
Commit: Yuan Fu <casouri@gmail.com>

    Lazily compile tree-sitter query
    
    See comment on struct Lisp_TS_Query for why.  Previous commits adding
    python and js support for tree-sitter break the Emacs build if language
    definitions aren't available at build time.  This commit fixes that.
    
    Now query objects don't compile the query upon creation, but store the
    query source and language symbol, and compile upon first use (in
    treesit-query-capture).
    
    I want ts_ensure_query_compiled to return a signal symbol and data
    rather than signaling itself, because it's a helper function, not a Lisp
    function.  But because it calls ts_load_language, I had to update
    ts_load_language to also use the signal symbol/data interface.
    
    * src/treesit.h (struct Lisp_TS_Query): Add two new fields.
    * src/treesit.c (ts_load_language): Return signal symbol and data
    rather than signaling itself.
    (Ftreesit_langauge_available_p)
    (Ftreesit_parser_create): Update usage of ts_load_language.
    
    (make_ts_query): Now returns a lisp object.
    (ts_query_error_to_string): Moved because it's used by
    ts_compose_query_signal_data.
    (ts_compose_query_signal_data)
    (ts_ensure_query_compiled): New functions.
    (Ftreesit_query_compile): Delay compiling the query.
    (Ftreesit_query_capture): Instead of creating a query object which
    compiles the query, now goes two routes: if QUERY is a query object,
    make sure it is compiled and use the TSQuery; if QUERY is a cons or
    string, compile directly to TSQuery, and free it after use.  Creating
    a lisp query requires the language symbol, but in this function we
    only have TSLanguage.
---
 src/treesit.c | 267 ++++++++++++++++++++++++++++++++++++++--------------------
 src/treesit.h |  20 ++++-
 2 files changed, 194 insertions(+), 93 deletions(-)

diff --git a/src/treesit.c b/src/treesit.c
index fc7e6c5593..2565464dea 100644
--- a/src/treesit.c
+++ b/src/treesit.c
@@ -89,6 +89,8 @@ along with GNU Emacs.  If not, see 
<https://www.gnu.org/licenses/>.  */
    - lisp/emacs-lisp/cl-preloaded.el & data.c & lisp.h for parser and
      node type.
 
+   Regarding signals: only raise signals in lisp functions.
+
    We don't parse at every keystroke.  Instead we only record the
    changes at each keystroke, and only parse when requested.  It is
    possible that lazy parsing is worse: instead of dispersed little
@@ -196,10 +198,11 @@ ts_load_language_push_for_each_suffix
    Qtreesit_load_language_error carries the error message from
    trying to load the library with each extension.
 
-   If SIGNAL is true, signal an error when failed to load LANGUAGE; if
-   false, return NULL when failed.  */
+   If error occurs, return NULL and fill SIGNAL_SYMBOL and SIGNAL_DATA
+   with values suitable for xsignal. */
 static TSLanguage *
-ts_load_language (Lisp_Object language_symbol, bool signal)
+ts_load_language (Lisp_Object language_symbol,
+                 Lisp_Object *signal_symbol, Lisp_Object *signal_data)
 {
   Lisp_Object symbol_name = Fsymbol_name (language_symbol);
 
@@ -263,11 +266,9 @@ ts_load_language (Lisp_Object language_symbol, bool signal)
     }
   if (error != NULL)
     {
-      if (signal)
-       xsignal2 (Qtreesit_load_language_error,
-                 symbol_name, Fnreverse (error_list));
-      else
-       return NULL;
+      *signal_symbol = Qtreesit_load_language_error;
+      *signal_data = list2 (symbol_name, Fnreverse (error_list));
+      return NULL;
     }
 
   /* Load TSLanguage.  */
@@ -277,11 +278,9 @@ ts_load_language (Lisp_Object language_symbol, bool signal)
   error = dynlib_error ();
   if (error != NULL)
     {
-      if (signal)
-       xsignal1 (Qtreesit_load_language_error,
-                 build_string (error));
-      else
-       return NULL;
+      *signal_symbol = Qtreesit_load_language_error;
+      *signal_data = build_string (error);
+      return NULL;
     }
   TSLanguage *lang = (*langfn) ();
 
@@ -291,12 +290,10 @@ ts_load_language (Lisp_Object language_symbol, bool 
signal)
   ts_parser_delete (parser);
   if (!success)
     {
-      if (signal)
-       xsignal2 (Qtreesit_load_language_error,
-                 build_pure_c_string ("Language version doesn't match 
tree-sitter version, language version:"),
-                 make_fixnum (ts_language_version (lang)));
-      else
-       return NULL;
+      *signal_symbol = Qtreesit_load_language_error;
+      *signal_data = list2 (build_pure_c_string ("Language version doesn't 
match tree-sitter version, language version:"),
+                           make_fixnum (ts_language_version (lang)));
+      return NULL;
     }
   return lang;
 }
@@ -310,7 +307,9 @@ DEFUN ("treesit-language-available-p",
 {
   CHECK_SYMBOL (language);
   ts_initialize ();
-  if (ts_load_language(language, false) == NULL)
+  Lisp_Object signal_symbol = Qnil;
+  Lisp_Object signal_data = Qnil;
+  if (ts_load_language(language, &signal_symbol, &signal_data) == NULL)
     return Qnil;
   else
     return Qt;
@@ -634,29 +633,96 @@ make_ts_node (Lisp_Object parser, TSNode node)
   return make_lisp_ptr (lisp_node, Lisp_Vectorlike);
 }
 
-/* Make a compiled query struct.  Return NULL if error occurs.  QUERY
-   has to be either a cons or a string.  */
-static struct Lisp_TS_Query *
-make_ts_query (Lisp_Object query, const TSLanguage *language,
-              uint32_t *error_offset, TSQueryError *error_type)
+/* Make a compiled query.  QUERY has to be either a cons or a
+   string.  */
+static Lisp_Object
+make_ts_query (Lisp_Object query, Lisp_Object language)
 {
-  if (CONSP (query))
-    query = Ftreesit_query_expand (query);
-  char *source = SSDATA (query);
-
-  TSQuery *ts_query = ts_query_new (language, source, strlen (source),
-                                   error_offset, error_type);
   TSQueryCursor *ts_cursor = ts_query_cursor_new ();
+  struct Lisp_TS_Query *lisp_query
+    = ALLOCATE_PSEUDOVECTOR (struct Lisp_TS_Query, source,
+                            PVEC_TS_COMPILED_QUERY);
 
-  if (ts_query == NULL)
+  lisp_query->language = language;
+  lisp_query->source = query;
+  lisp_query->query = NULL;
+  lisp_query->cursor = ts_cursor;
+  return make_lisp_ptr (lisp_query, Lisp_Vectorlike);
+}
+
+static const char*
+ts_query_error_to_string (TSQueryError error)
+{
+  switch (error)
+    {
+    case TSQueryErrorNone:
+      return "None";
+    case TSQueryErrorSyntax:
+      return "Syntax error at";
+    case TSQueryErrorNodeType:
+      return "Node type error at";
+    case TSQueryErrorField:
+      return "Field error at";
+    case TSQueryErrorCapture:
+      return "Capture error at";
+    case TSQueryErrorStructure:
+      return "Structure error at";
+    default:
+      return "Unknown error";
+    }
+}
+
+static Lisp_Object
+ts_compose_query_signal_data
+(uint32_t error_offset, TSQueryError error_type)
+{
+  return list3 (build_string
+               (ts_query_error_to_string (error_type)),
+               make_fixnum (error_offset + 1),
+               build_pure_c_string("Debug the query with 
`treesit-query-validate'"));
+}
+
+/* Ensure QUERY is compiled and return its TSQuery.  QUERY must be a
+   Lisp query object.  If an error occurs (the language fails to load
+   or the query source is malformed), return NULL and set SIGNAL_SYMBOL
+   and SIGNAL_DATA to values suitable for xsignal.  */
+static TSQuery *
+ts_ensure_query_compiled
+(Lisp_Object query, Lisp_Object *signal_symbol, Lisp_Object *signal_data)
+{
+  /* If query is already compiled (not null), return that, otherwise
+     compile and return it.  */
+  TSQuery *ts_query = XTS_COMPILED_QUERY (query)->query;
+  if (ts_query != NULL)
+    return ts_query;
+
+  /* Get query source and TSLanguage ready.  */
+  Lisp_Object source = XTS_COMPILED_QUERY (query)->source;
+  Lisp_Object language = XTS_COMPILED_QUERY (query)->language;
+  /* This is the main reason why we compile query lazily: to avoid
+     loading languages early.  */
+  TSLanguage *ts_lang = ts_load_language (language, signal_symbol,
+                                         signal_data);
+  if (ts_lang == NULL)
     return NULL;
 
-  struct Lisp_TS_Query *lisp_query
-    = ALLOCATE_PLAIN_PSEUDOVECTOR (struct Lisp_TS_Query,
-                                  PVEC_TS_COMPILED_QUERY);
-  lisp_query->query = ts_query;
-  lisp_query->cursor = ts_cursor;
-  return lisp_query;
+  if (CONSP (source))
+    source = Ftreesit_query_expand (source);
+
+  /* Create TSQuery.  */
+  uint32_t error_offset;
+  TSQueryError error_type;
+  char *ts_source = SSDATA (source);
+  ts_query = ts_query_new (ts_lang, ts_source, strlen (ts_source),
+                          &error_offset, &error_type);
+  if (ts_query == NULL)
+    {
+      *signal_symbol = Qtreesit_query_error;
+      *signal_data = ts_compose_query_signal_data
+       (error_offset, error_type);
+    }
+  XTS_COMPILED_QUERY (query)->query = ts_query;
+  return ts_query;
 }
 
 DEFUN ("treesit-parser-p",
@@ -750,15 +816,23 @@ parser.  If NO-REUSE is non-nil, always create a new 
parser.  */)
        }
     }
 
+  /* Load language.  */
+  Lisp_Object signal_symbol = Qnil;
+  Lisp_Object signal_data = Qnil;
   TSParser *parser = ts_parser_new ();
-  TSLanguage *lang = ts_load_language (language, true);
+  TSLanguage *lang = ts_load_language (language, &signal_symbol,
+                                      &signal_data);
+  if (lang == NULL)
+    xsignal (signal_symbol, signal_data);
   /* We check language version when loading a language, so this should
      always succeed.  */
   ts_parser_set_language (parser, lang);
 
+  /* Create parser.  */
   Lisp_Object lisp_parser
     = make_ts_parser (Fcurrent_buffer (), parser, NULL, language);
 
+  /* Update parser-list.  */
   BVAR (buf, ts_parser_list)
     = Fcons (lisp_parser, BVAR (buf, ts_parser_list));
 
@@ -1454,28 +1528,6 @@ explanation.  */)
                     query, build_pure_c_string (" "));
 }
 
-static const char*
-ts_query_error_to_string (TSQueryError error)
-{
-  switch (error)
-    {
-    case TSQueryErrorNone:
-      return "None";
-    case TSQueryErrorSyntax:
-      return "Syntax error at";
-    case TSQueryErrorNodeType:
-      return "Node type error at";
-    case TSQueryErrorField:
-      return "Field error at";
-    case TSQueryErrorCapture:
-      return "Capture error at";
-    case TSQueryErrorStructure:
-      return "Structure error at";
-    default:
-      return "Unknown error";
-    }
-}
-
 /* This struct is used for passing captures to be check against
    predicates.  Captures we check for are the ones in START before
    END.  For example, if START and END are
@@ -1656,16 +1708,19 @@ ts_eval_predicates
 
 DEFUN ("treesit-query-compile",
        Ftreesit_query_compile,
-       Streesit_query_compile, 2, 2, 0,
+       Streesit_query_compile, 2, 3, 0,
        doc: /* Compile QUERY to a compiled query.
 
 Querying a compiled query is much faster than an uncompiled one.
 LANGUAGE is the language this query is for.
 
+If EAGER is non-nil, immediately load LANGUAGE and compile the query.
+Otherwise defer until the query is first used.
+
 Signals treesit-query-error if QUERY is malformed or something else
-goes wrong.  You can use `treesit-query-validate' to debug the
-query.  */)
-  (Lisp_Object language, Lisp_Object query)
+goes wrong.  (This of course would only happen if EAGER is non-nil.)
+You can use `treesit-query-validate' to debug the query.  */)
+  (Lisp_Object language, Lisp_Object query, Lisp_Object eager)
 {
   if (NILP (Ftreesit_query_p (query)))
     wrong_type_argument (Qtreesit_query_p, query);
@@ -1673,19 +1728,23 @@ query.  */)
   if (TS_COMPILED_QUERY_P (query))
     return query;
 
-  TSLanguage *ts_lang = ts_load_language (language, true);
-  uint32_t error_offset;
-  TSQueryError error_type;
+  Lisp_Object lisp_query = make_ts_query (query, language);
 
-  struct Lisp_TS_Query *lisp_query
-    = make_ts_query (query, ts_lang, &error_offset, &error_type);
+  /* Maybe actually compile. */
+  if (NILP (eager))
+      return lisp_query;
+  else
+    {
+      Lisp_Object signal_symbol = Qnil;
+      Lisp_Object signal_data = Qnil;
+      TSQuery *ts_query = ts_ensure_query_compiled
+       (lisp_query, &signal_symbol, &signal_data);
 
-  if (lisp_query == NULL)
-    xsignal2 (Qtreesit_query_error,
-             build_string (ts_query_error_to_string (error_type)),
-             make_fixnum (error_offset + 1));
+      if (ts_query == NULL)
+       xsignal (signal_symbol, signal_data);
 
-  return make_lisp_ptr (lisp_query, Lisp_Vectorlike);
+      return lisp_query;
+    }
 }
 
 DEFUN ("treesit-query-capture",
@@ -1725,7 +1784,7 @@ query.  */)
        || CONSP (query) || STRINGP (query)))
     wrong_type_argument (Qtreesit_query_p, query);
 
-
+  /* Resolve NODE into an actual node. */
   Lisp_Object lisp_node;
   if (TS_NODEP (node))
     lisp_node = node;
@@ -1751,30 +1810,50 @@ query.  */)
   const TSLanguage *lang = ts_parser_language
     (XTS_PARSER (lisp_parser)->parser);
 
-  /* Initialize query objects, and execute query.  */
-  struct Lisp_TS_Query *lisp_query;
+  /* Initialize query objects.  At the end of this block, we should
+     have a working TSQuery and a TSQueryCursor.  */
+  TSQuery *ts_query;
+  TSQueryCursor *cursor;
+  bool needs_to_free_query_and_cursor;
   if (TS_COMPILED_QUERY_P (query))
-      lisp_query = XTS_COMPILED_QUERY (query);
+    {
+      Lisp_Object signal_symbol = Qnil;
+      Lisp_Object signal_data = Qnil;
+      ts_query = ts_ensure_query_compiled
+       (query, &signal_symbol, &signal_data);
+      cursor = XTS_COMPILED_QUERY (query)->cursor;
+      /* We don't need to free ts_query and cursor because they
+        are stored in a lisp object, which is tracked by gc.  */
+      needs_to_free_query_and_cursor = false;
+      if (ts_query == NULL)
+       {
+         xsignal (signal_symbol, signal_data);
+       }
+    }
   else
     {
+      /* Since query is not TS_COMPILED_QUERY, it can only be a string
+        or a cons.  */
+      if (CONSP (query))
+       query = Ftreesit_query_expand (query);
+      char *query_string = SSDATA (query);
       uint32_t error_offset;
       TSQueryError error_type;
-      lisp_query = make_ts_query (query, lang,
-                                 &error_offset, &error_type);
-      if (lisp_query == NULL)
+      ts_query = ts_query_new (lang, query_string, strlen (query_string),
+                              &error_offset, &error_type);
+      if (ts_query == NULL)
        {
-         xsignal3 (Qtreesit_query_error,
-                   build_string
-                   (ts_query_error_to_string (error_type)),
-                   make_fixnum (error_offset + 1),
-                   build_pure_c_string("Debug the query with 
`treesit-query-validate'"));
+         xsignal (Qtreesit_query_error, ts_compose_query_signal_data
+                  (error_offset, error_type));
        }
-      /* We don't need need to free TS_QUERY and CURSOR, they are stored
-        in a lisp object, which is tracked by gc.  */
+      cursor = ts_query_cursor_new ();
+      needs_to_free_query_and_cursor = true;
     }
-  TSQuery *ts_query = lisp_query->query;
-  TSQueryCursor *cursor = lisp_query->cursor;
 
+  /* WARN: After this point, free ts_query and cursor before every
+     signal and return.  */
+
+  /* Set query range. */
   if (!NILP (beg) && !NILP (end))
     {
       EMACS_INT beg_byte = XFIXNUM (beg);
@@ -1784,6 +1863,7 @@ query.  */)
         (uint32_t) end_byte - visible_beg);
     }
 
+  /* Execute query.  */
   ts_query_cursor_exec (cursor, ts_query, ts_node);
   TSQueryMatch match;
 
@@ -1838,6 +1918,11 @@ query.  */)
          result = prev_result;
        }
     }
+  if (needs_to_free_query_and_cursor)
+    {
+      ts_query_delete (ts_query);
+      ts_query_cursor_delete (cursor);
+    }
   return Fnreverse (result);
 }
 
diff --git a/src/treesit.h b/src/treesit.h
index 0c043f7d25..20e7cd4107 100644
--- a/src/treesit.h
+++ b/src/treesit.h
@@ -84,11 +84,27 @@ struct Lisp_TS_Node
   ptrdiff_t timestamp;
 };
 
-/* A compiled tree-sitter query.  */
+/* A compiled tree-sitter query.
+
+   When we create a query object by treesit-query-compile, it is not
+   immediately compiled, because that would require the language
+   definition to be loaded.  For example, python.el contains
+
+   (defvar xxx (treesit-query-compile ...))
+
+   and (require 'python.el) requires python's language definition to
+   be available.  In the case of python.el, Emacs requires it when
+   building, so that breaks the build.  */
 struct Lisp_TS_Query
 {
   union vectorlike_header header;
-  /* Pointer to the query object.  */
+  /* Language symbol for the query.  */
+  Lisp_Object language;
+  /* Source lisp (sexp or string) query.  */
+  Lisp_Object source;
+  /* Pointer to the query object.  This can be NULL, meaning this
+     query is not initialized/compiled.  We compile the query when
+     it is used the first time (in treesit-query-capture).  */
   TSQuery *query;
   /* Pointer to a cursor.  If we are storing the query object, we
      might as well store a cursor, too.  */



reply via email to

[Prev in Thread] Current Thread [Next in Thread]