Quick tree-sitter grammars

A formula language that I wrote for a spreadsheet product called PT1 a couple years ago.

Quick is a formula language that I wrote a couple years back for a spreadsheet product called PT1. Tree-sitter is really great at this sort of thing, and, much easier than Antlr, which I’ve used in the past to build formula-like parsers. The basic code is one where you can execute simple calculation-like blocks, but not define vars. Was trying to stay simple here. Depending on the context, we used different implementations of functions, like when aggregating with sum() vs adding scalars with sum().

Here’s a couple examples:

10 * 20 + 4 - 1 - 4 ^ 24
22.222222 / 11.111111
10 % 3
2 * (3 + 4)
10 ^ 2
(10, 20, 30)
sum(1, 2, 3, 4, 5, variable, thing, another)
math.pow(2, 10) / 4
delta(10, -99)
monthly_salary * 12
{Yearly Salary} / 12
variables.tax_rate.city * 99100
variables.{Tax Rate}.city * 99100
100 * {Finance Variables}.{Tax Rate}.{City Tax Rate}

Features:

Basic arithmetic: add, subtract, multiply, divide, power, mod, parentheses, etc.
Simple types: boolean, string, number, null.
Possible multiple return values via parenthesized expressions.
Functions.
Variables, including user-friendly notation using {big variable name}
Dot-notation object access via selectors.
Numerics supported: int, uint, float, underscored, scientific notation

I based this on a combination of grammars that seemed clean and concise:

https://go.dev/ref/spec for naming, and their use of EBNF.
https://github.com/tree-sitter/tree-sitter-go for their implementation of the above spec.
https://github.com/siraben/tree-sitter-imp for example of simple grammars.

// Naming the precedences so we don't get confused.
const PRECEDENCE = {
  top: 9,
  identifier: 8,
  primary: 7,
  unary: 6,
  multiplicative: 5,
  additive: 4,
  comparative: 3,
  and: 2,
  or: 1,
  composite_literal: -1,
};

const multiplicative_operators = ["*", "/", "%", "^"];
const additive_operators = ["+", "-"];
const comparative_operators = ["==", "!=", "<", "<=", ">", ">="];

module.exports = grammar({
  "name": "quick",
  "extras": (_) => [
    // Ignore whitespace.
    /\s/,
  ],
  "inline": ($) => [$._string_literal],
  "supertypes": ($) => [$._expression],
  "rules": {
    // Program is the main entry point for a runnable calculation.
    "program": ($) => $._expression,

    // Reference is an alternate entry point for selector references. This is useful for places
    // where we're not running a calculation exactly, but we need to allow for the lookup
    // of some variable.
    "reference": ($) => $.selector_expression,

    // An expression is anything we can resolve to a value when executing the source.
    "_expression": ($) =>
        choice(
            $.unary_expression,
            $.binary_expression,
            $.selector_expression,
            $.call_expression,
            $._string_literal,
            $.number_literal,
            $.true,
            $.false,
            $.parenthesized_expressions
        ),

    // Parenthesizes expressions allow for multiple return types. Most of the time, we disallow
    // this because it is a bit too complex for our users, but there are edge cases where
    // we want users to be able, for example, to give us an order list of references.
    // Who knows, we could also use it to return lists.
    "parenthesized_expressions": ($) =>
        choice(
            seq("(", $._expression, ")"),
            seq("(", $._expression, repeat(seq(",", $._expression)), ")")
        ),

    // Simple unary expressions allow for operands that are themselves expressions. This means that
    // the user can, if they want, do strange things like `+-+-+-+---10`. Sure. Why not.
    "unary_expression": ($) =>
        prec(
            PRECEDENCE.unary,
            seq(field("operator", choice("+", "-", "!")), field("operand", $._expression))
        ),

    // Binary expression is anything that can be performed on two values, and an operator.
    // Here we're using our precedence rules to ensure that we have right- or left-associative
    // matching.
    "binary_expression": ($) => {
      const table = [
        [PRECEDENCE.multiplicative, choice(...multiplicative_operators)],
        [PRECEDENCE.additive, choice(...additive_operators)],
        [PRECEDENCE.comparative, choice(...comparative_operators)],
        [PRECEDENCE.and, "&&"],
        [PRECEDENCE.or, "||"],
      ];

      return choice(
          ...table.map(([precedence, operator]) =>
              prec.left(
                  precedence,
                  seq(
                      field("left", $._expression),
                      field("operator", operator),
                      field("right", $._expression)
                  )
              )
          )
      );
    },

    // Identifier in EBNF is something like `identifier = letter { letter | unicode_digit } .` but
    // we're condensing it down to a regex here, because we really don't need to know about
    // leading letters for syntax highlighting or execution purposes. We're also allowing
    // identifiers to contain strings by using a curly-brace syntax. For example, users can
    // do something like {This is a variable} and it will work.
    //
    // I'm sure that someone will try to use Emoji as variable names, but we'll figure that out
    // when we come to it.
    "identifier": (_) =>
        prec(
            PRECEDENCE.identifier,
            choice(/[a-zA-Z]+[a-zA-Z0-9_$]*/, token(seq("{", field("name", repeat(/[^{}]/)), "}")))
        ),

    // Numbers are required to start with a single, real integer, but they may contain
    // as many underscores as the user would like on either side of an optional, single
    // decimal point. They may contain an "E" character as long as it is followed by
    // one or more numbers, optionally inclusive of underscores. Using a regex here because
    // writing rules for something this simple seems a bit much. I'd rather have it
    // defined in a single place. At least until we need to support hex codes or something
    // weird like that.
    "number_literal": (_) => /[0-9]+_*[0-9_]*\.?[0-9_]*([eE]([+\-])?[0-9]+_*[0-9_]*)?/,

    // Mostly defaulting to how the Go spec defines this, and how the tree-sitter Go grammars
    // implement the spec. Allowing for escaping and raw strings.
    "_string_literal": ($) => choice($.raw_string_literal, $.inter_str_literal),
    "raw_string_literal": (_) => token(seq("`", repeat(/[^`]/), "`")),
    "inter_str_literal": ($) => seq('"', repeat(choice($._inter_str_literal_val, $.esc_seq)), '"'),
    "_inter_str_literal_val": (_) => token.immediate(prec(1, /[^"\n\\]+/)),
    "esc_seq": (_) =>
        token.immediate(
            seq(
                "\\",
                choice(
                    /[^xuU]/, /\d{2,3}/, /x[0-9a-fA-F]{2,}/, /u[0-9a-fA-F]{4}/, /U[0-9a-fA-F]{8}/
                )
            )
        ),

    // Case-sensitive boolean literals.
    "true": (_) => "true",
    "false": (_) => "false",

    // A selector expression is at least two or more identifiers or wildcards joined by a period.
    // We use a list for this type of expression, as it is easiest to work with during execution.
    "selector_expression": ($) => seq(
        choice($.wildcard, $.identifier),
        repeat(seq(".", choice($.wildcard, $.identifier))),
    ),
    "wildcard": (_) => "*",

    // A call expression is just a function name as a selector followed by an argument list,
    // which itself is just parens with an optional list of args.
    "call_expression": ($) => prec(
        PRECEDENCE.primary,
        seq(field("function", $.selector_expression), field("arguments", $.argument_list))
    ),
    "argument_list": ($) => seq(
        "(",
        optional(seq($._expression, repeat(seq(",", $._expression)))),
        ")"
    ),
  },
});

code | javascript

2024-07-28