Source: src/main.js

// @ts-check
/**
 * @author Aday Cuesta Correa <alu0101483887@ull.edu.es>
 * @date 02/04/2024
 * @module lexgen-code
 * @file This module exports the functions buildLexer and nearleyLexer
 *     that allows to create lexical analyzers
 */

'use strict';

const HasNamedRegexp = /[(][?]<(\w+)>(.+)[)]/;
const IsNamedRegexp = /^[(][?]<(\w+)>(.+)[)]$/; // The same regexp as before but with anchors

/**
 * A helper function to check a regular expression has a
 *     named and only one named parenthesis and a body
 * @param {RegExp} namedRegexp The regular expression
 * @return {string | boolean} Whether the regular expression is named and has no named parenthesis inside
 * @private
 */
const checkRegExpIsNamed = (namedRegexp) => {
  const id = IsNamedRegexp.exec(namedRegexp.source);
  if (!id) return false;
  let srcRegexp = id[2];
  if (!srcRegexp.length) return false;
  if (HasNamedRegexp.exec(srcRegexp)) return false;
  return id[1];
};

/**
 * Creates a lexical analyzer
 * @param {array} regexps An array of regular expressions.
 *     Regexps must be named using a parenthesis. Example: `/(?<NAME>.)/`.
 *     The whole regexp must be inside the parenthesis.
 *     The names SPACE and ERROR are special:<br><br>
 *     1. SPACE. If something matches a parenthesis named SPACE it will
 *     be ignored<br><br>
 *     2. ERROR. It is a special value reserved for the implementation.
 *     When something doesn't match any of the provided regexps it will
 *     be returned as error. The error will span from the point where nothing
 *     matched to the next whitespace(\s)<br><br>
 *     **Note**: When two regexps can match the one that appears
 *     earlier will be chosen
 * @throws {Error} Will throw if each regular expression isn't named
 *     or has more than one name
 * @return {Object} The map of valid tokens and a lexical analyzer in form of a function
 */
const buildLexer = (regexps) => {
  let validTokens = new Map();
  regexps.push(/(?<ERROR>(.|\n)+)/);
  regexps.forEach((regexp) => {
    let tokenName = checkRegExpIsNamed(regexp);
    if (!tokenName) throw new Error ('All regular expressions must be named, have a non empty regexp');
    validTokens.set(tokenName, regexp);
  });
  const regexp = new RegExp(
    regexps.map((regexp) => regexp.source).join('|'),
    'yu',
  );
  let lexer = (string, line=1) => {
    let match;
    const result = [];
    let start = 0;
    while (match = regexp.exec(string)) {
      const type = Object.keys(match.groups).find((type) => match.groups[type] !== undefined);      
      line += string.slice(start, regexp.lastIndex).split('\n').length - 1;
      const col = start - string.lastIndexOf('\n', start);
      if (!validTokens.get(type).skip) {
        let value = match.groups[type];
        if (validTokens.get(type).value) value = validTokens.get(type).value(value);
        result.push({type, value, line, col, length: regexp.lastIndex - start});
      }
      start = regexp.lastIndex;
    }
    return result;
  };
  return {validTokens, lexer};
};

const nearleyLexer = function(regexps, options) {
  //debugger;
  const {validTokens, lexer} = buildLexer(regexps);
  validTokens.set("EOF");
  return {
    currentPos: 0,
    buffer: '',
    lexer: lexer,
    validTokens: validTokens,
    regexps: regexps,
    /**
     * Sets the internal buffer to data, and restores line/col/state info taken from save().
     * Compatibility not tested
     */
    reset: function(data, info) { 
      this.buffer = data || '';
      this.currentPos = 0;
      let line = info ? info.line : 1;
      this.tokens = lexer(data, line);
      
      let lastToken = {}; 
        // Replicate the last token if it exists
      Object.assign(lastToken, this.tokens[this.tokens.length-1]);
      lastToken.type = "EOF"
      lastToken.value = "EOF"

      this.tokens.push(lastToken);

      //console.log(this.tokens);
      if (options && options.transform) {
        if (typeof options.transform === 'function') {
          debugger;
          this.tokens = options.transform(this.tokens);
        } else if (Array.isArray(options.transform)) {
          options.transform.forEach(trans => this.tokens = trans(this.tokens))
        }
      } 
      return this;
    },
    /**
     * Returns e.g. {type, value, line, col, …}. Only the value attribute is required.
     */
    next: function() { // next(): Token | undefined;
      if (this.currentPos < this.tokens.length)
        return this.tokens[this.currentPos++];
      return undefined;
    },
    has: function(tokenType) {
      return validTokens.has(tokenType);
    },
    /**
     * Returns an object describing the current line/col etc. This allows nearley.JS
     * to preserve this information between feed() calls, and also to support Parser#rewind().
     * The exact structure is lexer-specific; nearley doesn't care what's in it.
     */
    save: function() {
      return this.tokens[this.currentPos];
    }, // line and col
    /**
     * Returns a string with an error message describing the line/col of the offending token.
     * You might like to include a preview of the line in question.
     */
    formatError: function(token) {
      return `Error near "${token.value}" in line ${token.line}`;
    } // string with error message
  };
}

module.exports = { buildLexer, nearleyLexer };