Base grammar

Base language parselets:
/*
 * Copyright (c) 2021.  Jeffrey Vroom. All Rights Reserved.
 */

package sc.lang;

import sc.lang.java.NonKeywordString;
import sc.layer.Layer;
import sc.parser.*;
import sc.util.FileUtil;

import java.util.Collections;
import java.util.Set;

/**
 * The BaseLanguage contains some core constructs useful in all languages: digits, whitespace, and some higher
 * level parselets like Keyword, SemanticToken, etc.
 * It's possible to modify parselets, or override language features by overriding methods (e.g. to modify what
 * characters are valid in an identifier). This is helpful to reuse higher level features that use those parselets,
 * for example changing EOLComment for SQL to -- or the array bracket character between JS and Java.
 * It's also possible to clone and change a parselet rather than copying the original definition.
 */
public abstract class BaseLanguage extends Language implements IParserConstants {
   /** Keyword set returned by the default {@link #getKeywords()}; initialized to the empty set. */
   public static Set DEPS_KEYWORDS = Collections.emptySet();

   /** When true, the identifier parselet wraps a candidate in a ToLowerWrapper before the keyword lookup. */
   public boolean ignoreCaseInKeywords = false;

   /**
    * Returns the set which the identifier parselet consults to reject keywords.
    * This base implementation returns {@link #DEPS_KEYWORDS}; languages with reserved words presumably override it.
    *
    * @return the keyword set for this language
    */
   public Set getKeywords() {
      return DEPS_KEYWORDS;
   }
   
   // Character-class parselets used to build numeric literals.  They are created empty here; the accepted characters
   // are registered afterwards by addDigitChar and by the instance initializer which follows it.
   public SymbolChoice digits = new SymbolChoice(REPEAT | NOERROR);
   public SymbolChoice optDigits = new SymbolChoice(REPEAT | OPTIONAL | NOERROR);
   public SymbolChoice nonZeroDigit = new SymbolChoice(NOERROR);
   public SymbolChoice hexDigits = new SymbolChoice(REPEAT | NOERROR);
   public SymbolChoice hexDigit = new SymbolChoice(NOERROR);
   public SymbolChoice binaryDigits = new SymbolChoice(REPEAT | NOERROR);
   public SymbolChoice binaryDigit = new SymbolChoice(NOERROR);
   public SymbolChoice octalDigits = new SymbolChoice(REPEAT | NOERROR);
   // NOTE(review): unlike the other single-digit choices, octalDigit is created without NOERROR - confirm this is intentional
   public SymbolChoice octalDigit = new SymbolChoice();
   public Sequence optOctalDigit = new Sequence(OPTIONAL | NOERROR, octalDigit);
   public SymbolChoice fpChar = new SymbolChoice(LOOKAHEAD | NOERROR); // start chars for floating point
   public SymbolChoice intChar = new SymbolChoice(LOOKAHEAD | NOERROR); // start chars for integer

   // Lower-case hex letters; the instance initializer registers both these and their upper-case forms
   public String [] hexLetters = {"a", "b", "c", "d", "e", "f"};

   /**
    * Registers one decimal digit with every digit parselet whose character class includes it.
    *
    * @param s the digit as a one character string
    * @param i the numeric value of the digit; decides membership of the octal ({@code i < 8}),
    *          non-zero ({@code i != 0}) and binary ({@code i < 2}) classes
    */
   protected void addDigitChar(String s, int i) {
      final boolean octal = i < 8;
      final boolean nonZero = i != 0;
      final boolean binary = i < 2;

      // Every decimal digit belongs to the decimal and hex classes and may start an integer or floating point literal
      for (SymbolChoice anyDigitChoice : new SymbolChoice[] {digits, optDigits, hexDigits, hexDigit, fpChar, intChar}) {
         anyDigitChoice.addExpectedValue(s);
      }

      if (octal) {
         octalDigits.addExpectedValue(s);
         octalDigit.addExpectedValue(s);
      }
      if (nonZero) {
         nonZeroDigit.addExpectedValue(s);
      }
      if (binary) {
         binaryDigits.addExpectedValue(s);
         binaryDigit.addExpectedValue(s);
      }
   }

   // Fills the digit parselets: the ten decimal digits, then each hex letter in lower and upper case,
   // and finally "." as an additional start character for floating point values.
   {
      int digit = 0;
      while (digit < 10) {
         addDigitChar(String.valueOf(digit), digit);
         digit++;
      }

      for (int ix = 0; ix < hexLetters.length; ix++) {
         final String lower = hexLetters[ix];
         final String upper = lower.toUpperCase();
         hexDigits.addExpectedValue(lower);
         hexDigits.addExpectedValue(upper);
         hexDigit.addExpectedValue(lower);
         hexDigit.addExpectedValue(upper);
      }

      fpChar.addExpectedValue(".");
   }

   // Matches any one of the three line ending forms: "\r\n", "\r" or "\n"
   public SymbolChoice lineTerminator = new SymbolChoice(NOERROR, "\r\n", "\r", "\n");
   {
      lineTerminator.defaultGenerateValue = "\n"; // The symbol to use when generating this node from scratch - i.e. the default terminator
   }
   // Negated, optional, repeating choice over the line endings and EOF - presumably consumes the rest of the current line
   public SymbolChoice notLineTerminators = new SymbolChoice(NOT | REPEAT | OPTIONAL | NOERROR, "\r\n", "\r", "\n", Symbol.EOF);

   // One chunk of whitespace: a run of spaces, a single tab, a single form feed, or one line terminator.
   // NOTE(review): the keys given to put() presumably select the child by the next input character - confirm in IndexedChoice
   IndexedChoice whiteSpaceChunk = new IndexedChoice("<whitespaceChunk>", NOERROR | OPTIONAL);
   {
      whiteSpaceChunk.put(" ", new Symbol(REPEAT, " "));
      whiteSpaceChunk.put("\t", new Symbol("\t"));
      whiteSpaceChunk.put("\f", new Symbol("\f"));
      whiteSpaceChunk.put("\r", lineTerminator);
      whiteSpaceChunk.put("\n", lineTerminator);
   }

   // A clone of whiteSpaceChunk which is renamed and has its repeat flag turned on
   public IndexedChoice whiteSpace = whiteSpaceChunk.clone();
   {
      whiteSpace.setName("<whiteSpace>");
      whiteSpace.repeat = true;
   }

   // End-of-line comment: the "//" opener, the notLineTerminators parselet, then either a line terminator or EOF.
   // Its style name is "comment".
   public Sequence EOLComment = new Sequence("<eolComment>", NOERROR,
           new Symbol("//"),
           notLineTerminators,
           new OrderedChoice(lineTerminator, new Symbol(Symbol.EOF)));
   {
      EOLComment.styleName = "comment";
   }

   // The text between the opener and closer of a block comment (optional and repeating).
   public IndexedChoice commentBody = new IndexedChoice("<commentBody>", OPTIONAL | REPEAT | NOERROR);
   {
      // A "*" belongs to the body only when the NOT | LOOKAHEAD check for "/" passes, i.e. it does not start the closer
      commentBody.put("*", new Sequence(NOERROR,new Symbol(NOERROR,"*"), new Symbol(NOERROR | NOT | LOOKAHEAD, "/")));
      // Default entry: a negated "*" symbol followed by a lookahead for any character
      // NOTE(review): presumably this consumes one non-"*" character and requires more input after it - confirm NOT semantics in Symbol
      commentBody.addDefault(new Sequence(NOERROR,new Symbol(NOERROR | NOT, "*"), new Symbol(NOERROR | LOOKAHEAD, Symbol.ANYCHAR)));
   }

   // Block comment: the opener symbol, the comment body, and the closer symbol.  Its style name is "comment".
   public Sequence blockComment = new Sequence("<blockComment>", NOERROR);
   {
      // The children are attached after construction, then the style is assigned
      blockComment.add(new Symbol("/*"), commentBody, new Symbol("*/"));
      blockComment.styleName = "comment";
   }

   // Optional, repeating mix of whitespace chunks, end-of-line comments and block comments.
   public IndexedChoice spacing = new IndexedChoice("<spacing>", REPEAT | OPTIONAL | NOERROR);
   {
      // All five whitespace keys map to the shared whitespace chunk parselet
      final String[] whiteSpaceKeys = {" ", "\t", "\f", "\r", "\n"};
      for (int ix = 0; ix < whiteSpaceKeys.length; ix++) {
         spacing.put(whiteSpaceKeys[ix], whiteSpaceChunk);
      }
      // The two comment forms are registered under their opening symbols
      spacing.put("//", EOLComment);
      spacing.put("/*", blockComment);

      spacing.generateParseNode = new SpacingParseNode(spacing, false);
      spacing.alwaysReparse = true;
   }

   // Clone of spacing with its own name, whose generated parse node is a SpacingParseNode created with 'true'
   // rather than 'false'.
   // NOTE(review): presumably that flag makes the generated spacing end with a newline - confirm in SpacingParseNode
   public IndexedChoice spacingEOL = (IndexedChoice) spacing.clone();
   {
      spacingEOL.setName("<spacingEOL>");
      spacingEOL.generateParseNode = new SpacingParseNode(spacingEOL, true);
   }

   // "." followed by spacing
   public SymbolChoiceSpace periodSpace = new SymbolChoiceSpace(".");
   {
      // We want to match '.' but not '...' in the partial values case so excluding '..'
      periodSpace.addExcludedValues("..");
   }
   // A bare "." with no trailing spacing
   public Symbol period = new Symbol(".");
   // Two ";" + spacing parselets created with SKIP_ON_ERROR; they differ only in the text given to their
   // NewlineParseNode (";" versus ";\n")
   public SymbolSpace semicolonEOL = new SymbolSpace(";", SKIP_ON_ERROR);
   public SymbolSpace semicolonEOL2 = new SymbolSpace(";", SKIP_ON_ERROR);
   {
      semicolonEOL.generateParseNode = new NewlineParseNode(semicolonEOL, ";");
      semicolonEOL2.generateParseNode = new NewlineParseNode(semicolonEOL2, ";\n");
   }
   // A semicolon followed by 2 newlines for package, imports
   public SymbolSpace semicolonNewline = new SymbolSpace(";", SKIP_ON_ERROR);
   {
      // The anonymous subclass returns two line separators as its newline separator
      semicolonNewline.generateParseNode = new NewlineParseNode(semicolonNewline, ";") {
         public String getNewlineSeparator() {
            return FileUtil.LINE_SEPARATOR + FileUtil.LINE_SEPARATOR;
         }
      };
   }
   // Punctuation followed by spacing.  The "EOL" variants additionally install a NewlineParseNode as the parse node
   // used for generation - presumably so the generated text ends the line after the symbol (confirm in NewlineParseNode).
   public SymbolSpace semicolon = new SymbolSpace(";");
   public SymbolSpace colon = new SymbolSpace(":");
   public SymbolSpace colonEOL = new SymbolSpace(":");
   {
      colonEOL.generateParseNode = new NewlineParseNode(colonEOL, ":");
   }
   // An optional ";" followed by spacing
   public Sequence optSemicolon = new Sequence(OPTIONAL, new Symbol(";"), spacing);
   public SymbolSpace comma = new SymbolSpace(",");
   public SymbolSpace commaEOL = new SymbolSpace(",");
   {
      commaEOL.generateParseNode = new NewlineParseNode(commaEOL, ",");
   }
   public SymbolSpace openBrace = new SymbolSpace("{");
   public SymbolSpace openBraceEOL = new SymbolSpace("{");
   {
      openBraceEOL.generateParseNode = new NewlineParseNode(openBraceEOL, "{");
      // Sets the pushIndent flag - presumably increases the indent level for what follows during generation
      openBraceEOL.pushIndent = true;
   }
   public SymbolSpace closeBrace = new SymbolSpace("}");
   public SymbolSpace closeBraceEOL = new SymbolSpace("}", SKIP_ON_ERROR);
   {
      closeBraceEOL.generateParseNode = new NewlineParseNode(closeBraceEOL, "}");
      // Sets the popIndent flag - the counterpart of pushIndent on openBraceEOL
      closeBraceEOL.popIndent = true;
   }
   // Parentheses, brackets and operator symbols, each followed by spacing
   public SymbolSpace openParen = new SymbolSpace("(");

   // Same symbol as closeParen but created with the SKIP_ON_ERROR option
   public SymbolSpace closeParenSkipOnError = new SymbolSpace(")", SKIP_ON_ERROR);
   // TODO: should all closeParen's have skip on error?  Right now for cast expressions, we'll terminate those with the ; there's a chance the "skip on error" will lead to ambiguities when the body of the construct we are skipping has not enough info to differentiate it
   public SymbolSpace closeParen = new SymbolSpace(")");
   // Use this one for annotations
   public SymbolSpace openParenEOL = new SymbolSpace("(");
   public SymbolSpace closeParenEOL = new SymbolSpace(")", SKIP_ON_ERROR);
   {
      openParenEOL.generateParseNode = new NewlineParseNode(openParenEOL, "(");
      closeParenEOL.generateParseNode = new NewlineParseNode(closeParenEOL, ")");
   }
   // and this one for if statements where we need to indent
   public SymbolSpace closeParenEOLIndent = new SymbolSpace(")", SKIP_ON_ERROR);
   {
      // Same as closeParenEOL except that the parse node has its needsIndent flag set
      NewlineParseNode pn = new NewlineParseNode(closeParenEOLIndent, ")");
      pn.needsIndent = true;
      closeParenEOLIndent.generateParseNode = pn;
   }
   public SymbolSpace openSqBracket = new SymbolSpace("[");
   public SymbolSpace closeSqBracket = new SymbolSpace("]");
   public SymbolSpace lessThan = new SymbolSpace("<");
   public SymbolSpace greaterThan = new SymbolSpace(">");
   // Same symbol as greaterThan but created with the SKIP_ON_ERROR option
   public SymbolSpace greaterThanSkipOnError = new SymbolSpace(">", SKIP_ON_ERROR);
   public SymbolSpace equalSign = new SymbolSpace("=");
   public SymbolSpace asterix = new SymbolSpace("*");
   public SymbolSpace questionMark = new SymbolSpace("?");

   /** Matches one character and accepts it only when {@link #isIdentifierStartChar(char)} approves it. */
   public Symbol startIdentifierChar = new Symbol("<startIdChar>", 0, Symbol.ANYCHAR) {
      /** Returns null when the value is a single valid start character, otherwise the reason for rejecting it. */
      protected String accept(SemanticContext ctx, Object value, int startIx, int endIx) {
         final IString text = PString.toIString(value);
         if (text == null) {
            return "Identifiers must be non null";
         }
         final boolean singleStartChar = text.length() == 1 && isIdentifierStartChar(text.charAt(0));
         return singleStartChar ? null : "Not a valid start identifier character";
      }
   };

   /**
    * A symbol whose matched text must consist of identifier part characters as defined by
    * {@link #isIdentifierPartChar(char)}.  Without the repeat flag exactly one character is required; with it,
    * every character of the matched text is checked.
    */
   public class IdentSymbol extends Symbol {
      public IdentSymbol(String id, int options, String ev) {
         super(id, options, ev);
      }

      /** Returns null when the value is acceptable, otherwise the reason for rejecting it. */
      protected String accept(SemanticContext ctx, Object value, int startIx, int endIx) {
         final IString text = PString.toIString(value);
         if (text == null) {
            return "Identifiers must be non null";
         }

         final int len = text.length();
         if (!repeat) {
            // Single character form
            if (len != 1 || !isIdentifierPartChar(text.charAt(0))) {
               return "Not a valid character for the inside of an identifier";
            }
            return null;
         }

         // Repeating form: the first offending character rejects the whole value
         for (int ix = 0; ix < len; ix++) {
            if (!isIdentifierPartChar(text.charAt(ix))) {
               return "Not a valid character for the inside of an identifier";
            }
         }
         return null;
      }
   }

   // Hooks so that new languages can change the rules for identifier and reuse more parselets
   /**
    * Decides whether a character may appear inside an identifier.  The default follows the Java rules.
    *
    * @param c the character to test
    * @return true when {@code c} is a valid non-initial identifier character
    */
   public boolean isIdentifierPartChar(char c) {
      final boolean validPart = Character.isJavaIdentifierPart(c);
      return validPart;
   }

   /**
    * Decides whether a character may begin an identifier.  The default follows the Java rules.
    *
    * @param c the character to test
    * @return true when {@code c} is a valid first identifier character
    */
   public boolean isIdentifierStartChar(char c) {
      final boolean validStart = Character.isJavaIdentifierStart(c);
      return validStart;
   }

   // Exactly one identifier part character (validated by IdentSymbol.accept)
   public Symbol identifierChar = new IdentSymbol("<idChar>", 0, Symbol.ANYCHAR);
   // Optional, repeating form: the identifier part characters which follow the start character
   public Symbol nextIdentChars = new IdentSymbol("<nextIdChars>", OPTIONAL | REPEAT, Symbol.ANYCHAR);

   /** Matches one character and accepts it only when {@link Character#isLetterOrDigit(char)} is true for it. */
   public Symbol alphaNumChar = new Symbol("<alphaNumChar>", 0, Symbol.ANYCHAR) {
      /**
       * Validates the matched text.
       *
       * @return null when the value is a single letter or digit, otherwise a message describing the rejection
       */
      protected String accept(SemanticContext ctx, Object value, int startIx, int endIx) {
         IString str = PString.toIString(value);
         if (str == null)
            return "AlphaNum char must be non null"; // message previously read "must not non null"
         if (str.length() == 1 && Character.isLetterOrDigit(str.charAt(0)))
            return null;
         return "Not alpha numeric";
      }

   };

   /**
    * An identifier: a start character, the following identifier characters, then spacing.  The accept hook
    * additionally rejects values found in the keyword set of the parselet's language.  Results are cached.
    */
   public Sequence identifier = new Sequence("<identifier>('','',)", startIdentifierChar, nextIdentChars, spacing) {
      /** Only the keyword check happens here - the individual characters were validated by the child parselets. */
      protected String accept(SemanticContext ctx, Object value, int startIx, int endIx) {
         Object candidate = value instanceof IParseNode ? ((IParseNode) value).getSemanticValue() : value;

         // NonKeywordString is a sentinel which lets even 'this' through as a non-keyword, so code generation
         // does not have to convert to a selector expression
         if (candidate instanceof NonKeywordString) {
            return null;
         }

         final boolean needsConversion = candidate != null && !(candidate instanceof StringToken);
         if (needsConversion) {
            candidate = PString.toIString(candidate);
         }

         if (getLanguage() == null) {
            throw new IllegalArgumentException("*** No language defined for parselet: " + this);
         }

         if (candidate != null && ignoreCaseInKeywords) {
            candidate = new ToLowerWrapper((IString) candidate);
         }

         final boolean isKeyword = ((BaseLanguage) getLanguage()).getKeywords().contains(candidate);
         return isKeyword ? "Identifiers cannot be keywords" : null;
      }
   };
   {
      identifier.cacheResults = true;
   }

   /**
    * One alphanumeric character, any number of further alphanumeric characters, then spacing.
    * Used for skipOnError parselet and for TestLogFilter to match {alphaNumChar}_{id} - where normal identifier
    * would consume the _id part
    */
   public Sequence alphaNumString = new Sequence("<alphaNumString>('','',)", alphaNumChar,
           new Sequence("('')", REPEAT | OPTIONAL, alphaNumChar), spacing);

   /** Used for log filter patterns - to match either quote character (double or single) */
   public SymbolChoice quoteChar = new SymbolChoice("\"", "'");

   /**
    * Builds the skipOnError parselet for a repeating parselet.  It consumes the next error token while the parser
    * tries to skip out of an incomplete body, so it must only match text which is safe to skip and must never
    * match text which would ordinarily complete the parent.
    *
    * @param name prefix of the new parselet's name; "(.,.)" is appended to it
    * @param exitSymbols the symbols which must not be skipped
    * @return a choice between an alphanumeric string and a negated choice over the exit symbols followed by spacing
    */
   public Parselet createSkipOnErrorParselet(String name, String... exitSymbols) {
      final SymbolChoice notExitSymbol = new SymbolChoice(NOT, exitSymbols);
      final Sequence skippableToken = new Sequence(notExitSymbol, spacing);
      return new OrderedChoice(name + "(.,.)", alphaNumString, skippableToken);
   }

   // A copy of identifier which is given a different name string
   public Sequence identifierSp = (Sequence) identifier.copy();
   {
      identifierSp.setName("('','',)");
   }

   // An identifier which may be absent
   public Sequence optIdentifier = new Sequence("(.)", OPTIONAL, identifier);

   // One identifier followed by any number of "." + identifier pairs
   public Sequence qualifiedIdentifier = new Sequence("('','')", identifier,
           new Sequence("('','')", OPTIONAL | REPEAT, new SymbolSpace("."), identifier));

   // A qualified identifier which may be absent
   public Sequence optQualifiedIdentifier = new Sequence("('')", OPTIONAL, qualifiedIdentifier);

   // Escape sequences in string and character literals, tried in order: one of the fixed single-character escapes,
   // a unicode escape with exactly four hex digits, or a backslash followed by one to three octal digits
   OrderedChoice escapeSequence = new OrderedChoice("<escape>",
           new SymbolChoice("\\b","\\t","\\n","\\f","\\r","\\\"","\\\\","\\'"),
           new Sequence("('','','','','')", new Symbol("\\u"), hexDigit, hexDigit, hexDigit, hexDigit),
           new Sequence("('','','','')", new Symbol("\\"), octalDigit, optOctalDigit, optOctalDigit));

   // Negated choice: anything other than a backslash, a double quote, a newline or EOF
   public SymbolChoice escapedStringBody = new SymbolChoice(NOT, "\\", "\"", "\n", EOF);
   // Contents of a double-quoted string: any mix of escape sequences and ordinary body characters
   public Parselet escapedString = new OrderedChoice("('','')", OPTIONAL | REPEAT, escapeSequence, escapedStringBody);
   // Same for single-quoted strings: the excluded quote character is "'" instead
   public Parselet escapedSingleQuoteString = new OrderedChoice("('','')", OPTIONAL | REPEAT, escapeSequence, new SymbolChoice(NOT, "\\", "'", "\n", EOF));
   {
      escapedString.styleName = escapedSingleQuoteString.styleName = "string";
      // NOTE(review): only the single-quote variant gets an explicit setLanguage call - presumably because it is not
      // reachable from the other parselets of this language; confirm
      escapedSingleQuoteString.setLanguage(this);
   }

   /**
    * A keyword followed by spacing.  A negative lookahead for an identifier character follows the symbol, so that
    * e.g. "returnSpace" is rejected when "return" is the keyword.  Styled as "keyword".
    */
   public class KeywordSpace extends Sequence {
      /**
       * @param name the parselet name
       * @param options parselet options; NOERROR is always added and only IGNORE_CASE is passed on to the symbol
       * @param symbol the keyword text
       */
      public KeywordSpace(String name, int options, String symbol) {
         super(name, options | NOERROR);
         final Symbol keywordSymbol = new Symbol(NOERROR | (options & IGNORE_CASE), symbol);
         final Sequence notIdentifierChar = new Sequence(NOT | LOOKAHEAD | NOERROR, identifierChar);
         add(keywordSymbol, notIdentifierChar, spacing);
         styleName = "keyword";
      }

      /** Derives the parselet name from the keyword, prefixed by "opt_" and/or "ic_" according to the options. */
      public KeywordSpace(String symbol, int options) {
         this("<"
                 + ((options & OPTIONAL) == 0 ? "" : "opt_")
                 + ((options & IGNORE_CASE) == 0 ? "" : "ic_")
                 + "keyword_" + symbol + ">('',,)",
              options, symbol);
      }

      /** Creates the keyword with no extra options. */
      public KeywordSpace(String symbol) {
         this(symbol, 0);
      }
   }

   /** A KeywordSpace whose child at index 2 is replaced by spacingEOL. */
   public class KeywordNewline extends KeywordSpace {
      public KeywordNewline(String symbol) {
         super(symbol);
         // Index 2 is the spacing slot: the KeywordSpace constructor adds the symbol, the lookahead, then spacing
         set(2, spacingEOL);
      }
   }

   /**
    * A choice between several keywords.  As with KeywordSpace, a negative lookahead for an identifier character
    * follows the choice.  Trailing spacing is added only by the constructors which take doSpacing.
    * Styled as "keyword".
    */
   public class KeywordChoice extends Sequence {
      /** Creates an empty choice; values can be supplied later with {@link #add(String)}. */
      public KeywordChoice() {
         this(0);
      }

      /**
       * @param name the parselet name
       * @param options parselet options; NOERROR is always added and only IGNORE_CASE is passed on to the choice
       * @param doSpacing when true, spacing is appended as a further child
       * @param expectedValues the keywords to choose from
       */
      public KeywordChoice(String name, int options, boolean doSpacing, String... expectedValues) {
         super(name, options | NOERROR);
         final SymbolChoice keywords = new SymbolChoice(NOERROR | (options & IGNORE_CASE), expectedValues);
         final Sequence notIdentifierChar = new Sequence(NOT | LOOKAHEAD | NOERROR, identifierChar);
         add(keywords, notIdentifierChar);
         if (doSpacing) {
            add(spacing);
         }
         styleName = "keyword";
      }

      /** Same as the four argument form with the name "('',)" and no trailing spacing. */
      public KeywordChoice(int options, String... expectedValues) {
         this("('',)", options, false, expectedValues);
      }

      /** No options; the name is chosen to match the presence or absence of the spacing child. */
      public KeywordChoice(boolean doSpacing, String... expectedValues) {
         this(doSpacing ? "('',,)" : "('',)", 0, doSpacing, expectedValues);
      }

      /** No options and no trailing spacing. */
      public KeywordChoice(String... expectedValues) {
         this(0, expectedValues);
      }

      /** Adds a new choice after this is constructed. */
      public void add(String newValue) {
         final SymbolChoice keywords = (SymbolChoice) parselets.get(0);
         keywords.add(newValue);
      }
   }

   /** A fixed symbol followed by spacing. */
   public class SymbolSpace extends Sequence {
      /** The parselet which matches the symbol itself; created with NOERROR. */
      Symbol symbolParselet;

      /**
       * Creates a symbol space token with a delimiter for output purposes.  This
       * delimiter is appended to the symbol on output.
       *
       * @param symbol The symbol to match
       * @param delim The delimiter
       * @param options Any options; only IGNORE_CASE is passed on to the symbol parselet
       */
      public SymbolSpace(String symbol, String delim, int options) {
         // NOERROR used to be included here, but errors are needed for ; and these parselets seem important in general
         super("('',):(.," + delim + ")", options);
         symbolParselet = new Symbol(NOERROR | (options & IGNORE_CASE), symbol);
         add(symbolParselet, spacing);
      }

      /**
       * @param symbol The symbol to match
       * @param options Any options; only IGNORE_CASE is passed on to the symbol parselet
       */
      public SymbolSpace(String symbol, int options) {
         super("('',)", options);
         symbolParselet = new Symbol(NOERROR | (options & IGNORE_CASE), symbol);
         add(symbolParselet, spacing);
      }

      /** Delimiter form with the NOERROR option. */
      public SymbolSpace(String symbol, String delim) {
         this(symbol, delim, NOERROR);
      }

      /** Plain form with the NOERROR option. */
      public SymbolSpace(String symbol) {
         this(symbol, NOERROR);
      }

      /** Describes this parselet by the expected value of its first child. */
      public String toString() {
         final Symbol first = (Symbol) parselets.get(0);
         return "Symbol: '" + first.expectedValue + "'";
      }

      /** Forwards the excluded values to the symbol parselet. */
      public void addExcludedValues(String... excludedValues) {
         symbolParselet.addExcludedValues(excludedValues);
      }

      /** Sets the style name of the symbol parselet only. */
      public void setSymbolStyleName(String styleName) {
         symbolParselet.styleName = styleName;
      }
   }

   /** Like SymbolSpace but styled as a keyword: the "keyword" style is set on the symbol parselet, not on the sequence. */
   public class KeywordSymbolSpace extends SymbolSpace {
      public KeywordSymbolSpace(String symbol) {
         super(symbol);
         symbolParselet.styleName = "keyword";
      }
   }

   /** A plain Symbol (no trailing spacing) whose style name is "keyword". */
   public class KeywordSymbol extends Symbol {
      /**
       * @param ev the expected value, passed to the Symbol constructor
       */
      public KeywordSymbol(String ev) {
         super(ev);
         styleName = "keyword";
      }
   }

   /** Shared error used by the semantic parselets of this class when they are asked to generate a null value. */
   private static final GenerateError MISSING_SEMANTIC_VALUE = new GenerateError("Null value for semantic parselet");

   /**
    * The SemanticToken is parsed like SymbolSpace (a symbol followed by spacing).  When the model is generated
    * back into a language representation, the symbol is used only when its corresponding value is not null, or
    * if a boolean if the value is true.
    */
   public class SemanticToken extends SymbolSpace {
      SemanticToken(String symbol, int options) {
         super(symbol, options);
      }
      SemanticToken(String symbol) {
         this(symbol, 0);
      }

      public Object generate(GenerateContext ctx, Object value) {
         if (value == null)
            return ctx.error(this, MISSING_SEMANTIC_VALUE, value, 0);

         return super.generate(ctx, value);
      }
   }

   /**
    * A SemanticSequence is parsed like a regular sequence.  During generation however, a null semantic value
    * makes this rule report an error rather than match, so no output is produced for it.
    */
   public class SemanticSequence extends Sequence {
      SemanticSequence(String symbol, Parselet ...values) {
         this(symbol, 0, values);
      }

      SemanticSequence(String symbol, int options, Parselet ...values) {
         super(symbol, options, values);
      }

      /** Generates like Sequence for a non-null value, otherwise reports the missing-value error. */
      public Object generate(GenerateContext ctx, Object value) {
         if (value != null) {
            return super.generate(ctx, value);
         }
         return ctx.error(this, MISSING_SEMANTIC_VALUE, value, 0);
      }
   }

   /** A choice between several fixed symbols, followed by spacing. */
   public class SymbolChoiceSpace extends Sequence {
      /** The parselet which matches one of the symbols; created with NOERROR. */
      SymbolChoice choice;

      /** Default name and no options. */
      public SymbolChoiceSpace(String...values) {
         this(0, values);
      }

      /** Default name with the given options. */
      public SymbolChoiceSpace(int options, String...values) {
         this("('',)", options, values);
      }

      /**
       * @param name the parselet name
       * @param options parselet options; NOERROR is always added and only IGNORE_CASE is passed on to the choice
       * @param values the symbols to choose from
       */
      public SymbolChoiceSpace(String name, int options, String...values) {
         super(name, options | NOERROR);
         choice = new SymbolChoice(NOERROR | (options & IGNORE_CASE), values);
         add(choice, spacing);
      }

      /** Single symbol form: default name and no options. */
      public SymbolChoiceSpace(String symbol) {
         this("('',)", 0, symbol);
      }

      /** Forwards the additional symbols to the underlying choice. */
      public void add(String...choices) {
         choice.add(choices);
      }

      /** Forwards the symbols to the underlying choice's set method. */
      public void set(String...choices) {
         choice.set(choices);
      }

      /** Forwards the excluded values to the underlying choice. */
      public void addExcludedValues(String... excluded) {
         choice.addExcludedValues(excluded);
      }
   }

   public class SemanticTokenChoice extends SymbolChoiceSpace {
      SemanticTokenChoice(String...values) {
         super(values);
      }

      public Object generate(GenerateContext ctx, Object value) {
         if (value == null)
            return MISSING_SEMANTIC_VALUE;

         return super.generate(ctx, value);
      }
   }

   /** Creates the language without a layer by passing null to {@link #BaseLanguage(Layer)}. */
   public BaseLanguage() {
      this(null);
   }

   /**
    * @param layer passed unchanged to the Language superclass constructor
    */
   public BaseLanguage(Layer layer) {
      super(layer);
   }
}