Add a grammar (in antlr4 format) to the WSL spec.

https://bugs.webkit.org/show_bug.cgi?id=186310 Rubberstamped by Filip Pizlo It is just the raw rules, without many comments and no typesetting. Compiles to any of Java/JS/C++/etc. with antlr4 (requires a JDK). It does not exactly match the parser of the current js implementation of WSL, it: - Accepts negative literals - Reserves keywords fallthrough/auto - Refuses '_' as a valid identifier - Accepts several capitalizations for Null/True/False - Accepts variable declarations at the top-level - Correctly parses ternary expressions - Forbids empty extension list for protocols - Does not allow a space between '[' and ']' in type suffixes - Correctly parses nested generics - Accepts a 'fallthrough;' statement (for switches) - Refuses chained relational operators (e.g. x < y < z) - Generally a completely different structure for parsing effectful expressions, although it probably accepts about the same language - Currently only accepts literals and identifiers as constexpr (while the js parser accepts way too much, including '(x += 42)') There are probably more differences that I missed. The js parser will probably have to be mostly rewritten to follow the spec more closely (and fix all the bugs). I will try to see if it can be automatically derived from the antlr4 grammar. * WebGPUShadingLanguageRI/SpecWork/WSL.g4: Added. git-svn-id: http://svn.webkit.org/repository/webkit/trunk@232514 268f45cc-cd09-0410-ab3c-d52691b4dbfc
wqyfavor · Jun 5, 2018 · 303def7 · 303def7
1 parent c90330a
commit 303def7
Show file tree

Hide file tree

Showing 2 changed files with 262 additions and 0 deletions.
diff --git a/Tools/ChangeLog b/Tools/ChangeLog
@@ -1,3 +1,32 @@
+2018-06-05  Robin Morisset  <[email protected]>
+
+        Add a grammar (in antlr4 format) to the WSL spec.
+        https://bugs.webkit.org/show_bug.cgi?id=186310
+
+        Rubberstamped by Filip Pizlo
+
+        It is just the raw rules, without many comments and no typesetting.
+        Compiles to any of Java/JS/C++/etc. with antlr4 (requires a JDK).
+
+        It does not exactly match the parser of the current js implementation of WSL, it:
+        - Accepts negative literals
+        - Reserves keywords fallthrough/auto
+        - Refuses '_' as a valid identifier
+        - Accepts several capitalizations for Null/True/False
+        - Accepts variable declarations at the top-level
+        - Correctly parses ternary expressions
+        - Forbids empty extension list for protocols
+        - Does not allow a space between '[' and ']' in type suffixes
+        - Correctly parses nested generics
+        - Accepts a 'fallthrough;' statement (for switches)
+        - Refuses chained relational operators (e.g. x < y < z)
+        - Generally a completely different structure for parsing effectful expressions, although it probably accepts about the same language
+        - Currently only accepts literals and identifiers as constexpr (while the js parser accepts way too much, including '(x += 42)')
+        There are probably more differences that I missed. The js parser will probably have to be mostly rewritten to follow the spec more closely (and fix all the bugs).
+        I will try to see if it can be automatically derived from the antlr4 grammar.
+
+        * WebGPUShadingLanguageRI/SpecWork/WSL.g4: Added.
+
 2018-06-04  Frederic Wang  <[email protected]>
 
         import-w3c-tests should rely on <meta name="flags"> to detect CSS manual tests

diff --git a/Tools/WebGPUShadingLanguageRI/SpecWork/WSL.g4 b/Tools/WebGPUShadingLanguageRI/SpecWork/WSL.g4
@@ -0,0 +1,233 @@
+grammar WSL;
+
+/*
+ * Lexer
+ */
+Whitespace: [ \t\r\n]+ -> skip ; // Runs of spaces, tabs, CRs and LFs are matched and discarded, so they never reach the parser.
+
+// Note: we forbid leading 0s in decimal integers; a lone '0' is the only decimal literal that starts with 0. to bikeshed.
+fragment CoreDecimalIntLiteral: '0' | [1-9] [0-9]* ;
+// Note: we allow a leading '-' but not a leading '+' in all kinds of numeric literals. to bikeshed.
+fragment DecimalIntLiteral: '-'? CoreDecimalIntLiteral ;
+fragment DecimalUIntLiteral: CoreDecimalIntLiteral 'u' ;
+fragment CoreHexadecimalIntLiteral: '0x' [0-9a-fA-F]+ ;
+fragment HexadecimalIntLiteral: '-'? CoreHexadecimalIntLiteral;
+fragment HexadecimalUIntLiteral: CoreHexadecimalIntLiteral 'u';
+fragment IntLiteral: DecimalIntLiteral | DecimalUIntLiteral | HexadecimalIntLiteral | HexadecimalUIntLiteral ;
+// Do we want to allow underscores in the middle of numbers for readability?
+
+fragment CoreFloatLiteral: [0-9]+'.'[0-9]* | [0-9]*'.'[0-9]+ ; // Digits are required on at least one side of the '.': "1.", ".5" and "1.5" match, a lone "." does not.
+fragment FloatLiteral: '-'? CoreFloatLiteral [fd]? ; // Optional leading '-', then the digits, then an optional one-letter suffix 'f' or 'd'.
+// TODO: what to do about floats that are too big or too small to represent?
+// TODO: what is the default precision? double?
+// IDEA: add NaN, +infinity, -infinity
+// IDEA: add half-precision literals
+
+// One rule per keyword, to prevent them from being recognized as identifiers. They are declared ahead of Identifier because ANTLR4 gives equal-length lexer matches to the rule that appears first.
+STRUCT: 'struct';
+PROTOCOL: 'protocol';
+TYPEDEF: 'typedef';
+ENUM: 'enum';
+OPERATOR: 'operator'; // bare keyword, used by funcDecl; longer spellings such as 'operator+' are matched by OperatorName instead
+
+IF: 'if';
+ELSE: 'else';
+CONTINUE: 'continue';
+BREAK: 'break';
+SWITCH: 'switch';
+CASE: 'case';
+DEFAULT: 'default';
+FALLTHROUGH: 'fallthrough'; // accepted by the stmt rule as "fallthrough;"
+FOR: 'for';
+WHILE: 'while';
+DO: 'do';
+RETURN: 'return';
+TRAP: 'trap'; // accepted by the stmt rule as "trap;"
+
+fragment NULL: 'null' | 'NULL' ; // NULL, TRUE and FALSE are fragments: they only contribute to the Literal rule and never produce tokens of their own.
+fragment TRUE: 'true' | 'True' ;
+fragment FALSE: 'false' | 'False' ;
+// Note: We could make these three fully case sensitive or insensitive. to bikeshed. NOTE(review): NULL offers an all-caps spelling while TRUE/FALSE offer a capitalized one - confirm the asymmetry is intended.
+
+CONSTANT: 'constant'; // CONSTANT, DEVICE, THREADGROUP and THREAD are the four alternatives of the addressSpace parser rule.
+DEVICE: 'device';
+THREADGROUP: 'threadgroup';
+THREAD: 'thread';
+
+VERTEX: 'vertex'; // VERTEX and FRAGMENT introduce the first alternative of funcDecl.
+FRAGMENT: 'fragment';
+
+NATIVE: 'native';
+RESTRICTED: 'restricted';
+// Note: these could be keywords only in the native mode, but I decided to make them always reserved. to bikeshed.
+
+UNDERSCORE: '_'; // declared before Identifier, so a lone '_' lexes as UNDERSCORE and not as an Identifier
+AUTO: 'auto';
+// Note: these are currently not used by the grammar, but I would like to make them reserved keywords for future expansion of the language. to bikeshed
+
+Literal: IntLiteral | FloatLiteral | NULL | TRUE | FALSE; // Must stay ahead of Identifier: ANTLR4 gives equal-length lexer matches to the rule declared first, so otherwise 'null', 'true' and 'false' (and their capitalized spellings) would lex as Identifier.
+
+fragment ValidIdentifier: [a-zA-Z_] [a-zA-Z0-9_]* ;
+Identifier: ValidIdentifier ;
+// Note: this currently excludes unicode, but allows digits in the middle of identifiers. We could easily restrict or extend this definition. to bikeshed
+
+OperatorName
+    : 'operator' ('>>' | '<<' | '+' | '-' | '*' | '/' | '%' | '&&' | '||' | '&' | '^' | '|' | '>=' | '<=' | '==' | '<' | '>' | '++' | '--' | '!' | '~' | '[]' | '[]=' | '&[]')
+    | 'operator&.' ValidIdentifier
+    | 'operator.' ValidIdentifier '='
+    | 'operator.' ValidIdentifier ;
+// Note: operator!= is not user-definable, it is automatically derived from operator==
+
+/*
+ * Parser: Top-level
+ */
+file: topLevelDecl* EOF ; // Entry rule: zero or more declarations, and the whole input must be consumed (EOF).
+topLevelDecl
+    : ';' // stray semicolons are tolerated between declarations
+    | variableDecls ';' // variable declarations are accepted at the top level
+    | typeDef
+    | structDef
+    | enumDef
+    | funcDef
+    | nativeFuncDecl
+    | nativeTypeDecl
+    | protocolDecl ;
+
+typeDef: TYPEDEF Identifier typeParameters '=' type ';' ; // shape: "typedef Name<T> = Type;" (typeParameters may be empty)
+
+structDef: STRUCT Identifier typeParameters '{' structElement* '}' ; // zero or more fields; no ';' after the closing brace
+structElement: type Identifier ';' ; // exactly one name per field declaration, no initializer
+
+enumDef: ENUM Identifier (':' type)? '{' enumMember (',' enumMember)* '}' ; // at least one member is required; the ': type' part is optional
+// Note: we could allow an extra ',' at the end of the list of enumMembers, a la Rust, to make it easier to reorder the members. to bikeshed
+enumMember: Identifier ('=' constexpr)? ; // an explicit value is limited to what constexpr accepts
+
+funcDef: RESTRICTED? funcDecl block; // a definition is a declaration followed by a brace-delimited body
+funcDecl
+    : (VERTEX | FRAGMENT) type Identifier parameters // vertex/fragment form: takes neither typeParameters nor an OperatorName
+    | type (Identifier | OperatorName) typeParameters parameters // ordinary function or named operator
+    | OPERATOR typeParameters type parameters ; // cast operator: the return type follows the 'operator' keyword
+// Note: the return type is moved in a different place for operator casts, as a hint that it plays a role in overload resolution. to bikeshed
+parameters
+    : '(' ')'
+    | '(' parameter (',' parameter)* ')' ;
+parameter: type Identifier? ; // the parameter name is optional
+
+nativeFuncDecl: RESTRICTED? NATIVE funcDecl ';' ; // declaration only: ends with ';' where funcDef has a block
+nativeTypeDecl: NATIVE TYPEDEF Identifier typeParameters ';' ;
+
+protocolDecl: PROTOCOL Identifier (':' protocolRef (',' protocolRef)*)? '{' (funcDecl ';')* '}' ;
+// Note: I forbid empty extension lists in protocol declarations (a ':' must be followed by at least one protocolRef), while the original js parser allowed them. to bikeshed
+protocolRef: Identifier ;
+
+/*
+ * Parser: Types 
+ */
+typeParameters
+    : '<' typeParameter (',' typeParameter)* '>'
+    | ('<' '>')?; // this alternative also matches the empty string, i.e. no type parameters at all
+// Note: contrary to C++ for example, we allow '<>' and consider it equivalent to having no type parameters at all. to bikeshed
+typeParameter
+    : type Identifier // a "type name" pair
+    | Identifier (':' protocolRef ('+' protocolRef)*)? ; // a bare name, optionally followed by ':' and a '+'-separated list of protocols
+
+type
+    : addressSpace Identifier typeArguments typeSuffixAbbreviated* // address space written once up front; the suffixes carry none
+    | Identifier typeArguments typeSuffixNonAbbreviated* ; // no leading address space; each '*' and '[]' suffix carries its own
+addressSpace: CONSTANT | DEVICE | THREADGROUP | THREAD ;
+typeSuffixAbbreviated: '*' | '[]' | '[' constexpr ']';
+typeSuffixNonAbbreviated: '*' addressSpace | '[]' addressSpace | '[' constexpr ']' ;
+// Note: in this formulation of typeSuffix*, we don't allow whitespace between the '[' and the ']' in '[]'. We easily could at the cost of a tiny bit more lookahead. to bikeshed
+
+typeArguments
+    : '<' (typeArgument ',')* addressSpace? Identifier '<' (typeArgument (',' typeArgument)*)? '>>'
+    //Note: this first alternative is a horrible hack to deal with nested generics that end with '>>' (which the lexer produces as one token, the same one used by possibleShift). As far as I can tell it works fine, but requires arbitrary lookahead.
+    | '<' typeArgument (',' typeArgument)* '>'
+    | ('<' '>')? ; // also matches the empty string
+typeArgument: constexpr | type ; // NOTE(review): a bare Identifier fits both alternatives; ANTLR4 is documented to pick the first one (constexpr) - confirm that is the intended reading.
+
+/* 
+ * Parser: Statements 
+ */
+block: '{' blockBody '}' ;
+blockBody: stmt* ; // also used by switchCase, whose bodies are not brace-delimited
+
+stmt
+    : block
+    | ifStmt
+    | switchStmt
+    | forStmt
+    | whileStmt
+    | doStmt ';' // the ';' closing a do/while is added here, not in doStmt
+    | BREAK ';'
+    | CONTINUE ';'
+    | FALLTHROUGH ';'
+    | TRAP ';'
+    | RETURN expr? ';'
+    | variableDecls ';'
+    | effectfulExpr ';' ; // effectfulExpr can be empty, so a lone ';' is a valid statement
+
+ifStmt: IF '(' expr ')' stmt (ELSE stmt)? ;
+
+switchStmt: SWITCH '(' expr ')' '{' switchCase* '}' ;
+switchCase: (CASE constexpr | DEFAULT) ':' blockBody ; // one label per case; the body is a possibly empty list of statements
+
+forStmt: FOR '(' (variableDecls | effectfulExpr) ';' expr? ';' expr? ')' stmt ; // effectfulExpr can be empty, so all three parts of the header may be omitted
+whileStmt: WHILE '(' expr ')' stmt ;
+doStmt: DO stmt WHILE '(' expr ')' ; // no trailing ';' here: the stmt rule supplies it
+
+variableDecls: type variableDecl (',' variableDecl)* ; // one type shared by a comma-separated list of declarators
+variableDecl: Identifier ('=' expr)? ; // the initializer is optional
+
+/* 
+ * Parser: Expressions
+ */
+constexpr: Literal | Identifier; // deliberately narrow: a single literal or a single name
+
+// Note: we separate effectful expressions from normal expressions, and only allow the former in statement positions, to disambiguate the following:
+// "x * y;". Without this trick, it would look like both an expression and a variable declaration, and could not be disambiguated until name resolution.
+effectfulExpr: ((effAssignment ',')* effAssignment)? ; // either empty or a comma-separated list of effAssignment
+effAssignment
+    : possiblePrefix assignOperator expr
+    | effPrefix ;
+assignOperator: '=' | '+=' | '-=' | '*=' | '/=' | '%=' | '^=' | '&=' | '|=' | '>>=' | '<<=' ;
+effPrefix
+    : ('++' | '--') possiblePrefix // pre-increment / pre-decrement
+    | effSuffix ;
+effSuffix
+    : possibleSuffix ('++' | '--') // post-increment / post-decrement
+    | callExpression
+    | '(' expr ')' ;
+// Note: this last case is to allow craziness like "(x < y ? z += 42 : w += 13);" 
+// TODO: Not sure at all how useful it is, I also still have to double check that it introduces no ambiguity.
+limitedSuffixOperator
+    : '.' Identifier 
+    | '->' Identifier 
+    | '[' expr ']' ; // the suffixes that callExpression may take in possibleSuffix; '++' and '--' are left out
+
+expr: (possibleTernaryConditional ',')* possibleTernaryConditional; // comma-separated list with at least one operand
+// TODO: I tried to mimic https://en.cppreference.com/w/cpp/language/operator_precedence with regards to assignment and ternary conditionals, but it still needs some testing
+possibleTernaryConditional
+    : possibleLogicalBinop '?' expr ':' possibleTernaryConditional // right-recursive, so "a ? b : c ? d : e" nests to the right
+    | possiblePrefix assignOperator possibleTernaryConditional // assignment is right-recursive as well
+    | possibleLogicalBinop ;
+possibleLogicalBinop: possibleRelationalBinop (logicalBinop possibleRelationalBinop)*; // flat operand list: the repeated operand is possibleRelationalBinop, because recursing into possibleLogicalBinop inside the (...)* loop would make the rule ambiguous
+logicalBinop: '||' | '&&' | '|' | '^' | '&' ;
+// Note: the list above puts all five operators at a single level; it may need some manipulation to get the proper precedence and left-to-right associativity
+possibleRelationalBinop: possibleShift (relationalBinop possibleShift)?; // '?' rather than '*': at most one relational operator at this level, so "x < y < z" is rejected
+relationalBinop: '<' | '>' | '<=' | '>=' | '==' | '!=' ; // '!=' is usable in expressions even though OperatorName has no 'operator!='
+// Note: we made relational binops non-associative to better disambiguate "x<y>(z)" into a call expression and not a comparison of comparisons
+// Idea: https://en.cppreference.com/w/cpp/language/operator_comparison#Three-way_comparison
+possibleShift: possibleAdd (('>>' | '<<') possibleAdd)* ;
+possibleAdd: possibleMult (('+' | '-') possibleMult)* ;
+possibleMult: possiblePrefix (('*' | '/' | '%') possiblePrefix)* ;
+possiblePrefix: prefixOp* possibleSuffix ; // any number of prefix operators may be stacked in front of the operand
+prefixOp: '++' | '--' | '+' | '-' | '~' | '!' | '&' | '@' | '*' ;
+possibleSuffix
+    : callExpression limitedSuffixOperator* // a call may be followed by '.', '->' or '[...]' suffixes, but not by '++' or '--'
+    | term (limitedSuffixOperator | '++' | '--')* ;
+callExpression: Identifier typeArguments '(' (possibleTernaryConditional (',' possibleTernaryConditional)*)? ')'; // arguments are possibleTernaryConditional rather than expr, so a top-level ',' always separates two arguments
+term
+    : Literal
+    | Identifier
+    | '(' expr ')' ;