Rustの字句

以下はRust1.15.1の syntax::parse::lexer をもとに作成したPEG風の字句規則である。

IdentStart <- [a-zA-Z_]
            / # Any Unicode scalar value >= 0x80 with XID_Start property
IdentContinue <- [a-zA-Z0-9_]
               / # Any Unicode scalar value >= 0x80 with XID_Continue property
Whitespace <- # Any Unicode scalar value with PATTERN_WHITE_SPACE property
Ascii <- # Unicode scalar value from 0 to 0x7f, inclusive
Eof <- !.

Underscore <- "_" !IdentContinue
As         <- "as"       !IdentContinue
Box        <- "box"      !IdentContinue
Break      <- "break"    !IdentContinue
Const      <- "const"    !IdentContinue
Continue   <- "continue" !IdentContinue
Crate      <- "crate"    !IdentContinue
Else       <- "else"     !IdentContinue
Enum       <- "enum"     !IdentContinue
Extern     <- "extern"   !IdentContinue
False      <- "false"    !IdentContinue
Fn         <- "fn"       !IdentContinue
For        <- "for"      !IdentContinue
If         <- "if"       !IdentContinue
Impl       <- "impl"     !IdentContinue
In         <- "in"       !IdentContinue
Let        <- "let"      !IdentContinue
Loop       <- "loop"     !IdentContinue
Match      <- "match"    !IdentContinue
Mod        <- "mod"      !IdentContinue
Move       <- "move"     !IdentContinue
Mut        <- "mut"      !IdentContinue
Pub        <- "pub"      !IdentContinue
Ref        <- "ref"      !IdentContinue
Return     <- "return"   !IdentContinue
SelfValue  <- "self"     !IdentContinue
SelfType   <- "Self"     !IdentContinue
Static     <- "static"   !IdentContinue
Struct     <- "struct"   !IdentContinue
Super      <- "super"    !IdentContinue
Trait      <- "trait"    !IdentContinue
True       <- "true"     !IdentContinue
Type       <- "type"     !IdentContinue
Unsafe     <- "unsafe"   !IdentContinue
Use        <- "use"      !IdentContinue
Where      <- "where"    !IdentContinue
While      <- "while"    !IdentContinue
Reserved <- ("abstract" / "alignof" / "become" / "do" / "final" / "macro"
           / "offsetof" / "override" / "priv" / "proc" / "pure" / "sizeof"
           / "typeof" / "unsized" / "virtual" / "yield") !IdentContinue
Keywords <- Underscore / As / Box / Break / Const / Continue / Crate / Else
          / Enum / Extern / False / Fn / For / If / Impl / In / Let / Loop
          / Match / Mod / Move / Mut / Pub / Ref / Return / SelfValue
          / SelfType / Static / Struct / Super / Trait / True / Type / Unsafe
          / Use / Where / While
          / Reserved
Ident <- !("r\"" / "r#" / "b\"" / "b'" / "br\"" / "br#" / Keywords)
         IdentStart IdentContinue*
Lifetime <- "'" (!(Keywords) IdentStart IdentContinue*) !("'")
          / "'static" !("'")

# These are usually treated as an Ident or Lifetime,
# but considered to be a keyword in special contexts.
Default <- "default" !IdentContinue
StaticLifetime <- "'static" !IdentContinue
Union <- "union" !IdentContinue

FloatExponent <- [eE] [+-]? [0-9_]+

FloatValue <- [0-9_]+ "." !("." / IdentStart) [0-9_]* FloatExponent?
            / !("0e" / "0E") # Why this? Maybe just a bug
              [0-9_]+ FloatExponent

IntegerValue <- "0b" [01_]+
              / "0o" [0-7_]+
              / "0x" [0-9a-fA-F_]+
              / [0-9_]+ !([.eE])

NumberValue <- FloatValue
             / IntegerValue

NumberLiteral <- NumberValue (IdentStart IdentContinue*)?

ByteEsc <- "\\n" / "\\r" / "\\t" / "\\\\" / "\\'" / "\\\"" / "\\0"
         / "\\x" [0-9a-fA-F][0-9a-fA-F]
         / (!['"\r\n\t\\] Ascii)
CharEsc <- "\\n" / "\\r" / "\\t" / "\\\\" / "\\'" / "\\\"" / "\\0"
         / "\\x" [0-7][0-9a-fA-F]
           # Constraint 1: up to 6 digits
           # Constraint 2: must represent a Unicode scalar value
         / "\\u{" [0-9a-fA-F]+ "}"
         / (!['"\r\n\t\\] .)
NewlineEsc <- ("\\\n" / "\\\r\n") Whitespace*


StringLike <- "'" (CharEsc / "\"") "'"
            / "b'" (ByteEsc / "\"") "'"
            / "\"" (CharEsc / NewlineEsc / "\r\n" / ['\n\t])* "\""
            / "b\"" (ByteEsc / NewlineEsc / "\r\n" / ['\n\t])* "\""
            / "r\"" (!"\"" ("\r\n" / [^\r]))* "\""
            / "r#\"" (!"\"#" ("\r\n" / [^\r]))* "\"#"
            / "r##\"" (!"\"##" ("\r\n" / [^\r]))* "\"##"
            / "r###\"" (!"\"###" ("\r\n" / [^\r]))* "\"###"
            / "r####\"" (!"\"####" ("\r\n" / [^\r]))* "\"####"
            / "r#####\"" (!"\"#####" ("\r\n" / [^\r]))* "\"#####"
              ... (for arbitrary number of #s)
              ...
            / "br\"" (!"\"" Ascii)* "\""
            / "br#\"" (!"\"#" Ascii)* "\"#"
            / "br##\"" (!"\"##" Ascii)* "\"##"
            / "br###\"" (!"\"###" Ascii)* "\"###"
            / "br####\"" (!"\"####" Ascii)* "\"####"
            / "br#####\"" (!"\"#####" Ascii)* "\"#####"
              ... (for arbitrary number of #s)
              ...
StringLikeLiteral <- StringLike (IdentStart IdentContinue*)?

TokenInner <- Ident
            / Lifetime
            / Keywords
            / NumberLiteral
            / StringLikeLiteral
            / ";" / "," / "(" / ")" / "{" / "}" / "[" / "]" / "@" / "#" / "~"
            / "?" / "$" / "+" / "*" / "/" / "^" / "%"
            / ".." / "."
            / "::" / ":"
            / "==" / "=>" / "="
            / "!=" / "!"
            / "<=" / "<<" / "<-" / "<"
            / ">=" / ">>" / ">"
            / "->" / "-"
            / "&&" / "&"
            / "||" / "|"

NestedDocComment <- "/*" (!"*/" (NestedDocComment / "\r\n" / [^\r]))* "*/"
NestedComment <- "/*" (!"*/" (NestedComment / .))* "*/"
DocComment <- ("///" !"/" / "//!") [^\r\n]* &("\r\n" / "\n" / Eof)
            / ("/**" / "/*!") (!"*/" (NestedDocComment / "\r\n" / [^\r]))* "*/"
NormalComment <- "//" !("/" / "!") [^\n]* &("\n" / Eof)
               / "////" [^\r\n]* &("\r\n" / "\n" / Eof)
               / "/*" ![*!] (!"*/" (NestedComment / .))* "*/"

WhitespaceOrComment <- DocComment / NormalComment / Whitespace+

ShebangComment <- "#!" !"[" [^\n]*

Source <- ShebangComment? (WhitespaceOrComment / TokenInner)* Eof

凡例

  • ここで文字といった場合はUnicode scalar value (Unicodeで定義される0以上0x10FFFF以下のコードポイントのうち、サロゲートペアのための0xD800から0xDFFFまでのコードポイントを除いたもの)である。
  • <- は非終端記号を定義する。
  • ダブルクオートで囲まれている部分は、それが示す文字列自身にマッチする。
  • [] で囲まれている部分は、それが示す文字クラスのうちの文字1文字にマッチする。
  • . は任意の1文字にマッチする。
  • / は左を優先的に試し、失敗したら右を試す。ただし今回の文法でこの非対称性を使っている場面はあまり多くない。
  • T* は T を貪欲に0個以上読む。
  • T+ は T を貪欲に1個以上読む。1つも読めなかったら失敗とみなす。
  • T? は T を貪欲に0個か1個読む。
  • !T は T の否定先読み。
  • &T は T の肯定先読み。