Rustの字句
以下はRust 1.15.1の syntax::parse::lexer をもとに作成したPEG風の字句規則である。
IdentStart <- [a-zA-Z_]
            / # Any Unicode scalar value >= 0x80 with XID_Start property
IdentContinue <- [a-zA-Z0-9_]
               / # Any Unicode scalar value >= 0x80 with XID_Continue property
Whitespace <- # Any Unicode scalar value with PATTERN_WHITE_SPACE property
Ascii <- # Unicode scalar value from 0 to 0x7f, inclusive
Eof <- !.

Underscore <- "_" !IdentContinue
As <- "as" !IdentContinue
Box <- "box" !IdentContinue
Continue <- "continue" !IdentContinue
Crate <- "crate" !IdentContinue
Else <- "else" !IdentContinue
Enum <- "enum" !IdentContinue
Extern <- "extern" !IdentContinue
False <- "false" !IdentContinue
Fn <- "fn" !IdentContinue
If <- "if" !IdentContinue
Impl <- "impl" !IdentContinue
In <- "in" !IdentContinue
Let <- "let" !IdentContinue
Loop <- "loop" !IdentContinue
Match <- "match" !IdentContinue
Mod <- "mod" !IdentContinue
Move <- "move" !IdentContinue
Mut <- "mut" !IdentContinue
Pub <- "pub" !IdentContinue
Ref <- "ref" !IdentContinue
Return <- "return" !IdentContinue
SelfValue <- "self" !IdentContinue
SelfType <- "Self" !IdentContinue
Static <- "static" !IdentContinue
Struct <- "struct" !IdentContinue
Super <- "super" !IdentContinue
Trait <- "trait" !IdentContinue
True <- "true" !IdentContinue
Type <- "type" !IdentContinue
Unsafe <- "unsafe" !IdentContinue
Use <- "use" !IdentContinue
Where <- "where" !IdentContinue
While <- "while" !IdentContinue
Reserved <- ("abstract" / "alignof" / "become" / "do" / "final" / "macro"
           / "offsetof" / "override" / "priv" / "proc" / "pure" / "sizeof"
           / "typeof" / "unsized" / "virtual" / "yield") !IdentContinue
Keywords <- Underscore / As / Box / Continue / Crate / Else / Enum / Extern
          / False / Fn / If / Impl / In / Let / Loop / Match / Mod / Move
          / Mut / Pub / Ref / Return / SelfValue / SelfType / Static / Struct
          / Super / Trait / True / Type / Unsafe / Use / Where / While
          / Reserved

Ident <- !("r\"" / "r#" / "b\"" / "b'" / "br\"" / "br#" / Keywords)
         IdentStart IdentContinue*
Lifetime <- "'" (!(Keywords) IdentStart IdentContinue*) !("'")
          / "'static" !("'")

# These are usually treated as an Ident or Lifetime,
# but considered to be a keyword in special contexts.
Default <- "default" !IdentContinue
StaticLifetime <- "'static" !IdentContinue
Union <- "union" !IdentContinue

FloatExponent <- [eE] [+-]? [0-9_]+
FloatValue <- [0-9_]+ "." !("." / IdentStart) [0-9_]* FloatExponent?
            / !("0e" / "0E") # Why this? Maybe just a bug
              [0-9_]+ FloatExponent
IntegerValue <- "0b" [01_]+
              / "0o" [0-7_]+
              / "0x" [0-9a-fA-F_]+
              / [0-9_]+ !([.eE])
NumberValue <- FloatValue / IntegerValue
NumberLiteral <- NumberValue (IdentStart IdentContinue*)?

ByteEsc <- "\\n" / "\\r" / "\\t" / "\\\\" / "\\'" / "\\\"" / "\\0"
         / "\\x" [0-9a-fA-F][0-9a-fA-F]
         / (!['"\r\n\t\\] Ascii)
CharEsc <- "\\n" / "\\r" / "\\t" / "\\\\" / "\\'" / "\\\"" / "\\0"
         / "\\x" [0-7][0-9a-fA-F]
         # Constraint 1: up to 6 digits
         # Constraint 2: must represent a Unicode scalar value
         / "\\u{" [0-9a-fA-F]+ "}"
         / (!['"\r\n\t\\] .)
NewlineEsc <- ("\\\n" / "\\\r\n") Whitespace*
StringLike <- "'" (CharEsc / "\"") "'"
            / "b'" (ByteEsc / "\"") "'"
            / "\"" (CharEsc / NewlineEsc / "\r\n" / ['\n\t])* "\""
            / "b\"" (ByteEsc / NewlineEsc / "\r\n" / ['\n\t])* "\""
            / "r\"" (!"\"" ("\r\n" / [^\r]))* "\""
            / "r#\"" (!"\"#" ("\r\n" / [^\r]))* "\"#"
            / "r##\"" (!"\"##" ("\r\n" / [^\r]))* "\"##"
            / "r###\"" (!"\"###" ("\r\n" / [^\r]))* "\"###"
            / "r####\"" (!"\"####" ("\r\n" / [^\r]))* "\"####"
            / "r#####\"" (!"\"#####" ("\r\n" / [^\r]))* "\"#####"
            ... (for arbitrary number of #s) ...
            / "br\"" (!"\"" Ascii)* "\""
            / "br#\"" (!"\"#" Ascii)* "\"#"
            / "br##\"" (!"\"##" Ascii)* "\"##"
            / "br###\"" (!"\"###" Ascii)* "\"###"
            / "br####\"" (!"\"####" Ascii)* "\"####"
            / "br#####\"" (!"\"#####" Ascii)* "\"#####"
            ... (for arbitrary number of #s) ...
StringLikeLiteral <- StringLike (IdentStart IdentContinue*)?

TokenInner <- Ident / Lifetime / Keywords / NumberLiteral / StringLikeLiteral
            / ";" / "," / "(" / ")" / "{" / "}" / "[" / "]"
            / "@" / "#" / "~" / "?" / "$"
            / "+" / "*" / "/" / "^" / "%"
            / ".." / "." / "::" / ":"
            / "==" / "=>" / "=" / "!=" / "!"
            / "<=" / "<<" / "<-" / "<"
            / ">=" / ">>" / ">"
            / "->" / "-"
            / "&&" / "&" / "||" / "|"

NestedDocComment <- "/*" (!"*/" (NestedDocComment / "\r\n" / [^\r]))* "*/"
NestedComment <- "/*" (!"*/" (NestedComment / .))* "*/"
DocComment <- ("///" !"/" / "//!") [^\r\n]* &("\r\n" / "\n" / Eof)
            / ("/**" / "/*!") (!"*/" (NestedDocComment / "\r\n" / [^\r]))* "*/"
NormalComment <- "//" !("/" / "!") [^\n]* &("\n" / Eof)
               / "////" [^\r\n]* &("\r\n" / "\n" / Eof)
               / "/*" ![*!] (!"*/" (NestedComment / .))* "*/"
WhitespaceOrComment <- DocComment / NormalComment / Whitespace+
ShebangComment <- "#!" !"[" [^\n]*

Source <- ShebangComment? (WhitespaceOrComment / TokenInner)* Eof
凡例
- ここで文字といった場合はUnicode scalar value (Unicodeで定義される0以上0x10FFFF以下のコードポイントのうち、サロゲートペアのための0xD800から0xDFFFまでのコードポイントを除いたもの)である。
- <- は非終端記号を定義する。
- ダブルクオートで囲まれている部分は、それが示す文字列自身にマッチする。
- [] で囲まれている部分は、それが示す文字クラスのうちの文字1文字にマッチする。
- . は任意の1文字にマッチする。
- / は左を優先的に試し、失敗したら右を試す。ただし今回の文法でこの非対称性を使っている場面はあまり多くない。
- T* は T を貪欲に0個以上読む。
- T+ は T を貪欲に1個以上読む。1つも読めなかったら失敗とみなす。
- T? は T を貪欲に0個か1個読む。
- !T は T の否定先読み。
- &T は T の肯定先読み。