! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays assocs combinators combinators.short-circuit interval-maps kernel locals math.parser memoize multiline peg.ebnf regexp.ast regexp.classes sequences sets splitting strings unicode unicode.data unicode.script ; IN: regexp.parser : allowed-char? ( ch -- ? ) ".()|[*+?$^" member? not ; ERROR: bad-number ; : ensure-number ( n -- n ) [ bad-number ] unless* ; :: at-error ( key assoc quot: ( key -- replacement ) -- value ) key assoc at* [ drop key quot call ] unless ; inline ERROR: bad-class name ; : simple ( str -- simple ) ! Alternatively, first collation key level? >case-fold [ " \t_" member? ] reject ; : simple-table ( seq -- table ) [ [ simple ] keep ] H{ } map>assoc ; MEMO: simple-script-table ( -- table ) script-table interval-values members simple-table ; MEMO: simple-category-table ( -- table ) categories simple-table ; : parse-unicode-class ( name -- class ) { { [ dup { [ length 1 = ] [ first "clmnpsz" member? ] } 1&& ] [ >upper first ] } { [ dup >title categories member? ] [ simple-category-table at ] } { [ "script=" ?head ] [ dup simple-script-table at [ ] [ "script=" prepend bad-class ] ?if ] } [ bad-class ] } cond ; : unicode-class ( name -- class ) dup parse-unicode-class [ ] [ bad-class ] ?if ; : name>class ( name -- class ) >string simple { { "lower" letter-class } { "upper" LETTER-class } { "alpha" Letter-class } { "ascii" ascii-class } { "digit" digit-class } { "alnum" alpha-class } { "punct" punctuation-class } { "graph" java-printable-class } { "blank" non-newline-blank-class } { "cntrl" control-character-class } { "xdigit" hex-digit-class } { "space" java-blank-class } } [ unicode-class ] at-error ; : lookup-escape ( char -- ast ) { { ch'a [ 0x7 ] } { ch'e [ 0x1b ] } { ch'f [ 0xc ] } ! { ch'f [ ch'\f ] } { ch'n [ ch'\n ] } { ch'r [ ch'\r ] } { ch't [ ch'\t ] } { ch'v [ ch'\v ] } { ch'0 [ ch'\0 ] } { ch'\\ [ ch'\\ ] } { ch'w [ c-identifier-class ] } { ch'W [ c-identifier-class ] } { ch's [ java-blank-class ] } { ch'S [ java-blank-class ] } { ch'd [ digit-class ] } { ch'D [ digit-class ] } { ch'z [ end-of-input ] } { ch'Z [ end-of-file ] } { ch'A [ beginning-of-input ] } { ch'b [ word-break ] } { ch'B [ word-break ] } [ ] } case ; : options-assoc ( -- assoc ) H{ { ch'i case-insensitive } { ch'd unix-lines } { ch'm multiline } { ch'r reversed-regexp } { ch's dotall } } ; ERROR: nonexistent-option name ; : ch>option ( ch -- singleton ) dup options-assoc at [ ] [ nonexistent-option ] ?if ; : option>ch ( option -- string ) options-assoc value-at ; : parse-options ( on off -- options ) [ [ ch>option ] { } map-as ] bi@ ; : string>options ( string -- options ) "-" split1 parse-options ; : options>string ( options -- string ) [ on>> ] [ off>> ] bi [ [ option>ch ] map ] bi@ [ "-" glue ] unless-empty "" like ; ! TODO: add syntax for various parenthized things, ! add greedy and nongreedy forms of matching ! (once it's all implemented) EBNF: parse-regexp [=[ CharacterInBracket = !("}") Character QuotedCharacter = !("\\E") . Escape = "p{" CharacterInBracket*:s "}" => [[ s name>class ]] | "P{" CharacterInBracket*:s "}" => [[ s name>class ]] | "Q" QuotedCharacter*:s "\\E" => [[ s ]] | "u" Character:a Character:b Character:c Character:d => [[ { a b c d } hex> ensure-number ]] | "x" Character:a Character:b => [[ { a b } hex> ensure-number ]] | "0" Character:a Character:b Character:c => [[ { a b c } oct> ensure-number ]] | . => [[ lookup-escape ]] EscapeSequence = "\\" Escape:e => [[ e ]] Character = EscapeSequence | "$" => [[ $ ]] | "^" => [[ ^ ]] | . ?[ allowed-char? ]? AnyRangeCharacter = !("&&"|"||"|"--"|"~~") (EscapeSequence | .) RangeCharacter = !("]") AnyRangeCharacter Range = RangeCharacter:a "-" !("-") RangeCharacter:b => [[ a b ]] | RangeCharacter StartRange = AnyRangeCharacter:a "-" !("-") RangeCharacter:b => [[ a b ]] | AnyRangeCharacter Ranges = StartRange:s Range*:r => [[ r s prefix ]] BasicCharClass = "^"?:n Ranges:e => [[ e n char-class ]] CharClass = BasicCharClass:b "&&" CharClass:c => [[ b c 2array ]] | BasicCharClass:b "||" CharClass:c => [[ b c 2array ]] | BasicCharClass:b "~~" CharClass:c => [[ b c ]] | BasicCharClass:b "--" CharClass:c => [[ b c ]] | BasicCharClass Options = [idmsux]* Parenthized = "?:" Alternation:a => [[ a ]] | "?" Options:on "-"? Options:off ":" Alternation:a => [[ a on off parse-options ]] | "?#" [^)]* => [[ f ]] | "?~" Alternation:a => [[ a ]] | "?=" Alternation:a => [[ a ]] | "?!" Alternation:a => [[ a ]] | "?<=" Alternation:a => [[ a ]] | "? [[ a ]] | Alternation Element = "(" Parenthized:p ")" => [[ p ]] | "[" CharClass:r "]" => [[ r ]] | ".":d => [[ dot ]] | Character Number = (!(","|"}").)* => [[ string>number ensure-number ]] Times = "," Number:n "}" => [[ 0 n ]] | Number:n ",}" => [[ n ]] | Number:n "}" => [[ n n ]] | "}" => [[ bad-number ]] | Number:n "," Number:m "}" => [[ n m ]] Repeated = Element:e "{" Times:t => [[ e t ]] | Element:e "??" => [[ e ]] | Element:e "*?" => [[ e ]] | Element:e "+?" => [[ e ]] | Element:e "?" => [[ e ]] | Element:e "*" => [[ e ]] | Element:e "+" => [[ e ]] | Element Concatenation = Repeated*:r => [[ r sift ]] Alternation = Concatenation:c ("|" Concatenation)*:a => [[ a empty? [ c ] [ a values c prefix ] if ]] End = !(.) Main = Alternation End ]=]