diff --git a/extra/regexp/regexp-tests.factor b/extra/regexp/regexp-tests.factor index 94f9ad172f..8e72c5c2f8 100644 --- a/extra/regexp/regexp-tests.factor +++ b/extra/regexp/regexp-tests.factor @@ -154,3 +154,5 @@ IN: regexp-tests [ t ] [ "|*+" "\\Q|*+\\E" matches? ] unit-test [ f ] [ "abc" "\\Q|*+\\E" matches? ] unit-test +[ t ] [ "S" "\\0123" matches? ] unit-test +[ t ] [ "SXY" "\\0123XY" matches? ] unit-test diff --git a/extra/regexp/regexp.factor b/extra/regexp/regexp.factor index ba1bd6c32d..51cda83cdc 100644 --- a/extra/regexp/regexp.factor +++ b/extra/regexp/regexp.factor @@ -1,135 +1,14 @@ USING: arrays combinators kernel lazy-lists math math.parser namespaces parser parser-combinators parser-combinators.simple -promises quotations sequences sequences.lib strings ; -USING: continuations io prettyprint ; +promises quotations sequences combinators.lib strings macros +assocs ; IN: regexp -: 1satisfy ( n -- parser ) - [ = ] curry satisfy ; - -: satisfy-token ( string quot -- parser ) - >r token r> [ satisfy ] curry [ drop ] swap compose <@ ; - -: octal-digit? ( n -- ? ) CHAR: 0 CHAR: 7 between? ; inline - -: decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ; inline - -: hex-digit? ( n -- ? ) - dup decimal-digit? - swap CHAR: a CHAR: f between? or ; - -: octal? ( str -- ? ) [ octal-digit? ] all? ; - -: decimal? ( str -- ? ) [ decimal-digit? ] all? ; - -: hex? ( str -- ? ) [ hex-digit? ] all? ; - -: control-char? ( n -- ? ) - dup 0 HEX: 1f between? - swap HEX: 7f = or ; - -: punct? ( n -- ? ) - "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ; - -: c-identifier-char? ( ch -- ? ) - dup alpha? swap CHAR: _ = or ; inline - -: c-identifier? ( str -- ? ) - [ c-identifier-char? ] all? ; - -: java-blank? ( n -- ? ) - { - CHAR: \t CHAR: \n CHAR: \r - HEX: c HEX: 7 HEX: 1b - } member? ; - -: java-printable? ( n -- ? ) - dup alpha? swap punct? or ; - - -: 'ordinary-char' ( -- parser ) - [ "\\^*+?|(){}[" member? not ] satisfy [ 1satisfy ] <@ ; - -: 'octal-digit' ( -- parser ) [ octal-digit? ] satisfy ; - -: 'octal' ( -- parser ) - "\\0" token - 'octal-digit' - 'octal-digit' 'octal-digit' <&> <|> - [ CHAR: 0 CHAR: 3 between? ] satisfy - 'octal-digit' <&> 'octal-digit' <:&> <|> - &> just [ oct> 1satisfy ] <@ ; - -: 'hex-digit' ( -- parser ) [ hex-digit? ] satisfy ; - -: 'hex' ( -- parser ) - "\\x" token 'hex-digit' 'hex-digit' <&> &> - "\\u" token 'hex-digit' 'hex-digit' <&> - 'hex-digit' <:&> 'hex-digit' <:&> &> <|> [ hex> 1satisfy ] <@ ; - -: 'control-character' ( -- parser ) - "\\c" token [ LETTER? ] satisfy &> [ 1satisfy ] <@ ; - -: 'simple-escape-char' ( -- parser ) - { - { "\\\\" [ CHAR: \\ = ] } - { "\\t" [ CHAR: \t = ] } - { "\\n" [ CHAR: \n = ] } - { "\\r" [ CHAR: \r = ] } - { "\\f" [ HEX: c = ] } - { "\\a" [ HEX: 7 = ] } - { "\\e" [ HEX: 1b = ] } - } [ first2 satisfy-token ] [ <|> ] map-reduce ; - -: 'predefined-char-class' ( -- parser ) - { - { "." [ drop any-char-parser ] } - { "\\d" [ digit? ] } - { "\\D" [ digit? not ] } - { "\\s" [ java-blank? ] } - { "\\S" [ java-blank? not ] } - { "\\w" [ c-identifier? ] } - { "\\W" [ c-identifier? not ] } - } [ first2 satisfy-token ] [ <|> ] map-reduce ; - -: 'posix-character-class' ( -- parser ) - { - { "\\p{Lower}" [ letter? ] } - { "\\p{Upper}" [ LETTER? ] } - { "\\p{ASCII}" [ 0 HEX: 7f between? ] } - { "\\p{Alpha}" [ Letter? ] } - { "\\p{Digit}" [ digit? ] } - { "\\p{Alnum}" [ alpha? ] } - { "\\p{Punct}" [ punct? ] } - { "\\p{Graph}" [ java-printable? ] } - { "\\p{Print}" [ java-printable? ] } - { "\\p{Blank}" [ " \t" member? ] } - { "\\p{Cntrl}" [ control-char? ] } - { "\\p{XDigit}" [ hex-digit? ] } - { "\\p{Space}" [ java-blank? ] } - } [ first2 satisfy-token ] [ <|> ] map-reduce ; - -: 'escaped-seq' ( -- parser ) - "\\Q" token - any-char-parser <*> [ token ] <@ &> - "\\E" token <& ; - -: 'escape-seq' ( -- parser ) - 'simple-escape-char' - 'predefined-char-class' <|> - 'octal' <|> - 'hex' <|> - 'escaped-seq' <|> - 'control-character' <|> - 'posix-character-class' <|> ; - -: 'char' 'escape-seq' 'ordinary-char' <|> ; - -: 'string' - 'char' <+> [ [ <&> ] reduce* ] <@ ; +: or-predicates ( quots -- quot ) + [ \ dup add* ] map [ [ t ] ] f short-circuit \ nip add ; : exactly-n ( parser n -- parser' ) - swap and-parser construct-boa ; + swap ; : at-most-n ( parser n -- parser' ) dup zero? [ @@ -145,6 +24,116 @@ IN: regexp : from-m-to-n ( parser m n -- parser' ) >r [ exactly-n ] 2keep r> swap - at-most-n <&> ; +: octal-digit? ( n -- ? ) CHAR: 0 CHAR: 7 between? ; + +: decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ; + +: hex-digit? ( n -- ? ) + dup decimal-digit? + swap CHAR: a CHAR: f between? or ; + +: control-char? ( n -- ? ) + dup 0 HEX: 1f between? + swap HEX: 7f = or ; + +MACRO: fast-member? ( str -- quot ) + [ dup ] H{ } map>assoc [ key? ] curry ; + +: punct? ( n -- ? ) + "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" fast-member? ; + +: c-identifier-char? ( ch -- ? ) + dup alpha? swap CHAR: _ = or ; + +: java-blank? ( n -- ? ) + { + CHAR: \t CHAR: \n CHAR: \r + HEX: c HEX: 7 HEX: 1b + } fast-member? ; + +: java-printable? ( n -- ? ) + dup alpha? swap punct? or ; + +: 'ordinary-char' ( -- parser ) + [ "\\^*+?|(){}[" fast-member? not ] satisfy + [ [ = ] curry ] <@ ; + +: 'octal-digit' ( -- parser ) + [ octal-digit? ] satisfy ; + +: 'octal' ( -- parser ) + "0" token + 'octal-digit' 3 exactly-n + 'octal-digit' 1 2 from-m-to-n <|> + &> [ oct> [ = ] curry ] <@ ; + +: 'hex-digit' ( -- parser ) [ hex-digit? ] satisfy ; + +: 'hex' ( -- parser ) + "x" token 'hex-digit' 2 exactly-n &> + "u" token 'hex-digit' 4 exactly-n &> <|> + [ hex> [ = ] curry ] <@ ; + +: 'control-character' ( -- parser ) + "c" token [ LETTER? ] satisfy [ [ = ] curry ] <@ &> ; + +: satisfy-tokens ( assoc -- parser ) + [ >r token r> [ nip ] curry <@ ] { } assoc>map ; + +: 'simple-escape-char' ( -- parser ) + { + { "\\" CHAR: \\ } + { "t" CHAR: \t } + { "n" CHAR: \n } + { "r" CHAR: \r } + { "f" HEX: c } + { "a" HEX: 7 } + { "e" HEX: 1b } + } [ [ = ] curry ] assoc-map satisfy-tokens ; + +: 'predefined-char-class' ( -- parser ) + { + { "d" [ digit? ] } + { "D" [ digit? not ] } + { "s" [ java-blank? ] } + { "S" [ java-blank? not ] } + { "w" [ c-identifier-char? ] } + { "W" [ c-identifier-char? not ] } + } satisfy-tokens ; + +: 'posix-character-class' ( -- parser ) + { + { "Lower" [ letter? ] } + { "Upper" [ LETTER? ] } + { "ASCII" [ 0 HEX: 7f between? ] } + { "Alpha" [ Letter? ] } + { "Digit" [ digit? ] } + { "Alnum" [ alpha? ] } + { "Punct" [ punct? ] } + { "Graph" [ java-printable? ] } + { "Print" [ java-printable? ] } + { "Blank" [ " \t" member? ] } + { "Cntrl" [ control-char? ] } + { "XDigit" [ hex-digit? ] } + { "Space" [ java-blank? ] } + } satisfy-tokens "p{" "}" surrounded-by ; + +: 'escape' ( -- parser ) + "\\" token + 'simple-escape-char' + 'predefined-char-class' <|> + 'octal' <|> + 'hex' <|> + 'control-character' <|> + 'posix-character-class' <|> &> ; + +: 'any-char' "." token [ drop [ drop t ] ] <@ ; + +: 'char' + 'any-char' 'escape' 'ordinary-char' <|> <|> [ satisfy ] <@ ; + +: 'string' 'char' <+> [ ] <@ ; + DEFER: 'regexp' TUPLE: group-result str ; @@ -152,60 +141,46 @@ TUPLE: group-result str ; C: group-result : 'grouping' - "(" token 'regexp' [ [ ] <@ ] <@ - ")" token <& &> ; - -! Special cases: ]\\^- -: predicates>cond ( seq -- quot ) - #! Takes an array of quotation predicates/objects and makes a cond - #! Makes a predicate of each obj like so: [ dup obj = ] - #! Leaves quotations alone - #! The cond returns a boolean, t if one of the predicates matches - [ - dup callable? [ [ = ] curry ] unless - [ dup ] swap compose [ drop t ] 2array - ] map { [ t ] [ drop f ] } add [ cond ] curry ; + "(" ")" surrounded-by ; : 'range' ( -- parser ) any-char-parser "-" token <& any-char-parser <&> - [ first2 [ between? ] 2curry satisfy ] <@ ; + [ first2 [ between? ] 2curry ] <@ ; -: 'character-class-contents' ( -- parser ) - 'escape-seq' - 'range' <|> - [ "\\]" member? not ] satisfy [ 1satisfy ] <@ <|> ; +: 'character-class-term' ( -- parser ) + 'range' + 'escape' <|> + [ "\\]" member? not ] satisfy [ [ = ] curry ] <@ <|> ; -: make-character-class ( seq ? -- ) - >r [ parser>predicate ] map predicates>cond r> - [ [ not ] compose ] when satisfy ; +: 'positive-character-class' ( -- parser ) + "]" token [ drop [ CHAR: ] = ] ] <@ 'character-class-term' <*> <&:> + 'character-class-term' <+> <|> + [ or-predicates ] <@ ; + +: 'negative-character-class' ( -- parser ) + "^" token 'positive-character-class' &> + [ [ not ] append ] <@ ; : 'character-class' ( -- parser ) - "[" token - "^" token 'character-class-contents' <+> &> [ t make-character-class ] <@ - "]" token [ first 1satisfy ] <@ 'character-class-contents' <*> <&:> - [ f make-character-class ] <@ <|> - 'character-class-contents' <+> [ f make-character-class ] <@ <|> - &> - "]" token <& ; + 'negative-character-class' 'positive-character-class' <|> + "[" "]" surrounded-by [ satisfy ] <@ ; + +: 'escaped-seq' ( -- parser ) + any-char-parser <*> [ token ] <@ "\\Q" "\\E" surrounded-by ; : 'term' ( -- parser ) - 'string' + 'escaped-seq' 'grouping' <|> + 'string' <|> 'character-class' <|> - <+> [ - dup length 1 = - [ first ] [ and-parser construct-boa ] if - ] <@ ; + <+> [ ] <@ ; : 'interval' ( -- parser ) - 'term' "{" token <& 'integer' <&> "}" token <& [ first2 exactly-n ] <@ - 'term' "{" token <& 'integer' <&> "," token <& "}" token <& - [ first2 at-least-n ] <@ <|> - 'term' "{" token <& "," token <& 'integer' <&> "}" token <& - [ first2 at-most-n ] <@ <|> - 'term' "{" token <& 'integer' <&> "," token <& 'integer' <:&> "}" token <& - [ first3 from-m-to-n ] <@ <|> ; + 'term' 'integer' "{" "}" surrounded-by <&> [ first2 exactly-n ] <@ + 'term' 'integer' "{" ",}" surrounded-by <&> [ first2 at-least-n ] <@ <|> + 'term' 'integer' "{," "}" surrounded-by <&> [ first2 at-most-n ] <@ <|> + 'term' 'integer' "," token <& 'integer' <&> "{" "}" surrounded-by <&> [ first2 first2 from-m-to-n ] <@ <|> ; : 'repetition' ( -- parser ) 'term' @@ -221,7 +196,7 @@ C: group-result LAZY: 'union' ( -- parser ) 'simple' - 'simple' "|" token 'union' &> <&> [ first2 <|> ] <@ + 'simple' "|" token nonempty-list-of [ ] <@ <|> ; LAZY: 'regexp' ( -- parser )