diff --git a/basis/ascii/ascii.factor b/basis/ascii/ascii.factor index 193e847d27..bd1b86b279 100644 --- a/basis/ascii/ascii.factor +++ b/basis/ascii/ascii.factor @@ -10,7 +10,7 @@ IN: ascii : LETTER? ( ch -- ? ) CHAR: A CHAR: Z between? ; inline : digit? ( ch -- ? ) CHAR: 0 CHAR: 9 between? ; inline : printable? ( ch -- ? ) CHAR: \s CHAR: ~ between? ; inline -: control? ( ch -- ? ) "\0\e\r\n\t\u000008\u00007f" member? ; inline +: control? ( ch -- ? ) { [ 0 HEX: 1F between? ] [ HEX: 7F = ] } 1|| ; inline : quotable? ( ch -- ? ) { [ printable? ] [ "\"\\" member? not ] } 1&& ; inline : Letter? ( ch -- ? ) { [ letter? ] [ LETTER? ] } 1|| ; inline : alpha? ( ch -- ? ) { [ Letter? ] [ digit? ] } 1|| ; inline @@ -20,4 +20,4 @@ IN: ascii : >upper ( str -- upper ) [ ch>upper ] map ; HINTS: >lower string ; -HINTS: >upper string ; \ No newline at end of file +HINTS: >upper string ; diff --git a/basis/globs/globs-tests.factor b/basis/globs/globs-tests.factor index 446f1ee0a9..45eb27ea62 100644 --- a/basis/globs/globs-tests.factor +++ b/basis/globs/globs-tests.factor @@ -14,5 +14,6 @@ USING: tools.test globs ; [ f ] [ "foo.java" "*.{xml,txt}" glob-matches? ] unit-test [ t ] [ "foo.txt" "*.{xml,txt}" glob-matches? ] unit-test [ t ] [ "foo.xml" "*.{xml,txt}" glob-matches? ] unit-test -[ f ] [ "foo." "*.{,xml,txt}" glob-matches? ] unit-test +[ f ] [ "foo." "*.{xml,txt}" glob-matches? ] unit-test +[ t ] [ "foo." "*.{,xml,txt}" glob-matches? ] unit-test [ t ] [ "foo.{" "*.{" glob-matches? ] unit-test diff --git a/basis/globs/globs.factor b/basis/globs/globs.factor index 14ddb0ed9b..173187574b 100644 --- a/basis/globs/globs.factor +++ b/basis/globs/globs.factor @@ -1,42 +1,42 @@ -! Copyright (C) 2007 Slava Pestov. +! Copyright (C) 2007, 2009 Slava Pestov, Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -USING: parser-combinators parser-combinators.regexp lists sequences kernel -promises strings unicode.case ; +USING: sequences kernel regexp.combinators regexp.matchers strings unicode.case +peg.ebnf regexp arrays ; IN: globs - -: 'char' ( -- parser ) - [ ",*?" member? not ] satisfy ; +Character = "\\" .:c => [[ c 1string ]] + | !(","|"}") . => [[ 1string ]] -: 'string' ( -- parser ) - 'char' <+> [ >lower token ] <@ ; +RangeCharacter = !("]") . -: 'escaped-char' ( -- parser ) - "\\" token any-char-parser &> [ 1token ] <@ ; +Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b ]] + | RangeCharacter => [[ 1string ]] -: 'escaped-string' ( -- parser ) - 'string' 'escaped-char' <|> ; +StartRange = .:a "-" RangeCharacter:b => [[ a b ]] + | . => [[ 1string ]] -DEFER: 'term' +Ranges = StartRange:s Range*:r => [[ r s prefix ]] -: 'glob' ( -- parser ) - 'term' <*> [ ] <@ ; +CharClass = "^"?:n Ranges:e => [[ e n [ ] when ]] -: 'union' ( -- parser ) - 'glob' "," token nonempty-list-of "{" "}" surrounded-by - [ ] <@ ; +AlternationBody = Concatenation:c "," AlternationBody:a => [[ a c prefix ]] + | Concatenation => [[ 1array ]] -LAZY: 'term' ( -- parser ) - 'union' - 'character-class' <|> - "?" token [ drop any-char-parser ] <@ <|> - "*" token [ drop any-char-parser <*> ] <@ <|> - 'escaped-string' <|> ; +Element = "*" => [[ R/ .*/ ]] + | "?" => [[ R/ ./ ]] + | "[" CharClass:c "]" => [[ c ]] + | "{" AlternationBody:b "}" => [[ b ]] + | Character -PRIVATE> +Concatenation = Element* => [[ ]] -: ( string -- glob ) 'glob' just parse-1 just ; +End = !(.) + +Main = Concatenation End + +;EBNF : glob-matches? ( input glob -- ? ) - [ >lower ] [ ] bi* parse nil? not ; + [ >case-fold ] bi@ matches? ; diff --git a/basis/regexp/ast/ast.factor b/basis/regexp/ast/ast.factor new file mode 100644 index 0000000000..9288766888 --- /dev/null +++ b/basis/regexp/ast/ast.factor @@ -0,0 +1,65 @@ +! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg. +! See http://factorcode.org/license.txt for BSD license. +USING: kernel arrays accessors fry sequences regexp.classes ; +FROM: math.ranges => [a,b] ; +IN: regexp.ast + +TUPLE: negation term ; +C: negation + +TUPLE: from-to n m ; +C: from-to + +TUPLE: at-least n ; +C: at-least + +TUPLE: tagged-epsilon tag ; +C: tagged-epsilon + +CONSTANT: epsilon T{ tagged-epsilon { tag t } } + +TUPLE: concatenation first second ; + +: ( seq -- concatenation ) + [ epsilon ] [ unclip [ concatenation boa ] reduce ] if-empty ; + +TUPLE: alternation first second ; + +: ( seq -- alternation ) + unclip [ alternation boa ] reduce ; + +TUPLE: star term ; +C: star + +TUPLE: with-options tree options ; +C: with-options + +TUPLE: options on off ; +C: options + +SINGLETONS: unix-lines dotall multiline comments case-insensitive +unicode-case reversed-regexp ; + +: ( term -- term' ) + f 2array ; + +: ( term -- term' ) + dup 2array ; + +: repetition ( n term -- term' ) + ; + +GENERIC: ( term times -- term' ) +M: at-least + n>> swap [ repetition ] [ ] bi 2array ; +M: from-to + [ n>> ] [ m>> ] bi [a,b] swap '[ _ repetition ] map ; + +: char-class ( ranges ? -- term ) + [ ] dip [ ] when ; + +TUPLE: lookahead term positive? ; +C: lookahead + +TUPLE: lookbehind term positive? ; +C: lookbehind diff --git a/basis/regexp/backend/backend.factor b/basis/regexp/backend/backend.factor deleted file mode 100644 index 5eff0579c8..0000000000 --- a/basis/regexp/backend/backend.factor +++ /dev/null @@ -1,27 +0,0 @@ -! Copyright (C) 2008 Doug Coleman. -! See http://factorcode.org/license.txt for BSD license. -USING: accessors hashtables kernel math vectors ; -IN: regexp.backend - -TUPLE: regexp - raw - { options hashtable } - stack - parse-tree - nfa-table - dfa-table - minimized-table - matchers - { nfa-traversal-flags hashtable } - { dfa-traversal-flags hashtable } - { state integer } - { new-states vector } - { visited-states hashtable } ; - -: reset-regexp ( regexp -- regexp ) - 0 >>state - V{ } clone >>stack - V{ } clone >>new-states - H{ } clone >>visited-states ; - -SYMBOL: current-regexp diff --git a/basis/regexp/classes/classes-tests.factor b/basis/regexp/classes/classes-tests.factor new file mode 100644 index 0000000000..520e23c749 --- /dev/null +++ b/basis/regexp/classes/classes-tests.factor @@ -0,0 +1,58 @@ +! Copyright (C) 2009 Daniel Ehrenberg. +! See http://factorcode.org/license.txt for BSD license. +USING: regexp.classes tools.test arrays kernel ; +IN: regexp.classes.tests + +! Class algebra + +[ f ] [ { 1 2 } ] unit-test +[ T{ or-class f { 2 1 } } ] [ { 1 2 } ] unit-test +[ 3 ] [ { 1 2 } 3 2array ] unit-test +[ CHAR: A ] [ CHAR: A LETTER-class 2array ] unit-test +[ CHAR: A ] [ LETTER-class CHAR: A 2array ] unit-test +[ T{ primitive-class { class LETTER-class } } ] [ CHAR: A LETTER-class 2array ] unit-test +[ T{ primitive-class { class LETTER-class } } ] [ LETTER-class CHAR: A 2array ] unit-test +[ t ] [ { t 1 } ] unit-test +[ t ] [ { 1 t } ] unit-test +[ f ] [ { f 1 } ] unit-test +[ f ] [ { 1 f } ] unit-test +[ 1 ] [ { f 1 } ] unit-test +[ 1 ] [ { 1 f } ] unit-test +[ 1 ] [ { t 1 } ] unit-test +[ 1 ] [ { 1 t } ] unit-test +[ 1 ] [ 1 ] unit-test +[ 1 ] [ { 1 1 } ] unit-test +[ 1 ] [ { 1 1 } ] unit-test +[ t ] [ { t t } ] unit-test +[ T{ primitive-class { class letter-class } } ] [ letter-class dup 2array ] unit-test +[ T{ primitive-class { class letter-class } } ] [ letter-class dup 2array ] unit-test +[ T{ or-class { seq { 2 3 1 } } } ] [ { 1 2 } { 2 3 } 2array ] unit-test +[ T{ or-class { seq { 3 2 } } } ] [ { 2 3 } 1 2array ] unit-test +[ f ] [ t ] unit-test +[ t ] [ f ] unit-test +[ f ] [ 1 1 t replace-question ] unit-test + +! Making classes into nested conditionals + +[ V{ 1 2 3 4 } ] [ T{ and-class f { 1 T{ not-class f 2 } T{ or-class f { 3 4 } } 2 } } class>questions ] unit-test +[ { 3 } ] [ { { 3 t } } table>condition ] unit-test +[ { T{ primitive-class } } ] [ { { 1 t } { 2 T{ primitive-class } } } table>questions ] unit-test +[ { { 1 t } { 2 t } } ] [ { { 1 t } { 2 T{ primitive-class } } } T{ primitive-class } t answer ] unit-test +[ { { 1 t } } ] [ { { 1 t } { 2 T{ primitive-class } } } T{ primitive-class } f answer ] unit-test +[ T{ condition f T{ primitive-class } { 1 2 } { 1 } } ] [ { { 1 t } { 2 T{ primitive-class } } } table>condition ] unit-test + +SYMBOL: foo +SYMBOL: bar + +[ T{ condition f T{ primitive-class f bar } T{ condition f T{ primitive-class f foo } { 1 3 2 } { 1 3 } } T{ condition f T{ primitive-class f foo } { 1 2 } { 1 } } } ] [ { { 1 t } { 3 T{ primitive-class f bar } } { 2 T{ primitive-class f foo } } } table>condition ] unit-test + +[ t ] [ foo dup t replace-question ] unit-test +[ f ] [ foo dup f replace-question ] unit-test +[ T{ primitive-class f foo } ] [ foo bar t replace-question ] unit-test +[ T{ primitive-class f foo } ] [ foo bar f replace-question ] unit-test +[ T{ primitive-class f foo } ] [ foo bar 2array bar t replace-question ] unit-test +[ T{ primitive-class f bar } ] [ foo bar 2array foo t replace-question ] unit-test +[ f ] [ foo bar 2array foo f replace-question ] unit-test +[ f ] [ foo bar 2array bar f replace-question ] unit-test +[ t ] [ foo bar 2array bar t replace-question ] unit-test +[ T{ primitive-class f foo } ] [ foo bar 2array bar f replace-question ] unit-test diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor index 4a807fa51b..8912082ec3 100644 --- a/basis/regexp/classes/classes.factor +++ b/basis/regexp/classes/classes.factor @@ -1,7 +1,8 @@ -! Copyright (C) 2008 Doug Coleman. +! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -USING: accessors kernel math math.order words regexp.utils -unicode.categories combinators.short-circuit ; +USING: accessors kernel math math.order words combinators locals +ascii unicode.categories combinators.short-circuit sequences +fry macros arrays assocs sets classes ; IN: regexp.classes SINGLETONS: any-char any-char-no-nl @@ -11,19 +12,18 @@ ascii-class punctuation-class java-printable-class blank-class control-character-class hex-digit-class java-blank-class c-identifier-class unmatchable-class terminator-class word-boundary-class ; -SINGLETONS: beginning-of-input beginning-of-line -end-of-input end-of-line ; +SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file ; -MIXIN: node -TUPLE: character-class-range from to ; INSTANCE: character-class-range node +TUPLE: range from to ; +C: range GENERIC: class-member? ( obj class -- ? ) -M: t class-member? ( obj class -- ? ) 2drop f ; +M: t class-member? ( obj class -- ? ) 2drop t ; -M: integer class-member? ( obj class -- ? ) 2drop f ; +M: integer class-member? ( obj class -- ? ) = ; -M: character-class-range class-member? ( obj class -- ? ) +M: range class-member? ( obj class -- ? ) [ from>> ] [ to>> ] bi between? ; M: any-char class-member? ( obj class -- ? ) @@ -47,16 +47,24 @@ M: ascii-class class-member? ( obj class -- ? ) M: digit-class class-member? ( obj class -- ? ) drop digit? ; +: c-identifier-char? ( ch -- ? ) + { [ alpha? ] [ CHAR: _ = ] } 1|| ; + M: c-identifier-class class-member? ( obj class -- ? ) - drop - { [ digit? ] [ Letter? ] [ CHAR: _ = ] } 1|| ; + drop c-identifier-char? ; M: alpha-class class-member? ( obj class -- ? ) drop alpha? ; +: punct? ( ch -- ? ) + "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ; + M: punctuation-class class-member? ( obj class -- ? ) drop punct? ; +: java-printable? ( ch -- ? ) + { [ alpha? ] [ punct? ] } 1|| ; + M: java-printable-class class-member? ( obj class -- ? ) drop java-printable? ; @@ -64,11 +72,24 @@ M: non-newline-blank-class class-member? ( obj class -- ? ) drop { [ blank? ] [ CHAR: \n = not ] } 1&& ; M: control-character-class class-member? ( obj class -- ? ) - drop control-char? ; + drop control? ; + +: hex-digit? ( ch -- ? ) + { + [ CHAR: A CHAR: F between? ] + [ CHAR: a CHAR: f between? ] + [ CHAR: 0 CHAR: 9 between? ] + } 1|| ; M: hex-digit-class class-member? ( obj class -- ? ) drop hex-digit? ; +: java-blank? ( ch -- ? ) + { + CHAR: \s CHAR: \t CHAR: \n + HEX: b HEX: 7 CHAR: \r + } member? ; + M: java-blank-class class-member? ( obj class -- ? ) drop java-blank? ; @@ -76,16 +97,187 @@ M: unmatchable-class class-member? ( obj class -- ? ) 2drop f ; M: terminator-class class-member? ( obj class -- ? ) - drop { - [ CHAR: \r = ] - [ CHAR: \n = ] - [ CHAR: \u000085 = ] - [ CHAR: \u002028 = ] - [ CHAR: \u002029 = ] - } 1|| ; + drop "\r\n\u000085\u002029\u002028" member? ; -M: beginning-of-line class-member? ( obj class -- ? ) +M: ^ class-member? ( obj class -- ? ) 2drop f ; -M: end-of-line class-member? ( obj class -- ? ) +M: $ class-member? ( obj class -- ? ) 2drop f ; + +M: f class-member? 2drop f ; + +TUPLE: primitive-class class ; +C: primitive-class + +TUPLE: or-class seq ; + +TUPLE: not-class class ; + +TUPLE: and-class seq ; + +GENERIC: combine-and ( class1 class2 -- combined ? ) + +: replace-if-= ( object object -- object ? ) + over = ; + +M: object combine-and replace-if-= ; + +M: t combine-and + drop t ; + +M: f combine-and + nip t ; + +M: not-class combine-and + class>> 2dup = [ 2drop f t ] [ + dup integer? [ + 2dup swap class-member? + [ 2drop f f ] + [ drop t ] if + ] [ 2drop f f ] if + ] if ; + +M: integer combine-and + swap 2dup class-member? [ drop t ] [ 2drop f t ] if ; + +GENERIC: combine-or ( class1 class2 -- combined ? ) + +M: object combine-or replace-if-= ; + +M: t combine-or + nip t ; + +M: f combine-or + drop t ; + +M: not-class combine-or + class>> = [ t t ] [ f f ] if ; + +M: integer combine-or + 2dup swap class-member? [ drop t ] [ 2drop f f ] if ; + +: flatten ( seq class -- newseq ) + '[ dup _ instance? [ seq>> ] [ 1array ] if ] map concat ; inline + +: try-combine ( elt1 elt2 quot -- combined/f ? ) + 3dup call [ [ 3drop ] dip t ] [ drop swapd call ] if ; inline + +:: prefix-combining ( seq elt quot: ( elt1 elt2 -- combined/f ? ) -- newseq ) + f :> combined! + seq [ elt quot try-combine swap combined! ] find drop + [ seq remove-nth combined prefix ] + [ seq elt prefix ] if* ; inline + +:: combine ( seq quot: ( elt1 elt2 -- combined/f ? ) empty class -- newseq ) + seq class flatten + { } [ quot prefix-combining ] reduce + dup length { + { 0 [ drop empty ] } + { 1 [ first ] } + [ drop class new swap >>seq ] + } case ; inline + +: ( seq -- class ) + [ combine-and ] t and-class combine ; + +M: and-class class-member? + seq>> [ class-member? ] with all? ; + +: ( seq -- class ) + [ combine-or ] f or-class combine ; + +M: or-class class-member? + seq>> [ class-member? ] with any? ; + +GENERIC: ( class -- inverse ) + +M: object + not-class boa ; + +M: not-class + class>> ; + +M: and-class + seq>> [ ] map ; + +M: or-class + seq>> [ ] map ; + +M: t drop f ; +M: f drop t ; + +M: not-class class-member? + class>> class-member? not ; + +M: primitive-class class-member? + class>> class-member? ; + +UNION: class primitive-class not-class or-class and-class range ; + +TUPLE: condition question yes no ; +C: condition + +GENERIC# replace-question 2 ( class from to -- new-class ) + +M:: object replace-question ( class from to -- new-class ) + class from = to class ? ; + +: replace-compound ( class from to -- seq ) + [ seq>> ] 2dip '[ _ _ replace-question ] map ; + +M: and-class replace-question + replace-compound ; + +M: or-class replace-question + replace-compound ; + +M: not-class replace-question + [ class>> ] 2dip replace-question ; + +: answer ( table question answer -- new-table ) + '[ _ _ replace-question ] assoc-map + [ nip ] assoc-filter ; + +: answers ( table questions answer -- new-table ) + '[ _ answer ] each ; + +DEFER: make-condition + +: (make-condition) ( table questions question -- condition ) + [ 2nip ] + [ swap [ t answer ] dip make-condition ] + [ swap [ f answer ] dip make-condition ] 3tri + 2dup = [ 2nip ] [ ] if ; + +: make-condition ( table questions -- condition ) + [ keys ] [ unclip (make-condition) ] if-empty ; + +GENERIC: class>questions ( class -- questions ) +: compound-questions ( class -- questions ) seq>> [ class>questions ] gather ; +M: or-class class>questions compound-questions ; +M: and-class class>questions compound-questions ; +M: not-class class>questions class>> class>questions ; +M: object class>questions 1array ; + +: table>questions ( table -- questions ) + values [ class>questions ] gather >array t swap remove ; + +: table>condition ( table -- condition ) + ! input table is state => class + >alist dup table>questions make-condition ; + +: condition-map ( condition quot: ( obj -- obj' ) -- new-condition ) + over condition? [ + [ [ question>> ] [ yes>> ] [ no>> ] tri ] dip + '[ _ condition-map ] bi@ + ] [ call ] if ; inline recursive + +: condition-states ( condition -- states ) + dup condition? [ + [ yes>> ] [ no>> ] bi + [ condition-states ] bi@ append prune + ] [ 1array ] if ; + +: condition-at ( condition assoc -- new-condition ) + '[ _ at ] condition-map ; diff --git a/basis/regexp/combinators/authors.txt b/basis/regexp/combinators/authors.txt new file mode 100644 index 0000000000..f990dd0ed2 --- /dev/null +++ b/basis/regexp/combinators/authors.txt @@ -0,0 +1 @@ +Daniel Ehrenberg diff --git a/basis/regexp/combinators/combinators-docs.factor b/basis/regexp/combinators/combinators-docs.factor new file mode 100644 index 0000000000..7cb214f42b --- /dev/null +++ b/basis/regexp/combinators/combinators-docs.factor @@ -0,0 +1,54 @@ +! Copyright (C) 2009 Daniel Ehrenberg +! See http://factorcode.org/license.txt for BSD license. +USING: help.syntax help.markup regexp strings ; +IN: regexp.combinators + +ABOUT: "regexp.combinators" + +ARTICLE: "regexp.combinators" "Regular expression combinators" +"The " { $vocab-link "regexp.combinators" } " vocabulary defines combinators which can be used to build up regular expressions to match strings. This is in addition to the traditional syntax defined in the " { $vocab-link "regexp" } " vocabulary." +{ $subsection } +{ $subsection } +{ $subsection } +{ $subsection } +{ $subsection } +{ $subsection } +{ $subsection } +{ $subsection } +{ $subsection