From 45aea34ca431d9c6b85ca079fe0f7a2edbbd6182 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Tue, 20 May 2008 18:26:30 -0500 Subject: [PATCH] movin the regexp2 to unmaintained --- unmaintained/regexp2/regexp2-tests.factor | 5 + unmaintained/regexp2/regexp2.factor | 262 ++++++++++++++++++++++ 2 files changed, 267 insertions(+) create mode 100644 unmaintained/regexp2/regexp2-tests.factor create mode 100644 unmaintained/regexp2/regexp2.factor diff --git a/unmaintained/regexp2/regexp2-tests.factor b/unmaintained/regexp2/regexp2-tests.factor new file mode 100644 index 0000000000..1fb3f61f29 --- /dev/null +++ b/unmaintained/regexp2/regexp2-tests.factor @@ -0,0 +1,5 @@ +USING: kernel peg regexp2 sequences tools.test ; +IN: regexp2.tests + +[ T{ parse-result f T{ slice f 3 3 "056" } 46 } ] + [ "056" 'octal' parse ] unit-test diff --git a/unmaintained/regexp2/regexp2.factor b/unmaintained/regexp2/regexp2.factor new file mode 100644 index 0000000000..f7023c74bf --- /dev/null +++ b/unmaintained/regexp2/regexp2.factor @@ -0,0 +1,262 @@ +USING: assocs combinators.lib kernel math math.parser +namespaces peg unicode.case sequences unicode.categories +memoize peg.parsers math.order ; +USE: io +USE: tools.walker +IN: regexp2 + +upper [ swap ch>upper = ] ] [ [ = ] ] if + curry ; + +: char-between?-quot ( ch1 ch2 -- quot ) + ignore-case? get + [ [ ch>upper ] bi@ [ >r >r ch>upper r> r> between? ] ] + [ [ between? ] ] + if 2curry ; + +: or-predicates ( quots -- quot ) + [ \ dup prefix ] map [ [ t ] ] f short-circuit \ nip suffix ; + +: literal-action [ nip ] curry action ; + +: delay-action [ curry ] curry action ; + +PRIVATE> + +: ascii? ( n -- ? ) + 0 HEX: 7f between? ; + +: octal-digit? ( n -- ? ) + CHAR: 0 CHAR: 7 between? ; + +: hex-digit? ( n -- ? ) + { + [ dup digit? ] + [ dup CHAR: a CHAR: f between? ] + [ dup CHAR: A CHAR: F between? ] + } || nip ; + +: control-char? ( n -- ? ) + { [ dup 0 HEX: 1f between? ] [ dup HEX: 7f = ] } || nip ; + +: punct? ( n -- ? ) + "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ; + +: c-identifier-char? ( ch -- ? ) + { [ dup alpha? ] [ dup CHAR: _ = ] } || nip ; + +: java-blank? ( n -- ? ) + { + CHAR: \s + CHAR: \t CHAR: \n CHAR: \r + HEX: c HEX: 7 HEX: 1b + } member? ; + +: java-printable? ( n -- ? ) + { [ dup alpha? ] [ dup punct? ] } || nip ; + +MEMO: 'ordinary-char' ( -- parser ) + [ "\\^*+?|(){}[$" member? not ] satisfy + [ char=-quot ] action ; + +MEMO: 'octal-digit' ( -- parser ) [ octal-digit? ] satisfy ; + +MEMO: 'octal' ( -- parser ) + "0" token hide 'octal-digit' 1 3 from-m-to-n 2seq + [ first oct> ] action ; + +MEMO: 'hex-digit' ( -- parser ) [ hex-digit? ] satisfy ; + +MEMO: 'hex' ( -- parser ) + "x" token hide 'hex-digit' 2 exactly-n 2seq + "u" token hide 'hex-digit' 6 exactly-n 2seq 2choice + [ first hex> ] action ; + +: satisfy-tokens ( assoc -- parser ) + [ >r token r> literal-action ] { } assoc>map choice ; + +MEMO: 'simple-escape-char' ( -- parser ) + { + { "\\" CHAR: \\ } + { "t" CHAR: \t } + { "n" CHAR: \n } + { "r" CHAR: \r } + { "f" HEX: c } + { "a" HEX: 7 } + { "e" HEX: 1b } + } [ char=-quot ] assoc-map satisfy-tokens ; + +MEMO: 'predefined-char-class' ( -- parser ) + { + { "d" [ digit? ] } + { "D" [ digit? not ] } + { "s" [ java-blank? ] } + { "S" [ java-blank? not ] } + { "w" [ c-identifier-char? ] } + { "W" [ c-identifier-char? not ] } + } satisfy-tokens ; + +MEMO: 'posix-character-class' ( -- parser ) + { + { "Lower" [ letter? ] } + { "Upper" [ LETTER? ] } + { "ASCII" [ ascii? ] } + { "Alpha" [ Letter? ] } + { "Digit" [ digit? ] } + { "Alnum" [ alpha? ] } + { "Punct" [ punct? ] } + { "Graph" [ java-printable? ] } + { "Print" [ java-printable? ] } + { "Blank" [ " \t" member? ] } + { "Cntrl" [ control-char? ] } + { "XDigit" [ hex-digit? ] } + { "Space" [ java-blank? ] } + } satisfy-tokens "p{" "}" surrounded-by ; + +MEMO: 'simple-escape' ( -- parser ) + [ + 'octal' , + 'hex' , + "c" token hide [ LETTER? ] satisfy 2seq , + any-char , + ] choice* [ char=-quot ] action ; + +MEMO: 'escape' ( -- parser ) + "\\" token hide [ + 'simple-escape-char' , + 'predefined-char-class' , + 'posix-character-class' , + 'simple-escape' , + ] choice* 2seq ; + +MEMO: 'any-char' ( -- parser ) + "." token [ drop t ] literal-action ; + +MEMO: 'char' ( -- parser ) + 'any-char' 'escape' 'ordinary-char' 3choice [ satisfy ] action ; + +DEFER: 'regexp' + +TUPLE: group-result str ; + +C: group-result + +MEMO: 'non-capturing-group' ( -- parser ) + "?:" token hide 'regexp' ; + +MEMO: 'positive-lookahead-group' ( -- parser ) + "?=" token hide 'regexp' [ ensure ] action ; + +MEMO: 'negative-lookahead-group' ( -- parser ) + "?!" token hide 'regexp' [ ensure-not ] action ; + +MEMO: 'simple-group' ( -- parser ) + 'regexp' [ [ ] action ] action ; + +MEMO: 'group' ( -- parser ) + [ + 'non-capturing-group' , + 'positive-lookahead-group' , + 'negative-lookahead-group' , + 'simple-group' , + ] choice* "(" ")" surrounded-by ; + +MEMO: 'range' ( -- parser ) + any-char "-" token hide any-char 3seq + [ first2 char-between?-quot ] action ; + +MEMO: 'character-class-term' ( -- parser ) + 'range' + 'escape' + [ "\\]" member? not ] satisfy [ char=-quot ] action + 3choice ; + +MEMO: 'positive-character-class' ( -- parser ) + ! todo + "]" token [ CHAR: ] = ] literal-action 'character-class-term' repeat0 2seq + 'character-class-term' repeat1 2choice [ or-predicates ] action ; + +MEMO: 'negative-character-class' ( -- parser ) + "^" token hide 'positive-character-class' 2seq + [ [ not ] append ] action ; + +MEMO: 'character-class' ( -- parser ) + 'negative-character-class' 'positive-character-class' 2choice + "[" "]" surrounded-by [ satisfy ] action ; + +MEMO: 'escaped-seq' ( -- parser ) + any-char repeat1 + [ ignore-case? get token ] action "\\Q" "\\E" surrounded-by ; + +MEMO: 'break' ( quot -- parser ) + satisfy ensure + epsilon just 2choice ; + +MEMO: 'break-escape' ( -- parser ) + "$" token [ "\r\n" member? ] 'break' literal-action + "\\b" token [ blank? ] 'break' literal-action + "\\B" token [ blank? not ] 'break' literal-action + "\\z" token epsilon just literal-action 4choice ; + +MEMO: 'simple' ( -- parser ) + [ + 'escaped-seq' , + 'break-escape' , + 'group' , + 'character-class' , + 'char' , + ] choice* ; + +MEMO: 'exactly-n' ( -- parser ) + 'integer' [ exactly-n ] delay-action ; + +MEMO: 'at-least-n' ( -- parser ) + 'integer' "," token hide 2seq [ at-least-n ] delay-action ; + +MEMO: 'at-most-n' ( -- parser ) + "," token hide 'integer' 2seq [ at-most-n ] delay-action ; + +MEMO: 'from-m-to-n' ( -- parser ) + 'integer' "," token hide 'integer' 3seq + [ first2 from-m-to-n ] delay-action ; + +MEMO: 'greedy-interval' ( -- parser ) + 'exactly-n' 'at-least-n' 'at-most-n' 'from-m-to-n' 4choice ; + +MEMO: 'interval' ( -- parser ) + 'greedy-interval' + 'greedy-interval' "?" token hide 2seq [ "reluctant {}" print ] action + 'greedy-interval' "+" token hide 2seq [ "possessive {}" print ] action + 3choice "{" "}" surrounded-by ; + +MEMO: 'repetition' ( -- parser ) + [ + ! Possessive + ! "*+" token [ ] literal-action , + ! "++" token [ ] literal-action , + ! "?+" token [ ] literal-action , + ! Reluctant + ! "*?" token [ <(*)> ] literal-action , + ! "+?" token [ <(+)> ] literal-action , + ! "??" token [ <(?)> ] literal-action , + ! Greedy + "*" token [ repeat0 ] literal-action , + "+" token [ repeat1 ] literal-action , + "?" token [ optional ] literal-action , + ] choice* ; + +MEMO: 'dummy' ( -- parser ) + epsilon [ ] literal-action ; + +! todo -- check the action +! MEMO: 'term' ( -- parser ) + ! 'simple' + ! 'repetition' 'interval' 'dummy' 3choice 2seq [ first2 call ] action + ! [ ] action ; +