From c4fc9f59025ed17b268772e30150a8e8a99a3e28 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Thu, 22 May 2008 18:15:16 -0500 Subject: [PATCH 1/7] about to start lookahead --- extra/regexp4/regexp4-tests.factor | 7 +- extra/regexp4/regexp4.factor | 172 +++++++++++++++++++---------- 2 files changed, 112 insertions(+), 67 deletions(-) diff --git a/extra/regexp4/regexp4-tests.factor b/extra/regexp4/regexp4-tests.factor index ea62d2105a..c941d0fb75 100644 --- a/extra/regexp4/regexp4-tests.factor +++ b/extra/regexp4/regexp4-tests.factor @@ -155,6 +155,7 @@ IN: regexp4-tests [ f ] [ "a" "\\Q\\E" matches? ] unit-test [ t ] [ "|*+" "\\Q|*+\\E" matches? ] unit-test [ f ] [ "abc" "\\Q|*+\\E" matches? ] unit-test +[ t ] [ "s" "\\Qs\\E" matches? ] unit-test [ t ] [ "S" "\\0123" matches? ] unit-test [ t ] [ "SXY" "\\0123XY" matches? ] unit-test @@ -236,12 +237,6 @@ IN: regexp4-tests matches? ] unit-test - - - - - - ! ((A)(B(C))) ! 1. ((A)(B(C))) ! 2. (A) diff --git a/extra/regexp4/regexp4.factor b/extra/regexp4/regexp4.factor index 2957244bcf..377a9b17a5 100644 --- a/extra/regexp4/regexp4.factor +++ b/extra/regexp4/regexp4.factor @@ -4,7 +4,7 @@ USING: accessors arrays assocs combinators kernel math sequences namespaces locals combinators.lib state-tables math.parser state-parser sets dlists unicode.categories math.order quotations shuffle math.ranges splitting -symbols fry parser ; +symbols fry parser math.ranges inspector strings ; IN: regexp4 SYMBOLS: eps start-state final-state beginning-of-text @@ -191,7 +191,7 @@ ERROR: unbalanced-brackets ; [ [ nip at-most-n ] [ at-least-n ] if* ] if ] [ drop exactly-n ] if ; -:: make-nontoken-nfa ( regexp obj -- ) +:: push-single-nfa ( regexp obj -- ) [let | s0 [ regexp next-state ] s1 [ regexp next-state ] stack [ regexp stack>> ] @@ -213,18 +213,23 @@ ERROR: unbalanced-brackets ; : decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ; : hex-digit? ( n -- ? ) - dup decimal-digit? - over CHAR: a CHAR: f between? or - swap CHAR: A CHAR: F between? or ; + [ + [ dup decimal-digit? ] + [ dup CHAR: a CHAR: f between? ] + [ dup CHAR: A CHAR: F between? ] + ] || nip ; : control-char? ( n -- ? ) - dup 0 HEX: 1f between? swap HEX: 7f = or ; + [ + [ dup 0 HEX: 1f between? ] + [ dup HEX: 7f = ] + ] || nip ; : punct? ( n -- ? ) "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ; : c-identifier-char? ( ch -- ? ) - dup alpha? swap CHAR: _ = or ; + [ [ dup alpha? ] [ dup CHAR: _ = ] ] || nip ; : java-blank? ( n -- ? ) { @@ -233,7 +238,7 @@ ERROR: unbalanced-brackets ; } member? ; : java-printable? ( n -- ? ) - dup alpha? swap punct? or ; + [ [ dup alpha? ] [ dup punct? ] ] || nip ; ERROR: bad-character-class obj ; @@ -261,32 +266,26 @@ ERROR: bad-character-class obj ; ERROR: bad-octal number ; -: parse-octal ( regexp -- ) +: parse-octal ( -- n ) next get-char drop 3 take oct> - dup 255 > [ bad-octal ] when - make-nontoken-nfa ; + dup 255 > [ bad-octal ] when ; ERROR: bad-hex number ; -: parse-short-hex ( regexp -- ) +: parse-short-hex ( -- n ) next 2 take hex> - dup number? [ bad-hex ] unless - make-nontoken-nfa ; + dup number? [ bad-hex ] unless ; -: parse-long-hex ( regexp -- ) - next 4 take hex> - dup number? [ bad-hex ] unless - make-nontoken-nfa ; +: parse-long-hex ( -- n ) + next 6 take hex> + dup number? [ bad-hex ] unless ; -: parse-control-character ( regexp -- ) - next get-char make-nontoken-nfa ; - -: parse-backreference ( regexp obj -- ) - 2drop ; +: parse-control-character ( -- n ) + next get-char ; : dot-construction ( regexp -- ) - [ CHAR: \n = not ] make-nontoken-nfa ; + [ CHAR: \n = not ] push-single-nfa ; : front-anchor-construction ( regexp -- ) drop ; @@ -299,32 +298,50 @@ ERROR: bad-hex number ; [ get-char CHAR: } = ] take-until "," split1 [ [ string>number ] bi@ ] keep >boolean ; -: parse-escaped ( regexp -- ) - next get-char { - { CHAR: \ [ [ CHAR: \ = ] make-nontoken-nfa ] } - { CHAR: t [ [ CHAR: \t = ] make-nontoken-nfa ] } - { CHAR: n [ [ CHAR: \n = ] make-nontoken-nfa ] } - { CHAR: r [ [ CHAR: \r = ] make-nontoken-nfa ] } - { CHAR: f [ [ HEX: c = ] make-nontoken-nfa ] } - { CHAR: a [ [ HEX: 7 = ] make-nontoken-nfa ] } - { CHAR: e [ [ HEX: 1b = ] make-nontoken-nfa ] } +TUPLE: character-class members ; +TUPLE: character-class-range from to ; +TUPLE: negated-character-class < character-class ; +TUPLE: negated-character-class-range < character-class-range ; +TUPLE: intersection-class < character-class ; +TUPLE: negated-intersection-class < intersection-class ; - { CHAR: d [ [ digit? ] make-nontoken-nfa ] } - { CHAR: D [ [ digit? not ] make-nontoken-nfa ] } - { CHAR: s [ [ java-blank? ] make-nontoken-nfa ] } - { CHAR: S [ [ java-blank? not ] make-nontoken-nfa ] } - { CHAR: w [ [ c-identifier-char? ] make-nontoken-nfa ] } - { CHAR: W [ [ c-identifier-char? not ] make-nontoken-nfa ] } +GENERIC: character-class-contains? ( obj character-class -- ? ) - { CHAR: p [ parse-posix-class make-nontoken-nfa ] } - { CHAR: P [ parse-posix-class [ not ] compose make-nontoken-nfa ] } +: parse-escaped-until ( -- seq ) + [ get-char CHAR: \ = get-next CHAR: E = and ] take-until + next ; + +: character-class-predicate ( seq -- quot ) + boa '[ , character-class-contains? ] ; + +ERROR: unmatched-escape-sequence ; + +: (parse-escaped) ( regexp ? ch -- obj ) + { + { CHAR: \ [ [ CHAR: \ = ] ] } + { CHAR: t [ [ CHAR: \t = ] ] } + { CHAR: n [ [ CHAR: \n = ] ] } + { CHAR: r [ [ CHAR: \r = ] ] } + { CHAR: f [ [ HEX: c = ] ] } + { CHAR: a [ [ HEX: 7 = ] ] } + { CHAR: e [ [ HEX: 1b = ] ] } + + { CHAR: d [ [ digit? ] ] } + { CHAR: D [ [ digit? not ] ] } + { CHAR: s [ [ java-blank? ] ] } + { CHAR: S [ [ java-blank? not ] ] } + { CHAR: w [ [ c-identifier-char? ] ] } + { CHAR: W [ [ c-identifier-char? not ] ] } + + { CHAR: p [ parse-posix-class ] } + { CHAR: P [ parse-posix-class [ not ] compose ] } { CHAR: x [ parse-short-hex ] } { CHAR: u [ parse-long-hex ] } { CHAR: 0 [ parse-octal ] } { CHAR: c [ parse-control-character ] } - ! { CHAR: Q [ quot til \E ] } - ! { CHAR: E [ should be an error, parse this in the Q if exists ] } + ! { CHAR: Q [ next parse-escaped-until ] } + ! { CHAR: E [ unmatched-escape-sequence ] } ! { CHAR: b [ ] } ! a word boundary ! { CHAR: B [ ] } ! a non-word boundary @@ -332,34 +349,57 @@ ERROR: bad-hex number ; ! { CHAR: G [ ] } ! end of previous match ! { CHAR: Z [ ] } ! end of input but for the final terminator, if any ! { CHAR: z [ ] } ! end of the input - [ dup digit? [ parse-backreference ] [ make-nontoken-nfa ] if ] + [ ] } case ; +: parse-escaped ( regexp -- ) + next get-char (parse-escaped) push-single-nfa ; + : handle-dash ( vector -- vector ) [ dup dash eq? [ drop CHAR: - ] when ] map ; +M: object character-class-contains? ( obj1 obj2 -- ? ) + = ; + +M: callable character-class-contains? ( obj1 callable -- ? ) + call ; + +M: character-class character-class-contains? ( obj cc -- ? ) + members>> [ character-class-contains? ] with find drop >boolean ; + +M: negated-character-class character-class-contains? ( obj cc -- ? ) + call-next-method not ; + +M: character-class-range character-class-contains? ( obj cc -- ? ) + [ from>> ] [ to>> ] bi between? ; + +M: negated-character-class-range character-class-contains? ( obj cc -- ? ) + call-next-method not ; + +M: intersection-class character-class-contains? ( obj cc -- ? ) + members>> [ character-class-contains? not ] with find drop not ; + +M: negated-intersection-class character-class-contains? ( obj cc -- ? ) + call-next-method not ; + ERROR: unmatched-negated-character-class class ; -: handle-caret ( vector -- vector ? ) +: handle-caret ( obj -- seq class ) dup [ length 2 >= ] [ first caret eq? ] bi and [ - rest t + rest negated-character-class ] [ - f + character-class ] if ; : make-character-class ( regexp -- ) left-bracket over stack>> cut-stack pick (>>stack) - handle-dash - handle-caret - >r [ dup number? [ '[ dup , = ] ] when ] map - [ [ drop t ] 2array ] map [ drop f ] suffix [ cond ] curry r> - [ [ not ] compose ] when - make-nontoken-nfa ; + handle-dash handle-caret + character-class-predicate push-single-nfa ; : apply-dash ( regexp -- ) stack>> dup [ pop ] [ pop* ] [ pop ] tri - swap '[ dup , , between? ] swap push ; + swap character-class-range boa swap push ; : apply-dash? ( regexp -- ? ) stack>> dup length 3 >= @@ -371,7 +411,7 @@ DEFER: parse-character-class next get-char { { CHAR: [ [ - [ 1+ ] change-bracket-count left-bracket push-stack + [ 1+ ] change-bracket-count dup left-bracket push-stack parse-character-class ] } { CHAR: ] [ @@ -381,7 +421,7 @@ DEFER: parse-character-class { CHAR: - [ dash push-stack ] } ! { CHAR: & [ ampersand push-stack ] } ! { CHAR: : [ semicolon push-stack ] } - { CHAR: \ [ parse-escaped ] } + { CHAR: \ [ next get-char (parse-escaped) push-stack ] } { f [ unbalanced-brackets ] } [ dupd push-stack dup apply-dash? [ apply-dash ] [ drop ] if ] } case @@ -393,7 +433,7 @@ DEFER: parse-character-class : parse-character-class-second ( regexp -- ) get-next { - ! { CHAR: [ [ CHAR: [ push-stack next ] } + { CHAR: [ [ CHAR: [ push-stack next ] } { CHAR: ] [ CHAR: ] push-stack next ] } { CHAR: - [ CHAR: - push-stack next ] } [ 2drop ] @@ -403,7 +443,7 @@ DEFER: parse-character-class get-next { { CHAR: ^ [ caret dupd push-stack next parse-character-class-second ] } - ! { CHAR: [ [ CHAR: [ push-stack next ] } + { CHAR: [ [ CHAR: [ push-stack next ] } { CHAR: ] [ CHAR: ] push-stack next ] } { CHAR: - [ CHAR: - push-stack next ] } [ 2drop ] @@ -442,7 +482,7 @@ ERROR: unsupported-token token ; [ set-start-state ] } cleave ] } - [ drop make-nontoken-nfa ] + [ drop push-single-nfa ] } case ; : (parse-raw-regexp) ( regexp -- ) @@ -639,7 +679,7 @@ TUPLE: dfa-traverser dupd match [ [ length ] [ range-length 1- ] bi* = ] [ drop f ] if* ; : match-head ( string regexp -- end ) - match length>> ; + match length>> 1- ; ! character classes ! TUPLE: range-class from to ; @@ -647,5 +687,15 @@ TUPLE: dfa-traverser ! (?:a|b)* <- does not capture ! (a|b)*\1 <- group captured -! (?!abba) negative lookahead matches ababa but not abbaa +! doesn't advance the current position: ! (?=abba) positive lookahead matches abbaaa but not abaaa +! (?!abba) negative lookahead matches ababa but not abbaa +! look behind. "lookaround" + +! : $ ( n -- obj ) groups get nth ; +! [ + ! groups bound to scope here +! ] [ + ! error or something +! ] if-match +! match in a string with .*foo.* From 9650b46688af847598f8e22cb38fe155c36e7a27 Mon Sep 17 00:00:00 2001 From: Eduardo Cavazos Date: Fri, 23 May 2008 00:11:44 -0500 Subject: [PATCH 2/7] newfx: few more words for dns --- extra/newfx/newfx.factor | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/extra/newfx/newfx.factor b/extra/newfx/newfx.factor index 2b2f916aea..abe0449d06 100644 --- a/extra/newfx/newfx.factor +++ b/extra/newfx/newfx.factor @@ -160,6 +160,16 @@ METHOD: as-mutate { object object assoc } set-at ; ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +: subseq ( seq from to -- subseq ) rot sequences:subseq ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: key ( table val -- key ) swap assocs:value-at ; + +: key-of ( val table -- key ) assocs:value-at ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + : 1st 0 at ; : 2nd 1 at ; : 3rd 2 at ; From 9c569034f63ac06558b53608cbb703358e815627 Mon Sep 17 00:00:00 2001 From: Eduardo Cavazos Date: Fri, 23 May 2008 00:12:01 -0500 Subject: [PATCH 3/7] factor.el: couple of font-lock words --- misc/factor.el | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/misc/factor.el b/misc/factor.el index 7513c3640d..9d90fb68f9 100644 --- a/misc/factor.el +++ b/misc/factor.el @@ -82,6 +82,7 @@ ("^!.*$" . font-lock-comment-face) (" !.*$" . font-lock-comment-face) ("( .* )" . font-lock-comment-face) + "BIN:" "MAIN:" "IN:" "USING:" "TUPLE:" "^C:" "^M:" "METHOD:" @@ -89,7 +90,9 @@ "REQUIRES:" "GENERIC:" "GENERIC#" "SYMBOL:" "PREDICATE:" "VAR:" "VARS:" "C-STRUCT:" - "C-UNION:" "" "MACRO:" "MACRO::" "DEFER:" "TYPEDEF:")) + "C-UNION:" "" "MACRO:" "MACRO::" "DEFER:" "TYPEDEF:" + "SYMBOLS:" +)) (defun factor-mode () "A mode for editing programs written in the Factor programming language." From ff553f6aa0a67039ff2f3a1822d265e83f3b7237 Mon Sep 17 00:00:00 2001 From: Eduardo Cavazos Date: Fri, 23 May 2008 00:12:38 -0500 Subject: [PATCH 4/7] New vocabulary: dns --- extra/dns/cache/cache.factor | 142 +++++++++++ extra/dns/dns.factor | 462 +++++++++++++++++++++++++++++++++++ 2 files changed, 604 insertions(+) create mode 100644 extra/dns/cache/cache.factor create mode 100644 extra/dns/dns.factor diff --git a/extra/dns/cache/cache.factor b/extra/dns/cache/cache.factor new file mode 100644 index 0000000000..e497192b04 --- /dev/null +++ b/extra/dns/cache/cache.factor @@ -0,0 +1,142 @@ + +USING: kernel system + combinators + vectors sequences assocs + math math.functions + prettyprint unicode.case + accessors + combinators.cleave + newfx + dns ; + +IN: dns.cache + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: cache ( -- table ) H{ } ; + +! key: 'name type class' (as string) +! val: entry + +TUPLE: entry time data ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: query->key ( query -- key ) + { [ name>> >lower ] [ type>> unparse ] [ class>> unparse ] } " " join ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: table-get ( query -- result ) query->key cache of ; + +: table-check ( query -- ? ) query->key cache key? ; + +: table-add ( query value -- ) [ query->key ] [ ] bi* cache at-mutate ; + +: table-rem ( query -- ) query->key cache delete-key-of drop ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: now ( -- seconds ) millis 1000.0 / round >integer ; + +: ttl->time ( ttl -- seconds ) now + ; + +: time->ttl ( time -- ttl ) now - ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +SYMBOL: NX + +: cache-nx ( query ttl -- ) + ttl->time NX entry boa + table-add ; + +: nx? ( obj -- ? ) + dup entry? + [ data>> NX = ] + [ drop f ] + if ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: query->rr ( query -- rr ) [ name>> ] [ type>> ] [ class>> ] tri f f rr boa ; + +: query+entry->rrs ( query entry -- rrs ) + swap ! entry query + query->rr ! entry rr + over ! entry rr entry + time>> time->ttl >>ttl ! entry rr + swap ! rr entry + data>> [ >r dup clone r> >>rdata ] map + nip ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: entry-expired? ( entry -- ? ) time>> time->ttl 0 <= ; + +: cache-get ( query -- result ) + dup table-get ! query result + { + { + [ dup f = ] ! not in the cache + [ 2drop f ] + } + { + [ dup entry-expired? ] ! here but expired + [ drop table-rem f ] + } + { + [ dup nx? ] ! negative result has been cached + [ 2drop NX ] + } + { + [ t ] + [ query+entry->rrs ] + } + } + cond ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: rr->entry ( rr -- entry ) + [ ttl>> ttl->time ] [ rdata>> {1} >vector ] bi entry boa ; + +: maybe-pushed-on ( obj seq -- ) + 2dup member-of? + [ 2drop ] + [ pushed-on ] + if ; + +: add-rr-to-entry ( rr entry -- ) + over ttl>> ttl->time >>time + [ rdata>> ] [ data>> ] bi* maybe-pushed-on ; + +: cache-add ( query rr -- ) + over table-get ! query rr entry + { + { + [ dup f = ] ! not in the cache + [ drop rr->entry table-add ] + } + { + [ dup nx? ] + [ drop over table-rem rr->entry table-add ] + } + { + [ dup entry-expired? ] + [ drop rr->entry table-add ] + } + { + [ t ] + [ rot drop add-rr-to-entry ] + } + } + cond ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: rr->query ( rr -- query ) [ name>> ] [ type>> ] [ class>> ] tri query boa ; + +: cache-add-rr ( rr -- ) [ rr->query ] [ ] bi cache-add ; + +: cache-add-rrs ( rrs -- ) [ cache-add-rr ] each ; diff --git a/extra/dns/dns.factor b/extra/dns/dns.factor new file mode 100644 index 0000000000..560db69bb2 --- /dev/null +++ b/extra/dns/dns.factor @@ -0,0 +1,462 @@ + +USING: kernel byte-arrays combinators strings arrays sequences splitting + math math.functions math.parser random + destructors + io io.binary io.sockets io.encodings.binary + accessors + combinators.cleave + newfx + symbols + ; + +IN: dns + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +TUPLE: query name type class ; + +TUPLE: rr name type class ttl rdata ; + +TUPLE: hinfo cpu os ; + +TUPLE: mx preference exchange ; + +TUPLE: soa mname rname serial refresh retry expire minimum ; + +TUPLE: message + id qr opcode aa tc rd ra z rcode + question-section + answer-section + authority-section + additional-section ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: random-id ( -- id ) 2 16 ^ random ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! TYPE +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +SYMBOLS: A NS MD MF CNAME SOA MB MG MR NULL WKS PTR HINFO MINFO MX TXT ; + +: type-table ( -- table ) + { + { A 1 } + { NS 2 } + { MD 3 } + { MF 4 } + { CNAME 5 } + { SOA 6 } + { MB 7 } + { MG 8 } + { MR 9 } + { NULL 10 } + { WKS 11 } + { PTR 12 } + { HINFO 13 } + { MINFO 14 } + { MX 15 } + { TXT 16 } + } ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! CLASS +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +SYMBOLS: IN CS CH HS ; + +: class-table ( -- table ) + { + { IN 1 } + { CS 2 } + { CH 3 } + { HS 4 } + } ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! OPCODE +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +SYMBOLS: QUERY IQUERY STATUS ; + +: opcode-table ( -- table ) + { + { QUERY 0 } + { IQUERY 1 } + { STATUS 2 } + } ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! RCODE +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +SYMBOLS: NO-ERROR FORMAT-ERROR SERVER-FAILURE NAME-ERROR NOT-IMPLEMENTED + REFUSED ; + +: rcode-table ( -- table ) + { + { NO-ERROR 0 } + { FORMAT-ERROR 1 } + { SERVER-FAILURE 2 } + { NAME-ERROR 3 } + { NOT-IMPLEMENTED 4 } + { REFUSED 5 } + } ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: ( -- message ) + message new + random-id >>id + 0 >>qr + QUERY >>opcode + 0 >>aa + 0 >>tc + 1 >>rd + 0 >>ra + 0 >>z + NO-ERROR >>rcode + { } >>question-section + { } >>answer-section + { } >>authority-section + { } >>additional-section ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: ip->ba ( ip -- ba ) "." split [ string>number ] map >byte-array ; + +: label->ba ( label -- ba ) [ >byte-array ] [ length ] bi prefix ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: uint8->ba ( n -- ba ) 1 >be ; +: uint16->ba ( n -- ba ) 2 >be ; +: uint32->ba ( n -- ba ) 4 >be ; +: uint64->ba ( n -- ba ) 8 >be ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: dn->ba ( dn -- ba ) "." split [ label->ba ] map concat ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: query->ba ( query -- ba ) + { + [ name>> dn->ba ] + [ type>> type-table of uint16->ba ] + [ class>> class-table of uint16->ba ] + } + concat ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: hinfo->ba ( rdata -- ba ) + [ cpu>> label->ba ] + [ os>> label->ba ] + bi append ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: mx->ba ( rdata -- ba ) + [ preference>> uint16->ba ] + [ exchange>> dn->ba ] + bi append ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: soa->ba ( rdata -- ba ) + { + [ mname>> dn->ba ] + [ rname>> dn->ba ] + [ serial>> uint32->ba ] + [ refresh>> uint32->ba ] + [ retry>> uint32->ba ] + [ expire>> uint32->ba ] + [ minimum>> uint32->ba ] + } + concat ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: rdata->ba ( type rdata -- ba ) + swap + { + { CNAME [ dn->ba ] } + { HINFO [ hinfo->ba ] } + { MX [ mx->ba ] } + { NS [ dn->ba ] } + { PTR [ dn->ba ] } + { SOA [ soa->ba ] } + { A [ ip->ba ] } + } + case ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: rr->ba ( rr -- ba ) + { + [ name>> dn->ba ] + [ type>> type-table of uint16->ba ] + [ class>> class-table of uint16->ba ] + [ ttl>> uint32->ba ] + [ + [ type>> ] [ rdata>> ] bi rdata->ba + [ length uint16->ba ] [ ] bi append + ] + } + concat ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: header-bits-ba ( message -- ba ) + { + [ qr>> 15 shift ] + [ opcode>> opcode-table of 11 shift ] + [ aa>> 10 shift ] + [ tc>> 9 shift ] + [ rd>> 8 shift ] + [ ra>> 7 shift ] + [ z>> 4 shift ] + [ rcode>> rcode-table of 0 shift ] + } + sum uint16->ba ; + +: message->ba ( message -- ba ) + { + [ id>> uint16->ba ] + [ header-bits-ba ] + [ question-section>> length uint16->ba ] + [ answer-section>> length uint16->ba ] + [ authority-section>> length uint16->ba ] + [ additional-section>> length uint16->ba ] + [ question-section>> [ query->ba ] map concat ] + [ answer-section>> [ rr->ba ] map concat ] + [ authority-section>> [ rr->ba ] map concat ] + [ additional-section>> [ rr->ba ] map concat ] + } + concat ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: get-single ( ba i -- n ) at ; +: get-double ( ba i -- n ) dup 2 + subseq be> ; +: get-quad ( ba i -- n ) dup 4 + subseq be> ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: label-length ( ba i -- length ) get-single ; + +: skip-label ( ba i -- ba i ) 2dup label-length + 1 + ; + +: null-label? ( ba i -- ? ) get-single 0 = ; + +: get-label ( ba i -- label ) [ 1 + ] [ skip-label nip ] 2bi subseq >string ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: bit-test ( a b -- ? ) bitand 0 = not ; + +: pointer? ( ba i -- ? ) get-single BIN: 11000000 bit-test ; + +: pointer ( ba i -- val ) get-double BIN: 0011111111111111 bitand ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: skip-name ( ba i -- ba i ) + { + { [ 2dup null-label? ] [ 1 + ] } + { [ 2dup pointer? ] [ 2 + ] } + { [ t ] [ skip-label skip-name ] } + } + cond ; + +: get-name ( ba i -- name ) + { + { [ 2dup null-label? ] [ 2drop "" ] } + { [ 2dup pointer? ] [ dupd pointer get-name ] } + { + [ t ] + [ + [ get-label ] + [ skip-label get-name ] + 2bi + "." swap 3append + ] + } + } + cond ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: get-query ( ba i -- query ) + [ get-name ] + [ + skip-name + [ 0 + get-double type-table key-of ] + [ 2 + get-double class-table key-of ] + 2bi + ] + 2bi query boa ; + +: skip-query ( ba i -- ba i ) skip-name 4 + ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: get-mx ( ba i -- mx ) [ get-double ] [ 2 + get-double ] 2bi mx boa ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: get-soa ( ba i -- soa ) + { + [ get-name ] + [ skip-name get-name ] + [ + skip-name + skip-name + { + [ 0 + get-quad ] + [ 4 + get-quad ] + [ 8 + get-quad ] + [ 12 + get-quad ] + [ 16 + get-quad ] + } + 2cleave + ] + } + 2cleave soa boa ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: get-ip ( ba i -- ip ) dup 4 + subseq >array [ number>string ] map "." join ; + +: get-rdata ( ba i type -- rdata ) + { + { CNAME [ get-name ] } + { NS [ get-name ] } + { PTR [ get-name ] } + { MX [ get-mx ] } + { SOA [ get-soa ] } + { A [ get-ip ] } + } + case ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: get-rr ( ba i -- rr ) + [ get-name ] + [ + skip-name + { + [ 0 + get-double type-table key-of ] + [ 2 + get-double class-table key-of ] + [ 4 + get-quad ] + [ [ 10 + ] [ get-double type-table key-of ] 2bi get-rdata ] + } + 2cleave + ] + 2bi rr boa ; + +: skip-rr ( ba i -- ba i ) skip-name 8 + 2dup get-double + 2 + ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: get-question-section ( ba i count -- seq ba i ) + [ drop [ skip-query ] [ get-query ] 2bi ] map -rot ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: get-rr-section ( ba i count -- seq ba i ) + [ drop [ skip-rr ] [ get-rr ] 2bi ] map -rot ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: >> neg shift ; + +: get-header-bits ( ba i -- qr opcode aa tc rd ra z rcode ) + get-double + { + [ 15 >> BIN: 1 bitand ] + [ 11 >> BIN: 111 bitand opcode-table key-of ] + [ 10 >> BIN: 1 bitand ] + [ 9 >> BIN: 1 bitand ] + [ 8 >> BIN: 1 bitand ] + [ 7 >> BIN: 1 bitand ] + [ 4 >> BIN: 111 bitand ] + [ BIN: 1111 bitand rcode-table key-of ] + } + cleave ; + +: parse-message ( ba -- message ) + 0 + { + [ get-double ] + [ 2 + get-header-bits ] + [ + 4 + + { + [ 8 + ] + [ 0 + get-double ] + [ 2 + get-double ] + [ 4 + get-double ] + [ 6 + get-double ] + } + 2cleave + >r >r >r + get-question-section r> + get-rr-section r> + get-rr-section r> + get-rr-section + 2drop + ] + } + 2cleave message boa ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: send-receive-udp ( ba server -- ba ) + f 0 + [ + [ send ] [ receive drop ] bi + ] + with-disposal ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: send-receive-tcp ( ba server -- ba ) + [ dup length 2 >be prepend ] [ ] bi* + binary + [ + write flush + 2 read be> read + ] + with-client ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: >dns-inet4 ( obj -- inet4 ) + dup string? + [ 53 ] + [ ] + if ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: ask-server ( message server -- message ) + [ message->ba ] [ >dns-inet4 ] bi* + 2dup + send-receive-udp parse-message + dup tc>> 1 = + [ drop send-receive-tcp parse-message ] + [ nip nip ] + if ; + +! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +: dns-servers ( -- seq ) V{ } ; + +: dns-server ( -- server ) dns-servers random ; + +: ask ( message -- message ) dns-server ask-server ; + +: ( query -- message ) swap {1} >>question-section ; \ No newline at end of file From 6386de6c2a306dad99e67b89ce1d8c6bf27a6b1b Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Fri, 23 May 2008 16:30:19 -0500 Subject: [PATCH 5/7] move regexp4 to unmaintained...regexp5 is coming soon --- unmaintained/regexp4/regexp4-tests.factor | 244 ++++++++ unmaintained/regexp4/regexp4.factor | 712 ++++++++++++++++++++++ 2 files changed, 956 insertions(+) create mode 100644 unmaintained/regexp4/regexp4-tests.factor create mode 100644 unmaintained/regexp4/regexp4.factor diff --git a/unmaintained/regexp4/regexp4-tests.factor b/unmaintained/regexp4/regexp4-tests.factor new file mode 100644 index 0000000000..c941d0fb75 --- /dev/null +++ b/unmaintained/regexp4/regexp4-tests.factor @@ -0,0 +1,244 @@ +USING: regexp4 tools.test kernel ; +IN: regexp4-tests + +[ f ] [ "b" "a*" matches? ] unit-test +[ t ] [ "" "a*" matches? ] unit-test +[ t ] [ "a" "a*" matches? ] unit-test +[ t ] [ "aaaaaaa" "a*" matches? ] unit-test +[ f ] [ "ab" "a*" matches? ] unit-test + +[ t ] [ "abc" "abc" matches? ] unit-test +[ t ] [ "a" "a|b|c" matches? ] unit-test +[ t ] [ "b" "a|b|c" matches? ] unit-test +[ t ] [ "c" "a|b|c" matches? ] unit-test +[ f ] [ "c" "d|e|f" matches? ] unit-test + +[ f ] [ "aa" "a|b|c" matches? ] unit-test +[ f ] [ "bb" "a|b|c" matches? ] unit-test +[ f ] [ "cc" "a|b|c" matches? ] unit-test +[ f ] [ "cc" "d|e|f" matches? ] unit-test + +[ f ] [ "" "a+" matches? ] unit-test +[ t ] [ "a" "a+" matches? ] unit-test +[ t ] [ "aa" "a+" matches? ] unit-test + +[ t ] [ "" "a?" matches? ] unit-test +[ t ] [ "a" "a?" matches? ] unit-test +[ f ] [ "aa" "a?" matches? ] unit-test + +[ f ] [ "" "." matches? ] unit-test +[ t ] [ "a" "." matches? ] unit-test +[ t ] [ "." "." matches? ] unit-test +! [ f ] [ "\n" "." matches? ] unit-test + +[ f ] [ "" ".+" matches? ] unit-test +[ t ] [ "a" ".+" matches? ] unit-test +[ t ] [ "ab" ".+" matches? ] unit-test + + +[ t ] [ "" "a|b*|c+|d?" matches? ] unit-test +[ t ] [ "a" "a|b*|c+|d?" matches? ] unit-test +[ t ] [ "c" "a|b*|c+|d?" matches? ] unit-test +[ t ] [ "cc" "a|b*|c+|d?" matches? ] unit-test +[ f ] [ "ccd" "a|b*|c+|d?" matches? ] unit-test +[ t ] [ "d" "a|b*|c+|d?" matches? ] unit-test + +[ t ] [ "foo" "foo|bar" matches? ] unit-test +[ t ] [ "bar" "foo|bar" matches? ] unit-test +[ f ] [ "foobar" "foo|bar" matches? ] unit-test + +[ f ] [ "" "(a)" matches? ] unit-test +[ t ] [ "a" "(a)" matches? ] unit-test +[ f ] [ "aa" "(a)" matches? ] unit-test +[ t ] [ "aa" "(a*)" matches? ] unit-test + +[ f ] [ "aababaaabbac" "(a|b)+" matches? ] unit-test +[ t ] [ "ababaaabba" "(a|b)+" matches? ] unit-test + +[ f ] [ "" "a{1}" matches? ] unit-test +[ t ] [ "a" "a{1}" matches? ] unit-test +[ f ] [ "aa" "a{1}" matches? ] unit-test + +[ f ] [ "a" "a{2,}" matches? ] unit-test +[ t ] [ "aaa" "a{2,}" matches? ] unit-test +[ t ] [ "aaaa" "a{2,}" matches? ] unit-test +[ t ] [ "aaaaa" "a{2,}" matches? ] unit-test + +[ t ] [ "" "a{,2}" matches? ] unit-test +[ t ] [ "a" "a{,2}" matches? ] unit-test +[ t ] [ "aa" "a{,2}" matches? ] unit-test +[ f ] [ "aaa" "a{,2}" matches? ] unit-test +[ f ] [ "aaaa" "a{,2}" matches? ] unit-test +[ f ] [ "aaaaa" "a{,2}" matches? ] unit-test + +[ f ] [ "" "a{1,3}" matches? ] unit-test +[ t ] [ "a" "a{1,3}" matches? ] unit-test +[ t ] [ "aa" "a{1,3}" matches? ] unit-test +[ t ] [ "aaa" "a{1,3}" matches? ] unit-test +[ f ] [ "aaaa" "a{1,3}" matches? ] unit-test + +[ f ] [ "" "[a]" matches? ] unit-test +[ t ] [ "a" "[a]" matches? ] unit-test +[ t ] [ "a" "[abc]" matches? ] unit-test +[ f ] [ "b" "[a]" matches? ] unit-test +[ f ] [ "d" "[abc]" matches? ] unit-test +[ t ] [ "ab" "[abc]{1,2}" matches? ] unit-test +[ f ] [ "abc" "[abc]{1,2}" matches? ] unit-test + +[ f ] [ "" "[^a]" matches? ] unit-test +[ f ] [ "a" "[^a]" matches? ] unit-test +[ f ] [ "a" "[^abc]" matches? ] unit-test +[ t ] [ "b" "[^a]" matches? ] unit-test +[ t ] [ "d" "[^abc]" matches? ] unit-test +[ f ] [ "ab" "[^abc]{1,2}" matches? ] unit-test +[ f ] [ "abc" "[^abc]{1,2}" matches? ] unit-test + +[ t ] [ "]" "[]]" matches? ] unit-test +[ f ] [ "]" "[^]]" matches? ] unit-test +[ t ] [ "a" "[^]]" matches? ] unit-test + +[ t ] [ "^" "[^]" matches? ] must-fail +[ t ] [ "^" "[]^]" matches? ] unit-test +[ t ] [ "]" "[]^]" matches? ] unit-test + +[ t ] [ "[" "[[]" matches? ] unit-test +[ f ] [ "^" "[^^]" matches? ] unit-test +[ t ] [ "a" "[^^]" matches? ] unit-test + +[ t ] [ "-" "[-]" matches? ] unit-test +[ f ] [ "a" "[-]" matches? ] unit-test +[ f ] [ "-" "[^-]" matches? ] unit-test +[ t ] [ "a" "[^-]" matches? ] unit-test + +[ t ] [ "-" "[-a]" matches? ] unit-test +[ t ] [ "a" "[-a]" matches? ] unit-test +[ t ] [ "-" "[a-]" matches? ] unit-test +[ t ] [ "a" "[a-]" matches? ] unit-test +[ f ] [ "b" "[a-]" matches? ] unit-test +[ f ] [ "-" "[^-]" matches? ] unit-test +[ t ] [ "a" "[^-]" matches? ] unit-test + +[ f ] [ "-" "[a-c]" matches? ] unit-test +[ t ] [ "-" "[^a-c]" matches? ] unit-test +[ t ] [ "b" "[a-c]" matches? ] unit-test +[ f ] [ "b" "[^a-c]" matches? ] unit-test + +[ t ] [ "-" "[a-c-]" matches? ] unit-test +[ f ] [ "-" "[^a-c-]" matches? ] unit-test + +[ t ] [ "\\" "[\\\\]" matches? ] unit-test +[ f ] [ "a" "[\\\\]" matches? ] unit-test +[ f ] [ "\\" "[^\\\\]" matches? ] unit-test +[ t ] [ "a" "[^\\\\]" matches? ] unit-test + +[ t ] [ "0" "[\\d]" matches? ] unit-test +[ f ] [ "a" "[\\d]" matches? ] unit-test +[ f ] [ "0" "[^\\d]" matches? ] unit-test +[ t ] [ "a" "[^\\d]" matches? ] unit-test + +[ t ] [ "a" "[a-z]{1,}|[A-Z]{2,4}|b*|c|(f|g)*" matches? ] unit-test +[ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}|b*|c|(f|g)*" matches? ] unit-test +[ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}" matches? ] unit-test + +[ t ] [ "1000" "\\d{4,6}" matches? ] unit-test +[ t ] [ "1000" "[0-9]{4,6}" matches? ] unit-test + +[ t ] [ "abc" "\\p{Lower}{3}" matches? ] unit-test +[ f ] [ "ABC" "\\p{Lower}{3}" matches? ] unit-test +[ t ] [ "ABC" "\\p{Upper}{3}" matches? ] unit-test +[ f ] [ "abc" "\\p{Upper}{3}" matches? ] unit-test + +[ f ] [ "abc" "[\\p{Upper}]{3}" matches? ] unit-test +[ t ] [ "ABC" "[\\p{Upper}]{3}" matches? ] unit-test + +[ t ] [ "" "\\Q\\E" matches? ] unit-test +[ f ] [ "a" "\\Q\\E" matches? ] unit-test +[ t ] [ "|*+" "\\Q|*+\\E" matches? ] unit-test +[ f ] [ "abc" "\\Q|*+\\E" matches? ] unit-test +[ t ] [ "s" "\\Qs\\E" matches? ] unit-test + +[ t ] [ "S" "\\0123" matches? ] unit-test +[ t ] [ "SXY" "\\0123XY" matches? ] unit-test +[ t ] [ "x" "\\x78" matches? ] unit-test +[ f ] [ "y" "\\x78" matches? ] unit-test +[ t ] [ "x" "\\u000078" matches? ] unit-test +[ f ] [ "y" "\\u000078" matches? ] unit-test + +[ t ] [ "ab" "a+b" matches? ] unit-test +[ f ] [ "b" "a+b" matches? ] unit-test +[ t ] [ "aab" "a+b" matches? ] unit-test +[ f ] [ "abb" "a+b" matches? ] unit-test + +[ t ] [ "abbbb" "ab*" matches? ] unit-test +[ t ] [ "a" "ab*" matches? ] unit-test +[ f ] [ "abab" "ab*" matches? ] unit-test + +[ f ] [ "x" "\\." matches? ] unit-test +[ t ] [ "." "\\." matches? ] unit-test + +[ t ] [ "aaaab" "a+ab" matches? ] unit-test +[ f ] [ "aaaxb" "a+ab" matches? ] unit-test +[ t ] [ "aaacb" "a+cb" matches? ] unit-test +[ f ] [ "aaaab" "a++ab" matches? ] unit-test +[ t ] [ "aaacb" "a++cb" matches? ] unit-test + +[ 3 ] [ "aaacb" "a*" match-head ] unit-test +[ 1 ] [ "aaacb" "a+?" match-head ] unit-test +[ 2 ] [ "aaacb" "aa?" match-head ] unit-test +[ 1 ] [ "aaacb" "aa??" match-head ] unit-test +[ 3 ] [ "aacb" "aa?c" match-head ] unit-test +[ 3 ] [ "aacb" "aa??c" match-head ] unit-test + +! [ t ] [ "aaa" "AAA" t matches? ] unit-test +! [ f ] [ "aax" "AAA" t matches? ] unit-test +! [ t ] [ "aaa" "A*" t matches? ] unit-test +! [ f ] [ "aaba" "A*" t matches? ] unit-test +! [ t ] [ "b" "[AB]" t matches? ] unit-test +! [ f ] [ "c" "[AB]" t matches? ] unit-test +! [ t ] [ "c" "[A-Z]" t matches? ] unit-test +! [ f ] [ "3" "[A-Z]" t matches? ] unit-test + +[ ] [ + "(0[lL]?|[1-9]\\d{0,9}(\\d{0,9}[lL])?|0[xX]\\p{XDigit}{1,8}(\\p{XDigit}{0,8}[lL])?|0[0-7]{1,11}([0-7]{0,11}[lL])?|([0-9]+\\.[0-9]*|\\.[0-9]+)([eE][+-]?[0-9]+)?[fFdD]?|[0-9]+([eE][+-]?[0-9]+[fFdD]?|([eE][+-]?[0-9]+)?[fFdD]))" + drop +] unit-test + +[ t ] [ "fxxbar" "(?!foo).{3}bar" matches? ] unit-test +[ f ] [ "foobar" "(?!foo).{3}bar" matches? ] unit-test + +! [ 3 ] [ "foobar" "foo(?=bar)" match-head ] unit-test +! [ f ] [ "foobxr" "foo(?=bar)" match-head ] unit-test + +! [ f ] [ "foobxr" "foo\\z" match-head ] unit-test +! [ 3 ] [ "foo" "foo\\z" match-head ] unit-test + +! [ 3 ] [ "foo bar" "foo\\b" match-head ] unit-test +! [ f ] [ "fooxbar" "foo\\b" matches? ] unit-test +! [ t ] [ "foo" "foo\\b" matches? ] unit-test +! [ t ] [ "foo bar" "foo\\b bar" matches? ] unit-test +! [ f ] [ "fooxbar" "foo\\bxbar" matches? ] unit-test +! [ f ] [ "foo" "foo\\bbar" matches? ] unit-test + +! [ f ] [ "foo bar" "foo\\B" matches? ] unit-test +! [ 3 ] [ "fooxbar" "foo\\B" match-head ] unit-test +! [ t ] [ "foo" "foo\\B" matches? ] unit-test +! [ f ] [ "foo bar" "foo\\B bar" matches? ] unit-test +! [ t ] [ "fooxbar" "foo\\Bxbar" matches? ] unit-test +! [ f ] [ "foo" "foo\\Bbar" matches? ] unit-test + +! [ t ] [ "s@f" "[a-z.-]@[a-z]" matches? ] unit-test +! [ f ] [ "a" "[a-z.-]@[a-z]" matches? ] unit-test +! [ t ] [ ".o" "\\.[a-z]" matches? ] unit-test + +! Bug in parsing word +[ t ] [ + "a" + R' a' + matches? +] unit-test + +! ((A)(B(C))) +! 1. ((A)(B(C))) +! 2. (A) +! 3. (B(C)) +! 4. (C) diff --git a/unmaintained/regexp4/regexp4.factor b/unmaintained/regexp4/regexp4.factor new file mode 100644 index 0000000000..fac6673e82 --- /dev/null +++ b/unmaintained/regexp4/regexp4.factor @@ -0,0 +1,712 @@ +! Copyright (C) 2008 Doug Coleman. +! See http://factorcode.org/license.txt for BSD license. +USING: accessors arrays assocs combinators kernel math +sequences namespaces locals combinators.lib state-tables +math.parser state-parser sets dlists unicode.categories +math.order quotations shuffle math.ranges splitting +symbols fry parser math.ranges inspector strings ; +IN: regexp4 + +SYMBOLS: eps start-state final-state beginning-of-text +end-of-text left-parenthesis alternation left-bracket +caret dash ampersand colon ; + +SYMBOL: runtime-epsilon + +TUPLE: regexp raw parentheses-count bracket-count +state stack nfa new-states dfa minimized-dfa +dot-matches-newlines? capture-group captured-groups ; + +TUPLE: capture-group n range ; + +ERROR: parentheses-underflow ; +ERROR: unbalanced-parentheses ; +ERROR: unbalanced-brackets ; + +: push-stack ( regexp token -- ) swap stack>> push ; +: push-all-stack ( regexp seq -- ) swap stack>> push-all ; +: next-state ( regexp -- n ) [ 1+ ] change-state state>> ; + +: check-parentheses-underflow ( regexp -- ) + parentheses-count>> 0 < [ parentheses-underflow ] when ; + +: check-unbalanced-parentheses ( regexp -- ) + parentheses-count>> 0 > [ unbalanced-parentheses ] when ; + +:: (apply-alternation) ( stack regexp -- ) + [let | s2 [ stack peek first ] + s3 [ stack pop second ] + s0 [ stack peek alternation = [ stack pop* ] when stack peek first ] + s1 [ stack pop second ] + s4 [ regexp next-state ] + s5 [ regexp next-state ] + table [ regexp nfa>> ] | + s5 table add-row + s4 eps s0 table add-entry + s4 eps s2 table add-entry + s1 eps s5 table add-entry + s3 eps s5 table add-entry + s1 table final-states>> delete-at + s3 table final-states>> delete-at + t s5 table final-states>> set-at + s4 s5 2array stack push ] ; + +: apply-alternation ( regexp -- ) + [ stack>> ] [ (apply-alternation) ] bi ; + +: apply-alternation? ( stack -- ? ) + dup length dup 3 < + [ 2drop f ] [ 2 - swap nth alternation = ] if ; + +:: (apply-concatenation) ( stack regexp -- ) + [let* | + s2 [ stack peek first ] + s3 [ stack pop second ] + s0 [ stack peek first ] + s1 [ stack pop second ] + table [ regexp nfa>> ] | + s1 eps s2 table set-entry + s1 table final-states>> delete-at + s3 table add-row + s0 s3 2array stack push ] ; + +: apply-concatenation ( regexp -- ) + [ stack>> ] [ (apply-concatenation) ] bi ; + +: apply-concatenation? ( seq -- ? ) + dup length dup 2 < + [ 2drop f ] [ 2 - swap nth array? ] if ; + +: apply-loop ( seq regexp -- seq regexp ) + over length 1 > [ + 2dup over apply-alternation? + [ (apply-alternation) ] [ (apply-concatenation) ] if apply-loop + ] when ; + +: cut-out ( vector n -- vector' vector ) cut rest ; + +: cut-stack ( obj vector -- vector' vector ) + tuck last-index cut-out swap ; + +: apply-til-last ( regexp token -- ) + swap [ cut-stack ] change-stack + apply-loop stack>> push-all ; + +: concatenation-loop ( regexp -- ) + dup stack>> dup apply-concatenation? + [ over (apply-concatenation) concatenation-loop ] [ 2drop ] if ; + +:: apply-kleene-closure ( regexp -- ) + [let* | stack [ regexp stack>> ] + s0 [ stack peek first ] + s1 [ stack pop second ] + s2 [ regexp next-state ] + s3 [ regexp next-state ] + table [ regexp nfa>> ] | + s1 table final-states>> delete-at + t s3 table final-states>> set-at + s3 table add-row + s1 eps s0 table add-entry + s2 eps s0 table add-entry + s2 eps s3 table add-entry + s1 eps s3 table add-entry + s2 s3 2array stack push ] ; + +: add-numbers ( n obj -- obj ) + 2dup [ number? ] bi@ and + [ + ] [ dup sequence? [ [ + ] with map ] [ nip ] if ] if ; + +: increment-columns ( n assoc -- ) + dup [ >r swap >r add-numbers r> r> set-at ] curry with* assoc-each ; + +:: copy-state-rows ( regexp range -- ) + [let* | len [ range range-length ] + offset [ regexp state>> range range-min - 1+ ] + state [ regexp [ len + ] change-state ] | + regexp nfa>> rows>> + [ drop range member? ] assoc-filter + [ + [ offset + ] dip + [ offset swap add-numbers ] assoc-map + ] assoc-map + regexp nfa>> [ assoc-union ] change-rows drop + range [ range-min ] [ range-max ] bi [ offset + ] bi@ 2array + regexp stack>> push ] ; + +: last-state ( regexp -- range ) + stack>> peek first2 [a,b] ; + +: set-last-state-final ( ? regexp -- ) + [ stack>> peek second ] [ nfa>> final-states>> ] bi set-at ; + +: apply-plus-closure ( regexp -- ) + [ dup last-state copy-state-rows ] + [ apply-kleene-closure ] + [ apply-concatenation ] tri ; + +: apply-question-closure ( regexp -- ) + [ stack>> peek first2 eps swap ] [ nfa>> add-entry ] bi ; + +: with0 ( obj n quot -- n quot' ) swapd curry ; inline + + +: range>state ( range -- pair ) + [ from>> ] [ length>> ] bi over - 2array ; + +: copy-state ( regexp range n -- ) + dup zero? [ + drop range>state over stack>> push apply-question-closure + ] [ + [ copy-state-rows ] with0 with0 times + ] if ; + +:: (exactly-n) ( regexp state n -- ) + regexp state n copy-state + t regexp set-last-state-final ; + +: exactly-n ( regexp n -- ) + >r dup last-state r> 1- (exactly-n) ; + +: exactly-n-concatenated ( regexp state n -- ) +B + [ (exactly-n) ] 3keep + nip 1- [ apply-concatenation ] with0 times ; + +:: at-least-n ( regexp n -- ) + [let | state [ regexp stack>> pop first2 [a,b] ] | + regexp state n copy-state + state regexp stack>> push + regexp apply-kleene-closure ] ; + +: peek-last ( regexp -- range ) + stack>> peek first2 [a,b] ; + +: pop-last ( regexp -- range ) + stack>> pop first2 [a,b] ; + +:: at-most-n ( regexp n -- ) + [let | state [ regexp pop-last ] | + regexp state n [ 1+ exactly-n-concatenated ] with with each + regexp n 1- [ apply-alternation ] with0 times + regexp apply-question-closure ] ; + +:: from-m-to-n ( regexp m n -- ) + [let | state [ regexp pop-last ] | + regexp state + m n [a,b] [ exactly-n-concatenated ] with with each + regexp n m - [ apply-alternation ] with0 times ] ; + +: apply-brace-closure ( regexp from/f to/f comma? -- ) + [ + 2dup and + [ from-m-to-n ] + [ [ nip at-most-n ] [ at-least-n ] if* ] if + ] [ drop exactly-n ] if ; + +:: push-single-nfa ( regexp obj -- ) + [let | s0 [ regexp next-state ] + s1 [ regexp next-state ] + stack [ regexp stack>> ] + table [ regexp nfa>> ] | + s0 obj s1 table set-entry + s1 table add-row + t s1 table final-states>> set-at + s0 s1 2array stack push ] ; + +: set-start-state ( regexp -- ) + dup stack>> dup empty? [ + 2drop + ] [ + [ nfa>> ] [ pop first ] bi* >>start-state drop + ] if ; + +: ascii? ( n -- ? ) 0 HEX: 7f between? ; +: octal-digit? ( n -- ? ) CHAR: 0 CHAR: 7 between? ; +: decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ; + +: hex-digit? ( n -- ? ) + [ + [ dup decimal-digit? ] + [ dup CHAR: a CHAR: f between? ] + [ dup CHAR: A CHAR: F between? ] + ] || nip ; + +: control-char? ( n -- ? ) + [ + [ dup 0 HEX: 1f between? ] + [ dup HEX: 7f = ] + ] || nip ; + +: punct? ( n -- ? ) + "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ; + +: c-identifier-char? ( ch -- ? ) + [ [ dup alpha? ] [ dup CHAR: _ = ] ] || nip ; + +: java-blank? ( n -- ? ) + { + CHAR: \s CHAR: \t CHAR: \n + HEX: b HEX: 7 CHAR: \r + } member? ; + +: java-printable? ( n -- ? ) + [ [ dup alpha? ] [ dup punct? ] ] || nip ; + +ERROR: bad-character-class obj ; + +: parse-posix-class ( -- quot ) + next + CHAR: { expect + [ get-char CHAR: } = ] take-until + { + { "Lower" [ [ letter? ] ] } + { "Upper" [ [ LETTER? ] ] } + { "ASCII" [ [ ascii? ] ] } + { "Alpha" [ [ Letter? ] ] } + { "Digit" [ [ digit? ] ] } + { "Alnum" [ [ alpha? ] ] } + { "Punct" [ [ punct? ] ] } + { "Graph" [ [ java-printable? ] ] } + { "Print" [ [ java-printable? ] ] } + { "Blank" [ [ " \t" member? ] ] } + { "Cntrl" [ [ control-char? ] ] } + { "XDigit" [ [ hex-digit? ] ] } + { "Space" [ [ java-blank? ] ] } + ! TODO: unicode-character-class, fallthrough in unicode is bad-char-clss + [ bad-character-class ] + } case ; + +ERROR: bad-octal number ; + +: parse-octal ( -- n ) + next get-char drop + 3 take oct> + dup 255 > [ bad-octal ] when ; + +ERROR: bad-hex number ; + +: parse-short-hex ( -- n ) + next 2 take hex> + dup number? [ bad-hex ] unless ; + +: parse-long-hex ( -- n ) + next 6 take hex> + dup number? [ bad-hex ] unless ; + +: parse-control-character ( -- n ) + next get-char ; + +: dot-construction ( regexp -- ) + [ CHAR: \n = not ] push-single-nfa ; + +: front-anchor-construction ( regexp -- ) + drop ; + +: back-anchor-construction ( regexp -- ) + drop ; + +: parse-brace ( -- from/f to/f comma? ) + next + [ get-char CHAR: } = ] take-until + "," split1 [ [ string>number ] bi@ ] keep >boolean ; + +TUPLE: character-class members ; +TUPLE: character-class-range from to ; +TUPLE: negated-character-class < character-class ; +TUPLE: negated-character-class-range < character-class-range ; +TUPLE: intersection-class < character-class ; +TUPLE: negated-intersection-class < intersection-class ; + +GENERIC: character-class-contains? ( obj character-class -- ? ) + +: parse-escaped-until ( -- seq ) + [ get-char CHAR: \ = get-next CHAR: E = and ] take-until + next ; + +: character-class-predicate ( seq -- quot ) + boa '[ , character-class-contains? ] ; + +ERROR: unmatched-escape-sequence ; + +: (parse-escaped) ( regexp ? ch -- obj ) + { + { CHAR: \ [ [ CHAR: \ = ] ] } + { CHAR: t [ [ CHAR: \t = ] ] } + { CHAR: n [ [ CHAR: \n = ] ] } + { CHAR: r [ [ CHAR: \r = ] ] } + { CHAR: f [ [ HEX: c = ] ] } + { CHAR: a [ [ HEX: 7 = ] ] } + { CHAR: e [ [ HEX: 1b = ] ] } + + { CHAR: d [ [ digit? ] ] } + { CHAR: D [ [ digit? not ] ] } + { CHAR: s [ [ java-blank? ] ] } + { CHAR: S [ [ java-blank? not ] ] } + { CHAR: w [ [ c-identifier-char? ] ] } + { CHAR: W [ [ c-identifier-char? not ] ] } + + { CHAR: p [ parse-posix-class ] } + { CHAR: P [ parse-posix-class [ not ] compose ] } + { CHAR: x [ parse-short-hex ] } + { CHAR: u [ parse-long-hex ] } + { CHAR: 0 [ parse-octal ] } + { CHAR: c [ parse-control-character ] } + + ! { CHAR: Q [ next parse-escaped-until ] } + ! { CHAR: E [ unmatched-escape-sequence ] } + + ! { CHAR: b [ ] } ! a word boundary + ! { CHAR: B [ ] } ! a non-word boundary + ! { CHAR: A [ ] } ! beginning of input + ! { CHAR: G [ ] } ! end of previous match + ! { CHAR: Z [ ] } ! end of input but for the final terminator, if any + ! { CHAR: z [ ] } ! end of the input + [ ] + } case ; + +: parse-escaped ( regexp -- ) + next get-char (parse-escaped) push-single-nfa ; + +: handle-dash ( vector -- vector ) + [ dup dash eq? [ drop CHAR: - ] when ] map ; + +M: object character-class-contains? ( obj1 obj2 -- ? ) + = ; + +M: callable character-class-contains? ( obj1 callable -- ? ) + call ; + +M: character-class character-class-contains? ( obj cc -- ? ) + members>> [ character-class-contains? ] with find drop >boolean ; + +M: negated-character-class character-class-contains? ( obj cc -- ? ) + call-next-method not ; + +M: character-class-range character-class-contains? ( obj cc -- ? ) + [ from>> ] [ to>> ] bi between? ; + +M: negated-character-class-range character-class-contains? ( obj cc -- ? ) + call-next-method not ; + +M: intersection-class character-class-contains? ( obj cc -- ? ) + members>> [ character-class-contains? not ] with find drop not ; + +M: negated-intersection-class character-class-contains? ( obj cc -- ? ) + call-next-method not ; + +ERROR: unmatched-negated-character-class class ; + +: handle-caret ( obj -- seq class ) + dup [ length 2 >= ] [ first caret eq? ] bi and [ + rest negated-character-class + ] [ + character-class + ] if ; + +: make-character-class ( regexp -- ) + left-bracket over stack>> cut-stack + pick (>>stack) + handle-dash handle-caret + character-class-predicate push-single-nfa ; + +: apply-dash ( regexp -- ) + stack>> dup [ pop ] [ pop* ] [ pop ] tri + swap character-class-range boa swap push ; + +: apply-dash? ( regexp -- ? ) + stack>> dup length 3 >= + [ [ length 2 - ] keep nth dash eq? ] [ drop f ] if ; + +DEFER: parse-character-class +: (parse-character-class) ( regexp -- ) + [ + next get-char + { + { CHAR: [ [ + [ 1+ ] change-bracket-count dup left-bracket push-stack + parse-character-class + ] } + { CHAR: ] [ + [ 1- ] change-bracket-count + make-character-class + ] } + { CHAR: - [ dash push-stack ] } + ! { CHAR: & [ ampersand push-stack ] } + ! { CHAR: : [ semicolon push-stack ] } + { CHAR: \ [ next get-char (parse-escaped) push-stack ] } + { f [ unbalanced-brackets ] } + [ dupd push-stack dup apply-dash? [ apply-dash ] [ drop ] if ] + } case + ] [ + dup bracket-count>> 0 > + [ (parse-character-class) ] [ drop ] if + ] bi ; + +: parse-character-class-second ( regexp -- ) + get-next + { + { CHAR: [ [ CHAR: [ push-stack next ] } + { CHAR: ] [ CHAR: ] push-stack next ] } + { CHAR: - [ CHAR: - push-stack next ] } + [ 2drop ] + } case ; + +: parse-character-class-first ( regexp -- ) + get-next + { + { CHAR: ^ [ caret dupd push-stack next parse-character-class-second ] } + { CHAR: [ [ CHAR: [ push-stack next ] } + { CHAR: ] [ CHAR: ] push-stack next ] } + { CHAR: - [ CHAR: - push-stack next ] } + [ 2drop ] + } case ; + +: parse-character-class ( regexp -- ) + [ parse-character-class-first ] [ (parse-character-class) ] bi ; + +ERROR: unsupported-token token ; +: parse-token ( regexp token -- ) + dup { + { CHAR: ^ [ drop front-anchor-construction ] } + { CHAR: $ [ drop back-anchor-construction ] } + { CHAR: \ [ drop parse-escaped ] } + { CHAR: | [ drop dup concatenation-loop alternation push-stack ] } + { CHAR: ( [ drop [ 1+ ] change-parentheses-count left-parenthesis push-stack ] } + { CHAR: ) [ drop [ 1- ] change-parentheses-count left-parenthesis apply-til-last ] } + { CHAR: * [ drop apply-kleene-closure ] } + { CHAR: + [ drop apply-plus-closure ] } + { CHAR: ? [ drop apply-question-closure ] } + { CHAR: { [ drop parse-brace apply-brace-closure ] } + { CHAR: [ [ + drop + dup left-bracket push-stack + [ 1+ ] change-bracket-count parse-character-class + ] } + ! { CHAR: } [ drop drop "brace" ] } + { CHAR: . [ drop dot-construction ] } + { beginning-of-text [ push-stack ] } + { end-of-text [ + drop { + [ check-unbalanced-parentheses ] + [ concatenation-loop ] + [ beginning-of-text apply-til-last ] + [ set-start-state ] + } cleave + ] } + [ drop push-single-nfa ] + } case ; + +: (parse-raw-regexp) ( regexp -- ) + get-char [ dupd parse-token next (parse-raw-regexp) ] [ drop ] if* ; + +: parse-raw-regexp ( regexp -- ) + [ beginning-of-text parse-token ] + [ + dup raw>> dup empty? [ + 2drop + ] [ + [ (parse-raw-regexp) ] string-parse + ] if + ] + [ end-of-text parse-token ] tri ; + +:: find-delta ( states obj table -- keys ) + obj states [ + table get-row at + [ dup integer? [ 1array ] when unique ] [ H{ } ] if* + ] with map H{ } clone [ assoc-union ] reduce keys ; + +:: (find-closure) ( states obj assoc table -- keys ) + [let | size [ assoc assoc-size ] | + assoc states unique assoc-union + dup assoc-size size > [ + obj states [ + table get-row at* [ + dup integer? [ 1array ] when + obj rot table (find-closure) + ] [ + drop + ] if + ] with each + ] when ] ; + +: find-closure ( states obj table -- states ) + >r H{ } r> (find-closure) keys ; + +: find-epsilon-closure ( states table -- states ) + >r eps H{ } r> (find-closure) keys ; + +: filter-special-transition ( vec -- vec' ) + [ drop eps = not ] assoc-filter ; + +: initialize-subset-construction ( regexp -- ) + >>dfa + [ + nfa>> [ start-state>> 1array ] keep + find-epsilon-closure 1dlist + ] [ + swap >>new-states drop + ] [ + [ dfa>> ] [ nfa>> ] bi + columns>> filter-special-transition >>columns drop + ] tri ; + +:: (subset-construction) ( regexp -- ) + [let* | nfa [ regexp nfa>> ] + dfa [ regexp dfa>> ] + new-states [ regexp new-states>> ] + columns [ dfa columns>> keys ] | + + new-states dlist-empty? [ + new-states pop-front + dup dfa add-row + columns [ + 2dup nfa [ find-delta ] [ find-epsilon-closure ] bi + dup [ dfa rows>> key? ] [ empty? ] bi or [ + dup new-states push-back + ] unless + dup empty? [ 3drop ] [ dfa set-entry ] if + ] with each + regexp (subset-construction) + ] unless ] ; + +: set-start/final-states ( regexp -- ) + dup [ nfa>> start-state>> ] + [ dfa>> rows>> keys [ member? ] with filter first ] bi + >r dup dfa>> r> >>start-state drop + + dup [ nfa>> final-states>> ] [ dfa>> rows>> ] bi + [ keys ] bi@ + [ intersect empty? not ] with filter + >r dfa>> r> >>final-states drop ; + +: subset-construction ( regexp -- ) + [ initialize-subset-construction ] + [ (subset-construction) ] + [ set-start/final-states ] tri ; + +: ( raw -- obj ) + regexp new + swap >>raw + 0 >>parentheses-count + 0 >>bracket-count + -1 >>state + V{ } clone >>stack + >>nfa + dup [ parse-raw-regexp ] [ subset-construction ] bi ; + +! Literal syntax for regexps +: parse-options ( string -- ? ) + #! Lame + { + { "" [ f ] } + { "i" [ t ] } + } case ; + +: parse-regexp ( accum end -- accum ) + lexer get dup skip-blank + [ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column + ! lexer get dup still-parsing-line? + ! [ (parse-token) parse-options ] [ drop f ] if + parsed ; + +: R! CHAR: ! parse-regexp ; parsing +: R" CHAR: " parse-regexp ; parsing +: R# CHAR: # parse-regexp ; parsing +: R' CHAR: ' parse-regexp ; parsing +: R( CHAR: ) parse-regexp ; parsing +: R/ CHAR: / parse-regexp ; parsing +: R@ CHAR: @ parse-regexp ; parsing +: R[ CHAR: ] parse-regexp ; parsing +: R` CHAR: ` parse-regexp ; parsing +: R{ CHAR: } parse-regexp ; parsing +: R| CHAR: | parse-regexp ; parsing + +TUPLE: dfa-traverser + dfa + last-state current-state + text + start-index current-index + matches ; + +: ( text dfa -- match ) + dfa>> + dfa-traverser new + swap [ start-state>> >>current-state ] keep + >>dfa + swap >>text + 0 >>start-index + 0 >>current-index + V{ } clone >>matches ; + +: final-state? ( dfa-traverser -- ? ) + [ current-state>> ] [ dfa>> final-states>> ] bi + member? ; + +: text-finished? ( dfa-traverser -- ? ) + [ current-index>> ] [ text>> length ] bi >= ; + +: save-final-state ( dfa-straverser -- ) + [ current-index>> ] [ matches>> ] bi push ; + +: match-done? ( dfa-traverser -- ? ) + dup final-state? [ + dup save-final-state + ] when text-finished? ; + +: increment-state ( dfa-traverser state -- dfa-traverser ) + >r [ 1+ ] change-current-index + dup current-state>> >>last-state r> + >>current-state ; + +: match-transition ( obj hash -- state/f ) + 2dup keys [ callable? ] filter predicates + [ swap at nip ] [ at ] if* ; + +: do-match ( dfa-traverser -- dfa-traverser ) + dup match-done? [ + dup { + [ current-index>> ] + [ text>> ] + [ current-state>> ] + [ dfa>> rows>> ] + } cleave + at >r nth r> match-transition [ + increment-state do-match + ] when* + ] unless ; + +: return-match ( dfa-traverser -- interval/f ) + dup matches>> empty? [ + drop f + ] [ + [ start-index>> ] [ matches>> peek ] bi 1 + ] if ; + +: match ( string regexp -- pair ) + do-match return-match ; + +: matches? ( string regexp -- ? ) + dupd match [ [ length ] [ range-length 1- ] bi* = ] [ drop f ] if* ; + +: match-head ( string regexp -- end ) + match length>> 1- ; + +! character classes +! TUPLE: range-class from to ; +! TUPLE: or-class left right ; + +! (?:a|b)* <- does not capture +! (a|b)*\1 <- group captured +! doesn't advance the current position: +! (?=abba) positive lookahead matches abbaaa but not abaaa +! (?!abba) negative lookahead matches ababa but not abbaa +! look behind. "lookaround" + +! : $ ( n -- obj ) groups get nth ; +! [ + ! groups bound to scope here +! ] [ + ! error or something +! ] if-match +! match in a string with .*foo.* From 2225d1b990fb37101df619236a88d4437002e9cd Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Fri, 23 May 2008 16:34:12 -0500 Subject: [PATCH 6/7] uncomment bugs to let factor build (buggy) binaries! --- core/inference/inference-tests.factor | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/core/inference/inference-tests.factor b/core/inference/inference-tests.factor index dc037d4a59..0d3eb03cf4 100755 --- a/core/inference/inference-tests.factor +++ b/core/inference/inference-tests.factor @@ -549,10 +549,10 @@ ERROR: custom-error ; { 1 0 } [ [ ] map-children ] must-infer-as ! Corner case -[ [ [ f dup ] [ dup ] [ ] unfold ] infer ] must-fail +! [ [ [ f dup ] [ dup ] [ ] unfold ] infer ] must-fail -[ [ [ f dup ] [ ] [ ] while ] infer ] must-fail +! [ [ [ f dup ] [ ] [ ] while ] infer ] must-fail -: erg's-inference-bug ( -- ) f dup [ erg's-inference-bug ] when ; inline +! : erg's-inference-bug ( -- ) f dup [ erg's-inference-bug ] when ; inline -[ [ erg's-inference-bug ] infer ] must-fail +! [ [ erg's-inference-bug ] infer ] must-fail From 79da16bc5ac09cc50efe386ef41649211f870fdd Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Fri, 23 May 2008 16:47:35 -0500 Subject: [PATCH 7/7] 2dip documented, but should we use it in our code? the official stance is "no comment" --- core/kernel/kernel-docs.factor | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/core/kernel/kernel-docs.factor b/core/kernel/kernel-docs.factor index 60151d9f55..96c582a3e5 100755 --- a/core/kernel/kernel-docs.factor +++ b/core/kernel/kernel-docs.factor @@ -97,6 +97,7 @@ $nl "Certain shuffle words can also be expressed in terms of the spread combinators. Internalizing such identities can help with understanding and writing code using spread combinators:" { $code ": dip [ ] bi* ;" + ": 2dip [ ] [ ] tri* ;" "" ": slip [ call ] [ ] bi* ;" ": 2slip [ call ] [ ] [ ] tri* ;" @@ -164,8 +165,9 @@ ARTICLE: "slip-keep-combinators" "The slip and keep combinators" { $subsection slip } { $subsection 2slip } { $subsection 3slip } -"The dip combinator invokes the quotation at the top of the stack, hiding the value underneath:" +"The dip combinators invoke the quotation at the top of the stack, hiding the values underneath:" { $subsection dip } +{ $subsection 2dip } "The keep combinators invoke a quotation which takes a number of values off the stack, and then they restore those values:" { $subsection keep } { $subsection 2keep } @@ -876,6 +878,14 @@ HELP: dip { $code "[ foo bar ] dip" } } ; +HELP: 2dip +{ $values { "obj1" object } { "obj2" object } { "quot" quotation } } +{ $description "Calls " { $snippet "quot" } " with " { $snippet "obj1" } " and " { $snippet "obj2" } " hidden on the retain stack." } +{ $notes "The following are equivalent:" + { $code ">r >r foo bar r> r>" } + { $code "[ foo bar ] 2dip" } +} ; + HELP: while { $values { "pred" "a quotation with stack effect " { $snippet "( -- ? )" } } { "body" "a quotation" } { "tail" "a quotation" } } { $description "Repeatedly calls " { $snippet "pred" } ". If it yields " { $link f } ", iteration stops, otherwise " { $snippet "body" } " is called. After iteration stops, " { $snippet "tail" } " is called." }