Merge branch 'master' of git://factorcode.org/git/factor

db4
Daniel Ehrenberg 2008-05-20 18:43:12 -05:00
commit 1f7f4bb278
14 changed files with 1147 additions and 12 deletions

View File

@ -630,7 +630,7 @@ HELP: tri*
"The following two lines are equivalent:" "The following two lines are equivalent:"
{ $code { $code
"[ p ] [ q ] [ r ] tri*" "[ p ] [ q ] [ r ] tri*"
">r >r q r> q r> r" ">r >r p r> q r> r"
} }
} ; } ;

View File

@ -17,7 +17,7 @@ tuple-syntax namespaces ;
path: "/index.html" path: "/index.html"
version: "1.1" version: "1.1"
cookies: V{ } cookies: V{ }
header: H{ { "connection" "close" } } header: H{ { "connection" "close" } { "user-agent" "Factor http.client vocabulary" } }
} }
] [ ] [
[ [
@ -35,7 +35,7 @@ tuple-syntax namespaces ;
path: "/index.html" path: "/index.html"
version: "1.1" version: "1.1"
cookies: V{ } cookies: V{ }
header: H{ { "connection" "close" } } header: H{ { "connection" "close" } { "user-agent" "Factor http.client vocabulary" } }
} }
] [ ] [
[ [

View File

@ -4,7 +4,7 @@ USING: assocs http kernel math math.parser namespaces sequences
io io.sockets io.streams.string io.files io.timeouts strings io io.sockets io.streams.string io.files io.timeouts strings
splitting calendar continuations accessors vectors math.order splitting calendar continuations accessors vectors math.order
io.encodings.8-bit io.encodings.binary io.streams.duplex io.encodings.8-bit io.encodings.binary io.streams.duplex
fry debugger inspector ; fry debugger inspector ascii ;
IN: http.client IN: http.client
: max-redirects 10 ; : max-redirects 10 ;
@ -37,8 +37,12 @@ SYMBOL: redirects
PRIVATE> PRIVATE>
: read-chunk-size ( -- n )
read-crlf ";" split1 drop [ blank? ] right-trim
hex> [ "Bad chunk size" throw ] unless* ;
: read-chunks ( -- ) : read-chunks ( -- )
read-crlf ";" split1 drop hex> dup { f 0 } member? read-chunk-size dup zero?
[ drop ] [ read % read-crlf "" assert= read-chunks ] if ; [ drop ] [ read % read-crlf "" assert= read-chunks ] if ;
: read-response-body ( response -- response data ) : read-response-body ( response -- response data )

View File

@ -256,7 +256,8 @@ cookies ;
H{ } clone >>header H{ } clone >>header
H{ } clone >>query H{ } clone >>query
V{ } clone >>cookies V{ } clone >>cookies
"close" "connection" set-header ; "close" "connection" set-header
"Factor http.client vocabulary" "user-agent" set-header ;
: query-param ( request key -- value ) : query-param ( request key -- value )
swap query>> at ; swap query>> at ;

View File

@ -153,13 +153,13 @@ M: openssl-context dispose*
TUPLE: ssl-handle file handle connected disposed ; TUPLE: ssl-handle file handle connected disposed ;
ERROR: no-ssl-context ; ERROR: no-secure-context ;
M: no-ssl-context summary M: no-secure-context summary
drop "SSL operations must be wrapped in calls to with-ssl-context" ; drop "Secure socket operations must be wrapped in calls to with-secure-context" ;
: current-ssl-context ( -- ctx ) : current-ssl-context ( -- ctx )
secure-context get [ no-ssl-context ] unless* ; secure-context get [ no-secure-context ] unless* ;
: <ssl-handle> ( fd -- ssl ) : <ssl-handle> ( fd -- ssl )
current-ssl-context handle>> SSL_new dup ssl-error current-ssl-context handle>> SSL_new dup ssl-error

View File

@ -0,0 +1,138 @@
USING: regexp4 tools.test kernel ;
IN: regexp4-tests
[ f ] [ "b" "a*" <regexp> matches? ] unit-test
[ t ] [ "" "a*" <regexp> matches? ] unit-test
[ t ] [ "a" "a*" <regexp> matches? ] unit-test
[ t ] [ "aaaaaaa" "a*" <regexp> matches? ] unit-test
[ f ] [ "ab" "a*" <regexp> matches? ] unit-test
[ t ] [ "abc" "abc" <regexp> matches? ] unit-test
[ t ] [ "a" "a|b|c" <regexp> matches? ] unit-test
[ t ] [ "b" "a|b|c" <regexp> matches? ] unit-test
[ t ] [ "c" "a|b|c" <regexp> matches? ] unit-test
[ f ] [ "c" "d|e|f" <regexp> matches? ] unit-test
[ f ] [ "aa" "a|b|c" <regexp> matches? ] unit-test
[ f ] [ "bb" "a|b|c" <regexp> matches? ] unit-test
[ f ] [ "cc" "a|b|c" <regexp> matches? ] unit-test
[ f ] [ "cc" "d|e|f" <regexp> matches? ] unit-test
[ f ] [ "" "a+" <regexp> matches? ] unit-test
[ t ] [ "a" "a+" <regexp> matches? ] unit-test
[ t ] [ "aa" "a+" <regexp> matches? ] unit-test
[ t ] [ "" "a?" <regexp> matches? ] unit-test
[ t ] [ "a" "a?" <regexp> matches? ] unit-test
[ f ] [ "aa" "a?" <regexp> matches? ] unit-test
[ f ] [ "" "." <regexp> matches? ] unit-test
[ t ] [ "a" "." <regexp> matches? ] unit-test
[ t ] [ "." "." <regexp> matches? ] unit-test
! [ f ] [ "\n" "." <regexp> matches? ] unit-test
[ f ] [ "" ".+" <regexp> matches? ] unit-test
[ t ] [ "a" ".+" <regexp> matches? ] unit-test
[ t ] [ "ab" ".+" <regexp> matches? ] unit-test
[ t ] [ "" "a|b*|c+|d?" <regexp> matches? ] unit-test
[ t ] [ "a" "a|b*|c+|d?" <regexp> matches? ] unit-test
[ t ] [ "c" "a|b*|c+|d?" <regexp> matches? ] unit-test
[ t ] [ "cc" "a|b*|c+|d?" <regexp> matches? ] unit-test
[ f ] [ "ccd" "a|b*|c+|d?" <regexp> matches? ] unit-test
[ t ] [ "d" "a|b*|c+|d?" <regexp> matches? ] unit-test
[ t ] [ "foo" "foo|bar" <regexp> matches? ] unit-test
[ t ] [ "bar" "foo|bar" <regexp> matches? ] unit-test
[ f ] [ "foobar" "foo|bar" <regexp> matches? ] unit-test
[ f ] [ "" "(a)" <regexp> matches? ] unit-test
[ t ] [ "a" "(a)" <regexp> matches? ] unit-test
[ f ] [ "aa" "(a)" <regexp> matches? ] unit-test
[ t ] [ "aa" "(a*)" <regexp> matches? ] unit-test
[ f ] [ "aababaaabbac" "(a|b)+" <regexp> matches? ] unit-test
[ t ] [ "ababaaabba" "(a|b)+" <regexp> matches? ] unit-test
[ f ] [ "" "a{1}" <regexp> matches? ] unit-test
[ t ] [ "a" "a{1}" <regexp> matches? ] unit-test
[ f ] [ "aa" "a{1}" <regexp> matches? ] unit-test
[ f ] [ "a" "a{2,}" <regexp> matches? ] unit-test
[ t ] [ "aaa" "a{2,}" <regexp> matches? ] unit-test
[ t ] [ "aaaa" "a{2,}" <regexp> matches? ] unit-test
[ t ] [ "aaaaa" "a{2,}" <regexp> matches? ] unit-test
[ t ] [ "" "a{,2}" <regexp> matches? ] unit-test
[ t ] [ "a" "a{,2}" <regexp> matches? ] unit-test
[ t ] [ "aa" "a{,2}" <regexp> matches? ] unit-test
[ f ] [ "aaa" "a{,2}" <regexp> matches? ] unit-test
[ f ] [ "aaaa" "a{,2}" <regexp> matches? ] unit-test
[ f ] [ "aaaaa" "a{,2}" <regexp> matches? ] unit-test
[ f ] [ "" "a{1,3}" <regexp> matches? ] unit-test
[ t ] [ "a" "a{1,3}" <regexp> matches? ] unit-test
[ t ] [ "aa" "a{1,3}" <regexp> matches? ] unit-test
[ t ] [ "aaa" "a{1,3}" <regexp> matches? ] unit-test
[ f ] [ "aaaa" "a{1,3}" <regexp> matches? ] unit-test
[ f ] [ "" "[a]" <regexp> matches? ] unit-test
[ t ] [ "a" "[a]" <regexp> matches? ] unit-test
[ t ] [ "a" "[abc]" <regexp> matches? ] unit-test
[ f ] [ "b" "[a]" <regexp> matches? ] unit-test
[ f ] [ "d" "[abc]" <regexp> matches? ] unit-test
[ t ] [ "ab" "[abc]{1,2}" <regexp> matches? ] unit-test
[ f ] [ "abc" "[abc]{1,2}" <regexp> matches? ] unit-test
[ f ] [ "" "[^a]" <regexp> matches? ] unit-test
[ f ] [ "a" "[^a]" <regexp> matches? ] unit-test
[ f ] [ "a" "[^abc]" <regexp> matches? ] unit-test
[ t ] [ "b" "[^a]" <regexp> matches? ] unit-test
[ t ] [ "d" "[^abc]" <regexp> matches? ] unit-test
[ f ] [ "ab" "[^abc]{1,2}" <regexp> matches? ] unit-test
[ f ] [ "abc" "[^abc]{1,2}" <regexp> matches? ] unit-test
[ t ] [ "]" "[]]" <regexp> matches? ] unit-test
[ f ] [ "]" "[^]]" <regexp> matches? ] unit-test
! [ "^" "[^]" <regexp> matches? ] must-fail
[ t ] [ "^" "[]^]" <regexp> matches? ] unit-test
[ t ] [ "]" "[]^]" <regexp> matches? ] unit-test
[ t ] [ "[" "[[]" <regexp> matches? ] unit-test
[ f ] [ "^" "[^^]" <regexp> matches? ] unit-test
[ t ] [ "a" "[^^]" <regexp> matches? ] unit-test
[ t ] [ "-" "[-]" <regexp> matches? ] unit-test
[ f ] [ "a" "[-]" <regexp> matches? ] unit-test
[ f ] [ "-" "[^-]" <regexp> matches? ] unit-test
[ t ] [ "a" "[^-]" <regexp> matches? ] unit-test
[ t ] [ "-" "[-a]" <regexp> matches? ] unit-test
[ t ] [ "a" "[-a]" <regexp> matches? ] unit-test
[ t ] [ "-" "[a-]" <regexp> matches? ] unit-test
[ t ] [ "a" "[a-]" <regexp> matches? ] unit-test
[ f ] [ "b" "[a-]" <regexp> matches? ] unit-test
[ f ] [ "-" "[^-]" <regexp> matches? ] unit-test
[ t ] [ "a" "[^-]" <regexp> matches? ] unit-test
[ f ] [ "-" "[a-c]" <regexp> matches? ] unit-test
[ t ] [ "-" "[^a-c]" <regexp> matches? ] unit-test
[ t ] [ "b" "[a-c]" <regexp> matches? ] unit-test
[ f ] [ "b" "[^a-c]" <regexp> matches? ] unit-test
[ t ] [ "-" "[a-c-]" <regexp> matches? ] unit-test
[ f ] [ "-" "[^a-c-]" <regexp> matches? ] unit-test
[ t ] [ "\\" "[\\\\]" <regexp> matches? ] unit-test
[ f ] [ "a" "[\\\\]" <regexp> matches? ] unit-test
[ f ] [ "\\" "[^\\\\]" <regexp> matches? ] unit-test
[ t ] [ "a" "[^\\\\]" <regexp> matches? ] unit-test
! ((A)(B(C)))
! 1. ((A)(B(C)))
! 2. (A)
! 3. (B(C))
! 4. (C)

View File

@ -0,0 +1,547 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs combinators kernel math
sequences namespaces locals combinators.lib state-tables
math.parser state-parser sets dlists unicode.categories
math.order quotations shuffle math.ranges splitting
symbols ;
IN: regexp4
SYMBOLS: eps start-state final-state beginning-of-text
end-of-text left-paren right-paren alternation ;
SYMBOL: runtim-epsilon
TUPLE: regexp raw paren-count bracket-count
state stack nfa new-states dfa minimized-dfa
dot-matches-newlines? character-sets capture-group
captured-groups ;
TUPLE: capture-group n range ;
ERROR: paren-underflow ;
ERROR: unbalanced-paren ;
: push-stack ( regexp token -- ) swap stack>> push ;
: push-all-stack ( regexp seq -- ) swap stack>> push-all ;
: next-state ( regexp -- n ) [ 1+ ] change-state state>> ;
: check-paren-underflow ( regexp -- )
paren-count>> 0 < [ paren-underflow ] when ;
: check-unbalanced-paren ( regexp -- )
paren-count>> 0 > [ unbalanced-paren ] when ;
:: (apply-alternation) ( stack regexp -- )
[let | s2 [ stack peek first ]
s3 [ stack pop second ]
s0 [ stack peek alternation = [ stack pop* ] when stack peek first ]
s1 [ stack pop second ]
s4 [ regexp next-state ]
s5 [ regexp next-state ]
table [ regexp nfa>> ] |
s5 table add-row
s4 eps s0 <entry> table add-entry
s4 eps s2 <entry> table add-entry
s1 eps s5 <entry> table add-entry
s3 eps s5 <entry> table add-entry
s1 table final-states>> delete-at
s3 table final-states>> delete-at
t s5 table final-states>> set-at
s4 s5 2array stack push ] ;
: apply-alternation ( regexp -- )
[ stack>> ] [ (apply-alternation) ] bi ;
: apply-alternation? ( stack -- ? )
dup length dup 3 <
[ 2drop f ] [ 2 - swap nth alternation = ] if ;
:: (apply-concatenation) ( stack regexp -- )
[let* |
s2 [ stack peek first ]
s3 [ stack pop second ]
s0 [ stack peek first ]
s1 [ stack pop second ]
table [ regexp nfa>> ] |
s1 eps s2 <entry> table set-entry
s1 table final-states>> delete-at
s3 table add-row
s0 s3 2array stack push ] ;
: apply-concatenation ( regexp -- )
[ stack>> ] [ (apply-concatenation) ] bi ;
: apply-concatenation? ( seq -- ? )
dup length dup 2 <
[ 2drop f ] [ 2 - swap nth array? ] if ;
: apply-loop ( seq regexp -- seq regexp )
over length 1 > [
2dup over apply-alternation?
[ (apply-alternation) ] [ (apply-concatenation) ] if apply-loop
] when ;
: apply-til-last ( token regexp -- )
swap [
<reversed> tuck index cut reverse dup pop*
] change-stack >r reverse r> apply-loop stack>> push-all ;
: concatenation-loop ( regexp -- )
dup stack>> dup apply-concatenation?
[ over (apply-concatenation) concatenation-loop ] [ 2drop ] if ;
:: apply-kleene-closure ( regexp -- )
[let* | stack [ regexp stack>> ]
s0 [ stack peek first ]
s1 [ stack pop second ]
s2 [ regexp next-state ]
s3 [ regexp next-state ]
table [ regexp nfa>> ] |
s1 table final-states>> delete-at
t s3 table final-states>> set-at
s3 table add-row
s1 eps s0 <entry> table add-entry
s2 eps s0 <entry> table add-entry
s2 eps s3 <entry> table add-entry
s1 eps s3 <entry> table add-entry
s2 s3 2array stack push ] ;
: add-numbers ( n obj -- obj )
2dup [ number? ] bi@ and
[ + ] [ dup sequence? [ [ + ] with map ] [ nip ] if ] if ;
: increment-columns ( n assoc -- )
dup [ >r swap >r add-numbers r> r> set-at ] curry with* assoc-each ;
:: copy-state-rows ( regexp range -- )
[let* | len [ range range-length ]
offset [ regexp state>> range range-min - 1+ ]
state [ regexp [ len + ] change-state ] |
regexp nfa>> rows>>
[ drop range member? ] assoc-filter
[
[ offset + ] dip
[ offset swap add-numbers ] assoc-map
] assoc-map
regexp nfa>> [ assoc-union ] change-rows drop
range [ range-min ] [ range-max ] bi [ offset + ] bi@ 2array
regexp stack>> push ] ;
: last-state ( regexp -- range )
stack>> peek first2 [a,b] ;
: set-last-state-final ( ? regexp -- )
[ stack>> peek second ] [ nfa>> final-states>> ] bi set-at ;
: apply-plus-closure ( regexp -- )
[ dup last-state copy-state-rows ]
[ apply-kleene-closure ]
[ apply-concatenation ] tri ;
: apply-question-closure ( regexp -- )
[ stack>> peek first2 eps swap <entry> ] [ nfa>> add-entry ] bi ;
: with0 ( obj n quot -- n quot' ) swapd curry ; inline
: copy-state ( regexp state n -- )
[ copy-state-rows ] with0 with0 times ;
:: (exactly-n) ( regexp state n -- )
regexp state n copy-state
t regexp set-last-state-final ;
: exactly-n ( regexp n -- )
>r dup last-state r> 1- (exactly-n) ;
: exactly-n-concatenated ( regexp state n -- )
[ (exactly-n) ] 3keep
nip 1- [ apply-concatenation ] with0 times ;
:: at-least-n ( regexp n -- )
[let | state [ regexp stack>> pop first2 [a,b] ] |
regexp state n copy-state
state regexp stack>> push
regexp apply-kleene-closure ] ;
: pop-last ( regexp -- range )
stack>> pop first2 [a,b] ;
:: at-most-n ( regexp n -- )
[let | state [ regexp pop-last ] |
regexp state n [ 1+ exactly-n-concatenated ] with with each
regexp n 1- [ apply-alternation ] with0 times
regexp apply-question-closure ] ;
:: from-m-to-n ( regexp m n -- )
[let | state [ regexp pop-last ] |
regexp state
m n [a,b] [ exactly-n-concatenated ] with with each
regexp n m - [ apply-alternation ] with0 times ] ;
: apply-brace-closure ( regexp from/f to/f comma? -- )
[
2dup and
[ from-m-to-n ]
[ [ nip at-most-n ] [ at-least-n ] if* ] if
] [ drop exactly-n ] if ;
:: make-nontoken-nfa ( regexp obj -- )
[let | s0 [ regexp next-state ]
s1 [ regexp next-state ]
stack [ regexp stack>> ]
table [ regexp nfa>> ] |
s0 obj s1 <entry> table set-entry
s1 table add-row
t s1 table final-states>> set-at
s0 s1 2array stack push ] ;
: set-start-state ( regexp -- )
dup stack>> dup empty? [
2drop
] [
[ nfa>> ] [ pop first ] bi* >>start-state drop
] if ;
: ascii? ( n -- ? ) 0 HEX: 7f between? ;
: octal-digit? ( n -- ? ) CHAR: 0 CHAR: 7 between? ;
: decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ;
: hex-digit? ( n -- ? )
dup decimal-digit?
over CHAR: a CHAR: f between? or
swap CHAR: A CHAR: F between? or ;
: control-char? ( n -- ? )
dup 0 HEX: 1f between? swap HEX: 7f = or ;
: punct? ( n -- ? )
"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
: c-identifier-char? ( ch -- ? )
dup alpha? swap CHAR: _ = or ;
: java-blank? ( n -- ? )
{
CHAR: \s CHAR: \t CHAR: \n
HEX: b HEX: 7 CHAR: \r
} member? ;
: java-printable? ( n -- ? )
dup alpha? swap punct? or ;
ERROR: bad-character-class obj ;
: parse-posix-class ( -- quot )
next
CHAR: { expect
[ get-char CHAR: } = ] take-until
{
{ "Lower" [ [ letter? ] ] }
{ "Upper" [ [ LETTER? ] ] }
{ "ASCII" [ [ ascii? ] ] }
{ "Alpha" [ [ Letter? ] ] }
{ "Digit" [ [ digit? ] ] }
{ "Alnum" [ [ alpha? ] ] }
{ "Punct" [ [ punct? ] ] }
{ "Graph" [ [ java-printable? ] ] }
{ "Print" [ [ java-printable? ] ] }
{ "Blank" [ [ " \t" member? ] ] }
{ "Cntrl" [ [ control-char? ] ] }
{ "XDigit" [ [ hex-digit? ] ] }
{ "Space" [ [ java-blank? ] ] }
! TODO: unicode-character-class, fallthrough in unicode is bad-char-clss
[ bad-character-class ]
} case ;
ERROR: bad-octal number ;
: parse-octal ( regexp -- )
next get-char drop
3 take oct>
dup 255 > [ bad-octal ] when
make-nontoken-nfa ;
ERROR: bad-hex number ;
: parse-short-hex ( regexp -- )
next 2 take hex>
dup number? [ bad-hex ] unless
make-nontoken-nfa ;
: parse-long-hex ( regexp -- )
next 4 take hex>
dup number? [ bad-hex ] unless
make-nontoken-nfa ;
: parse-control-character ( regexp -- )
next get-char make-nontoken-nfa ;
: parse-backreference ( regexp obj -- )
2drop ;
: dot-construction ( regexp -- )
[ CHAR: \n = not ] make-nontoken-nfa ;
: front-anchor-construction ( regexp -- )
drop ;
: back-anchor-construction ( regexp -- )
drop ;
: parse-brace ( -- from/f to/f comma? )
next
[ get-char CHAR: } = ] take-until
"," split1 [ [ string>number ] bi@ ] keep >boolean ;
: take-until-]
[ get-char CHAR: ] = ] take-until ;
: make-character-set ( regexp str -- )
dup
[ length 1 > ] [ first CHAR: ^ = ] bi and
[ rest t ] [ f ] if
>r [ member? ] curry r>
[ [ not ] compose ] when make-nontoken-nfa ;
: parse-escaped ( regexp -- )
next get-char {
{ CHAR: \ [ CHAR: \ make-nontoken-nfa ] }
{ CHAR: t [ CHAR: \t make-nontoken-nfa ] }
{ CHAR: n [ CHAR: \n make-nontoken-nfa ] }
{ CHAR: r [ CHAR: \r make-nontoken-nfa ] }
{ CHAR: f [ HEX: c make-nontoken-nfa ] }
{ CHAR: a [ HEX: 7 make-nontoken-nfa ] }
{ CHAR: e [ HEX: 1b make-nontoken-nfa ] }
{ CHAR: d [ [ digit? ] make-nontoken-nfa ] }
{ CHAR: D [ [ digit? not ] make-nontoken-nfa ] }
{ CHAR: s [ [ java-blank? ] make-nontoken-nfa ] }
{ CHAR: S [ [ java-blank? not ] make-nontoken-nfa ] }
{ CHAR: w [ [ c-identifier-char? ] make-nontoken-nfa ] }
{ CHAR: W [ [ c-identifier-char? not ] make-nontoken-nfa ] }
{ CHAR: p [ parse-posix-class make-nontoken-nfa ] }
{ CHAR: P [ parse-posix-class [ not ] compose make-nontoken-nfa ] }
{ CHAR: x [ parse-short-hex ] }
{ CHAR: u [ parse-long-hex ] }
{ CHAR: 0 [ parse-octal ] }
{ CHAR: c [ parse-control-character ] }
! { CHAR: Q [ quot til \E ] }
! { CHAR: E [ should be an error, parse this in the Q if exists ] }
! { CHAR: b [ ] } ! a word boundary
! { CHAR: B [ ] } ! a non-word boundary
! { CHAR: A [ ] } ! beginning of input
! { CHAR: G [ ] } ! end of previous match
! { CHAR: Z [ ] } ! end of input but for the final terminator, if any
! { CHAR: z [ ] } ! end of the input
[ dup digit? [ parse-backreference ] [ make-nontoken-nfa ] if ]
} case ;
ERROR: unsupported-token token ;
: parse-token ( regexp token -- )
dup {
{ CHAR: ^ [ drop front-anchor-construction ] }
{ CHAR: $ [ drop back-anchor-construction ] }
{ CHAR: \ [ drop parse-escaped ] }
{ CHAR: | [ drop dup concatenation-loop alternation push-stack ] }
{ CHAR: ( [ drop [ 1+ ] change-paren-count left-paren push-stack ] }
{ CHAR: ) [ drop [ 1- ] change-paren-count left-paren apply-til-last ] }
{ CHAR: * [ drop apply-kleene-closure ] }
{ CHAR: + [ drop apply-plus-closure ] }
{ CHAR: ? [ drop apply-question-closure ] }
{ CHAR: { [ drop parse-brace apply-brace-closure ] }
! { CHAR: [ [ drop parse-character-set ] }
! { CHAR: } [ drop drop "brace" ] }
! { CHAR: ? [ drop ] }
{ CHAR: . [ drop dot-construction ] }
{ beginning-of-text [ push-stack ] }
{ end-of-text [
drop {
[ check-unbalanced-paren ]
[ concatenation-loop ]
[ beginning-of-text apply-til-last ]
[ set-start-state ]
} cleave
] }
[ drop make-nontoken-nfa ]
} case ;
: (parse-raw-regexp) ( regexp -- )
get-char [ dupd parse-token next (parse-raw-regexp) ] [ drop ] if* ;
: parse-raw-regexp ( regexp -- )
[ beginning-of-text parse-token ]
[
dup raw>> dup empty? [
2drop
] [
[ (parse-raw-regexp) ] string-parse
] if
]
[ end-of-text parse-token ] tri ;
:: find-delta ( states obj table -- keys )
obj states [
table get-row at
[ dup integer? [ 1array ] when unique ] [ H{ } ] if*
] with map H{ } clone [ assoc-union ] reduce keys ;
:: (find-closure) ( states obj assoc table -- keys )
[let | size [ assoc assoc-size ] |
assoc states unique assoc-union
dup assoc-size size > [
obj states [
table get-row at* [
dup integer? [ 1array ] when
obj rot table (find-closure)
] [
drop
] if
] with each
] when ] ;
: find-closure ( states obj table -- states )
>r H{ } r> (find-closure) keys ;
: find-epsilon-closure ( states table -- states )
>r eps H{ } r> (find-closure) keys ;
: filter-special-transition ( vec -- vec' )
[ drop eps = not ] assoc-filter ;
: initialize-subset-construction ( regexp -- )
<vector-table> >>dfa
[
nfa>> [ start-state>> 1array ] keep
find-epsilon-closure 1dlist
] [
swap >>new-states drop
] [
[ dfa>> ] [ nfa>> ] bi
columns>> filter-special-transition >>columns drop
] tri ;
:: (subset-construction) ( regexp -- )
[let* | nfa [ regexp nfa>> ]
dfa [ regexp dfa>> ]
new-states [ regexp new-states>> ]
columns [ dfa columns>> keys ] |
new-states dlist-empty? [
new-states pop-front
dup dfa add-row
columns [
2dup nfa [ find-delta ] [ find-epsilon-closure ] bi
dup [ dfa rows>> key? ] [ empty? ] bi or [
dup new-states push-back
] unless
dup empty? [ 3drop ] [ <entry> dfa set-entry ] if
] with each
regexp (subset-construction)
] unless ] ;
: set-start/final-states ( regexp -- )
dup [ nfa>> start-state>> ]
[ dfa>> rows>> keys [ member? ] with filter first ] bi
>r dup dfa>> r> >>start-state drop
dup [ nfa>> final-states>> ] [ dfa>> rows>> ] bi
[ keys ] bi@
[ intersect empty? not ] with filter
>r dfa>> r> >>final-states drop ;
: subset-construction ( regexp -- )
[ initialize-subset-construction ]
[ (subset-construction) ]
[ set-start/final-states ] tri ;
: <regexp> ( raw -- obj )
regexp new
swap >>raw
0 >>paren-count
-1 >>state
V{ } clone >>stack
V{ } clone >>character-sets
<vector-table> >>nfa
dup [ parse-raw-regexp ] [ subset-construction ] bi ;
TUPLE: dfa-traverser
dfa
last-state current-state
text
start-index current-index
matches ;
: <dfa-traverser> ( text dfa -- match )
dfa>>
dfa-traverser new
swap [ start-state>> >>current-state ] keep
>>dfa
swap >>text
0 >>start-index
0 >>current-index
V{ } clone >>matches ;
: final-state? ( dfa-traverser -- ? )
[ current-state>> ] [ dfa>> final-states>> ] bi
member? ;
: text-finished? ( dfa-traverser -- ? )
[ current-index>> ] [ text>> length ] bi >= ;
: save-final-state ( dfa-straverser -- )
[ current-index>> ] [ matches>> ] bi push ;
: match-done? ( dfa-traverser -- ? )
dup final-state? [
dup save-final-state
] when text-finished? ;
: increment-state ( dfa-traverser state -- dfa-traverser )
>r [ 1+ ] change-current-index
dup current-state>> >>last-state r>
>>current-state ;
: match-transition ( obj hash -- state/f )
2dup keys [ callable? ] filter predicates
[ swap at nip ] [ at ] if* ;
: do-match ( dfa-traverser -- dfa-traverser )
dup match-done? [
dup {
[ current-index>> ]
[ text>> ]
[ current-state>> ]
[ dfa>> rows>> ]
} cleave
at >r nth r> match-transition [
increment-state do-match
] when*
] unless ;
: return-match ( dfa-traverser -- interval/f )
dup matches>> empty? [
drop f
] [
[ start-index>> ] [ matches>> peek ] bi 1 <range>
] if ;
: match ( string regexp -- pair )
<dfa-traverser> do-match return-match ;
: matches? ( string regexp -- ? )
dupd match [ [ length ] [ range-length 1- ] bi* = ] [ drop f ] if* ;
! character classes
! TUPLE: range-class from to ;
! TUPLE: or-class left right ;
! (?:a|b)* <- does not capture
! (a|b)*\1 <- group captured
! (?!abba) negative lookahead matches ababa but not abbaa
! (?=abba) positive lookahead matches abbaaa but not abaaa

View File

@ -0,0 +1 @@
Doug Coleman

View File

@ -0,0 +1,49 @@
USING: kernel tables tools.test ;
IN: tables.tests
: test-table
<table>
"a" "c" "z" <entry> over set-entry
"a" "o" "y" <entry> over set-entry
"a" "l" "x" <entry> over set-entry
"b" "o" "y" <entry> over set-entry
"b" "l" "x" <entry> over set-entry
"b" "s" "u" <entry> over set-entry ;
[
T{ table f
H{
{ "a" H{ { "l" "x" } { "c" "z" } { "o" "y" } } }
{ "b" H{ { "l" "x" } { "s" "u" } { "o" "y" } } }
}
H{ { "l" t } { "s" t } { "c" t } { "o" t } } }
] [ test-table ] unit-test
[ "x" t ] [ "a" "l" test-table get-entry ] unit-test
[ "har" t ] [
"a" "z" "har" <entry> test-table [ set-entry ] keep
>r "a" "z" r> get-entry
] unit-test
: vector-test-table
<vector-table>
"a" "c" "z" <entry> over add-value
"a" "c" "r" <entry> over add-value
"a" "o" "y" <entry> over add-value
"a" "l" "x" <entry> over add-value
"b" "o" "y" <entry> over add-value
"b" "l" "x" <entry> over add-value
"b" "s" "u" <entry> over add-value ;
[
T{ vector-table f
H{
{ "a"
H{ { "l" "x" } { "c" V{ "z" "r" } } { "o" "y" } } }
{ "b"
H{ { "l" "x" } { "s" "u" } { "o" "y" } } }
}
H{ { "l" t } { "s" t } { "c" t } { "o" t } }
}
] [ vector-test-table ] unit-test

View File

@ -0,0 +1,123 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: kernel namespaces sequences vectors assocs accessors ;
IN: state-tables
TUPLE: table rows columns start-state final-states ;
TUPLE: entry row-key column-key value ;
GENERIC: add-entry ( entry table -- )
: make-table ( class -- obj )
new
H{ } clone >>rows
H{ } clone >>columns
H{ } clone >>final-states ;
: <table> ( -- obj )
table make-table ;
C: <entry> entry
: (add-row) ( row-key table -- row )
2dup rows>> at* [
2nip
] [
drop H{ } clone [ -rot rows>> set-at ] keep
] if ;
: add-row ( row-key table -- )
(add-row) drop ;
: add-column ( column-key table -- )
t -rot columns>> set-at ;
: set-row ( row row-key table -- )
rows>> set-at ;
: lookup-row ( row-key table -- row/f ? )
rows>> at* ;
: row-exists? ( row-key table -- ? )
lookup-row nip ;
: lookup-column ( column-key table -- column/f ? )
columns>> at* ;
: column-exists? ( column-key table -- ? )
lookup-column nip ;
ERROR: no-row key ;
ERROR: no-column key ;
: get-row ( row-key table -- row )
dupd lookup-row [
nip
] [
drop no-row
] if ;
: get-column ( column-key table -- column )
dupd lookup-column [
nip
] [
drop no-column
] if ;
: get-entry ( row-key column-key table -- obj ? )
swapd lookup-row [
at*
] [
2drop f f
] if ;
: (set-entry) ( entry table -- value column-key row )
[ >r column-key>> r> add-column ] 2keep
dupd >r row-key>> r> (add-row)
>r [ value>> ] keep column-key>> r> ;
: set-entry ( entry table -- )
(set-entry) set-at ;
: delete-entry ( entry table -- )
>r [ column-key>> ] [ row-key>> ] bi r>
lookup-row [ delete-at ] [ 2drop ] if ;
: swap-rows ( row-key1 row-key2 table -- )
[ tuck get-row >r get-row r> ] 3keep
>r >r rot r> r> [ set-row ] keep set-row ;
: member?* ( obj obj -- bool )
2dup = [ 2drop t ] [ member? ] if ;
: find-by-column ( column-key data table -- seq )
swapd 2dup lookup-column 2drop
[
rows>> [
pick swap at* [
>r pick r> member?* [ , ] [ drop ] if
] [
2drop
] if
] assoc-each
] { } make 2nip ;
TUPLE: vector-table < table ;
: <vector-table> ( -- obj )
vector-table make-table ;
: add-hash-vector ( value key hash -- )
2dup at* [
dup vector? [
2nip push
] [
V{ } clone [ push ] keep
-rot >r >r [ push ] keep r> r> set-at
] if
] [
drop set-at
] if ;
M: vector-table add-entry ( entry table -- )
(set-entry) add-hash-vector ;

View File

@ -1,4 +1,4 @@
USING: kernel symbols tools.test parser generic words ; USING: kernel symbols tools.test parser generic words accessors ;
IN: symbols.tests IN: symbols.tests
[ ] [ SYMBOLS: a b c ; ] unit-test [ ] [ SYMBOLS: a b c ; ] unit-test
@ -13,3 +13,8 @@ DEFER: blah
[ f ] [ \ blah generic? ] unit-test [ f ] [ \ blah generic? ] unit-test
[ t ] [ \ blah symbol? ] unit-test [ t ] [ \ blah symbol? ] unit-test
[ "IN: symbols.tests USE: symbols SINGLETONS: blah blah blah ;" eval ]
[ error>> error>> def>> \ blah eq? ]
must-fail-with

View File

@ -10,5 +10,5 @@ IN: symbols
: SINGLETONS: : SINGLETONS:
";" parse-tokens ";" parse-tokens
[ create-class-in dup save-location define-singleton-class ] each ; [ create-class-in define-singleton-class ] each ;
parsing parsing

View File

@ -0,0 +1,5 @@
USING: kernel peg regexp2 sequences tools.test ;
IN: regexp2.tests
[ T{ parse-result f T{ slice f 3 3 "056" } 46 } ]
[ "056" 'octal' parse ] unit-test

View File

@ -0,0 +1,262 @@
USING: assocs combinators.lib kernel math math.parser
namespaces peg unicode.case sequences unicode.categories
memoize peg.parsers math.order ;
USE: io
USE: tools.walker
IN: regexp2
<PRIVATE
SYMBOL: ignore-case?
: char=-quot ( ch -- quot )
ignore-case? get
[ ch>upper [ swap ch>upper = ] ] [ [ = ] ] if
curry ;
: char-between?-quot ( ch1 ch2 -- quot )
ignore-case? get
[ [ ch>upper ] bi@ [ >r >r ch>upper r> r> between? ] ]
[ [ between? ] ]
if 2curry ;
: or-predicates ( quots -- quot )
[ \ dup prefix ] map [ [ t ] ] f short-circuit \ nip suffix ;
: literal-action [ nip ] curry action ;
: delay-action [ curry ] curry action ;
PRIVATE>
: ascii? ( n -- ? )
0 HEX: 7f between? ;
: octal-digit? ( n -- ? )
CHAR: 0 CHAR: 7 between? ;
: hex-digit? ( n -- ? )
{
[ dup digit? ]
[ dup CHAR: a CHAR: f between? ]
[ dup CHAR: A CHAR: F between? ]
} || nip ;
: control-char? ( n -- ? )
{ [ dup 0 HEX: 1f between? ] [ dup HEX: 7f = ] } || nip ;
: punct? ( n -- ? )
"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
: c-identifier-char? ( ch -- ? )
{ [ dup alpha? ] [ dup CHAR: _ = ] } || nip ;
: java-blank? ( n -- ? )
{
CHAR: \s
CHAR: \t CHAR: \n CHAR: \r
HEX: c HEX: 7 HEX: 1b
} member? ;
: java-printable? ( n -- ? )
{ [ dup alpha? ] [ dup punct? ] } || nip ;
MEMO: 'ordinary-char' ( -- parser )
[ "\\^*+?|(){}[$" member? not ] satisfy
[ char=-quot ] action ;
MEMO: 'octal-digit' ( -- parser ) [ octal-digit? ] satisfy ;
MEMO: 'octal' ( -- parser )
"0" token hide 'octal-digit' 1 3 from-m-to-n 2seq
[ first oct> ] action ;
MEMO: 'hex-digit' ( -- parser ) [ hex-digit? ] satisfy ;
MEMO: 'hex' ( -- parser )
"x" token hide 'hex-digit' 2 exactly-n 2seq
"u" token hide 'hex-digit' 6 exactly-n 2seq 2choice
[ first hex> ] action ;
: satisfy-tokens ( assoc -- parser )
[ >r token r> literal-action ] { } assoc>map choice ;
MEMO: 'simple-escape-char' ( -- parser )
{
{ "\\" CHAR: \\ }
{ "t" CHAR: \t }
{ "n" CHAR: \n }
{ "r" CHAR: \r }
{ "f" HEX: c }
{ "a" HEX: 7 }
{ "e" HEX: 1b }
} [ char=-quot ] assoc-map satisfy-tokens ;
MEMO: 'predefined-char-class' ( -- parser )
{
{ "d" [ digit? ] }
{ "D" [ digit? not ] }
{ "s" [ java-blank? ] }
{ "S" [ java-blank? not ] }
{ "w" [ c-identifier-char? ] }
{ "W" [ c-identifier-char? not ] }
} satisfy-tokens ;
MEMO: 'posix-character-class' ( -- parser )
{
{ "Lower" [ letter? ] }
{ "Upper" [ LETTER? ] }
{ "ASCII" [ ascii? ] }
{ "Alpha" [ Letter? ] }
{ "Digit" [ digit? ] }
{ "Alnum" [ alpha? ] }
{ "Punct" [ punct? ] }
{ "Graph" [ java-printable? ] }
{ "Print" [ java-printable? ] }
{ "Blank" [ " \t" member? ] }
{ "Cntrl" [ control-char? ] }
{ "XDigit" [ hex-digit? ] }
{ "Space" [ java-blank? ] }
} satisfy-tokens "p{" "}" surrounded-by ;
MEMO: 'simple-escape' ( -- parser )
[
'octal' ,
'hex' ,
"c" token hide [ LETTER? ] satisfy 2seq ,
any-char ,
] choice* [ char=-quot ] action ;
MEMO: 'escape' ( -- parser )
"\\" token hide [
'simple-escape-char' ,
'predefined-char-class' ,
'posix-character-class' ,
'simple-escape' ,
] choice* 2seq ;
MEMO: 'any-char' ( -- parser )
"." token [ drop t ] literal-action ;
MEMO: 'char' ( -- parser )
'any-char' 'escape' 'ordinary-char' 3choice [ satisfy ] action ;
DEFER: 'regexp'
TUPLE: group-result str ;
C: <group-result> group-result
MEMO: 'non-capturing-group' ( -- parser )
"?:" token hide 'regexp' ;
MEMO: 'positive-lookahead-group' ( -- parser )
"?=" token hide 'regexp' [ ensure ] action ;
MEMO: 'negative-lookahead-group' ( -- parser )
"?!" token hide 'regexp' [ ensure-not ] action ;
MEMO: 'simple-group' ( -- parser )
'regexp' [ [ <group-result> ] action ] action ;
MEMO: 'group' ( -- parser )
[
'non-capturing-group' ,
'positive-lookahead-group' ,
'negative-lookahead-group' ,
'simple-group' ,
] choice* "(" ")" surrounded-by ;
MEMO: 'range' ( -- parser )
any-char "-" token hide any-char 3seq
[ first2 char-between?-quot ] action ;
MEMO: 'character-class-term' ( -- parser )
'range'
'escape'
[ "\\]" member? not ] satisfy [ char=-quot ] action
3choice ;
MEMO: 'positive-character-class' ( -- parser )
! todo
"]" token [ CHAR: ] = ] literal-action 'character-class-term' repeat0 2seq
'character-class-term' repeat1 2choice [ or-predicates ] action ;
MEMO: 'negative-character-class' ( -- parser )
"^" token hide 'positive-character-class' 2seq
[ [ not ] append ] action ;
MEMO: 'character-class' ( -- parser )
'negative-character-class' 'positive-character-class' 2choice
"[" "]" surrounded-by [ satisfy ] action ;
MEMO: 'escaped-seq' ( -- parser )
any-char repeat1
[ ignore-case? get token ] action "\\Q" "\\E" surrounded-by ;
MEMO: 'break' ( quot -- parser )
satisfy ensure
epsilon just 2choice ;
MEMO: 'break-escape' ( -- parser )
"$" token [ "\r\n" member? ] 'break' literal-action
"\\b" token [ blank? ] 'break' literal-action
"\\B" token [ blank? not ] 'break' literal-action
"\\z" token epsilon just literal-action 4choice ;
MEMO: 'simple' ( -- parser )
[
'escaped-seq' ,
'break-escape' ,
'group' ,
'character-class' ,
'char' ,
] choice* ;
MEMO: 'exactly-n' ( -- parser )
'integer' [ exactly-n ] delay-action ;
MEMO: 'at-least-n' ( -- parser )
'integer' "," token hide 2seq [ at-least-n ] delay-action ;
MEMO: 'at-most-n' ( -- parser )
"," token hide 'integer' 2seq [ at-most-n ] delay-action ;
MEMO: 'from-m-to-n' ( -- parser )
'integer' "," token hide 'integer' 3seq
[ first2 from-m-to-n ] delay-action ;
MEMO: 'greedy-interval' ( -- parser )
'exactly-n' 'at-least-n' 'at-most-n' 'from-m-to-n' 4choice ;
MEMO: 'interval' ( -- parser )
'greedy-interval'
'greedy-interval' "?" token hide 2seq [ "reluctant {}" print ] action
'greedy-interval' "+" token hide 2seq [ "possessive {}" print ] action
3choice "{" "}" surrounded-by ;
MEMO: 'repetition' ( -- parser )
[
! Possessive
! "*+" token [ <!*> ] literal-action ,
! "++" token [ <!+> ] literal-action ,
! "?+" token [ <!?> ] literal-action ,
! Reluctant
! "*?" token [ <(*)> ] literal-action ,
! "+?" token [ <(+)> ] literal-action ,
! "??" token [ <(?)> ] literal-action ,
! Greedy
"*" token [ repeat0 ] literal-action ,
"+" token [ repeat1 ] literal-action ,
"?" token [ optional ] literal-action ,
] choice* ;
MEMO: 'dummy' ( -- parser )
epsilon [ ] literal-action ;
! todo -- check the action
! MEMO: 'term' ( -- parser )
! 'simple'
! 'repetition' 'interval' 'dummy' 3choice 2seq [ first2 call ] action
! <!+> [ <and-parser> ] action ;