From c2e3b6ac5a516dffb906c0767fe9c1e0ff10fe5a Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Fri, 20 Mar 2009 18:24:57 -0500 Subject: [PATCH 1/4] interval-maps:interval-values word, and more docs for interval-maps --- basis/interval-maps/interval-maps-docs.factor | 24 ++++++++++++++----- basis/interval-maps/interval-maps.factor | 17 +++++++++---- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/basis/interval-maps/interval-maps-docs.factor b/basis/interval-maps/interval-maps-docs.factor index de18458546..0d5e471bff 100644 --- a/basis/interval-maps/interval-maps-docs.factor +++ b/basis/interval-maps/interval-maps-docs.factor @@ -1,22 +1,32 @@ -USING: help.markup help.syntax ; +! Copyright (C) 2008, 2009 Daniel Ehrenberg. +! See http://factorcode.org/license.txt for BSD license. +USING: help.markup help.syntax assocs kernel sequences ; IN: interval-maps HELP: interval-at* -{ $values { "key" "an object" } { "map" "an interval map" } { "value" "the value for the key, or f" } { "?" "whether the key is present" } } +{ $values { "key" object } { "map" interval-map } { "value" "the value for the key, or f" } { "?" "whether the key is present" } } { $description "Looks up a key in an interval map, returning the corresponding value if the item is in an interval in the map, and a boolean flag. The operation takes O(log n) time." } ; HELP: interval-at -{ $values { "key" "an object" } { "map" "an interval map" } { "value" "the value for the key, or f" } } +{ $values { "key" object } { "map" interval-map } { "value" "the value for the key, or f" } } { $description "Looks up a key in an interval map, returning the value of the corresponding interval, or f if the interval is not present in the map." } ; HELP: interval-key? -{ $values { "key" "an object" } { "map" "an interval map" } { "?" "a boolean" } } +{ $values { "key" object } { "map" interval-map } { "?" "a boolean" } } { $description "Tests whether an object is in an interval in the interval map, returning t if the object is present." } ; HELP: -{ $values { "specification" "an assoc" } { "map" "an interval map" } } +{ $values { "specification" assoc } { "map" interval-map } } { $description "From a specification, produce an interval tree. The specification is an assoc where the keys are intervals, or pairs of numbers to represent intervals, or individual numbers to represent singleton intervals. The values are the values int he interval map. Construction time is O(n log n)." } ; +HELP: interval-values +{ $values { "map" interval-map } { "values" sequence } } +{ $description "Constructs a list of all of the values that interval map keys are associated with. This list may contain duplicates." } ; + +HELP: coalesce +{ $values { "alist" "an association list with integer keys" } { "specification" { "array of the format used by " { $link } } } } +{ $description "Finds ranges used in the given alist, coalescing them into a single range." } ; + ARTICLE: "interval-maps" "Interval maps" "The " { $vocab-link "interval-maps" } " vocabulary implements a data structure, similar to assocs, where a set of closed intervals of keys are associated with values. As such, interval maps do not conform to the assoc protocol, because intervals of floats, for example, can be used, and it is impossible to get a list of keys in between." $nl @@ -24,7 +34,9 @@ $nl { $subsection interval-at* } { $subsection interval-at } { $subsection interval-key? } +{ $subsection interval-values } "Use the following to construct interval maps" -{ $subsection } ; +{ $subsection } +{ $subsection coalesce } ; ABOUT: "interval-maps" diff --git a/basis/interval-maps/interval-maps.factor b/basis/interval-maps/interval-maps.factor index 63a5740845..22283deecb 100644 --- a/basis/interval-maps/interval-maps.factor +++ b/basis/interval-maps/interval-maps.factor @@ -8,17 +8,21 @@ TUPLE: interval-map array ; ] with search nip ; + array>> [ start <=> ] with search nip ; : interval-contains? ( key interval-node -- ? ) - first2 between? ; + [ start ] [ end ] bi between? ; : all-intervals ( sequence -- intervals ) [ [ dup number? [ dup 2array ] when ] dip ] { } assoc-map-as ; : disjoint? ( node1 node2 -- ? ) - [ second ] [ first ] bi* < ; + [ end ] [ start ] bi* < ; : ensure-disjoint ( intervals -- intervals ) dup [ disjoint? ] monotonic? @@ -30,14 +34,17 @@ TUPLE: interval-map array ; PRIVATE> : interval-at* ( key map -- value ? ) - [ drop ] [ array>> find-interval ] 2bi + [ drop ] [ find-interval ] 2bi [ nip ] [ interval-contains? ] 2bi - [ third t ] [ drop f f ] if ; + [ value t ] [ drop f f ] if ; : interval-at ( key map -- value ) interval-at* drop ; : interval-key? ( key map -- ? ) interval-at* nip ; +: interval-values ( map -- values ) + array>> [ value ] map ; + : ( specification -- map ) all-intervals [ [ first second ] compare ] sort >intervals ensure-disjoint interval-map boa ; From a9a8e0539330c26a0a796cb220a218c890e138d0 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Fri, 20 Mar 2009 18:39:38 -0500 Subject: [PATCH 2/4] fix apt libpango name --- build-support/factor.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build-support/factor.sh b/build-support/factor.sh index cf6aacb84f..107339eb51 100755 --- a/build-support/factor.sh +++ b/build-support/factor.sh @@ -502,7 +502,7 @@ make_boot_image() { } install_build_system_apt() { - sudo apt-get --yes install libc6-dev libpango-1.0-dev libx11-dev xorg-dev glutg3-dev wget git-core git-doc rlwrap gcc make + sudo apt-get --yes install libc6-dev libpango1.0-dev libx11-dev xorg-dev glutg3-dev wget git-core git-doc rlwrap gcc make check_ret sudo } From d8b7fb25154cba1a7ad9aed3b117857a44f62adb Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Fri, 20 Mar 2009 18:45:54 -0500 Subject: [PATCH 3/4] remove check for freetype, add check for pango --- build-support/factor.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build-support/factor.sh b/build-support/factor.sh index 107339eb51..c5be9f8957 100755 --- a/build-support/factor.sh +++ b/build-support/factor.sh @@ -139,10 +139,10 @@ check_library_exists() { } check_X11_libraries() { - check_library_exists freetype check_library_exists GLU check_library_exists GL check_library_exists X11 + check_library_exists pango } check_libraries() { From 9760f54857656cdc1bbb833feeb61998e14a9bfc Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Fri, 20 Mar 2009 19:03:26 -0500 Subject: [PATCH 4/4] Regexp supports Unicode properties (categories and script) --- basis/regexp/classes/classes.factor | 11 +++++---- basis/regexp/parser/parser.factor | 35 +++++++++++++++++++++++++---- basis/regexp/regexp-docs.factor | 6 +++-- basis/regexp/regexp-tests.factor | 28 +++++++++++++++++++++++ 4 files changed, 70 insertions(+), 10 deletions(-) diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor index 28b0ed1563..e114dea260 100644 --- a/basis/regexp/classes/classes.factor +++ b/basis/regexp/classes/classes.factor @@ -108,21 +108,24 @@ M: terminator-class class-member? ( obj class -- ? ) M: f class-member? 2drop f ; +: same? ( obj1 obj2 quot1: ( obj1 -- val1 ) quot2: ( obj2 -- val2 ) -- ? ) + bi* = ; inline + M: script-class class-member? - [ script-of ] [ script>> ] bi* = ; + [ script-of ] [ script>> ] same? ; M: category-class class-member? - [ category# ] [ category>> ] bi* = ; + [ category ] [ category>> ] same? ; M: category-range-class class-member? - [ category first ] [ category>> ] bi* = ; + [ category first ] [ category>> ] same? ; TUPLE: not-class class ; PREDICATE: not-integer < not-class class>> integer? ; UNION: simple-class - primitive-class range-class category-class category-range-class dot ; + primitive-class range-class dot ; PREDICATE: not-simple < not-class class>> simple-class? ; M: not-class class-member? diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index bf5465e0e2..e8de469a94 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -2,7 +2,8 @@ ! See http://factorcode.org/license.txt for BSD license. USING: peg.ebnf kernel math.parser sequences assocs arrays fry math combinators regexp.classes strings splitting peg locals accessors -regexp.ast unicode.case ; +regexp.ast unicode.case unicode.script.private unicode.categories +memoize interval-maps sets unicode.data combinators.short-circuit ; IN: regexp.parser : allowed-char? ( ch -- ? ) @@ -18,15 +19,41 @@ ERROR: bad-number ; ERROR: bad-class name ; +: simple ( str -- simple ) + ! Alternatively, first collation key level? + >case-fold [ " \t_" member? not ] filter ; + +: simple-table ( seq -- table ) + [ [ simple ] keep ] H{ } map>assoc ; + +MEMO: simple-script-table ( -- table ) + script-table interval-values prune simple-table ; + +MEMO: simple-category-table ( -- table ) + categories simple-table ; + : parse-unicode-class ( name -- class ) - ! Implement this! - drop f ; + { + { [ dup { [ length 1 = ] [ first "clmnpsz" member? ] } 1&& ] [ + >upper first + + ] } + { [ dup >title categories member? ] [ + simple-category-table at + ] } + { [ "script=" ?head ] [ + dup simple-script-table at + [ ] + [ "script=" prepend bad-class ] ?if + ] } + [ bad-class ] + } cond ; : unicode-class ( name -- class ) dup parse-unicode-class [ ] [ bad-class ] ?if ; : name>class ( name -- class ) - >string >case-fold { + >string simple { { "lower" letter-class } { "upper" LETTER-class } { "alpha" Letter-class } diff --git a/basis/regexp/regexp-docs.factor b/basis/regexp/regexp-docs.factor index 6ad340a82d..6d9f03781d 100644 --- a/basis/regexp/regexp-docs.factor +++ b/basis/regexp/regexp-docs.factor @@ -72,8 +72,10 @@ ARTICLE: { "regexp" "syntax" } "Regular expression syntax" { { $snippet "\\p{blank}" } "Non-newline whitespace" } { { $snippet "\\p{cntrl}" } "Control character" } { { $snippet "\\p{space}" } "Whitespace" } - { { $snippet "\\p{xdigit}" } "Hexidecimal digit" } } ! In the future: Unicode -"Full unicode properties are not yet supported." + { { $snippet "\\p{xdigit}" } "Hexidecimal digit" } + { { $snippet "\\p{Nd}" } "Character in Unicode category Nd" } + { { $snippet "\\p{Z}" } "Character in Unicode category beginning with Z" } + { { $snippet "\\p{script=Cham}" } "Character in the Cham writing system" } } { $heading "Boundaries" } "Special operators exist to match certain points in the string. These are called 'zero-width' because they do not consume any characters." { $table diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor index 0836c0988b..999caeaed6 100644 --- a/basis/regexp/regexp-tests.factor +++ b/basis/regexp/regexp-tests.factor @@ -480,3 +480,31 @@ IN: regexp-tests [ f ] [ "a\r" R/ a$./mds matches? ] unit-test [ t ] [ "a\n" R/ a$./ms matches? ] unit-test [ t ] [ "a\n" R/ a$./mds matches? ] unit-test + +! Unicode categories +[ t ] [ "a" R/ \p{L}/ matches? ] unit-test +[ t ] [ "A" R/ \p{L}/ matches? ] unit-test +[ f ] [ " " R/ \p{L}/ matches? ] unit-test +[ f ] [ "a" R/ \P{L}/ matches? ] unit-test +[ f ] [ "A" R/ \P{L}/ matches? ] unit-test +[ t ] [ " " R/ \P{L}/ matches? ] unit-test + +[ t ] [ "a" R/ \p{Ll}/ matches? ] unit-test +[ f ] [ "A" R/ \p{Ll}/ matches? ] unit-test +[ f ] [ " " R/ \p{Ll}/ matches? ] unit-test +[ f ] [ "a" R/ \P{Ll}/ matches? ] unit-test +[ t ] [ "A" R/ \P{Ll}/ matches? ] unit-test +[ t ] [ " " R/ \P{Ll}/ matches? ] unit-test + +[ t ] [ "a" R/ \p{script=Latin}/ matches? ] unit-test +[ f ] [ " " R/ \p{script=Latin}/ matches? ] unit-test +[ f ] [ "a" R/ \P{script=Latin}/ matches? ] unit-test +[ t ] [ " " R/ \P{script=Latin}/ matches? ] unit-test + +! These should be case-insensitive +[ f ] [ " " R/ \p{l}/ matches? ] unit-test +[ f ] [ "a" R/ \P{l}/ matches? ] unit-test +[ f ] [ "a" R/ \P{ll}/ matches? ] unit-test +[ t ] [ " " R/ \P{LL}/ matches? ] unit-test +[ f ] [ "a" R/ \P{sCriPt = latin}/ matches? ] unit-test +[ t ] [ " " R/ \P{SCRIPT = laTIn}/ matches? ] unit-test