Making collation better

db4
Daniel Ehrenberg 2008-05-24 12:17:08 -05:00
parent 1f7f4bb278
commit f332686375
3 changed files with 66 additions and 19 deletions

View File

@ -8,14 +8,16 @@ IN: unicode.collation.tests
[ ";" split1 drop " " split [ hex> ] "" map-as ] map ; [ ";" split1 drop " " split [ hex> ] "" map-as ] map ;
: test-two ( str1 str2 -- ) : test-two ( str1 str2 -- )
[ +lt+ ] -rot [ string<=> ] 2curry unit-test ; [ +lt+ ] -rot [ string<=> ] 2curry unit-test ;
: test parse-test 2 <clumps> [ test-two ] assoc-each ;
: find-failure : find-failure
parse-test dup 2 <clumps> parse-test dup 2 <clumps>
[ string<=> +lt+ = not ] assoc-find drop ; [ string<=> +lt+ = not ] assoc-find drop ;
: (find-failure)
dup 2 <clumps>
[ string<=> +lt+ = not ] assoc-find drop ;
: failures : failures
parse-test dup 2 <clumps> parse-test dup 2 <clumps>
[ string<=> +lt+ = not ] assoc-filter dup assoc-size ; [ string<=> +lt+ = not ] assoc-filter dup assoc-size ;

View File

@ -1,7 +1,7 @@
USING: sequences io.files io.encodings.ascii kernel values USING: sequences io.files io.encodings.ascii kernel values
splitting accessors math.parser ascii io assocs strings math splitting accessors math.parser ascii io assocs strings math
namespaces sorting combinators math.order arrays namespaces sorting combinators math.order arrays
unicode.normalize ; unicode.normalize unicode.data combinators.lib locals ;
IN: unicode.collation IN: unicode.collation
VALUE: ducet VALUE: ducet
@ -33,14 +33,39 @@ ascii <file-reader> parse-ducet \ ducet set-value
: derive-weight ( char -- weight ) : derive-weight ( char -- weight )
! This should check Noncharacter_Code_Point ! This should check Noncharacter_Code_Point
! If yes, then ignore the character ! If yes, then ignore the character
! otherwise, apply derivation formula ! otherwise, apply derivation formula with the right base
drop { } ; drop { } ;
: string>weights ( string -- weights ) : last ( -- char )
! This should actually look things up with building get empty? [ 0 ] [ building get peek peek ] if ;
! multichar collation elements
! Also, do weight derivation for things not in DUCET : blocked? ( char -- ? )
[ dup 1string ducet at [ ] [ derive-weight ] ?if ] combining-class [
last combining-class =
] [ last combining-class ] if* ;
: possible-bases ( -- slice-of-building )
building get dup [ first combining-class not ] find-last
drop [ 0 ] unless* tail-slice ;
:: ?combine ( char slice i -- ? )
[let | str [ i slice nth char suffix ] |
str ducet key? dup
[ str i slice set-nth ] when
] ;
: add ( char -- )
dup blocked? [ 1string , ] [
dup possible-bases dup length
[ ?combine ] 2with contains?
[ drop ] [ 1string , ] if
] if ;
: string>graphemes ( string -- graphemes )
[ [ add ] each ] { } make ;
: graphemes>weights ( graphemes -- weights )
[ dup ducet at [ ] [ derive-weight ] ?if ]
{ } map-as concat ; { } map-as concat ;
: append-weights ( weights quot -- ) : append-weights ( weights quot -- )
@ -65,8 +90,6 @@ ascii <file-reader> parse-ducet \ ducet set-value
[ zero? ] tri@ and and ; [ zero? ] tri@ and and ;
: filter-ignorable ( weights -- weights' ) : filter-ignorable ( weights -- weights' )
! Filters primary-ignorables which follow variable weighteds
! and all completely-ignorables
>r f r> [ >r f r> [
tuck primary>> zero? and tuck primary>> zero? and
[ swap ignorable?>> or ] [ swap ignorable?>> or ]
@ -74,7 +97,8 @@ ascii <file-reader> parse-ducet \ ducet set-value
] filter nip ; ] filter nip ;
: collation-key ( string -- key ) : collation-key ( string -- key )
nfd string>weights filter-ignorable weights>bytes ; nfd string>graphemes graphemes>weights
filter-ignorable weights>bytes ;
: compare-collation ( {str1,key} {str2,key} -- <=> ) : compare-collation ( {str1,key} {str2,key} -- <=> )
2dup [ second ] bi@ <=> dup +eq+ = 2dup [ second ] bi@ <=> dup +eq+ =
@ -87,3 +111,17 @@ ascii <file-reader> parse-ducet \ ducet set-value
: string<=> ( str1 str2 -- <=> ) : string<=> ( str1 str2 -- <=> )
[ dup collation-key 2array ] bi@ compare-collation ; [ dup collation-key 2array ] bi@ compare-collation ;
! Fix up table for long contractions
: help-one ( assoc key -- )
! Does this need to be more general?
2 head 2dup swap key? [ 2drop ] [
[ [ 1string swap at ] with { } map-as concat ]
[ swap set-at ] 2bi
] if ;
: insert-helpers ( assoc -- )
dup keys [ length 3 >= ] filter
[ help-one ] with each ;
ducet insert-helpers

19
extra/yahoo/yahoo.factor Normal file → Executable file
View File

@ -1,7 +1,7 @@
! Copyright (C) 2006 Daniel Ehrenberg ! Copyright (C) 2006 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: http.client xml xml.utilities kernel sequences USING: http.client xml xml.utilities kernel sequences
namespaces http math.parser help math.order ; namespaces http math.parser help math.order locals ;
IN: yahoo IN: yahoo
TUPLE: result title url summary ; TUPLE: result title url summary ;
@ -16,14 +16,21 @@ C: <result> result
] map ; ] map ;
: yahoo-url ( -- str ) : yahoo-url ( -- str )
"http://search.yahooapis.com/WebSearchService/V1/webSearch?appid=Factor-search&query=" ; "http://search.yahooapis.com/WebSearchService/V1/webSearch" ;
: query ( search num -- url ) :: query ( search num appid -- url )
[ [
yahoo-url % yahoo-url %
swap url-encode % "?appid=" % appid %
"&results=" % # "&query=" % search url-encode %
"&results=" % num #
] "" make ; ] "" make ;
: search-yahoo ( search num -- seq ) : factor-id
"fRrVAKzV34GDyeRw6bUHDhEWHRedwfOC7e61wwXZLgGF80E67spxdQXuugBe2pgIevMmKwA-" ;
: search-yahoo/id ( search num id -- seq )
query http-get string>xml parse-yahoo ; query http-get string>xml parse-yahoo ;
: search-yahoo ( search num -- seq )
factor-id search-yahoo/id ;