diff --git a/basis/unicode/case/case-tests.factor b/basis/unicode/case/case-tests.factor index f9d304e05c..6e26a36a19 100644 --- a/basis/unicode/case/case-tests.factor +++ b/basis/unicode/case/case-tests.factor @@ -6,12 +6,12 @@ USING: unicode.case tools.test namespaces ; [ "Hello How Are You? I'm Good" ] [ "hEllo how ARE yOU? I'm good" >title ] unit-test [ "FUSS" ] [ "Fu\u0000DF" >upper ] unit-test -[ "\u0003C3\u0003C2" ] [ "\u0003A3\u0003A3" >lower ] unit-test +[ "\u0003C3a\u0003C2 \u0003C3\u0003C2 \u0003C3a\u0003C2" ] [ "\u0003A3A\u0003A3 \u0003A3\u0003A3 \u0003A3A\u0003A3" >lower ] unit-test [ t ] [ "hello how are you?" lower? ] unit-test [ "tr" locale set [ "i\u000131i \u000131jj" ] [ "i\u000131I\u000307 IJj" >lower ] unit-test -! [ "I\u00307\u000131i Ijj" ] [ "i\u000131I\u000307 IJj" >title ] unit-test + [ "I\u000307\u000131i Ijj" ] [ "i\u000131I\u000307 IJj" >title ] unit-test [ "I\u000307II\u000307 IJJ" ] [ "i\u000131I\u000307 IJj" >upper ] unit-test "lt" locale set ! Lithuanian casing tests diff --git a/basis/unicode/case/case.factor b/basis/unicode/case/case.factor index 5d103e2dd0..b0472cd9cb 100644 --- a/basis/unicode/case/case.factor +++ b/basis/unicode/case/case.factor @@ -1,8 +1,8 @@ ! Copyright (C) 2008 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. USING: unicode.data sequences sequences.next namespaces make -unicode.normalize math unicode.categories combinators -assocs strings splitting kernel accessors unicode.breaks ; +unicode.normalize math unicode.categories combinators unicode.syntax +assocs strings splitting kernel accessors unicode.breaks fry ; IN: unicode.case SYMBOL: locale ! Just casing locale, or overall? upper ( ? next ch -- ? ) - rot [ 2drop f ] - [ swap dot-over = over "ij" member? and swap , ] if ; - : lithuanian>upper ( string -- lower ) - [ f swap [ lithuanian-ch>upper ] each-next drop ] "" make ; + "i\u000307" "i" replace + "j\u000307" "j" replace ; : mark-above? ( ch -- ? ) combining-class 230 = ; -: lithuanian-ch>lower ( next ch -- ) - ! This fails to add a dot above in certain edge cases - ! where there is a non-above combining mark before an above one - ! in Lithuanian - dup , "IJ" member? swap mark-above? and [ dot-over , ] when ; +: with-rest ( seq quot: ( seq -- seq ) -- seq ) + [ unclip ] dip swap slip prefix ; inline + +: add-dots ( seq -- seq ) + [ [ "" ] [ + dup first mark-above? + [ CHAR: combining-dot-above prefix ] when + ] if-empty ] with-rest ; : lithuanian>lower ( string -- lower ) - [ [ lithuanian-ch>lower ] each-next ] "" make ; - -: turk-ch>upper ( ch -- ) - dup CHAR: i = - [ drop CHAR: I , dot-over , ] [ , ] if ; + "i" split add-dots "i" join + "j" split add-dots "i" join ; : turk>upper ( string -- upper-i ) - [ [ turk-ch>upper ] each ] "" make ; - -: turk-ch>lower ( ? next ch -- ? ) - { - { [ rot ] [ 2drop f ] } - { [ dup CHAR: I = ] [ - drop dot-over = - dup CHAR: i HEX: 131 ? , - ] } - [ , drop f ] - } cond ; + "i" "I\u000307" replace ; : turk>lower ( string -- lower-i ) - [ f swap [ turk-ch>lower ] each-next drop ] "" make ; + "I\u000307" "i" replace + "I" "\u000131" replace ; -: word-boundary ( prev char -- new ? ) - dup non-starter? [ drop dup ] when - swap uncased? ; +: fix-sigma-end ( string -- string ) + [ "" ] [ + dup peek CHAR: greek-small-letter-sigma = + [ 1 head* CHAR: greek-small-letter-final-sigma suffix ] when + ] if-empty ; : sigma-map ( string -- string ) - [ - swap [ uncased? ] keep not or - [ drop HEX: 3C2 ] when - ] map-next ; + { CHAR: greek-capital-letter-sigma } split [ [ + [ { CHAR: greek-small-letter-sigma } ] [ + dup first uncased? + CHAR: greek-small-letter-final-sigma + CHAR: greek-small-letter-sigma ? prefix + ] if-empty + ] map ] with-rest concat fix-sigma-end ; : final-sigma ( string -- string ) - HEX: 3A3 over member? [ sigma-map ] when ; + CHAR: greek-capital-letter-sigma + over member? [ sigma-map ] when ; : map-case ( string string-quot char-quot -- case ) [ @@ -83,26 +84,26 @@ SYMBOL: locale ! Just casing locale, or overall? ] 2curry each ] "" make ; inline -: (>lower) ( string -- lower ) - [ lower>> ] [ ch>lower ] map-case ; - -: (>title) ( string -- title ) - [ title>> ] [ ch>title ] map-case ; - -: (>upper) ( string -- upper ) - [ upper>> ] [ ch>upper ] map-case ; - -: title-word ( string -- title ) - unclip 1string [ (>lower) ] [ (>title) ] bi* prepend ; - PRIVATE> : >lower ( string -- lower ) - i-dot? [ turk>lower ] when - final-sigma (>lower) ; + i-dot? [ turk>lower ] when final-sigma + [ lower>> ] [ ch>lower ] map-case ; : >upper ( string -- upper ) - i-dot? [ turk>upper ] when (>upper) ; + i-dot? [ turk>upper ] when + [ upper>> ] [ ch>upper ] map-case ; + +title) ( string -- title ) + i-dot? [ turk>upper ] when + [ title>> ] [ ch>title ] map-case ; + +: title-word ( string -- title ) + unclip 1string [ >lower ] [ (>title) ] bi* prepend ; + +PRIVATE> : >title ( string -- title ) final-sigma >words [ title-word ] map concat ;