| 
									
										
										
										
											2008-10-05 19:36:56 -04:00
										 |  |  | ! Copyright (C) 2008 Daniel Ehrenberg. | 
					
						
							|  |  |  | ! See http://factorcode.org/license.txt for BSD license. | 
					
						
							| 
									
										
										
										
											2016-03-31 02:29:48 -04:00
										 |  |  | USING: accessors arrays assocs combinators fry interval-maps | 
					
						
							|  |  |  | kernel literals locals math namespaces parser sequences | 
					
						
							|  |  |  | simple-flat-file unicode.categories unicode.data | 
					
						
							| 
									
										
										
										
											2014-05-19 23:59:42 -04:00
										 |  |  | unicode.normalize.private words words.constant ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | IN: unicode.breaks | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-07 18:59:01 -05:00
										 |  |  | <PRIVATE
 | 
					
						
							| 
									
										
										
										
											2009-01-07 13:23:07 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-05-19 23:59:42 -04:00
										 |  |  | ! Grapheme breaks | 
					
						
							|  |  |  | << | 
					
						
							| 
									
										
										
										
											2010-04-18 16:34:18 -04:00
										 |  |  | CONSTANT: Any 0
 | 
					
						
							|  |  |  | CONSTANT: L 1
 | 
					
						
							|  |  |  | CONSTANT: V 2
 | 
					
						
							|  |  |  | CONSTANT: T 3
 | 
					
						
							|  |  |  | CONSTANT: LV 4
 | 
					
						
							|  |  |  | CONSTANT: LVT 5
 | 
					
						
							|  |  |  | CONSTANT: Extend 6
 | 
					
						
							|  |  |  | CONSTANT: Control 7
 | 
					
						
							|  |  |  | CONSTANT: CR 8
 | 
					
						
							|  |  |  | CONSTANT: LF 9
 | 
					
						
							|  |  |  | CONSTANT: SpacingMark 10
 | 
					
						
							|  |  |  | CONSTANT: Prepend 11
 | 
					
						
							|  |  |  | CONSTANT: graphemes 12
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : jamo-class ( ch -- class )
 | 
					
						
							|  |  |  |     dup initial? [ drop L ] | 
					
						
							|  |  |  |     [ dup medial? [ drop V ] [ final? T Any ? ] if ] if ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-06 11:19:19 -05:00
										 |  |  | : hangul-class ( ch -- class )
 | 
					
						
							| 
									
										
										
										
											2011-11-23 21:49:33 -05:00
										 |  |  |     hangul-base - 0x1C mod zero? LV LVT ? ;
 | 
					
						
							| 
									
										
										
										
											2009-01-06 11:19:19 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | CATEGORY: grapheme-control Zl Zp Cc Cf ;
 | 
					
						
							|  |  |  | : control-class ( ch -- class )
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { CHAR: \r [ CR ] } | 
					
						
							|  |  |  |         { CHAR: \n [ LF ] } | 
					
						
							| 
									
										
										
										
											2011-11-23 21:49:33 -05:00
										 |  |  |         { 0x200C [ Extend ] } | 
					
						
							|  |  |  |         { 0x200D [ Extend ] } | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  |         [ drop Control ] | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-03-21 02:11:45 -04:00
										 |  |  | CATEGORY: extend | 
					
						
							|  |  |  |     Me Mn | | 
					
						
							|  |  |  |     "Other_Grapheme_Extend" property? ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-06 11:19:19 -05:00
										 |  |  | : loe? ( ch -- ? )
 | 
					
						
							|  |  |  |     "Logical_Order_Exception" property? ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | CATEGORY: spacing Mc ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | : grapheme-class ( ch -- class )
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { [ dup jamo? ] [ jamo-class ] } | 
					
						
							| 
									
										
										
										
											2009-01-06 11:19:19 -05:00
										 |  |  |         { [ dup hangul? ] [ hangul-class ] } | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  |         { [ dup grapheme-control? ] [ control-class ] } | 
					
						
							| 
									
										
										
										
											2009-01-06 11:19:19 -05:00
										 |  |  |         { [ dup extend? ] [ drop Extend ] } | 
					
						
							|  |  |  |         { [ dup spacing? ] [ drop SpacingMark ] } | 
					
						
							|  |  |  |         { [ loe? ] [ Prepend ] } | 
					
						
							| 
									
										
										
										
											2008-04-11 13:54:51 -04:00
										 |  |  |         [ Any ] | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  |     } cond ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-07 13:23:07 -05:00
										 |  |  | : init-table ( size -- table )
 | 
					
						
							|  |  |  |     dup [ f <array> ] curry replicate ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | SYMBOL: table | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : finish-table ( -- table )
 | 
					
						
							|  |  |  |     table get [ [ 1 = ] map ] map ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-06-07 12:56:47 -04:00
										 |  |  | : eval-seq ( seq -- seq )
 | 
					
						
							|  |  |  |     [ dup word? [ execute( -- x ) ] when ] map ;
 | 
					
						
							| 
									
										
										
										
											2009-01-09 16:53:35 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | : (set-table) ( class1 class2 val -- )
 | 
					
						
							| 
									
										
										
										
											2009-01-15 16:43:42 -05:00
										 |  |  |     [ table get nth ] dip '[ _ or ] change-nth ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-09 16:53:35 -05:00
										 |  |  | : set-table ( classes1 classes2 val -- )
 | 
					
						
							|  |  |  |     [ [ eval-seq ] bi@ ] dip
 | 
					
						
							|  |  |  |     [ [ (set-table) ] curry with each ] 2curry each ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | : connect ( class1 class2 -- ) 1 set-table ;
 | 
					
						
							|  |  |  | : disconnect ( class1 class2 -- ) 0 set-table ;
 | 
					
						
							| 
									
										
										
										
											2014-05-19 23:59:42 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | : make-grapheme-table ( -- )
 | 
					
						
							| 
									
										
										
										
											2009-01-09 16:53:35 -05:00
										 |  |  |     { CR } { LF } connect | 
					
						
							| 
									
										
										
										
											2010-01-14 10:10:13 -05:00
										 |  |  |     { Control CR LF } graphemes iota disconnect | 
					
						
							|  |  |  |     graphemes iota { Control CR LF } disconnect | 
					
						
							| 
									
										
										
										
											2009-01-09 16:53:35 -05:00
										 |  |  |     { L } { L V LV LVT } connect | 
					
						
							|  |  |  |     { LV V } { V T } connect | 
					
						
							|  |  |  |     { LVT T } { T } connect | 
					
						
							| 
									
										
										
										
											2010-01-14 10:10:13 -05:00
										 |  |  |     graphemes iota { Extend } connect | 
					
						
							|  |  |  |     graphemes iota { SpacingMark } connect | 
					
						
							|  |  |  |     { Prepend } graphemes iota connect ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-06-08 15:38:38 -04:00
										 |  |  | "grapheme-table" create-word-in | 
					
						
							| 
									
										
										
										
											2014-05-19 23:59:42 -04:00
										 |  |  | graphemes init-table table | 
					
						
							|  |  |  | [ make-grapheme-table finish-table ] with-variable
 | 
					
						
							|  |  |  | define-constant | 
					
						
							|  |  |  | >> | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : grapheme-break? ( class1 class2 -- ? )
 | 
					
						
							| 
									
										
										
										
											2014-05-19 23:59:42 -04:00
										 |  |  |     grapheme-table nth nth not ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-07 13:23:07 -05:00
										 |  |  | ! Word breaks | 
					
						
							| 
									
										
										
										
											2014-05-19 23:59:42 -04:00
										 |  |  | << | 
					
						
							| 
									
										
										
										
											2010-04-18 16:34:18 -04:00
										 |  |  | CONSTANT: wOther 0
 | 
					
						
							|  |  |  | CONSTANT: wCR 1
 | 
					
						
							|  |  |  | CONSTANT: wLF 2
 | 
					
						
							|  |  |  | CONSTANT: wNewline 3
 | 
					
						
							|  |  |  | CONSTANT: wExtend 4
 | 
					
						
							|  |  |  | CONSTANT: wFormat 5
 | 
					
						
							|  |  |  | CONSTANT: wKatakana 6
 | 
					
						
							|  |  |  | CONSTANT: wALetter 7
 | 
					
						
							|  |  |  | CONSTANT: wMidLetter 8
 | 
					
						
							|  |  |  | CONSTANT: wMidNum 9
 | 
					
						
							|  |  |  | CONSTANT: wMidNumLet 10
 | 
					
						
							|  |  |  | CONSTANT: wNumeric 11
 | 
					
						
							|  |  |  | CONSTANT: wExtendNumLet 12
 | 
					
						
							| 
									
										
										
										
											2015-06-08 14:48:13 -04:00
										 |  |  | CONSTANT: unicode-words 13
 | 
					
						
							| 
									
										
										
										
											2009-01-07 13:23:07 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2012-09-19 13:09:05 -04:00
										 |  |  | ! Is there a way to avoid this? | 
					
						
							|  |  |  | CONSTANT: word-break-classes H{ | 
					
						
							|  |  |  |     { "Other" 0 } { "CR" 1 } { "LF" 2 } { "Newline" 3 } | 
					
						
							|  |  |  |     { "Extend" 4 } { "Format" 5 } { "Katakana" 6 } | 
					
						
							|  |  |  |     { "ALetter" 7 } { "MidLetter" 8 } | 
					
						
							|  |  |  |     { "MidNum" 9 } { "MidNumLet" 10 } { "Numeric" 11 } | 
					
						
							|  |  |  |     { "ExtendNumLet" 12 } | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2009-01-07 13:23:07 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-06-08 15:38:38 -04:00
										 |  |  | "word-break-table" create-word-in | 
					
						
							| 
									
										
										
										
											2014-05-19 23:59:42 -04:00
										 |  |  | "vocab:unicode/data/WordBreakProperty.txt" | 
					
						
							|  |  |  | load-interval-file dup array>> | 
					
						
							|  |  |  | [ 2 swap [ word-break-classes at ] change-nth ] each
 | 
					
						
							|  |  |  | define-constant | 
					
						
							|  |  |  | >> | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-07 13:23:07 -05:00
										 |  |  | : word-break-prop ( char -- word-break-prop )
 | 
					
						
							| 
									
										
										
										
											2014-05-19 23:59:42 -04:00
										 |  |  |     word-break-table interval-at wOther or ;
 | 
					
						
							| 
									
										
										
										
											2009-01-07 13:23:07 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-05-19 23:59:42 -04:00
										 |  |  | << | 
					
						
							| 
									
										
										
										
											2009-01-07 13:23:07 -05:00
										 |  |  | SYMBOL: check-letter-before | 
					
						
							|  |  |  | SYMBOL: check-letter-after | 
					
						
							|  |  |  | SYMBOL: check-number-before | 
					
						
							|  |  |  | SYMBOL: check-number-after | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : make-word-table ( -- )
 | 
					
						
							| 
									
										
										
										
											2009-01-09 16:53:35 -05:00
										 |  |  |     { wCR } { wLF } connect | 
					
						
							| 
									
										
										
										
											2015-06-08 14:48:13 -04:00
										 |  |  |     { wNewline wCR wLF } unicode-words iota disconnect | 
					
						
							|  |  |  |     unicode-words iota { wNewline wCR wLF } disconnect | 
					
						
							| 
									
										
										
										
											2009-01-09 16:53:35 -05:00
										 |  |  |     { wALetter } { wMidLetter wMidNumLet } check-letter-after set-table | 
					
						
							|  |  |  |     { wMidLetter wMidNumLet } { wALetter } check-letter-before set-table | 
					
						
							|  |  |  |     { wNumeric wALetter } { wNumeric wALetter } connect | 
					
						
							|  |  |  |     { wNumeric } { wMidNum wMidNumLet } check-number-after set-table | 
					
						
							|  |  |  |     { wMidNum wMidNumLet } { wNumeric } check-number-before set-table | 
					
						
							|  |  |  |     { wKatakana } { wKatakana } connect | 
					
						
							|  |  |  |     { wALetter wNumeric wKatakana wExtendNumLet } { wExtendNumLet } | 
					
						
							|  |  |  |     [ connect ] [ swap connect ] 2bi ;
 | 
					
						
							| 
									
										
										
										
											2009-01-07 13:23:07 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | : finish-word-table ( -- table )
 | 
					
						
							|  |  |  |     table get [ | 
					
						
							|  |  |  |         [ { { 0 [ f ] } { 1 [ t ] } [ ] } case ] map
 | 
					
						
							|  |  |  |     ] map ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-06-08 15:38:38 -04:00
										 |  |  | "word-table" create-word-in | 
					
						
							| 
									
										
										
										
											2015-06-08 14:48:13 -04:00
										 |  |  | unicode-words init-table table | 
					
						
							| 
									
										
										
										
											2009-01-07 13:23:07 -05:00
										 |  |  | [ make-word-table finish-word-table ] with-variable
 | 
					
						
							| 
									
										
										
										
											2014-05-19 23:59:42 -04:00
										 |  |  | define-constant | 
					
						
							|  |  |  | >> | 
					
						
							| 
									
										
										
										
											2009-01-07 13:23:07 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-07 16:08:08 -05:00
										 |  |  | : word-table-nth ( class1 class2 -- ? )
 | 
					
						
							| 
									
										
										
										
											2014-05-19 23:59:42 -04:00
										 |  |  |     word-table nth nth ;
 | 
					
						
							| 
									
										
										
										
											2009-01-07 16:08:08 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-09 16:53:35 -05:00
										 |  |  | :: property-not= ( str i property -- ? )
 | 
					
						
							| 
									
										
										
										
											2009-01-08 23:23:39 -05:00
										 |  |  |     i [ | 
					
						
							|  |  |  |         i str ?nth [ word-break-prop property = not ] | 
					
						
							|  |  |  |         [ f ] if*
 | 
					
						
							|  |  |  |     ] [ t ] if ;
 | 
					
						
							| 
									
										
										
										
											2009-01-07 16:08:08 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2011-10-12 14:03:39 -04:00
										 |  |  | : (format/extended?) ( class -- ? )
 | 
					
						
							|  |  |  |     ${ wExtend wFormat } member? ; inline
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-07 16:08:08 -05:00
										 |  |  | : format/extended? ( ch -- ? )
 | 
					
						
							| 
									
										
										
										
											2011-10-12 14:03:39 -04:00
										 |  |  |     word-break-prop (format/extended?) ;
 | 
					
						
							| 
									
										
										
										
											2009-01-07 16:08:08 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-09 16:53:35 -05:00
										 |  |  | : (walk-up) ( str i -- j )
 | 
					
						
							|  |  |  |     swap [ format/extended? not ] find-from drop ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : walk-up ( str i -- j )
 | 
					
						
							| 
									
										
										
										
											2009-08-13 20:21:44 -04:00
										 |  |  |     dupd 1 + (walk-up) [ 1 + (walk-up) ] [ drop f ] if* ;
 | 
					
						
							| 
									
										
										
										
											2009-01-07 16:08:08 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-09 16:53:35 -05:00
										 |  |  | : (walk-down) ( str i -- j )
 | 
					
						
							|  |  |  |     swap [ format/extended? not ] find-last-from drop ;
 | 
					
						
							| 
									
										
										
										
											2009-01-07 16:08:08 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-09 16:53:35 -05:00
										 |  |  | : walk-down ( str i -- j )
 | 
					
						
							| 
									
										
										
										
											2009-08-13 20:21:44 -04:00
										 |  |  |     dupd (walk-down) [ 1 - (walk-down) ] [ drop f ] if* ;
 | 
					
						
							| 
									
										
										
										
											2009-01-09 16:53:35 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-15 17:46:57 -05:00
										 |  |  | : word-break? ( str i table-entry -- ? )
 | 
					
						
							|  |  |  |     { | 
					
						
							| 
									
										
										
										
											2009-01-09 16:53:35 -05:00
										 |  |  |         { t [ 2drop f ] } | 
					
						
							|  |  |  |         { f [ 2drop t ] } | 
					
						
							| 
									
										
										
										
											2009-01-07 16:08:08 -05:00
										 |  |  |         { check-letter-after | 
					
						
							| 
									
										
										
										
											2009-01-09 16:53:35 -05:00
										 |  |  |             [ dupd walk-up wALetter property-not= ] } | 
					
						
							| 
									
										
										
										
											2009-01-07 16:08:08 -05:00
										 |  |  |         { check-letter-before | 
					
						
							| 
									
										
										
										
											2009-01-09 16:53:35 -05:00
										 |  |  |             [ dupd walk-down wALetter property-not= ] } | 
					
						
							| 
									
										
										
										
											2009-01-07 16:08:08 -05:00
										 |  |  |         { check-number-after | 
					
						
							| 
									
										
										
										
											2009-01-09 16:53:35 -05:00
										 |  |  |             [ dupd walk-up wNumeric property-not= ] } | 
					
						
							| 
									
										
										
										
											2009-01-07 16:08:08 -05:00
										 |  |  |         { check-number-before | 
					
						
							| 
									
										
										
										
											2009-01-09 16:53:35 -05:00
										 |  |  |             [ dupd walk-down wNumeric property-not= ] } | 
					
						
							| 
									
										
										
										
											2009-01-11 20:41:48 -05:00
										 |  |  |     } case ;
 | 
					
						
							| 
									
										
										
										
											2009-01-07 13:23:07 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-07 16:08:08 -05:00
										 |  |  | :: word-break-next ( old-class new-char i str -- next-class ? )
 | 
					
						
							| 
									
										
										
										
											2011-10-12 14:03:39 -04:00
										 |  |  |     new-char word-break-prop :> new-class | 
					
						
							|  |  |  |     new-class (format/extended?) | 
					
						
							|  |  |  |     [ old-class dup ${ wCR wLF wNewline } member? ] [ | 
					
						
							|  |  |  |         new-class old-class over word-table-nth | 
					
						
							| 
									
										
										
										
											2012-08-24 17:57:26 -04:00
										 |  |  |         [ str i 1 - ] dip word-break? | 
					
						
							| 
									
										
										
										
											2009-01-11 20:41:48 -05:00
										 |  |  |     ] if ;
 | 
					
						
							| 
									
										
										
										
											2009-01-07 13:23:07 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-07 18:59:01 -05:00
										 |  |  | PRIVATE>
 |