| 
									
										
										
										
											2013-11-20 19:54:56 -05:00
										 |  |  | ! Copyright (C) 2013 Björn Lindqvist | 
					
						
							|  |  |  | ! See http://factorcode.org/license.txt for BSD license | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-11-20 16:06:49 -05:00
										 |  |  | USING: accessors alien alien.accessors alien.c-types alien.data | 
					
						
							| 
									
										
										
										
											2013-11-20 19:54:56 -05:00
										 |  |  | alien.enums alien.strings arrays assocs combinators fry | 
					
						
							|  |  |  | io.encodings.string io.encodings.utf8 kernel literals math | 
					
						
							|  |  |  | math.bitwise pcre.ffi sequences splitting strings ;
 | 
					
						
							| 
									
										
										
										
											2013-11-04 07:26:37 -05:00
										 |  |  | QUALIFIED: regexp | 
					
						
							| 
									
										
										
										
											2013-09-18 16:45:17 -04:00
										 |  |  | IN: pcre | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-11-29 09:52:30 -05:00
										 |  |  | ERROR: bad-option what ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-11-20 16:29:39 -05:00
										 |  |  | ERROR: malformed-regexp expr error ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ERROR: pcre-error value ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-11-20 16:06:49 -05:00
										 |  |  | <PRIVATE
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-11-20 16:29:39 -05:00
										 |  |  | : replace-all ( seq subseqs new -- seq )
 | 
					
						
							|  |  |  |     swapd '[ _ replace ] reduce ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : split-subseqs ( seq subseqs -- seqs )
 | 
					
						
							|  |  |  |     dup first [ replace-all ] keep split-subseq [ >string ] map harvest ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-11-20 16:06:49 -05:00
										 |  |  | : utf8-start-byte? ( byte -- ? )
 | 
					
						
							|  |  |  |     0xc0 bitand 0x80 = not ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : next-utf8-char ( byte-array pos -- pos' )
 | 
					
						
							| 
									
										
										
										
											2013-11-20 19:54:56 -05:00
										 |  |  |     1 + 2dup swap ?nth [ | 
					
						
							|  |  |  |         utf8-start-byte? [ nip ] [ next-utf8-char ] if
 | 
					
						
							|  |  |  |     ] [ 2drop f ] if* ;
 | 
					
						
							| 
									
										
										
										
											2013-11-20 16:06:49 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-11-30 12:38:55 -05:00
										 |  |  | : check-bad-option ( err value what -- value )
 | 
					
						
							| 
									
										
										
										
											2015-08-13 19:13:05 -04:00
										 |  |  |     rot 0 = [ drop ] [ bad-option ] if ;
 | 
					
						
							| 
									
										
										
										
											2013-11-20 16:29:39 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-11-30 12:38:55 -05:00
										 |  |  | : pcre-config ( what -- value )
 | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         dup { | 
					
						
							|  |  |  |             PCRE_CONFIG_MATCH_LIMIT | 
					
						
							|  |  |  |             PCRE_CONFIG_MATCH_LIMIT_RECURSION | 
					
						
							|  |  |  |         } member? [ | 
					
						
							|  |  |  |             { long } [ pcre_config ] with-out-parameters | 
					
						
							|  |  |  |         ] [ | 
					
						
							|  |  |  |             { int } [ pcre_config ] with-out-parameters | 
					
						
							|  |  |  |         ] if
 | 
					
						
							|  |  |  |     ] keep check-bad-option ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-11-20 19:54:56 -05:00
										 |  |  | : pcre-fullinfo ( pcre extra what -- obj )
 | 
					
						
							| 
									
										
										
										
											2013-11-29 10:17:33 -05:00
										 |  |  |     [ { int } [ pcre_fullinfo ] with-out-parameters ] keep
 | 
					
						
							| 
									
										
										
										
											2013-11-30 12:38:55 -05:00
										 |  |  |     check-bad-option ;
 | 
					
						
							| 
									
										
										
										
											2013-11-20 16:06:49 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-11-20 19:54:56 -05:00
										 |  |  | : pcre-substring-list ( subject match-array count -- alien )
 | 
					
						
							|  |  |  |     { void* } [ pcre_get_substring_list drop ] with-out-parameters ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-11-20 16:06:49 -05:00
										 |  |  | : name-count ( pcre extra -- n )
 | 
					
						
							| 
									
										
										
										
											2013-11-20 19:54:56 -05:00
										 |  |  |     PCRE_INFO_NAMECOUNT pcre-fullinfo ;
 | 
					
						
							| 
									
										
										
										
											2013-11-20 16:06:49 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | : name-table ( pcre extra -- addr )
 | 
					
						
							|  |  |  |     [ drop alien-address 32 on-bits unmask ] | 
					
						
							| 
									
										
										
										
											2014-07-10 19:33:20 -04:00
										 |  |  |     ! On at least win64, the pointer is returned as an int and is | 
					
						
							|  |  |  |     ! negative. Cast it to a uint and everything works. | 
					
						
							| 
									
										
										
										
											2014-07-10 19:22:45 -04:00
										 |  |  |     [ PCRE_INFO_NAMETABLE pcre-fullinfo int <ref> uint deref ] 2bi + ;
 | 
					
						
							| 
									
										
										
										
											2013-11-20 16:06:49 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | : name-entry-size ( pcre extra -- size )
 | 
					
						
							| 
									
										
										
										
											2013-11-20 19:54:56 -05:00
										 |  |  |     PCRE_INFO_NAMEENTRYSIZE pcre-fullinfo ;
 | 
					
						
							| 
									
										
										
										
											2013-11-20 16:06:49 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | : name-table-entry ( addr -- group-index group-name )
 | 
					
						
							| 
									
										
										
										
											2013-11-20 19:54:56 -05:00
										 |  |  |     [ <alien> 1 alien-unsigned-1 ] | 
					
						
							| 
									
										
										
										
											2013-11-29 09:52:30 -05:00
										 |  |  |     [ 2 + <alien> utf8 alien>string ] bi ;
 | 
					
						
							| 
									
										
										
										
											2013-11-20 16:06:49 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | : name-table-entries ( pcre extra -- addrs )
 | 
					
						
							|  |  |  |     [ name-table ] [ name-entry-size ] [ name-count ] 2tri
 | 
					
						
							| 
									
										
										
										
											2013-11-20 19:54:56 -05:00
										 |  |  |     iota [ * + name-table-entry 2array ] 2with map ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : options ( pcre -- opts )
 | 
					
						
							|  |  |  |     f PCRE_INFO_OPTIONS pcre-fullinfo ;
 | 
					
						
							| 
									
										
										
										
											2013-11-20 16:06:49 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-11-20 16:29:39 -05:00
										 |  |  | CONSTANT: default-opts flags{ PCRE_UTF8 PCRE_UCP } | 
					
						
							| 
									
										
										
										
											2013-09-18 16:45:17 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-09-19 17:41:11 -04:00
										 |  |  | : (pcre) ( expr -- pcre err-message err-offset )
 | 
					
						
							| 
									
										
										
										
											2013-09-21 20:52:39 -04:00
										 |  |  |     default-opts { c-string int } [ f pcre_compile ] with-out-parameters ;
 | 
					
						
							| 
									
										
										
										
											2013-09-18 16:45:17 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-09-19 17:41:11 -04:00
										 |  |  | : <pcre> ( expr -- pcre )
 | 
					
						
							| 
									
										
										
										
											2015-08-13 19:13:05 -04:00
										 |  |  |     dup (pcre) 2array swap [ 2nip ] [ malformed-regexp ] if* ;
 | 
					
						
							| 
									
										
										
										
											2013-09-19 17:41:11 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : <pcre-extra> ( pcre -- pcre-extra )
 | 
					
						
							|  |  |  |     0 { c-string } [ pcre_study ] with-out-parameters drop ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-11-20 16:29:39 -05:00
										 |  |  | : exec ( pcre extra subject ofs opts -- count match-data )
 | 
					
						
							|  |  |  |     [ dup length ] 2dip 30 int <c-array> 30 [ pcre_exec ] 2keep drop ;
 | 
					
						
							| 
									
										
										
										
											2013-09-21 20:52:39 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-11-20 19:54:56 -05:00
										 |  |  | TUPLE: matcher pcre extra subject ofs exec-opts ;
 | 
					
						
							| 
									
										
										
										
											2013-09-22 15:48:16 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : <matcher> ( subject compiled-pcre -- matcher )
 | 
					
						
							| 
									
										
										
										
											2013-11-20 19:54:56 -05:00
										 |  |  |     [ utf8 encode ] dip [ pcre>> ] [ extra>> ] bi rot 0 0 matcher boa ;
 | 
					
						
							| 
									
										
										
										
											2013-09-22 15:48:16 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-11-20 16:29:39 -05:00
										 |  |  | CONSTANT: empty-match-opts flags{ PCRE_NOTEMPTY_ATSTART PCRE_ANCHORED } | 
					
						
							| 
									
										
										
										
											2013-09-21 20:52:39 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-11-20 19:54:56 -05:00
										 |  |  | : findnext ( matcher -- matcher match/f )
 | 
					
						
							|  |  |  |     dup { | 
					
						
							|  |  |  |         [ pcre>> ] | 
					
						
							|  |  |  |         [ extra>> ] | 
					
						
							|  |  |  |         [ subject>> ] | 
					
						
							|  |  |  |         [ ofs>> ] | 
					
						
							|  |  |  |         [ exec-opts>> ] | 
					
						
							|  |  |  |     } cleave exec over dup -1 < [ | 
					
						
							| 
									
										
										
										
											2015-08-13 19:13:05 -04:00
										 |  |  |         PCRE_ERRORS number>enum pcre-error | 
					
						
							| 
									
										
										
										
											2013-11-20 19:54:56 -05:00
										 |  |  |     ] [ | 
					
						
							|  |  |  |         -1 = [ | 
					
						
							|  |  |  |             2drop dup exec-opts>> 0 =
 | 
					
						
							|  |  |  |             [ f ] | 
					
						
							|  |  |  |             [ | 
					
						
							|  |  |  |                 dup [ subject>> ] [ ofs>> ] bi next-utf8-char | 
					
						
							|  |  |  |                 [ >>ofs 0 >>exec-opts findnext ] [ f ] if*
 | 
					
						
							|  |  |  |             ] if
 | 
					
						
							|  |  |  |         ] [ | 
					
						
							|  |  |  |             [ | 
					
						
							|  |  |  |                 nip
 | 
					
						
							|  |  |  |                 [ first2 = [ empty-match-opts ] [ 0 ] if >>exec-opts ] | 
					
						
							|  |  |  |                 [ second >>ofs ] bi
 | 
					
						
							|  |  |  |             ] [ | 
					
						
							|  |  |  |                 2array
 | 
					
						
							|  |  |  |             ] 2bi
 | 
					
						
							| 
									
										
										
										
											2013-09-22 15:48:16 -04:00
										 |  |  |         ] if
 | 
					
						
							| 
									
										
										
										
											2013-09-23 08:51:36 -04:00
										 |  |  |     ] if ;
 | 
					
						
							| 
									
										
										
										
											2013-09-19 17:41:11 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-09-22 15:48:16 -04:00
										 |  |  | : parse-match ( subject nametable match-data -- match )
 | 
					
						
							| 
									
										
										
										
											2013-11-20 19:54:56 -05:00
										 |  |  |     swapd first2 swap [ pcre-substring-list ] keep void* <c-direct-array> | 
					
						
							| 
									
										
										
										
											2013-10-25 11:40:37 -04:00
										 |  |  |     [ utf8 alien>string ] { } map-as [ of swap 2array ] with map-index ;
 | 
					
						
							| 
									
										
										
										
											2013-09-21 20:52:39 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-11-20 16:29:39 -05:00
										 |  |  | PRIVATE>
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | TUPLE: compiled-pcre pcre extra nametable ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-09-19 17:41:11 -04:00
										 |  |  | : <compiled-pcre> ( expr -- compiled-pcre )
 | 
					
						
							| 
									
										
										
										
											2013-09-21 20:52:39 -04:00
										 |  |  |     <pcre> dup <pcre-extra> 2dup name-table-entries compiled-pcre boa ;
 | 
					
						
							| 
									
										
										
										
											2013-09-19 17:41:11 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-09-22 15:48:16 -04:00
										 |  |  | : has-option? ( compiled-pcre option -- ? )
 | 
					
						
							|  |  |  |     [ pcre>> options ] dip bitand 0 > ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-09-21 20:52:39 -04:00
										 |  |  | GENERIC: findall ( subject obj -- matches )
 | 
					
						
							| 
									
										
										
										
											2013-09-18 16:45:17 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-09-21 20:52:39 -04:00
										 |  |  | M: compiled-pcre findall | 
					
						
							| 
									
										
										
										
											2013-11-20 19:54:56 -05:00
										 |  |  |     [ <matcher> [ findnext dup ] [ ] produce 2nip ] | 
					
						
							| 
									
										
										
										
											2013-11-20 16:06:49 -05:00
										 |  |  |     [ nametable>> rot [ parse-match ] 2with { } map-as ] 2bi ;
 | 
					
						
							| 
									
										
										
										
											2013-09-18 16:45:17 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-09-21 20:52:39 -04:00
										 |  |  | M: string findall | 
					
						
							|  |  |  |     <compiled-pcre> findall ;
 | 
					
						
							| 
									
										
										
										
											2013-09-18 16:45:17 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-11-04 07:26:37 -05:00
										 |  |  | M: regexp:regexp findall | 
					
						
							|  |  |  |     raw>> findall ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-09-22 15:48:16 -04:00
										 |  |  | : matches? ( subject obj -- ? )
 | 
					
						
							| 
									
										
										
										
											2013-09-21 20:52:39 -04:00
										 |  |  |     dupd findall [ nip length 1 = ] [ ?first ?first ?last = ] 2bi and ;
 | 
					
						
							| 
									
										
										
										
											2013-09-19 17:41:11 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-09-22 15:48:16 -04:00
										 |  |  | : split ( subject obj -- strings )
 | 
					
						
							| 
									
										
										
										
											2013-09-23 10:10:11 -04:00
										 |  |  |     dupd findall [ first second ] map split-subseqs ;
 |