2008-10-01 19:17:31 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								! Copyright (C) 2008 Doug Coleman.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								! See http://factorcode.org/license.txt for BSD license.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								USING: accessors fry html.parser html.parser.analyzer
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								http.client kernel tools.time sets assocs sequences
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								concurrency.combinators io threads namespaces math multiline
							 | 
						
					
						
							
								
									
										
										
										
											2009-03-31 19:21:15 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								math.parser inspector urls logging combinators.short-circuit
							 | 
						
					
						
							
								
									
										
										
										
											2009-04-04 12:57:23 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								continuations calendar prettyprint dlists deques locals
							 | 
						
					
						
							
								
									
										
										
										
											2009-04-06 19:47:46 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								spider.unique-deque combinators concurrency.semaphores ;
							 | 
						
					
						
							
								
									
										
										
										
											2008-10-01 19:17:31 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								IN: spider
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2010-08-29 14:21:50 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								TUPLE: spider
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    base
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    { count integer initial: 0 }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    { max-count number initial: 1/0. }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    sleep
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    { max-depth integer initial: 0 }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    initial-links
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    filters
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    spidered
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    todo
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    nonmatching
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    quiet?
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    currently-spidering
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    { #threads integer initial: 1 }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    semaphore
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    follow-robots?
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    robots ;
							 | 
						
					
						
							
								
									
										
										
										
											2008-10-01 19:17:31 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2009-04-03 22:16:08 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								TUPLE: spider-result url depth headers
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								fetched-in parsed-html links processed-in fetched-at ;
							 | 
						
					
						
							
								
									
										
										
										
											2008-10-01 19:17:31 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								: <spider> ( base -- spider )
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    >url
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    spider new
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        over >>base
							 | 
						
					
						
							
								
									
										
										
										
											2009-04-01 12:08:19 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        over >>currently-spidering
							 | 
						
					
						
							
								
									
										
										
										
											2009-03-31 19:23:02 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        swap 0 <unique-deque> [ push-url ] keep >>todo
							 | 
						
					
						
							
								
									
										
										
										
											2009-03-31 19:21:15 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        <unique-deque> >>nonmatching
							 | 
						
					
						
							
								
									
										
										
										
											2009-04-03 22:16:08 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        H{ } clone >>spidered
							 | 
						
					
						
							
								
									
										
										
										
											2010-08-29 14:21:50 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        1 <semaphore> >>semaphore ;
							 | 
						
					
						
							
								
									
										
										
										
											2009-04-06 19:47:46 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								: <spider-result> ( url depth -- spider-result )
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    spider-result new
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        swap >>depth
							 | 
						
					
						
							
								
									
										
										
										
											2010-08-18 12:34:36 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        swap >>url ; inline
							 | 
						
					
						
							
								
									
										
										
										
											2008-10-01 19:17:31 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								<PRIVATE
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								: apply-filters ( links spider -- links' )
							 | 
						
					
						
							
								
									
										
										
										
											2010-08-18 12:34:36 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    filters>> [
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        '[ [ _ 1&& ] filter ] call( seq -- seq' )
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    ] when* ;
							 | 
						
					
						
							
								
									
										
										
										
											2008-10-01 19:17:31 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2009-03-31 19:21:15 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								: push-links ( links level unique-deque -- )
							 | 
						
					
						
							
								
									
										
										
										
											2009-03-31 19:23:02 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    '[ _ _ push-url ] each ;
							 | 
						
					
						
							
								
									
										
										
										
											2008-10-01 22:25:37 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2008-10-01 19:17:31 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								: add-todo ( links level spider -- )
							 | 
						
					
						
							
								
									
										
										
										
											2008-10-01 22:25:37 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    todo>> push-links ;
							 | 
						
					
						
							
								
									
										
										
										
											2008-10-01 19:17:31 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								: add-nonmatching ( links level spider -- )
							 | 
						
					
						
							
								
									
										
										
										
											2008-10-01 22:25:37 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    nonmatching>> push-links ;
							 | 
						
					
						
							
								
									
										
										
										
											2008-10-01 19:17:31 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2009-03-31 19:21:15 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								: filter-base-links ( spider spider-result -- base-links nonmatching-links )
							 | 
						
					
						
							
								
									
										
										
										
											2010-02-27 14:52:24 -05:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    [ base>> host>> ] [ links>> members ] bi*
							 | 
						
					
						
							
								
									
										
										
										
											2008-10-01 19:17:31 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    [ host>> = ] with partition ;
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2010-08-18 12:34:36 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								:: add-spidered ( spider spider-result -- )
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    spider [ 1 + ] change-count drop
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    spider-result dup url>>
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    spider spidered>> set-at
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    spider spider-result filter-base-links :> ( matching nonmatching )
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    spider-result depth>> 1 + :> depth
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    nonmatching depth spider add-nonmatching
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    matching spider apply-filters depth spider add-todo ;
							 | 
						
					
						
							
								
									
										
										
										
											2008-10-01 19:17:31 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2009-04-03 22:16:08 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								: normalize-hrefs ( base links -- links' )
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    [ derive-url ] with map ;
							 | 
						
					
						
							
								
									
										
										
										
											2008-10-01 19:17:31 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2009-04-06 19:47:46 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								: print-spidering ( spider-result -- )
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    [ url>> ] [ depth>> ] bi
							 | 
						
					
						
							
								
									
										
										
										
											2008-10-01 22:25:37 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    "depth: " write number>string write
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    ", spidering: " write . yield ;
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2009-04-06 19:47:46 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								:: fill-spidered-result ( spider spider-result -- )
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    f spider-result url>> spider spidered>> set-at
							 | 
						
					
						
							
								
									
										
										
										
											2009-10-28 17:11:33 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    [ spider-result url>> http-get ] benchmark :> ( headers html fetched-in )
							 | 
						
					
						
							
								
									
										
										
										
											2009-03-31 19:21:15 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    [
							 | 
						
					
						
							
								
									
										
										
										
											2009-04-03 22:16:08 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        html parse-html
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        spider currently-spidering>>
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        over find-all-links normalize-hrefs
							 | 
						
					
						
							
								
									
										
										
										
											2009-10-28 17:11:33 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    ] benchmark :> ( parsed-html links processed-in )
							 | 
						
					
						
							
								
									
										
										
										
											2009-04-06 19:47:46 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    spider-result
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        headers >>headers
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        fetched-in >>fetched-in
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        parsed-html >>parsed-html
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        links >>links
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        processed-in >>processed-in
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        now >>fetched-at drop ;
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								:: spider-page ( spider spider-result -- )
							 | 
						
					
						
							
								
									
										
										
										
											2010-08-29 14:21:50 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    spider quiet?>> [ spider-result print-spidering ] unless
							 | 
						
					
						
							
								
									
										
										
										
											2009-04-06 19:47:46 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    spider spider-result fill-spidered-result
							 | 
						
					
						
							
								
									
										
										
										
											2010-08-29 14:21:50 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    spider quiet?>> [ spider-result describe ] unless
							 | 
						
					
						
							
								
									
										
										
										
											2009-04-06 19:47:46 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    spider spider-result add-spidered ;
							 | 
						
					
						
							
								
									
										
										
										
											2008-10-01 19:17:31 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\ spider-page ERROR add-error-logging
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2009-04-03 22:16:08 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								: spider-sleep ( spider -- ) sleep>> [ sleep ] when* ;
							 | 
						
					
						
							
								
									
										
										
										
											2009-03-31 19:21:15 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2010-08-18 12:34:36 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								: queue-initial-links ( spider -- spider )
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    [ [ currently-spidering>> ] [ initial-links>> ] bi normalize-hrefs 0 ]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    [ add-todo ]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    [ ] tri ;
							 | 
						
					
						
							
								
									
										
										
										
											2008-10-01 19:17:31 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2009-03-31 19:21:15 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								: spider-page? ( spider -- ? )
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        [ todo>> deque>> deque-empty? not ]
							 | 
						
					
						
							
								
									
										
										
										
											2010-08-29 14:21:50 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        [ [ todo>> peek-url depth>> ] [ max-depth>> ] bi <= ]
							 | 
						
					
						
							
								
									
										
										
										
											2009-04-01 00:04:59 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        [ [ count>> ] [ max-count>> ] bi < ]
							 | 
						
					
						
							
								
									
										
										
										
											2009-03-31 19:21:15 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    } 1&& ;
							 | 
						
					
						
							
								
									
										
										
										
											2008-10-01 20:29:18 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2009-04-06 19:47:46 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								: setup-next-url ( spider -- spider spider-result )
							 | 
						
					
						
							
								
									
										
										
										
											2009-04-03 22:16:08 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    dup todo>> peek-url url>> >>currently-spidering
							 | 
						
					
						
							
								
									
										
										
										
											2009-04-06 19:47:46 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    dup todo>> pop-url [ url>> ] [ depth>> ] bi <spider-result> ;
							 | 
						
					
						
							
								
									
										
										
										
											2009-03-31 19:21:15 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								: spider-next-page ( spider -- )
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    setup-next-url spider-page ;
							 | 
						
					
						
							
								
									
										
										
										
											2008-10-01 21:54:58 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2008-10-01 19:17:31 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								PRIVATE>
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2009-03-31 19:21:15 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								: run-spider-loop ( spider -- )
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    dup spider-page? [
							 | 
						
					
						
							
								
									
										
										
										
											2009-04-01 12:08:19 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        [ spider-next-page ] [ spider-sleep ] [ run-spider-loop ] tri
							 | 
						
					
						
							
								
									
										
										
										
											2009-03-31 19:21:15 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    ] [
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        drop
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    ] if ;
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2008-10-01 19:17:31 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								: run-spider ( spider -- spider )
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    "spider" [
							 | 
						
					
						
							
								
									
										
										
										
											2010-08-18 12:34:36 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        queue-initial-links
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        [ run-spider-loop ] keep
							 | 
						
					
						
							
								
									
										
										
										
											2008-10-01 19:17:31 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    ] with-logging ;
							 |