factor/basis/xml/tokenize/tokenize.factor

145 lines
3.9 KiB
Factor
Raw Normal View History

2009-01-21 19:16:51 -05:00
! Copyright (C) 2005, 2009 Daniel Ehrenberg
2007-09-20 18:09:08 -04:00
! See http://factorcode.org/license.txt for BSD license.
2009-01-21 19:16:51 -05:00
USING: namespaces xml.state kernel sequences accessors
xml.char-classes xml.errors math io sbufs fry strings ascii
circular xml.entities assocs make splitting math.parser
locals combinators arrays ;
2007-09-20 18:09:08 -04:00
IN: xml.tokenize
2009-01-21 00:54:33 -05:00
! True when the document being parsed is XML 1.0. When no
! prolog-data has been set, 1.0 is assumed and t is returned.
: version=1.0? ( -- ? )
    prolog-data get dup
    [ version>> "1.0" = ] [ drop t ] if ;
! Pass a freshly read character through unchanged, invoking
! disallowed-char when checking is enabled (get-check) and text?
! rejects the character for the active XML version.
! An f input (no character) is returned as f without any check.
: assure-good-char ( ch -- ch )
    dup [
        version=1.0? over text? not get-check and
        [ disallowed-char ] when
    ] [ drop f ] if ;
! * Basic utility words
! Update the line/column bookkeeping for one consumed character:
! a newline bumps the line and resets the column to 0, any other
! character bumps the column.
: record ( char -- )
    CHAR: \n = [
        get-line 1+ set-line
        0 set-column
    ] [
        get-column 1+ set-column
    ] if ;
! (next) shifts the one-character lookahead into the current
! character, normalizing line endings on the way: a pending \r
! followed by \n, and a lone pending \r, both become a single \n.
:: (next) ( -- char )
    get-next :> pending
    read1 :> fresh
    pending CHAR: \r = [
        fresh CHAR: \n = [
            ! \r\n: emit the \n and read a new lookahead.
            fresh read1
        ] [
            ! Lone \r: emit \n, keep what was just read as lookahead.
            CHAR: \n fresh
        ] if
    ] [
        pending fresh
    ] if
    ! Stack here: new-current new-lookahead
    set-next dup set-char assure-good-char ;
: next ( -- )
    ! Advance the spot by one character. If the current character
    ! is already f, unexpected-end is invoked first.
    get-char not [ unexpected-end ] when
    (next) record ;
2009-01-21 19:16:51 -05:00
! Store a fresh spot in the spot variable, then prime it: read1
! fills the lookahead slot and next shifts it into the current
! character.
! NOTE(review): the meaning of the five <spot> arguments is defined
! in xml.state, not here. Presumably the leading 0 is a non-f
! placeholder for the current character so that the first call to
! next does not invoke unexpected-end -- confirm against the tuple.
: init-parser ( -- )
0 1 0 f f <spot> spot set
read1 set-next next ;
: with-state ( stream quot -- )
    ! Run quot with stream as the input stream, after init-parser
    ! has set up the spot. with-input-stream implicitly creates a
    ! new scope, which is the scope the parser state lives in.
    '[ init-parser @ ] with-input-stream ; inline
2009-01-21 00:54:33 -05:00
! Advance with next until quot yields true or the input is
! exhausted (get-char is f). quot is tested before each advance, so
! the character on which it yields true is left as the current one.
:: skip-until ( quot: ( -- ? ) -- )
    get-char [
        quot call [ next quot skip-until ] unless
    ] when ; inline recursive
! Collect characters from the current position into a string until
! quot yields true or the input ends. The character on which quot
! yields true is not collected and remains the current character.
:: take-until ( quot -- string )
    10 <sbuf> :> acc
    [
        quot call
        [ t ] [ get-char acc push f ] if
    ] skip-until
    acc >string ; inline
! Collect characters up to, but not including, the next occurrence
! of ch (or the end of input). ch itself is not consumed.
: take-char ( ch -- string )
    '[ _ get-char = ] take-until ;
: pass-blank ( -- )
#! Advance past any whitespace, including newlines. Stops at the
#! first character for which blank? is false, or at end of input.
[ get-char blank? not ] skip-until ;
! Push the current character into the circular buffer, then report
! whether the buffer's contents now equal string.
:: string-matches? ( string circular -- ? )
    get-char circular push-circular
    string circular sequence= ;
! Read up to and including the terminator `match`, returning the
! text that precedes it. Invokes missing-close if the input ends
! before the terminator is seen.
: take-string ( match -- string )
    ! A circular buffer as long as match tracks the most recent
    ! characters; take-until stops on the terminator's last character.
    dup length <circular-string>
    [ 2dup string-matches? ] take-until nip
    ! Check for end of input BEFORE trimming: with a short tail the
    ! trim length below can go negative, and head would then fail
    ! with a sequence bounds error instead of missing-close.
    get-char [ missing-close ] unless
    ! The collected text ends with all but the last character of
    ! match; strip those length(match)-1 characters.
    dup length rot length 1- - head
    ! Consume the terminator's final character.
    next ;
! Consume as many characters as string has elements and compare
! them with string; on a mismatch, call expected with the wanted
! string and the text actually read.
:: expect ( string -- )
    string [ get-char next ] replicate :> found
    string found = [ string found expected ] unless ;
2009-01-15 23:20:24 -05:00
! Resolve a named entity into the sequence being built by make.
! The entities table is consulted first and its value is added as
! a single element with , ; otherwise the extra-entities variable is
! consulted and its value is appended with % ; if neither has the
! name, no-entity is called with it.
:: parse-named-entity ( string -- )
    string entities at [ , ] [
        string extra-entities get at
        [ % ] [ string no-entity ] if*
    ] if* ;
! Parse an entity reference starting at its introducer character:
! skip the introducer, take the text up to ';', skip the ';'.
! A leading "#" marks a numeric reference (hexadecimal when followed
! by "x", decimal otherwise); anything else is a named entity.
! NOTE(review): the result of base> is added with , unchecked --
! presumably f for malformed digits; confirm how that should be
! reported.
: parse-entity ( -- )
    next CHAR: ; take-char next
    "#" ?head [
        "x" ?head [ 16 ] [ 10 ] if base> ,
    ] [
        parse-named-entity
    ] if ;
2007-09-20 18:09:08 -04:00
2009-01-21 00:54:33 -05:00
! Parse a parameter-entity reference: skip the introducer, take the
! name up to ';', skip the ';', then look the name up in pe-table.
! A hit is appended with % ; a miss calls no-entity with the name.
: parse-pe ( -- )
    next CHAR: ; take-char next
    dup pe-table get at
    [ nip % ] [ no-entity ] if* ;
2009-01-19 23:25:15 -05:00
! Core loop of parse-char. Stops at end of input, or on the first
! character for which quot yields true (that character is consumed
! but not collected). Otherwise the character is expanded as an
! entity ('&'), expanded as a parameter entity ('%' while in-dtd? is
! set), or collected verbatim, and the loop continues.
:: (parse-char) ( quot: ( ch -- ? ) -- )
    get-char :> ch
    ch [
        ch quot call [ next ] [
            {
                { [ ch CHAR: & = ] [ parse-entity ] }
                { [ in-dtd? get ch CHAR: % = and ] [ parse-pe ] }
                [ ch , next ]
            } cond
            quot (parse-char)
        ] if
    ] when ; inline recursive
! Run (parse-char) with quot inside make, returning everything it
! collected as a string.
: parse-char ( quot: ( ch -- ? ) -- seq )
    '[ _ (parse-char) ] "" make ; inline
2007-09-20 18:09:08 -04:00
2009-01-19 23:25:15 -05:00
! Invoke text-w/]]> when the circular buffer's contents equal the
! three characters "]]>".
: assure-no-]]> ( circular -- )
"]]>" sequence= [ text-w/]]> ] when ;
2007-09-20 18:09:08 -04:00
2009-01-20 16:37:21 -05:00
! Parse character data up to the next '<', which is consumed but
! not included in the result. Entities are expanded by parse-char.
:: parse-text ( -- string )
    ! Sliding window over the last three characters, checked for
    ! "]]>" after every character.
    3 f <array> <circular> :> window
    ! NOTE(review): depth presumably holds the element nesting
    ! depth, so zero would mean "outside the root element" --
    ! confirm where the variable is maintained.
    depth get zero? :> outside?
    [| ch |
        ch window push-circular
        window assure-no-]]>
        outside? [
            ! While depth is zero only blanks and '<' are accepted;
            ! anything else is handed to pre/post-content.
            ch blank? ch CHAR: < = or
            [ ch 1string t pre/post-content ] unless
        ] when
        ch CHAR: < =
    ] parse-char ;
2009-01-21 19:16:51 -05:00
! Skip any whitespace, then require the literal ">" via expect.
: close ( -- )
pass-blank ">" expect ;
2007-09-20 18:09:08 -04:00
2009-01-21 00:54:33 -05:00
! Replace every tab, carriage return and newline in str with a
! space; all other characters are kept unchanged.
: normalize-quote ( str -- str )
    ! Stack inside the quotation: ch ? -> ? space ch -> space-or-ch
    [ dup "\t\r\n" member? CHAR: \s rot ? ] map ;
2009-01-19 23:25:15 -05:00
! Parse a quoted value whose opening quote has already been
! consumed. ch is the quote character that ends the value; it is
! consumed but not included. When <-disallowed? is true, a '<'
! inside the value invokes attr-w/< . The result is passed through
! normalize-quote. If the input ended before the closing quote,
! unclosed-quote is invoked.
:: (parse-quote) ( <-disallowed? ch -- string )
    [| c |
        c ch = [ t ] [
            c CHAR: < = <-disallowed? and
            [ attr-w/< ] [ f ] if
        ] if
    ] parse-char normalize-quote
    get-char [ unclosed-quote ] unless ; inline
2009-01-15 23:20:24 -05:00
2009-01-19 23:25:15 -05:00
! Skip whitespace, then parse a value delimited by ' or " . The
! opening quote character is also the one that closes the value.
! If the current character is neither quote, quoteless-attr is
! invoked.
: parse-quote* ( <-disallowed? -- seq )
    pass-blank get-char
    dup [ CHAR: ' = ] [ CHAR: " = ] bi or [
        next (parse-quote)
    ] [
        quoteless-attr
    ] if ; inline
! Parse a quoted value with the '<' check turned off: f is passed
! as the <-disallowed? flag of parse-quote* .
: parse-quote ( -- seq )
f parse-quote* ;