factor/basis/xml/tokenize/tokenize.factor

182 lines
4.9 KiB
Factor

! Copyright (C) 2005, 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: namespaces xml.state kernel sequences accessors
xml.char-classes xml.errors math io sbufs fry strings ascii
xml.entities assocs splitting math.parser
locals combinators combinators.short-circuit arrays hints ;
IN: xml.tokenize
! * Basic utility words
! Raise disallowed-char for ch when text? rejects it for the spot's
! version-1.0? flag and the spot's check flag is set.  A ch of f is ignored.
: assure-good-char ( spot ch -- )
    [
        ! Test the character first; check>> is only consulted for a
        ! character that failed text?.
        2dup [ version-1.0?>> ] dip text? not
        [ over check>> ] [ f ] if
        [
            ! Bump the column before the error is raised.
            [ [ 1 + ] change-column drop ] dip
            disallowed-char
        ] [ 2drop ] if
    ] [ drop ] if* ;

HINTS: assure-good-char { spot fixnum } ;
! Update the spot's line/column counters for char.  A newline bumps the
! line and sets the column to -1; anything else bumps the column.  Nothing
! is recorded when the spot's current char is f.
: record ( spot char -- spot )
    over char>> [
        CHAR: \n eq?
        [ [ 1 + ] change-line -1 >>column ]
        [ [ 1 + ] change-column ] if
    ] [ drop ] if ;

HINTS: record { spot fixnum } ;
! Shift the spot forward by one character: the old lookahead becomes the
! current char and a fresh lookahead is read from the spot's stream.
! A lookahead of \r is delivered as \n, and a \n directly following it is
! swallowed.  Leaves the spot and its new lookahead on the stack.
:: (next) ( spot -- spot char )
    spot next>> :> pending
    spot stream>> stream-read1 :> fetched
    pending CHAR: \r eq? [
        spot CHAR: \n >>char
        ! For \r\n the \n just read is discarded and replaced.
        fetched CHAR: \n eq?
        [ spot stream>> stream-read1 ] [ fetched ] if
        >>next
    ] [
        spot pending >>char fetched >>next
    ] if
    dup next>> ; inline
! Advance spot by one character.  Calls unexpected-end when the spot has
! no current char; otherwise the new lookahead is passed to record and
! then to assure-good-char.
: next* ( spot -- )
    dup char>> [ unexpected-end ] unless
    (next)
    ! ( spot ahead ) -> record -> ( spot ahead )
    swap over record swap
    assure-good-char ;

HINTS: next* { spot } ;
! Advance the spot held in the spot variable by one character.
: next ( -- ) spot get next* ;
! Create a fresh spot bound to the current input-stream, prime its
! lookahead with one character, store it in the spot variable and advance
! it once.
: init-parser ( -- )
    0 1 0 0 f t f <spot>
    input-stream get >>stream
    read1 >>next
    ! Same object is both stored and advanced.
    [ spot set ] [ next* ] bi ;
! Run quot with stream as the input stream, after initialising the parser
! state for it.
: with-state ( stream quot -- )
    ! with-input-stream implicitly creates a new scope which we use
    '[ init-parser @ ] with-input-stream ; inline
! Apply quot to the spot's current char, advancing the spot, until quot
! answers true or the spot has no current char left.
:: (skip-until) ( ... quot: ( ... char -- ... ? ) spot -- ... )
    spot char>> [
        quot call
        [ ] [ spot next* quot spot (skip-until) ] if
    ] when* ; inline recursive
! (skip-until) applied to the spot held in the spot variable.
: skip-until ( ... quot: ( ... char -- ... ? ) -- ... )
    spot get
    (skip-until) ; inline
! Collect characters from the current position until quot answers true
! (or input runs out) and return them as a string.  The character that
! satisfied quot is not collected and stays the current character.
:: take-until ( ... quot: ( ... char -- ... ? ) -- ... string )
    10 <sbuf> :> accum
    [| ch |
        ch quot call
        ! Keep the answer for skip-until; collect ch only on a false answer.
        dup [ ch accum push ] unless
    ] skip-until
    accum "" like ; inline
! Take characters up to, but not including, the first one that is a
! member of seq.
: take-to ( seq -- string )
    [ member? ] curry take-until ; inline
! Advance past any whitespace, including newlines, stopping at the first
! character for which blank? is false.
: pass-blank ( -- )
    [ blank? not ] spot get (skip-until) ;
! Largest j <= k such that the first j elements of str are also the last
! j elements of seen; 0 when there is no such j.
:: (resume-match) ( k seen str -- pos' )
    k 0 > [
        seen k tail* str k head sequence=
        [ k ] [ k 1 - seen str (resume-match) ] if
    ] [ 0 ] if ;

! One step of an incremental search for str.  pos is how many leading
! elements of str have been matched so far (pos must be less than the
! length of str); ch is the next input character.  Returns the new count.
!
! On a mismatch the count is not blindly reset to 0: the characters seen
! so far may still end with a shorter prefix of str.  Without this,
! "]]]>" would never be recognised as containing "]]>".
:: next-matching ( pos ch str -- pos' )
    pos str nth ch eq? [ pos 1 + ] [
        pos 0 >
        [ pos str pos head ch suffix str (resume-match) ]
        [ 0 ] if
    ] if ; inline
! Build a take-until predicate that threads a match position through
! next-matching and answers true once all of str has been matched.
: string-matcher ( str -- quot: ( pos char -- pos ? ) )
    dup length '[ _ next-matching dup _ >= ] ; inline
! Take everything up to the first occurrence of match, consume match, and
! return the text before it.  Calls missing-close when input ends before
! match is found.
: take-string ( match -- string )
    [ 0 swap string-matcher take-until nip ] keep
    ! End of input must be reported before trimming: with fewer characters
    ! taken than the match prefix is long, the head count below would be
    ! negative and fail with a slice error instead of missing-close.
    get-char [ missing-close ] unless
    ! take-until stopped on the last character of match, so the taken
    ! text ends with all of match but that character; cut it off.
    dupd [ length ] bi@ 1 - - head
    next ;
! Read as many characters as string is long and call expected with
! ( string actual ) when they differ from string.
: expect ( string -- )
    dup length spot get
    '[ _ [ char>> ] [ next* ] bi ] "" replicate-as
    2dup = not [ expected ] [ 2drop ] if ;
! * XML-specific words: entities, character data and quoted values
! Look string up in entities and push the single value found onto accum;
! failing that, look it up in extra-entities and append the whole value.
! Calls no-entity with the name when neither table has it.
: parse-named-entity ( accum string -- )
    dup entities at [ nip swap push ] [
        dup extra-entities get at
        [ nip swap push-all ] [ no-entity ] if*
    ] if* ;
! Skip the current character, take everything up to the next ";" and
! step past that ";".
: take-; ( -- string )
    next
    [ ";" member? ] take-until
    next ;
! Parse the reference at the current position (up to and including its
! ";") and push its expansion onto accum.  A name starting with "#" is a
! numeric character reference, hexadecimal when followed by "x";
! everything else goes through parse-named-entity.
: parse-entity ( accum -- )
    take-; dup "#" ?head [
        "x" ?head 16 10 ? base>
        ! base> answers f for malformed digits and may answer a
        ! non-integer; neither can be stored as a character, so such a
        ! reference is reported via no-entity with its full text.
        dup integer? [ dup 0 >= ] [ f ] if
        [ nip swap push ] [ drop no-entity ] if
    ] [ nip parse-named-entity ] if ;
! Parse the reference at the current position, look its name up in
! pe-table and append the value found to accum.  Calls no-entity with
! the name when the table has no entry.
: parse-pe ( accum -- )
    take-; dup pe-table get at
    [ nip swap push-all ] [ no-entity ] if* ;
! Collect character data into accum until quot answers true for the
! current character (which is then stepped over) or input runs out.
! "&" is expanded via parse-entity; "%" is expanded via parse-pe, but
! only while in-dtd? is set.
:: (parse-char) ( quot: ( ch -- ? ) accum spot -- )
    spot char>> :> ch
    ch [
        ch quot call [ spot next* ] [
            ! Consume one unit of input, then continue with what follows.
            {
                { [ ch CHAR: & eq? ] [ accum parse-entity ] }
                { [ ch CHAR: % eq? [ in-dtd? get ] [ f ] if ] [
                    accum parse-pe
                ] }
                [ ch accum push spot next* ]
            } cond
            quot accum spot (parse-char)
        ] if
    ] when ; inline recursive
! Run (parse-char) on the spot variable with a fresh accumulator and
! return what was collected as a string.
:: parse-char ( quot: ( ch -- ? ) -- seq )
    512 <sbuf> :> accum
    quot accum spot get (parse-char)
    accum "" like ; inline
! Advance the "]]>" match position by char and call text-w/]]> once all
! three characters have been matched.
: assure-no-]]> ( pos char -- pos' )
    "]]>" next-matching
    dup 3 >= [ text-w/]]> ] when ; inline
! Read character data up to the next "<".  Every character is run through
! assure-no-]]>.  When depth is zero, any character that is neither blank
! nor "<" is handed to pre/post-content.
:: parse-text ( -- string )
    depth get zero? :> top-level?
    0 :> matched!
    [| ch |
        matched ch assure-no-]]> matched!
        top-level? [
            ch blank? not ch CHAR: < eq? not and
            [ ch 1string t pre/post-content ] when
        ] when
        ch CHAR: < eq?
    ] parse-char ;
! Skip whitespace, then require a ">".
: close ( -- ) pass-blank ">" expect ;
! Replace every tab, carriage return and newline in str with a space,
! in place.
: normalize-quote ( str -- str )
    [ dup "\t\r\n" member? swap CHAR: \s swap ? ] map! ;
! Read a quoted value up to and including the closing quote character ch
! and return it with normalize-quote applied.  When <-disallowed? is
! true, a "<" inside the value calls attr-w/<.  Calls unclosed-quote when
! input has run out afterwards.
:: (parse-quote) ( <-disallowed? ch -- string )
    [| c |
        c ch eq? [ t ] [
            c CHAR: < eq? <-disallowed? and
            [ attr-w/< ] [ f ] if
        ] if
    ] parse-char normalize-quote
    get-char [ unclosed-quote ] unless ; inline
! Skip whitespace and read a value delimited by ' or ".  Calls
! quoteless-attr when the current character is neither quote.
: parse-quote* ( <-disallowed? -- seq )
    pass-blank get-char
    dup "'\"" member? not
    [ quoteless-attr ] [ next (parse-quote) ] if ; inline
! parse-quote* with "<" permitted inside the quoted value.
: parse-quote ( -- seq ) f parse-quote* ;