factor/basis/xml/tokenize/tokenize.factor

! Copyright (C) 2005, 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: namespaces xml.state kernel sequences accessors
xml.char-classes xml.errors math io sbufs fry strings ascii
circular xml.entities assocs make splitting math.parser
locals combinators arrays ;
IN: xml.tokenize

! True when the stored prolog data declares XML version "1.0", and also
! when no prolog data is stored (prolog-data holds f).
: version=1.0? ( -- ? )
    prolog-data get dup
    [ version>> "1.0" = ] [ drop t ] if ;

! Validates a character that was just read. For a non-f ch, calls
! disallowed-char when get-check is true and text? (given the
! version=1.0? flag) rejects the character; otherwise ch is returned
! unchanged. An f input (no character) yields f.
: assure-good-char ( ch -- ch )
    [
        version=1.0? over text? not get-check and
        [ disallowed-char ] when
    ] [ f ] if* ;

! * Basic utility words

! Updates the position counters for a character that was just read:
! a newline increments the line number and resets the column to 0,
! anything else increments the column.
: record ( char -- )
    CHAR: \n = [
        get-line 1+ set-line 0
    ] [
        get-column 1+
    ] if set-column ;

! (next) normalizes \r\n and \r to \n.
! Shifts the one-character lookahead: the value held by get-next becomes
! the current character (stored with set-char) and a freshly read
! character becomes the new lookahead (stored with set-next). The
! returned character has been passed through assure-good-char.
: (next) ( -- char )
    get-next read1
    ! Stack here is ( lookahead fresh ); test whether lookahead is \r.
    2dup swap CHAR: \r = [
        CHAR: \n =
        ! \r\n : the fresh \n becomes current and a new lookahead is read.
        ! lone \r: \n is substituted as current; fresh stays the lookahead.
        [ nip read1 ] [ nip CHAR: \n swap ] if
    ] [ drop ] if
    set-next dup set-char assure-good-char ;

! Advances the parser by one character and updates line/column tracking.
! Calls unexpected-end when there is no current character to move past.
: next ( -- )
    get-char not [ unexpected-end ] when
    (next) record ;

! Stores a fresh <spot> in the spot variable, primes the lookahead with
! the first character read from the input stream, then calls next.
! NOTE(review): the meaning of the five <spot> arguments (0 1 0 f f) is
! defined outside this file -- confirm field order before changing them.
: init-parser ( -- )
    0 1 0 f f <spot> spot set
    read1 set-next next ;

! Runs quot with stream as the input stream, after initializing the
! parser state via init-parser.
: with-state ( stream quot -- )
    ! with-input-stream implicitly creates a new scope which we use
    '[ init-parser @ ] with-input-stream ; inline

! Advances through the input, calling quot before each step, until quot
! returns true or there is no current character left. The character at
! which quot answers true is not consumed.
: skip-until ( quot: ( -- ? ) -- )
    get-char [
        ! Run quot but keep it on the stack for the recursive call.
        [ call ] keep swap [ drop ] [
            next skip-until
        ] if
    ] [ drop ] if ; inline recursive

: take-until ( quot -- string )
    #! Collect characters starting at the current position until
    #! the quotation given is true or the input ends, and return
    #! them as a string. The character at which the quotation
    #! answers true is not collected and stays the current one.
    10 <sbuf> [
        ! Each step: run quot; if it is false, append the current
        ! character to the sbuf and keep going.
        '[ @ [ t ] [ get-char _ push f ] if ] skip-until
    ] keep >string ; inline

! Reads characters up to, but not including, the next occurrence of ch
! (or to the end of input). The parser is left positioned on ch.
: take-char ( ch -- string )
    '[ _ get-char = ] take-until ;

: pass-blank ( -- )
    #! Advance past any whitespace, including newlines; stops at
    #! the first character for which blank? is false, or at the
    #! end of input.
    [ get-char blank? not ] skip-until ;

! Pushes the current character into the circular buffer, then reports
! whether the buffer's contents now equal string.
: string-matches? ( string circular -- ? )
    [ get-char swap push-circular ] keep sequence= ;

! Reads characters up to the next occurrence of the delimiter string
! match, returns the text that precedes it, and leaves the parser
! positioned just after the delimiter. Calls missing-close if the input
! ends before the delimiter is found.
: take-string ( match -- string )
    dup length <circular-string>
    [ 2dup string-matches? ] take-until nip
    ! Check for end of input BEFORE trimming: on truncated input the
    ! collected text can be shorter than the delimiter prefix, and the
    ! head below would then fail with a slice error rather than the
    ! intended missing-close.
    get-char [ missing-close ] unless
    ! The collected text ends with all but the last character of match
    ! (the last one is still the current character); cut those off.
    dup length rot length 1- - head
    next ;

! Consumes as many characters as string has and compares them with
! string; on a mismatch calls expected with the wanted string and the
! text actually read.
: expect ( string -- )
    dup [ get-char next ] replicate 2dup =
    [ 2drop ] [ expected ] if ;

! Resolves a named entity. A hit in the entities table is appended to the
! sequence under construction with , (as one element); failing that, a
! hit in the extra-entities variable is appended with % (as a sequence).
! Calls no-entity with the name when neither lookup succeeds.
: parse-named-entity ( string -- )
    dup entities at [ , ] [
        dup extra-entities get at
        [ % ] [ no-entity ] ?if
    ] ?if ;

! Parses an entity reference; (parse-char) calls this while the current
! character is "&". Skips that character, reads the text up to ";" and
! steps past the ";". A leading "#" marks a numeric character reference
! (base 16 when followed by "x", base 10 otherwise); any other name goes
! to parse-named-entity.
! NOTE(review): the base> result is appended unchecked; presumably it is
! f for malformed digits -- confirm and consider reporting an error.
: parse-entity ( -- )
    next CHAR: ; take-char next
    "#" ?head [
        "x" ?head 16 10 ? base> ,
    ] [ parse-named-entity ] if ;

! Parses a parameter-entity reference: skips the current character,
! reads the name up to ";", steps past the ";", and appends the value
! found in the pe-table variable. Calls no-entity with the name if the
! lookup fails.
: parse-pe ( -- )
    next CHAR: ; take-char next
    dup pe-table get at [ % ] [ no-entity ] ?if ;

! Character loop behind parse-char. For each current character, in order:
!   - no character left: stop;
!   - quot answers true: consume that character and stop;
!   - "&": expand an entity reference and continue;
!   - "%" while in-dtd? is set: expand a parameter entity and continue;
!   - otherwise: append the character, advance, and continue.
! Text produced by entity expansion is appended without being passed
! to quot.
:: (parse-char) ( quot: ( ch -- ? ) -- )
    get-char :> char
    {
        { [ char not ] [ ] }
        { [ char quot call ] [ next ] }
        { [ char CHAR: & = ] [ parse-entity quot (parse-char) ] }
        { [ in-dtd? get char CHAR: % = and ] [ parse-pe quot (parse-char) ] }
        [ char , next quot (parse-char) ]
    } cond ; inline recursive

! Runs (parse-char) with quot inside a make scope and returns everything
! it appended as a string. The character at which quot answers true is
! consumed but not included in the result.
: parse-char ( quot: ( ch -- ? ) -- seq )
    [ (parse-char) ] "" make ; inline

! Calls text-w/]]> when the contents of the circular buffer are exactly
! the three characters "]]>".
: assure-no-]]> ( circular -- )
    "]]>" sequence= [ text-w/]]> ] when ;

! Reads character data up to the next "<" (which is consumed) and
! returns it as a string. While reading:
!   - the last three raw characters are tracked in a circular buffer so
!     that "]]>" can be reported via assure-no-]]>;
!   - when the depth variable is zero (no-text), any character other than
!     a blank or "<" is reported through pre/post-content.
:: parse-text ( -- string )
    3 f <array> <circular> :> circ
    depth get zero? :> no-text [| char |
        char circ push-circular
        circ assure-no-]]>
        no-text [ char blank? char CHAR: < = or [
            char 1string t pre/post-content
        ] unless ] when
        char CHAR: < =
    ] parse-char ;

! Skips whitespace and then requires a ">" at the current position,
! consuming it (expect reports a mismatch).
: close ( -- )
    pass-blank ">" expect ;

! Returns a copy of str in which every tab, carriage return and newline
! has been replaced by a single space; other characters are unchanged.
: normalize-quote ( str -- str )
    [ dup "\t\r\n" member? CHAR: \s rot ? ] map ;

! Reads a quoted value whose opening quote has already been consumed.
! ch is the quote character that terminates the value (it is consumed
! but not included). When <-disallowed? is true, a "<" inside the value
! triggers attr-w/<. The result is passed through normalize-quote.
! Calls unclosed-quote if no current character remains afterwards.
: (parse-quote) ( <-disallowed? ch -- string )
    ! After the swap, the first _ receives ch, the second <-disallowed?.
    swap '[
        dup _ = [ drop t ]
        [ CHAR: < = _ and [ attr-w/< ] [ f ] if ] if
    ] parse-char normalize-quote get-char
    [ unclosed-quote ] unless ; inline

! Skips whitespace, then parses a value delimited by ' or ". The opening
! quote character is consumed and also used as the terminator. Calls
! quoteless-attr when the current character is not a quote character.
: parse-quote* ( <-disallowed? -- seq )
    pass-blank get-char dup "'\"" member?
    [ next (parse-quote) ] [ quoteless-attr ] if ; inline

! Parses a quoted value with the "<" check turned off (passes f as
! <-disallowed? to parse-quote*).
: parse-quote ( -- seq )
   f parse-quote* ;
Reorganizing XML 2009-01-21 19:16:51 -05:00			`! Copyright (C) 2005, 2009 Daniel Ehrenberg`
Initial import 2007-09-20 18:09:08 -04:00			`! See http://factorcode.org/license.txt for BSD license.`
Reorganizing XML 2009-01-21 19:16:51 -05:00			`USING: namespaces xml.state kernel sequences accessors`
			`xml.char-classes xml.errors math io sbufs fry strings ascii`
			`circular xml.entities assocs make splitting math.parser`
			`locals combinators arrays ;`
Initial import 2007-09-20 18:09:08 -04:00			`IN: xml.tokenize`

XML refactoring, splitting up docs 2009-01-21 00:54:33 -05:00			`: version=1.0? ( -- ? )`
			`prolog-data get [ version>> "1.0" = ] [ t ] if* ;`

			`: assure-good-char ( ch -- ch )`
			`[`
			`version=1.0? over text? not get-check and`
			`[ disallowed-char ] when`
			`] [ f ] if* ;`

			`! * Basic utility words`

			`: record ( char -- )`
			`CHAR: \n =`
			`[ 0 get-line 1+ set-line ] [ get-column 1+ ] if`
			`set-column ;`

			`! (next) normalizes \r\n and \r`
			`: (next) ( -- char )`
			`get-next read1`
			`2dup swap CHAR: \r = [`
			`CHAR: \n =`
			`[ nip read1 ] [ nip CHAR: \n swap ] if`
			`] [ drop ] if`
			`set-next dup set-char assure-good-char ;`

			`: next ( -- )`
			`#! Increment spot.`
			`get-char [ unexpected-end ] unless (next) record ;`

Reorganizing XML 2009-01-21 19:16:51 -05:00			`: init-parser ( -- )`
			`0 1 0 f f <spot> spot set`
			`read1 set-next next ;`

			`: with-state ( stream quot -- )`
			`! with-input-stream implicitly creates a new scope which we use`
			`swap [ init-parser call ] with-input-stream ; inline`

XML refactoring, splitting up docs 2009-01-21 00:54:33 -05:00			`: skip-until ( quot: ( -- ? ) -- )`
			`get-char [`
			`[ call ] keep swap [ drop ] [`
			`next skip-until`
			`] if`
			`] [ drop ] if ; inline recursive`

			`: take-until ( quot -- string )`
			`#! Take the substring of a string starting at spot`
			`#! from code until the quotation given is true and`
			`#! advance spot to after the substring.`
			`10 <sbuf> [`
			`'[ @ [ t ] [ get-char _ push f ] if ] skip-until`
			`] keep >string ; inline`

			`: take-char ( ch -- string )`
			`[ dup get-char = ] take-until nip ;`

			`: pass-blank ( -- )`
			`#! Advance code past any whitespace, including newlines`
			`[ get-char blank? not ] skip-until ;`

			`: string-matches? ( string circular -- ? )`
			`get-char over push-circular`
			`sequence= ;`

			`: take-string ( match -- string )`
			`dup length <circular-string>`
			`[ 2dup string-matches? ] take-until nip`
			`dup length rot length 1- - head`
			`get-char [ missing-close ] unless next ;`

DTDs are a separate type now; all variables in xml.state 2009-01-23 16:29:28 -05:00			`: expect ( string -- )`
XML refactoring, splitting up docs 2009-01-21 00:54:33 -05:00			`dup [ get-char next ] replicate 2dup =`
			`[ 2drop ] [ expected ] if ;`

XML parses entities now 2009-01-15 23:20:24 -05:00			`: parse-named-entity ( string -- )`
Various XML fixes, XML test suite 2009-01-19 23:25:15 -05:00			`dup entities at [ , ] [`
XML parses entities now 2009-01-15 23:20:24 -05:00			`dup extra-entities get at`
Various XML fixes, XML test suite 2009-01-19 23:25:15 -05:00			`[ % ] [ no-entity ] ?if`
Initial import 2007-09-20 18:09:08 -04:00			`] ?if ;`

			`: parse-entity ( -- )`
			`next CHAR: ; take-char next`
			`"#" ?head [`
			`"x" ?head 16 10 ? base> ,`
XML parses entities now 2009-01-15 23:20:24 -05:00			`] [ parse-named-entity ] if ;`
Initial import 2007-09-20 18:09:08 -04:00
XML refactoring, splitting up docs 2009-01-21 00:54:33 -05:00			`: parse-pe ( -- )`
			`next CHAR: ; take-char dup next`
			`pe-table get at [ % ] [ no-entity ] ?if ;`

Various XML fixes, XML test suite 2009-01-19 23:25:15 -05:00			`:: (parse-char) ( quot: ( ch -- ? ) -- )`
			`get-char :> char`
			`{`
			`{ [ char not ] [ ] }`
			`{ [ char quot call ] [ next ] }`
			`{ [ char CHAR: & = ] [ parse-entity quot (parse-char) ] }`
XML refactoring, splitting up docs 2009-01-21 00:54:33 -05:00			`{ [ in-dtd? get char CHAR: % = and ] [ parse-pe quot (parse-char) ] }`
Various XML fixes, XML test suite 2009-01-19 23:25:15 -05:00			`[ char , next quot (parse-char) ]`
			`} cond ; inline recursive`

			`: parse-char ( quot: ( ch -- ? ) -- seq )`
			`[ (parse-char) ] "" make ; inline`
Initial import 2007-09-20 18:09:08 -04:00
Various XML fixes, XML test suite 2009-01-19 23:25:15 -05:00			`: assure-no-]]> ( circular -- )`
			`"]]>" sequence= [ text-w/]]> ] when ;`
Initial import 2007-09-20 18:09:08 -04:00
Going further towards conformance 2009-01-20 16:37:21 -05:00			`:: parse-text ( -- string )`
			`3 f <array> <circular> :> circ`
			`depth get zero? :> no-text [\| char \|`
			`char circ push-circular`
			`circ assure-no-]]>`
			`no-text [ char blank? char CHAR: < = or [`
			`char 1string t pre/post-content`
			`] unless ] when`
			`char CHAR: < =`
Various XML fixes, XML test suite 2009-01-19 23:25:15 -05:00			`] parse-char ;`

Reorganizing XML 2009-01-21 19:16:51 -05:00			`: close ( -- )`
DTDs are a separate type now; all variables in xml.state 2009-01-23 16:29:28 -05:00			`pass-blank ">" expect ;`
Initial import 2007-09-20 18:09:08 -04:00
XML refactoring, splitting up docs 2009-01-21 00:54:33 -05:00			`: normalize-quote ( str -- str )`
			`[ dup "\t\r\n" member? [ drop CHAR: \s ] when ] map ;`

Various XML fixes, XML test suite 2009-01-19 23:25:15 -05:00			`: (parse-quote) ( <-disallowed? ch -- string )`
			`swap '[`
			`dup _ = [ drop t ]`
			`[ CHAR: < = _ and [ attr-w/< ] [ f ] if ] if`
XML refactoring, splitting up docs 2009-01-21 00:54:33 -05:00			`] parse-char normalize-quote get-char`
Various XML fixes, XML test suite 2009-01-19 23:25:15 -05:00			`[ unclosed-quote ] unless ; inline`
XML parses entities now 2009-01-15 23:20:24 -05:00
Various XML fixes, XML test suite 2009-01-19 23:25:15 -05:00			`: parse-quote* ( <-disallowed? -- seq )`
XML parses entities now 2009-01-15 23:20:24 -05:00			`pass-blank get-char dup "'\"" member?`
Various XML fixes, XML test suite 2009-01-19 23:25:15 -05:00			`[ next (parse-quote) ] [ quoteless-attr ] if ; inline`

			`: parse-quote ( -- seq )`
			`f parse-quote* ;`