factor/core/strings/parser/parser.factor

! Copyright (C) 2008, 2009 Slava Pestov, Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors assocs kernel lexer make math math.parser
namespaces parser sequences splitting strings arrays
math.order ;
IN: strings.parser

! Thrown by escape when the character after a backslash has no entry
! in the escape table.
ERROR: bad-escape ;

! Map the character that follows a backslash to the character it
! denotes (for example CHAR: n gives a newline). A character that is
! not a key of the table throws bad-escape.
: escape ( escape -- ch )
    H{
        { CHAR: \\ CHAR: \\ }
        { CHAR: \" CHAR: \" }
        { CHAR: \s CHAR: \s }
        { CHAR: s  CHAR: \s }
        { CHAR: 0  CHAR: \0 }
        { CHAR: a  CHAR: \a }
        { CHAR: e  CHAR: \e }
        { CHAR: n  CHAR: \n }
        { CHAR: r  CHAR: \r }
        { CHAR: t  CHAR: \t }
    } at* [ drop bad-escape ] unless ;

! Variable holding a quotation with effect ( name -- char ).
! unicode-escape calls it to resolve the name inside a \u{name} escape.
SYMBOL: name>char-hook

! Default hook, installed with initialize: it throws, signalling that
! no name-to-character lookup has been supplied.
name>char-hook [
    [ "Unicode support not available" throw ]
] initialize

! Decode the body of a \u escape. The caller has already removed the
! leading "u". Returns the decoded character and the unconsumed rest of
! the input.
!   {name}  -- the text between the braces is converted to a string and
!              resolved by the quotation stored in name>char-hook
!   xxxxxx  -- the next six elements are parsed as a hexadecimal number
! Throws bad-escape when the closing brace is missing or when the six
! characters are not a valid hexadecimal number, so that f is never
! passed on as if it were an index or a character.
: unicode-escape ( str -- ch str' )
    "{" ?head-slice [
        ! index gives f when there is no closing brace
        CHAR: } over index [ bad-escape ] unless* cut-slice
        [ >string name>char-hook get call( name -- char ) ] dip
        ! skip the closing brace itself
        rest-slice
    ] [
        ! hex> gives f when the digits are not valid hexadecimal
        6 cut-slice [ hex> [ bad-escape ] unless* ] dip
    ] if ;

! Decode one escape sequence. str starts just after the backslash.
! A leading "u" selects unicode-escape; otherwise the first element is
! looked up with escape. Returns the character and the remaining input.
: next-escape ( str -- ch str' )
    "u" ?head-slice
    [ unicode-escape ]
    [ unclip-slice swap [ escape ] dip ] if ;

! Worker for unescape-string; must run inside make.
! Finds the next backslash in str. Everything before it is appended to
! the sequence under construction, the escape after it is decoded with
! next-escape and appended as one element, and processing continues on
! what is left. When no backslash remains, str is appended as is.
: (unescape-string) ( str -- )
    CHAR: \\ over index [
        ! ( str i ) -- split at the backslash, emit the prefix
        cut-slice swap %
        ! drop the backslash, decode what follows, emit the character
        rest-slice next-escape swap ,
        (unescape-string)
    ] [
        %
    ] if* ;

! Return a new string built from str with every backslash escape
! replaced by the character it denotes. The work is done by
! (unescape-string) inside a "" make.
: unescape-string ( str -- str' )
    [ (unescape-string) ] "" make ;

! Worker for parse-string; must run inside make.
! Looks for the first double quote or backslash in str and appends the
! text before it to the sequence under construction.
!   double quote -- stop, returning from>> of the slice that follows it
!   backslash    -- decode the escape with next-escape, append the
!                   character, and continue on the remaining input
! Throws "Unterminated string" when str contains neither.
: (parse-string) ( str -- m )
    dup [ "\"\\" member? ] find [
        ! ( str i elt ) -- emit the prefix, step past the found element
        [ cut-slice swap % rest-slice ] dip
        CHAR: " =
        [ from>> ]
        [ next-escape swap , (parse-string) ] if
    ] [
        "Unterminated string" throw
    ] if* ;

! Parse a string literal from the current lexer line and return its
! contents with escapes decoded by (parse-string).
! NOTE(review): change-lexer-column is defined outside this file. The
! quotation is written as if it receives ( column line-text ) and must
! leave the new column on top of the stack -- confirm against the lexer
! vocabulary.
: parse-string ( -- str )
    lexer get [
        ! tail-slice from the column, collect the string with make, then
        ! swap so that the value m from (parse-string) ends up on top
        [ swap tail-slice (parse-string) ] "" make swap
    ] change-lexer-column ;

<PRIVATE

! Return the part of the current line from the lexer column up to (not
! including) index i, then set the lexer column to i.
: lexer-before ( i -- before )
    lexer get
    [ [ column>> swap ] [ line-text>> ] bi subseq ]
    [ (>>column) ] 2bi ;

! Search the current line, starting at the lexer column, for the first
! element that is either ch or a backslash. Outputs its index and the
! element found, as find-from does.
: find-next-token ( ch -- i elt )
    CHAR: \ 2array [ member? ] curry
    [ lexer get [ column>> ] [ line-text>> ] bi ] dip
    find-from ;

! Slice of the lexer's current line text from its column to the end.
: rest-of-line ( lexer -- seq )
    [ line-text>> ] keep column>> tail-slice ;

! Element of the current line at the lexer column, or f when ?nth finds
! nothing at that index.
: current-char ( lexer -- ch/f )
    [ column>> ] keep line-text>> ?nth ;

! Move the lexer column forward by one.
: advance-char ( lexer -- )
    dup column>> 1 + >>column drop ;

! Thrown by next-char when still-parsing-line? reports false for the
! lexer, so there is no character to read.
ERROR: escaped-char-expected ;

! Return the character at the lexer column and advance the column by
! one. Throws escaped-char-expected when still-parsing-line? is false.
: next-char ( lexer -- ch )
    dup still-parsing-line? [ escaped-char-expected ] unless
    [ current-char ] [ advance-char ] bi ;

! Append what remains of the current line to the sequence being built
! by make, advance the lexer with next-line, and append "\n".
: next-line% ( lexer -- )
    dup rest-of-line %
    next-line "\n" % ;

! True when the current line, from the lexer column onward, starts with
! string.
: rest-begins? ( string -- ? )
    lexer get rest-of-line swap head? ;

! Add n to the column of the current lexer.
: advance-lexer ( n -- )
    lexer get dup column>> rot + >>column drop ; inline

! Consume the run of consecutive double quotes starting at the lexer
! column and return it.
!   current character is not a double quote -- returns f
!   a non-quote follows later on the line   -- returns a repetition of
!       CHAR: " as long as the run
!   quotes run to the end of the line       -- returns rest-of-line
! The lexer column is then advanced by the length of the result.
: take-double-quotes ( -- string )
    lexer get dup current-char CHAR: " = [
        dup [ column>> ] [ line-text>> ] bi
        ! index of the first element that is not a double quote, or f
        [ CHAR: " = not ] find-from drop
        [ over column>> - CHAR: " <repetition> nip ]
        [ rest-of-line ] if*
    ] [
        drop f
    ] if
    dup length advance-lexer ;

! Consume the closing delimiter of a long string.
! For a delimiter of length 3, the whole run of double quotes at the
! lexer column is taken and everything after its first three elements
! is appended to the sequence being built by make. For any other
! delimiter length, the lexer column is advanced by one.
: end-string-parse ( delimiter -- )
    length 3 = [
        take-double-quotes 3 tail %
    ] [
        lexer get advance-char
    ] if ;

! parse-found-token and (parse-long-string) call each other, so the
! latter is declared ahead of its definition.
DEFER: (parse-long-string)

! Process the element that find-next-token located on the current line.
!   string -- the delimiter that ends the long string
!   i      -- index of the located element on the current line
!   token  -- the located element itself
! The text between the lexer column and i is appended to the sequence
! being built by make, and the column is moved to i (see lexer-before).
!   token is a backslash -- it and the character after it are appended
!       unchanged, and parsing continues
!   the rest of the line begins with string -- end-string-parse consumes
!       the delimiter and parsing stops
!   otherwise -- the element is ordinary content: it is appended and
!       parsing continues
: parse-found-token ( string i token -- )
    [ lexer-before % ] dip
    CHAR: \ = [
        lexer get [ next-char , ] [ next-char , ] bi (parse-long-string)
    ] [
        dup rest-begins? [
            end-string-parse
        ] [
            lexer get next-char , (parse-long-string)
        ] if
    ] if ;

! Error class with a single slot, string.
! NOTE(review): no word in the visible part of this file throws it --
! confirm whether it is used elsewhere before removing.
ERROR: trailing-characters string ;

! Worker for parse-long-string; must run inside make.
! string is the closing delimiter. While the lexer is still parsing,
! the current line is searched from the lexer column for the
! delimiter's first element or a backslash:
!   found     -- parse-found-token handles it
!   not found -- the rest of the line plus "\n" is appended, the lexer
!                moves to the next line, and the search repeats
! Calls unexpected-eof when still-parsing? is false.
: (parse-long-string) ( string -- )
    lexer get still-parsing? [ unexpected-eof ] unless
    dup first find-next-token [
        parse-found-token
    ] [
        drop lexer get next-line% (parse-long-string)
    ] if* ;

PRIVATE>

! Read text from the current lexer up to the closing delimiter given in
! string and return it as a new string. Collection is done by
! (parse-long-string) inside a "" make; this word itself decodes no
! escapes.
: parse-long-string ( string -- string' )
    [ (parse-long-string) ] "" make ;

! Parse a string that may span several lines and return it with escapes
! decoded by unescape-string.
! When the rest of the current line begins with two double quotes, the
! column is moved past them and the closing delimiter is three double
! quotes; otherwise it is a single double quote.
! NOTE(review): this presumably runs after the opening double quote has
! already been consumed -- confirm against the caller.
: parse-multiline-string ( -- string )
    lexer get rest-of-line "\"\"" head?
    [ 2 advance-lexer "\"\"\"" ]
    [ "\"" ] if
    parse-long-string unescape-string ;
add multiline string support 2009-09-19 04:55:05 -04:00			`! Copyright (C) 2008, 2009 Slava Pestov, Doug Coleman.`
Split up huge parser vocabulary 2008-06-25 04:25:08 -04:00			`! See http://factorcode.org/license.txt for BSD license.`
add multiline string support 2009-09-19 04:55:05 -04:00			`USING: accessors assocs kernel lexer make math math.parser`
fix string parsing 2009-09-20 22:50:17 -04:00			`namespaces parser sequences splitting strings arrays`
			`math.order ;`
Split up huge parser vocabulary 2008-06-25 04:25:08 -04:00			`IN: strings.parser`

			`ERROR: bad-escape ;`

			`: escape ( escape -- ch )`
			`H{`
			`{ CHAR: a CHAR: \a }`
			`{ CHAR: e CHAR: \e }`
			`{ CHAR: n CHAR: \n }`
			`{ CHAR: r CHAR: \r }`
			`{ CHAR: t CHAR: \t }`
			`{ CHAR: s CHAR: \s }`
			`{ CHAR: \s CHAR: \s }`
			`{ CHAR: 0 CHAR: \0 }`
			`{ CHAR: \\ CHAR: \\ }`
			`{ CHAR: \" CHAR: \" }`
			`} at [ bad-escape ] unless* ;`

			`SYMBOL: name>char-hook`

Update some existing code to use initialize 2009-02-10 17:16:12 -05:00			`name>char-hook [`
			`[ "Unicode support not available" throw ]`
			`] initialize`
Split up huge parser vocabulary 2008-06-25 04:25:08 -04:00
			`: unicode-escape ( str -- ch str' )`
			`"{" ?head-slice [`
			`CHAR: } over index cut-slice`
Move call( and execute( to core 2009-03-16 21:11:36 -04:00			`[ >string name>char-hook get call( name -- char ) ] dip`
Split up huge parser vocabulary 2008-06-25 04:25:08 -04:00			`rest-slice`
			`] [`
Refactor all usages of >r/r> in core to use dip, 2dip, 3dip Non-optimizing compiler now special-cases dip, 2dip, 3dip following a literal quotation: this allows us to break the dip/slip meta-circle without explicit calls to >r/r> 2008-11-23 03:44:56 -05:00			`6 cut-slice [ hex> ] dip`
Split up huge parser vocabulary 2008-06-25 04:25:08 -04:00			`] if ;`

			`: next-escape ( str -- ch str' )`
			`"u" ?head-slice [`
			`unicode-escape`
			`] [`
			`unclip-slice escape swap`
			`] if ;`

add multiline string support 2009-09-19 04:55:05 -04:00			`: (unescape-string) ( str -- )`
			`CHAR: \\ over index dup [`
			`cut-slice [ % ] dip rest-slice`
			`next-escape [ , ] dip`
			`(unescape-string)`
			`] [`
			`drop %`
			`] if ;`

			`: unescape-string ( str -- str' )`
			`[ (unescape-string) ] "" make ;`

Split up huge parser vocabulary 2008-06-25 04:25:08 -04:00			`: (parse-string) ( str -- m )`
			`dup [ "\"\\" member? ] find dup [`
Refactor all usages of >r/r> in core to use dip, 2dip, 3dip Non-optimizing compiler now special-cases dip, 2dip, 3dip following a literal quotation: this allows us to break the dip/slip meta-circle without explicit calls to >r/r> 2008-11-23 03:44:56 -05:00			`[ cut-slice [ % ] dip rest-slice ] dip`
Move call( and execute( to core 2009-03-16 21:11:36 -04:00			`CHAR: " = [`
			`from>>`
Split up huge parser vocabulary 2008-06-25 04:25:08 -04:00			`] [`
Move call( and execute( to core 2009-03-16 21:11:36 -04:00			`next-escape [ , ] dip (parse-string)`
Split up huge parser vocabulary 2008-06-25 04:25:08 -04:00			`] if`
			`] [`
			`"Unterminated string" throw`
			`] if ;`

			`: parse-string ( -- str )`
			`lexer get [`
			`[ swap tail-slice (parse-string) ] "" make swap`
			`] change-lexer-column ;`
Remove eval dependency from unicode.syntax 2008-12-08 20:46:40 -05:00
add multiline string support 2009-09-19 04:55:05 -04:00			`<PRIVATE`

fix string parsing 2009-09-20 22:50:17 -04:00			`: lexer-before ( i -- before )`
add multiline string support 2009-09-19 04:55:05 -04:00			`[`
			`[`
			`lexer get`
			`[ column>> ] [ line-text>> ] bi`
			`] dip swap subseq`
Remove eval dependency from unicode.syntax 2008-12-08 20:46:40 -05:00			`] [`
add multiline string support 2009-09-19 04:55:05 -04:00			`lexer get (>>column)`
			`] bi ;`

			`: find-next-token ( ch -- i elt )`
			`CHAR: \ 2array`
			`[ lexer get [ column>> ] [ line-text>> ] bi ] dip`
			`[ member? ] curry find-from ;`

fix string parsing 2009-09-20 22:50:17 -04:00			`: rest-of-line ( lexer -- seq )`
			`[ line-text>> ] [ column>> ] bi tail-slice ;`
add multiline string support 2009-09-19 04:55:05 -04:00
fix string parsing 2009-09-20 22:50:17 -04:00			`: current-char ( lexer -- ch/f )`
			`[ column>> ] [ line-text>> ] bi ?nth ;`
the last character on a multiline string cannot be a backslash 2009-09-20 15:18:19 -04:00
			`: advance-char ( lexer -- )`
			`[ 1 + ] change-column drop ;`

			`ERROR: escaped-char-expected ;`

			`: next-char ( lexer -- ch )`
			`dup still-parsing-line? [`
			`[ current-char ] [ advance-char ] bi`
			`] [`
			`escaped-char-expected`
			`] if ;`

fix string parsing 2009-09-20 22:50:17 -04:00			`: next-line% ( lexer -- )`
			`[ rest-of-line % ]`
			`[ next-line "\n" % ] bi ;`
add multiline string support 2009-09-19 04:55:05 -04:00
			`: rest-begins? ( string -- ? )`
			`[`
			`lexer get [ line-text>> ] [ column>> ] bi tail-slice`
			`] dip head? ;`

fix string parsing 2009-09-20 22:50:17 -04:00			`: advance-lexer ( n -- )`
			`[ lexer get ] dip [ + ] curry change-column drop ; inline`

			`: take-double-quotes ( -- string )`
			`lexer get dup current-char CHAR: " = [`
			`[ ] [ column>> ] [ line-text>> ] tri`
			`[ CHAR: " = not ] find-from drop [`
			`swap column>> - CHAR: " <repetition>`
			`] [`
			`rest-of-line`
			`] if*`
			`] [`
			`drop f`
			`] if dup length advance-lexer ;`

			`: end-string-parse ( delimiter -- )`
			`length 3 = [`
			`take-double-quotes 3 tail %`
			`] [`
			`lexer get advance-char`
			`] if ;`

add multiline string support 2009-09-19 04:55:05 -04:00			`DEFER: (parse-long-string)`

fix string parsing 2009-09-20 22:50:17 -04:00			`: parse-found-token ( i string token -- )`
			`[ lexer-before % ] dip`
add multiline string support 2009-09-19 04:55:05 -04:00			`CHAR: \ = [`
fix string parsing 2009-09-20 22:50:17 -04:00			`lexer get [ next-char , ] [ next-char , ] bi (parse-long-string)`
add multiline string support 2009-09-19 04:55:05 -04:00			`] [`
			`dup rest-begins? [`
fix string parsing 2009-09-20 22:50:17 -04:00			`end-string-parse`
add multiline string support 2009-09-19 04:55:05 -04:00			`] [`
fixing some quotes in strings bugs 2009-09-20 16:08:06 -04:00			`lexer get next-char , (parse-long-string)`
add multiline string support 2009-09-19 04:55:05 -04:00			`] if`
Remove eval dependency from unicode.syntax 2008-12-08 20:46:40 -05:00			`] if ;`

fix string parsing 2009-09-20 22:50:17 -04:00			`ERROR: trailing-characters string ;`
add multiline string support 2009-09-19 04:55:05 -04:00
			`: (parse-long-string) ( string -- )`
			`lexer get still-parsing? [`
fix string parsing 2009-09-20 22:50:17 -04:00			`dup first find-next-token [`
			`parse-found-token`
			`] [`
			`drop lexer get next-line%`
			`(parse-long-string)`
			`] if*`
add multiline string support 2009-09-19 04:55:05 -04:00			`] [`
			`unexpected-eof`
			`] if ;`

fix string parsing 2009-09-20 22:50:17 -04:00			`PRIVATE>`

add multiline string support 2009-09-19 04:55:05 -04:00			`: parse-long-string ( string -- string' )`
the last character on a multiline string cannot be a backslash 2009-09-20 15:18:19 -04:00			`[ (parse-long-string) ] "" make ;`

add multiline string support 2009-09-19 04:55:05 -04:00			`: parse-multiline-string ( -- string )`
fix string parsing 2009-09-20 22:50:17 -04:00			`lexer get rest-of-line "\"\"" head? [`
add multiline string support 2009-09-19 04:55:05 -04:00			`lexer get [ 2 + ] change-column drop`
fix string parsing 2009-09-20 22:50:17 -04:00			`"\"\"\""`
add multiline string support 2009-09-19 04:55:05 -04:00			`] [`
fix string parsing 2009-09-20 22:50:17 -04:00			`"\""`
			`] if parse-long-string unescape-string ;`