factor/core/strings/parser/parser.factor

! Copyright (C) 2008, 2009 Slava Pestov, Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors assocs kernel lexer make math math.parser
namespaces parser sequences splitting strings arrays ;
IN: strings.parser

! Thrown by escape when the character following a backslash has no
! entry in its translation table.
ERROR: bad-escape ;

! Map the character that follows a backslash to the character the
! escape sequence stands for. Throws bad-escape when the character is
! not a key of the table.
: escape ( escape -- ch )
    H{
        { CHAR: a CHAR: \a }
        { CHAR: e CHAR: \e }
        { CHAR: n CHAR: \n }
        { CHAR: r CHAR: \r }
        { CHAR: t CHAR: \t }
        { CHAR: s CHAR: \s }
        { CHAR: \s CHAR: \s }
        { CHAR: 0 CHAR: \0 }
        { CHAR: \\ CHAR: \\ }
        { CHAR: \" CHAR: \" }
    } at*
    ! at* leaves a found? flag on top; on a miss discard the f
    ! placeholder before raising the error.
    [ drop bad-escape ] unless ;

! Variable holding a quotation that unicode-escape calls with effect
! ( name -- char ) to resolve the name written inside \u{...}.
SYMBOL: name>char-hook

! Value stored by initialize: a quotation that throws.
! NOTE(review): presumably a Unicode vocabulary replaces this with a
! real lookup - confirm against the code that sets name>char-hook.
name>char-hook [
    [ "Unicode support not available" throw ]
] initialize

! Decode the body of a \u escape; str is the text right after the "u".
! Two forms are accepted:
!   {name}  - name is passed to the quotation stored in name>char-hook
!   XXXXXX  - six hexadecimal digits
! Leaves the decoded character and the text following the escape.
! Throws bad-escape when the closing brace is missing, when fewer than
! six characters remain, or when the six characters are not a valid
! hexadecimal number.
: unicode-escape ( str -- ch str' )
    "{" ?head-slice [
        ! index yields f when there is no closing brace, and
        ! cut-slice cannot take f, so reject that case here.
        CHAR: } over index [ bad-escape ] unless*
        cut-slice
        [ >string name>char-hook get call( name -- char ) ] dip
        ! Drop the closing brace itself.
        rest-slice
    ] [
        dup length 6 < [ bad-escape ] when
        ! hex> yields f for malformed digits; f must never reach the
        ! string under construction.
        6 cut-slice [ hex> [ bad-escape ] unless* ] dip
    ] if ;

! Decode one escape sequence; str is the text right after the
! backslash. A leading "u" selects unicode-escape, any other character
! is translated by escape. Leaves the decoded character and the text
! following the escape sequence.
! Throws bad-escape when str is empty, i.e. the backslash was the last
! character of the input.
: next-escape ( str -- ch str' )
    ! Without this guard an empty str fails inside unclip-slice with
    ! an error unrelated to escapes.
    dup empty? [ bad-escape ] when
    "u" ?head-slice [
        unicode-escape
    ] [
        unclip-slice escape swap
    ] if ;

! Append str to the sequence currently being built by make, replacing
! every backslash escape with the character it denotes.
: (unescape-string) ( str -- )
    CHAR: \\ over index [
        ! Emit the text before the backslash, skip the backslash,
        ! emit the decoded escape, then continue with the remainder.
        cut-slice [ % ] dip rest-slice
        next-escape [ , ] dip
        (unescape-string)
    ] [
        ! No backslash left: emit the remaining text unchanged.
        %
    ] if* ;

! Return a new string equal to str with all backslash escapes decoded.
: unescape-string ( str -- str' )
    [ (unescape-string) ] "" make ;

! Scan str up to the closing double quote, appending the decoded
! contents to the sequence being built by make. str must be a slice:
! the result m is the from>> of the slice that starts right after the
! closing quote. Throws "Unterminated string" when neither a quote nor
! a backslash is found.
: (parse-string) ( str -- m )
    dup [ "\"\\" member? ] find
    [ "Unterminated string" throw ] unless*
    ! Stack: str i elt. Emit the text before elt and step over elt.
    [ cut-slice [ % ] dip rest-slice ] dip
    CHAR: " =
    [ from>> ]
    [ next-escape [ , ] dip (parse-string) ] if ;

! Read a string literal from the current lexer line, starting at the
! current column, and return its decoded contents.
! The quotation receives the column and the line text, and leaves the
! string with the index computed by (parse-string) on top of it.
! NOTE(review): change-lexer-column presumably stores that top value
! as the new column - confirm against the lexer vocabulary.
: parse-string ( -- str )
    lexer get
    [ swap tail-slice [ (parse-string) ] "" make swap ]
    change-lexer-column ;

<PRIVATE

! Return the part of the current line between the lexer's column and
! index i, then set the lexer's column to i.
: lexer-advance ( i -- before )
    lexer get
    ! First quotation: i lexer -- before. swapd orders the stack as
    ! column, i, line-text for subseq.
    [ [ column>> ] [ line-text>> ] bi swapd subseq ]
    [ (>>column) ] 2bi ;

! Search the current line, starting at the lexer's column, for the
! first character that is either ch or a backslash. Leaves its index
! and the character, as produced by find-from.
: find-next-token ( ch -- i elt )
    CHAR: \\ 2array [ member? ] curry
    lexer get [ column>> ] [ line-text>> ] bi
    rot find-from ;

! Slice of the current line from the lexer's column to the end.
: rest-of-line ( -- seq )
    lexer get [ line-text>> ] keep column>> tail-slice ;

! Character of the lexer's current line at the lexer's column.
: current-char ( lexer -- ch )
    dup column>> swap line-text>> nth ;

! Move the lexer's column one position to the right.
: advance-char ( lexer -- )
    dup column>> 1 + >>column drop ;

! Thrown by next-char when still-parsing-line? is false for the lexer,
! so no character can be read from the current line.
ERROR: escaped-char-expected ;

! Return the character at the lexer's column and step the column past
! it. Throws escaped-char-expected when still-parsing-line? is false.
: next-char ( lexer -- ch )
    dup still-parsing-line? [ escaped-char-expected ] unless
    [ current-char ] keep advance-char ;

! i is the index of a backslash on the current line. Append the text
! before it, the backslash itself and the character following it to
! the sequence being built by make; the lexer's column ends up after
! that character. The escape is copied, not decoded.
: parse-escape ( i -- )
    lexer-advance %
    CHAR: \\ ,
    ! Step over the backslash, then copy the escaped character.
    lexer get dup advance-char next-char , ;

! Discard obj, append the remainder of the current line followed by a
! newline character to the sequence being built by make, and move the
! lexer to the next line.
: next-string-line ( obj -- )
    drop
    rest-of-line %
    lexer get next-line
    CHAR: \n , ;

! True when the current line, from the lexer's column onward, starts
! with string.
: rest-begins? ( string -- ? )
    lexer get [ line-text>> ] [ column>> ] bi tail-slice
    swap head? ;

! Forward declaration: parse-rest-of-line and parse-til-separator call
! (parse-long-string), which is defined after them and calls back into
! parse-til-separator.
DEFER: (parse-long-string)

! Handle the character token found at index i of the current line
! while reading a long string terminated by string.
! - Backslash: copy the escape sequence and keep parsing.
! - Otherwise: emit the text before i. If the terminator starts there,
!   move the column past it and stop; if not, copy one character and
!   keep parsing.
: parse-rest-of-line ( string i token -- )
    CHAR: \\ = [
        parse-escape (parse-long-string)
    ] [
        lexer-advance %
        dup rest-begins?
        ! Terminator found: advance the column by its length.
        [ length [ + ] curry lexer get swap change-column drop ]
        [ lexer get next-char , (parse-long-string) ] if
    ] if ;

! Look on the current line for the first character of string or a
! backslash. When one is found, hand it to parse-rest-of-line;
! otherwise emit the rest of the line, move to the next line and
! continue parsing.
: parse-til-separator ( string -- )
    dup first find-next-token dup
    [ parse-rest-of-line ]
    ! Nothing found: both results are f. Drop one here;
    ! next-string-line discards the other.
    [ drop next-string-line (parse-long-string) ] if ;

! Read text up to the terminator string, appending it to the sequence
! being built by make. Calls unexpected-eof with string on the stack
! when still-parsing? is false for the lexer.
: (parse-long-string) ( string -- )
    lexer get still-parsing? [ unexpected-eof ] unless
    parse-til-separator ;

! Read text up to the terminator string and return it as a new string.
! Escape sequences are copied as written, not decoded.
: parse-long-string ( string -- string' )
    [ (parse-long-string) ] "" make ;

! Read text up to the terminator string with parse-long-string, then
! decode its backslash escapes with unescape-string.
: parse-long-string-escaped ( string -- string' )
    parse-long-string unescape-string ;

PRIVATE>

! Read a string literal that may span several lines and return its
! decoded contents. When the rest of the current line starts with two
! double quotes, they are skipped and the literal runs until three
! consecutive double quotes; otherwise it runs until the next single
! double quote.
! NOTE(review): presumably the opening double quote has already been
! consumed by the caller - confirm.
: parse-multiline-string ( -- string )
    rest-of-line "\"\"" head?
    [ lexer get [ 2 + ] change-column drop "\"\"\"" ]
    [ "\"" ] if
    parse-long-string-escaped ;
add multiline string support 2009-09-19 04:55:05 -04:00			`! Copyright (C) 2008, 2009 Slava Pestov, Doug Coleman.`
Split up huge parser vocabulary 2008-06-25 04:25:08 -04:00			`! See http://factorcode.org/license.txt for BSD license.`
add multiline string support 2009-09-19 04:55:05 -04:00			`USING: accessors assocs kernel lexer make math math.parser`
			`namespaces parser sequences splitting strings arrays ;`
Split up huge parser vocabulary 2008-06-25 04:25:08 -04:00			`IN: strings.parser`

			`ERROR: bad-escape ;`

			`: escape ( escape -- ch )`
			`H{`
			`{ CHAR: a CHAR: \a }`
			`{ CHAR: e CHAR: \e }`
			`{ CHAR: n CHAR: \n }`
			`{ CHAR: r CHAR: \r }`
			`{ CHAR: t CHAR: \t }`
			`{ CHAR: s CHAR: \s }`
			`{ CHAR: \s CHAR: \s }`
			`{ CHAR: 0 CHAR: \0 }`
			`{ CHAR: \\ CHAR: \\ }`
			`{ CHAR: \" CHAR: \" }`
			`} at [ bad-escape ] unless* ;`

			`SYMBOL: name>char-hook`

Update some existing code to use initialize 2009-02-10 17:16:12 -05:00			`name>char-hook [`
			`[ "Unicode support not available" throw ]`
			`] initialize`
Split up huge parser vocabulary 2008-06-25 04:25:08 -04:00
			`: unicode-escape ( str -- ch str' )`
			`"{" ?head-slice [`
			`CHAR: } over index cut-slice`
Move call( and execute( to core 2009-03-16 21:11:36 -04:00			`[ >string name>char-hook get call( name -- char ) ] dip`
Split up huge parser vocabulary 2008-06-25 04:25:08 -04:00			`rest-slice`
			`] [`
Refactor all usages of >r/r> in core to use dip, 2dip, 3dip Non-optimizing compiler now special-cases dip, 2dip, 3dip following a literal quotation: this allows us to break the dip/slip meta-circle without explicit calls to >r/r> 2008-11-23 03:44:56 -05:00			`6 cut-slice [ hex> ] dip`
Split up huge parser vocabulary 2008-06-25 04:25:08 -04:00			`] if ;`

			`: next-escape ( str -- ch str' )`
			`"u" ?head-slice [`
			`unicode-escape`
			`] [`
			`unclip-slice escape swap`
			`] if ;`

add multiline string support 2009-09-19 04:55:05 -04:00			`: (unescape-string) ( str -- )`
			`CHAR: \\ over index dup [`
			`cut-slice [ % ] dip rest-slice`
			`next-escape [ , ] dip`
			`(unescape-string)`
			`] [`
			`drop %`
			`] if ;`

			`: unescape-string ( str -- str' )`
			`[ (unescape-string) ] "" make ;`

Split up huge parser vocabulary 2008-06-25 04:25:08 -04:00			`: (parse-string) ( str -- m )`
			`dup [ "\"\\" member? ] find dup [`
Refactor all usages of >r/r> in core to use dip, 2dip, 3dip Non-optimizing compiler now special-cases dip, 2dip, 3dip following a literal quotation: this allows us to break the dip/slip meta-circle without explicit calls to >r/r> 2008-11-23 03:44:56 -05:00			`[ cut-slice [ % ] dip rest-slice ] dip`
Move call( and execute( to core 2009-03-16 21:11:36 -04:00			`CHAR: " = [`
			`from>>`
Split up huge parser vocabulary 2008-06-25 04:25:08 -04:00			`] [`
Move call( and execute( to core 2009-03-16 21:11:36 -04:00			`next-escape [ , ] dip (parse-string)`
Split up huge parser vocabulary 2008-06-25 04:25:08 -04:00			`] if`
			`] [`
			`"Unterminated string" throw`
			`] if ;`

			`: parse-string ( -- str )`
			`lexer get [`
			`[ swap tail-slice (parse-string) ] "" make swap`
			`] change-lexer-column ;`
Remove eval dependency from unicode.syntax 2008-12-08 20:46:40 -05:00
add multiline string support 2009-09-19 04:55:05 -04:00			`<PRIVATE`

			`: lexer-advance ( i -- before )`
			`[`
			`[`
			`lexer get`
			`[ column>> ] [ line-text>> ] bi`
			`] dip swap subseq`
Remove eval dependency from unicode.syntax 2008-12-08 20:46:40 -05:00			`] [`
add multiline string support 2009-09-19 04:55:05 -04:00			`lexer get (>>column)`
			`] bi ;`

			`: find-next-token ( ch -- i elt )`
			`CHAR: \ 2array`
			`[ lexer get [ column>> ] [ line-text>> ] bi ] dip`
			`[ member? ] curry find-from ;`

			`: rest-of-line ( -- seq )`
			`lexer get [ line-text>> ] [ column>> ] bi tail-slice ;`

the last character on a multiline string cannot be a backslash 2009-09-20 15:18:19 -04:00			`: current-char ( lexer -- ch )`
			`[ column>> ] [ line-text>> ] bi nth ;`

			`: advance-char ( lexer -- )`
			`[ 1 + ] change-column drop ;`

			`ERROR: escaped-char-expected ;`

			`: next-char ( lexer -- ch )`
			`dup still-parsing-line? [`
			`[ current-char ] [ advance-char ] bi`
			`] [`
			`escaped-char-expected`
			`] if ;`

add multiline string support 2009-09-19 04:55:05 -04:00			`: parse-escape ( i -- )`
			`lexer-advance % CHAR: \ ,`
			`lexer get`
the last character on a multiline string cannot be a backslash 2009-09-20 15:18:19 -04:00			`[ advance-char ]`
			`[ next-char , ] bi ;`
add multiline string support 2009-09-19 04:55:05 -04:00
			`: next-string-line ( obj -- )`
			`drop rest-of-line %`
			`lexer get next-line "\n" % ;`

			`: rest-begins? ( string -- ? )`
			`[`
			`lexer get [ line-text>> ] [ column>> ] bi tail-slice`
			`] dip head? ;`

			`DEFER: (parse-long-string)`

			`: parse-rest-of-line ( string i token -- )`
			`CHAR: \ = [`
			`parse-escape (parse-long-string)`
			`] [`
			`lexer-advance %`
			`dup rest-begins? [`
			`[ lexer get ] dip length [ + ] curry change-column drop`
			`] [`
fixing some quotes in strings bugs 2009-09-20 16:08:06 -04:00			`lexer get next-char , (parse-long-string)`
add multiline string support 2009-09-19 04:55:05 -04:00			`] if`
Remove eval dependency from unicode.syntax 2008-12-08 20:46:40 -05:00			`] if ;`

add multiline string support 2009-09-19 04:55:05 -04:00			`: parse-til-separator ( string -- )`
			`dup first find-next-token [`
			`parse-rest-of-line`
			`] [`
			`next-string-line (parse-long-string)`
			`] if* ;`

			`: (parse-long-string) ( string -- )`
			`lexer get still-parsing? [`
			`parse-til-separator`
			`] [`
			`unexpected-eof`
			`] if ;`

			`: parse-long-string ( string -- string' )`
the last character on a multiline string cannot be a backslash 2009-09-20 15:18:19 -04:00			`[ (parse-long-string) ] "" make ;`

			`: parse-long-string-escaped ( string -- string' )`
			`parse-long-string unescape-string ;`

			`PRIVATE>`
add multiline string support 2009-09-19 04:55:05 -04:00
			`: parse-multiline-string ( -- string )`
			`rest-of-line "\"\"" head? [`
			`lexer get [ 2 + ] change-column drop`
the last character on a multiline string cannot be a backslash 2009-09-20 15:18:19 -04:00			`"\"\"\"" parse-long-string-escaped`
add multiline string support 2009-09-19 04:55:05 -04:00			`] [`
the last character on a multiline string cannot be a backslash 2009-09-20 15:18:19 -04:00			`"\"" parse-long-string-escaped`
add multiline string support 2009-09-19 04:55:05 -04:00			`] if ;`