factor/contrib/parser-combinators/parser-combinators.factor

! Copyright (C) 2004 Chris Double.
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions are met:
!
! 1. Redistributions of source code must retain the above copyright notice,
!    this list of conditions and the following disclaimer.
!
! 2. Redistributions in binary form must reproduce the above copyright notice,
!    this list of conditions and the following disclaimer in the documentation
!    and/or other materials provided with the distribution.
!
! THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
! INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
! FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
! DEVELOPERS AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
! SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
! PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
! OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
! WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
! OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
! ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
IN: parser-combinators
USE: lazy
USE: stack
USE: lists
USE: strings
USE: math
USE: logic
USE: kernel
USE: combinators
USE: parser

: phead ( object -- head )
  #! Polymorphic head. Return the head item of the object.
  #! For a string this is the first character.
  #! For a list this is the car.
  [
    [ string? ] [ 0 swap str-nth ]
    [ list? ] [ car ]
  ] cond ;

: ptail ( object -- tail )
  #! Polymorphic tail. Return the tail of the object.
  #! For a string this is everything but the first character.
  #! For a list this is the cdr.
  [
    [ string? ] [ 1 str-tail ]
    [ list? ] [ cdr ]
  ] cond ;

: pfirst ( object -- first )
  #! Polymorphic first
  phead ;

: psecond ( object -- second )
  #! Polymorphic second
  [
    [ string? ] [ 1 swap str-nth ]
    [ list? ] [ cdr car ]
  ] cond ;

: ph:t ( object -- head tail )
  #! Return the head and tail of the object.
  dup phead swap ptail ;

: pempty? ( object -- bool )
  #! Polymorphic empty test.
  [
    [ string? ] [ "" = ]
    [ list? ] [ not ]
  ] cond ;

: string-take ( n string -- string )
  #! Return a string with the first 'n' characters
  #! of the original string.
  dup str-length pick < [
    2drop ""
  ] [
    swap str-head
  ] ifte ;

: (list-take) ( n list accum -- list )
  >r >r pred dup 0 < [
    drop r> drop r> nreverse
  ] [
    r> uncons swap r> cons (list-take)
  ] ifte ;

: list-take ( n list -- list )
  #! Return a list with the first 'n' characters
  #! of the original list.
  [ ] (list-take) ;

: ptake ( n object -- object )
  #! Polymorphic take.
  #! Return a collection of the first 'n'
  #! characters from the original collection.
  [
    [ string? ] [ string-take ]
    [ list? ] [ list-take ]
  ] cond ;

: string-drop ( n string -- string )
  #! Return a string with the first 'n' characters
  #! of the original string removed.
  dup str-length pick < [
    2drop ""
  ] [
    swap str-tail
  ] ifte ;

: list-drop ( n list -- list )
  #! Return a list with the first 'n' items
  #! of the original list removed.
  >r pred dup 0 < [
    drop r>
  ] [
    r> cdr list-drop
  ] ifte ;

: pdrop ( n object -- object )
  #! Polymorphic drop.
  #! Return a collection the same as 'object'
  #! but with the first n items removed.
  [
    [ string? ] [ string-drop ]
    [ list? ] [ list-drop ]
  ] cond ;

: ifte-head= ( string-or-list ch [ quot1 ] [ quot2 ] -- )
  #! When the character 'ch' is equal to the head
  #! of the string or list, run the quot1 otherwise run quot2.
  [ swap phead = ] 2dip ifte ;

: symbol ( ch -- parser )
  #! Return a parser that parses the given symbol.
  [ ( inp ch -- result )
    2dup [
      swap ptail cons lunit
    ] [
      2drop [ ]
    ] ifte-head=
  ] curry1 ;

: token ( string -- parser )
  #! Return a parser that parses the given string.
  [ ( inp string -- result )
    2dup str-length swap ptake over = [
      swap over str-length swap pdrop cons lunit
    ] [
      2drop [ ]
    ] ifte
  ] curry1 ;

: satisfy ( p -- parser )
  #! Return a parser that succeeds if the predicate 'p',
  #! when passed the first character in the input, returns
  #! true.
  [ ( inp p -- result )
    over pempty? [
      2drop [ ]
    ] [
      over phead swap call [
        ph:t cons lunit
      ] [
        drop [ ]
      ] ifte
    ] ifte
  ] curry1 ;

: satisfy2 ( p r -- parser )
  #! Return a parser that succeeds if the predicate 'p',
  #! when passed the first character in the input, returns
  #! true. On success the word 'r' is called with the
  #! successfully parser character on the stack. The result
  #! of this is returned as the result of the parser.
  [ ( inp p r -- result )
    >r over phead swap call [
      ph:t swap r> call swons lunit
    ] [
      r> 2drop [ ]
    ] ifte
  ] curry2 ;

: epsilon ( -- parser )
  #! A parser that parses the empty string.
  [ ( inp -- result )
    "" swap cons lunit
  ] ;

: succeed ( r -- parser )
  #! A parser that always returns 'r' and consumes no input.
  [ ( inp r -- result )
    swap cons lunit
  ] curry1 ;

: fail ( -- parser )
  #! A parser that always fails
  [
    drop [ ]
  ] ;

USE: prettyprint
USE: unparser

: ensure-list ( a -- [ a ] )
  #! If 'a' is not a list, make it one.
  dup list? [ unit ] unless ;

: ++ ( a b -- [ a b ] )
  #! Join two items into a list.
  >r ensure-list r> ensure-list append ;

: <&> ( p1 p2 -- parser )
  #! Sequentially combine two parsers, returning a parser
  #! that first calls p1, then p2 all remaining results from
  #! p1.
  [ ( inp p1 p2 -- result )
    >r call r> [ ( [ x | xs ] p2 -- result )
      >r uncons r> call swap [ ( [ x2 | xs2 ] x -- result )
        >r uncons swap r> swap ++ swons
      ] curry1 lmap
    ] curry1 lmap lappend*
  ] curry2 ;


: <|> ( p1 p2 -- parser )
  #! Choice operator for parsers. Return a parser that does
  #! p1 or p2 depending on which will succeed.
  [ ( inp p1 p2 -- result )
    rot tuck swap call >r swap call r> lappend
  ] curry2 ;

: p-abc ( -- parser )
  #! Test Parser. Parses the string "abc"
  "a" token "b" token "c" token <&> <&> ;

: parse-skipwhite ( string -- string )
  dup phead blank? [
    ptail parse-skipwhite
  ] [
  ] ifte ;

: sp ( parser -- parser )
  #! Return a parser that first skips all whitespace before
  #! parsing.
  [ ( inp parser -- result )
    [ parse-skipwhite ] dip call
  ] curry1 ;

: just ( parser -- parser )
  #! Return a parser that works exactly like the input parser
  #! but guarantees that the rest string is empty.
  [ ( inp parser -- result )
    call [ ( [ x | xs ] -- )
      cdr str-length 0 =
    ] lsubset
  ] curry1 ;

: <@ ( p f -- parser )
  #! Given a parser p and a quotation f return a parser
  #! that does the same as p but in addition applies f
  #! to the resulting parse tree.
  [ ( inp p f -- result )
    >r call r> [ ( [ x | xs ] f -- [ fx | xs ] )
      swap uncons [ swap over [ call ] [ drop ] ifte ] dip cons
    ] curry1 lmap
  ] curry2 ;

: p-1 ( -- parser )
  "1" token "123" swap call lcar ;

: p-2 ( -- parser )
  "1" token [ str>number ] <@ "123" swap call lcar ;

: some ( parser -- det-parser )
  #! Given a parser, return a parser that only produces the
  #! resulting parse tree of the first successful complete parse.
  [ ( inp parser -- result )
    just call lcar car
  ] curry1 ;

: delayed-parser ( [ parser ] -- parser )
  [ ( inp [ parser ] -- result )
    call call
  ] curry1 ;

: parens ;
: parens ( -- parser )
  #! Parse nested parentheses
  "(" token [ parens ] delayed-parser <&>
  ")" token <&> [ parens ] delayed-parser <&>
  epsilon <|> ;

: nesting ( -- parser )
  #! Count the maximum depth of nested parentheses.
  "(" token [ nesting ] delayed-parser <&> ")" token <&>
  [ nesting ] delayed-parser <&> [ .s drop "a" ] <@ epsilon <|> ;

: <& ( parser1 parser2 -- parser )
  #! Same as <&> except only return the first item in the parse tree.
  <&> [ pfirst ] <@ ;

: &> ( parser1 parser2 -- parser )
  #! Same as <&> except only return the second item in the parse tree.
  <&> [ psecond  ] <@ ;

: lst ( [ x [ xs ] ] -- [x:xs] )
  #! I need a good name for this word...
  dup cdr [ uncons car cons ] when unit ;

: <*> ( parser -- parser )
  #! Return a parser that accepts zero or more occurences of the original
  #! parser.
  dup [ <*> ] curry1 delayed-parser <&> [ lst ] <@ [ ] succeed <|> ;

: <+> ( parser -- parser )
  #! Return a parser that accepts one or more occurences of the original
  #! parser.
  dup [ <*> ] curry1 delayed-parser <&> [ lst ] <@  ;

: <?> ( parser -- parser )
  #! Return a parser where its construct is optional. It may or may not occur.
  [ ] succeed <|> ;

: <first> ( parser -- parser )
  #! Transform a parser into a parser that only returns the first success.
  [
    call dup [ lcar lunit ] when
  ] curry1 ;

: <!*> ( parser -- parser )
  #! Version of <*> that only returns the first success.
  <*> <first> ;

: <!+> ( parser -- parser )
  #! Version of <+> that only returns the first success.
  <+> <first> ;

: ab-test
  "a" token <*> "b" token <&> "aaaaab" swap call [ . ] leach ;

: ab-test2
  "a" token <*> "b" token <&> [ "a" "a" "a" "b" ] swap call [ . ] leach ;

: a "a" token "a" token <&> epsilon <|> ;
: b "b" token epsilon <|> ;
: c "c" token "c" token <&> ;
: d "d" token "d" token <&> ;
: count-a "a" token [ count-a ] delayed-parser &> "b" token <& [ 1 + ] <@ 0 succeed <|> ;
: tca "aaabbb" count-a call [ . ] leach ;

: parse-digit ( -- parser )
  #! Return a parser for digits
  [ digit? ] satisfy [ CHAR: 0 - ] <@ ;

: (reduce) ( start quot list -- value )
  #! Call quot with start and the first value in the list.
  #! quot is then called with the result of quot and the
  #! next item in the list until the list is exhausted.
  uncons >r swap dup dip r> dup [
    (reduce)
  ] [
    2drop
  ] ifte ;

: reduce ( list start quot -- value )
  #! Call quot with start and the first value in the list.
  #! quot is then called with the result of quot and the
  #! next item in the list until the list is exhausted.
  rot (reduce) ;

: natural ( -- parser )
  #! a parser for natural numbers.
  parse-digit <*> [ car 0 [ swap 10 * + ] reduce unit  ] <@  ;

: natural2 ( -- parser )
  #! a parser for natural numbers.
  parse-digit <!+> [ car 0 [ swap 10 * + ] reduce unit  ] <@  ;

: integer ( -- parser )
  #! A parser that can parser possible negative numbers.
  "-" token <?> [ drop -1 ] <@ natural2 <&> [ 1 [ * ] reduce ] <@  ;

: identifier ( -- parser )
  #! Parse identifiers
  [ letter? ] satisfy <+> [ car cat ] <@ ;

: identifier2 ( -- parser )
  #! Parse identifiers
  [ letter? ] satisfy <!+> [ car cat ] <@  ;

: ints ( -- parser )
  integer "+" token [ drop [ [ + ] ] ] <@ <&>
  integer <&> [ call swap call ] <@ ;

: url-quotable ( -- parser )
 ! [a-zA-Z0-9/_?] re-matches
 [ letter? ] satisfy
 [ LETTER? ] satisfy <|>
 [ digit? ] satisfy <|>
 CHAR: / symbol <|>
 CHAR: _ symbol <|>
 CHAR: ? symbol <|> just ;

: http-header ( -- parser )
  [ CHAR: : = not ] satisfy <!+> [ car cat ] <@
  ": " token [ drop f ] <@ <&>
  [ drop t ] satisfy <!+> [ car cat ] <@ <&> just ;

: parse-http-header ( string -- [ name value ] )
  http-header call lcar car ;

: get-request ( -- parser )
  "GET" token
  [ drop t ] satisfy <!+> sp [ car cat ] <@ <&> ;

: post-request ( -- parser )
  "POST" token
  [ drop t ] satisfy <!+> sp [ car cat ] <@ <&> ;

: all-request ( -- parser )
  "POST" token
  [ 32 = not  ] satisfy <!+> sp [ car cat ] <@ <&>
  "HTTP/1.0" token sp <&> ;

: split-url ( -- parser )
  "http://" token
  [ CHAR: / = not ] satisfy <!*> [ car cat ] <@ <&>
  "/" token <&>
  [ drop t ] satisfy <!*> [ car cat ] <@ <&> ;