factor/contrib/parser-combinators/parser-combinators.factor

! Copyright (C) 2004 Chris Double.
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions are met:
!
! 1. Redistributions of source code must retain the above copyright notice,
!    this list of conditions and the following disclaimer.
!
! 2. Redistributions in binary form must reproduce the above copyright notice,
!    this list of conditions and the following disclaimer in the documentation
!    and/or other materials provided with the distribution.
!
! THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
! INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
! FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
! DEVELOPERS AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
! SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
! PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
! OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
! WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
! OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
! ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
IN: parser-combinators
USE: lazy
USE: kernel
USE: sequences
USE: strings
USE: lists
USE: math

GENERIC: phead

M: string phead ( object -- head )
  #! Polymorphic head: return the head item of the object.
  #! For a string this is the element at index 0, i.e. the
  #! first character.
  0 swap nth ;

M: list phead ( object -- head )
  #! Polymorphic head: return the head item of the object.
  #! For a list this is the car.
  car ;

M: cons phead ( object -- head )
  #! Polymorphic head: return the head item of the object.
  #! For a cons cell this is the car.
  car ;

GENERIC: ptail

M: string ptail ( object -- tail )
  #! Polymorphic tail: return the tail of the object.
  #! For a string this is everything from index 1 onwards,
  #! i.e. all but the first character.
  1 swap tail ;

M: list ptail ( object -- tail )
  #! Polymorphic tail: return the tail of the object.
  #! For a list this is the cdr.
  cdr ;

M: cons ptail ( object -- tail )
  #! Polymorphic tail: return the tail of the object.
  #! For a cons cell this is the cdr.
  cdr ;

: pfirst ( object -- first )
  #! Polymorphic first: the first item in a collection.
  #! This is simply another name for phead.
  phead ;

GENERIC: psecond

M: string psecond ( object -- second )
  #! Polymorphic second: the second item in a collection.
  #! For a string this is the character at index 1.
  1 swap nth ;

M: list psecond ( object -- second )
  #! Polymorphic second: the second item in a collection.
  #! For a list this is the car of the cdr.
  cdr car ;

: ph:t ( object -- head tail )
  #! Split the object into its phead and its ptail. The
  #! head is left underneath the tail on the stack.
  dup >r phead r> ptail ;

GENERIC: pempty?

M: string pempty? ( object -- bool )
  #! Return true if the collection is empty.
  #! A string is empty when its length is zero.
  length 0 = ;

M: list pempty? ( object -- bool )
  #! Return true if the collection is empty.
  #! The result is the logical negation of the list, so it
  #! is true only when the list is the false/empty value.
  not ;

: string-take ( n string -- string )
  #! Take the leading 'n' characters of the string.
  #! When 'n' exceeds the length of the string the result
  #! is the empty string, not a shorter prefix.
  2dup length > [
    2drop ""
  ] [
    head
  ] ifte ;

: (list-take) ( n list accum -- list )
  #! Worker for list-take. Moves items from the front of
  #! 'list' onto 'accum' until 'n' drops below one, then
  #! returns the accumulated items in their original order.
  pick 1 < [
    ( n list accum ) nip nip reverse
  ] [
    >r uncons swap r> cons   ( n rest accum' )
    rot 1 - -rot             ( n-1 rest accum' )
    (list-take)
  ] ifte ;

: list-take ( n list -- list )
  #! Return a list with the first 'n' items
  #! of the original list.
  #! Delegates to (list-take), starting from an empty
  #! accumulator.
  [ ] (list-take) ;

GENERIC: ptake

M: string ptake ( n object -- object )
  #! Polymorphic take.
  #! Return a collection of the first 'n' items of the
  #! original collection. Strings are handled by
  #! string-take.
  string-take ;

M: list ptake ( n object -- object )
  #! Polymorphic take.
  #! Return a collection of the first 'n' items of the
  #! original collection. Lists are handled by list-take.
  list-take ;

: string-drop ( n string -- string )
  #! Remove the leading 'n' characters from the string.
  #! When 'n' exceeds the length of the string the result
  #! is the empty string.
  2dup length > [
    2drop ""
  ] [
    tail
  ] ifte ;

: list-drop ( n list -- list )
  #! Remove the leading 'n' items from the list by taking
  #! the cdr once for each item to be dropped.
  over 1 < [
    nip
  ] [
    cdr >r 1 - r> list-drop
  ] ifte ;

GENERIC: pdrop

M: string pdrop ( n object -- object )
  #! Polymorphic drop.
  #! Return a collection the same as 'object'
  #! but with the first n items removed.
  #! Strings are handled by string-drop.
  string-drop ;

M: list pdrop ( n object -- object )
  #! Polymorphic drop.
  #! Return a collection the same as 'object'
  #! but with the first n items removed.
  #! Lists are handled by list-drop.
  list-drop ;

: token-parser ( inp sequence -- llist )
  #! Match one specific sequence at the front of the input.
  #! On a match the result is built with 'unit delay lunit'
  #! from a cons whose car is the input remaining after the
  #! sequence and whose cdr is the sequence itself.
  #! Otherwise the result is lnil.
  dup length pick ptake over = [
    dup length rot pdrop   ( sequence rest )
    swons unit delay lunit
  ] [
    2drop lnil
  ] ifte ;

: token ( string -- parser )
  #! Return a token parser that parses the given string.
  #! The parser is a quotation that pushes the string and
  #! then runs token-parser on the input below it.
  [ token-parser ] cons ;

: satisfy-parser ( inp pred -- llist )
  #! Succeed when the input is non-empty and the predicate
  #! returns true for its first item. The successful result
  #! pairs the rest of the input (car) with the matched
  #! item (cdr). Empty input or a false predicate gives
  #! lnil.
  over pempty? not [
    >r dup phead r> call [
      ph:t swons unit delay lunit
    ] [
      drop lnil
    ] ifte
  ] [
    2drop lnil
  ] ifte ;

: satisfy ( p -- parser )
  #! Return a parser that succeeds if the predicate 'p',
  #! when passed the first character in the input, returns
  #! true.
  #! The parser is a quotation that pushes 'p' and then
  #! runs satisfy-parser.
  [ satisfy-parser ] cons ;

: satisfy2-parser ( inp pred quot -- llist )
  #! A parser that succeeds if the predicate,
  #! when passed the first character in the input, returns
  #! true. On success the quotation is called with the
  #! successfully parsed character on the stack. The result
  #! of that call is returned as the result portion of the
  #! successful parse lazy list.
  #! Empty input is a failed parse (lnil), matching
  #! satisfy-parser; without this guard phead would be
  #! applied to an empty collection.
  pick pempty? [
    drop 2drop lnil
  ] [
    -rot over phead swap call [ ( quot inp -- )
      ph:t >r swap call r> swons unit delay lunit
    ] [
      2drop lnil
    ] ifte
  ] ifte ;

  : satisfy2 ( pred quot -- parser )
  #! Return a satisfy2-parser.
  #! The parser is a quotation that pushes 'pred' and
  #! 'quot', in that order, and then runs satisfy2-parser.
  [ satisfy2-parser ] cons cons ;

: epsilon-parser ( input -- llist )
  #! A parser that parses the empty string. It
  #! does not consume any input and always succeeds,
  #! pairing the unmodified input (car) with the
  #! empty string "" as the parse result (cdr).
  "" cons unit delay lunit ;

: epsilon ( -- parser )
  #! Return an epsilon parser: a quotation that runs
  #! epsilon-parser on the input.
  [ epsilon-parser ] ;

: succeed-parser ( input result -- llist )
  #! A parser that always returns 'result' as a
  #! successful parse with no input consumed.
  #! The input becomes the car and 'result' the cdr of
  #! the cons placed in the returned lazy list.
  cons unit delay lunit ;

: succeed ( result -- parser )
  #! Return a succeed parser: a quotation that pushes
  #! 'result' and then runs succeed-parser.
  [ succeed-parser ] cons ;

: fail-parser ( input -- llist )
  #! A parser that always fails and returns
  #! an empty list of successes.
  #! The input is discarded and lnil is returned.
  drop lnil ;

: fail ( -- parser )
  #! Return a fail-parser: a quotation that runs
  #! fail-parser on the input.
  [ fail-parser ] ;

: <&>-do-parser3 ( [[ rest result2 ]] result1 -- [[ rest [[ result1 result2 ]] ]] )
  #! Applied by <&>-do-parser2 to every result produced by
  #! parser2. Pairs the earlier result from parser1 with
  #! the parser2 result, keeping the remaining input of
  #! parser2 as the car of the new cons.
  swap uncons   ( result1 rest result2 )
  rot swons     ( rest [[ result1 result2 ]] )
  cons ;

: <&>-do-parser2 ( [[ x xs ]] parser2 -- result )
  #! Applied by <&>-parser to every successful result of
  #! parser1. The argument is a cons whose car is the
  #! remaining input and whose cdr is the data parsed so
  #! far. This word runs parser2 on the remaining input and
  #! maps <&>-do-parser3 over its results so each one
  #! carries the combined parse result.
  >r uncons r>   ( rest result1 parser2 )
  swap >r        ( rest parser2 ) ( result1 kept aside )
  call r>        ( llist result1 )
  [ <&>-do-parser3 ] cons lmap ;

: <&>-parser ( input parser1 parser2 -- llist )
  #! Sequential composition. parser1 is run on the input,
  #! then parser2 is run on the remaining input of each of
  #! parser1's results; the per-result lists are joined
  #! into a single lazy list with lappend*.
  [ <&>-do-parser2 ] cons   ( input parser1 mapper )
  >r call r>                ( llist mapper )
  lmap lappend* ;

: <&> ( parser1 parser2 -- parser )
  #! Sequentially combine two parsers, returning a parser
  #! that first calls parser1 and then applies parser2 to
  #! the remaining input of every result from parser1.
  #! The parser is a quotation that pushes both parsers
  #! and then runs <&>-parser.
  [ <&>-parser ] cons cons ;

: <|>-parser ( input parser1 parser2 -- result )
  #! Choice. Both parsers are run on the same input,
  #! parser1 first, and their result lists are joined with
  #! lappend so parser1's results come first.
  >r over >r   ( input parser1 ) ( parser2 and input kept aside )
  call         ( llist1 )
  r> r> call   ( llist1 llist2 )
  lappend ;

: <|> ( p1 p2 -- parser )
  #! Choice operator for parsers. Return a parser that
  #! applies both p1 and p2 to the same input and yields
  #! the results of whichever succeed.
  #! The parser is a quotation that pushes both parsers
  #! and then runs <|>-parser.
  [ <|>-parser ] cons cons ;

: string-ltrim ( string -- string )
  #! Return a new string without any leading whitespace
  #! from the original string.
  #! An empty string is returned unchanged. The emptiness
  #! check also stops the recursion once a string made up
  #! only of whitespace has been consumed, instead of
  #! applying phead to an empty string.
  dup pempty? [
    dup phead blank? [ ptail string-ltrim ] when
  ] unless ;

: sp-parser ( input parser -- result )
  #! Strip leading whitespace from the input with
  #! string-ltrim, then run the parser on what is left.
  swap string-ltrim swap call ;

: sp ( parser -- parser )
  #! Return a parser that first skips all whitespace before
  #! calling the original parser.
  #! The new parser is a quotation that pushes the original
  #! parser and then runs sp-parser.
  [ sp-parser ] cons ;

: just-parser ( input parser -- result )
  #! Calls the given parser on the input, then removes
  #! from the results every entry whose remaining input
  #! (the car of the result cons) is not empty. This
  #! ensures that only parses of the whole input string
  #! are kept.
  call [ car pempty? ] lsubset ;

: just ( parser -- parser )
  #! Return an instance of the just-parser: a quotation
  #! that pushes the original parser and then runs
  #! just-parser.
  [ just-parser ] cons ;

: (<@-parser-replace) ( [[ inp result ]] quot -- [[ inp new-result ]] )
  #! Result replacement step of <@-parser. Takes one
  #! successful parse, calls the quotation with the result
  #! portion on the stack and rebuilds the cons with the
  #! value the quotation leaves behind as the new result.
  >r uncons r>   ( inp result quot )
  call cons ;

: <@-parser ( input parser quot -- result )
  #! Run the parser on the input and pass the result
  #! portion of every successful parse through the
  #! quotation; whatever the quotation returns becomes the
  #! new parse result. Useful for reshaping parse trees,
  #! for example turning strings into integers. A failed
  #! parse (lnil) is returned untouched.
  [ (<@-parser-replace) ] cons -rot   ( mapper input parser )
  call dup lnil? [ ( mapper lnil -- )
    nip
  ] [ ( mapper result -- )
    swap lmap
  ] ifte ;

: <@ ( parser quot -- parser )
  #! Return an <@-parser: a quotation that pushes the
  #! parser and the quotation and then runs <@-parser, so
  #! each parse result is transformed by 'quot'.
  [ <@-parser ] cons cons ;

: some-parser ( input parser -- result )
  #! Calls the parser on the input, guarantees
  #! the parse is complete (the remaining input is empty),
  #! picks the first solution and only returns the parse
  #! tree since the remaining input is empty.
  #! The parser is wrapped with 'just', the first element
  #! of the result list is taken with lcar and its cdr is
  #! the value returned.
  #! NOTE(review): when there is no complete parse, lcar is
  #! applied to an empty result list; presumably that
  #! raises an error - confirm against the lazy vocabulary.
  just call lcar cdr ;

: some ( parser -- deterministic-parser )
  #! Creates a 'some-parser': a quotation that pushes the
  #! original parser and then runs some-parser.
  [ some-parser ] cons ;

: <&-parser ( input parser1 parser2 -- result )
  #! Same as <&> except discard the results of the second parser.
  #! The combined <&> result is a cons of both results;
  #! phead keeps only the first.
  <&> [ phead ] <@ call ;

: <& ( parser1 parser2 -- parser )
  #! Same as <&> except discard the results of the second parser.
  #! The parser is a quotation that pushes both parsers
  #! and then runs <&-parser.
  [ <&-parser ] cons cons ;

: &>-parser ( input parser1 parser2 -- result )
  #! Same as <&> except discard the results of the first parser.
  #! The combined <&> result is a cons of both results;
  #! ptail keeps only the second.
  <&> [ ptail ] <@ call ;

: &> ( parser1 parser2 -- parser )
  #! Same as <&> except discard the results of the first parser.
  #! The parser is a quotation that pushes both parsers
  #! and then runs &>-parser.
  [ &>-parser ] cons cons ;

: (a,(b,c))>((a,b,c)) ( list -- list )
  #! Convert a list where the car is a single value
  #! and the cdr is a list to a list containing a flattened
  #! list.
  #! The single value is consed onto the first element of
  #! the cdr, and the new list is wrapped with unit.
  uncons car cons unit ;

: <:&>-parser ( input parser1 parser2 -- result )
  #! Same as <&> except postprocess the result with
  #! (a,(b,c))>((a,b,c)).
  #! The two parsers are combined with <&>, the
  #! postprocessing is attached with <@, and the resulting
  #! parser is called on the input.
  <&> [ (a,(b,c))>((a,b,c)) ] <@ call ;

: <:&> ( parser1 parser2 -- parser )
  #! Same as <&> except postprocess the result with
  #! (a,(b,c))>((a,b,c)).
  #! The parser is a quotation that pushes both parsers
  #! and then runs <:&>-parser.
  [ <:&>-parser ] cons cons ;

DEFER: <*>

: (<*>) ( parser -- parser )
  #! Non-delayed implementation of <*>
  #! Builds: the parser followed (via <:&>) by <*> of the
  #! same parser, or else a parser that succeeds with an
  #! empty list without consuming input.
  dup <*> <:&> [ ] succeed <|> ;

: <*> ( parser -- parser )
  #! Return a parser that accepts zero or more occurrences of the original
  #! parser.
  #! The returned quotation only builds the (<*>) parser
  #! when it is called. (<*>) itself refers back to <*>, so
  #! the construction is deferred until parse time.
  [  (<*>) call ] cons ;

: (<+>) ( parser -- parser )
  #! Non-delayed implementation of <+>
  #! Builds: the parser followed (via <:&>) by <*> of the
  #! same parser, i.e. one occurrence and then zero or more.
  dup <*> <:&> ;

: <+> ( parser -- parser )
  #! Return a parser that accepts one or more occurrences of the original
  #! parser.
  #! The returned quotation builds the (<+>) parser only
  #! when it is called.
  [  (<+>) call ] cons ;

: (<?>) ( parser -- parser )
  #! Non-delayed implementation of <?>
  #! Builds: the parser with its result wrapped in a
  #! one-element list (via unit), or else a parser that
  #! succeeds with an empty list without consuming input.
  [ unit ] <@ [ ] succeed <|> ;

: <?> ( parser -- parser )
  #! Return a parser that optionally uses the parser
  #! if that parser would be successful.
  #! The returned quotation builds the (<?>) parser only
  #! when it is called.
  [  (<?>) call ] cons ;

USE: prettyprint
USE: parser
USE: unparser
USE: stdio

! Testing <&>
: test1 ( -- )
  #! Run "a" followed by "b" over "abcd" and print every
  #! parse result.
  "abcd"
  "a" token "b" token <&>
  call [ . ] leach ;

: test1a ( -- )
  #! Run ("a" then "b") then "c" over "abcd" and print
  #! every parse result.
  "abcd"
  "a" token "b" token <&> "c" token <&>
  call [ . ] leach ;

: test1b ( -- )
  #! Run "a" then ("b" then "c") over "abcd" and print
  #! every parse result.
  "abcd"
  "a" token "b" token "c" token <&> <&>
  call [ . ] leach ;

: test2 ( -- )
  #! Run "a" followed by "b" over "decd", where neither
  #! token is at the expected position.
  "decd"
  "a" token "b" token <&>
  call [ . ] leach ;

: test3 ( -- )
  #! Run "a" followed by "b" over "dbcd", where the first
  #! token is missing.
  "dbcd"
  "a" token "b" token <&>
  call [ . ] leach ;

: test4 ( -- )
  #! Run "a" followed by "b" over "adcd", where the second
  #! token is missing.
  "adcd"
  "a" token "b" token <&>
  call [ . ] leach ;

! Testing <|>
: test5 ( -- )
  #! Run the choice of "a" or "b" over "abcd" and print
  #! every parse result.
  "abcd"
  "a" token "b" token <|>
  call [ . ] leach ;

: test6 ( -- )
  #! Run the choice of "a" or "b" over "bbcd" and print
  #! every parse result.
  "bbcd"
  "a" token "b" token <|>
  call [ . ] leach ;

: test7 ( -- )
  #! Run the choice of "a" or "b" over "cbcd", which starts
  #! with neither token.
  "cbcd"
  "a" token "b" token <|>
  call [ . ] leach ;

! Testing sp
: test8 ( -- )
  #! Run the plain "a" token over input that starts with
  #! two spaces.
  "  abcd"
  "a" token
  call [ . ] leach ;

: test9 ( -- )
  #! Run the same token wrapped in sp, so the leading
  #! spaces are skipped before matching.
  "  abcd"
  "a" token sp
  call [ . ] leach ;

! Testing just
: test10 ( -- )
  #! Run the choice of "abcd" or "abc" over "abcd" and
  #! print every parse result.
  "abcd"
  "abcd" token "abc" token <|>
  call [ . ] leach ;

: test11 ( -- )
  #! Run the same choice wrapped in just, which keeps only
  #! the results with no remaining input.
  "abcd"
  "abcd" token "abc" token <|> just
  call [ . ] leach ;

! Testing <@
: test12 ( -- )
  #! Run a digit? satisfy parser over "01234" and print
  #! every parse result.
  "01234"
  [ digit? ] satisfy
  call [ . ] leach ;

: test13 ( -- )
  #! Run the same parser with digit> applied to each
  #! result through <@.
  "01234"
  [ digit? ] satisfy [ digit> ] <@
  call [ . ] leach ;

! Testing some
: test14 ( -- )
  #! Run the "begin" token over "begin1" and print every
  #! parse result.
  "begin1"
  "begin" token
  call [ . ] leach ;

: test15 ( -- )
  #! Run the "begin" token wrapped in some over "begin1".
  #! The input is not fully consumed, and the printed
  #! message announces that an error is expected.
  "This should fail with an error" print
  "begin1"
  "begin" token some
  call . ;

: test16 ( -- )
  #! Run the "begin" token wrapped in some over "begin",
  #! which it consumes completely, and print the result.
  "begin"
  "begin" token some
  call . ;

! parens test function
: parens ( -- parser )
  #! Parser for nested parentheses: an opening "(", a
  #! nested parens, a closing ")" and a further parens, or
  #! else nothing at all (epsilon). The grammar is built
  #! inside the quotation, so the recursive references to
  #! parens are only expanded when the parser is called.
  [
    "(" token parens <&>
    ")" token <&>
    parens <&>
    epsilon <|>
    call
  ] ;

: test17 ( -- )
  #! Run parens over the empty string and print every
  #! parse result.
  ""
  parens
  call [ . ] leach ;

: test18 ( -- )
  #! Run parens over a single pair of parentheses.
  "()"
  parens
  call [ . ] leach ;

: test19 ( -- )
  #! Run parens over three nested pairs of parentheses.
  "((()))"
  parens
  call [ . ] leach ;

! <& parser and &> parser
: test20 ( -- )
  #! Run "a" followed by "b" over "abcd" using <&>, which
  #! keeps both results.
  "abcd"
  "a" token "b" token <&>
  call [ . ] leach ;

: test21 ( -- )
  #! Same input using <&, which discards the result of the
  #! second parser.
  "abcd"
  "a" token "b" token <&
  call [ . ] leach ;

: test22 ( -- )
  #! Same input using &>, which discards the result of the
  #! first parser.
  "abcd"
  "a" token "b" token &>
  call [ . ] leach ;

! nesting example
: parens-open ( -- parser )
  #! Token parser for an opening parenthesis.
  "(" token ;

: parens-close ( -- parser )
  #! Token parser for a closing parenthesis.
  ")" token ;
: nesting
  #! Return a parser whose result is the maximum nesting
  #! depth of a run of parentheses.
  #! The first alternative parses an opening parenthesis, a
  #! nested 'nesting', a closing parenthesis and a further
  #! 'nesting'. &> and <& discard the parentheses, so the
  #! <&> result is a cons of the inner depth (car) and the
  #! depth of what follows (cdr); the quotation keeps the
  #! larger of car + 1 and cdr.
  #! The second alternative consumes nothing and yields 0.
  #! The grammar is built inside the quotation, so the
  #! recursive references are only expanded when the parser
  #! is called.
  [ parens-open
    nesting &>
    parens-close <&
    nesting <&>
    [ unswons 1 + max ] <@
    0 succeed <|>
    call ] ;

: test23 ( -- )
  #! Run nesting, restricted to complete parses by just,
  #! over the empty string.
  ""
  nesting just
  call [ . ] leach ;

: test24 ( -- )
  #! Run nesting over a single pair of parentheses.
  "()"
  nesting just
  call [ . ] leach ;

: test25 ( -- )
  #! Run nesting over two nested pairs of parentheses.
  "(())"
  nesting just
  call [ . ] leach ;

: test26 ( -- )
  #! Run nesting over a longer mix of adjacent and nested
  #! parentheses.
  "()(()(()()))()"
  nesting just
  call [ . ] leach ;

! Testing <*> and <:&>
: test27 ( -- )
  #! Run zero or more "1" tokens over "1234" and print
  #! every parse result.
  "1234"
  "1" token <*>
  call [ . ] leach ;

: test28 ( -- )
  #! Run zero or more "1" tokens over "1111234" and print
  #! every parse result.
  "1111234"
  "1" token <*>
  call [ . ] leach ;

: test28a ( -- )
  #! As test28, but each result is passed through
  #! 'car concat unit' before being printed.
  "1111234"
  "1" token <*> [ car concat unit ] <@
  call [ . ] leach ;

: test29 ( -- )
  #! Run zero or more "1" tokens over "234", which does
  #! not start with "1".
  "234"
  "1" token <*>
  call [ . ] leach ;
: pdigit ( -- parser )
  #! Parser for a single character accepted by digit?,
  #! with digit> applied to the matched character.
  [ digit? ] satisfy
  [ digit> ] <@ ;

: pnatural ( -- parser )
  #! Parser for zero or more pdigit matches.
  pdigit <*> ;
: pnatural2 ( -- parser )
  #! pnatural with its result converted to a number. The
  #! first element of the result is mapped through >digit
  #! and turned into a string; an empty string gives 0,
  #! anything else is read with str>number. The number is
  #! wrapped in a one-element list.
  pnatural [
    car [ >digit ] map >string
    dup pempty? [ drop 0 ] [ str>number ] ifte
    unit
  ] <@ ;
: test30 ( -- )
  #! Run pnatural2 over "12345" and print every parse
  #! result.
  "12345"
  pnatural2
  call [ . ] leach ;

! Testing <+>
: test31 ( -- )
  #! Run one or more "1" tokens over "1234" and print
  #! every parse result.
  "1234"
  "1" token <+>
  call [ . ] leach ;

: test32 ( -- )
  #! Run one or more "1" tokens over "1111234" and print
  #! every parse result.
  "1111234"
  "1" token <+>
  call [ . ] leach ;

: test33 ( -- )
  #! Run one or more "1" tokens over "234", which does not
  #! start with "1".
  "234"
  "1" token <+>
  call [ . ] leach ;

! Testing <?>
: test34 ( -- )
  #! Run "a", an optional digit, then "b" over "ab", where
  #! the optional digit is absent.
  "ab"
  "a" token pdigit <?> <&> "b" token <&>
  call [ . ] leach ;

: test35 ( -- )
  #! Run the same parser over "ac", where the final "b" is
  #! missing.
  "ac"
  "a" token pdigit <?> <&> "b" token <&>
  call [ . ] leach ;

: test36 ( -- )
  #! Run the same parser over "a5b", where the optional
  #! digit is present.
  "a5b"
  "a" token pdigit <?> <&> "b" token <&>
  call [ . ] leach ;
: pinteger ( -- parser )
  #! Parser for an optionally negative integer: an optional
  #! "-" token followed by pnatural2.
  #! The <&> result is a cons of the optional sign result
  #! (a one-element list when "-" was matched, otherwise an
  #! empty list) and the one-element list from pnatural2.
  #! When a sign is present the number is negated and
  #! wrapped in a list again, so the result is a
  #! one-element list for both positive and negative input.
  "-" token <?> pnatural2 <&> [
    uncons swap [ car -1 * unit ] when
  ] <@ ;
: test37 ( -- )
  #! Run pinteger over "123" and print every parse result.
  "123"
  pinteger
  call [ . ] leach ;

: test38 ( -- )
  #! Run pinteger over "-123" and print every parse result.
  "-123"
  pinteger
  call [ . ] leach ;