Merge branch 'master' of git://factorcode.org/git/factor

db4
Doug Coleman 2009-03-17 10:14:32 -05:00
commit 655e8ddbac
13 changed files with 211 additions and 59 deletions

View File

@ -37,8 +37,7 @@ C: <with-options> with-options
TUPLE: options on off ; TUPLE: options on off ;
C: <options> options C: <options> options
SINGLETONS: unix-lines dotall multiline comments case-insensitive SINGLETONS: unix-lines dotall multiline case-insensitive reversed-regexp ;
unicode-case reversed-regexp ;
: <maybe> ( term -- term' ) : <maybe> ( term -- term' )
f <concatenation> 2array <alternation> ; f <concatenation> 2array <alternation> ;

View File

@ -12,7 +12,7 @@ ascii-class punctuation-class java-printable-class blank-class
control-character-class hex-digit-class java-blank-class c-identifier-class control-character-class hex-digit-class java-blank-class c-identifier-class
unmatchable-class terminator-class word-boundary-class ; unmatchable-class terminator-class word-boundary-class ;
SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file word-break ; SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file ^unix $unix word-break ;
TUPLE: range from to ; TUPLE: range from to ;
C: <range> range C: <range> range

View File

@ -17,9 +17,6 @@ SYMBOL: backwards?
M: t question>quot drop [ 2drop t ] ; M: t question>quot drop [ 2drop t ] ;
M: f question>quot drop [ 2drop f ] ; M: f question>quot drop [ 2drop f ] ;
M: not-class question>quot
class>> question>quot [ not ] compose ;
M: beginning-of-input question>quot M: beginning-of-input question>quot
drop [ drop zero? ] ; drop [ drop zero? ] ;
@ -40,6 +37,12 @@ M: $ question>quot
M: ^ question>quot M: ^ question>quot
drop [ { [ drop zero? ] [ [ 1- ] dip ?nth "\r\n" member? ] } 2|| ] ; drop [ { [ drop zero? ] [ [ 1- ] dip ?nth "\r\n" member? ] } 2|| ] ;
! question>quot method for the $unix tag (end of line, unix-lines flavour).
! Discards the tag and returns a quotation taking two inputs; from the use of
! `length =` and `?nth` these appear to be ( index sequence ) -- TODO confirm
! against the caller. The quotation is true when the index equals the
! sequence length, or when the element at the index is a newline character.
! Only the newline character is compared here, not carriage return.
M: $unix question>quot
drop [ { [ length = ] [ ?nth CHAR: \n = ] } 2|| ] ;
! question>quot method for the ^unix tag (beginning of line, unix-lines
! flavour). Discards the tag and returns a quotation taking two inputs,
! apparently ( index sequence ) -- TODO confirm against the caller.
! The quotation is true when the index is zero, or when the element just
! before the index (index 1-) is a newline character. Only the newline
! character is compared here, not carriage return.
M: ^unix question>quot
drop [ { [ drop zero? ] [ [ 1- ] dip ?nth CHAR: \n = ] } 2|| ] ;
M: word-break question>quot M: word-break question>quot
drop [ word-break-at? ] ; drop [ word-break-at? ] ;

View File

@ -1,7 +1,8 @@
! Copyright (C) 2009 Daniel Ehrenberg. ! Copyright (C) 2009 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: kernel accessors regexp.classes math.bits assocs sequences USING: kernel accessors regexp.classes math.bits assocs sequences
arrays sets regexp.dfa math fry regexp.minimize regexp.ast regexp.transition-tables ; arrays sets regexp.dfa math fry regexp.minimize regexp.ast
locals regexp.transition-tables ;
IN: regexp.disambiguate IN: regexp.disambiguate
TUPLE: parts in out ; TUPLE: parts in out ;
@ -9,7 +10,7 @@ TUPLE: parts in out ;
: make-partition ( choices classes -- partition ) : make-partition ( choices classes -- partition )
zip [ first ] partition [ values ] bi@ parts boa ; zip [ first ] partition [ values ] bi@ parts boa ;
: powerset-partition ( classes -- partitions ) : powerset-partition ( sequence -- partitions )
[ length [ 2^ ] keep ] keep '[ [ length [ 2^ ] keep ] keep '[
_ <bits> _ make-partition _ <bits> _ make-partition
] map rest ; ] map rest ;
@ -19,19 +20,49 @@ TUPLE: parts in out ;
[ in>> <and-class> ] bi [ in>> <and-class> ] bi
prefix <and-class> ; prefix <and-class> ;
: get-transitions ( partition state-transitions -- next-states ) : singleton-partition ( integer non-integers -- {class,partition} )
[ in>> ] dip '[ _ at ] gather sift ; dupd
'[ _ [ class-member? ] with filter ] keep
prefix f parts boa
2array ;
! Builds a new parts tuple from partition: the in slot is carried over
! unchanged and the out slot becomes seq followed by the old out slot.
: add-out ( seq partition -- partition' )
    [ in>> ] [ out>> ] bi   ! seq in out
    swapd append            ! in seq+out
    parts boa ;
! Folds intersect over a sequence of sequences, starting from the first
! element. An empty input yields f.
: intersection ( seq -- elts )
    dup empty?
    [ drop f ]
    [ unclip [ intersect ] reduce ]
    if ;
! Given a partition (a parts tuple with in and out slots) and a table,
! looks up every element of the in list and every element of the out list
! in the table, takes the intersection of the looked-up values on each side,
! and outputs the in-side intersection with the out-side intersection
! removed (diff).
: meaningful-integers ( partition table -- integers )
! Stack after the dip: in out table
[ [ in>> ] [ out>> ] bi ] dip
! The same fried quotation is applied to both the in and the out list.
'[ [ _ at ] map intersection ] bi@ diff ;
! Builds a hashtable keyed by each element of classes. The value for a class
! is the subsequence of integers for which `class-member?` is true when
! called with that integer and that class.
: class-integers ( classes integers -- table )
'[ _ over '[ _ class-member? ] filter ] H{ } map>assoc ;
! For every partition, computes its meaningful-integers against the table
! produced by class-integers and prepends them to the partition's out slot
! via add-out. Outputs the sequence of updated partitions.
: add-integers ( partitions classes integers -- partitions )
! The class -> integers table is built once and captured in the quotation.
class-integers '[
[ _ meaningful-integers ] keep add-out
] map ;
! Splits the input into integers and non-integers and outputs a sequence of
! { class partition } pairs built from two sources, appended in this order:
!   1. every powerset-partition of the non-integers, extended with integers
!      by add-integers and paired with its partition>class; pairs whose first
!      element is f are dropped;
!   2. one singleton-partition pair per integer, computed against the
!      non-integers.
: class-partitions ( classes -- assoc )
! Stack after partition: integers non-integers
[ integer? ] partition [
! spin reorders to: partitions non-integers integers
dup powerset-partition spin add-integers
[ [ partition>class ] keep 2array ] map
! Keep only pairs whose class (first element) is not f.
[ first ] filter
] [ '[ _ singleton-partition ] map ] 2bi append ;
: new-transitions ( transitions -- assoc ) ! assoc is class, partition : new-transitions ( transitions -- assoc ) ! assoc is class, partition
values [ keys ] gather values [ keys ] gather
[ tagged-epsilon? not ] filter [ tagged-epsilon? not ] filter
powerset-partition class-partitions ;
[ [ partition>class ] keep ] { } map>assoc
[ drop ] assoc-filter ; : get-transitions ( partition state-transitions -- next-states )
[ in>> ] dip '[ _ at ] gather sift ;
: preserving-epsilon ( state-transitions quot -- new-state-transitions ) : preserving-epsilon ( state-transitions quot -- new-state-transitions )
[ [ drop tagged-epsilon? ] assoc-filter ] bi [ [ drop tagged-epsilon? ] assoc-filter ] bi
assoc-union H{ } assoc-like ; inline assoc-union H{ } assoc-like ; inline
: disambiguate ( nfa -- nfa ) : disambiguate ( nfa -- nfa )
expand-ors [ expand-ors [
dup new-transitions '[ dup new-transitions '[

View File

@ -60,11 +60,16 @@ GENERIC: modify-epsilon ( tag -- newtag )
M: object modify-epsilon ; M: object modify-epsilon ;
! Chooses one of the three stack inputs depending on which regexp options are
! currently on:
!   - multiline option off           -> the `default` input
!   - multiline on, unix-lines off   -> the `multiline` input
!   - multiline on, unix-lines on    -> the `unix-lines` input
! NOTE: the names in the stack effect are only labels. Inside the body,
! `multiline` and `unix-lines` are the option symbols passed to `option?`,
! not the stack inputs of the same name.
: line-option ( multiline unix-lines default -- option )
multiline option? [
! Drop the default, then pick between the two remaining inputs.
drop [ unix-lines option? ] 2dip swap ?
] [ 2nip ] if ;
M: $ modify-epsilon M: $ modify-epsilon
multiline option? [ drop end-of-input ] unless ; $unix end-of-input line-option ;
M: ^ modify-epsilon M: ^ modify-epsilon
multiline option? [ drop beginning-of-input ] unless ; ^unix beginning-of-input line-option ;
M: tagged-epsilon nfa-node M: tagged-epsilon nfa-node
clone [ modify-epsilon ] change-tag add-simple-entry ; clone [ modify-epsilon ] change-tag add-simple-entry ;

View File

@ -2,7 +2,7 @@
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: peg.ebnf kernel math.parser sequences assocs arrays fry math USING: peg.ebnf kernel math.parser sequences assocs arrays fry math
combinators regexp.classes strings splitting peg locals accessors combinators regexp.classes strings splitting peg locals accessors
regexp.ast ; regexp.ast unicode.case ;
IN: regexp.parser IN: regexp.parser
: allowed-char? ( ch -- ? ) : allowed-char? ( ch -- ? )
@ -19,20 +19,19 @@ ERROR: bad-number ;
ERROR: bad-class name ; ERROR: bad-class name ;
: name>class ( name -- class ) : name>class ( name -- class )
{ >string >case-fold {
{ "Lower" letter-class } { "lower" letter-class }
{ "Upper" LETTER-class } { "upper" LETTER-class }
{ "Alpha" Letter-class } { "alpha" Letter-class }
{ "ASCII" ascii-class } { "ascii" ascii-class }
{ "Digit" digit-class } { "digit" digit-class }
{ "Alnum" alpha-class } { "alnum" alpha-class }
{ "Punct" punctuation-class } { "punct" punctuation-class }
{ "Graph" java-printable-class } { "graph" java-printable-class }
{ "Print" java-printable-class } { "blank" non-newline-blank-class }
{ "Blank" non-newline-blank-class } { "cntrl" control-character-class }
{ "Cntrl" control-character-class } { "xdigit" hex-digit-class }
{ "XDigit" hex-digit-class } { "space" java-blank-class }
{ "Space" java-blank-class }
! TODO: unicode-character-class ! TODO: unicode-character-class
} [ bad-class ] at-error ; } [ bad-class ] at-error ;
@ -66,11 +65,8 @@ ERROR: bad-class name ;
{ CHAR: i case-insensitive } { CHAR: i case-insensitive }
{ CHAR: d unix-lines } { CHAR: d unix-lines }
{ CHAR: m multiline } { CHAR: m multiline }
{ CHAR: n multiline }
{ CHAR: r reversed-regexp } { CHAR: r reversed-regexp }
{ CHAR: s dotall } { CHAR: s dotall }
{ CHAR: u unicode-case }
{ CHAR: x comments }
} ; } ;
: ch>option ( ch -- singleton ) : ch>option ( ch -- singleton )
@ -101,8 +97,8 @@ CharacterInBracket = !("}") Character
QuotedCharacter = !("\\E") . QuotedCharacter = !("\\E") .
Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-class> ]] Escape = "p{" CharacterInBracket*:s "}" => [[ s name>class <primitive-class> ]]
| "P{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-class> <negation> ]] | "P{" CharacterInBracket*:s "}" => [[ s name>class <primitive-class> <negation> ]]
| "Q" QuotedCharacter*:s "\\E" => [[ s <concatenation> ]] | "Q" QuotedCharacter*:s "\\E" => [[ s <concatenation> ]]
| "u" Character:a Character:b Character:c Character:d | "u" Character:a Character:b Character:c Character:d
=> [[ { a b c d } hex> ensure-number ]] => [[ { a b c d } hex> ensure-number ]]

View File

@ -1,6 +1,7 @@
! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg. ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: kernel strings help.markup help.syntax math regexp.parser regexp.ast ; USING: kernel strings help.markup help.syntax math regexp.parser
regexp.ast multiline ;
IN: regexp IN: regexp
ABOUT: "regexp" ABOUT: "regexp"
@ -21,8 +22,17 @@ ARTICLE: "regexp" "Regular expressions"
{ $subsection { "regexp" "deploy" } } ; { $subsection { "regexp" "deploy" } } ;
ARTICLE: { "regexp" "intro" } "A quick introduction to regular expressions" ARTICLE: { "regexp" "intro" } "A quick introduction to regular expressions"
"Regular expressions are a terse way to do certain simple string processing tasks. For example, to replace all instances of " { $snippet "foo" } " in one string with " { $snippet "bar" } ", the following can be used:"
; { $code "R/ foo/ \"bar\" re-replace" }
"That could be done with sequence operations, but consider doing this replacement for an arbitrary number of o's, at least two:"
{ $code "R/ foo+/ \"bar\" re-replace" }
"The " { $snippet "+" } " operator matches one or more occurrences of the previous expression; in this case " { $snippet "o" } ". Another useful feature is alternation. Say we want to do this replacement with fooooo or boooo. Then we could use the code"
{ $code "R/ (f|b)oo+/ \"bar\" re-replace" }
"To search a file for all lines that match a given regular expression, you could use code like this:"
{ $code <" "file.txt" ascii file-lines [ R/ (f|b)oo+/ re-contains? ] filter "> }
"To test if a string in its entirety matches a regular expression, the following can be used:"
{ $example <" USING: regexp prettyprint ; "fooo" R/ (b|f)oo+/ matches? . "> "t" }
"Regular expressions can't be used for all parsing tasks. For example, they are not powerful enough to match balancing parentheses." ;
ARTICLE: { "regexp" "construction" } "Constructing regular expressions" ARTICLE: { "regexp" "construction" } "Constructing regular expressions"
"Most of the time, regular expressions are literals and the parsing word should be used, to construct them at parse time. This ensures that they are only compiled once, and gives parse time syntax checking." "Most of the time, regular expressions are literals and the parsing word should be used, to construct them at parse time. This ensures that they are only compiled once, and gives parse time syntax checking."
@ -33,20 +43,71 @@ ARTICLE: { "regexp" "construction" } "Constructing regular expressions"
"Another approach is to use " { $vocab-link "regexp.combinators" } "." ; "Another approach is to use " { $vocab-link "regexp.combinators" } "." ;
ARTICLE: { "regexp" "syntax" } "Regular expression syntax" ARTICLE: { "regexp" "syntax" } "Regular expression syntax"
"Regexp syntax is largely compatible with Perl, Java and extended POSIX regexps, but not completely. A new addition is the inclusion of a negation operator, with the syntax " { $snippet "(?~foo)" } " to match everything that does not match " { $snippet "foo" } "." "Regexp syntax is largely compatible with Perl, Java and extended POSIX regexps, but not completely. Below, the syntax is documented."
{ $heading "Characters" } { $heading "Characters" }
"At their core, regular expressions consist of character literals. For example, " { $snippet "R/ f/" } " is a regular expression matching just the string 'f'. In addition, the normal escape codes are provided, like " { $snippet "\\t" } " for the tab character and " { $snippet "\\uxxxxxx" } " for an arbitrary Unicode code point, by its hex value. In addition, any character can be preceded by a backslash to escape it, unless this has special meaning. For example, to match a literal opening parenthesis, use " { $snippet "\\(" } "."
{ $heading "Concatenation, alternation and grouping" }
"Regular expressions can be built out of multiple characters by concatenation. For example, " { $snippet "R/ ab/" } " matches a followed by b. The " { $snippet "|" } " (alternation) operator can construct a regexp which matches one of two alternatives. Parentheses can be used for grouping. So " { $snippet "R/ f(oo|ar)/" } " would match either 'foo' or 'far'."
{ $heading "Character classes" } { $heading "Character classes" }
"Square brackets define a convenient way to refer to a set of characters. For example, " { $snippet "[ab]" } " refers to either a or b. And " { $snippet "[a-z]" } " refers to all of the characters between a and z, in code point order. You can use these together, as in " { $snippet "[ac-fz]" } " which matches all of the characters between c and f, in addition to a and z. Character classes can be negated using a caret, as in " { $snippet "[^a]" } " which matches all characters which are not a."
{ $heading "Predefined character classes" } { $heading "Predefined character classes" }
"Several character classes are predefined, both for convenience and because they are too large to represent directly. In Factor regular expressions, all character classes are Unicode-aware."
{ $table
{ { $snippet "\\d" } "Digits" }
{ { $snippet "\\D" } "Not digits" }
{ { $snippet "\\s" } "Whitespace" }
{ { $snippet "\\S" } "Not whitespace" }
{ { $snippet "\\w" } "Word character (alphanumeric or underscore)" }
{ { $snippet "\\W" } "Not word character" }
{ { $snippet "\\p{property}" } "Character which fulfils the property" }
{ { $snippet "\\P{property}" } "Character which does not fulfil the property" } }
"Properties for " { $snippet "\\p" } " and " { $snippet "\\P" } " (case-insensitive):"
{ $table
{ { $snippet "\\p{lower}" } "Lower case letters" }
{ { $snippet "\\p{upper}" } "Upper case letters" }
{ { $snippet "\\p{alpha}" } "Letters" }
{ { $snippet "\\p{ascii}" } "Characters in the ASCII range" }
{ { $snippet "\\p{alnum}" } "Letters or numbers" }
{ { $snippet "\\p{punct}" } "Punctuation" }
{ { $snippet "\\p{blank}" } "Non-newline whitespace" }
{ { $snippet "\\p{cntrl}" } "Control character" }
{ { $snippet "\\p{space}" } "Whitespace" }
{ { $snippet "\\p{xdigit}" } "Hexadecimal digit" } } ! In the future: Unicode
"Full unicode properties are not yet supported."
{ $heading "Boundaries" } { $heading "Boundaries" }
"Special operators exist to match certain points in the string. These are called 'zero-width' because they do not consume any characters."
{ $table
{ { $snippet "^" } "Beginning of a line" }
{ { $snippet "$" } "End of a line" }
{ { $snippet "\\A" } "Beginning of text" }
{ { $snippet "\\z" } "End of text" }
{ { $snippet "\\Z" } "Almost end of text: only thing after is newline" }
{ { $snippet "\\b" } "Word boundary (by Unicode word boundaries)" }
{ { $snippet "\\B" } "Not word boundary (by Unicode word boundaries)" } }
{ $heading "Greedy quantifiers" } { $heading "Greedy quantifiers" }
{ $heading "Reluctant quantifiers" } "It is possible to have a regular expression which matches a variable number of occurrences of another regular expression."
{ $heading "Posessive quantifiers" } { $table
{ $heading "Logical operations" } { { $snippet "a*" } "Zero or more occurrences of a" }
{ { $snippet "a+" } "One or more occurrences of a" }
{ { $snippet "a?" } "Zero or one occurrences of a" }
{ { $snippet "a{n}" } "n occurrences of a" }
{ { $snippet "a{n,}" } "At least n occurrences of a" }
{ { $snippet "a{,m}" } "At most m occurrences of a" }
{ { $snippet "a{n,m}" } "Between n and m occurrences of a" } }
"All of these quantifiers are " { $emphasis "greedy" } ", meaning that they take as many repetitions as possible within the larger regular expression. Reluctant and possessive quantifiers are not yet supported."
{ $heading "Lookaround" } { $heading "Lookaround" }
"Operators are provided to look ahead and behind the current point in the regular expression. These can be used in any context, but they're the most useful at the beginning or end of a regular expression."
{ $table
{ { $snippet "(?=a)" } "Asserts that the current position is immediately followed by a" }
{ { $snippet "(?!a)" } "Asserts that the current position is not immediately followed by a" }
{ { $snippet "(?<=a)" } "Asserts that the current position is immediately preceded by a" }
{ { $snippet "(?<!a)" } "Asserts that the current position is not immediately preceded by a" } }
{ $heading "Quotation" }
"To make it convenient to have a long string which uses regexp operators, a special syntax is provided. If a substring begins with " { $snippet "\\Q" } " then everything until " { $snippet "\\E" } " is quoted (escaped). For example, " { $snippet "R/ \\Qfoo\\bar|baz()\\E/" } " matches exactly the string " { $snippet "\"foo\\bar|baz()\"" } "."
{ $heading "Unsupported features" } { $heading "Unsupported features" }
"One missing feature is backreferences. This is because of a design decision to allow only regular expressions following the formal theory of regular languages. For more information, see " { $link { "regexp" "theory" } } ". You can create a new regular expression to match a particular string using " { $vocab-link "regexp.combinators" } " and group capture is available to extract parts of a regular expression match." $nl "One missing feature is backreferences. This is because of a design decision to allow only regular expressions following the formal theory of regular languages. For more information, see " { $link { "regexp" "theory" } } ". You can create a new regular expression to match a particular string using " { $vocab-link "regexp.combinators" } " and group capture is available to extract parts of a regular expression match." $nl
"Another missing feature is Perl's " { $snippet "\\G" } " syntax, which references the previous match. This is because that sequence is inherently stateful, and Factor regexps don't hold state." $nl "Another missing feature is Perl's " { $snippet "\\G" } " syntax, which references the previous match. This is because that sequence is inherently stateful, and Factor regexps don't hold state." $nl
"Additionally, none of the operations which embed code into a regexp are supported, as this would require the inclusion of the Factor parser and compiler in any application which wants to expose regexps to the user. None of the casing operations are included, for simplicity." ; ! Also describe syntax, from the beginning "None of the operations which embed code into a regexp are supported, as this would require the inclusion of the Factor parser and compiler in any application which wants to expose regexps to the user. None of Perl's casing operations, like \\L, are included, for simplicity." ; ! Also describe syntax, from the beginning
ARTICLE: { "regexp" "options" } "Regular expression options" ARTICLE: { "regexp" "options" } "Regular expression options"
"When " { $link { "regexp" "construction" } } ", various options can be provided. Options have single-character names. A string of options has one of the following two forms:" "When " { $link { "regexp" "construction" } } ", various options can be provided. Options have single-character names. A string of options has one of the following two forms:"
@ -58,13 +119,30 @@ $nl
{ "i" { $link case-insensitive } } { "i" { $link case-insensitive } }
{ "d" { $link unix-lines } } { "d" { $link unix-lines } }
{ "m" { $link multiline } } { "m" { $link multiline } }
{ "n" { $link multiline } }
{ "r" { $link reversed-regexp } }
{ "s" { $link dotall } } { "s" { $link dotall } }
{ "u" { $link unicode-case } } { "r" { $link reversed-regexp } }
{ "x" { $link comments } }
} ; } ;
HELP: case-insensitive
{ $syntax "R/ .../i" }
{ $description "On regexps, the " { $snippet "i" } " option makes the match case-insensitive. Currently, this is handled incorrectly with respect to Unicode, as characters like ß do not expand into SS in upper case. This should be fixed in a future version." } ;
HELP: unix-lines
{ $syntax "R/ .../d" }
{ $description "With this mode, only newlines (" { $snippet "\\n" } ") are recognized for line breaking. This affects " { $snippet "$" } " and " { $snippet "^" } " when in multiline mode." } ;
HELP: multiline
{ $syntax "R/ .../m" }
{ $description "This mode makes the zero-width constraints " { $snippet "$" } " and " { $snippet "^" } " match the beginning or end of a line. Otherwise, they only match the beginning or end of the input text. This can be used together with " { $link dotall } "." } ;
HELP: dotall
{ $syntax "R/ .../s" }
{ $description "This mode, traditionally called single line mode, makes " { $snippet "." } " match everything, including line breaks. By default, it does not match line breaking characters. This can be used together with " { $link multiline } "." } ;
HELP: reversed-regexp
{ $syntax "R/ .../r" }
{ $description "When running a regexp compiled with this mode, matches will start from the end of the input string, going towards the beginning." } ;
ARTICLE: { "regexp" "theory" } "The theory of regular expressions" ARTICLE: { "regexp" "theory" } "The theory of regular expressions"
"Far from being just a practical tool invented by Unix hackers, regular expressions were studied formally before computer programs were written to process them." $nl "Far from being just a practical tool invented by Unix hackers, regular expressions were studied formally before computer programs were written to process them." $nl
"A regular language is a set of strings that is matched by a regular expression, which is defined to have characters and the empty string, along with the operations concatenation, disjunction and Kleene star. Another way to define the class of regular languages is as the class of languages which can be recognized with constant space overhead, ie with a DFA. These two definitions are provably equivalent." $nl "A regular language is a set of strings that is matched by a regular expression, which is defined to have characters and the empty string, along with the operations concatenation, disjunction and Kleene star. Another way to define the class of regular languages is as the class of languages which can be recognized with constant space overhead, ie with a DFA. These two definitions are provably equivalent." $nl

View File

@ -470,3 +470,13 @@ IN: regexp-tests
[ t ] [ "abcdefg" "a(?:bcdefg)" <regexp> matches? ] unit-test [ t ] [ "abcdefg" "a(?:bcdefg)" <regexp> matches? ] unit-test
[ 3 ] [ "caba" "(?<=b)a" <regexp> first-match from>> ] unit-test [ 3 ] [ "caba" "(?<=b)a" <regexp> first-match from>> ] unit-test
[ t ] [ "\ra" R/ .^a/ms matches? ] unit-test
[ f ] [ "\ra" R/ .^a/mds matches? ] unit-test
[ t ] [ "\na" R/ .^a/ms matches? ] unit-test
[ t ] [ "\na" R/ .^a/mds matches? ] unit-test
[ t ] [ "a\r" R/ a$./ms matches? ] unit-test
[ f ] [ "a\r" R/ a$./mds matches? ] unit-test
[ t ] [ "a\n" R/ a$./ms matches? ] unit-test
[ t ] [ "a\n" R/ a$./mds matches? ] unit-test

View File

@ -5,7 +5,7 @@ io.files hashtables quotations splitting grouping arrays io
math.parser hash2 math.order byte-arrays words namespaces words math.parser hash2 math.order byte-arrays words namespaces words
compiler.units parser io.encodings.ascii values interval-maps compiler.units parser io.encodings.ascii values interval-maps
ascii sets combinators locals math.ranges sorting make ascii sets combinators locals math.ranges sorting make
strings.parser io.encodings.utf8 ; strings.parser io.encodings.utf8 memoize ;
IN: unicode.data IN: unicode.data
VALUE: simple-lower VALUE: simple-lower
@ -108,6 +108,9 @@ CONSTANT: categories
"Zs" "Zl" "Zp" "Zs" "Zl" "Zp"
"Cc" "Cf" "Cs" "Co" } "Cc" "Cf" "Cs" "Co" }
! Memoized hashtable mapping each entry of `categories` to its index in that
! sequence. <enum> views the sequence as an index -> element assoc, and the
! [ swap ] quotation flips each pair into element -> index.
MEMO: categories-map ( -- hashtable )
categories <enum> [ swap ] H{ } assoc-map-as ;
CONSTANT: num-chars HEX: 2FA1E CONSTANT: num-chars HEX: 2FA1E
! the maximum unicode char in the first 3 planes ! the maximum unicode char in the first 3 planes
@ -124,10 +127,10 @@ CONSTANT: num-chars HEX: 2FA1E
] assoc-each table ; ] assoc-each table ;
:: process-category ( data -- category-listing ) :: process-category ( data -- category-listing )
[let | table [ num-chars <byte-array> ] | num-chars <byte-array> :> table
2 data (process-data) [| char cat | 2 data (process-data) [| char cat |
cat categories index char table ?set-nth cat categories-map at char table ?set-nth
] assoc-each table fill-ranges ] ; ] assoc-each table fill-ranges ;
: process-names ( data -- names-hash ) : process-names ( data -- names-hash )
1 swap (process-data) [ 1 swap (process-data) [

View File

@ -74,3 +74,4 @@ SYMBOL: xml-file
[ "foo" ] [ "<!DOCTYPE foo [<!ENTITY bar 'foo'>]><x>&bar;</x>" string>xml children>string ] unit-test [ "foo" ] [ "<!DOCTYPE foo [<!ENTITY bar 'foo'>]><x>&bar;</x>" string>xml children>string ] unit-test
[ T{ xml-chunk f V{ "hello" } } ] [ "hello" string>xml-chunk ] unit-test [ T{ xml-chunk f V{ "hello" } } ] [ "hello" string>xml-chunk ] unit-test
[ "1.1" ] [ "<?xml version='1.1'?><x/>" string>xml prolog>> version>> ] unit-test [ "1.1" ] [ "<?xml version='1.1'?><x/>" string>xml prolog>> version>> ] unit-test
[ "ß" ] [ "<x>ß</x>" <string-reader> read-xml children>string ] unit-test

View File

@ -1,6 +1,6 @@
! Copyright (C) 2005, 2009 Daniel Ehrenberg ! Copyright (C) 2005, 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: help.markup help.syntax xml.data sequences strings ; USING: help.markup help.syntax xml.data sequences strings multiline ;
IN: xml.traversal IN: xml.traversal
ABOUT: "xml.traversal" ABOUT: "xml.traversal"
@ -8,7 +8,7 @@ ABOUT: "xml.traversal"
ARTICLE: "xml.traversal" "Utilities for traversing XML" ARTICLE: "xml.traversal" "Utilities for traversing XML"
"The " { $vocab-link "xml.traversal" } " vocabulary provides utilities for traversing an XML DOM tree and viewing the contents of a single tag. The following words are defined:" "The " { $vocab-link "xml.traversal" } " vocabulary provides utilities for traversing an XML DOM tree and viewing the contents of a single tag. The following words are defined:"
$nl $nl
"Note: the difference between deep-tag-named and tag-named is that the former searches recursively among all children and children of children of the tag, while the latter only looks at the direct children, and is therefore more efficient." { $subsection { "xml.traversal" "intro" } }
{ $subsection tag-named } { $subsection tag-named }
{ $subsection tags-named } { $subsection tags-named }
{ $subsection deep-tag-named } { $subsection deep-tag-named }
@ -20,6 +20,20 @@ ARTICLE: "xml.traversal" "Utilities for traversing XML"
{ $subsection first-child-tag } { $subsection first-child-tag }
{ $subsection assert-tag } ; { $subsection assert-tag } ;
ARTICLE: { "xml.traversal" "intro" } "An example of XML processing"
"To illustrate how to use the XML library, we develop a simple Atom parser in Factor. Atom is an XML-based syndication format, like RSS. To see the full version of what we develop here, look at " { $snippet "basis/syndication" } " at the " { $snippet "atom1.0" } " word. First, we want to load a file and get a DOM tree for it."
{ $code <" "file.xml" file>xml "> }
"No encoding descriptor is needed, because XML files contain sufficient information to auto-detect the encoding. Next, we want to extract information from the tree. To get the title, we can use the following:"
{ $code <" "title" tag-named children>string "> }
"The " { $link tag-named } " word finds the first tag named " { $snippet "title" } " in the top level (just under the main tag). Then, with a tag on the stack, its children are asserted to be a string, and the string is returned." $nl
"For a slightly more complicated example, we can look at how entries are parsed. To get a sequence of tags with the name " { $snippet "entry" } ":"
{ $code <" "entry" tags-named "> }
"Imagine that, for each of these, we want to get the URL of the entry. In Atom, the URLs are in a " { $snippet "link" } " tag which is contained in the " { $snippet "entry" } " tag. There are multiple " { $snippet "link" } " tags, but one of them contains the attribute " { $snippet "rel=alternate" } ", and the " { $snippet "href" } " attribute has the URL. So, given an element of the sequence produced in the above quotation, we run the code:"
{ $code <" "link" tags-named [ "rel" attr "alternate" = ] find nip "> }
"to get the link tag on the stack, and"
{ $code <" "href" attr >url "> }
"to extract the URL from it." ;
HELP: deep-tag-named HELP: deep-tag-named
{ $values { "tag" "an XML tag or document" } { "name/string" "an XML name or string representing a name" } { "matching-tag" tag } } { $values { "tag" "an XML tag or document" } { "name/string" "an XML name or string representing a name" } { "matching-tag" tag } }
{ $description "Finds an XML tag with a matching name, recursively searching children and children of children." } { $description "Finds an XML tag with a matching name, recursively searching children and children of children." }

View File

@ -67,9 +67,9 @@ HELP: string>dtd
ARTICLE: { "xml" "reading" } "Reading XML" ARTICLE: { "xml" "reading" } "Reading XML"
"The following words are used to read something into an XML document" "The following words are used to read something into an XML document"
{ $subsection string>xml }
{ $subsection read-xml } { $subsection read-xml }
{ $subsection read-xml-chunk } { $subsection read-xml-chunk }
{ $subsection string>xml }
{ $subsection string>xml-chunk } { $subsection string>xml-chunk }
{ $subsection file>xml } { $subsection file>xml }
{ $subsection bytes>xml } { $subsection bytes>xml }
@ -90,10 +90,16 @@ ARTICLE: { "xml" "events" } "Event-based XML parsing"
{ $subsection pull-event } { $subsection pull-event }
{ $subsection pull-elem } ; { $subsection pull-elem } ;
ARTICLE: { "xml" "namespaces" } "Working with XML namespaces"
"The Factor XML parser implements XML namespaces, and provides convenient utilities for working with them. Anywhere in the public API that a name is accepted as an argument, either a string or an XML name is accepted. If a string is used, it is coerced into a name by giving it a null namespace. Names are stored as " { $link name } " tuples, which have slots for the namespace prefix and namespace URL as well as the main part of the tag name." $nl
"To make it easier to create XML names, the parsing word " { $snippet "XML-NS:" } " is provided in the " { $vocab-link "xml.syntax" } " vocabulary." $nl
"When parsing XML, names are automatically augmented with the appropriate namespace URL when the information is available. This does not take into account any XML schema which might allow for such prefixes to be omitted. When generating XML to be written, keep in mind that the XML writer knows only about the literal prefixes and ignores the URLs. It is your job to make sure that they match up correctly, and that there is the appropriate " { $snippet "xmlns" } " declaration." ;
ARTICLE: "xml" "XML parser" ARTICLE: "xml" "XML parser"
"The " { $vocab-link "xml" } " vocabulary implements the XML 1.0 and 1.1 standards, converting strings of text into XML and vice versa. The parser checks for well-formedness but is not validating. There is only partial support for processing DTDs." "The " { $vocab-link "xml" } " vocabulary implements the XML 1.0 and 1.1 standards, converting strings of text into XML and vice versa. The parser checks for well-formedness but is not validating. There is only partial support for processing DTDs."
{ $subsection { "xml" "reading" } } { $subsection { "xml" "reading" } }
{ $subsection { "xml" "events" } } { $subsection { "xml" "events" } }
{ $subsection { "xml" "namespaces" } }
{ $vocab-subsection "Writing XML" "xml.writer" } { $vocab-subsection "Writing XML" "xml.writer" }
{ $vocab-subsection "XML parsing errors" "xml.errors" } { $vocab-subsection "XML parsing errors" "xml.errors" }
{ $vocab-subsection "XML entities" "xml.entities" } { $vocab-subsection "XML entities" "xml.entities" }

View File

@ -4,7 +4,8 @@ USING: accessors arrays io io.encodings.binary io.files
io.streams.string kernel namespaces sequences strings io.encodings.utf8 io.streams.string kernel namespaces sequences strings io.encodings.utf8
xml.data xml.errors xml.elements ascii xml.entities xml.data xml.errors xml.elements ascii xml.entities
xml.writer xml.state xml.autoencoding assocs xml.tokenize xml.writer xml.state xml.autoencoding assocs xml.tokenize
combinators.short-circuit xml.name splitting io.streams.byte-array ; combinators.short-circuit xml.name splitting io.streams.byte-array
combinators ;
IN: xml IN: xml
<PRIVATE <PRIVATE
@ -159,6 +160,9 @@ PRIVATE>
xml-stack get first second xml-stack get first second
] with-state ; inline ] with-state ; inline
! Shared tail of the document readers: calls read-seq with the stream, the
! given quotation and the literal 0, then passes the result to make-xml-doc.
! NOTE(review): the meaning of the 0 argument is defined by read-seq, which
! is not visible here; read-xml-chunk passes 1 in the same position.
: make-xml ( stream quot -- xml )
0 read-seq make-xml-doc ; inline
PRIVATE> PRIVATE>
: each-element ( stream quot: ( xml-elem -- ) -- ) : each-element ( stream quot: ( xml-elem -- ) -- )
@ -169,14 +173,16 @@ PRIVATE>
] with-state ; inline ] with-state ; inline
: read-xml ( stream -- xml ) : read-xml ( stream -- xml )
[ start-document [ process ] when* ] dup stream-element-type {
0 read-seq make-xml-doc ; { +character+ [ [ check ] make-xml ] }
{ +byte+ [ [ start-document [ process ] when* ] make-xml ] }
} case ;
: read-xml-chunk ( stream -- seq ) : read-xml-chunk ( stream -- seq )
[ check ] 1 read-seq <xml-chunk> ; [ check ] 1 read-seq <xml-chunk> ;
: string>xml ( string -- xml ) : string>xml ( string -- xml )
<string-reader> [ check ] 0 read-seq make-xml-doc ; <string-reader> read-xml ;
: string>xml-chunk ( string -- xml ) : string>xml-chunk ( string -- xml )
<string-reader> read-xml-chunk ; <string-reader> read-xml-chunk ;