regexp: Add re:: and re[[ ]] and use new syntax.
R/ / is a crazy syntax to support. - it makes / a special character - you have to escape forward slashes which is awkward in any language with R/ / syntax - re"\d" is not a valid regexp string, gotta do the escapes later -- the only universal escape in a string is either \" or even "none" as a default optionmodern-harvey3
parent
a654c7b879
commit
422078e01e
|
@ -7,10 +7,10 @@ unicode multiline ;
|
|||
IN: globs
|
||||
|
||||
: not-path-separator ( -- sep )
|
||||
os windows? R/ [^\/\\]/ R/ [^\/]/ ? ; foldable
|
||||
os windows? re"[^\/\\]" re"[^\/]" ? ; foldable
|
||||
|
||||
: wild-path-separator ( -- sep )
|
||||
os windows? R/ [^\/\\][\/\\]|[^\/\\]/ R/ [^\/][\/]|[^\/]/ ? ; foldable
|
||||
os windows? re"[^\/\\][\/\\]|[^\/\\]" re"[^\/][\/]|[^\/]" ? ; foldable
|
||||
|
||||
EBNF: <glob> [=[
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@ IN: regexp.combinators.tests
|
|||
{ f f f } [ "food" "ibar" "ba" [ strings matches? ] tri@ ] unit-test
|
||||
|
||||
: conj ( -- regexp )
|
||||
{ R/ .*a/ R/ b.*/ } <and> ;
|
||||
{ re".*a" re"b.*" } <and> ;
|
||||
|
||||
{ t } [ "bljhasflsda" conj matches? ] unit-test
|
||||
{ f } [ "bsdfdfs" conj matches? ] unit-test
|
||||
|
|
|
@ -13,7 +13,7 @@ IN: regexp.combinators
|
|||
|
||||
PRIVATE>
|
||||
|
||||
CONSTANT: <nothing> R/ (?~.*)/s
|
||||
CONSTANT: <nothing> re:: "(?~.*)" "s"
|
||||
|
||||
: <literal> ( string -- regexp )
|
||||
[ "\\Q" "\\E" surround ] [ <concatenation> ] bi make-regexp ; foldable
|
||||
|
|
|
@ -229,6 +229,16 @@ SYNTAX: \R[=[ "]=]" parse-optioned-regexp ;
|
|||
SYNTAX: \R[==[ "]==]" parse-optioned-regexp ;
|
||||
SYNTAX: \R[===[ "]===]" parse-optioned-regexp ;
|
||||
SYNTAX: \R[====[ "]====]" parse-optioned-regexp ;
|
||||
SYNTAX: \re" "\"" parse-optioned-regexp ;
|
||||
SYNTAX: \re[[ "]]" parse-optioned-regexp ;
|
||||
SYNTAX: \re[=[ "]=]" parse-optioned-regexp ;
|
||||
SYNTAX: \re[==[ "]==]" parse-optioned-regexp ;
|
||||
SYNTAX: \re[===[ "]===]" parse-optioned-regexp ;
|
||||
SYNTAX: \re[====[ "]====]" parse-optioned-regexp ;
|
||||
|
||||
SYNTAX: \re: scan-object "" <optioned-regexp> suffix! ;
|
||||
SYNTAX: \re:: scan-object scan-object <optioned-regexp> suffix! ;
|
||||
|
||||
SYNTAX: \R(( "))" parse-optioned-regexp ;
|
||||
SYNTAX: \R(=( ")=)" parse-optioned-regexp ;
|
||||
SYNTAX: \R(==( ")==)" parse-optioned-regexp ;
|
||||
|
@ -240,6 +250,7 @@ SYNTAX: \R{=={ "}==}" parse-optioned-regexp ;
|
|||
SYNTAX: \R{==={ "}===}" parse-optioned-regexp ;
|
||||
SYNTAX: \R{===={ "}====}" parse-optioned-regexp ;
|
||||
|
||||
|
||||
USE: vocabs.loader
|
||||
|
||||
{ "prettyprint" "regexp" } "regexp.prettyprint" require-when
|
||||
|
|
|
@ -61,11 +61,11 @@ IN: validators
|
|||
! From http://www.regular-expressions.info/email.html
|
||||
320 v-max-length
|
||||
"e-mail"
|
||||
R/ [A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}/i
|
||||
re:: [[[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}]] "i"
|
||||
v-regexp ;
|
||||
|
||||
: v-url ( str -- str )
|
||||
"url" R/ (?:ftp|http|https):\/\/\S+/ v-regexp ;
|
||||
"url" re[[(?:ftp|http|https):\/\/\S+]] v-regexp ;
|
||||
|
||||
: v-captcha ( str -- str )
|
||||
dup empty? [ "must remain blank" throw ] unless ;
|
||||
|
|
|
@ -7,19 +7,19 @@ IN: benchmark.regex-dna
|
|||
! Based on http://shootout.alioth.debian.org/gp4/benchmark.php?test=regexdna&lang=ruby&id=1
|
||||
|
||||
: strip-line-breaks ( string -- string' )
|
||||
R/ >.*\n|\n/ "" re-replace ;
|
||||
re">.*\n|\n" "" re-replace ;
|
||||
|
||||
: count-patterns ( string -- )
|
||||
{
|
||||
R/ agggtaaa|tttaccct/i
|
||||
R/ [cgt]gggtaaa|tttaccc[acg]/i
|
||||
R/ a[act]ggtaaa|tttacc[agt]t/i
|
||||
R/ ag[act]gtaaa|tttac[agt]ct/i
|
||||
R/ agg[act]taaa|ttta[agt]cct/i
|
||||
R/ aggg[acg]aaa|ttt[cgt]ccct/i
|
||||
R/ agggt[cgt]aa|tt[acg]accct/i
|
||||
R/ agggta[cgt]a|t[acg]taccct/i
|
||||
R/ agggtaa[cgt]|[acg]ttaccct/i
|
||||
re:: "agggtaaa|tttaccct" "i"
|
||||
re:: "[cgt]gggtaaa|tttaccc[acg]" "i"
|
||||
re:: "a[act]ggtaaa|tttacc[agt]t" "i"
|
||||
re:: "ag[act]gtaaa|tttac[agt]ct" "i"
|
||||
re:: "agg[act]taaa|ttta[agt]cct" "i"
|
||||
re:: "aggg[acg]aaa|ttt[cgt]ccct" "i"
|
||||
re:: "agggt[cgt]aa|tt[acg]accct" "i"
|
||||
re:: "agggta[cgt]a|t[acg]taccct" "i"
|
||||
re:: "agggtaa[cgt]|[acg]ttaccct" "i"
|
||||
} [
|
||||
[ raw>> write bl ]
|
||||
[ count-matches number>string print ]
|
||||
|
@ -28,17 +28,17 @@ IN: benchmark.regex-dna
|
|||
|
||||
: do-replacements ( string -- string' )
|
||||
{
|
||||
{ R/ B/ "(c|g|t)" }
|
||||
{ R/ D/ "(a|g|t)" }
|
||||
{ R/ H/ "(a|c|t)" }
|
||||
{ R/ K/ "(g|t)" }
|
||||
{ R/ M/ "(a|c)" }
|
||||
{ R/ N/ "(a|c|g|t)" }
|
||||
{ R/ R/ "(a|g)" }
|
||||
{ R/ S/ "(c|t)" }
|
||||
{ R/ V/ "(a|c|g)" }
|
||||
{ R/ W/ "(a|t)" }
|
||||
{ R/ Y/ "(c|t)" }
|
||||
{ re"B" "(c|g|t)" }
|
||||
{ re"D" "(a|g|t)" }
|
||||
{ re"H" "(a|c|t)" }
|
||||
{ re"K" "(g|t)" }
|
||||
{ re"M" "(a|c)" }
|
||||
{ re"N" "(a|c|g|t)" }
|
||||
{ re"R" "(a|g)" }
|
||||
{ re"S" "(c|t)" }
|
||||
{ re"V" "(a|c|g)" }
|
||||
{ re"W" "(a|t)" }
|
||||
{ re"Y" "(c|t)" }
|
||||
} [ re-replace ] assoc-each ;
|
||||
|
||||
SYMBOL: ilen
|
||||
|
|
|
@ -9,8 +9,8 @@ IN: benchmark.regexp
|
|||
20,000 <iota> [ number>string ] map
|
||||
200 <iota> [ 1 + char: a <string> ] map
|
||||
'[
|
||||
_ R/ \d+/ [ matches? ] curry all? t assert=
|
||||
_ R/ [a]+/ [ matches? ] curry all? t assert=
|
||||
_ re[[\d+]] [ matches? ] curry all? t assert=
|
||||
_ re[[[a]+]] [ matches? ] curry all? t assert=
|
||||
] times ;
|
||||
|
||||
MAIN: regexp-benchmark
|
||||
|
|
|
@ -869,7 +869,7 @@ CONSTANT: emoji H{
|
|||
}
|
||||
|
||||
: emojify ( str -- str' )
|
||||
R/ :([^:])+:/ [ >string emoji at ] re-replace-with ;
|
||||
R[[:([^:])+:]] [ >string emoji at ] re-replace-with ;
|
||||
|
||||
: emojify-main ( -- )
|
||||
command-line get [
|
||||
|
|
|
@ -2352,7 +2352,7 @@ CONSTANT: html5 H{
|
|||
"#" ?head [ numeric-charref ] [ named-charref ] if ;
|
||||
|
||||
CONSTANT: re-charref
|
||||
R/ &(#[0-9]+|#[xX][0-9a-fA-F]+|[^\t\n\f <&#;]{1,32});?/
|
||||
re"&(#[0-9]+|#[xX][0-9a-fA-F]+|[^\t\n\f <&#;]{1,32});?"
|
||||
|
||||
PRIVATE>
|
||||
|
||||
|
|
|
@ -87,7 +87,7 @@ M: text-printer print-comment-tag drop ;
|
|||
M: text-printer print-dtd-tag drop ;
|
||||
|
||||
: collapse-spaces ( text -- text' )
|
||||
preformatted? get [ R/ \s+/ " " re-replace ] unless ;
|
||||
preformatted? get [ re"\s+" " " re-replace ] unless ;
|
||||
|
||||
M: text-printer print-text-tag
|
||||
script? get style? get or
|
||||
|
|
|
@ -29,25 +29,25 @@ M: lexed length tokens>> length ;
|
|||
|
||||
|
||||
TUPLE: comment < lexed payload ;
|
||||
CONSTRUCTOR: <comment> comment ( tokens payload -- obj ) ;
|
||||
CONSTRUCTOR: <comment> comment ( tokens -- obj ) ;
|
||||
|
||||
TUPLE: escaped-identifier < lexed name ;
|
||||
CONSTRUCTOR: <escaped-identifier> escaped-identifier ( tokens name -- obj ) ;
|
||||
CONSTRUCTOR: <escaped-identifier> escaped-identifier ( tokens -- obj ) ;
|
||||
|
||||
TUPLE: escaped-object < lexed name payload ;
|
||||
CONSTRUCTOR: <escaped-object> escaped-object ( tokens name payload -- obj ) ;
|
||||
CONSTRUCTOR: <escaped-object> escaped-object ( tokens -- obj ) ;
|
||||
|
||||
TUPLE: section < lexed tag payload ;
|
||||
CONSTRUCTOR: <section> section ( tokens tag payload -- obj ) ;
|
||||
CONSTRUCTOR: <section> section ( tokens -- obj ) ;
|
||||
|
||||
TUPLE: named-section < lexed tag name payload ;
|
||||
CONSTRUCTOR: <named-section> named-section ( tokens tag name payload -- obj ) ;
|
||||
CONSTRUCTOR: <named-section> named-section ( tokens -- obj ) ;
|
||||
|
||||
TUPLE: upper-colon < lexed tag payload ;
|
||||
CONSTRUCTOR: <upper-colon> upper-colon ( tokens -- obj ) ;
|
||||
|
||||
TUPLE: lower-colon < lexed tag payload ;
|
||||
CONSTRUCTOR: <lower-colon> lower-colon ( tokens tag payload -- obj ) ;
|
||||
CONSTRUCTOR: <lower-colon> lower-colon ( tokens -- obj ) ;
|
||||
|
||||
TUPLE: matched < lexed tag payload ;
|
||||
|
||||
|
@ -98,11 +98,11 @@ CONSTRUCTOR: <double-paren> double-paren ( tokens -- obj )
|
|||
|
||||
|
||||
TUPLE: double-quote < matched ;
|
||||
CONSTRUCTOR: <double-quote> double-quote ( tokens tag payload -- obj ) ;
|
||||
CONSTRUCTOR: <double-quote> double-quote ( tokens -- obj ) ;
|
||||
|
||||
|
||||
TUPLE: identifier < lexed name ;
|
||||
CONSTRUCTOR: <identifier> identifier ( tokens name -- obj ) ;
|
||||
CONSTRUCTOR: <identifier> identifier ( tokens -- obj ) ;
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -144,7 +144,7 @@ MACRO:: read-matched ( ch -- quot: ( string n tag -- string n' slice' ) )
|
|||
2over ?nth-of char: \[ = [
|
||||
[ 1 + ] dip 1 modify-to 2over ?nth-of read-double-matched-bracket
|
||||
] [
|
||||
[ slice-til-eol drop ] dip swap 2array
|
||||
[ slice-til-eol drop ] dip swap 2array <comment>
|
||||
] if ;
|
||||
|
||||
: terminator? ( slice -- ? )
|
||||
|
@ -166,7 +166,7 @@ MACRO:: read-matched ( ch -- quot: ( string n tag -- string n' slice' ) )
|
|||
dup [ f unexpected-eof ] unless
|
||||
lex-factor
|
||||
] replicate ensure-tokens ! concat
|
||||
] dip swap 2array ;
|
||||
] dip swap 2array <lower-colon> ;
|
||||
|
||||
: (strict-upper?) ( string -- ? )
|
||||
{
|
||||
|
@ -280,17 +280,16 @@ MACRO:: read-matched ( ch -- quot: ( string n tag -- string n' slice' ) )
|
|||
swap
|
||||
! What ended the FOO: .. ; form?
|
||||
! Remove the ; from the payload if present
|
||||
! XXX: probably can remove this, T: is dumb
|
||||
! Also in stack effects ( T: int -- ) can be ended by -- and )
|
||||
dup ?last {
|
||||
{ [ dup ";" sequence= ] [ drop unclip-last 3array ] }
|
||||
{ [ dup ";" tail? ] [ drop unclip-last 3array ] }
|
||||
{ [ dup "--" sequence= ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] }
|
||||
{ [ dup "]" sequence= ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] }
|
||||
{ [ dup "}" sequence= ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] }
|
||||
{ [ dup ")" sequence= ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] } ! (n*quot) breaks
|
||||
{ [ dup section-close? ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] }
|
||||
{ [ dup upper-colon? ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] }
|
||||
{ [ dup ";" sequence= ] [ drop unclip-last 3array <upper-colon> ] }
|
||||
{ [ dup ";" tail? ] [ drop unclip-last 3array <upper-colon> ] }
|
||||
{ [ dup "--" sequence= ] [ drop unclip-last -rot 2array <upper-colon> [ rewind-slice ] dip ] }
|
||||
{ [ dup "]" sequence= ] [ "omg1" throw drop unclip-last -rot 2array <upper-colon> [ rewind-slice ] dip ] }
|
||||
{ [ dup "}" sequence= ] [ "omg2" throw drop unclip-last -rot 2array <upper-colon> [ rewind-slice ] dip ] }
|
||||
{ [ dup ")" sequence= ] [ B "opg3" throw drop unclip-last -rot 2array <upper-colon> [ rewind-slice ] dip ] } ! (n*quot) breaks
|
||||
{ [ dup section-close? ] [ drop unclip-last -rot 2array <upper-colon> [ rewind-slice ] dip ] }
|
||||
{ [ dup upper-colon? ] [ drop unclip-last -rot 2array <upper-colon> [ rewind-slice ] dip ] }
|
||||
[ drop 2array <upper-colon> ]
|
||||
} cond ;
|
||||
|
||||
|
@ -342,7 +341,7 @@ MACRO:: read-matched ( ch -- quot: ( string n tag -- string n' slice' ) )
|
|||
} cond ;
|
||||
|
||||
: read-acute ( string n slice -- string n' acute )
|
||||
[ matching-section-delimiter 1array lex-until ] keep swap unclip-last 3array ;
|
||||
[ matching-section-delimiter 1array lex-until ] keep swap unclip-last 3array <section> ;
|
||||
|
||||
! #{ } turned off, foo# not turned off
|
||||
: read-turnoff ( string n slice -- string n' obj )
|
||||
|
|
|
@ -158,4 +158,4 @@ version 8.36 <= [
|
|||
] unit-test
|
||||
|
||||
! Test that the regexp syntax works.
|
||||
{ t } [ "1234abcd" regexp:R/ ^\d+\w+$/ matches? ] unit-test
|
||||
{ t } [ "1234abcd" regexp:R[[^\d+\w+$]] matches? ] unit-test
|
||||
|
|
|
@ -14,7 +14,7 @@ IN: text-analysis
|
|||
[ [ blank? ] trim ] map harvest ;
|
||||
|
||||
: split-paragraphs ( str -- seq )
|
||||
R/ \r?\n\r?\n/ re-split trimmed ;
|
||||
re"\r?\n\r?\n" re-split trimmed ;
|
||||
|
||||
<<
|
||||
CONSTANT: ABBREVIATIONS {
|
||||
|
@ -39,7 +39,7 @@ CONSTANT: ABBREVIATIONS {
|
|||
: split-sentences ( str -- seq )
|
||||
|
||||
! Mark end of sentences with EOS marker
|
||||
R/ ((?:[\.?!]|[\r\n]+)(?:\"|\'|\)|\]|\})?)(\s+)/
|
||||
re[[((?:[\.?!]|[\r\n]+)(?:\"|\'|\)|\]|\})?)(\s+)]]
|
||||
[ [ ".?!\r\n\"')]}" member? not ] cut-when "\x01" glue ]
|
||||
re-replace-with
|
||||
|
||||
|
@ -63,46 +63,46 @@ CONSTANT: ABBREVIATIONS {
|
|||
"\x01" split trimmed ;
|
||||
|
||||
CONSTANT: sub-syllable {
|
||||
R/ [^aeiou]e$/ ! give, love, bone, done, ride ...
|
||||
R/ [aeiou](?:([cfghklmnprsvwz])\1?|ck|sh|[rt]ch)e[ds]$/
|
||||
re"[^aeiou]e$" ! give, love, bone, done, ride ...
|
||||
re"[aeiou](?:([cfghklmnprsvwz])\1?|ck|sh|[rt]ch)e[ds]$"
|
||||
! (passive) past participles and 3rd person sing present verbs:
|
||||
! bared, liked, called, tricked, bashed, matched
|
||||
|
||||
R/ .e(?:ly|less(?:ly)?|ness?|ful(?:ly)?|ments?)$/
|
||||
re".e(?:ly|less(?:ly)?|ness?|ful(?:ly)?|ments?)$"
|
||||
! nominal, adjectival and adverbial derivatives from -e$ roots:
|
||||
! absolutely, nicely, likeness, basement, hopeless
|
||||
! hopeful, tastefully, wasteful
|
||||
|
||||
R/ ion/ ! action, diction, fiction
|
||||
R/ [ct]ia[nl]/ ! special(ly), initial, physician, christian
|
||||
R/ [^cx]iou/ ! illustrious, NOT spacious, gracious, anxious, noxious
|
||||
R/ sia$/ ! amnesia, polynesia
|
||||
R/ .gue$/ ! dialogue, intrigue, colleague
|
||||
re"ion" ! action, diction, fiction
|
||||
re"[ct]ia[nl]" ! special(ly), initial, physician, christian
|
||||
re"[^cx]iou" ! illustrious, NOT spacious, gracious, anxious, noxious
|
||||
re"sia$" ! amnesia, polynesia
|
||||
re".gue$" ! dialogue, intrigue, colleague
|
||||
}
|
||||
|
||||
CONSTANT: add-syllable {
|
||||
R/ i[aiou]/ ! alias, science, phobia
|
||||
R/ [dls]ien/ ! salient, gradient, transient
|
||||
R/ [aeiouym]ble$/ ! -Vble, plus -mble
|
||||
R/ [aeiou]{3}/ ! agreeable
|
||||
R/ ^mc/ ! mcwhatever
|
||||
R/ ism$/ ! sexism, racism
|
||||
R/ (?:([^aeiouy])\1|ck|mp|ng)le$/ ! bubble, cattle, cackle, sample, angle
|
||||
R/ dnt$/ ! couldn/t
|
||||
R/ [aeiou]y[aeiou]/ ! annoying, layer
|
||||
re"i[aiou]" ! alias, science, phobia
|
||||
re"[dls]ien" ! salient, gradient, transient
|
||||
re"[aeiouym]ble$" ! -Vble, plus -mble
|
||||
re"[aeiou]{3}" ! agreeable
|
||||
re"^mc" ! mcwhatever
|
||||
re"ism$" ! sexism, racism
|
||||
re"(?:([^aeiouy])\1|ck|mp|ng)le$" ! bubble, cattle, cackle, sample, angle
|
||||
re"dnt$" ! couldn/t
|
||||
re"[aeiou]y[aeiou]" ! annoying, layer
|
||||
}
|
||||
|
||||
: syllables ( str -- n )
|
||||
dup length 1 = [ drop 1 ] [
|
||||
>lower char: . swap remove
|
||||
[ R/ [aeiouy]+/ count-matches ]
|
||||
[ re"[aeiouy]+" count-matches ]
|
||||
[ sub-syllable [ matches? ] with count - ]
|
||||
[ add-syllable [ matches? ] with count + ] tri
|
||||
1 max
|
||||
] if ;
|
||||
|
||||
: split-words ( str -- words )
|
||||
R/ \b([a-z][a-z\-']*)\b/i all-matching-subseqs ;
|
||||
re:: [[\b([a-z][a-z\-']*)\b]] "i" all-matching-subseqs ;
|
||||
|
||||
TUPLE: text-analysis n-paragraphs n-sentences n-chars n-words
|
||||
n-syllables n-complex-words n-unique-words n-difficult-words ;
|
||||
|
|
|
@ -116,7 +116,7 @@ TUPLE: entry key value ;
|
|||
: multi-string ( -- parser )
|
||||
multi-basic-string multi-literal-string 2choice [
|
||||
"" like "\n" ?head drop
|
||||
R/ \\[ \t\r\n]*\n[ \t\r\n]*/m "" re-replace
|
||||
re:: [[\\[ \t\r\n]*\n[ \t\r\n]*]] "m" "" re-replace
|
||||
] action ;
|
||||
|
||||
: string-parser ( -- parser )
|
||||
|
|
|
@ -13,7 +13,7 @@ IN: xkcd
|
|||
|
||||
: comic-image ( url -- image )
|
||||
http-get nip
|
||||
R/ \/\/imgs\.xkcd\.com\/comics\/[^\.]+\.(png|jpg)/
|
||||
re"//imgs\.xkcd\.com/comics/[^\.]+\.(png|jpg)"
|
||||
first-match >string "http:" prepend load-http-image ;
|
||||
|
||||
: comic-image. ( url -- )
|
||||
|
|
Loading…
Reference in New Issue