regexp: Add re:: and re[[ ]] and use new syntax.

R/ / is a crazy syntax to support.
- it makes / a special character
- you have to escape forward slashes, which is awkward in any language with R/ / syntax
- re"\d" is not a valid regexp string, so the escapes have to be handled later
-- the only universal escape in a string is either \" or "none" as a default option
modern-harvey3
Doug Coleman 2019-10-29 23:03:33 -05:00
parent a654c7b879
commit 422078e01e
16 changed files with 86 additions and 76 deletions
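
A rough sketch of the old and new forms side by side, modeled on lines appearing in this diff. The snippet is illustrative only: it assumes USING: regexp ; is in scope, and the example strings and printed results are not from the repository.

USING: regexp ;

! old slash-delimited syntax: / is special inside the pattern
"1234abcd" R/ ^\d+\w+$/ matches? .                  ! t
! new delimited forms avoid the special / character
"1234abcd" re[[^\d+\w+$]] matches? .                ! t
"bljhasflsda" re".*a" matches? .                    ! t
! re:: also takes an options string, e.g. "i" for case-insensitive matching
"AGGGTAAA" re:: "agggtaaa|tttaccct" "i" matches? .  ! t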

View File

@@ -7,10 +7,10 @@ unicode multiline ;
IN: globs
: not-path-separator ( -- sep )
os windows? R/ [^\/\\]/ R/ [^\/]/ ? ; foldable
os windows? re"[^\/\\]" re"[^\/]" ? ; foldable
: wild-path-separator ( -- sep )
os windows? R/ [^\/\\][\/\\]|[^\/\\]/ R/ [^\/][\/]|[^\/]/ ? ; foldable
os windows? re"[^\/\\][\/\\]|[^\/\\]" re"[^\/][\/]|[^\/]" ? ; foldable
EBNF: <glob> [=[

View File

@@ -10,7 +10,7 @@ IN: regexp.combinators.tests
{ f f f } [ "food" "ibar" "ba" [ strings matches? ] tri@ ] unit-test
: conj ( -- regexp )
{ R/ .*a/ R/ b.*/ } <and> ;
{ re".*a" re"b.*" } <and> ;
{ t } [ "bljhasflsda" conj matches? ] unit-test
{ f } [ "bsdfdfs" conj matches? ] unit-test

View File

@@ -13,7 +13,7 @@ IN: regexp.combinators
PRIVATE>
CONSTANT: <nothing> R/ (?~.*)/s
CONSTANT: <nothing> re:: "(?~.*)" "s"
: <literal> ( string -- regexp )
[ "\\Q" "\\E" surround ] [ <concatenation> ] bi make-regexp ; foldable

View File

@@ -229,6 +229,16 @@ SYNTAX: \R[=[ "]=]" parse-optioned-regexp ;
SYNTAX: \R[==[ "]==]" parse-optioned-regexp ;
SYNTAX: \R[===[ "]===]" parse-optioned-regexp ;
SYNTAX: \R[====[ "]====]" parse-optioned-regexp ;
SYNTAX: \re" "\"" parse-optioned-regexp ;
SYNTAX: \re[[ "]]" parse-optioned-regexp ;
SYNTAX: \re[=[ "]=]" parse-optioned-regexp ;
SYNTAX: \re[==[ "]==]" parse-optioned-regexp ;
SYNTAX: \re[===[ "]===]" parse-optioned-regexp ;
SYNTAX: \re[====[ "]====]" parse-optioned-regexp ;
SYNTAX: \re: scan-object "" <optioned-regexp> suffix! ;
SYNTAX: \re:: scan-object scan-object <optioned-regexp> suffix! ;
SYNTAX: \R(( "))" parse-optioned-regexp ;
SYNTAX: \R(=( ")=)" parse-optioned-regexp ;
SYNTAX: \R(==( ")==)" parse-optioned-regexp ;
@@ -240,6 +250,7 @@ SYNTAX: \R{=={ "}==}" parse-optioned-regexp ;
SYNTAX: \R{==={ "}===}" parse-optioned-regexp ;
SYNTAX: \R{===={ "}====}" parse-optioned-regexp ;
USE: vocabs.loader
{ "prettyprint" "regexp" } "regexp.prettyprint" require-when

View File

@@ -61,11 +61,11 @@ IN: validators
! From http://www.regular-expressions.info/email.html
320 v-max-length
"e-mail"
R/ [A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}/i
re:: [[[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}]] "i"
v-regexp ;
: v-url ( str -- str )
"url" R/ (?:ftp|http|https):\/\/\S+/ v-regexp ;
"url" re[[(?:ftp|http|https):\/\/\S+]] v-regexp ;
: v-captcha ( str -- str )
dup empty? [ "must remain blank" throw ] unless ;

View File

@@ -7,19 +7,19 @@ IN: benchmark.regex-dna
! Based on http://shootout.alioth.debian.org/gp4/benchmark.php?test=regexdna&lang=ruby&id=1
: strip-line-breaks ( string -- string' )
R/ >.*\n|\n/ "" re-replace ;
re">.*\n|\n" "" re-replace ;
: count-patterns ( string -- )
{
R/ agggtaaa|tttaccct/i
R/ [cgt]gggtaaa|tttaccc[acg]/i
R/ a[act]ggtaaa|tttacc[agt]t/i
R/ ag[act]gtaaa|tttac[agt]ct/i
R/ agg[act]taaa|ttta[agt]cct/i
R/ aggg[acg]aaa|ttt[cgt]ccct/i
R/ agggt[cgt]aa|tt[acg]accct/i
R/ agggta[cgt]a|t[acg]taccct/i
R/ agggtaa[cgt]|[acg]ttaccct/i
re:: "agggtaaa|tttaccct" "i"
re:: "[cgt]gggtaaa|tttaccc[acg]" "i"
re:: "a[act]ggtaaa|tttacc[agt]t" "i"
re:: "ag[act]gtaaa|tttac[agt]ct" "i"
re:: "agg[act]taaa|ttta[agt]cct" "i"
re:: "aggg[acg]aaa|ttt[cgt]ccct" "i"
re:: "agggt[cgt]aa|tt[acg]accct" "i"
re:: "agggta[cgt]a|t[acg]taccct" "i"
re:: "agggtaa[cgt]|[acg]ttaccct" "i"
} [
[ raw>> write bl ]
[ count-matches number>string print ]
@@ -28,17 +28,17 @@ IN: benchmark.regex-dna
: do-replacements ( string -- string' )
{
{ R/ B/ "(c|g|t)" }
{ R/ D/ "(a|g|t)" }
{ R/ H/ "(a|c|t)" }
{ R/ K/ "(g|t)" }
{ R/ M/ "(a|c)" }
{ R/ N/ "(a|c|g|t)" }
{ R/ R/ "(a|g)" }
{ R/ S/ "(c|t)" }
{ R/ V/ "(a|c|g)" }
{ R/ W/ "(a|t)" }
{ R/ Y/ "(c|t)" }
{ re"B" "(c|g|t)" }
{ re"D" "(a|g|t)" }
{ re"H" "(a|c|t)" }
{ re"K" "(g|t)" }
{ re"M" "(a|c)" }
{ re"N" "(a|c|g|t)" }
{ re"R" "(a|g)" }
{ re"S" "(c|t)" }
{ re"V" "(a|c|g)" }
{ re"W" "(a|t)" }
{ re"Y" "(c|t)" }
} [ re-replace ] assoc-each ;
SYMBOL: ilen

View File

@@ -9,8 +9,8 @@ IN: benchmark.regexp
20,000 <iota> [ number>string ] map
200 <iota> [ 1 + char: a <string> ] map
'[
_ R/ \d+/ [ matches? ] curry all? t assert=
_ R/ [a]+/ [ matches? ] curry all? t assert=
_ re[[\d+]] [ matches? ] curry all? t assert=
_ re[[[a]+]] [ matches? ] curry all? t assert=
] times ;
MAIN: regexp-benchmark

View File

@@ -869,7 +869,7 @@ CONSTANT: emoji H{
}
: emojify ( str -- str' )
R/ :([^:])+:/ [ >string emoji at ] re-replace-with ;
R[[:([^:])+:]] [ >string emoji at ] re-replace-with ;
: emojify-main ( -- )
command-line get [

View File

@@ -2352,7 +2352,7 @@ CONSTANT: html5 H{
"#" ?head [ numeric-charref ] [ named-charref ] if ;
CONSTANT: re-charref
R/ &(#[0-9]+|#[xX][0-9a-fA-F]+|[^\t\n\f <&#;]{1,32});?/
re"&(#[0-9]+|#[xX][0-9a-fA-F]+|[^\t\n\f <&#;]{1,32});?"
PRIVATE>

View File

@@ -87,7 +87,7 @@ M: text-printer print-comment-tag drop ;
M: text-printer print-dtd-tag drop ;
: collapse-spaces ( text -- text' )
preformatted? get [ R/ \s+/ " " re-replace ] unless ;
preformatted? get [ re"\s+" " " re-replace ] unless ;
M: text-printer print-text-tag
script? get style? get or

View File

@@ -29,25 +29,25 @@ M: lexed length tokens>> length ;
TUPLE: comment < lexed payload ;
CONSTRUCTOR: <comment> comment ( tokens payload -- obj ) ;
CONSTRUCTOR: <comment> comment ( tokens -- obj ) ;
TUPLE: escaped-identifier < lexed name ;
CONSTRUCTOR: <escaped-identifier> escaped-identifier ( tokens name -- obj ) ;
CONSTRUCTOR: <escaped-identifier> escaped-identifier ( tokens -- obj ) ;
TUPLE: escaped-object < lexed name payload ;
CONSTRUCTOR: <escaped-object> escaped-object ( tokens name payload -- obj ) ;
CONSTRUCTOR: <escaped-object> escaped-object ( tokens -- obj ) ;
TUPLE: section < lexed tag payload ;
CONSTRUCTOR: <section> section ( tokens tag payload -- obj ) ;
CONSTRUCTOR: <section> section ( tokens -- obj ) ;
TUPLE: named-section < lexed tag name payload ;
CONSTRUCTOR: <named-section> named-section ( tokens tag name payload -- obj ) ;
CONSTRUCTOR: <named-section> named-section ( tokens -- obj ) ;
TUPLE: upper-colon < lexed tag payload ;
CONSTRUCTOR: <upper-colon> upper-colon ( tokens -- obj ) ;
TUPLE: lower-colon < lexed tag payload ;
CONSTRUCTOR: <lower-colon> lower-colon ( tokens tag payload -- obj ) ;
CONSTRUCTOR: <lower-colon> lower-colon ( tokens -- obj ) ;
TUPLE: matched < lexed tag payload ;
@@ -98,11 +98,11 @@ CONSTRUCTOR: <double-paren> double-paren ( tokens -- obj )
TUPLE: double-quote < matched ;
CONSTRUCTOR: <double-quote> double-quote ( tokens tag payload -- obj ) ;
CONSTRUCTOR: <double-quote> double-quote ( tokens -- obj ) ;
TUPLE: identifier < lexed name ;
CONSTRUCTOR: <identifier> identifier ( tokens name -- obj ) ;
CONSTRUCTOR: <identifier> identifier ( tokens -- obj ) ;

View File

@@ -144,7 +144,7 @@ MACRO:: read-matched ( ch -- quot: ( string n tag -- string n' slice' ) )
2over ?nth-of char: \[ = [
[ 1 + ] dip 1 modify-to 2over ?nth-of read-double-matched-bracket
] [
[ slice-til-eol drop ] dip swap 2array
[ slice-til-eol drop ] dip swap 2array <comment>
] if ;
: terminator? ( slice -- ? )
@@ -166,7 +166,7 @@ MACRO:: read-matched ( ch -- quot: ( string n tag -- string n' slice' ) )
dup [ f unexpected-eof ] unless
lex-factor
] replicate ensure-tokens ! concat
] dip swap 2array ;
] dip swap 2array <lower-colon> ;
: (strict-upper?) ( string -- ? )
{
@@ -280,17 +280,16 @@ MACRO:: read-matched ( ch -- quot: ( string n tag -- string n' slice' ) )
swap
! What ended the FOO: .. ; form?
! Remove the ; from the payload if present
! XXX: probably can remove this, T: is dumb
! Also in stack effects ( T: int -- ) can be ended by -- and )
dup ?last {
{ [ dup ";" sequence= ] [ drop unclip-last 3array ] }
{ [ dup ";" tail? ] [ drop unclip-last 3array ] }
{ [ dup "--" sequence= ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] }
{ [ dup "]" sequence= ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] }
{ [ dup "}" sequence= ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] }
{ [ dup ")" sequence= ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] } ! (n*quot) breaks
{ [ dup section-close? ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] }
{ [ dup upper-colon? ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] }
{ [ dup ";" sequence= ] [ drop unclip-last 3array <upper-colon> ] }
{ [ dup ";" tail? ] [ drop unclip-last 3array <upper-colon> ] }
{ [ dup "--" sequence= ] [ drop unclip-last -rot 2array <upper-colon> [ rewind-slice ] dip ] }
{ [ dup "]" sequence= ] [ "omg1" throw drop unclip-last -rot 2array <upper-colon> [ rewind-slice ] dip ] }
{ [ dup "}" sequence= ] [ "omg2" throw drop unclip-last -rot 2array <upper-colon> [ rewind-slice ] dip ] }
{ [ dup ")" sequence= ] [ B "opg3" throw drop unclip-last -rot 2array <upper-colon> [ rewind-slice ] dip ] } ! (n*quot) breaks
{ [ dup section-close? ] [ drop unclip-last -rot 2array <upper-colon> [ rewind-slice ] dip ] }
{ [ dup upper-colon? ] [ drop unclip-last -rot 2array <upper-colon> [ rewind-slice ] dip ] }
[ drop 2array <upper-colon> ]
} cond ;
@@ -342,7 +341,7 @@ MACRO:: read-matched ( ch -- quot: ( string n tag -- string n' slice' ) )
} cond ;
: read-acute ( string n slice -- string n' acute )
[ matching-section-delimiter 1array lex-until ] keep swap unclip-last 3array ;
[ matching-section-delimiter 1array lex-until ] keep swap unclip-last 3array <section> ;
! #{ } turned off, foo# not turned off
: read-turnoff ( string n slice -- string n' obj )

View File

@@ -158,4 +158,4 @@ version 8.36 <= [
] unit-test
! Test that the regexp syntax works.
{ t } [ "1234abcd" regexp:R/ ^\d+\w+$/ matches? ] unit-test
{ t } [ "1234abcd" regexp:R[[^\d+\w+$]] matches? ] unit-test

View File

@@ -14,7 +14,7 @@ IN: text-analysis
[ [ blank? ] trim ] map harvest ;
: split-paragraphs ( str -- seq )
R/ \r?\n\r?\n/ re-split trimmed ;
re"\r?\n\r?\n" re-split trimmed ;
<<
CONSTANT: ABBREVIATIONS {
@@ -39,7 +39,7 @@ CONSTANT: ABBREVIATIONS {
: split-sentences ( str -- seq )
! Mark end of sentences with EOS marker
R/ ((?:[\.?!]|[\r\n]+)(?:\"|\'|\)|\]|\})?)(\s+)/
re[[((?:[\.?!]|[\r\n]+)(?:\"|\'|\)|\]|\})?)(\s+)]]
[ [ ".?!\r\n\"')]}" member? not ] cut-when "\x01" glue ]
re-replace-with
@@ -63,46 +63,46 @@ CONSTANT: ABBREVIATIONS {
"\x01" split trimmed ;
CONSTANT: sub-syllable {
R/ [^aeiou]e$/ ! give, love, bone, done, ride ...
R/ [aeiou](?:([cfghklmnprsvwz])\1?|ck|sh|[rt]ch)e[ds]$/
re"[^aeiou]e$" ! give, love, bone, done, ride ...
re"[aeiou](?:([cfghklmnprsvwz])\1?|ck|sh|[rt]ch)e[ds]$"
! (passive) past participles and 3rd person sing present verbs:
! bared, liked, called, tricked, bashed, matched
R/ .e(?:ly|less(?:ly)?|ness?|ful(?:ly)?|ments?)$/
re".e(?:ly|less(?:ly)?|ness?|ful(?:ly)?|ments?)$"
! nominal, adjectival and adverbial derivatives from -e$ roots:
! absolutely, nicely, likeness, basement, hopeless
! hopeful, tastefully, wasteful
R/ ion/ ! action, diction, fiction
R/ [ct]ia[nl]/ ! special(ly), initial, physician, christian
R/ [^cx]iou/ ! illustrious, NOT spacious, gracious, anxious, noxious
R/ sia$/ ! amnesia, polynesia
R/ .gue$/ ! dialogue, intrigue, colleague
re"ion" ! action, diction, fiction
re"[ct]ia[nl]" ! special(ly), initial, physician, christian
re"[^cx]iou" ! illustrious, NOT spacious, gracious, anxious, noxious
re"sia$" ! amnesia, polynesia
re".gue$" ! dialogue, intrigue, colleague
}
CONSTANT: add-syllable {
R/ i[aiou]/ ! alias, science, phobia
R/ [dls]ien/ ! salient, gradient, transient
R/ [aeiouym]ble$/ ! -Vble, plus -mble
R/ [aeiou]{3}/ ! agreeable
R/ ^mc/ ! mcwhatever
R/ ism$/ ! sexism, racism
R/ (?:([^aeiouy])\1|ck|mp|ng)le$/ ! bubble, cattle, cackle, sample, angle
R/ dnt$/ ! couldn/t
R/ [aeiou]y[aeiou]/ ! annoying, layer
re"i[aiou]" ! alias, science, phobia
re"[dls]ien" ! salient, gradient, transient
re"[aeiouym]ble$" ! -Vble, plus -mble
re"[aeiou]{3}" ! agreeable
re"^mc" ! mcwhatever
re"ism$" ! sexism, racism
re"(?:([^aeiouy])\1|ck|mp|ng)le$" ! bubble, cattle, cackle, sample, angle
re"dnt$" ! couldn/t
re"[aeiou]y[aeiou]" ! annoying, layer
}
: syllables ( str -- n )
dup length 1 = [ drop 1 ] [
>lower char: . swap remove
[ R/ [aeiouy]+/ count-matches ]
[ re"[aeiouy]+" count-matches ]
[ sub-syllable [ matches? ] with count - ]
[ add-syllable [ matches? ] with count + ] tri
1 max
] if ;
: split-words ( str -- words )
R/ \b([a-z][a-z\-']*)\b/i all-matching-subseqs ;
re:: [[\b([a-z][a-z\-']*)\b]] "i" all-matching-subseqs ;
TUPLE: text-analysis n-paragraphs n-sentences n-chars n-words
n-syllables n-complex-words n-unique-words n-difficult-words ;

View File

@@ -116,7 +116,7 @@ TUPLE: entry key value ;
: multi-string ( -- parser )
multi-basic-string multi-literal-string 2choice [
"" like "\n" ?head drop
R/ \\[ \t\r\n]*\n[ \t\r\n]*/m "" re-replace
re:: [[\\[ \t\r\n]*\n[ \t\r\n]*]] "m" "" re-replace
] action ;
: string-parser ( -- parser )

View File

@@ -13,7 +13,7 @@ IN: xkcd
: comic-image ( url -- image )
http-get nip
R/ \/\/imgs\.xkcd\.com\/comics\/[^\.]+\.(png|jpg)/
re"//imgs\.xkcd\.com/comics/[^\.]+\.(png|jpg)"
first-match >string "http:" prepend load-http-image ;
: comic-image. ( url -- )