regexp: Add re:: and re[[ ]] and use new syntax.

R/ / is a crazy syntax to support.
- it makes / a special character
- you have to escape forward slashes, which is awkward in any language with R/ / syntax
- re"\d" is not a valid regexp string; the escapes have to be handled later
-- the only universal escape in a string is \" (or even "none" as a default option)
modern-harvey3
Doug Coleman 2019-10-29 23:03:33 -05:00
parent a654c7b879
commit 422078e01e
16 changed files with 86 additions and 76 deletions

View File

@ -7,10 +7,10 @@ unicode multiline ;
IN: globs IN: globs
: not-path-separator ( -- sep ) : not-path-separator ( -- sep )
os windows? R/ [^\/\\]/ R/ [^\/]/ ? ; foldable os windows? re"[^\/\\]" re"[^\/]" ? ; foldable
: wild-path-separator ( -- sep ) : wild-path-separator ( -- sep )
os windows? R/ [^\/\\][\/\\]|[^\/\\]/ R/ [^\/][\/]|[^\/]/ ? ; foldable os windows? re"[^\/\\][\/\\]|[^\/\\]" re"[^\/][\/]|[^\/]" ? ; foldable
EBNF: <glob> [=[ EBNF: <glob> [=[

View File

@ -10,7 +10,7 @@ IN: regexp.combinators.tests
{ f f f } [ "food" "ibar" "ba" [ strings matches? ] tri@ ] unit-test { f f f } [ "food" "ibar" "ba" [ strings matches? ] tri@ ] unit-test
: conj ( -- regexp ) : conj ( -- regexp )
{ R/ .*a/ R/ b.*/ } <and> ; { re".*a" re"b.*" } <and> ;
{ t } [ "bljhasflsda" conj matches? ] unit-test { t } [ "bljhasflsda" conj matches? ] unit-test
{ f } [ "bsdfdfs" conj matches? ] unit-test { f } [ "bsdfdfs" conj matches? ] unit-test

View File

@ -13,7 +13,7 @@ IN: regexp.combinators
PRIVATE> PRIVATE>
CONSTANT: <nothing> R/ (?~.*)/s CONSTANT: <nothing> re:: "(?~.*)" "s"
: <literal> ( string -- regexp ) : <literal> ( string -- regexp )
[ "\\Q" "\\E" surround ] [ <concatenation> ] bi make-regexp ; foldable [ "\\Q" "\\E" surround ] [ <concatenation> ] bi make-regexp ; foldable

View File

@ -229,6 +229,16 @@ SYNTAX: \R[=[ "]=]" parse-optioned-regexp ;
SYNTAX: \R[==[ "]==]" parse-optioned-regexp ; SYNTAX: \R[==[ "]==]" parse-optioned-regexp ;
SYNTAX: \R[===[ "]===]" parse-optioned-regexp ; SYNTAX: \R[===[ "]===]" parse-optioned-regexp ;
SYNTAX: \R[====[ "]====]" parse-optioned-regexp ; SYNTAX: \R[====[ "]====]" parse-optioned-regexp ;
SYNTAX: \re" "\"" parse-optioned-regexp ;
SYNTAX: \re[[ "]]" parse-optioned-regexp ;
SYNTAX: \re[=[ "]=]" parse-optioned-regexp ;
SYNTAX: \re[==[ "]==]" parse-optioned-regexp ;
SYNTAX: \re[===[ "]===]" parse-optioned-regexp ;
SYNTAX: \re[====[ "]====]" parse-optioned-regexp ;
SYNTAX: \re: scan-object "" <optioned-regexp> suffix! ;
SYNTAX: \re:: scan-object scan-object <optioned-regexp> suffix! ;
SYNTAX: \R(( "))" parse-optioned-regexp ; SYNTAX: \R(( "))" parse-optioned-regexp ;
SYNTAX: \R(=( ")=)" parse-optioned-regexp ; SYNTAX: \R(=( ")=)" parse-optioned-regexp ;
SYNTAX: \R(==( ")==)" parse-optioned-regexp ; SYNTAX: \R(==( ")==)" parse-optioned-regexp ;
@ -240,6 +250,7 @@ SYNTAX: \R{=={ "}==}" parse-optioned-regexp ;
SYNTAX: \R{==={ "}===}" parse-optioned-regexp ; SYNTAX: \R{==={ "}===}" parse-optioned-regexp ;
SYNTAX: \R{===={ "}====}" parse-optioned-regexp ; SYNTAX: \R{===={ "}====}" parse-optioned-regexp ;
USE: vocabs.loader USE: vocabs.loader
{ "prettyprint" "regexp" } "regexp.prettyprint" require-when { "prettyprint" "regexp" } "regexp.prettyprint" require-when

View File

@ -61,11 +61,11 @@ IN: validators
! From http://www.regular-expressions.info/email.html ! From http://www.regular-expressions.info/email.html
320 v-max-length 320 v-max-length
"e-mail" "e-mail"
R/ [A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}/i re:: [[[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}]] "i"
v-regexp ; v-regexp ;
: v-url ( str -- str ) : v-url ( str -- str )
"url" R/ (?:ftp|http|https):\/\/\S+/ v-regexp ; "url" re[[(?:ftp|http|https):\/\/\S+]] v-regexp ;
: v-captcha ( str -- str ) : v-captcha ( str -- str )
dup empty? [ "must remain blank" throw ] unless ; dup empty? [ "must remain blank" throw ] unless ;

View File

@ -7,19 +7,19 @@ IN: benchmark.regex-dna
! Based on http://shootout.alioth.debian.org/gp4/benchmark.php?test=regexdna&lang=ruby&id=1 ! Based on http://shootout.alioth.debian.org/gp4/benchmark.php?test=regexdna&lang=ruby&id=1
: strip-line-breaks ( string -- string' ) : strip-line-breaks ( string -- string' )
R/ >.*\n|\n/ "" re-replace ; re">.*\n|\n" "" re-replace ;
: count-patterns ( string -- ) : count-patterns ( string -- )
{ {
R/ agggtaaa|tttaccct/i re:: "agggtaaa|tttaccct" "i"
R/ [cgt]gggtaaa|tttaccc[acg]/i re:: "[cgt]gggtaaa|tttaccc[acg]" "i"
R/ a[act]ggtaaa|tttacc[agt]t/i re:: "a[act]ggtaaa|tttacc[agt]t" "i"
R/ ag[act]gtaaa|tttac[agt]ct/i re:: "ag[act]gtaaa|tttac[agt]ct" "i"
R/ agg[act]taaa|ttta[agt]cct/i re:: "agg[act]taaa|ttta[agt]cct" "i"
R/ aggg[acg]aaa|ttt[cgt]ccct/i re:: "aggg[acg]aaa|ttt[cgt]ccct" "i"
R/ agggt[cgt]aa|tt[acg]accct/i re:: "agggt[cgt]aa|tt[acg]accct" "i"
R/ agggta[cgt]a|t[acg]taccct/i re:: "agggta[cgt]a|t[acg]taccct" "i"
R/ agggtaa[cgt]|[acg]ttaccct/i re:: "agggtaa[cgt]|[acg]ttaccct" "i"
} [ } [
[ raw>> write bl ] [ raw>> write bl ]
[ count-matches number>string print ] [ count-matches number>string print ]
@ -28,17 +28,17 @@ IN: benchmark.regex-dna
: do-replacements ( string -- string' ) : do-replacements ( string -- string' )
{ {
{ R/ B/ "(c|g|t)" } { re"B" "(c|g|t)" }
{ R/ D/ "(a|g|t)" } { re"D" "(a|g|t)" }
{ R/ H/ "(a|c|t)" } { re"H" "(a|c|t)" }
{ R/ K/ "(g|t)" } { re"K" "(g|t)" }
{ R/ M/ "(a|c)" } { re"M" "(a|c)" }
{ R/ N/ "(a|c|g|t)" } { re"N" "(a|c|g|t)" }
{ R/ R/ "(a|g)" } { re"R" "(a|g)" }
{ R/ S/ "(c|t)" } { re"S" "(c|t)" }
{ R/ V/ "(a|c|g)" } { re"V" "(a|c|g)" }
{ R/ W/ "(a|t)" } { re"W" "(a|t)" }
{ R/ Y/ "(c|t)" } { re"Y" "(c|t)" }
} [ re-replace ] assoc-each ; } [ re-replace ] assoc-each ;
SYMBOL: ilen SYMBOL: ilen

View File

@ -9,8 +9,8 @@ IN: benchmark.regexp
20,000 <iota> [ number>string ] map 20,000 <iota> [ number>string ] map
200 <iota> [ 1 + char: a <string> ] map 200 <iota> [ 1 + char: a <string> ] map
'[ '[
_ R/ \d+/ [ matches? ] curry all? t assert= _ re[[\d+]] [ matches? ] curry all? t assert=
_ R/ [a]+/ [ matches? ] curry all? t assert= _ re[[[a]+]] [ matches? ] curry all? t assert=
] times ; ] times ;
MAIN: regexp-benchmark MAIN: regexp-benchmark

View File

@ -869,7 +869,7 @@ CONSTANT: emoji H{
} }
: emojify ( str -- str' ) : emojify ( str -- str' )
R/ :([^:])+:/ [ >string emoji at ] re-replace-with ; R[[:([^:])+:]] [ >string emoji at ] re-replace-with ;
: emojify-main ( -- ) : emojify-main ( -- )
command-line get [ command-line get [

View File

@ -2352,7 +2352,7 @@ CONSTANT: html5 H{
"#" ?head [ numeric-charref ] [ named-charref ] if ; "#" ?head [ numeric-charref ] [ named-charref ] if ;
CONSTANT: re-charref CONSTANT: re-charref
R/ &(#[0-9]+|#[xX][0-9a-fA-F]+|[^\t\n\f <&#;]{1,32});?/ re"&(#[0-9]+|#[xX][0-9a-fA-F]+|[^\t\n\f <&#;]{1,32});?"
PRIVATE> PRIVATE>

View File

@ -87,7 +87,7 @@ M: text-printer print-comment-tag drop ;
M: text-printer print-dtd-tag drop ; M: text-printer print-dtd-tag drop ;
: collapse-spaces ( text -- text' ) : collapse-spaces ( text -- text' )
preformatted? get [ R/ \s+/ " " re-replace ] unless ; preformatted? get [ re"\s+" " " re-replace ] unless ;
M: text-printer print-text-tag M: text-printer print-text-tag
script? get style? get or script? get style? get or

View File

@ -29,25 +29,25 @@ M: lexed length tokens>> length ;
TUPLE: comment < lexed payload ; TUPLE: comment < lexed payload ;
CONSTRUCTOR: <comment> comment ( tokens payload -- obj ) ; CONSTRUCTOR: <comment> comment ( tokens -- obj ) ;
TUPLE: escaped-identifier < lexed name ; TUPLE: escaped-identifier < lexed name ;
CONSTRUCTOR: <escaped-identifier> escaped-identifier ( tokens name -- obj ) ; CONSTRUCTOR: <escaped-identifier> escaped-identifier ( tokens -- obj ) ;
TUPLE: escaped-object < lexed name payload ; TUPLE: escaped-object < lexed name payload ;
CONSTRUCTOR: <escaped-object> escaped-object ( tokens name payload -- obj ) ; CONSTRUCTOR: <escaped-object> escaped-object ( tokens -- obj ) ;
TUPLE: section < lexed tag payload ; TUPLE: section < lexed tag payload ;
CONSTRUCTOR: <section> section ( tokens tag payload -- obj ) ; CONSTRUCTOR: <section> section ( tokens -- obj ) ;
TUPLE: named-section < lexed tag name payload ; TUPLE: named-section < lexed tag name payload ;
CONSTRUCTOR: <named-section> named-section ( tokens tag name payload -- obj ) ; CONSTRUCTOR: <named-section> named-section ( tokens -- obj ) ;
TUPLE: upper-colon < lexed tag payload ; TUPLE: upper-colon < lexed tag payload ;
CONSTRUCTOR: <upper-colon> upper-colon ( tokens -- obj ) ; CONSTRUCTOR: <upper-colon> upper-colon ( tokens -- obj ) ;
TUPLE: lower-colon < lexed tag payload ; TUPLE: lower-colon < lexed tag payload ;
CONSTRUCTOR: <lower-colon> lower-colon ( tokens tag payload -- obj ) ; CONSTRUCTOR: <lower-colon> lower-colon ( tokens -- obj ) ;
TUPLE: matched < lexed tag payload ; TUPLE: matched < lexed tag payload ;
@ -98,11 +98,11 @@ CONSTRUCTOR: <double-paren> double-paren ( tokens -- obj )
TUPLE: double-quote < matched ; TUPLE: double-quote < matched ;
CONSTRUCTOR: <double-quote> double-quote ( tokens tag payload -- obj ) ; CONSTRUCTOR: <double-quote> double-quote ( tokens -- obj ) ;
TUPLE: identifier < lexed name ; TUPLE: identifier < lexed name ;
CONSTRUCTOR: <identifier> identifier ( tokens name -- obj ) ; CONSTRUCTOR: <identifier> identifier ( tokens -- obj ) ;

View File

@ -144,7 +144,7 @@ MACRO:: read-matched ( ch -- quot: ( string n tag -- string n' slice' ) )
2over ?nth-of char: \[ = [ 2over ?nth-of char: \[ = [
[ 1 + ] dip 1 modify-to 2over ?nth-of read-double-matched-bracket [ 1 + ] dip 1 modify-to 2over ?nth-of read-double-matched-bracket
] [ ] [
[ slice-til-eol drop ] dip swap 2array [ slice-til-eol drop ] dip swap 2array <comment>
] if ; ] if ;
: terminator? ( slice -- ? ) : terminator? ( slice -- ? )
@ -166,7 +166,7 @@ MACRO:: read-matched ( ch -- quot: ( string n tag -- string n' slice' ) )
dup [ f unexpected-eof ] unless dup [ f unexpected-eof ] unless
lex-factor lex-factor
] replicate ensure-tokens ! concat ] replicate ensure-tokens ! concat
] dip swap 2array ; ] dip swap 2array <lower-colon> ;
: (strict-upper?) ( string -- ? ) : (strict-upper?) ( string -- ? )
{ {
@ -280,17 +280,16 @@ MACRO:: read-matched ( ch -- quot: ( string n tag -- string n' slice' ) )
swap swap
! What ended the FOO: .. ; form? ! What ended the FOO: .. ; form?
! Remove the ; from the payload if present ! Remove the ; from the payload if present
! XXX: probably can remove this, T: is dumb
! Also in stack effects ( T: int -- ) can be ended by -- and ) ! Also in stack effects ( T: int -- ) can be ended by -- and )
dup ?last { dup ?last {
{ [ dup ";" sequence= ] [ drop unclip-last 3array ] } { [ dup ";" sequence= ] [ drop unclip-last 3array <upper-colon> ] }
{ [ dup ";" tail? ] [ drop unclip-last 3array ] } { [ dup ";" tail? ] [ drop unclip-last 3array <upper-colon> ] }
{ [ dup "--" sequence= ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] } { [ dup "--" sequence= ] [ drop unclip-last -rot 2array <upper-colon> [ rewind-slice ] dip ] }
{ [ dup "]" sequence= ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] } { [ dup "]" sequence= ] [ "omg1" throw drop unclip-last -rot 2array <upper-colon> [ rewind-slice ] dip ] }
{ [ dup "}" sequence= ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] } { [ dup "}" sequence= ] [ "omg2" throw drop unclip-last -rot 2array <upper-colon> [ rewind-slice ] dip ] }
{ [ dup ")" sequence= ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] } ! (n*quot) breaks { [ dup ")" sequence= ] [ B "opg3" throw drop unclip-last -rot 2array <upper-colon> [ rewind-slice ] dip ] } ! (n*quot) breaks
{ [ dup section-close? ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] } { [ dup section-close? ] [ drop unclip-last -rot 2array <upper-colon> [ rewind-slice ] dip ] }
{ [ dup upper-colon? ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] } { [ dup upper-colon? ] [ drop unclip-last -rot 2array <upper-colon> [ rewind-slice ] dip ] }
[ drop 2array <upper-colon> ] [ drop 2array <upper-colon> ]
} cond ; } cond ;
@ -342,7 +341,7 @@ MACRO:: read-matched ( ch -- quot: ( string n tag -- string n' slice' ) )
} cond ; } cond ;
: read-acute ( string n slice -- string n' acute ) : read-acute ( string n slice -- string n' acute )
[ matching-section-delimiter 1array lex-until ] keep swap unclip-last 3array ; [ matching-section-delimiter 1array lex-until ] keep swap unclip-last 3array <section> ;
! #{ } turned off, foo# not turned off ! #{ } turned off, foo# not turned off
: read-turnoff ( string n slice -- string n' obj ) : read-turnoff ( string n slice -- string n' obj )

View File

@ -158,4 +158,4 @@ version 8.36 <= [
] unit-test ] unit-test
! Test that the regexp syntax works. ! Test that the regexp syntax works.
{ t } [ "1234abcd" regexp:R/ ^\d+\w+$/ matches? ] unit-test { t } [ "1234abcd" regexp:R[[^\d+\w+$]] matches? ] unit-test

View File

@ -14,7 +14,7 @@ IN: text-analysis
[ [ blank? ] trim ] map harvest ; [ [ blank? ] trim ] map harvest ;
: split-paragraphs ( str -- seq ) : split-paragraphs ( str -- seq )
R/ \r?\n\r?\n/ re-split trimmed ; re"\r?\n\r?\n" re-split trimmed ;
<< <<
CONSTANT: ABBREVIATIONS { CONSTANT: ABBREVIATIONS {
@ -39,7 +39,7 @@ CONSTANT: ABBREVIATIONS {
: split-sentences ( str -- seq ) : split-sentences ( str -- seq )
! Mark end of sentences with EOS marker ! Mark end of sentences with EOS marker
R/ ((?:[\.?!]|[\r\n]+)(?:\"|\'|\)|\]|\})?)(\s+)/ re[[((?:[\.?!]|[\r\n]+)(?:\"|\'|\)|\]|\})?)(\s+)]]
[ [ ".?!\r\n\"')]}" member? not ] cut-when "\x01" glue ] [ [ ".?!\r\n\"')]}" member? not ] cut-when "\x01" glue ]
re-replace-with re-replace-with
@ -63,46 +63,46 @@ CONSTANT: ABBREVIATIONS {
"\x01" split trimmed ; "\x01" split trimmed ;
CONSTANT: sub-syllable { CONSTANT: sub-syllable {
R/ [^aeiou]e$/ ! give, love, bone, done, ride ... re"[^aeiou]e$" ! give, love, bone, done, ride ...
R/ [aeiou](?:([cfghklmnprsvwz])\1?|ck|sh|[rt]ch)e[ds]$/ re"[aeiou](?:([cfghklmnprsvwz])\1?|ck|sh|[rt]ch)e[ds]$"
! (passive) past participles and 3rd person sing present verbs: ! (passive) past participles and 3rd person sing present verbs:
! bared, liked, called, tricked, bashed, matched ! bared, liked, called, tricked, bashed, matched
R/ .e(?:ly|less(?:ly)?|ness?|ful(?:ly)?|ments?)$/ re".e(?:ly|less(?:ly)?|ness?|ful(?:ly)?|ments?)$"
! nominal, adjectival and adverbial derivatives from -e$ roots: ! nominal, adjectival and adverbial derivatives from -e$ roots:
! absolutely, nicely, likeness, basement, hopeless ! absolutely, nicely, likeness, basement, hopeless
! hopeful, tastefully, wasteful ! hopeful, tastefully, wasteful
R/ ion/ ! action, diction, fiction re"ion" ! action, diction, fiction
R/ [ct]ia[nl]/ ! special(ly), initial, physician, christian re"[ct]ia[nl]" ! special(ly), initial, physician, christian
R/ [^cx]iou/ ! illustrious, NOT spacious, gracious, anxious, noxious re"[^cx]iou" ! illustrious, NOT spacious, gracious, anxious, noxious
R/ sia$/ ! amnesia, polynesia re"sia$" ! amnesia, polynesia
R/ .gue$/ ! dialogue, intrigue, colleague re".gue$" ! dialogue, intrigue, colleague
} }
CONSTANT: add-syllable { CONSTANT: add-syllable {
R/ i[aiou]/ ! alias, science, phobia re"i[aiou]" ! alias, science, phobia
R/ [dls]ien/ ! salient, gradient, transient re"[dls]ien" ! salient, gradient, transient
R/ [aeiouym]ble$/ ! -Vble, plus -mble re"[aeiouym]ble$" ! -Vble, plus -mble
R/ [aeiou]{3}/ ! agreeable re"[aeiou]{3}" ! agreeable
R/ ^mc/ ! mcwhatever re"^mc" ! mcwhatever
R/ ism$/ ! sexism, racism re"ism$" ! sexism, racism
R/ (?:([^aeiouy])\1|ck|mp|ng)le$/ ! bubble, cattle, cackle, sample, angle re"(?:([^aeiouy])\1|ck|mp|ng)le$" ! bubble, cattle, cackle, sample, angle
R/ dnt$/ ! couldn/t re"dnt$" ! couldn/t
R/ [aeiou]y[aeiou]/ ! annoying, layer re"[aeiou]y[aeiou]" ! annoying, layer
} }
: syllables ( str -- n ) : syllables ( str -- n )
dup length 1 = [ drop 1 ] [ dup length 1 = [ drop 1 ] [
>lower char: . swap remove >lower char: . swap remove
[ R/ [aeiouy]+/ count-matches ] [ re"[aeiouy]+" count-matches ]
[ sub-syllable [ matches? ] with count - ] [ sub-syllable [ matches? ] with count - ]
[ add-syllable [ matches? ] with count + ] tri [ add-syllable [ matches? ] with count + ] tri
1 max 1 max
] if ; ] if ;
: split-words ( str -- words ) : split-words ( str -- words )
R/ \b([a-z][a-z\-']*)\b/i all-matching-subseqs ; re:: [[\b([a-z][a-z\-']*)\b]] "i" all-matching-subseqs ;
TUPLE: text-analysis n-paragraphs n-sentences n-chars n-words TUPLE: text-analysis n-paragraphs n-sentences n-chars n-words
n-syllables n-complex-words n-unique-words n-difficult-words ; n-syllables n-complex-words n-unique-words n-difficult-words ;

View File

@ -116,7 +116,7 @@ TUPLE: entry key value ;
: multi-string ( -- parser ) : multi-string ( -- parser )
multi-basic-string multi-literal-string 2choice [ multi-basic-string multi-literal-string 2choice [
"" like "\n" ?head drop "" like "\n" ?head drop
R/ \\[ \t\r\n]*\n[ \t\r\n]*/m "" re-replace re:: [[\\[ \t\r\n]*\n[ \t\r\n]*]] "m" "" re-replace
] action ; ] action ;
: string-parser ( -- parser ) : string-parser ( -- parser )

View File

@ -13,7 +13,7 @@ IN: xkcd
: comic-image ( url -- image ) : comic-image ( url -- image )
http-get nip http-get nip
R/ \/\/imgs\.xkcd\.com\/comics\/[^\.]+\.(png|jpg)/ re"//imgs\.xkcd\.com/comics/[^\.]+\.(png|jpg)"
first-match >string "http:" prepend load-http-image ; first-match >string "http:" prepend load-http-image ;
: comic-image. ( url -- ) : comic-image. ( url -- )