regexp: Add re:: and re[[ ]] and use new syntax.

R/ / is a crazy syntax to support.
- it makes / a special character
- you have to escape forward slashes, which is awkward in any language with R/ / syntax
- re"\d" is not a valid regexp string, so the escapes have to be handled later
-- the only universal escape in a string is either \" or "none" as a default option
modern-harvey3
Doug Coleman 2019-10-29 23:03:33 -05:00
parent a654c7b879
commit 422078e01e
16 changed files with 86 additions and 76 deletions
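
A rough sketch of the old and new forms side by side, modeled on lines appearing in this diff. The snippet is illustrative only: it assumes USING: regexp ; is in scope, and the example strings and printed results are not from the repository.

USING: regexp ;

! old slash-delimited syntax: / is special inside the pattern
"1234abcd" R/ ^\d+\w+$/ matches? .                  ! t
! new delimited forms avoid the special / character
"1234abcd" re[[^\d+\w+$]] matches? .                ! t
"bljhasflsda" re".*a" matches? .                    ! t
! re:: also takes an options string, e.g. "i" for case-insensitive matching
"AGGGTAAA" re:: "agggtaaa|tttaccct" "i" matches? .  ! t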

View File

@@ -7,10 +7,10 @@ unicode multiline ;
IN: globs
: not-path-separator ( -- sep )
os windows? R/ [^\/\\]/ R/ [^\/]/ ? ; foldable
os windows? re"[^\/\\]" re"[^\/]" ? ; foldable
: wild-path-separator ( -- sep )
os windows? R/ [^\/\\][\/\\]|[^\/\\]/ R/ [^\/][\/]|[^\/]/ ? ; foldable
os windows? re"[^\/\\][\/\\]|[^\/\\]" re"[^\/][\/]|[^\/]" ? ; foldable
EBNF: <glob> [=[

View File

@@ -10,7 +10,7 @@ IN: regexp.combinators.tests
{ f f f } [ "food" "ibar" "ba" [ strings matches? ] tri@ ] unit-test
: conj ( -- regexp )
{ R/ .*a/ R/ b.*/ } <and> ;
{ re".*a" re"b.*" } <and> ;
{ t } [ "bljhasflsda" conj matches? ] unit-test
{ f } [ "bsdfdfs" conj matches? ] unit-test

View File

@@ -13,7 +13,7 @@ IN: regexp.combinators
PRIVATE>
CONSTANT: <nothing> R/ (?~.*)/s
CONSTANT: <nothing> re:: "(?~.*)" "s"
: <literal> ( string -- regexp )
[ "\\Q" "\\E" surround ] [ <concatenation> ] bi make-regexp ; foldable

View File

@@ -229,6 +229,16 @@ SYNTAX: \R[=[ "]=]" parse-optioned-regexp ;
SYNTAX: \R[==[ "]==]" parse-optioned-regexp ;
SYNTAX: \R[===[ "]===]" parse-optioned-regexp ;
SYNTAX: \R[====[ "]====]" parse-optioned-regexp ;
SYNTAX: \re" "\"" parse-optioned-regexp ;
SYNTAX: \re[[ "]]" parse-optioned-regexp ;
SYNTAX: \re[=[ "]=]" parse-optioned-regexp ;
SYNTAX: \re[==[ "]==]" parse-optioned-regexp ;
SYNTAX: \re[===[ "]===]" parse-optioned-regexp ;
SYNTAX: \re[====[ "]====]" parse-optioned-regexp ;
SYNTAX: \re: scan-object "" <optioned-regexp> suffix! ;
SYNTAX: \re:: scan-object scan-object <optioned-regexp> suffix! ;
SYNTAX: \R(( "))" parse-optioned-regexp ;
SYNTAX: \R(=( ")=)" parse-optioned-regexp ;
SYNTAX: \R(==( ")==)" parse-optioned-regexp ;
@@ -240,6 +250,7 @@ SYNTAX: \R{=={ "}==}" parse-optioned-regexp ;
SYNTAX: \R{==={ "}===}" parse-optioned-regexp ;
SYNTAX: \R{===={ "}====}" parse-optioned-regexp ;
USE: vocabs.loader
{ "prettyprint" "regexp" } "regexp.prettyprint" require-when

View File

@@ -61,11 +61,11 @@ IN: validators
! From http://www.regular-expressions.info/email.html
320 v-max-length
"e-mail"
R/ [A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}/i
re:: [[[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}]] "i"
v-regexp ;
: v-url ( str -- str )
"url" R/ (?:ftp|http|https):\/\/\S+/ v-regexp ;
"url" re[[(?:ftp|http|https):\/\/\S+]] v-regexp ;
: v-captcha ( str -- str )
dup empty? [ "must remain blank" throw ] unless ;

View File

@@ -7,19 +7,19 @@ IN: benchmark.regex-dna
! Based on http://shootout.alioth.debian.org/gp4/benchmark.php?test=regexdna&lang=ruby&id=1
: strip-line-breaks ( string -- string' )
R/ >.*\n|\n/ "" re-replace ;
re">.*\n|\n" "" re-replace ;
: count-patterns ( string -- )
{
R/ agggtaaa|tttaccct/i
R/ [cgt]gggtaaa|tttaccc[acg]/i
R/ a[act]ggtaaa|tttacc[agt]t/i
R/ ag[act]gtaaa|tttac[agt]ct/i
R/ agg[act]taaa|ttta[agt]cct/i
R/ aggg[acg]aaa|ttt[cgt]ccct/i
R/ agggt[cgt]aa|tt[acg]accct/i
R/ agggta[cgt]a|t[acg]taccct/i
R/ agggtaa[cgt]|[acg]ttaccct/i
re:: "agggtaaa|tttaccct" "i"
re:: "[cgt]gggtaaa|tttaccc[acg]" "i"
re:: "a[act]ggtaaa|tttacc[agt]t" "i"
re:: "ag[act]gtaaa|tttac[agt]ct" "i"
re:: "agg[act]taaa|ttta[agt]cct" "i"
re:: "aggg[acg]aaa|ttt[cgt]ccct" "i"
re:: "agggt[cgt]aa|tt[acg]accct" "i"
re:: "agggta[cgt]a|t[acg]taccct" "i"
re:: "agggtaa[cgt]|[acg]ttaccct" "i"
} [
[ raw>> write bl ]
[ count-matches number>string print ]
@@ -28,17 +28,17 @@ IN: benchmark.regex-dna
: do-replacements ( string -- string' )
{
{ R/ B/ "(c|g|t)" }
{ R/ D/ "(a|g|t)" }
{ R/ H/ "(a|c|t)" }
{ R/ K/ "(g|t)" }
{ R/ M/ "(a|c)" }
{ R/ N/ "(a|c|g|t)" }
{ R/ R/ "(a|g)" }
{ R/ S/ "(c|t)" }
{ R/ V/ "(a|c|g)" }
{ R/ W/ "(a|t)" }
{ R/ Y/ "(c|t)" }
{ re"B" "(c|g|t)" }
{ re"D" "(a|g|t)" }
{ re"H" "(a|c|t)" }
{ re"K" "(g|t)" }
{ re"M" "(a|c)" }
{ re"N" "(a|c|g|t)" }
{ re"R" "(a|g)" }
{ re"S" "(c|t)" }
{ re"V" "(a|c|g)" }
{ re"W" "(a|t)" }
{ re"Y" "(c|t)" }
} [ re-replace ] assoc-each ;
SYMBOL: ilen

View File

@@ -9,8 +9,8 @@ IN: benchmark.regexp
20,000 <iota> [ number>string ] map
200 <iota> [ 1 + char: a <string> ] map
'[
_ R/ \d+/ [ matches? ] curry all? t assert=
_ R/ [a]+/ [ matches? ] curry all? t assert=
_ re[[\d+]] [ matches? ] curry all? t assert=
_ re[[[a]+]] [ matches? ] curry all? t assert=
] times ;
MAIN: regexp-benchmark

View File

@@ -869,7 +869,7 @@ CONSTANT: emoji H{
}
: emojify ( str -- str' )
R/ :([^:])+:/ [ >string emoji at ] re-replace-with ;
R[[:([^:])+:]] [ >string emoji at ] re-replace-with ;
: emojify-main ( -- )
command-line get [

View File

@@ -2352,7 +2352,7 @@ CONSTANT: html5 H{
"#" ?head [ numeric-charref ] [ named-charref ] if ;
CONSTANT: re-charref
R/ &(#[0-9]+|#[xX][0-9a-fA-F]+|[^\t\n\f <&#;]{1,32});?/
re"&(#[0-9]+|#[xX][0-9a-fA-F]+|[^\t\n\f <&#;]{1,32});?"
PRIVATE>

View File

@@ -87,7 +87,7 @@ M: text-printer print-comment-tag drop ;
M: text-printer print-dtd-tag drop ;
: collapse-spaces ( text -- text' )
preformatted? get [ R/ \s+/ " " re-replace ] unless ;
preformatted? get [ re"\s+" " " re-replace ] unless ;
M: text-printer print-text-tag
script? get style? get or

View File

@@ -29,25 +29,25 @@ M: lexed length tokens>> length ;
TUPLE: comment < lexed payload ;
CONSTRUCTOR: <comment> comment ( tokens payload -- obj ) ;
CONSTRUCTOR: <comment> comment ( tokens -- obj ) ;
TUPLE: escaped-identifier < lexed name ;
CONSTRUCTOR: <escaped-identifier> escaped-identifier ( tokens name -- obj ) ;
CONSTRUCTOR: <escaped-identifier> escaped-identifier ( tokens -- obj ) ;
TUPLE: escaped-object < lexed name payload ;
CONSTRUCTOR: <escaped-object> escaped-object ( tokens name payload -- obj ) ;
CONSTRUCTOR: <escaped-object> escaped-object ( tokens -- obj ) ;
TUPLE: section < lexed tag payload ;
CONSTRUCTOR: <section> section ( tokens tag payload -- obj ) ;
CONSTRUCTOR: <section> section ( tokens -- obj ) ;
TUPLE: named-section < lexed tag name payload ;
CONSTRUCTOR: <named-section> named-section ( tokens tag name payload -- obj ) ;
CONSTRUCTOR: <named-section> named-section ( tokens -- obj ) ;
TUPLE: upper-colon < lexed tag payload ;
CONSTRUCTOR: <upper-colon> upper-colon ( tokens -- obj ) ;
TUPLE: lower-colon < lexed tag payload ;
CONSTRUCTOR: <lower-colon> lower-colon ( tokens tag payload -- obj ) ;
CONSTRUCTOR: <lower-colon> lower-colon ( tokens -- obj ) ;
TUPLE: matched < lexed tag payload ;
@@ -98,11 +98,11 @@ CONSTRUCTOR: <double-paren> double-paren ( tokens -- obj )
TUPLE: double-quote < matched ;
CONSTRUCTOR: <double-quote> double-quote ( tokens tag payload -- obj ) ;
CONSTRUCTOR: <double-quote> double-quote ( tokens -- obj ) ;
TUPLE: identifier < lexed name ;
CONSTRUCTOR: <identifier> identifier ( tokens name -- obj ) ;
CONSTRUCTOR: <identifier> identifier ( tokens -- obj ) ;

View File

@@ -144,7 +144,7 @@ MACRO:: read-matched ( ch -- quot: ( string n tag -- string n' slice' ) )
2over ?nth-of char: \[ = [
[ 1 + ] dip 1 modify-to 2over ?nth-of read-double-matched-bracket
] [
[ slice-til-eol drop ] dip swap 2array
[ slice-til-eol drop ] dip swap 2array <comment>
] if ;
: terminator? ( slice -- ? )
@@ -166,7 +166,7 @@ MACRO:: read-matched ( ch -- quot: ( string n tag -- string n' slice' ) )
dup [ f unexpected-eof ] unless
lex-factor
] replicate ensure-tokens ! concat
] dip swap 2array ;
] dip swap 2array <lower-colon> ;
: (strict-upper?) ( string -- ? )
{
@@ -280,17 +280,16 @@ MACRO:: read-matched ( ch -- quot: ( string n tag -- string n' slice' ) )
swap
! What ended the FOO: .. ; form?
! Remove the ; from the payload if present
! XXX: probably can remove this, T: is dumb
! Also in stack effects ( T: int -- ) can be ended by -- and )
dup ?last {
{ [ dup ";" sequence= ] [ drop unclip-last 3array ] }
{ [ dup ";" tail? ] [ drop unclip-last 3array ] }
{ [ dup "--" sequence= ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] }
{ [ dup "]" sequence= ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] }
{ [ dup "}" sequence= ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] }
{ [ dup ")" sequence= ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] } ! (n*quot) breaks
{ [ dup section-close? ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] }
{ [ dup upper-colon? ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] }
{ [ dup ";" sequence= ] [ drop unclip-last 3array <upper-colon> ] }
{ [ dup ";" tail? ] [ drop unclip-last 3array <upper-colon> ] }
{ [ dup "--" sequence= ] [ drop unclip-last -rot 2array <upper-colon> [ rewind-slice ] dip ] }
{ [ dup "]" sequence= ] [ "omg1" throw drop unclip-last -rot 2array <upper-colon> [ rewind-slice ] dip ] }
{ [ dup "}" sequence= ] [ "omg2" throw drop unclip-last -rot 2array <upper-colon> [ rewind-slice ] dip ] }
{ [ dup ")" sequence= ] [ B "opg3" throw drop unclip-last -rot 2array <upper-colon> [ rewind-slice ] dip ] } ! (n*quot) breaks
{ [ dup section-close? ] [ drop unclip-last -rot 2array <upper-colon> [ rewind-slice ] dip ] }
{ [ dup upper-colon? ] [ drop unclip-last -rot 2array <upper-colon> [ rewind-slice ] dip ] }
[ drop 2array <upper-colon> ]
} cond ;
@@ -342,7 +341,7 @@ MACRO:: read-matched ( ch -- quot: ( string n tag -- string n' slice' ) )
} cond ;
: read-acute ( string n slice -- string n' acute )
[ matching-section-delimiter 1array lex-until ] keep swap unclip-last 3array ;
[ matching-section-delimiter 1array lex-until ] keep swap unclip-last 3array <section> ;
! #{ } turned off, foo# not turned off
: read-turnoff ( string n slice -- string n' obj )

View File

@@ -158,4 +158,4 @@ version 8.36 <= [
] unit-test
! Test that the regexp syntax works.
{ t } [ "1234abcd" regexp:R/ ^\d+\w+$/ matches? ] unit-test
{ t } [ "1234abcd" regexp:R[[^\d+\w+$]] matches? ] unit-test

View File

@@ -14,7 +14,7 @@ IN: text-analysis
[ [ blank? ] trim ] map harvest ;
: split-paragraphs ( str -- seq )
R/ \r?\n\r?\n/ re-split trimmed ;
re"\r?\n\r?\n" re-split trimmed ;
<<
CONSTANT: ABBREVIATIONS {
@@ -39,7 +39,7 @@ CONSTANT: ABBREVIATIONS {
: split-sentences ( str -- seq )
! Mark end of sentences with EOS marker
R/ ((?:[\.?!]|[\r\n]+)(?:\"|\'|\)|\]|\})?)(\s+)/
re[[((?:[\.?!]|[\r\n]+)(?:\"|\'|\)|\]|\})?)(\s+)]]
[ [ ".?!\r\n\"')]}" member? not ] cut-when "\x01" glue ]
re-replace-with
@@ -63,46 +63,46 @@ CONSTANT: ABBREVIATIONS {
"\x01" split trimmed ;
CONSTANT: sub-syllable {
R/ [^aeiou]e$/ ! give, love, bone, done, ride ...
R/ [aeiou](?:([cfghklmnprsvwz])\1?|ck|sh|[rt]ch)e[ds]$/
re"[^aeiou]e$" ! give, love, bone, done, ride ...
re"[aeiou](?:([cfghklmnprsvwz])\1?|ck|sh|[rt]ch)e[ds]$"
! (passive) past participles and 3rd person sing present verbs:
! bared, liked, called, tricked, bashed, matched
R/ .e(?:ly|less(?:ly)?|ness?|ful(?:ly)?|ments?)$/
re".e(?:ly|less(?:ly)?|ness?|ful(?:ly)?|ments?)$"
! nominal, adjectival and adverbial derivatives from -e$ roots:
! absolutely, nicely, likeness, basement, hopeless
! hopeful, tastefully, wasteful
R/ ion/ ! action, diction, fiction
R/ [ct]ia[nl]/ ! special(ly), initial, physician, christian
R/ [^cx]iou/ ! illustrious, NOT spacious, gracious, anxious, noxious
R/ sia$/ ! amnesia, polynesia
R/ .gue$/ ! dialogue, intrigue, colleague
re"ion" ! action, diction, fiction
re"[ct]ia[nl]" ! special(ly), initial, physician, christian
re"[^cx]iou" ! illustrious, NOT spacious, gracious, anxious, noxious
re"sia$" ! amnesia, polynesia
re".gue$" ! dialogue, intrigue, colleague
}
CONSTANT: add-syllable {
R/ i[aiou]/ ! alias, science, phobia
R/ [dls]ien/ ! salient, gradient, transient
R/ [aeiouym]ble$/ ! -Vble, plus -mble
R/ [aeiou]{3}/ ! agreeable
R/ ^mc/ ! mcwhatever
R/ ism$/ ! sexism, racism
R/ (?:([^aeiouy])\1|ck|mp|ng)le$/ ! bubble, cattle, cackle, sample, angle
R/ dnt$/ ! couldn/t
R/ [aeiou]y[aeiou]/ ! annoying, layer
re"i[aiou]" ! alias, science, phobia
re"[dls]ien" ! salient, gradient, transient
re"[aeiouym]ble$" ! -Vble, plus -mble
re"[aeiou]{3}" ! agreeable
re"^mc" ! mcwhatever
re"ism$" ! sexism, racism
re"(?:([^aeiouy])\1|ck|mp|ng)le$" ! bubble, cattle, cackle, sample, angle
re"dnt$" ! couldn/t
re"[aeiou]y[aeiou]" ! annoying, layer
}
: syllables ( str -- n )
dup length 1 = [ drop 1 ] [
>lower char: . swap remove
[ R/ [aeiouy]+/ count-matches ]
[ re"[aeiouy]+" count-matches ]
[ sub-syllable [ matches? ] with count - ]
[ add-syllable [ matches? ] with count + ] tri
1 max
] if ;
: split-words ( str -- words )
R/ \b([a-z][a-z\-']*)\b/i all-matching-subseqs ;
re:: [[\b([a-z][a-z\-']*)\b]] "i" all-matching-subseqs ;
TUPLE: text-analysis n-paragraphs n-sentences n-chars n-words
n-syllables n-complex-words n-unique-words n-difficult-words ;

View File

@@ -116,7 +116,7 @@ TUPLE: entry key value ;
: multi-string ( -- parser )
multi-basic-string multi-literal-string 2choice [
"" like "\n" ?head drop
R/ \\[ \t\r\n]*\n[ \t\r\n]*/m "" re-replace
re:: [[\\[ \t\r\n]*\n[ \t\r\n]*]] "m" "" re-replace
] action ;
: string-parser ( -- parser )

View File

@@ -13,7 +13,7 @@ IN: xkcd
: comic-image ( url -- image )
http-get nip
R/ \/\/imgs\.xkcd\.com\/comics\/[^\.]+\.(png|jpg)/
re"//imgs\.xkcd\.com/comics/[^\.]+\.(png|jpg)"
first-match >string "http:" prepend load-http-image ;
: comic-image. ( url -- )