From f1197fe327aee06e0749f45ea125f1df5548e99f Mon Sep 17 00:00:00 2001 From: Slava Pestov Date: Mon, 16 Mar 2009 03:29:04 -0500 Subject: [PATCH 01/16] peg.ebnf gives better stack effects --- basis/peg/ebnf/ebnf.factor | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/basis/peg/ebnf/ebnf.factor b/basis/peg/ebnf/ebnf.factor index 91af91b3a1..db29ce1ee7 100644 --- a/basis/peg/ebnf/ebnf.factor +++ b/basis/peg/ebnf/ebnf.factor @@ -530,7 +530,7 @@ M: ebnf-non-terminal (transform) ( ast -- parser ) : EBNF: reset-tokenizer CREATE-WORD dup ";EBNF" parse-multiline-string - ebnf>quot swapd 1 1 define-declared "ebnf-parser" set-word-prop + ebnf>quot swapd (( input -- ast )) define-declared "ebnf-parser" set-word-prop reset-tokenizer ; parsing From cac26d7b44208d1ed6f4f8a48878556919a16f26 Mon Sep 17 00:00:00 2001 From: Slava Pestov Date: Mon, 16 Mar 2009 03:29:16 -0500 Subject: [PATCH 02/16] peg-lexer: fix help lint --- extra/peg-lexer/peg-lexer-docs.factor | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extra/peg-lexer/peg-lexer-docs.factor b/extra/peg-lexer/peg-lexer-docs.factor index 22e620235d..18a458e8ff 100644 --- a/extra/peg-lexer/peg-lexer-docs.factor +++ b/extra/peg-lexer/peg-lexer-docs.factor @@ -1,14 +1,14 @@ USING: peg.ebnf help.syntax help.markup strings ; IN: peg-lexer -ABOUT: "peg-lexer" HELP: ON-BNF: { $syntax "ON-BNF: word ... ;ON-BNF" } { $description "Creates a parsing word using a parser for lexer control, adding the resulting ast to the stack. Parser syntax is as in " { $link POSTPONE: EBNF: } } ; HELP: create-bnf -{ $values { "word" string } { "parser" parser } } +{ $values { "name" string } { "parser" parser } } { $description "Runtime equivalent of " { $link POSTPONE: ON-BNF: } " also useful with manually constructed parsers." } ; HELP: factor +{ $values { "input" string } { "ast" "a sequence of tokens" } } { $description "Tokenizer that acts like standard factor lexer, separating tokens by whitespace." 
} ; \ No newline at end of file From e697fe8a90fce07b5f66ef8831fc33dae157b275 Mon Sep 17 00:00:00 2001 From: Slava Pestov Date: Mon, 16 Mar 2009 03:29:29 -0500 Subject: [PATCH 03/16] Remove Farkup caching from Wiki now that Farkup parser is fast --- extra/webapps/wiki/view.xml | 2 +- extra/webapps/wiki/wiki.factor | 9 ++------- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/extra/webapps/wiki/view.xml b/extra/webapps/wiki/view.xml index e3774bbe0b..38d9d39d55 100644 --- a/extra/webapps/wiki/view.xml +++ b/extra/webapps/wiki/view.xml @@ -5,7 +5,7 @@
- +

diff --git a/extra/webapps/wiki/wiki.factor b/extra/webapps/wiki/wiki.factor index 07fbbe0596..2341b020a8 100644 --- a/extra/webapps/wiki/wiki.factor +++ b/extra/webapps/wiki/wiki.factor @@ -47,7 +47,7 @@ article "ARTICLES" { :

( title -- article ) article new swap >>title ; -TUPLE: revision id title author date content parsed description ; +TUPLE: revision id title author date content description ; revision "REVISIONS" { { "id" "ID" INTEGER +db-assigned-id+ } @@ -55,7 +55,6 @@ revision "REVISIONS" { { "author" "AUTHOR" { VARCHAR 256 } +not-null+ } ! uid { "date" "DATE" TIMESTAMP +not-null+ } { "content" "CONTENT" TEXT +not-null+ } - { "parsed" "PARSED" FACTOR-BLOB +not-null+ } ! Farkup AST { "description" "DESCRIPTION" TEXT } } define-persistent @@ -72,9 +71,6 @@ M: revision feed-entry-url id>> revision-url ; : ( id -- revision ) revision new swap >>id ; -: compute-html ( revision -- ) - dup content>> parse-farkup >>parsed drop ; - : validate-title ( -- ) { { "title" [ v-one-line ] } } validate-params ; @@ -141,13 +137,12 @@ M: revision feed-entry-url id>> revision-url ; [ title>> ] [ id>> ] bi article boa insert-tuple ; : add-revision ( revision -- ) - [ compute-html ] [ insert-tuple ] [ dup title>>
select-tuple [ amend-article ] [ add-article ] if* ] - tri ; + bi ; : ( -- action ) From 27a68b8aa43b9382b0077faefdee7cd382b61c5b Mon Sep 17 00:00:00 2001 From: Slava Pestov Date: Mon, 16 Mar 2009 03:39:42 -0500 Subject: [PATCH 04/16] Update meta-data --- basis/farkup/authors.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/basis/farkup/authors.txt b/basis/farkup/authors.txt index 5674120196..a4a77d97e9 100644 --- a/basis/farkup/authors.txt +++ b/basis/farkup/authors.txt @@ -1,2 +1,2 @@ Doug Coleman -Slava Pestov +Daniel Ehrenberg From 58d997de5c9053ff6d987d3ac2f4f081e747dc74 Mon Sep 17 00:00:00 2001 From: Slava Pestov Date: Mon, 16 Mar 2009 05:00:27 -0500 Subject: [PATCH 05/16] Refactor regexp.compiler to not all with-compilation-unit so much; benchmark.regex-dna loads about twice as fast now --- basis/regexp/compiler/compiler.factor | 17 ++++++----------- basis/regexp/regexp.factor | 17 +++++++---------- 2 files changed, 13 insertions(+), 21 deletions(-) diff --git a/basis/regexp/compiler/compiler.factor b/basis/regexp/compiler/compiler.factor index b55cab6294..a0646002f9 100644 --- a/basis/regexp/compiler/compiler.factor +++ b/basis/regexp/compiler/compiler.factor @@ -104,13 +104,11 @@ C: box transitions>quot ; : states>code ( words dfa -- ) - [ - '[ - dup _ word>quot - (( last-match index string -- ? )) - define-declared - ] each - ] with-compilation-unit ; + '[ + dup _ word>quot + (( last-match index string -- ? )) + define-declared + ] each ; : states>words ( dfa -- words dfa ) dup transitions>> keys [ gensym ] H{ } map>assoc @@ -123,12 +121,9 @@ C: box PRIVATE> -: simple-define-temp ( quot effect -- word ) - [ define-temp ] with-compilation-unit ; - : dfa>word ( dfa -- quot ) dfa>main-word execution-quot '[ drop [ f ] 2dip @ ] - (( start-index string regexp -- i/f )) simple-define-temp ; + (( start-index string regexp -- i/f )) define-temp ; : dfa>shortest-word ( dfa -- word ) t shortest? 
[ dfa>word ] with-variable ; diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor index 29f7e3e84e..63a2f25885 100644 --- a/basis/regexp/regexp.factor +++ b/basis/regexp/regexp.factor @@ -4,7 +4,7 @@ USING: accessors combinators kernel kernel.private math sequences sequences.private strings sets assocs prettyprint.backend prettyprint.custom make lexer namespaces parser arrays fry locals regexp.parser splitting sorting regexp.ast regexp.negation -regexp.compiler words call call.private math.ranges ; +regexp.compiler compiler.units words call call.private math.ranges ; IN: regexp TUPLE: regexp @@ -35,7 +35,7 @@ M: lookbehind question>quot ! Returns ( index string -- ? ) : match-index-from ( i string regexp -- index/f ) ! This word is unsafe. It assumes that i is a fixnum ! and that string is a string. - dup dfa>> execute-unsafe( index string regexp -- i/f ) ; + dup dfa>> execute-unsafe( index string regexp -- i/f ) ; inline GENERIC: end/start ( string regexp -- end start ) M: regexp end/start drop length 0 ; @@ -129,31 +129,28 @@ PRIVATE> GENERIC: compile-regexp ( regex -- regexp ) : regexp-initial-word ( i string regexp -- i/f ) - compile-regexp match-index-from ; + [ compile-regexp ] with-compilation-unit match-index-from ; -: do-compile-regexp ( regexp -- regexp ) +M: regexp compile-regexp ( regexp -- regexp ) dup '[ dup \ regexp-initial-word = [ drop _ get-ast ast>dfa dfa>word ] when ] change-dfa ; -M: regexp compile-regexp ( regexp -- regexp ) - do-compile-regexp ; - M: reverse-regexp compile-regexp ( regexp -- regexp ) - t backwards? [ do-compile-regexp ] with-variable ; + t backwards? [ call-next-method ] with-variable ; DEFER: compile-next-match : next-initial-word ( i string regexp -- i start end string ) - compile-next-match do-next-match ; + [ compile-next-match ] with-compilation-unit do-next-match ; : compile-next-match ( regexp -- regexp ) dup '[ dup \ next-initial-word = [ drop _ [ compile-regexp dfa>> def>> ] [ reverse-regexp? 
] bi '[ { array-capacity string regexp } declare _ _ next-match ] - (( i string regexp -- i start end string )) simple-define-temp + (( i string regexp -- i start end string )) define-temp ] when ] change-next-match ; From baf2cc6c5a55188c86fbb8012a9dedd9b1010eee Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Mon, 16 Mar 2009 07:08:35 -0500 Subject: [PATCH 06/16] fix bitmap loading of odd-width files --- basis/images/bitmap/bitmap.factor | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/basis/images/bitmap/bitmap.factor b/basis/images/bitmap/bitmap.factor index ffe3adff48..8209159a8e 100755 --- a/basis/images/bitmap/bitmap.factor +++ b/basis/images/bitmap/bitmap.factor @@ -83,14 +83,15 @@ ERROR: bmp-not-supported n ; :: fixup-color-index ( loading-bitmap -- loading-bitmap ) loading-bitmap width>> :> width + width 3 * :> width*3 loading-bitmap height>> abs :> height loading-bitmap color-index>> length :> color-index-length - height 3 * :> height*3 - color-index-length width height*3 * - height*3 /i :> misaligned - misaligned 0 > [ + color-index-length height /i :> stride + color-index-length width*3 height * - height /i :> padding + padding 0 > [ loading-bitmap [ - loading-bitmap width>> misaligned + 3 * - [ 3 misaligned * head* ] map concat + stride + [ width*3 head-slice ] map concat ] change-color-index ] [ loading-bitmap From bb33894d8af2a4216660b62d4446bb233171baa8 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Mon, 16 Mar 2009 07:11:46 -0500 Subject: [PATCH 07/16] check in more test images --- basis/images/test-images/40red24bit.bmp | Bin 0 -> 4854 bytes basis/images/test-images/41red24bit.bmp | Bin 0 -> 5014 bytes basis/images/test-images/42red24bit.bmp | Bin 0 -> 5174 bytes basis/images/test-images/43red24bit.bmp | Bin 0 -> 5334 bytes basis/images/test-images/elephants.tiff | Bin 0 -> 8466 bytes 5 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 basis/images/test-images/40red24bit.bmp create mode 100644 
basis/images/test-images/41red24bit.bmp create mode 100644 basis/images/test-images/42red24bit.bmp create mode 100644 basis/images/test-images/43red24bit.bmp create mode 100644 basis/images/test-images/elephants.tiff diff --git a/basis/images/test-images/40red24bit.bmp b/basis/images/test-images/40red24bit.bmp new file mode 100644 index 0000000000000000000000000000000000000000..5e694559c681682f57c889475a0499fddefb6d86 GIT binary patch literal 4854 zcmeHF+YNvq5W^Mxas(%E0RP?Dsr=Oo7;@oNBbXR10Rl&9?+Wv>^~SbI=24B#3^u-= z=XP^WI~yH%p99A1<^EjCqFZP#s{-KmQCU}LA>f`gq=f+LUU?m8BLpOUhPUbg)aCoe ze}Mlg@B|tX;N2G9st0%>q0&8>j}JD~)tvRTkiplhb^SWGfV2?sC0!}@xq2OWa?ZU@ g9M|dTb&PBq(ORyAw){CZt((IzP6y}!9r%v}2UeN#FaQ7m literal 0 HcmV?d00001 diff --git a/basis/images/test-images/41red24bit.bmp b/basis/images/test-images/41red24bit.bmp new file mode 100644 index 0000000000000000000000000000000000000000..6599dcc10772ac1cd46166e13a10e10f1698ea56 GIT binary patch literal 5014 zcmeH_-3`Jp41~=Lyf6YRzyQ3nL#OJxB}${ju8b~D4F$CJa~hrhXdm05@MbW2!rx{Nw-Xk;V}REUrC~vM zO9H`M@@N@6O0b}5$tajh9xa1NVL{W9k-|J_Z7$ie&78ff{5px6s=)`nRX_eQyg7}I zSU+pEv4>|*O%>#UgEyBm#n^jzS#Tb}8}{!hP`vlZnR>(9gv+Hkn7d#!i3D>KE|+3@ E0TpDtwg3PC literal 0 HcmV?d00001 diff --git a/basis/images/test-images/42red24bit.bmp b/basis/images/test-images/42red24bit.bmp new file mode 100644 index 0000000000000000000000000000000000000000..e95a4f75f5335c9819d5bd04be732d56b2e3605e GIT binary patch literal 5174 zcmeHE!41MN3=BK)UJ{*^hxoriX%%-P5KaJ2)tO}jXf7pz9@cvXX{#s@U-dhYh($ literal 0 HcmV?d00001 diff --git a/basis/images/test-images/elephants.tiff b/basis/images/test-images/elephants.tiff new file mode 100644 index 0000000000000000000000000000000000000000..f462a0c0432f8c9e0c3138236bc698c4eb081979 GIT binary patch literal 8466 zcmb`LcUTiayXa@r10f`#w~zoTB2Bv3LKOj(rhuaK-aD^VJJ!&P ziim)q2n3KOAR;I#%3UnyJNMr6Jolfwd9wSPci*=2PG&N@V+T+IKyVXFO@u~6E&%{) zcvhuaa$$YlH`-iUY zxk1N0ZnXO6WoW0ob-!aD(EY*f+mVJ#uO@Y>P5i#a-RxG4;-9vM!b|LD7cJhmcE#5Z 
z+z@xn5|VLg^cdzJMGs+P?!H}rZ?mZXcAuBSpF|6?oJB5md0yN5j(EW}zoRFpcQV4z z2Cw#DKR-!ORG2~#I@gw5Xsketdau*|D6e~1AUPu9-Ilho+eS0z;gf6+Ljkdalkk`iky3@0fwz&u<16>r6lK?qg&t5Oa25>;gv!M!qeA97QHvM+4j|+Rf@eGDfANeGd9|nwymp<{&hR5ZdbVTS&{eGo%KgO zuT9>KeE7jvE2ZXw|BqE7rd5Z`M;@koba!{jWoZXyE4?jy;#yg6{kx)m)=E2vK1g|A?{(Vq(WcogfxI!+dd&Jm z-;3_zr}q7Nedgxs$1g6-M)iMes(yQ3JL%==hRez?()Z=MzlrMWemlFk86{iw>yh_- zUq``w_{ZK~4Be&4otj_c%3InPr2)Uldk{W_Wow4QOT@!lK#8k`Pt&=`7PFso-c zE=+PeRgre$h(e+npM(v?h}c3w$RW?CyFc^GY^REI!6ylbfN5xaD#Q!l!CJ2)Oj=Z( zD<~1p;Su;u6`KXPPH~%w%p*h+3WXAXCX>LF08pJw(rk+K5ReAl<|CJ^KyJbkHZdfB zeJUdYOJ1lan@wnr9({$e;kXe=#b{%3F(Ig3JCQ`$($p)+rdd)d<6G6o6kE?hZzA%q z)LN%zcZ;zYQY`Y+W`y*7MwUp8?tN;%Pt3av-nlEM2J`1SPGDq_8S0ikPdCz{KS~){ zZAyh`t5Zk-M^95uhfpNQhagi-Sc7FR?__{=xC_RPNUMHaqhf(fJi{L!;v!TxUPCwF zT4F;8Vvvd?eRwdK@FqaNCQVX0zQmBg0)h#OwL)|=D%8$T^AOjSaMuV=T$`hVzV}$b z)xkD%j>wp*(us)3!0C z`IR8O-Yoro*v^URt!Op_>NYqlRUbWxVFZMBx4v^)mm6N*al;-N(z7PX0T2v2_h&PM zlueoY@a%jviBqYPJe^p1hgzRrAS%Fpe)}5dlo_!yhrg-PNnofaF_CPiX$0(0TxB5m za#XqgoPC*gP29n|Aj-TnwrDjrY`1a$0ySw_U+7TTlez z8mEQyiD1o|a7gw6g4KOtQ*k#DgJ{*nL5gC1&jcT+4@Q9e4Gs5N_}D&?0_q1S4SEN~ z49yG^&xj7Zcd02XJ_^vR``U@^G*Q<|gVc~DvT4sO-=IT4+xF+q$Wm1qBLyUarz4G; zjS}EKL-Ny!v^v&2LB7Whx&DsoCmqEGhe~+_O%B4v9w-_jVDO|jQV>%>)_wel-okx^ zfy+iDOVnVmC6|zTucy{f9iPhWP(Wl8>!QOWhoEchiuLqcyf3~6N9#B6#zw5!Ref+jb2z_R7WErS1`Kk7oN8sQli zGBG2d_|Vwe71hYtoMyF~I^WiG9&%Wct>Xn+s|EI^&Zp%X*O`WRM~BZUR0 zLdzYWR;Ox=o(htw-llxz%Eobd2W7)7=9sQ|>pMxe=$*5EeY;Vq+XT4m@Mo41vinI3 z(!~S~9QdVz+h0j`ou?sbN}xHpK6jAfi2R`r@Tz=kwQx`hh1Lja@+Pu6WOP5l`3lISbXAa zDuIEa(sOV?lFpecYSBg+D9mxP3OQe7IOBK>Xj$7T;hOiI>jmiDLUb&GvnBv2W<@&=D7P)(BL zAR7zxAx?*wfen(qggBM~RRxJK=P(Rne6b+*N*c+fNP?~^z}+*BaH7Uvt5DCE9vlaB zGjbKc(A&jbjzIJ|*+HCa{D}A`3Gy=_wKEjM`zq>^z5GF2Dl2QQ(g(v^7IjIx35{f% zE6y~xh_6o-uR1P4?-X;jL9%{G;=N=i_7-Q~%S*mLI(-`*d=14Jlz}=SpopBgCdn(7 zqATHe>qXYvi(Jw>rpQca>dVJ<7=~i_Ievm{CupRPEk~4P3nPIR5^5B4QN`8d3&-aW zS+~$GB=LAv3N4n%rhH;`qwy(7QnLhIR_Ws}2|9_u3Ka!6q~p_YE?OXdN|@U$P(wk( 
z{ieaB&psA*nH7%5khp9FJ(SGT$J4$8-nCO)J*lc^=fEW7YBR|0N75h43>G7y9;v}W z6o(+hh~=kuh&FSnX-||{P0AhZ)6CfObU7mE#<)mQYre=96PNiXp+P+)Lr>;qv_zN! z%C%S&PMBAq#2QnYySXm(!8+ccfQwG|RGy$Fio`BLK%o?9^LktHQ(PUXvJ3zXQe9G@ zS|NU3p7@-+MmYgVe@q1>2wu0;%`K$Q#{5F&I2M_o5v-sg{>6OlW}jvNtyRf+r%HLR znlmL(Bcf*QDAI8xfOB7x)+T@bORik$bh_5b1RI>%s2IBgX-gEc6_yyH3Du;L2e}wQ zeHq3VENh;`D@Dd<6}SX_=CnzHt0<1RsGKSmik2LFj5D8#cF$BB(x*T^WL6&z(9s5^ z*@HbOPMahx4_&jmvQ|`Ja1_gvBXbhaFGeLnE2QOa8t%V29-I+aIw`EbZZLt&_%2F2 zA~k_GckK~i?S-JP#vrRbc5;-mTU22{~_*bfV#HlIf>rp-Q<^equ!*-3X+ zS=ikxa~a3-Xyj9kzg&~BoI4264767l?(!(CFqMJp&gqvNga?(B8G?qHPay~&leW`R zx}`)IG)0eT&=wL(*h$}4b);j{!d*XY;`Fg;;s_uKb;U;1EXhK%0FWS|@Q9j4>6%8W z!8~9QkU#^vTEE6%RC2A80Kie9EUfFC3^W5+au8d7e`q+Z=DRGtTgLSZB=UK`aUa!n zR+cTauYxMeTa<<7fcArZ6CsAo$&iR5YD6&*nTObtIhUzaS9UJ<1%{V{<;+U+rkzC= z1U=t5<}SO?zkop+#FFOH7pd{HvbpymQ7O;ySXtLrX@e1279ZfM?Wd958W52pi4NWx z`{3d-Uao19WH!+XCb*|lo_H8Ap}}ZCI)>*n@J}l-ZZ-0o)L*7^{J+!%xX1!Q`*@vTcMMT*~T)rCnb$6t$!@<|qLE~nJ zb)EKfUv{uJG`_1JIz@`ZU0CD8AEvl2`m1U(&ikNG{K32H4t~vz(alNNHhS8+DyBuk zcOgBdSSUR0Hu3n`L->f8sPvGpVh6<C3O#x$_9 zvixoPKc2pHuXqCBmCA~*0KMwNnPOjyHr)v(CAN>}{*NaA5fVe;fN;jjut?bZ4>MNS z?GM|na8x)m9Mny%N{MBg{Pz=4=2ETLJ@vVgICdSgz>K425k;i!dv!^b35256=+z zAIf?F@Ol4{`N7Xn5ek6hxBtl2@B<(k1wi?K)B9}myRRV87hztS8o%8^1VaGO{maXr zSHrpG0Pyp~^74=3<>j9j;9=DV05`xrA~f>1F8mxp*6bYh; z(bR+LF*=7N4$Z{Y9?m-w5@*JeWBrVOlkgz1D2bKqn_`=~HBBv@o*|kka1?!X`Pk2_ zU&ohEKsm^hm{XWjKXQli8c&zy=bVW;>sDZ0xaFKm5%oNYjbblyriuqIbd@w+ynm_k z^10GuS7NUQl-XZnTvsd?EdNyrs<1csZxU|_+$P-Nzl*)Qd~d#b;(q^w_J=i(Dj%PJ zlEICt@v7ZjXI!t{u&PnKNuU|gyx20)%6*#l%%DGo#NWJlo6@(rZ~k3nzvlp9;LM=H;C0>xUgMD6(C~1?2zDg@ zz4CkRsO{+J*uina`1uKqiTX*~$>9&-Q|PIa)AZ?znT<0IA8kGk&ic)M{uDEZm^=De z^fP;2alU*(W8wA}wJ*hA1-^QHeZ0uOXtH=@@!sO6Zyw+JzjrPj{*}4B4A)-)#6k!t z8^J&dBY&agF;3WO{6@YGLMG9jWKPi+HWb|}epr$t-9(+CQ{^pId8|oVdum;nih^ph zdWeRkX05ibE>Z8QzTw6pXS7$A_eGzFzAyZE{+|PJLDXOkrg?}{ zXk=J+I42@4(lcsPv{>}Q!B;U=hfc&Y5AQjmA4g@OSX1%c3Eaf8q`c&Wl%UkTX=drV z8LKiWnaf8%9P7$@a{NMe>WP3Ho0GbyWOMm)Kj-zIZq2`QrsV92g1EwfbB;wO=QY@} 
z96ruGJQ|*t+`3qBDe1CL>E0_|R}Yn?T|0Zdr2KltqsoS=XE)kzzP#0ayXQ{t-QIg| zt9$PEKInhQdo=p^*V+}^+j ziG2CGqwG~&r)}4&uKDgKuaEWY?OokF`{v%;gg&!(*mrmOJqE}F+`(X;EbsYH%<$Ub z*CUDVRo{1y9v)K|YZ(ukAWu|IT2FrZkTs<=)i~`s{bT0DN5zk|vktRUpW@~ybETiR zeD0WcoBy;B|Aq9W=&Qol!bR~#_P6!lI=`Rz;l1SkGwD~sGHuym`S^0v@A``Xb8rTN zpb%&Qk%ioZ0;sp>SD3drfLG>=;%^dIMa&TVP7a~G7j_hRDyAo1BFQJ^E&Z6vFKb21 zrw_=lQSe)JW%aP4)Y@H2N7j`occ^?(6IIt(Z=tbA(_4$F9i<(y;h=u#MxRZd zn?1I8ZS`U}8|*c-F|sr^Fwr!nZzGujvzhI^=5-e3miaqaJH2-8veL6wup!!fvmLQ( z-~D(`iG7TNxub~V(B7(jET>)jrT0%BxbK|qvd49m>%3d7`w0&hPi4< zvA0*Mt<#9pFZ0iz2|mjx5Gr_Ic=udpk=ywVY%+V2 z(@@O0kWk`r(d5#a%lOOFrLV5szsi9}S@`t><=ZQCE9I*MZ-5(fH-~Ta-hO_k?(W@t zx2ntTUwpuRSnw$S@#!ZgxyNcUYE$b{>RAm38zY*S&0Z~DtuaqeKD+$rK^ z;2oqX0DE2lU~>h46FcFhjs$>+3jkIE0N8B+h(1MtC|v+hm?rqQeux7NumgC3BOniy zgGTTU%tHi74$_CLARj0WIt`UWjnDwJh#(;p5rzmSL?j{`QHE$l3?Y`0Qb;}IZe%#} zB=RQm74kDm1f`4Gi#m+DfNDX_p{3E=&|&EF=x69}7)6XdCKYoJGl7-HT47VLkFfJN zWtrb{=-1W+l| zhqB%@5n2O1TyB-zkbI-Uja8i08EXzIdat!tGG3>yOjF@k`J&2GYgNCwzEC4kGf-={ z_C_5!T|V7;z5WeP^&f9++Vpnw%vOjYVX)S4gV7FSXOmFVq?t9cI4e=dy?(z9Y!6O_tBgT_WK;jb>_Ovx~+Ei@F?+o@4e0^(zn51G$1&z zAy|?b9`ZD7bvP^Hb(DH^`oZBtp@-#<46!aI1SM@u5lLN0@5*dEcJFw_iAyIrxtC90 zKT}n3?_AyaI?g~z&gFeq#jm}pVBOHVHE{RLeY;1(PX=lqHe@#+dCF`%^fIFJ#Onud z-t{AS+ea>sr%dN9M10fw{`$xCQt!`8zc|0=4^gllSp2U4X;2C8g$BVQM26Nv29P5> zr({4Ephr+QG>hOvC?K{V91-Dg^{*jX5Ti&GQVz*LIwRwdMMy4k2!2ny24#*4M&+Tn zsE=q_v^6>jeHZ-!L&NOB9L3aQmasb55NtVi3a5ng!dN-hGU;0M0l7^10}5MLDXu22o?p|i z*tGVRQsKHZh8g|&P3Xq8*d=p4?aF^mu>Y zfh8AFSA91Z_biV`o|E3vJ~qA?ezgHm;Fh4M;G4|ZP{lBBcwOv`qC{I9OpY0dbvyhu z?s)u~geOT3$=^~7(l=)Cj%H*D?pwrafn)BSbo!YU~HQH1E_IiIcuW@8x{P>67nZntz zxz72pg~hM8e%F5$*ae~i8#KZHA}Np#Gr<+$KI3=b%lP>Dg7`Z5H}PL1 zND+<;U<6nKzlbbBgkT0qj8sfkAy-qjQTl{Jg!zRpikOIui=GhMC^jm7TEapSlDsb! 
zDXlBLDD!}NSk{=vPwS?im-Cn3sjzvK)@r3Sa*8r*rIlpXQI*%IsH*C#ZC7_(&(uiM zyrfm9J)+B}r?tUdzhL9n&AwX(86F1jjY3Tz(>ycf?Tr@Bmfv?Cw^FozXlr3NxF^{D zmt*oiiWB>Qp7T>zceiB^ju*pw*f+ypGhiqvler<}eOP{kN#sIwY0Umu)ZvmibJk2k zPLft~Uut5yLPq0J|15#y*H4(`@J=1cqn>_nX7|}og;_=G&ew8Wi+_~lUedYTamBj~ zeeF!SX2sJg-y4g!a_=bLt*_pHfBs?SW5p*eHQsgD`jW<7P2XCIo?1Rfw4H00dRg>J zsq;yk}ZRaldy29hkaI2NQpeX$P z9u{e90$<@Z3nF>>nr{EgG!Iu}3z*@(E~FmgV+5~jfB>_lSE#8C%&;HG!Ozv)3T7hA zex5D-F(v9tec+ZY;Qvm0gwSf|rJ*kVP0 zCDx*cr}5w15U8MlotCg1&KV)>8@_!de_@!FfGKc=cVphb6Q}@7U<8c+o@ub02DgLZ zyPhBn)*Jb)_fKoj@3!y%VWk4k$}9dNf7|$jQJ4dreG@`qUw^!}0C!mC3WEMg)XUT- z)R+FXen$-W9rd4u1>E}IC4W5q$?ne`U*G|2{gb=@vsj57c*83)Klq^j9v{?dlm<#0 zWr*60+Jf2uXed#XBv3(Vq4ZD&sI91t@VyOfQ+KESE0;fgmTS z^~C=b75KvMFn;gd;a&z9cx)6Vtx#R=B5URP-;Z$5Xn0=_fKf2>V5qORPdLp0eskfe ROxqshuBuGaP*>jo{uc>T4FUiF literal 0 HcmV?d00001 From 8eea41b53727448dc00b7da673da310c9cbf0b3f Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Mon, 16 Mar 2009 07:11:56 -0500 Subject: [PATCH 08/16] normalizing alpha data for tiffs is done too often, check in test image for it --- basis/images/tiff/tiff.factor | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/basis/images/tiff/tiff.factor b/basis/images/tiff/tiff.factor index 2ea1b08e20..80eaff8140 100755 --- a/basis/images/tiff/tiff.factor +++ b/basis/images/tiff/tiff.factor @@ -477,26 +477,24 @@ ERROR: unknown-component-order ifd ; [ unknown-component-order ] } case ; +: normalize-alpha-data ( seq -- byte-array ) + ! 
[ normalize-alpha-data ] change-bitmap + B{ } like dup + byte-array>float-array + 4 + [ + dup fourth dup 0 = [ + 2drop + ] [ + [ 3 head-slice ] dip '[ _ / ] change-each + ] if + ] each ; + : handle-alpha-data ( ifd -- ifd ) dup extra-samples find-tag { - { extra-samples-associated-alpha-data [ - [ - B{ } like dup - byte-array>float-array - 4 - [ - dup fourth dup 0 = [ - 2drop - ] [ - [ 3 head-slice ] dip '[ _ / ] change-each - ] if - ] each - ] change-bitmap - ] } - { extra-samples-unspecified-alpha-data [ - ] } - { extra-samples-unassociated-alpha-data [ - ] } + { extra-samples-associated-alpha-data [ ] } + { extra-samples-unspecified-alpha-data [ ] } + { extra-samples-unassociated-alpha-data [ ] } [ bad-extra-samples ] } case ; From ababfe80efee2df3e9ec2e1fae5064b5d91c3aff Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Mon, 16 Mar 2009 17:53:38 -0500 Subject: [PATCH 09/16] More regexp docs; unix line ending support --- basis/regexp/ast/ast.factor | 3 +- basis/regexp/classes/classes.factor | 2 +- basis/regexp/compiler/compiler.factor | 9 ++- basis/regexp/nfa/nfa.factor | 9 ++- basis/regexp/parser/parser.factor | 36 +++++------ basis/regexp/regexp-docs.factor | 86 ++++++++++++++++++++++++--- basis/regexp/regexp-tests.factor | 10 ++++ 7 files changed, 118 insertions(+), 37 deletions(-) diff --git a/basis/regexp/ast/ast.factor b/basis/regexp/ast/ast.factor index ffaed2db62..1c11ed5c7d 100644 --- a/basis/regexp/ast/ast.factor +++ b/basis/regexp/ast/ast.factor @@ -37,8 +37,7 @@ C: with-options TUPLE: options on off ; C: options -SINGLETONS: unix-lines dotall multiline comments case-insensitive -unicode-case reversed-regexp ; +SINGLETONS: unix-lines dotall multiline case-insensitive reversed-regexp ; : ( term -- term' ) f 2array ; diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor index d26ff7f69c..e3a1774585 100644 --- a/basis/regexp/classes/classes.factor +++ b/basis/regexp/classes/classes.factor @@ -12,7 +12,7 @@ ascii-class 
punctuation-class java-printable-class blank-class control-character-class hex-digit-class java-blank-class c-identifier-class unmatchable-class terminator-class word-boundary-class ; -SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file word-break ; +SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file ^unix $unix word-break ; TUPLE: range from to ; C: range diff --git a/basis/regexp/compiler/compiler.factor b/basis/regexp/compiler/compiler.factor index b55cab6294..95511965d1 100644 --- a/basis/regexp/compiler/compiler.factor +++ b/basis/regexp/compiler/compiler.factor @@ -17,9 +17,6 @@ SYMBOL: backwards? M: t question>quot drop [ 2drop t ] ; M: f question>quot drop [ 2drop f ] ; -M: not-class question>quot - class>> question>quot [ not ] compose ; - M: beginning-of-input question>quot drop [ drop zero? ] ; @@ -40,6 +37,12 @@ M: $ question>quot M: ^ question>quot drop [ { [ drop zero? ] [ [ 1- ] dip ?nth "\r\n" member? ] } 2|| ] ; +M: $unix question>quot + drop [ { [ length = ] [ ?nth CHAR: \n = ] } 2|| ] ; + +M: ^unix question>quot + drop [ { [ drop zero? ] [ [ 1- ] dip ?nth CHAR: \n = ] } 2|| ] ; + M: word-break question>quot drop [ word-break-at? ] ; diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor index 20be6b87d8..d59d4818ec 100644 --- a/basis/regexp/nfa/nfa.factor +++ b/basis/regexp/nfa/nfa.factor @@ -60,11 +60,16 @@ GENERIC: modify-epsilon ( tag -- newtag ) M: object modify-epsilon ; +: line-option ( multiline unix-lines default -- option ) + multiline option? [ + drop [ unix-lines option? ] 2dip swap ? + ] [ 2nip ] if ; + M: $ modify-epsilon - multiline option? [ drop end-of-input ] unless ; + $unix end-of-input line-option ; M: ^ modify-epsilon - multiline option? 
[ drop beginning-of-input ] unless ; + ^unix beginning-of-input line-option ; M: tagged-epsilon nfa-node clone [ modify-epsilon ] change-tag add-simple-entry ; diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index c6a69f2508..7b2d6af2c1 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -2,7 +2,7 @@ ! See http://factorcode.org/license.txt for BSD license. USING: peg.ebnf kernel math.parser sequences assocs arrays fry math combinators regexp.classes strings splitting peg locals accessors -regexp.ast ; +regexp.ast unicode.case ; IN: regexp.parser : allowed-char? ( ch -- ? ) @@ -19,20 +19,19 @@ ERROR: bad-number ; ERROR: bad-class name ; : name>class ( name -- class ) - { - { "Lower" letter-class } - { "Upper" LETTER-class } - { "Alpha" Letter-class } - { "ASCII" ascii-class } - { "Digit" digit-class } - { "Alnum" alpha-class } - { "Punct" punctuation-class } - { "Graph" java-printable-class } - { "Print" java-printable-class } - { "Blank" non-newline-blank-class } - { "Cntrl" control-character-class } - { "XDigit" hex-digit-class } - { "Space" java-blank-class } + >string >case-fold { + { "lower" letter-class } + { "upper" LETTER-class } + { "alpha" Letter-class } + { "ascii" ascii-class } + { "digit" digit-class } + { "alnum" alpha-class } + { "punct" punctuation-class } + { "graph" java-printable-class } + { "blank" non-newline-blank-class } + { "cntrl" control-character-class } + { "xdigit" hex-digit-class } + { "space" java-blank-class } ! TODO: unicode-character-class } [ bad-class ] at-error ; @@ -66,11 +65,8 @@ ERROR: bad-class name ; { CHAR: i case-insensitive } { CHAR: d unix-lines } { CHAR: m multiline } - { CHAR: n multiline } { CHAR: r reversed-regexp } { CHAR: s dotall } - { CHAR: u unicode-case } - { CHAR: x comments } } ; : ch>option ( ch -- singleton ) @@ -101,8 +97,8 @@ CharacterInBracket = !("}") Character QuotedCharacter = !("\\E") . 
-Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class ]] - | "P{" CharacterInBracket*:s "}" => [[ s >string name>class ]] +Escape = "p{" CharacterInBracket*:s "}" => [[ s name>class ]] + | "P{" CharacterInBracket*:s "}" => [[ s name>class ]] | "Q" QuotedCharacter*:s "\\E" => [[ s ]] | "u" Character:a Character:b Character:c Character:d => [[ { a b c d } hex> ensure-number ]] diff --git a/basis/regexp/regexp-docs.factor b/basis/regexp/regexp-docs.factor index b35f8d1cf3..a7cb0a3715 100644 --- a/basis/regexp/regexp-docs.factor +++ b/basis/regexp/regexp-docs.factor @@ -33,20 +33,71 @@ ARTICLE: { "regexp" "construction" } "Constructing regular expressions" "Another approach is to use " { $vocab-link "regexp.combinators" } "." ; ARTICLE: { "regexp" "syntax" } "Regular expression syntax" -"Regexp syntax is largely compatible with Perl, Java and extended POSIX regexps, but not completely. A new addition is the inclusion of a negation operator, with the syntax " { $snippet "(?~foo)" } " to match everything that does not match " { $snippet "foo" } "." +"Regexp syntax is largely compatible with Perl, Java and extended POSIX regexps, but not completely. Below, the syntax is documented." { $heading "Characters" } +"At its core, regular expressions consist of character literals. For example, " { $snippet "R/ f/" } " is a regular expression matching just the string 'f'. In addition, the normal escape codes are provided, like " { $snippet "\\t" } " for the tab character and " { $snippet "\\uxxxxxx" } " for an arbitrary Unicode code point, by its hex value. In addition, any character can be preceded by a backslash to escape it, unless this has special meaning. For example, to match a literal opening parenthesis, use " { $snippet "\\(" } "." +{ $heading "Concatenation, alternation and grouping" } +"Regular expressions can be built out of multiple characters by concatenation. For example, " { $snippet "R/ ab/" } " matches a followed by b. 
The " { $snippet "|" } " (alternation) operator can construct a regexp which matches one of two alternatives. Parentheses can be used for grouping. So " { $snippet "R/ f(oo|ar)/" } " would match either 'foo' or 'far'." { $heading "Character classes" } +"Square brackets define a convenient way to refer to a set of characters. For example, " { $snippet "[ab]" } " refers to either a or b. And " { $snippet "[a-z]" } " refers to all of the characters between a and z, in code point order. You can use these together, as in " { $snippet "[ac-fz]" } " which matches all of the characters between c and f, in addition to a and z. Character classes can be negated using a caret, as in " { $snippet "[^a]" } " which matches all characters which are not a." { $heading "Predefined character classes" } +"Several character classes are predefined, both for convenience and because they are too large to represent directly. In Factor regular expressions, all character classes are Unicode-aware." +{ $table + { { $snippet "\\d" } "Digits" } + { { $snippet "\\D" } "Not digits" } + { { $snippet "\\s" } "Whitespace" } + { { $snippet "\\S" } "Not whitespace" } + { { $snippet "\\w" } "Word character (alphanumeric or underscore)" } + { { $snippet "\\W" } "Not word character" } + { { $snippet "\\p{property}" } "Character which fulfils the property" } + { { $snippet "\\P{property}" } "Character which does not fulfil the property" } } +"Properties for " { $snippet "\\p" } " and " { $snippet "\\P" } " (case-insensitive):" +{ $table + { { $snippet "\\p{lower}" } "Lower case letters" } + { { $snippet "\\p{upper}" } "Upper case letters" } + { { $snippet "\\p{alpha}" } "Letters" } + { { $snippet "\\p{ascii}" } "Characters in the ASCII range" } + { { $snippet "\\p{alnum}" } "Letters or numbers" } + { { $snippet "\\p{punct}" } "Punctuation" } + { { $snippet "\\p{blank}" } "Non-newline whitespace" } + { { $snippet "\\p{cntrl}" } "Control character" } + { { $snippet "\\p{space}" } "Whitespace" } + { { 
$snippet "\\p{xdigit}" } "Hexadecimal digit" } } ! In the future: Unicode +"Full Unicode properties are not yet supported." { $heading "Boundaries" } +"Special operators exist to match certain points in the string. These are called 'zero-width' because they do not consume any characters." +{ $table + { { $snippet "^" } "Beginning of a line" } + { { $snippet "$" } "End of a line" } + { { $snippet "\\A" } "Beginning of text" } + { { $snippet "\\z" } "End of text" } + { { $snippet "\\Z" } "Almost end of text: only thing after is newline" } + { { $snippet "\\b" } "Word boundary (by Unicode word boundaries)" } + { { $snippet "\\B" } "Not word boundary (by Unicode word boundaries)" } } { $heading "Greedy quantifiers" } -{ $heading "Reluctant quantifiers" } -{ $heading "Posessive quantifiers" } -{ $heading "Logical operations" } +"It is possible to have a regular expression which matches a variable number of occurrences of another regular expression." +{ $table + { { $snippet "a*" } "Zero or more occurrences of a" } + { { $snippet "a+" } "One or more occurrences of a" } + { { $snippet "a?" } "Zero or one occurrences of a" } + { { $snippet "a{n}" } "n occurrences of a" } + { { $snippet "a{n,}" } "At least n occurrences of a" } + { { $snippet "a{,m}" } "At most m occurrences of a" } + { { $snippet "a{n,m}" } "Between n and m occurrences of a" } } +"All of these quantifiers are " { $emphasis "greedy" } ", meaning that they take as many repetitions as possible within the larger regular expression. Reluctant and possessive quantifiers are not yet supported." { $heading "Lookaround" } +"Operators are provided to look ahead and behind the current point in the regular expression. These can be used in any context, but they're the most useful at the beginning or end of a regular expression." 
+{ $table + { { $snippet "(?=a)" } "Asserts that the current position is immediately followed by a" } + { { $snippet "(?!a)" } "Asserts that the current position is not immediately followed by a" } + { { $snippet "(?<=a)" } "Asserts that the current position is immediately preceded by a" } + { { $snippet "(? matches? ] unit-test [ 3 ] [ "caba" "(?<=b)a" first-match from>> ] unit-test + +[ t ] [ "\ra" R/ .^a/ms matches? ] unit-test +[ f ] [ "\ra" R/ .^a/mds matches? ] unit-test +[ t ] [ "\na" R/ .^a/ms matches? ] unit-test +[ t ] [ "\na" R/ .^a/mds matches? ] unit-test + +[ t ] [ "a\r" R/ a$./ms matches? ] unit-test +[ f ] [ "a\r" R/ a$./mds matches? ] unit-test +[ t ] [ "a\n" R/ a$./ms matches? ] unit-test +[ t ] [ "a\n" R/ a$./mds matches? ] unit-test From b745930b2875797d24cab2aac71adc57bb57e1f0 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Mon, 16 Mar 2009 18:14:39 -0500 Subject: [PATCH 10/16] More regexp docs --- basis/regexp/regexp-docs.factor | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/basis/regexp/regexp-docs.factor b/basis/regexp/regexp-docs.factor index a7cb0a3715..041c78380b 100644 --- a/basis/regexp/regexp-docs.factor +++ b/basis/regexp/regexp-docs.factor @@ -1,6 +1,7 @@ ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -USING: kernel strings help.markup help.syntax math regexp.parser regexp.ast ; +USING: kernel strings help.markup help.syntax math regexp.parser +regexp.ast multiline ; IN: regexp ABOUT: "regexp" @@ -21,8 +22,17 @@ ARTICLE: "regexp" "Regular expressions" { $subsection { "regexp" "deploy" } } ; ARTICLE: { "regexp" "intro" } "A quick introduction to regular expressions" - -; +"Regular expressions are a terse way to do certain simple string processing tasks. 
For example, to replace all instances of " { $snippet "foo" } " in one string with { $snippet "bar" } ", the following can be used: +{ $code "R/ foo/ \"bar\" re-replace" } +"That could be done with sequence operations, but consider doing this replacement for an arbitrary number of o's, at least two:" +{ $code "R/ foo+/ \"bar\" re-replace" } +"The " { $snippet "+" } " operator matches one or more occurrences of the previous expression; in this case " { $snippet "o" } ". Another useful feature is alternation. Say we want to do this replacement with fooooo or boooo. Then we could use the code" +{ $code "R/ (f|b)oo+/ \"bar\" re-replace" } +"To search a file for all lines that match a given regular expression, you could use code like this:" +{ $code <" "file.txt" ascii file-lines [ R/ (f|b)oo+/ re-contains? ] filter "> } +"To test if a string in its entirity matches a regular expression, the following can be used:" +{ $example <" "fooo" R/ (b|f)oo+/ matches? . "> "t" } +"Regular expressions can't be used for all parsing tasks. For example, they are not powerful enough to match balancing parentheses." ; ARTICLE: { "regexp" "construction" } "Constructing regular expressions" "Most of the time, regular expressions are literals and the parsing word should be used, to construct them at parse time. This ensures that they are only compiled once, and gives parse time syntax checking." 
From 7a0ce748df5cc57ffe2c34014d5be1846c469223 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Mon, 16 Mar 2009 18:28:15 -0500 Subject: [PATCH 11/16] Cleaning up XML to make : string>xml read-xml ; --- basis/xml/tests/test.factor | 1 + basis/xml/xml.factor | 14 ++++++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/basis/xml/tests/test.factor b/basis/xml/tests/test.factor index 818a28c892..1d07aa9406 100644 --- a/basis/xml/tests/test.factor +++ b/basis/xml/tests/test.factor @@ -74,3 +74,4 @@ SYMBOL: xml-file [ "foo" ] [ "]>&bar;" string>xml children>string ] unit-test [ T{ xml-chunk f V{ "hello" } } ] [ "hello" string>xml-chunk ] unit-test [ "1.1" ] [ "" string>xml prolog>> version>> ] unit-test +[ "ß" ] [ "ß" read-xml children>string ] unit-test diff --git a/basis/xml/xml.factor b/basis/xml/xml.factor index 073f46cbae..fba2eafaba 100755 --- a/basis/xml/xml.factor +++ b/basis/xml/xml.factor @@ -4,7 +4,8 @@ USING: accessors arrays io io.encodings.binary io.files io.streams.string kernel namespaces sequences strings io.encodings.utf8 xml.data xml.errors xml.elements ascii xml.entities xml.writer xml.state xml.autoencoding assocs xml.tokenize -combinators.short-circuit xml.name splitting io.streams.byte-array ; +combinators.short-circuit xml.name splitting io.streams.byte-array +combinators ; IN: xml xml-stack get first second ] with-state ; inline +: make-xml ( stream quot -- xml ) + 0 read-seq make-xml-doc ; inline + PRIVATE> : each-element ( stream quot: ( xml-elem -- ) -- ) @@ -169,14 +173,16 @@ PRIVATE> ] with-state ; inline : read-xml ( stream -- xml ) - [ start-document [ process ] when* ] - 0 read-seq make-xml-doc ; + dup stream-element-type { + { +character+ [ [ check ] make-xml ] } + { +byte+ [ [ start-document [ process ] when* ] make-xml ] } + } case ; : read-xml-chunk ( stream -- seq ) [ check ] 1 read-seq ; : string>xml ( string -- xml ) - [ check ] 0 read-seq make-xml-doc ; + read-xml ; : string>xml-chunk ( string -- xml ) 
read-xml-chunk ; From fec49cb61683bf2656eecf01fe2a9a3ed2bf83a1 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Mon, 16 Mar 2009 22:29:38 -0500 Subject: [PATCH 12/16] More expository XML docs --- basis/xml/traversal/traversal-docs.factor | 18 ++++++++++++++++-- basis/xml/xml-docs.factor | 8 +++++++- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/basis/xml/traversal/traversal-docs.factor b/basis/xml/traversal/traversal-docs.factor index 1329c4975e..9f26774647 100644 --- a/basis/xml/traversal/traversal-docs.factor +++ b/basis/xml/traversal/traversal-docs.factor @@ -1,6 +1,6 @@ ! Copyright (C) 2005, 2009 Daniel Ehrenberg ! See http://factorcode.org/license.txt for BSD license. -USING: help.markup help.syntax xml.data sequences strings ; +USING: help.markup help.syntax xml.data sequences strings multiline ; IN: xml.traversal ABOUT: "xml.traversal" @@ -8,7 +8,7 @@ ABOUT: "xml.traversal" ARTICLE: "xml.traversal" "Utilities for traversing XML" "The " { $vocab-link "xml.traversal" } " vocabulary provides utilities for traversing an XML DOM tree and viewing the contents of a single tag. The following words are defined:" $nl - "Note: the difference between deep-tag-named and tag-named is that the former searches recursively among all children and children of children of the tag, while the latter only looks at the direct children, and is therefore more efficient." + { $subsection { "xml.traversal" "intro" } } { $subsection tag-named } { $subsection tags-named } { $subsection deep-tag-named } @@ -20,6 +20,20 @@ ARTICLE: "xml.traversal" "Utilities for traversing XML" { $subsection first-child-tag } { $subsection assert-tag } ; +ARTICLE: { "xml.traversal" "intro" } "An example of XML processing" +"To illustrate how to use the XML library, we develop a simple Atom parser in Factor. Atom is an XML-based syndication format, like RSS. 
To see the full version of what we develop here, look at " { $snippet "basis/syndication" } " at the " { $snippet "atom1.0" } " word. First, we want to load a file and get a DOM tree for it." +{ $code <" "file.xml" file>xml "> } +"No encoding descriptor is needed, because XML files contain sufficient information to auto-detect the encoding. Next, we want to extract information from the tree. To get the title, we can use the following:" +{ $code <" "title" tag-named children>string "> } +"The " { $link tag-named } " word finds the first tag named " { $snippet "title" } " in the top level (just under the main tag). Then, with a tag on the stack, its children are asserted to be a string, and the string is returned." $nl +"For a slightly more complicated example, we can look at how entries are parsed. To get a sequence of tags with the name " { $snippet "entry" } ":" +{ $code <" "entry" tags-named "> } +"Imagine that, for each of these, we want to get the URL of the entry. In Atom, the URLs are in a " { $snippet "link" } " tag which is contained in the " { $snippet "entry" } " tag. There are multiple " { $snippet "link" } " tags, but one of them contains the attribute " { $snippet "rel=alternate" } ", and the " { $snippet "href" } " attribute has the URL. So, given an element of the sequence produced in the above quotation, we run the code:" +{ $code <" "link" tags-named [ "rel" attr "alternate" = ] find nip "> } +"to get the link tag on the stack, and" +{ $code <" "href" attr >url "> } +"to extract the URL from it." ; + HELP: deep-tag-named { $values { "tag" "an XML tag or document" } { "name/string" "an XML name or string representing a name" } { "matching-tag" tag } } { $description "Finds an XML tag with a matching name, recursively searching children and children of children." 
} diff --git a/basis/xml/xml-docs.factor b/basis/xml/xml-docs.factor index 77969c55cd..434209620b 100644 --- a/basis/xml/xml-docs.factor +++ b/basis/xml/xml-docs.factor @@ -67,9 +67,9 @@ HELP: string>dtd ARTICLE: { "xml" "reading" } "Reading XML" "The following words are used to read something into an XML document" - { $subsection string>xml } { $subsection read-xml } { $subsection read-xml-chunk } + { $subsection string>xml } { $subsection string>xml-chunk } { $subsection file>xml } { $subsection bytes>xml } @@ -90,10 +90,16 @@ ARTICLE: { "xml" "events" } "Event-based XML parsing" { $subsection pull-event } { $subsection pull-elem } ; +ARTICLE: { "xml" "namespaces" } "Working with XML namespaces" +"The Factor XML parser implements XML namespaces, and provides convenient utilities for working with them. Anywhere in the public API that a name is accepted as an argument, either a string or an XML name is accepted. If a string is used, it is coerced into a name by giving it a null namespace. Names are stored as " { $link name } " tuples, which have slots for the namespace prefix and namespace URL as well as the main part of the tag name." $nl +"To make it easier to create XML names, the parsing word " { $snippet "XML-NS:" } " is provided in the " { $vocab-link "xml.syntax" } " vocabulary." $nl +"When parsing XML, names are automatically augmented with the appropriate namespace URL when the information is available. This does not take into account any XML schema which might allow for such prefixes to be omitted. When generating XML to be written, keep in mind that the XML writer knows only about the literal prefixes and ignores the URLs. It is your job to make sure that they match up correctly, and that there is the appropriate " { $snippet "xmlns" } " declaration." ; + ARTICLE: "xml" "XML parser" "The " { $vocab-link "xml" } " vocabulary implements the XML 1.0 and 1.1 standards, converting strings of text into XML and vice versa. 
The parser checks for well-formedness but is not validating. There is only partial support for processing DTDs." { $subsection { "xml" "reading" } } { $subsection { "xml" "events" } } + { $subsection { "xml" "namespaces" } } { $vocab-subsection "Writing XML" "xml.writer" } { $vocab-subsection "XML parsing errors" "xml.errors" } { $vocab-subsection "XML entities" "xml.entities" } From 2f8adf9d4d4941bb51f14a749be635e516af9a39 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Mon, 16 Mar 2009 22:43:08 -0500 Subject: [PATCH 13/16] Fixing regexp docs --- basis/regexp/regexp-docs.factor | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/basis/regexp/regexp-docs.factor b/basis/regexp/regexp-docs.factor index 041c78380b..d62a03ade5 100644 --- a/basis/regexp/regexp-docs.factor +++ b/basis/regexp/regexp-docs.factor @@ -22,7 +22,7 @@ ARTICLE: "regexp" "Regular expressions" { $subsection { "regexp" "deploy" } } ; ARTICLE: { "regexp" "intro" } "A quick introduction to regular expressions" -"Regular expressions are a terse way to do certain simple string processing tasks. For example, to replace all instances of " { $snippet "foo" } " in one string with { $snippet "bar" } ", the following can be used: +"Regular expressions are a terse way to do certain simple string processing tasks. For example, to replace all instances of " { $snippet "foo" } " in one string with " { $snippet "bar" } ", the following can be used:" { $code "R/ foo/ \"bar\" re-replace" } "That could be done with sequence operations, but consider doing this replacement for an arbitrary number of o's, at least two:" { $code "R/ foo+/ \"bar\" re-replace" } @@ -31,7 +31,7 @@ ARTICLE: { "regexp" "intro" } "A quick introduction to regular expressions" "To search a file for all lines that match a given regular expression, you could use code like this:" { $code <" "file.txt" ascii file-lines [ R/ (f|b)oo+/ re-contains? 
] filter "> } "To test if a string in its entirity matches a regular expression, the following can be used:" -{ $example <" "fooo" R/ (b|f)oo+/ matches? . "> "t" } +{ $example <" USING: regexp prettyprint ; "fooo" R/ (b|f)oo+/ matches? . "> "t" } "Regular expressions can't be used for all parsing tasks. For example, they are not powerful enough to match balancing parentheses." ; ARTICLE: { "regexp" "construction" } "Constructing regular expressions" From 9f8ccb67a7955025859f66783968e4edaef555fd Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Mon, 16 Mar 2009 23:49:31 -0500 Subject: [PATCH 14/16] Making disambiguation faster --- basis/regexp/disambiguate/disambiguate.factor | 45 ++++++++++++++++--- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/basis/regexp/disambiguate/disambiguate.factor b/basis/regexp/disambiguate/disambiguate.factor index 67b1503f9b..876d898cb4 100644 --- a/basis/regexp/disambiguate/disambiguate.factor +++ b/basis/regexp/disambiguate/disambiguate.factor @@ -1,7 +1,8 @@ ! Copyright (C) 2009 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. USING: kernel accessors regexp.classes math.bits assocs sequences -arrays sets regexp.dfa math fry regexp.minimize regexp.ast regexp.transition-tables ; +arrays sets regexp.dfa math fry regexp.minimize regexp.ast +locals regexp.transition-tables ; IN: regexp.disambiguate TUPLE: parts in out ; @@ -9,7 +10,7 @@ TUPLE: parts in out ; : make-partition ( choices classes -- partition ) zip [ first ] partition [ values ] bi@ parts boa ; -: powerset-partition ( classes -- partitions ) +: powerset-partition ( sequence -- partitions ) [ length [ 2^ ] keep ] keep '[ _ _ make-partition ] map rest ; @@ -19,19 +20,49 @@ TUPLE: parts in out ; [ in>> ] bi prefix ; -: get-transitions ( partition state-transitions -- next-states ) - [ in>> ] dip '[ _ at ] gather sift ; +: singleton-partition ( integer non-integers -- {class,partition} ) + dupd + '[ _ [ class-member? 
] with filter ] keep + prefix f parts boa + 2array ; + +: add-out ( seq partition -- partition' ) + [ out>> append ] [ in>> ] bi swap parts boa ; + +: intersection ( seq -- elts ) + [ f ] [ unclip [ intersect ] reduce ] if-empty ; + +: meaningful-integers ( partition table -- integers ) + [ [ in>> ] [ out>> ] bi ] dip + '[ [ _ at ] map intersection ] bi@ diff ; + +: class-integers ( classes integers -- table ) + '[ _ over '[ _ class-member? ] filter ] H{ } map>assoc ; + +: add-integers ( partitions classes integers -- partitions ) + class-integers '[ + [ _ meaningful-integers ] keep add-out + ] map ; + +: class-partitions ( classes -- assoc ) + [ integer? ] partition [ + dup powerset-partition spin add-integers + [ [ partition>class ] keep 2array ] map + [ first ] filter + ] [ '[ _ singleton-partition ] map ] 2bi append ; : new-transitions ( transitions -- assoc ) ! assoc is class, partition values [ keys ] gather [ tagged-epsilon? not ] filter - powerset-partition - [ [ partition>class ] keep ] { } map>assoc - [ drop ] assoc-filter ; + class-partitions ; + +: get-transitions ( partition state-transitions -- next-states ) + [ in>> ] dip '[ _ at ] gather sift ; : preserving-epsilon ( state-transitions quot -- new-state-transitions ) [ [ drop tagged-epsilon? 
] assoc-filter ] bi assoc-union H{ } assoc-like ; inline + : disambiguate ( nfa -- nfa ) expand-ors [ dup new-transitions '[ From 4a79ee9bb92adac7f9d3b118c92dcc97e1e297a1 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Tue, 17 Mar 2009 00:04:27 -0500 Subject: [PATCH 15/16] Making unicode.data slightly more efficient --- basis/unicode/data/data.factor | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/basis/unicode/data/data.factor b/basis/unicode/data/data.factor index bff4ddeaab..74914e8537 100644 --- a/basis/unicode/data/data.factor +++ b/basis/unicode/data/data.factor @@ -5,7 +5,7 @@ io.files hashtables quotations splitting grouping arrays io math.parser hash2 math.order byte-arrays words namespaces words compiler.units parser io.encodings.ascii values interval-maps ascii sets combinators locals math.ranges sorting make -strings.parser io.encodings.utf8 ; +strings.parser io.encodings.utf8 memoize ; IN: unicode.data VALUE: simple-lower @@ -108,6 +108,9 @@ CONSTANT: categories "Zs" "Zl" "Zp" "Cc" "Cf" "Cs" "Co" } +MEMO: categories-map ( -- hashtable ) + categories [ swap ] H{ } assoc-map-as ; + CONSTANT: num-chars HEX: 2FA1E ! 
the maximum unicode char in the first 3 planes @@ -124,10 +127,10 @@ CONSTANT: num-chars HEX: 2FA1E ] assoc-each table ; :: process-category ( data -- category-listing ) - [let | table [ num-chars ] | - 2 data (process-data) [| char cat | - cat categories index char table ?set-nth - ] assoc-each table fill-ranges ] ; + num-chars :> table + 2 data (process-data) [| char cat | + cat categories-map at char table ?set-nth + ] assoc-each table fill-ranges ; : process-names ( data -- names-hash ) 1 swap (process-data) [ From a181c220154050ed785fad36a78917f0a21d2dd8 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Tue, 17 Mar 2009 00:10:55 -0500 Subject: [PATCH 16/16] Regexp docs typo fix --- basis/regexp/regexp-docs.factor | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/basis/regexp/regexp-docs.factor b/basis/regexp/regexp-docs.factor index d62a03ade5..6ad340a82d 100644 --- a/basis/regexp/regexp-docs.factor +++ b/basis/regexp/regexp-docs.factor @@ -30,7 +30,7 @@ ARTICLE: { "regexp" "intro" } "A quick introduction to regular expressions" { $code "R/ (f|b)oo+/ \"bar\" re-replace" } "To search a file for all lines that match a given regular expression, you could use code like this:" { $code <" "file.txt" ascii file-lines [ R/ (f|b)oo+/ re-contains? ] filter "> } -"To test if a string in its entirity matches a regular expression, the following can be used:" +"To test if a string in its entirety matches a regular expression, the following can be used:" { $example <" USING: regexp prettyprint ; "fooo" R/ (b|f)oo+/ matches? . "> "t" } "Regular expressions can't be used for all parsing tasks. For example, they are not powerful enough to match balancing parentheses." ;