217 lines
		
	
	
		
			6.1 KiB
		
	
	
	
		
			Factor
		
	
	
		
		
			
		
	
	
			217 lines
		
	
	
		
			6.1 KiB
		
	
	
	
		
			Factor
		
	
	
|  | ! Copyright (C) 2014 John Benediktsson | |||
|  | ! See http://factorcode.org/license.txt for BSD license | |||
|  | 
 | |||
|  | USING: accessors combinators formatting fry io.encodings.ascii | |||
|  | io.files kernel literals math math.functions math.order | |||
|  | multiline regexp sequences sequences.extras sets splitting | |||
|  | unicode ;
 | |||
|  | 
 | |||
|  | IN: text-analysis | |||
|  | 
 | |||
|  | <PRIVATE
 | |||
|  | 
 | |||
|  | : trimmed ( seq -- seq )
 | |||
|  |     [ [ blank? ] trim ] map harvest ;
 | |||
|  | 
 | |||
|  | : split-paragraphs ( str -- seq )
 | |||
|  |     R/ \r?\n\r?\n/ re-split trimmed ;
 | |||
|  | 
 | |||
|  | << | |||
|  | CONSTANT: ABBREVIATIONS { | |||
|  |     "jr" "mr" "mrs" "ms" "dr" "prof" "sr" "sen" "rep" "rev" | |||
|  |     "gov" "atty" "supt" "det" "rev" "col','gen" "lt" "cmdr" | |||
|  |     "adm" "capt" "sgt" "cpl" "maj" ! titles | |||
|  | 
 | |||
|  |     "dept" "univ" "uni" "assn" "bros" "inc" "ltd" "co" "corp" | |||
|  |     "plc" ! entities | |||
|  | 
 | |||
|  |     "jan" "feb" "mar" "apr" "may" "jun" "jul" "aug" "sep" "oct" | |||
|  |     "nov" "dec" "sept" ! months | |||
|  | 
 | |||
|  |     "mon" "tue" "wed" "thu" "fri" "sat" "sun" ! days | |||
|  | 
 | |||
|  |     "vs" "etc" "no" "esp" "cf" ! misc | |||
|  | 
 | |||
|  |     "ave" "bld" "blvd" "cl" "ct" "cres" "dr" "rd" "st" ! streets | |||
|  | } | |||
|  | >> | |||
|  | 
 | |||
|  | : split-sentences ( str -- seq )
 | |||
|  | 
 | |||
|  |     ! Mark end of sentences with EOS marker | |||
|  |     R/ ((?:[\.?!]|[\r\n]+)(?:\"|\'|\)|\]|\})?)(\s+)/ | |||
|  |     [ [ ".?!\r\n\"')]}" member? not ] cut-when "\x01" glue ] | |||
|  |     re-replace-with | |||
|  | 
 | |||
|  |     ! Fix ellipsis marks | |||
|  |     $[ "(\\.\\.\\.*)\x01" <regexp> ] [ but-last-slice ] | |||
|  |     re-replace-with | |||
|  | 
 | |||
|  |     ! Fix e.g, i.e. marks | |||
|  |     $[ | |||
|  |         "(?:\\s(?:(?:(?:\\w\\.){2,}\\w?)|(?:\\w\\.\\w)))\x01(\\s+[a-z0-9])" | |||
|  |         <regexp> | |||
|  |     ] [ [ 1 = ] cut-when append ] re-replace-with | |||
|  | 
 | |||
|  |     ! Fix abbreviations | |||
|  |     $[ | |||
|  |         ABBREVIATIONS "|" join "(" ")\\.\x01" surround
 | |||
|  |         "i" <optioned-regexp> | |||
|  |     ] [ CHAR: . over index head ] re-replace-with | |||
|  | 
 | |||
|  |     ! Split on EOS marker | |||
|  |     "\x01" split trimmed ;
 | |||
|  | 
 | |||
|  | CONSTANT: sub-syllable { | |||
|  |     R/ [^aeiou]e$/ ! give, love, bone, done, ride ... | |||
|  |     R/ [aeiou](?:([cfghklmnprsvwz])\1?|ck|sh|[rt]ch)e[ds]$/ | |||
|  |     ! (passive) past participles and 3rd person sing present verbs: | |||
|  |     ! bared, liked, called, tricked, bashed, matched | |||
|  | 
 | |||
|  |     R/ .e(?:ly|less(?:ly)?|ness?|ful(?:ly)?|ments?)$/ | |||
|  |     ! nominal, adjectival and adverbial derivatives from -e$ roots: | |||
|  |     ! absolutely, nicely, likeness, basement, hopeless | |||
|  |     ! hopeful, tastefully, wasteful | |||
|  | 
 | |||
|  |     R/ ion/ ! action, diction, fiction | |||
|  |     R/ [ct]ia[nl]/ ! special(ly), initial, physician, christian | |||
|  |     R/ [^cx]iou/ ! illustrious, NOT spacious, gracious, anxious, noxious | |||
|  |     R/ sia$/ ! amnesia, polynesia | |||
|  |     R/ .gue$/ ! dialogue, intrigue, colleague | |||
|  | } | |||
|  | 
 | |||
|  | CONSTANT: add-syllable { | |||
|  |     R/ i[aiou]/ ! alias, science, phobia | |||
|  |     R/ [dls]ien/ ! salient, gradient, transient | |||
|  |     R/ [aeiouym]ble$/ ! -Vble, plus -mble | |||
|  |     R/ [aeiou]{3}/ ! agreeable | |||
|  |     R/ ^mc/ ! mcwhatever | |||
|  |     R/ ism$/ ! sexism, racism | |||
|  |     R/ (?:([^aeiouy])\1|ck|mp|ng)le$/ ! bubble, cattle, cackle, sample, angle | |||
|  |     R/ dnt$/ ! couldn/t | |||
|  |     R/ [aeiou]y[aeiou]/ ! annoying, layer | |||
|  | } | |||
|  | 
 | |||
|  | : syllables ( str -- n )
 | |||
|  |     dup length 1 = [ drop 1 ] [ | |||
|  |         >lower CHAR: . swap remove
 | |||
|  |         [ R/ [aeiouy]+/ count-matches ] | |||
|  |         [ sub-syllable [ matches? ] with count - ] | |||
|  |         [ add-syllable [ matches? ] with count + ] tri
 | |||
|  |         1 max | |||
|  |     ] if ;
 | |||
|  | 
 | |||
|  | : split-words ( str -- words )
 | |||
|  |     R/ \b([a-z][a-z\-']*)\b/i all-matching-subseqs ;
 | |||
|  | 
 | |||
|  | TUPLE: text-analysis #paragraphs #sentences #chars #words | |||
|  | #syllables #complex-words #unique-words #difficult-words ;
 | |||
|  | 
 | |||
|  | : <text-analysis> ( str -- text-analysis )
 | |||
|  |     { | |||
|  |         [ split-paragraphs length ] | |||
|  |         [ split-sentences length ] | |||
|  |         [ [ blank? not ] count ] | |||
|  |         [ split-words ] | |||
|  |     } cleave { | |||
|  |         [ length ] | |||
|  |         [ | |||
|  |             [ 0 0 ] dip [ | |||
|  |                 [ syllables ] [ "-" member? not ] bi
 | |||
|  |                 over 2 > and 1 0 ? [ + ] bi-curry@ bi*
 | |||
|  |             ] each
 | |||
|  |         ] | |||
|  |         [ members length ] | |||
|  |         [ | |||
|  |             "vocab:text-analysis/dale-chall.txt" ascii | |||
|  |             file-lines fast-set '[ >lower _ in? not ] count
 | |||
|  |         ] | |||
|  |     } cleave text-analysis boa ;
 | |||
|  | 
 | |||
|  | : syllables-per-word ( text-analysis -- n )
 | |||
|  |     [ #syllables>> ] [ #words>> ] bi / ;
 | |||
|  | 
 | |||
|  | : words-per-sentence ( text-analysis -- n )
 | |||
|  |     [ #words>> ] [ #sentences>> ] bi / ;
 | |||
|  | 
 | |||
|  | : chars-per-word ( text-analysis -- n )
 | |||
|  |     [ #chars>> ] [ #words>> ] bi / ;
 | |||
|  | 
 | |||
|  | : sentences-per-word ( text-analysis -- n )
 | |||
|  |     [ #sentences>> ] [ #words>> ] bi / ;
 | |||
|  | 
 | |||
|  | : percent-complex-words ( text-analysis -- n )
 | |||
|  |     [ #complex-words>> ] [ #words>> ] bi / 100 * ;
 | |||
|  | 
 | |||
|  | : percent-difficult-words ( text-analysis -- n )
 | |||
|  |     [ #difficult-words>> ] [ #words>> ] bi / 100 * ;
 | |||
|  | 
 | |||
|  | : flesch-kincaid ( text-analysis -- n )
 | |||
|  |     [ words-per-sentence 0.39 * ] | |||
|  |     [ syllables-per-word 11.8 * ] bi + 15.59 - ;
 | |||
|  | 
 | |||
|  | : flesch ( text-analysis -- n )
 | |||
|  |     206.835 swap
 | |||
|  |     [ words-per-sentence 1.015 * - ] | |||
|  |     [ syllables-per-word 84.6 * - ] bi ;
 | |||
|  | 
 | |||
|  | : gunning-fog ( text-analysis -- n )
 | |||
|  |     [ words-per-sentence ] [ percent-complex-words ] bi + 0.4 * ;
 | |||
|  | 
 | |||
|  | : coleman-liau ( text-analysis -- n )
 | |||
|  |     [ chars-per-word 5.88 * ] | |||
|  |     [ sentences-per-word 29.6 * ] bi - 15.8 - ;
 | |||
|  | 
 | |||
|  | : smog ( text-analysis -- n )
 | |||
|  |     [ #complex-words>> ] [ #sentences>> 30 swap / ] bi *
 | |||
|  |     sqrt 1.0430 * 3.1291 + ;
 | |||
|  | 
 | |||
|  | : automated-readability ( text-analysis -- n )
 | |||
|  |     [ chars-per-word 4.71 * ] | |||
|  |     [ words-per-sentence 0.5 * ] bi + 21.43 - ;
 | |||
|  | 
 | |||
|  | : dale-chall ( text-analysis -- n )
 | |||
|  |     [ | |||
|  |         percent-difficult-words | |||
|  |         [ 0.1579 * ] [ 0.05 > [ 3.6365 + ] when ] bi
 | |||
|  |     ] | |||
|  |     [ words-per-sentence 0.0496 * ] bi + ;
 | |||
|  | 
 | |||
|  | STRING: text-report-format | |||
|  | Number of paragraphs           %d | |||
|  | Number of sentences            %d | |||
|  | Number of words                %d | |||
|  | Number of characters           %d | |||
|  | 
 | |||
|  | Average words per sentence     %.2f | |||
|  | Average syllables per word     %.2f | |||
|  | 
 | |||
|  | Flesch Reading Ease            %2.2f | |||
|  | Flesh-Kincaid Grade Level      %2.2f | |||
|  | Gunning fog index              %2.2f | |||
|  | Coleman–Liau index             %2.2f | |||
|  | SMOG grade                     %2.2f | |||
|  | Automated Readability index    %2.2f | |||
|  | Dale-Chall readability         %2.2f | |||
|  | 
 | |||
|  | ;
 | |||
|  | 
 | |||
|  | PRIVATE>
 | |||
|  | 
 | |||
|  | : analyze-text. ( str -- )
 | |||
|  |     <text-analysis> { | |||
|  |         [ #paragraphs>> ] | |||
|  |         [ #sentences>> ] | |||
|  |         [ #words>> ] | |||
|  |         [ #chars>> ] | |||
|  |         [ words-per-sentence ] | |||
|  |         [ syllables-per-word ] | |||
|  |         [ flesch ] | |||
|  |         [ flesch-kincaid ] | |||
|  |         [ gunning-fog ] | |||
|  |         [ coleman-liau ] | |||
|  |         [ smog ] | |||
|  |         [ automated-readability ] | |||
|  |         [ dale-chall ] | |||
|  |     } cleave text-report-format printf ;
 |