217 lines
		
	
	
		
			6.1 KiB
		
	
	
	
		
			Factor
		
	
	
		
		
			
		
	
	
			217 lines
		
	
	
		
			6.1 KiB
		
	
	
	
		
			Factor
		
	
	
| 
								 | 
							
								! Copyright (C) 2014 John Benediktsson
							 | 
						|||
| 
								 | 
							
								! See http://factorcode.org/license.txt for BSD license
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								USING: accessors combinators formatting fry io.encodings.ascii
							 | 
						|||
| 
								 | 
							
								io.files kernel literals math math.functions math.order
							 | 
						|||
| 
								 | 
							
								multiline regexp sequences sequences.extras sets splitting
							 | 
						|||
| 
								 | 
							
								unicode ;
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								IN: text-analysis
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								<PRIVATE
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								: trimmed ( seq -- seq )
							 | 
						|||
| 
								 | 
							
								    [ [ blank? ] trim ] map harvest ;
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								: split-paragraphs ( str -- seq )
							 | 
						|||
| 
								 | 
							
								    R/ \r?\n\r?\n/ re-split trimmed ;
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								<<
							 | 
						|||
| 
								 | 
							
								CONSTANT: ABBREVIATIONS {
							 | 
						|||
| 
								 | 
							
								    "jr" "mr" "mrs" "ms" "dr" "prof" "sr" "sen" "rep" "rev"
							 | 
						|||
| 
								 | 
							
								    "gov" "atty" "supt" "det" "rev" "col','gen" "lt" "cmdr"
							 | 
						|||
| 
								 | 
							
								    "adm" "capt" "sgt" "cpl" "maj" ! titles
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								    "dept" "univ" "uni" "assn" "bros" "inc" "ltd" "co" "corp"
							 | 
						|||
| 
								 | 
							
								    "plc" ! entities
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								    "jan" "feb" "mar" "apr" "may" "jun" "jul" "aug" "sep" "oct"
							 | 
						|||
| 
								 | 
							
								    "nov" "dec" "sept" ! months
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								    "mon" "tue" "wed" "thu" "fri" "sat" "sun" ! days
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								    "vs" "etc" "no" "esp" "cf" ! misc
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								    "ave" "bld" "blvd" "cl" "ct" "cres" "dr" "rd" "st" ! streets
							 | 
						|||
| 
								 | 
							
								}
							 | 
						|||
| 
								 | 
							
								>>
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								: split-sentences ( str -- seq )
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								    ! Mark end of sentences with EOS marker
							 | 
						|||
| 
								 | 
							
								    R/ ((?:[\.?!]|[\r\n]+)(?:\"|\'|\)|\]|\})?)(\s+)/
							 | 
						|||
| 
								 | 
							
								    [ [ ".?!\r\n\"')]}" member? not ] cut-when "\x01" glue ]
							 | 
						|||
| 
								 | 
							
								    re-replace-with
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								    ! Fix ellipsis marks
							 | 
						|||
| 
								 | 
							
								    $[ "(\\.\\.\\.*)\x01" <regexp> ] [ but-last-slice ]
							 | 
						|||
| 
								 | 
							
								    re-replace-with
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								    ! Fix e.g, i.e. marks
							 | 
						|||
| 
								 | 
							
								    $[
							 | 
						|||
| 
								 | 
							
								        "(?:\\s(?:(?:(?:\\w\\.){2,}\\w?)|(?:\\w\\.\\w)))\x01(\\s+[a-z0-9])"
							 | 
						|||
| 
								 | 
							
								        <regexp>
							 | 
						|||
| 
								 | 
							
								    ] [ [ 1 = ] cut-when append ] re-replace-with
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								    ! Fix abbreviations
							 | 
						|||
| 
								 | 
							
								    $[
							 | 
						|||
| 
								 | 
							
								        ABBREVIATIONS "|" join "(" ")\\.\x01" surround
							 | 
						|||
| 
								 | 
							
								        "i" <optioned-regexp>
							 | 
						|||
| 
								 | 
							
								    ] [ CHAR: . over index head ] re-replace-with
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								    ! Split on EOS marker
							 | 
						|||
| 
								 | 
							
								    "\x01" split trimmed ;
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								CONSTANT: sub-syllable {
							 | 
						|||
| 
								 | 
							
								    R/ [^aeiou]e$/ ! give, love, bone, done, ride ...
							 | 
						|||
| 
								 | 
							
								    R/ [aeiou](?:([cfghklmnprsvwz])\1?|ck|sh|[rt]ch)e[ds]$/
							 | 
						|||
| 
								 | 
							
								    ! (passive) past participles and 3rd person sing present verbs:
							 | 
						|||
| 
								 | 
							
								    ! bared, liked, called, tricked, bashed, matched
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								    R/ .e(?:ly|less(?:ly)?|ness?|ful(?:ly)?|ments?)$/
							 | 
						|||
| 
								 | 
							
								    ! nominal, adjectival and adverbial derivatives from -e$ roots:
							 | 
						|||
| 
								 | 
							
								    ! absolutely, nicely, likeness, basement, hopeless
							 | 
						|||
| 
								 | 
							
								    ! hopeful, tastefully, wasteful
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								    R/ ion/ ! action, diction, fiction
							 | 
						|||
| 
								 | 
							
								    R/ [ct]ia[nl]/ ! special(ly), initial, physician, christian
							 | 
						|||
| 
								 | 
							
								    R/ [^cx]iou/ ! illustrious, NOT spacious, gracious, anxious, noxious
							 | 
						|||
| 
								 | 
							
								    R/ sia$/ ! amnesia, polynesia
							 | 
						|||
| 
								 | 
							
								    R/ .gue$/ ! dialogue, intrigue, colleague
							 | 
						|||
| 
								 | 
							
								}
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								CONSTANT: add-syllable {
							 | 
						|||
| 
								 | 
							
								    R/ i[aiou]/ ! alias, science, phobia
							 | 
						|||
| 
								 | 
							
								    R/ [dls]ien/ ! salient, gradient, transient
							 | 
						|||
| 
								 | 
							
								    R/ [aeiouym]ble$/ ! -Vble, plus -mble
							 | 
						|||
| 
								 | 
							
								    R/ [aeiou]{3}/ ! agreeable
							 | 
						|||
| 
								 | 
							
								    R/ ^mc/ ! mcwhatever
							 | 
						|||
| 
								 | 
							
								    R/ ism$/ ! sexism, racism
							 | 
						|||
| 
								 | 
							
								    R/ (?:([^aeiouy])\1|ck|mp|ng)le$/ ! bubble, cattle, cackle, sample, angle
							 | 
						|||
| 
								 | 
							
								    R/ dnt$/ ! couldn/t
							 | 
						|||
| 
								 | 
							
								    R/ [aeiou]y[aeiou]/ ! annoying, layer
							 | 
						|||
| 
								 | 
							
								}
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								: syllables ( str -- n )
							 | 
						|||
| 
								 | 
							
								    dup length 1 = [ drop 1 ] [
							 | 
						|||
| 
								 | 
							
								        >lower CHAR: . swap remove
							 | 
						|||
| 
								 | 
							
								        [ R/ [aeiouy]+/ count-matches ]
							 | 
						|||
| 
								 | 
							
								        [ sub-syllable [ matches? ] with count - ]
							 | 
						|||
| 
								 | 
							
								        [ add-syllable [ matches? ] with count + ] tri
							 | 
						|||
| 
								 | 
							
								        1 max
							 | 
						|||
| 
								 | 
							
								    ] if ;
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								: split-words ( str -- words )
							 | 
						|||
| 
								 | 
							
								    R/ \b([a-z][a-z\-']*)\b/i all-matching-subseqs ;
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								TUPLE: text-analysis #paragraphs #sentences #chars #words
							 | 
						|||
| 
								 | 
							
								#syllables #complex-words #unique-words #difficult-words ;
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								: <text-analysis> ( str -- text-analysis )
							 | 
						|||
| 
								 | 
							
								    {
							 | 
						|||
| 
								 | 
							
								        [ split-paragraphs length ]
							 | 
						|||
| 
								 | 
							
								        [ split-sentences length ]
							 | 
						|||
| 
								 | 
							
								        [ [ blank? not ] count ]
							 | 
						|||
| 
								 | 
							
								        [ split-words ]
							 | 
						|||
| 
								 | 
							
								    } cleave {
							 | 
						|||
| 
								 | 
							
								        [ length ]
							 | 
						|||
| 
								 | 
							
								        [
							 | 
						|||
| 
								 | 
							
								            [ 0 0 ] dip [
							 | 
						|||
| 
								 | 
							
								                [ syllables ] [ "-" member? not ] bi
							 | 
						|||
| 
								 | 
							
								                over 2 > and 1 0 ? [ + ] bi-curry@ bi*
							 | 
						|||
| 
								 | 
							
								            ] each
							 | 
						|||
| 
								 | 
							
								        ]
							 | 
						|||
| 
								 | 
							
								        [ members length ]
							 | 
						|||
| 
								 | 
							
								        [
							 | 
						|||
| 
								 | 
							
								            "vocab:text-analysis/dale-chall.txt" ascii
							 | 
						|||
| 
								 | 
							
								            file-lines fast-set '[ >lower _ in? not ] count
							 | 
						|||
| 
								 | 
							
								        ]
							 | 
						|||
| 
								 | 
							
								    } cleave text-analysis boa ;
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								: syllables-per-word ( text-analysis -- n )
							 | 
						|||
| 
								 | 
							
								    [ #syllables>> ] [ #words>> ] bi / ;
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								: words-per-sentence ( text-analysis -- n )
							 | 
						|||
| 
								 | 
							
								    [ #words>> ] [ #sentences>> ] bi / ;
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								: chars-per-word ( text-analysis -- n )
							 | 
						|||
| 
								 | 
							
								    [ #chars>> ] [ #words>> ] bi / ;
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								: sentences-per-word ( text-analysis -- n )
							 | 
						|||
| 
								 | 
							
								    [ #sentences>> ] [ #words>> ] bi / ;
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								: percent-complex-words ( text-analysis -- n )
							 | 
						|||
| 
								 | 
							
								    [ #complex-words>> ] [ #words>> ] bi / 100 * ;
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								: percent-difficult-words ( text-analysis -- n )
							 | 
						|||
| 
								 | 
							
								    [ #difficult-words>> ] [ #words>> ] bi / 100 * ;
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								: flesch-kincaid ( text-analysis -- n )
							 | 
						|||
| 
								 | 
							
								    [ words-per-sentence 0.39 * ]
							 | 
						|||
| 
								 | 
							
								    [ syllables-per-word 11.8 * ] bi + 15.59 - ;
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								: flesch ( text-analysis -- n )
							 | 
						|||
| 
								 | 
							
								    206.835 swap
							 | 
						|||
| 
								 | 
							
								    [ words-per-sentence 1.015 * - ]
							 | 
						|||
| 
								 | 
							
								    [ syllables-per-word 84.6 * - ] bi ;
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								: gunning-fog ( text-analysis -- n )
							 | 
						|||
| 
								 | 
							
								    [ words-per-sentence ] [ percent-complex-words ] bi + 0.4 * ;
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								: coleman-liau ( text-analysis -- n )
							 | 
						|||
| 
								 | 
							
								    [ chars-per-word 5.88 * ]
							 | 
						|||
| 
								 | 
							
								    [ sentences-per-word 29.6 * ] bi - 15.8 - ;
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								: smog ( text-analysis -- n )
							 | 
						|||
| 
								 | 
							
								    [ #complex-words>> ] [ #sentences>> 30 swap / ] bi *
							 | 
						|||
| 
								 | 
							
								    sqrt 1.0430 * 3.1291 + ;
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								: automated-readability ( text-analysis -- n )
							 | 
						|||
| 
								 | 
							
								    [ chars-per-word 4.71 * ]
							 | 
						|||
| 
								 | 
							
								    [ words-per-sentence 0.5 * ] bi + 21.43 - ;
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								: dale-chall ( text-analysis -- n )
							 | 
						|||
| 
								 | 
							
								    [
							 | 
						|||
| 
								 | 
							
								        percent-difficult-words
							 | 
						|||
| 
								 | 
							
								        [ 0.1579 * ] [ 0.05 > [ 3.6365 + ] when ] bi
							 | 
						|||
| 
								 | 
							
								    ]
							 | 
						|||
| 
								 | 
							
								    [ words-per-sentence 0.0496 * ] bi + ;
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								STRING: text-report-format
							 | 
						|||
| 
								 | 
							
								Number of paragraphs           %d
							 | 
						|||
| 
								 | 
							
								Number of sentences            %d
							 | 
						|||
| 
								 | 
							
								Number of words                %d
							 | 
						|||
| 
								 | 
							
								Number of characters           %d
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								Average words per sentence     %.2f
							 | 
						|||
| 
								 | 
							
								Average syllables per word     %.2f
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								Flesch Reading Ease            %2.2f
							 | 
						|||
| 
								 | 
							
								Flesh-Kincaid Grade Level      %2.2f
							 | 
						|||
| 
								 | 
							
								Gunning fog index              %2.2f
							 | 
						|||
| 
								 | 
							
								Coleman–Liau index             %2.2f
							 | 
						|||
| 
								 | 
							
								SMOG grade                     %2.2f
							 | 
						|||
| 
								 | 
							
								Automated Readability index    %2.2f
							 | 
						|||
| 
								 | 
							
								Dale-Chall readability         %2.2f
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								;
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								PRIVATE>
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								: analyze-text. ( str -- )
							 | 
						|||
| 
								 | 
							
								    <text-analysis> {
							 | 
						|||
| 
								 | 
							
								        [ #paragraphs>> ]
							 | 
						|||
| 
								 | 
							
								        [ #sentences>> ]
							 | 
						|||
| 
								 | 
							
								        [ #words>> ]
							 | 
						|||
| 
								 | 
							
								        [ #chars>> ]
							 | 
						|||
| 
								 | 
							
								        [ words-per-sentence ]
							 | 
						|||
| 
								 | 
							
								        [ syllables-per-word ]
							 | 
						|||
| 
								 | 
							
								        [ flesch ]
							 | 
						|||
| 
								 | 
							
								        [ flesch-kincaid ]
							 | 
						|||
| 
								 | 
							
								        [ gunning-fog ]
							 | 
						|||
| 
								 | 
							
								        [ coleman-liau ]
							 | 
						|||
| 
								 | 
							
								        [ smog ]
							 | 
						|||
| 
								 | 
							
								        [ automated-readability ]
							 | 
						|||
| 
								 | 
							
								        [ dale-chall ]
							 | 
						|||
| 
								 | 
							
								    } cleave text-report-format printf ;
							 |