factor/extra/text-analysis/text-analysis.factor

! Copyright (C) 2014 John Benediktsson
! See http://factorcode.org/license.txt for BSD license
USING: accessors combinators formatting fry io.encodings.ascii
io.files kernel literals math math.functions math.order
multiline regexp sequences sequences.extras sets splitting
unicode ;

IN: text-analysis

<PRIVATE
: trimmed ( seq -- seq )
    [ [ blank? ] trim ] map harvest ;
: split-paragraphs ( str -- seq )
    R/ \r?\n\r?\n/ re-split trimmed ;
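! split-paragraphs treats blank lines as paragraph breaks, e.g.
! (illustrative) "one\n\ntwo" split-paragraphs should give { "one" "two" }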
<<
CONSTANT: ABBREVIATIONS {
"jr" "mr" "mrs" "ms" "dr" "prof" "sr" "sen" "rep" "rev"
"gov" "atty" "supt" "det" "rev" "col','gen" "lt" "cmdr"
"adm" "capt" "sgt" "cpl" "maj" ! titles
"dept" "univ" "uni" "assn" "bros" "inc" "ltd" "co" "corp"
"plc" ! entities
"jan" "feb" "mar" "apr" "may" "jun" "jul" "aug" "sep" "oct"
"nov" "dec" "sept" ! months
"mon" "tue" "wed" "thu" "fri" "sat" "sun" ! days
"vs" "etc" "no" "esp" "cf" ! misc
"ave" "bld" "blvd" "cl" "ct" "cres" "dr" "rd" "st" ! streets
}
>>
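! ABBREVIATIONS is wrapped in << ... >> so the constant already exists at
! parse time, when the $[ ... ] regexp literals in split-sentences are built.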
: split-sentences ( str -- seq )
    ! Mark end of sentences with EOS marker
    R/ ((?:[\.?!]|[\r\n]+)(?:\"|\'|\)|\]|\})?)(\s+)/
    [ [ ".?!\r\n\"')]}" member? not ] cut-when "\x01" glue ]
    re-replace-with
    ! Fix ellipsis marks
    $[ "(\\.\\.\\.*)\x01" <regexp> ] [ but-last-slice ]
    re-replace-with
    ! Fix e.g., i.e. marks
    $[
        "(?:\\s(?:(?:(?:\\w\\.){2,}\\w?)|(?:\\w\\.\\w)))\x01(\\s+[a-z0-9])"
        <regexp>
    ] [ [ 1 = ] cut-when append ] re-replace-with
    ! Fix abbreviations
    $[
        ABBREVIATIONS "|" join "(" ")\\.\x01" surround
        "i" <optioned-regexp>
    ] [ CHAR: . over index head ] re-replace-with
    ! Split on EOS marker
    "\x01" split trimmed ;
CONSTANT: sub-syllable {
    R/ [^aeiou]e$/ ! give, love, bone, done, ride ...
    R/ [aeiou](?:([cfghklmnprsvwz])\1?|ck|sh|[rt]ch)e[ds]$/
    ! (passive) past participles and 3rd person sing present verbs:
    ! bared, liked, called, tricked, bashed, matched
    R/ .e(?:ly|less(?:ly)?|ness?|ful(?:ly)?|ments?)$/
    ! nominal, adjectival and adverbial derivatives from -e$ roots:
    ! absolutely, nicely, likeness, basement, hopeless
    ! hopeful, tastefully, wasteful
    R/ ion/ ! action, diction, fiction
    R/ [ct]ia[nl]/ ! special(ly), initial, physician, christian
    R/ [^cx]iou/ ! illustrious, NOT spacious, gracious, anxious, noxious
    R/ sia$/ ! amnesia, polynesia
    R/ .gue$/ ! dialogue, intrigue, colleague
}
CONSTANT: add-syllable {
    R/ i[aiou]/ ! alias, science, phobia
    R/ [dls]ien/ ! salient, gradient, transient
    R/ [aeiouym]ble$/ ! -Vble, plus -mble
    R/ [aeiou]{3}/ ! agreeable
    R/ ^mc/ ! mcwhatever
    R/ ism$/ ! sexism, racism
    R/ (?:([^aeiouy])\1|ck|mp|ng)le$/ ! bubble, cattle, cackle, sample, angle
    R/ dnt$/ ! couldn't
    R/ [aeiou]y[aeiou]/ ! annoying, layer
}
: syllables ( str -- n )
    dup length 1 = [ drop 1 ] [
        >lower CHAR: . swap remove
        [ R/ [aeiouy]+/ count-matches ]
        [ sub-syllable [ matches? ] with count - ]
        [ add-syllable [ matches? ] with count + ] tri
        1 max
    ] if ;
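! syllables counts vowel groups, then applies the correction patterns
! above; e.g. (rough check) "hello" has two vowel groups and needs no
! correction, so it counts as 2 syllables.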
: split-words ( str -- words )
    R/ \b([a-z][a-z\-']*)\b/i all-matching-subseqs ;
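! split-words keeps runs that start with a letter and may contain hyphens
! or apostrophes, so "it's" and "well-known" each count as one word.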
TUPLE: text-analysis #paragraphs #sentences #chars #words
    #syllables #complex-words #unique-words #difficult-words ;
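! #complex-words counts words with more than two syllables and no hyphen;
! #difficult-words counts words missing from the familiar-word list in
! vocab:text-analysis/dale-chall.txt.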
: <text-analysis> ( str -- text-analysis )
    {
        [ split-paragraphs length ]
        [ split-sentences length ]
        [ [ blank? not ] count ]
        [ split-words ]
    } cleave {
        [ length ]
        [
            [ 0 0 ] dip [
                [ syllables ] [ "-" member? not ] bi
                over 2 > and 1 0 ? [ + ] bi-curry@ bi*
            ] each
        ]
        [ members length ]
        [
            "vocab:text-analysis/dale-chall.txt" ascii
            file-lines fast-set '[ >lower _ in? not ] count
        ]
    } cleave text-analysis boa ;
: syllables-per-word ( text-analysis -- n )
    [ #syllables>> ] [ #words>> ] bi / ;
: words-per-sentence ( text-analysis -- n )
    [ #words>> ] [ #sentences>> ] bi / ;
: chars-per-word ( text-analysis -- n )
    [ #chars>> ] [ #words>> ] bi / ;
: sentences-per-word ( text-analysis -- n )
    [ #sentences>> ] [ #words>> ] bi / ;
: percent-complex-words ( text-analysis -- n )
    [ #complex-words>> ] [ #words>> ] bi / 100 * ;
: percent-difficult-words ( text-analysis -- n )
    [ #difficult-words>> ] [ #words>> ] bi / 100 * ;
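! Flesch-Kincaid grade level:
!   0.39 * words/sentence + 11.8 * syllables/word - 15.59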
: flesch-kincaid ( text-analysis -- n )
    [ words-per-sentence 0.39 * ]
    [ syllables-per-word 11.8 * ] bi + 15.59 - ;
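! Flesch reading ease:
!   206.835 - 1.015 * words/sentence - 84.6 * syllables/word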
: flesch ( text-analysis -- n )
    206.835 swap
    [ words-per-sentence 1.015 * - ]
    [ syllables-per-word 84.6 * - ] bi ;
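! Gunning fog index:
!   0.4 * ( words/sentence + percentage of complex words )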
: gunning-fog ( text-analysis -- n )
    [ words-per-sentence ] [ percent-complex-words ] bi + 0.4 * ;
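! Coleman-Liau index (per-word form):
!   5.88 * chars/word - 29.6 * sentences/word - 15.8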
: coleman-liau ( text-analysis -- n )
    [ chars-per-word 5.88 * ]
    [ sentences-per-word 29.6 * ] bi - 15.8 - ;
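! SMOG grade:
!   1.0430 * sqrt( 30 * complex words / sentences ) + 3.1291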
: smog ( text-analysis -- n )
    [ #complex-words>> ] [ #sentences>> 30 swap / ] bi *
    sqrt 1.0430 * 3.1291 + ;
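! Automated readability index:
!   4.71 * chars/word + 0.5 * words/sentence - 21.43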
: automated-readability ( text-analysis -- n )
    [ chars-per-word 4.71 * ]
    [ words-per-sentence 0.5 * ] bi + 21.43 - ;
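! Dale-Chall readability:
!   0.1579 * percentage of difficult words + 0.0496 * words/sentence,
!   plus a 3.6365 adjustment when enough words are difficult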
: dale-chall ( text-analysis -- n )
    [
        percent-difficult-words
        [ 0.1579 * ] [ 0.05 > [ 3.6365 + ] when ] bi
    ]
    [ words-per-sentence 0.0496 * ] bi + ;
STRING: text-report-format
Number of paragraphs         %d
Number of sentences          %d
Number of words              %d
Number of characters         %d
Average words per sentence   %.2f
Average syllables per word   %.2f
Flesch Reading Ease          %2.2f
Flesch-Kincaid Grade Level   %2.2f
Gunning fog index            %2.2f
Coleman-Liau index           %2.2f
SMOG grade                   %2.2f
Automated Readability index  %2.2f
Dale-Chall readability       %2.2f
;
PRIVATE>
: analyze-text. ( str -- )
    <text-analysis> {
        [ #paragraphs>> ]
        [ #sentences>> ]
        [ #words>> ]
        [ #chars>> ]
        [ words-per-sentence ]
        [ syllables-per-word ]
        [ flesch ]
        [ flesch-kincaid ]
        [ gunning-fog ]
        [ coleman-liau ]
        [ smog ]
        [ automated-readability ]
        [ dale-chall ]
    } cleave text-report-format printf ;
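
! Example usage (prints the report above):
!   USE: text-analysis
!   "The quick brown fox jumps over the lazy dog." analyze-text.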