machine-learning.data-sets: load commonly used test data sets.
parent
53382f4472
commit
bb3d028d30
|
@ -0,0 +1,34 @@
|
||||||
|
! Copyright (C) 2012 John Benediktsson
|
||||||
|
! See http://factorcode.org/license.txt for BSD license
|
||||||
|
|
||||||
|
USING: assocs csv io.encodings.utf8 io.files kernel math.parser
|
||||||
|
sequences ;
|
||||||
|
|
||||||
|
IN: machine-learning.data-sets
|
||||||
|
|
||||||
|
! A loaded data set. Slot order matters: the boa constructor below
! takes its inputs from the stack in exactly this order.
TUPLE: data-set
    data            ! one sequence of feature values per sample
    target          ! one class label per sample, parallel to data
    target-names    ! names of the classes
    description     ! free-form text describing the data set
    feature-names ; ! names of the feature columns

! <data-set> ( data target target-names description feature-names -- data-set )
C: <data-set> data-set
|
||||||
|
|
||||||
|
<PRIVATE
|
||||||
|
|
||||||
|
! Read one of the data files shipped next to this vocabulary.
! The given file name is appended to the fixed resource directory
! and the resulting path is read in full, decoded as UTF-8.
: load-file ( name -- contents )
    "resource:extra/machine-learning/data-sets/" swap append
    utf8 file-contents ;
|
||||||
|
|
||||||
|
PRIVATE>
|
||||||
|
|
||||||
|
! Build the Iris data-set from the bundled "iris.csv" and "iris.rst".
! The first CSV row is a header (sample count, feature count, then the
! class names); every following row holds the feature values of one
! sample followed by its class label as the last field.
: load-iris ( -- data-set )
    "iris.csv" load-file string>csv
    ! Separate the header row from the sample rows: ( rows header )
    unclip
    [
        ! Turn each row into a features/label pair, then split the
        ! pairs into two parallel sequences: ( data target )
        [ unclip-last [ [ string>number ] map ] dip string>number ]
        { } map>assoc unzip
    ] dip
    ! Skip the two leading header fields; what remains are the class names.
    2 tail
    "iris.rst" load-file
    {
        "sepal length (cm)"
        "sepal width (cm)"
        "petal length (cm)"
        "petal width (cm)"
    }
    <data-set> ;
|
|
@ -0,0 +1,151 @@
|
||||||
|
150,4,setosa,versicolor,virginica
|
||||||
|
5.1,3.5,1.4,0.2,0
|
||||||
|
4.9,3.0,1.4,0.2,0
|
||||||
|
4.7,3.2,1.3,0.2,0
|
||||||
|
4.6,3.1,1.5,0.2,0
|
||||||
|
5.0,3.6,1.4,0.2,0
|
||||||
|
5.4,3.9,1.7,0.4,0
|
||||||
|
4.6,3.4,1.4,0.3,0
|
||||||
|
5.0,3.4,1.5,0.2,0
|
||||||
|
4.4,2.9,1.4,0.2,0
|
||||||
|
4.9,3.1,1.5,0.1,0
|
||||||
|
5.4,3.7,1.5,0.2,0
|
||||||
|
4.8,3.4,1.6,0.2,0
|
||||||
|
4.8,3.0,1.4,0.1,0
|
||||||
|
4.3,3.0,1.1,0.1,0
|
||||||
|
5.8,4.0,1.2,0.2,0
|
||||||
|
5.7,4.4,1.5,0.4,0
|
||||||
|
5.4,3.9,1.3,0.4,0
|
||||||
|
5.1,3.5,1.4,0.3,0
|
||||||
|
5.7,3.8,1.7,0.3,0
|
||||||
|
5.1,3.8,1.5,0.3,0
|
||||||
|
5.4,3.4,1.7,0.2,0
|
||||||
|
5.1,3.7,1.5,0.4,0
|
||||||
|
4.6,3.6,1.0,0.2,0
|
||||||
|
5.1,3.3,1.7,0.5,0
|
||||||
|
4.8,3.4,1.9,0.2,0
|
||||||
|
5.0,3.0,1.6,0.2,0
|
||||||
|
5.0,3.4,1.6,0.4,0
|
||||||
|
5.2,3.5,1.5,0.2,0
|
||||||
|
5.2,3.4,1.4,0.2,0
|
||||||
|
4.7,3.2,1.6,0.2,0
|
||||||
|
4.8,3.1,1.6,0.2,0
|
||||||
|
5.4,3.4,1.5,0.4,0
|
||||||
|
5.2,4.1,1.5,0.1,0
|
||||||
|
5.5,4.2,1.4,0.2,0
|
||||||
|
4.9,3.1,1.5,0.1,0
|
||||||
|
5.0,3.2,1.2,0.2,0
|
||||||
|
5.5,3.5,1.3,0.2,0
|
||||||
|
4.9,3.1,1.5,0.1,0
|
||||||
|
4.4,3.0,1.3,0.2,0
|
||||||
|
5.1,3.4,1.5,0.2,0
|
||||||
|
5.0,3.5,1.3,0.3,0
|
||||||
|
4.5,2.3,1.3,0.3,0
|
||||||
|
4.4,3.2,1.3,0.2,0
|
||||||
|
5.0,3.5,1.6,0.6,0
|
||||||
|
5.1,3.8,1.9,0.4,0
|
||||||
|
4.8,3.0,1.4,0.3,0
|
||||||
|
5.1,3.8,1.6,0.2,0
|
||||||
|
4.6,3.2,1.4,0.2,0
|
||||||
|
5.3,3.7,1.5,0.2,0
|
||||||
|
5.0,3.3,1.4,0.2,0
|
||||||
|
7.0,3.2,4.7,1.4,1
|
||||||
|
6.4,3.2,4.5,1.5,1
|
||||||
|
6.9,3.1,4.9,1.5,1
|
||||||
|
5.5,2.3,4.0,1.3,1
|
||||||
|
6.5,2.8,4.6,1.5,1
|
||||||
|
5.7,2.8,4.5,1.3,1
|
||||||
|
6.3,3.3,4.7,1.6,1
|
||||||
|
4.9,2.4,3.3,1.0,1
|
||||||
|
6.6,2.9,4.6,1.3,1
|
||||||
|
5.2,2.7,3.9,1.4,1
|
||||||
|
5.0,2.0,3.5,1.0,1
|
||||||
|
5.9,3.0,4.2,1.5,1
|
||||||
|
6.0,2.2,4.0,1.0,1
|
||||||
|
6.1,2.9,4.7,1.4,1
|
||||||
|
5.6,2.9,3.6,1.3,1
|
||||||
|
6.7,3.1,4.4,1.4,1
|
||||||
|
5.6,3.0,4.5,1.5,1
|
||||||
|
5.8,2.7,4.1,1.0,1
|
||||||
|
6.2,2.2,4.5,1.5,1
|
||||||
|
5.6,2.5,3.9,1.1,1
|
||||||
|
5.9,3.2,4.8,1.8,1
|
||||||
|
6.1,2.8,4.0,1.3,1
|
||||||
|
6.3,2.5,4.9,1.5,1
|
||||||
|
6.1,2.8,4.7,1.2,1
|
||||||
|
6.4,2.9,4.3,1.3,1
|
||||||
|
6.6,3.0,4.4,1.4,1
|
||||||
|
6.8,2.8,4.8,1.4,1
|
||||||
|
6.7,3.0,5.0,1.7,1
|
||||||
|
6.0,2.9,4.5,1.5,1
|
||||||
|
5.7,2.6,3.5,1.0,1
|
||||||
|
5.5,2.4,3.8,1.1,1
|
||||||
|
5.5,2.4,3.7,1.0,1
|
||||||
|
5.8,2.7,3.9,1.2,1
|
||||||
|
6.0,2.7,5.1,1.6,1
|
||||||
|
5.4,3.0,4.5,1.5,1
|
||||||
|
6.0,3.4,4.5,1.6,1
|
||||||
|
6.7,3.1,4.7,1.5,1
|
||||||
|
6.3,2.3,4.4,1.3,1
|
||||||
|
5.6,3.0,4.1,1.3,1
|
||||||
|
5.5,2.5,4.0,1.3,1
|
||||||
|
5.5,2.6,4.4,1.2,1
|
||||||
|
6.1,3.0,4.6,1.4,1
|
||||||
|
5.8,2.6,4.0,1.2,1
|
||||||
|
5.0,2.3,3.3,1.0,1
|
||||||
|
5.6,2.7,4.2,1.3,1
|
||||||
|
5.7,3.0,4.2,1.2,1
|
||||||
|
5.7,2.9,4.2,1.3,1
|
||||||
|
6.2,2.9,4.3,1.3,1
|
||||||
|
5.1,2.5,3.0,1.1,1
|
||||||
|
5.7,2.8,4.1,1.3,1
|
||||||
|
6.3,3.3,6.0,2.5,2
|
||||||
|
5.8,2.7,5.1,1.9,2
|
||||||
|
7.1,3.0,5.9,2.1,2
|
||||||
|
6.3,2.9,5.6,1.8,2
|
||||||
|
6.5,3.0,5.8,2.2,2
|
||||||
|
7.6,3.0,6.6,2.1,2
|
||||||
|
4.9,2.5,4.5,1.7,2
|
||||||
|
7.3,2.9,6.3,1.8,2
|
||||||
|
6.7,2.5,5.8,1.8,2
|
||||||
|
7.2,3.6,6.1,2.5,2
|
||||||
|
6.5,3.2,5.1,2.0,2
|
||||||
|
6.4,2.7,5.3,1.9,2
|
||||||
|
6.8,3.0,5.5,2.1,2
|
||||||
|
5.7,2.5,5.0,2.0,2
|
||||||
|
5.8,2.8,5.1,2.4,2
|
||||||
|
6.4,3.2,5.3,2.3,2
|
||||||
|
6.5,3.0,5.5,1.8,2
|
||||||
|
7.7,3.8,6.7,2.2,2
|
||||||
|
7.7,2.6,6.9,2.3,2
|
||||||
|
6.0,2.2,5.0,1.5,2
|
||||||
|
6.9,3.2,5.7,2.3,2
|
||||||
|
5.6,2.8,4.9,2.0,2
|
||||||
|
7.7,2.8,6.7,2.0,2
|
||||||
|
6.3,2.7,4.9,1.8,2
|
||||||
|
6.7,3.3,5.7,2.1,2
|
||||||
|
7.2,3.2,6.0,1.8,2
|
||||||
|
6.2,2.8,4.8,1.8,2
|
||||||
|
6.1,3.0,4.9,1.8,2
|
||||||
|
6.4,2.8,5.6,2.1,2
|
||||||
|
7.2,3.0,5.8,1.6,2
|
||||||
|
7.4,2.8,6.1,1.9,2
|
||||||
|
7.9,3.8,6.4,2.0,2
|
||||||
|
6.4,2.8,5.6,2.2,2
|
||||||
|
6.3,2.8,5.1,1.5,2
|
||||||
|
6.1,2.6,5.6,1.4,2
|
||||||
|
7.7,3.0,6.1,2.3,2
|
||||||
|
6.3,3.4,5.6,2.4,2
|
||||||
|
6.4,3.1,5.5,1.8,2
|
||||||
|
6.0,3.0,4.8,1.8,2
|
||||||
|
6.9,3.1,5.4,2.1,2
|
||||||
|
6.7,3.1,5.6,2.4,2
|
||||||
|
6.9,3.1,5.1,2.3,2
|
||||||
|
5.8,2.7,5.1,1.9,2
|
||||||
|
6.8,3.2,5.9,2.3,2
|
||||||
|
6.7,3.3,5.7,2.5,2
|
||||||
|
6.7,3.0,5.2,2.3,2
|
||||||
|
6.3,2.5,5.0,1.9,2
|
||||||
|
6.5,3.0,5.2,2.0,2
|
||||||
|
6.2,3.4,5.4,2.3,2
|
||||||
|
5.9,3.0,5.1,1.8,2
|
|
|
@ -0,0 +1,59 @@
|
||||||
|
Iris Plants Database
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
Data Set Characteristics:
|
||||||
|
:Number of Instances: 150 (50 in each of three classes)
|
||||||
|
:Number of Attributes: 4 numeric, predictive attributes and the class
|
||||||
|
:Attribute Information:
|
||||||
|
- sepal length in cm
|
||||||
|
- sepal width in cm
|
||||||
|
- petal length in cm
|
||||||
|
- petal width in cm
|
||||||
|
- class:
|
||||||
|
- Iris-Setosa
|
||||||
|
- Iris-Versicolor
|
||||||
|
- Iris-Virginica
|
||||||
|
:Summary Statistics:
|
||||||
|
============== ==== ==== ======= ===== ====================
|
||||||
|
Min Max Mean SD Class Correlation
|
||||||
|
============== ==== ==== ======= ===== ====================
|
||||||
|
sepal length: 4.3 7.9 5.84 0.83 0.7826
|
||||||
|
sepal width: 2.0 4.4 3.05 0.43 -0.4194
|
||||||
|
petal length: 1.0 6.9 3.76 1.76 0.9490 (high!)
|
||||||
|
petal width: 0.1 2.5 1.20 0.76 0.9565 (high!)
|
||||||
|
============== ==== ==== ======= ===== ====================
|
||||||
|
:Missing Attribute Values: None
|
||||||
|
:Class Distribution: 33.3% for each of 3 classes.
|
||||||
|
:Creator: R.A. Fisher
|
||||||
|
:Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
|
||||||
|
:Date: July, 1988
|
||||||
|
|
||||||
|
This is a copy of the UCI ML Iris data set.
|
||||||
|
http://archive.ics.uci.edu/ml/datasets/Iris
|
||||||
|
|
||||||
|
The famous Iris database, first used by Sir R.A. Fisher.
|
||||||
|
|
||||||
|
This is perhaps the best known database to be found in the
|
||||||
|
pattern recognition literature. Fisher's paper is a classic in the field and
|
||||||
|
is referenced frequently to this day. (See Duda & Hart, for example.) The
|
||||||
|
data set contains 3 classes of 50 instances each, where each class refers to a
|
||||||
|
type of iris plant. One class is linearly separable from the other 2; the
|
||||||
|
latter are NOT linearly separable from each other.
|
||||||
|
|
||||||
|
References
|
||||||
|
----------
|
||||||
|
- Fisher,R.A. "The use of multiple measurements in taxonomic problems"
|
||||||
|
Annals of Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to
|
||||||
|
Mathematical Statistics" (John Wiley, NY, 1950).
|
||||||
|
- Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis.
|
||||||
|
(Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.
|
||||||
|
- Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System
|
||||||
|
Structure and Classification Rule for Recognition in Partially Exposed
|
||||||
|
Environments". IEEE Transactions on Pattern Analysis and Machine
|
||||||
|
Intelligence, Vol. PAMI-2, No. 1, 67-71.
|
||||||
|
- Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE Transactions
|
||||||
|
on Information Theory, May 1972, 431-433.
|
||||||
|
- See also: 1988 MLC Proceedings, 54-64. Cheeseman et al.'s AUTOCLASS II
|
||||||
|
conceptual clustering system finds 3 classes in the data.
|
||||||
|
- Many, many more ...
|
Loading…
Reference in New Issue