119 lines
		
	
	
		
			3.5 KiB
		
	
	
	
		
			Factor
		
	
	
			
		
		
	
	
			119 lines
		
	
	
		
			3.5 KiB
		
	
	
	
		
			Factor
		
	
	
! Copyright (C) 2012 John Benediktsson, Doug Coleman
 | 
						|
! See http://factorcode.org/license.txt for BSD license
 | 
						|
 | 
						|
USING: accessors arrays ascii assocs byte-arrays combinators
 | 
						|
combinators.short-circuit concurrency.combinators csv grouping
 | 
						|
http.client images images.viewer io io.directories
 | 
						|
io.encodings.binary io.encodings.utf8 io.files io.launcher
 | 
						|
io.pathnames kernel math math.parser namespaces sequences
 | 
						|
splitting ui.gadgets.panes ;
 | 
						|
IN: machine-learning.data-sets
 | 
						|
 | 
						|
TUPLE: data-set
 | 
						|
    features targets
 | 
						|
    feature-names target-names
 | 
						|
    description ;
 | 
						|
 | 
						|
C: <data-set> data-set
 | 
						|
 | 
						|
<PRIVATE
 | 
						|
 | 
						|
: load-file ( name -- contents )
 | 
						|
    "resource:extra/machine-learning/data-sets/" prepend
 | 
						|
    utf8 file-contents ;
 | 
						|
 | 
						|
: load-tabular-file ( name -- lines )
 | 
						|
    load-file [ blank? ] trim string-lines
 | 
						|
    [ [ blank? ] split-when harvest ] map harvest ;
 | 
						|
 | 
						|
: numerify ( table -- data names )
 | 
						|
    unclip [ [ [ string>number ] map ] map ] dip ;
 | 
						|
 | 
						|
: load-table ( name -- data names )
 | 
						|
    load-tabular-file numerify ;
 | 
						|
 | 
						|
: load-table-csv ( name -- data names )
 | 
						|
    load-file string>csv numerify ;
 | 
						|
 | 
						|
PRIVATE>
 | 
						|
 | 
						|
: load-monks ( name -- data-set )
 | 
						|
    load-tabular-file
 | 
						|
    ! Omits the identifiers which are not so interesting.
 | 
						|
    [ but-last [ string>number ] map ] map
 | 
						|
    [ [ rest ] map ] [ [ first ] map ] bi
 | 
						|
    { "a1" "a2" "a3" "a4" "a5" "a6" }
 | 
						|
    { "no" "yes" }
 | 
						|
    "monks.names" load-file
 | 
						|
    <data-set> ;
 | 
						|
 | 
						|
: load-iris ( -- data-set )
 | 
						|
    "iris.csv" load-table-csv
 | 
						|
    [ [ unclip-last ] { } map>assoc unzip ] [ 2 tail ] bi*
 | 
						|
    {
 | 
						|
        "sepal length (cm)" "sepal width (cm)"
 | 
						|
        "petal length (cm)" "petal width (cm)"
 | 
						|
    } swap
 | 
						|
    "iris.rst" load-file
 | 
						|
    <data-set> ;
 | 
						|
 | 
						|
: load-linnerud ( -- data-set )
 | 
						|
    data-set new
 | 
						|
        "linnerud_exercise.csv" load-table
 | 
						|
        [ >>features ] [ >>feature-names ] bi*
 | 
						|
        "linnerud_physiological.csv" load-table
 | 
						|
        [ >>targets ] [ >>target-names ] bi*
 | 
						|
        "linnerud.rst" load-file >>description ;
 | 
						|
 | 
						|
: download-to-directory ( url directory -- )
 | 
						|
    dup make-directories
 | 
						|
    [
 | 
						|
        dup { [ download-name exists? ] [ file-stem exists? ] } 1|| [
 | 
						|
            drop
 | 
						|
        ] [
 | 
						|
            download
 | 
						|
        ] if
 | 
						|
    ] with-directory ;
 | 
						|
 | 
						|
: gzip-decompress-file ( path -- )
 | 
						|
    { "gzip" "-d" } swap suffix run-process drop ;
 | 
						|
 | 
						|
: mnist-data>array ( bytes -- seq )
 | 
						|
    16 tail-slice 28 28 * <groups> [
 | 
						|
        >byte-array <image>
 | 
						|
            swap >>bitmap
 | 
						|
            { 28 28 } >>dim
 | 
						|
            L >>component-order
 | 
						|
            ubyte-components >>component-type
 | 
						|
    ] map ;
 | 
						|
 | 
						|
: mnist-labels>array ( bytes -- seq )
 | 
						|
    8 tail-slice >array ;
 | 
						|
 | 
						|
: image-grid. ( image-seq -- )
 | 
						|
    [
 | 
						|
        [
 | 
						|
            <image-gadget> output-stream get write-gadget
 | 
						|
        ] each
 | 
						|
        output-stream get stream-nl
 | 
						|
    ] each ;
 | 
						|
 | 
						|
: load-mnist ( -- data-set )
 | 
						|
    "resource:datasets" dup make-directories [
 | 
						|
        {
 | 
						|
            "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz"
 | 
						|
            "http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz"
 | 
						|
            "http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz"
 | 
						|
            "http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz"
 | 
						|
        }
 | 
						|
        [ [ "resource:datasets/" download-to-directory ] parallel-each ]
 | 
						|
        [ [ dup file-stem exists? [ drop ] [ file-name gzip-decompress-file ] if ] each ]
 | 
						|
        [ [ file-stem binary file-contents ] map ] tri
 | 
						|
        first4 {
 | 
						|
            [ mnist-data>array ]
 | 
						|
            [ mnist-labels>array ]
 | 
						|
            [ mnist-data>array ]
 | 
						|
            [ mnist-labels>array ]
 | 
						|
        } spread 4array
 | 
						|
    ] with-directory ;
 |