From 18c36b971ff2e4529368d3df9807457874521e6f Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Wed, 7 Nov 2018 08:02:56 +0100 Subject: [PATCH] Mnist Dataset (#326) * Implement MnistDataset * Add MNIST dataset documentation --- README.md | 1 + docs/index.md | 1 + .../datasets/mnist-dataset.md | 26 +++++ mkdocs.yml | 1 + phpstan.neon | 2 +- src/Dataset/MnistDataset.php | 101 ++++++++++++++++++ tests/Dataset/MnistDatasetTest.php | 33 ++++++ .../Dataset/Resources/mnist/images-idx-ubyte | Bin 0 -> 7856 bytes .../Resources/mnist/labels-11-idx-ubyte | Bin 0 -> 19 bytes .../Dataset/Resources/mnist/labels-idx-ubyte | Bin 0 -> 18 bytes 10 files changed, 164 insertions(+), 1 deletion(-) create mode 100644 docs/machine-learning/datasets/mnist-dataset.md create mode 100644 src/Dataset/MnistDataset.php create mode 100644 tests/Dataset/MnistDatasetTest.php create mode 100644 tests/Dataset/Resources/mnist/images-idx-ubyte create mode 100644 tests/Dataset/Resources/mnist/labels-11-idx-ubyte create mode 100644 tests/Dataset/Resources/mnist/labels-idx-ubyte diff --git a/README.md b/README.md index d93996f7..f518fd0d 100644 --- a/README.md +++ b/README.md @@ -112,6 +112,7 @@ Public datasets are available in a separate repository [php-ai/php-ml-datasets]( * [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset/) * [Files](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/files-dataset/) * [SVM](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/svm-dataset/) + * [MNIST](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/mnist-dataset/) * Ready to use: * [Iris](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/iris/) * [Wine](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/wine/) diff --git a/docs/index.md b/docs/index.md index 12cbbd5f..3c6ede22 100644 --- a/docs/index.md +++ b/docs/index.md @@ -93,6 +93,7 @@ Example scripts are available in a separate repository 
[php-ai/php-ml-examples]( * [CSV](machine-learning/datasets/csv-dataset.md) * [Files](machine-learning/datasets/files-dataset.md) * [SVM](machine-learning/datasets/svm-dataset.md) + * [MNIST](machine-learning/datasets/mnist-dataset.md) * Ready to use: * [Iris](machine-learning/datasets/demo/iris.md) * [Wine](machine-learning/datasets/demo/wine.md) diff --git a/docs/machine-learning/datasets/mnist-dataset.md b/docs/machine-learning/datasets/mnist-dataset.md new file mode 100644 index 00000000..1ed50816 --- /dev/null +++ b/docs/machine-learning/datasets/mnist-dataset.md @@ -0,0 +1,26 @@ +# MnistDataset + +Helper class that loads data from the MNIST dataset: [http://yann.lecun.com/exdb/mnist/](http://yann.lecun.com/exdb/mnist/) + +> The MNIST database of handwritten digits, available from this page, has a training set of 60,000 examples, and a test set of 10,000 examples. It is a subset of a larger set available from NIST. The digits have been size-normalized and centered in a fixed-size image. + It is a good database for people who want to try learning techniques and pattern recognition methods on real-world data while spending minimal efforts on preprocessing and formatting. 
+ +### Constructor Parameters + +* $imagePath - (string) path to image file +* $labelPath - (string) path to label file + +``` +use Phpml\Dataset\MnistDataset; + +$trainDataset = new MnistDataset('train-images-idx3-ubyte', 'train-labels-idx1-ubyte'); +``` + +### Samples and labels + +To get samples or labels you can use getters: + +``` +$trainDataset->getSamples(); +$trainDataset->getTargets(); +``` diff --git a/mkdocs.yml b/mkdocs.yml index 490e5dc0..451d6e90 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -39,6 +39,7 @@ pages: - CSV Dataset: machine-learning/datasets/csv-dataset.md - Files Dataset: machine-learning/datasets/files-dataset.md - SVM Dataset: machine-learning/datasets/svm-dataset.md + - MNIST Dataset: machine-learning/datasets/mnist-dataset.md - Ready to use datasets: - Iris: machine-learning/datasets/demo/iris.md - Wine: machine-learning/datasets/demo/wine.md diff --git a/phpstan.neon b/phpstan.neon index 7a676fa0..0ee43c49 100644 --- a/phpstan.neon +++ b/phpstan.neon @@ -6,7 +6,7 @@ includes: parameters: ignoreErrors: - '#Property Phpml\\Clustering\\KMeans\\Cluster\:\:\$points \(iterable\\&SplObjectStorage\) does not accept SplObjectStorage#' - - '#Phpml\\Dataset\\FilesDataset::__construct\(\) does not call parent constructor from Phpml\\Dataset\\ArrayDataset#' + - '#Phpml\\Dataset\\(.*)Dataset::__construct\(\) does not call parent constructor from Phpml\\Dataset\\ArrayDataset#' # wide range cases - '#Parameter \#1 \$coordinates of class Phpml\\Clustering\\KMeans\\Point constructor expects array, array\|Phpml\\Clustering\\KMeans\\Point given#' diff --git a/src/Dataset/MnistDataset.php b/src/Dataset/MnistDataset.php new file mode 100644 index 00000000..59a3a26d --- /dev/null +++ b/src/Dataset/MnistDataset.php @@ -0,0 +1,101 @@ +samples = $this->readImages($imagePath); + $this->targets = $this->readLabels($labelPath); + + if (count($this->samples) !== count($this->targets)) { + throw new InvalidArgumentException('Must have the same number of images and labels'); 
+        }
+    }
+
+    /**
+     * Reads an image file into a list of samples.
+     *
+     * The first 16 bytes are decoded as four big-endian unsigned 32-bit integers
+     * (magic, size, rows, cols). "size" images of rows * cols unsigned bytes are
+     * then read, and every pixel byte is divided by 255 so values range from 0 to 1.
+     *
+     * @return array[] one flat list of scaled pixel values per image
+     *
+     * @throws InvalidArgumentException if the file cannot be opened, its header is
+     *                                  incomplete or invalid, or image data is truncated
+     */
+    private function readImages(string $imagePath): array
+    {
+        $stream = fopen($imagePath, 'rb');
+
+        if ($stream === false) {
+            throw new InvalidArgumentException('Could not open file: '.$imagePath);
+        }
+
+        $images = [];
+
+        try {
+            // stream_get_contents() keeps reading until the requested length or EOF,
+            // whereas a single fread() call may return fewer bytes on some streams.
+            $header = (string) stream_get_contents($stream, 16);
+
+            $fields = strlen($header) === 16
+                ? unpack('Nmagic/Nsize/Nrows/Ncols', $header)
+                : false;
+
+            if ($fields === false) {
+                throw new InvalidArgumentException('Could not read header: '.$imagePath);
+            }
+
+            if ($fields['magic'] !== self::MAGIC_IMAGE) {
+                throw new InvalidArgumentException('Invalid magic number: '.$imagePath);
+            }
+
+            if ($fields['rows'] != self::IMAGE_ROWS) {
+                throw new InvalidArgumentException('Invalid number of image rows: '.$imagePath);
+            }
+
+            if ($fields['cols'] != self::IMAGE_COLS) {
+                throw new InvalidArgumentException('Invalid number of image cols: '.$imagePath);
+            }
+
+            // Loop-invariant: every image occupies the same number of bytes.
+            $imageSize = $fields['rows'] * $fields['cols'];
+
+            for ($i = 0; $i < $fields['size']; $i++) {
+                $imageBytes = (string) stream_get_contents($stream, $imageSize);
+
+                // Fail fast on truncated data instead of emitting short or empty samples
+                // (and instead of looping "size" times over an exhausted stream).
+                if (strlen($imageBytes) !== $imageSize) {
+                    throw new InvalidArgumentException(
+                        sprintf('Unexpected end of file while reading image %d: %s', $i, $imagePath)
+                    );
+                }
+
+                // Scale each unsigned byte to the 0..1 range
+                $images[] = array_map(static function ($b) {
+                    return $b / 255;
+                }, array_values(unpack('C*', $imageBytes)));
+            }
+        } finally {
+            fclose($stream);
+        }
+
+        return $images;
+    }
+
+    /**
+     * Reads a label file into a list of integer labels.
+     *
+     * The first 8 bytes are decoded as two big-endian unsigned 32-bit integers
+     * (magic, size); "size" unsigned bytes follow, one per label.
+     *
+     * @return int[] labels in file order
+     *
+     * @throws InvalidArgumentException if the file cannot be opened, its header is
+     *                                  incomplete or invalid, or label data is truncated
+     */
+    private function readLabels(string $labelPath): array
+    {
+        $stream = fopen($labelPath, 'rb');
+
+        if ($stream === false) {
+            throw new InvalidArgumentException('Could not open file: '.$labelPath);
+        }
+
+        try {
+            $header = (string) stream_get_contents($stream, 8);
+
+            $fields = strlen($header) === 8
+                ? unpack('Nmagic/Nsize', $header)
+                : false;
+
+            if ($fields === false) {
+                throw new InvalidArgumentException('Could not read header: '.$labelPath);
+            }
+
+            if ($fields['magic'] !== self::MAGIC_LABEL) {
+                throw new InvalidArgumentException('Invalid magic number: '.$labelPath);
+            }
+
+            $size = $fields['size'];
+
+            // An empty label set is valid; a zero-length read is skipped because
+            // fread() rejects a length of 0 on current PHP versions.
+            if ($size === 0) {
+                return [];
+            }
+
+            $labels = (string) stream_get_contents($stream, $size);
+        } finally {
+            fclose($stream);
+        }
+
+        if (strlen($labels) !== $size) {
+            throw new InvalidArgumentException('Unexpected end of file while reading labels: '.$labelPath);
+        }
+
+        return array_values(unpack('C*', $labels));
+    }
+}
diff --git a/tests/Dataset/MnistDatasetTest.php b/tests/Dataset/MnistDatasetTest.php
new file mode 100644
index 00000000..5fc73744
--- /dev/null
+++ b/tests/Dataset/MnistDatasetTest.php
@@ -0,0 +1,33 @@
+getSamples());
+        self::assertCount(10, $dataset->getTargets());
+    }
+
+    public function 
testCheckSamplesAndTargetsCountMatch(): void + { + $this->expectException(InvalidArgumentException::class); + + new MnistDataset( + __DIR__.'/Resources/mnist/images-idx-ubyte', + __DIR__.'/Resources/mnist/labels-11-idx-ubyte' + ); + } +} diff --git a/tests/Dataset/Resources/mnist/images-idx-ubyte b/tests/Dataset/Resources/mnist/images-idx-ubyte new file mode 100644 index 0000000000000000000000000000000000000000..40b870a36999319757f59edae7335dcfe73a7dae GIT binary patch literal 7856 zcmeHMYfKeK6rM^|@GSyrV}(>Pz7XH3Pij$%v1qMeo2t>Y{_s(2d`847M!`~*+NvlT z@r|NYs5K;3@evg9S;QC>MWmn?P>Co4vU|?$?B1QdyR$oO`lm^)C%Jdd`Mz_`&YYc{ znZ1_~PY*&!Bl>BleQ5JKNW}3Sn+JEaAIW{BQkkvE!M;YX(0O6=_7r0g^{9J@xYo$2 zddm-+g@%R>@_m9cEu037hwI(~&acclUOY;q|CWG@MKs+UgWHhjVlE2tZfAq9QH{di zMfb+&_OIv*zCGl0CQi1yRw}5H-`+|p)Xju+fF%~LZyn?XG$9SHfpy-7OL=W+z>-=R zW{D&IYE7<#9jAIPtbuH0m9UYIYrqj=gDX%Bzoy)suH;_EP6QrdrxVIL?A;K%3HAB8n`=9YSt1 zCJ4C_{7}RVk;L3Oys8{S?4;2xjIzTOtbTU7`;ikly|#GcM7ZQH7+s(w!7O{u_q>q% z=9EOa)Y*1N4WN`nzwY(1q|IXB*F)w%U_ zBn3lQ4z*=mMgirHnJAJEFltl@fLa_gMdqrdbbW2$I9d%?tsiA}u*~kE2`OGhI|7rGqppARvyOsh(IW%bF0iJt1i*Zx)__cn79H+U@lYPyBK~L_mPivYx>O`N{edkSgzIZm!nr z3VTQK1Itp8Dqo1CvSM$RtY4XhcRBb{rc&yVWZ8>>dfO|43L(2>y;nNLl;yG<#B{!; z=S3dUZ5pTRys}!Z;vwBGeKJSadF5AgF?ss6w!?9jXwz*~jA*^Lu4;@!+$VTnYj_!P zl%Qt)==-#QB5{b^;_3E0hqXQ3mPs3k_|ya#j5&y_45mrlgB8k;9d!NwwP z6>&5rd$4(zb~5Ji7_%oJu4Fpj&cJdX9%FVIUNJ8ajG$+t8+gubnKh4O+-XJ();rtn zHx+}+YpVL4i3?yuYaVVYY|F=WsXBG9GLY%C^~nVH*1ZO%87MLm+>3iRJlxd4`6aJrAJw*A?D}9zjq!pV2SN z&oeED#barFe0+C+U8buUjQ8-Gd80CLJ7I*uGO3YR+1msdjAYzru*|BXN*FdBOU=Fp zD;Kttrh=1CE^M86MKHcxT9>WXQ-PX!$f=mt4Ghw&|=k5iRwY_yC}l1yzA| zkLmnN@wO$`s}rs_Jr^LWSGutmpoW%x5E{^&_}vO?3#ii;nfoItzb zl_q}3$mj5fbrf3mdMU6>zYwqVdL_>=Jphs1cjpW4oQR~f+*z?sb1bo zz$$OG(uRpqkzKn+^2yg|gI1bS?ybtBU;cfG@$ynj_Rui;ORXwt7LJg`@M=q+y#RJx zZyI%y-iVDhabEkClyzxbsa>9}k2#x+ye?#+*p-B;<7G?CL`Ip^FgDWq@Td>J889Hj?Hw z9Ad&=U`c-^k4M0QqUY$`ZP?I8Kf}V=-yc1inkPAERH<=s^-8M(C!(Zu6XiqPKLahk z%S!(#DM#TFTbO#ujNSBK$FJ<)xfazSe!=|A?KJvN*Nui|Otbn|B0e8vOOcg`RUg<> 
vO|u&?lM=*ST2DI}L&q+9U%rkQZx^Fr#m+LWRBq3?&ad6Z-)nyYgxQHU literal 0 HcmV?d00001 diff --git a/tests/Dataset/Resources/mnist/labels-11-idx-ubyte b/tests/Dataset/Resources/mnist/labels-11-idx-ubyte new file mode 100644 index 0000000000000000000000000000000000000000..db9362d1c9efd60e2a836a33f8834979d23196e1 GIT binary patch literal 19 acmZQz;9z86P#0ilVq{=pWZ`7xWB>pJO8{#C literal 0 HcmV?d00001 diff --git a/tests/Dataset/Resources/mnist/labels-idx-ubyte b/tests/Dataset/Resources/mnist/labels-idx-ubyte new file mode 100644 index 0000000000000000000000000000000000000000..eca5265988e2688261819cf634b7be2e5089d7bd GIT binary patch literal 18 ZcmZQz;9z86P#0ilVq{=pWZ`7x1ONlz0BZmM literal 0 HcmV?d00001