Merge pull request #9 from php-ai/develop

Add missing docs and create changelog
JonasHaouzi · Jul 11, 2016 · 9d900be · 9d900be
2 parents a4c8fba + 7c0767c
commit 9d900be
Show file tree

Hide file tree

Showing 8 changed files with 202 additions and 21 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,17 @@
+CHANGELOG
+=========
+
+This changelog references the relevant changes made in the PHP-ML library.
+
+* 0.2.0 (in plan)
+    * feature [Dataset] - FileDataset - load dataset from files (folders as targets)
+    * feature [Metric] - ClassificationReport - report about trained classifier
+
+* 0.1.1 (2016-07-12)
+    * feature [Cross Validation] Stratified Random Split - equal distribution for targets in split
+    * feature [General] Documentation - add missing pages (Pipeline, ConfusionMatrix and TfIdfTransformer) and fix links 
+
+* 0.1.0 (2016-07-08)
+    * first develop release
+    * base tools for Machine Learning: Algorithms, Cross Validation, Preprocessing, Feature Extraction
+    * bug [General] #7 - PHP-ML doesn't work on Mac
diff --git a/README.md b/README.md
@@ -48,6 +48,9 @@ composer require php-ai/php-ml
     * [DBSCAN](http://php-ml.readthedocs.io/en/latest/machine-learning/clustering/dbscan/)
 * Metric
     * [Accuracy](http://php-ml.readthedocs.io/en/latest/machine-learning/metric/accuracy/)
+    * [Confusion Matrix](http://php-ml.readthedocs.io/en/latest/machine-learning/metric/confusion-matrix/)
+* Workflow
+    * [Pipeline](http://php-ml.readthedocs.io/en/latest/machine-learning/workflow/pipeline/)
 * Cross Validation
     * [Random Split](http://php-ml.readthedocs.io/en/latest/machine-learning/cross-validation/random-split/)
     * [Stratified Random Split](http://php-ml.readthedocs.io/en/latest/machine-learning/cross-validation/stratified-random-split/)
@@ -56,6 +59,7 @@ composer require php-ai/php-ml
     * [Imputation missing values](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/imputation-missing-values/)
 * Feature Extraction
     * [Token Count Vectorizer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/token-count-vectorizer/)
+    * [Tf-idf Transformer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/tf-idf-transformer/)
 * Datasets
     * [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset/)
     * Ready to use:

diff --git a/docs/index.md b/docs/index.md
@@ -37,35 +37,39 @@ composer require php-ai/php-ml
 ## Features
 
 * Classification
-    * [SVC](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/svc/)
-    * [k-Nearest Neighbors](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/k-nearest-neighbors/)
-    * [Naive Bayes](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/naive-bayes/)
+    * [SVC](machine-learning/classification/svc/)
+    * [k-Nearest Neighbors](machine-learning/classification/k-nearest-neighbors/)
+    * [Naive Bayes](machine-learning/classification/naive-bayes/)
 * Regression
-    * [Least Squares](http://php-ml.readthedocs.io/en/latest/machine-learning/regression/least-squares/)
-    * [SVR](http://php-ml.readthedocs.io/en/latest/machine-learning/regression/svr/)
+    * [Least Squares](machine-learning/regression/least-squares/)
+    * [SVR](machine-learning/regression/svr/)
 * Clustering
-    * [k-Means](http://php-ml.readthedocs.io/en/latest/machine-learning/clustering/k-means/)
-    * [DBSCAN](http://php-ml.readthedocs.io/en/latest/machine-learning/clustering/dbscan/)
+    * [k-Means](machine-learning/clustering/k-means/)
+    * [DBSCAN](machine-learning/clustering/dbscan/)
 * Metric
-    * [Accuracy](http://php-ml.readthedocs.io/en/latest/machine-learning/metric/accuracy/)
+    * [Accuracy](machine-learning/metric/accuracy/)
+    * [Confusion Matrix](machine-learning/metric/confusion-matrix/)
+* Workflow
+    * [Pipeline](machine-learning/workflow/pipeline/)
 * Cross Validation
-    * [Random Split](http://php-ml.readthedocs.io/en/latest/machine-learning/cross-validation/random-split/)
-    * [Stratified Random Split](http://php-ml.readthedocs.io/en/latest/machine-learning/cross-validation/stratified-random-split/)
+    * [Random Split](machine-learning/cross-validation/random-split/)
+    * [Stratified Random Split](machine-learning/cross-validation/stratified-random-split/)
 * Preprocessing
-    * [Normalization](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/normalization/)
-    * [Imputation missing values](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/imputation-missing-values/)
+    * [Normalization](machine-learning/preprocessing/normalization/)
+    * [Imputation missing values](machine-learning/preprocessing/imputation-missing-values/)
 * Feature Extraction
-    * [Token Count Vectorizer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/token-count-vectorizer/)
+    * [Token Count Vectorizer](machine-learning/feature-extraction/token-count-vectorizer/)
+    * [Tf-idf Transformer](machine-learning/feature-extraction/tf-idf-transformer/)
 * Datasets
-    * [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset/)
+    * [CSV](machine-learning/datasets/csv-dataset/)
     * Ready to use:
-        * [Iris](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/iris/)
-        * [Wine](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/wine/)
-        * [Glass](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/glass/)
+        * [Iris](machine-learning/datasets/demo/iris/)
+        * [Wine](machine-learning/datasets/demo/wine/)
+        * [Glass](machine-learning/datasets/demo/glass/)
 * Math
-    * [Distance](http://php-ml.readthedocs.io/en/latest/math/distance/)
-    * [Matrix](http://php-ml.readthedocs.io/en/latest/math/matrix/)
-    * [Statistic](http://php-ml.readthedocs.io/en/latest/math/statistic/)
+    * [Distance](math/distance/)
+    * [Matrix](math/matrix/)
+    * [Statistic](math/statistic/)
 
 
 ## Contribute

diff --git a/docs/machine-learning/feature-extraction/tf-idf-transformer.md b/docs/machine-learning/feature-extraction/tf-idf-transformer.md
@@ -0,0 +1,42 @@
+# Tf-idf Transformer
+
+Tf–idf, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.
+
+### Constructor Parameters
+
+* $samples (array) - samples used to fit the tf-idf model
+
+```
+use Phpml\FeatureExtraction\TfIdfTransformer;
+
+$samples = [
+    [1, 2, 4],
+    [0, 2, 1]
+];
+
+$transformer = new TfIdfTransformer($samples);
+```
+
+### Transformation
+
+To transform a collection of text samples, use the `transform` method. Example:
+
+```
+use Phpml\FeatureExtraction\TfIdfTransformer;
+
+$samples = [
+    [0 => 1, 1 => 1, 2 => 2, 3 => 1, 4 => 0, 5 => 0],
+    [0 => 1, 1 => 1, 2 => 0, 3 => 0, 4 => 2, 5 => 3],
+];
+        
+$transformer = new TfIdfTransformer($samples);
+$transformer->transform($samples);
+
+/*
+$samples = [
+   [0 => 0, 1 => 0, 2 => 0.602, 3 => 0.301, 4 => 0, 5 => 0],
+   [0 => 0, 1 => 0, 2 => 0, 3 => 0, 4 => 0.602, 5 => 0.903],
+];
+*/
+        
+```
diff --git a/docs/machine-learning/metric/confusion-matrix.md b/docs/machine-learning/metric/confusion-matrix.md
@@ -0,0 +1,44 @@
+# Confusion Matrix
+
+Class for computing a confusion matrix to evaluate the accuracy of a classification.
+
+### Example (all targets)
+
+Compute ConfusionMatrix for all targets.
+
+```
+use Phpml\Metric\ConfusionMatrix;
+
+$actualTargets = [2, 0, 2, 2, 0, 1];
+$predictedTargets = [0, 0, 2, 2, 0, 2];
+
+$confusionMatrix = ConfusionMatrix::compute($actualTargets, $predictedTargets);
+
+/*
+$confusionMatrix = [
+    [2, 0, 0],
+    [0, 0, 1],
+    [1, 0, 2],
+];
+*/
+```
+
+### Example (chosen targets)
+
+Compute ConfusionMatrix for chosen targets.
+
+```
+use Phpml\Metric\ConfusionMatrix;
+
+$actualTargets = ['cat', 'ant', 'cat', 'cat', 'ant', 'bird'];
+$predictedTargets = ['ant', 'ant', 'cat', 'cat', 'ant', 'cat'];
+
+$confusionMatrix = ConfusionMatrix::compute($actualTargets, $predictedTargets, ['ant', 'bird']);
+
+/*
+$confusionMatrix = [
+    [2, 0],
+    [0, 0],
+];
+*/
+```
diff --git a/docs/machine-learning/workflow/pipeline.md b/docs/machine-learning/workflow/pipeline.md
@@ -0,0 +1,65 @@
+# Pipeline
+
+In machine learning, it is common to run a sequence of algorithms to process and learn from a dataset. For example:
+
+* Split each document’s text into tokens.
+* Convert each document’s words into a numerical feature vector ([Token Count Vectorizer](../../feature-extraction/token-count-vectorizer/)).
+* Learn a prediction model using the feature vectors and labels.
+
+PHP-ML represents such a workflow as a Pipeline, which consists of a sequence of transformers and an estimator.
+
+
+### Constructor Parameters
+
+* $transformers (array|Transformer[]) - sequence of objects that implement the Transformer interface
+* $estimator (Estimator) - estimator that can train and predict
+
+```
+use Phpml\Classification\SVC;
+use Phpml\FeatureExtraction\TfIdfTransformer;
+use Phpml\Pipeline;
+
+$transformers = [
+    new TfIdfTransformer(),
+];
+$estimator = new SVC();
+
+$pipeline = new Pipeline($transformers, $estimator);
+```
+
+### Example
+
+First, our pipeline replaces missing values, then normalizes the samples, and finally trains the SVC estimator. A pipeline prepared this way repeats each transformation step for every sample it predicts.
+
+```
+use Phpml\Classification\SVC;
+use Phpml\Pipeline;
+use Phpml\Preprocessing\Imputer;
+use Phpml\Preprocessing\Normalizer;
+use Phpml\Preprocessing\Imputer\Strategy\MostFrequentStrategy;
+
+$transformers = [
+    new Imputer(null, new MostFrequentStrategy()),
+    new Normalizer(),
+];
+$estimator = new SVC();
+
+$samples = [
+    [1, -1, 2],
+    [2, 0, null],
+    [null, 1, -1],
+];
+
+$targets = [
+    4,
+    1,
+    4,
+];
+
+$pipeline = new Pipeline($transformers, $estimator);
+$pipeline->train($samples, $targets);
+
+$predicted = $pipeline->predict([[0, 0, 0]]);
+
+// $predicted == 4
+```
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -14,13 +14,18 @@ pages:
       - DBSCAN: machine-learning/clustering/dbscan.md
     - Metric:
       - Accuracy: machine-learning/metric/accuracy.md
+      - Confusion Matrix: machine-learning/metric/confusion-matrix.md
+    - Workflow:
+      - Pipeline: machine-learning/workflow/pipeline.md
     - Cross Validation:
       - RandomSplit: machine-learning/cross-validation/random-split.md
+      - Stratified Random Split: machine-learning/cross-validation/stratified-random-split.md
     - Preprocessing:
       - Normalization: machine-learning/preprocessing/normalization.md
       - Imputation missing values: machine-learning/preprocessing/imputation-missing-values.md
     - Feature Extraction:
       - Token Count Vectorizer: machine-learning/feature-extraction/token-count-vectorizer.md
+      - Tf-idf Transformer: machine-learning/feature-extraction/tf-idf-transformer.md
     - Datasets:
       - Array Dataset: machine-learning/datasets/array-dataset.md
       - CSV Dataset: machine-learning/datasets/csv-dataset.md

diff --git a/tests/Phpml/FeatureExtraction/TfIdfTransformerTest.php b/tests/Phpml/FeatureExtraction/TfIdfTransformerTest.php
@@ -10,7 +10,7 @@ class TfIdfTransformerTest extends \PHPUnit_Framework_TestCase
 {
     public function testTfIdfTransformation()
     {
-        //https://en.wikipedia.org/wiki/Tf%E2%80%93idf
+        // https://en.wikipedia.org/wiki/Tf-idf
 
         $samples = [
             [0 => 1, 1 => 1, 2 => 2, 3 => 1, 4 => 0, 5 => 0],