Linear Discriminant Analysis (LDA) (#82)
* Linear Discriminant Analysis (LDA)

* LDA test file

* Matrix inverse via LUDecomposition

* LUDecomposition inverse() and det() applied

* Readme update for LDA
MustafaKarabulut authored and akondas committed Apr 25, 2017
1 parent 12b8b11 commit 5b373fa
Showing 9 changed files with 735 additions and 131 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -88,8 +88,9 @@ Example scripts are available in a separate repository [php-ai/php-ml-examples](
* [Token Count Vectorizer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/token-count-vectorizer/)
* [Tf-idf Transformer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/tf-idf-transformer/)
* Dimensionality Reduction
* PCA
* PCA (Principal Component Analysis)
* Kernel PCA
* LDA (Linear Discriminant Analysis)
* Datasets
* [Array](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/array-dataset/)
* [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset/)
98 changes: 98 additions & 0 deletions src/Phpml/DimensionReduction/EigenTransformerBase.php
@@ -0,0 +1,98 @@
<?php declare(strict_types=1);

namespace Phpml\DimensionReduction;

use Phpml\Math\LinearAlgebra\EigenvalueDecomposition;
use Phpml\Math\Matrix;

/**
* Base class for computing the eigenpairs (eigenvalues & eigenvectors) of a given matrix,
* keeping either the top numFeatures eigenvectors or enough of them to preserve totalVariance
*
* @author hp
*/
abstract class EigenTransformerBase
{
/**
* Total variance to be conserved after the reduction
*
* @var float
*/
public $totalVariance = 0.9;

/**
* Number of features to be preserved after the reduction
*
* @var int|null
*/
public $numFeatures = null;

/**
* Top eigenvectors of the matrix
*
* @var array
*/
protected $eigVectors = [];

/**
* Top eigenValues of the matrix
*
* @var array
*/
protected $eigValues = [];

/**
* Calculates the eigenvalues and eigenvectors of the given matrix and keeps
* the top eigenvectors along with their (largest) eigenvalues. Unless $numFeatures
* is set, the total explained variance of the kept eigenvectors will be no less
* than the desired $totalVariance value
*
* @param array $matrix
*/
protected function eigenDecomposition(array $matrix)
{
$eig = new EigenvalueDecomposition($matrix);
$eigVals = $eig->getRealEigenvalues();
$eigVects = $eig->getEigenvectors();

$totalEigVal = array_sum($eigVals);
// Sort eigenvalues in descending order
arsort($eigVals);

$explainedVar = 0.0;
$vectors = [];
$values = [];
foreach ($eigVals as $i => $eigVal) {
$explainedVar += $eigVal / $totalEigVal;
$vectors[] = $eigVects[$i];
$values[] = $eigVal;

if ($this->numFeatures !== null) {
if (count($vectors) == $this->numFeatures) {
break;
}
} else {
if ($explainedVar >= $this->totalVariance) {
break;
}
}
}

$this->eigValues = $values;
$this->eigVectors = $vectors;
}
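
// A hypothetical illustration of the selection loop above (values made up):
// if the sorted eigenvalues are [4.0, 3.0, 2.0, 1.0] (total 10.0), then
//  - with $numFeatures = null and $totalVariance = 0.7 the first two eigenpairs are kept,
//    since 4/10 + 3/10 = 0.7 already reaches the threshold;
//  - with $numFeatures = 3 the top three eigenpairs are kept regardless of explained variance.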

/**
* Returns the reduced data
*
* @param array $data
*
* @return array
*/
protected function reduce(array $data)
{
$m1 = new Matrix($data);
$m2 = new Matrix($this->eigVectors);

return $m1->multiply($m2->transpose())->toArray();
}
}
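
A minimal sketch of the projection reduce() performs, using the same Phpml\Math\Matrix calls as above; the data and eigenvector values below are made up for illustration and are not part of the commit:

use Phpml\Math\Matrix;

$data = [
    [1.0, 2.0, 3.0],
    [2.0, 4.0, 6.0],
];
// two hypothetical eigenvectors kept by eigenDecomposition()
$eigVectors = [
    [0.577, 0.577, 0.577],
    [0.707, -0.707, 0.0],
];

$m1 = new Matrix($data);
$m2 = new Matrix($eigVectors);

// each original 3-feature row is mapped to a 2-element row,
// one coordinate per kept eigenvector
$reduced = $m1->multiply($m2->transpose())->toArray();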
2 changes: 1 addition & 1 deletion src/Phpml/DimensionReduction/KernelPCA.php
@@ -86,7 +86,7 @@ public function fit(array $data)
$matrix = $this->calculateKernelMatrix($this->data, $numRows);
$matrix = $this->centerMatrix($matrix, $numRows);

list($this->eigValues, $this->eigVectors) = $this->eigenDecomposition($matrix, $numRows);
$this->eigenDecomposition($matrix, $numRows);

$this->fit = true;

247 changes: 247 additions & 0 deletions src/Phpml/DimensionReduction/LDA.php
@@ -0,0 +1,247 @@
<?php

declare(strict_types=1);

namespace Phpml\DimensionReduction;

use Phpml\Math\Statistic\Mean;
use Phpml\Math\Matrix;

class LDA extends EigenTransformerBase
{
/**
* @var bool
*/
public $fit = false;

/**
* @var array
*/
public $labels;

/**
* @var array
*/
public $means;

/**
* @var array
*/
public $counts;

/**
* @var array
*/
public $overallMean;

/**
* Linear Discriminant Analysis (LDA) is used to reduce the dimensionality
* of the data. Unlike Principal Component Analysis (PCA), it is a supervised
* technique that requires the class labels in order to fit the data to a
* lower dimensional space. <br><br>
* The algorithm can be initialized by specifying either
* totalVariance (a value between 0.1 and 0.99) or
* numFeatures (the number of features to be preserved).
*
* @param float|null $totalVariance Total explained variance to be preserved
* @param int|null $numFeatures Number of features to be preserved
*
* @throws \Exception
*/
public function __construct($totalVariance = null, $numFeatures = null)
{
if ($totalVariance !== null && ($totalVariance < 0.1 || $totalVariance > 0.99)) {
throw new \Exception("Total variance can be a value between 0.1 and 0.99");
}
if ($numFeatures !== null && $numFeatures <= 0) {
throw new \Exception("Number of features to be preserved should be greater than 0");
}
if ($totalVariance !== null && $numFeatures !== null) {
throw new \Exception("Either totalVariance or numFeatures should be specified in order to run the algorithm");
}

if ($numFeatures !== null) {
$this->numFeatures = $numFeatures;
}
if ($totalVariance !== null) {
$this->totalVariance = $totalVariance;
}
}
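
// Hypothetical construction examples (not part of this file):
//   $lda = new LDA(0.95);      // keep enough discriminants to explain at least 95% of the variance
//   $lda = new LDA(null, 2);   // or keep exactly 2 features instead
//   $lda = new LDA(0.9, 2);    // passing both values throws an \Exception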

/**
* Trains the algorithm to transform the given data to a lower dimensional space.
*
* @param array $data
* @param array $classes
*
* @return array
*/
public function fit(array $data, array $classes) : array
{
$this->labels = $this->getLabels($classes);
$this->means = $this->calculateMeans($data, $classes);

$sW = $this->calculateClassVar($data, $classes);
$sB = $this->calculateClassCov();

$S = $sW->inverse()->multiply($sB);
$this->eigenDecomposition($S->toArray());

$this->fit = true;

return $this->reduce($data);
}
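
// For reference, fit() above follows the textbook LDA recipe (a sketch, not code from this commit):
//   S_W = sum over classes c of sum over samples x in c of (x - m_c)^T (x - m_c)   -> within-class scatter
//   S_B = sum over classes c of N_c * (m_c - m)^T (m_c - m)                        -> between-class scatter
//   S   = S_W^{-1} S_B, whose top eigenvectors give the projection directions,
// where m_c is the mean of class c, m is the overall mean and N_c is the sample count of class c.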

/**
* Returns unique labels in the dataset
*
* @param array $classes
*
* @return array
*/
protected function getLabels(array $classes): array
{
$counts = array_count_values($classes);

return array_keys($counts);
}


/**
* Calculates mean of each column for each class and returns
* n by m matrix where n is number of labels and m is number of columns
*
* @param array $data
* @param array $classes
*
* @return array
*/
protected function calculateMeans($data, $classes) : array
{
$means = [];
$counts = [];
$overallMean = array_fill(0, count($data[0]), 0.0);

foreach ($data as $index => $row) {
$label = array_search($classes[$index], $this->labels);

foreach ($row as $col => $val) {
if (! isset($means[$label][$col])) {
$means[$label][$col] = 0.0;
}
$means[$label][$col] += $val;
$overallMean[$col] += $val;
}

if (! isset($counts[$label])) {
$counts[$label] = 0;
}
$counts[$label]++;
}

foreach ($means as $index => $row) {
foreach ($row as $col => $sum) {
$means[$index][$col] = $sum / $counts[$index];
}
}

// Calculate overall mean of the dataset for each column
$numElements = array_sum($counts);
$map = function ($el) use ($numElements) {
return $el / $numElements;
};
$this->overallMean = array_map($map, $overallMean);
$this->counts = $counts;

return $means;
}
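
// A made-up illustration of what calculateMeans() computes:
//   $data    = [[1, 2], [3, 4], [5, 6]];
//   $classes = ['a', 'a', 'b'];
// gives labels ['a', 'b'], per-class means [[2.0, 3.0], [5.0, 6.0]],
// counts [2, 1] and an overall mean of [3.0, 4.0].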


/**
* Returns the within-class scatter matrix summed over all classes,
* which is an m by m matrix where m is the number of columns (features)
*
* @param array $data
* @param array $classes
*
* @return Matrix
*/
protected function calculateClassVar($data, $classes)
{
// s is an m by m matrix where m is the number of columns (features)
$s = array_fill(0, count($data[0]), array_fill(0, count($data[0]), 0));
$sW = new Matrix($s, false);

foreach ($data as $index => $row) {
$label = array_search($classes[$index], $this->labels);
$means = $this->means[$label];

$row = $this->calculateVar($row, $means);

$sW = $sW->add($row);
}

return $sW;
}

/**
* Returns the between-class scatter matrix summed over all classes,
* which is an m by m matrix where m is the number of columns (features)
*
* @return Matrix
*/
protected function calculateClassCov()
{
// s is an m by m matrix where m is the number of columns (features)
$s = array_fill(0, count($this->overallMean), array_fill(0, count($this->overallMean), 0));
$sB = new Matrix($s, false);

foreach ($this->means as $index => $classMeans) {
$row = $this->calculateVar($classMeans, $this->overallMean);
$N = $this->counts[$index];
$sB = $sB->add($row->multiplyByScalar($N));
}

return $sB;
}

/**
* Returns the result of the calculation (x - m)^T . (x - m)
*
* @param array $row
* @param array $means
*
* @return Matrix
*/
protected function calculateVar(array $row, array $means)
{
$x = new Matrix($row, false);
$m = new Matrix($means, false);
$diff = $x->subtract($m);

return $diff->transpose()->multiply($diff);
}

/**
* Transforms the given sample to a lower dimensional vector by using
* the eigenVectors obtained in the last run of <code>fit</code>.
*
* @param array $sample
*
* @return array
*/
public function transform(array $sample)
{
if (!$this->fit) {
throw new \Exception("LDA has not been fitted with respect to original dataset, please run LDA::fit() first");
}

if (! is_array($sample[0])) {
$sample = [$sample];
}

return $this->reduce($sample);
}
}
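
A hypothetical end-to-end usage sketch (sample values made up); it relies only on the fit() and transform() methods shown above:

use Phpml\DimensionReduction\LDA;

// toy 2-feature samples from two classes
$samples = [
    [1.0, 2.0], [1.5, 1.8], [1.2, 2.2],
    [5.0, 6.0], [5.4, 6.3], [5.1, 5.8],
];
$classes = ['a', 'a', 'a', 'b', 'b', 'b'];

// keep a single discriminant (for two classes the between-class scatter has rank 1)
$lda = new LDA(null, 1);
$reduced = $lda->fit($samples, $classes);   // six rows, one value each

// project a new, unlabeled sample with the fitted eigenvectors
$projected = $lda->transform([1.1, 2.1]);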