forked from moodle/moodle
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
MDL-58859 mlbackend_php: Added to core
Part of MDL-57791 epic.
- Loading branch information
David Monllao
committed
Jul 24, 2017
1 parent
229ae61
commit 9e84757
Showing
129 changed files
with
11,485 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,340 @@ | ||
<?php | ||
// This file is part of Moodle - http://moodle.org/ | ||
// | ||
// Moodle is free software: you can redistribute it and/or modify | ||
// it under the terms of the GNU General Public License as published by | ||
// the Free Software Foundation, either version 3 of the License, or | ||
// (at your option) any later version. | ||
// | ||
// Moodle is distributed in the hope that it will be useful, | ||
// but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
// GNU General Public License for more details. | ||
// | ||
// You should have received a copy of the GNU General Public License | ||
// along with Moodle. If not, see <http://www.gnu.org/licenses/>. | ||
|
||
/** | ||
* Php predictions processor | ||
* | ||
* @package mlbackend_php | ||
* @copyright 2016 David Monllao {@link http://www.davidmonllao.com} | ||
* @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later | ||
*/ | ||
|
||
namespace mlbackend_php; | ||
|
||
// TODO No support for 3rd party plugins psr4?? | ||
spl_autoload_register(function($classname) {
    // Translate the namespaced class name into a file inside the bundled Phpml source tree.
    $relativefile = str_replace('\\', '/', $classname) . '.php';
    $candidate = __DIR__ . '/../phpml/src/' . $relativefile;

    // Not one of the bundled classes, leave it to any other registered autoloader.
    if (!file_exists($candidate)) {
        return;
    }

    require_once($candidate);
});
|
||
use Phpml\NeuralNetwork\Network\MultilayerPerceptron; | ||
use Phpml\NeuralNetwork\Training\Backpropagation; | ||
use Phpml\CrossValidation\RandomSplit; | ||
use Phpml\Dataset\ArrayDataset; | ||
|
||
defined('MOODLE_INTERNAL') || die(); | ||
|
||
/** | ||
* PHP predictions processor. | ||
* | ||
* @package mlbackend_php | ||
* @copyright 2016 David Monllao {@link http://www.davidmonllao.com} | ||
* @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later | ||
*/ | ||
class processor implements \core_analytics\predictor {

    /**
     * Number of samples accumulated before they are handed over to the classifier.
     */
    const BATCH_SIZE = 1000;

    /**
     * Value passed as the iterations argument when a new Perceptron classifier is created.
     */
    const TRAIN_ITERATIONS = 20;

    /**
     * Name of the file, inside the model output directory, that stores the trained classifier.
     */
    const MODEL_FILENAME = 'model.ser';

    /**
     * @var bool Whether the last evaluation stopped reading samples because it reached the size limit.
     */
    protected $limitedsize = false;

    /**
     * Reports whether this backend can be used.
     *
     * @return bool Always true.
     */
    public function is_ready() {
        return true;
    }

    /**
     * Trains the model classifier with the provided dataset.
     *
     * The classifier is restored from the model file in $outputdir when that file exists, otherwise
     * a new Perceptron is created. Samples are fed in batches of self::BATCH_SIZE and the resulting
     * classifier is written back to the model file.
     *
     * @param string $uniqueid Not used by this method.
     * @param \stored_file $dataset CSV file: two metadata lines, one headers line and one line per sample.
     * @param string $outputdir Directory that contains the model file.
     * @return \stdClass Object with 'status' and 'info' attributes.
     */
    public function train($uniqueid, \stored_file $dataset, $outputdir) {

        // Output directory is already unique to the model.
        $modelfilepath = $outputdir . DIRECTORY_SEPARATOR . self::MODEL_FILENAME;

        $modelmanager = new \Phpml\ModelManager();

        if (file_exists($modelfilepath)) {
            $classifier = $modelmanager->restoreFromFile($modelfilepath);
        } else {
            $classifier = new \Phpml\Classification\Linear\Perceptron(0.001, self::TRAIN_ITERATIONS, false);
        }

        $fh = $dataset->get_content_file_handle();

        // The first line contains the var names and the second one their values.
        $metadata = $this->extract_metadata($fh);
        $nfeatures = intval($metadata['nfeatures']);

        // Skip headers.
        fgets($fh);

        $samples = array();
        $targets = array();
        while (($data = fgetcsv($fh)) !== false) {
            if ($data === array(null)) {
                // fgetcsv() returns a single null field for blank lines, they are not samples.
                continue;
            }

            $sampledata = array_map('floatval', $data);
            $samples[] = array_slice($sampledata, 0, $nfeatures);
            $targets[] = intval($data[$nfeatures]);

            if (count($samples) === self::BATCH_SIZE) {
                // Training in batches to avoid running out of memory.
                $classifier->partialTrain($samples, $targets, array(0, 1));
                $samples = array();
                $targets = array();
            }
        }
        fclose($fh);

        // Train the remaining samples.
        if ($samples) {
            $classifier->partialTrain($samples, $targets, array(0, 1));
        }

        $resultobj = new \stdClass();
        $resultobj->status = \core_analytics\model::OK;
        $resultobj->info = array();

        // Store the trained model.
        $modelmanager->saveToFile($classifier, $modelfilepath);

        return $resultobj;
    }

    /**
     * Returns predictions for the samples contained in the provided dataset.
     *
     * @throws \moodle_exception If there is no model file in $outputdir.
     * @param string $uniqueid Not used by this method.
     * @param \stored_file $dataset CSV file: two metadata lines, one headers line and one line per sample,
     *                              the first column of each sample being the sample id.
     * @param string $outputdir Directory that contains the model file.
     * @return \stdClass Object with 'status', 'info' and 'predictions' attributes; each prediction is
     *                   an array containing the sample id and the predicted value.
     */
    public function predict($uniqueid, \stored_file $dataset, $outputdir) {

        // Output directory is already unique to the model.
        $modelfilepath = $outputdir . DIRECTORY_SEPARATOR . self::MODEL_FILENAME;

        if (!file_exists($modelfilepath)) {
            throw new \moodle_exception('errorcantloadmodel', 'analytics', '', $modelfilepath);
        }

        $modelmanager = new \Phpml\ModelManager();
        $classifier = $modelmanager->restoreFromFile($modelfilepath);

        $fh = $dataset->get_content_file_handle();

        // The first line contains the var names and the second one their values.
        $metadata = $this->extract_metadata($fh);
        $nfeatures = intval($metadata['nfeatures']);

        // Skip headers.
        fgets($fh);

        $sampleids = array();
        $samples = array();
        $predictions = array();
        while (($data = fgetcsv($fh)) !== false) {
            if ($data === array(null)) {
                // fgetcsv() returns a single null field for blank lines, they are not samples.
                continue;
            }

            $sampledata = array_map('floatval', $data);
            $sampleids[] = $data[0];
            $samples[] = array_slice($sampledata, 1, $nfeatures);

            if (count($samples) === self::BATCH_SIZE) {
                // Predicting in batches to avoid running out of memory.

                // Append predictions incrementally, we want $sampleids keys in sync with $predictions keys.
                foreach ($classifier->predict($samples) as $prediction) {
                    $predictions[] = $prediction;
                }
                $samples = array();
            }
        }
        fclose($fh);

        // Finish the remaining predictions. They have to be appended as well: an array union (+) would
        // discard every prediction whose numeric key is already used by a previous batch.
        if ($samples) {
            foreach ($classifier->predict($samples) as $prediction) {
                $predictions[] = $prediction;
            }
        }

        $resultobj = new \stdClass();
        $resultobj->status = \core_analytics\model::OK;
        $resultobj->info = array();

        // Always defined, even if the dataset did not contain any sample.
        $resultobj->predictions = array();
        foreach ($predictions as $index => $prediction) {
            $resultobj->predictions[$index] = array($sampleids[$index], $prediction);
        }

        return $resultobj;
    }

    /**
     * Evaluates the provided dataset.
     *
     * During evaluation we need to shuffle the evaluation dataset samples to detect deviated results,
     * if the dataset is massive we can not load everything into memory. We know that 2GB is the
     * minimum memory limit we should have (\core_analytics\model::increase_memory), if we subtract the memory
     * that we already consumed and the memory that Phpml algorithms will need we should still have at
     * least 500MB of memory, which should be enough to evaluate a model. In any case this is not a robust
     * solution that will work for all sites but it should minimize memory limit problems. Site admins
     * can still set $CFG->mlbackend_php_no_evaluation_limits to true to skip this 500MB limit.
     *
     * @param string $uniqueid
     * @param float $maxdeviation
     * @param int $niterations
     * @param \stored_file $dataset
     * @param string $outputdir
     * @return \stdClass
     */
    public function evaluate($uniqueid, $maxdeviation, $niterations, \stored_file $dataset, $outputdir) {
        // Without this declaration $CFG is an undefined local variable and the limits could never be disabled.
        global $CFG;

        // Do not report the size limit of a previous evaluation run on this same instance.
        $this->limitedsize = false;

        $fh = $dataset->get_content_file_handle();

        // The first line contains the var names and the second one their values.
        $metadata = $this->extract_metadata($fh);
        $nfeatures = intval($metadata['nfeatures']);

        // Skip headers.
        fgets($fh);

        // We allow admins to disable evaluation memory usage limits by modifying config.php.
        $limitsize = empty($CFG->mlbackend_php_no_evaluation_limits);
        if ($limitsize) {
            $samplessize = 0;
            $limit = get_real_size('500MB');

            // Just an approximation, will depend on PHP version, compile options...
            // Double size + zval struct (6 bytes + 8 bytes + 16 bytes) + array bucket (96 bytes)
            // https://nikic.github.io/2011/12/12/How-big-are-PHP-arrays-really-Hint-BIG.html
            $floatsize = (PHP_INT_SIZE * 2) + 6 + 8 + 16 + 96;
        }

        $samples = array();
        $targets = array();
        while (($data = fgetcsv($fh)) !== false) {
            if ($data === array(null)) {
                // fgetcsv() returns a single null field for blank lines, they are not samples.
                continue;
            }

            $sampledata = array_map('floatval', $data);

            $samples[] = array_slice($sampledata, 0, $nfeatures);
            $targets[] = array(intval($data[$nfeatures]));

            if ($limitsize) {
                // We will have plenty of missing values in the dataset so it should be a conservative approximation.
                $samplessize = $samplessize + (count($sampledata) * $floatsize);

                // Stop fetching more samples.
                if ($samplessize >= $limit) {
                    $this->limitedsize = true;
                    break;
                }
            }
        }
        fclose($fh);

        $phis = array();

        // Evaluate the model multiple times to confirm the results are not significantly random due to a short amount of data.
        for ($i = 0; $i < $niterations; $i++) {

            $network = new MultilayerPerceptron([$nfeatures, 2, 1]);
            $training = new Backpropagation($network);

            // Split up the dataset in training and testing.
            $data = new RandomSplit(new ArrayDataset($samples, $targets), 0.2);

            $training->train($data->getTrainSamples(), $data->getTrainLabels(), 0, 1);

            $predictedlabels = array();
            foreach ($data->getTestSamples() as $input) {
                $output = $network->setInput($input)->getOutput();
                $predictedlabels[] = reset($output);
            }
            $phis[] = $this->get_phi($data->getTestLabels(), $predictedlabels);
        }

        // Let's fill the results changing the returned status code depending on the phi-related calculated metrics.
        return $this->get_evaluation_result_object($dataset, $phis, $maxdeviation);
    }

    /**
     * Builds the evaluation result from the phi coefficients of all the evaluation iterations.
     *
     * The score is the average phi converted to a value between 0 and 1. The status gets
     * EVALUATE_NOT_ENOUGH_DATA added when the standard deviation of the phis is above $maxdeviation
     * and EVALUATE_LOW_SCORE when the score is below \core_analytics\model::MIN_SCORE.
     *
     * @param \stored_file $dataset The evaluated dataset, only used to report its size.
     * @param float[] $phis One phi coefficient per evaluation iteration.
     * @param float $maxdeviation Maximum accepted standard deviation.
     * @return \stdClass Object with 'status', 'info' and 'score' attributes.
     */
    protected function get_evaluation_result_object(\stored_file $dataset, $phis, $maxdeviation) {

        if (count($phis) === 1) {
            // A single iteration, there is nothing to average nor any deviation.
            $avgphi = reset($phis);
            $modeldev = 0;
        } else {
            $avgphi = \Phpml\Math\Statistic\Mean::arithmetic($phis);

            // Standard deviation should ideally be calculated against the area under the curve.
            $modeldev = \Phpml\Math\Statistic\StandardDeviation::population($phis);
        }

        // Let's fill the results object.
        $resultobj = new \stdClass();

        // Zero is ok, now we add other bits if something is not right.
        $resultobj->status = \core_analytics\model::OK;
        $resultobj->info = array();

        // Convert phi to a standard score (from -1 to 1 to a value between 0 and 1).
        $resultobj->score = ($avgphi + 1) / 2;

        // If each iteration results varied too much we need more data to confirm that this is a valid model.
        if ($modeldev > $maxdeviation) {
            $resultobj->status = $resultobj->status + \core_analytics\model::EVALUATE_NOT_ENOUGH_DATA;
            $a = new \stdClass();
            $a->deviation = $modeldev;
            $a->accepteddeviation = $maxdeviation;
            $resultobj->info[] = get_string('errornotenoughdata', 'mlbackend_php', $a);
        }

        if ($resultobj->score < \core_analytics\model::MIN_SCORE) {
            $resultobj->status = $resultobj->status + \core_analytics\model::EVALUATE_LOW_SCORE;
            $a = new \stdClass();
            $a->score = $resultobj->score;
            $a->minscore = \core_analytics\model::MIN_SCORE;
            $resultobj->info[] = get_string('errorlowscore', 'mlbackend_php', $a);
        }

        if ($this->limitedsize === true) {
            $resultobj->info[] = get_string('datasetsizelimited', 'mlbackend_php', display_size($dataset->get_filesize()));
        }

        return $resultobj;
    }

    /**
     * Calculates the phi coefficient of the predicted labels against the test labels.
     *
     * @param array $testlabels Test labels, each of them wrapped in an array whose first element is the label.
     * @param array $predictedlabels Network outputs, values above 0.5 are considered 1 and the rest 0.
     * @return float|int The phi coefficient, 0 if any of the confusion matrix sums used as divisor is zero.
     */
    protected function get_phi($testlabels, $predictedlabels) {

        // Unwrap the labels, the confusion matrix expects scalar values.
        foreach ($testlabels as $key => $element) {
            $value = reset($element);
            $testlabels[$key] = $value;
        }

        // Convert the network outputs to one of the two labels.
        foreach ($predictedlabels as $key => $element) {
            $predictedlabels[$key] = ($element > 0.5) ? 1 : 0;
        }

        // Binary here only as well.
        $matrix = \Phpml\Metric\ConfusionMatrix::compute($testlabels, $predictedlabels, array(0, 1));

        $tptn = $matrix[0][0] * $matrix[1][1];
        $fpfn = $matrix[1][0] * $matrix[0][1];
        $tpfp = $matrix[0][0] + $matrix[1][0];
        $tpfn = $matrix[0][0] + $matrix[0][1];
        $tnfp = $matrix[1][1] + $matrix[1][0];
        $tnfn = $matrix[1][1] + $matrix[0][1];
        if ($tpfp === 0 || $tpfn === 0 || $tnfp === 0 || $tnfn === 0) {
            // The divisor would be zero.
            $phi = 0;
        } else {
            $phi = ( $tptn - $fpfn ) / sqrt( $tpfp * $tpfn * $tnfp * $tnfn);
        }

        return $phi;
    }

    /**
     * Reads the dataset metadata from the next two CSV lines of the provided file handle.
     *
     * @param resource $fh File handle, it is advanced two lines.
     * @return array Values of the second line indexed by the values of the first line.
     */
    protected function extract_metadata($fh) {
        $metadata = fgetcsv($fh);
        return array_combine($metadata, fgetcsv($fh));
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
<?php | ||
|
||
// Language strings of the mlbackend_php plugin.
$string['pluginname'] = 'PHP predictor';
$string['errorcantloadmodel'] = 'Model file {$a} does not exist, ensure the model has been trained before using it to predict.';
$string['errornotenoughdata'] = 'The evaluation results varied too much, you could try to gather more data to ensure the model is valid. Evaluation results standard deviation = {$a->deviation}, maximum recommended standard deviation = {$a->accepteddeviation}';
$string['errorlowscore'] = 'The evaluated model prediction accuracy is not very high, some predictions may not be accurate. Model score = {$a->score}, minimum score = {$a->minscore}';
// The setting name must match the one the processor reads: $CFG->mlbackend_php_no_evaluation_limits.
$string['datasetsizelimited'] = 'Only a part of the evaluation dataset has been evaluated due to its size. Set $CFG->mlbackend_php_no_evaluation_limits if you are confident that your system can cope with a {$a} dataset';
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
The MIT License (MIT) | ||
|
||
Copyright (c) 2016 Arkadiusz Kondas <arkadiusz.kondas[at]gmail> | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../vendor/phpunit/phpunit/phpunit |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
Current version is 12b8b11 | ||
|
||
# Download latest stable version from https://github.com/php-ai/php-ml | ||
# Remove all files but: | ||
* src/ | ||
* LICENSE |
Oops, something went wrong.