diff --git a/admin/searchareas.php b/admin/searchareas.php index dcbc7cb0bfd2b..9ce7af6d032b1 100644 --- a/admin/searchareas.php +++ b/admin/searchareas.php @@ -125,6 +125,9 @@ $areasconfig[$areaid]->docsprocessed . ' , ' . $areasconfig[$areaid]->recordsprocessed . ' , ' . $areasconfig[$areaid]->docsignored; + if ($areasconfig[$areaid]->partial) { + $laststatus .= ' ' . get_string('searchpartial', 'admin'); + } } else { $laststatus = ''; } diff --git a/admin/settings/plugins.php b/admin/settings/plugins.php index e63f19071e35c..3bb3c4a3988d1 100644 --- a/admin/settings/plugins.php +++ b/admin/settings/plugins.php @@ -557,6 +557,13 @@ $temp->add(new admin_setting_heading('searchengineheading', new lang_string('searchengine', 'admin'), '')); $temp->add(new admin_setting_configselect('searchengine', new lang_string('selectsearchengine', 'admin'), '', 'solr', $engines)); + $temp->add(new admin_setting_heading('searchindexingheading', new lang_string('searchoptions', 'admin'), '')); + $temp->add(new admin_setting_configcheckbox('searchindexwhendisabled', + new lang_string('searchindexwhendisabled', 'admin'), new lang_string('searchindexwhendisabled_desc', 'admin'), + 0)); + $temp->add(new admin_setting_configduration('searchindextime', + new lang_string('searchindextime', 'admin'), new lang_string('searchindextime_desc', 'admin'), + 600)); $ADMIN->add('searchplugins', $temp); $ADMIN->add('searchplugins', new admin_externalpage('searchareas', new lang_string('searchareas', 'admin'), diff --git a/lang/en/admin.php b/lang/en/admin.php index dca2305d5fec7..399a4b9f89d6e 100644 --- a/lang/en/admin.php +++ b/lang/en/admin.php @@ -985,10 +985,16 @@ $string['searchengine'] = 'Search engine'; $string['searchindexactions'] = 'Index actions'; $string['searchindexdeleted'] = 'Index deleted'; +$string['searchindextime'] = 'Indexing time limit'; +$string['searchindextime_desc'] = 'When indexing large amounts of new content, the scheduled task will stop after this time limit is 
reached. It will continue the next time the task runs.'; $string['searchindexupdated'] = 'Search engine contents have been updated'; +$string['searchindexwhendisabled'] = 'Index when disabled'; +$string['searchindexwhendisabled_desc'] = 'Allows the scheduled task to build the search index even when search is disabled. This is useful if you want to build the index before the search facility appears to students.'; $string['searchinsettings'] = 'Search in settings'; $string['searchlastrun'] = 'Last run (time, # docs, # records, # ignores)'; $string['searchnotavailable'] = 'Search is not available'; +$string['searchoptions'] = 'Search options'; +$string['searchpartial'] = '(not yet fully indexed)'; $string['searchreindexed'] = 'All site contents have been reindexed.'; $string['searchreindexindex'] = 'Reindex all site contents'; $string['searchresults'] = 'Search results'; diff --git a/lib/classes/task/search_index_task.php b/lib/classes/task/search_index_task.php index bdeedf1363a37..870c4dc24f8dc 100644 --- a/lib/classes/task/search_index_task.php +++ b/lib/classes/task/search_index_task.php @@ -46,12 +46,13 @@ public function get_name() { * Throw exceptions on errors (the job will be retried). */ public function execute() { - if (!\core_search\manager::is_global_search_enabled()) { + if (!\core_search\manager::is_global_search_enabled() && + !get_config('core', 'searchindexwhendisabled')) { return; } $globalsearch = \core_search\manager::instance(); // Indexing database records for modules + rich documents of forum. 
- $globalsearch->index(); + $globalsearch->index(false, get_config('core', 'searchindextime'), new \text_progress_trace()); } } diff --git a/lib/classes/task/search_optimize_task.php b/lib/classes/task/search_optimize_task.php index 9f4b8feb11b2a..1a006c7dde51e 100644 --- a/lib/classes/task/search_optimize_task.php +++ b/lib/classes/task/search_optimize_task.php @@ -49,7 +49,8 @@ public function get_name() { * Throw exceptions on errors (the job will be retried). */ public function execute() { - if (!\core_search\manager::is_global_search_enabled()) { + if (!\core_search\manager::is_global_search_enabled() && + !get_config('core', 'searchindexwhendisabled')) { return; } diff --git a/search/classes/base.php b/search/classes/base.php index 9b7180601c335..c0b21f4ee8264 100644 --- a/search/classes/base.php +++ b/search/classes/base.php @@ -175,7 +175,8 @@ public function get_config() { list($componentname, $varname) = $this->get_config_var_name(); $config = []; - $settingnames = array('_enabled', '_indexingstart', '_indexingend', '_lastindexrun', '_docsignored', '_docsprocessed', '_recordsprocessed'); + $settingnames = array('_enabled', '_indexingstart', '_indexingend', '_lastindexrun', + '_docsignored', '_docsprocessed', '_recordsprocessed', '_partial'); foreach ($settingnames as $name) { $config[$varname . $name] = get_config($componentname, $varname . $name); } @@ -209,6 +210,22 @@ public function set_enabled($isenabled) { return set_config($varname . '_enabled', $isenabled, $componentname); } + /** + * Gets the length of time spent indexing this area (the last time it was indexed). + * + * @return int|bool Time in seconds spent indexing this area last time, false if never indexed + */ + public function get_last_indexing_duration() { + list($componentname, $varname) = $this->get_config_var_name(); + $start = get_config($componentname, $varname . '_indexingstart'); + $end = get_config($componentname, $varname . 
'_indexingend'); + if ($start && $end) { + return $end - $start; + } else { + return false; + } + } + /** * Returns true if this area uses file indexing. * diff --git a/search/classes/engine.php b/search/classes/engine.php index a1dd10ce9555e..6957139613bc4 100644 --- a/search/classes/engine.php +++ b/search/classes/engine.php @@ -213,8 +213,18 @@ public function add_documents($iterator, $searcharea, $options) { $numdocs = 0; $numdocsignored = 0; $lastindexeddoc = 0; + $firstindexeddoc = 0; + $partial = false; foreach ($iterator as $document) { + // Stop if we have exceeded the time limit (and there are still more items). Always + // do at least one second's worth of documents otherwise it will never make progress. + if ($lastindexeddoc !== $firstindexeddoc && + !empty($options['stopat']) && microtime(true) >= $options['stopat']) { + $partial = true; + break; + } + if (!$document instanceof \core_search\document) { continue; } @@ -236,10 +246,13 @@ public function add_documents($iterator, $searcharea, $options) { } $lastindexeddoc = $document->get('modified'); + if (!$firstindexeddoc) { + $firstindexeddoc = $lastindexeddoc; + } $numrecords++; } - return array($numrecords, $numdocs, $numdocsignored, $lastindexeddoc); + return array($numrecords, $numdocs, $numdocsignored, $lastindexeddoc, $partial); } /** diff --git a/search/classes/manager.php b/search/classes/manager.php index cef7900d1d948..833ae499d92cf 100644 --- a/search/classes/manager.php +++ b/search/classes/manager.php @@ -521,11 +521,19 @@ public function optimize_index() { * Index all documents. * * @param bool $fullindex Whether we should reindex everything or not. + * @param float $timelimit Time limit in seconds (0 = no time limit) + * @param \progress_trace $progress Optional class for tracking progress * @throws \moodle_exception * @return bool Whether there was any updated document or not. 
*/ - public function index($fullindex = false) { - global $CFG; + public function index($fullindex = false, $timelimit = 0, \progress_trace $progress = null) { + // Cannot combine time limit with reindex. + if ($timelimit && $fullindex) { + throw new \coding_exception('Cannot apply time limit when reindexing'); + } + if (!$progress) { + $progress = new \null_progress_trace(); + } // Unlimited time. \core_php_time_limit::raise(); @@ -536,11 +544,25 @@ public function index($fullindex = false) { $sumdocs = 0; $searchareas = $this->get_search_areas_list(true); + + if ($timelimit) { + // If time is limited (and therefore we're not just indexing everything anyway), select + // an order for search areas. The intention here is to avoid a situation where a new + // large search area is enabled, and this means all our other search areas go out of + // date while that one is being indexed. To do this, we order by the time we spent + // indexing them last time we ran, meaning anything that took a very long time will be + // done last. + uasort($searchareas, function(\core_search\base $area1, \core_search\base $area2) { + return (int)$area1->get_last_indexing_duration() - (int)$area2->get_last_indexing_duration(); + }); + + // Decide time to stop. + $stopat = microtime(true) + $timelimit; + } + foreach ($searchareas as $areaid => $searcharea) { - if (CLI_SCRIPT && !PHPUNIT_TEST) { - mtrace('Processing ' . $searcharea->get_visible_name() . ' area'); - } + $progress->output('Processing area: ' . $searcharea->get_visible_name()); // Notify the engine that an area is starting. $this->engine->area_index_starting($searcharea, $fullindex); @@ -556,7 +578,16 @@ public function index($fullindex = false) { if ($fullindex === true) { $referencestarttime = 0; } else { - $referencestarttime = $prevtimestart; + $partial = get_config($componentconfigname, $varname . 
'_partial'); + if ($partial) { + // When the previous index did not complete all data, we start from the time of the + // last document that was successfully indexed. (Note this will result in + // re-indexing that one document, but we can't avoid that because there may be + // other documents in the same second.) + $referencestarttime = intval(get_config($componentconfigname, $varname . '_lastindexrun')); + } else { + $referencestarttime = $prevtimestart; + } } // Getting the recordset from the area. @@ -565,27 +596,35 @@ public function index($fullindex = false) { // Pass get_document as callback. $fileindexing = $this->engine->file_indexing_enabled() && $searcharea->uses_file_indexing(); $options = array('indexfiles' => $fileindexing, 'lastindexedtime' => $prevtimestart); + if ($timelimit) { + $options['stopat'] = $stopat; + } $iterator = new \core\dml\recordset_walk($recordset, array($searcharea, 'get_document'), $options); - list($numrecords, - $numdocs, - $numdocsignored, - $lastindexeddoc) = $this->engine->add_documents($iterator, $searcharea, $options); - - if (CLI_SCRIPT && !PHPUNIT_TEST) { - if ($numdocs > 0) { - $elapsed = round((microtime(true) - $elapsed), 3); - mtrace('Processed ' . $numrecords . ' records containing ' . $numdocs . ' documents for ' . - $searcharea->get_visible_name() . ' area, in ' . $elapsed . ' seconds.'); - } else { - mtrace('No new documents to index for ' . $searcharea->get_visible_name() . ' area.'); - } + $result = $this->engine->add_documents($iterator, $searcharea, $options); + if (count($result) === 5) { + list($numrecords, $numdocs, $numdocsignored, $lastindexeddoc, $partial) = $result; + } else { + // Backward compatibility for engines that don't support partial adding. 
+ list($numrecords, $numdocs, $numdocsignored, $lastindexeddoc) = $result; + debugging('engine::add_documents() should return $partial (4-value return is deprecated)', + DEBUG_DEVELOPER); + $partial = false; + } + + if ($numdocs > 0) { + $elapsed = round((microtime(true) - $elapsed), 3); + $progress->output('Processed ' . $numrecords . ' records containing ' . $numdocs . + ' documents, in ' . $elapsed . ' seconds' . + ($partial ? ' (not complete)' : '') . '.', 1); + } else { + $progress->output('No new documents to index.', 1); } // Notify the engine this area is complete, and only mark times if true. if ($this->engine->area_index_complete($searcharea, $numdocs, $fullindex)) { $sumdocs += $numdocs; - // Store last index run once documents have been commited to the search engine. + // Store last index run once documents have been committed to the search engine. set_config($varname . '_indexingstart', $indexingstart, $componentconfigname); set_config($varname . '_indexingend', time(), $componentconfigname); set_config($varname . '_docsignored', $numdocsignored, $componentconfigname); @@ -594,6 +633,18 @@ public function index($fullindex = false) { if ($lastindexeddoc > 0) { set_config($varname . '_lastindexrun', $lastindexeddoc, $componentconfigname); } + if ($partial) { + set_config($varname . '_partial', 1, $componentconfigname); + } else { + unset_config($varname . 
'_partial', $componentconfigname); + } + } else { + $progress->output('Engine reported error.'); + } + + if ($timelimit && (microtime(true) >= $stopat)) { + $progress->output('Stopping indexing due to time limit.'); + break; } } @@ -673,7 +724,8 @@ public function delete_index_by_id($id) { */ public function get_areas_config($searchareas) { - $vars = array('indexingstart', 'indexingend', 'lastindexrun', 'docsignored', 'docsprocessed', 'recordsprocessed'); + $vars = array('indexingstart', 'indexingend', 'lastindexrun', 'docsignored', + 'docsprocessed', 'recordsprocessed', 'partial'); $configsettings = []; foreach ($searchareas as $searcharea) { diff --git a/search/cli/indexer.php b/search/cli/indexer.php index f42b8e979ba4c..8e204acb7bc4f 100644 --- a/search/cli/indexer.php +++ b/search/cli/indexer.php @@ -27,8 +27,9 @@ require(__DIR__.'/../../config.php'); require_once($CFG->libdir.'/clilib.php'); // cli only functions -list($options, $unrecognized) = cli_get_params(array('help' => false, 'force' => false, 'reindex' => false), - array('h' => 'help', 'f' => 'force', 'r' => 'reindex')); +list($options, $unrecognized) = cli_get_params(array('help' => false, 'force' => false, + 'reindex' => false, 'timelimit' => 0), + array('h' => 'help', 'f' => 'force', 'r' => 'reindex', 't' => 'timelimit')); if ($unrecognized) { $unrecognized = implode("\n ", $unrecognized); @@ -40,18 +41,24 @@ "Index search data Options: --h, --help Print out this help --r, --reindex Reindex data --f, --force Allow indexer to run, even if global search is disabled. +-h, --help Print out this help +-r, --reindex Reindex data +-f, --force Allow indexer to run, even if global search is disabled. 
+-t=, --timelimit= Stop after indexing for specified time (in seconds) -Example: +Examples: \$ sudo -u www-data /usr/bin/php search/cli/indexer.php --reindex +\$ sudo -u www-data /usr/bin/php search/cli/indexer.php --timelimit=300 "; echo $help; die; } +if ($options['timelimit'] && $options['reindex']) { + cli_error('Cannot apply time limit when reindexing'); +} + if (!\core_search\manager::is_global_search_enabled() && empty($options['force'])) { cli_error('Global search is disabled. Use --force if you want to force an index while disabled'); } @@ -70,13 +77,20 @@ $globalsearch = \core_search\manager::instance(); if (empty($options['reindex'])) { - echo "Running full index of site\n"; - echo "==========================\n"; - $globalsearch->index(); + if ($options['timelimit']) { + $limitinfo = ' (max ' . $options['timelimit'] . ' seconds)'; + $limitunderline = preg_replace('~.~', '=', $limitinfo); + echo "Running index of site$limitinfo\n"; + echo "=====================$limitunderline\n"; + } else { + echo "Running full index of site\n"; + echo "==========================\n"; + } + $globalsearch->index(false, $options['timelimit'], new text_progress_trace()); } else { echo "Running full reindex of site\n"; echo "============================\n"; - $globalsearch->index(true); + $globalsearch->index(true, 0, new text_progress_trace()); } // Optimize index at last. 
diff --git a/search/tests/fixtures/mock_search_area.php b/search/tests/fixtures/mock_search_area.php index bdc3867058667..1ed17d3402e70 100644 --- a/search/tests/fixtures/mock_search_area.php +++ b/search/tests/fixtures/mock_search_area.php @@ -122,4 +122,8 @@ public function get_doc_url(\core_search\document $doc) { public function get_context_url(\core_search\document $doc) { return new \moodle_url('/index.php'); } + + public function get_visible_name($lazyload = false) { + return 'Mock search area'; + } } diff --git a/search/tests/fixtures/mock_search_engine.php b/search/tests/fixtures/mock_search_engine.php index cdf99c4bca0b8..ad63e84bd7a30 100644 --- a/search/tests/fixtures/mock_search_engine.php +++ b/search/tests/fixtures/mock_search_engine.php @@ -29,6 +29,12 @@ class engine extends \core_search\engine { + /** @var int If set, waits when adding each document (microseconds) */ + protected $adddelay = 0; + + /** @var \core_search\document[] Documents added */ + protected $added = []; + public function is_installed() { return true; } @@ -38,7 +44,11 @@ public function is_server_ready() { } public function add_document($document, $fileindexing = false) { - // No need to implement. + if ($this->adddelay) { + usleep($this->adddelay); + } + $this->added[] = $document; + return true; } public function execute_query($data, $usercontexts, $limit = 0) { @@ -64,4 +74,25 @@ public function get_search_area($areaid) { public function get_query_total_count() { return 0; } + + /** + * Sets an add delay to simulate time taken indexing. + * + * @param float $seconds Delay in seconds for each document + */ + public function set_add_delay($seconds) { + $this->adddelay = (int)($seconds * 1000000); + } + + /** + * Gets the list of indexed (added) documents since last time this function + * was called. + * + * @return \core_search\document[] List of documents, in order added. 
+ */ + public function get_and_clear_added_documents() { + $added = $this->added; + $this->added = []; + return $added; + } } diff --git a/search/tests/manager_test.php b/search/tests/manager_test.php index a0b60b69bd539..ecc845e7d7cc5 100644 --- a/search/tests/manager_test.php +++ b/search/tests/manager_test.php @@ -116,6 +116,7 @@ public function test_search_config() { $configs = $search->get_areas_config(array($this->forumpostareaid => $searcharea)); $this->assertEquals($start, $configs[$this->forumpostareaid]->indexingstart); $this->assertEquals($end, $configs[$this->forumpostareaid]->indexingend); + $this->assertEquals(false, $configs[$this->forumpostareaid]->partial); try { $fakeareaid = \core_search\manager::generate_areaid('mod_unexisting', 'chihuaquita'); @@ -132,6 +133,7 @@ public function test_search_config() { $this->assertEquals(0, $config[$varname . '_indexingstart']); $this->assertEquals(0, $config[$varname . '_indexingend']); $this->assertEquals(0, $config[$varname . '_lastindexrun']); + $this->assertEquals(0, $config[$varname . '_partial']); // No caching. $configs = $search->get_areas_config(array($this->forumpostareaid => $searcharea)); $this->assertEquals(0, $configs[$this->forumpostareaid]->indexingstart); @@ -151,6 +153,114 @@ public function test_search_config() { $this->assertEquals(0, $configs[$this->forumpostareaid]->indexingend); } + /** + * Tests the get_last_indexing_duration method in the base area class. + */ + public function test_get_last_indexing_duration() { + $this->resetAfterTest(); + + $search = testable_core_search::instance(); + + $searcharea = $search->get_search_area($this->forumpostareaid); + + // When never indexed, the duration is false. + $this->assertSame(false, $searcharea->get_last_indexing_duration()); + + // Set the start/end times. + list($componentname, $varname) = $searcharea->get_config_var_name(); + $start = time() - 100; + $end = time(); + set_config($varname . 
'_indexingstart', $start, $componentname); + set_config($varname . '_indexingend', $end, $componentname); + + // The duration should now be 100. + $this->assertSame(100, $searcharea->get_last_indexing_duration()); + } + + /** + * Tests that partial indexing works correctly. + */ + public function test_partial_indexing() { + global $USER; + + $this->resetAfterTest(); + $this->setAdminUser(); + + // Create a course and a forum. + $generator = $this->getDataGenerator(); + $course = $generator->create_course(); + $forum = $generator->create_module('forum', ['course' => $course->id]); + + // Index everything up to current. Ensure the course is older than current second so it + // definitely doesn't get indexed again next time. + $this->waitForSecond(); + $search = testable_core_search::instance(); + $search->index(false, 0); + + $searcharea = $search->get_search_area($this->forumpostareaid); + list($componentname, $varname) = $searcharea->get_config_var_name(); + $this->assertFalse(get_config($componentname, $varname . '_partial')); + + // Add 3 discussions to the forum. + $now = time(); + $generator->get_plugin_generator('mod_forum')->create_discussion(['course' => $course->id, + 'forum' => $forum->id, 'userid' => $USER->id, 'timemodified' => $now, + 'name' => 'Frog']); + $generator->get_plugin_generator('mod_forum')->create_discussion(['course' => $course->id, + 'forum' => $forum->id, 'userid' => $USER->id, 'timemodified' => $now + 1, + 'name' => 'Toad']); + $generator->get_plugin_generator('mod_forum')->create_discussion(['course' => $course->id, + 'forum' => $forum->id, 'userid' => $USER->id, 'timemodified' => $now + 2, + 'name' => 'Zombie']); + time_sleep_until($now + 3); + + // Clear the count of added documents. + $search->get_engine()->get_and_clear_added_documents(); + + // Make the search engine delay while indexing each document. 
+ $search->get_engine()->set_add_delay(1.2); + + // Index with a limit of 2 seconds - it should index 2 of the documents (after the second + // one, it will have taken 2.4 seconds so it will stop). + $search->index(false, 2); + $added = $search->get_engine()->get_and_clear_added_documents(); + $this->assertCount(2, $added); + $this->assertEquals('Frog', $added[0]->get('title')); + $this->assertEquals('Toad', $added[1]->get('title')); + $this->assertEquals(1, get_config($componentname, $varname . '_partial')); + + // Add a label. + $generator->create_module('label', ['course' => $course->id, 'intro' => 'Vampire']); + + // Wait to next second (so as to not reindex the label more than once, as it will now + // be timed before the indexing run). + $this->waitForSecond(); + + // Next index with 1 second limit should do the label and not the forum - the logic is, + // if it spent ages indexing an area last time, do that one last on next run. + $search->index(false, 1); + $added = $search->get_engine()->get_and_clear_added_documents(); + $this->assertCount(1, $added); + $this->assertEquals('Vampire', $added[0]->get('title')); + + // Index again with a 2 second limit - it will redo last post for safety (because of other + // things possibly having the same time second), and then do the remaining one. (Note: + // because it always does more than one second worth of items, it would actually index 2 + // posts even if the limit were less than 2.) + $search->index(false, 2); + $added = $search->get_engine()->get_and_clear_added_documents(); + $this->assertCount(2, $added); + $this->assertEquals('Toad', $added[0]->get('title')); + $this->assertEquals('Zombie', $added[1]->get('title')); + $this->assertFalse(get_config($componentname, $varname . '_partial')); + + // Index again - there should be nothing to index this time. 
+ $search->index(false, 2); + $added = $search->get_engine()->get_and_clear_added_documents(); + $this->assertCount(0, $added); + $this->assertFalse(get_config($componentname, $varname . '_partial')); + } + /** * Adding this test here as get_areas_user_accesses process is the same, results just depend on the context level. * diff --git a/search/upgrade.txt b/search/upgrade.txt index 94d0d69fbc355..5a8415878d657 100644 --- a/search/upgrade.txt +++ b/search/upgrade.txt @@ -1,6 +1,13 @@ This files describes API changes in /search/*, information provided here is intended especially for developers. +=== 3.4 === + +* Search indexing now supports time limits to make the scheduled task run more neatly. In order for + this to work, search engine plugins will need to implement the 'stopat' parameter if they + override the add_documents() function, and return an extra parameter from this function (see base + class in engine.php). Unmodified plugins will still work, but without supporting time limits. + === 3.2 === * Base search area classes have been renamed, please update your search areas to use the classes below: