Skip to content

Commit

Permalink
Small enhancements (sstsimulator#883)
Browse files Browse the repository at this point in the history
* Added new call in sst python module to control whether or not Py_Finalize will be called in the destructor.  By default it will not be called.  Setting this to true is experimental.

* Change parallel-load command line option to be able to take NONE as an option, indicating serial loading should be used.  Added most of the program options in the Config class to be available through the python call sst.getProgramOptions.

* Updates to the verbose output including the addition of component count from the graph.
  • Loading branch information
feldergast authored Nov 10, 2022
1 parent 427a142 commit eb04249
Show file tree
Hide file tree
Showing 7 changed files with 190 additions and 63 deletions.
9 changes: 7 additions & 2 deletions src/sst/core/config.cc
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,11 @@ class ConfigHelper
for ( auto& ch : arg_lower )
ch = std::tolower(ch, loc);

if ( arg_lower == "none" )
cfg.parallel_load_ = false;
else
cfg.parallel_load_ = true;

if ( arg_lower == "single" )
cfg.parallel_load_mode_multi_ = false;
else if ( arg_lower == "multi" )
Expand All @@ -351,7 +356,6 @@ class ConfigHelper
return false;
}

cfg.parallel_load_ = true;
return true;
}

Expand Down Expand Up @@ -889,7 +893,8 @@ static const struct sstLongOpts_s sstOptions[] = {
DEF_ARG_OPTVAL(
"parallel-load", 0, "MODE",
"Enable parallel loading of configuration. This option is ignored for single rank jobs. Optional mode "
"parameters are SINGLE and MULTI (default). If SINGLE is specified, the same file will be passed to all MPI "
"parameters are NONE, SINGLE and MULTI (default). If NONE is specified, parallel-load is turned off. If "
"SINGLE is specified, the same file will be passed to all MPI "
"ranks. If MULTI is specified, each MPI rank is required to have it's own file to load. Note, not all input "
"formats support both types of file loading.",
&ConfigHelper::enableParallelLoad, &ConfigHelper::enableParallelLoadMode, false),
Expand Down
29 changes: 29 additions & 0 deletions src/sst/core/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,17 @@ class Config : public SST::Core::Serialization::serializable
*/
bool parallel_load_mode_multi() const { return parallel_load_mode_multi_; }

/**
   Returns the string equivalent for parallel-load: NONE (if
   parallel load is off), SINGLE or MULTI.
*/
std::string parallel_load_str() const
{
    // parallel_load_ off wins regardless of the mode flag
    if ( !parallel_load_ ) return "NONE";
    if ( parallel_load_mode_multi_ ) return "MULTI";
    return "SINGLE";
}

/**
TimeVortex implementation to use
*/
Expand Down Expand Up @@ -267,6 +278,24 @@ class Config : public SST::Core::Serialization::serializable
*/
Simulation::Mode_t runMode() const { return runMode_; }

/**
   Returns the simulation run mode as a human-readable string
   (INIT, RUN, BOTH, or UNKNOWN).
*/
std::string runMode_str() const
{
    // Guard-clause chain; any unrecognized value maps to UNKNOWN.
    if ( runMode_ == Simulation::INIT ) return "INIT";
    if ( runMode_ == Simulation::RUN ) return "RUN";
    if ( runMode_ == Simulation::BOTH ) return "BOTH";
    return "UNKNOWN";
}


#ifdef USE_MEMPOOL
/**
Expand Down
11 changes: 11 additions & 0 deletions src/sst/core/configGraph.cc
Original file line number Diff line number Diff line change
Expand Up @@ -663,6 +663,17 @@ ConfigComponent::checkPorts() const
}
}

size_t
ConfigGraph::getNumComponentsInMPIRank(uint32_t rank)
{
size_t count = 0;
for ( auto* comp : comps ) {
if ( comp->rank.rank == rank ) ++count;
}
return count;
}


void
ConfigGraph::setComponentRanks(RankInfo rank)
{
Expand Down
2 changes: 2 additions & 0 deletions src/sst/core/configGraph.h
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,8 @@ class ConfigGraph : public SST::Core::Serialization::serializable

size_t getNumComponents() { return comps.data.size(); }

size_t getNumComponentsInMPIRank(uint32_t rank);

/** Helper function to set all the ranks to the same value */
void setComponentRanks(RankInfo rank);
/** Checks to see if rank contains at least one component */
Expand Down
72 changes: 48 additions & 24 deletions src/sst/core/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -574,7 +574,9 @@ main(int argc, char* argv[])
return -1;
}

modelGen = factory->Create<SSTModelDescription>(model_name, cfg.configFile(), cfg.verbose(), &cfg, start);
if ( myRank.rank == 0 || cfg.parallel_load() ) {
modelGen = factory->Create<SSTModelDescription>(model_name, cfg.configFile(), cfg.verbose(), &cfg, start);
}
}


Expand Down Expand Up @@ -619,6 +621,12 @@ main(int argc, char* argv[])
CALL_INFO, 1, 0, "#main() My rank is (%u.%u), on %u/%u nodes/threads\n", myRank.rank, myRank.thread,
world_size.rank, world_size.thread);

// Delete the model generator
if ( modelGen ) {
delete modelGen;
modelGen = nullptr;
}

// Need to initialize TimeLord
Simulation_impl::getTimeLord()->init(cfg.timeBase());

Expand All @@ -631,15 +639,27 @@ main(int argc, char* argv[])
}
}

// Delete the model generator
delete modelGen;
modelGen = nullptr;

double end_graph_gen = sst_get_cpu_time();

// If verbose level is high enough, compute the total number of
// components in the simulation. NOTE: if parallel-load is
// enabled, then the partitioning won't actually happen and all
// ranks already have their parts of the graph.
uint64_t comp_count = 0;
if ( cfg.verbose() >= 1 ) {
if ( !cfg.parallel_load() && myRank.rank == 0 ) { comp_count = graph->getNumComponents(); }
#ifdef SST_CONFIG_HAVE_MPI
else if ( cfg.parallel_load() ) {
uint64_t my_count = graph->getNumComponentsInMPIRank(myRank.rank);
MPI_Allreduce(&my_count, &comp_count, 1, MPI_UINT64_T, MPI_SUM, MPI_COMM_WORLD);
}
#endif
}

if ( myRank.rank == 0 ) {
g_output.verbose(CALL_INFO, 1, 0, "# ------------------------------------------------------------\n");
g_output.verbose(CALL_INFO, 1, 0, "# Graph construction took %f seconds.\n", (end_graph_gen - start_graph_gen));
g_output.verbose(CALL_INFO, 1, 0, "# Graph contains %" PRIu64 " components\n", comp_count);
}

////// End ConfigGraph Creation //////
Expand Down Expand Up @@ -706,7 +726,8 @@ main(int argc, char* argv[])
const uint64_t post_graph_create_rss = maxGlobalMemSize();

if ( myRank.rank == 0 ) {
g_output.verbose(CALL_INFO, 1, 0, "# Graph partitioning took %lg seconds.\n", (end_part - start_part));
if ( !cfg.parallel_load() )
g_output.verbose(CALL_INFO, 1, 0, "# Graph partitioning took %lg seconds.\n", (end_part - start_part));
g_output.verbose(
CALL_INFO, 1, 0, "# Graph construction and partition raised RSS by %" PRIu64 " KB\n",
(post_graph_create_rss - pre_graph_create_rss));
Expand Down Expand Up @@ -969,26 +990,29 @@ main(int argc, char* argv[])
g_output.output("\n");
g_output.output("\n");
g_output.output("------------------------------------------------------------\n");
g_output.output("Simulation Timing Information:\n");
g_output.output("Build time: %f seconds\n", max_build_time);
g_output.output("Simulation time: %f seconds\n", max_run_time);
g_output.output("Total time: %f seconds\n", max_total_time);
g_output.output("Simulated time: %s\n", threadInfo[0].simulated_time.toStringBestSI().c_str());
g_output.output("Simulation Timing Information (Wall Clock Times):\n");
g_output.output(" Build time: %f seconds\n", max_build_time);
g_output.output(" Run loop time: %f seconds\n", max_run_time);
g_output.output(" Total time: %f seconds\n", max_total_time);
g_output.output("\n");
g_output.output(
"Simulated time: %s\n", threadInfo[0].simulated_time.toStringBestSI().c_str());
g_output.output("\n");
g_output.output("Simulation Resource Information:\n");
g_output.output("Max Resident Set Size: %s\n", max_rss_ua.toStringBestSI().c_str());
g_output.output("Approx. Global Max RSS Size: %s\n", global_rss_ua.toStringBestSI().c_str());
g_output.output("Max Local Page Faults: %" PRIu64 " faults\n", local_max_pf);
g_output.output("Global Page Faults: %" PRIu64 " faults\n", global_pf);
g_output.output("Max Output Blocks: %" PRIu64 " blocks\n", global_max_io_out);
g_output.output("Max Input Blocks: %" PRIu64 " blocks\n", global_max_io_in);
g_output.output("Max mempool usage: %s\n", max_mempool_size_ua.toStringBestSI().c_str());
g_output.output("Global mempool usage: %s\n", global_mempool_size_ua.toStringBestSI().c_str());
g_output.output("Global active activities: %" PRIu64 " activities\n", global_active_activities);
g_output.output("Current global TimeVortex depth: %" PRIu64 " entries\n", global_current_tv_depth);
g_output.output("Max TimeVortex depth: %" PRIu64 " entries\n", global_max_tv_depth);
g_output.output("Max Sync data size: %s\n", global_max_sync_data_size_ua.toStringBestSI().c_str());
g_output.output("Global Sync data size: %s\n", global_sync_data_size_ua.toStringBestSI().c_str());
g_output.output(" Max Resident Set Size: %s\n", max_rss_ua.toStringBestSI().c_str());
g_output.output(" Approx. Global Max RSS Size: %s\n", global_rss_ua.toStringBestSI().c_str());
g_output.output(" Max Local Page Faults: %" PRIu64 " faults\n", local_max_pf);
g_output.output(" Global Page Faults: %" PRIu64 " faults\n", global_pf);
g_output.output(" Max Output Blocks: %" PRIu64 " blocks\n", global_max_io_out);
g_output.output(" Max Input Blocks: %" PRIu64 " blocks\n", global_max_io_in);
g_output.output(" Max mempool usage: %s\n", max_mempool_size_ua.toStringBestSI().c_str());
g_output.output(" Global mempool usage: %s\n", global_mempool_size_ua.toStringBestSI().c_str());
g_output.output(" Global active activities: %" PRIu64 " activities\n", global_active_activities);
g_output.output(" Current global TimeVortex depth: %" PRIu64 " entries\n", global_current_tv_depth);
g_output.output(" Max TimeVortex depth: %" PRIu64 " entries\n", global_max_tv_depth);
g_output.output(
" Max Sync data size: %s\n", global_max_sync_data_size_ua.toStringBestSI().c_str());
g_output.output(" Global Sync data size: %s\n", global_sync_data_size_ua.toStringBestSI().c_str());
g_output.output("------------------------------------------------------------\n");
g_output.output("\n");
g_output.output("\n");
Expand Down
127 changes: 90 additions & 37 deletions src/sst/core/model/python/pymodel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ static PyObject* enableStatisticForComponentType(PyObject* self, PyObject* args)
static PyObject* setStatisticLoadLevelForComponentName(PyObject* self, PyObject* args);
static PyObject* setStatisticLoadLevelForComponentType(PyObject* self, PyObject* args);

static PyObject* setCallPythonFinalize(PyObject* self, PyObject* args);

static PyObject* mlFindModule(PyObject* self, PyObject* args);
static PyObject* mlLoadModule(PyObject* self, PyObject* args);

Expand Down Expand Up @@ -299,40 +301,70 @@ getProgramOptions(PyObject* UNUSED(self), PyObject* UNUSED(args))
Config* cfg = gModel->getConfig();

PyObject* dict = PyDict_New();
PyDict_SetItem(dict, SST_ConvertToPythonString("debug-file"), SST_ConvertToPythonString(cfg->debugFile().c_str()));
// Basic options
PyDict_SetItem(dict, SST_ConvertToPythonString("verbose"), SST_ConvertToPythonLong(cfg->verbose()));
PyDict_SetItem(dict, SST_ConvertToPythonString("num-ranks"), SST_ConvertToPythonLong(cfg->num_ranks()));
PyDict_SetItem(dict, SST_ConvertToPythonString("num-threads"), SST_ConvertToPythonLong(cfg->num_threads()));
PyDict_SetItem(dict, SST_ConvertToPythonString("sdl-file"), SST_ConvertToPythonString(cfg->configFile().c_str()));
PyDict_SetItem(dict, SST_ConvertToPythonString("print-timing-info"), SST_ConvertToPythonBool(cfg->print_timing()));
PyDict_SetItem(dict, SST_ConvertToPythonString("stop-at"), SST_ConvertToPythonString(cfg->stop_at().c_str()));
PyDict_SetItem(dict, SST_ConvertToPythonString("exit-after"), SST_ConvertToPythonLong(cfg->exit_after()));
PyDict_SetItem(
dict, SST_ConvertToPythonString("partitioner"), SST_ConvertToPythonString(cfg->partitioner().c_str()));
PyDict_SetItem(
dict, SST_ConvertToPythonString("heartbeat-period"), SST_ConvertToPythonString(cfg->heartbeatPeriod().c_str()));
PyDict_SetItem(dict, SST_ConvertToPythonString("timebase"), SST_ConvertToPythonString(cfg->timeBase().c_str()));
PyDict_SetItem(
dict, SST_ConvertToPythonString("partitioner"), SST_ConvertToPythonString(cfg->partitioner().c_str()));
PyDict_SetItem(dict, SST_ConvertToPythonString("verbose"), SST_ConvertToPythonLong(cfg->verbose()));
dict, SST_ConvertToPythonString("output-directory"),
SST_ConvertToPythonString(cfg->output_directory().c_str()));
PyDict_SetItem(
dict, SST_ConvertToPythonString("output-partition"),
SST_ConvertToPythonString(cfg->component_partition_file().c_str()));
dict, SST_ConvertToPythonString("output-prefix-core"),
SST_ConvertToPythonString(cfg->output_core_prefix().c_str()));

// Configuration output options
PyDict_SetItem(
dict, SST_ConvertToPythonString("output-config"),
SST_ConvertToPythonString(cfg->output_config_graph().c_str()));
PyDict_SetItem(
dict, SST_ConvertToPythonString("output-json"), SST_ConvertToPythonString(cfg->output_json().c_str()));
PyDict_SetItem(dict, SST_ConvertToPythonString("parallel-output"), SST_ConvertToPythonBool(cfg->parallel_output()));

// Graph output options
PyDict_SetItem(dict, SST_ConvertToPythonString("output-dot"), SST_ConvertToPythonString(cfg->output_dot().c_str()));
PyDict_SetItem(dict, SST_ConvertToPythonString("numRanks"), SST_ConvertToPythonLong(cfg->num_ranks()));
PyDict_SetItem(dict, SST_ConvertToPythonString("numThreads"), SST_ConvertToPythonLong(cfg->num_threads()));
PyDict_SetItem(dict, SST_ConvertToPythonString("parallel-load"), SST_ConvertToPythonBool(cfg->parallel_load()));

const char* runModeStr = "UNKNOWN";
switch ( cfg->runMode() ) {
case Simulation::INIT:
runModeStr = "init";
break;
case Simulation::RUN:
runModeStr = "run";
break;
case Simulation::BOTH:
runModeStr = "both";
break;
default:
break;
}
PyDict_SetItem(dict, SST_ConvertToPythonString("run-mode"), SST_ConvertToPythonString(runModeStr));
PyDict_SetItem(dict, SST_ConvertToPythonString("dot-verbosity"), SST_ConvertToPythonLong(cfg->dot_verbosity()));
PyDict_SetItem(
dict, SST_ConvertToPythonString("output-partition"),
SST_ConvertToPythonString(cfg->component_partition_file().c_str()));

// Advanced options
PyDict_SetItem(dict, SST_ConvertToPythonString("timebase"), SST_ConvertToPythonString(cfg->timeBase().c_str()));
PyDict_SetItem(
dict, SST_ConvertToPythonString("parallel-load"), SST_ConvertToPythonString(cfg->parallel_load_str().c_str()));
PyDict_SetItem(
dict, SST_ConvertToPythonString("time-vortex"), SST_ConvertToPythonString(cfg->timeVortex().c_str()));
PyDict_SetItem(
dict, SST_ConvertToPythonString("interthread-links"), SST_ConvertToPythonBool(cfg->interthread_links()));
PyDict_SetItem(dict, SST_ConvertToPythonString("debug-file"), SST_ConvertToPythonString(cfg->debugFile().c_str()));
PyDict_SetItem(dict, SST_ConvertToPythonString("lib-path"), SST_ConvertToPythonString(cfg->libpath().c_str()));
PyDict_SetItem(
dict, SST_ConvertToPythonString("add-lib-path"), SST_ConvertToPythonString(cfg->addLibPath().c_str()));

// Advanced options - profiling
PyDict_SetItem(
dict, SST_ConvertToPythonString("enable-profiling"),
SST_ConvertToPythonString(cfg->enabledProfiling().c_str()));
PyDict_SetItem(
dict, SST_ConvertToPythonString("profiling-output"), SST_ConvertToPythonString(cfg->profilingOutput().c_str()));

// Advanced options - debug
PyDict_SetItem(dict, SST_ConvertToPythonString("run-mode"), SST_ConvertToPythonString(cfg->runMode_str().c_str()));
#ifdef USE_MEMPOOL
PyDict_SetItem(
dict, SST_ConvertToPythonString("output-undeleted-events"),
SST_ConvertToPythonString(cfg->event_dump_file().c_str()));
#endif
PyDict_SetItem(
dict, SST_ConvertToPythonString("force-rank-seq-startup"), SST_ConvertToPythonBool(cfg->rank_seq_startup()));

return dict;
}

Expand Down Expand Up @@ -776,6 +808,30 @@ setStatisticLoadLevelForComponentType(PyObject* UNUSED(self), PyObject* args)
return SST_ConvertToPythonLong(0);
}

// Python-visible sst.setCallPythonFinalize(bool): controls whether the
// model destructor will call Py_Finalize. Enabling it is experimental,
// so rank 0 prints a warning when it is turned on.
static PyObject*
setCallPythonFinalize(PyObject* UNUSED(self), PyObject* arg)
{
    PyErr_Clear();

    // Interpret the single argument as a boolean flag; bail out on a
    // conversion failure (matches the file's existing error style).
    const bool enable = SST_ConvertToCppLong(arg);
    if ( PyErr_Occurred() ) {
        PyErr_Print();
        exit(-1);
    }

    gModel->setCallPythonFinalize(enable);

    int rank = 0;
#ifdef SST_CONFIG_HAVE_MPI
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
#endif
    // Only rank 0 emits the warning so it appears once per job.
    if ( rank == 0 && enable ) {
        gModel->getOutput()->output(
            "WARNING: Setting callPythonFinalize to True is EXPERIMENTAL pending further testing.\n");
    }

    return SST_ConvertToPythonLong(0);
}

static PyObject*
globalAddParam(PyObject* UNUSED(self), PyObject* args)
{
Expand Down Expand Up @@ -894,6 +950,9 @@ static PyMethodDef sstModuleMethods[] = {
"getting fine timings. For that, use the built-in time module." },
{ "getLocalMemoryUsage", getLocalMemoryUsage, METH_NOARGS,
"Gets the current memory use, returned as a UnitAlgebra" },
{ "setCallPythonFinalize", setCallPythonFinalize, METH_O,
"Sets whether or not Py_Finalize will be called after SST model generation is done. Py_Finalize will be "
"called by default if this function is not called." },
{ nullptr, nullptr, 0, nullptr }
};

Expand Down Expand Up @@ -1028,7 +1087,8 @@ SSTPythonModelDefinition::SSTPythonModelDefinition(
config(configObj),
namePrefix(nullptr),
namePrefixLen(0),
start_time(start_time)
start_time(start_time),
callPythonFinalize(false)
{
std::vector<std::string> argv_vector;
argv_vector.push_back("sstsim.x");
Expand Down Expand Up @@ -1093,23 +1153,16 @@ SSTPythonModelDefinition::SSTPythonModelDefinition(
free(argv);
}

// SSTPythonModelDefinition::SSTPythonModelDefinition(
// const std::string& script_file, int verbosity, Config* configObj, double start_time, int argc, char** argv) :
// SSTModelDescription(),
// scriptName(script_file),
// config(configObj),
// start_time(start_time)
// {
// initModel(script_file, verbosity, configObj, argc, argv);
// }

SSTPythonModelDefinition::~SSTPythonModelDefinition()
{
    // Release model-owned resources and clear the module-level handle
    // so stray Python callbacks cannot reach a dead model.
    delete output;
    gModel = nullptr;

    if ( namePrefix != nullptr ) free(namePrefix);

    // Fully shutting down the interpreter is experimental (see
    // setCallPythonFinalize); by default only run a GC pass.
    if ( !callPythonFinalize ) { PyGC_Collect(); }
    else {
        Py_Finalize();
    }
}

ConfigGraph*
Expand Down
Loading

0 comments on commit eb04249

Please sign in to comment.