From eb0424981f44bcb1b18f93b850d6018691431e86 Mon Sep 17 00:00:00 2001
From: Scott Hemmert <kshemme@sandia.gov>
Date: Thu, 10 Nov 2022 13:18:30 -0700
Subject: [PATCH] Small enhancements (#883)

* Added new call in sst python module to control whether or not Py_Finalize will be called in the destructor.  By default it will not be called.  Setting this to true is experimental.

* Change parallel-load command line option to be able to take NONE as an option, indicating serial loading should be used.  Added most of the program options in the Config class to be available through the python call sst.getProgramOptions.

* Updates to the verbose output including the addition of component count from the graph.
---
 src/sst/core/config.cc               |   9 +-
 src/sst/core/config.h                |  29 ++++++
 src/sst/core/configGraph.cc          |  11 +++
 src/sst/core/configGraph.h           |   2 +
 src/sst/core/main.cc                 |  72 ++++++++++-----
 src/sst/core/model/python/pymodel.cc | 127 +++++++++++++++++++--------
 src/sst/core/model/python/pymodel.h  |   3 +
 7 files changed, 190 insertions(+), 63 deletions(-)

diff --git a/src/sst/core/config.cc b/src/sst/core/config.cc
index e98f624e4..0f97779ad 100644
--- a/src/sst/core/config.cc
+++ b/src/sst/core/config.cc
@@ -340,6 +340,11 @@ class ConfigHelper
         for ( auto& ch : arg_lower )
             ch = std::tolower(ch, loc);
 
+        // NONE selects serial loading and has no SINGLE/MULTI mode to pick, so
+        // report success here instead of falling into the mode checks below.
+        cfg.parallel_load_ = (arg_lower != "none");
+        if ( !cfg.parallel_load_ ) return true;
+
         if ( arg_lower == "single" )
             cfg.parallel_load_mode_multi_ = false;
         else if ( arg_lower == "multi" )
@@ -351,7 +356,6 @@ class ConfigHelper
             return false;
         }
 
-        cfg.parallel_load_ = true;
         return true;
     }
 
@@ -889,7 +893,8 @@ static const struct sstLongOpts_s sstOptions[] = {
     DEF_ARG_OPTVAL(
         "parallel-load", 0, "MODE",
         "Enable parallel loading of configuration. This option is ignored for single rank jobs.  Optional mode "
-        "parameters are SINGLE and MULTI (default).  If SINGLE is specified, the same file will be passed to all MPI "
+        "parameters are NONE, SINGLE and MULTI (default).  If NONE is specified, parallel-load is turned off. If "
+        "SINGLE is specified, the same file will be passed to all MPI "
         "ranks.  If MULTI is specified, each MPI rank is required to have it's own file to load. Note, not all input "
         "formats support both types of file loading.",
         &ConfigHelper::enableParallelLoad, &ConfigHelper::enableParallelLoadMode, false),
diff --git a/src/sst/core/config.h b/src/sst/core/config.h
index d483de0d4..736bf155b 100644
--- a/src/sst/core/config.h
+++ b/src/sst/core/config.h
@@ -219,6 +219,17 @@ class Config : public SST::Core::Serialization::serializable
     */
     bool parallel_load_mode_multi() const { return parallel_load_mode_multi_; }
 
+    /**
+       Returns the string equivalent for parallel-load: NONE (if
+       parallel load is off), SINGLE or MULTI.
+    */
+    std::string parallel_load_str() const
+    {
+        if ( !parallel_load_ ) return "NONE";
+        if ( parallel_load_mode_multi_ ) return "MULTI";
+        return "SINGLE";
+    }
+
     /**
        TimeVortex implementation to use
     */
@@ -267,6 +278,24 @@ class Config : public SST::Core::Serialization::serializable
     */
     Simulation::Mode_t runMode() const { return runMode_; }
 
+    /**
+       Get the run mode as an upper-case string: INIT, RUN, BOTH or UNKNOWN.
+    */
+    std::string runMode_str() const
+    {
+        switch ( runMode_ ) {
+        case Simulation::INIT:
+            return "INIT";
+        case Simulation::RUN:
+            return "RUN";
+        case Simulation::BOTH:
+            return "BOTH";
+        case Simulation::UNKNOWN:
+            return "UNKNOWN";
+        }
+        return "UNKNOWN"; // fallback when runMode_ matches none of the cases above
+    }
+
 
 #ifdef USE_MEMPOOL
     /**
diff --git a/src/sst/core/configGraph.cc b/src/sst/core/configGraph.cc
index 004d1cee6..9abf52f28 100644
--- a/src/sst/core/configGraph.cc
+++ b/src/sst/core/configGraph.cc
@@ -663,6 +663,17 @@ ConfigComponent::checkPorts() const
     }
 }
 
+size_t
+ConfigGraph::getNumComponentsInMPIRank(uint32_t rank)
+{
+    // Tally the components in this graph whose rank.rank equals the requested rank.
+    size_t matches = 0;
+    for ( auto* c : comps )
+        matches += (c->rank.rank == rank) ? 1 : 0;
+    return matches;
+}
+
+
 void
 ConfigGraph::setComponentRanks(RankInfo rank)
 {
diff --git a/src/sst/core/configGraph.h b/src/sst/core/configGraph.h
index 7aee6cf1e..a57ad2d2b 100644
--- a/src/sst/core/configGraph.h
+++ b/src/sst/core/configGraph.h
@@ -412,6 +412,8 @@ class ConfigGraph : public SST::Core::Serialization::serializable
 
     size_t getNumComponents() { return comps.data.size(); }
 
+    /** Returns the number of components in the graph assigned to the given MPI rank */
+    size_t getNumComponentsInMPIRank(uint32_t rank);
     /** Helper function to set all the ranks to the same value */
     void setComponentRanks(RankInfo rank);
     /** Checks to see if rank contains at least one component */
diff --git a/src/sst/core/main.cc b/src/sst/core/main.cc
index ab41b7488..af0997702 100644
--- a/src/sst/core/main.cc
+++ b/src/sst/core/main.cc
@@ -574,7 +574,9 @@ main(int argc, char* argv[])
             return -1;
         }
 
-        modelGen = factory->Create<SSTModelDescription>(model_name, cfg.configFile(), cfg.verbose(), &cfg, start);
+        if ( myRank.rank == 0 || cfg.parallel_load() ) {
+            modelGen = factory->Create<SSTModelDescription>(model_name, cfg.configFile(), cfg.verbose(), &cfg, start);
+        }
     }
 
 
@@ -619,6 +621,12 @@ main(int argc, char* argv[])
         CALL_INFO, 1, 0, "#main() My rank is (%u.%u), on %u/%u nodes/threads\n", myRank.rank, myRank.thread,
         world_size.rank, world_size.thread);
 
+    // Delete the model generator
+    if ( modelGen ) {
+        delete modelGen;
+        modelGen = nullptr;
+    }
+
     // Need to initialize TimeLord
     Simulation_impl::getTimeLord()->init(cfg.timeBase());
 
@@ -631,15 +639,27 @@ main(int argc, char* argv[])
         }
     }
 
-    // Delete the model generator
-    delete modelGen;
-    modelGen = nullptr;
-
     double end_graph_gen = sst_get_cpu_time();
 
+    // If verbose level is high enough, compute the total number of
+    // components in the simulation.  NOTE: if parallel-load is
+    // enabled, then the partitioning won't actually happen and all
+    // ranks already have their parts of the graph.
+    uint64_t comp_count = 0;
+    if ( cfg.verbose() >= 1 ) {
+        if ( !cfg.parallel_load() && myRank.rank == 0 ) { comp_count = graph->getNumComponents(); }
+#ifdef SST_CONFIG_HAVE_MPI
+        else if ( cfg.parallel_load() ) {
+            uint64_t my_count = graph->getNumComponentsInMPIRank(myRank.rank);
+            MPI_Allreduce(&my_count, &comp_count, 1, MPI_UINT64_T, MPI_SUM, MPI_COMM_WORLD);
+        }
+#endif
+    }
+
     if ( myRank.rank == 0 ) {
         g_output.verbose(CALL_INFO, 1, 0, "# ------------------------------------------------------------\n");
         g_output.verbose(CALL_INFO, 1, 0, "# Graph construction took %f seconds.\n", (end_graph_gen - start_graph_gen));
+        g_output.verbose(CALL_INFO, 1, 0, "# Graph contains %" PRIu64 " components\n", comp_count);
     }
 
     ////// End ConfigGraph Creation //////
@@ -706,7 +726,8 @@ main(int argc, char* argv[])
     const uint64_t post_graph_create_rss = maxGlobalMemSize();
 
     if ( myRank.rank == 0 ) {
-        g_output.verbose(CALL_INFO, 1, 0, "# Graph partitioning took %lg seconds.\n", (end_part - start_part));
+        if ( !cfg.parallel_load() )
+            g_output.verbose(CALL_INFO, 1, 0, "# Graph partitioning took %lg seconds.\n", (end_part - start_part));
         g_output.verbose(
             CALL_INFO, 1, 0, "# Graph construction and partition raised RSS by %" PRIu64 " KB\n",
             (post_graph_create_rss - pre_graph_create_rss));
@@ -969,26 +990,29 @@ main(int argc, char* argv[])
         g_output.output("\n");
         g_output.output("\n");
         g_output.output("------------------------------------------------------------\n");
-        g_output.output("Simulation Timing Information:\n");
-        g_output.output("Build time:                      %f seconds\n", max_build_time);
-        g_output.output("Simulation time:                 %f seconds\n", max_run_time);
-        g_output.output("Total time:                      %f seconds\n", max_total_time);
-        g_output.output("Simulated time:                  %s\n", threadInfo[0].simulated_time.toStringBestSI().c_str());
+        g_output.output("Simulation Timing Information (Wall Clock Times):\n");
+        g_output.output("  Build time:                      %f seconds\n", max_build_time);
+        g_output.output("  Run loop time:                   %f seconds\n", max_run_time);
+        g_output.output("  Total time:                      %f seconds\n", max_total_time);
+        g_output.output("\n");
+        g_output.output(
+            "Simulated time:                    %s\n", threadInfo[0].simulated_time.toStringBestSI().c_str());
         g_output.output("\n");
         g_output.output("Simulation Resource Information:\n");
-        g_output.output("Max Resident Set Size:           %s\n", max_rss_ua.toStringBestSI().c_str());
-        g_output.output("Approx. Global Max RSS Size:     %s\n", global_rss_ua.toStringBestSI().c_str());
-        g_output.output("Max Local Page Faults:           %" PRIu64 " faults\n", local_max_pf);
-        g_output.output("Global Page Faults:              %" PRIu64 " faults\n", global_pf);
-        g_output.output("Max Output Blocks:               %" PRIu64 " blocks\n", global_max_io_out);
-        g_output.output("Max Input Blocks:                %" PRIu64 " blocks\n", global_max_io_in);
-        g_output.output("Max mempool usage:               %s\n", max_mempool_size_ua.toStringBestSI().c_str());
-        g_output.output("Global mempool usage:            %s\n", global_mempool_size_ua.toStringBestSI().c_str());
-        g_output.output("Global active activities:        %" PRIu64 " activities\n", global_active_activities);
-        g_output.output("Current global TimeVortex depth: %" PRIu64 " entries\n", global_current_tv_depth);
-        g_output.output("Max TimeVortex depth:            %" PRIu64 " entries\n", global_max_tv_depth);
-        g_output.output("Max Sync data size:              %s\n", global_max_sync_data_size_ua.toStringBestSI().c_str());
-        g_output.output("Global Sync data size:           %s\n", global_sync_data_size_ua.toStringBestSI().c_str());
+        g_output.output("  Max Resident Set Size:           %s\n", max_rss_ua.toStringBestSI().c_str());
+        g_output.output("  Approx. Global Max RSS Size:     %s\n", global_rss_ua.toStringBestSI().c_str());
+        g_output.output("  Max Local Page Faults:           %" PRIu64 " faults\n", local_max_pf);
+        g_output.output("  Global Page Faults:              %" PRIu64 " faults\n", global_pf);
+        g_output.output("  Max Output Blocks:               %" PRIu64 " blocks\n", global_max_io_out);
+        g_output.output("  Max Input Blocks:                %" PRIu64 " blocks\n", global_max_io_in);
+        g_output.output("  Max mempool usage:               %s\n", max_mempool_size_ua.toStringBestSI().c_str());
+        g_output.output("  Global mempool usage:            %s\n", global_mempool_size_ua.toStringBestSI().c_str());
+        g_output.output("  Global active activities:        %" PRIu64 " activities\n", global_active_activities);
+        g_output.output("  Current global TimeVortex depth: %" PRIu64 " entries\n", global_current_tv_depth);
+        g_output.output("  Max TimeVortex depth:            %" PRIu64 " entries\n", global_max_tv_depth);
+        g_output.output(
+            "  Max Sync data size:              %s\n", global_max_sync_data_size_ua.toStringBestSI().c_str());
+        g_output.output("  Global Sync data size:           %s\n", global_sync_data_size_ua.toStringBestSI().c_str());
         g_output.output("------------------------------------------------------------\n");
         g_output.output("\n");
         g_output.output("\n");
diff --git a/src/sst/core/model/python/pymodel.cc b/src/sst/core/model/python/pymodel.cc
index e951c6b2f..57199e97a 100644
--- a/src/sst/core/model/python/pymodel.cc
+++ b/src/sst/core/model/python/pymodel.cc
@@ -71,6 +71,8 @@ static PyObject* enableStatisticForComponentType(PyObject* self, PyObject* args)
 static PyObject* setStatisticLoadLevelForComponentName(PyObject* self, PyObject* args);
 static PyObject* setStatisticLoadLevelForComponentType(PyObject* self, PyObject* args);
 
+static PyObject* setCallPythonFinalize(PyObject* self, PyObject* args);
+
 static PyObject* mlFindModule(PyObject* self, PyObject* args);
 static PyObject* mlLoadModule(PyObject* self, PyObject* args);
 
@@ -299,40 +301,70 @@ getProgramOptions(PyObject* UNUSED(self), PyObject* UNUSED(args))
     Config* cfg = gModel->getConfig();
 
     PyObject* dict = PyDict_New();
-    PyDict_SetItem(dict, SST_ConvertToPythonString("debug-file"), SST_ConvertToPythonString(cfg->debugFile().c_str()));
+    // Basic options
+    PyDict_SetItem(dict, SST_ConvertToPythonString("verbose"), SST_ConvertToPythonLong(cfg->verbose()));
+    PyDict_SetItem(dict, SST_ConvertToPythonString("num-ranks"), SST_ConvertToPythonLong(cfg->num_ranks()));
+    PyDict_SetItem(dict, SST_ConvertToPythonString("num-threads"), SST_ConvertToPythonLong(cfg->num_threads()));
+    PyDict_SetItem(dict, SST_ConvertToPythonString("sdl-file"), SST_ConvertToPythonString(cfg->configFile().c_str()));
+    PyDict_SetItem(dict, SST_ConvertToPythonString("print-timing-info"), SST_ConvertToPythonBool(cfg->print_timing()));
     PyDict_SetItem(dict, SST_ConvertToPythonString("stop-at"), SST_ConvertToPythonString(cfg->stop_at().c_str()));
+    PyDict_SetItem(dict, SST_ConvertToPythonString("exit-after"), SST_ConvertToPythonLong(cfg->exit_after()));
+    PyDict_SetItem(
+        dict, SST_ConvertToPythonString("partitioner"), SST_ConvertToPythonString(cfg->partitioner().c_str()));
     PyDict_SetItem(
         dict, SST_ConvertToPythonString("heartbeat-period"), SST_ConvertToPythonString(cfg->heartbeatPeriod().c_str()));
-    PyDict_SetItem(dict, SST_ConvertToPythonString("timebase"), SST_ConvertToPythonString(cfg->timeBase().c_str()));
     PyDict_SetItem(
-        dict, SST_ConvertToPythonString("partitioner"), SST_ConvertToPythonString(cfg->partitioner().c_str()));
-    PyDict_SetItem(dict, SST_ConvertToPythonString("verbose"), SST_ConvertToPythonLong(cfg->verbose()));
+        dict, SST_ConvertToPythonString("output-directory"),
+        SST_ConvertToPythonString(cfg->output_directory().c_str()));
     PyDict_SetItem(
-        dict, SST_ConvertToPythonString("output-partition"),
-        SST_ConvertToPythonString(cfg->component_partition_file().c_str()));
+        dict, SST_ConvertToPythonString("output-prefix-core"),
+        SST_ConvertToPythonString(cfg->output_core_prefix().c_str()));
+
+    // Configuration output options
     PyDict_SetItem(
         dict, SST_ConvertToPythonString("output-config"),
         SST_ConvertToPythonString(cfg->output_config_graph().c_str()));
+    PyDict_SetItem(
+        dict, SST_ConvertToPythonString("output-json"), SST_ConvertToPythonString(cfg->output_json().c_str()));
+    PyDict_SetItem(dict, SST_ConvertToPythonString("parallel-output"), SST_ConvertToPythonBool(cfg->parallel_output()));
+
+    // Graph output options
     PyDict_SetItem(dict, SST_ConvertToPythonString("output-dot"), SST_ConvertToPythonString(cfg->output_dot().c_str()));
-    PyDict_SetItem(dict, SST_ConvertToPythonString("numRanks"), SST_ConvertToPythonLong(cfg->num_ranks()));
-    PyDict_SetItem(dict, SST_ConvertToPythonString("numThreads"), SST_ConvertToPythonLong(cfg->num_threads()));
-    PyDict_SetItem(dict, SST_ConvertToPythonString("parallel-load"), SST_ConvertToPythonBool(cfg->parallel_load()));
-
-    const char* runModeStr = "UNKNOWN";
-    switch ( cfg->runMode() ) {
-    case Simulation::INIT:
-        runModeStr = "init";
-        break;
-    case Simulation::RUN:
-        runModeStr = "run";
-        break;
-    case Simulation::BOTH:
-        runModeStr = "both";
-        break;
-    default:
-        break;
-    }
-    PyDict_SetItem(dict, SST_ConvertToPythonString("run-mode"), SST_ConvertToPythonString(runModeStr));
+    PyDict_SetItem(dict, SST_ConvertToPythonString("dot-verbosity"), SST_ConvertToPythonLong(cfg->dot_verbosity()));
+    PyDict_SetItem(
+        dict, SST_ConvertToPythonString("output-partition"),
+        SST_ConvertToPythonString(cfg->component_partition_file().c_str()));
+
+    // Advanced options
+    PyDict_SetItem(dict, SST_ConvertToPythonString("timebase"), SST_ConvertToPythonString(cfg->timeBase().c_str()));
+    PyDict_SetItem(
+        dict, SST_ConvertToPythonString("parallel-load"), SST_ConvertToPythonString(cfg->parallel_load_str().c_str()));
+    PyDict_SetItem(
+        dict, SST_ConvertToPythonString("time-vortex"), SST_ConvertToPythonString(cfg->timeVortex().c_str()));
+    PyDict_SetItem(
+        dict, SST_ConvertToPythonString("interthread-links"), SST_ConvertToPythonBool(cfg->interthread_links()));
+    PyDict_SetItem(dict, SST_ConvertToPythonString("debug-file"), SST_ConvertToPythonString(cfg->debugFile().c_str()));
+    PyDict_SetItem(dict, SST_ConvertToPythonString("lib-path"), SST_ConvertToPythonString(cfg->libpath().c_str()));
+    PyDict_SetItem(
+        dict, SST_ConvertToPythonString("add-lib-path"), SST_ConvertToPythonString(cfg->addLibPath().c_str()));
+
+    // Advanced options - profiling
+    PyDict_SetItem(
+        dict, SST_ConvertToPythonString("enable-profiling"),
+        SST_ConvertToPythonString(cfg->enabledProfiling().c_str()));
+    PyDict_SetItem(
+        dict, SST_ConvertToPythonString("profiling-output"), SST_ConvertToPythonString(cfg->profilingOutput().c_str()));
+
+    // Advanced options - debug
+    PyDict_SetItem(dict, SST_ConvertToPythonString("run-mode"), SST_ConvertToPythonString(cfg->runMode_str().c_str()));
+#ifdef USE_MEMPOOL
+    PyDict_SetItem(
+        dict, SST_ConvertToPythonString("output-undeleted-events"),
+        SST_ConvertToPythonString(cfg->event_dump_file().c_str()));
+#endif
+    PyDict_SetItem(
+        dict, SST_ConvertToPythonString("force-rank-seq-startup"), SST_ConvertToPythonBool(cfg->rank_seq_startup()));
+
     return dict;
 }
 
@@ -776,6 +808,30 @@ setStatisticLoadLevelForComponentType(PyObject* UNUSED(self), PyObject* args)
     return SST_ConvertToPythonLong(0);
 }
 
+static PyObject*
+setCallPythonFinalize(PyObject* UNUSED(self), PyObject* arg)
+{
+    PyErr_Clear();
+    // arg is the single Python object passed by the caller; it is converted to a long and then to bool.
+    bool state = SST_ConvertToCppLong(arg);
+    if ( PyErr_Occurred() ) {
+        PyErr_Print();
+        exit(-1);
+    }
+    // Record the choice on the model; only rank 0 prints the experimental-feature warning.
+    gModel->setCallPythonFinalize(state);
+    int myrank = 0;
+#ifdef SST_CONFIG_HAVE_MPI
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+#endif
+    if ( state && myrank == 0 ) {
+        gModel->getOutput()->output(
+            "WARNING: Setting callPythonFinalize to True is EXPERIMENTAL pending further testing.\n");
+    }
+    // The Python caller always receives the integer 0.
+    return SST_ConvertToPythonLong(0);
+}
+
 static PyObject*
 globalAddParam(PyObject* UNUSED(self), PyObject* args)
 {
@@ -894,6 +950,9 @@ static PyMethodDef sstModuleMethods[] = {
       "getting fine timings.  For that, use the built-in time module." },
     { "getLocalMemoryUsage", getLocalMemoryUsage, METH_NOARGS,
       "Gets the current memory use, returned as a UnitAlgebra" },
+    { "setCallPythonFinalize", setCallPythonFinalize, METH_O,
+      "Sets whether or not Py_Finalize will be called after SST model generation is done.  By default (if this "
+      "function is not called), Py_Finalize will not be called." },
     { nullptr, nullptr, 0, nullptr }
 };
 
@@ -1028,7 +1087,8 @@ SSTPythonModelDefinition::SSTPythonModelDefinition(
     config(configObj),
     namePrefix(nullptr),
     namePrefixLen(0),
-    start_time(start_time)
+    start_time(start_time),
+    callPythonFinalize(false)
 {
     std::vector<std::string> argv_vector;
     argv_vector.push_back("sstsim.x");
@@ -1093,23 +1153,16 @@ SSTPythonModelDefinition::SSTPythonModelDefinition(
     free(argv);
 }
 
-// SSTPythonModelDefinition::SSTPythonModelDefinition(
-//     const std::string& script_file, int verbosity, Config* configObj, double start_time, int argc, char** argv) :
-//     SSTModelDescription(),
-//     scriptName(script_file),
-//     config(configObj),
-//     start_time(start_time)
-// {
-//     initModel(script_file, verbosity, configObj, argc, argv);
-// }
-
 SSTPythonModelDefinition::~SSTPythonModelDefinition()
 {
     delete output;
     gModel = nullptr;
 
     if ( nullptr != namePrefix ) free(namePrefix);
-    PyGC_Collect();
+    if ( callPythonFinalize ) { Py_Finalize(); } // opt-in: the flag is initialized to false in the constructor
+    else {
+        PyGC_Collect();
+    }
 }
 
 ConfigGraph*
diff --git a/src/sst/core/model/python/pymodel.h b/src/sst/core/model/python/pymodel.h
index 0df6c5999..1039ddff2 100644
--- a/src/sst/core/model/python/pymodel.h
+++ b/src/sst/core/model/python/pymodel.h
@@ -71,6 +71,7 @@ class SSTPythonModelDefinition : public SSTModelDescription
     std::map<std::string, ComponentId_t> compNameMap;
     ComponentId_t                        nextComponentId;
     double                               start_time;
+    bool                                 callPythonFinalize;
 
 public: /* Public, but private.  Called only from Python functions */
     Config* getConfig(void) const { return config; }
@@ -123,6 +124,8 @@ class SSTPythonModelDefinition : public SSTModelDescription
 
     UnitAlgebra getElapsedExecutionTime() const;
     UnitAlgebra getLocalMemoryUsage() const;
+    /** Selects whether the destructor calls Py_Finalize (true) or only PyGC_Collect (false) */
+    void setCallPythonFinalize(bool state) { callPythonFinalize = state; }
 };
 
 // For xml inputs (.xml or .sdl), we just use a python script to parse