Merge pull request janhq#931 from janhq/fix/nitro_perf_cpu
feat: Hotfix for Nitro loading on CPU with hyper-threading support
hiro-v authored Dec 12, 2023
2 parents 121dc11 + 14f83dd commit fef97f6
Showing 6 changed files with 43 additions and 19 deletions.
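
The core of the fix: on CPUs with hyper-threading, the logical core count is roughly double the physical core count, and this PR sets Nitro's cpu_threads to the number of physical cores instead of relying on the logical count. A minimal sketch of the idea, using the systeminformation package this PR adds (the helper name is illustrative, not the extension's actual API):

const os = require("os");
const si = require("systeminformation");

// Illustrative helper: contrast logical vs. physical core counts.
async function pickCpuThreads(): Promise<number> {
  const logicalCores = os.cpus().length;    // counts hyper-threads
  const { physicalCores } = await si.cpu(); // counts real cores
  console.log(`logical: ${logicalCores}, physical: ${physicalCores}`);
  return physicalCores;                     // the value this PR passes to Nitro as cpu_threads
}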
1 change: 1 addition & 0 deletions core/src/types/index.ts
@@ -275,6 +275,7 @@ export type ModelSettingParams = {
ngl?: number;
embedding?: boolean;
n_parallel?: number;
cpu_threads?: number;
system_prompt?: string;
user_prompt?: string;
ai_prompt?: string;
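
For illustration, a caller could now pass the new field alongside the existing optional settings; a hedged sketch, assuming ModelSettingParams is imported from the core package (the import path shown here is an assumption):

import { ModelSettingParams } from "@janhq/core";

// Hypothetical caller-side settings; cpu_threads is the field added above.
const settings: ModelSettingParams = {
  ngl: 100,
  n_parallel: 1,
  cpu_threads: 4, // e.g. the machine's physical core count
};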
4 changes: 3 additions & 1 deletion extensions/inference-nitro-extension/package.json
@@ -36,6 +36,7 @@
"kill-port": "^2.0.1",
"path-browserify": "^1.0.1",
"rxjs": "^7.8.1",
"systeminformation": "^5.21.20",
"tcp-port-used": "^1.0.2",
"ts-loader": "^9.5.0",
"ulid": "^2.3.0"
@@ -52,6 +53,7 @@
"tcp-port-used",
"kill-port",
"fetch-retry",
"electron-log"
"electron-log",
"systeminformation"
]
}
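
The new systeminformation dependency supplies the hardware probe used later in module.ts; roughly, the two calls the extension relies on look like this (a sketch; field names follow the library's documented API, error handling omitted):

const si = require("systeminformation");

// Probe the two values the extension cares about: physical cores and available memory.
async function probe() {
  const cpu = await si.cpu(); // { cores, physicalCores, ... }
  const mem = await si.mem(); // { total, available, ... }
  return { numCpuPhysicalCore: cpu.physicalCores, memAvailable: mem.available };
}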
6 changes: 6 additions & 0 deletions extensions/inference-nitro-extension/src/@types/global.d.ts
@@ -12,6 +12,7 @@ declare const INFERENCE_URL: string;
interface EngineSettings {
ctx_len: number;
ngl: number;
cpu_threads: number;
cont_batching: boolean;
embedding: boolean;
}
@@ -24,3 +25,8 @@ interface ModelOperationResponse {
error?: any;
modelFile?: string;
}

interface ResourcesInfo {
numCpuPhysicalCore: number;
memAvailable: number;
}
2 changes: 1 addition & 1 deletion extensions/inference-nitro-extension/src/index.ts
@@ -12,7 +12,6 @@ import {
EventName,
MessageRequest,
MessageStatus,
ModelSettingParams,
ExtensionType,
ThreadContent,
ThreadMessage,
@@ -41,6 +40,7 @@ export default class JanInferenceNitroExtension implements InferenceExtension {
private static _engineSettings: EngineSettings = {
ctx_len: 2048,
ngl: 100,
cpu_threads: 1,
cont_batching: false,
embedding: false,
};
48 changes: 32 additions & 16 deletions extensions/inference-nitro-extension/src/module.ts
@@ -4,6 +4,7 @@ const path = require("path");
const { spawn } = require("child_process");
const tcpPortUsed = require("tcp-port-used");
const fetchRetry = require("fetch-retry")(global.fetch);
const si = require("systeminformation");

const log = require("electron-log");

@@ -38,23 +39,29 @@ function stopModel(): Promise<ModelOperationResponse> {
 * TODO: Should pass the absolute path of the model file instead of just the name - so we can modularize module.ts into an npm package
* TODO: Should it be startModel instead?
*/
function initModel(wrapper: any): Promise<ModelOperationResponse> {
async function initModel(wrapper: any): Promise<ModelOperationResponse> {
currentModelFile = wrapper.modelFullPath;
if (wrapper.model.engine !== "nitro") {
return Promise.resolve({ error: "Not a nitro model" });
} else {
log.info("Started to load model " + wrapper.model.modelFullPath);
// Gather system information for CPU physical cores and memory
const nitroResourceProbe = await getResourcesInfo();
console.log(
"Nitro with physical core: " + nitroResourceProbe.numCpuPhysicalCore
);
const settings = {
llama_model_path: currentModelFile,
...wrapper.model.settings,
// This is critical and requires real system information
cpu_threads: nitroResourceProbe.numCpuPhysicalCore,
};
log.info(`Load model settings: ${JSON.stringify(settings, null, 2)}`);
return (
// 1. Check if the port is used, if used, attempt to unload model / kill nitro process
validateModelVersion()
.then(checkAndUnloadNitro)
// 2. Spawn the Nitro subprocess
.then(spawnNitroProcess)
.then(await spawnNitroProcess(nitroResourceProbe))
// 4. Load the model into the Nitro subprocess (HTTP POST request)
.then(() => loadLLMModel(settings))
// 5. Check if the model is loaded successfully
@@ -166,32 +173,28 @@ async function checkAndUnloadNitro() {
* Using child-process to spawn the process
* Should run exactly platform specified Nitro binary version
*/
async function spawnNitroProcess(): Promise<void> {
return new Promise((resolve, reject) => {
async function spawnNitroProcess(nitroResourceProbe: any): Promise<any> {
return new Promise(async (resolve, reject) => {
let binaryFolder = path.join(__dirname, "bin"); // Current directory by default
let binaryName;

if (process.platform === "win32") {
// Todo: Need to check for CUDA support to switch between CUDA and non-CUDA binaries
binaryName = "win-start.bat";
} else if (process.platform === "darwin") {
// Mac OS platform
if (process.arch === "arm64") {
binaryFolder = path.join(binaryFolder, "mac-arm64");
} else {
binaryFolder = path.join(binaryFolder, "mac-x64");
}
binaryName = "nitro";
} else {
// Linux
// Todo: Need to check for CUDA support to switch between CUDA and non-CUDA binaries
binaryName = "linux-start.sh"; // For other platforms
binaryName = "linux-start.sh";
}

const binaryPath = path.join(binaryFolder, binaryName);

// Execute the binary
subprocess = spawn(binaryPath, [1, "127.0.0.1", PORT], {
subprocess = spawn(binaryPath, [1, LOCAL_HOST, PORT], {
cwd: binaryFolder,
});

@@ -211,7 +214,7 @@ async function spawnNitroProcess(): Promise<void> {
reject(`Nitro process exited. ${code ?? ""}`);
});
tcpPortUsed.waitUntilUsed(PORT, 300, 30000).then(() => {
resolve();
resolve(nitroResourceProbe);
});
});
}
@@ -263,17 +266,30 @@ function validateModelVersion(): Promise<void> {
});
}

/**
* Cleans up any registered resources.
 * It's a module-specific function and should be called when the application is closed
*/
function dispose() {
// clean other registered resources here
killSubprocess();
}

/**
* Get the system resources information
*/
async function getResourcesInfo(): Promise<ResourcesInfo> {
return new Promise(async (resolve) => {
const cpu = await si.cpu();
const mem = await si.mem();

const response = {
numCpuPhysicalCore: cpu.physicalCores,
memAvailable: mem.available,
};
resolve(response);
});
}

module.exports = {
initModel,
stopModel,
killSubprocess,
dispose,
};
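
Taken together, the module.ts changes make the load path roughly the following (a condensed sketch of the flow above, not the file's exact code; promise chaining and error handling are simplified):

async function initModelSketch(wrapper: any) {
  // 1. Probe real hardware before touching Nitro.
  const probe = await getResourcesInfo(); // { numCpuPhysicalCore, memAvailable }

  // 2. Merge the probe into the settings sent to Nitro.
  const settings = {
    llama_model_path: wrapper.modelFullPath,
    ...wrapper.model.settings,
    cpu_threads: probe.numCpuPhysicalCore, // physical cores, not hyper-threads
  };

  // 3. Free the port, spawn the platform-specific binary, then POST the settings.
  await validateModelVersion();
  await checkAndUnloadNitro();
  await spawnNitroProcess(probe);
  return loadLLMModel(settings);
}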
1 change: 0 additions & 1 deletion extensions/inference-openai-extension/src/index.ts
@@ -12,7 +12,6 @@ import {
EventName,
MessageRequest,
MessageStatus,
ModelSettingParams,
ExtensionType,
ThreadContent,
ThreadMessage,
