1. Support bsz>1. 2. Support args.performance_stats: make render.py and metrics.py run successfully again; change the stats-logging and argument-checking code.
MyForking · Feb 19, 2024 · 6ee8e5e · 6ee8e5e
1 parent 2523630
commit 6ee8e5e
Show file tree

Hide file tree

Showing 5 changed files with 174 additions and 35 deletions.
diff --git a/render.py b/render.py
@@ -20,6 +20,7 @@
 from argparse import ArgumentParser
 from arguments import ModelParams, PipelineParams, get_combined_args
 from gaussian_renderer import GaussianModel
+from scene.workload_division import DivisionStrategyHistoryWS1
 
 def render_set(model_path, name, iteration, views, gaussians, pipeline, background, generate_num):
     render_path = os.path.join(model_path, name, "ours_{}".format(iteration), "renders")
@@ -43,8 +44,10 @@ def render_set(model_path, name, iteration, views, gaussians, pipeline, backgrou
     for idx, view in enumerate(tqdm(views, desc="Rendering progress")):
         if idx == generate_num:
             break
+        strategy_history = DivisionStrategyHistoryWS1(view, 1, 0)
+        strategy = strategy_history.start_strategy()
 
-        rendering = render(view, gaussians, pipeline, background, cuda_args=cuda_args)["render"]
+        rendering = render(view, gaussians, pipeline, background, cuda_args=cuda_args, strategy=strategy)["render"]
         gt = view.original_image[0:3, :, :]
         torchvision.utils.save_image(rendering, os.path.join(render_path, '{0:05d}'.format(idx) + ".png"))
         torchvision.utils.save_image(gt, os.path.join(gts_path, '{0:05d}'.format(idx) + ".png"))

diff --git a/scene/dataset_readers.py b/scene/dataset_readers.py
@@ -101,7 +101,7 @@ def readColmapCameras(cam_extrinsics, cam_intrinsics, images_folder):
         image_name = os.path.basename(image_path).split(".")[0]
         image = Image.open(image_path) # this is a lazy load, the image is not loaded yet
 
-        if args.fixed_training_image == -1:
+        if hasattr(args, "fixed_training_image") and args.fixed_training_image == -1:
             image.load() # load immediately after open file. 
 
         cam_info = CameraInfo(uid=uid, R=R, T=T, FovY=FovY, FovX=FovX, image=image,

diff --git a/time_statistic.py b/time_statistic.py
@@ -2302,6 +2302,73 @@ def compare_GPU_utilization(save_folder, file_paths):
         all_df[file_path] = df["b10 render time"] / all_df["baseline"]
     all_df.to_csv(save_folder + "compare_multiple_GPU_utilization.csv", index=False)
 
+def draw_epoch_loss(file_paths):
+    epoch_losses = []
+    for file_path in file_paths:
+        epoch_loss = []
+        lines = open(file_path, "r").readlines()
+        for line in lines:
+            #epoch 2 loss: 0.17376218013391145
+            if line.startswith("epoch "):
+                epoch_loss.append(float(line.split(" ")[-1]))
+        epoch_losses.append(epoch_loss)
+
+    fig, ax = plt.subplots(figsize=(20, 10))
+    for i, epoch_loss in enumerate(epoch_losses):
+        ax.plot(range(len(epoch_loss)), epoch_loss, label=file_paths[i])
+    ax.legend(loc='upper right')
+    folder = "/".join(file_paths[0].split("/")[:-1]) + "/"
+    plt.savefig(folder+"compare_epoch_loss.png")
+
+def draw_evaluation_results(file_paths):
+    eval_tests_PSNR = []
+    eval_trains_PSNR = []
+    iterations = []
+    # Evaluating test: 
+    for file_path in file_paths:
+        lines = open(file_path, "r").readlines()
+        eval_test_PSNR = []
+        eval_train_PSNR = []
+        for line in lines:
+            # [ITER 30000] Evaluating test: L1 0.058287687942777805 PSNR 21.94811627739354
+            # [ITER 30000] Evaluating train: L1 0.03144958354532719 PSNR 26.123293685913087
+            if "Evaluating test: " in line:
+                eval_test_PSNR.append(float(line.split(" ")[-1]))
+                if len(eval_tests_PSNR) == 0:
+                    iterations.append(int(line.split(" ")[1][:-1]))
+            if "Evaluating train: " in line:
+                eval_train_PSNR.append(float(line.split(" ")[-1]))
+        eval_tests_PSNR.append(eval_test_PSNR)
+        eval_trains_PSNR.append(eval_train_PSNR)
+
+    # draw the two figures on the same graph.
+    fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(20, 10))
+    for i, eval_test_PSNR in enumerate(eval_tests_PSNR):
+        # x-axis is iteration
+        # y-axis is PSNR
+        ax[0].plot(iterations, eval_test_PSNR, label=file_paths[i])
+
+    ax[0].set_ylabel('PSNR')
+    secax = ax[0].secondary_yaxis('right')
+    secax.set_ylabel('PSNR')
+    ax[0].legend(loc='lower right')
+    ax[0].set_title("Evaluating test PSNR")
+
+    for i, eval_train_PSNR in enumerate(eval_trains_PSNR):
+        # x-axis is iteration
+        # y-axis is PSNR
+        ax[1].plot(iterations, eval_train_PSNR, label=file_paths[i])
+
+    ax[1].set_ylabel('PSNR')
+    secax = ax[1].secondary_yaxis('right')
+    secax.set_ylabel('PSNR')
+    ax[1].legend(loc='lower right')
+    ax[1].set_title("Evaluating train PSNR")
+
+    folder = "/".join(file_paths[0].split("/")[:-1]) + "/"
+    plt.savefig(folder+"compare_evaluation_results.png")
+
+
 if __name__ == "__main__":
     # NOTE: folder_path must end with "/" !!!
 
@@ -2605,9 +2672,43 @@ def compare_GPU_utilization(save_folder, file_paths):
 
 
 
+    # draw_epoch_loss(
+    #     [
+    #         "experiments/bsz1/python_ws=4_rk=0.log",
+    #         "experiments/bsz2/python_ws=4_rk=0.log",
+    #         "experiments/bsz4/python_ws=4_rk=0.log",
+    #     ]
+    # )
 
+    # draw_epoch_loss(
+    #     [
+    #         "experiments/bsz1_2/python_ws=4_rk=0.log",
+    #         "experiments/bsz2_2/python_ws=4_rk=0.log",
+    #         "experiments/bsz4_2/python_ws=4_rk=0.log",
+    #     ]
+    # )
 
+    # draw_epoch_loss(
+    #     [
+    #         "experiments/bsz1/python_ws=4_rk=0.log",
+    #         "experiments/bsz2/python_ws=4_rk=0.log",
+    #         "experiments/bsz4/python_ws=4_rk=0.log",
+    #         "experiments/bsz1_2/python_ws=4_rk=0.log",
+    #         "experiments/bsz2_2/python_ws=4_rk=0.log",
+    #         "experiments/bsz4_2/python_ws=4_rk=0.log",
+    #     ]
+    # )
 
+    draw_evaluation_results(
+        [
+            "experiments/bsz1_perf/python_ws=4_rk=0.log",
+            "experiments/bsz2_perf/python_ws=4_rk=0.log",
+            "experiments/bsz4_perf/python_ws=4_rk=0.log",
+            "experiments/bsz8_perf/python_ws=4_rk=0.log",
+            "experiments/bsz16_perf/python_ws=4_rk=0.log",
+            "experiments/bsz32_perf/python_ws=4_rk=0.log",   
+        ]
+    )
 
     pass
 

diff --git a/train.py b/train.py
@@ -453,7 +453,7 @@ def training(dataset, opt, pipe, args, log_file):
                 timers.stop("densification")
 
             # Optimizer step
-            if iteration < opt.iterations:
+            if iteration < opt.iterations and iteration % args.bsz == 0:
                 timers.start("optimizer_step")
                 if not args.stop_update_param:
                     gaussians.optimizer.step()
@@ -476,31 +476,35 @@ def training(dataset, opt, pipe, args, log_file):
             i2jsend_file.write("iteration {}:{}\n".format(iteration, json.dumps(i2j_send_size)))
             i2jsend_file.flush()
 
-
     # Finish training
-    if (args.adjust_div_stra and args.adjust_mode in ["1", "2", "4"]) or (not args.adjust_div_stra and utils.WORLD_SIZE == 1):
-        data_json = {}
-        for camera_id, strategy_history in cameraId2StrategyHistory.items():
-            data_json[camera_id] = strategy_history.to_json()
-
-        with open(args.log_folder+"/strategy_history_ws="+str(utils.WORLD_SIZE)+"_rk="+str(utils.LOCAL_RANK)+".json", 'w') as f:
-            json.dump(data_json, f)
-
     if args.end2end_time:
         torch.cuda.synchronize()
         log_file.write("end2end total_time: {:.6f} ms, iterations: {}, throughput {:.2f} it/s\n".format(time.time() - train_start_time, opt.iterations, opt.iterations/(time.time() - train_start_time)))
 
     log_file.write("Max Memory usage: {} GB.\n".format(torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024))
 
-    # DEBUG
-    if args.memory_distribution:
-        # save gaussians.send_to_gpui_cnt to file.
-        with open(args.log_folder+"/send_to_gpui_cnt_ws="+str(utils.WORLD_SIZE)+"_rk="+str(utils.LOCAL_RANK)+".json", 'w') as f:
-            send_to_gpui_cnt_cpu = gaussians.send_to_gpui_cnt.cpu().numpy().tolist()
-            data2save = []
-            for i in range(len(send_to_gpui_cnt_cpu)):
-                data2save.append( ",".join([str(x) for x in send_to_gpui_cnt_cpu[i]]) )
-            json.dump(data2save, f, indent=4)
+    if not args.performance_stats:
+
+        # Save some statistics to file for future usage. 
+
+        if (args.adjust_div_stra and args.adjust_mode in ["1", "2", "4"]) or (not args.adjust_div_stra and utils.WORLD_SIZE == 1):
+            data_json = {}
+            for camera_id, strategy_history in cameraId2StrategyHistory.items():
+                data_json[camera_id] = strategy_history.to_json()
+
+            with open(args.log_folder+"/strategy_history_ws="+str(utils.WORLD_SIZE)+"_rk="+str(utils.LOCAL_RANK)+".json", 'w') as f:
+                json.dump(data_json, f)
+
+
+        # DEBUG
+        if args.memory_distribution:
+            # save gaussians.send_to_gpui_cnt to file.
+            with open(args.log_folder+"/send_to_gpui_cnt_ws="+str(utils.WORLD_SIZE)+"_rk="+str(utils.LOCAL_RANK)+".json", 'w') as f:
+                send_to_gpui_cnt_cpu = gaussians.send_to_gpui_cnt.cpu().numpy().tolist()
+                data2save = []
+                for i in range(len(send_to_gpui_cnt_cpu)):
+                    data2save.append( ",".join([str(x) for x in send_to_gpui_cnt_cpu[i]]) )
+                json.dump(data2save, f, indent=4)
 
 
 def prepare_output_and_logger(args):    
@@ -632,6 +636,8 @@ def training_report(tb_writer, iteration, Ll1, loss, l1_loss, elapsed, testing_i
     parser.add_argument("--benchmark_stats", action='store_true', default=False)
     parser.add_argument("--stop_adjust2_well_balanced", action='store_true', default=False)
     parser.add_argument("--log_tiles_stats_img_num", type=int, default=-1)
+    parser.add_argument("--bsz", type=int, default=1)
+    parser.add_argument("--performance_stats", action='store_true', default=False)
     args = parser.parse_args(sys.argv[1:])
     args.save_iterations.append(args.iterations)
 
@@ -644,6 +650,48 @@ def training_report(tb_writer, iteration, Ll1, loss, l1_loss, elapsed, testing_i
     print("Local rank: " + str(utils.LOCAL_RANK) + " World size: " + str(utils.WORLD_SIZE))
 
     # Check arguments
+    assert not (args.benchmark_stats and args.performance_stats), "benchmark_stats and performance_stats can not be enabled at the same time."
+
+    if args.benchmark_stats:
+        args.zhx_time = True
+        args.zhx_python_time = True
+        args.log_iteration_memory_usage = True
+        args.check_memory_usage = True
+        args.end2end_time = True
+        args.disable_checkpoint_and_save = True
+        args.checkpoint_iterations = []
+        args.save_iterations = []
+        assert args.fixed_training_image == -1, "benchmark mode does not support fixed_training_image."
+        assert not args.disable_auto_densification, "benchmark mode needs auto densification."
+        assert not args.save_i2jsend, "benchmark mode does not support save_i2jsend."
+        assert not args.stop_update_param, "benchmark mode does not support stop_update_param."
+
+    if args.performance_stats:
+        args.eval = True
+        args.zhx_time = False
+        args.zhx_python_time = False
+        args.end2end_time = True
+        args.log_iteration_memory_usage = False
+        args.check_memory_usage = False
+        args.save_iterations = [2000, 7000, 15000, 30000]
+        args.test_iterations = [500]+ [i for i in range(2000, args.iterations+1, 1000)]
+        args.checkpoint_iterations = []
+
+        # use the fastest mode.
+        if utils.WORLD_SIZE > 1:
+            args.adjust_div_stra = True
+            args.adjust_mode = "1"
+        else:
+            args.adjust_div_stra = False
+        args.lazy_load_image = True
+        args.memory_distribution = True
+        args.image_distribution = True
+
+        assert args.fixed_training_image == -1, "performance_stats mode does not support fixed_training_image."
+        assert not args.disable_auto_densification, "performance_stats mode needs auto densification."
+        assert not args.save_i2jsend, "performance_stats mode does not support save_i2jsend."
+        assert not args.stop_update_param, "performance_stats mode does not support stop_update_param."
+
     if args.adjust_div_stra and utils.WORLD_SIZE == 1:
         print("adjust_div_stra is enabled, but WORLD_SIZE is 1. disable adjust_div_stra.")
         args.adjust_div_stra = False
@@ -674,19 +722,6 @@ def training_report(tb_writer, iteration, Ll1, loss, l1_loss, elapsed, testing_i
     if args.log_iteration_memory_usage:
         args.check_memory_usage = True
 
-    if args.benchmark_stats:
-        args.zhx_time = True
-        args.zhx_python_time = True
-        args.log_iteration_memory_usage = True
-        args.check_memory_usage = True
-        args.end2end_time = True
-        args.disable_checkpoint_and_save = True
-        args.checkpoint_iterations = []
-        args.save_iterations = []
-        assert args.fixed_training_image == -1, "benchmark mode does not support fixed_training_image."
-        assert not args.disable_auto_densification, "benchmark mode needs auto densification."
-        assert not args.save_i2jsend, "benchmark mode does not support save_i2jsend."
-        assert not args.stop_update_param, "benchmark mode does not support stop_update_param."
 
     # create log folder
     if utils.LOCAL_RANK == 0:

diff --git a/utils/camera_utils.py b/utils/camera_utils.py
@@ -63,7 +63,7 @@ def cameraList_from_camInfos(cam_infos, resolution_scale, args):
     args = get_args()
     from tqdm import tqdm
     for id, c in tqdm(enumerate(cam_infos), total=len(cam_infos)):
-        if args.fixed_training_image == -1 or id == args.fixed_training_image:
+        if not hasattr(args, "fixed_training_image") or args.fixed_training_image == -1 or id == args.fixed_training_image:
             camera_list.append(loadCam(args, id, c, resolution_scale))
         else:
             camera_list.append(None)