Skip to content

Commit

Permalink
1. support bsz>1 2. support args.performance_stats: make render.py and…
Browse files Browse the repository at this point in the history
… metrics.py run successfully again; change the stats logging and arguments checking code.
  • Loading branch information
Hexu Zhao committed Feb 19, 2024
1 parent 2523630 commit 6ee8e5e
Show file tree
Hide file tree
Showing 5 changed files with 174 additions and 35 deletions.
5 changes: 4 additions & 1 deletion render.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from argparse import ArgumentParser
from arguments import ModelParams, PipelineParams, get_combined_args
from gaussian_renderer import GaussianModel
from scene.workload_division import DivisionStrategyHistoryWS1

def render_set(model_path, name, iteration, views, gaussians, pipeline, background, generate_num):
render_path = os.path.join(model_path, name, "ours_{}".format(iteration), "renders")
Expand All @@ -43,8 +44,10 @@ def render_set(model_path, name, iteration, views, gaussians, pipeline, backgrou
for idx, view in enumerate(tqdm(views, desc="Rendering progress")):
if idx == generate_num:
break
strategy_history = DivisionStrategyHistoryWS1(view, 1, 0)
strategy = strategy_history.start_strategy()

rendering = render(view, gaussians, pipeline, background, cuda_args=cuda_args)["render"]
rendering = render(view, gaussians, pipeline, background, cuda_args=cuda_args, strategy=strategy)["render"]
gt = view.original_image[0:3, :, :]
torchvision.utils.save_image(rendering, os.path.join(render_path, '{0:05d}'.format(idx) + ".png"))
torchvision.utils.save_image(gt, os.path.join(gts_path, '{0:05d}'.format(idx) + ".png"))
Expand Down
2 changes: 1 addition & 1 deletion scene/dataset_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def readColmapCameras(cam_extrinsics, cam_intrinsics, images_folder):
image_name = os.path.basename(image_path).split(".")[0]
image = Image.open(image_path) # this is a lazy load, the image is not loaded yet

if args.fixed_training_image == -1:
if hasattr(args, "fixed_training_image") and args.fixed_training_image == -1:
image.load() # load immediately after open file.

cam_info = CameraInfo(uid=uid, R=R, T=T, FovY=FovY, FovX=FovX, image=image,
Expand Down
101 changes: 101 additions & 0 deletions time_statistic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2302,6 +2302,73 @@ def compare_GPU_utilization(save_folder, file_paths):
all_df[file_path] = df["b10 render time"] / all_df["baseline"]
all_df.to_csv(save_folder + "compare_multiple_GPU_utilization.csv", index=False)

def draw_epoch_loss(file_paths):
    """Plot the per-epoch loss curves of several training logs on one figure.

    A log line contributes one data point when it starts with ``"epoch "``,
    e.g. ``epoch 2 loss: 0.17376218013391145``; the last whitespace-separated
    field of such a line is parsed as the loss. Each log becomes one curve,
    labelled with its path, with the x-axis being the 0-based position of the
    point within that log.

    The figure is written to ``compare_epoch_loss.png`` in the directory of
    the first log file.

    Args:
        file_paths: Non-empty sequence of log file paths.

    Raises:
        IndexError: If ``file_paths`` is empty.
        OSError: If a log file cannot be opened.
        ValueError: If the last field of an ``epoch`` line is not a float.
    """
    # Local import: the file's top-level import list is not visible from here.
    import os

    epoch_losses = []
    for file_path in file_paths:
        # The context manager guarantees the handle is released even if a
        # malformed line makes float() raise.
        with open(file_path, "r") as log:
            epoch_losses.append(
                [
                    # split() (no argument) also tolerates trailing blanks
                    # before the newline, unlike split(" ").
                    float(line.split()[-1])
                    for line in log
                    if line.startswith("epoch ")
                ]
            )

    fig, ax = plt.subplots(figsize=(20, 10))
    for file_path, epoch_loss in zip(file_paths, epoch_losses):
        ax.plot(range(len(epoch_loss)), epoch_loss, label=file_path)
    ax.legend(loc='upper right')

    # dirname() yields "" for a bare file name, so the image lands in the
    # current directory instead of the filesystem root.
    out_path = os.path.join(os.path.dirname(file_paths[0]), "compare_epoch_loss.png")
    fig.savefig(out_path)
    # Release the figure; repeated calls would otherwise accumulate open figures.
    plt.close(fig)

def draw_evaluation_results(file_paths):
    """Plot test and train PSNR against iteration for several training logs.

    Two kinds of log lines are read::

        [ITER 30000] Evaluating test: L1 0.058287687942777805 PSNR 21.94811627739354
        [ITER 30000] Evaluating train: L1 0.03144958354532719 PSNR 26.123293685913087

    For each such line the second whitespace-separated field (minus its
    closing ``]``) is the iteration and the last field is the PSNR. The top
    subplot shows the test curves and the bottom subplot the train curves,
    one curve per log, labelled with the log path.

    Iterations are recorded per file and per curve, so logs that were
    evaluated at different iterations (or contain a different number of
    evaluation lines) can still be drawn together.

    The figure is written to ``compare_evaluation_results.png`` in the
    directory of the first log file.

    Args:
        file_paths: Non-empty sequence of log file paths.

    Raises:
        IndexError: If ``file_paths`` is empty.
        OSError: If a log file cannot be opened.
        ValueError: If an iteration or PSNR field cannot be parsed.
    """
    # Local import: the file's top-level import list is not visible from here.
    import os

    test_curves = []   # per file: (iterations, PSNR values) from "test" lines
    train_curves = []  # per file: (iterations, PSNR values) from "train" lines
    for file_path in file_paths:
        test_iters, test_psnr = [], []
        train_iters, train_psnr = [], []
        # The context manager closes the log even when parsing raises.
        with open(file_path, "r") as log:
            for line in log:
                is_test = "Evaluating test: " in line
                is_train = "Evaluating train: " in line
                if not (is_test or is_train):
                    continue
                fields = line.split()
                iteration = int(fields[1][:-1])  # "30000]" -> 30000
                psnr = float(fields[-1])
                if is_test:
                    test_iters.append(iteration)
                    test_psnr.append(psnr)
                if is_train:
                    train_iters.append(iteration)
                    train_psnr.append(psnr)
        test_curves.append((test_iters, test_psnr))
        train_curves.append((train_iters, train_psnr))

    # Both panels share the same decoration; only the data and title differ.
    fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(20, 10))
    panels = (
        (axes[0], test_curves, "Evaluating test PSNR"),
        (axes[1], train_curves, "Evaluating train PSNR"),
    )
    for axis, curves, title in panels:
        for file_path, (iters, psnr) in zip(file_paths, curves):
            axis.plot(iters, psnr, label=file_path)
        axis.set_ylabel('PSNR')
        # Mirror the PSNR scale on the right-hand side of the panel.
        axis.secondary_yaxis('right').set_ylabel('PSNR')
        axis.legend(loc='lower right')
        axis.set_title(title)

    # dirname() yields "" for a bare file name, so the image lands in the
    # current directory instead of the filesystem root.
    out_path = os.path.join(
        os.path.dirname(file_paths[0]), "compare_evaluation_results.png"
    )
    fig.savefig(out_path)
    # Release the figure; repeated calls would otherwise accumulate open figures.
    plt.close(fig)


if __name__ == "__main__":
# NOTE: folder_path must end with "/" !!!

Expand Down Expand Up @@ -2605,9 +2672,43 @@ def compare_GPU_utilization(save_folder, file_paths):



# draw_epoch_loss(
# [
# "experiments/bsz1/python_ws=4_rk=0.log",
# "experiments/bsz2/python_ws=4_rk=0.log",
# "experiments/bsz4/python_ws=4_rk=0.log",
# ]
# )

# draw_epoch_loss(
# [
# "experiments/bsz1_2/python_ws=4_rk=0.log",
# "experiments/bsz2_2/python_ws=4_rk=0.log",
# "experiments/bsz4_2/python_ws=4_rk=0.log",
# ]
# )

# draw_epoch_loss(
# [
# "experiments/bsz1/python_ws=4_rk=0.log",
# "experiments/bsz2/python_ws=4_rk=0.log",
# "experiments/bsz4/python_ws=4_rk=0.log",
# "experiments/bsz1_2/python_ws=4_rk=0.log",
# "experiments/bsz2_2/python_ws=4_rk=0.log",
# "experiments/bsz4_2/python_ws=4_rk=0.log",
# ]
# )

draw_evaluation_results(
[
"experiments/bsz1_perf/python_ws=4_rk=0.log",
"experiments/bsz2_perf/python_ws=4_rk=0.log",
"experiments/bsz4_perf/python_ws=4_rk=0.log",
"experiments/bsz8_perf/python_ws=4_rk=0.log",
"experiments/bsz16_perf/python_ws=4_rk=0.log",
"experiments/bsz32_perf/python_ws=4_rk=0.log",
]
)

pass

Expand Down
99 changes: 67 additions & 32 deletions train.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ def training(dataset, opt, pipe, args, log_file):
timers.stop("densification")

# Optimizer step
if iteration < opt.iterations:
if iteration < opt.iterations and iteration % args.bsz == 0:
timers.start("optimizer_step")
if not args.stop_update_param:
gaussians.optimizer.step()
Expand All @@ -476,31 +476,35 @@ def training(dataset, opt, pipe, args, log_file):
i2jsend_file.write("iteration {}:{}\n".format(iteration, json.dumps(i2j_send_size)))
i2jsend_file.flush()


# Finish training
if (args.adjust_div_stra and args.adjust_mode in ["1", "2", "4"]) or (not args.adjust_div_stra and utils.WORLD_SIZE == 1):
data_json = {}
for camera_id, strategy_history in cameraId2StrategyHistory.items():
data_json[camera_id] = strategy_history.to_json()

with open(args.log_folder+"/strategy_history_ws="+str(utils.WORLD_SIZE)+"_rk="+str(utils.LOCAL_RANK)+".json", 'w') as f:
json.dump(data_json, f)

if args.end2end_time:
torch.cuda.synchronize()
log_file.write("end2end total_time: {:.6f} ms, iterations: {}, throughput {:.2f} it/s\n".format(time.time() - train_start_time, opt.iterations, opt.iterations/(time.time() - train_start_time)))

log_file.write("Max Memory usage: {} GB.\n".format(torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024))

# DEBUG
if args.memory_distribution:
# save gaussians.send_to_gpui_cnt to file.
with open(args.log_folder+"/send_to_gpui_cnt_ws="+str(utils.WORLD_SIZE)+"_rk="+str(utils.LOCAL_RANK)+".json", 'w') as f:
send_to_gpui_cnt_cpu = gaussians.send_to_gpui_cnt.cpu().numpy().tolist()
data2save = []
for i in range(len(send_to_gpui_cnt_cpu)):
data2save.append( ",".join([str(x) for x in send_to_gpui_cnt_cpu[i]]) )
json.dump(data2save, f, indent=4)
if not args.performance_stats:

# Save some statistics to file for future usage.

if (args.adjust_div_stra and args.adjust_mode in ["1", "2", "4"]) or (not args.adjust_div_stra and utils.WORLD_SIZE == 1):
data_json = {}
for camera_id, strategy_history in cameraId2StrategyHistory.items():
data_json[camera_id] = strategy_history.to_json()

with open(args.log_folder+"/strategy_history_ws="+str(utils.WORLD_SIZE)+"_rk="+str(utils.LOCAL_RANK)+".json", 'w') as f:
json.dump(data_json, f)


# DEBUG
if args.memory_distribution:
# save gaussians.send_to_gpui_cnt to file.
with open(args.log_folder+"/send_to_gpui_cnt_ws="+str(utils.WORLD_SIZE)+"_rk="+str(utils.LOCAL_RANK)+".json", 'w') as f:
send_to_gpui_cnt_cpu = gaussians.send_to_gpui_cnt.cpu().numpy().tolist()
data2save = []
for i in range(len(send_to_gpui_cnt_cpu)):
data2save.append( ",".join([str(x) for x in send_to_gpui_cnt_cpu[i]]) )
json.dump(data2save, f, indent=4)


def prepare_output_and_logger(args):
Expand Down Expand Up @@ -632,6 +636,8 @@ def training_report(tb_writer, iteration, Ll1, loss, l1_loss, elapsed, testing_i
parser.add_argument("--benchmark_stats", action='store_true', default=False)
parser.add_argument("--stop_adjust2_well_balanced", action='store_true', default=False)
parser.add_argument("--log_tiles_stats_img_num", type=int, default=-1)
parser.add_argument("--bsz", type=int, default=1)
parser.add_argument("--performance_stats", action='store_true', default=False)
args = parser.parse_args(sys.argv[1:])
args.save_iterations.append(args.iterations)

Expand All @@ -644,6 +650,48 @@ def training_report(tb_writer, iteration, Ll1, loss, l1_loss, elapsed, testing_i
print("Local rank: " + str(utils.LOCAL_RANK) + " World size: " + str(utils.WORLD_SIZE))

# Check arguments
assert not (args.benchmark_stats and args.performance_stats), "benchmark_stats and performance_stats can not be enabled at the same time."

if args.benchmark_stats:
args.zhx_time = True
args.zhx_python_time = True
args.log_iteration_memory_usage = True
args.check_memory_usage = True
args.end2end_time = True
args.disable_checkpoint_and_save = True
args.checkpoint_iterations = []
args.save_iterations = []
assert args.fixed_training_image == -1, "benchmark mode does not support fixed_training_image."
assert not args.disable_auto_densification, "benchmark mode needs auto densification."
assert not args.save_i2jsend, "benchmark mode does not support save_i2jsend."
assert not args.stop_update_param, "benchmark mode does not support stop_update_param."

if args.performance_stats:
args.eval = True
args.zhx_time = False
args.zhx_python_time = False
args.end2end_time = True
args.log_iteration_memory_usage = False
args.check_memory_usage = False
args.save_iterations = [2000, 7000, 15000, 30000]
args.test_iterations = [500]+ [i for i in range(2000, args.iterations+1, 1000)]
args.checkpoint_iterations = []

# use the fastest mode.
if utils.WORLD_SIZE > 1:
args.adjust_div_stra = True
args.adjust_mode = "1"
else:
args.adjust_div_stra = False
args.lazy_load_image = True
args.memory_distribution = True
args.image_distribution = True

assert args.fixed_training_image == -1, "performance_stats mode does not support fixed_training_image."
assert not args.disable_auto_densification, "performance_stats mode needs auto densification."
assert not args.save_i2jsend, "performance_stats mode does not support save_i2jsend."
assert not args.stop_update_param, "performance_stats mode does not support stop_update_param."

if args.adjust_div_stra and utils.WORLD_SIZE == 1:
print("adjust_div_stra is enabled, but WORLD_SIZE is 1. disable adjust_div_stra.")
args.adjust_div_stra = False
Expand Down Expand Up @@ -674,19 +722,6 @@ def training_report(tb_writer, iteration, Ll1, loss, l1_loss, elapsed, testing_i
if args.log_iteration_memory_usage:
args.check_memory_usage = True

if args.benchmark_stats:
args.zhx_time = True
args.zhx_python_time = True
args.log_iteration_memory_usage = True
args.check_memory_usage = True
args.end2end_time = True
args.disable_checkpoint_and_save = True
args.checkpoint_iterations = []
args.save_iterations = []
assert args.fixed_training_image == -1, "benchmark mode does not support fixed_training_image."
assert not args.disable_auto_densification, "benchmark mode needs auto densification."
assert not args.save_i2jsend, "benchmark mode does not support save_i2jsend."
assert not args.stop_update_param, "benchmark mode does not support stop_update_param."

# create log folder
if utils.LOCAL_RANK == 0:
Expand Down
2 changes: 1 addition & 1 deletion utils/camera_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def cameraList_from_camInfos(cam_infos, resolution_scale, args):
args = get_args()
from tqdm import tqdm
for id, c in tqdm(enumerate(cam_infos), total=len(cam_infos)):
if args.fixed_training_image == -1 or id == args.fixed_training_image:
if not hasattr(args, "fixed_training_image") or args.fixed_training_image == -1 or id == args.fixed_training_image:
camera_list.append(loadCam(args, id, c, resolution_scale))
else:
camera_list.append(None)
Expand Down

0 comments on commit 6ee8e5e

Please sign in to comment.