Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Volod/eng 2733 avoid dlp validator crashes eg if rpc goes down 2 #57

Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Previous commit
feat: gracefully shutdown on KeyboardInterrupt
  • Loading branch information
volod-vana committed Jul 12, 2024
commit 9913775ce7e35e9a03c0b29dd31e9fe7c9037ac0
63 changes: 35 additions & 28 deletions chatgpt/nodes/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,44 +328,40 @@ def run(self):

# This loop maintains the validator's operations until intentionally stopped.
try:
while not self.should_exit:
try:
vana.logging.info(f"step({self.step}) block({self.block})")
while True:
vana.logging.info(f"step({self.step}) block({self.block})")

# Run multiple forwards concurrently.
self.loop.run_until_complete(self.concurrent_forward())
# Run multiple forwards concurrently.
self.loop.run_until_complete(self.concurrent_forward())

# Process peer scoring queue every tempo period
current_block = self.chain_manager.get_current_block()
if current_block % self.config.dlp.tempo == 0:
self.loop.run_until_complete(self.process_peer_scoring_queue())
# Process peer scoring queue every tempo period
current_block = self.chain_manager.get_current_block()
if current_block % self.config.dlp.tempo == 0:
self.loop.run_until_complete(self.process_peer_scoring_queue())

# Sync state and potentially set weights.
self.sync()
# Check if we should exit.
if self.should_exit:
break

self.step += 1
# Sync state and potentially set weights.
self.sync()

except Exception as e:
vana.logging.error(f"Error during validation step: {str(e)}")
vana.logging.debug(
print_exception(type(e), e, e.__traceback__)
)
# Wait for a short period before retrying
time.sleep(30)
self.step += 1

# If someone intentionally stops the validator, it'll safely terminate operations.
except KeyboardInterrupt:
vana.logging.info("Keyboard interrupt received. Shutting down gracefully...")
if hasattr(self, 'node_server') and self.node_server:
self.node_server.stop()
self.node_server.unserve(dlp_uid=self.config.dlpuid, chain_manager=self.chain_manager)
vana.logging.success("Validator killed by keyboard interrupt.")
exit()

# In case of unforeseen errors, the validator will log the error and continue operations.
except Exception as err:
vana.logging.error(f"Unexpected error in main loop: {str(err)}")
vana.logging.error("Error during validation", str(err))
vana.logging.debug(
print_exception(type(err), err, err.__traceback__)
)
finally:
# Cleanup code
if hasattr(self, 'node_server') and self.node_server:
self.node_server.stop()
self.node_server.unserve(dlp_uid=self.config.dlpuid, chain_manager=self.chain_manager)
vana.logging.success("Validator has been stopped.")

def run_in_background_thread(self):
"""
Expand Down Expand Up @@ -421,4 +417,15 @@ def resync_state(self):

if __name__ == "__main__":
vana.trace()
Validator().run()
try:
while True:
try:
validator = Validator()
asyncio.run(validator.run())
except Exception as e:
vana.logging.error(f"An error occurred: {str(e)}")
vana.logging.error("Restarting the validator in 30 seconds...")
time.sleep(30)
finally:
vana.logging.info("Validator stopped.")
sys.exit(0)
Loading