[Test] Verify metadata and data self heal
Test: test_metadata_heal_from_shd
Steps:
1. Create, mount and run IO on volume
2. Set `self-heal-daemon` to `off`, bring bricks offline one at a
   time and perform metadata operations
3. Set `self-heal-daemon` to `on` and wait for heal completion
4. Validate arequal checksum on backend bricks

Test: test_metadata_heal_from_heal_cmd
Steps:
1. Create, mount and run IO on volume
2. Set `self-heal-daemon` to `off`, bring bricks offline one at a
   time and perform metadata operations
3. Set `self-heal-daemon` to `on`, invoke `gluster vol <vol> heal`
4. Validate arequal checksum on backend bricks

Test: test_data_heal_from_shd
Steps:
1. Create, mount and run IO on volume
2. Set `self-heal-daemon` to `off`, bring bricks offline one at a
   time and perform data operations
3. Set `self-heal-daemon` to `on` and wait for heal completion
4. Validate arequal checksum on backend bricks
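
All three tests follow the same verification pattern. A minimal sketch of
that shared flow, using only helpers the test module below already imports
(`verify_heal`, `data_bricks` and `use_heal_cmd` are illustrative names,
not part of the patch):

    from glustolibs.gluster.heal_libs import (is_volume_in_split_brain,
                                              monitor_heal_completion)
    from glustolibs.gluster.heal_ops import (disable_self_heal_daemon,
                                             enable_self_heal_daemon,
                                             trigger_heal)
    from glustolibs.gluster.lib_utils import collect_bricks_arequal

    def verify_heal(mnode, volname, data_bricks, use_heal_cmd=False):
        # Heal is switched off before bricks are cycled down for IO
        assert disable_self_heal_daemon(mnode, volname)

        # ... bring one brick down at a time, perform metadata/data ops ...

        # Switch heal back on; the heal-cmd variant also triggers index heal
        assert enable_self_heal_daemon(mnode, volname)
        if use_heal_cmd:
            assert trigger_heal(mnode, volname)

        # Heal must finish without the volume landing in split brain
        assert monitor_heal_completion(mnode, volname)
        assert not is_volume_in_split_brain(mnode, volname)

        # arequal checksums of the data bricks must match after heal
        ret, checksums = collect_bricks_arequal(data_bricks)
        assert ret and len(set(checksums)) == 1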

Change-Id: I24411d964fb6252ae5b621c6569e791b54dcc311
Signed-off-by: Leela Venkaiah G <[email protected]>
leelavg committed Feb 12, 2021
1 parent 63b1b19 commit 967ed15
tests/functional/arbiter/test_verify_metadata_and_data_heal.py (297 additions, 0 deletions)
@@ -0,0 +1,297 @@
# Copyright (C) 2021 Red Hat, Inc. <http://www.redhat.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

from glusto.core import Glusto as g

from glustolibs.gluster.brick_libs import (bring_bricks_offline,
bring_bricks_online,
get_online_bricks_list)
from glustolibs.gluster.exceptions import ExecutionError
from glustolibs.gluster.gluster_base_class import GlusterBaseClass, runs_on
from glustolibs.gluster.glusterdir import mkdir
from glustolibs.gluster.heal_libs import (
is_heal_complete, is_volume_in_split_brain, monitor_heal_completion,
wait_for_self_heal_daemons_to_be_online)
from glustolibs.gluster.heal_ops import (disable_self_heal_daemon,
enable_self_heal_daemon, trigger_heal)
from glustolibs.gluster.lib_utils import (add_user, collect_bricks_arequal,
del_user, group_add, group_del)
from glustolibs.gluster.volume_libs import get_subvols
from glustolibs.io.utils import list_all_files_and_dirs_mounts


@runs_on([['arbiter', 'replicated'], ['glusterfs']])
class TestMetadataAndDataHeal(GlusterBaseClass):
'''Description: Verify shd heals files after performing metadata and data
operations while a brick was down'''
def _dac_helper(self, host, option):
'''Helper for creating, deleting users and groups'''

# Permission/Ownership changes required only for `test_metadata..`
# tests, using random group and usernames
if 'metadata' not in self.test_dir:
return

if option == 'create':
# Groups
for group in ('qa_func', 'qa_system'):
if not group_add(host, group):
raise ExecutionError('Unable to {} group {} on '
'{}'.format(option, group, host))

# User
if not add_user(host, 'qa_all', group='qa_func'):
raise ExecutionError('Unable to {} user {} under {} on '
'{}'.format(option, 'qa_all', 'qa_func',
host))
elif option == 'delete':
# Groups
for group in ('qa_func', 'qa_system'):
if not group_del(host, group):
raise ExecutionError('Unable to {} group {} on '
'{}'.format(option, group, host))

# User
if not del_user(host, 'qa_all'):
raise ExecutionError('Unable to {} user on {}'.format(
option, host))

def setUp(self):
self.get_super_method(self, 'setUp')()

# A single mount is enough for all the tests
self.mounts = self.mounts[0:1]
self.client = self.mounts[0].client_system

# Use testcase name as test directory
self.test_dir = self.id().split('.')[-1]
self.fqpath = self.mounts[0].mountpoint + '/' + self.test_dir

if not self.setup_volume_and_mount_volume(mounts=self.mounts):
raise ExecutionError('Failed to setup and mount '
'{}'.format(self.volname))

# Create group and user names required for the test
self._dac_helper(host=self.client, option='create')

def tearDown(self):
# Delete group and user names created as part of setup
self._dac_helper(host=self.client, option='delete')

if not self.unmount_volume_and_cleanup_volume(mounts=self.mounts):
raise ExecutionError('Not able to unmount and cleanup '
'{}'.format(self.volname))

self.get_super_method(self, 'tearDown')()

def _perform_io_and_disable_self_heal(self):
'''Refactor of steps common to all tests: Perform IO, disable heal'''
ret = mkdir(self.client, self.fqpath)
self.assertTrue(ret,
'Directory creation failed on {}'.format(self.client))
self.io_cmd = 'cat /dev/urandom | tr -dc [:space:][:print:] | head -c '
# Create 6 dirs, 6 files and 6 files in each subdir, each with 10K data
file_io = ('''cd {0}; for i in `seq 1 6`;
do mkdir dir.$i; {1} 10K > file.$i;
for j in `seq 1 6`;
do {1} 10K > dir.$i/file.$j; done;
done;'''.format(self.fqpath, self.io_cmd))
ret, _, err = g.run(self.client, file_io)
self.assertEqual(ret, 0, 'Unable to create directories and data files')
self.assertFalse(err, '{0} failed with {1}'.format(file_io, err))

# Disable self heal daemon
self.assertTrue(disable_self_heal_daemon(self.mnode, self.volname),
'Disabling self-heal-daemon failed')

def _perform_brick_ops_and_enable_self_heal(self, op_type):
'''Refactor of steps common to all tests: Brick down and perform
metadata/data operations'''
# First brick in the subvol will always be online and used for self
# heal, so make keys match brick index
self.op_cmd = {
# Metadata Operations (owner and permission changes)
'metadata': {
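# 2, 3 - Applied while the second and third brick, respectively, is offline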
2:
'''cd {0}; for i in `seq 1 3`; do chown -R qa_all:qa_func \
dir.$i file.$i; chmod -R 555 dir.$i file.$i; done;''',
3:
'''cd {0}; for i in `seq 1 3`; do chown -R :qa_system \
dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''',
# 4 - Will be used for final data consistency check
4:
'''cd {0}; for i in `seq 1 6`; do chown -R qa_all:qa_system \
dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''',
},
# Data Operations (append data to the files)
'data': {
2:
'''cd {0}; for i in `seq 1 3`;
do {1} 2K >> file.$i;
for j in `seq 1 3`;
do {1} 2K >> dir.$i/file.$j; done;
done;''',
3:
'''cd {0}; for i in `seq 1 3`;
do {1} 3K >> file.$i;
for j in `seq 1 3`;
do {1} 3K >> dir.$i/file.$j; done;
done;''',
# 4 - Will be used for final data consistency check
4:
'''cd {0}; for i in `seq 1 6`;
do {1} 4K >> file.$i;
for j in `seq 1 6`;
do {1} 4K >> dir.$i/file.$j; done;
done;''',
},
}
bricks = get_online_bricks_list(self.mnode, self.volname)
self.assertIsNotNone(bricks,
'Not able to get list of bricks in the volume')

# Make first brick always online and start operations from second brick
for index, brick in enumerate(bricks[1:], start=2):

# Bring brick offline
ret = bring_bricks_offline(self.volname, brick)
self.assertTrue(ret, 'Unable to bring {} offline'.format(brick))

# Perform metadata/data operation
cmd = self.op_cmd[op_type][index].format(self.fqpath, self.io_cmd)
ret, _, err = g.run(self.client, cmd)
self.assertEqual(ret, 0, '{0} failed with {1}'.format(cmd, err))
self.assertFalse(err, '{0} failed with {1}'.format(cmd, err))

# Bring brick online
ret = bring_bricks_online(
self.mnode,
self.volname,
brick,
bring_bricks_online_methods='volume_start_force')
self.assertTrue(ret, 'Unable to bring {} online'.format(brick))

# Assert metadata/data operations resulted in pending heals
self.assertFalse(is_heal_complete(self.mnode, self.volname))

# Enable self heal daemon and wait for it to be online
self.assertTrue(enable_self_heal_daemon(self.mnode, self.volname),
'Enabling self heal daemon failed')
self.assertTrue(
wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname),
'Not all self heal daemons are online')

def _validate_heal_completion_and_arequal(self, op_type):
'''Refactor of steps common to all tests: Validate heal from heal
commands, verify arequal, perform IO and verify arequal after IO'''

# Validate heal completion
self.assertTrue(monitor_heal_completion(self.mnode, self.volname),
'Self heal is not completed within timeout')
self.assertFalse(
is_volume_in_split_brain(self.mnode, self.volname),
'Volume is in split brain even after heal completion')

subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
self.assertTrue(subvols, 'Not able to get list of subvols')
arbiter = self.volume_type.find('arbiter') >= 0
stop = len(subvols[0]) - 1 if arbiter else len(subvols[0])
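# Arbiter brick stores no file data, so exclude it from arequal checks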

# Validate arequal
self._validate_arequal_and_perform_lookup(subvols, stop)

# Perform some additional metadata/data operations
cmd = self.op_cmd[op_type][4].format(self.fqpath, self.io_cmd)
ret, _, err = g.run(self.client, cmd)
self.assertEqual(ret, 0, '{0} failed with {1}'.format(cmd, err))
self.assertFalse(err, '{0} failed with {1}'.format(cmd, err))

# Validate arequal after additional operations
self._validate_arequal_and_perform_lookup(subvols, stop)

def _validate_arequal_and_perform_lookup(self, subvols, stop):
'''Refactor of steps common to all tests: Validate arequal from bricks
backend and perform a lookup of all files from mount'''
for subvol in subvols:
ret, arequal = collect_bricks_arequal(subvol[0:stop])
self.assertTrue(
ret, 'Unable to get `arequal` checksum on '
'{}'.format(subvol[0:stop]))
self.assertEqual(
len(set(arequal)), 1, 'Mismatch of `arequal` '
'checksum among {} is identified'.format(subvol[0:stop]))

# Perform a lookup of all files and directories on mounts
self.assertTrue(list_all_files_and_dirs_mounts(self.mounts),
'Failed to list all files and dirs from mount')

def test_metadata_heal_from_shd(self):
'''Description: Verify files heal after switching on `self-heal-daemon`
when metadata operations are performed while a brick was down
Steps:
1. Create, mount and run IO on volume
2. Set `self-heal-daemon` to `off`, cyclic brick down and perform
metadata operations
3. Set `self-heal-daemon` to `on` and wait for heal completion
4. Validate arequal checksum on backend bricks
'''
op_type = 'metadata'
self._perform_io_and_disable_self_heal()
self._perform_brick_ops_and_enable_self_heal(op_type=op_type)
self._validate_heal_completion_and_arequal(op_type=op_type)
g.log.info('Pass: Verification of metadata heal after switching on '
'`self heal daemon` is complete')

def test_metadata_heal_from_heal_cmd(self):
'''Description: Verify files heal after triggering heal command when
metadata operations are performed while a brick was down
Steps:
1. Create, mount and run IO on volume
2. Set `self-heal-daemon` to `off`, cyclic brick down and perform
metadata operations
3. Set `self-heal-daemon` to `on`, invoke `gluster vol <vol> heal`
4. Validate arequal checksum on backend bricks
'''
op_type = 'metadata'
self._perform_io_and_disable_self_heal()
self._perform_brick_ops_and_enable_self_heal(op_type=op_type)

# Trigger index heal via the `gluster volume heal` command
self.assertTrue(trigger_heal(self.mnode, self.volname),
'Unable to trigger index heal on the volume')

self._validate_heal_completion_and_arequal(op_type=op_type)
g.log.info('Pass: Verification of metadata heal via the heal '
'command is complete')

def test_data_heal_from_shd(self):
'''Description: Verify files heal after switching on `self-heal-daemon`
when data operations are performed while a brick was down
Steps:
1. Create, mount and run IO on volume
2. Set `self-heal-daemon` to `off`, cyclic brick down and perform data
operations
3. Set `self-heal-daemon` to `on` and wait for heal completion
4. Validate arequal checksum on backend bricks
'''
op_type = 'data'
self._perform_io_and_disable_self_heal()
self._perform_brick_ops_and_enable_self_heal(op_type=op_type)
self._validate_heal_completion_and_arequal(op_type=op_type)
g.log.info('Pass: Verification of data heal after switching on '
'`self heal daemon` is complete')
