[Test] Verify metadata and data self heal
Test: test_metadata_heal_from_shd
Steps:
1. Create, mount and run IO on volume
2. Set `self-heal-daemon` to `off`, bring bricks offline one at a
   time and perform metadata operations
3. Set `self-heal-daemon` to `on` and wait for heal completion
4. Validate arequal checksum on backend bricks

Test: test_metadata_heal_from_heal_cmd
Steps:
1. Create, mount and run IO on volume
2. Set `self-heal-daemon` to `off`, bring bricks offline one at a
   time and perform metadata operations
3. Set `self-heal-daemon` to `on`, invoke `gluster vol <vol> heal`
4. Validate arequal checksum on backend bricks

Test: test_data_heal_from_shd
Steps:
1. Create, mount and run IO on volume
2. Set `self-heal-daemon` to `off`, bring bricks offline one at a
   time and perform data operations
3. Set `self-heal-daemon` to `on` and wait for heal completion
4. Validate arequal checksum on backend bricks
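
All three tests follow the same verification pattern. A minimal sketch of
that shared flow, using only helpers the test module below already imports
(`verify_heal`, `data_bricks` and `use_heal_cmd` are illustrative names,
not part of the patch):

    from glustolibs.gluster.heal_libs import (is_volume_in_split_brain,
                                              monitor_heal_completion)
    from glustolibs.gluster.heal_ops import (disable_self_heal_daemon,
                                             enable_self_heal_daemon,
                                             trigger_heal)
    from glustolibs.gluster.lib_utils import collect_bricks_arequal

    def verify_heal(mnode, volname, data_bricks, use_heal_cmd=False):
        # Heal is switched off before bricks are cycled down for IO
        assert disable_self_heal_daemon(mnode, volname)

        # ... bring one brick down at a time, perform metadata/data ops ...

        # Switch heal back on; the heal-cmd variant also triggers index heal
        assert enable_self_heal_daemon(mnode, volname)
        if use_heal_cmd:
            assert trigger_heal(mnode, volname)

        # Heal must finish without the volume landing in split brain
        assert monitor_heal_completion(mnode, volname)
        assert not is_volume_in_split_brain(mnode, volname)

        # arequal checksums of the data bricks must match after heal
        ret, checksums = collect_bricks_arequal(data_bricks)
        assert ret and len(set(checksums)) == 1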

Change-Id: I24411d964fb6252ae5b621c6569e791b54dcc311
Signed-off-by: Leela Venkaiah G <[email protected]>
leelavg committed Feb 12, 2021
1 parent 63b1b19 commit 967ed15
tests/functional/arbiter/test_verify_metadata_and_data_heal.py (297 additions, 0 deletions)
@@ -0,0 +1,297 @@
# Copyright (C) 2021 Red Hat, Inc. <http://www.redhat.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

from glusto.core import Glusto as g

from glustolibs.gluster.brick_libs import (bring_bricks_offline,
bring_bricks_online,
get_online_bricks_list)
from glustolibs.gluster.exceptions import ExecutionError
from glustolibs.gluster.gluster_base_class import GlusterBaseClass, runs_on
from glustolibs.gluster.glusterdir import mkdir
from glustolibs.gluster.heal_libs import (
is_heal_complete, is_volume_in_split_brain, monitor_heal_completion,
wait_for_self_heal_daemons_to_be_online)
from glustolibs.gluster.heal_ops import (disable_self_heal_daemon,
enable_self_heal_daemon, trigger_heal)
from glustolibs.gluster.lib_utils import (add_user, collect_bricks_arequal,
del_user, group_add, group_del)
from glustolibs.gluster.volume_libs import get_subvols
from glustolibs.io.utils import list_all_files_and_dirs_mounts


@runs_on([['arbiter', 'replicated'], ['glusterfs']])
class TestMetadataAndDataHeal(GlusterBaseClass):
'''Description: Verify shd heals files after performing metadata and data
operations while a brick was down'''
def _dac_helper(self, host, option):
'''Helper for creating, deleting users and groups'''

# Permission/Ownership changes required only for `test_metadata..`
# tests, using random group and usernames
if 'metadata' not in self.test_dir:
return

if option == 'create':
# Groups
for group in ('qa_func', 'qa_system'):
if not group_add(host, group):
raise ExecutionError('Unable to {} group {} on '
'{}'.format(option, group, host))

# User
if not add_user(host, 'qa_all', group='qa_func'):
raise ExecutionError('Unable to {} user {} under {} on '
'{}'.format(option, 'qa_all', 'qa_func',
host))
elif option == 'delete':
# Groups
for group in ('qa_func', 'qa_system'):
if not group_del(host, group):
raise ExecutionError('Unable to {} group {} on '
'{}'.format(option, group, host))

# User
if not del_user(host, 'qa_all'):
raise ExecutionError('Unable to {} user on {}'.format(
option, host))

def setUp(self):
self.get_super_method(self, 'setUp')()

# A single mount is enough for all the tests
self.mounts = self.mounts[0:1]
self.client = self.mounts[0].client_system

# Use testcase name as test directory
self.test_dir = self.id().split('.')[-1]
self.fqpath = self.mounts[0].mountpoint + '/' + self.test_dir

if not self.setup_volume_and_mount_volume(mounts=self.mounts):
raise ExecutionError('Failed to setup and mount '
'{}'.format(self.volname))

# Create group and user names required for the test
self._dac_helper(host=self.client, option='create')

def tearDown(self):
# Delete group and user names created as part of setup
self._dac_helper(host=self.client, option='delete')

if not self.unmount_volume_and_cleanup_volume(mounts=self.mounts):
raise ExecutionError('Not able to unmount and cleanup '
'{}'.format(self.volname))

self.get_super_method(self, 'tearDown')()

def _perform_io_and_disable_self_heal(self):
'''Refactor of steps common to all tests: Perform IO, disable heal'''
ret = mkdir(self.client, self.fqpath)
self.assertTrue(ret,
'Directory creation failed on {}'.format(self.client))
self.io_cmd = 'cat /dev/urandom | tr -dc [:space:][:print:] | head -c '
# Create 6 dirs, 6 files and 6 files in each subdir, each with 10K data
file_io = ('''cd {0}; for i in `seq 1 6`;
do mkdir dir.$i; {1} 10K > file.$i;
for j in `seq 1 6`;
do {1} 10K > dir.$i/file.$j; done;
done;'''.format(self.fqpath, self.io_cmd))
ret, _, err = g.run(self.client, file_io)
self.assertEqual(ret, 0, 'Unable to create directories and data files')
self.assertFalse(err, '{0} failed with {1}'.format(file_io, err))

# Disable self heal daemon
self.assertTrue(disable_self_heal_daemon(self.mnode, self.volname),
'Disabling self-heal-daemon failed')

def _perform_brick_ops_and_enable_self_heal(self, op_type):
'''Refactor of steps common to all tests: Brick down and perform
metadata/data operations'''
# First brick in the subvol will always be online and used for self
# heal, so make keys match brick index
self.op_cmd = {
# Metadata Operations (owner and permission changes)
'metadata': {
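# 2, 3 - Applied while the second and third brick, respectively, is offline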
2:
'''cd {0}; for i in `seq 1 3`; do chown -R qa_all:qa_func \
dir.$i file.$i; chmod -R 555 dir.$i file.$i; done;''',
3:
'''cd {0}; for i in `seq 1 3`; do chown -R :qa_system \
dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''',
# 4 - Will be used for final data consistency check
4:
'''cd {0}; for i in `seq 1 6`; do chown -R qa_all:qa_system \
dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''',
},
# Data Operations (append data to the files)
'data': {
2:
'''cd {0}; for i in `seq 1 3`;
do {1} 2K >> file.$i;
for j in `seq 1 3`;
do {1} 2K >> dir.$i/file.$j; done;
done;''',
3:
'''cd {0}; for i in `seq 1 3`;
do {1} 3K >> file.$i;
for j in `seq 1 3`;
do {1} 3K >> dir.$i/file.$j; done;
done;''',
# 4 - Will be used for final data consistency check
4:
'''cd {0}; for i in `seq 1 6`;
do {1} 4K >> file.$i;
for j in `seq 1 6`;
do {1} 4K >> dir.$i/file.$j; done;
done;''',
},
}
bricks = get_online_bricks_list(self.mnode, self.volname)
self.assertIsNotNone(bricks,
'Not able to get list of bricks in the volume')

# Make first brick always online and start operations from second brick
for index, brick in enumerate(bricks[1:], start=2):

# Bring brick offline
ret = bring_bricks_offline(self.volname, brick)
self.assertTrue(ret, 'Unable to bring {} offline'.format(brick))

# Perform metadata/data operation
cmd = self.op_cmd[op_type][index].format(self.fqpath, self.io_cmd)
ret, _, err = g.run(self.client, cmd)
self.assertEqual(ret, 0, '{0} failed with {1}'.format(cmd, err))
self.assertFalse(err, '{0} failed with {1}'.format(cmd, err))

# Bring brick online
ret = bring_bricks_online(
self.mnode,
self.volname,
brick,
bring_bricks_online_methods='volume_start_force')
self.assertTrue(ret, 'Unable to bring {} online'.format(brick))

# Assert metadata/data operations resulted in pending heals
self.assertFalse(is_heal_complete(self.mnode, self.volname))

# Enable self heal daemon and wait for it to be online
self.assertTrue(enable_self_heal_daemon(self.mnode, self.volname),
'Enabling self heal daemon failed')
self.assertTrue(
wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname),
'Not all self heal daemons are online')

def _validate_heal_completion_and_arequal(self, op_type):
'''Refactor of steps common to all tests: Validate heal from heal
commands, verify arequal, perform IO and verify arequal after IO'''

# Validate heal completion
self.assertTrue(monitor_heal_completion(self.mnode, self.volname),
'Self heal is not completed within timeout')
self.assertFalse(
is_volume_in_split_brain(self.mnode, self.volname),
'Volume is in split brain even after heal completion')

subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
self.assertTrue(subvols, 'Not able to get list of subvols')
arbiter = self.volume_type.find('arbiter') >= 0
stop = len(subvols[0]) - 1 if arbiter else len(subvols[0])
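# Arbiter brick stores no file data, so exclude it from arequal checks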

# Validate arequal
self._validate_arequal_and_perform_lookup(subvols, stop)

# Perform some additional metadata/data operations
cmd = self.op_cmd[op_type][4].format(self.fqpath, self.io_cmd)
ret, _, err = g.run(self.client, cmd)
self.assertEqual(ret, 0, '{0} failed with {1}'.format(cmd, err))
self.assertFalse(err, '{0} failed with {1}'.format(cmd, err))

# Validate arequal after additional operations
self._validate_arequal_and_perform_lookup(subvols, stop)

def _validate_arequal_and_perform_lookup(self, subvols, stop):
'''Refactor of steps common to all tests: Validate arequal from bricks
backend and perform a lookup of all files from mount'''
for subvol in subvols:
ret, arequal = collect_bricks_arequal(subvol[0:stop])
self.assertTrue(
ret, 'Unable to get `arequal` checksum on '
'{}'.format(subvol[0:stop]))
self.assertEqual(
len(set(arequal)), 1, 'Mismatch of `arequal` '
'checksum among {} is identified'.format(subvol[0:stop]))

# Perform a lookup of all files and directories on mounts
self.assertTrue(list_all_files_and_dirs_mounts(self.mounts),
'Failed to list all files and dirs from mount')

def test_metadata_heal_from_shd(self):
'''Description: Verify files heal after switching on `self-heal-daemon`
when metadata operations are performed while a brick was down
Steps:
1. Create, mount and run IO on volume
2. Set `self-heal-daemon` to `off`, cyclic brick down and perform
metadata operations
3. Set `self-heal-daemon` to `on` and wait for heal completion
4. Validate arequal checksum on backend bricks
'''
op_type = 'metadata'
self._perform_io_and_disable_self_heal()
self._perform_brick_ops_and_enable_self_heal(op_type=op_type)
self._validate_heal_completion_and_arequal(op_type=op_type)
g.log.info('Pass: Verification of metadata heal after switching on '
'`self heal daemon` is complete')

def test_metadata_heal_from_heal_cmd(self):
'''Description: Verify files heal after triggering heal command when
metadata operations are performed while a brick was down
Steps:
1. Create, mount and run IO on volume
2. Set `self-heal-daemon` to `off`, cyclic brick down and perform
metadata operations
3. Set `self-heal-daemon` to `on`, invoke `gluster vol <vol> heal`
4. Validate arequal checksum on backend bricks
'''
op_type = 'metadata'
self._perform_io_and_disable_self_heal()
self._perform_brick_ops_and_enable_self_heal(op_type=op_type)

# Trigger index heal via the `gluster volume heal` command
self.assertTrue(trigger_heal(self.mnode, self.volname),
'Unable to trigger index heal on the volume')

self._validate_heal_completion_and_arequal(op_type=op_type)
g.log.info('Pass: Verification of metadata heal via the heal '
'command is complete')

def test_data_heal_from_shd(self):
'''Description: Verify files heal after switching on `self-heal-daemon`
when data operations are performed while a brick was down
Steps:
1. Create, mount and run IO on volume
2. Set `self-heal-daemon` to `off`, cyclic brick down and perform data
operations
3. Set `self-heal-daemon` to `on` and wait for heal completion
4. Validate arequal checksum on backend bricks
'''
op_type = 'data'
self._perform_io_and_disable_self_heal()
self._perform_brick_ops_and_enable_self_heal(op_type=op_type)
self._validate_heal_completion_and_arequal(op_type=op_type)
g.log.info('Pass: Verification of data heal after switching on '
'`self heal daemon` is complete')
