Skip to content

Commit

Permalink
Fix wrong validation of top-parent pointer during page deletion in Bt…
Browse files Browse the repository at this point in the history
…ree.

After introducing usage of t_tid of inner or page high key for storing
number of attributes of tuple, validation of tuple's ItemPointer with
ItemPointerIsValid becomes incorrect, it's need to validate only blocknumber of
ItemPointer. Missing this causes a incorrect page deletion, fix that. Test is
added.

BTW, current contrib/amcheck doesn't fail on index corrupted by this way.

Also introduce BTreeTupleGetTopParent/BTreeTupleSetTopParent macroses to improve
code readability and to avoid possible confusion with page high key: high key
is used to store top-parent link for branch to remove.

Bug found by Michael Paquier, but bug doesn't exist in previous versions because
t_tid was set to P_HIKEY.

Author: Teodor Sigaev
Reviewer: Peter Geoghegan
Discussion: https://www.postgresql.org/message-id/flat/20180419052436.GA16000%40paquier.xyz
  • Loading branch information
feodor committed Apr 23, 2018
1 parent 6a7b2ce commit 6db4b49
Show file tree
Hide file tree
Showing 6 changed files with 46 additions and 23 deletions.
21 changes: 8 additions & 13 deletions src/backend/access/nbtree/nbtpage.c
Original file line number Diff line number Diff line change
Expand Up @@ -1602,10 +1602,9 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
MemSet(&trunctuple, 0, sizeof(IndexTupleData));
trunctuple.t_info = sizeof(IndexTupleData);
if (target != leafblkno)
ItemPointerSetBlockNumber(&trunctuple.t_tid, target);
BTreeTupleSetTopParent(&trunctuple, target);
else
ItemPointerSetInvalid(&trunctuple.t_tid);
BTreeTupleSetNAtts(&trunctuple, 0);
BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber);

if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
false, false) == InvalidOffsetNumber)
Expand Down Expand Up @@ -1690,7 +1689,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
BTPageOpaque opaque;
bool rightsib_is_rightmost;
int targetlevel;
ItemPointer leafhikey;
IndexTuple leafhikey;
BlockNumber nextchild;

page = BufferGetPage(leafbuf);
Expand All @@ -1702,7 +1701,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
* Remember some information about the leaf page.
*/
itemid = PageGetItemId(page, P_HIKEY);
leafhikey = &((IndexTuple) PageGetItem(page, itemid))->t_tid;
leafhikey = (IndexTuple) PageGetItem(page, itemid);
leafleftsib = opaque->btpo_prev;
leafrightsib = opaque->btpo_next;

Expand All @@ -1714,9 +1713,10 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
* parent in the branch. Set 'target' and 'buf' to reference the page
* actually being unlinked.
*/
if (ItemPointerIsValid(leafhikey))
target = BTreeTupleGetTopParent(leafhikey);

if (target != InvalidBlockNumber)
{
target = ItemPointerGetBlockNumberNoCheck(leafhikey);
Assert(target != leafblkno);

/* fetch the block number of the topmost parent's left sibling */
Expand Down Expand Up @@ -1919,12 +1919,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
* branch.
*/
if (target != leafblkno)
{
if (nextchild == InvalidBlockNumber)
ItemPointerSetInvalid(leafhikey);
else
ItemPointerSetBlockNumber(leafhikey, nextchild);
}
BTreeTupleSetTopParent(leafhikey, nextchild);

/*
* Mark the page itself deleted. It can be recycled when all current
Expand Down
12 changes: 2 additions & 10 deletions src/backend/access/nbtree/nbtxlog.c
Original file line number Diff line number Diff line change
Expand Up @@ -800,11 +800,7 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record)
*/
MemSet(&trunctuple, 0, sizeof(IndexTupleData));
trunctuple.t_info = sizeof(IndexTupleData);
if (xlrec->topparent != InvalidBlockNumber)
ItemPointerSetBlockNumber(&trunctuple.t_tid, xlrec->topparent);
else
ItemPointerSetInvalid(&trunctuple.t_tid);
BTreeTupleSetNAtts(&trunctuple, 0);
BTreeTupleSetTopParent(&trunctuple, xlrec->topparent);

if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
false, false) == InvalidOffsetNumber)
Expand Down Expand Up @@ -912,11 +908,7 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
/* Add a dummy hikey item */
MemSet(&trunctuple, 0, sizeof(IndexTupleData));
trunctuple.t_info = sizeof(IndexTupleData);
if (xlrec->topparent != InvalidBlockNumber)
ItemPointerSetBlockNumber(&trunctuple.t_tid, xlrec->topparent);
else
ItemPointerSetInvalid(&trunctuple.t_tid);
BTreeTupleSetNAtts(&trunctuple, 0);
BTreeTupleSetTopParent(&trunctuple, xlrec->topparent);

if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
false, false) == InvalidOffsetNumber)
Expand Down
14 changes: 14 additions & 0 deletions src/include/access/nbtree.h
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,20 @@ typedef struct BTMetaPageData
#define BTreeInnerTupleSetDownLink(itup, blkno) \
ItemPointerSetBlockNumber(&((itup)->t_tid), (blkno))

/*
* Get/set leaf page highkey's link. During the second phase of deletion, the
* target leaf page's high key may point to an ancestor page (at all other
* times, the leaf level high key's link is not used). See the nbtree README
* for full details.
*/
#define BTreeTupleGetTopParent(itup) \
ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid))
#define BTreeTupleSetTopParent(itup, blkno) \
do { \
ItemPointerSetBlockNumber(&((itup)->t_tid), (blkno)); \
BTreeTupleSetNAtts((itup), 0); \
} while(0)

/*
* Get/set number of attributes within B-tree index tuple. Asserts should be
* removed when BT_RESERVED_OFFSET_MASK bits will be used.
Expand Down
10 changes: 10 additions & 0 deletions src/test/regress/expected/create_index.out
Original file line number Diff line number Diff line change
Expand Up @@ -3052,6 +3052,16 @@ explain (costs off)
Filter: (NOT b)
(4 rows)

--
-- Test for multilevel page deletion
--
CREATE TABLE delete_test_table (a bigint, b bigint, c bigint, d bigint);
INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,80000) i;
ALTER TABLE delete_test_table ADD PRIMARY KEY (a,b,c,d);
DELETE FROM delete_test_table WHERE a > 40000;
VACUUM delete_test_table;
DELETE FROM delete_test_table WHERE a > 10;
VACUUM delete_test_table;
--
-- REINDEX (VERBOSE)
--
Expand Down
1 change: 1 addition & 0 deletions src/test/regress/expected/sanity_check.out
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ d_star|f
date_tbl|f
default_tbl|f
defaultexpr_tbl|f
delete_test_table|t
dept|f
dupindexcols|t
e_star|f
Expand Down
11 changes: 11 additions & 0 deletions src/test/regress/sql/create_index.sql
Original file line number Diff line number Diff line change
Expand Up @@ -1061,6 +1061,17 @@ explain (costs off)
explain (costs off)
select * from boolindex where not b order by i limit 10;

--
-- Test for multilevel page deletion
--
CREATE TABLE delete_test_table (a bigint, b bigint, c bigint, d bigint);
INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,80000) i;
ALTER TABLE delete_test_table ADD PRIMARY KEY (a,b,c,d);
DELETE FROM delete_test_table WHERE a > 40000;
VACUUM delete_test_table;
DELETE FROM delete_test_table WHERE a > 10;
VACUUM delete_test_table;

--
-- REINDEX (VERBOSE)
--
Expand Down

0 comments on commit 6db4b49

Please sign in to comment.