Index: nbtpage.c =================================================================== RCS file: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v retrieving revision 1.89 diff -c -r1.89 nbtpage.c *** nbtpage.c 6 Nov 2005 19:29:00 -0000 1.89 --- nbtpage.c 7 Nov 2005 22:53:41 -0000 *************** *** 205,223 **** if (metad->btm_root != P_NONE) { /* ! * Metadata initialized by someone else. In order to guarantee no ! * deadlocks, we have to release the metadata page and start all ! * over again. (Is that really true? But it's hardly worth trying ! * to optimize this case.) */ _bt_relbuf(rel, metabuf); return _bt_getroot(rel, access); } /* ! * Get, initialize, write, and leave a lock of the appropriate type on ! * the new root page. Since this is the first page in the tree, it's ! * a leaf as well as the root. */ rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); rootblkno = BufferGetBlockNumber(rootbuf); --- 205,223 ---- if (metad->btm_root != P_NONE) { /* ! * Metadata initialized by someone else. In order to guarantee ! * no deadlocks, we have to release the metadata page and start ! * all over again. (Is that really true? But it's hardly worth ! * trying to optimize this case.) */ _bt_relbuf(rel, metabuf); return _bt_getroot(rel, access); } /* ! * Get, initialize, write, and leave a lock of the appropriate type ! * on the new root page. Since this is the first page in the tree, ! * it's a leaf as well as the root. */ rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); rootblkno = BufferGetBlockNumber(rootbuf); *************** *** 412,427 **** Page page = BufferGetPage(buf); /* ! * ReadBuffer verifies that every newly-read page passes PageHeaderIsValid, ! * which means it either contains a reasonably sane page header or is ! * all-zero. We have to defend against the all-zero case, however. */ if (PageIsNew(page)) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), ! errmsg("index \"%s\" contains unexpected zero page at block %u", ! RelationGetRelationName(rel), ! BufferGetBlockNumber(buf)), errhint("Please REINDEX it."))); /* --- 412,428 ---- Page page = BufferGetPage(buf); /* ! * ReadBuffer verifies that every newly-read page passes ! * PageHeaderIsValid, which means it either contains a reasonably sane ! * page header or is all-zero. We have to defend against the all-zero ! * case, however. */ if (PageIsNew(page)) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), ! errmsg("index \"%s\" contains unexpected zero page at block %u", ! RelationGetRelationName(rel), ! BufferGetBlockNumber(buf)), errhint("Please REINDEX it."))); /* *************** *** 440,446 **** /* * _bt_getbuf() -- Get a buffer by block number for read or write. * ! * blkno == P_NEW means to get an unallocated index page. The page * will be initialized before returning it. * * When this routine returns, the appropriate lock is set on the --- 441,447 ---- /* * _bt_getbuf() -- Get a buffer by block number for read or write. * ! * blkno == P_NEW means to get an unallocated index page. The page * will be initialized before returning it. * * When this routine returns, the appropriate lock is set on the *************** *** 480,495 **** * it, or even worse our own caller does, we could deadlock. (The * own-caller scenario is actually not improbable. Consider an index * on a serial or timestamp column. Nearly all splits will be at the ! * rightmost page, so it's entirely likely that _bt_split will call us ! * while holding a lock on the page most recently acquired from FSM. ! * A VACUUM running concurrently with the previous split could well ! * have placed that page back in FSM.) * ! * To get around that, we ask for only a conditional lock on the reported ! * page. If we fail, then someone else is using the page, and we may ! * reasonably assume it's not free. (If we happen to be wrong, the ! * worst consequence is the page will be lost to use till the next ! * VACUUM, which is no big problem.) */ for (;;) { --- 481,496 ---- * it, or even worse our own caller does, we could deadlock. (The * own-caller scenario is actually not improbable. Consider an index * on a serial or timestamp column. Nearly all splits will be at the ! * rightmost page, so it's entirely likely that _bt_split will call ! * us while holding a lock on the page most recently acquired from ! * FSM. A VACUUM running concurrently with the previous split could ! * well have placed that page back in FSM.) * ! * To get around that, we ask for only a conditional lock on the ! * reported page. If we fail, then someone else is using the page, ! * and we may reasonably assume it's not free. (If we happen to be ! * wrong, the worst consequence is the page will be lost to use till ! * the next VACUUM, which is no big problem.) */ for (;;) { *************** *** 649,658 **** BTPageOpaque opaque; /* ! * It's possible to find an all-zeroes page in an index --- for example, a ! * backend might successfully extend the relation one page and then crash ! * before it is able to make a WAL entry for adding the page. If we find a ! * zeroed page then reclaim it. */ if (PageIsNew(page)) return true; --- 650,659 ---- BTPageOpaque opaque; /* ! * It's possible to find an all-zeroes page in an index --- for example, ! * a backend might successfully extend the relation one page and then ! * crash before it is able to make a WAL entry for adding the page. If we ! * find a zeroed page then reclaim it. */ if (PageIsNew(page)) return true; *************** *** 782,789 **** BTPageOpaque opaque; /* ! * We can never delete rightmost pages nor root pages. While at it, check ! * that page is not already deleted and is empty. */ page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); --- 783,790 ---- BTPageOpaque opaque; /* ! * We can never delete rightmost pages nor root pages. While at it, ! * check that page is not already deleted and is empty. */ page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); *************** *** 808,815 **** * We need to get an approximate pointer to the page's parent page. Use * the standard search mechanism to search for the page's high key; this * will give us a link to either the current parent or someplace to its ! * left (if there are multiple equal high keys). To avoid deadlocks, we'd ! * better drop the target page lock first. */ _bt_relbuf(rel, buf); /* we need a scan key to do our search, so build one */ --- 809,816 ---- * We need to get an approximate pointer to the page's parent page. Use * the standard search mechanism to search for the page's high key; this * will give us a link to either the current parent or someplace to its ! * left (if there are multiple equal high keys). To avoid deadlocks, ! * we'd better drop the target page lock first. */ _bt_relbuf(rel, buf); /* we need a scan key to do our search, so build one */ *************** *** 843,851 **** * page. The sibling that was current a moment ago could have split, so * we may have to move right. This search could fail if either the * sibling or the target page was deleted by someone else meanwhile; if ! * so, give up. (Right now, that should never happen, since page deletion ! * is only done in VACUUM and there shouldn't be multiple VACUUMs ! * concurrently on the same table.) */ if (leftsib != P_NONE) { --- 844,852 ---- * page. The sibling that was current a moment ago could have split, so * we may have to move right. This search could fail if either the * sibling or the target page was deleted by someone else meanwhile; if ! * so, give up. (Right now, that should never happen, since page ! * deletion is only done in VACUUM and there shouldn't be multiple ! * VACUUMs concurrently on the same table.) */ if (leftsib != P_NONE) { *************** *** 872,880 **** lbuf = InvalidBuffer; /* ! * Next write-lock the target page itself. It should be okay to take just ! * a write lock not a superexclusive lock, since no scans would stop on an ! * empty page. */ buf = _bt_getbuf(rel, target, BT_WRITE); page = BufferGetPage(buf); --- 873,881 ---- lbuf = InvalidBuffer; /* ! * Next write-lock the target page itself. It should be okay to take ! * just a write lock not a superexclusive lock, since no scans would stop ! * on an empty page. */ buf = _bt_getbuf(rel, target, BT_WRITE); page = BufferGetPage(buf); *************** *** 904,911 **** rbuf = _bt_getbuf(rel, rightsib, BT_WRITE); /* ! * Next find and write-lock the current parent of the target page. This is ! * essentially the same as the corresponding step of splitting. */ ItemPointerSet(&(stack->bts_btitem.bti_itup.t_tid), target, P_HIKEY); --- 905,912 ---- rbuf = _bt_getbuf(rel, rightsib, BT_WRITE); /* ! * Next find and write-lock the current parent of the target page. This ! * is essentially the same as the corresponding step of splitting. */ ItemPointerSet(&(stack->bts_btitem.bti_itup.t_tid), target, P_HIKEY); *************** *** 949,957 **** /* * If we are deleting the next-to-last page on the target's level, then ! * the rightsib is a candidate to become the new fast root. (In theory, it ! * might be possible to push the fast root even further down, but the odds ! * of doing so are slim, and the locking considerations daunting.) * * We can safely acquire a lock on the metapage here --- see comments for * _bt_newroot(). --- 950,958 ---- /* * If we are deleting the next-to-last page on the target's level, then ! * the rightsib is a candidate to become the new fast root. (In theory, ! * it might be possible to push the fast root even further down, but the ! * odds of doing so are slim, and the locking considerations daunting.) * * We can safely acquire a lock on the metapage here --- see comments for * _bt_newroot(). *************** *** 992,999 **** /* * Update parent. The normal case is a tad tricky because we want to * delete the target's downlink and the *following* key. Easiest way is ! * to copy the right sibling's downlink over the target downlink, and then ! * delete the following item. */ page = BufferGetPage(pbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); --- 993,1000 ---- /* * Update parent. The normal case is a tad tricky because we want to * delete the target's downlink and the *following* key. Easiest way is ! * to copy the right sibling's downlink over the target downlink, and ! * then delete the following item. */ page = BufferGetPage(pbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); *************** *** 1154,1162 **** /* * If parent became half dead, recurse to try to delete it. Otherwise, if ! * right sibling is empty and is now the last child of the parent, recurse ! * to try to delete it. (These cases cannot apply at the same time, ! * though the second case might itself recurse to the first.) */ if (parent_half_dead) { --- 1155,1163 ---- /* * If parent became half dead, recurse to try to delete it. Otherwise, if ! * right sibling is empty and is now the last child of the parent, ! * recurse to try to delete it. (These cases cannot apply at the same ! * time, though the second case might itself recurse to the first.) */ if (parent_half_dead) {