You've already forked pg_probackup
mirror of
https://github.com/postgrespro/pg_probackup.git
synced 2025-07-07 06:05:35 +02:00
[PBCKP-98] fix invalid stop lsn. Reported by Alexander Lakhin and Alex Ignatov
This commit is contained in:
138
src/backup.c
138
src/backup.c
@ -401,10 +401,10 @@ do_backup_instance(PGconn *backup_conn, PGNodeInfo *nodeInfo, bool no_sync)
|
|||||||
|
|
||||||
if (current.backup_mode != BACKUP_MODE_FULL)
|
if (current.backup_mode != BACKUP_MODE_FULL)
|
||||||
{
|
{
|
||||||
elog(LOG, "current_tli:%X", current.tli);
|
elog(LOG, "Current tli: %X", current.tli);
|
||||||
elog(LOG, "prev_backup->start_lsn: %X/%X",
|
elog(LOG, "Parent start_lsn: %X/%X",
|
||||||
(uint32) (prev_backup->start_lsn >> 32), (uint32) (prev_backup->start_lsn));
|
(uint32) (prev_backup->start_lsn >> 32), (uint32) (prev_backup->start_lsn));
|
||||||
elog(LOG, "current.start_lsn: %X/%X",
|
elog(LOG, "start_lsn: %X/%X",
|
||||||
(uint32) (current.start_lsn >> 32), (uint32) (current.start_lsn));
|
(uint32) (current.start_lsn >> 32), (uint32) (current.start_lsn));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -583,9 +583,6 @@ do_backup_instance(PGconn *backup_conn, PGNodeInfo *nodeInfo, bool no_sync)
|
|||||||
/* Notify end of backup */
|
/* Notify end of backup */
|
||||||
pg_stop_backup(¤t, pg_startbackup_conn, nodeInfo);
|
pg_stop_backup(¤t, pg_startbackup_conn, nodeInfo);
|
||||||
|
|
||||||
elog(LOG, "current.stop_lsn: %X/%X",
|
|
||||||
(uint32) (stop_backup_lsn >> 32), (uint32) (stop_backup_lsn));
|
|
||||||
|
|
||||||
/* In case of backup from replica >= 9.6 we must fix minRecPoint,
|
/* In case of backup from replica >= 9.6 we must fix minRecPoint,
|
||||||
* First we must find pg_control in backup_files_list.
|
* First we must find pg_control in backup_files_list.
|
||||||
*/
|
*/
|
||||||
@ -1742,65 +1739,66 @@ pg_stop_backup(pgBackup *backup, PGconn *pg_startbackup_conn,
|
|||||||
/* Calculate LSN */
|
/* Calculate LSN */
|
||||||
stop_backup_lsn_tmp = ((uint64) lsn_hi) << 32 | lsn_lo;
|
stop_backup_lsn_tmp = ((uint64) lsn_hi) << 32 | lsn_lo;
|
||||||
|
|
||||||
|
/* It is ok for replica to return invalid STOP LSN
|
||||||
|
* UPD: Apparently it is ok even for a master.
|
||||||
|
*/
|
||||||
if (!XRecOffIsValid(stop_backup_lsn_tmp))
|
if (!XRecOffIsValid(stop_backup_lsn_tmp))
|
||||||
{
|
{
|
||||||
/* It is ok for replica to return STOP LSN with NullXRecOff
|
char *xlog_path,
|
||||||
* UPD: Apparently it is ok even for master.
|
stream_xlog_path[MAXPGPATH];
|
||||||
|
XLogSegNo segno = 0;
|
||||||
|
XLogRecPtr lsn_tmp = InvalidXLogRecPtr;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Even though the value is invalid, it's expected postgres behaviour
|
||||||
|
* and we're trying to fix it below.
|
||||||
*/
|
*/
|
||||||
if (XRecOffIsNull(stop_backup_lsn_tmp))
|
elog(LOG, "Invalid offset in stop_lsn value %X/%X, trying to fix",
|
||||||
|
(uint32) (stop_backup_lsn_tmp >> 32), (uint32) (stop_backup_lsn_tmp));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Note: even with gdb it is very hard to produce automated tests for
|
||||||
|
* contrecord + invalid LSN, so emulate it for manual testing.
|
||||||
|
*/
|
||||||
|
//stop_backup_lsn_tmp = stop_backup_lsn_tmp - XLOG_SEG_SIZE;
|
||||||
|
//elog(WARNING, "New Invalid stop_backup_lsn value %X/%X",
|
||||||
|
// (uint32) (stop_backup_lsn_tmp >> 32), (uint32) (stop_backup_lsn_tmp));
|
||||||
|
|
||||||
|
if (stream_wal)
|
||||||
{
|
{
|
||||||
char *xlog_path,
|
pgBackupGetPath2(backup, stream_xlog_path,
|
||||||
stream_xlog_path[MAXPGPATH];
|
lengthof(stream_xlog_path),
|
||||||
XLogSegNo segno = 0;
|
DATABASE_DIR, PG_XLOG_DIR);
|
||||||
XLogRecPtr lsn_tmp = InvalidXLogRecPtr;
|
xlog_path = stream_xlog_path;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
xlog_path = arclog_path;
|
||||||
|
|
||||||
/*
|
GetXLogSegNo(stop_backup_lsn_tmp, segno, instance_config.xlog_seg_size);
|
||||||
* Even though the value is invalid, it's expected postgres behaviour
|
|
||||||
* and we're trying to fix it below.
|
|
||||||
*/
|
|
||||||
elog(LOG, "Null offset in stop_backup_lsn value %X/%X, trying to fix",
|
|
||||||
(uint32) (stop_backup_lsn_tmp >> 32), (uint32) (stop_backup_lsn_tmp));
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Note: even with gdb it is very hard to produce automated tests for
|
* Note, that there is no guarantee that corresponding WAL file even exists.
|
||||||
* contrecord + NullXRecOff, so emulate it for manual testing.
|
* Replica may return LSN from future and keep staying in present.
|
||||||
*/
|
* Or it can return invalid LSN.
|
||||||
//stop_backup_lsn_tmp = stop_backup_lsn_tmp - XLOG_SEG_SIZE;
|
*
|
||||||
//elog(WARNING, "New Invalid stop_backup_lsn value %X/%X",
|
* That's bad, since we want to get real LSN to save it in backup label file
|
||||||
// (uint32) (stop_backup_lsn_tmp >> 32), (uint32) (stop_backup_lsn_tmp));
|
* and to use it in WAL validation.
|
||||||
|
*
|
||||||
if (stream_wal)
|
* So we try to do the following:
|
||||||
{
|
* 1. Wait 'archive_timeout' seconds for segment containing stop_lsn and
|
||||||
pgBackupGetPath2(backup, stream_xlog_path,
|
* look for the first valid record in it.
|
||||||
lengthof(stream_xlog_path),
|
* It solves the problem of occasional invalid LSN on write-busy system.
|
||||||
DATABASE_DIR, PG_XLOG_DIR);
|
* 2. Failing that, look for record in previous segment with endpoint
|
||||||
xlog_path = stream_xlog_path;
|
* equal or greater than stop_lsn. It may(!) solve the problem of invalid LSN
|
||||||
}
|
* on write-idle system. If that fails too, error out.
|
||||||
else
|
*/
|
||||||
xlog_path = arclog_path;
|
|
||||||
|
|
||||||
GetXLogSegNo(stop_backup_lsn_tmp, segno, instance_config.xlog_seg_size);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Note, that there is no guarantee that corresponding WAL file even exists.
|
|
||||||
* Replica may return LSN from future and keep staying in present.
|
|
||||||
* Or it can return LSN with NullXRecOff.
|
|
||||||
*
|
|
||||||
* That's bad, since we want to get real LSN to save it in backup label file
|
|
||||||
* and to use it in WAL validation.
|
|
||||||
*
|
|
||||||
* So we try to do the following:
|
|
||||||
* 1. Wait 'archive_timeout' seconds for segment containing stop_lsn and
|
|
||||||
* look for the first valid record in it.
|
|
||||||
* It solves the problem of occasional invalid XRecOff on write-busy system.
|
|
||||||
* 2. Failing that, look for record in previous segment with endpoint
|
|
||||||
* equal or greater than stop_lsn. It may(!) solve the problem of NullXRecOff
|
|
||||||
* on write-idle system. If that fails too, error out.
|
|
||||||
*/
|
|
||||||
|
|
||||||
|
/* stop_lsn is pointing to a 0 byte of xlog segment */
|
||||||
|
if (stop_backup_lsn_tmp % instance_config.xlog_seg_size == 0)
|
||||||
|
{
|
||||||
/* Wait for segment with current stop_lsn, it is ok for it to never arrive */
|
/* Wait for segment with current stop_lsn, it is ok for it to never arrive */
|
||||||
wait_wal_lsn(stop_backup_lsn_tmp, false, backup->tli,
|
wait_wal_lsn(stop_backup_lsn_tmp, false, backup->tli,
|
||||||
false, true, WARNING, stream_wal);
|
false, true, WARNING, stream_wal);
|
||||||
|
|
||||||
/* Get the first record in segment with current stop_lsn */
|
/* Get the first record in segment with current stop_lsn */
|
||||||
lsn_tmp = get_first_record_lsn(xlog_path, segno, backup->tli,
|
lsn_tmp = get_first_record_lsn(xlog_path, segno, backup->tli,
|
||||||
@ -1836,17 +1834,39 @@ pg_stop_backup(pgBackup *backup, PGconn *pg_startbackup_conn,
|
|||||||
(uint32) (stop_backup_lsn_tmp >> 32),
|
(uint32) (stop_backup_lsn_tmp >> 32),
|
||||||
(uint32) (stop_backup_lsn_tmp));
|
(uint32) (stop_backup_lsn_tmp));
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
/* stop lsn is aligned to xlog block size, just find next lsn */
|
||||||
|
else if (stop_backup_lsn_tmp % XLOG_BLCKSZ == 0)
|
||||||
|
{
|
||||||
|
/* Wait for segment with current stop_lsn */
|
||||||
|
wait_wal_lsn(stop_backup_lsn_tmp, false, backup->tli,
|
||||||
|
false, true, ERROR, stream_wal);
|
||||||
|
|
||||||
/* Setting stop_backup_lsn will set stop point for streaming */
|
/* Get the next closest record in segment with current stop_lsn */
|
||||||
stop_backup_lsn = lsn_tmp;
|
lsn_tmp = get_next_record_lsn(xlog_path, segno, backup->tli,
|
||||||
stop_lsn_exists = true;
|
instance_config.xlog_seg_size,
|
||||||
|
instance_config.archive_timeout,
|
||||||
|
stop_backup_lsn_tmp);
|
||||||
|
|
||||||
|
/* sanity */
|
||||||
|
if (!XRecOffIsValid(lsn_tmp) || XLogRecPtrIsInvalid(lsn_tmp))
|
||||||
|
elog(ERROR, "Failed to get WAL record next to %X/%X",
|
||||||
|
(uint32) (stop_backup_lsn_tmp >> 32),
|
||||||
|
(uint32) (stop_backup_lsn_tmp));
|
||||||
}
|
}
|
||||||
/* PostgreSQL returned something very illegal as STOP_LSN, error out */
|
/* PostgreSQL returned something very illegal as STOP_LSN, error out */
|
||||||
else
|
else
|
||||||
elog(ERROR, "Invalid stop_backup_lsn value %X/%X",
|
elog(ERROR, "Invalid stop_backup_lsn value %X/%X",
|
||||||
(uint32) (stop_backup_lsn_tmp >> 32), (uint32) (stop_backup_lsn_tmp));
|
(uint32) (stop_backup_lsn_tmp >> 32), (uint32) (stop_backup_lsn_tmp));
|
||||||
|
|
||||||
|
/* Setting stop_backup_lsn will set stop point for streaming */
|
||||||
|
stop_backup_lsn = lsn_tmp;
|
||||||
|
stop_lsn_exists = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
elog(LOG, "stop_lsn: %X/%X",
|
||||||
|
(uint32) (stop_backup_lsn >> 32), (uint32) (stop_backup_lsn));
|
||||||
|
|
||||||
/* Write backup_label and tablespace_map */
|
/* Write backup_label and tablespace_map */
|
||||||
if (!exclusive_backup)
|
if (!exclusive_backup)
|
||||||
{
|
{
|
||||||
|
@ -680,6 +680,97 @@ get_first_record_lsn(const char *archivedir, XLogSegNo segno,
|
|||||||
return record;
|
return record;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Get LSN of the record next after target lsn.
|
||||||
|
*/
|
||||||
|
XLogRecPtr
|
||||||
|
get_next_record_lsn(const char *archivedir, XLogSegNo segno,
|
||||||
|
TimeLineID tli, uint32 wal_seg_size, int timeout,
|
||||||
|
XLogRecPtr target)
|
||||||
|
{
|
||||||
|
XLogReaderState *xlogreader;
|
||||||
|
XLogReaderData reader_data;
|
||||||
|
XLogRecPtr startpoint, found, res;
|
||||||
|
char wal_segment[MAXFNAMELEN];
|
||||||
|
int attempts = 0;
|
||||||
|
|
||||||
|
if (segno <= 1)
|
||||||
|
elog(ERROR, "Invalid WAL segment number " UINT64_FORMAT, segno);
|
||||||
|
|
||||||
|
GetXLogFileName(wal_segment, tli, segno, instance_config.xlog_seg_size);
|
||||||
|
|
||||||
|
xlogreader = InitXLogPageRead(&reader_data, archivedir, tli, wal_seg_size,
|
||||||
|
false, false, true);
|
||||||
|
if (xlogreader == NULL)
|
||||||
|
elog(ERROR, "Out of memory");
|
||||||
|
xlogreader->system_identifier = instance_config.system_identifier;
|
||||||
|
|
||||||
|
/* Set startpoint to 0 in segno */
|
||||||
|
GetXLogRecPtr(segno, 0, wal_seg_size, startpoint);
|
||||||
|
|
||||||
|
found = XLogFindNextRecord(xlogreader, startpoint);
|
||||||
|
|
||||||
|
if (XLogRecPtrIsInvalid(found))
|
||||||
|
{
|
||||||
|
if (xlogreader->errormsg_buf[0] != '\0')
|
||||||
|
elog(WARNING, "Could not read WAL record at %X/%X: %s",
|
||||||
|
(uint32) (startpoint >> 32), (uint32) (startpoint),
|
||||||
|
xlogreader->errormsg_buf);
|
||||||
|
else
|
||||||
|
elog(WARNING, "Could not read WAL record at %X/%X",
|
||||||
|
(uint32) (startpoint >> 32), (uint32) (startpoint));
|
||||||
|
PrintXLogCorruptionMsg(&reader_data, ERROR);
|
||||||
|
}
|
||||||
|
startpoint = found;
|
||||||
|
|
||||||
|
while (attempts <= timeout)
|
||||||
|
{
|
||||||
|
XLogRecord *record;
|
||||||
|
char *errormsg;
|
||||||
|
|
||||||
|
if (interrupted)
|
||||||
|
elog(ERROR, "Interrupted during WAL reading");
|
||||||
|
|
||||||
|
record = XLogReadRecord(xlogreader, startpoint, &errormsg);
|
||||||
|
|
||||||
|
if (record == NULL)
|
||||||
|
{
|
||||||
|
XLogRecPtr errptr;
|
||||||
|
|
||||||
|
errptr = XLogRecPtrIsInvalid(startpoint) ? xlogreader->EndRecPtr :
|
||||||
|
startpoint;
|
||||||
|
|
||||||
|
if (errormsg)
|
||||||
|
elog(WARNING, "Could not read WAL record at %X/%X: %s",
|
||||||
|
(uint32) (errptr >> 32), (uint32) (errptr),
|
||||||
|
errormsg);
|
||||||
|
else
|
||||||
|
elog(WARNING, "Could not read WAL record at %X/%X",
|
||||||
|
(uint32) (errptr >> 32), (uint32) (errptr));
|
||||||
|
PrintXLogCorruptionMsg(&reader_data, ERROR);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (xlogreader->ReadRecPtr >= target)
|
||||||
|
{
|
||||||
|
elog(LOG, "Record %X/%X is next after target LSN %X/%X",
|
||||||
|
(uint32) (xlogreader->ReadRecPtr >> 32), (uint32) (xlogreader->ReadRecPtr),
|
||||||
|
(uint32) (target >> 32), (uint32) (target));
|
||||||
|
res = xlogreader->ReadRecPtr;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
startpoint = InvalidXLogRecPtr;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* cleanup */
|
||||||
|
CleanupXLogPageRead(xlogreader);
|
||||||
|
XLogReaderFree(xlogreader);
|
||||||
|
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Get LSN of a record prior to target_lsn.
|
* Get LSN of a record prior to target_lsn.
|
||||||
* If 'start_lsn' is in the segment with number 'segno' then start from 'start_lsn',
|
* If 'start_lsn' is in the segment with number 'segno' then start from 'start_lsn',
|
||||||
|
@ -960,6 +960,8 @@ extern XLogRecPtr get_prior_record_lsn(const char *archivedir, XLogRecPtr start_
|
|||||||
|
|
||||||
extern XLogRecPtr get_first_record_lsn(const char *archivedir, XLogRecPtr start_lsn,
|
extern XLogRecPtr get_first_record_lsn(const char *archivedir, XLogRecPtr start_lsn,
|
||||||
TimeLineID tli, uint32 wal_seg_size, int timeout);
|
TimeLineID tli, uint32 wal_seg_size, int timeout);
|
||||||
|
extern XLogRecPtr get_next_record_lsn(const char *archivedir, XLogSegNo segno, TimeLineID tli,
|
||||||
|
uint32 wal_seg_size, int timeout, XLogRecPtr target);
|
||||||
|
|
||||||
/* in util.c */
|
/* in util.c */
|
||||||
extern TimeLineID get_current_timeline(PGconn *conn);
|
extern TimeLineID get_current_timeline(PGconn *conn);
|
||||||
|
Reference in New Issue
Block a user