maria-developers team mailing list archive
-
maria-developers team
-
Mailing list archive
-
Message #06246
Re: Wrong SQL thread position reporting after IO thread restart
Here's my approach to fixing both of these problems if you are interested.
Pavel
On Thu, Sep 12, 2013 at 12:14 AM, Pavel Ivanov <pivanof@xxxxxxxxxx> wrote:
> Krisitan,
>
> I found what I think is a bug in IO and SQL thread accounting. How to reproduce:
> 1) Set up two servers S1 and S2. S1 is a master, S2 is slave with
> master_using_gtid = current_pos.
> 2) Execute some transactions on the master, e.g.
>
> create database d;
> create table d.t (n int);
> insert into d.t values (1);
>
> 3) Both servers are at 0-1-3 now, SHOW SLAVE STATUS on S2 shows
> Read_Master_Log_Pos equals to Exec_Master_Log_Pos.
> 4) Execute STOP SLAVE IO_THREAD on S2.
> 5) S2 reports in the logs: "Slave I/O thread exiting, ... GTID
> position 0-1-2". So IO thread didn't realize that it received full
> transaction for 0-1-3 even though it didn't receive next GTID event.
> 6) Execute START SLAVE IO_THREAD on S2.
> 7) At this point SHOW SLAVE STATUS on S2 shows Read_Master_Log_Pos the
> same as in step 3, but Exec_Master_Log_Pos is now less than in step 3
> as if SQL thread didn't catch up with IO thread yet. But despite both
> threads running and no more transactions executed on master
> Exec_Master_Log_Pos doesn't change and doesn't become equal to
> Read_Master_Log_Pos. This apparently happens because IO thread
> restarts from one transaction behind, adds to relay log Rotate event
> that master sends with the position of that transaction, but then
> doesn't add any events for the transaction because it knows they
> already were added into relay log.
>
> I think both problems are bugs. And although after fixing the first it
> would be really hard (if possible) to reproduce second, I'd think the
> reporting of SQL thread's position still should be fixed.
>
>
> Thank you,
> Pavel
diff --git a/sql/rpl_mi.cc b/sql/rpl_mi.cc
index 390fdfa..ef00f9c 100644
--- a/sql/rpl_mi.cc
+++ b/sql/rpl_mi.cc
@@ -39,7 +39,8 @@ Master_info::Master_info(LEX_STRING *connection_name_arg,
slave_running(0), slave_run_id(0), sync_counter(0),
heartbeat_period(0), received_heartbeats(0), master_id(0),
using_gtid(USE_GTID_NO), events_queued_since_last_gtid(0),
- gtid_reconnect_event_skip_count(0), gtid_event_seen(false)
+ gtid_reconnect_event_skip_count(0), gtid_event_seen(false),
+ last_gtid_is_standalone(false)
{
host[0] = 0; user[0] = 0; password[0] = 0;
ssl_ca[0]= 0; ssl_capath[0]= 0; ssl_cert[0]= 0;
@@ -168,6 +169,7 @@ void init_master_log_pos(Master_info* mi)
mi->events_queued_since_last_gtid= 0;
mi->gtid_reconnect_event_skip_count= 0;
mi->gtid_event_seen= false;
+ mi->last_gtid_is_standalone= false;
/* Intentionally init ssl_verify_server_cert to 0, no option available */
mi->ssl_verify_server_cert= 0;
diff --git a/sql/rpl_mi.h b/sql/rpl_mi.h
index 57f0f57..4aaa256 100644
--- a/sql/rpl_mi.h
+++ b/sql/rpl_mi.h
@@ -167,6 +167,11 @@ class Master_info : public Slave_reporting_capability
uint64 gtid_reconnect_event_skip_count;
/* gtid_event_seen is false until we receive first GTID event from master. */
bool gtid_event_seen;
+ /*
+ False if last GTID event wasn't standalone, i.e. it was an equivalent of the
+ BEGIN statement. True otherwise.
+ */
+ bool last_gtid_is_standalone;
};
int init_master_info(Master_info* mi, const char* master_info_fname,
const char* slave_info_fname,
diff --git a/sql/slave.cc b/sql/slave.cc
index 9d3a172..f97567f 100644
--- a/sql/slave.cc
+++ b/sql/slave.cc
@@ -5153,11 +5153,11 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
case GTID_EVENT:
{
- uchar dummy_flag;
+ uchar gtid_flags;
if (Gtid_log_event::peek(buf, event_len, checksum_alg,
&event_gtid.domain_id, &event_gtid.server_id,
- &event_gtid.seq_no, &dummy_flag,
+ &event_gtid.seq_no, >id_flags,
rli->relay_log.description_event_for_queue))
{
error= ER_SLAVE_RELAY_LOG_WRITE_FAILURE;
@@ -5217,6 +5217,7 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
mi->events_queued_since_last_gtid= 0;
}
mi->last_queued_gtid= event_gtid;
+ mi->last_gtid_is_standalone= (gtid_flags & Gtid_log_event::FL_STANDALONE) != 0;
++mi->events_queued_since_last_gtid;
inc_pos= event_len;
}
@@ -5298,6 +5299,15 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
else
rli->relay_log.harvest_bytes_written(&rli->log_space_total);
}
+ else
+ {
+ Rotate_log_event fake_rev(mi->master_log_name, 0, mi->master_log_pos, 0);
+ fake_rev.server_id= mi->master_id;
+ if (rli->relay_log.append_no_lock(&fake_rev))
+ error= ER_SLAVE_RELAY_LOG_WRITE_FAILURE;
+ else
+ rli->relay_log.harvest_bytes_written(&rli->log_space_total);
+ }
}
else
if ((s_id == global_system_variables.server_id &&
@@ -5369,6 +5379,16 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
}
mysql_mutex_unlock(log_lock);
+ if (!error && mi->using_gtid != Master_info::USE_GTID_NO &&
+ (buf[EVENT_TYPE_OFFSET] == XID_EVENT ||
+ (buf[EVENT_TYPE_OFFSET] == QUERY_EVENT &&
+ (mi->last_gtid_is_standalone ||
+ Query_log_event::peek_is_commit_rollback(buf, event_len, checksum_alg)))))
+ {
+ mi->gtid_current_pos.update(&mi->last_queued_gtid);
+ mi->events_queued_since_last_gtid= 0;
+ }
+
skip_relay_logging:
err:
Follow ups
References