← Back to team overview

maria-developers team mailing list archive

Re: 21a84e23cf3: MDEV-16437: merge 5.7 P_S replication instrumentation and tables

 

Hi, Sujatha!

Note, it's a review of the combined `git diff e5fc78 21a84e`

On Mar 31, Sujatha wrote:
> revision-id: 21a84e23cf3 (mariadb-10.5.2-307-g21a84e23cf3)
> parent(s): 4e6de585ff7
> author: Sujatha <sujatha.sivakumar@xxxxxxxxxxx>
> committer: Sujatha <sujatha.sivakumar@xxxxxxxxxxx>
> timestamp: 2020-11-30 13:22:32 +0530
> message:
> 
> MDEV-16437: merge 5.7 P_S replication instrumentation and tables

That was pretty good, thanks!
Just few small comments below, no big changes will be needed.

> diff --git a/mysql-test/suite/multi_source/simple.result b/mysql-test/suite/multi_source/simple.result
> index a66d49e88cb..b57e146feb5 100644
> --- a/mysql-test/suite/multi_source/simple.result
> +++ b/mysql-test/suite/multi_source/simple.result
> @@ -21,7 +21,21 @@ show all slaves status;
>  Connection_name	Slave_SQL_State	Slave_IO_State	Master_Host	Master_User	Master_Port	Connect_Retry	Master_Log_File	Read_Master_Log_Pos	Relay_Log_File	Relay_Log_Pos	Relay_Master_Log_File	Slave_IO_Running	Slave_SQL_Running	Replicate_Do_DB	Replicate_Ignore_DB	Replicate_Do_Table	Replicate_Ignore_Table	Replicate_Wild_Do_Table	Replicate_Wild_Ignore_Table	Last_Errno	Last_Error	Skip_Counter	Exec_Master_Log_Pos	Relay_Log_Space	Until_Condition	Until_Log_File	Until_Log_Pos	Master_SSL_Allowed	Master_SSL_CA_File	Master_SSL_CA_Path	Master_SSL_Cert	Master_SSL_Cipher	Master_SSL_Key	Seconds_Behind_Master	Master_SSL_Verify_Server_Cert	Last_IO_Errno	Last_IO_Error	Last_SQL_Errno	Last_SQL_Error	Replicate_Ignore_Server_Ids	Master_Server_Id	Master_SSL_Crl	Master_SSL_Crlpath	Using_Gtid	Gtid_IO_Pos	Replicate_Do_Domain_Ids	Replicate_Ignore_Domain_Ids	Parallel_Mode	SQL_Delay	SQL_Remaining_Delay	Slave_SQL_Running_State	Slave_DDL_Groups	Slave_Non_Transactional_Groups	Slave_Transactional_Groups	Retried_transactions	Max_relay_log_size	Executed_log_entries	Slave_received_heartbeats	Slave_heartbeat_period	Gtid_Slave_Pos
>  slave1	Slave has read all relay log; waiting for more updates	Waiting for master to send event	127.0.0.1	root	MYPORT_1	60	master-bin.000001	<read_master_log_pos>	mysqld-relay-bin-slave1.000002	<relay_log_pos>	master-bin.000001	Yes	Yes							0		0	<read_master_log_pos>	<relay_log_space1>	None		0	No						0	No	0		0			1			No				optimistic	0	NULL	Slave has read all relay log; waiting for more updates	0	0	0	0	1073741824	7	0	60.000	
>  slave2	Slave has read all relay log; waiting for more updates	Waiting for master to send event	127.0.0.1	root	MYPORT_2	60	master-bin.000001	<read_master_log_pos>	mysqld-relay-bin-slave2.000002	<relay_log_pos>	master-bin.000001	Yes	Yes							0		0	<read_master_log_pos>	<relay_log_space1>	None		0	No						0	No	0		0			2			No				optimistic	0	NULL	Slave has read all relay log; waiting for more updates	0	0	0	0	1073741824	7	0	60.000	
> +#
> +# MDEV:16437: merge 5.7 P_S replication instrumentation and tables
> +#
> +select * from performance_schema.replication_connection_configuration;
> +CONNECTION_NAME	HOST	PORT	USER	USING_GTID	SSL_ALLOWED	SSL_CA_FILE	SSL_CA_PATH	SSL_CERTIFICATE	SSL_CIPHER	SSL_KEY	SSL_VERIFY_SERVER_CERTIFICATE	SSL_CRL_FILE	SSL_CRL_PATH	CONNECTION_RETRY_INTERVAL	CONNECTION_RETRY_COUNT	HEARTBEAT_INTERVAL	IGNORE_SERVER_IDS	REPL_DO_DOMAIN_IDS	REPL_IGNORE_DOMAIN_IDS
> +slave2	#	#	root	NO	NO						NO			60	86400	60.000			
> +slave1	#	#	root	NO	NO						NO			60	86400	60.000			

Isn't the host always 127.0.0.1 ? Why to mask it?

Also, may be it'd be easier to read the result, if you print it vertically?
It looks that CONNECTION_RETRY_COUNT is 86400.
And 86400 is clearly a timeout, not a retry count.

>  start all slaves;
> +#
> +# MDEV:16437: merge 5.7 P_S replication instrumentation and tables
> +#
> +select * from performance_schema.replication_applier_status_by_coordinator;
> +CONNECTION_NAME	THREAD_ID	SERVICE_STATE	LAST_SEEN_TRANSACTION	LAST_ERROR_NUMBER	LAST_ERROR_MESSAGE	LAST_ERROR_TIMESTAMP	LAST_TRANS_RETRY_COUNT
> +slave2	#	ON		0		0000-00-00 00:00:00	0
> +slave1	#	ON		0		0000-00-00 00:00:00	0
>  stop slave 'slave1';
>  show slave 'slave1' status;
>  Slave_IO_State	
> diff --git a/mysql-test/suite/rpl/include/rpl_deadlock.test b/mysql-test/suite/rpl/include/rpl_deadlock.test
> index e9191d5fcd8..bccbe044a36 100644
> --- a/mysql-test/suite/rpl/include/rpl_deadlock.test
> +++ b/mysql-test/suite/rpl/include/rpl_deadlock.test
> @@ -59,6 +59,16 @@ let $status_var_comparsion= >;
>  connection slave;
>  SELECT COUNT(*) FROM t2;
>  COMMIT;
> +
> +--echo
> +--echo # Test that the performance schema coulumn shows > 0 values.
> +--echo
> +
> +--let $assert_text= current number of retries should be more than the value saved before deadlock.
> +--let $assert_cond= [SELECT COUNT_TRANSACTIONS_RETRIES FROM performance_schema.replication_applier_status, COUNT_TRANSACTIONS_RETRIES, 1] > "$slave_retried_transactions"
> +--source include/assert.inc

what's wrong with simple

  SELECT COUNT_TRANSACTIONS_RETRIES > $slave_retried_transactions FROM performance_schema.replication_applier_status

?

> +
> +source include/check_slave_is_running.inc;
>  sync_with_master;
>  
>  # Check the data
> diff --git a/mysql-test/suite/rpl/r/rpl_perfschema_connect_config.result b/mysql-test/suite/rpl/r/rpl_perfschema_connect_config.result
> new file mode 100644
> index 00000000000..4ace84ffac4
> --- /dev/null
> +++ b/mysql-test/suite/rpl/r/rpl_perfschema_connect_config.result
> @@ -0,0 +1,124 @@
> +include/master-slave.inc
> +[connection master]
> +# Asserted this: On master, the table should return an empty set.
> +connection slave;
> +
> +# Verify that SELECT works for every field and produces an output
> +# similar to the corresponding field in SHOW SLAVE STATUS(SSS).
> +
> +include/assert.inc [Value returned by SSS and PS table for Host should be same.]
> +include/assert.inc [Value returned by SSS and PS table for Port should be same.]
> +include/assert.inc [Value returned by SSS and PS table for User should be same.]
> +include/assert.inc [Value returned by SSS and PS table for Using_Gtid should be same.]
> +include/assert.inc [Value returned by SSS and PS table for SSL_Allowed should be same.]
> +include/assert.inc [Value returned by SSS and PS table for SSL_CA_File should be same.]
> +include/assert.inc [Value returned by SSS and PS table for SSL_CA_Path should be same.]
> +include/assert.inc [Value returned by SSS and PS table for SSL_Certificate should be same.]
> +include/assert.inc [Value returned by SSS and PS table for SSL_Cipher should be same.]
> +include/assert.inc [Value returned by SSS and PS table for SSL_Key should be same.]
> +include/assert.inc [Value returned by SSS and PS table for SSL_Verify_Server_Certificate should be same.]
> +include/assert.inc [Value returned by SSS and PS table for SSL_Crl_File should be same.]
> +include/assert.inc [Value returned by SSS and PS table for SSL_Crl_Path should be same.]
> +include/assert.inc [Value returned by SSS and PS table for Connection_Retry_Interval should be same.]
> +include/assert.inc [Value returned by PS table for Connection_Retry_Count should be 10.]

this is just unreadable

> +
> +# Heartbeat_Interval is part of I_S and P_S. We will compare the
> +# two to make sure both match.
...
> diff --git a/mysql-test/suite/rpl/t/rpl_perfschema_applier_config.test b/mysql-test/suite/rpl/t/rpl_perfschema_applier_config.test
> new file mode 100644
> index 00000000000..132d9912222
> --- /dev/null
> +++ b/mysql-test/suite/rpl/t/rpl_perfschema_applier_config.test
> @@ -0,0 +1,96 @@
> +# ==== Purpose ====
> +#
> +# This test script serves as the functionality testing for the table
> +# performance_schema.replication_applier_configuration. Test for ddl and dml
> +# operations is a part of the perfschema suite. The ddl/dml tests are named:
> +# 1) ddl_replication_applier_configuration.test and
> +# 2) dml_replication_applier_configuration.test.
> +#
> +# The follwing scenarios are tested in this script:
> +#
> +#  - Verify that output is same as SSS on a fresh slave.
> +#  - Verify that the value of this field is correct after STOP SLAVE.
> +#  - Verify that, when desired delay is set, the value is shown corectly.
> +#  - Verify that the value is preserved after STOP SLAVE.
> +#  - Verify that, when desired delay is reset, the value is shown corectly.
> +#
> +#  ==== Related Worklog ====
> +#
> +#  MDEV-16437: merge 5.7 P_S replication instrumentation and tables
> +#
> +
> +source include/master-slave.inc;
> +source include/have_binlog_format_mixed.inc;

master-slave should be included last

> +
> +let $assert_text= On master, the table should return an empty set.;
> +let $assert_cond= count(*) = 0 from performance_schema.replication_applier_configuration;
> +source include/assert.inc;

again, what's wrong with just

  select * from performance_schema.replication_applier_configuration;

or, may be, if you want to be very explicit

 select count(*) as 'must be 0' from performance_schema.replication_applier_configuration;

> +
> +--connection slave
> +
> +--echo
> +--echo # Verify that SELECT works and produces an output similar to
> +--echo # the corresponding field in SHOW SLAVE STATUS(SSS) in all scenarios.
> +--echo
> +
> +--echo
> +--echo # Verify that output is same as SSS on a fresh slave.
> +--echo
> +
> +let $sss_value= query_get_value(SHOW SLAVE STATUS, SQL_Delay, 1);
> +let $ps_value= query_get_value(select Desired_Delay from performance_schema.replication_applier_configuration, Desired_Delay, 1);
> +let $assert_text= Value returned by SSS and PS table for Desired_Delay should be same.;
> +let $assert_cond= "$sss_value" = "$ps_value";
> +source include/assert.inc;

I'll stop commenting on every assert.inc, but please, please, stop overusing them.

> +
> +--echo
> +--echo # Verify that the value of this field is correct after STOP SLAVE.
> +--echo
> +
> +source include/stop_slave.inc;
> +
> +let $sss_value= query_get_value(SHOW SLAVE STATUS, SQL_Delay, 1);
> +let $ps_value= query_get_value(select Desired_Delay from performance_schema.replication_applier_configuration, Desired_Delay, 1);
> +let $assert_text= Value returned by SSS and PS table for Desired_Delay should be same.;
> +let $assert_cond= "$sss_value" = "$ps_value";
> +source include/assert.inc;
> +
> +--echo
> +--echo # Verify that, when desired delay is set, the value is shown corectly.
> +--echo
> +
> +eval change master to master_delay= 2;
> +source include/start_slave.inc;
> +
> +let $sss_value= query_get_value(SHOW SLAVE STATUS, SQL_Delay, 1);
> +let $ps_value= query_get_value(select Desired_Delay from performance_schema.replication_applier_configuration, Desired_Delay, 1);
> +let $assert_text= Value returned by SSS and PS table for Desired_Delay should be same.;
> +let $assert_cond= "$sss_value" = "$ps_value";
> +source include/assert.inc;
> +
> +--echo
> +--echo # Verify that the value is preserved after STOP SLAVE.
> +--echo
> +
> +source include/stop_slave.inc;
> +
> +let $ss_value= query_get_value(SHOW SLAVE STATUS, SQL_Delay, 1);
> +let $ps_value= query_get_value(select Desired_Delay from performance_schema.replication_applier_configuration, Desired_Delay, 1);
> +let $assert_text= Value returned by SSS and PS table for Desired_Delay should be same.;
> +let $assert_cond= "$sss_value" = "$ps_value";
> +source include/assert.inc;
> +
> +--echo
> +--echo # Verify that, when desired delay is reset, the value is shown corectly.
> +--echo
> +
> +eval change master to master_delay= 0;
> +source include/start_slave.inc;
> +
> +let $sss_value= query_get_value(SHOW SLAVE STATUS, SQL_Delay, 1);
> +let $ps_value= query_get_value(select Desired_Delay from performance_schema.replication_applier_configuration, Desired_Delay, 1);
> +let $assert_text= Value returned by SSS and PS table for Desired_Delay should be same.;
> +let $assert_cond= "$sss_value" = "$ps_value";
> +source include/assert.inc;
> +
> +source include/rpl_end.inc;
> diff --git a/mysql-test/suite/rpl/t/rpl_perfschema_applier_status.test b/mysql-test/suite/rpl/t/rpl_perfschema_applier_status.test
> new file mode 100644
> index 00000000000..52ee14cef2a
> --- /dev/null
> +++ b/mysql-test/suite/rpl/t/rpl_perfschema_applier_status.test
> @@ -0,0 +1,72 @@
> +# ==== Purpose ====
> +#
> +# This test script serves as the functionality testing for the table
> +# performance_schema.replication_applier_status. Test for ddl and dml
> +# operations is a part of the perfschema suite. The ddl/dml tests are named:
> +# 1) ddl_replication_applier_status.test and
> +# 2) dml_replication_applier_status.test.
> +#
> +# The follwing scenarios are tested in this script:
> +#
> +#  - Verify that output is same as SSS on a fresh slave.
> +#  - Verify that the value of this field is correct after STOP SLAVE.
> +#  - Remaining delay is not tested.
> +#  - Count_trnsaction is partially tested here making sure it can be queried.
> +#    More testing in rpl/include/rpl_deadlock.test
> +#
> +#  ==== Related Worklog ====
> +#
> +# MDEV-16437: merge 5.7 P_S replication instrumentation and tables
> +#
> +
> +source include/master-slave.inc;
> +source include/have_binlog_format_mixed.inc;

master-slave must always be last (I'll stop commenting on that too)

> +
> +let $assert_text= On master, the table should return an empty set.;
> +let $assert_cond= count(*) = 0 from performance_schema.replication_applier_status;
> +source include/assert.inc;
> +
> +--connection slave
> +
> +--echo
> +--echo # Verify that SELECT works and produces an output similar to
> +--echo # the corresponding field in SHOW SLAVE STATUS(SSS) in all scenarios.
> +--echo
> +
> +--echo
> +--echo # Verify that output is same as SSS on a fresh slave.
> +--echo
> +
> +let $sss_value= query_get_value(SHOW SLAVE STATUS, Slave_SQL_Running, 1);
> +let $ps_value= query_get_value(select Service_State from performance_schema.replication_applier_status, Service_State, 1);
> +let $assert_text= SSS shows Slave_SQL_Running as "Yes". So, Service_State from this PS table should be "ON".;
> +let $assert_cond= "$sss_value" = "Yes" AND "$ps_value"= "ON";
> +source include/assert.inc;
> +
> +let $ss_value= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
> +let $ps_value= query_get_value(select count_transactions_retries from performance_schema.replication_applier_status, count_transactions_retries, 1);
> +let $assert_text= COUNT_TRANSACTION_RETRIES should be equal to Slave_retried_transactions.;
> +let $assert_cond= "$ps_value"= "$ss_value";
> +source include/assert.inc;
> +
> +--echo
> +--echo # Verify that the fields show the correct values after STOP SLAVE.
> +--echo
> +
> +source include/stop_slave.inc;
> +
> +let $sss_value= query_get_value(SHOW SLAVE STATUS, Slave_SQL_Running, 1);
> +let $ps_value= query_get_value(select Service_State from performance_schema.replication_applier_status, Service_State, 1);
> +let $assert_text= SSS shows Slave_SQL_Running as "No". So, Service_State from this PS table should be "OFF".;
> +let $assert_cond= "$sss_value" = "No" AND "$ps_value"= "OFF";
> +source include/assert.inc;
> +
> +let $ss_value= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
> +let $ps_value= query_get_value(select count_transactions_retries from performance_schema.replication_applier_status, count_transactions_retries, 1);
> +let $assert_text= COUNT_TRANSACTION_RETRIES should be equal to Slave_retried_transactions.;
> +let $assert_cond= "$ps_value"= "$ss_value";
> +source include/assert.inc;
> +
> +source include/start_slave.inc;
> +source include/rpl_end.inc;
> +
> diff --git a/sql/rpl_mi.h b/sql/rpl_mi.h
> index 4d47689ac18..946d138d618 100644
> --- a/sql/rpl_mi.h
> +++ b/sql/rpl_mi.h
> @@ -50,13 +57,6 @@ class Domain_id_filter
>    */
>    DYNAMIC_ARRAY m_domain_ids[2];
>  
> -public:
> -  /* domain id list types */
> -  enum enum_list_type {
> -    DO_DOMAIN_IDS= 0,
> -    IGNORE_DOMAIN_IDS
> -  };
> -

Why did you move it up? Does it make any difference?

>    Domain_id_filter();
>  
>    ~Domain_id_filter();
> diff --git a/storage/perfschema/table_replication_applier_status_by_coordinator.cc b/storage/perfschema/table_replication_applier_status_by_coordinator.cc
> index beb8620b240..30cf56ce0c2 100644
> --- a/storage/perfschema/table_replication_applier_status_by_coordinator.cc
> +++ b/storage/perfschema/table_replication_applier_status_by_coordinator.cc
> @@ -55,12 +55,14 @@ table_replication_applier_status_by_coordinator::m_share=
>    sizeof(pos_t), /* ref length */
>    &m_table_lock,
>    { C_STRING_WITH_LEN("CREATE TABLE replication_applier_status_by_coordinator("
> -  "CHANNEL_NAME CHAR(64) collate utf8_general_ci not null,"
> +  "CONNECTION_NAME VARCHAR(256) collate utf8_general_ci not null,"

please, don't rename existing columns, let's keep calling it CHANNEL_NAME everywhere.

>    "THREAD_ID BIGINT UNSIGNED,"
>    "SERVICE_STATE ENUM('ON','OFF') not null,"
> +  "LAST_SEEN_TRANSACTION CHAR(57) not null,"

I think it's better to put new columns at the end.

>    "LAST_ERROR_NUMBER INTEGER not null,"
>    "LAST_ERROR_MESSAGE VARCHAR(1024) not null,"
> -  "LAST_ERROR_TIMESTAMP TIMESTAMP(0) not null)") },
> +  "LAST_ERROR_TIMESTAMP TIMESTAMP(0) not null,"
> +  "LAST_TRANS_RETRY_COUNT INTEGER not null)") },
>    false  /* perpetual */
>  };
>  
> @@ -104,15 +106,7 @@ int table_replication_applier_status_by_coordinator::rnd_next(void)
>    {
>      mi= (Master_info *)my_hash_element(&master_info_index->master_info_hash, m_pos.m_index);
>  
> -    /*
> -      Construct and display SQL Thread's (Coordinator) information in
> -      'replication_applier_status_by_coordinator' table only in the case of
> -      multi threaded slave mode. Code should do nothing in the case of single
> -      threaded slave mode. In case of single threaded slave mode SQL Thread's
> -      status will be reported as part of
> -      'replication_applier_status_by_worker' table.
> -    */

is this no longer true?

> -    if (mi && mi->host[0] && /*mi->rli.get_worker_count() > */ 0)
> +    if (mi && mi->host[0])
>      {
>        make_row(mi);
>        m_next_pos.set_after(&m_pos);
> @@ -147,13 +141,20 @@ int table_replication_applier_status_by_coordinator::rnd_pos(const void *pos)
>  void table_replication_applier_status_by_coordinator::make_row(Master_info *mi)
>  {
>    m_row_exists= false;
> +  rpl_gtid gtid;
> +  char buf[10+1+10+1+20+1];
> +  String str(buf, sizeof(buf), system_charset_info);

There's a special template for it:

  StringBuffer<10+1+10+1+20+1> str;

> +  bool first= true;
> +
> +  str.length(0);

not needed if you use StringBuffer<>

>  
>    DBUG_ASSERT(mi != NULL);
>  
>    mysql_mutex_lock(&mi->rli.data_lock);
>  
> -  m_row.channel_name_length= static_cast<uint>(mi->connection_name.length);
> -  memcpy(m_row.channel_name, mi->connection_name.str, m_row.channel_name_length);
> +  gtid= mi->rli.last_seen_gtid;
> +  m_row.connection_name_length= static_cast<uint>(mi->connection_name.length);
> +  memcpy(m_row.connection_name, mi->connection_name.str, m_row.connection_name_length);
>  
>    if (mi->rli.slave_running)
>    {
> @@ -175,6 +176,18 @@ void table_replication_applier_status_by_coordinator::make_row(Master_info *mi)
>    else
>      m_row.service_state= PS_RPL_NO;
>  
> +  if ((gtid.seq_no > 0 &&
> +        !rpl_slave_state_tostring_helper(&str, &gtid, &first)))
> +  {
> +    strmake(m_row.last_seen_transaction,str.ptr(), str.length());
> +    m_row.last_seen_transaction_length= str.length();
> +  }
> +  else
> +  {
> +    m_row.last_seen_transaction_length= 0;
> +    memcpy(m_row.last_seen_transaction, "", 1);

a strange way to write m_row.last_seen_transaction[0]= 0, but ok :)

> +  }
> +
>    mysql_mutex_lock(&mi->rli.err_lock);
>  
>    m_row.last_error_number= (long int) mi->rli.last_error().number;

Regards,
Sergei
VP of MariaDB Server Engineering
and security@xxxxxxxxxxx


Follow ups