← Back to team overview

maria-developers team mailing list archive

Re: 21a84e23cf3: MDEV-16437: merge 5.7 P_S replication instrumentation and tables

 

Hello Sergei,

Thank you for the review comments. Please find my replies inline.

On 01/04/21 12:38 am, Sergei Golubchik wrote:
Hi, Sujatha!

Note, it's a review of the combined `git diff e5fc78 21a84e`

On Mar 31, Sujatha wrote:
revision-id: 21a84e23cf3 (mariadb-10.5.2-307-g21a84e23cf3)
parent(s): 4e6de585ff7
author: Sujatha<sujatha.sivakumar@xxxxxxxxxxx>
committer: Sujatha<sujatha.sivakumar@xxxxxxxxxxx>
timestamp: 2020-11-30 13:22:32 +0530
message:

MDEV-16437: merge 5.7 P_S replication instrumentation and tables
That was pretty good, thanks!
Just few small comments below, no big changes will be needed.

diff --git a/mysql-test/suite/multi_source/simple.result b/mysql-test/suite/multi_source/simple.result
index a66d49e88cb..b57e146feb5 100644
--- a/mysql-test/suite/multi_source/simple.result
+++ b/mysql-test/suite/multi_source/simple.result
@@ -21,7 +21,21 @@ show all slaves status;
  Connection_name	Slave_SQL_State	Slave_IO_State	Master_Host	Master_User	Master_Port	Connect_Retry	Master_Log_File	Read_Master_Log_Pos	Relay_Log_File	Relay_Log_Pos	Relay_Master_Log_File	Slave_IO_Running	Slave_SQL_Running	Replicate_Do_DB	Replicate_Ignore_DB	Replicate_Do_Table	Replicate_Ignore_Table	Replicate_Wild_Do_Table	Replicate_Wild_Ignore_Table	Last_Errno	Last_Error	Skip_Counter	Exec_Master_Log_Pos	Relay_Log_Space	Until_Condition	Until_Log_File	Until_Log_Pos	Master_SSL_Allowed	Master_SSL_CA_File	Master_SSL_CA_Path	Master_SSL_Cert	Master_SSL_Cipher	Master_SSL_Key	Seconds_Behind_Master	Master_SSL_Verify_Server_Cert	Last_IO_Errno	Last_IO_Error	Last_SQL_Errno	Last_SQL_Error	Replicate_Ignore_Server_Ids	Master_Server_Id	Master_SSL_Crl	Master_SSL_Crlpath	Using_Gtid	Gtid_IO_Pos	Replicate_Do_Domain_Ids	Replicate_Ignore_Domain_Ids	Parallel_Mode	SQL_Delay	SQL_Remaining_Delay	Slave_SQL_Running_State	Slave_DDL_Groups	Slave_Non_Transactional_Groups	Slave_Transactional_Groups	Retried_transactions	Max_relay_log_size	Executed_log_entries	Slave_received_heartbeats	Slave_heartbeat_period	Gtid_Slave_Pos
  slave1	Slave has read all relay log; waiting for more updates	Waiting for master to send event	127.0.0.1	root	MYPORT_1	60	master-bin.000001	<read_master_log_pos>	mysqld-relay-bin-slave1.000002	<relay_log_pos>	master-bin.000001	Yes	Yes							0		0	<read_master_log_pos>	<relay_log_space1>	None		0	No						0	No	0		0			1			No				optimistic	0	NULL	Slave has read all relay log; waiting for more updates	0	0	0	0	1073741824	7	0	60.000	
  slave2	Slave has read all relay log; waiting for more updates	Waiting for master to send event	127.0.0.1	root	MYPORT_2	60	master-bin.000001	<read_master_log_pos>	mysqld-relay-bin-slave2.000002	<relay_log_pos>	master-bin.000001	Yes	Yes							0		0	<read_master_log_pos>	<relay_log_space1>	None		0	No						0	No	0		0			2			No				optimistic	0	NULL	Slave has read all relay log; waiting for more updates	0	0	0	0	1073741824	7	0	60.000	
+#
+# MDEV:16437: merge 5.7 P_S replication instrumentation and tables
+#
+select * from performance_schema.replication_connection_configuration;
+CONNECTION_NAME	HOST	PORT	USER	USING_GTID	SSL_ALLOWED	SSL_CA_FILE	SSL_CA_PATH	SSL_CERTIFICATE	SSL_CIPHER	SSL_KEY	SSL_VERIFY_SERVER_CERTIFICATE	SSL_CRL_FILE	SSL_CRL_PATH	CONNECTION_RETRY_INTERVAL	CONNECTION_RETRY_COUNT	HEARTBEAT_INTERVAL	IGNORE_SERVER_IDS	REPL_DO_DOMAIN_IDS	REPL_IGNORE_DOMAIN_IDS
+slave2	#	#	root	NO	NO						NO			60	86400	60.000			
+slave1	#	#	root	NO	NO						NO			60	86400	60.000			
Isn't the host always 127.0.0.1 ? Why to mask it?


Sure.

Also, may be it'd be easier to read the result, if you print it vertically?
It looks that CONNECTION_RETRY_COUNT is 86400.
And 86400 is clearly a timeout, not a retry count.

Will use "--query_vertical". Thank you.


Regarding the timeout, actually the above table just displays the user specified connection configuration.

In case, user has not provided any value for 'CONNECTION_RETRY_COUNT' it will hold the default value(86400).

Please refer the following link.

https://mariadb.com/kb/en/mysqld-options/#-master-retry-count


  start all slaves;
+#
+# MDEV:16437: merge 5.7 P_S replication instrumentation and tables
+#
+select * from performance_schema.replication_applier_status_by_coordinator;
+CONNECTION_NAME	THREAD_ID	SERVICE_STATE	LAST_SEEN_TRANSACTION	LAST_ERROR_NUMBER	LAST_ERROR_MESSAGE	LAST_ERROR_TIMESTAMP	LAST_TRANS_RETRY_COUNT
+slave2	#	ON		0		0000-00-00 00:00:00	0
+slave1	#	ON		0		0000-00-00 00:00:00	0
  stop slave 'slave1';
  show slave 'slave1' status;
  Slave_IO_State	
diff --git a/mysql-test/suite/rpl/include/rpl_deadlock.test b/mysql-test/suite/rpl/include/rpl_deadlock.test
index e9191d5fcd8..bccbe044a36 100644
--- a/mysql-test/suite/rpl/include/rpl_deadlock.test
+++ b/mysql-test/suite/rpl/include/rpl_deadlock.test
@@ -59,6 +59,16 @@ let $status_var_comparsion= >;
  connection slave;
  SELECT COUNT(*) FROM t2;
  COMMIT;
+
+--echo
+--echo # Test that the performance schema coulumn shows > 0 values.
+--echo
+
+--let $assert_text= current number of retries should be more than the value saved before deadlock.
+--let $assert_cond= [SELECT COUNT_TRANSACTIONS_RETRIES FROM performance_schema.replication_applier_status, COUNT_TRANSACTIONS_RETRIES, 1] > "$slave_retried_transactions"
+--source include/assert.inc
what's wrong with simple

   SELECT COUNT_TRANSACTIONS_RETRIES > $slave_retried_transactions FROM performance_schema.replication_applier_status

?


'asserts' are preferred by upstream team and they find asserts to be more readable rather than comparing SELECT OUTPUTS from result files.

Hence you will find this pattern. Should I replace them with SELECTS? Please let me know.

+
+source include/check_slave_is_running.inc;
  sync_with_master;
# Check the data
diff --git a/mysql-test/suite/rpl/r/rpl_perfschema_connect_config.result b/mysql-test/suite/rpl/r/rpl_perfschema_connect_config.result
new file mode 100644
index 00000000000..4ace84ffac4
--- /dev/null
+++ b/mysql-test/suite/rpl/r/rpl_perfschema_connect_config.result
@@ -0,0 +1,124 @@
+include/master-slave.inc
+[connection master]
+# Asserted this: On master, the table should return an empty set.
+connection slave;
+
+# Verify that SELECT works for every field and produces an output
+# similar to the corresponding field in SHOW SLAVE STATUS(SSS).
+
+include/assert.inc [Value returned by SSS and PS table for Host should be same.]
+include/assert.inc [Value returned by SSS and PS table for Port should be same.]
+include/assert.inc [Value returned by SSS and PS table for User should be same.]
+include/assert.inc [Value returned by SSS and PS table for Using_Gtid should be same.]
+include/assert.inc [Value returned by SSS and PS table for SSL_Allowed should be same.]
+include/assert.inc [Value returned by SSS and PS table for SSL_CA_File should be same.]
+include/assert.inc [Value returned by SSS and PS table for SSL_CA_Path should be same.]
+include/assert.inc [Value returned by SSS and PS table for SSL_Certificate should be same.]
+include/assert.inc [Value returned by SSS and PS table for SSL_Cipher should be same.]
+include/assert.inc [Value returned by SSS and PS table for SSL_Key should be same.]
+include/assert.inc [Value returned by SSS and PS table for SSL_Verify_Server_Certificate should be same.]
+include/assert.inc [Value returned by SSS and PS table for SSL_Crl_File should be same.]
+include/assert.inc [Value returned by SSS and PS table for SSL_Crl_Path should be same.]
+include/assert.inc [Value returned by SSS and PS table for Connection_Retry_Interval should be same.]
+include/assert.inc [Value returned by PS table for Connection_Retry_Count should be 10.]
this is just unreadable

+
+# Heartbeat_Interval is part of I_S and P_S. We will compare the
+# two to make sure both match.
...
diff --git a/mysql-test/suite/rpl/t/rpl_perfschema_applier_config.test b/mysql-test/suite/rpl/t/rpl_perfschema_applier_config.test
new file mode 100644
index 00000000000..132d9912222
--- /dev/null
+++ b/mysql-test/suite/rpl/t/rpl_perfschema_applier_config.test
@@ -0,0 +1,96 @@
+# ==== Purpose ====
+#
+# This test script serves as the functionality testing for the table
+# performance_schema.replication_applier_configuration. Test for ddl and dml
+# operations is a part of the perfschema suite. The ddl/dml tests are named:
+# 1) ddl_replication_applier_configuration.test and
+# 2) dml_replication_applier_configuration.test.
+#
+# The follwing scenarios are tested in this script:
+#
+#  - Verify that output is same as SSS on a fresh slave.
+#  - Verify that the value of this field is correct after STOP SLAVE.
+#  - Verify that, when desired delay is set, the value is shown corectly.
+#  - Verify that the value is preserved after STOP SLAVE.
+#  - Verify that, when desired delay is reset, the value is shown corectly.
+#
+#  ==== Related Worklog ====
+#
+#  MDEV-16437: merge 5.7 P_S replication instrumentation and tables
+#
+
+source include/master-slave.inc;
+source include/have_binlog_format_mixed.inc;
master-slave should be included last


Sure.

+
+let $assert_text= On master, the table should return an empty set.;
+let $assert_cond= count(*) = 0 from performance_schema.replication_applier_configuration;
+source include/assert.inc;
again, what's wrong with just

   select * from performance_schema.replication_applier_configuration;

or, may be, if you want to be very explicit

  select count(*) as 'must be 0' from performance_schema.replication_applier_configuration;

+
+--connection slave
+
+--echo
+--echo # Verify that SELECT works and produces an output similar to
+--echo # the corresponding field in SHOW SLAVE STATUS(SSS) in all scenarios.
+--echo
+
+--echo
+--echo # Verify that output is same as SSS on a fresh slave.
+--echo
+
+let $sss_value= query_get_value(SHOW SLAVE STATUS, SQL_Delay, 1);
+let $ps_value= query_get_value(select Desired_Delay from performance_schema.replication_applier_configuration, Desired_Delay, 1);
+let $assert_text= Value returned by SSS and PS table for Desired_Delay should be same.;
+let $assert_cond= "$sss_value" = "$ps_value";
+source include/assert.inc;
I'll stop commenting on every assert.inc, but please, please, stop overusing them.

+
+--echo
+--echo # Verify that the value of this field is correct after STOP SLAVE.
+--echo
+
+source include/stop_slave.inc;
+
+let $sss_value= query_get_value(SHOW SLAVE STATUS, SQL_Delay, 1);
+let $ps_value= query_get_value(select Desired_Delay from performance_schema.replication_applier_configuration, Desired_Delay, 1);
+let $assert_text= Value returned by SSS and PS table for Desired_Delay should be same.;
+let $assert_cond= "$sss_value" = "$ps_value";
+source include/assert.inc;
+
+--echo
+--echo # Verify that, when desired delay is set, the value is shown corectly.
+--echo
+
+eval change master to master_delay= 2;
+source include/start_slave.inc;
+
+let $sss_value= query_get_value(SHOW SLAVE STATUS, SQL_Delay, 1);
+let $ps_value= query_get_value(select Desired_Delay from performance_schema.replication_applier_configuration, Desired_Delay, 1);
+let $assert_text= Value returned by SSS and PS table for Desired_Delay should be same.;
+let $assert_cond= "$sss_value" = "$ps_value";
+source include/assert.inc;
+
+--echo
+--echo # Verify that the value is preserved after STOP SLAVE.
+--echo
+
+source include/stop_slave.inc;
+
+let $ss_value= query_get_value(SHOW SLAVE STATUS, SQL_Delay, 1);
+let $ps_value= query_get_value(select Desired_Delay from performance_schema.replication_applier_configuration, Desired_Delay, 1);
+let $assert_text= Value returned by SSS and PS table for Desired_Delay should be same.;
+let $assert_cond= "$sss_value" = "$ps_value";
+source include/assert.inc;
+
+--echo
+--echo # Verify that, when desired delay is reset, the value is shown corectly.
+--echo
+
+eval change master to master_delay= 0;
+source include/start_slave.inc;
+
+let $sss_value= query_get_value(SHOW SLAVE STATUS, SQL_Delay, 1);
+let $ps_value= query_get_value(select Desired_Delay from performance_schema.replication_applier_configuration, Desired_Delay, 1);
+let $assert_text= Value returned by SSS and PS table for Desired_Delay should be same.;
+let $assert_cond= "$sss_value" = "$ps_value";
+source include/assert.inc;
+
+source include/rpl_end.inc;
diff --git a/mysql-test/suite/rpl/t/rpl_perfschema_applier_status.test b/mysql-test/suite/rpl/t/rpl_perfschema_applier_status.test
new file mode 100644
index 00000000000..52ee14cef2a
--- /dev/null
+++ b/mysql-test/suite/rpl/t/rpl_perfschema_applier_status.test
@@ -0,0 +1,72 @@
+# ==== Purpose ====
+#
+# This test script serves as the functionality testing for the table
+# performance_schema.replication_applier_status. Test for ddl and dml
+# operations is a part of the perfschema suite. The ddl/dml tests are named:
+# 1) ddl_replication_applier_status.test and
+# 2) dml_replication_applier_status.test.
+#
+# The follwing scenarios are tested in this script:
+#
+#  - Verify that output is same as SSS on a fresh slave.
+#  - Verify that the value of this field is correct after STOP SLAVE.
+#  - Remaining delay is not tested.
+#  - Count_trnsaction is partially tested here making sure it can be queried.
+#    More testing in rpl/include/rpl_deadlock.test
+#
+#  ==== Related Worklog ====
+#
+# MDEV-16437: merge 5.7 P_S replication instrumentation and tables
+#
+
+source include/master-slave.inc;
+source include/have_binlog_format_mixed.inc;
master-slave must always be last (I'll stop commenting on that too)


Sure.

+
+let $assert_text= On master, the table should return an empty set.;
+let $assert_cond= count(*) = 0 from performance_schema.replication_applier_status;
+source include/assert.inc;
+
+--connection slave
+
+--echo
+--echo # Verify that SELECT works and produces an output similar to
+--echo # the corresponding field in SHOW SLAVE STATUS(SSS) in all scenarios.
+--echo
+
+--echo
+--echo # Verify that output is same as SSS on a fresh slave.
+--echo
+
+let $sss_value= query_get_value(SHOW SLAVE STATUS, Slave_SQL_Running, 1);
+let $ps_value= query_get_value(select Service_State from performance_schema.replication_applier_status, Service_State, 1);
+let $assert_text= SSS shows Slave_SQL_Running as "Yes". So, Service_State from this PS table should be "ON".;
+let $assert_cond= "$sss_value" = "Yes" AND "$ps_value"= "ON";
+source include/assert.inc;
+
+let $ss_value= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
+let $ps_value= query_get_value(select count_transactions_retries from performance_schema.replication_applier_status, count_transactions_retries, 1);
+let $assert_text= COUNT_TRANSACTION_RETRIES should be equal to Slave_retried_transactions.;
+let $assert_cond= "$ps_value"= "$ss_value";
+source include/assert.inc;
+
+--echo
+--echo # Verify that the fields show the correct values after STOP SLAVE.
+--echo
+
+source include/stop_slave.inc;
+
+let $sss_value= query_get_value(SHOW SLAVE STATUS, Slave_SQL_Running, 1);
+let $ps_value= query_get_value(select Service_State from performance_schema.replication_applier_status, Service_State, 1);
+let $assert_text= SSS shows Slave_SQL_Running as "No". So, Service_State from this PS table should be "OFF".;
+let $assert_cond= "$sss_value" = "No" AND "$ps_value"= "OFF";
+source include/assert.inc;
+
+let $ss_value= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
+let $ps_value= query_get_value(select count_transactions_retries from performance_schema.replication_applier_status, count_transactions_retries, 1);
+let $assert_text= COUNT_TRANSACTION_RETRIES should be equal to Slave_retried_transactions.;
+let $assert_cond= "$ps_value"= "$ss_value";
+source include/assert.inc;
+
+source include/start_slave.inc;
+source include/rpl_end.inc;
+
diff --git a/sql/rpl_mi.h b/sql/rpl_mi.h
index 4d47689ac18..946d138d618 100644
--- a/sql/rpl_mi.h
+++ b/sql/rpl_mi.h
@@ -50,13 +57,6 @@ class Domain_id_filter
    */
    DYNAMIC_ARRAY m_domain_ids[2];
-public:
-  /* domain id list types */
-  enum enum_list_type {
-    DO_DOMAIN_IDS= 0,
-    IGNORE_DOMAIN_IDS
-  };
-
Why did you move it up? Does it make any difference?


Actually I didn't move, I changed the access specifier of 'm_domain_ids[2] from 'private' to 'public',

so that I can use it from PFS code as shown below.

File:table_replication_connection_configuration.cc

convert_array_to_str(&mi->domain_id_filter.m_domain_ids[Domain_id_filter::DO_DOMAIN_IDS]);
convert_array_to_str(&mi->domain_id_filter.m_domain_ids[Domain_id_filter::IGNORE_DOMAIN_IDS]);


    Domain_id_filter();
~Domain_id_filter();
diff --git a/storage/perfschema/table_replication_applier_status_by_coordinator.cc b/storage/perfschema/table_replication_applier_status_by_coordinator.cc
index beb8620b240..30cf56ce0c2 100644
--- a/storage/perfschema/table_replication_applier_status_by_coordinator.cc
+++ b/storage/perfschema/table_replication_applier_status_by_coordinator.cc
@@ -55,12 +55,14 @@ table_replication_applier_status_by_coordinator::m_share=
    sizeof(pos_t), /* ref length */
    &m_table_lock,
    { C_STRING_WITH_LEN("CREATE TABLE replication_applier_status_by_coordinator("
-  "CHANNEL_NAME CHAR(64) collate utf8_general_ci not null,"
+  "CONNECTION_NAME VARCHAR(256) collate utf8_general_ci not null,"
please, don't rename existing columns, let's keep calling it CHANNEL_NAME everywhere.


Sure.


    "THREAD_ID BIGINT UNSIGNED,"
    "SERVICE_STATE ENUM('ON','OFF') not null,"
+  "LAST_SEEN_TRANSACTION CHAR(57) not null,"
I think it's better to put new columns at the end.


Sure.


    "LAST_ERROR_NUMBER INTEGER not null,"
    "LAST_ERROR_MESSAGE VARCHAR(1024) not null,"
-  "LAST_ERROR_TIMESTAMP TIMESTAMP(0) not null)") },
+  "LAST_ERROR_TIMESTAMP TIMESTAMP(0) not null,"
+  "LAST_TRANS_RETRY_COUNT INTEGER not null)") },
    false  /* perpetual */
  };
@@ -104,15 +106,7 @@ int table_replication_applier_status_by_coordinator::rnd_next(void)
    {
      mi= (Master_info *)my_hash_element(&master_info_index->master_info_hash, m_pos.m_index);
- /*
-      Construct and display SQL Thread's (Coordinator) information in
-      'replication_applier_status_by_coordinator' table only in the case of
-      multi threaded slave mode. Code should do nothing in the case of single
-      threaded slave mode. In case of single threaded slave mode SQL Thread's
-      status will be reported as part of
-      'replication_applier_status_by_worker' table.
-    */
is this no longer true?


Yes. Irrespective of single or multi threaded slave mode,

Worker thread information will be available in : replication_applier_status_by_worker (If slave_parallel_threads > 0)

Co-ordinator thread information will be available in : replication_applier_status_by_coordinator (For both single and multi threaded case)


-    if (mi && mi->host[0] && /*mi->rli.get_worker_count() > */ 0)
+    if (mi && mi->host[0])
      {
        make_row(mi);
        m_next_pos.set_after(&m_pos);
@@ -147,13 +141,20 @@ int table_replication_applier_status_by_coordinator::rnd_pos(const void *pos)
  void table_replication_applier_status_by_coordinator::make_row(Master_info *mi)
  {
    m_row_exists= false;
+  rpl_gtid gtid;
+  char buf[10+1+10+1+20+1];
+  String str(buf, sizeof(buf), system_charset_info);
There's a special template for it:

   StringBuffer<10+1+10+1+20+1> str;


Sure.


+  bool first= true;
+
+  str.length(0);
not needed if you use StringBuffer<>


Sure.


DBUG_ASSERT(mi != NULL); mysql_mutex_lock(&mi->rli.data_lock); - m_row.channel_name_length= static_cast<uint>(mi->connection_name.length);
-  memcpy(m_row.channel_name, mi->connection_name.str, m_row.channel_name_length);
+  gtid= mi->rli.last_seen_gtid;
+  m_row.connection_name_length= static_cast<uint>(mi->connection_name.length);
+  memcpy(m_row.connection_name, mi->connection_name.str, m_row.connection_name_length);
if (mi->rli.slave_running)
    {
@@ -175,6 +176,18 @@ void table_replication_applier_status_by_coordinator::make_row(Master_info *mi)
    else
      m_row.service_state= PS_RPL_NO;
+ if ((gtid.seq_no > 0 &&
+        !rpl_slave_state_tostring_helper(&str, &gtid, &first)))
+  {
+    strmake(m_row.last_seen_transaction,str.ptr(), str.length());
+    m_row.last_seen_transaction_length= str.length();
+  }
+  else
+  {
+    m_row.last_seen_transaction_length= 0;
+    memcpy(m_row.last_seen_transaction, "", 1);
a strange way to write m_row.last_seen_transaction[0]= 0, but ok :)

+  }
+
    mysql_mutex_lock(&mi->rli.err_lock);
m_row.last_error_number= (long int) mi->rli.last_error().number;
Regards,
Sergei
VP of MariaDB Server Engineering
andsecurity@xxxxxxxxxxx

Please provide your suggestions on above replies.


Thank you

S.Sujatha





Follow ups

References