← Back to team overview

maria-developers team mailing list archive

Re: d25439f1cbb: MDEV-31140: FLUSH BINARY LOGS DELETE_DOMAIN_ID=(D) can errorneously delete active domains

 

Hi Andrei,

Want to review a small patch for https://jira.mariadb.org/browse/MDEV-31140 ?

I found this problem while trying to find the root cause of
https://jira.mariadb.org/browse/MDEV-30386 .

However this bug seems to be unrelated to MDEV-30386. Basically the logic
for comparing the gtid_binlog_state with the GTID_LIST event was flawed,
making it possible to delete a domain_id from the binlog state when this
should not be allowed.

 - Kristian.

Kristian Nielsen <knielsen@xxxxxxxxxxxxxxx> writes:

> revision-id: d25439f1cbb79e5467b4249792130bfe524e3f10 (mariadb-10.11.2-21-gd25439f1cbb)
> parent(s): 656c2e18b1e9ea5d0314745f3988d126eedbc22a
> author: Kristian Nielsen
> committer: Kristian Nielsen
> timestamp: 2023-04-27 12:16:07 +0200
> message:
>
> MDEV-31140: FLUSH BINARY LOGS DELETE_DOMAIN_ID=(D) can errorneously delete active domains
>
> Fix the code in rpl_binlog_state::drop_domain(), so that _all_ entries for
> the domain in the binlog state must match an entry in the initial GTID_LIST,
> not just one entry match.
>
> Signed-off-by: Kristian Nielsen <knielsen@xxxxxxxxxxxxxxx>
>
> ---
>  .../r/binlog_flush_binlogs_delete_domain.result    | 14 ++++++++++---
>  .../t/binlog_flush_binlogs_delete_domain.test      | 13 +++++++++++-
>  sql/rpl_gtid.cc                                    | 24 ++++++++++++----------
>  3 files changed, 36 insertions(+), 15 deletions(-)
>
> diff --git a/mysql-test/suite/binlog/r/binlog_flush_binlogs_delete_domain.result b/mysql-test/suite/binlog/r/binlog_flush_binlogs_delete_domain.result
> index fdcfb4bfa01..1c11191802f 100644
> --- a/mysql-test/suite/binlog/r/binlog_flush_binlogs_delete_domain.result
> +++ b/mysql-test/suite/binlog/r/binlog_flush_binlogs_delete_domain.result
> @@ -46,15 +46,23 @@ Warning	1076	The current gtid binlog state is incompatible with a former one mis
>  Warning	1076	The gtid domain being deleted ('1') is not in the current binlog state
>  FLUSH BINARY LOGS DELETE_DOMAIN_ID = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 0);
>  ERROR HY000: Could not delete gtid domain. Reason: binlog files may contain gtids from the domain ('1') being deleted. Make sure to first purge those files.
> +MDEV-31140: Missing error from DELETE_DOMAIN_ID when gtid_binlog_state partially matches GTID_LIST.
>  FLUSH BINARY LOGS;
>  PURGE BINARY LOGS TO 'master-bin.000005';
> +SET @@SESSION.gtid_domain_id=8;
> +SET @@SESSION.server_id=10*8 + 1;
> +INSERT INTO t SELECT 1+MAX(a) FROM t;
> +FLUSH BINARY LOGS DELETE_DOMAIN_ID = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 0);
> +ERROR HY000: Could not delete gtid domain. Reason: binlog files may contain gtids from the domain ('8') being deleted. Make sure to first purge those files.
> +FLUSH BINARY LOGS;
> +PURGE BINARY LOGS TO 'master-bin.000006';
>  FLUSH BINARY LOGS DELETE_DOMAIN_ID = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 0);
>  Warnings:
>  Warning	1076	The gtid domain being deleted ('0') is not in the current binlog state
>  Gtid_list of the current binlog does not contain 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 0:
> -show binlog events in 'master-bin.000006' limit 1,1;
> +show binlog events in 'master-bin.000007' limit 1,1;
>  Log_name	Pos	Event_type	Server_id	End_log_pos	Info
> -master-bin.000006	#	Gtid_list	1	#	[]
> +master-bin.000007	#	Gtid_list	1	#	[]
>  SET @@SESSION.gtid_domain_id=1;;
>  SET @@SESSION.server_id=1;
>  SET @@SESSION.gtid_seq_no=1;
> @@ -75,7 +83,7 @@ INSERT INTO t SET a=1;
>  SELECT @gtid_binlog_state_saved "as original state", @@GLOBAL.gtid_binlog_state as "out of order for 11 domain state";
>  as original state	out of order for 11 domain state
>  1-1-1,1-2-2,11-11-11	1-1-1,1-2-2,11-11-1
> -PURGE BINARY LOGS TO 'master-bin.000007';
> +PURGE BINARY LOGS TO 'master-bin.000008';
>  the following command succeeds with warnings
>  FLUSH BINARY LOGS DELETE_DOMAIN_ID = (1);
>  Warnings:
> diff --git a/mysql-test/suite/binlog/t/binlog_flush_binlogs_delete_domain.test b/mysql-test/suite/binlog/t/binlog_flush_binlogs_delete_domain.test
> index 8311f4bd800..1643ecff72d 100644
> --- a/mysql-test/suite/binlog/t/binlog_flush_binlogs_delete_domain.test
> +++ b/mysql-test/suite/binlog/t/binlog_flush_binlogs_delete_domain.test
> @@ -21,7 +21,6 @@ FLUSH BINARY LOGS DELETE_DOMAIN_ID = ();
>  --echo but with a warning
>  --let $binlog_pre_flush=query_get_value(SHOW MASTER STATUS, Position, 1)
>  FLUSH BINARY LOGS DELETE_DOMAIN_ID = (99);
> ---let $binlog_start=$binlog_pre_flush
>  --source include/show_binary_logs.inc
>  
>  # Log one event in a specified domain and try to delete the domain
> @@ -62,6 +61,8 @@ FLUSH BINARY LOGS DELETE_DOMAIN_ID = (1);
>  # expected overrun of the static buffers of underlying dynamic arrays is doing.
>  --let $domain_cnt=17
>  --let $server_in_domain_cnt=3
> +--let $err_domain_id=`SELECT FLOOR($domain_cnt/2)`
> +--let $err_server_id=`SELECT FLOOR($server_in_domain_cnt/2)`
>  --let $domain_list=
>  --disable_query_log
>  while ($domain_cnt)
> @@ -86,6 +87,16 @@ while ($domain_cnt)
>  --error ER_BINLOG_CANT_DELETE_GTID_DOMAIN
>  --eval FLUSH BINARY LOGS DELETE_DOMAIN_ID = ($domain_list)
>  
> +--echo MDEV-31140: Missing error from DELETE_DOMAIN_ID when gtid_binlog_state partially matches GTID_LIST.
> +FLUSH BINARY LOGS;
> +--let $purge_to_binlog= query_get_value(SHOW MASTER STATUS, File, 1)
> +--eval PURGE BINARY LOGS TO '$purge_to_binlog'
> +--eval SET @@SESSION.gtid_domain_id=$err_domain_id
> +--eval SET @@SESSION.server_id=10*$err_domain_id + $err_server_id
> +eval INSERT INTO t SELECT 1+MAX(a) FROM t;
> +--error ER_BINLOG_CANT_DELETE_GTID_DOMAIN
> +--eval FLUSH BINARY LOGS DELETE_DOMAIN_ID = ($domain_list)
> +
>  # Now satisfy the safety condtion to purge log files containing $domain list
>  FLUSH BINARY LOGS;
>  --let $purge_to_binlog= query_get_value(SHOW MASTER STATUS, File, 1)
> diff --git a/sql/rpl_gtid.cc b/sql/rpl_gtid.cc
> index c4e5c75b10a..7b67a83b3dd 100644
> --- a/sql/rpl_gtid.cc
> +++ b/sql/rpl_gtid.cc
> @@ -2209,18 +2209,16 @@ rpl_binlog_state::drop_domain(DYNAMIC_ARRAY *ids,
>  
>    /*
>      For each domain_id from ids
> -      when no such domain in binlog state
> -        warn && continue
> -      For each domain.server's last gtid
> -        when not locate the last gtid in glev.list
> -          error out binlog state can't change
> -        otherwise continue
> +      If the domain is already absent from the binlog state
> +        Warn && continue
> +      If any GTID with that domain in binlog state is missing from glev.list
> +        Error out binlog state can't change
>    */
>    for (ulong i= 0; i < ids->elements; i++)
>    {
>      rpl_binlog_state::element *elem= NULL;
>      uint32 *ptr_domain_id;
> -    bool not_match;
> +    bool all_found;
>  
>      ptr_domain_id= (uint32*) dynamic_array_ptr(ids, i);
>      elem= (rpl_binlog_state::element *)
> @@ -2235,14 +2233,18 @@ rpl_binlog_state::drop_domain(DYNAMIC_ARRAY *ids,
>        continue;
>      }
>  
> -    for (not_match= true, k= 0; k < elem->hash.records; k++)
> +    all_found= true;
> +    for (k= 0; k < elem->hash.records && all_found; k++)
>      {
>        rpl_gtid *d_gtid= (rpl_gtid *)my_hash_element(&elem->hash, k);
> -      for (ulong l= 0; l < glev->count && not_match; l++)
> -        not_match= !(*d_gtid == glev->list[l]);
> +      bool match_found= false;
> +      for (ulong l= 0; l < glev->count && !match_found; l++)
> +        match_found= match_found || (*d_gtid == glev->list[l]);
> +      if (!match_found)
> +        all_found= false;
>      }
>  
> -    if (not_match)
> +    if (!all_found)
>      {
>        sprintf(errbuf, "binlog files may contain gtids from the domain ('%u') "
>                "being deleted. Make sure to first purge those files",