← Back to team overview

maria-developers team mailing list archive

Re: [Commits] b2f45ba: MDEV-7324 - Lock-free hash for table definition cache

 

Hi, Sergey!

On Dec 22, svoj@xxxxxxxxxxx wrote:
> revision-id: b2f45ba8280c97f8dc8b1adb3234cf2e28e48982
> parent(s): 4a469b0ed68bb341e1a1e1ba540a45a3b6ff3ef9
> committer: Sergey Vojtovich
> branch nick: 10.1
> timestamp: 2014-12-22 19:56:48 +0400
> message:
> 
> MDEV-7324 - Lock-free hash for table definition cache

That looked good to me.
I didn't find anything obviously wrong.
There are mostly questions below.
Some are style suggestions (like, renaming of methods, etc).
The only place I'm not sure about is early lf_hash_search_unpin()
in tdc_acquire_share().

> diff --git a/mysys/lf_hash.c b/mysys/lf_hash.c
> --- a/mysys/lf_hash.c
> +++ b/mysys/lf_hash.c
> @@ -122,7 +122,7 @@ static int lfind(LF_SLIST * volatile *head, CHARSET_INFO *cs, uint32 hashnr,
>      {
>        if (unlikely(callback))
>        {
> -        if (callback(cursor->curr + 1, (void*)key))
> +        if (cur_hashnr & 1 && callback(cursor->curr + 1, (void*)key))

Sorry for this.

>            return 1;
>        }
>        else if (cur_hashnr >= hashnr)
> diff --git a/sql/sql_class.cc b/sql/sql_class.cc
> --- a/sql/sql_class.cc
> +++ b/sql/sql_class.cc
> @@ -912,17 +912,17 @@ bool Drop_table_error_handler::handle_condition(THD *thd,
>  #endif /* defined(ENABLED_DEBUG_SYNC) */
>     wait_for_commit_ptr(0),
>     main_da(0, false, false),
> -   m_stmt_da(&main_da)
> +   m_stmt_da(&main_da),
>  #ifdef WITH_WSREP
> -  ,
>     wsrep_applier(is_wsrep_applier),
>     wsrep_applier_closing(false),
>     wsrep_client_thread(false),
>     wsrep_apply_toi(false),
>     wsrep_po_handle(WSREP_PO_INITIALIZER),
>     wsrep_po_cnt(0),
> -   wsrep_apply_format(0)
> +   wsrep_apply_format(0),
>  #endif
> +  tdc_hash_pins(0)

Eh... I actually wanted wsrep related data to the at the end of the THD.
Just to keep the rest of the THD layout identical for wsrep and non-wsrep
builds. In case a wsrep-enabled plugin is loaded into non-wsrep server or
visa versa. That was an early change, though, I've done more cleanups after
that and moved wsrep into a service, so perhaps (even, most probably)
this is not needed anymore. But I'd better stay on the safe side and keep
wsrep at the end of THD.

>  {
>    ulong tmp;
>  
> @@ -1698,6 +1698,8 @@ void THD::cleanup(void)
>  
>    free_root(&main_mem_root, MYF(0));
>    main_da.free_memory();
> +  if (tdc_hash_pins)
> +    lf_hash_put_pins(tdc_hash_pins);

no tdc_hash_pins=0 here?

>    if (status_var.memory_used != 0)
>    {
>      DBUG_PRINT("error", ("memory_used: %lld", status_var.memory_used));
> diff --git a/sql/table.h b/sql/table.h
> --- a/sql/table.h
> +++ b/sql/table.h
> @@ -611,32 +612,7 @@ struct TABLE_SHARE
>    mysql_mutex_t LOCK_ha_data;           /* To protect access to ha_data */
>    mysql_mutex_t LOCK_share;             /* To protect TABLE_SHARE */
>  
> -  typedef I_P_List <TABLE, TABLE_share> TABLE_list;
> -  typedef I_P_List <TABLE, All_share_tables> All_share_tables_list;
> -  struct
> -  {
> -    /**
> -      Protects ref_count, m_flush_tickets, all_tables, free_tables, flushed,
> -      all_tables_refs.
> -    */
> -    mysql_mutex_t LOCK_table_share;
> -    mysql_cond_t COND_release;
> -    TABLE_SHARE *next, **prev;            /* Link to unused shares */
> -    uint ref_count;                       /* How many TABLE objects uses this */
> -    uint all_tables_refs;                 /* Number of refs to all_tables */
> -    /**
> -      List of tickets representing threads waiting for the share to be flushed.
> -    */
> -    Wait_for_flush_list m_flush_tickets;
> -    /*
> -      Doubly-linked (back-linked) lists of used and unused TABLE objects
> -      for this share.
> -    */
> -    All_share_tables_list all_tables;
> -    TABLE_list free_tables;
> -    ulong version;
> -    bool flushed;
> -  } tdc;
> +  TDC_element *tdc;

Hmm. Ok. I think I like this change - TABLE_SHARE now contains table metadata
and everything related to tdc is moved to TDC_element. Right?

>  
>    LEX_CUSTRING tabledef_version;
>  
> diff --git a/sql/table.cc b/sql/table.cc
> --- a/sql/table.cc
> +++ b/sql/table.cc
> @@ -3845,11 +3844,11 @@ bool TABLE_SHARE::visit_subgraph(Wait_for_flush *wait_for_flush,
>      because we won't try to acquire tdc.LOCK_table_share while
>      holding a write-lock on MDL_lock::m_rwlock.
>    */
> -  mysql_mutex_lock(&tdc.LOCK_table_share);
> -  tdc.all_tables_refs++;
> -  mysql_mutex_unlock(&tdc.LOCK_table_share);
> +  mysql_mutex_lock(&tdc->LOCK_table_share);
> +  tdc->all_tables_refs++;
> +  mysql_mutex_unlock(&tdc->LOCK_table_share);

Do I understand it correctly that a TDC_element can never be deleted from
LF_HASH as long as at least one TABLE_SHARE references it?

>  
> -  All_share_tables_list::Iterator tables_it(tdc.all_tables);
> +  TDC_element::All_share_tables_list::Iterator tables_it(tdc->all_tables);
>  
>    /*
>      In case of multiple searches running in parallel, avoid going
> @@ -3946,21 +3945,10 @@ bool TABLE_SHARE::wait_for_old_version(THD *thd, struct timespec *abstime,
>  
>    mdl_context->done_waiting_for();
>  
> -  mysql_mutex_lock(&tdc.LOCK_table_share);
> -
> -  tdc.m_flush_tickets.remove(&ticket);
> -
> -  if (tdc.m_flush_tickets.is_empty() && tdc.ref_count == 0)
> -  {
> -    /*
> -      If our thread was the last one using the share,
> -      we must destroy it here.
> -    */
> -    mysql_mutex_unlock(&tdc.LOCK_table_share);
> -    destroy();
> -  }
> -  else
> -    mysql_mutex_unlock(&tdc.LOCK_table_share);
> +  mysql_mutex_lock(&tdc->LOCK_table_share);
> +  tdc->m_flush_tickets.remove(&ticket);
> +  mysql_cond_broadcast(&tdc->COND_release);
> +  mysql_mutex_unlock(&tdc->LOCK_table_share);

Why is the logic here different?

>  
>  
>    /*
> diff --git a/sql/table_cache.h b/sql/table_cache.h
> --- a/sql/table_cache.h
> +++ b/sql/table_cache.h
> @@ -16,6 +16,171 @@
>     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
>  
>  
> +extern PSI_mutex_key key_TABLE_SHARE_LOCK_table_share;
> +extern PSI_cond_key key_TABLE_SHARE_COND_release;
> +
> +class TDC_element
> +{
> +public:
> +  uchar m_key[NAME_LEN + 1 + NAME_LEN + 1];
> +  uint m_key_length;
> +  ulong version;
> +  bool flushed;
> +  TABLE_SHARE *share;
> +
> +  typedef I_P_List <TABLE, TABLE_share> TABLE_list;
> +  typedef I_P_List <TABLE, All_share_tables> All_share_tables_list;
> +  /**
> +    Protects ref_count, m_flush_tickets, all_tables, free_tables, flushed,
> +    all_tables_refs.
> +  */
> +  mysql_mutex_t LOCK_table_share;
> +  mysql_cond_t COND_release;
> +  TDC_element *next, **prev;            /* Link to unused shares */
> +  uint ref_count;                       /* How many TABLE objects uses this */
> +  uint all_tables_refs;                 /* Number of refs to all_tables */
> +  /**
> +    List of tickets representing threads waiting for the share to be flushed.
> +  */
> +  Wait_for_flush_list m_flush_tickets;
> +  /*
> +    Doubly-linked (back-linked) lists of used and unused TABLE objects
> +    for this share.
> +  */
> +  All_share_tables_list all_tables;
> +  TABLE_list free_tables;
> +
> +  TDC_element() {}
> +
> +  TDC_element(const char *key, uint key_length) : m_key_length(key_length)
> +  {
> +    memcpy(m_key, key, key_length);
> +  }
> +
> +
> +  void dbug_release_assertions()

Somewhat confusing name. If I hadn't see the code below
I'd thought this function releases some assertions (how? why?)
may be assert_correctness_at_release() would be less ambiguous?

> +  {
> +    DBUG_ASSERT(share == 0);
> +    DBUG_ASSERT(ref_count == 0);
> +    DBUG_ASSERT(m_flush_tickets.is_empty());
> +    DBUG_ASSERT(all_tables.is_empty());
> +    DBUG_ASSERT(free_tables.is_empty());
> +    DBUG_ASSERT(all_tables_refs == 0);
> +    DBUG_ASSERT(next == 0);
> +    DBUG_ASSERT(prev == 0);
> +  }
> +
> +
> +  /**
> +    Acquire TABLE object from table cache.
> +
> +    @pre share must be protected against removal.
> +
> +    Acquired object cannot be evicted or acquired again.
> +
> +    While locked:
> +    - pop object from TABLE_SHARE::tdc.free_tables
> +
> +    While unlocked:
> +    - mark object used by thd

"While locked" / "While unlocked" - what does that mean?

> +
> +    @return TABLE object, or NULL if no unused objects.
> +  */
> +
> +  TABLE *acquire_table(THD *thd)
> +  {
> +    TABLE *table;
> +
> +    mysql_mutex_lock(&LOCK_table_share);
> +    table= free_tables.pop_front();
> +    if (table)
> +    {
> +      DBUG_ASSERT(!table->in_use);

lots of asserts - good!

> +      table->in_use= thd;
> +      /* The ex-unused table must be fully functional. */
> +      DBUG_ASSERT(table->db_stat && table->file);
> +      /* The children must be detached from the table. */
> +      DBUG_ASSERT(!table->file->extra(HA_EXTRA_IS_ATTACHED_CHILDREN));

I don't even think of asking why it's HA_EXTRA_IS_ATTACHED_CHILDREN and not
HA_EXTRA_HAS_ATTACHED_CHILDREN or, at least, HA_EXTRA_CHILDREN_ARE_ATTACHED.
Ah, whatever... Horrors of the English language hidden in old MySQL code :)

> +    }
> +    mysql_mutex_unlock(&LOCK_table_share);
> +    return table;
> +  }
> +
> +
> +  /**
> +    Get last element of free_tables.
> +  */
> +
> +  TABLE *free_tables_back()

Eh, I've thought for a few seconds what could it mean to "free tables back"
until I read the function comment. You see, "free" is so often used as a verb
that one absolutely doesn't see it that here it means "free_tables' back"
even though it's also a common naming pattern (xxx_back for the list xxx).
Rename the method, may be?

> +  {
> +    TABLE_list::Iterator it(share->tdc->free_tables);
> +    TABLE *entry, *last= 0;
> +     while ((entry= it++))
> +       last= entry;
> +    return last;
> +  }
> +
> +
> +  /**
> +    Wait for MDL deadlock detector to complete traversing tdc.all_tables.
> +
> +    Must be called before updating TABLE_SHARE::tdc.all_tables.
> +  */
> +
> +  void wait_for_mdl_deadlock_detector()
> +  {
> +    while (all_tables_refs)
> +      mysql_cond_wait(&COND_release, &LOCK_table_share);
> +  }
> +
> +
> +  /**
> +    Prepeare table share for use with table definition cache.
> +  */
> +
> +  static void lf_alloc_constructor(uchar *arg)
> +  {
> +    TDC_element *element= (TDC_element*) (arg + LF_HASH_OVERHEAD);
> +    DBUG_ENTER("tdc_init_share");

intentionally? or forgot to update DBUG_ENTER?

> +    mysql_mutex_init(key_TABLE_SHARE_LOCK_table_share,
> +                     &element->LOCK_table_share, MY_MUTEX_INIT_FAST);
> +    mysql_cond_init(key_TABLE_SHARE_COND_release, &element->COND_release, 0);
> +    element->m_flush_tickets.empty();
> +    element->all_tables.empty();
> +    element->free_tables.empty();
> +    element->all_tables_refs= 0;
> +    element->share= 0;
> +    element->ref_count= 0;
> +    element->next= 0;
> +    element->prev= 0;
> +    DBUG_VOID_RETURN;
> +  }
> +
> +
> +  /**
> +    Release table definition cache specific resources of table share.
> +  */
> +
> +  static void lf_alloc_destructor(uchar *arg)
> +  {
> +    TDC_element *element= (TDC_element*) (arg + LF_HASH_OVERHEAD);
> +    DBUG_ENTER("tdc_deinit_share");

intentionally? or forgot to update DBUG_ENTER?

> +    element->dbug_release_assertions();
> +    mysql_cond_destroy(&element->COND_release);
> +    mysql_mutex_destroy(&element->LOCK_table_share);
> +    DBUG_VOID_RETURN;
> +  }
> +
> +
> +  static uchar *key(const TDC_element *element, size_t *length,
> +                    my_bool not_used __attribute__((unused)))
> +  {
> +    *length= element->m_key_length;
> +    return (uchar*) element->m_key;
> +  }
> +};
> +
> +
>  enum enum_tdc_remove_table_type
>  {
>    TDC_RT_REMOVE_ALL,
> diff --git a/sql/sql_base.cc b/sql/sql_base.cc
> --- a/sql/sql_base.cc
> +++ b/sql/sql_base.cc
> @@ -270,58 +270,75 @@ uint get_table_def_key(const TABLE_LIST *table_list, const char **key)
>      #           Pointer to list of names of open tables.
>  */
>  
> -OPEN_TABLE_LIST *list_open_tables(THD *thd, const char *db, const char *wild)
> +struct list_open_tables_arg
>  {
> +  THD *thd;
> +  const char *db;
> +  const char *wild;
>    TABLE_LIST table_list;
>    OPEN_TABLE_LIST **start_list, *open_list;
> -  TABLE_SHARE *share;
> -  TDC_iterator tdc_it;
> -  DBUG_ENTER("list_open_tables");
> +};
>  
> -  bzero((char*) &table_list,sizeof(table_list));
> -  start_list= &open_list;
> -  open_list=0;
>  
> -  tdc_it.init();
> -  while ((share= tdc_it.next()))
> +static my_bool list_open_tables_callback(TDC_element *element,
> +                                         list_open_tables_arg *arg)
>    {
> -    if (db && my_strcasecmp(system_charset_info, db, share->db.str))
> -      continue;
> -    if (wild && wild_compare(share->table_name.str, wild, 0))
> -      continue;
> +  char *db= (char*) element->m_key;
> +  char *table_name= (char*) element->m_key + strlen((char*) element->m_key) + 1;
>  
> +  if (arg->db && my_strcasecmp(system_charset_info, arg->db, db))
> +    return FALSE;
> +  if (arg->wild && wild_compare(table_name, arg->wild, 0))
> +    return FALSE;
> +
>      /* Check if user has SELECT privilege for any column in the table */
> -    table_list.db=         share->db.str;
> -    table_list.table_name= share->table_name.str;
> -    table_list.grant.privilege=0;
> +  arg->table_list.db= db;
> +  arg->table_list.table_name= table_name;
> +  arg->table_list.grant.privilege= 0;
>  
> -    if (check_table_access(thd,SELECT_ACL,&table_list, TRUE, 1, TRUE))
> -      continue;
> +  if (check_table_access(arg->thd, SELECT_ACL, &arg->table_list, TRUE, 1, TRUE))
> +    return FALSE;
>  
> -    if (!(*start_list = (OPEN_TABLE_LIST *)
> -          sql_alloc(sizeof(**start_list)+share->table_cache_key.length)))
> -    {
> -      open_list=0;                              // Out of memory
> -      break;
> -    }
> +  if (!(*arg->start_list= (OPEN_TABLE_LIST *) sql_alloc(

when you know the thd, prefer thd->alloc() over sql_alloc()
(which is, basically, current_thd->alloc())

> +                    sizeof(**arg->start_list) + element->m_key_length)))
> +    return TRUE;
> +
> -    strmov((*start_list)->table=
> -           strmov(((*start_list)->db= (char*) ((*start_list)+1)),
> -                  share->db.str)+1,
> -           share->table_name.str);
> +  strmov((*arg->start_list)->table=
> +         strmov(((*arg->start_list)->db= (char*) ((*arg->start_list) + 1)),
> +                db) + 1, table_name);
> -    (*start_list)->in_use= 0;
> +  (*arg->start_list)->in_use= 0;
> -    mysql_mutex_lock(&share->tdc.LOCK_table_share);
> -    TABLE_SHARE::All_share_tables_list::Iterator it(share->tdc.all_tables);
> +
> +  mysql_mutex_lock(&element->LOCK_table_share);
> +  TDC_element::All_share_tables_list::Iterator it(element->all_tables);
>      TABLE *table;
>      while ((table= it++))
>        if (table->in_use)
> -        ++(*start_list)->in_use;
> +      ++(*arg->start_list)->in_use;
> -    mysql_mutex_unlock(&share->tdc.LOCK_table_share);
> +  mysql_mutex_unlock(&element->LOCK_table_share);
> -    (*start_list)->locked= 0;                   /* Obsolete. */
> +  (*arg->start_list)->locked= 0;                   /* Obsolete. */
> -    start_list= &(*start_list)->next;
> +  arg->start_list= &(*arg->start_list)->next;
> -    *start_list=0;
> +  *arg->start_list= 0;
> +  return FALSE;
>    }
> -  tdc_it.deinit();
> -  DBUG_RETURN(open_list);
> +
> +
> +OPEN_TABLE_LIST *list_open_tables(THD *thd, const char *db, const char *wild)
> +{
> +  list_open_tables_arg argument;
> +  DBUG_ENTER("list_open_tables");
> +
> +  argument.thd= thd;
> +  argument.db= db;
> +  argument.wild= wild;
> +  bzero((char*) &argument.table_list, sizeof(argument.table_list));
> +  argument.start_list= &argument.open_list;
> +  argument.open_list= 0;
> +
> +  if (tdc_iterate(thd, (my_hash_walk_action) list_open_tables_callback,
> +                  &argument, true))
> +    DBUG_RETURN(0);
> +
> +  DBUG_RETURN(argument.open_list);
>  }
>  
>  /*****************************************************************************
> diff --git a/sql/table_cache.cc b/sql/table_cache.cc
> --- a/sql/table_cache.cc
> +++ b/sql/table_cache.cc
> @@ -72,32 +76,23 @@
>  /**
>    Protects unused shares list.
>  
> -  TABLE_SHARE::tdc.prev
> -  TABLE_SHARE::tdc.next
> -  oldest_unused_share
> -  end_of_unused_share
> +  TDC_element::prev
> +  TDC_element::next
> +  unused_shares
>  */
>  
>  static mysql_mutex_t LOCK_unused_shares;
> -static mysql_rwlock_t LOCK_tdc; /**< Protects tdc_hash. */
>  my_atomic_rwlock_t LOCK_tdc_atomics; /**< Protects tdc_version. */
>  
>  #ifdef HAVE_PSI_INTERFACE
> -static PSI_mutex_key key_LOCK_unused_shares, key_TABLE_SHARE_LOCK_table_share;
> +PSI_mutex_key key_LOCK_unused_shares, key_TABLE_SHARE_LOCK_table_share;

really? not static?

>  static PSI_mutex_info all_tc_mutexes[]=
>  {
>    { &key_LOCK_unused_shares, "LOCK_unused_shares", PSI_FLAG_GLOBAL },
>    { &key_TABLE_SHARE_LOCK_table_share, "TABLE_SHARE::tdc.LOCK_table_share", 0 }
>  };
>  
> -static PSI_rwlock_key key_rwlock_LOCK_tdc;
> -static PSI_rwlock_info all_tc_rwlocks[]=
> -{
> -  { &key_rwlock_LOCK_tdc, "LOCK_tdc", PSI_FLAG_GLOBAL }
> -};
> -
> -
> -static PSI_cond_key key_TABLE_SHARE_COND_release;
> +PSI_cond_key key_TABLE_SHARE_COND_release;

and here

>  static PSI_cond_info all_tc_conds[]=
>  {
>    { &key_TABLE_SHARE_COND_release, "TABLE_SHARE::tdc.COND_release", 0 }
> @@ -265,85 +269,41 @@ void tc_add_table(THD *thd, TABLE *table)
>  
>    if (need_purge)
>    {
> -    TABLE_SHARE *purge_share= 0;
> -    TABLE_SHARE *share;
> -    TABLE *entry;
> -    ulonglong UNINIT_VAR(purge_time);
> -    TDC_iterator tdc_it;
> +    tc_add_table_arg argument;
> +    argument.purge_time= ULONGLONG_MAX;
> +    tdc_iterate(thd, (my_hash_walk_action) tc_add_table_callback, &argument);
>  
> -    tdc_it.init();
> -    while ((share= tdc_it.next()))
> +    if (argument.purge_time != ULONGLONG_MAX)
>      {
> -      mysql_mutex_lock(&share->tdc.LOCK_table_share);
> -      if ((entry= tc_free_tables_back(share)) &&
> -          (!purge_share || entry->tc_time < purge_time))
> +      TDC_element *element= (TDC_element*) lf_hash_search(&tdc_hash,
> +                                                          thd->tdc_hash_pins,
> +                                                          argument.key,
> +                                                          argument.key_length);
> +      if (element)
>        {
> -          purge_share= share;
> -          purge_time= entry->tc_time;
> -      }
> -      mysql_mutex_unlock(&share->tdc.LOCK_table_share);
> -    }
> +        TABLE *entry;
> +        mysql_mutex_lock(&element->LOCK_table_share);
> +        lf_hash_search_unpin(thd->tdc_hash_pins);
> +        element->wait_for_mdl_deadlock_detector();

you wait under a mutex? hmm...

>  
> -    if (purge_share)
> -    {
> -      mysql_mutex_lock(&purge_share->tdc.LOCK_table_share);
> -      tc_wait_for_mdl_deadlock_detector(purge_share);
> -      tdc_it.deinit();
>        /*
>          It may happen that oldest table was acquired meanwhile. In this case
>          just go ahead, number of objects in table cache will normalize
>          eventually.
>        */
> -      if ((entry= tc_free_tables_back(purge_share)) &&
> -          entry->tc_time == purge_time)
> +        if ((entry= element->free_tables_back()) &&
> +            entry->tc_time == argument.purge_time)
>        {
> -        entry->s->tdc.free_tables.remove(entry);
> +          element->free_tables.remove(entry);
>          tc_remove_table(entry);
> -        mysql_mutex_unlock(&purge_share->tdc.LOCK_table_share);
> +          mysql_mutex_unlock(&element->LOCK_table_share);
>          intern_close_table(entry);
>        }
>        else
> -        mysql_mutex_unlock(&purge_share->tdc.LOCK_table_share);
> +          mysql_mutex_unlock(&element->LOCK_table_share);
>      }
> -    else
> -      tdc_it.deinit();
>    }
>  }
> -
> -
> -/**
> -  Acquire TABLE object from table cache.
> -
> -  @pre share must be protected against removal.
> -
> -  Acquired object cannot be evicted or acquired again.
> -
> -  While locked:
> -  - pop object from TABLE_SHARE::tdc.free_tables
> -
> -  While unlocked:
> -  - mark object used by thd
> -
> -  @return TABLE object, or NULL if no unused objects.
> -*/
> -
> -static TABLE *tc_acquire_table(THD *thd, TABLE_SHARE *share)
> -{
> -  TABLE *table;
> -
> -  mysql_mutex_lock(&share->tdc.LOCK_table_share);
> -  table= share->tdc.free_tables.pop_front();
> -  if (table)
> -  {
> -    DBUG_ASSERT(!table->in_use);
> -    table->in_use= thd;
> -    /* The ex-unused table must be fully functional. */
> -    DBUG_ASSERT(table->db_stat && table->file);
> -    /* The children must be detached from the table. */
> -    DBUG_ASSERT(!table->file->extra(HA_EXTRA_IS_ATTACHED_CHILDREN));
> -  }

ok, you've moved it to table_cache.h...

> -  mysql_mutex_unlock(&share->tdc.LOCK_table_share);
> -  return table;
>  }
>  
>  
> @@ -397,88 +357,76 @@ bool tc_release_table(TABLE *table)
>    */
>    table->in_use= 0;
>    /* Add table to the list of unused TABLE objects for this share. */
> -  table->s->tdc.free_tables.push_front(table);
> -  mysql_mutex_unlock(&table->s->tdc.LOCK_table_share);
> +  table->s->tdc->free_tables.push_front(table);
> +  mysql_mutex_unlock(&table->s->tdc->LOCK_table_share);
>    return false;
>  
>  purge:
> -  tc_wait_for_mdl_deadlock_detector(table->s);
> +  table->s->tdc->wait_for_mdl_deadlock_detector();
>    tc_remove_table(table);
> -  mysql_mutex_unlock(&table->s->tdc.LOCK_table_share);
> +  mysql_mutex_unlock(&table->s->tdc->LOCK_table_share);
>    table->in_use= 0;
>    intern_close_table(table);
>    return true;
>  }
>  
>  
> -extern "C" uchar *tdc_key(const uchar *record, size_t *length,
> -                          my_bool not_used __attribute__((unused)))
> -{
> -  TABLE_SHARE *entry= (TABLE_SHARE*) record;
> -  *length= entry->table_cache_key.length;
> -  return (uchar*) entry->table_cache_key.str;
> -}
> -
> -
>  /**
>    Delete share from hash and free share object.
> -
> -  @return
> -    @retval 0 Success
> -    @retval 1 Share is referenced
>  */
>  
> -static int tdc_delete_share_from_hash(TABLE_SHARE *share)
> +static void tdc_delete_share_from_hash(TDC_element *element)
>  {
> +  THD *thd= current_thd;
> +  LF_PINS *pins;
> +  TABLE_SHARE *share;
>    DBUG_ENTER("tdc_delete_share_from_hash");
> -  mysql_rwlock_wrlock(&LOCK_tdc);
> -  mysql_mutex_lock(&share->tdc.LOCK_table_share);
> -  if (--share->tdc.ref_count)
> -  {
> -    mysql_cond_broadcast(&share->tdc.COND_release);
> -    mysql_mutex_unlock(&share->tdc.LOCK_table_share);
> -    mysql_rwlock_unlock(&LOCK_tdc);
> -    DBUG_RETURN(1);
> -  }
> -  my_hash_delete(&tdc_hash, (uchar*) share);
> +
> +  share= element->share;
> +  DBUG_ASSERT(share);
> +  element->share= 0;
>    /* Notify PFS early, while still locked. */

"while still locked" what?

>    PSI_CALL_release_table_share(share->m_psi);
>    share->m_psi= 0;
> -  mysql_rwlock_unlock(&LOCK_tdc);
>  
> -  if (share->tdc.m_flush_tickets.is_empty())
> +  if (!element->m_flush_tickets.is_empty())
>    {
> -    /* No threads are waiting for this share to be flushed, destroy it. */
> -    mysql_mutex_unlock(&share->tdc.LOCK_table_share);
> -    free_table_share(share);
> -  }
> -  else
> -  {
> -    Wait_for_flush_list::Iterator it(share->tdc.m_flush_tickets);
> +    Wait_for_flush_list::Iterator it(element->m_flush_tickets);
>      Wait_for_flush *ticket;
>      while ((ticket= it++))
>        (void) ticket->get_ctx()->m_wait.set_status(MDL_wait::GRANTED);
> -    /*
> -      If there are threads waiting for this share to be flushed,
> -      the last one to receive the notification will destroy the
> -      share. At this point the share is removed from the table
> -      definition cache, so is OK to proceed here without waiting
> -      for this thread to do the work.
> -    */
> -    mysql_mutex_unlock(&share->tdc.LOCK_table_share);
> +
> +    do
> +    {
> +      mysql_cond_wait(&element->COND_release, &element->LOCK_table_share);
> +    } while (!element->m_flush_tickets.is_empty());
>    }
> -  DBUG_RETURN(0);
> +
> +  mysql_mutex_unlock(&element->LOCK_table_share);

Ok, so the caller must lock LOCK_table_share?
please put mysql_mutex_assert_owner() in the beginning of the function

> +
> +  if (thd)

When thd is NULL here? On shutdown?

> +  {
> +    fix_thd_pins(thd);
> +    pins= thd->tdc_hash_pins;
>  }
> +  else
> +    pins= lf_hash_get_pins(&tdc_hash);
>  
> +  DBUG_ASSERT(pins); // What can we do about it?
> +  element->dbug_release_assertions();
> +  lf_hash_delete(&tdc_hash, pins, element->m_key, element->m_key_length);
> +  if (!thd)
> +    lf_hash_put_pins(pins);
> +  free_table_share(share);
> +  DBUG_VOID_RETURN;
> +}
>  
> +
>  /**
>    Initialize table definition cache.
> -
> -  @retval  0  Success
> -  @retval !0  Error
>  */
>  
> -int tdc_init(void)
> +void tdc_init(void)
>  {
>    DBUG_ENTER("tdc_init");
>  #ifdef HAVE_PSI_INTERFACE
> @@ -642,26 +548,34 @@ void tdc_deinit_share(TABLE_SHARE *share)
>    Caller is expected to unlock table share with tdc_unlock_share().
>  
>    @retval  0 Share not found
> -  @retval !0 Pointer to locked table share
> +  @retval MY_ERRPTR OOM
> +  @retval ptr Pointer to locked table share
>  */
>  
> -TABLE_SHARE *tdc_lock_share(const char *db, const char *table_name)
> +TDC_element *tdc_lock_share(THD *thd, const char *db, const char *table_name)
>  {
> +  TDC_element *element;
>    char key[MAX_DBKEY_LENGTH];
> -  uint key_length;
>  
>    DBUG_ENTER("tdc_lock_share");
> -  key_length= tdc_create_key(key, db, table_name);
> +  if (fix_thd_pins(thd))
> +    DBUG_RETURN((TDC_element*) MY_ERRPTR);

I suppose you can do it in THD:init, if you'd like. A THD almost always
will need to access table definition cache, so it'll need these pins.

Then there will be no need to return MY_ERRPTR here and handle it in the
caller

>  
> -  mysql_rwlock_rdlock(&LOCK_tdc);
> -  TABLE_SHARE* share= (TABLE_SHARE*) my_hash_search(&tdc_hash,
> -                                                    (uchar*) key, key_length);
> -  if (share && !share->error)
> -    mysql_mutex_lock(&share->tdc.LOCK_table_share);
> -  else
> -    share= 0;
> -  mysql_rwlock_unlock(&LOCK_tdc);
> -  DBUG_RETURN(share);
> +  element= (TDC_element *) lf_hash_search(&tdc_hash, thd->tdc_hash_pins,
> +                                          (uchar*) key,
> +                                          tdc_create_key(key, db, table_name));
> +  if (element)
> +  {
> +    mysql_mutex_lock(&element->LOCK_table_share);
> +    if (!element->share || element->share->error)
> +    {
> +      mysql_mutex_unlock(&element->LOCK_table_share);
> +      element= 0;
> +    }
> +    lf_hash_search_unpin(thd->tdc_hash_pins);
> +  }
> +
> +  DBUG_RETURN(element);

Looks good to me

>  }
>  
>  
> @@ -702,60 +616,60 @@ TABLE_SHARE *tdc_acquire_share(THD *thd, const char *db, const char *table_name,
>                                 TABLE **out_table)
>  {
>    TABLE_SHARE *share;
> +  TDC_element *element;
>    bool was_unused;
>    DBUG_ENTER("tdc_acquire_share");
>  
> -  mysql_rwlock_rdlock(&LOCK_tdc);
> -  share= (TABLE_SHARE*) my_hash_search_using_hash_value(&tdc_hash, hash_value,
> -                                                        (uchar*) key,
> -                                                        key_length);
> -  if (!share)
> +  if (fix_thd_pins(thd))
> +    DBUG_RETURN(0);
> +
> +retry:
> +  while (!(element= (TDC_element*) lf_hash_search_using_hash_value(&tdc_hash,

Was lf_hash_search_using_hash_value added in a separate patch?

> +                    thd->tdc_hash_pins, hash_value, (uchar*) key, key_length)))
>    {
> -    TABLE_SHARE *new_share;
> -    mysql_rwlock_unlock(&LOCK_tdc);
> +    TDC_element tmp(key, key_length);
> +    int res= lf_hash_insert(&tdc_hash, thd->tdc_hash_pins, (uchar*) &tmp);
>  
> -    if (!(new_share= alloc_table_share(db, table_name, key, key_length)))
> +    if (res == -1)
>        DBUG_RETURN(0);
> -    new_share->error= OPEN_FRM_OPEN_ERROR;
> +    else if (res == 1)
> +      continue;
>  
> -    mysql_rwlock_wrlock(&LOCK_tdc);
> -    share= (TABLE_SHARE*) my_hash_search_using_hash_value(&tdc_hash, hash_value,
> -                                                          (uchar*) key,
> -                                                          key_length);
> -    if (!share)
> -    {
> -      bool need_purge;
> +    element= (TDC_element*) lf_hash_search_using_hash_value(&tdc_hash,
> +             thd->tdc_hash_pins, hash_value, (uchar*) key, key_length);
> +    lf_hash_search_unpin(thd->tdc_hash_pins);

How can you be sure that nobody frees your element after you unpin it?

> +    DBUG_ASSERT(element);
> +    element->dbug_release_assertions();
>  
> -      share= new_share;
> -      mysql_mutex_lock(&share->tdc.LOCK_table_share);
> -      if (my_hash_insert(&tdc_hash, (uchar*) share))
> +    if (!(share= alloc_table_share(db, table_name, key, key_length)))
>        {
> -        mysql_mutex_unlock(&share->tdc.LOCK_table_share);
> -        mysql_rwlock_unlock(&LOCK_tdc);
> -        free_table_share(share);
> +      lf_hash_delete(&tdc_hash, thd->tdc_hash_pins, key, key_length);
>          DBUG_RETURN(0);
>        }
> -      need_purge= tdc_hash.records > tdc_size;
> -      mysql_rwlock_unlock(&LOCK_tdc);
>  
>        /* note that tdc_acquire_share() *always* uses discovery */
>        open_table_def(thd, share, flags | GTS_USE_DISCOVERY);
> -      share->tdc.ref_count++;
> -      mysql_mutex_unlock(&share->tdc.LOCK_table_share);
>  
>        if (share->error)
>        {
> -        tdc_delete_share_from_hash(share);
> +      free_table_share(share);
> +      lf_hash_delete(&tdc_hash, thd->tdc_hash_pins, key, key_length);
>          DBUG_RETURN(0);
>        }
> -      else if (need_purge)
> +
> +    mysql_mutex_lock(&element->LOCK_table_share);
> +    element->share= share;
> +    share->tdc= element;
> +    element->ref_count++;
> +    element->version= tdc_refresh_version();
> +    element->flushed= false;
> +    mysql_mutex_unlock(&element->LOCK_table_share);
> +
>          tdc_purge(false);
>        if (out_table)
>          *out_table= 0;
>        share->m_psi= PSI_CALL_get_table_share(false, share);
>        goto end;
> -    }
> -    free_table_share(new_share);
>    }
>  
>    /* cannot force discovery of a cached share */
> @@ -763,18 +677,25 @@ TABLE_SHARE *tdc_acquire_share(THD *thd, const char *db, const char *table_name,
>  
>    if (out_table && (flags & GTS_TABLE))
>    {
> -    if ((*out_table= tc_acquire_table(thd, share)))
> +    if ((*out_table= element->acquire_table(thd)))
>      {
> -      mysql_rwlock_unlock(&LOCK_tdc);
> +      lf_hash_search_unpin(thd->tdc_hash_pins);

what this lf_hash_search_unpin corresponds to?

>        DBUG_ASSERT(!(flags & GTS_NOLOCK));
> -      DBUG_ASSERT(!share->error);
> -      DBUG_ASSERT(!share->is_view);
> -      DBUG_RETURN(share);
> +      DBUG_ASSERT(element->share);
> +      DBUG_ASSERT(!element->share->error);
> +      DBUG_ASSERT(!element->share->is_view);
> +      DBUG_RETURN(element->share);
>      }
>    }
>  
> -  mysql_mutex_lock(&share->tdc.LOCK_table_share);
> -  mysql_rwlock_unlock(&LOCK_tdc);
> +  mysql_mutex_lock(&element->LOCK_table_share);
> +  if (!(share= element->share))
> +  {
> +    mysql_mutex_unlock(&element->LOCK_table_share);
> +    lf_hash_search_unpin(thd->tdc_hash_pins);

and here

> +    goto retry;
> +  }
> +  lf_hash_search_unpin(thd->tdc_hash_pins);

may be that first lf_hash_search_unpin was wrong?
the one that goes directly after lf_hash_search.

>  
>    /*
>       We found an existing table definition. Return it if we didn't get
> @@ -983,13 +864,36 @@ bool tdc_remove_table(THD *thd, enum_tdc_remove_table_type remove_type,
>                thd->mdl_context.is_lock_owner(MDL_key::TABLE, db, table_name,
>                                               MDL_EXCLUSIVE));
>  
> -  if ((share= tdc_delete_share(db, table_name)))
> +
> +  mysql_mutex_lock(&LOCK_unused_shares);
> +  if (!(element= tdc_lock_share(thd, db, table_name)))
>    {
> -    I_P_List <TABLE, TABLE_share> purge_tables;
> -    uint my_refs= 1;
> +    mysql_mutex_unlock(&LOCK_unused_shares);
> +    DBUG_ASSERT(remove_type != TDC_RT_REMOVE_NOT_OWN_KEEP_SHARE);
> +    DBUG_RETURN(false);
> +  }
>  
> -    mysql_mutex_lock(&share->tdc.LOCK_table_share);
> -    tc_wait_for_mdl_deadlock_detector(share);
> +  DBUG_ASSERT(element != MY_ERRPTR); // What can we do about it?

allocate thd->tdc_hash_pins in THD::init

> +
> +  if (!element->ref_count)
> +  {
> +    if (element->prev)
> +    {
> +      unused_shares.remove(element);
> +      /* Concurrent thread may start using share again, reset prev and next. */

What concurrent thread? element->ref_count is 0, you removed
the the share from the unused_shares list. There can be no concurrent
threads, as far as I understand. And indeed, you do remove the share
from the hash (which does free_table_share) immediately below,
which also proves that there can be no concurrent threads.

> +      element->prev= 0;
> +      element->next= 0;
> +    }
> +    mysql_mutex_unlock(&LOCK_unused_shares);
> +
> +    tdc_delete_share_from_hash(element);
> +    DBUG_RETURN(true);
> +  }
> +  mysql_mutex_unlock(&LOCK_unused_shares);
> +
> +  element->ref_count++;
> +
> +  element->wait_for_mdl_deadlock_detector();
>      /*
>        Mark share flushed in order to ensure that it gets
>        automatically deleted once it is no longer referenced.

Regards,
Sergei


Follow ups