MySQL 执行show slave hosts 引发服务器 coredump
author:sufei
版本:影响版本 mysql 8.0.18,mysql 5.7.21,mysql 8.0.20(修复)
现象:
在mysql数据库中执行show slave hosts引发数据库coredump。
其错误日志如下
03:20:52 UTC - mysqld got signal 11 ;
Most likely, you have hit a bug, but this error can also be caused by malfunctioning hardware.
Thread pointer: 0x2b93280008c0
Attempting backtrace. You can use the following information to find out
where mysqld died. If you see no messages after this, something went
terribly wrong...
stack_bottom = 2b92edccec80 thread_stack 0x40000
/flash/zgj/mysql-install/bin/mysqld(my_print_stacktrace(unsigned char const*, unsigned long)+0x2e) [0x1e8233e]
/flash/zgj/mysql-install/bin/mysqld(handle_fatal_signal+0x341) [0xf8af51]
/lib64/libpthread.so.0(+0xf5e0) [0x2b92752bb5e0]
/flash/zgj/mysql-install/bin/mysqld(std::_Hashtable<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::pair<std::__cxx11::basic_string<char, std::c
har_traits<char>, std::allocator<char> > const, std::unique_ptr<user_var_entry, void (*)(user_var_entry*)> >, Malloc_allocator<std::pair<std::__cxx11::basic_string<char, std::char_tr
aits<char>, std::allocator<char> > const, std::unique_ptr<user_var_entry, void (*)(user_var_entry*)> > >, std::__detail::_Select1st, Collation_key_/flash/zgj/mysql-install/bin/mysqld
(get_slave_uuid(THD*, String*)+0xa7) [0x1b49997]
/flash/zgj/mysql-install/bin/mysqld(show_slave_hosts(THD*)+0xcf5) [0x1b4a755]
/flash/zgj/mysql-install/bin/mysqld(mysql_execute_command(THD*, bool)+0x183f) [0xe56cef]
/flash/zgj/mysql-install/bin/mysqld(mysql_parse(THD*, Parser_state*)+0x3d0) [0xe5ab30]
/flash/zgj/mysql-install/bin/mysqld(dispatch_command(THD*, COM_DATA const*, enum_server_command)+0x25d2) [0xe5d5d2]
/flash/zgj/mysql-install/bin/mysqld(do_command(THD*)+0x168) [0xe5e178]
/flash/zgj/mysql-install/bin/mysqld() [0xf78ed0]
/flash/zgj/mysql-install/bin/mysqld() [0x23c06b5]
/lib64/libpthread.so.0(+0x7e25) [0x2b92752b3e25]
/lib64/libc.so.6(clone+0x6d) [0x2b9276fbf34d]
Trying to get some variables.
Some pointers may be invalid and cause the dump to abort.
Query (2b932801f398): show slave hosts
Connection ID (thread ID): 37
Status: NOT_KILLED
The manual page at http://dev.mysql.com/doc/mysql/en/crashing.html contains
information that should help you find out what is causing the crash.
Writing a core file
从错误日志可以看到,mysqld 收到了 signal 11(段错误),即发生了非法内存访问。
分析:
通过上述错误日志的调用栈,我们可以看到出现coredump的show_slave_hosts函数代码如下:
// Excerpt from show_slave_hosts() (partial listing): emits one result row per
// entry of slave_list while holding LOCK_slave_list.
mysql_mutex_lock(&LOCK_slave_list); // acquire the slave-list lock
for (uint i = 0; i < slave_list.records; ++i) // walk every registered slave to collect its info
{
SLAVE_INFO* si = (SLAVE_INFO*) my_hash_element(&slave_list, i);
protocol->start_row();
protocol->store((uint32) si->server_id);
protocol->store(si->host, &my_charset_bin);
// User and password columns are only sent when opt_show_slave_auth_info is set.
if (opt_show_slave_auth_info)
{
protocol->store(si->user, &my_charset_bin);
protocol->store(si->password, &my_charset_bin);
}
protocol->store((uint32) si->port);
protocol->store((uint32) si->master_id);
/* get slave's UUID */
String slave_uuid;
if (get_slave_uuid(si->thd, &slave_uuid)) // reads slave_uuid through the slave's THD (kept in the THD's user-variable hash); the core dump happens here
protocol->store(slave_uuid.c_ptr_safe(), &my_charset_bin);
// On a failed end_row(), drop the lock before bailing out.
if (protocol->end_row())
{
mysql_mutex_unlock(&LOCK_slave_list);
DBUG_RETURN(TRUE);
}
}
mysql_mutex_unlock(&LOCK_slave_list); // release the slave-list lock
可以看到发生coredump的位置是通过从库thd结构获取相应信息,而通过core文件我们可以看到相应从库的thd结构已经有部分被析构了。这是为什么呢?通常情况下的复制流程如下:
- connect 从库连接主库,此时构建了thd结构
- register_slave 从库执行COM_REGISTER_SLAVE命令完成注册,即注册到slave_list列表上
- dump binlog 从库执行COM_BINLOG_DUMP_GTID命令进行复制binlog
- 复制终止(可能从库执行stop slave,也可能dump线程错误),unregister_slave 从slave_list列表上移除相应从库
- 断开连接,thd结构析构
从上述流程可以看到,通常从库还在slave_list列表中时,其thd结构不应该被析构呀?
考虑如下情况:如果从库执行完第2步,完成了register注册(此时注册到了列表slave_list中),而在还没有发送dump指令时,从库就中断连接了。由于从库中断了连接,相应的thd结构体被析构,但是该从库并没有从slave_list中剔除。此时show slave hosts遍历slave_list就会访问到已经析构的thd,不就出现上述问题了吗?
为了验证上述想法,一个就是通过源代码,一个就是通过实验复现。
通过查看源代码,我们可以发现在调用unregister_slave函数的地方只有两个:
// First call site: inside register_slave, to remove a duplicate entry for this
// slave before (re-)registering it.
mysql_mutex_lock(&LOCK_slave_list);
unregister_slave(thd, false, false/*need_lock_slave_list=false*/); // remove the duplicate slave; LOCK_slave_list is already held here
res= my_hash_insert(&slave_list, (uchar*) si); // register the slave
mysql_mutex_unlock(&LOCK_slave_list);
// Second call site: in the binlog dump command, after binlog sending has returned.
mysql_binlog_send(thd, name, (my_off_t) pos, &slave_gtid_executed, flags); // loops sending binlog until it exits
unregister_slave(thd, true, true/*need_lock_slave_list=true*/); // binlog sending has exited, so the slave must be unregistered now
通过查看源代码可以确定:如果从库执行完register指令后没有执行dump指令,那么就不会触发unregister_slave。
实验复现:
我们可以在同一台MySQL服务器上运行两个循环的脚本:
一个是不断的重复:
连接数据库进行register从库注册,但不进行dump,然后断开连接。这个是模拟从库注册register但是没有注销unregister
另一个不断的重复:
连接数据库,执行show slave hosts指令,以触发访问已经析构的从库thd结构。
通过上述实验,已经在实验环境中重现了上述coredump。
修复:
查看官方代码提交情况,可以看到在mysql 8.0.20中,对上述问题进行了修复,即在thd析构中主动调用unregister_slave函数
/*
  THD destructor (the version carrying the fix discussed in this article).

  Tears the session object down: releases resources if that has not been done
  yet, frees the database name and transaction memory, destroys the per-THD
  mutexes and condition variable, and, as the fix, unregisters this THD from
  the slave list so that no entry there keeps pointing at a destroyed THD.
*/
THD::~THD() {
THD_CHECK_SENTRY(this);
DBUG_TRACE;
DBUG_PRINT("info", ("THD dtor, this %p", this));
// Run release_resources() here only if it has not already been done.
if (!m_release_resources_done) release_resources();
clear_next_event_pos();
/*
  Ensure that no one is using THD: each mutex is locked and immediately
  unlocked (presumably so any current holder has left before teardown
  continues; confirm against the locking protocol).
*/
mysql_mutex_lock(&LOCK_thd_data);
mysql_mutex_unlock(&LOCK_thd_data);
mysql_mutex_lock(&LOCK_thd_query);
mysql_mutex_unlock(&LOCK_thd_query);
DBUG_ASSERT(!m_attachable_trx);
// Free the current database name and reset m_db to the null string.
my_free(const_cast<char *>(m_db.str));
m_db = NULL_CSTR;
get_transaction()->free_memory(MYF(0));
// Destroy the per-THD synchronization objects.
mysql_mutex_destroy(&LOCK_query_plan);
mysql_mutex_destroy(&LOCK_thd_data);
mysql_mutex_destroy(&LOCK_thd_query);
mysql_mutex_destroy(&LOCK_thd_sysvar);
mysql_mutex_destroy(&LOCK_thd_protocol);
mysql_mutex_destroy(&LOCK_current_cond);
mysql_cond_destroy(&COND_thr_lock);
#ifndef DBUG_OFF
// Debug builds only: mark the sentry as gone.
dbug_sentry = THD_SENTRY_GONE;
#endif
// A non-null gtid_set is deleted when HAVE_GTID_NEXT_LIST is defined;
// otherwise it is unexpected and trips a debug assertion.
if (variables.gtid_next_list.gtid_set != nullptr) {
#ifdef HAVE_GTID_NEXT_LIST
delete variables.gtid_next_list.gtid_set;
variables.gtid_next_list.gtid_set = NULL;
variables.gtid_next_list.is_non_null = false;
#else
DBUG_ASSERT(0);
#endif
}
if (rli_slave) rli_slave->cleanup_after_session();
/*
As slaves can be added in one mysql command like COM_REGISTER_SLAVE
but then need to be removed on error scenarios, we call this method
here
*/
unregister_slave(this, true, true); // unregister this THD from the slave list (the fix)
free_root(&main_mem_root, MYF(0));
if (m_token_array != nullptr) {
my_free(m_token_array);
}
}
官方提交.png