MySQL InnoDB的MVCC原理简析

2019-05-23  本文已影响0人  CXYMichael

数据结构

storage/innobase/include/data0type.h

行结构中,除了用户定义的列外还有3个隐藏系统列:DATA_ROW_ID、DATA_TRX_ID、DATA_ROLL_PTR
如果表没有定义主键那么DATA_ROW_ID作为主键列,否则行结构中没有DATA_ROW_ID列
具体的隐藏列插入过程在dict_table_add_system_columns函数中
无论是聚簇索引,还是二级索引,其每条记录都包含了一个DELETED BIT位,用于标识该记录是否是删除记录,真正意义的删除是在commit的时候,聚簇索引设置记录deleted bit时,会同时更新DATA_TRX_ID列。老版本DATA_TRX_ID进入undo表空间;二级索引设置deleted bit时,不写入undo log

#define DATA_ROW_ID 0     /* row id: a 48-bit integer */
#define DATA_ROW_ID_LEN 6 /* stored length for row id */

/** Transaction id: 6 bytes */
constexpr size_t DATA_TRX_ID = 1;

/** Transaction ID type size in bytes. */
constexpr size_t DATA_TRX_ID_LEN = 6;

/** Rollback data pointer: 7 bytes */
constexpr size_t DATA_ROLL_PTR = 2;

/** Rollback data pointer type size in bytes. */
constexpr size_t DATA_ROLL_PTR_LEN = 7;

#define DATA_N_SYS_COLS 3 /* number of system columns defined above */

#define DATA_ITT_N_SYS_COLS 2 /* number of system columns for intrinsic
temporary table */

storage/innobase/include/page0page.h

而对于二级索引记录,是不包含上面这两个隐藏字段信息的,但对于二级索引,会在页中会记录一个PAGE_MAX_TRX_ID,表示对该页数据修改过的最大事务id

#define PAGE_MAX_TRX_ID                             \
  18 /* highest id of a trx which may have modified \
     a record on the page; trx_id_t; defined only   \
     in secondary indexes and in the insert buffer  \
     tree */

MySQL中的事务在开始到提交这段过程中,都会被保存到一个叫trx_sys的事务链表中,这是一个基本的链表结构,在客户端执行命令:show engine innodb status就能看到事务的链表

storage/innobase/trx/trx0sys.cc

/** The transaction system */
trx_sys_t *trx_sys = NULL;

ReadView类维护了为了实现事务一致性读所需要的事务ID列表,用来保证活动事务中一些对数据库的修改对当前事务不可见,其主要内容如下:

storage/innobase/include/read0types.h

class ReadView {
  /** 类似于Vector的事务ID数组类ids_t */
  class ids_t {
    typedef trx_ids_t::value_type value_type;
  }

 public:

  /** 用来判断某事务的改变对当前ReadView是否可见 */
  bool changes_visible(trx_id_t id, const table_name_t &name) const
      MY_ATTRIBUTE((warn_unused_result)) {
    ut_ad(id > 0);
    //小于高水位或者是当前事务,都可见
    if (id < m_up_limit_id || id == m_creator_trx_id) {
      return (true);
    }

    check_trx_id_sanity(id, name);
    //大于等于高水位时不可见
    if (id >= m_low_limit_id) {
      return (false);
    //没有活跃事务则认为可见
    } else if (m_ids.empty()) {
      return (true);
    }

    const ids_t::value_type *p = m_ids.data();
    //通过二分查找搜索事务是否在活动事务列表中,/如果在说明在创建Read View时,此条记录还处于活跃状态则不应该查询到,否则说明创建Read View是此条记录已经是不活跃状态则可以查询到
    return (!std::binary_search(p, p + m_ids.size(), id));
  }

 private:
  /** 任何ID大于等于此值的事务都对当前事务不可见,换句话说这是High water mark */
  trx_id_t m_low_limit_id;

  /** 任何ID小于此值的事务都对当前事务可见,换句话说这是Low water mark */
  trx_id_t m_up_limit_id;

  /** 当前事务(视图创建者)的ID, 设为TRX_ID_MAX时表示释放该视图views */
  trx_id_t m_creator_trx_id;

  /** 快照生成时的活动事务集合 */
  ids_t m_ids;

  /** 任何ID小于此值的事务的undo log都不再被这个ReadView需要,故这些undo log都可以被purge线程清理 */
  trx_id_t m_low_limit_no;

  /** AC-NL-RO transaction view that has been "closed". */
  bool m_closed;

  typedef UT_LIST_NODE_T(ReadView) node_t;

  /** trx_sys中的ReadView列表 */
  byte pad1[64 - sizeof(node_t)];
  node_t m_view_list;
};

MVCC类作为ReadView的管理类,主要起到打开、关闭、释放、回收、重用ReadView的作用,主要功能如下:

storage/innobase/include/read0read.h

class MVCC {
 public:

  /** 打开视图 */
  void view_open(ReadView *&view, trx_t *trx);

  /** 关闭视图 */
  void view_close(ReadView *&view, bool own_mutex);

  /** 释放不活动的未关闭视图,调用者必须持有trx_sys_t::mutex互斥锁 */
  void view_release(ReadView *&view);

  /** 用于预分配,它会把最旧的视图克隆到传入的指针指向的视图,同时也会试图将标记为删除的视图移动到freed list,无需显式调用close*/
  void clone_oldest_view(ReadView *view);

  /** 由RW事务创建的视图必须用此函数设置持有者ID */
  static void set_view_creator_trx_id(ReadView *view, trx_id_t id);

 private:

  /** 从active list中获取一个空闲视图,如果没有就创建一个新的,同时也会试图将标记为删除的视图移动到freed list */
  inline ReadView *get_view();

  /** 获取系统中最旧的视图,同时将标记为删除的视图移动到freed list */
  inline ReadView *get_oldest_view() const;
  ReadView *get_view_created_by_trx_id(trx_id_t trx_id) const;

 private:
  typedef UT_LIST_BASE_NODE_T(ReadView) view_list_t;

  /** 被回收的ReadView,等待被重用 */
  view_list_t m_free;

  /** 活动或关闭的视图,已关闭视图的creator trx id会被设为TRX_ID_MAX */
  view_list_t m_views;
};
innodb-mvcc.png

ReadView创建

start transaction和begin语句执行后并没有在innodb层分配事务ID、回滚段、将事务放到读写事务链表等,这些操作需要第一个SQL语句调用函数trx_start_if_not_started_xa->trx_start_low来完成,然后调用trx_assign_read_view方法为每个事务分配唯一的ReadView

storage/innobase/trx/trx0trx.cc

ReadView *trx_assign_read_view(trx_t *trx) /*!< in/out: active transaction */
{
  ut_ad(trx->state == TRX_STATE_ACTIVE);

  if (srv_read_only_mode) {
    ut_ad(trx->read_view == NULL);
    return (NULL);

  } else if (!MVCC::is_view_active(trx->read_view)) {
    trx_sys->mvcc->view_open(trx->read_view, trx);
  }

  return (trx->read_view);
}

InnoDB默认的是RR级别,在这种级别下,相当于事务开启后,事务链中所有的事务,它们在事务处理期间的一切改变对我们当前开启的事务而言都是不可见的,也可以相当于看作 m_up_limit_id == m_low_limit_id

storage/innobase/read/read0read.cc

void MVCC::view_open(ReadView *&view, trx_t *trx) {
  ut_ad(!srv_read_only_mode);

  /** If no new RW transaction has been started since the last view
  was created then reuse the the existing view. */
  if (view != NULL) {
    uintptr_t p = reinterpret_cast<uintptr_t>(view);

    view = reinterpret_cast<ReadView *>(p & ~1);

    ut_ad(view->m_closed);

    if (trx_is_autocommit_non_locking(trx) && view->empty()) {
      view->m_closed = false;

      if (view->m_low_limit_id == trx_sys_get_max_trx_id()) {
        return;
      } else {
        view->m_closed = true;
      }
    }

    mutex_enter(&trx_sys->mutex);

    UT_LIST_REMOVE(m_views, view);

  } else {
    mutex_enter(&trx_sys->mutex);

    view = get_view();
  }

  if (view != NULL) {
    view->prepare(trx->id);

    UT_LIST_ADD_FIRST(m_views, view);

    ut_ad(!view->is_closed());

    ut_ad(validate());
  }

  trx_sys_mutex_exit();
}

void ReadView::prepare(trx_id_t id) {
  ut_ad(mutex_own(&trx_sys->mutex));

  m_creator_trx_id = id;

  m_low_limit_no = m_low_limit_id = m_up_limit_id = trx_sys->max_trx_id;

  if (!trx_sys->rw_trx_ids.empty()) {
    copy_trx_ids(trx_sys->rw_trx_ids);
  } else {
    m_ids.clear();
  }

  ut_ad(m_up_limit_id <= m_low_limit_id);

  if (UT_LIST_GET_LEN(trx_sys->serialisation_list) > 0) {
    const trx_t *trx;

    trx = UT_LIST_GET_FIRST(trx_sys->serialisation_list);

    if (trx->no < m_low_limit_no) {
      m_low_limit_no = trx->no;
    }
  }

  m_closed = false;
}

ReadView销毁

storage/innobase/trx/trx0trx.cc

/** Commits a transaction in memory. */
static void trx_commit_in_memory(
    trx_t *trx,       /*!< in/out: transaction */
    const mtr_t *mtr, /*!< in: mini-transaction of
                      trx_write_serialisation_history(), or NULL if
                      the transaction did not modify anything */
    bool serialised)
/*!< in: true if serialisation log was
written */
{
  trx->must_flush_log_later = false;
  trx->ddl_must_flush = false;

  if (trx_is_autocommit_non_locking(trx)) {
    ...

    if (trx->read_view != NULL) {
      trx_sys->mvcc->view_close(trx->read_view, false);
    }
    ...

  } else {
    ...

    if (trx->read_only || trx->rsegs.m_redo.rseg == NULL) {
      MONITOR_INC(MONITOR_TRX_RO_COMMIT);
      if (trx->read_view != NULL) {
        trx_sys->mvcc->view_close(trx->read_view, false);
      }

    } else {
      ut_ad(trx->id > 0);
      MONITOR_INC(MONITOR_TRX_RW_COMMIT);
    }
  }
}

storage/innobase/handler/ha_innodb.cc

int ha_innobase::external_lock(THD *thd, /*!< in: handle to the user thread */
                               int lock_type) /*!< in: lock type */
{
  ...
  /* If the MySQL lock count drops to zero we know that the current SQL
  statement has ended */

  if (trx->n_mysql_tables_in_use == 0) {
    trx->mysql_n_tables_locked = 0;
    m_prebuilt->used_in_HANDLER = FALSE;

    if (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
      if (trx_is_started(trx)) {
        innobase_commit(ht, thd, TRUE);
      } else {
        /* Since the trx state is TRX_NOT_STARTED,
        trx_commit() will not be called. Reset
        trx->is_dd_trx here */
        ut_d(trx->is_dd_trx = false);
      }

    } else if (trx->isolation_level <= TRX_ISO_READ_COMMITTED &&
               MVCC::is_view_active(trx->read_view)) {
      mutex_enter(&trx_sys->mutex);

      trx_sys->mvcc->view_close(trx->read_view, true);

      mutex_exit(&trx_sys->mutex);
    }
  }

  if (!trx_is_started(trx) && lock_type != F_UNLCK &&
      (m_prebuilt->select_lock_type != LOCK_NONE ||
       m_stored_select_lock_type != LOCK_NONE)) {
    ++trx->will_lock;
  }

  DBUG_RETURN(0);
}

可见性判断

storage/innobase/lock/lock0lock.cc

/** Checks that a record is seen in a consistent read.
 @return true if sees, or false if an earlier version of the record
 should be retrieved */
bool lock_clust_rec_cons_read_sees(
    const rec_t *rec,     /*!< in: user record which should be read or
                          passed over by a read cursor */
    dict_index_t *index,  /*!< in: clustered index */
    const ulint *offsets, /*!< in: rec_get_offsets(rec, index) */
    ReadView *view)       /*!< in: consistent read view */
{
  ut_ad(index->is_clustered());
  ut_ad(page_rec_is_user_rec(rec));
  ut_ad(rec_offs_validate(rec, index, offsets));

  /* Temp-tables are not shared across connections and multiple
  transactions from different connections cannot simultaneously
  operate on same temp-table and so read of temp-table is
  always consistent read. */
  if (srv_read_only_mode || index->table->is_temporary()) {
    ut_ad(view == 0 || index->table->is_temporary());
    return (true);
  }

  /* NOTE that we call this function while holding the search
  system latch. */

  trx_id_t trx_id = row_get_rec_trx_id(rec, index, offsets);

  return (view->changes_visible(trx_id, index->table->name));
}

storage/innobase/lock/lock0lock.cc

/** Checks that a non-clustered index record is seen in a consistent read.
 NOTE that a non-clustered index page contains so little information on
 its modifications that also in the case false, the present version of
 rec may be the right, but we must check this from the clustered index
 record.
 @return true if certainly sees, or false if an earlier version of the
 clustered index record might be needed */
bool lock_sec_rec_cons_read_sees(
    const rec_t *rec,          /*!< in: user record which
                               should be read or passed over
                               by a read cursor */
    const dict_index_t *index, /*!< in: index */
    const ReadView *view)      /*!< in: consistent read view */
{
  ut_ad(page_rec_is_user_rec(rec));

  /* NOTE that we might call this function while holding the search
  system latch. */

  if (recv_recovery_is_on()) {
    return (false);

  } else if (index->table->is_temporary()) {
    /* Temp-tables are not shared across connections and multiple
    transactions from different connections cannot simultaneously
    operate on same temp-table and so read of temp-table is
    always consistent read. */

    return (true);
  }

  trx_id_t max_trx_id = page_get_max_trx_id(page_align(rec));

  ut_ad(max_trx_id > 0);

  return (view->sees(max_trx_id));
}

bool sees(trx_id_t id) const
{
    return(id < m_up_limit_id);
}

事务的执行过程

流程图

mvcc.png
innodb-update.png

总结

实质上MySQL所实现的MVCC机制就是水位线机制,或者说窗口机制。在其他系统里也有很多类似的设计思想,比如:

上一篇下一篇

猜你喜欢

热点阅读