Ceph

ceph rbd:Image create

2018-02-28  本文已影响17人  chnmagnus

创建image过程的代码走读。过程中,发现自己对librados aio机制和cls 注册的函数调用机制不太了解,有空单独写篇文。

浅析

先走一遍流程,从宏观上看一下image创建的过程。
初始化rbd并创建image。

ceph osd pool create rbd 32
rbd pool init rbd
rbd create --size 1024 rbd/testimage

查看已有的对象

rados ls -p rbd

rbd_directory
rbd_id.testimage
rbd_info
rbd_object_map.105d2ae8944a
rbd_header.105d2ae8944a

一个pool中的rbd对象分成两类:
第一类,整个pool的rbd元数据对象
1.rbd_directory:在每个pool中都存在,用于保存该pool下所有的image的信息。该对象的omap中保存该pool中所有image的name和id。对于每一个image,保存两条信息,第一条key为id_<image id>,value为image name;第二条key为name_<image name>,value为image id。

rados listomapvals rbd_directory -p rbd

id_105d2ae8944a
value (13 bytes) :
00000000  09 00 00 00 74 65 73 74  69 6d 61 67 65           |....testimage|
0000000d

name_testimage
value (16 bytes) :
00000000  0c 00 00 00 31 30 35 64  32 61 65 38 39 34 34 61  |....105d2ae8944a|
00000010

2.rbd_info:正常情况下内容为overwrite validated,如果是v1 image,情况不同。暂时忽略。

第二类,一个image的元数据对象
文档描述如下:

/* New-style rbd image 'testimage' consists of objects
 *   rbd_id.testimage        - id of image
 *   rbd_header.<id>         - image metadata
 *   rbd_object_map.<id>     - optional image object map
 *   rbd_data.<id>.00000000
 *   rbd_data.<id>.00000001
 *   ...                     - data
 */

但之前的rados ls结果中只有前三种对象,并没有rbd_data对象。这是因为,为了加速image创建、节省空间,数据对象只有在实际使用时才会被分配。

1.rbd_id.testimage:被称为image的id_obj对象,其内容为该image的id。

2.rbd_header.105d2ae8944a:被称为image的header_obj对象(对应代码中的m_header_obj),它的omap中保存了该image的元数据。

rados listomapvals rbd_header.105d2ae8944a -p rbd

create_timestamp - 创建时间
value (8 bytes) :
00000000  17 25 96 5a a7 bc 94 35                           |.%.Z...5|
00000008

features - 开启的特性
value (8 bytes) :
00000000  3d 00 00 00 00 00 00 00                           |=.......|
00000008

object_prefix - data对象的前缀
value (25 bytes) :
00000000  15 00 00 00 72 62 64 5f  64 61 74 61 2e 31 30 35  |....rbd_data.105|
00000010  64 32 61 65 38 39 34 34  61                       |d2ae8944a|
00000019

order - 每个data对象大小的指数,即对象大小为 1<<order 字节(此处 0x16 = 22,即4MB)
value (1 bytes) :
00000000  16                                                |.|
00000001

size - image size
value (8 bytes) :
00000000  00 00 00 40 00 00 00 00                           |...@....|
00000008

snap_seq - 当前存在的最新的seq
value (8 bytes) :
00000000  00 00 00 00 00 00 00 00                           |........|
00000008

如果创建了快照会有快照相关的key value存在于omap中,暂且不表。

3.rbd_object_map.105d2ae8944a:用于支持object map特性,开启object map时会创建。

代码

省略了部分代码,不影响阅读。

image的创建从librbd.cc的create函数开始,调用了internal.cc中的create。注意,有多个版本的create函数,其区别主要在于指定选项的多寡,其最终实现是一致的。

  /*
   * Public librbd entry point for creating an image.
   *
   *   io_ctx - librados I/O context bound to the target pool, obtained e.g. via
   *              librados::IoCtx io_ctx;
   *              rados.ioctx_create(pool_name.c_str(), io_ctx);
   *   name   - name of the image to create
   *   size   - image size
   *   order  - object-size exponent: each backing RADOS object is 1 << order
   *            bytes (the article cites a 4MB default, i.e. 1 << 22)
   *
   * Forwards straight to librbd::create() and hands back its return code.
   */
  int RBD::create(IoCtx& io_ctx, const char *name, uint64_t size, int *order)
  {
    return librbd::create(io_ctx, name, size, order);
  }

internal.cc中的create

  // Convenience overload: wraps the caller's order in an ImageOptions bag,
  // delegates to the full create() overload, and then copies the order stored
  // in the options back out through the `order` pointer.
  int create(librados::IoCtx& io_ctx, const char *imgname, uint64_t size,
             int *order)
  {
    uint64_t order_value = *order;
    ImageOptions opts;

    int set_rc = opts.set(RBD_IMAGE_OPTION_ORDER, order_value);
    assert(set_rc == 0);

    // Delegate to the full implementation; id and mirroring arguments are
    // left empty / disabled.
    const int create_rc = create(io_ctx, imgname, "", size, opts, "", "",
                                 false);

    // Report the order recorded in the options back to the caller, regardless
    // of whether the creation itself succeeded.
    int get_rc = opts.get(RBD_IMAGE_OPTION_ORDER, &order_value);
    assert(get_rc == 0);
    *order = order_value;

    return create_rc;
  }
  
  // The actual implementation behind the create() overloads.
  //
  // Resolves the image id, format and order (falling back to configured
  // defaults), refuses to continue when an image of the same name already
  // exists, and then either creates a v1 image directly or runs an
  // image::CreateRequest and blocks until it has finished. The order that was
  // resolved here is stored back into `opts` before returning.
  int create(IoCtx& io_ctx, const std::string &image_name,
             const std::string &image_id, uint64_t size,
             ImageOptions& opts,
             const std::string &non_primary_global_image_id,
             const std::string &primary_mirror_uuid,
             bool skip_mirror_enable)
  {
    // Use the supplied image id, or generate one when none was given.
    std::string id = image_id.empty() ? util::generate_image_id(io_ctx)
                                      : image_id;

    CephContext *cct = (CephContext *)io_ctx.cct();
    ldout(cct, 10) << __func__ << " name=" << image_name << ", "
                   << "id= " << id << ", "
                   << "size=" << size << ", opts=" << opts << dendl;

    // Image format: taken from the options, else from the configured default.
    uint64_t format;
    const bool format_given =
      (opts.get(RBD_IMAGE_OPTION_FORMAT, &format) == 0);
    if (!format_given) {
      format = cct->_conf->get_val<int64_t>("rbd_default_format");
    }
    const bool old_format = (format == 1);

    // make sure it doesn't already exist, in either format
    const int detect_rc = detect_format(io_ctx, image_name, NULL, NULL);
    if (detect_rc == 0) {
      lderr(cct) << "rbd image " << image_name << " already exists" << dendl;
      return -EEXIST;
    }
    if (detect_rc != -ENOENT) {
      lderr(cct) << "Could not tell if " << image_name << " already exists"
                 << dendl;
      return detect_rc;
    }

    // Order: taken from the options, else (missing or zero) from the
    // configured default.
    uint64_t order = 0;
    const bool order_given =
      (opts.get(RBD_IMAGE_OPTION_ORDER, &order) == 0);
    if (!order_given || order == 0) {
      order = cct->_conf->get_val<int64_t>("rbd_default_order");
    }
    int r = image::CreateRequest<>::validate_order(cct, order);
    if (r < 0) {
      return r;
    }

    if (!old_format) {
      // Thread pool and work queue used by librbd. ContextWQ is a queue of
      // asynchronous callbacks: once a queued task has run on the pool, the
      // user-provided callback (Context::finish()) is invoked.
      ThreadPool *thread_pool;
      ContextWQ *op_work_queue;
      ImageCtx::get_thread_pool_instance(cct, &thread_pool, &op_work_queue);

      C_SaferCond cond;
      // Build a CreateRequest with its default template argument (ImageCtx).
      // Per the article, its constructor resolves every parameter it needs:
      //   name, id, size, features, order, stripe_unit, stripe_count,
      //   journal_order, journal_splay_width, journal_pool, data_pool
      image::CreateRequest<> *req = image::CreateRequest<>::create(
        io_ctx, image_name, id, size, opts, non_primary_global_image_id,
        primary_mirror_uuid, skip_mirror_enable, op_work_queue, &cond);
      // Kick off the state machine ...
      req->send();
      // ... and block until the request reports completion.
      r = cond.wait();
    } else {
      // The old (v1) format exists only for backward compatibility.
      r = create_v1(io_ctx, image_name.c_str(), size, order);
    }

    // Publish the order that was used back to the caller through the options.
    int set_rc = opts.set(RBD_IMAGE_OPTION_ORDER, order);
    assert(set_rc == 0);

    return r;
  }

CreateRequest.h/cc中定义了创建操作的具体实现,先贴出状态图。之后的代码执行流程与状态图一致。

  /**
   * @verbatim
   *
   *                                  <start> . . . . > . . . . .
   *                                     |                      .
   *                                     v                      .
   *                               VALIDATE POOL                v (pool validation
   *                                     |                      .  disabled)
   *                                     v                      .
   *                             VALIDATE OVERWRITE             .
   *                                     |                      .
   *                                     v                      .
   * (error: bottom up)           CREATE ID OBJECT. . < . . . . .
   *  _______<_______                    |
   * |               |                   v
   * |               |          ADD IMAGE TO DIRECTORY
   * |               |               /   |
   * |      REMOVE ID OBJECT<-------/    v
   * |               |           NEGOTIATE FEATURES (when using default features)
   * |               |                   |
   * |               |                   v         (stripingv2 disabled)
   * |               |              CREATE IMAGE. . . . > . . . .
   * v               |               /   |                      .
   * |      REMOVE FROM DIR<--------/    v                      .
   * |               |          SET STRIPE UNIT COUNT           .
   * |               |               /   |  \ . . . . . > . . . .
   * |      REMOVE HEADER OBJ<------/    v                     /. (object-map
   * |               |\           OBJECT MAP RESIZE . . < . . * v  disabled)
   * |               | \              /  |  \ . . . . . > . . . .
   * |               |  *<-----------/   v                     /. (journaling
   * |               |             FETCH MIRROR MODE. . < . . * v  disabled)
   * |               |                /   |                     .
   * |     REMOVE OBJECT MAP<--------/    v                     .
   * |               |\             JOURNAL CREATE              .
   * |               | \               /  |                     .
   * v               |  *<------------/   v                     .
   * |               |           MIRROR IMAGE ENABLE            .
   * |               |                /   |                     .
   * |        JOURNAL REMOVE*<-------/    |                     .
   * |                                    v                     .
   * |_____________>___________________<finish> . . . . < . . . .
   *
   * @endverbatim
   */

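对应于状态图每一步的函数如下(按执行顺序):send、validate_pool、validate_overwrite、create_id_object、add_image_to_directory、negotiate_features、create_image、set_stripe_unit_count、object_map_resize、fetch_mirror_mode、journal_create、mirror_image_enable、complete。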

下面是上述函数的详细代码:

send,状态机的入口函数,在这个函数中验证各种参数的正确性,如果出错,则调用complete函数,complete函数最终会调用继承自Context::finish()的回调函数,进行错误处理。如果没有出错,则调用validate_pool()函数,进入下一状态。

// Entry point of the create state machine. Runs the parameter validations in
// order; the first one that fails ends the request via complete() with its
// error code. If all of them pass, the request moves on to validate_pool().
template<typename I>
void CreateRequest<I>::send() {
  ldout(m_cct, 20) << dendl;

  // Each check only runs if the previous one did not fail.
  int r = validate_features(m_cct, m_features, m_force_non_primary);
  if (r >= 0) {
    r = validate_order(m_cct, m_order);
  }
  if (r >= 0) {
    r = validate_striping(m_cct, m_order, m_stripe_unit, m_stripe_count);
  }
  if (r >= 0) {
    r = validate_data_pool(m_cct, m_ioctx, m_features, m_data_pool,
                           &m_data_pool_id);
  }
  if (r < 0) {
    complete(r);
    return;
  }

  // The layout is only validated when the object-map feature is requested.
  const bool object_map_requested =
    (m_features & RBD_FEATURE_OBJECT_MAP) != 0;
  if (object_map_requested && !validate_layout(m_cct, m_size, m_layout)) {
    complete(-EINVAL);
    return;
  }

  // All checks passed: advance to the next state.
  validate_pool();
}

validate_pool,校验rbd_directory对象是否存在

// VALIDATE POOL state: asynchronously stats the RBD_DIRECTORY object. The
// result is delivered to handle_validate_pool(). When the "rbd_validate_pool"
// option is off, this state is skipped and the request jumps straight to
// create_id_object().
template<typename I>
void CreateRequest<I>::validate_pool() {
  const bool validation_enabled =
    m_cct->_conf->get_val<bool>("rbd_validate_pool");
  if (!validation_enabled) {
    create_id_object();
    return;
  }

  // A stat is enough to learn whether the directory object exists.
  librados::ObjectReadOperation stat_op;
  stat_op.stat(NULL, NULL, NULL);

  // Wrap handle_validate_pool() in an AioCompletion so that it is invoked when
  // the asynchronous operation finishes; that handler continues with
  // validate_overwrite().
  using self_t = CreateRequest<I>;
  librados::AioCompletion *completion =
    create_rados_callback<self_t, &self_t::handle_validate_pool>(this);

  m_outbl.clear();
  int r = m_ioctx.aio_operate(RBD_DIRECTORY, completion, &stat_op, &m_outbl);
  assert(r == 0);
  completion->release();
}

// Completion handler for validate_pool().
//   r == 0              : the directory object exists, go validate overwrites.
//   r < 0, not -ENOENT  : the stat failed, abort the request with r.
//   otherwise           : allocate and then release a self-managed snapshot id
//                         before continuing with validate_overwrite().
template<typename I>
void CreateRequest<I>::handle_validate_pool(int r) {
  ldout(m_cct, 20) << "r=" << r << dendl;

  if (r == 0) {
    validate_overwrite();
    return;
  }
  if (r < 0 && r != -ENOENT) {
    lderr(m_cct) << "failed to stat RBD directory: " << cpp_strerror(r)
                 << dendl;
    complete(r);
    return;
  }

  // allocate a self-managed snapshot id if this a new pool to force
  // self-managed snapshot mode
  // This call is executed just once per (fresh) pool, hence we do not
  // try hard to make it asynchronous (and it's pretty safe not to cause
  // deadlocks).

  uint64_t snap_id;
  r = m_ioctx.selfmanaged_snap_create(&snap_id);
  if (r < 0) {
    if (r == -EINVAL) {
      lderr(m_cct) << "pool not configured for self-managed RBD snapshot support"
                   << dendl;
    } else {
      lderr(m_cct) << "failed to allocate self-managed snapshot: "
                   << cpp_strerror(r) << dendl;
    }
    complete(r);
    return;
  }

  r = m_ioctx.selfmanaged_snap_remove(snap_id);
  if (r < 0) {
    // we've already switched to self-managed snapshots -- no need to
    // error out in case of failure here.
    ldout(m_cct, 10) << "failed to release self-managed snapshot " << snap_id
                     << ": " << cpp_strerror(r) << dendl;
  }

  validate_overwrite();
}

validate_overwrite,校验rbd_info对象的内容,与新旧版本image有关,可以忽略。

// VALIDATE OVERWRITE state: issues an asynchronous read of the RBD_INFO
// object through the data I/O context. The data read lands in m_outbl and is
// examined by handle_validate_overwrite().
template <typename I>
void CreateRequest<I>::validate_overwrite() {
  ...
  // (the line above marks code elided in this excerpt)
  // handle_validate_overwrite is the completion callback for aio_operate;
  // it calls create_id_object() to advance to the next state.
  using klass = CreateRequest<I>;
  librados::AioCompletion *comp =
    create_rados_callback<klass, &klass::handle_validate_overwrite>(this);

  librados::ObjectReadOperation op;
  op.read(0, 0, nullptr, nullptr);

  m_outbl.clear();
  // Read the rbd_info object; the outcome tells whether it exists.
  int r = m_data_io_ctx.aio_operate(RBD_INFO, comp, &op, &m_outbl);
  assert(r == 0);
  comp->release();
}

// Completion handler for validate_overwrite(). If RBD_INFO was read
// successfully and already holds "overwrite validated", the request goes
// straight to create_id_object(). A read error other than -ENOENT aborts the
// request. Otherwise the object is written twice in a row to confirm that the
// pool accepts overwrites.
template <typename I>
void CreateRequest<I>::handle_validate_overwrite(int r) {
  ldout(m_cct, 20) << "r=" << r << dendl;

  bufferlist validated_marker;
  validated_marker.append("overwrite validated");

  // Already validated earlier: nothing more to do in this state.
  if (r == 0 && m_outbl.contents_equal(validated_marker)) {
    create_id_object();
    return;
  }
  if (r < 0 && r != -ENOENT) {
    lderr(m_cct) << "failed to read RBD info: " << cpp_strerror(r) << dendl;
    complete(r);
    return;
  }

  // validate the pool supports overwrites. We cannot use rbd_directory
  // since the v1 images store the directory as tmap data within the object.
  ldout(m_cct, 10) << "validating overwrite support" << dendl;
  bufferlist probe;
  probe.append("validate");
  r = m_data_io_ctx.write(RBD_INFO, probe, probe.length(), 0);
  if (r >= 0) {
    // Second write to the same offset: this is the actual overwrite.
    r = m_data_io_ctx.write(RBD_INFO, validated_marker,
                            validated_marker.length(), 0);
  }

  if (r < 0) {
    if (r == -EOPNOTSUPP) {
      lderr(m_cct) << "pool missing required overwrite support" << dendl;
      complete(-EINVAL);
    } else {
      lderr(m_cct) << "failed to validate overwrite support: " << cpp_strerror(r)
                   << dendl;
      complete(r);
    }
    return;
  }

  create_id_object();
}

create_id_object,创建rbd_id.<image name>对象

// CREATE ID OBJECT state: creates the rbd_id.<image name> object (m_id_obj)
// and stores the image id in it. The outcome is delivered to
// handle_create_id_object().
template<typename I>
void CreateRequest<I>::create_id_object() {
  ldout(m_cct, 20) << dendl;

  // Assemble the compound write: create the object, then have the cls
  // "set_id" method (registered on the OSD) set the object's content to the
  // image id.
  librados::ObjectWriteOperation write_op;
  write_op.create(true);
  cls_client::set_id(&write_op, m_image_id);

  // handle_create_id_object() runs once the operation completes and continues
  // with add_image_to_directory().
  using self_t = CreateRequest<I>;
  librados::AioCompletion *completion =
    create_rados_callback<self_t, &self_t::handle_create_id_object>(this);

  // The calls above only queue steps inside write_op; nothing has been sent
  // yet. aio_operate() is what submits the queued steps against m_id_obj.
  int r = m_ioctx.aio_operate(m_id_obj, completion, &write_op);
  assert(r == 0);
  completion->release();
}

// Completion handler for create_id_object(): on success continue with
// add_image_to_directory(); on failure finish the request with the error.
template<typename I>
void CreateRequest<I>::handle_create_id_object(int r) {
  ldout(m_cct, 20) << "r=" << r << dendl;

  if (r >= 0) {
    add_image_to_directory();
    return;
  }

  lderr(m_cct) << "error creating RBD id object: " << cpp_strerror(r)
               << dendl;
  complete(r);
}

add_image_to_directory,在rbd_directory对象中加入该image的id和name

// ADD IMAGE TO DIRECTORY state: registers the image's name and id in the
// RBD_DIRECTORY object. The outcome is delivered to
// handle_add_image_to_directory().
template<typename I>
void CreateRequest<I>::add_image_to_directory() {
  ldout(m_cct, 20) << dendl;

  // The cls "dir_add_image" method (registered on the OSD) adds the two
  // key/value entries for this image to the directory's omap.
  librados::ObjectWriteOperation write_op;
  cls_client::dir_add_image(&write_op, m_image_name, m_image_id);

  using self_t = CreateRequest<I>;
  librados::AioCompletion *completion =
    create_rados_callback<self_t, &self_t::handle_add_image_to_directory>(
      this);

  int r = m_ioctx.aio_operate(RBD_DIRECTORY, completion, &write_op);
  assert(r == 0);
  completion->release();
}

// Completion handler for add_image_to_directory().
//
// On failure the error is saved in m_r_saved and the rollback path is entered
// via remove_id_object(). The handler must then return: falling through to
// negotiate_features() would keep driving the forward path while the rollback
// is in flight, and both paths end in complete(), which deletes this request.
// On success the request advances to negotiate_features().
template<typename I>
void CreateRequest<I>::handle_add_image_to_directory(int r) {
  ldout(m_cct, 20) << "r=" << r << dendl;

  if (r < 0) {
    lderr(m_cct) << "error adding image to directory: " << cpp_strerror(r)
                 << dendl;

    m_r_saved = r;
    remove_id_object();
    // Stop here; the rollback path owns the request from now on.
    return;
  }

  negotiate_features();
}

negotiate_features,向服务端查询其支持的全部features,并把默认features限制为服务端支持的子集(仅在使用默认features时执行,否则直接进入create_image)。

// NEGOTIATE FEATURES state: asks for the full feature set via the cls
// "get_all_features" call; the reply is decoded in
// handle_negotiate_features(). Skipped (straight to create_image()) when
// m_negotiate_features is not set.
template<typename I>
void CreateRequest<I>::negotiate_features() {
  if (!m_negotiate_features) {
    create_image();
    return;
  }

  ldout(m_cct, 20) << dendl;

  // Queue the request for the complete feature set.
  librados::ObjectReadOperation read_op;
  cls_client::get_all_features_start(&read_op);

  using self_t = CreateRequest<I>;
  librados::AioCompletion *completion =
    create_rados_callback<self_t, &self_t::handle_negotiate_features>(this);

  // Submit the operation; the callback fires once it completes and the reply
  // is collected in m_outbl.
  m_outbl.clear();
  int r = m_ioctx.aio_operate(RBD_DIRECTORY, completion, &read_op, &m_outbl);
  assert(r == 0);
  completion->release();
}

// Completion handler for negotiate_features(). Decodes the returned feature
// mask and, if some requested features are not in it, trims m_features down
// to the common subset. A failure to obtain the mask is only logged; the
// request continues with create_image() either way.
template<typename I>
void CreateRequest<I>::handle_negotiate_features(int r) {
  ldout(m_cct, 20) << "r=" << r << dendl;

  uint64_t all_features;
  if (r >= 0) {
    // Decode the reply into all_features.
    bufferlist::iterator reply_it = m_outbl.begin();
    r = cls_client::get_all_features_finish(&reply_it, &all_features);
  }

  if (r < 0) {
    ldout(m_cct, 10) << "error retrieving server supported features set: "
                     << cpp_strerror(r) << dendl;
  } else {
    const uint64_t supported = m_features & all_features;
    if (supported != m_features) {
      m_features = supported;
      ldout(m_cct, 10) << "limiting default features set to server supported: "
                       << m_features << dendl;
    }
  }

  create_image();
}

create_image,创建rbd_header.<image id>对象,并通过cls在其omap中写入size、order、features、object_prefix等元数据。

// CREATE IMAGE state: builds the data-object name prefix, then creates the
// header object (m_header_obj) and has the cls "create_image" method fill in
// its metadata. The outcome is delivered to handle_create_image().
template<typename I>
void CreateRequest<I>::create_image() {
  ldout(m_cct, 20) << dendl;
  assert(m_data_pool.empty() || m_data_pool_id != -1);

  // Data-object prefix: RBD_DATA_PREFIX, then "<pool id>." when
  // m_data_pool_id is set, then the image id.
  ostringstream prefix_stream;
  prefix_stream << RBD_DATA_PREFIX;
  if (m_data_pool_id != -1) {
    prefix_stream << stringify(m_ioctx.get_id()) << ".";
  }
  prefix_stream << m_image_id;

  const std::string object_prefix = prefix_stream.str();
  if (object_prefix.length() > RBD_MAX_BLOCK_NAME_PREFIX_LENGTH) {
    lderr(m_cct) << "object prefix '" << object_prefix << "' too large"
                 << dendl;
    complete(-EINVAL);
    return;
  }

  // Create the rbd_header object and let the registered cls method set the
  // values in its omap.
  librados::ObjectWriteOperation write_op;
  write_op.create(true);
  cls_client::create_image(&write_op, m_size, m_order, m_features,
                           object_prefix, m_data_pool_id);

  using self_t = CreateRequest<I>;
  librados::AioCompletion *completion =
    create_rados_callback<self_t, &self_t::handle_create_image>(this);

  int r = m_ioctx.aio_operate(m_header_obj, completion, &write_op);
  assert(r == 0);
  completion->release();
}

// Completion handler for create_image(): on success advance to
// set_stripe_unit_count(); on failure remember the error in m_r_saved and
// start the rollback with remove_from_dir().
template<typename I>
void CreateRequest<I>::handle_create_image(int r) {
  ldout(m_cct, 20) << "r=" << r << dendl;

  if (r >= 0) {
    set_stripe_unit_count();
    return;
  }

  lderr(m_cct) << "error writing header: " << cpp_strerror(r) << dendl;
  m_r_saved = r;
  remove_from_dir();
}

以下函数代码暂时省略。流程类似。

set_stripe_unit_count
object_map_resize
fetch_mirror_mode
journal_create
mirror_image_enable

全部步骤成功后,最后调用complete函数,传入的参数为0;出错时传入的则是相应的错误码。

// Final step of the request: closes the data I/O context, reports the result
// r to the completion callback supplied by the creator (m_on_finish), and
// then destroys this request object.
template<typename I>
void CreateRequest<I>::complete(int r) {
  const bool succeeded = (r == 0);
  if (succeeded) {
    ldout(m_cct, 20) << "done." << dendl;
  }

  // Release the data I/O context before notifying the caller.
  m_data_io_ctx.close();

  // Notify the caller, then self-destruct; no member may be touched after
  // the delete.
  auto *on_finish = m_on_finish;
  on_finish->complete(r);
  delete this;
}

上一篇下一篇

猜你喜欢

热点阅读