1. How to configure multipath. After adding multiple paths with the RPC command "bdev_nvme_attach_controller" and its -x multipath option, the multipath policy of the resulting NVMe bdev can be configured with the RPC command "bdev_nvme_set_multipath_policy". The parameters supported by bdev_nvme_set_multipath_policy are as follows:
'-b', '--name', help='Name of the NVMe bdev', required=True
'-p', '--policy', help='Multipath policy (active_passive or active_active)', required=True
'-s', '--selector', help='Multipath selector (round_robin, queue_depth)', required=False
'-r', '--rr-min-io', help='Number of IO to route to a path before switching to another for round-robin', type=int, required=False
As these parameters show, two multipath policies are currently supported: active_passive and active_active.
In active_active mode, a multipath selector can additionally be configured: round_robin, which rotates across paths, or queue_depth, which picks the path with the minimum queue depth. With the round_robin selector, rr-min-io must also be set; it is the number of I/Os routed to a path before the selector switches to the next one.
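As a concrete example, the following rpc.py invocations attach two paths to the same subsystem and enable active_active with the round_robin selector. The controller name, transport addresses, and subsystem NQN here are hypothetical placeholders; attaching with -b Nvme0 is assumed to expose namespace 1 as the bdev Nvme0n1:

scripts/rpc.py bdev_nvme_attach_controller -b Nvme0 -t tcp -a 192.168.1.10 -s 4420 -f ipv4 -n nqn.2016-06.io.spdk:cnode1 -x multipath
scripts/rpc.py bdev_nvme_attach_controller -b Nvme0 -t tcp -a 192.168.1.11 -s 4420 -f ipv4 -n nqn.2016-06.io.spdk:cnode1 -x multipath
scripts/rpc.py bdev_nvme_set_multipath_policy -b Nvme0n1 -p active_active -s round_robin -r 8

With -r 8, eight I/Os are routed to a path before the selector rotates to the next one; to use the minimum-queue-depth selector instead, pass -s queue_depth and omit -r.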
2. Given the multipath policy configured in step 1, how does SPDK select a path when an I/O request is submitted?
For a bdev created over NVMe-oF, I/O requests are submitted through bdev_nvme_submit_request:
static void
bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
    struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
    struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;

    if (spdk_likely(nbdev_io->submit_tsc == 0)) {
        nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io);
    } else {
        /* There are cases where submit_tsc != 0, i.e. retry I/O.
         * We need to update submit_tsc here.
         */
        nbdev_io->submit_tsc = spdk_get_ticks();
    }

    spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io);
    nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch);
    if (spdk_unlikely(!nbdev_io->io_path)) {
        if (!bdev_nvme_io_type_is_admin(bdev_io->type)) {
            bdev_nvme_io_complete(nbdev_io, -ENXIO);
            return;
        }

        /* Admin commands do not use the optimal I/O path.
         * Simply fall through even if it is not found.
         */
    }

    _bdev_nvme_submit_request(nbdev_ch, bdev_io);
}
From this implementation we can see that bdev_nvme_submit_request first calls bdev_nvme_find_io_path to choose the io_path on which the current I/O request will be submitted, and bdev_nvme_find_io_path makes this choice according to the multipath policy configured in step 1. Its implementation is as follows:
static inline struct nvme_io_path *
bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
    if (spdk_likely(nbdev_ch->current_io_path != NULL)) {
        if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) {
            return nbdev_ch->current_io_path;
        } else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
            if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) {
                return nbdev_ch->current_io_path;
            }

            nbdev_ch->rr_counter = 0;
        }
    }

    if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE ||
        nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
        return _bdev_nvme_find_io_path(nbdev_ch);
    } else {
        return _bdev_nvme_find_io_path_min_qd(nbdev_ch);
    }
}
static struct nvme_io_path *
_bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
    struct nvme_io_path *io_path, *start, *non_optimized = NULL;

    start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path);
    io_path = start;

    do {
        if (spdk_likely(nvme_io_path_is_connected(io_path) &&
                        !io_path->nvme_ns->ana_state_updating)) {
            switch (io_path->nvme_ns->ana_state) {
            case SPDK_NVME_ANA_OPTIMIZED_STATE:
                nbdev_ch->current_io_path = io_path;
                return io_path;
            case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
                if (non_optimized == NULL) {
                    non_optimized = io_path;
                }
                break;
            default:
                break;
            }
        }

        io_path = nvme_io_path_get_next(nbdev_ch, io_path);
    } while (io_path != start);

    if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) {
        /* We come here only if there is no optimized path. Cache even non_optimized
         * path for load balance across multiple non_optimized paths.
         */
        nbdev_ch->current_io_path = non_optimized;
    }

    return non_optimized;
}
static struct nvme_io_path *
_bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch)
{
    struct nvme_io_path *io_path;
    struct nvme_io_path *optimized = NULL, *non_optimized = NULL;
    uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX;
    uint32_t num_outstanding_reqs;

    STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
        if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) {
            /* The device is currently resetting. */
            continue;
        }

        if (spdk_unlikely(io_path->nvme_ns->ana_state_updating)) {
            continue;
        }

        num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair);
        switch (io_path->nvme_ns->ana_state) {
        case SPDK_NVME_ANA_OPTIMIZED_STATE:
            if (num_outstanding_reqs < opt_min_qd) {
                opt_min_qd = num_outstanding_reqs;
                optimized = io_path;
            }
            break;
        case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
            if (num_outstanding_reqs < non_opt_min_qd) {
                non_opt_min_qd = num_outstanding_reqs;
                non_optimized = io_path;
            }
            break;
        default:
            break;
        }
    }

    /* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */
    if (optimized != NULL) {
        return optimized;
    }

    return non_optimized;
}
Combining bdev_nvme_find_io_path with the multipath policies from step 1, we can observe the following:
When the multipath policy is active_passive: if current_io_path is NULL, _bdev_nvme_find_io_path is called to scan the channel's io_path list and cache the first connected, ANA-optimized path (falling back to a non-optimized one) as current_io_path; subsequent I/O requests are then all issued on that cached current_io_path.
When the multipath policy is active_active: with the round_robin selector, each submission increments rr_counter, which counts the I/Os routed to the current path, and compares it with the rr-min-io value configured in step 1. While rr_counter is below rr-min-io, the I/O is issued on the current current_io_path; once it reaches rr-min-io, the counter is reset and _bdev_nvme_find_io_path is called to pick the next available io_path after the current one as the new current_io_path (see the sketch below). With the queue_depth selector, every submission calls _bdev_nvme_find_io_path_min_qd, which walks all healthy paths and issues the I/O on the one with the fewest outstanding requests, preferring ANA-optimized paths.
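To make the round_robin accounting concrete, here is a minimal standalone sketch. It is not SPDK code: the path count and rr_min_io value are made up, and path rotation is reduced to a modulo step. It models how the ++rr_counter < rr_min_io check in bdev_nvme_find_io_path spreads consecutive I/Os across paths:

#include <stdio.h>

int main(void)
{
    const unsigned num_paths = 2;  /* hypothetical two-path channel */
    const unsigned rr_min_io = 3;  /* as set via -r/--rr-min-io */
    unsigned rr_counter = 0;
    unsigned current_path = 0;

    for (unsigned io = 1; io <= 9; io++) {
        printf("I/O %u -> path %u\n", io, current_path);
        if (++rr_counter >= rr_min_io) {
            /* rr_min_io I/Os have gone to this path: reset the counter
             * and rotate, like _bdev_nvme_find_io_path() starting from
             * the path after current_io_path.
             */
            rr_counter = 0;
            current_path = (current_path + 1) % num_paths;
        }
    }

    return 0;
}

With these values the output shows I/Os 1-3 on path 0, 4-6 on path 1, and 7-9 back on path 0, i.e. rr-min-io consecutive I/Os per path.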
The above is how current SPDK implements multipath policy configuration for bdevs created over NVMe-oF, and how it selects a path for each submitted I/O according to the configured policy.