searchusermenu
  • 发布文章
  • 消息中心
点赞
收藏
评论
分享
原创

SPDK多路径策略配置及实现原理简介

2023-10-17 02:29:01
137
0

1、如何配置多路径。在使用rpc指令“bdev_nvme_attach_controller”以 -x multipath添加多个路径之后,可以继续使用rpc指令“bdev_nvme_set_multipath_policy”对nvme bdev配置多路径策略。bdev_nvme_set_multipath_policy支持的配置参数如下:

'-b', '--name', help='Name of the NVMe bdev', required=True
'-p', '--policy', help='Multipath policy (active_passive or active_active)', required=True
'-s', '--selector', help='Multipath selector (round_robin, queue_depth)', required=False
'-r', '--rr-min-io', help='Number of IO to route to a path before switching to another for round-robin', type=int, required=False

从配置参数可以看出,多路径策略当前支持active_passive或active_active两种。

在active_active模式下,可以继续配置multipath selector,支持round_robin(轮询)和queue_depth(最小队列深度)两种调度选择器。其中round_robin轮询模式下可以配置rr-min-io,即切换到下一条路径之前,在当前路径上连续下发的IO数量。

2、基于1中配置的多路径策略,当提交IO请求时,SPDK如何根据配置的策略选择路径?

通过NVMe-oF创建的bdev,IO request通过bdev_nvme_submit_request提交。

static void bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;

	if (spdk_likely(nbdev_io->submit_tsc == 0)) {
		nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io);
	} else {
		/* There are cases where submit_tsc != 0, i.e. retry I/O.
		 * We need to update submit_tsc here.
		 */
		nbdev_io->submit_tsc = spdk_get_ticks();
	}

	spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io);
	nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch);
	if (spdk_unlikely(!nbdev_io->io_path)) {
		if (!bdev_nvme_io_type_is_admin(bdev_io->type)) {
			bdev_nvme_io_complete(nbdev_io, -ENXIO);
			return;
		}

		/* Admin commands do not use the optimal I/O path.
		 * Simply fall through even if it is not found.
		 */
	}

	_bdev_nvme_submit_request(nbdev_ch, bdev_io);
}

通过bdev_nvme_submit_request实现可以看出,其首先通过bdev_nvme_find_io_path选择当前io request需要提交的io_path,而bdev_nvme_find_io_path就会根据1中配置多路径策略进行选择,具体实现如下:

/*
 * Pick the I/O path for the next request according to the channel's
 * multipath policy and selector.
 *
 *  - active_passive policy: return the cached current_io_path when there is
 *    one, otherwise search with _bdev_nvme_find_io_path().
 *  - any other policy with the round_robin selector: keep using the cached
 *    path until rr_counter reaches rr_min_io, then reset the counter and
 *    search with _bdev_nvme_find_io_path().
 *  - any other policy with any other selector: always search with
 *    _bdev_nvme_find_io_path_min_qd().
 *
 * Returns whatever the chosen search helper returns, which may be NULL.
 */
static inline struct nvme_io_path *bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *cached = nbdev_ch->current_io_path;

	if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) {
		if (spdk_likely(cached != NULL)) {
			return cached;
		}
		return _bdev_nvme_find_io_path(nbdev_ch);
	}

	if (nbdev_ch->mp_selector != BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
		/* The cached path is not consulted for this selector. */
		return _bdev_nvme_find_io_path_min_qd(nbdev_ch);
	}

	if (spdk_likely(cached != NULL)) {
		/* The counter is only advanced while a path is cached. */
		if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) {
			return cached;
		}
		nbdev_ch->rr_counter = 0;
	}

	return _bdev_nvme_find_io_path(nbdev_ch);
}
/*
 * Walk the channel's I/O paths, starting from the one that
 * nvme_io_path_get_next() yields for current_io_path, until the walk comes
 * back to its starting point.
 *
 * Paths that are not connected, or whose namespace has ana_state_updating
 * set, are ignored. The first usable path in ANA optimized state is stored
 * in current_io_path and returned immediately. Otherwise the first usable
 * path in ANA non-optimized state seen during the walk is returned (NULL if
 * there is none); it is stored in current_io_path only for the
 * active_active policy.
 */
static struct nvme_io_path * _bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *first = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path);
	struct nvme_io_path *candidate = first;
	struct nvme_io_path *fallback = NULL;

	do {
		if (spdk_likely(nvme_io_path_is_connected(candidate) &&
				!candidate->nvme_ns->ana_state_updating)) {
			if (candidate->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) {
				nbdev_ch->current_io_path = candidate;
				return candidate;
			}

			/* Remember only the first non-optimized path encountered. */
			if (fallback == NULL &&
			    candidate->nvme_ns->ana_state == SPDK_NVME_ANA_NON_OPTIMIZED_STATE) {
				fallback = candidate;
			}
		}

		candidate = nvme_io_path_get_next(nbdev_ch, candidate);
	} while (candidate != first);

	/* Reaching this point means no optimized path was found. For
	 * active_active the non-optimized result (possibly NULL) is cached too,
	 * for load balance across multiple non-optimized paths.
	 */
	if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) {
		nbdev_ch->current_io_path = fallback;
	}

	return fallback;
}

/*
 * Scan every entry of the channel's io_path_list and pick the path whose
 * qpair reports the fewest outstanding requests.
 *
 * Paths that are not connected, or whose namespace has ana_state_updating
 * set, are skipped. Minimums are tracked separately for ANA optimized and
 * ANA non-optimized paths; on equal counts the earlier list entry wins.
 * An optimized path is preferred over any non-optimized one. Returns NULL
 * when no path qualifies. current_io_path is never written here.
 */
static struct nvme_io_path * _bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *path;
	struct nvme_io_path *best_opt = NULL;
	struct nvme_io_path *best_non_opt = NULL;
	uint32_t best_opt_depth = UINT32_MAX;
	uint32_t best_non_opt_depth = UINT32_MAX;
	uint32_t depth;

	STAILQ_FOREACH(path, &nbdev_ch->io_path_list, stailq) {
		/* Skip paths that are not connected (e.g. the device is currently
		 * resetting) or whose ANA state is being updated.
		 */
		if (spdk_unlikely(!nvme_io_path_is_connected(path)) ||
		    spdk_unlikely(path->nvme_ns->ana_state_updating)) {
			continue;
		}

		depth = spdk_nvme_qpair_get_num_outstanding_reqs(path->qpair->qpair);

		if (path->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) {
			if (depth < best_opt_depth) {
				best_opt_depth = depth;
				best_opt = path;
			}
		} else if (path->nvme_ns->ana_state == SPDK_NVME_ANA_NON_OPTIMIZED_STATE) {
			if (depth < best_non_opt_depth) {
				best_non_opt_depth = depth;
				best_non_opt = path;
			}
		}
	}

	/* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */
	return (best_opt != NULL) ? best_opt : best_non_opt;
}

通过bdev_nvme_find_io_path结合1中的多路径策略,可以发现:

当多路径策略为active_passive模式时:若current io path为NULL,则调用_bdev_nvme_find_io_path遍历io path list,优先选择第一条已连接且ANA状态为optimized的路径并将其缓存为current io path,后续提交的io请求则继续从该current io path下发;若不存在optimized路径,则返回第一条可用的non_optimized路径,但该模式下不会缓存它,因此下次提交IO时会重新选择;

当多路径策略为active_active模式时:若multipath selector为round robin即轮询模式,则每次下发IO时先将计数器rr_counter加1(即当前current io path上已连续下发的IO数量),再判断其是否小于1中配置的rr-min-io:若小于rr-min-io,则继续从当前current io path下发IO;若大于等于rr-min-io,则将rr_counter清零,并调用_bdev_nvme_find_io_path从current io path的下一条路径开始选择可用的io path作为current_io_path下发IO。若multipath selector为queue_depth即最小队列深度模式,则每次下发IO时均会调用_bdev_nvme_find_io_path_min_qd,遍历每条正常路径并选择outstanding req最少的io path下发IO(optimized路径优先于non_optimized路径)。

以上即为当前SPDK实现的基于NVMe-oF创建的bdev配置多路径策略方法及如何根据多路径策略选择路径下发IO的实现。

0条评论
0 / 1000
l****n
5文章数
0粉丝数
l****n
5 文章 | 0 粉丝
原创

SPDK多路径策略配置及实现原理简介

2023-10-17 02:29:01
137
0

1、如何配置多路径。在使用rpc指令“bdev_nvme_attach_controller”以 -x multipath添加多个路径之后,可以继续使用rpc指令“bdev_nvme_set_multipath_policy”对nvme bdev配置多路径策略。bdev_nvme_set_multipath_policy支持的配置参数如下:

'-b', '--name', help='Name of the NVMe bdev', required=True
'-p', '--policy', help='Multipath policy (active_passive or active_active)', required=True
'-s', '--selector', help='Multipath selector (round_robin, queue_depth)', required=False
'-r', '--rr-min-io', help='Number of IO to route to a path before switching to another for round-robin', type=int, required=False

从配置参数可以看出,多路径策略当前支持active_passive或active_active两种。

在active_active模式下,可以继续配置multipath selector,支持round_robin(轮询)和queue_depth(最小队列深度)两种调度选择器。其中round_robin轮询模式下可以配置rr-min-io,即切换到下一条路径之前,在当前路径上连续下发的IO数量。

2、基于1中配置的多路径策略,当提交IO请求时,SPDK如何根据配置的策略选择路径?

通过NVMe-oF创建的bdev,IO request通过bdev_nvme_submit_request提交。

static void bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;

	if (spdk_likely(nbdev_io->submit_tsc == 0)) {
		nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io);
	} else {
		/* There are cases where submit_tsc != 0, i.e. retry I/O.
		 * We need to update submit_tsc here.
		 */
		nbdev_io->submit_tsc = spdk_get_ticks();
	}

	spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io);
	nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch);
	if (spdk_unlikely(!nbdev_io->io_path)) {
		if (!bdev_nvme_io_type_is_admin(bdev_io->type)) {
			bdev_nvme_io_complete(nbdev_io, -ENXIO);
			return;
		}

		/* Admin commands do not use the optimal I/O path.
		 * Simply fall through even if it is not found.
		 */
	}

	_bdev_nvme_submit_request(nbdev_ch, bdev_io);
}

通过bdev_nvme_submit_request实现可以看出,其首先通过bdev_nvme_find_io_path选择当前io request需要提交的io_path,而bdev_nvme_find_io_path就会根据1中配置多路径策略进行选择,具体实现如下:

/*
 * Pick the I/O path for the next request according to the channel's
 * multipath policy and selector.
 *
 *  - active_passive policy: return the cached current_io_path when there is
 *    one, otherwise search with _bdev_nvme_find_io_path().
 *  - any other policy with the round_robin selector: keep using the cached
 *    path until rr_counter reaches rr_min_io, then reset the counter and
 *    search with _bdev_nvme_find_io_path().
 *  - any other policy with any other selector: always search with
 *    _bdev_nvme_find_io_path_min_qd().
 *
 * Returns whatever the chosen search helper returns, which may be NULL.
 */
static inline struct nvme_io_path *bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *cached = nbdev_ch->current_io_path;

	if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) {
		if (spdk_likely(cached != NULL)) {
			return cached;
		}
		return _bdev_nvme_find_io_path(nbdev_ch);
	}

	if (nbdev_ch->mp_selector != BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
		/* The cached path is not consulted for this selector. */
		return _bdev_nvme_find_io_path_min_qd(nbdev_ch);
	}

	if (spdk_likely(cached != NULL)) {
		/* The counter is only advanced while a path is cached. */
		if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) {
			return cached;
		}
		nbdev_ch->rr_counter = 0;
	}

	return _bdev_nvme_find_io_path(nbdev_ch);
}
/*
 * Walk the channel's I/O paths, starting from the one that
 * nvme_io_path_get_next() yields for current_io_path, until the walk comes
 * back to its starting point.
 *
 * Paths that are not connected, or whose namespace has ana_state_updating
 * set, are ignored. The first usable path in ANA optimized state is stored
 * in current_io_path and returned immediately. Otherwise the first usable
 * path in ANA non-optimized state seen during the walk is returned (NULL if
 * there is none); it is stored in current_io_path only for the
 * active_active policy.
 */
static struct nvme_io_path * _bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *first = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path);
	struct nvme_io_path *candidate = first;
	struct nvme_io_path *fallback = NULL;

	do {
		if (spdk_likely(nvme_io_path_is_connected(candidate) &&
				!candidate->nvme_ns->ana_state_updating)) {
			if (candidate->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) {
				nbdev_ch->current_io_path = candidate;
				return candidate;
			}

			/* Remember only the first non-optimized path encountered. */
			if (fallback == NULL &&
			    candidate->nvme_ns->ana_state == SPDK_NVME_ANA_NON_OPTIMIZED_STATE) {
				fallback = candidate;
			}
		}

		candidate = nvme_io_path_get_next(nbdev_ch, candidate);
	} while (candidate != first);

	/* Reaching this point means no optimized path was found. For
	 * active_active the non-optimized result (possibly NULL) is cached too,
	 * for load balance across multiple non-optimized paths.
	 */
	if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) {
		nbdev_ch->current_io_path = fallback;
	}

	return fallback;
}

/*
 * Scan every entry of the channel's io_path_list and pick the path whose
 * qpair reports the fewest outstanding requests.
 *
 * Paths that are not connected, or whose namespace has ana_state_updating
 * set, are skipped. Minimums are tracked separately for ANA optimized and
 * ANA non-optimized paths; on equal counts the earlier list entry wins.
 * An optimized path is preferred over any non-optimized one. Returns NULL
 * when no path qualifies. current_io_path is never written here.
 */
static struct nvme_io_path * _bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *path;
	struct nvme_io_path *best_opt = NULL;
	struct nvme_io_path *best_non_opt = NULL;
	uint32_t best_opt_depth = UINT32_MAX;
	uint32_t best_non_opt_depth = UINT32_MAX;
	uint32_t depth;

	STAILQ_FOREACH(path, &nbdev_ch->io_path_list, stailq) {
		/* Skip paths that are not connected (e.g. the device is currently
		 * resetting) or whose ANA state is being updated.
		 */
		if (spdk_unlikely(!nvme_io_path_is_connected(path)) ||
		    spdk_unlikely(path->nvme_ns->ana_state_updating)) {
			continue;
		}

		depth = spdk_nvme_qpair_get_num_outstanding_reqs(path->qpair->qpair);

		if (path->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) {
			if (depth < best_opt_depth) {
				best_opt_depth = depth;
				best_opt = path;
			}
		} else if (path->nvme_ns->ana_state == SPDK_NVME_ANA_NON_OPTIMIZED_STATE) {
			if (depth < best_non_opt_depth) {
				best_non_opt_depth = depth;
				best_non_opt = path;
			}
		}
	}

	/* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */
	return (best_opt != NULL) ? best_opt : best_non_opt;
}

通过bdev_nvme_find_io_path结合1中的多路径策略,可以发现:

当多路径策略为active_passive模式时:若current io path为NULL,则调用_bdev_nvme_find_io_path遍历io path list,优先选择第一条已连接且ANA状态为optimized的路径并将其缓存为current io path,后续提交的io请求则继续从该current io path下发;若不存在optimized路径,则返回第一条可用的non_optimized路径,但该模式下不会缓存它,因此下次提交IO时会重新选择;

当多路径策略为active_active模式时:若multipath selector为round robin即轮询模式,则每次下发IO时先将计数器rr_counter加1(即当前current io path上已连续下发的IO数量),再判断其是否小于1中配置的rr-min-io:若小于rr-min-io,则继续从当前current io path下发IO;若大于等于rr-min-io,则将rr_counter清零,并调用_bdev_nvme_find_io_path从current io path的下一条路径开始选择可用的io path作为current_io_path下发IO。若multipath selector为queue_depth即最小队列深度模式,则每次下发IO时均会调用_bdev_nvme_find_io_path_min_qd,遍历每条正常路径并选择outstanding req最少的io path下发IO(optimized路径优先于non_optimized路径)。

以上即为当前SPDK实现的基于NVMe-oF创建的bdev配置多路径策略方法及如何根据多路径策略选择路径下发IO的实现。

文章来自个人专栏
文章 | 订阅
0条评论
0 / 1000
请输入你的评论
0
0