Linux中,IO从文件系统开始,按照如下流程在各个层级中传递:
VFS——BLOCK——SCSI
VFS到BLOCK
文件系统层,通过submit_bio,提交BIO到块设备层
提交的BIO中,包含了page(写入数据或是读出数据)以及文件系统的block转换到盘的扇区(sector)位置
/*
 * Synchronous read issued by a filesystem, used here as the example:
 *  - mark the bio as a READ
 *  - translate the filesystem block start into the matching device sector
 *  - attach the already-allocated page to the bio
 *  - submit the bio to the block layer and wait for the result
 *
 * The bio and its single bio_vec are locals of this function.
 * Returns whatever submit_bio_wait() returns.
 */
static int
iomap_read_page_sync(loff_t block_start, struct page *page, unsigned poff,
		unsigned plen, struct iomap *iomap)
{
	struct bio_vec vec;
	struct bio read_bio;
	int status;

	/* One-entry vector table: exactly one page is attached below. */
	bio_init(&read_bio, &vec, 1);

	read_bio.bi_opf = REQ_OP_READ;
	read_bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
	bio_set_dev(&read_bio, iomap->bdev);

	/* plen bytes of the page, starting at offset poff. */
	__bio_add_page(&read_bio, page, plen, poff);

	status = submit_bio_wait(&read_bio);
	return status;
}
blk_mq_submit_bio
会将BIO关联到一个req中。req可以通过同步/异步等方式下发到硬件队列。
/*
 * Abridged excerpt ("..." marks elided code): take a bio and turn it into
 * a request (rq) on the queue of the bio's disk.
 *  - allocate a request via __blk_mq_alloc_request()
 *  - blk_mq_bio_to_request() ties the bio to rq (per the article, the
 *    bio's sector and size information is stored in the request)
 *  - depending on the bio and the queue, the request is issued in one of
 *    several ways (only one branch is shown here)
 *  - when the queue has an I/O scheduler (q->elevator), the request is
 *    inserted into the scheduler
 */
blk_qc_t blk_mq_submit_bio(struct bio *bio)
{
struct request_queue *q = bio->bi_disk->queue;
/* Classification of the bio's operation flags; the users are elided. */
const int is_sync = op_is_sync(bio->bi_opf);
const int is_flush_fua = op_is_flush(bio->bi_opf);
struct blk_mq_alloc_data data = {
.q = q,
};
struct request *rq;
...
rq = __blk_mq_alloc_request(&data);
...
blk_mq_bio_to_request(rq, bio, nr_segs);
...
else if (q->elevator) {
/* Insert the request at the IO scheduler queue */
blk_mq_sched_insert_request(rq, false, true, true);
}
...
}
接下来以 req下发到IO调度器——>IO调度器处理req 的流程举例:
BLOCK到SCSI
BLOCK往下可以对接多种不同存储协议层,比如UFS,NVME,SCSI。这里以SCSI举例。
SCSI通过blk_mq_init_queue
初始化队列,创建delay work,用于调度和处理req。
BLOCK IO调度器初始化流程:
BLOCK初始化块设备下的request queue时,会初始化request queue对应的调度器
[blk_mq_init_queue]
|
\|/
[blk_mq_init_allocated_queue]
|
\|/
[blk_mq_realloc_hw_ctxs]
|
\|/
[blk_mq_alloc_hctx:init delay work]
blk_mq_alloc_hctx
中,初始化了request queue对应的work处理函数为blk_mq_run_work_fn
BLOCK IO调度器添加req流程:
上文blk_mq_sched_insert_request
将req添加到request queue中,并唤醒delay work
[blk_mq_sched_insert_request]
|
\|/
[blk_mq_run_hw_queue]
|
\|/
[__blk_mq_delay_run_hw_queue]
|
\|/
[kblockd_mod_delayed_work_on]
BLOCK IO调度器处理req流程:
blk_mq_run_work_fn
在接受delay work调度后,处理request queue下的req。
[blk_mq_run_work_fn]
|
\|/
[__blk_mq_run_hw_queue]
|
\|/
[blk_mq_sched_dispatch_requests]
|
\|/
[blk_mq_dispatch_rq_list]
|
\|/
[SCSI queue_rq]
在blk_mq_dispatch_rq_list中,通过q->mq_ops->queue_rq,最终调用到SCSI注册给BLOCK的req方法:
/*
 * Fragment from inside blk_mq_dispatch_rq_list(): hand one request to the
 * driver's queue_rq callback and act on the status it returns.
 */
ret = q->mq_ops->queue_rq(hctx, &bd);
switch (ret) {
case BLK_STS_OK:
/* Accepted by the driver: count it as queued. */
queued++;
break;
case BLK_STS_RESOURCE:
/* Remember the resource shortage, then share the DEV_RESOURCE handling. */
needs_resource = true;
fallthrough;
case BLK_STS_DEV_RESOURCE:
/* Give the request back together with the list and stop dispatching. */
blk_mq_handle_dev_resource(rq, list);
goto out;
case BLK_STS_ZONE_RESOURCE:
/*
* Move the request to zone_list and keep going through
* the dispatch list to find more requests the drive can
* accept.
*/
blk_mq_handle_zone_resource(rq, &zone_list);
needs_resource = true;
break;
default:
/* Any other status: count an error and end the request with BLK_STS_IOERR. */
errors++;
blk_mq_end_request(rq, BLK_STS_IOERR);
}
SCSI定义的req ops模版为scsi_mq_ops,实现如下:
/*
 * blk-mq callback table that SCSI hands to the block layer.
 * Entries are designated initializers, so their order is irrelevant.
 */
static const struct blk_mq_ops scsi_mq_ops = {
	/* Called from blk_mq_dispatch_rq_list() via q->mq_ops->queue_rq. */
	.queue_rq		= scsi_queue_rq,
	/* Called from blk_mq_dispatch_rq_list() via q->mq_ops->commit_rqs. */
	.commit_rqs		= scsi_commit_rqs,
	/* Called from blk_mq_rq_timed_out() via mq_ops->timeout. */
	.timeout		= scsi_timeout,
	.complete		= scsi_softirq_done,

	.get_budget		= scsi_mq_get_budget,
	.put_budget		= scsi_mq_put_budget,
	.busy			= scsi_mq_lld_busy,
	.map_queues		= scsi_map_queues,

	.init_request		= scsi_mq_init_request,
	.exit_request		= scsi_mq_exit_request,
	.initialize_rq_fn	= scsi_initialize_rq,
	.cleanup_rq		= scsi_cleanup_rq,

#ifdef CONFIG_BLK_DEBUG_FS
	.show_rq		= scsi_show_rq,
#endif
};
SCSI到disk driver
scsi_queue_rq
将req转换为scsi cmd,分发到对应的scsi device driver处理。
scsi_queue_rq
scsi_queue_rq
下发scsi cmd的整体逻辑:
scsi_queue_rq==>scsi_dispatch_cmd==>queuecommand
如果scsi_dispatch_cmd成功,返回BLK_STS_OK
如果scsi_dispatch_cmd失败,唤醒scsi recover handler,返回BLK_STS_RESOURCE
其他错误返回对应的blk_status_t状态码(BLK_STS_*)
/*
 * Abridged excerpt ("..." marks elided code): dispatch a block request as
 * a SCSI command and handle dispatch errors.
 *  - obtain the scsi_cmnd that belongs to the request (blk_mq_rq_to_pdu)
 *  - when the device is not in SDEV_RUNNING, run scsi_device_state_check()
 *    and bail out if it does not return BLK_STS_OK
 *  - start the request, then issue the command with scsi_dispatch_cmd()
 *  - on a non-zero dispatch result, record the reason with
 *    scsi_set_blocked() and return BLK_STS_RESOURCE
 */
static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
...
struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
...
if (unlikely(sdev->sdev_state != SDEV_RUNNING)) {
ret = scsi_device_state_check(sdev, req);
if (ret != BLK_STS_OK)
goto out_put_budget;
}
...
/* Per the article, this is where the request's timeout is armed (via blk_add_timer). */
blk_mq_start_request(req);
reason = scsi_dispatch_cmd(cmd);
if (reason) {
/* Dispatch was refused: note why and report a resource shortage to blk-mq. */
scsi_set_blocked(cmd, reason);
ret = BLK_STS_RESOURCE;
goto out_dec_host_busy;
}
return BLK_STS_OK;
out_dec_host_busy:
scsi_dec_host_busy(shost, cmd);
...
}
scsi_dispatch_cmd失败,返回BLK_STS_RESOURCE的场景:
1.req重新添加到list中
2.scsi通过scsi_commit_rqs重新提交硬件队列,处理list(如果scsi host没有提供commit_rqs则没有此步。目前看仅virtio scsi有)
3.blk_mq_delay_run_hw_queue异步调度队列
4.再次调度block delay work:blk_mq_run_work_fn,重走Block到scsi的流程
/*
 * Abridged excerpt ("..." marks elided code) showing what happens after
 * the driver's queue_rq callback reports a resource shortage.
 */
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
unsigned int nr_budgets)
{
...
/*
 * Hand the request to the driver. On BLK_STS_RESOURCE or
 * BLK_STS_DEV_RESOURCE it is passed back with the list through
 * blk_mq_handle_dev_resource() (per the article, re-added to the list)
 * and dispatching stops.
 */
ret = q->mq_ops->queue_rq(hctx, &bd);
switch (ret) {
case BLK_STS_OK:
queued++;
break;
case BLK_STS_RESOURCE:
needs_resource = true;
fallthrough;
case BLK_STS_DEV_RESOURCE:
blk_mq_handle_dev_resource(rq, list);
goto out;
...
out:
...
/*
 * If the driver registered commit_rqs and at least one request was
 * queued, call commit_rqs here when the list is not empty, errors
 * occurred, or a resource shortage was seen.
 */
if ((!list_empty(list) || errors || needs_resource ||
ret == BLK_STS_DEV_RESOURCE) && q->mq_ops->commit_rqs && queued)
q->mq_ops->commit_rqs(hctx);
...
/* Requests are left over: decide how the hardware queue is run again. */
if (!list_empty(list)) {
...
needs_restart = blk_mq_sched_needs_restart(hctx);
if (prep == PREP_DISPATCH_NO_BUDGET)
needs_resource = true;
/*
 * Either re-run the queue right away (second argument true), or, when
 * a restart is pending and resources were short, re-run it after
 * BLK_MQ_RESOURCE_DELAY.
 */
if (!needs_restart ||
(no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
blk_mq_run_hw_queue(hctx, true);
else if (needs_restart && needs_resource)
blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
blk_mq_update_dispatch_busy(hctx, true);
return false;
}
...
}
scsi_dispatch_cmd中,queuecommand失败的命令会一直尝试重发,直到req的定时器到期
scsi_dispatch_cmd
scsi_dispatch_cmd
将cmd下发给Low-level driver。调用的方法是queuecommand
/*
 * Abridged excerpt ("..." marks elided code): hand a SCSI command to the
 * SCSI host.
 *  - device in state SDEV_DEL: set the result to DID_NO_CONNECT and finish
 *  - device blocked: return SCSI_MLQUEUE_DEVICE_BUSY
 *  - otherwise issue the command through the host template's queuecommand
 *  - a non-zero queuecommand result is reported as one of the
 *    SCSI_MLQUEUE_*_BUSY codes
 */
static int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
{
...
if (unlikely(cmd->device->sdev_state == SDEV_DEL)) {
/* DID_NO_CONNECT is stored shifted left by 16 bits in cmd->result. */
cmd->result = DID_NO_CONNECT << 16;
goto done;
}
...
if (unlikely(scsi_device_blocked(cmd->device))) {
SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
"queuecommand : device blocked\n"));
/* The command is not issued, so the device's iorequest_cnt is decremented. */
atomic_dec(&cmd->device->iorequest_cnt);
return SCSI_MLQUEUE_DEVICE_BUSY;
}
...
rtn = host->hostt->queuecommand(host, cmd);
if (rtn) {
atomic_dec(&cmd->device->iorequest_cnt);
trace_scsi_dispatch_cmd_error(cmd, rtn);
/* Anything other than DEVICE_BUSY / TARGET_BUSY is normalized to HOST_BUSY. */
if (rtn != SCSI_MLQUEUE_DEVICE_BUSY &&
rtn != SCSI_MLQUEUE_TARGET_BUSY)
rtn = SCSI_MLQUEUE_HOST_BUSY;
SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
"queuecommand : request rejected\n"));
}
...
}
SCSI CMD完成与超时处理
blk_mq_timeout_work
scsi_queue_rq
在scsi_dispatch_cmd
下发cmd前,通过blk_mq_start_request
添加了req的超时处理
blk_mq_start_request
通过blk_add_timer
,更新req的超时时间
req会被添加到request queue的timer list下(从下面的代码看,仅当q->mq_ops为空时才会挂到timeout_list上),所有的req在request queue的timer中统一判断是否超时。request queue的超时处理函数是blk_rq_timed_out_timer。
根据req的超时时间,判断是否需要修改request queue的timer。
修改request queue的timer需要满足条件:request queue的timer没有在处理中,或是req的超时时间要早于之前设置的时间
/*
 * Abridged excerpt ("..." marks elided code): set the deadline of a
 * request and, if necessary, re-arm the request queue's timeout timer.
 */
void blk_add_timer(struct request *req)
{
...
/* Use the queue-wide default when the request carries no timeout of its own. */
if (!req->timeout)
req->timeout = q->rq_timeout;
req->rq_flags &= ~RQF_TIMED_OUT;
blk_rq_set_deadline(req, jiffies + req->timeout);
/* Only when the queue has no mq_ops is the request linked onto q->timeout_list. */
if (!q->mq_ops)
list_add_tail(&req->timeout_list, &req->q->timeout_list);
/*
 * Decide whether the queue timer must be modified: it is re-armed when
 * it is not pending, or when the new expiry is earlier than the current
 * one by at least HZ / 2.
 */
expiry = blk_rq_timeout(round_jiffies_up(blk_rq_deadline(req)));
if (!timer_pending(&q->timeout) ||
time_before(expiry, q->timeout.expires)) {
unsigned long diff = q->timeout.expires - expiry;
if (!timer_pending(&q->timeout) || (diff >= HZ / 2))
mod_timer(&q->timeout, expiry);
}
...
}
blk_alloc_queue_node在创建block request queue时初始化了request queue下的timer和timeout work
/*
 * Abridged excerpt ("..." marks elided code): while creating a request
 * queue, set up its timeout timer and its timeout work item.
 */
struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id,
spinlock_t *lock)
{
...
/* The queue timer's callback is blk_rq_timed_out_timer. */
timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
/* timeout_work starts out with the empty handler blk_timeout_work_empty. */
INIT_WORK(&q->timeout_work, blk_timeout_work_empty);
...
}
blk_rq_timed_out_timer在定时器到期后会调度timeout_work
/*
 * Callback of the request queue's timeout timer: recover the owning
 * request_queue from the embedded timer and schedule its timeout_work
 * through kblockd.
 */
static void blk_rq_timed_out_timer(struct timer_list *t)
{
	struct request_queue *queue = from_timer(queue, t, timeout);

	kblockd_schedule_work(&queue->timeout_work);
}
思考:为什么要在timer里调度work?是不是兼容了老版本的做法?
timeout_work真正的处理函数不是blk_timeout_work_empty,而是blk_mq_timeout_work。
blk_alloc_queue_node中INIT_WORK的blk_timeout_work_empty是个空函数,blk_mq_init_allocated_queue中完成了真正的初始化
/*
 * Empty work handler: does nothing. blk_alloc_queue_node() installs it as
 * the initial handler of q->timeout_work.
 */
static void blk_timeout_work_empty(struct work_struct *work)
{
}
/*
 * Abridged excerpt ("..." marks elided code): install the real timeout
 * work handler and the request timeout of the queue.
 */
struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
struct request_queue *q,
bool elevator_init)
{
...
/* Re-initialize timeout_work with blk_mq_timeout_work as its handler. */
INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
/* Use the tag set's timeout when it is non-zero, otherwise 30 * HZ. */
blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
...
}
blk_mq_timeout_work
遍历request_queue下的所有req,判断其是否已达到超时时间
如果处理一圈后有还未到超时时间的req,修改request_queue的下一次到期时间为剩下req中最早的到期时间
/*
 * Abridged excerpt ("..." marks elided code): work handler that checks
 * the busy requests of a queue for expiry and re-arms the queue timer.
 */
static void blk_mq_timeout_work(struct work_struct *work)
{
...
/* Run blk_mq_check_expired on the busy requests; &next is passed as its private data. */
blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
if (next != 0) {
/* A later deadline was reported: re-arm the queue timer for it. */
mod_timer(&q->timeout, next);
} else {
/* No further deadline: call blk_mq_tag_idle() on every mapped hardware context. */
queue_for_each_hw_ctx(q, hctx, i) {
if (blk_mq_hw_queue_mapped(hctx))
blk_mq_tag_idle(hctx);
}
}
blk_queue_exit(q);
...
}
blk_mq_check_expired
检查request_queue下的每一个req,对于已经到了超时时间的req调用超时处理
超时处理完成后,如果满足释放条件(req引用计数为0),则释放req
/*
 * Abridged excerpt ("..." marks elided code): per-request callback of the
 * timeout scan. Runs the timeout handling for an expired request, then
 * finishes the request as shown below.
 */
static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
struct request *rq, void *priv, bool reserved)
{
...
if (blk_mq_req_expired(rq, next))
blk_mq_rq_timed_out(rq, reserved);
/* Flush requests are ended through their end_io callback with status 0. */
if (is_flush_rq(rq, hctx))
rq->end_io(rq, 0);
/* Otherwise drop one reference and free the request once the count reaches zero. */
else if (refcount_dec_and_test(&rq->ref))
__blk_mq_free_request(rq);
}
blk_mq_rq_timed_out
处理超时req,调用对应的mq_ops->timeout
对于scsi req而言,超时处理函数是scsi_timeout
/*
 * Handle a request whose deadline has passed: flag it as timed out and
 * let the queue's mq_ops->timeout callback, if present, decide what to
 * do. When the callback returns BLK_EH_DONE nothing more happens here;
 * otherwise (or when there is no callback) the request timer is armed
 * again through blk_add_timer().
 */
static void blk_mq_rq_timed_out(struct request *req, bool reserved)
{
	req->rq_flags |= RQF_TIMED_OUT;

	if (req->q->mq_ops->timeout) {
		enum blk_eh_timer_return verdict =
			req->q->mq_ops->timeout(req, reserved);

		if (verdict == BLK_EH_DONE)
			return;

		/* The only other expected answer is "reset the timer". */
		WARN_ON_ONCE(verdict != BLK_EH_RESET_TIMER);
	}

	blk_add_timer(req);
}
scsi_mq_done
scsi_queue_rq
在scsi_dispatch_cmd
下发cmd前,将scsi cmd完成的回调函数scsi_done设为scsi_mq_done
当IO完成时,调用scsi_mq_done返回success的status,并取消req timer。
scsi_mq_done
调用blk_mq_complete_request通知block,req完成
/*
 * Completion callback for a SCSI command: emit the "dispatch done" trace
 * event, then report the command's request to blk-mq as complete.
 */
static void scsi_mq_done(struct scsi_cmnd *cmd)
{
	struct request *rq;

	trace_scsi_dispatch_cmd_done(cmd);

	rq = cmd->request;
	blk_mq_complete_request(rq);
}
通过了scsi_mq_done的I/O,无论最终状态是success亦或是有其他status,均被视为完成,会在blk_finish_request中取消之前在blk_mq_start_request中启动的timer:
/*
 * Abridged excerpt ("..." marks elided code): while finishing a request,
 * its timer is removed through blk_delete_timer().
 */
void blk_finish_request(struct request *req, blk_status_t error)
{
struct request_queue *q = req->q;
/* Current time in nanoseconds; its users are elided. */
u64 now = ktime_get_ns();
...
blk_delete_timer(req);
...
}
blk_delete_timer
会将req timer从request queue的timer list上摘除
/*
 * Unlink the request's timeout_list entry from whatever list it is on
 * and re-initialize the entry.
 */
void blk_delete_timer(struct request *req)
{
	struct list_head *entry = &req->timeout_list;

	list_del_init(entry);
}