在dmesg日志或者/var/log/messages日志中,会有段错误(segfault)相关的日志,如何通过该日志找到对应的代码位置呢?下面以实际案例来讲述:
Oct 11 18:10:02 gd03-compute-10e37e48e71 kernel: revalidator625[571802]: segfault at 1c0 ip 000055b8df7cd23b sp 00007f6fba7f7310 error 4 in ovs-vswitchd[55b8df62a000+3ff000]
Oct 11 18:10:02 gd03-compute-10e37e48e71 kernel: revalidator625[571802]: segfault at 1c0 ip 000055b8df7cd23b sp 00007f6fba7f7310 error 4 in ovs-vswitchd[55b8df62a000+3ff000]
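按照该日志来看,出错时访问的内存地址为0x1c0(segfault at 1c0),error 4表示用户态程序读取了一个未映射的页面;ip寄存器值为000055b8df7cd23b,即程序执行到这个地址上的指令时出错。由于我们不能直接在线上调试,需要另找一个ovs-vswitchd二进制文件相同的节点查看。而各节点上程序的加载基地址并不相同,因此,我们需要先获取该指令相对于基地址的偏移量(基地址即日志末尾方括号中的55b8df62a000):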
偏移量=000055b8df7cd23b-55b8df62a000=1a323b
确定其他环境中ovs-vswitchd代码段的基地址(即maps输出中第一行、文件偏移为00000000的r-xp映射的起始地址):
[root@gz15-compute-s2-55e255e16e35 940418]# cat /proc/$(pidof ovs-vswitchd)/maps | head
55dd0df79000-55dd0e378000 r-xp 00000000 08:01 83898960 /usr/sbin/ovs-vswitchd
55dd0e578000-55dd0e593000 r--p 003ff000 08:01 83898960 /usr/sbin/ovs-vswitchd
55dd0e593000-55dd0e5d5000 rw-p 0041a000 08:01 83898960 /usr/sbin/ovs-vswitchd
55dd0e5d5000-55dd0e91e000 rw-p 00000000 00:00 0
55dd0e966000-55dd0e987000 rw-p 00000000 00:00 0 [heap]
55dd0e987000-55dd0ecf4000 rw-p 00000000 00:00 0 [heap]
7fcf40000000-7fcf40022000 rw-p 00000000 00:00 0
7fcf40022000-7fcf44000000 ---p 00000000 00:00 0
7fcf44000000-7fcf4404e000 rw-p 00000000 00:00 0
7fcf4404e000-7fcf48000000 ---p 00000000 00:00 0
获取到基地址:55dd0df79000。 计算在当前环境中,问题代码地址:
问题代码地址=基地址+偏移量=55dd0df79000+1a323b=55dd0e11c23b
根据问题地址代码,查看当前环境中代码情况:
[root@gz15-compute-s2-55e255e16e35 940418]# gdb -pid=$(pidof ovs-vswitchd)
(gdb) x/i 0x55DD0E11C23B
0x55dd0e11c23b <revalidate_ukey__+555>: mov 0x1c0(%rax),%eax
(gdb) x/32i 0x55DD0E11C23B
0x55dd0e11c23b <revalidate_ukey__+555>: mov 0x1c0(%rax),%eax
0x55dd0e11c241 <revalidate_ukey__+561>: mov %r10,%rdx
0x55dd0e11c244 <revalidate_ukey__+564>: mov %eax,(%rsp)
0x55dd0e11c247 <revalidate_ukey__+567>: callq 0x55dd0e11bed0 <compose_slow_path.isra.6>
0x55dd0e11c24c <revalidate_ukey__+572>: jmpq 0x55dd0e11c14f <revalidate_ukey__+319>
0x55dd0e11c251 <revalidate_ukey__+577>: callq 0x55dd0dfaa610 <memcmp@plt>
0x55dd0e11c256 <revalidate_ukey__+582>: xor %r12d,%r12d
0x55dd0e11c259 <revalidate_ukey__+585>: test %eax,%eax
0x55dd0e11c25b <revalidate_ukey__+587>: je 0x55dd0e11c1dc <revalidate_ukey__+460>
0x55dd0e11c261 <revalidate_ukey__+593>: jmpq 0x55dd0e11c1a8 <revalidate_ukey__+408>
0x55dd0e11c266 <revalidate_ukey__+598>: callq 0x55dd0dfaae40 <__stack_chk_fail@plt>
0x55dd0e11c26b: nopl 0x0(%rax,%rax,1)
0x55dd0e11c270 <revalidate_ukey>: push %r15
0x55dd0e11c272 <revalidate_ukey+2>: push %r14
0x55dd0e11c274 <revalidate_ukey+4>: mov %r9,%r14
0x55dd0e11c277 <revalidate_ukey+7>: push %r13
0x55dd0e11c279 <revalidate_ukey+9>: mov %rdi,%r13
0x55dd0e11c27c <revalidate_ukey+12>: push %r12
0x55dd0e11c27e <revalidate_ukey+14>: mov %r8,%r12
0x55dd0e11c281 <revalidate_ukey+17>: push %rbp
0x55dd0e11c282 <revalidate_ukey+18>: mov %rdx,%rbp
0x55dd0e11c285 <revalidate_ukey+21>: push %rbx
0x55dd0e11c286 <revalidate_ukey+22>: mov %rsi,%rbx
0x55dd0e11c289 <revalidate_ukey+25>: sub $0x48,%rsp
0x55dd0e11c28d <revalidate_ukey+29>: mov 0xa8(%rsi),%r8
0x55dd0e11c294 <revalidate_ukey+36>: movl $0x0,0x10(%rcx)
0x55dd0e11c29b <revalidate_ukey+43>: mov %fs:0x28,%rax
0x55dd0e11c2a4 <revalidate_ukey+52>: mov %rax,0x38(%rsp)
0x55dd0e11c2a9 <revalidate_ukey+57>: xor %eax,%eax
0x55dd0e11c2ab <revalidate_ukey+59>: mov (%rcx),%rax
0x55dd0e11c2ae <revalidate_ukey+62>: mov %rax,0x8(%rcx)
0x55dd0e11c2b2 <revalidate_ukey+66>: mov 0x0(%rbp),%rdi
查看代码具体行,以及代码上下文,可以看出代码是正在给函数参数赋值,且即将调用函数compose_slow_path,因此查看函数revalidate_ukey__及调用compose_slow_path处代码,
if (xoutp->slow) {
struct ofproto_dpif *ofproto;
ofp_port_t ofp_in_port;
ofproto = xlate_lookup_ofproto(udpif->backer, &ctx.flow, &ofp_in_port);
ofpbuf_clear(odp_actions);
compose_slow_path(udpif, xoutp, &ctx.flow, ctx.flow.in_port.odp_port,
ofp_in_port, odp_actions,
ofproto->up.slowpath_meter_id, &ofproto->uuid);
}
if (odp_flow_key_to_mask(ukey->mask, ukey->mask_len, &dp_mask, &ctx.flow)
== ODP_FIT_ERROR) {
goto exit;
}
分析compose_slow_path的各个实参,其中需要解引用指针才能取到值的,只有通过ofproto得到的两个参数(ofproto->up.slowpath_meter_id和&ofproto->uuid)。再查看ofproto的来源,发现它有可能为空:
ofproto = xlate_lookup_ofproto(udpif->backer, &ctx.flow, &ofp_in_port);
struct ofproto_dpif *
xlate_lookup_ofproto(const struct dpif_backer *backer, const struct flow *flow,
ofp_port_t *ofp_in_port)
{
const struct xport *xport;
return xlate_lookup_ofproto_(backer, flow, ofp_in_port, &xport);
}
static struct ofproto_dpif *
xlate_lookup_ofproto_(const struct dpif_backer *backer, const struct flow *flow,
ofp_port_t *ofp_in_port, const struct xport **xportp)
{
struct xlate_cfg *xcfg = ovsrcu_get(struct xlate_cfg *, &xcfgp);
const struct xport *xport;
/* If packet is recirculated, xport can be retrieved from frozen state. */
if (flow->recirc_id) {
const struct recirc_id_node *recirc_id_node;
recirc_id_node = recirc_id_node_find(flow->recirc_id);
if (OVS_UNLIKELY(!recirc_id_node)) {
**return NULL;**
}
/* If recirculation was initiated due to bond (in_port = OFPP_NONE)
* then frozen state is static and xport_uuid is not defined, so xport
* cannot be restored from frozen state. */
if (recirc_id_node->state.metadata.in_port != OFPP_NONE) {
struct uuid xport_uuid = recirc_id_node->state.xport_uuid;
xport = xport_lookup_by_uuid(xcfg, &xport_uuid);
if (xport && xport->xbridge && xport->xbridge->ofproto) {
goto out;
}
}
}
xport = xport_lookup(xcfg, tnl_port_should_receive(flow)
? tnl_port_receive(flow)
: odp_port_to_ofport(backer, flow->in_port.odp_port));
if (OVS_UNLIKELY(!xport)) {
** return NULL;**
}
out:
*xportp = xport;
if (ofp_in_port) {
*ofp_in_port = xport->ofp_port;
}
return xport->xbridge->ofproto;
}
该函数xlate_lookup_ofproto_返回值有可能为空,但是ofproto在使用的时候,没有判空,所以问题极有可能是该值为空导致的。
再看出错的那条汇编指令
mov 0x1c0(%rax),%eax
该指令从地址%rax+0x1c0处读取数据,而日志中的出错访问地址(segfault at 1c0)恰好就是0x1c0,说明此时%rax为0,即基指针为空。下面确认偏移量0x1c0是否对应参数ofproto->up.slowpath_meter_id:
(gdb) p sizeof(struct ofproto_dpif)
$1 = 1384
(gdb) p &(((struct ofproto_dpif *)0)->up)
$2 = (struct ofproto *) 0x20
(gdb) p &(((struct ofproto *)0)->slowpath_meter_id)
$4 = (uint32_t *) 0x1a0
(gdb) p &(((struct ofproto_dpif *)0)->up.slowpath_meter_id)
$5 = (uint32_t *) 0x1c0
up在struct ofproto_dpif中的偏移为0x20,slowpath_meter_id在struct ofproto中的偏移为0x1a0,两者相加(0x20+0x1a0)正好是0x1c0,即ofproto->up.slowpath_meter_id相对于ofproto的偏移量,所以可以确认就是ofproto为空触发的segfault。
查看最新的openvswitch开源代码,发现这部分代码已经修改了,在调用compose_slow_path之前增加了对ofproto的判空:
if (xoutp->slow) {
struct ofproto_dpif *ofproto;
ofp_port_t ofp_in_port;
ofproto = xlate_lookup_ofproto(udpif->backer, &ctx.flow, &ofp_in_port,
NULL);
ofpbuf_clear(odp_actions);
if (!ofproto) {
goto exit;
}
compose_slow_path(udpif, xoutp, ctx.flow.in_port.odp_port,
ofp_in_port, odp_actions,
ofproto->up.slowpath_meter_id, &ofproto->uuid);
}
补充:
1.在gdb中执行disassemble revalidate_ukey__可以反汇编整个函数,看到完整的汇编代码上下文。
2.如果gdb调试的时候看不到符号信息,可以安装debuginfo包后再查看。