searchusermenu
  • 发布文章
  • 消息中心
点赞
收藏
评论
分享
原创

根据日志查找代码空指针位置

2023-10-13 07:25:29
17
0

进程发生段错误(segfault)时,内核会在 dmesg 或 messages 日志中留下一条相关记录。如何通过这条日志定位到对应的代码位置呢?下面以一个实际案例来说明。

Oct 11 18:10:02 gd03-compute-10e37e48e71 kernel: revalidator625[571802]: segfault at 1c0 ip 000055b8df7cd23b sp 00007f6fba7f7310 error 4 in ovs-vswitchd[55b8df62a000+3ff000]
Oct 11 18:10:02 gd03-compute-10e37e48e71 kernel: revalidator625[571802]: segfault at 1c0 ip 000055b8df7cd23b sp 00007f6fba7f7310 error 4 in ovs-vswitchd[55b8df62a000+3ff000]

从这条日志可以看出,出错时 ip(指令指针)寄存器的值为 000055b8df7cd23b,即程序执行到该地址处的指令时触发了段错误;方括号中的 55b8df62a000+3ff000 是出错指令所在的 ovs-vswitchd 代码映射区的起始地址(基地址)和长度。由于不能直接在线上环境调试,需要另找一个二进制文件完全相同的节点来查看,因此要先计算出错指令相对于基地址的偏移量:

偏移量=000055b8df7cd23b-55b8df62a000=1a323b


在另一个环境中确定 ovs-vswitchd 代码段的基地址,即 maps 中 /usr/sbin/ovs-vswitchd 那条 r-xp(可执行)映射的起始地址:

[root@gz15-compute-s2-55e255e16e35 940418]# cat /proc/$(pidof ovs-vswitchd)/maps | head 
55dd0df79000-55dd0e378000 r-xp 00000000 08:01 83898960                   /usr/sbin/ovs-vswitchd
55dd0e578000-55dd0e593000 r--p 003ff000 08:01 83898960                   /usr/sbin/ovs-vswitchd
55dd0e593000-55dd0e5d5000 rw-p 0041a000 08:01 83898960                   /usr/sbin/ovs-vswitchd
55dd0e5d5000-55dd0e91e000 rw-p 00000000 00:00 0 
55dd0e966000-55dd0e987000 rw-p 00000000 00:00 0                          [heap]
55dd0e987000-55dd0ecf4000 rw-p 00000000 00:00 0                          [heap]
7fcf40000000-7fcf40022000 rw-p 00000000 00:00 0 
7fcf40022000-7fcf44000000 ---p 00000000 00:00 0 
7fcf44000000-7fcf4404e000 rw-p 00000000 00:00 0 
7fcf4404e000-7fcf48000000 ---p 00000000 00:00 0

获取到基地址55dd0df79000。 计算在当前环境中,问题代码地址:

问题代码地址=基地址+偏移量=55dd0df79000+1a323b=55dd0e11c23b

根据问题地址代码,查看当前环境中代码情况:

[root@gz15-compute-s2-55e255e16e35 940418]# gdb -pid=$(pidof ovs-vswitchd)
(gdb) x/i 0x55DD0E11C23B
   0x55dd0e11c23b <revalidate_ukey__+555>:	mov    0x1c0(%rax),%eax
(gdb) x/32i 0x55DD0E11C23B
   0x55dd0e11c23b <revalidate_ukey__+555>:	mov    0x1c0(%rax),%eax
   0x55dd0e11c241 <revalidate_ukey__+561>:	mov    %r10,%rdx
   0x55dd0e11c244 <revalidate_ukey__+564>:	mov    %eax,(%rsp)
   0x55dd0e11c247 <revalidate_ukey__+567>:	callq  0x55dd0e11bed0 <compose_slow_path.isra.6>
   0x55dd0e11c24c <revalidate_ukey__+572>:	jmpq   0x55dd0e11c14f <revalidate_ukey__+319>
   0x55dd0e11c251 <revalidate_ukey__+577>:	callq  0x55dd0dfaa610 <memcmp@plt>
   0x55dd0e11c256 <revalidate_ukey__+582>:	xor    %r12d,%r12d
   0x55dd0e11c259 <revalidate_ukey__+585>:	test   %eax,%eax
   0x55dd0e11c25b <revalidate_ukey__+587>:	je     0x55dd0e11c1dc <revalidate_ukey__+460>
   0x55dd0e11c261 <revalidate_ukey__+593>:	jmpq   0x55dd0e11c1a8 <revalidate_ukey__+408>
   0x55dd0e11c266 <revalidate_ukey__+598>:	callq  0x55dd0dfaae40 <__stack_chk_fail@plt>
   0x55dd0e11c26b:	nopl   0x0(%rax,%rax,1)
   0x55dd0e11c270 <revalidate_ukey>:	push   %r15
   0x55dd0e11c272 <revalidate_ukey+2>:	push   %r14
   0x55dd0e11c274 <revalidate_ukey+4>:	mov    %r9,%r14
   0x55dd0e11c277 <revalidate_ukey+7>:	push   %r13
   0x55dd0e11c279 <revalidate_ukey+9>:	mov    %rdi,%r13
   0x55dd0e11c27c <revalidate_ukey+12>:	push   %r12
   0x55dd0e11c27e <revalidate_ukey+14>:	mov    %r8,%r12
   0x55dd0e11c281 <revalidate_ukey+17>:	push   %rbp
   0x55dd0e11c282 <revalidate_ukey+18>:	mov    %rdx,%rbp
   0x55dd0e11c285 <revalidate_ukey+21>:	push   %rbx
   0x55dd0e11c286 <revalidate_ukey+22>:	mov    %rsi,%rbx
   0x55dd0e11c289 <revalidate_ukey+25>:	sub    $0x48,%rsp
   0x55dd0e11c28d <revalidate_ukey+29>:	mov    0xa8(%rsi),%r8
   0x55dd0e11c294 <revalidate_ukey+36>:	movl   $0x0,0x10(%rcx)
   0x55dd0e11c29b <revalidate_ukey+43>:	mov    %fs:0x28,%rax
   0x55dd0e11c2a4 <revalidate_ukey+52>:	mov    %rax,0x38(%rsp)
   0x55dd0e11c2a9 <revalidate_ukey+57>:	xor    %eax,%eax
   0x55dd0e11c2ab <revalidate_ukey+59>:	mov    (%rcx),%rax
   0x55dd0e11c2ae <revalidate_ukey+62>:	mov    %rax,0x8(%rcx)
   0x55dd0e11c2b2 <revalidate_ukey+66>:	mov    0x0(%rbp),%rdi

从出错指令及其上下文可以看出,程序当时正在为函数调用准备参数,紧接着就要调用 compose_slow_path。因此查看 revalidate_ukey__ 函数中调用 compose_slow_path 处的源码:

 if (xoutp->slow) {
        struct ofproto_dpif *ofproto;
        ofp_port_t ofp_in_port;

        ofproto = xlate_lookup_ofproto(udpif->backer, &ctx.flow, &ofp_in_port);

        ofpbuf_clear(odp_actions);
        compose_slow_path(udpif, xoutp, &ctx.flow, ctx.flow.in_port.odp_port,
                          ofp_in_port, odp_actions,
                          ofproto->up.slowpath_meter_id, &ofproto->uuid);
    }

    if (odp_flow_key_to_mask(ukey->mask, ukey->mask_len, &dp_mask, &ctx.flow)
        == ODP_FIT_ERROR) {
        goto exit;
    }

分析各个实参,其中需要通过指针解引用读取内存的只有 ofproto(ofproto->up.slowpath_meter_id)。再查看 ofproto 的来源,发现它有可能为空:

  ofproto = xlate_lookup_ofproto(udpif->backer, &ctx.flow, &ofp_in_port);
  
  /* Thin wrapper around xlate_lookup_ofproto_(): forwards 'backer', 'flow'
   * and 'ofp_in_port', and supplies a local 'xport' to receive the xport
   * output, which is then discarded.
   *
   * The callee's result is returned unchanged, so this returns NULL whenever
   * xlate_lookup_ofproto_() returns NULL. */
  struct ofproto_dpif *
xlate_lookup_ofproto(const struct dpif_backer *backer, const struct flow *flow,
                     ofp_port_t *ofp_in_port)
{
    const struct xport *xport;

    return xlate_lookup_ofproto_(backer, flow, ofp_in_port, &xport);
}

/* Finds the ofproto_dpif associated with 'flow' on 'backer'.
 *
 * On success, stores the matching xport in '*xportp', stores its OpenFlow
 * port number in '*ofp_in_port' if 'ofp_in_port' is nonnull, and returns
 * xport->xbridge->ofproto.
 *
 * Returns NULL, without writing '*xportp' or '*ofp_in_port', in two cases:
 *   - 'flow' has a nonzero recirc_id but recirc_id_node_find() finds no node;
 *   - the final xport_lookup() finds no xport.
 * Callers must therefore check the result before dereferencing it. */
static struct ofproto_dpif *
xlate_lookup_ofproto_(const struct dpif_backer *backer, const struct flow *flow,
                      ofp_port_t *ofp_in_port, const struct xport **xportp)
{
    struct xlate_cfg *xcfg = ovsrcu_get(struct xlate_cfg *, &xcfgp);
    const struct xport *xport;

    /* If packet is recirculated, xport can be retrieved from frozen state. */
    if (flow->recirc_id) {
        const struct recirc_id_node *recirc_id_node;

        recirc_id_node = recirc_id_node_find(flow->recirc_id);

        if (OVS_UNLIKELY(!recirc_id_node)) {
            return NULL;    /* <-- NULL return path #1. */
        }

        /* If recirculation was initiated due to bond (in_port = OFPP_NONE)
         * then frozen state is static and xport_uuid is not defined, so xport
         * cannot be restored from frozen state. */
        if (recirc_id_node->state.metadata.in_port != OFPP_NONE) {
            struct uuid xport_uuid = recirc_id_node->state.xport_uuid;
            xport = xport_lookup_by_uuid(xcfg, &xport_uuid);
            if (xport && xport->xbridge && xport->xbridge->ofproto) {
                goto out;
            }
        }
    }

    /* Otherwise (or if the frozen-state lookup did not yield a usable xport)
     * look the xport up from the flow's tunnel or input port. */
    xport = xport_lookup(xcfg, tnl_port_should_receive(flow)
                         ? tnl_port_receive(flow)
                         : odp_port_to_ofport(backer, flow->in_port.odp_port));
    if (OVS_UNLIKELY(!xport)) {
        return NULL;    /* <-- NULL return path #2. */
    }

out:
    *xportp = xport;
    if (ofp_in_port) {
        *ofp_in_port = xport->ofp_port;
    }
    return xport->xbridge->ofproto;
}

该函数xlate_lookup_ofproto_返回值有可能为空,但是ofproto在使用的时候,没有判空,所以问题极有可能是该值为空导致的。

再看汇编代码中

mov    0x1c0(%rax),%eax

这条指令从地址 %rax+0x1c0 处读取数据,而日志中“segfault at 1c0”给出的出错访问地址恰好是 0x1c0,说明当时 %rax 的值为 0(空指针)。下面再确认偏移量 0x1c0 是否对应 ofproto->up.slowpath_meter_id 字段:

(gdb) p sizeof(struct ofproto_dpif)
$1 = 1384
(gdb) p &(((struct ofproto_dpif *)0)->up)
$2 = (struct ofproto *) 0x20
(gdb) p &(((struct ofproto *)0)->slowpath_meter_id)
$4 = (uint32_t *) 0x1a0
(gdb) p &(((struct ofproto_dpif *)0)->up.slowpath_meter_id)
$5 = (uint32_t *) 0x1c0

ofproto->up.slowpath_meter_id 相对于 ofproto 起始地址的偏移量正好是 0x1c0(0x20 + 0x1a0),因此可以确认是 ofproto 为空指针触发了这次 segfault。

查看最新的 openvswitch 开源代码,发现这部分代码已经修改:在调用 compose_slow_path 之前增加了对 ofproto 的判空处理。

 if (xoutp->slow) {
        struct ofproto_dpif *ofproto;
        ofp_port_t ofp_in_port;

        ofproto = xlate_lookup_ofproto(udpif->backer, &ctx.flow, &ofp_in_port,
                                       NULL);

        ofpbuf_clear(odp_actions);

        if (!ofproto) {
            goto exit;
        }

        compose_slow_path(udpif, xoutp, ctx.flow.in_port.odp_port,
                          ofp_in_port, odp_actions,
                          ofproto->up.slowpath_meter_id, &ofproto->uuid);
    }

 

补充:
1.disassemble revalidate_ukey__可以反汇编这个函数,看到完整汇编代码上下文。
2.如果gdb调试的时候看不到符号信息,可以安装debuginfo包后再查看。

 

0条评论
0 / 1000
杨****昌
3文章数
1粉丝数
杨****昌
3 文章 | 1 粉丝
杨****昌
3文章数
1粉丝数
杨****昌
3 文章 | 1 粉丝
原创

根据日志查找代码空指针位置

2023-10-13 07:25:29
17
0

进程发生段错误(segfault)时,内核会在 dmesg 或 messages 日志中留下一条相关记录。如何通过这条日志定位到对应的代码位置呢?下面以一个实际案例来说明。

Oct 11 18:10:02 gd03-compute-10e37e48e71 kernel: revalidator625[571802]: segfault at 1c0 ip 000055b8df7cd23b sp 00007f6fba7f7310 error 4 in ovs-vswitchd[55b8df62a000+3ff000]
Oct 11 18:10:02 gd03-compute-10e37e48e71 kernel: revalidator625[571802]: segfault at 1c0 ip 000055b8df7cd23b sp 00007f6fba7f7310 error 4 in ovs-vswitchd[55b8df62a000+3ff000]

从这条日志可以看出,出错时 ip(指令指针)寄存器的值为 000055b8df7cd23b,即程序执行到该地址处的指令时触发了段错误;方括号中的 55b8df62a000+3ff000 是出错指令所在的 ovs-vswitchd 代码映射区的起始地址(基地址)和长度。由于不能直接在线上环境调试,需要另找一个二进制文件完全相同的节点来查看,因此要先计算出错指令相对于基地址的偏移量:

偏移量=000055b8df7cd23b-55b8df62a000=1a323b


在另一个环境中确定 ovs-vswitchd 代码段的基地址,即 maps 中 /usr/sbin/ovs-vswitchd 那条 r-xp(可执行)映射的起始地址:

[root@gz15-compute-s2-55e255e16e35 940418]# cat /proc/$(pidof ovs-vswitchd)/maps | head 
55dd0df79000-55dd0e378000 r-xp 00000000 08:01 83898960                   /usr/sbin/ovs-vswitchd
55dd0e578000-55dd0e593000 r--p 003ff000 08:01 83898960                   /usr/sbin/ovs-vswitchd
55dd0e593000-55dd0e5d5000 rw-p 0041a000 08:01 83898960                   /usr/sbin/ovs-vswitchd
55dd0e5d5000-55dd0e91e000 rw-p 00000000 00:00 0 
55dd0e966000-55dd0e987000 rw-p 00000000 00:00 0                          [heap]
55dd0e987000-55dd0ecf4000 rw-p 00000000 00:00 0                          [heap]
7fcf40000000-7fcf40022000 rw-p 00000000 00:00 0 
7fcf40022000-7fcf44000000 ---p 00000000 00:00 0 
7fcf44000000-7fcf4404e000 rw-p 00000000 00:00 0 
7fcf4404e000-7fcf48000000 ---p 00000000 00:00 0

获取到基地址55dd0df79000。 计算在当前环境中,问题代码地址:

问题代码地址=基地址+偏移量=55dd0df79000+1a323b=55dd0e11c23b

根据问题地址代码,查看当前环境中代码情况:

[root@gz15-compute-s2-55e255e16e35 940418]# gdb -pid=$(pidof ovs-vswitchd)
(gdb) x/i 0x55DD0E11C23B
   0x55dd0e11c23b <revalidate_ukey__+555>:	mov    0x1c0(%rax),%eax
(gdb) x/32i 0x55DD0E11C23B
   0x55dd0e11c23b <revalidate_ukey__+555>:	mov    0x1c0(%rax),%eax
   0x55dd0e11c241 <revalidate_ukey__+561>:	mov    %r10,%rdx
   0x55dd0e11c244 <revalidate_ukey__+564>:	mov    %eax,(%rsp)
   0x55dd0e11c247 <revalidate_ukey__+567>:	callq  0x55dd0e11bed0 <compose_slow_path.isra.6>
   0x55dd0e11c24c <revalidate_ukey__+572>:	jmpq   0x55dd0e11c14f <revalidate_ukey__+319>
   0x55dd0e11c251 <revalidate_ukey__+577>:	callq  0x55dd0dfaa610 <memcmp@plt>
   0x55dd0e11c256 <revalidate_ukey__+582>:	xor    %r12d,%r12d
   0x55dd0e11c259 <revalidate_ukey__+585>:	test   %eax,%eax
   0x55dd0e11c25b <revalidate_ukey__+587>:	je     0x55dd0e11c1dc <revalidate_ukey__+460>
   0x55dd0e11c261 <revalidate_ukey__+593>:	jmpq   0x55dd0e11c1a8 <revalidate_ukey__+408>
   0x55dd0e11c266 <revalidate_ukey__+598>:	callq  0x55dd0dfaae40 <__stack_chk_fail@plt>
   0x55dd0e11c26b:	nopl   0x0(%rax,%rax,1)
   0x55dd0e11c270 <revalidate_ukey>:	push   %r15
   0x55dd0e11c272 <revalidate_ukey+2>:	push   %r14
   0x55dd0e11c274 <revalidate_ukey+4>:	mov    %r9,%r14
   0x55dd0e11c277 <revalidate_ukey+7>:	push   %r13
   0x55dd0e11c279 <revalidate_ukey+9>:	mov    %rdi,%r13
   0x55dd0e11c27c <revalidate_ukey+12>:	push   %r12
   0x55dd0e11c27e <revalidate_ukey+14>:	mov    %r8,%r12
   0x55dd0e11c281 <revalidate_ukey+17>:	push   %rbp
   0x55dd0e11c282 <revalidate_ukey+18>:	mov    %rdx,%rbp
   0x55dd0e11c285 <revalidate_ukey+21>:	push   %rbx
   0x55dd0e11c286 <revalidate_ukey+22>:	mov    %rsi,%rbx
   0x55dd0e11c289 <revalidate_ukey+25>:	sub    $0x48,%rsp
   0x55dd0e11c28d <revalidate_ukey+29>:	mov    0xa8(%rsi),%r8
   0x55dd0e11c294 <revalidate_ukey+36>:	movl   $0x0,0x10(%rcx)
   0x55dd0e11c29b <revalidate_ukey+43>:	mov    %fs:0x28,%rax
   0x55dd0e11c2a4 <revalidate_ukey+52>:	mov    %rax,0x38(%rsp)
   0x55dd0e11c2a9 <revalidate_ukey+57>:	xor    %eax,%eax
   0x55dd0e11c2ab <revalidate_ukey+59>:	mov    (%rcx),%rax
   0x55dd0e11c2ae <revalidate_ukey+62>:	mov    %rax,0x8(%rcx)
   0x55dd0e11c2b2 <revalidate_ukey+66>:	mov    0x0(%rbp),%rdi

从出错指令及其上下文可以看出,程序当时正在为函数调用准备参数,紧接着就要调用 compose_slow_path。因此查看 revalidate_ukey__ 函数中调用 compose_slow_path 处的源码:

 if (xoutp->slow) {
        struct ofproto_dpif *ofproto;
        ofp_port_t ofp_in_port;

        ofproto = xlate_lookup_ofproto(udpif->backer, &ctx.flow, &ofp_in_port);

        ofpbuf_clear(odp_actions);
        compose_slow_path(udpif, xoutp, &ctx.flow, ctx.flow.in_port.odp_port,
                          ofp_in_port, odp_actions,
                          ofproto->up.slowpath_meter_id, &ofproto->uuid);
    }

    if (odp_flow_key_to_mask(ukey->mask, ukey->mask_len, &dp_mask, &ctx.flow)
        == ODP_FIT_ERROR) {
        goto exit;
    }

分析各个实参,其中需要通过指针解引用读取内存的只有 ofproto(ofproto->up.slowpath_meter_id)。再查看 ofproto 的来源,发现它有可能为空:

  ofproto = xlate_lookup_ofproto(udpif->backer, &ctx.flow, &ofp_in_port);
  
  /* Thin wrapper around xlate_lookup_ofproto_(): forwards 'backer', 'flow'
   * and 'ofp_in_port', and supplies a local 'xport' to receive the xport
   * output, which is then discarded.
   *
   * The callee's result is returned unchanged, so this returns NULL whenever
   * xlate_lookup_ofproto_() returns NULL. */
  struct ofproto_dpif *
xlate_lookup_ofproto(const struct dpif_backer *backer, const struct flow *flow,
                     ofp_port_t *ofp_in_port)
{
    const struct xport *xport;

    return xlate_lookup_ofproto_(backer, flow, ofp_in_port, &xport);
}

/* Finds the ofproto_dpif associated with 'flow' on 'backer'.
 *
 * On success, stores the matching xport in '*xportp', stores its OpenFlow
 * port number in '*ofp_in_port' if 'ofp_in_port' is nonnull, and returns
 * xport->xbridge->ofproto.
 *
 * Returns NULL, without writing '*xportp' or '*ofp_in_port', in two cases:
 *   - 'flow' has a nonzero recirc_id but recirc_id_node_find() finds no node;
 *   - the final xport_lookup() finds no xport.
 * Callers must therefore check the result before dereferencing it. */
static struct ofproto_dpif *
xlate_lookup_ofproto_(const struct dpif_backer *backer, const struct flow *flow,
                      ofp_port_t *ofp_in_port, const struct xport **xportp)
{
    struct xlate_cfg *xcfg = ovsrcu_get(struct xlate_cfg *, &xcfgp);
    const struct xport *xport;

    /* If packet is recirculated, xport can be retrieved from frozen state. */
    if (flow->recirc_id) {
        const struct recirc_id_node *recirc_id_node;

        recirc_id_node = recirc_id_node_find(flow->recirc_id);

        if (OVS_UNLIKELY(!recirc_id_node)) {
            return NULL;    /* <-- NULL return path #1. */
        }

        /* If recirculation was initiated due to bond (in_port = OFPP_NONE)
         * then frozen state is static and xport_uuid is not defined, so xport
         * cannot be restored from frozen state. */
        if (recirc_id_node->state.metadata.in_port != OFPP_NONE) {
            struct uuid xport_uuid = recirc_id_node->state.xport_uuid;
            xport = xport_lookup_by_uuid(xcfg, &xport_uuid);
            if (xport && xport->xbridge && xport->xbridge->ofproto) {
                goto out;
            }
        }
    }

    /* Otherwise (or if the frozen-state lookup did not yield a usable xport)
     * look the xport up from the flow's tunnel or input port. */
    xport = xport_lookup(xcfg, tnl_port_should_receive(flow)
                         ? tnl_port_receive(flow)
                         : odp_port_to_ofport(backer, flow->in_port.odp_port));
    if (OVS_UNLIKELY(!xport)) {
        return NULL;    /* <-- NULL return path #2. */
    }

out:
    *xportp = xport;
    if (ofp_in_port) {
        *ofp_in_port = xport->ofp_port;
    }
    return xport->xbridge->ofproto;
}

该函数xlate_lookup_ofproto_返回值有可能为空,但是ofproto在使用的时候,没有判空,所以问题极有可能是该值为空导致的。

再看汇编代码中

mov    0x1c0(%rax),%eax

这条指令从地址 %rax+0x1c0 处读取数据,而日志中“segfault at 1c0”给出的出错访问地址恰好是 0x1c0,说明当时 %rax 的值为 0(空指针)。下面再确认偏移量 0x1c0 是否对应 ofproto->up.slowpath_meter_id 字段:

(gdb) p sizeof(struct ofproto_dpif)
$1 = 1384
(gdb) p &(((struct ofproto_dpif *)0)->up)
$2 = (struct ofproto *) 0x20
(gdb) p &(((struct ofproto *)0)->slowpath_meter_id)
$4 = (uint32_t *) 0x1a0
(gdb) p &(((struct ofproto_dpif *)0)->up.slowpath_meter_id)
$5 = (uint32_t *) 0x1c0

ofproto->up.slowpath_meter_id 相对于 ofproto 起始地址的偏移量正好是 0x1c0(0x20 + 0x1a0),因此可以确认是 ofproto 为空指针触发了这次 segfault。

查看最新的 openvswitch 开源代码,发现这部分代码已经修改:在调用 compose_slow_path 之前增加了对 ofproto 的判空处理。

 if (xoutp->slow) {
        struct ofproto_dpif *ofproto;
        ofp_port_t ofp_in_port;

        ofproto = xlate_lookup_ofproto(udpif->backer, &ctx.flow, &ofp_in_port,
                                       NULL);

        ofpbuf_clear(odp_actions);

        if (!ofproto) {
            goto exit;
        }

        compose_slow_path(udpif, xoutp, ctx.flow.in_port.odp_port,
                          ofp_in_port, odp_actions,
                          ofproto->up.slowpath_meter_id, &ofproto->uuid);
    }

 

补充:
1.disassemble revalidate_ukey__可以反汇编这个函数,看到完整汇编代码上下文。
2.如果gdb调试的时候看不到符号信息,可以安装debuginfo包后再查看。

 

文章来自个人专栏
文章 | 订阅
0条评论
0 / 1000
请输入你的评论
0
0