3

请帮我解决这个问题。我使用了一个周期为 1 毫秒的高分辨率定时器(hrtimer),并通过 insmod 将其作为单独的内核模块加载。该定时器每 1 毫秒触发一次,我必须在这个定时器中断中执行一些任务。系统中还有其他进行图像传输的进程,我看到以太网驱动程序会产生中断来发送图像。这个以太网(enet)中断的优先级较高,看起来是它延迟了上面的 1 ms 定时器中断,但我不确定。

在运行测试 3 到 3 小时后,我看到了下面的 Oops。如何从根本上解决这个问题?请帮忙。系统为ARM omap,运行Linux 2.6.33 交叉编译。

[root@user:/]# 
Unable to handle kernel paging request at virtual address 7eb52754
pgd = 80004000
[7eb52754] *pgd=00000000
Internal error: Oops: 80000005 [#1] PREEMPT
last sysfs file: /sys/devices/virtual/spi/spi/dev
Modules linked in: mod timermod mod2(P) mod3(P) mod4
CPU: 0    Tainted: P            (2.6.33_appl #1)
PC is at 0x7eb52754
LR is at walk_stackframe+0x24/0x40
pc : [<7eb52754>]    lr : [<8002d4dc>]    psr: a0000013
sp : 80395f10  ip : 80395f30  fp : 80395f2c
r10: 0000001f  r9 : 00000000  r8 : 87a25200
r7 : 878b0380  r6 : 80395f40  r5 : 80028374  r4 : 80395f30
r3 : 80000100  r2 : 80395f40  r1 : 80395f40  r0 : 80395f30
Flags: NzCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment kernel
Control: 10c5387d  Table: 86fb0019  DAC: 00000017
Process swapper (pid: 0, stack limit = 0x803942e8)
Stack: (0x80395f10 to 0x80396000)
5f00:                                     8002bfa4 00000001 802c678c 87a25380
5f20: 80395f54 80395f30 8002bfe0 8002d4c4 80395f54 80395f30 8004998c 8002bfa4
5f40: 00000002 00000002 80395f6c 80395f58 8004998c 8002bfb0 80396ea8 80394000
5f60: 80395fa4 80395f70 802c678c 800498d0 8002b320 80023218 80398408 80021e10
5f80: 80394000 8002321c 80023218 80398408 80021e10 413fc082 80395fbc 80395fa8
5fa0: 8002b324 802c62fc 803f4cc8 803f5190 80395fcc 80395fc0 802c3ee4 8002b28c
5fc0: 80395ff4 80395fd0 8000897c 802c3e6c 800084fc 00000000 00000000 8002321c
5fe0: 10c53c7d 803c7630 00000000 80395ff8 80008034 80008754 00000000 00000000
Backtrace:
[<8002d4b8>] (walk_stackframe+0x0/0x40) from [<8002bfe0>] (return_address+0x3c/0x5c)
 r6:87a25380 r5:802c678c r4:00000001 r3:8002bfa4
[<8002bfa4>] (return_address+0x0/0x5c) from [<8004998c>] (sub_preempt_count+0xc8/0xfc)
[<800498c4>] (sub_preempt_count+0x0/0xfc) from [<802c678c>] (schedule+0x49c/0x4d8)
 r5:80394000 r4:80396ea8
[<802c62f0>] (schedule+0x0/0x4d8) from [<8002b324>] (cpu_idle+0xa4/0xbc)
 r9:413fc082 r8:80021e10 r7:80398408 r6:80023218 r5:8002321c
r4:80394000
[<8002b280>] (cpu_idle+0x0/0xbc) from [<802c3ee4>] (rest_init+0x84/0xa0)
 r4:803f5190 r3:803f4cc8
[<802c3e60>] (rest_init+0x0/0xa0) from [<8000897c>] (start_kernel+0x234/0x284)
[<80008748>] (start_kernel+0x0/0x284) from [<80008034>] (__enable_mmu+0x0/0x2c)
Code: bad PC value
---[ end trace 7e26218fd59f68a5 ]---
Kernel panic - not syncing: Attempted to kill the idle task!
Backtrace:
[<8002db2c>] (dump_backtrace+0x0/0x114) from [<802c610c>] (dump_stack+0x20/0x24)
 r6:fffffffc r5:0000000b r4:803c8518 r3:00000002
[<802c60ec>] (dump_stack+0x0/0x24) from [<802c6168>] (panic+0x58/0x130)
[<802c6110>] (panic+0x0/0x130) from [<80057330>] (do_exit+0x7c/0x6e0)
 r3:80394000 r2:00000000 r1:80395d28 r0:80348e90
[<800572b4>] (do_exit+0x0/0x6e0) from [<8002dfc0>] (die+0x290/0x2c4)
 r7:7eb52744
[<8002dd30>] (die+0x0/0x2c4) from [<8002f4d4>] (__do_kernel_fault+0x74/0x84)
 r7:80395ec8
[<8002f460>] (__do_kernel_fault+0x0/0x84) from [<8002f6bc>] (do_page_fault+0x1d8/0x1f0)
 r7:00000000 r6:80395ec8 r5:7eb52754 r4:80396ea8
[<8002f4e4>] (do_page_fault+0x0/0x1f0) from [<8002f794>] (do_translation_fault+0x20/0x80)
[<8002f774>] (do_translation_fault+0x0/0x80) from [<80029250>] (do_PrefetchAbort+0x44/0xa8)
 r6:7eb52754 r5:80398820 r4:00000005 r3:8002f774
[<8002920c>] (do_PrefetchAbort+0x0/0xa8) from [<80029d1c>] (__pabt_svc+0x5c/0xa0)
Exception stack(0x80395ec8 to 0x80395f10)
5ec0:                   80395f30 80395f40 80395f40 80000100 80395f30 80028374
5ee0: 80395f40 878b0380 87a25200 00000000 0000001f 80395f2c 80395f30 80395f10
5f00: 8002d4dc 7eb52754 a0000013 ffffffff
 r7:878b0380 r6:80395f40 r5:80395efc r4:ffffffff
[<8002d4b8>] (walk_stackframe+0x0/0x40) from [<8002bfe0>] (return_address+0x3c/0x5c)
 r6:87a25380 r5:802c678c r4:00000001 r3:8002bfa4
[<8002bfa4>] (return_address+0x0/0x5c) from [<8004998c>] (sub_preempt_count+0xc8/0xfc)
[<800498c4>] (sub_preempt_count+0x0/0xfc) from [<802c678c>] (schedule+0x49c/0x4d8)
 r5:80394000 r4:80396ea8
[<802c62f0>] (schedule+0x0/0x4d8) from [<8002b324>] (cpu_idle+0xa4/0xbc)
 r9:413fc082 r8:80021e10 r7:80398408 r6:80023218 r5:8002321c
r4:80394000
[<8002b280>] (cpu_idle+0x0/0xbc) from [<802c3ee4>] (rest_init+0x84/0xa0)
 r4:803f5190 r3:803f4cc8
[<802c3e60>] (rest_init+0x0/0xa0) from [<8000897c>] (start_kernel+0x234/0x284)
[<80008748>] (start_kernel+0x0/0x284) from [<80008034>] (__enable_mmu+0x0/0x2c)

==========================================

#include <linux/hrtimer.h>
#include <linux/module.h>
#include <linux/ktime.h>
#include <linux/kdev_t.h>
#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/wait.h>
#include <linux/sched.h>

/* Minor-number range requested for the character device. */
#define FIRST_MINOR 0
#define MINOR_CNT   1

/* Character-device bookkeeping: device number, cdev object and sysfs class. */
static dev_t dev;
static struct cdev cdev;
static struct class *cl;

/* The high-resolution timer driven by this module. */
static struct hrtimer timer;

/*
 * Tick hand-off between the timer callback and readers: the callback sets
 * timer_expired and wakes wq_head; hr_read() sleeps on wq_head and clears
 * the flag.
 */
static wait_queue_head_t wq_head;
static u8 timer_expired;

static ssize_t hr_read(struct file *f, char * __user buff, size_t cnt, loff_t *off)
{
    wait_event_interruptible(wq_head, timer_expired);
    timer_expired = 0;
    return 0;
}

static int hr_open(struct inode *i, struct file *f)
{
    ktime_t ktime;
        ktime.tv64 = 1E6L;
        hrtimer_start(&timer, ktime, HRTIMER_MODE_REL);
    return 0;
}   


/*
 * hr_close - release handler: stop the timer.
 *
 * Cancels the file-scope hrtimer and logs a message when hrtimer_cancel()
 * reports a non-zero result. Always returns 0.
 */
static int hr_close(struct inode *i, struct file *f)
{
    int was_active = hrtimer_cancel(&timer);

    if (was_active != 0) {
        printk(KERN_INFO "timercancelled\n");
    }
    return 0;
}

static struct file_operations hr_fops = {
    .read = hr_read,
    .open = hr_open,
    .release = hr_close
};

static enum hrtimer_restart timer_callback(struct hrtimer *timer)
{
    ktime_t ktime;
    u64 overrun;
    ktime.tv64 = 1E6L;
    //printk("KERN_INFO""Timer Expired");

    overrun = hrtimer_forward_now(timer, ktime);
    timer_expired = 1;
    wake_up_interruptible(&wq_head);
    return HRTIMER_RESTART;
}
#if 1

static int init_hrtimer(void)
{   
    ktime_t ktime;
    unsigned long delay_in_ms = 500L;
    printk(KERN_ERR "Timer being set up\n");

    ktime = ktime_set(0,delay_in_ms*1E6L);
    hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);

    timer.function = &timer_callback;
    printk(KERN_ERR "Timer starting to fire\n");
    printk(KERN_ERR "in %ldms %ld\n", delay_in_ms, jiffies);

    if (alloc_chrdev_region(&dev, FIRST_MINOR, MINOR_CNT, "Hr Timer") < 0)
    {
        return -1;
    }
    printk("Major Nr: %d\n", MAJOR(dev));

    cdev_init(&cdev, &hr_fops);

    if (cdev_add(&cdev, dev, MINOR_CNT) == -1)
    {
        unregister_chrdev_region(dev, MINOR_CNT);
        return -1;
    }

    if ((cl = class_create(THIS_MODULE, "hrtimer")) == NULL)
    {
        cdev_del(&cdev);
        unregister_chrdev_region(dev, MINOR_CNT);
        return -1;
    }
    if (IS_ERR(device_create(cl, NULL, dev, NULL, "hrt%d", 0)))
    {
        class_destroy(cl);
        cdev_del(&cdev);
        unregister_chrdev_region(dev, MINOR_CNT);
        return -1;
    }

    init_waitqueue_head(&wq_head);

    return 0;
}
#endif


/*
 * clean_hrtimer - module exit point.
 *
 * Cancels the hrtimer, logs which way hrtimer_cancel() answered, and then
 * removes the device node, the class, the cdev and the reserved device
 * number, in that order.
 */
static void clean_hrtimer(void)
{
    /* A zero result takes the "Timer cancelled" branch. */
    if (!hrtimer_cancel(&timer)) {
        printk(KERN_ERR "Timer cancelled\n");
    } else {
        printk(KERN_ERR "Timer still running\n");
    }

    device_destroy(cl, dev);
    class_destroy(cl);
    cdev_del(&cdev);
    unregister_chrdev_region(dev, MINOR_CNT);
}

/* Register init_hrtimer()/clean_hrtimer() as the module's load and unload hooks. */
module_init(init_hrtimer);
module_exit(clean_hrtimer);

/* Declare the module licence as GPL. */
MODULE_LICENSE("GPL");

=========================

我把上面的代码作为驱动模块,用 insmod 加载。我希望它每 1 毫秒触发一次,它工作正常,但有时当以太网流量太高时,它会触发内核 Oops。请检查代码是否有任何问题?

我检查了 lsmod,我看到所有 5 个内核模块(我自己的)都加载在:0x7f000000 到 0x7f02xxxx 之间

mod at 0x7f020xxxx, 
timermod at 0x7f01xxx, 
mod2 at 0x7f01xxxx, 
mod3 at 0x7f00xxxx, 
mod4 at 0x7f000000. 

在 oops 地址 0x7eb52754 处未加载任何模块。我从 /proc/kallsyms 文件中检查以验证这一点。如何检查 0x7eb5xxxx 到源文件的映射?我还能在哪里获得系统上的数据。

4

1 回答 1

4

根据错误消息,导致此内核恐慌的代码位于虚拟地址 0x7eb52754。从地址(略低于 0x80000000)来看,我猜这是内核模块的代码段 - 可能是您自己的内核模块之一。

要进行根本原因分析,请按照发生此恐慌时的加载顺序加载您的(和所有其他)内核模块,并观察它们的加载地址,如 lsmod (或几乎相同的 cat /proc/modules )打印的那样.

使用它们的代码大小和加载地址,计算哪个模块的文本段包含虚拟地址 0x7eb52754。然后用 0x7eb52754 减去该模块的加载地址。

您将得到的是导致恐慌的指令在模块二进制中的偏移量。

现在在内核模块二进制文件上使用 objdump 并查找该偏移量,检查它属于哪个函数(如果您有 addr2line,也可以用它完成)。这应该能将您指向导致此恐慌的指令所在的函数,甚至行号(如果您有调试信息)。

祝你好运。

于 2013-04-02T07:19:41.657 回答