我想通过 AER 错误注入来触发 Linux nvme 驱动程序中的错误处理回调。为此我修改了 AER 注入模块的源代码,使其在模块加载时直接注入错误,而不是由用户态程序来注入。
我已经确认所用的总线号(bus)、设备号(dev)和功能号(fn)是正确的,并且错误注入流程执行成功,但 nvme 驱动程序的错误处理函数始终没有被调用。
下面是我对 aer_injection.c 文件所做的修改。
模块初始化函数负责填充错误注入结构体并调用 aer_inject():
554 static int __init aer_inject_init(void)
555 {
556
557 printk(KERN_INFO "%s %d\n", __func__, __LINE__);
558 int ret = misc_register(&aer_inject_device);
559 struct aer_error_inj ae = {
560 .bus = 0x84,
561 .dev = 0x00,
562 .fn = 0x00,
563 .uncor_status = 0x00040000, //poisoned TLP
564 .cor_status = 0x0,
565 .header_log0 = 0x0,
566 .header_log1 = 0x1,
567 .header_log2 = 0x2,
568 .header_log3 = 0x3,
569 .domain = 0x00
570 };
571 printk(KERN_INFO "%s %d\n", __func__, __LINE__);
572 aer_inject(&ae);
573 printk(KERN_INFO "%s %d\n", __func__, __LINE__);
574 return ret;
575 }
这是 aer_inject() 函数:
320 static int aer_inject(struct aer_error_inj *einj)
321 {
322 struct aer_error *err, *rperr;
323 struct aer_error *err_alloc = NULL, *rperr_alloc = NULL;
324 struct pci_dev *dev, *rpdev;
325 struct pcie_device *edev;
326 unsigned long flags;
327 unsigned int devfn = PCI_DEVFN(einj->dev, einj->fn);
328 int pos_cap_err, rp_pos_cap_err;
329 u32 sever, cor_mask, uncor_mask, cor_mask_orig = 0, uncor_mask_orig = 0;
330 int ret = 0;
331
332 //einj->domain = 0x0000;
333 //einj->bus = 0x84;
334 //devfn = 0x0;
335
336 printk(KERN_INFO "%s %d\n", __func__, __LINE__);
337 dev = pci_get_domain_bus_and_slot((int)einj->domain, einj->bus, devfn);
338 printk(KERN_INFO "dev->vendor %#x\n", dev->vendor);
339 printk(KERN_INFO "dev->device %#x\n", dev->device);
340 if (!dev) {
341 printk(KERN_INFO "ENODEV %s %d\n", __func__, __LINE__);
342 return -ENODEV;
343 }
344 rpdev = pcie_find_root_port(dev);
345 if (!rpdev) {
346 ret = -ENODEV;
347 goto out_put;
348 }
349
350 printk(KERN_INFO "%s %d\n", __func__, __LINE__);
351 pos_cap_err = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
352 if (!pos_cap_err) {
353 ret = -EPERM;
354 goto out_put;
355 }
356 pci_read_config_dword(dev, pos_cap_err + PCI_ERR_UNCOR_SEVER, &sever);
357 pci_read_config_dword(dev, pos_cap_err + PCI_ERR_COR_MASK, &cor_mask);
358 pci_read_config_dword(dev, pos_cap_err + PCI_ERR_UNCOR_MASK,
359 &uncor_mask);
360
361 printk(KERN_INFO "%s %d\n", __func__, __LINE__);
362 rp_pos_cap_err = pci_find_ext_capability(rpdev, PCI_EXT_CAP_ID_ERR);
363 if (!rp_pos_cap_err) {
364 ret = -EPERM;
365 goto out_put;
366 }
367
368 err_alloc = kzalloc(sizeof(struct aer_error), GFP_KERNEL);
369 if (!err_alloc) {
370 ret = -ENOMEM;
371 goto out_put;
372 }
373 rperr_alloc = kzalloc(sizeof(struct aer_error), GFP_KERNEL);
374 if (!rperr_alloc) {
375 ret = -ENOMEM;
376 goto out_put;
377 }
378
379 if (aer_mask_override) {
380 cor_mask_orig = cor_mask;
381 cor_mask &= !(einj->cor_status);
382 pci_write_config_dword(dev, pos_cap_err + PCI_ERR_COR_MASK,
383 cor_mask);
384
385 uncor_mask_orig = uncor_mask;
386 uncor_mask &= !(einj->uncor_status);
387 pci_write_config_dword(dev, pos_cap_err + PCI_ERR_UNCOR_MASK,
388 uncor_mask);
389 }
390
391 printk(KERN_INFO "%s %d\n", __func__, __LINE__);
392 spin_lock_irqsave(&inject_lock, flags);
393
394 err = __find_aer_error_by_dev(dev);
395 if (!err) {
396 err = err_alloc;
397 err_alloc = NULL;
398 aer_error_init(err, einj->domain, einj->bus, devfn,
399 pos_cap_err);
400 list_add(&err->list, &einjected);
401 }
402 err->uncor_status |= einj->uncor_status;
403 printk(KERN_INFO "err->uncor_status %#x\n", err->uncor_status);
404 err->cor_status |= einj->cor_status;
405 err->header_log0 = einj->header_log0;
406 err->header_log1 = einj->header_log1;
407 err->header_log2 = einj->header_log2;
408 err->header_log3 = einj->header_log3;
409
410 printk(KERN_INFO "%s %d\n", __func__, __LINE__);
411 if (!aer_mask_override && einj->cor_status &&
412 !(einj->cor_status & ~cor_mask)) {
413 ret = -EINVAL;
414 printk(KERN_WARNING "The correctable error(s) is masked "
415 "by device\n");
416 spin_unlock_irqrestore(&inject_lock, flags);
417 goto out_put;
418 }
419 printk(KERN_INFO "%s %d\n", __func__, __LINE__);
420 if (!aer_mask_override && einj->uncor_status &&
421 !(einj->uncor_status & ~uncor_mask)) {
422 ret = -EINVAL;
423 printk(KERN_WARNING "The uncorrectable error(s) is masked "
424 "by device\n");
425 spin_unlock_irqrestore(&inject_lock, flags);
426 goto out_put;
427 }
428
429 printk(KERN_INFO "%s %d\n", __func__, __LINE__);
430 rperr = __find_aer_error_by_dev(rpdev);
431 if (!rperr) {
432 printk(KERN_INFO "%s %d\n", __func__, __LINE__);
433 rperr = rperr_alloc;
434 rperr_alloc = NULL;
435 aer_error_init(rperr, pci_domain_nr(rpdev->bus),
436 rpdev->bus->number, rpdev->devfn,
437 rp_pos_cap_err);
438 list_add(&rperr->list, &einjected);
439 }
440 printk(KERN_INFO "%s %d\n", __func__, __LINE__);
441 if (einj->cor_status) {
442 if (rperr->root_status & PCI_ERR_ROOT_COR_RCV)
443 rperr->root_status |= PCI_ERR_ROOT_MULTI_COR_RCV;
444 else
445 rperr->root_status |= PCI_ERR_ROOT_COR_RCV;
446 rperr->source_id &= 0xffff0000;
447 rperr->source_id |= (einj->bus << 8) | devfn;
448 }
449 printk(KERN_INFO "%s %d\n", __func__, __LINE__);
450 if (einj->uncor_status) {
451 printk(KERN_INFO "%s %d\n", __func__, __LINE__);
452 if (rperr->root_status & PCI_ERR_ROOT_UNCOR_RCV)
453 {
454 printk(KERN_INFO "%s %d\n", __func__, __LINE__);
455 rperr->root_status |= PCI_ERR_ROOT_MULTI_UNCOR_RCV;
456 if (sever & einj->uncor_status) {
457 rperr->root_status |= PCI_ERR_ROOT_FATAL_RCV;
458 if (!(rperr->root_status & PCI_ERR_ROOT_UNCOR_RCV))
459 rperr->root_status |= PCI_ERR_ROOT_FIRST_FATAL;
460 } else
461 rperr->root_status |= PCI_ERR_ROOT_NONFATAL_RCV;
462 rperr->root_status |= PCI_ERR_ROOT_UNCOR_RCV;
463 rperr->source_id &= 0x0000ffff;
464 rperr->source_id |= ((einj->bus << 8) | devfn) << 16;
465 }
466 printk(KERN_INFO "%s %d\n", __func__, __LINE__);
467 spin_unlock_irqrestore(&inject_lock, flags);
468
469 if (aer_mask_override) {
470 pci_write_config_dword(dev, pos_cap_err + PCI_ERR_COR_MASK,
471 cor_mask_orig);
472 pci_write_config_dword(dev, pos_cap_err + PCI_ERR_UNCOR_MASK,
473 uncor_mask_orig);
474 }
475
476 printk(KERN_INFO "%s %d\n", __func__, __LINE__);
477 ret = pci_bus_set_aer_ops(dev->bus);
478 if (ret)
479 goto out_put;
480 printk(KERN_INFO "%s %d\n", __func__, __LINE__);
481 ret = pci_bus_set_aer_ops(rpdev->bus);
482 if (ret)
483 goto out_put;
484
485 printk(KERN_INFO "%s %d\n", __func__, __LINE__);
486 if (find_aer_device(rpdev, &edev)) {
487 if (!get_service_data(edev)) {
488 printk(KERN_WARNING "AER service is not initialized\n");
489 ret = -EINVAL;
490 printk(KERN_INFO "%s %d\n", __func__, __LINE__);
491 goto out_put;
492 }
493 aer_irq(-1, edev);
494 }
495 else
496 ret = -EINVAL;
497 printk(KERN_INFO "%s %d\n", __func__, __LINE__);
498 out_put:
499 kfree(err_alloc);
500 kfree(rperr_alloc);
501 pci_dev_put(dev);
502 return ret;
503 }
这是我希望在 nvme 驱动程序中被触发的错误处理回调函数:
2563 static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
2564 enum pci_channel_state state)
2565 {
2566
2567 printk(KERN_INFO "nvme_error_detected called\n");
2568 dev_printk(KERN_ERR, &pdev->dev, "%s\n", __func__);
2569 }
内核日志输出(每行为函数名和对应的源码行号):
aer_inject_init 555
aer_inject_init 569
aer_inject 336
dev->vendor 0x1c58
dev->device 0x3
aer_inject 350
aer_inject 361
aer_inject 391
err->uncor_status 0x40000
aer_inject 410
aer_inject 419
aer_inject 429
aer_inject 432
aer_inject 440
aer_inject 449
aer_inject 451
aer_inject 464
aer_inject 474
aer_inject 478
aer_inject 483
aer_inject 495
aer_inject_init 571
从日志可以看出,AER 注入函数已经从头到尾完整执行,但 nvme 驱动程序中的消息一条也没有打印出来。这可能是什么原因?代码格式比较混乱,敬请谅解,谢谢。