12

我正在编写一个在 x86-32 和 x86-64 Linux 上使用 libbfd 和 libopcodes 执行反汇编的工具。问题是,虽然我能够让 libopcodes 完成反汇编,但我无法获得任何指令信息。为了演示,我做了一个最小的例子来重现这个问题。该程序应该从入口点开始反汇编,直到遇到第一个 RET/RETQ 为止。

这段代码因为使用了全局变量而显得有些凌乱,并且为了简洁起见省略了错误检查等,但应该能清楚地说明问题。

#include <bfd.h>
#include <dis-asm.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>
#include <libiberty.h>

/*
 * Holds state for BFD and libopcodes.
 *
 * abfd  - handle of the binary being inspected; assigned in main().
 * dinfo - disassembler state; filled in by init_disasm() and inspected by
 *         custom_fprintf().
 */
bfd *        abfd  = NULL;
disassemble_info dinfo = {0};

/*
 * Temporary hack to signal when disassembling should stop: set by
 * custom_fprintf() and polled by the loop in disassemble_entry().
 */
static bool stop_disassembling = FALSE;

/*
 * Gets path to currently running executable.
 *
 * Resolves the /proc/<pid>/exe symbolic link of the calling process into
 * target_path, which must have room for size bytes.
 *
 * Returns true and leaves a NUL-terminated path in target_path on success.
 * Returns false when target_path is NULL, when size leaves no room for a
 * path, when the link cannot be read, or when the result filled the buffer
 * and may therefore have been truncated. On failure target_path is set to the
 * empty string whenever it has room for one.
 */
bool get_target_path(char * target_path, size_t size)
{
    /* "/proc/" + at most 11 characters of pid + "/exe" + NUL fits easily. */
    char    link_path[32];
    ssize_t len;
    int     written;

    if(target_path == NULL || size == 0) {
        return false;
    }

    target_path[0] = '\0';

    /* One byte is reserved for the terminator, so size 1 cannot hold a path. */
    if(size < 2) {
        return false;
    }

    written = snprintf(link_path, sizeof(link_path), "/proc/%d/exe",
            (int)getpid());
    if(written < 0 || (size_t)written >= sizeof(link_path)) {
        return false;
    }

    /* readlink() does not NUL-terminate, hence the size - 1. */
    len = readlink(link_path, target_path, size - 1);

    /*
     * A negative result is an error. A result that fills every byte offered
     * cannot be told apart from a truncated path, so it is rejected as well.
     */
    if(len < 0 || (size_t)len >= size - 1) {
        target_path[0] = '\0';
        return false;
    }

    target_path[len] = '\0';
    return true;
}

/*
 * libopcodes appends spaces on the end of some instructions so for
 * comparisons, we want to strip those first.
 *
 * Scans at most size bytes of str and terminates the string at the first
 * character that is not a printable, non-space character (as judged by
 * isgraph()). Note that this cuts at the first such character, not only at
 * trailing ones: "mov %eax" becomes "mov".
 *
 * str  - buffer to truncate in place; NULL is ignored.
 * size - number of bytes of str that may be examined.
 */
void strip_tail(char * str, unsigned int size)
{
    unsigned int i;

    if(str == NULL) {
        return;
    }

    for(i = 0; i < size; i++) {
        /*
         * The <ctype.h> functions are only defined for values representable
         * as unsigned char (or EOF), so a plain char must be converted first.
         */
        if(!isgraph((unsigned char)str[i])) {
            str[i] = '\0';
            break;
        }
    }
}

/*
 * Checks whether the current instruction will cause the control flow to not
 * proceed to the linearly subsequent instruction.
 *
 * Only return instructions are recognised at present: str must be exactly
 * "ret" or "retq" (already stripped of trailing padding). Both spellings are
 * accepted regardless of the target's address size, so the check does not
 * depend on which suffix the disassembler chooses to print.
 * NOTE(review): newer binutils releases appear to print plain "ret" for
 * 64-bit code as well - confirm against the installed version.
 *
 * str - mnemonic text to classify; NULL is treated as "does not break".
 *
 * Returns true when str names a return instruction, false otherwise.
 */
bool breaks_control_flow(char * str)
{
    if(str == NULL) {
        return false;
    }

    return strcmp(str, "ret") == 0 || strcmp(str, "retq") == 0;
}

/*
 * Output callback handed to libopcodes in place of fprintf.
 *
 * Formats the text into a local buffer, prints it on its own line on stdout,
 * then strips it and raises stop_disassembling when it names a return
 * instruction. Finally reports what the global dinfo currently says about the
 * instruction type, or that no instruction info is available.
 *
 * stream - unused.
 * format - printf-style format string followed by its arguments.
 *
 * Returns the value produced by vsnprintf().
 */
int custom_fprintf(void * stream, const char * format, ...)
{
    char         text[128] = {0};
    const char * verdict;
    va_list      ap;
    int          written;

    va_start(ap, format);
    written = vsnprintf(text, ARRAY_SIZE(text) - 1, format, ap);
    va_end(ap);

    /* Echo the raw text first; the stripped copy is for comparison only. */
    puts(text);
    strip_tail(text, ARRAY_SIZE(text));

    if(breaks_control_flow(text)) {
        puts("Stopped disassembly");
        stop_disassembling = TRUE;
    }

    if(!dinfo.insn_info_valid) {
        printf("insn_info not valid\n");
        return written;
    }

    switch(dinfo.insn_type) {
        case dis_noninsn:
            verdict = "not an instruction\n";
            break;
        case dis_nonbranch:
            verdict = "not a branch\n";
            break;
        case dis_branch:
            verdict = "is a branch\n";
            break;
        case dis_condbranch:
            verdict = "is a conditional branch\n";
            break;
        case dis_jsr:
            verdict = "jump to subroutine\n";
            break;
        case dis_condjsr:
            verdict = "conditional jump to subroutine\n";
            break;
        case dis_dref:
            verdict = "data reference in instruction\n";
            break;
        case dis_dref2:
            verdict = "two data references in instruction\n";
            break;
        default:
            verdict = "not enumerated\n";
            break;
    }

    fputs(verdict, stdout);
    return written;
}

/*
 * Prepares dinfo for disassembling the binary behind abfd and hands back the
 * matching libopcodes disassembler function.
 *
 * The output callback is replaced by custom_fprintf (with a NULL stream), and
 * the flavour, architecture, machine and byte order are copied from abfd
 * before the target-specific initialisation runs.
 */
disassembler_ftype init_disasm(bfd * abfd, disassemble_info * dinfo)
{
    disassembler_ftype disasm_fn;

    /* Route everything the disassembler prints through our own callback. */
    init_disassemble_info(dinfo, NULL, custom_fprintf);

    dinfo->endian  = abfd->xvec->byteorder;
    dinfo->mach    = bfd_get_mach(abfd);
    dinfo->arch    = bfd_get_arch(abfd);
    dinfo->flavour = bfd_get_flavour(abfd);

    disassemble_init_for_target(dinfo);

    disasm_fn = disassembler(abfd);
    return disasm_fn;
}

/*
 * Method of locating section from VMA taken from opdis.
 *
 * Request/result record passed to vma_in_section() through
 * bfd_map_over_sections(): the caller fills in vma and clears sec, and the
 * callback stores a section whose address range contains vma.
 */
typedef struct {
    bfd_vma    vma;   /* address being looked up */
    asection * sec;   /* matching section, or NULL when none was found */
} BFD_VMA_SECTION;

/*
 * Loads section and fills in dinfo accordingly. Since this function allocates
 * memory in dinfo->buffer, callers need to call free once they are finished.
 *
 * abfd  - unused; the section's own owner is used to read it.
 * dinfo - receives the section, its contents, their length and the address
 *         of the first byte.
 * s     - section to load.
 *
 * Returns true on success. Returns false, after releasing the buffer and
 * without touching dinfo, when the section contents cannot be read.
 */
bool load_section(bfd * abfd, disassemble_info * dinfo, asection * s)
{
    /* Keep the size in BFD's own type rather than narrowing it to int. */
    bfd_size_type   size = bfd_section_size(s->owner, s);
    unsigned char * buf  = xmalloc(size);

    (void)abfd;

    if(!bfd_get_section_contents(s->owner, s, buf, 0, size)) {
        free(buf);
        return false;
    }

    dinfo->section       = s;
    dinfo->buffer        = buf;
    dinfo->buffer_length = size;
    dinfo->buffer_vma    = bfd_section_vma(s->owner, s);

    /*
     * The casts make the arguments match the conversion specifiers whatever
     * width bfd_size_type and bfd_vma have on this host. The newline belongs
     * at the end of the message, not in the middle of it.
     */
    printf("Allocated %lu bytes for %s section: 0x%llX\n",
            (unsigned long)size, s->name,
            (unsigned long long)dinfo->buffer_vma);
    return true;
}

/*
 * bfd_map_over_sections() callback used to locate the section for a vma.
 *
 * data points to a BFD_VMA_SECTION. When the requested address lies in
 * [s->vma, s->vma + size of s), s is recorded in the request. A later
 * matching section overwrites an earlier one.
 */
void vma_in_section(bfd * abfd, asection * s, void * data)
{
    BFD_VMA_SECTION * lookup = data;

    if(lookup == NULL) {
        return;
    }

    /* Address lies below the start of this section. */
    if(lookup->vma < s->vma) {
        return;
    }

    /* Address lies at or beyond the end of this section. */
    if(lookup->vma >= (s->vma + bfd_section_size(abfd, s))) {
        return;
    }

    lookup->sec = s;
}

/*
 * Finds the section of abfd that contains vma and loads it into dinfo.
 *
 * Returns FALSE when no section contains vma; otherwise returns whatever
 * load_section() reports for the section that was found.
 */
bool load_section_for_vma(bfd * abfd, disassemble_info * dinfo,
        bfd_vma vma)
{
    BFD_VMA_SECTION lookup;

    lookup.vma = vma;
    lookup.sec = NULL;

    bfd_map_over_sections(abfd, vma_in_section, &lookup);

    if(lookup.sec != NULL) {
        return load_section(abfd, dinfo, lookup.sec);
    }

    return FALSE;
}

/*
 * Start disassembling from entry point.
 *
 * Loads the section containing the start address of abfd, then calls
 * disassembler on consecutive addresses until it reports a non-positive
 * size or the output callback raises stop_disassembling.
 *
 * The section buffer stored in dinfo is released before returning, and
 * dinfo->buffer is cleared so no dangling pointer is left behind.
 *
 * Returns true when a section was loaded and the loop ran, false when the
 * section containing the entry point could not be loaded.
 */
bool disassemble_entry(bfd * abfd, disassemble_info * dinfo,
        disassembler_ftype disassembler)
{
    bfd_vma vma = bfd_get_start_address(abfd);
    int     size;

    /* First locate and load the section containing the vma */
    if(!load_section_for_vma(abfd, dinfo, vma)) {
        return false;
    }

    /* A flag left over from an earlier run must not end this one early. */
    stop_disassembling = false;

    /* Keep disassembling until signalled otherwise or error */
    while(true) {
        dinfo->insn_info_valid = 0;
        size = disassembler(vma, dinfo);

        /* The cast keeps the argument in step with %llX on every host. */
        printf("Disassembled %d bytes at 0x%llX\n", size,
                (unsigned long long)vma);

        /* Any non-positive size means no progress can be made. */
        if(size <= 0 || stop_disassembling) {
            break;
        }

        vma += size;
    }

    free(dinfo->buffer);
    dinfo->buffer        = NULL;
    dinfo->buffer_length = 0;
    return true;
}

/*
 * Disassembles the running executable from its entry point.
 *
 * Returns EXIT_SUCCESS when the binary was opened, recognised as an object
 * file and disassembled; EXIT_FAILURE otherwise. The BFD handle is closed on
 * every path once it has been opened.
 */
int main(void)
{
    char  target_path[PATH_MAX] = {0};
    int   status = EXIT_FAILURE;

    bfd_init();

    /* Get path for the running instance of this program */
    if(!get_target_path(target_path, ARRAY_SIZE(target_path))) {
        fprintf(stderr, "Unable to determine path of running executable\n");
        return EXIT_FAILURE;
    }

    abfd = bfd_openr(target_path, NULL);

    if(abfd == NULL) {
        fprintf(stderr, "Unable to open %s\n", target_path);
        return EXIT_FAILURE;
    }

    if(bfd_check_format(abfd, bfd_object)) {
        disassembler_ftype disassembler = init_disasm(abfd, &dinfo);

        if(disassemble_entry(abfd, &dinfo, disassembler)) {
            status = EXIT_SUCCESS;
        }
    } else {
        fprintf(stderr, "%s is not a recognised object file\n", target_path);
    }

    /* Close the handle whether or not the format check succeeded. */
    bfd_close(abfd);
    abfd = NULL;

    return status;
}

可以使用以下 makefile 构建此源代码。要成功链接,需要在本地机器上安装 binutils-dev 软件包:

# Builds the disasm example from disasm.c, linking against libbfd and
# libopcodes.
# NOTE(review): make expects recipe lines to start with a tab character; the
# space indentation below looks like a copy/paste artefact - confirm.
all:
    gcc -Wall disasm.c -o disasm -lbfd -lopcodes

# Removes the built binary.
clean:
    rm -f disasm

运行时,输出是这样的:

Allocated 2216 bytes for .text section
: 0x400BF0xor    
insn_info not valid
%ebp
insn_info not valid
,
insn_info not valid
%ebp
insn_info not valid
Disassembled 2 bytes at 0x400BF0
mov    
insn_info not valid
%rdx
insn_info not valid
,
insn_info not valid
%r9
insn_info not valid
Disassembled 3 bytes at 0x400BF2
pop    
insn_info not valid
%rsi
insn_info not valid
Disassembled 1 bytes at 0x400BF5
mov    
insn_info not valid
%rsp
insn_info not valid
,
insn_info not valid
%rdx
insn_info not valid
Disassembled 3 bytes at 0x400BF6
and    
insn_info not valid
$0xfffffffffffffff0
insn_info not valid
,
insn_info not valid
%rsp
insn_info not valid
Disassembled 4 bytes at 0x400BF9
push   
insn_info not valid
%rax
insn_info not valid
Disassembled 1 bytes at 0x400BFD
push   
insn_info not valid
%rsp
insn_info not valid
Disassembled 1 bytes at 0x400BFE
mov    
insn_info not valid
$0x401450
insn_info not valid
,
insn_info not valid
%r8
insn_info not valid
Disassembled 7 bytes at 0x400BFF
mov    
insn_info not valid
$0x4013c0
insn_info not valid
,
insn_info not valid
%rcx
insn_info not valid
Disassembled 7 bytes at 0x400C06
mov    
insn_info not valid
$0x4012ce
insn_info not valid
,
insn_info not valid
%rdi
insn_info not valid
Disassembled 7 bytes at 0x400C0D
callq  
insn_info not valid
0x0000000000400ad8
insn_info not valid
Disassembled 5 bytes at 0x400C14
hlt    
insn_info not valid
Disassembled 1 bytes at 0x400C19
nop
insn_info not valid
Disassembled 1 bytes at 0x400C1A
nop
insn_info not valid
Disassembled 1 bytes at 0x400C1B
sub    
insn_info not valid
$0x8
insn_info not valid
,
insn_info not valid
%rsp
insn_info not valid
Disassembled 4 bytes at 0x400C1C
mov    
insn_info not valid
0x2013b9(%rip)
insn_info not valid
,
insn_info not valid
%rax
insn_info not valid
        # 
insn_info not valid
0x0000000000601fe0
insn_info not valid
Disassembled 7 bytes at 0x400C20
test   
insn_info not valid
%rax
insn_info not valid
,
insn_info not valid
%rax
insn_info not valid
Disassembled 3 bytes at 0x400C27
je     
insn_info not valid
0x0000000000400c2e
insn_info not valid
Disassembled 2 bytes at 0x400C2A
callq  
insn_info not valid
*%rax
insn_info not valid
Disassembled 2 bytes at 0x400C2C
add    
insn_info not valid
$0x8
insn_info not valid
,
insn_info not valid
%rsp
insn_info not valid
Disassembled 4 bytes at 0x400C2E
retq   
Stopped disassembly
insn_info not valid
Disassembled 1 bytes at 0x400C32

我期望的是能够通过 dinfo->insn_type、target 等字段读取每条指令的指令信息。这种行为在 x86-32 和 x86-64 上都存在。如果我至少可以确认这两种架构上都没有实现该功能,那么我可以自己去填写这些信息。

4

3 回答 3

10

不幸的是,截至 binutils libopcodes 2.22,i386 和 x86_64 都不会填写 insn_type。唯一得到广泛支持的架构是 MIPS、Sparc 和 Cell 的 SPU。在当前的 CVS HEAD 中仍然如此。

很难证明某些东西不存在,但举例来说,在 Sparc 反汇编程序的源代码中,您可以看到多处对 insn_type 的赋值,例如 info->insn_type = dis_branch;而在 i386 反汇编程序的源代码中,既没有出现 insn_type,也没有出现任何它预期的取值(dis_branch、dis_nonbranch 等)。

在所有 libopcodes 文件中检查对 insn_type 的支持,可以得到以下支持它的文件:

  • opcodes/mips-dis.c
  • opcodes/spu-dis.c
  • opcodes/microblaze-dis.c
  • opcodes/cris-dis.c
  • opcodes/sparc-dis.c
  • opcodes/mmix-dis.c
于 2012-02-06T22:53:35.367 回答
3

仅使用这些库来完成这件事将是一个极其痛苦和艰巨的过程。我认为你应该听 Necrolis 的建议,使用已经实现了这些功能的库。我过去使用过 Dyninst(即 InstructionAPI + ParseAPI)。它们的文档很完善,并且完全可以做到你想做的事情。至少,花一个小时使用这个库并编译其手册中的示例,就能得到一个应用程序,让您可以检查每条指令的操作码、每条指令的长度、每条指令的参数数量等内容。这些是 libopcodes 不会告诉您、也不会处理的事情(它一次只解码一个地址,而该地址处并不保证是一条指令)。

这是我从 Opdis 开发人员的手册中摘录的一个片段(如果你还没有读过,我建议你读一读,那里有很多关于 libopcodes 的好东西):

libopcodes 库是一个非常实用的反汇编程序,但它有三个缺点:

  1. 文档不足,使新用户难以理解
  2. 它的功能集仅限于单个地址的反汇编
  3. 它主要用于将反汇编指令打印到流中

除其他外,我认为您可能是被该列表中的第二项坑到了。也就是说,大多数(所有?)操作码都能放进单个地址,这与观察到的输出一致(例如,您得到了 mov 和 pop 以及一些寄存器参数)。但是,诸如可变长度指令或未在 4 字节边界处精确对齐的指令等棘手的情况呢?您没有做任何事情来处理这些情况。

libopcodes 生成的反汇编是一系列用于写入流的字符串。没有元数据,因此必须检查字符串以确定哪些是助记符,哪些是操作数,哪些是分支/跳转/返回指令以及它们的目标是什么。

我猜 Opdis 比你的程序更聪明——它知道如何以及在流中寻找什么。也许有时它知道在反汇编之前它需要读取两个地址而不是一个地址。从您的代码和 libopcodes 的描述来看,两者都没有这样做。

祝你好运!请记得阅读该手册,也许可以考虑改用 libopdis!

于 2012-02-07T08:13:20.697 回答
0

libopcodes 将反汇编的指令打印到一个流中,而您的 custom_fprintf 函数拦截了这个流。您的错误在于假设每反汇编一条指令只会调用一次 custom_fprintf;实际上它被调用得更频繁,具体来说,每打印一个助记符、操作数、地址或分隔符都会调用一次。

因此,二进制文件的反汇编结果是

xor %ebp, %ebp

mov %rdx, %r9

pop %rsi

mov %rsp, %rdx

and $0xfffffffffffffff0, %rsp

push %rax

push %rsp

mov $0x401450,%r8

...
于 2012-04-30T21:21:41.433 回答