0

先对问题做一点介绍:在发布此问题之前,我尝试在 Google 和 Stack Overflow 上搜索过,但找到的大多数内容都讲得不清楚。
我有一块基于 Cortex-A8 的板子,上面运行着裸机 RTOS。显示(帧缓冲区)有点慢,因为我还没有为目标板实现 DMA;虽然不算太慢,但我注意到一个可以改进的地方。在我的 CPU 和工具链组合下,32 位的运算和数据访问比 16 位的快,而显示格式是 16 位的 RGB565,所以一些帧缓冲操作比本来可以达到的速度慢一些(其中一些操作使用 memcpy、memmove 和 memset,它们会自行处理数据对齐等问题)。

我试图将两个像素塞进一个 32 位数据类型并使用它来访问内存(据我记得对齐,即使没有,我的 cpu 支持硬件中的非对齐内存访问,所以问题不应该是这个。)请注意,我不是在谈论我的实现速度,而是我得到的一个奇怪的效果,我怀疑这是因为我如何将两个像素塞进一个 32 位数据类型。

这是我的 fb_putc 的大部分内容

/*
 * Draw one printable ASCII glyph (codes 33..126) at the current cursor cell
 * of a 16-bit-per-pixel framebuffer, then advance the cursor by one column.
 * Two 16-bit pixels are packed into every 32-bit store.
 */
if (((unsigned char)c > 32) && ((unsigned char)c < 127)) {
    check_for_scroll(49);

    // fontdata starts at ASCII 33 and stores 16 bytes per glyph (one byte
    // per row), so the glyph index is scaled by 16.
    c -= 33;
    c <<= 4;

    // Truncate to 16 bits exactly as the old uint16_t palette did, but keep
    // the values in 32-bit unsigned variables: shifting a uint16_t promotes
    // it to signed int, and "<< 16" then overflows for colours with bit 15
    // set.
    const uint32_t fg = (uint16_t)fb.fg_color;
    const uint32_t bg = (uint16_t)fb.tg_color;

    /*
     * Each pixel is 16 bits, we access them using 32 bit data type,
     * which is faster for aligned memory access. The pixel offset of the
     * cursor cell is halved because pixel_32 counts two pixels per element.
     */
    uint32_t *pixel_32 = (uint32_t *) fb.config->base;
    pixel_32 += ( ((fb.cursor.y * (FONT_HEIGHT * fb.config->width)) + ((fb.cursor.x * (FONT_WIDTH))))
                    / ((sizeof(uint32_t))/(sizeof(uint16_t))) );

    for (uint32_t y = 0; y < 16; y++) {
        const uint32_t row_bits = fontdata[c + y];

        // Visit the bit pairs (7,6), (5,4), (3,2), (1,0). The counter must
        // be signed and the test must be "> 0": with an unsigned counter
        // "x >= 0" is always true, so the loop never ended and kept writing
        // past the glyph.
        for (int x = 7; x > 0; x -= 2) {
            // NOTE(review): the pixel for bit x goes into the upper halfword.
            // On a little-endian target the lower halfword is the lower
            // address, i.e. the left pixel -- confirm the ordering against
            // the panel; if pairs look swapped, exchange the two halves.
            uint32_t pixel_pair = ((row_bits & (1u << x)) ? fg : bg) << 16;
            pixel_pair |= (row_bits & (1u << (x - 1))) ? fg : bg;
            *pixel_32++ = pixel_pair;
        }

        // Panel stride = width (480) - font_width (8), in 32-bit units
        pixel_32 += (472 / ((sizeof(uint32_t))/(sizeof(uint16_t))));
    }

    fb.cursor.x++;
}

关于我哪里出错的任何帮助?我对编程有点陌生,并且将其作为爱好。

4

2 回答 2

1

在将它们写入内存之前组合 2 个像素的想法是正确的。这样一来,ARM 的写缓冲区硬件将被更有效地使用,代码运行得更快。我认为以这种形式混合 C 和 ASM 不会产生最好的结果。坚持使用纯 ASM 将保证您使用有条件执行的指令。此外,为您的调色板使用数组可能会导致编译器输出非常低效的代码。这是一种在纯 ASM 中更有效地执行此操作的方法。展开循环是个好主意。这是处理双色调字体数据的每个字节的代码。

@ Expand one byte of bitonal (1 bit per pixel) font data into eight 16-bit
@ pixels, written as four 32-bit words (two pixels per store).
@
@ Register usage
@ R0 = source data pointer (post-incremented by 1)
@ R1 = destination data pointer (post-incremented by 4 on every store)
@ R2 = foreground color (loaded outside of loop)
@ R3 = background color (loaded outside of loop)
@ R4 = current font byte, R5 = pixel pair being assembled
@ Assumes that the most significant short of each 32-bit word is on the left
@ The upper 16 bits of R2 and R3 must be zero: ORR merges the whole register
@ into the low half of R5.
@
@ MOV takes a destination and ONE flexible second operand, so the high
@ halfword is produced with "mov<c> r5,<color>,LSL #16" (the three-register
@ form "mov r5,r5,r2,LSL #16" does not assemble).

  ldrb r4,[r0],#1  @ source bitonal image data
@ first pair of pixels
  tst r4,#0x80
  movne r5,r2,LSL #16
  moveq r5,r3,LSL #16
  tst r4,#0x40
  orrne r5,r5,r2
  orreq r5,r5,r3
  str r5,[r1],#4
@ second pair of pixels
  tst r4,#0x20
  movne r5,r2,LSL #16
  moveq r5,r3,LSL #16
  tst r4,#0x10
  orrne r5,r5,r2
  orreq r5,r5,r3
  str r5,[r1],#4
@ third pair of pixels
  tst r4,#0x8
  movne r5,r2,LSL #16
  moveq r5,r3,LSL #16
  tst r4,#0x4
  orrne r5,r5,r2
  orreq r5,r5,r3
  str r5,[r1],#4
@ fourth pair of pixels
  tst r4,#0x2
  movne r5,r2,LSL #16
  moveq r5,r3,LSL #16
  tst r4,#0x1
  orrne r5,r5,r2
  orreq r5,r5,r3
  str r5,[r1],#4

更新稍微简单的代码

于 2012-06-04T16:39:59.883 回答
0

在被编译器折磨了几个小时之后,我通过使用 asm 解决了一次存储两个像素的问题,但现在似乎还有别的问题:除了少数几个字符之外,其余字符都显示为乱码,我完全不知道究竟是什么导致了这种情况......

至于打包像素,这是我最终使用的(以防将来有人可能需要这样做)

/*
 * Draw one printable ASCII glyph (codes 33..126) at the current cursor cell
 * of a 16-bit-per-pixel framebuffer, two pixels per 32-bit store, then
 * advance the cursor by one column.
 *
 * The previous version built each pixel pair with inline asm and declared
 * the destination of "orr %0, %0, %1" as "=r" (write-only) although the
 * instruction also reads it. The compiler was therefore free to hand the asm
 * a register that did not hold the preceding "mov" result, which corrupts
 * the upper pixel. Packing in plain C with 32-bit unsigned operands avoids
 * the constraint problem and needs no asm at all.
 */
if (((unsigned char)c > 32) && ((unsigned char)c < 127)) {

    check_for_scroll(FB_MAX_Y_UNDER);

    // Truncated to 16 bits like the old uint16_t palette, then widened so
    // that "<< 16" is done in unsigned 32-bit arithmetic.
    const uint32_t fg = (uint16_t)fb.fg_color;
    const uint32_t bg = (uint16_t)fb.tg_color;

    // (y << 13) - (y << 9) == y * 7680 == y * 16 rows * 480 pixels;
    // x << 3 == x * 8 pixels. The sum is halved because pixel_32 counts two
    // pixels per element.
    uint32_t *pixel_32 = (uint32_t *)fb.base +((((fb.cursor.y << 13)-(fb.cursor.y << 9))+(fb.cursor.x << 3)) >> 1);

    // NOTE(review): this subtracts 32 although only codes > 32 reach here,
    // so glyph slot 0 is never used -- confirm fontdata really starts at
    // the space character; if it starts at '!', this must be 33.
    c -= 32;
    c <<= 4;

    for (int y = 0; y < 16; y++) {
        const unsigned font_bits = (unsigned char)fontdata[c + y];

        // mask selects the upper-halfword pixel of each pair (bits 7, 5, 3,
        // 1); mask >> 1 selects its lower-halfword neighbour (6, 4, 2, 0).
        for (unsigned mask = 0x80; mask != 0; mask >>= 2) {
            uint32_t pixel_pair = ((font_bits & mask) ? fg : bg) << 16;
            pixel_pair |= (font_bits & (mask >> 1)) ? fg : bg;
            *pixel_32++ = pixel_pair;
        }

        // Skip the rest of the scanline: 236 words == 472 pixels.
        pixel_32 += 236;
    }
    fb.cursor.x++;
}
于 2012-06-04T13:08:52.253 回答