掌機 - Anbernic RetroGame - Write DMA Framebuffer



多數掌機使用的LCD輸出模組都會搭配DMA使用,藉此減輕CPU負擔,而加速顯示的最快方式則是軟體程式直接操作DMA暫存器,最典型的例子就是Notaz於Wiz掌機實作的PCSX ReARMed模擬器,Notaz將Wiz的硬體DMA暫存器映射成一塊顯示Buffer,然後SDL最後輸出畫面時,直接更新到該Buffer,接著Toggle DMA旗標輸出,成功讓PS模擬器加速運行

RS97掌機的CPU速度較低,因此,司徒目前想到最快速的方式也就是Notaz這招,藉此提升PS模擬器的效能。但是,要在RS97實作DMA操作,需要先寫一個小程式測試,證明可以直接操作DMA暫存器並且輸出LCD畫面。比較大的問題在於DMA暫存器:因為目前Kernel還是只能使用官方的版本,所以無法直接從Kernel配置DMA Buffer並設定給DMA暫存器。相反的,司徒則是直接讀取DMA指標(也就是LCDSA0暫存器的內容),並透過/dev/mem重新映射該Buffer給軟體程式使用。而在畫面更新時,則不能呼叫SDL_Flip(),測試程式碼如下:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <SDL.h>
#include <SDL_image.h>
#include <SDL_ttf.h>

/*
 * Physical base address of the DMA register block; main() maps it through
 * /dev/mem and dumps the registers listed below.
 */
#define DMA_BASE  0x13420000
/*
 * DMA register indices.  Each byte offset is shifted right by 2 so the macro
 * can be used directly as an index into the word pointer `mem`
 * (assumes a 4-byte `unsigned long`, as on the 32-bit target -- confirm).
 * NOTE(review): the 0/1 suffix presumably selects DMA channel 0 / 1 and the
 * names presumably follow the SoC datasheet (source address, target address,
 * transfer count, request type, control/status, command, descriptor address,
 * stride) -- verify against the hardware manual.
 */
#define DSA0      (0x00 >> 2)
#define DTA0      (0x04 >> 2)
#define DTC0      (0x08 >> 2)
#define DRT0      (0x0C >> 2)
#define DCS0      (0x10 >> 2)
#define DCM0      (0x14 >> 2)
#define DDA0      (0x18 >> 2)
#define DSD0      (0x1C >> 2)
#define DSA1      (0x20 >> 2)
#define DTA1      (0x24 >> 2)
#define DTC1      (0x28 >> 2)
#define DRT1      (0x2C >> 2)
#define DCS1      (0x30 >> 2)
#define DCM1      (0x34 >> 2)
#define DDA1      (0x38 >> 2)
#define DSD1      (0x3C >> 2)

/*
 * Physical base address of the LCD controller register block; main() maps it
 * through /dev/mem to read the registers listed below.
 */
#define LCD_BASE  0x13050000
/*
 * LCD register indices (byte offset >> 2, same convention as the DMA ones).
 * main() reads LCDSA0 and uses its value as the physical address of the
 * frame buffer to map.
 * NOTE(review): the remaining names presumably mean descriptor address,
 * frame ID and command for LCD DMA channels 0 / 1 -- confirm with the
 * datasheet.
 */
#define LCDDA0    (0x40 >> 2)
#define LCDSA0    (0x44 >> 2)
#define LCDFID0   (0x48 >> 2)
#define LCDCMD0   (0x4C >> 2)
#define LCDDA1    (0x50 >> 2)
#define LCDSA1    (0x54 >> 2)
#define LCDFID1   (0x58 >> 2)
#define LCDCMD1   (0x5C >> 2)

/*
 * Length in bytes passed to map_it()/unmap_it() for the register blocks.
 * NOTE(review): despite the name this is not necessarily the MMU page size,
 * and the identifier may clash with a system-provided PAGE_SIZE macro.
 */
#define PAGE_SIZE  1024

/* Descriptor of the currently open /dev/mem handle, or -1 when none is open. */
int fd = -1;
/* Base of the current /dev/mem mapping, accessed word by word; NULL when unmapped. */
volatile unsigned long *mem;

/*
 * Map `size` bytes of physical address space starting at `addr` via /dev/mem.
 *
 * On success the mapping is stored in the global `mem`, the descriptor in the
 * global `fd`, and 0 is returned.  On failure -1 is returned, nothing is left
 * open (`fd` is negative) and `mem` is NULL, so callers can never dereference
 * a failed mapping.
 *
 * `addr` is passed to mmap() as the file offset, so it must satisfy mmap()'s
 * offset alignment rules.
 */
int map_it(unsigned long addr, unsigned long size)
{
    void *base;

    mem = NULL;

    /* O_SYNC is requested so that accesses through the mapping are uncached. */
    fd = open("/dev/mem", O_RDWR | O_SYNC);
    if (fd < 0) {
        printf("failed to open /dev/mem\n");
        return -1;
    }

    base = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, addr);
    if (base == MAP_FAILED) {
        /* Report first: close() could overwrite errno. */
        perror("failed to mmap /dev/mem");
        close(fd);
        fd = -1;
        return -1;
    }

    mem = base;
    return 0;
}

/*
 * Release the mapping held in the global `mem` (of length `size`, the same
 * value that was given to map_it()) and close the /dev/mem descriptor.
 *
 * Both globals are reset afterwards, and each step is skipped when there is
 * nothing to release, so calling this after a failed map_it() or calling it
 * twice is harmless instead of unmapping a bogus address or closing an
 * unrelated descriptor.
 */
void unmap_it(unsigned long size)
{
    if (mem != NULL && mem != MAP_FAILED) {
        munmap((void *)mem, size);
    }
    mem = NULL;

    if (fd >= 0) {
        close(fd);
    }
    fd = -1;
}

int main(int argc, char* argv[])
{
    uint32_t dma_addr;

    map_it(LCD_BASE, PAGE_SIZE);
    printf("LCDDA0: 0x%x\n", mem[LCDDA0]);
    printf("LCDSA0: 0x%x\n", mem[LCDSA0]);
    printf("LCDFID0: 0x%x\n", mem[LCDFID0]);
    printf("LCDCMD0: 0x%x\n", mem[LCDCMD0]);

    printf("LCDDA1: 0x%x\n", mem[LCDDA1]);
    printf("LCDSA1: 0x%x\n", mem[LCDSA1]);
    printf("LCDFID1: 0x%x\n", mem[LCDFID1]);
    printf("LCDCMD1: 0x%x\n", mem[LCDCMD1]);

    dma_addr = mem[LCDSA0];
    unmap_it(PAGE_SIZE);

    map_it(DMA_BASE, PAGE_SIZE);
    printf("DSA0: 0x%x\n", mem[DSA0]);
    printf("DTA0: 0x%x\n", mem[DTA0]);
    printf("DTC0: 0x%x\n", mem[DTC0]);
    printf("DRT0: 0x%x\n", mem[DRT0]);
    printf("DCS0: 0x%x\n", mem[DCS0]);
    printf("DCM0: 0x%x\n", mem[DCM0]);
    printf("DDA0: 0x%x\n", mem[DDA0]);
    printf("DSD0: 0x%x\n", mem[DSD0]);

    printf("DSA1: 0x%x\n", mem[DSA1]);
    printf("DTA1: 0x%x\n", mem[DTA1]);
    printf("DTC1: 0x%x\n", mem[DTC1]);
    printf("DRT1: 0x%x\n", mem[DRT1]);
    printf("DCS1: 0x%x\n", mem[DCS1]);
    printf("DCM1: 0x%x\n", mem[DCM1]);
    printf("DDA1: 0x%x\n", mem[DDA1]);
    printf("DSD1: 0x%x\n", mem[DSD1]);
    unmap_it(PAGE_SIZE);

    SDL_Init(SDL_INIT_VIDEO);
    SDL_Surface *screen = SDL_SetVideoMode(320, 480, 16, SDL_HWSURFACE);

    unsigned long c = 0, idx = 0;
    const uint32_t size = 320 * 480 * 2;
    map_it(dma_addr, size);
    volatile uint16_t *ptr = (volatile uint16_t *)mem;
    uint16_t color[] = { 0xf800, 0x7e0, 0x1f };
    while (1) {
        c = 0;
        for (int y = 0; y < 480; y++) {
            for (int x = 0; x < 320; x++) {
                ptr[c++] = color[idx];
            }
        }
        idx += 1;
        if (idx >= 3) {
            idx = 0;
        }
        SDL_Delay(100);
    }
    unmap_it(size);
    SDL_Delay(5000);
    SDL_Quit();
    return 0;
}

P.S. 以O_SYNC開啟/dev/mem,可以讓映射出來的Memory成為沒有Cache機制的記憶體,但是,沒有Cache的缺點就是速度會比較慢一點