Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Prevent GCC from optimizing away a looped write to memory mapped address

I've got an address which points to a control port like so (for context I'm working on a Sega/Megadrive game):

volatile u32 * vdp_ctrl = (u32 *) 0x00C00004;

And an array of initial values I want to set:

const u8 initial_vdp_vals[24] = {
  0x20,
  0x74,
  0x30,
  ..etc
};

With a loop:

typedef struct {
  u16 upper;
  u8 reg;
  u8 val;
} bitset;

typedef union {
  bitset b;
  u32 as_u32;
} u_bitset;

static inline void init_vdp() {
  u_bitset cmd = {{0x00008000}};
  for(int i = 0; i < 24; i++) {
     cmd.b.val = initial_vdp_vals[i];
     *vdp_ctrl = cmd.as_u32;
     cmd.b.reg += 1;
  }
}

The problem is that gcc (with O2 at least) optimizes this away and only writes the last value to the *vdp_ctrl pointer. I've managed to get around this by setting one of the properties in the bitset struct to volatile, but that doesn't seem like an intuitive solution, and the assembly it generates as a result is very lengthy.

So my question is two fold:

  1. What's the "correct" way to suggest to gcc that my vdp_ctrl pointer needs to accept multiple writes over time. I'm going to be using this pattern often (writing const/static data to control/data addresses in a loop), and marking random fields as volatile seem unintuitive.
  2. The "standard of quality" I'm using for the generated asm looks like the following (not gcc generated):
    move.l #initial_vdp_vals, a0
    move.l #24, d0
    move.l #0x00008000, d1

.copy:
    move.b (a0)+, d1
    move.w d1, 0x00C00004
    add.w #0x0100, d1
    dbra d0, .copy

Which is very nice and succinct. So my other question might be (as a complete C novice): Is there a better approach to my C solution to get me closer to the assembly above? I'm not even sure if my code is correct to be honest as I'm just trying to get past this loop optimization issue first since I know it's going to be a constant issue moving forward.

Runnable example to produce my issue:

volatile unsigned long * vdp_ctrl = (unsigned long *) 0x00C00004;

const unsigned char initial_vdp_vals[24] = {
  0x20,
  0x74,
  0x30,
  0x40,
  0x05,
  0x70,
  0x00,
  0x00,
  0x00,
  0x00,
  0x00,
  0x08,
  0x81,
  0x34,
  0x00,
  0x00,
  0x01,
  0x00,
  0x00,
  0x00,
  0x00,
  0x00,
  0x00,
  0x00
};

typedef struct {
  unsigned int upper;
  unsigned char reg;
  unsigned char val;
} bitset;

typedef union {
  bitset b;
  unsigned long as_u32;
} u_bitset;

static inline void init_vdp() {
  u_bitset cmd = {{0x00008000}};
  for(int i = 0; i < 24; i++) {
     cmd.b.val = initial_vdp_vals[i];
     *vdp_ctrl = cmd.as_u32;
  }
}

void init() {
  init_vdp();
  for(;;) {
  }
}

With m68k-linux-gnu-gcc -ffreestanding -O2 -S -c test.c -o test.s. Generates the following:

#NO_APP
    .file   "test.c"
    .text
    .align  2
    .globl  init
    .type   init, @function
init:
    link.w %fp,#0
    move.l vdp_ctrl,%a0
    moveq #24,%d0
    move.l #32768,%d1
.L2:
    move.l %d1,(%a0)
    subq.l #1,%d0
    jne .L2
.L3:
    jra .L3
    .size   init, .-init
    .globl  initial_vdp_vals
    .section    .rodata
    .type   initial_vdp_vals, @object
    .size   initial_vdp_vals, 24
initial_vdp_vals:
    .byte   32
    .byte   116
    .byte   48
    .byte   64
    .byte   5
    .byte   112
    .byte   0
    .byte   0
    .byte   0
    .byte   0
    .byte   0
    .byte   8
    .byte   -127
    .byte   52
    .byte   0
    .byte   0
    .byte   1
    .byte   0
    .byte   0
    .byte   0
    .byte   0
    .byte   0
    .byte   0
    .byte   0
    .globl  vdp_ctrl
    .data
    .align  2
    .type   vdp_ctrl, @object
    .size   vdp_ctrl, 4
vdp_ctrl:
    .long   12582916
    .ident  "GCC: (Ubuntu 7.4.0-1ubuntu1~18.04.1) 7.4.0"
    .section    .note.GNU-stack,"",@progbits
gcc version 7.4.0 (Ubuntu 7.4.0-1ubuntu1~18.04.1)

Note: It does appear the size of my array determines if it gets optimized out or not. When I made it only two elements it did not optimize out.

like image 572
grep Avatar asked Nov 07 '22 09:11

grep


1 Answers

In this kind of code you shuold use exact size integers. I strongly advice to pack the structures and unions as well.

#include <stdint.h>
#define vdp_ctrl  ((volatile uint32_t *) 0x00C00004)

const unsigned char initial_vdp_vals[24] = {
  0x20,
  0x74,
  0x30,
  0x40,
  0x05,
  0x70,
  0x00,
  0x00,
  0x00,
  0x00,
  0x00,
  0x08,
  0x81,
  0x34,
  0x00,
  0x00,
  0x01,
  0x00,
  0x00,
  0x00,
  0x00,
  0x00,
  0x00,
  0x00
};

typedef struct {
  uint16_t upper;
  uint8_t reg;
  uint8_t val;
} __attribute__((packed)) bitset;

typedef union {
  bitset b;
  uint32_t as_u32;
} __attribute__((packed)) u_bitset ;

static inline void init_vdp() {
  u_bitset cmd = {.b.upper = 0x00008000};
  for(int i = 0; i < 24; i++) 
  {
     cmd.b.val = initial_vdp_vals[i];
     *vdp_ctrl = cmd.as_u32;
  }
}

void init() {
  init_vdp();
  for(;;) {
  }
}

and it generates the code you need.

IMO it better yo have macro instead of the real object. It may not make any difference in this trivial code, but it will if the code becomes more complicated.

like image 63
0___________ Avatar answered Nov 15 '22 06:11

0___________