How to measure program execution time on an ARM Cortex-A53 processor?

Tags:

c

arm64

I was using the following method to read the cycle counter on a Cortex-A15:

/*
 * Sample the performance counter and the wall-clock time.
 *
 * On the first call (while the externally defined flag `enabled` is zero)
 * the performance counters are programmed and enabled through three CP15
 * writes, and the flag is then set so that this setup runs only once.
 *
 * result: caller-provided array of at least three elements, filled with
 *   result[0] = value read from CP15 c9, c13, 0
 *               (presumably the cycle counter -- confirm against the ARMv7 ARM),
 *   result[1] = microseconds part of gettimeofday(),
 *   result[2] = seconds part of gettimeofday().
 */
static void readticks(unsigned int *result)
{
    struct timeval t;
    unsigned int cc;
    if (!enabled) {
        // program the performance-counter control-register:
        // (17 == 0x11, i.e. bits 0 and 4 are set)
        asm volatile("mcr p15, 0, %0, c9, c12, 0" :: "r"(17));
        //enable all counters
        asm volatile("mcr p15, 0, %0, c9, c12, 1" :: "r"(0x8000000f));
        //Clear overflow.
        asm volatile("mcr p15, 0, %0, c9, c12, 3" :: "r"(0x8000000f));
        enabled = 1;
    }
    // read the counter first, then take the timestamp
    asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(cc));
    // the return value of gettimeofday() is not checked
    gettimeofday(&t,(struct timezone *) 0);
    result[0] = cc;
    result[1] = t.tv_usec;
    result[2] = t.tv_sec;
}

And the final performance profiling looks like this:

/* Pseudocode: sample the counter before and after the code under test;
 * the difference between the two samples is the elapsed cycle count. */
before = readticks();
  foo();
after = readticks();
clock_cycles = after - before.

I want to use the same logic on a Cortex-A53 in ARM64 (AArch64) mode, not AArch32.

I tried the following, based on what I found online:

    /*
     * Bit fields of PMCR_EL0, the ARMv8 Performance Monitors Control Register.
     */

    /* All counters, including PMCCNTR_EL0, are disabled/enabled */
    #define QUADD_ARMV8_PMCR_E      (1 << 0)
    /* Reset all event counters, not including PMCCNTR_EL0, to 0 */
    #define QUADD_ARMV8_PMCR_P      (1 << 1)
    /* Reset PMCCNTR_EL0 to 0 */
    #define QUADD_ARMV8_PMCR_C      (1 << 2)
    /* Clock divider: PMCCNTR_EL0 counts every clock cycle/every 64 clock cycles */
    #define QUADD_ARMV8_PMCR_D      (1 << 3)
    /* Export of events is disabled/enabled */
    #define QUADD_ARMV8_PMCR_X      (1 << 4)
    /* Disable cycle counter, PMCCNTR_EL0 when event counting is prohibited */
    #define QUADD_ARMV8_PMCR_DP     (1 << 5)
    /* Long cycle count enable */
    #define QUADD_ARMV8_PMCR_LC     (1 << 6)

    /*
     * Mask of the writable control bits defined above (bits 0..6).  The PMCR
     * write helper applies it so that read-only and reserved fields are never
     * written back.  Guarded so an existing definition elsewhere still wins.
     */
    #ifndef QUADD_ARMV8_PMCR_WR_MASK
    #define QUADD_ARMV8_PMCR_WR_MASK 0x7f
    #endif

/*
 * Read PMCR_EL0, the Performance Monitors Control Register.
 *
 * MRS always transfers a full 64-bit X register, so the asm operand is a
 * 64-bit temporary (a 32-bit operand makes Clang warn about mismatched
 * operand width); the value is narrowed afterwards.
 *
 * Returns the low 32 bits of the register.
 */
static inline unsigned int armv8_pmu_pmcr_read(void)
{
    unsigned long long val;

    asm volatile("mrs %0, pmcr_el0" : "=r" (val));
    return (unsigned int)val;
}
static inline void armv8_pmu_pmcr_write(unsigned int val)
{
    asm volatile("msr pmcr_el0, %0" : :"r" (val & QUADD_ARMV8_PMCR_WR_MASK));
}

/*
 * Turn on the E and X bits of the control register with a
 * read-modify-write, leaving the other bits as they were read.
 */
static void enable_all_counters(void)
{
    const unsigned int enable_bits = QUADD_ARMV8_PMCR_E | QUADD_ARMV8_PMCR_X;

    armv8_pmu_pmcr_write(armv8_pmu_pmcr_read() | enable_bits);
}

/*
 * Request a reset of the event counters (P bit) and of the cycle counter
 * (C bit) with a read-modify-write of the control register.
 */
static void reset_all_counters(void)
{
    const unsigned int reset_bits = QUADD_ARMV8_PMCR_P | QUADD_ARMV8_PMCR_C;

    armv8_pmu_pmcr_write(armv8_pmu_pmcr_read() | reset_bits);
}

static void readticks(unsigned int *result)
{
    struct timeval t;
    unsigned int cc;
    unsigned int val;
    if (!enabled) {
        reset_all_counters();
        enable_all_counters();
        enabled = 1;
    }
    cc = armv8_pmu_pmcr_read();
    gettimeofday(&t,(struct timezone *) 0);
    result[0] = cc;
    result[1] = t.tv_usec;
    result[2] = t.tv_sec;
}

But it fails with an "Illegal instruction" error when I try to profile. Can anyone help me adapt the above code for the Cortex-A53?

628

asked May 08 '15 19:05

rajeevakarv

Video Answer

1 Answer

You need to enable the PMU for user mode. Here is the kernel module I wrote for it (for ARMv7 on a Raspberry Pi 2):

/* Module source file 'module.c'. */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>

/*
 * Write val to the PMU user-enable register (CP15 c9, c14, 0).
 *
 * val: value to write; the caller passes 1 to allow user-mode access to
 *      the performance counters.
 *
 * Only MCR (core register -> coprocessor) is needed.  MRC goes the other
 * way: it would read the coprocessor register into the core register, which
 * enables nothing and, with an input-only asm operand, silently overwrites
 * a register the compiler still believes holds the constant.
 */
static void arm_write(unsigned long val)
{
        asm volatile("mcr p15, 0, %0, c9, c14, 0" :: "r"(val));
}

/*
 * Module init hook: log a message, then call arm_write(1) to switch on
 * user-mode PMU access.  Always reports success by returning 0.
 */
static int enabler(void)
{
     printk(KERN_INFO "Enabling PMU usermode.\n");
     arm_write(1UL);
     return 0;
}

/*
 * Module exit hook: only logs that the module was unloaded.  It does not
 * undo the register write performed at load time.
 */
static void end(void)
{
     printk(KERN_INFO "module unloaded.\n");
}

/* Register enabler() as the load-time hook and end() as the unload-time hook. */
module_init(enabler);
module_exit(end);

/* Module metadata. */
MODULE_AUTHOR("Sama");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Blahblah");

This will enable user-mode access to the PMU. Once you have compiled and loaded it, you need to enable the PMU counters as follows:

/*
 * Program the ARMv7 PMU from user space and print the value of event
 * counter 0.
 *
 * User-mode access to the PMU must already have been enabled by privileged
 * code (for example a kernel module); otherwise the MCR/MRC instructions
 * below trap.
 *
 * Returns 0.
 */
int main(int argc, char **argv){
        int enable_divider = 1;
        int do_reset = 1;
        int value = 1;          // bit 0: enable all counters
        unsigned int output;

        // perform reset:
        if (do_reset) {
                value |= 2;     // reset all counters to zero.
                value |= 4;     // reset cycle counter to zero.
        }

        if (enable_divider) {
                // enable the "by 64" divider for CCNT: the cycle counter then
                // increments by 1 for every 64 CPU cycles.
                value |= 8;
        }

        value |= 16;            // bit 4: enable export of events

        // program the performance-counter control-register with mask constructed above
        asm volatile ("MCR p15, 0, %0, c9, c12, 0\t\n" :: "r"(value));

        // enable all counters:
        asm volatile ("MCR p15, 0, %0, c9, c12, 1\t\n" :: "r"(0x8000000f));

        // clear overflows:
        asm volatile ("MCR p15, 0, %0, c9, c12, 3\t\n" :: "r"(0x80000001));

        // Select individual counter (0)
        asm volatile ("MCR p15,   0,    %0,  c9  ,   c12 ,   5\t\n":: "r"(0x00));

        // Write the event type for the selected counter.
        // NOTE(review): 0xD is NOT the cycle-count event (that would be 0x11);
        // per the ARMv7 PMU event list 0x0D counts immediate branches --
        // confirm this is the event you want.
        asm volatile ("MCR p15,   0,    %0,  c9  ,   c13 ,   1\t\n":: "r"(0xD));

        printf("Hi");

        // Read current event counter
        asm volatile ("MRC p15,   0,    %0,  c9  ,   c13 ,   2\t\n": "=r"(output));
        // "%u" is the conversion for unsigned int ("%ul" printed a stray 'l').
        printf("Event count 0: %u\n", output);
        printf("Normal Execution, No Buffer Overflow Occurred.\n");
        return 0;
}

However, unfortunately, what you get is not only your program's CPU cycles but the CPU cycles of the entire system! So what I recommend is to use perf.

Write your asm code as inline assembly in C and then wrap it like this:

/*
 * Template: count the user-space CPU cycles spent in a block of inline
 * assembly using the Linux perf_event interface, and print the count on
 * stdout.  Adapt the asm placeholders to your own code.
 *
 * z, b: unused by this template.
 *
 * Returns 5.  Exits the process with EXIT_FAILURE if the counter cannot be
 * opened; if the counter cannot be read, an error is reported on stderr
 * instead of printing an indeterminate value.
 *
 * NOTE(review): perf_event_open() has to be provided by the surrounding
 * file; the perf_event_open(2) man page defines it as a thin wrapper around
 * syscall() -- confirm one is in scope.
 */
int dummya(int z, int b){
        struct perf_event_attr pe;
        long long count = 0;
        int fd;

        memset(&pe, 0, sizeof(struct perf_event_attr));
        pe.type = PERF_TYPE_HARDWARE;
        pe.size = sizeof(struct perf_event_attr);
        pe.config = PERF_COUNT_HW_CPU_CYCLES;
        pe.disabled = 1;        /* start stopped; enabled explicitly below */
        pe.exclude_kernel = 1;  /* do not count kernel cycles */
        pe.exclude_hv = 1;      /* do not count hypervisor cycles */
        fd = perf_event_open(&pe, 0, -1, -1, 0);
        if (fd == -1) {
                fprintf(stderr, "Error opening leader %llx\n", pe.config);
                exit(EXIT_FAILURE);
        }

        ioctl(fd, PERF_EVENT_IOC_RESET, 0);
        ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
        /* From here the counter starts. */
        asm("Your ASM Codes");
        asm("Your ASM Codes");
        asm("Your ASM Codes");
        asm("Your ASM Codes");
        asm("Your ASM Codes");
        asm("Your ASM Codes");
        asm("Your ASM Codes");
        asm("Your ASM Codes");
        /* Disabling the counter. */
        ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

        /* A failed or short read leaves no valid count to report. */
        if (read(fd, &count, sizeof(count)) != (ssize_t)sizeof(count)) {
                fprintf(stderr, "Error reading cycle count\n");
        } else {
                printf("%lld\n", count);
        }
        close(fd);
        return 5;
}

And be advised that you need a recent kernel to access the perf driver.

answered Oct 13 '22 09:10

Sama Azari

Related questions
                            
                                Linking compiled library to newer version of glibc
                            
                                IPPROTO_RM blocks during accept call
                            
                                Macro hell: Platform-independent pointer to setjmp/sigsetjmp
                            
                                Get GCC To Use Carry Logic For Arbitrary Precision Arithmetic Without Inline Assembly?
                            
                                Maximum stack size needed for a C program on MSP430
                            
                                ALSA snd_pcm_drop() is not clearing complete buffer
                            
                                Block Matching optimization using x86/x64 Streaming SIMD Extension
                            
                                Python-C integration: Ctypes, CFFI or create a Binary Module
                            
                                Does Posix supply format string macros for printf/scanf?
                            
                                Is it possible to port GNU grep as a library? [closed]
                            
                                Python C Module - Malloc fails in specific version of Python
                            
                                Automated testing of bare metal C code (microprocessor firmware): simulating changes in hardware registers
                            
                                openssl cannot get ENGINE_by_id() to work
                            
                                HOWTO: Cross-Operating-System Large File IO in C?
                            
                                Compiling FFmpeg 2.3 with Android NDK r10
                            
                                Build system for project that uses C and Haskell
                            
                                Timezone file to POSIX timezone string
                            
                                gcc disable ALL warnings for a few lines of code
                            
                                How to find if a function is reentrant
                            
                                MPI One Sided: Exclusive Lock with MPI_Win_lock_all

Donate For Us

If you love us, you can donate via PayPal or buy us a coffee so we can maintain and grow. Thank you!

Donate Us With