Appendix B: Embedded and RTOS Implementation
"In embedded systems, every cycle counts." — Embedded Systems Proverb
Simulator Environment Setup
Since this book doesn't assume readers have physical hardware, all exercises use simulators.
QEMU ARM Setup
# Install QEMU (the ARM system emulator package)
sudo apt install qemu-system-arm
# Install ARM toolchain (the bare-metal cross compiler used in Exercise 1)
sudo apt install gcc-arm-none-eabi
# Test QEMU: list the supported machine models and keep the lm3s entries
qemu-system-arm -M help | grep lm3s
# Expected output includes:
# lm3s6965evb Stellaris LM3S6965EVB (Cortex-M3)
QEMU RISC-V Setup
# Install QEMU with RISC-V system emulation.
# qemu-system-riscv32 and qemu-system-riscv64 are binary names, not apt
# package names; on Debian/Ubuntu both binaries ship in qemu-system-misc.
# (On releases that package the RISC-V targets separately, install
# qemu-system-riscv instead.)
sudo apt install qemu-system-misc
# Install RISC-V toolchain
sudo apt install gcc-riscv64-unknown-elf
# Test QEMU: list the supported machine models and keep the SiFive entries
qemu-system-riscv32 -M help | grep sifive
# Expected output includes:
# sifive_e RISC-V Board compatible with SiFive E SDK
# sifive_u RISC-V Board compatible with SiFive U SDK
Renode Setup
# Download Renode (the 1.14.0 amd64 Debian package from the GitHub releases page)
wget https://github.com/renode/renode/releases/download/v1.14.0/renode_1.14.0_amd64.deb
# Install the downloaded package
sudo dpkg -i renode_1.14.0_amd64.deb
# Test: print the installed version
renode --version
Exercise 1: ARM Cortex-M Cycle Counting
Goal
Use DWT (Data Watchpoint and Trace) cycle counter to measure function execution time.
Code
// cycle_count.c - ARM Cortex-M3 cycle counting example
#include <stdint.h>
// DWT register definitions
// Each macro dereferences a fixed address as a volatile 32-bit word, so every
// use in the code below is a real load or store of that register.
// DWT_CTRL: control register; bit 0 (CYCCNTENA) is set by dwt_init()
#define DWT_CTRL (*(volatile uint32_t*)0xE0001000)
// DWT_CYCCNT: the cycle counter value read by the measurement code
#define DWT_CYCCNT (*(volatile uint32_t*)0xE0001004)
// DEMCR: holds the TRCENA bit (bit 24) that dwt_init() sets first
#define DEMCR (*(volatile uint32_t*)0xE000EDFC)
// Start the DWT cycle counter from zero.
// The three register accesses happen in this order: set TRCENA in DEMCR,
// clear CYCCNT, then set CYCCNTENA in DWT_CTRL.
void dwt_init(void)
{
    DEMCR = DEMCR | (1 << 24);  // TRCENA
    DWT_CYCCNT = 0;
    DWT_CTRL = DWT_CTRL | 1;    // CYCCNTENA
}
// Time a single call of func with the DWT cycle counter.
//
// func: the routine to run once between the two counter reads.
// Returns the counter value after the call minus the value before it; the
// subtraction is done in uint32_t, so it is taken modulo 2^32.
uint32_t measure_cycles(void (*func)(void))
{
    const uint32_t before = DWT_CYCCNT;
    func();
    return DWT_CYCCNT - before;
}
// Workload to be timed: adds 0..999 into a local accumulator.
// The accumulator is volatile, so each addition is a real read followed by a
// real write rather than something the compiler may fold away.
void test_function(void)
{
    volatile int total = 0;
    int k = 0;
    while (k < 1000) {
        total = total + k;
        k++;
    }
}
// Entry point: enables the cycle counter, times one run of test_function,
// then spins forever (the trailing return is never reached).
int main(void)
{
    dwt_init();
    uint32_t elapsed = measure_cycles(test_function);
    // Use semihosting for output
    // printf("Cycles: %u\n", elapsed);
    for (;;) {
    }
    return 0;
}
Compile and Run
# Compile for Cortex-M3 in Thumb mode with the nosys and nano specs files,
# linking with the linker script linker.ld
arm-none-eabi-gcc -mcpu=cortex-m3 -mthumb \
-specs=nosys.specs -specs=nano.specs \
-T linker.ld -o cycle_count.elf cycle_count.c
# Run in QEMU on the lm3s6965evb machine, without a graphical window and
# with semihosting enabled
qemu-system-arm -M lm3s6965evb -nographic \
-kernel cycle_count.elf -semihosting
Exercise 2: RISC-V mcycle/minstret
Goal
Use RISC-V CSRs to read cycle count and instruction count.
Code
// riscv_counters.c - RISC-V performance counter example
#include <stdint.h>
// Read the 64-bit machine cycle counter (mcycle).
//
// On RV32 the counter is split across two 32-bit CSRs (mcycle / mcycleh) that
// cannot be read atomically. If the low half rolls over between the two reads,
// a plain low-then-high read returns a value that is off by 2^32. The high half
// is therefore read before and after the low half, and the read is retried
// until both high reads agree.
// On RV64 the mcycleh CSR does not exist and a single csrr returns the full
// 64-bit value.
//
// Returns the current cycle count. Must run in machine mode (mcycle is a
// machine-level CSR).
static inline uint64_t read_mcycle(void) {
#if defined(__riscv_xlen) && (__riscv_xlen >= 64)
    uint64_t value;
    __asm__ volatile ("csrr %0, mcycle" : "=r"(value));
    return value;
#else
    uint32_t hi, lo, hi_again;
    do {
        __asm__ volatile (
            "csrr %0, mcycleh\n"
            "csrr %1, mcycle\n"
            "csrr %2, mcycleh\n"
            : "=r"(hi), "=r"(lo), "=r"(hi_again)
        );
    } while (hi != hi_again);  // low half wrapped during the read: try again
    return ((uint64_t)hi << 32) | lo;
#endif
}
// Read the 64-bit retired-instruction counter (minstret).
//
// On RV32 the counter is split across two 32-bit CSRs (minstret / minstreth)
// that cannot be read atomically. If the low half rolls over between the two
// reads, a plain low-then-high read returns a value that is off by 2^32. The
// high half is therefore read before and after the low half, and the read is
// retried until both high reads agree.
// On RV64 the minstreth CSR does not exist and a single csrr returns the full
// 64-bit value.
//
// Returns the current retired-instruction count. Must run in machine mode
// (minstret is a machine-level CSR).
static inline uint64_t read_minstret(void) {
#if defined(__riscv_xlen) && (__riscv_xlen >= 64)
    uint64_t value;
    __asm__ volatile ("csrr %0, minstret" : "=r"(value));
    return value;
#else
    uint32_t hi, lo, hi_again;
    do {
        __asm__ volatile (
            "csrr %0, minstreth\n"
            "csrr %1, minstret\n"
            "csrr %2, minstreth\n"
            : "=r"(hi), "=r"(lo), "=r"(hi_again)
        );
    } while (hi != hi_again);  // low half wrapped during the read: try again
    return ((uint64_t)hi << 32) | lo;
#endif
}
// Calculate CPI (cycles per instruction) for one call of func.
//
// func: the routine to run once between the two pairs of counter reads.
// The result is produced as two integers, cpi_int and cpi_frac, meaning
// "cpi_int.cpi_frac" with cpi_frac being the two decimal digits after the
// point. The function has no output channel of its own; hook one up where the
// values are computed.
void measure_cpi(void (*func)(void)) {
    uint64_t cycles_start = read_mcycle();
    uint64_t instrs_start = read_minstret();
    func();
    uint64_t cycles_end = read_mcycle();
    uint64_t instrs_end = read_minstret();

    uint64_t cycles = cycles_end - cycles_start;
    uint64_t instrs = instrs_end - instrs_start;

    // No instructions counted (e.g. the counter is not advancing): dividing
    // would be undefined behavior, so there is no CPI to report.
    if (instrs == 0) {
        return;
    }

    // CPI = cycles / instructions, using integer division
    uint32_t cpi_int = (uint32_t)(cycles / instrs);
    // Two fractional digits taken from the remainder; same value as
    // (cycles * 100 / instrs) % 100 but without overflowing cycles * 100.
    uint32_t cpi_frac = (uint32_t)((cycles % instrs) * 100 / instrs);

    // Output: CPI = cpi_int.cpi_frac
    (void)cpi_int;
    (void)cpi_frac;
}
// Entry point placeholder for the exercise: the test code is left to the
// reader (for example, passing a workload function to measure_cpi()).
int main(void) {
// Test...
return 0;
}
Exercise 3: FreeRTOS Context Switch Measurement
Goal
Measure FreeRTOS context switch time.
Method
Measurement method:
1. Create two tasks
2. Task A records time, then yields
3. Task B records time, then yields
4. Calculate time difference
Time difference ≈ Context switch time (the result also includes the small overhead of the yield call and of reading the timestamps)
FreeRTOS Code
// context_switch.c - FreeRTOS context switch measurement
#include "FreeRTOS.h"
#include "task.h"
// Cycle-counter values recorded by TaskA and TaskB just before each yields.
// Declared volatile because they are shared between the two tasks.
volatile uint32_t timestamp_a, timestamp_b;
// Latest measurement: cycles from TaskB's timestamp to TaskA running again
volatile uint32_t switch_time;
// Task A: records a timestamp, yields, and once it is running again stores
// the number of cycles elapsed since Task B's most recent timestamp.
// pvParameters is not used.
void TaskA(void *pvParameters)
{
    for (;;) {
        timestamp_a = DWT_CYCCNT;
        taskYIELD();
        // Running again: measure from the timestamp Task B took before yielding
        const uint32_t resumed_at = DWT_CYCCNT;
        switch_time = resumed_at - timestamp_b;
    }
}
// Task B: counterpart of Task A. Records a timestamp and immediately yields,
// forever. pvParameters is not used.
void TaskB(void *pvParameters)
{
    for (;;) {
        timestamp_b = DWT_CYCCNT;
        taskYIELD();
    }
}
int main(void) {
dwt_init();
xTaskCreate(TaskA, "TaskA", 128, NULL, 1, NULL);
xTaskCreate(TaskB, "TaskB", 128, NULL, 1, NULL);
vTaskStartScheduler();
while (1);
}
Running on Renode
# Create Renode script
# The quoted 'EOF' delimiter makes the shell write the lines below verbatim.
# The script creates a machine, loads the stm32f4 platform description and the
# context_switch.elf image, opens an analyzer on a UART, and starts emulation.
# NOTE(review): confirm that stm32f4.repl defines a peripheral named "uart1";
# if the platform file uses a different name (e.g. usart1), the showAnalyzer
# line must be changed to match.
cat > freertos_test.resc << 'EOF'
mach create
machine LoadPlatformDescription @platforms/cpus/stm32f4.repl
sysbus LoadELF @context_switch.elf
showAnalyzer sysbus.uart1
start
EOF
# Run
renode freertos_test.resc
Exercise 4: Interrupt Latency Measurement
Measure time from interrupt trigger to ISR execution start.
Measurement Method Description
Measurement steps:
1. Record time before triggering interrupt
2. Record time at ISR start
3. Calculate difference
Notes:
- Need to consider interrupt priority
- Need to consider impact of other interrupts
- Multiple measurements for statistics
Code
// interrupt_latency.c
// Shared between measure_interrupt_latency() and SysTick_Handler(); volatile
// because they are written in one context and read in the other.
// Cycle-counter value sampled by measure_interrupt_latency()
volatile uint32_t trigger_time;
// Cycle-counter value sampled at the start of SysTick_Handler()
volatile uint32_t isr_start_time;
// isr_start_time - trigger_time, computed inside the handler
volatile uint32_t latency;
// SysTick exception handler: samples the cycle counter and stores the number
// of cycles elapsed since trigger_time in latency.
void SysTick_Handler(void) {
// Read the counter as the very first statement so no handler work is counted
isr_start_time = DWT_CYCCNT;
// uint32_t subtraction, i.e. taken modulo 2^32
latency = isr_start_time - trigger_time;
}
// Measure the latency from requesting the SysTick exception to the first
// statement of SysTick_Handler(). On return, `latency` holds the result in
// CPU cycles.
//
// The exception is pended by software (ICSR.PENDSTSET) right after the start
// timestamp is taken. Arming the SysTick timer and waiting for it to expire
// would fold the whole countdown (and the WFI wake-up) into the measurement,
// and a timer left running would keep re-entering the handler and overwrite
// `latency` using a stale trigger_time.
//
// Preconditions: the cycle counter is running (dwt_init()) and exceptions are
// not masked; otherwise the wait loop below never terminates.
void measure_interrupt_latency(void) {
    // Interrupt Control and State Register and its "set SysTick pending" bit,
    // written as a raw address in the same way as the DWT registers.
    volatile uint32_t *const icsr = (volatile uint32_t *)0xE000ED04u;
    const uint32_t pendstset = (uint32_t)1 << 26;

    // Keep the periodic timer stopped so the only SysTick exception is the
    // one requested below.
    SysTick->CTRL = 0;

    // Timestamp, then trigger immediately. Only the PENDSTSET bit is written
    // as 1; writing 0 to the other ICSR bits has no effect.
    trigger_time = DWT_CYCCNT;
    *icsr = pendstset;

    // The pending bit is cleared when the handler is entered, so once it
    // reads back as 0 the handler has run and `latency` is valid.
    while ((*icsr & pendstset) != 0) {
    }
    // latency now contains interrupt latency
}
Exercise 5: Memory Access Pattern Analysis
Analyze the impact of different memory access patterns on performance.
Memory Access Code
// memory_access.c
// Number of 32-bit elements in the test array
#define ARRAY_SIZE 1024
// Target of all three access patterns; volatile so that every store in the
// timed loops is actually performed
volatile uint32_t array[ARRAY_SIZE];
// Sequential access: writes array[0], array[1], ... array[ARRAY_SIZE - 1] in
// index order, storing each element's own index.
// Returns the DWT cycle-counter difference across the loop (modulo 2^32).
uint32_t sequential_access(void)
{
    const uint32_t t0 = DWT_CYCCNT;
    int idx = 0;
    while (idx < ARRAY_SIZE) {
        array[idx] = idx;
        idx++;
    }
    return DWT_CYCCNT - t0;
}
// Strided access (stride = 16): makes 16 passes over the array; pass p writes
// elements p, p + 16, p + 32, ... so every element is written exactly once,
// each with its own index.
// Returns the DWT cycle-counter difference across both loops (modulo 2^32).
uint32_t strided_access(void)
{
    const uint32_t t0 = DWT_CYCCNT;
    for (int phase = 0; phase < 16; phase++) {
        int idx = phase;
        while (idx < ARRAY_SIZE) {
            array[idx] = idx;
            idx += 16;
        }
    }
    return DWT_CYCCNT - t0;
}
// Random access: performs ARRAY_SIZE writes, where the k-th write stores k at
// array[indices[k]].
//
// indices: caller-supplied table; the loop reads entries 0..ARRAY_SIZE - 1 and
//          uses each one as a subscript without checking it, so every entry
//          must be a valid index into array.
// Returns the DWT cycle-counter difference across the loop (modulo 2^32).
uint32_t random_access(uint32_t *indices)
{
    const uint32_t t0 = DWT_CYCCNT;
    int k = 0;
    while (k < ARRAY_SIZE) {
        array[indices[k]] = k;
        k++;
    }
    return DWT_CYCCNT - t0;
}
Power Measurement (Theory)
Without physical hardware, power measurement can only be discussed theoretically.
Measurement Equipment
Power measurement equipment:
1. Current Probe
- Connected in series with power line
- Measures current waveform
- Example: Keysight N2820A
2. Power Analyzer
- High precision power measurement
- Example: Keysight N6705C
3. Built-in on Dev Boards
- STM32 Nucleo IDD jumper
- Nordic PPK2
Measurement Method
Power measurement steps:
1. Baseline measurement
- Measure idle state power
- Measure various sleep mode power
2. Dynamic measurement
- Run benchmark
- Record power waveform
- Calculate average power
3. Energy calculation
Energy = ∫ Power(t) dt
≈ Σ Power[i] × Δt
Simulated Power Estimation
# Simplified power model
def estimate_power(cycles, frequency_mhz, voltage_v, *,
                   capacitance_f=10e-12, activity_factor=0.3,
                   static_power_w=1e-3):
    """
    Estimate total power (dynamic + static) in watts.

    Dynamic power follows P = α × C × V² × f, and a constant static power
    is added on top.

    Args:
        cycles: Accepted for interface compatibility; it does not enter the
            power formula (power depends on frequency, not on how many
            cycles were run).
        frequency_mhz: Clock frequency in MHz.
        voltage_v: Supply voltage in volts.
        capacitance_f: Switching capacitance per cycle C in farads
            (default 10 pF).
        activity_factor: Activity factor α (default 0.3).
        static_power_w: Static power in watts (default 1 mW).

    Returns:
        Estimated power in watts.
    """
    frequency_hz = frequency_mhz * 1e6
    dynamic_power = (activity_factor * capacitance_f
                     * (voltage_v ** 2) * frequency_hz)
    return dynamic_power + static_power_w
# Example: 1,000,000 cycles at 100 MHz with a 1.8 V supply, reported in mW
power = estimate_power(cycles=1_000_000, frequency_mhz=100, voltage_v=1.8)
print(f"Estimated power: {power * 1000:.2f} mW")
Summary
Key techniques for embedded performance measurement:
Timing Measurement
- ARM: DWT Cycle Counter
- RISC-V: mcycle/minstret CSR
- General: SysTick, hardware timers
RTOS Measurement
- Context switch time
- Interrupt latency
- Task switching overhead
Memory Analysis
- Access pattern impact
- Cache effects (if available)
- Alignment impact
Power Measurement
- Requires dedicated equipment
- Dynamic vs static power
- Energy efficiency metrics