ARM32 prefetch triggers infinite loop #1139

Open · arichardson (Contributor) opened this issue May 15, 2024 · 1 comment
Describe the bug
I was trying to run a bare-metal arm32 binary built with picolibc, whose strlen() uses a software prefetch (pld [srcin, #0]). When using TimingSimpleCPU or MinorCPU, the CPU stops making any forward progress once the prefetch executes and just keeps processing DRAM events.
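For context, the failing pattern looks roughly like the sketch below (illustrative only, not picolibc's actual strlen; the function name is made up):

#include <stddef.h>

// A strlen-style loop that issues a software prefetch before each load;
// the pld hint is what hangs the timing CPU models.
size_t strlen_with_prefetch(const char *s)
{
    const char *p = s;
    for (;;) {
        __asm__ volatile("pld [%0, #0]" : : "r"(p)); // software prefetch
        if (*p == '\0')
            return (size_t)(p - s);
        ++p;
    }
}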

Affects version
develop: 65976e4

gem5 Modifications
None.

To Reproduce
Steps to reproduce the behavior, starting from a clean repository:

  1. Compile build/ALL/gem5.debug
  2. Build https://github.com/picolibc/picolibc for arm32
  3. I created an example config based on riscv/fs_linux and arm/baremetal.py:
import argparse
from m5.objects import *
from m5.util import addToPath
addToPath("..")
from common import CacheConfig, CpuConfig, MemConfig, Options, Simulation, ObjectList
from common.Caches import IOCache
from common.FSConfig import MemBus
from common.Benchmarks import SysConfig
from gem5.isas import ISA

parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
chosen_isa = ISA.ARM
Options.addCommonOptions(parser, chosen_isa)
Options.addFSOptions(parser)
parser.add_argument("--semihosting-root", default="/tmp", type=str,
                    help="The root directory for files exposed to semihosting")
parser.set_defaults(
    bare_metal=True,
    num_cpus=1,
    # cpu_type=CpuConfig.isa_string_map[chosen_isa] + "MinorCPU",
    cpu_type=CpuConfig.isa_string_map[chosen_isa] + "TimingSimpleCPU",
    # cpu_type=CpuConfig.isa_string_map[chosen_isa] + "AtomicSimpleCPU",
    mem_size="128MB",
    cpu_clock="1GHz",
    caches=True,
    l2cache=True,
    num_l2caches=1,
    num_l3caches=0,
)
args = parser.parse_args()
# CPU and Memory
(CPUClass, mem_mode, FutureClass) = Simulation.setCPUClass(args)
MemClass = Simulation.setMemClass(args)
np = args.num_cpus
assert args.num_cpus == 1

mdesc = SysConfig(
    mem=args.mem_size,
    os_type=args.os_type,
)
system = ArmSystem()
system.mem_ranges = [AddrRange(start=0, size=mdesc.mem())]
system.highest_el_is_64 = False
system.release = Armv8()
system.auto_reset_addr = False
system.reset_addr = 0x00000000
system.semihosting = ArmSemihosting(
    files_root_dir=args.semihosting_root,
    cmd_line=args.kernel,
)
system.workload = ArmFsWorkload(
    object_file=args.kernel, early_kernel_symbols=True, panic_on_panic=True
)

system.mem_mode = mem_mode
# assert mem_mode == "timing"
system.iobus = IOXBar()
system.membus = MemBus()
system.platform = RealView()
system.platform.attachIO(system.iobus)
off_chip_ranges = system.platform._off_chip_ranges
system.system_port = system.membus.cpu_side_ports
system.bridge = Bridge(delay="50ns")
system.bridge.mem_side_port = system.iobus.cpu_side_ports
system.bridge.cpu_side_port = system.membus.mem_side_ports
system.bridge.ranges = off_chip_ranges
system.platform.attachOnChipIO(system.membus)

# ---------------------------- Default Setup --------------------------- #
# Set the cache line size for the entire system
system.cache_line_size = args.cacheline_size
# Create a top-level voltage domain
system.voltage_domain = VoltageDomain(voltage=args.sys_voltage)
# Create a source clock for the system and set the clock period
system.clk_domain = SrcClockDomain(
    clock=args.sys_clock, voltage_domain=system.voltage_domain
)
# Create a CPU voltage domain
system.cpu_voltage_domain = VoltageDomain()
# Create a source clock for the CPUs and set the clock period
system.cpu_clk_domain = SrcClockDomain(
    clock=args.cpu_clock, voltage_domain=system.cpu_voltage_domain
)
system.init_param = args.init_param

system.cpu = [CPUClass(clk_domain=system.cpu_clk_domain, cpu_id=i) for i in range(np)]

# By default the IOCache runs at the system clock
system.iocache = IOCache(addr_ranges=system.mem_ranges)
system.iocache.cpu_side = system.iobus.mem_side_ports
system.iocache.mem_side = system.membus.cpu_side_ports

for i in range(np):
  assert not args.simpoint_profile, "Unsupported"
  assert not ObjectList.is_kvm_cpu(CPUClass), "kvm unsupported"
  if args.checker:
    system.cpu[i].addCheckerCpu()
  if args.bp_type:
    bpClass = ObjectList.bp_list.get(args.bp_type)
    system.cpu[i].branchPred = bpClass()
  if args.indirect_bp_type:
    IndirectBPClass = ObjectList.indirect_bp_list.get(args.indirect_bp_type)
    system.cpu[i].branchPred.indirectBranchPred = IndirectBPClass()
  system.cpu[i].createThreads()

# ---------------------------- Default Setup --------------------------- #

CacheConfig.config_cache(args, system)
MemConfig.config_mem(args, system)
root = Root(full_system=True, system=system)
Simulation.setWorkCountOptions(system, args)
Simulation.run(args, root, system, FutureClass)
  4. Run build/ALL/gem5.debug --verbose --debug-flags=Semihosting,ExecAll,Faults,Decode,Decoder,MemCtrl,Event,CacheAll,MemoryAccess --debug-file=trace.log configs/example/repro.py --kernel $HOME/picolibc-arm32-build/test/posix-io

Terminal Output
I get the expected terminal output until the prefetch instruction is executed (Decoded pld instruction: 0x18f890f000), at which point the simulation enters an infinite loop. This does not happen with AtomicSimpleCPU.
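For reference, the trapped encoding decodes as a prefetch of [r0] with a zero offset (worked out from the ARMv7-A encoding tables, not from the gem5 trace itself):

/* Thumb-2 PLD (immediate), encoding T1:
 *   0xf890f000 = 1111 1000 1001 0000   1111 0000 0000 0000
 *                            Rn = r0        imm12 = 0
 *   => pld [r0, #0]
 */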

The last few hundred lines of debug output for --debug-flags=Semihosting,ExecAll,Faults,Decode,Decoder,MemCtrl,Event,CacheAll,MemoryAccess before the infinite loop:

760766000: Event_64: Timing CPU icache tick 64 scheduled @ 760766000
760766000: Event_64: Timing CPU icache tick 64 executed @ 760766000
760766000: system.cpu.decoder: Second half of 32 bit Thumb: 0xf890f000.
760766000: system.cpu.decoder: Decode: Decoded pld instruction: 0x18f890f000
760766000: system.cpu.dcache: access for SoftPFReq [5695:5695] (s) UC
760766000: system.cpu.dcache.mshr_queue: Allocating new MSHR. Number in use will be 1/4
760766000: system.cpu.dcache.mshr_queue.entry.targets: New target allocated: SoftPFReq [5695:5695] (s) UC
760766000: system.cpu.dcache.mem_side_port: Scheduling send event at 760768000
760766000: system.cpu.dcache.mem_side_port-MemSidePort.wrapped_function_event: EventFunctionWrapped 70 scheduled @ 760768000
760768000: system.cpu.dcache.mem_side_port-MemSidePort.wrapped_function_event: EventFunctionWrapped 70 executed @ 760768000
760768000: system.cpu.dcache: sendMSHRQueuePacket: MSHR SoftPFReq [5695:5695] (s) UC
760768000: system.l2: access for SoftPFReq [5695:5695] (s) UC
760768000: system.l2.mshr_queue: Allocating new MSHR. Number in use will be 1/20
760768000: system.l2.mshr_queue.entry.targets: New target allocated: SoftPFReq [5695:5695] (s) UC
760768000: system.l2.mem_side_port: Scheduling send event at 760789000
760768000: system.l2.mem_side_port-MemSidePort.wrapped_function_event: EventFunctionWrapped 88 scheduled @ 760789000
760768000: system.tol2bus.reqLayer0.wrapped_function_event: EventFunctionWrapped 108 scheduled @ 760769000
760769000: system.tol2bus.reqLayer0.wrapped_function_event: EventFunctionWrapped 108 executed @ 760769000
760789000: system.l2.mem_side_port-MemSidePort.wrapped_function_event: EventFunctionWrapped 88 executed @ 760789000
760789000: system.l2: sendMSHRQueuePacket: MSHR SoftPFReq [5695:5695] (s) UC
760789000: system.mem_ctrls: recvTimingReq: request SoftPFReq addr 0x5695 size 1
760789000: system.mem_ctrls: Read queue limit 32, current size 0, entries needed 1
760789000: system.mem_ctrls: Read queue limit 32, current size 0, entries needed 1
760789000: system.mem_ctrls: Adding to read queue
760789000: system.mem_ctrls: Request scheduled immediately
760789000: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 92 scheduled @ 760789000
760789000: system.membus.reqLayer1.wrapped_function_event: EventFunctionWrapped 96 scheduled @ 760790000
760789000: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 92 executed @ 760789000
760789000: system.mem_ctrls: QoS Turnarounds selected state READ 
760789000: system.mem_ctrls: Single request, going to a free rank
760789000: system.mem_ctrls: Removing burstTick for 760250000
760789000: system.mem_ctrls: Access to 0x5695, ready at 760807750 next burst at 760794000.
760789000: system.mem_ctrls: Command for 0x5695, issued at 760789000.
760789000: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 93 scheduled @ 760807750
760789000: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 92 scheduled @ 760789000
760789000: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 92 executed @ 760789000
760789000: system.mem_ctrls: QoS Turnarounds selected state READ 
760790000: system.membus.reqLayer1.wrapped_function_event: EventFunctionWrapped 96 executed @ 760790000
760807750: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 93 executed @ 760807750
760807750: system.mem_ctrls: processRespondEvent(): Some req has reached its readyTime
760807750: system.mem_ctrls: Responding to Address 0x5695.. 
760807750: global: Read from cpu.data of size 1 on address 0x5695 data 0x3a U
760807750: system.mem_ctrls.port-RespPacketQueue.wrapped_function_event: EventFunctionWrapped 91 scheduled @ 761275750
760807750: system.mem_ctrls: Done
761275750: system.mem_ctrls.port-RespPacketQueue.wrapped_function_event: EventFunctionWrapped 91 executed @ 761275750
761275750: system.membus.cpu_side_port[2]-RespPacketQueue.wrapped_function_event: EventFunctionWrapped 104 scheduled @ 761278000
761275750: system.membus.respLayer2.wrapped_function_event: EventFunctionWrapped 105 scheduled @ 761278000
761278000: system.membus.respLayer2.wrapped_function_event: EventFunctionWrapped 105 executed @ 761278000
761278000: system.membus.cpu_side_port[2]-RespPacketQueue.wrapped_function_event: EventFunctionWrapped 104 executed @ 761278000
761278000: system.l2: recvTimingResp: Handling response SoftPFResp [5695:5695] (s) UC
761278000: system.l2.mshr_queue: Deallocating all targets: [0x5680:0x56bf](s) Forward   state: Unc InSvc    
761278000: system.l2.mshr_queue: MSHR deallocated. Number in use: 0/20
761278000: system.l2: recvTimingResp: Leaving with SoftPFResp [5695:5695] (s) UC
763052500: 0.wrapped_function_event: EventFunctionWrapped 11 executed @ 763052500
763052500: 0.wrapped_function_event: EventFunctionWrapped 10 scheduled @ 763066250
763052500: 1.wrapped_function_event: EventFunctionWrapped 17 executed @ 763052500
763052500: 1.wrapped_function_event: EventFunctionWrapped 18 scheduled @ 763052500
763052500: 1.wrapped_function_event: EventFunctionWrapped 18 executed @ 763052500
763052500: 1.wrapped_function_event: EventFunctionWrapped 17 scheduled @ 763052500
763052500: 1.wrapped_function_event: EventFunctionWrapped 17 executed @ 763052500
763052500: 1.wrapped_function_event: EventFunctionWrapped 17 scheduled @ 763312500
763066250: 0.wrapped_function_event: EventFunctionWrapped 10 executed @ 763066250
763066250: 0.wrapped_function_event: EventFunctionWrapped 12 scheduled @ 763066250
763066250: 0.wrapped_function_event: EventFunctionWrapped 12 executed @ 763066250
763066250: 0.wrapped_function_event: EventFunctionWrapped 11 scheduled @ 763066250
763066250: 0.wrapped_function_event: EventFunctionWrapped 11 executed @ 763066250
763066250: 0.wrapped_function_event: EventFunctionWrapped 11 scheduled @ 763326250
763312500: 1.wrapped_function_event: EventFunctionWrapped 17 executed @ 763312500
763312500: 1.wrapped_function_event: EventFunctionWrapped 18 scheduled @ 763312500
763312500: 1.wrapped_function_event: EventFunctionWrapped 17 scheduled @ 770838750
763312500: 1.wrapped_function_event: EventFunctionWrapped 18 executed @ 763312500
763312500: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 92 scheduled @ 763312500
763312500: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 92 executed @ 763312500
763312500: system.mem_ctrls: QoS Turnarounds selected state READ 
763326250: 0.wrapped_function_event: EventFunctionWrapped 11 executed @ 763326250
763326250: 0.wrapped_function_event: EventFunctionWrapped 12 scheduled @ 763326250
763326250: 0.wrapped_function_event: EventFunctionWrapped 11 scheduled @ 770838750
763326250: 0.wrapped_function_event: EventFunctionWrapped 12 executed @ 763326250
763326250: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 92 scheduled @ 763326250
763326250: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 92 executed @ 763326250
763326250: system.mem_ctrls: QoS Turnarounds selected state READ 
770838750: 0.wrapped_function_event: EventFunctionWrapped 11 executed @ 770838750
770838750: 0.wrapped_function_event: EventFunctionWrapped 12 scheduled @ 770838750
770838750: 0.wrapped_function_event: EventFunctionWrapped 12 executed @ 770838750
770838750: 0.wrapped_function_event: EventFunctionWrapped 11 scheduled @ 770838750
770838750: 0.wrapped_function_event: EventFunctionWrapped 11 executed @ 770838750
770838750: 0.wrapped_function_event: EventFunctionWrapped 11 scheduled @ 771098750
770838750: 1.wrapped_function_event: EventFunctionWrapped 17 executed @ 770838750
770838750: 1.wrapped_function_event: EventFunctionWrapped 18 scheduled @ 770838750
770838750: 1.wrapped_function_event: EventFunctionWrapped 18 executed @ 770838750
770838750: 1.wrapped_function_event: EventFunctionWrapped 17 scheduled @ 770838750
770838750: 1.wrapped_function_event: EventFunctionWrapped 17 executed @ 770838750
770838750: 1.wrapped_function_event: EventFunctionWrapped 17 scheduled @ 771098750
771098750: 1.wrapped_function_event: EventFunctionWrapped 17 executed @ 771098750
771098750: 1.wrapped_function_event: EventFunctionWrapped 18 scheduled @ 771098750
771098750: 1.wrapped_function_event: EventFunctionWrapped 17 scheduled @ 778625000
771098750: 1.wrapped_function_event: EventFunctionWrapped 18 executed @ 771098750
771098750: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 92 scheduled @ 771098750
771098750: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 92 executed @ 771098750
771098750: system.mem_ctrls: QoS Turnarounds selected state READ 

Expected behavior
The prefetch sends a request to DRAM and the CPU continues execution. As an experiment, I tried modifying timing.cc to change if (_status == BaseSimpleCPU::Running) to if (_status == BaseSimpleCPU::Running || curStaticInst->isPrefetch()), but that didn't fix the problem.

Host Operating System
Debian

Host ISA
ARM

Compiler used
System GCC (12).

arichardson added a commit to arichardson/gem5 that referenced this issue May 15, 2024
Currently, a prefetch instruction results in an infinite sleep: the CPU never wakes up because it is waiting for a dcache response that is never delivered for prefetches. Work around this by overriding the _status field for prefetches in completeIfetch().

This allows my test workload from gem5#1139 to continue running beyond strlen() when using TimingSimpleCPU (but not other CPU models such as MinorCPU).

Partially fixes: gem5#1139

Change-Id: Ic44bdb87f4099b11a7f9c6c99768a12fbef5842e
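A heavily hedged sketch of that workaround's shape (not the actual patch; it only reuses the identifiers quoted in this issue, and gem5's real code will differ):

// Sketch only, not gem5's actual source: after decoding a software
// prefetch, force the CPU out of its wait state, since no dcache
// response will ever arrive for the prefetch request.
void
TimingSimpleCPU::completeIfetch(PacketPtr pkt)
{
    // ... existing decode / initiateAcc logic ...
    if (curStaticInst && curStaticInst->isPrefetch() &&
        _status != BaseSimpleCPU::Running) {
        _status = BaseSimpleCPU::Running; // don't sleep waiting for the dcache
        advanceInst(NoFault);             // continue with the next instruction
    }
}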
arichardson commented Jun 6, 2024

I have reduced my test case to the following C code, which I would like to add to the gem5 tests, but I'm not quite sure how to do that.

//
// This test checks that software prefetches do not result in an infinite
// loop/panic. This includes prefetches with caches/MMU enabled+disabled.
//

#define __STRING(x) #x           /* stringify without expanding x */
#define __XSTRING(x) __STRING(x) /* expand x, then stringify */

typedef __UINT32_TYPE__ uint32_t;
typedef __UINTPTR_TYPE__ uintptr_t;

static char stack_memory[4096];

__attribute__((naked, used)) void _start(void) {
  // Set up the stack for C code and jump to test_code. The stack grows
  // down, so sp must point at the top of the buffer, not its base.
  __asm__("mov sp, %0" : : "r"(stack_memory + sizeof(stack_memory)));
  __asm__("b test_code");
}

#define MMU_TYPE_1MB (0x2 << 0)
#define MMU_RW (0x3 << 10)
#define MMU_NORMAL_CACHEABLE ((0x0 << 12) | (0x3 << 2))
#define MMU_MAPPING_FLAGS (MMU_TYPE_1MB | MMU_RW | MMU_NORMAL_CACHEABLE)
// We need 4096 1MB mappings to cover the full 32-bit address space.
#define MMU_MAPPING_COUNT 4096
extern uint32_t identity_page_table[MMU_MAPPING_COUNT];
// clang-format off
__asm__(".section .rodata\n"
        ".global identity_page_table\n"
        ".balign 16384\n"
        "identity_page_table:\n"
        ".set _i, 0\n" //
        ".rept " __XSTRING(MMU_MAPPING_COUNT) "\n"
        "  .4byte (_i << 20) |" __XSTRING(MMU_MAPPING_FLAGS) "\n"
        "  .set _i, _i + 1\n"
        ".endr\n"
        ".size identity_page_table, " __XSTRING(MMU_MAPPING_COUNT * 4) "\n"
        ".text");
// clang-format on
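// Worked example (annotation, not part of the original test):
// MMU_MAPPING_FLAGS evaluates to 0x2 | (0x3 << 10) | (0x3 << 2) == 0xC0E,
// so the entry at index 1 is (1 << 20) | 0xC0E == 0x00100C0E: a read/write,
// write-back-cacheable 1MB section mapping VA 0x00100000 to PA 0x00100000.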

#define SCTLR_MMU (1 << 0)
#define SCTLR_DATA_L2 (1 << 2)
#define SCTLR_BRANCH_PRED (1 << 11)
#define SCTLR_ICACHE (1 << 12)

static uint32_t read_sctlr(void) {
  uint32_t result;
  __asm__("mrc p15, 0, %0, c1, c0, 0" : "=r"(result));
  return result;
}

static void write_sctlr(uint32_t value) {
  __asm__("mcr p15, 0, r0, c1, c0, 0\n"
          "isb\n" ::"r"(value)
          : "memory");
}

static void enable_mmu(void) {
  // We have to set up an identity map and enable the MMU for caches.
  // Additionally, all page table entries are set to Domain 0, so set up DACR
  // so that Domain zero has permission checks enabled rather than "deny all".
  __asm__("mov r0, #1\n"
          "mcr p15, 0, r0, c3, c0, 0\n" // Set DACR Domain 0 permissions checked
          "mcr p15, 0, %[TTBR], c2, c0, 0\n" // Write TTBR
          "mov r0, #0\n"
          // Note: we assume Data+L2 cache has been invalidated by reset.
          "mcr p15, 0, r0, c7, c5, 0\n" // ICIALLU: invalidate instruction cache
          "mcr p15, 0, r0, c8, c7, 0\n" // TLBIALL: invalidate TLB
          "mcr p15, 0, r0, c7, c5, 6\n" // BPIALL: invalidate branch predictor
          "isb\n" ::[TTBR] "r"(identity_page_table)
          : "r0");
  write_sctlr(read_sctlr() | SCTLR_MMU);
}

static void enable_caches(void) {
  write_sctlr(read_sctlr() | SCTLR_ICACHE | SCTLR_BRANCH_PRED | SCTLR_DATA_L2);
}

uintptr_t call_semihosting(uintptr_t operation, uintptr_t arg) {
  uintptr_t result;
  __asm__ volatile("mov r0, %1\n"
                   "mov r1, %2\n"
                   "svc #0x123456\n"
                   "mov %0, r0\n"
                   : "=r"(result)
                   : "r"(operation), "r"(arg)
                   : "memory", "r0", "r1");
  return result;
}

// https://github.com/ARM-software/abi-aa/blob/main/semihosting/semihosting.rst
#define SEMIHOSTING_WRITE0 0x4
#define SEMIHOSTING_EXIT 0x18
#define ADP_Stopped_ApplicationExit 0x20026

struct AtLeastOneCacheline {
  char data[64];
};

#define test_prefetch(addr, msg)                                               \
  do {                                                                         \
    call_semihosting(SEMIHOSTING_WRITE0,                                       \
                     (uintptr_t)("Prefetch with " msg "... "));                \
    __asm__("pld [%0]\n" ::"r"(addr));                                         \
    call_semihosting(SEMIHOSTING_WRITE0, (uintptr_t)"OK\n");                   \
  } while (0)

__attribute__((used, noreturn)) void test_code(void) {
  call_semihosting(SEMIHOSTING_WRITE0, (uintptr_t)"Starting test...\n");
  static struct AtLeastOneCacheline prefetch_test[4];
  test_prefetch(&prefetch_test[0], "MMU off, caches off");
  enable_mmu();
  test_prefetch(&prefetch_test[1], "MMU on, caches off");
  enable_caches();
  test_prefetch(&prefetch_test[2], "MMU on, caches on");
  write_sctlr(read_sctlr() & ~SCTLR_MMU);
  // Finally, test MMU off with the cache enable bits still set.
  // The CPU treats this as dcache-disabled, since the data cache requires
  // the MMU to be enabled.
  test_prefetch(&prefetch_test[3], "MMU off, caches still on");
  call_semihosting(SEMIHOSTING_WRITE0, (uintptr_t)"Test complete!\n");
  call_semihosting(SEMIHOSTING_EXIT, ADP_Stopped_ApplicationExit);
  __builtin_trap();
  __builtin_unreachable();
}

It can be compiled using arm-none-eabi-gcc -nostartfiles -nodefaultlibs -O2 -mcpu=cortex-a7 test.c -ggdb -o test.elf.
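Assuming the repro config above is saved as configs/example/repro.py (the path used in step 4), the test can then be run with something like:

build/ALL/gem5.debug configs/example/repro.py --kernel test.elf

Once the bug is fixed, the semihosting WRITE0 calls should produce output along these lines:

Starting test...
Prefetch with MMU off, caches off... OK
Prefetch with MMU on, caches off... OK
Prefetch with MMU on, caches on... OK
Prefetch with MMU off, caches still on... OK
Test complete!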

I think it would be nice to add this test and the reproducer Python config to the test suite to ensure that software prefetches work for all CPU models.

arichardson added a commit to arichardson/gem5 that referenced this issue Jun 6, 2024
Currently, a prefetch instruction results in an infinite sleep: the CPU never wakes up because it is waiting for a dcache response that is never delivered for prefetches. Avoid this problem by rejecting uncacheable prefetches in the MMU translation logic.

This allows my test workload from gem5#1139 to continue running beyond strlen(). Tested with the Atomic, Timing, Minor, and O3 CPU models. See gem5#1139 for the test case that was used.

Fixes: gem5#1139

Change-Id: Ic44bdb87f4099b11a7f9c6c99768a12fbef5842e