That’s exactly what I meant. The 256-byte stage2 does nothing other than chainload stage3.
Stage3 by default boots into the FLASH app, and only loads the UI, FAT driver etc. when entering an alt mode.
I asked an AI to rewrite stage2 so that it loads a 4KB stage3 into RAM:
// ----------------------------------------------------------------------------
// Second stage boot code - microSD variant
// Copyright (c) 2025 Custom Implementation
// SPDX-License-Identifier: BSD-3-Clause
//
// Device: microSD card via SPI
//
// Description: Configures SPI interface to communicate with microSD card,
// loads stage3 bootloader from fixed offset (1MB) into SRAM,
// then jumps to stage3 for further initialization.
//
// Details: * Initialize SPI interface for microSD communication
// * Send CMD0, CMD8, ACMD41 to initialize SD card
// * Read stage3 from sector 2048 (1MB offset) into SRAM
// * Jump to stage3 in SRAM
//
// Building: * This code must be position-independent, and use stack only
// * The code will be padded to a size of 256 bytes, including a
// 4-byte checksum. Therefore code size cannot exceed 252 bytes.
// ----------------------------------------------------------------------------
#include "pico/asm_helper.S"
#include "hardware/regs/addressmap.h"
#include "hardware/regs/clocks.h"
#include "hardware/regs/io_bank0.h"
#include "hardware/regs/pads_bank0.h"
#include "hardware/regs/resets.h"
#include "hardware/regs/sio.h"
#include "hardware/regs/spi.h"
// ----------------------------------------------------------------------------
// Config section
// ----------------------------------------------------------------------------
// SPI pins for microSD (using SPI0)
#define SD_CLK_PIN 18
#define SD_MOSI_PIN 19
#define SD_MISO_PIN 16
#define SD_CS_PIN 17
// Desired SCK divisors, relative to the clock feeding SPI0.
// The PL022 prescaler register (SSPCPSR) only holds even values 2..254, so a
// divisor such as 312 cannot be written there on its own; the remainder has
// to be placed in the SCR field of SSPCR0 (SCK = clk / (CPSDVSR * (1 + SCR))).
// NOTE(review): the frequencies quoted below assume a 125MHz peripheral clock.
// Confirm what the clock actually is when stage2 runs, before the PLLs are
// configured, and re-derive these values if it differs.
#define SPI_CLKDIV_INIT 312 // 125MHz / 312 ≈ 400kHz
#define SPI_CLKDIV_FAST 4 // 125MHz / 4 = 31.25MHz
// Stage3 parameters
#define STAGE3_SECTOR 2048 // 1MB offset (512 * 2048)
#define STAGE3_SIZE 4096 // 4KB stage3 size (eight 512-byte sectors)
#define STAGE3_LOAD_ADDR 0x20010000 // Load into SRAM
// SD commands (SPI-mode command byte = 0x40 | command index)
#define CMD0 0x40 // GO_IDLE_STATE
#define CMD8 0x48 // SEND_IF_COND
#define CMD55 0x77 // APP_CMD (prefix for ACMDs)
#define ACMD41 0x69 // SD_SEND_OP_COND
#define CMD17 0x51 // READ_SINGLE_BLOCK
// ----------------------------------------------------------------------------
// Start of 2nd Stage Boot Code
// ----------------------------------------------------------------------------
// ----------------------------------------------------------------------------
// Constants private to _stage2_boot
// ----------------------------------------------------------------------------
#define GPIO_FUNCSEL_SPI 1 // IO_BANK0 FUNCSEL value that routes a pin to SPI
#define GPIO_FUNCSEL_SIO 5 // IO_BANK0 FUNCSEL value for software (SIO) control
#define SD_CMD58 0x7A // READ_OCR (0x40 | 58)
#define SD_BLOCK_SHIFT 9 // log2 of the 512-byte SD block size
#define SPI_PRESCALE 2 // value written to SSPCPSR (smallest legal prescaler)
// SSPCR0 value for 8-bit frames, SPI mode 0, SCK = clk_peri / div.
// div must be even and no larger than 512 because SCR is an 8-bit field.
#define SPI_CR0_FOR_DIV(div) ((((div) / SPI_PRESCALE - 1) << SPI_SSPCR0_SCR_LSB) | (7 << SPI_SSPCR0_DSS_LSB))
// The pin CTRL registers are addressed relative to SD_MISO_PIN with an
// immediate offset; Thumb-1 "str" only reaches 124 bytes, i.e. 15 pins upward.
#define SD_PIN_REACHABLE(p) (((p) >= SD_MISO_PIN) && (((p) - SD_MISO_PIN) <= 15))
#if !(SD_PIN_REACHABLE(SD_CLK_PIN) && SD_PIN_REACHABLE(SD_MOSI_PIN) && SD_PIN_REACHABLE(SD_CS_PIN))
#error SD pins must lie within 15 GPIOs above SD_MISO_PIN
#endif

pico_default_asm_setup
.section .text

// ----------------------------------------------------------------------------
// _stage2_boot
//   Brings up SPI0, initialises the SD card in SPI mode, copies stage3
//   (STAGE3_SIZE bytes starting at block STAGE3_SECTOR) to STAGE3_LOAD_ADDR
//   and branches to it.
//
//   In:   lr = caller's return address. It is pushed and deliberately never
//         popped here, so stage3 finds it as the word at [sp].
//   Out:  does not return.
//   Register roles (r0-r3 are scratch; the SPI/SD helpers may clobber them):
//     r4 = SRAM write pointer        r5 = current SD block number
//     r6 = loop counter              r7 = SPI0 base, later the shift that
//                                         turns a block number into a CMD17
//                                         argument (0 or SD_BLOCK_SHIFT)
//   There is no error handling: a missing or failing card makes the routine
//   spin forever in acmd41_loop or wait_data_token.
//   NOTE(review): measure the assembled size; it is unlikely to fit the
//   252-byte stage2 budget as written.
// ----------------------------------------------------------------------------
regular_func _stage2_boot
    push {lr}

    // SPI0 is clocked from clk_peri. CLK_PERI_CTRL.ENABLE is documented to
    // reset to 0, so switch it on (AUXSRC field left at 0).
    // NOTE(review): confirm the state the bootrom leaves clk_peri in on this
    // boot path; the write is harmless if it is already enabled from clk_sys.
    ldr r3, =(CLOCKS_BASE + CLOCKS_CLK_PERI_CTRL_OFFSET)
    ldr r0, =CLOCKS_CLK_PERI_CTRL_ENABLE_BITS
    str r0, [r3]

    // Take SPI0, IO_BANK0 and PADS_BANK0 out of reset
    ldr r3, =RESETS_BASE
    ldr r0, =(RESETS_RESET_SPI0_BITS | RESETS_RESET_IO_BANK0_BITS | RESETS_RESET_PADS_BANK0_BITS)
    ldr r1, [r3, #RESETS_RESET_OFFSET]
    bics r1, r0
    str r1, [r3, #RESETS_RESET_OFFSET]
    // Wait until every requested block reports RESET_DONE
1:  ldr r1, [r3, #RESETS_RESET_DONE_OFFSET]
    mvns r1, r1                 // r1 = blocks that are NOT done yet
    tst r1, r0
    bne 1b

    // Drive CS high from SIO before the pin is handed to SIO, so the card
    // never sees a low glitch on chip select.
    ldr r1, =SIO_BASE
    movs r0, #1
    lsls r0, r0, #SD_CS_PIN     // 1 << pin does not fit a movs immediate
    str r0, [r1, #SIO_GPIO_OUT_SET_OFFSET]
    str r0, [r1, #SIO_GPIO_OE_SET_OFFSET]

    // Pin muxing: SCK/MOSI/MISO to SPI0, CS to SIO
    ldr r3, =(IO_BANK0_BASE + IO_BANK0_GPIO0_CTRL_OFFSET + SD_MISO_PIN * 8)
    movs r0, #GPIO_FUNCSEL_SPI
    str r0, [r3, #0]
    str r0, [r3, #((SD_CLK_PIN - SD_MISO_PIN) * 8)]
    str r0, [r3, #((SD_MOSI_PIN - SD_MISO_PIN) * 8)]
    movs r0, #GPIO_FUNCSEL_SIO
    str r0, [r3, #((SD_CS_PIN - SD_MISO_PIN) * 8)]

    // Configure SPI0: disabled while programming, 8-bit, mode 0, slow clock
    ldr r7, =SPI0_BASE
    movs r0, #0
    str r0, [r7, #SPI_SSPCR1_OFFSET]
    movs r0, #SPI_PRESCALE
    str r0, [r7, #SPI_SSPCPSR_OFFSET]
    ldr r0, =SPI_CR0_FOR_DIV(SPI_CLKDIV_INIT)
    str r0, [r7, #SPI_SSPCR0_OFFSET]
    movs r0, #SPI_SSPCR1_SSE_BITS
    str r0, [r7, #SPI_SSPCR1_OFFSET]

    // Send 80 clock cycles with CS high and MOSI high (SD card power-up)
    movs r6, #10
init_clocks:
    bl spi_read_byte            // clocks out 0xFF
    subs r6, #1
    bne init_clocks

    // Assert CS low; it stays low until stage3 has been loaded
    ldr r1, =SIO_BASE
    movs r0, #1
    lsls r0, r0, #SD_CS_PIN
    str r0, [r1, #SIO_GPIO_OUT_CLR_OFFSET]

    // CMD0 (GO_IDLE_STATE)
    movs r0, #CMD0
    movs r1, #0
    bl sd_command

    // CMD8 (SEND_IF_COND): 2.7-3.6V, check pattern 0xAA
    movs r0, #CMD8
    ldr r1, =0x1AA
    bl sd_command
    // CMD8 answers with R7: drain the four bytes that follow R1, otherwise
    // they overlap the next command.
    movs r6, #4
1:  bl spi_read_byte
    subs r6, #1
    bne 1b

    // ACMD41 (SD_SEND_OP_COND) with HCS set - repeat until the card leaves idle
acmd41_loop:
    movs r0, #CMD55
    movs r1, #0
    bl sd_command
    movs r0, #ACMD41
    movs r1, #1
    lsls r1, r1, #30            // HCS: host supports high-capacity cards
    bl sd_command
    // R1 is 0x01 while initialisation is in progress and 0x00 once ready
    // (bit 7 of R1 is always 0, so it cannot be used as the ready flag).
    cmp r0, #0
    bne acmd41_loop

    // Switch to the fast clock (SPI disabled while SSPCR0 is rewritten)
    movs r0, #0
    str r0, [r7, #SPI_SSPCR1_OFFSET]
    ldr r0, =SPI_CR0_FOR_DIV(SPI_CLKDIV_FAST)
    str r0, [r7, #SPI_SSPCR0_OFFSET]
    movs r0, #SPI_SSPCR1_SSE_BITS
    str r0, [r7, #SPI_SSPCR1_OFFSET]

    // CMD58 (READ_OCR): CCS (OCR bit 30) says whether CMD17 takes a block
    // number (SDHC/SDXC) or a byte address (SDSC).
    movs r0, #SD_CMD58
    movs r1, #0
    bl sd_command
    bl spi_read_byte            // OCR[31:24]; CCS is bit 6 of this byte
    movs r7, #SD_BLOCK_SHIFT    // default: byte addressing
    lsls r0, r0, #25            // move CCS into bit 31 -> N flag
    bpl 2f
    movs r7, #0                 // CCS set: block addressing
2:  movs r6, #3                 // drain OCR[23:0]
3:  bl spi_read_byte
    subs r6, #1
    bne 3b

    // Read stage3, one 512-byte block at a time
    ldr r4, =STAGE3_LOAD_ADDR
    ldr r5, =STAGE3_SECTOR
read_stage3:
    // CMD17 (READ_SINGLE_BLOCK)
    movs r0, #CMD17
    mov r1, r5
    lsls r1, r1, r7             // block number -> address form the card expects
    bl sd_command
    // Wait for data token (0xFE)
wait_data_token:
    bl spi_read_byte
    cmp r0, #0xFE
    bne wait_data_token
    // Read 512 bytes
    movs r6, #1
    lsls r6, r6, #SD_BLOCK_SHIFT
read_sector_loop:
    bl spi_read_byte
    strb r0, [r4]
    adds r4, #1
    subs r6, #1
    bne read_sector_loop
    // Read CRC (2 bytes, ignore)
    bl spi_read_byte
    bl spi_read_byte
    // Continue until the block after the last stage3 block is reached
    adds r5, #1
    ldr r0, =(STAGE3_SECTOR + STAGE3_SIZE / 512)
    cmp r5, r0
    blo read_stage3

    // Deassert CS
    ldr r1, =SIO_BASE
    movs r0, #1
    lsls r0, r0, #SD_CS_PIN
    str r0, [r1, #SIO_GPIO_OUT_SET_OFFSET]

    // Jump to stage3. Bit 0 must be set: bx to an even address would leave
    // Thumb state, which faults on Cortex-M0+.
    // NOTE(review): assumes the stage3 image starts with its entry code.
    ldr r0, =(STAGE3_LOAD_ADDR | 1)
    bx r0

// ----------------------------------------------------------------------------
// sd_command (private): send one complete SD command frame and fetch R1.
//   In:   r0 = command byte (0x40 | index), r1 = 32-bit argument
//   Out:  r0 = value returned by get_sd_response
//   Clobbers: r1-r3, flags
// ----------------------------------------------------------------------------
sd_command:
    push {r1, lr}               // the SPI helper may overwrite r1
    bl send_sd_cmd
    pop {r1}
    movs r2, #0
    bl send_sd_args
    bl get_sd_response
    pop {pc}
// Helper functions (ultra-compact versions due to space constraints)
// spi_read_byte: clock out 0xFF and return the byte received meanwhile.
//   Out:  r0 = received byte
//   Preserves r1-r3. Falls through into spi_write_byte.
spi_read_byte:
    movs r0, #0xFF
// spi_write_byte: one full-duplex SPI0 transfer.
//   In:   r0 = byte to transmit. Callers pass values with bits above 7 set;
//         with 8-bit frames the PL022 is documented to ignore those bits.
//   Out:  r0 = value read back from the data register after the transfer
//   Preserves r1-r3: callers keep loop counters and arguments in them, so
//   only r0 and flags may be changed here.
spi_write_byte:
    push {r1}
    ldr r1, =SPI0_BASE
    str r0, [r1, #SPI_SSPDR_OFFSET]
1:  ldr r0, [r1, #SPI_SSPSR_OFFSET]
    lsrs r0, r0, #(SPI_SSPSR_BSY_LSB + 1)   // BSY bit -> carry flag
    bcs 1b                                  // still shifting: keep polling
    ldr r0, [r1, #SPI_SSPDR_OFFSET]
    pop {r1}
    bx lr
// send_sd_cmd: transmit the command byte that opens an SD command frame.
//   In:   r0 = command byte
//   Out:  r0 = whatever spi_write_byte returns for that transfer
//   Implemented as a tail call: spi_write_byte returns straight to our caller.
send_sd_cmd:
    b spi_write_byte
// send_sd_args: transmit the 32-bit argument (MSB first) and the CRC byte that
// complete an SD command frame.
//   In:   r1 = 32-bit command argument
//         r2 = ignored (callers pass 0)
//   Out:  r0 = byte received during the CRC transfer
//   Preserves r4. The argument is parked in r4 for the duration because the
//   SPI helper is free to overwrite r0-r3 between bytes.
//
//   CRC byte: only CMD0 and CMD8 have their CRC checked in SPI mode (CRC
//   checking is off by default for everything else). The frame does not carry
//   the command index down to this routine, so the CRC is chosen from the
//   argument: 0 -> 0x95 (valid for CMD0), anything else -> 0x87 (valid for
//   CMD8 with argument 0x000001AA).
send_sd_args:
    push {r4, lr}
    mov r4, r1
    lsrs r0, r4, #24
    bl spi_write_byte
    lsrs r0, r4, #16
    bl spi_write_byte
    lsrs r0, r4, #8
    bl spi_write_byte
    mov r0, r4
    bl spi_write_byte
    movs r0, #0x95              // CRC7 + stop bit for CMD0(0)
    cmp r4, #0
    beq 1f
    movs r0, #0x87              // CRC7 + stop bit for CMD8(0x1AA)
1:  bl spi_write_byte
    pop {r4, pc}
// get_sd_response: poll the card for the first response byte of a command.
//   Reads up to 8 bytes and stops at the first one that is not 0xFF.
//   Out:  r0 = that byte, or 0xFF if all 8 reads returned 0xFF
//   Preserves r4. The retry counter lives in r4 rather than r2 because the
//   SPI helper may overwrite r0-r3 on every read.
get_sd_response:
    push {r4, lr}
    movs r4, #8
1:  bl spi_read_byte
    cmp r0, #0xFF
    bne 2f
    subs r4, #1
    bne 1b
2:  pop {r4, pc}
// Literal pool. .ltorg makes the assembler emit here the constants requested
// by the "ldr rX, =value" pseudo-instructions above; the exported label marks
// where the pool starts.
.global literals
literals:
.ltorg
// .end stops assembly: the assembler does not process anything after it.
.end
AI says it will be 252 bytes.
`lr` is pushed on the stack at the beginning, so stage3 will have a context similar to that of the current stage2. stage3 is then responsible for FLASH and I2C (for the keyboard) initialization, and jumps to one of the following:
- FLASH app
- An “SD-RAM” app, which is programmed at a fixed offset on the SD card
- This has huge potential because we can chainload a lot of different bootloaders and simple apps.
- Can also load “stage4” (which is the full SD-boot)
Something like this: