Created
March 28, 2020 07:41
-
-
Save matthijskooijman/e636e1f962125e58a80c583260a96c95 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Bit-banged SPI transfer of one byte, most significant bit first.
//
// For each of the 8 bits: MOSI is driven to the next bit of c, SCK is
// set after a delay, MISO is sampled right after SCK was set, and SCK
// is cleared again after a second delay. The 8 sampled MISO bits are
// returned, with the first sampled bit in the MSB.
//
// The time between SCK changes is derived from the global isp_delay
// (presumably a CPU cycle count set elsewhere, e.g. by setIspSpeed --
// confirm against its definition).
//
// Put this function in RAM to prevent any unpredictable latency from
// loading instructions from flash.
byte BB_SPITransfer (byte c) __attribute__((section(".ramfunc")));
byte BB_SPITransfer (byte c)
{
  // Number of cycles between writes to SCK (excluding one SCK write
  // but including the other). These counts do not include the delay
  // loops. The movs before the delay loop is also not counted,
  // since it compensates for the missing cycle on the last branch
  // in the delay loop (when it is *not* taken). Note that the PORT
  // I/O registers are on the "Single Cycle IOBUS" on the SAMD21, so
  // they can be accessed in 1 cycle rather than 2 for other
  // peripherals:
  // https://microchipdeveloper.com/32arm:samd21-iobus-overview
  // Cycle counts come from the Instruction Set Summary in the Cortex
  // M0+ Technical Reference Manual:
  // http://infocenter.arm.com/help/topic/com.arm.doc.ddi0484c/CHDCICDF.html
  // For a reference on the assembler instructions used below, see the
  // Instruction Set chapter in the Cortex M0+ Devices Generic User
  // Guide:
  // http://infocenter.arm.com/help/topic/com.arm.doc.dui0662b/BABIHJGA.html
  const unsigned LOOP_OVERHEAD = 9; // From SCK change to SCK change
  const unsigned CYCLES_PER_LOOP = 4; // Number of cycles in each delay loop

  // TODO: In theory this should go up to about 1.8Mhz (2x(9+4)=26
  // cycles per loop, so 48Mhz / 26 = 1.8Mhz), and be perfectly
  // balanced, but in practice about 1.2Mhz was observed.
  // NOTE(review): the single-cycle access presumably only applies
  // when the registers are reached through the IOBUS alias of the
  // PORT peripheral rather than through the regular peripheral
  // address that PORT points at -- confirm against the datasheet.

  // Make sure that the loop counts never become less than 1 (a count
  // of 0 would wrap around in the subs below and delay for 2^32
  // iterations).
  const unsigned MIN_CYCLES = CYCLES_PER_LOOP + LOOP_OVERHEAD;
  unsigned delay_cycles = max(isp_delay, MIN_CYCLES);

  // Calculate the number of delay loop counts
  // TODO: Maybe do this calculation in setIspSpeed already?
  unsigned delay_loop_count = (delay_cycles - LOOP_OVERHEAD) / CYCLES_PER_LOOP;

  // Passing the offsets within the PORT register as literals allows
  // encoding these offsets efficiently in the ldr/str instructions as
  // long as they are <= 124 (limitation of ldr/str instruction). This
  // is more efficient than loading the absolute values of all these.
  const unsigned MOSI_CLR_OFFSET = (unsigned)( (char*)(&PORT->Group[BB_MOSI_PORT].OUTCLR.reg) - (char*)(PORT) );
  const unsigned MOSI_SET_OFFSET = (unsigned)( (char*)(&PORT->Group[BB_MOSI_PORT].OUTSET.reg) - (char*)(PORT) );
  const unsigned SCK_CLR_OFFSET = (unsigned)( (char*)(&PORT->Group[BB_SCK_PORT].OUTCLR.reg) - (char*)(PORT) );
  const unsigned SCK_SET_OFFSET = (unsigned)( (char*)(&PORT->Group[BB_SCK_PORT].OUTSET.reg) - (char*)(PORT) );
  const unsigned MISO_IN_OFFSET = (unsigned)( (char*)(&PORT->Group[BB_MISO_PORT].IN.reg) - (char*)(PORT) );

  // Scratch register for the asm below (delay counter and MISO read).
  unsigned tmp;
  // Number of bits still to transfer. This is decremented by the asm,
  // so it must be passed as a read-write operand, not as an input.
  unsigned bits_left = 8;

  asm volatile (
    // Use the unified ARM/Thumb syntax, which seems to be more
    // universally used and corresponds to what objdump outputs
    // See https://sourceware.org/binutils/docs/as/ARM_002dInstruction_002dSet.html
    ".syntax unified\n\t"
    // Shift c left so that bit 7 (the MSB of the byte) is now at bit
    // 31 (which can be easily tested when shifting it out).
    "lsls %[c], %[c], #24\n\t"
    // Start of 8-bit loop
    "0:\n\t"
    // Shift c by one, then check the carry bit shifted out
    "lsls %[c], %[c], #1 /* 1 cycle */\n\t"
    "bcs set_mosi%= /* 2 if taken, 1 otherwise */\n\t"
    "str %[mosi_bit_mask], [%[port], %[mosi_clr]] /* 1 cycle */\n\t"
    "b done_mosi%= /* 2 cycles */\n\t"
    "set_mosi%=:\n\t"
    // To balance cycles for both branches
    "nop /* 1 cycle */\n\t"
    "str %[mosi_bit_mask], [%[port], %[mosi_set]] /* 1 cycle */\n\t"
    "done_mosi%=:\n\t"
    // Then, delay for count * 4 cycles
    "movs %[tmp], %[delay_loop_count] /* 1 cycle */\n\t"
    "1:\n\t"
    " subs %[tmp], %[tmp], #1 /* 1 cycle */\n\t"
    // TODO: This cmp can be removed (subs already sets flags), but
    // then we have 3 cycles per loop, which is of course more work to
    // calculate the cycle count...
    " cmp %[tmp], #0 /* 1 cycle */\n\t"
    " bne 1b /* 2 if taken, 1 otherwise */\n\t"
    // Set SCK
    "str %[sck_bit_mask], [%[port], %[sck_set]] /* 1 cycle */\n\t"
    // Read MISO
    "ldr %[tmp], [%[port], %[miso_in]] /* 1 cycle */\n\t"
    // Move bit read to LSB and or into c
    "lsls %[tmp], %[miso_bit_inv] /* 1 cycle */\n\t"
    "lsrs %[tmp], #31 /* 1 cycle */\n\t"
    "orrs %[c], %[tmp] /* 1 cycle */\n\t"
    // Then, delay for count * 4 cycles. The cmp is redundant for the
    // flags, but keeps this loop at the same 4 cycles per iteration
    // as the first delay loop (and as CYCLES_PER_LOOP), so that both
    // SCK phases have the same length.
    "movs %[tmp], %[delay_loop_count] /* 1 cycle */\n\t"
    "2:\n\t"
    " subs %[tmp], %[tmp], #1 /* 1 cycle */\n\t"
    " cmp %[tmp], #0 /* 1 cycle */\n\t"
    " bne 2b /* 2 if taken, 1 otherwise */\n\t"
    // Add nops to balance the number of cycles from SCK to SCK
    "nop\n\tnop\n\tnop\n\tnop /* 4 cycles */\n\t"
    // Clear SCK
    "str %[sck_bit_mask], [%[port], %[sck_clr]] /* 1 cycle */\n\t"
    // Loop
    "subs %[loop], %[loop], #1 /* 1 cycle */\n\t"
    "bne 0b /* 2 if taken, 1 otherwise */\n\t"
    // Revert to the default "divided" syntax, which is apparently
    // generated by gcc (without this, the code generated below will
    // break).
    ".syntax divided\n\t"
    // Below, map C-level variables and values into assembler
    // registers and immediate values. See
    // http://www.ethernut.de/en/documents/arm-inline-asm.html
    : // Outputs:
      // Write-only scratch. Early-clobber (&) because it is written
      // before the last input operand has been consumed, so it must
      // not share a register with any input.
      [tmp] "=&l" (tmp),
      // This puts c in a 32-bit register for us to play with, but
      // the compiler takes care of truncating it to 8-bits
      // afterwards (because c is declared as byte).
      [c] "+l" (c),
      // Bit counter, counted down to zero by the asm.
      [loop] "+l" (bits_left)
    : // Inputs:
      [port] "l" (PORT),
      [sck_clr] "M" (SCK_CLR_OFFSET),
      [sck_set] "M" (SCK_SET_OFFSET),
      // Unsigned shift, so that a bit number of 31 is well-defined
      [sck_bit_mask] "l" (1UL << BB_SCK_BIT),
      [mosi_clr] "M" (MOSI_CLR_OFFSET),
      [mosi_set] "M" (MOSI_SET_OFFSET),
      [mosi_bit_mask] "l" (1UL << BB_MOSI_BIT),
      [miso_in] "M" (MISO_IN_OFFSET),
      [miso_bit_inv] "N" (31 - BB_MISO_BIT),
      [delay_loop_count] "l" (delay_loop_count)
    : // Clobbers: condition flags, and "memory" because the asm reads
      // and writes the PORT registers, which are not listed as
      // operands.
      "cc", "memory"
  );
  // TODO: Static assert that all offsets are <= 124
  // (since gcc has no operand class for this particular limitation).
  return c;
} // end of BB_SPITransfer
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment