diff --git a/libpolyml/arm64.cpp b/libpolyml/arm64.cpp
index c2920756..273a6fe5 100644
--- a/libpolyml/arm64.cpp
+++ b/libpolyml/arm64.cpp
@@ -1,1042 +1,1053 @@
 /*
     Machine-dependent code for ARM64
 
     Copyright David C.J. Matthews 2020-21.
 
     This library is free software; you can redistribute it and/or
     modify it under the terms of the GNU Lesser General Public
     License version 2.1 as published by the Free Software Foundation.
     
     This library is distributed in the hope that it will be useful,
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     Lesser General Public License for more details.
     
     You should have received a copy of the GNU Lesser General Public
     License along with this library; if not, write to the Free Software
     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 
 */
 
 // Currently this is just copied from the interpreted version.
 
 #ifdef HAVE_CONFIG_H
 #include "config.h"
 #elif defined(_WIN32)
 #include "winconfig.h"
 #else
 #error "No configuration file"
 #endif
 
 #ifdef HAVE_ASSERT_H
 #include <assert.h>
 #define ASSERT(x) assert(x)
 #else
 #define ASSERT(x) 0
 #endif
 
 #ifdef HAVE_STRING_H
 #include <string.h>
 #endif
 
 #include "globals.h"
 #include "machine_dep.h"
 #include "sys.h"
 #include "profiling.h"
 #include "arb.h"
 #include "processes.h"
 #include "run_time.h"
 #include "gc.h"
 #include "diagnostics.h"
 #include "polystring.h"
 #include "save_vec.h"
 #include "memmgr.h"
 #include "scanaddrs.h"
 #include "rtsentry.h"
 #include "bytecode.h"
 #include "int_opcodes.h"
 
 /*
 * ARM64 register use:
 * X0        First argument and return value
 * X1-X7     Second-eighth argument
 * X8        Indirect result (C), ML closure pointer on entry
 * X9-X15    Volatile scratch registers
 * X16-17    Intra-procedure-call (C).  Only used for special cases in ML.
 * X18       Platform register. Not used in ML.
 * X19-X23   Non-volatile (C).  Scratch registers (ML).
 * X24       Non-volatile (C).  Scratch register (ML).  Heap base in 32-in-64.
 * X25       ML Heap limit pointer
 * X26       ML assembly interface pointer.  Non-volatile (C).
 * X27       ML Heap allocation pointer.  Non-volatile (C).
 * X28       ML Stack pointer. Non-volatile (C).
 * X29       Frame pointer (C). Not used in ML
 * X30       Link register.
 * X31       Stack pointer (C).  Only used when calling C.  Also zero register.
 * 
 * Floating point registers:
 * V0        First argument and return value
 * V1-V7     Second-eighth argument
 * V8-V15    Non volatile. Not currently used in ML.
 * V16-V31   Volatile. Not currently used in ML.
 * 
 * The ML calling conventions generally follow the C ABI except that
 * all registers are volatile and X28 is used for the stack.
 */
 
 /* the amount of ML stack space to reserve for registers,
    C exception handling etc. The compiler requires us to
    reserve 2 stack-frames worth (2 * 20 words). We actually reserve
    slightly more than this.
 */
 
 // Arm64 instructions are all 32-bit values.
 // arm64Instr is a single instruction word; arm64CodePointer is a pointer to
 // instruction words and is used for the saved link register and entry point.
 typedef uint32_t arm64Instr, *arm64CodePointer;
 
 // Each function checks for space on the stack at the start.  To reduce the
 // code size it assumes there are at least 10 words on the stack and only
 // checks the exact space if it requires more than that.  For safety we
 // always make sure there are 50 words spare.
 // This value is added in wherever this file computes the stack limit or a
 // minimum stack size.
 #define OVERFLOW_STACK_SIZE 50
 
 // X26 always points at this area when executing ML code.
 // The offsets are built into the assembly code and some are built into
 // the code generator so this must not be changed without checking these.
 // Memory area shared between the C++ run-time system and the assembly code.
 // The first five entries are function addresses filled in by the
 // Arm64TaskData constructor; the remainder is the register save area and
 // the state that is exchanged when control passes between ML and the RTS.
 // Do not reorder or resize fields: see the warning above about offsets.
 typedef struct _AssemblyArgs {
 public:
     byte*           enterInterpreter;  // These are filled in with the functions.
     byte*           heapOverFlowCall;
     byte*           stackOverFlowCall;
     byte*           stackOverFlowCallEx;
     byte*           trapHandlerEntry;
     stackItem*      handlerRegister;   // Current exception handler
     stackItem*      stackLimit;        // Lower limit of stack
     stackItem       exceptionPacket;    // Set if there is an exception
     stackItem       threadId;           // My thread id.  Saves having to call into RTS for it. (stackItem so it's 64-bits)
     stackItem       registers[25];      // Save/load area for registers X0-X24 inclusive
     double          fpRegisters[8];     // Save/load area for floating point regs D0-D7
     PolyWord*       localMbottom;      // Base of memory + 1 word
     PolyWord*       localMpointer;      // X27 Allocation ptr + 1 word
     stackItem*      stackPtr;           // X28 Current stack pointer
     arm64CodePointer linkRegister;       // X30 - Link register (return address)
     arm64CodePointer entryPoint;         // PC address to return to
 
     // One of the RETURN_REASON values; HandleTrap switches on it.
     byte            returnReason;       // Reason for returning from ML - Set by assembly code.
 }  AssemblyArgs;
 
 // Per-thread state for the ARM64 implementation.  It is a TaskData and also
 // (through non-public inheritance) a ByteCodeInterpreter, so the same object
 // can run both native and interpreted code.  The interpreter is given the
 // addresses of stackPtr and stackLimit inside assemblyInterface so that both
 // execution modes share one stack.
 class Arm64TaskData: public TaskData, ByteCodeInterpreter {
 public:
     Arm64TaskData();
     ~Arm64TaskData() {}
     // Both of these are set by HandleTrap when the heap check fails.
     unsigned allocReg; // The register to take the allocated space.
     POLYUNSIGNED allocWords; // The words to allocate.
 
     // The block addressed by X26 while ML code is running.
     AssemblyArgs assemblyInterface;
     // Bit i set means assemblyInterface.registers[i] is scanned by GarbageCollect.
     uint32_t saveRegisterMask; // Registers that need to be updated by a GC.
 
     // Scan every root held by this thread: exception packet, stack, saved registers.
     virtual void GarbageCollect(ScanAddress *process);
     // Scan (and possibly update) one stack or register value.
     void ScanStackAddress(ScanAddress *process, stackItem& val, StackSpace *stack);
     virtual void EnterPolyCode(); // Start running ML
 
     virtual void SetException(poly_exn *exc) { assemblyInterface.exceptionPacket = (PolyWord)exc;  }
     // Called from another thread: forces the next stack check to trap.
     virtual void InterruptCode();
 
     // AddTimeProfileCount is used in time profiling.
     virtual bool AddTimeProfileCount(SIGNALCONTEXT *context);
 
     // Set up the registers and stack pointers of a new thread that will run "proc".
     virtual void InitStackFrame(TaskData *newTask, Handle proc);
 
     // Atomically release a mutex using hardware interlock.
     virtual bool AtomicallyReleaseMutex(PolyObject* mutexp);
 
     // Return the minimum space occupied by the stack.  Used when setting a limit.
     // N.B. This is PolyWords not native words.
     virtual uintptr_t currentStackSpace(void) const
     {
         return (this->stack->top - (PolyWord*)assemblyInterface.stackPtr) + OVERFLOW_STACK_SIZE;
     }
 
     virtual void addProfileCount(POLYUNSIGNED words) { addSynchronousCount(interpreterPc, words); }
 
     // PreRTSCall: After calling from ML to the RTS we need to save the current heap pointer
     virtual void PreRTSCall(void) { TaskData::PreRTSCall();  SaveMemRegisters(); }
     // PostRTSCall: Before returning we need to restore the heap pointer.
     // If there has been a GC in the RTS call we need to create a new heap area.
     virtual void PostRTSCall(void) { SetMemRegisters(); TaskData::PostRTSCall(); }
 
     // Relocate the live part of the stack into a new stack object, adjusting
     // any values that point into the old stack.
     virtual void CopyStackFrame(StackObject *old_stack, uintptr_t old_length, StackObject *new_stack, uintptr_t new_length);
 
     // Called before entering ML code / after leaving it, respectively.
     void SetMemRegisters();
     void SaveMemRegisters();
 
     // Dispatch on assemblyInterface.returnReason after the assembly code has trapped.
     void HandleTrap();
     // ByteCode overrides.  The interpreter and native code states need to be in sync.
     // The interpreter is only used during the initial bootstrap.
     virtual void ClearExceptionPacket() { assemblyInterface.exceptionPacket = TAGGED(0); }
     virtual PolyWord GetExceptionPacket() { return assemblyInterface.exceptionPacket; }
     virtual stackItem* GetHandlerRegister() { return assemblyInterface.handlerRegister; }
     virtual void SetHandlerRegister(stackItem* hr) { assemblyInterface.handlerRegister = hr; }
 
     // Run the byte-code interpreter until control must pass to native code.
     void Interpret();
     void EndBootStrap() { mixedCode = true; }
 
     // Protects updates of assemblyInterface.stackLimit made from InterruptCode
     // and HandleStackOverflow.
     PLock interruptLock;
 
     // Grow the stack if required, reset the stack limit and process any
     // pending asynchronous requests.
     virtual void HandleStackOverflow(uintptr_t space);
 };
 
 // The ARM64 implementation of the machine-dependent interface.  A single
 // static instance is created further down and published through the
 // global "machineDependent" pointer.
 class Arm64Dependent : public MachineDependent {
 public:
     Arm64Dependent() : mustInterpret(false) {}
 
     // Create a task data object.
     virtual TaskData* CreateTaskData(void) { return new Arm64TaskData(); }
 
     virtual void ScanConstantsWithinCode(PolyObject* addr, PolyObject* oldAddr, POLYUNSIGNED length,
         PolyWord* newConstAddr, PolyWord* oldConstAddr, POLYUNSIGNED numConsts, ScanAddress* process);
 
     // Returns MA_Interpreted while mustInterpret is set, otherwise the native architecture.
     virtual Architectures MachineArchitecture(void);
 
     // Record whether the boot file is interpreted ('I') or native ('A'); anything else is fatal.
     virtual void SetBootArchitecture(char arch, unsigned wordLength);
 
     // The ARM has separate instruction and data caches.
     virtual void FlushInstructionCache(void* p, POLYUNSIGNED bytes);
 
     // During the first bootstrap phase this is interpreted.
     bool mustInterpret;
 
 #if (! defined(POLYML32IN64))
     // Address of the constant segment from the code segment.  This is complicated because
     // some OSs require the code to position-independent which means the code can only
     // contain relative offsets.  This isn't a problem for 32-in-64 because the code is
     // copied before it is executed.
 
     // Set the address of the constant area.  If this is within the code segment itself we use the
     // default, negative, byte offset.  If the constant area has been split off we use a pair of
     // dummy ADRP/LDR instructions.  They aren't ever executed but allow us to use relative addressing.
     virtual void SetAddressOfConstants(PolyObject* objAddr, PolyObject* writable, POLYUNSIGNED length, PolyWord* constAddr)
     {
         if (constAddr > (PolyWord*)objAddr && constAddr < (PolyWord*)objAddr + length)
         {
             // Constants are inside this object: store the byte offset from the end
             // of the object in the last word.
             int64_t offset = (byte*)constAddr - (byte*)objAddr - length * sizeof(PolyWord);
             writable->Set(length - 1, PolyWord::FromSigned(offset));
         }
         else
         {
             PolyWord* last_word = objAddr->Offset(length - 1); // Last word in the code
             MemSpace* space = gMem.SpaceForAddress(last_word);
             uint32_t* pt = (uint32_t*)space->writeAble(last_word);
             pt[0] = 0x90000000; // Insert dummy ADRP and LDR
             pt[1] = 0xf9400000;
             ScanAddress::SetConstantValue((byte*)last_word, (PolyObject*)constAddr, PROCESS_RELOC_ARM64ADRPLDR);
         }
     }
 
     // Find the constant area for a code object: "cp" receives its address and
     // "count" the number of constants.  The last word of the code either holds a
     // negative byte offset or the ADRP/LDR pair written by SetAddressOfConstants.
     virtual void GetConstSegmentForCode(PolyObject* obj, POLYUNSIGNED obj_length, PolyWord*& cp, POLYUNSIGNED& count) const
     {
         PolyWord* last_word = obj->Offset(obj_length - 1); // Last word in the code
         if ((last_word[0].AsUnsigned() >> 56) == 0xff)
         {
             // If the high-order byte is 0xff it's a (-ve) byte offset.
             POLYSIGNED offset = last_word->AsSigned();
             cp = last_word + 1 + offset / sizeof(PolyWord);
             count = cp[-1].AsUnsigned();
         }
         else
         {
             PolyObject* addr = ScanAddress::GetConstantValue((byte*)last_word, PROCESS_RELOC_ARM64ADRPLDR, 0);
             cp = (PolyWord*)addr;
             count = addr->Length();
         }
 
     }
 #endif
 
     // Override for X86-64 because of the need for position-independent code.
 #if (defined(HOSTARCHITECTURE_X86_64) && !defined(POLYML32IN64))
     // Find the start of the constant section for a piece of code.
     virtual void GetConstSegmentForCode(PolyObject* obj, POLYUNSIGNED obj_length, PolyWord*& cp, POLYUNSIGNED& count) const
     {
         PolyWord* last_word = obj->Offset(obj_length - 1); // Last word in the code
         // Only the low order 32-bits are valid since this may be
         // set by a 32-bit relative relocation.  SetAddressOfConstants below
         // stores just those 32 bits, so they are sign-extended here.
         // (A second, full-width declaration of "offset" used to follow and
         // made this function ill-formed.)
         int32_t offset = (int32_t)last_word->AsSigned();
         cp = last_word + 1 + offset / sizeof(PolyWord);
         count = cp[-1].AsUnsigned();
     }
     // Set the address of the constant area.  The default is a relative byte offset.
     virtual void SetAddressOfConstants(PolyObject* objAddr, PolyObject* writable, POLYUNSIGNED length, PolyWord* constAddr)
     {
         int64_t offset = (byte*)constAddr - (byte*)objAddr - length * sizeof(PolyWord);
         // The offset must fit in a signed 32-bit value because only the low
         // 32 bits are stored and the reader sign-extends them.
         ASSERT(offset >= -(int64_t)0x80000000 && offset <= (int64_t)0x7fffffff);
         writable->Set(length - 1, PolyWord::FromSigned(offset & 0xffffffff));
     }
 #endif
 };
 
 // The one instance of the ARM64 machine-dependent object.  Code in this file
 // reads arm64Dependent.mustInterpret directly.
 static Arm64Dependent arm64Dependent;
 
 // Exported pointer to the machine-dependent implementation.
 MachineDependent* machineDependent = &arm64Dependent;
 
 // Report which architecture code is being compiled for: the interpreted
 // pseudo-architecture during the first phase of the bootstrap, otherwise
 // the native ARM64 variant chosen when the run-time system was built.
 Architectures Arm64Dependent::MachineArchitecture(void)
 {
 #if defined(POLYML32IN64)
     const Architectures nativeArch = MA_Arm64_32;
 #else
     const Architectures nativeArch = MA_Arm64;
 #endif
     return mustInterpret ? MA_Interpreted : nativeArch;
 }
 
 // Values for the returnReason byte. These values are put into returnReason by the assembly code
 // depending on which of the "trap" functions has been called.
 // Arm64TaskData::HandleTrap switches on these; any other value is treated as fatal there.
 enum RETURN_REASON {
     RETURN_HEAP_OVERFLOW = 1,           // Heap space check has failed.
     RETURN_STACK_OVERFLOW = 2,          // Stack space check has failed (<= 10 words).
     RETURN_STACK_OVERFLOWEX = 3,        // Stack space check has failed.  Adjusted SP is in X9.
     RETURN_ENTER_INTERPRETER = 4        // Native code has entered interpreted code.
 };
 
 extern "C" {
 
     // These are declared in the assembly code segment.
     // Arm64AsmEnterCompiledCode is passed the address of the AssemblyArgs block.
     // The addresses of the four Arm64AsmCallExtra... entries are stored in the
     // AssemblyArgs block by the Arm64TaskData constructor.
     void Arm64AsmEnterCompiledCode(void*);
     int  Arm64AsmCallExtraRETURN_ENTER_INTERPRETER(void);
     int  Arm64AsmCallExtraRETURN_HEAP_OVERFLOW(void);
     int  Arm64AsmCallExtraRETURN_STACK_OVERFLOW(void);
     int  Arm64AsmCallExtraRETURN_STACK_OVERFLOWEX(void);
 
     // This is declared here and called from the assembly code.
     // It avoids having a call to an external in the assembly code
     // which sometimes gives problems with position-independent code.
     void  Arm64TrapHandler(PolyWord threadId);
 };
 
 // Construct the per-thread state.  The byte-code interpreter shares the stack
 // pointer and stack limit held in the assembly interface block.
 Arm64TaskData::Arm64TaskData() : ByteCodeInterpreter(&assemblyInterface.stackPtr, &assemblyInterface.stackLimit),
     allocReg(0), allocWords(0), saveRegisterMask(0)
 {
     // Not in the interpreter to begin with; interpreted and native code are
     // mixed unless the whole boot image has to be interpreted.
     interpreterPc = 0;
     mixedCode = !arm64Dependent.mustInterpret;
 
     // Publish the addresses of the trap entry points to the assembly code.
     AssemblyArgs& iface = assemblyInterface;
     iface.trapHandlerEntry = (byte*)Arm64TrapHandler;
     iface.stackOverFlowCallEx = (byte*)Arm64AsmCallExtraRETURN_STACK_OVERFLOWEX;
     iface.stackOverFlowCall = (byte*)Arm64AsmCallExtraRETURN_STACK_OVERFLOW;
     iface.heapOverFlowCall = (byte*)Arm64AsmCallExtraRETURN_HEAP_OVERFLOW;
     iface.enterInterpreter = (byte*)Arm64AsmCallExtraRETURN_ENTER_INTERPRETER;
 }
 
 // Note the architecture code found in the boot file.  'I' selects the
 // interpreter, 'A' is native ARM64 and needs no action; any other code is
 // a fatal error.  wordLength is not examined here.
 void Arm64Dependent::SetBootArchitecture(char arch, unsigned wordLength)
 {
     switch (arch)
     {
     case 'I':
         mustInterpret = true;
         break;
     case 'A':
         break;
     default:
         Crash("Boot file has unexpected architecture code: %c", arch);
     }
 }
 
 // The ARM has separate instruction and data caches so we must flush
 // the cache when creating or modifying code.
 // p is the start of the modified region and bytes its length.  The
 // mechanism is chosen at compile time; building fails if none is available.
 void Arm64Dependent::FlushInstructionCache(void* p, POLYUNSIGNED bytes)
 {
 #ifdef _WIN32
     ::FlushInstructionCache(GetCurrentProcess(), p, bytes);
 #elif defined (__GNUC__)
     __builtin___clear_cache((char*)p, (char*)p + bytes);
 #elif (defined (__clang__) && defined (__APPLE__))
     // NOTE(review): clang normally defines __GNUC__ as well, in which case the
     // branch above is taken and this one is never selected - confirm.
     sys_icache_invalidate(p, bytes);
 #else
 #error "No code to flush the instruction cache."
 #endif
 }
 
 // Present every root owned by this thread to the garbage collector: the
 // base-class roots, the exception packet, the live part of the stack, the
 // saved registers selected by saveRegisterMask and the two saved code addresses.
 void Arm64TaskData::GarbageCollect(ScanAddress *process)
 {
     TaskData::GarbageCollect(process);
     ByteCodeInterpreter::GarbageCollect(process);
     // threadObject has just been updated by TaskData::GarbageCollect.
     assemblyInterface.threadId = stackItem(threadObject);
 
     // The exception packet only needs attention when it is a real pointer.
     if (assemblyInterface.exceptionPacket.w().IsDataPtr())
     {
         PolyObject* packet = assemblyInterface.exceptionPacket.w().AsObjPtr();
         packet = process->ScanObjectAddress(packet);
         assemblyInterface.exceptionPacket = (PolyWord)packet;
     }
 
     // Everything from the current stack pointer up to the top of the stack.
     if (stack != 0)
     {
         stackItem* item = assemblyInterface.stackPtr;
         while (item < (stackItem*)stack->top)
         {
             ScanStackAddress(process, *item, stack);
             item++;
         }
     }
 
     // Saved registers X0..X24: only those flagged in the mask hold ML values.
     for (unsigned reg = 0; reg < 25; reg++)
     {
         const bool isLive = ((saveRegisterMask >> reg) & 1) != 0;
         if (isLive)
             ScanStackAddress(process, assemblyInterface.registers[reg], stack);
     }
 
     // Keep the code we will return to alive.  The scan works on a temporary
     // so the saved addresses themselves are left unchanged.
     {
         stackItem codeItem;
         codeItem.codeAddr = (POLYCODEPTR)assemblyInterface.linkRegister;
         ScanStackAddress(process, codeItem, stack);
         codeItem.codeAddr = (POLYCODEPTR)assemblyInterface.entryPoint;
         ScanStackAddress(process, codeItem, stack);
     }
 }
 
 // Process a value within the stack.
 // The value may be a tagged integer (ignored), an address within a code
 // object (the containing object is scanned but the value is not changed),
 // or the address of a heap object (scanned and updated in place).  Values
 // that do not lie in any known memory space are left alone.
 void Arm64TaskData::ScanStackAddress(ScanAddress *process, stackItem& stackItem, StackSpace *stack)
 {
     // Code addresses on the ARM are always even, unlike the X86, so if it's tagged
     // it can't be an address.
     if (stackItem.w().IsTagged()) return;
 
 #ifdef POLYML32IN64
     // In 32-in-64 we can have either absolute addresses or object indexes.
     // Absolute addresses always have the top 32-bits non-zero
     if (stackItem.argValue < ((uintptr_t)1 << 32))
     {
         PolyWord val = process->ScanObjectAddress(stackItem.w().AsObjPtr());
         stackItem = val;
     }
     else
     {
         // Could be a code address or a stack address.
         MemSpace* space = gMem.SpaceForAddress(stackItem.codeAddr - 1);
         if (space == 0 || space->spaceType != ST_CODE) return;
         PolyObject* obj = gMem.FindCodeObject(stackItem.codeAddr);
         ASSERT(obj != 0);
         // Process the address of the start.  Don't update anything.
         process->ScanObjectAddress(obj);
     }
 #else
     MemSpace* space = gMem.SpaceForAddress(stackItem.codeAddr - 1);
     // The address need not be in any space we manage, e.g. a saved return
     // address that is not in ML code.  The 32-in-64 branch above makes the
     // same check; without it the tests below would dereference a null pointer.
     if (space == 0) return;
 
     if (space->spaceType == ST_CODE)
     {
         PolyObject* obj = gMem.FindCodeObject(stackItem.codeAddr);
         // Process the address of the start.  Don't update anything.
         process->ScanObjectAddress(obj);
     }
     else if (space->spaceType == ST_LOCAL)
         // Local values must be word addresses.
     {
         PolyWord val = process->ScanObjectAddress(stackItem.w().AsObjPtr());
         stackItem = val;
     }
 #endif
 }
 
 // Copy a stack
 // Copies the in-use part of old_stack (from the current stack pointer to the
 // top) to the corresponding position at the top of new_stack.  The saved
 // stack pointer and handler register are moved by the same displacement, as
 // is every copied word that points into the old stack.
 void Arm64TaskData::CopyStackFrame(StackObject *old_stack, uintptr_t old_length, StackObject *new_stack, uintptr_t new_length)
 {
 #ifdef POLYML32IN64
     // Convert the lengths to units of stackItem.
     // NOTE(review): presumably the lengths arrive in PolyWords, which are half
     // the size of a stackItem in 32-in-64 - confirm against the caller.
     old_length = old_length / 2;
     new_length = new_length / 2;
 #endif
     /* Moves a stack, updating all references within the stack */
     stackItem*old_base = (stackItem*)old_stack;
     stackItem*new_base = (stackItem*)new_stack;
     stackItem*old_top = old_base + old_length;
 
     /* Calculate the offset of the new stack from the old. If the frame is
     being extended objects in the new frame will be further up the stack
     than in the old one. */
 
     // The offset is in stackItems and is the distance between the two tops,
     // so items keep their position relative to the top of the stack.
     uintptr_t offset = new_base - old_base + new_length - old_length;
     stackItem *oldSp = assemblyInterface.stackPtr;
     assemblyInterface.stackPtr = oldSp + offset;
     assemblyInterface.handlerRegister = assemblyInterface.handlerRegister + offset;
 
     /* Skip the unused part of the stack. */
 
     uintptr_t i = oldSp - old_base;
 
     ASSERT(i <= old_length);
 
     // i now becomes the number of items in use, i.e. the number to copy.
     i = old_length - i;
 
     stackItem *old = oldSp;
     stackItem *newp = assemblyInterface.stackPtr;
 
     while (i--)
     {
         stackItem old_word = *old++;
         // A direct pointer into the old stack: relocate it.
         if (old_word.w().IsDataPtr() && old_word.stackAddr >= old_base && old_word.stackAddr <= old_top)
             old_word.stackAddr = old_word.stackAddr + offset;
         else if (old_word.w().IsDataPtr() && IsHeapAddress(old_word.stackAddr))
         {
             // Decode the word as a stack address and relocate it if the
             // decoded address lies within the old stack.
             stackItem* addr = (stackItem*)old_word.w().AsStackAddr();
             if (addr >= old_base && addr <= old_top)
             {
                 addr += offset;
                 old_word = PolyWord::FromStackAddr((PolyWord*)addr);
             }
         }
         *newp++ = old_word;
     }
     // Both cursors must have finished exactly at the tops of their stacks.
     ASSERT(old == ((stackItem*)old_stack) + old_length);
     ASSERT(newp == ((stackItem*)new_stack) + new_length);
 }
 
 void Arm64TaskData::EnterPolyCode()
 /* Called from "main" to enter the code. */
 {
     assemblyInterface.stackLimit = (stackItem*)((PolyWord*)this->stack->stack() + OVERFLOW_STACK_SIZE);
     if (arm64Dependent.mustInterpret)
     {
         PolyWord closure = assemblyInterface.registers[8];
         *(--assemblyInterface.stackPtr) = closure; /* Closure address */
         interpreterPc = *(POLYCODEPTR*)closure.AsObjPtr();
         Interpret();
         ASSERT(0); // Should never return
     }
     SetMemRegisters();
     // Jump into the ML code.  This code sets up the registers and puts the
     // address of the assemblyInterface into X26
     Arm64AsmEnterCompiledCode(&assemblyInterface);
     // This should never return
     ASSERT(0);
 
 }
 
 // Run the byte-code interpreter until control has to pass to native code.
 // On return the saved registers, link register and entry point in
 // assemblyInterface describe where native execution should resume and
 // interpreterPc is zero.  Calls and returns that stay within interpreted
 // code are handled by looping back into the interpreter.
 void Arm64TaskData::Interpret()
 {
     while (true)
     {
         switch (RunInterpreter(this))
         {
         case ReturnCall:
             // After the call there will be an enter-int instruction so that when this
             // returns we will re-enter the interpreter.  The number of arguments for
             // this call is after that.
             // Skip padding up to the next 4-byte boundary.
             while ((uintptr_t)interpreterPc & 3)
             {
                 ASSERT(interpreterPc[0] == INSTR_no_op);
                 interpreterPc++;
             }
             ASSERT(interpreterPc[0] == 0xe9);
             // The count is the byte following the three 4-byte instructions.
             numTailArguments = interpreterPc[12];
             // There is no break: an ordinary call is completed by the tail-call code below.
 
         case ReturnTailCall:
         {
             ClearExceptionPacket();
             // Pop the closure.
             PolyWord closureWord = *assemblyInterface.stackPtr++;
             PolyObject* closure = closureWord.AsObjPtr();
             arm64CodePointer cp = *(arm64CodePointer*)closure;
             // These three instruction words are the enter-interpreter sequence.
             if (cp[0] == 0xAA1E03E9 && cp[1] == 0xF9400350 && cp[2] == 0xD63F0200)
             {
                 // If the code we're going to is interpreted push back the closure and
                 // continue.
                 interpreterPc = (POLYCODEPTR)cp;
                 assemblyInterface.stackPtr--;
                 HandleStackOverflow(128); // Make sure we have space since we're bypassing the check.
                 continue;
             }
             assemblyInterface.registers[8] = closureWord; // Put closure in the closure reg.
             // Pop the return address.  We may need to align this to a word boundary.
             POLYCODEPTR originalReturn = (POLYCODEPTR)((assemblyInterface.stackPtr++)->codeAddr);
             while ((uintptr_t)originalReturn & 3)
             {
                 ASSERT(originalReturn[0] == INSTR_no_op); 
                 originalReturn++;
             }
             // Get the arguments into the correct registers.
             // Load the register arguments.  The first 8 arguments go into X0-X7.
             // These will have been the first arguments to be pushed so will be
             // furthest away on the stack.
             // Note: we don't currently pass any arguments in the FP regs.
             for (unsigned i = 0; i < numTailArguments && i < 8; i++)
                 assemblyInterface.registers[i] = assemblyInterface.stackPtr[numTailArguments - i - 1];
             // If there are any more arguments these need to be shifted down the stack.
             // The loop leaves numTailArguments at 8 whenever it started above 8.
             while (numTailArguments > 8)
             {
                 numTailArguments--;
                 assemblyInterface.stackPtr[numTailArguments] = assemblyInterface.stackPtr[numTailArguments - 8];
             }
             // Remove the register arguments
             assemblyInterface.stackPtr += numTailArguments > 8 ? 8 : numTailArguments;
 
             assemblyInterface.linkRegister = (arm64CodePointer)originalReturn; // Set the return address to caller
             assemblyInterface.entryPoint = *(arm64CodePointer*)closure; // Entry point to callee
             interpreterPc = 0; // No longer in the interpreter (See SaveMemRegs)
             return;
         }
 
         case ReturnReturn:
         {
             ClearExceptionPacket();
             // Returning from an interpreted function.  Normally we'll be returning to
             // interpreted code.
             if ((uintptr_t)interpreterPc & 3) // ARM64 addresses will always be 4-byte aligned.
                 continue;
             arm64CodePointer cp = (arm64CodePointer)interpreterPc;
             // An aligned address holding the enter-interpreter sequence is still interpreted code.
             if (cp[0] == 0xAA1E03E9 && cp[1] == 0xF9400350 && cp[2] == 0xD63F0200)
                 continue;
             // Pop the value we're returning.  Set the entry point to the code we're returning to.
             assemblyInterface.registers[0] = *assemblyInterface.stackPtr++;
             assemblyInterface.entryPoint = cp;
             interpreterPc = 0; // No longer in the interpreter (See SaveMemRegs)
             return;
         }
 
         }
     }
 }
 
 // Entry point used by the assembly code when ML traps, for example to ask
 // for a garbage collection or a larger stack.  The thread is identified by
 // its id and the trap is then handled by that thread's task data.
 void Arm64TrapHandler(PolyWord threadId)
 {
     ((Arm64TaskData*)TaskData::FindTaskForId(threadId))->HandleTrap();
 }
 
 // Deal with a trap from ML code.  The reason is read from
 // assemblyInterface.returnReason: a failed heap check, a failed stack check
 // or a transfer into interpreted code.  The memory registers are saved on
 // entry and set up again before returning to the assembly code.
 void Arm64TaskData::HandleTrap()
 {
     SaveMemRegisters(); // Update globals from the memory registers.
 
     switch (this->assemblyInterface.returnReason)
     {
 
     case RETURN_HEAP_OVERFLOW:
     {
         // The heap has overflowed.
         // The register mask is the word after the return.
         // Reading it also steps the entry point past the mask.
         saveRegisterMask = *assemblyInterface.entryPoint++;
         // The generated code first subtracts the space required from x27 and puts the
         // result into a separate register.  It then compares this with x25 and comes here if
         // it is not above that.  Either way it is going to execute an instruction to put
         // this value back into x27.
         // Look at that instruction to find out the register.
         arm64Instr moveInstr = *assemblyInterface.entryPoint;
         ASSERT((moveInstr & 0xffe0ffff) == 0xaa0003fb); // mov x27,xN
         // The source register number is in bits 16-20 of the instruction.
         allocReg = (moveInstr >> 16) & 0x1f;
         // The size requested is the distance between the current allocation
         // pointer and the decremented value left in that register, plus one word.
         allocWords = (allocPointer - (PolyWord*)assemblyInterface.registers[allocReg].stackAddr) + 1;
         assemblyInterface.registers[allocReg] = TAGGED(0); // Clear this - it's not a valid address.
         if (profileMode == kProfileStoreAllocation)
             addProfileCount(allocWords);
         // The actual allocation is done in SetMemRegisters.
         break;
     }
 
     case RETURN_STACK_OVERFLOW:
     case RETURN_STACK_OVERFLOWEX:
     {
         // The register mask is the word after the return.
         saveRegisterMask = *assemblyInterface.entryPoint++;
         uintptr_t min_size = 0; // Size in PolyWords
         if (assemblyInterface.returnReason == RETURN_STACK_OVERFLOW)
         {
             // Measured from the current stack pointer.
             min_size = (this->stack->top - (PolyWord*)assemblyInterface.stackPtr) +
                 OVERFLOW_STACK_SIZE * sizeof(uintptr_t) / sizeof(PolyWord);
         }
         else
         {
             // Stack limit overflow.  If the required stack space is larger than
             // the fixed overflow size the code will calculate the limit in X9.
             stackItem* stackP = assemblyInterface.registers[9].stackAddr;
             min_size = (this->stack->top - (PolyWord*)stackP) +
                 OVERFLOW_STACK_SIZE * sizeof(uintptr_t) / sizeof(PolyWord);
         }
         // NOTE(review): HandleStackOverflow adds the current stack usage and
         // OVERFLOW_STACK_SIZE to its argument, so they appear to be counted
         // twice here - confirm that the over-estimate is intended.
         HandleStackOverflow(min_size);
         break;
     }
 
     case RETURN_ENTER_INTERPRETER:
     {
         // The link register addresses the byte after the enter-interpreter
         // sequence; that byte says why the interpreter is being entered.
         interpreterPc = (POLYCODEPTR)assemblyInterface.linkRegister;
         byte reasonCode = *interpreterPc++;
         // Sort out arguments.
         assemblyInterface.exceptionPacket = TAGGED(0);
         if (reasonCode == 0xff)
         {
             // Exception handler.
             assemblyInterface.exceptionPacket = assemblyInterface.registers[0]; // Get the exception packet
             // We need to leave the current handler in place.  When we enter the interpreter it will
             // check the exception packet and if it is non-null will raise it.
         }
         else if (reasonCode >= 128)
         {
             // Start of function.
             // The argument count is carried in the low seven bits.
             unsigned numArgs = reasonCode - 128;
             // We need the stack to contain:
             // The closure, the return address, the arguments.
             // The stack will currently contain the stack arguments.
             // Add space for the register arguments
             if (numArgs > 8)
                 assemblyInterface.stackPtr -= 8;
             else assemblyInterface.stackPtr -= numArgs;
             // Move up any stack arguments.
             for (unsigned n = 8; n < numArgs; n++)
             {
                 assemblyInterface.stackPtr[n - 8] = assemblyInterface.stackPtr[n];
             }
             // Store the register arguments
             // The first argument (X0) goes furthest from the stack pointer.
             for (unsigned n = 0; n < numArgs && n < 8; n++)
                 assemblyInterface.stackPtr[numArgs - n - 1] = assemblyInterface.registers[n];
 
             // Finally push the return address and closure pointer
             *(--assemblyInterface.stackPtr) = assemblyInterface.registers[9]; // Return address - value of X30 before enter-int
             *(--assemblyInterface.stackPtr) = assemblyInterface.registers[8]; // Closure
         }
         else
         {
             // Return from call. Push X0
             *(--assemblyInterface.stackPtr) = assemblyInterface.registers[0];
         }
         // Returns once native code has to be resumed.
         Interpret();
         break;
     }
 
     default:
         Crash("Unknown return reason code %u", this->assemblyInterface.returnReason);
     }
     SetMemRegisters();
 }
 
 // Called when a stack check has failed.  "space" is added to the current
 // stack usage and OVERFLOW_STACK_SIZE to give the minimum size requested
 // from CheckAndGrowStack.  Afterwards the stack limit is reset and any
 // pending asynchronous requests are processed.  IOException is caught and
 // ignored in both phases.
 void Arm64TaskData::HandleStackOverflow(uintptr_t space)
 {
     uintptr_t min_size = (this->stack->top - (PolyWord*)assemblyInterface.stackPtr) + OVERFLOW_STACK_SIZE + space;
     try {
         // The stack check has failed.  This may either be because we really have
         // overflowed the stack or because the stack limit value has been adjusted
         // to result in a call here.
         CheckAndGrowStack(this, min_size);
     }
     catch (IOException&) {
         // We may get an exception while handling this if we run out of store
     }
     {
         // InterruptCode takes the same lock before it changes the limit.
         PLocker l(&interruptLock);
         // Set the stack limit.  This clears any interrupt and also sets the
         // correct value if we've grown the stack.
         assemblyInterface.stackLimit = (stackItem*)stack->bottom + OVERFLOW_STACK_SIZE;
     }
 
     try {
         processes->ProcessAsynchRequests(this);
         // Release and re-acquire use of the ML memory to allow another thread
         // to GC.
         processes->ThreadReleaseMLMemory(this);
         processes->ThreadUseMLMemory(this);
     }
     catch (IOException&) {
         // Deliberately ignored here.
     }
 }
 
 // Prepare a new thread to run the closure "proc".  The stack starts empty
 // (stack pointer and handler register at the top), X8 holds the closure and
 // X0 a dummy argument.  parentTask is not used here.
 void Arm64TaskData::InitStackFrame(TaskData* parentTask, Handle proc)
 /* Initialise stack frame. */
 {
     StackSpace* space = this->stack;
     StackObject* stack = (StackObject*)space->stack();
     // Size of the stack converted to units of stackItem.
     uintptr_t stack_size = space->spaceSize() * sizeof(PolyWord) / sizeof(stackItem);
     assemblyInterface.stackPtr = (stackItem*)stack + stack_size;
     assemblyInterface.stackLimit = (stackItem*)space->bottom + OVERFLOW_STACK_SIZE;
     assemblyInterface.handlerRegister = assemblyInterface.stackPtr;
     // Store the argument and the closure.
     assemblyInterface.registers[8] = proc->Word(); // Closure
     assemblyInterface.registers[0] = TAGGED(0); // Argument
     assemblyInterface.linkRegister = (arm64CodePointer)1; // We never return. Use a tagged value because it may be pushed
     assemblyInterface.entryPoint = (arm64CodePointer)1;
     // Have to set the register mask in case we get a GC before the thread starts.
     saveRegisterMask = (1 << 8) | 1; // X8 and X0
 
 #ifdef POLYML32IN64
     // In 32-in-64 X24 always contains the heap base address.
     assemblyInterface.registers[24].stackAddr = (stackItem*)globalHeapBase;
 #endif
 
 }
 
 // Request an interrupt of the ML code running in this task.
 // This is called from a different thread so we have to be careful.
 void Arm64TaskData::InterruptCode()
 {
     // Take the lock so that we always see the current value of the stack: the
     // thread we're interrupting could be growing its stack at this point.
     PLocker guard(&interruptLock);
     if (stack == 0)
         return; // There is no stack so there is nothing to adjust.
     // Put the stack limit at the top of the stack so that the next
     // stack-overflow check traps.
     assemblyInterface.stackLimit = (stackItem*)(stack->top - 1);
 }
 
 // Called before entering ML code from the run-time system.
 // Makes sure there is an allocation area large enough for any pending
 // allocation (allocWords), completes that allocation if there is one, and
 // copies the allocation limits and the thread id into assemblyInterface.
 void Arm64TaskData::SetMemRegisters()
 {
     // Copy the current store limits into variables before we go into the assembly code.
 
     // If we haven't yet set the allocation area or we don't have enough we need
     // to create one (or a new one).
     if (allocPointer <= allocLimit + allocWords)
     {
         if (allocPointer < allocLimit)
             Crash("Bad length in heap overflow trap");
 
         // Find some space to allocate in.  Updates taskData->allocPointer and
         // returns a pointer to the newly allocated space (if allocWords != 0)
         PolyWord* space =
             processes->FindAllocationSpace(this, allocWords, true);
         if (space == 0)
         {
             // We will now raise an exception instead of returning.
             // Set allocWords to zero so we don't set the allocation register
             // since that could be holding the exception packet.
             allocWords = 0;
         }
         // Undo the allocation just now.  It is redone below if allocWords is
         // still non-zero.
         allocPointer += allocWords;
     }
     if (this->allocWords != 0)
     {
         // If we have had a heap trap we actually do the allocation here.
         // We will have already garbage collected and recovered sufficient space.
         // This also happens if we have just trapped because of store profiling.
         allocPointer -= allocWords; // Now allocate
         // Set the allocation register to this area. N.B.  This is an absolute address.
         assemblyInterface.registers[allocReg].codeAddr = (POLYCODEPTR)(allocPointer + 1); /* remember: it's off-by-one */
         allocWords = 0;
     }
 
     // If we have run out of store, either just above or while allocating in the RTS,
     // allocPointer and allocLimit will have been set to zero as part of the GC.  We will
     // now be raising an exception which may free some store but we need to come back here
     // before we allocate anything.  The compiled code uses unsigned arithmetic to check for
     // heap overflow but only after subtracting the space required.  We need to make sure
     // that the values are still non-negative after subtracting any object size.
     if (allocPointer == 0) allocPointer += MAX_OBJECT_SIZE;
     if (allocLimit == 0) allocLimit += MAX_OBJECT_SIZE;
 
     // The values passed to the assembly code are one word above the RTS values.
     assemblyInterface.localMbottom = allocLimit + 1;
     assemblyInterface.localMpointer = allocPointer + 1;
     // If we are profiling store allocation we set localMbottom equal to
     // localMpointer so that a trap will be generated.
     if (profileMode == kProfileStoreAllocation)
         assemblyInterface.localMbottom = assemblyInterface.localMpointer;
 
     assemblyInterface.threadId = stackItem(threadObject);
 }
 
 // This is called whenever we have returned from ML to C.
 void Arm64TaskData::SaveMemRegisters()
 {
     if (interpreterPc == 0)
     {   // Not if we're already in the interpreter
         // The normal return is to the link register address.
         assemblyInterface.entryPoint = assemblyInterface.linkRegister;
         allocPointer = assemblyInterface.localMpointer - 1;
     }
     allocWords = 0;
     assemblyInterface.exceptionPacket = TAGGED(0);
     saveRegisterMask = 0;
 }
 
 // Process addresses in the code.  The only case where we need to do that on the ARM64 is to deal
 // with splitting the constant area from the code in order to make the code position-independent.
 // We need to convert pc-relative LDR instructions into ADRP/LDR pairs.
 // addr/oldAddr are the new and old addresses of the code object, newConstAddr/oldConstAddr the
 // new and old addresses of its constant area.
 void Arm64Dependent::ScanConstantsWithinCode(PolyObject* addr, PolyObject* oldAddr, POLYUNSIGNED length,
     PolyWord* newConstAddr, PolyWord* oldConstAddr, POLYUNSIGNED numConsts, ScanAddress* process)
 {
-#ifndef POLYML32IN64
     arm64CodePointer pt = (arm64CodePointer)addr;
+#ifdef POLYML32IN64
+    // The only case we have to consider in 32-in-64 is the special hack for
+    // the global heap base in callbacks.
+    if (pt[0] == 0xD503201F && (pt[1] & 0xff000000) == 0x58000000)
+    {
+        // nop (special marker) followed by LDR Xn,pc-relative
+        uint32_t pcOffset = (pt[1] >> 5) & 0x3ffff; // This is a number of 32-bit words
+        PolyWord* gHeapAddr = ((PolyWord*)addr) + pcOffset + 1; // PolyWords are 32-bits
+        if (((PolyWord**)gHeapAddr)[0] != globalHeapBase)
+            ((PolyWord**)gMem.SpaceForAddress(gHeapAddr)->writeAble(gHeapAddr))[0] = globalHeapBase;
+    }
+#else
     // If it begins with the enter-int sequence it's interpreted code.
     if (pt[0] == 0xAA1E03E9 && pt[1] == 0xF9400350 && pt[2] == 0xD63F0200)
         return;
     // We only need a split if the constant area is not at the original offset.
     // constAdjustment is the change, in bytes, of the offset from the code to its constants.
     POLYSIGNED constAdjustment =
         (byte*)newConstAddr - (byte*)addr - ((byte*)oldConstAddr - (byte*)oldAddr);
     // If we have replaced the offset with a dummy ADRP/LDR pair we have to add a relocation.
     PolyWord* end = addr->Offset(length - 1);
     if ((end[0].AsUnsigned() >> 56) != 0xff)
         process->RelocateOnly(addr, (byte*)end, PROCESS_RELOC_ARM64ADRPLDR);
 
     while (*pt != 0) // The code ends with a UDF instruction (0)
     {
         if ((*pt & 0xbf000000) == 0x18000000) // LDR with pc-relative offset
         {
             // This could be a reference to the constant area or to the non-address area.
             // References to the constant area are followed by a nop
             if (constAdjustment != 0 && pt[1] == 0xd503201f)
             {
                 unsigned reg = pt[0] & 0x1f;
                 // The displacement is a signed multiple of 4 bytes but it will always be +ve
                 ASSERT((pt[0] & 0x00800000) == 0);
                 // The constant address is relative to the new location of the code.
                 byte* constAddress = (byte*)(pt + ((pt[0] >> 5) & 0x7ffff));
                 byte* newAddress = (byte*)constAddress + constAdjustment;
                 // Overwrite the LDR/nop pair with an ADRP/LDR pair and then fill in the address.
                 pt[0] = 0x90000000 + reg; // ADRP Xn, 0
                 pt[1] = 0xf9400000 + (reg << 5) + reg; // LDR Xn,[Xn+#0]
                 ScanAddress::SetConstantValue((byte*)pt, (PolyObject*)newAddress, PROCESS_RELOC_ARM64ADRPLDR);
             }
         }
         else if ((*pt & 0x9f000000) == 0x90000000) // ADRP instruction
         {
             // These only occur after we have converted LDRs above
             ASSERT((pt[1] & 0xffc00000) == 0xf9400000); // The next should be the Load
             // For the moment assume that this does not require a move.
             ASSERT(addr == oldAddr && newConstAddr == oldConstAddr);
             process->RelocateOnly(addr, (byte*)pt, PROCESS_RELOC_ARM64ADRPLDR);
         }
         pt++;
     }
 #endif
 }
 
 // As far as possible we want locking and unlocking an ML mutex to be fast so
 // we try to implement the code in the assembly code using appropriate
 // interlocked instructions.  That does mean that if we need to lock and
 // unlock an ML mutex in this code we have to use the same, machine-dependent,
 // code to do it.  These are defaults that are used where there is no
 // machine-specific code.
 
 #if defined(_MSC_VER)
 // Atomically store "value" in the word at mutexp and return the previous value.
 // Defining it here saves having to define it in the MASM assembly code.
 static uintptr_t Arm64AsmAtomicExchange(PolyObject* mutexp, uintptr_t value)
 {
 #if (SIZEOF_POLYWORD == 8)
     LONG64* const cell = (LONG64*)mutexp;
     return InterlockedExchange64(cell, value);
 #else
     LONG* const cell = (LONG*)mutexp;
     return InterlockedExchange(cell, value);
 #endif
 }
 
 #else
 extern "C" {
     // Atomic exchange.  This is only defined in the GAS assembly code.
     uintptr_t Arm64AsmAtomicExchange(PolyObject*, uintptr_t);
 }
 #endif
 
 
 // Atomically store zero in the mutex word.  Returns true if the value that
 // was there before was exactly 1.
 bool Arm64TaskData::AtomicallyReleaseMutex(PolyObject* mutexp)
 {
     return Arm64AsmAtomicExchange(mutexp, 0) == 1;
 }
 
 bool Arm64TaskData::AddTimeProfileCount(SIGNALCONTEXT *context)
 {
     stackItem* sp = 0;
     POLYCODEPTR pc = 0;
     if (context != 0)
     {
 #if defined(HAVE_WINDOWS_H)
         sp = (stackItem*)context->Sp;
         pc = (POLYCODEPTR)context->Pc;
 #elif defined(HAVE_UCONTEXT_T)
 #ifdef HAVE_MCONTEXT_T_REGS
         // Linux
         sp = (stackItem*)context->uc_mcontext.sp;
         pc = (POLYCODEPTR)context->uc_mcontext.pc;
 #endif
 #endif
     }
     if (pc != 0)
     {
         // See if the PC we've got is an ML code address.
         MemSpace* space = gMem.SpaceForAddress(pc);
         if (space != 0 && (space->spaceType == ST_CODE || space->spaceType == ST_PERMANENT))
         {
             incrementCountAsynch(pc);
             return true;
         }
     }
     // See if the sp value is in the current stack.
     if (sp >= (stackItem*)this->stack->bottom && sp < (stackItem*)this->stack->top)
     {
         // We may be in the assembly code.  The top of the stack will be a return address.
         pc = sp[0].w().AsCodePtr();
         MemSpace* space = gMem.SpaceForAddress(pc);
         if (space != 0 && (space->spaceType == ST_CODE || space->spaceType == ST_PERMANENT))
         {
             incrementCountAsynch(pc);
             return true;
         }
     }
     // See if the value of regSP is a valid stack pointer.
     // This works if we happen to be in an RTS call using a "Full" call.
     // It doesn't work if we've used a "Fast" call because that doesn't save the SP.
     sp = assemblyInterface.stackPtr;
     if (sp >= (stackItem*)this->stack->bottom && sp < (stackItem*)this->stack->top)
     {
         // We may be in the run-time system.
         pc = sp[0].w().AsCodePtr();
         MemSpace* space = gMem.SpaceForAddress(pc);
         if (space != 0 && (space->spaceType == ST_CODE || space->spaceType == ST_PERMANENT))
         {
             incrementCountAsynch(pc);
             return true;
         }
     }
     return false;
 }
 
 // Prototypes for the entry points defined below in this file.  They are
 // declared with C linkage and are listed in machineSpecificEPT.
 extern "C" {
     POLYEXTERNALSYMBOL void* PolyArm64GetThreadData();
     POLYEXTERNALSYMBOL POLYUNSIGNED PolyInterpretedEnterIntMode();
     POLYEXTERNALSYMBOL POLYUNSIGNED PolyEndBootstrapMode(FirstArgument threadId, PolyWord function);
 }
 
 // Return the address of assembly data for the current thread.  This is normally in
 // X26 except if we are in a callback.
 void* PolyArm64GetThreadData()
 {
     // We should get the task data for the thread that is running this code.
     // If this thread has been created by the foreign code we will have to
     // create a new one here.
     TaskData* taskData = processes->GetTaskDataForThread();
     if (taskData == 0)
     {
         try {
             taskData = processes->CreateNewTaskData();
         }
         catch (std::bad_alloc&) {
             ::Exit("Unable to create thread data - insufficient memory");
         }
         catch (MemoryException&) {
             ::Exit("Unable to create thread data - insufficient memory");
         }
     }
     return &((Arm64TaskData*)taskData)->assemblyInterface;
 }
 
 // Do we require EnterInt instructions and if so for which architecture?
 // 0 => None; 1 => X86_32; 2 => X86_64; 3 => X86_32_in_64; 4 => ARM_64.
 // ARM_64 in 32 is the same as ARM64.
 // This implementation always returns the tagged value 4.
 POLYUNSIGNED PolyInterpretedEnterIntMode()
 {
     return TAGGED(4).AsUnsigned();
 }
 
 // End the first stage of bootstrap mode and run a new function.
 // The first stage is always interpreted.  Once that is complete every function will have
 // at least an executable "enter-interpreter" stub so it can be called as machine code.
 // threadId identifies the calling task; "function" is the function to run next.
 POLYUNSIGNED PolyEndBootstrapMode(FirstArgument threadId, PolyWord function)
 {
     TaskData* taskData = TaskData::FindTaskForId(threadId);
     ASSERT(taskData != 0);
     taskData->PreRTSCall();
     // Save the function in a handle before the calls below.
     Handle pushedFunction = taskData->saveVec.push(function);
     arm64Dependent.mustInterpret = false;
     ((Arm64TaskData*)taskData)->EndBootStrap();
     // Rebuild the initial stack frame for the new function and enter it.
     taskData->InitStackFrame(taskData, pushedFunction);
     taskData->EnterPolyCode();
     // Should never return.
     ASSERT(0);
     return TAGGED(0).AsUnsigned();
 }
 
 // Table of the machine-specific RTS entry points defined in this file.
 // Each entry pairs the exported name with the function address; the
 // table is terminated by a NULL entry.
 struct _entrypts machineSpecificEPT[] =
 {
     { "PolyArm64GetThreadData",         (polyRTSFunction)&PolyArm64GetThreadData },
     { "PolyInterpretedEnterIntMode",    (polyRTSFunction)&PolyInterpretedEnterIntMode },
     { "PolyEndBootstrapMode",           (polyRTSFunction)&PolyEndBootstrapMode },
     { NULL, NULL} // End of list.
 };
diff --git a/libpolyml/poly_specific.cpp b/libpolyml/poly_specific.cpp
index e71e5af8..0a098c51 100644
--- a/libpolyml/poly_specific.cpp
+++ b/libpolyml/poly_specific.cpp
@@ -1,455 +1,480 @@
 /*
     Title:  poly_specific.cpp - Poly/ML specific RTS calls.
 
     Copyright (c) 2006, 2015-17, 2019, 2021 David C. J. Matthews
 
     This library is free software; you can redistribute it and/or
     modify it under the terms of the GNU Lesser General Public
     License version 2.1 as published by the Free Software Foundation.
     
     This library is distributed in the hope that it will be useful,
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     Lesser General Public License for more details.
     
     You should have received a copy of the GNU Lesser General Public
     License along with this library; if not, write to the Free Software
     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 
 */
 
 /* This module is used for various run-time calls that are either in the
    PolyML structure or otherwise specific to Poly/ML. */
 
 #ifdef HAVE_CONFIG_H
 #include "config.h"
 #elif defined(_WIN32)
 #include "winconfig.h"
 #else
 #error "No configuration file"
 #endif
 
 #ifdef HAVE_ASSERT_H
 #include <assert.h>
 #define ASSERT(x) assert(x)
 #else
 #define ASSERT(x) 0
 #endif
 
 #ifdef HAVE_STRING_H
 #include <string.h>
 #endif
 
 #include "globals.h"
 #include "poly_specific.h"
 #include "arb.h"
 #include "mpoly.h"
 #include "sys.h"
 #include "machine_dep.h"
 #include "polystring.h"
 #include "run_time.h"
 #include "version.h"
 #include "save_vec.h"
 #include "version.h"
 #include "memmgr.h"
 #include "processes.h"
 #include "gc.h"
 #include "rtsentry.h"
 
 // Prototypes, with C linkage, for the RTS entry points defined in this file.
 extern "C" {
     POLYEXTERNALSYMBOL POLYUNSIGNED PolySpecificGeneral(FirstArgument threadId, PolyWord code, PolyWord arg);
     POLYEXTERNALSYMBOL POLYUNSIGNED PolyGetABI();
     POLYEXTERNALSYMBOL POLYUNSIGNED PolyLockMutableClosure(FirstArgument threadId, PolyWord closure);
     POLYEXTERNALSYMBOL POLYUNSIGNED PolyCopyByteVecToClosure(FirstArgument threadId, PolyWord byteVec, PolyWord closure);
     POLYEXTERNALSYMBOL POLYUNSIGNED PolySetCodeConstant(PolyWord closure, PolyWord offset, PolyWord c, PolyWord flags);
     POLYEXTERNALSYMBOL POLYUNSIGNED PolyGetCodeConstant(PolyWord closure, PolyWord offset, PolyWord flags);
     POLYEXTERNALSYMBOL POLYUNSIGNED PolySetCodeByte(PolyWord closure, PolyWord offset, PolyWord c);
     POLYEXTERNALSYMBOL POLYUNSIGNED PolyGetCodeByte(PolyWord closure, PolyWord offset);
     POLYEXTERNALSYMBOL POLYUNSIGNED PolySortArrayOfAddresses(PolyWord array);
+    POLYEXTERNALSYMBOL POLYUNSIGNED PolyGetHeapBase(FirstArgument threadId);
     POLYEXTERNALSYMBOL POLYUNSIGNED PolyTest4(FirstArgument threadId, PolyWord arg1, PolyWord arg2, PolyWord arg3, PolyWord arg4);
     POLYEXTERNALSYMBOL POLYUNSIGNED PolyTest5(FirstArgument threadId, PolyWord arg1, PolyWord arg2, PolyWord arg3, PolyWord arg4, PolyWord arg5);
 }
 
 #define SAVE(x) taskData->saveVec.push(x)
 
 #ifndef GIT_VERSION
 #define GIT_VERSION             ""
 #endif
 
 
 // Dispatch one of the numbered poly-specific requests.
 // "code" holds the request number; "args" is not used by any of the current cases.
 // Returns a handle to the resulting ML string.  An unknown request number
 // raises an ML Fail exception.
 Handle poly_dispatch_c(TaskData *taskData, Handle args, Handle code)
 {
     unsigned c = get_C_unsigned(taskData, DEREFWORD(code));
     switch (c)
     {
     case 9: // Return the GIT version if appropriate
         {
              return SAVE(C_string_to_Poly(taskData, GIT_VERSION));
         }
 
     case 10: // Return the RTS version string.
         {
             const char *version;
             switch (machineDependent->MachineArchitecture())
             {
             case MA_Interpreted:    version = "Portable-" TextVersion; break;
             case MA_I386:           version = "I386-" TextVersion; break;
             case MA_X86_64:         version = "X86_64-" TextVersion; break;
             case MA_X86_64_32:      version = "X86_64_32-" TextVersion; break;
             case MA_Arm64:          version = "Arm64-" TextVersion; break;
             case MA_Arm64_32:       version = "Arm64_32-" TextVersion; break;
             default:                version = "Unknown-" TextVersion; break;
             }
             return SAVE(C_string_to_Poly(taskData, version));
         }
 
     case 12: // Return the architecture
         // Used in InitialPolyML.ML for PolyML.architecture
         {
             const char *arch;
             switch (machineDependent->MachineArchitecture())
             {
             case MA_Interpreted:    arch = "Interpreted"; break;
             case MA_I386:           arch = "I386"; break;
             case MA_X86_64:         arch = "X86_64"; break;
             case MA_X86_64_32:      arch = "X86_64_32"; break;
             case MA_Arm64:          arch = "Arm64"; break;
             case MA_Arm64_32:       arch = "Arm64_32"; break;
             default:                arch = "Unknown"; break;
             }
             return SAVE(C_string_to_Poly(taskData, arch));
         }
 
     case 19: // Return the RTS argument help string.
         return SAVE(C_string_to_Poly(taskData, RTSArgHelp()));
 
     default:
         {
             char msg[100];
             // c is unsigned so it must be formatted with %u: with %d a large
             // value would be printed as a negative number.
             sprintf(msg, "Unknown poly-specific function: %u", c);
             raise_exception_string(taskData, EXC_Fail, msg);
             return 0;
         }
     }
 }
 
 // General interface to poly-specific.  Ideally the various cases will be made into
 // separate functions.
 // Calls poly_dispatch_c with the request code and argument and returns the
 // result word, or tagged zero if the dispatch produced no result.
 POLYUNSIGNED PolySpecificGeneral(FirstArgument threadId, PolyWord code, PolyWord arg)
 {
     TaskData *taskData = TaskData::FindTaskForId(threadId);
     ASSERT(taskData != 0);
     taskData->PreRTSCall();
     Handle mark = taskData->saveVec.mark();
     Handle codeHandle = taskData->saveVec.push(code);
     Handle argHandle = taskData->saveVec.push(arg);
     Handle answer = 0;
 
     try {
         answer = poly_dispatch_c(taskData, argHandle, codeHandle);
     }
     catch (...) {
         // If an ML exception is raised.  "answer" is left as zero.
     }
 
     taskData->saveVec.reset(mark);
     taskData->PostRTSCall();
     if (answer != 0)
         return answer->Word().AsUnsigned();
     return TAGGED(0).AsUnsigned();
 }
 
 // Return the ABI - i.e. the calling conventions used when calling external functions.
 // The result is a tagged integer: 0 for 32-bit (Unix and Windows),
 // 1 for 64-bit Unix and 2 for 64-bit Windows (including Cygwin).
 POLYEXTERNALSYMBOL POLYUNSIGNED PolyGetABI()
 {
 #if (SIZEOF_VOIDP != 8)
     return TAGGED(0).AsUnsigned(); // 32-bit Unix and Windows
 #elif (defined(_WIN32) || defined(__CYGWIN__))
     // For 64-bit we need to know if this is Windows.
     return TAGGED(2).AsUnsigned(); // 64-bit Windows
 #else
     return TAGGED(1).AsUnsigned(); // 64-bit Unix
 #endif
 }
 
 // Code generation - Code is initially allocated in a byte segment.  When all the
 // values have been set apart from any addresses the byte segment is copied into
 // a mutable code segment.
 
 // Copy the byte vector into code space.
 // byteVec must be a byte object and closure a mutable object the size of one
 // pointer.  On success the address of the new code object is stored in the
 // closure and the closure is made immutable.  On failure an ML exception is
 // raised and the closure is left untouched.  Always returns tagged zero.
 POLYUNSIGNED PolyCopyByteVecToClosure(FirstArgument threadId, PolyWord byteVec, PolyWord closure)
 {
     TaskData *taskData = TaskData::FindTaskForId(threadId);
     ASSERT(taskData != 0);
     taskData->PreRTSCall();
     Handle reset = taskData->saveVec.mark();
     Handle pushedByteVec = taskData->saveVec.push(byteVec);
     Handle pushedClosure = taskData->saveVec.push(closure);
     PolyObject *result = 0;
 
 #ifdef HAVE_PTHREAD_JIT_WRITE_PROTECT_NP
     pthread_jit_write_protect_np(false);
 #endif
 
     try {
         if (!pushedByteVec->WordP()->IsByteObject())
             raise_fail(taskData, "Not byte data area");
         if (pushedClosure->WordP()->Length() != sizeof(PolyObject*)/sizeof(PolyWord))
             raise_fail(taskData, "Invalid closure size");
         if (!pushedClosure->WordP()->IsMutable())
             raise_fail(taskData, "Closure is not mutable");
         do {
             // Re-read the address each time round since a GC may have happened.
             PolyObject *initCell = pushedByteVec->WordP();
             POLYUNSIGNED requiredSize = initCell->Length();
             result = gMem.AllocCodeSpace(requiredSize);
             if (result == 0)
             {
                 // Could not allocate - must GC.
                 if (!QuickGC(taskData, pushedByteVec->WordP()->Length()))
                     raise_fail(taskData, "Insufficient memory");
             }
             else memcpy(gMem.SpaceForObjectAddress(result)->writeAble((byte*)result), initCell, requiredSize * sizeof(PolyWord));
         } while (result == 0);
     }
     catch (...) {} // If an ML exception is raised
 
     // Only update the closure if the copy succeeded.  If an exception was raised
     // the closure may be the wrong size or immutable, in which case it must not
     // be written to.
     if (result != 0)
     {
         // Store the code address in the closure.
         *((PolyObject**)pushedClosure->WordP()) = result;
         // Lock the closure.
         pushedClosure->WordP()->SetLengthWord(pushedClosure->WordP()->LengthWord() & ~_OBJ_MUTABLE_BIT);
     }
 
 #ifdef HAVE_PTHREAD_JIT_WRITE_PROTECT_NP
     pthread_jit_write_protect_np(true);
 #endif
 
     taskData->saveVec.reset(reset);
     taskData->PostRTSCall();
     return TAGGED(0).AsUnsigned();
 }
 
 // Code generation - Lock a mutable code segment.
 // The closure's first word is the address of the code object.  Its length word
 // is rewritten with only the code flag and the instruction cache is flushed.
 // Raises an ML exception if it is not a mutable code object.
 // Always returns tagged zero.  Currently this does not allocate so other
 // than the exception it could be a fast call.
 POLYEXTERNALSYMBOL POLYUNSIGNED PolyLockMutableClosure(FirstArgument threadId, PolyWord closure)
 {
     TaskData *taskData = TaskData::FindTaskForId(threadId);
     ASSERT(taskData != 0);
     taskData->PreRTSCall();
     Handle mark = taskData->saveVec.mark();
     PolyObject *code = *(PolyObject**)(closure.AsObjPtr());
 
 #ifdef HAVE_PTHREAD_JIT_WRITE_PROTECT_NP
     pthread_jit_write_protect_np(false);
 #endif
 
     try {
         const bool isMutableCode = code->IsCodeObject() && code->IsMutable();
         if (!isMutableCode)
             raise_fail(taskData, "Not mutable code area");
         const POLYUNSIGNED words = code->Length();
         // The length word is updated through the writable view of the object.
         gMem.SpaceForObjectAddress(code)->writeAble(code)->SetLengthWord(words, F_CODE_OBJ);
         // Flush cache on ARM at least.
         machineDependent->FlushInstructionCache(code, words * sizeof(PolyWord));
         // In the future it may be necessary to return a different address here.
         // N.B.  The code area should only have execute permission in the native
         // code version, not the interpreted version.
     }
     catch (...) {} // If an ML exception is raised
 
 #ifdef HAVE_PTHREAD_JIT_WRITE_PROTECT_NP
     pthread_jit_write_protect_np(true);
 #endif
 
     taskData->saveVec.reset(mark);
     taskData->PostRTSCall();
     return TAGGED(0).AsUnsigned();
 }
 
 // Set code constant.  This can be a fast call.
 // This is in the RTS both because we pass a closure in here and cannot have
 // code addresses in 32-in-64 and also because we need to ensure there is no
 // possibility of a GC while the code is an inconsistent state.
 // closure: the closure for the code (or, for legacy code, the code object itself).
 // offset:  tagged byte offset within the code at which to store the constant.
 // cWord:   the constant.
 // flags:   tagged kind: 0 => absolute, PolyWord sized; 1 => X86 relative, 4 bytes;
 //          2 => absolute, uintptr_t sized.
 // Always returns tagged zero.
 POLYUNSIGNED PolySetCodeConstant(PolyWord closure, PolyWord offset, PolyWord cWord, PolyWord flags)
 {
     byte *pointer;
 #ifdef HAVE_PTHREAD_JIT_WRITE_PROTECT_NP
     pthread_jit_write_protect_np(false);
 #endif
 
     // Previously we passed the code address in here and we need to
     // retain that for legacy code.  This is now the closure.
     if (closure.AsObjPtr()->IsCodeObject())
         pointer = closure.AsCodePtr();
     else pointer = *(POLYCODEPTR*)(closure.AsObjPtr());
     // pointer is the start of the code segment.
     // c will usually be an address.
     // offset is a byte offset
     pointer += offset.UnTaggedUnsigned();
     // All stores go through the writable view of the address.  "pointer" itself
     // is only used to compute the relative offset in case 1.
     byte* writeable = gMem.SpaceForAddress(pointer)->writeAble(pointer);
     switch (UNTAGGED(flags))
     {
         case 0: // Absolute constant - size PolyWord
         {
             POLYUNSIGNED c = cWord.AsUnsigned();
 #ifdef WORDS_BIGENDIAN
             // This is used to store constants in the constant area
             // on the interpreted version. 
             for (unsigned i = sizeof(PolyWord); i > 0; i--)
             {
                 writeable[i-1] = (byte)(c & 255);
                 c >>= 8;
             }
 #else
             for (unsigned i = 0; i < sizeof(PolyWord); i++)
             {
                 writeable[i] = (byte)(c & 255);
                 c >>= 8;
             }
 #endif
             break;
         }
         case 1: // Relative constant - X86 - size 4 bytes
         {
             // The offset is relative to the END of the constant.
             byte *target;
             // In 32-in-64 we pass in the closure address here
             // rather than the code address.
             if (cWord.AsObjPtr()->IsCodeObject())
                 target = cWord.AsCodePtr();
             else target = *(POLYCODEPTR*)(cWord.AsObjPtr());
             size_t c = target - pointer - 4;
             for (unsigned i = 0; i < 4; i++)
             {
                 writeable[i] = (byte)(c & 255);
                 c >>= 8;
             }
             break;
         }
         case 2: // Absolute constant - size uintptr_t
             // This is the same as case 0 except in 32-in-64 when
             // it is an absolute address rather than an object pointer.
         {
             uintptr_t c = (uintptr_t)(cWord.AsObjPtr());
             for (unsigned i = 0; i < sizeof(uintptr_t); i++)
             {
                 // Store through the writable view, as in the other cases.
                 writeable[i] = (byte)(c & 255);
                 c >>= 8;
             }
             break;
         }
     }
 
 #ifdef HAVE_PTHREAD_JIT_WRITE_PROTECT_NP
     pthread_jit_write_protect_np(true);
 #endif
 
     return TAGGED(0).AsUnsigned();
 }
 
 // Get a code constant.  This is only used for debugging.
 // Only kind 0 (absolute constant, PolyWord sized) is handled; the raw word
 // read from the code is returned.  Any other kind returns tagged zero.
 POLYUNSIGNED PolyGetCodeConstant(PolyWord closure, PolyWord offset, PolyWord flags)
 {
     // The closure's first word is the code address; offset is a byte offset.
     const byte* source = *(POLYCODEPTR*)(closure.AsObjPtr());
     source += offset.UnTaggedUnsigned();
 
     // For the moment just handle the one case.
     if (UNTAGGED(flags) != 0)
         return TAGGED(0).AsUnsigned();
 
     // Assemble the word a byte at a time, most significant byte first.
     POLYUNSIGNED value = 0;
 #ifdef WORDS_BIGENDIAN
     for (unsigned i = 0; i < sizeof(PolyWord); i++)
         value = (value << 8) | source[i];
 #else
     unsigned i = sizeof(PolyWord);
     while (i-- > 0)
         value = (value << 8) | source[i];
 #endif
     return value;
 }
 
 // Set a code byte.  This needs to be in the RTS because it uses the closure.
 // offset is a tagged byte offset from the start of the code; cWord is the
 // tagged value to store.  Always returns tagged zero.
 POLYEXTERNALSYMBOL POLYUNSIGNED PolySetCodeByte(PolyWord closure, PolyWord offset, PolyWord cWord)
 {
     // The closure's first word is the code address.
     byte *codeStart = *(POLYCODEPTR*)(closure.AsObjPtr());
     // Store through the writable view of the code.
     byte *target = gMem.SpaceForAddress(codeStart)->writeAble(codeStart);
     const byte value = (byte)UNTAGGED_UNSIGNED(cWord);
     target[UNTAGGED_UNSIGNED(offset)] = value;
     return TAGGED(0).AsUnsigned();
 }
 
 // Get a code byte.  offset is a tagged byte offset from the start of the
 // code addressed by the closure.  The byte is returned as a tagged value.
 POLYEXTERNALSYMBOL POLYUNSIGNED PolyGetCodeByte(PolyWord closure, PolyWord offset)
 {
     byte *codeStart = *(POLYCODEPTR*)(closure.AsObjPtr());
     byte *location = codeStart + UNTAGGED_UNSIGNED(offset);
     return TAGGED(*location).AsUnsigned();
 }
 
 static int compare(const void *a, const void *b)
 {
     PolyWord *av = (PolyWord*)a;
     PolyWord *bv = (PolyWord*)b;
     if ((*av).IsTagged() || (*bv).IsTagged()) return 0; // Shouldn't happen
     PolyObject *ao = (*av).AsObjPtr(), *bo = (*bv).AsObjPtr();
     if (ao->Length() < 1 || bo->Length() < 1) return 0; // Shouldn't happen
     if (ao->Get(0).AsUnsigned() < bo->Get(0).AsUnsigned())
         return -1;
     if (ao->Get(0).AsUnsigned() > bo->Get(0).AsUnsigned())
         return 1;
     return 0;
 }
 
 // Sort an array of addresses.  This is used in the code-generator to search for
 // duplicates in the address area.  The argument is an array of pairs.  The first
 // item of each pair is an address, the second is an identifier of some kind.
 // Returns tagged 1 if the array was sorted and tagged 0 if the argument was
 // not a data pointer or the array was not mutable.
 POLYEXTERNALSYMBOL POLYUNSIGNED PolySortArrayOfAddresses(PolyWord array)
 {
     if (!array.IsDataPtr())
         return TAGGED(0).AsUnsigned();
     PolyObject *cells = array.AsObjPtr();
     if (!cells->IsMutable())
         return TAGGED(0).AsUnsigned();
     // Sort the words of the array in place.
     qsort(cells, cells->Length(), sizeof(PolyWord), compare);
     return TAGGED(1).AsUnsigned();
 }
 
+// Return the value of globalHeapBase as a SysWord value.
+// This is used in just one place: when compiling an FFI callback stub in ARM 32-in-64.
+// In builds that are not 32-in-64 the SysWord contains zero.  If an ML
+// exception is raised while creating the SysWord tagged zero is returned.
+POLYEXTERNALSYMBOL POLYUNSIGNED PolyGetHeapBase(FirstArgument threadId)
+{
+    TaskData* taskData = TaskData::FindTaskForId(threadId);
+    ASSERT(taskData != 0);
+    taskData->PreRTSCall();
+    // Mark the save vector so that the handle created below is released
+    // again before we return, as the other RTS calls in this file do.
+    Handle reset = taskData->saveVec.mark();
+    Handle result = 0;
+
+    try {
+#ifdef POLYML32IN64
+        result = Make_sysword(taskData, (uintptr_t)globalHeapBase);
+#else
+        result = Make_sysword(taskData, 0);
+#endif
+    }
+    catch (...) {} // If an ML exception is raised
+
+    taskData->saveVec.reset(reset);
+    taskData->PostRTSCall();
+    if (result == 0) return TAGGED(0).AsUnsigned();
+    else return result->Word().AsUnsigned();
+}
+
 // Test function with four arguments.  The untagged value of arg1 selects
 // which of the arguments is returned; any other value returns tagged zero.
 POLYEXTERNALSYMBOL POLYUNSIGNED PolyTest4(FirstArgument threadId, PolyWord arg1, PolyWord arg2, PolyWord arg3, PolyWord arg4)
 {
     const POLYUNSIGNED selector = arg1.UnTaggedUnsigned();
     if (selector == 1) return arg1.AsUnsigned();
     if (selector == 2) return arg2.AsUnsigned();
     if (selector == 3) return arg3.AsUnsigned();
     if (selector == 4) return arg4.AsUnsigned();
     return TAGGED(0).AsUnsigned();
 }
 
 // Test function with five arguments.  The untagged value of arg1 selects
 // which of the arguments is returned; any other value returns tagged zero.
 POLYEXTERNALSYMBOL POLYUNSIGNED PolyTest5(FirstArgument threadId, PolyWord arg1, PolyWord arg2, PolyWord arg3, PolyWord arg4, PolyWord arg5)
 {
     const POLYUNSIGNED selector = arg1.UnTaggedUnsigned();
     if (selector == 1) return arg1.AsUnsigned();
     if (selector == 2) return arg2.AsUnsigned();
     if (selector == 3) return arg3.AsUnsigned();
     if (selector == 4) return arg4.AsUnsigned();
     if (selector == 5) return arg5.AsUnsigned();
     return TAGGED(0).AsUnsigned();
 }
 
 
 // Table of the RTS entry points defined in this file.  Each entry pairs the
 // exported name with the function address; the table is terminated by a
 // NULL entry.
 struct _entrypts polySpecificEPT[] =
 {
     { "PolySpecificGeneral",            (polyRTSFunction)&PolySpecificGeneral},
     { "PolyGetABI",                     (polyRTSFunction)&PolyGetABI },
     { "PolyCopyByteVecToClosure",       (polyRTSFunction)&PolyCopyByteVecToClosure },
     { "PolyLockMutableClosure",         (polyRTSFunction)&PolyLockMutableClosure },
     { "PolySetCodeConstant",            (polyRTSFunction)&PolySetCodeConstant },
     { "PolyGetCodeConstant",            (polyRTSFunction)&PolyGetCodeConstant },
     { "PolySetCodeByte",                (polyRTSFunction)&PolySetCodeByte },
     { "PolyGetCodeByte",                (polyRTSFunction)&PolyGetCodeByte },
     { "PolySortArrayOfAddresses",       (polyRTSFunction)&PolySortArrayOfAddresses },
+    { "PolyGetHeapBase",                (polyRTSFunction)&PolyGetHeapBase },
     { "PolyTest4",                      (polyRTSFunction)&PolyTest4 },
     { "PolyTest5",                      (polyRTSFunction)&PolyTest5 },
 
     { NULL, NULL} // End of list.
 };
diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ASSEMBLY.sml b/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ASSEMBLY.sml
index 2719460c..becf3441 100644
--- a/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ASSEMBLY.sml
+++ b/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ASSEMBLY.sml
@@ -1,2291 +1,2307 @@
 (*
     Copyright (c) 2021 David C. J. Matthews
 
     This library is free software; you can redistribute it and/or
     modify it under the terms of the GNU Lesser General Public
     Licence version 2.1 as published by the Free Software Foundation.
     
     This library is distributed in the hope that it will be useful,
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     Lesser General Public Licence for more details.
     
     You should have received a copy of the GNU Lesser General Public
     Licence along with this library; if not, write to the Free Software
     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 *)
 
 functor ARM64ASSEMBLY (
     structure Debug: DEBUG
     and       Pretty: PRETTYSIG
     and       CodeArray: CODEARRAYSIG
 ) : Arm64Assembly =
 
 struct
     open CodeArray Address
     
     (* True when Address.wordSize is 4, i.e. the 32-in-64 configuration. *)
     val is32in64 = Address.wordSize = 0w4
     
     (* The number of ML words that make up one native word. *)
     val wordsPerNativeWord: word = Address.nativeWordSize div Address.wordSize
     
     exception InternalError = Misc.InternalError
 
     infix 5 << <<+ <<- >> >>+ >>- ~>> ~>>+ ~>>- (* Shift operators *)
     infix 3 andb orb xorb andbL orbL xorbL andb8 orb8 xorb8
     
     (* The unqualified shift and bitwise operators are the Word32 versions. *)
     val op << = Word32.<< and op >> = Word32.>> and op ~>> = Word32.~>>
     and op andb = Word32.andb and op orb = Word32.orb
 
     (* Conversions between the word types, each going through LargeWord. *)
     val word32ToWord8 = Word8.fromLargeWord o Word32.toLargeWord
     and word8ToWord32 = Word32.fromLargeWord o Word8.toLargeWord
     and word32ToWord = Word.fromLargeWord o Word32.toLargeWord
     and wordToWord32 = Word32.fromLargeWord o Word.toLargeWord
     and word8ToWord = Word.fromLargeWord o Word8.toLargeWord
    
     (* XReg is used for fixed point registers since X0 and W0 are
        the same register.  XZero and XSP are separate constructors: both are
        encoded as register number 31 and the instruction determines which
        of them is acceptable in a given position. *)
     datatype xReg = XReg of Word8.word | XZero | XSP
     (* VReg is used for the floating point registers since V0, D0 and
        S0 are the same register. *)
     and vReg = VReg of Word8.word
 
     (* A Label is a ref that is later set to the location.
        Several labels can be linked together so that they are only set
        at a single point.
        Only forward jumps are linked so when we come to finally set the
        label we will have the full list. *)
     type labels = Word.word ref list ref
 
     (* Condition codes.  The encoding is standard. *)
     datatype condition = CCode of Word8.word
     
     val condEqual           = CCode 0wx0 (* Z=1 *)
     and condNotEqual        = CCode 0wx1 (* Z=0 *)
     and condCarrySet        = CCode 0wx2 (* C=1 *)
     and condCarryClear      = CCode 0wx3 (* C=0 *)
     and condNegative        = CCode 0wx4 (* N=1 *)
     and condPositive        = CCode 0wx5 (* N=0 includes zero *)
     and condOverflow        = CCode 0wx6 (* V=1 *)
     and condNoOverflow      = CCode 0wx7 (* V=0 *)
     and condUnsignedHigher  = CCode 0wx8 (* C=1 && Z=0 *)
     and condUnsignedLowOrEq = CCode 0wx9 (* ! (C=1 && Z=0) *)
     and condSignedGreaterEq = CCode 0wxa (* N=V *)
     and condSignedLess      = CCode 0wxb (* N<>V *)
     and condSignedGreater   = CCode 0wxc (* Z==0 && N=V *)
     and condSignedLessEq    = CCode 0wxd (* !(Z==0 && N=V) *)
     (* use unconditional branches for the "always" cases. *)
     (* N.B. On subtraction and comparison the ARM uses an inverted carry
        flag for borrow.  The C flag is set if there is NO borrow.
        This is the reverse of the X86. *)
 
     (* Offsets in the assembly code interface pointed at by X26
        These are in units of 64-bits NOT bytes. *)
     val heapOverflowCallOffset  = 1
     and stackOverflowCallOffset = 2
     and stackOverflowXCallOffset= 3
     and exceptionHandlerOffset  = 5
     and stackLimitOffset        = 6
     and exceptionPacketOffset   = 7
     and threadIdOffset          = 8
     and heapLimitPtrOffset      = 42
     and heapAllocPtrOffset      = 43
     and mlStackPtrOffset        = 44
 
     (* Register number 31 stands for either the zero register or the hardware
        stack pointer, depending on the instruction.  Each function below maps
        a register to its number and raises InternalError for a register that
        is not permitted in that position. *)
     (* For positions where 31 means the zero register. *)
     fun xRegOrXZ reg =
         case reg of
             XReg num => num
         |   XZero => 0w31
         |   XSP => raise InternalError "XSP not valid here"
     
     (* For positions where 31 means the stack pointer. *)
     and xRegOrXSP reg =
         case reg of
             XReg num => num
         |   XSP => 0w31
         |   XZero => raise InternalError "XZero not valid here"
     
     (* For positions where it isn't clear: only numbered registers are accepted. *)
     and xRegOnly reg =
         case reg of
             XReg num => num
         |   XZero => raise InternalError "XZero not valid here"
         |   XSP => raise InternalError "XSP not valid here"
 
     val X0  = XReg 0w0  and X1  = XReg 0w1  and X2 = XReg 0w2   and X3  = XReg 0w3
     and X4  = XReg 0w4  and X5  = XReg 0w5  and X6 = XReg 0w6   and X7  = XReg 0w7
     and X8  = XReg 0w8  and X9  = XReg 0w9  and X10= XReg 0w10  and X11 = XReg 0w11
     and X12 = XReg 0w12 and X13 = XReg 0w13 and X14= XReg 0w14  and X15 = XReg 0w15
     and X16 = XReg 0w16 and X17 = XReg 0w17 and X18= XReg 0w18  and X19 = XReg 0w19
     and X20 = XReg 0w20 and X21 = XReg 0w21 and X22= XReg 0w22  and X23 = XReg 0w23
     and X24 = XReg 0w24 and X25 = XReg 0w25 and X26= XReg 0w26  and X27 = XReg 0w27
     and X28 = XReg 0w28 and X29 = XReg 0w29 and X30= XReg 0w30
     
     val X_MLHeapLimit       = X25 (* ML Heap limit pointer *)
     and X_MLAssemblyInt     = X26 (* ML assembly interface pointer. *)
     and X_MLHeapAllocPtr    = X27 (* ML Heap allocation pointer. *)
     and X_MLStackPtr        = X28 (* ML Stack pointer. *)
     and X_LinkReg           = X30 (* Link reg - return address *)
     and X_Base32in64        = X24 (* X24 is used for the heap base in 32-in-64. *)
     
     fun vReg(VReg v) = v
     (* Only the first eight registers are currently used by ML. *)
     val V0  = VReg 0w0  and V1  = VReg 0w1 and V2 = VReg 0w2   and V3  = VReg 0w3
     and V4  = VReg 0w4  and V5  = VReg 0w5 and V6 = VReg 0w6   and V7  = VReg 0w7
 
     (* Some data instructions include a possible shift. *)
     datatype shiftType =
         ShiftLSL of Word8.word
     |   ShiftLSR of Word8.word
     |   ShiftASR of Word8.word
     |   ShiftNone
 
     local
         (* The shift amount has to fit in the imm6 field: at most 63. *)
         fun checkImm6 amount =
             if amount <= 0w63 then amount
             else raise InternalError "shift > 63"
     in 
         (* Turn an optional shift into the pair (shift type code, shift amount).
            No shift is encoded as a left shift of zero. *)
         fun shiftEncode shift =
             case shift of
                 ShiftNone => (0w0, 0w0)
             |   ShiftLSL amount => (0w0, checkImm6 amount)
             |   ShiftLSR amount => (0w1, checkImm6 amount)
             |   ShiftASR amount => (0w2, checkImm6 amount)
     end
 
     (* Other instructions include an extension i.e. a sign- or zero-extended
        value from one of the argument registers.  When an extension is encoded
        there can also be a left shift which applies after the extension.
        I don't understand what difference, if any, there is between UXTX
        and SXTX.
        There's no ExtNone because we need to use either UXTW or UXTX depending
        on the length *)
     datatype 'a extend =
         ExtUXTB of 'a (* Unsigned extend byte *)
     |   ExtUXTH of 'a (* Unsigned extend halfword *)
     |   ExtUXTW of 'a (* Unsigned extend word *)
     |   ExtUXTX of 'a (* Left shift *)
     |   ExtSXTB of 'a (* Sign extend byte *)
     |   ExtSXTH of 'a (* Sign extend halfword *)
     |   ExtSXTW of 'a (* Sign extend word *)
     |   ExtSXTX of 'a (* Left shift *)
 
     (* Load/store instructions have only a single bit for the shift.  For byte
        operations this is one bit shift; for others it scales by the size of
        the operand if set. *)
     datatype scale =
         ScaleOrShift
     |   NoScale
 
     local
         (* Although there are three bits it seems that the shift is limited to 0 to 4. *)
         fun checkImm3 w = if w > 0w4 then raise InternalError "extend shift > 4" else w
     in
         (* Extension for arithmetic instructions: returns the pair
            (option code, shift amount).  The shift amount is checked and
            InternalError is raised if it is greater than 4. *)
         fun extendArithEncode(ExtUXTB w) = (0w0, checkImm3 w)
         |   extendArithEncode(ExtUXTH w) = (0w1, checkImm3 w)
         |   extendArithEncode(ExtUXTW w) = (0w2, checkImm3 w)
         |   extendArithEncode(ExtUXTX w) = (0w3, checkImm3 w)
         |   extendArithEncode(ExtSXTB w) = (0w4, checkImm3 w)
         |   extendArithEncode(ExtSXTH w) = (0w5, checkImm3 w)
         |   extendArithEncode(ExtSXTW w) = (0w6, checkImm3 w)
         |   extendArithEncode(ExtSXTX w) = (0w7, checkImm3 w)
         
         (* Extension for load/store instructions: returns the same option code
            paired with the argument of the constructor, which is passed
            through unchanged and unchecked. *)
         fun extendLSEncode(ExtUXTB v) = (0w0, v)
         |   extendLSEncode(ExtUXTH v) = (0w1, v)
         |   extendLSEncode(ExtUXTW v) = (0w2, v)
         |   extendLSEncode(ExtUXTX v) = (0w3, v)
         |   extendLSEncode(ExtSXTB v) = (0w4, v)
         |   extendLSEncode(ExtSXTH v) = (0w5, v)
         |   extendLSEncode(ExtSXTW v) = (0w6, v)
         |   extendLSEncode(ExtSXTX v) = (0w7, v)
     end
 
     datatype wordSize = WordSize32 | WordSize64
 
     (* Bit patterns on the ARM64 are encoded using a complicated scheme and
        only certain values can be encoded.  An element can be 2, 4, 8, 16, 32 or
        64 bits and must be a sequence of at least one zero bits followed by at
        least one one bit.  This sequence can then be rotated within the element.
        Finally the element is replicated within the register up to 32 or
        64 bits.  All this information is encoded in 13 bits.
        N.B. Bit patterns of all zeros or all ones cannot be encoded. *)
 
     (* Encode the value if it is possible.
        value is the 64-bit constant and sf says whether the register is
        32 or 64 bits.  The result is SOME{n, imms, immr} giving the fields of
        the encoding, or NONE if the value cannot be expressed as a bit pattern. *)
     fun encodeBitPattern(value, sf (* size flag *)) =
     (* Can't encode 0 or all ones. *)
     if value = 0w0 orelse value = Word64.notb 0w0
     then NONE
     (* If this is 32-bits we can't encode all ones in the
        low-order 32-bits or any value that won't fit in 32-bits, *)
     else if sf = WordSize32 andalso value >= 0wxffffffff
     then NONE
     else
     let
         val regSize = case sf of WordSize32 => 0w32 | WordSize64 => 0w64
         (* Get the element size.  Look for the repeat of the
            pattern. *)
         fun getElemSize size =
         let
             (* Compare the two halves of the current candidate element.  If they
                differ this is the element size; otherwise try the half size. *)
             val ns = size div 0w2
             val mask = Word64.<<(0w1, ns)  - 0w1
         in
             if Word64.andb(value, mask) <> Word64.andb(Word64.>>(value, ns), mask)
             then size
             else if ns <= 0w2
             then ns
             else getElemSize ns
         end
         val elemSize = getElemSize regSize
         fun log2 0w1 = 0w0 | log2 n = 0w1 + log2(Word.>>(n, 0w1))
         val elemBits = log2 elemSize
 
         (* Find the rotation that puts as many of the zero bits in the
            element at the top. *)
         val elemMask = Word64.>>(Word64.notb 0w0, 0w64-elemSize)
         (* Rotate the element by one bit to the right or to the left. *)
         fun ror elt =
             Word64.orb((Word64.<<(Word64.andb(elt, 0w1), elemSize-0w1),
                 Word64.>>(elt, 0w1)))
         and rol elt =
             Word64.orb(Word64.andb(elemMask, Word64.<<(elt, 0w1)),
                 Word64.>>(elt, elemSize-0w1))
 
         (* Keep rotating while that makes the element numerically smaller,
            keeping track of the rotation count. *)
         fun findRotation(v, n) =
             if ror v < v then findRotation(ror v, (n-0w1) mod elemSize)
             else if rol v < v then findRotation(rol v, n+0w1)
             else (v, n)
 
         val (rotated, rotation) = findRotation(Word64.andb(value, elemMask), 0w0)
 
         (* Count out the low order ones.  If the result is zero
            then we've got a valid sequence of zeros followed by ones
            but if we discover a zero bit and the result isn't zero
            then we can't encode this. *)
         fun countLowOrderOnes(v, n) =
             if v = 0w0
             then SOME n
             else if Word64.andb(v, 0w1) = 0w1
             then countLowOrderOnes(Word64.>>(v, 0w1), n+0w1)
             else NONE
      in
         case countLowOrderOnes(rotated, 0w0) of
             NONE => NONE
         |   SOME lowOrderOnes =>
             let
                 (* Encode the element size. *)
                 val elemSizeEnc = 0wx7f - (Word.<<(0w1, elemBits+0w1) - 0w1)
                 (* n is set only when bit 6 of the size encoding is clear. *)
                 val n = if Word.andb(elemSizeEnc, 0wx40) = 0w0 then 0w1 else 0w0
                 (* The low six bits combine the size encoding with the count of ones less one. *)
                 val imms = Word.andb(Word.orb(elemSizeEnc, lowOrderOnes-0w1), 0wx3f)
             in
                 SOME{n=n, imms=imms, immr=rotation}
             end
     end;
 
     (* Decode a pattern for printing.
        Takes the sf, n, immr and imms fields of an encoded bit pattern and
        returns the 64-bit value they represent.  Raises InternalError if the
        fields do not describe a valid pattern. *)
     fun decodeBitPattern{sf, n, immr, imms} =
     let
         (* Find the highest bit set in N:NOT(imms) *)
         fun highestBitSet 0w0 = 0
         |   highestBitSet n = 1+highestBitSet(Word32.>>(n, 0w1))
         val len = highestBitSet(Word32.orb(Word32.<<(n, 0w6), Word32.xorb(imms, 0wx3f))) - 1
         val _ = if len < 0 then raise InternalError "decodeBitPattern: invalid" else ()
         (* The element size in bits. *)
         val size = Word32.<<(0w1, Word.fromInt len)
         (* r is the rotation and s is one less than the number of one bits. *)
         val r = Word32.andb(immr, size-0w1)
         and s = Word32.andb(imms, size-0w1)
         val _ = if s = size-0w1 then raise InternalError "decodeBitPattern: invalid" else ()
         (* A run of s+1 one bits in the low-order part of the element. *)
         val pattern = Word64.<<(0w1, word32ToWord(s+0w1)) - 0w1
         (* Rotate right by one bit within the element: shift right and put
            the old low-order bit into the top bit of the element. *)
         fun ror elt =
             Word64.orb((Word64.<<(Word64.andb(elt, 0w1), word32ToWord(size-0w1)),
                 Word64.>>(elt, 0w1)))
 
         fun rotateBits(value, 0w0) = value
         |   rotateBits(value, n) = rotateBits(ror value, n-0w1)
 
         val rotated = rotateBits(pattern, r)
 
         val regSize = if sf = 0w0 then 0w32 else 0w64
 
         (* Replicate the rotated pattern to fill the register. *)
         fun replicate(pattern, size) =
             if size >= regSize
             then pattern
             else replicate(Word64.orb(pattern, Word64.<<(pattern, word32ToWord size)), size * 0w2)
     in
         replicate(rotated, size)
     end
 
     val isEncodableBitPattern = isSome o encodeBitPattern
 
 
     datatype instr =
         SimpleInstr of Word32.word
     |   LoadAddressLiteral of {reg: xReg, value: machineWord}
     |   LoadNonAddressLiteral of {reg: xReg, value: Word64.word}
     |   Label of labels
     |   UnconditionalBranch of labels
     |   ConditionalBranch of { label: labels, jumpCondition: condition, length: brLength ref }
     |   LoadLabelAddress of { label: labels, reg: xReg }
     |   TestBitBranch of { label: labels, bitNo: Word8.word, brNonZero: bool, reg: xReg, length: brLength ref }
     |   CompareBranch of { label: labels, brNonZero: bool, size: wordSize, reg: xReg, length: brLength ref }
     
     and brLength = BrShort | BrExtended
 
     val nopCode  = 0wxD503201F
     and undefCode = 0wx00000000 (* Permanently undefined instruction. *)
 
     (* Add/subtract an optionally shifted 12-bit immediate (i.e. constant) to/from a register.
        The constant is zero-extended.  The versions that do not set the flags can use XSP as
        the destination; the versions that set the flags can use XZero as the destination i.e.
        they discard the result and act as a comparison. *)
     local
         (* sf is the size bit (bit 31): 1 for the 64-bit forms and 0 for the "32" forms.
            oper (bit 30) is 0 for add and 1 for subtract.  s (bit 29) is 1 for the
            versions that set the flags.  xdOp converts the destination register to its
            number and so decides whether 31 is accepted as XSP or as XZero.
            "shifted" sets bit 22.
            Raises InternalError if immed does not fit in 12 bits. *)
         fun addSubRegImmediate(sf, oper, s, xdOp) ({regN, regD, immed, shifted}) =
         let
             val () =
                 if immed >= 0wx1000 then raise InternalError "addSubRegImmediate: immed > 12 bits" else ()
         in
             SimpleInstr(
                 0wx11000000 orb (sf << 0w31) orb (oper << 0w30) orb (s << 0w29) orb
                 (if shifted then 0wx400000 else 0w0) orb
                 (wordToWord32 immed << 0w10) orb (word8ToWord32(xRegOrXSP regN) << 0w5) orb
                 word8ToWord32(xdOp regD))
         end
     in
         val addImmediate = addSubRegImmediate(0w1, 0w0, 0w0, xRegOrXSP)
         and addSImmediate = addSubRegImmediate(0w1, 0w0, 0w1, xRegOrXZ)
         and subImmediate = addSubRegImmediate(0w1, 0w1, 0w0, xRegOrXSP)
         and subSImmediate = addSubRegImmediate(0w1, 0w1, 0w1, xRegOrXZ)
         and addImmediate32 = addSubRegImmediate(0w0, 0w0, 0w0, xRegOrXSP)
         and addSImmediate32 = addSubRegImmediate(0w0, 0w0, 0w1, xRegOrXZ)
         and subImmediate32 = addSubRegImmediate(0w0, 0w1, 0w0, xRegOrXSP)
         and subSImmediate32 = addSubRegImmediate(0w0, 0w1, 0w1, xRegOrXZ)
     end
 
     (* Add/subtract a shifted register, optionally setting the flags. *)
     local
         (* X31 is XZ here unlike the extended version.*)
         (* sf (bit 31) is 1 for the 64-bit forms, oper (bit 30) is 0 for add and 1 for
            subtract and s (bit 29) is 1 for the flag-setting forms.  The shift is applied
            to regM, which must be a numbered register; regN and regD may be XZero.
            Raises InternalError if the shift amount is out of range or a register
            is not valid in its position. *)
         fun addSubtractShiftedReg (sf, oper, s) ({regM, regN, regD, shift}) =
         let
             val (shift, imm6) = shiftEncode shift
         in
             SimpleInstr(0wx0b000000 orb (sf << 0w31) orb (oper << 0w30) orb (s << 0w29) orb
                 (shift << 0w22) orb (word8ToWord32(xRegOnly regM) << 0w16) orb
                 (word8ToWord32 imm6 << 0w10) orb (word8ToWord32(xRegOrXZ regN) << 0w5) orb
                 word8ToWord32(xRegOrXZ regD))
         end
     in
         val addShiftedReg = addSubtractShiftedReg(0w1, 0w0, 0w0)
         and addSShiftedReg = addSubtractShiftedReg(0w1, 0w0, 0w1)
         and subShiftedReg = addSubtractShiftedReg(0w1, 0w1, 0w0)
         and subSShiftedReg = addSubtractShiftedReg(0w1, 0w1, 0w1)
         and addShiftedReg32 = addSubtractShiftedReg(0w0, 0w0, 0w0)
         and addSShiftedReg32 = addSubtractShiftedReg(0w0, 0w0, 0w1)
         and subShiftedReg32 = addSubtractShiftedReg(0w0, 0w1, 0w0)
         and subSShiftedReg32 = addSubtractShiftedReg(0w0, 0w1, 0w1)
     end
 
     (* Add/subtract an extended register, optionally setting the flags. *)
     local
         (* SP can be used as Xn and also for Xd for the non-flags versions. *)
         (* sf (bit 31), oper (bit 30) and s (bit 29) select the size, add/subtract and
            flag-setting as in the other add/subtract forms.  opt goes into bits 23:22 and
            is zero for every instruction defined here.  xD converts the destination
            register and decides whether 31 means XSP or XZero.  The extension is applied
            to regM.  Raises InternalError if the extension shift is greater than 4. *)
         fun addSubtractExtendedReg (sf, oper, s, opt, xD) ({regM, regN, regD, extend}) =
         let
             val (option, imm3) = extendArithEncode extend
         in
             SimpleInstr(0wx0b200000 orb (sf << 0w31) orb (oper << 0w30) orb (s << 0w29) orb
                 (opt << 0w22) orb (word8ToWord32(xRegOnly regM) << 0w16) orb
                 (option << 0w13) orb (word8ToWord32 imm3 << 0w10) orb
                 (word8ToWord32(xRegOrXSP regN) << 0w5) orb
                 word8ToWord32(xD regD))
         end
     in
         val addExtendedReg = addSubtractExtendedReg(0w1, 0w0, 0w0, 0w0, xRegOrXSP)
         and addSExtendedReg = addSubtractExtendedReg(0w1, 0w0, 0w1, 0w0, xRegOrXZ)
         and subExtendedReg = addSubtractExtendedReg(0w1, 0w1, 0w0, 0w0, xRegOrXSP)
         and subSExtendedReg = addSubtractExtendedReg(0w1, 0w1, 0w1, 0w0, xRegOrXZ)
     end
 
     (* Logical operations on a shifted register. *)
     local
         (* sf (bit 31) is the size bit.  oper (bits 30:29) selects the operation:
            0 and, 1 orr, 2 eor, 3 ands.  n goes into bit 21 and is zero for all the
            instructions defined here; presumably it selects the inverted-argument
            versions mentioned below.  The shift is applied to regM.  All three
            registers may be XZero. *)
         fun logicalShiftedReg (sf, oper, n) ({regM, regN, regD, shift}) =
         let
             val (shift, imm6) = shiftEncode shift
         in
             SimpleInstr(0wx0a000000 orb (sf << 0w31) orb (oper << 0w29) orb
                 (shift << 0w22) orb (n << 0w21) orb (word8ToWord32(xRegOrXZ regM) << 0w16) orb
                 (word8ToWord32 imm6 << 0w10) orb (word8ToWord32(xRegOrXZ regN) << 0w5) orb
                 word8ToWord32(xRegOrXZ regD))
         end
     in
         val andShiftedReg = logicalShiftedReg(0w1, 0w0, 0w0)
         and orrShiftedReg = logicalShiftedReg(0w1, 0w1, 0w0)
         and eorShiftedReg = logicalShiftedReg(0w1, 0w2, 0w0)
         and andsShiftedReg = logicalShiftedReg(0w1, 0w3, 0w0)
         (* There are also versions that operate with an inverted version
            of the argument. *)
     end
 
     (* Two-source operations. *)
     local
         (* sf (bit 31) is 1 for the 64-bit forms and 0 for the "32" forms.  s goes into
            bit 29 and is zero for all the instructions here.  opcode (from bit 10)
            selects the operation.  All three registers must be numbered registers:
            InternalError is raised for XZero or XSP. *)
         fun twoSourceInstr (sf, s, opcode) ({regM, regN, regD}) =
             SimpleInstr(0wx1ac00000 orb (sf << 0w31) orb (s << 0w29) orb
                 (word8ToWord32(xRegOnly regM) << 0w16) orb (opcode << 0w10) orb
                 (word8ToWord32(xRegOnly regN) << 0w5) orb
                 word8ToWord32(xRegOnly regD))
     in
         (* Signed and unsigned division. *)
         val unsignedDivide   = twoSourceInstr(0w1, 0w0, 0wx2)
         and signedDivide     = twoSourceInstr(0w1, 0w0, 0wx3)
         and unsignedDivide32 = twoSourceInstr(0w0, 0w0, 0wx2)
         and signedDivide32   = twoSourceInstr(0w0, 0w0, 0wx3)
         (* Logical shift left Rd = Rn << (Rm mod 0w64) *)
         and logicalShiftLeftVariable = twoSourceInstr(0w1, 0w0, 0wx8)
         (* Logical shift right Rd = Rn >> (Rm mod 0w64) *)
         and logicalShiftRightVariable = twoSourceInstr(0w1, 0w0, 0wx9)
         (* Arithmetic shift right Rd = Rn ~>> (Rm mod 0w64) *)
         and arithmeticShiftRightVariable = twoSourceInstr(0w1, 0w0, 0wxa)
         and logicalShiftLeftVariable32 = twoSourceInstr(0w0, 0w0, 0wx8)
         and logicalShiftRightVariable32 = twoSourceInstr(0w0, 0w0, 0wx9)
         and arithmeticShiftRightVariable32 = twoSourceInstr(0w0, 0w0, 0wxa)
     end
 
     (* Three source operations.  These are all variations of multiply. *)
     local
         (* sf (bit 31) is the size bit, op54 goes into bits 30:29, op31 into
            bits 23:21 and o0 into bit 15.  In the instructions below o0 is 0 for the
            "add" forms and 1 for the "sub" forms.  regA may be XZero; the other
            registers must be numbered registers. *)
         fun threeSourceInstr (sf, op54, op31, o0) ({regM, regA, regN, regD}) =
             SimpleInstr(0wx1b000000 orb (sf << 0w31) orb (op54 << 0w29) orb
                 (op31 << 0w21) orb (word8ToWord32(xRegOnly regM) << 0w16) orb
                 (o0 << 0w15) orb (word8ToWord32(xRegOrXZ regA) << 0w10) orb
                 (word8ToWord32(xRegOnly regN) << 0w5) orb
                 word8ToWord32(xRegOnly regD))
     in
         (* regD = regA + regN * regM *)
         val multiplyAndAdd = threeSourceInstr(0w1, 0w0, 0w0, 0w0)
         (* regD = regA - regN * regM *)
         and multiplyAndSub = threeSourceInstr(0w1, 0w0, 0w0, 0w1)
         and multiplyAndAdd32 = threeSourceInstr(0w0, 0w0, 0w0, 0w0)
         and multiplyAndSub32 = threeSourceInstr(0w0, 0w0, 0w0, 0w1)
         (* Multiply two 32-bit quantities and add/subtract a 64-bit quantity. *)
         and signedMultiplyAndAddLong = threeSourceInstr(0w1, 0w0, 0w1, 0w0)
         and signedMultiplyAndSubLong = threeSourceInstr(0w1, 0w0, 0w1, 0w1)
         (* Return the high-order part of a signed multiplication. *)
         (* There is no accumulator for this one so regA is always XZero. *)
         fun signedMultiplyHigh({regM, regN, regD}) =
             threeSourceInstr(0w1, 0w0, 0w2, 0w0) { regM=regM, regN=regN, regD=regD, regA=XZero}
     end
 
     (* Loads: There are two versions of this on the ARM.  There is a version that
        takes a signed 9-bit byte offset and a version that takes an unsigned
        12-bit word offset. *)
     
     local
         (* The version with the unsigned 12-bit offset.
            size goes into bits 31:30; in the definitions below it is 0 for byte,
            1 for 16-bit, 2 for 32-bit/float and 3 for 64-bit/double operations.
            v (bit 26) is 1 for the floating point registers.  opc (bits 23:22) is 1
            for a load and 0 for a store.  xD converts the transfer register to its
            number.  The base register regN may be XSP.
            Raises InternalError unless 0 <= unitOffset < 0x1000. *)
         fun loadStoreRegScaled (size, v, opc, xD) ({regT, regN, unitOffset}) =
         let
             val _ = (unitOffset >= 0 andalso unitOffset < 0x1000)
                 orelse raise InternalError "loadStoreRegScaled: value out of range"
         in
             SimpleInstr(0wx39000000 orb (size << 0w30) orb (opc << 0w22) orb
                 (v << 0w26) orb (Word32.fromInt unitOffset << 0w10) orb
                 (word8ToWord32(xRegOrXSP regN) << 0w5) orb word8ToWord32(xD regT))
         end
     in
         val loadRegScaled = loadStoreRegScaled(0w3, 0w0, 0w1, xRegOrXZ)
         and storeRegScaled = loadStoreRegScaled(0w3, 0w0, 0w0, xRegOrXZ)
         (* (Unsigned) byte operations.  There are also signed versions. *)
         and loadRegScaledByte = loadStoreRegScaled (0w0, 0w0, 0w1, xRegOrXZ)
         and storeRegScaledByte = loadStoreRegScaled (0w0, 0w0, 0w0, xRegOrXZ)
         and loadRegScaled16 = loadStoreRegScaled (0w1, 0w0, 0w1, xRegOrXZ)
         and storeRegScaled16 = loadStoreRegScaled (0w1, 0w0, 0w0, xRegOrXZ)
         and loadRegScaled32 = loadStoreRegScaled (0w2, 0w0, 0w1, xRegOrXZ)
         and storeRegScaled32 = loadStoreRegScaled (0w2, 0w0, 0w0, xRegOrXZ)
         and loadRegScaledDouble = loadStoreRegScaled(0w3, 0w1, 0w1, vReg)
         and storeRegScaledDouble = loadStoreRegScaled(0w3, 0w1, 0w0, vReg)
         and loadRegScaledFloat = loadStoreRegScaled(0w2, 0w1, 0w1, vReg)
         and storeRegScaledFloat = loadStoreRegScaled(0w2, 0w1, 0w0, vReg)
     end    
 
     local
         (* Loads and stores with a signed byte offset.  This includes simple
            unscaled addresses, pre-indexing and post-indexing. *)
         (* op4 (bits 11:10) selects the addressing form: 0 unscaled, 1 post-indexed,
            3 pre-indexed.  xD converts the transfer register to its number.
            size (bits 31:30), v (bit 26) and opc (bits 23:22) select the instruction.
            The offset is held as a 9-bit two's complement value.
            Raises InternalError unless ~256 <= byteOffset < 256. *)
         fun loadStoreByteAddress (op4, xD) (size, v, opc) ({regT, regN, byteOffset}) =
         let
             val _ = (byteOffset >= ~256 andalso byteOffset < 256)
                 orelse raise InternalError "loadStoreUnscaled: value out of range"
             val imm9 = Word32.fromInt byteOffset andb 0wx1ff
         in
             SimpleInstr(0wx38000000 orb (size << 0w30) orb (opc << 0w22) orb
                 (v << 0w26) orb (imm9 << 0w12) orb (op4 << 0w10) orb
                 (word8ToWord32(xRegOrXSP regN) << 0w5) orb word8ToWord32(xD regT))
         end
         
         val loadStoreUnscaled = loadStoreByteAddress (0w0, xRegOrXZ)
         and loadStoreUnscaledSIMD = loadStoreByteAddress (0w0, vReg)
         and loadStorePostIndex = loadStoreByteAddress (0w1, xRegOrXZ)
         and loadStorePreIndex = loadStoreByteAddress (0w3, xRegOrXZ)
     in
         (* In the definitions below opc is 0 for a store, 1 for an unsigned load,
            2 for a load sign-extended to 64 bits and 3 for a load sign-extended
            to 32 bits. *)
         val loadRegUnscaled = loadStoreUnscaled (0w3, 0w0, 0w1)
         and storeRegUnscaled = loadStoreUnscaled (0w3, 0w0, 0w0)
         (* (Unsigned) byte operations.  There are also signed versions. *)
         and loadRegUnscaledByte = loadStoreUnscaled (0w0, 0w0, 0w1)
         and loadRegUnscaledSignedByteTo64 = loadStoreUnscaled (0w0, 0w0, 0w2)
         and loadRegUnscaledSignedByteTo32 = loadStoreUnscaled (0w0, 0w0, 0w3)
         and storeRegUnscaledByte = loadStoreUnscaled (0w0, 0w0, 0w0)
         and loadRegUnscaled16 = loadStoreUnscaled (0w1, 0w0, 0w1)
         and loadRegUnscaledSigned16To64 = loadStoreUnscaled (0w1, 0w0, 0w2)
         and loadRegUnscaledSigned16To32 = loadStoreUnscaled (0w1, 0w0, 0w3)
         and storeRegUnscaled16 = loadStoreUnscaled (0w1, 0w0, 0w0)
         and loadRegUnscaled32 = loadStoreUnscaled (0w2, 0w0, 0w1)
         and loadRegUnscaledSigned32To64 = loadStoreUnscaled (0w2, 0w0, 0w2)
         and storeRegUnscaled32 = loadStoreUnscaled (0w2, 0w0, 0w0)
         and loadRegUnscaledFloat = loadStoreUnscaledSIMD (0w2, 0w1, 0w1)
         and storeRegUnscaledFloat = loadStoreUnscaledSIMD (0w2, 0w1, 0w0)
         and loadRegUnscaledDouble = loadStoreUnscaledSIMD (0w3, 0w1, 0w1)
         and storeRegUnscaledDouble = loadStoreUnscaledSIMD (0w3, 0w1, 0w0)
 
         val loadRegPostIndex = loadStorePostIndex (0w3, 0w0, 0w1)
         and storeRegPostIndex = loadStorePostIndex (0w3, 0w0, 0w0)
         and loadRegPostIndex32 = loadStorePostIndex (0w2, 0w0, 0w1)
         and storeRegPostIndex32 = loadStorePostIndex (0w2, 0w0, 0w0)
         and loadRegPostIndexByte = loadStorePostIndex (0w0, 0w0, 0w1)
         and storeRegPostIndexByte = loadStorePostIndex (0w0, 0w0, 0w0)
 
         val loadRegPreIndex = loadStorePreIndex (0w3, 0w0, 0w1)
         and storeRegPreIndex = loadStorePreIndex (0w3, 0w0, 0w0)
         and loadRegPreIndex32 = loadStorePreIndex (0w2, 0w0, 0w1)
         and storeRegPreIndex32 = loadStorePreIndex (0w2, 0w0, 0w0)
         and loadRegPreIndexByte = loadStorePreIndex (0w0, 0w0, 0w1)
         and storeRegPreIndexByte = loadStorePreIndex (0w0, 0w0, 0w0)
     end
 
     (* Load/store with a register offset i.e. an index register. *)
     local
         (* size (bits 31:30), v (bit 26) and opc (bits 23:22) select the instruction
            as for the other load/store forms.  xD converts the transfer register.
            "option" is an extension applied to the index register regM whose argument
            says whether the index is scaled: ScaleOrShift sets bit 12, NoScale clears it.
            regM must be a numbered register; the base regN may be XSP. *)
         fun loadStoreRegRegisterOffset (size, v, opc, xD) ({regT, regN, regM, option}) =
         let
             val (opt, s) =
                 case extendLSEncode option of
                     (opt, ScaleOrShift) => (opt, 0w1) | (opt, NoScale) => (opt, 0w0)
         in
             SimpleInstr(0wx38200800 orb (size << 0w30) orb (v << 0w26) orb (opc << 0w22) orb
                 (word8ToWord32(xRegOnly regM) << 0w16) orb (opt << 0w13) orb (s << 0w12) orb
                 (word8ToWord32(xRegOrXSP regN) << 0w5) orb word8ToWord32(xD regT))
         end
     in
         val loadRegIndexed = loadStoreRegRegisterOffset(0w3, 0w0, 0w1, xRegOrXZ)
         and storeRegIndexed = loadStoreRegRegisterOffset(0w3, 0w0, 0w0, xRegOrXZ)
         and loadRegIndexedByte = loadStoreRegRegisterOffset(0w0, 0w0, 0w1, xRegOrXZ)
         and storeRegIndexedByte = loadStoreRegRegisterOffset(0w0, 0w0, 0w0, xRegOrXZ)
         and loadRegIndexed16 = loadStoreRegRegisterOffset(0w1, 0w0, 0w1, xRegOrXZ)
         and storeRegIndexed16 = loadStoreRegRegisterOffset(0w1, 0w0, 0w0, xRegOrXZ)
         and loadRegIndexed32 = loadStoreRegRegisterOffset(0w2, 0w0, 0w1, xRegOrXZ)
         and storeRegIndexed32 = loadStoreRegRegisterOffset(0w2, 0w0, 0w0, xRegOrXZ)
         and loadRegIndexedFloat = loadStoreRegRegisterOffset(0w2, 0w1, 0w1, vReg)
         and storeRegIndexedFloat = loadStoreRegRegisterOffset(0w2, 0w1, 0w0, vReg)
         and loadRegIndexedDouble = loadStoreRegRegisterOffset(0w3, 0w1, 0w1, vReg)
         and storeRegIndexedDouble = loadStoreRegRegisterOffset(0w3, 0w1, 0w0, vReg)
     end
 
     local
         (* Loads and stores with special ordering. *)
         (* size goes into bits 31:30 (3 for the 64-bit and 2 for the "32" forms below),
            o2 into bit 23, l into bit 22 (1 for the loads and 0 for the stores below),
            o1 into bit 21 and o0 into bit 15.  regS and regT2 are set to XZero, i.e.
            all ones in the field, by the instructions that do not use them. *)
         fun loadStoreExclusive(size, o2, l, o1, o0) {regS, regT2, regN, regT} =
             SimpleInstr(0wx08000000 orb (size << 0w30) orb (o2 << 0w23) orb (l << 0w22) orb
 
                 (o1 << 0w21) orb (word8ToWord32(xRegOrXZ regS) << 0w16) orb (o0 << 0w15) orb
                 (word8ToWord32(xRegOrXZ regT2) << 0w10) orb (word8ToWord32(xRegOrXSP regN) << 0w5) orb
                  word8ToWord32(xRegOrXZ regT))
     in
         fun loadAcquire{regN, regT} =
             loadStoreExclusive(0w3, 0w1, 0w1, 0w0, 0w1) {regS=XZero, regT2=XZero, regN=regN, regT=regT}
         and storeRelease{regN, regT} =
             loadStoreExclusive(0w3, 0w1, 0w0, 0w0, 0w1) {regS=XZero, regT2=XZero, regN=regN, regT=regT}
         and loadAcquire32{regN, regT} =
             loadStoreExclusive(0w2, 0w1, 0w1, 0w0, 0w1) {regS=XZero, regT2=XZero, regN=regN, regT=regT}
         and storeRelease32{regN, regT} =
             loadStoreExclusive(0w2, 0w1, 0w0, 0w0, 0w1) {regS=XZero, regT2=XZero, regN=regN, regT=regT}
 
         (* Acquire exclusive access to a memory location and load its current value *)
         and loadAcquireExclusiveRegister{regN, regT} =
             loadStoreExclusive(0w3, 0w0, 0w1, 0w0, 0w1) {regS=XZero, regT2=XZero, regN=regN, regT=regT}
         (* Release exclusive access and test whether it succeeded.  Sets regS to 0
            if successful otherwise 1, in which case we have to repeat the operation. *)
         and storeReleaseExclusiveRegister{regN, regS, regT} =
             loadStoreExclusive(0w3, 0w0, 0w0, 0w0, 0w1) {regS=regS, regT2=XZero, regN=regN, regT=regT}
     end
 
     local
         (* Load and store pairs.  The offsets are signed scaled values.
            op2 (bits 24:23) selects the addressing form: 2 for a plain signed offset,
            1 for post-indexed and 3 for pre-indexed.  opc (bits 31:30) gives the
            register size, v (bit 26) is 1 for the floating point registers and
            l (bit 22) is 1 for a load and 0 for a store.  rT converts each of the
            two transfer registers to its number.  The base regN may be XSP.
            The offset is held as a 7-bit two's complement value.
            Raises InternalError unless ~64 <= unitOffset < 64. *)
         fun loadStorePair op2 (opc, v, l, rT) {regT1, regT2, regN, unitOffset} =
         let
             val _ = (unitOffset >= ~64 andalso unitOffset < 64)
                 orelse raise InternalError "loadStorePair: value out of range"
             val imm7 = Word32.fromInt unitOffset andb 0wx7f
         in
            SimpleInstr(0wx28000000 orb (opc << 0w30) orb (v << 0w26) orb (op2 << 0w23) orb
             (l << 0w22) orb (imm7 << 0w15) orb (word8ToWord32(rT regT2) << 0w10) orb
             (word8ToWord32(xRegOrXSP regN) << 0w5) orb word8ToWord32(rT regT1))
         end
         
         fun loadStorePairOffset args = loadStorePair 0w2 args
         and loadStorePairPostIndexed args = loadStorePair 0w1 args
         and loadStorePairPreIndexed args = loadStorePair 0w3 args
     in
         (* 64-bit general registers: opc is 2. *)
         val storePairOffset = loadStorePairOffset(0w2, 0w0, 0w0, xRegOnly)
         and loadPairOffset =  loadStorePairOffset(0w2, 0w0, 0w1, xRegOnly)
         and storePairPostIndexed = loadStorePairPostIndexed(0w2, 0w0, 0w0, xRegOnly)
         and loadPairPostIndexed =  loadStorePairPostIndexed(0w2, 0w0, 0w1, xRegOnly)
         and storePairPreIndexed = loadStorePairPreIndexed(0w2, 0w0, 0w0, xRegOnly)
         and loadPairPreIndexed =  loadStorePairPreIndexed(0w2, 0w0, 0w1, xRegOnly)
         (* Floating point registers: opc is 0 for 32-bit (float) pairs ... *)
         and storePairOffsetFloat = loadStorePairOffset(0w0, 0w1, 0w0, vReg)
         and loadPairOffsetFloat = loadStorePairOffset(0w0, 0w1, 0w1, vReg)
         and storePairPostIndexedFloat = loadStorePairPostIndexed(0w0, 0w1, 0w0, vReg)
         and loadPairPostIndexesFloat = loadStorePairPostIndexed(0w0, 0w1, 0w1, vReg)
         and storePairPreIndexedFloat = loadStorePairPreIndexed(0w0, 0w1, 0w0, vReg)
         and loadPairPreIndexesFloat = loadStorePairPreIndexed(0w0, 0w1, 0w1, vReg)
         (* ... and 1 for 64-bit (double) pairs.  This applies to the plain-offset
            forms as well as to the indexed forms: with opc = 0 they would be
            identical to the float versions and transfer only 32 bits each. *)
         and storePairOffsetDouble = loadStorePairOffset(0w1, 0w1, 0w0, vReg)
         and loadPairOffsetDouble = loadStorePairOffset(0w1, 0w1, 0w1, vReg)
         and storePairPostIndexedDouble = loadStorePairPostIndexed(0w1, 0w1, 0w0, vReg)
         and loadPairPostIndexesDouble = loadStorePairPostIndexed(0w1, 0w1, 0w1, vReg)
         and storePairPreIndexedDouble = loadStorePairPreIndexed(0w1, 0w1, 0w0, vReg)
         and loadPairPreIndexesDouble = loadStorePairPreIndexed(0w1, 0w1, 0w1, vReg)
     end
 
     (* Load an address constant into a register.  Addresses are placed in the
        constant area at the end of the code so that the GC can find them. *)
     fun loadAddressConstant(destReg, address) =
         LoadAddressLiteral{value=address, reg=destReg}
 
     (* Load a non-address constant into a register.  The value may or may not
        be a tagged value. *)
     fun loadNonAddressConstant(destReg, constant) =
         LoadNonAddressLiteral{value=constant, reg=destReg}
 
     local
         (* Move wide immediate.  Places a 16-bit immediate in one of the 16-bit
            fields of the destination selected by "shift" (0, 16, 32 or 48 bits).
            sf is 1 for the 64-bit form and 0 for the 32-bit form; shifts of 32
            and 48 are only valid in the 64-bit form.  opc selects the variant.
            Raises InternalError if the shift is not valid for the register size
            or the immediate does not fit in 16 bits. *)
         fun moveWideImmediate(sf, opc) {regD, immediate, shift} =
         let
             (* hw is the shift divided by 16. *)
             val hw =
                 case (shift, sf) of
                     (0w0, _) => 0w0
                 |   (0w16, _) => 0w1
                 |   (0w32, 0w1) => 0w2
                 |   (0w48, 0w1) => 0w3
                 |   _ => raise InternalError "moveWideImmediate: invalid shift"
             val _ =
                 immediate <= 0wxffff orelse raise InternalError "moveWideImmediate: immediate too large"
         in
             SimpleInstr(0wx12800000 orb (sf << 0w31) orb (opc << 0w29) orb
                 (hw << 0w21) orb (wordToWord32 immediate << 0w5) orb word8ToWord32(xRegOnly regD))
         end
     in
         (* opc=0: move inverted; opc=2: move and zero other bits; opc=3: move and keep other bits. *)
         val moveNot32 = moveWideImmediate(0w0, 0w0)
         and moveZero32 = moveWideImmediate(0w0, 0w2)
         and moveKeep32 = moveWideImmediate(0w0, 0w3)
         and moveNot = moveWideImmediate(0w1, 0w0)
         and moveZero = moveWideImmediate(0w1, 0w2)
         and moveKeep = moveWideImmediate(0w1, 0w3)
     end
 
     (* Instructions involved in thread synchronisation. *)
     (* Yield inside a spin-lock. *)
     val yield = SimpleInstr 0wxD503203F
     (* Memory barrier. *)
     val dmbIsh = SimpleInstr 0wxD5033BBF
     
     (* Call through a register: jump to the address it holds and leave the
        address of the following instruction in X30. *)
     fun branchAndLinkReg(dest) =
     let
         val targetField = word8ToWord32(xRegOnly dest) << 0w5
     in
         SimpleInstr(0wxD63F0000 orb targetField)
     end
 
     (* Indirect jump to the address held in the register. *)
     fun branchRegister(dest) =
     let
         val targetField = word8ToWord32(xRegOnly dest) << 0w5
     in
         SimpleInstr(0wxD61F0000 orb targetField)
     end
 
     (* Indirect jump to the address held in the register, with a hint that
        this is a function return. *)
     fun returnRegister(dest) =
     let
         val targetField = word8ToWord32(xRegOnly dest) << 0w5
     in
         SimpleInstr(0wxD65F0000 orb targetField)
     end
 
     (* Mark the current position in the instruction stream with a label. *)
     fun setLabel lab = Label lab
 
     (* Create a fresh label: a reference to a list holding a single address cell. *)
     fun createLabel () = ref [ref 0w0]
 
     (* Branches are created in the extended form; setLabelsAndSizes switches
        them to the short form when the destination turns out to be in range. *)
 
     (* A conditional branch on the condition codes. *)
     fun conditionalBranch(cond, target) =
         ConditionalBranch{jumpCondition=cond, label=target, length=ref BrExtended}
 
     (* An unconditional branch. *)
     fun unconditionalBranch target = UnconditionalBranch target
 
     (* Put the address of a label into a register - used for handlers and cases. *)
     fun loadLabelAddress(destReg, target) = LoadLabelAddress{reg=destReg, label=target}
 
     (* Test a bit in a register and branch if it is zero. *)
     fun testBitBranchZero(testReg, bit, target) =
         TestBitBranch{reg=testReg, bitNo=bit, brNonZero=false, label=target, length=ref BrExtended}
 
     (* Test a bit in a register and branch if it is non-zero. *)
     fun testBitBranchNonZero(testReg, bit, target) =
         TestBitBranch{reg=testReg, bitNo=bit, brNonZero=true, label=target, length=ref BrExtended}
 
     (* Compare a 64-bit register with zero and branch if zero/nonzero. *)
     fun compareBranchZero(testReg, target) =
         CompareBranch{reg=testReg, size=WordSize64, brNonZero=false, label=target, length=ref BrExtended}
     fun compareBranchNonZero(testReg, target) =
         CompareBranch{reg=testReg, size=WordSize64, brNonZero=true, label=target, length=ref BrExtended}
 
     (* Compare a 32-bit register with zero and branch if zero/nonzero. *)
     fun compareBranchZero32(testReg, target) =
         CompareBranch{reg=testReg, size=WordSize32, brNonZero=false, label=target, length=ref BrExtended}
     fun compareBranchNonZero32(testReg, target) =
         CompareBranch{reg=testReg, size=WordSize32, brNonZero=true, label=target, length=ref BrExtended}
     
 
     (* Conditional select.  The destination receives regTrue if the condition
        holds and otherwise a, possibly modified, copy of regFalse.  The variants
        leave the false value unmodified, incremented, inverted or negated. *)
     local
         fun conditionalSelect (sf, opc, op2) {regD, regFalse, regTrue, cond=CCode cond} =
         let
             val falseField = word8ToWord32(xRegOrXZ regFalse) << 0w16
             val condField = word8ToWord32 cond << 0w12
             val trueField = word8ToWord32(xRegOrXZ regTrue) << 0w5
             val destField = word8ToWord32(xRegOrXZ regD)
         in
             SimpleInstr(0wx1A800000 orb (sf << 0w31) orb (opc << 0w30) orb
                 falseField orb condField orb (op2 << 0w10) orb trueField orb destField)
         end
     in
         (* All of these are the 64-bit (sf=1) forms. *)
         val conditionalSet = conditionalSelect(0w1, 0w0, 0w0)
         val conditionalSetIncrement = conditionalSelect(0w1, 0w0, 0w1)
         val conditionalSetInverted = conditionalSelect(0w1, 0w1, 0w0)
         val conditionalSetNegated = conditionalSelect(0w1, 0w1, 0w1)
     end
 
     (* This combines the effect of a left and right shift.  There are various
        derived forms of this depending on the relative values of immr and imms.
        if imms >= immr copies imms-immr+1 bits from bit position immr to the lsb
        bits of the destination.
        if imms < immr copies imms+1 bits from the lsb bit to bit position
        regsize-immr.
        How the remaining bits are affected depends on the instruction.
        BitField instructions do not affect other bits.
        UnsignedBitField instructions zero other bits.
        SignedBitField instructions set the high order bits to a copy of
        the high order bit copied and zero the low order bits. *)
     local
         (* Common encoder for the bitfield move instructions.  sf and n select the
            register size, opc the variant; immr and imms are the raw rotate and
            bit-position fields. *)
         fun bitfield (sf, opc, n) {immr, imms, regN, regD} =
         let
             val immrField = wordToWord32 immr << 0w16
             val immsField = wordToWord32 imms << 0w10
             val sourceField = word8ToWord32(xRegOrXZ regN) << 0w5
             val destField = word8ToWord32(xRegOrXZ regD)
         in
             SimpleInstr(0wx13000000 orb (sf << 0w31) orb (opc << 0w29) orb (n << 0w22) orb
                 immrField orb immsField orb sourceField orb destField)
         end
 
         (* opc=0 signed, opc=1 plain bitfield, opc=2 unsigned. *)
         val signedBitfieldMove32 = bitfield(0w0, 0w0, 0w0)
         val bitfieldMove32 = bitfield(0w0, 0w1, 0w0)
         val unsignedBitfieldMove32 = bitfield(0w0, 0w2, 0w0)
         val signedBitfieldMove64 = bitfield(0w1, 0w0, 0w1)
         val bitfieldMove64 = bitfield(0w1, 0w1, 0w1)
         val unsignedBitfieldMove64 = bitfield(0w1, 0w2, 0w1)
     in
         (* Logical shift left by a constant. *)
         fun logicalShiftLeft{shift, regN, regD} =
             unsignedBitfieldMove64
                 {immr=Word.~ shift mod 0w64, imms=0w63-shift, regN=regN, regD=regD}
         fun logicalShiftLeft32{shift, regN, regD} =
             unsignedBitfieldMove32
                 {immr=Word.~ shift mod 0w32, imms=0w31-shift, regN=regN, regD=regD}
 
         (* Logical shift right by a constant. *)
         fun logicalShiftRight{shift, regN, regD} =
             unsignedBitfieldMove64{immr=shift, imms=0wx3f, regN=regN, regD=regD}
         fun logicalShiftRight32{shift, regN, regD} =
             unsignedBitfieldMove32{immr=shift, imms=0wx1f, regN=regN, regD=regD}
 
         (* Copy the low-order "width" bits to position "lsb", zeroing the rest. *)
         fun unsignedBitfieldInsertinZeros{lsb, width, regN, regD} =
             unsignedBitfieldMove64
                 {immr=Word.~ lsb mod 0w64, imms=width-0w1, regN=regN, regD=regD}
         fun unsignedBitfieldInsertinZeros32{lsb, width, regN, regD} =
             unsignedBitfieldMove32
                 {immr=Word.~ lsb mod 0w32, imms=width-0w1, regN=regN, regD=regD}
 
         (* Arithmetic shift right by a constant. *)
         fun arithmeticShiftRight{shift, regN, regD} =
             signedBitfieldMove64{immr=shift, imms=0wx3f, regN=regN, regD=regD}
         fun arithmeticShiftRight32{shift, regN, regD} =
             signedBitfieldMove32{immr=shift, imms=0wx1f, regN=regN, regD=regD}
 
         (* Extract "width" bits starting at "lsb" as a signed value. *)
         fun signedBitfieldExtract{lsb, width, regN, regD} =
             signedBitfieldMove64{immr=lsb, imms=lsb+width-0w1, regN=regN, regD=regD}
 
         (* Copy the low-order "width" bits to position "lsb", leaving other bits unchanged. *)
         fun bitfieldInsert{lsb, width, regN, regD} =
             bitfieldMove64{immr=Word.~ lsb mod 0w64, imms=width-0w1, regN=regN, regD=regD}
         fun bitfieldInsert32{lsb, width, regN, regD} =
             bitfieldMove32{immr=Word.~ lsb mod 0w32, imms=width-0w1, regN=regN, regD=regD}
     end
 
     local
         (* Logical operations with an immediate operand: AND, OR, XOR and ANDS.
            The immediate is assumed to have been checked as encodable already;
            InternalError is raised if it cannot be encoded.  s selects the
            register size and xD converts the destination, which allows the
            non-flag-setting versions to use SP as the destination. *)
         fun logicalImmediate (s, opc, xD) {bits, regN, regD} =
         let
             val operandSize = if s = 0w0 then WordSize32 else WordSize64
             val {n, imms, immr} =
                 case encodeBitPattern(bits, operandSize) of
                     SOME encoding => encoding
                 |   NONE => raise InternalError "testBitPattern: unable to encode bit pattern"
             val immrField = wordToWord32 immr << 0w16
             val immsField = wordToWord32 imms << 0w10
             val sourceField = word8ToWord32(xRegOrXZ regN) << 0w5
             val destField = word8ToWord32(xD regD)
         in
             SimpleInstr(0wx12000000 orb (opc << 0w29) orb (s << 0w31) orb (n << 0w22) orb
                 immrField orb immsField orb sourceField orb destField)
         end
     in
         (* 64-bit forms. *)
         val bitwiseAndImmediate = logicalImmediate (0w1, 0w0, xRegOrXSP)
         val bitwiseOrImmediate = logicalImmediate (0w1, 0w1, xRegOrXSP)
         val bitwiseXorImmediate = logicalImmediate (0w1, 0w2, xRegOrXSP)
         val bitwiseAndSImmediate = logicalImmediate (0w1, 0w3, xRegOrXZ)
         (* 32-bit forms. *)
         val bitwiseAndImmediate32 = logicalImmediate (0w0, 0w0, xRegOrXSP)
         val bitwiseOrImmediate32 = logicalImmediate (0w0, 0w1, xRegOrXSP)
         val bitwiseXorImmediate32 = logicalImmediate (0w0, 0w2, xRegOrXSP)
         val bitwiseAndSImmediate32 = logicalImmediate (0w0, 0w3, xRegOrXZ)
 
         (* Test a bit pattern in a register, setting the flags and discarding the
            result.  A 32-bit test is used if the pattern lies within the
            low-order 32 bits. *)
         fun testBitPattern(reg, bits) =
         let
             val operands = {bits=bits, regN=reg, regD=XZero}
         in
             if bits <= 0wxffffffff
             then bitwiseAndSImmediate32 operands
             else bitwiseAndSImmediate operands
         end
     end
 
     local
         (* Floating point operations with two source registers.  pt is 0 for
            single precision and 1 for double; opc selects the operation. *)
         fun floatingPoint2Source (pt, opc) {regM, regN, regD} =
         let
             val secondField = word8ToWord32(vReg regM) << 0w16
             val firstField = word8ToWord32(vReg regN) << 0w5
             val destField = word8ToWord32(vReg regD)
         in
             SimpleInstr(0wx1E200800 orb (pt << 0w22) orb secondField orb
                 (opc << 0w12) orb firstField orb destField)
         end
     in
         (* Single precision. *)
         val multiplyFloat = floatingPoint2Source(0w0, 0wx0)
         val divideFloat = floatingPoint2Source(0w0, 0wx1)
         val addFloat = floatingPoint2Source(0w0, 0wx2)
         val subtractFloat = floatingPoint2Source(0w0, 0wx3)
         (* Double precision. *)
         val multiplyDouble = floatingPoint2Source(0w1, 0wx0)
         val divideDouble = floatingPoint2Source(0w1, 0wx1)
         val addDouble = floatingPoint2Source(0w1, 0wx2)
         val subtractDouble = floatingPoint2Source(0w1, 0wx3)
     end
 
     local
         (* Move between a floating point and a general register with or without
            conversion.  rN and rD convert the source and destination registers
            to register numbers; the remaining arguments are raw encoding fields. *)
         fun fmoveGeneral (sf, s, ptype, mode, opcode, rN, rD) {regN, regD} =
         let
             val sourceField = word8ToWord32(rN regN) << 0w5
             val destField = word8ToWord32(rD regD)
         in
             SimpleInstr(0wx1E200000 orb (sf << 0w31) orb (s << 0w29) orb (ptype << 0w22) orb
                 (mode << 0w19) orb (opcode << 0w16) orb sourceField orb destField)
         end
         open IEEEReal
 
         (* Floating point to signed integer conversion for a given integer size
            (sf) and floating point precision (ptype).  The rounding mode selects
            the instruction. *)
         fun floatToInt (sf, ptype) rounding =
             case rounding of
                 TO_NEAREST => fmoveGeneral(sf, 0w0, ptype, 0w0, 0w4, vReg, xRegOnly) (* fcvtas *)
             |   TO_NEGINF => fmoveGeneral(sf, 0w0, ptype, 0w2, 0w0, vReg, xRegOnly) (* fcvtms *)
             |   TO_POSINF => fmoveGeneral(sf, 0w0, ptype, 0w1, 0w0, vReg, xRegOnly) (* fcvtps *)
             |   TO_ZERO => fmoveGeneral(sf, 0w0, ptype, 0w3, 0w0, vReg, xRegOnly) (* fcvtzs *)
     in
         (* Moves without conversion *)
         val moveGeneralToFloat = fmoveGeneral(0w0, 0w0, 0w0, 0w0, 0w7, xRegOrXZ, vReg)
         val moveFloatToGeneral = fmoveGeneral(0w0, 0w0, 0w0, 0w0, 0w6, vReg, xRegOnly)
         val moveGeneralToDouble = fmoveGeneral(0w1, 0w0, 0w1, 0w0, 0w7, xRegOrXZ, vReg)
         val moveDoubleToGeneral = fmoveGeneral(0w1, 0w0, 0w1, 0w0, 0w6, vReg, xRegOnly)
         (* Moves with conversion - signed.  The argument is a 64-bit value. *)
         val convertIntToFloat = fmoveGeneral(0w1, 0w0, 0w0, 0w0, 0w2, xRegOrXZ, vReg)
         val convertIntToDouble = fmoveGeneral(0w1, 0w0, 0w1, 0w0, 0w2, xRegOrXZ, vReg)
         (* As above but the argument is a 32-bit value. *)
         val convertInt32ToFloat = fmoveGeneral(0w0, 0w0, 0w0, 0w0, 0w2, xRegOrXZ, vReg)
         val convertInt32ToDouble = fmoveGeneral(0w0, 0w0, 0w1, 0w0, 0w2, xRegOrXZ, vReg)
 
         (* Conversions to a 64-bit integer. *)
         fun convertFloatToInt rounding = floatToInt(0w1, 0w0) rounding
         fun convertDoubleToInt rounding = floatToInt(0w1, 0w1) rounding
         (* Conversions to a 32-bit integer. *)
         fun convertFloatToInt32 rounding = floatToInt(0w0, 0w0) rounding
         fun convertDoubleToInt32 rounding = floatToInt(0w0, 0w1) rounding
     end
 
     local
         (* Floating point comparison of two registers.  ptype is 0 for single
            precision and 1 for double. *)
         fun floatingPtCompare(ptype, opc) {regM, regN} =
         let
             val secondField = word8ToWord32(vReg regM) << 0w16
             val firstField = word8ToWord32(vReg regN) << 0w5
         in
             SimpleInstr(0wx1E202000 orb (ptype << 0w22) orb secondField orb firstField orb
                 (opc << 0w3))
         end
     in
         val compareFloat = floatingPtCompare(0w0, 0w0) (* fcmp *)
         val compareDouble = floatingPtCompare(0w1, 0w0)
         (* It is also possible to compare a single register with zero using opc=1/3 *)
     end
 
     local
         (* Floating point operations with a single source register.  ptype is
            the precision of the source: 0 for single, 1 for double. *)
         fun floatingPtSingle (ptype, opc) {regN, regD} =
         let
             val sourceField = word8ToWord32(vReg regN) << 0w5
             val destField = word8ToWord32(vReg regD)
         in
             SimpleInstr(0wx1E204000 orb (ptype << 0w22) orb (opc << 0w15) orb
                 sourceField orb destField)
         end
     in
         (* Single precision source. *)
         val moveFloatToFloat = floatingPtSingle(0w0, 0wx0)
         val absFloat = floatingPtSingle(0w0, 0wx1)
         val negFloat = floatingPtSingle(0w0, 0wx2)
         val convertFloatToDouble = floatingPtSingle(0w0, 0wx5)
         (* Double precision source. *)
         val moveDoubleToDouble = floatingPtSingle(0w1, 0wx0)
         val absDouble = floatingPtSingle(0w1, 0wx1)
         val negDouble = floatingPtSingle(0w1, 0wx2)
         val convertDoubleToFloat = floatingPtSingle(0w1, 0wx4)
     end
 
     (* Build the word that follows a call to the RTS trap-handler.  All the
        registers are saved and restored across the call; the mask has a bit set
        for each register in the list, i.e. those that may contain an address and
        so need to be scanned and possibly updated if there is a GC. *)
     fun registerMask(regs) =
     let
         val maskWord =
             List.foldl (fn (r, acc) => acc orb (0w1 << word8ToWord(xRegOnly r))) 0w0 regs
     in
         SimpleInstr(0wx02000000 (* Reserved instr range. *) orb maskWord)
     end
     
+    (* Load the global heap base into a register at the start of a callback.
+       This is a bit of a hack but is the only way round the problem that when a
+       callback (FFI closure) is called the code has none of the global registers.
+       With native addressing that does not matter because absolute addresses are
+       available, but in 32-in-64 at least one absolute address is needed to
+       begin.  The global heap base pointer is embedded as a constant in the
+       non-address constant area, so the RTS must be able to find it and update
+       it when the code is loaded.  The result is a nop followed by the
+       pc-relative load and it MUST be the first instruction in the code.
+       Raises InternalError if called with native addressing. *)
+    local
+        val getHeapBase: unit -> LargeWord.word = RunCall.rtsCallFull0 "PolyGetHeapBase"
+    in
+        fun loadGlobalHeapBaseInCallback reg =
+            if not is32in64
+            then raise InternalError "loadGlobalHeapBaseInCallback called with native addressing"
+            else
+            let
+                val heapBase = getHeapBase()
+            in
+                [SimpleInstr nopCode, loadNonAddressConstant(reg, heapBase)]
+            end
+    end
 
     (* The number of 32-bit words each operation occupies in the generated code.
        Branches take two words in their extended form and one when short. *)
     fun codeSize instr =
         case instr of
             SimpleInstr _ => 1
         |   LoadAddressLiteral _ => if is32in64 then 1 else 2
         |   LoadNonAddressLiteral _ => 1
         |   Label _ => 0
         |   UnconditionalBranch _ => 1
         |   LoadLabelAddress _ => 1
         |   ConditionalBranch { length=ref BrShort, ...} => 1
         |   ConditionalBranch { length=ref BrExtended, ...} => 2
         |   TestBitBranch { length=ref BrShort, ...} => 1
         |   TestBitBranch { length=ref BrExtended, ...} => 2
         |   CompareBranch { length=ref BrShort, ...} => 1
         |   CompareBranch { length=ref BrExtended, ...} => 2
 
     (* Store a 32-bit value in the code at the given 32-bit word index.  The
        bytes are written least-significant first. *)
     fun writeInstr(value, wordAddr, seg) =
     let
         val byteBase = Word.<<(wordAddr, 0w2)
         fun storeFrom(remaining, n) =
             if n = 0w4 then ()
             else
             (
                 byteVecSet(seg, byteBase+n, word32ToWord8(remaining andb 0wxff));
                 storeFrom(remaining >> 0w8, n+0w1)
             )
     in
         storeFrom(value, 0w0)
     end
     
     (* Store a 64-bit constant in the code area at the given 64-bit word index.
        The bytes are written least-significant first. *)
     fun write64Bit(value, word64Addr, seg) =
     let
         val byteBase = Word.<<(word64Addr, 0w3)
         fun storeFrom(remaining, n) =
             if n = 0w8 then ()
             else
             (
                 byteVecSet(seg, byteBase+n, Word8.fromLarge(Word64.toLarge remaining));
                 storeFrom(Word64.>>(remaining, 0w8), n+0w1)
             )
     in
         storeFrom(value, 0w0)
     end
 
     (* Set the label addresses and choose the short or extended form for each
        branch depending on the distance to its destination.  Returns the size
        of the code in 32-bit words. *)
     fun setLabelsAndSizes ops =
     let
         (* Store the current address in every label and return the total size. *)
         fun setLabels([], ic) = ic
         |   setLabels(Label(ref labs) :: rest, ic) = (List.app(fn d => d := ic) labs; setLabels(rest, ic))
         |   setLabels(oper :: rest, ic) = setLabels(rest, ic + Word.fromInt(codeSize oper))
 
         (* Switch a branch at "addr" to the short form if the distance to its
            destination lies in the range -range <= offset < range. *)
         fun shortenIfNear(lengthRef, labs, addr, range) =
         let
             val offset = Word.toInt(!(hd labs)) - Word.toInt addr
         in
             if offset >= ~ range andalso offset < range
             then lengthRef := BrShort
             else ()
         end
 
         (* See if we can shorten any branches.  The "addr" is the original address since
            that's what we've used to set the labels, so an extended branch always
            advances the address by two: its size BEFORE any adjustment. *)
         fun adjust([], _) = ()
 
         |   adjust(ConditionalBranch { length as ref BrExtended, label=ref labs, ...} :: instrs, addr) =
             (
                 shortenIfNear(length, labs, addr, Word32.toInt(0w1 << 0w18));
                 adjust(instrs, addr + 0w2)
             )
 
         |   adjust(TestBitBranch { length as ref BrExtended, label=ref labs, ...} :: instrs, addr) =
             (
                 shortenIfNear(length, labs, addr, 0x2000);
                 adjust(instrs, addr + 0w2)
             )
 
         |   adjust(CompareBranch { length as ref BrExtended, label=ref labs, ...} :: instrs, addr) =
             (
                 shortenIfNear(length, labs, addr, 0x40000);
                 adjust(instrs, addr + 0w2)
             )
 
         |   adjust(instr :: instrs, addr) = adjust(instrs, addr + Word.fromInt(codeSize instr))
 
         (* Set the labels and adjust the sizes, repeating until it never gets smaller *)
         fun setLabAndSize(ops, lastSize) =
         let
             val () = adjust(ops, 0w0)
             val nextSize = setLabels(ops, 0w0)
         in
             case Word.compare(nextSize, lastSize) of
                 LESS => setLabAndSize(ops, nextSize)
             |   EQUAL => lastSize
             |   GREATER => raise InternalError "setLabAndSize - size increased"
         end
     in
         setLabAndSize(ops, setLabels(ops, 0w0))
     end
 
     (* Assemble a list of operations into a byte vector.
        ops is the instruction list.  Only the number of entries in addressConsts
        is used here, to size the segment and locate each address constant; the
        values in nonAddressConsts are copied into the segment after the code.
        Returns the byte vector and the combined size, in 64-bit words, of the
        code (including the UDF end marker) and the non-address constants.
        Raises InternalError if a branch or constant is out of range for its
        instruction. *)
     fun genCode(ops, addressConsts, nonAddressConsts) =
     let
         val codeSize = setLabelsAndSizes ops (* Number of 32-bit instructions *)
         val wordsOfCode = (codeSize + 0w2) div 0w2 (* Round up to 64-bits with the UDF marker(s) added. *)
         (* Put one or two UDF instructions at the end as markers. *)
         val endOfCodeWords =
             if Word.andb(codeSize, 0w1) = 0w0 then [SimpleInstr undefCode, SimpleInstr undefCode] else [SimpleInstr undefCode]
 
         val numNonAddrConsts = Word.fromInt(List.length nonAddressConsts)
         and numAddrConsts = Word.fromInt(List.length addressConsts) (* 32-bit words. *)
 
         (* Segment size in Poly words. *)
         val segSize =
             (wordsOfCode + numNonAddrConsts) * wordsPerNativeWord + numAddrConsts + 0w4 (* 4 extra words *)
         val codeVec = byteVecMake segSize
 
         (* Test bit and branch.  The bit number is split: bit 5 goes in bit 31 of the
            instruction and the low-order five bits in bits 19-23.  Only five bits
            may be placed in the lower field since bit 24 is the zero/non-zero flag. *)
         fun testBit(bitNo, brNonZero, offset, reg) =
             0wx36000000 orb (if bitNo >= 0w32 then 0wx80000000 else 0w0) orb
                 (if brNonZero then 0wx01000000 else 0w0) orb
                 (word8ToWord32(Word8.andb(bitNo, 0wx1f)) << 0w19) orb
                 ((offset andb 0wx3fff) << 0w5) orb word8ToWord32(xRegOnly reg)
         (* Compare with zero and branch. *)
         and compareBranch(size, brNonZero, offset, reg) =
             0wx34000000 orb (case size of WordSize64 => 0wx80000000 | WordSize32 => 0w0) orb
                 (if brNonZero then 0wx01000000 else 0w0) orb
                 ((offset andb 0wx7ffff) << 0w5) orb word8ToWord32(xRegOnly reg)
 
         (* Write out each operation.  wordNo is the index of the next 32-bit word
            of code; aConstNum and nonAConstNum count the address and non-address
            constants referenced so far. *)
         fun genCodeWords([], _ , _, _) = ()
 
         |   genCodeWords(SimpleInstr code :: tail, wordNo, aConstNum, nonAConstNum) =
             (
                 writeInstr(code, wordNo, codeVec);
                 genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum)
             )
 
         |   genCodeWords(LoadAddressLiteral{reg, ...} :: tail, wordNo, aConstNum, nonAConstNum) =
             let
                 (* The offset is in 32-bit words.  The first of the constants is
                    at offset wordsOfCode+3.  Non-address constants are always 8 bytes but
                    address constants are 4 bytes in 32-in-64. *)
                 val s = if is32in64 then 0w0 else 0w1 (* Load 64-bit word in 64-bit mode and 32-bits in 32-in-64. *)
                 val offsetOfConstant =
                     (wordsOfCode+numNonAddrConsts)*0w2 + (0w3+aConstNum)*(Address.wordSize div 0w4) - wordNo
                 (* The offset field is a signed 19-bit word offset so a forward
                    offset must be less than 2^18. *)
                 val _ = offsetOfConstant < 0wx40000 orelse raise InternalError "Offset to constant is too large"
                 val code =
                     0wx18000000 orb (s << 0w30) orb (wordToWord32 offsetOfConstant << 0w5) orb word8ToWord32(xRegOnly reg)
             in
                 writeInstr(code, wordNo, codeVec);
                 (* On native ARM64 we may need to split off the constant area so that the
                    code area is position-independent.  That requires references to the
                    constant area to be patched to use ADRP+LDR. *)
                 if is32in64
                 then genCodeWords(tail, wordNo+0w1, aConstNum+0w1, nonAConstNum)
                 else
                 (
                     writeInstr(nopCode, wordNo+0w1, codeVec);
                     genCodeWords(tail, wordNo+0w2, aConstNum+0w1, nonAConstNum)
                 )
             end
 
         |   genCodeWords(LoadNonAddressLiteral{reg, ...} :: tail, wordNo, aConstNum, nonAConstNum) =
             let
                 (* The offset is in 32-bit words.  These are always 64-bits. *)
                 val offsetOfConstant = (wordsOfCode+nonAConstNum)*0w2 - wordNo
                 (* Signed 19-bit word offset as for address constants. *)
                 val _ = offsetOfConstant < 0wx40000 orelse raise InternalError "Offset to constant is too large"
                 val code = 0wx58000000 orb (wordToWord32 offsetOfConstant << 0w5) orb word8ToWord32(xRegOnly reg)
             in
                 writeInstr(code, wordNo, codeVec);
                 genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum+0w1)
             end
 
         |   genCodeWords(Label _ :: tail, wordNo, aConstNum, nonAConstNum) =
                 genCodeWords(tail, wordNo, aConstNum, nonAConstNum) (* No code. *)
 
         |   genCodeWords(UnconditionalBranch(ref labs) :: tail, wordNo, aConstNum, nonAConstNum) =
             let
                 val dest = !(hd labs)
                 val offset = Word.toInt dest - Word.toInt wordNo
                 val _ = (offset < Word32.toInt(0w1 << 0w25) andalso offset >= ~ (Word32.toInt(0w1 << 0w25)))
                     orelse raise InternalError "genCodeWords: branch too far"
             in
                 writeInstr(0wx14000000 orb (Word32.fromInt offset andb 0wx03ffffff), wordNo, codeVec);
                 genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum)
             end
 
         |   genCodeWords(ConditionalBranch{ label=ref labs, jumpCondition=CCode cond, length=ref BrShort }:: tail, wordNo,
                             aConstNum, nonAConstNum) =
             let
                 val dest = !(hd labs)
                 val offset = Word.toInt dest - Word.toInt wordNo
                 val _ = (offset < Word32.toInt(0w1 << 0w18) andalso offset >= ~ (Word32.toInt(0w1 << 0w18)))
                         orelse raise InternalError "genCodeWords: branch too far"
             in
                 writeInstr(0wx54000000 orb ((Word32.fromInt offset andb 0wx07ffff) << 0w5)
                         orb word8ToWord32 cond, wordNo, codeVec);
                 genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum)
             end
 
         |   genCodeWords(ConditionalBranch{ label=ref labs, jumpCondition=CCode cond, length=ref BrExtended }:: tail, wordNo,
                             aConstNum, nonAConstNum) =
             let (* Long form - put a conditional branch with reversed sense round an unconditional branch. *)
                 val dest = !(hd labs)
                 val offset = Word.toInt dest - Word.toInt (wordNo + 0w1) (* Next instruction. *)
                 val _ = (offset < Word32.toInt(0w1 << 0w25) andalso offset >= ~ (Word32.toInt(0w1 << 0w25)))
                     orelse raise InternalError "genCodeWords: branch too far"
                 val revCond = Word8.xorb(cond, 0w1)
             in
                 writeInstr(0wx54000000 orb (0w2 << 0w5) orb word8ToWord32 revCond, wordNo, codeVec);
                 writeInstr(0wx14000000 orb (Word32.fromInt offset andb 0wx03ffffff), wordNo+0w1, codeVec);
                 genCodeWords(tail, wordNo+0w2, aConstNum, nonAConstNum)
             end
 
         |   genCodeWords(LoadLabelAddress{label=ref labs, reg} :: tail, wordNo, aConstNum, nonAConstNum) =
             let
                 val dest = !(hd labs)
                 val offset = Word.toInt dest - Word.toInt wordNo
                 (* The offset is in words and is placed in a signed 19-bit field so
                    both bounds must hold. *)
                 val _ = (offset < 0x40000 andalso offset >= ~ 0x40000)
                     orelse raise InternalError "Offset to label address is too large"
                 val code = 0wx10000000 orb ((Word32.fromInt offset andb 0wx7ffff) << 0w5) orb word8ToWord32(xRegOnly reg)
             in
                 writeInstr(code, wordNo, codeVec);
                 genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum)
             end
 
         |   genCodeWords(TestBitBranch{label=ref labs, bitNo, brNonZero, reg, length=ref BrExtended} :: tail,
                     wordNo, aConstNum, nonAConstNum) =
             let
                 val dest = !(hd labs)
                 val offset = Word.toInt dest - Word.toInt (wordNo + 0w1) (* Next instruction *)
                 val _ = (offset < Word32.toInt(0w1 << 0w25) andalso offset >= ~ (Word32.toInt(0w1 << 0w25)))
                     orelse raise InternalError "genCodeWords: branch too far"
                 val _ = bitNo <= 0w63 orelse
                     raise InternalError "TestBitBranch: bit number > 63"
                 val code = testBit(bitNo, (* Invert test *) not brNonZero, 0w2 (* Skip branch *), reg)
             in
                 writeInstr(code, wordNo, codeVec);
                 writeInstr(0wx14000000 orb (Word32.fromInt offset andb 0wx03ffffff), wordNo+0w1, codeVec);
                 genCodeWords(tail, wordNo+0w2, aConstNum, nonAConstNum)
             end
 
         |   genCodeWords(TestBitBranch{label=ref labs, bitNo, brNonZero, reg, length=ref BrShort} :: tail,
                     wordNo, aConstNum, nonAConstNum) =
             let
                 val dest = !(hd labs)
                 val offset = Word.toInt dest - Word.toInt wordNo
                 val _ = (offset < 0x2000 andalso offset >= ~ 0x2000)
                     orelse raise InternalError "TestBitBranch: Offset to label address is too large"
                 val _ = bitNo <= 0w63 orelse
                     raise InternalError "TestBitBranch: bit number > 63"
                 val code = testBit(bitNo, brNonZero, Word32.fromInt offset, reg)
             in
                 writeInstr(code, wordNo, codeVec);
                 genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum)
             end
 
         |   genCodeWords(CompareBranch{label=ref labs, brNonZero, size, reg, length=ref BrExtended} :: tail,
                     wordNo, aConstNum, nonAConstNum) =
             let
                 val dest = !(hd labs)
                 val offset = Word.toInt dest - Word.toInt (wordNo+0w1)
                 val _ = (offset < Word32.toInt(0w1 << 0w25) andalso offset >= ~ (Word32.toInt(0w1 << 0w25)))
                     orelse raise InternalError "genCodeWords: branch too far"
                 val code = compareBranch(size, (* Invert test *) not brNonZero, 0w2, reg)
             in
                 writeInstr(code, wordNo, codeVec);
                 writeInstr(0wx14000000 orb (Word32.fromInt offset andb 0wx03ffffff), wordNo+0w1, codeVec);
                 genCodeWords(tail, wordNo+0w2, aConstNum, nonAConstNum)
             end
 
         |   genCodeWords(CompareBranch{label=ref labs, brNonZero, size, reg, length=ref BrShort} :: tail,
                     wordNo, aConstNum, nonAConstNum) =
             let
                 val dest = !(hd labs)
                 val offset = Word.toInt dest - Word.toInt wordNo
                 val _ = (offset < 0x40000 andalso offset >= ~ 0x40000)
                     orelse raise InternalError "CompareBranch: Offset to label address is too large"
                 val code = compareBranch(size, brNonZero, Word32.fromInt offset, reg)
             in
                 writeInstr(code, wordNo, codeVec);
                 genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum)
             end
     in
         genCodeWords (ops @ endOfCodeWords, 0w0, 0w0, 0w0);
         (* Copy in the non-address constants. *)
         List.foldl(fn (cVal, addr) => (write64Bit(cVal, addr, codeVec); addr+0w1)) wordsOfCode nonAddressConsts;
         (codeVec (* Return the completed code. *), wordsOfCode+numNonAddrConsts (* And the size in 64-bit words. *))
     end
 
     (* Write "value" into word number "wordNo" of the byte segment "seg".
        Address.wordSize bytes are stored, least significant byte first,
        so the same code serves for 64-bit and 32-bit words. *)
     fun setWord(value, wordNo, seg) =
     let
         (* Byte offset of the first byte of the word. *)
         val base = wordNo * Address.wordSize
         (* Store the low-order byte of "remaining" at each successive
            position, shifting it down by eight bits every time. *)
         fun storeFrom(remaining, byteIx) =
             if byteIx = Address.wordSize
             then ()
             else
             (
                 byteVecSet(seg, base+byteIx, Word8.fromLarge remaining);
                 storeFrom(LargeWord.>>(remaining, 0w8), byteIx+0w1)
             )
     in
         storeFrom(value, 0w0)
     end
     
    
     (* Print the instructions in the code. *)
     fun printCode (codeVec, functionName, wordsOfCode, printStream) =
     let
         val numInstructions = wordsOfCode * (Address.wordSize div 0w4)
     
         (* Print "v" in hexadecimal, preceded by enough zeros to make
            the result at least "n" characters wide. *)
         fun printHex (v, n) =
         let
             val digits = Word.fmt StringCvt.HEX v
             (* Number of pad characters needed; not positive if the
                digits already fill the field. *)
             val shortfall = n - size digits
             val zeros =
                 if shortfall > 0
                 then CharVector.tabulate(shortfall, fn _ => #"0")
                 else ""
         in
             printStream zeros; printStream digits
         end
 
         (* Print the two-letter mnemonic for a condition code.  Codes
            0x0 to 0xe have their own names; anything else prints "nv". *)
         fun printCondition cond =
             printStream
             (
                 case cond of
                     0wx0 => "eq"
                 |   0wx1 => "ne"
                 |   0wx2 => "cs"
                 |   0wx3 => "cc"
                 |   0wx4 => "mi"
                 |   0wx5 => "pl"
                 |   0wx6 => "vs"
                 |   0wx7 => "vc"
                 |   0wx8 => "hi"
                 |   0wx9 => "ls"
                 |   0wxa => "ge"
                 |   0wxb => "lt"
                 |   0wxc => "gt"
                 |   0wxd => "le"
                 |   0wxe => "al"
                 |   _    => "nv"
             )
 
         (* Print a 64-bit register name in a context where register
            number 31 denotes the zero register. *)
         fun prXReg regNo =
             printStream
                 (if regNo = 0w31 then "xz" else "x" ^ Word32.fmt StringCvt.DEC regNo)
 
         (* Print a 64-bit register name in a context where register
            number 31 denotes the stack pointer. *)
         fun prXRegOrSP regNo =
             printStream
                 (if regNo = 0w31 then "sp" else "x" ^ Word32.fmt StringCvt.DEC regNo)
 
         (* Print a 32-bit register name in a context where register
            number 31 denotes the zero register. *)
         fun prWReg regNo =
             printStream
                 (if regNo = 0w31 then "wz" else "w" ^ Word32.fmt StringCvt.DEC regNo)
 
         (* Print a 32-bit register name in a context where register
            number 31 denotes the stack pointer. *)
         fun prWRegOrSP regNo =
             printStream
                 (if regNo = 0w31 then "wsp" else "w" ^ Word32.fmt StringCvt.DEC regNo)
 
         (* Each instruction is 32 bits (4 bytes). *)
         fun printWordAt wordNo =
         let
             val byteNo = Word.<<(wordNo, 0w2)
             val () = printHex(byteNo, 6)  (* Address *)
             val () = printStream "\t"
             val wordValue =
                 word8ToWord32 (codeVecGet (codeVec, byteNo)) orb
                 (word8ToWord32 (codeVecGet (codeVec, byteNo+0w1)) << 0w8) orb
                 (word8ToWord32 (codeVecGet (codeVec, byteNo+0w2)) << 0w16) orb
                 (word8ToWord32 (codeVecGet (codeVec, byteNo+0w3)) << 0w24)
             val () = printHex(word32ToWord wordValue, 8) (* Instr as hex *)
             val () = printStream "\t"
         in
             if (wordValue andb 0wxfffffc1f) = 0wxD61F0000
             then
             let
                 val rN = (wordValue andb 0wx3e0) >> 0w5
             in
                 printStream "br\tx";
                 printStream(Word32.fmt StringCvt.DEC rN)
             end
 
             else if (wordValue andb 0wxfffffc1f) = 0wxD63F0000
             then
             let
                 val rN = (wordValue andb 0wx3e0) >> 0w5
             in
                 printStream "blr\tx";
                 printStream(Word32.fmt StringCvt.DEC rN)
             end
 
             else if (wordValue andb 0wxfffffc1f) = 0wxD65F0000
             then
             let
                 val rN = (wordValue andb 0wx3e0) >> 0w5
             in
                 printStream "ret\tx";
                 printStream(Word32.fmt StringCvt.DEC rN)
             end
 
             else if wordValue = 0wxD503201F
             then printStream "nop"
             else if wordValue = 0wxD503203F
             then printStream "yield"
             else if wordValue = 0wxD5033BBF
             then printStream "dmb\tish"
 
             else if (wordValue andb 0wx1f800000) = 0wx12800000
             then (* Move of constants.  Includes movn and movk. *)
             let
                 val rD = wordValue andb 0wx1f
                 val imm16 = Word32.toInt((wordValue >> 0w5) andb 0wxffff)
                 val isXReg = (wordValue andb 0wx80000000) <> 0w0
                 val opc = (wordValue >> 0w29) andb 0w3
                 val shift = (wordValue >> 0w21) andb 0w3
             in
                 printStream (if opc = 0w3 then "movk\t" else "mov\t");
                 printStream (if isXReg then "x" else "w");
                 printStream(Word32.fmt StringCvt.DEC rD);
                 printStream ",#";
                 printStream(Int.toString(if opc = 0w0 then ~1 - imm16 else imm16));
                 if shift = 0w0
                 then ()
                 else (printStream ",lsl #"; printStream(Word32.fmt StringCvt.DEC (shift*0w16)))
             end
 
             else if (wordValue andb 0wx3b000000) = 0wx39000000
             then (* Load/Store with unsigned, scaled offset. *)
             let
                 (* The offset is in units of the size of the operand. *)
                 val size = wordValue >> 0w30
                 and v = (wordValue >> 0w26) andb 0w1
                 and opc = (wordValue >> 0w22) andb 0w3
                 val rT = wordValue andb 0wx1f
                 and rN = (wordValue andb 0wx3e0) >> 0w5
                 and imm12 = (wordValue andb 0wx3ffc00) >> 0w10
                 val (opcode, r, scale) =
                     case (size, v, opc) of
                         (0w0, 0w0, 0w0) => ("strb", "w", 0w0)
                     |   (0w0, 0w0, 0w1) => ("ldrb", "w", 0w0)
                     |   (0w1, 0w0, 0w0) => ("strh", "w", 0w2)
                     |   (0w1, 0w0, 0w1) => ("ldrh", "w", 0w2)
                     |   (0w2, 0w0, 0w0) => ("str", "w", 0w4)
                     |   (0w2, 0w0, 0w1) => ("ldr", "w", 0w4)
                     |   (0w3, 0w0, 0w0) => ("str", "x", 0w8)
                     |   (0w3, 0w0, 0w1) => ("ldr", "x", 0w8)
                     |   (0w2, 0w1, 0w0) => ("str", "s", 0w4)
                     |   (0w2, 0w1, 0w1) => ("ldr", "s", 0w4)
                     |   (0w3, 0w1, 0w0) => ("str", "d", 0w8)
                     |   (0w3, 0w1, 0w1) => ("ldr", "d", 0w8)
                     |   _ => ("??", "?", 0w1)
             in
                 printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rT);
                 printStream ",["; prXRegOrSP rN;
                 printStream ",#"; printStream(Word32.fmt StringCvt.DEC(imm12*scale));
                 printStream "]"
             end
 
             else if (wordValue andb 0wx3b200c00) = 0wx38000000
             then (* Load/store unscaled immediate *)
             let
                 val size = wordValue >> 0w30
                 and v = (wordValue >> 0w26) andb 0w1
                 and opc = (wordValue >> 0w22) andb 0w3
                 val rT = wordValue andb 0wx1f
                 and rN = (wordValue andb 0wx3e0) >> 0w5
                 and imm9 = (wordValue andb 0wx1ff000) >> 0w12
                 val imm9Text =
                     if imm9 > 0wxff
                     then "-" ^ Word32.fmt StringCvt.DEC (0wx200 - imm9)
                     else Word32.fmt StringCvt.DEC imm9
                 val (opcode, r) =
                     case (size, v, opc) of
                         (0w0, 0w0, 0w0) => ("sturb", "w")
                     |   (0w0, 0w0, 0w1) => ("ldurb", "w")
                     |   (0w0, 0w0, 0w2) => ("ldursb", "w")
                     |   (0w0, 0w0, 0w3) => ("ldursb", "x")
                     |   (0w1, 0w0, 0w0) => ("sturh", "w")
                     |   (0w1, 0w0, 0w1) => ("ldurh", "w")
                     |   (0w1, 0w0, 0w2) => ("ldursh", "w")
                     |   (0w1, 0w0, 0w3) => ("ldursh", "x")
                     |   (0w2, 0w0, 0w0) => ("stur", "w")
                     |   (0w2, 0w0, 0w1) => ("ldur", "w")
                     |   (0w2, 0w0, 0w2) => ("ldursw", "x")
                     |   (0w3, 0w0, 0w0) => ("stur", "x")
                     |   (0w3, 0w0, 0w1) => ("ldur", "x")
                     |   (0w2, 0w1, 0w0) => ("stur", "s")
                     |   (0w2, 0w1, 0w1) => ("ldur", "s")
                     |   (0w3, 0w1, 0w0) => ("stur", "d")
                     |   (0w3, 0w1, 0w1) => ("ldur", "d")
                     |   _ => ("???", "?")
             in
                 printStream opcode; printStream "\t"; printStream r;
                 printStream(Word32.fmt StringCvt.DEC rT);
                 printStream ",["; prXRegOrSP rN;
                 printStream ",#"; printStream imm9Text; printStream "]"
             end
 
             else if (wordValue andb 0wx3b200c00) = 0wx38000400
             then (* Load/store immediate post-indexed *)
             let
                 val size = wordValue >> 0w30
                 and v = (wordValue >> 0w26) andb 0w1
                 and opc = (wordValue >> 0w22) andb 0w3
                 val rT = wordValue andb 0wx1f
                 and rN = (wordValue andb 0wx3e0) >> 0w5
                 and imm9 = (wordValue andb 0wx1ff000) >> 0w12
                 val imm9Text =
                     if imm9 > 0wxff
                     then "-" ^ Word32.fmt StringCvt.DEC (0wx200 - imm9)
                     else Word32.fmt StringCvt.DEC imm9
                 val (opcode, r) =
                     case (size, v, opc) of
                         (0w0, 0w0, 0w0) => ("strb", "w")
                     |   (0w0, 0w0, 0w1) => ("ldrb", "w")
                     |   (0w2, 0w0, 0w0) => ("str", "w")
                     |   (0w2, 0w0, 0w1) => ("ldr", "w")
                     |   (0w3, 0w0, 0w0) => ("str", "x")
                     |   (0w3, 0w0, 0w1) => ("ldr", "x")
                     |   _ => ("???", "?")
             in
                 printStream opcode; printStream "\t"; printStream r;
                 printStream(Word32.fmt StringCvt.DEC rT);
                 printStream ",["; prXRegOrSP rN;
                 printStream "],#"; printStream imm9Text
             end
 
             else if (wordValue andb 0wx3b200c00) = 0wx38000c00
             then (* Load/store immediate pre-indexed *)
             let
                 val size = wordValue >> 0w30
                 and v = (wordValue >> 0w26) andb 0w1
                 and opc = (wordValue >> 0w22) andb 0w3
                 val rT = wordValue andb 0wx1f
                 and rN = (wordValue andb 0wx3e0) >> 0w5
                 and imm9 = (wordValue andb 0wx1ff000) >> 0w12
                 val imm9Text =
                     if imm9 > 0wxff
                     then "-" ^ Word32.fmt StringCvt.DEC (0wx200 - imm9)
                     else Word32.fmt StringCvt.DEC imm9
                 val (opcode, r) =
                     case (size, v, opc) of
                         (0w0, 0w0, 0w0) => ("strb", "w")
                     |   (0w0, 0w0, 0w1) => ("ldrb", "w")
                     |   (0w2, 0w0, 0w0) => ("str", "w")
                     |   (0w2, 0w0, 0w1) => ("ldr", "w")
                     |   (0w3, 0w0, 0w0) => ("str", "x")
                     |   (0w3, 0w0, 0w1) => ("ldr", "x")
                     |   _ => ("???", "?")
             in
                 printStream opcode; printStream "\t"; printStream r;
                 printStream(Word32.fmt StringCvt.DEC rT);
                 printStream ",["; prXRegOrSP rN;
                 printStream ",#"; printStream imm9Text; printStream "]!"
             end
 
             else if (wordValue andb 0wx3b200c00) = 0wx38200800
             then (* Load/store with register offset i.e. an index register. *)
             let
                 val size = wordValue >> 0w30
                 and v = (wordValue >> 0w26) andb 0w1
                 and opc = (wordValue >> 0w22) andb 0w3
                 val rT = wordValue andb 0wx1f
                 and rN = (wordValue >> 0w5) andb 0wx1f
                 and rM = (wordValue >> 0w16) andb 0wx1f
                 val option = (wordValue >> 0w13) andb 0w7
                 val s = (wordValue andb 0wx1000) <> 0w0 
                 val (opcode, r) =
                     case (size, v, opc) of
                         (0w0, 0w0, 0w0) => ("strb", "w")
                     |   (0w0, 0w0, 0w1) => ("ldrb", "w")
                     |   (0w1, 0w0, 0w0) => ("strh", "w")
                     |   (0w1, 0w0, 0w1) => ("ldrh", "w")
                     |   (0w2, 0w0, 0w0) => ("str", "w")
                     |   (0w2, 0w0, 0w1) => ("ldr", "w")
                     |   (0w3, 0w0, 0w0) => ("str", "x")
                     |   (0w3, 0w0, 0w1) => ("ldr", "x")
                     |   (0w2, 0w1, 0w0) => ("str", "s")
                     |   (0w2, 0w1, 0w1) => ("ldr", "s")
                     |   (0w3, 0w1, 0w0) => ("str", "d")
                     |   (0w3, 0w1, 0w1) => ("ldr", "d")
                     |   _ => ("???", "?")
                 val (extend, xr) =
                     case option of
                         0w2 => (" uxtw", "w")
                     |   0w3 => if s then (" lsl", "x") else ("", "x")
                     |   0w6 => (" sxtw", "w")
                     |   0w7 => (" sxtx", "x")
                     |   _   => ("?", "?")
                 val indexShift =
                     case (size, s) of
                         (0w0, true) => " #1"
                     |   (0w1, true) => " #1"
                     |   (0w2, true) => " #2"
                     |   (0w3, true) => " #3"
                     |   _ => ""
             in
                 printStream opcode; printStream "\t"; printStream r;
                 printStream(Word32.fmt StringCvt.DEC rT);
                 printStream ",["; prXRegOrSP rN;
                 printStream ","; printStream xr; printStream(Word32.fmt StringCvt.DEC rM);
                 printStream extend; printStream indexShift;
                 printStream "]"
             end
 
             else if (wordValue andb 0wx3f000000) = 0wx08000000
             then (* Loads and stores with special ordering. *)
             let
                 val size = (wordValue >> 0w30) andb 0w3
                 and o2 = (wordValue >> 0w23) andb 0w1
                 and l = (wordValue >> 0w22) andb 0w1
                 and o1 = (wordValue >> 0w21) andb 0w1
                 and o0 = (wordValue >> 0w15) andb 0w1
                 val rT = wordValue andb 0wx1f
                 and rN = (wordValue >> 0w5) andb 0wx1f
                 and rS = (wordValue >> 0w16) andb 0wx1f
                 val (opcode, r) =
                     case (size, o2, l, o1, o0) of
                         (0w3, 0w1, 0w1, 0w0, 0w1) => ("ldar", "x")
                     |   (0w3, 0w1, 0w0, 0w0, 0w1) => ("stlr", "x")
                     |   (0w2, 0w1, 0w1, 0w0, 0w1) => ("ldar", "w")
                     |   (0w2, 0w1, 0w0, 0w0, 0w1) => ("stlr", "w")
                     |   (0w3, 0w0, 0w1, 0w0, 0w1) => ("ldaxr", "x")
                     |   (0w3, 0w0, 0w0, 0w0, 0w1) => ("stlxr", "x")
                     |   _ => ("??", "?")
             in
                 printStream opcode; printStream "\t";
                 if opcode = "stlxr"
                 then (printStream "w"; printStream(Word32.fmt StringCvt.DEC rS); printStream ",")
                 else ();
                 printStream r;
                 printStream(Word32.fmt StringCvt.DEC rT);
                 printStream ",["; prXRegOrSP rN; printStream "]"
             end
 
             else if (wordValue andb 0wx3a000000) = 0wx28000000
             then (* Load/store pairs of registers *)
             let
                 val opc = (wordValue >> 0w30) andb 0w3
                 and v = (wordValue >> 0w26) andb 0w1
                 and op2 = (wordValue >> 0w23) andb 0w3
                 and l = (wordValue >> 0w22) andb 0w1
                 and imm7 = (wordValue >> 0w15) andb 0wx7f
                 and rT2 = (wordValue >> 0w10) andb 0wx1f
                 and rN = (wordValue >> 0w5) andb 0wx1f
                 and rT1 = wordValue andb 0wx1f
                 val (opcode, r, scale) =
                     case (opc, v, l) of
                         (0w0, 0w0, 0w0) => ("stp", "w", 0w4)
                     |   (0w0, 0w0, 0w1) => ("ldp", "w", 0w4)
                     |   (0w2, 0w0, 0w0) => ("stp", "x", 0w8)
                     |   (0w2, 0w0, 0w1) => ("ldp", "x", 0w8)
                     |   (0w0, 0w1, 0w0) => ("stp", "s", 0w4)
                     |   (0w0, 0w1, 0w1) => ("ldp", "s", 0w4)
                     |   (0w1, 0w1, 0w0) => ("stp", "d", 0w8)
                     |   (0w1, 0w1, 0w1) => ("ldp", "d", 0w8)
                     |   _ => ("??", "?", 0w1)
                 val imm7Text =
                     if imm7 > 0wx3f
                     then "-" ^ Word32.fmt StringCvt.DEC ((0wx80 - imm7) * scale)
                     else Word32.fmt StringCvt.DEC (imm7 * scale)
             in
                 printStream opcode; printStream "\t"; printStream r;
                 printStream(Word32.fmt StringCvt.DEC rT1); printStream ",";
                 printStream r; printStream(Word32.fmt StringCvt.DEC rT2);
                 printStream ",["; prXRegOrSP rN;
                 case op2 of
                     0w1 => (* Post indexed *)
                         (printStream "],#"; printStream imm7Text)
                 |   0w2 => (* Offset *)
                         (printStream ",#"; printStream imm7Text; printStream "]")
                 |   0w3 => (* Pre indexed *)
                         (printStream ",#"; printStream imm7Text; printStream "]!")
                 |   _ => printStream "??"
             end
 
             else if (wordValue andb 0wx3f800000) = 0wx11000000
             then
             let
                 (* Add/Subtract a 12-bit immediate with possible shift. *)
                 val sf = (wordValue >> 0w31) andb 0w1
                 val rD = wordValue andb 0wx1f
                 and rN = (wordValue andb 0wx3e0) >> 0w5
                 and imm12 = (wordValue andb 0wx3ffc00) >> 0w10
                 and shiftBit = wordValue andb 0wx400000
                 val imm = if shiftBit <> 0w0 then imm12 << 0w12 else imm12
                 val opr = if (wordValue andb 0wx40000000) = 0w0 then "add" else "sub"
                 val prReg = if sf = 0w1 then prXRegOrSP else prWRegOrSP
             in
                 if imm12 = 0w0 andalso (rN = 0w31 orelse rD = 0w31)
                 then (printStream "mov\t"; prReg rD; printStream ","; prReg rN)
                 else
                 (
                     printStream opr; printStream "\t"; prReg rD;
                     printStream ","; prReg rN;
                     printStream ",#"; printStream(Word32.fmt StringCvt.DEC imm)
                 )
             end
 
             else if (wordValue andb 0wx7f800000) = 0wx71000000
             then
             let
                 (* Subtract a 12-bit immediate with possible shift, setting flags. *)
                 val sf = (wordValue >> 0w31) andb 0w1
                 val rD = wordValue andb 0wx1f
                 and rN = (wordValue andb 0wx3e0) >> 0w5
                 and imm12 = (wordValue andb 0wx3ffc00) >> 0w10
                 and shiftBit = wordValue andb 0wx400000
                 val imm = if shiftBit <> 0w0 then imm12 << 0w12 else imm12
                 val prReg = if sf = 0w1 then prXRegOrSP else prWRegOrSP
             in
                 if rD = 0w31
                 then printStream "cmp\t"
                 else (printStream "subs\t"; prReg rD; printStream ",");
                 prReg rN; printStream ",#"; printStream(Word32.fmt StringCvt.DEC imm)
             end
 
             else if (wordValue andb 0wx7fe0ffe0) = 0wx2A0003E0
             then (* Move reg,reg.  This is a subset of ORR shifted register. *)
             let
                 val reg = if (wordValue andb 0wx80000000) <> 0w0 then "x" else "w"
             in
                 printStream "mov\t"; printStream reg;
                 printStream(Word32.fmt StringCvt.DEC(wordValue andb 0wx1f));
                 printStream ","; printStream reg;
                 printStream(Word32.fmt StringCvt.DEC((wordValue >> 0w16) andb 0wx1f))
             end
 
             else if (wordValue andb 0wx1f000000) = 0wx0A000000
             then
             let
                 (* Logical operations with shifted register. *)
                 val rD = wordValue andb 0wx1f
                 and rN = (wordValue >> 0w5) andb 0wx1f
                 and rM = (wordValue >> 0w16) andb 0wx1f
                 and imm6 = (wordValue >> 0w10) andb 0wx3f
                 and shiftCode = (wordValue >> 0w22) andb 0wx3
                 val opc = (wordValue >> 0w29) andb 0wx3
                 val nBit = (wordValue >> 0w21) andb 0w1
                 val reg = if (wordValue andb 0wx80000000) <> 0w0 then "x" else "w"
                 val opcode =
                     case (opc, nBit) of
                         (0w0, 0w0) => "and"
                     |   (0w1, 0w0) => "orr"
                     |   (0w2, 0w0) => "eor"
                     |   (0w3, 0w0) => "ands"
                     |   _ => "??"
             in
                 printStream opcode; printStream"\t";
                 printStream reg;
                 printStream(Word32.fmt StringCvt.DEC rD); printStream ",";
                 printStream reg; printStream(Word32.fmt StringCvt.DEC rN);
                 printStream ","; printStream reg; printStream(Word32.fmt StringCvt.DEC rM);
                 if imm6 <> 0w0
                 then
                 (
                     case shiftCode of
                         0w0 => printStream ",lsl #"
                     |   0w1 => printStream ",lsr #"
                     |   0w2 => printStream ",asr #"
                     |   _ => printStream ",?? #";
                     printStream(Word32.fmt StringCvt.DEC imm6)
                 )
                 else ()
             end
 
             else if (wordValue andb 0wx1f200000) = 0wx0B000000
             then
             let
                 (* Add/subtract shifted register. *)
                 val rD = wordValue andb 0wx1f
                 and rN = (wordValue >> 0w5) andb 0wx1f
                 and rM = (wordValue >> 0w16) andb 0wx1f
                 and imm6 = (wordValue >> 0w10) andb 0wx3f
                 and shiftCode = (wordValue >> 0w22) andb 0wx3
                 val oper = (wordValue andb 0wx40000000) = 0w0
                 val isS = (wordValue andb 0wx20000000) <> 0w0
                 val pReg = if (wordValue andb 0wx80000000) <> 0w0 then prXReg else prWReg
             in
                 if isS andalso rD = 0w31
                 then printStream(if oper then "cmn\t" else "cmp\t")
                 else
                 (
                     printStream(if oper then "add" else "sub"); printStream(if isS then "s\t" else "\t");
                     pReg rD; printStream ","
                 );
                 pReg rN;
                 printStream ","; pReg rM;
                 if imm6 <> 0w0
                 then
                 (
                     case shiftCode of
                         0w0 => printStream ",lsl #"
                     |   0w1 => printStream ",lsr #"
                     |   0w2 => printStream ",asr #"
                     |   _ => printStream ",?? #";
                     printStream(Word32.fmt StringCvt.DEC imm6)
                 )
                 else ()
             end
 
             else if (wordValue andb 0wx1fe00000) = 0wx0b200000
             then
             let
                 (* Add/subtract extended register. *)
                 val rD = wordValue andb 0wx1f
                 and rN = (wordValue >> 0w5) andb 0wx1f
                 and rM = (wordValue >> 0w16) andb 0wx1f
                 and extend = (wordValue >> 0w13) andb 0w7
                 and amount = (wordValue >> 0w10) andb 0w7
                 and sf = (wordValue >> 0w31) andb 0w1
                 and p = (wordValue >> 0w30) andb 0w1
                 and s = (wordValue >> 0w29) andb 0w1
             in
                 if s = 0w1 andalso rD = 0w31
                 then printStream(if p = 0w0 then "cmn\t" else "cmp\t")
                 else
                 (
                     printStream(if p = 0w0 then "add" else "sub");
                     printStream(if s = 0w1 then "s\t" else "\t");
                     (if sf = 0w1 then prXRegOrSP else prWRegOrSP) rD; printStream ","
                 );
                 (if sf = 0w1 then prXRegOrSP else prWRegOrSP) rN;
                 printStream ",";
                 (if extend = 0w3 orelse extend = 0w7 then prXReg else prWReg) rM;
                 case extend of
                     0w0 => printStream ",uxtb"
                 |   0w1 => printStream ",uxth"
                 |   0w2 => if amount = 0w0 andalso sf = 0w0 then () else printStream ",uxtw"
                 |   0w3 => if amount = 0w0 andalso sf = 0w1 then () else printStream ",uxtx"
                 |   0w4 => printStream ",sxtb"
                 |   0w5 => printStream ",sxth"
                 |   0w6 => printStream ",sxtw"
                 |   0w7 => printStream ",sxtx"
                 |   _ => printStream "?";
                
                 if amount <> 0w0
                 then printStream(" #" ^ Word32.fmt StringCvt.DEC amount)
                 else ()
             end
 
             else if (wordValue andb 0wxbf000000) = 0wx18000000
             then
             let
                 (* Load from a PC-relative address.  This may refer to the
                    address constant area or the non-address constant area. *)
                 val rT = wordValue andb 0wx1f
                 val s = (wordValue >> 0w30) andb 0w1
                 (* The offset is in 32-bit words *)
                 val byteAddr = word32ToWord(((wordValue andb 0wx00ffffe0) >> (0w5-0w2))) + byteNo
                 val wordAddr = byteAddr div wordSize
                 (* We must NOT use codeVecGetWord if this is in the non-address
                    area.  It may well not be a tagged value. *)
                 local
                     fun getConstant(cVal, 0w0) = cVal
                     |   getConstant(cVal, offset) =
                         let
                             val byteVal =
                                 Word64.fromLarge(Word8.toLarge(codeVecGet (codeVec, byteAddr+offset-0w1)))
                         in
                             getConstant(Word64.orb(Word64.<<(cVal, 0w8), byteVal), offset-0w1)
                         end
                 in
                     val constantValue =
                         if wordAddr <= wordsOfCode
                         then "0x" ^ Word64.toString(getConstant(0w0, 0w8)) (* It's a non-address constant *)
                         else stringOfWord(codeVecGetWord(codeVec, wordAddr))
                 end
             in
                 printStream "ldr\t"; printStream (if s = 0w0 then "w" else "x");
                 printStream(Word32.fmt StringCvt.DEC rT);
                 printStream ",0x"; printStream(Word.fmt StringCvt.HEX byteAddr);
                 printStream "\t// "; printStream constantValue
             end
 
             else if (wordValue andb 0wxbf000000) = 0wx10000000
             then
             let
                 (* Put a pc-relative address into a register. *)
                 val rT = wordValue andb 0wx1f
                 val byteOffset =
                     ((wordValue andb 0wx00ffffe0) << (Word.fromInt Word32.wordSize - 0w23) ~>>
                         (Word.fromInt Word32.wordSize - 0w20)) + ((wordValue >> 0w29) andb 0w3)
             in
                 printStream "adr\tx"; printStream(Word32.fmt StringCvt.DEC rT);
                 printStream ",0x"; printStream(Word32.fmt StringCvt.HEX (wordToWord32 byteNo+byteOffset))
             end
 
             else if (wordValue andb 0wxfc000000) = 0wx14000000
             then (* Unconditional branch. *)
             let
                 (* The offset is signed and the destination may be earlier. *)
                 val byteOffset =
                     (wordValue andb 0wx03ffffff) << (Word.fromInt Word32.wordSize - 0w26) ~>>
                         (Word.fromInt Word32.wordSize - 0w28)
             in
                 printStream "b\t0x";
                 printStream(Word32.fmt StringCvt.HEX (wordToWord32 byteNo + byteOffset))
             end
 
             else if (wordValue andb 0wxff000000) = 0wx54000000
             then (* Conditional branch *)
             let
                 val byteOffset =
                     (wordValue andb 0wx00ffffe0) << (Word.fromInt Word32.wordSize - 0w24) ~>>
                         (Word.fromInt Word32.wordSize - 0w21)
             in
                 printStream "b.";
                 printCondition(wordValue andb 0wxf);
                 printStream "\t0x";
                 printStream(Word32.fmt StringCvt.HEX (wordToWord32 byteNo+byteOffset))
             end
 
             else if (wordValue andb 0wx7e000000) = 0wx34000000
             then (* Compare and branch *)
             let
                 val byteOffset =
                     (wordValue andb 0wx00ffffe0) << (Word.fromInt Word32.wordSize - 0w24) ~>>
                         (Word.fromInt Word32.wordSize - 0w21)
                 val oper =
                     if (wordValue andb 0wx01000000) = 0w0
                     then "cbz" else "cbnz"
                 val r = if (wordValue andb 0wx80000000) = 0w0 then "w" else "x"
             in
                 printStream oper; printStream "\t";
                 printStream r; printStream(Word32.fmt StringCvt.DEC (wordValue andb 0wx1f));
                 printStream ",0x";
                 printStream(Word32.fmt StringCvt.HEX (wordToWord32 byteNo+byteOffset))
             end
 
             else if (wordValue andb 0wx7e000000) = 0wx36000000
             then (* Test bit and branch *)
             let
                 val byteOffset =
                     (wordValue andb 0wx00ffffe0) << (Word.fromInt Word.wordSize - 0w19) ~>>
                         (Word.fromInt Word.wordSize - 0w16)
                 val oper =
                     if (wordValue andb 0wx01000000) = 0w0
                     then "tbz" else "tbnz"
                 val b40 = (wordValue >> 0w19) andb 0wx1f
                 val bitNo = b40 orb ((wordValue >> 0w26) andb 0wx20)
                 val r = if bitNo < 0w32 then "w" else "x"
             in
                 printStream oper; printStream "\t";
                 printStream r; printStream(Word32.fmt StringCvt.DEC (wordValue andb 0wx1f));
                 printStream ",#"; printStream(Word32.fmt StringCvt.DEC bitNo); printStream ",0x";
                 printStream(Word.fmt StringCvt.HEX (byteNo+word32ToWord byteOffset))
             end
 
             else if (wordValue andb 0wx3fe00000) = 0wx1A800000
             then
             let
                 val sf = wordValue >> 0w31
                 val opc = (wordValue >> 0w30) andb 0w1
                 val op2 = (wordValue >> 0w10) andb 0w3
                 val rT = wordValue andb 0wx1f
                 val rN = (wordValue >> 0w5) andb 0wx1f
                 val rM = (wordValue >> 0w16) andb 0wx1f
                 val cond = (wordValue >> 0w12) andb 0wxf
                 val opcode =
                     case (opc, op2) of
                         (0w0, 0w0) => "csel"
                     |   (0w0, 0w1) => "csinc"
                     |   (0w1, 0w0) => "csinv"
                     |   (0w1, 0w1) => "csneg"
                     |   _ => "??"
                 val r = if sf = 0w0 then "w" else "x"
             in
                 printStream opcode; printStream "\t";
                 printStream r; printStream(Word32.fmt StringCvt.DEC rT);
                 printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rN);
                 printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rM);
                 printStream ","; printCondition cond
             end
 
             else if (wordValue andb 0wx7f800000) = 0wx13000000
             then (* signed bitfield *)
             let
                 val sf = wordValue >> 0w31
                 (* N is always the same as sf. *)
                 (*val nBit = (wordValue >> 0w22) andb 0w1*)
                 val immr = (wordValue >> 0w16) andb 0wx3f
                 val imms = (wordValue >> 0w10) andb 0wx3f
                 val rN = (wordValue >> 0w5) andb 0wx1f
                 val rD = wordValue andb 0wx1f
                 val (r, wordSize) = if sf = 0w0 then ("w", 0w32) else if sf = 0w1 then ("x", 0w64) else raise InternalError "Neither"
             in
                 if imms = wordSize - 0w1
                 then printStream "asr\t"
                 else printStream "sbfm\t";
                 printStream r;
                 printStream(Word32.fmt StringCvt.DEC rD);
                 printStream ",";
                 printStream r;
                 printStream(Word32.fmt StringCvt.DEC rN);
                 if imms = wordSize - 0w1
                 then (printStream ",#0x"; printStream(Word32.toString immr))
                 else
                 (
                     printStream ",#0x"; printStream(Word32.toString immr);
                     printStream ",#0x"; printStream(Word32.toString imms)
                 )
             end
 
             else if (wordValue andb 0wx7f800000) = 0wx53000000
             then (* unsigned bitfield move *)
             let
                 val sf = wordValue >> 0w31
                 (* N is always the same as sf. *)
                 (*val nBit = (wordValue >> 0w22) andb 0w1*)
                 val immr = (wordValue >> 0w16) andb 0wx3f
                 val imms = (wordValue >> 0w10) andb 0wx3f
                 val rN = (wordValue >> 0w5) andb 0wx1f
                 val rD = wordValue andb 0wx1f
                 val (r, wordSize) = if sf = 0w0 then ("w", 0w32) else ("x", 0w64)
             in
                 if imms + 0w1 = immr
                 then printStream "lsl\t"
                 else if imms = wordSize - 0w1
                 then printStream "lsr\t"
                 else printStream "ubfm\t";
                 printStream r;
                 printStream(Word32.fmt StringCvt.DEC rD);
                 printStream ",";
                 printStream r;
                 printStream(Word32.fmt StringCvt.DEC rN);
                 if imms + 0w1 = immr
                 then (printStream ",#0x"; printStream(Word32.toString(wordSize - immr)))
                 else if imms = wordSize - 0w1
                 then (printStream ",#0x"; printStream(Word32.toString immr))
                 else
                 (
                     printStream ",#0x"; printStream(Word32.toString immr);
                     printStream ",#0x"; printStream(Word32.toString imms)
                 )
             end
 
             else if (wordValue andb 0wx1f800000) = 0wx12000000
             then (* logical immediate *)
             let
                 val sf = wordValue >> 0w31
                 val opc = (wordValue >> 0w29) andb 0w3
                 val nBit = (wordValue >> 0w22) andb 0w1
                 val immr = (wordValue >> 0w16) andb 0wx3f
                 val imms = (wordValue >> 0w10) andb 0wx3f
                 val rN = (wordValue >> 0w5) andb 0wx1f
                 val rD = wordValue andb 0wx1f
                 val (opcode, r) =
                     case (sf, opc, nBit) of
                         (0w0, 0w0, 0w0) => ("and", "w")
                     |   (0w0, 0w1, 0w0) => ("orr", "w")
                     |   (0w0, 0w2, 0w0) => ("eor", "w")
                     |   (0w0, 0w3, 0w0) => ("ands", "w")
                     |   (0w1, 0w0, _) => ("and", "x")
                     |   (0w1, 0w1, _) => ("orr", "x")
                     |   (0w1, 0w2, _) => ("eor", "x")
                     |   (0w1, 0w3, _) => ("ands", "x")
                     |   _ => ("??", "?")
             in
                 printStream opcode;
                 printStream "\t";
                 printStream r; printStream(Word32.fmt StringCvt.DEC rD); printStream ",";
                 printStream r; printStream(Word32.fmt StringCvt.DEC rN); printStream ",#0x";
                 printStream(Word64.toString(decodeBitPattern{sf=sf, n=nBit, immr=immr, imms=imms}))
             end
 
             else if (wordValue andb 0wx5fe00000) = 0wx1ac00000
             then (* Two source operations - shifts and divide. *)
             let
                 val sf = wordValue >> 0w31
                 val s = (wordValue >> 0w29) andb 0w1
                 val rM = (wordValue >> 0w16) andb 0wx1f
                 val opcode = (wordValue >> 0w10) andb 0wx3f
                 val rN = (wordValue >> 0w5) andb 0wx1f
                 val rD = wordValue andb 0wx1f
                 val (oper, r) =
                     case (sf, s, opcode) of
                         (0w1, 0w0, 0wx2) => ("udiv", "x")
                     |   (0w1, 0w0, 0wx3) => ("sdiv", "x")
                     |   (0w0, 0w0, 0wx2) => ("udiv", "w")
                     |   (0w0, 0w0, 0wx3) => ("sdiv", "w")
                     |   (0w1, 0w0, 0wx8) => ("lsl", "x")
                     |   (0w0, 0w0, 0wx8) => ("lsl", "w")
                     |   (0w1, 0w0, 0wx9) => ("lsr", "x")
                     |   (0w0, 0w0, 0wx9) => ("lsr", "w")
                     |   (0w1, 0w0, 0wxa) => ("asr", "x")
                     |   (0w0, 0w0, 0wxa) => ("asr", "w")
                     |   _ => ("??", "?")
             in
                 printStream oper;
                 printStream "\t";
                 printStream r; printStream(Word32.fmt StringCvt.DEC rD); printStream ",";
                 printStream r; printStream(Word32.fmt StringCvt.DEC rN); printStream ",";
                 printStream r; printStream(Word32.fmt StringCvt.DEC rM)
             end
 
             else if (wordValue andb 0wx1f000000) = 0wx1b000000
             then (* Three source operations - multiply add/subtract. *)
             let
                 val sf = wordValue >> 0w31
                 val op54 = (wordValue >> 0w29) andb 0w3
                 val op31 = (wordValue >> 0w21) andb 0w7
                 val o0 = (wordValue >> 0w15) andb 0w1
                 val rM = (wordValue >> 0w16) andb 0wx1f
                 val rA = (wordValue >> 0w10) andb 0wx1f
                 val rN = (wordValue >> 0w5) andb 0wx1f
                 val rD = wordValue andb 0wx1f
                 val (oper, r1, r2) =
                     case (sf, op54, op31, o0, rA) of
                         (0w1, 0w0, 0w0, 0w0, 0w31) => ("mul", "x", "x")
                     |   (0w1, 0w0, 0w0, 0w0, _)    => ("madd", "x", "x")
                     |   (0w1, 0w0, 0w0, 0w1, 0w31) => ("mneg", "x", "x")
                     |   (0w1, 0w0, 0w0, 0w1, _)    => ("msub", "x", "x")
                     |   (0w0, 0w0, 0w0, 0w0, _)    => ("madd", "w", "w")
                     |   (0w0, 0w0, 0w0, 0w1, _)    => ("msub", "w", "w")
                     |   (0w1, 0w0, 0w2, 0w0, 0w31) => ("smulh", "x", "x")
                     |   (0w1, 0w0, 0w1, 0w0, 0w31) => ("smull", "x", "w")
                     |   (0w1, 0w0, 0w1, 0w0, _)    => ("smaddl", "x", "w")
                     |   (0w1, 0w0, 0w1, 0w1, _)    => ("smsubl", "x", "w")
                     |   _ => ("??", "?", "?")
             in
                 printStream oper;
                 printStream "\t";
                 printStream r1; printStream(Word32.fmt StringCvt.DEC rD); printStream ",";
                 printStream r2; printStream(Word32.fmt StringCvt.DEC rN); printStream ",";
                 printStream r2; printStream(Word32.fmt StringCvt.DEC rM);
                 if rA = 0w31 then ()
                 else (printStream ","; printStream r1; printStream(Word32.fmt StringCvt.DEC rA))
             end
 
             else if (wordValue andb 0wx7f20fc00) = 0wx1E200000
             then (* Moves between floating point and general regs. *)
             let
                 val sf = (wordValue >> 0w31) andb 0w1
                 and s = (wordValue >> 0w29) andb 0w1
                 and ptype = (wordValue >> 0w22) andb 0w3
                 and mode = (wordValue >> 0w19) andb 0w3
                 and opcode = (wordValue >> 0w16) andb 0w7
                 and rN = (wordValue >> 0w5) andb 0wx1f
                 and rD = wordValue andb 0wx1f
                 val (opc, dr, nr) =
                     case (sf, s, ptype, mode, opcode) of
                         (0w0, 0w0, 0w0, 0w0, 0w7) => ("fmov", "s", "w") (* w -> s *)
                     |   (0w0, 0w0, 0w0, 0w0, 0w6) => ("fmov", "w", "s") (* s -> w *)
                     |   (0w1, 0w0, 0w1, 0w0, 0w7) => ("fmov", "d", "x") (* d -> x *)
                     |   (0w1, 0w0, 0w1, 0w0, 0w6) => ("fmov", "x", "d") (* x -> d *)
                     |   (0w0, 0w0, 0w0, 0w0, 0w2) => ("scvtf", "w", "s")
                     |   (0w0, 0w0, 0w1, 0w0, 0w2) => ("scvtf", "w", "d")
                     |   (0w1, 0w0, 0w0, 0w0, 0w2) => ("scvtf", "x", "s")
                     |   (0w1, 0w0, 0w1, 0w0, 0w2) => ("scvtf", "x", "d")
 
                     |   (0w0, 0w0, 0w0, 0w0, 0w4) => ("fcvtas", "w", "s") (* s -> w *)
                     |   (0w0, 0w0, 0w0, 0w2, 0w0) => ("fcvtms", "w", "s") (* s -> w *)
                     |   (0w0, 0w0, 0w0, 0w1, 0w0) => ("fcvtps", "w", "s") (* s -> w *)
                     |   (0w0, 0w0, 0w0, 0w3, 0w0) => ("fcvtzs", "w", "s") (* s -> w *)
                     |   (0w0, 0w0, 0w1, 0w0, 0w4) => ("fcvtas", "w", "d") (* d -> w *)
                     |   (0w0, 0w0, 0w1, 0w2, 0w0) => ("fcvtms", "w", "d") (* d -> w *)
                     |   (0w0, 0w0, 0w1, 0w1, 0w0) => ("fcvtps", "w", "d") (* d -> w *)
                     |   (0w0, 0w0, 0w1, 0w3, 0w0) => ("fcvtzs", "w", "d") (* d -> w *)
 
                     |   (0w1, 0w0, 0w0, 0w0, 0w4) => ("fcvtas", "x", "s") (* s -> x *)
                     |   (0w1, 0w0, 0w0, 0w2, 0w0) => ("fcvtms", "x", "s") (* s -> x *)
                     |   (0w1, 0w0, 0w0, 0w1, 0w0) => ("fcvtps", "x", "s") (* s -> x *)
                     |   (0w1, 0w0, 0w0, 0w3, 0w0) => ("fcvtzs", "x", "s") (* s -> x *)
                     |   (0w1, 0w0, 0w1, 0w0, 0w4) => ("fcvtas", "x", "d") (* d -> x *)
                     |   (0w1, 0w0, 0w1, 0w2, 0w0) => ("fcvtms", "x", "d") (* d -> x *)
                     |   (0w1, 0w0, 0w1, 0w1, 0w0) => ("fcvtps", "x", "d") (* d -> x *)
                     |   (0w1, 0w0, 0w1, 0w3, 0w0) => ("fcvtzs", "x", "d") (* d -> x *)
                     |   _ => ("?", "?", "?")
             in
                 printStream opc; printStream "\t";
                 printStream dr; printStream(Word32.fmt StringCvt.DEC rD); printStream ",";
                 printStream nr; printStream(Word32.fmt StringCvt.DEC rN)
             end
             
             else if (wordValue andb 0wxff200c00) = 0wx1E200800
             then (* Floating point two source operations. *)
             let
                 val pt = (wordValue >> 0w22) andb 0w3
                 and rM = (wordValue >> 0w16) andb 0wx1f
                 and opc = (wordValue >> 0w12) andb 0wxf
                 and rN = (wordValue >> 0w5) andb 0wx1f
                 and rT = wordValue andb 0wx1f
                 val (opcode, r) =
                     case (pt, opc) of
                         (0w0, 0wx0) => ("fmul", "s")
                     |   (0w0, 0wx1) => ("fdiv", "s")
                     |   (0w0, 0wx2) => ("fadd", "s")
                     |   (0w0, 0wx3) => ("fsub", "s")
                     |   (0w1, 0wx0) => ("fmul", "d")
                     |   (0w1, 0wx1) => ("fdiv", "d")
                     |   (0w1, 0wx2) => ("fadd", "d")
                     |   (0w1, 0wx3) => ("fsub", "d")
                     |   _ => ("??", "?")
             in
                 printStream opcode; printStream "\t";
                 printStream r; printStream(Word32.fmt StringCvt.DEC rT); printStream ",";
                 printStream r; printStream(Word32.fmt StringCvt.DEC rN); printStream ",";
                 printStream r; printStream(Word32.fmt StringCvt.DEC rM)
             end
 
             else if (wordValue andb 0wxff207c00) = 0wx1E204000
             then (* Floating point single source. *)
             let
                 val pt = (wordValue >> 0w22) andb 0w3
                 and opc = (wordValue >> 0w15) andb 0wx3f
                 and rN = (wordValue >> 0w5) andb 0wx1f
                 and rT = wordValue andb 0wx1f
                 val (opcode, rS, rD) =
                     case (pt, opc) of
                         (0w0, 0wx0) => ("fmov", "s", "s")
                     |   (0w0, 0wx1) => ("fabs", "s", "s")
                     |   (0w0, 0wx2) => ("fneg", "s", "s")
                     |   (0w0, 0wx5) => ("fcvt", "s", "d")
                     |   (0w1, 0wx0) => ("fmov", "d", "d")
                     |   (0w1, 0wx1) => ("fabs", "d", "d")
                     |   (0w1, 0wx2) => ("fneg", "d", "d")
                     |   (0w1, 0wx4) => ("fcvt", "d", "s")
                     |   _ => ("??", "?", "?")
             in
                 printStream opcode; printStream "\t";
                 printStream rD; printStream(Word32.fmt StringCvt.DEC rT); printStream ",";
                 printStream rS; printStream(Word32.fmt StringCvt.DEC rN)
             end
 
             else if (wordValue andb 0wxff20fc07) = 0wx1E202000
             then (* Floating point comparison *)
             let
                 val pt = (wordValue >> 0w22) andb 0w3
                 and rM = (wordValue >> 0w16) andb 0wx1f
                 and rN = (wordValue >> 0w5) andb 0wx1f
                 and opc = (wordValue >> 0w3) andb 0w3
                 val (opcode, r) =
                     case (pt, opc) of
                         (0w0, 0wx0) => ("fcmp", "s")
                     |   (0w1, 0wx0) => ("fcmp", "d")
                     |   (0w0, 0wx2) => ("fcmpe", "s")
                     |   (0w1, 0wx2) => ("fcmpe", "d")
                     |   _ => ("??", "?")
             in
                 printStream opcode; printStream "\t";
                 printStream r; printStream(Word32.fmt StringCvt.DEC rN); printStream ",";
                 printStream r; printStream(Word32.fmt StringCvt.DEC rM)
             end
 
             else if (wordValue andb 0wx1e000000) = 0wx02000000
             then (* This is an unallocated range.  We use it for the register mask. *)
             let
                 fun printMask (0w25, _) = ()
                 |   printMask (i, comma) =
                     if ((0w1 << i) andb wordValue) <> 0w0
                     then
                     (
                         if comma then printStream ", " else ();
                         printStream "x";
                         printStream(Word.fmt StringCvt.DEC i);
                         printMask(i+0w1, true)
                     )
                     else printMask(i+0w1, comma)
             in
                 printStream "["; printMask(0w0, false); printStream "]"
             end
 
             else printStream "?"
             ;
             printStream "\n"
         end
         
         (* Disassemble the instruction words one at a time, advancing the index
            by one after each and stopping once it reaches numInstructions. *)
         fun printAll i =
             if i <> numInstructions
             then (printWordAt i; printAll(i + 0w1))
             else ()
     in
         printStream functionName;
         printStream ":\n";
         printAll 0w0
     end
 
     (* Build the final code object for one function.  The instructions are
        assembled by genCode; the bookkeeping words (constant count, function
        name, profile reference and the trailing offset word) and the address
        constants are then stored after the code and the result is locked into
        resultClosure.  When the assembly-code debug parameter is set the
        generated code is also printed through the simple printer. *)
     fun generateCode {instrs, name=functionName, parameters, resultClosure} =
     let
         val printStream = Pretty.getSimplePrinter(parameters, [])
         val printAssemblyCode = Debug.getParameter Debug.assemblyCodeTag parameters
         (* Pull the literal values out of the instruction list, separating
            address constants from non-address constants.  The fold builds the
            lists backwards so they are reversed to restore instruction order. *)
         fun splitLiteral(LoadAddressLiteral {value, ...}, (addrAcc, otherAcc)) = (value :: addrAcc, otherAcc)
         |   splitLiteral(LoadNonAddressLiteral {value, ...}, (addrAcc, otherAcc)) = (addrAcc, value :: otherAcc)
         |   splitLiteral(_, acc) = acc
         val (revAddrs, revOthers) = List.foldl splitLiteral ([], []) instrs
         val addressConsts = List.rev revAddrs
         val nonAddressConsts = List.rev revOthers
         val (byteVec, nativeWordsOfCode) = genCode(instrs, addressConsts, nonAddressConsts)
         val wordsOfCode = nativeWordsOfCode * wordsPerNativeWord
         val numOfConst = List.length addressConsts
         (* Four words are added besides the code and the constants: the
            constant count, the function name, the profile reference and the
            final offset word. *)
         val segSize = wordsOfCode + Word.fromInt numOfConst + 0w4
         (* The constants start after the count, name and profile words. *)
         val firstConstant = wordsOfCode + 0w3
         (* The size information has to be written while this is still a byte
            vector, i.e. before any constants are stored.  The count written
            here is the number of address constants plus two. *)
         val () = setWord(LargeWord.fromInt(numOfConst + 2), wordsOfCode, byteVec)
         (* The last word of the segment holds a negative byte offset:
            (numOfConst + 3) words, negated. *)
         val offsetBytes = LargeWord.fromInt(numOfConst + 3) * ~(Word.toLarge Address.wordSize)
         val () = setWord(offsetBytes, segSize - 0w1, byteVec)
         (* With the sizes filled in the segment is converted into a proper
            code segment; only after that is it safe to store ML values. *)
         val codeVec = byteVecToCodeVec(byteVec, resultClosure)
         (* Function name, stored in the word after the constant count. *)
         val nameCell : machineWord = toMachineWord (functionName : string)
         val () = codeVecPutWord(codeVec, wordsOfCode + 0w1, nameCell)
         (* Profile ref.  A mutable byte cell used by the profiler in the RTS;
            it is zeroed byte by byte, highest index first. *)
         val profileCell =
             RunCall.allocateByteMemory(0w1, Word.fromLargeWord(Word8.toLargeWord(Word8.orb(F_mutable, F_bytes))))
         fun zeroBytes 0w0 = ()
         |   zeroBytes n = let val idx = n - 0w1 in assignByte(profileCell, idx, 0w0); zeroBytes idx end
         val () = zeroBytes wordSize
         val () = codeVecPutWord(codeVec, wordsOfCode + 0w2, toMachineWord profileCell)
         (* Finally copy the address constants into consecutive words
            starting at firstConstant. *)
         fun storeConsts(_, []) = ()
         |   storeConsts(slot, c :: rest) = (codeVecPutWord(codeVec, slot, c); storeConsts(slot + 0w1, rest))
         val () = storeConsts(firstConstant, addressConsts)
         (* Optionally print out the code before it is locked. *)
         val () =
             if printAssemblyCode
             then (printCode(codeVec, functionName, wordsOfCode, printStream); printStream "\n")
             else ()
     in
         codeVecLock(codeVec, resultClosure)
     end (* generateCode *)
 
 
     (* Re-exports the types of this module under their own names.
        NOTE(review): presumably this exists so that other functors can state
        sharing constraints against this module - confirm against the users. *)
     structure Sharing =
     struct
         type instr      = instr
         and  closureRef = closureRef
         and  labels     = labels
         and  condition  = condition
         and  xReg       = xReg
         and  vReg       = vReg
         and  shiftType  = shiftType
         and  wordSize   = wordSize
         and  scale      = scale
         and  'a extend  = 'a extend
     end
 end;
 
diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64Assembly.sig b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64Assembly.sig
index afef16ba..47c475b4 100644
--- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64Assembly.sig
+++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64Assembly.sig
@@ -1,426 +1,430 @@
 (*
     Copyright (c) 2021 David C. J. Matthews
 
     This library is free software; you can redistribute it and/or
     modify it under the terms of the GNU Lesser General Public
     Licence version 2.1 as published by the Free Software Foundation.
     
     This library is distributed in the hope that it will be useful,
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     Lesser General Public Licence for more details.
     
     You should have received a copy of the GNU Lesser General Public
     Licence along with this library; if not, write to the Free Software
     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 *)
 
 signature Arm64Assembly =
 sig
     type closureRef
     type instr
     type machineWord = Address.machineWord
     type labels
     type condition
 
     (* XZero and XSP are both encoded as 31 but the interpretation
        depends on the instruction.
        The datatype definition is included here to allow for
        pattern matching on XSP and XZero. *)
     datatype xReg = XReg of Word8.word | XZero | XSP
     and vReg = VReg of Word8.word
 
     val X0:  xReg   and X1:  xReg   and X2:  xReg   and X3: xReg
     and X4:  xReg   and X5:  xReg   and X6:  xReg   and X7: xReg
     and X8:  xReg   and X9:  xReg   and X10: xReg   and X11: xReg
     and X12: xReg   and X13: xReg   and X14: xReg   and X15: xReg
     and X16: xReg   and X17: xReg   and X18: xReg   and X19: xReg
     and X20: xReg   and X21: xReg   and X22: xReg   and X23: xReg
     and X24: xReg   and X25: xReg   and X26: xReg   and X27: xReg
     and X28: xReg   and X29: xReg   and X30: xReg
 
     val X_MLHeapLimit: xReg (* ML Heap limit pointer *)
     and X_MLAssemblyInt: xReg (* ML assembly interface pointer. *)
     and X_MLHeapAllocPtr: xReg (* ML Heap allocation pointer. *)
     and X_MLStackPtr: xReg (* ML Stack pointer. *)
     and X_LinkReg: xReg (* Link reg - return address *)
     and X_Base32in64: xReg (* X24 is used for the heap base in 32-in-64. *)
     
     val V0:  vReg   and V1:  vReg   and V2:  vReg   and V3: vReg
     and V4:  vReg   and V5:  vReg   and V6:  vReg   and V7: vReg
 
     (* Condition for conditional branches etc. *)
     val condEqual: condition
     and condNotEqual: condition
     and condCarrySet: condition
     and condCarryClear: condition
     and condNegative: condition
     and condPositive: condition
     and condOverflow: condition
     and condNoOverflow: condition
     and condUnsignedHigher: condition
     and condUnsignedLowOrEq: condition
     and condSignedGreaterEq: condition
     and condSignedLess: condition
     and condSignedGreater: condition
     and condSignedLessEq: condition
 
     datatype shiftType =
         ShiftLSL of Word8.word
     |   ShiftLSR of Word8.word
     |   ShiftASR of Word8.word
     |   ShiftNone
 
     datatype wordSize = WordSize32 | WordSize64
 
     datatype 'a extend =
         ExtUXTB of 'a (* Unsigned extend byte *)
     |   ExtUXTH of 'a (* Unsigned extend halfword *)
     |   ExtUXTW of 'a (* Unsigned extend word *)
     |   ExtUXTX of 'a (* Left shift *)
     |   ExtSXTB of 'a (* Sign extend byte *)
     |   ExtSXTH of 'a (* Sign extend halfword *)
     |   ExtSXTW of 'a (* Sign extend word *)
     |   ExtSXTX of 'a (* Left shift *)
 
     (* Load/store instructions have only a single bit for the shift.  For byte
        operations this is one bit shift; for others it scales by the size of
        the operand if set. *)
     datatype scale =
         ScaleOrShift
     |   NoScale
 
     (* Jump to the address in the register and put the address of the
        next instruction into X30. *)
     val branchAndLinkReg: xReg -> instr
     (* Jump to the address in the register. *)
     and branchRegister: xReg -> instr
     (* Jump to the address in the register and hint this is a return. *)
     and returnRegister: xReg -> instr
 
     (* Move an address constant to a register. *)
     val loadAddressConstant: xReg * machineWord -> instr
     (* Move a constant into a register that is not an address.
        The argument is the actual bit pattern to be copied.
        For tagged integers that means that the value must have
        been shifted and the tag bit set. *)
     and loadNonAddressConstant: xReg * Word64.word -> instr
 
     (* Move a value into a register.  The immediate is 16-bits and the shift
        is 0, 16, 32, or 48.  moveKeep affects only the specific 16-bits and
        leaves the remainder unchanged. *)
     val moveNot32: {regD: xReg, immediate: word, shift: word} -> instr
     and moveZero32: {regD: xReg, immediate: word, shift: word} -> instr
     and moveKeep32: {regD: xReg, immediate: word, shift: word} -> instr
     val moveNot: {regD: xReg, immediate: word, shift: word} -> instr
     and moveZero: {regD: xReg, immediate: word, shift: word} -> instr
     and moveKeep: {regD: xReg, immediate: word, shift: word} -> instr
 
     (* Add/subtract an optionally shifted 12-bit immediate (i.e. constant) to/from a register.
        The constant is zero-extended. *)
     val addImmediate: {regN: xReg, regD: xReg, immed: word, shifted: bool} -> instr
     and addSImmediate: {regN: xReg, regD: xReg, immed: word, shifted: bool} -> instr
     and subImmediate: {regN: xReg, regD: xReg, immed: word, shifted: bool} -> instr
     and subSImmediate: {regN: xReg, regD: xReg, immed: word, shifted: bool} -> instr
     and addImmediate32: {regN: xReg, regD: xReg, immed: word, shifted: bool} -> instr
     and addSImmediate32: {regN: xReg, regD: xReg, immed: word, shifted: bool} -> instr
     and subImmediate32: {regN: xReg, regD: xReg, immed: word, shifted: bool} -> instr
     and subSImmediate32: {regN: xReg, regD: xReg, immed: word, shifted: bool} -> instr
 
     (* Add/subtract a shifted register, optionally setting the flags. *)
     val addShiftedReg: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr
     and addSShiftedReg: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr
     and subShiftedReg: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr
     and subSShiftedReg: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr
     and addShiftedReg32: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr
     and addSShiftedReg32: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr
     and subShiftedReg32: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr
     and subSShiftedReg32: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr
     
     (* Add/subtract an extended register, optionally setting the flags. *)
     val addExtendedReg: {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend} -> instr
     and addSExtendedReg: {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend} -> instr
     and subExtendedReg: {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend} -> instr
     and subSExtendedReg: {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend} -> instr
 
     (* Multiplication *)
     (* regD = regA + regN * regM *)
     val multiplyAndAdd: {regM: xReg, regN: xReg, regA: xReg, regD: xReg} -> instr
     (* regD = regA - regN * regM *)
     and multiplyAndSub: {regM: xReg, regN: xReg, regA: xReg, regD: xReg} -> instr
     (* Return the high-order part of a signed multiplication. *)
     and signedMultiplyHigh: {regM: xReg, regN: xReg, regD: xReg} -> instr
     and multiplyAndAdd32: {regM: xReg, regN: xReg, regA: xReg, regD: xReg} -> instr
     and multiplyAndSub32: {regM: xReg, regN: xReg, regA: xReg, regD: xReg} -> instr
     (* Multiply two 32-bit quantities and add/subtract a 64-bit quantity. *)
     and signedMultiplyAndAddLong: {regM: xReg, regN: xReg, regA: xReg, regD: xReg} -> instr
     and signedMultiplyAndSubLong: {regM: xReg, regN: xReg, regA: xReg, regD: xReg} -> instr
 
     (* Division *)
     val unsignedDivide: {regM: xReg, regN: xReg, regD: xReg} -> instr
     and signedDivide: {regM: xReg, regN: xReg, regD: xReg} -> instr
     and unsignedDivide32: {regM: xReg, regN: xReg, regD: xReg} -> instr
     and signedDivide32: {regM: xReg, regN: xReg, regD: xReg} -> instr
 
     (* Logical operations on a shifted register, optionally setting the flags. *)
     val andShiftedReg: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr
     and orrShiftedReg: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr
     and eorShiftedReg: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr
     and andsShiftedReg: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr
 
     (* And a register with a bit pattern, discarding the results but setting the
        condition codes.  The bit pattern must be encodable. *)
     val testBitPattern: xReg * Word64.word -> instr
 
     (* Check whether a constant can be encoded. *)
     val isEncodableBitPattern: Word64.word * wordSize -> bool
 
     (* Load/Store an aligned word using a 12-bit offset.  The offset is in units
        of the size of the operand. *)
     val loadRegScaled: {regT: xReg, regN: xReg, unitOffset: int} -> instr
     and storeRegScaled: {regT: xReg, regN: xReg, unitOffset: int} -> instr
     and loadRegScaledByte: {regT: xReg, regN: xReg, unitOffset: int} -> instr
     and storeRegScaledByte: {regT: xReg, regN: xReg, unitOffset: int} -> instr
     and loadRegScaled16: {regT: xReg, regN: xReg, unitOffset: int} -> instr
     and storeRegScaled16: {regT: xReg, regN: xReg, unitOffset: int} -> instr
     and loadRegScaled32: {regT: xReg, regN: xReg, unitOffset: int} -> instr
     and storeRegScaled32: {regT: xReg, regN: xReg, unitOffset: int} -> instr
     and loadRegScaledDouble: {regT: vReg, regN: xReg, unitOffset: int} -> instr
     and storeRegScaledDouble: {regT: vReg, regN: xReg, unitOffset: int} -> instr
     and loadRegScaledFloat: {regT: vReg, regN: xReg, unitOffset: int} -> instr
     and storeRegScaledFloat: {regT: vReg, regN: xReg, unitOffset: int} -> instr
 
     (* Load/Store a value using a signed byte offset. *)
     val loadRegUnscaled: {regT: xReg, regN: xReg, byteOffset: int} -> instr
     and storeRegUnscaled: {regT: xReg, regN: xReg, byteOffset: int} -> instr
     and loadRegUnscaledByte: {regT: xReg, regN: xReg, byteOffset: int} -> instr
     and loadRegUnscaledSignedByteTo64: {regT: xReg, regN: xReg, byteOffset: int} -> instr
     and loadRegUnscaledSignedByteTo32: {regT: xReg, regN: xReg, byteOffset: int} -> instr
     and storeRegUnscaledByte: {regT: xReg, regN: xReg, byteOffset: int} -> instr
     and loadRegUnscaled16: {regT: xReg, regN: xReg, byteOffset: int} -> instr
     and loadRegUnscaledSigned16To64: {regT: xReg, regN: xReg, byteOffset: int} -> instr
     and loadRegUnscaledSigned16To32: {regT: xReg, regN: xReg, byteOffset: int} -> instr
     and storeRegUnscaled16: {regT: xReg, regN: xReg, byteOffset: int} -> instr
     and loadRegUnscaled32: {regT: xReg, regN: xReg, byteOffset: int} -> instr
     and loadRegUnscaledSigned32To64: {regT: xReg, regN: xReg, byteOffset: int} -> instr
     and storeRegUnscaled32: {regT: xReg, regN: xReg, byteOffset: int} -> instr
     and loadRegUnscaledFloat: {regT: vReg, regN: xReg, byteOffset: int} -> instr
     and storeRegUnscaledFloat: {regT: vReg, regN: xReg, byteOffset: int} -> instr
     and loadRegUnscaledDouble: {regT: vReg, regN: xReg, byteOffset: int} -> instr
     and storeRegUnscaledDouble: {regT: vReg, regN: xReg, byteOffset: int} -> instr
 
     (* Load/store with a register offset i.e. an index register. *)
     val loadRegIndexed: {regN: xReg, regM: xReg, regT: xReg, option: scale extend} -> instr
     and storeRegIndexed: {regN: xReg, regM: xReg, regT: xReg, option: scale extend} -> instr
     and loadRegIndexedByte: {regN: xReg, regM: xReg, regT: xReg, option: scale extend} -> instr
     and storeRegIndexedByte: {regN: xReg, regM: xReg, regT: xReg, option: scale extend} -> instr
     and loadRegIndexed16: {regN: xReg, regM: xReg, regT: xReg, option: scale extend} -> instr
     and storeRegIndexed16: {regN: xReg, regM: xReg, regT: xReg, option: scale extend} -> instr
     and loadRegIndexed32: {regN: xReg, regM: xReg, regT: xReg, option: scale extend} -> instr
     and storeRegIndexed32: {regN: xReg, regM: xReg, regT: xReg, option: scale extend} -> instr
     and loadRegIndexedFloat: {regN: xReg, regM: xReg, regT: vReg, option: scale extend} -> instr
     and storeRegIndexedFloat: {regN: xReg, regM: xReg, regT: vReg, option: scale extend} -> instr
     and loadRegIndexedDouble: {regN: xReg, regM: xReg, regT: vReg, option: scale extend} -> instr
     and storeRegIndexedDouble: {regN: xReg, regM: xReg, regT: vReg, option: scale extend} -> instr
 
     (* Load/Store a value using a signed byte offset and post-indexing (post-increment). *)
     (* The terminology is confusing. Pre-indexing means adding the offset into base address
        before loading the value, typically used for push, and post-index means using the
        original value of the base register as the address and adding in the offset after
        the value has been loaded, e.g. pop. *)
     val loadRegPostIndex: {regT: xReg, regN: xReg, byteOffset: int} -> instr
     and storeRegPostIndex: {regT: xReg, regN: xReg, byteOffset: int} -> instr
     and loadRegPostIndex32: {regT: xReg, regN: xReg, byteOffset: int} -> instr
     and storeRegPostIndex32: {regT: xReg, regN: xReg, byteOffset: int} -> instr
     and loadRegPostIndexByte: {regT: xReg, regN: xReg, byteOffset: int} -> instr
     and storeRegPostIndexByte: {regT: xReg, regN: xReg, byteOffset: int} -> instr
 
     (* Load/Store a value using a signed byte offset and pre-indexing (pre-increment). *)
     val loadRegPreIndex: {regT: xReg, regN: xReg, byteOffset: int} -> instr
     and storeRegPreIndex: {regT: xReg, regN: xReg, byteOffset: int} -> instr
     and loadRegPreIndex32: {regT: xReg, regN: xReg, byteOffset: int} -> instr
     and storeRegPreIndex32: {regT: xReg, regN: xReg, byteOffset: int} -> instr
     and loadRegPreIndexByte: {regT: xReg, regN: xReg, byteOffset: int} -> instr
     and storeRegPreIndexByte: {regT: xReg, regN: xReg, byteOffset: int} -> instr
 
     (* Loads and stores with special ordering. *)
     val loadAcquire: {regN: xReg, regT: xReg} -> instr
     and storeRelease: {regN: xReg, regT: xReg} -> instr
     and loadAcquire32: {regN: xReg, regT: xReg} -> instr
     and storeRelease32: {regN: xReg, regT: xReg} -> instr
 
     (* Load and store pairs of registers.  The offsets are signed scaled values. *)
     val storePairOffset: {regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int} -> instr
     and loadPairOffset: {regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int} -> instr
     and storePairPostIndexed: {regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int} -> instr
     and loadPairPostIndexed: {regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int} -> instr
     and storePairPreIndexed: {regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int} -> instr
     and loadPairPreIndexed: {regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int} -> instr
     and storePairOffsetFloat: {regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int} -> instr
     and loadPairOffsetFloat: {regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int} -> instr
     and storePairPostIndexedFloat: {regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int} -> instr
     and loadPairPostIndexesFloat: {regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int} -> instr
     and storePairPreIndexedFloat: {regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int} -> instr
     and loadPairPreIndexesFloat: {regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int} -> instr
     and storePairOffsetDouble: {regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int} -> instr
     and loadPairOffsetDouble: {regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int} -> instr
     and storePairPostIndexedDouble: {regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int} -> instr
     and loadPairPostIndexesDouble: {regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int} -> instr
     and storePairPreIndexedDouble: {regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int} -> instr
     and loadPairPreIndexesDouble: {regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int} -> instr
 
     (* This word is put in after a call to the RTS trap-handler.  All the registers
        are saved and restored across a call to the trap-handler; the register
        mask contains those that may contain an address and so need to be scanned and
        possibly updated if there is a GC. *)
     val registerMask: xReg list -> instr
 
     (* Create a label. *)
     val createLabel: unit -> labels
     (* Put a label into the code. *)
     val setLabel: labels -> instr
     (* A conditional branch. *)
     val conditionalBranch: condition * labels -> instr
     (* Unconditional branch *)
     and unconditionalBranch: labels -> instr
     (* Put the address of a label into a register - used for handlers and cases. *)
     and loadLabelAddress: xReg * labels -> instr
     (* Test a bit in a register and branch if zero/nonzero *)
     and testBitBranchZero: xReg * Word8.word * labels -> instr
     and testBitBranchNonZero: xReg * Word8.word * labels -> instr
     (* Compare a register with zero and branch if zero/nonzero *)
     and compareBranchZero: xReg * labels -> instr
     and compareBranchZero32: xReg * labels -> instr
     and compareBranchNonZero: xReg * labels -> instr
     and compareBranchNonZero32: xReg * labels -> instr
 
     (* Set the destination register to the value of the first reg if the
        condition is true otherwise to a, possibly modified, version of
        the second argument.  There are variants that set it unmodified,
        incremented, inverted and negated. *)
     val conditionalSet:
         {regD: xReg, regTrue: xReg, regFalse: xReg, cond: condition} -> instr
     val conditionalSetIncrement:
         {regD: xReg, regTrue: xReg, regFalse: xReg, cond: condition} -> instr
     val conditionalSetInverted:
         {regD: xReg, regTrue: xReg, regFalse: xReg, cond: condition} -> instr
     val conditionalSetNegated:
         {regD: xReg, regTrue: xReg, regFalse: xReg, cond: condition} -> instr
 
     (* Various shifts *)
     val logicalShiftLeft: {shift: word, regN: xReg, regD: xReg} -> instr
     and logicalShiftLeft32: {shift: word, regN: xReg, regD: xReg} -> instr
     and logicalShiftRight: {shift: word, regN: xReg, regD: xReg} -> instr
     and logicalShiftRight32: {shift: word, regN: xReg, regD: xReg} -> instr
     and arithmeticShiftRight: {shift: word, regN: xReg, regD: xReg} -> instr
     and arithmeticShiftRight32: {shift: word, regN: xReg, regD: xReg} -> instr
     (* Extract "width" least significant bits and place at offset "lsb" in the destination
        setting the rest of the register to zero. *)
     and unsignedBitfieldInsertinZeros: {lsb: word, width: word, regN: xReg, regD: xReg} -> instr
     and unsignedBitfieldInsertinZeros32: {lsb: word, width: word, regN: xReg, regD: xReg} -> instr
     (* Extract bits but leave the rest of the register unchanged.  Can be used
        to clear a specific range of bits by using XZero as the source. *)
     and bitfieldInsert: {lsb: word, width: word, regN: xReg, regD: xReg} -> instr
     and bitfieldInsert32: {lsb: word, width: word, regN: xReg, regD: xReg} -> instr
     (* Extract "width" bits starting from "lsb" in the source and place in the
        least significant bits of the destination, setting the high order bits to the
        sign bit. *)
     and signedBitfieldExtract: {lsb: word, width: word, regN: xReg, regD: xReg} -> instr
 
     (* Logical shift left Rd = Rn << (Rm mod 0w64) *)
     val logicalShiftLeftVariable: {regM: xReg, regN: xReg, regD: xReg} -> instr
     (* Logical shift right Rd = Rn >> (Rm mod 0w64) *)
     and logicalShiftRightVariable: {regM: xReg, regN: xReg, regD: xReg} -> instr
     (* Arithmetic shift right Rd = Rn ~>> (Rm mod 0w64) *)
     and arithmeticShiftRightVariable: {regM: xReg, regN: xReg, regD: xReg} -> instr
     and logicalShiftLeftVariable32: {regM: xReg, regN: xReg, regD: xReg} -> instr
     and logicalShiftRightVariable32: {regM: xReg, regN: xReg, regD: xReg} -> instr
     and arithmeticShiftRightVariable32: {regM: xReg, regN: xReg, regD: xReg} -> instr
 
     (* Logical operations on bit patterns.  The pattern must be valid.
        ANDS is an AND that also sets the flags, typically used for a test. *)
     val bitwiseAndImmediate: {bits: Word64.word, regN: xReg, regD: xReg} -> instr
     and bitwiseAndImmediate32: {bits: Word64.word, regN: xReg, regD: xReg} -> instr
     and bitwiseOrImmediate: {bits: Word64.word, regN: xReg, regD: xReg} -> instr
     and bitwiseOrImmediate32: {bits: Word64.word, regN: xReg, regD: xReg} -> instr
     and bitwiseXorImmediate: {bits: Word64.word, regN: xReg, regD: xReg} -> instr
     and bitwiseXorImmediate32: {bits: Word64.word, regN: xReg, regD: xReg} -> instr
     and bitwiseAndSImmediate: {bits: Word64.word, regN: xReg, regD: xReg} -> instr
     and bitwiseAndSImmediate32: {bits: Word64.word, regN: xReg, regD: xReg} -> instr
 
     (* Instructions involved in thread synchronisation. *)
     val yield: instr and dmbIsh: instr
     val loadAcquireExclusiveRegister: {regN: xReg, regT: xReg} -> instr
     val storeReleaseExclusiveRegister: {regN: xReg, regS: xReg, regT: xReg} -> instr
 
     (* Floating point moves and conversions.  Moves simply copy the bits.
        In all cases the integer argument is signed 64-bits. *)
     val moveGeneralToDouble: {regN: xReg, regD: vReg} -> instr
     and moveGeneralToFloat: {regN: xReg, regD: vReg} -> instr
     and moveDoubleToGeneral: {regN: vReg, regD: xReg} -> instr
     and moveFloatToGeneral: {regN: vReg, regD: xReg} -> instr
     and convertIntToDouble: {regN: xReg, regD: vReg} -> instr
     and convertIntToFloat: {regN: xReg, regD: vReg} -> instr
     and convertFloatToInt: IEEEReal.rounding_mode -> {regN: vReg, regD: xReg} -> instr
     and convertDoubleToInt: IEEEReal.rounding_mode -> {regN: vReg, regD: xReg} -> instr
     and convertInt32ToDouble: {regN: xReg, regD: vReg} -> instr
     and convertInt32ToFloat: {regN: xReg, regD: vReg} -> instr
     and convertFloatToInt32: IEEEReal.rounding_mode -> {regN: vReg, regD: xReg} -> instr
     and convertDoubleToInt32: IEEEReal.rounding_mode -> {regN: vReg, regD: xReg} -> instr
    
     (* Floating point operations. *)
     val multiplyFloat: {regM: vReg, regN: vReg, regD: vReg} -> instr
     and divideFloat: {regM: vReg, regN: vReg, regD: vReg} -> instr
     and addFloat: {regM: vReg, regN: vReg, regD: vReg} -> instr
     and subtractFloat: {regM: vReg, regN: vReg, regD: vReg} -> instr
     and multiplyDouble: {regM: vReg, regN: vReg, regD: vReg} -> instr
     and divideDouble: {regM: vReg, regN: vReg, regD: vReg} -> instr
     and addDouble: {regM: vReg, regN: vReg, regD: vReg} -> instr
     and subtractDouble: {regM: vReg, regN: vReg, regD: vReg} -> instr
 
     val compareFloat: {regM: vReg, regN: vReg} -> instr
     and compareDouble: {regM: vReg, regN: vReg} -> instr
     
     val moveFloatToFloat: {regN: vReg, regD: vReg} -> instr
     and absFloat: {regN: vReg, regD: vReg} -> instr
     and negFloat: {regN: vReg, regD: vReg} -> instr
     and convertFloatToDouble: {regN: vReg, regD: vReg} -> instr
     and moveDoubleToDouble: {regN: vReg, regD: vReg} -> instr
     and absDouble: {regN: vReg, regD: vReg} -> instr
     and negDouble: {regN: vReg, regD: vReg} -> instr
     and convertDoubleToFloat: {regN: vReg, regD: vReg} -> instr
+    
+    (* Special hack for callbacks in 32-in-64.  Must appear as
+       the first instructions in the callback. *)
+    val loadGlobalHeapBaseInCallback: xReg -> instr list
 
     (* Create the vector of code from the list of instructions and update the
        closure reference to point to it. *)
     val generateCode:
         {instrs: instr list, name: string, parameters: Universal.universal list, resultClosure: closureRef} -> unit
 
     (* Offsets in the assembly code interface pointed at by X26
        These are in units of 64-bits NOT bytes. *)
     val heapOverflowCallOffset: int
     and stackOverflowCallOffset: int
     and stackOverflowXCallOffset: int
     and exceptionHandlerOffset: int
     and stackLimitOffset: int
     and exceptionPacketOffset: int
     and threadIdOffset: int
     and heapLimitPtrOffset: int
     and heapAllocPtrOffset: int
     and mlStackPtrOffset: int
 
     val is32in64: bool
 
     structure Sharing:
     sig
         type closureRef = closureRef
         type instr = instr
         type xReg = xReg
         type vReg = vReg
         type labels = labels
         type condition = condition
         type shiftType = shiftType
         type wordSize = wordSize
         type 'a extend = 'a extend
         type scale = scale
     end
 end;
diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ForeignCall.sml b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ForeignCall.sml
index bbeb8d27..d92669fc 100644
--- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ForeignCall.sml
+++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ForeignCall.sml
@@ -1,850 +1,881 @@
 (*
     Copyright (c) 2021 David C. J. Matthews
 
     This library is free software; you can redistribute it and/or
     modify it under the terms of the GNU Lesser General Public
     Licence version 2.1 as published by the Free Software Foundation.
     
     This library is distributed in the hope that it will be useful,
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     Lesser General Public Licence for more details.
     
     You should have received a copy of the GNU Lesser General Public
     Licence along with this library; if not, write to the Free Software
     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 *)
 
 functor Arm64ForeignCall(
     structure CodeArray: CODEARRAYSIG
     and       Arm64Assembly: Arm64Assembly
     and       Debug: DEBUG
     and       Arm64Sequences: Arm64Sequences
 
     sharing CodeArray.Sharing = Arm64Assembly.Sharing = Arm64Sequences.Sharing
 ): FOREIGNCALLSIG
 =
 struct
 
     open CodeArray Arm64Assembly Arm64Sequences
     
     exception InternalError = Misc.InternalError
     and Foreign = Foreign.Foreign
 
     (* How an argument or the result of a "fast" RTS call is represented.
        As used by rtsCallFastGeneral: FastArgFixed values are passed in general
        registers, FastArgDouble values are unboxed into (or boxed from) a floating
        point register and FastArgFloat values are untagged or unboxed into (or
        tagged or boxed from) a floating point register. *)
     datatype fastArgs = FastArgFixed | FastArgDouble | FastArgFloat
 
     (* Given the name of an RTS function, return an entry-point object for it.
        This is bound to the RTS call "PolyCreateEntryPointObject".
        rtsCallFastGeneral loads the address to call from the first word of the object. *)
     val makeEntryPoint: string -> machineWord = RunCall.rtsCallFull1 "PolyCreateEntryPointObject"
 
     (* Turn the absolute address in "reg" into an object index, in place.
        In 32-in-64 mode this subtracts X_Base32in64 and shifts right by two bits;
        otherwise no code is generated. *)
     fun absoluteAddressToIndex reg =
         case is32in64 of
             false => []
         |   true =>
             let
                 val removeBase =
                     subShiftedReg{regM=X_Base32in64, regN=reg, regD=reg, shift=ShiftNone}
                 val scaleDown = logicalShiftRight{shift=0w2, regN=reg, regD=reg}
             in
                 [removeBase, scaleDown]
             end

     (* Turn the object index in "iReg" into an absolute address in "absReg".
        In 32-in-64 mode this is X_Base32in64 plus the index shifted left by two bits.
        Otherwise it is just a register move, which is omitted when the two
        registers are the same. *)
     and indexToAbsoluteAddress(iReg, absReg) =
         case (is32in64, iReg = absReg) of
             (true, _) =>
                 [addShiftedReg{regM=iReg, regN=X_Base32in64, regD=absReg, shift=ShiftLSL 0w2}]
         |   (false, true) => []
         |   (false, false) => [moveRegToReg{sReg=iReg, dReg=absReg}]
 
     local
         (* Generate code to allocate a byte object on the ML heap.
            "bytes" is subtracted from the heap allocation pointer and the result,
            left in "fixedReg", is compared with the heap limit.  If the carry is
            not set the heap-overflow entry in the assembly interface is called.
            The length word, "words" combined with the F_bytes flag in the top
            byte, is built in "workReg" and stored immediately below the new object. *)
         fun allocateWords(fixedReg, workReg, words, bytes) =
         let
             val spaceAvailable = createLabel()
             (* The flag byte goes in the top byte of a 32-bit or 64-bit length word. *)
             val lengthWord =
                 Word64.orb(words,
                     Word64.<<(Word64.fromLarge(Word8.toLarge Address.F_bytes),
                         if is32in64 then 0w24 else 0w56))
             val claimSpace =
             [
                 (* New object address = heap pointer - bytes. *)
                 subImmediate{regN=X_MLHeapAllocPtr, regD=fixedReg, immed=bytes, shifted=false},
                 (* Check it against the heap limit. *)
                 subSShiftedReg{regM=X_MLHeapLimit, regN=fixedReg, regD=XZero, shift=ShiftNone},
                 conditionalBranch(condCarrySet, spaceAvailable),
                 loadRegScaled{regT=X16, regN=X_MLAssemblyInt, unitOffset=heapOverflowCallOffset},
                 branchAndLinkReg X16,
                 registerMask [], (* Not used at the moment. *)
                 setLabel spaceAvailable,
                 (* Commit the allocation. *)
                 moveRegToReg{sReg=fixedReg, dReg=X_MLHeapAllocPtr}
             ]
             val buildLengthWord = loadNonAddress(workReg, lengthWord)
             (* The offset is negative so the unscaled form of the store is needed. *)
             val storeLengthWord =
                 case is32in64 of
                     true => storeRegUnscaled32{regT=workReg, regN=fixedReg, byteOffset= ~4}
                 |   false => storeRegUnscaled{regT=workReg, regN=fixedReg, byteOffset= ~8}
         in
             claimSpace @ buildLengthWord @ [storeLengthWord]
         end

         (* Allocation shared by boxDouble and boxSysWord: 16 bytes are claimed and
            the length is two words in 32-in-64 mode, one word otherwise. *)
         fun allocateBox(fixedReg, workReg) =
             allocateWords(fixedReg, workReg, if is32in64 then 0w2 else 0w1, 0w16)
     in
         (* Box the double in "floatReg".  The resulting ML value is left in "fixedReg". *)
         fun boxDouble(floatReg, fixedReg, workReg) =
             List.concat
             [
                 allocateBox(fixedReg, workReg),
                 [storeRegScaledDouble{regT=floatReg, regN=fixedReg, unitOffset=0}],
                 absoluteAddressToIndex fixedReg
             ]
         (* Box the 64-bit value in "toBoxReg".  The resulting ML value is left in "fixedReg". *)
         and boxSysWord(toBoxReg, fixedReg, workReg) =
             List.concat
             [
                 allocateBox(fixedReg, workReg),
                 [storeRegScaled{regT=toBoxReg, regN=fixedReg, unitOffset=0}],
                 absoluteAddressToIndex fixedReg
             ]

         (* Convert the single-precision value in "floatReg" to an ML value in "fixedReg".
            In 32-in-64 mode it is boxed in a one-word byte object.  Otherwise the
            bits are shifted into the top 32 bits of the register and bit 0 is set. *)
         fun boxOrTagFloat(floatReg, fixedReg, workReg) =
             case is32in64 of
                 true =>
                     allocateWords(fixedReg, workReg, 0w1, 0w8) @
                         (storeRegScaledFloat{regT=floatReg, regN=fixedReg, unitOffset=0} ::
                             absoluteAddressToIndex fixedReg)
             |   false =>
                 let
                     val copyBits = moveFloatToGeneral{regN=floatReg, regD=fixedReg}
                     val toTopHalf = logicalShiftLeft{shift=0w32, regN=fixedReg, regD=fixedReg}
                     val setTagBit = bitwiseOrImmediate{regN=fixedReg, regD=fixedReg, bits=0w1}
                 in
                     [copyBits, toTopHalf, setTagBit]
                 end
     end
 
     (* Load the double held in the box referred to by "addrReg" into "valReg".
        In 32-in-64 mode the index is first converted to an absolute address
        in "workReg"; otherwise "addrReg" is used directly and "workReg" is untouched. *)
     fun unboxDouble(addrReg, workReg, valReg) =
     let
         fun fetchFrom base = loadRegScaledDouble{regT=valReg, regN=base, unitOffset=0}
     in
         if not is32in64
         then [fetchFrom addrReg]
         else indexToAbsoluteAddress(addrReg, workReg) @ [fetchFrom workReg]
     end
 
     (* Get the single-precision value for the ML value in "addrReg" into "valReg".
        In 32-in-64 mode it is loaded using "addrReg" as a scaled index from X_Base32in64.
        Otherwise the value is shifted right by 32 bits into "workReg" and
        the bits are moved to the floating point register. *)
     fun unboxOrUntagSingle(addrReg, workReg, valReg) =
         case is32in64 of
             true =>
                 [loadRegIndexedFloat{regN=X_Base32in64, regM=addrReg, regT=valReg, option=ExtUXTX ScaleOrShift}]
         |   false =>
             let
                 val shiftDown = logicalShiftRight{shift=0w32, regN=addrReg, regD=workReg}
                 val toFloatReg = moveGeneralToFloat{regN=workReg, regD=valReg}
             in
                 [shiftDown, toFloatReg]
             end
 
     (* Build the code for a "fast" call to the RTS function "functionName" and return
        the address of the closure.  "argFormats" gives the representation of each
        argument and "resultFormat" that of the result.  "debugSwitches" is passed
        on to generateCode as the parameters. *)
     fun rtsCallFastGeneral (functionName, argFormats, resultFormat, debugSwitches) =
     let
         val entryPointAddr = makeEntryPoint functionName

         (* Move each ML argument to where the C function expects it.
            The maximum we currently have is five so there are never any stack arguments.
            Every argument uses up one ML register.  A fixed argument also uses up one
            C general register and a floating point argument one C floating point register. *)
         fun loadArgs(formats, mlRegs, cFixedRegs, cFloatRegs) =
             case (formats, mlRegs, cFixedRegs, cFloatRegs) of
                 ([], _, _, _) => []
             |   (FastArgFixed :: formats', mlReg :: mlRegs', cReg :: cFixedRegs', _) =>
                     if mlReg = cReg
                     then (* Nothing to do: it is already in the right register. *)
                         loadArgs(formats', mlRegs', cFixedRegs', cFloatRegs)
                     else moveRegToReg{sReg=mlReg, dReg=cReg} ::
                             loadArgs(formats', mlRegs', cFixedRegs', cFloatRegs)
             |   (FastArgDouble :: formats', mlReg :: mlRegs', _, fpReg :: cFloatRegs') =>
                     (* Unbox into the floating point register. *)
                     unboxDouble(mlReg, mlReg, fpReg) @
                         loadArgs(formats', mlRegs', cFixedRegs, cFloatRegs')
             |   (FastArgFloat :: formats', mlReg :: mlRegs', _, fpReg :: cFloatRegs') =>
                     (* Untag or unbox into the floating point register. *)
                     unboxOrUntagSingle(mlReg, mlReg, fpReg) @
                         loadArgs(formats', mlRegs', cFixedRegs, cFloatRegs')
             |   _ => raise InternalError "rtsCall: Too many arguments"

         val marshalArguments =
             loadArgs(argFormats,
                 (* ML Arguments *) [X0, X1, X2, X3, X4, X5, X6, X7],
                 (* C fixed pt args *) [X0, X1, X2, X3, X4, X5, X6, X7],
                 (* C floating pt args *) [V0, V1, V2, V3, V4, V5, V6, V7])

         val saveLinkAndGetEntry =
             [
                 (* Keep the return address in X23, a callee-save register. *)
                 (* Note: maybe we should push X24 just in case this is the only
                    reachable reference to the code. *)
                 orrShiftedReg{regN=XZero, regM=X_LinkReg, regD=X23, shift=ShiftNone},
                 loadAddressConstant(X16, entryPointAddr) (* The entry point object. *)
             ]

         val entryPointToAbsolute = indexToAbsoluteAddress(X16, X16)

         val callFunction =
             [
                 (* The real address is the first word of the entry point object. *)
                 loadRegScaled{regT=X16, regN=X16, unitOffset=0},
                 (* Save the current heap allocation pointer. *)
                 storeRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset},
                 (* For the moment the ML stack pointer is saved and restored.  No RTS call
                    should change it and it's callee-save but just in case... *)
                 storeRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset},
                 branchAndLinkReg X16, (* The call itself. *)
                 (* Reload the ML stack pointer. *)
                 loadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset},
                 (* Reload the heap allocation pointer and limit: the RTS call could have GCed. *)
                 loadRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset},
                 loadRegScaled{regT=X_MLHeapLimit, regN=X_MLAssemblyInt, unitOffset=heapLimitPtrOffset}
             ]

         val convertResult =
             case resultFormat of
                 FastArgFixed => []
             |   FastArgDouble => (* Must be boxed. *) boxDouble(V0, X0, X1)
             |   FastArgFloat => (* Must be tagged or boxed. *) boxOrTagFloat(V0, X0, X1)

         val instructions =
             List.concat
             [
                 marshalArguments,
                 saveLinkAndGetEntry,
                 entryPointToAbsolute,
                 callFunction,
                 convertResult,
                 [returnRegister X23]
             ]

         val closure = makeConstantClosure()
         val () = generateCode{instrs=instructions, name=functionName, parameters=debugSwitches, resultClosure=closure}
     in
         closureAsAddress closure
     end
 
 
     (* RTS call with "nArgs" arguments, all of them and the result being
        passed as fixed (general register) values. *)
     fun rtsCallFast (functionName, nArgs, debugSwitches) =
     let
         val allFixed = List.tabulate(nArgs, fn _ => FastArgFixed)
     in
         rtsCallFastGeneral (functionName, allFixed, FastArgFixed, debugSwitches)
     end
 
     (* RTS call taking one double-precision argument and returning a
        double-precision result. *)
     fun rtsCallFastRealtoReal (functionName, debugSwitches) =
     let
         val argFormats = [FastArgDouble]
     in
         rtsCallFastGeneral (functionName, argFormats, FastArgDouble, debugSwitches)
     end
     
     (* RTS call taking two double-precision arguments and returning a
        double-precision result. *)
     fun rtsCallFastRealRealtoReal (functionName, debugSwitches) =
     let
         val argFormats = [FastArgDouble, FastArgDouble]
     in
         rtsCallFastGeneral (functionName, argFormats, FastArgDouble, debugSwitches)
     end
 
     (* RTS call taking a double-precision argument followed by a fixed point
        argument and returning a double-precision result. *)
     fun rtsCallFastRealGeneraltoReal (functionName, debugSwitches) =
     let
         val argFormats = [FastArgDouble, FastArgFixed]
     in
         rtsCallFastGeneral (functionName, argFormats, FastArgDouble, debugSwitches)
     end
 
     (* RTS call taking one general (i.e. ML word) argument and returning a
        double-precision result.  Its only use is converting arbitrary precision
        values to floating point. *)
     fun rtsCallFastGeneraltoReal (functionName, debugSwitches) =
     let
         val argFormats = [FastArgFixed]
     in
         rtsCallFastGeneral (functionName, argFormats, FastArgDouble, debugSwitches)
     end
 
     (* Operations on Real32.real values. *)
 
     (* RTS call taking one single-precision argument and returning a
        single-precision result. *)
     fun rtsCallFastFloattoFloat (functionName, debugSwitches) =
     let
         val argFormats = [FastArgFloat]
     in
         rtsCallFastGeneral (functionName, argFormats, FastArgFloat, debugSwitches)
     end
     
     (* RTS call taking two single-precision arguments and returning a
        single-precision result. *)
     fun rtsCallFastFloatFloattoFloat (functionName, debugSwitches) =
     let
         val argFormats = [FastArgFloat, FastArgFloat]
     in
         rtsCallFastGeneral (functionName, argFormats, FastArgFloat, debugSwitches)
     end
 
     (* RTS call taking a single-precision argument followed by a fixed point
        argument and returning a single-precision result. *)
     fun rtsCallFastFloatGeneraltoFloat (functionName, debugSwitches) =
     let
         val argFormats = [FastArgFloat, FastArgFixed]
     in
         rtsCallFastGeneral (functionName, argFormats, FastArgFloat, debugSwitches)
     end
 
     (* RTS call taking one general (i.e. ML word) argument and returning a
        single-precision result.  Its only use is converting arbitrary precision
        values to floating point. *)
     fun rtsCallFastGeneraltoFloat (functionName, debugSwitches) =
     let
         val argFormats = [FastArgFixed]
     in
         rtsCallFastGeneral (functionName, argFormats, FastArgFloat, debugSwitches)
     end
 
     (* The ABIs available for foreign calls.  There is only one ABI value.
        abiList associates it with the name "default". *)
     datatype abi = ARM64Abi
     (* The list of (name, abi) pairs: just the single "default" entry. *)
     fun abiList () =
     let
         val defaultEntry = ("default", ARM64Abi)
     in
         [defaultEntry]
     end
 
     (* Round "s" up to a multiple of "align".  The mask is the negation of
        "align" so the result is only meaningful when "align" is a power of two. *)
     fun alignUp(s, align) =
     let
         val pastBoundary: Word.word = s + align - 0w1
     in
         Word.andb(pastBoundary, ~ align)
     end
     
     (* Entry-point object for the RTS function "PolyArm64GetThreadData". *)
     val getThreadDataCall = makeEntryPoint "PolyArm64GetThreadData"
 
     (* This must match the type in Foreign.LowLevel.  Once this is bootstrapped we could use that
        type but note that this is the type we use within the compiler and we build Foreign.LowLevel
        AFTER compiling this. *)
     (* A cType describes a C type by its form together with its alignment and size,
        both held as words.  CTypeStruct carries the cTypes of its members. *)
     datatype cTypeForm =
         CTypeFloatingPt | CTypePointer | CTypeSignedInt | CTypeUnsignedInt
     |   CTypeStruct of cType list | CTypeVoid
     withtype cType = { typeForm: cTypeForm, align: word, size: word }
 
     (* Load a byte, halfword, word or long.
        "offset" is a byte offset from "base" and must be a multiple of "size",
        which must be 1, 2, 4 or 8.  The offset is converted to the scaled (unit)
        offset that the load instruction takes.
        Raises InternalError if the offset is not aligned or the size is not valid. *)
     fun loadAlignedValue(reg, base, offset, size) =
     let
         val _ = offset mod size = 0w0 orelse raise InternalError "loadAlignedValue: not aligned"
         val loadOp =
             case size of
                 0w8 => loadRegScaled
             |   0w4 => loadRegScaled32
             |   0w2 => loadRegScaled16
             |   0w1 => loadRegScaledByte
             |   _ => raise InternalError "loadAlignedValue: invalid length"
     in
         loadOp{regT=reg, regN=base, unitOffset=Word.toInt(offset div size)}
     end

     (* Store a register into upto 8 bytes.  Most values will involve a single store but odd-sized
        structs can require shifts and multiple stores.  N.B.  May modify the source register.
        "offset" is a byte offset (an int) from "base" and "size" is the number of bytes.
        The pieces are stored in increasing address order:
          8 bytes or 4-7 bytes: a 64-bit or 32-bit store at "offset";
          2-3 bytes: a 16-bit store at "offset"; 1 byte: a byte store at "offset";
          6 or 7 bytes: additionally a 16-bit store of bits 32-47 at "offset"+4;
          3, 5 or 7 bytes: additionally a byte store of the last byte at "offset"+size-1.
        The source register is shifted in place before each additional store so
        each shift amount is relative to the previous contents of the register. *)
     and storeUpTo8(reg, base, offset, size) =
     let
         val storeOp =
             if size = 0w8 then storeRegUnscaled else if size >= 0w4 then storeRegUnscaled32
             else if size >= 0w2 then storeRegUnscaled16 else storeRegUnscaledByte
     in
         [storeOp{regT=reg, regN=base, byteOffset=offset}]
     end @
     (
         if size = 0w6 orelse size = 0w7
         then
         [
             logicalShiftRight{regN=reg, regD=reg, shift=0w32 },
             storeRegUnscaled16{regT=reg, regN=base, byteOffset=offset+4}
         ]
         else []
     ) @
     (
         if size = 0w3 orelse size = 0w5 orelse size = 0w7
         then
         let
             (* Bring the last byte down to the bottom of the register.  For seven
                bytes the register has already been shifted right by 32 bits for
                the halfword store above, so only a further 16 bits are needed to
                reach byte 6.  For three and five bytes it is still unshifted. *)
             val shiftToLastByte = if size = 0w7 then 0w16 else (size-0w1)*0w8
         in
             [
                 logicalShiftRight{regN=reg, regD=reg, shift=shiftToLastByte },
                 storeRegUnscaledByte{regT=reg, regN=base, byteOffset=offset+Word.toInt(size-0w1)}
             ]
         end
         else []
     )
 
     (* Flatten a C type into the list of (form, size) pairs of its scalar elements.
        Nested structures are expanded recursively, in member order.  Anything
        that is not a structure yields a single pair. *)
     fun unwrap(form, size) =
         case form of
             CTypeStruct members =>
                 List.concat
                     (List.map (fn (member: cType) => unwrap(#typeForm member, #size member)) members)
         |   scalar => [(scalar, size)]
 
     (* Structures of up to four floating point values of the same precision are treated specially. *)
     (* The classes produced by classifyArg.  For ArgClassHFA the Word8 value is the
        number of floating point elements and the bool is true if they are 8 bytes
        (double) and false if they are 4 bytes (float). *)
     datatype argClass =
         ArgClassHFA of Word8.word * bool (* 1 - 4 floating pt values *)
     |   ArgLargeStruct          (* > 16 bytes and not an HFA *)
     |   ArgSmall                (* Scalars or small structures *)
 
     (* Classify an argument from its type form and overall size.
        If, once flattened, it consists of between one and four floating point
        values that are all 4 bytes or all 8 bytes it is an ArgClassHFA.
        Otherwise it is ArgLargeStruct if it is bigger than 16 bytes and
        ArgSmall if not. *)
     fun classifyArg(ctype, size) =
     let
         val elements = unwrap(ctype, size)
         val count = List.length elements

         (* True if every element is a floating point value of the given width. *)
         fun allFloatsOf width =
             List.all (fn (CTypeFloatingPt, w) => w = width | _ => false) elements

         fun bySize () = if size > 0w16 then ArgLargeStruct else ArgSmall
     in
         if count < 1 orelse count > 4
         then bySize ()
         else if allFloatsOf 0w4
         then ArgClassHFA(Word8.fromInt count, false)
         else if allFloatsOf 0w8
         then ArgClassHFA(Word8.fromInt count, true)
         else bySize ()
     end
 
     (* Can a value of this size at this address be loaded or stored with a single
        instruction?  Sizes 1, 2, 4 and 8 need the address to be a multiple of the
        size.  Size 16 needs 8-byte alignment since a load-pair can be used.
        Any other size gives false. *)
     fun alignedLoadStore(addr, size) =
     let
         fun multipleOf n = addr mod n = 0w0
     in
         case size of
             0w1 => true
         |   0w2 => multipleOf 0w2
         |   0w4 => multipleOf 0w4
         |   0w8 => multipleOf 0w8
         |   0w16 => multipleOf 0w8 (* Can use load-pair. *)
         |   _ => false
     end
 
     (* This builds a piece of code that takes three arguments and returns a unit result.
        All three arguments are SysWord.word values i.e. ML addresses containing the address
        of the actual C value.
        The first argument (X0) is the address of the function to call.
        The second argument (X1) points to a struct that contains the argument(s) for the
        function.  The arguments have to be unpacked from the struct into the appropriate
        registers or to the C stack.
        The third argument (X2) points to a piece of memory to receive the result of the call.
        It may be empty if the function returns void.  It may only be as big as required
        for the result type. *)
     fun foreignCall(_: abi, args: cType list, result: cType): Address.machineWord =
     let
         val resultAreaPtr = X19 (* Unboxed value from X2 - This is callee save. *)
         val argPtrReg = X9 (* A scratch register that isn't used for arguments. *)
         val entryPtReg = X16 (* Contains the address of the function to call. *)
         val argWorkReg = X10 (* Used in loading arguments if necessary. *)
         and argWorkReg2 = X11
         and structSpacePtr = X12
         and argWorkReg3 = X13
         and argWorkReg4 = X14
 
         (* Generate the code that moves every C argument from the argument block
            (addressed by argPtrReg) into the place the callee expects it.
            Parameters:
              args             - descriptions of the arguments still to be processed
              stackOffset      - number of 8-byte stack slots (relative to XSP) already used
              argOffset        - byte offset within the argument block reached so far
              gRegNo, fpRegNo  - next unused general / floating point argument register;
                                 set to 0w8 once an argument of that class has gone to the stack
              code             - instructions generated so far; each step prepends its own code
              largeStructSpace - bytes already allocated in the area addressed by structSpacePtr
            Result: (code, stackOffset, largeStructSpace) once all arguments are processed. *)
         fun loadArgs([], stackOffset, _, _, _, code, largeStructSpace) =
                 (code, stackOffset, largeStructSpace)
 
         |   loadArgs(arg::args, stackOffset, argOffset, gRegNo, fpRegNo, code, largeStructSpace) =
             let
                 val {size, align, typeForm, ...} =  arg
                 (* Position of this argument within the argument block. *)
                 val newArgOffset = alignUp(argOffset, align)
             in
                 case classifyArg(typeForm, size) of
                     ArgClassHFA(numItems, isDouble) =>
                         if fpRegNo + numItems <= 0w8
                         then
                         let
                             val scale = if isDouble then 0w8 else 0w4
                             (* Load the values to the floating point registers.  Registers
                                are loaded in pairs with a single load for an odd final item. *)
                             fun loadFPRegs(0w0, _, _) = []
                             |   loadFPRegs(0w1, fpRegNo, offset) =
                                 [(if isDouble then loadRegScaledDouble else loadRegScaledFloat)
                                     {regT=VReg fpRegNo, regN=argPtrReg, unitOffset=offset}]
                             |   loadFPRegs(n, fpRegNo, offset) =
                                 (if isDouble then loadPairOffsetDouble else loadPairOffsetFloat)
                                     {regT1=VReg fpRegNo, regT2=VReg(fpRegNo+0w1), regN=argPtrReg, unitOffset=offset} ::
                                         loadFPRegs(n-0w2, fpRegNo+0w2, offset+2)
                         in
                             loadArgs(args, stackOffset, newArgOffset+size, gRegNo, fpRegNo+numItems,
                                 loadFPRegs(numItems, fpRegNo, Word.toInt(newArgOffset div scale)) @ code,
                                 largeStructSpace)
                         end
                         else
                         let
                             (* If we have insufficient number of registers we discard any that are
                                left and push the argument to the stack. *)
                             (* The floating point value or structure is copied to the stack
                                as a contiguous area.  Use general registers to copy the data.
                                It could be on a 4-byte alignment.  In the typical case of a single
                                floating point value this will just be a single load and store. *)
                             fun copyData(0w0, _, _) = []
                             |   copyData(n, srcOffset, stackOffset) =
                                 if isDouble
                                 then loadRegScaled{regT=argWorkReg2, regN=argPtrReg, unitOffset=srcOffset} ::
                                      storeRegScaled{regT=argWorkReg2, regN=XSP, unitOffset=stackOffset} ::
                                      copyData(n-0w1, srcOffset+1, stackOffset+1)
                                 else loadRegScaled32{regT=argWorkReg2, regN=argPtrReg, unitOffset=srcOffset} ::
                                      storeRegScaled32{regT=argWorkReg2, regN=XSP, unitOffset=stackOffset} ::
                                      copyData(n-0w1, srcOffset+1, stackOffset+1)
 
                             (* stackOffset counts 8-byte slots so it is doubled when the copy
                                works in 4-byte units. *)
                             val copyToStack =
                                 if isDouble
                                 then copyData(numItems, Word.toInt(newArgOffset div 0w8), stackOffset)
                                 else copyData(numItems, Word.toInt(newArgOffset div 0w4), stackOffset*2)
                             (* The overall size is rounded up to a multiple of 8 *)
                             val newStackOffset = stackOffset + Word.toInt(alignUp(size, 0w8) div 0w8)
                         in
                             loadArgs(args, newStackOffset, newArgOffset+size, gRegNo, 0w8,
                                 copyToStack @ code, largeStructSpace)
                         end
              
                 |   _ =>
                     let
                         (* Load an aligned argument into one or two registers or copy it to the stack.
                            argSize is the number of bytes to transfer from sourceBase+sourceOffset.
                            It equals size for a directly loadable argument but is 0w8 or 0w16 when
                            the argument has first been copied to the aligned struct area.
                            preCode is run before the load. *)
                         fun loadArgumentValues(argSize, sourceOffset, sourceBase, newStructSpace, preCode) =
                             if gRegNo <= 0w6 orelse (size <= 0w8 andalso gRegNo <= 0w7)
                             then (* There are sufficient registers *)
                             let
                                 (* Use argSize, not size, for the single-register load.  For a small
                                    struct of awkward size that has been copied to the struct area
                                    argSize is 0w8, which is a width an aligned load can handle, and this
                                    matches what the stack case below already does. *)
                                 val (loadInstr, nextGReg) =
                                     if argSize = 0w16
                                     then ([loadPairOffset{regT1=XReg gRegNo, regT2=XReg(gRegNo+0w1),
                                                 regN=sourceBase, unitOffset=Word.toInt(sourceOffset div 0w8)}], gRegNo+0w2)
                                     else ([loadAlignedValue(XReg gRegNo, sourceBase, sourceOffset, argSize)], gRegNo+0w1)
                             in
                                 loadArgs(args, stackOffset, newArgOffset+size, nextGReg, fpRegNo,
                                         preCode @ loadInstr @ code, newStructSpace)
                             end
                             else if argSize = 0w16
                             then loadArgs(args, stackOffset+2, newArgOffset+size, 0w8, fpRegNo,
                                     preCode @
                                     loadPairOffset{regT1=argWorkReg2, regT2=argWorkReg3, regN=sourceBase, unitOffset=Word.toInt(sourceOffset div 0w8)} ::
                                     storePairOffset{regT1=argWorkReg2, regT2=argWorkReg3, regN=XSP, unitOffset=stackOffset} :: code, newStructSpace)
                             else loadArgs(args, stackOffset+1, newArgOffset+size, 0w8, fpRegNo,
                                     preCode @ loadAlignedValue(argWorkReg2, sourceBase, sourceOffset, argSize) ::
                                     storeRegScaled{regT=argWorkReg2, regN=XSP, unitOffset=stackOffset} :: code, newStructSpace)
                     in
                         if alignedLoadStore(newArgOffset, size)
                         then loadArgumentValues(size, newArgOffset, argPtrReg, largeStructSpace, [])
 
                         else (* General case.  Either a large structure or a small structure that
                                 can't easily be loaded.  First copy it to the stack, and either pass
                                 the address or load it once it's aligned. *)
                         let
                             (* Each copy occupies a multiple of 16 bytes of the struct area. *)
                             val newStructSpace = alignUp(largeStructSpace + size, 0w16)
                             val loopLabel = createLabel()
                             (* The address of the area we're copying to is in argRegNo. *)
                             val argRegNo = if gRegNo < 0w8 then XReg gRegNo else argWorkReg
                             (* Copy from the end back to the start, one byte at a time. *)
                             val copyToStructSpace =
                             [
                                 addImmediate{regN=structSpacePtr, regD=argRegNo, immed=largeStructSpace, shifted=false}, 
                                 addImmediate{regN=argRegNo, regD=argWorkReg2, immed=size, shifted=false}, (* End of dest area *)
                                 addImmediate{regN=argPtrReg, regD=argWorkReg3, immed=newArgOffset+size, shifted=false}, (* end of source *)
                                 setLabel loopLabel,
                                 loadRegPreIndexByte{regT=argWorkReg4, regN=argWorkReg3, byteOffset= ~1},
                                 storeRegPreIndexByte{regT=argWorkReg4, regN=argWorkReg2, byteOffset= ~1},
                                 subSShiftedReg{regM=argWorkReg2, regN=argRegNo, regD=XZero, shift=ShiftNone}, (* At start? *)
                                 conditionalBranch(condNotEqual, loopLabel)
                             ]
                         in
                             if size > 0w16
                             then (* Large struct - pass by reference *)
                             (
                                 if gRegNo < 0w8
                                 then loadArgs(args, stackOffset, newArgOffset+size, gRegNo+0w1, fpRegNo, copyToStructSpace @ code, newStructSpace)
                                 else loadArgs(args, stackOffset+1, newArgOffset+size, 0w8, fpRegNo,
                                         copyToStructSpace @ storeRegScaled{regT=argWorkReg, regN=XSP, unitOffset=stackOffset} :: code,
                                         newStructSpace)
                             )
                             else (* Small struct.  Since it's now in an area at least 16 bytes and properly aligned we can load it. *)
                                 (* argRegNo points to where we copied it *)
                                 loadArgumentValues(if size > 0w8 then 0w16 else 0w8, 0w0, argRegNo, newStructSpace, copyToStructSpace)
                         end
                     end
             end
     
         (* getResult is the code, run after the C function returns, that writes the
            returned value into the area addressed by resultAreaPtr.  passArgAddress is
            true when the callee itself writes the result through a pointer, in which
            case nothing has to be stored afterwards. *)
         val (getResult, passArgAddress) =
             let
                 val {size, typeForm, ...} =  result
                 (* Write the low "len" bytes of a result register.  This is normally a
                    single store: only structs of odd sizes need more. *)
                 fun storeResult(reg, offset, len) = storeUpTo8(reg, resultAreaPtr, offset, len)
             in
                 if typeForm = CTypeVoid
                 then ([], false)
                 else case classifyArg(typeForm, size) of
                     (* Floating point values come back in s0-sn or d0-dn. *)
                     ArgClassHFA(numItems, isDouble) =>
                         let
                             val storeSingle =
                                 if isDouble then storeRegScaledDouble else storeRegScaledFloat
                             val storePair =
                                 if isDouble then storePairOffsetDouble else storePairOffsetFloat
                             (* Store registers two at a time with a single store for an odd last one. *)
                             fun saveFP(0w0, _, _) = []
                             |   saveFP(0w1, vNo, slot) =
                                     [storeSingle{regT=VReg vNo, regN=resultAreaPtr, unitOffset=slot}]
                             |   saveFP(remaining, vNo, slot) =
                                     storePair{regT1=VReg vNo, regT2=VReg(vNo+0w1), regN=resultAreaPtr, unitOffset=slot} ::
                                         saveFP(remaining-0w2, vNo+0w2, slot+2)
                         in
                             (saveFP(numItems, 0w0 (* V0-Vn*), 0), false)
                         end
 
                 |   ArgLargeStruct => ([], true) (* Structures larger than 16 bytes are passed by reference. *)
 
                 |   _ =>
                         if size <= 0w8
                         then (storeResult(X0, 0, size), false) (* Fits in X0 alone. *)
                         else if size = 0w16
                         then ([storePairOffset{regT1=X0, regT2=X1, regN=resultAreaPtr, unitOffset=0}], false)
                         else (* 9 to 15 bytes: all of X0 and part of X1. *)
                             (storeRegScaled{regT=X0, regN=resultAreaPtr, unitOffset=0} :: storeResult(X1, 8, size-0w8), false)
             end
 
         (* Generate the argument-moving code.  argStack is the number of 8-byte stack
            slots used for arguments and largeStructSpace the number of bytes needed
            for copies of structs. *)
         val (argCode, argStack, largeStructSpace) =
             loadArgs(args, 0, 0w0, 0w0, 0w0,
                     if passArgAddress (* If we have to pass the address of the result struct it goes in X8. *)
                     then [moveRegToReg{sReg=resultAreaPtr, dReg=X8}]
                     else [], 0w0)
 
         (* Total C stack adjustment in bytes: the argument slots rounded up to a
            multiple of 16 followed by the struct copy area. *)
         val stackSpaceRequired = alignUp(Word.fromInt argStack * 0w8, 0w16) + largeStructSpace
 
         (* The complete code for the call: save ML state, unbox the entry point,
            result area and argument block addresses, set up the C stack, move the
            arguments, call, then restore ML state and store the result. *)
         val instructions =
             [(* Push the return address to the stack.  We could put it in a callee-save register but
                 there's a very small chance that this could be the last reference to a piece of code. *)
              storeRegPreIndex{regT=X30, regN=X_MLStackPtr, byteOffset= ~8},
              (* Save heap ptr.  Needed in case we have a callback. *)
              storeRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset}
             ] @ indexToAbsoluteAddress(X0, X0) @
              (* Load the entry point address. *)
              loadRegScaled{regT=entryPtReg, regN=X0, unitOffset=0} ::
             (
                 (* Unbox the address of the result area into a callee save register.  This is where
                    the result will be stored on return if it is anything other than a struct.
                    We have to put the C address in there now because an ML address wouldn't be updated
                    by a possible GC in a callback. *)
                 if #typeForm(result) <> CTypeVoid
                 then indexToAbsoluteAddress(X2, X2) @ [loadRegScaled{regT=resultAreaPtr, regN=X2, unitOffset=0}]
                 else []
             ) @
             [storeRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset}] @ (* Save the stack pointer. *)
             (
                 (* Reserve the C stack space for stack arguments and struct copies, if any. *)
                 if stackSpaceRequired = 0w0
                 then []
                 else [subImmediate{regN=XSP, regD=XSP, immed=stackSpaceRequired, shifted=false}]
             ) @
             (
                 (* If we need to copy a struct load a register with a pointer to the area for it.
                    The area is the last largeStructSpace bytes of the reserved space. *)
                 if largeStructSpace = 0w0
                 then []
                 else [addImmediate{regN=XSP, regD=structSpacePtr, immed=stackSpaceRequired-largeStructSpace, shifted=false}]
             ) @
             (
                 (* The second argument is a SysWord containing the address of a malloced area of memory
                    with the actual arguments in it. *)
                 if null args
                 then []
                 else indexToAbsoluteAddress(X1, X1) @ [loadRegScaled{regT=argPtrReg, regN=X1, unitOffset=0}]
             ) @ argCode @
             (* NOTE(review): the entry point was loaded into entryPtReg above; this relies on
                entryPtReg being X16 - confirm against its definition. *)
             [branchAndLinkReg X16] @ (* Call the function. *)
             (* Restore the C stack value in case it's been changed by a callback. *)
             (
                 if stackSpaceRequired = 0w0
                 then []
                 else [addImmediate{regN=XSP, regD=XSP, immed=stackSpaceRequired, shifted=false}]
             ) @
             [
                 (* Reload the ML stack pointer even though it's callee save.  If we've made a callback
                    the ML stack could have grown and so moved to a different address. *)
                 loadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset},
                 (* Load the heap allocation ptr and limit in case of a callback. *)
                 loadRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset},
                 loadRegScaled{regT=X_MLHeapLimit, regN=X_MLAssemblyInt, unitOffset=heapLimitPtrOffset}
             ] @ (* Store the result in the destination. *) getResult @
             (* Pop the return address and return. *)
             [ loadRegPostIndex{regT=X30, regN=X_MLStackPtr, byteOffset= 8}, returnRegister X30 ]
 
         val functionName = "foreignCall"
         (* Uncomment the entries to print the generated assembly code. *)
         val debugSwitches =
             [(*Universal.tagInject Pretty.compilerOutputTag (Pretty.prettyPrint(print, 70)),
                Universal.tagInject Debug.assemblyCodeTag true*)]
         (* Create the closure cell and generate the code into it. *)
         val closure = makeConstantClosure()
         val () = generateCode{instrs=instructions, name=functionName, parameters=debugSwitches, resultClosure=closure}
     in
         closureAsAddress closure
     end
 
     (* Build a callback function.  The arguments are the abi, the list of argument types and the result type.
        The result is an ML function that takes an ML function, f, as its argument, registers it as a callback and
        returns the C function as its result.  When the C function is called the arguments are copied into
        temporary memory and the vector passed to f along with the address of the memory for the result.
        "f" stores the result in it when it returns and the result is then passed back as the result of the
        callback.
        N.B.  This returns a closure cell which contains the address of the code.  It can be used as a
        SysWord.word value except that while it exists the code will not be GCed.  *)
     fun buildCallBack(_: abi, args: cType list, result: cType): Address.machineWord =
     let
-        val _ = if is32in64 then raise Foreign "Callbacks not yet implemented in compact 32-bit ARM" else ()
         val argWorkReg = X10 (* Used in loading arguments if necessary. *)
         and argWorkReg2 = X11
         and argWorkReg3 = X13
         and argWorkReg4 = X14
         
         (* The stack contains a 32-byte result area then an aligned area for the arguments. *)
 
         (* Store the argument values to the structure that will be passed to the ML callback function. *)
         (* Note.  We've loaded the frame pointer with the original stack ptr-96 so we can
            access any stack arguments from that. *)
         (* moveArgs(args, stackSpace, argOffset, gRegNo, fpRegNo, moveFromStack)
              args            - descriptions of the arguments still to be processed
              stackSpace      - index, in 8-byte units from X29, of the next stack-passed argument
              argOffset       - byte offset from XSP at which the next argument is to be stored
              gRegNo, fpRegNo - next unused general / floating point argument register; set to
                                0w8 once an argument of that class has been taken from the stack
              moveFromStack   - instructions generated so far; each step prepends its own code
            The result is the complete instruction list. *)
         fun moveArgs([], _, _, _, _, moveFromStack) = moveFromStack
         
         |   moveArgs(arg::args, stackSpace, argOffset, gRegNo, fpRegNo, moveFromStack) =
             let
                 val {size, align, typeForm, ...} =  arg
                 val newArgOffset = alignUp(argOffset, align)
             in
                 case classifyArg(typeForm, size) of
                     ArgClassHFA(numItems, isDouble) =>
                         if fpRegNo + numItems <= 0w8
                         then
                         let
                             val scale = if isDouble then 0w8 else 0w4
                             (* Store the values from the FP registers. *)
                             fun storeFPRegs(0w0, _, _) = []
                             |   storeFPRegs(0w1, fpRegNo, offset) =
                                 [(if isDouble then storeRegScaledDouble else storeRegScaledFloat)
                                     {regT=VReg fpRegNo, regN=XSP, unitOffset=offset}]
                             |   storeFPRegs(n, fpRegNo, offset) =
                                 (if isDouble then storePairOffsetDouble else storePairOffsetFloat)
                                     {regT1=VReg fpRegNo, regT2=VReg(fpRegNo+0w1), regN=XSP, unitOffset=offset} ::
                                         storeFPRegs(n-0w2, fpRegNo+0w2, offset+2)
                         in
                             moveArgs(args, stackSpace, newArgOffset+size, gRegNo, fpRegNo+numItems,
                                 storeFPRegs(numItems, fpRegNo, Word.toInt(newArgOffset div scale)) @ moveFromStack)
                         end
                         else
                         let
                             (* Load the arguments from the stack and store into the argument area. *)
                             fun copyData(0w0, _, _) = []
                             |   copyData(n, dstOffset, stackOffset) =
                                 if isDouble
                                 then loadRegScaled{regT=argWorkReg2, regN=X29, unitOffset=stackOffset} ::
                                      storeRegScaled{regT=argWorkReg2, regN=XSP, unitOffset=dstOffset} ::
                                      copyData(n-0w1, dstOffset+1, stackOffset+1)
                                 else loadRegScaled32{regT=argWorkReg2, regN=X29, unitOffset=stackOffset} ::
                                      storeRegScaled32{regT=argWorkReg2, regN=XSP, unitOffset=dstOffset} ::
                                      copyData(n-0w1, dstOffset+1, stackOffset+1)
 
                             (* stackSpace counts 8-byte units so it is doubled when the copy
                                works in 4-byte units. *)
                             val copyFromStack =
                                 if isDouble
                                 then copyData(numItems, Word.toInt(newArgOffset div 0w8), stackSpace)
                                 else copyData(numItems, Word.toInt(newArgOffset div 0w4), stackSpace*2)
                             (* The overall size is rounded up to a multiple of 8 *)
                             val newStackOffset = stackSpace + Word.toInt(alignUp(size, 0w8) div 0w8)
                         in
                             moveArgs(args, newStackOffset, newArgOffset+size, gRegNo, 0w8,
                                 copyFromStack @ moveFromStack)
                         end
 
                 |   _ =>
                     if alignedLoadStore(newArgOffset, size) andalso
                         (gRegNo <= 0w6 orelse gRegNo = 0w7 andalso size <= 0w8)
                     then (* Usual case: argument passed in one or two registers. *)
                     (
                         if size > 0w8
                         then moveArgs(args, stackSpace, newArgOffset+size, gRegNo + 0w2, fpRegNo,
                                 storePairOffset{regT1=XReg gRegNo, regT2=XReg(gRegNo+0w1), regN=XSP,
                                     unitOffset=Word.toInt(newArgOffset div 0w8)} :: moveFromStack)
                         else moveArgs(args, stackSpace, newArgOffset+size, gRegNo + 0w1, fpRegNo,
                                 storeUpTo8(XReg gRegNo, XSP, Word.toInt newArgOffset, size) @ moveFromStack)
                     )
                     else
                     (* General case.  Store the argument registers if necessary and then use
                        a byte copy to copy into the argument area.  This sorts out any odd alignments or
                        lengths.  In some cases the source will be in memory already. *)
                     let
                         (* The source is either the register value or the value on the stack. *)
                         (* argRegNo holds the address of the source when copyCode runs, nextGReg and
                            newStack are the register number and stack index for the following
                            argument and loadArg is the code that sets up argRegNo. *)
                         val (argRegNo, nextGReg, newStack, loadArg) =
                             if size > 0w16
                             then
                             (
                                 (* Larger than 16 bytes: the caller has passed the address of the
                                    struct, either in a register or in a stack slot. *)
                                 if gRegNo < 0w8
                                 then (XReg gRegNo, gRegNo + 0w1, stackSpace, [])
                                 else (argWorkReg, 0w8, stackSpace+1, [loadRegScaled{regT=argWorkReg, regN=X29, unitOffset=stackSpace}])
                             )
                             else
                             let
                                 val regsNeeded = if size > 0w8 then 0w2 else 0w1
                             in
                                 if gRegNo + regsNeeded <= 0w8
                                 (* Passed in registers.  Save the register(s) at XSP+16, inside the
                                    result area, and point the register at that copy so that the byte
                                    copy can read it from memory. *)
                                 then (XReg gRegNo, gRegNo+regsNeeded, stackSpace,
                                         [if size > 0w8
                                          then storePairOffset{regT1=XReg gRegNo, regT2=XReg(gRegNo+0w1), regN=XSP, unitOffset=2}
                                          else storeRegScaled{regT=XReg gRegNo, regN=XSP, unitOffset=2},
                                          addImmediate{regD=XReg gRegNo, regN=XSP, immed=0w16, shifted=false}])
                                 else (* Being passed on the stack *)
                                     (argWorkReg, 0w8, stackSpace+Word8.toInt regsNeeded,
                                         [addImmediate{regD=argWorkReg, regN=X29, immed=Word.fromInt stackSpace*0w8, shifted=false}])
                             end
 
                         val loopLabel = createLabel()
 
                         (* Byte copy from the end of the source back to its start. *)
                         val copyCode =
                         [ 
                             addImmediate{regN=argRegNo, regD=argWorkReg3, immed=size, shifted=false}, (* End of source area *)
                             addImmediate{regN=XSP, regD=argWorkReg2, immed=newArgOffset+size, shifted=false}, (* end of dest *)
                             setLabel loopLabel,
                             loadRegPreIndexByte{regT=argWorkReg4, regN=argWorkReg3, byteOffset= ~1},
                             storeRegPreIndexByte{regT=argWorkReg4, regN=argWorkReg2, byteOffset= ~1},
                             subSShiftedReg{regM=argWorkReg3, regN=argRegNo, regD=XZero, shift=ShiftNone}, (* At start? *)
                             conditionalBranch(condNotEqual, loopLabel)
                         ]
                     in
                         moveArgs(args, newStack, newArgOffset+size, nextGReg, fpRegNo, loadArg @ copyCode @ moveFromStack)
                     end
             end
 
         val copyArgsFromRegsAndStack =
             moveArgs(args, 12 (* Offset to first stack arg *), 0w32 (* Size of result area *), 0w0, 0w0, [])
 
         (* Bytes needed to hold all the arguments, each at its required alignment,
            rounded up to a multiple of 16. *)
         local
             fun getNextSize (arg, argOffset) =
             let val {size, align, ...} =  arg in alignUp(argOffset, align) + size end
         in
             val argumentSpace = alignUp(List.foldl getNextSize 0w0 args, 0w16)
         end
 
         local
             val {size, typeForm, ...} =  result
         in
             (* Load the results from the result area except that if we're
                passing the result structure by reference this is done by
                the caller.  Generally similar to how arguments are passed
                in a call. *)
             val (loadResults, resultByReference) =
                 if typeForm = CTypeVoid
                 then ([], false)
                 else case classifyArg(typeForm, size) of
                     ArgClassHFA(numItems, isDouble) =>
                         let
                             (* Load the values to the floating point registers. *)
                             fun loadFPRegs(0w0, _, _) = []
                             |   loadFPRegs(0w1, fpRegNo, offset) =
                                 [(if isDouble then loadRegScaledDouble else loadRegScaledFloat)
                                     {regT=VReg fpRegNo, regN=XSP, unitOffset=offset}]
                             |   loadFPRegs(n, fpRegNo, offset) =
                                 (if isDouble then loadPairOffsetDouble else loadPairOffsetFloat)
                                     {regT1=VReg fpRegNo, regT2=VReg(fpRegNo+0w1), regN=XSP, unitOffset=offset} ::
                                         loadFPRegs(n-0w2, fpRegNo+0w2, offset+2)
                         in
                             (loadFPRegs(numItems, 0w0, 0 (* result area *)), false)
                         end
 
                 |   ArgLargeStruct => ([], true) (* Structures larger than 16 bytes are passed by reference. *)
 
                 |   _ =>
                     (* We've allocated a 32-byte area aligned onto a 16-byte boundary so
                        we can simply load one or two registers. *)
                     if size > 0w8
                     then ([loadPairOffset{regT1=X0, regT2=X1, regN=XSP, unitOffset=0}], false)
                     else ([loadRegScaled{regT=X0, regN=XSP, unitOffset=0}], false)
         end
 
         (* The second stage code, shared by every callback with this signature.  It is
            entered with the ML function in X9.  The 96-byte frame holds X29/X30 at the
            bottom followed by X19-X28. *)
         val instructions =
             [ (* Push LR, FP and the callee-save registers. *)
                 storePairPreIndexed{regT1=X29, regT2=X30, regN=XSP, unitOffset= ~12},
                 moveRegToReg{sReg=XSP, dReg=X29},
                 storePairOffset{regT1=X19, regT2=X20, regN=X29, unitOffset=2},
                 storePairOffset{regT1=X21, regT2=X22, regN=X29, unitOffset=4},
                 storePairOffset{regT1=X23, regT2=X24, regN=X29, unitOffset=6},
                 storePairOffset{regT1=X25, regT2=X26, regN=X29, unitOffset=8},
                 storePairOffset{regT1=X27, regT2=X28, regN=X29, unitOffset=10},
                 (* Reserve space for the arguments and results. *)
                 subImmediate{regN=XSP, regD=XSP, immed=argumentSpace+0w32, shifted=false},
                 (* We passed the function we're calling in X9 but we need to move
                    it to a callee-save register before we call the RTS. *)
                 moveRegToReg{sReg=X9, dReg=X20}
             ] @
                 (* Save X8 if we're going to need it. *)
             (if resultByReference then [storeRegScaled{regT=X8, regN=XSP, unitOffset=0}] else []) @
+            (* Now we've saved X24 we can move the global heap base into it. *)
+            (if is32in64 then [moveRegToReg{sReg=X10, dReg=X_Base32in64}] else []) @
             copyArgsFromRegsAndStack @
+            [loadAddressConstant(X0, getThreadDataCall)] @
+            (
+                if is32in64
+                then [addShiftedReg{regM=X0, regN=X_Base32in64, regD=X0, shift=ShiftLSL 0w2}]
+                else []
+            ) @
             [
                 (* Call into the RTS to get the thread data ptr. *)
-                loadAddressConstant(X0, getThreadDataCall),
                 loadRegScaled{regT=X0, regN=X0, unitOffset=0},
                 branchAndLinkReg X0,
                 moveRegToReg{sReg=X0, dReg=X_MLAssemblyInt},
                 (* Load the ML regs. *)
                 loadRegScaled{regT=X_MLHeapLimit, regN=X_MLAssemblyInt, unitOffset=heapLimitPtrOffset},
                 loadRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset},
                 loadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset},
                 (* Prepare the arguments.  They are both syswords so have to be boxed.
                    First load the address of the argument area which is after the
                    32-byte result area. *)
                 addImmediate{regN=XSP, regD=X2, immed=0w32, shifted=false}
             ] @ boxSysWord(X2, X0, X3) @ (* Address of arguments. *)
             (
                 (* Result area pointer.  If we're returning by reference this is the original
                    value of X8 otherwise it's the address of the 32 bytes we've reserved. *)
                 if resultByReference
                 then [loadRegScaled{regT=X2, regN=XSP, unitOffset=0}]
                 else [moveRegToReg{sReg=XSP, dReg=X2}]
             ) @ boxSysWord(X2, X1, X3) @
-            [
                 (* Put the ML closure pointer, originally in X9 now in X20, into the
                    ML closure pointer register, X8.  Then call the ML code. *)
-                moveRegToReg{sReg=X20, dReg=X8},
-                loadRegScaled{regT=X16, regN=X8, unitOffset=0},
+            [moveRegToReg{sReg=X20, dReg=X8}] @
+            (
+                if is32in64
+                then
+                [
+                    addShiftedReg{regM=X8, regN=X_Base32in64, regD=X16, shift=ShiftLSL 0w2},
+                    loadRegScaled{regT=X16, regN=X16, unitOffset=0}
+                ]
+                else [loadRegScaled{regT=X16, regN=X8, unitOffset=0}]
+            ) @
+            [
                 branchAndLinkReg X16,
                 (* Save the ML stack and heap pointers.  We could have allocated or
                    grown the stack.  The limit pointer is maintained by the RTS. *)
                 storeRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset},
                 storeRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset}
             ] @ loadResults @ (* Load the return values *)
             [
                 (* Restore the callee-save registers and return. *)
                 moveRegToReg{sReg=X29, dReg=XSP},
                 loadPairOffset{regT1=X19, regT2=X20, regN=X29, unitOffset=2},
                 loadPairOffset{regT1=X21, regT2=X22, regN=X29, unitOffset=4},
                 loadPairOffset{regT1=X23, regT2=X24, regN=X29, unitOffset=6},
                 loadPairOffset{regT1=X25, regT2=X26, regN=X29, unitOffset=8},
                 loadPairOffset{regT1=X27, regT2=X28, regN=X29, unitOffset=10},
                 loadPairPostIndexed{regT1=X29, regT2=X30, regN=XSP, unitOffset=12},
                 returnRegister X30
             ]
 
         val functionName = "foreignCallBack(2)"
         val debugSwitches =
             [(*Universal.tagInject Pretty.compilerOutputTag (Pretty.prettyPrint(print, 70)),
                Universal.tagInject Debug.assemblyCodeTag true*)]
 
         val closure = makeConstantClosure()
         val () = generateCode{instrs=instructions, name=functionName, parameters=debugSwitches, resultClosure=closure}
         val stage2Code = closureAsAddress closure
-
+        
         (* The function returned to the caller.  Given the ML function f it generates
            the first stage code for f and returns its closure. *)
         fun resultFunction f =
         let
             (* Generate a small function to load the address of f into a register and then jump to stage2.
                The idea is that it should be possible to generate this eventually in a single RTS call.
                That could be done by using a version of this as a model. *)
-            val codeAddress = Address.loadWord(Address.toAddress stage2Code, 0w0)
             val instructions =
-                [
-                    loadAddressConstant(X9, Address.toMachineWord f),
-                    loadAddressConstant(X16, codeAddress),
-                    branchRegister X16
-                ]
+                if is32in64
+                then
+                    (* Get the global heap base into X10. *)
+                    loadGlobalHeapBaseInCallback X10 @
+                    [
+                        loadAddressConstant(X9, Address.toMachineWord f),
+                        (* Have to load the actual address at run-time. *)
+                        loadAddressConstant(X16, stage2Code),
+                        addShiftedReg{regM=X16, regN=X10, regD=X16, shift=ShiftLSL 0w2},
+                        loadRegScaled{regT=X16, regN=X16, unitOffset=0},
+                        branchRegister X16
+                    ]
+                else
+                let
+                    (* We can extract the actual code address in the native address version. *)
+                    val codeAddress = Address.loadWord(Address.toAddress stage2Code, 0w0)
+                in
+                    [
+                        loadAddressConstant(X9, Address.toMachineWord f),
+                        loadAddressConstant(X16, codeAddress),
+                        branchRegister X16
+                    ]
+                end
             val functionName = "foreignCallBack(1)"
             val debugSwitches =
                 [(*Universal.tagInject Pretty.compilerOutputTag (Pretty.prettyPrint(print, 70)),
                    Universal.tagInject Debug.assemblyCodeTag true*)]
             val closure = makeConstantClosure()
             val () = generateCode{instrs=instructions, name=functionName, parameters=debugSwitches, resultClosure=closure}
             val res = closureAsAddress closure
             (*val _ = print("Address is " ^ (LargeWord.toString(RunCall.unsafeCast res)) ^ "\n")*)
         in
             res
         end
     in
         Address.toMachineWord resultFunction
     end
 
 end;