diff --git a/libpolyml/arm64.cpp b/libpolyml/arm64.cpp
index c6701589..deebf06e 100644
--- a/libpolyml/arm64.cpp
+++ b/libpolyml/arm64.cpp
@@ -1,1099 +1,1102 @@
/*
    Machine-dependent code for ARM64

-   Copyright David C.J. Matthews 2020-21.
+   Copyright David C.J. Matthews 2020-22.

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License version 2.1 as published by the Free Software Foundation.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/

// Currently this is just copied from the interpreted version.

#ifdef HAVE_CONFIG_H
#include "config.h"
#elif defined(_WIN32)
#include "winconfig.h"
#else
#error "No configuration file"
#endif

#ifdef HAVE_ASSERT_H
#include <assert.h>
#define ASSERT(x) assert(x)
#else
#define ASSERT(x) 0
#endif

#ifdef HAVE_STRING_H
#include <string.h>
#endif

#include "globals.h"
#include "machine_dep.h"
#include "sys.h"
#include "profiling.h"
#include "arb.h"
#include "processes.h"
#include "run_time.h"
#include "gc.h"
#include "diagnostics.h"
#include "polystring.h"
#include "save_vec.h"
#include "memmgr.h"
#include "scanaddrs.h"
#include "rtsentry.h"
#include "bytecode.h"
#include "int_opcodes.h"

/*
* ARM64 register use:
* X0       First argument and return value
* X1-X7    Second-eighth argument
* X8       Indirect result (C), ML closure pointer on entry
* X9-X15   Volatile scratch registers
* X16-17   Intra-procedure-call (C). Only used for special cases in ML.
* X18      Platform register. Not used in ML.
* X19-X23  Non-volatile (C). Scratch registers (ML).
* X24      Non-volatile (C). Scratch register (ML). Heap base in 32-in-64.
* X25      ML Heap limit pointer
* X26      ML assembly interface pointer. Non-volatile (C).
* X27      ML Heap allocation pointer. Non-volatile (C).
* X28      ML Stack pointer. Non-volatile (C).
* X29      Frame pointer (C). Not used in ML
* X30      Link register.
* X31      Stack pointer (C). Only used when calling C. Also zero register.
*
* Floating point registers:
* V0       First argument and return value
* V1-V7    Second-eighth argument
* V8-V15   Non volatile. Not currently used in ML.
* V16-V31  Volatile. Not currently used in ML.
*
* The ML calling conventions generally follow the C ABI except that
* all registers are volatile and X28 is used for the stack.
*/

/* the amount of ML stack space to reserve for registers,
   C exception handling etc. The compiler requires us to
   reserve 2 stack-frames worth (2 * 20 words). We actually
   reserve slightly more than this.
*/

// Arm64 instructions are all 32-bit values.
typedef uint32_t arm64Instr, *arm64CodePointer;

// Each function checks for space on the stack at the start. To reduce the
// code size it assumes there are at least 10 words on the stack and only
// checks the exact space if it requires more than that. For safety we
// always make sure there are 50 words spare.
#define OVERFLOW_STACK_SIZE 50

// X26 always points at this area when executing ML code.
// The offsets are built into the assembly code and some are built into
// the code generator so this must not be changed without checking these.
typedef struct _AssemblyArgs {
public:
    byte*       enterInterpreter;  // These are filled in with the functions.
byte* heapOverFlowCall; byte* stackOverFlowCall; byte* stackOverFlowCallEx; byte* trapHandlerEntry; stackItem* handlerRegister; // Current exception handler stackItem* stackLimit; // Lower limit of stack stackItem exceptionPacket; // Set if there is an exception stackItem threadId; // My thread id. Saves having to call into RTS for it. (stackItem so it's 64-bits) stackItem registers[25]; // Save/load area for registers X0-X24 inclusive double fpRegisters[8]; // Save/load area for floating point regs D0-D7 PolyWord* localMbottom; // Base of memory + 1 word PolyWord* localMpointer; // X27 Allocation ptr + 1 word stackItem* stackPtr; // X28 Current stack pointer arm64CodePointer linkRegister; // X30 - Link register (return address) arm64CodePointer entryPoint; // PC address to return to byte returnReason; // Reason for returning from ML - Set by assembly code. } AssemblyArgs; class Arm64TaskData: public TaskData, ByteCodeInterpreter { public: Arm64TaskData(); ~Arm64TaskData() {} unsigned allocReg; // The register to take the allocated space. POLYUNSIGNED allocWords; // The words to allocate. AssemblyArgs assemblyInterface; uint32_t saveRegisterMask; // Registers that need to be updated by a GC. virtual void GarbageCollect(ScanAddress *process); void ScanStackAddress(ScanAddress *process, stackItem& val, StackSpace *stack); virtual void EnterPolyCode(); // Start running ML virtual void SetException(poly_exn *exc) { assemblyInterface.exceptionPacket = (PolyWord)exc; } virtual void InterruptCode(); // AddTimeProfileCount is used in time profiling. virtual bool AddTimeProfileCount(SIGNALCONTEXT *context); virtual void InitStackFrame(TaskData *newTask, Handle proc); // Atomically release a mutex using hardware interlock. virtual bool AtomicallyReleaseMutex(PolyObject* mutexp); // Return the minimum space occupied by the stack. Used when setting a limit. // N.B. This is PolyWords not native words. virtual uintptr_t currentStackSpace(void) const { return (this->stack->top - (PolyWord*)assemblyInterface.stackPtr) + OVERFLOW_STACK_SIZE; } virtual void addProfileCount(POLYUNSIGNED words) { addSynchronousCount((POLYCODEPTR)assemblyInterface.entryPoint, words); } // PreRTSCall: After calling from ML to the RTS we need to save the current heap pointer virtual void PreRTSCall(void) { TaskData::PreRTSCall(); SaveMemRegisters(); } // PostRTSCall: Before returning we need to restore the heap pointer. // If there has been a GC in the RTS call we need to create a new heap area. virtual void PostRTSCall(void) { SetMemRegisters(); TaskData::PostRTSCall(); } virtual void CopyStackFrame(StackObject *old_stack, uintptr_t old_length, StackObject *new_stack, uintptr_t new_length); void SetMemRegisters(); void SaveMemRegisters(); void HandleTrap(); // ByteCode overrides. The interpreter and native code states need to be in sync. // The interpreter is only used during the initial bootstrap. virtual void ClearExceptionPacket() { assemblyInterface.exceptionPacket = TAGGED(0); } virtual PolyWord GetExceptionPacket() { return assemblyInterface.exceptionPacket; } virtual stackItem* GetHandlerRegister() { return assemblyInterface.handlerRegister; } virtual void SetHandlerRegister(stackItem* hr) { assemblyInterface.handlerRegister = hr; } void Interpret(); void EndBootStrap() { mixedCode = true; } PLock interruptLock; virtual void HandleStackOverflow(uintptr_t space); }; class Arm64Dependent : public MachineDependent { public: Arm64Dependent() : mustInterpret(false) {} // Create a task data object. 
    virtual TaskData* CreateTaskData(void) { return new Arm64TaskData(); }

    virtual void ScanConstantsWithinCode(PolyObject* addr, PolyObject* oldAddr, POLYUNSIGNED length,
        PolyWord* newConstAddr, PolyWord* oldConstAddr, POLYUNSIGNED numConsts, ScanAddress* process);

    virtual void RelocateConstantsWithinCode(PolyObject* addr, ScanAddress* process);

    virtual Architectures MachineArchitecture(void);

    virtual void SetBootArchitecture(char arch, unsigned wordLength);

    // The ARM has separate instruction and data caches.
    virtual void FlushInstructionCache(void* p, POLYUNSIGNED bytes);

    // During the first bootstrap phase this is interpreted.
    bool mustInterpret;

#if defined(POLYML32IN64)
    virtual void UpdateGlobalHeapReference(PolyObject* addr);
#else
    // Address of the constant segment from the code segment. This is complicated because
    // some OSs require the code to be position-independent which means the code can only
    // contain relative offsets. This isn't a problem for 32-in-64 because the code is
    // copied before it is executed.

    // Set the address of the constant area. If this is within the code segment itself we use the
    // default, negative, byte offset. If the constant area has been split off we use a pair of
    // dummy ADRP/LDR instructions. They aren't ever executed but allow us to use relative addressing.
    virtual void SetAddressOfConstants(PolyObject* objAddr, PolyObject* writable, POLYUNSIGNED length, PolyWord* constAddr)
    {
        if (constAddr > (PolyWord*)objAddr && constAddr < (PolyWord*)objAddr + length)
        {
            int64_t offset = (byte*)constAddr - (byte*)objAddr - length * sizeof(PolyWord);
            writable->Set(length - 1, PolyWord::FromSigned(offset));
        }
        else
        {
            PolyWord* last_word = objAddr->Offset(length - 1); // Last word in the code
            MemSpace* space = gMem.SpaceForAddress(last_word);
            uint32_t* pt = (uint32_t*)space->writeAble(last_word);
            pt[0] = toARMInstr(0x90000000); // Insert dummy ADRP and LDR
            pt[1] = toARMInstr(0xf9400000);
            ScanAddress::SetConstantValue((byte*)last_word, (PolyObject*)constAddr, PROCESS_RELOC_ARM64ADRPLDR64);
        }
    }

    virtual void GetConstSegmentForCode(PolyObject* obj, POLYUNSIGNED obj_length, PolyWord*& cp, POLYUNSIGNED& count) const
    {
        PolyWord* last_word = obj->Offset(obj_length - 1); // Last word in the code
        if ((last_word[0].AsUnsigned() >> 56) == 0xff)
        {
            // If the high-order byte is 0xff it's a (-ve) byte offset.
            POLYSIGNED offset = last_word->AsSigned();
            cp = last_word + 1 + offset / sizeof(PolyWord);
            count = cp[-1].AsUnsigned();
        }
        else
        {
            PolyObject* addr = ScanAddress::GetConstantValue((byte*)last_word, PROCESS_RELOC_ARM64ADRPLDR64, 0);
            cp = (PolyWord*)addr;
            count = addr->Length();
        }
    }
#endif
};

static Arm64Dependent arm64Dependent;

MachineDependent* machineDependent = &arm64Dependent;

Architectures Arm64Dependent::MachineArchitecture(void)
{
    // During the first phase of the bootstrap we
    // compile the interpreted version.
    if (mustInterpret) return MA_Interpreted;
#if defined(POLYML32IN64)
    return MA_Arm64_32;
#else
    return MA_Arm64;
#endif
}

// Values for the returnReason byte. These values are put into returnReason by the assembly code
// depending on which of the "trap" functions has been called.
enum RETURN_REASON {
    RETURN_HEAP_OVERFLOW = 1,       // Heap space check has failed.
    RETURN_STACK_OVERFLOW = 2,      // Stack space check has failed (<= 10 words).
    RETURN_STACK_OVERFLOWEX = 3,    // Stack space check has failed. Adjusted SP is in X9.
    RETURN_ENTER_INTERPRETER = 4    // Native code has entered interpreted code.
};

extern "C" {
    // These are declared in the assembly code segment.
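    // Roughly, each of the RETURN_* entries below is a small assembly stub
    // that records why ML code is trapping back into the RTS and then calls
    // Arm64TrapHandler through trapHandlerEntry. A sketch of the protocol in
    // C++ terms (the helper names here are illustrative, not the actual
    // assembly source):
    //
    //     assemblyInterface.returnReason = RETURN_HEAP_OVERFLOW; // why we trapped
    //     saveMLStateIntoAssemblyInterface();  // X0-X24, D0-D7, SP, link reg
    //     Arm64TrapHandler(assemblyInterface.threadId);
    //     restoreMLStateAndReenterML();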
    void Arm64AsmEnterCompiledCode(void*);
    int Arm64AsmCallExtraRETURN_ENTER_INTERPRETER(void);
    int Arm64AsmCallExtraRETURN_HEAP_OVERFLOW(void);
    int Arm64AsmCallExtraRETURN_STACK_OVERFLOW(void);
    int Arm64AsmCallExtraRETURN_STACK_OVERFLOWEX(void);

    // This is declared here and called from the assembly code.
    // It avoids having a call to an external in the assembly code
    // which sometimes gives problems with position-independent code.
    void Arm64TrapHandler(stackItem threadId);
};

Arm64TaskData::Arm64TaskData() : ByteCodeInterpreter(&assemblyInterface.stackPtr, &assemblyInterface.stackLimit),
    allocReg(0), allocWords(0), saveRegisterMask(0)
{
    assemblyInterface.enterInterpreter = (byte*)Arm64AsmCallExtraRETURN_ENTER_INTERPRETER;
    assemblyInterface.heapOverFlowCall = (byte*)Arm64AsmCallExtraRETURN_HEAP_OVERFLOW;
    assemblyInterface.stackOverFlowCall = (byte*)Arm64AsmCallExtraRETURN_STACK_OVERFLOW;
    assemblyInterface.stackOverFlowCallEx = (byte*)Arm64AsmCallExtraRETURN_STACK_OVERFLOWEX;
    assemblyInterface.trapHandlerEntry = (byte*)Arm64TrapHandler;
    interpreterPc = 0;
    mixedCode = !arm64Dependent.mustInterpret;
}

void Arm64Dependent::SetBootArchitecture(char arch, unsigned wordLength)
{
    if (arch == 'I')
        mustInterpret = true;
    else if (arch != 'A')
        Crash("Boot file has unexpected architecture code: %c", arch);
}

// The ARM has separate instruction and data caches so we must flush
// the cache when creating or modifying code.
void Arm64Dependent::FlushInstructionCache(void* p, POLYUNSIGNED bytes)
{
#ifdef _WIN32
    ::FlushInstructionCache(GetCurrentProcess(), p, bytes);
#elif defined (__GNUC__)
    __builtin___clear_cache((char*)p, (char*)p + bytes);
#elif (defined (__clang__) && defined (__APPLE__))
    sys_icache_invalidate(p, bytes);
#else
#error "No code to flush the instruction cache."
#endif
}

void Arm64TaskData::GarbageCollect(ScanAddress *process)
{
    TaskData::GarbageCollect(process);
    ByteCodeInterpreter::GarbageCollect(process);
    assemblyInterface.threadId = stackItem(threadObject); // threadObject updated by TaskData::GarbageCollect

    if (assemblyInterface.exceptionPacket.w().IsDataPtr())
    {
        PolyObject* obj = assemblyInterface.exceptionPacket.w().AsObjPtr();
        obj = process->ScanObjectAddress(obj);
        assemblyInterface.exceptionPacket = (PolyWord)obj;
    }

    if (stack != 0)
    {
        stackItem*stackPtr = assemblyInterface.stackPtr;
        // Now the values on the stack.
        for (stackItem* q = stackPtr; q < (stackItem*)stack->top; q++)
            ScanStackAddress(process, *q, stack);
    }

    // Register mask. There is a bit for each of the registers up to X24.
    for (int i = 0; i < 25; i++)
    {
        if (saveRegisterMask & (1 << i))
            ScanStackAddress(process, assemblyInterface.registers[i], stack);
    }

    // Make sure the code is still reachable. Code addresses aren't updated.
    {
        stackItem code;
        code.codeAddr = (POLYCODEPTR)assemblyInterface.linkRegister;
        ScanStackAddress(process, code, stack);
        code.codeAddr = (POLYCODEPTR)assemblyInterface.entryPoint;
        ScanStackAddress(process, code, stack);
    }
}

// Process a value within the stack.
void Arm64TaskData::ScanStackAddress(ScanAddress *process, stackItem& stackItem, StackSpace *stack)
{
    // Code addresses on the ARM are always even, unlike the X86, so if it's tagged
    // it can't be an address.
    if (stackItem.w().IsTagged()) return;

#ifdef POLYML32IN64
    // In 32-in-64 we can have either absolute addresses or object indexes.
// Absolute addresses always have the top 32-bits non-zero if (stackItem.argValue < ((uintptr_t)1 << 32)) { PolyWord val = process->ScanObjectAddress(stackItem.w().AsObjPtr()); stackItem = val; } else { - // Could be a code address or a stack address. + // Could be a code address, a stack address or a heap address that has been + // converted from an object pointer. Currently local addresses only occur + // in registers, not on the stack. MemSpace* space = gMem.SpaceForAddress(stackItem.codeAddr - 1); - if (space == 0 || space->spaceType != ST_CODE) return; - PolyObject* obj = gMem.FindCodeObject(stackItem.codeAddr); - ASSERT(obj != 0); - // Process the address of the start. Don't update anything. - process->ScanObjectAddress(obj); + if (space->spaceType == ST_CODE) + { + PolyObject* obj = gMem.FindCodeObject(stackItem.codeAddr); + ASSERT(obj != 0); + // Process the address of the start. Don't update anything. + process->ScanObjectAddress(obj); + } + else if (space->spaceType == ST_LOCAL) + stackItem.absAddress = process->ScanObjectAddress(stackItem.absAddress); } #else MemSpace* space = gMem.SpaceForAddress(stackItem.codeAddr - 1); if (space->spaceType == ST_CODE) { PolyObject* obj = gMem.FindCodeObject(stackItem.codeAddr); // Process the address of the start. Don't update anything. process->ScanObjectAddress(obj); } else if (space->spaceType == ST_LOCAL) // Local values must be word addresses. { PolyWord val = process->ScanObjectAddress(stackItem.w().AsObjPtr()); stackItem = val; } #endif } // Copy a stack void Arm64TaskData::CopyStackFrame(StackObject *old_stack, uintptr_t old_length, StackObject *new_stack, uintptr_t new_length) { #ifdef POLYML32IN64 old_length = old_length / 2; new_length = new_length / 2; #endif /* Moves a stack, updating all references within the stack */ stackItem*old_base = (stackItem*)old_stack; stackItem*new_base = (stackItem*)new_stack; stackItem*old_top = old_base + old_length; /* Calculate the offset of the new stack from the old. If the frame is being extended objects in the new frame will be further up the stack than in the old one. */ uintptr_t offset = new_base - old_base + new_length - old_length; stackItem *oldSp = assemblyInterface.stackPtr; assemblyInterface.stackPtr = oldSp + offset; assemblyInterface.handlerRegister = assemblyInterface.handlerRegister + offset; /* Skip the unused part of the stack. 
*/ uintptr_t i = oldSp - old_base; ASSERT(i <= old_length); i = old_length - i; stackItem *old = oldSp; stackItem *newp = assemblyInterface.stackPtr; while (i--) { stackItem old_word = *old++; if (old_word.w().IsDataPtr() && old_word.stackAddr >= old_base && old_word.stackAddr <= old_top) old_word.stackAddr = old_word.stackAddr + offset; else if (old_word.w().IsDataPtr() && IsHeapAddress(old_word.stackAddr)) { stackItem* addr = (stackItem*)old_word.w().AsStackAddr(); if (addr >= old_base && addr <= old_top) { addr += offset; old_word = PolyWord::FromStackAddr((PolyWord*)addr); } } *newp++ = old_word; } ASSERT(old == ((stackItem*)old_stack) + old_length); ASSERT(newp == ((stackItem*)new_stack) + new_length); // And change any registers that pointed into the old stack for (int j = 0; j < 25; j++) { if (saveRegisterMask & (1 << j)) { stackItem* regAddr = &(assemblyInterface.registers[j]); stackItem old_word = *regAddr; if (old_word.w().IsDataPtr() && old_word.stackAddr >= old_base && old_word.stackAddr <= old_top) old_word.stackAddr = old_word.stackAddr + offset; else if (old_word.w().IsDataPtr() && IsHeapAddress(old_word.stackAddr)) { stackItem* addr = (stackItem*)old_word.w().AsStackAddr(); if (addr >= old_base && addr <= old_top) { addr += offset; old_word = PolyWord::FromStackAddr((PolyWord*)addr); } } *regAddr = old_word; } } } void Arm64TaskData::EnterPolyCode() /* Called from "main" to enter the code. */ { assemblyInterface.stackLimit = (stackItem*)((PolyWord*)this->stack->stack() + OVERFLOW_STACK_SIZE); if (arm64Dependent.mustInterpret) { PolyWord closure = assemblyInterface.registers[8]; *(--assemblyInterface.stackPtr) = closure; /* Closure address */ interpreterPc = *(POLYCODEPTR*)closure.AsObjPtr(); Interpret(); ASSERT(0); // Should never return } SetMemRegisters(); // Jump into the ML code. This code sets up the registers and puts the // address of the assemblyInterface into X26 Arm64AsmEnterCompiledCode(&assemblyInterface); // This should never return ASSERT(0); } void Arm64TaskData::Interpret() { while (true) { switch (RunInterpreter(this)) { case ReturnCall: // After the call there will be an enter-int instruction so that when this // returns we will re-enter the interpreter. The number of arguments for // this call is after that. while ((uintptr_t)interpreterPc & 3) { ASSERT(interpreterPc[0] == INSTR_no_op); interpreterPc++; } ASSERT(interpreterPc[0] == 0xe9); numTailArguments = interpreterPc[12]; case ReturnTailCall: { ClearExceptionPacket(); // Pop the closure. PolyWord closureWord = *assemblyInterface.stackPtr++; PolyObject* closure = closureWord.AsObjPtr(); arm64CodePointer cp = *(arm64CodePointer*)closure; if (fromARMInstr(cp[0]) == 0xAA1E03E9 && fromARMInstr(cp[1]) == 0xF9400350 && fromARMInstr(cp[2]) == 0xD63F0200) { // If the code we're going to is interpreted push back the closure and // continue. interpreterPc = (POLYCODEPTR)cp; assemblyInterface.stackPtr--; HandleStackOverflow(128); // Make sure we have space since we're bypassing the check. continue; } assemblyInterface.registers[8] = closureWord; // Put closure in the closure reg. // Pop the return address. We may need to align this to a word boundary. POLYCODEPTR originalReturn = (POLYCODEPTR)((assemblyInterface.stackPtr++)->codeAddr); while ((uintptr_t)originalReturn & 3) { ASSERT(originalReturn[0] == INSTR_no_op); originalReturn++; } // Get the arguments into the correct registers. // Load the register arguments. The first 8 arguments go into X0-X7. 
// These will have been the first arguments to be pushed so will be // furthest away on the stack. // Note: we don't currently pass any arguments in the FP regs. for (unsigned i = 0; i < numTailArguments && i < 8; i++) assemblyInterface.registers[i] = assemblyInterface.stackPtr[numTailArguments - i - 1]; // If there are any more arguments these need to be shifted down the stack. while (numTailArguments > 8) { numTailArguments--; assemblyInterface.stackPtr[numTailArguments] = assemblyInterface.stackPtr[numTailArguments - 8]; } // Remove the register arguments assemblyInterface.stackPtr += numTailArguments > 8 ? 8 : numTailArguments; assemblyInterface.linkRegister = (arm64CodePointer)originalReturn; // Set the return address to caller assemblyInterface.entryPoint = *(arm64CodePointer*)closure; // Entry point to callee interpreterPc = 0; // No longer in the interpreter (See SaveMemRegs) return; } case ReturnReturn: { ClearExceptionPacket(); // Returning from an interpreted function. Normally we'll be returning to // interpreted code. if ((uintptr_t)interpreterPc & 3) // ARM64 addresses will always be 4-byte aligned. continue; arm64CodePointer cp = (arm64CodePointer)interpreterPc; if (fromARMInstr(cp[0]) == 0xAA1E03E9 && fromARMInstr(cp[1]) == 0xF9400350 && fromARMInstr(cp[2]) == 0xD63F0200) continue; // Pop the value we're returning. Set the entry point to the code we're returning to. assemblyInterface.registers[0] = *assemblyInterface.stackPtr++; assemblyInterface.entryPoint = cp; interpreterPc = 0; // No longer in the interpreter (See SaveMemRegs) return; } } } } // Called from the assembly code as a result of a trap i.e. a request for // a GC or to extend the stack. // N.B. Argument must be stackItem not PolyWord so that it's compatible with // big-endian 32-in-64. void Arm64TrapHandler(stackItem threadId) { Arm64TaskData* taskData = (Arm64TaskData*)TaskData::FindTaskForId(threadId); taskData->HandleTrap(); } void Arm64TaskData::HandleTrap() { SaveMemRegisters(); // Update globals from the memory registers. switch (this->assemblyInterface.returnReason) { case RETURN_HEAP_OVERFLOW: { // The heap has overflowed. // The register mask is the word after the return. saveRegisterMask = fromARMInstr(*assemblyInterface.entryPoint++); // The generated code first subtracts the space required from x27 and puts the // result into a separate register. It then compares this with x25 and comes here if // it is not above that. Either way it is going to execute an instruction to put // this value back into x27. // Look at that instruction to find out the register. arm64Instr moveInstr = fromARMInstr(*assemblyInterface.entryPoint); // We may have an instruction to pop X30 first. if (moveInstr == 0xF840879E) moveInstr = fromARMInstr(assemblyInterface.entryPoint[1]); ASSERT((moveInstr & 0xffe0ffff) == 0xaa0003fb); // mov x27,xN allocReg = (moveInstr >> 16) & 0x1f; - allocWords = (allocPointer - (PolyWord*)assemblyInterface.registers[allocReg].stackAddr) + 1; + allocWords = (POLYUNSIGNED)((allocPointer - (PolyWord*)assemblyInterface.registers[allocReg].stackAddr) + 1); assemblyInterface.registers[allocReg] = TAGGED(0); // Clear this - it's not a valid address. if (profileMode == kProfileStoreAllocation) addProfileCount(allocWords); // The actual allocation is done in SetMemRegisters. break; } case RETURN_STACK_OVERFLOW: case RETURN_STACK_OVERFLOWEX: { // The register mask is the word after the return. 
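            // The mask is a data word that the code-generator plants immediately
            // after the call to the trap function: bit N is set if XN (N in 0..24)
            // holds a live value that the GC may need to update. For example
            // (values for illustration only), a function with live values in
            // X0, X8 and X19 would plant the word (1 << 0) | (1 << 8) | (1 << 19),
            // i.e. 0x80101.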
saveRegisterMask = fromARMInstr(*assemblyInterface.entryPoint++); uintptr_t min_size = 0; // Size in PolyWords if (assemblyInterface.returnReason == RETURN_STACK_OVERFLOW) { min_size = (this->stack->top - (PolyWord*)assemblyInterface.stackPtr) + OVERFLOW_STACK_SIZE * sizeof(uintptr_t) / sizeof(PolyWord); } else { // Stack limit overflow. If the required stack space is larger than // the fixed overflow size the code will calculate the limit in X9. stackItem* stackP = assemblyInterface.registers[9].stackAddr; min_size = (this->stack->top - (PolyWord*)stackP) + OVERFLOW_STACK_SIZE * sizeof(uintptr_t) / sizeof(PolyWord); } HandleStackOverflow(min_size); break; } case RETURN_ENTER_INTERPRETER: { interpreterPc = (POLYCODEPTR)assemblyInterface.linkRegister; byte reasonCode = *interpreterPc++; // Sort out arguments. assemblyInterface.exceptionPacket = TAGGED(0); if (reasonCode == 0xff) { // Exception handler. assemblyInterface.exceptionPacket = assemblyInterface.registers[0]; // Get the exception packet // We need to leave the current handler in place. When we enter the interpreter it will // check the exception packet and if it is non-null will raise it. } else if (reasonCode >= 128) { // Start of function. unsigned numArgs = reasonCode - 128; // We need the stack to contain: // The closure, the return address, the arguments. // The stack will currently contain the stack arguments. // Add space for the register arguments if (numArgs > 8) assemblyInterface.stackPtr -= 8; else assemblyInterface.stackPtr -= numArgs; // Move up any stack arguments. for (unsigned n = 8; n < numArgs; n++) { assemblyInterface.stackPtr[n - 8] = assemblyInterface.stackPtr[n]; } // Store the register arguments for (unsigned n = 0; n < numArgs && n < 8; n++) assemblyInterface.stackPtr[numArgs - n - 1] = assemblyInterface.registers[n]; // Finally push the return address and closure pointer *(--assemblyInterface.stackPtr) = assemblyInterface.registers[9]; // Return address - value of X30 before enter-int *(--assemblyInterface.stackPtr) = assemblyInterface.registers[8]; // Closure } else { // Return from call. Push X0 *(--assemblyInterface.stackPtr) = assemblyInterface.registers[0]; } Interpret(); break; } default: Crash("Unknown return reason code %u", this->assemblyInterface.returnReason); } SetMemRegisters(); } void Arm64TaskData::HandleStackOverflow(uintptr_t space) { uintptr_t min_size = (this->stack->top - (PolyWord*)assemblyInterface.stackPtr) + OVERFLOW_STACK_SIZE + space; try { // The stack check has failed. This may either be because we really have // overflowed the stack or because the stack limit value has been adjusted // to result in a call here. CheckAndGrowStack(this, min_size); } catch (IOException&) { // We may get an exception while handling this if we run out of store } { PLocker l(&interruptLock); // Set the stack limit. This clears any interrupt and also sets the // correct value if we've grown the stack. assemblyInterface.stackLimit = (stackItem*)stack->bottom + OVERFLOW_STACK_SIZE; } try { processes->ProcessAsynchRequests(this); // Release and re-acquire use of the ML memory to allow another thread // to GC. processes->ThreadReleaseMLMemory(this); processes->ThreadUseMLMemory(this); } catch (IOException&) { } catch (KillException&) { processes->ThreadExit(this); } } void Arm64TaskData::InitStackFrame(TaskData* parentTask, Handle proc) /* Initialise stack frame. 
*/
{
    StackSpace* space = this->stack;
    StackObject* stack = (StackObject*)space->stack();
    uintptr_t stack_size = space->spaceSize() * sizeof(PolyWord) / sizeof(stackItem);
    assemblyInterface.stackPtr = (stackItem*)stack + stack_size;
    assemblyInterface.stackLimit = (stackItem*)space->bottom + OVERFLOW_STACK_SIZE;
    assemblyInterface.handlerRegister = assemblyInterface.stackPtr;

    // Store the argument and the closure.
    assemblyInterface.registers[8] = proc->Word(); // Closure
    assemblyInterface.registers[0] = TAGGED(0); // Argument
    assemblyInterface.linkRegister = (arm64CodePointer)1; // We never return. Use a tagged value because it may be pushed
    assemblyInterface.entryPoint = (arm64CodePointer)1;
    // Have to set the register mask in case we get a GC before the thread starts.
    saveRegisterMask = (1 << 8) | 1; // X8 and X0

#ifdef POLYML32IN64
    // In 32-in-64 X24 always contains the heap base address.
    assemblyInterface.registers[24].stackAddr = (stackItem*)globalHeapBase;
#endif
}

// This is called from a different thread so we have to be careful.
void Arm64TaskData::InterruptCode()
{
    PLocker l(&interruptLock);
    // Set the stack limit pointer to the top of the stack to cause
    // a trap when we next check for stack overflow.
    // We use a lock here to ensure that we always use the current value of the
    // stack. The thread we're interrupting could be growing the stack at this point.
    if (stack != 0)
        assemblyInterface.stackLimit = (stackItem*)(stack->top - 1);
}

// Called before entering ML code from the run-time system
void Arm64TaskData::SetMemRegisters()
{
    // Copy the current store limits into variables before we go into the assembly code.

    // If we haven't yet set the allocation area or we don't have enough we need
    // to create one (or a new one).
    if (allocPointer <= allocLimit + allocWords)
    {
        if (allocPointer < allocLimit)
            Crash("Bad length in heap overflow trap");

        // Find some space to allocate in. Updates taskData->allocPointer and
        // returns a pointer to the newly allocated space (if allocWords != 0)
        PolyWord* space = processes->FindAllocationSpace(this, allocWords, true);
        if (space == 0)
        {
            // We will now raise an exception instead of returning.
            // Set allocWords to zero so we don't set the allocation register
            // since that could be holding the exception packet.
            allocWords = 0;
        }
        // Undo the allocation just now.
        allocPointer += allocWords;
    }
    if (this->allocWords != 0)
    {
        // If we have had a heap trap we actually do the allocation here.
        // We will have already garbage collected and recovered sufficient space.
        // This also happens if we have just trapped because of store profiling.
        allocPointer -= allocWords; // Now allocate
        // Set the allocation register to this area. N.B. This is an absolute address.
        assemblyInterface.registers[allocReg].codeAddr = (POLYCODEPTR)(allocPointer + 1); /* remember: it's off-by-one */
        allocWords = 0;
    }
    // If we have run out of store, either just above or while allocating in the RTS,
    // allocPointer and allocLimit will have been set to zero as part of the GC. We will
    // now be raising an exception which may free some store but we need to come back here
    // before we allocate anything. The compiled code uses unsigned arithmetic to check for
    // heap overflow but only after subtracting the space required. We need to make sure
    // that the values are still non-negative after subtracting any object size.
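    // A concrete example (illustrative numbers): with allocPointer == 0 a
    // request for three words computes 0 - 24, which wraps to a huge unsigned
    // value that appears to be well above allocLimit, so the heap check would
    // wrongly succeed. With both values set to MAX_OBJECT_SIZE the subtraction
    // stays non-negative and never exceeds the limit, so the check fails and
    // we trap back here instead of scribbling on unallocated store.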
if (allocPointer == 0) allocPointer += MAX_OBJECT_SIZE; if (allocLimit == 0) allocLimit += MAX_OBJECT_SIZE; assemblyInterface.localMbottom = allocLimit + 1; assemblyInterface.localMpointer = allocPointer + 1; // If we are profiling store allocation we set mem_hl so that a trap // will be generated. if (profileMode == kProfileStoreAllocation) assemblyInterface.localMbottom = assemblyInterface.localMpointer; assemblyInterface.threadId = stackItem(threadObject); } // This is called whenever we have returned from ML to C. void Arm64TaskData::SaveMemRegisters() { if (interpreterPc == 0) { // Not if we're already in the interpreter // The normal return is to the link register address. assemblyInterface.entryPoint = assemblyInterface.linkRegister; allocPointer = assemblyInterface.localMpointer - 1; } allocWords = 0; assemblyInterface.exceptionPacket = TAGGED(0); saveRegisterMask = 0; } // Process addresses in the code // Because we don't have constants actually in the code we only have to process this in two // cases. If we are exporting the code we first copy it to a new location. We have to update // the ADRP+LDR/ADD pairs at that point. // When we construct the relocations we need to identify the points where the relocations apply in // the code. This applies both to exporting to object code and to saved states. void Arm64Dependent::ScanConstantsWithinCode(PolyObject* addr, PolyObject* oldAddr, POLYUNSIGNED length, PolyWord* newConstAddr, PolyWord* oldConstAddr, POLYUNSIGNED numConsts, ScanAddress* process) { arm64CodePointer pt = (arm64CodePointer)addr; if (addr == oldAddr && newConstAddr == oldConstAddr) return; // If it begins with the enter-int sequence it's interpreted code. if (fromARMInstr(pt[0]) == 0xAA1E03E9 && fromARMInstr(pt[1]) == 0xF9400350 && fromARMInstr(pt[2]) == 0xD63F0200) return; while (*pt != 0) // The code ends with a UDF instruction (0) { arm64Instr instr0 = fromARMInstr(pt[0]); if ((instr0 & 0x9f000000) == 0x90000000) // ADRP instruction { // Look at the instruction at the original location, before it was copied, to // find out the address it referred to. byte* oldInstrAddress = (byte*)pt - (byte*)addr + (byte*)oldAddr; arm64Instr instr1 = fromARMInstr(pt[1]); ScanRelocationKind scanKind; if ((instr1 & 0xffc00000) == 0xf9400000) scanKind = PROCESS_RELOC_ARM64ADRPLDR64; // LDR of 64-bit quantity else if ((instr1 & 0xffc00000) == 0xb9400000) scanKind = PROCESS_RELOC_ARM64ADRPLDR32; // LDR of 32-bit quantity else if ((instr1 & 0xff800000) == 0x91000000) scanKind = PROCESS_RELOC_ARM64ADRPADD; // ADD else ASSERT(0); // Invalid instruction byte* constAddress = (byte*)ScanAddress::GetConstantValue(oldInstrAddress, scanKind, 0); // This could be a reference to the code itself or the non-constant area. // If it's in the code we relocate it to the new code; if it's in the constant // area to the new constant area. byte* newAddress; if (constAddress > oldInstrAddress && constAddress < ((byte*)oldConstAddr)) newAddress = (byte*)addr + (constAddress - (byte*)oldAddr); else newAddress = (byte*)newConstAddr + (constAddress - (byte*)oldConstAddr); ScanAddress::SetConstantValue((byte*)pt, (PolyObject*)newAddress, scanKind); } pt++; } } void Arm64Dependent::RelocateConstantsWithinCode(PolyObject* addr, ScanAddress* process) { arm64CodePointer pt = (arm64CodePointer)addr; // If it begins with the enter-int sequence it's interpreted code. 
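    // For reference, the three instructions of that sequence, tested for below
    // and elsewhere in this file, decode as (illustrative disassembly):
    //    0xAA1E03E9    mov  x9,x30      ; save the return address in X9
    //    0xF9400350    ldr  x16,[x26]   ; load enterInterpreter from the
    //                                   ; AssemblyArgs block pointed at by X26
    //    0xD63F0200    blr  x16         ; call into the interpreter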
    if (fromARMInstr(pt[0]) == 0xAA1E03E9 && fromARMInstr(pt[1]) == 0xF9400350 && fromARMInstr(pt[2]) == 0xD63F0200)
        return;
#ifndef POLYML32IN64
    POLYUNSIGNED length = addr->Length();
    // If we have replaced the offset with a dummy ADRP/LDR pair we have to add a relocation.
    PolyWord* end = addr->Offset(length - 1);
    if ((end[0].AsUnsigned() >> 56) != 0xff)
        process->RelocateOnly(addr, (byte*)end, PROCESS_RELOC_ARM64ADRPLDR64);
#endif
    while (*pt != 0) // The code ends with a UDF instruction (0)
    {
        arm64Instr instr0 = fromARMInstr(pt[0]);
        if ((instr0 & 0x9f000000) == 0x90000000) // ADRP instruction
        {
            arm64Instr instr1 = fromARMInstr(pt[1]);
            ScanRelocationKind scanKind;
            if ((instr1 & 0xffc00000) == 0xf9400000)
                scanKind = PROCESS_RELOC_ARM64ADRPLDR64; // LDR of 64-bit quantity
            else if ((instr1 & 0xffc00000) == 0xb9400000)
                scanKind = PROCESS_RELOC_ARM64ADRPLDR32; // LDR of 32-bit quantity
            else if ((instr1 & 0xff800000) == 0x91000000)
                scanKind = PROCESS_RELOC_ARM64ADRPADD; // ADD
            else ASSERT(0); // Invalid instruction
            process->RelocateOnly(addr, (byte*)pt, scanKind);
        }
        pt++;
    }
}

// This is a special hack for FFI callbacks in 32-in-64. This is called
// to update the reference to the global heap base held in the callback code.
#ifdef POLYML32IN64
void Arm64Dependent::UpdateGlobalHeapReference(PolyObject* addr)
{
    arm64CodePointer pt = (arm64CodePointer)addr;
    if (fromARMInstr(pt[0]) == 0xD503201F && (fromARMInstr(pt[1]) & 0xff000000) == 0x58000000)
    {
        // nop (special marker) followed by LDR Xn,pc-relative
        uint32_t pcOffset = (fromARMInstr(pt[1]) >> 5) & 0x3ffff; // This is a number of 32-bit words
        PolyWord* gHeapAddr = ((PolyWord*)addr) + pcOffset + 1; // PolyWords are 32-bits
        if (((PolyWord**)gHeapAddr)[0] != globalHeapBase)
            ((PolyWord**)gMem.SpaceForAddress(gHeapAddr)->writeAble(gHeapAddr))[0] = globalHeapBase;
    }
}
#endif

// As far as possible we want locking and unlocking an ML mutex to be fast so
// we try to implement the code in the assembly code using appropriate
// interlocked instructions. That does mean that if we need to lock and
// unlock an ML mutex in this code we have to use the same, machine-dependent,
// code to do it. These are defaults that are used where there is no
// machine-specific code.

#if defined(_MSC_VER)
// This saves having to define it in the MASM assembly code.
static uintptr_t Arm64AsmAtomicExchange(PolyObject* mutexp, uintptr_t value)
{
-# if (SIZEOF_POLYWORD == 8)
+    // Mutexes are always 64-bit values even on 32-in-64.
    return InterlockedExchange64((LONG64*)mutexp, value);
-# else
-    return InterlockedExchange((LONG*)mutexp, value);
-# endif
}

#else
extern "C" {
    // This is only defined in the GAS assembly code
    uintptr_t Arm64AsmAtomicExchange(PolyObject*, uintptr_t);
}
#endif

bool Arm64TaskData::AtomicallyReleaseMutex(PolyObject* mutexp)
{
    uintptr_t oldValue = Arm64AsmAtomicExchange(mutexp, 0);
    return oldValue == 1;
}

bool Arm64TaskData::AddTimeProfileCount(SIGNALCONTEXT *context)
{
    stackItem* sp = 0;
    POLYCODEPTR pc = 0;
    if (context != 0)
    {
#if defined(HAVE_WINDOWS_H)
        sp = (stackItem*)context->Sp;
        pc = (POLYCODEPTR)context->Pc;
#elif defined(HAVE_UCONTEXT_T)
#ifdef HAVE_MCONTEXT_T_REGS
        // Linux
        sp = (stackItem*)context->uc_mcontext.sp;
        pc = (POLYCODEPTR)context->uc_mcontext.pc;
#endif
#endif
    }
    if (pc != 0)
    {
        // See if the PC we've got is an ML code address.
        MemSpace* space = gMem.SpaceForAddress(pc);
        if (space != 0 && (space->spaceType == ST_CODE || space->spaceType == ST_PERMANENT))
        {
            incrementCountAsynch(pc);
            return true;
        }
    }
    // See if the sp value is in the current stack.
    if (sp >= (stackItem*)this->stack->bottom && sp < (stackItem*)this->stack->top)
    {
        // We may be in the assembly code. The top of the stack will be a return address.
        pc = sp[0].w().AsCodePtr();
        MemSpace* space = gMem.SpaceForAddress(pc);
        if (space != 0 && (space->spaceType == ST_CODE || space->spaceType == ST_PERMANENT))
        {
            incrementCountAsynch(pc);
            return true;
        }
    }
    // See if the value of regSP is a valid stack pointer.
    // This works if we happen to be in an RTS call using a "Full" call.
    // It doesn't work if we've used a "Fast" call because that doesn't save the SP.
    sp = assemblyInterface.stackPtr;
    if (sp >= (stackItem*)this->stack->bottom && sp < (stackItem*)this->stack->top)
    {
        // We may be in the run-time system.
        pc = sp[0].w().AsCodePtr();
        MemSpace* space = gMem.SpaceForAddress(pc);
        if (space != 0 && (space->spaceType == ST_CODE || space->spaceType == ST_PERMANENT))
        {
            incrementCountAsynch(pc);
            return true;
        }
    }
    return false;
}

extern "C" {
    POLYEXTERNALSYMBOL void* PolyArm64GetThreadData();
    POLYEXTERNALSYMBOL POLYUNSIGNED PolyInterpretedEnterIntMode();
    POLYEXTERNALSYMBOL POLYUNSIGNED PolyEndBootstrapMode(POLYUNSIGNED threadId, POLYUNSIGNED function);
}

// Return the address of assembly data for the current thread. This is normally in
// X26 except if we are in a callback.
void* PolyArm64GetThreadData()
{
    // We should get the task data for the thread that is running this code.
    // If this thread has been created by the foreign code we will have to
    // create a new one here.
    TaskData* taskData = processes->GetTaskDataForThread();
    if (taskData == 0)
    {
        try {
            taskData = processes->CreateNewTaskData();
        }
        catch (std::bad_alloc&) {
            ::Exit("Unable to create thread data - insufficient memory");
        }
        catch (MemoryException&) {
            ::Exit("Unable to create thread data - insufficient memory");
        }
    }
    return &((Arm64TaskData*)taskData)->assemblyInterface;
}

// Do we require EnterInt instructions and if so for which architecture?
// 0 => None; 1 => X86_32; 2 => X86_64; 3 => X86_32_in_64; 4 => ARM_64.
// ARM_64 in 32-in-64 is the same as ARM_64.
POLYUNSIGNED PolyInterpretedEnterIntMode()
{
    return TAGGED(4).AsUnsigned();
}

// End the first stage of bootstrap mode and run a new function.
// The first stage is always interpreted. Once that is complete every function will have
// at least an executable "enter-interpreter" stub so it can be called as machine code.
POLYUNSIGNED PolyEndBootstrapMode(POLYUNSIGNED threadId, POLYUNSIGNED function)
{
    TaskData* taskData = TaskData::FindTaskForId(threadId);
    ASSERT(taskData != 0);
    taskData->PreRTSCall();
    Handle pushedFunction = taskData->saveVec.push(function);
    arm64Dependent.mustInterpret = false;
    ((Arm64TaskData*)taskData)->EndBootStrap();
    taskData->InitStackFrame(taskData, pushedFunction);
    taskData->EnterPolyCode();
    // Should never return.
    ASSERT(0);
    return TAGGED(0).AsUnsigned();
}

// No machine-specific calls in the interpreter.
struct _entrypts machineSpecificEPT[] =
{
    { "PolyArm64GetThreadData",      (polyRTSFunction)&PolyArm64GetThreadData },
    { "PolyInterpretedEnterIntMode", (polyRTSFunction)&PolyInterpretedEnterIntMode },
    { "PolyEndBootstrapMode",        (polyRTSFunction)&PolyEndBootstrapMode },
    { NULL, NULL } // End of list.
};

diff --git a/libpolyml/globals.h b/libpolyml/globals.h
index 313d5038..c30c8ada 100644
--- a/libpolyml/globals.h
+++ b/libpolyml/globals.h
@@ -1,426 +1,427 @@
/*
    Title:  Globals for the system.
    Author: Dave Matthews, Cambridge University Computer Laboratory

-   Copyright David C. J. Matthews 2017-21
+   Copyright David C. J. Matthews 2017-22
    Copyright (c) 2000-7 Cambridge University Technical Services Limited

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License version 2.1 as published by the Free Software Foundation.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/

#ifndef _GLOBALS_H
#define _GLOBALS_H

/*
    Poly words, pointers and cells (objects).

    The garbage collector needs to be able to distinguish different uses of a
    memory word. We need to be able to find which words are pointers to other
    objects and which are simple integers. The simple distinction is between
    integers, which are tagged by having the bottom bit set, and Addresses
    which are word aligned (bottom 2 bits zero on a 32 bit machine, bottom 3
    bits on a 64 bit machine, bottom bit in 32-in-64).

    Addresses always point to the start of cells. The preceding word of a
    cell is the length word. This contains the length of the cell in words in
    the low-order 3 (7 in native 64-bits) bytes and a flag byte in the top
    byte. The flags give information about the type of the object. The length
    word is also used by the garbage collector and other object processors.
*/

#if HAVE_STDINT_H
#  include <stdint.h>
#endif

#if HAVE_INTTYPES_H
#  ifndef __STDC_FORMAT_MACROS
#  define __STDC_FORMAT_MACROS
#  endif
#  include <inttypes.h>
#elif (defined(_MSC_VER) && (_MSC_VER >= 1900))
// In VS 2015 and later we need to use <cinttypes>
#  include <cinttypes>
#endif

#ifdef HAVE_STDDEF_H
#  include <stddef.h>
#endif

#define POLY_TAGSHIFT 1

#if (defined(_WIN32))
#  include <windows.h>
#endif

#ifdef POLYML32IN64
typedef int32_t        POLYSIGNED;
typedef uint32_t       POLYUNSIGNED;
#define SIZEOF_POLYWORD 4
#else
typedef intptr_t       POLYSIGNED;
typedef uintptr_t      POLYUNSIGNED;
#define SIZEOF_POLYWORD SIZEOF_VOIDP
#endif

// libpolyml uses printf-style I/O instead of C++ standard IOstreams,
// so we need a specifier to format POLYUNSIGNED/POLYSIGNED values.
#ifdef POLYML32IN64
#if (defined(PRIu32))
#  define POLYUFMT PRIu32
#  define POLYSFMT PRId32
#elif (defined(_MSC_VER))
#  define POLYUFMT "lu"
#  define POLYSFMT "ld"
#else
#  define POLYUFMT "u"
#  define POLYSFMT "d"
#endif
#elif (defined(PRIuPTR))
#  define POLYUFMT PRIuPTR
#  define POLYSFMT PRIdPTR
#elif (defined(_MSC_VER) && (SIZEOF_POLYWORD == 8))
#  define POLYUFMT "llu"
#  define POLYSFMT "lld"
#else
#  define POLYUFMT "lu"         // as before. Cross your fingers.
#  define POLYSFMT "ld"         // idem.
#endif

// We can use the C99 %zu in most cases except MingW since it uses
// the old msvcrt and that only supports C89.
#if (defined(_WIN32) && (! defined(_MSC_VER) || _MSC_VER < 1800))
#  if (SIZEOF_VOIDP == 8)
#    define PRI_SIZET PRIu64
#  else
#    define PRI_SIZET PRIu32
#  endif
#else
#  define PRI_SIZET "zu"
#endif

typedef unsigned char byte;

class PolyObject;
typedef PolyObject *POLYOBJPTR;

#ifdef POLYML32IN64
class PolyWord;
extern PolyWord *globalHeapBase, *globalCodeBase;
typedef uint32_t POLYOBJECTPTR; // This is an index into globalHeapBase

// If a 64-bit value is in the range of the object pointers.
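// For example (illustrative addresses): with globalHeapBase at 0x100000000,
// the object index 0x200 denotes the absolute address
// 0x100000000 + 0x200 * sizeof(PolyWord) = 0x100000800. Indexes are always
// less than 2^32, so anything above 0xffffffff must be an absolute (e.g.
// code or C stack) address rather than an object index.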
inline bool IsHeapAddress(void *addr) { return (uintptr_t)addr <= 0xffffffff; } #else typedef POLYOBJPTR POLYOBJECTPTR; inline bool IsHeapAddress(void *) { return true; } #endif typedef byte *POLYCODEPTR; class PolyWord { public: // Initialise to TAGGED(0). This is very rarely used. PolyWord() { contents.unsignedInt = 1; } // Integers need to be tagged. static PolyWord TaggedInt(POLYSIGNED s) { return PolyWord((s << POLY_TAGSHIFT) | (POLYSIGNED)0x01); } static PolyWord TaggedUnsigned(POLYUNSIGNED u) { return PolyWord((u << POLY_TAGSHIFT) | 0x01); } static PolyWord FromStackAddr(PolyWord *sp) { return PolyWord(sp); } static PolyWord FromCodePtr(POLYCODEPTR p) { return PolyWord(p); } // Tests for the various cases. bool IsTagged(void) const { return (contents.unsignedInt & 1) != 0; } #ifndef POLYML32IN64 // In native 32-bit and 64-bit addresses are on word boundaries bool IsDataPtr(void) const { return (contents.unsignedInt & (sizeof(PolyWord) - 1)) == 0; } #else // In 32-in-64 addresses are anything that isn't tagged. bool IsDataPtr(void) const { return (contents.unsignedInt & 1) == 0; } #ifdef POLYML32IN64DEBUG static POLYOBJECTPTR AddressToObjectPtr(void *address); #else static POLYOBJECTPTR AddressToObjectPtr(void *address) { return (POLYOBJECTPTR)((PolyWord*)address - globalHeapBase); } #endif #endif // Extract the various cases. POLYSIGNED UnTagged(void) const { return contents.signedInt >> POLY_TAGSHIFT; } POLYUNSIGNED UnTaggedUnsigned(void) const { return contents.unsignedInt >> POLY_TAGSHIFT; } #ifdef POLYML32IN64 PolyWord(POLYOBJPTR p) { contents.objectPtr = AddressToObjectPtr(p); } PolyWord *AsStackAddr(PolyWord *base = globalHeapBase) const { return base + contents.objectPtr; } POLYOBJPTR AsObjPtr(PolyWord *base = globalHeapBase) const { return (POLYOBJPTR)AsStackAddr(base); } #else // An object pointer can become a word directly. PolyWord(POLYOBJPTR p) { contents.objectPtr = p; } POLYOBJPTR AsObjPtr(PolyWord *base = 0) const { return contents.objectPtr; } PolyWord *AsStackAddr(PolyWord *base=0) const { return (PolyWord *)contents.objectPtr; } #endif POLYCODEPTR AsCodePtr(void) const { return (POLYCODEPTR)AsObjPtr(); } void *AsAddress(void)const { return AsCodePtr(); } // There are a few cases where we need to store and extract untagged values static PolyWord FromUnsigned(POLYUNSIGNED u) { return PolyWord(u); } static PolyWord FromSigned(POLYSIGNED s) { return PolyWord(s); } POLYUNSIGNED AsUnsigned(void) const { return contents.unsignedInt; } POLYSIGNED AsSigned(void) const { return contents.signedInt; } protected: PolyWord(POLYSIGNED s) { contents.signedInt = s; } PolyWord(POLYUNSIGNED u) { contents.unsignedInt = u; } public: bool operator == (PolyWord b) const { return contents.unsignedInt == b.contents.unsignedInt; } bool operator != (PolyWord b) const { return contents.unsignedInt != b.contents.unsignedInt; } protected: #ifdef POLYML32IN64 PolyWord(PolyWord *sp) { contents.objectPtr = AddressToObjectPtr(sp); } PolyWord(POLYCODEPTR p) { contents.objectPtr = AddressToObjectPtr(p); } #else PolyWord(PolyWord *sp) { contents.objectPtr = (PolyObject*)sp; } PolyWord(POLYCODEPTR p) { contents.objectPtr = (PolyObject*)p; } #endif union { POLYSIGNED signedInt; // A tagged integer - lowest bit set POLYUNSIGNED unsignedInt; // A tagged integer - lowest bit set POLYOBJECTPTR objectPtr; // Object pointer - lowest bit clear. 
    } contents;
};

//typedef PolyWord POLYWORD;

inline bool OBJ_IS_AN_INTEGER(const PolyWord & a)   { return a.IsTagged(); }
inline bool OBJ_IS_DATAPTR(const PolyWord & a)      { return a.IsDataPtr(); }
// The maximum tagged signed number is one less than 0x80 shifted into the top byte then shifted down
// by the tag shift.
#define MAXTAGGED (((POLYSIGNED)0x80 << (POLYSIGNED)(8*(sizeof(PolyWord)-1) -POLY_TAGSHIFT)) -1)
inline PolyWord TAGGED(POLYSIGNED a)                { return PolyWord::TaggedInt(a); }
inline POLYSIGNED UNTAGGED(PolyWord a)              { return a.UnTagged(); }
inline POLYUNSIGNED UNTAGGED_UNSIGNED(PolyWord a)   { return a.UnTaggedUnsigned(); }

#define IS_INT(x) ((x).IsTagged())

/* length word flags */
#define OBJ_PRIVATE_FLAGS_SHIFT     (8 * (sizeof(PolyWord) - 1))

#define _TOP_BYTE(x)                ((POLYUNSIGNED)(x) << OBJ_PRIVATE_FLAGS_SHIFT)

// Bottom two bits define the content format.
// Zero bits mean ordinary word object containing addresses or tagged integers.
#define F_BYTE_OBJ                  0x01 /* byte object (contains no pointers) */
#define F_CODE_OBJ                  0x02 /* code object (mixed bytes and words) */
#define F_CLOSURE_OBJ               0x03 /* closure (32-in-64 only). First word is code addr. */
#define F_GC_MARK                   0x04 // Used during the GC marking phase
#define F_NO_OVERWRITE              0x08 /* don't overwrite when loading - mutables only. */
// This bit is overloaded and has different meanings depending on what other bits are set.
// For byte objects it is the sign bit for arbitrary precision ints.
// For other data it indicates either that the object is a profile block or contains
// information for allocation profiling.
#define F_NEGATIVE_BIT              0x10 // Sign bit for arbitrary precision ints (byte segs only)
#define F_PROFILE_BIT               0x10 // Object has a profile pointer (word segs only)
#define F_WEAK_BIT                  0x20 /* object contains weak references to option values. */
// The Weak bit is only used on mutables. The data sharing (sharedata.cpp) uses this with
// immutables to indicate that the length field is being used to store the "depth".
#define F_MUTABLE_BIT               0x40 /* object is mutable */
#define F_TOMBSTONE_BIT             0x80 // Object is a forwarding pointer
#define F_PRIVATE_FLAGS_MASK        0xFF

// Shifted bits
#define _OBJ_BYTE_OBJ               _TOP_BYTE(F_BYTE_OBJ)       /* byte object (contains no pointers) */
#define _OBJ_CODE_OBJ               _TOP_BYTE(F_CODE_OBJ)       /* code object (mixed bytes and words) */
#define _OBJ_CLOSURE_OBJ            _TOP_BYTE(F_CLOSURE_OBJ)    // closure (32-in-64 only). First word is code addr.
#define _OBJ_GC_MARK                _TOP_BYTE(F_GC_MARK)        // Mark bit
#define _OBJ_NO_OVERWRITE           _TOP_BYTE(F_NO_OVERWRITE)   /* don't overwrite when loading - mutables only. */
#define _OBJ_NEGATIVE_BIT           _TOP_BYTE(F_NEGATIVE_BIT)   /* sign bit for arbitrary precision ints */
#define _OBJ_PROFILE_BIT            _TOP_BYTE(F_PROFILE_BIT)    /* object has a profile pointer */
#define _OBJ_WEAK_BIT               _TOP_BYTE(F_WEAK_BIT)
#define _OBJ_MUTABLE_BIT            _TOP_BYTE(F_MUTABLE_BIT)    /* object is mutable */
#define _OBJ_TOMBSTONE_BIT          _TOP_BYTE(F_TOMBSTONE_BIT)  // object is a tombstone.
#define _OBJ_PRIVATE_FLAGS_MASK     _TOP_BYTE(F_PRIVATE_FLAGS_MASK)

#define _OBJ_PRIVATE_LENGTH_MASK    ((-1) ^ _OBJ_PRIVATE_FLAGS_MASK)
#define MAX_OBJECT_SIZE             _OBJ_PRIVATE_LENGTH_MASK

//
inline bool OBJ_IS_LENGTH(POLYUNSIGNED L)           { return ((L & _OBJ_TOMBSTONE_BIT) == 0); }

/* these should only be applied to proper length words */
/* discards GC flag, mutable bit and weak bit.
*/ inline byte GetTypeBits(POLYUNSIGNED L) { return (byte)(L >> OBJ_PRIVATE_FLAGS_SHIFT) & 0x03; } inline POLYUNSIGNED OBJ_OBJECT_LENGTH(POLYUNSIGNED L) { return L & _OBJ_PRIVATE_LENGTH_MASK; } inline bool OBJ_IS_BYTE_OBJECT(POLYUNSIGNED L) { return (GetTypeBits(L) == F_BYTE_OBJ); } inline bool OBJ_IS_CODE_OBJECT(POLYUNSIGNED L) { return (GetTypeBits(L) == F_CODE_OBJ); } inline bool OBJ_IS_CLOSURE_OBJECT(POLYUNSIGNED L) { return (GetTypeBits(L) == F_CLOSURE_OBJ); } inline bool OBJ_IS_NO_OVERWRITE(POLYUNSIGNED L) { return ((L & _OBJ_NO_OVERWRITE) != 0); } inline bool OBJ_IS_NEGATIVE(POLYUNSIGNED L) { return ((L & _OBJ_NEGATIVE_BIT) != 0); } inline bool OBJ_HAS_PROFILE(POLYUNSIGNED L) { return ((L & _OBJ_PROFILE_BIT) != 0); } inline bool OBJ_IS_MUTABLE_OBJECT(POLYUNSIGNED L) { return ((L & _OBJ_MUTABLE_BIT) != 0); } inline bool OBJ_IS_WEAKREF_OBJECT(POLYUNSIGNED L) { return ((L & _OBJ_WEAK_BIT) != 0); } /* Don't need to worry about whether shift is signed, because OBJ_PRIVATE_USER_FLAGS_MASK removes the sign bit. We don't want the GC bit (which should be 0) anyway. */ #define OBJ_PRIVATE_USER_FLAGS_MASK _TOP_BYTE(0x7F) #define OBJ_IS_WORD_OBJECT(L) (GetTypeBits(L) == 0) /* case 2 - forwarding pointer */ inline bool OBJ_IS_POINTER(POLYUNSIGNED L) { return (L & _OBJ_TOMBSTONE_BIT) != 0; } #ifdef POLYML32IN64 inline PolyObject *OBJ_GET_POINTER(POLYUNSIGNED L) { return (PolyObject*)(globalHeapBase + ((L & ~_OBJ_TOMBSTONE_BIT) << 1)); } inline POLYUNSIGNED OBJ_SET_POINTER(PolyObject *pt) { return PolyWord::AddressToObjectPtr(pt) >> 1 | _OBJ_TOMBSTONE_BIT; } #else inline PolyObject *OBJ_GET_POINTER(POLYUNSIGNED L) { return (PolyObject*)(( L & ~_OBJ_TOMBSTONE_BIT) <<2); } inline POLYUNSIGNED OBJ_SET_POINTER(PolyObject *pt) { return ((POLYUNSIGNED)pt >> 2) | _OBJ_TOMBSTONE_BIT; } #endif // An object i.e. a piece of allocated memory in the heap. In the simplest case this is a // tuple, a list cons cell, a string or a ref. Every object has a length word in the word before // where its address points. The top byte of this contains flags. class PolyObject { public: byte *AsBytePtr(void)const { return (byte*)this; } PolyWord *AsWordPtr(void)const { return (PolyWord*)this; } POLYUNSIGNED LengthWord(void)const { return ((PolyWord*)this)[-1].AsUnsigned(); } POLYUNSIGNED Length(void)const { return OBJ_OBJECT_LENGTH(LengthWord()); } // Get and set a word PolyWord Get(POLYUNSIGNED i) const { return ((PolyWord*)this)[i]; } void Set(POLYUNSIGNED i, PolyWord v) { ((PolyWord*)this)[i] = v; } PolyWord *Offset(POLYUNSIGNED i) const { return ((PolyWord*)this)+i; } // Create a length word from a length and the flags in the top byte. 
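    // For example (values for illustration only): on native 64-bit,
    // SetLengthWord(3, F_BYTE_OBJ | F_MUTABLE_BIT) stores 0x4100000000000003
    // in the word before the object: length 3 in the low-order bytes and the
    // flag byte 0x41 in the top byte.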
void SetLengthWord(POLYUNSIGNED l, byte f) { ((POLYUNSIGNED*)this)[-1] = l | ((POLYUNSIGNED)f << OBJ_PRIVATE_FLAGS_SHIFT); } void SetLengthWord(POLYUNSIGNED l) { ((PolyWord*)this)[-1] = PolyWord::FromUnsigned(l); } bool IsByteObject(void) const { return OBJ_IS_BYTE_OBJECT(LengthWord()); } bool IsCodeObject(void) const { return OBJ_IS_CODE_OBJECT(LengthWord()); } bool IsClosureObject(void) const { return OBJ_IS_CLOSURE_OBJECT(LengthWord()); } bool IsWordObject(void) const { return OBJ_IS_WORD_OBJECT(LengthWord()); } bool IsMutable(void) const { return OBJ_IS_MUTABLE_OBJECT(LengthWord()); } bool IsWeakRefObject(void) const { return OBJ_IS_WEAKREF_OBJECT(LengthWord()); } bool IsNoOverwriteObject(void) const { return OBJ_IS_NO_OVERWRITE(LengthWord()); } bool ContainsForwardingPtr(void) const { return OBJ_IS_POINTER(LengthWord()); } PolyObject *GetForwardingPtr(void) const { return OBJ_GET_POINTER(LengthWord()); } void SetForwardingPtr(PolyObject *newp) { ((PolyWord*)this)[-1] = PolyWord::FromUnsigned(OBJ_SET_POINTER(newp)); } bool ContainsNormalLengthWord(void) const { return OBJ_IS_LENGTH(LengthWord()); } // Follow a chain of forwarding pointers PolyObject *FollowForwardingChain(void) { if (ContainsForwardingPtr()) return GetForwardingPtr()->FollowForwardingChain(); else return this; } }; // Stacks are native-words size even in 32-in-64. union stackItem { stackItem(PolyWord v) { argValue = v.AsUnsigned(); } stackItem() { argValue = TAGGED(0).AsUnsigned(); } // These return the low order word. PolyWord w()const { return PolyWord::FromUnsigned((POLYUNSIGNED)argValue); } operator PolyWord () { return PolyWord::FromUnsigned((POLYUNSIGNED)argValue); } POLYCODEPTR codeAddr; // Return addresses stackItem* stackAddr; // Stack addresses uintptr_t argValue; // Treat an address as an int + PolyObject* absAddress; // It could be an absolute address to the heap }; /* There was a problem with version 2.95 on Sparc/Solaris at least. The PolyObject class has no members so classes derived from it e.g. ML_Cons_Cell should begin at the beginning of the object. Later versions of GCC get this right. */ #if defined(__GNUC__) && (__GNUC__ <= 2) #error Poly/ML requires GCC version 3 or newer #endif inline POLYUNSIGNED GetLengthWord(PolyWord p) { return p.AsObjPtr()->LengthWord(); } // Get the length of an object. inline POLYUNSIGNED OBJECT_LENGTH(PolyWord p) { return OBJ_OBJECT_LENGTH(GetLengthWord(p)); } // A list cell. This can be passed to or returned from certain RTS functions. class ML_Cons_Cell: public PolyObject { public: PolyWord h; PolyWord t; #define ListNull (TAGGED(0)) static bool IsNull(PolyWord p) { return p == ListNull; } }; /* An exception packet. This contains an identifier (either a tagged integer for RTS exceptions or the address of a mutable for those created within ML), a string name for printing and an exception argument value. */ class PolyException: public PolyObject { public: PolyWord ex_id; /* Exc identifier */ PolyWord ex_name;/* Exc name */ PolyWord arg; /* Exc arguments */ PolyWord ex_location; // Location of "raise". Always zero for RTS exceptions. }; typedef PolyException poly_exn; /* Macro to round a number of bytes up to a number of words. */ #define WORDS(s) ((s+sizeof(PolyWord)-1)/sizeof(PolyWord)) /********************************************************************** * * Representation of option type. * **********************************************************************/ #define NONE_VALUE (TAGGED(0)) /* SOME x is represented by a single word cell containing x. 
*/

#if (defined(_WIN32))
/* Windows doesn't include 0x in %p format. */
#define ZERO_X  "0x"
#else
#define ZERO_X  ""
#endif

// ARM instructions are always little-endian even in big-endian mode
#ifdef WORDS_BIGENDIAN
inline uint32_t reverseBytes32(uint32_t value)
{
    return (((value & 0x000000ff) << 24) | ((value & 0x0000ff00) << 8) |
            ((value & 0x00ff0000) >> 8) | ((value & 0xff000000) >> 24));
}
inline uint32_t fromARMInstr(uint32_t instr) { return reverseBytes32(instr); }
inline uint32_t toARMInstr(uint32_t instr) { return reverseBytes32(instr); }
#else
inline uint32_t fromARMInstr(uint32_t instr) { return instr; }
inline uint32_t toARMInstr(uint32_t instr) { return instr; }
#endif

#endif

diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODE.sig b/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODE.sig
index f4ca93a4..8b7e57e3 100644
--- a/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODE.sig
+++ b/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODE.sig
@@ -1,438 +1,438 @@
(*
    Signature for the high-level ARM64 code

-    Copyright David C. J. Matthews 2021
+    Copyright David C. J. Matthews 2021-2

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License version 2.1 as published by the Free Software Foundation.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*)

signature ARM64ICODE =
sig
    type machineWord = Address.machineWord
    type address = Address.address

    type closureRef

    (* Registers. *)
    datatype xReg = XReg of Word8.word | XZero | XSP
    and vReg = VReg of Word8.word

    (* It is simpler to use a single type for all registers. *)
    datatype reg = GenReg of xReg | FPReg of vReg

    val X0:  xReg   and X1:  xReg   and X2:  xReg   and X3:  xReg
    and X4:  xReg   and X5:  xReg   and X6:  xReg   and X7:  xReg
    and X8:  xReg   and X9:  xReg   and X10: xReg   and X11: xReg
    and X12: xReg   and X13: xReg   and X14: xReg   and X15: xReg
    and X16: xReg   and X17: xReg   and X18: xReg   and X19: xReg
    and X20: xReg   and X21: xReg   and X22: xReg   and X23: xReg
    and X24: xReg   and X25: xReg   and X26: xReg   and X27: xReg
    and X28: xReg   and X29: xReg   and X30: xReg

    val V0: vReg    and V1: vReg    and V2: vReg    and V3: vReg
    and V4: vReg    and V5: vReg    and V6: vReg    and V7: vReg

    val is32in64: bool and isBigEndian: bool

    (* Condition for conditional branches etc. *)
    datatype condition =
        CondEqual            (* Z=1 *)
    |   CondNotEqual         (* Z=0 *)
    |   CondCarrySet         (* C=1 *)
    |   CondCarryClear       (* C=0 *)
    |   CondNegative         (* N=1 *)
    |   CondPositive         (* N=0 includes zero *)
    |   CondOverflow         (* V=1 *)
    |   CondNoOverflow       (* V=0 *)
    |   CondUnsignedHigher   (* C=1 && Z=0 *)
    |   CondUnsignedLowOrEq  (* ! (C=1 && Z=0) *)
    |   CondSignedGreaterEq  (* N=V *)
    |   CondSignedLess       (* N<>V *)
    |   CondSignedGreater    (* Z==0 && N=V *)
    |   CondSignedLessEq     (* !(Z==0 && N=V) *)

    (* The shift used in arithmetic operations. *)
    and shiftType =
        ShiftLSL of Word8.word
    |   ShiftLSR of Word8.word
    |   ShiftASR of Word8.word
    |   ShiftNone

    datatype preg = PReg of int (* A pseudo-register - an abstract register. *)

    (* If the value is zero we can use X0/W0. *)
    datatype pregOrZero = SomeReg of preg | ZeroReg

    (* A location on the stack. May be more than a word if this is a container or a handler entry.
*) datatype stackLocn = StackLoc of {size: int, rno: int } (* This combines pregKind and stackLocn. *) datatype regProperty = RegPropGeneral (* A general register. *) | RegPropUntagged (* An untagged general register. *) | RegPropStack of int (* A stack location or container. *) | RegPropCacheTagged | RegPropCacheUntagged | RegPropMultiple (* The result of a conditional or case. May be defined at multiple points. *) (* The reference to a condition code. *) datatype ccRef = CcRef of int datatype loadType = Load64 | Load32 | Load16 | Load8 and opSize = OpSize32 | OpSize64 and logicalOp = LogAnd | LogOr | LogXor and callKind = Recursive | ConstantCode of machineWord | FullCall and floatSize = Float32 | Double64 and shiftDirection = ShiftLeft | ShiftRightLogical | ShiftRightArithmetic and multKind = MultAdd32 | MultSub32 | MultAdd64 | MultSub64 | SignedMultAddLong (* 32bit*32bit + 64bit => 64bit *) | SignedMultHigh (* High order part of 64bit*64bit *) and fpUnary = NegFloat | NegDouble | AbsFloat | AbsDouble | ConvFloatToDble | ConvDbleToFloat and fpBinary = MultiplyFP | DivideFP | AddFP | SubtractFP (* Function calls can have an unlimited number of arguments so it isn't always going to be possible to load them into registers. *) datatype 'genReg fnarg = ArgInReg of 'genReg | ArgOnStack of { wordOffset: int, container: stackLocn, field: int } datatype ('genReg, 'optGenReg, 'fpReg) arm64ICode = (* Move the contents of one preg to another. These are always 64-bits. *) MoveRegister of { source: 'genReg, dest: 'genReg } (* Numerical constant. *) | LoadNonAddressConstant of { source: Word64.word, dest: 'genReg } (* Address constant. *) | LoadAddressConstant of { source: machineWord, dest: 'genReg } (* Load a value into a register using a constant, signed, byte offset. The offset is in the range of -256 to (+4095*unit size). *) | LoadWithConstantOffset of { base: 'genReg, dest: 'genReg, byteOffset: int, loadType: loadType } (* Similarly for FP registers. *) | LoadFPWithConstantOffset of { base: 'genReg, dest: 'fpReg, byteOffset: int, floatSize: floatSize } (* Load a value into a register using an index register. *) | LoadWithIndexedOffset of { base: 'genReg, dest: 'genReg, index: 'genReg, loadType: loadType } (* Ditto for FP. *) | LoadFPWithIndexedOffset of { base: 'genReg, dest: 'fpReg, index: 'genReg, floatSize: floatSize } (* Returns the current thread ID. Always a 64-bit value. *) | GetThreadId of { dest: 'genReg } (* Convert a 32-in-64 object index into an absolute address. *) | ObjectIndexAddressToAbsolute of { source: 'genReg, dest: 'genReg } (* Convert an absolute address into an object index. *) | AbsoluteToObjectIndex of { source: 'genReg, dest: 'genReg } (* Allocates a fixed-size piece of memory and puts the absolute address into dest. bytesRequired is the total number of bytes including the length word and any alignment necessary for 32-in-64. saveRegs is the list of registers that need to be saved if we need to do a garbage collection. *) | AllocateMemoryFixed of { bytesRequired: Word64.word, dest: 'genReg, saveRegs: 'genReg list } (* Allocate a piece of memory. The size argument is an untagged value containing the number of words i.e. the same value used for InitialiseMemory and to store in the length word. *) | AllocateMemoryVariable of { size: 'genReg, dest: 'genReg, saveRegs: 'genReg list } (* Initialise a piece of memory by writing "size" copies of the value in "init". N.B. The size is an untagged value containing the number of words.
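For example (an illustration, not from the original comment): clearing a freshly allocated three-word mutable cell uses the untagged value 3 both as the size passed to AllocateMemoryVariable and as the size passed to InitialiseMem, and the same untagged 3 goes into the length word.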
*) | InitialiseMem of { size: 'genReg, addr: 'genReg, init: 'genReg } (* Mark the beginning of a loop. This is really only to prevent the initialisation code being duplicated in ICodeOptimise. *) | BeginLoop (* Set up the registers for a jump back to the start of a loop. *) | JumpLoop of { regArgs: {src: 'genReg fnarg, dst: 'genReg} list, stackArgs: {src: 'genReg fnarg, wordOffset: int, stackloc: stackLocn} list, checkInterrupt: 'genReg list option } (* Store a register using a constant, signed, byte offset. The offset is in the range of -256 to (+4095*unit size). *) | StoreWithConstantOffset of { source: 'genReg, base: 'genReg, byteOffset: int, loadType: loadType } (* Ditto for FP regs. *) | StoreFPWithConstantOffset of { source: 'fpReg, base: 'genReg, byteOffset: int, floatSize: floatSize } (* Store a register using an index register. *) | StoreWithIndexedOffset of { source: 'genReg, base: 'genReg, index: 'genReg, loadType: loadType } (* and for FP regs. *) | StoreFPWithIndexedOffset of { source: 'fpReg, base: 'genReg, index: 'genReg, floatSize: floatSize } (* Add/Subtract immediate. The destination is optional in which case XZero is used. ccRef is optional. If it is NONE the version of the instruction that does not generate a condition code is used. immed must be < 0wx1000. *) | AddSubImmediate of { source: 'genReg, dest: 'optGenReg, ccRef: ccRef option, immed: word, isAdd: bool, length: opSize } (* Add/Subtract register. As with AddSubImmediate, both the destination and cc are optional. *) | AddSubRegister of { base: 'genReg, shifted: 'genReg, dest: 'optGenReg, ccRef: ccRef option, isAdd: bool, length: opSize, shift: shiftType } (* Bitwise logical operations. The immediate value must be a valid bit pattern. ccRef can only be SOME if logOp is LogAnd. *) | LogicalImmediate of { source: 'genReg, dest: 'optGenReg, ccRef: ccRef option, immed: Word64.word, logOp: logicalOp, length: opSize } (* Register logical operations. ccRef can only be SOME if logOp is LogAnd. *) | LogicalRegister of { base: 'genReg, shifted: 'genReg, dest: 'optGenReg, ccRef: ccRef option, logOp: logicalOp, length: opSize, shift: shiftType } (* Shift a word by an amount specified in a register. *) | ShiftRegister of { direction: shiftDirection, dest: 'genReg, source: 'genReg, shift: 'genReg, opSize: opSize } (* The various forms of multiply all take three arguments and the general form is dest = M * N +/- A. *) | Multiplication of { kind: multKind, dest: 'genReg, sourceA: 'optGenReg, sourceM: 'genReg, sourceN: 'genReg } (* Signed or unsigned division. Sets the result to zero if the divisor is zero. *) | Division of { isSigned: bool, dest: 'genReg, dividend: 'genReg, divisor: 'genReg, opSize: opSize } (* Start of function. Set the register arguments. stackArgs is the list of stack arguments. If the function has a real closure regArgs includes the closure register (X8). The register arguments include the return register (X30). *) | BeginFunction of { regArgs: ('genReg * xReg) list, stackArgs: stackLocn list } (* Call a function. If the code address is a constant it is passed here. Otherwise the address is obtained by indirecting through X8 which has been loaded as one of the argument registers. The result is stored in the destination register. The "containers" argument is used to ensure that any container whose address is passed as one of the other arguments continues to be referenced until the function is called since there's a possibility that it isn't actually used after the function.
*) | FunctionCall of { callKind: callKind, regArgs: ('genReg fnarg * xReg) list, stackArgs: 'genReg fnarg list, dest: 'genReg, saveRegs: 'genReg list, containers: stackLocn list} (* Jump to a tail-recursive function. This is similar to FunctionCall but complicated for stack arguments because the stack and the return address need to be overwritten. stackAdjust is the number of words to remove (positive) or add (negative) to the stack before the call. currStackSize contains the number of items currently on the stack. *) | TailRecursiveCall of { callKind: callKind, regArgs: ('genReg fnarg * xReg) list, stackArgs: {src: 'genReg fnarg, stack: int} list, stackAdjust: int, currStackSize: int } (* Return from the function. resultReg is the preg that contains the result, returnReg is the preg that contains the return address. *) | ReturnResultFromFunction of { resultReg: 'genReg, returnReg: 'genReg, numStackArgs: int } (* Raise an exception. The packet is always loaded into X0. *) | RaiseExceptionPacket of { packetReg: 'genReg } (* Push a register to the stack. This is used both for a normal push, copies=1, and also to reserve a container. *) | PushToStack of { source: 'genReg, copies: int, container: stackLocn } (* Load a register from the stack. The container is the stack location identifier, the field is an offset in a container. *) | LoadStack of { dest: 'genReg, wordOffset: int, container: stackLocn, field: int } (* Store a value into the stack. *) | StoreToStack of { source: 'genReg, container: stackLocn, field: int, stackOffset: int } (* Set the register to the address of the container i.e. a specific offset on the stack. *) | ContainerAddress of { dest: 'genReg, container: stackLocn, stackOffset: int } (* Remove items from the stack. Used to remove containers or registers pushed to the stack. *) | ResetStackPtr of { numWords: int } (* Tag a value by shifting and setting the tag bit. *) | TagValue of { source: 'genReg, dest: 'genReg, isSigned: bool, opSize: opSize } (* Shift a value to remove the tag bit. The cache is used if this is untagging a value that has previously been tagged. *) | UntagValue of { source: 'genReg, dest: 'genReg, isSigned: bool, opSize: opSize } (* Box a largeword value. Stores a value into a byte area. This can be implemented using AllocateMemoryFixed but keeping it separate makes optimisation easier. The result is always an address and needs to be converted to an object index on 32-in-64. *) | BoxLarge of { source: 'genReg, dest: 'genReg, saveRegs: 'genReg list } (* Load a value from a box. This can be implemented using a load but is kept separate to simplify optimisation. The source is always an absolute address. *) | UnboxLarge of { source: 'genReg, dest: 'genReg } (* Convert a floating point value into a value suitable for storing in the heap. This normally involves boxing except that 32-bit floats can be tagged in native 64-bits. *) | BoxTagFloat of { floatSize: floatSize, source: 'fpReg, dest: 'genReg, saveRegs: 'genReg list } (* The reverse of BoxTagFloat. *) | UnboxTagFloat of { floatSize: floatSize, source: 'genReg, dest: 'fpReg } (* Load a value with acquire semantics. This means that any other load in this thread after this sees the value of the shared memory at this point and not earlier. This is used for references and arrays to ensure that if another thread has built a data structure on the heap and then assigns the address to a shared ref this thread will see the updated heap and not any locally cached previous version.
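Illustration (added here, not part of the original comment; the register choices are arbitrary): the pairing compiles down to something like
        writer: str x1, [x0, #8]    ; initialise the object
                stlr x0, [x2]       ; StoreRelease of its address into the ref
        reader: ldar x3, [x2]       ; LoadAcquire of the ref
                ldr x4, [x3, #8]    ; sees the writer's str once ldar has returned the new address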
*) | LoadAcquire of { base: 'genReg, dest: 'genReg, loadType: loadType } (* Store a value with release semantics. This ensures that any other write completes before this operation and works with LoadAcquire. *) | StoreRelease of { base: 'genReg, source: 'genReg, loadType: loadType } (* This is a generalised constant shift which includes selection of a range of bits. *) | BitFieldShift of { source: 'genReg, dest: 'genReg, isSigned: bool, length: opSize, immr: word, imms: word } (* Copy a range of bits and insert it into another register. This is the only case where a register functions both as a source and a destination. *) | BitFieldInsert of { source: 'genReg, destAsSource: 'genReg, dest: 'genReg, length: opSize, immr: word, imms: word } (* Indexed case. *) | IndexedCaseOperation of { testReg: 'genReg } (* Exception handling. - Set up an exception handler. *) | PushExceptionHandler (* End of a handled section. Restore the previous handler. *) | PopExceptionHandler (* Marks the start of a handler. This sets the stack pointer and restores the old handler. Sets the exception packet register. *) | BeginHandler of { packetReg: 'genReg } (* Compare two vectors of bytes and set the condition code on the result. The registers are modified by the instruction. *) | CompareByteVectors of { vec1Addr: 'genReg, vec2Addr: 'genReg, length: 'genReg, ccRef: ccRef } (* Move a block of bytes (isByteMove true) or words (isByteMove false). The length is the number of items (bytes or words) to move. The registers are modified by the instruction. *) | BlockMove of { srcAddr: 'genReg, destAddr: 'genReg, length: 'genReg, isByteMove: bool } (* Add or subtract to the system stack pointer and optionally return the new value. This is used to allocate and deallocate C space. *) | AddSubXSP of { source: 'genReg, dest: 'optGenReg, isAdd: bool } (* Ensures the value will actually be referenced although it doesn't generate any code. *) | TouchValue of { source: 'genReg } (* Load a value at the address and get exclusive access. Always loads a 64-bit value. *) | LoadAcquireExclusive of { base: 'genReg, dest: 'genReg } (* Store a value into an address releasing the lock. Sets the result to 0 if it succeeds and to 1 if it fails. *) | StoreReleaseExclusive of { base: 'genReg, source: 'optGenReg, result: 'genReg } (* Insert a memory barrier. dmb ish. *) | MemoryBarrier (* Convert an integer to a floating point value. *) | ConvertIntToFloat of { source: 'genReg, dest: 'fpReg, srcSize: opSize, destSize: floatSize } (* Convert a floating point value to an integer using the specified rounding mode. We could get an overflow here but fortunately the ARM generates a value that will cause an overflow when we tag it, provided we tag it explicitly. *) | ConvertFloatToInt of { source: 'fpReg, dest: 'genReg, srcSize: floatSize, destSize: opSize, rounding: IEEEReal.rounding_mode } (* Unary floating point. This includes conversions between float and double. *) | UnaryFloatingPt of { source: 'fpReg, dest: 'fpReg, fpOp: fpUnary } (* Binary floating point: addition, subtraction, multiplication and division. *) | BinaryFloatingPoint of { arg1: 'fpReg, arg2: 'fpReg, dest: 'fpReg, fpOp: fpBinary, opSize: floatSize } (* Floating point comparison. *) | CompareFloatingPoint of { arg1: 'fpReg, arg2: 'fpReg, ccRef: ccRef, opSize: floatSize } (* Debugging - fault if values don't match. *) | CacheCheck of { arg1: 'genReg, arg2: 'genReg } (* Destinations at the end of a basic block.
*) and controlFlow = (* Unconditional branch to a label - should be a merge point. *) Unconditional of int (* Conditional branch. Jumps to trueJump if the condition is true and to falseJump if it is false. *) | Conditional of { ccRef: ccRef, condition: condition, trueJump: int, falseJump: int } (* Exit - the last instruction of the block is a return, raise or tailcall. *) | ExitCode (* Indexed case - this branches to one of a number of labels *) | IndexedBr of int list (* Set up a handler. This doesn't cause an immediate branch but the state at the start of the handler is the state at this point. *) | SetHandler of { handler: int, continue: int } (* Unconditional branch to a handler. Used if an exception is raised explicitly within the scope of a handler. *) | UnconditionalHandle of int (* Conditional branch to a handler. Occurs if there is a call to a function within the scope of a handler. It may jump to the handler. *) | ConditionalHandle of { handler: int, continue: int } and ('genReg, 'optGenReg, 'fpReg) basicBlock = BasicBlock of { block: ('genReg, 'optGenReg, 'fpReg) arm64ICode list, flow: controlFlow } (* Return the successor blocks from a control flow. *) val successorBlocks: controlFlow -> int list type iCodeAbstract = (preg, pregOrZero, preg) arm64ICode and basicBlockAbstract = (preg, pregOrZero, preg) basicBlock and iCodeConcrete = (xReg, xReg, vReg) arm64ICode and basicBlockConcrete = (xReg, xReg, vReg) basicBlock val printICodeAbstract: basicBlockAbstract vector * (string -> unit) -> unit and printICodeConcrete: basicBlockConcrete vector * (string -> unit) -> unit (* Check whether this value is acceptable for LogicalImmediate. *) val isEncodableBitPattern: Word64.word * opSize -> bool (* This generates a BitField instruction with the appropriate values for immr and imms. *) val shiftConstant: { direction: shiftDirection, dest: preg, source: preg, shift: word, opSize: opSize } -> iCodeAbstract structure Sharing: sig type xReg = xReg and vReg = vReg and reg = reg and condition = condition and shiftType = shiftType and ('genReg, 'optGenReg, 'fpReg) arm64ICode = ('genReg, 'optGenReg, 'fpReg) arm64ICode and preg = preg and pregOrZero = pregOrZero and controlFlow = controlFlow and ('genReg, 'optGenReg, 'fpReg) basicBlock = ('genReg, 'optGenReg, 'fpReg) basicBlock and stackLocn = stackLocn and regProperty = regProperty and ccRef = ccRef and 'genReg fnarg = 'genReg fnarg and closureRef = closureRef and loadType = loadType and opSize = opSize and logicalOp = logicalOp and callKind = callKind and floatSize = floatSize and shiftDirection = shiftDirection and multKind = multKind and fpUnary = fpUnary and fpBinary = fpBinary end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64AllocateRegisters.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64AllocateRegisters.ML index 2a783a5e..26faaf83 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64AllocateRegisters.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64AllocateRegisters.ML @@ -1,1094 +1,1113 @@ (* Copyright David C. J. Matthews 2016-21 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64AllocateRegisters( structure Arm64ICode: ARM64ICODE structure Identify: ARM64IDENTIFYREFERENCES structure IntSet: INTSET sharing Arm64ICode.Sharing = Identify.Sharing = IntSet ): ARM64ALLOCATEREGISTERS = struct open Arm64ICode open Identify open IntSet open Address exception InternalError = Misc.InternalError + val checkCache = true (* Check the cache rather than use it *) + datatype allocateResult = AllocateSuccess of basicBlockConcrete vector | AllocateFailure of intSet list (* General registers. X24 is used as the global heap base in 32-in-64. X30 is the return address set by blr but is otherwise a general register. Put the argument registers at the end of the list so they'll only be used when hinted. *) val generalRegisters = map GenReg ([X9, X10, X11, X12, X13, X14, X15, X19, X20, X21, X22, X23, X0, X1, X2, X3, X4, X5, X6, X7, X8, X30] @ (if is32in64 then [] else [X24])) val floatingPtRegisters = map FPReg [V7, V6, V5, V4, V3, V2, V1] type conflictState = { conflicts: intSet, realConflicts: reg list } type triple = {instr: iCodeAbstract, current: intSet, active: intSet} exception InternalError = Misc.InternalError (* Get the conflict states, allocate registers and return the code with the allocated registers if it is successful. *) fun allocateRegisters{blocks, regProps, maxPRegs, ...} = let (* Other registers that conflict with this i.e. cannot share the same real register. *) val regConflicts = Array.array(maxPRegs, emptySet) (* Real registers that cannot be used for this because they are needed for an instruction. Only X30 in calls and RTS traps. *) and regRealConflicts = Array.array(maxPRegs, []: reg list) fun addConflictsTo(addTo, conflicts) = List.app(fn aReg => Array.update(regConflicts, aReg, union(Array.sub(regConflicts, aReg), conflicts))) addTo (* To reserve a register we need to add the real register to the real conflict sets of all the abstract conflicts. *) local fun isInset reg set = List.exists (fn r => r = reg) set in fun reserveRegister(reserveFor, reg) = let fun reserveAReg r = let val absConflicts = Array.sub(regConflicts, r) fun addConflict i = if isInset i reserveFor then () else addRealConflict (i, reg) in List.app addConflict (setToList absConflicts) end in List.app reserveAReg reserveFor end and addRealConflict (i, reg) = let val currentConflicts = Array.sub(regRealConflicts, i) in if isInset reg currentConflicts then () else Array.update(regRealConflicts, i, reg :: currentConflicts) end end fun conflictsForInstr passThrough {instr, current, ...} = let val {sources, dests} = getInstructionRegisters instr fun regNo(PReg i) = i val destRegNos = map regNo dests and sourceRegNos = map regNo sources val destSet = listToSet destRegNos val afterRemoveDests = minus(current, destSet) local (* In almost all circumstances the destination and sources don't conflict and the same register can be used as a destination and a source. BoxLarge can only store the value after the memory has been allocated. BitFieldInsert has to copy the "destAsSource" value into the destination so cannot use the same register for the "source". *) val postInstruction = case instr of BoxLarge _ => destRegNos @ sourceRegNos | BoxTagFloat _ => destRegNos @ sourceRegNos (* Not sure about this. 
*) | BitFieldInsert{source, ...} => regNo source :: destRegNos | _ => destRegNos in (* If there is more than one destination they conflict with each other. *) val () = addConflictsTo(postInstruction, listToSet postInstruction); (* Mark conflicts for the destinations, i.e. after the instruction. The destinations conflict with the registers that are used subsequently. *) val () = addConflictsTo(postInstruction, current); val () = addConflictsTo(postInstruction, passThrough); (* Mark conflicts for the sources i.e. before the instruction. *) (* Sources must be set up as conflicts with each other i.e. when we come to allocate registers we must choose different real registers for different abstract registers. *) val () = addConflictsTo(sourceRegNos, listToSet sourceRegNos) val () = addConflictsTo(sourceRegNos, afterRemoveDests); val () = addConflictsTo(sourceRegNos, passThrough) end (* I'm not sure if this is needed. There was a check in the old code to ensure that different registers were used for loop variables even if they were actually unused. This may happen anyway. Comment and code copied from X86 version. Retain it for the moment. *) val () = case instr of JumpLoop{regArgs, ...} => let val destRegs = List.foldl(fn ({dst=PReg loopReg, ...}, dests) => loopReg :: dests) [] regArgs in addConflictsTo(destRegs, listToSet destRegs); addConflictsTo(destRegs, current); addConflictsTo(destRegs, passThrough) end | _ => () (* Certain instructions are specific as to the real registers. *) val () = case instr of ReturnResultFromFunction{ returnReg=PReg retReg, ... } => (* We're going to put the return value in X0 so we can't use that for the return address. *) addRealConflict(retReg, GenReg X0) | RaiseExceptionPacket{ packetReg } => (* This wasn't needed previously because we always pushed the registers across an exception. *) reserveRegister([regNo packetReg], GenReg X0) | BeginHandler { packetReg, ...} => reserveRegister([regNo packetReg], GenReg X0) | FunctionCall { dest, regArgs, ...} => (* This is only needed if we are saving the registers rather than marking them as "must push". *) let val destReg = regNo dest in reserveRegister([destReg], GenReg X0); (* The argument registers also conflict. In order to execute this call we need to load the arguments into specific registers so we can't use them for values that we want after the call. We use regNo dest here because that will conflict with everything immediately afterwards. *) List.app(fn (_, r) => reserveRegister([destReg], GenReg r)) regArgs; (* Likewise X30 since that's the return address. *) addRealConflict(destReg, GenReg X30) end (* We can't use X30 as the result because it's needed for the return addr if we have to GC. *) | AllocateMemoryFixed{dest=PReg destReg, ...} => addRealConflict(destReg, GenReg X30) | AllocateMemoryVariable{dest=PReg destReg, ...} => addRealConflict(destReg, GenReg X30) | BoxLarge{dest=PReg destReg, ...} => addRealConflict(destReg, GenReg X30) | BoxTagFloat{dest=PReg destReg, ...} => addRealConflict(destReg, GenReg X30) (* Could exclude floats on native addr. *) | _ => () in () end (* Process the block. *) fun conflictsForBlock(ExtendedBasicBlock{block, passThrough, exports, ...}) = let (* We need to establish conflicts between all the registers active at the end of the block since they may not be established elsewhere. This isn't necessary for an unconditional branch since the same registers will be included in the block that is the target of the branch, possibly along with others. 
However if this is a conditional or indexed branch we may have different sets at each of the targets and we have to ensure that all the registers differ. *) val united = union(exports, passThrough) val () = addConflictsTo(setToList united, united) val () = List.app (conflictsForInstr passThrough) block in () end val () = Vector.app conflictsForBlock blocks (* Hint values. The idea of hints is that by using a hinted register we may avoid an unnecessary move instruction. realHints is set when a pseudo-register is going to be loaded from a specific register e.g. a register argument, or moved into one e.g. X0 for the function result. friends is set to the other pReg that may be associated with the pReg. Typically this is where we have a merge register that we move some value into. *) val realHints = Array.array(maxPRegs, NONE: reg option) (* Sources and destinations. These indicate the registers that are the sources and destinations of the indexing register and are used as hints. If a register has been allocated for a source or destination we may be able to reuse it. *) val sourceRegs = Array.array(maxPRegs, []: int list) and destinationRegs = Array.array(maxPRegs, []: int list) local (* Real hints. If this is the source of a value e.g. a function argument in a register, we'll use it directly. If, though, this is the result of a function and we want the result to end up in a specific register we want to propagate it to any pReg that moves its value into this. *) fun addRealHint(r, reg) = case Array.sub(realHints, r) of SOME _ => () | NONE => ( (* Add to this pReg *) Array.update(realHints, r, SOME reg); (* and to any other pReg that moves here. *) List.app(fn r => addRealHint(r, reg)) (Array.sub(sourceRegs, r)) ) fun addSourceAndDestinationHint{src, dst} = let val conflicts = Array.sub(regConflicts, src) in (* If they conflict we can't add them. *) if member(dst, conflicts) then () else let val currentDests = Array.sub(destinationRegs, src) val currentSources = Array.sub(sourceRegs, dst) in (* Add the destination for this source i.e. the registers we move this source into. *) if List.exists(fn i => i=dst) currentDests then () else Array.update(destinationRegs, src, dst :: currentDests); (* Add the source to the list of sources for this destination. A merge register may have several sources, a different one for each path. If the destination has a real hint we want to propagate that back. That isn't needed for the destinations because we allocate the registers from the start forward. *) if List.exists(fn i => i=src) currentSources then () else let val sources = src :: currentSources val () = Array.update(sourceRegs, dst, sources) in case Array.sub(realHints, dst) of NONE => () | SOME real => List.app(fn r => addRealHint(r, real)) sources end end end (* Add the hints to steer the register allocation. The idea is to avoid moves between registers by getting values into the appropriate register in advance. We don't actually need to add real hints where the real register is providing the value, e.g. BeginFunction, because the allocation process will take care of that. *) fun addHints{instr=MoveRegister{source=PReg sreg, dest=PReg dreg, ...}, ...} = addSourceAndDestinationHint {src=sreg, dst=dreg} | addHints{instr=BitFieldInsert{destAsSource=PReg dsReg, dest=PReg dReg, ...}, ...} = (* The "destAsSource" is the destination if some bits are retained. *) addSourceAndDestinationHint {src=dsReg, dst=dReg} | addHints{instr=ReturnResultFromFunction { resultReg=PReg resReg, ... 
}, ...} = addRealHint(resReg, GenReg X0) | addHints{instr=JumpLoop{regArgs, ...}, ...} = let fun addRegArg {src=ArgInReg(PReg argReg), dst=PReg resReg} = addSourceAndDestinationHint {dst=resReg, src=argReg} | addRegArg {src=ArgOnStack _, ...} = () in List.app addRegArg regArgs end | addHints{instr=BeginFunction{regArgs, ...}, ...} = List.app (fn (PReg pr, reg) => addRealHint(pr, GenReg reg)) regArgs | addHints{instr=TailRecursiveCall{regArgs, ...}, ...} = let fun setHint(ArgInReg(PReg pr), reg) = addRealHint(pr, GenReg reg) | setHint _ = () in List.app setHint regArgs end | addHints{instr=FunctionCall{regArgs, dest=PReg dreg, ...}, ...} = let fun setHint(ArgInReg(PReg pr), reg) = addRealHint(pr, GenReg reg) | setHint _ = () in addRealHint(dreg, GenReg X0); List.app setHint regArgs end (* Exception packets are in X0 *) | addHints{instr=RaiseExceptionPacket{ packetReg=PReg preg }, ...} = addRealHint(preg, GenReg X0) | addHints{instr=BeginHandler{ packetReg=PReg preg }, ...} = addRealHint(preg, GenReg X0) | addHints _ = () in val () = Vector.app(fn ExtendedBasicBlock { block, ...} => List.app addHints block) blocks end val allocatedRegs = Array.array(maxPRegs, NONE: reg option) val failures = ref []: intSet list ref (* Find a real register for a preg. 1. If a register is already allocated use that. 2. Try the "preferred" register if one has been given. 3. Try the realHints value if there is one. 4. See if there is a "friend" that has an appropriate register 5. Look at all the registers and find one. *) fun findRegister(r, pref, regSet, cache) = case Array.sub(allocatedRegs, r) of SOME reg => reg | NONE => let val conflicts = Array.sub(regConflicts, r) and realConflicts = Array.sub(regRealConflicts, r) (* Find the registers we've already allocated that may conflict. *) val conflictingRegs = List.mapPartial(fn i => Array.sub(allocatedRegs, i)) (setToList conflicts) @ realConflicts fun isFree aReg = not (List.exists(fn i => i=aReg) conflictingRegs) fun tryAReg NONE = NONE | tryAReg (somePref as SOME prefReg) = if isFree prefReg then (Array.update(allocatedRegs, r, somePref); somePref) else NONE (* Search the sources and destinations to see if a register has already been allocated or there is a hint. *) fun findAFriend([], [], _) = NONE | findAFriend(aDest :: otherDests, sources, alreadySeen) = let val possReg = case Array.sub(allocatedRegs, aDest) of v as SOME _ => tryAReg v | NONE => tryAReg(Array.sub(realHints, aDest)) in case possReg of reg as SOME _ => reg | NONE => let (* Add the destinations of the destinations to the list if they don't conflict and haven't been seen. *) fun newFriend f = not(List.exists (fn n => n=f) alreadySeen) andalso not(member(f, conflicts)) val fOfF = List.filter newFriend (Array.sub(destinationRegs, aDest)) in findAFriend(otherDests @ fOfF, sources, aDest :: alreadySeen) end end | findAFriend([], aSrc :: otherSrcs, alreadySeen) = let val possReg = case Array.sub(allocatedRegs, aSrc) of v as SOME _ => tryAReg v | NONE => tryAReg(Array.sub(realHints, aSrc)) in case possReg of reg as SOME _ => reg | NONE => let (* Add the sources of the sources to the list if they don't conflict and haven't been seen. 
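Illustration (added, not from the original source): the friend search is in effect a breadth-first walk over the move graph looking for the first node that already has a register or a real hint. A self-contained toy version, where the allocated and friends functions stand in for the array lookups used here:
    fun firstAllocated allocated friends start =
    let
        fun bfs([], _) = NONE
          | bfs(f :: rest, seen) =
                if List.exists (fn s => s = f) seen then bfs(rest, seen)
                else (case allocated f of
                          SOME r => SOME r
                        | NONE => bfs(rest @ friends f, f :: seen))
    in
        bfs(friends start, [start])
    end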
*) fun newFriend f = not(List.exists (fn n => n=f) alreadySeen) andalso not(member(f, conflicts)) val fOfF = List.filter newFriend (Array.sub(sourceRegs, aSrc)) in findAFriend([], otherSrcs @ fOfF, aSrc :: alreadySeen) end in case tryAReg pref of SOME r => r | NONE => ( case tryAReg (Array.sub(realHints, r)) of SOME r => r | NONE => ( case findAFriend(Array.sub(destinationRegs, r), Array.sub(sourceRegs, r), []) of SOME r => r (* Look through the registers to find one that's free. First try excluding the cache registers. *) | NONE => let (* First try filtering out all the cache registers to see if we can find a register. If that fails see if it works when one of the cache registers is allowed. *) fun filterCache(filteredRegset, []) = List.find isFree filteredRegset | filterCache(filteredRegset, (cReg, _) :: cache) = ( case filterCache(List.filter(fn r => r <> cReg) filteredRegset, cache) of NONE => if isFree cReg then SOME cReg else NONE | result => result ) val pick = case filterCache(regSet, cache) of SOME reg => reg | NONE => ( (* This failed. We're going to have to spill something. *) failures := conflicts :: ! failures; hd regSet (* Return something to allow this pass to complete *) ) val () = Array.update(allocatedRegs, r, SOME pick) in pick end ) ) end (* Turn the abstract icode into a concrete version by allocating the registers. *) local + fun asGenReg(GenReg reg) = reg | asGenReg _ = raise InternalError "asGenReg" + and asFPReg(FPReg reg) = reg | asFPReg _ = raise InternalError "asFPReg" + datatype cacheItem = CacheStack of stackLocn | CacheObjectAddress of preg (* Remove any reference to newly allocated registers from the cache. Also used after block move and comparison that modify registers *) fun pruneCache(reg: reg, cache) = List.filter(fn (r, _) => r <> reg) cache + (* Return the cache registers that contain valid addresses. *) + fun cachedAddressRegs cache = List.map (asGenReg o #1) cache + fun allocateNewDestination(PReg r, pref, regSet, cacheList) = case Array.sub(allocatedRegs, r) of SOME reg => ( case Vector.sub(regProps, r) of RegPropMultiple => (reg, pruneCache(reg, cacheList)) (* This is allowed for merge registers *) | _ => raise InternalError "Register defined at multiple points" ) | NONE => let val reg = findRegister(r, pref, regSet, cacheList) in (reg, pruneCache(reg, cacheList)) end - fun asGenReg(GenReg reg) = reg | asGenReg _ = raise InternalError "asGenReg" - and asFPReg(FPReg reg) = reg | asFPReg _ = raise InternalError "asFPReg" - fun allocateGenReg(r, cache) = let val (reg, newCache) = allocateNewDestination(r, NONE, generalRegisters, cache) in (asGenReg reg, newCache) end and allocateFPReg(r, cache) = let val (reg, newCache) = allocateNewDestination(r, NONE, floatingPtRegisters, cache) in (asFPReg reg, newCache) end and allocateGenRegOrZero(ZeroReg, cache) = (XZero, cache) | allocateGenRegOrZero(SomeReg reg, cache) = allocateGenReg(reg, cache) fun getAllocatedGenReg(PReg r) = case Array.sub(allocatedRegs, r) of SOME(GenReg reg) => reg | _ => raise InternalError "getAllocatedGenReg" and getAllocatedFPReg(PReg r) = case Array.sub(allocatedRegs, r) of SOME(FPReg reg) => reg | _ => raise InternalError "getAllocatedFPReg" fun getAllocatedGenRegOrZero ZeroReg = XZero | getAllocatedGenRegOrZero (SomeReg reg) = getAllocatedGenReg reg fun getAllocatedArg(ArgInReg reg) = ArgInReg(getAllocatedGenReg reg) | getAllocatedArg(ArgOnStack stackLoc) = ArgOnStack stackLoc val getSaveRegs = List.map getAllocatedGenReg (* Reversed cons and append to make the code easier to read.
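For example, [] <::> i1 <::> i2 builds [i2, i1]; List.rev is applied once the block is complete to restore the original instruction order.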
*) infix 5 <::> <@> fun tl <::> hd = hd :: tl (*and snd <@> fst = fst @ snd*) fun absToConcrete([], context, code) = (context, code) | absToConcrete({instr=MoveRegister{ source, dest}, ...} :: rest, cache, code) = let (* Try to use the register we've allocated for the source as the destination so that we can eliminate this instruction altogether. *) val sourceReg = getAllocatedGenReg source val (destReg, newCache) = allocateNewDestination(dest, SOME(GenReg sourceReg), generalRegisters, cache) val dReg = asGenReg destReg in if sourceReg = dReg then absToConcrete(rest, newCache, code) else absToConcrete(rest, newCache, code <::> MoveRegister { source=sourceReg, dest=dReg}) end | absToConcrete({instr=LoadNonAddressConstant { dest, source}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, cache) in absToConcrete(rest, newCache, code <::> LoadNonAddressConstant { dest=destReg, source=source}) end | absToConcrete({instr=LoadAddressConstant { dest, source}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, cache) in absToConcrete(rest, newCache, code <::> LoadAddressConstant { dest=destReg, source=source}) end | absToConcrete({instr=LoadWithConstantOffset { base, dest, byteOffset, loadType}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, cache) in absToConcrete(rest, newCache, code <::> LoadWithConstantOffset { base=getAllocatedGenReg base, dest=destReg, byteOffset=byteOffset, loadType=loadType}) end | absToConcrete({instr=LoadFPWithConstantOffset { base, dest, byteOffset, floatSize}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateFPReg(dest, cache) in absToConcrete(rest, newCache, code <::> LoadFPWithConstantOffset { base=getAllocatedGenReg base, dest=destReg, byteOffset=byteOffset, floatSize=floatSize}) end | absToConcrete({instr=LoadWithIndexedOffset { base, dest, index, loadType}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, cache) in absToConcrete(rest, newCache, code <::> LoadWithIndexedOffset { base=getAllocatedGenReg base, dest=destReg, index=getAllocatedGenReg index, loadType=loadType}) end | absToConcrete({instr=LoadFPWithIndexedOffset { base, dest, index, floatSize}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateFPReg(dest, cache) in absToConcrete(rest, newCache, code <::> LoadFPWithIndexedOffset { base=getAllocatedGenReg base, dest=destReg, index=getAllocatedGenReg index, floatSize=floatSize}) end | absToConcrete({instr=GetThreadId { dest}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, cache) in absToConcrete(rest, newCache, code <::> GetThreadId { dest=destReg}) end | absToConcrete({instr=ObjectIndexAddressToAbsolute { source as PReg srcNo, dest}, kill, ...} :: rest, cache, code) = let (* See if this is in the cache and use it if it is. If this is the last reference to this source entry we don't want it in the cache any longer. *) val killThis = member(srcNo, kill) val (newCode, destReg, newCache) = case List.find(fn (_, CacheObjectAddress c) => c=source | _ => false) cache of SOME (srcReg, _) => + if checkCache + then let val (destReg, newCache) = allocateGenReg(dest, cache) in (code <::> ObjectIndexAddressToAbsolute { source=getAllocatedGenReg source, dest=destReg} <::> CacheCheck{ arg1=destReg, arg2=asGenReg srcReg }, GenReg destReg, newCache) end -(* let + else + let (* Try to use the cache register as the destination if we can. 
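For example, if the cache records that X9 already holds the absolute address, choosing X9 as the destination makes the move below a no-op and it is dropped altogether.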
*) val (destReg, newCache) = allocateNewDestination(dest, SOME srcReg, generalRegisters, cache) val dReg = asGenReg destReg and sReg = asGenReg srcReg in if dReg = sReg then (code, destReg, newCache) (* We will have pruned this since it's the destination. *) else (code <::> MoveRegister { source=sReg, dest=dReg}, destReg, if killThis then pruneCache(srcReg, newCache) else newCache) - end*) + end + | NONE => let val (destReg, newCache) = allocateGenReg(dest, cache) in (code <::> ObjectIndexAddressToAbsolute { source=getAllocatedGenReg source, dest=destReg}, GenReg destReg, newCache) end in absToConcrete(rest, if killThis then newCache else (destReg, CacheObjectAddress source) :: newCache, newCode) end | absToConcrete({instr=AbsoluteToObjectIndex { source, dest}, ...} :: rest, cache, code) = let (* Don't make an entry in the cache for this; it won't be used. *) val (destReg, newCache) = allocateGenReg(dest, cache) in absToConcrete(rest, newCache, code <::> AbsoluteToObjectIndex { source=getAllocatedGenReg source, dest=destReg}) end | absToConcrete({instr=AllocateMemoryFixed { bytesRequired, dest, saveRegs }, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, cache) val filteredCache = pruneCache(GenReg X30, newCache) + val saved = cachedAddressRegs filteredCache @ getSaveRegs saveRegs in absToConcrete(rest, filteredCache, code <::> - AllocateMemoryFixed { dest=destReg, bytesRequired=bytesRequired, saveRegs=getSaveRegs saveRegs}) + AllocateMemoryFixed { dest=destReg, bytesRequired=bytesRequired, saveRegs=saved}) end | absToConcrete({instr=AllocateMemoryVariable{size, dest, saveRegs}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, cache) val filteredCache = pruneCache(GenReg X30, newCache) + val saved = cachedAddressRegs filteredCache @ getSaveRegs saveRegs in absToConcrete(rest, filteredCache, code <::> - AllocateMemoryVariable{size=getAllocatedGenReg size, dest=destReg, saveRegs=getSaveRegs saveRegs}) + AllocateMemoryVariable{size=getAllocatedGenReg size, dest=destReg, saveRegs=saved}) end | absToConcrete({instr=InitialiseMem{size, addr, init}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> InitialiseMem{size=getAllocatedGenReg size, addr=getAllocatedGenReg addr, init=getAllocatedGenReg init}) | absToConcrete({instr=BeginLoop, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> BeginLoop) | absToConcrete({instr=JumpLoop{regArgs, stackArgs, checkInterrupt}, ...} :: rest, context, code) = let fun getStackArg{src, wordOffset, stackloc} = {src=getAllocatedArg src, wordOffset=wordOffset, stackloc=stackloc} and getRegArg{src, dst} = {src=getAllocatedArg src, dst=getAllocatedGenReg dst} in absToConcrete(rest, context, code <::> JumpLoop{ regArgs=map getRegArg regArgs, stackArgs=map getStackArg stackArgs, checkInterrupt=Option.map getSaveRegs checkInterrupt}) end | absToConcrete({instr=StoreWithConstantOffset { base, source, byteOffset, loadType}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> StoreWithConstantOffset{ base=getAllocatedGenReg base, source=getAllocatedGenReg source, byteOffset=byteOffset, loadType=loadType}) | absToConcrete({instr=StoreFPWithConstantOffset { base, source, byteOffset, floatSize}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> StoreFPWithConstantOffset{ base=getAllocatedGenReg base, source=getAllocatedFPReg source, byteOffset=byteOffset, floatSize=floatSize}) | absToConcrete({instr=StoreWithIndexedOffset { base, source, 
index, loadType}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> StoreWithIndexedOffset{ base=getAllocatedGenReg base, source=getAllocatedGenReg source, index=getAllocatedGenReg index, loadType=loadType}) | absToConcrete({instr=StoreFPWithIndexedOffset { base, source, index, floatSize}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> StoreFPWithIndexedOffset{ base=getAllocatedGenReg base, source=getAllocatedFPReg source, index=getAllocatedGenReg index, floatSize=floatSize}) | absToConcrete({instr=AddSubImmediate{ source, dest, ccRef, immed, isAdd, length }, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenRegOrZero(dest, cache) in absToConcrete(rest, newCache, code <::> AddSubImmediate { source=getAllocatedGenReg source, dest=destReg, ccRef=ccRef, immed=immed, isAdd=isAdd, length=length}) end | absToConcrete({instr=AddSubRegister{ base, shifted, dest, ccRef, isAdd, length, shift}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenRegOrZero(dest, cache) in absToConcrete(rest, newCache, code <::> AddSubRegister { base=getAllocatedGenReg base, shifted=getAllocatedGenReg shifted, dest=destReg, ccRef=ccRef, isAdd=isAdd, length=length, shift=shift}) end | absToConcrete({instr=LogicalImmediate{ source, dest, ccRef, immed, logOp, length }, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenRegOrZero(dest, cache) in absToConcrete(rest, newCache, code <::> LogicalImmediate { source=getAllocatedGenReg source, dest=destReg, ccRef=ccRef, immed=immed, logOp=logOp, length=length}) end | absToConcrete({instr=LogicalRegister{ base, shifted, dest, ccRef, logOp, length, shift}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenRegOrZero(dest, cache) in absToConcrete(rest, newCache, code <::> LogicalRegister { base=getAllocatedGenReg base, shifted=getAllocatedGenReg shifted, dest=destReg, ccRef=ccRef, logOp=logOp, length=length, shift=shift}) end | absToConcrete({instr=ShiftRegister{ direction, dest, source, shift, opSize}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, cache) in absToConcrete(rest, newCache, code <::> ShiftRegister { source=getAllocatedGenReg source, shift=getAllocatedGenReg shift, dest=destReg, direction=direction, opSize=opSize}) end | absToConcrete({instr=Multiplication{ kind, dest, sourceA, sourceM, sourceN }, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, cache) in absToConcrete(rest, newCache, code <::> Multiplication { kind=kind, sourceA=getAllocatedGenRegOrZero sourceA, sourceM=getAllocatedGenReg sourceM, sourceN=getAllocatedGenReg sourceN, dest=destReg}) end | absToConcrete({instr=Division{ isSigned, dest, dividend, divisor, opSize }, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, cache) in absToConcrete(rest, newCache, code <::> Division { isSigned=isSigned, dividend=getAllocatedGenReg dividend, divisor=getAllocatedGenReg divisor, dest=destReg, opSize=opSize}) end | absToConcrete({instr=BeginFunction {regArgs, stackArgs}, ...} :: rest, _, code) = let (* Allocate the register arguments. At this point all the registers are free and the cache is empty. However we may have a "real conflict" that means that the allocated register is different. e.g. we need this argument some time after an arbitrary precision operation that may call a function. 
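When that happens findRegister rejects the hinted argument register, say X0, because of the real conflict and chooses another, say X19, and the value is moved out of the argument register when the BeginFunction is translated.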
*) fun allocReg(src, dst) = let val (destReg, _) = allocateNewDestination(src, SOME(GenReg dst), generalRegisters, []) in (asGenReg destReg, dst) end in absToConcrete(rest, [], code <::> BeginFunction {regArgs=map allocReg regArgs, stackArgs=stackArgs}) end | absToConcrete({instr=FunctionCall{callKind, regArgs, stackArgs, dest, containers, saveRegs, ...}, ...} :: rest, _, code) = let fun getRegArg(src, dst) = (getAllocatedArg src, dst) val (destReg, _) = allocateGenReg(dest, []) - (* Currently we empty the cache at this point. At the very least we need to remove - any registers that could be modified e.g. X0 and X30. *) + (* Currently we empty the cache at this point. Saving the registers across + the call would probably be more expensive than reloading them. *) in absToConcrete(rest, [] (* Empty after a function call. *), code <::> FunctionCall{ callKind=callKind, regArgs=map getRegArg regArgs, stackArgs=map getAllocatedArg stackArgs, dest=destReg, saveRegs=getSaveRegs saveRegs, containers=containers}) end | absToConcrete({instr=TailRecursiveCall{callKind, regArgs, stackArgs, stackAdjust, currStackSize}, ...} :: rest, context, code) = let fun getRegArg(src, dst) = (getAllocatedArg src, dst) and getStackArg{src, stack} = {src=getAllocatedArg src, stack=stack} in absToConcrete(rest, context, code <::> TailRecursiveCall{ callKind=callKind, regArgs=map getRegArg regArgs, stackArgs=map getStackArg stackArgs, stackAdjust=stackAdjust, currStackSize=currStackSize}) end | absToConcrete({instr=ReturnResultFromFunction{resultReg, returnReg, numStackArgs}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> ReturnResultFromFunction{resultReg=getAllocatedGenReg resultReg, returnReg=getAllocatedGenReg returnReg, numStackArgs=numStackArgs}) | absToConcrete({instr=RaiseExceptionPacket{packetReg}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> RaiseExceptionPacket{packetReg=getAllocatedGenReg packetReg}) | absToConcrete({instr=PushToStack{ source, container, copies }, ...} :: rest, cache, code) = let val srcReg = getAllocatedGenReg source val newCache = (GenReg srcReg, CacheStack container) :: cache in absToConcrete(rest, newCache, code <::> PushToStack{source=srcReg, container=container, copies=copies}) end | absToConcrete({instr=LoadStack{ dest, container as StackLoc{rno, ...} , field=0, wordOffset}, kill, ...} :: rest, cache, code) = let (* See if this is in the cache and use it if it is. If this is the last reference to this stack entry we don't want it in the cache any longer. *) val killThis = member(rno, kill) val (newCode, destReg, newCache) = case List.find(fn (_, CacheStack c) => c=container | _ => false) cache of SOME (srcReg, _) => + if checkCache + then let val (destReg, newCache) = allocateGenReg(dest, cache) in (code <::> LoadStack{ dest=destReg, container=container, field=0, wordOffset=wordOffset } <::> CacheCheck{ arg1=destReg, arg2=asGenReg srcReg }, GenReg destReg, newCache) end - (* + else let (* Try to use the cache register as the destination if we can. *) val (destReg, newCache) = allocateNewDestination(dest, SOME srcReg, generalRegisters, cache) val dReg = asGenReg destReg and sReg = asGenReg srcReg in if dReg = sReg then (code, destReg, newCache) (* We will have pruned this since it's the destination.
*) else (code <::> MoveRegister { source=sReg, dest=dReg}, destReg, if killThis then pruneCache(srcReg, newCache) else newCache) - end*) + end + | NONE => let val (destReg, newCache) = allocateGenReg(dest, cache) in (code <::> LoadStack{ dest=destReg, container=container, field=0, wordOffset=wordOffset }, GenReg destReg, newCache) end in absToConcrete(rest, if killThis then newCache else (destReg, CacheStack container) :: newCache, newCode) end | absToConcrete({instr=LoadStack{ dest, container, field, wordOffset}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, cache) in absToConcrete(rest, newCache, code <::> LoadStack{ dest=destReg, container=container, field=field, wordOffset=wordOffset }) end | absToConcrete({instr=StoreToStack{source, container, field, stackOffset}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> StoreToStack{source=getAllocatedGenReg source, container=container, field=field, stackOffset=stackOffset}) | absToConcrete({instr=ContainerAddress{ dest, container, stackOffset}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, cache) in absToConcrete(rest, newCache, code <::> ContainerAddress{ dest=destReg, container=container, stackOffset=stackOffset }) end | absToConcrete({instr=ResetStackPtr {numWords}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> ResetStackPtr {numWords=numWords}) | absToConcrete({instr=TagValue{source, dest, isSigned, opSize}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, cache) in absToConcrete(rest, newCache, code <::> TagValue{source=getAllocatedGenReg source, dest=destReg, isSigned=isSigned, opSize=opSize}) end | absToConcrete({instr=UntagValue{source, dest, isSigned, opSize, ...}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, cache) in absToConcrete(rest, newCache, code <::> UntagValue{source=getAllocatedGenReg source, dest=destReg, isSigned=isSigned, opSize=opSize}) end | absToConcrete({instr=BoxLarge{source, dest, saveRegs, ...}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, cache) val filteredCache = pruneCache(GenReg X30, newCache) + val saved = cachedAddressRegs filteredCache @ getSaveRegs saveRegs in absToConcrete(rest, filteredCache, code <::> - BoxLarge{source=getAllocatedGenReg source, dest=destReg, saveRegs=getSaveRegs saveRegs}) + BoxLarge{source=getAllocatedGenReg source, dest=destReg, saveRegs=saved}) end | absToConcrete({instr=UnboxLarge{source, dest}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, cache) in absToConcrete(rest, newCache, code <::> UnboxLarge{source=getAllocatedGenReg source, dest=destReg}) end | absToConcrete({instr=BoxTagFloat{floatSize, source, dest, saveRegs}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, cache) val filteredCache = pruneCache(GenReg X30, newCache) + val saved = cachedAddressRegs filteredCache @ getSaveRegs saveRegs in - absToConcrete(rest, filteredCache, code <::> BoxTagFloat{floatSize=floatSize, source=getAllocatedFPReg source, - dest=destReg, saveRegs=getSaveRegs saveRegs}) + absToConcrete(rest, filteredCache, code <::> + BoxTagFloat{floatSize=floatSize, source=getAllocatedFPReg source, dest=destReg, saveRegs=saved}) end | absToConcrete({instr=UnboxTagFloat{floatSize, source, dest}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateFPReg(dest, cache) in absToConcrete(rest, newCache, code <::> 
UnboxTagFloat{floatSize=floatSize, source=getAllocatedGenReg source, dest=destReg}) end | absToConcrete({instr=LoadAcquire { base, dest, loadType}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, cache) in absToConcrete(rest, newCache, code <::> LoadAcquire { base=getAllocatedGenReg base, dest=destReg, loadType=loadType}) end | absToConcrete({instr=StoreRelease { base, source, loadType}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> StoreRelease{ base=getAllocatedGenReg base, source=getAllocatedGenReg source, loadType=loadType}) | absToConcrete({instr=BitFieldShift{source, dest, isSigned, length, immr, imms}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, cache) in absToConcrete(rest, newCache, code <::> BitFieldShift { source=getAllocatedGenReg source, dest=destReg, isSigned=isSigned, immr=immr, imms=imms, length=length}) end | absToConcrete({instr=BitFieldInsert{source, destAsSource, dest, length, immr, imms}, ...} :: rest, cache, code) = let val destAsSourceReg = getAllocatedGenReg destAsSource val (destReg, newCache) = allocateNewDestination(dest, SOME(GenReg destAsSourceReg), generalRegisters, cache) in absToConcrete(rest, newCache, code <::> BitFieldInsert { source=getAllocatedGenReg source, destAsSource=destAsSourceReg, dest=asGenReg destReg, immr=immr, imms=imms, length=length}) end | absToConcrete({instr=IndexedCaseOperation{testReg}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> IndexedCaseOperation{testReg=getAllocatedGenReg testReg}) | absToConcrete({instr=PushExceptionHandler, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> PushExceptionHandler) | absToConcrete({instr=PopExceptionHandler, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> PopExceptionHandler) | absToConcrete({instr=BeginHandler{packetReg}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(packetReg, cache) in absToConcrete(rest, newCache, code <::> BeginHandler{packetReg=destReg}) end | absToConcrete({instr=CompareByteVectors{vec1Addr, vec2Addr, length, ccRef}, ...} :: rest, cache, code) = let (* This instruction modifies these registers so they must be removed from the cache *) val vec1Reg = getAllocatedGenReg vec1Addr and vec2Reg = getAllocatedGenReg vec2Addr and lenReg = getAllocatedGenReg length val newCache = pruneCache(GenReg vec1Reg, pruneCache(GenReg vec2Reg, pruneCache(GenReg lenReg, cache))) in absToConcrete(rest, newCache, code <::> CompareByteVectors{vec1Addr=vec1Reg, vec2Addr=vec2Reg, length=lenReg, ccRef=ccRef}) end | absToConcrete({instr=BlockMove{srcAddr, destAddr, length, isByteMove}, ...} :: rest, cache, code) = let (* This instruction modifies these registers so they must be removed from the cache *) val srcAReg = getAllocatedGenReg srcAddr and dstAReg = getAllocatedGenReg destAddr and lenReg = getAllocatedGenReg length val newCache = pruneCache(GenReg srcAReg, pruneCache(GenReg dstAReg, pruneCache(GenReg lenReg, cache))) in absToConcrete(rest, newCache, code <::> BlockMove{srcAddr=srcAReg, destAddr=dstAReg, length=lenReg, isByteMove=isByteMove}) end | absToConcrete({instr=AddSubXSP{source, dest, isAdd}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenRegOrZero(dest, cache) in absToConcrete(rest, newCache, code <::> AddSubXSP { source=getAllocatedGenReg source, dest=destReg, isAdd=isAdd}) end | absToConcrete({instr=TouchValue{source}, ...} :: rest, context, code) = absToConcrete(rest, context, 
code <::> TouchValue { source=getAllocatedGenReg source}) | absToConcrete({instr=LoadAcquireExclusive{ base, dest }, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, cache) in absToConcrete(rest, newCache, code <::> LoadAcquireExclusive { base=getAllocatedGenReg base, dest=destReg}) end | absToConcrete({instr=StoreReleaseExclusive{ base, source, result }, ...} :: rest, cache, code) = let val (resultReg, newCache) = allocateGenReg(result, cache) in absToConcrete(rest, newCache, code <::> StoreReleaseExclusive{ base=getAllocatedGenReg base, source=getAllocatedGenRegOrZero source, result=resultReg}) end | absToConcrete({instr=MemoryBarrier, ...} :: rest, cache, code) = absToConcrete(rest, cache, code <::> MemoryBarrier) | absToConcrete({instr=ConvertIntToFloat{ source, dest, srcSize, destSize}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateFPReg(dest, cache) in absToConcrete(rest, newCache, code <::> ConvertIntToFloat{ source=getAllocatedGenReg source, dest=destReg, srcSize=srcSize, destSize=destSize}) end | absToConcrete({instr=ConvertFloatToInt{ source, dest, srcSize, destSize, rounding}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, cache) in absToConcrete(rest, newCache, code <::> ConvertFloatToInt{ source=getAllocatedFPReg source, dest=destReg, srcSize=srcSize, destSize=destSize, rounding=rounding}) end | absToConcrete({instr=UnaryFloatingPt{ source, dest, fpOp}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateFPReg(dest, cache) in absToConcrete(rest, newCache, code <::> UnaryFloatingPt{ source=getAllocatedFPReg source, dest=destReg, fpOp=fpOp}) end | absToConcrete({instr=BinaryFloatingPoint{ arg1, arg2, dest, fpOp, opSize}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateFPReg(dest, cache) in absToConcrete(rest, newCache, code <::> BinaryFloatingPoint{ arg1=getAllocatedFPReg arg1, arg2=getAllocatedFPReg arg2, dest=destReg, fpOp=fpOp, opSize=opSize}) end | absToConcrete({instr=CompareFloatingPoint{ arg1, arg2, opSize, ccRef}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> CompareFloatingPoint{ arg1=getAllocatedFPReg arg1, arg2=getAllocatedFPReg arg2, opSize=opSize, ccRef=ccRef}) + | absToConcrete({instr=CacheCheck _, ...} :: _, _, _) = (* Concrete only. *) + raise InternalError "absToConcrete: CacheCheck" + in fun concreteBlock(ExtendedBasicBlock{ block, flow, ...}) = let val (_, code) = absToConcrete(block, [], []) in BasicBlock{block=List.rev code, flow=flow} end end val numBlocks = Vector.length blocks val resultArray = Array.array(numBlocks, NONE) fun processBlocks blockNo = case Array.sub(resultArray, blockNo) of SOME _ => () (* Done. *) | NONE => let val thisBlock as ExtendedBasicBlock { flow, ...} = Vector.sub(blocks, blockNo) in (* Process this block and add it to the results. *) Array.update(resultArray, blockNo, SOME(concreteBlock thisBlock)); (* Now the blocks that depend on this. *) case flow of ExitCode => () | Unconditional m => processBlocks m | Conditional {trueJump, falseJump, ...} => (processBlocks trueJump; processBlocks falseJump) | IndexedBr cases => List.app processBlocks cases | SetHandler{ handler, continue } => (processBlocks handler; processBlocks continue) | UnconditionalHandle _ => () | ConditionalHandle { continue, ...} => processBlocks continue end in processBlocks 0; (* If the failures list is empty we succeeded. *) case !failures of [] => (* Return the allocation vector.
We may have unused registers, *) AllocateSuccess(Vector.tabulate(numBlocks, fn i => valOf(Array.sub(resultArray, i))) ) (* Else we'll have to spill something. *) | l => AllocateFailure l end val nGenRegs = List.length generalRegisters structure Sharing = struct type intSet = intSet and extendedBasicBlock = extendedBasicBlock and ('genReg, 'optGenReg, 'fpReg) basicBlock = ('genReg, 'optGenReg, 'fpReg) basicBlock and regProperty = regProperty and reg = reg and xReg = xReg and vReg = vReg and allocateResult = allocateResult end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICode.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICode.ML index b36d4681..b75c6257 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICode.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICode.ML @@ -1,973 +1,976 @@ (* - Copyright David C. J. Matthews 2021 + Copyright David C. J. Matthews 2021-2 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64ICode( structure Arm64Code: ARM64PREASSEMBLY ): ARM64ICODE = struct open Arm64Code open Address datatype preg = PReg of int (* A pseudo-register - an abstract register. *) (* If the value is zero we can use X0/W0. *) datatype pregOrZero = SomeReg of preg | ZeroReg (* A location on the stack. May be more than word if this is a container or a handler entry. *) datatype stackLocn = StackLoc of {size: int, rno: int } (* This combines pregKind and stackLocn. *) datatype regProperty = RegPropGeneral (* A general register. *) | RegPropUntagged (* An untagged general register. *) | RegPropStack of int (* A stack location or container. *) | RegPropCacheTagged | RegPropCacheUntagged | RegPropMultiple (* The result of a conditional or case. May be defined at multiple points. *) (* The reference to a condition code. *) datatype ccRef = CcRef of int datatype reg = GenReg of xReg | FPReg of vReg datatype callKind = Recursive | ConstantCode of machineWord | FullCall (* Function calls can have an unlimited number of arguments so it isn't always going to be possible to load them into registers. *) datatype 'genReg fnarg = ArgInReg of 'genReg | ArgOnStack of { wordOffset: int, container: stackLocn, field: int } datatype ('genReg, 'optGenReg, 'fpReg) arm64ICode = (* Move the contents of one preg to another. These are always 64-bits. *) MoveRegister of { source: 'genReg, dest: 'genReg } (* Numerical constant. *) | LoadNonAddressConstant of { source: Word64.word, dest: 'genReg } (* Address constant. *) | LoadAddressConstant of { source: machineWord, dest: 'genReg } (* Load a value into a register using a constant, signed, byte offset. The offset is in the range of -256 to (+4095*unit size). *) | LoadWithConstantOffset of { base: 'genReg, dest: 'genReg, byteOffset: int, loadType: loadType } (* Similarly for FP registers. *) | LoadFPWithConstantOffset of { base: 'genReg, dest: 'fpReg, byteOffset: int, floatSize: floatSize } (* Load a value into a register using an index register. 
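An index register is needed when the offset is computed at run time or cannot be encoded in the constant-offset form above, which covers ARM64's signed 9-bit unscaled encoding (-256..255) and its unsigned, scaled 12-bit encoding (0..4095 units). A minimal sketch of that range check, using the purely illustrative name isValidConstOffset (not part of this commit):
    (* hypothetical helper, for illustration only *)
    fun isValidConstOffset(byteOffset: int, unitSize: int): bool =
        (byteOffset >= ~256 andalso byteOffset <= 255) orelse
        (byteOffset >= 0 andalso byteOffset mod unitSize = 0
            andalso byteOffset div unitSize <= 4095)
e.g. isValidConstOffset(~8, 8) holds via the unscaled form and isValidConstOffset(32760, 8) via the scaled form; anything outside these ranges needs this indexed form or a scratch register.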
*) | LoadWithIndexedOffset of { base: 'genReg, dest: 'genReg, index: 'genReg, loadType: loadType } (* Ditto for FP. *) | LoadFPWithIndexedOffset of { base: 'genReg, dest: 'fpReg, index: 'genReg, floatSize: floatSize } (* Returns the current thread ID. Always a 64-bit value. *) | GetThreadId of { dest: 'genReg } (* Convert a 32-in-64 object index into an absolute address. *) | ObjectIndexAddressToAbsolute of { source: 'genReg, dest: 'genReg } (* Convert an absolute address into an object index. *) | AbsoluteToObjectIndex of { source: 'genReg, dest: 'genReg } (* Allocates a fixed-size piece of memory and puts the absolute address into dest. bytesRequired is the total number of bytes including the length word and any alignment necessary for 32-in-64. saveRegs is the list of registers that need to be saved if we need to do a garbage collection. *) | AllocateMemoryFixed of { bytesRequired: Word64.word, dest: 'genReg, saveRegs: 'genReg list } (* Allocate a piece of memory. The size argument is an untagged value containing the number of words i.e. the same value used for InitialiseMem and to store in the length word. *) | AllocateMemoryVariable of { size: 'genReg, dest: 'genReg, saveRegs: 'genReg list } (* Initialise a piece of memory by writing "size" copies of the value in "init". N.B. The size is an untagged value containing the number of words. *) | InitialiseMem of { size: 'genReg, addr: 'genReg, init: 'genReg } (* Mark the beginning of a loop. This is really only to prevent the initialisation code being duplicated in ICodeOptimise. *) | BeginLoop (* Set up the registers for a jump back to the start of a loop. *) | JumpLoop of { regArgs: {src: 'genReg fnarg, dst: 'genReg} list, stackArgs: {src: 'genReg fnarg, wordOffset: int, stackloc: stackLocn} list, checkInterrupt: 'genReg list option } (* Store a register using a constant, signed, byte offset. The offset is in the range of -256 to (+4095*unit size). *) | StoreWithConstantOffset of { source: 'genReg, base: 'genReg, byteOffset: int, loadType: loadType } (* Ditto for FP regs. *) | StoreFPWithConstantOffset of { source: 'fpReg, base: 'genReg, byteOffset: int, floatSize: floatSize } (* Store a register using an index register. *) | StoreWithIndexedOffset of { source: 'genReg, base: 'genReg, index: 'genReg, loadType: loadType } (* and for FP regs. *) | StoreFPWithIndexedOffset of { source: 'fpReg, base: 'genReg, index: 'genReg, floatSize: floatSize } (* Add/Subtract immediate. The destination is optional in which case XZero is used. ccRef is optional. If it is NONE the version of the instruction that does not generate a condition code is used. immed must be < 0wx1000. *) | AddSubImmediate of { source: 'genReg, dest: 'optGenReg, ccRef: ccRef option, immed: word, isAdd: bool, length: opSize } (* Add/Subtract register. As with AddSubImmediate, both the destination and cc are optional. *) | AddSubRegister of { base: 'genReg, shifted: 'genReg, dest: 'optGenReg, ccRef: ccRef option, isAdd: bool, length: opSize, shift: shiftType } (* Bitwise logical operations. The immediate value must be a valid bit pattern. ccRef can only be SOME if logOp is LogAnd. *) | LogicalImmediate of { source: 'genReg, dest: 'optGenReg, ccRef: ccRef option, immed: Word64.word, logOp: logicalOp, length: opSize } (* Register logical operations.
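These map onto the AND, ORR and EOR instructions; only AND has a flag-setting form (ANDS, with its TST alias), which is why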
ccRef can only be SOME if logOp is LogAnd.*) | LogicalRegister of { base: 'genReg, shifted: 'genReg, dest: 'optGenReg, ccRef: ccRef option, logOp: logicalOp, length: opSize, shift: shiftType } (* Shift a word by an amount specified in a register. *) | ShiftRegister of { direction: shiftDirection, dest: 'genReg, source: 'genReg, shift: 'genReg, opSize: opSize } (* The various forms of multiply all take three arguments and the general form is dest = M * N +/- A.. *) | Multiplication of { kind: multKind, dest: 'genReg, sourceA: 'optGenReg, sourceM: 'genReg, sourceN: 'genReg } (* Signed or unsigned division. Sets the result to zero if the divisor is zero. *) | Division of { isSigned: bool, dest: 'genReg, dividend: 'genReg, divisor: 'genReg, opSize: opSize } (* Start of function. Set the register arguments. stackArgs is the list of stack arguments. If the function has a real closure regArgs includes the closure register (X8). The register arguments include the return register (X30). *) | BeginFunction of { regArgs: ('genReg * xReg) list, stackArgs: stackLocn list } (* Call a function. If the code address is a constant it is passed here. Otherwise the address is obtained by indirecting through X8 which has been loaded as one of the argument registers. The result is stored in the destination register. The "containers" argument is used to ensure that any container whose address is passed as one of the other arguments continues to be referenced until the function is called since there's a possibility that it isn't actually used after the function. *) | FunctionCall of { callKind: callKind, regArgs: ('genReg fnarg * xReg) list, stackArgs: 'genReg fnarg list, dest: 'genReg, saveRegs: 'genReg list, containers: stackLocn list} (* Jump to a tail-recursive function. This is similar to FunctionCall but complicated for stack arguments because the stack and the return address need to be overwritten. stackAdjust is the number of words to remove (positive) or add (negative) to the stack before the call. currStackSize contains the number of items currently on the stack. *) | TailRecursiveCall of { callKind: callKind, regArgs: ('genReg fnarg * xReg) list, stackArgs: {src: 'genReg fnarg, stack: int} list, stackAdjust: int, currStackSize: int } (* Return from the function. resultReg is the preg that contains the result, returnReg is the preg that contains the return address. *) | ReturnResultFromFunction of { resultReg: 'genReg, returnReg: 'genReg, numStackArgs: int } (* Raise an exception. The packet is always loaded into X0. *) | RaiseExceptionPacket of { packetReg: 'genReg } (* Push a register to the stack. This is used both for a normal push, copies=1, and also to reserve a container. *) | PushToStack of { source: 'genReg, copies: int, container: stackLocn } (* Load a register from the stack. The container is the stack location identifier, the field is an offset in a container. *) | LoadStack of { dest: 'genReg, wordOffset: int, container: stackLocn, field: int } (* Store a value into the stack. *) | StoreToStack of { source: 'genReg, container: stackLocn, field: int, stackOffset: int } (* Set the register to the address of the container i.e. a specific offset on the stack. *) | ContainerAddress of { dest: 'genReg, container: stackLocn, stackOffset: int } (* Remove items from the stack. Used to remove containers or registers pushed to the stack.. *) | ResetStackPtr of { numWords: int } (* Tag a value by shifting and setting the tag bit. 
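For an integer n the tagged form is 2*n + 1: the value is shifted left one place and the bottom (tag) bit is set, so tagged values are always odd while addresses, being word-aligned, are always even. As a model (hypothetical helper names, for illustration only): fun tag n = n * 2 + 1 and untag v = v div 2, so tag 21 = 43 and untag 43 = 21.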
*) | TagValue of { source: 'genReg, dest: 'genReg, isSigned: bool, opSize: opSize } (* Shift a value to remove the tag bit. The cache is used if this is untagging a value that has previously been tagged. *) | UntagValue of { source: 'genReg, dest: 'genReg, isSigned: bool, opSize: opSize } (* Box a largeword value. Stores a value into a byte area. This can be implemented using AllocateMemoryFixed but keeping it separate makes optimisation easier. The result is always an address and needs to be converted to an object index on 32-in-64. *) | BoxLarge of { source: 'genReg, dest: 'genReg, saveRegs: 'genReg list } (* Load a value from a box. This can be implemented using a load but is kept separate to simplify optimisation. The source is always an absolute address. *) | UnboxLarge of { source: 'genReg, dest: 'genReg } (* Convert a floating point value into a value suitable for storing in the heap. This normally involves boxing except that 32-bit floats can be tagged in native 64-bits. *) | BoxTagFloat of { floatSize: floatSize, source: 'fpReg, dest: 'genReg, saveRegs: 'genReg list } (* The reverse of BoxTagFloat. *) | UnboxTagFloat of { floatSize: floatSize, source: 'genReg, dest: 'fpReg } (* Load a value with acquire semantics. This means that any other load in this thread after this sees the value of the shared memory at this point and not earlier. This is used for references and arrays to ensure that if another thread has built a data structure on the heap and then assigns the address to a shared ref this thread will see the updated heap and not any locally cached previous version. *) | LoadAcquire of { base: 'genReg, dest: 'genReg, loadType: loadType } (* Store a value with release semantics. This ensures that any other write completes before this operation and works with LoadAcquire. *) | StoreRelease of { base: 'genReg, source: 'genReg, loadType: loadType } (* This is a generalised constant shift which includes selection of a range of bits. *) | BitFieldShift of { source: 'genReg, dest: 'genReg, isSigned: bool, length: opSize, immr: word, imms: word } (* Copy a range of bits and insert it into another register. This is the only case where a register functions both as a source and a destination. *) | BitFieldInsert of { source: 'genReg, destAsSource: 'genReg, dest: 'genReg, length: opSize, immr: word, imms: word } (* Indexed case. *) | IndexedCaseOperation of { testReg: 'genReg } (* Exception handling. - Set up an exception handler. *) | PushExceptionHandler (* End of a handled section. Restore the previous handler. *) | PopExceptionHandler (* Marks the start of a handler. This sets the stack pointer and restores the old handler. Sets the exception packet register. *) | BeginHandler of { packetReg: 'genReg } (* Compare two vectors of bytes and set the condition code on the result. The registers are modified by the instruction. *) | CompareByteVectors of { vec1Addr: 'genReg, vec2Addr: 'genReg, length: 'genReg, ccRef: ccRef } (* Move a block of bytes (isByteMove true) or words (isByteMove false). The length is the number of items (bytes or words) to move. The registers are modified by the instruction. *) | BlockMove of { srcAddr: 'genReg, destAddr: 'genReg, length: 'genReg, isByteMove: bool } (* Add or subtract to the system stack pointer and optionally return the new value. This is used to allocate and deallocate C space. *) | AddSubXSP of { source: 'genReg, dest: 'optGenReg, isAdd: bool } (* Ensures the value will actually be referenced although it doesn't generate any code. 
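Typically the source is something like a container address whose last visible use is before a call that can still read it; the artificial reference here keeps the register allocator from treating it as dead too early (compare the "containers" field of FunctionCall above).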
*) | TouchValue of { source: 'genReg } (* Load a value at the address and get exclusive access. Always loads a 64-bit value. *) | LoadAcquireExclusive of { base: 'genReg, dest: 'genReg } (* Store a value into an address releasing the lock. Sets the result to 0 if it succeeds or to 1 if it fails. *) | StoreReleaseExclusive of { base: 'genReg, source: 'optGenReg, result: 'genReg } (* Insert a memory barrier. dmb ish. *) | MemoryBarrier (* Convert an integer to a floating point value. *) | ConvertIntToFloat of { source: 'genReg, dest: 'fpReg, srcSize: opSize, destSize: floatSize } (* Convert a floating point value to an integer using the specified rounding mode. We could get an overflow here but fortunately the ARM generates a value that will cause an overflow when we tag it, provided we tag it explicitly. *) | ConvertFloatToInt of { source: 'fpReg, dest: 'genReg, srcSize: floatSize, destSize: opSize, rounding: IEEEReal.rounding_mode } (* Unary floating point. This includes conversions between float and double. *) | UnaryFloatingPt of { source: 'fpReg, dest: 'fpReg, fpOp: fpUnary } (* Binary floating point: addition, subtraction, multiplication and division. *) | BinaryFloatingPoint of { arg1: 'fpReg, arg2: 'fpReg, dest: 'fpReg, fpOp: fpBinary, opSize: floatSize } (* Floating point comparison. *) | CompareFloatingPoint of { arg1: 'fpReg, arg2: 'fpReg, ccRef: ccRef, opSize: floatSize } (* Debugging - fault if values don't match. *) | CacheCheck of { arg1: 'genReg, arg2: 'genReg } (* Destinations at the end of a basic block. *) and controlFlow = (* Unconditional branch to a label - should be a merge point. *) Unconditional of int (* Conditional branch. Jumps to trueJump if the condition is true, falseJump if false. *) | Conditional of { ccRef: ccRef, condition: condition, trueJump: int, falseJump: int } (* Exit - the last instruction of the block is a return, raise or tail call. *) | ExitCode (* Indexed case - this branches to one of a number of labels. *) | IndexedBr of int list (* Set up a handler. This doesn't cause an immediate branch but the state at the start of the handler is the state at this point. *) | SetHandler of { handler: int, continue: int } (* Unconditional branch to a handler. Used if an exception is raised explicitly within the scope of a handler. *) | UnconditionalHandle of int (* Conditional branch to a handler. Occurs if there is a call to a function within the scope of a handler. It may jump to the handler. *) | ConditionalHandle of { handler: int, continue: int } and ('genReg, 'optGenReg, 'fpReg) basicBlock = BasicBlock of { block: ('genReg, 'optGenReg, 'fpReg) arm64ICode list, flow: controlFlow } type iCodeAbstract = (preg, pregOrZero, preg) arm64ICode and basicBlockAbstract = (preg, pregOrZero, preg) basicBlock and iCodeConcrete = (xReg, xReg, vReg) arm64ICode and basicBlockConcrete = (xReg, xReg, vReg) basicBlock (* Return the list of blocks that are the immediate successors of this. *) fun successorBlocks(Unconditional l) = [l] | successorBlocks(Conditional{trueJump, falseJump, ...}) = [trueJump, falseJump] | successorBlocks ExitCode = [] | successorBlocks(IndexedBr cases) = cases | successorBlocks(SetHandler{handler, continue, ...}) = [handler, continue] (* We only need "handler" in SetHandler because we may have a handler that is never actually jumped to.
*) | successorBlocks(UnconditionalHandle handler) = [handler] | successorBlocks(ConditionalHandle{handler, continue, ...}) = [handler, continue] local fun printCC(CcRef ccRef, stream) = stream ("CC" ^ Int.toString ccRef) fun printStackLoc(StackLoc{size, rno}, stream) = (stream "S"; stream(Int.toString rno); stream "("; stream(Int.toString size); stream ")") fun regRepr(XReg w) = "X" ^ Int.toString(Word8.toInt w) | regRepr XZero = "XZ" | regRepr XSP = "SP" fun arithRepr OpSize64 = "64" | arithRepr OpSize32 = "32" fun printLoadType(Load64, stream) = stream "64" | printLoadType(Load32, stream) = stream "32" | printLoadType(Load16, stream) = stream "16" | printLoadType(Load8, stream) = stream "8" fun printSaves([], _, _) = () | printSaves([areg], _, printReg) = printReg areg | printSaves(areg::more, stream, printReg) = (printReg areg; stream ","; printSaves(more, stream, printReg)) fun printArg(ArgInReg reg, _, printReg) = printReg reg | printArg(ArgOnStack{wordOffset, container, field, ...}, stream, _) = ( printStackLoc(container, stream); stream " + "; stream(Int.toString field); stream " ("; stream(Int.toString wordOffset); stream ")" ) fun printShift(ShiftLSL w, stream) = stream(" LSL " ^ Word8.toString w) | printShift(ShiftLSR w, stream) = stream(" LSR " ^ Word8.toString w) | printShift(ShiftASR w, stream) = stream(" ASR " ^ Word8.toString w) | printShift(ShiftNone, _) = () fun printFloatSize(Float32, stream) = stream "F" | printFloatSize(Double64, stream) = stream "D" fun printICode (stream, printGenReg:'a -> unit, _: 'b->unit, _: 'c->unit) (MoveRegister{ source, dest }: ('a, 'b, 'c) arm64ICode) = ( stream "\tMove\t"; printGenReg source; stream " => "; printGenReg dest ) | printICode (stream, printGenReg, _, _) (LoadNonAddressConstant{ source, dest }) = ( stream "\tLoadNonAddress\t"; stream(Word64.toString source); stream " => "; printGenReg dest ) | printICode (stream, printGenReg, _, _) (LoadAddressConstant{ source, dest }) = ( stream "\tLoadAddress\t"; stream(Address.stringOfWord source); stream " => "; printGenReg dest ) | printICode (stream, printGenReg, _, _) (LoadWithConstantOffset{ base, dest, byteOffset, loadType }) = ( stream "\tLoadConstOffset"; printLoadType(loadType, stream); stream "\t["; printGenReg base; stream "]+"; stream(Int.toString byteOffset); stream " => "; printGenReg dest ) | printICode (stream, printGenReg, _, printFPReg) (LoadFPWithConstantOffset{ base, dest, byteOffset, floatSize }) = ( stream "\tLoadConstOffset"; printFloatSize(floatSize, stream); stream "\t["; printGenReg base; stream "]+"; stream(Int.toString byteOffset); stream " => "; printFPReg dest ) | printICode (stream, printGenReg, _, _) (LoadWithIndexedOffset{ base, dest, index, loadType }) = ( stream "\tLoadIndexed"; printLoadType(loadType, stream); stream "\t["; printGenReg base; stream "+"; printGenReg index; stream "] => "; printGenReg dest ) | printICode (stream, printGenReg, _, printFPReg) (LoadFPWithIndexedOffset{ base, dest, index, floatSize }) = ( stream "\tLoadIndexed"; printFloatSize(floatSize, stream); stream "\t["; printGenReg base; stream "+"; printGenReg index; stream "] => "; printFPReg dest ) | printICode (stream, printGenReg, _, _) (GetThreadId { dest}) = ( stream "\tGetThreadId\t"; stream " => "; printGenReg dest ) | printICode (stream, printGenReg, _, _) (ObjectIndexAddressToAbsolute{ source, dest }) = ( stream "\tObjectAddrToAbs\t"; printGenReg source; stream " => "; printGenReg dest ) | printICode (stream, printGenReg, _, _) (AbsoluteToObjectIndex{ source, dest }) = ( stream 
"\tAbsToObjectAddr\t"; printGenReg source; stream " => "; printGenReg dest ) | printICode (stream, printGenReg, _, _) (AllocateMemoryFixed{bytesRequired, dest, saveRegs}) = ( stream "\tAllocateMemory\t"; stream(Word64.fmt StringCvt.DEC bytesRequired); stream " => "; printGenReg dest; stream " save="; printSaves(saveRegs, stream, printGenReg) ) | printICode (stream, printGenReg, _, _) (AllocateMemoryVariable{size, dest, saveRegs}) = ( stream "\tAllocateMemory\t"; stream "s="; printGenReg(size); stream " => "; printGenReg dest; stream " save="; printSaves(saveRegs, stream, printGenReg) ) | printICode (stream, printGenReg, _, _) (InitialiseMem{size, addr, init}) = ( stream "\tInitialiseMem\t"; stream "s="; printGenReg(size); stream ",i="; printGenReg(init); stream ",a="; printGenReg(addr) ) | printICode (stream, _, _, _) BeginLoop = stream "\tBeginLoop" | printICode (stream, printGenReg, _, _) (JumpLoop{regArgs, stackArgs, checkInterrupt, ... }) = ( stream "\tJumpLoop\t"; List.app(fn {src, dst} => (printGenReg(dst); stream "="; printArg(src, stream, printGenReg); stream " ")) regArgs; List.app( fn {src, wordOffset, stackloc} => (printStackLoc(stackloc, stream); stream("(sp" ^ Int.toString wordOffset); stream ")="; printArg(src, stream, printGenReg); stream " ") ) stackArgs; case checkInterrupt of NONE => () | SOME saveRegs => (stream " Check:save="; printSaves(saveRegs, stream, printGenReg)) ) | printICode (stream, printGenReg, _, _) (StoreWithConstantOffset{ base, source, byteOffset, loadType }) = ( stream "\tStoreConstOffset"; printLoadType(loadType, stream); stream "\t"; printGenReg source; stream " => ["; printGenReg base; stream "+"; stream(Int.toString byteOffset); stream "]" ) | printICode (stream, printGenReg, _, printFPReg) (StoreFPWithConstantOffset{ base, source, byteOffset, floatSize }) = ( stream "\tStoreConstOffset"; printFloatSize(floatSize, stream); stream "\t"; printFPReg source; stream " => ["; printGenReg base; stream "+"; stream(Int.toString byteOffset); stream "]" ) | printICode (stream, printGenReg, _, _) (StoreWithIndexedOffset{ base, source, index, loadType }) = ( stream "\tStoreIndexed"; printLoadType(loadType, stream); stream "\t"; printGenReg source; stream " => ["; printGenReg base; stream "+"; printGenReg index; stream "]" ) | printICode (stream, printGenReg, _, printFPReg) (StoreFPWithIndexedOffset{ base, source, index, floatSize }) = ( stream "\tStoreIndexed"; printFloatSize(floatSize, stream); stream "\t"; printFPReg source; stream " => ["; printGenReg base; stream "+"; printGenReg index; stream "]" ) | printICode (stream, printGenReg, printOptGenReg, _) (AddSubImmediate{ source, dest, ccRef, immed, isAdd, length }) = ( stream (if isAdd then "\tAddImmediate" else "\tSubImmediate"); stream(arithRepr length); stream "\t"; printGenReg source; stream ",0x"; stream(Word.toString immed); stream " => "; printOptGenReg dest; case ccRef of NONE => () | SOME cc => (stream ", "; printCC(cc, stream)) ) | printICode (stream, printGenReg, printOptGenReg, _) (AddSubRegister{ base, shifted, dest, ccRef, isAdd, length, shift }) = ( stream (if isAdd then "\tAddRegister" else "\tSubRegister"); stream(arithRepr length); stream "\t"; printGenReg base; stream ", "; printGenReg(shifted); printShift(shift, stream); stream " => "; printOptGenReg dest; case ccRef of NONE => () | SOME cc => (stream ", "; printCC(cc, stream)) ) | printICode (stream, printGenReg, printOptGenReg, _) (LogicalImmediate{ source, dest, ccRef, immed, logOp, length }) = ( stream (case logOp of LogAnd => 
"\tAndImmediate" | LogOr => "\tOrImmediate" | LogXor => "\tXorImmediate"); stream(arithRepr length); stream "\t"; printGenReg source; stream ",0x"; stream(Word64.toString immed); stream " => "; printOptGenReg dest; case ccRef of NONE => () | SOME cc => (stream ", "; printCC(cc, stream)) ) | printICode (stream, printGenReg, printOptGenReg, _) (LogicalRegister{ base, shifted, dest, ccRef, logOp, length, shift }) = ( stream (case logOp of LogAnd => "\tAndRegister" | LogOr => "\tOrRegister" | LogXor => "\tXorRegister"); stream(arithRepr length); stream "\t"; printGenReg base; stream ", "; printGenReg(shifted); printShift(shift, stream); stream " => "; printOptGenReg dest; case ccRef of NONE => () | SOME cc => (stream ", "; printCC(cc, stream)) ) | printICode (stream, printGenReg, _, _) (ShiftRegister{ direction, dest, source, shift, opSize }) = ( stream ( case direction of ShiftLeft => "\tShiftLeft" | ShiftRightLogical => "\tShiftRightLog" | ShiftRightArithmetic => "\tShiftRightArith"); stream(arithRepr opSize); stream "\t"; printGenReg source; stream " by "; printGenReg(shift); stream " => "; printGenReg dest ) | printICode (stream, printGenReg, printOptGenReg, _) (Multiplication{ kind, dest, sourceA, sourceM, sourceN }) = ( stream ( case kind of MultAdd32 => "\tMultAdd32\t" | MultSub32 => "\tMultSub32\t" | MultAdd64 => "\tMultAdd64\t" | MultSub64 => "\tMultSub64\t" | SignedMultAddLong => "\tSignedMultAddLong\t" | SignedMultHigh => "\tSignedMultHigh\t"); printGenReg(sourceM); stream " * "; printGenReg(sourceN); stream " +/- "; printOptGenReg sourceA; stream " => "; printGenReg dest ) | printICode (stream, printGenReg, _, _) (Division{ isSigned, dest, dividend, divisor, opSize }) = ( stream (if isSigned then "\tSignedDivide" else "\tUnsignedDivide"); stream(arithRepr opSize); stream "\t"; printGenReg(dividend); stream " by "; printGenReg(divisor); stream " => "; printGenReg dest ) | printICode (stream, printGenReg, _, _) (BeginFunction{ regArgs, stackArgs }) = ( stream "\tBeginFunction\t"; List.app(fn (arg, r) => (stream(regRepr r); stream "="; printGenReg(arg); stream " ")) regArgs; List.app(fn s => printStackLoc(s, stream)) stackArgs ) | printICode (stream, printGenReg, _, _) (FunctionCall{callKind, regArgs, stackArgs, dest, saveRegs, containers}) = ( stream "\tFunctionCall\t"; case callKind of Recursive => stream "recursive " | ConstantCode m => (stream(stringOfWord m); stream " ") | FullCall => (); List.app(fn (arg, r) => (stream(regRepr r); stream "="; printArg(arg, stream, printGenReg); stream " ")) regArgs; List.app(fn arg => (stream "p="; printArg(arg, stream, printGenReg); stream " ")) stackArgs; stream "=> "; printGenReg dest; stream " save="; printSaves(saveRegs, stream, printGenReg); if null containers then () else (stream " containers="; List.app (fn c => (printStackLoc(c, stream); stream " ")) containers) ) | printICode (stream, printGenReg, _, _) (TailRecursiveCall{callKind, regArgs, stackArgs, stackAdjust, currStackSize, ...}) = ( stream "\tTailCall\t"; case callKind of Recursive => stream "recursive " | ConstantCode m => (stream(stringOfWord m); stream " ") | FullCall => (); List.app(fn (arg, r) => (stream(regRepr r); stream "="; printArg(arg, stream, printGenReg); stream " ")) regArgs; List.app(fn {src, stack} => (stream (Int.toString stack); stream "<="; printArg(src, stream, printGenReg); stream " ")) stackArgs; stream "adjust="; stream(Int.toString stackAdjust); stream " stackSize="; stream(Int.toString currStackSize) ) | printICode (stream, printGenReg, _, _) 
(ReturnResultFromFunction{ resultReg, returnReg, numStackArgs }) = ( stream "\tReturnFromFunction\t"; printGenReg(returnReg); stream " with "; printGenReg(resultReg); stream("," ^ Int.toString numStackArgs) ) | printICode (stream, printGenReg, _, _) (RaiseExceptionPacket{ packetReg }) = ( stream "\tRaiseException\t"; printGenReg(packetReg) ) | printICode (stream, printGenReg, _, _) (PushToStack{ source, copies, container }) = ( stream "\tPushToStack\t"; printGenReg source; if copies > 1 then (stream " * "; stream(Int.toString copies)) else (); stream " => "; printStackLoc(container, stream) ) | printICode (stream, printGenReg, _, _) (LoadStack{ dest, wordOffset, container, field }) = ( stream "\tLoadStack\t"; printStackLoc(container, stream); stream " + "; stream(Int.toString field); stream " ("; stream(Int.toString wordOffset); stream ")"; stream " => "; printGenReg dest ) | printICode (stream, printGenReg, _, _) (StoreToStack{ source, container, field, stackOffset }) = ( stream "\tStoreToStack\t"; printGenReg source; stream " => "; printStackLoc(container, stream); stream "+"; stream (Int.toString field); stream "("; stream(Int.toString stackOffset); stream ")" ) | printICode (stream, printGenReg, _, _) (ContainerAddress{ dest, container, stackOffset }) = ( stream "\tContainerAddress\t"; stream "@"; printStackLoc(container, stream); stream " ("; stream(Int.toString stackOffset); stream ") => "; printGenReg dest ) | printICode (stream, _, _, _) (ResetStackPtr{ numWords }) = ( stream "\tResetStackPtr\t"; stream(Int.toString numWords) ) | printICode (stream, printGenReg, _, _) (TagValue{ source, dest, isSigned, opSize }) = ( stream "\tTag"; stream(if isSigned then "Signed" else "Unsigned"); stream(arithRepr opSize); stream "\t"; printGenReg source; stream " => "; printGenReg dest ) | printICode (stream, printGenReg, _, _) (UntagValue{ source, dest, isSigned, opSize }) = ( stream "\tUntag"; stream(if isSigned then "Signed" else "Unsigned"); stream(arithRepr opSize); stream "\t"; printGenReg source; stream " => "; printGenReg dest ) | printICode (stream, printGenReg, _, _) (BoxLarge{source, dest, saveRegs}) = ( stream "\tBoxLarge\t"; printGenReg source; stream " => "; printGenReg dest; stream " save="; printSaves(saveRegs, stream, printGenReg) ) | printICode (stream, printGenReg, _, _) (UnboxLarge{source, dest}) = ( stream "\tUnboxLarge\t"; printGenReg source; stream " => "; printGenReg dest ) | printICode (stream, printGenReg, _, printFPReg) (BoxTagFloat{floatSize, source, dest, saveRegs}) = ( stream "\tBoxTagFloat"; printFloatSize(floatSize, stream); stream "\t"; printFPReg source; stream " => "; printGenReg dest; stream " save="; printSaves(saveRegs, stream, printGenReg) ) | printICode (stream, printGenReg, _, printFPReg) (UnboxTagFloat{floatSize, source, dest}) = ( stream "\tUnboxTagFloat"; printFloatSize(floatSize, stream); stream "\t"; printGenReg source; stream " => "; printFPReg dest ) | printICode (stream, printGenReg, _, _) (LoadAcquire{ base, dest, loadType }) = ( stream "\tLoadAcquire"; printLoadType(loadType, stream); stream "\t["; printGenReg base; stream "] => "; printGenReg dest ) | printICode (stream, printGenReg, _, _) (StoreRelease{ base, source, loadType }) = ( stream "\tStoreRelease"; printLoadType(loadType, stream); stream "\t"; printGenReg source; stream " => ["; printGenReg base; stream "]" ) | printICode (stream, printGenReg, _, _) (BitFieldShift{ source, dest, isSigned, length, immr, imms }) = ( stream "\tBitShift"; stream(if isSigned then "Signed" else "Unsigned"); 
stream(arithRepr length); stream "\t"; printGenReg source; stream " => "; printGenReg dest; stream " immr="; stream(Word.fmt StringCvt.DEC immr); stream " imms="; stream(Word.fmt StringCvt.DEC imms) ) | printICode (stream, printGenReg, _, _) (BitFieldInsert{ source, dest, destAsSource, length, immr, imms }) = ( stream "\tBitInsert"; stream(arithRepr length); stream "\t"; printGenReg source; stream " with "; printGenReg destAsSource; stream " => "; printGenReg dest; stream " immr="; stream(Word.fmt StringCvt.DEC immr); stream " imms="; stream(Word.fmt StringCvt.DEC imms) ) | printICode (stream, printGenReg, _, _) (IndexedCaseOperation{testReg}) = ( stream "\tIndexedCase\t"; printGenReg testReg ) | printICode (stream, _, _, _) PushExceptionHandler = stream "\tPushExcHandler" | printICode (stream, _, _, _) PopExceptionHandler = stream "\tPopExcHandler" | printICode (stream, printGenReg, _, _) (BeginHandler{packetReg}) = ( stream "\tBeginHandler\t"; printGenReg packetReg ) | printICode (stream, printGenReg, _, _) (CompareByteVectors{vec1Addr, vec2Addr, length, ccRef, ...}) = ( stream "\tCompareByteVectors\t"; printGenReg(vec1Addr); stream ","; printGenReg(vec2Addr); stream ","; printGenReg(length); stream " => "; printCC(ccRef, stream) ) | printICode (stream, printGenReg, _, _) (BlockMove{srcAddr, destAddr, length, isByteMove}) = ( stream(if isByteMove then "\tBlockByteMove\t" else "\tBlockWordMove\t"); stream "src="; printGenReg(srcAddr); stream ",dest="; printGenReg(destAddr); stream ",len="; printGenReg(length) ) | printICode (stream, printGenReg, printOptGenReg, _) (AddSubXSP{ source, dest, isAdd }) = ( stream(if isAdd then "\tAdd\t" else "\tSubtract\t"); printGenReg source; stream " XSP => "; printOptGenReg dest ) | printICode (stream, printGenReg, _, _) (TouchValue{ source }) = ( stream "\tTouchValue\t"; printGenReg source ) | printICode (stream, printGenReg, _, _) (LoadAcquireExclusive{ base, dest }) = ( stream "\tLoadExclusive\t["; printGenReg base; stream "] => "; printGenReg dest ) | printICode (stream, printGenReg, printOptGenReg, _) (StoreReleaseExclusive{ base, source, result }) = ( stream "\tStoreExclusive\t"; printOptGenReg source; stream " => ["; printGenReg base; stream "] result => "; printGenReg result ) | printICode (stream, _, _, _) MemoryBarrier = stream "\tMemoryBarrier" | printICode (stream, printGenReg, _, printFPReg) (ConvertIntToFloat{ source, dest, srcSize, destSize}) = ( stream "\tConvert"; stream(arithRepr srcSize); stream "To"; printFloatSize(destSize, stream); stream "\t"; printGenReg source; stream " => "; printFPReg dest ) | printICode (stream, printGenReg, _, printFPReg) (ConvertFloatToInt{ source, dest, srcSize, destSize, rounding}) = let open IEEEReal in stream "\tConvert"; printFloatSize(srcSize, stream); stream "To"; stream(arithRepr destSize); stream "\t"; printFPReg source; stream " => "; printGenReg dest; stream( case rounding of TO_NEAREST => " rounding" | TO_NEGINF => " rounding down" | TO_POSINF => " rounding up" | TO_ZERO => " truncating" ) end | printICode (stream, _, _, printFPReg) (UnaryFloatingPt{ source, dest, fpOp}) = ( stream( case fpOp of NegFloat => "\tNegFloat\t" | NegDouble => "\tNegDouble\t" | AbsFloat => "\tAbsFloat\t" | AbsDouble => "\tAbsDouble\t" | ConvFloatToDble => "\tFloatToDble\t" | ConvDbleToFloat => "\tDbleToFloat\t" ); printFPReg source; stream " => "; printFPReg dest ) | printICode (stream, _, _, printFPReg) (BinaryFloatingPoint{ arg1, arg2, dest, fpOp, opSize}) = ( stream( case fpOp of MultiplyFP => "\tMultiply" | DivideFP =>
"\tDivide" | AddFP => "\tAdd" | SubtractFP => "\tSubtract" ); printFloatSize(opSize, stream); stream "\t"; printFPReg arg1; stream ", "; printFPReg arg2; stream " => "; printFPReg dest ) | printICode (stream, _, _, printFPReg) (CompareFloatingPoint{ arg1, arg2, opSize, ccRef}) = ( stream "\tCompare"; printFloatSize(opSize, stream); stream "\t"; printFPReg arg1; stream ", "; printFPReg arg2; stream ", "; printCC(ccRef, stream) ) + | printICode (stream, printGenReg, _, _) (CacheCheck{ arg1, arg2}) = + ( stream "\tCacheCheck\t"; printGenReg arg1; stream ", "; printGenReg arg2 ) + and printCondition(cond, stream) = stream(condToString cond) (* Print a basic block. *) fun printBlock (stream, printGenReg, printOptGenReg, printFPReg) (blockNo, BasicBlock{block, flow, ...}) = ( (* Put a label on all but the first. *) if blockNo <> 0 then stream("L" ^ Int.toString blockNo ^ ":") else (); List.app (fn icode => (printICode (stream, printGenReg, printOptGenReg, printFPReg) (icode); stream "\n")) block; case flow of Unconditional l => stream("\tJump\tL" ^ Int.toString l ^ "\n") | Conditional {condition, trueJump, falseJump, ccRef, ...} => ( stream "\tJump"; printCondition(condition, stream); stream "\t"; printCC(ccRef, stream); stream " L"; stream (Int.toString trueJump); stream " else L"; stream (Int.toString falseJump); stream "\n" ) | ExitCode => () | IndexedBr _ => () | SetHandler{handler, continue} => stream(concat["\tSetHandler\tH", Int.toString handler, "\n", "\tJump\tL", Int.toString continue, "\n"]) | UnconditionalHandle handler => stream("\tJump\tH" ^ Int.toString handler ^ "\n") | ConditionalHandle{handler, continue} => stream(concat["\tJump\tL", Int.toString continue, " or H", Int.toString handler, "\n"]) ) in fun printPReg stream (PReg i) = stream("R" ^ Int.toString i) fun printOptPReg stream ZeroReg = stream "Zero" | printOptPReg stream (SomeReg reg) = printPReg stream reg fun printXReg stream (XReg w) = stream("X" ^ Int.toString(Word8.toInt w)) | printXReg stream XZero = stream "XZ" | printXReg stream XSP = stream "XSP" fun printVReg stream (VReg w) = stream("V" ^ Int.toString(Word8.toInt w)) fun printICodeAbstract(blockVec, stream) = Vector.appi(printBlock(stream, printPReg stream, printOptPReg stream, printPReg stream)) blockVec and printICodeConcrete(blockVec, stream) = Vector.appi(printBlock(stream, printXReg stream, printXReg stream, printVReg stream)) blockVec end (* Only certain bit patterns are allowed in a logical immediate instruction but the encoding is complex so it's easiest to inherit the test from the assembler layer. *) local fun optow OpSize32 = WordSize32 | optow OpSize64 = WordSize64 in fun isEncodableBitPattern(v, w) = Arm64Code.isEncodableBitPattern(v, optow w) end (* This generates a BitField instruction with the appropriate values for immr and imms. 
*) fun shiftConstant{ direction, dest, source, shift, opSize } = let val (isSigned, immr, imms) = case (direction, opSize) of (ShiftLeft, OpSize64) => (false, Word.~ shift mod 0w64, 0w64-0w1-shift) | (ShiftLeft, OpSize32) => (false, Word.~ shift mod 0w32, 0w32-0w1-shift) | (ShiftRightLogical, OpSize64) => (false, shift, 0wx3f) | (ShiftRightLogical, OpSize32) => (false, shift, 0wx1f) | (ShiftRightArithmetic, OpSize64) => (true, shift, 0wx3f) | (ShiftRightArithmetic, OpSize32) => (true, shift, 0wx1f) in BitFieldShift{ source=source, dest=dest, isSigned=isSigned, length=opSize, immr=immr, imms=imms } end structure Sharing = struct type xReg = xReg and vReg = vReg and reg = reg and condition = condition and shiftType = shiftType and ('genReg, 'optGenReg, 'fpReg) arm64ICode = ('genReg, 'optGenReg, 'fpReg) arm64ICode and preg = preg and pregOrZero = pregOrZero and controlFlow = controlFlow and ('genReg, 'optGenReg, 'fpReg) basicBlock = ('genReg, 'optGenReg, 'fpReg) basicBlock and stackLocn = stackLocn and regProperty = regProperty and ccRef = ccRef and 'genReg fnarg = 'genReg fnarg and closureRef = closureRef and loadType = loadType and opSize = opSize and logicalOp = logicalOp and callKind = callKind and floatSize = floatSize and shiftDirection = shiftDirection and multKind = multKind and fpUnary = fpUnary and fpBinary = fpBinary end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64IdentifyReferences.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64IdentifyReferences.ML index 0ccc3b38..f0f511a6 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64IdentifyReferences.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64IdentifyReferences.ML @@ -1,875 +1,877 @@ (* - Copyright (c) 2021 David C.J. Matthews + Copyright (c) 2021-2 David C.J. Matthews This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64IdentifyReferences( structure Arm64ICode: ARM64ICODE structure Debug: DEBUG structure IntSet: INTSET ): ARM64IDENTIFYREFERENCES = struct open Arm64ICode open IntSet type regState = { active: int, refs: int, pushState: bool, prop: regProperty } (* CC states before and after. Currently no instruction uses the condition; conditional branches are handled at the block level. The result of executing the instruction may be to set the condition code to a defined state, an undefined state or leave it unchanged. *) datatype outCCState = CCSet of ccRef | CCIndeterminate | CCUnchanged and inCCState = CCNeeded of ccRef | CCUnused datatype extendedBasicBlock = ExtendedBasicBlock of { block: {instr: iCodeAbstract, current: intSet, active: intSet, kill: intSet } list, flow: controlFlow, locals: intSet, (* Defined and used entirely within the block. *) imports: intSet, (* Defined outside the block, used inside it, but not needed afterwards. *) exports: intSet, (* Defined within the block, possibly used inside, but used outside. *) passThrough: intSet, (* Active throughout the block. 
May be referred to by it but needed afterwards. *) loopRegs: intSet, (* Destination registers for a loop. They will be updated by this block. *) initialStacks: intSet, (* Stack items required at the start i.e. imports+passThrough for stack items. *) inCCState: ccRef option, (* The state this block assumes. If SOME _ all predecessors must set it. *) outCCState: ccRef option (* The condition code set by this block. SOME _ if at least one successor needs it. *) } exception InternalError = Misc.InternalError (* Return the list of blocks that are the immediate successor of this. *) fun blockSuccessors(BasicBlock{flow, ...}) = successorBlocks flow fun getOptReg(SomeReg reg) = [reg] | getOptReg ZeroReg = [] fun getInstructionState(MoveRegister { source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadNonAddressConstant { dest, ...}) = { sources=[], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadAddressConstant { dest, ...}) = { sources=[], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadWithConstantOffset { base, dest, ...}) = { sources=[base], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadFPWithConstantOffset { base, dest, ...}) = { sources=[base], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadWithIndexedOffset { base, dest, index, ...}) = { sources=[base, index], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadFPWithIndexedOffset { base, dest, index, ...}) = { sources=[base, index], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(GetThreadId { dest, ...}) = { sources=[], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(ObjectIndexAddressToAbsolute { source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(AbsoluteToObjectIndex { source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(AllocateMemoryFixed { dest, ...}) = { sources=[], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(AllocateMemoryVariable{size, dest, ...}) = { sources=[size], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(InitialiseMem{size, addr, init}) = { sources=[size, addr, init], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BeginLoop) = (* This is just a marker. It doesn't actually generate any code. 
*) { sources=[], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(JumpLoop{regArgs, stackArgs, ...}) = let fun getSourceFromRegs({src=ArgInReg reg, ...}, (regArgs, stackArgs)) = (reg :: regArgs, stackArgs) | getSourceFromRegs({src=ArgOnStack { container, ...}, ...}, (regArgs, stackArgs)) = (regArgs, container :: stackArgs) and getSourceFromStack({src=ArgInReg reg, ...}, (regArgs, stackArgs)) = (reg :: regArgs, stackArgs) | getSourceFromStack({src=ArgOnStack { container, ...}, ...}, (regArgs, stackArgs)) = (regArgs, container :: stackArgs) val (regSources, stackSources) = List.foldl getSourceFromRegs (List.foldl getSourceFromStack ([], []) stackArgs) regArgs in { sources=regSources, dests=[], sStacks=stackSources, dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } end | getInstructionState(StoreWithConstantOffset { base, source, ...}) = { sources=[source, base], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(StoreFPWithConstantOffset { base, source, ...}) = { sources=[source, base], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(StoreWithIndexedOffset { base, source, index, ...}) = { sources=[source, base, index], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(StoreFPWithIndexedOffset { base, source, index, ...}) = { sources=[source, base, index], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(AddSubImmediate{ source, dest, ccRef, ... }) = { sources=[source], dests=getOptReg dest, sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=case ccRef of NONE => CCUnchanged | SOME cc => CCSet cc } | getInstructionState(AddSubRegister{ base, shifted, dest, ccRef, ... }) = { sources=[base, shifted], dests=getOptReg dest, sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=case ccRef of NONE => CCUnchanged | SOME cc => CCSet cc } | getInstructionState(LogicalImmediate{ source, dest, ccRef, ... }) = { sources=[source], dests=getOptReg dest, sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=case ccRef of NONE => CCUnchanged | SOME cc => CCSet cc } | getInstructionState(LogicalRegister{ base, shifted, dest, ccRef, ... }) = { sources=[base, shifted], dests=getOptReg dest, sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=case ccRef of NONE => CCUnchanged | SOME cc => CCSet cc } | getInstructionState(ShiftRegister{ source, shift, dest, ... }) = { sources=[source, shift], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(Multiplication{ dest, sourceA, sourceM, sourceN, ... }) = { sources=getOptReg sourceA @ [sourceM, sourceN], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(Division{ dest, dividend, divisor, ... }) = { sources=[dividend, divisor], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BeginFunction {regArgs, stackArgs, ...}) = { sources=[], dests=map #1 regArgs, sStacks=[], dStacks=stackArgs, ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(FunctionCall{regArgs, stackArgs, dest, containers, ...}) = let (* Non-tail-recursive. Behaves as a normal reference to sources. 
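Unlike TailRecursiveCall below, it also defines its destination register, and the containers are added to the stack sources so that they remain live until the call is made.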
*) fun getSourceFromRegs((ArgInReg reg, _), (regArgs, stackArgs)) = (reg :: regArgs, stackArgs) | getSourceFromRegs((ArgOnStack { container, ...}, _), (regArgs, stackArgs)) = (regArgs, container :: stackArgs) and getSourceFromStack(ArgInReg reg, (regArgs, stackArgs)) = (reg :: regArgs, stackArgs) | getSourceFromStack(ArgOnStack { container, ...}, (regArgs, stackArgs)) = (regArgs, container :: stackArgs) val (argSources, stackSources) = List.foldl getSourceFromRegs (List.foldl getSourceFromStack ([], []) stackArgs) regArgs in { sources=argSources, dests=[dest], sStacks=stackSources @ containers, dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } end | getInstructionState(TailRecursiveCall{regArgs, stackArgs, ...}) = let (* Tail recursive call. References the argument sources but exits. *) fun getSourceFromRegs((ArgInReg reg, _), (regArgs, stackArgs)) = (reg :: regArgs, stackArgs) | getSourceFromRegs((ArgOnStack { container, ...}, _), (regArgs, stackArgs)) = (regArgs, container :: stackArgs) and getSourceFromStack({src=ArgInReg reg, ...}, (regArgs, stackArgs)) = (reg :: regArgs, stackArgs) | getSourceFromStack({src=ArgOnStack { container, ...}, ...}, (regArgs, stackArgs)) = (regArgs, container :: stackArgs) val (argSources, stackSources) = List.foldl getSourceFromRegs (List.foldl getSourceFromStack ([], []) stackArgs) regArgs in { sources=argSources, dests=[], sStacks=stackSources, dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } end | getInstructionState(ReturnResultFromFunction{resultReg, returnReg, ...}) = { sources=[resultReg, returnReg], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(RaiseExceptionPacket{packetReg}) = { sources=[packetReg], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(PushToStack{ source, container, ... }) = { sources=[source], dests=[], sStacks=[], dStacks=[container], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadStack{ dest, container, ... }) = { sources=[], dests=[dest], sStacks=[container], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(StoreToStack{ source, container, ... }) = (* Although this stores into the container it must already exist. *) { sources=[source], dests=[], sStacks=[container], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(ContainerAddress{ dest, container, ... 
}) = { sources=[], dests=[dest], sStacks=[container], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(ResetStackPtr _) = { sources=[], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(TagValue{source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(UntagValue{source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BoxLarge{source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(UnboxLarge{source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BoxTagFloat{source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(UnboxTagFloat{source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadAcquire { base, dest, ...}) = { sources=[base], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(StoreRelease { base, source, ...}) = { sources=[source, base], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BitFieldShift{ source, dest, ... }) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BitFieldInsert{ source, destAsSource, dest, ... }) = { sources=[source, destAsSource], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(IndexedCaseOperation{ testReg }) = { sources=[testReg], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(PushExceptionHandler) = { sources=[], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(PopExceptionHandler) = { sources=[], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BeginHandler{packetReg}) = (* The packet register is a destination since this provides its definition. 
*) { sources=[], dests=[packetReg], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(CompareByteVectors{vec1Addr, vec2Addr, length, ccRef, ...}) = { sources=[vec1Addr, vec2Addr, length], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCSet ccRef } | getInstructionState(BlockMove{srcAddr, destAddr, length, ...}) = { sources=[srcAddr, destAddr, length], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(AddSubXSP{source, dest, ...}) = { sources=[source], dests=getOptReg dest, sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(TouchValue{source}) = { sources=[source], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadAcquireExclusive{base, dest}) = { sources=[base], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(StoreReleaseExclusive{base, source, result}) = { sources=[base] @ getOptReg source, dests=[result], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(MemoryBarrier) = { sources=[], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(ConvertIntToFloat{ source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(ConvertFloatToInt{ source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(UnaryFloatingPt{ source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BinaryFloatingPoint{ arg1, arg2, dest, ...}) = { sources=[arg1, arg2], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(CompareFloatingPoint{ arg1, arg2, ccRef, ...}) = { sources=[arg1, arg2], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCSet ccRef } + | getInstructionState(CacheCheck{ arg1, arg2}) = + { sources=[arg1, arg2], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } (* These instructions can be eliminated if their register sources are not used. There may be other cases. *) fun eliminateable(MoveRegister _) = true | eliminateable(LoadNonAddressConstant _) = true | eliminateable(LoadAddressConstant _) = true | eliminateable(LoadWithConstantOffset _) = true | eliminateable(LoadWithIndexedOffset _) = true | eliminateable(ObjectIndexAddressToAbsolute _) = true | eliminateable(TagValue _) = true | eliminateable(UntagValue _) = true | eliminateable(BoxLarge _) = true | eliminateable(UnboxLarge _) = true | eliminateable _ = false fun identifyRegs(blockVector, pregProps): extendedBasicBlock vector * regState vector = let val maxPRegs = Vector.length pregProps val vectorLength = Vector.length blockVector (* Initial arrays - declarationArray is the set of registers given values by the block, importArray is the set of registers referenced by the block and not declared locally. *) val declarationArray = Array.array(vectorLength, emptySet) and importArray = Array.array(vectorLength, emptySet) val stackDecArray = Array.array(vectorLength, emptySet) and stackImportArray = Array.array(vectorLength, emptySet) and localLoopRegArray = Array.array(vectorLength, emptySet) (* References - this is used locally to see if a register is ever actually used and also included in the result which uses it as part of the choice of which register to spill. 
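A destination register whose count is still zero when its defining instruction is reached marks that instruction as removable (see eliminateable below), while a heavily referenced register is a correspondingly poor choice to spill.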
*) val regRefs = Array.array(maxPRegs, 0) (* Registers that must be pushed because they are required after a function call. For cache registers this means "discard". *) and requirePushOrDiscard = Array.array(maxPRegs, false) fun incrRef r = Array.update(regRefs, r, Array.sub(regRefs, r)+1) (* Contains the, possibly filtered, code for each block. *) val resultCode = Array.array(vectorLength, NONE) val ccInStates = Array.array(vectorLength, CCUnused) and ccOutStates = Array.array(vectorLength, CCIndeterminate) (* First pass - for each block build up the sets of registers defined and used in the block. We do this depth-first so that we can use "refs" to see if a register is used. If this is an instruction that can be eliminated we don't need to generate it and can ignore any references it makes. *) local fun blockScan blockNo = if isSome(Array.sub(resultCode, blockNo)) then () else let val () = Array.update(resultCode, blockNo, SOME []) (* Prevent looping. *) val thisBlock as BasicBlock { block, flow, ...} = Vector.sub(blockVector, blockNo) val successors = blockSuccessors thisBlock (* Visit everything reachable first. *) val () = List.app blockScan successors fun scanCode(instr, original as { code, decs, refs, sDecs, sRefs, occIn, occOut, loopRegs, ... }) = let val { sources, dests, sStacks=stackSrcs, dStacks=stackDests, ccIn, ccOut, ... } = getInstructionState instr fun regNo(PReg i) = i and stackNo(StackLoc{rno, ...}) = rno val destRegNos = map regNo dests and sourceRegNos = map regNo sources val stackDestRegNos = map stackNo stackDests and stackSourceRegNos = map stackNo stackSrcs (* If this instruction requires a cc i.e. is SetToCondition or X87FPGetCondition we need to set this as a requirement earlier. If this sets the CC and it is the condition we've been expecting we've satisfied it and can set the previous condition to Unused. We could use this to decide if a comparison is no longer required. That can only happen in very specific circumstances e.g. some tests in Test176.ML so it's not worthwhile. *) val newInCC = case (ccIn, ccOut, occIn) of (cc as CCNeeded _, _, _) => cc (* This instr needs a particular cc. *) | (CCUnused, CCSet _, _) => CCUnused | (CCUnused, _, occIn) => occIn (* If this instruction modifies the CC check to see if it is setting an requirement. *) val _ = case (occIn, ccOut) of (CCNeeded ccRIn, CCSet ccRout) => if ccRIn = ccRout then () else raise InternalError "CCCheck failed" | (CCNeeded _, CCIndeterminate) => raise InternalError "CCCheck failed" | _ => () (* The output CC is the last CC set. Tail instructions that don't change the CC state are ignored until we reach an instruction that sets it. *) val newOutCC = case occOut of CCUnchanged => ccOut | _ => occOut val instrLoopRegs = case instr of JumpLoop{regArgs, ...} => listToSet (map (regNo o #dst) regArgs) | _ => emptySet in if eliminateable instr andalso List.all(fn dReg => Array.sub(regRefs, dReg) = 0) destRegNos then original (* Don't include this instruction. *) else let (* Only mark the sources as referred after we know we're going to need this. In that way we may eliminate the instruction that created this source. 
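Because the block is folded with List.foldr the scan runs backwards over the code, so a dead MoveRegister is discarded before the reference count of its source is inspected; if that source came from, say, an otherwise unused LoadNonAddressConstant, the constant load is then dropped too, removing whole chains of dead instructions in a single pass.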
*) val () = List.app incrRef sourceRegNos in { code = instr :: code, decs = union(listToSet destRegNos, decs), refs = union(listToSet sourceRegNos, refs), sDecs = union(listToSet stackDestRegNos, sDecs), sRefs = union(listToSet stackSourceRegNos, sRefs), occIn = newInCC, occOut = newOutCC, loopRegs = union(loopRegs, instrLoopRegs)} end end (* If we have a conditional branch at the end we need the condition code. It should either be set here or in a preceding block. *) val inCC = case flow of Conditional { ccRef, ...} => CCNeeded ccRef | _ => CCUnused val { code, decs, refs, sDecs, sRefs, occIn, occOut, loopRegs, ... } = List.foldr scanCode {code=[], decs=emptySet, refs=emptySet, sDecs=emptySet, sRefs=emptySet, occIn=inCC, occOut=CCUnchanged, loopRegs=emptySet} block in Array.update(declarationArray, blockNo, decs); (* refs includes local declarations. Remove before adding to the result. *) Array.update(importArray, blockNo, minus(refs, decs)); Array.update(localLoopRegArray, blockNo, loopRegs); Array.update(stackDecArray, blockNo, sDecs); Array.update(stackImportArray, blockNo, minus(sRefs, sDecs)); Array.update(resultCode, blockNo, SOME code); Array.update(ccInStates, blockNo, occIn); Array.update(ccOutStates, blockNo, occOut) end in val () = blockScan 0 (* Start with the root block. *) end (* Second phase - Propagate reference information between the blocks. We need to consider loops here. Do a depth-first scan marking each block. If we find a loop we save the import information we've used. If when we come to process that block we find the import information is different we need to reprocess. *) (* Pass through array - values used in other blocks after this that are not declared in this block. *) val passThroughArray = Array.array(vectorLength, emptySet) val stackPassThroughArray = Array.array(vectorLength, emptySet) (* Exports - those of our declarations that are used in other blocks. *) val exportArray = Array.array(vectorLength, emptySet) val stackExportArray = Array.array(vectorLength, emptySet) (* Loop registers. This contains the registers that are not exported from or passed through this block but are used subsequently as loop registers. *) val loopRegArray = Array.array(vectorLength, emptySet) val () = Array.copy{src=localLoopRegArray, dst=loopRegArray, di=0} (* If any one of the successors requires the CC then this is set. Otherwise we leave it as Unused. *) val ccRequiredOut = Array.array(vectorLength, CCUnused) local datatype loopData = Unprocessed | Processing | Processed | Looped of { regSet: intSet, loopSet: intSet, stackSet: intSet, ccState: inCCState } fun reprocessLoop () = let val reprocess = ref false val loopArray = Array.array(vectorLength, Unprocessed) fun processBlocks blockNo = case Array.sub(loopArray, blockNo) of Processed => (* Already seen this by a different route. *) { regSet = union(Array.sub(passThroughArray, blockNo), Array.sub(importArray, blockNo)), stackSet = union(Array.sub(stackPassThroughArray, blockNo), Array.sub(stackImportArray, blockNo)), ccState = Array.sub(ccInStates, blockNo), loopSet = Array.sub(loopRegArray, blockNo) } | Looped s => s (* We've already seen this in a loop. *) | Processing => (* We have a loop. *) let (* Use the existing input array. 
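(* The input sets recorded here for a loop are only the values accumulated so
   far, i.e. a guess. The check at the end of the Unprocessed case compares
   that guess with the final sets and, if they differ, sets "reprocess" so that
   reprocessLoop runs the whole scan again. This is an ordinary fixpoint
   iteration; schematically, with the arrays as implicit state:
       fun fixpoint step = if step () then fixpoint step else ()
   where step () returns true when some looped block's assumed inputs changed. *)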
*) val inputs = { regSet = union(Array.sub(passThroughArray, blockNo), Array.sub(importArray, blockNo)), stackSet = union(Array.sub(stackPassThroughArray, blockNo), Array.sub(stackImportArray, blockNo)), ccState = Array.sub(ccInStates, blockNo), loopSet = Array.sub(loopRegArray, blockNo) } val () = Array.update(loopArray, blockNo, Looped inputs) in inputs end | Unprocessed => (* Normal case - not visited yet. *) let val () = Array.update(loopArray, blockNo, Processing) val thisBlock = Vector.sub(blockVector, blockNo) val ourDeclarations = Array.sub(declarationArray, blockNo) and ourStackDeclarations = Array.sub(stackDecArray, blockNo) and ourLocalLoopRegs = Array.sub(localLoopRegArray, blockNo) val successors = blockSuccessors thisBlock fun addSuccessor b = let val {regSet=theirImports, stackSet=theirStackImports, ccState=theirInState, loopSet=theirLoops} = processBlocks b (* Remove loop regs from the imports if they are actually given new values by this block. We don't want to pass the old loop regs through here. *) val theirImports = minus(theirImports, ourLocalLoopRegs) (* Split the imports. If a register is a local declaration then it becomes an export. If it is not it becomes part of our passThrough. *) val (addToExp, addToImp) = IntSet.partition (fn i => member(i, ourDeclarations)) theirImports val (addToStackExp, addToStackImp) = IntSet.partition (fn i => member(i, ourStackDeclarations)) theirStackImports (* Merge the input states from each of the successors. *) val () = case (theirInState, Array.sub(ccRequiredOut, blockNo)) of (CCNeeded ts, CCNeeded req) => if ts = req then () else raise InternalError "Mismatched states" | (ts as CCNeeded _, _) => Array.update(ccRequiredOut, blockNo, ts) | _ => () (* Add loop registers to the set if they are not declared here. The only place they are declared is at the entry to the loop so that stops them being propagated further. *) val addToLoops = minus(theirLoops, ourDeclarations) in Array.update(exportArray, blockNo, union(Array.sub(exportArray, blockNo), addToExp)); Array.update(passThroughArray, blockNo, union(Array.sub(passThroughArray, blockNo), addToImp)); Array.update(stackExportArray, blockNo, union(Array.sub(stackExportArray, blockNo), addToStackExp)); Array.update(stackPassThroughArray, blockNo, union(Array.sub(stackPassThroughArray, blockNo), addToStackImp)); Array.update(loopRegArray, blockNo, union(Array.sub(loopRegArray, blockNo), addToLoops)) end val () = List.app addSuccessor successors val ourInputs = union(Array.sub(passThroughArray, blockNo), Array.sub(importArray, blockNo)) val ourStackInputs = union(Array.sub(stackPassThroughArray, blockNo), Array.sub(stackImportArray, blockNo)) in (* Check that we supply the required state. *) case (Array.sub(ccRequiredOut, blockNo), Array.sub(ccOutStates, blockNo)) of (CCNeeded ccReq, CCSet ccSet) => if ccReq = ccSet then () else raise InternalError "Mismatched cc states" | (CCNeeded _, CCIndeterminate) => raise InternalError "Mismatched cc states" | (cc as CCNeeded needOut, CCUnchanged) => ( (* We pass through the state. If we don't use the state then we need to set this as the input. If we do use the state it must be the same. *) case Array.sub(ccInStates, blockNo) of CCUnused => Array.update(ccInStates, blockNo, cc) | CCNeeded needIn => if needOut = needIn then () else raise InternalError "Mismatched cc states" ) | _ => (); (* Was this block used in a loop? If so we should not be requiring a CC. 
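(* The split performed in addSuccessor above can be seen in isolation: with
   ourDeclarations = {1, 2} and a successor importing {2, 3},
       val (toExport, toPassThrough) =
           List.partition (fn r => List.exists (fn d => d = r) [1, 2]) [2, 3]
   gives toExport = [2], since we define it here and it becomes one of our
   exports, and toPassThrough = [3], which is defined further back and merely
   passes through this block. *)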
*) case Array.sub(loopArray, blockNo) of Looped {regSet, stackSet, ...} => ( case Array.sub(ccInStates, blockNo) of CCNeeded _ => raise InternalError "Looped state needs cc" | _ => (); if setToList regSet = setToList ourInputs andalso setToList stackSet = setToList ourStackInputs then () else reprocess := true ) | _ => (); Array.update(loopArray, blockNo, Processed); { regSet = ourInputs, stackSet = ourStackInputs, ccState = Array.sub(ccInStates, blockNo), loopSet=Array.sub(loopRegArray, blockNo)} end in reprocess := false; processBlocks 0; if !reprocess then reprocessLoop () else () end in val () = reprocessLoop () end (* Third pass - Build the result list with the active registers for each instruction. We don't include registers in the passThrough set since they are active throughout the block. *) local (* Number of instrs for which this is active. We use this to try to select a register to push to the stack if we have too many. Registers that have only a short lifetime are less likely to be pushed than those that are active longer. *) val regActive = Array.array(maxPRegs, 0) fun addActivity n r = Array.update(regActive, r, Array.sub(regActive, r)+n) fun createResultInstrs (passThrough, stackPassThrough) (instr, (tail, activeAfterThis, stackActiveAfterThis)) = let val { sources, dests, sStacks=stackSrcs, dStacks=stackDests, ... } = getInstructionState instr in (* Eliminate instructions if their results are not required. The earlier check for this will remove most cases but if we have duplicated a block we may have a register that is required elsewhere but not in this particular branch. *) if not(List.exists(fn PReg d => member(d, activeAfterThis)) dests) andalso eliminateable instr then (tail, activeAfterThis, stackActiveAfterThis) else let fun regNo(PReg i) = i fun stackNo(StackLoc{rno, ...}) = rno val destRegNos = map regNo dests and sourceRegNos = map regNo sources val destSet = listToSet destRegNos (* Remove any sources that are present in passThrough since they are going to be active throughout the block. *) and sourceSet = minus(listToSet sourceRegNos, passThrough) val stackDestRegNos = map stackNo stackDests and stackSourceRegNos = map stackNo stackSrcs val stackDestSet = listToSet stackDestRegNos and stackSourceSet = minus(listToSet stackSourceRegNos, stackPassThrough) (* To compute the active set for the PREVIOUS instruction (we're processing from the end back to the start) we remove any registers that have been given values in this instruction and add anything that we are using in this instruction since they will now need to have values. *) val afterRemoveDests = minus(activeAfterThis, destSet) val stackAfterRemoveDests = minus(stackActiveAfterThis, stackDestSet) val activeForPrevious = union(sourceSet, afterRemoveDests) val stackActiveForPrevious = union(stackSourceSet, stackAfterRemoveDests) (* The "active" set is the set of registers that need to be active DURING the instruction. It includes destinations, which will usually be in "activeAfterThis", because there may be destinations that are not actually used subsequently but still need a register. *) val activeForInstr = case instr of FunctionCall _ => sourceSet (* Is this still needed? *) | TailRecursiveCall _ => (* Set the active set to the total set of registers we require including the work register. This ensures that we will spill as many registers as we require when we look at the size of the active set. *) union(sourceSet, destSet) | BoxLarge _ => (* We can only store the value in the box after the box is allocated. 
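(* The activeForPrevious computation above is the standard backward liveness
   step, union(use, minus(liveOut, def)). As a list-based sketch, ignoring the
   passThrough filtering the real code applies to the sources:
       fun liveBefore (liveAfter, sources, dests) =
           sources @ List.filter (fn r => not (List.exists (fn d => d = r) dests)) liveAfter
   The real code works on intSet values so duplicates and ordering do not arise. *)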
*) union(activeAfterThis, union(sourceSet, destSet)) | BoxTagFloat _ => (* Since the source must be a V register and the destination an X register there isn't actually a problem here, but do this anyway. *) union(activeAfterThis, union(sourceSet, destSet)) | _ => union(activeAfterThis, destSet) val () = List.app(addActivity 1) (setToList activeForInstr) local (* If we are allocating memory we have to save the current registers if they could contain an address. We mustn't push untagged registers and we mustn't push the destination. *) fun getSaveSet dReg = let val activeAfter = union(activeAfterThis, passThrough) (* Remove any registers marked - must-not-push. These are registers holding non-address values. They will actually be saved by the RTS across any GC but not checked or modified by the GC. Exclude the result register. *) fun getSave i = if i = dReg then NONE else case Vector.sub(pregProps, i) of RegPropGeneral => SOME(PReg i) | RegPropCacheTagged => SOME(PReg i) | RegPropUntagged => NONE | RegPropStack _ => NONE | RegPropCacheUntagged => NONE | RegPropMultiple => raise InternalError "getSave: RegPropMultiple" in List.mapPartial getSave (setToList activeAfter) end in (* Sometimes we need to modify the instruction e.g. to include the set of registers to save. *) val convertedInstr = case instr of AllocateMemoryFixed{bytesRequired, dest, saveRegs=_} => AllocateMemoryFixed{bytesRequired=bytesRequired, dest=dest, saveRegs=getSaveSet(regNo dest)} | AllocateMemoryVariable{size, dest, saveRegs=_} => AllocateMemoryVariable{size=size, dest=dest, saveRegs=getSaveSet(regNo dest)} | BoxLarge{source, dest, saveRegs=_} => BoxLarge{source=source, dest=dest, saveRegs=getSaveSet(regNo dest)} | BoxTagFloat{source, dest, floatSize, saveRegs=_} => BoxTagFloat{source=source, dest=dest, floatSize=floatSize, saveRegs=getSaveSet(regNo dest)} | JumpLoop{regArgs, stackArgs, checkInterrupt = SOME _, ...} => let (* If we have to check for interrupts we must preserve registers across the RTS call. *) fun getSave i = case Vector.sub(pregProps, i) of RegPropGeneral => SOME(PReg i) | RegPropCacheTagged => SOME(PReg i) | RegPropUntagged => NONE | RegPropStack _ => NONE | RegPropCacheUntagged => NONE | RegPropMultiple => raise InternalError "getSave: RegPropMultiple" val currentRegs = union(activeAfterThis, passThrough) (* Have to include the loop registers. These were previously included automatically because they were part of the import set. *) val check = List.mapPartial getSave (map (regNo o #dst) regArgs @ setToList currentRegs) in JumpLoop{regArgs=regArgs, stackArgs=stackArgs, checkInterrupt=SOME check} end | FunctionCall{regArgs, stackArgs=[], dest, callKind as ConstantCode m, saveRegs=_, containers} => (* If this is arbitrary precision push the registers rather than marking them as "save". stringOfWord returns 'CODE "PolyAddArbitrary"' etc. *) if (String.isSubstring "Arbitrary\"" (Address.stringOfWord m)) then FunctionCall{regArgs=regArgs, stackArgs=[], callKind=callKind, dest=dest, containers=containers, saveRegs=getSaveSet(regNo dest) } else instr | _ => instr end (* FunctionCall must mark all registers as "push". 
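(* getSaveSet above reduces to something like this two-case sketch, where
   Tagged stands for the kinds that are saved (RegPropGeneral and
   RegPropCacheTagged) and Untagged for those that are not:
       datatype prop = Tagged | Untagged
       fun saveSet (props: prop vector) resultReg active =
           List.mapPartial
               (fn i => if i = resultReg then NONE
                        else case Vector.sub (props, i) of
                                 Tagged => SOME i
                               | Untagged => NONE)
               active
   Only registers that can hold addresses need to be visible to the GC; as
   noted above, untagged values are saved by the RTS across the call but not
   scanned or modified. *)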
*) local fun pushRegisters () = let val activeAfter = union(activeAfterThis, passThrough) fun pushAllButDests i = if List.exists(fn j => i=j) destRegNos then () else case Vector.sub(pregProps, i) of RegPropCacheTagged => raise InternalError "pushRegisters: cache reg" | RegPropCacheUntagged => raise InternalError "pushRegisters: cache reg" | _ => Array.update(requirePushOrDiscard, i, true) in (* We need to push everything active after this except the result register. *) List.app pushAllButDests (setToList activeAfter) end in val () = case instr of FunctionCall{ stackArgs=[], callKind=ConstantCode m, ...} => if (String.isSubstring "Arbitrary\"" (Address.stringOfWord m)) then () else pushRegisters () | FunctionCall _ => pushRegisters () (* It should no longer be necessary to push across a handler but there still seem to be cases that need it. *) (*| BeginHandler _ => pushRegisters ()*) | _ => () end (* Which entries are active in this instruction but not afterwards? *) val kill = union(minus(stackSourceSet, stackActiveAfterThis), minus(sourceSet, activeAfterThis)) in ({instr=convertedInstr, active=activeForInstr, current=activeAfterThis, kill=kill} :: tail, activeForPrevious, stackActiveForPrevious) end end fun createResult blockNo = let val BasicBlock{ flow, ...} = Vector.sub(blockVector, blockNo) val declSet = Array.sub(declarationArray, blockNo) and importSet = Array.sub(importArray, blockNo) and passSet = Array.sub(passThroughArray, blockNo) and loopSet = Array.sub(loopRegArray, blockNo) and exportSet = Array.sub(exportArray, blockNo) and stackPassSet = Array.sub(stackPassThroughArray, blockNo) and stackImportSet = Array.sub(stackImportArray, blockNo) and stackExportSet = Array.sub(stackExportArray, blockNo) val filteredCode = getOpt(Array.sub(resultCode, blockNo), []) (* At the end of the block we should have the exports active. *) val (resultInstrs, _, _) = List.foldr (createResultInstrs (passSet, stackPassSet)) ([], exportSet, stackExportSet) filteredCode (* Set the active count for the pass through. *) val instrCount = List.length filteredCode val () = List.app(addActivity instrCount) (setToList passSet) val inCCState = case Array.sub(ccInStates, blockNo) of CCNeeded s => SOME s | CCUnused => NONE val outCCState = case Array.sub(ccRequiredOut, blockNo) of CCNeeded s => SOME s | CCUnused => NONE in ExtendedBasicBlock { block = resultInstrs, flow=flow, locals = minus(declSet, exportSet), imports = importSet, exports = exportSet, passThrough = passSet, loopRegs = loopSet, initialStacks = union(stackPassSet, stackImportSet), inCCState = inCCState, outCCState = outCCState } end in val resultBlocks = Vector.tabulate(vectorLength, createResult) val regActive = regActive end val registerState: regState vector = Vector.tabulate(maxPRegs, fn i => { active = Array.sub(regActive, i), refs = Array.sub(regRefs, i), pushState = Array.sub(requirePushOrDiscard, i), prop = Vector.sub(pregProps, i) } ) in (resultBlocks, registerState) end (* Exported function. First filter out unreferenced blocks then process the registers themselves. *) fun identifyRegisters(blockVector, pregProps) = let val vectorLength = Vector.length blockVector val mapArray = Array.array(vectorLength, NONE) and resArray = Array.array(vectorLength, NONE) val count = ref 0 fun setReferences label = case Array.sub(mapArray, label) of NONE => (* Not yet visited *) let val BasicBlock{flow, block} = Vector.sub(blockVector, label) (* Create a new entry for it. *) val newLabel = ! count before count := !count + 1 (* Add it to the map. 
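(* In isolation the reachable-block renumbering here can be sketched as
   follows, omitting the rebuilding of the flow field:
       fun renumber (successors: int -> int list) n =
       let
           val labels = Array.array (n, NONE)
           val count = ref 0
           fun visit b =
               case Array.sub (labels, b) of
                   SOME lab => lab
                 | NONE =>
                   let
                       val lab = !count before count := !count + 1
                       val () = Array.update (labels, b, SOME lab)
                       val _ = List.map visit (successors b)
                   in lab end
       in visit 0; labels end
   Unreachable blocks keep NONE and are dropped from the result. *)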
Any other references will use this without reprocessing. *) val () = Array.update(mapArray, label, SOME newLabel) val newFlow = case flow of Unconditional l => Unconditional(setReferences l) | Conditional{trueJump, falseJump, ccRef, condition} => Conditional{trueJump=setReferences trueJump, falseJump=setReferences falseJump, ccRef=ccRef, condition=condition} | ExitCode => ExitCode | IndexedBr list => IndexedBr(map setReferences list) | SetHandler{handler, continue} => SetHandler{handler=setReferences handler, continue=setReferences continue} | UnconditionalHandle l => UnconditionalHandle(setReferences l) | ConditionalHandle{handler, continue} => ConditionalHandle{handler=setReferences handler, continue=setReferences continue} val () = Array.update(resArray, newLabel, SOME(BasicBlock{flow=newFlow, block=block})) in newLabel end | SOME lab => lab val _ = setReferences 0 val newBlockVector = Vector.tabulate(!count, fn i => valOf(Array.sub(resArray, i))) in identifyRegs(newBlockVector, pregProps) end (* Exported for use in GetConflictSets *) fun getInstructionRegisters instr = let val {sources, dests, ...} = getInstructionState instr in {sources=sources, dests=dests} end (* Exported for use in ICodeOptimise *) val getInstructionCC = #ccOut o getInstructionState structure Sharing = struct type ('genReg, 'optGenReg, 'fpReg) arm64ICode = ('genReg, 'optGenReg, 'fpReg) arm64ICode and preg = preg and pregOrZero = pregOrZero and intSet = intSet and ('genReg, 'optGenReg, 'fpReg) basicBlock = ('genReg, 'optGenReg, 'fpReg) basicBlock and extendedBasicBlock = extendedBasicBlock and controlFlow = controlFlow and regProperty = regProperty and ccRef = ccRef and outCCState = outCCState end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PushRegisters.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PushRegisters.ML index 16a7da9c..c8d9620e 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PushRegisters.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PushRegisters.ML @@ -1,1139 +1,1141 @@ (* - Copyright David C. J. Matthews 2021 + Copyright David C. J. Matthews 2021-2 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64PushRegisters( structure Arm64ICode: ARM64ICODE structure IntSet: INTSET structure Identify: ARM64IDENTIFYREFERENCES sharing Arm64ICode.Sharing = Identify.Sharing = IntSet ) : ARM64PUSHREGISTERS = struct open Arm64ICode open IntSet open Identify type basicBlockAbstract = (preg, pregOrZero, preg) basicBlock (* Curried subscript functions *) fun asub a i = Array.sub(a, i) and vsub v i = Vector.sub(v, i) exception InternalError = Misc.InternalError (* Each preg in the input is mapped to either a new preg or the stack. *) datatype pregMapType = Unset | ToPReg of preg | ToStack of int * stackLocn (* The stack contains both entries in the input code and entries added here. 
It is really used to ensure that the stack at run time is the same size at the start of a block whichever block has jumped to it. *) datatype stackEntry = NewEntry of {pregNo: int} (* pregNo is the original preg that has been pushed here. *) | OriginalEntry of { stackLoc: stackLocn } | HandlerEntry fun addRegisterPushes{code: extendedBasicBlock vector, pushVec: bool vector, pregProps, firstPass=_} = let val maxPRegs = Vector.length pregProps val numberOfBlocks = Vector.length code (* Output registers and properties. *) val pregCounter = ref 0 val pregPropList = ref [] val pregMap = Array.array(maxPRegs, Unset) val maxStack = ref 0 (* The stack size we've assumed for the block. Also indicates if a block has already been processed. *) val inputStackSizes = Array.array(numberOfBlocks, NONE) (* The result of processing a block. *) val blockOutput = Array.array(numberOfBlocks, {code=[], stackCount=0}) (* Extra blocks to adjust the stack are added here. *) val extraBlocks: basicBlockAbstract list ref = ref [] val blockCounter = ref numberOfBlocks (* Get the blocks that are inputs for each one. *) local val blockRefs = Array.array(numberOfBlocks, []) fun setReferences fromBlock = let val ExtendedBasicBlock{ flow, ...} = vsub code fromBlock val refs = successorBlocks flow fun setRefs toBlock = let val oldRefs = asub blockRefs toBlock in Array.update(blockRefs, toBlock, fromBlock :: oldRefs); if null oldRefs then setReferences toBlock else () end in List.app setRefs refs end val () = setReferences 0 in val blockRefs = blockRefs end (* Recursive scan of the blocks. For each block we produce an input and output state. The input state is the output state of the predecessor i.e. some block that jumps to this, but with any entries removed that are not used in this block. It is then necessary to match the input state, if necessary by adding extra blocks that just do the matching. *) local val haveProcessed = isSome o asub inputStackSizes fun processBlocks toDo = case List.filter (fn (n, _) => not(haveProcessed n)) toDo of [] => () (* Nothing left to do *) | stillToDo as head :: _ => let (* Try to find a block all of whose predecessors have been processed. That increases the chances that we will have cached items. *) fun available(dest, _) = List.all haveProcessed (Array.sub(blockRefs, dest)) val (blockNo, lastOutputState) = case List.find available stillToDo of SOME c => c | NONE => head (* This is the first time we've come to this block. *) val ExtendedBasicBlock{ block, flow, imports, passThrough, loopRegs, initialStacks, ...} = vsub code blockNo (* Remove any items from the input state that are no longer needed for this block. They could be local to the previous block or needed by a different successor. Although the values in loopRegs are not required the stack space is so that they can be updated. 
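(* removeItems below only ever trims from the top of the abstract stack: it
   pops entries until it reaches one this block still needs, and everything
   underneath a needed entry is kept even if unused, because only the top of
   the real stack can be popped. As a list-only sketch (the real code also
   maintains the word count):
       fun trimTop needed [] = []
         | trimTop needed (stack as item :: rest) =
               if needed item then stack else trimTop needed rest *)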
*) fun removeItems(result as {stack=[], stackCount=0}) = result | removeItems{stack=[], ...} = raise InternalError "removeItems - stack size" | removeItems (thisStack as {stack=NewEntry{pregNo} :: rest, stackCount}) = if member(pregNo, imports) orelse member(pregNo, passThrough) orelse member(pregNo, loopRegs) then thisStack else removeItems{stack=rest, stackCount=stackCount-1} | removeItems (thisStack as {stack=OriginalEntry{stackLoc=StackLoc{rno, size}, ...} :: rest, stackCount}) = if member(rno, initialStacks) then thisStack else removeItems{stack=rest, stackCount=stackCount-size} | removeItems result = result val {stackCount=newSp, stack=newStack} = removeItems lastOutputState (* References to hold the current stack count (number of words on the stack) and the list of items on the stack. The list is not used directly to map stack addresses. Instead it is used to match the stack at the beginning and end of a block. *) val stackCount = ref newSp val stack = ref newStack (* Items from the stack that have been marked as deleted but not yet removed. We only remove items from the top of the stack to avoid quadratic behaviour with a very deep stack. *) val deletedItems = ref [] (* Save the stack size in case we come by a different route. *) val () = Array.update(inputStackSizes, blockNo, SOME newSp) fun pushItemToStack item = let val size = case item of NewEntry _ => 1 | OriginalEntry{stackLoc=StackLoc{size, ...}, ...} => size | HandlerEntry => 2 in stackCount := ! stackCount+size; stack := item :: ! stack; maxStack := Int.max(!maxStack, !stackCount) end fun newPReg propKind = let val regNo = !pregCounter before pregCounter := !pregCounter + 1 val () = pregPropList := propKind :: !pregPropList in PReg regNo end and newStackLoc size = let val regNo = !pregCounter before pregCounter := !pregCounter + 1 val () = pregPropList := RegPropStack size :: !pregPropList in StackLoc{size=size, rno=regNo} end (* Map a source register. This always loads the argument. *) fun mapSrcReg(PReg n) = case Array.sub(pregMap, n) of Unset => raise InternalError "mapSrcReg - unset" | ToPReg preg => (preg, []) | ToStack(stackLoc, container as StackLoc{size, ...}) => let (* Make a new untagged register. That will prevent us pushing it if we have to spill registers. *) val newReg = newPReg RegPropUntagged in (newReg, [LoadStack{wordOffset= !stackCount-stackLoc-size, container=container, field=0, dest=newReg}]) end fun mapDestReg(PReg n) = let val currentLocation = Array.sub(pregMap, n) val kind = Vector.sub(pregProps, n) in if Vector.sub(pushVec, n) then let (* This should not have been seen before. *) val _ = case currentLocation of Unset => () | _ => raise InternalError "mapDestReg - already set" val newReg = newPReg kind val newContainer = newStackLoc 1 val () = Array.update(pregMap, n, ToStack (!stackCount, newContainer)) val () = pushItemToStack(NewEntry{pregNo=n}) in (newReg, [PushToStack{source= newReg, container=newContainer, copies=1}]) end else let (* See if we already have a number for it. We may encounter the same preg as a destination when returning the result from a conditional in which case we have to use the same number. We shouldn't have pushed it. *) val newReg = case (currentLocation, kind) of (Unset, _) => let val newReg = newPReg kind val () = Array.update(pregMap, n, ToPReg newReg) in newReg end | (ToPReg preg, RegPropMultiple) => preg | _ => raise InternalError "mapDestReg - multiply defined non-merge reg" in (newReg, []) end end (* Optional destination for arithmetic and logical ops. 
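(* The offset arithmetic in mapSrcReg above, and in mapContainerAndStack below,
   can be stated on its own: a container pushed when the depth was pushedAt,
   occupying size words, has field f at
       wordOffset = currentDepth - pushedAt - size + f
   from the stack pointer. For example, an entry pushed at depth 3 with size 1,
   read at a current depth of 7, has field 0 at word offset 3. *)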
*) fun mapOptDest ZeroReg = (ZeroReg, []) | mapOptDest (SomeReg destReg) = let val (destVal, destCode) = mapDestReg destReg in (SomeReg destVal, destCode) end fun mapOptSrc ZeroReg = (ZeroReg, []) | mapOptSrc (SomeReg srcReg) = let val (srcVal, srcCode) = mapSrcReg srcReg in (SomeReg srcVal, srcCode) end (* Adjust a stack offset from the old state to the new state. *) fun mapContainerAndStack(StackLoc{rno, size}, field) = let val (newStackAddr, newContainer) = case Array.sub(pregMap, rno) of Unset => raise InternalError "mapContainer - unset" | ToPReg _ => raise InternalError "mapContainer - ToPReg" | ToStack stackContainer => stackContainer val newOffset = !stackCount-(newStackAddr+size) + field in (newOffset, newContainer) end (* Add an entry for an existing stack entry. *) fun mapDestContainer(StackLoc{rno, size}, locn) = ( case Array.sub(pregMap, rno) of Unset => let val newContainer = newStackLoc size val () = Array.update(pregMap, rno, ToStack(locn, newContainer)) in newContainer end | _ => raise InternalError "mapDestContainer: already set" ) (* Map a function argument which could be a register or a stack entry. A register entry could have been pushed. *) fun mapArgument(ArgInReg (PReg r)) = ( case Array.sub(pregMap, r) of Unset => raise InternalError "mapSource - unset" | ToPReg preg => ArgInReg preg | ToStack(stackLoc, container as StackLoc{size, ...}) => ArgOnStack{wordOffset= !stackCount-stackLoc-size, container=container, field=0} ) | mapArgument(ArgOnStack{container, field, ...}) = let val (newOffset, newContainer) = mapContainerAndStack(container, field) in ArgOnStack{container=newContainer, wordOffset=newOffset, field=field} end (* Rewrite the code, replacing any registers that need to be pushed with references to the stack. The result is built up in reverse order and then reversed. *) fun pushRegisters({instr=MoveRegister{ source, dest as PReg dReg }, ...}, code) = if Vector.sub(pushVec, dReg) then (* We're going to push this. *) let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest (* TODO: Since we're pushing it we don't need to move it first. 
*) in destCode @ MoveRegister { source=sourceVal, dest=destVal} :: sourceCode @ code end else let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ MoveRegister { source=sourceVal, dest=destVal} :: sourceCode @ code end | pushRegisters({instr=LoadNonAddressConstant { dest, source}, ...}, code) = let val (destVal, destCode) = mapDestReg dest in destCode @ LoadNonAddressConstant { dest=destVal, source=source} :: code end | pushRegisters({instr=LoadAddressConstant { dest, source}, ...}, code) = let val (destVal, destCode) = mapDestReg dest in destCode @ LoadAddressConstant { dest=destVal, source=source} :: code end | pushRegisters({instr=LoadWithConstantOffset { base, dest, byteOffset, loadType}, ...}, code) = let val (baseVal, baseCode) = mapSrcReg base val (destVal, destCode) = mapDestReg dest in destCode @ LoadWithConstantOffset { base=baseVal, dest=destVal, byteOffset=byteOffset, loadType=loadType} :: baseCode @ code end | pushRegisters({instr=LoadFPWithConstantOffset { base, dest, byteOffset, floatSize}, ...}, code) = let val (baseVal, baseCode) = mapSrcReg base val (destVal, destCode) = mapDestReg dest in destCode @ LoadFPWithConstantOffset { base=baseVal, dest=destVal, byteOffset=byteOffset, floatSize=floatSize} :: baseCode @ code end | pushRegisters({instr=LoadWithIndexedOffset { base, dest, index, loadType}, ...}, code) = let val (baseVal, baseCode) = mapSrcReg base val (indexVal, indexCode) = mapSrcReg index val (destVal, destCode) = mapDestReg dest in destCode @ LoadWithIndexedOffset { base=baseVal, dest=destVal, index=indexVal, loadType=loadType} :: indexCode @ baseCode @ code end | pushRegisters({instr=LoadFPWithIndexedOffset { base, dest, index, floatSize}, ...}, code) = let val (baseVal, baseCode) = mapSrcReg base val (indexVal, indexCode) = mapSrcReg index val (destVal, destCode) = mapDestReg dest in destCode @ LoadFPWithIndexedOffset { base=baseVal, dest=destVal, index=indexVal, floatSize=floatSize} :: indexCode @ baseCode @ code end | pushRegisters({instr=GetThreadId { dest}, ...}, code) = let val (destVal, destCode) = mapDestReg dest in destCode @ GetThreadId { dest=destVal} :: code end | pushRegisters({instr=ObjectIndexAddressToAbsolute { source, dest}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ ObjectIndexAddressToAbsolute { source=sourceVal, dest=destVal} :: sourceCode @ code end | pushRegisters({instr=AbsoluteToObjectIndex { source, dest}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ AbsoluteToObjectIndex { source=sourceVal, dest=destVal} :: sourceCode @ code end | pushRegisters({instr=AllocateMemoryFixed { bytesRequired, dest, ...}, ...}, code) = let val (destVal, destCode) = mapDestReg dest in destCode @ AllocateMemoryFixed { dest=destVal, bytesRequired=bytesRequired, saveRegs=[]} :: code end | pushRegisters({instr=AllocateMemoryVariable{size, dest, ...}, ...}, code) = let val (sizeVal, sizeCode) = mapSrcReg size val (destVal, destCode) = mapDestReg dest in destCode @ AllocateMemoryVariable{size=sizeVal, dest=destVal, saveRegs=[]} :: sizeCode @ code end | pushRegisters({instr=InitialiseMem{size, addr, init}, ...}, code) = let val (sizeVal, sizeCode) = mapSrcReg size val (addrVal, addrCode) = mapSrcReg addr val (initVal, initCode) = mapSrcReg init in InitialiseMem{size=sizeVal, addr=addrVal, init=initVal} :: initCode @ addrCode @ sizeCode @ code end | 
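(* Each case of pushRegisters follows the same pattern: map the sources,
   possibly emitting LoadStack instructions, map the destination, possibly
   emitting a PushToStack, and cons onto the accumulator in reverse as
       destCode @ instr' :: sourceCode @ code
   Because createBlock finally applies List.rev, the executed order is the
   source loads, then the rewritten instruction, then the push of its result. *)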
pushRegisters({instr=BeginLoop, ...}, code) = BeginLoop :: code | pushRegisters({instr=JumpLoop{regArgs, stackArgs, checkInterrupt}, ...}, code) = let (* Normally JumpLoop will be the last item in a block but it is possible that we've added a reset-stack after it. *) fun getValues [] = ([], []) | getValues ({src=source, dst=PReg n} :: rest) = let val (otherRegArgs, otherStackArgs) = getValues rest in case Array.sub(pregMap, n) of ToPReg lReg => ({src=mapArgument source, dst=lReg} :: otherRegArgs, otherStackArgs) | ToStack(stackloc, stackC as StackLoc{size, ...}) => let val sourceVal = mapArgument source val stackOff = !stackCount - stackloc - size in (otherRegArgs, {src=sourceVal, wordOffset=stackOff, stackloc=stackC} :: otherStackArgs) end | Unset => (* Drop it. It's never used. This can happen if we are folding a function over a list such that it always returns the last value and then discard the result of the fold. *) (otherRegArgs, otherStackArgs) end val (newRegArguments, newStackArgs) = getValues regArgs fun loadStackArg({src=source, stackloc=destC, ...}, otherArgs) = let val sourceVal = mapArgument source val (newOffset, newContainer) = mapContainerAndStack(destC, 0) in {src=sourceVal, wordOffset=newOffset, stackloc=newContainer} :: otherArgs end val oldStackArgs = List.foldr loadStackArg [] stackArgs val check = case checkInterrupt of NONE => NONE | SOME _ => SOME [] in JumpLoop{ regArgs=newRegArguments, stackArgs=oldStackArgs @ newStackArgs, checkInterrupt=check} :: code end | pushRegisters({instr=StoreWithConstantOffset { base, source, byteOffset, loadType}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (baseVal, baseCode) = mapSrcReg base in StoreWithConstantOffset{ base=baseVal, source=sourceVal, byteOffset=byteOffset, loadType=loadType} :: baseCode @ sourceCode @ code end | pushRegisters({instr=StoreFPWithConstantOffset { base, source, byteOffset, floatSize}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (baseVal, baseCode) = mapSrcReg base in StoreFPWithConstantOffset{ base=baseVal, source=sourceVal, byteOffset=byteOffset, floatSize=floatSize} :: baseCode @ sourceCode @ code end | pushRegisters({instr=StoreWithIndexedOffset { base, source, index, loadType}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (baseVal, baseCode) = mapSrcReg base val (indexVal, indexCode) = mapSrcReg index in StoreWithIndexedOffset{ base=baseVal, source=sourceVal, index=indexVal, loadType=loadType} :: indexCode @ baseCode @ sourceCode @ code end | pushRegisters({instr=StoreFPWithIndexedOffset { base, source, index, floatSize}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (baseVal, baseCode) = mapSrcReg base val (indexVal, indexCode) = mapSrcReg index in StoreFPWithIndexedOffset{ base=baseVal, source=sourceVal, index=indexVal, floatSize=floatSize} :: indexCode @ baseCode @ sourceCode @ code end | pushRegisters({instr=AddSubImmediate{ source, dest, ccRef, immed, isAdd, length }, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapOptDest dest in destCode @ AddSubImmediate { source=sourceVal, dest=destVal, ccRef=ccRef, immed=immed, isAdd=isAdd, length=length} :: sourceCode @ code end | pushRegisters({instr=AddSubRegister{ base, shifted, dest, ccRef, isAdd, length, shift}, ...}, code) = let val (op1Val, op1Code) = mapSrcReg base val (op2Val, op2Code) = mapSrcReg shifted val (destVal, destCode) = mapOptDest dest in destCode @ AddSubRegister { base=op1Val, shifted=op2Val, 
dest=destVal, ccRef=ccRef, isAdd=isAdd, length=length, shift=shift} :: op2Code @ op1Code @ code end | pushRegisters({instr=LogicalImmediate{ source, dest, ccRef, immed, logOp, length }, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapOptDest dest in destCode @ LogicalImmediate { source=sourceVal, dest=destVal, ccRef=ccRef, immed=immed, logOp=logOp, length=length} :: sourceCode @ code end | pushRegisters({instr=LogicalRegister{ base, shifted, dest, ccRef, logOp, length, shift}, ...}, code) = let val (op1Val, op1Code) = mapSrcReg base val (op2Val, op2Code) = mapSrcReg shifted val (destVal, destCode) = mapOptDest dest in destCode @ LogicalRegister { base=op1Val, shifted=op2Val, dest=destVal, ccRef=ccRef, logOp=logOp, length=length, shift=shift} :: op2Code @ op1Code @ code end | pushRegisters({instr=ShiftRegister{ direction, dest, source, shift, opSize}, ...}, code) = let val (srcVal, op1Code) = mapSrcReg source val (shiftVal, op2Code) = mapSrcReg shift val (destVal, destCode) = mapDestReg dest in destCode @ ShiftRegister { source=srcVal, shift=shiftVal, dest=destVal, direction=direction, opSize=opSize} :: op2Code @ op1Code @ code end | pushRegisters({instr=Multiplication{ kind, dest, sourceA, sourceM, sourceN }, ...}, code) = let val (srcAVal, srcACode) = mapOptSrc sourceA val (srcMVal, srcMCode) = mapSrcReg sourceM val (srcNVal, srcNCode) = mapSrcReg sourceN val (destVal, destCode) = mapDestReg dest in destCode @ Multiplication { kind=kind, sourceA=srcAVal, sourceM=srcMVal, sourceN=srcNVal, dest=destVal} :: srcNCode @ srcMCode @ srcACode @ code end | pushRegisters({instr=Division{ isSigned, dest, dividend, divisor, opSize }, ...}, code) = let val (dividendVal, dividendCode) = mapSrcReg dividend val (divisorVal, divisorCode) = mapSrcReg divisor val (destVal, destCode) = mapDestReg dest in destCode @ Division { isSigned=isSigned, dividend=dividendVal, divisor=divisorVal, dest=destVal, opSize=opSize} :: divisorCode @ dividendCode @ code end | pushRegisters({instr=BeginFunction {regArgs, stackArgs}, ...}, code) = let (* Create a new container list. The offsets begin at -numArgs. *) fun newContainers(src :: srcs, offset) = let val newContainer = mapDestContainer(src, offset) in newContainer :: newContainers(srcs, offset+1) end | newContainers _ = [] val newStackArgs = newContainers(stackArgs, ~ (List.length stackArgs)) (* Push any registers that need to be pushed. *) fun pushReg((preg, rreg), (others, code)) = let val (newReg, newCode) = mapDestReg(preg) in ((newReg, rreg) :: others, newCode @ code) end val (newRegArgs, pushCode) = List.foldl pushReg ([], []) regArgs in pushCode @ BeginFunction {regArgs=newRegArgs, stackArgs=newStackArgs} :: code end | pushRegisters({instr=FunctionCall{callKind, regArgs, stackArgs, dest, containers, ...}, ...}, code) = let (* It's possible that this could lead to having to spill registers in order to load others. Leave that problem for the moment. 
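(* Restating the test used in loadStackArg in the TailRecursiveCall case below:
   an outgoing stack argument that is already on the stack can be left where it
   is when
       wordOffset = stack + newStackOffset          (it is already in its final slot)
       orelse stack + newStackOffset < 0            (it is being stored above the current top)
       orelse newStackOffset - wordOffset > ~stackAdjust   (it comes from above the last argument)
   and otherwise it must first be loaded into a fresh tagged register, which a
   later pass may in turn decide to push. *)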
*) fun loadStackArg (arg, otherArgs) = let val argVal = mapArgument arg in argVal :: otherArgs end val newStackArgs = List.foldr loadStackArg [] stackArgs fun loadRegArg ((arg, reg), otherArgs) = let val argVal = mapArgument arg in (argVal, reg) :: otherArgs end val newRegArgs = List.foldr loadRegArg [] regArgs val (destVal, destCode) = mapDestReg dest val newContainers = List.map(fn c => #2(mapContainerAndStack(c, 0))) containers in destCode @ FunctionCall{ callKind=callKind, regArgs=newRegArgs, stackArgs=newStackArgs, dest=destVal, saveRegs=[], containers=newContainers} :: code end | pushRegisters({instr=TailRecursiveCall{callKind, regArgs, stackArgs, stackAdjust, ...}, ...}, code) = let val newStackOffset = !stackCount fun loadStackArg ({src, stack}, (otherLoads, otherArgs)) = let val (argVal, loadCode) = case mapArgument src of (source as ArgOnStack{wordOffset, container, field}) => (* If we're leaving it in its old location or we're pushing it above the current top we're ok. We're also ok if we're moving it from somewhere above the last argument. Otherwise we have to load it. It goes into a normal tagged register which may mean that it could be pushed onto the stack in a subsequent pass. *) if wordOffset = stack+newStackOffset orelse stack+newStackOffset < 0 orelse newStackOffset-wordOffset > ~ stackAdjust then (source, []) else let val preg = newPReg RegPropGeneral in (ArgInReg preg, [LoadStack{wordOffset=wordOffset, container=container, field=field, dest=preg}]) end | argCode => (argCode, []) in (loadCode @ otherLoads, {src=argVal, stack=stack} :: otherArgs) end val (stackArgLoads, newStackArgs) = List.foldr loadStackArg ([], []) stackArgs fun loadRegArg ((arg, reg), otherArgs) = let val argVal = mapArgument arg in (argVal, reg) :: otherArgs end val newRegArgs = List.foldr loadRegArg [] regArgs in TailRecursiveCall{ callKind=callKind, regArgs=newRegArgs, stackArgs=newStackArgs, stackAdjust=stackAdjust, currStackSize=newStackOffset} :: stackArgLoads @ code end | pushRegisters({instr=ReturnResultFromFunction{resultReg, returnReg, numStackArgs}, ...}, code) = let val (resultValue, loadResult) = mapSrcReg resultReg val (returnValue, loadReturn) = mapSrcReg returnReg val resetCode = if !stackCount = 0 then [] else [ResetStackPtr{numWords= !stackCount}] in ReturnResultFromFunction{resultReg=resultValue, returnReg=returnValue, numStackArgs=numStackArgs} :: resetCode @ loadReturn @ loadResult @ code end | pushRegisters({instr=RaiseExceptionPacket{packetReg}, ...}, code) = let val (packetVal, packetCode) = mapSrcReg packetReg in RaiseExceptionPacket{packetReg=packetVal} :: packetCode @ code end | pushRegisters({instr=PushToStack{ source, container, copies }, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source (* This was a push from a previous pass. Treat as a container of size "copies". *) val newContainer = mapDestContainer(container, !stackCount) val () = pushItemToStack(OriginalEntry{stackLoc=container}) in PushToStack{source=sourceVal, container=newContainer, copies=copies} :: sourceCode @ code end | pushRegisters({instr=LoadStack{ dest, container, field, ... 
}, ...}, code) = let val (newOffset, newContainer) = mapContainerAndStack(container, field) val (destVal, destCode) = mapDestReg dest in destCode @ LoadStack{ dest=destVal, container=newContainer, field=field, wordOffset=newOffset } :: code end | pushRegisters({instr=StoreToStack{source, container, field, ...}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (newOffset, newContainer) = mapContainerAndStack(container, field) in StoreToStack{source=sourceVal, container=newContainer, field=field, stackOffset=newOffset} :: sourceCode @ code end | pushRegisters({instr=ContainerAddress{ dest, container, ... }, ...}, code) = let val (newOffset, newContainer) = mapContainerAndStack(container, 0) val (destVal, destCode) = mapDestReg dest in destCode @ ContainerAddress{ dest=destVal, container=newContainer, stackOffset=newOffset } :: code end | pushRegisters({instr=ResetStackPtr _, ...}, code) = code (* Added in a previous pass - discard it. *) | pushRegisters({instr=TagValue{source, dest, isSigned, opSize}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ TagValue{source=sourceVal, dest=destVal, isSigned=isSigned, opSize=opSize} :: sourceCode @ code end | pushRegisters({instr=UntagValue{source, dest, isSigned, opSize, ...}, ...}, code) = let val (loadedSource, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ UntagValue{source=loadedSource, dest=destVal, isSigned=isSigned, opSize=opSize} :: sourceCode @ code end | pushRegisters({instr=BoxLarge{source, dest, ...}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ BoxLarge{source=sourceVal, dest=destVal, saveRegs=[]} :: sourceCode @ code end | pushRegisters({instr=UnboxLarge{source, dest, ...}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ UnboxLarge{source=sourceVal, dest=destVal} :: sourceCode @ code end | pushRegisters({instr=BoxTagFloat{floatSize, source, dest, ...}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ BoxTagFloat{floatSize=floatSize, source=sourceVal, dest=destVal, saveRegs=[]} :: sourceCode @ code end | pushRegisters({instr=UnboxTagFloat{floatSize, source, dest, ...}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ UnboxTagFloat{floatSize=floatSize, source=sourceVal, dest=destVal} :: sourceCode @ code end | pushRegisters({instr=LoadAcquire { base, dest, loadType}, ...}, code) = let val (baseVal, baseCode) = mapSrcReg base val (destVal, destCode) = mapDestReg dest in destCode @ LoadAcquire { base=baseVal, dest=destVal, loadType=loadType} :: baseCode @ code end | pushRegisters({instr=StoreRelease { base, source, loadType}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (baseVal, baseCode) = mapSrcReg base in StoreRelease{ base=baseVal, source=sourceVal, loadType=loadType} :: baseCode @ sourceCode @ code end | pushRegisters({instr=BitFieldShift{source, dest, isSigned, length, immr, imms}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ BitFieldShift { source=sourceVal, dest=destVal, isSigned=isSigned, immr=immr, imms=imms, length=length} :: sourceCode @ code end | pushRegisters({instr=BitFieldInsert{source, destAsSource, dest, 
length, immr, imms}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destSrcVal, destSrcCode) = mapSrcReg destAsSource val (destVal, destCode) = mapDestReg dest in destCode @ BitFieldInsert { source=sourceVal, destAsSource=destSrcVal, dest=destVal, immr=immr, imms=imms, length=length} :: destSrcCode @ sourceCode @ code end | pushRegisters({instr=IndexedCaseOperation{testReg}, ...}, code) = let val (testVal, testCode) = mapSrcReg testReg in IndexedCaseOperation{testReg=testVal} :: testCode @ code end | pushRegisters({instr=PushExceptionHandler, ...}, code) = let (* Add a handler entry to the stack. *) val () = pushItemToStack HandlerEntry in PushExceptionHandler :: code end | pushRegisters({instr=PopExceptionHandler, ...}, code) = let (* Appears at the end of the block whose exceptions are being handled. Delete the handler and anything above it. *) (* Get the state after removing the handler. *) fun popContext ([], _) = raise InternalError "pushRegisters - pop handler" | popContext (HandlerEntry :: tl, new) = (tl, new-2) | popContext (OriginalEntry{stackLoc=StackLoc{size, ...}, ...} :: tl, new) = popContext(tl, new-size) | popContext (NewEntry _ :: tl, new) = popContext(tl, new-1) val (newStack, nnCount) = popContext(!stack, !stackCount) val () = stack := newStack val oldStackPtr = ! stackCount val () = stackCount := nnCount (* Reset the stack to just above the two words of the handler. *) val resetCode = if oldStackPtr <> nnCount+2 then [ResetStackPtr{numWords=oldStackPtr-nnCount-2}] else [] in PopExceptionHandler :: resetCode @ code end | pushRegisters({instr=BeginHandler{packetReg}, ...}, code) = let (* Start of a handler. The top active entry should be the handler. *) val () = case !stack of HandlerEntry :: tl => stack := tl | _ => raise InternalError "pushRegisters: BeginHandler" val () = stackCount := !stackCount - 2 val (packetVal, packetCode) = mapDestReg packetReg in packetCode @ BeginHandler{packetReg=packetVal} :: code end | pushRegisters({instr=CompareByteVectors{vec1Addr, vec2Addr, length, ccRef}, ...}, code) = let val (vec1Val, vec1Code) = mapSrcReg vec1Addr val (vec2Val, vec2Code) = mapSrcReg vec2Addr val (lenVal, lenCode) = mapSrcReg length in CompareByteVectors{vec1Addr=vec1Val, vec2Addr=vec2Val, length=lenVal, ccRef=ccRef} :: lenCode @ vec2Code @ vec1Code @ code end | pushRegisters({instr=BlockMove{srcAddr, destAddr, length, isByteMove}, ...}, code) = let val (srcVal, srcCode) = mapSrcReg srcAddr val (destVal, destCode) = mapSrcReg destAddr val (lenVal, lenCode) = mapSrcReg length in BlockMove{srcAddr=srcVal, destAddr=destVal, length=lenVal, isByteMove=isByteMove} :: lenCode @ destCode @ srcCode @ code end | pushRegisters({instr=AddSubXSP{source, dest, isAdd}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapOptDest dest in destCode @ AddSubXSP { source=sourceVal, dest=destVal, isAdd=isAdd} :: sourceCode @ code end | pushRegisters({instr=TouchValue{source, ...}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source in TouchValue { source=sourceVal} :: sourceCode @ code end | pushRegisters({instr=LoadAcquireExclusive{ base, dest }, ...}, code) = let val (baseVal, baseCode) = mapSrcReg base val (destVal, destCode) = mapDestReg dest in destCode @ LoadAcquireExclusive { base=baseVal, dest=destVal} :: baseCode @ code end | pushRegisters({instr=StoreReleaseExclusive{ base, source, result }, ...}, code) = let val (sourceVal, sourceCode) = mapOptSrc source val (baseVal, baseCode) = mapSrcReg base val 
(resVal, resCode) = mapDestReg result in resCode @ StoreReleaseExclusive{ base=baseVal, source=sourceVal, result=resVal} :: baseCode @ sourceCode @ code end | pushRegisters({instr=MemoryBarrier, ...}, code) = MemoryBarrier :: code | pushRegisters({instr=ConvertIntToFloat{ source, dest, srcSize, destSize}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ ConvertIntToFloat{ source=sourceVal, dest=destVal, srcSize=srcSize, destSize=destSize} :: sourceCode @ code end | pushRegisters({instr=ConvertFloatToInt{ source, dest, srcSize, destSize, rounding}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ ConvertFloatToInt{ source=sourceVal, dest=destVal, srcSize=srcSize, destSize=destSize, rounding=rounding} :: sourceCode @ code end | pushRegisters({instr=UnaryFloatingPt{ source, dest, fpOp}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ UnaryFloatingPt{ source=sourceVal, dest=destVal, fpOp=fpOp} :: sourceCode @ code end | pushRegisters({instr=BinaryFloatingPoint{ arg1, arg2, dest, fpOp, opSize}, ...}, code) = let val (arg1Val, arg1Code) = mapSrcReg arg1 val (arg2Val, arg2Code) = mapSrcReg arg2 val (destVal, destCode) = mapDestReg dest in destCode @ BinaryFloatingPoint{ arg1=arg1Val, arg2=arg2Val, dest=destVal, fpOp=fpOp, opSize=opSize} :: arg2Code @ arg1Code @ code end | pushRegisters({instr=CompareFloatingPoint{ arg1, arg2, opSize, ccRef}, ...}, code) = let val (arg1Val, arg1Code) = mapSrcReg arg1 val (arg2Val, arg2Code) = mapSrcReg arg2 in CompareFloatingPoint{ arg1=arg1Val, arg2=arg2Val, opSize=opSize, ccRef=ccRef} :: arg2Code @ arg1Code @ code end + | pushRegisters({instr=CacheCheck _, ...}, _) = raise InternalError "pushRegisters: CacheCheck" + local fun doPush(instr as {kill, ...}, code) = let val newCode = pushRegisters(instr, code) (* Can we pop the stack? *) val stackReset = case setToList (minus(kill, loopRegs)) of [] => [] | killList => let (* See if any of the kill items are at the top of the stack. If they are we can pop them and perhaps items we've previously marked for deletion but not been able to pop. *) val oldStack = !stackCount fun checkAndAdd(r, output) = case Array.sub(pregMap, r) of ToStack(stackLoc, StackLoc{size, ...}) => if stackLoc < 0 then r :: output (* We can have arguments and return address. *) else if !stackCount = stackLoc+size then ( stack := tl (!stack); stackCount := stackLoc; output ) else r :: output | _ => r :: output val toAdd = List.foldl checkAndAdd [] killList fun reprocess list = let val prevStack = !stackCount val outlist = List.foldl checkAndAdd [] list in if !stackCount = prevStack then list else reprocess outlist end val () = if !stackCount = oldStack then deletedItems := toAdd @ !deletedItems else deletedItems := reprocess(toAdd @ !deletedItems) val _ = oldStack >= !stackCount orelse raise InternalError "negative stack offset" in if !stackCount = oldStack then [] else [ResetStackPtr{numWords=oldStack - !stackCount}] end in stackReset @ newCode end in val codeResult = List.foldl doPush [] block val outputCount = ! 
stackCount val results = {code=codeResult, stackCount= outputCount} val stateResult = { stackCount= outputCount, stack= !stack } val () = Array.update(blockOutput, blockNo, results) end val addSet = case flow of ExitCode => [] | IndexedBr cases => cases | Unconditional dest => [dest] | Conditional {trueJump, falseJump, ...} => [falseJump, trueJump] | SetHandler { handler, continue } => [handler, continue] | UnconditionalHandle _ => [] | ConditionalHandle { continue, ...} => [continue] val addItems = List.map(fn m => (m, stateResult)) addSet in processBlocks(addItems @ stillToDo) end in val () = processBlocks([(0, {stack=[], stackCount=0})]) end (* Put together the result code and blocks. *) local fun createBlock blockNo = (* Skip unreferenced blocks apart from block 0. *) if blockNo <> 0 andalso null (asub blockRefs blockNo) then BasicBlock{block=[], flow=ExitCode} else let val ExtendedBasicBlock{ flow, ...} = vsub code blockNo val {code=codeResult, stackCount=outputCount, ...} = asub blockOutput blockNo (* Process the successor. If we need a stack adjustment this will require an adjustment block. TODO: We could put a pre-adjustment if we only have one branch to this block. *) fun matchStacks targetBlock = let (* Process the destination. If it hasn't been processed. *) val expectedInput = valOf (asub inputStackSizes targetBlock) in if expectedInput = outputCount then targetBlock else let val _ = outputCount > expectedInput orelse raise InternalError "adjustStack" val adjustCode = [ResetStackPtr{numWords=outputCount-expectedInput}] val newBlock = BasicBlock{block=adjustCode, flow=Unconditional targetBlock} val newBlockNo = !blockCounter before blockCounter := !blockCounter+1 val () = extraBlocks := newBlock :: !extraBlocks in newBlockNo end end val (finalCode, newFlow) = case flow of ExitCode => (codeResult, ExitCode) | Unconditional m => let (* Process the block. Since we're making an unconditional jump we can include any stack adjustment needed to match the destination in here. In particular this includes loops. *) val expectedInput = valOf (asub inputStackSizes m) val _ = outputCount >= expectedInput orelse raise InternalError "negative reset" val resultCode = if expectedInput = outputCount then codeResult else ResetStackPtr{numWords=outputCount-expectedInput} :: codeResult in (resultCode, Unconditional m) end (* For any of these, if we need to adjust the stack we have to add an adjustment block. *) | Conditional {trueJump, falseJump, ccRef, condition} => (codeResult, Conditional{trueJump=matchStacks trueJump, falseJump=matchStacks falseJump, ccRef=ccRef, condition=condition}) | SetHandler{ handler, continue } => (codeResult, SetHandler{ handler=matchStacks handler, continue=matchStacks continue}) | IndexedBr cases => (codeResult, IndexedBr(map matchStacks cases)) | u as UnconditionalHandle _ => (codeResult, u) | c as ConditionalHandle{ continue, ... } => let (* As for unconditional branch *) val expectedInput = valOf (asub inputStackSizes continue) val _ = outputCount >= expectedInput orelse raise InternalError "negative reset" val resultCode = if expectedInput = outputCount then codeResult else ResetStackPtr{numWords=outputCount-expectedInput} :: codeResult in (resultCode, c) end in BasicBlock{block=List.rev finalCode, flow=newFlow} end in val resultBlocks = Vector.tabulate(numberOfBlocks, createBlock) end (* Add any extra blocks to the result. 
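(* matchStacks above maintains the invariant that every block is entered with
   exactly the stack depth recorded for it in inputStackSizes. For example, if
   this block ends at depth 5 and a conditional successor expects depth 3, a
   fresh block containing ResetStackPtr{numWords=2} and jumping unconditionally
   to the successor is added via extraBlocks and the branch is redirected to
   it; for an unconditional jump the adjustment is emitted inline instead. *)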
*) val finalResult = case !extraBlocks of [] => resultBlocks | blocks => Vector.concat[resultBlocks, Vector.fromList(List.rev blocks)] val pregProperties = Vector.fromList(List.rev(! pregPropList)) in {code=finalResult, pregProps=pregProperties, maxStack= !maxStack} end structure Sharing = struct type extendedBasicBlock = extendedBasicBlock and ('genReg, 'optGenReg, 'fpReg) basicBlock = ('genReg, 'optGenReg, 'fpReg) basicBlock and regProperty = regProperty and preg = preg and pregOrZero = pregOrZero end end;