diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODE.sig b/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODE.sig index 4218a9b7..c31473f6 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODE.sig +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODE.sig @@ -1,455 +1,457 @@ (* Signature for the high-level ARM64 code Copyright David C. J. Matthews 2021-2 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) signature ARM64ICODE = sig type machineWord = Address.machineWord type address = Address.address type closureRef (* Registers. *) datatype xReg = XReg of Word8.word | XZero | XSP and vReg = VReg of Word8.word (* It is simpler to use a single type for all registers. *) datatype reg = GenReg of xReg | FPReg of vReg val X0: xReg and X1: xReg and X2: xReg and X3: xReg and X4: xReg and X5: xReg and X6: xReg and X7: xReg and X8: xReg and X9: xReg and X10: xReg and X11: xReg and X12: xReg and X13: xReg and X14: xReg and X15: xReg and X16: xReg and X17: xReg and X18: xReg and X19: xReg and X20: xReg and X21: xReg and X22: xReg and X23: xReg and X24: xReg and X25: xReg and X26: xReg and X27: xReg and X28: xReg and X29: xReg and X30: xReg val V0: vReg and V1: vReg and V2: vReg and V3: vReg and V4: vReg and V5: vReg and V6: vReg and V7: vReg val is32in64: bool and isBigEndian: bool (* Condition for conditional branches etc. *) datatype condition = CondEqual (* Z=1 *) | CondNotEqual (* Z=0 *) | CondCarrySet (* C=1 *) | CondCarryClear (* C=0 *) | CondNegative (* N=1 *) | CondPositive (* N=0 imcludes zero *) | CondOverflow (* V=1 *) | CondNoOverflow (* V=0 *) | CondUnsignedHigher (* C=1 && Z=0 *) | CondUnsignedLowOrEq (* ! (C=1 && Z=0) *) | CondSignedGreaterEq (* N=V *) | CondSignedLess (* N<>V *) | CondSignedGreater (* Z==0 && N=V *) | CondSignedLessEq (* !(Z==0 && N=V) *) (* The shift used in arithemtic operations. *) and shiftType = ShiftLSL of Word8.word | ShiftLSR of Word8.word | ShiftASR of Word8.word | ShiftNone datatype preg = PReg of int (* A pseudo-register - an abstract register. *) (* If the value is zero we can use X0/W0. *) datatype pregOrZero = SomeReg of preg | ZeroReg (* A location on the stack. May be more than word if this is a container or a handler entry. *) datatype stackLocn = StackLoc of {size: int, rno: int } (* This combines pregKind and stackLocn. *) datatype regProperty = RegPropGeneral (* A general register. *) | RegPropUntagged (* An untagged general register. *) | RegPropStack of int (* A stack location or container. *) | RegPropCacheTagged | RegPropCacheUntagged | RegPropMultiple (* The result of a conditional or case. May be defined at multiple points. *) (* The reference to a condition code. 
*) datatype ccRef = CcRef of int datatype loadType = Load64 | Load32 | Load16 | Load8 and opSize = OpSize32 | OpSize64 and logicalOp = LogAnd | LogOr | LogXor and callKind = Recursive | ConstantCode of machineWord | FullCall and floatSize = Float32 | Double64 and shiftDirection = ShiftLeft | ShiftRightLogical | ShiftRightArithmetic and multKind = MultAdd32 | MultSub32 | MultAdd64 | MultSub64 | SignedMultAddLong (* 32bit*32bit + 64bit => 64Bit *) | SignedMultHigh (* High order part of 64bit*64Bit *) - and fpUnary = NegFloat | NegDouble | AbsFloat | AbsDouble | ConvFloatToDble | ConvDbleToFloat + and fpUnary = + NegFloat | NegDouble | AbsFloat | AbsDouble | ConvFloatToDble | + ConvDbleToFloat | MoveDouble | MoveFloat and fpBinary = MultiplyFP | DivideFP | AddFP | SubtractFP (* Some of the atomic operations added in 8.1 *) and atomicOp = LoadAddAL | LoadUmaxAL | SwapAL | LoadAddAcquire | LoadUMaxAcquire | SwapRelease (* Function calls can have an unlimited number of arguments so it isn't always going to be possible to load them into registers. *) datatype 'genReg fnarg = ArgInReg of 'genReg | ArgOnStack of { wordOffset: int, container: stackLocn, field: int } datatype ('genReg, 'optGenReg, 'fpReg) arm64ICode = (* Move the contents of one preg to another. These are always 64-bits. *) MoveRegister of { source: 'genReg, dest: 'genReg } (* Numerical constant. *) | LoadNonAddressConstant of { source: Word64.word, dest: 'genReg } (* Floating point constant *) | LoadFPConstant of { source: Word64.word, dest: 'fpReg, floatSize: floatSize } (* Address constant. *) | LoadAddressConstant of { source: machineWord, dest: 'genReg } (* Load a value into a register using a constant, signed, byte offset. The offset is in the range of -256 to (+4095*unit size). *) | LoadWithConstantOffset of { base: 'genReg, dest: 'genReg, byteOffset: int, loadType: loadType } (* Similarly for FP registers. *) | LoadFPWithConstantOffset of { base: 'genReg, dest: 'fpReg, byteOffset: int, floatSize: floatSize } (* Load a value into a register using an index register. *) | LoadWithIndexedOffset of { base: 'genReg, dest: 'genReg, index: 'genReg, loadType: loadType, signExtendIndex: bool } (* Ditto for FP. *) | LoadFPWithIndexedOffset of { base: 'genReg, dest: 'fpReg, index: 'genReg, floatSize: floatSize, signExtendIndex: bool } (* Returns the current thread ID. Always a 64-bit value.. *) | GetThreadId of { dest: 'genReg } (* Convert a 32-in-64 object index into an absolute address. *) | ObjectIndexAddressToAbsolute of { source: 'genReg, dest: 'genReg } (* Convert an absolute address into an object index. *) | AbsoluteToObjectIndex of { source: 'genReg, dest: 'genReg } (* Allocate a fixed sized piece of memory and puts the absolute address into dest. bytesRequired is the total number of bytes including the length word and any alignment necessary for 32-in-64. saveRegs is the list of registers that need to be saved if we need to do a garbage collection. *) | AllocateMemoryFixed of { bytesRequired: Word64.word, dest: 'genReg, saveRegs: 'genReg list } (* Allocate a piece of memory. The size argument is an untagged value containing the number of words i.e. the same value used for InitialiseMemory and to store in the length word. *) | AllocateMemoryVariable of { size: 'genReg, dest: 'genReg, saveRegs: 'genReg list } (* Initialise a piece of memory by writing "size" copies of the value in "init". N.B. The size is an untagged value containing the number of words. 
*) | InitialiseMem of { size: 'genReg, addr: 'genReg, init: 'genReg } (* Mark the beginning of a loop. This is really only to prevent the initialisation code being duplicated in ICodeOptimise. *) | BeginLoop (* Set up the registers for a jump back to the start of a loop. *) | JumpLoop of { regArgs: {src: 'genReg fnarg, dst: 'genReg} list, stackArgs: {src: 'genReg fnarg, wordOffset: int, stackloc: stackLocn} list, checkInterrupt: 'genReg list option } (* Store a register using a constant, signed, byte offset. The offset is in the range of -256 to (+4095*unit size). *) | StoreWithConstantOffset of { source: 'genReg, base: 'genReg, byteOffset: int, loadType: loadType } (* Ditto for FP regs. *) | StoreFPWithConstantOffset of { source: 'fpReg, base: 'genReg, byteOffset: int, floatSize: floatSize } (* Store a register using an index register. *) | StoreWithIndexedOffset of { source: 'genReg, base: 'genReg, index: 'genReg, loadType: loadType, signExtendIndex: bool } (* and for FP regs. *) | StoreFPWithIndexedOffset of { source: 'fpReg, base: 'genReg, index: 'genReg, floatSize: floatSize, signExtendIndex: bool } (* Add/Subtract immediate. The destination is optional in which case XZero is used. ccRef is optional. If it is NONE the version of the instruction that does not generate a condition code is used. immed must be < 0wx1000. *) | AddSubImmediate of { source: 'genReg, dest: 'optGenReg, ccRef: ccRef option, immed: word, isAdd: bool, length: opSize } (* Add/Subtract register. As with AddSubImmediate, both the destination and cc are optional. *) | AddSubRegister of { base: 'genReg, shifted: 'genReg, dest: 'optGenReg, ccRef: ccRef option, isAdd: bool, length: opSize, shift: shiftType } (* Bitwise logical operations. The immediate value must be a valid bit pattern. ccRef can only be SOME if logOp is LogAnd. *) | LogicalImmediate of { source: 'genReg, dest: 'optGenReg, ccRef: ccRef option, immed: Word64.word, logOp: logicalOp, length: opSize } (* Register logical operations. ccRef can only be SOME if logOp is LogAnd.*) | LogicalRegister of { base: 'genReg, shifted: 'genReg, dest: 'optGenReg, ccRef: ccRef option, logOp: logicalOp, length: opSize, shift: shiftType } (* Shift a word by an amount specified in a register. *) | ShiftRegister of { direction: shiftDirection, dest: 'genReg, source: 'genReg, shift: 'genReg, opSize: opSize } (* The various forms of multiply all take three arguments and the general form is dest = M * N +/- A.. *) | Multiplication of { kind: multKind, dest: 'genReg, sourceA: 'optGenReg, sourceM: 'genReg, sourceN: 'genReg } (* Signed or unsigned division. Sets the result to zero if the divisor is zero. *) | Division of { isSigned: bool, dest: 'genReg, dividend: 'genReg, divisor: 'genReg, opSize: opSize } (* Start of function. Set the register arguments. stackArgs is the list of stack arguments. If the function has a real closure regArgs includes the closure register (X8). The register arguments include the return register (X30). *) | BeginFunction of { regArgs: ('genReg * xReg) list, fpRegArgs: ('fpReg * vReg) list, stackArgs: stackLocn list } (* Call a function. If the code address is a constant it is passed here. Otherwise the address is obtained by indirecting through X8 which has been loaded as one of the argument registers. The results are stored in the result registers, usually just X0. 
The "containers" argument is used to ensure that any container whose address is passed as one of the other arguments continues to be referenced until the function is called since there's a possibility that it isn't actually used after the function. *) | FunctionCall of { callKind: callKind, regArgs: ('genReg fnarg * xReg) list, stackArgs: 'genReg fnarg list, dests: ('genReg * xReg) list, fpRegArgs: ('fpReg * vReg) list, fpDests: ('fpReg * vReg) list, saveRegs: 'genReg list, containers: stackLocn list} (* Jump to a tail-recursive function. This is similar to FunctionCall but complicated for stack arguments because the stack and the return address need to be overwritten. stackAdjust is the number of words to remove (positive) or add (negative) to the stack before the call. currStackSize contains the number of items currently on the stack. *) | TailRecursiveCall of { callKind: callKind, regArgs: ('genReg fnarg * xReg) list, stackArgs: {src: 'genReg fnarg, stack: int} list, fpRegArgs: ('fpReg * vReg) list, stackAdjust: int, currStackSize: int } (* Return from the function. resultRegs are the registers containing the result, returnReg is the preg that contains the return address. *) | ReturnResultFromFunction of { results: ('genReg * xReg) list, returnReg: 'genReg, numStackArgs: int } (* Raise an exception. The packet is always loaded into X0. *) | RaiseExceptionPacket of { packetReg: 'genReg } (* Push a register to the stack. This is used both for a normal push, copies=1, and also to reserve a container. *) | PushToStack of { source: 'genReg, copies: int, container: stackLocn } (* Load a register from the stack. The container is the stack location identifier, the field is an offset in a container. *) | LoadStack of { dest: 'genReg, wordOffset: int, container: stackLocn, field: int } (* Store a value into the stack. *) | StoreToStack of { source: 'genReg, container: stackLocn, field: int, stackOffset: int } (* Set the register to the address of the container i.e. a specific offset on the stack. *) | ContainerAddress of { dest: 'genReg, container: stackLocn, stackOffset: int } (* Remove items from the stack. Used to remove containers or registers pushed to the stack.. *) | ResetStackPtr of { numWords: int } (* Tag a value by shifting and setting the tag bit. *) | TagValue of { source: 'genReg, dest: 'genReg, isSigned: bool, opSize: opSize } (* Shift a value to remove the tag bit. The cache is used if this is untagging a value that has previously been tagged. *) | UntagValue of { source: 'genReg, dest: 'genReg, isSigned: bool, opSize: opSize } (* Box a largeword value. Stores a value into a byte area. This can be implemented using AllocateMemoryFixed but keeping it separate makes optimisation easier. The result is always an address and needs to be converted to an object index on 32-in-64. *) | BoxLarge of { source: 'genReg, dest: 'genReg, saveRegs: 'genReg list } (* Load a value from a box. This can be implemented using a load but is kept separate to simplify optimisation. The source is always an absolute address. *) | UnboxLarge of { source: 'genReg, dest: 'genReg } (* Convert a floating point value into a value suitable for storing in the heap. This normally involves boxing except that 32-bit floats can be tagged in native 64-bits. *) | BoxTagFloat of { floatSize: floatSize, source: 'fpReg, dest: 'genReg, saveRegs: 'genReg list } (* The reverse of BoxTagFloat. *) | UnboxTagFloat of { floatSize: floatSize, source: 'genReg, dest: 'fpReg } (* Load a value with acquire semantics. 
This means that any other load in this thread after this sees the value of the shared memory at this point and not earlier. This is used for references and arrays to ensure that if another thread has built a data structure on the heap and then assigns the address to a shared ref this thread will see the updated heap and not any locally cached previous version. *) | LoadAcquire of { base: 'genReg, dest: 'genReg, loadType: loadType } (* Store a value with release semantics. This ensures that any other write completes before this operation and works with LoadAcquire. *) | StoreRelease of { base: 'genReg, source: 'genReg, loadType: loadType } (* This is a generalised constant shift which includes selection of a range of bits. *) | BitFieldShift of { source: 'genReg, dest: 'genReg, isSigned: bool, length: opSize, immr: word, imms: word } (* Copy a range of bits and insert it into another register. This is the only case where a register functions both as a source and a destination. *) | BitFieldInsert of { source: 'genReg, destAsSource: 'genReg, dest: 'genReg, length: opSize, immr: word, imms: word } (* Indexed case. *) | IndexedCaseOperation of { testReg: 'genReg } (* Exception handling. - Set up an exception handler. *) | PushExceptionHandler (* End of a handled section. Restore the previous handler. *) | PopExceptionHandler (* Marks the start of a handler. This sets the stack pointer and restores the old handler. Sets the exception packet register. *) | BeginHandler of { packetReg: 'genReg } (* Compare two vectors of bytes and set the condition code on the result. The registers are modified by the instruction. *) | CompareByteVectors of { vec1Addr: 'genReg, vec2Addr: 'genReg, length: 'genReg, ccRef: ccRef } (* Move a block of bytes (isByteMove true) or words (isByteMove false). The length is the number of items (bytes or words) to move. The registers are modified by the instruction. *) | BlockMove of { srcAddr: 'genReg, destAddr: 'genReg, length: 'genReg, isByteMove: bool } (* Add or subtract to the system stack pointer and optionally return the new value. This is used to allocate and deallocate C space. *) | AddSubXSP of { source: 'genReg, dest: 'optGenReg, isAdd: bool } (* Ensures the value will actually be referenced although it doesn't generate any code. *) | TouchValue of { source: 'genReg } (* Load a value at the address and get exclusive access. Always loads a 64-bit value. *) | LoadAcquireExclusive of { base: 'genReg, dest: 'genReg } (* Store a value into an address releasing the lock. Sets the result to either 0 or 1 if it succeeds or fails. *) | StoreReleaseExclusive of { base: 'genReg, source: 'optGenReg, result: 'genReg } (* Insert a memory barrier. dmb ish. *) | MemoryBarrier (* Convert an integer to a floating point value. *) | ConvertIntToFloat of { source: 'genReg, dest: 'fpReg, srcSize: opSize, destSize: floatSize } (* Convert a floating point value to an integer using the specified rounding mode. We could get an overflow here but fortunately the ARM generates a value that will cause an overflow when we tag it, provided we tag it explicitly. *) | ConvertFloatToInt of { source: 'fpReg, dest: 'genReg, srcSize: floatSize, destSize: opSize, rounding: IEEEReal.rounding_mode } - (* Unary floating point. This includes conversions between float and double. *) + (* Unary floating point. This includes moves and conversions between float and double. 
*) | UnaryFloatingPt of { source: 'fpReg, dest: 'fpReg, fpOp: fpUnary } (* Binary floating point: addition, subtraction, multiplication and division. *) | BinaryFloatingPoint of { arg1: 'fpReg, arg2: 'fpReg, dest: 'fpReg, fpOp: fpBinary, opSize: floatSize } (* Floating point comparison. *) | CompareFloatingPoint of { arg1: 'fpReg, arg2: 'fpReg, ccRef: ccRef, opSize: floatSize } (* Yield control during a spin-lock. *) | CPUYield (* Atomic operations added for ARM 8.1 *) | AtomicOperation of { base: 'genReg, source: 'optGenReg, dest: 'optGenReg, atOp: atomicOp } (* Debugging - fault if values don't match. *) | CacheCheck of { arg1: 'genReg, arg2: 'genReg } (* Destinations at the end of a basic block. *) and controlFlow = (* Unconditional branch to a label - should be a merge point. *) Unconditional of int (* Conditional branch. Jumps to trueJump if the condition is true, falseJump if false. *) | Conditional of { ccRef: ccRef, condition: condition, trueJump: int, falseJump: int } (* Exit - the last instruction of the block is a return, raise or tailcall. *) | ExitCode (* Indexed case - this branches to one of a number of labels *) | IndexedBr of int list (* Set up a handler. This doesn't cause an immediate branch but the state at the start of the handler is the state at this point. *) | SetHandler of { handler: int, continue: int } (* Unconditional branch to a handler. If an exception is raised explicitly within the scope of a handler. *) | UnconditionalHandle of int (* Conditional branch to a handler. Occurs if there is a call to a function within the scope of a handler. It may jump to the handler. *) | ConditionalHandle of { handler: int, continue: int } and ('genReg, 'optGenReg, 'fpReg) basicBlock = BasicBlock of { block: ('genReg, 'optGenReg, 'fpReg) arm64ICode list, flow: controlFlow } (* Return the successor blocks from a control flow. *) val successorBlocks: controlFlow -> int list type iCodeAbstract = (preg, pregOrZero, preg) arm64ICode and basicBlockAbstract = (preg, pregOrZero, preg) basicBlock and iCodeConcrete = (xReg, xReg, vReg) arm64ICode and basicBlockConcrete = (xReg, xReg, vReg) basicBlock val printICodeAbstract: basicBlockAbstract vector * (string -> unit) -> unit and printICodeConcrete: basicBlockConcrete vector * (string -> unit) -> unit (* Check whether this value is acceptable for LogicalImmediate. *) val isEncodableBitPattern: Word64.word * opSize -> bool (* This generates a BitField instruction with the appropriate values for immr and imms. 
*) val shiftConstant: { direction: shiftDirection, dest: preg, source: preg, shift: word, opSize: opSize } -> iCodeAbstract structure Sharing: sig type xReg = xReg and vReg = vReg and reg = reg and condition = condition and shiftType = shiftType and ('genReg, 'optGenReg, 'fpReg) arm64ICode = ('genReg, 'optGenReg, 'fpReg) arm64ICode and preg = preg and pregOrZero = pregOrZero and controlFlow = controlFlow and ('genReg, 'optGenReg, 'fpReg) basicBlock = ('genReg, 'optGenReg, 'fpReg) basicBlock and stackLocn = stackLocn and regProperty = regProperty and ccRef = ccRef and 'genReg fnarg = 'genReg fnarg and closureRef = closureRef and loadType = loadType and opSize = opSize and logicalOp = logicalOp and callKind = callKind and floatSize = floatSize and shiftDirection = shiftDirection and multKind = multKind and fpUnary = fpUnary and fpBinary = fpBinary and atomicOp = atomicOp end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64PREASSEMBLY.sig b/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64PREASSEMBLY.sig index 89568c67..ce76df1c 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64PREASSEMBLY.sig +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64PREASSEMBLY.sig @@ -1,268 +1,269 @@ (* Copyright (c) 2021-2 David C. J. Matthews This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public Licence version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public Licence for more details. You should have received a copy of the GNU Lesser General Public Licence along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) (* The pre-assembly layer goes below the icode and allows peep-hole optimisation. *) signature ARM64PREASSEMBLY = sig type closureRef type machineWord = Address.machineWord (* XZero and XSP are both encoded as 31 but the interpretation depends on the instruction The datatype definition is included here to allow for pattern matching on XSP and XZero. *) datatype xReg = XReg of Word8.word | XZero | XSP and vReg = VReg of Word8.word val X0: xReg and X1: xReg and X2: xReg and X3: xReg and X4: xReg and X5: xReg and X6: xReg and X7: xReg and X8: xReg and X9: xReg and X10: xReg and X11: xReg and X12: xReg and X13: xReg and X14: xReg and X15: xReg and X16: xReg and X17: xReg and X18: xReg and X19: xReg and X20: xReg and X21: xReg and X22: xReg and X23: xReg and X24: xReg and X25: xReg and X26: xReg and X27: xReg and X28: xReg and X29: xReg and X30: xReg val X_MLHeapLimit: xReg (* ML Heap limit pointer *) and X_MLAssemblyInt: xReg (* ML assembly interface pointer. *) and X_MLHeapAllocPtr: xReg (* ML Heap allocation pointer. *) and X_MLStackPtr: xReg (* ML Stack pointer. *) and X_LinkReg: xReg (* Link reg - return address *) and X_Base32in64: xReg (* X24 is used for the heap base in 32-in-64. *) val V0: vReg and V1: vReg and V2: vReg and V3: vReg and V4: vReg and V5: vReg and V6: vReg and V7: vReg (* Condition for conditional branches etc. *) datatype condition = CondEqual (* Z=1 *) | CondNotEqual (* Z=0 *) | CondCarrySet (* C=1 *) | CondCarryClear (* C=0 *) | CondNegative (* N=1 *) | CondPositive (* N=0 imcludes zero *) | CondOverflow (* V=1 *) | CondNoOverflow (* V=0 *) | CondUnsignedHigher (* C=1 && Z=0 *) | CondUnsignedLowOrEq (* ! 
(C=1 && Z=0) *) | CondSignedGreaterEq (* N=V *) | CondSignedLess (* N<>V *) | CondSignedGreater (* Z==0 && N=V *) | CondSignedLessEq (* !(Z==0 && N=V) *) val invertTest: condition -> condition (* i.e. jump when the condition is not true. *) val condToString: condition -> string datatype shiftType = ShiftLSL of Word8.word | ShiftLSR of Word8.word | ShiftASR of Word8.word | ShiftNone datatype wordSize = WordSize32 | WordSize64 datatype 'a extend = ExtUXTB of 'a (* Unsigned extend byte *) | ExtUXTH of 'a (* Unsigned extend byte *) | ExtUXTW of 'a (* Unsigned extend byte *) | ExtUXTX of 'a (* Left shift *) | ExtSXTB of 'a (* Sign extend byte *) | ExtSXTH of 'a (* Sign extend halfword *) | ExtSXTW of 'a (* Sign extend word *) | ExtSXTX of 'a (* Left shift *) (* Load/store instructions have only a single bit for the shift. For byte operations this is one bit shift; for others it scales by the size of the operand if set. *) datatype scale = ScaleOrShift | NoScale datatype loadType = Load64 | Load32 | Load16 | Load8 and opSize = OpSize32 | OpSize64 and logicalOp = LogAnd | LogOr | LogXor and floatSize = Float32 | Double64 and shiftDirection = ShiftLeft | ShiftRightLogical | ShiftRightArithmetic and multKind = MultAdd32 | MultSub32 | MultAdd64 | MultSub64 | SignedMultAddLong (* 32bit*32bit + 64bit => 64Bit *) | SignedMultHigh (* High order part of 64bit*64Bit *) - and fpUnary = NegFloat | NegDouble | AbsFloat | AbsDouble | ConvFloatToDble | ConvDbleToFloat + and fpUnary = + NegFloat | NegDouble | AbsFloat | AbsDouble | ConvFloatToDble | + ConvDbleToFloat | MoveDouble | MoveFloat and fpBinary = MultiplyFP | DivideFP | AddFP | SubtractFP and unscaledType = NoUpdate | PreIndex | PostIndex and condSet = CondSet | CondSetIncr | CondSetInvert | CondSetNegate and bitfieldKind = BFUnsigned | BFSigned | BFInsert and brRegType = BRRBranch | BRRAndLink | BRRReturn (* Some of the atomic operations added in 8.1 *) and atomicOp = LoadAddAL | LoadUmaxAL | SwapAL | LoadAddAcquire | LoadUMaxAcquire | SwapRelease type label and labelMaker val createLabelMaker: unit -> labelMaker and createLabel: labelMaker -> label datatype precode = (* Basic instructions *) AddImmediate of {regN: xReg, regD: xReg, immed: word, shifted: bool, opSize: opSize, setFlags: bool} | SubImmediate of {regN: xReg, regD: xReg, immed: word, shifted: bool, opSize: opSize, setFlags: bool} | AddShiftedReg of {regM: xReg, regN: xReg, regD: xReg, shift: shiftType, opSize: opSize, setFlags: bool} | SubShiftedReg of {regM: xReg, regN: xReg, regD: xReg, shift: shiftType, opSize: opSize, setFlags: bool} | AddExtendedReg of {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend, opSize: opSize, setFlags: bool} | SubExtendedReg of {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend, opSize: opSize, setFlags: bool} | MultiplyAndAddSub of {regM: xReg, regN: xReg, regA: xReg, regD: xReg, multKind: multKind} | DivideRegs of {regM: xReg, regN: xReg, regD: xReg, isSigned: bool, opSize: opSize} | LogicalShiftedReg of {regM: xReg, regN: xReg, regD: xReg, shift: shiftType, logOp: logicalOp, opSize: opSize, setFlags: bool} | LoadRegScaled of {regT: xReg, regN: xReg, unitOffset: int, loadType: loadType} | LoadFPRegScaled of {regT: vReg, regN: xReg, unitOffset: int, floatSize: floatSize} | StoreRegScaled of {regT: xReg, regN: xReg, unitOffset: int, loadType: loadType} | StoreFPRegScaled of {regT: vReg, regN: xReg, unitOffset: int, floatSize: floatSize} | LoadRegUnscaled of {regT: xReg, regN: xReg, byteOffset: int, loadType: loadType, unscaledType: 
unscaledType} | StoreRegUnscaled of {regT: xReg, regN: xReg, byteOffset: int, loadType: loadType, unscaledType: unscaledType} | LoadFPRegUnscaled of {regT: vReg, regN: xReg, byteOffset: int, floatSize: floatSize, unscaledType: unscaledType} | StoreFPRegUnscaled of {regT: vReg, regN: xReg, byteOffset: int, floatSize: floatSize, unscaledType: unscaledType} | LoadRegIndexed of {regT: xReg, regN: xReg, regM: xReg, loadType: loadType, option: scale extend} | StoreRegIndexed of {regT: xReg, regN: xReg, regM: xReg, loadType: loadType, option: scale extend} | LoadFPRegIndexed of {regT: vReg, regN: xReg, regM: xReg, floatSize: floatSize, option: scale extend} | StoreFPRegIndexed of {regT: vReg, regN: xReg, regM: xReg, floatSize: floatSize, option: scale extend} (* LoadAcquire and StoreRelease are used for mutables. *) | LoadAcquireReg of {regN: xReg, regT: xReg, loadType: loadType} | StoreReleaseReg of {regN: xReg, regT: xReg, loadType: loadType} (* LoadAcquireExclusiveRegister and StoreReleaseExclusiveRegister are used for mutexes. *) | LoadAcquireExclusiveRegister of {regN: xReg, regT: xReg} | StoreReleaseExclusiveRegister of {regS: xReg, regT: xReg, regN: xReg} | MemBarrier (* Additional atomic operations. *) | AtomicExtension of { regT: xReg, regN: xReg, regS: xReg, atOp: atomicOp } | LoadRegPair of { regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int, loadType: loadType, unscaledType: unscaledType} | StoreRegPair of { regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int, loadType: loadType, unscaledType: unscaledType} | LoadFPRegPair of { regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int, floatSize: floatSize, unscaledType: unscaledType} | StoreFPRegPair of { regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int, floatSize: floatSize, unscaledType: unscaledType} | ConditionalSet of {regD: xReg, regTrue: xReg, regFalse: xReg, cond: condition, condSet: condSet, opSize: opSize} | BitField of {immr: word, imms: word, regN: xReg, regD: xReg, opSize: opSize, bitfieldKind: bitfieldKind} | ShiftRegisterVariable of {regM: xReg, regN: xReg, regD: xReg, opSize: opSize, shiftDirection: shiftDirection} | BitwiseLogical of { bits: Word64.word, regN: xReg, regD: xReg, opSize: opSize, setFlags: bool, logOp: logicalOp} (* Floating point *) - | MoveFPToFP of { regN: vReg, regD: vReg, floatSize: floatSize} | MoveGeneralToFP of { regN: xReg, regD: vReg, floatSize: floatSize} | MoveFPToGeneral of {regN: vReg, regD: xReg, floatSize: floatSize} | CvtIntToFP of { regN: xReg, regD: vReg, floatSize: floatSize, opSize: opSize} | CvtFloatToInt of { round: IEEEReal.rounding_mode, regN: vReg, regD: xReg, floatSize: floatSize, opSize: opSize} | FPBinaryOp of { regM: vReg, regN: vReg, regD: vReg, floatSize: floatSize, fpOp: fpBinary} | FPComparison of { regM: vReg, regN: vReg, floatSize: floatSize} | FPUnaryOp of {regN: vReg, regD: vReg, fpOp: fpUnary} (* Branches and Labels. 
*) | SetLabel of label | ConditionalBranch of condition * label | UnconditionalBranch of label | BranchAndLink of label | BranchReg of {regD: xReg, brRegType: brRegType } | LoadLabelAddress of xReg * label | TestBitBranch of { test: xReg, bit: Word8.word, label: label, onZero: bool } | CompareBranch of { test: xReg, label: label, onZero: bool, opSize: opSize } (* Composite instructions *) | MoveXRegToXReg of {sReg: xReg, dReg: xReg} | LoadNonAddr of xReg * Word64.word | LoadFPConst of {dest: vReg, value: Word64.word, floatSize: floatSize, work: xReg} | LoadAddr of xReg * machineWord | RTSTrap of { rtsEntry: int, work: xReg, save: xReg list } (* Allocate memory - bytes includes the length word and rounding. *) | AllocateMemoryFixedSize of { bytes: word, dest: xReg, save: xReg list, work: xReg } (* Allocate memory - sizeReg is number of ML words needed for cell. *) | AllocateMemoryVariableSize of { sizeReg: xReg, dest: xReg, save: xReg list, work: xReg } (* Branch table for indexed case. startLabel is the address of the first label in the list. The branch table is a sequence of unconditional branches. *) | BranchTable of { startLabel: label, brTable: label list } | LoadGlobalHeapBaseInCallback of xReg | Yield (* Wrapper for BitField *) val shiftConstant: { direction: shiftDirection, regD: xReg, regN: xReg, shift: word, opSize: opSize } -> precode (* Convenient sequences. N.B. These are in reverse order. *) val boxDouble: {source: vReg, destination: xReg, workReg: xReg, saveRegs: xReg list} * precode list -> precode list and boxFloat: {source: vReg, destination: xReg, workReg: xReg, saveRegs: xReg list} * precode list -> precode list and boxSysWord: {source: xReg, destination: xReg, workReg: xReg, saveRegs: xReg list} * precode list -> precode list (* Create the vector of code from the list of instructions and update the closure reference to point to it. *) val generateFinalCode: {instrs: precode list, name: string, parameters: Universal.universal list, resultClosure: closureRef, profileObject: machineWord, labelMaker: labelMaker} -> unit (* Offsets in the assembly code interface pointed at by X26 These are in units of 64-bits NOT bytes. 
*) val heapOverflowCallOffset: int and stackOverflowCallOffset: int and stackOverflowXCallOffset: int and exceptionHandlerOffset: int and stackLimitOffset: int and threadIdOffset: int and heapLimitPtrOffset: int and heapAllocPtrOffset: int and mlStackPtrOffset: int and exceptionPacketOffset: int val is32in64: bool and isBigEndian: bool val isEncodableBitPattern: Word64.word * wordSize -> bool structure Sharing: sig type closureRef = closureRef type loadType = loadType type opSize = opSize type logicalOp = logicalOp type floatSize = floatSize type shiftDirection = shiftDirection type multKind = multKind type fpUnary = fpUnary type fpBinary = fpBinary type unscaledType = unscaledType type condSet = condSet type bitfieldKind = bitfieldKind type brRegType = brRegType type precode = precode type xReg = xReg type vReg = vReg type label = label type labelMaker = labelMaker type condition = condition type shiftType = shiftType type wordSize = wordSize type 'a extend = 'a extend type scale = scale type atomicOp = atomicOp end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64AllocateRegisters.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64AllocateRegisters.ML index 00136a4b..395f983e 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64AllocateRegisters.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64AllocateRegisters.ML @@ -1,1329 +1,1354 @@ (* Copyright David C. J. Matthews 2016-22 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64AllocateRegisters( structure Arm64ICode: ARM64ICODE structure Identify: ARM64IDENTIFYREFERENCES structure IntSet: INTSET sharing Arm64ICode.Sharing = Identify.Sharing = IntSet ): ARM64ALLOCATEREGISTERS = struct open Arm64ICode open Identify open IntSet exception InternalError = Misc.InternalError val checkCache = false (* Use the cache *) datatype allocateResult = AllocateSuccess of basicBlockConcrete vector | AllocateFailure of intSet list (* General registers. X24 is used as the global heap base in 32-in-64. X30 is the return address set by blr but is otherwise a general register. Put the argument registers at the end of the list so they'll only be used when hinted. *) val generalRegisters = map GenReg ([X9, X10, X11, X12, X13, X14, X15, X19, X20, X21, X22, X23, X0, X1, X2, X3, X4, X5, X6, X7, X8, X30] @ (if is32in64 then [] else [X24])) val floatingPtRegisters = map FPReg [V7, V6, V5, V4, V3, V2, V1] type conflictState = { conflicts: intSet, realConflicts: reg list } type triple = {instr: iCodeAbstract, current: intSet, active: intSet} exception InternalError = Misc.InternalError (* Get the conflict states, allocate registers and return the code with the allocated registers if it is successful. *) fun allocateRegisters{blocks, regProps, maxPRegs, ...} = let (* Other registers that conflict with this i.e. cannot share the same real register. 
*) val regConflicts = Array.array(maxPRegs, emptySet) (* Real registers that cannot be used for this because they are needed for an instruction. Only X30 in calls and RTS traps. *) and regRealConflicts = Array.array(maxPRegs, []: reg list) fun addConflictsTo(addTo, conflicts) = List.app(fn aReg => Array.update(regConflicts, aReg, union(Array.sub(regConflicts, aReg), conflicts))) addTo (* To reserve a register we need to add the real register to the real conflict sets of all the abstract conflicts. *) local fun isInset reg set = List.exists (fn r => r = reg) set in fun reserveRegister(reserveFor, reg) = let val absConflicts = Array.sub(regConflicts, reserveFor) fun addConflict i = if i = reserveFor then () else addRealConflict (i, reg) in List.app addConflict (setToList absConflicts) end and addRealConflict (i, reg) = let val currentConflicts = Array.sub(regRealConflicts, i) in if isInset reg currentConflicts then () else Array.update(regRealConflicts, i, reg :: currentConflicts) end end fun conflictsForInstr passThrough {instr, current, ...} = let val {sources, dests} = getInstructionRegisters instr fun regNo(PReg i) = i val destRegNos = map regNo dests and sourceRegNos = map regNo sources val destSet = listToSet destRegNos val afterRemoveDests = minus(current, destSet) local (* In almost all circumstances the destination and sources don't conflict and the same register can be used as a destination and a source. BoxLarge can only store the value after the memory has been allocated. BitFieldInsert has to copy the "destAsSource" value into the destination so cannot use the same register for the "source". *) val postInstruction = case instr of BoxLarge _ => destRegNos @ sourceRegNos | BoxTagFloat _ => destRegNos @ sourceRegNos (* Not sure about this. *) | BitFieldInsert{source, ...} => regNo source :: destRegNos | _ => destRegNos in (* If there is more than one destination they conflict with each other. *) val () = addConflictsTo(postInstruction, listToSet postInstruction); (* Mark conflicts for the destinations, i.e. after the instruction. The destinations conflict with the registers that are used subsequently. *) val () = addConflictsTo(postInstruction, current); val () = addConflictsTo(postInstruction, passThrough); (* Mark conflicts for the sources i.e. before the instruction. *) (* Sources must be set up as conflicts with each other i.e. when we come to allocate registers we must choose different real registers for different abstract registers. *) val () = addConflictsTo(sourceRegNos, listToSet sourceRegNos) val () = addConflictsTo(sourceRegNos, afterRemoveDests); val () = addConflictsTo(sourceRegNos, passThrough) end (* I'm not sure if this is needed. There was a check in the old code to ensure that different registers were used for loop variables even if they were actually unused. This may happen anyway. Comment and code copied from X86 version. Retain it for the moment. *) val () = case instr of JumpLoop{regArgs, ...} => let val destRegs = List.foldl(fn ({dst=PReg loopReg, ...}, dests) => loopReg :: dests) [] regArgs in addConflictsTo(destRegs, listToSet destRegs); addConflictsTo(destRegs, current); addConflictsTo(destRegs, passThrough) end | _ => () (* Certain instructions are specific as to the real registers. *) val () = case instr of ReturnResultFromFunction{ returnReg=PReg retReg, ... } => (* We're going to put the return value in X0 so we can't use that for the return address. 
*) addRealConflict(retReg, GenReg X0) | RaiseExceptionPacket{ packetReg } => (* This wasn't needed previously because we always pushed the registers across an exception. *) reserveRegister(regNo packetReg, GenReg X0) | BeginHandler { packetReg, ...} => reserveRegister(regNo packetReg, GenReg X0) | FunctionCall { dests, regArgs, ...} => (* This is only needed if we are saving the registers rather than marking them as "must push". *) let val () = List.app(fn (PReg pr, r) => reserveRegister(pr, GenReg r)) dests (* The argument registers also conflict. In order to execute this call we need to load the arguments into specific registers so we can't use them for values that we want after the call. *) val toReserve = X30 :: List.map #2 regArgs in List.app(fn i => List.app(fn r => addRealConflict(i, GenReg r)) toReserve) (setToList passThrough @ setToList afterRemoveDests) end (* We can't use X30 as the result because it's needed for the return addr if we have to GC. *) | AllocateMemoryFixed{dest=PReg destReg, ...} => addRealConflict(destReg, GenReg X30) | AllocateMemoryVariable{dest=PReg destReg, ...} => addRealConflict(destReg, GenReg X30) | BoxLarge{dest=PReg destReg, ...} => addRealConflict(destReg, GenReg X30) | BoxTagFloat{dest=PReg destReg, ...} => addRealConflict(destReg, GenReg X30) (* Could exclude floats on native addr. *) | _ => () in () end (* Process the block. *) fun conflictsForBlock(ExtendedBasicBlock{block, passThrough, exports, ...}) = let (* We need to establish conflicts between all the registers active at the end of the block since they may not be established elsewhere. This isn't necessary for an unconditional branch since the same registers will be included in the block that is the target of the branch, possibly along with others. However if this is a conditional or indexed branch we may have different sets at each of the targets and we have to ensure that all the registers differ. *) val united = union(exports, passThrough) val () = addConflictsTo(setToList united, united) val () = List.app (conflictsForInstr passThrough) block in () end val () = Vector.app conflictsForBlock blocks (* Hint values. The idea of hints is that by using a hinted register we may avoid an unnecessary move instruction. realHints is set when a pseudo-register is going to be loaded from a specific register e.g. a register argument, or moved into one e.g. X0 for the function result. friends is set to the other pReg that may be associated with the pReg. Typically this is where we have a merge register that we move some value into. *) val realHints = Array.array(maxPRegs, NONE: reg option) (* Sources and destinations. These indicate the registers that are the sources and destinations of the indexing register and are used as hints. If a register has been allocated for a source or destination we may be able to reuse it. *) val sourceRegs = Array.array(maxPRegs, []: int list) and destinationRegs = Array.array(maxPRegs, []: int list) local (* Real hints. If this is the source of a value e.g. a function argument in a register, we'll use it directly. If, though, this is the result of a function and we want the result to end up in a specific register we want to propagate it to any pReg that moves its value into this. *) fun addRealHint(r, reg) = case Array.sub(realHints, r) of SOME _ => () | NONE => ( (* Add to this pReg *) Array.update(realHints, r, SOME reg); (* and to any other pReg that moves here. 
*) List.app(fn r => addRealHint(r, reg)) (Array.sub(sourceRegs, r)) ) fun addSourceAndDestinationHint{src, dst} = let val conflicts = Array.sub(regConflicts, src) in (* If they conflict we can't add them. *) if member(dst, conflicts) then () else let val currentDests = Array.sub(destinationRegs, src) val currentSources = Array.sub(sourceRegs, dst) in (* Add the destination for this source i.e. the registers we move this source into. *) if List.exists(fn i => i=dst) currentDests then () else Array.update(destinationRegs, src, dst :: currentDests); (* Add the source to the list of sources for this destination. A merge register may have several sources, a different one for each path. If the destination has a real hint we want to propagate that back. That isn't needed for the destinations because we allocate the registers from the start forward. *) if List.exists(fn i => i=src) currentSources then () else let val sources = src :: currentSources val () = Array.update(sourceRegs, dst, sources) in case Array.sub(realHints, dst) of NONE => () | SOME real => List.app(fn r => addRealHint(r, real)) sources end end end (* Add the hints to steer the register allocation. The idea is to avoid moves between registers by getting values into the appropriate register in advance. We don't actually need to add real hints where the real register is providing the value, e.g. BeginFunction, because the allocation process will take care of that. *) fun addHints{instr=MoveRegister{source=PReg sreg, dest=PReg dreg, ...}, ...} = addSourceAndDestinationHint {src=sreg, dst=dreg} | addHints{instr=BitFieldInsert{destAsSource=PReg dsReg, dest=PReg dReg, ...}, ...} = (* The "destAsSource" is the destination if some bits are retained. *) addSourceAndDestinationHint {src=dsReg, dst=dReg} | addHints{instr=ReturnResultFromFunction { results, ... 
}, ...} = List.app(fn(PReg pr, r) => addRealHint(pr, GenReg r)) results | addHints{instr=JumpLoop{regArgs, ...}, ...} = let fun addRegArg {src=ArgInReg(PReg argReg), dst=PReg resReg} = addSourceAndDestinationHint {dst=resReg, src=argReg} | addRegArg {src=ArgOnStack _, ...} = () in List.app addRegArg regArgs end | addHints{instr=BeginFunction{regArgs, fpRegArgs, ...}, ...} = ( List.app (fn (PReg pr, reg) => addRealHint(pr, GenReg reg)) regArgs; List.app (fn (PReg pr, reg) => addRealHint(pr, FPReg reg)) fpRegArgs ) | addHints{instr=TailRecursiveCall{regArgs, fpRegArgs, ...}, ...} = let fun setHint(ArgInReg(PReg pr), reg) = addRealHint(pr, GenReg reg) | setHint _ = () in List.app setHint regArgs; List.app (fn(PReg pr, reg) => addRealHint(pr, FPReg reg)) fpRegArgs end | addHints{instr=FunctionCall{regArgs, dests, fpRegArgs, fpDests, ...}, ...} = let fun setHint(ArgInReg(PReg pr), reg) = addRealHint(pr, GenReg reg) | setHint _ = () in List.app(fn(PReg pr, r) => addRealHint(pr, GenReg r)) dests; List.app(fn(PReg pr, r) => addRealHint(pr, FPReg r)) fpDests; List.app setHint regArgs; List.app (fn(PReg pr, reg) => addRealHint(pr, FPReg reg)) fpRegArgs end (* Exception packets are in X0 *) | addHints{instr=RaiseExceptionPacket{ packetReg=PReg preg }, ...} = addRealHint(preg, GenReg X0) | addHints{instr=BeginHandler{ packetReg=PReg preg }, ...} = addRealHint(preg, GenReg X0) + | addHints{instr=UnaryFloatingPt{source=PReg sreg, dest=PReg dreg, fpOp=MoveFloat}, ...} = + addSourceAndDestinationHint {src=sreg, dst=dreg} + + | addHints{instr=UnaryFloatingPt{source=PReg sreg, dest=PReg dreg, fpOp=MoveDouble}, ...} = + addSourceAndDestinationHint {src=sreg, dst=dreg} + | addHints _ = () in val () = Vector.app(fn ExtendedBasicBlock { block, ...} => List.app addHints block) blocks end val allocatedRegs = Array.array(maxPRegs, NONE: reg option) val failures = ref []: intSet list ref (* Find a real register for a preg. 1. If a register is already allocated use that. 2. Try the "preferred" register if one has been given. 3. Try the realHints value if there is one. 4. See if there is a "friend" that has an appropriate register 5. Look at all the registers and find one. *) fun findRegister(r, pref, regSet, cache) = case Array.sub(allocatedRegs, r) of SOME reg => reg | NONE => let val conflicts = Array.sub(regConflicts, r) and realConflicts = Array.sub(regRealConflicts, r) (* Find the registers we've already allocated that may conflict. *) val conflictingRegs = List.mapPartial(fn i => Array.sub(allocatedRegs, i)) (setToList conflicts) @ realConflicts fun isFree aReg = not (List.exists(fn i => i=aReg) conflictingRegs) fun tryAReg NONE = NONE | tryAReg (somePref as SOME prefReg) = if isFree prefReg then (Array.update(allocatedRegs, r, somePref); somePref) else NONE (* Search the sources and destinations to see if a register has already been allocated or there is a hint. *) fun findAFriend([], [], _) = NONE | findAFriend(aDest :: otherDests, sources, alreadySeen) = let val possReg = case Array.sub(allocatedRegs, aDest) of v as SOME _ => tryAReg v | NONE => tryAReg(Array.sub(realHints, aDest)) in case possReg of reg as SOME _ => reg | NONE => let (* Add the destinations of the destinations to the list if they don't conflict and haven't been seen. 
*) fun newFriend f = not(List.exists (fn n => n=f) alreadySeen) andalso not(member(f, conflicts)) val fOfF = List.filter newFriend (Array.sub(destinationRegs, aDest)) in findAFriend(otherDests @ fOfF, sources, aDest :: alreadySeen) end end | findAFriend([], aSrc :: otherSrcs, alreadySeen) = let val possReg = case Array.sub(allocatedRegs, aSrc) of v as SOME _ => tryAReg v | NONE => tryAReg(Array.sub(realHints, aSrc)) in case possReg of reg as SOME _ => reg | NONE => let (* Add the sources of the sources to the list if they don't conflict and haven't been seen. *) fun newFriend f = not(List.exists (fn n => n=f) alreadySeen) andalso not(member(f, conflicts)) val fOfF = List.filter newFriend (Array.sub(sourceRegs, aSrc)) in findAFriend([], otherSrcs @ fOfF, aSrc :: alreadySeen) end end in case tryAReg pref of SOME r => r | NONE => ( case tryAReg (Array.sub(realHints, r)) of SOME r => r | NONE => ( case findAFriend(Array.sub(destinationRegs, r), Array.sub(sourceRegs, r), []) of SOME r => r (* Look through the registers to find one that's free. First try excluding the cache registers. *) | NONE => let (* First try filtering all the cache registers to see if we can find a register. If not see if it works by *) fun filterCache(filteredRegset, []) = List.find isFree filteredRegset | filterCache(filteredRegset, (cReg, _) :: cache) = ( case filterCache(List.filter(fn r => r <> cReg) filteredRegset, cache) of NONE => if isFree cReg then SOME cReg else NONE | result => result ) val pick = case filterCache(regSet, cache) of SOME reg => reg | NONE => ( (* This failed. We're going to have to spill something. *) failures := conflicts :: ! failures; hd regSet (* Return something to allow this pass to complete *) ) val () = Array.update(allocatedRegs, r, SOME pick) in pick end ) ) end (* Turn the abstract icode into a concrete version by allocating the registers. *) local fun asGenReg(GenReg reg) = reg | asGenReg _ = raise InternalError "asGenReg" and asFPReg(FPReg reg) = reg | asFPReg _ = raise InternalError "asFPReg" datatype cacheItem = CacheStack of stackLocn (* A value loaded from the stack. *) | CacheAbsAddress of preg (* 32-in-64: An absolute address from an object ID *) | CacheAbsAddrOnStack of stackLocn (* 32-in-64: An absolute address from an object loaded from the stack. *) (* Cache hints: Try to use the same register for values that can be cached. This increases the chances that we will be able to retain the cache when we merge different branches. *) val cacheHints = Array.array(maxPRegs, NONE: reg option) (* Remove any reference to newly allocated registers from the cache. Also used after block move and comparison that modify registers *) fun pruneCache(reg: reg, cache) = List.filter(fn (r, _) => r <> reg) cache (* Return the cache registers that contain valid addresses. *) fun cachedAddressRegs cache = List.map (asGenReg o #1) cache (* Merge the cache states *) fun mergeCacheStates ([]: (reg * cacheItem) list list) = []: (reg * cacheItem) list | mergeCacheStates [single] = single | mergeCacheStates (many as first :: rest) = let (* Generally we will either be unable to merge and have an empty cache or will have just one or two entries. *) (* Find the shortest. If it's empty we're done. 
*) fun findShortest(_, [], _) = [] | findShortest(_, shortest, []) = shortest | findShortest(len, shortest, hd::tl) = let val hdLen = List.length hd in if hdLen < len then findShortest(hdLen, hd, tl) else findShortest(len, shortest, tl) end val shortest = findShortest(List.length first, first, rest) (* Find the item we're caching for. If it is in a different register we can't use it. *) fun findItem search (hd::tl) = search = hd orelse findItem search tl | findItem _ [] = false (* It's present if it's in all the sources. *) fun present search = List.all(findItem search) many val filtered = List.foldl (fn (search, l) => if present search then search :: l else l) [] shortest in filtered end fun allocateNewDestination(PReg r, pref, regSet, cacheList) = case Array.sub(allocatedRegs, r) of SOME reg => ( case Vector.sub(regProps, r) of RegPropMultiple => (reg, pruneCache(reg, cacheList)) (* This is allowed for merge registers *) | _ => raise InternalError "Register defined at multiple points" ) | NONE => let val reg = findRegister(r, pref, regSet, cacheList) in (reg, pruneCache(reg, cacheList)) end fun allocateGenReg(r, hint, cache) = let val (reg, newCache) = allocateNewDestination(r, hint, generalRegisters, cache) in (asGenReg reg, newCache) end and allocateFPReg(r, hint, cache) = let val (reg, newCache) = allocateNewDestination(r, hint, floatingPtRegisters, cache) in (asFPReg reg, newCache) end and allocateGenRegOrZero(ZeroReg, _, cache) = (XZero, cache) | allocateGenRegOrZero(SomeReg reg, hint, cache) = allocateGenReg(reg, hint, cache) fun getAllocatedGenReg(PReg r) = case Array.sub(allocatedRegs, r) of SOME(GenReg reg) => reg | _ => raise InternalError "getAllocatedGenReg" and getAllocatedFPReg(PReg r) = case Array.sub(allocatedRegs, r) of SOME(FPReg reg) => reg | _ => raise InternalError "getAllocatedFPReg" fun getAllocatedGenRegOrZero ZeroReg = XZero | getAllocatedGenRegOrZero (SomeReg reg) = getAllocatedGenReg reg fun getAllocatedArg(ArgInReg reg) = ArgInReg(getAllocatedGenReg reg) | getAllocatedArg(ArgOnStack stackLoc) = ArgOnStack stackLoc val getSaveRegs = List.map getAllocatedGenReg (* Reversed cons and append to make the code easier to read. *) infix 5 <::> <@> fun tl <::> hd = hd :: tl (*and snd <@> fst = fst @ snd*) fun absToConcrete([], context, code) = (context, code) | absToConcrete({instr=MoveRegister{ source, dest}, ...} :: rest, cache, code) = let (* Try to use the register we've allocated for the source as the destination so that we can eliminate this instruction altogether. 
*) val sourceReg = getAllocatedGenReg source val (destReg, newCache) = allocateGenReg(dest, SOME(GenReg sourceReg), cache) in if sourceReg = destReg then absToConcrete(rest, newCache, code) else absToConcrete(rest, newCache, code <::> MoveRegister { source=sourceReg, dest=destReg}) end | absToConcrete({instr=LoadNonAddressConstant { dest, source}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> LoadNonAddressConstant { dest=destReg, source=source}) end | absToConcrete({instr=LoadFPConstant { dest, source, floatSize }, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateFPReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> LoadFPConstant { dest=destReg, source=source, floatSize=floatSize}) end | absToConcrete({instr=LoadAddressConstant { dest, source}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> LoadAddressConstant { dest=destReg, source=source}) end | absToConcrete({instr=LoadWithConstantOffset { base, dest, byteOffset, loadType}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> LoadWithConstantOffset { base=getAllocatedGenReg base, dest=destReg, byteOffset=byteOffset, loadType=loadType}) end | absToConcrete({instr=LoadFPWithConstantOffset { base, dest, byteOffset, floatSize}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateFPReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> LoadFPWithConstantOffset { base=getAllocatedGenReg base, dest=destReg, byteOffset=byteOffset, floatSize=floatSize}) end | absToConcrete({instr=LoadWithIndexedOffset { base, dest, index, loadType, signExtendIndex}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> LoadWithIndexedOffset { base=getAllocatedGenReg base, dest=destReg, index=getAllocatedGenReg index, loadType=loadType, signExtendIndex=signExtendIndex}) end | absToConcrete({instr=LoadFPWithIndexedOffset { base, dest, index, floatSize, signExtendIndex}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateFPReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> LoadFPWithIndexedOffset { base=getAllocatedGenReg base, dest=destReg, index=getAllocatedGenReg index, floatSize=floatSize, signExtendIndex=signExtendIndex}) end | absToConcrete({instr=GetThreadId { dest}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> GetThreadId { dest=destReg}) end | absToConcrete({instr=ObjectIndexAddressToAbsolute { source as PReg srcNo, dest=destOiA as PReg doia}, kill, ...} :: rest, cache, code) = let (* See if this is in the cache and use it if it is. If this is the last reference to this source entry we don't want it in the cache any longer. *) val killThis = member(srcNo, kill) val (newCode, destReg, newCache, next) = case List.find(fn (_, CacheAbsAddress c) => c=source | _ => false) cache of SOME (srcReg, _) => let (* Try to use the cache register as the destination if we can. 
*) val (destReg, newCache) = allocateNewDestination(destOiA, SOME srcReg, generalRegisters, cache) val dReg = asGenReg destReg and sReg = asGenReg srcReg in if checkCache then (code <::> MoveRegister{source=sReg, dest=X17} <::> ObjectIndexAddressToAbsolute { source=getAllocatedGenReg source, dest=dReg} <::> CacheCheck{ arg1=dReg, arg2=X17 }, destReg, if killThis then pruneCache(srcReg, newCache) else newCache, rest) else if dReg = sReg then (code, destReg, newCache, rest) (* We will have pruned this since it's the destination. *) else (code <::> MoveRegister { source=sReg, dest=dReg}, destReg, if killThis then pruneCache(srcReg, newCache) else newCache, rest) end | NONE => (* If this is the last reference and the next instruction is loading with a zero offset we can use an indexed load and avoid converting to an absolute address. If this is not the last reference it's likely that we're loading another field so it's probably better to convert the object index and cache it. We might manage to use a load-pair instruction. *) ( case (killThis, rest) of (true, {instr=LoadWithConstantOffset{ byteOffset=0, loadType=Load32, base, dest=destLoad, ... }, kill=killLoad, ...} :: next) => if base = destOiA (* of objectindex *) andalso member(doia, killLoad) then let val (destReg, newCache) = allocateGenReg(destLoad, NONE, cache) in (code <::> LoadWithIndexedOffset{ base=X24(*X_Base32in64*), dest=destReg, index=getAllocatedGenReg source, loadType=Load32, signExtendIndex=false }, GenReg destReg, newCache, next) end else let val (destReg, newCache) = allocateGenReg(destOiA, Array.sub(cacheHints, srcNo), cache) in (code <::> ObjectIndexAddressToAbsolute { source=getAllocatedGenReg source, dest=destReg}, GenReg destReg, newCache, rest) end | _ => let val (destReg, newCache) = allocateGenReg(destOiA, Array.sub(cacheHints, srcNo), cache) in (code <::> ObjectIndexAddressToAbsolute { source=getAllocatedGenReg source, dest=destReg}, GenReg destReg, newCache, rest) end ) val () = if killThis then () else Array.update(cacheHints, srcNo, SOME destReg) in absToConcrete(next, if killThis then newCache else (destReg, CacheAbsAddress source) :: newCache, newCode) end | absToConcrete({instr=AbsoluteToObjectIndex { source, dest}, ...} :: rest, cache, code) = let (* Don't make an entry in the cache for this; it won't be used. 
*) val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> AbsoluteToObjectIndex { source=getAllocatedGenReg source, dest=destReg}) end | absToConcrete({instr=AllocateMemoryFixed { bytesRequired, dest, saveRegs }, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) val filteredCache = pruneCache(GenReg X30, newCache) val saved = cachedAddressRegs filteredCache @ getSaveRegs saveRegs in absToConcrete(rest, filteredCache, code <::> AllocateMemoryFixed { dest=destReg, bytesRequired=bytesRequired, saveRegs=saved}) end | absToConcrete({instr=AllocateMemoryVariable{size, dest, saveRegs}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) val filteredCache = pruneCache(GenReg X30, newCache) val saved = cachedAddressRegs filteredCache @ getSaveRegs saveRegs in absToConcrete(rest, filteredCache, code <::> AllocateMemoryVariable{size=getAllocatedGenReg size, dest=destReg, saveRegs=saved}) end | absToConcrete({instr=InitialiseMem{size, addr, init}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> InitialiseMem{size=getAllocatedGenReg size, addr=getAllocatedGenReg addr, init=getAllocatedGenReg init}) | absToConcrete({instr=BeginLoop, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> BeginLoop) | absToConcrete({instr=JumpLoop{regArgs, stackArgs, checkInterrupt}, ...} :: rest, context, code) = let fun getStackArg{src, wordOffset, stackloc} = {src=getAllocatedArg src, wordOffset=wordOffset, stackloc=stackloc} and getRegArg{src, dst} = {src=getAllocatedArg src, dst=getAllocatedGenReg dst} in absToConcrete(rest, context, code <::> JumpLoop{ regArgs=map getRegArg regArgs, stackArgs=map getStackArg stackArgs, checkInterrupt=Option.map getSaveRegs checkInterrupt}) end | absToConcrete({instr=StoreWithConstantOffset { base, source, byteOffset, loadType}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> StoreWithConstantOffset{ base=getAllocatedGenReg base, source=getAllocatedGenReg source, byteOffset=byteOffset, loadType=loadType}) | absToConcrete({instr=StoreFPWithConstantOffset { base, source, byteOffset, floatSize}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> StoreFPWithConstantOffset{ base=getAllocatedGenReg base, source=getAllocatedFPReg source, byteOffset=byteOffset, floatSize=floatSize}) | absToConcrete({instr=StoreWithIndexedOffset { base, source, index, loadType, signExtendIndex}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> StoreWithIndexedOffset{ base=getAllocatedGenReg base, source=getAllocatedGenReg source, index=getAllocatedGenReg index, loadType=loadType, signExtendIndex=signExtendIndex}) | absToConcrete({instr=StoreFPWithIndexedOffset { base, source, index, floatSize, signExtendIndex}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> StoreFPWithIndexedOffset{ base=getAllocatedGenReg base, source=getAllocatedFPReg source, index=getAllocatedGenReg index, floatSize=floatSize, signExtendIndex=signExtendIndex}) | absToConcrete({instr=AddSubImmediate{ source, dest, ccRef, immed, isAdd, length }, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenRegOrZero(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> AddSubImmediate { source=getAllocatedGenReg source, dest=destReg, ccRef=ccRef, immed=immed, isAdd=isAdd, length=length}) end | absToConcrete({instr=AddSubRegister{ base, shifted, dest, ccRef, isAdd, length, 
shift}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenRegOrZero(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> AddSubRegister { base=getAllocatedGenReg base, shifted=getAllocatedGenReg shifted, dest=destReg, ccRef=ccRef, isAdd=isAdd, length=length, shift=shift}) end | absToConcrete({instr=LogicalImmediate{ source, dest, ccRef, immed, logOp, length }, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenRegOrZero(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> LogicalImmediate { source=getAllocatedGenReg source, dest=destReg, ccRef=ccRef, immed=immed, logOp=logOp, length=length}) end | absToConcrete({instr=LogicalRegister{ base, shifted, dest, ccRef, logOp, length, shift}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenRegOrZero(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> LogicalRegister { base=getAllocatedGenReg base, shifted=getAllocatedGenReg shifted, dest=destReg, ccRef=ccRef, logOp=logOp, length=length, shift=shift}) end | absToConcrete({instr=ShiftRegister{ direction, dest, source, shift, opSize}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> ShiftRegister { source=getAllocatedGenReg source, shift=getAllocatedGenReg shift, dest=destReg, direction=direction, opSize=opSize}) end | absToConcrete({instr=Multiplication{ kind, dest, sourceA, sourceM, sourceN }, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> Multiplication { kind=kind, sourceA=getAllocatedGenRegOrZero sourceA, sourceM=getAllocatedGenReg sourceM, sourceN=getAllocatedGenReg sourceN, dest=destReg}) end | absToConcrete({instr=Division{ isSigned, dest, dividend, divisor, opSize }, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> Division { isSigned=isSigned, dividend=getAllocatedGenReg dividend, divisor=getAllocatedGenReg divisor, dest=destReg, opSize=opSize}) end | absToConcrete({instr=BeginFunction {regArgs, fpRegArgs, stackArgs}, ...} :: rest, _, code) = let (* Allocate the register arguments. At this point all the registers are free and the cache is empty. However we may have a "real conflict" that means that the allocated register is different. e.g. we need this argument some time after an arbitrary precision operation that may call a function. *) fun allocReg(src, dst) = let val (destReg, _) = allocateNewDestination(src, SOME(GenReg dst), generalRegisters, []) in (asGenReg destReg, dst) end fun allocFPReg(src, dst) = let val (destReg, _) = allocateNewDestination(src, SOME(FPReg dst), floatingPtRegisters, []) in (asFPReg destReg, dst) end in absToConcrete(rest, [], code <::> BeginFunction {regArgs=map allocReg regArgs, fpRegArgs=map allocFPReg fpRegArgs, stackArgs=stackArgs}) end | absToConcrete({instr=FunctionCall{callKind, regArgs, stackArgs, dests, fpRegArgs, fpDests, containers, saveRegs, ...}, ...} :: rest, _, code) = let fun getRegArg(src, dst) = (getAllocatedArg src, dst) and getFPRegArg(src, dst) = (getAllocatedFPReg src, dst) fun getResult allocReg (preg, reg) = let (* We empty the cache at this point. *) val (newReg, _) = allocReg(preg, NONE, []) in (newReg, reg) end in absToConcrete(rest, [] (* Empty after a function call. 
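Nothing cached before the call can be trusted afterwards: the callee may clobber any register it uses and, if it allocates, may trigger a garbage collection that moves the very objects whose absolute addresses were cached. That is why getResult above allocates each result register against an empty cache, allocReg(preg, NONE, []), rather than against the incoming one.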
*), code <::> FunctionCall{ callKind=callKind, regArgs=map getRegArg regArgs, stackArgs=map getAllocatedArg stackArgs, dests=map (getResult allocateGenReg) dests, fpRegArgs=map getFPRegArg fpRegArgs, fpDests=map (getResult allocateFPReg) fpDests, saveRegs=getSaveRegs saveRegs, containers=containers}) end | absToConcrete({instr=TailRecursiveCall{callKind, regArgs, stackArgs, fpRegArgs, stackAdjust, currStackSize}, ...} :: rest, context, code) = let fun getRegArg(src, dst) = (getAllocatedArg src, dst) and getStackArg{src, stack} = {src=getAllocatedArg src, stack=stack} and getFPRegArg(src, dst) = (getAllocatedFPReg src, dst) in absToConcrete(rest, context, code <::> TailRecursiveCall{ callKind=callKind, regArgs=map getRegArg regArgs, fpRegArgs=map getFPRegArg fpRegArgs, stackArgs=map getStackArg stackArgs, stackAdjust=stackAdjust, currStackSize=currStackSize}) end | absToConcrete({instr=ReturnResultFromFunction{results, returnReg, numStackArgs}, ...} :: rest, context, code) = let fun getResult(preg, reg) = (getAllocatedGenReg preg, reg) in absToConcrete(rest, context, code <::> ReturnResultFromFunction{results=map getResult results, returnReg=getAllocatedGenReg returnReg, numStackArgs=numStackArgs}) end | absToConcrete({instr=RaiseExceptionPacket{packetReg}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> RaiseExceptionPacket{packetReg=getAllocatedGenReg packetReg}) | absToConcrete({instr=PushToStack{ source, container as StackLoc{size, rno}, copies }, ...} :: rest, cache, code) = let val srcReg = getAllocatedGenReg source val newCache = if size = 1 then (GenReg srcReg, CacheStack container) :: cache else cache val () = Array.update(cacheHints, rno, SOME(GenReg srcReg)) in absToConcrete(rest, newCache, code <::> PushToStack{source=srcReg, container=container, copies=copies}) end | absToConcrete({instr=LoadStack{ dest=destLoad, container as StackLoc{rno, ...} , field=0, wordOffset}, kill, ...} :: (restPlusOia as {instr=ObjectIndexAddressToAbsolute { source as PReg srcNo, dest=destOia}, kill=killOia, ...} :: rest), cache, code) = (* If a preg has been pushed to the stack every subsequent reference will be via the stack. If we want to be able to cache object index to absolute addresses for them we have to recognise this combination. *) (* They could be unrelated in which case process the LoadStack and then the ObjectIndex... It seems there are also rare circumstances(??) where the result of the load is not killed and so would have to be preserved. *) if destLoad = source andalso member(srcNo, killOia) then let val killThis = member(rno, kill) (* Is it the last reference to the stack entry? *) val (newCode, destReg, newCache) = case List.find(fn (_, CacheAbsAddrOnStack c) => c=container | _ => false) cache of SOME (srcReg, _) => let (* Try to use the cache register as the destination if we can. *) val (destReg, newCache) = allocateNewDestination(destOia, SOME srcReg, generalRegisters, cache) val dReg = asGenReg destReg and sReg = asGenReg srcReg in if checkCache then (code <::> MoveRegister{source=sReg, dest=X17} <::> LoadStack{ dest=X16, container=container, field=0, wordOffset=wordOffset } <::> ObjectIndexAddressToAbsolute { source=X16, dest=dReg} <::> CacheCheck{ arg1=dReg, arg2=X17 }, destReg, if killThis then pruneCache(srcReg, newCache) else newCache) else if dReg = sReg then (code, destReg, newCache) (* We will have pruned this since it's the destination. 
*) else (code <::> MoveRegister { source=sReg, dest=dReg}, destReg, if killThis then pruneCache(srcReg, newCache) else newCache) end | NONE => (* It's not in the cache - load it which could be cached. *) let val (cachePostLoad, loadCode) = processLoadStack(destLoad, container, wordOffset, kill, cache, code) val (destReg, cachePlusOia) = allocateGenReg(destOia, Array.sub(cacheHints, rno), cachePostLoad) in (loadCode <::> ObjectIndexAddressToAbsolute { source=getAllocatedGenReg source, dest=destReg}, GenReg destReg, cachePlusOia) end val () = if killThis then () else Array.update(cacheHints, rno, SOME destReg) in absToConcrete(rest, if killThis then newCache else (destReg, CacheAbsAddrOnStack container) :: newCache, newCode) end else (* Can't combine these. *) let val (newCache, newCode) = processLoadStack(destLoad, container, wordOffset, kill, cache, code) in absToConcrete(restPlusOia, newCache, newCode) end | absToConcrete({instr=LoadStack{ dest, container, wordOffset, field=0, ...}, kill, ...} :: rest, cache, code) = let val (newCache, newCode) = processLoadStack(dest, container, wordOffset, kill, cache, code) in absToConcrete(rest, newCache, newCode) end | absToConcrete({instr=LoadStack{ dest, container, field, wordOffset}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> LoadStack{ dest=destReg, container=container, field=field, wordOffset=wordOffset }) end | absToConcrete({instr=StoreToStack{source, container, field, stackOffset}, ...} :: rest, cache, code) = (* We may have cached the original push that cleared the container. We could cache this since it now contains the entry but it's probably better to deal with multiple results at a higher level. *) let val sReg = getAllocatedGenReg source val newCache = List.filter(fn (_, CacheStack c) => c <> container | (_, CacheAbsAddrOnStack c) => c <> container | _ => true) cache in absToConcrete(rest, newCache, code <::> StoreToStack{source=sReg, container=container, field=field, stackOffset=stackOffset}) end | absToConcrete({instr=ContainerAddress{ dest, container, stackOffset}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> ContainerAddress{ dest=destReg, container=container, stackOffset=stackOffset }) end | absToConcrete({instr=ResetStackPtr {numWords}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> ResetStackPtr {numWords=numWords}) | absToConcrete({instr=TagValue{source, dest, isSigned, opSize}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> TagValue{source=getAllocatedGenReg source, dest=destReg, isSigned=isSigned, opSize=opSize}) end | absToConcrete({instr=UntagValue{source, dest, isSigned, opSize, ...}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> UntagValue{source=getAllocatedGenReg source, dest=destReg, isSigned=isSigned, opSize=opSize}) end | absToConcrete({instr=BoxLarge{source, dest, saveRegs, ...}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) val filteredCache = pruneCache(GenReg X30, newCache) val saved = cachedAddressRegs filteredCache @ getSaveRegs saveRegs in absToConcrete(rest, filteredCache, code <::> BoxLarge{source=getAllocatedGenReg source, dest=destReg, saveRegs=saved}) end | 
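(* Note on the allocation points above (AllocateMemoryFixed/Variable, BoxLarge and, below, BoxTagFloat): each one extends saveRegs with the concrete registers that currently cache an absolute address, because the allocation may trigger a garbage collection and those addresses must be saved and updated if objects move. A minimal sketch of what cachedAddressRegs is assumed to compute here (its real definition is earlier in this file):
       fun cachedAddressRegs cache =
           List.mapPartial
               (fn (GenReg r, CacheAbsAddress _) => SOME r
                 | (GenReg r, CacheAbsAddrOnStack _) => SOME r
                 | _ => NONE) cache
   X30 is pruned from the cache first, presumably because the allocation path can clobber it. *)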
absToConcrete({instr=UnboxLarge{source, dest}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> UnboxLarge{source=getAllocatedGenReg source, dest=destReg}) end | absToConcrete({instr=BoxTagFloat{floatSize, source, dest, saveRegs}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) val filteredCache = pruneCache(GenReg X30, newCache) val saved = cachedAddressRegs filteredCache @ getSaveRegs saveRegs in absToConcrete(rest, filteredCache, code <::> BoxTagFloat{floatSize=floatSize, source=getAllocatedFPReg source, dest=destReg, saveRegs=saved}) end | absToConcrete({instr=UnboxTagFloat{floatSize, source, dest}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateFPReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> UnboxTagFloat{floatSize=floatSize, source=getAllocatedGenReg source, dest=destReg}) end | absToConcrete({instr=LoadAcquire { base, dest, loadType}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> LoadAcquire { base=getAllocatedGenReg base, dest=destReg, loadType=loadType}) end | absToConcrete({instr=StoreRelease { base, source, loadType}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> StoreRelease{ base=getAllocatedGenReg base, source=getAllocatedGenReg source, loadType=loadType}) | absToConcrete({instr=BitFieldShift{source, dest, isSigned, length, immr, imms}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> BitFieldShift { source=getAllocatedGenReg source, dest=destReg, isSigned=isSigned, immr=immr, imms=imms, length=length}) end | absToConcrete({instr=BitFieldInsert{source, destAsSource, dest, length, immr, imms}, ...} :: rest, cache, code) = let val destAsSourceReg = getAllocatedGenReg destAsSource val (destReg, newCache) = allocateNewDestination(dest, SOME(GenReg destAsSourceReg), generalRegisters, cache) in absToConcrete(rest, newCache, code <::> BitFieldInsert { source=getAllocatedGenReg source, destAsSource=destAsSourceReg, dest=asGenReg destReg, immr=immr, imms=imms, length=length}) end | absToConcrete({instr=IndexedCaseOperation{testReg}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> IndexedCaseOperation{testReg=getAllocatedGenReg testReg}) | absToConcrete({instr=PushExceptionHandler, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> PushExceptionHandler) | absToConcrete({instr=PopExceptionHandler, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> PopExceptionHandler) | absToConcrete({instr=BeginHandler{packetReg}, ...} :: rest, _, code) = let (* The cache is undefined at the start of a handler. 
*) val (destReg, newCache) = allocateGenReg(packetReg, NONE, []) in absToConcrete(rest, newCache, code <::> BeginHandler{packetReg=destReg}) end | absToConcrete({instr=CompareByteVectors{vec1Addr, vec2Addr, length, ccRef}, ...} :: rest, cache, code) = let (* This instruction modifies these registers so they must be removed from the cache *) val vec1Reg = getAllocatedGenReg vec1Addr and vec2Reg = getAllocatedGenReg vec2Addr and lenReg = getAllocatedGenReg length val newCache = pruneCache(GenReg vec1Reg, pruneCache(GenReg vec2Reg, pruneCache(GenReg lenReg, cache))) in absToConcrete(rest, newCache, code <::> CompareByteVectors{vec1Addr=vec1Reg, vec2Addr=vec2Reg, length=lenReg, ccRef=ccRef}) end | absToConcrete({instr=BlockMove{srcAddr, destAddr, length, isByteMove}, ...} :: rest, cache, code) = let (* This instruction modifies these registers so they must be removed from the cache *) val srcAReg = getAllocatedGenReg srcAddr and dstAReg = getAllocatedGenReg destAddr and lenReg = getAllocatedGenReg length val newCache = pruneCache(GenReg srcAReg, pruneCache(GenReg dstAReg, pruneCache(GenReg lenReg, cache))) in absToConcrete(rest, newCache, code <::> BlockMove{srcAddr=srcAReg, destAddr=dstAReg, length=lenReg, isByteMove=isByteMove}) end | absToConcrete({instr=AddSubXSP{source, dest, isAdd}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenRegOrZero(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> AddSubXSP { source=getAllocatedGenReg source, dest=destReg, isAdd=isAdd}) end | absToConcrete({instr=TouchValue{source}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> TouchValue { source=getAllocatedGenReg source}) | absToConcrete({instr=LoadAcquireExclusive{ base, dest }, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> LoadAcquireExclusive { base=getAllocatedGenReg base, dest=destReg}) end | absToConcrete({instr=StoreReleaseExclusive{ base, source, result }, ...} :: rest, cache, code) = let val (resultReg, newCache) = allocateGenReg(result, NONE, cache) in absToConcrete(rest, newCache, code <::> StoreReleaseExclusive{ base=getAllocatedGenReg base, source=getAllocatedGenRegOrZero source, result=resultReg}) end | absToConcrete({instr=MemoryBarrier, ...} :: rest, cache, code) = absToConcrete(rest, cache, code <::> MemoryBarrier) | absToConcrete({instr=ConvertIntToFloat{ source, dest, srcSize, destSize}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateFPReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> ConvertIntToFloat{ source=getAllocatedGenReg source, dest=destReg, srcSize=srcSize, destSize=destSize}) end | absToConcrete({instr=ConvertFloatToInt{ source, dest, srcSize, destSize, rounding}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> ConvertFloatToInt{ source=getAllocatedFPReg source, dest=destReg, srcSize=srcSize, destSize=destSize, rounding=rounding}) end + | absToConcrete({instr=UnaryFloatingPt{ source, dest, fpOp=MoveFloat}, ...} :: rest, cache, code) = + (* Register-register moves can be eliminated if the same register is used.
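Passing the source register as an allocation hint (SOME(FPReg srcReg) below) makes it likely that the destination is given the same V register, in which case no instruction is emitted at all; only when the hint cannot be honoured does the UnaryFloatingPt move survive, to be encoded later (typically as an fmov). For example an abstract UnaryFloatingPt { source=PReg 12, dest=PReg 17, fpOp=MoveDouble } (made-up preg numbers) generates no code once both pregs land in the same V register.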
*) + let + val srcReg = getAllocatedFPReg source + val (destReg, newCache) = allocateFPReg(dest, SOME(FPReg srcReg), cache) + in + absToConcrete(rest, newCache, + if destReg = srcReg then code else code <::> UnaryFloatingPt{ source=srcReg, dest=destReg, fpOp=MoveFloat}) + end + + | absToConcrete({instr=UnaryFloatingPt{ source, dest, fpOp=MoveDouble}, ...} :: rest, cache, code) = + let + val srcReg = getAllocatedFPReg source + val (destReg, newCache) = allocateFPReg(dest, SOME(FPReg srcReg), cache) + in + absToConcrete(rest, newCache, + if destReg = srcReg then code else code <::> UnaryFloatingPt{ source=srcReg, dest=destReg, fpOp=MoveDouble}) + end + | absToConcrete({instr=UnaryFloatingPt{ source, dest, fpOp}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateFPReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> UnaryFloatingPt{ source=getAllocatedFPReg source, dest=destReg, fpOp=fpOp}) end | absToConcrete({instr=BinaryFloatingPoint{ arg1, arg2, dest, fpOp, opSize}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateFPReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> BinaryFloatingPoint{ arg1=getAllocatedFPReg arg1, arg2=getAllocatedFPReg arg2, dest=destReg, fpOp=fpOp, opSize=opSize}) end | absToConcrete({instr=CompareFloatingPoint{ arg1, arg2, opSize, ccRef}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> CompareFloatingPoint{ arg1=getAllocatedFPReg arg1, arg2=getAllocatedFPReg arg2, opSize=opSize, ccRef=ccRef}) | absToConcrete({instr=CPUYield, ...} :: rest, cache, code) = absToConcrete(rest, cache, code <::> CPUYield) | absToConcrete({instr=AtomicOperation{ base, source, dest, atOp }, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenRegOrZero(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> AtomicOperation{ base=getAllocatedGenReg base, source=getAllocatedGenRegOrZero source, dest=destReg, atOp=atOp }) end | absToConcrete({instr=CacheCheck _, ...} :: _, _, _) = (* Concrete only. *) raise InternalError "absToConcrete: CheckCache" (* LoadStack. *) and processLoadStack(dest, container as StackLoc{rno, ...}, wordOffset, kill, cache, code) = let (* See if this is in the cache and use it if it is. If this is the last reference to this stack entry we don't want it in the cache any longer. *) val killThis = member(rno, kill) val (newCode, destReg, newCache) = case List.find(fn (_, CacheStack c) => c=container | _ => false) cache of SOME (srcReg, _) => let val (destReg, newCache) = allocateNewDestination(dest, SOME srcReg, generalRegisters, cache) val dReg = asGenReg destReg and sReg = asGenReg srcReg in if checkCache then (code <::> MoveRegister{source=sReg, dest=X17} <::> LoadStack{ dest=dReg, container=container, field=0, wordOffset=wordOffset } <::> CacheCheck{ arg1=dReg, arg2=X17 }, destReg, if killThis then pruneCache(srcReg, newCache) else newCache) else if dReg = sReg andalso false then (code, destReg, newCache) (* We will have pruned this since it's the destination. 
*) else (code <::> MoveRegister { source=sReg, dest=dReg}, destReg, if killThis then pruneCache(srcReg, newCache) else newCache) end | NONE => let val (destReg, newCache) = allocateGenReg(dest, Array.sub(cacheHints, rno), cache) in (code <::> LoadStack{ dest=destReg, container=container, field=0, wordOffset=wordOffset }, GenReg destReg, newCache) end val () = if killThis then () else Array.update(cacheHints, rno, SOME destReg) in (if killThis then newCache else (destReg, CacheStack container) :: newCache, newCode) end in fun concreteBlock(ExtendedBasicBlock{ block, ...}, inputCache) = let val (cache, code) = absToConcrete(block, inputCache, []) in {cache=cache, code=List.rev code} end val mergeCacheStates = mergeCacheStates end val numBlocks = Vector.length blocks (* The results. The cache state is initialised to empty so that if we have a loop we will end up with an empty input cache. *) val resultArray = Array.array(numBlocks, {code=[], cache=[]}) (* Process the blocks in execution order so that normally we will be able to propagate the cache states. If we have a loop the input cache state will be empty because the output cache state for an unprocessed block is empty. *) (* Get the blocks that are inputs for each one. *) local val blockRefs = Array.array(numBlocks, []) (* The successors of this block but only including handlers in SetHandler. *) fun directSuccessors ExitCode = [] | directSuccessors(IndexedBr cases) = cases | directSuccessors(Unconditional dest) = [dest] | directSuccessors(Conditional {trueJump, falseJump, ...}) = [falseJump, trueJump] | directSuccessors(SetHandler { handler, continue }) = [handler, continue] | directSuccessors(UnconditionalHandle _) = [] | directSuccessors(ConditionalHandle { continue, ...}) = [continue] fun setReferences fromBlock = let val ExtendedBasicBlock{ flow, ...} = Vector.sub(blocks, fromBlock) val refs = directSuccessors flow fun setRefs toBlock = let val oldRefs = Array.sub(blockRefs, toBlock) in Array.update(blockRefs, toBlock, fromBlock :: oldRefs); if null oldRefs then setReferences toBlock else () end in List.app setRefs refs end val () = setReferences 0 in val directSuccessors = directSuccessors val blockRefs = blockRefs end val processed = Array.array(numBlocks, false) fun haveProcessed n = Array.sub(processed, n) fun processBlocks (toDo: int list) = case List.filter (fn n => not(haveProcessed n)) toDo of [] => () | stillToDo as head :: _ => let (* Try to find a block all of whose predecessors have been processed. That increases the chances that we will have cached items. *) fun available dest = List.all haveProcessed (Array.sub(blockRefs, dest)) val blockNo = case List.find available stillToDo of SOME c => c | NONE => head val thisBlock as ExtendedBasicBlock { flow, ...} = Vector.sub(blocks, blockNo) (* Get the input cache state. Take the list of output caches of everything that jumps here and produce the intersection. *) val inputCacheList = List.map (fn n => #cache(Array.sub(resultArray, n))) (Array.sub(blockRefs, blockNo)) val inputCache = mergeCacheStates inputCacheList val inputCache = [] (* Temporarily *) (* Process this block and add it to the results. *) val () = Array.update(processed, blockNo, true) val () = Array.update(resultArray, blockNo, concreteBlock(thisBlock, inputCache)) (* Add the successors but with handlers only included in SetHandler. *) val addSet = directSuccessors flow in processBlocks(addSet @ stillToDo) end in processBlocks [0]; (* If the failures list is empty we succeeded. 
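Otherwise AllocateFailure hands the recorded conflicts back to the caller, which is expected to spill something to the stack and re-run the allocator. A sketch of the expected use, with assumed names since the real driver lives elsewhere:
       case allocateRegisters args of
           AllocateSuccess concreteBlocks => concreteBlocks
         | AllocateFailure conflicts => retryAfterSpilling conflicts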
*) case !failures of [] => (* Return the allocation vector. We may have unused registers, *) AllocateSuccess( Vector.mapi(fn (i, ExtendedBasicBlock{ flow, ...}) => BasicBlock{block= #code(Array.sub(resultArray, i)), flow=flow}) blocks ) (* Else we'll have to spill something. *) | l => AllocateFailure l end val nGenRegs = List.length generalRegisters structure Sharing = struct type intSet = intSet and extendedBasicBlock = extendedBasicBlock and ('genReg, 'optGenReg, 'fpReg) basicBlock = ('genReg, 'optGenReg, 'fpReg) basicBlock and regProperty = regProperty and reg = reg and xReg = xReg and vReg = vReg and allocateResult = allocateResult end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICode.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICode.ML index 83bcdc47..f3c19751 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICode.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICode.ML @@ -1,1021 +1,1023 @@ (* Copyright David C. J. Matthews 2021-2 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64ICode( structure Arm64Code: ARM64PREASSEMBLY ): ARM64ICODE = struct open Arm64Code open Address datatype preg = PReg of int (* A pseudo-register - an abstract register. *) (* If the value is zero we can use X0/W0. *) datatype pregOrZero = SomeReg of preg | ZeroReg (* A location on the stack. May be more than word if this is a container or a handler entry. *) datatype stackLocn = StackLoc of {size: int, rno: int } (* This combines pregKind and stackLocn. *) datatype regProperty = RegPropGeneral (* A general register. *) | RegPropUntagged (* An untagged general register. *) | RegPropStack of int (* A stack location or container. *) | RegPropCacheTagged | RegPropCacheUntagged | RegPropMultiple (* The result of a conditional or case. May be defined at multiple points. *) (* The reference to a condition code. *) datatype ccRef = CcRef of int datatype reg = GenReg of xReg | FPReg of vReg datatype callKind = Recursive | ConstantCode of machineWord | FullCall (* Function calls can have an unlimited number of arguments so it isn't always going to be possible to load them into registers. *) datatype 'genReg fnarg = ArgInReg of 'genReg | ArgOnStack of { wordOffset: int, container: stackLocn, field: int } datatype ('genReg, 'optGenReg, 'fpReg) arm64ICode = (* Move the contents of one preg to another. These are always 64-bits. *) MoveRegister of { source: 'genReg, dest: 'genReg } (* Numerical constant. *) | LoadNonAddressConstant of { source: Word64.word, dest: 'genReg } (* Floating point constant *) | LoadFPConstant of { source: Word64.word, dest: 'fpReg, floatSize: floatSize } (* Address constant. *) | LoadAddressConstant of { source: machineWord, dest: 'genReg } (* Load a value into a register using a constant, signed, byte offset. The offset is in the range of -256 to (+4095*unit size). 
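For example, with Load64 the unit size is 8 bytes, so the scaled unsigned-offset form reaches byte offsets up to 4095*8 = 32760 (the offset must then be a multiple of 8), while the unscaled signed form covers -256 to 255; with Load8 the two bounds meet at -256 to 4095.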
*) | LoadWithConstantOffset of { base: 'genReg, dest: 'genReg, byteOffset: int, loadType: loadType } (* Similarly for FP registers. *) | LoadFPWithConstantOffset of { base: 'genReg, dest: 'fpReg, byteOffset: int, floatSize: floatSize } (* Load a value into a register using an index register. *) | LoadWithIndexedOffset of { base: 'genReg, dest: 'genReg, index: 'genReg, loadType: loadType, signExtendIndex: bool } (* Ditto for FP. *) | LoadFPWithIndexedOffset of { base: 'genReg, dest: 'fpReg, index: 'genReg, floatSize: floatSize, signExtendIndex: bool } (* Returns the current thread ID. Always a 64-bit value.. *) | GetThreadId of { dest: 'genReg } (* Convert a 32-in-64 object index into an absolute address. *) | ObjectIndexAddressToAbsolute of { source: 'genReg, dest: 'genReg } (* Convert an absolute address into an object index. *) | AbsoluteToObjectIndex of { source: 'genReg, dest: 'genReg } (* Allocate a fixed sized piece of memory and puts the absolute address into dest. bytesRequired is the total number of bytes including the length word and any alignment necessary for 32-in-64. saveRegs is the list of registers that need to be saved if we need to do a garbage collection. *) | AllocateMemoryFixed of { bytesRequired: Word64.word, dest: 'genReg, saveRegs: 'genReg list } (* Allocate a piece of memory. The size argument is an untagged value containing the number of words i.e. the same value used for InitialiseMemory and to store in the length word. *) | AllocateMemoryVariable of { size: 'genReg, dest: 'genReg, saveRegs: 'genReg list } (* Initialise a piece of memory by writing "size" copies of the value in "init". N.B. The size is an untagged value containing the number of words. *) | InitialiseMem of { size: 'genReg, addr: 'genReg, init: 'genReg } (* Mark the beginning of a loop. This is really only to prevent the initialisation code being duplicated in ICodeOptimise. *) | BeginLoop (* Set up the registers for a jump back to the start of a loop. *) | JumpLoop of { regArgs: {src: 'genReg fnarg, dst: 'genReg} list, stackArgs: {src: 'genReg fnarg, wordOffset: int, stackloc: stackLocn} list, checkInterrupt: 'genReg list option } (* Store a register using a constant, signed, byte offset. The offset is in the range of -256 to (+4095*unit size). *) | StoreWithConstantOffset of { source: 'genReg, base: 'genReg, byteOffset: int, loadType: loadType } (* Ditto for FP regs. *) | StoreFPWithConstantOffset of { source: 'fpReg, base: 'genReg, byteOffset: int, floatSize: floatSize } (* Store a register using an index register. *) | StoreWithIndexedOffset of { source: 'genReg, base: 'genReg, index: 'genReg, loadType: loadType, signExtendIndex: bool } (* and for FP regs. *) | StoreFPWithIndexedOffset of { source: 'fpReg, base: 'genReg, index: 'genReg, floatSize: floatSize, signExtendIndex: bool } (* Add/Subtract immediate. The destination is optional in which case XZero is used. ccRef is optional. If it is NONE the version of the instruction that does not generate a condition code is used. immed must be < 0wx1000. *) | AddSubImmediate of { source: 'genReg, dest: 'optGenReg, ccRef: ccRef option, immed: word, isAdd: bool, length: opSize } (* Add/Subtract register. As with AddSubImmediate, both the destination and cc are optional. *) | AddSubRegister of { base: 'genReg, shifted: 'genReg, dest: 'optGenReg, ccRef: ccRef option, isAdd: bool, length: opSize, shift: shiftType } (* Bitwise logical operations. The immediate value must be a valid bit pattern. ccRef can only be SOME if logOp is LogAnd. 
*) | LogicalImmediate of { source: 'genReg, dest: 'optGenReg, ccRef: ccRef option, immed: Word64.word, logOp: logicalOp, length: opSize } (* Register logical operations. ccRef can only be SOME if logOp is LogAnd.*) | LogicalRegister of { base: 'genReg, shifted: 'genReg, dest: 'optGenReg, ccRef: ccRef option, logOp: logicalOp, length: opSize, shift: shiftType } (* Shift a word by an amount specified in a register. *) | ShiftRegister of { direction: shiftDirection, dest: 'genReg, source: 'genReg, shift: 'genReg, opSize: opSize } (* The various forms of multiply all take three arguments and the general form is dest = M * N +/- A.. *) | Multiplication of { kind: multKind, dest: 'genReg, sourceA: 'optGenReg, sourceM: 'genReg, sourceN: 'genReg } (* Signed or unsigned division. Sets the result to zero if the divisor is zero. *) | Division of { isSigned: bool, dest: 'genReg, dividend: 'genReg, divisor: 'genReg, opSize: opSize } (* Start of function. Set the register arguments. stackArgs is the list of stack arguments. If the function has a real closure regArgs includes the closure register (X8). The register arguments include the return register (X30). *) | BeginFunction of { regArgs: ('genReg * xReg) list, fpRegArgs: ('fpReg * vReg) list, stackArgs: stackLocn list } (* Call a function. If the code address is a constant it is passed here. Otherwise the address is obtained by indirecting through X8 which has been loaded as one of the argument registers. The results are stored in the result registers, usually just X0. The "containers" argument is used to ensure that any container whose address is passed as one of the other arguments continues to be referenced until the function is called since there's a possibility that it isn't actually used after the function. *) | FunctionCall of { callKind: callKind, regArgs: ('genReg fnarg * xReg) list, stackArgs: 'genReg fnarg list, dests: ('genReg * xReg) list, fpRegArgs: ('fpReg * vReg) list, fpDests: ('fpReg * vReg) list, saveRegs: 'genReg list, containers: stackLocn list} (* Jump to a tail-recursive function. This is similar to FunctionCall but complicated for stack arguments because the stack and the return address need to be overwritten. stackAdjust is the number of words to remove (positive) or add (negative) to the stack before the call. currStackSize contains the number of items currently on the stack. *) | TailRecursiveCall of { callKind: callKind, regArgs: ('genReg fnarg * xReg) list, stackArgs: {src: 'genReg fnarg, stack: int} list, fpRegArgs: ('fpReg * vReg) list, stackAdjust: int, currStackSize: int } (* Return from the function. resultRegs are the registers containing the result, returnReg is the preg that contains the return address. *) | ReturnResultFromFunction of { results: ('genReg * xReg) list, returnReg: 'genReg, numStackArgs: int } (* Raise an exception. The packet is always loaded into X0. *) | RaiseExceptionPacket of { packetReg: 'genReg } (* Push a register to the stack. This is used both for a normal push, copies=1, and also to reserve a container. *) | PushToStack of { source: 'genReg, copies: int, container: stackLocn } (* Load a register from the stack. The container is the stack location identifier, the field is an offset in a container. *) | LoadStack of { dest: 'genReg, wordOffset: int, container: stackLocn, field: int } (* Store a value into the stack. *) | StoreToStack of { source: 'genReg, container: stackLocn, field: int, stackOffset: int } (* Set the register to the address of the container i.e. 
a specific offset on the stack. *) | ContainerAddress of { dest: 'genReg, container: stackLocn, stackOffset: int } (* Remove items from the stack. Used to remove containers or registers pushed to the stack.. *) | ResetStackPtr of { numWords: int } (* Tag a value by shifting and setting the tag bit. *) | TagValue of { source: 'genReg, dest: 'genReg, isSigned: bool, opSize: opSize } (* Shift a value to remove the tag bit. The cache is used if this is untagging a value that has previously been tagged. *) | UntagValue of { source: 'genReg, dest: 'genReg, isSigned: bool, opSize: opSize } (* Box a largeword value. Stores a value into a byte area. This can be implemented using AllocateMemoryFixed but keeping it separate makes optimisation easier. The result is always an address and needs to be converted to an object index on 32-in-64. *) | BoxLarge of { source: 'genReg, dest: 'genReg, saveRegs: 'genReg list } (* Load a value from a box. This can be implemented using a load but is kept separate to simplify optimisation. The source is always an absolute address. *) | UnboxLarge of { source: 'genReg, dest: 'genReg } (* Convert a floating point value into a value suitable for storing in the heap. This normally involves boxing except that 32-bit floats can be tagged in native 64-bits. *) | BoxTagFloat of { floatSize: floatSize, source: 'fpReg, dest: 'genReg, saveRegs: 'genReg list } (* The reverse of BoxTagFloat. *) | UnboxTagFloat of { floatSize: floatSize, source: 'genReg, dest: 'fpReg } (* Load a value with acquire semantics. This means that any other load in this thread after this sees the value of the shared memory at this point and not earlier. This is used for references and arrays to ensure that if another thread has built a data structure on the heap and then assigns the address to a shared ref this thread will see the updated heap and not any locally cached previous version. *) | LoadAcquire of { base: 'genReg, dest: 'genReg, loadType: loadType } (* Store a value with release semantics. This ensures that any other write completes before this operation and works with LoadAcquire. *) | StoreRelease of { base: 'genReg, source: 'genReg, loadType: loadType } (* This is a generalised constant shift which includes selection of a range of bits. *) | BitFieldShift of { source: 'genReg, dest: 'genReg, isSigned: bool, length: opSize, immr: word, imms: word } (* Copy a range of bits and insert it into another register. This is the only case where a register functions both as a source and a destination. *) | BitFieldInsert of { source: 'genReg, destAsSource: 'genReg, dest: 'genReg, length: opSize, immr: word, imms: word } (* Indexed case. *) | IndexedCaseOperation of { testReg: 'genReg } (* Exception handling. - Set up an exception handler. *) | PushExceptionHandler (* End of a handled section. Restore the previous handler. *) | PopExceptionHandler (* Marks the start of a handler. This sets the stack pointer and restores the old handler. Sets the exception packet register. *) | BeginHandler of { packetReg: 'genReg } (* Compare two vectors of bytes and set the condition code on the result. The registers are modified by the instruction. *) | CompareByteVectors of { vec1Addr: 'genReg, vec2Addr: 'genReg, length: 'genReg, ccRef: ccRef } (* Move a block of bytes (isByteMove true) or words (isByteMove false). The length is the number of items (bytes or words) to move. The registers are modified by the instruction. 
*) | BlockMove of { srcAddr: 'genReg, destAddr: 'genReg, length: 'genReg, isByteMove: bool } (* Add or subtract to the system stack pointer and optionally return the new value. This is used to allocate and deallocate C space. *) | AddSubXSP of { source: 'genReg, dest: 'optGenReg, isAdd: bool } (* Ensures the value will actually be referenced although it doesn't generate any code. *) | TouchValue of { source: 'genReg } (* Load a value at the address and get exclusive access. Always loads a 64-bit value. *) | LoadAcquireExclusive of { base: 'genReg, dest: 'genReg } (* Store a value into an address releasing the lock. Sets the result to either 0 or 1 if it succeeds or fails. *) | StoreReleaseExclusive of { base: 'genReg, source: 'optGenReg, result: 'genReg } (* Insert a memory barrier. dmb ish. *) | MemoryBarrier (* Convert an integer to a floating point value. *) | ConvertIntToFloat of { source: 'genReg, dest: 'fpReg, srcSize: opSize, destSize: floatSize } (* Convert a floating point value to an integer using the specified rounding mode. We could get an overflow here but fortunately the ARM generates a value that will cause an overflow when we tag it, provided we tag it explicitly. *) | ConvertFloatToInt of { source: 'fpReg, dest: 'genReg, srcSize: floatSize, destSize: opSize, rounding: IEEEReal.rounding_mode } (* Unary floating point. This includes conversions between float and double. *) | UnaryFloatingPt of { source: 'fpReg, dest: 'fpReg, fpOp: fpUnary } (* Binary floating point: addition, subtraction, multiplication and division. *) | BinaryFloatingPoint of { arg1: 'fpReg, arg2: 'fpReg, dest: 'fpReg, fpOp: fpBinary, opSize: floatSize } (* Floating point comparison. *) | CompareFloatingPoint of { arg1: 'fpReg, arg2: 'fpReg, ccRef: ccRef, opSize: floatSize } (* Yield control during a spin-lock. *) | CPUYield (* Atomic operations added for ARM 8.1 *) | AtomicOperation of { base: 'genReg, source: 'optGenReg, dest: 'optGenReg, atOp: atomicOp } (* Debugging - fault if values don't match. *) | CacheCheck of { arg1: 'genReg, arg2: 'genReg } (* Destinations at the end of a basic block. *) and controlFlow = (* Unconditional branch to a label - should be a merge point. *) Unconditional of int (* Conditional branch. Jumps to trueJump if the condional is false, falseJump if false. *) | Conditional of { ccRef: ccRef, condition: condition, trueJump: int, falseJump: int } (* Exit - the last instruction of the block is a return, raise or tailcall. *) | ExitCode (* Indexed case - this branches to one of a number of labels *) | IndexedBr of int list (* Set up a handler. This doesn't cause an immediate branch but the state at the start of the handler is the state at this point. *) | SetHandler of { handler: int, continue: int } (* Unconditional branch to a handler. If an exception is raised explicitly within the scope of a handler. *) | UnconditionalHandle of int (* Conditional branch to a handler. Occurs if there is a call to a function within the scope of a handler. It may jump to the handler. *) | ConditionalHandle of { handler: int, continue: int } and ('genReg, 'optGenReg, 'fpReg) basicBlock = BasicBlock of { block: ('genReg, 'optGenReg, 'fpReg) arm64ICode list, flow: controlFlow } type iCodeAbstract = (preg, pregOrZero, preg) arm64ICode and basicBlockAbstract = (preg, pregOrZero, preg) basicBlock and iCodeConcrete = (xReg, xReg, vReg) arm64ICode and basicBlockConcrete = (xReg, xReg, vReg) basicBlock (* Return the list of blocks that are the immediate successor of this. 
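For instance (with made-up block labels) successorBlocks(Conditional{ ccRef=CcRef 1, condition=CondEqual, trueJump=3, falseJump=7 }) is [3, 7], while an ExitCode block, ending in a return, raise or tail call, has no successors at all.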
*) fun successorBlocks(Unconditional l) = [l] | successorBlocks(Conditional{trueJump, falseJump, ...}) = [trueJump, falseJump] | successorBlocks ExitCode = [] | successorBlocks(IndexedBr cases) = cases | successorBlocks(SetHandler{handler, continue, ...}) = [handler, continue] (* We only need "handler" in SetHandler because we may have a handler that is never actually jumped to. *) | successorBlocks(UnconditionalHandle handler) = [handler] | successorBlocks(ConditionalHandle{handler, continue, ...}) = [handler, continue] local fun printCC(CcRef ccRef, stream) = stream ("CC" ^ Int.toString ccRef) fun printStackLoc(StackLoc{size, rno}, stream) = (stream "S"; stream(Int.toString rno); stream "("; stream(Int.toString size); stream ")") fun regRepr(XReg w) = "X" ^ Int.toString(Word8.toInt w) | regRepr XZero = "XZ" | regRepr XSP = "SP" fun arithRepr OpSize64 = "64" | arithRepr OpSize32 = "32" fun printLoadType(Load64, stream) = stream "64" | printLoadType(Load32, stream) = stream "32" | printLoadType(Load16, stream) = stream "16" | printLoadType(Load8, stream) = stream "8" fun printSaves([], _, _) = () | printSaves([areg], _, printReg) = printReg areg | printSaves(areg::more, stream, printReg) = (printReg areg; stream ","; printSaves(more, stream, printReg)) fun printArg(ArgInReg reg, _, printReg) = printReg reg | printArg(ArgOnStack{wordOffset, container, field, ...}, stream, _) = ( printStackLoc(container, stream); stream " + "; stream(Int.toString field); stream " ("; stream(Int.toString wordOffset); stream ")" ) fun printShift(ShiftLSL w, stream) = stream(" LSL " ^ Word8.toString w) | printShift(ShiftLSR w, stream) = stream(" LSR " ^ Word8.toString w) | printShift(ShiftASR w, stream) = stream(" ASR " ^ Word8.toString w) | printShift(ShiftNone, _) = () fun printFloatSize(Float32, stream) = stream "F" | printFloatSize(Double64, stream) = stream "D" fun printICode {stream, printGenReg, ...} (MoveRegister{ source, dest }: ('a, 'b, 'c) arm64ICode) = ( stream "\tMove\t"; printGenReg source; stream " => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (LoadNonAddressConstant{ source, dest }) = ( stream "\tLoadNonAddress\t"; stream(Word64.toString source); stream " => "; printGenReg dest ) | printICode {stream, printFPReg, ...} (LoadFPConstant{ source, dest, floatSize }) = ( stream "\tLoadFPConstant"; printFloatSize(floatSize, stream); stream "\t"; stream(Word64.toString source); stream " => "; printFPReg dest ) | printICode {stream, printGenReg, ...} (LoadAddressConstant{ source, dest }) = ( stream "\tLoadAddress\t"; stream(Address.stringOfWord source); stream " => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (LoadWithConstantOffset{ base, dest, byteOffset, loadType }) = ( stream "\tLoadConstOffset"; printLoadType(loadType, stream); stream "\t["; printGenReg base; stream "]+"; stream(Int.toString byteOffset); stream " => "; printGenReg dest ) | printICode {stream, printGenReg, printFPReg, ...} (LoadFPWithConstantOffset{ base, dest, byteOffset, floatSize }) = ( stream "\tLoadConstOffset"; printFloatSize(floatSize, stream); stream "\t["; printGenReg base; stream "]+"; stream(Int.toString byteOffset); stream " => "; printFPReg dest ) | printICode {stream, printGenReg, ...} (LoadWithIndexedOffset{ base, dest, index, loadType, signExtendIndex }) = ( stream "\tLoadIndexed"; printLoadType(loadType, stream); stream "\t["; printGenReg base; stream "+"; printGenReg index; if signExtendIndex then stream " SX" else (); stream "] => "; printGenReg dest ) | printICode {stream, 
printGenReg, printFPReg, ...} (LoadFPWithIndexedOffset{ base, dest, index, floatSize, signExtendIndex }) = ( stream "\tLoadIndexed"; printFloatSize(floatSize, stream); stream "\t["; printGenReg base; stream "+"; printGenReg index; if signExtendIndex then stream " SX" else (); stream "] => "; printFPReg dest ) | printICode {stream, printGenReg, ...} (GetThreadId { dest}) = ( stream "\tGetThreadId\t"; stream " => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (ObjectIndexAddressToAbsolute{ source, dest }) = ( stream "\tObjectAddrToAbs\t"; printGenReg source; stream " => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (AbsoluteToObjectIndex{ source, dest }) = ( stream "\tAbsToObjectAddr\t"; printGenReg source; stream " => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (AllocateMemoryFixed{bytesRequired, dest, saveRegs}) = ( stream "\tAllocateMemory\t"; stream(Word64.fmt StringCvt.DEC bytesRequired); stream " => "; printGenReg dest; stream " save="; printSaves(saveRegs, stream, printGenReg) ) | printICode {stream, printGenReg, ...} (AllocateMemoryVariable{size, dest, saveRegs}) = ( stream "\tAllocateMemory\t"; stream "s="; printGenReg(size); stream " => "; printGenReg dest; stream " save="; printSaves(saveRegs, stream, printGenReg) ) | printICode {stream, printGenReg, ...} (InitialiseMem{size, addr, init}) = ( stream "\tInitialiseMem\t"; stream "s="; printGenReg(size); stream ",i="; printGenReg(init); stream ",a="; printGenReg(addr) ) | printICode {stream, ...} BeginLoop = stream "\tBeginLoop" | printICode {stream, printGenReg, ...} (JumpLoop{regArgs, stackArgs, checkInterrupt, ... }) = ( stream "\tJumpLoop\t"; List.app(fn {src, dst} => (printGenReg(dst); stream "="; printArg(src, stream, printGenReg); stream " ")) regArgs; List.app( fn {src, wordOffset, stackloc} => (printStackLoc(stackloc, stream); stream("(sp" ^ Int.toString wordOffset); stream ")="; printArg(src, stream, printGenReg); stream " ") ) stackArgs; case checkInterrupt of NONE => () | SOME saveRegs => (stream " Check:save="; printSaves(saveRegs, stream, printGenReg)) ) | printICode {stream, printGenReg, ...} (StoreWithConstantOffset{ base, source, byteOffset, loadType }) = ( stream "\tStoreConstOffset"; printLoadType(loadType, stream); stream "\t"; printGenReg source; stream " => ["; printGenReg base; stream "+"; stream(Int.toString byteOffset); stream "]" ) | printICode {stream, printGenReg, printFPReg, ...} (StoreFPWithConstantOffset{ base, source, byteOffset, floatSize }) = ( stream "\tStoreConstOffset"; printFloatSize(floatSize, stream); stream "\t"; printFPReg source; stream " => ["; printGenReg base; stream "+"; stream(Int.toString byteOffset); stream "]" ) | printICode {stream, printGenReg, ...} (StoreWithIndexedOffset{ base, source, index, loadType, signExtendIndex }) = ( stream "\tStoreIndexed"; printLoadType(loadType, stream); stream "\t"; printGenReg source; stream " => ["; printGenReg base; stream "+"; printGenReg index; if signExtendIndex then stream " SX" else (); stream "]" ) | printICode {stream, printGenReg, printFPReg, ...} (StoreFPWithIndexedOffset{ base, source, index, floatSize, signExtendIndex }) = ( stream "\tStoreIndexed"; printFloatSize(floatSize, stream); stream "\t"; printFPReg source; stream " => ["; printGenReg base; stream "+"; printGenReg index; if signExtendIndex then stream " SX" else (); stream "]" ) | printICode {stream, printGenReg, printOptGenReg, ...} (AddSubImmediate{ source, dest, ccRef, immed, isAdd, length }) = ( stream (if isAdd then 
"\tAddImmediate" else "\tSubImmediate"); stream(arithRepr length); stream "\t"; printGenReg source; stream ",0x"; stream(Word.toString immed); stream " => "; printOptGenReg dest; case ccRef of NONE => () | SOME cc => (stream ", "; printCC(cc, stream)) ) | printICode {stream, printGenReg, printOptGenReg, ...} (AddSubRegister{ base, shifted, dest, ccRef, isAdd, length, shift }) = ( stream (if isAdd then "\tAddRegister" else "\tSubRegister"); stream(arithRepr length); stream "\t"; printGenReg base; stream ", "; printGenReg(shifted); printShift(shift, stream); stream " => "; printOptGenReg dest; case ccRef of NONE => () | SOME cc => (stream ", "; printCC(cc, stream)) ) | printICode {stream, printGenReg, printOptGenReg, ...} (LogicalImmediate{ source, dest, ccRef, immed, logOp, length }) = ( stream (case logOp of LogAnd => "\tAndImmediate" | LogOr => "\tOrImmediate" | LogXor => "\tXorImmediate"); stream(arithRepr length); stream "\t"; printGenReg source; stream ",0x"; stream(Word64.toString immed); stream " => "; printOptGenReg dest; case ccRef of NONE => () | SOME cc => (stream ", "; printCC(cc, stream)) ) | printICode {stream, printGenReg, printOptGenReg, ...} (LogicalRegister{ base, shifted, dest, ccRef, logOp, length, shift }) = ( stream (case logOp of LogAnd => "\tAndRegister" | LogOr => "\tOrRegister" | LogXor => "\tXorRegister"); stream(arithRepr length); stream "\t"; printGenReg base; stream ", "; printGenReg(shifted); printShift(shift, stream); stream " => "; printOptGenReg dest; case ccRef of NONE => () | SOME cc => (stream ", "; printCC(cc, stream)) ) | printICode {stream, printGenReg, ...} (ShiftRegister{ direction, dest, source, shift, opSize }) = ( stream ( case direction of ShiftLeft => "\tShiftLeft" | ShiftRightLogical => "\tShiftRightLog" | ShiftRightArithmetic => "\tShiftRightArith"); stream(arithRepr opSize); stream "\t"; printGenReg source; stream " by "; printGenReg(shift); stream " => "; printGenReg dest ) | printICode {stream, printGenReg, printOptGenReg, ...} (Multiplication{ kind, dest, sourceA, sourceM, sourceN }) = ( stream ( case kind of MultAdd32 => "\tMultAdd32\t" | MultSub32 => "\tMultSub32\t" | MultAdd64 => "\tMultAdd64\t" | MultSub64 => "\tMultSub64\t" | SignedMultAddLong => "\tSignedMultAddLong\t" | SignedMultHigh => "\tSignedMultHigh\t"); printGenReg(sourceM); stream " * "; printGenReg(sourceN); stream " +/- "; printOptGenReg sourceA; stream " => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (Division{ isSigned, dest, dividend, divisor, opSize }) = ( stream (if isSigned then "\tSignedDivide" else "\tUnsignedDivide"); stream(arithRepr opSize); stream "\t"; printGenReg(dividend); stream " by "; printGenReg(divisor); stream " => "; printGenReg dest ) | printICode {stream, printGenReg, printFPReg, ...} (BeginFunction{ regArgs, stackArgs, fpRegArgs }) = ( stream "\tBeginFunction\t"; List.app(fn (arg, r) => (stream(regRepr r); stream "="; printGenReg(arg); stream " ")) regArgs; List.app(fn (arg, VReg w) => (stream("V" ^ Int.toString(Word8.toInt w)); stream "="; printFPReg(arg); stream " ")) fpRegArgs; List.app(fn s => printStackLoc(s, stream)) stackArgs ) | printICode {stream, printGenReg, printFPReg, ...} (FunctionCall{callKind, regArgs, stackArgs, dests, fpRegArgs, fpDests, saveRegs, containers}) = ( stream "\tFunctionCall\t"; case callKind of Recursive => stream "recursive " | ConstantCode m => (stream(stringOfWord m); stream " ") | FullCall => (); stream "("; List.app(fn (arg, r) => (stream(regRepr r); stream "="; printArg(arg, stream, 
printGenReg); stream " ")) regArgs; List.app(fn (arg, VReg w) => (stream("V" ^ Int.toString(Word8.toInt w)); stream "="; printFPReg(arg); stream " ")) fpRegArgs; List.app(fn arg => (stream "p="; printArg(arg, stream, printGenReg); stream " ")) stackArgs; stream ") "; List.app(fn (pr, r) => (stream(regRepr r); stream "=>"; printGenReg pr; stream " ")) dests; List.app(fn (pr, VReg w) => (stream("V" ^ Int.toString(Word8.toInt w)); stream "=>"; printFPReg pr; stream " ")) fpDests; stream " save="; printSaves(saveRegs, stream, printGenReg); if null containers then () else (stream " containers="; List.app (fn c => (printStackLoc(c, stream); stream " ")) containers) ) | printICode {stream, printGenReg, printFPReg, ...} (TailRecursiveCall{callKind, regArgs, stackArgs, stackAdjust, currStackSize, fpRegArgs, ...}) = ( stream "\tTailCall\t"; case callKind of Recursive => stream "recursive " | ConstantCode m => (stream(stringOfWord m); stream " ") | FullCall => (); List.app(fn (arg, r) => (stream(regRepr r); stream "="; printArg(arg, stream, printGenReg); stream " ")) regArgs; List.app(fn (arg, VReg w) => (stream("V" ^ Int.toString(Word8.toInt w)); stream "="; printFPReg(arg); stream " ")) fpRegArgs; List.app(fn {src, stack} => (stream (Int.toString stack); stream "<="; printArg(src, stream, printGenReg); stream " ")) stackArgs; stream "adjust="; stream(Int.toString stackAdjust); stream " stackSize="; stream(Int.toString currStackSize) ) | printICode {stream, printGenReg, ...} (ReturnResultFromFunction{ results, returnReg, numStackArgs }) = ( stream "\tReturnFromFunction\t"; printGenReg(returnReg); stream "with "; List.app(fn (reg, r) => (stream(regRepr r); stream "=>"; printGenReg reg; stream " ")) results; stream("," ^ Int.toString numStackArgs) ) | printICode {stream, printGenReg, ...} (RaiseExceptionPacket{ packetReg }) = ( stream "\tRaiseException\t"; printGenReg(packetReg) ) | printICode {stream, printGenReg, ...} (PushToStack{ source, copies, container }) = ( stream "\tPushToStack\t"; printGenReg source; if copies > 1 then (stream " * "; stream(Int.toString copies)) else (); stream " => "; printStackLoc(container, stream) ) | printICode {stream, printGenReg, ...} (LoadStack{ dest, wordOffset, container, field }) = ( stream "\tLoadStack\t"; printStackLoc(container, stream); stream " + "; stream(Int.toString field); stream " ("; stream(Int.toString wordOffset); stream ")"; stream " => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (StoreToStack{ source, container, field, stackOffset }) = ( stream "\tStoreToStack\t"; printGenReg source; stream " => "; printStackLoc(container, stream); stream "+"; stream (Int.toString field); stream "("; stream(Int.toString stackOffset); stream ")" ) | printICode {stream, printGenReg, ...} (ContainerAddress{ dest, container, stackOffset }) = ( stream "\tContainerAddress\t"; stream "@"; printStackLoc(container, stream); stream " ("; stream(Int.toString stackOffset); stream ") => "; printGenReg dest ) | printICode {stream, ...} (ResetStackPtr{ numWords }) = ( stream "\tResetStackPtr\t"; stream(Int.toString numWords) ) | printICode {stream, printGenReg, ...} (TagValue{ source, dest, isSigned, opSize }) = ( stream "\tTag"; stream(if isSigned then "Signed" else "Unsigned"); stream(arithRepr opSize); stream "\t"; printGenReg source; stream " => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (UntagValue{ source, dest, isSigned, opSize }) = ( stream "\tUntag"; stream(if isSigned then "Signed" else "Unsigned"); stream(arithRepr opSize); stream 
"\t"; printGenReg source; stream " => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (BoxLarge{source, dest, saveRegs}) = ( stream "\tBoxLarge\t"; printGenReg source; stream " => "; printGenReg dest; stream " save="; printSaves(saveRegs, stream, printGenReg) ) | printICode {stream, printGenReg, ...} (UnboxLarge{source, dest}) = ( stream "\tUnboxLarge\t"; printGenReg source; stream " => "; printGenReg dest ) | printICode {stream, printGenReg, printFPReg, ...} (BoxTagFloat{floatSize, source, dest, saveRegs}) = ( stream "\tBoxTagFloat"; printFloatSize(floatSize, stream); stream "\t"; printFPReg source; stream " => "; printGenReg dest; stream " save="; printSaves(saveRegs, stream, printGenReg) ) | printICode {stream, printGenReg, printFPReg, ...} (UnboxTagFloat{floatSize, source, dest}) = ( stream "\tUnboxTagFloat"; printFloatSize(floatSize, stream); stream "\t"; printGenReg source; stream " => "; printFPReg dest ) | printICode {stream, printGenReg, ...} (LoadAcquire{ base, dest, loadType }) = ( stream "\tLoadAcquire"; printLoadType(loadType, stream); stream "\t["; printGenReg base; stream "] => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (StoreRelease{ base, source, loadType }) = ( stream "\tStoreRelease"; printLoadType(loadType, stream); stream "\t"; printGenReg source; stream " => ["; printGenReg base; stream "]" ) | printICode {stream, printGenReg, ...} (BitFieldShift{ source, dest, isSigned, length, immr, imms }) = ( stream "\tBitShift"; stream(if isSigned then "Signed" else "Unsigned"); stream(arithRepr length); stream "\t"; printGenReg source; stream " => "; printGenReg dest; stream " immr="; stream(Word.fmt StringCvt.DEC immr); stream " imms="; stream(Word.fmt StringCvt.DEC imms) ) | printICode {stream, printGenReg, ...} (BitFieldInsert{ source, dest, destAsSource, length, immr, imms }) = ( stream "\tBitInsert"; stream(arithRepr length); stream "\t"; printGenReg source; stream " with "; printGenReg destAsSource; stream " => "; printGenReg dest; stream " immr="; stream(Word.fmt StringCvt.DEC immr); stream " imms="; stream(Word.fmt StringCvt.DEC imms) ) | printICode {stream, printGenReg, ...} (IndexedCaseOperation{testReg}) = ( stream "\tIndexedCase\t"; printGenReg testReg ) | printICode {stream, ...} PushExceptionHandler = stream "\tPushExcHandler" | printICode {stream, ...} PopExceptionHandler = stream "\tPopExcHandler" | printICode {stream, printGenReg, ...} (BeginHandler{packetReg}) = ( stream "\tBeginHandler\t"; printGenReg packetReg ) | printICode {stream, printGenReg, ...} (CompareByteVectors{vec1Addr, vec2Addr, length, ccRef, ...}) = ( stream "\tCompareByteVectors\t"; printGenReg(vec1Addr); stream ","; printGenReg(vec2Addr); stream ","; printGenReg(length); stream " => "; printCC(ccRef, stream) ) | printICode {stream, printGenReg, ...} (BlockMove{srcAddr, destAddr, length, isByteMove}) = ( stream(if isByteMove then "\tBlockByteMove\t" else "\tBlockWordMove\t"); stream "src="; printGenReg(srcAddr); stream ",dest="; printGenReg(destAddr); stream ",len="; printGenReg(length) ) | printICode {stream, printGenReg, printOptGenReg, ...} (AddSubXSP{ source, dest, isAdd }) = ( stream(if isAdd then "\tAdd\t" else "\tSubtract\t"); printGenReg source; stream " XSP => "; printOptGenReg dest ) | printICode {stream, printGenReg, ...} (TouchValue{ source }) = ( stream "\tTouchValue\t"; printGenReg source ) | printICode {stream, printGenReg, ...} (LoadAcquireExclusive{ base, dest }) = ( stream "\tLoadExclusive\t["; printGenReg base; stream "] => "; printGenReg dest 
) | printICode {stream, printGenReg, printOptGenReg, ...} (StoreReleaseExclusive{ base, source, result }) = ( stream "\tStoreExclusive\t"; printOptGenReg source; stream " => ["; printGenReg base; stream "] result => "; printGenReg result ) | printICode {stream, ...} MemoryBarrier = stream "\tMemoryBarrier" | printICode {stream, printGenReg, printFPReg, ...} (ConvertIntToFloat{ source, dest, srcSize, destSize}) = ( stream "\tConvert"; stream(arithRepr srcSize); stream "To"; printFloatSize(destSize, stream); stream "\t"; printGenReg source; stream " => "; printFPReg dest ) | printICode {stream, printGenReg, printFPReg, ...} (ConvertFloatToInt{ source, dest, srcSize, destSize, rounding}) = let open IEEEReal in stream "\tConvert"; printFloatSize(srcSize, stream); stream "To"; stream(arithRepr destSize); stream "\t"; printFPReg source; stream " => "; printGenReg dest; stream( case rounding of TO_NEAREST => " rounding" | TO_NEGINF => " rounding down" | TO_POSINF => " rounding up" | TO_ZERO => " truncating" ) end | printICode {stream, printFPReg, ...} (UnaryFloatingPt{ source, dest, fpOp}) = ( stream( case fpOp of NegFloat => "\tNegFloat\t" | NegDouble => "\tNegDouble\t" | AbsFloat => "\tAbsFloat\t" | AbsDouble => "\tAbsDouble\t" | ConvFloatToDble => "\tFloatToDble\t" - | ConvDbleToFloat => "\t\t" + | ConvDbleToFloat => "\tDbleToFloat\t" + | MoveDouble => "\tMoveDouble\t" + | MoveFloat => "\tMoveFloat\t" ); printFPReg source; stream " => "; printFPReg dest ) | printICode {stream, printFPReg, ...} (BinaryFloatingPoint{ arg1, arg2, dest, fpOp, opSize}) = ( stream( case fpOp of MultiplyFP => "\tMultiply" | DivideFP => "\tDivide" | AddFP => "\tAdd" | SubtractFP => "\tSubtract" ); printFloatSize(opSize, stream); stream "\t"; printFPReg arg1; stream ", "; printFPReg arg2; stream " => "; printFPReg dest ) | printICode {stream, printFPReg, ...} (CompareFloatingPoint{ arg1, arg2, opSize, ccRef}) = ( stream "\tCompare"; printFloatSize(opSize, stream); stream "\t"; printFPReg arg1; stream ", "; printFPReg arg2; stream ", "; printCC(ccRef, stream) ) | printICode {stream, ...} CPUYield = stream "\tCpuYield" | printICode {stream, printGenReg, printOptGenReg, ...} (AtomicOperation{ base, source, dest, atOp }) = ( case atOp of LoadAddAL => stream "\tLoadAddAL\t" | LoadUmaxAL => stream "\tLoadUmaxAL\t" | SwapAL => stream "\tSwapAL\t" | LoadAddAcquire => stream "\tLoadAddAcquire\t" | LoadUMaxAcquire => stream "\tLoadUMaxAcquire\t" | SwapRelease => stream "\tSwapRelease\t"; printOptGenReg source; stream ",["; printGenReg base; stream "] => "; printOptGenReg dest ) | printICode {stream, printGenReg, ...} (CacheCheck{ arg1, arg2}) = ( stream "\tCacheCheck\t"; printGenReg arg1; stream ", "; printGenReg arg2 ) and printCondition(cond, stream) = stream(condToString cond) (* Print a basic block. *) fun printBlock {stream, printGenReg, printOptGenReg, printFPReg} (blockNo, BasicBlock{block, flow, ...}) = ( (* Put a label on all but the first. 
*) if blockNo <> 0 then stream("L" ^ Int.toString blockNo ^ ":") else (); List.app (fn icode => (printICode {stream=stream, printGenReg=printGenReg, printOptGenReg=printOptGenReg, printFPReg=printFPReg} (icode); stream "\n")) block; case flow of Unconditional l => stream("\tJump\tL" ^ Int.toString l ^ "\n") | Conditional {condition, trueJump, falseJump, ccRef, ...} => ( stream "\tJump"; printCondition(condition, stream); stream "\t"; printCC(ccRef, stream); stream " L"; stream (Int.toString trueJump); stream " else L"; stream (Int.toString falseJump); stream "\n" ) | ExitCode => () | IndexedBr _ => () | SetHandler{handler, continue} => stream(concat["\tSetHandler\tH", Int.toString handler, "\n", "\tJump\tL", Int.toString continue, "\n"]) | UnconditionalHandle handler => stream("\tJump\tH" ^ Int.toString handler ^ "\n") | ConditionalHandle{handler, continue} => stream(concat["\tJump\tL", Int.toString continue, " or H", Int.toString handler, "\n"]) ) in fun printPReg stream (PReg i) = stream("R" ^ Int.toString i) fun printOptPReg stream ZeroReg = stream "Zero" | printOptPReg stream (SomeReg reg) = printPReg stream reg fun printXReg stream (XReg w) = stream("X" ^ Int.toString(Word8.toInt w)) | printXReg stream XZero = stream "XZ" | printXReg stream XSP = stream "XSP" fun printVReg stream (VReg w) = stream("V" ^ Int.toString(Word8.toInt w)) fun printICodeAbstract(blockVec, stream) = Vector.appi(printBlock{stream=stream, printGenReg=printPReg stream, printOptGenReg=printOptPReg stream, printFPReg=printPReg stream}) blockVec and printICodeConcrete(blockVec, stream) = Vector.appi(printBlock{stream=stream, printGenReg=printXReg stream, printOptGenReg=printXReg stream, printFPReg=printVReg stream}) blockVec end (* Only certain bit patterns are allowed in a logical immediate instruction but the encoding is complex so it's easiest to inherit the test from the assembler layer. *) local fun optow OpSize32 = WordSize32 | optow OpSize64 = WordSize64 in fun isEncodableBitPattern(v, w) = Arm64Code.isEncodableBitPattern(v, optow w) end (* This generates a BitField instruction with the appropriate values for immr and imms. 
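   For example (an illustrative sketch, not part of the original source): a logical left shift
   by three of a 64-bit value is requested as
       shiftConstant{direction=ShiftLeft, shift=0w3, opSize=OpSize64, source=r1, dest=r2}
   and is generated as
       BitFieldShift{source=r1, dest=r2, isSigned=false, length=OpSize64, immr=0w61, imms=0w60}
   i.e. immr = (64 - shift) mod 64 and imms = 63 - shift, the usual UBFM encoding of LSL;
   right shifts simply use immr = shift with imms = 63 (or 31 for 32-bit operations).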
*) fun shiftConstant{ direction, dest, source, shift, opSize } = let val (isSigned, immr, imms) = case (direction, opSize) of (ShiftLeft, OpSize64) => (false, Word.~ shift mod 0w64, 0w64-0w1-shift) | (ShiftLeft, OpSize32) => (false, Word.~ shift mod 0w32, 0w32-0w1-shift) | (ShiftRightLogical, OpSize64) => (false, shift, 0wx3f) | (ShiftRightLogical, OpSize32) => (false, shift, 0wx1f) | (ShiftRightArithmetic, OpSize64) => (true, shift, 0wx3f) | (ShiftRightArithmetic, OpSize32) => (true, shift, 0wx1f) in BitFieldShift{ source=source, dest=dest, isSigned=isSigned, length=opSize, immr=immr, imms=imms } end structure Sharing = struct type xReg = xReg and vReg = vReg and reg = reg and condition = condition and shiftType = shiftType and ('genReg, 'optGenReg, 'fpReg) arm64ICode = ('genReg, 'optGenReg, 'fpReg) arm64ICode and preg = preg and pregOrZero = pregOrZero and controlFlow = controlFlow and ('genReg, 'optGenReg, 'fpReg) basicBlock = ('genReg, 'optGenReg, 'fpReg) basicBlock and stackLocn = stackLocn and regProperty = regProperty and ccRef = ccRef and 'genReg fnarg = 'genReg fnarg and closureRef = closureRef and loadType = loadType and opSize = opSize and logicalOp = logicalOp and callKind = callKind and floatSize = floatSize and shiftDirection = shiftDirection and multKind = multKind and fpUnary = fpUnary and fpBinary = fpBinary and atomicOp = atomicOp end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICodeOptimise.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICodeOptimise.ML index 4add52b0..6ab8e2a3 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICodeOptimise.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICodeOptimise.ML @@ -1,622 +1,643 @@ (* - Copyright David C. J. Matthews 2021 + Copyright David C. J. Matthews 2021-2 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64ICodeOptimise( structure Arm64ICode: ARM64ICODE structure IntSet: INTSET structure Identify: ARM64IDENTIFYREFERENCES structure Debug: DEBUG structure Pretty: PRETTY sharing Arm64ICode.Sharing = Identify.Sharing = IntSet = Arm64ICode ): ARM64ICODEOPTIMISE = struct open Arm64ICode open IntSet open Identify datatype optimise = Changed of (preg, pregOrZero, preg) basicBlock vector * regProperty vector | Unchanged exception InternalError = Misc.InternalError (* Optimiser. This could incorporate optimisations done elsewhere. IdentifyReferences currently removes instructions that produce results in registers that are never used. + AllocateRegisters deals generally with caching. - PushRegisters deals with caching. Caching involves - speculative changes that can be reversed if there is a need - to spill registers. - - The optimiser currently deals with booleans and conditions. + The optimiser currently deals with booleans and conditions and with + constants. *) (* This is a rewrite of the last instruction to set a boolean. This is almost always rewriting the next instruction. 
The only possibility is that we have a ResetStackPtr in between. *) datatype boolRegRewrite = BRNone (* BRSetConditionToConstant - we have a comparison of two constant value. This will usually happen because we've duplicated a branch and set a register to a constant which we then compare. *) | BRSetConditionToConstant of { srcCC: ccRef, signedCompare: order, unsignedCompare: order } fun optimiseICode{ code, pregProps, ccCount=_, debugSwitches=_ } = let val hasChanged = ref false val pregPropSize = Vector.length pregProps val regCounter = ref pregPropSize (* Allocate new registers after the old. *) val regList = ref [] (* New properties are added in reverse order. *) fun newReg kind = ( regList := kind :: ! regList; PReg (!regCounter) ) before regCounter := !regCounter + 1 (* Constant values in registers. *) datatype regValue = SomeValue | NonAddressConst of Word64.word | AddressConstant of machineWord + | LargeBox of preg + | RealBox of preg * floatSize (* If this argument is a register and the register is mapped to a constant or another register replace the value. Unlike the X86 version we don't map memory locations but we do map registers. *) (* TODO: This is potentially quadratic if we have a long piece of code with very many registers. *) fun getRegisterValue(preg as PReg pregNo, kill, regMap) = ( case List.find(fn {dest, ... } => dest = preg) regMap of SOME { source, ...} => ( source, (* Filter it if it is the last reference. *) if member(pregNo, kill) then List.filter(fn {dest, ...} => dest <> preg) regMap else regMap ) | NONE => (SomeValue, regMap) ) fun optimiseBlock processed (block, flow, outCCState) = let fun optCode([], brCond, regMap, code, changed) = (code, brCond, regMap, changed) | optCode({instr as AddSubImmediate{source, dest=ZeroReg, ccRef=SOME ccRefOut, immed, isAdd=false, length}, kill, ...} :: rest, _, regMap, code, changed) = let val (repArg1, memRefsOut) = getRegisterValue(source, kill, regMap) in case repArg1 of NonAddressConst test => (* AddSubImmediate is put in by CodetreeToIcode to test a boolean value. It can also arise as the result of pattern matching on booleans or even by tests such as = true. If the source register is now a constant we want to propagate the constant condition. *) let (* This comparison reduces to a constant. *) val _ = hasChanged := true (* Signed comparison. If this is a 32-bit operation the top word could be zero so we need to convert this as Word32. immediate values are always unsigned. *) val testValue = case length of OpSize64 => Word64.toLargeIntX test | OpSize32 => Word32.toLargeIntX(Word32.fromLarge test) (* Put in a replacement so that if we were previously testing ccRefOut we should instead test ccRef. *) val repl = BRSetConditionToConstant{srcCC=ccRefOut, signedCompare=LargeInt.compare(testValue, Word.toLargeInt immed), unsignedCompare=Word64.compare(test, Word64.fromLarge(Word.toLargeWord immed))} val _ = isSome outCCState andalso raise InternalError "optCode: CC exported" in optCode(rest, repl, memRefsOut, code, true) end | _ => optCode(rest, BRNone, memRefsOut, instr :: code, changed) end | optCode({instr as AddSubImmediate{source, dest=SomeReg dest, ccRef=NONE, immed, isAdd, length}, kill, ...} :: rest, _, regMap, code, changed) = (* This is frequently used to remove a tag from a value before an addition or subtraction. If it's a constant we want to do that now. 
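   For example (an illustrative sketch, not part of the original source): if the source register
   is currently mapped to NonAddressConst 0w7 (the tagged form of 3) and the instruction
   subtracts the tag, i.e. immed=0w1 and isAdd=false, the whole operation folds to
       LoadNonAddressConstant{dest=dest, source=0w6}
   and the map gains dest |-> NonAddressConst 0w6 for any later uses of the destination.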
*) let val (repArg1, newMap) = getRegisterValue(source, kill, regMap) in case repArg1 of NonAddressConst cVal => let val addSub = if isAdd then cVal + Word.toLarge immed else cVal - Word.toLarge immed (* Mask the result to 32-bits if this is a 32-bit operation. *) val result = case length of OpSize32 => Word64.andb(addSub, Word64.<<(0w1, 0w32) - 0w1) | OpSize64 => addSub in optCode(rest, BRNone, {dest=dest, source=NonAddressConst result} :: newMap, LoadNonAddressConstant{dest=dest, source=result} :: code, true) end | _ => optCode(rest, BRNone, newMap, instr ::code, changed) end | optCode({instr as AddSubRegister{base, shifted, dest, ccRef, isAdd, length, shift}, kill, ...} :: rest, _, regMap, code, changed) = let (* If we have a constant as the second argument we can change this to the immediate form. *) val (repOp1, mapOp1) = getRegisterValue(base, kill, regMap) val (repOp2, mapOp2) = getRegisterValue(shifted, kill, mapOp1) val regAndImmed = case (repOp1, repOp2, isAdd, shift) of (_, NonAddressConst immed, _, ShiftNone) => if immed < 0w4096 then SOME(base, immed) else NONE (* If this is an ADD we can also use the immediate if the first arg is a constant. *) | (NonAddressConst immed, _, true, ShiftNone) => if immed < 0w4096 then SOME(shifted, immed) else NONE | _ => NONE in case regAndImmed of SOME(srcReg, immediate) => optCode(rest, BRNone, mapOp2, AddSubImmediate{source=srcReg, dest=dest, ccRef=ccRef, immed=Word.fromLargeWord(Word64.toLargeWord immediate), isAdd=isAdd, length=length} :: code, true) | NONE => optCode(rest, BRNone, mapOp2, instr :: code, changed) end | optCode({instr as LogicalRegister{base, shifted, dest, ccRef, logOp, length, shift}, kill, ...} :: rest, _, regMap, code, changed) = let (* If we have a constant as the second argument we can change this to the immediate form. *) val (repOp1, mapOp1) = getRegisterValue(base, kill, regMap) val (repOp2, mapOp2) = getRegisterValue(shifted, kill, mapOp1) val regAndImmed = case (repOp1, repOp2, shift) of (_, NonAddressConst immed, ShiftNone) => if isEncodableBitPattern (immed, length) then SOME(base, immed) else NONE | (NonAddressConst immed, _, ShiftNone) => if isEncodableBitPattern (immed, length) then SOME(shifted, immed) else NONE | _ => NONE in case regAndImmed of SOME(srcReg, immediate) => optCode(rest, BRNone, mapOp2, LogicalImmediate{source=srcReg, dest=dest, ccRef=ccRef, immed=immediate, logOp=logOp, length=length} :: code, true) | NONE => optCode(rest, BRNone, mapOp2, instr :: code, changed) end | optCode({instr as MoveRegister{dest, source}, kill, ...} :: rest, inCond, regMap, code, changed) = let val (repSource, mapAfterReplace) = getRegisterValue(source, kill, regMap) (* If the source is mapped to a constant we can set the destination to the same constant. *) val newMap = case repSource of SomeValue => mapAfterReplace | _ => {dest=dest, source=repSource} :: mapAfterReplace in optCode(rest, inCond, newMap, instr :: code, changed) end | optCode({instr as LoadNonAddressConstant{dest, source}, ...} :: rest, inCond, regMap, code, changed) = let (* If we already have a register with this constant we would probably be better off reusing it. The map, though, needs to indicate that the destination register contains the constant. The X86 version always uses a load-constant here. *) val newInstr = case List.find(fn {source=NonAddressConst c, ... 
} => c = source | _ => false) regMap of SOME{dest=cDest, ...} => MoveRegister{dest=dest, source=cDest} | NONE => instr in optCode(rest, inCond, {dest=dest, source=NonAddressConst source} :: regMap, newInstr :: code, changed) end | optCode({instr as LoadAddressConstant{dest, source}, ...} :: rest, inCond, regMap, code, changed) = (* Address constant. This is used in conjunction with UnboxValue *) optCode(rest, inCond, {dest=dest, source=AddressConstant source} :: regMap, instr :: code, changed) + | optCode({instr as BoxLarge{ source, dest, ... }, ...} :: rest, inCond, regMap, code, changed) = + (* Try to eliminate adjacent sequences of boxing and unboxing. *) + optCode(rest, inCond, {dest=dest, source=LargeBox source} :: regMap, instr :: code, changed) + + | optCode({instr as BoxTagFloat{ source, dest, floatSize, ... }, ...} :: rest, inCond, regMap, code, changed) = + optCode(rest, inCond, {dest=dest, source=RealBox(source, floatSize)} :: regMap, instr :: code, changed) + | optCode({instr as UnboxLarge{ source, dest, ... }, kill, ...} :: rest, inCond, regMap, code, changed) = let val (repSource, mapAfterReplace) = getRegisterValue(source, kill, regMap) in (* If we're unboxing a constant address we can unpack it now. This is intended to handle constant arguments to LargeWord operations. *) case repSource of AddressConstant cVal => let (* Check this looks like a boxed large word. *) open Address val addr = toAddress cVal val _ = isBytes addr andalso length addr = 0w8 div wordSize val wordConst: LargeWord.word = RunCall.unsafeCast cVal val newMap = {dest=dest, source=NonAddressConst wordConst} :: mapAfterReplace in optCode(rest, inCond, newMap, LoadNonAddressConstant{dest=dest, source=wordConst} :: code, true) end + + (* Or if it was recently boxed we can use the original and hopefully eliminate the box. *) + | LargeBox original => optCode(rest, inCond, mapAfterReplace, MoveRegister{dest=dest, source=original} :: code, true) + | _ => optCode(rest, inCond, mapAfterReplace, instr :: code, changed) end | optCode({instr as UntagValue{ source, dest, isSigned, opSize }, kill, ...} :: rest, inCond, regMap, code, changed) = (* If we're untagging a constant we can produce the result now. *) let val (repSource, mapAfterReplace) = getRegisterValue(source, kill, regMap) in case repSource of NonAddressConst cVal => let (* The result depends on the kind of untagging. Unsigned values can just be right shifted but signed values need to be treated differently for 32-bit and 64-bit operations. *) val wordConst = case (isSigned, opSize) of (false, _) => LargeWord.>>(cVal, 0w1) | (true, OpSize64) => LargeWord.~>>(cVal, 0w1) | (true, OpSize32) => Word32.toLarge(Word32.~>>(Word32.fromLarge cVal, 0w1)) val newMap = {dest=dest, source=NonAddressConst wordConst} :: mapAfterReplace in optCode(rest, inCond, newMap, LoadNonAddressConstant{dest=dest, source=wordConst} :: code, true) end | _ => optCode(rest, inCond, mapAfterReplace, instr :: code, changed) end | optCode({instr as UnboxTagFloat{ source, dest, floatSize=Float32 }, kill, ...} :: rest, inCond, regMap, code, changed) = let val (repSource, mapAfterReplace) = getRegisterValue(source, kill, regMap) in case repSource of NonAddressConst cVal => (* Should only be Float32 in native 64-bits.
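   Illustratively (not from the original source): in native 64-bit code a Real32.real is kept
   tagged in a single word, with the IEEE single-precision bits in the top 32 bits and the low
   32 bits holding only the tag, so 1.0 would appear as 0wx3F80000000000001 and unboxing a
   constant only needs
       LoadFPConstant{dest=dest, floatSize=Float32, source=LargeWord.>>(cVal, 0w32)}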
*) let val _ = not is32in64 andalso LargeWord.andb(cVal, 0wxffffffff) = 0w1 orelse raise InternalError "incorrect FP constant form" val fpConstant = LargeWord.>>(cVal, 0w32) in optCode(rest, inCond, mapAfterReplace, LoadFPConstant{dest=dest, floatSize=Float32, source=fpConstant} :: code, true) end | AddressConstant cVal => let open Address val addr = toAddress cVal val _ = is32in64 andalso length addr = 0w1 andalso flags addr = F_bytes orelse raise InternalError "incorrect FP constant form" val fpConstant = RunCall.loadPolyWord(addr, 0w0) in optCode(rest, inCond, mapAfterReplace, LoadFPConstant{dest=dest, floatSize=Float32, source=fpConstant} :: code, true) end + | RealBox(original, fSize) => + ( + fSize = Float32 orelse raise InternalError "Mismatch float size"; + optCode(rest, inCond, mapAfterReplace, UnaryFloatingPt{dest=dest, source=original, fpOp=MoveFloat} :: code, true) + ) + | _ => optCode(rest, inCond, mapAfterReplace, instr :: code, changed) end | optCode({instr as UnboxTagFloat{ source, dest, floatSize=Double64 }, kill, ...} :: rest, inCond, regMap, code, changed) = let val (repSource, mapAfterReplace) = getRegisterValue(source, kill, regMap) in case repSource of AddressConstant cVal => let open Address val addr = toAddress cVal val _ = length addr = (0w8 div wordSize) andalso flags addr = F_bytes orelse raise InternalError "incorrect FP constant form" val fpConstant = RunCall.loadNativeWord(addr, 0w0) in optCode(rest, inCond, mapAfterReplace, LoadFPConstant{dest=dest, floatSize=Double64, source=fpConstant} :: code, true) end + | RealBox(original, fSize) => + ( + fSize = Double64 orelse raise InternalError "Mismatch float size"; + optCode(rest, inCond, mapAfterReplace, UnaryFloatingPt{dest=dest, source=original, fpOp=MoveDouble} :: code, true) + ) + | _ => optCode(rest, inCond, mapAfterReplace, instr :: code, changed) end (* Some of these are specifically to reduce "ref" to its simplest form since it's generated as a variable-length item. *) | optCode({instr as AllocateMemoryVariable{ size, dest, saveRegs }, kill, ...} :: rest, inCond, regMap, code, changed) = (* Turn a variable size allocation into a fixed size allocation if the size is a constant. *) let val (repSize, mapAfterReplace) = getRegisterValue(size, kill, regMap) in case repSize of NonAddressConst words => let val wordsRequired = if is32in64 then (* Have to round this up to 8 bytes *) Word64.andb(words+0w2, ~ 0w2) else words+0w1 val bytesRequired = Word64.fromLarge(Word.toLarge Address.wordSize) * wordsRequired in optCode(rest, inCond, mapAfterReplace, AllocateMemoryFixed{bytesRequired=bytesRequired, dest=dest, saveRegs=saveRegs} :: code, true) end | _ => optCode(rest, inCond, mapAfterReplace, instr :: code, changed) end | optCode({instr as InitialiseMem{size, addr, init}, kill, ...} :: rest, inCond, regMap, code, changed) = (* If we're initialising "a few" words we're better off unrolling the loop. 
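   For example (an illustrative sketch, not part of the original source): a constant size of
   0w3 on native 64-bit unrolls to three stores,
       StoreWithConstantOffset{source=init, base=addr, byteOffset=0, loadType=Load64}
       StoreWithConstantOffset{source=init, base=addr, byteOffset=8, loadType=Load64}
       StoreWithConstantOffset{source=init, base=addr, byteOffset=16, loadType=Load64}
   with 32-in-64 using Load32 and 4-byte steps instead.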
*) let val (repSize, mapAfterReplace) = getRegisterValue(size, kill, regMap) in case repSize of NonAddressConst words => if words <= 0w8 then let val nWords = LargeWord.toInt words fun unroll(n, l) = if n = nWords then l else unroll(n+1, StoreWithConstantOffset{ source=init, base=addr, byteOffset=n*Word.toInt Address.wordSize, loadType=if is32in64 then Load32 else Load64 } :: l) in optCode(rest, inCond, mapAfterReplace, unroll(0, code), true) end else optCode(rest, inCond, mapAfterReplace, instr :: code, changed) | _ => optCode(rest, inCond, mapAfterReplace, instr :: code, changed) end | optCode({instr as BitFieldShift{ source, dest, isSigned, length, immr, imms }, kill, ...} :: rest, inCond, regMap, code, changed) = (* Bit shift. Specifically this is used to shift the flags byte to construct a length word. The flags are frequently a constant. Unlike BitFieldInsert this sets unused bits to either zero or the sign bit. *) let val (repSource, mapAfterReplace) = getRegisterValue(source, kill, regMap) in case repSource of NonAddressConst cVal => let val regSize = case length of OpSize32 => 0w32 | OpSize64 => 0w64 in if not isSigned andalso imms + 0w1 = immr then (* Simple left shift: ignore the other cases for the moment. *) let val wordConst64 = Word64.<<(cVal, regSize-immr) val wordConst = case length of OpSize64 => wordConst64 | OpSize32 => Word64.andb(wordConst64, 0wxffffffff) val newMap = {dest=dest, source=NonAddressConst wordConst} :: mapAfterReplace in optCode(rest, inCond, newMap, LoadNonAddressConstant{dest=dest, source=wordConst} :: code, true) end else optCode(rest, inCond, mapAfterReplace, instr :: code, changed) end | _ => optCode(rest, inCond, mapAfterReplace, instr :: code, changed) end - | optCode({instr as BitFieldInsert{ source, dest, destAsSource, length=_, immr, imms }, kill, ...} :: rest, inCond, regMap, code, changed) = (* Bit field insertion. This is used to insert the length field into the register containing the shifted flags value. *) let val (repSource, mapAfterRepSrc) = getRegisterValue(source, kill, regMap) val (repDestAs, mapAfterReplace) = getRegisterValue(destAsSource, kill, mapAfterRepSrc) in case (repSource, repDestAs) of (NonAddressConst srcVal, NonAddressConst dstVal) => if immr = 0w0 then (* Insert new bits without shifting. *) let (* We take imms-immr+1 bits from the source. *) val maskSrc = Word64.>>(Word64.notb 0w0, 0w64-(imms+0w1)) val maskDst = Word64.notb maskSrc val wordConst = Word64.orb(Word64.andb(dstVal, maskDst), Word64.andb(srcVal, maskSrc)) val newMap = {dest=dest, source=NonAddressConst wordConst} :: mapAfterReplace in optCode(rest, inCond, newMap, LoadNonAddressConstant{dest=dest, source=wordConst} :: code, true) end else optCode(rest, inCond, mapAfterReplace, instr :: code, changed) | _ => optCode(rest, inCond, mapAfterReplace, instr :: code, changed) end (* Clear the cache across a function call. We would have to push these registers. *) | optCode({instr as FunctionCall _, ...} :: rest, _, _, code, changed) = optCode(rest, BRNone, [], instr::code, changed) | optCode({instr as BeginLoop, ...} :: rest, _, _, code, changed) = (* Any register value from outside the loop is not valid inside. *) optCode(rest, BRNone, [], instr::code, changed) | optCode({instr as JumpLoop _, ...} :: rest, _, _, code, changed) = (* Likewise at the end of the loop. Not sure if this is essential. *) optCode(rest, BRNone, [], instr::code, changed) (* CompareByteVectors and BlockMove modify their arguments. 
In particular if we have the length as a constant in a register it won't still have that value at the end. TODO: This coult be refined. *) | optCode({instr as CompareByteVectors _, ...} :: rest, _, _, code, changed) = (* Likewise at the end of the loop. Not sure if this is essential. *) optCode(rest, BRNone, [], instr::code, changed) | optCode({instr as BlockMove _, ...} :: rest, _, _, code, changed) = (* Likewise at the end of the loop. Not sure if this is essential. *) optCode(rest, BRNone, [], instr::code, changed) - | optCode({instr, ...} :: rest, inCond, regMap, code, changed) = let (* If this instruction affects the CC the cached SetToCondition will no longer be valid. *) val afterCond = case getInstructionCC instr of CCUnchanged => inCond | _ => BRNone in optCode(rest, afterCond, regMap, instr::code, changed) end val (blkCode, finalRepl, finalMap, blockChanged) = optCode(block, BRNone, [], processed, false) val _ = if blockChanged then hasChanged := true else () in case (flow, finalRepl) of (* We have a Condition and a change to the condition. *) (flow as Conditional{ccRef, condition, trueJump, falseJump}, BRSetConditionToConstant({srcCC, signedCompare, unsignedCompare, ...})) => if srcCC = ccRef then let val testResult = case (condition, signedCompare, unsignedCompare) of (CondEqual, EQUAL, _) => true | (CondEqual, _, _) => false | (CondNotEqual, EQUAL, _) => false | (CondNotEqual, _, _) => true | (CondSignedLess, LESS, _) => true | (CondSignedLess, _, _) => false | (CondSignedGreater, GREATER,_) => true | (CondSignedGreater, _, _) => false | (CondSignedLessEq, GREATER,_) => false | (CondSignedLessEq, _, _) => true | (CondSignedGreaterEq, LESS, _) => false | (CondSignedGreaterEq, _, _) => true | (CondCarryClear, _, LESS ) => true | (CondCarryClear, _, _) => false | (CondUnsignedHigher, _,GREATER) => true | (CondUnsignedHigher, _, _) => false | (CondUnsignedLowOrEq, _,GREATER) => false | (CondUnsignedLowOrEq, _, _) => true | (CondCarrySet, _, LESS ) => false | (CondCarrySet, _, _) => true (* The overflow and parity checks should never occur. *) | _ => raise InternalError "getCondResult: comparison" val newFlow = if testResult then Unconditional trueJump else Unconditional falseJump val() = hasChanged := true in BasicBlock{flow=newFlow, block=List.rev blkCode} end else BasicBlock{flow=flow, block=List.rev blkCode} | (flow as Unconditional jmp, _) => let val ExtendedBasicBlock{block=targetBlck, locals, exports, flow=targetFlow, outCCState=targetCC, ...} = Vector.sub(code, jmp) (* If the target is empty or is simply one or more Resets or a Return we're better off merging this in rather than doing the jump. We allow a single Load e.g. when loading a constant or moving a register. If we have a CompareLiteral and we're comparing with a register in the map that has been set to a constant we include that because the comparison will then be reduced to a constant. *) fun isSimple([], _, _) = true | isSimple ({instr=ResetStackPtr _, ...} :: instrs, moves, regMap) = isSimple(instrs, moves, regMap) | isSimple ({instr=ReturnResultFromFunction _, ...} :: instrs, moves, regMap) = isSimple(instrs, moves, regMap) | isSimple ({instr=RaiseExceptionPacket _, ...} :: instrs, moves, regMap) = isSimple(instrs, moves, regMap) | isSimple ({instr=MoveRegister{source, dest}, ...} :: instrs, moves, regMap) = let (* We frequently have a move of the original register into a new register before the test. *) val newMap = case List.find(fn {dest, ... 
} => dest = source) regMap of SOME {source, ...} => {dest=dest, source=source} :: regMap | NONE => regMap in moves = 0 andalso isSimple(instrs, moves+1, newMap) end | isSimple ({instr=LoadNonAddressConstant _, ...} :: instrs, moves, regMap) = moves = 0 andalso isSimple(instrs, moves+1, regMap) | isSimple ({instr=LoadAddressConstant _, ...} :: instrs, moves, regMap) = moves = 0 andalso isSimple(instrs, moves+1, regMap) | isSimple ({instr=AddSubImmediate{source, dest=ZeroReg, ...}, ...} :: instrs, moves, regMap) = let val isReplace = List.find(fn {dest, ... } => dest = source) regMap in case isReplace of SOME {source=NonAddressConst _, ...} => isSimple(instrs, moves, regMap) | _ => false end | isSimple _ = false in (* Merge trivial blocks. This previously also tried to merge non-trivial blocks if they only had one reference but this ends up duplicating non-trivial code. If we have a trivial block that has multiple references but is the only reference to a non-trivial block we can merge the non-trivial block into it. That would be fine except that at the same time we may merge this trivial block elsewhere. *) (* The restriction that a block must only export "merge" registers is unfortunate but necessary to avoid the situation where a non-merge register is defined at multiple points and cannot be pushed to the stack. This really isn't an issue with blocks with unconditional branches but there are cases where we have successive tests of the same condition and that results in local registers being defined and then exported. This occurs in, for example, fun f x = if x > "abcde" then "yes" else "no"; *) if isSimple(targetBlck, 0, finalMap) andalso List.all (fn i => Vector.sub(pregProps, i) = RegPropMultiple) (setToList exports) then let (* Copy the block, creating new registers for the locals. *) val localMap = List.map (fn r => (PReg r, newReg(Vector.sub(pregProps, r)))) (setToList locals) fun mapReg r = case List.find (fn (s, _) => r = s) localMap of SOME(_, s) => s | NONE => r fun mapInstr(instr as ResetStackPtr _) = instr | mapInstr(ReturnResultFromFunction{results, returnReg, numStackArgs}) = ReturnResultFromFunction{results=List.map(fn(pr, r) => (mapReg pr, r))results, returnReg=mapReg returnReg, numStackArgs=numStackArgs} | mapInstr(RaiseExceptionPacket{packetReg}) = RaiseExceptionPacket{packetReg=mapReg packetReg} | mapInstr(MoveRegister{source, dest}) = MoveRegister{source=mapReg source, dest=mapReg dest} | mapInstr(LoadNonAddressConstant{source, dest}) = LoadNonAddressConstant{source=source, dest=mapReg dest} | mapInstr(LoadAddressConstant{source, dest}) = LoadAddressConstant{source=source, dest=mapReg dest} | mapInstr(AddSubImmediate{source, dest=ZeroReg, immed, ccRef, isAdd, length}) = AddSubImmediate{source=mapReg source, dest=ZeroReg, immed=immed, ccRef=ccRef, isAdd=isAdd, length=length} | mapInstr _ = raise InternalError "mapInstr: other instruction" fun mapRegNo i = case(mapReg(PReg i)) of PReg r => r (* Map the instructions and the sets although we only use the kill set. 
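   Schematically (illustrative): if the target block has a local PReg 12 of kind
   RegPropGeneral, localMap pairs it with a freshly allocated register,
       (PReg 12, newReg RegPropGeneral)
   so that copying the block into this predecessor never defines the same pseudo-register
   at more than one point.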
*) fun mapCode{instr, current, active, kill} = {instr=mapInstr instr, current=listToSet(map mapRegNo (setToList current)), active=listToSet(map mapRegNo (setToList active)), kill=listToSet(map mapRegNo (setToList kill))} in hasChanged := true; optimiseBlock blkCode(map mapCode targetBlck, targetFlow, targetCC) end else BasicBlock{flow=flow, block=List.rev blkCode} end | (flow, _) => BasicBlock{flow=flow, block=List.rev blkCode} end fun optBlck(ExtendedBasicBlock{block, flow, outCCState, ...}) = optimiseBlock [] (block, flow, outCCState) val resVector = Vector.map optBlck code in if !hasChanged then let val extraRegs = List.rev(! regList) val props = if null extraRegs then pregProps else Vector.concat[pregProps, Vector.fromList extraRegs] in Changed(resVector, props) end else Unchanged end structure Sharing = struct type extendedBasicBlock = extendedBasicBlock and ('genReg, 'optGenReg, 'fpReg) basicBlock = ('genReg, 'optGenReg, 'fpReg) basicBlock and regProperty = regProperty and optimise = optimise and preg = preg and pregOrZero = pregOrZero end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICodeToArm64Code.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICodeToArm64Code.ML index a856cf01..6e9de6c3 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICodeToArm64Code.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICodeToArm64Code.ML @@ -1,1291 +1,1291 @@ (* Copyright David C. J. Matthews 2021-2 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64ICodeToArm64Code( structure Arm64PreAssembly: ARM64PREASSEMBLY structure Debug: DEBUG structure Arm64ICode: ARM64ICODE structure Identify: ARM64IDENTIFYREFERENCES structure IntSet: INTSET structure Pretty: PRETTY structure Strongly: sig val stronglyConnectedComponents: {nodeAddress: 'a -> int, arcs: 'a -> int list } -> 'a list -> 'a list list end sharing Arm64PreAssembly.Sharing = Arm64ICode.Sharing = Identify.Sharing = IntSet ): ARM64ICODEGENERATE = struct open Identify open Arm64ICode open Arm64PreAssembly open Address exception InternalError = Misc.InternalError (* Reversed cons and append to make the code easier to read. *) infix 5 <::> <@> fun tl <::> hd = hd :: tl (*and snd <@> fst = fst @ snd*) (* These aren't currently used for anything. *) val workReg1 = X16 and workReg2 = X17 fun icodeToArm64Code {blocks: basicBlockConcrete vector, functionName, stackRequired, debugSwitches, resultClosure, profileObject, ...} = let val numBlocks = Vector.length blocks (* Load from and store to stack. 
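   Offsets below 4096 words use the scaled-immediate form; anything larger is handled by
   loading the offset into a register and indexing, e.g. (illustrative) for wordOffset = 5000
       LoadNonAddr(destReg, 0w5000)
       LoadRegIndexed{regT=destReg, regN=X_MLStackPtr, regM=destReg,
                      option=ExtUXTX ScaleOrShift, loadType=Load64}
   where ScaleOrShift scales the index by the 8-byte transfer size.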
*) fun loadFromStack(destReg, wordOffset, code) = if wordOffset >= 4096 then (LoadRegIndexed{regT=destReg, regN=X_MLStackPtr, regM=destReg, option=ExtUXTX ScaleOrShift, loadType=Load64}) :: [LoadNonAddr(destReg, Word64.fromInt wordOffset)] @ code else (LoadRegScaled{regT=destReg, regN=X_MLStackPtr, unitOffset=wordOffset, loadType=Load64}) :: code and storeToStack(sourceReg, wordOffset, workReg, code) = if wordOffset >= 4096 then (StoreRegIndexed{regT=sourceReg, regN=X_MLStackPtr, regM=workReg, option=ExtUXTX ScaleOrShift, loadType=Load64}) :: [LoadNonAddr(workReg, Word64.fromInt wordOffset)] @ code else (StoreRegScaled{regT=sourceReg, regN=X_MLStackPtr, unitOffset=wordOffset, loadType=Load64}) :: code datatype srcAndDest = IsInReg of xReg | IsOnStack of int local (* The registers are numbered from 0. Choose values that don't conflict with the stack addresses. *) fun regNo(XReg r) = ~1 - Word8.toInt r | regNo _ = ~1 - 31 type node = {src: srcAndDest, dst: srcAndDest } fun nodeAddress({dst=IsInReg r, ...}: node) = regNo r | nodeAddress({dst=IsOnStack a, ...}) = a fun arcs({src=IsOnStack wordOffset, ...}: node) = [wordOffset] | arcs{src=IsInReg r, ...} = [regNo r] in val stronglyConnected = Strongly.stronglyConnectedComponents { nodeAddress=nodeAddress, arcs=arcs } end (* This is a general function for moving values into registers or to the stack where it is possible that the source values might also be in use as destinations. The stack is used for destinations only for tail recursive calls. *) fun moveMultipleValues(moves, code) = let fun moveValues ([], code) = code (* We're done. *) | moveValues (arguments, code) = let (* stronglyConnectedComponents does two things. It detects loops where it's not possible to move items without breaking the loop but more importantly it orders the dependencies so that if there are no loops we can load the source and store it in the destination knowing that we won't overwrite anything we might later need. *) val ordered = stronglyConnected arguments fun loadIntoReg(IsInReg sReg, dReg, code) = if sReg = dReg then code else (MoveXRegToXReg{sReg=sReg, dReg=dReg}) :: code | loadIntoReg(IsOnStack wordOffset, dReg, code) = loadFromStack(dReg, wordOffset, code) fun moveEachValue ([], code) = code | moveEachValue ([{dst=IsInReg dReg, src}] :: rest, code) = moveEachValue(rest, loadIntoReg(src, dReg, code)) | moveEachValue ([{dst=IsOnStack wordOffset, src=IsInReg sReg}] :: rest, code) = (* Storing into the stack. *) moveEachValue(rest, storeToStack(sReg, wordOffset, workReg1, code)) | moveEachValue ([{dst=IsOnStack dstOffset, src=IsOnStack srcOffset}] :: rest, code) = (* Copy a stack location - needs a load and store unless the address is the same. *) if dstOffset = srcOffset then moveEachValue(rest, code) else moveEachValue(rest, storeToStack(workReg2, dstOffset, workReg1, loadFromStack(workReg2, srcOffset, code))) | moveEachValue((cycle as first :: _ :: _) :: rest, code) = (* We have a cycle. *) let (* We need to exchange some of the arguments. Doing an exchange here will set the destination with the correct source. However we have to process every subsequent entry with the swapped registers. That may well mean that one of those entries becomes trivial. We also need to rerun stronglyConnectedComponents on at least the rest of this cycle. It's easiest to flatten the rest and do everything. *) (* Exchange the source and destination. We don't have an exchange instruction and there's a further complication. 
We could be copying between stack locations and their offsets could be > 4096. Since we've only got two work registers we need to use the hardware stack as an extra location. Stack-stack exchange is very rare so the extra overhead to handle the general case is worth it. *) local fun storeToDest(sReg, IsInReg dReg, _, code) = (MoveXRegToXReg{sReg=sReg, dReg=dReg}) :: code | storeToDest(sReg, IsOnStack wordOffset, work, code) = storeToStack(sReg, wordOffset, work, code) in fun exchange(IsInReg arg1Reg, arg2, code) = (MoveXRegToXReg{sReg=workReg2, dReg=arg1Reg}) :: storeToDest(arg1Reg, arg2, workReg1, loadIntoReg(arg2, workReg2, code)) | exchange(arg1, IsInReg arg2Reg, code) = (MoveXRegToXReg{sReg=workReg2, dReg=arg2Reg}) :: storeToDest(arg2Reg, arg1, workReg1, loadIntoReg(arg1, workReg2, code)) | exchange(arg1, arg2, code) = (* The hardware stack must be 16-byte aligned. *) storeToDest(workReg2, arg2, workReg1, (LoadRegUnscaled{regT=workReg2, regN=XSP, byteOffset=16, loadType=Load64, unscaledType=PostIndex}) :: storeToDest(workReg2, arg1, workReg1, loadIntoReg(arg2, workReg2, (StoreRegUnscaled{regT=workReg2, regN=XSP, byteOffset= ~16, loadType=Load64, unscaledType=PreIndex}) :: loadIntoReg(arg1, workReg2, code)))) end (* Try to find either a register-register move or a register-stack move. If not use the first. If there's a stack-register move there will also be a register-stack so we don't need to look for both. *) val {dst=selectDst, src=selectSrc} = first (* This includes this entry but after the swap we'll eliminate it. *) val flattened = List.foldl(fn (a, b) => a @ b) [] (cycle :: rest) val destAsSource = selectDst fun match(s1: srcAndDest, s2) = s1 = s2 fun swapSources{src, dst} = if match(src, selectSrc) then {src=destAsSource, dst=dst} else if match(src, destAsSource) then {src=selectSrc, dst=dst} else {src=src, dst=dst} val exchangeCode = exchange(selectDst, selectSrc, code) in moveValues(List.map swapSources flattened, exchangeCode) end | moveEachValue(([]) :: _, _) = (* This should not happen - avoid warning. *) raise InternalError "moveEachValue - empty set" in moveEachValue(ordered, code) end in moveValues(moves, code) end (* Where we have multiple specific registers as either source or destination there is the potential that a destination register if currently in use as a source. *) fun moveMultipleRegisters(regPairList, code) = let val regPairsAsDests = List.map(fn {src, dst} => {src=IsInReg src, dst=IsInReg dst}) regPairList in moveMultipleValues(regPairsAsDests, code) end (* Floating point registers can only be moved into other floating point registers but it is possible to have a cycle. *) fun moveMultipleFPRegisters(moves: {dst: vReg, src: vReg} list, code) = let local fun regNo(VReg r) = Word8.toInt r type node = {src: vReg, dst: vReg } fun nodeAddress({dst=r, ...}: node) = regNo r fun arcs{src=r, ...} = [regNo r] in val stronglyConnected = Strongly.stronglyConnectedComponents { nodeAddress=nodeAddress, arcs=arcs } end fun moveValues ([], code) = code (* We're done. *) | moveValues (arguments, code) = let val ordered = stronglyConnected arguments fun moveEachValue ([], code) = code | moveEachValue ([{dst, src}] :: rest, code) = moveEachValue(rest, - if src = dst then code else (MoveFPToFP{regN=src, regD=dst, floatSize=Double64}) :: code) + if src = dst then code else (FPUnaryOp{regN=src, regD=dst, fpOp=MoveDouble}) :: code) | moveEachValue((cycle as first :: _ :: _) :: rest, code) = (* We have a cycle. 
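   For instance (an illustrative sketch, not part of the original source): the pair of moves
   {src=V0, dst=V1} and {src=V1, dst=V0} is broken by spilling one register to the hardware
   stack,
       StoreFPRegUnscaled{regT=V1, regN=XSP, byteOffset= ~16, floatSize=Double64, unscaledType=PreIndex}
       FPUnaryOp{regN=V0, regD=V1, fpOp=MoveDouble}
       LoadFPRegUnscaled{regT=V0, regN=XSP, byteOffset=16, floatSize=Double64, unscaledType=PostIndex}
   after which the remaining move has become trivial and generates no code.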
*) let val {dst=selectDst: vReg, src=selectSrc: vReg} = first (* This includes this entry but after the swap we'll eliminate it. *) val flattened = List.foldl(fn (a, b) => a @ b) [] (cycle :: rest) val destAsSource = selectDst fun swapSources{src, dst} = if src=selectSrc then {src=destAsSource, dst=dst} else if src=destAsSource then {src=selectSrc, dst=dst} else {src=src, dst=dst} (* Exchange the values of two floating point registers. There are various ways to do this. For the moment just use the hardware stack. *) val exchangeCode = code <::> StoreFPRegUnscaled{regT=selectDst, regN=XSP, byteOffset= ~16, floatSize=Double64, unscaledType=PreIndex} <::> - MoveFPToFP{regN=selectSrc, regD=selectDst, floatSize=Double64} <::> + FPUnaryOp{regN=selectSrc, regD=selectDst, fpOp=MoveDouble} <::> LoadFPRegUnscaled{regT=selectSrc, regN=XSP, byteOffset=16, floatSize=Double64, unscaledType=PostIndex} in moveValues(List.map swapSources flattened, exchangeCode) end | moveEachValue(([]) :: _, _) = (* This should not happen - avoid warning. *) raise InternalError "moveEachValue - empty set" in moveEachValue(ordered, code) end in moveValues(moves, code) end fun moveIfNecessary({src, dst}, code) = if src = dst then code else MoveXRegToXReg{sReg=src, dReg=dst} :: code (* Add a constant word to the source register and put the result in the destination. regW is used as a work register if necessary. This is used both for addition and subtraction. *) fun addConstantWord({regS, regD, value=0w0, ...}, code) = if regS = regD then code else MoveXRegToXReg{sReg=regS, dReg=regD} :: code | addConstantWord({regS, regD, regW, value}, code) = let (* If we have to load the constant it's better if the top 32-bits are zero if possible. *) val (isSub, unsigned) = if value > Word64.<<(0w1, 0w63) then (true, ~ value) else (false, value) in if unsigned < Word64.<<(0w1, 0w24) then (* We can put up to 24 in a shifted and an unshifted constant. *) let val w = Word.fromLarge(Word64.toLarge unsigned) val high = Word.andb(Word.>>(w, 0w12), 0wxfff) val low = Word.andb(w, 0wxfff) val addSub = if isSub then SubImmediate else AddImmediate in if high <> 0w0 then ( (if low <> 0w0 then [addSub{regN=regD, regD=regD, immed=low, shifted=false, setFlags=false, opSize=OpSize64}] else []) @ addSub{regN=regS, regD=regD, immed=high, shifted=true, setFlags=false, opSize=OpSize64} :: code ) else addSub{regN=regS, regD=regD, immed=low, shifted=false, setFlags=false, opSize=OpSize64} :: code end else let (* To minimise the constant and increase the chances that it will fit in a single word look to see if we can shift it. 
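   For example (an illustrative sketch, not part of the original source): adding
   0wx300000000 (3 << 32) strips the trailing zero bits, getShift below returns (0w3, 0w32),
   and the generated code is
       LoadNonAddr(regW, 0w3)
       AddShiftedReg{regM=regW, regN=regS, regD=regD, shift=ShiftLSL 0w32, setFlags=false, opSize=OpSize64}
   rather than materialising the full 64-bit constant first.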
*) fun getShift(value, shift) = if Word64.andb(value, 0w1) = 0w0 then getShift(Word64.>>(value, 0w1), shift+0w1) else (value, shift) val (shifted, shift) = getShift(unsigned, 0w0) in code <::> LoadNonAddr(regW, shifted) <::> (if isSub then SubShiftedReg else AddShiftedReg) {regM=regW, regN=regS, regD=regD, shift=ShiftLSL shift, setFlags=false, opSize=OpSize64} end end val labelMaker = createLabelMaker() val startOfFunctionLabel = createLabel labelMaker (* Used for recursive calls/jumps *) val blockToLabelMap = Vector.tabulate(numBlocks, fn _ => createLabel labelMaker) fun getBlockLabel blockNo = Vector.sub(blockToLabelMap, blockNo) fun codeExtended _ (MoveRegister{source, dest, ...}, code) = moveIfNecessary({src=source, dst=dest}, code) | codeExtended _ (LoadNonAddressConstant{source, dest}, code) = code <::> LoadNonAddr(dest, source) | codeExtended _ (LoadFPConstant{source, dest, floatSize}, code) = code <::> LoadFPConst{dest=dest, value=source, floatSize=floatSize, work=workReg1} | codeExtended _ (LoadAddressConstant{source, dest, ...}, code) = code <::> LoadAddr(dest, source) | codeExtended _ (LoadWithConstantOffset{dest, base, byteOffset, loadType, ...}, code) = if byteOffset < 0 then LoadRegUnscaled{regT=dest, regN=base, byteOffset=byteOffset, loadType=loadType, unscaledType=NoUpdate} :: code else let val unitOffset = case loadType of Load64 => Int.quot(byteOffset, 8) | Load32 => Int.quot(byteOffset, 4) | Load16 => Int.quot(byteOffset, 2) | Load8 => byteOffset in LoadRegScaled{regT=dest, regN=base, unitOffset=unitOffset, loadType=loadType} :: code end | codeExtended _ (LoadFPWithConstantOffset{dest, base, byteOffset, floatSize, ...}, code) = if byteOffset < 0 then (LoadFPRegUnscaled{regT=dest, regN=base, byteOffset=byteOffset, floatSize=floatSize, unscaledType=NoUpdate}) :: code else let val unitOffset = Int.quot(byteOffset, case floatSize of Float32 => 4 | Double64 => 8) in (LoadFPRegScaled{regT=dest, regN=base, unitOffset=unitOffset, floatSize=floatSize}) :: code end | codeExtended _ (LoadWithIndexedOffset{dest, base, index, loadType, signExtendIndex, ...}, code) = let val baseReg = base and indexReg = index (* In 32-in-64 the index is a 32-bit value in the low order 32-bits. It may be signed if it is a C address. *) val scaleType = if is32in64 andalso signExtendIndex then ExtSXTW else ExtUXTX val scale = case loadType of Load8 => NoScale | _ => ScaleOrShift in (LoadRegIndexed{regT=dest, regN=baseReg, regM=indexReg, option=scaleType scale, loadType=loadType}) :: code end | codeExtended _ (LoadFPWithIndexedOffset{dest, base, index, floatSize, signExtendIndex, ...}, code) = let val baseReg = base and indexReg = index (* In 32-in-64 the index is a 32-bit value in the low order 32-bits. It may be signed if it is a C address. *) val scaleType = if is32in64 andalso signExtendIndex then ExtSXTW else ExtUXTX in (LoadFPRegIndexed{regT=dest, regN=baseReg, regM=indexReg, option=scaleType ScaleOrShift, floatSize=floatSize}) :: code end | codeExtended _ (GetThreadId { dest}, code) = (* Load the thread id. This is always a 64-bit value. 
*) (LoadRegScaled{regT=dest, regN=X_MLAssemblyInt, unitOffset=threadIdOffset, loadType=Load64}) :: code | codeExtended _ (ObjectIndexAddressToAbsolute{source, dest, ...}, code) = (AddShiftedReg{regM=source, regN=X_Base32in64, regD=dest, shift=ShiftLSL 0w2, setFlags=false, opSize=OpSize64}) :: code | codeExtended _ (AbsoluteToObjectIndex{source, dest, ...}, code) = let val destReg = dest in code <::> (SubShiftedReg{regM=X_Base32in64, regN=source, regD=destReg, shift=ShiftNone, setFlags=false, opSize=OpSize64}) <::> (shiftConstant{shift=0w2, regN=destReg, regD=destReg, direction=ShiftRightLogical, opSize=OpSize64}) end | codeExtended _ (AllocateMemoryFixed{ bytesRequired, dest, saveRegs }, code) = code <::> AllocateMemoryFixedSize{ bytes=Word.fromLarge bytesRequired, dest=dest, save=saveRegs, work=workReg1 } | codeExtended _ (AllocateMemoryVariable{ size, dest, saveRegs }, code) = code <::> AllocateMemoryVariableSize{ sizeReg=size, dest=dest, save=saveRegs, work=workReg1 } | codeExtended _ (InitialiseMem{ size, addr, init}, code) = let val sizeReg = size and addrReg = addr and initReg = init val exitLabel = createLabel labelMaker and loopLabel = createLabel labelMaker (* This uses a loop to initialise. It's possible the size is zero so we have to check at the top of the loop. *) val (bShift, offset, loadType) = if is32in64 then (0w2, ~4, Load32) else (0w3, ~8, Load64) in code <::> (* Add the length in bytes so we point at the end. *) AddShiftedReg{regM=sizeReg, regN=addrReg, regD=workReg1, shift=ShiftLSL bShift, setFlags=false, opSize=OpSize64} <::> SetLabel loopLabel <::> (* Are we at the start? *) SubShiftedReg{regM=workReg1, regN=addrReg, regD=XZero, shift=ShiftNone, setFlags=true, opSize=OpSize64} <::> ConditionalBranch(CondEqual, exitLabel) <::> StoreRegUnscaled{regT=initReg, regN=workReg1, byteOffset=offset, loadType=loadType, unscaledType=PreIndex } <::> UnconditionalBranch loopLabel <::> SetLabel exitLabel end | codeExtended _ (BeginLoop, code) = code | codeExtended _ (JumpLoop{regArgs, stackArgs, checkInterrupt}, code) = let (* TODO: We could have a single list and use ArgOnStack and ArgInReg to distinguish. *) fun convertArg(ArgOnStack{wordOffset, ...}) = IsOnStack wordOffset | convertArg(ArgInReg reg) = IsInReg(reg) val extStackArgs = map (fn {wordOffset, src, ...} => {src=convertArg src, dst=IsOnStack wordOffset}) stackArgs val extRegArgs = map (fn {dst, src} => {src=convertArg src, dst=convertArg(ArgInReg dst)}) regArgs val code2 = moveMultipleValues(extStackArgs @ extRegArgs, code) in case checkInterrupt of NONE => code2 | SOME saveRegs => let val skipCheck = createLabel labelMaker in code2 <::> (* Put in stack-check code to allow this to be interrupted. 
*) LoadRegScaled{regT=workReg1, regN=X_MLAssemblyInt, unitOffset=stackLimitOffset, loadType=Load64} <::> SubShiftedReg{regM=workReg1, regN=X_MLStackPtr, regD=XZero, shift=ShiftNone, setFlags=true, opSize=OpSize64} <::> ConditionalBranch(CondCarrySet, skipCheck) <::> RTSTrap{rtsEntry=stackOverflowCallOffset, work=workReg1, save=saveRegs} <::> SetLabel skipCheck end end | codeExtended _ (StoreWithConstantOffset{source, base, byteOffset, loadType, ...}, code) = if byteOffset < 0 then (StoreRegUnscaled{regT=source, regN=base, byteOffset=byteOffset, loadType=loadType, unscaledType=NoUpdate}) :: code else let val unitOffset = case loadType of Load64 => Int.quot(byteOffset, 8) | Load32 => Int.quot(byteOffset, 4) | Load16 => Int.quot(byteOffset, 2) | Load8 => byteOffset in (StoreRegScaled{regT=source, regN=base, unitOffset=unitOffset, loadType=loadType}) :: code end | codeExtended _ (StoreFPWithConstantOffset{source, base, byteOffset, floatSize, ...}, code) = if byteOffset < 0 then (StoreFPRegUnscaled{regT=source, regN=base, byteOffset=byteOffset, floatSize=floatSize, unscaledType=NoUpdate}) :: code else let val unitOffset = Int.quot(byteOffset, case floatSize of Float32 => 4 | Double64 => 8) in (StoreFPRegScaled{regT=source, regN=base, unitOffset=unitOffset, floatSize=floatSize}) :: code end | codeExtended _ (StoreWithIndexedOffset{source, base, index, loadType, signExtendIndex, ...}, code) = let val baseReg = base and indexReg = index (* In 32-in-64 the index is a 32-bit value in the low order 32-bits. It may be signed if it is a C address. *) val scaleType = if is32in64 andalso signExtendIndex then ExtSXTW else ExtUXTX val scale = case loadType of Load8 => NoScale | _ => ScaleOrShift in (StoreRegIndexed{regT=source, regN=baseReg, regM=indexReg, option=scaleType scale, loadType=loadType}) :: code end | codeExtended _ (StoreFPWithIndexedOffset{source, base, index, floatSize, signExtendIndex, ...}, code) = let val baseReg = base and indexReg = index (* In 32-in-64 the index is a 32-bit value in the low order 32-bits. It may be signed if it is a C address. *) val scaleType = if is32in64 andalso signExtendIndex then ExtSXTW else ExtUXTX in (StoreFPRegIndexed{regT=source, regN=baseReg, regM=indexReg, option=scaleType ScaleOrShift, floatSize=floatSize}) :: code end | codeExtended _ (AddSubImmediate{ source, dest, immed, isAdd, length, ccRef}, code) = let val destReg = dest in ((if isAdd then AddImmediate else SubImmediate) {regN=source, regD=destReg, immed=immed, shifted=false, opSize=length, setFlags=isSome ccRef}) :: code end | codeExtended _ (AddSubRegister{ base, shifted, dest, isAdd, length, ccRef, shift}, code) = let val destReg = dest in ( (if isAdd then AddShiftedReg else SubShiftedReg) {regN=base, regM=shifted, regD=destReg, shift=shift, opSize=length, setFlags=isSome ccRef}) :: code end | codeExtended _ (LogicalImmediate{ source, dest, immed, logOp, length, ccRef}, code) = let val destReg = dest in (BitwiseLogical{regN=source, regD=destReg, bits=immed, opSize=length, logOp=logOp, setFlags=isSome ccRef}) :: code end | codeExtended _ (LogicalRegister{ base, shifted, dest, logOp, length, ccRef, shift}, code) = let (* There are also versions of AND/OR/XOR which operate on a complement (NOT) of the shifted register. It's probably not worth looking for a use for them. 
*) val destReg = dest in (LogicalShiftedReg{regN=base, regM=shifted, regD=destReg, shift=shift, opSize=length, logOp=logOp, setFlags=isSome ccRef}) :: code end | codeExtended _ (ShiftRegister{ direction, dest, source, shift, opSize }, code) = (ShiftRegisterVariable{regN=source, regM=shift, regD=dest, shiftDirection=direction, opSize=opSize}) :: code | codeExtended _ (Multiplication{ kind, dest, sourceA, sourceM, sourceN }, code) = let val destReg = dest and srcAReg = sourceA and srcNReg = sourceN and srcMReg = sourceM in (MultiplyAndAddSub{regM=srcMReg, regN=srcNReg, regA=srcAReg, regD=destReg, multKind=kind}) :: code end | codeExtended _ (Division{ isSigned, dest, dividend, divisor, opSize }, code) = (DivideRegs{regN=dividend, regM=divisor, regD=dest, isSigned=isSigned, opSize=opSize}) :: code | codeExtended _ (BeginFunction{regArgs, fpRegArgs, ...}, code) = let val skipCheck = createLabel labelMaker val defaultWords = 10 (* This is wired into the RTS. *) val workRegister = workReg1 val debugTrapAlways = false (* Can be set to true for debugging *) (* Test with either the stack-pointer or a high-water value. The RTS assumes that X9 has been used as the high-water if it is called through stackOverflowXCallOffset rather than stackOverflowCallOffset *) val (testReg, entryPt, code1) = if stackRequired <= defaultWords then (X_MLStackPtr, stackOverflowCallOffset, code) else (X9, stackOverflowXCallOffset, addConstantWord({regS=X_MLStackPtr, regD=X9, regW=workRegister, value= ~ (Word64.fromLarge(Word.toLarge nativeWordSize)) * Word64.fromInt stackRequired}, code)) (* Skip the RTS call if there is enough stack. N.B. The RTS can modify the end-of-stack value to force a trap here even if there is really enough stack. *) val code2 = (if debugTrapAlways then [] else [ConditionalBranch(CondCarrySet, skipCheck), SubShiftedReg{regM=workRegister, regN=testReg, regD=XZero, shift=ShiftNone, setFlags=true, opSize=OpSize64}]) @ (* Load the end-of-stack value. *) LoadRegScaled{regT=workRegister, regN=X_MLAssemblyInt, unitOffset=stackLimitOffset, loadType=Load64} :: code1 val code3 = code2 <::> RTSTrap{rtsEntry=entryPt, work=workReg1, save=List.map #2 regArgs} <::> SetLabel skipCheck fun mkPair(pr, rr) = {src=rr,dst=pr} val regPairs = List.map mkPair regArgs val fpRegPairs = List.map mkPair fpRegArgs in moveMultipleFPRegisters(fpRegPairs, moveMultipleRegisters(regPairs, code3)) end | codeExtended _ (TailRecursiveCall{callKind, regArgs, stackArgs, fpRegArgs, stackAdjust, currStackSize}, code) = let fun convertArg(ArgOnStack{wordOffset, ...}) = IsOnStack wordOffset | convertArg(ArgInReg reg) = IsInReg(reg) val extStackArgs = map (fn {stack, src} => {dst=IsOnStack(stack+currStackSize), src=convertArg src}) stackArgs val extRegArgs = map (fn (a, r) => {src=convertArg a, dst=IsInReg r}) regArgs val extFPArgs = map (fn (a, r) => {src=a, dst=r}) fpRegArgs (* Tail recursive calls are complicated because we generally have to overwrite the existing stack. That means storing the arguments in the right order to avoid overwriting a value that we are using for a different argument. *) fun codeTailCall(arguments, stackAdjust, code) = if stackAdjust < 0 then let (* If the function we're calling takes more arguments on the stack than the current function we will have to extend the stack. Do that by pushing the argument whose offset is at -1. Then adjust all the offsets and repeat. 
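   A worked example, purely for illustration: with stackAdjust = ~2 and destinations
   at offsets ~1 and ~2, the first pass pushes the source for the ~1 destination and
   renumbers the remaining ~2 destination to ~1, also moving any stack-resident
   sources up by one; the second pass pushes that argument, stackAdjust reaches 0,
   and the remaining register and stack arguments are then moved in place.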
*) val {src=argM1, ...} = valOf(List.find(fn {dst=IsOnStack ~1, ...} => true | _ => false) arguments) fun renumberArgs [] = [] | renumberArgs ({dst=IsOnStack ~1, ...} :: args) = renumberArgs args (* Remove the one we've done. *) | renumberArgs ({dst, src} :: args) = let val newDest = case dst of IsOnStack d => IsOnStack(d+1) | regDest => regDest val newSrc = case src of IsOnStack wordOffset => IsOnStack(wordOffset+1) | other => other in {dst=newDest, src=newSrc} :: renumberArgs args end val pushCode = case argM1 of IsOnStack wordOffset => (StoreRegUnscaled{regT=workReg2, regN=X_MLStackPtr, byteOffset= ~8, unscaledType=PreIndex, loadType=Load64}) :: loadFromStack(workReg2, wordOffset, code) | IsInReg reg => (StoreRegUnscaled{regT=reg, regN=X_MLStackPtr, byteOffset= ~8, unscaledType=PreIndex, loadType=Load64}) :: code in codeTailCall(renumberArgs arguments, stackAdjust+1, pushCode) end else let val loadArgs = moveMultipleFPRegisters(extFPArgs, moveMultipleValues(arguments, code)) in if stackAdjust = 0 then loadArgs else addConstantWord({regS=X_MLStackPtr, regD=X_MLStackPtr, regW=workReg1, value=Word64.fromInt stackAdjust * Word.toLarge nativeWordSize}, loadArgs) end val setArgumentsCode = codeTailCall(extStackArgs @ extRegArgs, stackAdjust+currStackSize, code) val jumpToFunctionCode = case callKind of Recursive => [(UnconditionalBranch startOfFunctionLabel)] | ConstantCode m => if is32in64 then raise InternalError "ConstantCode" (* Not currently *) else [(BranchReg{regD=workReg1, brRegType=BRRBranch}), (LoadAddr(workReg1, m))] | FullCall => if is32in64 then [BranchReg{regD=workReg1, brRegType=BRRBranch}, LoadRegScaled{regT=workReg1, regN=workReg1, unitOffset=0, loadType=Load64}, AddShiftedReg{regM=X8, regN=X_Base32in64, regD=workReg1, shift=ShiftLSL 0w2, setFlags=false, opSize=OpSize64}] else [BranchReg{regD=workReg1, brRegType=BRRBranch}, LoadRegScaled{regT=workReg1, regN=X8, unitOffset=0, loadType=Load64}] in jumpToFunctionCode @ setArgumentsCode end | codeExtended _ (FunctionCall{callKind, regArgs, stackArgs, dests, fpRegArgs, fpDests, saveRegs, ...}, code) = let local fun pushStackArgs ([], _, code) = code | pushStackArgs (ArgOnStack {wordOffset, ...} ::args, argNum, code) = let (* Have to adjust the offsets of stack arguments. *) val adjustedOffset = wordOffset+argNum in pushStackArgs(args, argNum+1, loadFromStack(workReg1, adjustedOffset, code) <::> StoreRegUnscaled{regT=workReg1, regN=X_MLStackPtr, byteOffset= ~8, unscaledType=PreIndex, loadType=Load64}) end | pushStackArgs (ArgInReg reg ::args, argNum, code) = pushStackArgs(args, argNum+1, code <::> (StoreRegUnscaled{regT=reg, regN=X_MLStackPtr, byteOffset= ~8, unscaledType=PreIndex, loadType=Load64})) val pushedArgs = pushStackArgs(stackArgs, 0, code (* Initial code *)) (* We have to adjust any stack offset to account for the arguments we've pushed. *) val numStackArgs = List.length stackArgs fun convertArg(ArgOnStack{wordOffset, ...}) = IsOnStack(wordOffset+numStackArgs) | convertArg(ArgInReg reg) = IsInReg(reg) in val extRegArgs = map (fn (a, r) => {src=convertArg a, dst=IsInReg r}) regArgs val extFPArgs = map (fn (a, r) => {src=a, dst=r}) fpRegArgs val loadArgs = moveMultipleFPRegisters(extFPArgs, moveMultipleValues(extRegArgs, pushedArgs)) end (* Push the registers before the call and pop them afterwards. 
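   For illustration only, using made-up register names: with saveRegs = [r1, r2]
   the sequence produced by makeSavesAndCall below is
       push r1;  push r2;  call;  pop r2;  pop r1
   so the restores mirror the saves in reverse order, using pre-indexed stores and
   post-indexed loads on the ML stack pointer.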
*) fun makeSavesAndCall([], code) = ( case callKind of Recursive => code <::> (BranchAndLink startOfFunctionLabel) | ConstantCode m => if is32in64 then raise InternalError "ConstantCode" (* Not currently *) else code <::> (LoadAddr(workReg1, m)) <::> (BranchReg{regD=workReg1, brRegType=BRRAndLink}) | FullCall => if is32in64 then code <::> AddShiftedReg{regM=X8, regN=X_Base32in64, regD=workReg1, shift=ShiftLSL 0w2, setFlags=false, opSize=OpSize64} <::> LoadRegScaled{regT=workReg1, regN=workReg1, unitOffset=0, loadType=Load64} <::> BranchReg{regD=workReg1, brRegType=BRRAndLink} else code <::> LoadRegScaled{regT=workReg1, regN=X8, unitOffset=0, loadType=Load64} <::> BranchReg{regD=workReg1, brRegType=BRRAndLink} ) | makeSavesAndCall(reg::regs, code) = let val areg = reg in makeSavesAndCall(regs, code <::> StoreRegUnscaled{regT=areg, regN=X_MLStackPtr, byteOffset= ~8, loadType=Load64, unscaledType=PreIndex}) <::> LoadRegUnscaled{regT=areg, regN=X_MLStackPtr, byteOffset= 8, loadType=Load64, unscaledType=PostIndex} end (* Results. These go from the specific result register into the allocated register. *) val resultPairs = List.map(fn (pr, rr) => {src=rr,dst=pr}) dests val fpResultPairs = List.map(fn (pr, rr) => {src=rr,dst=pr}) fpDests in moveMultipleFPRegisters(fpResultPairs, moveMultipleRegisters(resultPairs, makeSavesAndCall(saveRegs, loadArgs))) end | codeExtended _ (ReturnResultFromFunction { results, returnReg, numStackArgs }, code) = let fun resetStack(0, code) = code | resetStack(nItems, code) = addConstantWord({regS=X_MLStackPtr, regD=X_MLStackPtr, regW=X3, value=Word64.fromLarge(Word.toLarge nativeWordSize) * Word64.fromInt nItems}, code) (* Return results. This goes from the allocated register into the specific register rr. *) val resultPairs = List.map(fn (pr, rr) => {src=pr,dst=rr}) results in BranchReg{regD=returnReg, brRegType=BRRReturn} :: resetStack(numStackArgs, moveMultipleRegisters(resultPairs, code)) end | codeExtended _ (RaiseExceptionPacket{ packetReg }, code) = (* We need a work register here. It can be any register other than X0 since we don't preserve registers across calls. *) (* Copy the handler "register" into the stack pointer. Then jump to the address in the first word. The second word is the next handler. This is set up in the handler. We have a lot more raises than handlers since most raises are exceptional conditions such as overflow so it makes sense to minimise the code in each raise. *) moveIfNecessary({src=packetReg, dst=X0}, code) <::> LoadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset, loadType=Load64} <::> LoadRegScaled{regT=workReg1, regN=X_MLStackPtr, unitOffset=0, loadType=Load64} <::> BranchReg{regD=workReg1, brRegType=BRRBranch } | codeExtended _ (PushToStack{ source, copies, ... }, code) = let val reg = source val _ = copies > 0 orelse raise InternalError "PushToStack: copies<1" fun pushn(0, c) = c | pushn(n, c) = pushn(n-1, (StoreRegUnscaled{regT=reg, regN=X_MLStackPtr, byteOffset= ~8, loadType=Load64, unscaledType=PreIndex}) :: c) in pushn(copies, code) end | codeExtended _ (LoadStack{ dest, wordOffset, ... }, code) = loadFromStack(dest, wordOffset, code) | codeExtended _ (StoreToStack{ source, stackOffset, ... }, code) = (* Store into the stack to set a field of a container. Always 64-bits. *) storeToStack(source, stackOffset, workReg1, code) | codeExtended _ (ContainerAddress{ dest, stackOffset, ... }, code) = (* Set the register to an offset in the stack. 
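   An added note, not in the original: the immediate form of ADD only accepts a
   12-bit (0..4095) byte offset, which is why larger container offsets are first
   loaded into the destination register with LoadNonAddr and then added to
   X_MLStackPtr as a register operand, while smaller offsets use a single
   AddImmediate.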
*) let val _ = stackOffset >= 0 orelse raise InternalError "codeGenICode: ContainerAddress - negative offset" val byteOffset = stackOffset * Word.toInt nativeWordSize in if byteOffset >= 4096 then code <::> LoadNonAddr(dest, Word64.fromInt byteOffset) <::> AddShiftedReg{regN=X_MLStackPtr, regM=dest, regD=dest, shift=ShiftNone, setFlags=false, opSize=OpSize64} else code <::> AddImmediate{regN=X_MLStackPtr, regD=dest, immed=Word.fromInt byteOffset, shifted=false, setFlags=false, opSize=OpSize64} end | codeExtended _ (ResetStackPtr{ numWords, ... }, code) = addConstantWord({regS=X_MLStackPtr, regD=X_MLStackPtr, regW=workReg1, value=Word64.fromInt numWords * Word.toLarge nativeWordSize}, code) | codeExtended _ (TagValue{ source, dest, isSigned=_, opSize }, code) = (* Shift left by one bit and add one. *) code <::> shiftConstant{ direction=ShiftLeft, regD=dest, regN=source, shift=0w1, opSize=opSize } <::> BitwiseLogical{ bits=0w1, regN=dest, regD=dest, opSize=opSize, setFlags=false, logOp=LogOr} | codeExtended _ (UntagValue{ source, dest, isSigned, opSize }, code) = code <::> shiftConstant{ direction=if isSigned then ShiftRightArithmetic else ShiftRightLogical, regD=dest, regN=source, shift=0w1, opSize=opSize } | codeExtended _ (BoxLarge{ source, dest, saveRegs }, code) = boxSysWord({source=source, destination=dest, workReg=workReg1, saveRegs=saveRegs}, code) | codeExtended _ (UnboxLarge{ source, dest }, code) = let (* Unbox a large word. The argument is a poly word. *) val destReg = dest and srcReg = source in if is32in64 then LoadRegScaled{regT=destReg, regN=destReg, unitOffset=0, loadType=Load64} :: AddShiftedReg{regM=srcReg, regN=X_Base32in64, regD=destReg, shift=ShiftLSL 0w2, setFlags=false, opSize=OpSize64} :: code else LoadRegScaled{regT=destReg, regN=srcReg, unitOffset=0, loadType=Load64} :: code end | codeExtended _ (BoxTagFloat{ floatSize=Double64, source, dest, saveRegs }, code) = boxDouble({source=source, destination=dest, workReg=workReg1, saveRegs=saveRegs}, code) | codeExtended _ (BoxTagFloat{ floatSize=Float32, source, dest, saveRegs }, code) = let val floatReg = source and fixedReg = dest in if is32in64 then boxFloat({source=floatReg, destination=fixedReg, workReg=workReg1, saveRegs=saveRegs}, code) else code <::> MoveFPToGeneral{regN=floatReg, regD=fixedReg, floatSize=Float32} <::> shiftConstant{ direction=ShiftLeft, shift=0w32, regN=fixedReg, regD=fixedReg, opSize=OpSize64} <::> BitwiseLogical{ bits=0w1, regN=fixedReg, regD=fixedReg, opSize=OpSize64, setFlags=false, logOp=LogOr} end | codeExtended _ (UnboxTagFloat { floatSize=Double64, source, dest }, code) = if is32in64 then code <::> AddShiftedReg{regM=source, regN=X_Base32in64, regD=workReg1, shift=ShiftLSL 0w2, setFlags=false, opSize=OpSize64} <::> LoadFPRegScaled{regT=dest, regN=workReg1, unitOffset=0, floatSize=Double64} else code <::> LoadFPRegScaled{regT=dest, regN=source, unitOffset=0, floatSize=Double64} | codeExtended _ (UnboxTagFloat { floatSize=Float32, source, dest }, code) = (* This is tagged in native 64-bits. In 32-in-64 we're loading 32-bits so we can use an indexed load directly. 
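   Purely as an illustration of the native 64-bit layout, with hypothetical helper
   names: the 32 float bits live in the top half of the tagged word with the tag
   bit at the bottom, so boxing and unboxing correspond to

       fun tagFloat32 (bits: Word64.word) = Word64.orb(Word64.<<(bits, 0w32), 0w1)
       fun untagFloat32 (w: Word64.word) = Word64.>>(w, 0w32)

   which matches the shift-by-32 sequences generated for BoxTagFloat and
   UnboxTagFloat in the non-32-in-64 case.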
*) if is32in64 then code <::> LoadFPRegIndexed{regN=X_Base32in64, regM=source, regT=dest, option=ExtUXTX ScaleOrShift, floatSize=Float32} else code <::> shiftConstant{direction=ShiftRightLogical, shift=0w32, regN=source, regD=workReg1, opSize=OpSize64} <::> MoveGeneralToFP{regN=workReg1, regD=dest, floatSize=Float32} | codeExtended _ (LoadAcquire{dest, base, loadType, ...}, code) = LoadAcquireReg{regT=dest, regN=base, loadType=loadType} :: code | codeExtended _ (StoreRelease{source, base, loadType, ...}, code) = StoreReleaseReg{regT=source, regN=base, loadType=loadType} :: code | codeExtended _ (BitFieldShift{ source, dest, isSigned, length, immr, imms }, code) = BitField{immr=immr, imms=imms, regN=source, regD=dest, bitfieldKind=if isSigned then BFSigned else BFUnsigned, opSize=length} :: code | codeExtended _ (BitFieldInsert{ source, destAsSource, dest, length, immr, imms }, code) = let (* If we're using BitFieldMove we retain some of the bits of the destination. The higher levels require us to treat that as a source. *) val _ = source = dest andalso raise InternalError "codeExtended: bitfield: dest=source" in BitField{immr=immr, imms=imms, regN=source, regD=dest, bitfieldKind=BFInsert, opSize=length} :: moveIfNecessary({src=destAsSource, dst=dest}, code) end | codeExtended {flow} (IndexedCaseOperation{testReg}, code) = let (* testReg contains the original value after the lowest value has been subtracted. Since both the original value and the lowest value were tagged it contains a shifted but untagged value. *) (* This should only be within a block with an IndexedBr flow type. *) val cases = case flow of IndexedBr cases => cases | _ => raise InternalError "codeGenICode: IndexedCaseOperation" val caseLabels = map getBlockLabel cases val tableLabel = createLabel labelMaker in code <::> LoadLabelAddress(workReg1, tableLabel) <::> (* Add the value shifted by one since it's already shifted. *) AddShiftedReg{regN=workReg1, regD=workReg1, regM=testReg, shift=ShiftLSL 0w1, setFlags=false, opSize=OpSize64} <::> BranchReg{regD=workReg1, brRegType=BRRBranch} <::> BranchTable{ startLabel=tableLabel, brTable=caseLabels } end | codeExtended {flow} (PushExceptionHandler, code) = let (* This should only be within a block with a SetHandler flow type. *) val handleLabel = case flow of SetHandler{ handler, ...} => handler | _ => raise InternalError "codeGenICode: PushExceptionHandler" val labelRef = getBlockLabel handleLabel in (* Push the old handler and the handler entry point and set the "current handler" to point to the stack after we've pushed these. *) code <::> LoadRegScaled{regT=workReg1, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset, loadType=Load64} <::> LoadLabelAddress(workReg2, labelRef) <::> StoreRegPair{regT1=workReg2, regT2=workReg1, regN=X_MLStackPtr, unitOffset= ~2, unscaledType=PreIndex, loadType=Load64} <::> StoreRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset, loadType=Load64} end | codeExtended _ (PopExceptionHandler, code) = (* Remove and discard the handler we've set up. Pop the previous handler and put into "current handler". 
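   For illustration: a handler frame is two native words on the ML stack,
       sp + 0 : handler entry point
       sp + 8 : previous value of the current-handler slot
   and while it is active the current-handler slot in the thread data points at
   this frame.  Popping therefore discards the entry point and writes the saved
   previous value back into that slot.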
*) code <::> LoadRegPair{regT1=XZero, regT2=workReg2, regN=X_MLStackPtr, unitOffset=2, unscaledType=PostIndex, loadType=Load64} <::> StoreRegScaled{regT=workReg2, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset, loadType=Load64} | codeExtended _ (BeginHandler{packetReg}, code) = let val beginHandleCode = code <::> (* The exception raise code resets the stack pointer to the value in the exception handler so this is probably redundant. Leave it for the moment, *) LoadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset, loadType=Load64} <::> LoadRegPair{regT1=XZero, regT2=workReg2, regN=X_MLStackPtr, unitOffset=2, unscaledType=PostIndex, loadType=Load64} <::> StoreRegScaled{regT=workReg2, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset, loadType=Load64} in moveIfNecessary({src=X0, dst=packetReg }, beginHandleCode) end | codeExtended _ (CompareByteVectors{vec1Addr, vec2Addr, length, ...}, code) = let (* Construct a loop to compare two vectors of bytes. *) val vec1Reg = vec1Addr and vec2Reg = vec2Addr and lenReg = length val loopLabel = createLabel labelMaker and exitLabel = createLabel labelMaker in code <::> (* Set the CC to Equal before we start in case length = 0 *) SubShiftedReg{regM=lenReg, regN=lenReg, regD=XZero, shift=ShiftNone, setFlags=true, opSize=OpSize64} <::> SetLabel loopLabel <::> (* Start of loop *) CompareBranch{ test=lenReg, label=exitLabel, onZero=true, opSize=OpSize64} <::> (* Go to the end when len = zero *) (* Load the bytes for the comparison and increment each. *) LoadRegUnscaled{regT=workReg1, regN=vec1Reg, byteOffset=1, unscaledType=PostIndex, loadType=Load8} <::> LoadRegUnscaled{regT=workReg2, regN=vec2Reg, byteOffset=1, unscaledType=PostIndex, loadType=Load8} <::> SubImmediate{regN=lenReg, regD=lenReg, immed=0w1, shifted=false, setFlags=false, opSize=OpSize64} <::> (* Decr len *) (* Compare *) SubShiftedReg{regM=workReg2, regN=workReg1, regD=XZero, shift=ShiftNone, setFlags=true, opSize=OpSize64} <::> ConditionalBranch(CondEqual, loopLabel) <::> (* Loop if they're equal *) SetLabel exitLabel end | codeExtended _ (BlockMove{srcAddr, destAddr, length, isByteMove}, code) = let (* Construct a loop to move the data. *) val srcReg = srcAddr and destReg = destAddr and lenReg = length val loopLabel = createLabel labelMaker and exitLabel = createLabel labelMaker val (offset, loadType) = if isByteMove then (1, Load8) else if is32in64 then (4, Load32) else (8, Load64) in code <::> SetLabel loopLabel (* Start of loop *) <::> CompareBranch{ test=lenReg, label=exitLabel, onZero=true, opSize=OpSize64} <::> (* Exit when length = 0 *) LoadRegUnscaled{regT=workReg1, regN=srcReg, byteOffset=offset, loadType=loadType, unscaledType=PostIndex} <::> StoreRegUnscaled{regT=workReg1, regN=destReg, byteOffset=offset, loadType=loadType, unscaledType=PostIndex} <::> SubImmediate{regN=lenReg, regD=lenReg, immed=0w1, shifted=false, setFlags=false, opSize=OpSize64} <::> (* Decr len *) UnconditionalBranch loopLabel <::> (* Back to the start *) SetLabel exitLabel end | codeExtended _ (AddSubXSP{ source, dest, isAdd }, code) = let val allocFreeCode = (if isAdd then AddExtendedReg else SubExtendedReg) {regM=source, regN=XSP, regD=XSP, extend=ExtUXTX 0w0, setFlags=false, opSize=OpSize64} :: code in case dest of XZero => allocFreeCode | destReg => (* We have to use add here to get the SP into the destination instead of the usual move. 
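   An explanatory note, not in the original: in the A64 encoding register 31 is the
   zero register in the ORR-based register move, so a plain MOV cannot read SP; the
   architectural idiom is an add of zero, e.g. MOV Xd, SP assembles as
   ADD Xd, SP, #0, which is the instruction generated below.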
*) AddImmediate{regN=XSP, regD=destReg, immed=0w0, shifted=false, setFlags=false, opSize=OpSize64} :: allocFreeCode end | codeExtended _ (TouchValue _, code) = code (* Don't need to do anything now. *) (* Used in mutex operations. *) | codeExtended _ (LoadAcquireExclusive{ base, dest }, code) = LoadAcquireExclusiveRegister{regN=base, regT=dest} :: code | codeExtended _ (StoreReleaseExclusive{ base, source, result }, code) = StoreReleaseExclusiveRegister{regS=result, regT=source, regN=base} :: code | codeExtended _ (MemoryBarrier, code) = code <::> MemBarrier | codeExtended _ (ConvertIntToFloat{ source, dest, srcSize, destSize}, code) = (CvtIntToFP{regN=source, regD=dest, floatSize=destSize, opSize=srcSize}) :: code | codeExtended _ (ConvertFloatToInt{ source, dest, srcSize, destSize, rounding}, code) = (CvtFloatToInt{regN=source, regD=dest, round=rounding, floatSize=srcSize, opSize=destSize}) :: code | codeExtended _ (UnaryFloatingPt{ source, dest, fpOp}, code) = (FPUnaryOp{regN=source, regD=dest, fpOp=fpOp}) :: code | codeExtended _ (BinaryFloatingPoint{ arg1, arg2, dest, fpOp, opSize}, code) = (FPBinaryOp{regN=arg1, regM=arg2, regD=dest, floatSize=opSize, fpOp=fpOp}) :: code | codeExtended _ (CompareFloatingPoint{ arg1, arg2, opSize, ...}, code) = (FPComparison{regN=arg1, regM=arg2, floatSize=opSize}) :: code | codeExtended _ (CPUYield, code) = code <::> Yield | codeExtended _ (AtomicOperation{ base, source, dest, atOp }, code) = AtomicExtension{regN=base, regS=source, regT=dest, atOp=atOp} :: code | codeExtended _ (CacheCheck{ arg1, arg2 }, code) = let val okLabel = createLabel labelMaker in (code <::> SubShiftedReg {regM=arg1, regN=arg2, regD=XZero, shift=ShiftNone, opSize=OpSize64, setFlags=true} <::> ConditionalBranch(CondEqual, okLabel) <::> MoveXRegToXReg{sReg=XZero, dReg=X16} <::> LoadRegScaled{regT=X16, regN=X16, unitOffset=0, loadType=Load16} <::> SetLabel okLabel) end local (* processed - set to true when a block has been processed. *) val processed = Array.array(numBlocks, false) fun haveProcessed n = Array.sub(processed, n) (* Find the blocks that reference this one. This isn't essential but allows us to try to generate blocks in the order of the control flow. This in turn may allow us to use short branches rather than long ones. *) val labelRefs = Array.array(numBlocks, []) datatype flowCode = FlowCodeSimple of int | FlowCodeCMove of {code: precode list, trueJump: int, falseJump: int} (* Process this recursively to set the references. If we have unreachable blocks, perhaps because they've been merged, we don't want to include them in the reference counting. This shouldn't happen now that IdentifyReferences removes unreferenced blocks. *) fun setReferences fromLabel toLabel = case Array.sub(labelRefs, toLabel) of [] => (* Not yet visited at all. *) let val BasicBlock{ flow, ...} = Vector.sub(blocks, toLabel) val refs = case flow of ExitCode => [] | Unconditional lab => [lab] | Conditional{trueJump, falseJump, ... } => [trueJump, falseJump] | IndexedBr labs => labs | SetHandler { handler, continue } => [handler, continue] | UnconditionalHandle _ => [] | ConditionalHandle { continue, ...} => [continue] val () = if fromLabel >= 0 then Array.update(labelRefs, toLabel, [fromLabel]) else () in List.app (setReferences toLabel) refs end | refs => (* We've visiting this at least once. Just add us to the list. *) Array.update(labelRefs, toLabel, fromLabel :: refs) val _ = setReferences 0 0 (* Process the blocks. 
We keep the "stack" explicit rather than using recursion because this allows us to select both arms of a conditional branch sooner. *) fun genCode(toDo, lastFlow, code) = case List.filter (not o haveProcessed) toDo of [] => let (* There's nothing left to do. We may need to add a final branch to the end. *) val finalBranch = case lastFlow of ExitCode => [] | IndexedBr _ => [] | Unconditional dest => [(UnconditionalBranch(getBlockLabel dest))] | Conditional { condition, trueJump, falseJump, ...} => [ (UnconditionalBranch(getBlockLabel falseJump)), (ConditionalBranch(condition, getBlockLabel trueJump)) ] | SetHandler { continue, ...} => [(UnconditionalBranch(getBlockLabel continue))] | UnconditionalHandle _ => [] | ConditionalHandle { continue, ...} => [(UnconditionalBranch(getBlockLabel continue))] in finalBranch @ code (* Done. *) end | stillToDo as head :: _ => let local (* Check the references. If all the sources that lead up to this have already been we won't have any backward jumps. *) fun available dest = List.all haveProcessed (Array.sub(labelRefs, dest)) val continuation = case lastFlow of ExitCode => NONE | IndexedBr _ => NONE (* We could put the last branch in here. *) | Unconditional dest => if not (haveProcessed dest) andalso available dest then SOME(FlowCodeSimple dest) else NONE | Conditional {trueJump, falseJump, condition, ...} => (* We can usually choose either destination and in nearly all cases it won't matter. The default branch is not to take forward jumps so if there is reason to believe that one branch is more likely we should follow that branch now and leave the other. If we have Cond(No)Overflow we assume that overflow is unusual. If one branch raises an exception we assume that that is unusual. *) let val (first, second) = case (condition, Vector.sub(blocks, falseJump)) of (CondNoOverflow, _) => (trueJump, falseJump) | (_, BasicBlock{ flow=ExitCode, block, ...}) => if List.exists(fn RaiseExceptionPacket _ => true | _ => false) block then (trueJump, falseJump) else (falseJump, trueJump) | _ => (falseJump, trueJump) in if not (haveProcessed first) andalso available first then SOME(FlowCodeSimple first) else if not (haveProcessed second) andalso available second then SOME(FlowCodeSimple second) else NONE end | SetHandler { continue, ... } => (* We want the continuation if possible. We'll need a branch round the handler so that won't help. *) if not (haveProcessed continue) andalso available continue then SOME(FlowCodeSimple continue) else NONE | UnconditionalHandle _ => NONE | ConditionalHandle _ => NONE in (* First choice - continue the existing block. Second choice - the first item whose sources have all been processed. Third choice - something from the list. *) val picked = case continuation of SOME c => c | NONE => case List.find available stillToDo of SOME c => FlowCodeSimple c | NONE => FlowCodeSimple head end in case picked of FlowCodeSimple picked => let val () = Array.update(processed, picked, true) (* Code to terminate the previous block. *) val startCode = case lastFlow of ExitCode => [] | IndexedBr _ => [] | UnconditionalHandle _ => [] | Unconditional dest => if dest = picked then [] else [(UnconditionalBranch(getBlockLabel dest))] | ConditionalHandle { continue, ...} => if continue = picked then [] else [(UnconditionalBranch(getBlockLabel continue))] | SetHandler { continue, ... } => if continue = picked then [] else [(UnconditionalBranch(getBlockLabel continue))] | Conditional { condition, trueJump, falseJump, ...} => if picked = falseJump (* Usual case. 
*) then [(ConditionalBranch(condition, getBlockLabel trueJump))] else if picked = trueJump then (* We have a jump to the true condition. Invert the jump. This is more than an optimisation. Because this immediately precedes the true block we're not going to generate a label. *) [(ConditionalBranch(invertTest condition, getBlockLabel falseJump))] else [ (UnconditionalBranch(getBlockLabel falseJump)), (ConditionalBranch(condition, getBlockLabel trueJump)) ] (* Code-generate the body with the code we've done so far at the end. Add a label at the start if necessary. *) local (* If the previous block dropped through to this and this was the only reference then we don't need a label. *) fun onlyJumpingHere (lab: int) = if lab <> picked then false else case Array.sub(labelRefs, picked) of [singleton] => singleton = lab | _ => false val noLabel = case lastFlow of ExitCode => picked = 0 (* Unless this was the first block. *) | Unconditional dest => onlyJumpingHere dest | Conditional { trueJump, falseJump, ...} => onlyJumpingHere trueJump orelse onlyJumpingHere falseJump | IndexedBr _ => false | SetHandler _ => false | UnconditionalHandle _ => false | ConditionalHandle { continue, ...} => onlyJumpingHere continue in val startLabel = if noLabel then [] else [(SetLabel(getBlockLabel picked))] end val BasicBlock { flow, block, ...} = Vector.sub(blocks, picked) local fun genCodeBlock(instr, code) = codeExtended {flow=flow} (instr, code) in val bodyCode = List.foldl genCodeBlock (startLabel @ startCode @ code) block end val addSet = case flow of ExitCode => [] | IndexedBr cases => cases | Unconditional dest => [dest] | Conditional {trueJump, falseJump, ...} => [falseJump, trueJump] | SetHandler { handler, continue } => [handler, continue] | UnconditionalHandle _ => [] | ConditionalHandle { continue, ...} => [continue] in genCode(addSet @ stillToDo, flow, bodyCode) end | FlowCodeCMove{code, trueJump, falseJump} => let (* We've generated a conditional move and possibly a return. If the trueJump and falseJump are only ever referenced from this block they're done, otherwise we still need to do them. *) val _ = case Array.sub(labelRefs, trueJump) of [_] => Array.update(processed, trueJump, true) | _ => () val _ = case Array.sub(labelRefs, falseJump) of [_] => Array.update(processed, falseJump, true) | _ => () val BasicBlock { flow, ...} = Vector.sub(blocks, trueJump) val addSet = case flow of ExitCode => [] | Unconditional dest => [dest] | _ => raise InternalError "FlowCodeCMove" in genCode(addSet @ stillToDo, flow, code) end end in val ops = genCode([0], ExitCode, [(SetLabel startOfFunctionLabel)]) end in generateFinalCode{instrs=List.rev ops, name=functionName, resultClosure=resultClosure, parameters=debugSwitches, profileObject=profileObject, labelMaker= labelMaker} end structure Sharing = struct type ('genReg, 'optGenReg, 'fpReg) basicBlock = ('genReg, 'optGenReg, 'fpReg) basicBlock and xReg = xReg and vReg = vReg and closureRef = closureRef end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64IdentifyReferences.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64IdentifyReferences.ML index 20c21b7c..e2f9faef 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64IdentifyReferences.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64IdentifyReferences.ML @@ -1,894 +1,896 @@ (* Copyright (c) 2021-2 David C.J. 
Matthews This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64IdentifyReferences( structure Arm64ICode: ARM64ICODE structure Debug: DEBUG structure IntSet: INTSET ): ARM64IDENTIFYREFERENCES = struct open Arm64ICode open IntSet type regState = { active: int, refs: int, pushState: bool, prop: regProperty } (* CC states before and after. Currently no instruction uses the condition; conditional branches are handled at the block level. The result of executing the instruction may be to set the condition code to a defined state, an undefined state or leave it unchanged. *) datatype outCCState = CCSet of ccRef | CCIndeterminate | CCUnchanged and inCCState = CCNeeded of ccRef | CCUnused datatype extendedBasicBlock = ExtendedBasicBlock of { block: {instr: iCodeAbstract, current: intSet, active: intSet, kill: intSet } list, flow: controlFlow, locals: intSet, (* Defined and used entirely within the block. *) imports: intSet, (* Defined outside the block, used inside it, but not needed afterwards. *) exports: intSet, (* Defined within the block, possibly used inside, but used outside. *) passThrough: intSet, (* Active throughout the block. May be referred to by it but needed afterwards. *) loopRegs: intSet, (* Destination registers for a loop. They will be updated by this block. *) initialStacks: intSet, (* Stack items required at the start i.e. imports+passThrough for stack items. *) inCCState: ccRef option, (* The state this block assumes. If SOME _ all predecessors must set it. *) outCCState: ccRef option (* The condition code set by this block. SOME _ if at least one successor needs it. *) } exception InternalError = Misc.InternalError (* Return the list of blocks that are the immediate successor of this. 
*) fun blockSuccessors(BasicBlock{flow, ...}) = successorBlocks flow fun getOptReg(SomeReg reg) = [reg] | getOptReg ZeroReg = [] fun getInstructionState(MoveRegister { source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadNonAddressConstant { dest, ...}) = { sources=[], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadFPConstant { dest, ...}) = { sources=[], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadAddressConstant { dest, ...}) = { sources=[], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadWithConstantOffset { base, dest, ...}) = { sources=[base], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadFPWithConstantOffset { base, dest, ...}) = { sources=[base], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadWithIndexedOffset { base, dest, index, ...}) = { sources=[base, index], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadFPWithIndexedOffset { base, dest, index, ...}) = { sources=[base, index], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(GetThreadId { dest, ...}) = { sources=[], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(ObjectIndexAddressToAbsolute { source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(AbsoluteToObjectIndex { source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(AllocateMemoryFixed { dest, ...}) = { sources=[], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(AllocateMemoryVariable{size, dest, ...}) = { sources=[size], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(InitialiseMem{size, addr, init}) = { sources=[size, addr, init], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BeginLoop) = (* This is just a marker. It doesn't actually generate any code. 
*) { sources=[], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(JumpLoop{regArgs, stackArgs, ...}) = let fun getSourceFromRegs({src=ArgInReg reg, ...}, (regArgs, stackArgs)) = (reg :: regArgs, stackArgs) | getSourceFromRegs({src=ArgOnStack { container, ...}, ...}, (regArgs, stackArgs)) = (regArgs, container :: stackArgs) and getSourceFromStack({src=ArgInReg reg, ...}, (regArgs, stackArgs)) = (reg :: regArgs, stackArgs) | getSourceFromStack({src=ArgOnStack { container, ...}, ...}, (regArgs, stackArgs)) = (regArgs, container :: stackArgs) val (regSources, stackSources) = List.foldl getSourceFromRegs (List.foldl getSourceFromStack ([], []) stackArgs) regArgs in { sources=regSources, dests=[], sStacks=stackSources, dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } end | getInstructionState(StoreWithConstantOffset { base, source, ...}) = { sources=[source, base], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(StoreFPWithConstantOffset { base, source, ...}) = { sources=[source, base], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(StoreWithIndexedOffset { base, source, index, ...}) = { sources=[source, base, index], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(StoreFPWithIndexedOffset { base, source, index, ...}) = { sources=[source, base, index], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(AddSubImmediate{ source, dest, ccRef, ... }) = { sources=[source], dests=getOptReg dest, sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=case ccRef of NONE => CCUnchanged | SOME cc => CCSet cc } | getInstructionState(AddSubRegister{ base, shifted, dest, ccRef, ... }) = { sources=[base, shifted], dests=getOptReg dest, sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=case ccRef of NONE => CCUnchanged | SOME cc => CCSet cc } | getInstructionState(LogicalImmediate{ source, dest, ccRef, ... }) = { sources=[source], dests=getOptReg dest, sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=case ccRef of NONE => CCUnchanged | SOME cc => CCSet cc } | getInstructionState(LogicalRegister{ base, shifted, dest, ccRef, ... }) = { sources=[base, shifted], dests=getOptReg dest, sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=case ccRef of NONE => CCUnchanged | SOME cc => CCSet cc } | getInstructionState(ShiftRegister{ source, shift, dest, ... }) = { sources=[source, shift], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(Multiplication{ dest, sourceA, sourceM, sourceN, ... }) = { sources=getOptReg sourceA @ [sourceM, sourceN], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(Division{ dest, dividend, divisor, ... }) = { sources=[dividend, divisor], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BeginFunction {regArgs, fpRegArgs, stackArgs, ...}) = { sources=[], dests=map #1 regArgs @ map #1 fpRegArgs, sStacks=[], dStacks=stackArgs, ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(FunctionCall{regArgs, stackArgs, dests, fpRegArgs, fpDests, containers, ...}) = let (* Non-tail-recursive. Behaves as a normal reference to sources. 
*) fun getSourceFromRegs((ArgInReg reg, _), (regArgs, stackArgs)) = (reg :: regArgs, stackArgs) | getSourceFromRegs((ArgOnStack { container, ...}, _), (regArgs, stackArgs)) = (regArgs, container :: stackArgs) and getSourceFromStack(ArgInReg reg, (regArgs, stackArgs)) = (reg :: regArgs, stackArgs) | getSourceFromStack(ArgOnStack { container, ...}, (regArgs, stackArgs)) = (regArgs, container :: stackArgs) val (argSources, stackSources) = List.foldl getSourceFromRegs (List.foldl getSourceFromStack ([], []) stackArgs) regArgs val fpArgSources = List.map #1 fpRegArgs in { sources=argSources @ fpArgSources, dests=List.map #1 dests @ List.map #1 fpDests, sStacks=stackSources @ containers, dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } end | getInstructionState(TailRecursiveCall{regArgs, fpRegArgs, stackArgs, ...}) = let (* Tail recursive call. References the argument sources but exits. *) fun getSourceFromRegs((ArgInReg reg, _), (regArgs, stackArgs)) = (reg :: regArgs, stackArgs) | getSourceFromRegs((ArgOnStack { container, ...}, _), (regArgs, stackArgs)) = (regArgs, container :: stackArgs) and getSourceFromStack({src=ArgInReg reg, ...}, (regArgs, stackArgs)) = (reg :: regArgs, stackArgs) | getSourceFromStack({src=ArgOnStack { container, ...}, ...}, (regArgs, stackArgs)) = (regArgs, container :: stackArgs) val (argSources, stackSources) = List.foldl getSourceFromRegs (List.foldl getSourceFromStack ([], []) stackArgs) regArgs val fpArgSources = List.map #1 fpRegArgs in { sources=argSources@fpArgSources, dests=[], sStacks=stackSources, dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } end | getInstructionState(ReturnResultFromFunction{results, returnReg, ...}) = { sources=returnReg :: List.map #1 results, dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(RaiseExceptionPacket{packetReg}) = { sources=[packetReg], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(PushToStack{ source, container, ... }) = { sources=[source], dests=[], sStacks=[], dStacks=[container], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadStack{ dest, container, ... }) = { sources=[], dests=[dest], sStacks=[container], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(StoreToStack{ source, container, ... }) = (* Although this stores into the container it must already exist. *) { sources=[source], dests=[], sStacks=[container], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(ContainerAddress{ dest, container, ... 
}) = { sources=[], dests=[dest], sStacks=[container], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(ResetStackPtr _) = { sources=[], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(TagValue{source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(UntagValue{source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BoxLarge{source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(UnboxLarge{source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BoxTagFloat{source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(UnboxTagFloat{source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadAcquire { base, dest, ...}) = { sources=[base], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(StoreRelease { base, source, ...}) = { sources=[source, base], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BitFieldShift{ source, dest, ... }) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BitFieldInsert{ source, destAsSource, dest, ... }) = { sources=[source, destAsSource], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(IndexedCaseOperation{ testReg }) = { sources=[testReg], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(PushExceptionHandler) = { sources=[], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(PopExceptionHandler) = { sources=[], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BeginHandler{packetReg}) = (* The packet register is a destination since this provides its definition. 
*) { sources=[], dests=[packetReg], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(CompareByteVectors{vec1Addr, vec2Addr, length, ccRef, ...}) = { sources=[vec1Addr, vec2Addr, length], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCSet ccRef } | getInstructionState(BlockMove{srcAddr, destAddr, length, ...}) = { sources=[srcAddr, destAddr, length], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(AddSubXSP{source, dest, ...}) = { sources=[source], dests=getOptReg dest, sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(TouchValue{source}) = { sources=[source], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadAcquireExclusive{base, dest}) = { sources=[base], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(StoreReleaseExclusive{base, source, result}) = { sources=[base] @ getOptReg source, dests=[result], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(MemoryBarrier) = { sources=[], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(ConvertIntToFloat{ source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(ConvertFloatToInt{ source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(UnaryFloatingPt{ source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BinaryFloatingPoint{ arg1, arg2, dest, ...}) = { sources=[arg1, arg2], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(CompareFloatingPoint{ arg1, arg2, ccRef, ...}) = { sources=[arg1, arg2], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCSet ccRef } | getInstructionState(CPUYield) = { sources=[], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(AtomicOperation{ base, source, dest, ... }) = { sources=base :: getOptReg source, dests=getOptReg dest, sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(CacheCheck{ arg1, arg2}) = { sources=[arg1, arg2], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } (* These instructions can be eliminated if their register sources are not used. There may be other cases. *) fun eliminateable(MoveRegister _) = true | eliminateable(LoadNonAddressConstant _) = true | eliminateable(LoadAddressConstant _) = true | eliminateable(LoadWithConstantOffset _) = true | eliminateable(LoadWithIndexedOffset _) = true | eliminateable(ObjectIndexAddressToAbsolute _) = true | eliminateable(TagValue _) = true | eliminateable(UntagValue _) = true | eliminateable(BoxLarge _) = true | eliminateable(UnboxLarge _) = true + | eliminateable(BoxTagFloat _) = true + | eliminateable(UnboxTagFloat _) = true | eliminateable _ = false fun identifyRegs(blockVector, pregProps): extendedBasicBlock vector * regState vector = let val maxPRegs = Vector.length pregProps val vectorLength = Vector.length blockVector (* Initial arrays - declarationArray is the set of registers given values by the block, importArray is the set of registers referenced by the block and not declared locally. 
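   A purely illustrative example with made-up register numbers: for a block whose
   only instruction is  r3 := r1 + r2  and whose successor reads r3 and r4, this
   pass records declarations = {r3} and imports = {r1, r2}; the later phases will
   then classify r3 as an export of this block and r4 as part of its pass-through
   set.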
*) val declarationArray = Array.array(vectorLength, emptySet) and importArray = Array.array(vectorLength, emptySet) val stackDecArray = Array.array(vectorLength, emptySet) and stackImportArray = Array.array(vectorLength, emptySet) and localLoopRegArray = Array.array(vectorLength, emptySet) (* References - this is used locally to see if a register is ever actually used and also included in the result which uses it as part of the choice of which register to spill. *) val regRefs = Array.array(maxPRegs, 0) (* Registers that must be pushed because they are required after a function call. For cache registers this means "discard". *) and requirePushOrDiscard = Array.array(maxPRegs, false) fun incrRef r = Array.update(regRefs, r, Array.sub(regRefs, r)+1) (* Contains the, possibly filtered, code for each block. *) val resultCode = Array.array(vectorLength, NONE) val ccInStates = Array.array(vectorLength, CCUnused) and ccOutStates = Array.array(vectorLength, CCIndeterminate) (* First pass - for each block build up the sets of registers defined and used in the block. We do this depth-first so that we can use "refs" to see if a register is used. If this is an instruction that can be eliminated we don't need to generate it and can ignore any references it makes. *) local fun blockScan blockNo = if isSome(Array.sub(resultCode, blockNo)) then () else let val () = Array.update(resultCode, blockNo, SOME []) (* Prevent looping. *) val thisBlock as BasicBlock { block, flow, ...} = Vector.sub(blockVector, blockNo) val successors = blockSuccessors thisBlock (* Visit everything reachable first. *) val () = List.app blockScan successors fun scanCode(instr, original as { code, decs, refs, sDecs, sRefs, occIn, occOut, loopRegs, ... }) = let val { sources, dests, sStacks=stackSrcs, dStacks=stackDests, ccIn, ccOut, ... } = getInstructionState instr fun regNo(PReg i) = i and stackNo(StackLoc{rno, ...}) = rno val destRegNos = map regNo dests and sourceRegNos = map regNo sources val stackDestRegNos = map stackNo stackDests and stackSourceRegNos = map stackNo stackSrcs (* If this instruction requires a cc i.e. is SetToCondition or X87FPGetCondition we need to set this as a requirement earlier. If this sets the CC and it is the condition we've been expecting we've satisfied it and can set the previous condition to Unused. We could use this to decide if a comparison is no longer required. That can only happen in very specific circumstances e.g. some tests in Test176.ML so it's not worthwhile. *) val newInCC = case (ccIn, ccOut, occIn) of (cc as CCNeeded _, _, _) => cc (* This instr needs a particular cc. *) | (CCUnused, CCSet _, _) => CCUnused | (CCUnused, _, occIn) => occIn (* If this instruction modifies the CC check to see if it is setting an requirement. *) val _ = case (occIn, ccOut) of (CCNeeded ccRIn, CCSet ccRout) => if ccRIn = ccRout then () else raise InternalError "CCCheck failed" | (CCNeeded _, CCIndeterminate) => raise InternalError "CCCheck failed" | _ => () (* The output CC is the last CC set. Tail instructions that don't change the CC state are ignored until we reach an instruction that sets it. *) val newOutCC = case occOut of CCUnchanged => ccOut | _ => occOut val instrLoopRegs = case instr of JumpLoop{regArgs, ...} => listToSet (map (regNo o #dst) regArgs) | _ => emptySet in if eliminateable instr andalso List.all(fn dReg => Array.sub(regRefs, dReg) = 0) destRegNos then original (* Don't include this instruction. *) else let (* Only mark the sources as referred after we know we're going to need this. 
In that way we may eliminate the instruction that created this source. *) val () = List.app incrRef sourceRegNos in { code = instr :: code, decs = union(listToSet destRegNos, decs), refs = union(listToSet sourceRegNos, refs), sDecs = union(listToSet stackDestRegNos, sDecs), sRefs = union(listToSet stackSourceRegNos, sRefs), occIn = newInCC, occOut = newOutCC, loopRegs = union(loopRegs, instrLoopRegs)} end end (* If we have a conditional branch at the end we need the condition code. It should either be set here or in a preceding block. *) val inCC = case flow of Conditional { ccRef, ...} => CCNeeded ccRef | _ => CCUnused val { code, decs, refs, sDecs, sRefs, occIn, occOut, loopRegs, ... } = List.foldr scanCode {code=[], decs=emptySet, refs=emptySet, sDecs=emptySet, sRefs=emptySet, occIn=inCC, occOut=CCUnchanged, loopRegs=emptySet} block in Array.update(declarationArray, blockNo, decs); (* refs includes local declarations. Remove before adding to the result. *) Array.update(importArray, blockNo, minus(refs, decs)); Array.update(localLoopRegArray, blockNo, loopRegs); Array.update(stackDecArray, blockNo, sDecs); Array.update(stackImportArray, blockNo, minus(sRefs, sDecs)); Array.update(resultCode, blockNo, SOME code); Array.update(ccInStates, blockNo, occIn); Array.update(ccOutStates, blockNo, occOut) end in val () = blockScan 0 (* Start with the root block. *) end (* Second phase - Propagate reference information between the blocks. We need to consider loops here. Do a depth-first scan marking each block. If we find a loop we save the import information we've used. If when we come to process that block we find the import information is different we need to reprocess. *) (* Pass through array - values used in other blocks after this that are not declared in this block. *) val passThroughArray = Array.array(vectorLength, emptySet) val stackPassThroughArray = Array.array(vectorLength, emptySet) (* Exports - those of our declarations that are used in other blocks. *) val exportArray = Array.array(vectorLength, emptySet) val stackExportArray = Array.array(vectorLength, emptySet) (* Loop registers. This contains the registers that are not exported from or passed through this block but are used subsequently as loop registers. *) val loopRegArray = Array.array(vectorLength, emptySet) val () = Array.copy{src=localLoopRegArray, dst=loopRegArray, di=0} (* If any one of the successors requires the CC then this is set. Otherwise we leave it as Unused. *) val ccRequiredOut = Array.array(vectorLength, CCUnused) local datatype loopData = Unprocessed | Processing | Processed | Looped of { regSet: intSet, loopSet: intSet, stackSet: intSet, ccState: inCCState } fun reprocessLoop () = let val reprocess = ref false val loopArray = Array.array(vectorLength, Unprocessed) fun processBlocks blockNo = case Array.sub(loopArray, blockNo) of Processed => (* Already seen this by a different route. *) { regSet = union(Array.sub(passThroughArray, blockNo), Array.sub(importArray, blockNo)), stackSet = union(Array.sub(stackPassThroughArray, blockNo), Array.sub(stackImportArray, blockNo)), ccState = Array.sub(ccInStates, blockNo), loopSet = Array.sub(loopRegArray, blockNo) } | Looped s => s (* We've already seen this in a loop. *) | Processing => (* We have a loop. *) let (* Use the existing input array. 
*) val inputs = { regSet = union(Array.sub(passThroughArray, blockNo), Array.sub(importArray, blockNo)), stackSet = union(Array.sub(stackPassThroughArray, blockNo), Array.sub(stackImportArray, blockNo)), ccState = Array.sub(ccInStates, blockNo), loopSet = Array.sub(loopRegArray, blockNo) } val () = Array.update(loopArray, blockNo, Looped inputs) in inputs end | Unprocessed => (* Normal case - not visited yet. *) let val () = Array.update(loopArray, blockNo, Processing) val thisBlock = Vector.sub(blockVector, blockNo) val ourDeclarations = Array.sub(declarationArray, blockNo) and ourStackDeclarations = Array.sub(stackDecArray, blockNo) and ourLocalLoopRegs = Array.sub(localLoopRegArray, blockNo) val successors = blockSuccessors thisBlock fun addSuccessor b = let val {regSet=theirImports, stackSet=theirStackImports, ccState=theirInState, loopSet=theirLoops} = processBlocks b (* Remove loop regs from the imports if they are actually given new values by this block. We don't want to pass the old loop regs through here. *) val theirImports = minus(theirImports, ourLocalLoopRegs) (* Split the imports. If a register is a local declaration then it becomes an export. If it is not it becomes part of our passThrough. *) val (addToExp, addToImp) = IntSet.partition (fn i => member(i, ourDeclarations)) theirImports val (addToStackExp, addToStackImp) = IntSet.partition (fn i => member(i, ourStackDeclarations)) theirStackImports (* Merge the input states from each of the successors. *) val () = case (theirInState, Array.sub(ccRequiredOut, blockNo)) of (CCNeeded ts, CCNeeded req) => if ts = req then () else raise InternalError "Mismatched states" | (ts as CCNeeded _, _) => Array.update(ccRequiredOut, blockNo, ts) | _ => () (* Add loop registers to the set if they are not declared here. The only place they are declared is at the entry to the loop so that stops them being propagated further. *) val addToLoops = minus(theirLoops, ourDeclarations) in Array.update(exportArray, blockNo, union(Array.sub(exportArray, blockNo), addToExp)); Array.update(passThroughArray, blockNo, union(Array.sub(passThroughArray, blockNo), addToImp)); Array.update(stackExportArray, blockNo, union(Array.sub(stackExportArray, blockNo), addToStackExp)); Array.update(stackPassThroughArray, blockNo, union(Array.sub(stackPassThroughArray, blockNo), addToStackImp)); Array.update(loopRegArray, blockNo, union(Array.sub(loopRegArray, blockNo), addToLoops)) end val () = List.app addSuccessor successors val ourInputs = union(Array.sub(passThroughArray, blockNo), Array.sub(importArray, blockNo)) val ourStackInputs = union(Array.sub(stackPassThroughArray, blockNo), Array.sub(stackImportArray, blockNo)) in (* Check that we supply the required state. *) case (Array.sub(ccRequiredOut, blockNo), Array.sub(ccOutStates, blockNo)) of (CCNeeded ccReq, CCSet ccSet) => if ccReq = ccSet then () else raise InternalError "Mismatched cc states" | (CCNeeded _, CCIndeterminate) => raise InternalError "Mismatched cc states" | (cc as CCNeeded needOut, CCUnchanged) => ( (* We pass through the state. If we don't use the state then we need to set this as the input. If we do use the state it must be the same. *) case Array.sub(ccInStates, blockNo) of CCUnused => Array.update(ccInStates, blockNo, cc) | CCNeeded needIn => if needOut = needIn then () else raise InternalError "Mismatched cc states" ) | _ => (); (* Was this block used in a loop? If so we should not be requiring a CC. 
*) case Array.sub(loopArray, blockNo) of Looped {regSet, stackSet, ...} => ( case Array.sub(ccInStates, blockNo) of CCNeeded _ => raise InternalError "Looped state needs cc" | _ => (); if setToList regSet = setToList ourInputs andalso setToList stackSet = setToList ourStackInputs then () else reprocess := true ) | _ => (); Array.update(loopArray, blockNo, Processed); { regSet = ourInputs, stackSet = ourStackInputs, ccState = Array.sub(ccInStates, blockNo), loopSet=Array.sub(loopRegArray, blockNo)} end in reprocess := false; processBlocks 0; if !reprocess then reprocessLoop () else () end in val () = reprocessLoop () end (* Third pass - Build the result list with the active registers for each instruction. We don't include registers in the passThrough set since they are active throughout the block. *) local (* Number of instrs for which this is active. We use this to try to select a register to push to the stack if we have too many. Registers that have only a short lifetime are less likely to be pushed than those that are active longer. *) val regActive = Array.array(maxPRegs, 0) fun addActivity n r = Array.update(regActive, r, Array.sub(regActive, r)+n) fun createResultInstrs (passThrough, stackPassThrough) (instr, (tail, activeAfterThis, stackActiveAfterThis)) = let val { sources, dests, sStacks=stackSrcs, dStacks=stackDests, ... } = getInstructionState instr in (* Eliminate instructions if their results are not required. The earlier check for this will remove most cases but if we have duplicated a block we may have a register that is required elsewhere but not in this particular branch. *) if not(List.exists(fn PReg d => member(d, activeAfterThis)) dests) andalso eliminateable instr then (tail, activeAfterThis, stackActiveAfterThis) else let fun regNo(PReg i) = i fun stackNo(StackLoc{rno, ...}) = rno val destRegNos = map regNo dests and sourceRegNos = map regNo sources val destSet = listToSet destRegNos (* Remove any sources that are present in passThrough since they are going to be active throughout the block. *) and sourceSet = minus(listToSet sourceRegNos, passThrough) val stackDestRegNos = map stackNo stackDests and stackSourceRegNos = map stackNo stackSrcs val stackDestSet = listToSet stackDestRegNos and stackSourceSet = minus(listToSet stackSourceRegNos, stackPassThrough) (* To compute the active set for the PREVIOUS instruction (we're processing from the end back to the start) we remove any registers that have been given values in this instruction and add anything that we are using in this instruction since they will now need to have values. *) val afterRemoveDests = minus(activeAfterThis, destSet) val stackAfterRemoveDests = minus(stackActiveAfterThis, stackDestSet) val activeForPrevious = union(sourceSet, afterRemoveDests) val stackActiveForPrevious = union(stackSourceSet, stackAfterRemoveDests) (* The "active" set is the set of registers that need to be active DURING the instruction. It includes destinations, which will usually be in "activeAfterThis", because there may be destinations that are not actually used subsequently but still need a register. *) val activeForInstr = case instr of FunctionCall _ => sourceSet (* Is this still needed? *) | TailRecursiveCall _ => (* Set the active set to the total set of registers we require including the work register. This ensures that we will spill as many registers as we require when we look at the size of the active set. *) union(sourceSet, destSet) | BoxLarge _ => (* We can only store the value in the box after the box is allocated. 
*) union(activeAfterThis, union(sourceSet, destSet)) | BoxTagFloat _ => (* Since the source must be a V register and the destination an X register there isn't actually a problem here, but do this anyway. *) union(activeAfterThis, union(sourceSet, destSet)) | _ => union(activeAfterThis, destSet) val () = List.app(addActivity 1) (setToList activeForInstr) local (* If we are allocating memory we have to save the current registers if they could contain an address. We mustn't push untagged registers and we mustn't push the destination. *) fun getSaveSet includeReg = let val activeAfter = union(activeAfterThis, passThrough) (* Remove any registers marked - must-not-push. These are registers holding non-address values. They will actually be saved by the RTS across any GC but not checked or modified by the GC. Exclude the result register. *) fun getSave i = if includeReg i then case Vector.sub(pregProps, i) of RegPropGeneral => SOME(PReg i) | RegPropCacheTagged => SOME(PReg i) | RegPropUntagged => NONE | RegPropStack _ => NONE | RegPropCacheUntagged => NONE | RegPropMultiple => raise InternalError "getSave: RegPropMultiple" else NONE in List.mapPartial getSave (setToList activeAfter) end in (* Sometimes we need to modify the instruction e.g. to include the set of registers to save. *) val convertedInstr = case instr of AllocateMemoryFixed{bytesRequired, dest, saveRegs=_} => AllocateMemoryFixed{bytesRequired=bytesRequired, dest=dest, saveRegs=getSaveSet(fn i => i <> regNo dest)} | AllocateMemoryVariable{size, dest, saveRegs=_} => AllocateMemoryVariable{size=size, dest=dest, saveRegs=getSaveSet(fn i => i <> regNo dest)} | BoxLarge{source, dest, saveRegs=_} => BoxLarge{source=source, dest=dest, saveRegs=getSaveSet(fn i => i <> regNo dest)} | BoxTagFloat{source, dest, floatSize, saveRegs=_} => BoxTagFloat{source=source, dest=dest, floatSize=floatSize, saveRegs=getSaveSet(fn i => i <> regNo dest)} | JumpLoop{regArgs, stackArgs, checkInterrupt = SOME _, ...} => let (* If we have to check for interrupts we must preserve registers across the RTS call. *) fun getSave i = case Vector.sub(pregProps, i) of RegPropGeneral => SOME(PReg i) | RegPropCacheTagged => SOME(PReg i) | RegPropUntagged => NONE | RegPropStack _ => NONE | RegPropCacheUntagged => NONE | RegPropMultiple => raise InternalError "getSave: RegPropMultiple" val currentRegs = union(activeAfterThis, passThrough) (* Have to include the loop registers. These were previously included automatically because they were part of the import set. *) val check = List.mapPartial getSave (map (regNo o #dst) regArgs @ setToList currentRegs) in JumpLoop{regArgs=regArgs, stackArgs=stackArgs, checkInterrupt=SOME check} end | FunctionCall{regArgs, stackArgs=[], dests, fpRegArgs=[], fpDests=[], callKind as ConstantCode m, saveRegs=_, containers} => (* If this is arbitrary precision push the registers rather than marking them as "save". stringOfWord returns 'CODE "PolyAddArbitrary"' etc. *) if (String.isSubstring "Arbitrary\"" (Address.stringOfWord m)) then let val destRegs = List.map (regNo o #1) dests fun includeInSave i = not(List.exists(fn r => r=i) destRegs) in FunctionCall{regArgs=regArgs, stackArgs=[], callKind=callKind, dests=dests, fpRegArgs=[], fpDests=[], containers=containers, saveRegs=getSaveSet includeInSave} end else instr | _ => instr end (* FunctionCall must mark all registers as "push". 
*) local fun pushRegisters () = let val activeAfter = union(activeAfterThis, passThrough) fun pushAllButDests i = if List.exists(fn j => i=j) destRegNos then () else case Vector.sub(pregProps, i) of RegPropCacheTagged => raise InternalError "pushRegisters: cache reg" | RegPropCacheUntagged => raise InternalError "pushRegisters: cache reg" | _ => Array.update(requirePushOrDiscard, i, true) in (* We need to push everything active after this except the result register. *) List.app pushAllButDests (setToList activeAfter) end in val () = case instr of FunctionCall{ stackArgs=[], callKind=ConstantCode m, ...} => if (String.isSubstring "Arbitrary\"" (Address.stringOfWord m)) then () else pushRegisters () | FunctionCall _ => pushRegisters () (* It should no longer be necessary to push across a handler but there still seem to be cases that need it. *) (*| BeginHandler _ => pushRegisters ()*) | _ => () end (* Which entries are active in this instruction but not afterwards? *) val kill = union(minus(stackSourceSet, stackActiveAfterThis), minus(sourceSet, activeAfterThis)) in ({instr=convertedInstr, active=activeForInstr, current=activeAfterThis, kill=kill} :: tail, activeForPrevious, stackActiveForPrevious) end end fun createResult blockNo = let val BasicBlock{ flow, ...} = Vector.sub(blockVector, blockNo) val declSet = Array.sub(declarationArray, blockNo) and importSet = Array.sub(importArray, blockNo) and passSet = Array.sub(passThroughArray, blockNo) and loopSet = Array.sub(loopRegArray, blockNo) and exportSet = Array.sub(exportArray, blockNo) and stackPassSet = Array.sub(stackPassThroughArray, blockNo) and stackImportSet = Array.sub(stackImportArray, blockNo) and stackExportSet = Array.sub(stackExportArray, blockNo) val filteredCode = getOpt(Array.sub(resultCode, blockNo), []) (* At the end of the block we should have the exports active. *) val (resultInstrs, _, _) = List.foldr (createResultInstrs (passSet, stackPassSet)) ([], exportSet, stackExportSet) filteredCode (* Set the active count for the pass through. *) val instrCount = List.length filteredCode val () = List.app(addActivity instrCount) (setToList passSet) val inCCState = case Array.sub(ccInStates, blockNo) of CCNeeded s => SOME s | CCUnused => NONE val outCCState = case Array.sub(ccRequiredOut, blockNo) of CCNeeded s => SOME s | CCUnused => NONE in ExtendedBasicBlock { block = resultInstrs, flow=flow, locals = minus(declSet, exportSet), imports = importSet, exports = exportSet, passThrough = passSet, loopRegs = loopSet, initialStacks = union(stackPassSet, stackImportSet), inCCState = inCCState, outCCState = outCCState } end in val resultBlocks = Vector.tabulate(vectorLength, createResult) val regActive = regActive end val registerState: regState vector = Vector.tabulate(maxPRegs, fn i => { active = Array.sub(regActive, i), refs = Array.sub(regRefs, i), pushState = Array.sub(requirePushOrDiscard, i), prop = Vector.sub(pregProps, i) } ) in (resultBlocks, registerState) end (* Exported function. First filter out unreferenced blocks then process the registers themselves. *) fun identifyRegisters(blockVector, pregProps) = let val vectorLength = Vector.length blockVector val mapArray = Array.array(vectorLength, NONE) and resArray = Array.array(vectorLength, NONE) val count = ref 0 fun setReferences label = case Array.sub(mapArray, label) of NONE => (* Not yet visited *) let val BasicBlock{flow, block} = Vector.sub(blockVector, label) (* Create a new entry for it. *) val newLabel = ! count before count := !count + 1 (* Add it to the map. 
Any other references will use this without reprocessing. *) val () = Array.update(mapArray, label, SOME newLabel) val newFlow = case flow of Unconditional l => Unconditional(setReferences l) | Conditional{trueJump, falseJump, ccRef, condition} => Conditional{trueJump=setReferences trueJump, falseJump=setReferences falseJump, ccRef=ccRef, condition=condition} | ExitCode => ExitCode | IndexedBr list => IndexedBr(map setReferences list) | SetHandler{handler, continue} => SetHandler{handler=setReferences handler, continue=setReferences continue} | UnconditionalHandle l => UnconditionalHandle(setReferences l) | ConditionalHandle{handler, continue} => ConditionalHandle{handler=setReferences handler, continue=setReferences continue} val () = Array.update(resArray, newLabel, SOME(BasicBlock{flow=newFlow, block=block})) in newLabel end | SOME lab => lab val _ = setReferences 0 val newBlockVector = Vector.tabulate(!count, fn i => valOf(Array.sub(resArray, i))) in identifyRegs(newBlockVector, pregProps) end (* Exported for use in GetConflictSets *) fun getInstructionRegisters instr = let val {sources, dests, ...} = getInstructionState instr in {sources=sources, dests=dests} end (* Exported for use in ICodeOptimise *) val getInstructionCC = #ccOut o getInstructionState structure Sharing = struct type ('genReg, 'optGenReg, 'fpReg) arm64ICode = ('genReg, 'optGenReg, 'fpReg) arm64ICode and preg = preg and pregOrZero = pregOrZero and intSet = intSet and ('genReg, 'optGenReg, 'fpReg) basicBlock = ('genReg, 'optGenReg, 'fpReg) basicBlock and extendedBasicBlock = extendedBasicBlock and controlFlow = controlFlow and regProperty = regProperty and ccRef = ccRef and outCCState = outCCState end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PreAssembly.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PreAssembly.ML index 0b7481e8..1282ee5c 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PreAssembly.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PreAssembly.ML @@ -1,1107 +1,1105 @@ (* Copyright (c) 2021-2 David C. J. Matthews This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public Licence version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public Licence for more details. You should have received a copy of the GNU Lesser General Public Licence along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) (* The pre-assembly layer goes below the icode and allows peep-hole optimisation. *) functor Arm64PreAssembly( structure Arm64Assembly: ARM64ASSEMBLY structure Debug: DEBUG structure Pretty: PRETTY ): ARM64PREASSEMBLY = struct open Arm64Assembly exception InternalError = Misc.InternalError (* Reversed cons and append to make the code easier to read. 
*) infix 5 <::> <@> fun tl <::> hd = hd :: tl and snd <@> fst = fst @ snd (* Many of the datatypes are inherited from Arm64Assembly *) datatype loadType = Load64 | Load32 | Load16 | Load8 and opSize = OpSize32 | OpSize64 and logicalOp = LogAnd | LogOr | LogXor and floatSize = Float32 | Double64 and shiftDirection = ShiftLeft | ShiftRightLogical | ShiftRightArithmetic and multKind = MultAdd32 | MultSub32 | MultAdd64 | MultSub64 | SignedMultAddLong (* 32bit*32bit + 64bit => 64Bit *) | SignedMultHigh (* High order part of 64bit*64Bit *) - and fpUnary = NegFloat | NegDouble | AbsFloat | AbsDouble | ConvFloatToDble | ConvDbleToFloat + and fpUnary = + NegFloat | NegDouble | AbsFloat | AbsDouble | ConvFloatToDble | + ConvDbleToFloat | MoveDouble | MoveFloat and fpBinary = MultiplyFP | DivideFP | AddFP | SubtractFP and unscaledType = NoUpdate | PreIndex | PostIndex and condSet = CondSet | CondSetIncr | CondSetInvert | CondSetNegate and bitfieldKind = BFUnsigned | BFSigned | BFInsert and brRegType = BRRBranch | BRRAndLink | BRRReturn (* Some of the atomic operations added in 8.1 *) and atomicOp = LoadAddAL | LoadUmaxAL | SwapAL | LoadAddAcquire | LoadUMaxAcquire | SwapRelease datatype label = Label of int type labelMaker = int ref fun createLabelMaker() = ref 0 fun createLabel(r as ref n) = Label n before r := n+1 datatype precode = (* Basic instructions *) AddImmediate of {regN: xReg, regD: xReg, immed: word, shifted: bool, opSize: opSize, setFlags: bool} | SubImmediate of {regN: xReg, regD: xReg, immed: word, shifted: bool, opSize: opSize, setFlags: bool} | AddShiftedReg of {regM: xReg, regN: xReg, regD: xReg, shift: shiftType, opSize: opSize, setFlags: bool} | SubShiftedReg of {regM: xReg, regN: xReg, regD: xReg, shift: shiftType, opSize: opSize, setFlags: bool} | AddExtendedReg of {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend, opSize: opSize, setFlags: bool} | SubExtendedReg of {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend, opSize: opSize, setFlags: bool} | MultiplyAndAddSub of {regM: xReg, regN: xReg, regA: xReg, regD: xReg, multKind: multKind} | DivideRegs of {regM: xReg, regN: xReg, regD: xReg, isSigned: bool, opSize: opSize} | LogicalShiftedReg of {regM: xReg, regN: xReg, regD: xReg, shift: shiftType, logOp: logicalOp, opSize: opSize, setFlags: bool} | LoadRegScaled of {regT: xReg, regN: xReg, unitOffset: int, loadType: loadType} | LoadFPRegScaled of {regT: vReg, regN: xReg, unitOffset: int, floatSize: floatSize} | StoreRegScaled of {regT: xReg, regN: xReg, unitOffset: int, loadType: loadType} | StoreFPRegScaled of {regT: vReg, regN: xReg, unitOffset: int, floatSize: floatSize} | LoadRegUnscaled of {regT: xReg, regN: xReg, byteOffset: int, loadType: loadType, unscaledType: unscaledType} | StoreRegUnscaled of {regT: xReg, regN: xReg, byteOffset: int, loadType: loadType, unscaledType: unscaledType} | LoadFPRegUnscaled of {regT: vReg, regN: xReg, byteOffset: int, floatSize: floatSize, unscaledType: unscaledType} | StoreFPRegUnscaled of {regT: vReg, regN: xReg, byteOffset: int, floatSize: floatSize, unscaledType: unscaledType} | LoadRegIndexed of {regT: xReg, regN: xReg, regM: xReg, loadType: loadType, option: scale extend} | StoreRegIndexed of {regT: xReg, regN: xReg, regM: xReg, loadType: loadType, option: scale extend} | LoadFPRegIndexed of {regT: vReg, regN: xReg, regM: xReg, floatSize: floatSize, option: scale extend} | StoreFPRegIndexed of {regT: vReg, regN: xReg, regM: xReg, floatSize: floatSize, option: scale extend} (* LoadAcquire and StoreRelease are 
used for mutables. *) | LoadAcquireReg of {regN: xReg, regT: xReg, loadType: loadType} | StoreReleaseReg of {regN: xReg, regT: xReg, loadType: loadType} (* LoadAcquireExclusiveRegister and StoreReleaseExclusiveRegister are used for mutexes. *) | LoadAcquireExclusiveRegister of {regN: xReg, regT: xReg} | StoreReleaseExclusiveRegister of {regS: xReg, regT: xReg, regN: xReg} | MemBarrier (* Additional atomic operations. *) | AtomicExtension of { regT: xReg, regN: xReg, regS: xReg, atOp: atomicOp } | LoadRegPair of { regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int, loadType: loadType, unscaledType: unscaledType} | StoreRegPair of { regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int, loadType: loadType, unscaledType: unscaledType} | LoadFPRegPair of { regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int, floatSize: floatSize, unscaledType: unscaledType} | StoreFPRegPair of { regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int, floatSize: floatSize, unscaledType: unscaledType} | ConditionalSet of {regD: xReg, regTrue: xReg, regFalse: xReg, cond: condition, condSet: condSet, opSize: opSize} | BitField of {immr: word, imms: word, regN: xReg, regD: xReg, opSize: opSize, bitfieldKind: bitfieldKind} | ShiftRegisterVariable of {regM: xReg, regN: xReg, regD: xReg, opSize: opSize, shiftDirection: shiftDirection} | BitwiseLogical of { bits: Word64.word, regN: xReg, regD: xReg, opSize: opSize, setFlags: bool, logOp: logicalOp} (* Floating point *) - | MoveFPToFP of { regN: vReg, regD: vReg, floatSize: floatSize} | MoveGeneralToFP of { regN: xReg, regD: vReg, floatSize: floatSize} | MoveFPToGeneral of {regN: vReg, regD: xReg, floatSize: floatSize} | CvtIntToFP of { regN: xReg, regD: vReg, floatSize: floatSize, opSize: opSize} | CvtFloatToInt of { round: IEEEReal.rounding_mode, regN: vReg, regD: xReg, floatSize: floatSize, opSize: opSize} | FPBinaryOp of { regM: vReg, regN: vReg, regD: vReg, floatSize: floatSize, fpOp: fpBinary} | FPComparison of { regM: vReg, regN: vReg, floatSize: floatSize} | FPUnaryOp of {regN: vReg, regD: vReg, fpOp: fpUnary} (* Branches and Labels. *) | SetLabel of label | ConditionalBranch of condition * label | UnconditionalBranch of label | BranchAndLink of label | BranchReg of {regD: xReg, brRegType: brRegType } | LoadLabelAddress of xReg * label | TestBitBranch of { test: xReg, bit: Word8.word, label: label, onZero: bool } | CompareBranch of { test: xReg, label: label, onZero: bool, opSize: opSize } (* Composite instructions *) | MoveXRegToXReg of {sReg: xReg, dReg: xReg} | LoadNonAddr of xReg * Word64.word | LoadFPConst of {dest: vReg, value: Word64.word, floatSize: floatSize, work: xReg} | LoadAddr of xReg * machineWord | RTSTrap of { rtsEntry: int, work: xReg, save: xReg list } | AllocateMemoryFixedSize of { bytes: word, dest: xReg, save: xReg list, work: xReg } | AllocateMemoryVariableSize of { sizeReg: xReg, dest: xReg, save: xReg list, work: xReg } (* Branch table for indexed case. startLabel is the address of the first label in the list. The branch table is a sequence of unconditional branches. *) | BranchTable of { startLabel: label, brTable: label list } | LoadGlobalHeapBaseInCallback of xReg | Yield (* Optimise the pre-assembler code and then generate the final code. *) fun generateFinalCode {instrs, name, parameters, resultClosure, profileObject, labelMaker=ref labelCount} = let val labelTargets = Array.tabulate(labelCount, fn i => (Arm64Assembly.createLabel(), i) ) (* Follow the chain of forwarded labels. 
*) local fun forwardLab(labelNo, labels) = let val dest as (_, dNo) = Array.sub(labelTargets, labelNo) in if dNo = labelNo then dest (* This should not happen but just in case... *) else if List.exists(fn i => i = dNo) labels then raise InternalError "Infinite loop" else forwardLab(dNo, dNo::labels) end in fun getLabel labelNo = forwardLab(labelNo, [labelNo]) val getLabelTarget = #1 o getLabel end fun toAssembler([], code) = code | toAssembler(AddImmediate{regN, regD, immed, shifted, opSize, setFlags} :: rest, code) = let val instr = case (opSize, setFlags) of (OpSize64, false) => addImmediate | (OpSize32, false) => addImmediate32 | (OpSize64, true) => addSImmediate | (OpSize32, true) => addSImmediate32 in toAssembler(rest, code <::> instr{regN=regN, regD=regD, immed=immed, shifted=shifted}) end | toAssembler(SubImmediate{regN, regD, immed, shifted, opSize, setFlags} :: rest, code) = let val instr = case (opSize, setFlags) of (OpSize64, false) => subImmediate | (OpSize32, false) => subImmediate32 | (OpSize64, true) => subSImmediate | (OpSize32, true) => subSImmediate32 in toAssembler(rest, code <::> instr{regN=regN, regD=regD, immed=immed, shifted=shifted}) end | toAssembler(AddShiftedReg{regM, regN, regD, shift, opSize, setFlags} :: rest, code) = let val instr = case (opSize, setFlags) of (OpSize64, false) => addShiftedReg | (OpSize32, false) => addShiftedReg32 | (OpSize64, true) => addSShiftedReg | (OpSize32, true) => addSShiftedReg32 in toAssembler(rest, code <::> instr{regM=regM, regN=regN, regD=regD, shift=shift}) end | toAssembler(SubShiftedReg{regM, regN, regD, shift, opSize, setFlags} :: rest, code) = let val instr = case (opSize, setFlags) of (OpSize64, false) => subShiftedReg | (OpSize32, false) => subShiftedReg32 | (OpSize64, true) => subSShiftedReg | (OpSize32, true) => subSShiftedReg32 in toAssembler(rest, code <::> instr{regM=regM, regN=regN, regD=regD, shift=shift}) end | toAssembler(AddExtendedReg{regM, regN, regD, extend, opSize, setFlags} :: rest, code) = (* Add/SubExtended are only used to access XSP. 
*) let val instr = case (opSize, setFlags) of (OpSize64, false) => addExtendedReg | (OpSize32, false) => raise InternalError "AddExtendedReg; 32" | (OpSize64, true) => addSExtendedReg | (OpSize32, true) => raise InternalError "AddExtendedReg; 32" in toAssembler(rest, code <::> instr{regM=regM, regN=regN, regD=regD, extend=extend}) end | toAssembler(SubExtendedReg{regM, regN, regD, extend, opSize, setFlags} :: rest, code) = let val instr = case (opSize, setFlags) of (OpSize64, false) => subExtendedReg | (OpSize32, false) => raise InternalError "AddExtendedReg; 32" | (OpSize64, true) => subSExtendedReg | (OpSize32, true) => raise InternalError "AddExtendedReg; 32" in toAssembler(rest, code <::> instr{regM=regM, regN=regN, regD=regD, extend=extend}) end | toAssembler(MultiplyAndAddSub{regM, regN, regA, regD, multKind} :: rest, code) = let val instr = case multKind of MultAdd32 => multiplyAndAdd32{regM=regM, regN=regN, regA=regA, regD=regD} | MultSub32 => multiplyAndSub32{regM=regM, regN=regN, regA=regA, regD=regD} | MultAdd64 => multiplyAndAdd{regM=regM, regN=regN, regA=regA, regD=regD} | MultSub64 => multiplyAndSub{regM=regM, regN=regN, regA=regA, regD=regD} | SignedMultAddLong => signedMultiplyAndAddLong{regM=regM, regN=regN, regA=regA, regD=regD} | SignedMultHigh => signedMultiplyHigh{regM=regM, regN=regN, regD=regD} in toAssembler(rest, code <::> instr) end | toAssembler(DivideRegs{regM, regN, regD, isSigned, opSize} :: rest, code) = let val instr = case (isSigned, opSize) of (true, OpSize64) => signedDivide | (true, OpSize32) => signedDivide32 | (false, OpSize64) => unsignedDivide | (false, OpSize32) => unsignedDivide32 in toAssembler(rest, code <::> instr{regN=regN, regM=regM, regD=regD}) end | toAssembler(LogicalShiftedReg{regM, regN, regD, shift, logOp, opSize, setFlags} :: rest, code) = let val instr = case (logOp, setFlags, opSize) of (LogAnd, false, OpSize64) => andShiftedReg | (LogAnd, true, OpSize64) => andsShiftedReg | (LogOr, false, OpSize64) => orrShiftedReg | (LogXor, false, OpSize64) => eorShiftedReg | (LogAnd, false, OpSize32) => andShiftedReg32 | (LogAnd, true, OpSize32) => andsShiftedReg32 | (LogOr, false, OpSize32) => orrShiftedReg32 | (LogXor, false, OpSize32) => eorShiftedReg32 | _ => raise InternalError "setFlags not valid with OR or XOR" (* There are also versions of AND/OR/XOR which operate on a complement (NOT) of the shifted register. It's probably not worth looking for a use for them. 
*) in toAssembler(rest, code <::> instr{regN=regN, regM=regM, regD=regD, shift=shift}) end | toAssembler(LoadRegScaled{regT, regN, unitOffset, loadType} :: rest, code) = let val instr = case loadType of Load64 => loadRegScaled | Load32 => loadRegScaled32 | Load16 => loadRegScaled16 | Load8 => loadRegScaledByte in toAssembler(rest, code <::> instr{regT=regT, regN=regN, unitOffset=unitOffset}) end | toAssembler(StoreRegScaled{regT, regN, unitOffset, loadType} :: rest, code) = let val instr = case loadType of Load64 => storeRegScaled | Load32 => storeRegScaled32 | Load16 => storeRegScaled16 | Load8 => storeRegScaledByte in toAssembler(rest, code <::> instr{regT=regT, regN=regN, unitOffset=unitOffset}) end | toAssembler(LoadFPRegScaled{regT, regN, unitOffset, floatSize} :: rest, code) = let val instr = case floatSize of Float32 => loadRegScaledFloat | Double64 => loadRegScaledDouble in toAssembler(rest, code <::> instr{regT=regT, regN=regN, unitOffset=unitOffset}) end | toAssembler(StoreFPRegScaled{regT, regN, unitOffset, floatSize} :: rest, code) = let val instr = case floatSize of Float32 => storeRegScaledFloat | Double64 => storeRegScaledDouble in toAssembler(rest, code <::> instr{regT=regT, regN=regN, unitOffset=unitOffset}) end | toAssembler(LoadRegUnscaled{regT, regN, byteOffset, loadType, unscaledType} :: rest, code) = let val instr = case (loadType, unscaledType) of (Load64, NoUpdate) => loadRegUnscaled | (Load32, NoUpdate) => loadRegUnscaled32 | (Load16, NoUpdate) => loadRegUnscaled16 | (Load8, NoUpdate) => loadRegUnscaledByte | (Load64, PreIndex) => loadRegPreIndex | (Load32, PreIndex) => loadRegPreIndex32 | (Load16, PreIndex) => raise InternalError "loadRegPreIndex16" | (Load8, PreIndex) => loadRegPreIndexByte | (Load64, PostIndex) => loadRegPostIndex | (Load32, PostIndex) => loadRegPostIndex32 | (Load16, PostIndex) => raise InternalError "loadRegPostIndex16" | (Load8, PostIndex) => loadRegPostIndexByte in toAssembler(rest, code <::> instr{regT=regT, regN=regN, byteOffset=byteOffset}) end | toAssembler(LoadFPRegUnscaled{regT, regN, byteOffset, floatSize, unscaledType} :: rest, code) = let val instr = case (floatSize, unscaledType) of (Float32, NoUpdate) => loadRegUnscaledFloat | (Double64, NoUpdate) => loadRegUnscaledDouble | _ => raise InternalError "LoadFPRegUnscaled: pre/post indexed" in toAssembler(rest, code <::> instr{regT=regT, regN=regN, byteOffset=byteOffset}) end | toAssembler(StoreRegUnscaled{regT, regN, byteOffset, loadType, unscaledType} :: rest, code) = let val instr = case (loadType, unscaledType) of (Load64, NoUpdate) => storeRegUnscaled | (Load32, NoUpdate) => storeRegUnscaled32 | (Load16, NoUpdate) => storeRegUnscaled16 | (Load8, NoUpdate) => storeRegUnscaledByte | (Load64, PreIndex) => storeRegPreIndex | (Load32, PreIndex) => storeRegPreIndex32 | (Load16, PreIndex) => raise InternalError "storeRegPreIndex16" | (Load8, PreIndex) => storeRegPreIndexByte | (Load64, PostIndex) => storeRegPostIndex | (Load32, PostIndex) => storeRegPostIndex32 | (Load16, PostIndex) => raise InternalError "storeRegPostIndex16" | (Load8, PostIndex) => storeRegPostIndexByte in toAssembler(rest, code <::> instr{regT=regT, regN=regN, byteOffset=byteOffset}) end | toAssembler(StoreFPRegUnscaled{regT, regN, byteOffset, floatSize, unscaledType} :: rest, code) = let val instr = case (floatSize, unscaledType) of (Float32, NoUpdate) => storeRegUnscaledFloat | (Double64, NoUpdate) => storeRegUnscaledDouble | _ => raise InternalError "StoreFPRegUnscaled: pre/post indexed" in toAssembler(rest, code 
<::> instr{regT=regT, regN=regN, byteOffset=byteOffset}) end | toAssembler(LoadRegIndexed{regT, regN, regM, loadType, option} :: rest, code) = let val instr = case loadType of Load64 => loadRegIndexed | Load32 => loadRegIndexed32 | Load16 => loadRegIndexed16 | Load8 => loadRegIndexedByte in toAssembler(rest, code <::> instr{regT=regT, regN=regN, regM=regM, option=option}) end | toAssembler(StoreRegIndexed{regT, regN, regM, loadType, option} :: rest, code) = let val instr = case loadType of Load64 => storeRegIndexed | Load32 => storeRegIndexed32 | Load16 => storeRegIndexed16 | Load8 => storeRegIndexedByte in toAssembler(rest, code <::> instr{regT=regT, regN=regN, regM=regM, option=option}) end | toAssembler(LoadFPRegIndexed{regT, regN, regM, floatSize, option} :: rest, code) = let val instr = case floatSize of Float32 => loadRegIndexedFloat | Double64 => loadRegIndexedDouble in toAssembler(rest, code <::> instr{regT=regT, regN=regN, regM=regM, option=option}) end | toAssembler(StoreFPRegIndexed{regT, regN, regM, floatSize, option} :: rest, code) = let val instr = case floatSize of Float32 => storeRegIndexedFloat | Double64 => storeRegIndexedDouble in toAssembler(rest, code <::> instr{regT=regT, regN=regN, regM=regM, option=option}) end | toAssembler(LoadAcquireReg{regN, regT, loadType} :: rest, code) = let val loadInstr = case loadType of Load64 => loadAcquire | Load32 => loadAcquire32 | Load8 => loadAcquireByte | _ => raise InternalError "LoadAcquire: Unsupported size" (* Not used *) in toAssembler(rest, code <::> loadInstr{regT=regT, regN=regN}) end | toAssembler(StoreReleaseReg{regN, regT, loadType} :: rest, code) = let val storeInstr = case loadType of Load64 => storeRelease | Load32 => storeRelease32 | Load8 => storeReleaseByte | _ => raise InternalError "StoreRelease: Unsupported size" (* Not used *) in toAssembler(rest, code <::> storeInstr{regT=regT, regN=regN}) end | toAssembler(LoadAcquireExclusiveRegister{regN, regT} :: rest, code) = toAssembler(rest, code <::> loadAcquireExclusiveRegister{regN=regN, regT=regT}) | toAssembler(StoreReleaseExclusiveRegister{regN, regT, regS} :: rest, code) = toAssembler(rest, code <::> storeReleaseExclusiveRegister{regN=regN, regT=regT, regS=regS}) | toAssembler(MemBarrier :: rest, code) = toAssembler(rest, code <::> dmbIsh) | toAssembler(AtomicExtension{ regT, regN, regS, atOp} :: rest, code) = let val instr = case atOp of LoadAddAL => loadAddAL | LoadUmaxAL => loadUMaxAL | SwapAL => swapAL | LoadAddAcquire => loadAddA | LoadUMaxAcquire => loadUMaxA | SwapRelease => swapL in toAssembler(rest, code <::> instr{regT=regT, regN=regN, regS=regS}) end | toAssembler(LoadRegPair{ regT1, regT2, regN, unitOffset, loadType, unscaledType} :: rest, code) = let val _ = regT1 <> regT2 orelse raise InternalError "LoadRegPair: same register" val instr = case (loadType, unscaledType) of (Load64, NoUpdate) => loadPairOffset | (Load64, PreIndex) => loadPairPreIndexed | (Load64, PostIndex) => loadPairPostIndexed | (Load32, NoUpdate) => loadPairOffset32 | (Load32, PreIndex) => loadPairPreIndexed32 | (Load32, PostIndex) => loadPairPostIndexed32 | _ => raise InternalError "LoadRegPair: unimplemented" in toAssembler(rest, code <::> instr{regT1=regT1, regT2=regT2, regN=regN, unitOffset=unitOffset}) end | toAssembler(StoreRegPair{ regT1, regT2, regN, unitOffset, loadType, unscaledType} :: rest, code) = let val instr = case (loadType, unscaledType) of (Load64, NoUpdate) => storePairOffset | (Load64, PreIndex) => storePairPreIndexed | (Load64, PostIndex) => 
storePairPostIndexed | (Load32, NoUpdate) => storePairOffset32 | (Load32, PreIndex) => storePairPreIndexed32 | (Load32, PostIndex) => storePairPostIndexed32 | _ => raise InternalError "StoreRegPair: unimplemented" in toAssembler(rest, code <::> instr{regT1=regT1, regT2=regT2, regN=regN, unitOffset=unitOffset}) end | toAssembler(LoadFPRegPair{ regT1, regT2, regN, unitOffset, floatSize, unscaledType} :: rest, code) = let val _ = regT1 <> regT2 orelse raise InternalError "LoadRegPair: same register" val instr = case (floatSize, unscaledType) of (Double64, NoUpdate) => loadPairOffsetDouble | (Double64, PreIndex) => loadPairPreIndexedDouble | (Double64, PostIndex) => loadPairPostIndexedDouble | (Float32, NoUpdate) => loadPairOffsetFloat | (Float32, PreIndex) => loadPairPreIndexedFloat | (Float32, PostIndex) => loadPairPostIndexedFloat in toAssembler(rest, code <::> instr{regT1=regT1, regT2=regT2, regN=regN, unitOffset=unitOffset}) end | toAssembler(StoreFPRegPair{ regT1, regT2, regN, unitOffset, floatSize, unscaledType} :: rest, code) = let val instr = case (floatSize, unscaledType) of (Double64, NoUpdate) => storePairOffsetDouble | (Double64, PreIndex) => storePairPreIndexedDouble | (Double64, PostIndex) => storePairPostIndexedDouble | (Float32, NoUpdate) => storePairOffsetFloat | (Float32, PreIndex) => storePairPreIndexedFloat | (Float32, PostIndex) => storePairPostIndexedFloat in toAssembler(rest, code <::> instr{regT1=regT1, regT2=regT2, regN=regN, unitOffset=unitOffset}) end | toAssembler(ConditionalSet{regD, regTrue, regFalse, cond, condSet, opSize} :: rest, code) = let val instr = case (condSet, opSize) of (CondSet, OpSize64) => conditionalSet | (CondSetIncr, OpSize64) => conditionalSetIncrement | (CondSetInvert, OpSize64) => conditionalSetInverted | (CondSetNegate, OpSize64) => conditionalSetNegated | (CondSet, OpSize32) => conditionalSet32 | (CondSetIncr, OpSize32) => conditionalSetIncrement32 | (CondSetInvert, OpSize32) => conditionalSetInverted32 | (CondSetNegate, OpSize32) => conditionalSetNegated32 in toAssembler(rest, code <::> instr{regD=regD, regTrue=regTrue, regFalse=regFalse, cond=cond}) end | toAssembler(BitField{immr, imms, regN, regD, opSize, bitfieldKind} :: rest, code) = let val bfInstr = case (bitfieldKind, opSize) of (BFSigned, OpSize64) => signedBitfieldMove64 | (BFUnsigned, OpSize64) => unsignedBitfieldMove64 | (BFInsert, OpSize64) => bitfieldMove64 | (BFSigned, OpSize32) => signedBitfieldMove32 | (BFUnsigned, OpSize32) => unsignedBitfieldMove32 | (BFInsert, OpSize32) => bitfieldMove32 in toAssembler(rest, code <::> bfInstr{immr=immr, imms=imms, regN=regN, regD=regD}) end | toAssembler(ShiftRegisterVariable{regM, regN, regD, opSize, shiftDirection} :: rest, code) = let val instr = case (shiftDirection, opSize) of (ShiftLeft, OpSize64) => logicalShiftLeftVariable | (ShiftLeft, OpSize32) => logicalShiftLeftVariable32 | (ShiftRightLogical, OpSize64) => logicalShiftRightVariable | (ShiftRightLogical, OpSize32) => logicalShiftRightVariable32 | (ShiftRightArithmetic, OpSize64) => arithmeticShiftRightVariable | (ShiftRightArithmetic, OpSize32) => arithmeticShiftRightVariable32 in toAssembler(rest, code <::> instr{regN=regN, regM=regM, regD=regD}) end | toAssembler(BitwiseLogical{ bits, regN, regD, opSize, setFlags, logOp} :: rest, code) = let val instr = case (logOp, setFlags, opSize) of (LogAnd, false, OpSize64) => bitwiseAndImmediate | (LogAnd, true, OpSize64) => bitwiseAndSImmediate | (LogOr, false, OpSize64) => bitwiseOrImmediate | (LogXor, false, OpSize64) => 
bitwiseXorImmediate | (LogAnd, false, OpSize32) => bitwiseAndImmediate32 | (LogAnd, true, OpSize32) => bitwiseAndSImmediate32 | (LogOr, false, OpSize32) => bitwiseOrImmediate32 | (LogXor, false, OpSize32) => bitwiseXorImmediate32 | _ => raise InternalError "flags not valid with OR or XOR" in toAssembler(rest, code <::> instr{regN=regN, regD=regD, bits=bits}) end - | toAssembler(MoveFPToFP{ regN, regD, floatSize=Float32} :: rest, code) = - toAssembler(rest, code <::> moveFloatToFloat{regN=regN, regD=regD}) - | toAssembler(MoveFPToFP{ regN, regD, floatSize=Double64} :: rest, code) = - toAssembler(rest, code <::> moveDoubleToDouble{regN=regN, regD=regD}) - | toAssembler(MoveGeneralToFP{ regN, regD, floatSize=Float32} :: rest, code) = toAssembler(rest, code <::> moveGeneralToFloat{regN=regN, regD=regD}) | toAssembler(MoveGeneralToFP{ regN, regD, floatSize=Double64} :: rest, code) = toAssembler(rest, code <::> moveGeneralToDouble{regN=regN, regD=regD}) | toAssembler(MoveFPToGeneral{ regN, regD, floatSize=Float32} :: rest, code) = toAssembler(rest, code <::> moveFloatToGeneral{regN=regN, regD=regD}) | toAssembler(MoveFPToGeneral{ regN, regD, floatSize=Double64} :: rest, code) = toAssembler(rest, code <::> moveDoubleToGeneral{regN=regN, regD=regD}) | toAssembler(CvtIntToFP{ regN, regD, floatSize, opSize} :: rest, code) = let val instr = case (opSize, floatSize) of (OpSize32, Float32) => convertInt32ToFloat | (OpSize64, Float32) => convertIntToFloat | (OpSize32, Double64) => convertInt32ToDouble | (OpSize64, Double64) => convertIntToDouble in toAssembler(rest, code <::> instr{regN=regN, regD=regD}) end | toAssembler(CvtFloatToInt{ round, regN, regD, floatSize, opSize} :: rest, code) = let val instr = case (floatSize, opSize) of (Float32, OpSize32) => convertFloatToInt32 | (Float32, OpSize64) => convertFloatToInt | (Double64, OpSize32) => convertDoubleToInt32 | (Double64, OpSize64) => convertDoubleToInt in toAssembler(rest, code <::> instr round {regN=regN, regD=regD}) end | toAssembler(FPBinaryOp{ regM, regN, regD, floatSize, fpOp} :: rest, code) = let val instr = case (fpOp, floatSize) of (MultiplyFP, Float32) => multiplyFloat | (DivideFP, Float32) => divideFloat | (AddFP, Float32) => addFloat | (SubtractFP, Float32) => subtractFloat | (MultiplyFP, Double64) => multiplyDouble | (DivideFP, Double64) => divideDouble | (AddFP, Double64) => addDouble | (SubtractFP, Double64) => subtractDouble in toAssembler(rest, code <::> instr {regN=regN, regM=regM, regD=regD}) end | toAssembler(FPComparison{ regM, regN, floatSize} :: rest, code) = toAssembler(rest, code <::> (case floatSize of Float32 => compareFloat | Double64 => compareDouble){regN=regN, regM=regM}) | toAssembler(FPUnaryOp{ regN, regD, fpOp} :: rest, code) = let val instr = case fpOp of NegFloat => negFloat | NegDouble => negDouble | AbsFloat => absFloat | AbsDouble => absDouble | ConvFloatToDble => convertFloatToDouble | ConvDbleToFloat => convertDoubleToFloat + | MoveDouble => moveDoubleToDouble + | MoveFloat => moveFloatToFloat in toAssembler(rest, code <::> instr {regN=regN, regD=regD}) end | toAssembler(SetLabel(Label lab) :: rest, code) = toAssembler(rest, code <::> setLabel(getLabelTarget lab)) | toAssembler(ConditionalBranch(cond, Label lab) :: rest, code) = toAssembler(rest, code <::> conditionalBranch(cond, getLabelTarget lab)) | toAssembler(UnconditionalBranch(Label lab) :: rest, code) = toAssembler(rest, code <::> unconditionalBranch(getLabelTarget lab)) | toAssembler(BranchAndLink(Label lab) :: rest, code) = toAssembler(rest, code 
<::> branchAndLink(getLabelTarget lab)) | toAssembler(BranchReg{regD, brRegType=BRRBranch} :: rest, code) = toAssembler(rest, code <::> branchRegister regD) | toAssembler(BranchReg{regD, brRegType=BRRAndLink} :: rest, code) = toAssembler(rest, code <::> branchAndLinkReg regD) | toAssembler(BranchReg{regD, brRegType=BRRReturn} :: rest, code) = toAssembler(rest, code <::> returnRegister regD) | toAssembler(LoadLabelAddress(reg, Label lab) :: rest, code) = toAssembler(rest, code <::> loadLabelAddress(reg, getLabelTarget lab)) | toAssembler(TestBitBranch{ test, bit, label=Label lab, onZero } :: rest, code) = toAssembler(rest, code <::> (if onZero then testBitBranchZero else testBitBranchNonZero)(test, bit, getLabelTarget lab)) | toAssembler(CompareBranch{ test, label=Label lab, onZero, opSize } :: rest, code) = let val instr = case (onZero, opSize) of (true, OpSize64) => compareBranchZero | (false, OpSize64) => compareBranchNonZero | (true, OpSize32) => compareBranchZero32 | (false, OpSize32) => compareBranchNonZero32 in toAssembler(rest, code <::> instr(test, getLabelTarget lab)) end (* Register-register moves - special case for XSP. *) | toAssembler(MoveXRegToXReg{sReg=XSP, dReg} :: rest, code) = toAssembler(rest, code <::> addImmediate{regN=XSP, regD=dReg, immed=0w0, shifted=false}) | toAssembler(MoveXRegToXReg{sReg, dReg=XSP} :: rest, code) = toAssembler(rest, code <::> addImmediate{regN=sReg, regD=XSP, immed=0w0, shifted=false}) | toAssembler(MoveXRegToXReg{sReg, dReg} :: rest, code) = toAssembler(rest, code <::> orrShiftedReg{regN=XZero, regM=sReg, regD=dReg, shift=ShiftNone}) | toAssembler(LoadNonAddr(xReg, value) :: rest, code) = let (* Load a non-address constant. Tries to use movz/movn/movk if that can be done easily, othewise uses loadNonAddressConstant to load the value from the non-address constant area. *) fun extW (v, h) = Word.andb(Word.fromLarge(LargeWord.>>(Word64.toLarge v, h*0w16)), 0wxffff) val hw0 = extW(value, 0w3) and hw1 = extW(value, 0w2) and hw2 = extW(value, 0w1) and hw3 = extW(value, 0w0) val nextCode = if value < 0wx100000000 then let (* 32-bit constants can be loaded using at most a movz and movk but various cases can be reduced since all 32-bit operations set the top word to zero. *) val hi = hw2 and lo = hw3 in (* 32-bit constants can be loaded with at most a movz and a movk but it may be that there is something shorter. 
*) if hi = 0w0 then code <::> moveZero32{regD=xReg, immediate=lo, shift=0w0} else if hi = 0wxffff then code <::> moveNot32{regD=xReg, immediate=Word.xorb(0wxffff, lo), shift=0w0} else if lo = 0w0 then code <::> moveZero32{regD=xReg, immediate=hi, shift=0w16} else if isEncodableBitPattern(value, WordSize32) then code <::> bitwiseOrImmediate32{bits=value, regN=XZero, regD=xReg} else (* Have to use two instructions *) code <::> moveZero32{regD=xReg, immediate=lo, shift=0w0} <::> moveKeep{regD=xReg, immediate=hi, shift=0w16} end else if isEncodableBitPattern(value, WordSize64) then code <::> bitwiseOrImmediate{bits=value, regN=XZero, regD=xReg} else if hw0 = 0wxffff andalso hw1 = 0wxffff andalso hw2 = 0wxffff then code <::> moveNot{regD=xReg, immediate=Word.xorb(0wxffff, hw3), shift=0w0} else if hw1 = 0w0 andalso hw2 = 0w0 then (* This is common for length words with a flags byte *) code <::> moveZero32{regD=xReg, immediate=hw3, shift=0w0} <::> moveKeep{regD=xReg, immediate=hw0, shift=0w48} else code <::> loadNonAddressConstant(xReg, value) in toAssembler(rest, nextCode) end | toAssembler(LoadFPConst{dest, value, floatSize=Float32, work} :: rest, code) = toAssembler(rest, loadFloatConstant(dest, value, work)::code) | toAssembler(LoadFPConst{dest, value, floatSize=Double64, work} :: rest, code) = toAssembler(rest, loadDoubleConstant(dest, value, work)::code) | toAssembler(LoadAddr(dReg, source) :: rest, code) = toAssembler(rest, loadAddressConstant(dReg, source) :: code) | toAssembler(RTSTrap{ rtsEntry, work, save } :: rest, code) = let (* Because X30 is used in the branchAndLink it has to be pushed across any trap. *) val saveX30 = List.exists (fn r => r = X30) save val preserve = List.filter (fn r => r <> X30) save in toAssembler(rest, code <@> (if saveX30 then [storeRegPreIndex{regT=X30, regN=X_MLStackPtr, byteOffset= ~8}] else []) <::> loadRegScaled{regT=work, regN=X_MLAssemblyInt, unitOffset=rtsEntry} <::> branchAndLinkReg work <::> registerMask preserve <@> (if saveX30 then [loadRegPostIndex{regT=X30, regN=X_MLStackPtr, byteOffset= 8}] else []) ) end | toAssembler(AllocateMemoryFixedSize{ bytes, dest, save, work } :: rest, code) = let val label = Arm64Assembly.createLabel() val saveX30 = List.exists (fn r => r = X30) save val preserve = List.filter (fn r => r <> X30) save val allocCode = code <@> (* Subtract the number of bytes required from the heap pointer. *) (if bytes >= 0w4096 then [subShiftedReg{regM=work, regN=X_MLHeapAllocPtr, regD=dest, shift=ShiftNone}, loadNonAddressConstant(work, Word.toLarge bytes)] else [subImmediate{regN=X_MLHeapAllocPtr, regD=dest, immed=bytes, shifted=false}]) <::> (* Compare the result with the heap limit. *) subSShiftedReg{regM=X_MLHeapLimit, regN=dest, regD=XZero, shift=ShiftNone} <::> conditionalBranch(CondCarrySet, label) <@> (if saveX30 then [storeRegPreIndex{regT=X30, regN=X_MLStackPtr, byteOffset= ~8}] else []) <::> loadRegScaled{regT=work, regN=X_MLAssemblyInt, unitOffset=heapOverflowCallOffset} <::> branchAndLinkReg work <::> registerMask preserve <@> (if saveX30 then [loadRegPostIndex{regT=X30, regN=X_MLStackPtr, byteOffset= 8}] else []) <::> setLabel label <::> (* Update the heap pointer. 
*) orrShiftedReg{regN=XZero, regM=dest, regD=X_MLHeapAllocPtr, shift=ShiftNone} in toAssembler(rest, allocCode) end | toAssembler(AllocateMemoryVariableSize{ sizeReg, dest, save, work } :: rest, code) = let val trapLabel = Arm64Assembly.createLabel() and noTrapLabel = Arm64Assembly.createLabel() val saveX30 = List.exists (fn r => r = X30) save val preserve = List.filter (fn r => r <> X30) save val allocCode = ( (* Subtract the size into the result register. Subtract a further word for the length word and round down in 32-in-64. *) if is32in64 then code <::> subShiftedReg{regM=sizeReg, regN=X_MLHeapAllocPtr, regD=dest, shift=ShiftLSL 0w2} <::> subImmediate{regN=dest, regD=dest, immed=0w4, shifted=false} <::> bitwiseAndImmediate{bits= ~ 0w8, regN=dest, regD=dest} else code <::> subShiftedReg{regM=sizeReg, regN=X_MLHeapAllocPtr, regD=dest, shift=ShiftLSL 0w3} <::> subImmediate{regN=dest, regD=dest, immed=0w8, shifted=false} ) <::> (* Check against the limit. If the size is large enough it is possible that this could wrap round. To check for that we trap if either the result is less than the limit or if it is now greater than the allocation pointer. *) subSShiftedReg{regM=X_MLHeapLimit, regN=dest, regD=XZero, shift=ShiftNone} <::> conditionalBranch(CondCarryClear, trapLabel) <::> subSShiftedReg{regM=X_MLHeapAllocPtr, regN=dest, regD=XZero, shift=ShiftNone} <::> conditionalBranch(CondCarryClear, noTrapLabel) <::> setLabel trapLabel <@> (if saveX30 then [storeRegPreIndex{regT=X30, regN=X_MLStackPtr, byteOffset= ~8}] else []) <::> loadRegScaled{regT=work, regN=X_MLAssemblyInt, unitOffset=heapOverflowCallOffset} <::> branchAndLinkReg work <::> registerMask preserve <@> (if saveX30 then [loadRegPostIndex{regT=X30, regN=X_MLStackPtr, byteOffset= 8}] else []) <::> setLabel noTrapLabel <::> (* Update the heap pointer. *) orrShiftedReg{regN=XZero, regM=dest, regD=X_MLHeapAllocPtr, shift=ShiftNone} in toAssembler(rest, allocCode) end | toAssembler(BranchTable{ startLabel=Label lab, brTable } :: rest, code) = toAssembler(rest, List.foldl (fn (Label lab, code) => (unconditionalBranch(getLabelTarget lab)) :: code) (code <::> setLabel(getLabelTarget lab)) brTable) | toAssembler(LoadGlobalHeapBaseInCallback dest :: rest, code) = toAssembler(rest, code <@> List.rev(loadGlobalHeapBaseInCallback dest)) | toAssembler(Yield :: rest, code) = toAssembler(rest, code <::> yield) (* Optimisation passes. *) fun isValidForPair(offset1, offset2) = let val v = Int.min(offset1, offset2) in v >= ~64 andalso v < 64 end fun forward([], list, rep) = reverse(list, [], rep) | forward(SetLabel(Label srcLab) :: (ubr as UnconditionalBranch(Label destLab)) :: tl, list, _) = if srcLab = destLab (* We should never get this because there should always be a stack-check to allow a loop to be broken. If that ever changes we need to retain the label. *) then raise InternalError "Infinite loop detected" else (* Mark this to forward to its destination. *) ( Array.update(labelTargets, srcLab, getLabel destLab); forward(ubr :: tl, list, true) ) | forward(SetLabel(Label jmpLab1) :: (tl as SetLabel(Label jmpLab2) :: _), list, _) = (* Eliminate adjacent labels. They complicate the other tests although they don't incur any run-time cost. *) ( (* Any reference to the first label is forwarded to the second. 
*) Array.update(labelTargets, jmpLab1, getLabel jmpLab2); forward(tl, list, true) ) | forward((ubr as UnconditionalBranch(Label ubrLab)) :: (tl as SetLabel(Label jumpLab) :: _), list, rep) = (* Eliminate unconditional jumps to the next instruction. *) if ubrLab = jumpLab then forward(tl, list, true) else forward(tl, ubr :: list, rep) | forward((cbr as ConditionalBranch(test, Label cbrLab)) :: (ubr as UnconditionalBranch(Label ubrLab)) :: (tl as SetLabel(Label jumpLab) :: _), list, rep) = if cbrLab = jumpLab then (* We have a conditional branch followed by an unconditional branch followed by the destination of the conditional branch. Eliminate the unconditional branch by reversing the test. This can often happen if one branch of an if-then-else has been reduced to zero because the same register has been chosen for the input and output. *) forward(tl (* Leave the label just in case it's used elsewhere*), ConditionalBranch(invertTest test, Label ubrLab) :: list, true) else forward(ubr :: tl, cbr :: list, rep) | forward((load as LoadRegScaled{regT=regT1, regN=regN1, unitOffset=offset1, loadType=lt1}) :: (tl1 as LoadRegScaled{regT=regT2, regN=regN2, unitOffset=offset2, loadType=lt2} ::tl2), list, rep) = (* Two adjacent loads - can this be converted to load-pair? N.B. We have to be careful about the sequence ldr x0,[x0]; ldr x1,[x0+8] which isn't the same at all. *) if regN1 = regN2 andalso regN1 <> regT1 andalso lt1 = lt2 andalso (offset2 = offset1 + 1 orelse offset2 = offset1 - 1) andalso (case lt1 of Load64 => true | Load32 => true | _ => false) andalso isValidForPair(offset1, offset2) then let val (reg1, reg2, offset) = if offset1 < offset2 then (regT1, regT2, offset1) else (regT2, regT1, offset2) in forward(tl2, LoadRegPair{ regT1=reg1, regT2=reg2, regN=regN1, unitOffset=offset, loadType=lt1, unscaledType=NoUpdate} :: list, true) end else forward(tl1, load :: list, rep) | forward((store as StoreRegScaled{regT=regT1, regN=regN1, unitOffset=offset1, loadType=lt1}) :: (tl1 as StoreRegScaled{regT=regT2, regN=regN2, unitOffset=offset2, loadType=lt2} ::tl2), list, rep) = (* Two adjacent stores - can this be converted to store-pair? *) if regN1 = regN2 andalso lt1 = lt2 andalso (offset2 = offset1 + 1 orelse offset2 = offset1 - 1) andalso (case lt1 of Load64 => true | Load32 => true | _ => false) andalso isValidForPair(offset1, offset2) then let val (reg1, reg2, offset) = if offset1 < offset2 then (regT1, regT2, offset1) else (regT2, regT1, offset2) in forward(tl2, StoreRegPair{ regT1=reg1, regT2=reg2, regN=regN1, unitOffset=offset, loadType=lt1, unscaledType=NoUpdate} :: list, true) end else forward(tl1, store :: list, rep) | forward((store as StoreRegUnscaled{regT=regT1, regN=regN1, byteOffset= ~8, loadType=Load64, unscaledType=NoUpdate}) :: (tl1 as StoreRegScaled{regT=regT2, regN=regN2, unitOffset=0, loadType=Load64} ::tl2), list, rep) = (* Common case - store length word and then the first word of the cell. *) if regN1 = regN2 then forward(tl2, StoreRegPair{ regT1=regT1, regT2=regT2, regN=regN1, unitOffset= ~1, loadType=Load64, unscaledType=NoUpdate} :: list, true) else forward(tl1, store :: list, rep) | forward((store as StoreRegUnscaled{regT=regT1, regN=regN1, byteOffset= ~4, loadType=Load32, unscaledType=NoUpdate}) :: (tl1 as StoreRegScaled{regT=regT2, regN=regN2, unitOffset=0, loadType=Load32} ::tl2), list, rep) = (* Common case - store length word and then the first word of the cell. 
*) if regN1 = regN2 then forward(tl2, StoreRegPair{ regT1=regT1, regT2=regT2, regN=regN1, unitOffset= ~1, loadType=Load32, unscaledType=NoUpdate} :: list, true) else forward(tl1, store :: list, rep) | forward((store as StoreRegUnscaled{regT=regT1, regN=regN1, byteOffset= ~8, loadType=Load64, unscaledType=PreIndex}) :: (tl1 as StoreRegUnscaled{regT=regT2, regN=regN2, byteOffset= ~8, loadType=Load64, unscaledType=PreIndex} :: tl2), list, rep) = (* Adjacent pushes T2 is in the lower address so the order is T2, T1. The stack is always 64-bit aligned so this works on both native addressing and 32-in-64. *) if regN1 = regN2 then forward(tl2, StoreRegPair{ regT1=regT2, regT2=regT1, regN=regN1, unitOffset= ~2, loadType=Load64, unscaledType=PreIndex} :: list, true) else forward(tl1, store :: list, rep) | forward((add1 as AddImmediate{regN=regN1, regD=regD1, immed=immed1, shifted=false, opSize=OpSize64, setFlags=false}) :: (tl1 as AddImmediate{regN=regN2, regD=regD2, immed=immed2, shifted=false, opSize=OpSize64, setFlags=false} ::tl2), list, rep) = (* Adjacent stack resets. This can apply more generally but only if the result registers are the same. If they're not we may need the intermediate result. We put the result back into the input stream in case it can be combined with another stack reset. *) if regN2 = regD2 andalso regD1 = regD2 andalso immed2+immed1 < 0w4096 then forward(AddImmediate{regN=regN1, regD=regD2, immed=immed2+immed1, shifted=false, opSize=OpSize64, setFlags=false} :: tl2, list, true) else forward(tl1, add1 :: list, rep) | forward(BitwiseLogical{bits=0w1, regN, regD=XZero, logOp=LogAnd, opSize=_, setFlags=true} :: ConditionalBranch(CondEqual, label) :: tl2, list, _) = (* Test the tag bit: bit 0. This is very common to test for nil/not nil. We could include other values but they're far less likely. *) forward(TestBitBranch{test=regN, bit=0w0, label=label, onZero=true} :: tl2, list, true) | forward(BitwiseLogical{bits=0w1, regN, regD=XZero, logOp=LogAnd, opSize=_, setFlags=true} :: ConditionalBranch(CondNotEqual, label) :: tl2, list, _) = forward(TestBitBranch{test=regN, bit=0w0, label=label, onZero=false} :: tl2, list, true) | forward(hd :: tl, list, rep) = forward(tl, hd :: list, rep) and reverse([], list, rep) = (list, rep) | reverse((add as AddImmediate{regN=regN2, regD=regD2, immed, shifted=false, opSize=OpSize64, setFlags=false}) :: (tl1 as LoadRegScaled{regT=regT1, regN=regN1, unitOffset=0, loadType=Load64} ::tl2), list, rep) = (* A stack reset occurring after a load. This is usually the ML SP but can also occur with C memory ops. It might be possible to consider other cases. 
*) if regN1 = regD2 andalso regN2 = regD2 andalso regT1 <> regN1 andalso immed < 0w256 then reverse(tl2, LoadRegUnscaled{regT=regT1, regN=regN1, byteOffset=Word.toInt immed, loadType=Load64, unscaledType=PostIndex} :: list, true) else reverse(tl1, add :: list, rep) | reverse((add as AddImmediate{regN=regN2, regD=regD2, immed, shifted=false, opSize=OpSize64, setFlags=false}) :: (tl1 as LoadRegPair{regT1=regT1, regT2=regT2, regN=regN1, unitOffset=0, loadType=Load64, unscaledType=NoUpdate} ::tl2), list, rep) = (* A stack reset occurring after a load pair *) if regN1 = regD2 andalso regN2 = regD2 andalso regT1 <> regN1 andalso regT2 <> regN1 andalso immed < 0w64 * 0w8 then reverse(tl2, LoadRegPair{regT1=regT1, regT2=regT2, regN=regN1, unitOffset=Word.toInt(immed div 0w8), loadType=Load64, unscaledType=PostIndex} :: list, true) else reverse(tl1, add :: list, rep) | reverse(hd :: tl, list, rep) = reverse(tl, hd :: list, rep) (* Repeat scans through the code until there are no further changes. *) fun repeat ops = case forward(ops, [], false) of (list, false) => list | (list, true) => repeat list val optimised = repeat instrs in generateCode{instrs=List.rev(toAssembler(optimised, [])), name=name, parameters=parameters, resultClosure=resultClosure, profileObject=profileObject} end (* Constant shifts are encoded in the immr and imms fields of the bit-field instruction. *) fun shiftConstant{ direction, regD, regN, shift, opSize } = let val (bitfieldKind, immr, imms) = case (direction, opSize) of (ShiftLeft, OpSize64) => (BFUnsigned, Word.~ shift mod 0w64, 0w64-0w1-shift) | (ShiftLeft, OpSize32) => (BFUnsigned, Word.~ shift mod 0w32, 0w32-0w1-shift) | (ShiftRightLogical, OpSize64) => (BFUnsigned, shift, 0wx3f) | (ShiftRightLogical, OpSize32) => (BFUnsigned, shift, 0wx1f) | (ShiftRightArithmetic, OpSize64) => (BFSigned, shift, 0wx3f) | (ShiftRightArithmetic, OpSize32) => (BFSigned, shift, 0wx1f) in BitField{ regN=regN, regD=regD, opSize=opSize, immr=immr, imms=imms, bitfieldKind=bitfieldKind } end (* These sequences are used both in the ML code-generator and in the FFI code so it is convenient to have them here and share the code. *) local fun allocateWords(fixedReg, workReg, words, bytes, regMask, code) = let val (lengthWord, setLength, flagShift) = if is32in64 then (~4, Load32, 0w24) else (~8, Load64, 0w56) in code <::> AllocateMemoryFixedSize{ bytes=bytes, dest=fixedReg, save=regMask, work=X16 } <::> LoadNonAddr(workReg, Word64.orb(words, Word64.<<(Word64.fromLarge(Word8.toLarge Address.F_bytes), flagShift))) <::> (* Store the length word. Have to use the unaligned version because offset is -ve. 
*) StoreRegUnscaled{regT=workReg, regN=fixedReg, byteOffset= lengthWord, loadType=setLength, unscaledType=NoUpdate} end fun absoluteAddressToIndex(reg, code) = if is32in64 then code <::> SubShiftedReg{regM=X_Base32in64, regN=reg, regD=reg, shift=ShiftNone, opSize=OpSize64, setFlags=false} <::> shiftConstant{direction=ShiftRightLogical, regN=reg, regD=reg, shift=0w2, opSize=OpSize64} else code in fun boxDouble({source, destination, workReg, saveRegs}, code) = absoluteAddressToIndex(destination, allocateWords(destination, workReg, if is32in64 then 0w2 else 0w1, 0w16, saveRegs, code) <::> StoreFPRegScaled{regT=source, regN=destination, unitOffset=0, floatSize=Double64}) and boxSysWord({source, destination, workReg, saveRegs}, code) = absoluteAddressToIndex(destination, allocateWords(destination, workReg, if is32in64 then 0w2 else 0w1, 0w16, saveRegs, code) <::> StoreRegScaled{regT=source, regN=destination, unitOffset=0, loadType=Load64}) and boxFloat({source, destination, workReg, saveRegs}, code) = absoluteAddressToIndex(destination, allocateWords(destination, workReg, 0w1, 0w8, saveRegs, code) <::> StoreFPRegScaled{regT=source, regN=destination, unitOffset=0, floatSize=Float32}) end structure Sharing = struct type closureRef = closureRef type loadType = loadType type opSize = opSize type logicalOp = logicalOp type floatSize = floatSize type shiftDirection = shiftDirection type multKind = multKind type fpUnary = fpUnary type fpBinary = fpBinary type unscaledType = unscaledType type condSet = condSet type bitfieldKind = bitfieldKind type brRegType = brRegType type precode = precode type xReg = xReg type vReg = vReg type label = label type labelMaker = labelMaker type condition = condition type shiftType = shiftType type wordSize = wordSize type 'a extend = 'a extend type scale = scale type instr = instr type atomicOp = atomicOp end end;
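As an aside on the shiftConstant function above: the following is a minimal, standalone sketch (the names here are hypothetical and not part of the patch) of the immr/imms values it produces, assuming the usual AArch64 aliasing of constant shifts onto the UBFM/SBFM bit-field instructions. It can be loaded on its own in a Poly/ML top level to sanity-check particular shift encodings.

(* LSL #s on an n-bit value is UBFM with immr = (~s) mod n and imms = n-1-s;
   LSR #s is UBFM with immr = s and imms = n-1; ASR #s is the SBFM equivalent. *)
datatype exampleShiftDir = ExLeft | ExRightLogical | ExRightArithmetic

fun exampleShiftEncoding (ExLeft, shift: word, size: word) =
        ("UBFM", Word.~ shift mod size, size - 0w1 - shift)
|   exampleShiftEncoding (ExRightLogical, shift, size) = ("UBFM", shift, size - 0w1)
|   exampleShiftEncoding (ExRightArithmetic, shift, size) = ("SBFM", shift, size - 0w1)

(* For example, a 64-bit LSL #3 encodes as immr=61, imms=60,
   and a 32-bit ASR #5 as immr=5, imms=31, matching the cases in shiftConstant. *)
val lsl3in64 = exampleShiftEncoding (ExLeft, 0w3, 0w64)            (* ("UBFM", 0w61, 0w60) *)
val asr5in32 = exampleShiftEncoding (ExRightArithmetic, 0w5, 0w32) (* ("SBFM", 0w5, 0w31) *)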