diff --git a/basis/BoolArray.sml b/basis/BoolArray.sml index f8604c02..d34f7da1 100644 --- a/basis/BoolArray.sml +++ b/basis/BoolArray.sml @@ -1,517 +1,516 @@ (* Title: Standard Basis Library: BoolArray and BoolVector Structures Author: David Matthews - Copyright David Matthews 1999, 2005, 2016 + Copyright David Matthews 1999, 2005, 2016, 2022 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) local open LibrarySupport (* TODO: Use a single word for vectors of size <= number of bits in a word. *) (* We use int here for the length rather than word because the number of bits could be more than the maximum value of Word.word. *) datatype vector = Vector of int * Bootstrap.byteVector (* This has a byte-wise equality. *) and array = Array of int * Bootstrap.byteArray (* This has pointer equality. *) val wordSize : word = LibrarySupport.wordSize (* Casts between int and word. *) val intAsWord: int -> word = RunCall.unsafeCast and wordAsInt: word -> int = RunCall.unsafeCast val bitsPerWord = wordSize * 0w8 (* Limit the size to Array.maxLen to avoid arithmetic overflow. *) val maxLen = Array.maxLen local val F_mutable_bytes = 0wx41 in (* Allocate memory for a vector or an array. *) fun alloc (bits: int) = let val words : word = if bits < 0 orelse bits > maxLen then raise General.Size else (Word.fromInt bits + (bitsPerWord - 0w1)) div bitsPerWord val vec = RunCall.allocateByteMemory(words, F_mutable_bytes) val bytes = words * wordSize fun fill n = if n = bytes then () else (RunCall.storeByte(vec, n, 0w0); fill(n+0w1)) (* We will only set the bits that we actually use. Unused bytes will be uninitialised. The equality function we're using tests all the bytes so we need to initialise them. *) in if bytes = 0w0 then () else fill(bytes - wordSize); vec end end val andb = Word.andb and orb = Word.orb and notb = Word.notb and << = Word.<< and >> = Word.>>; infix 9 sub infix 7 andb infix 6 orb infix 5 << >> (* Create a vector/array from a list. Used as the basis of Array.fromList and Vector.fromList. *) fun fromList' (l : bool list) = let val length = List.length l (* Make a array initialised to zero. *) val vec = alloc length (* Accumulate the list elements into bytes and store them in the vector. *) fun init (byteno, acc, bit, []) = if bit = 0w1 then () else RunCall.storeByte(vec, byteno, acc) | init (byteno, acc, bit, a :: b) = let val byte = if a then bit orb acc else acc in if bit = 0wx80 then ( RunCall.storeByte(vec, byteno, byte); init(byteno+0w1, 0w0, 0w1, b) ) else init(byteno, byte, bit << 0w1, b) end in init(0w0, 0w0, 0w1, l); (length, vec) end fun tabulate' (length: int, f : int->bool) = let val vec = if length >= 0 then alloc length else raise General.Size; (* Accumulate the bits into bytes and store into the array. 
*) fun init i byteNo bit acc = if i < length then let val byte = if f i then bit orb acc else acc in if bit = 0wx80 then ( RunCall.storeByte(vec, byteNo, byte) ; init (i+1) (byteNo+0w1) 0w1 0w0 ) else init (i+1) byteNo (bit << 0w1) byte end else if acc = 0w0 then () else (* Put in the last byte. *) RunCall.storeByte(vec, byteNo, acc) in init 0 0w0 0w1 0w0; (length, vec) end (* Internal function which subscripts the vector assuming that the index has already been checked for validity. *) fun uncheckedSub (v, i: int): bool = let val iW = Word.fromInt i val byte = RunCall.loadByte(v, iW >> 0w3) val mask = 0w1 << (iW andb 0w7) in byte andb mask <> 0w0 end (* Move a set of bits from one vector of bytes to another. The bits may not be on the same byte alignment. Does not examine the destination so if dest_off is not byte aligned any bits required in the first byte must be passed in as src_in. Returns any bits which do not exactly fit into a byte. *) (* TODO: This only handles the case where the source starts at the beginning of the vector. It is easy to modify it for the case where the source offset is a multiple of 8 but more difficult to handle the other cases. *) fun move_bits(src: Bootstrap.byteVector, dest: Bootstrap.byteVector, dest_off, len, last_bits) = let - val dest_byte = intAsWord(Int.quot(dest_off, 8)) (* Byte offset *) + val dest_byte = intAsWord dest_off div 0w8 (* Byte offset *) val dest_bit = intAsWord dest_off - dest_byte*0w8 (* Bit offset *) fun do_move last byte len : word = if len >= 8 then let (* Get the next byte and shift it up *) val newbyte = last orb (RunCall.loadByteFromImmutable(src, byte) << dest_bit) in (* Store the low-order 8 bits into the destination. *) RunCall.storeByte(dest, dest_byte+byte, newbyte); (* Shift the accumulator down by 8 bits and get ready for the next byte. *) do_move (newbyte >> 0w8) (byte+0w1) (len-8) end else if len <= 0 then last else (* 0 < len < 8 *) let (* Get the next byte and shift it up *) val nextsrc = RunCall.loadByteFromImmutable(src, byte); val newbyte: word = last orb (nextsrc << dest_bit) (* This assumes that any extra bits of the source are zero. *) in if len + Word.toInt dest_bit >= 8 then ( (* Store the low-order 8 bits into the destination. *) RunCall.storeByte(dest, dest_byte+byte, newbyte); (* Shift the accumulator down by 8 bits and get ready for the next byte. *) do_move (newbyte >> 0w8) (byte+0w1) (len-8) ) else newbyte end in (* TODO: If dest_bit is zero we can simply move the bytes. If len is not a multiple of 8 we may have to return the low-order bits. *) do_move last_bits 0w0 len end in structure BoolVector: MONO_VECTOR = struct type vector = vector type elem = bool val maxLen = maxLen fun length(Vector(l, _)) = l fun op sub (Vector(l, v), i: int): bool = if i < 0 orelse i >= l then raise General.Subscript else uncheckedSub(v, i) (* Create a vector from a list. Must lock the vector before returning it. *) fun fromList (l : elem list) : vector = let val (length, vec) = fromList' l in RunCall.clearMutableBit vec; Vector(length, vec) end fun tabulate (length: int, f : int->elem): vector = let val (length, vec) = tabulate' (length, f) in RunCall.clearMutableBit vec; Vector(length, vec) end (* fun map f (Vector(len, vec)) = let val new_vec = alloc len (* Destination vector. 
*) fun mapbyte b i acc max = if i = max then acc else if f ((b andb i) <> 0w0) then mapbyte b (i<<0w1) (acc orb i) max else mapbyte b (i<<0w1) acc max fun copy b l = if l <= 0 then () else let val byte = System_loadb(vec, b) val res = (* Map each byte to get the result. Must not apply the function beyond the last bit. *) if l >= 8 then mapbyte byte 0w1 0w0 0wx100 else mapbyte byte 0w1 0w0 (0w1 << Word.fromInt l) in RunCall.storeByte(new_vec, b, res); copy (b+0w1) (l-8) end in copy 0w0 len; RunCall.clearMutableBit new_vec; Vector(len, new_vec) end*) fun mapi f (Vector(len, vec)) = let val new_vec = alloc len (* Destination vector. *) fun mapbyte b i acc max l = if i = max then acc else if f (len-l, ((b andb i) <> 0w0)) then mapbyte b (i<<0w1) (acc orb i) max (l-1) else mapbyte b (i<<0w1) acc max (l-1) fun copy b l = if l <= 0 then () else let val byte = RunCall.loadByteFromImmutable(vec, b) val res = (* Map each byte to get the result. Must not apply the function beyond the last bit. *) if l >= 8 then mapbyte byte 0w1 0w0 0wx100 l else mapbyte byte 0w1 0w0 (0w1 << Word.fromInt l) l in RunCall.storeByte(new_vec, b, res); copy (b+0w1) (l-8) end in copy 0w0 len; RunCall.clearMutableBit new_vec; Vector(len, new_vec) end (* To save duplicating almost the same code just define map in terms of mapi. *) fun map f v = mapi (fn (_, x) => f x) v (* Return a copy of the vector with a particular entry replaced *) fun update (v as Vector(len, _), i, c) = if i < 0 orelse i >= len then raise Subscript else mapi (fn (j, s) => if j = i then c else s) v fun concat l = let (* Calculate the total length *) fun total [] i = i | total (Vector(len, _)::t) i = total t (i+len) val total_len = total l 0 in let (* Allocate a new vector. *) val new_vec = alloc total_len (* Copy all the source vectors into the destination. *) fun copy_list (Vector(src_len, src_vec)::t) dest_off bits = let - val next = move_bits(src_vec, new_vec, - dest_off, src_len, bits) + val next = move_bits(src_vec, new_vec, dest_off, src_len, bits) in copy_list t (dest_off+src_len) next end | copy_list [] dest_off bits = (* At the end of the lists store any extra in the last byte. *) if bits = 0w0 then () - else RunCall.storeByte(new_vec, intAsWord(Int.quot(dest_off, 8)), bits) + else RunCall.storeByte(new_vec, intAsWord dest_off div 0w8, bits) in copy_list l 0 0w0; RunCall.clearMutableBit new_vec; Vector(total_len, new_vec) end end (* Create the other functions. 
*) structure VectorOps = VectorOperations( struct type vector = vector and elem = elem fun length(Vector(l, _)) = intAsWord l fun unsafeSub (Vector(_, v), i) = uncheckedSub(v, wordAsInt i) fun unsafeSet _ = raise Fail "Should not be called" end); open VectorOps; local (* Install the pretty printer for BoolVector.vector *) fun pretty(depth: FixedInt.int) _ (x: vector) = let open PolyML val last = length x - 1 fun put_elem (index, w, (l, d)) = if d = 0 then ([PrettyString "...]"], d+1) else if d < 0 then ([], d+1) else ( PrettyString(if w then "true" else "false") :: (if index <> last then PrettyString "," :: PrettyBreak(1, 0) :: l else l), d+1 ) in PrettyBlock(3, false, [], PrettyString "fromList[" :: (if depth <= 0 then [PrettyString "...]"] else #1 (foldri put_elem ([PrettyString "]"], depth - FixedInt.fromInt last) x) ) ) end in val () = PolyML.addPrettyPrinter pretty end end structure BoolArray: MONO_ARRAY = struct type array = array type elem = bool type vector = vector val maxLen = maxLen; fun length(Array(l, _)) = l (* Internal function for updating a bit assuming the bounds checks have already been done. *) fun uncheckedUpdate(v, i, new): unit = let val iW = Word.fromInt i val byteOffsetW = iW >> 0w3 val byte = RunCall.loadByte(v, byteOffsetW); val mask = 0w1 << (iW andb 0w7) val newByte = if new then byte orb mask else byte andb (notb mask) in RunCall.storeByte(v, byteOffsetW, newByte) end fun array (len, ini) = let (* Create the uninitialised array. *) val vec = alloc len (* Set the bytes to all zeros or all ones. Generally this will set more bits than we need but that doesn't matter. *) val initByte = if ini then 0wxff else 0wx00 val bytes = (Word.fromInt len + 0w7) >> 0w3 (* TODO: This should be set by a built-in. *) fun setBytes b = if b >= bytes then () else (RunCall.storeByte(vec, b, initByte); setBytes (b+0w1)) val () = setBytes 0w0 in Array(len, vec) end fun op sub (Array(l, v), i: int): elem = if i < 0 orelse i >= l then raise General.Subscript else uncheckedSub(v, i) (* Exported update function. *) fun update (Array (l, v), i: int, new) : unit = if i < 0 orelse i >= l then raise General.Subscript else uncheckedUpdate(v, i, new) (* Create an array from a list. *) fun fromList (l : elem list) : array = Array(fromList' l) fun tabulate (length: int , f : int->elem): array = Array(tabulate'(length, f)) fun vector(Array(len, vec)): vector = (* TODO: We may be able to handle special cases where the source and destination are aligned on the same bit offset. For the moment just take the simple approach. *) BoolVector.tabulate(len, fn j => uncheckedSub(vec, j)) (* Copy one array into another. The arrays could be the same but in that case di must be zero. *) fun copy {src=Array (slen, s), dst=Array (dlen, d), di: int} = if di < 0 orelse di+slen > dlen then raise General.Subscript else (* TODO: Handle multiple bits where possible by using move_bits or a variant. *) let fun copyBits n = if n >= slen then () else (uncheckedUpdate(d, di+n, uncheckedSub(s, n)); copyBits(n+1)) in copyBits 0 end (* fun copy {src as Array (slen, s), dst as Array (dlen, d), di: int} = let in if di < 0 orelse di+slen > dlen then raise General.Subscript else if si < di then (* Moving up - Start from the end *) (* TODO: Handle multiple bits where possible by using move_bits or a variant. *) let fun copyBits n = if n < 0 then () else (uncheckedUpdate(d, di+n, uncheckedSub(s, si+n)); copyBits(n-1)) in copyBits (slen-1) end else (* Moving down. 
*) let fun copyBits n = if n >= slice_len then () else (uncheckedUpdate(d, di+n, uncheckedSub(s, si+n)); copyBits(n+1)) in copyBits 0 end end *) (* Copy a vector into an array. *) fun copyVec {src=Vector(slen, s), dst=Array (dlen, d), di: int} = let fun copyBits n = if n >= slen then () else (uncheckedUpdate(d, di+n, uncheckedSub(s, n)); copyBits(n+1)) in if di < 0 orelse di+slen > dlen then raise General.Subscript else copyBits 0 end (* Create the other functions. *) structure VectorOps = VectorOperations( struct type vector = array and elem = elem fun length(Array(l, _)) = intAsWord l fun unsafeSub (Array(_, v), i) = uncheckedSub(v, wordAsInt i) fun unsafeSet (Array(_, v), i, new) = uncheckedUpdate(v, wordAsInt i, new) end); open VectorOps; local (* Install the pretty printer for BoolArray.array *) (* We may have to do this outside the structure if we have opaque signature matching. *) fun pretty(depth: FixedInt.int) _ (x: array) = let open PolyML val last = length x - 1 fun put_elem (index, w, (l, d)) = if d = 0 then ([PrettyString "...]"], d+1) else if d < 0 then ([], d+1) else ( PrettyString(if w then "true" else "false") :: (if index <> last then PrettyString "," :: PrettyBreak(1, 0) :: l else l), d+1 ) in PrettyBlock(3, false, [], PrettyString "fromList[" :: (if depth <= 0 then [PrettyString "...]"] else #1 (foldri put_elem ([PrettyString "]"], depth - FixedInt.fromInt last) x) ) ) end in val () = PolyML.addPrettyPrinter pretty end end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODE.sig b/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODE.sig index eb3d9c81..592c9a8a 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODE.sig +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODE.sig @@ -1,453 +1,455 @@ (* Signature for the high-level ARM64 code Copyright David C. J. Matthews 2021-2 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) signature ARM64ICODE = sig type machineWord = Address.machineWord type address = Address.address type closureRef (* Registers. *) datatype xReg = XReg of Word8.word | XZero | XSP and vReg = VReg of Word8.word (* It is simpler to use a single type for all registers. *) datatype reg = GenReg of xReg | FPReg of vReg val X0: xReg and X1: xReg and X2: xReg and X3: xReg and X4: xReg and X5: xReg and X6: xReg and X7: xReg and X8: xReg and X9: xReg and X10: xReg and X11: xReg and X12: xReg and X13: xReg and X14: xReg and X15: xReg and X16: xReg and X17: xReg and X18: xReg and X19: xReg and X20: xReg and X21: xReg and X22: xReg and X23: xReg and X24: xReg and X25: xReg and X26: xReg and X27: xReg and X28: xReg and X29: xReg and X30: xReg val V0: vReg and V1: vReg and V2: vReg and V3: vReg and V4: vReg and V5: vReg and V6: vReg and V7: vReg val is32in64: bool and isBigEndian: bool (* Condition for conditional branches etc. 
*) datatype condition = CondEqual (* Z=1 *) | CondNotEqual (* Z=0 *) | CondCarrySet (* C=1 *) | CondCarryClear (* C=0 *) | CondNegative (* N=1 *) | CondPositive (* N=0 imcludes zero *) | CondOverflow (* V=1 *) | CondNoOverflow (* V=0 *) | CondUnsignedHigher (* C=1 && Z=0 *) | CondUnsignedLowOrEq (* ! (C=1 && Z=0) *) | CondSignedGreaterEq (* N=V *) | CondSignedLess (* N<>V *) | CondSignedGreater (* Z==0 && N=V *) | CondSignedLessEq (* !(Z==0 && N=V) *) (* The shift used in arithemtic operations. *) and shiftType = ShiftLSL of Word8.word | ShiftLSR of Word8.word | ShiftASR of Word8.word | ShiftNone datatype preg = PReg of int (* A pseudo-register - an abstract register. *) (* If the value is zero we can use X0/W0. *) datatype pregOrZero = SomeReg of preg | ZeroReg (* A location on the stack. May be more than word if this is a container or a handler entry. *) datatype stackLocn = StackLoc of {size: int, rno: int } (* This combines pregKind and stackLocn. *) datatype regProperty = RegPropGeneral (* A general register. *) | RegPropUntagged (* An untagged general register. *) | RegPropStack of int (* A stack location or container. *) | RegPropCacheTagged | RegPropCacheUntagged | RegPropMultiple (* The result of a conditional or case. May be defined at multiple points. *) (* The reference to a condition code. *) datatype ccRef = CcRef of int datatype loadType = Load64 | Load32 | Load16 | Load8 and opSize = OpSize32 | OpSize64 and logicalOp = LogAnd | LogOr | LogXor and callKind = Recursive | ConstantCode of machineWord | FullCall and floatSize = Float32 | Double64 and shiftDirection = ShiftLeft | ShiftRightLogical | ShiftRightArithmetic and multKind = MultAdd32 | MultSub32 | MultAdd64 | MultSub64 | SignedMultAddLong (* 32bit*32bit + 64bit => 64Bit *) | SignedMultHigh (* High order part of 64bit*64Bit *) - and fpUnary = NegFloat | NegDouble | AbsFloat | AbsDouble | ConvFloatToDble | ConvDbleToFloat + and fpUnary = + NegFloat | NegDouble | AbsFloat | AbsDouble | ConvFloatToDble | + ConvDbleToFloat | MoveDouble | MoveFloat and fpBinary = MultiplyFP | DivideFP | AddFP | SubtractFP (* Some of the atomic operations added in 8.1 *) and atomicOp = LoadAddAL | LoadUmaxAL | SwapAL | LoadAddAcquire | LoadUMaxAcquire | SwapRelease (* Function calls can have an unlimited number of arguments so it isn't always going to be possible to load them into registers. *) datatype 'genReg fnarg = ArgInReg of 'genReg | ArgOnStack of { wordOffset: int, container: stackLocn, field: int } datatype ('genReg, 'optGenReg, 'fpReg) arm64ICode = (* Move the contents of one preg to another. These are always 64-bits. *) MoveRegister of { source: 'genReg, dest: 'genReg } (* Numerical constant. *) | LoadNonAddressConstant of { source: Word64.word, dest: 'genReg } (* Floating point constant *) | LoadFPConstant of { source: Word64.word, dest: 'fpReg, floatSize: floatSize } (* Address constant. *) | LoadAddressConstant of { source: machineWord, dest: 'genReg } (* Load a value into a register using a constant, signed, byte offset. The offset is in the range of -256 to (+4095*unit size). *) | LoadWithConstantOffset of { base: 'genReg, dest: 'genReg, byteOffset: int, loadType: loadType } (* Similarly for FP registers. *) | LoadFPWithConstantOffset of { base: 'genReg, dest: 'fpReg, byteOffset: int, floatSize: floatSize } (* Load a value into a register using an index register. *) | LoadWithIndexedOffset of { base: 'genReg, dest: 'genReg, index: 'genReg, loadType: loadType, signExtendIndex: bool } (* Ditto for FP. 
*) | LoadFPWithIndexedOffset of { base: 'genReg, dest: 'fpReg, index: 'genReg, floatSize: floatSize, signExtendIndex: bool } (* Returns the current thread ID. Always a 64-bit value.. *) | GetThreadId of { dest: 'genReg } (* Convert a 32-in-64 object index into an absolute address. *) | ObjectIndexAddressToAbsolute of { source: 'genReg, dest: 'genReg } (* Convert an absolute address into an object index. *) | AbsoluteToObjectIndex of { source: 'genReg, dest: 'genReg } (* Allocate a fixed sized piece of memory and puts the absolute address into dest. bytesRequired is the total number of bytes including the length word and any alignment necessary for 32-in-64. saveRegs is the list of registers that need to be saved if we need to do a garbage collection. *) | AllocateMemoryFixed of { bytesRequired: Word64.word, dest: 'genReg, saveRegs: 'genReg list } (* Allocate a piece of memory. The size argument is an untagged value containing the number of words i.e. the same value used for InitialiseMemory and to store in the length word. *) | AllocateMemoryVariable of { size: 'genReg, dest: 'genReg, saveRegs: 'genReg list } (* Initialise a piece of memory by writing "size" copies of the value in "init". N.B. The size is an untagged value containing the number of words. *) | InitialiseMem of { size: 'genReg, addr: 'genReg, init: 'genReg } (* Mark the beginning of a loop. This is really only to prevent the initialisation code being duplicated in ICodeOptimise. *) | BeginLoop (* Set up the registers for a jump back to the start of a loop. *) | JumpLoop of { regArgs: {src: 'genReg fnarg, dst: 'genReg} list, stackArgs: {src: 'genReg fnarg, wordOffset: int, stackloc: stackLocn} list, checkInterrupt: 'genReg list option } (* Store a register using a constant, signed, byte offset. The offset is in the range of -256 to (+4095*unit size). *) | StoreWithConstantOffset of { source: 'genReg, base: 'genReg, byteOffset: int, loadType: loadType } (* Ditto for FP regs. *) | StoreFPWithConstantOffset of { source: 'fpReg, base: 'genReg, byteOffset: int, floatSize: floatSize } (* Store a register using an index register. *) | StoreWithIndexedOffset of { source: 'genReg, base: 'genReg, index: 'genReg, loadType: loadType, signExtendIndex: bool } (* and for FP regs. *) | StoreFPWithIndexedOffset of { source: 'fpReg, base: 'genReg, index: 'genReg, floatSize: floatSize, signExtendIndex: bool } (* Add/Subtract immediate. The destination is optional in which case XZero is used. ccRef is optional. If it is NONE the version of the instruction that does not generate a condition code is used. immed must be < 0wx1000. *) | AddSubImmediate of { source: 'genReg, dest: 'optGenReg, ccRef: ccRef option, immed: word, isAdd: bool, length: opSize } (* Add/Subtract register. As with AddSubImmediate, both the destination and cc are optional. *) | AddSubRegister of { base: 'genReg, shifted: 'genReg, dest: 'optGenReg, ccRef: ccRef option, isAdd: bool, length: opSize, shift: shiftType } (* Bitwise logical operations. The immediate value must be a valid bit pattern. ccRef can only be SOME if logOp is LogAnd. *) | LogicalImmediate of { source: 'genReg, dest: 'optGenReg, ccRef: ccRef option, immed: Word64.word, logOp: logicalOp, length: opSize } (* Register logical operations. ccRef can only be SOME if logOp is LogAnd.*) | LogicalRegister of { base: 'genReg, shifted: 'genReg, dest: 'optGenReg, ccRef: ccRef option, logOp: logicalOp, length: opSize, shift: shiftType } (* Shift a word by an amount specified in a register. 
*) | ShiftRegister of { direction: shiftDirection, dest: 'genReg, source: 'genReg, shift: 'genReg, opSize: opSize } (* The various forms of multiply all take three arguments and the general form is dest = M * N +/- A.. *) | Multiplication of { kind: multKind, dest: 'genReg, sourceA: 'optGenReg, sourceM: 'genReg, sourceN: 'genReg } (* Signed or unsigned division. Sets the result to zero if the divisor is zero. *) | Division of { isSigned: bool, dest: 'genReg, dividend: 'genReg, divisor: 'genReg, opSize: opSize } (* Start of function. Set the register arguments. stackArgs is the list of stack arguments. If the function has a real closure regArgs includes the closure register (X8). The register arguments include the return register (X30). *) - | BeginFunction of { regArgs: ('genReg * xReg) list, stackArgs: stackLocn list } + | BeginFunction of { regArgs: ('genReg * xReg) list, fpRegArgs: ('fpReg * vReg) list, stackArgs: stackLocn list } (* Call a function. If the code address is a constant it is passed here. Otherwise the address is obtained by indirecting through X8 which has been loaded as one of the argument registers. The results are stored in the result registers, usually just X0. The "containers" argument is used to ensure that any container whose address is passed as one of the other arguments continues to be referenced until the function is called since there's a possibility that it isn't actually used after the function. *) | FunctionCall of { callKind: callKind, regArgs: ('genReg fnarg * xReg) list, stackArgs: 'genReg fnarg list, dests: ('genReg * xReg) list, + fpRegArgs: ('fpReg * vReg) list, fpDests: ('fpReg * vReg) list, saveRegs: 'genReg list, containers: stackLocn list} (* Jump to a tail-recursive function. This is similar to FunctionCall but complicated for stack arguments because the stack and the return address need to be overwritten. stackAdjust is the number of words to remove (positive) or add (negative) to the stack before the call. currStackSize contains the number of items currently on the stack. *) | TailRecursiveCall of { callKind: callKind, regArgs: ('genReg fnarg * xReg) list, stackArgs: {src: 'genReg fnarg, stack: int} list, + fpRegArgs: ('fpReg * vReg) list, stackAdjust: int, currStackSize: int } (* Return from the function. resultRegs are the registers containing the result, returnReg is the preg that contains the return address. *) - | ReturnResultFromFunction of { results: ('genReg * xReg) list, returnReg: 'genReg, numStackArgs: int } + | ReturnResultFromFunction of + { results: ('genReg * xReg) list, fpResults: ('fpReg * vReg) list, returnReg: 'genReg, numStackArgs: int } (* Raise an exception. The packet is always loaded into X0. *) | RaiseExceptionPacket of { packetReg: 'genReg } (* Push a register to the stack. This is used both for a normal push, copies=1, and also to reserve a container. *) | PushToStack of { source: 'genReg, copies: int, container: stackLocn } (* Load a register from the stack. The container is the stack location identifier, the field is an offset in a container. *) | LoadStack of { dest: 'genReg, wordOffset: int, container: stackLocn, field: int } (* Store a value into the stack. *) | StoreToStack of { source: 'genReg, container: stackLocn, field: int, stackOffset: int } (* Set the register to the address of the container i.e. a specific offset on the stack. *) | ContainerAddress of { dest: 'genReg, container: stackLocn, stackOffset: int } (* Remove items from the stack. Used to remove containers or registers pushed to the stack.. 
*) | ResetStackPtr of { numWords: int } (* Tag a value by shifting and setting the tag bit. *) | TagValue of { source: 'genReg, dest: 'genReg, isSigned: bool, opSize: opSize } (* Shift a value to remove the tag bit. The cache is used if this is untagging a value that has previously been tagged. *) | UntagValue of { source: 'genReg, dest: 'genReg, isSigned: bool, opSize: opSize } (* Box a largeword value. Stores a value into a byte area. This can be implemented using AllocateMemoryFixed but keeping it separate makes optimisation easier. The result is always an address and needs to be converted to an object index on 32-in-64. *) | BoxLarge of { source: 'genReg, dest: 'genReg, saveRegs: 'genReg list } (* Load a value from a box. This can be implemented using a load but is kept separate to simplify optimisation. The source is always an absolute address. *) | UnboxLarge of { source: 'genReg, dest: 'genReg } (* Convert a floating point value into a value suitable for storing in the heap. This normally involves boxing except that 32-bit floats can be tagged in native 64-bits. *) | BoxTagFloat of { floatSize: floatSize, source: 'fpReg, dest: 'genReg, saveRegs: 'genReg list } (* The reverse of BoxTagFloat. *) | UnboxTagFloat of { floatSize: floatSize, source: 'genReg, dest: 'fpReg } (* Load a value with acquire semantics. This means that any other load in this thread after this sees the value of the shared memory at this point and not earlier. This is used for references and arrays to ensure that if another thread has built a data structure on the heap and then assigns the address to a shared ref this thread will see the updated heap and not any locally cached previous version. *) | LoadAcquire of { base: 'genReg, dest: 'genReg, loadType: loadType } (* Store a value with release semantics. This ensures that any other write completes before this operation and works with LoadAcquire. *) | StoreRelease of { base: 'genReg, source: 'genReg, loadType: loadType } (* This is a generalised constant shift which includes selection of a range of bits. *) | BitFieldShift of { source: 'genReg, dest: 'genReg, isSigned: bool, length: opSize, immr: word, imms: word } (* Copy a range of bits and insert it into another register. This is the only case where a register functions both as a source and a destination. *) | BitFieldInsert of { source: 'genReg, destAsSource: 'genReg, dest: 'genReg, length: opSize, immr: word, imms: word } (* Indexed case. *) | IndexedCaseOperation of { testReg: 'genReg } (* Exception handling. - Set up an exception handler. *) | PushExceptionHandler (* End of a handled section. Restore the previous handler. *) | PopExceptionHandler (* Marks the start of a handler. This sets the stack pointer and restores the old handler. Sets the exception packet register. *) | BeginHandler of { packetReg: 'genReg } (* Compare two vectors of bytes and set the condition code on the result. The registers are modified by the instruction. *) | CompareByteVectors of { vec1Addr: 'genReg, vec2Addr: 'genReg, length: 'genReg, ccRef: ccRef } (* Move a block of bytes (isByteMove true) or words (isByteMove false). The length is the number of items (bytes or words) to move. The registers are modified by the instruction. *) | BlockMove of { srcAddr: 'genReg, destAddr: 'genReg, length: 'genReg, isByteMove: bool } (* Add or subtract to the system stack pointer and optionally return the new value. This is used to allocate and deallocate C space. 
*) | AddSubXSP of { source: 'genReg, dest: 'optGenReg, isAdd: bool } (* Ensures the value will actually be referenced although it doesn't generate any code. *) | TouchValue of { source: 'genReg } (* Load a value at the address and get exclusive access. Always loads a 64-bit value. *) | LoadAcquireExclusive of { base: 'genReg, dest: 'genReg } (* Store a value into an address releasing the lock. Sets the result to either 0 or 1 if it succeeds or fails. *) | StoreReleaseExclusive of { base: 'genReg, source: 'optGenReg, result: 'genReg } (* Insert a memory barrier. dmb ish. *) | MemoryBarrier (* Convert an integer to a floating point value. *) | ConvertIntToFloat of { source: 'genReg, dest: 'fpReg, srcSize: opSize, destSize: floatSize } (* Convert a floating point value to an integer using the specified rounding mode. We could get an overflow here but fortunately the ARM generates a value that will cause an overflow when we tag it, provided we tag it explicitly. *) | ConvertFloatToInt of { source: 'fpReg, dest: 'genReg, srcSize: floatSize, destSize: opSize, rounding: IEEEReal.rounding_mode } - (* Unary floating point. This includes conversions between float and double. *) + (* Unary floating point. This includes moves and conversions between float and double. *) | UnaryFloatingPt of { source: 'fpReg, dest: 'fpReg, fpOp: fpUnary } (* Binary floating point: addition, subtraction, multiplication and division. *) | BinaryFloatingPoint of { arg1: 'fpReg, arg2: 'fpReg, dest: 'fpReg, fpOp: fpBinary, opSize: floatSize } (* Floating point comparison. *) | CompareFloatingPoint of { arg1: 'fpReg, arg2: 'fpReg, ccRef: ccRef, opSize: floatSize } (* Yield control during a spin-lock. *) | CPUYield (* Atomic operations added for ARM 8.1 *) | AtomicOperation of { base: 'genReg, source: 'optGenReg, dest: 'optGenReg, atOp: atomicOp } - (* Debugging - fault if values don't match. *) - | CacheCheck of { arg1: 'genReg, arg2: 'genReg } - (* Destinations at the end of a basic block. *) and controlFlow = (* Unconditional branch to a label - should be a merge point. *) Unconditional of int (* Conditional branch. Jumps to trueJump if the condional is false, falseJump if false. *) | Conditional of { ccRef: ccRef, condition: condition, trueJump: int, falseJump: int } (* Exit - the last instruction of the block is a return, raise or tailcall. *) | ExitCode (* Indexed case - this branches to one of a number of labels *) | IndexedBr of int list (* Set up a handler. This doesn't cause an immediate branch but the state at the start of the handler is the state at this point. *) | SetHandler of { handler: int, continue: int } (* Unconditional branch to a handler. If an exception is raised explicitly within the scope of a handler. *) | UnconditionalHandle of int (* Conditional branch to a handler. Occurs if there is a call to a function within the scope of a handler. It may jump to the handler. *) | ConditionalHandle of { handler: int, continue: int } and ('genReg, 'optGenReg, 'fpReg) basicBlock = BasicBlock of { block: ('genReg, 'optGenReg, 'fpReg) arm64ICode list, flow: controlFlow } (* Return the successor blocks from a control flow. 
*) val successorBlocks: controlFlow -> int list type iCodeAbstract = (preg, pregOrZero, preg) arm64ICode and basicBlockAbstract = (preg, pregOrZero, preg) basicBlock and iCodeConcrete = (xReg, xReg, vReg) arm64ICode and basicBlockConcrete = (xReg, xReg, vReg) basicBlock val printICodeAbstract: basicBlockAbstract vector * (string -> unit) -> unit and printICodeConcrete: basicBlockConcrete vector * (string -> unit) -> unit (* Check whether this value is acceptable for LogicalImmediate. *) val isEncodableBitPattern: Word64.word * opSize -> bool (* This generates a BitField instruction with the appropriate values for immr and imms. *) val shiftConstant: { direction: shiftDirection, dest: preg, source: preg, shift: word, opSize: opSize } -> iCodeAbstract structure Sharing: sig type xReg = xReg and vReg = vReg and reg = reg and condition = condition and shiftType = shiftType and ('genReg, 'optGenReg, 'fpReg) arm64ICode = ('genReg, 'optGenReg, 'fpReg) arm64ICode and preg = preg and pregOrZero = pregOrZero and controlFlow = controlFlow and ('genReg, 'optGenReg, 'fpReg) basicBlock = ('genReg, 'optGenReg, 'fpReg) basicBlock and stackLocn = stackLocn and regProperty = regProperty and ccRef = ccRef and 'genReg fnarg = 'genReg fnarg and closureRef = closureRef and loadType = loadType and opSize = opSize and logicalOp = logicalOp and callKind = callKind and floatSize = floatSize and shiftDirection = shiftDirection and multKind = multKind and fpUnary = fpUnary and fpBinary = fpBinary and atomicOp = atomicOp end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64PREASSEMBLY.sig b/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64PREASSEMBLY.sig index 36be1203..ce76df1c 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64PREASSEMBLY.sig +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64PREASSEMBLY.sig @@ -1,267 +1,269 @@ (* Copyright (c) 2021-2 David C. J. Matthews This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public Licence version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public Licence for more details. You should have received a copy of the GNU Lesser General Public Licence along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) (* The pre-assembly layer goes below the icode and allows peep-hole optimisation. *) signature ARM64PREASSEMBLY = sig type closureRef type machineWord = Address.machineWord (* XZero and XSP are both encoded as 31 but the interpretation depends on the instruction The datatype definition is included here to allow for pattern matching on XSP and XZero. *) datatype xReg = XReg of Word8.word | XZero | XSP and vReg = VReg of Word8.word val X0: xReg and X1: xReg and X2: xReg and X3: xReg and X4: xReg and X5: xReg and X6: xReg and X7: xReg and X8: xReg and X9: xReg and X10: xReg and X11: xReg and X12: xReg and X13: xReg and X14: xReg and X15: xReg and X16: xReg and X17: xReg and X18: xReg and X19: xReg and X20: xReg and X21: xReg and X22: xReg and X23: xReg and X24: xReg and X25: xReg and X26: xReg and X27: xReg and X28: xReg and X29: xReg and X30: xReg val X_MLHeapLimit: xReg (* ML Heap limit pointer *) and X_MLAssemblyInt: xReg (* ML assembly interface pointer. 
*) and X_MLHeapAllocPtr: xReg (* ML Heap allocation pointer. *) and X_MLStackPtr: xReg (* ML Stack pointer. *) and X_LinkReg: xReg (* Link reg - return address *) and X_Base32in64: xReg (* X24 is used for the heap base in 32-in-64. *) val V0: vReg and V1: vReg and V2: vReg and V3: vReg and V4: vReg and V5: vReg and V6: vReg and V7: vReg (* Condition for conditional branches etc. *) datatype condition = CondEqual (* Z=1 *) | CondNotEqual (* Z=0 *) | CondCarrySet (* C=1 *) | CondCarryClear (* C=0 *) | CondNegative (* N=1 *) | CondPositive (* N=0 imcludes zero *) | CondOverflow (* V=1 *) | CondNoOverflow (* V=0 *) | CondUnsignedHigher (* C=1 && Z=0 *) | CondUnsignedLowOrEq (* ! (C=1 && Z=0) *) | CondSignedGreaterEq (* N=V *) | CondSignedLess (* N<>V *) | CondSignedGreater (* Z==0 && N=V *) | CondSignedLessEq (* !(Z==0 && N=V) *) val invertTest: condition -> condition (* i.e. jump when the condition is not true. *) val condToString: condition -> string datatype shiftType = ShiftLSL of Word8.word | ShiftLSR of Word8.word | ShiftASR of Word8.word | ShiftNone datatype wordSize = WordSize32 | WordSize64 datatype 'a extend = ExtUXTB of 'a (* Unsigned extend byte *) | ExtUXTH of 'a (* Unsigned extend byte *) | ExtUXTW of 'a (* Unsigned extend byte *) | ExtUXTX of 'a (* Left shift *) | ExtSXTB of 'a (* Sign extend byte *) | ExtSXTH of 'a (* Sign extend halfword *) | ExtSXTW of 'a (* Sign extend word *) | ExtSXTX of 'a (* Left shift *) (* Load/store instructions have only a single bit for the shift. For byte operations this is one bit shift; for others it scales by the size of the operand if set. *) datatype scale = ScaleOrShift | NoScale datatype loadType = Load64 | Load32 | Load16 | Load8 and opSize = OpSize32 | OpSize64 and logicalOp = LogAnd | LogOr | LogXor and floatSize = Float32 | Double64 and shiftDirection = ShiftLeft | ShiftRightLogical | ShiftRightArithmetic and multKind = MultAdd32 | MultSub32 | MultAdd64 | MultSub64 | SignedMultAddLong (* 32bit*32bit + 64bit => 64Bit *) | SignedMultHigh (* High order part of 64bit*64Bit *) - and fpUnary = NegFloat | NegDouble | AbsFloat | AbsDouble | ConvFloatToDble | ConvDbleToFloat + and fpUnary = + NegFloat | NegDouble | AbsFloat | AbsDouble | ConvFloatToDble | + ConvDbleToFloat | MoveDouble | MoveFloat and fpBinary = MultiplyFP | DivideFP | AddFP | SubtractFP and unscaledType = NoUpdate | PreIndex | PostIndex and condSet = CondSet | CondSetIncr | CondSetInvert | CondSetNegate and bitfieldKind = BFUnsigned | BFSigned | BFInsert and brRegType = BRRBranch | BRRAndLink | BRRReturn (* Some of the atomic operations added in 8.1 *) and atomicOp = LoadAddAL | LoadUmaxAL | SwapAL | LoadAddAcquire | LoadUMaxAcquire | SwapRelease type label and labelMaker val createLabelMaker: unit -> labelMaker and createLabel: labelMaker -> label datatype precode = (* Basic instructions *) AddImmediate of {regN: xReg, regD: xReg, immed: word, shifted: bool, opSize: opSize, setFlags: bool} | SubImmediate of {regN: xReg, regD: xReg, immed: word, shifted: bool, opSize: opSize, setFlags: bool} | AddShiftedReg of {regM: xReg, regN: xReg, regD: xReg, shift: shiftType, opSize: opSize, setFlags: bool} | SubShiftedReg of {regM: xReg, regN: xReg, regD: xReg, shift: shiftType, opSize: opSize, setFlags: bool} | AddExtendedReg of {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend, opSize: opSize, setFlags: bool} | SubExtendedReg of {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend, opSize: opSize, setFlags: bool} | MultiplyAndAddSub of {regM: xReg, regN: xReg, 
regA: xReg, regD: xReg, multKind: multKind} | DivideRegs of {regM: xReg, regN: xReg, regD: xReg, isSigned: bool, opSize: opSize} | LogicalShiftedReg of {regM: xReg, regN: xReg, regD: xReg, shift: shiftType, logOp: logicalOp, opSize: opSize, setFlags: bool} | LoadRegScaled of {regT: xReg, regN: xReg, unitOffset: int, loadType: loadType} | LoadFPRegScaled of {regT: vReg, regN: xReg, unitOffset: int, floatSize: floatSize} | StoreRegScaled of {regT: xReg, regN: xReg, unitOffset: int, loadType: loadType} | StoreFPRegScaled of {regT: vReg, regN: xReg, unitOffset: int, floatSize: floatSize} | LoadRegUnscaled of {regT: xReg, regN: xReg, byteOffset: int, loadType: loadType, unscaledType: unscaledType} | StoreRegUnscaled of {regT: xReg, regN: xReg, byteOffset: int, loadType: loadType, unscaledType: unscaledType} | LoadFPRegUnscaled of {regT: vReg, regN: xReg, byteOffset: int, floatSize: floatSize, unscaledType: unscaledType} | StoreFPRegUnscaled of {regT: vReg, regN: xReg, byteOffset: int, floatSize: floatSize, unscaledType: unscaledType} | LoadRegIndexed of {regT: xReg, regN: xReg, regM: xReg, loadType: loadType, option: scale extend} | StoreRegIndexed of {regT: xReg, regN: xReg, regM: xReg, loadType: loadType, option: scale extend} | LoadFPRegIndexed of {regT: vReg, regN: xReg, regM: xReg, floatSize: floatSize, option: scale extend} | StoreFPRegIndexed of {regT: vReg, regN: xReg, regM: xReg, floatSize: floatSize, option: scale extend} (* LoadAcquire and StoreRelease are used for mutables. *) | LoadAcquireReg of {regN: xReg, regT: xReg, loadType: loadType} | StoreReleaseReg of {regN: xReg, regT: xReg, loadType: loadType} (* LoadAcquireExclusiveRegister and StoreReleaseExclusiveRegister are used for mutexes. *) | LoadAcquireExclusiveRegister of {regN: xReg, regT: xReg} | StoreReleaseExclusiveRegister of {regS: xReg, regT: xReg, regN: xReg} | MemBarrier (* Additional atomic operations. *) | AtomicExtension of { regT: xReg, regN: xReg, regS: xReg, atOp: atomicOp } | LoadRegPair of { regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int, loadType: loadType, unscaledType: unscaledType} | StoreRegPair of { regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int, loadType: loadType, unscaledType: unscaledType} | LoadFPRegPair of { regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int, floatSize: floatSize, unscaledType: unscaledType} | StoreFPRegPair of { regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int, floatSize: floatSize, unscaledType: unscaledType} | ConditionalSet of {regD: xReg, regTrue: xReg, regFalse: xReg, cond: condition, condSet: condSet, opSize: opSize} | BitField of {immr: word, imms: word, regN: xReg, regD: xReg, opSize: opSize, bitfieldKind: bitfieldKind} | ShiftRegisterVariable of {regM: xReg, regN: xReg, regD: xReg, opSize: opSize, shiftDirection: shiftDirection} | BitwiseLogical of { bits: Word64.word, regN: xReg, regD: xReg, opSize: opSize, setFlags: bool, logOp: logicalOp} (* Floating point *) | MoveGeneralToFP of { regN: xReg, regD: vReg, floatSize: floatSize} | MoveFPToGeneral of {regN: vReg, regD: xReg, floatSize: floatSize} | CvtIntToFP of { regN: xReg, regD: vReg, floatSize: floatSize, opSize: opSize} | CvtFloatToInt of { round: IEEEReal.rounding_mode, regN: vReg, regD: xReg, floatSize: floatSize, opSize: opSize} | FPBinaryOp of { regM: vReg, regN: vReg, regD: vReg, floatSize: floatSize, fpOp: fpBinary} | FPComparison of { regM: vReg, regN: vReg, floatSize: floatSize} | FPUnaryOp of {regN: vReg, regD: vReg, fpOp: fpUnary} (* Branches and Labels. 
*) | SetLabel of label | ConditionalBranch of condition * label | UnconditionalBranch of label | BranchAndLink of label | BranchReg of {regD: xReg, brRegType: brRegType } | LoadLabelAddress of xReg * label | TestBitBranch of { test: xReg, bit: Word8.word, label: label, onZero: bool } | CompareBranch of { test: xReg, label: label, onZero: bool, opSize: opSize } (* Composite instructions *) | MoveXRegToXReg of {sReg: xReg, dReg: xReg} | LoadNonAddr of xReg * Word64.word | LoadFPConst of {dest: vReg, value: Word64.word, floatSize: floatSize, work: xReg} | LoadAddr of xReg * machineWord | RTSTrap of { rtsEntry: int, work: xReg, save: xReg list } (* Allocate memory - bytes includes the length word and rounding. *) | AllocateMemoryFixedSize of { bytes: word, dest: xReg, save: xReg list, work: xReg } (* Allocate memory - sizeReg is number of ML words needed for cell. *) | AllocateMemoryVariableSize of { sizeReg: xReg, dest: xReg, save: xReg list, work: xReg } (* Branch table for indexed case. startLabel is the address of the first label in the list. The branch table is a sequence of unconditional branches. *) | BranchTable of { startLabel: label, brTable: label list } | LoadGlobalHeapBaseInCallback of xReg | Yield (* Wrapper for BitField *) val shiftConstant: { direction: shiftDirection, regD: xReg, regN: xReg, shift: word, opSize: opSize } -> precode (* Convenient sequences. N.B. These are in reverse order. *) val boxDouble: {source: vReg, destination: xReg, workReg: xReg, saveRegs: xReg list} * precode list -> precode list and boxFloat: {source: vReg, destination: xReg, workReg: xReg, saveRegs: xReg list} * precode list -> precode list and boxSysWord: {source: xReg, destination: xReg, workReg: xReg, saveRegs: xReg list} * precode list -> precode list (* Create the vector of code from the list of instructions and update the closure reference to point to it. *) val generateFinalCode: {instrs: precode list, name: string, parameters: Universal.universal list, resultClosure: closureRef, profileObject: machineWord, labelMaker: labelMaker} -> unit (* Offsets in the assembly code interface pointed at by X26 These are in units of 64-bits NOT bytes. 
*) val heapOverflowCallOffset: int and stackOverflowCallOffset: int and stackOverflowXCallOffset: int and exceptionHandlerOffset: int and stackLimitOffset: int and threadIdOffset: int and heapLimitPtrOffset: int and heapAllocPtrOffset: int and mlStackPtrOffset: int and exceptionPacketOffset: int val is32in64: bool and isBigEndian: bool val isEncodableBitPattern: Word64.word * wordSize -> bool structure Sharing: sig type closureRef = closureRef type loadType = loadType type opSize = opSize type logicalOp = logicalOp type floatSize = floatSize type shiftDirection = shiftDirection type multKind = multKind type fpUnary = fpUnary type fpBinary = fpBinary type unscaledType = unscaledType type condSet = condSet type bitfieldKind = bitfieldKind type brRegType = brRegType type precode = precode type xReg = xReg type vReg = vReg type label = label type labelMaker = labelMaker type condition = condition type shiftType = shiftType type wordSize = wordSize type 'a extend = 'a extend type scale = scale type atomicOp = atomicOp end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64AllocateRegisters.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64AllocateRegisters.ML index 468f13c7..7b1ce1fa 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64AllocateRegisters.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64AllocateRegisters.ML @@ -1,1314 +1,1400 @@ (* Copyright David C. J. Matthews 2016-22 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64AllocateRegisters( structure Arm64ICode: ARM64ICODE structure Identify: ARM64IDENTIFYREFERENCES structure IntSet: INTSET sharing Arm64ICode.Sharing = Identify.Sharing = IntSet ): ARM64ALLOCATEREGISTERS = struct open Arm64ICode open Identify open IntSet exception InternalError = Misc.InternalError - val checkCache = false (* Use the cache *) - datatype allocateResult = AllocateSuccess of basicBlockConcrete vector | AllocateFailure of intSet list (* General registers. X24 is used as the global heap base in 32-in-64. X30 is the return address set by blr but is otherwise a general register. Put the argument registers at the end of the list so they'll only be used when hinted. *) val generalRegisters = map GenReg ([X9, X10, X11, X12, X13, X14, X15, X19, X20, X21, X22, X23, X0, X1, X2, X3, X4, X5, X6, X7, X8, X30] @ (if is32in64 then [] else [X24])) val floatingPtRegisters = map FPReg [V7, V6, V5, V4, V3, V2, V1] type conflictState = { conflicts: intSet, realConflicts: reg list } type triple = {instr: iCodeAbstract, current: intSet, active: intSet} exception InternalError = Misc.InternalError (* Get the conflict states, allocate registers and return the code with the allocated registers if it is successful. *) fun allocateRegisters{blocks, regProps, maxPRegs, ...} = let (* Other registers that conflict with this i.e. cannot share the same real register. 
*) val regConflicts = Array.array(maxPRegs, emptySet) (* Real registers that cannot be used for this because they are needed for an instruction. Only X30 in calls and RTS traps. *) and regRealConflicts = Array.array(maxPRegs, []: reg list) fun addConflictsTo(addTo, conflicts) = List.app(fn aReg => Array.update(regConflicts, aReg, union(Array.sub(regConflicts, aReg), conflicts))) addTo (* To reserve a register we need to add the real register to the real conflict sets of all the abstract conflicts. *) local fun isInset reg set = List.exists (fn r => r = reg) set in fun reserveRegister(reserveFor, reg) = let val absConflicts = Array.sub(regConflicts, reserveFor) fun addConflict i = if i = reserveFor then () else addRealConflict (i, reg) in List.app addConflict (setToList absConflicts) end and addRealConflict (i, reg) = let val currentConflicts = Array.sub(regRealConflicts, i) in if isInset reg currentConflicts then () else Array.update(regRealConflicts, i, reg :: currentConflicts) end end fun conflictsForInstr passThrough {instr, current, ...} = let val {sources, dests} = getInstructionRegisters instr fun regNo(PReg i) = i val destRegNos = map regNo dests and sourceRegNos = map regNo sources val destSet = listToSet destRegNos val afterRemoveDests = minus(current, destSet) local (* In almost all circumstances the destination and sources don't conflict and the same register can be used as a destination and a source. BoxLarge can only store the value after the memory has been allocated. BitFieldInsert has to copy the "destAsSource" value into the destination so cannot use the same register for the "source". *) val postInstruction = case instr of BoxLarge _ => destRegNos @ sourceRegNos | BoxTagFloat _ => destRegNos @ sourceRegNos (* Not sure about this. *) | BitFieldInsert{source, ...} => regNo source :: destRegNos | _ => destRegNos in (* If there is more than one destination they conflict with each other. *) val () = addConflictsTo(postInstruction, listToSet postInstruction); (* Mark conflicts for the destinations, i.e. after the instruction. The destinations conflict with the registers that are used subsequently. *) val () = addConflictsTo(postInstruction, current); val () = addConflictsTo(postInstruction, passThrough); (* Mark conflicts for the sources i.e. before the instruction. *) (* Sources must be set up as conflicts with each other i.e. when we come to allocate registers we must choose different real registers for different abstract registers. *) + (* TODO: It may may well no longer be necessary to make sources + conflict with each other. It was probably a consequence of + the old process of allocating registers from the end back. *) val () = addConflictsTo(sourceRegNos, listToSet sourceRegNos) val () = addConflictsTo(sourceRegNos, afterRemoveDests); val () = addConflictsTo(sourceRegNos, passThrough) end (* I'm not sure if this is needed. There was a check in the old code to ensure that different registers were used for loop variables even if they were actually unused. This may happen anyway. Comment and code copied from X86 version. Retain it for the moment. *) val () = case instr of JumpLoop{regArgs, ...} => let val destRegs = List.foldl(fn ({dst=PReg loopReg, ...}, dests) => loopReg :: dests) [] regArgs in addConflictsTo(destRegs, listToSet destRegs); addConflictsTo(destRegs, current); addConflictsTo(destRegs, passThrough) end | _ => () (* Certain instructions are specific as to the real registers. *) val () = case instr of - ReturnResultFromFunction{ returnReg=PReg retReg, ... 
} => - (* We're going to put the return value in X0 so we can't use that for - the return address. *) - addRealConflict(retReg, GenReg X0) + ReturnResultFromFunction{ returnReg=PReg retReg, results, ... } => + (* We can't use any of the registers we're using to carry results, typically X0, + to hold the return address. *) + List.app(fn (_, xReg) => addRealConflict(retReg, GenReg xReg)) results | RaiseExceptionPacket{ packetReg } => (* This wasn't needed previously because we always pushed the registers across an exception. *) reserveRegister(regNo packetReg, GenReg X0) | BeginHandler { packetReg, ...} => reserveRegister(regNo packetReg, GenReg X0) | FunctionCall { dests, regArgs, ...} => (* This is only needed if we are saving the registers rather than marking them as "must push". *) let val () = List.app(fn (PReg pr, r) => reserveRegister(pr, GenReg r)) dests (* The argument registers also conflict. In order to execute this call we need to load the arguments into specific registers so we can't use them for values that we want after the call. *) val toReserve = X30 :: List.map #2 regArgs in List.app(fn i => List.app(fn r => addRealConflict(i, GenReg r)) toReserve) (setToList passThrough @ setToList afterRemoveDests) end (* We can't use X30 as the result because it's needed for the return addr if we have to GC. *) | AllocateMemoryFixed{dest=PReg destReg, ...} => addRealConflict(destReg, GenReg X30) | AllocateMemoryVariable{dest=PReg destReg, ...} => addRealConflict(destReg, GenReg X30) | BoxLarge{dest=PReg destReg, ...} => addRealConflict(destReg, GenReg X30) | BoxTagFloat{dest=PReg destReg, ...} => addRealConflict(destReg, GenReg X30) (* Could exclude floats on native addr. *) | _ => () in () end (* Process the block. *) fun conflictsForBlock(ExtendedBasicBlock{block, passThrough, exports, ...}) = let (* We need to establish conflicts between all the registers active at the end of the block since they may not be established elsewhere. This isn't necessary for an unconditional branch since the same registers will be included in the block that is the target of the branch, possibly along with others. However if this is a conditional or indexed branch we may have different sets at each of the targets and we have to ensure that all the registers differ. *) val united = union(exports, passThrough) val () = addConflictsTo(setToList united, united) val () = List.app (conflictsForInstr passThrough) block in () end val () = Vector.app conflictsForBlock blocks (* Hint values. The idea of hints is that by using a hinted register we may avoid an unnecessary move instruction. realHints is set when a pseudo-register is going to be loaded from a specific register e.g. a register argument, or moved into one e.g. X0 for the function result. friends is set to the other pReg that may be associated with the pReg. Typically this is where we have a merge register that we move some value into. *) val realHints = Array.array(maxPRegs, NONE: reg option) (* Sources and destinations. These indicate the registers that are the sources and destinations of the indexing register and are used as hints. If a register has been allocated for a source or destination we may be able to reuse it. *) val sourceRegs = Array.array(maxPRegs, []: int list) and destinationRegs = Array.array(maxPRegs, []: int list) local (* Real hints. If this is the source of a value e.g. a function argument in a register, we'll use it directly. 
If, though, this is the result of a function and we want the result to end up in a specific register we want to propagate it to any pReg that moves its value into this. *) fun addRealHint(r, reg) = case Array.sub(realHints, r) of SOME _ => () | NONE => ( (* Add to this pReg *) Array.update(realHints, r, SOME reg); (* and to any other pReg that moves here. *) List.app(fn r => addRealHint(r, reg)) (Array.sub(sourceRegs, r)) ) fun addSourceAndDestinationHint{src, dst} = let val conflicts = Array.sub(regConflicts, src) in (* If they conflict we can't add them. *) if member(dst, conflicts) then () else let val currentDests = Array.sub(destinationRegs, src) val currentSources = Array.sub(sourceRegs, dst) in (* Add the destination for this source i.e. the registers we move this source into. *) if List.exists(fn i => i=dst) currentDests then () else Array.update(destinationRegs, src, dst :: currentDests); (* Add the source to the list of sources for this destination. A merge register may have several sources, a different one for each path. If the destination has a real hint we want to propagate that back. That isn't needed for the destinations because we allocate the registers from the start forward. *) if List.exists(fn i => i=src) currentSources then () else let val sources = src :: currentSources val () = Array.update(sourceRegs, dst, sources) in case Array.sub(realHints, dst) of NONE => () | SOME real => List.app(fn r => addRealHint(r, real)) sources end end end (* Add the hints to steer the register allocation. The idea is to avoid moves between registers by getting values into the appropriate register in advance. We don't actually need to add real hints where the real register is providing the value, e.g. BeginFunction, because the allocation process will take care of that. *) fun addHints{instr=MoveRegister{source=PReg sreg, dest=PReg dreg, ...}, ...} = addSourceAndDestinationHint {src=sreg, dst=dreg} | addHints{instr=BitFieldInsert{destAsSource=PReg dsReg, dest=PReg dReg, ...}, ...} = (* The "destAsSource" is the destination if some bits are retained. *) addSourceAndDestinationHint {src=dsReg, dst=dReg} - | addHints{instr=ReturnResultFromFunction { results, ... }, ...} = - List.app(fn(PReg pr, r) => addRealHint(pr, GenReg r)) results + | addHints{instr=ReturnResultFromFunction { results, fpResults, ... 
}, ...} = + ( + List.app(fn(PReg pr, r) => addRealHint(pr, GenReg r)) results; + List.app(fn(PReg pr, r) => addRealHint(pr, FPReg r)) fpResults + ) | addHints{instr=JumpLoop{regArgs, ...}, ...} = let fun addRegArg {src=ArgInReg(PReg argReg), dst=PReg resReg} = addSourceAndDestinationHint {dst=resReg, src=argReg} | addRegArg {src=ArgOnStack _, ...} = () in List.app addRegArg regArgs end - | addHints{instr=BeginFunction{regArgs, ...}, ...} = - List.app (fn (PReg pr, reg) => addRealHint(pr, GenReg reg)) regArgs + | addHints{instr=BeginFunction{regArgs, fpRegArgs, ...}, ...} = + ( + List.app (fn (PReg pr, reg) => addRealHint(pr, GenReg reg)) regArgs; + List.app (fn (PReg pr, reg) => addRealHint(pr, FPReg reg)) fpRegArgs + ) - | addHints{instr=TailRecursiveCall{regArgs, ...}, ...} = + | addHints{instr=TailRecursiveCall{regArgs, fpRegArgs, ...}, ...} = let fun setHint(ArgInReg(PReg pr), reg) = addRealHint(pr, GenReg reg) | setHint _ = () in - List.app setHint regArgs + List.app setHint regArgs; + List.app (fn(PReg pr, reg) => addRealHint(pr, FPReg reg)) fpRegArgs end - | addHints{instr=FunctionCall{regArgs, dests, ...}, ...} = + | addHints{instr=FunctionCall{regArgs, dests, fpRegArgs, fpDests, ...}, ...} = let fun setHint(ArgInReg(PReg pr), reg) = addRealHint(pr, GenReg reg) | setHint _ = () in List.app(fn(PReg pr, r) => addRealHint(pr, GenReg r)) dests; - List.app setHint regArgs + List.app(fn(PReg pr, r) => addRealHint(pr, FPReg r)) fpDests; + List.app setHint regArgs; + List.app (fn(PReg pr, reg) => addRealHint(pr, FPReg reg)) fpRegArgs end (* Exception packets are in X0 *) | addHints{instr=RaiseExceptionPacket{ packetReg=PReg preg }, ...} = addRealHint(preg, GenReg X0) | addHints{instr=BeginHandler{ packetReg=PReg preg }, ...} = addRealHint(preg, GenReg X0) + | addHints{instr=UnaryFloatingPt{source=PReg sreg, dest=PReg dreg, fpOp=MoveFloat}, ...} = + addSourceAndDestinationHint {src=sreg, dst=dreg} + + | addHints{instr=UnaryFloatingPt{source=PReg sreg, dest=PReg dreg, fpOp=MoveDouble}, ...} = + addSourceAndDestinationHint {src=sreg, dst=dreg} + | addHints _ = () in val () = Vector.app(fn ExtendedBasicBlock { block, ...} => List.app addHints block) blocks end - - val allocatedRegs = Array.array(maxPRegs, NONE: reg option) val failures = ref []: intSet list ref - (* Find a real register for a preg. - 1. If a register is already allocated use that. - 2. Try the "preferred" register if one has been given. - 3. Try the realHints value if there is one. - 4. See if there is a "friend" that has an appropriate register - 5. Look at all the registers and find one. *) - fun findRegister(r, pref, regSet, cache) = - case Array.sub(allocatedRegs, r) of - SOME reg => reg - - | NONE => - let - val conflicts = Array.sub(regConflicts, r) - and realConflicts = Array.sub(regRealConflicts, r) - (* Find the registers we've already allocated that may conflict. *) - val conflictingRegs = - List.mapPartial(fn i => Array.sub(allocatedRegs, i)) (setToList conflicts) @ - realConflicts - - fun isFree aReg = not (List.exists(fn i => i=aReg) conflictingRegs) - - fun tryAReg NONE = NONE - | tryAReg (somePref as SOME prefReg) = - if isFree prefReg - then (Array.update(allocatedRegs, r, somePref); somePref) - else NONE - - (* Search the sources and destinations to see if a register has - already been allocated or there is a hint. 
*) - fun findAFriend([], [], _) = NONE - - | findAFriend(aDest :: otherDests, sources, alreadySeen) = - let - val possReg = - case Array.sub(allocatedRegs, aDest) of - v as SOME _ => tryAReg v - | NONE => tryAReg(Array.sub(realHints, aDest)) - in - case possReg of - reg as SOME _ => reg - | NONE => - let - (* Add the destinations of the destinations to the list - if they don't conflict and haven't been seen. *) - fun newFriend f = - not(List.exists (fn n => n=f) alreadySeen) andalso not(member(f, conflicts)) - val fOfF = List.filter newFriend (Array.sub(destinationRegs, aDest)) - in - findAFriend(otherDests @ fOfF, sources, aDest :: alreadySeen) - end - end - - | findAFriend([], aSrc :: otherSrcs, alreadySeen) = - let - val possReg = - case Array.sub(allocatedRegs, aSrc) of - v as SOME _ => tryAReg v - | NONE => tryAReg(Array.sub(realHints, aSrc)) - in - case possReg of - reg as SOME _ => reg - | NONE => - let - (* Add the sources of the sources to the list - if they don't conflict and haven't been seen. *) - fun newFriend f = - not(List.exists (fn n => n=f) alreadySeen) andalso not(member(f, conflicts)) - val fOfF = List.filter newFriend (Array.sub(sourceRegs, aSrc)) - in - findAFriend([], otherSrcs @ fOfF, aSrc :: alreadySeen) - end - end - in - case tryAReg pref of - SOME r => r - | NONE => - ( - case tryAReg (Array.sub(realHints, r)) of - SOME r => r - | NONE => - ( - case findAFriend(Array.sub(destinationRegs, r), Array.sub(sourceRegs, r), []) of - SOME r => r - (* Look through the registers to find one that's free. First try excluding - the cache registers. *) - | NONE => - let - (* First try filtering all the cache registers to see if we can find a - register. If not see if it works by *) - fun filterCache(filteredRegset, []) = List.find isFree filteredRegset - - | filterCache(filteredRegset, (cReg, _) :: cache) = - ( - case filterCache(List.filter(fn r => r <> cReg) filteredRegset, cache) of - NONE => if isFree cReg then SOME cReg else NONE - | result => result - ) - - val pick = - case filterCache(regSet, cache) of - SOME reg => reg - | NONE => - ( - (* This failed. We're going to have to spill something. *) - failures := conflicts :: ! failures; - hd regSet (* Return something to allow this pass to complete *) - ) - val () = Array.update(allocatedRegs, r, SOME pick) - in - pick - end - ) - ) - end - (* Turn the abstract icode into a concrete version by allocating the registers. *) local + fun asGenReg(GenReg reg) = reg | asGenReg _ = raise InternalError "asGenReg" and asFPReg(FPReg reg) = reg | asFPReg _ = raise InternalError "asFPReg" datatype cacheItem = - CacheStack of stackLocn (* A value loaded from the stack. *) - | CacheAbsAddress of preg (* 32-in-64: An absolute address from an object ID *) - | CacheAbsAddrOnStack of stackLocn (* 32-in-64: An absolute address from an object loaded from the stack. *) + CacheStack of {cached: stackLocn, cachedIn: xReg } (* A value loaded from the stack. *) + | CacheAbsAddress of {cached: preg, cachedIn: xReg } (* 32-in-64: An absolute address from an object ID *) + | CacheAbsAddrOnStack of {cached: stackLocn, cachedIn: xReg } (* 32-in-64: An absolute address from an object loaded from the stack. *) + | CacheLargeUnbox of {cached: preg, cachedIn: xReg } (* A LargeWord.word value has been unboxed *) + | CacheRealUnbox of {cached: preg, cachedIn: vReg, floatSize: floatSize } (* A double or float has been unboxed. 
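A later unboxing of the same value can then be satisfied by a floating-point register move instead of unboxing it again.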
*) + + fun regCachedIn(CacheStack{cachedIn, ...}) = GenReg cachedIn + | regCachedIn(CacheAbsAddress{cachedIn, ...}) = GenReg cachedIn + | regCachedIn(CacheAbsAddrOnStack{cachedIn, ...}) = GenReg cachedIn + | regCachedIn(CacheLargeUnbox{cachedIn, ...}) = GenReg cachedIn + | regCachedIn(CacheRealUnbox{cachedIn, ...}) = FPReg cachedIn + + val allocatedRegs = Array.array(maxPRegs, NONE: reg option) + + (* Find a real register for a preg. + 1. If a register is already allocated use that. + 2. Try the "preferred" register if one has been given. + 3. Try the realHints value if there is one. + 4. See if there is a "friend" that has an appropriate register + 5. Look at all the registers and find one. *) + fun findRegister(r, pref, regSet, cache: cacheItem list) = + case Array.sub(allocatedRegs, r) of + SOME reg => reg + + | NONE => + let + val conflicts = Array.sub(regConflicts, r) + and realConflicts = Array.sub(regRealConflicts, r) + (* Find the registers we've already allocated that may conflict. *) + val conflictingRegs = + List.mapPartial(fn i => Array.sub(allocatedRegs, i)) (setToList conflicts) @ + realConflicts + + fun isFree aReg = not (List.exists(fn i => i=aReg) conflictingRegs) + + fun tryAReg NONE = NONE + | tryAReg (somePref as SOME prefReg) = + if isFree prefReg + then (Array.update(allocatedRegs, r, somePref); somePref) + else NONE + + (* Search the sources and destinations to see if a register has + already been allocated or there is a hint. *) + fun findAFriend([], [], _) = NONE + + | findAFriend(aDest :: otherDests, sources, alreadySeen) = + let + val possReg = + case Array.sub(allocatedRegs, aDest) of + v as SOME _ => tryAReg v + | NONE => tryAReg(Array.sub(realHints, aDest)) + in + case possReg of + reg as SOME _ => reg + | NONE => + let + (* Add the destinations of the destinations to the list + if they don't conflict and haven't been seen. *) + fun newFriend f = + not(List.exists (fn n => n=f) alreadySeen) andalso not(member(f, conflicts)) + val fOfF = List.filter newFriend (Array.sub(destinationRegs, aDest)) + in + findAFriend(otherDests @ fOfF, sources, aDest :: alreadySeen) + end + end + + | findAFriend([], aSrc :: otherSrcs, alreadySeen) = + let + val possReg = + case Array.sub(allocatedRegs, aSrc) of + v as SOME _ => tryAReg v + | NONE => tryAReg(Array.sub(realHints, aSrc)) + in + case possReg of + reg as SOME _ => reg + | NONE => + let + (* Add the sources of the sources to the list + if they don't conflict and haven't been seen. *) + fun newFriend f = + not(List.exists (fn n => n=f) alreadySeen) andalso not(member(f, conflicts)) + val fOfF = List.filter newFriend (Array.sub(sourceRegs, aSrc)) + in + findAFriend([], otherSrcs @ fOfF, aSrc :: alreadySeen) + end + end + in + case tryAReg pref of + SOME r => r + | NONE => + ( + case tryAReg (Array.sub(realHints, r)) of + SOME r => r + | NONE => + ( + case findAFriend(Array.sub(destinationRegs, r), Array.sub(sourceRegs, r), []) of + SOME r => r + (* Look through the registers to find one that's free. First try excluding + the cache registers. *) + | NONE => + let + (* First try filtering all the cache registers to see if we can find a + register. If not see if it works by freeing one. 
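A register that only holds a cached value is safe to take: the cached value can be reloaded or recomputed if it is needed again.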
*) + fun filterCache(filteredRegset, []) = List.find isFree filteredRegset + + | filterCache(filteredRegset, cacheItem :: cache) = + let + val cReg = regCachedIn cacheItem + in + case filterCache(List.filter(fn r => r <> cReg) filteredRegset, cache) of + NONE => if isFree cReg then SOME cReg else NONE + | result => result + end + + val pick = + case filterCache(regSet, cache) of + SOME reg => reg + | NONE => + ( + (* This failed. We're going to have to spill something. *) + failures := conflicts :: ! failures; + hd regSet (* Return something to allow this pass to complete *) + ) + val () = Array.update(allocatedRegs, r, SOME pick) + in + pick + end + ) + ) + end (* Cache hints: Try to use the same register for values that can be cached. This increases the chances that we will be able to retain the cache when we merge different branches. *) val cacheHints = Array.array(maxPRegs, NONE: reg option) (* Remove any reference to newly allocated registers from the cache. Also used after block move and comparison that modify registers *) - fun pruneCache(reg: reg, cache) = List.filter(fn (r, _) => r <> reg) cache + fun pruneCache(reg: reg, cache) = List.filter(fn c => regCachedIn c <> reg) cache - (* Return the cache registers that contain valid addresses. *) - fun cachedAddressRegs cache = List.map (asGenReg o #1) cache + (* Return the cache registers that contain valid addresses. N.B. We must only + save registers that contain properly tagged values together, on compact 32-bit, + with absolute addresses. All registers are saved across a GC but only these + registers are updated. *) + local + fun hasAddress(CacheStack{cachedIn, ...}) = SOME cachedIn + | hasAddress(CacheAbsAddress{cachedIn, ...}) = SOME cachedIn + | hasAddress(CacheAbsAddrOnStack{cachedIn, ...}) = SOME cachedIn + | hasAddress(CacheLargeUnbox _) = NONE (* The value was untagged. *) + | hasAddress(CacheRealUnbox _) = NONE (* Untagged value in a FP register. *) + in + val cachedAddressRegs = List.mapPartial hasAddress + end (* Merge the cache states *) - fun mergeCacheStates ([]: (reg * cacheItem) list list) = []: (reg * cacheItem) list + fun mergeCacheStates ([]: cacheItem list list) = []: cacheItem list | mergeCacheStates [single] = single | mergeCacheStates (many as first :: rest) = let (* Generally we will either be unable to merge and have an empty cache or will have just one or two entries. *) (* Find the shortest. If it's empty we're done. *) fun findShortest(_, [], _) = [] | findShortest(_, shortest, []) = shortest | findShortest(len, shortest, hd::tl) = let val hdLen = List.length hd in if hdLen < len then findShortest(hdLen, hd, tl) else findShortest(len, shortest, tl) end val shortest = findShortest(List.length first, first, rest) (* Find the item we're caching for. If it is in a different register we can't use it. *) fun findItem search (hd::tl) = search = hd orelse findItem search tl | findItem _ [] = false (* It's present if it's in all the sources. 
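Only entries that appear, in the same register, in every incoming cache are kept, so the merged state is the intersection of the input states.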
*) fun present search = List.all(findItem search) many val filtered = List.foldl (fn (search, l) => if present search then search :: l else l) [] shortest in filtered end fun allocateNewDestination(PReg r, pref, regSet, cacheList) = case Array.sub(allocatedRegs, r) of SOME reg => ( case Vector.sub(regProps, r) of RegPropMultiple => (reg, pruneCache(reg, cacheList)) (* This is allowed for merge registers *) | _ => raise InternalError "Register defined at multiple points" ) | NONE => let val reg = findRegister(r, pref, regSet, cacheList) in (reg, pruneCache(reg, cacheList)) end fun allocateGenReg(r, hint, cache) = let val (reg, newCache) = allocateNewDestination(r, hint, generalRegisters, cache) in (asGenReg reg, newCache) end and allocateFPReg(r, hint, cache) = let val (reg, newCache) = allocateNewDestination(r, hint, floatingPtRegisters, cache) in (asFPReg reg, newCache) end and allocateGenRegOrZero(ZeroReg, _, cache) = (XZero, cache) | allocateGenRegOrZero(SomeReg reg, hint, cache) = allocateGenReg(reg, hint, cache) fun getAllocatedGenReg(PReg r) = case Array.sub(allocatedRegs, r) of SOME(GenReg reg) => reg | _ => raise InternalError "getAllocatedGenReg" and getAllocatedFPReg(PReg r) = case Array.sub(allocatedRegs, r) of SOME(FPReg reg) => reg | _ => raise InternalError "getAllocatedFPReg" fun getAllocatedGenRegOrZero ZeroReg = XZero | getAllocatedGenRegOrZero (SomeReg reg) = getAllocatedGenReg reg fun getAllocatedArg(ArgInReg reg) = ArgInReg(getAllocatedGenReg reg) | getAllocatedArg(ArgOnStack stackLoc) = ArgOnStack stackLoc val getSaveRegs = List.map getAllocatedGenReg (* Reversed cons and append to make the code easier to read. *) infix 5 <::> <@> fun tl <::> hd = hd :: tl (*and snd <@> fst = fst @ snd*) fun absToConcrete([], context, code) = (context, code) | absToConcrete({instr=MoveRegister{ source, dest}, ...} :: rest, cache, code) = let (* Try to use the register we've allocated for the source as the destination so that we can eliminate this instruction altogether. 
*) val sourceReg = getAllocatedGenReg source val (destReg, newCache) = allocateGenReg(dest, SOME(GenReg sourceReg), cache) in if sourceReg = destReg then absToConcrete(rest, newCache, code) else absToConcrete(rest, newCache, code <::> MoveRegister { source=sourceReg, dest=destReg}) end | absToConcrete({instr=LoadNonAddressConstant { dest, source}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> LoadNonAddressConstant { dest=destReg, source=source}) end | absToConcrete({instr=LoadFPConstant { dest, source, floatSize }, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateFPReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> LoadFPConstant { dest=destReg, source=source, floatSize=floatSize}) end | absToConcrete({instr=LoadAddressConstant { dest, source}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> LoadAddressConstant { dest=destReg, source=source}) end | absToConcrete({instr=LoadWithConstantOffset { base, dest, byteOffset, loadType}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> LoadWithConstantOffset { base=getAllocatedGenReg base, dest=destReg, byteOffset=byteOffset, loadType=loadType}) end | absToConcrete({instr=LoadFPWithConstantOffset { base, dest, byteOffset, floatSize}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateFPReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> LoadFPWithConstantOffset { base=getAllocatedGenReg base, dest=destReg, byteOffset=byteOffset, floatSize=floatSize}) end | absToConcrete({instr=LoadWithIndexedOffset { base, dest, index, loadType, signExtendIndex}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> LoadWithIndexedOffset { base=getAllocatedGenReg base, dest=destReg, index=getAllocatedGenReg index, loadType=loadType, signExtendIndex=signExtendIndex}) end | absToConcrete({instr=LoadFPWithIndexedOffset { base, dest, index, floatSize, signExtendIndex}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateFPReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> LoadFPWithIndexedOffset { base=getAllocatedGenReg base, dest=destReg, index=getAllocatedGenReg index, floatSize=floatSize, signExtendIndex=signExtendIndex}) end | absToConcrete({instr=GetThreadId { dest}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> GetThreadId { dest=destReg}) end | absToConcrete({instr=ObjectIndexAddressToAbsolute { source as PReg srcNo, dest=destOiA as PReg doia}, kill, ...} :: rest, cache, code) = let (* See if this is in the cache and use it if it is. If this is the last reference to this source entry we don't want it in the cache any longer. *) val killThis = member(srcNo, kill) - val (newCode, destReg, newCache, next) = - case List.find(fn (_, CacheAbsAddress c) => c=source | _ => false) cache of - SOME (srcReg, _) => + val (newCode, dReg, newCache, next) = + case List.find(fn (CacheAbsAddress{cached, ...}) => cached=source | _ => false) cache of + SOME (CacheAbsAddress{cachedIn, ...}) => let (* Try to use the cache register as the destination if we can. 
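If the destination ends up in the cache register itself no code is needed; otherwise a single register move replaces the conversion.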
*) - val (destReg, newCache) = allocateNewDestination(destOiA, SOME srcReg, generalRegisters, cache) + val (destReg, newCache) = allocateNewDestination(destOiA, SOME(GenReg cachedIn), generalRegisters, cache) val dReg = asGenReg destReg - and sReg = asGenReg srcReg in - if checkCache - then (code <::> MoveRegister{source=sReg, dest=X17} <::> ObjectIndexAddressToAbsolute { source=getAllocatedGenReg source, dest=dReg} - <::> CacheCheck{ arg1=dReg, arg2=X17 }, - destReg, if killThis then pruneCache(srcReg, newCache) else newCache, rest) - else if dReg = sReg - then (code, destReg, newCache, rest) (* We will have pruned this since it's the destination. *) - else (code <::> MoveRegister { source=sReg, dest=dReg}, destReg, - if killThis then pruneCache(srcReg, newCache) else newCache, rest) + if dReg = cachedIn + then (code, dReg, newCache, rest) (* We will have pruned this since it's the destination. *) + else (code <::> MoveRegister { source=cachedIn, dest=dReg}, dReg, + if killThis then pruneCache(GenReg cachedIn, newCache) else newCache, rest) end - | NONE => + | _ => (* If this is the last reference and the next instruction is loading with a zero offset we can use an indexed load and avoid converting to an absolute address. If this is not the last reference it's likely that we're loading another field so it's probably better to convert the object index and cache it. We might manage to use a load-pair instruction. *) ( case (killThis, rest) of (true, {instr=LoadWithConstantOffset{ byteOffset=0, loadType=Load32, base, dest=destLoad, ... }, kill=killLoad, ...} :: next) => if base = destOiA (* of objectindex *) andalso member(doia, killLoad) then let val (destReg, newCache) = allocateGenReg(destLoad, NONE, cache) in (code <::> LoadWithIndexedOffset{ base=X24(*X_Base32in64*), dest=destReg, index=getAllocatedGenReg source, loadType=Load32, signExtendIndex=false }, - GenReg destReg, newCache, next) + destReg, newCache, next) end else let val (destReg, newCache) = allocateGenReg(destOiA, Array.sub(cacheHints, srcNo), cache) in (code <::> ObjectIndexAddressToAbsolute { source=getAllocatedGenReg source, dest=destReg}, - GenReg destReg, newCache, rest) + destReg, newCache, rest) end | _ => let val (destReg, newCache) = allocateGenReg(destOiA, Array.sub(cacheHints, srcNo), cache) in (code <::> ObjectIndexAddressToAbsolute { source=getAllocatedGenReg source, dest=destReg}, - GenReg destReg, newCache, rest) + destReg, newCache, rest) end ) - val () = if killThis then () else Array.update(cacheHints, srcNo, SOME destReg) + val () = if killThis then () else Array.update(cacheHints, srcNo, SOME(GenReg dReg)) in - absToConcrete(next, if killThis then newCache else (destReg, CacheAbsAddress source) :: newCache, newCode) + absToConcrete(next, if killThis then newCache else CacheAbsAddress{cached=source, cachedIn=dReg} :: newCache, newCode) end | absToConcrete({instr=AbsoluteToObjectIndex { source, dest}, ...} :: rest, cache, code) = let (* Don't make an entry in the cache for this; it won't be used. 
*) val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> AbsoluteToObjectIndex { source=getAllocatedGenReg source, dest=destReg}) end | absToConcrete({instr=AllocateMemoryFixed { bytesRequired, dest, saveRegs }, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) val filteredCache = pruneCache(GenReg X30, newCache) val saved = cachedAddressRegs filteredCache @ getSaveRegs saveRegs in absToConcrete(rest, filteredCache, code <::> AllocateMemoryFixed { dest=destReg, bytesRequired=bytesRequired, saveRegs=saved}) end | absToConcrete({instr=AllocateMemoryVariable{size, dest, saveRegs}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) val filteredCache = pruneCache(GenReg X30, newCache) val saved = cachedAddressRegs filteredCache @ getSaveRegs saveRegs in absToConcrete(rest, filteredCache, code <::> AllocateMemoryVariable{size=getAllocatedGenReg size, dest=destReg, saveRegs=saved}) end | absToConcrete({instr=InitialiseMem{size, addr, init}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> InitialiseMem{size=getAllocatedGenReg size, addr=getAllocatedGenReg addr, init=getAllocatedGenReg init}) | absToConcrete({instr=BeginLoop, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> BeginLoop) | absToConcrete({instr=JumpLoop{regArgs, stackArgs, checkInterrupt}, ...} :: rest, context, code) = let fun getStackArg{src, wordOffset, stackloc} = {src=getAllocatedArg src, wordOffset=wordOffset, stackloc=stackloc} and getRegArg{src, dst} = {src=getAllocatedArg src, dst=getAllocatedGenReg dst} in absToConcrete(rest, context, code <::> JumpLoop{ regArgs=map getRegArg regArgs, stackArgs=map getStackArg stackArgs, checkInterrupt=Option.map getSaveRegs checkInterrupt}) end | absToConcrete({instr=StoreWithConstantOffset { base, source, byteOffset, loadType}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> StoreWithConstantOffset{ base=getAllocatedGenReg base, source=getAllocatedGenReg source, byteOffset=byteOffset, loadType=loadType}) | absToConcrete({instr=StoreFPWithConstantOffset { base, source, byteOffset, floatSize}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> StoreFPWithConstantOffset{ base=getAllocatedGenReg base, source=getAllocatedFPReg source, byteOffset=byteOffset, floatSize=floatSize}) | absToConcrete({instr=StoreWithIndexedOffset { base, source, index, loadType, signExtendIndex}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> StoreWithIndexedOffset{ base=getAllocatedGenReg base, source=getAllocatedGenReg source, index=getAllocatedGenReg index, loadType=loadType, signExtendIndex=signExtendIndex}) | absToConcrete({instr=StoreFPWithIndexedOffset { base, source, index, floatSize, signExtendIndex}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> StoreFPWithIndexedOffset{ base=getAllocatedGenReg base, source=getAllocatedFPReg source, index=getAllocatedGenReg index, floatSize=floatSize, signExtendIndex=signExtendIndex}) | absToConcrete({instr=AddSubImmediate{ source, dest, ccRef, immed, isAdd, length }, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenRegOrZero(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> AddSubImmediate { source=getAllocatedGenReg source, dest=destReg, ccRef=ccRef, immed=immed, isAdd=isAdd, length=length}) end | absToConcrete({instr=AddSubRegister{ base, shifted, dest, ccRef, isAdd, length, 
shift}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenRegOrZero(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> AddSubRegister { base=getAllocatedGenReg base, shifted=getAllocatedGenReg shifted, dest=destReg, ccRef=ccRef, isAdd=isAdd, length=length, shift=shift}) end | absToConcrete({instr=LogicalImmediate{ source, dest, ccRef, immed, logOp, length }, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenRegOrZero(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> LogicalImmediate { source=getAllocatedGenReg source, dest=destReg, ccRef=ccRef, immed=immed, logOp=logOp, length=length}) end | absToConcrete({instr=LogicalRegister{ base, shifted, dest, ccRef, logOp, length, shift}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenRegOrZero(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> LogicalRegister { base=getAllocatedGenReg base, shifted=getAllocatedGenReg shifted, dest=destReg, ccRef=ccRef, logOp=logOp, length=length, shift=shift}) end | absToConcrete({instr=ShiftRegister{ direction, dest, source, shift, opSize}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> ShiftRegister { source=getAllocatedGenReg source, shift=getAllocatedGenReg shift, dest=destReg, direction=direction, opSize=opSize}) end | absToConcrete({instr=Multiplication{ kind, dest, sourceA, sourceM, sourceN }, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> Multiplication { kind=kind, sourceA=getAllocatedGenRegOrZero sourceA, sourceM=getAllocatedGenReg sourceM, sourceN=getAllocatedGenReg sourceN, dest=destReg}) end | absToConcrete({instr=Division{ isSigned, dest, dividend, divisor, opSize }, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> Division { isSigned=isSigned, dividend=getAllocatedGenReg dividend, divisor=getAllocatedGenReg divisor, dest=destReg, opSize=opSize}) end - | absToConcrete({instr=BeginFunction {regArgs, stackArgs}, ...} :: rest, _, code) = + | absToConcrete({instr=BeginFunction {regArgs, fpRegArgs, stackArgs}, ...} :: rest, _, code) = let (* Allocate the register arguments. At this point all the registers are free and the cache is empty. However we may have a "real conflict" that means that the allocated register is different. e.g. we need this argument some time after an arbitrary precision operation that may call a function. 
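The emitted BeginFunction therefore pairs the register actually allocated with the original argument register, so the code generator can see where each argument really arrives.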
*) fun allocReg(src, dst) = let val (destReg, _) = allocateNewDestination(src, SOME(GenReg dst), generalRegisters, []) in (asGenReg destReg, dst) end + fun allocFPReg(src, dst) = + let + val (destReg, _) = allocateNewDestination(src, SOME(FPReg dst), floatingPtRegisters, []) + in + (asFPReg destReg, dst) + end in absToConcrete(rest, [], code <::> - BeginFunction {regArgs=map allocReg regArgs, stackArgs=stackArgs}) + BeginFunction {regArgs=map allocReg regArgs, fpRegArgs=map allocFPReg fpRegArgs, stackArgs=stackArgs}) end - | absToConcrete({instr=FunctionCall{callKind, regArgs, stackArgs, dests, containers, saveRegs, ...}, ...} :: rest, _, code) = + | absToConcrete({instr=FunctionCall{callKind, regArgs, stackArgs, dests, fpRegArgs, fpDests, containers, saveRegs, ...}, ...} :: rest, _, code) = let fun getRegArg(src, dst) = (getAllocatedArg src, dst) - fun getResult(preg, reg) = + and getFPRegArg(src, dst) = (getAllocatedFPReg src, dst) + fun getResult allocReg (preg, reg) = let (* We empty the cache at this point. *) - val (newReg, _) = allocateGenReg(preg, NONE, []) + val (newReg, _) = allocReg(preg, NONE, []) in (newReg, reg) end in absToConcrete(rest, [] (* Empty after a function call. *), code <::> FunctionCall{ callKind=callKind, regArgs=map getRegArg regArgs, stackArgs=map getAllocatedArg stackArgs, - dests=map getResult dests, saveRegs=getSaveRegs saveRegs, containers=containers}) + dests=map (getResult allocateGenReg) dests, fpRegArgs=map getFPRegArg fpRegArgs, + fpDests=map (getResult allocateFPReg) fpDests, saveRegs=getSaveRegs saveRegs, containers=containers}) end - | absToConcrete({instr=TailRecursiveCall{callKind, regArgs, stackArgs, stackAdjust, currStackSize}, ...} :: rest, context, code) = + | absToConcrete({instr=TailRecursiveCall{callKind, regArgs, stackArgs, fpRegArgs, stackAdjust, currStackSize}, ...} :: rest, context, code) = let fun getRegArg(src, dst) = (getAllocatedArg src, dst) and getStackArg{src, stack} = {src=getAllocatedArg src, stack=stack} + and getFPRegArg(src, dst) = (getAllocatedFPReg src, dst) in absToConcrete(rest, context, code <::> - TailRecursiveCall{ callKind=callKind, regArgs=map getRegArg regArgs, + TailRecursiveCall{ callKind=callKind, regArgs=map getRegArg regArgs, fpRegArgs=map getFPRegArg fpRegArgs, stackArgs=map getStackArg stackArgs, stackAdjust=stackAdjust, currStackSize=currStackSize}) end - | absToConcrete({instr=ReturnResultFromFunction{results, returnReg, numStackArgs}, ...} :: rest, context, code) = + | absToConcrete({instr=ReturnResultFromFunction{results, fpResults, returnReg, numStackArgs}, ...} :: rest, context, code) = let fun getResult(preg, reg) = (getAllocatedGenReg preg, reg) + and getFPResult(preg, reg) = (getAllocatedFPReg preg, reg) in absToConcrete(rest, context, code <::> - ReturnResultFromFunction{results=map getResult results, returnReg=getAllocatedGenReg returnReg, numStackArgs=numStackArgs}) + ReturnResultFromFunction{results=map getResult results, fpResults=map getFPResult fpResults, + returnReg=getAllocatedGenReg returnReg, numStackArgs=numStackArgs}) end | absToConcrete({instr=RaiseExceptionPacket{packetReg}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> RaiseExceptionPacket{packetReg=getAllocatedGenReg packetReg}) | absToConcrete({instr=PushToStack{ source, container as StackLoc{size, rno}, copies }, ...} :: rest, cache, code) = let val srcReg = getAllocatedGenReg source val newCache = - if size = 1 then (GenReg srcReg, CacheStack container) :: cache else cache + if size = 1 then 
CacheStack{cached=container, cachedIn=srcReg} :: cache else cache val () = Array.update(cacheHints, rno, SOME(GenReg srcReg)) in absToConcrete(rest, newCache, code <::> PushToStack{source=srcReg, container=container, copies=copies}) end | absToConcrete({instr=LoadStack{ dest=destLoad, container as StackLoc{rno, ...} , field=0, wordOffset}, kill, ...} :: (restPlusOia as {instr=ObjectIndexAddressToAbsolute { source as PReg srcNo, dest=destOia}, kill=killOia, ...} :: rest), cache, code) = (* If a preg has been pushed to the stack every subsequent reference will be via the stack. If we want to be able to cache object index to absolute addresses for them we have to recognise this combination. *) (* They could be unrelated in which case process the LoadStack and then the ObjectIndex... It seems there are also rare circumstances(??) where the result of the load is not killed and so would have to be preserved. *) if destLoad = source andalso member(srcNo, killOia) then let val killThis = member(rno, kill) (* Is it the last reference to the stack entry? *) val (newCode, destReg, newCache) = - case List.find(fn (_, CacheAbsAddrOnStack c) => c=container | _ => false) cache of - SOME (srcReg, _) => + case List.find(fn CacheAbsAddrOnStack{cached, ...} => cached=container | _ => false) cache of + SOME (CacheAbsAddrOnStack{cachedIn, ...}) => let (* Try to use the cache register as the destination if we can. *) - val (destReg, newCache) = allocateNewDestination(destOia, SOME srcReg, generalRegisters, cache) + val (destReg, newCache) = allocateNewDestination(destOia, SOME(GenReg cachedIn), generalRegisters, cache) val dReg = asGenReg destReg - and sReg = asGenReg srcReg in - if checkCache - then (code <::> MoveRegister{source=sReg, dest=X17} <::> LoadStack{ dest=X16, container=container, field=0, wordOffset=wordOffset } <::> - ObjectIndexAddressToAbsolute { source=X16, dest=dReg} <::> CacheCheck{ arg1=dReg, arg2=X17 }, destReg, - if killThis then pruneCache(srcReg, newCache) else newCache) - else if dReg = sReg - then (code, destReg, newCache) (* We will have pruned this since it's the destination. *) - else (code <::> MoveRegister { source=sReg, dest=dReg}, destReg, - if killThis then pruneCache(srcReg, newCache) else newCache) + if dReg = cachedIn + then (code, dReg, newCache) (* We will have pruned this since it's the destination. *) + else (code <::> MoveRegister { source=cachedIn, dest=dReg}, dReg, + if killThis then pruneCache(GenReg cachedIn, newCache) else newCache) end - | NONE => (* It's not in the cache - load it which could be cached. *) + | _ => (* It's not in the cache - load it which could be cached. *) let val (cachePostLoad, loadCode) = processLoadStack(destLoad, container, wordOffset, kill, cache, code) val (destReg, cachePlusOia) = allocateGenReg(destOia, Array.sub(cacheHints, rno), cachePostLoad) in (loadCode <::> ObjectIndexAddressToAbsolute { source=getAllocatedGenReg source, dest=destReg}, - GenReg destReg, cachePlusOia) + destReg, cachePlusOia) end - val () = if killThis then () else Array.update(cacheHints, rno, SOME destReg) + val () = if killThis then () else Array.update(cacheHints, rno, SOME(GenReg destReg)) in - absToConcrete(rest, if killThis then newCache else (destReg, CacheAbsAddrOnStack container) :: newCache, newCode) + absToConcrete(rest, if killThis then newCache else CacheAbsAddrOnStack{cached=container, cachedIn=destReg} :: newCache, newCode) end else (* Can't combine these. 
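Process the LoadStack on its own and then deal with the ObjectIndexAddressToAbsolute separately.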
*) let val (newCache, newCode) = processLoadStack(destLoad, container, wordOffset, kill, cache, code) in absToConcrete(restPlusOia, newCache, newCode) end | absToConcrete({instr=LoadStack{ dest, container, wordOffset, field=0, ...}, kill, ...} :: rest, cache, code) = let val (newCache, newCode) = processLoadStack(dest, container, wordOffset, kill, cache, code) in absToConcrete(rest, newCache, newCode) end | absToConcrete({instr=LoadStack{ dest, container, field, wordOffset}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> LoadStack{ dest=destReg, container=container, field=field, wordOffset=wordOffset }) end | absToConcrete({instr=StoreToStack{source, container, field, stackOffset}, ...} :: rest, cache, code) = (* We may have cached the original push that cleared the container. We could cache this since it now contains the entry but it's probably better to deal with multiple results at a higher level. *) let val sReg = getAllocatedGenReg source val newCache = - List.filter(fn (_, CacheStack c) => c <> container | (_, CacheAbsAddrOnStack c) => c <> container | _ => true) cache + List.filter( + fn CacheStack{cached, ...} => cached <> container + | CacheAbsAddrOnStack{cached, ...} => cached <> container | _ => true) cache in absToConcrete(rest, newCache, code <::> StoreToStack{source=sReg, container=container, field=field, stackOffset=stackOffset}) end | absToConcrete({instr=ContainerAddress{ dest, container, stackOffset}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> ContainerAddress{ dest=destReg, container=container, stackOffset=stackOffset }) end | absToConcrete({instr=ResetStackPtr {numWords}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> ResetStackPtr {numWords=numWords}) | absToConcrete({instr=TagValue{source, dest, isSigned, opSize}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> TagValue{source=getAllocatedGenReg source, dest=destReg, isSigned=isSigned, opSize=opSize}) end | absToConcrete({instr=UntagValue{source, dest, isSigned, opSize, ...}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> UntagValue{source=getAllocatedGenReg source, dest=destReg, isSigned=isSigned, opSize=opSize}) end | absToConcrete({instr=BoxLarge{source, dest, saveRegs, ...}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) val filteredCache = pruneCache(GenReg X30, newCache) val saved = cachedAddressRegs filteredCache @ getSaveRegs saveRegs in absToConcrete(rest, filteredCache, code <::> BoxLarge{source=getAllocatedGenReg source, dest=destReg, saveRegs=saved}) end - | absToConcrete({instr=UnboxLarge{source, dest}, ...} :: rest, cache, code) = + | absToConcrete({instr=UnboxLarge{source as PReg rno, dest}, kill, ...} :: rest, cache, code) = + (* Unboxing a large word. We cache this in case we need to unbox it again. 
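If the same boxed value is unboxed again while the entry is still live the result is obtained with a register move rather than another load.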
*) let - val (destReg, newCache) = allocateGenReg(dest, NONE, cache) + val killThis = member(rno, kill) in - absToConcrete(rest, newCache, code <::> UnboxLarge{source=getAllocatedGenReg source, dest=destReg}) + case List.find(fn CacheLargeUnbox{cached, ...} => cached=source | _ => false) cache of + SOME(CacheLargeUnbox{cachedIn, ...}) => + let + val (destReg, newCache) = allocateNewDestination(dest, SOME(GenReg cachedIn), generalRegisters, cache) + val dReg = asGenReg destReg + in + absToConcrete(rest, + if killThis then pruneCache(GenReg cachedIn, newCache) else newCache, + code <::> MoveRegister { source=cachedIn, dest=dReg}) + end + | _ => + let + val (destReg, newCache) = allocateGenReg(dest, NONE, cache) + in + absToConcrete(rest, if killThis then newCache else CacheLargeUnbox{cached=source, cachedIn=destReg} :: newCache, + code <::> UnboxLarge{source=getAllocatedGenReg source, dest=destReg}) + end end | absToConcrete({instr=BoxTagFloat{floatSize, source, dest, saveRegs}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) val filteredCache = pruneCache(GenReg X30, newCache) val saved = cachedAddressRegs filteredCache @ getSaveRegs saveRegs in absToConcrete(rest, filteredCache, code <::> BoxTagFloat{floatSize=floatSize, source=getAllocatedFPReg source, dest=destReg, saveRegs=saved}) end - | absToConcrete({instr=UnboxTagFloat{floatSize, source, dest}, ...} :: rest, cache, code) = + | absToConcrete({instr=UnboxTagFloat{floatSize, source as PReg rno, dest}, kill, ...} :: rest, cache, code) = + (* Unboxing a large word. We cache this in case we need to unbox it again. *) let - val (destReg, newCache) = allocateFPReg(dest, NONE, cache) + val killThis = member(rno, kill) in - absToConcrete(rest, newCache, code <::> UnboxTagFloat{floatSize=floatSize, source=getAllocatedGenReg source, dest=destReg}) + case List.find(fn CacheRealUnbox{cached=s,floatSize=f, ...}=> s=source andalso f=floatSize | _ => false) cache of + SOME (CacheRealUnbox{cachedIn, ...}) => + let + val (destReg, newCache) = allocateNewDestination(dest, SOME (FPReg cachedIn), floatingPtRegisters, cache) + val dReg = asFPReg destReg + in + absToConcrete(rest, + if killThis then pruneCache(FPReg cachedIn, newCache) else newCache, + code <::> UnaryFloatingPt { source=cachedIn, dest=dReg, fpOp=case floatSize of Float32=>MoveFloat | Double64=>MoveDouble}) + end + | _ => + let + val (destReg, newCache) = allocateFPReg(dest, NONE, cache) + in + absToConcrete(rest, if killThis then newCache else CacheRealUnbox{cached=source, floatSize=floatSize, cachedIn=destReg} :: newCache, + code <::> UnboxTagFloat{floatSize=floatSize, source=getAllocatedGenReg source, dest=destReg}) + end end | absToConcrete({instr=LoadAcquire { base, dest, loadType}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> LoadAcquire { base=getAllocatedGenReg base, dest=destReg, loadType=loadType}) end | absToConcrete({instr=StoreRelease { base, source, loadType}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> StoreRelease{ base=getAllocatedGenReg base, source=getAllocatedGenReg source, loadType=loadType}) | absToConcrete({instr=BitFieldShift{source, dest, isSigned, length, immr, imms}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> BitFieldShift { source=getAllocatedGenReg source, dest=destReg, isSigned=isSigned, immr=immr, imms=imms, 
length=length}) end | absToConcrete({instr=BitFieldInsert{source, destAsSource, dest, length, immr, imms}, ...} :: rest, cache, code) = let val destAsSourceReg = getAllocatedGenReg destAsSource val (destReg, newCache) = allocateNewDestination(dest, SOME(GenReg destAsSourceReg), generalRegisters, cache) in absToConcrete(rest, newCache, code <::> BitFieldInsert { source=getAllocatedGenReg source, destAsSource=destAsSourceReg, dest=asGenReg destReg, immr=immr, imms=imms, length=length}) end | absToConcrete({instr=IndexedCaseOperation{testReg}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> IndexedCaseOperation{testReg=getAllocatedGenReg testReg}) | absToConcrete({instr=PushExceptionHandler, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> PushExceptionHandler) | absToConcrete({instr=PopExceptionHandler, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> PopExceptionHandler) | absToConcrete({instr=BeginHandler{packetReg}, ...} :: rest, _, code) = let (* The cache is undefined at the start of a handler. *) val (destReg, newCache) = allocateGenReg(packetReg, NONE, []) in absToConcrete(rest, newCache, code <::> BeginHandler{packetReg=destReg}) end | absToConcrete({instr=CompareByteVectors{vec1Addr, vec2Addr, length, ccRef}, ...} :: rest, cache, code) = let (* This instruction modifies these registers so they must be removed from the cache *) val vec1Reg = getAllocatedGenReg vec1Addr and vec2Reg = getAllocatedGenReg vec2Addr and lenReg = getAllocatedGenReg length val newCache = pruneCache(GenReg vec1Reg, pruneCache(GenReg vec2Reg, pruneCache(GenReg lenReg, cache))) in absToConcrete(rest, newCache, code <::> CompareByteVectors{vec1Addr=vec1Reg, vec2Addr=vec2Reg, length=lenReg, ccRef=ccRef}) end | absToConcrete({instr=BlockMove{srcAddr, destAddr, length, isByteMove}, ...} :: rest, cache, code) = let (* This instruction modifies these registers so they must be removed from the cache *) val srcAReg = getAllocatedGenReg srcAddr and dstAReg = getAllocatedGenReg destAddr and lenReg = getAllocatedGenReg length val newCache = pruneCache(GenReg srcAReg, pruneCache(GenReg dstAReg, pruneCache(GenReg lenReg, cache))) in absToConcrete(rest, newCache, code <::> BlockMove{srcAddr=srcAReg, destAddr=dstAReg, length=lenReg, isByteMove=isByteMove}) end | absToConcrete({instr=AddSubXSP{source, dest, isAdd}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenRegOrZero(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> AddSubXSP { source=getAllocatedGenReg source, dest=destReg, isAdd=isAdd}) end | absToConcrete({instr=TouchValue{source}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> TouchValue { source=getAllocatedGenReg source}) | absToConcrete({instr=LoadAcquireExclusive{ base, dest }, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> LoadAcquireExclusive { base=getAllocatedGenReg base, dest=destReg}) end | absToConcrete({instr=StoreReleaseExclusive{ base, source, result }, ...} :: rest, cache, code) = let val (resultReg, newCache) = allocateGenReg(result, NONE, cache) in absToConcrete(rest, newCache, code <::> StoreReleaseExclusive{ base=getAllocatedGenReg base, source=getAllocatedGenRegOrZero source, result=resultReg}) end | absToConcrete({instr=MemoryBarrier, ...} :: rest, cache, code) = absToConcrete(rest, cache, code <::> MemoryBarrier) | absToConcrete({instr=ConvertIntToFloat{ source, dest, srcSize, destSize}, 
...} :: rest, cache, code) = let val (destReg, newCache) = allocateFPReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> ConvertIntToFloat{ source=getAllocatedGenReg source, dest=destReg, srcSize=srcSize, destSize=destSize}) end | absToConcrete({instr=ConvertFloatToInt{ source, dest, srcSize, destSize, rounding}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> ConvertFloatToInt{ source=getAllocatedFPReg source, dest=destReg, srcSize=srcSize, destSize=destSize, rounding=rounding}) end + | absToConcrete({instr=UnaryFloatingPt{ source, dest, fpOp=MoveFloat}, ...} :: rest, cache, code) = + (* Register-register moves can be eliminated if the same resgister is used. *) + let + val srcReg = getAllocatedFPReg source + val (destReg, newCache) = allocateFPReg(dest, SOME(FPReg srcReg), cache) + in + absToConcrete(rest, newCache, + if destReg = srcReg then code else code <::> UnaryFloatingPt{ source=srcReg, dest=destReg, fpOp=MoveFloat}) + end + + | absToConcrete({instr=UnaryFloatingPt{ source, dest, fpOp=MoveDouble}, ...} :: rest, cache, code) = + let + val srcReg = getAllocatedFPReg source + val (destReg, newCache) = allocateFPReg(dest, SOME(FPReg srcReg), cache) + in + absToConcrete(rest, newCache, + if destReg = srcReg then code else code <::> UnaryFloatingPt{ source=srcReg, dest=destReg, fpOp=MoveDouble}) + end + | absToConcrete({instr=UnaryFloatingPt{ source, dest, fpOp}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateFPReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> UnaryFloatingPt{ source=getAllocatedFPReg source, dest=destReg, fpOp=fpOp}) end | absToConcrete({instr=BinaryFloatingPoint{ arg1, arg2, dest, fpOp, opSize}, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateFPReg(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> BinaryFloatingPoint{ arg1=getAllocatedFPReg arg1, arg2=getAllocatedFPReg arg2, dest=destReg, fpOp=fpOp, opSize=opSize}) end | absToConcrete({instr=CompareFloatingPoint{ arg1, arg2, opSize, ccRef}, ...} :: rest, context, code) = absToConcrete(rest, context, code <::> CompareFloatingPoint{ arg1=getAllocatedFPReg arg1, arg2=getAllocatedFPReg arg2, opSize=opSize, ccRef=ccRef}) | absToConcrete({instr=CPUYield, ...} :: rest, cache, code) = absToConcrete(rest, cache, code <::> CPUYield) | absToConcrete({instr=AtomicOperation{ base, source, dest, atOp }, ...} :: rest, cache, code) = let val (destReg, newCache) = allocateGenRegOrZero(dest, NONE, cache) in absToConcrete(rest, newCache, code <::> AtomicOperation{ base=getAllocatedGenReg base, source=getAllocatedGenRegOrZero source, dest=destReg, atOp=atOp }) end - | absToConcrete({instr=CacheCheck _, ...} :: _, _, _) = (* Concrete only. *) - raise InternalError "absToConcrete: CheckCache" - (* LoadStack. *) and processLoadStack(dest, container as StackLoc{rno, ...}, wordOffset, kill, cache, code) = let (* See if this is in the cache and use it if it is. If this is the last reference to this stack entry we don't want it in the cache any longer. 
*) val killThis = member(rno, kill) - val (newCode, destReg, newCache) = - case List.find(fn (_, CacheStack c) => c=container | _ => false) cache of - SOME (srcReg, _) => + val (newCode, destReg: xReg, newCache) = + case List.find(fn CacheStack{cached=c, ...} => c=container | _ => false) cache of + SOME (CacheStack{cachedIn, ...}) => let - val (destReg, newCache) = allocateNewDestination(dest, SOME srcReg, generalRegisters, cache) + val (destReg, newCache) = allocateNewDestination(dest, SOME(GenReg cachedIn), generalRegisters, cache) val dReg = asGenReg destReg - and sReg = asGenReg srcReg in - if checkCache - then (code <::> MoveRegister{source=sReg, dest=X17} <::> LoadStack{ dest=dReg, container=container, field=0, wordOffset=wordOffset } - <::> CacheCheck{ arg1=dReg, arg2=X17 }, destReg, if killThis then pruneCache(srcReg, newCache) else newCache) - else if dReg = sReg andalso false - then (code, destReg, newCache) (* We will have pruned this since it's the destination. *) - else (code <::> MoveRegister { source=sReg, dest=dReg}, destReg, - if killThis then pruneCache(srcReg, newCache) else newCache) + if dReg = cachedIn + then (code, dReg, newCache) (* We will have pruned this since it's the destination. *) + else (code <::> MoveRegister { source=cachedIn, dest=dReg}, dReg, + if killThis then pruneCache(GenReg cachedIn, newCache) else newCache) end - | NONE => + | _ => let val (destReg, newCache) = allocateGenReg(dest, Array.sub(cacheHints, rno), cache) in (code <::> LoadStack{ dest=destReg, container=container, field=0, wordOffset=wordOffset }, - GenReg destReg, newCache) + destReg, newCache) end - val () = if killThis then () else Array.update(cacheHints, rno, SOME destReg) + val () = if killThis then () else Array.update(cacheHints, rno, SOME(GenReg destReg)) in - (if killThis then newCache else (destReg, CacheStack container) :: newCache, newCode) + (if killThis then newCache else CacheStack{cached=container, cachedIn=destReg} :: newCache, newCode) end in fun concreteBlock(ExtendedBasicBlock{ block, ...}, inputCache) = let val (cache, code) = absToConcrete(block, inputCache, []) in {cache=cache, code=List.rev code} end val mergeCacheStates = mergeCacheStates end val numBlocks = Vector.length blocks (* The results. The cache state is initialised to empty so that if we have a loop we will end up with an empty input cache. *) val resultArray = Array.array(numBlocks, {code=[], cache=[]}) (* Process the blocks in execution order so that normally we will be able to propagate the cache states. If we have a loop the input cache state will be empty because the output cache state for an unprocessed block is empty. *) (* Get the blocks that are inputs for each one. *) local val blockRefs = Array.array(numBlocks, []) (* The successors of this block but only including handlers in SetHandler. 
*) fun directSuccessors ExitCode = [] | directSuccessors(IndexedBr cases) = cases | directSuccessors(Unconditional dest) = [dest] | directSuccessors(Conditional {trueJump, falseJump, ...}) = [falseJump, trueJump] | directSuccessors(SetHandler { handler, continue }) = [handler, continue] | directSuccessors(UnconditionalHandle _) = [] | directSuccessors(ConditionalHandle { continue, ...}) = [continue] fun setReferences fromBlock = let val ExtendedBasicBlock{ flow, ...} = Vector.sub(blocks, fromBlock) val refs = directSuccessors flow fun setRefs toBlock = let val oldRefs = Array.sub(blockRefs, toBlock) in Array.update(blockRefs, toBlock, fromBlock :: oldRefs); if null oldRefs then setReferences toBlock else () end in List.app setRefs refs end val () = setReferences 0 in val directSuccessors = directSuccessors val blockRefs = blockRefs end val processed = Array.array(numBlocks, false) fun haveProcessed n = Array.sub(processed, n) fun processBlocks (toDo: int list) = case List.filter (fn n => not(haveProcessed n)) toDo of [] => () | stillToDo as head :: _ => let (* Try to find a block all of whose predecessors have been processed. That increases the chances that we will have cached items. *) fun available dest = List.all haveProcessed (Array.sub(blockRefs, dest)) val blockNo = case List.find available stillToDo of SOME c => c | NONE => head val thisBlock as ExtendedBasicBlock { flow, ...} = Vector.sub(blocks, blockNo) (* Get the input cache state. Take the list of output caches of everything that jumps here and produce the intersection. *) val inputCacheList = List.map (fn n => #cache(Array.sub(resultArray, n))) (Array.sub(blockRefs, blockNo)) val inputCache = mergeCacheStates inputCacheList - val inputCache = [] (* Temporarily *) (* Process this block and add it to the results. *) val () = Array.update(processed, blockNo, true) val () = Array.update(resultArray, blockNo, concreteBlock(thisBlock, inputCache)) (* Add the successors but with handlers only included in SetHandler. *) val addSet = directSuccessors flow in processBlocks(addSet @ stillToDo) end in processBlocks [0]; (* If the failures list is empty we succeeded. *) case !failures of [] => (* Return the allocation vector. We may have unused registers, *) AllocateSuccess( Vector.mapi(fn (i, ExtendedBasicBlock{ flow, ...}) => BasicBlock{block= #code(Array.sub(resultArray, i)), flow=flow}) blocks ) (* Else we'll have to spill something. *) | l => AllocateFailure l end val nGenRegs = List.length generalRegisters structure Sharing = struct type intSet = intSet and extendedBasicBlock = extendedBasicBlock and ('genReg, 'optGenReg, 'fpReg) basicBlock = ('genReg, 'optGenReg, 'fpReg) basicBlock and regProperty = regProperty and reg = reg and xReg = xReg and vReg = vReg and allocateResult = allocateResult end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64Assembly.sml b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64Assembly.sml index 9efadc38..aba06ac9 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64Assembly.sml +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64Assembly.sml @@ -1,2732 +1,2730 @@ (* Copyright (c) 2021-2 David C. J. Matthews This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public Licence version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public Licence for more details. You should have received a copy of the GNU Lesser General Public Licence along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64Assembly ( structure Debug: DEBUG and Pretty: PRETTY and CodeArray: CODEARRAY ) : ARM64ASSEMBLY = struct open CodeArray Address val is32in64 = Address.wordSize = 0w4 val wordsPerNativeWord: word = Address.nativeWordSize div Address.wordSize local (* Almost every ARM64 platform is little-endian but it is possible to run it in big-endian mode. Instructions are always little-endian. The value of isBigEndian will be determined when the structure is constructed. That's not a problem since it will be built on the target machine. *) val isBigEndian: unit -> bool = RunCall.rtsCallFast1 "PolyIsBigEndian" in val isBigEndian = isBigEndian() end exception InternalError = Misc.InternalError infix 5 << <<+ <<- >> >>+ >>- ~>> ~>>+ ~>>- (* Shift operators *) infix 3 andb orb xorb andbL orbL xorbL andb8 orb8 xorb8 val op << = Word32.<< and op >> = Word32.>> and op ~>> = Word32.~>> and op andb = Word32.andb and op orb = Word32.orb val word32ToWord8 = Word8.fromLargeWord o Word32.toLargeWord and word8ToWord32 = Word32.fromLargeWord o Word8.toLargeWord and word32ToWord = Word.fromLargeWord o Word32.toLargeWord and wordToWord32 = Word32.fromLargeWord o Word.toLargeWord and word8ToWord = Word.fromLargeWord o Word8.toLargeWord (* The maximum positive number that will fit in a signed "bits" field. *) fun maxSigned bits = Word.<<(0w1, bits-0w1) - 0w1 fun willFitInRange(offset, bits) = offset <= Word.toInt(maxSigned bits) andalso offset >= ~ (Word.toInt(maxSigned bits)) - 1 (* XReg is used for fixed point registers since X0 and W0 are the same register. *) datatype xReg = XReg of Word8.word | XZero | XSP (* VReg is used for the floating point registers since V0, D0 and S0 are the same register. *) and vReg = VReg of Word8.word (* A Label is a ref that is later set to the location. Several labels can be linked together so that they are only set at a single point. Only forward jumps are linked so when we come to finally set the label we will have the full list. *) type labels = Word.word ref list ref (* Condition codes. *) (* N.B. On subtraction and comparison the ARM uses an inverted carry flag for borrow. The C flag is set if there is NO borrow. This is the reverse of the X86. *) datatype condition = CondEqual (* Z=1 *) | CondNotEqual (* Z=0 *) | CondCarrySet (* C=1 *) | CondCarryClear (* C=0 *) | CondNegative (* N=1 *) | CondPositive (* N=0 imcludes zero *) | CondOverflow (* V=1 *) | CondNoOverflow (* V=0 *) | CondUnsignedHigher (* C=1 && Z=0 *) | CondUnsignedLowOrEq (* ! (C=1 && Z=0) *) | CondSignedGreaterEq (* N=V *) | CondSignedLess (* N<>V *) | CondSignedGreater (* Z==0 && N=V *) | CondSignedLessEq (* !(Z==0 && N=V) *) (* The negation of a test just involves inverting the bottom bit. 
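e.g. EQ (0wx0) and NE (0wx1) differ only in the bottom bit, as do CS (0wx2) and CC (0wx3).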
*) fun invertTest CondEqual = CondNotEqual | invertTest CondNotEqual = CondEqual | invertTest CondCarrySet = CondCarryClear | invertTest CondCarryClear = CondCarrySet | invertTest CondNegative = CondPositive | invertTest CondPositive = CondNegative | invertTest CondOverflow = CondNoOverflow | invertTest CondNoOverflow = CondOverflow | invertTest CondUnsignedHigher = CondUnsignedLowOrEq | invertTest CondUnsignedLowOrEq = CondUnsignedHigher | invertTest CondSignedGreaterEq = CondSignedLess | invertTest CondSignedLess = CondSignedGreaterEq | invertTest CondSignedGreater = CondSignedLessEq | invertTest CondSignedLessEq = CondSignedGreater fun condToString CondEqual = "EQ" | condToString CondNotEqual = "NE" | condToString CondCarrySet = "CS" | condToString CondCarryClear = "CC" | condToString CondNegative = "MI" | condToString CondPositive = "PL" | condToString CondOverflow = "VS" | condToString CondNoOverflow = "VC" | condToString CondUnsignedHigher = "HI" | condToString CondUnsignedLowOrEq = "LS" | condToString CondSignedGreaterEq = "GE" | condToString CondSignedLess = "LT" | condToString CondSignedGreater = "GT" | condToString CondSignedLessEq = "LE" (* Condition codes to binary encoding. *) fun cCode CondEqual = 0wx0: Word32.word | cCode CondNotEqual = 0wx1 | cCode CondCarrySet = 0wx2 (* C=1 *) | cCode CondCarryClear = 0wx3 (* C=0 *) | cCode CondNegative = 0wx4 (* N=1 *) | cCode CondPositive = 0wx5 (* N=0 imcludes zero *) | cCode CondOverflow = 0wx6 (* V=1 *) | cCode CondNoOverflow = 0wx7 (* V=0 *) | cCode CondUnsignedHigher = 0wx8 (* C=1 && Z=0 *) | cCode CondUnsignedLowOrEq = 0wx9 (* ! (C=1 && Z=0) *) | cCode CondSignedGreaterEq = 0wxa (* N=V *) | cCode CondSignedLess = 0wxb (* N<>V *) | cCode CondSignedGreater = 0wxc (* Z==0 && N=V *) | cCode CondSignedLessEq = 0wxd (* !(Z==0 && N=V) *) (* Offsets in the assembly code interface pointed at by X26 These are in units of 64-bits NOT bytes. *) val heapOverflowCallOffset = 1 and stackOverflowCallOffset = 2 and stackOverflowXCallOffset= 3 and exceptionHandlerOffset = 5 and stackLimitOffset = 6 and exceptionPacketOffset = 7 and threadIdOffset = 8 and heapLimitPtrOffset = 42 and heapAllocPtrOffset = 43 and mlStackPtrOffset = 44 (* 31 in the register field can either mean the zero register or the hardware stack pointer. Which meaning depends on the instruction. *) fun xRegOrXZ(XReg w) = w | xRegOrXZ XZero = 0w31 | xRegOrXZ XSP = raise InternalError "XSP not valid here" and xRegOrXSP(XReg w) = w | xRegOrXSP XZero = raise InternalError "XZero not valid here" | xRegOrXSP XSP = 0w31 (* There are cases where it isn't clear. *) and xRegOnly (XReg w) = w | xRegOnly XZero = raise InternalError "XZero not valid here" | xRegOnly XSP = raise InternalError "XSP not valid here" val X0 = XReg 0w0 and X1 = XReg 0w1 and X2 = XReg 0w2 and X3 = XReg 0w3 and X4 = XReg 0w4 and X5 = XReg 0w5 and X6 = XReg 0w6 and X7 = XReg 0w7 and X8 = XReg 0w8 and X9 = XReg 0w9 and X10= XReg 0w10 and X11 = XReg 0w11 and X12 = XReg 0w12 and X13 = XReg 0w13 and X14= XReg 0w14 and X15 = XReg 0w15 and X16 = XReg 0w16 and X17 = XReg 0w17 and X18= XReg 0w18 and X19 = XReg 0w19 and X20 = XReg 0w20 and X21 = XReg 0w21 and X22= XReg 0w22 and X23 = XReg 0w23 and X24 = XReg 0w24 and X25 = XReg 0w25 and X26= XReg 0w26 and X27 = XReg 0w27 and X28 = XReg 0w28 and X29 = XReg 0w29 and X30= XReg 0w30 val X_MLHeapLimit = X25 (* ML Heap limit pointer *) and X_MLAssemblyInt = X26 (* ML assembly interface pointer. *) and X_MLHeapAllocPtr = X27 (* ML Heap allocation pointer. 
*) and X_MLStackPtr = X28 (* ML Stack pointer. *) and X_LinkReg = X30 (* Link reg - return address *) and X_Base32in64 = X24 (* X24 is used for the heap base in 32-in-64. *) fun vReg(VReg v) = v (* Only the first eight registers are currently used by ML. *) val V0 = VReg 0w0 and V1 = VReg 0w1 and V2 = VReg 0w2 and V3 = VReg 0w3 and V4 = VReg 0w4 and V5 = VReg 0w5 and V6 = VReg 0w6 and V7 = VReg 0w7 (* Some data instructions include a possible shift. *) datatype shiftType = ShiftLSL of Word8.word | ShiftLSR of Word8.word | ShiftASR of Word8.word | ShiftNone local fun checkImm6 w = if w > 0w63 then raise InternalError "shift > 63" else w in fun shiftEncode(ShiftLSL w) = (0w0, checkImm6 w) | shiftEncode(ShiftLSR w) = (0w1, checkImm6 w) | shiftEncode(ShiftASR w) = (0w2, checkImm6 w) | shiftEncode ShiftNone = (0w0, 0w0) end (* Other instructions include an extension i.e. a sign- or zero-extended value from one of the argument registers. When an extension is encoded there can also be a left shift which applies after the extension. I don't understand what difference, if any, there is between UXTX and SXTX. There's no ExtNone because we need to use either UXTW or UXTX depending on the length *) datatype 'a extend = ExtUXTB of 'a (* Unsigned extend byte *) | ExtUXTH of 'a (* Unsigned extend byte *) | ExtUXTW of 'a (* Unsigned extend byte *) | ExtUXTX of 'a (* Left shift *) | ExtSXTB of 'a (* Sign extend byte *) | ExtSXTH of 'a (* Sign extend halfword *) | ExtSXTW of 'a (* Sign extend word *) | ExtSXTX of 'a (* Left shift *) (* Load/store instructions have only a single bit for the shift. For byte operations this is one bit shift; for others it scales by the size of the operand if set. *) datatype scale = ScaleOrShift | NoScale local (* Although there are three bits it seems that the shift is limited to 0 to 4. *) fun checkImm3 w = if w > 0w4 then raise InternalError "extend shift > 4" else w in fun extendArithEncode(ExtUXTB w) = (0w0, checkImm3 w) | extendArithEncode(ExtUXTH w) = (0w1, checkImm3 w) | extendArithEncode(ExtUXTW w) = (0w2, checkImm3 w) | extendArithEncode(ExtUXTX w) = (0w3, checkImm3 w) | extendArithEncode(ExtSXTB w) = (0w4, checkImm3 w) | extendArithEncode(ExtSXTH w) = (0w5, checkImm3 w) | extendArithEncode(ExtSXTW w) = (0w6, checkImm3 w) | extendArithEncode(ExtSXTX w) = (0w7, checkImm3 w) fun extendLSEncode(ExtUXTB v) = (0w0, v) | extendLSEncode(ExtUXTH v) = (0w1, v) | extendLSEncode(ExtUXTW v) = (0w2, v) | extendLSEncode(ExtUXTX v) = (0w3, v) | extendLSEncode(ExtSXTB v) = (0w4, v) | extendLSEncode(ExtSXTH v) = (0w5, v) | extendLSEncode(ExtSXTW v) = (0w6, v) | extendLSEncode(ExtSXTX v) = (0w7, v) end datatype wordSize = WordSize32 | WordSize64 (* Bit patterns on the ARM64 are encoded using a complicated scheme and only certain values can be encoded. An element can be 2, 4, 8, 16, 32 or 64 bits and must be a sequence of at least one zero bits followed by at least one one bit. This sequence can then be rotated within the element. Finally the element is replicated within the register up to 32 or 64 bits. All this information is encoded in 13 bits. N.B. Bit patterns of all zeros or all ones cannot be encoded. *) (* Encode the value if it is possible. *) fun encodeBitPattern(value, sf (* size flag *)) = (* Can't encode 0 or all ones. 
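For example, a value such as 0wx00FF00FF00FF00FF can be encoded (a 16-bit element with eight low-order ones and no rotation, replicated four times), whereas an arbitrary value such as 0wx12345678 cannot.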
*) if value = 0w0 orelse value = Word64.notb 0w0 then NONE (* If this is 32-bits we can't encode all ones in the low-order 32-bits or any value that won't fit in 32-bits, *) else if sf = WordSize32 andalso value >= 0wxffffffff then NONE else let val regSize = case sf of WordSize32 => 0w32 | WordSize64 => 0w64 (* Get the element size. Look for the repeat of the pattern. *) fun getElemSize size = let val ns = size div 0w2 val mask = Word64.<<(0w1, ns) - 0w1 in if Word64.andb(value, mask) <> Word64.andb(Word64.>>(value, ns), mask) then size else if ns <= 0w2 then ns else getElemSize ns end val elemSize = getElemSize regSize fun log2 0w1 = 0w0 | log2 n = 0w1 + log2(Word.>>(n, 0w1)) val elemBits = log2 elemSize (* Find the rotation that puts as many of the zero bits in the element at the top. *) val elemMask = Word64.>>(Word64.notb 0w0, 0w64-elemSize) fun ror elt = Word64.orb((Word64.<<(Word64.andb(elt, 0w1), elemSize-0w1), Word64.>>(elt, 0w1))) and rol elt = Word64.orb(Word64.andb(elemMask, Word64.<<(elt, 0w1)), Word64.>>(elt, elemSize-0w1)) fun findRotation(v, n) = if ror v < v then findRotation(ror v, (n-0w1) mod elemSize) else if rol v < v then findRotation(rol v, n+0w1) else (v, n) val (rotated, rotation) = findRotation(Word64.andb(value, elemMask), 0w0) (* Count out the low order ones. If the result is zero then we;ve got a valid sequence of zeros followed by ones but if we discover a zero bit and the result isn't zero then we can't encode this. *) fun countLowOrderOnes(v, n) = if v = 0w0 then SOME n else if Word64.andb(v, 0w1) = 0w1 then countLowOrderOnes(Word64.>>(v, 0w1), n+0w1) else NONE in case countLowOrderOnes(rotated, 0w0) of NONE => NONE | SOME lowOrderOnes => let (* Encode the element size. *) val elemSizeEnc = 0wx7f - (Word.<<(0w1, elemBits+0w1) - 0w1) val n = if Word.andb(elemSizeEnc, 0wx40) = 0w0 then 0w1 else 0w0 val imms = Word.andb(Word.orb(elemSizeEnc, lowOrderOnes-0w1), 0wx3f) in SOME{n=n, imms=imms, immr=rotation} end end; (* Decode a pattern for printing. *) fun decodeBitPattern{sf, n, immr, imms} = let (* Find the highest bit set in N:NOT(imms) *) fun highestBitSet 0w0 = 0 | highestBitSet n = 1+highestBitSet(Word32.>>(n, 0w1)) val len = highestBitSet(Word32.orb(Word32.<<(n, 0w6), Word32.xorb(imms, 0wx3f))) - 1 val _ = if len < 0 then raise InternalError "decodeBitPattern: invalid" else () val size = Word32.<<(0w1, Word.fromInt len) val r = Word32.andb(immr, size-0w1) and s = Word32.andb(imms, size-0w1) val _ = if s = size-0w1 then raise InternalError "decodeBitPattern: invalid" else () val pattern = Word64.<<(0w1, word32ToWord(s+0w1)) - 0w1 (* Rotate right: shift left and put the top bit in the high order bit*) fun ror elt = Word64.orb((Word64.<<(Word64.andb(elt, 0w1), word32ToWord(size-0w1)), Word64.>>(elt, 0w1))) fun rotateBits(value, 0w0) = value | rotateBits(value, n) = rotateBits(ror value, n-0w1) val rotated = rotateBits(pattern, r) val regSize = if sf = 0w0 then 0w32 else 0w64 (* Replicate the rotated pattern to fill the register. 
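The element is OR-ed with copies of itself shifted left by its own width, doubling the width each time, until it fills the 32- or 64-bit register.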
*) fun replicate(pattern, size) = if size >= regSize then pattern else replicate(Word64.orb(pattern, Word64.<<(pattern, word32ToWord size)), size * 0w2) in replicate(rotated, size) end val isEncodableBitPattern = isSome o encodeBitPattern datatype instr = SimpleInstr of Word32.word | LoadAddressLiteral of {reg: xReg, value: machineWord, length: brLength ref} | LoadNonAddressLiteral of {reg: xReg, value: Word64.word, length: brLength ref} | LoadFPLiteral of {reg: vReg, value: Word64.word, length: brLength ref, isDouble: bool, work: xReg} | Label of labels | UnconditionalBranch of {label: labels, andLink: bool} | ConditionalBranch of { label: labels, jumpCondition: condition, length: brLength ref } | LoadLabelAddress of { label: labels, reg: xReg, length: brLength ref } | TestBitBranch of { label: labels, bitNo: Word8.word, brNonZero: bool, reg: xReg, length: brLength ref } | CompareBranch of { label: labels, brNonZero: bool, size: wordSize, reg: xReg, length: brLength ref } and brLength = BrShort | BrExtended val nopCode = 0wxD503201F and undefCode = 0wx00000000 (* Permanently undefined instruction. *) (* Add/subtract an optionally shifted 12-bit immediate (i.e. constant) to/from a register. The constant is zero-extended. The versions that do not set the flags can use XSP as the destination; the versions that use the signs can use XZero as the destination i.e. they discard the result and act as a comparison. *) local fun addSubRegImmediate(sf, oper, s, xdOp) ({regN, regD, immed, shifted}) = let val () = if immed >= 0wx1000 then raise InternalError "addSubRegImmediate: immed > 12 bits" else () in SimpleInstr( 0wx11000000 orb (sf << 0w31) orb (oper << 0w30) orb (s << 0w29) orb (if shifted then 0wx400000 else 0w0) orb (wordToWord32 immed << 0w10) orb (word8ToWord32(xRegOrXSP regN) << 0w5) orb word8ToWord32(xdOp regD)) end in val addImmediate = addSubRegImmediate(0w1, 0w0, 0w0, xRegOrXSP) and addSImmediate = addSubRegImmediate(0w1, 0w0, 0w1, xRegOrXZ) and subImmediate = addSubRegImmediate(0w1, 0w1, 0w0, xRegOrXSP) and subSImmediate = addSubRegImmediate(0w1, 0w1, 0w1, xRegOrXZ) and addImmediate32 = addSubRegImmediate(0w0, 0w0, 0w0, xRegOrXSP) and addSImmediate32 = addSubRegImmediate(0w0, 0w0, 0w1, xRegOrXZ) and subImmediate32 = addSubRegImmediate(0w0, 0w1, 0w0, xRegOrXSP) and subSImmediate32 = addSubRegImmediate(0w0, 0w1, 0w1, xRegOrXZ) end (* Add/subtract a shifted register, optionally setting the flags. *) local (* X31 is XZ here unlike the extended version.*) fun addSubtractShiftedReg (sf, oper, s) ({regM, regN, regD, shift}) = let val (shift, imm6) = shiftEncode shift in SimpleInstr(0wx0b000000 orb (sf << 0w31) orb (oper << 0w30) orb (s << 0w29) orb (shift << 0w22) orb (word8ToWord32(xRegOnly regM) << 0w16) orb (word8ToWord32 imm6 << 0w10) orb (word8ToWord32(xRegOrXZ regN) << 0w5) orb word8ToWord32(xRegOrXZ regD)) end in val addShiftedReg = addSubtractShiftedReg(0w1, 0w0, 0w0) and addSShiftedReg = addSubtractShiftedReg(0w1, 0w0, 0w1) and subShiftedReg = addSubtractShiftedReg(0w1, 0w1, 0w0) and subSShiftedReg = addSubtractShiftedReg(0w1, 0w1, 0w1) and addShiftedReg32 = addSubtractShiftedReg(0w0, 0w0, 0w0) and addSShiftedReg32 = addSubtractShiftedReg(0w0, 0w0, 0w1) and subShiftedReg32 = addSubtractShiftedReg(0w0, 0w1, 0w0) and subSShiftedReg32 = addSubtractShiftedReg(0w0, 0w1, 0w1) end (* Add/subtract an extended register, optionally setting the flags. *) local (* SP can be used as Xn and also for Xd for the non-flags versions. 
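The shifted-register forms above always treat register 31 as XZR, so this is the form to use when, for example, adding a zero- or sign-extended index register directly to the stack pointer.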
*) fun addSubtractExtendedReg (sf, oper, s, opt, xD) ({regM, regN, regD, extend}) = let val (option, imm3) = extendArithEncode extend in SimpleInstr(0wx0b200000 orb (sf << 0w31) orb (oper << 0w30) orb (s << 0w29) orb (opt << 0w22) orb (word8ToWord32(xRegOnly regM) << 0w16) orb (option << 0w13) orb (word8ToWord32 imm3 << 0w10) orb (word8ToWord32(xRegOrXSP regN) << 0w5) orb word8ToWord32(xD regD)) end in val addExtendedReg = addSubtractExtendedReg(0w1, 0w0, 0w0, 0w0, xRegOrXSP) and addSExtendedReg = addSubtractExtendedReg(0w1, 0w0, 0w1, 0w0, xRegOrXZ) and subExtendedReg = addSubtractExtendedReg(0w1, 0w1, 0w0, 0w0, xRegOrXSP) and subSExtendedReg = addSubtractExtendedReg(0w1, 0w1, 0w1, 0w0, xRegOrXZ) end (* Logical operations on a shifted register. *) local fun logicalShiftedReg (sf, oper, n) ({regM, regN, regD, shift}) = let val (shift, imm6) = shiftEncode shift in SimpleInstr(0wx0a000000 orb (sf << 0w31) orb (oper << 0w29) orb (shift << 0w22) orb (n << 0w21) orb (word8ToWord32(xRegOrXZ regM) << 0w16) orb (word8ToWord32 imm6 << 0w10) orb (word8ToWord32(xRegOrXZ regN) << 0w5) orb word8ToWord32(xRegOrXZ regD)) end in val andShiftedReg = logicalShiftedReg(0w1, 0w0, 0w0) and orrShiftedReg = logicalShiftedReg(0w1, 0w1, 0w0) and eorShiftedReg = logicalShiftedReg(0w1, 0w2, 0w0) and andsShiftedReg = logicalShiftedReg(0w1, 0w3, 0w0) val andShiftedReg32 = logicalShiftedReg(0w0, 0w0, 0w0) and orrShiftedReg32 = logicalShiftedReg(0w0, 0w1, 0w0) and eorShiftedReg32 = logicalShiftedReg(0w0, 0w2, 0w0) and andsShiftedReg32 = logicalShiftedReg(0w0, 0w3, 0w0) (* There are also versions that operate with an inverted version of the argument. *) end (* Two-source operations. *) local fun twoSourceInstr (sf, s, opcode) ({regM, regN, regD}) = SimpleInstr(0wx1ac00000 orb (sf << 0w31) orb (s << 0w29) orb (word8ToWord32(xRegOnly regM) << 0w16) orb (opcode << 0w10) orb (word8ToWord32(xRegOnly regN) << 0w5) orb word8ToWord32(xRegOnly regD)) in (* Signed and unsigned division. *) val unsignedDivide = twoSourceInstr(0w1, 0w0, 0wx2) and signedDivide = twoSourceInstr(0w1, 0w0, 0wx3) and unsignedDivide32 = twoSourceInstr(0w0, 0w0, 0wx2) and signedDivide32 = twoSourceInstr(0w0, 0w0, 0wx3) (* Logical shift left Rd = Rn << (Rm mod 0w64) *) and logicalShiftLeftVariable = twoSourceInstr(0w1, 0w0, 0wx8) (* Logical shift right Rd = Rn >> (Rm mod 0w64) *) and logicalShiftRightVariable = twoSourceInstr(0w1, 0w0, 0wx9) (* Arithmetic shift right Rd = Rn ~>> (Rm mod 0w64) *) and arithmeticShiftRightVariable = twoSourceInstr(0w1, 0w0, 0wxa) and logicalShiftLeftVariable32 = twoSourceInstr(0w0, 0w0, 0wx8) and logicalShiftRightVariable32 = twoSourceInstr(0w0, 0w0, 0wx9) and arithmeticShiftRightVariable32 = twoSourceInstr(0w0, 0w0, 0wxa) end (* Three source operations. These are all variations of multiply. *) local fun threeSourceInstr (sf, op54, op31, o0) ({regM, regA, regN, regD}) = SimpleInstr(0wx1b000000 orb (sf << 0w31) orb (op54 << 0w29) orb (op31 << 0w21) orb (word8ToWord32(xRegOnly regM) << 0w16) orb (o0 << 0w15) orb (word8ToWord32(xRegOrXZ regA) << 0w10) orb (word8ToWord32(xRegOnly regN) << 0w5) orb word8ToWord32(xRegOnly regD)) in (* regD = regA + regN * regM *) val multiplyAndAdd = threeSourceInstr(0w1, 0w0, 0w0, 0w0) (* regD = regA - regN * regM *) and multiplyAndSub = threeSourceInstr(0w1, 0w0, 0w0, 0w1) and multiplyAndAdd32 = threeSourceInstr(0w0, 0w0, 0w0, 0w0) and multiplyAndSub32 = threeSourceInstr(0w0, 0w0, 0w0, 0w1) (* Multiply two 32-bit quantities and add/subtract a 64-bit quantity. 
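That is, regD = regA + regN * regM (or regA - regN * regM for the Sub form) where regN and regM are taken as signed 32-bit values and regA and regD are full 64-bit values.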
*) and signedMultiplyAndAddLong = threeSourceInstr(0w1, 0w0, 0w1, 0w0) and signedMultiplyAndSubLong = threeSourceInstr(0w1, 0w0, 0w1, 0w1) (* Return the high-order part of a signed multiplication. *) fun signedMultiplyHigh({regM, regN, regD}) = threeSourceInstr(0w1, 0w0, 0w2, 0w0) { regM=regM, regN=regN, regD=regD, regA=XZero} end (* Loads: There are two versions of this on the ARM. There is a version that takes a signed 9-bit byte offset and a version that takes an unsigned 12-bit word offset. *) local fun loadStoreRegScaled (size, v, opc, xD) ({regT, regN, unitOffset}) = let val _ = (unitOffset >= 0 andalso unitOffset < 0x1000) orelse raise InternalError "loadStoreRegScaled: value out of range" in SimpleInstr(0wx39000000 orb (size << 0w30) orb (opc << 0w22) orb (v << 0w26) orb (Word32.fromInt unitOffset << 0w10) orb (word8ToWord32(xRegOrXSP regN) << 0w5) orb word8ToWord32(xD regT)) end in val loadRegScaled = loadStoreRegScaled(0w3, 0w0, 0w1, xRegOrXZ) and storeRegScaled = loadStoreRegScaled(0w3, 0w0, 0w0, xRegOrXZ) (* (Unsigned) byte operations. There are also signed versions. *) and loadRegScaledByte = loadStoreRegScaled (0w0, 0w0, 0w1, xRegOrXZ) and storeRegScaledByte = loadStoreRegScaled (0w0, 0w0, 0w0, xRegOrXZ) and loadRegScaled16 = loadStoreRegScaled (0w1, 0w0, 0w1, xRegOrXZ) and storeRegScaled16 = loadStoreRegScaled (0w1, 0w0, 0w0, xRegOrXZ) and loadRegScaled32 = loadStoreRegScaled (0w2, 0w0, 0w1, xRegOrXZ) and storeRegScaled32 = loadStoreRegScaled (0w2, 0w0, 0w0, xRegOrXZ) and loadRegScaledDouble = loadStoreRegScaled(0w3, 0w1, 0w1, vReg) and storeRegScaledDouble = loadStoreRegScaled(0w3, 0w1, 0w0, vReg) and loadRegScaledFloat = loadStoreRegScaled(0w2, 0w1, 0w1, vReg) and storeRegScaledFloat = loadStoreRegScaled(0w2, 0w1, 0w0, vReg) end local (* Loads and stores with a signed byte offset. This includes simple unscaled addresses, pre-indexing and post-indexing. *) fun loadStoreByteAddress (op4, xD) (size, v, opc) ({regT, regN, byteOffset}) = let val _ = (byteOffset >= ~256 andalso byteOffset < 256) orelse raise InternalError "loadStoreUnscaled: value out of range" val imm9 = Word32.fromInt byteOffset andb 0wx1ff in SimpleInstr(0wx38000000 orb (size << 0w30) orb (opc << 0w22) orb (v << 0w26) orb (imm9 << 0w12) orb (op4 << 0w10) orb (word8ToWord32(xRegOrXSP regN) << 0w5) orb word8ToWord32(xD regT)) end val loadStoreUnscaled = loadStoreByteAddress (0w0, xRegOrXZ) and loadStoreUnscaledSIMD = loadStoreByteAddress (0w0, vReg) and loadStorePostIndex = loadStoreByteAddress (0w1, xRegOrXZ) and loadStorePreIndex = loadStoreByteAddress (0w3, xRegOrXZ) in val loadRegUnscaled = loadStoreUnscaled (0w3, 0w0, 0w1) and storeRegUnscaled = loadStoreUnscaled (0w3, 0w0, 0w0) (* (Unsigned) byte operations. There are also signed versions. 
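For example, loadRegUnscaledByte below zero-extends the loaded byte (ldurb) while loadRegUnscaledSignedByteTo64 sign-extends it to 64 bits (ldursb).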
*) and loadRegUnscaledByte = loadStoreUnscaled (0w0, 0w0, 0w1) and loadRegUnscaledSignedByteTo64 = loadStoreUnscaled (0w0, 0w0, 0w2) and loadRegUnscaledSignedByteTo32 = loadStoreUnscaled (0w0, 0w0, 0w3) and storeRegUnscaledByte = loadStoreUnscaled (0w0, 0w0, 0w0) and loadRegUnscaled16 = loadStoreUnscaled (0w1, 0w0, 0w1) and loadRegUnscaledSigned16To64 = loadStoreUnscaled (0w1, 0w0, 0w2) and loadRegUnscaledSigned16To32 = loadStoreUnscaled (0w1, 0w0, 0w3) and storeRegUnscaled16 = loadStoreUnscaled (0w1, 0w0, 0w0) and loadRegUnscaled32 = loadStoreUnscaled (0w2, 0w0, 0w1) and loadRegUnscaledSigned32To64 = loadStoreUnscaled (0w2, 0w0, 0w2) and storeRegUnscaled32 = loadStoreUnscaled (0w2, 0w0, 0w0) and loadRegUnscaledFloat = loadStoreUnscaledSIMD (0w2, 0w1, 0w1) and storeRegUnscaledFloat = loadStoreUnscaledSIMD (0w2, 0w1, 0w0) and loadRegUnscaledDouble = loadStoreUnscaledSIMD (0w3, 0w1, 0w1) and storeRegUnscaledDouble = loadStoreUnscaledSIMD (0w3, 0w1, 0w0) val loadRegPostIndex = loadStorePostIndex (0w3, 0w0, 0w1) and storeRegPostIndex = loadStorePostIndex (0w3, 0w0, 0w0) and loadRegPostIndex32 = loadStorePostIndex (0w2, 0w0, 0w1) and storeRegPostIndex32 = loadStorePostIndex (0w2, 0w0, 0w0) and loadRegPostIndexByte = loadStorePostIndex (0w0, 0w0, 0w1) and storeRegPostIndexByte = loadStorePostIndex (0w0, 0w0, 0w0) val loadRegPreIndex = loadStorePreIndex (0w3, 0w0, 0w1) and storeRegPreIndex = loadStorePreIndex (0w3, 0w0, 0w0) and loadRegPreIndex32 = loadStorePreIndex (0w2, 0w0, 0w1) and storeRegPreIndex32 = loadStorePreIndex (0w2, 0w0, 0w0) and loadRegPreIndexByte = loadStorePreIndex (0w0, 0w0, 0w1) and storeRegPreIndexByte = loadStorePreIndex (0w0, 0w0, 0w0) end (* Load/store with a register offset i.e. an index register. *) local fun loadStoreRegRegisterOffset (size, v, opc, xD) ({regT, regN, regM, option}) = let val (opt, s) = case extendLSEncode option of (opt, ScaleOrShift) => (opt, 0w1) | (opt, NoScale) => (opt, 0w0) in SimpleInstr(0wx38200800 orb (size << 0w30) orb (v << 0w26) orb (opc << 0w22) orb (word8ToWord32(xRegOnly regM) << 0w16) orb (opt << 0w13) orb (s << 0w12) orb (word8ToWord32(xRegOrXSP regN) << 0w5) orb word8ToWord32(xD regT)) end in val loadRegIndexed = loadStoreRegRegisterOffset(0w3, 0w0, 0w1, xRegOrXZ) and storeRegIndexed = loadStoreRegRegisterOffset(0w3, 0w0, 0w0, xRegOrXZ) and loadRegIndexedByte = loadStoreRegRegisterOffset(0w0, 0w0, 0w1, xRegOrXZ) and storeRegIndexedByte = loadStoreRegRegisterOffset(0w0, 0w0, 0w0, xRegOrXZ) and loadRegIndexed16 = loadStoreRegRegisterOffset(0w1, 0w0, 0w1, xRegOrXZ) and storeRegIndexed16 = loadStoreRegRegisterOffset(0w1, 0w0, 0w0, xRegOrXZ) and loadRegIndexed32 = loadStoreRegRegisterOffset(0w2, 0w0, 0w1, xRegOrXZ) and storeRegIndexed32 = loadStoreRegRegisterOffset(0w2, 0w0, 0w0, xRegOrXZ) and loadRegIndexedFloat = loadStoreRegRegisterOffset(0w2, 0w1, 0w1, vReg) and storeRegIndexedFloat = loadStoreRegRegisterOffset(0w2, 0w1, 0w0, vReg) and loadRegIndexedDouble = loadStoreRegRegisterOffset(0w3, 0w1, 0w1, vReg) and storeRegIndexedDouble = loadStoreRegRegisterOffset(0w3, 0w1, 0w0, vReg) end local (* Loads and stores with special ordering. 
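These cover the load-acquire/store-release instructions and the exclusive forms; loadAcquireExclusiveRegister and storeReleaseExclusiveRegister below are intended to be used together as a load-linked/store-conditional pair.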
*) fun loadStoreExclusive(size, o2, l, o1, o0) {regS, regT2, regN, regT} = SimpleInstr(0wx08000000 orb (size << 0w30) orb (o2 << 0w23) orb (l << 0w22) orb (o1 << 0w21) orb (word8ToWord32(xRegOrXZ regS) << 0w16) orb (o0 << 0w15) orb (word8ToWord32(xRegOrXZ regT2) << 0w10) orb (word8ToWord32(xRegOrXSP regN) << 0w5) orb word8ToWord32(xRegOrXZ regT)) in fun loadAcquire{regN, regT} = loadStoreExclusive(0w3, 0w1, 0w1, 0w0, 0w1) {regS=XZero, regT2=XZero, regN=regN, regT=regT} and storeRelease{regN, regT} = loadStoreExclusive(0w3, 0w1, 0w0, 0w0, 0w1) {regS=XZero, regT2=XZero, regN=regN, regT=regT} and loadAcquire32{regN, regT} = loadStoreExclusive(0w2, 0w1, 0w1, 0w0, 0w1) {regS=XZero, regT2=XZero, regN=regN, regT=regT} and storeRelease32{regN, regT} = loadStoreExclusive(0w2, 0w1, 0w0, 0w0, 0w1) {regS=XZero, regT2=XZero, regN=regN, regT=regT} and loadAcquireByte{regN, regT} = loadStoreExclusive(0w0, 0w1, 0w1, 0w0, 0w1) {regS=XZero, regT2=XZero, regN=regN, regT=regT} and storeReleaseByte{regN, regT} = loadStoreExclusive(0w0, 0w1, 0w0, 0w0, 0w1) {regS=XZero, regT2=XZero, regN=regN, regT=regT} (* Acquire exclusive access to a memory location and load its current value *) and loadAcquireExclusiveRegister{regN, regT} = loadStoreExclusive(0w3, 0w0, 0w1, 0w0, 0w1) {regS=XZero, regT2=XZero, regN=regN, regT=regT} (* Release exclusive access and test whether it succeeded. Sets regS to 0 if successful otherwise 1, in which case we have to repeat the operation. *) and storeReleaseExclusiveRegister{regN, regS, regT} = loadStoreExclusive(0w3, 0w0, 0w0, 0w0, 0w1) {regS=regS, regT2=XZero, regN=regN, regT=regT} end local (* Load and store pairs. The offsets are signed scaled values. *) fun loadStorePair op2 (opc, v, l, rT) {regT1, regT2, regN, unitOffset} = let val _ = (unitOffset >= ~64 andalso unitOffset < 64) orelse raise InternalError "loadStorePair: value out of range" val imm7 = Word32.fromInt unitOffset andb 0wx7f in SimpleInstr(0wx28000000 orb (opc << 0w30) orb (v << 0w26) orb (op2 << 0w23) orb (l << 0w22) orb (imm7 << 0w15) orb (word8ToWord32(rT regT2) << 0w10) orb (word8ToWord32(xRegOrXSP regN) << 0w5) orb word8ToWord32(rT regT1)) end fun loadStorePairOffset args = loadStorePair 0w2 args and loadStorePairPostIndexed args = loadStorePair 0w1 args and loadStorePairPreIndexed args = loadStorePair 0w3 args in val storePairOffset = loadStorePairOffset(0w2, 0w0, 0w0, xRegOrXZ) and loadPairOffset = loadStorePairOffset(0w2, 0w0, 0w1, xRegOrXZ) and storePairPostIndexed = loadStorePairPostIndexed(0w2, 0w0, 0w0, xRegOrXZ) and loadPairPostIndexed = loadStorePairPostIndexed(0w2, 0w0, 0w1, xRegOrXZ) and storePairPreIndexed = loadStorePairPreIndexed(0w2, 0w0, 0w0, xRegOrXZ) and loadPairPreIndexed = loadStorePairPreIndexed(0w2, 0w0, 0w1, xRegOrXZ) and storePairOffset32 = loadStorePairOffset(0w0, 0w0, 0w0, xRegOrXZ) and loadPairOffset32 = loadStorePairOffset(0w0, 0w0, 0w1, xRegOrXZ) and storePairPostIndexed32 = loadStorePairPostIndexed(0w0, 0w0, 0w0, xRegOrXZ) and loadPairPostIndexed32 = loadStorePairPostIndexed(0w0, 0w0, 0w1, xRegOrXZ) and storePairPreIndexed32 = loadStorePairPreIndexed(0w0, 0w0, 0w0, xRegOrXZ) and loadPairPreIndexed32 = loadStorePairPreIndexed(0w0, 0w0, 0w1, xRegOrXZ) and storePairOffsetFloat = loadStorePairOffset(0w0, 0w1, 0w0, vReg) and loadPairOffsetFloat = loadStorePairOffset(0w0, 0w1, 0w1, vReg) and storePairPostIndexedFloat = loadStorePairPostIndexed(0w0, 0w1, 0w0, vReg) and loadPairPostIndexedFloat = loadStorePairPostIndexed(0w0, 0w1, 0w1, vReg) and storePairPreIndexedFloat = 
loadStorePairPreIndexed(0w0, 0w1, 0w0, vReg) and loadPairPreIndexedFloat = loadStorePairPreIndexed(0w0, 0w1, 0w1, vReg) and storePairOffsetDouble = loadStorePairOffset(0w0, 0w1, 0w0, vReg) and loadPairOffsetDouble = loadStorePairOffset(0w0, 0w1, 0w1, vReg) and storePairPostIndexedDouble = loadStorePairPostIndexed(0w1, 0w1, 0w0, vReg) and loadPairPostIndexedDouble = loadStorePairPostIndexed(0w1, 0w1, 0w1, vReg) and storePairPreIndexedDouble = loadStorePairPreIndexed(0w1, 0w1, 0w0, vReg) and loadPairPreIndexedDouble = loadStorePairPreIndexed(0w1, 0w1, 0w1, vReg) end (* Addresses must go in the constant area at the end of the code where they can be found by the GC. *) fun loadAddressConstant(xReg, valu) = LoadAddressLiteral{reg=xReg, value=valu, length=ref BrExtended} (* Non-address constants. These may or may not be tagged values. *) fun loadNonAddressConstant(xReg, valu) = LoadNonAddressLiteral{reg=xReg, value=valu, length=ref BrExtended} (* Floating point constants. TODO: We can use fmov dn,c for various constant values. *) local (* Use the same instruction for both float and double. *) fun moviZero regD = SimpleInstr(0wx2F00E400 orb word8ToWord32(vReg regD)) in fun loadFloatConstant(vReg, 0w0, _) = moviZero vReg | loadFloatConstant(vReg, valu, work) = LoadFPLiteral{reg=vReg, value=valu, isDouble=false, length=ref BrExtended, work=work} and loadDoubleConstant(vReg, 0w0, _) = moviZero vReg | loadDoubleConstant(vReg, valu, work) = LoadFPLiteral{reg=vReg, value=valu, isDouble=true, length=ref BrExtended, work=work} end local fun moveWideImmediate(sf, opc) {regD, immediate, shift} = let val hw = case (shift, sf) of (0w0, _) => 0w0 | (0w16, _) => 0w1 | (0w24, 0w1) => 0w2 | (0w48, 0w1) => 0w3 | _ => raise InternalError "moveWideImmediate: invalid shift" val _ = immediate <= 0wxffff orelse raise InternalError "moveWideImmediate: immediate too large" in SimpleInstr(0wx12800000 orb (sf << 0w31) orb (opc << 0w29) orb (hw << 0w21) orb (wordToWord32 immediate << 0w5) orb word8ToWord32(xRegOnly regD)) end in val moveNot32 = moveWideImmediate(0w0, 0w0) and moveZero32 = moveWideImmediate(0w0, 0w2) and moveKeep32 = moveWideImmediate(0w0, 0w3) and moveNot = moveWideImmediate(0w1, 0w0) and moveZero = moveWideImmediate(0w1, 0w2) and moveKeep = moveWideImmediate(0w1, 0w3) end (* Instructions involved in thread synchonisation. *) val yield = SimpleInstr 0wxD503203F (* Yield inside a spin-lock. *) and dmbIsh = SimpleInstr 0wxD5033BBF (* Memory barrier. *) (* Jump to the address in the register and put the address of the next instruction into X30. *) fun branchAndLinkReg(dest) = SimpleInstr(0wxD63F0000 orb (word8ToWord32(xRegOnly dest) << 0w5)) (* Jump to the address in the register. *) fun branchRegister(dest) = SimpleInstr(0wxD61F0000 orb (word8ToWord32(xRegOnly dest) << 0w5)) (* Jump to the address in the register and hint this is a return. *) fun returnRegister(dest) = SimpleInstr(0wxD65F0000 orb (word8ToWord32(xRegOnly dest) << 0w5)) (* Put a label into the code. *) val setLabel = Label (* Create a label. *) fun createLabel () = ref [ref 0w0] (* A conditional or unconditional branch. *) and conditionalBranch(cond, label) = ConditionalBranch{label=label, jumpCondition=cond, length=ref BrExtended } and unconditionalBranch label = UnconditionalBranch{label=label, andLink=false} and branchAndLink label = UnconditionalBranch{label=label, andLink=true} (* Put the address of a label into a register - used for handlers and cases. 
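Like the literal loads, this starts out as a two-instruction extended form and may be shortened to a single pc-relative ADR in setLabelsAndSizes below once the label is known to be in range.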
*) and loadLabelAddress(reg, label) = LoadLabelAddress{label=label, reg=reg, length=ref BrExtended} (* Test a bit in a register and branch if zero/nonzero *) and testBitBranchZero(reg, bit, label) = TestBitBranch{label=label, bitNo=bit, brNonZero=false, reg=reg, length=ref BrExtended} and testBitBranchNonZero(reg, bit, label) = TestBitBranch{label=label, bitNo=bit, brNonZero=true, reg=reg, length=ref BrExtended} (* Compare a register with zero and branch if zero/nonzero *) and compareBranchZero(reg, label) = CompareBranch{label=label, brNonZero=false, size=WordSize64, reg=reg, length=ref BrExtended} and compareBranchNonZero(reg, label) = CompareBranch{label=label, brNonZero=true, size=WordSize64, reg=reg, length=ref BrExtended} and compareBranchZero32(reg, label) = CompareBranch{label=label, brNonZero=false, size=WordSize32, reg=reg, length=ref BrExtended} and compareBranchNonZero32(reg, label) = CompareBranch{label=label, brNonZero=true, size=WordSize32, reg=reg, length=ref BrExtended} (* Set the destination register to the value of the first reg if the condition is true otherwise to a, possibly modified, version of the second argument. There are variants that set it unmodified, incremented, inverted and negated. *) local fun conditionalSelect (sf, opc, op2) {regD, regFalse, regTrue, cond} = SimpleInstr(0wx1A800000 orb (sf << 0w31) orb (opc << 0w30) orb (word8ToWord32(xRegOrXZ regFalse) << 0w16) orb (cCode cond << 0w12) orb (op2 << 0w10) orb (word8ToWord32(xRegOrXZ regTrue) << 0w5) orb word8ToWord32(xRegOrXZ regD)) in val conditionalSet = conditionalSelect(0w1, 0w0, 0w0) and conditionalSetIncrement = conditionalSelect(0w1, 0w0, 0w1) and conditionalSetInverted = conditionalSelect(0w1, 0w1, 0w0) and conditionalSetNegated = conditionalSelect(0w1, 0w1, 0w1) and conditionalSet32 = conditionalSelect(0w0, 0w0, 0w0) and conditionalSetIncrement32 = conditionalSelect(0w0, 0w0, 0w1) and conditionalSetInverted32 = conditionalSelect(0w0, 0w1, 0w0) and conditionalSetNegated32 = conditionalSelect(0w0, 0w1, 0w1) end (* This combines the effect of a left and right shift. There are various derived forms of this depending on the relative values of immr and imms. if imms >= immr copies imms-immr-1 bits from bit position immr to the lsb bits of the destination. if imms < immr copies imms+1 bits from the lsb bit to bit position regsize-immr. How the remaining bits are affected depends on the instruction. BitField instructions do not affect other bits. UnsignedBitField instructions zero other bits. SignedBitField instructions set the high order bits to a copy of the high order bit copied and zero the low order bits. *) local fun bitfield (sf, opc, n) {immr, imms, regN, regD} = SimpleInstr(0wx13000000 orb (sf << 0w31) orb (opc << 0w29) orb (n << 0w22) orb (wordToWord32 immr << 0w16) orb (wordToWord32 imms << 0w10) orb (word8ToWord32(xRegOrXZ regN) << 0w5) orb word8ToWord32(xRegOrXZ regD)) in val signedBitfieldMove32 = bitfield(0w0, 0w0, 0w0) and bitfieldMove32 = bitfield(0w0, 0w1, 0w0) and unsignedBitfieldMove32 = bitfield(0w0, 0w2, 0w0) and signedBitfieldMove64 = bitfield(0w1, 0w0, 0w1) and bitfieldMove64 = bitfield(0w1, 0w1, 0w1) and unsignedBitfieldMove64 = bitfield(0w1, 0w2, 0w1) (* Derived forms. 
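For example, logicalShiftLeft with shift=4 becomes a bitfield move with immr=60 and imms=59; since imms < immr this copies bits 0..59 of the source into bits 4..63 of the destination and, being the unsigned form, zeroes the remaining low bits, which is exactly LSL #4.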
*) fun logicalShiftLeft{shift, regN, regD} = unsignedBitfieldMove64{immr=Word.~ shift mod 0w64, imms=0w64-0w1-shift, regN=regN, regD=regD} and logicalShiftLeft32{shift, regN, regD} = unsignedBitfieldMove32{immr=Word.~ shift mod 0w32, imms=0w32-0w1-shift, regN=regN, regD=regD} and logicalShiftRight{shift, regN, regD} = unsignedBitfieldMove64{immr=shift, imms=0wx3f, regN=regN, regD=regD} and logicalShiftRight32{shift, regN, regD} = unsignedBitfieldMove32{immr=shift, imms=0wx1f, regN=regN, regD=regD} and unsignedBitfieldInsertinZeros{lsb, width, regN, regD} = unsignedBitfieldMove64{immr=Word.~ lsb mod 0w64, imms=width-0w1, regN=regN, regD=regD} and unsignedBitfieldInsertinZeros32{lsb, width, regN, regD} = unsignedBitfieldMove32{immr=Word.~ lsb mod 0w32, imms=width-0w1, regN=regN, regD=regD} and arithmeticShiftRight{shift, regN, regD} = signedBitfieldMove64{immr=shift, imms=0wx3f, regN=regN, regD=regD} and arithmeticShiftRight32{shift, regN, regD} = signedBitfieldMove32{immr=shift, imms=0wx1f, regN=regN, regD=regD} and signedBitfieldExtract{lsb, width, regN, regD} = signedBitfieldMove64{immr=lsb, imms=lsb+width-0w1, regN=regN, regD=regD} and bitfieldInsert{lsb, width, regN, regD} = bitfieldMove64{immr=Word.~ lsb mod 0w64, imms=width-0w1, regN=regN, regD=regD} and bitfieldInsert32{lsb, width, regN, regD} = bitfieldMove32{immr=Word.~ lsb mod 0w32, imms=width-0w1, regN=regN, regD=regD} end local (* Logical immediates. AND, OR, XOR and ANDS. Assumes that the immediate value has already been checked as valid. The non-flags versions can use SP as the destination. *) fun logicalImmediate (s, opc, xD) {bits, regN, regD} = let val {n, imms, immr} = case encodeBitPattern(bits, if s = 0w0 then WordSize32 else WordSize64) of NONE => raise InternalError "testBitPattern: unable to encode bit pattern" | SOME res => res in SimpleInstr(0wx12000000 orb (opc << 0w29) orb (s << 0w31) orb (n << 0w22) orb (wordToWord32 immr << 0w16) orb (wordToWord32 imms << 0w10) orb (word8ToWord32(xRegOrXZ regN) << 0w5) orb word8ToWord32(xD regD)) end in val bitwiseAndImmediate = logicalImmediate (0w1, 0w0, xRegOrXSP) and bitwiseOrImmediate = logicalImmediate (0w1, 0w1, xRegOrXSP) and bitwiseXorImmediate = logicalImmediate (0w1, 0w2, xRegOrXSP) and bitwiseAndSImmediate = logicalImmediate (0w1, 0w3, xRegOrXZ) and bitwiseAndImmediate32 = logicalImmediate (0w0, 0w0, xRegOrXSP) and bitwiseOrImmediate32 = logicalImmediate (0w0, 0w1, xRegOrXSP) and bitwiseXorImmediate32 = logicalImmediate (0w0, 0w2, xRegOrXSP) and bitwiseAndSImmediate32 = logicalImmediate (0w0, 0w3, xRegOrXZ) end local (* Floating point operations - 2 source *) fun floatingPoint2Source (pt, opc) {regM, regN, regD} = SimpleInstr(0wx1E200800 orb (pt << 0w22) orb (word8ToWord32(vReg regM) << 0w16) orb (opc << 0w12) orb (word8ToWord32(vReg regN) << 0w5) orb word8ToWord32(vReg regD)) in val multiplyFloat = floatingPoint2Source(0w0, 0wx0) and divideFloat = floatingPoint2Source(0w0, 0wx1) and addFloat = floatingPoint2Source(0w0, 0wx2) and subtractFloat = floatingPoint2Source(0w0, 0wx3) and multiplyDouble = floatingPoint2Source(0w1, 0wx0) and divideDouble = floatingPoint2Source(0w1, 0wx1) and addDouble = floatingPoint2Source(0w1, 0wx2) and subtractDouble = floatingPoint2Source(0w1, 0wx3) end local (* Move between a floating point and a general register with or without conversion. 
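The plain moves are fmov instructions, the integer-to-float conversions are scvtf, and the float-to-integer conversions take an IEEEReal rounding mode and select fcvtas, fcvtms, fcvtps or fcvtzs as appropriate.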
*) fun fmoveGeneral (sf, s, ptype, mode, opcode, rN, rD) {regN, regD} = SimpleInstr(0wx1E200000 orb (sf << 0w31) orb (s << 0w29) orb (ptype << 0w22) orb (mode << 0w19) orb (opcode << 0w16) orb (word8ToWord32(rN regN) << 0w5) orb word8ToWord32(rD regD)) open IEEEReal in (* Moves without conversion *) val moveGeneralToFloat = fmoveGeneral(0w0, 0w0, 0w0, 0w0, 0w7, xRegOrXZ, vReg) and moveFloatToGeneral = fmoveGeneral(0w0, 0w0, 0w0, 0w0, 0w6, vReg, xRegOnly) and moveGeneralToDouble = fmoveGeneral(0w1, 0w0, 0w1, 0w0, 0w7, xRegOrXZ, vReg) and moveDoubleToGeneral = fmoveGeneral(0w1, 0w0, 0w1, 0w0, 0w6, vReg, xRegOnly) (* Moves with conversion - signed. The argument is a 64-bit value. *) and convertIntToFloat = fmoveGeneral(0w1, 0w0, 0w0, 0w0, 0w2, xRegOrXZ, vReg) and convertIntToDouble = fmoveGeneral(0w1, 0w0, 0w1, 0w0, 0w2, xRegOrXZ, vReg) and convertInt32ToFloat = fmoveGeneral(0w0, 0w0, 0w0, 0w0, 0w2, xRegOrXZ, vReg) and convertInt32ToDouble = fmoveGeneral(0w0, 0w0, 0w1, 0w0, 0w2, xRegOrXZ, vReg) fun convertFloatToInt TO_NEAREST = fmoveGeneral(0w1, 0w0, 0w0, 0w0, 0w4, vReg, xRegOnly) (* fcvtas *) | convertFloatToInt TO_NEGINF = fmoveGeneral(0w1, 0w0, 0w0, 0w2, 0w0, vReg, xRegOnly) (* fcvtms *) | convertFloatToInt TO_POSINF = fmoveGeneral(0w1, 0w0, 0w0, 0w1, 0w0, vReg, xRegOnly) (* fcvtps *) | convertFloatToInt TO_ZERO = fmoveGeneral(0w1, 0w0, 0w0, 0w3, 0w0, vReg, xRegOnly) (* fcvtzs *) and convertDoubleToInt TO_NEAREST = fmoveGeneral(0w1, 0w0, 0w1, 0w0, 0w4, vReg, xRegOnly) (* fcvtas *) | convertDoubleToInt TO_NEGINF = fmoveGeneral(0w1, 0w0, 0w1, 0w2, 0w0, vReg, xRegOnly) (* fcvtms *) | convertDoubleToInt TO_POSINF = fmoveGeneral(0w1, 0w0, 0w1, 0w1, 0w0, vReg, xRegOnly) (* fcvtps *) | convertDoubleToInt TO_ZERO = fmoveGeneral(0w1, 0w0, 0w1, 0w3, 0w0, vReg, xRegOnly) (* fcvtzs *) and convertFloatToInt32 TO_NEAREST = fmoveGeneral(0w0, 0w0, 0w0, 0w0, 0w4, vReg, xRegOnly) (* fcvtas *) | convertFloatToInt32 TO_NEGINF = fmoveGeneral(0w0, 0w0, 0w0, 0w2, 0w0, vReg, xRegOnly) (* fcvtms *) | convertFloatToInt32 TO_POSINF = fmoveGeneral(0w0, 0w0, 0w0, 0w1, 0w0, vReg, xRegOnly) (* fcvtps *) | convertFloatToInt32 TO_ZERO = fmoveGeneral(0w0, 0w0, 0w0, 0w3, 0w0, vReg, xRegOnly) (* fcvtzs *) and convertDoubleToInt32 TO_NEAREST = fmoveGeneral(0w0, 0w0, 0w1, 0w0, 0w4, vReg, xRegOnly) (* fcvtas *) | convertDoubleToInt32 TO_NEGINF = fmoveGeneral(0w0, 0w0, 0w1, 0w2, 0w0, vReg, xRegOnly) (* fcvtms *) | convertDoubleToInt32 TO_POSINF = fmoveGeneral(0w0, 0w0, 0w1, 0w1, 0w0, vReg, xRegOnly) (* fcvtps *) | convertDoubleToInt32 TO_ZERO = fmoveGeneral(0w0, 0w0, 0w1, 0w3, 0w0, vReg, xRegOnly) (* fcvtzs *) end local fun floatingPtCompare(ptype, opc) {regM, regN} = SimpleInstr(0wx1E202000 orb (ptype << 0w22) orb (word8ToWord32(vReg regM) << 0w16) orb (word8ToWord32(vReg regN) << 0w5) orb (opc << 0w3)) in val compareFloat = floatingPtCompare(0w0, 0w0) (* fcmp *) and compareDouble = floatingPtCompare(0w1, 0w0) (* It is also possible to compare a single register with zero using opc=1/3 *) end local (* Floating point single source. 
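These are the one-operand operations: register move, absolute value, negation and conversion between single and double precision, i.e. fmov, fabs, fneg and fcvt.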
*) fun floatingPtSingle (ptype, opc) {regN, regD} = SimpleInstr(0wx1E204000 orb (ptype << 0w22) orb (opc << 0w15) orb (word8ToWord32(vReg regN) << 0w5) orb word8ToWord32(vReg regD)) in val moveFloatToFloat = floatingPtSingle(0w0, 0wx0) and absFloat = floatingPtSingle(0w0, 0wx1) and negFloat = floatingPtSingle(0w0, 0wx2) and convertFloatToDouble = floatingPtSingle(0w0, 0wx5) and moveDoubleToDouble = floatingPtSingle(0w1, 0wx0) and absDouble = floatingPtSingle(0w1, 0wx1) and negDouble = floatingPtSingle(0w1, 0wx2) and convertDoubleToFloat = floatingPtSingle(0w1, 0wx4) end local fun atomicMemory (size, v, a, r, o3, opc) {regS, regN, regT} = SimpleInstr(0wx38200000 orb (size << 0w30) orb (v << 0w26) orb (a << 0w23) orb (r << 0w22) orb (o3 << 0w15) orb (opc << 0w12) orb (word8ToWord32(xRegOrXZ regS) << 0w16) orb (word8ToWord32(xRegOrXSP regN) << 0w5) orb word8ToWord32(xRegOrXZ regT)) in val loadAddAL = atomicMemory(0w3, 0w0, 0w1, 0w1, 0w0, 0w0) and loadUMaxAL = atomicMemory(0w3, 0w0, 0w1, 0w1, 0w0, 0w6) and swapAL = atomicMemory(0w3, 0w0, 0w1, 0w1, 0w1, 0w0) and loadAddA = atomicMemory(0w3, 0w0, 0w1, 0w0, 0w0, 0w0) and loadUMaxA = atomicMemory(0w3, 0w0, 0w1, 0w0, 0w0, 0w6) and swapL = atomicMemory(0w3, 0w0, 0w0, 0w1, 0w1, 0w0) end (* This word is put in after a call to the RTS trap-handler. All the registers are saved and restored across a call to the trap-handler; the register mask contains those that may contain an address and so need to be scanned and possibly updated if there is a GC. *) fun registerMask(regs) = let fun addToMask(r, mask) = let val rno = word8ToWord(xRegOnly r) in if rno > 0w24 (* In particular this can't be X30. *) then raise InternalError ("registerMask: invalid register "^Word.toString rno) else mask orb (0w1 << word8ToWord(xRegOnly r)) end val maskWord = List.foldl addToMask 0w0 regs in SimpleInstr(0wx02000000 (* Reserved instr range. *) orb maskWord) end (* This is a bit of a hack but is the only way to get round the problem that when a callback (FFI closure) is called the code has none of the global registers. This isn't a problem in the native addressing version because we have absolute addresses but in 32-in-64 we need at least one absolute address to begin. This embeds the global heap base pointer as a constant in the non-address constant area. It requires the RTS to be able to find it and update it when the code is loaded. We insert a nop followed by the pc-relative load. This MUST be the first instruction in the code. *) local val getHeapBase: unit -> LargeWord.word = RunCall.rtsCallFull0 "PolyGetHeapBase" in fun loadGlobalHeapBaseInCallback reg = if is32in64 then [SimpleInstr nopCode, loadNonAddressConstant(reg, getHeapBase())] else raise InternalError "loadGlobalHeapBaseInCallback called with native addressing" end (* Size of each code word. 
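The sizes are in 32-bit instruction words: a Label occupies no space and the extended (BrExtended) forms of the literal loads and branches occupy two words.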
*) fun codeSize (SimpleInstr _) = 1 (* Number of 32-bit words *) | codeSize (LoadAddressLiteral{ length=ref BrShort, ...}) = 1 | codeSize (LoadAddressLiteral{ length=ref BrExtended, ...}) = 2 | codeSize (LoadNonAddressLiteral{ length=ref BrShort, ...}) = 1 | codeSize (LoadNonAddressLiteral{ length=ref BrExtended, ...}) = 2 | codeSize (LoadFPLiteral{ length=ref BrShort, ...}) = 1 | codeSize (LoadFPLiteral{ length=ref BrExtended, ...}) = 2 | codeSize (Label _) = 0 | codeSize (UnconditionalBranch _) = 1 | codeSize (LoadLabelAddress { length=ref BrShort, ...}) = 1 | codeSize (LoadLabelAddress { length=ref BrExtended, ...}) = 2 | codeSize (ConditionalBranch { length=ref BrShort, ...}) = 1 | codeSize (ConditionalBranch { length=ref BrExtended, ...}) = 2 | codeSize (TestBitBranch { length=ref BrShort, ...}) = 1 | codeSize (TestBitBranch { length=ref BrExtended, ...}) = 2 | codeSize (CompareBranch { length=ref BrShort, ...}) = 1 | codeSize (CompareBranch { length=ref BrExtended, ...}) = 2 (* Store a 32-bit value in the code. Always little-endian. *) fun writeInstr(value, wordAddr, seg) = let fun putBytes(value, a, seg, i) = if i = 0w4 then () else ( byteVecSet(seg, a+i, word32ToWord8(value andb 0wxff)); putBytes(value >> 0w8, a, seg, i+0w1) ) in putBytes(value, Word.<<(wordAddr, 0w2), seg, 0w0) end (* Store a 64-bit constant in the code area. *) fun write64Bit(value, word64Addr, seg) = let fun putBytes(value, a, seg, i) = if i = 0w8 then () else ( byteVecSet(seg, if not isBigEndian then a+i else a+0w8-i-0w1, Word8.fromLarge(Word64.toLarge value)); putBytes(Word64.>>(value, 0w8), a, seg, i+0w1) ) in putBytes(value, Word.<<(word64Addr, 0w3), seg, 0w0) end (* Set the sizes of branches depending on the distance to the destination. *) fun setLabelsAndSizes(ops, maxConstantSize) = let (* Set the labels and get the current size of the code. *) fun setLabels(Label(ref labs) :: ops, ic) = (List.app(fn d => d := ic) labs; setLabels(ops, ic)) | setLabels(oper :: ops, ic) = setLabels(ops, ic + Word.fromInt(codeSize oper)) | setLabels([], ic) = ic (* Set the labels and adjust the sizes, repeating until it never gets smaller *) fun setLabAndSize(ops, lastSize) = let (* See if we can shorten any branches. The "addr" is the original address since that's what we've used to set the labels. *) fun adjust([], _) = () | adjust(ConditionalBranch { length as ref BrExtended, label=ref labs, ...} :: instrs, addr) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt addr in if willFitInRange(offset, 0w19) then length := BrShort else (); adjust(instrs, addr + 0w2) (* N.B. Size BEFORE any adjustment *) end | adjust(TestBitBranch { length as ref BrExtended, label=ref labs, ...} :: instrs, addr) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt addr in if willFitInRange(offset, 0w14) then length := BrShort else (); adjust(instrs, addr + 0w2) (* N.B. Size BEFORE any adjustment *) end | adjust(CompareBranch { length as ref BrExtended, label=ref labs, ...} :: instrs, addr) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt addr in if willFitInRange(offset, 0w19) then length := BrShort else (); adjust(instrs, addr + 0w2) (* N.B. Size BEFORE any adjustment *) end | adjust(LoadAddressLiteral { length as ref BrExtended, ...} :: instrs, addr) = let val offset = Word.toInt (lastSize + maxConstantSize) - Word.toInt addr in (* We can only shorten these in 32-in-64. 
In native 64-bits we may need to move the constant area *) if is32in64 andalso willFitInRange(offset, 0w19) then length := BrShort else (); adjust(instrs, addr + 0w2) (* N.B. Size BEFORE any adjustment *) end | adjust(LoadNonAddressLiteral { length as ref BrExtended, ...} :: instrs, addr) = let val offset = Word.toInt (lastSize + maxConstantSize) - Word.toInt addr in if willFitInRange(offset, 0w19) then length := BrShort else (); adjust(instrs, addr + 0w2) (* N.B. Size BEFORE any adjustment *) end | adjust(LoadFPLiteral { length as ref BrExtended, ...} :: instrs, addr) = let val offset = Word.toInt (lastSize + maxConstantSize) - Word.toInt addr in if willFitInRange(offset, 0w19) then length := BrShort else (); adjust(instrs, addr + 0w2) (* N.B. Size BEFORE any adjustment *) end | adjust(LoadLabelAddress { length as ref BrExtended, label=ref labs, ...} :: instrs, addr) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt addr in if willFitInRange(offset, 0w19) then length := BrShort else (); adjust(instrs, addr + 0w2) (* N.B. Size BEFORE any adjustment *) end | adjust(instr :: instrs, addr) = adjust(instrs, addr + Word.fromInt(codeSize instr)) val () = adjust(ops, 0w0) val nextSize = setLabels(ops, 0w0) in if nextSize < lastSize then setLabAndSize(ops, nextSize) else if nextSize = lastSize then lastSize else raise InternalError "setLabAndSize - size increased" end in setLabAndSize(ops, setLabels(ops, 0w0)) end fun genCode(ops, addressConsts, nonAddressConsts, addrConstMap, nonAddrConstMap) = let val numNonAddrConsts = Word.fromInt(List.length nonAddressConsts) and numAddrConsts = Word.fromInt(List.length addressConsts) (* 32-bit words. *) val constSizePlusExtras = (* Number of extra (poly)words needed. *) numNonAddrConsts * wordsPerNativeWord + numAddrConsts + 0w4 (* 4 extra words *) val codeSize (* Number of 32-bit instructions *) = setLabelsAndSizes(ops, constSizePlusExtras * (Address.wordSize div 0w4) + 0w2 (*allow 2 UDFs*)) val wordsOfCode = (codeSize + 0w2) div 0w2 (* Round up to 64-bits with the UDF marker(s) added. *) (* Put one or two UDF instructions at the end as markers. *) val endOfCodeWords = if Word.andb(codeSize, 0w1) = 0w0 then [SimpleInstr undefCode, SimpleInstr undefCode] else [SimpleInstr undefCode] (* Segment size in Poly words. 
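The segment contains the code, padded to a whole number of 64-bit words by the UDF markers, followed by the 64-bit non-address constants and then the address constant area together with the extra words counted above.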
*) val segSize = wordsOfCode*wordsPerNativeWord + constSizePlusExtras val codeVec = byteVecMake segSize fun testBit(bitNo, brNonZero, offset, reg) = 0wx36000000 orb (if bitNo >= 0w32 then 0wx80000000 else 0w0) orb (if brNonZero then 0wx01000000 else 0w0) orb (word8ToWord32(Word8.andb(bitNo, 0wx3f)) << 0w19) orb ((offset andb 0wx3fff) << 0w5) orb word8ToWord32(xRegOnly reg) and compareBranch(size, brNonZero, offset, reg) = 0wx34000000 orb (case size of WordSize64 => 0wx80000000 | WordSize32 => 0w0) orb (if brNonZero then 0wx01000000 else 0w0) orb ((offset andb 0wx7ffff) << 0w5) orb word8ToWord32(xRegOnly reg) fun genCodeWords([], _ , _, _) = () | genCodeWords(SimpleInstr code :: tail, wordNo, aConstNum, nonAConstNum) = ( writeInstr(code, wordNo, codeVec); genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum) ) | genCodeWords(LoadAddressLiteral{reg, length=ref BrExtended, ...} :: tail, wordNo, aConstNum, nonAConstNum) = let val code1 = 0wx90000000 orb word8ToWord32(xRegOnly reg) val code2 = (if is32in64 then loadRegScaled32 else loadRegScaled) {regT=reg, regN=reg, unitOffset=0} in writeInstr(code1, wordNo, codeVec); genCodeWords(code2 :: tail, wordNo+0w1, aConstNum+1, nonAConstNum) end | genCodeWords(LoadAddressLiteral{reg, length=ref BrShort, ...} :: tail, wordNo, aConstNum, nonAConstNum) = (* Address literals can be shortened in 32-in-64 but are always 2 instrs in 64-bit. That allows for the constant area to be pulled out if necessary to make the code position-independent. *) let (* The offset is in 32-bit words. The first of the constants is at offset wordsOfCode+3. Non-address constants are always 8 bytes but address constants are 4 bytes in 32-in-64. *) val s = if is32in64 then 0w0 else 0w1 (* Load 64-bit word in 64-bit mode and 32-bits in 32-in-64. *) val constPos = Array.sub(addrConstMap, aConstNum) val offsetOfConstant = (wordsOfCode+numNonAddrConsts)*0w2 + (0w3+constPos)*(Address.wordSize div 0w4) - wordNo val _ = willFitInRange(Word.toInt offsetOfConstant, 0w19) orelse raise InternalError "Offset to constant is too large" val code = 0wx18000000 orb (s << 0w30) orb (wordToWord32 offsetOfConstant << 0w5) orb word8ToWord32(xRegOnly reg) in writeInstr(code, wordNo, codeVec); genCodeWords(tail, wordNo+0w1, aConstNum+1, nonAConstNum) end | genCodeWords(LoadNonAddressLiteral{reg, length=ref BrExtended, ...} :: tail, wordNo, aConstNum, nonAConstNum) = let val code1 = 0wx90000000 orb word8ToWord32(xRegOnly reg) (* The load instruction is always 64-bits even in 32-in-64. *) val code2 = loadRegScaled{regT=reg, regN=reg, unitOffset=0} in writeInstr(code1, wordNo, codeVec); genCodeWords(code2 :: tail, wordNo+0w1, aConstNum, nonAConstNum+1) end | genCodeWords(LoadNonAddressLiteral{reg, length=ref BrShort, ...} :: tail, wordNo, aConstNum, nonAConstNum) = (* These can be shortened since they're always part of the code. *) let (* The offset is in 32-bit words. These are always 64-bits. 
*) val constPos = Array.sub(nonAddrConstMap, nonAConstNum) val offsetOfConstant = (wordsOfCode+constPos)*0w2 - wordNo val _ = willFitInRange(Word.toInt offsetOfConstant, 0w19) orelse raise InternalError "Offset to constant is too large" val code = 0wx58000000 orb (wordToWord32 offsetOfConstant << 0w5) orb word8ToWord32(xRegOnly reg) in writeInstr(code, wordNo, codeVec); genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum+1) end | genCodeWords(LoadFPLiteral{reg, work, isDouble, length=ref BrExtended, ...} :: tail, wordNo, aConstNum, nonAConstNum) = let val code1 = 0wx90000000 orb word8ToWord32(xRegOnly work) val code2 = (if isDouble then loadRegScaledDouble else loadRegScaledFloat){regT=reg, regN=work, unitOffset=0} in writeInstr(code1, wordNo, codeVec); genCodeWords(code2 :: tail, wordNo+0w1, aConstNum, nonAConstNum+1) end | genCodeWords(LoadFPLiteral{reg, isDouble, length=ref BrShort, ...} :: tail, wordNo, aConstNum, nonAConstNum) = let (* The offset is in 32-bit words. These are always 64-bits. *) val constPos = Array.sub(nonAddrConstMap, nonAConstNum) val offsetOfConstant = (wordsOfCode+constPos)*0w2 - wordNo val _ = willFitInRange(Word.toInt offsetOfConstant, 0w19) orelse raise InternalError "Offset to constant is too large" val code = (if isDouble then 0wx5c000000 else 0wx1c000000) orb (wordToWord32 offsetOfConstant << 0w5) orb word8ToWord32(vReg reg) in writeInstr(code, wordNo, codeVec); genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum+1) end | genCodeWords(Label _ :: tail, wordNo, aConstNum, nonAConstNum) = genCodeWords(tail, wordNo, aConstNum, nonAConstNum) (* No code. *) | genCodeWords(UnconditionalBranch{label=ref labs, andLink} :: tail, wordNo, aConstNum, nonAConstNum) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt wordNo val _ = willFitInRange(offset, 0w26) orelse raise InternalError "genCodeWords: branch too far" val linkBit = if andLink then 0wx80000000 else 0w0 in writeInstr(0wx14000000 orb linkBit orb (Word32.fromInt offset andb 0wx03ffffff), wordNo, codeVec); genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum) end | genCodeWords(ConditionalBranch{ label=ref labs, jumpCondition=cond, length=ref BrShort }:: tail, wordNo, aConstNum, nonAConstNum) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt wordNo val _ = willFitInRange(offset, 0w19) orelse raise InternalError "genCodeWords: branch too far" in writeInstr(0wx54000000 orb ((Word32.fromInt offset andb 0wx07ffff) << 0w5) orb cCode cond, wordNo, codeVec); genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum) end | genCodeWords(ConditionalBranch{ label=ref labs, jumpCondition, length=ref BrExtended }:: tail, wordNo, aConstNum, nonAConstNum) = let (* Long form - put a conditional branch with reversed sense round an unconditional branch. *) val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt (wordNo + 0w1) (* Next instruction. 
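The offset is taken relative to the second (unconditional) instruction of the pair; the inverted conditional branch written in front of it simply skips over that branch.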
*) val _ = willFitInRange(offset, 0w26) orelse raise InternalError "genCodeWords: branch too far" val revCond = invertTest jumpCondition in writeInstr(0wx54000000 orb (0w2 << 0w5) orb cCode revCond, wordNo, codeVec); writeInstr(0wx14000000 orb (Word32.fromInt offset andb 0wx03ffffff), wordNo+0w1, codeVec); genCodeWords(tail, wordNo+0w2, aConstNum, nonAConstNum) end | genCodeWords(LoadLabelAddress{reg, length=ref BrExtended, ...} :: tail, wordNo, aConstNum, nonAConstNum) = let val code1 = 0wx90000000 orb word8ToWord32(xRegOnly reg) val code2 = addImmediate{regN=reg, regD=reg, immed=0w0, shifted=false} in writeInstr(code1, wordNo, codeVec); genCodeWords(code2 :: tail, wordNo+0w1, aConstNum, nonAConstNum) end | genCodeWords(LoadLabelAddress{label=ref labs, reg, length=ref BrShort, ...} :: tail, wordNo, aConstNum, nonAConstNum) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt wordNo val _ = willFitInRange(offset, 0w19) orelse raise InternalError "Offset to label address is too large" val code = 0wx10000000 orb ((Word32.fromInt offset andb 0wx7ffff) << 0w5) orb word8ToWord32(xRegOnly reg) in writeInstr(code, wordNo, codeVec); genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum) end | genCodeWords(TestBitBranch{label=ref labs, bitNo, brNonZero, reg, length=ref BrExtended} :: tail, wordNo, aConstNum, nonAConstNum) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt (wordNo + 0w1) (* Next instruction *) val _ = willFitInRange(offset, 0w25) orelse raise InternalError "genCodeWords: branch too far" val _ = bitNo <= 0w63 orelse raise InternalError "TestBitBranch: bit number > 63" val code = testBit(bitNo, (* Invert test *) not brNonZero, 0w2 (* Skip branch *), reg) in writeInstr(code, wordNo, codeVec); writeInstr(0wx14000000 orb (Word32.fromInt offset andb 0wx03ffffff), wordNo+0w1, codeVec); genCodeWords(tail, wordNo+0w2, aConstNum, nonAConstNum) end | genCodeWords(TestBitBranch{label=ref labs, bitNo, brNonZero, reg, length=ref BrShort} :: tail, wordNo, aConstNum, nonAConstNum) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt wordNo val _ = willFitInRange(offset, 0w14) orelse raise InternalError "TestBitBranch: Offset to label address is too large" val _ = bitNo <= 0w63 orelse raise InternalError "TestBitBranch: bit number > 63" val code = testBit(bitNo, brNonZero, Word32.fromInt offset, reg) in writeInstr(code, wordNo, codeVec); genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum) end | genCodeWords(CompareBranch{label=ref labs, brNonZero, size, reg, length=ref BrExtended} :: tail, wordNo, aConstNum, nonAConstNum) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt (wordNo+0w1) val _ = willFitInRange(offset, 0w25) orelse raise InternalError "genCodeWords: branch too far" val code = compareBranch(size, (* Invert test *) not brNonZero, 0w2, reg) in writeInstr(code, wordNo, codeVec); writeInstr(0wx14000000 orb (Word32.fromInt offset andb 0wx03ffffff), wordNo+0w1, codeVec); genCodeWords(tail, wordNo+0w2, aConstNum, nonAConstNum) end | genCodeWords(CompareBranch{label=ref labs, brNonZero, size, reg, length=ref BrShort} :: tail, wordNo, aConstNum, nonAConstNum) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt wordNo val _ = willFitInRange(offset, 0w19) orelse raise InternalError "CompareBranch: Offset to label address is too large" val code = compareBranch(size, brNonZero, Word32.fromInt offset, reg) in writeInstr(code, wordNo, codeVec); genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum) end in 
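(* Assemble the instructions, with the UDF marker words appended, and then copy the 64-bit non-address constants in immediately after the code words. *)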
genCodeWords (ops @ endOfCodeWords, 0w0, 0, 0); (* Copy in the non-address constants. *) List.foldl(fn (cVal, addr) => (write64Bit(cVal, addr, codeVec); addr+0w1)) wordsOfCode nonAddressConsts; (codeVec (* Return the completed code. *), wordsOfCode+numNonAddrConsts (* And the size in 64-bit words. *)) end (* Store a word, either 64-bit or 32-bit. *) fun setWord(value, wordNo, seg) = let val addrs = wordNo * Address.wordSize fun putBytes(value, a, seg, i) = if i = Address.wordSize then () else ( byteVecSet(seg, if not isBigEndian then a+i else a+wordSize-i-0w1, Word8.fromLarge value); putBytes(LargeWord.>>(value, 0w8), a, seg, i+0w1) ) in putBytes(value, addrs, seg, 0w0) end (* Print the instructions in the code. *) fun printCode (codeVec, functionName, wordsOfCode, printStream) = let val numInstructions = wordsOfCode * (Address.wordSize div 0w4) fun printHex (v, n) = let val s = Word32.fmt StringCvt.HEX v val pad = CharVector.tabulate(Int.max(0, n-size s), fn _ => #"0") in printStream pad; printStream s end fun printCondition 0wx0 = printStream "eq" | printCondition 0wx1 = printStream "ne" | printCondition 0wx2 = printStream "cs" | printCondition 0wx3 = printStream "cc" | printCondition 0wx4 = printStream "mi" | printCondition 0wx5 = printStream "pl" | printCondition 0wx6 = printStream "vs" | printCondition 0wx7 = printStream "vc" | printCondition 0wx8 = printStream "hi" | printCondition 0wx9 = printStream "ls" | printCondition 0wxa = printStream "ge" | printCondition 0wxb = printStream "lt" | printCondition 0wxc = printStream "gt" | printCondition 0wxd = printStream "le" | printCondition 0wxe = printStream "al" | printCondition _ = printStream "nv" (* Normal XReg with 31 being XZ *) fun prXReg 0w31 = printStream "xz" | prXReg r = printStream("x" ^ Word32.fmt StringCvt.DEC r) (* XReg when 31 is SP *) fun prXRegOrSP 0w31 = printStream "sp" | prXRegOrSP r = printStream("x" ^ Word32.fmt StringCvt.DEC r) (* Normal WReg with 31 being WZ *) fun prWReg 0w31 = printStream "wz" | prWReg r = printStream("w" ^ Word32.fmt StringCvt.DEC r) (* WReg when 31 is WSP *) fun prWRegOrSP 0w31 = printStream "wsp" | prWRegOrSP r = printStream("w" ^ Word32.fmt StringCvt.DEC r) (* Each instruction is 32-bytes. *) fun printWordAt wordNo = let val byteNo = Word.<<(wordNo, 0w2) val () = printHex(wordToWord32 byteNo, 6) (* Address *) val () = printStream "\t" val wordValue = word8ToWord32 (codeVecGet (codeVec, byteNo)) orb (word8ToWord32 (codeVecGet (codeVec, byteNo+0w1)) << 0w8) orb (word8ToWord32 (codeVecGet (codeVec, byteNo+0w2)) << 0w16) orb (word8ToWord32 (codeVecGet (codeVec, byteNo+0w3)) << 0w24) val () = printHex(wordValue, 8) (* Instr as hex *) val () = printStream "\t" in if (wordValue andb 0wxfffffc1f) = 0wxD61F0000 then let val rN = (wordValue andb 0wx3e0) >> 0w5 in printStream "br\tx"; printStream(Word32.fmt StringCvt.DEC rN) end else if (wordValue andb 0wxfffffc1f) = 0wxD63F0000 then let val rN = (wordValue andb 0wx3e0) >> 0w5 in printStream "blr\tx"; printStream(Word32.fmt StringCvt.DEC rN) end else if (wordValue andb 0wxfffffc1f) = 0wxD65F0000 then let val rN = (wordValue andb 0wx3e0) >> 0w5 in printStream "ret\tx"; printStream(Word32.fmt StringCvt.DEC rN) end else if wordValue = 0wxD503201F then printStream "nop" else if wordValue = 0wxD503203F then printStream "yield" else if wordValue = 0wxD5033BBF then printStream "dmb\tish" else if (wordValue andb 0wx1f800000) = 0wx12800000 then (* Move of constants. Includes movn and movk. 
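(* Worked example, not from the original source: the word 0wxD2800540 has sf=1, opc=2 (movz), hw=0, imm16=42 and Rd=0, so the code below prints it as "mov x0,#42"; for movn (opc=0) the value printed is ~1 - imm16, the one's complement of the immediate. *)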
*) let val rD = wordValue andb 0wx1f val imm16 = Word32.toInt((wordValue >> 0w5) andb 0wxffff) val isXReg = (wordValue andb 0wx80000000) <> 0w0 val opc = (wordValue >> 0w29) andb 0w3 val shift = (wordValue >> 0w21) andb 0w3 in printStream (if opc = 0w3 then "movk\t" else "mov\t"); printStream (if isXReg then "x" else "w"); printStream(Word32.fmt StringCvt.DEC rD); printStream ",#"; printStream(Int.toString(if opc = 0w0 then ~1 - imm16 else imm16)); if shift = 0w0 then () else (printStream ",lsl #"; printStream(Word32.fmt StringCvt.DEC (shift*0w16))) end else if (wordValue andb 0wx3b000000) = 0wx39000000 then (* Load/Store with unsigned, scaled offset. *) let (* The offset is in units of the size of the operand. *) val size = wordValue >> 0w30 and v = (wordValue >> 0w26) andb 0w1 and opc = (wordValue >> 0w22) andb 0w3 val rT = wordValue andb 0wx1f and rN = (wordValue andb 0wx3e0) >> 0w5 and imm12 = (wordValue andb 0wx3ffc00) >> 0w10 val (opcode, r, scale) = case (size, v, opc) of (0w0, 0w0, 0w0) => ("strb", "w", 0w0) | (0w0, 0w0, 0w1) => ("ldrb", "w", 0w0) | (0w1, 0w0, 0w0) => ("strh", "w", 0w2) | (0w1, 0w0, 0w1) => ("ldrh", "w", 0w2) | (0w2, 0w0, 0w0) => ("str", "w", 0w4) | (0w2, 0w0, 0w1) => ("ldr", "w", 0w4) | (0w3, 0w0, 0w0) => ("str", "x", 0w8) | (0w3, 0w0, 0w1) => ("ldr", "x", 0w8) | (0w2, 0w1, 0w0) => ("str", "s", 0w4) | (0w2, 0w1, 0w1) => ("ldr", "s", 0w4) | (0w3, 0w1, 0w0) => ("str", "d", 0w8) | (0w3, 0w1, 0w1) => ("ldr", "d", 0w8) | _ => ("??", "?", 0w1) in printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rT); printStream ",["; prXRegOrSP rN; printStream ",#"; printStream(Word32.fmt StringCvt.DEC(imm12*scale)); printStream "]" end else if (wordValue andb 0wx3b200c00) = 0wx38000000 then (* Load/store unscaled immediate *) let val size = wordValue >> 0w30 and v = (wordValue >> 0w26) andb 0w1 and opc = (wordValue >> 0w22) andb 0w3 val rT = wordValue andb 0wx1f and rN = (wordValue andb 0wx3e0) >> 0w5 and imm9 = (wordValue andb 0wx1ff000) >> 0w12 val imm9Text = if imm9 > 0wxff then "-" ^ Word32.fmt StringCvt.DEC (0wx200 - imm9) else Word32.fmt StringCvt.DEC imm9 val (opcode, r) = case (size, v, opc) of (0w0, 0w0, 0w0) => ("sturb", "w") | (0w0, 0w0, 0w1) => ("ldurb", "w") | (0w0, 0w0, 0w2) => ("ldursb", "w") | (0w0, 0w0, 0w3) => ("ldursb", "x") | (0w1, 0w0, 0w0) => ("sturh", "w") | (0w1, 0w0, 0w1) => ("ldurh", "w") | (0w1, 0w0, 0w2) => ("ldursh", "w") | (0w1, 0w0, 0w3) => ("ldursh", "x") | (0w2, 0w0, 0w0) => ("stur", "w") | (0w2, 0w0, 0w1) => ("ldur", "w") | (0w2, 0w0, 0w2) => ("ldursw", "x") | (0w3, 0w0, 0w0) => ("stur", "x") | (0w3, 0w0, 0w1) => ("ldur", "x") | (0w2, 0w1, 0w0) => ("stur", "s") | (0w2, 0w1, 0w1) => ("ldur", "s") | (0w3, 0w1, 0w0) => ("stur", "d") | (0w3, 0w1, 0w1) => ("ldur", "d") | _ => ("???", "?") in printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rT); printStream ",["; prXRegOrSP rN; printStream ",#"; printStream imm9Text; printStream "]" end else if (wordValue andb 0wx3b200c00) = 0wx38000400 then (* Load/store immediate post-indexed *) let val size = wordValue >> 0w30 and v = (wordValue >> 0w26) andb 0w1 and opc = (wordValue >> 0w22) andb 0w3 val rT = wordValue andb 0wx1f and rN = (wordValue andb 0wx3e0) >> 0w5 and imm9 = (wordValue andb 0wx1ff000) >> 0w12 val imm9Text = if imm9 > 0wxff then "-" ^ Word32.fmt StringCvt.DEC (0wx200 - imm9) else Word32.fmt StringCvt.DEC imm9 val (opcode, r) = case (size, v, opc) of (0w0, 0w0, 0w0) => ("strb", "w") | (0w0, 0w0, 0w1) => ("ldrb", "w") | 
(0w2, 0w0, 0w0) => ("str", "w") | (0w2, 0w0, 0w1) => ("ldr", "w") | (0w3, 0w0, 0w0) => ("str", "x") | (0w3, 0w0, 0w1) => ("ldr", "x") | _ => ("???", "?") in printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rT); printStream ",["; prXRegOrSP rN; printStream "],#"; printStream imm9Text end else if (wordValue andb 0wx3b200c00) = 0wx38000c00 then (* Load/store immediate pre-indexed *) let val size = wordValue >> 0w30 and v = (wordValue >> 0w26) andb 0w1 and opc = (wordValue >> 0w22) andb 0w3 val rT = wordValue andb 0wx1f and rN = (wordValue andb 0wx3e0) >> 0w5 and imm9 = (wordValue andb 0wx1ff000) >> 0w12 val imm9Text = if imm9 > 0wxff then "-" ^ Word32.fmt StringCvt.DEC (0wx200 - imm9) else Word32.fmt StringCvt.DEC imm9 val (opcode, r) = case (size, v, opc) of (0w0, 0w0, 0w0) => ("strb", "w") | (0w0, 0w0, 0w1) => ("ldrb", "w") | (0w2, 0w0, 0w0) => ("str", "w") | (0w2, 0w0, 0w1) => ("ldr", "w") | (0w3, 0w0, 0w0) => ("str", "x") | (0w3, 0w0, 0w1) => ("ldr", "x") | _ => ("???", "?") in printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rT); printStream ",["; prXRegOrSP rN; printStream ",#"; printStream imm9Text; printStream "]!" end else if (wordValue andb 0wx3b200c00) = 0wx38200800 then (* Load/store with register offset i.e. an index register. *) let val size = wordValue >> 0w30 and v = (wordValue >> 0w26) andb 0w1 and opc = (wordValue >> 0w22) andb 0w3 val rT = wordValue andb 0wx1f and rN = (wordValue >> 0w5) andb 0wx1f and rM = (wordValue >> 0w16) andb 0wx1f val option = (wordValue >> 0w13) andb 0w7 val s = (wordValue andb 0wx1000) <> 0w0 val (opcode, r) = case (size, v, opc) of (0w0, 0w0, 0w0) => ("strb", "w") | (0w0, 0w0, 0w1) => ("ldrb", "w") | (0w1, 0w0, 0w0) => ("strh", "w") | (0w1, 0w0, 0w1) => ("ldrh", "w") | (0w2, 0w0, 0w0) => ("str", "w") | (0w2, 0w0, 0w1) => ("ldr", "w") | (0w3, 0w0, 0w0) => ("str", "x") | (0w3, 0w0, 0w1) => ("ldr", "x") | (0w2, 0w1, 0w0) => ("str", "s") | (0w2, 0w1, 0w1) => ("ldr", "s") | (0w3, 0w1, 0w0) => ("str", "d") | (0w3, 0w1, 0w1) => ("ldr", "d") | _ => ("???", "?") val (extend, xr) = case option of 0w2 => (" uxtw", "w") | 0w3 => if s then (" lsl", "x") else ("", "x") | 0w6 => (" sxtw", "w") | 0w7 => (" sxtx", "x") | _ => ("?", "?") val indexShift = case (size, s) of (0w0, true) => " #1" | (0w1, true) => " #1" | (0w2, true) => " #2" | (0w3, true) => " #3" | _ => "" in printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rT); printStream ",["; prXRegOrSP rN; printStream ","; printStream xr; printStream(Word32.fmt StringCvt.DEC rM); printStream extend; printStream indexShift; printStream "]" end else if (wordValue andb 0wx3f000000) = 0wx08000000 then (* Loads and stores with special ordering. 
*) let val size = (wordValue >> 0w30) andb 0w3 and o2 = (wordValue >> 0w23) andb 0w1 and l = (wordValue >> 0w22) andb 0w1 and o1 = (wordValue >> 0w21) andb 0w1 and o0 = (wordValue >> 0w15) andb 0w1 val rT = wordValue andb 0wx1f and rN = (wordValue >> 0w5) andb 0wx1f and rS = (wordValue >> 0w16) andb 0wx1f val (opcode, r) = case (size, o2, l, o1, o0) of (0w3, 0w1, 0w1, 0w0, 0w1) => ("ldar", "x") | (0w3, 0w1, 0w0, 0w0, 0w1) => ("stlr", "x") | (0w2, 0w1, 0w1, 0w0, 0w1) => ("ldar", "w") | (0w2, 0w1, 0w0, 0w0, 0w1) => ("stlr", "w") | (0w3, 0w0, 0w1, 0w0, 0w1) => ("ldaxr", "x") | (0w3, 0w0, 0w0, 0w0, 0w1) => ("stlxr", "x") | (0w0, 0w1, 0w1, 0w0, 0w1) => ("ldarb", "w") | (0w0, 0w1, 0w0, 0w0, 0w1) => ("stlrb", "w") | _ => ("??", "?") in printStream opcode; printStream "\t"; if opcode = "stlxr" then (printStream "w"; printStream(Word32.fmt StringCvt.DEC rS); printStream ",") else (); printStream r; printStream(Word32.fmt StringCvt.DEC rT); printStream ",["; prXRegOrSP rN; printStream "]" end else if (wordValue andb 0wx3a000000) = 0wx28000000 then (* Load/store pairs of registers *) let val opc = (wordValue >> 0w30) andb 0w3 and v = (wordValue >> 0w26) andb 0w1 and op2 = (wordValue >> 0w23) andb 0w3 and l = (wordValue >> 0w22) andb 0w1 and imm7 = (wordValue >> 0w15) andb 0wx7f and rT2 = (wordValue >> 0w10) andb 0wx1f and rN = (wordValue >> 0w5) andb 0wx1f and rT1 = wordValue andb 0wx1f val (opcode, r, scale) = case (opc, v, l) of (0w0, 0w0, 0w0) => ("stp", "w", 0w4) | (0w0, 0w0, 0w1) => ("ldp", "w", 0w4) | (0w2, 0w0, 0w0) => ("stp", "x", 0w8) | (0w2, 0w0, 0w1) => ("ldp", "x", 0w8) | (0w0, 0w1, 0w0) => ("stp", "s", 0w4) | (0w0, 0w1, 0w1) => ("ldp", "s", 0w4) | (0w1, 0w1, 0w0) => ("stp", "d", 0w8) | (0w1, 0w1, 0w1) => ("ldp", "d", 0w8) | _ => ("??", "?", 0w1) val imm7Text = if imm7 > 0wx3f then "-" ^ Word32.fmt StringCvt.DEC ((0wx80 - imm7) * scale) else Word32.fmt StringCvt.DEC (imm7 * scale) in printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rT1); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rT2); printStream ",["; prXRegOrSP rN; case op2 of 0w1 => (* Post indexed *) (printStream "],#"; printStream imm7Text) | 0w2 => (* Offset *) (printStream ",#"; printStream imm7Text; printStream "]") | 0w3 => (* Pre indexed *) (printStream ",#"; printStream imm7Text; printStream "]!") | _ => printStream "??" end else if (wordValue andb 0wx1f800000) = 0wx11000000 then let (* Add/Subtract a 12-bit immediate with possible shift. *) val sf = (wordValue >> 0w31) andb 0w1 val rD = wordValue andb 0wx1f and rN = (wordValue andb 0wx3e0) >> 0w5 and imm12 = (wordValue andb 0wx3ffc00) >> 0w10 and shiftBit = wordValue andb 0wx400000 val imm = if shiftBit <> 0w0 then imm12 << 0w12 else imm12 val oper = (wordValue andb 0wx40000000) = 0w0 val isS = (wordValue andb 0wx20000000) <> 0w0 val prReg = if sf = 0w1 then prXRegOrSP else prWRegOrSP in if imm12 = 0w0 andalso (rN = 0w31 orelse rD = 0w31) andalso not isS then (printStream "mov\t"; prReg rD; printStream ","; prReg rN) else ( if isS andalso rD = 0w31 then printStream(if oper then "cmn\t" else "cmp\t") else ( printStream(if oper then "add" else "sub"); printStream(if isS then "s\t" else "\t"); prReg rD; printStream "," ); prReg rN; printStream ",#"; printStream(Word32.fmt StringCvt.DEC imm) ) end else if (wordValue andb 0wx7fe0ffe0) = 0wx2A0003E0 then (* Move reg,reg. This is a subset of ORR shifted register. 
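(* "mov Rd,Rm" is the alias of "orr Rd,zr,Rm" with no shift, which is why the fixed pattern 0wx2A0003E0 pins Rn (bits 5-9) to 31 and imm6 to zero; only sf, Rm and Rd remain variable under the mask 0wx7fe0ffe0. *)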
*) let val reg = if (wordValue andb 0wx80000000) <> 0w0 then "x" else "w" in printStream "mov\t"; printStream reg; printStream(Word32.fmt StringCvt.DEC(wordValue andb 0wx1f)); printStream ","; printStream reg; printStream(Word32.fmt StringCvt.DEC((wordValue >> 0w16) andb 0wx1f)) end else if (wordValue andb 0wx1f000000) = 0wx0A000000 then let (* Logical operations with shifted register. *) val rD = wordValue andb 0wx1f and rN = (wordValue >> 0w5) andb 0wx1f and rM = (wordValue >> 0w16) andb 0wx1f and imm6 = (wordValue >> 0w10) andb 0wx3f and shiftCode = (wordValue >> 0w22) andb 0wx3 val opc = (wordValue >> 0w29) andb 0wx3 val nBit = (wordValue >> 0w21) andb 0w1 val reg = if (wordValue andb 0wx80000000) <> 0w0 then "x" else "w" val opcode = case (opc, nBit) of (0w0, 0w0) => "and" | (0w1, 0w0) => "orr" | (0w2, 0w0) => "eor" | (0w3, 0w0) => "ands" | _ => "??" in if rD = 0w31 andalso opc=0w3 andalso nBit = 0w0 then printStream "tst\t" else ( printStream opcode; printStream"\t"; printStream reg; printStream(Word32.fmt StringCvt.DEC rD); printStream "," ); printStream reg; printStream(Word32.fmt StringCvt.DEC rN); printStream ","; printStream reg; printStream(Word32.fmt StringCvt.DEC rM); if imm6 <> 0w0 then ( case shiftCode of 0w0 => printStream ",lsl #" | 0w1 => printStream ",lsr #" | 0w2 => printStream ",asr #" | _ => printStream ",?? #"; printStream(Word32.fmt StringCvt.DEC imm6) ) else () end else if (wordValue andb 0wx1f200000) = 0wx0B000000 then let (* Add/subtract shifted register. *) val rD = wordValue andb 0wx1f and rN = (wordValue >> 0w5) andb 0wx1f and rM = (wordValue >> 0w16) andb 0wx1f and imm6 = (wordValue >> 0w10) andb 0wx3f and shiftCode = (wordValue >> 0w22) andb 0wx3 val oper = (wordValue andb 0wx40000000) = 0w0 val isS = (wordValue andb 0wx20000000) <> 0w0 val pReg = if (wordValue andb 0wx80000000) <> 0w0 then prXReg else prWReg in if isS andalso rD = 0w31 then printStream(if oper then "cmn\t" else "cmp\t") else ( printStream(if oper then "add" else "sub"); printStream(if isS then "s\t" else "\t"); pReg rD; printStream "," ); pReg rN; printStream ","; pReg rM; if imm6 <> 0w0 then ( case shiftCode of 0w0 => printStream ",lsl #" | 0w1 => printStream ",lsr #" | 0w2 => printStream ",asr #" | _ => printStream ",?? #"; printStream(Word32.fmt StringCvt.DEC imm6) ) else () end else if (wordValue andb 0wx1fe00000) = 0wx0b200000 then let (* Add/subtract extended register. 
*) val rD = wordValue andb 0wx1f and rN = (wordValue >> 0w5) andb 0wx1f and rM = (wordValue >> 0w16) andb 0wx1f and extend = (wordValue >> 0w13) andb 0w7 and amount = (wordValue >> 0w10) andb 0w7 and sf = (wordValue >> 0w31) andb 0w1 and p = (wordValue >> 0w30) andb 0w1 and s = (wordValue >> 0w29) andb 0w1 in if s = 0w1 andalso rD = 0w31 then printStream(if p = 0w0 then "cmn\t" else "cmp\t") else ( printStream(if p = 0w0 then "add" else "sub"); printStream(if s = 0w1 then "s\t" else "\t"); (if sf = 0w1 then prXRegOrSP else prWRegOrSP) rD; printStream "," ); (if sf = 0w1 then prXRegOrSP else prWRegOrSP) rN; printStream ","; (if extend = 0w3 orelse extend = 0w7 then prXReg else prWReg) rM; case extend of 0w0 => printStream ",uxtb" | 0w1 => printStream ",uxth" | 0w2 => if amount = 0w0 andalso sf = 0w0 then () else printStream ",uxtw" | 0w3 => if amount = 0w0 andalso sf = 0w1 then () else printStream ",uxtx" | 0w4 => printStream ",sxtb" | 0w5 => printStream ",sxth" | 0w6 => printStream ",sxtw" | 0w7 => printStream ",sxtx" | _ => printStream "?"; if amount <> 0w0 then printStream(" #" ^ Word32.fmt StringCvt.DEC amount) else () end else if (wordValue andb 0wx3b000000) = 0wx18000000 then let (* Load from a PC-relative address. This may refer to the address constant area or the non-address constant area. *) val rT = wordValue andb 0wx1f val opc = (wordValue >> 0w30) andb 0w3 val v = (wordValue >> 0w26) andb 0w1 (* The offset is in 32-bit words *) val byteAddr = word32ToWord(((wordValue andb 0wx00ffffe0) >> (0w5-0w2))) + byteNo (* We must NOT use codeVecGetWord if this is in the non-address area. It may well not be a tagged value. *) local fun getConstant(cVal, 0w0) = cVal | getConstant(cVal, offset) = let val byteVal = Word64.fromLarge(Word8.toLarge(codeVecGet (codeVec, byteAddr+offset-0w1))) in getConstant(Word64.orb(Word64.<<(cVal, 0w8), byteVal), offset-0w1) end in val constantValue = "0x" ^ Word64.toString(getConstant(0w0, 0w8)) (* It's a non-address constant *) end val reg = case (opc, v) of (0w0, 0w0) => "w" | (0w1, 0w0) => "x" | (0w0, 0w1) => "s" | (0w1, 0w1) => "d" | _ => "?" in printStream "ldr\t"; printStream reg; printStream(Word32.fmt StringCvt.DEC rT); printStream ",0x"; printStream(Word.fmt StringCvt.HEX byteAddr); printStream "\t// "; printStream constantValue end else if (wordValue andb 0wxbf000000) = 0wx10000000 then let (* Put a pc-relative address into a register. *) val rT = wordValue andb 0wx1f val byteOffset = ((wordValue andb 0wx00ffffe0) << (Word.fromInt Word32.wordSize - 0w23) ~>> (Word.fromInt Word32.wordSize - 0w20)) + ((wordValue >> 0w29) andb 0w3) in printStream "adr\tx"; printStream(Word32.fmt StringCvt.DEC rT); printStream ",0x"; printStream(Word32.fmt StringCvt.HEX (wordToWord32 byteNo+byteOffset)) end else if (wordValue andb 0wx9f000000) = 0wx90000000 then (* ADRP *) let val rT = wordValue andb 0wx1f (* The value is a page offset *) val pageOffset = ((wordValue >> 0w29) andb 0w3) (* immlo *) orb ((wordValue >> 0w3) andb 0wx1fffc) in printStream "adrp\tx"; printStream(Word32.fmt StringCvt.DEC rT); printStream ",0x"; printStream(Word32.fmt StringCvt.HEX (pageOffset*0w4096)) end else if (wordValue andb 0wx7c000000) = 0wx14000000 then (* Unconditional branch. *) let (* The offset is signed and the destination may be earlier. 
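(* The shift pair below, left by wordSize-26 then arithmetic right by wordSize-28, sign-extends the 26-bit immediate and scales it to bytes; e.g. imm26 = 0wx3ffffff (i.e. -1) gives a byteOffset of ~4, so the printed target is the preceding instruction. *)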
*) val byteOffset = (wordValue andb 0wx03ffffff) << (Word.fromInt Word32.wordSize - 0w26) ~>> (Word.fromInt Word32.wordSize - 0w28) val opc = if (wordValue andb 0wx80000000) = 0w0 then "b" else "bl" in printStream opc; printStream "\t0x"; printStream(Word32.fmt StringCvt.HEX (wordToWord32 byteNo + byteOffset)) end else if (wordValue andb 0wxff000000) = 0wx54000000 then (* Conditional branch *) let val byteOffset = (wordValue andb 0wx00ffffe0) << (Word.fromInt Word32.wordSize - 0w24) ~>> (Word.fromInt Word32.wordSize - 0w21) in printStream "b."; printCondition(wordValue andb 0wxf); printStream "\t0x"; printStream(Word32.fmt StringCvt.HEX (wordToWord32 byteNo+byteOffset)) end else if (wordValue andb 0wx7e000000) = 0wx34000000 then (* Compare and branch *) let val byteOffset = (wordValue andb 0wx00ffffe0) << (Word.fromInt Word32.wordSize - 0w24) ~>> (Word.fromInt Word32.wordSize - 0w21) val oper = if (wordValue andb 0wx01000000) = 0w0 then "cbz" else "cbnz" val r = if (wordValue andb 0wx80000000) = 0w0 then "w" else "x" in printStream oper; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC (wordValue andb 0wx1f)); printStream ",0x"; printStream(Word32.fmt StringCvt.HEX (wordToWord32 byteNo+byteOffset)) end else if (wordValue andb 0wx7e000000) = 0wx36000000 then (* Test bit and branch *) let val byteOffset = (wordValue andb 0wx000fffe0) << (Word.fromInt Word32.wordSize - 0w20) ~>> (Word.fromInt Word32.wordSize - 0w17) val oper = if (wordValue andb 0wx01000000) = 0w0 then "tbz" else "tbnz" val b40 = (wordValue >> 0w19) andb 0wx1f val bitNo = b40 orb ((wordValue >> 0w26) andb 0wx20) val r = if bitNo < 0w32 then "w" else "x" in printStream oper; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC (wordValue andb 0wx1f)); printStream ",#"; printStream(Word32.fmt StringCvt.DEC bitNo); printStream ",0x"; printStream(Word32.fmt StringCvt.HEX (wordToWord32 byteNo+byteOffset)) end else if (wordValue andb 0wx3fe00000) = 0wx1A800000 then let val sf = wordValue >> 0w31 val opc = (wordValue >> 0w30) andb 0w1 val op2 = (wordValue >> 0w10) andb 0w3 val rT = wordValue andb 0wx1f val rN = (wordValue >> 0w5) andb 0wx1f val rM = (wordValue >> 0w16) andb 0wx1f val cond = (wordValue >> 0w12) andb 0wxf val opcode = case (opc, op2) of (0w0, 0w0) => "csel" | (0w0, 0w1) => "csinc" | (0w1, 0w0) => "csinv" | (0w1, 0w1) => "csneg" | _ => "??" - val r = if sf = 0w0 then "w" else "x" + val pReg = if sf = 0w0 then prWReg else prXReg in printStream opcode; printStream "\t"; - printStream r; printStream(Word32.fmt StringCvt.DEC rT); - printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rN); - printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rM); + pReg rT; printStream ","; pReg rN; printStream ","; pReg rM; printStream ","; printCondition cond end else if (wordValue andb 0wx7f800000) = 0wx13000000 then (* signed bitfield *) let val sf = wordValue >> 0w31 (* N is always the same as sf. 
*) (*val nBit = (wordValue >> 0w22) andb 0w1*) val immr = (wordValue >> 0w16) andb 0wx3f val imms = (wordValue >> 0w10) andb 0wx3f val rN = (wordValue >> 0w5) andb 0wx1f val rD = wordValue andb 0wx1f val (r, wordSize) = if sf = 0w0 then ("w", 0w32) else if sf = 0w1 then ("x", 0w64) else raise InternalError "Neither" in if imms = wordSize - 0w1 then printStream "asr\t" else printStream "sbfm\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rD); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rN); if imms = wordSize - 0w1 then (printStream ",#0x"; printStream(Word32.toString immr)) else ( printStream ",#0x"; printStream(Word32.toString immr); printStream ",#0x"; printStream(Word32.toString imms) ) end else if (wordValue andb 0wx7f800000) = 0wx33000000 then (* bitfield move *) let val sf = wordValue >> 0w31 (* N is always the same as sf. *) (*val nBit = (wordValue >> 0w22) andb 0w1*) val immr = (wordValue >> 0w16) andb 0wx3f val imms = (wordValue >> 0w10) andb 0wx3f val rN = (wordValue >> 0w5) andb 0wx1f val rD = wordValue andb 0wx1f val (r, wordSize) = if sf = 0w0 then ("w", 0w32) else ("x", 0w64) in if imms < immr then if rD = 0wx31 then printStream "bfc\t" else printStream "bfi\t" else printStream "bfxil\t"; if imms >= immr orelse rD <> 0w31 then ( printStream r; printStream(Word32.fmt StringCvt.DEC rD); printStream "," ) else (); printStream r; printStream(Word32.fmt StringCvt.DEC rN); (* Not certain that these are correct. *) if imms < immr then ( printStream ",#0x"; printStream(Word32.toString(wordSize - immr)); printStream ",#0x"; printStream(Word32.toString(imms+0w1)) ) else ( printStream ",#0x"; printStream(Word32.toString immr); printStream ",#0x"; printStream(Word32.toString(imms+0w1-immr)) ) end else if (wordValue andb 0wx7f800000) = 0wx53000000 then (* unsigned bitfield move *) let val sf = wordValue >> 0w31 (* N is always the same as sf. 
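(* For reference: "lsr Rd,Rn,#s" is ubfm with immr = s and imms = wordSize-1, and "lsl Rd,Rn,#s" is ubfm with immr = wordSize-s and imms = immr-1; these are exactly the two alias conditions, imms = wordSize-1 and imms+1 = immr, tested below. *)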
*) (*val nBit = (wordValue >> 0w22) andb 0w1*) val immr = (wordValue >> 0w16) andb 0wx3f val imms = (wordValue >> 0w10) andb 0wx3f val rN = (wordValue >> 0w5) andb 0wx1f val rD = wordValue andb 0wx1f val (r, wordSize) = if sf = 0w0 then ("w", 0w32) else ("x", 0w64) in if imms + 0w1 = immr then ( printStream "lsl\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rD); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rN); printStream ",#0x"; printStream(Word32.toString(wordSize - immr)) ) else if imms = wordSize - 0w1 then ( printStream "lsr\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rD); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rN); printStream ",#0x"; printStream(Word32.toString immr) ) else if imms < immr then ( printStream "ubfiz\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rD); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rN); printStream ",#0x"; printStream(Word32.toString(wordSize - immr)); printStream ",#0x"; printStream(Word32.toString(imms+0w1)) ) else ( printStream "ubfm\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rD); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rN); printStream ",#0x"; printStream(Word32.toString immr); printStream ",#0x"; printStream(Word32.toString imms) ) end else if (wordValue andb 0wx1f800000) = 0wx12000000 then (* logical immediate *) let val sf = wordValue >> 0w31 val opc = (wordValue >> 0w29) andb 0w3 val nBit = (wordValue >> 0w22) andb 0w1 val immr = (wordValue >> 0w16) andb 0wx3f val imms = (wordValue >> 0w10) andb 0wx3f val rN = (wordValue >> 0w5) andb 0wx1f val rD = wordValue andb 0wx1f val (opcode, r) = case (sf, opc, nBit) of (0w0, 0w0, 0w0) => ("and", "w") | (0w0, 0w1, 0w0) => ("orr", "w") | (0w0, 0w2, 0w0) => ("eor", "w") | (0w0, 0w3, 0w0) => ("ands", "w") | (0w1, 0w0, _) => ("and", "x") | (0w1, 0w1, _) => ("orr", "x") | (0w1, 0w2, _) => ("eor", "x") | (0w1, 0w3, _) => ("ands", "x") | _ => ("??", "?") in if rD = 0w31 andalso opc=0w3 then printStream "tst\t" else ( printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rD); printStream "," ); printStream r; printStream(Word32.fmt StringCvt.DEC rN); printStream ",#0x"; printStream(Word64.toString(decodeBitPattern{sf=sf, n=nBit, immr=immr, imms=imms})) end else if (wordValue andb 0wx5fe00000) = 0wx1ac00000 then (* Two source operations - shifts and divide. *) let val sf = wordValue >> 0w31 val s = (wordValue >> 0w29) andb 0w1 val rM = (wordValue >> 0w16) andb 0wx1f val opcode = (wordValue >> 0w10) andb 0wx3f val rN = (wordValue >> 0w5) andb 0wx1f val rD = wordValue andb 0wx1f val (oper, r) = case (sf, s, opcode) of (0w1, 0w0, 0wx2) => ("udiv", "x") | (0w1, 0w0, 0wx3) => ("sdiv", "x") | (0w0, 0w0, 0wx2) => ("udiv", "w") | (0w0, 0w0, 0wx3) => ("sdiv", "w") | (0w1, 0w0, 0wx8) => ("lsl", "x") | (0w0, 0w0, 0wx8) => ("lsl", "w") | (0w1, 0w0, 0wx9) => ("lsr", "x") | (0w0, 0w0, 0wx9) => ("lsr", "w") | (0w1, 0w0, 0wxa) => ("asr", "x") | (0w0, 0w0, 0wxa) => ("asr", "w") | _ => ("??", "?") in printStream oper; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rD); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rN); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rM) end else if (wordValue andb 0wx1f000000) = 0wx1b000000 then (* Three source operations - multiply add/subtract. 
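(* The short forms below correspond to Ra = 31, the zero register: "mul Rd,Rn,Rm" is madd with Ra = xzr, and mneg/smull similarly drop the zero addend, so when rA is 31 the final operand is not printed. *)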
*) let val sf = wordValue >> 0w31 val op54 = (wordValue >> 0w29) andb 0w3 val op31 = (wordValue >> 0w21) andb 0w7 val o0 = (wordValue >> 0w15) andb 0w1 val rM = (wordValue >> 0w16) andb 0wx1f val rA = (wordValue >> 0w10) andb 0wx1f val rN = (wordValue >> 0w5) andb 0wx1f val rD = wordValue andb 0wx1f val (oper, r1, r2) = case (sf, op54, op31, o0, rA) of (0w1, 0w0, 0w0, 0w0, 0w31) => ("mul", "x", "x") | (0w1, 0w0, 0w0, 0w0, _) => ("madd", "x", "x") | (0w1, 0w0, 0w0, 0w1, 0w31) => ("mneg", "x", "x") | (0w1, 0w0, 0w0, 0w1, _) => ("msub", "x", "x") | (0w0, 0w0, 0w0, 0w0, _) => ("madd", "w", "w") | (0w0, 0w0, 0w0, 0w1, _) => ("msub", "w", "w") | (0w1, 0w0, 0w2, 0w0, 0w31) => ("smulh", "x", "x") | (0w1, 0w0, 0w1, 0w0, 0w31) => ("smull", "x", "w") | (0w1, 0w0, 0w1, 0w0, _) => ("smaddl", "x", "w") | (0w1, 0w0, 0w1, 0w1, _) => ("smsubl", "x", "w") | _ => ("??", "?", "?") in printStream oper; printStream "\t"; printStream r1; printStream(Word32.fmt StringCvt.DEC rD); printStream ","; printStream r2; printStream(Word32.fmt StringCvt.DEC rN); printStream ","; printStream r2; printStream(Word32.fmt StringCvt.DEC rM); if rA = 0w31 then () else (printStream ","; printStream r1; printStream(Word32.fmt StringCvt.DEC rA)) end else if (wordValue andb 0wx7f20fc00) = 0wx1E200000 then (* Moves between floating point and general regs. *) let val sf = (wordValue >> 0w31) andb 0w1 and s = (wordValue >> 0w29) andb 0w1 and ptype = (wordValue >> 0w22) andb 0w3 and mode = (wordValue >> 0w19) andb 0w3 and opcode = (wordValue >> 0w16) andb 0w7 and rN = (wordValue >> 0w5) andb 0wx1f and rD = wordValue andb 0wx1f val (opc, dr, nr) = case (sf, s, ptype, mode, opcode) of (0w0, 0w0, 0w0, 0w0, 0w7) => ("fmov", "s", "w") (* w -> s *) | (0w0, 0w0, 0w0, 0w0, 0w6) => ("fmov", "w", "s") (* s -> w *) | (0w1, 0w0, 0w1, 0w0, 0w7) => ("fmov", "d", "x") (* d -> x *) | (0w1, 0w0, 0w1, 0w0, 0w6) => ("fmov", "x", "d") (* x -> d *) | (0w0, 0w0, 0w0, 0w0, 0w2) => ("scvtf", "w", "s") | (0w0, 0w0, 0w1, 0w0, 0w2) => ("scvtf", "w", "d") | (0w1, 0w0, 0w0, 0w0, 0w2) => ("scvtf", "x", "s") | (0w1, 0w0, 0w1, 0w0, 0w2) => ("scvtf", "x", "d") | (0w0, 0w0, 0w0, 0w0, 0w4) => ("fcvtas", "w", "s") (* s -> w *) | (0w0, 0w0, 0w0, 0w2, 0w0) => ("fcvtms", "w", "s") (* s -> w *) | (0w0, 0w0, 0w0, 0w1, 0w0) => ("fcvtps", "w", "s") (* s -> w *) | (0w0, 0w0, 0w0, 0w3, 0w0) => ("fcvtzs", "w", "s") (* s -> w *) | (0w0, 0w0, 0w1, 0w0, 0w4) => ("fcvtas", "w", "d") (* d -> w *) | (0w0, 0w0, 0w1, 0w2, 0w0) => ("fcvtms", "w", "d") (* d -> w *) | (0w0, 0w0, 0w1, 0w1, 0w0) => ("fcvtps", "w", "d") (* d -> w *) | (0w0, 0w0, 0w1, 0w3, 0w0) => ("fcvtzs", "w", "d") (* d -> w *) | (0w1, 0w0, 0w0, 0w0, 0w4) => ("fcvtas", "x", "s") (* s -> x *) | (0w1, 0w0, 0w0, 0w2, 0w0) => ("fcvtms", "x", "s") (* s -> x *) | (0w1, 0w0, 0w0, 0w1, 0w0) => ("fcvtps", "x", "s") (* s -> x *) | (0w1, 0w0, 0w0, 0w3, 0w0) => ("fcvtzs", "x", "s") (* s -> x *) | (0w1, 0w0, 0w1, 0w0, 0w4) => ("fcvtas", "x", "d") (* d -> x *) | (0w1, 0w0, 0w1, 0w2, 0w0) => ("fcvtms", "x", "d") (* d -> x *) | (0w1, 0w0, 0w1, 0w1, 0w0) => ("fcvtps", "x", "d") (* d -> x *) | (0w1, 0w0, 0w1, 0w3, 0w0) => ("fcvtzs", "x", "d") (* d -> x *) | _ => ("?", "?", "?") in printStream opc; printStream "\t"; printStream dr; printStream(Word32.fmt StringCvt.DEC rD); printStream ","; printStream nr; printStream(Word32.fmt StringCvt.DEC rN) end else if (wordValue andb 0wxff200c00) = 0wx1E200800 then (* Floating point two source operations. 
*) let val pt = (wordValue >> 0w22) andb 0w3 and rM = (wordValue >> 0w16) andb 0wx1f and opc = (wordValue >> 0w12) andb 0wxf and rN = (wordValue >> 0w5) andb 0wx1f and rT = wordValue andb 0wx1f val (opcode, r) = case (pt, opc) of (0w0, 0wx0) => ("fmul", "s") | (0w0, 0wx1) => ("fdiv", "s") | (0w0, 0wx2) => ("fadd", "s") | (0w0, 0wx3) => ("fsub", "s") | (0w1, 0wx0) => ("fmul", "d") | (0w1, 0wx1) => ("fdiv", "d") | (0w1, 0wx2) => ("fadd", "d") | (0w1, 0wx3) => ("fsub", "d") | _ => ("??", "?") in printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rT); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rN); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rM) end else if (wordValue andb 0wxff207c00) = 0wx1E204000 then (* Floating point single source. *) let val pt = (wordValue >> 0w22) andb 0w3 and opc = (wordValue >> 0w15) andb 0wx3f and rN = (wordValue >> 0w5) andb 0wx1f and rT = wordValue andb 0wx1f val (opcode, rS, rD) = case (pt, opc) of (0w0, 0wx0) => ("fmov", "s", "s") | (0w0, 0wx1) => ("fabs", "s", "s") | (0w0, 0wx2) => ("fneg", "s", "s") | (0w0, 0wx5) => ("fcvt", "s", "d") | (0w1, 0wx0) => ("fmov", "d", "d") | (0w1, 0wx1) => ("fabs", "d", "d") | (0w1, 0wx2) => ("fneg", "d", "d") | (0w1, 0wx4) => ("fcvt", "d", "s") | _ => ("??", "?", "?") in printStream opcode; printStream "\t"; printStream rD; printStream(Word32.fmt StringCvt.DEC rT); printStream ","; printStream rS; printStream(Word32.fmt StringCvt.DEC rN) end else if (wordValue andb 0wxff20fc07) = 0wx1E202000 then (* Floating point comparison *) let val pt = (wordValue >> 0w22) andb 0w3 and rM = (wordValue >> 0w16) andb 0wx1f and rN = (wordValue >> 0w5) andb 0wx1f and opc = (wordValue >> 0w3) andb 0w3 val (opcode, r) = case (pt, opc) of (0w0, 0wx0) => ("fcmp", "s") | (0w1, 0wx0) => ("fcmp", "d") | (0w0, 0wx2) => ("fcmpe", "s") | (0w1, 0wx2) => ("fcmpe", "d") | _ => ("??", "?") in printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rN); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rM) end else if (wordValue andb 0wxffffffe0) = 0wx2F00E400 then (* movi dn,#0 *) let val rD = wordValue andb 0wx1f in printStream "movi\td"; printStream(Word32.fmt StringCvt.DEC rD); printStream ",#0" end else if (wordValue andb 0wx1e000000) = 0wx02000000 then (* This is an unallocated range. We use it for the register mask. *) let fun printMask (0w25, _) = () | printMask (i, comma) = if ((0w1 << i) andb wordValue) <> 0w0 then ( if comma then printStream ", " else (); printStream "x"; printStream(Word.fmt StringCvt.DEC i); printMask(i+0w1, true) ) else printMask(i+0w1, comma) in printStream "["; printMask(0w0, false); printStream "]" end else printStream "?" ; printStream "\n" end fun printAll i = if i = numInstructions then () else (printWordAt i; printAll(i+0w1)) in printStream functionName; printStream ":\n"; printAll 0w0 end (* Set the offsets of ADRP+LDR and ADRP+ADD instruction pairs. The values in these instructions are, to some extent, absolute addresses so this needs to be done by the RTS. firstNonAddrConst and firstAddrConst are the offsets in bytes. 
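(* ConstArm64AdrpLdr64, ConstArm64AdrpLdr32 and ConstArm64AdrpAdd are relocation kinds defined elsewhere; presumably the RTS splits the supplied value into the 4K-page delta for the adrp and a low 12-bit part for the following ldr or add, with the 32-bit ldr variant needed because a 32-bit load scales its immediate by 4 rather than 8. *)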
*) fun setADRPAddresses(ops, codeVec, firstNonAddrConst, firstAddrConst, addrConstMap, nonAddrConstMap) = let fun setADRPAddrs([], _ , _, _) = () | setADRPAddrs(LoadAddressLiteral{length=ref BrExtended, ...} :: tail, wordNo, aConstNum, nonAConstNum) = let (* Address constants are 32-bits in 32-in-64 and 64-bits in native 64-bits *) val constPos = Array.sub(addrConstMap, aConstNum) val addrOfConstant (* byte offset *) = firstAddrConst + constPos * Address.wordSize in codeVecPutConstant (codeVec, wordNo * 0w4, toMachineWord addrOfConstant, if is32in64 then ConstArm64AdrpLdr32 else ConstArm64AdrpLdr64); setADRPAddrs(tail, wordNo+0w2, aConstNum+1, nonAConstNum) end | setADRPAddrs(LoadNonAddressLiteral{length=ref BrExtended, ...} :: tail, wordNo, aConstNum, nonAConstNum) = let (* The offset is in 32-bit words. These are always 64-bits. *) val constPos = Array.sub(nonAddrConstMap, nonAConstNum) val offsetOfConstant (* byte offset *) = firstNonAddrConst+constPos*0w8 in codeVecPutConstant (codeVec, wordNo * 0w4, toMachineWord offsetOfConstant, ConstArm64AdrpLdr64); setADRPAddrs(tail, wordNo+0w2, aConstNum, nonAConstNum+1) end | setADRPAddrs(LoadFPLiteral{length=ref BrExtended, isDouble, ...} :: tail, wordNo, aConstNum, nonAConstNum) = let (* The offset is in 32-bit words and the constants themselves are always 64-bits. If we're loading a 32-bit float we have to use 32-bit offsets. *) val constPos = Array.sub(nonAddrConstMap, nonAConstNum) val offsetOfConstant (* byte offset *) = firstNonAddrConst+constPos*0w8 in codeVecPutConstant (codeVec, wordNo * 0w4, toMachineWord offsetOfConstant, if isDouble then ConstArm64AdrpLdr64 else ConstArm64AdrpLdr32); setADRPAddrs(tail, wordNo+0w2, aConstNum, nonAConstNum+1) end | setADRPAddrs(LoadLabelAddress{label=ref labs, length=ref BrExtended, ...} :: tail, wordNo, aConstNum, nonAConstNum) = let val dest = !(hd labs) * 0w4 in codeVecPutConstant (codeVec, wordNo * 0w4, toMachineWord dest, ConstArm64AdrpAdd); setADRPAddrs(tail, wordNo+0w2, aConstNum, nonAConstNum) end | setADRPAddrs(instr :: tail, wordNo, aConstNum, nonAConstNum) = setADRPAddrs(tail, wordNo+Word.fromInt(codeSize instr), aConstNum, nonAConstNum) in setADRPAddrs (ops, 0w0, 0, 0) end (* Although this is used locally it must be defined at the top level otherwise a new RTS function will be compiled every time the containing function is called *) val sortFunction: (machineWord * int) array -> bool = RunCall.rtsCallFast1 "PolySortArrayOfAddresses" (* Adds the constants onto the code, and copies the code into a new segment *) fun generateCode {instrs, name=functionName, parameters, resultClosure, profileObject} = let val printStream = Pretty.getSimplePrinter(parameters, []) and printAssemblyCode = Debug.getParameter Debug.assemblyCodeTag parameters local (* Extract the constants. *) fun getConsts(LoadAddressLiteral {value, ...}, (addrs, nonAddrs, addrCount, nonAddrCount)) = ((value, addrCount)::addrs, nonAddrs, addrCount+1, nonAddrCount) | getConsts(LoadNonAddressLiteral {value, ...}, (addrs, nonAddrs, addrCount, nonAddrCount)) = (addrs, (value, nonAddrCount)::nonAddrs, addrCount, nonAddrCount+1) | getConsts(LoadFPLiteral {value, isDouble, ...}, (addrs, nonAddrs, addrCount, nonAddrCount)) = let (* When loading a float we will only access the first 32-bits so if this is big-endian we have to shift the value so it's there. 
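(* For example, the Float32 constant 1.0 (bit pattern 0wx3F800000) becomes 0wx3F80000000000000, so on a big-endian machine it occupies the first four bytes of the stored 64-bit constant word, which is all a 32-bit load will read. *)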
*) val shifted = if not isDouble andalso isBigEndian then LargeWord.<<(value, 0w32) else value in (addrs, (shifted, nonAddrCount)::nonAddrs, addrCount, nonAddrCount+1) end | getConsts(_, consts) = consts val (addressConstants, nonAddressConstants, addrConstCount, nonAddrConstCount) = List.foldl getConsts ([], [], 0, 0) instrs (* Sort the non-address constants to remove duplicates. There don't seem to be many in practice. Since we're not actually interested in the order but only sorting to remove duplicates we can use a stripped-down Quicksort. *) fun sort([], out) = out | sort((median, addr) :: tl, out) = partition(median, tl, [addr], [], [], out) and partition(median, [], addrs, less, greater, out) = sort(less, sort(greater, (median, addrs) :: out)) | partition(median, (entry as (value, addr)) :: tl, addrs, less, greater, out) = if value = median then partition(median, tl, addr::addrs, less, greater, out) else if value < median then partition(median, tl, addrs, entry :: less, greater, out) else partition(median, tl, addrs, less, entry :: greater, out) (* Non-address constants. We can't use any ordering on them because a GC could change the values half way through the sort. Instead we use a simple search for a small number of constants and use an RTS call for larger numbers. We want to avoid quadratic cost when there are large numbers. *) val sortedConstants = if List.length addressConstants < 10 then let fun findDups([], out) = out | findDups((value, addr) :: tl, out) = let fun partition(e as (v, a), (eq, neq)) = if PolyML.pointerEq(value, v) then (a :: eq, neq) else (eq, e :: neq) val (eqAddr, neq) = List.foldl partition ([addr], []) tl in findDups(neq, (value, eqAddr) :: out) end in findDups(addressConstants, []) end else let val arrayToSort = Array.fromList addressConstants val _ = sortFunction arrayToSort fun makeList((v, a), []) = [(v, [a])] | makeList((v, a), l as (vv, aa) :: tl) = if PolyML.pointerEq(v, vv) then (vv, a :: aa) :: tl else (v, [a]) :: l in (Array.foldl makeList [] arrayToSort) end in val addressConsts = sortedConstants and nonAddressConsts = sort(nonAddressConstants, []) : (Word64.word * int list) list and addrConstCount = addrConstCount and nonAddrConstCount = nonAddrConstCount end (* Create maps that indicate for each constant where it is in the constant area. *) val addrConstMap = Array.array(addrConstCount, 0w0) and nonAddrConstMap = Array.array(nonAddrConstCount, 0w0) val _ = List.foldl(fn ((_, cnums), n) => (List.app(fn i => Array.update(addrConstMap, i, n)) cnums; n+0w1)) 0w0 addressConsts val _ = List.foldl(fn ((_, cnums), n) => (List.app(fn i => Array.update(nonAddrConstMap, i, n)) cnums; n+0w1)) 0w0 nonAddressConsts (* Generate the code and set the constant addresses at the same time. *) val (byteVec, nativeWordsOfCode) = genCode(instrs, List.map #1 addressConsts, List.map #1 nonAddressConsts, addrConstMap, nonAddrConstMap) val wordsOfCode = nativeWordsOfCode * wordsPerNativeWord (* +3 for profile count, function name and constants count *) val numOfConst = List.length addressConsts val segSize = wordsOfCode + Word.fromInt numOfConst + 0w4 val firstConstant = wordsOfCode + 0w3 (* Add 3 for no of consts, fn name and profile count. *) (* Put in the number of constants. This must go in before we actually put in any constants. *) local val lastWord = segSize - 0w1 in val () = setWord(LargeWord.fromInt(numOfConst + 2), wordsOfCode, byteVec) (* Set the last word of the code to the (negative) byte offset of the start of the code area from the end of this word. 
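(* For example, with 8-byte words and two address constants the value stored here is (2+3) * ~8 = ~40, the distance back over the constants, the profile object, the function name and the count word. *)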
*) val () = setWord(LargeWord.fromInt(numOfConst + 3) * ~(Word.toLarge Address.wordSize), lastWord, byteVec) end (* Now we've filled in all the size info we need to convert the segment into a proper code segment before it's safe to put in any ML values. *) val codeVec = byteVecToCodeVec(byteVec, resultClosure) local val name : string = functionName val nameWord : machineWord = toMachineWord name in val () = codeVecPutWord (codeVec, wordsOfCode+0w1, nameWord) end (* Profile ref. A byte ref used by the profiler in the RTS. *) val () = codeVecPutWord (codeVec, wordsOfCode+0w2, profileObject) (* and then copy the constants from the constant list. *) local fun setConstant((value, _), num) = ( codeVecPutWord (codeVec, firstConstant + num, value); num+0w1 ) in val _ = List.foldl setConstant 0w0 addressConsts end val () = setADRPAddresses(instrs, codeVec, (nativeWordsOfCode-Word.fromInt(List.length nonAddressConsts)) * Address.nativeWordSize, firstConstant * Address.wordSize, addrConstMap, nonAddrConstMap) in if printAssemblyCode then (* print out the code *) (printCode (codeVec, functionName, wordsOfCode, printStream); printStream"\n") else (); codeVecLock(codeVec, resultClosure) end (* copyCode *) structure Sharing = struct type closureRef = closureRef type instr = instr type xReg = xReg type vReg = vReg type labels = labels type condition = condition type shiftType = shiftType type wordSize = wordSize type 'a extend = 'a extend type scale = scale end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64CodetreeToICode.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64CodetreeToICode.ML index 1cd0d7f4..66990ca8 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64CodetreeToICode.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64CodetreeToICode.ML @@ -1,3284 +1,3354 @@ (* Copyright David C. J. Matthews 2021-2 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64CodetreeToICode( structure BackendTree: BACKENDINTERMEDIATECODE structure Arm64ICode: ARM64ICODE structure Debug: DEBUG structure Arm64Foreign: FOREIGNCALL structure ICodeTransform: ARM64ICODETRANSFORM structure CodeArray: CODEARRAY structure Pretty:PRETTY sharing Arm64ICode.Sharing = ICodeTransform.Sharing = CodeArray.Sharing = BackendTree.Sharing ): GENCODE = struct open BackendTree open Address open Arm64ICode open CodeArray open BuiltIns val useLSEAtomics = false (* Use 8.1 atomics? Not for the moment: keep compatibility with older processors. *) (* Reversed cons and append to make the code easier to read. 
*) infix 5 <::> <@> fun tl <::> hd = hd :: tl and snd <@> fst = fst @ snd type iCodeAbstract = (preg, pregOrZero, preg) arm64ICode and basicBlockAbstract = (preg, pregOrZero, preg) basicBlock exception InternalError = Misc.InternalError fun taggedWord64 w: Word64.word = w * 0w2 + 0w1 - and taggedWord w: word = w * 0w2 + 0w1 datatype blockStruct = BlockSimple of iCodeAbstract | BlockExit of iCodeAbstract | BlockLabel of int | BlockFlow of controlFlow - | BlockBegin of { regArgs: (preg * xReg) list, stackArgs: stackLocn list } + | BlockBegin of { regArgs: (preg * xReg) list, fpRegArgs: (preg * vReg) list, stackArgs: stackLocn list } | BlockRaiseAndHandle of iCodeAbstract * int | BlockOptionalHandle of {call: iCodeAbstract, handler: int, label: int } val moveRegister = BlockSimple o MoveRegister and loadNonAddressConstant = BlockSimple o LoadNonAddressConstant and loadAddressConstant = BlockSimple o LoadAddressConstant and loadWithConstantOffset = BlockSimple o LoadWithConstantOffset and loadFPWithConstantOffset = BlockSimple o LoadFPWithConstantOffset and loadWithIndexedOffset = BlockSimple o LoadWithIndexedOffset and loadFPWithIndexedOffset = BlockSimple o LoadFPWithIndexedOffset and getThreadId = BlockSimple o GetThreadId and objectIndexAddressToAbsolute = BlockSimple o ObjectIndexAddressToAbsolute and absoluteToObjectIndex = BlockSimple o AbsoluteToObjectIndex and allocateMemoryFixed = BlockSimple o AllocateMemoryFixed and allocateMemoryVariable = BlockSimple o AllocateMemoryVariable and initialiseMem = BlockSimple o InitialiseMem and storeWithConstantOffset = BlockSimple o StoreWithConstantOffset and storeFPWithConstantOffset = BlockSimple o StoreFPWithConstantOffset and storeWithIndexedOffset = BlockSimple o StoreWithIndexedOffset and storeFPWithIndexedOffset = BlockSimple o StoreFPWithIndexedOffset and addSubImmediate = BlockSimple o AddSubImmediate and addSubRegister = BlockSimple o AddSubRegister and logicalImmediate = BlockSimple o LogicalImmediate and logicalRegister = BlockSimple o LogicalRegister and shiftRegister = BlockSimple o ShiftRegister and multiplication = BlockSimple o Multiplication and division = BlockSimple o Division and pushToStack = BlockSimple o PushToStack and loadStack = BlockSimple o LoadStack and storeToStack = BlockSimple o StoreToStack and containerAddress = BlockSimple o ContainerAddress and resetStackPtr = BlockSimple o ResetStackPtr and tagValue = BlockSimple o TagValue and untagValue = BlockSimple o UntagValue and boxLarge = BlockSimple o BoxLarge and unboxLarge = BlockSimple o UnboxLarge and boxTagFloat = BlockSimple o BoxTagFloat and unboxTagFloat = BlockSimple o UnboxTagFloat and loadAcquire = BlockSimple o LoadAcquire and storeRelease = BlockSimple o StoreRelease and bitFieldShift = BlockSimple o BitFieldShift and bitFieldInsert = BlockSimple o BitFieldInsert and compareByteVectors = BlockSimple o CompareByteVectors and blockMove = BlockSimple o BlockMove and addSubXSP = BlockSimple o AddSubXSP and touchValue = BlockSimple o TouchValue and loadAcquireExclusive = BlockSimple o LoadAcquireExclusive and storeReleaseExclusive = BlockSimple o StoreReleaseExclusive and memoryBarrier = BlockSimple MemoryBarrier and convertIntToFloat = BlockSimple o ConvertIntToFloat and convertFloatToInt = BlockSimple o ConvertFloatToInt and unaryFloatingPt = BlockSimple o UnaryFloatingPt and binaryFloatingPoint = BlockSimple o BinaryFloatingPoint and compareFloatingPoint = BlockSimple o CompareFloatingPoint and cpuYield = BlockSimple CPUYield val atomicOperation = 
BlockSimple o AtomicOperation val shiftConstant = BlockSimple o shiftConstant (* Many operations use 32-bit arguments in 32-in-64 and 64-bit in native 64. *) val polyWordLoadSize = if is32in64 then Load32 else Load64 val polyWordOpSize = if is32in64 then OpSize32 else OpSize64 val tagBitMask64 = Word64.<<(Word64.fromInt ~1, 0w1) val tagBitMask32 = Word64.andb(tagBitMask64, 0wxffffffff) val polyWordTagBitMask = if is32in64 then tagBitMask32 else tagBitMask64 (* The flags byte is the high-order byte of length word. *) val flagsByteOffset = if isBigEndian then ~ (Word.toInt wordSize) else ~1 (* Size of operand in bytes and therefore the scale factor. *) fun opWordSize Load64 = 8 | opWordSize Load32 = 4 | opWordSize Load16 = 2 | opWordSize Load8 = 1 (* Shift for each size. i.e. log2 of opWordSize. *) fun loadShift Load64 = 0w3 | loadShift Load32 = 0w2 | loadShift Load16 = 0w1 | loadShift Load8 = 0w0 fun precisionToFpSize PrecSingle = Float32 | precisionToFpSize PrecDouble = Double64 - fun codeFunctionToArm64({body, localCount, name, argTypes, closure, ...}:bicLambdaForm, debugSwitches, resultClosure) = + fun codeFunctionToArm64({body, localCount, name, argTypes, resultType=fnResultType, closure, ...}:bicLambdaForm, debugSwitches, resultClosure) = let (* Pseudo-registers are allocated sequentially and the properties added to the list. *) val pregCounter = ref 0 val pregPropList = ref [] fun newPReg() = let val regNo = !pregCounter before pregCounter := !pregCounter + 1 val () = pregPropList := RegPropGeneral :: !pregPropList in PReg regNo end and newUReg() = let val regNo = !pregCounter before pregCounter := !pregCounter + 1 val () = pregPropList := RegPropUntagged :: !pregPropList in PReg regNo end and newStackLoc size = let val regNo = !pregCounter before pregCounter := !pregCounter + 1 val () = pregPropList := RegPropStack size :: !pregPropList in StackLoc{size=size, rno=regNo} end and newMergeReg() = let val regNo = !pregCounter before pregCounter := !pregCounter + 1 val () = pregPropList := RegPropMultiple :: !pregPropList in PReg regNo end datatype locationValue = NoLocation | PregLocation of preg | StackContainer of { container: stackLocn, stackOffset: int } | RegisterContainer of preg list val locToPregArray = Array.array(localCount, NoLocation) val labelCounter = ref 1 (* Start at 1. Zero is used for the root. *) fun newLabel() = !labelCounter before labelCounter := !labelCounter + 1 val ccRefCounter = ref 0 fun newCCRef() = CcRef(!ccRefCounter) before ccRefCounter := !ccRefCounter + 1 (* The profile object is a single mutable with the F_bytes bit set. *) val profileObject = CodeArray.createProfileObject() (* Switch to indicate if we want to trace where live data has been allocated. *) (* TODO: This should be used in AllocateMemoryOperation and BoxValue and possibly AllocateMemoryVariable. *) val addAllocatingFunction = Debug.getParameter Debug.profileAllocationTag debugSwitches = 1 datatype destination = SpecificPReg of preg | NoResult | AnyReg (* Context type. *) type context = { loopArgs: (preg list * int * int) option, stackPtr: int, currHandler: int option, overflowBlock: int option ref } datatype argLoc = - ArgumentIsInReg of { realReg: xReg, argReg: preg } + ArgumentIsInReg of preg | ArgumentIsOnStack of { stackOffset: int, stackReg: stackLocn } | ArgumentIsRegContainer of preg list (* An address as either suitable for Load/StoreWithConstantOffset or else Load/StoreWithIndexedOffset. 
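(* The constant-offset form can be used when the unit offset is below 4096 and the byte offset is above ~256, matching the ranges of the scaled unsigned-offset and unscaled signed load/store encodings; otherwise the offset is first loaded into a register and the indexed form is used, as in storeAtWordOffset and wordAddressOffset below. *)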
*) datatype addressKind = AddrOffset of {base: preg, offset: int} | AddrIndex of {base: preg, index: preg} (* Pseudo-regs for the result, the closure and the args that were passed in real regs. *) val resultTarget = newPReg() val closureRegAddr = newPReg() val returnAddrReg = newPReg() val generalArgRegs = [X0, X1, X2, X3, X4, X5, X6, X7] + (* we just use the first four. The ARM API uses V0 to V8. *) + val floatingPtArgRegs = [V0, V1, V2, V3] (* If a container is larger than this it is passed on the stack. *) val smallContainerSize = 4 (* Create a map for the arguments indicating their register or stack location. *) local val containerRegs = case List.filter(fn ContainerType _ => true | _ => false) argTypes of [] => NONE | [ContainerType s] => if s <= smallContainerSize then SOME(List.tabulate(s, fn _ => newMergeReg())) else SOME [] (* Larger containers return their result on the stack. *) | _ => raise InternalError "more than one container arg" (* Select the appropriate argument register depending on the argument type. *) - fun argTypesToArgEntries([], _, _) = ([], [], [], []) + fun argTypesToArgEntries([], _, _, _) = ([], [], [], [], []) - | argTypesToArgEntries(ContainerType s :: tl, gRegs, n) = + | argTypesToArgEntries(DoubleFloatType :: tl, gRegs, fpReg :: fpRegs, n) = + let + val (argTypes, argCode, argRegs, fpArgRegs, stackArgs) = argTypesToArgEntries(tl, gRegs, fpRegs, n-1) + val pRegArg = newPReg() and uRegArg = newUReg() + in + (ArgumentIsInReg pRegArg :: argTypes, + boxTagFloat{source=uRegArg, dest=pRegArg, floatSize=Double64, saveRegs=[]} :: argCode, + argRegs, (uRegArg, fpReg) :: fpArgRegs, stackArgs) + end + + | argTypesToArgEntries(SingleFloatType :: tl, gRegs, fpReg :: fpRegs, n) = + let + val (argTypes, argCode, argRegs, fpArgRegs, stackArgs) = argTypesToArgEntries(tl, gRegs, fpRegs, n-1) + val pRegArg = newPReg() and uRegArg = newUReg() + in + (ArgumentIsInReg pRegArg :: argTypes, + boxTagFloat{source=uRegArg, dest=pRegArg, floatSize=Float32, saveRegs=[]} :: argCode, + argRegs, (uRegArg, fpReg) :: fpArgRegs, stackArgs) + end + + | argTypesToArgEntries(ContainerType s :: tl, gRegs, fpRegs, n) = if s <= smallContainerSize then let - val (argTypes, argCode, argRegs, stackArgs) = argTypesToArgEntries(tl, gRegs, n-1) + val (argTypes, argCode, argRegs, fpArgRegs, stackArgs) = argTypesToArgEntries(tl, gRegs, fpRegs, n-1) val regs = valOf containerRegs in - (ArgumentIsRegContainer regs :: argTypes, argCode, argRegs, stackArgs) + (ArgumentIsRegContainer regs :: argTypes, argCode, argRegs, fpArgRegs, stackArgs) end (* The address of a larger container is passed as an argument *) - else argTypesToArgEntries(GeneralType :: tl, gRegs, n) + else argTypesToArgEntries(GeneralType :: tl, gRegs, fpRegs, n) - | argTypesToArgEntries(_ :: tl, gReg :: gRegs, n) = + | argTypesToArgEntries(_ :: tl, gReg :: gRegs, fpRegs, n) = (* This deals with general arguments but also with extra floating point arguments. They are boxed as usual. 
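(* With this change a double argument in one of the first four floating-point argument positions arrives in V0-V3 (the underlying ABI provides v0-v7) and the function prologue boxes it with boxTagFloat into an ordinary tagged register; floating-point arguments beyond those four take this general path and arrive already boxed in a general register or on the stack. *)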
*) let - val (argTypes, argCode, argRegs, stackArgs) = - argTypesToArgEntries(tl, gRegs, n-1) + val (argTypes, argCode, argRegs, fpArgRegs, stackArgs) = + argTypesToArgEntries(tl, gRegs, fpRegs, n-1) val argReg=newPReg() in - (ArgumentIsInReg{realReg=gReg, argReg=argReg} :: argTypes, argCode, (argReg, gReg) :: argRegs, stackArgs) + (ArgumentIsInReg argReg :: argTypes, argCode, (argReg, gReg) :: argRegs, fpArgRegs, stackArgs) end - | argTypesToArgEntries(_ :: tl, [], n) = + | argTypesToArgEntries(_ :: tl, [], fpRegs, n) = let - val (argTypes, argCode, argRegs, stackArgs) = argTypesToArgEntries(tl, [], n-1) + val (argTypes, argCode, argRegs, fpArgRegs, stackArgs) = argTypesToArgEntries(tl, [], fpRegs, n-1) val stackLoc = newStackLoc 1 in - (ArgumentIsOnStack {stackOffset=n, stackReg = stackLoc } :: argTypes, argCode, argRegs, stackLoc :: stackArgs) + (ArgumentIsOnStack {stackOffset=n, stackReg = stackLoc } :: argTypes, argCode, argRegs, fpArgRegs, stackLoc :: stackArgs) end - val (argEntries, argCode, argRegs, stackArguments) = - argTypesToArgEntries(argTypes, generalArgRegs, List.length argTypes) + val (argEntries, argCode, argRegs, fpArgRegs, stackArguments) = + argTypesToArgEntries(argTypes, generalArgRegs, floatingPtArgRegs, List.length argTypes) val clReg = case closure of [] => [] | _ => [(closureRegAddr, X8)] val retReg = [(returnAddrReg, X30)] in val argumentVector = Vector.fromList argEntries (* Start code for the function. *) val beginInstructions = argCode @ - [BlockBegin{regArgs=retReg @ clReg @ argRegs, stackArgs=stackArguments }] + [BlockBegin{regArgs=retReg @ clReg @ argRegs, fpRegArgs=fpArgRegs, stackArgs=stackArguments }] (* The number of arguments on the stack. Needed in return instrs and tail calls. *) val currentStackArgs = List.length stackArguments val containerResults = Option.map(fn regs => ListPair.zip(regs, generalArgRegs)) containerRegs end - (* TODO: Return the values of the container registers if we have multiple results. *) - - fun returnInstruction({stackPtr, ...}, resReg, tailCode) = let - val results = getOpt(containerResults, [(resReg, X0)]) (* Return the result in X0 unless there's a container. 
*) + val (results, fpResults, unBoxCode) = + case (containerResults, fnResultType) of + (NONE, GeneralType) => ([(resReg, X0)], [], tailCode) + | (NONE, DoubleFloatType) => + let + val reg = newUReg() + in + ([], [(reg, V0)], tailCode <::> unboxTagFloat{ floatSize=Double64, source=resReg, dest=reg }) + end + | (NONE, SingleFloatType) => + let + val reg = newUReg() + in + ([], [(reg, V0)], tailCode <::> unboxTagFloat{ floatSize=Float32, source=resReg, dest=reg }) + end + | (SOME cResult, GeneralType) => (cResult, [], tailCode) + | _ => raise InternalError "returnInstruction: result type mismatch" in - BlockExit(ReturnResultFromFunction{results=results, returnReg = returnAddrReg, numStackArgs=currentStackArgs}) :: - (if stackPtr <> 0 then resetStackPtr{numWords=stackPtr} :: tailCode else tailCode) + BlockExit(ReturnResultFromFunction{results=results, fpResults=fpResults, returnReg=returnAddrReg, numStackArgs=currentStackArgs}) :: + (if stackPtr <> 0 then resetStackPtr{numWords=stackPtr} :: unBoxCode else unBoxCode) end fun asTarget(SpecificPReg preg) = preg | asTarget _ = newPReg() fun moveToResult(SpecificPReg tReg, code, sReg) = (moveRegister{source=sReg, dest=tReg} :: code, tReg, false) | moveToResult(AnyReg, code, sReg) = (code, sReg, false) | moveToResult(NoResult, code, sReg) = let val tReg = newPReg() in (moveRegister{source=sReg, dest=tReg} :: code, tReg, false) end (* Store a register at a given offset. This may have to use an index register if the offset is too large. *) fun storeAtWordOffset(toStore, offset, base, loadSize, tailCode) = let val wSize = opWordSize loadSize val byteOffset = offset*wSize in if offset < 4096 andalso byteOffset > ~256 then storeWithConstantOffset{base=base, source=toStore, byteOffset=byteOffset, loadType=loadSize} :: tailCode else let val indexReg = newUReg() in storeWithIndexedOffset{ base=base, source=toStore, index=indexReg, loadType=loadSize, signExtendIndex=false } :: loadNonAddressConstant{ source=LargeWord.fromInt offset, dest=indexReg } :: tailCode end end (* Allocate a fixed size cell with a reference to the profile object if we want to trace the location of live data. Currently only used for tuples and closures. *) fun allocateWithProfileRev(n, flags, memAddr, tlCode) = let fun doAllocation(words, flags, tlCode) = let val wordsRequired = if is32in64 then (* Have to round this up to 8 bytes *) Word64.andb(Word64.fromInt(words+2), ~ 0w2) else Word64.fromInt(words+1) val bytesRequired = Word64.fromLarge(Word.toLarge wordSize) * wordsRequired val lengthWord = Word64.orb(Word64.fromInt words, Word64.<<(Word64.fromLarge(Word8.toLarge flags), if is32in64 then 0w24 else 0w56)) val lengthReg = newUReg() in storeWithConstantOffset{ source=lengthReg, base=memAddr, byteOffset= ~(Word.toInt wordSize), loadType=polyWordLoadSize } :: loadNonAddressConstant{ source=lengthWord, dest=lengthReg } :: allocateMemoryFixed{bytesRequired=bytesRequired, dest=memAddr, saveRegs=[]} :: tlCode end in if addAllocatingFunction then let val profReg = newPReg() in storeAtWordOffset(profReg, n, memAddr, polyWordLoadSize, loadAddressConstant{ source=profileObject, dest=profReg} :: doAllocation(n+1, Word8.orb(flags, Address.F_profile), tlCode)) end else doAllocation(n, flags, tlCode) end (* Return a unit result. *) fun returnUnit(target, code, exit) = let val tReg = asTarget target in (loadNonAddressConstant{source=taggedWord64 0w0, dest=tReg} :: code, tReg, exit) end (* Create a bool result from a test by returning true or false. 
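(* The blocks generated below (built in reverse) are: a conditional branch on ccRef; a true label loading the tagged constant 0w3 (taggedWord64 0w1); a false label loading 0w1 (taggedWord64 0w0); and a merge label that copies the merge register into the target. *)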
*) fun makeBoolResultRev(condition, ccRef, target, testCode) = let val trueLab = newLabel() and falseLab = newLabel() and mergeLab = newLabel() val mergeReg = newMergeReg() in moveRegister{dest=target, source=mergeReg} :: BlockLabel mergeLab :: BlockFlow(Unconditional mergeLab) :: loadNonAddressConstant{dest=mergeReg, source=taggedWord64 0w0} :: BlockLabel falseLab :: BlockFlow(Unconditional mergeLab) :: loadNonAddressConstant{dest=mergeReg, source=taggedWord64 0w1} :: BlockLabel trueLab :: BlockFlow(Conditional{ ccRef=ccRef, condition=condition, trueJump=trueLab, falseJump=falseLab }) :: testCode end (* Return an absolute address in both native addressing and 32-in-64. *) fun getAbsoluteAddress(code, baseReg) = if is32in64 then let val absReg = newUReg() in (objectIndexAddressToAbsolute{ source=baseReg, dest=absReg } :: code, absReg) end else (code, baseReg) (* Load a value aligned on a 64 or 32-bit boundary. offset is the number of units. Typically this will be a polyword. *) fun wordAddressOffset(destination, baseReg1, offset, loadOp, code) = let val dReg = asTarget destination val opWordSize = opWordSize loadOp val byteOffset = offset * opWordSize val (codeBase, baseReg) = getAbsoluteAddress(code, baseReg1) val code = if offset < 4096 andalso byteOffset > ~256 then loadWithConstantOffset{base=baseReg, dest=dReg, byteOffset=byteOffset, loadType=loadOp} :: codeBase else let val indexReg = newUReg() in loadWithIndexedOffset{ base=baseReg, dest=dReg, index=indexReg, loadType=loadOp, signExtendIndex=false } :: loadNonAddressConstant{ source=LargeWord.fromInt offset, dest=indexReg } :: codeBase end in (code, dReg, false) end (* See if we have a container and return the entry if present. *) datatype containerType = NoContainer | ContainerOnStack of { container: stackLocn, stackOffset: int } | ContainerInRegs of preg list fun getContainerIfPresent(BICExtract(BICLoadLocal l)) = ( case Array.sub(locToPregArray, l) of StackContainer container => ContainerOnStack container | RegisterContainer rc => ContainerInRegs rc | _ => NoContainer ) | getContainerIfPresent(BICExtract(BICLoadArgument a)) = ( case Vector.sub(argumentVector, a) of ArgumentIsRegContainer rc => ContainerInRegs rc | _ => NoContainer ) | getContainerIfPresent _ = NoContainer (* General function for loads and stores. *) fun loadAndStoreWithAddress ({base=bReg1, index, offset}, loadSize, loadShift, isCAddress, loadStoreOffset, loadStoreIndex, code) = let val byteOffset = offset * loadSize (* Get the base register value *) val bCode = code val sCode = bCode (* Get any index register value. *) val (iCode, iReg1Opt) = case index of NONE => if offset < 4096 andalso byteOffset > ~256 then (sCode, NONE) (* We can use this offset. *) else let val iReg = newUReg() in (loadNonAddressConstant{ source=LargeWord.fromInt offset, dest=iReg } :: sCode, SOME iReg) end | SOME iReg1 => let val iCode1 = sCode (* The index is a tagged integer containing the number of units (words, bytes etc). It has to be untagged. If this is a C address it may be negative. *) val iReg2 = newUReg() (* Logical shift if this is a Poly address, arithmetic shift if this is a C address. *) val iCode2 = untagValue{source=iReg1, dest=iReg2, opSize=polyWordOpSize, isSigned=isCAddress } :: iCode1 in if offset = 0 then (iCode2, SOME iReg2) else let (* If there's some constant offset add it to the index. Because it's a byte offset we need to divide it by the scale but it should always be a multiple. N.B. 
In 32-in-64 the index register contains a 32-bit value even when the offset is negative. *) val cReg = newUReg() and iReg3 = newUReg() val offsetAsWord = LargeWord.fromInt offset (* It could be negative if it's a C address. *) val shiftedOffset = (if isCAddress then LargeWord.~>> else LargeWord.>>) (offsetAsWord, loadShift) in (addSubRegister{ base=iReg2, shifted=cReg, dest=SomeReg iReg3, ccRef=NONE, isAdd=true, length=polyWordOpSize, shift=ShiftNone} :: loadNonAddressConstant{ source=shiftedOffset, dest=cReg } :: iCode2, SOME iReg3) end end (* If this is 32in64 get the absolute address. *) val (absBCode, absBReg) = getAbsoluteAddress(iCode, bReg1) (* If this is a C address the "base address" is actually a box containing the address. *) val (effBCode, effBReg) = if isCAddress then let val bReg = newUReg() in (loadWithConstantOffset{ base=absBReg, dest=bReg, byteOffset=0, loadType=Load64 } :: absBCode, bReg) end else (absBCode, absBReg) in case iReg1Opt of SOME iReg => loadStoreIndex(effBReg, iReg, effBCode) | NONE => loadStoreOffset(effBReg, offset, effBCode) end (* Some operations require a single absolute address. These are all ML addresses so the index/offset is always unsigned. *) fun loadAndStoreWithAbsolute (address, loadSize, loadShift, loadStore, code) = let (* Have to add the offset/index register. *) fun loadStoreOffset(bReg, 0, code) = loadStore(bReg, code) | loadStoreOffset(bReg, offset, code) = let val cReg = newUReg() and aReg = newUReg() in loadStore(aReg, addSubRegister{ base=bReg, shifted=cReg, dest=SomeReg aReg, ccRef=NONE, isAdd=true, length=OpSize64, shift=ShiftNone} :: loadNonAddressConstant{ source=LargeWord.fromInt offset, dest=cReg } :: code) end and loadStoreIndex(bReg, iReg, code) = let val aReg = newUReg() (* The index register is a number of words/bytes etc so has to be multiplied when it's added in. *) val indexShift = if loadShift = 0w0 then ShiftNone else ShiftLSL(Word8.fromLarge(Word.toLarge loadShift)) in loadStore(aReg, addSubRegister{ base=bReg, shifted=iReg, dest=SomeReg aReg, ccRef=NONE, isAdd=true, length=OpSize64, shift=indexShift} :: code) end in loadAndStoreWithAddress (address, loadSize, loadShift, false, loadStoreOffset, loadStoreIndex, code) end (* Overflow check. This raises Overflow if the condition is satisfied. Normally this will be that the overflow bit is set but for multiplication it's more complicated. This generates a single block for the function unless there is a handler. As well as reducing the size of the code this also means that overflow checks are generally BO instructions to the end of the code. Since the default branch prediction is not to take forward jumps this should improve prefetching on the normal, non-overflow, path. *) fun checkOverflow (condition, {currHandler=NONE, overflowBlock=ref(SOME overFlowLab), ...}, ccRef) = (* It's already been set and there's no surrounding handler - use this. 
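The repeated test "offset < 4096 andalso byteOffset > ~256" mirrors the two immediate forms of an ARM64 load/store: a scaled unsigned 12-bit offset for small positive offsets and an unscaled signed 9-bit offset for small negative ones; anything outside those ranges needs an index register. A rough stand-alone version of the check, for illustration only:

fun fitsConstantOffset (elemOffset: int, elemSizeBytes: int) : bool =
    let val byteOffset = elemOffset * elemSizeBytes
    in elemOffset < 4096 andalso byteOffset > ~256 end

(* fitsConstantOffset (100, 8) = true; fitsConstantOffset (5000, 8) = false *)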
*) let val noOverflowLab = newLabel() in [ BlockLabel noOverflowLab, BlockFlow(Conditional{ ccRef=ccRef, condition=condition, trueJump=overFlowLab, falseJump=noOverflowLab }) ] end | checkOverflow (condition, {currHandler=NONE, overflowBlock, ...}, ccRef) = let (* *) val overFlowLab = newLabel() and noOverflowLab = newLabel() val packetReg = newPReg() val () = overflowBlock := SOME overFlowLab in [ BlockLabel noOverflowLab, BlockExit(RaiseExceptionPacket{packetReg=packetReg}), loadAddressConstant{source=toMachineWord(Overflow), dest=packetReg}, BlockLabel overFlowLab, BlockFlow(Conditional{ ccRef=ccRef, condition=condition, trueJump=overFlowLab, falseJump=noOverflowLab }) ] end | checkOverflow (condition, {currHandler=SOME h, ...}, ccRef) = let val overFlowLab = newLabel() and noOverflowLab = newLabel() val packetReg = newPReg() in [ BlockLabel noOverflowLab, BlockRaiseAndHandle(RaiseExceptionPacket{packetReg=packetReg}, h), loadAddressConstant{source=toMachineWord(Overflow), dest=packetReg}, BlockLabel overFlowLab, BlockFlow(Conditional{ ccRef=ccRef, condition=condition, trueJump=overFlowLab, falseJump=noOverflowLab }) ] end fun codeToICodeRev(BICNewenv (bindings, exp), context: context as {stackPtr=initialSp, ...}, isTail, destination, tailCode) = let (* Process a list of bindings. We need to accumulate the space used by any containers and reset the stack pointer at the end if necessary. *) fun doBindings([], context, tailCode) = (tailCode, context) | doBindings(BICDeclar{value=BICExtract(BICLoadLocal l), addr, ...} :: decs, context, tailCode) = let (* Giving a new name to an existing entry. This should have been removed at a higher level but it doesn't always seem to be. In particular we must treat this specially if it's a container. *) val original = Array.sub(locToPregArray, l) val () = Array.update(locToPregArray, addr, original) in doBindings(decs, context, tailCode) end | doBindings(BICDeclar{value, addr, ...} :: decs, context, tailCode) = let val (code, dest, _) = codeToICodeRev(value, context, false, AnyReg, tailCode) val () = Array.update(locToPregArray, addr, PregLocation dest) in doBindings(decs, context, code) end | doBindings(BICRecDecs [{lambda, addr, ...}] :: decs, context, tailCode) = (* We shouldn't have single entries in RecDecs but it seems to occur at the moment. *) let val dest = newPReg() val (code, _, _) = codeToICodeRev(BICLambda lambda, context, false, SpecificPReg dest, tailCode) val () = Array.update(locToPregArray, addr, PregLocation dest) in doBindings(decs, context, code) end | doBindings(BICRecDecs recDecs :: decs, context, tailCode) = let val destRegs = map (fn _ => newPReg()) recDecs val flagsValue = if is32in64 then F_closure else 0w0 (* First build the closures as mutable cells containing zeros. Set the entry in the address table to the register containing the address. 
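The two-pass shape used here for BICRecDecs (allocate mutable, zero-filled closures first, then store into them and lock them) is the usual way of tying a recursive knot. A miniature of the same idea using refs, purely illustrative and not the compiler's mechanism:

local
    val evenCell = ref (fn (_: int) => false)
    val oddCell  = ref (fn (_: int) => false)
in
    (* Second pass: fill in the bodies, which may now refer to either cell. *)
    val () = evenCell := (fn n => n = 0 orelse (!oddCell) (n - 1))
    val () = oddCell  := (fn n => n <> 0 andalso (!evenCell) (n - 1))
    fun isEven n = (!evenCell) n
end
(* isEven 10 = true *)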
*) fun makeClosure({lambda={closure, ...}, addr, ...}, dest, tailCode) = let val () = Array.update(locToPregArray, addr, PregLocation dest) val wordsRequired = List.length closure + (if is32in64 then 2 else 1) val absAddr = if is32in64 then newUReg() else dest val zeroReg = newPReg() val allocAndSetZero = loadNonAddressConstant{ source=taggedWord64 0w0, dest=zeroReg} :: allocateWithProfileRev(wordsRequired, Word8.orb(F_mutable, flagsValue), absAddr, tailCode) val (_, clearCode) = List.foldl(fn (_, (n, l)) => (n+1, storeAtWordOffset(zeroReg, n, absAddr, polyWordLoadSize, l))) (0, allocAndSetZero) closure in if is32in64 then absoluteToObjectIndex{ source=absAddr, dest=dest } :: clearCode else clearCode end val allocClosures = ListPair.foldlEq makeClosure tailCode (recDecs, destRegs) fun setClosure({lambda, ...}, dest, l) = let val absAddr = if is32in64 then newUReg() else dest val flagsReg = newUReg() (* Lock the closure by storing the flags byte without the mutable flag. TODO: We could simply use XZ here. *) in storeWithConstantOffset{ base=absAddr, source=flagsReg, byteOffset=flagsByteOffset, loadType=Load8 } :: loadNonAddressConstant{ source=Word8.toLarge flagsValue, dest=flagsReg } :: storeIntoClosure(lambda, absAddr, context, if is32in64 then objectIndexAddressToAbsolute{ source=dest, dest=absAddr } :: l else l) end val setAndLockClosures = ListPair.foldlEq setClosure allocClosures (recDecs, destRegs) in doBindings(decs, context, setAndLockClosures) end | doBindings(BICNullBinding exp :: decs, context, tailCode) = let val (code, _, _) = codeToICodeRev(exp, context, false, NoResult, tailCode) (* And discard result. *) in doBindings(decs, context, code) end | doBindings(BICDecContainer{ addr, size } :: decs, context as {loopArgs, stackPtr, currHandler, overflowBlock}, tailCode) = if size <= smallContainerSize then let val regs = List.tabulate(size, fn _ => newMergeReg()) val () = Array.update(locToPregArray, addr, RegisterContainer regs) in doBindings(decs, context, tailCode) end else let (* Larger container - reserve a portion of stack and zero it. 
*) val containerLoc = newStackLoc size val () = Array.update(locToPregArray, addr, StackContainer{container=containerLoc, stackOffset=stackPtr+size}) val zeroReg = newPReg() in doBindings(decs, {loopArgs=loopArgs, stackPtr=stackPtr+size, currHandler=currHandler, overflowBlock=overflowBlock}, tailCode <::> loadNonAddressConstant{ source=taggedWord64 0w0, dest=zeroReg } <::> pushToStack{copies=size, container=containerLoc, source=zeroReg}) end val (codeBindings, resContext as {stackPtr=finalSp, ...}) = doBindings(bindings, context, tailCode) (* If we have had a container we'll need to reset the stack *) in if initialSp <> finalSp then let val _ = finalSp >= initialSp orelse raise InternalError "codeToICode - stack ptr" val bodyReg = newPReg() and resultReg = asTarget destination val (codeExp, result, haveExited) = codeToICodeRev(exp, resContext, isTail, SpecificPReg bodyReg, codeBindings) val afterAdjustSp = if haveExited then codeExp else moveRegister{source=result, dest=resultReg} :: resetStackPtr{numWords=finalSp-initialSp} :: codeExp in (afterAdjustSp, resultReg, haveExited) end else codeToICodeRev(exp, resContext, isTail, destination, codeBindings) end | codeToICodeRev(BICExtract(BICLoadLocal l), {stackPtr, ...}, _, destination, tailCode) = ( case Array.sub(locToPregArray, l) of NoLocation => raise InternalError "codeToICodeRev - local unset" | PregLocation preg => moveToResult(destination, tailCode, preg) | StackContainer{container, stackOffset} => let val target = asTarget destination in (containerAddress{dest=target, container=container, stackOffset=stackPtr-stackOffset} :: tailCode, target, false) end | RegisterContainer _ => raise InternalError "BICExtract local: reg container" ) | codeToICodeRev(BICExtract(BICLoadArgument a), {stackPtr, ...}, _, destination, tailCode) = ( case Vector.sub(argumentVector, a) of - ArgumentIsInReg{argReg, ...} => (* It was originally in a register. It's now in a preg. *) + ArgumentIsInReg argReg => (* It was originally in a register. It's now in a preg. *) moveToResult(destination, tailCode, argReg) | ArgumentIsOnStack{stackOffset, stackReg} => (* Pushed before call. *) let val target = asTarget destination in (loadStack{wordOffset=stackOffset+stackPtr, container=stackReg, field=0, dest=target} :: tailCode, target, false) end | ArgumentIsRegContainer _ => raise InternalError "BICExtract argument: reg container" ) | codeToICodeRev(BICExtract(BICLoadClosure c), _, _, destination, tailCode) = let (* Add the number of words for the code address. This is 1 in native but 2 in 32-in-64. *) val offset = if is32in64 then c+2 else c+1 in if c >= List.length closure then raise InternalError "BICExtract: closure" else (); wordAddressOffset(destination, closureRegAddr, offset, polyWordLoadSize, tailCode) end | codeToICodeRev(BICExtract BICLoadRecursive, _, _, destination, tailCode) = (* If the closure is empty we must use the constant. We can't guarantee that the caller will actually load the closure register if it knows the closure is empty. *) ( case closure of [] => let val dReg = asTarget destination in (loadAddressConstant{source=closureAsAddress resultClosure, dest=dReg} :: tailCode, dReg, false) end | _ => moveToResult(destination, tailCode, closureRegAddr) ) | codeToICodeRev(BICConstnt(w, _), _, _, destination, tailCode) = let val dReg = asTarget destination val instr = if isShort w then (* When converting to Word64 we do NOT want to use sign-extension. In 32-in-64 signed fixed-precision ints need to have zeros in the top 32 bits. 
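A sketch of what loading a short constant amounts to, assuming the usual Poly/ML 2n+1 tagging: the value is zero-extended to the full word (Word.toLarge rather than toLargeX) before tagging, which is exactly the point the comment above makes for 32-in-64 fixed-precision ints.

fun tagConstant (w: word) : Word64.word =
    Word64.orb (Word64.<< (Word64.fromLarge (Word.toLarge w), 0w1), 0w1)

(* tagConstant 0w5 = 0wxB (= 2*5 + 1), with zeros in the upper part of the word. *)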
*) loadNonAddressConstant{source=taggedWord64(Word64.fromLarge(Word.toLarge(toShort w))), dest=dReg} else loadAddressConstant{source=w, dest=dReg} in (instr :: tailCode, dReg, false) end | codeToICodeRev(BICField{base, offset}, context, _, destination, tailCode) = let val (codeBase, baseReg, _) = codeToICodeRev(base, context, false, AnyReg, tailCode) in wordAddressOffset(destination, baseReg, offset, polyWordLoadSize, codeBase) end | codeToICodeRev(BICCond(test, thenPt, elsePt), context, isTail, NoResult, tailCode) = let (* If we don't want the result but are only evaluating for side-effects we may be able to optimise special cases. This was easier in the forward case but for now we don't bother and leave it to the lower levels. *) val startElse = newLabel() and skipElse = newLabel() val codeTest = codeConditionRev(test, context, false, startElse, tailCode) val (codeThen, _, _) = codeToICodeRev(thenPt, context, isTail, NoResult, codeTest) val (codeElse, _, _) = codeToICodeRev(elsePt, context, isTail, NoResult, BlockLabel startElse :: BlockFlow(Unconditional skipElse) :: codeThen) in returnUnit(NoResult, BlockLabel skipElse :: codeElse, false(*??*)) end | codeToICodeRev(BICCond(test, thenPt, elsePt), context, isTail, destination, tailCode) = let (* Because we may push the result onto the stack we have to create a new preg to hold the result and then copy that to the final result. *) (* If this is a tail each arm will exit separately and neither will return a result. *) val target = asTarget destination val condResult = newMergeReg() val thenTarget = if isTail then newPReg() else condResult val startElse = newLabel() val testCode = codeConditionRev(test, context, false, startElse, tailCode) (* Put the result in the target register. *) val (thenCode, _, thenExited) = codeToICodeRev(thenPt, context, isTail, SpecificPReg thenTarget, testCode) (* Add a jump round the else-part except that if this is a tail we return. The then-part could have exited e.g. with a raise or a loop. *) val (exitThen, thenLabel, elseTarget) = if thenExited then (thenCode, [], target (* Can use original target. *)) else if isTail then (returnInstruction(context, thenTarget, thenCode), [], newPReg()) else let val skipElse = newLabel() in (BlockFlow(Unconditional skipElse) :: thenCode, [moveRegister{source=condResult, dest=target}, BlockLabel skipElse], condResult) end val (elseCode, _, elseExited) = codeToICodeRev(elsePt, context, isTail, SpecificPReg elseTarget, BlockLabel startElse :: exitThen) (* Add a return to the else-part if necessary so we will always exit on a tail. *) val exitElse = if isTail andalso not elseExited then returnInstruction(context, elseTarget, elseCode) else elseCode in (thenLabel @ exitElse, target, isTail orelse thenExited andalso elseExited) end | codeToICodeRev(BICUnary instr, context, isTail, destination, tailCode) = codeToICodeUnaryRev(instr, context, isTail, destination, tailCode) | codeToICodeRev(BICBinary instr, context, isTail, destination, tailCode) = codeToICodeBinaryRev(instr, context, isTail, destination, tailCode) | codeToICodeRev(BICTagTest{test, tag=tagValue, ...}, context, isTail, destination, tailCode) = (* Check the "tag" word of a union (datatype). N.B. Not the same as testing the tag bit of a word. Just generate it as a general word comparison. The optimiser will sort out whether the tag value can be an immediate. 
*) codeToICodeRev(BICBinary{oper=WordComparison{test=TestEqual, isSigned=false}, arg1=test, arg2=BICConstnt(toMachineWord tagValue, [])}, context, isTail, destination, tailCode) | codeToICodeRev(BICTuple fields, context, _, destination, tailCode) = let val target = asTarget destination (* The allocator sets the register to the absolute address. It has to be converted to an object pointer in 32-in-64. *) val absAddr = if is32in64 then newUReg() else target fun loadFields([], n, tlCode) = allocateWithProfileRev(n, 0w0, absAddr, tlCode) | loadFields((f as BICConstnt _) :: rest, n, tlCode) = let (* Unlike the X86 we still need to load a constant into a register in order to store it in the new tuple. However, it's better to leave that until after the allocation and move it then. That way we can use the same register for different constants if we have a very large tuple. *) val restAndAlloc = loadFields(rest, n+1, tlCode) val (code1, source, _) = codeToICodeRev(f, context, false, AnyReg, restAndAlloc) in storeAtWordOffset(source, n, absAddr, polyWordLoadSize, code1) end | loadFields(f :: rest, n, tlCode) = let val (code1, source, _) = codeToICodeRev(f, context, false, AnyReg, tlCode) val restAndAlloc = loadFields(rest, n+1, code1) in storeAtWordOffset(source, n, absAddr, polyWordLoadSize, restAndAlloc) end val allocAndStore = loadFields(fields, 0, tailCode) val code = if is32in64 then absoluteToObjectIndex{source=absAddr, dest=target} :: allocAndStore else allocAndStore in (code, target, false) end | codeToICodeRev(BICRaise exc, context as { currHandler, ...}, _, destination, tailCode) = let val (code, packetReg, _) = codeToICodeRev(exc, context, false, AnyReg, tailCode) val raiseCode = RaiseExceptionPacket{packetReg=packetReg} val block = case currHandler of NONE => BlockExit raiseCode | SOME h => BlockRaiseAndHandle(raiseCode, h) in returnUnit(destination, block :: code, true (* Always exits *)) end - | codeToICodeRev(BICEval{function, argList, ...}, context as { currHandler, ...}, isTail, destination, tailCode) = + | codeToICodeRev(BICEval{function, argList, resultType, ...}, context as { currHandler, ...}, isTail, destination, tailCode) = let val target = asTarget destination (* Create pregs for the closure and each argument. *) val clPReg = newPReg() (* If we have a constant closure we can go directly to the entry point. If the closure is a single word we don't need to load the closure register. *) val (functionCode, closureEntry, callKind) = case function of BICConstnt(addr, _) => let val addrAsAddr = toAddress addr (* If this is a closure we're still compiling we can't get the code address. However if this is directly recursive we can use the recursive convention. *) in if wordEq(closureAsAddress resultClosure, addr) then (tailCode, [], Recursive) else if flags addrAsAddr <> Address.F_words andalso flags addrAsAddr <> Address.F_closure then (loadAddressConstant{source=addr, dest=clPReg} :: tailCode, [(ArgInReg clPReg, X8)], FullCall) else if is32in64 then (* The code address is a 64-bit value so we have to load it at run-time. The X86 version passes the closure address here and generates a relative CALL/JMP. The actual offset is computed by the RTS. For the moment just use a full call. *) (loadAddressConstant{source=addr, dest=clPReg} :: tailCode, [(ArgInReg clPReg, X8)], FullCall) else (* Native 64-bits. 
*) let val addrLength = length addrAsAddr val _ = addrLength >= 0w1 orelse raise InternalError "BICEval address" val codeAddr = loadWord(addrAsAddr, 0w0) val _ = isCode (toAddress codeAddr) orelse raise InternalError "BICEval address not code" in if addrLength = 0w1 then (tailCode, [], ConstantCode codeAddr) else (loadAddressConstant{source=addr, dest=clPReg} :: tailCode, [(ArgInReg clPReg, X8)], ConstantCode codeAddr) end end | BICExtract BICLoadRecursive => ( (* If the closure is empty we don't need to load X8 *) case closure of [] => (tailCode, [], Recursive) | _ => (moveRegister {source=closureRegAddr, dest=clPReg} :: tailCode, [(ArgInReg clPReg, X8)], Recursive) ) | function => (* General case. *) (#1 (codeToICodeRev(function, context, false, SpecificPReg clPReg, tailCode)), [(ArgInReg clPReg, X8)], FullCall) local (* Load the first arguments into registers and the rest to the stack. *) - fun loadArgs ([], _, tailCode) = (tailCode, [], []) + fun loadArgs ([], _, _, tailCode) = (tailCode, [], [], []) + + | loadArgs ((arg, DoubleFloatType) :: args, gRegs, fpReg :: fpRegs, tailCode) = + let (* Floating point register argument. *) + val (c, r, _) = codeToICodeRev(arg, context, false, AnyReg, tailCode) + val r1 = newUReg() + val (code, regArgs, fpRegArgs, stackArgs) = + loadArgs(args, gRegs, fpRegs, c <::> unboxTagFloat{floatSize=Double64, source=r, dest=r1}) + in + (code, regArgs, (r1, fpReg: vReg) :: fpRegArgs, stackArgs) + end - | loadArgs ((arg, _) :: args, gReg::gRegs, tailCode) = + | loadArgs ((arg, SingleFloatType) :: args, gRegs, fpReg :: fpRegs, tailCode) = + let (* Floating point register argument. *) + val (c, r, _) = codeToICodeRev(arg, context, false, AnyReg, tailCode) + val r1 = newUReg() + val (code, regArgs, fpRegArgs, stackArgs) = + loadArgs(args, gRegs, fpRegs, c <::> unboxTagFloat{floatSize=Float32, source=r, dest=r1}) + in + (code, regArgs, (r1, fpReg: vReg) :: fpRegArgs, stackArgs) + end + + | loadArgs ((arg, _) :: args, gReg::gRegs, fpRegs, tailCode) = let (* General register argument. *) val (c, r, _) = codeToICodeRev(arg, context, false, AnyReg, tailCode) - val (code, regArgs, stackArgs) = loadArgs(args, gRegs, c) + val (code, regArgs, fpRegArgs, stackArgs) = loadArgs(args, gRegs, fpRegs, c) in - (code, (ArgInReg r, gReg) :: regArgs, stackArgs) + (code, (ArgInReg r, gReg) :: regArgs, fpRegArgs, stackArgs) end - | loadArgs ((arg, _) :: args, [], tailCode) = + | loadArgs ((arg, _) :: args, [], fpRegs, tailCode) = let (* Stack argument. *) val (c, r, _) = codeToICodeRev(arg, context, false, AnyReg, tailCode) - val (code, regArgs, stackArgs) = loadArgs(args, [], c) + val (code, regArgs, fpRegArgs, stackArgs) = loadArgs(args, [], fpRegs, c) in - (code, regArgs, ArgInReg r :: stackArgs) + (code, regArgs, fpRegArgs, ArgInReg r :: stackArgs) end fun isSmallContainer(ContainerType s) = s <= smallContainerSize | isSmallContainer _ = false in - val (codeArgs, regArgs, stackArgs) = - loadArgs(List.filter(not o isSmallContainer o #2) argList, generalArgRegs, functionCode) + val (codeArgs, regArgs, fpRegArgs, stackArgs) = + loadArgs(List.filter(not o isSmallContainer o #2) argList, generalArgRegs, floatingPtArgRegs, functionCode) end (* If this is at the end of the function and the result types are the same we can use a tail-recursive call. *) - val tailCall = isTail (*andalso resultType = fnResultType*) + val tailCall = isTail andalso resultType = fnResultType val callCode = if tailCall then let val {stackPtr, ...} = context (* The number of arguments currently on the stack. 
*) val currentStackArgCount = currentStackArgs val newStackArgCount = List.length stackArgs (* The offset of the first argument. Offsets can be negative. *) val stackOffset = stackPtr fun makeStackArgs([], _) = [] | makeStackArgs(arg::args, offset) = {src=arg, stack=offset} :: makeStackArgs(args, offset-1) val stackArgs = makeStackArgs(stackArgs, currentStackArgCount-1) (* The stack adjustment needed to compensate for any items that have been pushed and the differences in the number of arguments. May be positive or negative. *) val stackAdjust = currentStackArgCount - newStackArgCount (* Add an entry for the return address to the register arguments. *) in BlockExit(TailRecursiveCall{regArgs=(ArgInReg returnAddrReg, X30) :: closureEntry @ regArgs, stackArgs=stackArgs, - stackAdjust = stackAdjust, currStackSize=stackOffset, callKind=callKind}) :: + fpRegArgs=fpRegArgs, stackAdjust = stackAdjust, currStackSize=stackOffset, callKind=callKind}) :: codeArgs end else let (* See if there is a container argument. *) val containerArg = List.find(fn (_, ContainerType _) => true | _ => false) argList val containerValue = case containerArg of SOME(argVal, _) => getContainerIfPresent argVal | NONE => NoContainer (* When a container is passed as an argument we put the address into a register. Normally the container will be referenced after the call in order to extract the values but if it's discarded we need to make sure it will continue to be referenced at least as far as the call. This isn't a problem for the X86 code-generator since container addresses are a form of the "argument" datatype. *) val stackContainers = case containerValue of ContainerOnStack{container, ...} => [container] | _ => [] (* Get the results. If we're returning the result through a container the target isn't used so we return unit. 
*) - val (results, setTarget) = - case containerValue of - ContainerInRegs regs => - (ListPair.zip(regs, generalArgRegs), [loadNonAddressConstant{source=taggedWord64 0w0, dest=target}]) - | ContainerOnStack _ => - ([], [loadNonAddressConstant{source=taggedWord64 0w0, dest=target}]) - | NoContainer => ([(target, X0)], []) + val (results, fpResults, setTarget) = + case (containerValue, resultType) of + (ContainerInRegs regs, GeneralType) => + (ListPair.zip(regs, generalArgRegs), [], [loadNonAddressConstant{source=taggedWord64 0w0, dest=target}]) + | (ContainerOnStack _, GeneralType) => + ([], [], [loadNonAddressConstant{source=taggedWord64 0w0, dest=target}]) + | (NoContainer, GeneralType) => ([(target, X0)], [], []) + | (NoContainer, DoubleFloatType) => + let + val dReg = newUReg() + in + ([], [(dReg, V0)], [boxTagFloat{ floatSize=Double64, source=dReg, dest=target, saveRegs=[]}]) + end + | (NoContainer, SingleFloatType) => + let + val dReg = newUReg() + in + ([], [(dReg, V0)], [boxTagFloat{ floatSize=Float32, source=dReg, dest=target, saveRegs=[]}]) + end + | _ => raise InternalError "codeToICodeRev: BICEval result type" val call = FunctionCall{regArgs=closureEntry @ regArgs, stackArgs=stackArgs, dests=results, - callKind=callKind, saveRegs=[], containers=stackContainers} + fpRegArgs=fpRegArgs, fpDests=fpResults, callKind=callKind, saveRegs=[], containers=stackContainers} val callBlock = case currHandler of NONE => BlockSimple call :: codeArgs | SOME h => BlockOptionalHandle{call=call, handler=h, label=newLabel()} :: codeArgs in callBlock <@> setTarget end in (callCode, target, tailCall (* We've exited if this was a tail jump *)) end | codeToICodeRev(BICNullary{oper=BuiltIns.GetCurrentThreadId}, _, _, destination, tailCode) = (* Get the ID of the current thread. *) let val target = asTarget destination in (getThreadId{dest=target} :: tailCode, target, false) end | codeToICodeRev(BICNullary{oper=BuiltIns.CPUPause}, _, _, destination, tailCode) = (* This is now done in the RTS call code. *) returnUnit(destination, tailCode <::> cpuYield, false) | codeToICodeRev(BICNullary {oper=CreateMutex}, _, _, destination, tailCode) = let (* Allocate memory for a mutex. Use a native word as a mutable, weak, no-overwrite, byte cell which is the same as a volatileRef. This ensures that it will always be cleared when it is loaded even if it was locked when it was saved. *) val target = asTarget destination val flags = Word8.orb(F_mutable, Word8.orb(F_weak, Word8.orb(F_noOverwrite, F_bytes))) (* 0wx69 *) val absAddr = if is32in64 then newUReg() else target val zeroReg = newUReg() val allocAndStore = storeWithConstantOffset{ source=zeroReg, base=absAddr, byteOffset=0, loadType=Load64 } :: loadNonAddressConstant{source=0w0, dest=zeroReg} :: allocateWithProfileRev(if is32in64 then 2 else 1, flags, absAddr, tailCode) val code = if is32in64 then absoluteToObjectIndex{source=absAddr, dest=target} :: allocAndStore else allocAndStore in (code, target, false) end | codeToICodeRev(BICArbitrary { oper=ArithMult, longCall, ... }, context, isTail, destination, tailCode) = (* Just call the long function to do this. Overflow detection makes this too complicated. *) codeToICodeRev(longCall, context, isTail, destination, tailCode) | codeToICodeRev(BICArbitrary { oper, shortCond, arg1, arg2, longCall }, context, _, destination, tailCode) = let val startLong = newLabel() and resultLabel = newLabel() val condResult = newMergeReg() (* Test to see if the arguments are short and go straight to the long case if not. 
*) val testCode = codeConditionRev(shortCond, context, false, startLong, tailCode) (* Do the short case *) val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, testCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) (* We need to subtract the tag from one of the arguments and then do the addition. The optimiser will do the subtraction at compile time if we subtract from a constant so if this is and Add we try to put the constant in the second arg. *) val (firstReg, secondReg) = case (arg1, oper) of (BICConstnt _, ArithAdd) => (aReg2, aReg1) | _ => (aReg1, aReg2) (* Generate code for the short case. Put the result in the merge register. Jump to the result if there's no overflow and to the long case if there is. *) val codeShort = case oper of ArithAdd => let val uReg = newUReg() and chkOverflow = newCCRef() in BlockFlow(Conditional{ ccRef=chkOverflow, condition=CondOverflow, trueJump=startLong, falseJump=resultLabel }) :: addSubRegister{base=firstReg, shifted=uReg, dest=SomeReg condResult, length=polyWordOpSize, ccRef=SOME chkOverflow, isAdd=true, shift=ShiftNone} :: addSubImmediate{dest=SomeReg uReg, source=secondReg, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false}:: arg2Code end | ArithSub => let val uReg = newUReg() and chkOverflow = newCCRef() in BlockFlow(Conditional{ ccRef=chkOverflow, condition=CondOverflow, trueJump=startLong, falseJump=resultLabel }) :: addSubRegister{base=firstReg, shifted=uReg, dest=SomeReg condResult, length=polyWordOpSize, ccRef=SOME chkOverflow, isAdd=false, shift=ShiftNone} :: addSubImmediate{dest=SomeReg uReg, source=secondReg, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false}:: arg2Code end | _ => raise InternalError "BICArbitrary: unimplemented operation" (* Code for the long case. Put the result into the merge register. *) (* TODO: This could use a tail call if this is at the end of the function. *) val (codeLong, _, _) = codeToICodeRev(longCall, context, false, SpecificPReg condResult, BlockLabel startLong :: codeShort) val target = asTarget destination (* Copy the merge register into the result. *) val finalCode = moveRegister{source=condResult, dest=target} :: BlockLabel resultLabel :: codeLong in (finalCode, target, false) end | codeToICodeRev(BICLambda(lambda as { closure = [], ...}), _, _, destination, tailCode) = (* Empty closure - create a constant closure for any recursive calls. *) let val closure = makeConstantClosure() val () = codeFunctionToArm64(lambda, debugSwitches, closure) val dReg = asTarget destination (* Return the closure itself as the value. *) in (BlockSimple(LoadAddressConstant{source=closureAsAddress closure, dest=dReg}) :: tailCode, dReg, false) end | codeToICodeRev(BICLambda(lambda as { closure, ...}), context, _, destination, tailCode) = (* Non-empty closure. Ignore stack closure option at the moment. *) let val wordsRequired = List.length closure + (if is32in64 then 2 else 1) val target = asTarget destination val absAddr = if is32in64 then newUReg() else target (* The values we're storing are all either constants or local/closure variables so we can allocate the memory and then store into it. 
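The short path coded above relies on a simple identity of the 2n+1 tagging (stated here as an assumption about the representation): removing the tag from only one operand makes the machine addition produce a correctly tagged result, and the same trick works for subtraction, so a single ADDS/SUBS with overflow check suffices.

fun tag n = 2 * n + 1
fun taggedAdd (ta, tb) = ta + (tb - 1)   (* tag a + (tag b - 1) = tag (a + b) *)
fun taggedSub (ta, tb) = ta - (tb - 1)   (* tag a - (tag b - 1) = tag (a - b) *)

(* taggedAdd (tag 3, tag 4) = tag 7; taggedSub (tag 3, tag 4) = tag ~1 *)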
*) val allocCode = allocateWithProfileRev(wordsRequired, if is32in64 then F_closure else 0w0, absAddr, tailCode) val storeCode = storeIntoClosure(lambda, absAddr, context, allocCode) val finalCode = if is32in64 then BlockSimple(AbsoluteToObjectIndex{source=absAddr, dest=target}) :: storeCode else storeCode in (finalCode, target, false) end | codeToICodeRev(BICCase { cases, test, default, isExhaustive, firstIndex}, context, isTail, destination, tailCode) = let (* We have to create a new preg for the result in case we need to push it to the stack. *) val targetReg = newMergeReg() local val (testCode, initialTestReg, _) = codeToICodeRev(test, context, false, AnyReg, tailCode) (* Subtract the minimum even if it is zero to remove the tag. This leaves us with a shifted but untagged value. Don't check for overflow. Instead allow large values to wrap around and check later. *) val cReg1 = newUReg() val subValue = taggedWord64(Word64.fromLarge(Word.toLargeX firstIndex)) in val testReg = newUReg() val testCode = addSubRegister{ base=initialTestReg, shifted=cReg1, dest=SomeReg testReg, ccRef=NONE, isAdd=false, length=polyWordOpSize, shift=ShiftNone} :: loadNonAddressConstant{ source=subValue, dest=cReg1 } :: testCode end val (rangeCheck, extraDefaults) = if isExhaustive then (testCode, []) else let (* Check the value is within the number of cases, *2 because this is shifted. *) val cReg2 = newUReg() and ccRef1 = newCCRef() val nCases = List.length cases val continueLab = newLabel() and defaultLab1 = newLabel() val rangeCheck = BlockLabel continueLab :: BlockFlow(Conditional{ccRef=ccRef1, condition=CondCarrySet, trueJump=defaultLab1, falseJump=continueLab}) :: addSubRegister{base=testReg, shifted=cReg2, dest=ZeroReg, ccRef=SOME ccRef1, isAdd=false, length=OpSize64, shift=ShiftNone} :: loadNonAddressConstant{ source=Word64.fromInt nCases * 0w2, dest=cReg2 } :: testCode in (rangeCheck, [defaultLab1]) end (* Make a label for each item in the list. *) val codeLabels = map (fn _ => newLabel()) cases (* Create an exit label in case it's needed. *) val labelForExit = newLabel() (* Generate the code for each of the cases and the default. We need to put an unconditional branch after each to skip the other cases. *) fun codeCases (SOME c :: otherCases, startLabel :: otherLabels, tailCode) = let val caseTarget = if isTail then newPReg() else targetReg (* Put in the case with a jump to the end of the sequence. *) val (codeThisCase, _, caseExited) = codeToICodeRev(c, context, isTail, SpecificPReg caseTarget, BlockLabel startLabel :: tailCode) val exitThisCase = if caseExited then codeThisCase else if isTail then returnInstruction(context, caseTarget, codeThisCase) else BlockFlow(Unconditional labelForExit) :: codeThisCase in codeCases(otherCases, otherLabels, exitThisCase) end | codeCases(NONE :: otherCases, _ :: otherLabels, tailCode) = codeCases(otherCases, otherLabels, tailCode) | codeCases ([], [], tailCode) = let (* We need to add labels for all the gaps we filled and also for a "default" label for the indexed-case instruction itself as well as any range checks. 
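The range check for BICCase above works on the still-shifted (untagged but doubled) value. Assuming the 2n+1 tagging, subtracting the tagged first index from the tagged test value leaves 2*(test - firstIndex), and because the subtraction wraps, one unsigned comparison against 2*nCases also rejects values below firstIndex. A small model of the test, using the default word type for brevity:

fun tag n : word = Word.fromInt (2 * n + 1)
fun caseInRange (test, firstIndex, nCases) =
    tag test - tag firstIndex < Word.fromInt (2 * nCases)

(* caseInRange (5, 3, 4) = true; caseInRange (2, 3, 4) = false (the difference wraps to a huge value) *)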
*) fun addDefault (startLabel, NONE, l) = BlockLabel startLabel :: l | addDefault (_, SOME _, l) = l fun asForward l = BlockLabel l val dLabs = map asForward extraDefaults @ tailCode val defLabels = ListPair.foldlEq addDefault dLabs (codeLabels, cases) val defaultTarget = if isTail then newPReg() else targetReg val (defaultCode, _, defaultExited) = codeToICodeRev(default, context, isTail, SpecificPReg defaultTarget, defLabels) in (* Put in the default. Because this is the last we don't need to jump round it. However if this is a tail and we haven't exited we put in a return. That way the case will always have exited if this is a tail. *) if isTail andalso not defaultExited then returnInstruction(context, defaultTarget, defaultCode) else defaultCode end | codeCases _ = raise InternalError "codeCases: mismatch" val codedCases = codeCases(cases, codeLabels, BlockFlow(IndexedBr codeLabels) :: BlockSimple(IndexedCaseOperation{testReg=testReg}) :: rangeCheck) (* We can now copy to the target. If we need to push the result this load will be converted into a push. *) val target = asTarget destination val copyToTarget = if isTail then codedCases else moveRegister{source=targetReg, dest=target} :: BlockLabel labelForExit :: codedCases in (copyToTarget, target, isTail (* We have always exited on a tail. *)) end | codeToICodeRev(BICBeginLoop {loop, arguments}, context as { stackPtr, currHandler, overflowBlock, ...}, isTail, destination, tailCode) = let val target = asTarget destination fun codeArgs ([], tailCode) = ([], tailCode) | codeArgs (({value, addr}, _) :: rest, tailCode) = let val pr = newPReg() val () = Array.update(locToPregArray, addr, PregLocation pr) val (code, _, _) = codeToICodeRev(value, context, false, SpecificPReg pr, tailCode) val (pregs, othercode) = codeArgs(rest, code) in (pr::pregs, othercode) end val (loopRegs, argCode) = codeArgs(arguments, tailCode) val loopLabel = newLabel() val (loopBody, _, loopExited) = codeToICodeRev(loop, {loopArgs=SOME (loopRegs, loopLabel, stackPtr), stackPtr=stackPtr, currHandler=currHandler, overflowBlock=overflowBlock }, isTail, SpecificPReg target, BlockLabel loopLabel :: BlockSimple BeginLoop :: argCode) in (loopBody, target, loopExited) end | codeToICodeRev(BICLoop args, context as {loopArgs=SOME (loopRegs, loopLabel, loopSp), stackPtr, currHandler, ...}, _, destination, tailCode) = let val target = asTarget destination (* Registers to receive the evaluated arguments. We can't put the values into the loop variables yet because the values could depend on the current values of the loop variables. *) val argPRegs = map(fn _ => newPReg()) args val codeArgs = ListPair.foldlEq(fn ((arg, _), pr, l) => #1 (codeToICodeRev(arg, context, false, SpecificPReg pr, l))) tailCode (args, argPRegs) val jumpArgs = ListPair.mapEq(fn (s, l) => {src=ArgInReg s, dst=l}) (argPRegs, loopRegs) (* If we've allocated a container in the loop we have to remove it before jumping back. *) val stackReset = if loopSp = stackPtr then codeArgs else resetStackPtr{numWords=stackPtr-loopSp} :: codeArgs val jumpLoop = JumpLoop{regArgs=jumpArgs, stackArgs=[], checkInterrupt=SOME[]} (* "checkInterrupt" could result in a Interrupt exception so we treat this like a function call. 
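Evaluating the new loop values into fresh registers before the JumpLoop is what makes the update behave as a simultaneous assignment; moving them one at a time could read a loop variable that has already been overwritten. A tiny illustration of the ordering point with refs (not the compiler's mechanism):

local
    val x = ref 1 and y = ref 2
in
    val () =
        let val newX = !y and newY = !x    (* evaluate everything first... *)
        in x := newX; y := newY end        (* ...then update the "loop variables" *)
    val swapped = (!x, !y)
    (* swapped = (2, 1); assigning x before reading it would have given (2, 2) *)
end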
*) val code = case currHandler of NONE => BlockFlow(Unconditional loopLabel) :: BlockSimple jumpLoop :: stackReset | SOME h => BlockOptionalHandle{call=jumpLoop, handler=h, label=loopLabel} :: stackReset in (code, target, true) end | codeToICodeRev(BICLoop _, {loopArgs=NONE, ...}, _, _, _) = raise InternalError "BICLoop without BICBeginLoop" (* Copy the source tuple into the container. There are important special cases for both the source tuple and the container. If the source tuple is a BICTuple we have the fields and can store them without creating a tuple on the heap. If the destination is a local container we can store directly into the stack. *) | codeToICodeRev(BICSetContainer{container, tuple, filter}, context as {stackPtr, ...}, _, destination, tailCode) = let local fun createStore containerReg (source, destWord, tail) = storeAtWordOffset(source, destWord, containerReg, Load64, tail) in val (codeContainer, storeInstr) = case getContainerIfPresent container of ContainerOnStack{container, stackOffset} => let fun store(source, destWord, tail) = storeToStack{source=source, container=container, field=destWord, stackOffset=stackPtr-stackOffset+destWord} :: tail in (tailCode, store) end | ContainerInRegs regs => let fun copy(source, destWord, tail) = tail <::> moveRegister{source=source, dest=List.nth(regs, destWord)} in (tailCode, copy) end | NoContainer => let val containerTarget = newPReg() val (codeContainer, _, _) = codeToICodeRev(container, context, false, SpecificPReg containerTarget, tailCode) in (codeContainer, createStore containerTarget) end end val filterLength = BoolVector.length filter val code = case tuple of BICTuple cl => let (* In theory it's possible that the tuple could contain fields that are not used but nevertheless need to be evaluated for their side-effects. Create all the fields and push to the stack. *) fun codeField(arg, (regs, tailCode)) = let val (c, r, _) = codeToICodeRev(arg, context, false, AnyReg, tailCode) in (r :: regs, c) end val (pregsRev, codeFields) = List.foldl codeField ([], codeContainer) cl val pregs = List.rev pregsRev fun copyField(srcReg, (sourceWord, destWord, tailCode)) = if sourceWord < filterLength andalso BoolVector.sub(filter, sourceWord) then (sourceWord+1, destWord+1, storeInstr(srcReg, destWord, tailCode)) else (sourceWord+1, destWord, tailCode) val (_, _, resultCode) = List.foldl copyField (0, 0, codeFields) pregs in resultCode end | tuple => let (* Copy a heap tuple. It is possible that this is another container in which case we must load the fields directly. We mustn't load its address and then copy because loading the address would be the last reference and might cause the container to be reused prematurely. ??? Is that an old comment ?? 
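The filter copy above, in list form: source positions advance over every field, destination positions advance only over the fields that are kept. The real code indexes a BoolVector; an ordinary bool vector keeps this sketch self-contained.

fun filterCopy (fields, keep) =
    let
        fun copy ([], _, _, acc) = List.rev acc
          | copy (f :: rest, src, dst, acc) =
                if src < Vector.length keep andalso Vector.sub (keep, src)
                then copy (rest, src + 1, dst + 1, (dst, f) :: acc)
                else copy (rest, src + 1, dst, acc)
    in copy (fields, 0, 0, []) end

(* filterCopy ([10, 20, 30], Vector.fromList [true, false, true]) = [(0, 10), (1, 30)] *)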
*) val (codeTuple, loadField) = case getContainerIfPresent tuple of ContainerOnStack {container, stackOffset} => let fun getAddr(destReg, sourceWord, tail) = loadStack{dest=destReg, wordOffset=stackPtr-stackOffset+sourceWord, container=container, field=sourceWord} :: tail in (codeContainer, getAddr) end | ContainerInRegs regs => let fun copyReg(destReg, sourceWord, tail) = tail <::> moveRegister{dest=destReg, source=List.nth(regs, sourceWord)} in (codeContainer, copyReg) end | NoContainer => let val (codeTuple, tupleTarget, _) = codeToICodeRev(tuple, context, false, AnyReg, codeContainer) fun loadField(destReg: preg, sourceWord: int, tail): blockStruct list = let val (code, _, _) = wordAddressOffset(SpecificPReg destReg, tupleTarget, sourceWord, polyWordLoadSize, tail) in code end in (codeTuple, loadField) end fun copyContainer(sourceWord, destWord, tailCode) = if sourceWord = filterLength then tailCode else if BoolVector.sub(filter, sourceWord) then let val loadReg = newPReg() val code = storeInstr(loadReg, destWord, loadField(loadReg, sourceWord, tailCode)) in copyContainer(sourceWord+1, destWord+1, code) end else copyContainer(sourceWord+1, destWord, tailCode) in copyContainer(0, 0, codeTuple) end in returnUnit(destination, code, false) end | codeToICodeRev(BICLoadContainer{base, offset}, context as {stackPtr, ...}, _, destination, tailCode) = ( case getContainerIfPresent base of ContainerOnStack {container, stackOffset} => let (* If this is a local container we extract the field. *) val target = asTarget destination val finalOffset = stackPtr-stackOffset+offset val _ = finalOffset >= 0 orelse raise InternalError "offset" in (BlockSimple(LoadStack{wordOffset=finalOffset, container=container, field=offset, dest=target}) :: tailCode, target, false) end | NoContainer => let val (codeBase, baseEntry, _) = codeToICodeRev(base, context, false, AnyReg, tailCode) in wordAddressOffset(destination, baseEntry, offset, Load64, codeBase) end | ContainerInRegs regs => let (* Always copy this into a new register because the source will be a merge reg. *) val target = asTarget destination in (moveRegister{source=List.nth(regs, offset), dest=target} :: tailCode, target, false) end ) | codeToICodeRev(BICLoadOperation{ kind, address}, context, _, destination, tailCode) = codeLoadOperation(kind, address, context, asTarget destination, tailCode) | codeToICodeRev(BICStoreOperation{ kind, address, value}, context, _, destination, tailCode) = codeStoreOperation(kind, address, value, context, destination, tailCode) | codeToICodeRev(BICBlockOperation{ kind=BlockOpMove{isByteMove}, sourceLeft, destRight, length }, context, _, destination, tailCode) = (* Assume these are copying immutable data i.e. vector to vector and string to string. The simplifier now assumes that when optimising short constant moves e.g. concatenating with a constant string. *) let (* Move bytes or words from the source to the destination. Need to get the start addresses and length into new registers because they will be modified. *) val (leftAddr, codeLft) = addressToPregAddress(sourceLeft, context, tailCode) val (rightAddr, codeRt) = addressToPregAddress(destRight, context, codeLft) val (codeLength, lengthReg, _) = codeToICodeRev(length, context, false, AnyReg, codeRt) val loadOp = if isByteMove then Load8 else if is32in64 then Load32 else Load64 (* This threads the calls through two calls to loadAndStoreWithAbsolute to compute the addresses. 
*) fun getDestAndMove(ltReg, tailCode) = let fun doMove (rtReg, code) = let val lengthReg2 = newUReg() and ltReg2 = newUReg() and rtReg2 = newUReg() in blockMove{ srcAddr=ltReg2, destAddr=rtReg2, length=lengthReg2, isByteMove=isByteMove } :: moveRegister{dest=rtReg2, source=rtReg} :: moveRegister{dest=ltReg2, source=ltReg} :: untagValue{dest=lengthReg2, source=lengthReg, isSigned=false, opSize=polyWordOpSize} :: code end in loadAndStoreWithAbsolute (rightAddr, opWordSize loadOp, loadShift loadOp, doMove, tailCode) end in returnUnit(destination, loadAndStoreWithAbsolute (leftAddr, opWordSize loadOp, loadShift loadOp, getDestAndMove, codeLength), false) end | codeToICodeRev(BICBlockOperation{ kind=BlockOpEqualByte, sourceLeft, destRight, length }, context, _, destination, tailCode) = let val target = asTarget destination val ccRef = newCCRef() (* Compare bytes for equality. Need to get the start addresses and length into new registers because they will be modified. *) val (leftAddr, codeLft) = addressToPregAddress(sourceLeft, context, tailCode) val (rightAddr, codeRt) = addressToPregAddress(destRight, context, codeLft) val (codeLength, lengthReg, _) = codeToICodeRev(length, context, false, AnyReg, codeRt) (* This threads the calls through two calls to loadAndStoreWithAbsolute to compute the addresses. *) fun getRightAndCompare(ltReg, tailCode) = let fun doComparison (rtReg, code) = let val lengthReg2 = newUReg() and ltReg2 = newUReg() and rtReg2 = newUReg() in compareByteVectors{ vec1Addr=ltReg2, vec2Addr=rtReg2, length=lengthReg2, ccRef=ccRef } :: moveRegister{dest=rtReg2, source=rtReg} :: moveRegister{dest=ltReg2, source=ltReg} :: untagValue{dest=lengthReg2, source=lengthReg, isSigned=false, opSize=polyWordOpSize} :: code end in loadAndStoreWithAbsolute (rightAddr, opWordSize Load8, loadShift Load8, doComparison, tailCode) end val testCode = loadAndStoreWithAbsolute (leftAddr, opWordSize Load8, loadShift Load8, getRightAndCompare, codeLength) in (makeBoolResultRev(CondEqual, ccRef, target, testCode), target, false) end | codeToICodeRev(BICBlockOperation{ kind=BlockOpCompareByte, sourceLeft, destRight, length }, context, _, destination, tailCode) = let val target = asTarget destination val ccRef = newCCRef() (* Similar to OpEqualByte except it returns -1, 0, +1 depending on the condition code. *) (* Compare bytes for equality. Need to get the start addresses and length into new registers because they will be modified. *) val (leftAddr, codeLft) = addressToPregAddress(sourceLeft, context, tailCode) val (rightAddr, codeRt) = addressToPregAddress(destRight, context, codeLft) val (codeLength, lengthReg, _) = codeToICodeRev(length, context, false, AnyReg, codeRt) (* This threads the calls through two calls to loadAndStoreWithAbsolute to compute the addresses. *) fun getRightAndCompare(ltReg, tailCode) = let fun doComparison (rtReg, code) = let val lengthReg2 = newUReg() and ltReg2 = newUReg() and rtReg2 = newUReg() val exitLab = newLabel() and labGreater = newLabel() and labNotGreater = newLabel() and labLess = newLabel() and labNotLess = newLabel() val mergeResult = newMergeReg() val taggedMinus1 = if is32in64 then 0wxffffffff else 0wxffffffffffffffff in (* Compare the words then a series of comparisons to set the result. TODO; The old code-generator makes the "equal" exit of compareByteVectors jump directly to code to set the result to zero. 
It then uses loadNonAddress(X0, Word64.fromInt(tag 1)) followed by conditionalSetInverted{regD=X0, regTrue=X0, regFalse=XZero, cond=CondUnsignedHigher} to set the result to one or minus one. N.B. This needs to use a 32-bit operation on 32-in-64. *) moveRegister{dest=target, source=mergeResult} :: BlockLabel exitLab :: loadNonAddressConstant{source=taggedWord64 0w1, dest=mergeResult} :: BlockLabel labGreater :: BlockFlow(Unconditional exitLab) :: loadNonAddressConstant{source=taggedMinus1, dest=mergeResult} :: BlockLabel labLess :: BlockFlow(Unconditional exitLab) :: loadNonAddressConstant{source=taggedWord64 0w0, dest=mergeResult} :: BlockLabel labNotGreater :: BlockFlow(Conditional{ ccRef=ccRef, condition=CondUnsignedHigher, trueJump=labGreater, falseJump=labNotGreater }) :: BlockLabel labNotLess :: BlockFlow(Conditional{ ccRef=ccRef, condition=CondCarryClear, trueJump=labLess, falseJump=labNotLess }) :: compareByteVectors{ vec1Addr=ltReg2, vec2Addr=rtReg2, length=lengthReg2, ccRef=ccRef } :: moveRegister{dest=rtReg2, source=rtReg} :: moveRegister{dest=ltReg2, source=ltReg} :: untagValue{dest=lengthReg2, source=lengthReg, isSigned=false, opSize=polyWordOpSize} :: code end in loadAndStoreWithAbsolute (rightAddr, opWordSize Load8, loadShift Load8, doComparison, tailCode) end val testCode = loadAndStoreWithAbsolute (leftAddr, opWordSize Load8, loadShift Load8, getRightAndCompare, codeLength) in (testCode, target, false) end | codeToICodeRev(BICAllocateWordMemory {numWords, flags, initial }, context, _, destination, tailCode) = let (* Allocate a block of memory and initialise it. *) val target = asTarget destination val (codeSize, sizeReg, _) = codeToICodeRev(numWords, context, false, AnyReg, tailCode) val (codeFlags, flagsReg, _) = codeToICodeRev(flags, context, false, AnyReg, codeSize) val (codeInit, initReg, _) = codeToICodeRev(initial, context, false, AnyReg, codeFlags) val uSizeReg = newUReg() and shiftFReg = newUReg() and lengthWord = newUReg() val absAddr = if is32in64 then newUReg() else target val untagSize = untagValue{source=sizeReg, dest=uSizeReg, opSize=polyWordOpSize, isSigned=false} :: codeInit val allocateMem = allocateMemoryVariable{ size=uSizeReg, dest=absAddr, saveRegs=[]} :: untagSize (* Make the length word by first shifting the flags into the length word reg by 55 or 23 bits. This puts the tag bit in the top bit of the size. Then insert the size into this which will overwrite the flag's tag bit. *) val makeLengthWord = bitFieldInsert{ source=uSizeReg, destAsSource=shiftFReg, dest=lengthWord, length=polyWordOpSize, immr=0w0 (*bit 0*), imms=if is32in64 then 0w23 else 0w55 (*width-1*) } :: shiftConstant{direction=Arm64ICode.ShiftLeft, dest=shiftFReg, source=flagsReg, shift=if is32in64 then 0w23 else 0w55, opSize=polyWordOpSize } :: allocateMem val setLengthWordAndInit = initialiseMem{ size=uSizeReg, addr=absAddr, init=initReg} :: storeWithConstantOffset{ source=lengthWord, base=absAddr, byteOffset= ~(Word.toInt wordSize), loadType=polyWordLoadSize } :: makeLengthWord val finalCode = if is32in64 then absoluteToObjectIndex{ source=absAddr, dest=target } :: setLengthWordAndInit else setLengthWordAndInit in (finalCode, target, false) end | codeToICodeRev(BICHandle{exp, handler, exPacketAddr}, context as { stackPtr, loopArgs, overflowBlock, ... }, isTail, destination, tailCode) = let (* As with BICCond and BICCase we need to create a new register for the result in case we need to push it to the stack. 
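The block above produces the tagged equivalent of a three-way comparison; taggedMinus1 is the all-ones pattern because, with 2n+1 tagging, tag(~1) = 2 * ~1 + 1 = ~1. A direct (unoptimised) model of the value being computed:

fun compareBytes (a: Word8.word, b: Word8.word) : int =
    if a = b then 0
    else if Word8.< (a, b) then ~1
    else 1

(* Tagged, these three results become 0w1, all ones, and 0w3 respectively,
   which is why taggedMinus1 above is 0wxFFFFFFFF / 0wxFFFFFFFFFFFFFFFF. *)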
*) val handleResult = newMergeReg() val handlerLab = newLabel() and startHandling = newLabel() val (bodyTarget, handlerTarget) = if isTail then (newPReg(), newPReg()) else (handleResult, handleResult) (* TODO: Even if we don't actually want a result we force one in here by using "asTarget". *) (* The expression cannot be treated as a tail because the handler has to be removed after. It may "exit" if it has raised an unconditional exception. If it has we mustn't generate a PopExceptionHandler because there won't be any result for resultReg. We need to add two words to the stack to account for the items pushed by PushExceptionHandler. We create an instruction to push the handler followed by a block fork to the start of the code and, potentially the handler, then a label to start the code that the handler is in effect for. *) val initialCode = BlockLabel startHandling :: BlockFlow(SetHandler{handler=handlerLab, continue=startHandling}) :: BlockSimple(PushExceptionHandler) :: tailCode val (expCode, _, expExit) = codeToICodeRev(exp, {stackPtr=stackPtr+2, loopArgs=loopArgs, currHandler=SOME handlerLab, overflowBlock=overflowBlock}, false (* Not tail *), SpecificPReg bodyTarget, initialCode) (* If this is the tail we can replace the jump at the end of the handled code with returns. If the handler has exited we don't need a return there. Otherwise we need to add an unconditional jump to skip the handler. *) val (atExpEnd, skipExpLabel) = case (isTail, expExit) of (true, true) => (* Tail and exited. *) (expCode, NONE) | (true, false) => (* Tail and not exited. *) (returnInstruction(context, bodyTarget, BlockSimple(PopExceptionHandler) :: expCode), NONE) | (false, true) => (* Not tail but exited. *) (expCode, NONE) | (false, false) => let val skipHandler = newLabel() in (BlockFlow(Unconditional skipHandler) :: BlockSimple(PopExceptionHandler) :: expCode, SOME skipHandler) end (* Make a register to hold the exception packet and put eax into it. *) val packetAddr = newPReg() val () = Array.update(locToPregArray, exPacketAddr, PregLocation packetAddr) val (handleCode, _, handleExit) = codeToICodeRev(handler, context, isTail, SpecificPReg handlerTarget, BlockSimple(BeginHandler{packetReg=packetAddr}) :: BlockLabel handlerLab :: atExpEnd) val target = asTarget destination val afterHandler = case (isTail, handleExit) of (true, true) => (* Tail and exited. *) handleCode | (true, false) => (* Tail and not exited. *) returnInstruction(context, handlerTarget, handleCode) | (false, _) => (* Not tail. *) handleCode val addLabel = case skipExpLabel of SOME lab => BlockLabel lab:: afterHandler | NONE => afterHandler in (moveRegister{source=handleResult, dest=target} :: addLabel, target, isTail) end and codeConditionRev(condition, context, jumpOn, jumpLabel, tailCode) = (* Jump optimisation is done later. Just generate the general case. Load the value into a register and compare it with 1 (true) *) let val ccRef = newCCRef() val (testCode, testReg, _) = codeToICodeRev(condition, context, false, AnyReg, tailCode) val noJumpLabel = newLabel() in - BlockLabel noJumpLabel :: + testCode <::> + (* Test bit 1. This can be optimised into a test and branch. *) + logicalImmediate{source=testReg, immed=0w2, logOp=LogAnd, + dest=ZeroReg, length=OpSize32 (* Always either tagged 0 or tagged 1 *), ccRef=SOME ccRef} <::> BlockFlow(Conditional{ccRef=ccRef, - condition=if jumpOn then CondEqual else CondNotEqual, trueJump=jumpLabel, falseJump=noJumpLabel}) :: - (* Compare: SUBS XZ,reg,3. 
Can use 32-bit comparison because it's either tagged 0 or tagged 1. *) - addSubImmediate{source=testReg, immed=taggedWord 0w1, isAdd=false, dest=ZeroReg, length=OpSize32, ccRef=SOME ccRef} :: - testCode + condition=if jumpOn then CondNotEqual else CondEqual, trueJump=jumpLabel, falseJump=noJumpLabel}) <::> + BlockLabel noJumpLabel end and codeToICodeUnaryRev({oper=NotBoolean, arg1}, context, _, destination, tailCode) = let val target = asTarget destination val ccRef = newCCRef() val (argCode, testDest, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) in (* Test the argument and return a boolean result. If either the argument is a condition - or the result is used in a test this will be better than using XOR. *) - (makeBoolResultRev(CondNotEqual, ccRef, target, - addSubImmediate{source=testDest, immed=taggedWord 0w1, isAdd=false, + or the result is used in a test this will be better than using XOR. + We use a bit test here because it is possible to optimise it to a test-and-branch. + See codeConditionRev. *) + (makeBoolResultRev(CondEqual, ccRef, target, + logicalImmediate{source=testDest, immed=0w2, logOp=LogAnd, dest=ZeroReg, length=OpSize32 (* Always either tagged 0 or tagged 1 *), ccRef=SOME ccRef} :: argCode), target, false) end | codeToICodeUnaryRev({oper=IsTaggedValue, arg1}, context, _, destination, tailCode) = let val target = asTarget destination val ccRef = newCCRef() val (argCode, testDest, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) in (* Test the argument and return a boolean result. This ought to be optimised at a lower level to use a test-and-branch. *) (makeBoolResultRev(CondNotEqual, ccRef, target, logicalImmediate{source=testDest, immed=0w1 (* The tag bit*), logOp=LogAnd, dest=ZeroReg, length=OpSize32 (* Always either tagged 0 or tagged 1 *), ccRef=SOME ccRef} :: argCode), target, false) end | codeToICodeUnaryRev({oper=MemoryCellLength, arg1}, context, _, destination, tailCode) = let val ureg1 = newUReg() and ureg2 = newUReg() val (codeBase, baseReg, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) (* Load the word at -1 (words) into a ureg *) val (codeLoad, _, _) = wordAddressOffset(SpecificPReg ureg1, baseReg, ~1, polyWordLoadSize, codeBase) (* Select 56 or 24 bits and shift it left. This disassembles as UBFIZ..*) val lsb = 0w1 and width = if is32in64 then 0w24 else 0w56 (* Encoding for unsignedBitfieldInsertinZeros64/32 *) val immr = if is32in64 then Word.~ lsb mod 0w32 else Word.~ lsb mod 0w64 val imms = width-0w1 val maskAndShift = bitFieldShift{source=ureg1, dest=ureg2, isSigned=false, length=polyWordOpSize, immr=immr, imms=imms} :: codeLoad val target = asTarget destination val addTag = addSubImmediate{dest=SomeReg target, source=ureg2, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=true} :: maskAndShift in (addTag, target, false) end | codeToICodeUnaryRev({oper=MemoryCellFlags, arg1}, context, _, destination, tailCode) = let (* Load the flags byte and tag it. 
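The logicalImmediate test introduced for codeConditionRev and NotBoolean examines bit 1 of the tagged boolean: assuming 2n+1 tagging, false is represented as 0w1 (binary 01) and true as 0w3 (binary 11), so ANDing with 0w2 distinguishes them and can later be turned into a test-and-branch. A stand-alone model:

fun taggedBool b : word = if b then 0w3 else 0w1
fun isTaggedTrue w = Word.andb (w, 0w2) <> 0w0

(* isTaggedTrue (taggedBool true) = true; isTaggedTrue (taggedBool false) = false.
   NotBoolean tests the same bit with the opposite condition. *)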
*) val (codeBase, baseReg, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (codeRealBase, realBaseReg) = getAbsoluteAddress(codeBase, baseReg) val ureg = newUReg() val codeLoad = loadWithConstantOffset{ base=realBaseReg, dest=ureg, byteOffset=flagsByteOffset, loadType=Load8 } :: codeRealBase val target = asTarget destination val withTag = tagValue{ source=ureg, dest=target, isSigned=false, opSize=OpSize32 } :: codeLoad in (withTag, target, false) end | codeToICodeUnaryRev({oper=ClearMutableFlag, arg1}, context, _, destination, tailCode) = let val (codeBase, baseReg, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (codeRealBase, realBaseReg) = getAbsoluteAddress(codeBase, baseReg) val ureg1 = newUReg() and ureg2 = newUReg() (* Load the flags, mask off the mutable bit and store it back. *) val code = storeWithConstantOffset{ base=realBaseReg, source=ureg2, byteOffset=flagsByteOffset, loadType=Load8 } :: logicalImmediate{ source=ureg1, dest=SomeReg ureg2, ccRef=NONE, immed=Word64.xorb(0wxffffffff, 0wx40), logOp=LogAnd, length=OpSize32 } :: loadWithConstantOffset{ base=realBaseReg, dest=ureg1, byteOffset=flagsByteOffset, loadType=Load8 } :: codeRealBase in returnUnit(destination, code, false) end | codeToICodeUnaryRev({oper=LongWordToTagged, arg1}, context, _, destination, tailCode) = let val (codeBase, baseReg, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val uReg = newUReg() val target = asTarget destination val code = tagValue{ source=uReg, dest=target, isSigned=false, opSize=polyWordOpSize } :: unboxLarge{ source=baseReg, dest=uReg } :: codeBase in (code, target, false) end | codeToICodeUnaryRev({oper=SignedToLongWord, arg1}, context, _, destination, tailCode) = let val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val uReg = newUReg() val target = asTarget destination (* We can use a single instruction here on both 32-in-64 and native 64-bits. On 64-bits this is equivalent to an arithmetic shift; on 32-bits it propagates the sign bit into the high-order part. *) val code = boxLarge{ source=uReg, dest=target, saveRegs=[] } :: bitFieldShift{ source=aReg1, dest=uReg, isSigned=true, length=OpSize64, immr=0w1, imms=if is32in64 then 0wx1f else 0wx3f } :: arg1Code in (code, target, false) end | codeToICodeUnaryRev({oper=UnsignedToLongWord, arg1}, context, _, destination, tailCode) = let val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val uReg = newUReg() val target = asTarget destination (* This amounts to a logical shift. Since the top half of the register is zero in 32-in-64 we don't have to select just the low word but there's no advantage in not. 
*) val code = boxLarge{ source=uReg, dest=target, saveRegs=[] } :: bitFieldShift{ source=aReg1, dest=uReg, isSigned=false, length=OpSize64, immr=0w1, imms=if is32in64 then 0wx1f else 0wx3f } :: arg1Code in (code, target, false) end | codeToICodeUnaryRev({oper=RealAbs precision, arg1}, context, _, destination, tailCode) = let val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val uReg1 = newUReg() and uReg2 = newUReg() val target = asTarget destination val fpSize = precisionToFpSize precision val fpOp = case precision of PrecSingle => AbsFloat | PrecDouble => AbsDouble val code = boxTagFloat{ floatSize=fpSize, source=uReg2, dest=target, saveRegs=[] } :: unaryFloatingPt{ source=uReg1, dest=uReg2, fpOp=fpOp } :: unboxTagFloat{ floatSize=fpSize, source=aReg1, dest=uReg1 } :: arg1Code in (code, target, false) end | codeToICodeUnaryRev({oper=RealNeg precision, arg1}, context, _, destination, tailCode) = let val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val uReg1 = newUReg() and uReg2 = newUReg() val target = asTarget destination val fpSize = precisionToFpSize precision val fpOp = case precision of PrecSingle => NegFloat | PrecDouble => NegDouble val code = boxTagFloat{ floatSize=fpSize, source=uReg2, dest=target, saveRegs=[] } :: unaryFloatingPt{ source=uReg1, dest=uReg2, fpOp=fpOp } :: unboxTagFloat{ floatSize=fpSize, source=aReg1, dest=uReg1 } :: arg1Code in (code, target, false) end | codeToICodeUnaryRev({oper=RealFixedInt precision, arg1}, context, _, destination, tailCode) = let (* Convert a tagged integer (FixedInt.int) to float or double. *) val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val uReg1 = newUReg() and uReg2 = newUReg() val target = asTarget destination val fpSize = precisionToFpSize precision val code = boxTagFloat{ floatSize=fpSize, source=uReg2, dest=target, saveRegs=[] } :: convertIntToFloat{ source=uReg1, dest=uReg2, srcSize=polyWordOpSize, destSize=fpSize } :: untagValue{ source=aReg1, dest=uReg1, opSize=polyWordOpSize, isSigned=true } :: arg1Code in (code, target, false) end | codeToICodeUnaryRev({oper=FloatToDouble, arg1}, context, _, destination, tailCode) = let val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val uReg1 = newUReg() and uReg2 = newUReg() val target = asTarget destination val code = boxTagFloat{ floatSize=Double64, source=uReg2, dest=target, saveRegs=[] } :: unaryFloatingPt{ source=uReg1, dest=uReg2, fpOp=ConvFloatToDble } :: unboxTagFloat{ floatSize=Float32, source=aReg1, dest=uReg1 } :: arg1Code in (code, target, false) end | codeToICodeUnaryRev({oper=DoubleToFloat, arg1}, context, _, destination, tailCode) = let val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val uReg1 = newUReg() and uReg2 = newUReg() val target = asTarget destination val code = boxTagFloat{ floatSize=Float32, source=uReg2, dest=target, saveRegs=[] } :: unaryFloatingPt{ source=uReg1, dest=uReg2, fpOp=ConvDbleToFloat } :: unboxTagFloat{ floatSize=Double64, source=aReg1, dest=uReg1 } :: arg1Code in (code, target, false) end | codeToICodeUnaryRev({oper=RealToInt(precision, rounding), arg1}, context, _, destination, tailCode) = let (* Convert a float or double to a tagged int. We could get an overflow in either the conversion to integer or in the conversion to a tagged value. Fortunately if the conversion detects an overflow it sets the result to a value that will cause an overflow in the addition. 
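The tagging sequence below adds the converted value to itself, setting the overflow flag if the doubled value does not fit, and then adds one to set the tag bit.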
*) val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val target = asTarget destination val chkOverflow = newCCRef() val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() val fpSize = precisionToFpSize precision val code = (* Set the tag bit. *) addSubImmediate{dest=SomeReg target, source=uReg3, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=true} :: checkOverflow(CondOverflow, context, chkOverflow) @ (* Add it to itself and set the condition code. *) addSubRegister{base=uReg2, shifted=uReg2, dest=SomeReg uReg3, length=polyWordOpSize, ccRef=SOME chkOverflow, isAdd=true, shift=ShiftNone} :: convertFloatToInt{ source=uReg1, dest=uReg2, srcSize=fpSize, destSize=polyWordOpSize, rounding=rounding } :: unboxTagFloat{ floatSize=fpSize, source=aReg1, dest=uReg1 } :: arg1Code in (code, target, false) end | codeToICodeUnaryRev({oper=TouchAddress, arg1}, context, _, destination, tailCode) = let val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) in returnUnit(destination, touchValue{source=aReg1} :: arg1Code, false) end | codeToICodeUnaryRev({oper=AllocCStack, arg1}, context, _, destination, tailCode) = let (* Allocate space on the stack. The higher levels have already aligned the size to a multiple of 16. The number of bytes to allocate is a Word.word value. The result is a boxed large word. *) val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val uReg1 = newUReg() and uReg2 = newUReg() val target = asTarget destination val code = boxLarge{ source=uReg2, dest=target, saveRegs=[] } :: addSubXSP{ source=uReg1, dest=SomeReg uReg2, isAdd=false } :: untagValue{ source=aReg1, dest=uReg1, isSigned=false, opSize=polyWordOpSize } :: arg1Code in (code, target, false) end | codeToICodeUnaryRev({oper=LockMutex, arg1}, context, _, destination, tailCode) = (* The earliest versions of the Arm8 do not have the LDADD instruction which will do this directly. To preserve compatibility we use LDAXR/STLXR which require a loop. *) let local val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) in val (baseCode, baseReg) = getAbsoluteAddress(arg1Code, aReg1) end val loopLabel = newLabel() and noLoopLabel = newLabel() val target = asTarget destination val ccRef1 = newCCRef() and ccRef2 = newCCRef() val uRegNew = newUReg() and uRegTest = newUReg() and uRegOld = newUReg() and uRegIncr = newUReg() val code = if useLSEAtomics then baseCode <::> loadNonAddressConstant{ source=0w1, dest=uRegIncr } <::> atomicOperation{atOp=LoadAddAcquire, base=baseReg, source=SomeReg uRegIncr, dest=SomeReg uRegOld} <::> (* If the previous value was zero we've set it to one and we've got the lock. *) addSubImmediate{source=uRegOld, dest=ZeroReg, immed=0w0, isAdd=false, length=OpSize64, ccRef=SOME ccRef2} else (* The result is true if the old value was zero. *) addSubImmediate{source=uRegOld, dest=ZeroReg, immed=0w0, isAdd=false, length=OpSize64, ccRef=SOME ccRef2} :: memoryBarrier :: (* Put in the memory barrier. *) (* If the result is zero we've been successful otherwise we loop. 
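The store-exclusive writes a zero status into its result register only if nothing has written to the location since the matching load-exclusive; a non-zero status means the whole load-increment-store sequence must be retried.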
*) BlockLabel noLoopLabel :: BlockFlow(Conditional{ ccRef=ccRef1, condition=CondNotEqual, trueJump=loopLabel, falseJump=noLoopLabel }) :: addSubImmediate{source=uRegTest, dest=ZeroReg, immed=0w0, isAdd=false, length=OpSize32, ccRef=SOME ccRef1} :: (* Add and try to store the result *) storeReleaseExclusive{ base=baseReg, source=SomeReg uRegNew, result=uRegTest } :: addSubImmediate{source=uRegOld, dest=SomeReg uRegNew, immed=0w1, isAdd=true, length=OpSize64, ccRef=NONE} :: loadAcquireExclusive{ base=baseReg, dest=uRegOld } :: BlockLabel loopLabel :: baseCode in (makeBoolResultRev(CondEqual, ccRef2, target, code), target, false) end | codeToICodeUnaryRev({oper=TryLockMutex, arg1}, context, _, destination, tailCode) = (* *) let local val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) in val (baseCode, baseReg) = getAbsoluteAddress(arg1Code, aReg1) end val target = asTarget destination val loopLabel = newLabel() and noLoopLabel = newLabel() and okLabel = newLabel() val ccRef0 = newCCRef() and ccRef1 = newCCRef() and ccRef2 = newCCRef() val uRegNew = newUReg() and uRegTest = newUReg() and uRegOld = newUReg() val code = if useLSEAtomics then baseCode <::> loadNonAddressConstant{ source=0w1, dest=uRegNew } <::> atomicOperation{atOp=LoadUMaxAcquire, base=baseReg, source=SomeReg uRegNew, dest=SomeReg uRegOld} <::> (* If the previous value was zero we've set it to one and we've got the lock. *) addSubImmediate{source=uRegOld, dest=ZeroReg, immed=0w0, isAdd=false, length=OpSize64, ccRef=SOME ccRef2} else (* The result is true if the old value was zero. *) addSubImmediate{source=uRegOld, dest=ZeroReg, immed=0w0, isAdd=false, length=OpSize64, ccRef=SOME ccRef2} :: memoryBarrier :: (* Put in the memory barrier. *) (* If the result is zero we've been successful otherwise we loop. *) BlockLabel noLoopLabel :: BlockFlow(Conditional{ ccRef=ccRef1, condition=CondNotEqual, trueJump=loopLabel, falseJump=noLoopLabel }) :: addSubImmediate{source=uRegTest, dest=ZeroReg, immed=0w0, isAdd=false, length=OpSize32, ccRef=SOME ccRef1} :: (* If the lock wasn't taken set it to one to lock it. *) storeReleaseExclusive{ base=baseReg, source=SomeReg uRegNew, result=uRegTest } :: loadNonAddressConstant{source=0w1, dest=uRegNew } :: BlockLabel okLabel :: (* If it's not zero don't try to store anything back and exit the loop. *) BlockFlow(Conditional{ ccRef=ccRef0, condition=CondNotEqual, trueJump=noLoopLabel, falseJump=okLabel }) :: addSubImmediate{source=uRegOld, dest=ZeroReg, immed=0w0, isAdd=false, length=OpSize64, ccRef=SOME ccRef0} :: (* Get the old value and see if it's zero i.e. unlocked. *) loadAcquireExclusive{ base=baseReg, dest=uRegOld } :: BlockLabel loopLabel :: baseCode in (makeBoolResultRev(CondEqual, ccRef2, target, code), target, false) end | codeToICodeUnaryRev({oper=UnlockMutex, arg1}, context, _, destination, tailCode) = (* Get the previous value of the mutex to see if another thread had tried to lock it and set the result to zero. 
*) let (* Could use SWAPAL *) local val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) in val (baseCode, baseReg) = getAbsoluteAddress(arg1Code, aReg1) end val target = asTarget destination val loopLabel = newLabel() and noLoopLabel = newLabel() val ccRef1 = newCCRef() and ccRef2 = newCCRef() val uRegTest = newUReg() and uRegOld = newUReg() val code = if useLSEAtomics then baseCode <::> atomicOperation{atOp=SwapRelease, base=baseReg, source=ZeroReg, dest=SomeReg uRegOld} <::> addSubImmediate{source=uRegOld, dest=ZeroReg, immed=0w1, isAdd=false, length=OpSize64, ccRef=SOME ccRef2} else (* The result is true if the old value was one. i.e. we were the only thread that locked it. *) addSubImmediate{source=uRegOld, dest=ZeroReg, immed=0w1, isAdd=false, length=OpSize64, ccRef=SOME ccRef2} :: memoryBarrier :: (* Put in the memory barrier. *) (* If the result is zero we've been successful otherwise we loop. *) BlockLabel noLoopLabel :: BlockFlow(Conditional{ ccRef=ccRef1, condition=CondNotEqual, trueJump=loopLabel, falseJump=noLoopLabel }) :: addSubImmediate{source=uRegTest, dest=ZeroReg, immed=0w0, isAdd=false, length=OpSize32, ccRef=SOME ccRef1} :: (* Try to set this to zero *) storeReleaseExclusive{ base=baseReg, source=ZeroReg, result=uRegTest } :: loadAcquireExclusive{ base=baseReg, dest=uRegOld } :: BlockLabel loopLabel :: baseCode in (makeBoolResultRev(CondEqual, ccRef2, target, code), target, false) end and codeToICodeBinaryRev({oper=WordComparison{test, isSigned}, arg1, arg2}, context, _, destination, tailCode) = let (* Comparisons. This is now only used for tagged values, not for pointer equality. *) val ccRef = newCCRef() val (testCode1, testDest1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (testCode2, testDest2, _) = codeToICodeRev(arg2, context, false, AnyReg, testCode1) val comparison = addSubRegister{base=testDest1, shifted=testDest2, dest=ZeroReg, length=polyWordOpSize, ccRef=SOME ccRef, isAdd=false, shift=ShiftNone} :: testCode2 val target = asTarget destination open BuiltIns val cond = case (test, isSigned) of (TestEqual, _) => CondEqual | (TestLess, true) => CondSignedLess | (TestLessEqual, true) => CondSignedLessEq | (TestGreater, true) => CondSignedGreater | (TestGreaterEqual, true) => CondSignedGreaterEq | (TestLess, false) => CondCarryClear | (TestLessEqual, false) => CondUnsignedLowOrEq | (TestGreater, false) => CondUnsignedHigher | (TestGreaterEqual, false) => CondCarrySet | (TestUnordered, _) => raise InternalError "WordComparison: TestUnordered" in (makeBoolResultRev(cond, ccRef, target, comparison), target, false) end | codeToICodeBinaryRev({oper=FixedPrecisionArith ArithAdd, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) (* We need to subtract the tag from one of the arguments and then do the addition. The optimiser will do the subtraction at compile time if we subtract from a constant so try to put the constant in the second arg. 
*) val (firstReg, secondReg) = case arg1 of BICConstnt _ => (aReg2, aReg1) | _ => (aReg1, aReg2) val uReg = newUReg() val chkOverflow = newCCRef() val code = checkOverflow(CondOverflow, context, chkOverflow) @ addSubRegister{base=firstReg, shifted=uReg, dest=SomeReg target, length=polyWordOpSize, ccRef=SOME chkOverflow, isAdd=true, shift=ShiftNone} :: addSubImmediate{dest=SomeReg uReg, source=secondReg, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false}:: arg2Code in (code , target, false) end | codeToICodeBinaryRev({oper=FixedPrecisionArith ArithSub, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg = newUReg() val chkOverflow = newCCRef() val code = checkOverflow(CondOverflow, context, chkOverflow) @ addSubRegister{base=aReg1, shifted=uReg, dest=SomeReg target, length=polyWordOpSize, ccRef=SOME chkOverflow, isAdd=false, shift=ShiftNone} :: addSubImmediate{dest=SomeReg uReg, source=aReg2, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false}:: arg2Code in (code , target, false) end | codeToICodeBinaryRev({oper=FixedPrecisionArith ArithMult, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() and uReg4 = newUReg() val chkOverflow = newCCRef() (* Untag one argument. subtract the tag from the second, multiply and add back the tag. *) val multiplyCode = addSubImmediate{dest=SomeReg target, source=uReg3, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=true} :: multiplication{kind=if is32in64 then SignedMultAddLong else MultAdd64, dest=uReg3, sourceA=ZeroReg, sourceM=uReg1, sourceN=uReg2} :: addSubImmediate{dest=SomeReg uReg2, source=aReg2, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false} :: untagValue{ source=aReg1, dest=uReg1, isSigned=true, opSize=polyWordOpSize } :: arg2Code (* Overflow check: The condition for overflow is that the high order part (64-bits in native 64-bits, 32-bits in 32-in-64) must be zero if the result is positive and all ones if the result is negative. The high-order part is in uReg3 in 32-in-64 since we've already used SignedMultAddLong but in native 64-bits we need to use SignedMultHigh to get the high order part. In both cases we can use a comparison with ShiftASR to give a value containing just the sign of the result. *) val checkOverflowCode = if is32in64 then addSubRegister{ base=uReg4, shifted=target, dest=ZeroReg, ccRef=SOME chkOverflow, isAdd=false, length=OpSize32, shift=ShiftASR 0w31 } :: shiftConstant{direction=Arm64ICode.ShiftRightArithmetic, source=uReg3, dest=uReg4, shift=0w32, opSize=OpSize64 (* Have to start with 64-bits *)} :: multiplyCode else addSubRegister{ base=uReg4, shifted=target, dest=ZeroReg, ccRef=SOME chkOverflow, isAdd=false, length=OpSize64, shift=ShiftASR 0w63 } :: multiplication{kind=SignedMultHigh, dest=uReg4, sourceA=ZeroReg, sourceM=uReg1, sourceN=uReg2} :: multiplyCode val code = checkOverflow(CondNotEqual, context, chkOverflow) @ checkOverflowCode in (code, target, false) end | codeToICodeBinaryRev({oper=FixedPrecisionArith ArithQuot, arg1, arg2}, context, _, destination, tailCode) = let (* The word version avoids an extra shift. 
Don't do that here at least for the moment. Division by zero and overflow are checked for at the higher level. *) val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() val code = tagValue { source=uReg3, dest=target, opSize=polyWordOpSize, isSigned=true } :: division{isSigned=true, opSize=polyWordOpSize, dest=uReg3, dividend=uReg1, divisor=uReg2} :: untagValue{ source=aReg2, dest=uReg2, isSigned=true, opSize=polyWordOpSize } :: untagValue{ source=aReg1, dest=uReg1, isSigned=true, opSize=polyWordOpSize } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=FixedPrecisionArith ArithRem, arg1, arg2}, context, _, destination, tailCode) = let (* For the moment we remove the tags and then retag afterwards. The word version avoids this but at least for the moment we do it the longer way. *) (* There's no direct way to get the remainder - have to use divide and multiply. *) val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() and uReg4 = newUReg() val code = tagValue { source=uReg4, dest=target, opSize=polyWordOpSize, isSigned=true } :: multiplication{kind=if is32in64 then MultSub32 else MultSub64, dest=uReg4, sourceM=uReg3, sourceN=uReg2, sourceA=SomeReg uReg1} :: division{isSigned=true, opSize=polyWordOpSize, dest=uReg3, dividend=uReg1, divisor=uReg2} :: untagValue{ source=aReg2, dest=uReg2, isSigned=true, opSize=polyWordOpSize } :: untagValue{ source=aReg1, dest=uReg1, isSigned=true, opSize=polyWordOpSize } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=FixedPrecisionArith ArithDiv, ...}, _, _, _, _) = raise InternalError "unimplemented operation: FixedPrecisionArith ArithDiv" | codeToICodeBinaryRev({oper=FixedPrecisionArith ArithMod, ...}, _, _, _, _) = raise InternalError "unimplemented operation: FixedPrecisionArith ArithMod" | codeToICodeBinaryRev({oper=WordArith ArithAdd, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) (* We need to subtract the tag from one of the arguments and then do the addition. The optimiser will do the subtraction at compile time if we subtract from a constant so try to put the constant in the second arg. 
*) val (firstReg, secondReg) = case arg1 of BICConstnt _ => (aReg2, aReg1) | _ => (aReg1, aReg2) val uReg = newUReg() val code = addSubRegister{base=firstReg, shifted=uReg, dest=SomeReg target, length=polyWordOpSize, ccRef=NONE, isAdd=true, shift=ShiftNone} :: addSubImmediate{dest=SomeReg uReg, source=secondReg, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false}:: arg2Code in (code , target, false) end | codeToICodeBinaryRev({oper=WordArith ArithSub, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg = newUReg() (* TODO: If the first argument is a constant we could add one to that rather than subtracting one from the second argument. We're not concerned with overflow. *) val code = addSubRegister{base=aReg1, shifted=uReg, dest=SomeReg target, length=polyWordOpSize, ccRef=NONE, isAdd=false, shift=ShiftNone} :: addSubImmediate{dest=SomeReg uReg, source=aReg2, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false}:: arg2Code in (code , target, false) end | codeToICodeBinaryRev({oper=WordArith ArithMult, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() (* Untag one argument. subtract the tag from the second, multiply and add back the tag. *) val code = addSubImmediate{dest=SomeReg target, source=uReg3, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=true} :: multiplication{kind=if is32in64 then MultAdd32 else MultAdd64, dest=uReg3, sourceA=ZeroReg, sourceM=uReg1, sourceN=uReg2} :: addSubImmediate{dest=SomeReg uReg2, source=aReg2, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false} :: untagValue{ source=aReg1, dest=uReg1, isSigned=false, opSize=polyWordOpSize } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=WordArith ArithDiv, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() (* Untag the divisor (into uReg2). subtract the tag from the dividend (into uReg1), divide and or in the tag. The tag may have been set already depending on the result of the division. *) val code = logicalImmediate{dest=SomeReg target, source=uReg3, immed=0w1, length=polyWordOpSize, ccRef=NONE, logOp=LogOr} :: division{isSigned=false, opSize=polyWordOpSize, dest=uReg3, dividend=uReg1, divisor=uReg2} :: addSubImmediate{dest=SomeReg uReg1, source=aReg1, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false} :: untagValue{ source=aReg2, dest=uReg2, isSigned=false, opSize=polyWordOpSize } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=WordArith ArithMod, arg1, arg2}, context, _, destination, tailCode) = let (* There's no direct way to get the remainder - have to use divide and multiply. 
*) val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() and uReg4 = newUReg() (* Untag the divisor (into uReg2). subtract the tag from the dividend (into uReg1) Untag one argument. subtract the tag from the second, divide and or in the tag. The tag may have been set already depending on the result of the division. *) val tagBitMask = Word64.<<(Word64.fromInt ~1, 0w1) (* Requires a 64-bit AND. *) val code = (* Multiply the result of the division by the divisor and subtract this from the original, tagged dividend. This leaves us a tagged value so it can go straight into the result. *) multiplication{kind=if is32in64 then MultSub32 else MultSub64, dest=target, sourceM=uReg4, sourceN=uReg2, sourceA=SomeReg aReg1} :: (* Clear the bottom bit before the multiplication. *) logicalImmediate{dest=SomeReg uReg4, source=uReg3, immed=tagBitMask, length=OpSize64, ccRef=NONE, logOp=LogAnd} :: division{isSigned=false, opSize=polyWordOpSize, dest=uReg3, dividend=uReg1, divisor=uReg2} :: addSubImmediate{dest=SomeReg uReg1, source=aReg1, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false} :: untagValue{ source=aReg2, dest=uReg2, isSigned=false, opSize=polyWordOpSize } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=WordArith _, ...}, _, _, _, _) = raise InternalError "WordArith - unimplemented instruction" | codeToICodeBinaryRev({oper=WordLogical LogicalAnd, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) (* Since both values are tagged the tag will be preserved. *) val code = logicalRegister{base=aReg1, shifted=aReg2, dest=SomeReg target, length=polyWordOpSize, ccRef=NONE, logOp=LogAnd, shift=ShiftNone} :: arg2Code in (code , target, false) end | codeToICodeBinaryRev({oper=WordLogical LogicalOr, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) (* Since both values are tagged the tag will be preserved. *) val code = logicalRegister{base=aReg1, shifted=aReg2, dest=SomeReg target, length=polyWordOpSize, ccRef=NONE, logOp=LogOr, shift=ShiftNone} :: arg2Code in (code , target, false) end | codeToICodeBinaryRev({oper=WordLogical LogicalXor, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) (* If we just XOR the values together the tag bit in the result will be zero. It's better to remove one of the tag bits beforehand. As with Add, we try to choose a constant. 
*) val (firstReg, secondReg) = case arg1 of BICConstnt _ => (aReg2, aReg1) | _ => (aReg1, aReg2) val uReg = newUReg() val code = logicalRegister{base=firstReg, shifted=uReg, dest=SomeReg target, length=polyWordOpSize, ccRef=NONE, logOp=LogXor, shift=ShiftNone} :: addSubImmediate{dest=SomeReg uReg, source=secondReg, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false}:: arg2Code in (code , target, false) end | codeToICodeBinaryRev({oper=WordShift ShiftLeft, arg1, arg2}, context, _, destination, tailCode) = let val ureg1 = newUReg() and ureg2 = newUReg() and ureg3 = newUReg() val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val code = (* Put back the tag. *) logicalImmediate{ source=ureg3, dest=SomeReg target, ccRef=NONE, immed=0w1, logOp=LogOr, length=polyWordOpSize } :: (* Do the shift *) shiftRegister{direction=Arm64ICode.ShiftLeft, dest=ureg3, source=ureg1, shift=ureg2, opSize=polyWordOpSize} :: (* Untag the shift amount. Since it's at most 64 we can use a 32-bit operation. *) untagValue{source=aReg2, dest=ureg2, opSize=OpSize32, isSigned=false} :: (* Remove tag bit from the value we're shifting. *) logicalImmediate{ source=aReg1, dest=SomeReg ureg1, ccRef=NONE, immed=polyWordTagBitMask, logOp=LogAnd, length=polyWordOpSize } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=WordShift ShiftRightLogical, arg1, arg2}, context, _, destination, tailCode) = let val ureg1 = newUReg() and ureg2 = newUReg() val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val code = (* Put back the tag. *) logicalImmediate{ source=ureg2, dest=SomeReg target, ccRef=NONE, immed=0w1, logOp=LogOr, length=polyWordOpSize } :: (* Do the shift *) shiftRegister{direction=Arm64ICode.ShiftRightLogical, dest=ureg2, source=aReg1, shift=ureg1, opSize=polyWordOpSize} :: (* Untag the shift amount. Since it's at most 64 we can use a 32-bit operation. *) untagValue{source=aReg2, dest=ureg1, opSize=OpSize32, isSigned=false} :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=WordShift ShiftRightArithmetic, arg1, arg2}, context, _, destination, tailCode) = let val ureg1 = newUReg() and ureg2 = newUReg() val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val code = (* Put back the tag. *) logicalImmediate{ source=ureg2, dest=SomeReg target, ccRef=NONE, immed=0w1, logOp=LogOr, length=polyWordOpSize } :: (* Do the shift *) shiftRegister{direction=Arm64ICode.ShiftRightArithmetic, dest=ureg2, source=aReg1, shift=ureg1, opSize=polyWordOpSize} :: (* Untag the shift amount. Since it's at most 64 we can use a 32-bit operation. *) untagValue{source=aReg2, dest=ureg1, opSize=OpSize32, isSigned=false} :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=AllocateByteMemory, arg1, arg2}, context, _, destination, tailCode) = let (* Allocate a block of memory and without initialisation. If the flags include the "bytes" bit the GC won't look at it so it doesn't matter that it's not initialised. This is identical to AllocateWordMemory apart from the lack of initialisation. 
*) val target = asTarget destination val (codeSize, sizeReg, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (codeFlags, flagsReg, _) = codeToICodeRev(arg2, context, false, AnyReg, codeSize) val uSizeReg = newUReg() and shiftFReg = newUReg() and lengthWord = newUReg() val absAddr = if is32in64 then newUReg() else target val untagSize = untagValue{source=sizeReg, dest=uSizeReg, opSize=polyWordOpSize, isSigned=false} :: codeFlags val allocateMem = allocateMemoryVariable{ size=uSizeReg, dest=absAddr, saveRegs=[]} :: untagSize (* Make the length word by first shifting the flags into the length word reg by 55 or 23 bits. This puts the tag bit in the top bit of the size. Then insert the size into this which will overwrite the flag's tag bit. *) val makeLengthWord = bitFieldInsert{ source=uSizeReg, destAsSource=shiftFReg, dest=lengthWord, length=polyWordOpSize, immr=0w0 (*bit 0*), imms=if is32in64 then 0w23 else 0w55 (*width-1*) } :: shiftConstant{direction=Arm64ICode.ShiftLeft, dest=shiftFReg, source=flagsReg, shift=if is32in64 then 0w23 else 0w55, opSize=polyWordOpSize } :: allocateMem val setLengthWordAndInit = storeWithConstantOffset{ source=lengthWord, base=absAddr, byteOffset= ~(Word.toInt wordSize), loadType=polyWordLoadSize } :: makeLengthWord val finalCode = if is32in64 then absoluteToObjectIndex{ source=absAddr, dest=target } :: setLengthWordAndInit else setLengthWordAndInit in (finalCode, target, false) end | codeToICodeBinaryRev({oper=LargeWordComparison test, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val ccRef = newCCRef() val (testCode1, testDest1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (testCode2, testDest2, _) = codeToICodeRev(arg2, context, false, AnyReg, testCode1) val uReg1 = newUReg() and uReg2 = newUReg() val comparison = addSubRegister{base=uReg1, shifted=uReg2, dest=ZeroReg, length=OpSize64, ccRef=SOME ccRef, isAdd=false, shift=ShiftNone} :: unboxLarge{ source=testDest2, dest=uReg2 } :: unboxLarge{ source=testDest1, dest=uReg1 } :: testCode2 open BuiltIns val cond = case test of TestEqual => CondEqual | TestLess => CondCarryClear | TestLessEqual => CondUnsignedLowOrEq | TestGreater => CondUnsignedHigher | TestGreaterEqual => CondCarrySet | TestUnordered => raise InternalError "LargeWordComparison: TestUnordered" in (makeBoolResultRev(cond, ccRef, target, comparison), target, false) end | codeToICodeBinaryRev({oper=LargeWordArith ArithAdd, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() val code = boxLarge{ source=uReg3, dest=target, saveRegs=[] } :: addSubRegister{base=uReg1, shifted=uReg2, dest=SomeReg uReg3, length=OpSize64, ccRef=NONE, isAdd=true, shift=ShiftNone} :: unboxLarge{ source=aReg2, dest=uReg2 } :: unboxLarge{ source=aReg1, dest=uReg1 } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=LargeWordArith ArithSub, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() val code = boxLarge{ source=uReg3, dest=target, saveRegs=[] } :: 
addSubRegister{base=uReg1, shifted=uReg2, dest=SomeReg uReg3, length=OpSize64, ccRef=NONE, isAdd=false, shift=ShiftNone} :: unboxLarge{ source=aReg2, dest=uReg2 } :: unboxLarge{ source=aReg1, dest=uReg1 } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=LargeWordArith ArithMult, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() val code = boxLarge{ source=uReg3, dest=target, saveRegs=[] } :: multiplication{kind=MultAdd64, sourceA=ZeroReg, sourceM=uReg1, sourceN=uReg2, dest=uReg3} :: unboxLarge{ source=aReg2, dest=uReg2 } :: unboxLarge{ source=aReg1, dest=uReg1 } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=LargeWordArith ArithDiv, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() val code = boxLarge{ source=uReg3, dest=target, saveRegs=[] } :: division{isSigned=false, opSize=OpSize64, dividend=uReg1, divisor=uReg2, dest=uReg3} :: unboxLarge{ source=aReg2, dest=uReg2 } :: unboxLarge{ source=aReg1, dest=uReg1 } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=LargeWordArith ArithMod, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() and uReg4 = newUReg() val code = boxLarge{ source=uReg4, dest=target, saveRegs=[] } :: multiplication{kind=MultSub64, dest=uReg4, sourceM=uReg3, sourceN=uReg2, sourceA=SomeReg uReg1} :: division{isSigned=false, opSize=OpSize64, dividend=uReg1, divisor=uReg2, dest=uReg3} :: unboxLarge{ source=aReg2, dest=uReg2 } :: unboxLarge{ source=aReg1, dest=uReg1 } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=LargeWordArith _, ...}, _, _, _, _) = raise InternalError "LargeWordArith - unimplemented instruction" | codeToICodeBinaryRev({oper=LargeWordLogical logop, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() val logicalOp = case logop of LogicalAnd => LogAnd | LogicalOr => LogOr | LogicalXor => LogXor val code = boxLarge{ source=uReg3, dest=target, saveRegs=[] } :: logicalRegister{base=uReg1, shifted=uReg2, dest=SomeReg uReg3, length=OpSize64, ccRef=NONE, logOp=logicalOp, shift=ShiftNone} :: unboxLarge{ source=aReg2, dest=uReg2 } :: unboxLarge{ source=aReg1, dest=uReg1 } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=LargeWordShift shiftKind, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = 
newUReg() and uReg2 = newUReg() and uReg3 = newUReg() val shiftType = case shiftKind of ShiftLeft => Arm64ICode.ShiftLeft | ShiftRightLogical => Arm64ICode.ShiftRightLogical | ShiftRightArithmetic => Arm64ICode.ShiftRightArithmetic val code = boxLarge{ source=uReg3, dest=target, saveRegs=[] } :: shiftRegister{direction=shiftType, source=uReg1, shift=uReg2, dest=uReg3, opSize=OpSize64 } :: (* The shift amount is a word, not a large word. *) untagValue{ source=aReg2, dest=uReg2, opSize=OpSize32, isSigned=false } :: unboxLarge{ source=aReg1, dest=uReg1 } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=RealComparison(test, precision), arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val ccRef = newCCRef() val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val fpSize = precisionToFpSize precision val uReg1 = newUReg() and uReg2 = newUReg() (* Floating point comparisons. The fcmp instruction differs from integer comparison. If either argument is a NaN the overflow bit is set and the other bits are cleared. That means that in order to get a true result only if the values are not NaNs we have to test that at least one of C, N, or Z are set. We use unsigned tests for < and <= and signed tests for > and >=. *) val cond = case test of TestEqual => CondEqual | TestLess => CondCarryClear | TestLessEqual => CondUnsignedLowOrEq | TestGreater => CondSignedGreater | TestGreaterEqual => CondSignedGreaterEq | TestUnordered => CondOverflow val code = compareFloatingPoint{arg1=uReg1, arg2=uReg2, ccRef=ccRef, opSize=fpSize} :: unboxTagFloat{ floatSize=fpSize, source=aReg2, dest=uReg2 } :: unboxTagFloat{ floatSize=fpSize, source=aReg1, dest=uReg1 } :: arg2Code in (makeBoolResultRev(cond, ccRef, target, code), target, false) end | codeToICodeBinaryRev({oper=RealArith(oper, precision), arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val fpSize = precisionToFpSize precision val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() val fpOp = case oper of ArithAdd => AddFP | ArithSub => SubtractFP | ArithMult => MultiplyFP | ArithDiv => DivideFP | _ => raise InternalError "RealArith - unimplemented instruction" val code = boxTagFloat{ floatSize=fpSize, source=uReg3, dest=target, saveRegs=[] } :: binaryFloatingPoint{arg1=uReg1, arg2=uReg2, dest=uReg3, fpOp=fpOp, opSize=fpSize } :: unboxTagFloat{ floatSize=fpSize, source=aReg2, dest=uReg2 } :: unboxTagFloat{ floatSize=fpSize, source=aReg1, dest=uReg1 } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=PointerEq, arg1, arg2}, context, _, destination, tailCode) = let (* Equality of general values which can include pointers. This can be treated exactly as a word equality. It has to be analysed differently for indexed cases. 
*) val ccRef = newCCRef() val (testCode1, testDest1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (testCode2, testDest2, _) = codeToICodeRev(arg2, context, false, AnyReg, testCode1) val comparison = addSubRegister{base=testDest1, shifted=testDest2, dest=ZeroReg, length=polyWordOpSize, ccRef=SOME ccRef, isAdd=false, shift=ShiftNone} :: testCode2 val target = asTarget destination in (makeBoolResultRev(CondEqual, ccRef, target, comparison), target, false) end | codeToICodeBinaryRev({oper=FreeCStack, arg1, arg2}, context, _, destination, tailCode) = let (* Free space on the C stack. This is a binary operation that takes the base address and the size. The base address isn't used in this version. *) val (arg1Code, _, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg = newUReg() val code = addSubXSP{ source=uReg, dest=ZeroReg, isAdd=true } :: untagValue{ source=aReg2, dest=uReg, isSigned=false, opSize=polyWordOpSize } :: arg2Code in returnUnit(destination, code, false) end (* Code-generate an address into one or two Pregs. At this point they are in a state where we can code-generate arbitrary code before the address is used *) and addressToPregAddress({base, index, offset}, context, code) = let val (bCode, bReg, _) = codeToICodeRev(base, context, false, AnyReg, code) in case index of NONE => ({base=bReg, index=NONE, offset=offset}, bCode) | SOME index => let val (iCode, iReg, _) = codeToICodeRev(index, context, false, AnyReg, bCode) in ({base=bReg, index=SOME iReg, offset=offset}, iCode) end end (* Store the code address and the closure items into a previously allocated closure on the heap. This is used both in the simple case and also with mutually recursive declarations. *) and storeIntoClosure(lambda as { closure, ...}, absClosureAddr, context, tailCode) = let val closureRef = makeConstantClosure() val () = codeFunctionToArm64(lambda, debugSwitches, closureRef) val codeAddrWords = if is32in64 then 2 else 1 fun storeAValue(f, (n, tlCode)) = let val (code, source, _) = codeToICodeRev(BICExtract f, context, false, AnyReg, tlCode) in (n+1, storeAtWordOffset(source, n, absClosureAddr, polyWordLoadSize, code)) end (* Store the code address in the first 64-bits. *) val storeCodeAddress = if is32in64 then let (* We can't use codeAddressFromClosure on 32-in-64 because it always returns a 64-bit value. Instead we have to get the code address at run-time. *) val clReg = newPReg() and absClReg = newUReg() and absCodeReg = newUReg() in storeAtWordOffset(absCodeReg, 0, absClosureAddr, Load64, loadWithConstantOffset{base=absClReg, dest=absCodeReg, byteOffset=0, loadType=Load64} :: objectIndexAddressToAbsolute{ source=clReg, dest=absClReg } :: loadAddressConstant{source=closureAsAddress closureRef, dest=clReg} :: tailCode) end else let val cReg = newPReg() in storeAtWordOffset(cReg, 0, absClosureAddr, Load64, loadAddressConstant{source=codeAddressFromClosure closureRef, dest=cReg} :: tailCode) end val (_, storeCode) = List.foldl storeAValue (codeAddrWords, storeCodeAddress) closure in storeCode end (* Load operations. 
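Each case below forms the address from the base, optional index and offset, loads the raw value and then converts it to an ML value, tagging results that always fit in a tagged word and boxing those that may not.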
*) and codeLoadOperation(kind, address, context, target, tailCode) = let val (regAddr, codeAddr) = addressToPregAddress(address, context, tailCode) val code = case kind of LoadStoreMLWord {isImmutable=false} => let fun loadOp(addrReg, code) = loadAcquire{base=addrReg, dest=target, loadType=polyWordLoadSize} :: code in loadAndStoreWithAbsolute (regAddr, opWordSize polyWordLoadSize, loadShift polyWordLoadSize, loadOp, codeAddr) end | LoadStoreMLWord {isImmutable=true} => let fun loadConstOffset(base, offset, code) = loadWithConstantOffset{base=base, dest=target, byteOffset=offset, loadType=polyWordLoadSize} :: code fun loadIndexed(base, index, code) = loadWithIndexedOffset{base=base, dest=target, index=index, loadType=polyWordLoadSize, signExtendIndex=false} :: code in loadAndStoreWithAddress (regAddr, opWordSize polyWordLoadSize, loadShift polyWordLoadSize, false, loadConstOffset, loadIndexed, codeAddr) end | LoadStoreMLByte {isImmutable=false} => let (* Have to load into a ureg and then tag it. *) val destReg = newUReg() fun loadOp(addrReg, code) = loadAcquire{base=addrReg, dest=destReg, loadType=Load8} :: code in tagValue{source=destReg, dest=target, isSigned=false, opSize=OpSize32} :: loadAndStoreWithAbsolute (regAddr, opWordSize Load8, loadShift Load8, loadOp, codeAddr) end | LoadStoreMLByte {isImmutable=true} => let (* Have to load into a ureg and then tag it. *) val destReg = newUReg() fun loadConstOffset(base, offset, code) = loadWithConstantOffset{base=base, dest=destReg, byteOffset=offset, loadType=Load8} :: code fun loadIndexed(base, index, code) = loadWithIndexedOffset{base=base, dest=destReg, index=index, loadType=Load8, signExtendIndex=false} :: code in tagValue{source=destReg, dest=target, isSigned=false, opSize=OpSize32} :: loadAndStoreWithAddress(regAddr, opWordSize Load8, loadShift Load8, false, loadConstOffset, loadIndexed, codeAddr) end | LoadStoreC8 => let (* Have to load into a ureg and then tag it. *) val destReg = newUReg() fun loadConstOffset(base, offset, code) = loadWithConstantOffset{base=base, dest=destReg, byteOffset=offset, loadType=Load8} :: code fun loadIndexed(base, index, code) = loadWithIndexedOffset{base=base, dest=destReg, index=index, loadType=Load8, signExtendIndex=true} :: code in tagValue{source=destReg, dest=target, isSigned=false, opSize=OpSize32} :: loadAndStoreWithAddress(regAddr, opWordSize Load8, loadShift Load8, true, loadConstOffset, loadIndexed, codeAddr) end | LoadStoreC16 => let (* Have to load into a ureg and then tag it. *) val destReg = newUReg() fun loadConstOffset(base, offset, code) = loadWithConstantOffset{base=base, dest=destReg, byteOffset=offset, loadType=Load16} :: code fun loadIndexed(base, index, code) = loadWithIndexedOffset{base=base, dest=destReg, index=index, loadType=Load16, signExtendIndex=true} :: code in tagValue{source=destReg, dest=target, isSigned=false, opSize=OpSize32} :: loadAndStoreWithAddress(regAddr, opWordSize Load16, loadShift Load16, true, loadConstOffset, loadIndexed, codeAddr) end | LoadStoreC32 => let (* This is tagged in native 64-bits and boxed in 32-in-64. 
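A 32-bit C value needs at most 33 bits once tagged, so it always fits in a native 64-bit tagged word but may not fit in a 32-bit tagged word, hence the boxing in 32-in-64.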
*) val destReg = newUReg() fun loadConstOffset(base, offset, code) = loadWithConstantOffset{base=base, dest=destReg, byteOffset=offset, loadType=Load32} :: code fun loadIndexed(base, index, code) = loadWithIndexedOffset{base=base, dest=destReg, index=index, loadType=Load32, signExtendIndex=true} :: code in (if is32in64 then boxLarge{ source=destReg, dest=target, saveRegs=[] } else tagValue{source=destReg, dest=target, isSigned=false, opSize=OpSize64 (* It becomes 33 bits *)}) :: loadAndStoreWithAddress(regAddr, opWordSize Load32, loadShift Load32, true, loadConstOffset, loadIndexed, codeAddr) end | LoadStoreC64 => let (* This is always boxed. *) val destReg = newUReg() fun loadConstOffset(base, offset, code) = loadWithConstantOffset{base=base, dest=destReg, byteOffset=offset, loadType=Load64} :: code fun loadIndexed(base, index, code) = loadWithIndexedOffset{base=base, dest=destReg, index=index, loadType=Load64, signExtendIndex=true} :: code in boxLarge{ source=destReg, dest=target, saveRegs=[]} :: loadAndStoreWithAddress(regAddr, opWordSize Load64, loadShift Load64, true, loadConstOffset, loadIndexed, codeAddr) end | LoadStoreCFloat => let (* This always returns a double, not a 32-bit float. *) val destReg = newUReg() and convertReg = newUReg() fun loadConstOffset(base, offset, code) = loadFPWithConstantOffset{base=base, dest=destReg, byteOffset=offset, floatSize=Float32} :: code fun loadIndexed(base, index, code) = loadFPWithIndexedOffset{base=base, dest=destReg, index=index, floatSize=Float32, signExtendIndex=true} :: code in boxTagFloat{floatSize=Double64, source=convertReg, dest=target, saveRegs=[]} :: unaryFloatingPt{source=destReg, dest=convertReg, fpOp=ConvFloatToDble} :: loadAndStoreWithAddress(regAddr, 4, 0w2, true, loadConstOffset, loadIndexed, codeAddr) end | LoadStoreCDouble => let (* This is always boxed. *) val destReg = newUReg() fun loadConstOffset(base, offset, code) = loadFPWithConstantOffset{base=base, dest=destReg, byteOffset=offset, floatSize=Double64} :: code fun loadIndexed(base, index, code) = loadFPWithIndexedOffset{base=base, dest=destReg, index=index, floatSize=Double64, signExtendIndex=true} :: code in boxTagFloat{floatSize=Double64, source=destReg, dest=target, saveRegs=[]} :: loadAndStoreWithAddress(regAddr, 8, 0w3, true, loadConstOffset, loadIndexed, codeAddr) end | LoadStoreUntaggedUnsigned => let (* LoadStoreMLWord {isImmutable=true} except it has to be tagged. *) val ureg = newUReg() fun loadConstOffset(base, offset, code) = loadWithConstantOffset{base=base, dest=ureg, byteOffset=offset, loadType=polyWordLoadSize} :: code fun loadIndexed(base, index, code) = loadWithIndexedOffset{base=base, dest=ureg, index=index, loadType=polyWordLoadSize, signExtendIndex=false} :: code in tagValue{source=ureg, dest=target, isSigned=false, opSize=polyWordOpSize} :: loadAndStoreWithAddress(regAddr, opWordSize polyWordLoadSize, loadShift polyWordLoadSize, false, loadConstOffset, loadIndexed, codeAddr) end | LoadStorePolyWord _ => let (* LoadStoreMLWord {isImmutable=true} except it has to be boxed. For the moment don't use load-acquire. 
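The loaded polyword is boxed rather than tagged because a full polyword value may not fit in a tagged word.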
*) val ureg = newUReg() fun loadConstOffset(base, offset, code) = loadWithConstantOffset{base=base, dest=ureg, byteOffset=offset, loadType=polyWordLoadSize} :: code fun loadIndexed(base, index, code) = loadWithIndexedOffset{base=base, dest=ureg, index=index, loadType=polyWordLoadSize, signExtendIndex=false} :: code in boxLarge{source=ureg, dest=target, saveRegs=[]} :: loadAndStoreWithAddress(regAddr, opWordSize polyWordLoadSize, loadShift polyWordLoadSize, false, loadConstOffset, loadIndexed, codeAddr) end | LoadStoreNativeWord _ => let (* Similar to LoadStorePolyWord but a native word. *) val ureg = newUReg() fun loadConstOffset(base, offset, code) = loadWithConstantOffset{base=base, dest=ureg, byteOffset=offset, loadType=Load64} :: code fun loadIndexed(base, index, code) = loadWithIndexedOffset{base=base, dest=ureg, index=index, loadType=Load64, signExtendIndex=false} :: code in boxLarge{source=ureg, dest=target, saveRegs=[]} :: loadAndStoreWithAddress(regAddr, opWordSize Load64, loadShift polyWordLoadSize, false, loadConstOffset, loadIndexed, codeAddr) end in (code, target, false) end (* Store operations. *) and codeStoreOperation(kind, address, value, context, destination, tailCode1) = let val (regAddr, codeAddr) = addressToPregAddress(address, context, tailCode1) val (sourceCode, sourceReg, _) = codeToICodeRev(value, context, false, AnyReg, codeAddr) val storeCode = case kind of LoadStoreMLWord {isImmutable=false} => let fun storeOp(addrReg, code) = storeRelease{base=addrReg, source=sourceReg, loadType=polyWordLoadSize} :: code in loadAndStoreWithAbsolute(regAddr, opWordSize polyWordLoadSize, loadShift polyWordLoadSize, storeOp, sourceCode) end | LoadStoreMLWord {isImmutable=true} => let (* Used when initialising immutables that do not require store-release. *) fun loadConstOffset(base, offset, code) = storeWithConstantOffset{base=base, source=sourceReg, byteOffset=offset, loadType=polyWordLoadSize} :: code fun loadIndexed(base, index, code) = storeWithIndexedOffset{base=base, source=sourceReg, index=index, loadType=polyWordLoadSize, signExtendIndex=false} :: code in loadAndStoreWithAddress (regAddr, opWordSize polyWordLoadSize, loadShift polyWordLoadSize, false, loadConstOffset, loadIndexed, sourceCode) end | LoadStoreMLByte {isImmutable=false} => let fun storeOp(addrReg, code) = let val tReg = newUReg() in storeRelease{base=addrReg, source=tReg, loadType=Load8} :: untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=OpSize32} :: code end in loadAndStoreWithAbsolute(regAddr, opWordSize Load8, loadShift Load8, storeOp, sourceCode) end | LoadStoreMLByte {isImmutable=true} => let (* Used when initialising immutables that do not require store-release. 
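The mutable case above uses store-release so that a thread which sees the object also sees the stored byte; that ordering is not needed while the object is still being created.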
*) fun loadConstOffset(base, offset, code) = let val tReg = newUReg() in storeWithConstantOffset{base=base, source=tReg, byteOffset=offset, loadType=Load8} :: untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=OpSize32} :: code end fun loadIndexed(base, index, code) = let val tReg = newUReg() in storeWithIndexedOffset{base=base, source=tReg, index=index, loadType=Load8, signExtendIndex=false} :: untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=OpSize32} :: code end in loadAndStoreWithAddress(regAddr, opWordSize Load8, loadShift Load8, false, loadConstOffset, loadIndexed, sourceCode) end | LoadStoreC8 => let fun loadConstOffset(base, offset, code) = let val tReg = newUReg() in storeWithConstantOffset{base=base, source=tReg, byteOffset=offset, loadType=Load8} :: untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=OpSize32} :: code end fun loadIndexed(base, index, code) = let val tReg = newUReg() in storeWithIndexedOffset{base=base, source=tReg, index=index, loadType=Load8, signExtendIndex=true} :: untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=OpSize32} :: code end in loadAndStoreWithAddress(regAddr, opWordSize Load8, loadShift Load8, true, loadConstOffset, loadIndexed, sourceCode) end | LoadStoreC16 => let fun loadConstOffset(base, offset, code) = let val tReg = newUReg() in storeWithConstantOffset{base=base, source=tReg, byteOffset=offset, loadType=Load16} :: untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=OpSize32} :: code end fun loadIndexed(base, index, code) = let val tReg = newUReg() in storeWithIndexedOffset{base=base, source=tReg, index=index, loadType=Load16, signExtendIndex=true} :: untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=OpSize32} :: code end in loadAndStoreWithAddress(regAddr, opWordSize Load16, loadShift Load16, true, loadConstOffset, loadIndexed, sourceCode) end | LoadStoreC32 => let fun loadConstOffset(base, offset, code) = let val tReg = newUReg() in storeWithConstantOffset{base=base, source=tReg, byteOffset=offset, loadType=Load32} :: (if is32in64 then unboxLarge{source=sourceReg, dest=tReg} else untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=OpSize64}) :: code end fun loadIndexed(base, index, code) = let val tReg = newUReg() in storeWithIndexedOffset{base=base, source=tReg, index=index, loadType=Load32, signExtendIndex=true} :: (if is32in64 then unboxLarge{source=sourceReg, dest=tReg} else untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=OpSize64}) :: code end in loadAndStoreWithAddress(regAddr, opWordSize Load32, loadShift Load32, true, loadConstOffset, loadIndexed, sourceCode) end | LoadStoreC64 => let fun loadConstOffset(base, offset, code) = let val tReg = newUReg() in storeWithConstantOffset{base=base, source=tReg, byteOffset=offset, loadType=Load64} :: unboxLarge{source=sourceReg, dest=tReg} :: code end fun loadIndexed(base, index, code) = let val tReg = newUReg() in storeWithIndexedOffset{base=base, source=tReg, index=index, loadType=Load64, signExtendIndex=true} :: unboxLarge{source=sourceReg, dest=tReg} :: code end in loadAndStoreWithAddress(regAddr, opWordSize Load64, loadShift Load64, true, loadConstOffset, loadIndexed, sourceCode) end | LoadStoreCFloat => let (* The "real" value is a double, not a 32-bit float *) fun loadConstOffset(base, offset, code) = let val tReg = newUReg() and cReg = newUReg() in storeFPWithConstantOffset{base=base, source=tReg, byteOffset=offset, floatSize=Float32} :: unaryFloatingPt{source=cReg, dest=tReg, 
fpOp=ConvDbleToFloat} :: unboxTagFloat{floatSize=Double64, source=sourceReg, dest=cReg} :: code end fun loadIndexed(base, index, code) = let val tReg = newUReg() and cReg = newUReg() in storeFPWithIndexedOffset{base=base, source=tReg, index=index, floatSize=Float32, signExtendIndex=true} :: unaryFloatingPt{source=cReg, dest=tReg, fpOp=ConvDbleToFloat} :: unboxTagFloat{floatSize=Double64, source=sourceReg, dest=cReg} :: code end in loadAndStoreWithAddress(regAddr, 4, 0w2, true, loadConstOffset, loadIndexed, sourceCode) end | LoadStoreCDouble => let fun loadConstOffset(base, offset, code) = let val tReg = newUReg() in storeFPWithConstantOffset{base=base, source=tReg, byteOffset=offset, floatSize=Double64} :: unboxTagFloat{floatSize=Double64, source=sourceReg, dest=tReg} :: code end fun loadIndexed(base, index, code) = let val tReg = newUReg() in storeFPWithIndexedOffset{base=base, source=tReg, index=index, floatSize=Double64, signExtendIndex=true} :: unboxTagFloat{floatSize=Double64, source=sourceReg, dest=tReg} :: code end in loadAndStoreWithAddress(regAddr, 8, 0w3, true, loadConstOffset, loadIndexed, sourceCode) end | LoadStoreUntaggedUnsigned => let (* Only used when initialising strings so this does not require store-release. *) fun loadConstOffset(base, offset, code) = let val tReg = newUReg() in storeWithConstantOffset{base=base, source=tReg, byteOffset=offset, loadType=polyWordLoadSize} :: untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=polyWordOpSize} :: code end fun loadIndexed(base, index, code) = let val tReg = newUReg() in storeWithIndexedOffset{base=base, source=tReg, index=index, loadType=polyWordLoadSize, signExtendIndex=false} :: untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=polyWordOpSize} :: code end in loadAndStoreWithAddress(regAddr, opWordSize polyWordLoadSize, loadShift polyWordLoadSize, false, loadConstOffset, loadIndexed, sourceCode) end | LoadStorePolyWord _ => let (* For the moment assume we don't require store-release. *) fun loadConstOffset(base, offset, code) = let val tReg = newUReg() in storeWithConstantOffset{base=base, source=tReg, byteOffset=offset, loadType=polyWordLoadSize} :: unboxLarge{source=sourceReg, dest=tReg} :: code end fun loadIndexed(base, index, code) = let val tReg = newUReg() in storeWithIndexedOffset{base=base, source=tReg, index=index, loadType=polyWordLoadSize, signExtendIndex=false} :: unboxLarge{source=sourceReg, dest=tReg} :: code end in loadAndStoreWithAddress(regAddr, opWordSize polyWordLoadSize, loadShift polyWordLoadSize, false, loadConstOffset, loadIndexed, sourceCode) end | LoadStoreNativeWord _ => let (* For the moment assume we don't require store-release. *) fun loadConstOffset(base, offset, code) = let val tReg = newUReg() in storeWithConstantOffset{base=base, source=tReg, byteOffset=offset, loadType=Load64} :: unboxLarge{source=sourceReg, dest=tReg} :: code end fun loadIndexed(base, index, code) = let val tReg = newUReg() in storeWithIndexedOffset{base=base, source=tReg, index=index, loadType=Load64, signExtendIndex=false} :: unboxLarge{source=sourceReg, dest=tReg} :: code end in loadAndStoreWithAddress(regAddr, opWordSize Load64, loadShift Load64, false, loadConstOffset, loadIndexed, sourceCode) end in returnUnit(destination, storeCode, false) end (*Turn the codetree structure into icode. 
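The body is code-generated in reverse; if it does not already exit, a return of the result register is appended, and the resulting instruction list is then split into basic blocks.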
*) val bodyContext = {loopArgs=NONE, stackPtr=0, currHandler=NONE, overflowBlock=ref NONE} val (bodyCode, _, bodyExited) = codeToICodeRev(body, bodyContext, true, SpecificPReg resultTarget, beginInstructions) val icode = if bodyExited then bodyCode else returnInstruction(bodyContext, resultTarget, bodyCode) (* Turn the icode list into basic blocks. The input list is in reverse so as part of this we reverse the list. *) local val resArray = Array.array(!labelCounter, BasicBlock{ block=[], flow=ExitCode }) fun createEntry (blockNo, block, flow) = Array.update(resArray, blockNo, BasicBlock{ block=block, flow=flow}) fun splitCode([], _, _) = (* End of code. We should have had a BeginFunction. *) raise InternalError "splitCode - no begin" | splitCode(BlockBegin args :: _, sinceLabel, flow) = (* Final instruction. Create the initial block and exit. *) createEntry(0, BeginFunction args ::sinceLabel, flow) | splitCode(BlockSimple instr :: rest, sinceLabel, flow) = splitCode(rest, instr :: sinceLabel, flow) | splitCode(BlockLabel label :: rest, sinceLabel, flow) = (* Label - finish this block and start another. *) ( createEntry(label, sinceLabel, flow); (* Default to a jump to this label. That is used if we have assumed a drop-through. *) splitCode(rest, [], Unconditional label) ) | splitCode(BlockExit instr :: rest, _, _) = splitCode(rest, [instr], ExitCode) | splitCode(BlockFlow flow :: rest, _, _) = splitCode(rest, [], flow) | splitCode(BlockRaiseAndHandle(instr, handler) :: rest, _, _) = splitCode(rest, [instr], UnconditionalHandle handler) | splitCode(BlockOptionalHandle{call, handler, label} :: rest, sinceLabel, flow) = let (* A function call within a handler. This could go to the handler but if there is no exception will go to the next instruction. Also includes JumpLoop since the stack check could result in an Interrupt exception. *) in createEntry(label, sinceLabel, flow); splitCode(rest, [call], ConditionalHandle{handler=handler, continue=label}) end in val () = splitCode(icode, [], ExitCode) val resultVector = Array.vector resArray end open ICodeTransform val pregProperties = Vector.fromList(List.rev(! pregPropList)) in codeICodeFunctionToArm64{blocks = resultVector, functionName = name, pregProps = pregProperties, ccCount= ! ccRefCounter, debugSwitches = debugSwitches, resultClosure = resultClosure, profileObject = profileObject} end val gencodeLambda = codeFunctionToArm64 structure Foreign = Arm64Foreign structure Sharing = struct type backendIC = backendIC and bicLoadForm = bicLoadForm and argumentType = argumentType and closureRef = closureRef end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ForeignCall.sml b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ForeignCall.sml index 5a8b1319..ff25f8f8 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ForeignCall.sml +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ForeignCall.sml @@ -1,861 +1,784 @@ (* Copyright (c) 2021-2 David C. J. Matthews This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public Licence version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public Licence for more details. 
You should have received a copy of the GNU Lesser General Public Licence along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64ForeignCall( structure CodeArray: CODEARRAY and Arm64PreAssembly: ARM64PREASSEMBLY and Debug: DEBUG sharing CodeArray.Sharing = Arm64PreAssembly.Sharing ): FOREIGNCALL = struct open CodeArray Arm64PreAssembly exception InternalError = Misc.InternalError and Foreign = Foreign.Foreign datatype fastArgs = FastArgFixed | FastArgDouble | FastArgFloat val makeEntryPoint: string -> machineWord = RunCall.rtsCallFull1 "PolyCreateEntryPointObject" (* Turn an index into an absolute address. *) fun indexToAbsoluteAddress(iReg, absReg) = if is32in64 then [AddShiftedReg{regM=iReg, regN=X_Base32in64, regD=absReg, shift=ShiftLSL 0w2, opSize=OpSize64, setFlags=false}] else if iReg = absReg then [] else [MoveXRegToXReg{sReg=iReg, dReg=absReg}] - fun unboxDouble(addrReg, workReg, valReg) = - if is32in64 - then indexToAbsoluteAddress(addrReg, workReg) @ - [LoadFPRegScaled{regT=valReg, regN=workReg, unitOffset=0, floatSize=Double64}] - else [LoadFPRegScaled{regT=valReg, regN=addrReg, unitOffset=0, floatSize=Double64}] - - fun unboxOrUntagSingle(addrReg, workReg, valReg) = - if is32in64 - then [LoadFPRegIndexed{regN=X_Base32in64, regM=addrReg, regT=valReg, option=ExtUXTX ScaleOrShift, floatSize=Float32}] - else - [ - shiftConstant{direction=ShiftRightLogical, shift=0w32, regN=addrReg, regD=workReg, opSize=OpSize64}, - MoveGeneralToFP{regN=workReg, regD=valReg, floatSize=Float32} - ] - - fun boxOrTagFloat{floatReg, fixedReg, workReg, saveRegs} = - if is32in64 - then List.rev(boxFloat({source=floatReg, destination=fixedReg, workReg=workReg, saveRegs=saveRegs}, [])) - else - [ - MoveFPToGeneral{regN=floatReg, regD=fixedReg, floatSize=Float32}, - shiftConstant{direction=ShiftLeft, shift=0w32, regN=fixedReg, regD=fixedReg, opSize=OpSize64}, - BitwiseLogical{regN=fixedReg, regD=fixedReg, bits=0w1, logOp=LogOr, setFlags=false, opSize=OpSize64} - ] - - (* Call the RTS. Previously this did not check for exceptions raised in the RTS and instead there was code added after each call. Doing it after the call doesn't affect the time taken but makes the code larger especially as this is needed in every arbitrary precision operation. Currently we clear the RTS exception packet field before the call. The field is cleared in "full" calls that may raise an exception but not in fast calls. They may not raise an exception but the packet may not have been cleared from a previous call. *) - fun rtsCallFastGeneral (functionName, argFormats, resultFormat, debugSwitches) = + fun rtsCallFastGeneral (functionName, debugSwitches) = let val entryPointAddr = makeEntryPoint functionName (* The maximum we currently have is five so we don't need to worry about stack args. *) - fun loadArgs([], _, _, _) = [] - | loadArgs(FastArgFixed :: argTypes, srcReg :: srcRegs, fixed :: fixedRegs, fpRegs) = - if srcReg = fixed - then loadArgs(argTypes, srcRegs, fixedRegs, fpRegs) (* Already in the right reg *) - else MoveXRegToXReg{sReg=srcReg, dReg=fixed} :: - loadArgs(argTypes, srcRegs, fixedRegs, fpRegs) - | loadArgs(FastArgDouble :: argTypes, srcReg :: srcRegs, fixedRegs, fp :: fpRegs) = - (* Unbox the value into a fp reg. 
*) - unboxDouble(srcReg, srcReg, fp) @ - loadArgs(argTypes, srcRegs, fixedRegs, fpRegs) - | loadArgs(FastArgFloat :: argTypes, srcReg :: srcRegs, fixedRegs, fp :: fpRegs) = - (* Untag and move into the fp reg *) - unboxOrUntagSingle(srcReg, srcReg, fp) @ - loadArgs(argTypes, srcRegs, fixedRegs, fpRegs) - | loadArgs _ = raise InternalError "rtsCall: Too many arguments" - val labelMaker = createLabelMaker() val noRTSException = createLabel labelMaker val instructions = - loadArgs(argFormats, - (* ML Arguments *) [X0, X1, X2, X3, X4, X5, X6, X7], - (* C fixed pt args *) [X0, X1, X2, X3, X4, X5, X6, X7], - (* C floating pt args *) [V0, V1, V2, V3, V4, V5, V6, V7]) @ (* Clear the RTS exception state. *) LoadNonAddr(X16, 0w1) :: [ StoreRegScaled{regT=X16, regN=X_MLAssemblyInt, unitOffset=exceptionPacketOffset, loadType=Load64}, (* Move X30 to X23, a callee-save register. *) (* Note: maybe we should push X24 just in case this is the only reachable reference to the code. *) LogicalShiftedReg{regN=XZero, regM=X_LinkReg, regD=X23, shift=ShiftNone, logOp=LogOr, opSize=OpSize64, setFlags=false}, LoadAddr(X16, entryPointAddr) (* Load entry point *) ] @ indexToAbsoluteAddress(X16, X16) @ [ LoadRegScaled{regT=X16, regN=X16, unitOffset=0, loadType=Load64}, (* Load the actual address. *) (* Store the current heap allocation pointer. *) StoreRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset, loadType=Load64}, (* For the moment save and restore the ML stack pointer. No RTS call should change it and it's callee-save but just in case... *) StoreRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset, loadType=Load64}, BranchReg{regD=X16, brRegType=BRRAndLink}, (* Call the function. *) (* Restore the ML stack pointer. *) LoadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset, loadType=Load64}, (* Load the heap allocation ptr and limit. We could have GCed in the RTS call. *) LoadRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset, loadType=Load64}, LoadRegScaled{regT=X_MLHeapLimit, regN=X_MLAssemblyInt, unitOffset=heapLimitPtrOffset, loadType=Load64}, (* Check the RTS exception. *) LoadRegScaled{regT=X16, regN=X_MLAssemblyInt, unitOffset=exceptionPacketOffset, loadType=Load64}, SubImmediate{regN=X16, regD=XZero, immed=0w1, shifted=false, setFlags=true, opSize=OpSize64}, ConditionalBranch(CondEqual, noRTSException), (* If it isn't then raise the exception. *) MoveXRegToXReg{sReg=X16, dReg=X0}, LoadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset, loadType=Load64}, LoadRegScaled{regT=X16, regN=X_MLStackPtr, unitOffset=0, loadType=Load64}, BranchReg{regD=X16, brRegType=BRRBranch}, SetLabel noRTSException - ] @ - ( - case resultFormat of - FastArgFixed => [] - | FastArgDouble => (* This must be boxed. *) List.rev(boxDouble({source=V0, destination=X0, workReg=X1, saveRegs=[]}, [])) - | FastArgFloat => (* This must be tagged or boxed *) boxOrTagFloat{floatReg=V0, fixedReg=X0, workReg=X1, saveRegs=[]} - ) @ [ BranchReg{regD=X23, brRegType=BRRReturn} ] val closure = makeConstantClosure() val () = generateFinalCode{instrs=instructions, name=functionName, parameters=debugSwitches, resultClosure=closure, profileObject=createProfileObject(), labelMaker=labelMaker} in closureAsAddress closure end - + (* Provided there are no more than eight fixed pt args and four floating pt args + everything will be in the correct place. 
These are only used in Initialise + so the number of arguments is limited, currently six. *) fun rtsCallFast (functionName, nArgs, debugSwitches) = - rtsCallFastGeneral (functionName, List.tabulate(nArgs, fn _ => FastArgFixed), FastArgFixed, debugSwitches) - - (* RTS call with one double-precision floating point argument and a floating point result. *) - fun rtsCallFastRealtoReal (functionName, debugSwitches) = - rtsCallFastGeneral (functionName, [FastArgDouble], FastArgDouble, debugSwitches) - - (* RTS call with two double-precision floating point arguments and a floating point result. *) - fun rtsCallFastRealRealtoReal (functionName, debugSwitches) = - rtsCallFastGeneral (functionName, [FastArgDouble, FastArgDouble], FastArgDouble, debugSwitches) - - (* RTS call with one double-precision floating point argument, one fixed point argument and a - floating point result. *) - fun rtsCallFastRealGeneraltoReal (functionName, debugSwitches) = - rtsCallFastGeneral (functionName, [FastArgDouble, FastArgFixed], FastArgDouble, debugSwitches) - - (* RTS call with one general (i.e. ML word) argument and a floating point result. - This is used only to convert arbitrary precision values to floats. *) - fun rtsCallFastGeneraltoReal (functionName, debugSwitches) = - rtsCallFastGeneral (functionName, [FastArgFixed], FastArgDouble, debugSwitches) - - (* Operations on Real32.real values. *) - - fun rtsCallFastFloattoFloat (functionName, debugSwitches) = - rtsCallFastGeneral (functionName, [FastArgFloat], FastArgFloat, debugSwitches) + if nArgs > 8 then raise InternalError "rtsCallFast: more than 8 arguments" + else rtsCallFastGeneral (functionName, debugSwitches) - fun rtsCallFastFloatFloattoFloat (functionName, debugSwitches) = - rtsCallFastGeneral (functionName, [FastArgFloat, FastArgFloat], FastArgFloat, debugSwitches) - - (* RTS call with one double-precision floating point argument, one fixed point argument and a - floating point result. *) - fun rtsCallFastFloatGeneraltoFloat (functionName, debugSwitches) = - rtsCallFastGeneral (functionName, [FastArgFloat, FastArgFixed], FastArgFloat, debugSwitches) - - (* RTS call with one general (i.e. ML word) argument and a floating point result. - This is used only to convert arbitrary precision values to floats. *) - fun rtsCallFastGeneraltoFloat (functionName, debugSwitches) = - rtsCallFastGeneral (functionName, [FastArgFixed], FastArgFloat, debugSwitches) + val rtsCallFastRealtoReal = rtsCallFastGeneral + and rtsCallFastRealRealtoReal = rtsCallFastGeneral + and rtsCallFastRealGeneraltoReal = rtsCallFastGeneral + and rtsCallFastGeneraltoReal = rtsCallFastGeneral + and rtsCallFastFloattoFloat = rtsCallFastGeneral + and rtsCallFastFloatFloattoFloat = rtsCallFastGeneral + and rtsCallFastFloatGeneraltoFloat = rtsCallFastGeneral + and rtsCallFastGeneraltoFloat = rtsCallFastGeneral (* There is only one ABI value. *) datatype abi = ARM64Abi fun abiList () = [("default", ARM64Abi)] fun alignUp(s, align) = Word.andb(s + align-0w1, ~ align) val getThreadDataCall = makeEntryPoint "PolyArm64GetThreadData" (* This must match the type in Foreign.LowLevel. Once this is bootstrapped we could use that type but note that this is the type we use within the compiler and we build Foreign.LowLevel AFTER compiling this. 
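Each cType pairs a form with its alignment and size in bytes, which is what the argument classification and layout code below works from. For example, a C struct of two floats unwraps to two 4-byte CTypeFloatingPt entries and is classified as an HFA of two single-precision values.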
*) datatype cTypeForm = CTypeFloatingPt | CTypePointer | CTypeSignedInt | CTypeUnsignedInt | CTypeStruct of cType list | CTypeVoid withtype cType = { typeForm: cTypeForm, align: word, size: word } (* Load a byte, halfword, word or long *) fun loadAlignedValue(reg, base, offset, size) = let val _ = offset mod size = 0w0 orelse raise InternalError "loadAlignedValue: not aligned" val loadOp = case size of 0w8 => Load64 | 0w4 => Load32 | 0w2 => Load16 | 0w1 => Load8 | _ => raise InternalError "loadAlignedValue: invalid length" in LoadRegScaled{regT=reg, regN=base, unitOffset=Word.toInt(offset div size), loadType=loadOp} end (* Store a register into upto 8 bytes. Most values will involve a single store but odd-sized structs can require shifts and multiple stores. N.B. May modify the source register. *) and storeUpTo8(reg, base, offset, size) = let val storeOp = if size = 0w8 then Load64 else if size >= 0w4 then Load32 else if size >= 0w2 then Load16 else Load8 in [StoreRegUnscaled{regT=reg, regN=base, byteOffset=offset, loadType=storeOp, unscaledType=NoUpdate}] end @ ( if size = 0w6 orelse size = 0w7 then [ shiftConstant{direction=ShiftRightLogical, regN=reg, regD=reg, shift=0w32, opSize=OpSize64 }, StoreRegUnscaled{regT=reg, regN=base, byteOffset=offset+4, loadType=Load16, unscaledType=NoUpdate} ] else [] ) @ ( if size = 0w3 orelse size = 0w5 orelse size = 0w7 then [ shiftConstant{direction=ShiftRightLogical, regN=reg, regD=reg, shift=(size-0w1)*0w8, opSize=OpSize64 }, StoreRegUnscaled{regT=reg, regN=base, byteOffset=offset+Word.toInt(size-0w1), loadType=Load8, unscaledType=NoUpdate} ] else [] ) (* Extract the elements of structures. *) fun unwrap(CTypeStruct ctypes, _) = List.foldr(fn({typeForm, size, ...}, l) => unwrap(typeForm, size) @ l) [] ctypes | unwrap (ctype, size) = [(ctype, size)] (* Structures of up to four floating point values of the same precision are treated specially. *) datatype argClass = ArgClassHFA of Word8.word * bool (* 1 - 4 floating pt values *) | ArgLargeStruct (* > 16 bytes and not an HFA *) | ArgSmall (* Scalars or small structures *) fun classifyArg(ctype, size) = case unwrap (ctype, size) of [(CTypeFloatingPt, 0w4)] => ArgClassHFA(0w1, false) | [(CTypeFloatingPt, 0w4), (CTypeFloatingPt, 0w4)] => ArgClassHFA(0w2, false) | [(CTypeFloatingPt, 0w4), (CTypeFloatingPt, 0w4), (CTypeFloatingPt, 0w4)] => ArgClassHFA(0w3, false) | [(CTypeFloatingPt, 0w4), (CTypeFloatingPt, 0w4), (CTypeFloatingPt, 0w4), (CTypeFloatingPt, 0w4)] => ArgClassHFA(0w4, false) | [(CTypeFloatingPt, 0w8)] => ArgClassHFA(0w1, true) | [(CTypeFloatingPt, 0w8), (CTypeFloatingPt, 0w8)] => ArgClassHFA(0w2, true) | [(CTypeFloatingPt, 0w8), (CTypeFloatingPt, 0w8), (CTypeFloatingPt, 0w8)] => ArgClassHFA(0w3, true) | [(CTypeFloatingPt, 0w8), (CTypeFloatingPt, 0w8), (CTypeFloatingPt, 0w8), (CTypeFloatingPt, 0w8)] => ArgClassHFA(0w4, true) | _ => if size > 0w16 then ArgLargeStruct else ArgSmall (* Can we load this in a single instruction? *) fun alignedLoadStore(_, 0w1) = true | alignedLoadStore(addr, 0w2) = addr mod 0w2 = 0w0 | alignedLoadStore(addr, 0w4) = addr mod 0w4 = 0w0 | alignedLoadStore(addr, 0w8) = addr mod 0w8 = 0w0 | alignedLoadStore(addr, 0w16) = addr mod 0w8 = 0w0 (* Can use load-pair. *) | alignedLoadStore _ = false (* This builds a piece of code that takes three arguments and returns a unit result. All three arguments are SysWord.word values i.e. ML addresses containing the address of the actual C value. The first argument (X0) is the address of the function to call. 
The second argument (X1) points to a struct that contains the argument(s) for the function. The arguments have to be unpacked from the struct into the appropriate registers or to the C stack. The third argument (X2) points to a piece of memory to receive the result of the call. It may be empty if the function returns void. It may only be as big as required for the result type. *) fun foreignCall(_: abi, args: cType list, result: cType): Address.machineWord = let val resultAreaPtr = X19 (* Unboxed value from X2 - This is callee save. *) val argPtrReg = X9 (* A scratch register that isn't used for arguments. *) val entryPtReg = X16 (* Contains the address of the function to call. *) val argWorkReg = X10 (* Used in loading arguments if necessary. *) and argWorkReg2 = X11 and structSpacePtr = X12 and argWorkReg3 = X13 and argWorkReg4 = X14 val labelMaker = createLabelMaker() fun loadArgs([], stackOffset, _, _, _, code, largeStructSpace) = (code, stackOffset, largeStructSpace) | loadArgs(arg::args, stackOffset, argOffset, gRegNo, fpRegNo, code, largeStructSpace) = let val {size, align, typeForm, ...} = arg val newArgOffset = alignUp(argOffset, align) in case classifyArg(typeForm, size) of ArgClassHFA(numItems, isDouble) => if fpRegNo + numItems <= 0w8 then let val scale = if isDouble then 0w8 else 0w4 (* Load the values to the floating point registers. *) fun loadFPRegs(0w0, _, _) = [] | loadFPRegs(0w1, fpRegNo, offset) = [LoadFPRegScaled{regT=VReg fpRegNo, regN=argPtrReg, unitOffset=offset, floatSize=if isDouble then Double64 else Float32}] | loadFPRegs(n, fpRegNo, offset) = (LoadFPRegPair{regT1=VReg fpRegNo, regT2=VReg(fpRegNo+0w1), regN=argPtrReg, unitOffset=offset, floatSize=if isDouble then Double64 else Float32, unscaledType=NoUpdate} :: loadFPRegs(n-0w2, fpRegNo+0w2, offset+2)) in loadArgs(args, stackOffset, newArgOffset+size, gRegNo, fpRegNo+numItems, loadFPRegs(numItems, fpRegNo, Word.toInt(newArgOffset div scale)) @ code, largeStructSpace) end else let (* If we have insufficient number of registers we discard any that are left and push the argument to the stack. *) (* The floating point value or structure is copied to the stack as a contiguous area. Use general registers to copy the data. It could be on a 4-byte alignment. In the typical case of a single floating point value this will just be a single load and store. *) fun copyData(0w0, _, _) = [] | copyData(n, srcOffset, stackOffset) = if isDouble then LoadRegScaled{loadType=Load64, regT=argWorkReg2, regN=argPtrReg, unitOffset=srcOffset} :: StoreRegScaled{loadType=Load64, regT=argWorkReg2, regN=XSP, unitOffset=stackOffset} :: copyData(n-0w1, srcOffset+1, stackOffset+1) else LoadRegScaled{loadType=Load32, regT=argWorkReg2, regN=argPtrReg, unitOffset=srcOffset} :: StoreRegScaled{loadType=Load32, regT=argWorkReg2, regN=XSP, unitOffset=stackOffset} :: copyData(n-0w1, srcOffset+1, stackOffset+1) val copyToStack = if isDouble then copyData(numItems, Word.toInt(newArgOffset div 0w8), stackOffset) else copyData(numItems, Word.toInt(newArgOffset div 0w4), stackOffset*2) (* The overall size is rounded up to a multiple of 8 *) val newStackOffset = stackOffset + Word.toInt(alignUp(size, 0w8) div 0w8) in loadArgs(args, newStackOffset, newArgOffset+size, gRegNo, 0w8, copyToStack @ code, largeStructSpace) end | _ => let (* Load an aligned argument into one or two registers or copy it to the stack. 
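A 16-byte argument is loaded into a register pair and anything smaller into a single register; once the argument registers are exhausted the value is stored to the C stack instead.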
*) fun loadArgumentValues(argSize, sourceOffset, sourceBase, newStructSpace, preCode) = if gRegNo <= 0w6 orelse (size <= 0w8 andalso gRegNo <= 0w7) then (* There are sufficient registers *) let val (loadInstr, nextGReg) = if argSize = 0w16 then ([LoadRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=XReg gRegNo, regT2=XReg(gRegNo+0w1), regN=sourceBase, unitOffset=Word.toInt(sourceOffset div 0w8)}], gRegNo+0w2) else ([loadAlignedValue(XReg gRegNo, sourceBase, sourceOffset, size)], gRegNo+0w1) in loadArgs(args, stackOffset, newArgOffset+size, nextGReg, fpRegNo, preCode @ loadInstr @ code, newStructSpace) end else if argSize = 0w16 then loadArgs(args, stackOffset+2, newArgOffset+size, 0w8, fpRegNo, preCode @ LoadRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=argWorkReg2, regT2=argWorkReg3, regN=sourceBase, unitOffset=Word.toInt(sourceOffset div 0w8)} :: StoreRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=argWorkReg2, regT2=argWorkReg3, regN=XSP, unitOffset=stackOffset} :: code, newStructSpace) else loadArgs(args, stackOffset+1, newArgOffset+size, 0w8, fpRegNo, preCode @ loadAlignedValue(argWorkReg2, sourceBase, sourceOffset, argSize) :: StoreRegScaled{loadType=Load64, regT=argWorkReg2, regN=XSP, unitOffset=stackOffset} :: code, newStructSpace) in if alignedLoadStore(newArgOffset, size) then loadArgumentValues(size, newArgOffset, argPtrReg, largeStructSpace, []) else (* General case. Either a large structure or a small structure that can't easily be loaded, First copy it to the stack, and either pass the address or load it once it's aligned. *) let val newStructSpace = alignUp(largeStructSpace + size, 0w16) val loopLabel = createLabel labelMaker (* The address of the area we're copying to is in argRegNo. *) val argRegNo = if gRegNo < 0w8 then XReg gRegNo else argWorkReg (* Copy from the end back to the start. *) val copyToStructSpace = [ AddImmediate{opSize=OpSize64, setFlags=false, regN=structSpacePtr, regD=argRegNo, immed=largeStructSpace, shifted=false}, AddImmediate{opSize=OpSize64, setFlags=false, regN=argRegNo, regD=argWorkReg2, immed=size, shifted=false}, (* End of dest area *) AddImmediate{opSize=OpSize64, setFlags=false, regN=argPtrReg, regD=argWorkReg3, immed=newArgOffset+size, shifted=false}, (* end of source *) SetLabel loopLabel, LoadRegUnscaled{loadType=Load8, unscaledType=PreIndex, regT=argWorkReg4, regN=argWorkReg3, byteOffset= ~1}, StoreRegUnscaled{loadType=Load8, unscaledType=PreIndex, regT=argWorkReg4, regN=argWorkReg2, byteOffset= ~1}, SubShiftedReg{opSize=OpSize64, setFlags=true, regM=argWorkReg2, regN=argRegNo, regD=XZero, shift=ShiftNone}, (* At start? *) ConditionalBranch(CondNotEqual, loopLabel) ] in if size > 0w16 then (* Large struct - pass by reference *) ( if gRegNo < 0w8 then loadArgs(args, stackOffset, newArgOffset+size, gRegNo+0w1, fpRegNo, copyToStructSpace @ code, newStructSpace) else loadArgs(args, stackOffset+1, newArgOffset+size, 0w8, fpRegNo, copyToStructSpace @ StoreRegScaled{loadType=Load64, regT=argWorkReg, regN=XSP, unitOffset=stackOffset} :: code, newStructSpace) ) else (* Small struct. Since it's now in an area at least 16 bytes and properly aligned we can load it. *) (* argRegNo points to where we copied it *) loadArgumentValues(if size > 0w8 then 0w16 else 0w8, 0w0, argRegNo, newStructSpace, copyToStructSpace) end end end local val {size, typeForm, ...} = result (* Store a result register into the result area. In almost all cases this is very simple: the only complication is with structs of odd sizes. 
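Those go through storeUpTo8: a 5-byte result, for example, is written as a 4-byte store followed by a single-byte store of the top byte.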
*) fun storeResult(reg, offset, size) = storeUpTo8(reg, resultAreaPtr, offset, size) in val (getResult, passArgAddress) = if typeForm = CTypeVoid then ([], false) else case classifyArg(typeForm, size) of (* Floating point values are returned in s0-sn, d0-dn. *) ArgClassHFA(numItems, isDouble) => let fun storeFPRegs(0w0, _, _) = [] | storeFPRegs(0w1, fpRegNo, offset) = [StoreFPRegScaled{regT=VReg fpRegNo, regN=resultAreaPtr, unitOffset=offset, floatSize=if isDouble then Double64 else Float32}] | storeFPRegs(n, fpRegNo, offset) = StoreFPRegPair{regT1=VReg fpRegNo, regT2=VReg(fpRegNo+0w1), regN=resultAreaPtr, unitOffset=offset, floatSize=if isDouble then Double64 else Float32, unscaledType=NoUpdate} :: storeFPRegs(n-0w2, fpRegNo+0w2, offset+2) in (storeFPRegs(numItems, 0w0 (* V0-Vn*), 0), false) end | ArgLargeStruct => ([], true) (* Structures larger than 16 bytes are passed by reference. *) | _ => if size = 0w16 then ([StoreRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=X0, regT2=X1, regN=resultAreaPtr, unitOffset=0}], false) else if size > 0w8 then (StoreRegScaled{loadType=Load64, regT=X0, regN=resultAreaPtr, unitOffset=0} :: storeResult(X1, 8, size-0w8), false) else (storeResult(X0, 0, size), false) end val (argCode, argStack, largeStructSpace) = loadArgs(args, 0, 0w0, 0w0, 0w0, if passArgAddress (* If we have to pass the address of the result struct it goes in X8. *) then [MoveXRegToXReg{sReg=resultAreaPtr, dReg=X8}] else [], 0w0) val stackSpaceRequired = alignUp(Word.fromInt argStack * 0w8, 0w16) + largeStructSpace val instructions = [(* Push the return address to the stack. We could put it in a callee-save register but there's a very small chance that this could be the last reference to a piece of code. *) StoreRegUnscaled{loadType=Load64, unscaledType=PreIndex, regT=X30, regN=X_MLStackPtr, byteOffset= ~8}, (* Save heap ptr. Needed in case we have a callback. *) StoreRegScaled{loadType=Load64, regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset} ] @ indexToAbsoluteAddress(X0, X0) @ (* Load the entry point address. *) LoadRegScaled{loadType=Load64, regT=entryPtReg, regN=X0, unitOffset=0} :: ( (* Unbox the address of the result area into a callee-save register. This is where the result will be stored on return if it is anything other than a struct. We have to put the C address in there now because an ML address wouldn't be updated by a possible GC in a callback. *) if #typeForm(result) <> CTypeVoid then indexToAbsoluteAddress(X2, X2) @ [LoadRegScaled{loadType=Load64, regT=resultAreaPtr, regN=X2, unitOffset=0}] else [] ) @ [StoreRegScaled{loadType=Load64, regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset}] @ (* Save the stack pointer. *) ( if stackSpaceRequired = 0w0 then [] else [SubImmediate{opSize=OpSize64, setFlags=false, regN=XSP, regD=XSP, immed=stackSpaceRequired, shifted=false}] ) @ ( (* If we need to copy a struct load a register with a pointer to the area for it. *) if largeStructSpace = 0w0 then [] else [AddImmediate{opSize=OpSize64, setFlags=false, regN=XSP, regD=structSpacePtr, immed=stackSpaceRequired-largeStructSpace, shifted=false}] ) @ ( (* The second argument is a SysWord containing the address of a malloced area of memory with the actual arguments in it. *) if null args then [] else indexToAbsoluteAddress(X1, X1) @ [LoadRegScaled{loadType=Load64, regT=argPtrReg, regN=X1, unitOffset=0}] ) @ argCode @ [BranchReg{regD=X16, brRegType=BRRAndLink}] @ (* Call the function.
*) (* Restore the C stack value in case it's been changed by a callback. *) ( if stackSpaceRequired = 0w0 then [] else [AddImmediate{opSize=OpSize64, setFlags=false, regN=XSP, regD=XSP, immed=stackSpaceRequired, shifted=false}] ) @ [ (* Reload the ML stack pointer even though it's callee save. If we've made a callback the ML stack could have grown and so moved to a different address. *) LoadRegScaled{loadType=Load64, regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset}, (* Load the heap allocation ptr and limit in case of a callback. *) LoadRegScaled{loadType=Load64, regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset}, LoadRegScaled{loadType=Load64, regT=X_MLHeapLimit, regN=X_MLAssemblyInt, unitOffset=heapLimitPtrOffset} ] @ (* Store the result in the destination. *) getResult @ (* Pop the return address and return. *) [ LoadRegUnscaled{regT=X30, regN=X_MLStackPtr, byteOffset= 8, loadType=Load64, unscaledType=PostIndex}, BranchReg{regD=X30,brRegType=BRRReturn} ] val functionName = "foreignCall" val debugSwitches = [(*Universal.tagInject Pretty.compilerOutputTag (Pretty.prettyPrint(print, 70)), Universal.tagInject Debug.assemblyCodeTag true*)] val closure = makeConstantClosure() val () = generateFinalCode{instrs=instructions, name=functionName, parameters=debugSwitches, resultClosure=closure, profileObject=createProfileObject(), labelMaker=labelMaker} in closureAsAddress closure end (* Build a callback function. The arguments are the abi, the list of argument types and the result type. The result is an ML function that takes an ML function, f, as its argument, registers it as a callback and returns the C function as its result. When the C function is called the arguments are copied into temporary memory and the vector passed to f along with the address of the memory for the result. "f" stores the result in it when it returns and the result is then passed back as the result of the callback. N.B. This returns a closure cell which contains the address of the code. It can be used as a SysWord.word value except that while it exists the code will not be GCed. *) fun buildCallBack(_: abi, args: cType list, result: cType): Address.machineWord = let val argWorkReg = X10 (* Used in loading arguments if necessary. *) and argWorkReg2 = X11 and argWorkReg3 = X13 and argWorkReg4 = X14 val labelMaker = createLabelMaker() (* The stack contains a 32-byte result area then an aligned area for the arguments. *) (* Store the argument values to the structure that will be passed to the ML callback function. *) (* Note. We've loaded the frame pointer with the original stack ptr-96 so we can access any stack arguments from that. *) fun moveArgs([], _, _, _, _, moveFromStack) = moveFromStack | moveArgs(arg::args, stackSpace, argOffset, gRegNo, fpRegNo, moveFromStack) = let val {size, align, typeForm, ...} = arg val newArgOffset = alignUp(argOffset, align) in case classifyArg(typeForm, size) of ArgClassHFA(numItems, isDouble) => if fpRegNo + numItems <= 0w8 then let val scale = if isDouble then 0w8 else 0w4 (* Store the values from the FP registers. 
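These go into the argument area on the stack; pairs are written with a single store-pair instruction, so an HFA of three values needs one pair store and one single store.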
*) fun storeFPRegs(0w0, _, _) = [] | storeFPRegs(0w1, fpRegNo, offset) = [StoreFPRegScaled{regT=VReg fpRegNo, regN=XSP, unitOffset=offset, floatSize=if isDouble then Double64 else Float32}] | storeFPRegs(n, fpRegNo, offset) = StoreFPRegPair{regT1=VReg fpRegNo, regT2=VReg(fpRegNo+0w1), regN=XSP, unitOffset=offset, floatSize=if isDouble then Double64 else Float32, unscaledType=NoUpdate} :: storeFPRegs(n-0w2, fpRegNo+0w2, offset+2) in moveArgs(args, stackSpace, newArgOffset+size, gRegNo, fpRegNo+numItems, storeFPRegs(numItems, fpRegNo, Word.toInt(newArgOffset div scale)) @ moveFromStack) end else let (* Load the arguments from the stack and store into the result area. *) fun copyData(0w0, _, _) = [] | copyData(n, dstOffset, stackOffset) = if isDouble then LoadRegScaled{loadType=Load64, regT=argWorkReg2, regN=X29, unitOffset=stackOffset} :: StoreRegScaled{loadType=Load64, regT=argWorkReg2, regN=XSP, unitOffset=dstOffset} :: copyData(n-0w1, dstOffset+1, stackOffset+1) else LoadRegScaled{loadType=Load32, regT=argWorkReg2, regN=X29, unitOffset=stackOffset} :: StoreRegScaled{loadType=Load32, regT=argWorkReg2, regN=XSP, unitOffset=dstOffset} :: copyData(n-0w1, dstOffset+1, stackOffset+1) val copyFromStack = if isDouble then copyData(numItems, Word.toInt(newArgOffset div 0w8), stackSpace) else copyData(numItems, Word.toInt(newArgOffset div 0w4), stackSpace*2) (* The overall size is rounded up to a multiple of 8 *) val newStackOffset = stackSpace + Word.toInt(alignUp(size, 0w8) div 0w8) in moveArgs(args, newStackOffset, newArgOffset+size, gRegNo, 0w8, copyFromStack @ moveFromStack) end | _ => if alignedLoadStore(newArgOffset, size) andalso (gRegNo <= 0w6 orelse gRegNo = 0w7 andalso size <= 0w8) then (* Usual case: argument passed in one or two registers. *) ( if size > 0w8 then moveArgs(args, stackSpace, newArgOffset+size, gRegNo + 0w2, fpRegNo, StoreRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=XReg gRegNo, regT2=XReg(gRegNo+0w1), regN=XSP, unitOffset=Word.toInt(newArgOffset div 0w8)} :: moveFromStack) else moveArgs(args, stackSpace, newArgOffset+size, gRegNo + 0w1, fpRegNo, storeUpTo8(XReg gRegNo, XSP, Word.toInt newArgOffset, size) @ moveFromStack) ) else (* General case. Store the argument registers if necessary and then use a byte copy to copy into the argument area. This sorts out any odd alignments or lengths. In some cases the source will be in memory already. *) let (* The source is either the register value or the value on the stack. 
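If the value is still in registers it is first spilled to an aligned scratch slot; if it was passed on the C stack its address is computed from the frame pointer. Either way the byte-copy loop below then copes with any alignment or length.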
*) val (argRegNo, nextGReg, newStack, loadArg) = if size > 0w16 then ( if gRegNo < 0w8 then (XReg gRegNo, gRegNo + 0w1, stackSpace, []) else (argWorkReg, 0w8, stackSpace+1, [LoadRegScaled{loadType=Load64, regT=argWorkReg, regN=X29, unitOffset=stackSpace}]) ) else let val regsNeeded = if size > 0w8 then 0w2 else 0w1 in if gRegNo + regsNeeded <= 0w8 then (XReg gRegNo, gRegNo+regsNeeded, stackSpace, [if size > 0w8 then StoreRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=XReg gRegNo, regT2=XReg(gRegNo+0w1), regN=XSP, unitOffset=2} else StoreRegScaled{loadType=Load64, regT=XReg gRegNo, regN=XSP, unitOffset=2}, AddImmediate{opSize=OpSize64, setFlags=false, regD=XReg gRegNo, regN=XSP, immed=0w16, shifted=false}]) else (* Being passed on the stack *) (argWorkReg, 0w8, stackSpace+Word8.toInt regsNeeded, [AddImmediate{opSize=OpSize64, setFlags=false, regD=argWorkReg, regN=X29, immed=Word.fromInt stackSpace*0w8, shifted=false}]) end val loopLabel = createLabel labelMaker val copyCode = [ AddImmediate{opSize=OpSize64, setFlags=false, regN=argRegNo, regD=argWorkReg3, immed=size, shifted=false}, (* End of source area *) AddImmediate{opSize=OpSize64, setFlags=false, regN=XSP, regD=argWorkReg2, immed=newArgOffset+size, shifted=false}, (* end of dest *) SetLabel loopLabel, LoadRegUnscaled{loadType=Load8, unscaledType=PreIndex, regT=argWorkReg4, regN=argWorkReg3, byteOffset= ~1}, StoreRegUnscaled{loadType=Load8, unscaledType=PreIndex, regT=argWorkReg4, regN=argWorkReg2, byteOffset= ~1}, SubShiftedReg{opSize=OpSize64, setFlags=true, regM=argWorkReg3, regN=argRegNo, regD=XZero, shift=ShiftNone}, (* At start? *) ConditionalBranch(CondNotEqual, loopLabel) ] in moveArgs(args, newStack, newArgOffset+size, nextGReg, fpRegNo, loadArg @ copyCode @ moveFromStack) end end val copyArgsFromRegsAndStack = moveArgs(args, 12 (* Offset to first stack arg *), 0w32 (* Size of result area *), 0w0, 0w0, []) local fun getNextSize (arg, argOffset) = let val {size, align, ...} = arg in alignUp(argOffset, align) + size end in val argumentSpace = alignUp(List.foldl getNextSize 0w0 args, 0w16) end local val {size, typeForm, ...} = result in (* Load the results from the result area except that if we're passing the result structure by reference this is done by the caller. Generally similar to how arguments are passed in a call. *) val (loadResults, resultByReference) = if typeForm = CTypeVoid then ([], false) else case classifyArg(typeForm, size) of ArgClassHFA(numItems, isDouble) => let (* Load the values to the floating point registers. *) fun loadFPRegs(0w0, _, _) = [] | loadFPRegs(0w1, fpRegNo, offset) = [LoadFPRegScaled{regT=VReg fpRegNo, regN=XSP, unitOffset=offset, floatSize=if isDouble then Double64 else Float32}] | loadFPRegs(n, fpRegNo, offset) = LoadFPRegPair{regT1=VReg fpRegNo, regT2=VReg(fpRegNo+0w1), regN=XSP, unitOffset=offset, unscaledType=NoUpdate, floatSize=if isDouble then Double64 else Float32} :: loadFPRegs(n-0w2, fpRegNo+0w2, offset+2) in (loadFPRegs(numItems, 0w0, 0 (* result area *)), false) end | ArgLargeStruct => ([], true) (* Structures larger than 16 bytes are passed by reference. *) | _ => (* We've allocated a 32-byte area aligned onto a 16-byte boundary so we can simply load one or two registers. *) if size > 0w8 then ([LoadRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=X0, regT2=X1, regN=XSP, unitOffset=0}], false) else ([LoadRegScaled{loadType=Load64, regT=X0, regN=XSP, unitOffset=0}], false) end val instructions = [ (* Push LR, FP and the callee-save registers. 
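The frame is 96 bytes: X29/X30 go at the new stack pointer, which then becomes the frame pointer, followed by X19/X20 at offset 16, X21/X22 at 32, X23/X24 at 48, X25/X26 at 64 and X27/X28 at 80.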
*) StoreRegPair{loadType=Load64, unscaledType=PreIndex, regT1=X29, regT2=X30, regN=XSP, unitOffset= ~12}, MoveXRegToXReg{sReg=XSP, dReg=X29}, StoreRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=X19, regT2=X20, regN=X29, unitOffset=2}, StoreRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=X21, regT2=X22, regN=X29, unitOffset=4}, StoreRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=X23, regT2=X24, regN=X29, unitOffset=6}, StoreRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=X25, regT2=X26, regN=X29, unitOffset=8}, StoreRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=X27, regT2=X28, regN=X29, unitOffset=10}, (* Reserve space for the arguments and results. *) SubImmediate{opSize=OpSize64, setFlags=false, regN=XSP, regD=XSP, immed=argumentSpace+0w32, shifted=false}, (* We passed the function we're calling in X9 but we need to move it to a callee-save register before we call the RTS. *) MoveXRegToXReg{sReg=X9, dReg=X20} ] @ (* Save X8 if we're going to need it. *) (if resultByReference then [StoreRegScaled{loadType=Load64, regT=X8, regN=XSP, unitOffset=0}] else []) @ (* Now we've saved X24 we can move the global heap base into it. *) (if is32in64 then [MoveXRegToXReg{sReg=X10, dReg=X_Base32in64}] else []) @ copyArgsFromRegsAndStack @ [LoadAddr(X0, getThreadDataCall)] @ ( if is32in64 then [AddShiftedReg{setFlags=false, opSize=OpSize64, regM=X0, regN=X_Base32in64, regD=X0, shift=ShiftLSL 0w2}] else [] ) @ [ (* Call into the RTS to get the thread data ptr. *) LoadRegScaled{loadType=Load64, regT=X0, regN=X0, unitOffset=0}, BranchReg{regD=X0, brRegType=BRRAndLink}, MoveXRegToXReg{sReg=X0, dReg=X_MLAssemblyInt}, (* Load the ML regs. *) LoadRegScaled{loadType=Load64, regT=X_MLHeapLimit, regN=X_MLAssemblyInt, unitOffset=heapLimitPtrOffset}, LoadRegScaled{loadType=Load64, regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset}, LoadRegScaled{loadType=Load64, regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset}, (* Prepare the arguments. They are both syswords so have to be boxed. First load the address of the argument area which is after the 32-byte result area. *) AddImmediate{opSize=OpSize64, setFlags=false, regN=XSP, regD=X2, immed=0w32, shifted=false} ] @ List.rev(boxSysWord({source=X2, destination=X0, workReg=X3, saveRegs=[]}, [])) @ (* Address of arguments. *) ( (* Result area pointer. If we're returning by reference this is the original value of X8 otherwise it's the address of the 32 bytes we've reserved. *) if resultByReference then [LoadRegScaled{loadType=Load64, regT=X2, regN=XSP, unitOffset=0}] else [MoveXRegToXReg{sReg=XSP, dReg=X2}] ) @ List.rev(boxSysWord({source=X2, destination=X1, workReg=X3, saveRegs=[]}, [])) @ (* Put the ML closure pointer, originally in X9 now in X20, into the ML closure pointer register, X8. Then call the ML code. *) [MoveXRegToXReg{sReg=X20, dReg=X8}] @ ( if is32in64 then [ AddShiftedReg{regM=X8, regN=X_Base32in64, regD=X16, shift=ShiftLSL 0w2, opSize=OpSize64, setFlags=false}, LoadRegScaled{loadType=Load64, regT=X16, regN=X16, unitOffset=0} ] else [LoadRegScaled{loadType=Load64, regT=X16, regN=X8, unitOffset=0}] ) @ [ BranchReg{regD=X16, brRegType=BRRAndLink}, (* Save the ML stack and heap pointers. We could have allocated or grown the stack. The limit pointer is maintained by the RTS. 
*) StoreRegScaled{loadType=Load64, regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset}, StoreRegScaled{loadType=Load64, regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset} ] @ loadResults @ (* Load the return values *) [ (* Restore the callee-save registers and return. *) MoveXRegToXReg{sReg=X29, dReg=XSP}, LoadRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=X19, regT2=X20, regN=X29, unitOffset=2}, LoadRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=X21, regT2=X22, regN=X29, unitOffset=4}, LoadRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=X23, regT2=X24, regN=X29, unitOffset=6}, LoadRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=X25, regT2=X26, regN=X29, unitOffset=8}, LoadRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=X27, regT2=X28, regN=X29, unitOffset=10}, LoadRegPair{loadType=Load64, unscaledType=PostIndex, regT1=X29, regT2=X30, regN=XSP, unitOffset=12}, BranchReg{regD=X30, brRegType=BRRReturn} ] val functionName = "foreignCallBack(2)" val debugSwitches = [(*Universal.tagInject Pretty.compilerOutputTag (Pretty.prettyPrint(print, 70)), Universal.tagInject Debug.assemblyCodeTag true*)] val closure = makeConstantClosure() val () = generateFinalCode{instrs=instructions, name=functionName, parameters=debugSwitches, resultClosure=closure, profileObject=createProfileObject(), labelMaker=labelMaker} val stage2Code = closureAsAddress closure fun resultFunction f = let (* Generate a small function to load the address of f into a register and then jump to stage2. The idea is that it should be possible to generate this eventually in a single RTS call. That could be done by using a version of this as a model. *) val instructions = if is32in64 then (* Get the global heap base into X10. *) [ LoadGlobalHeapBaseInCallback X10, LoadAddr(X9, Address.toMachineWord f), (* Have to load the actual address at run-time. *) LoadAddr(X16, stage2Code), AddShiftedReg{setFlags=false, opSize=OpSize64, regM=X16, regN=X10, regD=X16, shift=ShiftLSL 0w2}, LoadRegScaled{loadType=Load64, regT=X16, regN=X16, unitOffset=0}, BranchReg{regD=X16, brRegType=BRRBranch} ] else let (* We can extract the actual code address in the native address version. *) val codeAddress = Address.loadWord(Address.toAddress stage2Code, 0w0) in [ LoadAddr(X9, Address.toMachineWord f), LoadAddr(X16, codeAddress), BranchReg{regD=X16, brRegType=BRRBranch} ] end val functionName = "foreignCallBack(1)" val debugSwitches = [(*Universal.tagInject Pretty.compilerOutputTag (Pretty.prettyPrint(print, 70)), Universal.tagInject Debug.assemblyCodeTag true*)] val closure = makeConstantClosure() val () = generateFinalCode{instrs=instructions, name=functionName, parameters=debugSwitches, resultClosure=closure, profileObject=createProfileObject(), labelMaker=createLabelMaker()} val res = closureAsAddress closure (*val _ = print("Address is " ^ (LargeWord.toString(RunCall.unsafeCast res)) ^ "\n")*) in res end in Address.toMachineWord resultFunction end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICode.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICode.ML index f591a361..4ce9456e 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICode.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICode.ML @@ -1,1014 +1,1022 @@ (* Copyright David C. J. Matthews 2021-2 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. 
This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64ICode( structure Arm64Code: ARM64PREASSEMBLY ): ARM64ICODE = struct open Arm64Code open Address datatype preg = PReg of int (* A pseudo-register - an abstract register. *) (* If the value is zero we can use X0/W0. *) datatype pregOrZero = SomeReg of preg | ZeroReg (* A location on the stack. May be more than word if this is a container or a handler entry. *) datatype stackLocn = StackLoc of {size: int, rno: int } (* This combines pregKind and stackLocn. *) datatype regProperty = RegPropGeneral (* A general register. *) | RegPropUntagged (* An untagged general register. *) | RegPropStack of int (* A stack location or container. *) | RegPropCacheTagged | RegPropCacheUntagged | RegPropMultiple (* The result of a conditional or case. May be defined at multiple points. *) (* The reference to a condition code. *) datatype ccRef = CcRef of int datatype reg = GenReg of xReg | FPReg of vReg datatype callKind = Recursive | ConstantCode of machineWord | FullCall (* Function calls can have an unlimited number of arguments so it isn't always going to be possible to load them into registers. *) datatype 'genReg fnarg = ArgInReg of 'genReg | ArgOnStack of { wordOffset: int, container: stackLocn, field: int } datatype ('genReg, 'optGenReg, 'fpReg) arm64ICode = (* Move the contents of one preg to another. These are always 64-bits. *) MoveRegister of { source: 'genReg, dest: 'genReg } (* Numerical constant. *) | LoadNonAddressConstant of { source: Word64.word, dest: 'genReg } (* Floating point constant *) | LoadFPConstant of { source: Word64.word, dest: 'fpReg, floatSize: floatSize } (* Address constant. *) | LoadAddressConstant of { source: machineWord, dest: 'genReg } (* Load a value into a register using a constant, signed, byte offset. The offset is in the range of -256 to (+4095*unit size). *) | LoadWithConstantOffset of { base: 'genReg, dest: 'genReg, byteOffset: int, loadType: loadType } (* Similarly for FP registers. *) | LoadFPWithConstantOffset of { base: 'genReg, dest: 'fpReg, byteOffset: int, floatSize: floatSize } (* Load a value into a register using an index register. *) | LoadWithIndexedOffset of { base: 'genReg, dest: 'genReg, index: 'genReg, loadType: loadType, signExtendIndex: bool } (* Ditto for FP. *) | LoadFPWithIndexedOffset of { base: 'genReg, dest: 'fpReg, index: 'genReg, floatSize: floatSize, signExtendIndex: bool } (* Returns the current thread ID. Always a 64-bit value.. *) | GetThreadId of { dest: 'genReg } (* Convert a 32-in-64 object index into an absolute address. *) | ObjectIndexAddressToAbsolute of { source: 'genReg, dest: 'genReg } (* Convert an absolute address into an object index. *) | AbsoluteToObjectIndex of { source: 'genReg, dest: 'genReg } (* Allocate a fixed sized piece of memory and puts the absolute address into dest. bytesRequired is the total number of bytes including the length word and any alignment necessary for 32-in-64. saveRegs is the list of registers that need to be saved if we need to do a garbage collection. 
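For example, a two-word cell on a native 64-bit build needs bytesRequired = 24: the length word plus two words of data.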
*) | AllocateMemoryFixed of { bytesRequired: Word64.word, dest: 'genReg, saveRegs: 'genReg list } (* Allocate a piece of memory. The size argument is an untagged value containing the number of words i.e. the same value used for InitialiseMemory and to store in the length word. *) | AllocateMemoryVariable of { size: 'genReg, dest: 'genReg, saveRegs: 'genReg list } (* Initialise a piece of memory by writing "size" copies of the value in "init". N.B. The size is an untagged value containing the number of words. *) | InitialiseMem of { size: 'genReg, addr: 'genReg, init: 'genReg } (* Mark the beginning of a loop. This is really only to prevent the initialisation code being duplicated in ICodeOptimise. *) | BeginLoop (* Set up the registers for a jump back to the start of a loop. *) | JumpLoop of { regArgs: {src: 'genReg fnarg, dst: 'genReg} list, stackArgs: {src: 'genReg fnarg, wordOffset: int, stackloc: stackLocn} list, checkInterrupt: 'genReg list option } (* Store a register using a constant, signed, byte offset. The offset is in the range of -256 to (+4095*unit size). *) | StoreWithConstantOffset of { source: 'genReg, base: 'genReg, byteOffset: int, loadType: loadType } (* Ditto for FP regs. *) | StoreFPWithConstantOffset of { source: 'fpReg, base: 'genReg, byteOffset: int, floatSize: floatSize } (* Store a register using an index register. *) | StoreWithIndexedOffset of { source: 'genReg, base: 'genReg, index: 'genReg, loadType: loadType, signExtendIndex: bool } (* and for FP regs. *) | StoreFPWithIndexedOffset of { source: 'fpReg, base: 'genReg, index: 'genReg, floatSize: floatSize, signExtendIndex: bool } (* Add/Subtract immediate. The destination is optional in which case XZero is used. ccRef is optional. If it is NONE the version of the instruction that does not generate a condition code is used. immed must be < 0wx1000. *) | AddSubImmediate of { source: 'genReg, dest: 'optGenReg, ccRef: ccRef option, immed: word, isAdd: bool, length: opSize } (* Add/Subtract register. As with AddSubImmediate, both the destination and cc are optional. *) | AddSubRegister of { base: 'genReg, shifted: 'genReg, dest: 'optGenReg, ccRef: ccRef option, isAdd: bool, length: opSize, shift: shiftType } (* Bitwise logical operations. The immediate value must be a valid bit pattern. ccRef can only be SOME if logOp is LogAnd. *) | LogicalImmediate of { source: 'genReg, dest: 'optGenReg, ccRef: ccRef option, immed: Word64.word, logOp: logicalOp, length: opSize } (* Register logical operations. ccRef can only be SOME if logOp is LogAnd.*) | LogicalRegister of { base: 'genReg, shifted: 'genReg, dest: 'optGenReg, ccRef: ccRef option, logOp: logicalOp, length: opSize, shift: shiftType } (* Shift a word by an amount specified in a register. *) | ShiftRegister of { direction: shiftDirection, dest: 'genReg, source: 'genReg, shift: 'genReg, opSize: opSize } (* The various forms of multiply all take three arguments and the general form is dest = M * N +/- A.. *) | Multiplication of { kind: multKind, dest: 'genReg, sourceA: 'optGenReg, sourceM: 'genReg, sourceN: 'genReg } (* Signed or unsigned division. Sets the result to zero if the divisor is zero. *) | Division of { isSigned: bool, dest: 'genReg, dividend: 'genReg, divisor: 'genReg, opSize: opSize } (* Start of function. Set the register arguments. stackArgs is the list of stack arguments. If the function has a real closure regArgs includes the closure register (X8). The register arguments include the return register (X30). 
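fpRegArgs similarly lists any floating-point register arguments together with the V registers that hold them.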
*) - | BeginFunction of { regArgs: ('genReg * xReg) list, stackArgs: stackLocn list } + | BeginFunction of { regArgs: ('genReg * xReg) list, fpRegArgs: ('fpReg * vReg) list, stackArgs: stackLocn list } (* Call a function. If the code address is a constant it is passed here. Otherwise the address is obtained by indirecting through X8 which has been loaded as one of the argument registers. The results are stored in the result registers, usually just X0. The "containers" argument is used to ensure that any container whose address is passed as one of the other arguments continues to be referenced until the function is called since there's a possibility that it isn't actually used after the function. *) | FunctionCall of { callKind: callKind, regArgs: ('genReg fnarg * xReg) list, stackArgs: 'genReg fnarg list, dests: ('genReg * xReg) list, + fpRegArgs: ('fpReg * vReg) list, fpDests: ('fpReg * vReg) list, saveRegs: 'genReg list, containers: stackLocn list} (* Jump to a tail-recursive function. This is similar to FunctionCall but complicated for stack arguments because the stack and the return address need to be overwritten. stackAdjust is the number of words to remove (positive) or add (negative) to the stack before the call. currStackSize contains the number of items currently on the stack. *) | TailRecursiveCall of { callKind: callKind, regArgs: ('genReg fnarg * xReg) list, stackArgs: {src: 'genReg fnarg, stack: int} list, + fpRegArgs: ('fpReg * vReg) list, stackAdjust: int, currStackSize: int } (* Return from the function. resultRegs are the registers containing the result, returnReg is the preg that contains the return address. *) - | ReturnResultFromFunction of { results: ('genReg * xReg) list, returnReg: 'genReg, numStackArgs: int } + | ReturnResultFromFunction of + { results: ('genReg * xReg) list, fpResults: ('fpReg * vReg) list, returnReg: 'genReg, numStackArgs: int } (* Raise an exception. The packet is always loaded into X0. *) | RaiseExceptionPacket of { packetReg: 'genReg } (* Push a register to the stack. This is used both for a normal push, copies=1, and also to reserve a container. *) | PushToStack of { source: 'genReg, copies: int, container: stackLocn } (* Load a register from the stack. The container is the stack location identifier, the field is an offset in a container. *) | LoadStack of { dest: 'genReg, wordOffset: int, container: stackLocn, field: int } (* Store a value into the stack. *) | StoreToStack of { source: 'genReg, container: stackLocn, field: int, stackOffset: int } (* Set the register to the address of the container i.e. a specific offset on the stack. *) | ContainerAddress of { dest: 'genReg, container: stackLocn, stackOffset: int } (* Remove items from the stack. Used to remove containers or registers pushed to the stack.. *) | ResetStackPtr of { numWords: int } (* Tag a value by shifting and setting the tag bit. *) | TagValue of { source: 'genReg, dest: 'genReg, isSigned: bool, opSize: opSize } (* Shift a value to remove the tag bit. The cache is used if this is untagging a value that has previously been tagged. *) | UntagValue of { source: 'genReg, dest: 'genReg, isSigned: bool, opSize: opSize } (* Box a largeword value. Stores a value into a byte area. This can be implemented using AllocateMemoryFixed but keeping it separate makes optimisation easier. The result is always an address and needs to be converted to an object index on 32-in-64. *) | BoxLarge of { source: 'genReg, dest: 'genReg, saveRegs: 'genReg list } (* Load a value from a box. 
This can be implemented using a load but is kept separate to simplify optimisation. The source is always an absolute address. *) | UnboxLarge of { source: 'genReg, dest: 'genReg } (* Convert a floating point value into a value suitable for storing in the heap. This normally involves boxing except that 32-bit floats can be tagged in native 64-bits. *) | BoxTagFloat of { floatSize: floatSize, source: 'fpReg, dest: 'genReg, saveRegs: 'genReg list } (* The reverse of BoxTagFloat. *) | UnboxTagFloat of { floatSize: floatSize, source: 'genReg, dest: 'fpReg } (* Load a value with acquire semantics. This means that any other load in this thread after this sees the value of the shared memory at this point and not earlier. This is used for references and arrays to ensure that if another thread has built a data structure on the heap and then assigns the address to a shared ref this thread will see the updated heap and not any locally cached previous version. *) | LoadAcquire of { base: 'genReg, dest: 'genReg, loadType: loadType } (* Store a value with release semantics. This ensures that any other write completes before this operation and works with LoadAcquire. *) | StoreRelease of { base: 'genReg, source: 'genReg, loadType: loadType } (* This is a generalised constant shift which includes selection of a range of bits. *) | BitFieldShift of { source: 'genReg, dest: 'genReg, isSigned: bool, length: opSize, immr: word, imms: word } (* Copy a range of bits and insert it into another register. This is the only case where a register functions both as a source and a destination. *) | BitFieldInsert of { source: 'genReg, destAsSource: 'genReg, dest: 'genReg, length: opSize, immr: word, imms: word } (* Indexed case. *) | IndexedCaseOperation of { testReg: 'genReg } (* Exception handling. - Set up an exception handler. *) | PushExceptionHandler (* End of a handled section. Restore the previous handler. *) | PopExceptionHandler (* Marks the start of a handler. This sets the stack pointer and restores the old handler. Sets the exception packet register. *) | BeginHandler of { packetReg: 'genReg } (* Compare two vectors of bytes and set the condition code on the result. The registers are modified by the instruction. *) | CompareByteVectors of { vec1Addr: 'genReg, vec2Addr: 'genReg, length: 'genReg, ccRef: ccRef } (* Move a block of bytes (isByteMove true) or words (isByteMove false). The length is the number of items (bytes or words) to move. The registers are modified by the instruction. *) | BlockMove of { srcAddr: 'genReg, destAddr: 'genReg, length: 'genReg, isByteMove: bool } (* Add or subtract to the system stack pointer and optionally return the new value. This is used to allocate and deallocate C space. *) | AddSubXSP of { source: 'genReg, dest: 'optGenReg, isAdd: bool } (* Ensures the value will actually be referenced although it doesn't generate any code. *) | TouchValue of { source: 'genReg } (* Load a value at the address and get exclusive access. Always loads a 64-bit value. *) | LoadAcquireExclusive of { base: 'genReg, dest: 'genReg } (* Store a value into an address releasing the lock. Sets the result to either 0 or 1 if it succeeds or fails. *) | StoreReleaseExclusive of { base: 'genReg, source: 'optGenReg, result: 'genReg } (* Insert a memory barrier. dmb ish. *) | MemoryBarrier (* Convert an integer to a floating point value. 
*) | ConvertIntToFloat of { source: 'genReg, dest: 'fpReg, srcSize: opSize, destSize: floatSize } (* Convert a floating point value to an integer using the specified rounding mode. We could get an overflow here but fortunately the ARM generates a value that will cause an overflow when we tag it, provided we tag it explicitly. *) | ConvertFloatToInt of { source: 'fpReg, dest: 'genReg, srcSize: floatSize, destSize: opSize, rounding: IEEEReal.rounding_mode } (* Unary floating point. This includes conversions between float and double. *) | UnaryFloatingPt of { source: 'fpReg, dest: 'fpReg, fpOp: fpUnary } (* Binary floating point: addition, subtraction, multiplication and division. *) | BinaryFloatingPoint of { arg1: 'fpReg, arg2: 'fpReg, dest: 'fpReg, fpOp: fpBinary, opSize: floatSize } (* Floating point comparison. *) | CompareFloatingPoint of { arg1: 'fpReg, arg2: 'fpReg, ccRef: ccRef, opSize: floatSize } (* Yield control during a spin-lock. *) | CPUYield (* Atomic operations added for ARM 8.1 *) | AtomicOperation of { base: 'genReg, source: 'optGenReg, dest: 'optGenReg, atOp: atomicOp } - (* Debugging - fault if values don't match. *) - | CacheCheck of { arg1: 'genReg, arg2: 'genReg } - (* Destinations at the end of a basic block. *) and controlFlow = (* Unconditional branch to a label - should be a merge point. *) Unconditional of int (* Conditional branch. Jumps to trueJump if the condition is true and to falseJump if it is false. *) | Conditional of { ccRef: ccRef, condition: condition, trueJump: int, falseJump: int } (* Exit - the last instruction of the block is a return, raise or tailcall. *) | ExitCode (* Indexed case - this branches to one of a number of labels. *) | IndexedBr of int list (* Set up a handler. This doesn't cause an immediate branch but the state at the start of the handler is the state at this point. *) | SetHandler of { handler: int, continue: int } (* Unconditional branch to a handler. Used if an exception is raised explicitly within the scope of a handler. *) | UnconditionalHandle of int (* Conditional branch to a handler. Occurs if there is a call to a function within the scope of a handler. It may jump to the handler. *) | ConditionalHandle of { handler: int, continue: int } and ('genReg, 'optGenReg, 'fpReg) basicBlock = BasicBlock of { block: ('genReg, 'optGenReg, 'fpReg) arm64ICode list, flow: controlFlow } type iCodeAbstract = (preg, pregOrZero, preg) arm64ICode and basicBlockAbstract = (preg, pregOrZero, preg) basicBlock and iCodeConcrete = (xReg, xReg, vReg) arm64ICode and basicBlockConcrete = (xReg, xReg, vReg) basicBlock (* Return the list of blocks that are the immediate successors of this. *) fun successorBlocks(Unconditional l) = [l] | successorBlocks(Conditional{trueJump, falseJump, ...}) = [trueJump, falseJump] | successorBlocks ExitCode = [] | successorBlocks(IndexedBr cases) = cases | successorBlocks(SetHandler{handler, continue, ...}) = [handler, continue] (* We only need "handler" in SetHandler because we may have a handler that is never actually jumped to.
*) | successorBlocks(UnconditionalHandle handler) = [handler] | successorBlocks(ConditionalHandle{handler, continue, ...}) = [handler, continue] local fun printCC(CcRef ccRef, stream) = stream ("CC" ^ Int.toString ccRef) fun printStackLoc(StackLoc{size, rno}, stream) = (stream "S"; stream(Int.toString rno); stream "("; stream(Int.toString size); stream ")") fun regRepr(XReg w) = "X" ^ Int.toString(Word8.toInt w) | regRepr XZero = "XZ" | regRepr XSP = "SP" + + and vRegRepr(VReg v) = "V" ^ Int.toString(Word8.toInt v) fun arithRepr OpSize64 = "64" | arithRepr OpSize32 = "32" fun printLoadType(Load64, stream) = stream "64" | printLoadType(Load32, stream) = stream "32" | printLoadType(Load16, stream) = stream "16" | printLoadType(Load8, stream) = stream "8" fun printSaves([], _, _) = () | printSaves([areg], _, printReg) = printReg areg | printSaves(areg::more, stream, printReg) = (printReg areg; stream ","; printSaves(more, stream, printReg)) fun printArg(ArgInReg reg, _, printReg) = printReg reg | printArg(ArgOnStack{wordOffset, container, field, ...}, stream, _) = ( printStackLoc(container, stream); stream " + "; stream(Int.toString field); stream " ("; stream(Int.toString wordOffset); stream ")" ) fun printShift(ShiftLSL w, stream) = stream(" LSL " ^ Word8.toString w) | printShift(ShiftLSR w, stream) = stream(" LSR " ^ Word8.toString w) | printShift(ShiftASR w, stream) = stream(" ASR " ^ Word8.toString w) | printShift(ShiftNone, _) = () fun printFloatSize(Float32, stream) = stream "F" | printFloatSize(Double64, stream) = stream "D" fun printICode {stream, printGenReg, ...} (MoveRegister{ source, dest }: ('a, 'b, 'c) arm64ICode) = ( stream "\tMove\t"; printGenReg source; stream " => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (LoadNonAddressConstant{ source, dest }) = ( stream "\tLoadNonAddress\t"; stream(Word64.toString source); stream " => "; printGenReg dest ) | printICode {stream, printFPReg, ...} (LoadFPConstant{ source, dest, floatSize }) = ( stream "\tLoadFPConstant"; printFloatSize(floatSize, stream); stream "\t"; stream(Word64.toString source); stream " => "; printFPReg dest ) | printICode {stream, printGenReg, ...} (LoadAddressConstant{ source, dest }) = ( stream "\tLoadAddress\t"; stream(Address.stringOfWord source); stream " => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (LoadWithConstantOffset{ base, dest, byteOffset, loadType }) = ( stream "\tLoadConstOffset"; printLoadType(loadType, stream); stream "\t["; printGenReg base; stream "]+"; stream(Int.toString byteOffset); stream " => "; printGenReg dest ) | printICode {stream, printGenReg, printFPReg, ...} (LoadFPWithConstantOffset{ base, dest, byteOffset, floatSize }) = ( stream "\tLoadConstOffset"; printFloatSize(floatSize, stream); stream "\t["; printGenReg base; stream "]+"; stream(Int.toString byteOffset); stream " => "; printFPReg dest ) | printICode {stream, printGenReg, ...} (LoadWithIndexedOffset{ base, dest, index, loadType, signExtendIndex }) = ( stream "\tLoadIndexed"; printLoadType(loadType, stream); stream "\t["; printGenReg base; stream "+"; printGenReg index; if signExtendIndex then stream " SX" else (); stream "] => "; printGenReg dest ) | printICode {stream, printGenReg, printFPReg, ...} (LoadFPWithIndexedOffset{ base, dest, index, floatSize, signExtendIndex }) = ( stream "\tLoadIndexed"; printFloatSize(floatSize, stream); stream "\t["; printGenReg base; stream "+"; printGenReg index; if signExtendIndex then stream " SX" else (); stream "] => "; printFPReg dest ) | printICode 
{stream, printGenReg, ...} (GetThreadId { dest}) = ( stream "\tGetThreadId\t"; stream " => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (ObjectIndexAddressToAbsolute{ source, dest }) = ( stream "\tObjectAddrToAbs\t"; printGenReg source; stream " => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (AbsoluteToObjectIndex{ source, dest }) = ( stream "\tAbsToObjectAddr\t"; printGenReg source; stream " => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (AllocateMemoryFixed{bytesRequired, dest, saveRegs}) = ( stream "\tAllocateMemory\t"; stream(Word64.fmt StringCvt.DEC bytesRequired); stream " => "; printGenReg dest; stream " save="; printSaves(saveRegs, stream, printGenReg) ) | printICode {stream, printGenReg, ...} (AllocateMemoryVariable{size, dest, saveRegs}) = ( stream "\tAllocateMemory\t"; stream "s="; printGenReg(size); stream " => "; printGenReg dest; stream " save="; printSaves(saveRegs, stream, printGenReg) ) | printICode {stream, printGenReg, ...} (InitialiseMem{size, addr, init}) = ( stream "\tInitialiseMem\t"; stream "s="; printGenReg(size); stream ",i="; printGenReg(init); stream ",a="; printGenReg(addr) ) | printICode {stream, ...} BeginLoop = stream "\tBeginLoop" | printICode {stream, printGenReg, ...} (JumpLoop{regArgs, stackArgs, checkInterrupt, ... }) = ( stream "\tJumpLoop\t"; List.app(fn {src, dst} => (printGenReg(dst); stream "="; printArg(src, stream, printGenReg); stream " ")) regArgs; List.app( fn {src, wordOffset, stackloc} => (printStackLoc(stackloc, stream); stream("(sp" ^ Int.toString wordOffset); stream ")="; printArg(src, stream, printGenReg); stream " ") ) stackArgs; case checkInterrupt of NONE => () | SOME saveRegs => (stream " Check:save="; printSaves(saveRegs, stream, printGenReg)) ) | printICode {stream, printGenReg, ...} (StoreWithConstantOffset{ base, source, byteOffset, loadType }) = ( stream "\tStoreConstOffset"; printLoadType(loadType, stream); stream "\t"; printGenReg source; stream " => ["; printGenReg base; stream "+"; stream(Int.toString byteOffset); stream "]" ) | printICode {stream, printGenReg, printFPReg, ...} (StoreFPWithConstantOffset{ base, source, byteOffset, floatSize }) = ( stream "\tStoreConstOffset"; printFloatSize(floatSize, stream); stream "\t"; printFPReg source; stream " => ["; printGenReg base; stream "+"; stream(Int.toString byteOffset); stream "]" ) | printICode {stream, printGenReg, ...} (StoreWithIndexedOffset{ base, source, index, loadType, signExtendIndex }) = ( stream "\tStoreIndexed"; printLoadType(loadType, stream); stream "\t"; printGenReg source; stream " => ["; printGenReg base; stream "+"; printGenReg index; if signExtendIndex then stream " SX" else (); stream "]" ) | printICode {stream, printGenReg, printFPReg, ...} (StoreFPWithIndexedOffset{ base, source, index, floatSize, signExtendIndex }) = ( stream "\tStoreIndexed"; printFloatSize(floatSize, stream); stream "\t"; printFPReg source; stream " => ["; printGenReg base; stream "+"; printGenReg index; if signExtendIndex then stream " SX" else (); stream "]" ) | printICode {stream, printGenReg, printOptGenReg, ...} (AddSubImmediate{ source, dest, ccRef, immed, isAdd, length }) = ( stream (if isAdd then "\tAddImmediate" else "\tSubImmediate"); stream(arithRepr length); stream "\t"; printGenReg source; stream ",0x"; stream(Word.toString immed); stream " => "; printOptGenReg dest; case ccRef of NONE => () | SOME cc => (stream ", "; printCC(cc, stream)) ) | printICode {stream, printGenReg, printOptGenReg, ...} (AddSubRegister{ base, 
shifted, dest, ccRef, isAdd, length, shift }) = ( stream (if isAdd then "\tAddRegister" else "\tSubRegister"); stream(arithRepr length); stream "\t"; printGenReg base; stream ", "; printGenReg(shifted); printShift(shift, stream); stream " => "; printOptGenReg dest; case ccRef of NONE => () | SOME cc => (stream ", "; printCC(cc, stream)) ) | printICode {stream, printGenReg, printOptGenReg, ...} (LogicalImmediate{ source, dest, ccRef, immed, logOp, length }) = ( stream (case logOp of LogAnd => "\tAndImmediate" | LogOr => "\tOrImmediate" | LogXor => "\tXorImmediate"); stream(arithRepr length); stream "\t"; printGenReg source; stream ",0x"; stream(Word64.toString immed); stream " => "; printOptGenReg dest; case ccRef of NONE => () | SOME cc => (stream ", "; printCC(cc, stream)) ) | printICode {stream, printGenReg, printOptGenReg, ...} (LogicalRegister{ base, shifted, dest, ccRef, logOp, length, shift }) = ( stream (case logOp of LogAnd => "\tAndRegister" | LogOr => "\tOrRegister" | LogXor => "\tXorRegister"); stream(arithRepr length); stream "\t"; printGenReg base; stream ", "; printGenReg(shifted); printShift(shift, stream); stream " => "; printOptGenReg dest; case ccRef of NONE => () | SOME cc => (stream ", "; printCC(cc, stream)) ) | printICode {stream, printGenReg, ...} (ShiftRegister{ direction, dest, source, shift, opSize }) = ( stream ( case direction of ShiftLeft => "\tShiftLeft" | ShiftRightLogical => "\tShiftRightLog" | ShiftRightArithmetic => "\tShiftRightArith"); stream(arithRepr opSize); stream "\t"; printGenReg source; stream " by "; printGenReg(shift); stream " => "; printGenReg dest ) | printICode {stream, printGenReg, printOptGenReg, ...} (Multiplication{ kind, dest, sourceA, sourceM, sourceN }) = ( stream ( case kind of MultAdd32 => "\tMultAdd32\t" | MultSub32 => "\tMultSub32\t" | MultAdd64 => "\tMultAdd64\t" | MultSub64 => "\tMultSub64\t" | SignedMultAddLong => "\tSignedMultAddLong\t" | SignedMultHigh => "\tSignedMultHigh\t"); printGenReg(sourceM); stream " * "; printGenReg(sourceN); stream " +/- "; printOptGenReg sourceA; stream " => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (Division{ isSigned, dest, dividend, divisor, opSize }) = ( stream (if isSigned then "\tSignedDivide" else "\tUnsignedDivide"); stream(arithRepr opSize); stream "\t"; printGenReg(dividend); stream " by "; printGenReg(divisor); stream " => "; printGenReg dest ) - | printICode {stream, printGenReg, ...} (BeginFunction{ regArgs, stackArgs }) = + | printICode {stream, printGenReg, printFPReg, ...} (BeginFunction{ regArgs, stackArgs, fpRegArgs }) = ( stream "\tBeginFunction\t"; List.app(fn (arg, r) => (stream(regRepr r); stream "="; printGenReg(arg); stream " ")) regArgs; + List.app(fn (arg, r) => (stream(vRegRepr r); stream "="; printFPReg(arg); stream " ")) fpRegArgs; List.app(fn s => printStackLoc(s, stream)) stackArgs ) - | printICode {stream, printGenReg, ...} (FunctionCall{callKind, regArgs, stackArgs, dests, saveRegs, containers}) = + | printICode {stream, printGenReg, printFPReg, ...} (FunctionCall{callKind, regArgs, stackArgs, dests, + fpRegArgs, fpDests, saveRegs, containers}) = ( stream "\tFunctionCall\t"; case callKind of Recursive => stream "recursive " | ConstantCode m => (stream(stringOfWord m); stream " ") | FullCall => (); stream "("; List.app(fn (arg, r) => (stream(regRepr r); stream "="; printArg(arg, stream, printGenReg); stream " ")) regArgs; + List.app(fn (arg, r) => (stream(vRegRepr r); stream "="; printFPReg(arg); stream " ")) fpRegArgs; List.app(fn arg => (stream 
"p="; printArg(arg, stream, printGenReg); stream " ")) stackArgs; stream ") "; List.app(fn (pr, r) => (stream(regRepr r); stream "=>"; printGenReg pr; stream " ")) dests; + List.app(fn (pr, r) => (stream(vRegRepr r); stream "=>"; printFPReg pr; stream " ")) fpDests; stream " save="; printSaves(saveRegs, stream, printGenReg); if null containers then () else (stream " containers="; List.app (fn c => (printStackLoc(c, stream); stream " ")) containers) ) - | printICode {stream, printGenReg, ...} (TailRecursiveCall{callKind, regArgs, stackArgs, stackAdjust, currStackSize, ...}) = + | printICode {stream, printGenReg, printFPReg, ...} (TailRecursiveCall{callKind, regArgs, stackArgs, stackAdjust, currStackSize, fpRegArgs, ...}) = ( stream "\tTailCall\t"; case callKind of Recursive => stream "recursive " | ConstantCode m => (stream(stringOfWord m); stream " ") | FullCall => (); List.app(fn (arg, r) => (stream(regRepr r); stream "="; printArg(arg, stream, printGenReg); stream " ")) regArgs; + List.app(fn (arg, r) => (stream(vRegRepr r); stream "="; printFPReg(arg); stream " ")) fpRegArgs; List.app(fn {src, stack} => (stream (Int.toString stack); stream "<="; printArg(src, stream, printGenReg); stream " ")) stackArgs; stream "adjust="; stream(Int.toString stackAdjust); stream " stackSize="; stream(Int.toString currStackSize) ) - | printICode {stream, printGenReg, ...} (ReturnResultFromFunction{ results, returnReg, numStackArgs }) = + | printICode {stream, printGenReg, printFPReg, ...} + (ReturnResultFromFunction{ results, fpResults, returnReg, numStackArgs }) = ( stream "\tReturnFromFunction\t"; printGenReg(returnReg); stream "with "; List.app(fn (reg, r) => (stream(regRepr r); stream "=>"; printGenReg reg; stream " ")) results; + List.app(fn (reg, r) => (stream(vRegRepr r); stream "=>"; printFPReg reg; stream " ")) fpResults; stream("," ^ Int.toString numStackArgs) ) | printICode {stream, printGenReg, ...} (RaiseExceptionPacket{ packetReg }) = ( stream "\tRaiseException\t"; printGenReg(packetReg) ) | printICode {stream, printGenReg, ...} (PushToStack{ source, copies, container }) = ( stream "\tPushToStack\t"; printGenReg source; if copies > 1 then (stream " * "; stream(Int.toString copies)) else (); stream " => "; printStackLoc(container, stream) ) | printICode {stream, printGenReg, ...} (LoadStack{ dest, wordOffset, container, field }) = ( stream "\tLoadStack\t"; printStackLoc(container, stream); stream " + "; stream(Int.toString field); stream " ("; stream(Int.toString wordOffset); stream ")"; stream " => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (StoreToStack{ source, container, field, stackOffset }) = ( stream "\tStoreToStack\t"; printGenReg source; stream " => "; printStackLoc(container, stream); stream "+"; stream (Int.toString field); stream "("; stream(Int.toString stackOffset); stream ")" ) | printICode {stream, printGenReg, ...} (ContainerAddress{ dest, container, stackOffset }) = ( stream "\tContainerAddress\t"; stream "@"; printStackLoc(container, stream); stream " ("; stream(Int.toString stackOffset); stream ") => "; printGenReg dest ) | printICode {stream, ...} (ResetStackPtr{ numWords }) = ( stream "\tResetStackPtr\t"; stream(Int.toString numWords) ) | printICode {stream, printGenReg, ...} (TagValue{ source, dest, isSigned, opSize }) = ( stream "\tTag"; stream(if isSigned then "Signed" else "Unsigned"); stream(arithRepr opSize); stream "\t"; printGenReg source; stream " => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (UntagValue{ source, dest, 
isSigned, opSize }) = ( stream "\tUntag"; stream(if isSigned then "Signed" else "Unsigned"); stream(arithRepr opSize); stream "\t"; printGenReg source; stream " => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (BoxLarge{source, dest, saveRegs}) = ( stream "\tBoxLarge\t"; printGenReg source; stream " => "; printGenReg dest; stream " save="; printSaves(saveRegs, stream, printGenReg) ) | printICode {stream, printGenReg, ...} (UnboxLarge{source, dest}) = ( stream "\tUnboxLarge\t"; printGenReg source; stream " => "; printGenReg dest ) | printICode {stream, printGenReg, printFPReg, ...} (BoxTagFloat{floatSize, source, dest, saveRegs}) = ( stream "\tBoxTagFloat"; printFloatSize(floatSize, stream); stream "\t"; printFPReg source; stream " => "; printGenReg dest; stream " save="; printSaves(saveRegs, stream, printGenReg) ) | printICode {stream, printGenReg, printFPReg, ...} (UnboxTagFloat{floatSize, source, dest}) = ( stream "\tUnboxTagFloat"; printFloatSize(floatSize, stream); stream "\t"; printGenReg source; stream " => "; printFPReg dest ) | printICode {stream, printGenReg, ...} (LoadAcquire{ base, dest, loadType }) = ( stream "\tLoadAcquire"; printLoadType(loadType, stream); stream "\t["; printGenReg base; stream "] => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (StoreRelease{ base, source, loadType }) = ( stream "\tStoreRelease"; printLoadType(loadType, stream); stream "\t"; printGenReg source; stream " => ["; printGenReg base; stream "]" ) | printICode {stream, printGenReg, ...} (BitFieldShift{ source, dest, isSigned, length, immr, imms }) = ( stream "\tBitShift"; stream(if isSigned then "Signed" else "Unsigned"); stream(arithRepr length); stream "\t"; printGenReg source; stream " => "; printGenReg dest; stream " immr="; stream(Word.fmt StringCvt.DEC immr); stream " imms="; stream(Word.fmt StringCvt.DEC imms) ) | printICode {stream, printGenReg, ...} (BitFieldInsert{ source, dest, destAsSource, length, immr, imms }) = ( stream "\tBitInsert"; stream(arithRepr length); stream "\t"; printGenReg source; stream " with "; printGenReg destAsSource; stream " => "; printGenReg dest; stream " immr="; stream(Word.fmt StringCvt.DEC immr); stream " imms="; stream(Word.fmt StringCvt.DEC imms) ) | printICode {stream, printGenReg, ...} (IndexedCaseOperation{testReg}) = ( stream "\tIndexedCase\t"; printGenReg testReg ) | printICode {stream, ...} PushExceptionHandler = stream "\tPushExcHandler" | printICode {stream, ...} PopExceptionHandler = stream "\tPopExcHandler" | printICode {stream, printGenReg, ...} (BeginHandler{packetReg}) = ( stream "\tBeginHandler\t"; printGenReg packetReg ) | printICode {stream, printGenReg, ...} (CompareByteVectors{vec1Addr, vec2Addr, length, ccRef, ...}) = ( stream "\tCompareByteVectors\t"; printGenReg(vec1Addr); stream ","; printGenReg(vec2Addr); stream ","; printGenReg(length); stream " => "; printCC(ccRef, stream) ) | printICode {stream, printGenReg, ...} (BlockMove{srcAddr, destAddr, length, isByteMove}) = ( stream(if isByteMove then "\tBlockByteMove\t" else "\tBlockWordMove\t"); stream "src="; printGenReg(srcAddr); stream ",dest="; printGenReg(destAddr); stream ",len="; printGenReg(length) ) | printICode {stream, printGenReg, printOptGenReg, ...} (AddSubXSP{ source, dest, isAdd }) = ( stream(if isAdd then "\tAdd\t" else "\tSubtract\t"); printGenReg source; stream " XSP => "; printOptGenReg dest ) | printICode {stream, printGenReg, ...} (TouchValue{ source }) = ( stream "\tTouchValue\t"; printGenReg source ) | printICode {stream, printGenReg, 
...} (LoadAcquireExclusive{ base, dest }) = ( stream "\tLoadExclusive\t["; printGenReg base; stream "] => "; printGenReg dest ) | printICode {stream, printGenReg, printOptGenReg, ...} (StoreReleaseExclusive{ base, source, result }) = ( stream "\tStoreExclusive\t"; printOptGenReg source; stream " => ["; printGenReg base; stream "] result => "; printGenReg result ) | printICode {stream, ...} MemoryBarrier = stream "\tMemoryBarrier" | printICode {stream, printGenReg, printFPReg, ...} (ConvertIntToFloat{ source, dest, srcSize, destSize}) = ( stream "\tConvert"; stream(arithRepr srcSize); stream "To"; printFloatSize(destSize, stream); stream "\t"; printGenReg source; stream " => "; printFPReg dest ) | printICode {stream, printGenReg, printFPReg, ...} (ConvertFloatToInt{ source, dest, srcSize, destSize, rounding}) = let open IEEEReal in stream "\tConvert"; printFloatSize(srcSize, stream); stream "To"; stream(arithRepr destSize); stream "\t"; printFPReg source; stream " => "; printGenReg dest; stream( case rounding of TO_NEAREST => " rounding" | TO_NEGINF => " rounding down" | TO_POSINF => " rounding up" | TO_ZERO => " truncating" ) end | printICode {stream, printFPReg, ...} (UnaryFloatingPt{ source, dest, fpOp}) = ( stream( case fpOp of NegFloat => "\tNegFloat\t" | NegDouble => "\tNegDouble\t" | AbsFloat => "\tAbsFloat\t" | AbsDouble => "\tAbsDouble\t" | ConvFloatToDble => "\tFloatToDble\t" - | ConvDbleToFloat => "\t\t" + | ConvDbleToFloat => "\tDbleToFloat\t" + | MoveDouble => "\tMoveDouble\t" + | MoveFloat => "\tMoveFloat\t" ); printFPReg source; stream " => "; printFPReg dest ) | printICode {stream, printFPReg, ...} (BinaryFloatingPoint{ arg1, arg2, dest, fpOp, opSize}) = ( stream( case fpOp of MultiplyFP => "\tMultiply" | DivideFP => "\tDivide" | AddFP => "\tAdd" | SubtractFP => "\tSubtract" ); printFloatSize(opSize, stream); stream "\t"; printFPReg arg1; stream ", "; printFPReg arg2; stream " => "; printFPReg dest ) | printICode {stream, printFPReg, ...} (CompareFloatingPoint{ arg1, arg2, opSize, ccRef}) = ( stream "\tCompare"; printFloatSize(opSize, stream); stream "\t"; printFPReg arg1; stream ", "; printFPReg arg2; stream ", "; printCC(ccRef, stream) ) | printICode {stream, ...} CPUYield = stream "\tCpuYield" | printICode {stream, printGenReg, printOptGenReg, ...} (AtomicOperation{ base, source, dest, atOp }) = ( case atOp of LoadAddAL => stream "\tLoadAddAL\t" | LoadUmaxAL => stream "\tLoadUmaxAL\t" | SwapAL => stream "\tSwapAL\t" | LoadAddAcquire => stream "\tLoadAddAcquire\t" | LoadUMaxAcquire => stream "\tLoadUMaxAcquire\t" | SwapRelease => stream "\tSwapRelease\t"; printOptGenReg source; stream ",["; printGenReg base; stream "] => "; printOptGenReg dest ) - | printICode {stream, printGenReg, ...} (CacheCheck{ arg1, arg2}) = - ( stream "\tCacheCheck\t"; printGenReg arg1; stream ", "; printGenReg arg2 ) - and printCondition(cond, stream) = stream(condToString cond) (* Print a basic block. *) fun printBlock {stream, printGenReg, printOptGenReg, printFPReg} (blockNo, BasicBlock{block, flow, ...}) = ( (* Put a label on all but the first. 
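As a purely illustrative example (not output from an actual run; the register and block numbers are invented), a printed abstract block containing a move, an add-immediate and an unconditional branch would come out roughly as
L3:	Move	R5 => R7
	AddImmediate64	R7,0x1 => R8, CC1
	Jump	L4
with the "L3:" label omitted for block 0.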
*) if blockNo <> 0 then stream("L" ^ Int.toString blockNo ^ ":") else (); List.app (fn icode => (printICode {stream=stream, printGenReg=printGenReg, printOptGenReg=printOptGenReg, printFPReg=printFPReg} (icode); stream "\n")) block; case flow of Unconditional l => stream("\tJump\tL" ^ Int.toString l ^ "\n") | Conditional {condition, trueJump, falseJump, ccRef, ...} => ( stream "\tJump"; printCondition(condition, stream); stream "\t"; printCC(ccRef, stream); stream " L"; stream (Int.toString trueJump); stream " else L"; stream (Int.toString falseJump); stream "\n" ) | ExitCode => () | IndexedBr _ => () | SetHandler{handler, continue} => stream(concat["\tSetHandler\tH", Int.toString handler, "\n", "\tJump\tL", Int.toString continue, "\n"]) | UnconditionalHandle handler => stream("\tJump\tH" ^ Int.toString handler ^ "\n") | ConditionalHandle{handler, continue} => stream(concat["\tJump\tL", Int.toString continue, " or H", Int.toString handler, "\n"]) ) in fun printPReg stream (PReg i) = stream("R" ^ Int.toString i) fun printOptPReg stream ZeroReg = stream "Zero" | printOptPReg stream (SomeReg reg) = printPReg stream reg fun printXReg stream (XReg w) = stream("X" ^ Int.toString(Word8.toInt w)) | printXReg stream XZero = stream "XZ" | printXReg stream XSP = stream "XSP" fun printVReg stream (VReg w) = stream("V" ^ Int.toString(Word8.toInt w)) fun printICodeAbstract(blockVec, stream) = Vector.appi(printBlock{stream=stream, printGenReg=printPReg stream, printOptGenReg=printOptPReg stream, printFPReg=printPReg stream}) blockVec and printICodeConcrete(blockVec, stream) = Vector.appi(printBlock{stream=stream, printGenReg=printXReg stream, printOptGenReg=printXReg stream, printFPReg=printVReg stream}) blockVec end (* Only certain bit patterns are allowed in a logical immediate instruction but the encoding is complex so it's easiest to inherit the test from the assembler layer. *) local fun optow OpSize32 = WordSize32 | optow OpSize64 = WordSize64 in fun isEncodableBitPattern(v, w) = Arm64Code.isEncodableBitPattern(v, optow w) end (* This generates a BitField instruction with the appropriate values for immr and imms. 
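As an illustrative worked case (not part of the source; d and s stand for arbitrary destination and source registers): a 64-bit left shift by three, shiftConstant{direction=ShiftLeft, dest=d, source=s, shift=0w3, opSize=OpSize64}, produces BitFieldShift{source=s, dest=d, isSigned=false, length=OpSize64, immr=0w61, imms=0w60}, i.e. immr = ~shift mod 64 and imms = 63 - shift, which is the unsigned bit-field (UBFM) encoding of LSL #3.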
*) fun shiftConstant{ direction, dest, source, shift, opSize } = let val (isSigned, immr, imms) = case (direction, opSize) of (ShiftLeft, OpSize64) => (false, Word.~ shift mod 0w64, 0w64-0w1-shift) | (ShiftLeft, OpSize32) => (false, Word.~ shift mod 0w32, 0w32-0w1-shift) | (ShiftRightLogical, OpSize64) => (false, shift, 0wx3f) | (ShiftRightLogical, OpSize32) => (false, shift, 0wx1f) | (ShiftRightArithmetic, OpSize64) => (true, shift, 0wx3f) | (ShiftRightArithmetic, OpSize32) => (true, shift, 0wx1f) in BitFieldShift{ source=source, dest=dest, isSigned=isSigned, length=opSize, immr=immr, imms=imms } end structure Sharing = struct type xReg = xReg and vReg = vReg and reg = reg and condition = condition and shiftType = shiftType and ('genReg, 'optGenReg, 'fpReg) arm64ICode = ('genReg, 'optGenReg, 'fpReg) arm64ICode and preg = preg and pregOrZero = pregOrZero and controlFlow = controlFlow and ('genReg, 'optGenReg, 'fpReg) basicBlock = ('genReg, 'optGenReg, 'fpReg) basicBlock and stackLocn = stackLocn and regProperty = regProperty and ccRef = ccRef and 'genReg fnarg = 'genReg fnarg and closureRef = closureRef and loadType = loadType and opSize = opSize and logicalOp = logicalOp and callKind = callKind and floatSize = floatSize and shiftDirection = shiftDirection and multKind = multKind and fpUnary = fpUnary and fpBinary = fpBinary and atomicOp = atomicOp end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICodeOptimise.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICodeOptimise.ML index 4add52b0..e73755ed 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICodeOptimise.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICodeOptimise.ML @@ -1,622 +1,627 @@ (* - Copyright David C. J. Matthews 2021 + Copyright David C. J. Matthews 2021-2 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64ICodeOptimise( structure Arm64ICode: ARM64ICODE structure IntSet: INTSET structure Identify: ARM64IDENTIFYREFERENCES structure Debug: DEBUG structure Pretty: PRETTY sharing Arm64ICode.Sharing = Identify.Sharing = IntSet = Arm64ICode ): ARM64ICODEOPTIMISE = struct open Arm64ICode open IntSet open Identify datatype optimise = Changed of (preg, pregOrZero, preg) basicBlock vector * regProperty vector | Unchanged exception InternalError = Misc.InternalError (* Optimiser. This could incorporate optimisations done elsewhere. IdentifyReferences currently removes instructions that produce results in registers that are never used. + AllocateRegisters deals generally with caching. - PushRegisters deals with caching. Caching involves - speculative changes that can be reversed if there is a need - to spill registers. - - The optimiser currently deals with booleans and conditions. + The optimiser currently deals with booleans and conditions and with + constants. *) (* This is a rewrite of the last instruction to set a boolean. This is almost always rewriting the next instruction. 
The only possibility is that we have a ResetStackPtr in between. *) datatype boolRegRewrite = BRNone (* BRSetConditionToConstant - we have a comparison of two constant values. This will usually happen because we've duplicated a branch and set a register to a constant which we then compare. *) - | BRSetConditionToConstant of - { srcCC: ccRef, signedCompare: order, unsignedCompare: order } + | BRSetConditionToConstant of { srcCC: ccRef, test: condition -> bool } fun optimiseICode{ code, pregProps, ccCount=_, debugSwitches=_ } = let val hasChanged = ref false val pregPropSize = Vector.length pregProps val regCounter = ref pregPropSize (* Allocate new registers after the old. *) val regList = ref [] (* New properties are added in reverse order. *) fun newReg kind = ( regList := kind :: ! regList; PReg (!regCounter) ) before regCounter := !regCounter + 1 (* Constant values in registers. *) datatype regValue = SomeValue | NonAddressConst of Word64.word | AddressConstant of machineWord + | LargeBox of preg + | RealBox of preg * floatSize + | TaggedValue of preg * bool * opSize (* If this argument is a register and the register is mapped to a constant or another register replace the value. Unlike the X86 version we don't map memory locations but we do map registers. *) (* TODO: This is potentially quadratic if we have a long piece of code with very many registers. *) fun getRegisterValue(preg as PReg pregNo, kill, regMap) = ( case List.find(fn {dest, ... } => dest = preg) regMap of SOME { source, ...} => ( source, (* Filter it if it is the last reference. *) if member(pregNo, kill) then List.filter(fn {dest, ...} => dest <> preg) regMap else regMap ) | NONE => (SomeValue, regMap) ) fun optimiseBlock processed (block, flow, outCCState) = let fun optCode([], brCond, regMap, code, changed) = (code, brCond, regMap, changed) - | optCode({instr as AddSubImmediate{source, dest=ZeroReg, ccRef=SOME ccRefOut, immed, isAdd=false, length}, kill, ...} :: rest, - _, regMap, code, changed) = - let - val (repArg1, memRefsOut) = getRegisterValue(source, kill, regMap) - in - case repArg1 of - NonAddressConst test => - (* AddSubImmediate is put in by CodetreeToIcode to test a boolean value. It can also - arise as the result of pattern matching on booleans or even by tests such as = true. - If the source register is now a constant we want to propagate the constant - condition. *) - let - (* This comparison reduces to a constant. *) - val _ = hasChanged := true - (* Signed comparison. If this is a 32-bit operation the top word - could be zero so we need to convert this as Word32. - immediate values are always unsigned. *) - val testValue = - case length of - OpSize64 => Word64.toLargeIntX test - | OpSize32 => Word32.toLargeIntX(Word32.fromLarge test) - (* Put in a replacement so that if we were previously testing ccRefOut - we should instead test ccRef. *) - val repl = - BRSetConditionToConstant{srcCC=ccRefOut, - signedCompare=LargeInt.compare(testValue, Word.toLargeInt immed), - unsignedCompare=Word64.compare(test, Word64.fromLarge(Word.toLargeWord immed))} - val _ = isSome outCCState andalso raise InternalError "optCode: CC exported" - in - optCode(rest, repl, memRefsOut, code, true) - end - - | _ => optCode(rest, BRNone, memRefsOut, instr :: code, changed) - end - | optCode({instr as AddSubImmediate{source, dest=SomeReg dest, ccRef=NONE, immed, isAdd, length}, kill, ...} :: rest, _, regMap, code, changed) = (* This is frequently used to remove a tag from a value before an addition or subtraction.
If it's a constant we want to do that now. *) let val (repArg1, newMap) = getRegisterValue(source, kill, regMap) in case repArg1 of NonAddressConst cVal => let val addSub = if isAdd then cVal + Word.toLarge immed else cVal - Word.toLarge immed (* Mask the result to 32-bits if this is a 32-bit operation. *) val result = case length of OpSize32 => Word64.andb(addSub, Word64.<<(0w1, 0w32) - 0w1) | OpSize64 => addSub in optCode(rest, BRNone, {dest=dest, source=NonAddressConst result} :: newMap, LoadNonAddressConstant{dest=dest, source=result} :: code, true) end | _ => optCode(rest, BRNone, newMap, instr ::code, changed) end | optCode({instr as AddSubRegister{base, shifted, dest, ccRef, isAdd, length, shift}, kill, ...} :: rest, _, regMap, code, changed) = let (* If we have a constant as the second argument we can change this to the immediate form. *) val (repOp1, mapOp1) = getRegisterValue(base, kill, regMap) val (repOp2, mapOp2) = getRegisterValue(shifted, kill, mapOp1) val regAndImmed = case (repOp1, repOp2, isAdd, shift) of (_, NonAddressConst immed, _, ShiftNone) => if immed < 0w4096 then SOME(base, immed) else NONE (* If this is an ADD we can also use the immediate if the first arg is a constant. *) | (NonAddressConst immed, _, true, ShiftNone) => if immed < 0w4096 then SOME(shifted, immed) else NONE | _ => NONE in case regAndImmed of SOME(srcReg, immediate) => optCode(rest, BRNone, mapOp2, AddSubImmediate{source=srcReg, dest=dest, ccRef=ccRef, immed=Word.fromLargeWord(Word64.toLargeWord immediate), isAdd=isAdd, length=length} :: code, true) | NONE => optCode(rest, BRNone, mapOp2, instr :: code, changed) end + | optCode({instr as LogicalImmediate{source, dest=ZeroReg, ccRef=SOME ccRefOut, immed, logOp=LogAnd, ...}, kill, ...} :: rest, + _, regMap, code, changed) = + let + val (repArg1, memRefsOut) = getRegisterValue(source, kill, regMap) + in + case repArg1 of + NonAddressConst test => + (* This was previously used to test a boolean but that has now changed. It may also + occur as a result of pattern matching. *) + let + (* This comparison reduces to a constant. *) + val _ = hasChanged := true + val result = LargeWord.andb(test, immed) + (* Put in a replacement so that if we were previously testing ccRefOut + we should instead test ccRef. *) + fun testResult CondEqual = result = 0w0 + | testResult CondNotEqual = result <> 0w0 + | testResult _ = raise InternalError "testResult: invalid condition" + val repl = + BRSetConditionToConstant{srcCC=ccRefOut, test=testResult} + val _ = isSome outCCState andalso raise InternalError "optCode: CC exported" + in + optCode(rest, repl, memRefsOut, code, true) + end + + | _ => optCode(rest, BRNone, memRefsOut, instr :: code, changed) + end + | optCode({instr as LogicalRegister{base, shifted, dest, ccRef, logOp, length, shift}, kill, ...} :: rest, _, regMap, code, changed) = let (* If we have a constant as the second argument we can change this to the immediate form. 
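For instance (an illustrative case, not from the source): an AND whose shifted operand is known to hold 0wxff, a pattern that isEncodableBitPattern accepts, is rewritten as LogicalImmediate{source=base, dest=dest, ccRef=ccRef, immed=0wxff, logOp=LogAnd, length=length}, so the register that held the constant is no longer needed.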
*) val (repOp1, mapOp1) = getRegisterValue(base, kill, regMap) val (repOp2, mapOp2) = getRegisterValue(shifted, kill, mapOp1) val regAndImmed = case (repOp1, repOp2, shift) of (_, NonAddressConst immed, ShiftNone) => if isEncodableBitPattern (immed, length) then SOME(base, immed) else NONE | (NonAddressConst immed, _, ShiftNone) => if isEncodableBitPattern (immed, length) then SOME(shifted, immed) else NONE | _ => NONE in case regAndImmed of SOME(srcReg, immediate) => optCode(rest, BRNone, mapOp2, LogicalImmediate{source=srcReg, dest=dest, ccRef=ccRef, immed=immediate, logOp=logOp, length=length} :: code, true) | NONE => optCode(rest, BRNone, mapOp2, instr :: code, changed) end | optCode({instr as MoveRegister{dest, source}, kill, ...} :: rest, inCond, regMap, code, changed) = let val (repSource, mapAfterReplace) = getRegisterValue(source, kill, regMap) (* If the source is mapped to a constant we can set the destination to the same constant. *) val newMap = case repSource of SomeValue => mapAfterReplace | _ => {dest=dest, source=repSource} :: mapAfterReplace in optCode(rest, inCond, newMap, instr :: code, changed) end | optCode({instr as LoadNonAddressConstant{dest, source}, ...} :: rest, inCond, regMap, code, changed) = let (* If we already have a register with this constant we would probably be better off reusing it. The map, though, needs to indicate that the destination register contains the constant. The X86 version always uses a load-constant here. *) val newInstr = case List.find(fn {source=NonAddressConst c, ... } => c = source | _ => false) regMap of SOME{dest=cDest, ...} => MoveRegister{dest=dest, source=cDest} | NONE => instr in optCode(rest, inCond, {dest=dest, source=NonAddressConst source} :: regMap, newInstr :: code, changed) end | optCode({instr as LoadAddressConstant{dest, source}, ...} :: rest, inCond, regMap, code, changed) = (* Address constant. This is used in conjunction with UnboxValue. *) optCode(rest, inCond, {dest=dest, source=AddressConstant source} :: regMap, instr :: code, changed) + | optCode({instr as BoxLarge{ source, dest, ... }, ...} :: rest, inCond, regMap, code, changed) = + (* Try to eliminate adjacent sequences of boxing and unboxing. *) + optCode(rest, inCond, {dest=dest, source=LargeBox source} :: regMap, instr :: code, changed) + + | optCode({instr as BoxTagFloat{ source, dest, floatSize, ... }, ...} :: rest, inCond, regMap, code, changed) = + optCode(rest, inCond, {dest=dest, source=RealBox(source, floatSize)} :: regMap, instr :: code, changed) + + | optCode({instr as TagValue{ source, dest, isSigned, opSize}, ...} :: rest, inCond, regMap, code, changed) = + optCode(rest, inCond, {dest=dest, source=TaggedValue(source, isSigned, opSize)} :: regMap, instr :: code, changed) + | optCode({instr as UnboxLarge{ source, dest, ... }, kill, ...} :: rest, inCond, regMap, code, changed) = let val (repSource, mapAfterReplace) = getRegisterValue(source, kill, regMap) in (* If we're unboxing a constant address we can unpack it now. This is intended to handle constant arguments to LargeWord operations. *) case repSource of AddressConstant cVal => let (* Check this looks like a boxed large word.
*) open Address val addr = toAddress cVal val _ = isBytes addr andalso length addr = 0w8 div wordSize val wordConst: LargeWord.word = RunCall.unsafeCast cVal val newMap = {dest=dest, source=NonAddressConst wordConst} :: mapAfterReplace in optCode(rest, inCond, newMap, LoadNonAddressConstant{dest=dest, source=wordConst} :: code, true) end + + (* Or if it was recently boxed we can use the original and hopefully eliminate the box. *) + | LargeBox original => optCode(rest, inCond, mapAfterReplace, MoveRegister{dest=dest, source=original} :: code, true) + | _ => optCode(rest, inCond, mapAfterReplace, instr :: code, changed) end | optCode({instr as UntagValue{ source, dest, isSigned, opSize }, kill, ...} :: rest, inCond, regMap, code, changed) = (* If we're untagging a constant we can produce the result now. *) let val (repSource, mapAfterReplace) = getRegisterValue(source, kill, regMap) in case repSource of NonAddressConst cVal => let (* The result depends on the kind of untagging. Unsigned values can just be left shifted but signed values need to be treated differently for 32-bit and 64-bit operations. *) val wordConst = case (isSigned, opSize) of (false, _) => LargeWord.>>(cVal, 0w1) | (true, OpSize64) => LargeWord.~>>(cVal, 0w1) | (true, OpSize32) => Word32.toLarge(Word32.~>>(Word32.fromLarge cVal, 0w1)) val newMap = {dest=dest, source=NonAddressConst wordConst} :: mapAfterReplace in optCode(rest, inCond, newMap, LoadNonAddressConstant{dest=dest, source=wordConst} :: code, true) end + + (* Or if it was recently tagged we can eliminate the tagging and untagging. This occurs, for + example, if we extract a character and then put it into a string. Normally the lengths + will match but may use a 32-bit tagged value as a 64-bit value in the special case of + using a character to index into the array of single-character strings. Mismatched + signed/unsigned could occur with conversions between Word.word and FixedInt.int + which are generally no-ops. *) + | TaggedValue(original, wasSigned, oldOpSize) => + if isSigned = wasSigned andalso (case (oldOpSize, opSize) of (OpSize64, OpSize32) => false | _ => true) + then optCode(rest, inCond, mapAfterReplace, MoveRegister{dest=dest, source=original} :: code, true) + else optCode(rest, inCond, mapAfterReplace, instr :: code, changed) + | _ => optCode(rest, inCond, mapAfterReplace, instr :: code, changed) end | optCode({instr as UnboxTagFloat{ source, dest, floatSize=Float32 }, kill, ...} :: rest, inCond, regMap, code, changed) = let val (repSource, mapAfterReplace) = getRegisterValue(source, kill, regMap) in case repSource of NonAddressConst cVal => (* Should only be Float32 in native 64-bits. 
*) let val _ = not is32in64 andalso LargeWord.andb(cVal, 0wxffffffff) = 0w1 orelse raise InternalError "incorrect FP constant form" val fpConstant = LargeWord.>>(cVal, 0w32) in optCode(rest, inCond, mapAfterReplace, LoadFPConstant{dest=dest, floatSize=Float32, source=fpConstant} :: code, true) end | AddressConstant cVal => let open Address val addr = toAddress cVal val _ = is32in64 andalso length addr = 0w1 andalso flags addr = F_bytes orelse raise InternalError "incorrect FP constant form" val fpConstant = RunCall.loadPolyWord(addr, 0w0) in optCode(rest, inCond, mapAfterReplace, LoadFPConstant{dest=dest, floatSize=Float32, source=fpConstant} :: code, true) end + | RealBox(original, fSize) => + ( + fSize = Float32 orelse raise InternalError "Mismatch float size"; + optCode(rest, inCond, mapAfterReplace, UnaryFloatingPt{dest=dest, source=original, fpOp=MoveFloat} :: code, true) + ) + | _ => optCode(rest, inCond, mapAfterReplace, instr :: code, changed) end | optCode({instr as UnboxTagFloat{ source, dest, floatSize=Double64 }, kill, ...} :: rest, inCond, regMap, code, changed) = let val (repSource, mapAfterReplace) = getRegisterValue(source, kill, regMap) in case repSource of AddressConstant cVal => let open Address val addr = toAddress cVal val _ = length addr = (0w8 div wordSize) andalso flags addr = F_bytes orelse raise InternalError "incorrect FP constant form" val fpConstant = RunCall.loadNativeWord(addr, 0w0) in optCode(rest, inCond, mapAfterReplace, LoadFPConstant{dest=dest, floatSize=Double64, source=fpConstant} :: code, true) end + | RealBox(original, fSize) => + ( + fSize = Double64 orelse raise InternalError "Mismatch float size"; + optCode(rest, inCond, mapAfterReplace, UnaryFloatingPt{dest=dest, source=original, fpOp=MoveDouble} :: code, true) + ) + | _ => optCode(rest, inCond, mapAfterReplace, instr :: code, changed) end (* Some of these are specifically to reduce "ref" to its simplest form since it's generated as a variable-length item. *) | optCode({instr as AllocateMemoryVariable{ size, dest, saveRegs }, kill, ...} :: rest, inCond, regMap, code, changed) = (* Turn a variable size allocation into a fixed size allocation if the size is a constant. *) let val (repSize, mapAfterReplace) = getRegisterValue(size, kill, regMap) in case repSize of NonAddressConst words => let val wordsRequired = if is32in64 then (* Have to round this up to 8 bytes *) Word64.andb(words+0w2, ~ 0w2) else words+0w1 val bytesRequired = Word64.fromLarge(Word.toLarge Address.wordSize) * wordsRequired in optCode(rest, inCond, mapAfterReplace, AllocateMemoryFixed{bytesRequired=bytesRequired, dest=dest, saveRegs=saveRegs} :: code, true) end | _ => optCode(rest, inCond, mapAfterReplace, instr :: code, changed) end | optCode({instr as InitialiseMem{size, addr, init}, kill, ...} :: rest, inCond, regMap, code, changed) = (* If we're initialising "a few" words we're better off unrolling the loop. 
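For example (illustrative only): an InitialiseMem whose size register is known to hold 0w1, e.g. the initialisation of a simple ref cell, unrolls to a single StoreWithConstantOffset{source=init, base=addr, byteOffset=0, loadType=Load64} on native 64-bit (Load32 when is32in64).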
*) let val (repSize, mapAfterReplace) = getRegisterValue(size, kill, regMap) in case repSize of NonAddressConst words => if words <= 0w8 then let val nWords = LargeWord.toInt words fun unroll(n, l) = if n = nWords then l else unroll(n+1, StoreWithConstantOffset{ source=init, base=addr, byteOffset=n*Word.toInt Address.wordSize, loadType=if is32in64 then Load32 else Load64 } :: l) in optCode(rest, inCond, mapAfterReplace, unroll(0, code), true) end else optCode(rest, inCond, mapAfterReplace, instr :: code, changed) | _ => optCode(rest, inCond, mapAfterReplace, instr :: code, changed) end | optCode({instr as BitFieldShift{ source, dest, isSigned, length, immr, imms }, kill, ...} :: rest, inCond, regMap, code, changed) = (* Bit shift. Specifically this is used to shift the flags byte to construct a length word. The flags are frequently a constant. Unlike BitFieldInsert this sets unused bits to either zero or the sign bit. *) let val (repSource, mapAfterReplace) = getRegisterValue(source, kill, regMap) in case repSource of NonAddressConst cVal => let val regSize = case length of OpSize32 => 0w32 | OpSize64 => 0w64 in if not isSigned andalso imms + 0w1 = immr then (* Simple left shift: ignore the other cases for the moment. *) let val wordConst64 = Word64.<<(cVal, regSize-immr) val wordConst = case length of OpSize64 => wordConst64 | OpSize32 => Word64.andb(wordConst64, 0wxffffffff) val newMap = {dest=dest, source=NonAddressConst wordConst} :: mapAfterReplace in optCode(rest, inCond, newMap, LoadNonAddressConstant{dest=dest, source=wordConst} :: code, true) end else optCode(rest, inCond, mapAfterReplace, instr :: code, changed) end | _ => optCode(rest, inCond, mapAfterReplace, instr :: code, changed) end - | optCode({instr as BitFieldInsert{ source, dest, destAsSource, length=_, immr, imms }, kill, ...} :: rest, inCond, regMap, code, changed) = (* Bit field insertion. This is used to insert the length field into the register containing the shifted flags value. *) let val (repSource, mapAfterRepSrc) = getRegisterValue(source, kill, regMap) val (repDestAs, mapAfterReplace) = getRegisterValue(destAsSource, kill, mapAfterRepSrc) in case (repSource, repDestAs) of (NonAddressConst srcVal, NonAddressConst dstVal) => if immr = 0w0 then (* Insert new bits without shifting. *) let (* We take imms-immr+1 bits from the source. *) val maskSrc = Word64.>>(Word64.notb 0w0, 0w64-(imms+0w1)) val maskDst = Word64.notb maskSrc val wordConst = Word64.orb(Word64.andb(dstVal, maskDst), Word64.andb(srcVal, maskSrc)) val newMap = {dest=dest, source=NonAddressConst wordConst} :: mapAfterReplace in optCode(rest, inCond, newMap, LoadNonAddressConstant{dest=dest, source=wordConst} :: code, true) end else optCode(rest, inCond, mapAfterReplace, instr :: code, changed) | _ => optCode(rest, inCond, mapAfterReplace, instr :: code, changed) end (* Clear the cache across a function call. We would have to push these registers. *) | optCode({instr as FunctionCall _, ...} :: rest, _, _, code, changed) = optCode(rest, BRNone, [], instr::code, changed) | optCode({instr as BeginLoop, ...} :: rest, _, _, code, changed) = (* Any register value from outside the loop is not valid inside. *) optCode(rest, BRNone, [], instr::code, changed) | optCode({instr as JumpLoop _, ...} :: rest, _, _, code, changed) = (* Likewise at the end of the loop. Not sure if this is essential. *) optCode(rest, BRNone, [], instr::code, changed) (* CompareByteVectors and BlockMove modify their arguments. 
In particular if we have the length as a constant in a register it won't still have that value at the end. TODO: This could be refined. *) | optCode({instr as CompareByteVectors _, ...} :: rest, _, _, code, changed) = (* Likewise at the end of the loop. Not sure if this is essential. *) optCode(rest, BRNone, [], instr::code, changed) | optCode({instr as BlockMove _, ...} :: rest, _, _, code, changed) = (* Likewise at the end of the loop. Not sure if this is essential. *) optCode(rest, BRNone, [], instr::code, changed) - | optCode({instr, ...} :: rest, inCond, regMap, code, changed) = let (* If this instruction affects the CC the cached SetToCondition will no longer be valid. *) val afterCond = case getInstructionCC instr of CCUnchanged => inCond | _ => BRNone in optCode(rest, afterCond, regMap, instr::code, changed) end val (blkCode, finalRepl, finalMap, blockChanged) = optCode(block, BRNone, [], processed, false) val _ = if blockChanged then hasChanged := true else () in case (flow, finalRepl) of (* We have a Condition and a change to the condition. *) (flow as Conditional{ccRef, condition, trueJump, falseJump}, - BRSetConditionToConstant({srcCC, signedCompare, unsignedCompare, ...})) => + BRSetConditionToConstant({srcCC, test, ...})) => if srcCC = ccRef then let - val testResult = - case (condition, signedCompare, unsignedCompare) of - (CondEqual, EQUAL, _) => true - | (CondEqual, _, _) => false - | (CondNotEqual, EQUAL, _) => false - | (CondNotEqual, _, _) => true - | (CondSignedLess, LESS, _) => true - | (CondSignedLess, _, _) => false - | (CondSignedGreater, GREATER,_) => true - | (CondSignedGreater, _, _) => false - | (CondSignedLessEq, GREATER,_) => false - | (CondSignedLessEq, _, _) => true - | (CondSignedGreaterEq, LESS, _) => false - | (CondSignedGreaterEq, _, _) => true - | (CondCarryClear, _, LESS ) => true - | (CondCarryClear, _, _) => false - | (CondUnsignedHigher, _,GREATER) => true - | (CondUnsignedHigher, _, _) => false - | (CondUnsignedLowOrEq, _,GREATER) => false - | (CondUnsignedLowOrEq, _, _) => true - | (CondCarrySet, _, LESS ) => false - | (CondCarrySet, _, _) => true - (* The overflow and parity checks should never occur. *) - | _ => raise InternalError "getCondResult: comparison" val newFlow = - if testResult + if test condition then Unconditional trueJump else Unconditional falseJump val() = hasChanged := true in BasicBlock{flow=newFlow, block=List.rev blkCode} end else BasicBlock{flow=flow, block=List.rev blkCode} | (flow as Unconditional jmp, _) => let val ExtendedBasicBlock{block=targetBlck, locals, exports, flow=targetFlow, outCCState=targetCC, ...} = Vector.sub(code, jmp) (* If the target is empty or is simply one or more Resets or a Return we're better off merging this in rather than doing the jump. We allow a single Load e.g. when loading a constant or moving a register. If we have a CompareLiteral and we're comparing with a register in the map that has been set to a constant we include that because the comparison will then be reduced to a constant.
*) fun isSimple([], _, _) = true | isSimple ({instr=ResetStackPtr _, ...} :: instrs, moves, regMap) = isSimple(instrs, moves, regMap) | isSimple ({instr=ReturnResultFromFunction _, ...} :: instrs, moves, regMap) = isSimple(instrs, moves, regMap) | isSimple ({instr=RaiseExceptionPacket _, ...} :: instrs, moves, regMap) = isSimple(instrs, moves, regMap) | isSimple ({instr=MoveRegister{source, dest}, ...} :: instrs, moves, regMap) = let (* We frequently have a move of the original register into a new register before the test. *) val newMap = case List.find(fn {dest, ... } => dest = source) regMap of SOME {source, ...} => {dest=dest, source=source} :: regMap | NONE => regMap in moves = 0 andalso isSimple(instrs, moves+1, newMap) end | isSimple ({instr=LoadNonAddressConstant _, ...} :: instrs, moves, regMap) = moves = 0 andalso isSimple(instrs, moves+1, regMap) | isSimple ({instr=LoadAddressConstant _, ...} :: instrs, moves, regMap) = moves = 0 andalso isSimple(instrs, moves+1, regMap) - | isSimple ({instr=AddSubImmediate{source, dest=ZeroReg, ...}, ...} :: instrs, moves, regMap) = + | isSimple ({instr=LogicalImmediate{source, dest=ZeroReg, logOp=LogAnd, ...}, ...} :: instrs, moves, regMap) = let val isReplace = List.find(fn {dest, ... } => dest = source) regMap in case isReplace of SOME {source=NonAddressConst _, ...} => isSimple(instrs, moves, regMap) | _ => false end | isSimple _ = false in (* Merge trivial blocks. This previously also tried to merge non-trivial blocks if they only had one reference but this ends up duplicating non-trivial code. If we have a trivial block that has multiple references but is the only reference to a non-trivial block we can merge the non-trivial block into it. That would be fine except that at the same time we may merge this trivial block elsewhere. *) (* The restriction that a block must only export "merge" registers is unfortunate but necessary to avoid the situation where a non-merge register is defined at multiple points and cannot be pushed to the stack. This really isn't an issue with blocks with unconditional branches but there are cases where we have successive tests of the same condition and that results in local registers being defined and then exported. This occurs in, for example, fun f x = if x > "abcde" then "yes" else "no"; *) if isSimple(targetBlck, 0, finalMap) andalso List.all (fn i => Vector.sub(pregProps, i) = RegPropMultiple) (setToList exports) then let (* Copy the block, creating new registers for the locals. 
*) val localMap = List.map (fn r => (PReg r, newReg(Vector.sub(pregProps, r)))) (setToList locals) fun mapReg r = case List.find (fn (s, _) => r = s) localMap of SOME(_, s) => s | NONE => r fun mapInstr(instr as ResetStackPtr _) = instr - | mapInstr(ReturnResultFromFunction{results, returnReg, numStackArgs}) = + | mapInstr(ReturnResultFromFunction{results, fpResults, returnReg, numStackArgs}) = ReturnResultFromFunction{results=List.map(fn(pr, r) => (mapReg pr, r))results, + fpResults=List.map(fn(pr, r) => (mapReg pr, r))fpResults, returnReg=mapReg returnReg, numStackArgs=numStackArgs} | mapInstr(RaiseExceptionPacket{packetReg}) = RaiseExceptionPacket{packetReg=mapReg packetReg} | mapInstr(MoveRegister{source, dest}) = MoveRegister{source=mapReg source, dest=mapReg dest} | mapInstr(LoadNonAddressConstant{source, dest}) = LoadNonAddressConstant{source=source, dest=mapReg dest} | mapInstr(LoadAddressConstant{source, dest}) = LoadAddressConstant{source=source, dest=mapReg dest} - | mapInstr(AddSubImmediate{source, dest=ZeroReg, immed, ccRef, isAdd, length}) = - AddSubImmediate{source=mapReg source, dest=ZeroReg, immed=immed, ccRef=ccRef, isAdd=isAdd, length=length} + | mapInstr(LogicalImmediate{source, dest=ZeroReg, immed, ccRef, logOp, length}) = + LogicalImmediate{source=mapReg source, dest=ZeroReg, immed=immed, ccRef=ccRef, logOp=logOp, length=length} | mapInstr _ = raise InternalError "mapInstr: other instruction" fun mapRegNo i = case(mapReg(PReg i)) of PReg r => r (* Map the instructions and the sets although we only use the kill set. *) fun mapCode{instr, current, active, kill} = {instr=mapInstr instr, current=listToSet(map mapRegNo (setToList current)), active=listToSet(map mapRegNo (setToList active)), kill=listToSet(map mapRegNo (setToList kill))} in hasChanged := true; optimiseBlock blkCode(map mapCode targetBlck, targetFlow, targetCC) end else BasicBlock{flow=flow, block=List.rev blkCode} end | (flow, _) => BasicBlock{flow=flow, block=List.rev blkCode} end fun optBlck(ExtendedBasicBlock{block, flow, outCCState, ...}) = optimiseBlock [] (block, flow, outCCState) val resVector = Vector.map optBlck code in if !hasChanged then let val extraRegs = List.rev(! regList) val props = if null extraRegs then pregProps else Vector.concat[pregProps, Vector.fromList extraRegs] in Changed(resVector, props) end else Unchanged end structure Sharing = struct type extendedBasicBlock = extendedBasicBlock and ('genReg, 'optGenReg, 'fpReg) basicBlock = ('genReg, 'optGenReg, 'fpReg) basicBlock and regProperty = regProperty and optimise = optimise and preg = preg and pregOrZero = pregOrZero end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICodeToArm64Code.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICodeToArm64Code.ML index ba11cf88..3a70de97 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICodeToArm64Code.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICodeToArm64Code.ML @@ -1,1226 +1,1356 @@ (* Copyright David C. J. Matthews 2021-2 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64ICodeToArm64Code( structure Arm64PreAssembly: ARM64PREASSEMBLY structure Debug: DEBUG structure Arm64ICode: ARM64ICODE structure Identify: ARM64IDENTIFYREFERENCES structure IntSet: INTSET structure Pretty: PRETTY structure Strongly: sig val stronglyConnectedComponents: {nodeAddress: 'a -> int, arcs: 'a -> int list } -> 'a list -> 'a list list end sharing Arm64PreAssembly.Sharing = Arm64ICode.Sharing = Identify.Sharing = IntSet ): ARM64ICODEGENERATE = struct open Identify open Arm64ICode open Arm64PreAssembly open Address exception InternalError = Misc.InternalError (* Reversed cons and append to make the code easier to read. *) infix 5 <::> <@> fun tl <::> hd = hd :: tl (*and snd <@> fst = fst @ snd*) (* These aren't currently used for anything. *) val workReg1 = X16 and workReg2 = X17 fun icodeToArm64Code {blocks: basicBlockConcrete vector, functionName, stackRequired, debugSwitches, resultClosure, profileObject, ...} = let val numBlocks = Vector.length blocks (* Load from and store to stack. *) fun loadFromStack(destReg, wordOffset, code) = if wordOffset >= 4096 then (LoadRegIndexed{regT=destReg, regN=X_MLStackPtr, regM=destReg, option=ExtUXTX ScaleOrShift, loadType=Load64}) :: [LoadNonAddr(destReg, Word64.fromInt wordOffset)] @ code else (LoadRegScaled{regT=destReg, regN=X_MLStackPtr, unitOffset=wordOffset, loadType=Load64}) :: code and storeToStack(sourceReg, wordOffset, workReg, code) = if wordOffset >= 4096 then (StoreRegIndexed{regT=sourceReg, regN=X_MLStackPtr, regM=workReg, option=ExtUXTX ScaleOrShift, loadType=Load64}) :: [LoadNonAddr(workReg, Word64.fromInt wordOffset)] @ code else (StoreRegScaled{regT=sourceReg, regN=X_MLStackPtr, unitOffset=wordOffset, loadType=Load64}) :: code datatype srcAndDest = IsInReg of xReg | IsOnStack of int local (* The registers are numbered from 0. Choose values that don't conflict with the stack addresses. *) fun regNo(XReg r) = ~1 - Word8.toInt r | regNo _ = ~1 - 31 type node = {src: srcAndDest, dst: srcAndDest } fun nodeAddress({dst=IsInReg r, ...}: node) = regNo r | nodeAddress({dst=IsOnStack a, ...}) = a fun arcs({src=IsOnStack wordOffset, ...}: node) = [wordOffset] | arcs{src=IsInReg r, ...} = [regNo r] in val stronglyConnected = Strongly.stronglyConnectedComponents { nodeAddress=nodeAddress, arcs=arcs } end (* This is a general function for moving values into registers or to the stack where it is possible that the source values might also be in use as destinations. The stack is used for destinations only for tail recursive calls. *) fun moveMultipleValues(moves, code) = let fun moveValues ([], code) = code (* We're done. *) | moveValues (arguments, code) = let (* stronglyConnectedComponents does two things. It detects loops where it's not possible to move items without breaking the loop but more importantly it orders the dependencies so that if there are no loops we can load the source and store it in the destination knowing that we won't overwrite anything we might later need. 
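(* A minimal standalone sketch of the ordering problem, using an int array for
   the locations; registers, stack slots and the strongly-connected-component
   ordering itself are not modelled, and doMoves is an illustrative name only. *)
fun doMoves locs moves =
    List.app (fn {src, dst} => Array.update(locs, dst, Array.sub(locs, src))) moves

val good = Array.fromList [10, 11, 12]
val () = doMoves good [{src=1, dst=0}, {src=2, dst=1}]
(* good now holds 11, 12, 12: location 1 is read before anything overwrites it. *)

val bad = Array.fromList [10, 11, 12]
val () = doMoves bad [{src=2, dst=1}, {src=1, dst=0}]
(* bad now holds 12, 12, 12: the value 11 has been lost, which is exactly what
   the dependency ordering prevents.  A genuine cycle, such as swapping two
   locations, has no safe order at all and needs the exchange code below. *)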
*) val ordered = stronglyConnected arguments fun loadIntoReg(IsInReg sReg, dReg, code) = if sReg = dReg then code else (MoveXRegToXReg{sReg=sReg, dReg=dReg}) :: code | loadIntoReg(IsOnStack wordOffset, dReg, code) = loadFromStack(dReg, wordOffset, code) fun moveEachValue ([], code) = code | moveEachValue ([{dst=IsInReg dReg, src}] :: rest, code) = moveEachValue(rest, loadIntoReg(src, dReg, code)) | moveEachValue ([{dst=IsOnStack wordOffset, src=IsInReg sReg}] :: rest, code) = (* Storing into the stack. *) moveEachValue(rest, storeToStack(sReg, wordOffset, workReg1, code)) | moveEachValue ([{dst=IsOnStack dstOffset, src=IsOnStack srcOffset}] :: rest, code) = (* Copy a stack location - needs a load and store unless the address is the same. *) if dstOffset = srcOffset then moveEachValue(rest, code) else moveEachValue(rest, storeToStack(workReg2, dstOffset, workReg1, loadFromStack(workReg2, srcOffset, code))) | moveEachValue((cycle as first :: _ :: _) :: rest, code) = (* We have a cycle. *) let (* We need to exchange some of the arguments. Doing an exchange here will set the destination with the correct source. However we have to process every subsequent entry with the swapped registers. That may well mean that one of those entries becomes trivial. We also need to rerun stronglyConnectedComponents on at least the rest of this cycle. It's easiest to flatten the rest and do everything. *) (* Exchange the source and destination. We don't have an exchange instruction and there's a further complication. We could be copying between stack locations and their offsets could be > 4096. Since we've only got two work registers we need to use the hardware stack as an extra location. Stack-stack exchange is very rare so the extra overhead to handle the general case is worth it. *) local fun storeToDest(sReg, IsInReg dReg, _, code) = (MoveXRegToXReg{sReg=sReg, dReg=dReg}) :: code | storeToDest(sReg, IsOnStack wordOffset, work, code) = storeToStack(sReg, wordOffset, work, code) in fun exchange(IsInReg arg1Reg, arg2, code) = (MoveXRegToXReg{sReg=workReg2, dReg=arg1Reg}) :: storeToDest(arg1Reg, arg2, workReg1, loadIntoReg(arg2, workReg2, code)) | exchange(arg1, IsInReg arg2Reg, code) = (MoveXRegToXReg{sReg=workReg2, dReg=arg2Reg}) :: storeToDest(arg2Reg, arg1, workReg1, loadIntoReg(arg1, workReg2, code)) | exchange(arg1, arg2, code) = (* The hardware stack must be 16-byte aligned. *) storeToDest(workReg2, arg2, workReg1, (LoadRegUnscaled{regT=workReg2, regN=XSP, byteOffset=16, loadType=Load64, unscaledType=PostIndex}) :: storeToDest(workReg2, arg1, workReg1, loadIntoReg(arg2, workReg2, (StoreRegUnscaled{regT=workReg2, regN=XSP, byteOffset= ~16, loadType=Load64, unscaledType=PreIndex}) :: loadIntoReg(arg1, workReg2, code)))) end (* Try to find either a register-register move or a register-stack move. If not use the first. If there's a stack-register move there will also be a register-stack so we don't need to look for both. *) val {dst=selectDst, src=selectSrc} = first (* This includes this entry but after the swap we'll eliminate it. 
*) val flattened = List.foldl(fn (a, b) => a @ b) [] (cycle :: rest) val destAsSource = selectDst fun match(s1: srcAndDest, s2) = s1 = s2 fun swapSources{src, dst} = if match(src, selectSrc) then {src=destAsSource, dst=dst} else if match(src, destAsSource) then {src=selectSrc, dst=dst} else {src=src, dst=dst} val exchangeCode = exchange(selectDst, selectSrc, code) in moveValues(List.map swapSources flattened, exchangeCode) end | moveEachValue(([]) :: _, _) = (* This should not happen - avoid warning. *) raise InternalError "moveEachValue - empty set" in moveEachValue(ordered, code) end in moveValues(moves, code) end (* Where we have multiple specific registers as either source or destination there is the potential that a destination register if currently in use as a source. *) fun moveMultipleRegisters(regPairList, code) = let val regPairsAsDests = List.map(fn {src, dst} => {src=IsInReg src, dst=IsInReg dst}) regPairList in moveMultipleValues(regPairsAsDests, code) end + (* Floating point registers can only be moved into other floating point registers but + it is possible to have a cycle. *) + fun moveMultipleFPRegisters(moves: {dst: vReg, src: vReg} list, code) = + let + local + fun regNo(VReg r) = Word8.toInt r + type node = {src: vReg, dst: vReg } + fun nodeAddress({dst=r, ...}: node) = regNo r + fun arcs{src=r, ...} = [regNo r] + in + val stronglyConnected = Strongly.stronglyConnectedComponents { nodeAddress=nodeAddress, arcs=arcs } + end + + fun moveValues ([], code) = code (* We're done. *) + + | moveValues (arguments, code) = + let + val ordered = stronglyConnected arguments + + fun moveEachValue ([], code) = code + + | moveEachValue ([{dst, src}] :: rest, code) = + moveEachValue(rest, + if src = dst then code else (FPUnaryOp{regN=src, regD=dst, fpOp=MoveDouble}) :: code) + + | moveEachValue((cycle as first :: _ :: _) :: rest, code) = + (* We have a cycle. *) + let + val {dst=selectDst: vReg, src=selectSrc: vReg} = first + + (* This includes this entry but after the swap we'll eliminate it. *) + val flattened = List.foldl(fn (a, b) => a @ b) [] (cycle :: rest) + val destAsSource = selectDst + fun swapSources{src, dst} = + if src=selectSrc then {src=destAsSource, dst=dst} + else if src=destAsSource then {src=selectSrc, dst=dst} + else {src=src, dst=dst} + + (* Exchange the values of two floating point registers. There are + various ways to do this. For the moment just use the hardware stack. *) + val exchangeCode = + code <::> + StoreFPRegUnscaled{regT=selectDst, regN=XSP, byteOffset= ~16, floatSize=Double64, unscaledType=PreIndex} <::> + FPUnaryOp{regN=selectSrc, regD=selectDst, fpOp=MoveDouble} <::> + LoadFPRegUnscaled{regT=selectSrc, regN=XSP, byteOffset=16, floatSize=Double64, unscaledType=PostIndex} + in + moveValues(List.map swapSources flattened, exchangeCode) + end + + | moveEachValue(([]) :: _, _) = (* This should not happen - avoid warning. *) + raise InternalError "moveEachValue - empty set" + in + moveEachValue(ordered, code) + end + in + moveValues(moves, code) + end + + + fun moveIfNecessary({src, dst}, code) = if src = dst then code else MoveXRegToXReg{sReg=src, dReg=dst} :: code (* Add a constant word to the source register and put the result in the destination. regW is used as a work register if necessary. This is used both for addition and subtraction. 
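(* A standalone sketch of the constant splitting used below, assuming only that
   AArch64 ADD/SUB immediates are 12 bits wide with an optional left shift of 12:
   any value below 2^24 can be added or subtracted in at most two instructions.
   splitImmediate is an illustrative name, not part of this code. *)
fun splitImmediate (w: Word64.word) =
    let
        val low  = Word64.andb(w, 0wxfff)
        val high = Word64.andb(Word64.>>(w, 0w12), 0wxfff)
    in
        {high = high, low = low}    (* w = high * 0x1000 + low whenever w < 0x1000000 *)
    end
(* e.g. splitImmediate 0wx12345 gives {high = 0wx12, low = 0wx345}: add the
   shifted high part first and then the low part, as addConstantWord does. *)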
*) fun addConstantWord({regS, regD, value=0w0, ...}, code) = if regS = regD then code else MoveXRegToXReg{sReg=regS, dReg=regD} :: code | addConstantWord({regS, regD, regW, value}, code) = let (* If we have to load the constant it's better if the top 32-bits are zero if possible. *) val (isSub, unsigned) = if value > Word64.<<(0w1, 0w63) then (true, ~ value) else (false, value) in if unsigned < Word64.<<(0w1, 0w24) then (* We can put up to 24 in a shifted and an unshifted constant. *) let val w = Word.fromLarge(Word64.toLarge unsigned) val high = Word.andb(Word.>>(w, 0w12), 0wxfff) val low = Word.andb(w, 0wxfff) val addSub = if isSub then SubImmediate else AddImmediate in if high <> 0w0 then ( (if low <> 0w0 then [addSub{regN=regD, regD=regD, immed=low, shifted=false, setFlags=false, opSize=OpSize64}] else []) @ addSub{regN=regS, regD=regD, immed=high, shifted=true, setFlags=false, opSize=OpSize64} :: code ) else addSub{regN=regS, regD=regD, immed=low, shifted=false, setFlags=false, opSize=OpSize64} :: code end else let (* To minimise the constant and increase the chances that it will fit in a single word look to see if we can shift it. *) fun getShift(value, shift) = if Word64.andb(value, 0w1) = 0w0 then getShift(Word64.>>(value, 0w1), shift+0w1) else (value, shift) val (shifted, shift) = getShift(unsigned, 0w0) in code <::> LoadNonAddr(regW, shifted) <::> (if isSub then SubShiftedReg else AddShiftedReg) {regM=regW, regN=regS, regD=regD, shift=ShiftLSL shift, setFlags=false, opSize=OpSize64} end end val labelMaker = createLabelMaker() val startOfFunctionLabel = createLabel labelMaker (* Used for recursive calls/jumps *) val blockToLabelMap = Vector.tabulate(numBlocks, fn _ => createLabel labelMaker) fun getBlockLabel blockNo = Vector.sub(blockToLabelMap, blockNo) fun codeExtended _ (MoveRegister{source, dest, ...}, code) = moveIfNecessary({src=source, dst=dest}, code) | codeExtended _ (LoadNonAddressConstant{source, dest}, code) = code <::> LoadNonAddr(dest, source) | codeExtended _ (LoadFPConstant{source, dest, floatSize}, code) = code <::> LoadFPConst{dest=dest, value=source, floatSize=floatSize, work=workReg1} | codeExtended _ (LoadAddressConstant{source, dest, ...}, code) = code <::> LoadAddr(dest, source) | codeExtended _ (LoadWithConstantOffset{dest, base, byteOffset, loadType, ...}, code) = if byteOffset < 0 then LoadRegUnscaled{regT=dest, regN=base, byteOffset=byteOffset, loadType=loadType, unscaledType=NoUpdate} :: code else let val unitOffset = case loadType of Load64 => Int.quot(byteOffset, 8) | Load32 => Int.quot(byteOffset, 4) | Load16 => Int.quot(byteOffset, 2) | Load8 => byteOffset in LoadRegScaled{regT=dest, regN=base, unitOffset=unitOffset, loadType=loadType} :: code end | codeExtended _ (LoadFPWithConstantOffset{dest, base, byteOffset, floatSize, ...}, code) = if byteOffset < 0 then (LoadFPRegUnscaled{regT=dest, regN=base, byteOffset=byteOffset, floatSize=floatSize, unscaledType=NoUpdate}) :: code else let val unitOffset = Int.quot(byteOffset, case floatSize of Float32 => 4 | Double64 => 8) in (LoadFPRegScaled{regT=dest, regN=base, unitOffset=unitOffset, floatSize=floatSize}) :: code end | codeExtended _ (LoadWithIndexedOffset{dest, base, index, loadType, signExtendIndex, ...}, code) = let val baseReg = base and indexReg = index (* In 32-in-64 the index is a 32-bit value in the low order 32-bits. It may be signed if it is a C address. 
*) val scaleType = if is32in64 andalso signExtendIndex then ExtSXTW else ExtUXTX val scale = case loadType of Load8 => NoScale | _ => ScaleOrShift in (LoadRegIndexed{regT=dest, regN=baseReg, regM=indexReg, option=scaleType scale, loadType=loadType}) :: code end | codeExtended _ (LoadFPWithIndexedOffset{dest, base, index, floatSize, signExtendIndex, ...}, code) = let val baseReg = base and indexReg = index (* In 32-in-64 the index is a 32-bit value in the low order 32-bits. It may be signed if it is a C address. *) val scaleType = if is32in64 andalso signExtendIndex then ExtSXTW else ExtUXTX in (LoadFPRegIndexed{regT=dest, regN=baseReg, regM=indexReg, option=scaleType ScaleOrShift, floatSize=floatSize}) :: code end | codeExtended _ (GetThreadId { dest}, code) = (* Load the thread id. This is always a 64-bit value. *) (LoadRegScaled{regT=dest, regN=X_MLAssemblyInt, unitOffset=threadIdOffset, loadType=Load64}) :: code | codeExtended _ (ObjectIndexAddressToAbsolute{source, dest, ...}, code) = (AddShiftedReg{regM=source, regN=X_Base32in64, regD=dest, shift=ShiftLSL 0w2, setFlags=false, opSize=OpSize64}) :: code | codeExtended _ (AbsoluteToObjectIndex{source, dest, ...}, code) = let val destReg = dest in code <::> (SubShiftedReg{regM=X_Base32in64, regN=source, regD=destReg, shift=ShiftNone, setFlags=false, opSize=OpSize64}) <::> (shiftConstant{shift=0w2, regN=destReg, regD=destReg, direction=ShiftRightLogical, opSize=OpSize64}) end | codeExtended _ (AllocateMemoryFixed{ bytesRequired, dest, saveRegs }, code) = code <::> AllocateMemoryFixedSize{ bytes=Word.fromLarge bytesRequired, dest=dest, save=saveRegs, work=workReg1 } | codeExtended _ (AllocateMemoryVariable{ size, dest, saveRegs }, code) = code <::> AllocateMemoryVariableSize{ sizeReg=size, dest=dest, save=saveRegs, work=workReg1 } | codeExtended _ (InitialiseMem{ size, addr, init}, code) = let val sizeReg = size and addrReg = addr and initReg = init val exitLabel = createLabel labelMaker and loopLabel = createLabel labelMaker (* This uses a loop to initialise. It's possible the size is zero so we have to check at the top of the loop. *) val (bShift, offset, loadType) = if is32in64 then (0w2, ~4, Load32) else (0w3, ~8, Load64) in code <::> (* Add the length in bytes so we point at the end. *) AddShiftedReg{regM=sizeReg, regN=addrReg, regD=workReg1, shift=ShiftLSL bShift, setFlags=false, opSize=OpSize64} <::> SetLabel loopLabel <::> (* Are we at the start? *) SubShiftedReg{regM=workReg1, regN=addrReg, regD=XZero, shift=ShiftNone, setFlags=true, opSize=OpSize64} <::> ConditionalBranch(CondEqual, exitLabel) <::> StoreRegUnscaled{regT=initReg, regN=workReg1, byteOffset=offset, loadType=loadType, unscaledType=PreIndex } <::> UnconditionalBranch loopLabel <::> SetLabel exitLabel end | codeExtended _ (BeginLoop, code) = code | codeExtended _ (JumpLoop{regArgs, stackArgs, checkInterrupt}, code) = let (* TODO: We could have a single list and use ArgOnStack and ArgInReg to distinguish. *) fun convertArg(ArgOnStack{wordOffset, ...}) = IsOnStack wordOffset | convertArg(ArgInReg reg) = IsInReg(reg) val extStackArgs = map (fn {wordOffset, src, ...} => {src=convertArg src, dst=IsOnStack wordOffset}) stackArgs val extRegArgs = map (fn {dst, src} => {src=convertArg src, dst=convertArg(ArgInReg dst)}) regArgs val code2 = moveMultipleValues(extStackArgs @ extRegArgs, code) in case checkInterrupt of NONE => code2 | SOME saveRegs => let val skipCheck = createLabel labelMaker in code2 <::> (* Put in stack-check code to allow this to be interrupted. 
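(* A standalone model of the check that follows; checkLimit is an illustrative
   name.  The loop traps into the RTS whenever the ML stack pointer is below the
   limit the RTS publishes, and the RTS can move that limit to force a trap even
   when there is really enough stack. *)
fun checkLimit {stackPtr: Word64.word, stackLimit: Word64.word} trap =
    if Word64.<(stackPtr, stackLimit)
    then trap ()     (* corresponds to falling through to the RTSTrap below *)
    else ()          (* corresponds to the branch to skipCheck *)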
*) LoadRegScaled{regT=workReg1, regN=X_MLAssemblyInt, unitOffset=stackLimitOffset, loadType=Load64} <::> SubShiftedReg{regM=workReg1, regN=X_MLStackPtr, regD=XZero, shift=ShiftNone, setFlags=true, opSize=OpSize64} <::> ConditionalBranch(CondCarrySet, skipCheck) <::> RTSTrap{rtsEntry=stackOverflowCallOffset, work=workReg1, save=saveRegs} <::> SetLabel skipCheck end end | codeExtended _ (StoreWithConstantOffset{source, base, byteOffset, loadType, ...}, code) = if byteOffset < 0 then (StoreRegUnscaled{regT=source, regN=base, byteOffset=byteOffset, loadType=loadType, unscaledType=NoUpdate}) :: code else let val unitOffset = case loadType of Load64 => Int.quot(byteOffset, 8) | Load32 => Int.quot(byteOffset, 4) | Load16 => Int.quot(byteOffset, 2) | Load8 => byteOffset in (StoreRegScaled{regT=source, regN=base, unitOffset=unitOffset, loadType=loadType}) :: code end | codeExtended _ (StoreFPWithConstantOffset{source, base, byteOffset, floatSize, ...}, code) = if byteOffset < 0 then (StoreFPRegUnscaled{regT=source, regN=base, byteOffset=byteOffset, floatSize=floatSize, unscaledType=NoUpdate}) :: code else let val unitOffset = Int.quot(byteOffset, case floatSize of Float32 => 4 | Double64 => 8) in (StoreFPRegScaled{regT=source, regN=base, unitOffset=unitOffset, floatSize=floatSize}) :: code end | codeExtended _ (StoreWithIndexedOffset{source, base, index, loadType, signExtendIndex, ...}, code) = let val baseReg = base and indexReg = index (* In 32-in-64 the index is a 32-bit value in the low order 32-bits. It may be signed if it is a C address. *) val scaleType = if is32in64 andalso signExtendIndex then ExtSXTW else ExtUXTX val scale = case loadType of Load8 => NoScale | _ => ScaleOrShift in (StoreRegIndexed{regT=source, regN=baseReg, regM=indexReg, option=scaleType scale, loadType=loadType}) :: code end | codeExtended _ (StoreFPWithIndexedOffset{source, base, index, floatSize, signExtendIndex, ...}, code) = let val baseReg = base and indexReg = index (* In 32-in-64 the index is a 32-bit value in the low order 32-bits. It may be signed if it is a C address. *) val scaleType = if is32in64 andalso signExtendIndex then ExtSXTW else ExtUXTX in (StoreFPRegIndexed{regT=source, regN=baseReg, regM=indexReg, option=scaleType ScaleOrShift, floatSize=floatSize}) :: code end | codeExtended _ (AddSubImmediate{ source, dest, immed, isAdd, length, ccRef}, code) = let val destReg = dest in ((if isAdd then AddImmediate else SubImmediate) {regN=source, regD=destReg, immed=immed, shifted=false, opSize=length, setFlags=isSome ccRef}) :: code end | codeExtended _ (AddSubRegister{ base, shifted, dest, isAdd, length, ccRef, shift}, code) = let val destReg = dest in ( (if isAdd then AddShiftedReg else SubShiftedReg) {regN=base, regM=shifted, regD=destReg, shift=shift, opSize=length, setFlags=isSome ccRef}) :: code end | codeExtended _ (LogicalImmediate{ source, dest, immed, logOp, length, ccRef}, code) = let val destReg = dest in (BitwiseLogical{regN=source, regD=destReg, bits=immed, opSize=length, logOp=logOp, setFlags=isSome ccRef}) :: code end | codeExtended _ (LogicalRegister{ base, shifted, dest, logOp, length, ccRef, shift}, code) = let (* There are also versions of AND/OR/XOR which operate on a complement (NOT) of the shifted register. It's probably not worth looking for a use for them. 
*) val destReg = dest in (LogicalShiftedReg{regN=base, regM=shifted, regD=destReg, shift=shift, opSize=length, logOp=logOp, setFlags=isSome ccRef}) :: code end | codeExtended _ (ShiftRegister{ direction, dest, source, shift, opSize }, code) = (ShiftRegisterVariable{regN=source, regM=shift, regD=dest, shiftDirection=direction, opSize=opSize}) :: code | codeExtended _ (Multiplication{ kind, dest, sourceA, sourceM, sourceN }, code) = let val destReg = dest and srcAReg = sourceA and srcNReg = sourceN and srcMReg = sourceM in (MultiplyAndAddSub{regM=srcMReg, regN=srcNReg, regA=srcAReg, regD=destReg, multKind=kind}) :: code end | codeExtended _ (Division{ isSigned, dest, dividend, divisor, opSize }, code) = (DivideRegs{regN=dividend, regM=divisor, regD=dest, isSigned=isSigned, opSize=opSize}) :: code - | codeExtended _ (BeginFunction{regArgs, ...}, code) = + | codeExtended _ (BeginFunction{regArgs, fpRegArgs, ...}, code) = let val skipCheck = createLabel labelMaker val defaultWords = 10 (* This is wired into the RTS. *) val workRegister = workReg1 val debugTrapAlways = false (* Can be set to true for debugging *) (* Test with either the stack-pointer or a high-water value. The RTS assumes that X9 has been used as the high-water if it is called through stackOverflowXCallOffset rather than stackOverflowCallOffset *) val (testReg, entryPt, code1) = if stackRequired <= defaultWords then (X_MLStackPtr, stackOverflowCallOffset, code) else (X9, stackOverflowXCallOffset, addConstantWord({regS=X_MLStackPtr, regD=X9, regW=workRegister, value= ~ (Word64.fromLarge(Word.toLarge nativeWordSize)) * Word64.fromInt stackRequired}, code)) (* Skip the RTS call if there is enough stack. N.B. The RTS can modify the end-of-stack value to force a trap here even if there is really enough stack. *) val code2 = (if debugTrapAlways then [] else [ConditionalBranch(CondCarrySet, skipCheck), SubShiftedReg{regM=workRegister, regN=testReg, regD=XZero, shift=ShiftNone, setFlags=true, opSize=OpSize64}]) @ (* Load the end-of-stack value. *) LoadRegScaled{regT=workRegister, regN=X_MLAssemblyInt, unitOffset=stackLimitOffset, loadType=Load64} :: code1 val code3 = code2 <::> RTSTrap{rtsEntry=entryPt, work=workReg1, save=List.map #2 regArgs} <::> SetLabel skipCheck - val usedRegs = regArgs fun mkPair(pr, rr) = {src=rr,dst=pr} - val regPairs = List.map mkPair usedRegs + val regPairs = List.map mkPair regArgs + val fpRegPairs = List.map mkPair fpRegArgs in - moveMultipleRegisters(regPairs, code3) + moveMultipleFPRegisters(fpRegPairs, moveMultipleRegisters(regPairs, code3)) end - | codeExtended _ (TailRecursiveCall{callKind, regArgs, stackArgs, stackAdjust, currStackSize}, code) = + | codeExtended _ (TailRecursiveCall{callKind, regArgs, stackArgs, fpRegArgs, stackAdjust, currStackSize}, code) = let fun convertArg(ArgOnStack{wordOffset, ...}) = IsOnStack wordOffset | convertArg(ArgInReg reg) = IsInReg(reg) val extStackArgs = map (fn {stack, src} => {dst=IsOnStack(stack+currStackSize), src=convertArg src}) stackArgs val extRegArgs = map (fn (a, r) => {src=convertArg a, dst=IsInReg r}) regArgs + val extFPArgs = map (fn (a, r) => {src=a, dst=r}) fpRegArgs (* Tail recursive calls are complicated because we generally have to overwrite the existing stack. That means storing the arguments in the right order to avoid overwriting a value that we are using for a different argument. 
*) fun codeTailCall(arguments, stackAdjust, code) = if stackAdjust < 0 then let (* If the function we're calling takes more arguments on the stack than the current function we will have to extend the stack. Do that by pushing the argument whose offset is at -1. Then adjust all the offsets and repeat. *) val {src=argM1, ...} = valOf(List.find(fn {dst=IsOnStack ~1, ...} => true | _ => false) arguments) fun renumberArgs [] = [] | renumberArgs ({dst=IsOnStack ~1, ...} :: args) = renumberArgs args (* Remove the one we've done. *) | renumberArgs ({dst, src} :: args) = let val newDest = case dst of IsOnStack d => IsOnStack(d+1) | regDest => regDest val newSrc = case src of IsOnStack wordOffset => IsOnStack(wordOffset+1) | other => other in {dst=newDest, src=newSrc} :: renumberArgs args end val pushCode = case argM1 of IsOnStack wordOffset => (StoreRegUnscaled{regT=workReg2, regN=X_MLStackPtr, byteOffset= ~8, unscaledType=PreIndex, loadType=Load64}) :: loadFromStack(workReg2, wordOffset, code) | IsInReg reg => (StoreRegUnscaled{regT=reg, regN=X_MLStackPtr, byteOffset= ~8, unscaledType=PreIndex, loadType=Load64}) :: code in codeTailCall(renumberArgs arguments, stackAdjust+1, pushCode) end else let - val loadArgs = moveMultipleValues(arguments, code) + val loadArgs = + moveMultipleFPRegisters(extFPArgs, moveMultipleValues(arguments, code)) in if stackAdjust = 0 then loadArgs else addConstantWord({regS=X_MLStackPtr, regD=X_MLStackPtr, regW=workReg1, value=Word64.fromInt stackAdjust * Word.toLarge nativeWordSize}, loadArgs) end val setArgumentsCode = codeTailCall(extStackArgs @ extRegArgs, stackAdjust+currStackSize, code) val jumpToFunctionCode = case callKind of Recursive => [(UnconditionalBranch startOfFunctionLabel)] | ConstantCode m => if is32in64 then raise InternalError "ConstantCode" (* Not currently *) else [(BranchReg{regD=workReg1, brRegType=BRRBranch}), (LoadAddr(workReg1, m))] | FullCall => if is32in64 then [BranchReg{regD=workReg1, brRegType=BRRBranch}, LoadRegScaled{regT=workReg1, regN=workReg1, unitOffset=0, loadType=Load64}, AddShiftedReg{regM=X8, regN=X_Base32in64, regD=workReg1, shift=ShiftLSL 0w2, setFlags=false, opSize=OpSize64}] else [BranchReg{regD=workReg1, brRegType=BRRBranch}, LoadRegScaled{regT=workReg1, regN=X8, unitOffset=0, loadType=Load64}] in jumpToFunctionCode @ setArgumentsCode end - | codeExtended _ (FunctionCall{callKind, regArgs=regArgs, stackArgs=stackArgs, dests, saveRegs, ...}, code) = + | codeExtended _ (FunctionCall{callKind, regArgs, stackArgs, dests, fpRegArgs, fpDests, saveRegs, ...}, code) = let local fun pushStackArgs ([], _, code) = code | pushStackArgs (ArgOnStack {wordOffset, ...} ::args, argNum, code) = let (* Have to adjust the offsets of stack arguments. *) val adjustedOffset = wordOffset+argNum in pushStackArgs(args, argNum+1, loadFromStack(workReg1, adjustedOffset, code) <::> StoreRegUnscaled{regT=workReg1, regN=X_MLStackPtr, byteOffset= ~8, unscaledType=PreIndex, loadType=Load64}) end | pushStackArgs (ArgInReg reg ::args, argNum, code) = pushStackArgs(args, argNum+1, code <::> (StoreRegUnscaled{regT=reg, regN=X_MLStackPtr, byteOffset= ~8, unscaledType=PreIndex, loadType=Load64})) val pushedArgs = pushStackArgs(stackArgs, 0, code (* Initial code *)) (* We have to adjust any stack offset to account for the arguments we've pushed. 
*) val numStackArgs = List.length stackArgs fun convertArg(ArgOnStack{wordOffset, ...}) = IsOnStack(wordOffset+numStackArgs) | convertArg(ArgInReg reg) = IsInReg(reg) in val extRegArgs = map (fn (a, r) => {src=convertArg a, dst=IsInReg r}) regArgs - val loadArgs = moveMultipleValues(extRegArgs, pushedArgs) + val extFPArgs = map (fn (a, r) => {src=a, dst=r}) fpRegArgs + val loadArgs = moveMultipleFPRegisters(extFPArgs, moveMultipleValues(extRegArgs, pushedArgs)) end (* Push the registers before the call and pop them afterwards. *) fun makeSavesAndCall([], code) = ( case callKind of Recursive => code <::> (BranchAndLink startOfFunctionLabel) | ConstantCode m => if is32in64 then raise InternalError "ConstantCode" (* Not currently *) else code <::> (LoadAddr(workReg1, m)) <::> (BranchReg{regD=workReg1, brRegType=BRRAndLink}) | FullCall => if is32in64 then code <::> AddShiftedReg{regM=X8, regN=X_Base32in64, regD=workReg1, shift=ShiftLSL 0w2, setFlags=false, opSize=OpSize64} <::> LoadRegScaled{regT=workReg1, regN=workReg1, unitOffset=0, loadType=Load64} <::> BranchReg{regD=workReg1, brRegType=BRRAndLink} else code <::> LoadRegScaled{regT=workReg1, regN=X8, unitOffset=0, loadType=Load64} <::> BranchReg{regD=workReg1, brRegType=BRRAndLink} ) | makeSavesAndCall(reg::regs, code) = let val areg = reg in makeSavesAndCall(regs, code <::> StoreRegUnscaled{regT=areg, regN=X_MLStackPtr, byteOffset= ~8, loadType=Load64, unscaledType=PreIndex}) <::> LoadRegUnscaled{regT=areg, regN=X_MLStackPtr, byteOffset= 8, loadType=Load64, unscaledType=PostIndex} end (* Results. These go from the specific result register into the allocated register. *) val resultPairs = List.map(fn (pr, rr) => {src=rr,dst=pr}) dests + val fpResultPairs = List.map(fn (pr, rr) => {src=rr,dst=pr}) fpDests in - moveMultipleRegisters(resultPairs, makeSavesAndCall(saveRegs, loadArgs)) + moveMultipleFPRegisters(fpResultPairs, + moveMultipleRegisters(resultPairs, makeSavesAndCall(saveRegs, loadArgs))) end - | codeExtended _ (ReturnResultFromFunction { results, returnReg, numStackArgs }, code) = + | codeExtended _ (ReturnResultFromFunction { results, fpResults, returnReg, numStackArgs }, code) = let fun resetStack(0, code) = code | resetStack(nItems, code) = addConstantWord({regS=X_MLStackPtr, regD=X_MLStackPtr, regW=X3, value=Word64.fromLarge(Word.toLarge nativeWordSize) * Word64.fromInt nItems}, code) (* Return results. This goes from the allocated register into the specific register rr. *) val resultPairs = List.map(fn (pr, rr) => {src=pr,dst=rr}) results + and fpResultPairs = List.map(fn (pr, rr) => {src=pr,dst=rr}) fpResults in - BranchReg{regD=returnReg, brRegType=BRRReturn} :: resetStack(numStackArgs, moveMultipleRegisters(resultPairs, code)) + BranchReg{regD=returnReg, brRegType=BRRReturn} :: resetStack(numStackArgs, + moveMultipleFPRegisters(fpResultPairs, moveMultipleRegisters(resultPairs, code))) end | codeExtended _ (RaiseExceptionPacket{ packetReg }, code) = (* We need a work register here. It can be any register other than X0 since we don't preserve registers across calls. *) (* Copy the handler "register" into the stack pointer. Then jump to the address in the first word. The second word is the next handler. This is set up in the handler. We have a lot more raises than handlers since most raises are exceptional conditions such as overflow so it makes sense to minimise the code in each raise. 
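(* A small standalone model of the handler chain just described; the names are
   illustrative only.  The current handler record holds the entry point and the
   previous handler, so a raise only has to reload that record and jump, while
   pushing and popping handlers maintains the chain. *)
datatype 'a handlerChain =
    Outermost
|   Handler of {entry: exn -> 'a, previous: 'a handlerChain}

fun pushHandler entry chain = Handler{entry = entry, previous = chain}   (* PushExceptionHandler *)
fun popHandler (Handler{previous, ...}) = previous                       (* PopExceptionHandler *)
  | popHandler Outermost = Outermost
fun raisePacket (Handler{entry, ...}) packet = entry packet              (* RaiseExceptionPacket *)
  | raisePacket Outermost packet = raise packet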
*) moveIfNecessary({src=packetReg, dst=X0}, code) <::> LoadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset, loadType=Load64} <::> LoadRegScaled{regT=workReg1, regN=X_MLStackPtr, unitOffset=0, loadType=Load64} <::> BranchReg{regD=workReg1, brRegType=BRRBranch } | codeExtended _ (PushToStack{ source, copies, ... }, code) = let val reg = source val _ = copies > 0 orelse raise InternalError "PushToStack: copies<1" fun pushn(0, c) = c | pushn(n, c) = pushn(n-1, (StoreRegUnscaled{regT=reg, regN=X_MLStackPtr, byteOffset= ~8, loadType=Load64, unscaledType=PreIndex}) :: c) in pushn(copies, code) end | codeExtended _ (LoadStack{ dest, wordOffset, ... }, code) = loadFromStack(dest, wordOffset, code) | codeExtended _ (StoreToStack{ source, stackOffset, ... }, code) = (* Store into the stack to set a field of a container. Always 64-bits. *) storeToStack(source, stackOffset, workReg1, code) | codeExtended _ (ContainerAddress{ dest, stackOffset, ... }, code) = (* Set the register to an offset in the stack. *) let val _ = stackOffset >= 0 orelse raise InternalError "codeGenICode: ContainerAddress - negative offset" val byteOffset = stackOffset * Word.toInt nativeWordSize in if byteOffset >= 4096 then code <::> LoadNonAddr(dest, Word64.fromInt byteOffset) <::> AddShiftedReg{regN=X_MLStackPtr, regM=dest, regD=dest, shift=ShiftNone, setFlags=false, opSize=OpSize64} else code <::> AddImmediate{regN=X_MLStackPtr, regD=dest, immed=Word.fromInt byteOffset, shifted=false, setFlags=false, opSize=OpSize64} end | codeExtended _ (ResetStackPtr{ numWords, ... }, code) = addConstantWord({regS=X_MLStackPtr, regD=X_MLStackPtr, regW=workReg1, value=Word64.fromInt numWords * Word.toLarge nativeWordSize}, code) | codeExtended _ (TagValue{ source, dest, isSigned=_, opSize }, code) = (* Shift left by one bit and add one. *) code <::> shiftConstant{ direction=ShiftLeft, regD=dest, regN=source, shift=0w1, opSize=opSize } <::> BitwiseLogical{ bits=0w1, regN=dest, regD=dest, opSize=opSize, setFlags=false, logOp=LogOr} | codeExtended _ (UntagValue{ source, dest, isSigned, opSize }, code) = code <::> shiftConstant{ direction=if isSigned then ShiftRightArithmetic else ShiftRightLogical, regD=dest, regN=source, shift=0w1, opSize=opSize } | codeExtended _ (BoxLarge{ source, dest, saveRegs }, code) = boxSysWord({source=source, destination=dest, workReg=workReg1, saveRegs=saveRegs}, code) | codeExtended _ (UnboxLarge{ source, dest }, code) = let (* Unbox a large word. The argument is a poly word. 
*) val destReg = dest and srcReg = source in if is32in64 then LoadRegScaled{regT=destReg, regN=destReg, unitOffset=0, loadType=Load64} :: AddShiftedReg{regM=srcReg, regN=X_Base32in64, regD=destReg, shift=ShiftLSL 0w2, setFlags=false, opSize=OpSize64} :: code else LoadRegScaled{regT=destReg, regN=srcReg, unitOffset=0, loadType=Load64} :: code end | codeExtended _ (BoxTagFloat{ floatSize=Double64, source, dest, saveRegs }, code) = boxDouble({source=source, destination=dest, workReg=workReg1, saveRegs=saveRegs}, code) | codeExtended _ (BoxTagFloat{ floatSize=Float32, source, dest, saveRegs }, code) = let val floatReg = source and fixedReg = dest in if is32in64 then boxFloat({source=floatReg, destination=fixedReg, workReg=workReg1, saveRegs=saveRegs}, code) else code <::> MoveFPToGeneral{regN=floatReg, regD=fixedReg, floatSize=Float32} <::> shiftConstant{ direction=ShiftLeft, shift=0w32, regN=fixedReg, regD=fixedReg, opSize=OpSize64} <::> BitwiseLogical{ bits=0w1, regN=fixedReg, regD=fixedReg, opSize=OpSize64, setFlags=false, logOp=LogOr} end | codeExtended _ (UnboxTagFloat { floatSize=Double64, source, dest }, code) = if is32in64 then code <::> AddShiftedReg{regM=source, regN=X_Base32in64, regD=workReg1, shift=ShiftLSL 0w2, setFlags=false, opSize=OpSize64} <::> LoadFPRegScaled{regT=dest, regN=workReg1, unitOffset=0, floatSize=Double64} else code <::> LoadFPRegScaled{regT=dest, regN=source, unitOffset=0, floatSize=Double64} | codeExtended _ (UnboxTagFloat { floatSize=Float32, source, dest }, code) = (* This is tagged in native 64-bits. In 32-in-64 we're loading 32-bits so we can use an indexed load directly. *) if is32in64 then code <::> LoadFPRegIndexed{regN=X_Base32in64, regM=source, regT=dest, option=ExtUXTX ScaleOrShift, floatSize=Float32} else code <::> shiftConstant{direction=ShiftRightLogical, shift=0w32, regN=source, regD=workReg1, opSize=OpSize64} <::> MoveGeneralToFP{regN=workReg1, regD=dest, floatSize=Float32} | codeExtended _ (LoadAcquire{dest, base, loadType, ...}, code) = LoadAcquireReg{regT=dest, regN=base, loadType=loadType} :: code | codeExtended _ (StoreRelease{source, base, loadType, ...}, code) = StoreReleaseReg{regT=source, regN=base, loadType=loadType} :: code | codeExtended _ (BitFieldShift{ source, dest, isSigned, length, immr, imms }, code) = BitField{immr=immr, imms=imms, regN=source, regD=dest, bitfieldKind=if isSigned then BFSigned else BFUnsigned, opSize=length} :: code | codeExtended _ (BitFieldInsert{ source, destAsSource, dest, length, immr, imms }, code) = let (* If we're using BitFieldMove we retain some of the bits of the destination. The higher levels require us to treat that as a source. *) val _ = source = dest andalso raise InternalError "codeExtended: bitfield: dest=source" in BitField{immr=immr, imms=imms, regN=source, regD=dest, bitfieldKind=BFInsert, opSize=length} :: moveIfNecessary({src=destAsSource, dst=dest}, code) end | codeExtended {flow} (IndexedCaseOperation{testReg}, code) = let (* testReg contains the original value after the lowest value has been subtracted. Since both the original value and the lowest value were tagged it contains a shifted but untagged value. *) (* This should only be within a block with an IndexedBr flow type. *) val cases = case flow of IndexedBr cases => cases | _ => raise InternalError "codeGenICode: IndexedCaseOperation" val caseLabels = map getBlockLabel cases val tableLabel = createLabel labelMaker in code <::> LoadLabelAddress(workReg1, tableLabel) <::> (* Add the value shifted by one since it's already shifted. 
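(* A standalone check of the offset arithmetic, assuming each entry in the
   branch table that follows is a single 4-byte instruction; tag and
   tableByteOffset are illustrative names.  Tagged values are 2*n+1, so the
   difference of two tagged values is already twice the case index and one more
   left shift turns it into the byte offset added to the table address. *)
fun tag n = 2 * n + 1
fun tableByteOffset {value, lowest} =
    let
        val shiftedUntagged = tag value - tag lowest     (* = 2 * (value - lowest) *)
    in
        shiftedUntagged * 2                              (* = 4 * (value - lowest) *)
    end
(* e.g. tableByteOffset {value = 7, lowest = 3} = 16, the offset of the fifth entry. *)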
*) AddShiftedReg{regN=workReg1, regD=workReg1, regM=testReg, shift=ShiftLSL 0w1, setFlags=false, opSize=OpSize64} <::> BranchReg{regD=workReg1, brRegType=BRRBranch} <::> BranchTable{ startLabel=tableLabel, brTable=caseLabels } end | codeExtended {flow} (PushExceptionHandler, code) = let (* This should only be within a block with a SetHandler flow type. *) val handleLabel = case flow of SetHandler{ handler, ...} => handler | _ => raise InternalError "codeGenICode: PushExceptionHandler" val labelRef = getBlockLabel handleLabel in (* Push the old handler and the handler entry point and set the "current handler" to point to the stack after we've pushed these. *) code <::> LoadRegScaled{regT=workReg1, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset, loadType=Load64} <::> LoadLabelAddress(workReg2, labelRef) <::> StoreRegPair{regT1=workReg2, regT2=workReg1, regN=X_MLStackPtr, unitOffset= ~2, unscaledType=PreIndex, loadType=Load64} <::> StoreRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset, loadType=Load64} end | codeExtended _ (PopExceptionHandler, code) = (* Remove and discard the handler we've set up. Pop the previous handler and put into "current handler". *) code <::> LoadRegPair{regT1=XZero, regT2=workReg2, regN=X_MLStackPtr, unitOffset=2, unscaledType=PostIndex, loadType=Load64} <::> StoreRegScaled{regT=workReg2, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset, loadType=Load64} | codeExtended _ (BeginHandler{packetReg}, code) = let val beginHandleCode = code <::> (* The exception raise code resets the stack pointer to the value in the exception handler so this is probably redundant. Leave it for the moment, *) LoadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset, loadType=Load64} <::> LoadRegPair{regT1=XZero, regT2=workReg2, regN=X_MLStackPtr, unitOffset=2, unscaledType=PostIndex, loadType=Load64} <::> StoreRegScaled{regT=workReg2, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset, loadType=Load64} in moveIfNecessary({src=X0, dst=packetReg }, beginHandleCode) end | codeExtended _ (CompareByteVectors{vec1Addr, vec2Addr, length, ...}, code) = let (* Construct a loop to compare two vectors of bytes. *) val vec1Reg = vec1Addr and vec2Reg = vec2Addr and lenReg = length val loopLabel = createLabel labelMaker and exitLabel = createLabel labelMaker in code <::> (* Set the CC to Equal before we start in case length = 0 *) SubShiftedReg{regM=lenReg, regN=lenReg, regD=XZero, shift=ShiftNone, setFlags=true, opSize=OpSize64} <::> SetLabel loopLabel <::> (* Start of loop *) CompareBranch{ test=lenReg, label=exitLabel, onZero=true, opSize=OpSize64} <::> (* Go to the end when len = zero *) (* Load the bytes for the comparison and increment each. *) LoadRegUnscaled{regT=workReg1, regN=vec1Reg, byteOffset=1, unscaledType=PostIndex, loadType=Load8} <::> LoadRegUnscaled{regT=workReg2, regN=vec2Reg, byteOffset=1, unscaledType=PostIndex, loadType=Load8} <::> SubImmediate{regN=lenReg, regD=lenReg, immed=0w1, shifted=false, setFlags=false, opSize=OpSize64} <::> (* Decr len *) (* Compare *) SubShiftedReg{regM=workReg2, regN=workReg1, regD=XZero, shift=ShiftNone, setFlags=true, opSize=OpSize64} <::> ConditionalBranch(CondEqual, loopLabel) <::> (* Loop if they're equal *) SetLabel exitLabel end | codeExtended _ (BlockMove{srcAddr, destAddr, length, isByteMove}, code) = let (* Construct a loop to move the data. 
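(* A standalone model of the copy loop that follows; blockMove is an
   illustrative name and Word8Array stands in for the byte case.  The
   post-indexed load and store simply step both addresses by the element size
   until the count reaches zero, with the zero test at the top of the loop. *)
fun blockMove {src, dst, srcOff, dstOff, length} =
    let
        fun loop (_, _, 0) = ()
          | loop (s, d, len) =
            (
                Word8Array.update(dst, d, Word8Array.sub(src, s));
                loop (s + 1, d + 1, len - 1)
            )
    in
        loop (srcOff, dstOff, length)
    end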
*) val srcReg = srcAddr and destReg = destAddr and lenReg = length val loopLabel = createLabel labelMaker and exitLabel = createLabel labelMaker val (offset, loadType) = if isByteMove then (1, Load8) else if is32in64 then (4, Load32) else (8, Load64) in code <::> SetLabel loopLabel (* Start of loop *) <::> CompareBranch{ test=lenReg, label=exitLabel, onZero=true, opSize=OpSize64} <::> (* Exit when length = 0 *) LoadRegUnscaled{regT=workReg1, regN=srcReg, byteOffset=offset, loadType=loadType, unscaledType=PostIndex} <::> StoreRegUnscaled{regT=workReg1, regN=destReg, byteOffset=offset, loadType=loadType, unscaledType=PostIndex} <::> SubImmediate{regN=lenReg, regD=lenReg, immed=0w1, shifted=false, setFlags=false, opSize=OpSize64} <::> (* Decr len *) UnconditionalBranch loopLabel <::> (* Back to the start *) SetLabel exitLabel end | codeExtended _ (AddSubXSP{ source, dest, isAdd }, code) = let val allocFreeCode = (if isAdd then AddExtendedReg else SubExtendedReg) {regM=source, regN=XSP, regD=XSP, extend=ExtUXTX 0w0, setFlags=false, opSize=OpSize64} :: code in case dest of XZero => allocFreeCode | destReg => (* We have to use add here to get the SP into the destination instead of the usual move. *) AddImmediate{regN=XSP, regD=destReg, immed=0w0, shifted=false, setFlags=false, opSize=OpSize64} :: allocFreeCode end | codeExtended _ (TouchValue _, code) = code (* Don't need to do anything now. *) (* Used in mutex operations. *) | codeExtended _ (LoadAcquireExclusive{ base, dest }, code) = LoadAcquireExclusiveRegister{regN=base, regT=dest} :: code | codeExtended _ (StoreReleaseExclusive{ base, source, result }, code) = StoreReleaseExclusiveRegister{regS=result, regT=source, regN=base} :: code | codeExtended _ (MemoryBarrier, code) = code <::> MemBarrier | codeExtended _ (ConvertIntToFloat{ source, dest, srcSize, destSize}, code) = (CvtIntToFP{regN=source, regD=dest, floatSize=destSize, opSize=srcSize}) :: code | codeExtended _ (ConvertFloatToInt{ source, dest, srcSize, destSize, rounding}, code) = (CvtFloatToInt{regN=source, regD=dest, round=rounding, floatSize=srcSize, opSize=destSize}) :: code | codeExtended _ (UnaryFloatingPt{ source, dest, fpOp}, code) = (FPUnaryOp{regN=source, regD=dest, fpOp=fpOp}) :: code | codeExtended _ (BinaryFloatingPoint{ arg1, arg2, dest, fpOp, opSize}, code) = (FPBinaryOp{regN=arg1, regM=arg2, regD=dest, floatSize=opSize, fpOp=fpOp}) :: code | codeExtended _ (CompareFloatingPoint{ arg1, arg2, opSize, ...}, code) = (FPComparison{regN=arg1, regM=arg2, floatSize=opSize}) :: code | codeExtended _ (CPUYield, code) = code <::> Yield | codeExtended _ (AtomicOperation{ base, source, dest, atOp }, code) = AtomicExtension{regN=base, regS=source, regT=dest, atOp=atOp} :: code - | codeExtended _ (CacheCheck{ arg1, arg2 }, code) = - let - val okLabel = createLabel labelMaker - in - (code <::> SubShiftedReg {regM=arg1, regN=arg2, regD=XZero, shift=ShiftNone, opSize=OpSize64, setFlags=true} <::> - ConditionalBranch(CondEqual, okLabel) <::> - MoveXRegToXReg{sReg=XZero, dReg=X16} <::> - LoadRegScaled{regT=X16, regN=X16, unitOffset=0, loadType=Load16} <::> - SetLabel okLabel) - end - local (* processed - set to true when a block has been processed. *) val processed = Array.array(numBlocks, false) fun haveProcessed n = Array.sub(processed, n) (* Find the blocks that reference this one. This isn't essential but allows us to try to generate blocks in the order of the control flow. This in turn may allow us to use short branches rather than long ones. 
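(* A standalone sketch of the reference counting; predecessors is an
   illustrative name.  With blocks numbered from 0 and each block listing its
   successors, every block ends up with the list of blocks that jump to it; a
   block whose only reference is the block laid out immediately before it then
   needs neither a label nor a branch. *)
fun predecessors (successors: int list vector) =
    let
        val refs = Array.array(Vector.length successors, [])
    in
        Vector.appi
            (fn (from, succs) =>
                List.app (fn to => Array.update(refs, to, from :: Array.sub(refs, to))) succs)
            successors;
        Array.vector refs
    end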
*) val labelRefs = Array.array(numBlocks, []) datatype flowCode = FlowCodeSimple of int | FlowCodeCMove of {code: precode list, trueJump: int, falseJump: int} (* Process this recursively to set the references. If we have unreachable blocks, perhaps because they've been merged, we don't want to include them in the reference counting. This shouldn't happen now that IdentifyReferences removes unreferenced blocks. *) fun setReferences fromLabel toLabel = case Array.sub(labelRefs, toLabel) of [] => (* Not yet visited at all. *) let val BasicBlock{ flow, ...} = Vector.sub(blocks, toLabel) val refs = case flow of ExitCode => [] | Unconditional lab => [lab] | Conditional{trueJump, falseJump, ... } => [trueJump, falseJump] | IndexedBr labs => labs | SetHandler { handler, continue } => [handler, continue] | UnconditionalHandle _ => [] | ConditionalHandle { continue, ...} => [continue] val () = if fromLabel >= 0 then Array.update(labelRefs, toLabel, [fromLabel]) else () in List.app (setReferences toLabel) refs end | refs => (* We've visiting this at least once. Just add us to the list. *) Array.update(labelRefs, toLabel, fromLabel :: refs) val _ = setReferences 0 0 (* Process the blocks. We keep the "stack" explicit rather than using recursion because this allows us to select both arms of a conditional branch sooner. *) fun genCode(toDo, lastFlow, code) = case List.filter (not o haveProcessed) toDo of [] => let (* There's nothing left to do. We may need to add a final branch to the end. *) val finalBranch = case lastFlow of ExitCode => [] | IndexedBr _ => [] | Unconditional dest => [(UnconditionalBranch(getBlockLabel dest))] | Conditional { condition, trueJump, falseJump, ...} => [ (UnconditionalBranch(getBlockLabel falseJump)), (ConditionalBranch(condition, getBlockLabel trueJump)) ] | SetHandler { continue, ...} => [(UnconditionalBranch(getBlockLabel continue))] | UnconditionalHandle _ => [] | ConditionalHandle { continue, ...} => [(UnconditionalBranch(getBlockLabel continue))] in finalBranch @ code (* Done. *) end | stillToDo as head :: _ => let local (* Check the references. If all the sources that lead up to this have already been we won't have any backward jumps. *) fun available dest = List.all haveProcessed (Array.sub(labelRefs, dest)) val continuation = case lastFlow of ExitCode => NONE | IndexedBr _ => NONE (* We could put the last branch in here. *) | Unconditional dest => if not (haveProcessed dest) andalso available dest then SOME(FlowCodeSimple dest) else NONE | Conditional {trueJump, falseJump, condition, ...} => - (* We can usually choose either destination and in nearly all cases - it won't matter. The default branch is not to take forward jumps - so if there is reason to believe that one branch is more likely - we should follow that branch now and leave the other. If we - have Cond(No)Overflow we assume that overflow is unusual. - If one branch raises an exception we assume that that is unusual. *) let - val (first, second) = - case (condition, Vector.sub(blocks, falseJump)) of - (CondNoOverflow, _) => (trueJump, falseJump) - | (_, BasicBlock{ flow=ExitCode, block, ...}) => - if List.exists(fn RaiseExceptionPacket _ => true | _ => false) block - then (trueJump, falseJump) - else (falseJump, trueJump) - | _ => (falseJump, trueJump) + (* Can we replace this with a conditional set? If both arms simply set + a register to a value and either return or jump to the same location + we can. 
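(* A standalone model of the conditional-set rewrite, assuming the usual CSINC
   semantics: the destination receives regTrue when the condition holds and
   regFalse + 1 otherwise; csinc is an illustrative name.  With regFalse fixed
   at the zero register the false case yields 1, the tagged representation of
   false, so the common boolean result needs no branch at all. *)
fun csinc {cond, regTrue, regFalse} =
    if cond then regTrue else regFalse + 0w1
(* After loading 0w3 (tagged true) into the destination,
   csinc {cond = c, regTrue = 0w3, regFalse = 0w0} is 0w3 when c holds
   and 0w1 (tagged false) when it does not. *)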
*) + (* The most common case is producing a boolean result, 1 (tagged 0) for false and + 3 (tagged 1) for true. We look for blocks that generate this and also + functions that return this. *) + val BasicBlock { flow=tFlow, block=tBlock, ...} = Vector.sub(blocks, trueJump) + and BasicBlock { flow=fFlow, block=fBlock, ...} = Vector.sub(blocks, falseJump) + + val isPossSetCCOrCmov = + if not (haveProcessed trueJump) andalso available trueJump + andalso not (haveProcessed falseJump) andalso available falseJump + then case (tFlow, fFlow, tBlock, fBlock) of + (Unconditional tDest, Unconditional fDest, + [LoadNonAddressConstant{ source=sourceTrue, dest=destTrue }], + [LoadNonAddressConstant{ source=sourceFalse, dest=destFalse }]) => + if tDest = fDest andalso destTrue = destFalse + then if sourceFalse = 0w1 + then SOME{code= + (* We can generate 1 by using CSINC and XZero. *) + code <::> LoadNonAddr(destTrue, sourceTrue) <::> + ConditionalSet{regD=destTrue, regTrue=destTrue, regFalse=XZero, cond=condition, condSet=CondSetIncr, opSize=OpSize64}, + trueJump=trueJump, falseJump=falseJump} + else if sourceTrue = 0w1 + then SOME{code= + (* We can generate 1 by using CSINC and XZero. *) + code <::> LoadNonAddr(destTrue, sourceFalse) <::> + ConditionalSet{regD=destTrue, regTrue=destTrue, regFalse=XZero, cond=invertTest condition, condSet=CondSetIncr, opSize=OpSize64}, + trueJump=trueJump, falseJump=falseJump} + else NONE + else NONE + + | (ExitCode, ExitCode, + [LoadNonAddressConstant{ source=sourceTrue, dest=destTrue }, + ReturnResultFromFunction { results=[(retRT, retRTR)], fpResults=[], returnReg=retaT, numStackArgs=numST }], + [LoadNonAddressConstant{ source=sourceFalse, dest=destFalse }, + ReturnResultFromFunction { results=[(retRF, retRFR)], fpResults=[], returnReg=retaF, numStackArgs=numSF }]) => + if destTrue = retRT andalso destFalse = retRF (* These ought to X0. *) + then + let + val _ = retRTR = X0 andalso retRFR = X0 orelse raise InternalError "Test for condmove: mismatch regs" + val _ = retaT = retaF orelse raise InternalError "Test for condmove: mismatched return regs" + val _ = numST = numSF orelse raise InternalError "Test for condmove: mismatched stack" + fun resetAndReturn code = + (if numST = 0 + then code + else addConstantWord({regS=X_MLStackPtr, regD=X_MLStackPtr, regW=X3, + value=Word64.fromLarge(Word.toLarge nativeWordSize) * Word64.fromInt numSF}, code) + ) <::> BranchReg{regD=retaT, brRegType=BRRReturn} + in + if sourceFalse = 0w1 + then SOME{code= + resetAndReturn( + code <::> LoadNonAddr(destTrue, sourceTrue) <::> + ConditionalSet{regD=destTrue, regTrue=destTrue, regFalse=XZero, cond=condition, condSet=CondSetIncr, opSize=OpSize64}), + trueJump=trueJump, falseJump=falseJump} + else if sourceTrue = 0w1 + then SOME{code= + resetAndReturn( + code <::> LoadNonAddr(destTrue, sourceFalse) <::> + ConditionalSet{regD=destTrue, regTrue=destTrue, regFalse=XZero, cond=invertTest condition, condSet=CondSetIncr, opSize=OpSize64}), + trueJump=trueJump, falseJump=falseJump} + else NONE + end + else NONE + + | _ => NONE + else NONE in - if not (haveProcessed first) andalso available first - then SOME(FlowCodeSimple first) - else if not (haveProcessed second) andalso available second - then SOME(FlowCodeSimple second) - else NONE - end + case isPossSetCCOrCmov of + NONE => + (* We can usually choose either destination and in nearly all cases + it won't matter. The ARM doesn't seem to define what happens if + a conditional branch hasn't been seen before. 
Assume it's the + same as the X86 and that conditional forward branches aren't + taken. Arrange this so that something that raises an exception + is assumed to be "unusual". *) + let + val (first, second) = + case (condition, Vector.sub(blocks, falseJump)) of + (CondNoOverflow, _) => (trueJump, falseJump) + | (_, BasicBlock{ flow=ExitCode, block, ...}) => + if List.exists(fn RaiseExceptionPacket _ => true | _ => false) block + then (trueJump, falseJump) + else (falseJump, trueJump) + | _ => (falseJump, trueJump) + in + if not (haveProcessed first) andalso available first + then SOME(FlowCodeSimple first) + else if not (haveProcessed second) andalso available second + then SOME(FlowCodeSimple second) + else NONE + end + | SOME args => SOME(FlowCodeCMove args) + end | SetHandler { continue, ... } => (* We want the continuation if possible. We'll need a branch round the handler so that won't help. *) if not (haveProcessed continue) andalso available continue then SOME(FlowCodeSimple continue) else NONE | UnconditionalHandle _ => NONE | ConditionalHandle _ => NONE in (* First choice - continue the existing block. Second choice - the first item whose sources have all been processed. Third choice - something from the list. *) val picked = case continuation of SOME c => c | NONE => case List.find available stillToDo of SOME c => FlowCodeSimple c | NONE => FlowCodeSimple head end in case picked of FlowCodeSimple picked => let val () = Array.update(processed, picked, true) (* Code to terminate the previous block. *) val startCode = case lastFlow of ExitCode => [] | IndexedBr _ => [] | UnconditionalHandle _ => [] | Unconditional dest => if dest = picked then [] else [(UnconditionalBranch(getBlockLabel dest))] | ConditionalHandle { continue, ...} => if continue = picked then [] else [(UnconditionalBranch(getBlockLabel continue))] | SetHandler { continue, ... } => if continue = picked then [] else [(UnconditionalBranch(getBlockLabel continue))] | Conditional { condition, trueJump, falseJump, ...} => if picked = falseJump (* Usual case. *) then [(ConditionalBranch(condition, getBlockLabel trueJump))] else if picked = trueJump then (* We have a jump to the true condition. Invert the jump. This is more than an optimisation. Because this immediately precedes the true block we're not going to generate a label. *) [(ConditionalBranch(invertTest condition, getBlockLabel falseJump))] else [ (UnconditionalBranch(getBlockLabel falseJump)), (ConditionalBranch(condition, getBlockLabel trueJump)) ] (* Code-generate the body with the code we've done so far at the end. Add a label at the start if necessary. *) local (* If the previous block dropped through to this and this was the only reference then we don't need a label. *) fun onlyJumpingHere (lab: int) = if lab <> picked then false else case Array.sub(labelRefs, picked) of [singleton] => singleton = lab | _ => false val noLabel = case lastFlow of ExitCode => picked = 0 (* Unless this was the first block. 
*) | Unconditional dest => onlyJumpingHere dest | Conditional { trueJump, falseJump, ...} => onlyJumpingHere trueJump orelse onlyJumpingHere falseJump | IndexedBr _ => false | SetHandler _ => false | UnconditionalHandle _ => false | ConditionalHandle { continue, ...} => onlyJumpingHere continue in val startLabel = if noLabel then [] else [(SetLabel(getBlockLabel picked))] end val BasicBlock { flow, block, ...} = Vector.sub(blocks, picked) local fun genCodeBlock(instr, code) = codeExtended {flow=flow} (instr, code) in val bodyCode = List.foldl genCodeBlock (startLabel @ startCode @ code) block end val addSet = case flow of ExitCode => [] | IndexedBr cases => cases | Unconditional dest => [dest] | Conditional {trueJump, falseJump, ...} => [falseJump, trueJump] | SetHandler { handler, continue } => [handler, continue] | UnconditionalHandle _ => [] | ConditionalHandle { continue, ...} => [continue] in genCode(addSet @ stillToDo, flow, bodyCode) end | FlowCodeCMove{code, trueJump, falseJump} => let (* We've generated a conditional move and possibly a return. If the trueJump and falseJump are only ever referenced from this block they're done, otherwise we still need to do them. *) val _ = case Array.sub(labelRefs, trueJump) of [_] => Array.update(processed, trueJump, true) | _ => () val _ = case Array.sub(labelRefs, falseJump) of [_] => Array.update(processed, falseJump, true) | _ => () val BasicBlock { flow, ...} = Vector.sub(blocks, trueJump) val addSet = case flow of ExitCode => [] | Unconditional dest => [dest] | _ => raise InternalError "FlowCodeCMove" in genCode(addSet @ stillToDo, flow, code) end end in val ops = genCode([0], ExitCode, [(SetLabel startOfFunctionLabel)]) end in generateFinalCode{instrs=List.rev ops, name=functionName, resultClosure=resultClosure, parameters=debugSwitches, profileObject=profileObject, labelMaker= labelMaker} end structure Sharing = struct type ('genReg, 'optGenReg, 'fpReg) basicBlock = ('genReg, 'optGenReg, 'fpReg) basicBlock and xReg = xReg and vReg = vReg and closureRef = closureRef end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64IdentifyReferences.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64IdentifyReferences.ML index a9372250..03f97a9d 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64IdentifyReferences.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64IdentifyReferences.ML @@ -1,892 +1,893 @@ (* Copyright (c) 2021-2 David C.J. Matthews This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64IdentifyReferences( structure Arm64ICode: ARM64ICODE structure Debug: DEBUG structure IntSet: INTSET ): ARM64IDENTIFYREFERENCES = struct open Arm64ICode open IntSet type regState = { active: int, refs: int, pushState: bool, prop: regProperty } (* CC states before and after. Currently no instruction uses the condition; conditional branches are handled at the block level. 
The result of executing the instruction may be to set the condition code to a defined state, an undefined state or leave it unchanged. *) datatype outCCState = CCSet of ccRef | CCIndeterminate | CCUnchanged and inCCState = CCNeeded of ccRef | CCUnused datatype extendedBasicBlock = ExtendedBasicBlock of { block: {instr: iCodeAbstract, current: intSet, active: intSet, kill: intSet } list, flow: controlFlow, locals: intSet, (* Defined and used entirely within the block. *) imports: intSet, (* Defined outside the block, used inside it, but not needed afterwards. *) exports: intSet, (* Defined within the block, possibly used inside, but used outside. *) passThrough: intSet, (* Active throughout the block. May be referred to by it but needed afterwards. *) loopRegs: intSet, (* Destination registers for a loop. They will be updated by this block. *) initialStacks: intSet, (* Stack items required at the start i.e. imports+passThrough for stack items. *) inCCState: ccRef option, (* The state this block assumes. If SOME _ all predecessors must set it. *) outCCState: ccRef option (* The condition code set by this block. SOME _ if at least one successor needs it. *) } exception InternalError = Misc.InternalError (* Return the list of blocks that are the immediate successor of this. *) fun blockSuccessors(BasicBlock{flow, ...}) = successorBlocks flow fun getOptReg(SomeReg reg) = [reg] | getOptReg ZeroReg = [] fun getInstructionState(MoveRegister { source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadNonAddressConstant { dest, ...}) = { sources=[], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadFPConstant { dest, ...}) = { sources=[], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadAddressConstant { dest, ...}) = { sources=[], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadWithConstantOffset { base, dest, ...}) = { sources=[base], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadFPWithConstantOffset { base, dest, ...}) = { sources=[base], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadWithIndexedOffset { base, dest, index, ...}) = { sources=[base, index], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadFPWithIndexedOffset { base, dest, index, ...}) = { sources=[base, index], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(GetThreadId { dest, ...}) = { sources=[], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(ObjectIndexAddressToAbsolute { source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(AbsoluteToObjectIndex { source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(AllocateMemoryFixed { dest, ...}) = { sources=[], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(AllocateMemoryVariable{size, dest, ...}) = { sources=[size], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(InitialiseMem{size, addr, init}) = { sources=[size, addr, init], dests=[], sStacks=[], 
dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BeginLoop) = (* This is just a marker. It doesn't actually generate any code. *) { sources=[], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(JumpLoop{regArgs, stackArgs, ...}) = let fun getSourceFromRegs({src=ArgInReg reg, ...}, (regArgs, stackArgs)) = (reg :: regArgs, stackArgs) | getSourceFromRegs({src=ArgOnStack { container, ...}, ...}, (regArgs, stackArgs)) = (regArgs, container :: stackArgs) and getSourceFromStack({src=ArgInReg reg, ...}, (regArgs, stackArgs)) = (reg :: regArgs, stackArgs) | getSourceFromStack({src=ArgOnStack { container, ...}, ...}, (regArgs, stackArgs)) = (regArgs, container :: stackArgs) val (regSources, stackSources) = List.foldl getSourceFromRegs (List.foldl getSourceFromStack ([], []) stackArgs) regArgs in { sources=regSources, dests=[], sStacks=stackSources, dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } end | getInstructionState(StoreWithConstantOffset { base, source, ...}) = { sources=[source, base], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(StoreFPWithConstantOffset { base, source, ...}) = { sources=[source, base], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(StoreWithIndexedOffset { base, source, index, ...}) = { sources=[source, base, index], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(StoreFPWithIndexedOffset { base, source, index, ...}) = { sources=[source, base, index], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(AddSubImmediate{ source, dest, ccRef, ... }) = { sources=[source], dests=getOptReg dest, sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=case ccRef of NONE => CCUnchanged | SOME cc => CCSet cc } | getInstructionState(AddSubRegister{ base, shifted, dest, ccRef, ... }) = { sources=[base, shifted], dests=getOptReg dest, sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=case ccRef of NONE => CCUnchanged | SOME cc => CCSet cc } | getInstructionState(LogicalImmediate{ source, dest, ccRef, ... }) = { sources=[source], dests=getOptReg dest, sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=case ccRef of NONE => CCUnchanged | SOME cc => CCSet cc } | getInstructionState(LogicalRegister{ base, shifted, dest, ccRef, ... }) = { sources=[base, shifted], dests=getOptReg dest, sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=case ccRef of NONE => CCUnchanged | SOME cc => CCSet cc } | getInstructionState(ShiftRegister{ source, shift, dest, ... }) = { sources=[source, shift], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(Multiplication{ dest, sourceA, sourceM, sourceN, ... }) = { sources=getOptReg sourceA @ [sourceM, sourceN], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(Division{ dest, dividend, divisor, ... 
}) = { sources=[dividend, divisor], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } - | getInstructionState(BeginFunction {regArgs, stackArgs, ...}) = - { sources=[], dests=map #1 regArgs, sStacks=[], dStacks=stackArgs, ccIn=CCUnused, ccOut=CCIndeterminate } + | getInstructionState(BeginFunction {regArgs, fpRegArgs, stackArgs, ...}) = + { sources=[], dests=map #1 regArgs @ map #1 fpRegArgs, sStacks=[], dStacks=stackArgs, ccIn=CCUnused, ccOut=CCIndeterminate } - | getInstructionState(FunctionCall{regArgs, stackArgs, dests, containers, ...}) = + | getInstructionState(FunctionCall{regArgs, stackArgs, dests, fpRegArgs, fpDests, containers, ...}) = let (* Non-tail-recursive. Behaves as a normal reference to sources. *) fun getSourceFromRegs((ArgInReg reg, _), (regArgs, stackArgs)) = (reg :: regArgs, stackArgs) | getSourceFromRegs((ArgOnStack { container, ...}, _), (regArgs, stackArgs)) = (regArgs, container :: stackArgs) and getSourceFromStack(ArgInReg reg, (regArgs, stackArgs)) = (reg :: regArgs, stackArgs) | getSourceFromStack(ArgOnStack { container, ...}, (regArgs, stackArgs)) = (regArgs, container :: stackArgs) val (argSources, stackSources) = List.foldl getSourceFromRegs (List.foldl getSourceFromStack ([], []) stackArgs) regArgs + val fpArgSources = List.map #1 fpRegArgs in - { sources=argSources, dests=List.map #1 dests, sStacks=stackSources @ containers, dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } + { sources=argSources @ fpArgSources, dests=List.map #1 dests @ List.map #1 fpDests, sStacks=stackSources @ containers, dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } end - | getInstructionState(TailRecursiveCall{regArgs, stackArgs, ...}) = + | getInstructionState(TailRecursiveCall{regArgs, fpRegArgs, stackArgs, ...}) = let (* Tail recursive call. References the argument sources but exits. *) fun getSourceFromRegs((ArgInReg reg, _), (regArgs, stackArgs)) = (reg :: regArgs, stackArgs) | getSourceFromRegs((ArgOnStack { container, ...}, _), (regArgs, stackArgs)) = (regArgs, container :: stackArgs) and getSourceFromStack({src=ArgInReg reg, ...}, (regArgs, stackArgs)) = (reg :: regArgs, stackArgs) | getSourceFromStack({src=ArgOnStack { container, ...}, ...}, (regArgs, stackArgs)) = (regArgs, container :: stackArgs) val (argSources, stackSources) = List.foldl getSourceFromRegs (List.foldl getSourceFromStack ([], []) stackArgs) regArgs + val fpArgSources = List.map #1 fpRegArgs in - { sources=argSources, dests=[], sStacks=stackSources, dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } + { sources=argSources@fpArgSources, dests=[], sStacks=stackSources, dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } end - | getInstructionState(ReturnResultFromFunction{results, returnReg, ...}) = - { sources=returnReg :: List.map #1 results, dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } + | getInstructionState(ReturnResultFromFunction{results, fpResults, returnReg, ...}) = + { sources=returnReg :: List.map #1 results @ List.map #1 fpResults, dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(RaiseExceptionPacket{packetReg}) = { sources=[packetReg], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(PushToStack{ source, container, ... }) = { sources=[source], dests=[], sStacks=[], dStacks=[container], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadStack{ dest, container, ... 
}) = { sources=[], dests=[dest], sStacks=[container], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(StoreToStack{ source, container, ... }) = (* Although this stores into the container it must already exist. *) { sources=[source], dests=[], sStacks=[container], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(ContainerAddress{ dest, container, ... }) = { sources=[], dests=[dest], sStacks=[container], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(ResetStackPtr _) = { sources=[], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(TagValue{source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(UntagValue{source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BoxLarge{source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(UnboxLarge{source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BoxTagFloat{source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(UnboxTagFloat{source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadAcquire { base, dest, ...}) = { sources=[base], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(StoreRelease { base, source, ...}) = { sources=[source, base], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BitFieldShift{ source, dest, ... }) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BitFieldInsert{ source, destAsSource, dest, ... }) = { sources=[source, destAsSource], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(IndexedCaseOperation{ testReg }) = { sources=[testReg], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(PushExceptionHandler) = { sources=[], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(PopExceptionHandler) = { sources=[], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BeginHandler{packetReg}) = (* The packet register is a destination since this provides its definition. 
*) { sources=[], dests=[packetReg], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(CompareByteVectors{vec1Addr, vec2Addr, length, ccRef, ...}) = { sources=[vec1Addr, vec2Addr, length], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCSet ccRef } | getInstructionState(BlockMove{srcAddr, destAddr, length, ...}) = { sources=[srcAddr, destAddr, length], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(AddSubXSP{source, dest, ...}) = { sources=[source], dests=getOptReg dest, sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(TouchValue{source}) = { sources=[source], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadAcquireExclusive{base, dest}) = { sources=[base], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(StoreReleaseExclusive{base, source, result}) = { sources=[base] @ getOptReg source, dests=[result], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(MemoryBarrier) = { sources=[], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(ConvertIntToFloat{ source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(ConvertFloatToInt{ source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(UnaryFloatingPt{ source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BinaryFloatingPoint{ arg1, arg2, dest, ...}) = { sources=[arg1, arg2], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(CompareFloatingPoint{ arg1, arg2, ccRef, ...}) = { sources=[arg1, arg2], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCSet ccRef } | getInstructionState(CPUYield) = { sources=[], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(AtomicOperation{ base, source, dest, ... }) = { sources=base :: getOptReg source, dests=getOptReg dest, sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } - | getInstructionState(CacheCheck{ arg1, arg2}) = - { sources=[arg1, arg2], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } - (* These instructions can be eliminated if their register sources are not used. There may be other cases. *) fun eliminateable(MoveRegister _) = true | eliminateable(LoadNonAddressConstant _) = true | eliminateable(LoadAddressConstant _) = true | eliminateable(LoadWithConstantOffset _) = true | eliminateable(LoadWithIndexedOffset _) = true | eliminateable(ObjectIndexAddressToAbsolute _) = true | eliminateable(TagValue _) = true | eliminateable(UntagValue _) = true | eliminateable(BoxLarge _) = true | eliminateable(UnboxLarge _) = true + | eliminateable(BoxTagFloat _) = true + | eliminateable(UnboxTagFloat _) = true | eliminateable _ = false fun identifyRegs(blockVector, pregProps): extendedBasicBlock vector * regState vector = let val maxPRegs = Vector.length pregProps val vectorLength = Vector.length blockVector (* Initial arrays - declarationArray is the set of registers given values by the block, importArray is the set of registers referenced by the block and not declared locally. 
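(* Illustrative note, not part of the original source: why the blocks are scanned
   depth-first and each block is folded from its end.  By the time an instruction
   is reached, the reference counts in regRefs (built below) already cover every
   later use of its destinations.  So if, say, PReg 7 is written by a
   LoadNonAddressConstant and Array.sub(regRefs, 7) is still 0 at that point, the
   instruction is one of the eliminateable cases listed above and is simply
   dropped; its own sources are then never marked as referenced, which may allow
   the instructions that produced them to be dropped in turn. *)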
*) val declarationArray = Array.array(vectorLength, emptySet) and importArray = Array.array(vectorLength, emptySet) val stackDecArray = Array.array(vectorLength, emptySet) and stackImportArray = Array.array(vectorLength, emptySet) and localLoopRegArray = Array.array(vectorLength, emptySet) (* References - this is used locally to see if a register is ever actually used and also included in the result which uses it as part of the choice of which register to spill. *) val regRefs = Array.array(maxPRegs, 0) (* Registers that must be pushed because they are required after a function call. For cache registers this means "discard". *) and requirePushOrDiscard = Array.array(maxPRegs, false) fun incrRef r = Array.update(regRefs, r, Array.sub(regRefs, r)+1) (* Contains the, possibly filtered, code for each block. *) val resultCode = Array.array(vectorLength, NONE) val ccInStates = Array.array(vectorLength, CCUnused) and ccOutStates = Array.array(vectorLength, CCIndeterminate) (* First pass - for each block build up the sets of registers defined and used in the block. We do this depth-first so that we can use "refs" to see if a register is used. If this is an instruction that can be eliminated we don't need to generate it and can ignore any references it makes. *) local fun blockScan blockNo = if isSome(Array.sub(resultCode, blockNo)) then () else let val () = Array.update(resultCode, blockNo, SOME []) (* Prevent looping. *) val thisBlock as BasicBlock { block, flow, ...} = Vector.sub(blockVector, blockNo) val successors = blockSuccessors thisBlock (* Visit everything reachable first. *) val () = List.app blockScan successors fun scanCode(instr, original as { code, decs, refs, sDecs, sRefs, occIn, occOut, loopRegs, ... }) = let val { sources, dests, sStacks=stackSrcs, dStacks=stackDests, ccIn, ccOut, ... } = getInstructionState instr fun regNo(PReg i) = i and stackNo(StackLoc{rno, ...}) = rno val destRegNos = map regNo dests and sourceRegNos = map regNo sources val stackDestRegNos = map stackNo stackDests and stackSourceRegNos = map stackNo stackSrcs (* If this instruction requires a cc i.e. is SetToCondition or X87FPGetCondition we need to set this as a requirement earlier. If this sets the CC and it is the condition we've been expecting we've satisfied it and can set the previous condition to Unused. We could use this to decide if a comparison is no longer required. That can only happen in very specific circumstances e.g. some tests in Test176.ML so it's not worthwhile. *) val newInCC = case (ccIn, ccOut, occIn) of (cc as CCNeeded _, _, _) => cc (* This instr needs a particular cc. *) | (CCUnused, CCSet _, _) => CCUnused | (CCUnused, _, occIn) => occIn (* If this instruction modifies the CC check to see if it is setting an requirement. *) val _ = case (occIn, ccOut) of (CCNeeded ccRIn, CCSet ccRout) => if ccRIn = ccRout then () else raise InternalError "CCCheck failed" | (CCNeeded _, CCIndeterminate) => raise InternalError "CCCheck failed" | _ => () (* The output CC is the last CC set. Tail instructions that don't change the CC state are ignored until we reach an instruction that sets it. *) val newOutCC = case occOut of CCUnchanged => ccOut | _ => occOut val instrLoopRegs = case instr of JumpLoop{regArgs, ...} => listToSet (map (regNo o #dst) regArgs) | _ => emptySet in if eliminateable instr andalso List.all(fn dReg => Array.sub(regRefs, dReg) = 0) destRegNos then original (* Don't include this instruction. *) else let (* Only mark the sources as referred after we know we're going to need this. 
In that way we may eliminate the instruction that created this source. *) val () = List.app incrRef sourceRegNos in { code = instr :: code, decs = union(listToSet destRegNos, decs), refs = union(listToSet sourceRegNos, refs), sDecs = union(listToSet stackDestRegNos, sDecs), sRefs = union(listToSet stackSourceRegNos, sRefs), occIn = newInCC, occOut = newOutCC, loopRegs = union(loopRegs, instrLoopRegs)} end end (* If we have a conditional branch at the end we need the condition code. It should either be set here or in a preceding block. *) val inCC = case flow of Conditional { ccRef, ...} => CCNeeded ccRef | _ => CCUnused val { code, decs, refs, sDecs, sRefs, occIn, occOut, loopRegs, ... } = List.foldr scanCode {code=[], decs=emptySet, refs=emptySet, sDecs=emptySet, sRefs=emptySet, occIn=inCC, occOut=CCUnchanged, loopRegs=emptySet} block in Array.update(declarationArray, blockNo, decs); (* refs includes local declarations. Remove before adding to the result. *) Array.update(importArray, blockNo, minus(refs, decs)); Array.update(localLoopRegArray, blockNo, loopRegs); Array.update(stackDecArray, blockNo, sDecs); Array.update(stackImportArray, blockNo, minus(sRefs, sDecs)); Array.update(resultCode, blockNo, SOME code); Array.update(ccInStates, blockNo, occIn); Array.update(ccOutStates, blockNo, occOut) end in val () = blockScan 0 (* Start with the root block. *) end (* Second phase - Propagate reference information between the blocks. We need to consider loops here. Do a depth-first scan marking each block. If we find a loop we save the import information we've used. If when we come to process that block we find the import information is different we need to reprocess. *) (* Pass through array - values used in other blocks after this that are not declared in this block. *) val passThroughArray = Array.array(vectorLength, emptySet) val stackPassThroughArray = Array.array(vectorLength, emptySet) (* Exports - those of our declarations that are used in other blocks. *) val exportArray = Array.array(vectorLength, emptySet) val stackExportArray = Array.array(vectorLength, emptySet) (* Loop registers. This contains the registers that are not exported from or passed through this block but are used subsequently as loop registers. *) val loopRegArray = Array.array(vectorLength, emptySet) val () = Array.copy{src=localLoopRegArray, dst=loopRegArray, di=0} (* If any one of the successors requires the CC then this is set. Otherwise we leave it as Unused. *) val ccRequiredOut = Array.array(vectorLength, CCUnused) local datatype loopData = Unprocessed | Processing | Processed | Looped of { regSet: intSet, loopSet: intSet, stackSet: intSet, ccState: inCCState } fun reprocessLoop () = let val reprocess = ref false val loopArray = Array.array(vectorLength, Unprocessed) fun processBlocks blockNo = case Array.sub(loopArray, blockNo) of Processed => (* Already seen this by a different route. *) { regSet = union(Array.sub(passThroughArray, blockNo), Array.sub(importArray, blockNo)), stackSet = union(Array.sub(stackPassThroughArray, blockNo), Array.sub(stackImportArray, blockNo)), ccState = Array.sub(ccInStates, blockNo), loopSet = Array.sub(loopRegArray, blockNo) } | Looped s => s (* We've already seen this in a loop. *) | Processing => (* We have a loop. *) let (* Use the existing input array. 
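(* Illustrative note, not part of the original source: the loop case is a simple
   fixed-point iteration.  Suppose block 2 jumps back to block 1.  When block 1 is
   reached again while still marked Processing, the import and pass-through sets
   recorded so far for block 1 are returned and remembered in a Looped entry.
   Once block 1 has been fully processed its final input sets are compared with
   that snapshot; if they differ, reprocess is set and reprocessLoop runs the
   whole scan again until nothing changes. *)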
*) val inputs = { regSet = union(Array.sub(passThroughArray, blockNo), Array.sub(importArray, blockNo)), stackSet = union(Array.sub(stackPassThroughArray, blockNo), Array.sub(stackImportArray, blockNo)), ccState = Array.sub(ccInStates, blockNo), loopSet = Array.sub(loopRegArray, blockNo) } val () = Array.update(loopArray, blockNo, Looped inputs) in inputs end | Unprocessed => (* Normal case - not visited yet. *) let val () = Array.update(loopArray, blockNo, Processing) val thisBlock = Vector.sub(blockVector, blockNo) val ourDeclarations = Array.sub(declarationArray, blockNo) and ourStackDeclarations = Array.sub(stackDecArray, blockNo) and ourLocalLoopRegs = Array.sub(localLoopRegArray, blockNo) val successors = blockSuccessors thisBlock fun addSuccessor b = let val {regSet=theirImports, stackSet=theirStackImports, ccState=theirInState, loopSet=theirLoops} = processBlocks b (* Remove loop regs from the imports if they are actually given new values by this block. We don't want to pass the old loop regs through here. *) val theirImports = minus(theirImports, ourLocalLoopRegs) (* Split the imports. If a register is a local declaration then it becomes an export. If it is not it becomes part of our passThrough. *) val (addToExp, addToImp) = IntSet.partition (fn i => member(i, ourDeclarations)) theirImports val (addToStackExp, addToStackImp) = IntSet.partition (fn i => member(i, ourStackDeclarations)) theirStackImports (* Merge the input states from each of the successors. *) val () = case (theirInState, Array.sub(ccRequiredOut, blockNo)) of (CCNeeded ts, CCNeeded req) => if ts = req then () else raise InternalError "Mismatched states" | (ts as CCNeeded _, _) => Array.update(ccRequiredOut, blockNo, ts) | _ => () (* Add loop registers to the set if they are not declared here. The only place they are declared is at the entry to the loop so that stops them being propagated further. *) val addToLoops = minus(theirLoops, ourDeclarations) in Array.update(exportArray, blockNo, union(Array.sub(exportArray, blockNo), addToExp)); Array.update(passThroughArray, blockNo, union(Array.sub(passThroughArray, blockNo), addToImp)); Array.update(stackExportArray, blockNo, union(Array.sub(stackExportArray, blockNo), addToStackExp)); Array.update(stackPassThroughArray, blockNo, union(Array.sub(stackPassThroughArray, blockNo), addToStackImp)); Array.update(loopRegArray, blockNo, union(Array.sub(loopRegArray, blockNo), addToLoops)) end val () = List.app addSuccessor successors val ourInputs = union(Array.sub(passThroughArray, blockNo), Array.sub(importArray, blockNo)) val ourStackInputs = union(Array.sub(stackPassThroughArray, blockNo), Array.sub(stackImportArray, blockNo)) in (* Check that we supply the required state. *) case (Array.sub(ccRequiredOut, blockNo), Array.sub(ccOutStates, blockNo)) of (CCNeeded ccReq, CCSet ccSet) => if ccReq = ccSet then () else raise InternalError "Mismatched cc states" | (CCNeeded _, CCIndeterminate) => raise InternalError "Mismatched cc states" | (cc as CCNeeded needOut, CCUnchanged) => ( (* We pass through the state. If we don't use the state then we need to set this as the input. If we do use the state it must be the same. *) case Array.sub(ccInStates, blockNo) of CCUnused => Array.update(ccInStates, blockNo, cc) | CCNeeded needIn => if needOut = needIn then () else raise InternalError "Mismatched cc states" ) | _ => (); (* Was this block used in a loop? If so we should not be requiring a CC. 
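(* Illustrative note, not part of the original source: what addSuccessor does with
   a successor's imports, using made-up register numbers.  If a successor imports
   {3, 7} and ourDeclarations is {3}, then
     IntSet.partition (fn i => member(i, ourDeclarations)) theirImports
   splits this into ({3}, {7}): register 3 was declared here and so becomes one
   of our exports, while register 7 is added to our passThrough set and is in
   turn reported as an input of this block to its predecessors. *)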
*) case Array.sub(loopArray, blockNo) of Looped {regSet, stackSet, ...} => ( case Array.sub(ccInStates, blockNo) of CCNeeded _ => raise InternalError "Looped state needs cc" | _ => (); if setToList regSet = setToList ourInputs andalso setToList stackSet = setToList ourStackInputs then () else reprocess := true ) | _ => (); Array.update(loopArray, blockNo, Processed); { regSet = ourInputs, stackSet = ourStackInputs, ccState = Array.sub(ccInStates, blockNo), loopSet=Array.sub(loopRegArray, blockNo)} end in reprocess := false; processBlocks 0; if !reprocess then reprocessLoop () else () end in val () = reprocessLoop () end (* Third pass - Build the result list with the active registers for each instruction. We don't include registers in the passThrough set since they are active throughout the block. *) local (* Number of instrs for which this is active. We use this to try to select a register to push to the stack if we have too many. Registers that have only a short lifetime are less likely to be pushed than those that are active longer. *) val regActive = Array.array(maxPRegs, 0) fun addActivity n r = Array.update(regActive, r, Array.sub(regActive, r)+n) fun createResultInstrs (passThrough, stackPassThrough) (instr, (tail, activeAfterThis, stackActiveAfterThis)) = let val { sources, dests, sStacks=stackSrcs, dStacks=stackDests, ... } = getInstructionState instr in (* Eliminate instructions if their results are not required. The earlier check for this will remove most cases but if we have duplicated a block we may have a register that is required elsewhere but not in this particular branch. *) if not(List.exists(fn PReg d => member(d, activeAfterThis)) dests) andalso eliminateable instr then (tail, activeAfterThis, stackActiveAfterThis) else let fun regNo(PReg i) = i fun stackNo(StackLoc{rno, ...}) = rno val destRegNos = map regNo dests and sourceRegNos = map regNo sources val destSet = listToSet destRegNos (* Remove any sources that are present in passThrough since they are going to be active throughout the block. *) and sourceSet = minus(listToSet sourceRegNos, passThrough) val stackDestRegNos = map stackNo stackDests and stackSourceRegNos = map stackNo stackSrcs val stackDestSet = listToSet stackDestRegNos and stackSourceSet = minus(listToSet stackSourceRegNos, stackPassThrough) (* To compute the active set for the PREVIOUS instruction (we're processing from the end back to the start) we remove any registers that have been given values in this instruction and add anything that we are using in this instruction since they will now need to have values. *) val afterRemoveDests = minus(activeAfterThis, destSet) val stackAfterRemoveDests = minus(stackActiveAfterThis, stackDestSet) val activeForPrevious = union(sourceSet, afterRemoveDests) val stackActiveForPrevious = union(stackSourceSet, stackAfterRemoveDests) (* The "active" set is the set of registers that need to be active DURING the instruction. It includes destinations, which will usually be in "activeAfterThis", because there may be destinations that are not actually used subsequently but still need a register. *) val activeForInstr = case instr of FunctionCall _ => sourceSet (* Is this still needed? *) | TailRecursiveCall _ => (* Set the active set to the total set of registers we require including the work register. This ensures that we will spill as many registers as we require when we look at the size of the active set. *) union(sourceSet, destSet) | BoxLarge _ => (* We can only store the value in the box after the box is allocated. 
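(* Illustrative note, not part of the original source: the backwards liveness step
   used here, with made-up register numbers.  Working from the last instruction
   towards the first,
     activeForPrevious = union(sourceSet, minus(activeAfterThis, destSet))
   so if the set active after an instruction is {1, 2} and the instruction writes
   register 2 from register 5, the set active before it is {1, 5}: the
   destination stops being live and the source becomes live.  Sources already in
   the pass-through set are left out, since those are live for the whole block. *)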
*) union(activeAfterThis, union(sourceSet, destSet)) | BoxTagFloat _ => (* Since the source must be a V register and the destination an X register there isn't actually a problem here, but do this anyway. *) union(activeAfterThis, union(sourceSet, destSet)) | _ => union(activeAfterThis, destSet) val () = List.app(addActivity 1) (setToList activeForInstr) local (* If we are allocating memory we have to save the current registers if they could contain an address. We mustn't push untagged registers and we mustn't push the destination. *) fun getSaveSet includeReg = let val activeAfter = union(activeAfterThis, passThrough) (* Remove any registers marked - must-not-push. These are registers holding non-address values. They will actually be saved by the RTS across any GC but not checked or modified by the GC. Exclude the result register. *) fun getSave i = if includeReg i then case Vector.sub(pregProps, i) of RegPropGeneral => SOME(PReg i) | RegPropCacheTagged => SOME(PReg i) | RegPropUntagged => NONE | RegPropStack _ => NONE | RegPropCacheUntagged => NONE | RegPropMultiple => raise InternalError "getSave: RegPropMultiple" else NONE in List.mapPartial getSave (setToList activeAfter) end in (* Sometimes we need to modify the instruction e.g. to include the set of registers to save. *) val convertedInstr = case instr of AllocateMemoryFixed{bytesRequired, dest, saveRegs=_} => AllocateMemoryFixed{bytesRequired=bytesRequired, dest=dest, saveRegs=getSaveSet(fn i => i <> regNo dest)} | AllocateMemoryVariable{size, dest, saveRegs=_} => AllocateMemoryVariable{size=size, dest=dest, saveRegs=getSaveSet(fn i => i <> regNo dest)} | BoxLarge{source, dest, saveRegs=_} => BoxLarge{source=source, dest=dest, saveRegs=getSaveSet(fn i => i <> regNo dest)} | BoxTagFloat{source, dest, floatSize, saveRegs=_} => BoxTagFloat{source=source, dest=dest, floatSize=floatSize, saveRegs=getSaveSet(fn i => i <> regNo dest)} | JumpLoop{regArgs, stackArgs, checkInterrupt = SOME _, ...} => let (* If we have to check for interrupts we must preserve registers across the RTS call. *) fun getSave i = case Vector.sub(pregProps, i) of RegPropGeneral => SOME(PReg i) | RegPropCacheTagged => SOME(PReg i) | RegPropUntagged => NONE | RegPropStack _ => NONE | RegPropCacheUntagged => NONE | RegPropMultiple => raise InternalError "getSave: RegPropMultiple" val currentRegs = union(activeAfterThis, passThrough) (* Have to include the loop registers. These were previously included automatically because they were part of the import set. *) val check = List.mapPartial getSave (map (regNo o #dst) regArgs @ setToList currentRegs) in JumpLoop{regArgs=regArgs, stackArgs=stackArgs, checkInterrupt=SOME check} end - | FunctionCall{regArgs, stackArgs=[], dests, callKind as ConstantCode m, + | FunctionCall{regArgs, stackArgs=[], dests, fpRegArgs=[], fpDests=[], callKind as ConstantCode m, saveRegs=_, containers} => (* If this is arbitrary precision push the registers rather than marking them as "save". stringOfWord returns 'CODE "PolyAddArbitrary"' etc. *) if (String.isSubstring "Arbitrary\"" (Address.stringOfWord m)) then let val destRegs = List.map (regNo o #1) dests fun includeInSave i = not(List.exists(fn r => r=i) destRegs) in FunctionCall{regArgs=regArgs, stackArgs=[], callKind=callKind, dests=dests, - containers=containers, saveRegs=getSaveSet includeInSave} + fpRegArgs=[], fpDests=[], containers=containers, saveRegs=getSaveSet includeInSave} end else instr | _ => instr end (* FunctionCall must mark all registers as "push". 
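(* Illustrative note, not part of the original source: which registers getSaveSet
   keeps for a GC point.  Only registers whose property says they hold tagged or
   address values are included:
     RegPropGeneral, RegPropCacheTagged      saved, as SOME(PReg i)
     RegPropUntagged, RegPropCacheUntagged,
     RegPropStack _                          not saved, NONE
   so an untagged length or a stack slot is never handed to the garbage collector
   as if it were an address, and the destination register of the allocation is
   excluded through includeReg. *)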
*) local fun pushRegisters () = let val activeAfter = union(activeAfterThis, passThrough) fun pushAllButDests i = if List.exists(fn j => i=j) destRegNos then () else case Vector.sub(pregProps, i) of RegPropCacheTagged => raise InternalError "pushRegisters: cache reg" | RegPropCacheUntagged => raise InternalError "pushRegisters: cache reg" | _ => Array.update(requirePushOrDiscard, i, true) in (* We need to push everything active after this except the result register. *) List.app pushAllButDests (setToList activeAfter) end in val () = case instr of FunctionCall{ stackArgs=[], callKind=ConstantCode m, ...} => if (String.isSubstring "Arbitrary\"" (Address.stringOfWord m)) then () else pushRegisters () | FunctionCall _ => pushRegisters () (* It should no longer be necessary to push across a handler but there still seem to be cases that need it. *) (*| BeginHandler _ => pushRegisters ()*) | _ => () end (* Which entries are active in this instruction but not afterwards? *) val kill = union(minus(stackSourceSet, stackActiveAfterThis), minus(sourceSet, activeAfterThis)) in ({instr=convertedInstr, active=activeForInstr, current=activeAfterThis, kill=kill} :: tail, activeForPrevious, stackActiveForPrevious) end end fun createResult blockNo = let val BasicBlock{ flow, ...} = Vector.sub(blockVector, blockNo) val declSet = Array.sub(declarationArray, blockNo) and importSet = Array.sub(importArray, blockNo) and passSet = Array.sub(passThroughArray, blockNo) and loopSet = Array.sub(loopRegArray, blockNo) and exportSet = Array.sub(exportArray, blockNo) and stackPassSet = Array.sub(stackPassThroughArray, blockNo) and stackImportSet = Array.sub(stackImportArray, blockNo) and stackExportSet = Array.sub(stackExportArray, blockNo) val filteredCode = getOpt(Array.sub(resultCode, blockNo), []) (* At the end of the block we should have the exports active. *) val (resultInstrs, _, _) = List.foldr (createResultInstrs (passSet, stackPassSet)) ([], exportSet, stackExportSet) filteredCode (* Set the active count for the pass through. *) val instrCount = List.length filteredCode val () = List.app(addActivity instrCount) (setToList passSet) val inCCState = case Array.sub(ccInStates, blockNo) of CCNeeded s => SOME s | CCUnused => NONE val outCCState = case Array.sub(ccRequiredOut, blockNo) of CCNeeded s => SOME s | CCUnused => NONE in ExtendedBasicBlock { block = resultInstrs, flow=flow, locals = minus(declSet, exportSet), imports = importSet, exports = exportSet, passThrough = passSet, loopRegs = loopSet, initialStacks = union(stackPassSet, stackImportSet), inCCState = inCCState, outCCState = outCCState } end in val resultBlocks = Vector.tabulate(vectorLength, createResult) val regActive = regActive end val registerState: regState vector = Vector.tabulate(maxPRegs, fn i => { active = Array.sub(regActive, i), refs = Array.sub(regRefs, i), pushState = Array.sub(requirePushOrDiscard, i), prop = Vector.sub(pregProps, i) } ) in (resultBlocks, registerState) end (* Exported function. First filter out unreferenced blocks then process the registers themselves. *) fun identifyRegisters(blockVector, pregProps) = let val vectorLength = Vector.length blockVector val mapArray = Array.array(vectorLength, NONE) and resArray = Array.array(vectorLength, NONE) val count = ref 0 fun setReferences label = case Array.sub(mapArray, label) of NONE => (* Not yet visited *) let val BasicBlock{flow, block} = Vector.sub(blockVector, label) (* Create a new entry for it. *) val newLabel = ! count before count := !count + 1 (* Add it to the map. 
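(* Illustrative note, not part of the original source: how unreferenced blocks are
   dropped, with made-up labels.  Starting from label 0, setReferences allocates
   new labels in visit order, so for blocks 0..3 where block 2 is never referenced
   the map might become 0 to 0, 1 to 1 and 3 to 2, block 2 gets no entry, and the
   vector rebuilt from resArray contains only the three reachable blocks with
   their flow fields rewritten to use the new labels. *)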
Any other references will use this without reprocessing. *) val () = Array.update(mapArray, label, SOME newLabel) val newFlow = case flow of Unconditional l => Unconditional(setReferences l) | Conditional{trueJump, falseJump, ccRef, condition} => Conditional{trueJump=setReferences trueJump, falseJump=setReferences falseJump, ccRef=ccRef, condition=condition} | ExitCode => ExitCode | IndexedBr list => IndexedBr(map setReferences list) | SetHandler{handler, continue} => SetHandler{handler=setReferences handler, continue=setReferences continue} | UnconditionalHandle l => UnconditionalHandle(setReferences l) | ConditionalHandle{handler, continue} => ConditionalHandle{handler=setReferences handler, continue=setReferences continue} val () = Array.update(resArray, newLabel, SOME(BasicBlock{flow=newFlow, block=block})) in newLabel end | SOME lab => lab val _ = setReferences 0 val newBlockVector = Vector.tabulate(!count, fn i => valOf(Array.sub(resArray, i))) in identifyRegs(newBlockVector, pregProps) end (* Exported for use in GetConflictSets *) fun getInstructionRegisters instr = let val {sources, dests, ...} = getInstructionState instr in {sources=sources, dests=dests} end (* Exported for use in ICodeOptimise *) val getInstructionCC = #ccOut o getInstructionState structure Sharing = struct type ('genReg, 'optGenReg, 'fpReg) arm64ICode = ('genReg, 'optGenReg, 'fpReg) arm64ICode and preg = preg and pregOrZero = pregOrZero and intSet = intSet and ('genReg, 'optGenReg, 'fpReg) basicBlock = ('genReg, 'optGenReg, 'fpReg) basicBlock and extendedBasicBlock = extendedBasicBlock and controlFlow = controlFlow and regProperty = regProperty and ccRef = ccRef and outCCState = outCCState end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PreAssembly.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PreAssembly.ML index ae3d5acc..b152971c 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PreAssembly.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PreAssembly.ML @@ -1,1101 +1,1122 @@ (* Copyright (c) 2021-2 David C. J. Matthews This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public Licence version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public Licence for more details. You should have received a copy of the GNU Lesser General Public Licence along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) (* The pre-assembly layer goes below the icode and allows peep-hole optimisation. *) functor Arm64PreAssembly( structure Arm64Assembly: ARM64ASSEMBLY structure Debug: DEBUG structure Pretty: PRETTY ): ARM64PREASSEMBLY = struct open Arm64Assembly exception InternalError = Misc.InternalError (* Reversed cons and append to make the code easier to read. 
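(* Illustrative note, not part of the original source: with the definitions just
   below, code accumulated in reverse reads in source order, e.g.
     [i2, i1] <::> i3        evaluates to  [i3, i2, i1]
     [i2, i1] <@> [i3, i4]   evaluates to  [i3, i4, i2, i1]
   The instruction list is therefore built most-recent-first and reversed once
   when complete, as with the List.rev ops call in the icode-to-code pass above,
   before being handed to the assembler. *)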
*) infix 5 <::> <@> fun tl <::> hd = hd :: tl and snd <@> fst = fst @ snd (* Many of the datatypes are inherited from Arm64Assembly *) datatype loadType = Load64 | Load32 | Load16 | Load8 and opSize = OpSize32 | OpSize64 and logicalOp = LogAnd | LogOr | LogXor and floatSize = Float32 | Double64 and shiftDirection = ShiftLeft | ShiftRightLogical | ShiftRightArithmetic and multKind = MultAdd32 | MultSub32 | MultAdd64 | MultSub64 | SignedMultAddLong (* 32bit*32bit + 64bit => 64Bit *) | SignedMultHigh (* High order part of 64bit*64Bit *) - and fpUnary = NegFloat | NegDouble | AbsFloat | AbsDouble | ConvFloatToDble | ConvDbleToFloat + and fpUnary = + NegFloat | NegDouble | AbsFloat | AbsDouble | ConvFloatToDble | + ConvDbleToFloat | MoveDouble | MoveFloat and fpBinary = MultiplyFP | DivideFP | AddFP | SubtractFP and unscaledType = NoUpdate | PreIndex | PostIndex and condSet = CondSet | CondSetIncr | CondSetInvert | CondSetNegate and bitfieldKind = BFUnsigned | BFSigned | BFInsert and brRegType = BRRBranch | BRRAndLink | BRRReturn (* Some of the atomic operations added in 8.1 *) and atomicOp = LoadAddAL | LoadUmaxAL | SwapAL | LoadAddAcquire | LoadUMaxAcquire | SwapRelease datatype label = Label of int type labelMaker = int ref fun createLabelMaker() = ref 0 fun createLabel(r as ref n) = Label n before r := n+1 datatype precode = (* Basic instructions *) AddImmediate of {regN: xReg, regD: xReg, immed: word, shifted: bool, opSize: opSize, setFlags: bool} | SubImmediate of {regN: xReg, regD: xReg, immed: word, shifted: bool, opSize: opSize, setFlags: bool} | AddShiftedReg of {regM: xReg, regN: xReg, regD: xReg, shift: shiftType, opSize: opSize, setFlags: bool} | SubShiftedReg of {regM: xReg, regN: xReg, regD: xReg, shift: shiftType, opSize: opSize, setFlags: bool} | AddExtendedReg of {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend, opSize: opSize, setFlags: bool} | SubExtendedReg of {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend, opSize: opSize, setFlags: bool} | MultiplyAndAddSub of {regM: xReg, regN: xReg, regA: xReg, regD: xReg, multKind: multKind} | DivideRegs of {regM: xReg, regN: xReg, regD: xReg, isSigned: bool, opSize: opSize} | LogicalShiftedReg of {regM: xReg, regN: xReg, regD: xReg, shift: shiftType, logOp: logicalOp, opSize: opSize, setFlags: bool} | LoadRegScaled of {regT: xReg, regN: xReg, unitOffset: int, loadType: loadType} | LoadFPRegScaled of {regT: vReg, regN: xReg, unitOffset: int, floatSize: floatSize} | StoreRegScaled of {regT: xReg, regN: xReg, unitOffset: int, loadType: loadType} | StoreFPRegScaled of {regT: vReg, regN: xReg, unitOffset: int, floatSize: floatSize} | LoadRegUnscaled of {regT: xReg, regN: xReg, byteOffset: int, loadType: loadType, unscaledType: unscaledType} | StoreRegUnscaled of {regT: xReg, regN: xReg, byteOffset: int, loadType: loadType, unscaledType: unscaledType} | LoadFPRegUnscaled of {regT: vReg, regN: xReg, byteOffset: int, floatSize: floatSize, unscaledType: unscaledType} | StoreFPRegUnscaled of {regT: vReg, regN: xReg, byteOffset: int, floatSize: floatSize, unscaledType: unscaledType} | LoadRegIndexed of {regT: xReg, regN: xReg, regM: xReg, loadType: loadType, option: scale extend} | StoreRegIndexed of {regT: xReg, regN: xReg, regM: xReg, loadType: loadType, option: scale extend} | LoadFPRegIndexed of {regT: vReg, regN: xReg, regM: xReg, floatSize: floatSize, option: scale extend} | StoreFPRegIndexed of {regT: vReg, regN: xReg, regM: xReg, floatSize: floatSize, option: scale extend} (* LoadAcquire and StoreRelease are 
used for mutables. *) | LoadAcquireReg of {regN: xReg, regT: xReg, loadType: loadType} | StoreReleaseReg of {regN: xReg, regT: xReg, loadType: loadType} (* LoadAcquireExclusiveRegister and StoreReleaseExclusiveRegister are used for mutexes. *) | LoadAcquireExclusiveRegister of {regN: xReg, regT: xReg} | StoreReleaseExclusiveRegister of {regS: xReg, regT: xReg, regN: xReg} | MemBarrier (* Additional atomic operations. *) | AtomicExtension of { regT: xReg, regN: xReg, regS: xReg, atOp: atomicOp } | LoadRegPair of { regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int, loadType: loadType, unscaledType: unscaledType} | StoreRegPair of { regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int, loadType: loadType, unscaledType: unscaledType} | LoadFPRegPair of { regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int, floatSize: floatSize, unscaledType: unscaledType} | StoreFPRegPair of { regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int, floatSize: floatSize, unscaledType: unscaledType} | ConditionalSet of {regD: xReg, regTrue: xReg, regFalse: xReg, cond: condition, condSet: condSet, opSize: opSize} | BitField of {immr: word, imms: word, regN: xReg, regD: xReg, opSize: opSize, bitfieldKind: bitfieldKind} | ShiftRegisterVariable of {regM: xReg, regN: xReg, regD: xReg, opSize: opSize, shiftDirection: shiftDirection} | BitwiseLogical of { bits: Word64.word, regN: xReg, regD: xReg, opSize: opSize, setFlags: bool, logOp: logicalOp} (* Floating point *) | MoveGeneralToFP of { regN: xReg, regD: vReg, floatSize: floatSize} | MoveFPToGeneral of {regN: vReg, regD: xReg, floatSize: floatSize} | CvtIntToFP of { regN: xReg, regD: vReg, floatSize: floatSize, opSize: opSize} | CvtFloatToInt of { round: IEEEReal.rounding_mode, regN: vReg, regD: xReg, floatSize: floatSize, opSize: opSize} | FPBinaryOp of { regM: vReg, regN: vReg, regD: vReg, floatSize: floatSize, fpOp: fpBinary} | FPComparison of { regM: vReg, regN: vReg, floatSize: floatSize} | FPUnaryOp of {regN: vReg, regD: vReg, fpOp: fpUnary} (* Branches and Labels. *) | SetLabel of label | ConditionalBranch of condition * label | UnconditionalBranch of label | BranchAndLink of label | BranchReg of {regD: xReg, brRegType: brRegType } | LoadLabelAddress of xReg * label | TestBitBranch of { test: xReg, bit: Word8.word, label: label, onZero: bool } | CompareBranch of { test: xReg, label: label, onZero: bool, opSize: opSize } (* Composite instructions *) | MoveXRegToXReg of {sReg: xReg, dReg: xReg} | LoadNonAddr of xReg * Word64.word | LoadFPConst of {dest: vReg, value: Word64.word, floatSize: floatSize, work: xReg} | LoadAddr of xReg * machineWord | RTSTrap of { rtsEntry: int, work: xReg, save: xReg list } | AllocateMemoryFixedSize of { bytes: word, dest: xReg, save: xReg list, work: xReg } | AllocateMemoryVariableSize of { sizeReg: xReg, dest: xReg, save: xReg list, work: xReg } (* Branch table for indexed case. startLabel is the address of the first label in the list. The branch table is a sequence of unconditional branches. *) | BranchTable of { startLabel: label, brTable: label list } | LoadGlobalHeapBaseInCallback of xReg | Yield + (* Test whether a value is a power of two and return the shift if it is. *) + fun powerTwo v = + if LargeWord.andb(v, ~ v) = v + then + let + fun shift(n: Word8.word, 0w1) = n + | shift(n, v) = shift(n+0w1, LargeWord.>>(v, 0w1)) + in + SOME(shift(0w0, v)) + end + else NONE + + (* Optimise the pre-assembler code and then generate the final code. 
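(* Illustrative note, not part of the original source: powerTwo above relies on
   the identity that, for a non-zero word, andb(v, ~v) isolates the lowest set
   bit, so it equals v exactly when v has a single bit set.  For example
     powerTwo 0w8 = SOME 0w3     powerTwo 0w1 = SOME 0w0     powerTwo 0w6 = NONE
   The returned shift amount is presumably intended for peephole rewriting of a
   multiplication or division by a power of two into a shift. *)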
*) fun generateFinalCode {instrs, name, parameters, resultClosure, profileObject, labelMaker=ref labelCount} = let val labelTargets = Array.tabulate(labelCount, fn i => (Arm64Assembly.createLabel(), i) ) (* Follow the chain of forwarded labels. *) local fun forwardLab(labelNo, labels) = let val dest as (_, dNo) = Array.sub(labelTargets, labelNo) in if dNo = labelNo then dest (* This should not happen but just in case... *) else if List.exists(fn i => i = dNo) labels then raise InternalError "Infinite loop" else forwardLab(dNo, dNo::labels) end in fun getLabel labelNo = forwardLab(labelNo, [labelNo]) val getLabelTarget = #1 o getLabel end fun toAssembler([], code) = code | toAssembler(AddImmediate{regN, regD, immed, shifted, opSize, setFlags} :: rest, code) = let val instr = case (opSize, setFlags) of (OpSize64, false) => addImmediate | (OpSize32, false) => addImmediate32 | (OpSize64, true) => addSImmediate | (OpSize32, true) => addSImmediate32 in toAssembler(rest, code <::> instr{regN=regN, regD=regD, immed=immed, shifted=shifted}) end | toAssembler(SubImmediate{regN, regD, immed, shifted, opSize, setFlags} :: rest, code) = let val instr = case (opSize, setFlags) of (OpSize64, false) => subImmediate | (OpSize32, false) => subImmediate32 | (OpSize64, true) => subSImmediate | (OpSize32, true) => subSImmediate32 in toAssembler(rest, code <::> instr{regN=regN, regD=regD, immed=immed, shifted=shifted}) end | toAssembler(AddShiftedReg{regM, regN, regD, shift, opSize, setFlags} :: rest, code) = let val instr = case (opSize, setFlags) of (OpSize64, false) => addShiftedReg | (OpSize32, false) => addShiftedReg32 | (OpSize64, true) => addSShiftedReg | (OpSize32, true) => addSShiftedReg32 in toAssembler(rest, code <::> instr{regM=regM, regN=regN, regD=regD, shift=shift}) end | toAssembler(SubShiftedReg{regM, regN, regD, shift, opSize, setFlags} :: rest, code) = let val instr = case (opSize, setFlags) of (OpSize64, false) => subShiftedReg | (OpSize32, false) => subShiftedReg32 | (OpSize64, true) => subSShiftedReg | (OpSize32, true) => subSShiftedReg32 in toAssembler(rest, code <::> instr{regM=regM, regN=regN, regD=regD, shift=shift}) end | toAssembler(AddExtendedReg{regM, regN, regD, extend, opSize, setFlags} :: rest, code) = (* Add/SubExtended are only used to access XSP. 
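(* Illustrative note, not part of the original source: the label-forwarding chain
   above, with made-up label numbers.  labelTargets initially maps every
   pre-assembly label i to a pair of a fresh assembler label and i itself.  If a
   later step, presumably the peephole pass, redirects label 3 to label 5, the
   entry for 3 carries 5 as its destination; getLabel 3 then follows 3 to 5 and,
   because the entry for 5 points to itself, returns the assembler label for 5.
   The visited list guards against an accidental cycle. *)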
*) let val instr = case (opSize, setFlags) of (OpSize64, false) => addExtendedReg | (OpSize32, false) => raise InternalError "AddExtendedReg; 32" | (OpSize64, true) => addSExtendedReg | (OpSize32, true) => raise InternalError "AddExtendedReg; 32" in toAssembler(rest, code <::> instr{regM=regM, regN=regN, regD=regD, extend=extend}) end | toAssembler(SubExtendedReg{regM, regN, regD, extend, opSize, setFlags} :: rest, code) = let val instr = case (opSize, setFlags) of (OpSize64, false) => subExtendedReg | (OpSize32, false) => raise InternalError "AddExtendedReg; 32" | (OpSize64, true) => subSExtendedReg | (OpSize32, true) => raise InternalError "AddExtendedReg; 32" in toAssembler(rest, code <::> instr{regM=regM, regN=regN, regD=regD, extend=extend}) end | toAssembler(MultiplyAndAddSub{regM, regN, regA, regD, multKind} :: rest, code) = let val instr = case multKind of MultAdd32 => multiplyAndAdd32{regM=regM, regN=regN, regA=regA, regD=regD} | MultSub32 => multiplyAndSub32{regM=regM, regN=regN, regA=regA, regD=regD} | MultAdd64 => multiplyAndAdd{regM=regM, regN=regN, regA=regA, regD=regD} | MultSub64 => multiplyAndSub{regM=regM, regN=regN, regA=regA, regD=regD} | SignedMultAddLong => signedMultiplyAndAddLong{regM=regM, regN=regN, regA=regA, regD=regD} | SignedMultHigh => signedMultiplyHigh{regM=regM, regN=regN, regD=regD} in toAssembler(rest, code <::> instr) end | toAssembler(DivideRegs{regM, regN, regD, isSigned, opSize} :: rest, code) = let val instr = case (isSigned, opSize) of (true, OpSize64) => signedDivide | (true, OpSize32) => signedDivide32 | (false, OpSize64) => unsignedDivide | (false, OpSize32) => unsignedDivide32 in toAssembler(rest, code <::> instr{regN=regN, regM=regM, regD=regD}) end | toAssembler(LogicalShiftedReg{regM, regN, regD, shift, logOp, opSize, setFlags} :: rest, code) = let val instr = case (logOp, setFlags, opSize) of (LogAnd, false, OpSize64) => andShiftedReg | (LogAnd, true, OpSize64) => andsShiftedReg | (LogOr, false, OpSize64) => orrShiftedReg | (LogXor, false, OpSize64) => eorShiftedReg | (LogAnd, false, OpSize32) => andShiftedReg32 | (LogAnd, true, OpSize32) => andsShiftedReg32 | (LogOr, false, OpSize32) => orrShiftedReg32 | (LogXor, false, OpSize32) => eorShiftedReg32 | _ => raise InternalError "setFlags not valid with OR or XOR" (* There are also versions of AND/OR/XOR which operate on a complement (NOT) of the shifted register. It's probably not worth looking for a use for them. 
*) in toAssembler(rest, code <::> instr{regN=regN, regM=regM, regD=regD, shift=shift}) end | toAssembler(LoadRegScaled{regT, regN, unitOffset, loadType} :: rest, code) = let val instr = case loadType of Load64 => loadRegScaled | Load32 => loadRegScaled32 | Load16 => loadRegScaled16 | Load8 => loadRegScaledByte in toAssembler(rest, code <::> instr{regT=regT, regN=regN, unitOffset=unitOffset}) end | toAssembler(StoreRegScaled{regT, regN, unitOffset, loadType} :: rest, code) = let val instr = case loadType of Load64 => storeRegScaled | Load32 => storeRegScaled32 | Load16 => storeRegScaled16 | Load8 => storeRegScaledByte in toAssembler(rest, code <::> instr{regT=regT, regN=regN, unitOffset=unitOffset}) end | toAssembler(LoadFPRegScaled{regT, regN, unitOffset, floatSize} :: rest, code) = let val instr = case floatSize of Float32 => loadRegScaledFloat | Double64 => loadRegScaledDouble in toAssembler(rest, code <::> instr{regT=regT, regN=regN, unitOffset=unitOffset}) end | toAssembler(StoreFPRegScaled{regT, regN, unitOffset, floatSize} :: rest, code) = let val instr = case floatSize of Float32 => storeRegScaledFloat | Double64 => storeRegScaledDouble in toAssembler(rest, code <::> instr{regT=regT, regN=regN, unitOffset=unitOffset}) end | toAssembler(LoadRegUnscaled{regT, regN, byteOffset, loadType, unscaledType} :: rest, code) = let val instr = case (loadType, unscaledType) of (Load64, NoUpdate) => loadRegUnscaled | (Load32, NoUpdate) => loadRegUnscaled32 | (Load16, NoUpdate) => loadRegUnscaled16 | (Load8, NoUpdate) => loadRegUnscaledByte | (Load64, PreIndex) => loadRegPreIndex | (Load32, PreIndex) => loadRegPreIndex32 | (Load16, PreIndex) => raise InternalError "loadRegPreIndex16" | (Load8, PreIndex) => loadRegPreIndexByte | (Load64, PostIndex) => loadRegPostIndex | (Load32, PostIndex) => loadRegPostIndex32 | (Load16, PostIndex) => raise InternalError "loadRegPostIndex16" | (Load8, PostIndex) => loadRegPostIndexByte in toAssembler(rest, code <::> instr{regT=regT, regN=regN, byteOffset=byteOffset}) end | toAssembler(LoadFPRegUnscaled{regT, regN, byteOffset, floatSize, unscaledType} :: rest, code) = let val instr = case (floatSize, unscaledType) of (Float32, NoUpdate) => loadRegUnscaledFloat | (Double64, NoUpdate) => loadRegUnscaledDouble | _ => raise InternalError "LoadFPRegUnscaled: pre/post indexed" in toAssembler(rest, code <::> instr{regT=regT, regN=regN, byteOffset=byteOffset}) end | toAssembler(StoreRegUnscaled{regT, regN, byteOffset, loadType, unscaledType} :: rest, code) = let val instr = case (loadType, unscaledType) of (Load64, NoUpdate) => storeRegUnscaled | (Load32, NoUpdate) => storeRegUnscaled32 | (Load16, NoUpdate) => storeRegUnscaled16 | (Load8, NoUpdate) => storeRegUnscaledByte | (Load64, PreIndex) => storeRegPreIndex | (Load32, PreIndex) => storeRegPreIndex32 | (Load16, PreIndex) => raise InternalError "storeRegPreIndex16" | (Load8, PreIndex) => storeRegPreIndexByte | (Load64, PostIndex) => storeRegPostIndex | (Load32, PostIndex) => storeRegPostIndex32 | (Load16, PostIndex) => raise InternalError "storeRegPostIndex16" | (Load8, PostIndex) => storeRegPostIndexByte in toAssembler(rest, code <::> instr{regT=regT, regN=regN, byteOffset=byteOffset}) end | toAssembler(StoreFPRegUnscaled{regT, regN, byteOffset, floatSize, unscaledType} :: rest, code) = let val instr = case (floatSize, unscaledType) of (Float32, NoUpdate) => storeRegUnscaledFloat | (Double64, NoUpdate) => storeRegUnscaledDouble | _ => raise InternalError "StoreFPRegUnscaled: pre/post indexed" in toAssembler(rest, code 
<::> instr{regT=regT, regN=regN, byteOffset=byteOffset}) end | toAssembler(LoadRegIndexed{regT, regN, regM, loadType, option} :: rest, code) = let val instr = case loadType of Load64 => loadRegIndexed | Load32 => loadRegIndexed32 | Load16 => loadRegIndexed16 | Load8 => loadRegIndexedByte in toAssembler(rest, code <::> instr{regT=regT, regN=regN, regM=regM, option=option}) end | toAssembler(StoreRegIndexed{regT, regN, regM, loadType, option} :: rest, code) = let val instr = case loadType of Load64 => storeRegIndexed | Load32 => storeRegIndexed32 | Load16 => storeRegIndexed16 | Load8 => storeRegIndexedByte in toAssembler(rest, code <::> instr{regT=regT, regN=regN, regM=regM, option=option}) end | toAssembler(LoadFPRegIndexed{regT, regN, regM, floatSize, option} :: rest, code) = let val instr = case floatSize of Float32 => loadRegIndexedFloat | Double64 => loadRegIndexedDouble in toAssembler(rest, code <::> instr{regT=regT, regN=regN, regM=regM, option=option}) end | toAssembler(StoreFPRegIndexed{regT, regN, regM, floatSize, option} :: rest, code) = let val instr = case floatSize of Float32 => storeRegIndexedFloat | Double64 => storeRegIndexedDouble in toAssembler(rest, code <::> instr{regT=regT, regN=regN, regM=regM, option=option}) end | toAssembler(LoadAcquireReg{regN, regT, loadType} :: rest, code) = let val loadInstr = case loadType of Load64 => loadAcquire | Load32 => loadAcquire32 | Load8 => loadAcquireByte | _ => raise InternalError "LoadAcquire: Unsupported size" (* Not used *) in toAssembler(rest, code <::> loadInstr{regT=regT, regN=regN}) end | toAssembler(StoreReleaseReg{regN, regT, loadType} :: rest, code) = let val storeInstr = case loadType of Load64 => storeRelease | Load32 => storeRelease32 | Load8 => storeReleaseByte | _ => raise InternalError "StoreRelease: Unsupported size" (* Not used *) in toAssembler(rest, code <::> storeInstr{regT=regT, regN=regN}) end | toAssembler(LoadAcquireExclusiveRegister{regN, regT} :: rest, code) = toAssembler(rest, code <::> loadAcquireExclusiveRegister{regN=regN, regT=regT}) | toAssembler(StoreReleaseExclusiveRegister{regN, regT, regS} :: rest, code) = toAssembler(rest, code <::> storeReleaseExclusiveRegister{regN=regN, regT=regT, regS=regS}) | toAssembler(MemBarrier :: rest, code) = toAssembler(rest, code <::> dmbIsh) | toAssembler(AtomicExtension{ regT, regN, regS, atOp} :: rest, code) = let val instr = case atOp of LoadAddAL => loadAddAL | LoadUmaxAL => loadUMaxAL | SwapAL => swapAL | LoadAddAcquire => loadAddA | LoadUMaxAcquire => loadUMaxA | SwapRelease => swapL in toAssembler(rest, code <::> instr{regT=regT, regN=regN, regS=regS}) end | toAssembler(LoadRegPair{ regT1, regT2, regN, unitOffset, loadType, unscaledType} :: rest, code) = let val _ = regT1 <> regT2 orelse raise InternalError "LoadRegPair: same register" val instr = case (loadType, unscaledType) of (Load64, NoUpdate) => loadPairOffset | (Load64, PreIndex) => loadPairPreIndexed | (Load64, PostIndex) => loadPairPostIndexed | (Load32, NoUpdate) => loadPairOffset32 | (Load32, PreIndex) => loadPairPreIndexed32 | (Load32, PostIndex) => loadPairPostIndexed32 | _ => raise InternalError "LoadRegPair: unimplemented" in toAssembler(rest, code <::> instr{regT1=regT1, regT2=regT2, regN=regN, unitOffset=unitOffset}) end | toAssembler(StoreRegPair{ regT1, regT2, regN, unitOffset, loadType, unscaledType} :: rest, code) = let val instr = case (loadType, unscaledType) of (Load64, NoUpdate) => storePairOffset | (Load64, PreIndex) => storePairPreIndexed | (Load64, PostIndex) => 
storePairPostIndexed | (Load32, NoUpdate) => storePairOffset32 | (Load32, PreIndex) => storePairPreIndexed32 | (Load32, PostIndex) => storePairPostIndexed32 | _ => raise InternalError "StoreRegPair: unimplemented" in toAssembler(rest, code <::> instr{regT1=regT1, regT2=regT2, regN=regN, unitOffset=unitOffset}) end | toAssembler(LoadFPRegPair{ regT1, regT2, regN, unitOffset, floatSize, unscaledType} :: rest, code) = let val _ = regT1 <> regT2 orelse raise InternalError "LoadRegPair: same register" val instr = case (floatSize, unscaledType) of (Double64, NoUpdate) => loadPairOffsetDouble | (Double64, PreIndex) => loadPairPreIndexedDouble | (Double64, PostIndex) => loadPairPostIndexedDouble | (Float32, NoUpdate) => loadPairOffsetFloat | (Float32, PreIndex) => loadPairPreIndexedFloat | (Float32, PostIndex) => loadPairPostIndexedFloat in toAssembler(rest, code <::> instr{regT1=regT1, regT2=regT2, regN=regN, unitOffset=unitOffset}) end | toAssembler(StoreFPRegPair{ regT1, regT2, regN, unitOffset, floatSize, unscaledType} :: rest, code) = let val instr = case (floatSize, unscaledType) of (Double64, NoUpdate) => storePairOffsetDouble | (Double64, PreIndex) => storePairPreIndexedDouble | (Double64, PostIndex) => storePairPostIndexedDouble | (Float32, NoUpdate) => storePairOffsetFloat | (Float32, PreIndex) => storePairPreIndexedFloat | (Float32, PostIndex) => storePairPostIndexedFloat in toAssembler(rest, code <::> instr{regT1=regT1, regT2=regT2, regN=regN, unitOffset=unitOffset}) end | toAssembler(ConditionalSet{regD, regTrue, regFalse, cond, condSet, opSize} :: rest, code) = let val instr = case (condSet, opSize) of (CondSet, OpSize64) => conditionalSet | (CondSetIncr, OpSize64) => conditionalSetIncrement | (CondSetInvert, OpSize64) => conditionalSetInverted | (CondSetNegate, OpSize64) => conditionalSetNegated | (CondSet, OpSize32) => conditionalSet32 | (CondSetIncr, OpSize32) => conditionalSetIncrement32 | (CondSetInvert, OpSize32) => conditionalSetInverted32 | (CondSetNegate, OpSize32) => conditionalSetNegated32 in toAssembler(rest, code <::> instr{regD=regD, regTrue=regTrue, regFalse=regFalse, cond=cond}) end | toAssembler(BitField{immr, imms, regN, regD, opSize, bitfieldKind} :: rest, code) = let val bfInstr = case (bitfieldKind, opSize) of (BFSigned, OpSize64) => signedBitfieldMove64 | (BFUnsigned, OpSize64) => unsignedBitfieldMove64 | (BFInsert, OpSize64) => bitfieldMove64 | (BFSigned, OpSize32) => signedBitfieldMove32 | (BFUnsigned, OpSize32) => unsignedBitfieldMove32 | (BFInsert, OpSize32) => bitfieldMove32 in toAssembler(rest, code <::> bfInstr{immr=immr, imms=imms, regN=regN, regD=regD}) end | toAssembler(ShiftRegisterVariable{regM, regN, regD, opSize, shiftDirection} :: rest, code) = let val instr = case (shiftDirection, opSize) of (ShiftLeft, OpSize64) => logicalShiftLeftVariable | (ShiftLeft, OpSize32) => logicalShiftLeftVariable32 | (ShiftRightLogical, OpSize64) => logicalShiftRightVariable | (ShiftRightLogical, OpSize32) => logicalShiftRightVariable32 | (ShiftRightArithmetic, OpSize64) => arithmeticShiftRightVariable | (ShiftRightArithmetic, OpSize32) => arithmeticShiftRightVariable32 in toAssembler(rest, code <::> instr{regN=regN, regM=regM, regD=regD}) end | toAssembler(BitwiseLogical{ bits, regN, regD, opSize, setFlags, logOp} :: rest, code) = let val instr = case (logOp, setFlags, opSize) of (LogAnd, false, OpSize64) => bitwiseAndImmediate | (LogAnd, true, OpSize64) => bitwiseAndSImmediate | (LogOr, false, OpSize64) => bitwiseOrImmediate | (LogXor, false, OpSize64) => 
bitwiseXorImmediate | (LogAnd, false, OpSize32) => bitwiseAndImmediate32 | (LogAnd, true, OpSize32) => bitwiseAndSImmediate32 | (LogOr, false, OpSize32) => bitwiseOrImmediate32 | (LogXor, false, OpSize32) => bitwiseXorImmediate32 | _ => raise InternalError "flags not valid with OR or XOR" in toAssembler(rest, code <::> instr{regN=regN, regD=regD, bits=bits}) end | toAssembler(MoveGeneralToFP{ regN, regD, floatSize=Float32} :: rest, code) = toAssembler(rest, code <::> moveGeneralToFloat{regN=regN, regD=regD}) | toAssembler(MoveGeneralToFP{ regN, regD, floatSize=Double64} :: rest, code) = toAssembler(rest, code <::> moveGeneralToDouble{regN=regN, regD=regD}) | toAssembler(MoveFPToGeneral{ regN, regD, floatSize=Float32} :: rest, code) = toAssembler(rest, code <::> moveFloatToGeneral{regN=regN, regD=regD}) | toAssembler(MoveFPToGeneral{ regN, regD, floatSize=Double64} :: rest, code) = toAssembler(rest, code <::> moveDoubleToGeneral{regN=regN, regD=regD}) | toAssembler(CvtIntToFP{ regN, regD, floatSize, opSize} :: rest, code) = let val instr = case (opSize, floatSize) of (OpSize32, Float32) => convertInt32ToFloat | (OpSize64, Float32) => convertIntToFloat | (OpSize32, Double64) => convertInt32ToDouble | (OpSize64, Double64) => convertIntToDouble in toAssembler(rest, code <::> instr{regN=regN, regD=regD}) end | toAssembler(CvtFloatToInt{ round, regN, regD, floatSize, opSize} :: rest, code) = let val instr = case (floatSize, opSize) of (Float32, OpSize32) => convertFloatToInt32 | (Float32, OpSize64) => convertFloatToInt | (Double64, OpSize32) => convertDoubleToInt32 | (Double64, OpSize64) => convertDoubleToInt in toAssembler(rest, code <::> instr round {regN=regN, regD=regD}) end | toAssembler(FPBinaryOp{ regM, regN, regD, floatSize, fpOp} :: rest, code) = let val instr = case (fpOp, floatSize) of (MultiplyFP, Float32) => multiplyFloat | (DivideFP, Float32) => divideFloat | (AddFP, Float32) => addFloat | (SubtractFP, Float32) => subtractFloat | (MultiplyFP, Double64) => multiplyDouble | (DivideFP, Double64) => divideDouble | (AddFP, Double64) => addDouble | (SubtractFP, Double64) => subtractDouble in toAssembler(rest, code <::> instr {regN=regN, regM=regM, regD=regD}) end | toAssembler(FPComparison{ regM, regN, floatSize} :: rest, code) = toAssembler(rest, code <::> (case floatSize of Float32 => compareFloat | Double64 => compareDouble){regN=regN, regM=regM}) | toAssembler(FPUnaryOp{ regN, regD, fpOp} :: rest, code) = let val instr = case fpOp of NegFloat => negFloat | NegDouble => negDouble | AbsFloat => absFloat | AbsDouble => absDouble | ConvFloatToDble => convertFloatToDouble | ConvDbleToFloat => convertDoubleToFloat + | MoveDouble => moveDoubleToDouble + | MoveFloat => moveFloatToFloat in toAssembler(rest, code <::> instr {regN=regN, regD=regD}) end | toAssembler(SetLabel(Label lab) :: rest, code) = toAssembler(rest, code <::> setLabel(getLabelTarget lab)) | toAssembler(ConditionalBranch(cond, Label lab) :: rest, code) = toAssembler(rest, code <::> conditionalBranch(cond, getLabelTarget lab)) | toAssembler(UnconditionalBranch(Label lab) :: rest, code) = toAssembler(rest, code <::> unconditionalBranch(getLabelTarget lab)) | toAssembler(BranchAndLink(Label lab) :: rest, code) = toAssembler(rest, code <::> branchAndLink(getLabelTarget lab)) | toAssembler(BranchReg{regD, brRegType=BRRBranch} :: rest, code) = toAssembler(rest, code <::> branchRegister regD) | toAssembler(BranchReg{regD, brRegType=BRRAndLink} :: rest, code) = toAssembler(rest, code <::> branchAndLinkReg regD) | 
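(* The BranchReg cases: BRRBranch, BRRAndLink and BRRReturn correspond to the ARM64 register-branch instructions br, blr and ret respectively. *)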
toAssembler(BranchReg{regD, brRegType=BRRReturn} :: rest, code) = toAssembler(rest, code <::> returnRegister regD) | toAssembler(LoadLabelAddress(reg, Label lab) :: rest, code) = toAssembler(rest, code <::> loadLabelAddress(reg, getLabelTarget lab)) | toAssembler(TestBitBranch{ test, bit, label=Label lab, onZero } :: rest, code) = toAssembler(rest, code <::> (if onZero then testBitBranchZero else testBitBranchNonZero)(test, bit, getLabelTarget lab)) | toAssembler(CompareBranch{ test, label=Label lab, onZero, opSize } :: rest, code) = let val instr = case (onZero, opSize) of (true, OpSize64) => compareBranchZero | (false, OpSize64) => compareBranchNonZero | (true, OpSize32) => compareBranchZero32 | (false, OpSize32) => compareBranchNonZero32 in toAssembler(rest, code <::> instr(test, getLabelTarget lab)) end (* Register-register moves - special case for XSP. *) | toAssembler(MoveXRegToXReg{sReg=XSP, dReg} :: rest, code) = toAssembler(rest, code <::> addImmediate{regN=XSP, regD=dReg, immed=0w0, shifted=false}) | toAssembler(MoveXRegToXReg{sReg, dReg=XSP} :: rest, code) = toAssembler(rest, code <::> addImmediate{regN=sReg, regD=XSP, immed=0w0, shifted=false}) | toAssembler(MoveXRegToXReg{sReg, dReg} :: rest, code) = toAssembler(rest, code <::> orrShiftedReg{regN=XZero, regM=sReg, regD=dReg, shift=ShiftNone}) | toAssembler(LoadNonAddr(xReg, value) :: rest, code) = let (* Load a non-address constant. Tries to use movz/movn/movk if that can be done easily, othewise uses loadNonAddressConstant to load the value from the non-address constant area. *) fun extW (v, h) = Word.andb(Word.fromLarge(LargeWord.>>(Word64.toLarge v, h*0w16)), 0wxffff) val hw0 = extW(value, 0w3) and hw1 = extW(value, 0w2) and hw2 = extW(value, 0w1) and hw3 = extW(value, 0w0) val nextCode = if value < 0wx100000000 then let (* 32-bit constants can be loaded using at most a movz and movk but various cases can be reduced since all 32-bit operations set the top word to zero. *) val hi = hw2 and lo = hw3 in (* 32-bit constants can be loaded with at most a movz and a movk but it may be that there is something shorter. 
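If the high halfword is zero a single movz of the low halfword is enough; if it is 0xffff a single movn of the inverted low halfword works; if the low halfword is zero a movz with a 16-bit shift is used; and a value that is an encodable bit pattern can be loaded with an orr from the zero register.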
*) if hi = 0w0 then code <::> moveZero32{regD=xReg, immediate=lo, shift=0w0} else if hi = 0wxffff then code <::> moveNot32{regD=xReg, immediate=Word.xorb(0wxffff, lo), shift=0w0} else if lo = 0w0 then code <::> moveZero32{regD=xReg, immediate=hi, shift=0w16} else if isEncodableBitPattern(value, WordSize32) then code <::> bitwiseOrImmediate32{bits=value, regN=XZero, regD=xReg} else (* Have to use two instructions *) code <::> moveZero32{regD=xReg, immediate=lo, shift=0w0} <::> moveKeep{regD=xReg, immediate=hi, shift=0w16} end else if isEncodableBitPattern(value, WordSize64) then code <::> bitwiseOrImmediate{bits=value, regN=XZero, regD=xReg} else if hw0 = 0wxffff andalso hw1 = 0wxffff andalso hw2 = 0wxffff then code <::> moveNot{regD=xReg, immediate=Word.xorb(0wxffff, hw3), shift=0w0} else if hw1 = 0w0 andalso hw2 = 0w0 then (* This is common for length words with a flags byte *) code <::> moveZero32{regD=xReg, immediate=hw3, shift=0w0} <::> moveKeep{regD=xReg, immediate=hw0, shift=0w48} else code <::> loadNonAddressConstant(xReg, value) in toAssembler(rest, nextCode) end | toAssembler(LoadFPConst{dest, value, floatSize=Float32, work} :: rest, code) = toAssembler(rest, loadFloatConstant(dest, value, work)::code) | toAssembler(LoadFPConst{dest, value, floatSize=Double64, work} :: rest, code) = toAssembler(rest, loadDoubleConstant(dest, value, work)::code) | toAssembler(LoadAddr(dReg, source) :: rest, code) = toAssembler(rest, loadAddressConstant(dReg, source) :: code) | toAssembler(RTSTrap{ rtsEntry, work, save } :: rest, code) = let (* Because X30 is used in the branchAndLink it has to be pushed across any trap. *) val saveX30 = List.exists (fn r => r = X30) save val preserve = List.filter (fn r => r <> X30) save in toAssembler(rest, code <@> (if saveX30 then [storeRegPreIndex{regT=X30, regN=X_MLStackPtr, byteOffset= ~8}] else []) <::> loadRegScaled{regT=work, regN=X_MLAssemblyInt, unitOffset=rtsEntry} <::> branchAndLinkReg work <::> registerMask preserve <@> (if saveX30 then [loadRegPostIndex{regT=X30, regN=X_MLStackPtr, byteOffset= 8}] else []) ) end | toAssembler(AllocateMemoryFixedSize{ bytes, dest, save, work } :: rest, code) = let val label = Arm64Assembly.createLabel() val saveX30 = List.exists (fn r => r = X30) save val preserve = List.filter (fn r => r <> X30) save val allocCode = code <@> (* Subtract the number of bytes required from the heap pointer. *) (if bytes >= 0w4096 then [subShiftedReg{regM=work, regN=X_MLHeapAllocPtr, regD=dest, shift=ShiftNone}, loadNonAddressConstant(work, Word.toLarge bytes)] else [subImmediate{regN=X_MLHeapAllocPtr, regD=dest, immed=bytes, shifted=false}]) <::> (* Compare the result with the heap limit. *) subSShiftedReg{regM=X_MLHeapLimit, regN=dest, regD=XZero, shift=ShiftNone} <::> conditionalBranch(CondCarrySet, label) <@> (if saveX30 then [storeRegPreIndex{regT=X30, regN=X_MLStackPtr, byteOffset= ~8}] else []) <::> loadRegScaled{regT=work, regN=X_MLAssemblyInt, unitOffset=heapOverflowCallOffset} <::> branchAndLinkReg work <::> registerMask preserve <@> (if saveX30 then [loadRegPostIndex{regT=X30, regN=X_MLStackPtr, byteOffset= 8}] else []) <::> setLabel label <::> (* Update the heap pointer. 
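The address of the new cell in dest becomes the new value of X_MLHeapAllocPtr: allocation proceeds downwards from the allocation pointer towards the heap limit.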
*) orrShiftedReg{regN=XZero, regM=dest, regD=X_MLHeapAllocPtr, shift=ShiftNone} in toAssembler(rest, allocCode) end | toAssembler(AllocateMemoryVariableSize{ sizeReg, dest, save, work } :: rest, code) = let val trapLabel = Arm64Assembly.createLabel() and noTrapLabel = Arm64Assembly.createLabel() val saveX30 = List.exists (fn r => r = X30) save val preserve = List.filter (fn r => r <> X30) save val allocCode = ( (* Subtract the size into the result register. Subtract a further word for the length word and round down in 32-in-64. *) if is32in64 then code <::> subShiftedReg{regM=sizeReg, regN=X_MLHeapAllocPtr, regD=dest, shift=ShiftLSL 0w2} <::> subImmediate{regN=dest, regD=dest, immed=0w4, shifted=false} <::> bitwiseAndImmediate{bits= ~ 0w8, regN=dest, regD=dest} else code <::> subShiftedReg{regM=sizeReg, regN=X_MLHeapAllocPtr, regD=dest, shift=ShiftLSL 0w3} <::> subImmediate{regN=dest, regD=dest, immed=0w8, shifted=false} ) <::> (* Check against the limit. If the size is large enough it is possible that this could wrap round. To check for that we trap if either the result is less than the limit or if it is now greater than the allocation pointer. *) subSShiftedReg{regM=X_MLHeapLimit, regN=dest, regD=XZero, shift=ShiftNone} <::> conditionalBranch(CondCarryClear, trapLabel) <::> subSShiftedReg{regM=X_MLHeapAllocPtr, regN=dest, regD=XZero, shift=ShiftNone} <::> conditionalBranch(CondCarryClear, noTrapLabel) <::> setLabel trapLabel <@> (if saveX30 then [storeRegPreIndex{regT=X30, regN=X_MLStackPtr, byteOffset= ~8}] else []) <::> loadRegScaled{regT=work, regN=X_MLAssemblyInt, unitOffset=heapOverflowCallOffset} <::> branchAndLinkReg work <::> registerMask preserve <@> (if saveX30 then [loadRegPostIndex{regT=X30, regN=X_MLStackPtr, byteOffset= 8}] else []) <::> setLabel noTrapLabel <::> (* Update the heap pointer. *) orrShiftedReg{regN=XZero, regM=dest, regD=X_MLHeapAllocPtr, shift=ShiftNone} in toAssembler(rest, allocCode) end | toAssembler(BranchTable{ startLabel=Label lab, brTable } :: rest, code) = toAssembler(rest, List.foldl (fn (Label lab, code) => (unconditionalBranch(getLabelTarget lab)) :: code) (code <::> setLabel(getLabelTarget lab)) brTable) | toAssembler(LoadGlobalHeapBaseInCallback dest :: rest, code) = toAssembler(rest, code <@> List.rev(loadGlobalHeapBaseInCallback dest)) | toAssembler(Yield :: rest, code) = toAssembler(rest, code <::> yield) (* Optimisation passes. *) fun isValidForPair(offset1, offset2) = let val v = Int.min(offset1, offset2) in v >= ~64 andalso v < 64 end fun forward([], list, rep) = reverse(list, [], rep) | forward(SetLabel(Label srcLab) :: (ubr as UnconditionalBranch(Label destLab)) :: tl, list, _) = if srcLab = destLab (* We should never get this because there should always be a stack-check to allow a loop to be broken. If that ever changes we need to retain the label. *) then raise InternalError "Infinite loop detected" else (* Mark this to forward to its destination. *) ( Array.update(labelTargets, srcLab, getLabel destLab); forward(ubr :: tl, list, true) ) | forward(SetLabel(Label jmpLab1) :: (tl as SetLabel(Label jmpLab2) :: _), list, _) = (* Eliminate adjacent labels. They complicate the other tests although they don't incur any run-time cost. *) ( (* Any reference to the first label is forwarded to the second. 
*) Array.update(labelTargets, jmpLab1, getLabel jmpLab2); forward(tl, list, true) ) | forward((ubr as UnconditionalBranch(Label ubrLab)) :: (tl as SetLabel(Label jumpLab) :: _), list, rep) = (* Eliminate unconditional jumps to the next instruction. *) if ubrLab = jumpLab then forward(tl, list, true) else forward(tl, ubr :: list, rep) | forward((cbr as ConditionalBranch(test, Label cbrLab)) :: (ubr as UnconditionalBranch(Label ubrLab)) :: (tl as SetLabel(Label jumpLab) :: _), list, rep) = if cbrLab = jumpLab then (* We have a conditional branch followed by an unconditional branch followed by the destination of the conditional branch. Eliminate the unconditional branch by reversing the test. This can often happen if one branch of an if-then-else has been reduced to zero because the same register has been chosen for the input and output. *) forward(tl (* Leave the label just in case it's used elsewhere*), ConditionalBranch(invertTest test, Label ubrLab) :: list, true) else forward(ubr :: tl, cbr :: list, rep) | forward((load as LoadRegScaled{regT=regT1, regN=regN1, unitOffset=offset1, loadType=lt1}) :: (tl1 as LoadRegScaled{regT=regT2, regN=regN2, unitOffset=offset2, loadType=lt2} ::tl2), list, rep) = (* Two adjacent loads - can this be converted to load-pair? N.B. We have to be careful about the sequence ldr x0,[x0]; ldr x1,[x0+8] which isn't the same at all. *) if regN1 = regN2 andalso regN1 <> regT1 andalso lt1 = lt2 andalso (offset2 = offset1 + 1 orelse offset2 = offset1 - 1) andalso (case lt1 of Load64 => true | Load32 => true | _ => false) andalso isValidForPair(offset1, offset2) then let val (reg1, reg2, offset) = if offset1 < offset2 then (regT1, regT2, offset1) else (regT2, regT1, offset2) in forward(tl2, LoadRegPair{ regT1=reg1, regT2=reg2, regN=regN1, unitOffset=offset, loadType=lt1, unscaledType=NoUpdate} :: list, true) end else forward(tl1, load :: list, rep) | forward((store as StoreRegScaled{regT=regT1, regN=regN1, unitOffset=offset1, loadType=lt1}) :: (tl1 as StoreRegScaled{regT=regT2, regN=regN2, unitOffset=offset2, loadType=lt2} ::tl2), list, rep) = (* Two adjacent stores - can this be converted to store-pair? *) if regN1 = regN2 andalso lt1 = lt2 andalso (offset2 = offset1 + 1 orelse offset2 = offset1 - 1) andalso (case lt1 of Load64 => true | Load32 => true | _ => false) andalso isValidForPair(offset1, offset2) then let val (reg1, reg2, offset) = if offset1 < offset2 then (regT1, regT2, offset1) else (regT2, regT1, offset2) in forward(tl2, StoreRegPair{ regT1=reg1, regT2=reg2, regN=regN1, unitOffset=offset, loadType=lt1, unscaledType=NoUpdate} :: list, true) end else forward(tl1, store :: list, rep) | forward((store as StoreRegUnscaled{regT=regT1, regN=regN1, byteOffset= ~8, loadType=Load64, unscaledType=NoUpdate}) :: (tl1 as StoreRegScaled{regT=regT2, regN=regN2, unitOffset=0, loadType=Load64} ::tl2), list, rep) = (* Common case - store length word and then the first word of the cell. *) if regN1 = regN2 then forward(tl2, StoreRegPair{ regT1=regT1, regT2=regT2, regN=regN1, unitOffset= ~1, loadType=Load64, unscaledType=NoUpdate} :: list, true) else forward(tl1, store :: list, rep) | forward((store as StoreRegUnscaled{regT=regT1, regN=regN1, byteOffset= ~4, loadType=Load32, unscaledType=NoUpdate}) :: (tl1 as StoreRegScaled{regT=regT2, regN=regN2, unitOffset=0, loadType=Load32} ::tl2), list, rep) = (* Common case - store length word and then the first word of the cell. 
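The unscaled store of the length word and the following store at offset zero are merged into a single store-pair with the length word at unit offset ~1, immediately below the start of the cell.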
*) if regN1 = regN2 then forward(tl2, StoreRegPair{ regT1=regT1, regT2=regT2, regN=regN1, unitOffset= ~1, loadType=Load32, unscaledType=NoUpdate} :: list, true) else forward(tl1, store :: list, rep) | forward((store as StoreRegUnscaled{regT=regT1, regN=regN1, byteOffset= ~8, loadType=Load64, unscaledType=PreIndex}) :: (tl1 as StoreRegUnscaled{regT=regT2, regN=regN2, byteOffset= ~8, loadType=Load64, unscaledType=PreIndex} :: tl2), list, rep) = (* Adjacent pushes T2 is in the lower address so the order is T2, T1. The stack is always 64-bit aligned so this works on both native addressing and 32-in-64. *) if regN1 = regN2 then forward(tl2, StoreRegPair{ regT1=regT2, regT2=regT1, regN=regN1, unitOffset= ~2, loadType=Load64, unscaledType=PreIndex} :: list, true) else forward(tl1, store :: list, rep) | forward((add1 as AddImmediate{regN=regN1, regD=regD1, immed=immed1, shifted=false, opSize=OpSize64, setFlags=false}) :: (tl1 as AddImmediate{regN=regN2, regD=regD2, immed=immed2, shifted=false, opSize=OpSize64, setFlags=false} ::tl2), list, rep) = (* Adjacent stack resets. This can apply more generally but only if the result registers are the same. If they're not we may need the intermediate result. We put the result back into the input stream in case it can be combined with another stack reset. *) if regN2 = regD2 andalso regD1 = regD2 andalso immed2+immed1 < 0w4096 then forward(AddImmediate{regN=regN1, regD=regD2, immed=immed2+immed1, shifted=false, opSize=OpSize64, setFlags=false} :: tl2, list, true) else forward(tl1, add1 :: list, rep) - | forward(BitwiseLogical{bits=0w1, regN, regD=XZero, logOp=LogAnd, opSize=_, setFlags=true} :: - ConditionalBranch(CondEqual, label) :: tl2, list, _) = - (* Test the tag bit: bit 0. This is very common to test for nil/not nil. We could include other - values but they're far less likely. *) - forward(TestBitBranch{test=regN, bit=0w0, label=label, onZero=true} :: tl2, list, true) - - | forward(BitwiseLogical{bits=0w1, regN, regD=XZero, logOp=LogAnd, opSize=_, setFlags=true} :: - ConditionalBranch(CondNotEqual, label) :: tl2, list, _) = - forward(TestBitBranch{test=regN, bit=0w0, label=label, onZero=false} :: tl2, list, true) + | forward((bwl as BitwiseLogical{bits, regN, regD=XZero, logOp=LogAnd, opSize=_, setFlags=true}) :: + (tl1 as ConditionalBranch(cond, label) :: tl2), list, rep) = + (* This occurs frequently as either a test of the tag bit (bit 0) or a test of a + boolean (bit 1). Other cases don't currently occur because we don't recognise + logical and followed by a comparison as a special case. *) + ( + case (cond, powerTwo bits) of + (CondEqual, SOME shift) => + forward(TestBitBranch{test=regN, bit=shift, label=label, onZero=true} :: tl2, list, true) + | (CondNotEqual, SOME shift) => + forward(TestBitBranch{test=regN, bit=shift, label=label, onZero=false} :: tl2, list, true) + | _ => forward(tl1, bwl :: list, rep) + ) | forward(hd :: tl, list, rep) = forward(tl, hd :: list, rep) and reverse([], list, rep) = (list, rep) | reverse((add as AddImmediate{regN=regN2, regD=regD2, immed, shifted=false, opSize=OpSize64, setFlags=false}) :: (tl1 as LoadRegScaled{regT=regT1, regN=regN1, unitOffset=0, loadType=Load64} ::tl2), list, rep) = (* A stack reset occurring after a load. This is usually the ML SP but can also occur with C memory ops. It might be possible to consider other cases.
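The load and the stack adjustment are merged into a single post-indexed load, provided the adjustment is less than 256 bytes so that it fits in the unscaled offset field.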
*) if regN1 = regD2 andalso regN2 = regD2 andalso regT1 <> regN1 andalso immed < 0w256 then reverse(tl2, LoadRegUnscaled{regT=regT1, regN=regN1, byteOffset=Word.toInt immed, loadType=Load64, unscaledType=PostIndex} :: list, true) else reverse(tl1, add :: list, rep) | reverse((add as AddImmediate{regN=regN2, regD=regD2, immed, shifted=false, opSize=OpSize64, setFlags=false}) :: (tl1 as LoadRegPair{regT1=regT1, regT2=regT2, regN=regN1, unitOffset=0, loadType=Load64, unscaledType=NoUpdate} ::tl2), list, rep) = (* A stack reset occurring after a load pair *) if regN1 = regD2 andalso regN2 = regD2 andalso regT1 <> regN1 andalso regT2 <> regN1 andalso immed < 0w64 * 0w8 then reverse(tl2, LoadRegPair{regT1=regT1, regT2=regT2, regN=regN1, unitOffset=Word.toInt(immed div 0w8), loadType=Load64, unscaledType=PostIndex} :: list, true) else reverse(tl1, add :: list, rep) | reverse(hd :: tl, list, rep) = reverse(tl, hd :: list, rep) (* Repeat scans through the code until there are no further changes. *) fun repeat ops = case forward(ops, [], false) of (list, false) => list | (list, true) => repeat list val optimised = repeat instrs in generateCode{instrs=List.rev(toAssembler(optimised, [])), name=name, parameters=parameters, resultClosure=resultClosure, profileObject=profileObject} end (* Constant shifts are encoded in the immr and imms fields of the bit-field instruction. *) fun shiftConstant{ direction, regD, regN, shift, opSize } = let val (bitfieldKind, immr, imms) = case (direction, opSize) of (ShiftLeft, OpSize64) => (BFUnsigned, Word.~ shift mod 0w64, 0w64-0w1-shift) | (ShiftLeft, OpSize32) => (BFUnsigned, Word.~ shift mod 0w32, 0w32-0w1-shift) | (ShiftRightLogical, OpSize64) => (BFUnsigned, shift, 0wx3f) | (ShiftRightLogical, OpSize32) => (BFUnsigned, shift, 0wx1f) | (ShiftRightArithmetic, OpSize64) => (BFSigned, shift, 0wx3f) | (ShiftRightArithmetic, OpSize32) => (BFSigned, shift, 0wx1f) in BitField{ regN=regN, regD=regD, opSize=opSize, immr=immr, imms=imms, bitfieldKind=bitfieldKind } end (* These sequences are used both in the ML code-generator and in the FFI code so it is convenient to have them here and share the code. *) local fun allocateWords(fixedReg, workReg, words, bytes, regMask, code) = let val (lengthWord, setLength, flagShift) = if is32in64 then (~4, Load32, 0w24) else (~8, Load64, 0w56) in code <::> AllocateMemoryFixedSize{ bytes=bytes, dest=fixedReg, save=regMask, work=X16 } <::> LoadNonAddr(workReg, Word64.orb(words, Word64.<<(Word64.fromLarge(Word8.toLarge Address.F_bytes), flagShift))) <::> (* Store the length word. Have to use the unaligned version because offset is -ve. 
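In 32-in-64 mode this is a 32-bit store at byte offset ~4 with the flags byte shifted to bit 24; with native addressing it is a 64-bit store at byte offset ~8 with the flags byte at bit 56.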
*) StoreRegUnscaled{regT=workReg, regN=fixedReg, byteOffset= lengthWord, loadType=setLength, unscaledType=NoUpdate} end fun absoluteAddressToIndex(reg, code) = if is32in64 then code <::> SubShiftedReg{regM=X_Base32in64, regN=reg, regD=reg, shift=ShiftNone, opSize=OpSize64, setFlags=false} <::> shiftConstant{direction=ShiftRightLogical, regN=reg, regD=reg, shift=0w2, opSize=OpSize64} else code in fun boxDouble({source, destination, workReg, saveRegs}, code) = absoluteAddressToIndex(destination, allocateWords(destination, workReg, if is32in64 then 0w2 else 0w1, 0w16, saveRegs, code) <::> StoreFPRegScaled{regT=source, regN=destination, unitOffset=0, floatSize=Double64}) and boxSysWord({source, destination, workReg, saveRegs}, code) = absoluteAddressToIndex(destination, allocateWords(destination, workReg, if is32in64 then 0w2 else 0w1, 0w16, saveRegs, code) <::> StoreRegScaled{regT=source, regN=destination, unitOffset=0, loadType=Load64}) and boxFloat({source, destination, workReg, saveRegs}, code) = absoluteAddressToIndex(destination, allocateWords(destination, workReg, 0w1, 0w8, saveRegs, code) <::> StoreFPRegScaled{regT=source, regN=destination, unitOffset=0, floatSize=Float32}) end structure Sharing = struct type closureRef = closureRef type loadType = loadType type opSize = opSize type logicalOp = logicalOp type floatSize = floatSize type shiftDirection = shiftDirection type multKind = multKind type fpUnary = fpUnary type fpBinary = fpBinary type unscaledType = unscaledType type condSet = condSet type bitfieldKind = bitfieldKind type brRegType = brRegType type precode = precode type xReg = xReg type vReg = vReg type label = label type labelMaker = labelMaker type condition = condition type shiftType = shiftType type wordSize = wordSize type 'a extend = 'a extend type scale = scale type instr = instr type atomicOp = atomicOp end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PushRegisters.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PushRegisters.ML index 35131381..59ccfeef 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PushRegisters.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PushRegisters.ML @@ -1,1181 +1,1211 @@ (* Copyright David C. J. Matthews 2021-2 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64PushRegisters( structure Arm64ICode: ARM64ICODE structure IntSet: INTSET structure Identify: ARM64IDENTIFYREFERENCES sharing Arm64ICode.Sharing = Identify.Sharing = IntSet ) : ARM64PUSHREGISTERS = struct open Arm64ICode open IntSet open Identify type basicBlockAbstract = (preg, pregOrZero, preg) basicBlock (* Curried subscript functions *) fun asub a i = Array.sub(a, i) and vsub v i = Vector.sub(v, i) exception InternalError = Misc.InternalError (* Each preg in the input is mapped to either a new preg or the stack. 
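A preg that pushVec marks for pushing is mapped to a stack location (ToStack); any other preg is renamed to a fresh output preg (ToPReg).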
*) datatype pregMapType = Unset | ToPReg of preg | ToStack of int * stackLocn (* The stack contains both entries in the input code and entries added here. It is really used to ensure that the stack at run time is the same size at the start of a block whichever block has jumped to it. *) datatype stackEntry = NewEntry of {pregNo: int} (* pregNo is the original preg that has been pushed here. *) | OriginalEntry of { stackLoc: stackLocn } | HandlerEntry fun addRegisterPushes{code: extendedBasicBlock vector, pushVec: bool vector, pregProps, firstPass=_} = let val maxPRegs = Vector.length pregProps val numberOfBlocks = Vector.length code (* Output registers and properties. *) val pregCounter = ref 0 val pregPropList = ref [] val pregMap = Array.array(maxPRegs, Unset) val maxStack = ref 0 (* The stack size we've assumed for the block. Also indicates if a block has already been processed. *) val inputStackSizes = Array.array(numberOfBlocks, NONE) (* The result of processing a block. *) val blockOutput = Array.array(numberOfBlocks, {code=[], stackCount=0}) (* Extra blocks to adjust the stack are added here. *) val extraBlocks: basicBlockAbstract list ref = ref [] val blockCounter = ref numberOfBlocks (* Get the blocks that are inputs for each one. *) local val blockRefs = Array.array(numberOfBlocks, []) fun setReferences fromBlock = let val ExtendedBasicBlock{ flow, ...} = vsub code fromBlock val refs = successorBlocks flow fun setRefs toBlock = let val oldRefs = asub blockRefs toBlock in Array.update(blockRefs, toBlock, fromBlock :: oldRefs); if null oldRefs then setReferences toBlock else () end in List.app setRefs refs end val () = setReferences 0 in val blockRefs = blockRefs end (* Recursive scan of the blocks. For each block we produce an input and output state. The input state is the output state of the predecessor i.e. some block that jumps to this, but with any entries removed that are not used in this block. It is then necessary to match the input state, if necessary by adding extra blocks that just do the matching. *) local val haveProcessed = isSome o asub inputStackSizes fun processBlocks toDo = case List.filter (fn (n, _) => not(haveProcessed n)) toDo of [] => () (* Nothing left to do *) | stillToDo as head :: _ => let (* Try to find a block all of whose predecessors have been processed. That increases the chances that we will have cached items. TODO: This is no longer necessary since we don't do any caching here now so could may be simplified. *) fun available(dest, _) = List.all haveProcessed (Array.sub(blockRefs, dest)) val (blockNo, lastOutputState) = case List.find available stillToDo of SOME c => c | NONE => head (* This is the first time we've come to this block. *) val ExtendedBasicBlock{ block, flow, imports, passThrough, loopRegs, initialStacks, ...} = vsub code blockNo (* Remove any items from the input state that are no longer needed for this block. They could be local to the previous block or needed by a different successor. Although the values in loopRegs are not required the stack space is so that they can be updated. 
*) fun removeItems(result as {stack=[], stackCount=0}) = result | removeItems{stack=[], ...} = raise InternalError "removeItems - stack size" | removeItems (thisStack as {stack=NewEntry{pregNo} :: rest, stackCount}) = if member(pregNo, imports) orelse member(pregNo, passThrough) orelse member(pregNo, loopRegs) then thisStack else removeItems{stack=rest, stackCount=stackCount-1} | removeItems (thisStack as {stack=OriginalEntry{stackLoc=StackLoc{rno, size}, ...} :: rest, stackCount}) = if member(rno, initialStacks) then thisStack else removeItems{stack=rest, stackCount=stackCount-size} | removeItems result = result val {stackCount=newSp, stack=newStack} = removeItems lastOutputState (* References to hold the current stack count (number of words on the stack) and the list of items on the stack. The list is not used directly to map stack addresses. Instead it is used to match the stack at the beginning and end of a block. *) val stackCount = ref newSp val stack = ref newStack (* Items from the stack that have been marked as deleted but not yet removed. We only remove items from the top of the stack to avoid quadratic behaviour with a very deep stack. *) val deletedItems = ref [] (* Save the stack size in case we come by a different route. *) val () = Array.update(inputStackSizes, blockNo, SOME newSp) fun pushItemToStack item = let val size = case item of NewEntry _ => 1 | OriginalEntry{stackLoc=StackLoc{size, ...}, ...} => size | HandlerEntry => 2 in stackCount := ! stackCount+size; stack := item :: ! stack; maxStack := Int.max(!maxStack, !stackCount) end fun newPReg propKind = let val regNo = !pregCounter before pregCounter := !pregCounter + 1 val () = pregPropList := propKind :: !pregPropList in PReg regNo end and newStackLoc size = let val regNo = !pregCounter before pregCounter := !pregCounter + 1 val () = pregPropList := RegPropStack size :: !pregPropList in StackLoc{size=size, rno=regNo} end (* Map a source register. This always loads the argument. *) fun mapSrcReg(PReg n) = case Array.sub(pregMap, n) of Unset => raise InternalError "mapSrcReg - unset" | ToPReg preg => (preg, []) | ToStack(stackLoc, container as StackLoc{size, ...}) => let (* Make a new untagged register. That will prevent us pushing it if we have to spill registers. *) val newReg = newPReg RegPropUntagged in (newReg, [LoadStack{wordOffset= !stackCount-stackLoc-size, container=container, field=0, dest=newReg}]) end fun mapDestReg(PReg n) = let val currentLocation = Array.sub(pregMap, n) val kind = Vector.sub(pregProps, n) in if Vector.sub(pushVec, n) then let (* This should not have been seen before. *) val _ = case currentLocation of Unset => () | _ => raise InternalError "mapDestReg - already set" val newReg = newPReg kind val newContainer = newStackLoc 1 val () = Array.update(pregMap, n, ToStack (!stackCount, newContainer)) val () = pushItemToStack(NewEntry{pregNo=n}) in (newReg, [PushToStack{source= newReg, container=newContainer, copies=1}]) end else let (* See if we already have a number for it. We may encounter the same preg as a destination when returning the result from a conditional in which case we have to use the same number. We shouldn't have pushed it. *) val newReg = case (currentLocation, kind) of (Unset, _) => let val newReg = newPReg kind val () = Array.update(pregMap, n, ToPReg newReg) in newReg end | (ToPReg preg, RegPropMultiple) => preg | _ => raise InternalError "mapDestReg - multiply defined non-merge reg" in (newReg, []) end end (* Optional destination for arithmetic and logical ops. 
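A ZeroReg destination passes through unchanged since the result is discarded; only a SomeReg destination has to be mapped and may generate push code.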
*) fun mapOptDest ZeroReg = (ZeroReg, []) | mapOptDest (SomeReg destReg) = let val (destVal, destCode) = mapDestReg destReg in (SomeReg destVal, destCode) end fun mapOptSrc ZeroReg = (ZeroReg, []) | mapOptSrc (SomeReg srcReg) = let val (srcVal, srcCode) = mapSrcReg srcReg in (SomeReg srcVal, srcCode) end (* Adjust a stack offset from the old state to the new state. *) fun mapContainerAndStack(StackLoc{rno, size}, field) = let val (newStackAddr, newContainer) = case Array.sub(pregMap, rno) of Unset => raise InternalError "mapContainer - unset" | ToPReg _ => raise InternalError "mapContainer - ToPReg" | ToStack stackContainer => stackContainer val newOffset = !stackCount-(newStackAddr+size) + field in (newOffset, newContainer) end (* Add an entry for an existing stack entry. *) fun mapDestContainer(StackLoc{rno, size}, locn) = ( case Array.sub(pregMap, rno) of Unset => let val newContainer = newStackLoc size val () = Array.update(pregMap, rno, ToStack(locn, newContainer)) in newContainer end | _ => raise InternalError "mapDestContainer: already set" ) (* Map a function argument which could be a register or a stack entry. A register entry could have been pushed. *) fun mapArgument(ArgInReg (PReg r)) = ( case Array.sub(pregMap, r) of Unset => raise InternalError "mapSource - unset" | ToPReg preg => ArgInReg preg | ToStack(stackLoc, container as StackLoc{size, ...}) => ArgOnStack{wordOffset= !stackCount-stackLoc-size, container=container, field=0} ) | mapArgument(ArgOnStack{container, field, ...}) = let val (newOffset, newContainer) = mapContainerAndStack(container, field) in ArgOnStack{container=newContainer, wordOffset=newOffset, field=field} end (* Rewrite the code, replacing any registers that need to be pushed with references to the stack. The result is built up in reverse order and then reversed. *) fun pushRegisters({instr=MoveRegister{ source, dest as PReg dReg }, ...}, code) = if Vector.sub(pushVec, dReg) then (* We're going to push this. *) let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest (* TODO: Since we're pushing it we don't need to move it first. 
*) in destCode @ MoveRegister { source=sourceVal, dest=destVal} :: sourceCode @ code end else let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ MoveRegister { source=sourceVal, dest=destVal} :: sourceCode @ code end | pushRegisters({instr=LoadNonAddressConstant { dest, source}, ...}, code) = let val (destVal, destCode) = mapDestReg dest in destCode @ LoadNonAddressConstant { dest=destVal, source=source} :: code end | pushRegisters({instr=LoadFPConstant { dest, source, floatSize}, ...}, code) = let val (destVal, destCode) = mapDestReg dest in destCode @ LoadFPConstant { dest=destVal, source=source, floatSize=floatSize} :: code end | pushRegisters({instr=LoadAddressConstant { dest, source}, ...}, code) = let val (destVal, destCode) = mapDestReg dest in destCode @ LoadAddressConstant { dest=destVal, source=source} :: code end | pushRegisters({instr=LoadWithConstantOffset { base, dest, byteOffset, loadType}, ...}, code) = let val (baseVal, baseCode) = mapSrcReg base val (destVal, destCode) = mapDestReg dest in destCode @ LoadWithConstantOffset { base=baseVal, dest=destVal, byteOffset=byteOffset, loadType=loadType} :: baseCode @ code end | pushRegisters({instr=LoadFPWithConstantOffset { base, dest, byteOffset, floatSize}, ...}, code) = let val (baseVal, baseCode) = mapSrcReg base val (destVal, destCode) = mapDestReg dest in destCode @ LoadFPWithConstantOffset { base=baseVal, dest=destVal, byteOffset=byteOffset, floatSize=floatSize} :: baseCode @ code end | pushRegisters({instr=LoadWithIndexedOffset { base, dest, index, loadType, signExtendIndex}, ...}, code) = let val (baseVal, baseCode) = mapSrcReg base val (indexVal, indexCode) = mapSrcReg index val (destVal, destCode) = mapDestReg dest in destCode @ LoadWithIndexedOffset { base=baseVal, dest=destVal, index=indexVal, loadType=loadType, signExtendIndex=signExtendIndex} :: indexCode @ baseCode @ code end | pushRegisters({instr=LoadFPWithIndexedOffset { base, dest, index, floatSize, signExtendIndex}, ...}, code) = let val (baseVal, baseCode) = mapSrcReg base val (indexVal, indexCode) = mapSrcReg index val (destVal, destCode) = mapDestReg dest in destCode @ LoadFPWithIndexedOffset { base=baseVal, dest=destVal, index=indexVal, floatSize=floatSize, signExtendIndex=signExtendIndex} :: indexCode @ baseCode @ code end | pushRegisters({instr=GetThreadId { dest}, ...}, code) = let val (destVal, destCode) = mapDestReg dest in destCode @ GetThreadId { dest=destVal} :: code end | pushRegisters({instr=ObjectIndexAddressToAbsolute { source, dest}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ ObjectIndexAddressToAbsolute { source=sourceVal, dest=destVal} :: sourceCode @ code end | pushRegisters({instr=AbsoluteToObjectIndex { source, dest}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ AbsoluteToObjectIndex { source=sourceVal, dest=destVal} :: sourceCode @ code end | pushRegisters({instr=AllocateMemoryFixed { bytesRequired, dest, ...}, ...}, code) = let val (destVal, destCode) = mapDestReg dest in destCode @ AllocateMemoryFixed { dest=destVal, bytesRequired=bytesRequired, saveRegs=[]} :: code end | pushRegisters({instr=AllocateMemoryVariable{size, dest, ...}, ...}, code) = let val (sizeVal, sizeCode) = mapSrcReg size val (destVal, destCode) = mapDestReg dest in destCode @ AllocateMemoryVariable{size=sizeVal, dest=destVal, saveRegs=[]} :: sizeCode @ code 
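(* The saveRegs lists are cleared here; the registers that actually need to be preserved across the allocation are presumably recomputed by a later pass. *)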
end | pushRegisters({instr=InitialiseMem{size, addr, init}, ...}, code) = let val (sizeVal, sizeCode) = mapSrcReg size val (addrVal, addrCode) = mapSrcReg addr val (initVal, initCode) = mapSrcReg init in InitialiseMem{size=sizeVal, addr=addrVal, init=initVal} :: initCode @ addrCode @ sizeCode @ code end | pushRegisters({instr=BeginLoop, ...}, code) = BeginLoop :: code | pushRegisters({instr=JumpLoop{regArgs, stackArgs, checkInterrupt}, ...}, code) = let (* Normally JumpLoop will be the last item in a block but it is possible that we've added a reset-stack after it. *) fun getValues [] = ([], []) | getValues ({src=source, dst=PReg n} :: rest) = let val (otherRegArgs, otherStackArgs) = getValues rest in case Array.sub(pregMap, n) of ToPReg lReg => ({src=mapArgument source, dst=lReg} :: otherRegArgs, otherStackArgs) | ToStack(stackloc, stackC as StackLoc{size, ...}) => let val sourceVal = mapArgument source val stackOff = !stackCount - stackloc - size in (otherRegArgs, {src=sourceVal, wordOffset=stackOff, stackloc=stackC} :: otherStackArgs) end | Unset => (* Drop it. It's never used. This can happen if we are folding a function over a list such that it always returns the last value and then discard the result of the fold. *) (otherRegArgs, otherStackArgs) end val (newRegArguments, newStackArgs) = getValues regArgs fun loadStackArg({src=source, stackloc=destC, ...}, otherArgs) = let val sourceVal = mapArgument source val (newOffset, newContainer) = mapContainerAndStack(destC, 0) in {src=sourceVal, wordOffset=newOffset, stackloc=newContainer} :: otherArgs end val oldStackArgs = List.foldr loadStackArg [] stackArgs val check = case checkInterrupt of NONE => NONE | SOME _ => SOME [] in JumpLoop{ regArgs=newRegArguments, stackArgs=oldStackArgs @ newStackArgs, checkInterrupt=check} :: code end | pushRegisters({instr=StoreWithConstantOffset { base, source, byteOffset, loadType}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (baseVal, baseCode) = mapSrcReg base in StoreWithConstantOffset{ base=baseVal, source=sourceVal, byteOffset=byteOffset, loadType=loadType} :: baseCode @ sourceCode @ code end | pushRegisters({instr=StoreFPWithConstantOffset { base, source, byteOffset, floatSize}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (baseVal, baseCode) = mapSrcReg base in StoreFPWithConstantOffset{ base=baseVal, source=sourceVal, byteOffset=byteOffset, floatSize=floatSize} :: baseCode @ sourceCode @ code end | pushRegisters({instr=StoreWithIndexedOffset { base, source, index, loadType, signExtendIndex}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (baseVal, baseCode) = mapSrcReg base val (indexVal, indexCode) = mapSrcReg index in StoreWithIndexedOffset{ base=baseVal, source=sourceVal, index=indexVal, loadType=loadType, signExtendIndex=signExtendIndex} :: indexCode @ baseCode @ sourceCode @ code end | pushRegisters({instr=StoreFPWithIndexedOffset { base, source, index, floatSize, signExtendIndex}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (baseVal, baseCode) = mapSrcReg base val (indexVal, indexCode) = mapSrcReg index in StoreFPWithIndexedOffset{ base=baseVal, source=sourceVal, index=indexVal, floatSize=floatSize, signExtendIndex=signExtendIndex} :: indexCode @ baseCode @ sourceCode @ code end | pushRegisters({instr=AddSubImmediate{ source, dest, ccRef, immed, isAdd, length }, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapOptDest dest in destCode @ 
AddSubImmediate { source=sourceVal, dest=destVal, ccRef=ccRef, immed=immed, isAdd=isAdd, length=length} :: sourceCode @ code end | pushRegisters({instr=AddSubRegister{ base, shifted, dest, ccRef, isAdd, length, shift}, ...}, code) = let val (op1Val, op1Code) = mapSrcReg base val (op2Val, op2Code) = mapSrcReg shifted val (destVal, destCode) = mapOptDest dest in destCode @ AddSubRegister { base=op1Val, shifted=op2Val, dest=destVal, ccRef=ccRef, isAdd=isAdd, length=length, shift=shift} :: op2Code @ op1Code @ code end | pushRegisters({instr=LogicalImmediate{ source, dest, ccRef, immed, logOp, length }, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapOptDest dest in destCode @ LogicalImmediate { source=sourceVal, dest=destVal, ccRef=ccRef, immed=immed, logOp=logOp, length=length} :: sourceCode @ code end | pushRegisters({instr=LogicalRegister{ base, shifted, dest, ccRef, logOp, length, shift}, ...}, code) = let val (op1Val, op1Code) = mapSrcReg base val (op2Val, op2Code) = mapSrcReg shifted val (destVal, destCode) = mapOptDest dest in destCode @ LogicalRegister { base=op1Val, shifted=op2Val, dest=destVal, ccRef=ccRef, logOp=logOp, length=length, shift=shift} :: op2Code @ op1Code @ code end | pushRegisters({instr=ShiftRegister{ direction, dest, source, shift, opSize}, ...}, code) = let val (srcVal, op1Code) = mapSrcReg source val (shiftVal, op2Code) = mapSrcReg shift val (destVal, destCode) = mapDestReg dest in destCode @ ShiftRegister { source=srcVal, shift=shiftVal, dest=destVal, direction=direction, opSize=opSize} :: op2Code @ op1Code @ code end | pushRegisters({instr=Multiplication{ kind, dest, sourceA, sourceM, sourceN }, ...}, code) = let val (srcAVal, srcACode) = mapOptSrc sourceA val (srcMVal, srcMCode) = mapSrcReg sourceM val (srcNVal, srcNCode) = mapSrcReg sourceN val (destVal, destCode) = mapDestReg dest in destCode @ Multiplication { kind=kind, sourceA=srcAVal, sourceM=srcMVal, sourceN=srcNVal, dest=destVal} :: srcNCode @ srcMCode @ srcACode @ code end | pushRegisters({instr=Division{ isSigned, dest, dividend, divisor, opSize }, ...}, code) = let val (dividendVal, dividendCode) = mapSrcReg dividend val (divisorVal, divisorCode) = mapSrcReg divisor val (destVal, destCode) = mapDestReg dest in destCode @ Division { isSigned=isSigned, dividend=dividendVal, divisor=divisorVal, dest=destVal, opSize=opSize} :: divisorCode @ dividendCode @ code end - | pushRegisters({instr=BeginFunction {regArgs, stackArgs}, ...}, code) = + | pushRegisters({instr=BeginFunction {regArgs, fpRegArgs, stackArgs}, ...}, code) = let (* Create a new container list. The offsets begin at -numArgs. *) fun newContainers(src :: srcs, offset) = let val newContainer = mapDestContainer(src, offset) in newContainer :: newContainers(srcs, offset+1) end | newContainers _ = [] val newStackArgs = newContainers(stackArgs, ~ (List.length stackArgs)) (* Push any registers that need to be pushed. 
*) fun pushReg((preg, rreg), (others, code)) = let - val (newReg, newCode) = mapDestReg(preg) + val (newReg, newCode) = mapDestReg preg in ((newReg, rreg) :: others, newCode @ code) end val (newRegArgs, pushCode) = List.foldl pushReg ([], []) regArgs + val (newFPRegArgs, pushFPCode) = List.foldl pushReg ([], []) fpRegArgs in - pushCode @ BeginFunction {regArgs=newRegArgs, stackArgs=newStackArgs} :: code + pushFPCode @ pushCode @ BeginFunction {regArgs=newRegArgs, fpRegArgs=newFPRegArgs, stackArgs=newStackArgs} :: code end - | pushRegisters({instr=FunctionCall{callKind, regArgs, stackArgs, dests, containers, ...}, ...}, code) = + | pushRegisters({instr=FunctionCall{callKind, regArgs, stackArgs, dests, fpRegArgs, fpDests, containers, ...}, ...}, code) = let (* It's possible that this could lead to having to spill registers in order to load others. Leave that problem for the moment. *) fun loadStackArg (arg, otherArgs) = let val argVal = mapArgument arg in argVal :: otherArgs end val newStackArgs = List.foldr loadStackArg [] stackArgs fun loadRegArg ((arg, reg), otherArgs) = let val argVal = mapArgument arg in (argVal, reg) :: otherArgs end val newRegArgs = List.foldr loadRegArg [] regArgs + fun loadFPRegArg ((PReg n, reg), otherArgs) = + let + (* FP regs are untagged registers and should never be pushed. *) + val argVal = + case Array.sub(pregMap, n) of + Unset => raise InternalError "mapSrcReg - unset" + | ToPReg preg => preg + | ToStack _ => raise InternalError "loadFPRegArg: on stack" + in + (argVal, reg) :: otherArgs + end + val newFPRegArgs = List.foldr loadFPRegArg [] fpRegArgs + (* Push any result registers that need to be pushed. *) fun pushResults((preg, rreg), (others, code)) = let val (newReg, newCode) = mapDestReg preg in ((newReg, rreg) :: others, newCode @ code) end val (destVals, destCode) = List.foldl pushResults ([], []) dests + val (destFPVals, destFPCode) = List.foldl pushResults ([], []) fpDests val newContainers = List.map(fn c => #2(mapContainerAndStack(c, 0))) containers + (* Stack arguments are pushed in addition to anything on the stack. *) + val () = maxStack := Int.max(!maxStack, !stackCount + List.length newStackArgs) in - destCode @ FunctionCall{ callKind=callKind, regArgs=newRegArgs, stackArgs=newStackArgs, - dests=destVals, saveRegs=[], containers=newContainers} :: code + destFPCode @ destCode @ FunctionCall{ callKind=callKind, regArgs=newRegArgs, stackArgs=newStackArgs, + dests=destVals, fpRegArgs=newFPRegArgs, fpDests=destFPVals, saveRegs=[], containers=newContainers} :: code end - | pushRegisters({instr=TailRecursiveCall{callKind, regArgs, stackArgs, stackAdjust, ...}, ...}, code) = + | pushRegisters({instr=TailRecursiveCall{callKind, regArgs, stackArgs, stackAdjust, fpRegArgs, ...}, ...}, code) = let val newStackOffset = !stackCount fun loadStackArg ({src, stack}, (otherLoads, otherArgs)) = let val (argVal, loadCode) = case mapArgument src of (source as ArgOnStack{wordOffset, container, field}) => (* If we're leaving it in its old location or we're pushing it above the current top we're ok. We're also ok if we're moving it from a somewhere above the last argument. Otherwise we have to load it. It goes into a normal tagged register which may mean that it could be pushed onto the stack in a subsequent pass. 
*) if wordOffset = stack+newStackOffset orelse stack+newStackOffset < 0 orelse newStackOffset-wordOffset > ~ stackAdjust then (source, []) else let val preg = newPReg RegPropGeneral in (ArgInReg preg, [LoadStack{wordOffset=wordOffset, container=container, field=field, dest=preg}]) end | argCode => (argCode, []) in (loadCode @ otherLoads, {src=argVal, stack=stack} :: otherArgs) end val (stackArgLoads, newStackArgs) = List.foldr loadStackArg ([], []) stackArgs fun loadRegArg ((arg, reg), otherArgs) = let val argVal = mapArgument arg in (argVal, reg) :: otherArgs end val newRegArgs = List.foldr loadRegArg [] regArgs + fun loadFPRegArg ((PReg n, reg), otherArgs) = + let + (* FP regs are untagged registers and should never be pushed. *) + val argVal = + case Array.sub(pregMap, n) of + Unset => raise InternalError "mapSrcReg - unset" + | ToPReg preg => preg + | ToStack _ => raise InternalError "loadFPRegArg: on stack" + in + (argVal, reg) :: otherArgs + end + val newFPRegArgs = List.foldr loadFPRegArg [] fpRegArgs + (* Stack arguments replace existing arguments but could grow the stack. *) + val () = maxStack := Int.max(!maxStack, List.length newStackArgs) in - TailRecursiveCall{ callKind=callKind, regArgs=newRegArgs, + TailRecursiveCall{ callKind=callKind, regArgs=newRegArgs, fpRegArgs=newFPRegArgs, stackArgs=newStackArgs, stackAdjust=stackAdjust, currStackSize=newStackOffset} :: stackArgLoads @ code end - | pushRegisters({instr=ReturnResultFromFunction{results, returnReg, numStackArgs}, ...}, code) = + | pushRegisters({instr=ReturnResultFromFunction{results, fpResults, returnReg, numStackArgs}, ...}, code) = let - fun loadResults((preg, rreg), (others, code)) = + fun getResults((preg, rreg), (others, code)) = let val (newReg, newCode) = mapSrcReg preg in ((newReg, rreg) :: others, newCode @ code) end - val (resultValues, loadResults) = List.foldr loadResults ([], []) results + val (resultValues, loadResults) = List.foldr getResults ([], []) results + val (fpResultValues, loadFPResults) = List.foldr getResults ([], []) fpResults val (returnValue, loadReturn) = mapSrcReg returnReg val resetCode = if !stackCount = 0 then [] else [ResetStackPtr{numWords= !stackCount}] in - ReturnResultFromFunction{results=resultValues, returnReg=returnValue, numStackArgs=numStackArgs} :: - resetCode @ loadReturn @ loadResults @ code + ReturnResultFromFunction{results=resultValues, fpResults=fpResultValues, returnReg=returnValue, numStackArgs=numStackArgs} :: + resetCode @ loadReturn @ loadFPResults @ loadResults @ code end | pushRegisters({instr=RaiseExceptionPacket{packetReg}, ...}, code) = let val (packetVal, packetCode) = mapSrcReg packetReg in RaiseExceptionPacket{packetReg=packetVal} :: packetCode @ code end | pushRegisters({instr=PushToStack{ source, container, copies }, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source (* This was a push from a previous pass. Treat as a container of size "copies". *) val newContainer = mapDestContainer(container, !stackCount) val () = pushItemToStack(OriginalEntry{stackLoc=container}) in PushToStack{source=sourceVal, container=newContainer, copies=copies} :: sourceCode @ code end | pushRegisters({instr=LoadStack{ dest, container, field, ... 
}, ...}, code) = let val (newOffset, newContainer) = mapContainerAndStack(container, field) val (destVal, destCode) = mapDestReg dest in destCode @ LoadStack{ dest=destVal, container=newContainer, field=field, wordOffset=newOffset } :: code end | pushRegisters({instr=StoreToStack{source, container, field, ...}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (newOffset, newContainer) = mapContainerAndStack(container, field) in StoreToStack{source=sourceVal, container=newContainer, field=field, stackOffset=newOffset} :: sourceCode @ code end | pushRegisters({instr=ContainerAddress{ dest, container, ... }, ...}, code) = let val (newOffset, newContainer) = mapContainerAndStack(container, 0) val (destVal, destCode) = mapDestReg dest in destCode @ ContainerAddress{ dest=destVal, container=newContainer, stackOffset=newOffset } :: code end | pushRegisters({instr=ResetStackPtr _, ...}, code) = code (* Added in a previous pass - discard it. *) | pushRegisters({instr=TagValue{source, dest, isSigned, opSize}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ TagValue{source=sourceVal, dest=destVal, isSigned=isSigned, opSize=opSize} :: sourceCode @ code end | pushRegisters({instr=UntagValue{source, dest, isSigned, opSize, ...}, ...}, code) = let val (loadedSource, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ UntagValue{source=loadedSource, dest=destVal, isSigned=isSigned, opSize=opSize} :: sourceCode @ code end | pushRegisters({instr=BoxLarge{source, dest, ...}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ BoxLarge{source=sourceVal, dest=destVal, saveRegs=[]} :: sourceCode @ code end | pushRegisters({instr=UnboxLarge{source, dest, ...}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ UnboxLarge{source=sourceVal, dest=destVal} :: sourceCode @ code end | pushRegisters({instr=BoxTagFloat{floatSize, source, dest, ...}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ BoxTagFloat{floatSize=floatSize, source=sourceVal, dest=destVal, saveRegs=[]} :: sourceCode @ code end | pushRegisters({instr=UnboxTagFloat{floatSize, source, dest, ...}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ UnboxTagFloat{floatSize=floatSize, source=sourceVal, dest=destVal} :: sourceCode @ code end | pushRegisters({instr=LoadAcquire { base, dest, loadType}, ...}, code) = let val (baseVal, baseCode) = mapSrcReg base val (destVal, destCode) = mapDestReg dest in destCode @ LoadAcquire { base=baseVal, dest=destVal, loadType=loadType} :: baseCode @ code end | pushRegisters({instr=StoreRelease { base, source, loadType}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (baseVal, baseCode) = mapSrcReg base in StoreRelease{ base=baseVal, source=sourceVal, loadType=loadType} :: baseCode @ sourceCode @ code end | pushRegisters({instr=BitFieldShift{source, dest, isSigned, length, immr, imms}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ BitFieldShift { source=sourceVal, dest=destVal, isSigned=isSigned, immr=immr, imms=imms, length=length} :: sourceCode @ code end | pushRegisters({instr=BitFieldInsert{source, destAsSource, dest, 
length, immr, imms}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destSrcVal, destSrcCode) = mapSrcReg destAsSource val (destVal, destCode) = mapDestReg dest in destCode @ BitFieldInsert { source=sourceVal, destAsSource=destSrcVal, dest=destVal, immr=immr, imms=imms, length=length} :: destSrcCode @ sourceCode @ code end | pushRegisters({instr=IndexedCaseOperation{testReg}, ...}, code) = let val (testVal, testCode) = mapSrcReg testReg in IndexedCaseOperation{testReg=testVal} :: testCode @ code end | pushRegisters({instr=PushExceptionHandler, ...}, code) = let (* Add a handler entry to the stack. *) val () = pushItemToStack HandlerEntry in PushExceptionHandler :: code end | pushRegisters({instr=PopExceptionHandler, ...}, code) = let (* Appears at the end of the block whose exceptions are being handled. Delete the handler and anything above it. *) (* Get the state after removing the handler. *) fun popContext ([], _) = raise InternalError "pushRegisters - pop handler" | popContext (HandlerEntry :: tl, new) = (tl, new-2) | popContext (OriginalEntry{stackLoc=StackLoc{size, ...}, ...} :: tl, new) = popContext(tl, new-size) | popContext (NewEntry _ :: tl, new) = popContext(tl, new-1) val (newStack, nnCount) = popContext(!stack, !stackCount) val () = stack := newStack val oldStackPtr = ! stackCount val () = stackCount := nnCount (* Reset the stack to just above the two words of the handler. *) val resetCode = if oldStackPtr <> nnCount+2 then [ResetStackPtr{numWords=oldStackPtr-nnCount-2}] else [] in PopExceptionHandler :: resetCode @ code end | pushRegisters({instr=BeginHandler{packetReg}, ...}, code) = let (* Start of a handler. The top active entry should be the handler. *) val () = case !stack of HandlerEntry :: tl => stack := tl | _ => raise InternalError "pushRegisters: BeginHandler" val () = stackCount := !stackCount - 2 val (packetVal, packetCode) = mapDestReg packetReg in packetCode @ BeginHandler{packetReg=packetVal} :: code end | pushRegisters({instr=CompareByteVectors{vec1Addr, vec2Addr, length, ccRef}, ...}, code) = let val (vec1Val, vec1Code) = mapSrcReg vec1Addr val (vec2Val, vec2Code) = mapSrcReg vec2Addr val (lenVal, lenCode) = mapSrcReg length in CompareByteVectors{vec1Addr=vec1Val, vec2Addr=vec2Val, length=lenVal, ccRef=ccRef} :: lenCode @ vec2Code @ vec1Code @ code end | pushRegisters({instr=BlockMove{srcAddr, destAddr, length, isByteMove}, ...}, code) = let val (srcVal, srcCode) = mapSrcReg srcAddr val (destVal, destCode) = mapSrcReg destAddr val (lenVal, lenCode) = mapSrcReg length in BlockMove{srcAddr=srcVal, destAddr=destVal, length=lenVal, isByteMove=isByteMove} :: lenCode @ destCode @ srcCode @ code end | pushRegisters({instr=AddSubXSP{source, dest, isAdd}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapOptDest dest in destCode @ AddSubXSP { source=sourceVal, dest=destVal, isAdd=isAdd} :: sourceCode @ code end | pushRegisters({instr=TouchValue{source, ...}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source in TouchValue { source=sourceVal} :: sourceCode @ code end | pushRegisters({instr=LoadAcquireExclusive{ base, dest }, ...}, code) = let val (baseVal, baseCode) = mapSrcReg base val (destVal, destCode) = mapDestReg dest in destCode @ LoadAcquireExclusive { base=baseVal, dest=destVal} :: baseCode @ code end | pushRegisters({instr=StoreReleaseExclusive{ base, source, result }, ...}, code) = let val (sourceVal, sourceCode) = mapOptSrc source val (baseVal, baseCode) = mapSrcReg base val 
(resVal, resCode) = mapDestReg result in resCode @ StoreReleaseExclusive{ base=baseVal, source=sourceVal, result=resVal} :: baseCode @ sourceCode @ code end | pushRegisters({instr=MemoryBarrier, ...}, code) = MemoryBarrier :: code | pushRegisters({instr=ConvertIntToFloat{ source, dest, srcSize, destSize}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ ConvertIntToFloat{ source=sourceVal, dest=destVal, srcSize=srcSize, destSize=destSize} :: sourceCode @ code end | pushRegisters({instr=ConvertFloatToInt{ source, dest, srcSize, destSize, rounding}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ ConvertFloatToInt{ source=sourceVal, dest=destVal, srcSize=srcSize, destSize=destSize, rounding=rounding} :: sourceCode @ code end | pushRegisters({instr=UnaryFloatingPt{ source, dest, fpOp}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ UnaryFloatingPt{ source=sourceVal, dest=destVal, fpOp=fpOp} :: sourceCode @ code end | pushRegisters({instr=BinaryFloatingPoint{ arg1, arg2, dest, fpOp, opSize}, ...}, code) = let val (arg1Val, arg1Code) = mapSrcReg arg1 val (arg2Val, arg2Code) = mapSrcReg arg2 val (destVal, destCode) = mapDestReg dest in destCode @ BinaryFloatingPoint{ arg1=arg1Val, arg2=arg2Val, dest=destVal, fpOp=fpOp, opSize=opSize} :: arg2Code @ arg1Code @ code end | pushRegisters({instr=CompareFloatingPoint{ arg1, arg2, opSize, ccRef}, ...}, code) = let val (arg1Val, arg1Code) = mapSrcReg arg1 val (arg2Val, arg2Code) = mapSrcReg arg2 in CompareFloatingPoint{ arg1=arg1Val, arg2=arg2Val, opSize=opSize, ccRef=ccRef} :: arg2Code @ arg1Code @ code end | pushRegisters({instr=CPUYield, ...}, code) = CPUYield :: code | pushRegisters({instr=AtomicOperation{ base, source, dest, atOp }, ...}, code) = let val (sourceVal, sourceCode) = mapOptSrc source val (baseVal, baseCode) = mapSrcReg base val (destVal, destCode) = mapOptDest dest in destCode @ AtomicOperation{ base=baseVal, source=sourceVal, dest=destVal, atOp=atOp } :: baseCode @ sourceCode @ code end - | pushRegisters({instr=CacheCheck _, ...}, _) = raise InternalError "pushRegisters: CacheCheck" - local fun doPush(instr as {kill, ...}, code) = let val newCode = pushRegisters(instr, code) (* Can we pop the stack? *) val stackReset = case setToList (minus(kill, loopRegs)) of [] => [] | killList => let (* See if any of the kill items are at the top of the stack. If they are we can pop them and perhaps items we've previously marked for deletion but not been able to pop. *) val oldStack = !stackCount fun checkAndAdd(r, output) = case Array.sub(pregMap, r) of ToStack(stackLoc, StackLoc{size, ...}) => if stackLoc < 0 then r :: output (* We can have arguments and return address. 
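Such entries are below this frame's working stack (arguments are given negative offsets at function entry), so they can never be at the top of the stack and simply stay in the deleted-but-not-yet-removed list.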
*) else if !stackCount = stackLoc+size then ( stack := tl (!stack); stackCount := stackLoc; output ) else r :: output | _ => r :: output val toAdd = List.foldl checkAndAdd [] killList fun reprocess list = let val prevStack = !stackCount val outlist = List.foldl checkAndAdd [] list in if !stackCount = prevStack then list else reprocess outlist end val () = if !stackCount = oldStack then deletedItems := toAdd @ !deletedItems else deletedItems := reprocess(toAdd @ !deletedItems) val _ = oldStack >= !stackCount orelse raise InternalError "negative stack offset" in if !stackCount = oldStack then [] else [ResetStackPtr{numWords=oldStack - !stackCount}] end in stackReset @ newCode end in val codeResult = List.foldl doPush [] block val outputCount = ! stackCount val results = {code=codeResult, stackCount= outputCount} val stateResult = { stackCount= outputCount, stack= !stack } val () = Array.update(blockOutput, blockNo, results) end val addSet = case flow of ExitCode => [] | IndexedBr cases => cases | Unconditional dest => [dest] | Conditional {trueJump, falseJump, ...} => [falseJump, trueJump] | SetHandler { handler, continue } => [handler, continue] | UnconditionalHandle _ => [] | ConditionalHandle { continue, ...} => [continue] val addItems = List.map(fn m => (m, stateResult)) addSet in processBlocks(addItems @ stillToDo) end in val () = processBlocks([(0, {stack=[], stackCount=0})]) end (* Put together the result code and blocks. *) local fun createBlock blockNo = (* Skip unreferenced blocks apart from block 0. *) if blockNo <> 0 andalso null (asub blockRefs blockNo) then BasicBlock{block=[], flow=ExitCode} else let val ExtendedBasicBlock{ flow, ...} = vsub code blockNo val {code=codeResult, stackCount=outputCount, ...} = asub blockOutput blockNo (* Process the successor. If we need a stack adjustment this will require an adjustment block. TODO: We could put a pre-adjustment if we only have one branch to this block. *) fun matchStacks targetBlock = let (* Process the destination. If it hasn't been processed. *) val expectedInput = valOf (asub inputStackSizes targetBlock) in if expectedInput = outputCount then targetBlock else let val _ = outputCount > expectedInput orelse raise InternalError "adjustStack" val adjustCode = [ResetStackPtr{numWords=outputCount-expectedInput}] val newBlock = BasicBlock{block=adjustCode, flow=Unconditional targetBlock} val newBlockNo = !blockCounter before blockCounter := !blockCounter+1 val () = extraBlocks := newBlock :: !extraBlocks in newBlockNo end end val (finalCode, newFlow) = case flow of ExitCode => (codeResult, ExitCode) | Unconditional m => let (* Process the block. Since we're making an unconditional jump we can include any stack adjustment needed to match the destination in here. In particular this includes loops. *) val expectedInput = valOf (asub inputStackSizes m) val _ = outputCount >= expectedInput orelse raise InternalError "negative reset" val resultCode = if expectedInput = outputCount then codeResult else ResetStackPtr{numWords=outputCount-expectedInput} :: codeResult in (resultCode, Unconditional m) end (* For any of these, if we need to adjust the stack we have to add an adjustment block. 
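matchStacks does this: it creates a block containing only a ResetStackPtr followed by an unconditional branch to the original target and redirects the flow to the new block. For example, if this block leaves four words on the stack and the target expects one, the new block resets the stack pointer by three words.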
*) | Conditional {trueJump, falseJump, ccRef, condition} => (codeResult, Conditional{trueJump=matchStacks trueJump, falseJump=matchStacks falseJump, ccRef=ccRef, condition=condition}) | SetHandler{ handler, continue } => (codeResult, SetHandler{ handler=matchStacks handler, continue=matchStacks continue}) | IndexedBr cases => (codeResult, IndexedBr(map matchStacks cases)) | u as UnconditionalHandle _ => (codeResult, u) | c as ConditionalHandle{ continue, ... } => let (* As for unconditional branch *) val expectedInput = valOf (asub inputStackSizes continue) val _ = outputCount >= expectedInput orelse raise InternalError "negative reset" val resultCode = if expectedInput = outputCount then codeResult else ResetStackPtr{numWords=outputCount-expectedInput} :: codeResult in (resultCode, c) end in BasicBlock{block=List.rev finalCode, flow=newFlow} end in val resultBlocks = Vector.tabulate(numberOfBlocks, createBlock) end (* Add any extra blocks to the result. *) val finalResult = case !extraBlocks of [] => resultBlocks | blocks => Vector.concat[resultBlocks, Vector.fromList(List.rev blocks)] val pregProperties = Vector.fromList(List.rev(! pregPropList)) in {code=finalResult, pregProps=pregProperties, maxStack= !maxStack} end structure Sharing = struct type extendedBasicBlock = extendedBasicBlock and ('genReg, 'optGenReg, 'fpReg) basicBlock = ('genReg, 'optGenReg, 'fpReg) basicBlock and regProperty = regProperty and preg = preg and pregOrZero = pregOrZero end end; diff --git a/mlsource/MLCompiler/CodeTree/X86Code/X86PushRegisters.ML b/mlsource/MLCompiler/CodeTree/X86Code/X86PushRegisters.ML index c4d79200..c8d66d97 100644 --- a/mlsource/MLCompiler/CodeTree/X86Code/X86PushRegisters.ML +++ b/mlsource/MLCompiler/CodeTree/X86Code/X86PushRegisters.ML @@ -1,1557 +1,1561 @@ (* Copyright David C. J. Matthews 2016-21 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor X86PushRegisters( structure X86ICode: X86ICODE structure IntSet: INTSET structure Identify: X86IDENTIFYREFERENCES sharing X86ICode.Sharing = Identify.Sharing = IntSet ) : X86PUSHREGISTERS = struct open X86ICode open IntSet open Identify (* Curried subscript functions *) fun asub a i = Array.sub(a, i) and vsub v i = Vector.sub(v, i) exception InternalError = Misc.InternalError (* Each preg in the input is mapped to either a new preg or the stack. *) datatype pregMapType = Unset | ToPReg of preg | ToStack of int * stackLocn (* The stack contains both entries in the input code and entries added here. It is really used to ensure that the stack at run time is the same size at the start of a block whichever block has jumped to it. *) datatype stackEntry = NewEntry of {pregNo: int} (* pregNo is the original preg that has been pushed here. 
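removeItems uses this number to check whether a successor block still needs the value before dropping the slot.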
*) | OriginalEntry of { stackLoc: stackLocn } | HandlerEntry fun addRegisterPushes{code: extendedBasicBlock vector, pushVec: bool vector, pregProps, firstPass} = let val maxPRegs = Vector.length pregProps val numberOfBlocks = Vector.length code (* Output registers and properties. *) val pregCounter = ref 0 val pregPropList = ref [] val pregMap = Array.array(maxPRegs, Unset) (* Cache registers. *) datatype cacheType = CacheStack of { rno: int }(* Original preg or stack loc. *) (* Cache memory location. This allows for general base/index/offset addressing but currently we only cache either NoMemIndex or ObjectIndex. *) | CacheMemory of { base: preg, offset: int, index: memoryIndex } (* CacheTagged is used if we tag a value to see if we can use the original untagged value somewhere. *) | CacheTagged of { reg: preg, isSigned: bool, opSize: opSize } (* CacheFloat is used if we tag a float (Real32.real). Double-precision reals (Real.real) are handled as CacheMemory *) | CacheFloat of { reg: preg } local (* The number of active cache entries is likely to be small and is at most proportional to the number of instructions in the block. Any function call will clear it. For memory entries we need to know if the value is tagged and what kind of move we're using. Stack entries always will be tagged and MoveWord. *) val cache: {cacheFor: cacheType, cacheReg: preg, isTagged: bool, kind: moveKind } list ref = ref [] fun isStack n {cacheFor, ...} = cacheFor = CacheStack{rno = n} and isMemory (r, off, index) {cacheFor, ...} = cacheFor = CacheMemory {base = r, offset = off, index=index} and isTagCache(r, s, os) {cacheFor, ...} = cacheFor = CacheTagged{reg = r, isSigned = s, opSize = os} and isFloatCache r {cacheFor, ...} = cacheFor =CacheFloat{reg = r } fun findCache f = List.find f (! cache) fun removeCache f = cache := List.filter (not o f) (! cache) in fun clearCache() = cache := [] fun findCachedStack n = Option.map (#cacheReg) (findCache (isStack n)) and findCachedMemory (r, off, index, kind) = ( case findCache(isMemory (r, off, index)) of SOME {cacheReg, isTagged, kind=cacheKind, ...} => (* Must check the size of the operand. In particular we could have loaded the low order 32-bits in 32-in-64 but later want all 64-bits because it's a large-word. See Test182. *) if kind = cacheKind then SOME (cacheReg, isTagged, kind) else NONE | NONE => NONE ) and findCachedTagged (r, s, os) = Option.map #cacheReg (findCache(isTagCache (r, s, os))) and findCachedFloat r = Option.map #cacheReg (findCache(isFloatCache r)) fun removeStackCache n = removeCache (isStack n) and removeMemoryCache (r, off, index) = removeCache (isMemory (r, off, index)) and removeTagCache (r, s, os) = removeCache (isTagCache (r, s, os)) and removeFloatCache r = removeCache (isFloatCache r) fun clearMemoryCache() = cache := List.filter(fn {cacheFor=CacheMemory _,...} => false | _ => true) (!cache) fun setStackCache(n, new) = ( removeStackCache n; cache := {cacheFor=CacheStack{rno=n}, cacheReg=new, isTagged=true, kind=moveNativeWord} :: ! cache ) and setMemoryCache(r, off, index, new, isTagged, kind) = ( removeMemoryCache (r, off, index); cache := {cacheFor=CacheMemory{base=r, offset=off, index=index}, cacheReg=new, isTagged=isTagged, kind=kind} :: ! cache ) and setTagCache(r, s, os, new) = ( removeTagCache (r, s, os); cache := {cacheFor=CacheTagged{reg=r, isSigned=s, opSize=os}, cacheReg=new, isTagged=true, kind=moveNativeWord} :: ! 
cache ) and setFloatCache(r, new) = ( removeFloatCache r; cache := {cacheFor=CacheFloat{reg=r}, cacheReg=new, isTagged=true, kind=MoveFloat} :: ! cache ) fun getCache () = ! cache (* Merge the cache states *) fun setCommonCacheState [] = clearCache() | setCommonCacheState [single] = cache := single | setCommonCacheState (many as first :: rest) = let (* Generally we will either be unable to merge and have an empty cache or will have just one or two entries. *) (* Find the shortest. If it's empty we're done. *) fun findShortest(_, [], _) = [] | findShortest(_, shortest, []) = shortest | findShortest(len, shortest, hd::tl) = let val hdLen = length hd in if hdLen < len then findShortest(hdLen, hd, tl) else findShortest(len, shortest, tl) end val shortest = findShortest(length first, first, rest) (* Find the item we're caching for. If it is in a different register we can't use it. *) fun findItem search (hd::tl) = if #cacheFor hd = #cacheFor search then #cacheReg hd = #cacheReg search else findItem search tl | findItem _ [] = false (* It's present if it's in all the sources. *) fun present search = List.all(findItem search) many val filtered = List.foldl (fn (search, l) => if present search then search :: l else l) [] shortest in cache := filtered end end val maxStack = ref 0 (* The stack size we've assumed for the block. Also indicates if a block has already been processed. *) val inputStackSizes = Array.array(numberOfBlocks, NONE: {expectedInput:int, reqCC: bool} option) (* The result of processing a block. *) val blockOutput = Array.array(numberOfBlocks, {code=[], cache=[], stackCount=0}) (* Extra blocks to adjust the stack are added here. *) val extraBlocks: basicBlock list ref = ref [] val blockCounter = ref numberOfBlocks (* Get the blocks that are inputs for each one. *) local val blockRefs = Array.array(numberOfBlocks, []) fun setReferences fromBlock = let val ExtendedBasicBlock{ flow, ...} = vsub code fromBlock val refs = successorBlocks flow fun setRefs toBlock = let val oldRefs = asub blockRefs toBlock in Array.update(blockRefs, toBlock, fromBlock :: oldRefs); if null oldRefs then setReferences toBlock else () end in List.app setRefs refs end val () = setReferences 0 in val blockRefs = blockRefs end (* Recursive scan of the blocks. For each block we produce an input and output state. The input state is the output state of the predecessor i.e. some block that jumps to this, but with any entries removed that are not used in this block. It is then necessary to match the input state, if necessary by adding extra blocks that just do the matching. *) local val haveProcessed = isSome o asub inputStackSizes fun processBlocks toDo = case List.filter (fn (n, _) => not(haveProcessed n)) toDo of [] => () (* Nothing left to do *) | stillToDo as head :: _ => let (* Try to find a block all of whose predecessors have been processed. That increases the chances that we will have cached items. *) fun available(dest, _) = List.all haveProcessed (Array.sub(blockRefs, dest)) val (blockNo, lastOutputState) = case List.find available stillToDo of SOME c => c | NONE => head (* This is the first time we've come to this block. *) val ExtendedBasicBlock{ block, flow, imports, passThrough, loopRegs, inCCState, initialStacks, ...} = vsub code blockNo val requiresCC = isSome inCCState (* Remove any items from the input state that are no longer needed for this block. They could be local to the previous block or needed by a different successor. 
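A NewEntry survives only if its preg is in imports, passThrough or loopRegs, and an OriginalEntry only if its stack location is in initialStacks.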
Although the values in loopRegs are not required the stack space is so that they can be updated. *) fun removeItems(result as {stack=[], stackCount=0}) = result | removeItems{stack=[], ...} = raise InternalError "removeItems - stack size" | removeItems (thisStack as {stack=NewEntry{pregNo} :: rest, stackCount}) = if member(pregNo, imports) orelse member(pregNo, passThrough) orelse member(pregNo, loopRegs) then thisStack else removeItems{stack=rest, stackCount=stackCount-1} | removeItems (thisStack as {stack=OriginalEntry{stackLoc=StackLoc{rno, size}, ...} :: rest, stackCount}) = if member(rno, initialStacks) then thisStack else removeItems{stack=rest, stackCount=stackCount-size} | removeItems result = result val {stackCount=newSp, stack=newStack} = removeItems lastOutputState (* References to hold the current stack count (number of words on the stack) and the list of items on the stack. The list is not used directly to map stack addresses. Instead it is used to match the stack at the beginning and end of a block. *) val stackCount = ref newSp val stack = ref newStack (* Items from the stack that have been marked as deleted but not yet removed. We only remove items from the top of the stack to avoid quadratic behaviour with a very deep stack. *) val deletedItems = ref [] (* Save the stack size in case we come by a different route. *) val () = Array.update(inputStackSizes, blockNo, SOME{expectedInput=newSp, reqCC=requiresCC}) fun pushItemToStack item = let val size = case item of NewEntry _ => 1 | OriginalEntry{stackLoc=StackLoc{size, ...}, ...} => size | HandlerEntry => 2 in stackCount := ! stackCount+size; stack := item :: ! stack; maxStack := Int.max(!maxStack, !stackCount) end fun newPReg propKind = let val regNo = !pregCounter before pregCounter := !pregCounter + 1 val () = pregPropList := propKind :: !pregPropList in PReg regNo end and newStackLoc size = let val regNo = !pregCounter before pregCounter := !pregCounter + 1 val () = pregPropList := RegPropStack size :: !pregPropList in StackLoc{size=size, rno=regNo} end (* Map a source register. This always loads the argument. *) fun mapSrcRegEx(PReg n) = case Array.sub(pregMap, n) of Unset => raise InternalError "mapSrcReg - unset" | ToPReg preg => (preg, [], []) | ToStack(stackLoc, container as StackLoc{size, ...}) => let (* Make a new untagged register. That will prevent us pushing it if we have to spill registers. *) val newReg = newPReg RegPropUntagged val sourceCache = findCachedStack n val stackSource = StackLocation{wordOffset= !stackCount-stackLoc-size, container=container, field=0, cache=sourceCache} (* Because this is in a register we can copy it to a cache register. *) val newCacheReg = newPReg RegPropCacheTagged val () = setStackCache(n, newCacheReg) in (newReg, [LoadArgument{source=stackSource, dest=newReg, kind=moveNativeWord}], [CopyToCache{source=newReg, dest=newCacheReg, kind=moveNativeWord}]) end fun mapSrcReg srcReg = let val (newReg, codePre, codePost) = mapSrcRegEx srcReg in (newReg, codePost @ codePre) end fun mapDestReg(PReg n) = let val currentLocation = Array.sub(pregMap, n) val kind = Vector.sub(pregProps, n) in if Vector.sub(pushVec, n) then let (* This should not have been seen before. 
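We allocate a fresh preg and a one-word container, record the mapping, and push the value immediately after the instruction that produces it.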
*) val _ = case currentLocation of Unset => () | _ => raise InternalError "mapDestReg - already set" val newReg = newPReg kind val newContainer = newStackLoc 1 val () = Array.update(pregMap, n, ToStack (!stackCount, newContainer)) val () = pushItemToStack(NewEntry{pregNo=n}) in (newReg, [PushValue{arg=RegisterArgument newReg, container=newContainer}]) end else let (* See if we already have a number for it. We may encounter the same preg as a destination when returning the result from a conditional in which case we have to use the same number. We shouldn't have pushed it. *) val newReg = case (currentLocation, kind) of (Unset, _) => let val newReg = newPReg kind val () = Array.update(pregMap, n, ToPReg newReg) in newReg end | (ToPReg preg, RegPropMultiple) => preg | _ => raise InternalError "mapDestReg - multiply defined non-merge reg" in (newReg, []) end end (* A work register must be a normal register. *) fun mapWorkReg(PReg n) = let val currentLocation = Array.sub(pregMap, n) val _ = Vector.sub(pushVec, n) andalso raise InternalError "mapWorkReg - MustPush" in case currentLocation of Unset => let val kind = Vector.sub(pregProps, n) val newReg = newPReg kind val () = Array.update(pregMap, n, ToPReg newReg) in newReg end | ToPReg preg => preg | ToStack _ => raise InternalError "mapWorkReg - on stack" end fun mapIndexEx(NoMemIndex) = (NoMemIndex, [], []) | mapIndexEx(MemIndex1 r) = let val (sreg, c1, c2) = mapSrcRegEx r in (MemIndex1 sreg, c1, c2) end | mapIndexEx(MemIndex2 r) = let val (sreg, c1, c2) = mapSrcRegEx r in (MemIndex2 sreg, c1, c2) end | mapIndexEx(MemIndex4 r) = let val (sreg, c1, c2) = mapSrcRegEx r in (MemIndex4 sreg, c1, c2) end | mapIndexEx(MemIndex8 r) = let val (sreg, c1, c2) = mapSrcRegEx r in (MemIndex8 sreg, c1, c2) end | mapIndexEx(ObjectIndex) = (ObjectIndex, [], []) fun mapIndex index = let val (newIndex, codePre, codePost) = mapIndexEx index in (newIndex, codePost @ codePre) end (* Adjust a stack offset from the old state to the new state. *) fun mapContainerAndStack(StackLoc{rno, size}, field) = let val (newStackAddr, newContainer) = case Array.sub(pregMap, rno) of Unset => raise InternalError "mapContainer - unset" | ToPReg _ => raise InternalError "mapContainer - ToPReg" | ToStack stackContainer => stackContainer val newOffset = !stackCount-(newStackAddr+size) + field in (newOffset, newContainer) end (* Add an entry for an existing stack entry. *) fun mapDestContainer(StackLoc{rno, size}, locn) = ( case Array.sub(pregMap, rno) of Unset => let val newContainer = newStackLoc size val () = Array.update(pregMap, rno, ToStack(locn, newContainer)) in newContainer end | _ => raise InternalError "mapDestContainer: already set" ) fun mapSourceEx(RegisterArgument(PReg r), _) = ( case Array.sub(pregMap, r) of Unset => raise InternalError "mapSource - unset" | ToPReg preg => (RegisterArgument preg, [], []) | ToStack(stackLoc, container as StackLoc{size, ...}) => let val sourceCache = findCachedStack r val stackLoc = StackLocation{wordOffset= !stackCount-stackLoc-size, container=container, field=0, cache=sourceCache} (* If this is cached we need to make a new cache register and copy it there. 
*) val cacheCode = case sourceCache of NONE => [] | SOME cacheR => let val newCacheReg = newPReg RegPropCacheTagged val () = setStackCache(r, newCacheReg) in [CopyToCache{source=cacheR, dest=newCacheReg, kind=moveNativeWord}] end in (stackLoc, [], cacheCode) end ) | mapSourceEx(a as AddressConstant _, _) = (a, [], []) | mapSourceEx(i as IntegerConstant _, _) = (i, [], []) | mapSourceEx(MemoryLocation{base, offset, index, cache, ...}, kind) = if (case index of NoMemIndex => true | ObjectIndex => true | _ => false) then let val (baseReg, baseCodePre, baseCodePost) = mapSrcRegEx base (* We can cache this if it is the first pass or if we have previously cached it and we haven't marked it as pushed. *) val newCache = case cache of NONE => if firstPass then findCachedMemory(base, offset, index, kind) else NONE | SOME (PReg c) => if Vector.sub(pushVec, c) then NONE (* We had marked this as to be pushed - we can't use a cache here. *) else findCachedMemory(base, offset, index, kind) val memLoc = MemoryLocation{base=baseReg, offset=offset, index=index, cache=Option.map #1 newCache} val cacheCode = case newCache of NONE => (removeMemoryCache(base, offset, index); []) | SOME (oldCacheReg, isTagged, kind) => let (* Set the cache kind. If this is the first pass we will have a general or untagged register. *) val cacheKind = if isTagged then RegPropCacheTagged else RegPropCacheUntagged val newCacheReg = newPReg cacheKind val () = setMemoryCache(base, offset, index, newCacheReg, isTagged, kind) in [CopyToCache{source=oldCacheReg, dest=newCacheReg, kind=kind}] end in (memLoc, baseCodePre, baseCodePost @ cacheCode) end else let val (baseReg, baseCodePre, baseCodePost) = mapSrcRegEx base val (indexValue, indexCodePre, indexCodePost) = mapIndexEx index in (MemoryLocation{base=baseReg, offset=offset, index=indexValue, cache=NONE}, baseCodePre @ indexCodePre, baseCodePost @ indexCodePost) end | mapSourceEx(StackLocation{container as StackLoc{rno, ...}, field, cache, ...}, _) = let val (newOffset, newContainer) = mapContainerAndStack(container, field) (* Was the item previously cached? If it wasn't or the cache reg has been marked as "must push" we can't use a cache. *) val newCache = case cache of NONE => NONE | SOME (PReg c) => if Vector.sub(pushVec, c) then NONE (* We had marked this as to be pushed - we can't use a cache here. *) else findCachedStack rno val stackLoc = StackLocation{wordOffset=newOffset, container=newContainer, field=field, cache=newCache} val cacheCode = case newCache of NONE => (removeStackCache rno; []) | SOME oldCacheReg => let val newCacheReg = newPReg RegPropCacheTagged val () = setStackCache(rno, newCacheReg) in [CopyToCache{source=oldCacheReg, dest=newCacheReg, kind=moveNativeWord}] end in (stackLoc, [], cacheCode) end | mapSourceEx(ContainerAddr{container, ...}, _) = let val (newOffset, newContainer) = mapContainerAndStack(container, 0) in (ContainerAddr{container=newContainer, stackOffset=newOffset}, [], []) end fun mapSource(src, kind) = let val (sourceVal, sourceCodePre, sourceCodePost) = mapSourceEx(src, kind) in (sourceVal, sourceCodePost @ sourceCodePre) end (* Force a load of the source into a register if it is on the stack. This is used in cases where a register or literal is allowed but not a memory location. If we do load it we can cache the register. 
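The load goes into a fresh untagged preg and a CopyToCache is added so that later references to the same stack location can reuse the value.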
*) fun mapAndLoad(source as RegisterArgument(PReg r), kind) = let val (sourceVal, sourceCodePre, sourceCodePost) = mapSourceEx(source, kind) in case sourceVal of stack as StackLocation _ => let val newReg = newPReg RegPropUntagged val newCacheReg = newPReg RegPropCacheTagged val _ = setStackCache(r, newCacheReg) in (RegisterArgument newReg, CopyToCache{source=newReg, dest=newCacheReg, kind=moveNativeWord} :: sourceCodePost @ LoadArgument{source=stack, dest=newReg, kind=moveNativeWord} :: sourceCodePre) end | _ => (sourceVal, sourceCodePost @ sourceCodePre) end | mapAndLoad(StackLocation _, _) = raise InternalError "mapAndLoad - already a stack loc" | mapAndLoad(MemoryLocation _, _) = raise InternalError "mapAndLoad - already a mem loc" | mapAndLoad(source, kind) = mapSource(source, kind) fun opSizeToMoveKind OpSize32 = Move32Bit | opSizeToMoveKind OpSize64 = Move64Bit (* Rewrite the code, replacing any registers that need to be pushed with references to the stack. The result is built up in reverse order and then reversed. *) fun pushRegisters({instr=LoadArgument{source, dest=PReg dReg, kind}, ...}, code) = if Vector.sub(pushVec, dReg) then (* We're going to push this. *) let val (sourceVal, sourceCode) = mapSource(source, kind) (* If we have to push the value we don't have to first load it into a register. *) val _ = case Array.sub(pregMap, dReg) of Unset => () | _ => raise InternalError "LoadArgument - already set" val container = newStackLoc 1 val () = Array.update(pregMap, dReg, ToStack(! stackCount, container)) val () = pushItemToStack(NewEntry{pregNo=dReg}) in if targetArch = ObjectId32Bit andalso (case sourceVal of MemoryLocation _ => true | AddressConstant _ => true | _ => false) then let (* Push will always push a 64-bit value. We have to put it in a register first. For MemoryLocations that's because it would push 8 bytes; for AddressConstants that's because we don't have a way of pushing an unsigned 32-bit constant. *) val newReg = newPReg RegPropUntagged in PushValue{arg=RegisterArgument newReg, container=container} :: LoadArgument{source=sourceVal, dest=newReg, kind=movePolyWord} :: sourceCode @ code end else PushValue{arg=sourceVal, container=container} :: sourceCode @ code end else (* We're not going to push this. *) let val (sourceVal, sourceCodePre, sourceCodePost) = mapSourceEx(source, kind) val dKind = Vector.sub(pregProps, dReg) val destReg = case (Array.sub(pregMap, dReg), dKind) of (Unset, _) => let val newReg = newPReg dKind val () = Array.update(pregMap, dReg, ToPReg newReg) in newReg end | (ToPReg preg, RegPropMultiple) => preg | _ => raise InternalError "LoadArgument - multiply defined non-merge reg" (* Can we cache this? . *) val cacheCode = case source of MemoryLocation{base, offset, index, ...} => (* Only cache if we have a fixed offset (not indexed). *) if (case index of NoMemIndex => true | ObjectIndex => true | _ => false) then let (* The cache kind must match the kind of register we're loading. If the value is untagged it must not be marked to be examined by the GC if we allocate anything. The move kind has to be suitable for a register to register move. *) val (cacheType, isTagged) = case dKind of RegPropGeneral => (RegPropCacheTagged, true) (* Generally there's no point in caching a multiply-defined register because it is only used once but allow it in case the other definitions have been optimised out. 
*) | RegPropMultiple => (RegPropCacheTagged, true) | RegPropUntagged => (RegPropCacheUntagged, false) | _ => raise InternalError "cacheKind" val newCacheReg = newPReg cacheType val _ = setMemoryCache(base, offset, index, newCacheReg, isTagged, kind) val moveKind = case kind of Move64Bit => Move64Bit | MoveByte => Move32Bit | Move16Bit => Move32Bit | Move32Bit => Move32Bit | MoveFloat => MoveFloat | MoveDouble => MoveDouble in [CopyToCache{source=destReg, dest=newCacheReg, kind=moveKind}] end else [] | _ => [] val destCode = LoadArgument{source=sourceVal, dest=destReg, kind=kind} in cacheCode @ sourceCodePost @ destCode :: sourceCodePre @ code end | pushRegisters({instr=StoreArgument{source, offset, base, index, kind, isMutable}, ...}, code) = let val (loadedSource, sourceCode) = mapAndLoad(source, kind) (* We can't have a memory-memory store so we have to load the source if it's now on the stack. *) val (baseReg, baseCode) = mapSrcReg(base) val (indexValue, indexCode) = mapIndex(index) (* If we're assigning to a mutable we can no longer rely on the memory cache. Clear it completely in that case although we could be more selective. *) val () = if isMutable then clearMemoryCache() else () in StoreArgument{source=loadedSource, base=baseReg, offset=offset, index=indexValue, kind=kind, isMutable=isMutable} :: indexCode @ baseCode @ sourceCode @ code end | pushRegisters({instr=LoadMemReg { offset, dest, kind}, ...}, code) = let val (destVal, destCode) = mapDestReg dest in destCode @ LoadMemReg { offset=offset, dest=destVal, kind=kind} :: code end | pushRegisters({instr=StoreMemReg { offset, source, kind}, ...}, code) = let val (sourceValue, sourceCode) = mapSrcReg source in StoreMemReg { offset=offset, source=sourceValue, kind=kind} :: sourceCode @ code end | pushRegisters({instr=BeginFunction {regArgs, stackArgs}, ...}, code) = let (* Create a new container list. The offsets begin at -numArgs. *) fun newContainers(src :: srcs, offset) = let val newContainer = mapDestContainer(src, offset) in newContainer :: newContainers(srcs, offset+1) end | newContainers _ = [] val newStackArgs = newContainers(stackArgs, ~ (List.length stackArgs)) (* Push any registers that need to be pushed. *) fun pushReg((preg, rreg), (others, code)) = let val (newReg, newCode) = mapDestReg(preg) in ((newReg, rreg) :: others, newCode @ code) end val (newRegArgs, pushCode) = List.foldl pushReg ([], []) regArgs in pushCode @ BeginFunction {regArgs=newRegArgs, stackArgs=newStackArgs} :: code end | pushRegisters({instr=FunctionCall{callKind, regArgs, stackArgs, dest, realDest, ...}, ...}, code) = let (* It's possible that this could lead to having to spill registers in order to load others. Leave that problem for the moment. *) fun loadStackArg (arg, (otherLoads, otherArgs)) = let val (argVal, loadCode) = mapSource(arg, movePolyWord) in (loadCode @ otherLoads, argVal :: otherArgs) end val (stackArgLoads, newStackArgs) = List.foldr loadStackArg ([], []) stackArgs fun loadRegArg ((arg, reg), (otherLoads, otherArgs)) = let val (argVal, loadCode) = mapSource(arg, movePolyWord) in (loadCode @ otherLoads, (argVal, reg) :: otherArgs) end val (regArgLoads, newRegArgs) = List.foldr loadRegArg ([], []) regArgs val (destVal, destCode) = mapDestReg dest (* Now clear the cache table. *) val () = clearCache() + (* Stack arguments are pushed in addition to anything on the stack. 
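The high-water mark therefore has to allow for the current stack count plus all of the new arguments.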
*) + val () = maxStack := Int.max(!maxStack, !stackCount + List.length newStackArgs) in destCode @ FunctionCall{ callKind=callKind, regArgs=newRegArgs, stackArgs=newStackArgs, dest=destVal, realDest=realDest, saveRegs=[]} :: regArgLoads @ stackArgLoads @ code end | pushRegisters({instr=TailRecursiveCall{callKind, regArgs, stackArgs, stackAdjust, workReg, ...}, ...}, code) = let val newWorkReg = mapWorkReg workReg val newStackOffset = !stackCount fun loadStackArg ({src, stack}, (otherLoads, otherArgs)) = let val (argVal, loadCode) = case mapSource(src, movePolyWord) of (source as StackLocation{wordOffset, ...}, loadCode) => (* If we're leaving it in its old location or we're pushing it above the current top we're ok. We're also ok if we're moving it from a somewhere above the last argument. Otherwise we have to load it. It goes into a normal tagged register which may mean that it could be pushed onto the stack in a subsequent pass. *) if wordOffset = stack+newStackOffset orelse stack+newStackOffset < 0 orelse newStackOffset-wordOffset > ~ stackAdjust then (source, loadCode) else let val preg = newPReg RegPropGeneral in (RegisterArgument preg, LoadArgument{source=source, dest=preg, kind=moveNativeWord} :: loadCode) end | argCode => argCode in (loadCode @ otherLoads, {src=argVal, stack=stack} :: otherArgs) end val (stackArgLoads, newStackArgs) = List.foldr loadStackArg ([], []) stackArgs fun loadRegArg ((arg, reg), (otherLoads, otherArgs)) = let val (argVal, loadCode) = mapSource(arg, movePolyWord) in (loadCode @ otherLoads, (argVal, reg) :: otherArgs) end val (regArgLoads, newRegArgs) = List.foldr loadRegArg ([], []) regArgs + (* Stack arguments replace existing arguments but could grow the stack. *) + val () = maxStack := Int.max(!maxStack, List.length newStackArgs) in TailRecursiveCall{ callKind=callKind, regArgs=newRegArgs, stackArgs=newStackArgs, stackAdjust=stackAdjust, currStackSize=newStackOffset, workReg=newWorkReg} :: regArgLoads @ stackArgLoads @ code end | pushRegisters({instr=AllocateMemoryOperation{size, flags, dest, ...}, ...}, code) = let val (destVal, destCode) = mapDestReg dest in destCode @ AllocateMemoryOperation{size=size, flags=flags, dest=destVal, saveRegs=[]} :: code end | pushRegisters({instr=AllocateMemoryVariable{size, dest, ...}, ...}, code) = let val (sizeVal, sizeCode) = mapSrcReg size val (destVal, destCode) = mapDestReg dest in destCode @ AllocateMemoryVariable{size=sizeVal, dest=destVal, saveRegs=[]} :: sizeCode @ code end | pushRegisters({instr=InitialiseMem{size, addr, init}, ...}, code) = let val (sizeVal, sizeCode) = mapSrcReg size val (addrVal, addrCode) = mapSrcReg addr val (initVal, initCode) = mapSrcReg init in InitialiseMem{size=sizeVal, addr=addrVal, init=initVal} :: initCode @ addrCode @ sizeCode @ code end | pushRegisters({instr=InitialisationComplete, ...}, code) = InitialisationComplete :: code | pushRegisters({instr=BeginLoop, ...}, code) = BeginLoop :: code | pushRegisters({instr=JumpLoop{regArgs, stackArgs, checkInterrupt, workReg}, ...}, code) = let (* Normally JumpLoop will be the last item in a block but it is possible that we've added a reset-stack after it. 
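Loop arguments whose destination preg has been pushed become updates to the container at its current stack offset, and arguments whose destination was never set are dropped since the value is unused.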
*) fun getValues [] = ([], [], []) | getValues ((source, PReg n) :: rest) = let val (otherRegArgs, otherStackArgs, otherCode) = getValues rest in case Array.sub(pregMap, n) of ToPReg lReg => let val (sourceVal, sourceCode) = mapSource(source, movePolyWord) in ((sourceVal, lReg) :: otherRegArgs, otherStackArgs, sourceCode @ otherCode) end | ToStack(stackLoc, stackC as StackLoc{size, ...}) => let val (sourceVal, sourceCode) = mapSource(source, movePolyWord) val stackOff = !stackCount - stackLoc - size in (otherRegArgs, (sourceVal, stackOff, stackC) :: otherStackArgs, sourceCode @ otherCode) end | Unset => (* Drop it. It's never used. This can happen if we are folding a function over a list such that it always returns the last value and then discard the result of the fold. *) (otherRegArgs, otherStackArgs, otherCode) end val (newRegArguments, newStackArgs, sourceCode) = getValues regArgs fun loadStackArg((source, _, destC), (otherLoads, otherArgs)) = let val (sourceVal, sourceCode) = mapSource(source, movePolyWord) val (newOffset, newContainer) = mapContainerAndStack(destC, 0) in (sourceCode @ otherLoads, (sourceVal, newOffset, newContainer) :: otherArgs) end val (stackArgLoads, oldStackArgs) = List.foldr loadStackArg ([], []) stackArgs val check = case checkInterrupt of NONE => NONE | SOME _ => SOME [] (* Map the work reg if it exists already but get a new one if we now have stack args. *) val newWorkReg = case (workReg, newStackArgs) of (SOME r, _) => SOME(mapWorkReg r) | (NONE, []) => NONE | _ => SOME(newPReg RegPropGeneral) in JumpLoop{ regArgs=newRegArguments, stackArgs=oldStackArgs @ newStackArgs, checkInterrupt=check, workReg=newWorkReg} :: sourceCode @ stackArgLoads @ code end | pushRegisters({instr=RaiseExceptionPacket{packetReg}, ...}, code) = let val (packetVal, packetCode) = mapSrcReg packetReg in RaiseExceptionPacket{packetReg=packetVal} :: packetCode @ code end | pushRegisters({instr=ReserveContainer{size, container}, ...}, code) = let val newContainer = mapDestContainer(container, !stackCount) val () = pushItemToStack(OriginalEntry{stackLoc=container}) in ReserveContainer{size=size, container=newContainer} :: code end | pushRegisters({instr=IndexedCaseOperation{testReg, workReg}, ...}, code) = let val (srcVal, srcCode) = mapSrcReg(testReg) val newWorkReg = mapWorkReg workReg in (* This is an unconditional branch. 
*) IndexedCaseOperation{testReg=srcVal, workReg=newWorkReg} :: srcCode @ code end | pushRegisters({instr=LockMutable{addr}, ...}, code) = let val (addrVal, addrCode) = mapSrcReg(addr) in LockMutable{addr=addrVal} :: addrCode @ code end | pushRegisters({instr=WordComparison{arg1, arg2, ccRef, opSize}, ...}, code) = let val (loadedOp1, op1Code) = mapSrcReg arg1 val (op2Val, op2Code) = mapSource(arg2, movePolyWord) in WordComparison{arg1=loadedOp1, arg2=op2Val, ccRef=ccRef, opSize=opSize} :: op2Code @ op1Code @ code end | pushRegisters({instr=CompareLiteral{arg1, arg2, opSize, ccRef}, ...}, code) = let val (op1Val, op1Code) = mapSource(arg1, movePolyWord) in CompareLiteral{arg1=op1Val, arg2=arg2, opSize=opSize, ccRef=ccRef} :: op1Code @ code end | pushRegisters({instr=CompareByteMem{arg1={base, offset, index, ...}, arg2, ccRef}, ...}, code) = let val (baseReg, baseCode) = mapSrcReg base val (indexValue, indexCode) = mapIndex index val newArg1 = {base=baseReg, offset=offset, index=indexValue} in CompareByteMem{arg1=newArg1, arg2=arg2, ccRef=ccRef} :: indexCode @ baseCode @ code end | pushRegisters({instr=PushExceptionHandler{workReg}, ...}, code) = let val newWorkReg = mapWorkReg workReg (* Add a handler entry to the stack. *) val () = pushItemToStack HandlerEntry in PushExceptionHandler{workReg=newWorkReg} :: code end | pushRegisters({instr=PopExceptionHandler{workReg, ...}, ...}, code) = let val newWorkReg = mapWorkReg workReg (* Appears at the end of the block whose exceptions are being handled. Delete the handler and anything above it. *) (* Get the state after removing the handler. *) fun popContext ([], _) = raise InternalError "pushRegisters - pop handler" | popContext (HandlerEntry :: tl, new) = (tl, new-2) | popContext (OriginalEntry{stackLoc=StackLoc{size, ...}, ...} :: tl, new) = popContext(tl, new-size) | popContext (NewEntry _ :: tl, new) = popContext(tl, new-1) val (newStack, nnCount) = popContext(!stack, !stackCount) val () = stack := newStack val oldStackPtr = ! stackCount val () = stackCount := nnCount (* Reset the stack to just above the two words of the handler. *) val resetCode = if oldStackPtr <> nnCount+2 then [ResetStackPtr{numWords=oldStackPtr-nnCount-2, preserveCC=false}] else [] in PopExceptionHandler{workReg=newWorkReg} :: resetCode @ code end | pushRegisters({instr=BeginHandler{packetReg, workReg, ...}, ...}, code) = let (* Clear the cache. This may not be necessary if we are only handling locally generated exceptions but keep it for the moment. *) val () = clearCache() (* Start of a handler. The top active entry should be the handler. 
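It occupies two words, which are removed from the logical stack count here before the exception packet register is mapped.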
*) val () = case !stack of HandlerEntry :: tl => stack := tl | _ => raise InternalError "pushRegisters: BeginHandler" val () = stackCount := !stackCount - 2 val newWorkReg = mapWorkReg workReg val (pktReg, pktCode) = mapDestReg(packetReg) in pktCode @ BeginHandler{packetReg=pktReg, workReg=newWorkReg} :: code end | pushRegisters({instr=ReturnResultFromFunction{resultReg, realReg, numStackArgs}, ...}, code) = let val (resultValue, loadResult) = mapSrcReg resultReg val resetCode = if !stackCount = 0 then [] else [ResetStackPtr{numWords= !stackCount, preserveCC=false}] in ReturnResultFromFunction{resultReg=resultValue, realReg=realReg, numStackArgs=numStackArgs} :: resetCode @ loadResult @ code end | pushRegisters({instr=ArithmeticFunction{oper, resultReg, operand1, operand2, ccRef, opSize}, ...}, code) = let val (loadedOp1, op1Code) = mapSrcReg operand1 val (op2Val, op2Code) = mapSource(operand2, opSizeToMoveKind opSize) val (destVal, destCode) = mapDestReg resultReg in destCode @ ArithmeticFunction{oper=oper, resultReg=destVal, operand1=loadedOp1, operand2=op2Val, ccRef=ccRef, opSize=opSize} :: op2Code @ op1Code @ code end | pushRegisters({instr=TestTagBit{arg, ccRef}, ...}, code) = let val (sourceVal, sourceCode) = mapSource(arg, movePolyWord) in TestTagBit{arg=sourceVal, ccRef=ccRef} :: sourceCode @ code end | pushRegisters({instr=PushValue{arg, container, ...}, ...}, code) = let val (sourceVal, sourceCode) = mapSource(arg, movePolyWord) (* This was a push from a previous pass. Treat as a container of size 1. *) val newContainer = mapDestContainer(container, !stackCount) val () = pushItemToStack(OriginalEntry{stackLoc=container}) in PushValue{arg=sourceVal, container=newContainer} :: sourceCode @ code end | pushRegisters({instr=CopyToCache _, ...}, code) = code (* This was added on a previous pass. Discard it. If we are going to cache this again we'll add new CopyToCache instructions. *) | pushRegisters({instr=ResetStackPtr _, ...}, code) = code (* Added in a previous pass - discard it. *) | pushRegisters({instr=StoreToStack{source, container, field, ...}, ...}, code) = let val (loadedSource, sourceCode) = mapAndLoad(source, movePolyWord) (* We can't have a memory-memory store so we have to load the source if it's now on the stack. *) val (newOffset, newContainer) = mapContainerAndStack(container, field) in StoreToStack{source=loadedSource, container=newContainer, field=field, stackOffset=newOffset} :: sourceCode @ code end | pushRegisters({instr=TagValue{source, dest, isSigned, opSize}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest val _ = setTagCache(dest, isSigned, opSize, sourceVal) in destCode @ TagValue{source=sourceVal, dest=destVal, isSigned=isSigned, opSize=opSize} :: sourceCode @ code end | pushRegisters({instr=UntagValue{source, dest, isSigned, cache, opSize, ...}, ...}, code) = let val (loadedSource, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest (* As with MemoryLocation caching, we can try caching it if this is the first pass but otherwise we can only retain the caching if we have never marked it to be pushed. *) val newCache = case cache of NONE => if firstPass then findCachedTagged(source, isSigned, opSize) else NONE | SOME (PReg c) => if Vector.sub(pushVec, c) then NONE (* We had marked this as to be pushed - we can't use a cache here. 
*) else findCachedTagged(source, isSigned, opSize) in destCode @ UntagValue{source=loadedSource, dest=destVal, isSigned=isSigned, cache=newCache, opSize=opSize} :: sourceCode @ code end | pushRegisters({instr=LoadEffectiveAddress{base, offset, index, dest, opSize}, ...}, code) = let val (baseVal, baseCode) = case base of SOME bReg => let val (newBReg, regCode) = mapSrcReg(bReg) in (SOME newBReg, regCode) end | NONE => (NONE, []) val (indexVal, indexCode) = mapIndex index val (destVal, destCode) = mapDestReg dest in destCode @ LoadEffectiveAddress{base=baseVal, offset=offset, index=indexVal, dest=destVal, opSize=opSize} :: indexCode @ baseCode @ code end | pushRegisters({instr=ShiftOperation{shift, resultReg, operand, shiftAmount, ccRef, opSize}, ...}, code) = let val (opVal, opCode) = mapSrcReg operand val (shiftVal, shiftCode) = mapSource(shiftAmount, opSizeToMoveKind opSize) val (destVal, destCode) = mapDestReg resultReg in destCode @ ShiftOperation{shift=shift, resultReg=destVal, operand=opVal, shiftAmount=shiftVal, ccRef=ccRef, opSize=opSize} :: shiftCode @ opCode @ code end | pushRegisters({instr=Multiplication{resultReg, operand1, operand2, ccRef, opSize}, ...}, code) = let val (op1Val, op1Code) = mapSrcReg operand1 val (op2Val, op2Code) = mapSource(operand2, opSizeToMoveKind opSize) val (destVal, destCode) = mapDestReg resultReg in destCode @ Multiplication{resultReg=destVal, operand1=op1Val, operand2=op2Val, ccRef=ccRef, opSize=opSize} :: op2Code @ op1Code @ code end | pushRegisters({instr=Division{isSigned, dividend, divisor, quotient, remainder, opSize}, ...}, code) = let val (dividendVal, dividendCode) = mapSrcReg dividend val (divisorVal, divisorCode) = mapSource(divisor, opSizeToMoveKind opSize) val (quotVal, quotCode) = mapDestReg quotient val (remVal, remCode) = mapDestReg remainder in remCode @ quotCode @ Division{isSigned=isSigned, dividend=dividendVal, divisor=divisorVal, quotient=quotVal, remainder=remVal, opSize=opSize} :: divisorCode @ dividendCode @ code end | pushRegisters({instr=AtomicExchangeAndAdd{base, source, resultReg}, ...}, code) = let val (baseVal, baseCode) = mapSrcReg(base) val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg resultReg in destCode @ AtomicExchangeAndAdd{base=baseVal, source=sourceVal, resultReg=destVal} :: sourceCode @ baseCode @ code end | pushRegisters({instr=AtomicExchange{base, source, resultReg}, ...}, code) = let val (baseVal, baseCode) = mapSrcReg(base) val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg resultReg in destCode @ AtomicExchange{base=baseVal, source=sourceVal, resultReg=destVal} :: sourceCode @ baseCode @ code end | pushRegisters({instr=AtomicCompareAndExchange{base, compare, toStore, resultReg, ccRef}, ...}, code) = let val (baseVal, baseCode) = mapSrcReg(base) val (compareVal, compareCode) = mapSrcReg compare val (toStoreVal, toStoreCode) = mapSrcReg toStore val (destVal, destCode) = mapDestReg resultReg in destCode @ AtomicCompareAndExchange{base=baseVal, compare=compareVal, toStore=toStoreVal, resultReg=destVal, ccRef=ccRef} :: toStoreCode @ compareCode @ baseCode @ code end | pushRegisters({instr=BoxValue{boxKind, source, dest as PReg dReg, ...}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest (* We can cache the boxed value except if this is an X87 box. We can't cache X87 values because there's effectively only one register and this box instruction uses FSTP (store and POP). 
*) val cacheCode = if Vector.sub(pushVec, dReg) orelse boxKind = BoxX87Double orelse boxKind = BoxX87Float then [] else let val newCacheReg = newPReg RegPropCacheUntagged val moveKind = case boxKind of BoxLargeWord => moveNativeWord | BoxX87Double => MoveDouble | BoxX87Float => MoveFloat | BoxSSE2Double => MoveDouble | BoxSSE2Float => MoveFloat val indexKind = case targetArch of ObjectId32Bit => ObjectIndex | _ => NoMemIndex (* The value we're putting in the cache is untagged. *) val _ = setMemoryCache(dest, 0, indexKind, newCacheReg, false, moveKind) in [CopyToCache{source=sourceVal, dest=newCacheReg, kind=moveKind}] end in cacheCode @ destCode @ BoxValue{boxKind=boxKind, source=sourceVal, dest=destVal, saveRegs=[]} :: sourceCode @ code end | pushRegisters({instr=CompareByteVectors{vec1Addr, vec2Addr, length, ccRef}, ...}, code) = let val (vec1Val, vec1Code) = mapSrcReg vec1Addr val (vec2Val, vec2Code) = mapSrcReg vec2Addr val (lengthVal, lengthCode) = mapSrcReg length in CompareByteVectors{vec1Addr=vec1Val, vec2Addr=vec2Val, length=lengthVal, ccRef=ccRef} :: lengthCode @ vec2Code @ vec1Code @ code end | pushRegisters({instr=BlockMove{srcAddr, destAddr, length, isByteMove}, ...}, code) = let val (srcVal, srcCode) = mapSrcReg srcAddr val (destVal, destCode) = mapSrcReg destAddr val (lengthVal, lengthCode) = mapSrcReg length (* For safety clear the memory cache here. That may not be necessary. *) val () = clearMemoryCache() in BlockMove{srcAddr=srcVal, destAddr=destVal, length=lengthVal, isByteMove=isByteMove} :: lengthCode @ destCode @ srcCode @ code end | pushRegisters({instr=X87Compare{arg1, arg2, isDouble, ccRef}, ...}, code) = let val (arg1Val, arg1Code) = mapSrcReg arg1 val (arg2Val, arg2Code) = mapSource(arg2, if isDouble then Move64Bit else Move32Bit) in X87Compare{arg1=arg1Val, arg2=arg2Val, isDouble=isDouble, ccRef=ccRef} :: arg2Code @ arg1Code @ code end | pushRegisters({instr=SSE2Compare{arg1, arg2, isDouble, ccRef}, ...}, code) = let val (arg1Val, arg1Code) = mapSrcReg arg1 val (arg2Val, arg2Code) = mapSource(arg2, if isDouble then Move64Bit else Move32Bit) in SSE2Compare{arg1=arg1Val, arg2=arg2Val, ccRef=ccRef, isDouble=isDouble} :: arg2Code @ arg1Code @ code end | pushRegisters({instr=X87FPGetCondition{dest, ccRef}, ...}, code) = let val (destVal, destCode) = mapDestReg dest in destCode @ X87FPGetCondition{dest=destVal, ccRef=ccRef} :: code end | pushRegisters({instr=X87FPArith{opc, resultReg, arg1, arg2, isDouble}, ...}, code) = let val (arg1Val, arg1Code) = mapSrcReg arg1 val (arg2Val, arg2Code) = mapSource(arg2, if isDouble then Move64Bit else Move32Bit) val (destVal, destCode) = mapDestReg resultReg in destCode @ X87FPArith{opc=opc, resultReg=destVal, arg1=arg1Val, arg2=arg2Val, isDouble=isDouble} :: arg2Code @ arg1Code @ code end | pushRegisters({instr=X87FPUnaryOps{fpOp, dest, source}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ X87FPUnaryOps{fpOp=fpOp, dest=destVal, source=sourceVal} :: sourceCode @ code end | pushRegisters({instr=X87Float{dest, source}, ...}, code) = let val (sourceVal, sourceCode) = mapSource(source, movePolyWord) val (destVal, destCode) = mapDestReg dest in destCode @ X87Float{dest=destVal, source=sourceVal} :: sourceCode @ code end | pushRegisters({instr=SSE2IntToReal{dest, source, isDouble}, ...}, code) = let val (sourceVal, sourceCode) = mapSource(source, movePolyWord) val (destVal, destCode) = mapDestReg dest in destCode @ SSE2IntToReal{dest=destVal, source=sourceVal, 
isDouble=isDouble} :: sourceCode @ code end | pushRegisters({instr=SSE2FPUnary{opc, resultReg, source}, ...}, code) = let val (argVal, argCode) = mapSource(source, case opc of SSE2UDoubleToFloat => Move64Bit | SSE2UFloatToDouble => Move32Bit) val (destVal, destCode) = mapDestReg resultReg in destCode @ SSE2FPUnary{opc=opc, resultReg=destVal, source=argVal} :: argCode @ code end | pushRegisters({instr=SSE2FPBinary{opc, resultReg, arg1, arg2}, ...}, code) = let val argMove = case opc of SSE2BAddDouble => Move64Bit | SSE2BSubDouble => Move64Bit | SSE2BMulDouble => Move64Bit | SSE2BDivDouble => Move64Bit | SSE2BXor => Move64Bit (* Actually 128 bit but always in a reg. *) | SSE2BAnd => Move64Bit | SSE2BAddSingle => Move32Bit | SSE2BSubSingle => Move32Bit | SSE2BMulSingle => Move32Bit | SSE2BDivSingle => Move32Bit val (arg1Val, arg1Code) = mapSrcReg arg1 val (arg2Val, arg2Code) = mapSource(arg2, argMove) val (destVal, destCode) = mapDestReg resultReg in destCode @ SSE2FPBinary{opc=opc, resultReg=destVal, arg1=arg1Val, arg2=arg2Val} :: arg2Code @ arg1Code @ code end | pushRegisters({instr=TagFloat{source, dest}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest val _ = setFloatCache(dest, sourceVal) in destCode @ TagFloat{source=sourceVal, dest=destVal} :: sourceCode @ code end | pushRegisters({instr=UntagFloat{source as RegisterArgument srcReg, dest, cache, ...}, ...}, code) = let val (sourceVal, sourceCode) = mapSource(source, Move32Bit) val (destVal, destCode) = mapDestReg dest (* As with MemoryLocation caching, we can try caching it if this is the first pass but otherwise we can only retain the caching if we have never marked it to be pushed. *) val newCache = case cache of NONE => if firstPass then findCachedFloat srcReg else NONE | SOME (PReg c) => if Vector.sub(pushVec, c) then NONE (* We had marked this as to be pushed - we can't use a cache here. *) else findCachedFloat srcReg in destCode @ UntagFloat{source=sourceVal, dest=destVal, cache=newCache} :: sourceCode @ code end | pushRegisters({instr=UntagFloat{source, dest, ...}, ...}, code) = (* This may also be a memory location in which case we don't cache. 
*) let val (sourceVal, sourceCode) = mapSource(source, Move32Bit) val (destVal, destCode) = mapDestReg dest in destCode @ UntagFloat{source=sourceVal, dest=destVal, cache=NONE} :: sourceCode @ code end | pushRegisters({instr=GetSSE2ControlReg{dest}, ...}, code) = let val (destVal, destCode) = mapDestReg dest in destCode @ GetSSE2ControlReg{dest=destVal} :: code end | pushRegisters({instr=SetSSE2ControlReg{source}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source in SetSSE2ControlReg{source=sourceVal} :: sourceCode @ code end | pushRegisters({instr=GetX87ControlReg{dest}, ...}, code) = let val (destVal, destCode) = mapDestReg dest in destCode @ GetX87ControlReg{dest=destVal} :: code end | pushRegisters({instr=SetX87ControlReg{source}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source in SetX87ControlReg{source=sourceVal} :: sourceCode @ code end | pushRegisters({instr=X87RealToInt{source, dest}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ X87RealToInt{source=sourceVal, dest=destVal} :: sourceCode @ code end | pushRegisters({instr=SSE2RealToInt{source, dest, isDouble, isTruncate}, ...}, code) = let val (srcVal, sourceCode) = mapSource(source, if isDouble then Move64Bit else Move32Bit) val (destVal, destCode) = mapDestReg dest in destCode @ SSE2RealToInt{source=srcVal, dest=destVal, isDouble=isDouble, isTruncate=isTruncate} :: sourceCode @ code end | pushRegisters({instr=SignExtend32To64{source, dest}, ...}, code) = let val (sourceVal, sourceCode) = mapSource(source, Move32Bit) val (destVal, destCode) = mapDestReg dest in destCode @ SignExtend32To64{source=sourceVal, dest=destVal} :: sourceCode @ code end | pushRegisters({instr=TouchArgument{source}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source in TouchArgument{source=sourceVal} :: sourceCode @ code end | pushRegisters({instr=PauseCPU, ...}, code) = PauseCPU :: code (* Find the common cache state. *) val () = setCommonCacheState(List.map (#cache o asub blockOutput) (asub blockRefs blockNo)) local fun doPush(instr as {kill, ...}, code) = let val newCode = pushRegisters(instr, code) (* Can we pop the stack? *) val stackReset = case setToList (minus(kill, loopRegs)) of [] => [] | killList => let (* See if any of the kill items are at the top of the stack. If they are we can pop them and perhaps items we've previously marked for deletion but not been able to pop. *) val oldStack = !stackCount fun checkAndAdd(r, output) = case Array.sub(pregMap, r) of ToStack(stackLoc, StackLoc{size, ...}) => if stackLoc < 0 then r :: output (* We can have arguments and return address. *) else if !stackCount = stackLoc+size then ( stack := tl (!stack); stackCount := stackLoc; output ) else r :: output | _ => r :: output val toAdd = List.foldl checkAndAdd [] killList fun reprocess list = let val prevStack = !stackCount val outlist = List.foldl checkAndAdd [] list in if !stackCount = prevStack then list else reprocess outlist end val () = if !stackCount = oldStack then deletedItems := toAdd @ !deletedItems else deletedItems := reprocess(toAdd @ !deletedItems) val _ = oldStack >= !stackCount orelse raise InternalError "negative stack offset" in if !stackCount = oldStack then [] else [ResetStackPtr{numWords=oldStack - !stackCount, preserveCC=true (* In case*)}] end in stackReset @ newCode end in val codeResult = List.foldl doPush [] block val outputCount = ! 
stackCount val results = {code=codeResult, cache=getCache(), stackCount= outputCount} val stateResult = { stackCount= outputCount, stack= !stack } val () = Array.update(blockOutput, blockNo, results) end val addSet = case flow of ExitCode => [] | IndexedBr cases => cases | Unconditional dest => [dest] | Conditional {trueJump, falseJump, ...} => [falseJump, trueJump] | SetHandler { handler, continue } => [handler, continue] | UnconditionalHandle _ => [] | ConditionalHandle { continue, ...} => [continue] val addItems = List.map(fn m => (m, stateResult)) addSet in processBlocks(addItems @ stillToDo) end in val () = processBlocks([(0, {stack=[], stackCount=0})]) end (* Put together the result code and blocks. *) local fun createBlock blockNo = (* Skip unreferenced blocks apart from block 0. *) if blockNo <> 0 andalso null (asub blockRefs blockNo) then BasicBlock{block=[], flow=ExitCode} else let val ExtendedBasicBlock{ flow, ...} = vsub code blockNo val {code=codeResult, stackCount=outputCount, ...} = asub blockOutput blockNo (* Process the successor. If we need a stack adjustment this will require an adjustment block. TODO: We could put a pre-adjustment if we only have one branch to this block. *) fun matchStacks targetBlock = let (* Process the destination. If it hasn't been processed. *) val {expectedInput, ...} = valOf (asub inputStackSizes targetBlock) in if expectedInput = outputCount then targetBlock else let val _ = outputCount > expectedInput orelse raise InternalError "adjustStack" val adjustCode = [ResetStackPtr{numWords=outputCount-expectedInput, preserveCC=true (* For the moment *)}] val newBlock = BasicBlock{block=adjustCode, flow=Unconditional targetBlock} val newBlockNo = !blockCounter before blockCounter := !blockCounter+1 val () = extraBlocks := newBlock :: !extraBlocks in newBlockNo end end val (finalCode, newFlow) = case flow of ExitCode => (codeResult, ExitCode) | Unconditional m => let (* Process the block. Since we're making an unconditional jump we can include any stack adjustment needed to match the destination in here. In particular this includes loops. *) val {expectedInput, reqCC} = valOf (asub inputStackSizes m) val _ = outputCount >= expectedInput orelse raise InternalError "negative reset" val resultCode = if expectedInput = outputCount then codeResult else ResetStackPtr{numWords=outputCount-expectedInput, preserveCC=reqCC} :: codeResult in (resultCode, Unconditional m) end (* For any of these, if we need to adjust the stack we have to add an adjustment block. *) | Conditional {trueJump, falseJump, ccRef, condition} => (codeResult, Conditional{trueJump=matchStacks trueJump, falseJump=matchStacks falseJump, ccRef=ccRef, condition=condition}) | SetHandler{ handler, continue } => (codeResult, SetHandler{ handler=matchStacks handler, continue=matchStacks continue}) | IndexedBr cases => (codeResult, IndexedBr(map matchStacks cases)) | u as UnconditionalHandle _ => (codeResult, u) | c as ConditionalHandle{ continue, ... } => let (* As for unconditional branch *) val {expectedInput, reqCC} = valOf (asub inputStackSizes continue) val _ = outputCount >= expectedInput orelse raise InternalError "negative reset" val resultCode = if expectedInput = outputCount then codeResult else ResetStackPtr{numWords=outputCount-expectedInput, preserveCC=reqCC} :: codeResult in (resultCode, c) end in BasicBlock{block=List.rev finalCode, flow=newFlow} end in val resultBlocks = Vector.tabulate(numberOfBlocks, createBlock) end (* Add any extra blocks to the result. 
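   These are the stack-adjustment blocks created by matchStacks; they are appended after the blocks generated from the original code.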
    *)
    val finalResult =
        case !extraBlocks of
            [] => resultBlocks
        |   blocks => Vector.concat[resultBlocks, Vector.fromList(List.rev blocks)]

    val pregProperties = Vector.fromList(List.rev(! pregPropList))
in
    {code=finalResult, pregProps=pregProperties, maxStack= !maxStack}
end

structure Sharing =
struct
    type x86ICode = x86ICode
    and preg = preg
    and intSet = intSet
    and extendedBasicBlock = extendedBasicBlock
    and basicBlock = basicBlock
    and regProperty = regProperty
end

end;