diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ASSEMBLY.sml b/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ASSEMBLY.sml index 80b03800..631de923 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ASSEMBLY.sml +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ASSEMBLY.sml @@ -1,2124 +1,2164 @@ (* Copyright (c) 2021 David C. J. Matthews This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public Licence version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public Licence for more details. You should have received a copy of the GNU Lesser General Public Licence along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor ARM64ASSEMBLY ( structure Debug: DEBUG and Pretty: PRETTYSIG and CodeArray: CODEARRAYSIG ) : Arm64Assembly = struct open CodeArray Address val is32in64 = Address.wordSize = 0w4 val wordsPerNativeWord: word = Address.nativeWordSize div Address.wordSize exception InternalError = Misc.InternalError infix 5 << <<+ <<- >> >>+ >>- ~>> ~>>+ ~>>- (* Shift operators *) infix 3 andb orb xorb andbL orbL xorbL andb8 orb8 xorb8 val op << = Word32.<< and op >> = Word32.>> and op ~>> = Word32.~>> and op andb = Word32.andb and op orb = Word32.orb val word32ToWord8 = Word8.fromLargeWord o Word32.toLargeWord and word8ToWord32 = Word32.fromLargeWord o Word8.toLargeWord and word32ToWord = Word.fromLargeWord o Word32.toLargeWord and wordToWord32 = Word32.fromLargeWord o Word.toLargeWord and word8ToWord = Word.fromLargeWord o Word8.toLargeWord (* XReg is used for fixed point registers since X0 and W0 are the same register. *) datatype xReg = XReg of Word8.word | XZero | XSP (* VReg is used for the floating point registers since V0, D0 and S0 are the same register. *) and vReg = VReg of Word8.word (* A Label is a ref that is later set to the location. Several labels can be linked together so that they are only set at a single point. Only forward jumps are linked so when we come to finally set the label we will have the full list. *) type labels = Word.word ref list ref (* Condition codes. The encoding is standard. *) datatype condition = CCode of Word8.word val condEqual = CCode 0wx0 (* Z=1 *) and condNotEqual = CCode 0wx1 (* Z=0 *) and condCarrySet = CCode 0wx2 (* C=1 *) and condCarryClear = CCode 0wx3 (* C=0 *) and condNegative = CCode 0wx4 (* N=1 *) and condPositive = CCode 0wx5 (* N=0 imcludes zero *) and condOverflow = CCode 0wx6 (* V=1 *) and condNoOverflow = CCode 0wx7 (* V=0 *) and condUnsignedHigher = CCode 0wx8 (* C=1 && Z=0 *) and condUnsignedLowOrEq = CCode 0wx9 (* ! (C=1 && Z=0) *) and condSignedGreaterEq = CCode 0wxa (* N=V *) and condSignedLess = CCode 0wxb (* N<>V *) and condSignedGreater = CCode 0wxc (* Z==0 && N=V *) and condSignedLessEq = CCode 0wxd (* !(Z==0 && N=V) *) (* use unconditional branches for the "always" cases. *) (* N.B. On subtraction and comparison the ARM uses an inverted carry flag for borrow. The C flag is set if there is NO borrow. This is the reverse of the X86. *) (* Offsets in the assembly code interface pointed at by X26 These are in units of 64-bits NOT bytes. 
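For example (illustrative, not part of this change), stackLimitOffset = 6 corresponds to byte offset 48 from X26; because the scaled 64-bit load multiplies its unit offset by 8, an access such as loadRegScaled{regT=X9, regN=X_MLAssemblyInt, unitOffset=stackLimitOffset} assembles to ldr x9,[x26,#48].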
*) val heapOverflowCallOffset = 1 and stackOverflowCallOffset = 2 and stackOverflowXCallOffset= 3 and exceptionHandlerOffset = 5 and stackLimitOffset = 6 and exceptionPacketOffset = 7 and threadIdOffset = 8 and heapLimitPtrOffset = 42 and heapAllocPtrOffset = 43 and mlStackPtrOffset = 44 (* 31 in the register field can either mean the zero register or the hardware stack pointer. Which meaning depends on the instruction. *) fun xRegOrXZ(XReg w) = w | xRegOrXZ XZero = 0w31 | xRegOrXZ XSP = raise InternalError "XSP not valid here" and xRegOrXSP(XReg w) = w | xRegOrXSP XZero = raise InternalError "XZero not valid here" | xRegOrXSP XSP = 0w31 (* There are cases where it isn't clear. *) and xRegOnly (XReg w) = w | xRegOnly XZero = raise InternalError "XZero not valid here" | xRegOnly XSP = raise InternalError "XSP not valid here" val X0 = XReg 0w0 and X1 = XReg 0w1 and X2 = XReg 0w2 and X3 = XReg 0w3 and X4 = XReg 0w4 and X5 = XReg 0w5 and X6 = XReg 0w6 and X7 = XReg 0w7 and X8 = XReg 0w8 and X9 = XReg 0w9 and X10= XReg 0w10 and X11 = XReg 0w11 and X12 = XReg 0w12 and X13 = XReg 0w13 and X14= XReg 0w14 and X15 = XReg 0w15 and X16 = XReg 0w16 and X17 = XReg 0w17 and X18= XReg 0w18 and X19 = XReg 0w19 and X20 = XReg 0w20 and X21 = XReg 0w21 and X22= XReg 0w22 and X23 = XReg 0w23 and X24 = XReg 0w24 and X25 = XReg 0w25 and X26= XReg 0w26 and X27 = XReg 0w27 and X28 = XReg 0w28 and X29 = XReg 0w29 and X30= XReg 0w30 val X_MLHeapLimit = X25 (* ML Heap limit pointer *) and X_MLAssemblyInt = X26 (* ML assembly interface pointer. *) and X_MLHeapAllocPtr = X27 (* ML Heap allocation pointer. *) and X_MLStackPtr = X28 (* ML Stack pointer. *) and X_LinkReg = X30 (* Link reg - return address *) and X_Base32in64 = X24 (* X24 is used for the heap base in 32-in-64. *) fun vReg(VReg v) = v (* Only the first eight registers are currently used by ML. *) val V0 = VReg 0w0 and V1 = VReg 0w1 and V2 = VReg 0w2 and V3 = VReg 0w3 and V4 = VReg 0w4 and V5 = VReg 0w5 and V6 = VReg 0w6 and V7 = VReg 0w7 (* Some data instructions include a possible shift. *) datatype shiftType = ShiftLSL of Word8.word | ShiftLSR of Word8.word | ShiftASR of Word8.word | ShiftNone local fun checkImm6 w = if w > 0w63 then raise InternalError "shift > 63" else w in fun shiftEncode(ShiftLSL w) = (0w0, checkImm6 w) | shiftEncode(ShiftLSR w) = (0w1, checkImm6 w) | shiftEncode(ShiftASR w) = (0w2, checkImm6 w) | shiftEncode ShiftNone = (0w0, 0w0) end (* Other instructions include an extension i.e. a sign- or zero-extended value from one of the argument registers. When an extension is encoded there can also be a left shift which applies after the extension. I don't understand what difference, if any, there is between UXTX and SXTX. There's no ExtNone because we need to use either UXTW or UXTX depending on the length *) datatype 'a extend = ExtUXTB of 'a (* Unsigned extend byte *) | ExtUXTH of 'a (* Unsigned extend byte *) | ExtUXTW of 'a (* Unsigned extend byte *) | ExtUXTX of 'a (* Left shift *) | ExtSXTB of 'a (* Sign extend byte *) | ExtSXTH of 'a (* Sign extend halfword *) | ExtSXTW of 'a (* Sign extend word *) | ExtSXTX of 'a (* Left shift *) (* Load/store instructions have only a single bit for the shift. For byte operations this is one bit shift; for others it scales by the size of the operand if set. *) datatype scale = ScaleOrShift | NoScale local (* Although there are three bits it seems that the shift is limited to 0 to 4. 
*) fun checkImm3 w = if w > 0w4 then raise InternalError "extend shift > 4" else w in fun extendArithEncode(ExtUXTB w) = (0w0, checkImm3 w) | extendArithEncode(ExtUXTH w) = (0w1, checkImm3 w) | extendArithEncode(ExtUXTW w) = (0w2, checkImm3 w) | extendArithEncode(ExtUXTX w) = (0w3, checkImm3 w) | extendArithEncode(ExtSXTB w) = (0w4, checkImm3 w) | extendArithEncode(ExtSXTH w) = (0w5, checkImm3 w) | extendArithEncode(ExtSXTW w) = (0w6, checkImm3 w) | extendArithEncode(ExtSXTX w) = (0w7, checkImm3 w) fun extendLSEncode(ExtUXTB v) = (0w0, v) | extendLSEncode(ExtUXTH v) = (0w1, v) | extendLSEncode(ExtUXTW v) = (0w2, v) | extendLSEncode(ExtUXTX v) = (0w3, v) | extendLSEncode(ExtSXTB v) = (0w4, v) | extendLSEncode(ExtSXTH v) = (0w5, v) | extendLSEncode(ExtSXTW v) = (0w6, v) | extendLSEncode(ExtSXTX v) = (0w7, v) end datatype wordSize = WordSize32 | WordSize64 (* Bit patterns on the ARM64 are encoded using a complicated scheme and only certain values can be encoded. An element can be 2, 4, 8, 16, 32 or 64 bits and must be a sequence of at least one zero bits followed by at least one one bit. This sequence can then be rotated within the element. Finally the element is replicated within the register up to 32 or 64 bits. All this information is encoded in 13 bits. N.B. Bit patterns of all zeros or all ones cannot be encoded. *) (* Encode the value if it is possible. *) fun encodeBitPattern(value, sf (* size flag *)) = (* Can't encode 0 or all ones. *) if value = 0w0 orelse value = Word64.notb 0w0 then NONE (* If this is 32-bits we can't encode all ones in the low-order 32-bits or any value that won't fit in 32-bits, *) else if sf = WordSize32 andalso value >= 0wxffffffff then NONE else let val regSize = case sf of WordSize32 => 0w32 | WordSize64 => 0w64 (* Get the element size. Look for the repeat of the pattern. *) fun getElemSize size = let val ns = size div 0w2 val mask = Word64.<<(0w1, ns) - 0w1 in if Word64.andb(value, mask) <> Word64.andb(Word64.>>(value, ns), mask) then size else if ns <= 0w2 then ns else getElemSize ns end val elemSize = getElemSize regSize fun log2 0w1 = 0w0 | log2 n = 0w1 + log2(Word.>>(n, 0w1)) val elemBits = log2 elemSize (* Find the rotation that puts as many of the zero bits in the element at the top. *) val elemMask = Word64.>>(Word64.notb 0w0, 0w64-elemSize) fun ror elt = Word64.orb((Word64.<<(Word64.andb(elt, 0w1), elemSize-0w1), Word64.>>(elt, 0w1))) and rol elt = Word64.orb(Word64.andb(elemMask, Word64.<<(elt, 0w1)), Word64.>>(elt, elemSize-0w1)) fun findRotation(v, n) = if ror v < v then findRotation(ror v, (n-0w1) mod elemSize) else if rol v < v then findRotation(rol v, n+0w1) else (v, n) val (rotated, rotation) = findRotation(Word64.andb(value, elemMask), 0w0) (* Count out the low order ones. If the result is zero then we;ve got a valid sequence of zeros followed by ones but if we discover a zero bit and the result isn't zero then we can't encode this. *) fun countLowOrderOnes(v, n) = if v = 0w0 then SOME n else if Word64.andb(v, 0w1) = 0w1 then countLowOrderOnes(Word64.>>(v, 0w1), n+0w1) else NONE in case countLowOrderOnes(rotated, 0w0) of NONE => NONE | SOME lowOrderOnes => let (* Encode the element size. *) val elemSizeEnc = 0wx7f - (Word.<<(0w1, elemBits+0w1) - 0w1) val n = if Word.andb(elemSizeEnc, 0wx40) = 0w0 then 0w1 else 0w0 val imms = Word.andb(Word.orb(elemSizeEnc, lowOrderOnes-0w1), 0wx3f) in SOME{n=n, imms=imms, immr=rotation} end end; (* Decode a pattern for printing. 
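As a worked example of the scheme described above: 0wx00FF00FF00FF00FF repeats a 16-bit element whose low eight bits are set and needs no rotation, so encodeBitPattern(0wx00FF00FF00FF00FF, WordSize64) yields SOME{n=0w0, immr=0w0, imms=0wx27}, and decodeBitPattern{sf=0w1, n=0w0, immr=0w0, imms=0wx27} reconstructs the original value.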
*) fun decodeBitPattern{sf, n, immr, imms} = let (* Find the highest bit set in N:NOT(imms) *) fun highestBitSet 0w0 = 0 | highestBitSet n = 1+highestBitSet(Word32.>>(n, 0w1)) val len = highestBitSet(Word32.orb(Word32.<<(n, 0w6), Word32.xorb(imms, 0wx3f))) - 1 val _ = if len < 0 then raise InternalError "decodeBitPattern: invalid" else () val size = Word32.<<(0w1, Word.fromInt len) val r = Word32.andb(immr, size-0w1) and s = Word32.andb(imms, size-0w1) val _ = if s = size-0w1 then raise InternalError "decodeBitPattern: invalid" else () val pattern = Word64.<<(0w1, word32ToWord(s+0w1)) - 0w1 (* Rotate right: shift left and put the top bit in the high order bit*) fun ror elt = Word64.orb((Word64.<<(Word64.andb(elt, 0w1), word32ToWord(size-0w1)), Word64.>>(elt, 0w1))) fun rotateBits(value, 0w0) = value | rotateBits(value, n) = rotateBits(ror value, n-0w1) val rotated = rotateBits(pattern, r) val regSize = if sf = 0w0 then 0w32 else 0w64 (* Replicate the rotated pattern to fill the register. *) fun replicate(pattern, size) = if size >= regSize then pattern else replicate(Word64.orb(pattern, Word64.<<(pattern, word32ToWord size)), size * 0w2) in replicate(rotated, size) end val isEncodableBitPattern = isSome o encodeBitPattern datatype instr = SimpleInstr of Word32.word | LoadAddressLiteral of {reg: xReg, value: machineWord} | LoadNonAddressLiteral of {reg: xReg, value: Word64.word} | Label of labels | UnconditionalBranch of labels | ConditionalBranch of { label: labels, jumpCondition: condition, length: brLength ref } | LoadLabelAddress of { label: labels, reg: xReg } | TestBitBranch of { label: labels, bitNo: Word8.word, brNonZero: bool, reg: xReg, length: brLength ref } | CompareBranch of { label: labels, brNonZero: bool, size: wordSize, reg: xReg, length: brLength ref } and brLength = BrShort | BrExtended val nopCode = 0wxD503201F (* Add/subtract an optionally shifted 12-bit immediate (i.e. constant) to/from a register. The constant is zero-extended. The versions that do not set the flags can use XSP as the destination; the versions that use the signs can use XZero as the destination i.e. they discard the result and act as a comparison. *) local fun addSubRegImmediate(sf, oper, s, xdOp) ({regN, regD, immed, shifted}) = let val () = if immed >= 0wx1000 then raise InternalError "addSubRegImmediate: immed > 12 bits" else () in SimpleInstr( 0wx11000000 orb (sf << 0w31) orb (oper << 0w30) orb (s << 0w29) orb (if shifted then 0wx400000 else 0w0) orb (wordToWord32 immed << 0w10) orb (word8ToWord32(xRegOrXSP regN) << 0w5) orb word8ToWord32(xdOp regD)) end in val addImmediate = addSubRegImmediate(0w1, 0w0, 0w0, xRegOrXSP) and addSImmediate = addSubRegImmediate(0w1, 0w0, 0w1, xRegOrXZ) and subImmediate = addSubRegImmediate(0w1, 0w1, 0w0, xRegOrXSP) and subSImmediate = addSubRegImmediate(0w1, 0w1, 0w1, xRegOrXZ) end (* Add/subtract a shifted register, optionally setting the flags. 
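In particular a comparison is just a subtraction that discards its result: subSShiftedReg{regM=X2, regN=X1, regD=XZero, shift=ShiftNone} below is cmp x1,x2, and subSImmediate{regN=X1, regD=XZero, immed=0w1, shifted=false} above is cmp x1,#1.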
*) local (* X31 is XZ here unlike the extended version.*) fun addSubtractShiftedReg (sf, oper, s) ({regM, regN, regD, shift}) = let val (shift, imm6) = shiftEncode shift in SimpleInstr(0wx0b000000 orb (sf << 0w31) orb (oper << 0w30) orb (s << 0w29) orb (shift << 0w22) orb (word8ToWord32(xRegOnly regM) << 0w16) orb (word8ToWord32 imm6 << 0w10) orb (word8ToWord32(xRegOrXZ regN) << 0w5) orb word8ToWord32(xRegOrXZ regD)) end in val addShiftedReg = addSubtractShiftedReg(0w1, 0w0, 0w0) and addSShiftedReg = addSubtractShiftedReg(0w1, 0w0, 0w1) and subShiftedReg = addSubtractShiftedReg(0w1, 0w1, 0w0) and subSShiftedReg = addSubtractShiftedReg(0w1, 0w1, 0w1) and addShiftedReg32 = addSubtractShiftedReg(0w0, 0w0, 0w0) and addSShiftedReg32 = addSubtractShiftedReg(0w0, 0w0, 0w1) and subShiftedReg32 = addSubtractShiftedReg(0w0, 0w1, 0w0) and subSShiftedReg32 = addSubtractShiftedReg(0w0, 0w1, 0w1) end (* Add/subtract an extended register, optionally setting the flags. *) local (* SP can be used as Xn and also for Xd for the non-flags versions. *) fun addSubtractExtendedReg (sf, oper, s, opt, xD) ({regM, regN, regD, extend}) = let val (option, imm3) = extendArithEncode extend in SimpleInstr(0wx0b200000 orb (sf << 0w31) orb (oper << 0w30) orb (s << 0w29) orb (opt << 0w22) orb (word8ToWord32(xRegOnly regM) << 0w16) orb (option << 0w13) orb (word8ToWord32 imm3 << 0w10) orb (word8ToWord32(xRegOrXSP regN) << 0w5) orb word8ToWord32(xD regD)) end in val addExtendedReg = addSubtractExtendedReg(0w1, 0w0, 0w0, 0w0, xRegOrXSP) and addSExtendedReg = addSubtractExtendedReg(0w1, 0w0, 0w1, 0w0, xRegOrXZ) and subExtendedReg = addSubtractExtendedReg(0w1, 0w1, 0w0, 0w0, xRegOrXSP) and subSExtendedReg = addSubtractExtendedReg(0w1, 0w1, 0w1, 0w0, xRegOrXZ) end (* Logical operations on a shifted register. *) local fun logicalShiftedReg (sf, oper, n) ({regM, regN, regD, shift}) = let val (shift, imm6) = shiftEncode shift in SimpleInstr(0wx0a000000 orb (sf << 0w31) orb (oper << 0w29) orb (shift << 0w22) orb (n << 0w21) orb (word8ToWord32(xRegOrXZ regM) << 0w16) orb (word8ToWord32 imm6 << 0w10) orb (word8ToWord32(xRegOrXZ regN) << 0w5) orb word8ToWord32(xRegOrXZ regD)) end in val andShiftedReg = logicalShiftedReg(0w1, 0w0, 0w0) and orrShiftedReg = logicalShiftedReg(0w1, 0w1, 0w0) and eorShiftedReg = logicalShiftedReg(0w1, 0w2, 0w0) and andsShiftedReg = logicalShiftedReg(0w1, 0w3, 0w0) (* There are also versions that operate with an inverted version of the argument. *) end (* Two-source operations. *) local fun twoSourceInstr (sf, s, opcode) ({regM, regN, regD}) = SimpleInstr(0wx1ac00000 orb (sf << 0w31) orb (s << 0w29) orb (word8ToWord32(xRegOnly regM) << 0w16) orb (opcode << 0w10) orb (word8ToWord32(xRegOnly regN) << 0w5) orb word8ToWord32(xRegOnly regD)) in (* Signed and unsigned division. *) val unsignedDivide = twoSourceInstr(0w1, 0w0, 0wx2) and signedDivide = twoSourceInstr(0w1, 0w0, 0wx3) (* Logical shift left Rd = Rn << (Rm mod 0w64) *) and logicalShiftLeftVariable = twoSourceInstr(0w1, 0w0, 0wx8) (* Logical shift right Rd = Rn >> (Rm mod 0w64) *) and logicalShiftRightVariable = twoSourceInstr(0w1, 0w0, 0wx9) (* Arithmetic shift right Rd = Rn ~>> (Rm mod 0w64) *) and arithmeticShiftRightVariable = twoSourceInstr(0w1, 0w0, 0wxa) and logicalShiftLeftVariable32 = twoSourceInstr(0w0, 0w0, 0wx8) end (* Three source operations. These are all variations of multiply. 
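For example, multiplyAndAdd{regA=X3, regN=X1, regM=X2, regD=X0} computes X0 = X3 + X1 * X2 (madd); passing XZero as regA reduces this to a plain mul, and signedMultiplyHigh below fixes regA at XZero because smulh requires it.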
*) local fun threeSourceInstr (sf, op54, op31, o0) ({regM, regA, regN, regD}) = SimpleInstr(0wx1b000000 orb (sf << 0w31) orb (op54 << 0w29) orb (op31 << 0w21) orb (word8ToWord32(xRegOnly regM) << 0w16) orb (o0 << 0w15) orb (word8ToWord32(xRegOrXZ regA) << 0w10) orb (word8ToWord32(xRegOnly regN) << 0w5) orb word8ToWord32(xRegOnly regD)) in (* regD = regA + regN * regM *) val multiplyAndAdd = threeSourceInstr(0w1, 0w0, 0w0, 0w0) (* regD = regA - regN * regM *) and multiplyAndSub = threeSourceInstr(0w1, 0w0, 0w0, 0w1) (* Return the high-order part of a signed multiplication. *) fun signedMultiplyHigh({regM, regN, regD}) = threeSourceInstr(0w1, 0w0, 0w2, 0w0) { regM=regM, regN=regN, regD=regD, regA=XZero} end (* Loads: There are two versions of this on the ARM. There is a version that takes a signed 9-bit byte offset and a version that takes an unsigned 12-bit word offset. *) local fun loadStoreRegScaled (size, v, opc, xD) ({regT, regN, unitOffset}) = let val _ = (unitOffset >= 0 andalso unitOffset < 0x1000) orelse raise InternalError "loadStoreRegScaled: value out of range" in SimpleInstr(0wx39000000 orb (size << 0w30) orb (opc << 0w22) orb (v << 0w26) orb (Word32.fromInt unitOffset << 0w10) orb (word8ToWord32(xRegOrXSP regN) << 0w5) orb word8ToWord32(xD regT)) end in val loadRegScaled = loadStoreRegScaled(0w3, 0w0, 0w1, xRegOrXZ) and storeRegScaled = loadStoreRegScaled(0w3, 0w0, 0w0, xRegOrXZ) (* (Unsigned) byte operations. There are also signed versions. *) and loadRegScaledByte = loadStoreRegScaled (0w0, 0w0, 0w1, xRegOrXZ) and storeRegScaledByte = loadStoreRegScaled (0w0, 0w0, 0w0, xRegOrXZ) and loadRegScaled16 = loadStoreRegScaled (0w1, 0w0, 0w1, xRegOrXZ) and storeRegScaled16 = loadStoreRegScaled (0w1, 0w0, 0w0, xRegOrXZ) and loadRegScaled32 = loadStoreRegScaled (0w2, 0w0, 0w1, xRegOrXZ) and storeRegScaled32 = loadStoreRegScaled (0w2, 0w0, 0w0, xRegOrXZ) and loadRegScaledDouble = loadStoreRegScaled(0w3, 0w1, 0w1, vReg) and storeRegScaledDouble = loadStoreRegScaled(0w3, 0w1, 0w0, vReg) and loadRegScaledFloat = loadStoreRegScaled(0w2, 0w1, 0w1, vReg) and storeRegScaledFloat = loadStoreRegScaled(0w2, 0w1, 0w0, vReg) end local (* Loads and stores with a signed byte offset. This includes simple unscaled addresses, pre-indexing and post-indexing. *) fun loadStoreByteAddress (op4, xD) (size, v, opc) ({regT, regN, byteOffset}) = let val _ = (byteOffset >= ~256 andalso byteOffset < 256) orelse raise InternalError "loadStoreUnscaled: value out of range" val imm9 = Word32.fromInt byteOffset andb 0wx1ff in SimpleInstr(0wx38000000 orb (size << 0w30) orb (opc << 0w22) orb (v << 0w26) orb (imm9 << 0w12) orb (op4 << 0w10) orb (word8ToWord32(xRegOrXSP regN) << 0w5) orb word8ToWord32(xD regT)) end val loadStoreUnscaled = loadStoreByteAddress (0w0, xRegOrXZ) and loadStoreUnscaledSIMD = loadStoreByteAddress (0w0, vReg) and loadStorePostIndex = loadStoreByteAddress (0w1, xRegOrXZ) and loadStorePreIndex = loadStoreByteAddress (0w3, xRegOrXZ) in val loadRegUnscaled = loadStoreUnscaled (0w3, 0w0, 0w1) and storeRegUnscaled = loadStoreUnscaled (0w3, 0w0, 0w0) (* (Unsigned) byte operations. There are also signed versions. 
*) and loadRegUnscaledByte = loadStoreUnscaled (0w0, 0w0, 0w1) and storeRegUnscaledByte = loadStoreUnscaled (0w0, 0w0, 0w0) and loadRegUnscaled16 = loadStoreUnscaled (0w1, 0w0, 0w1) and storeRegUnscaled16 = loadStoreUnscaled (0w1, 0w0, 0w0) and loadRegUnscaled32 = loadStoreUnscaled (0w2, 0w0, 0w1) and storeRegUnscaled32 = loadStoreUnscaled (0w2, 0w0, 0w0) and loadRegUnscaledFloat = loadStoreUnscaledSIMD (0w2, 0w1, 0w1) and storeRegUnscaledFloat = loadStoreUnscaledSIMD (0w2, 0w1, 0w0) and loadRegUnscaledDouble = loadStoreUnscaledSIMD (0w3, 0w1, 0w1) and storeRegUnscaledDouble = loadStoreUnscaledSIMD (0w3, 0w1, 0w0) val loadRegPostIndex = loadStorePostIndex (0w3, 0w0, 0w1) and storeRegPostIndex = loadStorePostIndex (0w3, 0w0, 0w0) and loadRegPostIndex32 = loadStorePostIndex (0w2, 0w0, 0w1) and storeRegPostIndex32 = loadStorePostIndex (0w2, 0w0, 0w0) and loadRegPostIndexByte = loadStorePostIndex (0w0, 0w0, 0w1) and storeRegPostIndexByte = loadStorePostIndex (0w0, 0w0, 0w0) val loadRegPreIndex = loadStorePreIndex (0w3, 0w0, 0w1) and storeRegPreIndex = loadStorePreIndex (0w3, 0w0, 0w0) + and loadRegPreIndex32 = loadStorePreIndex (0w2, 0w0, 0w1) + and storeRegPreIndex32 = loadStorePreIndex (0w2, 0w0, 0w0) and loadRegPreIndexByte = loadStorePreIndex (0w0, 0w0, 0w1) and storeRegPreIndexByte = loadStorePreIndex (0w0, 0w0, 0w0) end (* Load/store with a register offset i.e. an index register. *) local fun loadStoreRegRegisterOffset (size, v, opc, xD) ({regT, regN, regM, option}) = let val (opt, s) = case extendLSEncode option of (opt, ScaleOrShift) => (opt, 0w1) | (opt, NoScale) => (opt, 0w0) in SimpleInstr(0wx38200800 orb (size << 0w30) orb (v << 0w26) orb (opc << 0w22) orb (word8ToWord32(xRegOnly regM) << 0w16) orb (opt << 0w13) orb (s << 0w12) orb (word8ToWord32(xRegOrXSP regN) << 0w5) orb word8ToWord32(xD regT)) end in val loadRegIndexed = loadStoreRegRegisterOffset(0w3, 0w0, 0w1, xRegOrXZ) and storeRegIndexed = loadStoreRegRegisterOffset(0w3, 0w0, 0w0, xRegOrXZ) and loadRegIndexedByte = loadStoreRegRegisterOffset(0w0, 0w0, 0w1, xRegOrXZ) and storeRegIndexedByte = loadStoreRegRegisterOffset(0w0, 0w0, 0w0, xRegOrXZ) and loadRegIndexed16 = loadStoreRegRegisterOffset(0w1, 0w0, 0w1, xRegOrXZ) and storeRegIndexed16 = loadStoreRegRegisterOffset(0w1, 0w0, 0w0, xRegOrXZ) and loadRegIndexed32 = loadStoreRegRegisterOffset(0w2, 0w0, 0w1, xRegOrXZ) and storeRegIndexed32 = loadStoreRegRegisterOffset(0w2, 0w0, 0w0, xRegOrXZ) and loadRegIndexedFloat = loadStoreRegRegisterOffset(0w2, 0w1, 0w1, vReg) and storeRegIndexedFloat = loadStoreRegRegisterOffset(0w2, 0w1, 0w0, vReg) and loadRegIndexedDouble = loadStoreRegRegisterOffset(0w3, 0w1, 0w1, vReg) and storeRegIndexedDouble = loadStoreRegRegisterOffset(0w3, 0w1, 0w0, vReg) end local (* Loads and stores with special ordering. 
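These provide the acquire/release orderings and the exclusive (load-linked/store-conditional) pair. A sketch of a typical retry loop, for illustration only: with val retry = createLabel (), emit setLabel retry, loadAcquireExclusiveRegister{regN=X1, regT=X0}, the instructions that update X0, storeReleaseExclusiveRegister{regN=X1, regS=X2, regT=X0}, and compareBranchNonZero(X2, WordSize32, retry) to repeat if the exclusive store failed.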
*) fun loadStoreExclusive(size, o2, l, o1, o0) {regS, regT2, regN, regT} = SimpleInstr(0wx08000000 orb (size << 0w30) orb (o2 << 0w23) orb (l << 0w22) orb (o1 << 0w21) orb (word8ToWord32(xRegOrXZ regS) << 0w16) orb (o0 << 0w15) orb (word8ToWord32(xRegOrXZ regT2) << 0w10) orb (word8ToWord32(xRegOrXZ regN) << 0w5) orb word8ToWord32(xRegOrXZ regT)) in fun loadAcquire{regN, regT} = loadStoreExclusive(0w3, 0w1, 0w1, 0w0, 0w1) {regS=XZero, regT2=XZero, regN=regN, regT=regT} and storeRelease{regN, regT} = loadStoreExclusive(0w3, 0w1, 0w0, 0w0, 0w1) {regS=XZero, regT2=XZero, regN=regN, regT=regT} and loadAcquire32{regN, regT} = loadStoreExclusive(0w2, 0w1, 0w1, 0w0, 0w1) {regS=XZero, regT2=XZero, regN=regN, regT=regT} and storeRelease32{regN, regT} = loadStoreExclusive(0w2, 0w1, 0w0, 0w0, 0w1) {regS=XZero, regT2=XZero, regN=regN, regT=regT} (* Acquire exclusive access to a memory location and load its current value *) and loadAcquireExclusiveRegister{regN, regT} = loadStoreExclusive(0w3, 0w0, 0w1, 0w0, 0w1) {regS=XZero, regT2=XZero, regN=regN, regT=regT} (* Release exclusive access and test whether it succeeded. Sets regS to 0 if successful otherwise 1, in which case we have to repeat the operation. *) and storeReleaseExclusiveRegister{regN, regS, regT} = loadStoreExclusive(0w3, 0w0, 0w0, 0w0, 0w1) {regS=regS, regT2=XZero, regN=regN, regT=regT} end (* Addresses must go in the constant area at the end of the code where they can be found by the GC. *) fun loadAddressConstant(xReg, valu) = LoadAddressLiteral{reg=xReg, value=valu} (* Non-address constants. These may or may not be tagged values. *) fun loadNonAddressConstant(xReg, valu) = LoadNonAddressLiteral{reg=xReg, value=valu} local fun moveWideImmediate(sf, opc) {regD, immediate, shift} = let val hw = case (shift, sf) of (0w0, _) => 0w0 | (0w16, _) => 0w1 | (0w24, 0w1) => 0w2 | (0w48, 0w1) => 0w3 | _ => raise InternalError "moveWideImmediate: invalid shift" val _ = immediate <= 0wxffff orelse raise InternalError "moveWideImmediate: immediate too large" in SimpleInstr(0wx12800000 orb (sf << 0w31) orb (opc << 0w29) orb (hw << 0w21) orb (wordToWord32 immediate << 0w5) orb word8ToWord32(xRegOnly regD)) end in val moveNot32 = moveWideImmediate(0w0, 0w0) and moveZero32 = moveWideImmediate(0w0, 0w2) and moveKeep32 = moveWideImmediate(0w0, 0w3) and moveNot = moveWideImmediate(0w1, 0w0) and moveZero = moveWideImmediate(0w1, 0w2) and moveKeep = moveWideImmediate(0w1, 0w3) end (* Instructions involved in thread synchonisation. *) val yield = SimpleInstr 0wxD503203F (* Yield inside a spin-lock. *) and dmbIsh = SimpleInstr 0wxD5033BBF (* Memory barrier. *) (* Jump to the address in the register and put the address of the next instruction into X30. *) fun branchAndLinkReg(dest) = SimpleInstr(0wxD63F0000 orb (word8ToWord32(xRegOnly dest) << 0w5)) (* Jump to the address in the register. *) fun branchRegister(dest) = SimpleInstr(0wxD61F0000 orb (word8ToWord32(xRegOnly dest) << 0w5)) (* Jump to the address in the register and hint this is a return. *) fun returnRegister(dest) = SimpleInstr(0wxD65F0000 orb (word8ToWord32(xRegOnly dest) << 0w5)) (* Put a label into the code. *) val setLabel = Label (* Create a label. *) fun createLabel () = ref [ref 0w0] (* A conditional or unconditional branch. *) and conditionalBranch(cond, label) = ConditionalBranch{label=label, jumpCondition=cond, length=ref BrExtended } and unconditionalBranch label = UnconditionalBranch label (* Put the address of a label into a register - used for handlers and cases. 
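For example (illustration only): with val hLab = createLabel (), loadLabelAddress(X1, hLab) makes X1 point at whatever code follows the later setLabel hLab, e.g. an exception handler entry.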
*) and loadLabelAddress(reg, label) = LoadLabelAddress{label=label, reg=reg} (* Test a bit in a register and branch if zero/nonzero *) and testBitBranchZero(reg, bit, label) = TestBitBranch{label=label, bitNo=bit, brNonZero=false, reg=reg, length=ref BrExtended} and testBitBranchNonZero(reg, bit, label) = TestBitBranch{label=label, bitNo=bit, brNonZero=true, reg=reg, length=ref BrExtended} (* Compare a register with zero and branch if zero/nonzero *) and compareBranchZero(reg, size, label) = CompareBranch{label=label, brNonZero=false, size=size, reg=reg, length=ref BrExtended} and compareBranchNonZero(reg, size, label) = CompareBranch{label=label, brNonZero=true, size=size, reg=reg, length=ref BrExtended} (* Set the destination register to the value of the first reg if the condition is true otherwise to a, possibly modified, version of the second argument. There are variants that set it unmodified, incremented, inverted and negated. *) local fun conditionalSelect (sf, opc, op2) {regD, regFalse, regTrue, cond=CCode cond} = SimpleInstr(0wx1A800000 orb (sf << 0w31) orb (opc << 0w30) orb (word8ToWord32(xRegOrXZ regFalse) << 0w16) orb (word8ToWord32 cond << 0w12) orb (op2 << 0w10) orb (word8ToWord32(xRegOrXZ regTrue) << 0w5) orb word8ToWord32(xRegOrXZ regD)) in val conditionalSet = conditionalSelect(0w1, 0w0, 0w0) and conditionalSetIncrement = conditionalSelect(0w1, 0w0, 0w1) and conditionalSetInverted = conditionalSelect(0w1, 0w1, 0w0) and conditionalSetNegated = conditionalSelect(0w1, 0w1, 0w1) end (* This combines the effect of a left and right shift. There are various derived forms of this depending on the relative values of immr and imms. if imms >= immr copies imms-immr-1 bits from bit position immr to the lsb bits of the destination. if imms < immr copies imms+1 bits from the lsb bit to bit position regsize-immr. How the remaining bits are affected depends on the instruction. BitField instructions do not affect other bits. UnsignedBitField instructions zero other bits. SignedBitField instructions set the high order bits to a copy of the high order bit copied and zero the low order bits. 
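For example, logicalShiftLeft{wordSize=WordSize64, shift=0w3, regN=X1, regD=X0} becomes unsignedBitfieldMove64 with immr = ~0w3 mod 0w64 = 0w61 and imms = 0w60, the standard ubfm encoding of lsl x0,x1,#3.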
*) local fun bitfield (sf, opc, n) {immr, imms, regN, regD} = SimpleInstr(0wx13000000 orb (sf << 0w31) orb (opc << 0w29) orb (n << 0w22) orb (wordToWord32 immr << 0w16) orb (wordToWord32 imms << 0w10) orb (word8ToWord32(xRegOrXZ regN) << 0w5) orb word8ToWord32(xRegOrXZ regD)) val signedBitfieldMove32 = bitfield(0w0, 0w0, 0w0) and bitfieldMove32 = bitfield(0w0, 0w1, 0w0) and unsignedBitfieldMove32 = bitfield(0w0, 0w2, 0w0) and signedBitfieldMove64 = bitfield(0w1, 0w0, 0w1) and bitfieldMove64 = bitfield(0w1, 0w1, 0w1) and unsignedBitfieldMove64 = bitfield(0w1, 0w2, 0w1) in fun logicalShiftLeft{wordSize=WordSize64, shift, regN, regD} = unsignedBitfieldMove64{immr=Word.~ shift mod 0w64, imms=0w64-0w1-shift, regN=regN, regD=regD} | logicalShiftLeft{wordSize=WordSize32, shift, regN, regD} = unsignedBitfieldMove32{immr=Word.~ shift mod 0w32, imms=0w32-0w1-shift, regN=regN, regD=regD} and logicalShiftRight{wordSize=WordSize64, shift, regN, regD} = unsignedBitfieldMove64{immr=shift, imms=0wx3f, regN=regN, regD=regD} | logicalShiftRight{wordSize=WordSize32, shift, regN, regD} = unsignedBitfieldMove32{immr=shift, imms=0wx1f, regN=regN, regD=regD} and unsignedBitfieldInsertinZeros{wordSize=WordSize64, lsb, width, regN, regD} = unsignedBitfieldMove64{immr=Word.~ lsb mod 0w64, imms=width-0w1, regN=regN, regD=regD} | unsignedBitfieldInsertinZeros{wordSize=WordSize32, lsb, width, regN, regD} = unsignedBitfieldMove32{immr=Word.~ lsb mod 0w32, imms=width-0w1, regN=regN, regD=regD} and arithmeticShiftRight{wordSize=WordSize64, shift, regN, regD} = signedBitfieldMove64{immr=shift, imms=0wx3f, regN=regN, regD=regD} | arithmeticShiftRight{wordSize=WordSize32, shift, regN, regD} = signedBitfieldMove32{immr=shift, imms=0wx1f, regN=regN, regD=regD} and bitfieldInsert{wordSize=WordSize64, lsb, width, regN, regD} = bitfieldMove64{immr=Word.~ lsb mod 0w64, imms=width-0w1, regN=regN, regD=regD} | bitfieldInsert{wordSize=WordSize32, lsb, width, regN, regD} = bitfieldMove32{immr=Word.~ lsb mod 0w32, imms=width-0w1, regN=regN, regD=regD} end local (* Logical immediates. AND, OR, XOR and ANDS. Assumes that the immediate value has already been checked as valid. The non-flags versions can use SP as the destination. *) fun logicalImmediate (opc, xD) {wordSize, bits, regN, regD} = let val s = case wordSize of WordSize32 => 0w0 | WordSize64 => 0w1 val {n, imms, immr} = case encodeBitPattern(bits, wordSize) of NONE => raise InternalError "testBitPattern: unable to encode bit pattern" | SOME res => res in SimpleInstr(0wx12000000 orb (opc << 0w29) orb (s << 0w31) orb (n << 0w22) orb (wordToWord32 immr << 0w16) orb (wordToWord32 imms << 0w10) orb (word8ToWord32(xRegOrXZ regN) << 0w5) orb word8ToWord32(xD regD)) end in val bitwiseAndImmediate = logicalImmediate (0w0, xRegOrXSP) and bitwiseOrImmediate = logicalImmediate (0w1, xRegOrXSP) and bitwiseXorImmediate = logicalImmediate (0w2, xRegOrXSP) and bitwiseAndSImmediate = logicalImmediate (0w3, xRegOrXZ) (* Test a bit pattern in a register. If the pattern is within the low-order 32-bits we use a 32-bit test. 
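For example, testBitPattern(X0, 0w1) tests the bottom (tag) bit with the 32-bit form, i.e. ands wzr,w0,#1 (tst w0,#1), while a mask wider than 32 bits selects the 64-bit form.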
*) fun testBitPattern(reg, bits) = let val w = if bits <= 0wxffffffff then WordSize32 else WordSize64 in bitwiseAndSImmediate({wordSize=w, bits=bits, regN=reg, regD=XZero}) end end local (* Floating point operations - 2 source *) fun floatingPoint2Source (pt, opc) {regM, regN, regD} = SimpleInstr(0wx1E200800 orb (pt << 0w22) orb (word8ToWord32(vReg regM) << 0w16) orb (opc << 0w12) orb (word8ToWord32(vReg regN) << 0w5) orb word8ToWord32(vReg regD)) in val multiplyFloat = floatingPoint2Source(0w0, 0wx0) and divideFloat = floatingPoint2Source(0w0, 0wx1) and addFloat = floatingPoint2Source(0w0, 0wx2) and subtractFloat = floatingPoint2Source(0w0, 0wx3) and multiplyDouble = floatingPoint2Source(0w1, 0wx0) and divideDouble = floatingPoint2Source(0w1, 0wx1) and addDouble = floatingPoint2Source(0w1, 0wx2) and subtractDouble = floatingPoint2Source(0w1, 0wx3) end local (* Move between a floating point and a general register with or without conversion. *) fun fmoveGeneral (sf, s, ptype, mode, opcode, rN, rD) {regN, regD} = SimpleInstr(0wx1E200000 orb (sf << 0w31) orb (s << 0w29) orb (ptype << 0w22) orb (mode << 0w19) orb (opcode << 0w16) orb (word8ToWord32(rN regN) << 0w5) orb word8ToWord32(rD regD)) open IEEEReal in (* Moves without conversion *) val moveGeneralToFloat = fmoveGeneral(0w0, 0w0, 0w0, 0w0, 0w7, xRegOrXZ, vReg) and moveFloatToGeneral = fmoveGeneral(0w0, 0w0, 0w0, 0w0, 0w6, vReg, xRegOnly) and moveGeneralToDouble = fmoveGeneral(0w1, 0w0, 0w1, 0w0, 0w7, xRegOrXZ, vReg) and moveDoubleToGeneral = fmoveGeneral(0w1, 0w0, 0w1, 0w0, 0w6, vReg, xRegOnly) (* Moves with conversion - signed. The argument is a 64-bit value. *) and convertIntToFloat = fmoveGeneral(0w1, 0w0, 0w0, 0w0, 0w2, xRegOrXZ, vReg) and convertIntToDouble = fmoveGeneral(0w1, 0w0, 0w1, 0w0, 0w2, xRegOrXZ, vReg) + and convertInt32ToFloat = fmoveGeneral(0w0, 0w0, 0w0, 0w0, 0w2, xRegOrXZ, vReg) + and convertInt32ToDouble = fmoveGeneral(0w0, 0w0, 0w1, 0w0, 0w2, xRegOrXZ, vReg) fun convertFloatToInt TO_NEAREST = fmoveGeneral(0w1, 0w0, 0w0, 0w0, 0w4, vReg, xRegOnly) (* fcvtas *) | convertFloatToInt TO_NEGINF = fmoveGeneral(0w1, 0w0, 0w0, 0w2, 0w0, vReg, xRegOnly) (* fcvtms *) | convertFloatToInt TO_POSINF = fmoveGeneral(0w1, 0w0, 0w0, 0w1, 0w0, vReg, xRegOnly) (* fcvtps *) | convertFloatToInt TO_ZERO = fmoveGeneral(0w1, 0w0, 0w0, 0w3, 0w0, vReg, xRegOnly) (* fcvtzs *) and convertDoubleToInt TO_NEAREST = fmoveGeneral(0w1, 0w0, 0w1, 0w0, 0w4, vReg, xRegOnly) (* fcvtas *) | convertDoubleToInt TO_NEGINF = fmoveGeneral(0w1, 0w0, 0w1, 0w2, 0w0, vReg, xRegOnly) (* fcvtms *) | convertDoubleToInt TO_POSINF = fmoveGeneral(0w1, 0w0, 0w1, 0w1, 0w0, vReg, xRegOnly) (* fcvtps *) | convertDoubleToInt TO_ZERO = fmoveGeneral(0w1, 0w0, 0w1, 0w3, 0w0, vReg, xRegOnly) (* fcvtzs *) + + and convertFloatToInt32 TO_NEAREST = + fmoveGeneral(0w0, 0w0, 0w0, 0w0, 0w4, vReg, xRegOnly) (* fcvtas *) + | convertFloatToInt32 TO_NEGINF = + fmoveGeneral(0w0, 0w0, 0w0, 0w2, 0w0, vReg, xRegOnly) (* fcvtms *) + | convertFloatToInt32 TO_POSINF = + fmoveGeneral(0w0, 0w0, 0w0, 0w1, 0w0, vReg, xRegOnly) (* fcvtps *) + | convertFloatToInt32 TO_ZERO = + fmoveGeneral(0w0, 0w0, 0w0, 0w3, 0w0, vReg, xRegOnly) (* fcvtzs *) + + and convertDoubleToInt32 TO_NEAREST = + fmoveGeneral(0w0, 0w0, 0w1, 0w0, 0w4, vReg, xRegOnly) (* fcvtas *) + | convertDoubleToInt32 TO_NEGINF = + fmoveGeneral(0w0, 0w0, 0w1, 0w2, 0w0, vReg, xRegOnly) (* fcvtms *) + | convertDoubleToInt32 TO_POSINF = + fmoveGeneral(0w0, 0w0, 0w1, 0w1, 0w0, vReg, xRegOnly) (* fcvtps *) + | convertDoubleToInt32 TO_ZERO = + 
fmoveGeneral(0w0, 0w0, 0w1, 0w3, 0w0, vReg, xRegOnly) (* fcvtzs *) end local fun floatingPtCompare(ptype, opc) {regM, regN} = SimpleInstr(0wx1E202000 orb (ptype << 0w22) orb (word8ToWord32(vReg regM) << 0w16) orb (word8ToWord32(vReg regN) << 0w5) orb (opc << 0w3)) in val compareFloat = floatingPtCompare(0w0, 0w0) (* fcmp *) and compareDouble = floatingPtCompare(0w1, 0w0) (* It is also possible to compare a single register with zero using opc=1/3 *) end local (* Floating point single source. *) fun floatingPtSingle (ptype, opc) {regN, regD} = SimpleInstr(0wx1E204000 orb (ptype << 0w22) orb (opc << 0w15) orb (word8ToWord32(vReg regN) << 0w5) orb word8ToWord32(vReg regD)) in val moveFloatToFloat = floatingPtSingle(0w0, 0wx0) and absFloat = floatingPtSingle(0w0, 0wx1) and negFloat = floatingPtSingle(0w0, 0wx2) and convertFloatToDouble = floatingPtSingle(0w0, 0wx5) and moveDoubleToDouble = floatingPtSingle(0w1, 0wx0) and absDouble = floatingPtSingle(0w1, 0wx1) and negDouble = floatingPtSingle(0w1, 0wx2) and convertDoubleToFloat = floatingPtSingle(0w1, 0wx4) end (* This word is put in after a call to the RTS trap-handler. All the registers are saved and restored across a call to the trap-handler; the register mask contains those that may contain an address and so need to be scanned and possibly updated if there is a GC. *) fun registerMask(regs) = let fun addToMask(r, mask) = mask orb (0w1 << word8ToWord(xRegOnly r)) val maskWord = List.foldl addToMask 0w0 regs in SimpleInstr(0wx02000000 (* Reserved instr range. *) orb maskWord) end (* Size of each code word. *) fun codeSize (SimpleInstr _) = 1 (* Number of 32-bit words *) | codeSize (LoadAddressLiteral _) = 1 | codeSize (LoadNonAddressLiteral _) = 1 | codeSize (Label _) = 0 | codeSize (UnconditionalBranch _) = 1 | codeSize (LoadLabelAddress _) = 1 | codeSize (ConditionalBranch { length=ref BrShort, ...}) = 1 | codeSize (ConditionalBranch { length=ref BrExtended, ...}) = 2 | codeSize (TestBitBranch { length=ref BrShort, ...}) = 1 | codeSize (TestBitBranch { length=ref BrExtended, ...}) = 2 | codeSize (CompareBranch { length=ref BrShort, ...}) = 1 | codeSize (CompareBranch { length=ref BrExtended, ...}) = 2 (* Store a 32-bit value in the code *) fun writeInstr(value, wordAddr, seg) = let fun putBytes(value, a, seg, i) = if i = 0w4 then () else ( byteVecSet(seg, a+i, word32ToWord8(value andb 0wxff)); putBytes(value >> 0w8, a, seg, i+0w1) ) in putBytes(value, Word.<<(wordAddr, 0w2), seg, 0w0) end (* Store a 64-bit constant in the code area. *) fun write64Bit(value, word64Addr, seg) = let fun putBytes(value, a, seg, i) = if i = 0w8 then () else ( byteVecSet(seg, a+i, Word8.fromLarge(Word64.toLarge value)); putBytes(Word64.>>(value, 0w8), a, seg, i+0w1) ) in putBytes(value, Word.<<(word64Addr, 0w3), seg, 0w0) end (* Set the sizes of branches depending on the distance to the destination. *) fun setLabelsAndSizes ops = let (* Set the labels and get the current size of the code. *) fun setLabels(Label(ref labs) :: ops, ic) = (List.app(fn d => d := ic) labs; setLabels(ops, ic)) | setLabels(oper :: ops, ic) = setLabels(ops, ic + Word.fromInt(codeSize oper)) | setLabels([], ic) = ic (* Set the labels and adjust the sizes, repeating until it never gets smaller *) fun setLabAndSize(ops, lastSize) = let (* See if we can shorten any branches. The "addr" is the original address since that's what we've used to set the labels. 
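Note that adjust only ever shrinks a branch (BrExtended to BrShort), never the reverse, so each pass can only reduce the total size and the loop below terminates; a size increase would indicate a bug, which is why setLabAndSize raises InternalError in that case.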
*) fun adjust([], _) = () | adjust(ConditionalBranch { length as ref BrExtended, label=ref labs, ...} :: instrs, addr) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt addr in if offset < Word32.toInt(0w1 << 0w18) andalso offset >= ~ (Word32.toInt(0w1 << 0w18)) then length := BrShort else (); adjust(instrs, addr + 0w2) (* N.B. Size BEFORE any adjustment *) end | adjust(TestBitBranch { length as ref BrExtended, label=ref labs, ...} :: instrs, addr) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt addr in if offset < 0x2000 andalso offset >= ~ 0x2000 then length := BrShort else (); adjust(instrs, addr + 0w2) (* N.B. Size BEFORE any adjustment *) end | adjust(CompareBranch { length as ref BrExtended, label=ref labs, ...} :: instrs, addr) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt addr in if offset < 0x40000 andalso offset >= ~ 0x40000 then length := BrShort else (); adjust(instrs, addr + 0w2) (* N.B. Size BEFORE any adjustment *) end | adjust(instr :: instrs, addr) = adjust(instrs, addr + Word.fromInt(codeSize instr)) val () = adjust(ops, 0w0) val nextSize = setLabels(ops, 0w0) in if nextSize < lastSize then setLabAndSize(ops, nextSize) else if nextSize = lastSize then lastSize else raise InternalError "setLabAndSize - size increased" end in setLabAndSize(ops, setLabels(ops, 0w0)) end fun genCode(ops, addressConsts, nonAddressConsts) = let val codeSize = setLabelsAndSizes ops (* Number of 32-bit instructions *) val wordsOfCode = (codeSize + 0w1) div 0w2 (* Round up to 64-bits *) val paddingWord = if Word.andb(codeSize, 0w1) = 0w1 then [SimpleInstr nopCode] else [] val numNonAddrConsts = Word.fromInt(List.length nonAddressConsts) and numAddrConsts = Word.fromInt(List.length addressConsts) (* 32-bit words. *) (* Segment size in Poly words. *) val segSize = (wordsOfCode + numNonAddrConsts) * wordsPerNativeWord + numAddrConsts + 0w4 (* 4 extra words *) val codeVec = byteVecMake segSize fun testBit(bitNo, brNonZero, offset, reg) = 0wx36000000 orb (if bitNo >= 0w32 then 0wx80000000 else 0w0) orb (if brNonZero then 0wx01000000 else 0w0) orb (word8ToWord32(Word8.andb(bitNo, 0wx3f)) << 0w19) orb ((offset andb 0wx3fff) << 0w5) orb word8ToWord32(xRegOnly reg) and compareBranch(size, brNonZero, offset, reg) = 0wx34000000 orb (case size of WordSize64 => 0wx80000000 | WordSize32 => 0w0) orb (if brNonZero then 0wx01000000 else 0w0) orb ((offset andb 0wx7ffff) << 0w5) orb word8ToWord32(xRegOnly reg) fun genCodeWords([], _ , _, _) = () | genCodeWords(SimpleInstr code :: tail, wordNo, aConstNum, nonAConstNum) = ( writeInstr(code, wordNo, codeVec); genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum) ) | genCodeWords(LoadAddressLiteral{reg, ...} :: tail, wordNo, aConstNum, nonAConstNum) = let (* The offset is in 32-bit words. The first of the constants is at offset wordsOfCode+3. Non-address constants are always 8 bytes but address constants are 4 bytes in 32-in-64. *) val s = if is32in64 then 0w0 else 0w1 (* Load 64-bit word in 64-bit mode and 32-bits in 32-in-64. 
*) val offsetOfConstant = (wordsOfCode+numNonAddrConsts)*0w2 + (0w3+aConstNum)*(Address.wordSize div 0w4) - wordNo val _ = offsetOfConstant < 0wx100000 orelse raise InternalError "Offset to constant is too large" val code = 0wx18000000 orb (s << 0w30) orb (wordToWord32 offsetOfConstant << 0w5) orb word8ToWord32(xRegOnly reg) in writeInstr(code, wordNo, codeVec); genCodeWords(tail, wordNo+0w1, aConstNum+0w1, nonAConstNum) end | genCodeWords(LoadNonAddressLiteral{reg, ...} :: tail, wordNo, aConstNum, nonAConstNum) = let (* The offset is in 32-bit words. These are always 64-bits. *) val offsetOfConstant = (wordsOfCode+nonAConstNum)*0w2 - wordNo val _ = offsetOfConstant < 0wx100000 orelse raise InternalError "Offset to constant is too large" val code = 0wx58000000 orb (wordToWord32 offsetOfConstant << 0w5) orb word8ToWord32(xRegOnly reg) in writeInstr(code, wordNo, codeVec); genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum+0w1) end | genCodeWords(Label _ :: tail, wordNo, aConstNum, nonAConstNum) = genCodeWords(tail, wordNo, aConstNum, nonAConstNum) (* No code. *) | genCodeWords(UnconditionalBranch(ref labs) :: tail, wordNo, aConstNum, nonAConstNum) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt wordNo val _ = (offset < Word32.toInt(0w1 << 0w25) andalso offset >= ~ (Word32.toInt(0w1 << 0w25))) orelse raise InternalError "genCodeWords: branch too far"; in writeInstr(0wx14000000 orb (Word32.fromInt offset andb 0wx03ffffff), wordNo, codeVec); genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum) end | genCodeWords(ConditionalBranch{ label=ref labs, jumpCondition=CCode cond, length=ref BrShort }:: tail, wordNo, aConstNum, nonAConstNum) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt wordNo val _ = (offset < Word32.toInt(0w1 << 0w18) andalso offset >= ~ (Word32.toInt(0w1 << 0w18))) orelse raise InternalError "genCodeWords: branch too far" in writeInstr(0wx54000000 orb ((Word32.fromInt offset andb 0wx07ffff) << 0w5) orb word8ToWord32 cond, wordNo, codeVec); genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum) end | genCodeWords(ConditionalBranch{ label=ref labs, jumpCondition=CCode cond, length=ref BrExtended }:: tail, wordNo, aConstNum, nonAConstNum) = let (* Long form - put a conditional branch with reversed sense round an unconditional branch. *) val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt (wordNo + 0w1) (* Next instruction. 
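The offset is taken relative to the following word because, in the extended form, the unconditional branch that actually transfers control is the second instruction of the pair and ARM branch offsets are relative to the branch instruction itself.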
*) val _ = (offset < Word32.toInt(0w1 << 0w25) andalso offset >= ~ (Word32.toInt(0w1 << 0w25))) orelse raise InternalError "genCodeWords: branch too far" val revCond = Word8.xorb(cond, 0w1) in writeInstr(0wx54000000 orb (0w2 << 0w5) orb word8ToWord32 revCond, wordNo, codeVec); writeInstr(0wx14000000 orb (Word32.fromInt offset andb 0wx03ffffff), wordNo+0w1, codeVec); genCodeWords(tail, wordNo+0w2, aConstNum, nonAConstNum) end | genCodeWords(LoadLabelAddress{label=ref labs, reg} :: tail, wordNo, aConstNum, nonAConstNum) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt wordNo val _ = offset < 0x100000 orelse offset >= ~ 0x100000 orelse raise InternalError "Offset to label address is too large" val code = 0wx10000000 orb ((Word32.fromInt offset andb 0wx7ffff) << 0w5) orb word8ToWord32(xRegOnly reg) in writeInstr(code, wordNo, codeVec); genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum) end | genCodeWords(TestBitBranch{label=ref labs, bitNo, brNonZero, reg, length=ref BrExtended} :: tail, wordNo, aConstNum, nonAConstNum) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt (wordNo + 0w1) (* Next instruction *) val _ = (offset < Word32.toInt(0w1 << 0w25) andalso offset >= ~ (Word32.toInt(0w1 << 0w25))) orelse raise InternalError "genCodeWords: branch too far" val _ = bitNo <= 0w63 orelse raise InternalError "TestBitBranch: bit number > 63" val code = testBit(bitNo, (* Invert test *) not brNonZero, 0w2 (* Skip branch *), reg) in writeInstr(code, wordNo, codeVec); writeInstr(0wx14000000 orb (Word32.fromInt offset andb 0wx03ffffff), wordNo+0w1, codeVec); genCodeWords(tail, wordNo+0w2, aConstNum, nonAConstNum) end | genCodeWords(TestBitBranch{label=ref labs, bitNo, brNonZero, reg, length=ref BrShort} :: tail, wordNo, aConstNum, nonAConstNum) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt wordNo val _ = (offset < 0x2000 andalso offset >= ~ 0x2000) orelse raise InternalError "TestBitBranch: Offset to label address is too large" val _ = bitNo <= 0w63 orelse raise InternalError "TestBitBranch: bit number > 63" val code = testBit(bitNo, brNonZero, Word32.fromInt offset, reg) in writeInstr(code, wordNo, codeVec); genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum) end | genCodeWords(CompareBranch{label=ref labs, brNonZero, size, reg, length=ref BrExtended} :: tail, wordNo, aConstNum, nonAConstNum) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt (wordNo+0w1) val _ = (offset < Word32.toInt(0w1 << 0w25) andalso offset >= ~ (Word32.toInt(0w1 << 0w25))) orelse raise InternalError "genCodeWords: branch too far" val code = compareBranch(size, (* Invert test *) not brNonZero, 0w2, reg) in writeInstr(code, wordNo, codeVec); writeInstr(0wx14000000 orb (Word32.fromInt offset andb 0wx03ffffff), wordNo+0w1, codeVec); genCodeWords(tail, wordNo+0w2, aConstNum, nonAConstNum) end | genCodeWords(CompareBranch{label=ref labs, brNonZero, size, reg, length=ref BrShort} :: tail, wordNo, aConstNum, nonAConstNum) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt wordNo val _ = (offset < 0x40000 andalso offset >= ~ 0x40000) orelse raise InternalError "CompareBranch: Offset to label address is too large" val code = compareBranch(size, brNonZero, Word32.fromInt offset, reg) in writeInstr(code, wordNo, codeVec); genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum) end in genCodeWords (ops @ paddingWord, 0w0, 0w0, 0w0); (* Copy in the non-address constants. 
*) List.foldl(fn (cVal, addr) => (write64Bit(cVal, addr, codeVec); addr+0w1)) wordsOfCode nonAddressConsts; (codeVec (* Return the completed code. *), wordsOfCode+numNonAddrConsts (* And the size in 64-bit words. *)) end (* Store a word, either 64-bit or 32-bit *) fun setWord(value, wordNo, seg) = let val addrs = wordNo * Address.wordSize fun putBytes(value, a, seg, i) = if i = Address.wordSize then () else ( byteVecSet(seg, a+i, Word8.fromInt(value mod 256)); putBytes(value div 256, a, seg, i+0w1) ) in putBytes(value, addrs, seg, 0w0) end (* Print the instructions in the code. *) fun printCode (codeVec, functionName, wordsOfCode, printStream) = let val numInstructions = wordsOfCode * (Address.wordSize div 0w4) fun printHex (v, n) = let val s = Word.fmt StringCvt.HEX v val pad = CharVector.tabulate(Int.max(0, n-size s), fn _ => #"0") in printStream pad; printStream s end fun printCondition 0wx0 = printStream "eq" | printCondition 0wx1 = printStream "ne" | printCondition 0wx2 = printStream "cs" | printCondition 0wx3 = printStream "cc" | printCondition 0wx4 = printStream "mi" | printCondition 0wx5 = printStream "pl" | printCondition 0wx6 = printStream "vs" | printCondition 0wx7 = printStream "vc" | printCondition 0wx8 = printStream "hi" | printCondition 0wx9 = printStream "ls" | printCondition 0wxa = printStream "ge" | printCondition 0wxb = printStream "lt" | printCondition 0wxc = printStream "gt" | printCondition 0wxd = printStream "le" | printCondition 0wxe = printStream "al" | printCondition _ = printStream "nv" (* Normal XReg with 31 being XZ *) fun prXReg 0w31 = printStream "xz" | prXReg r = printStream("x" ^ Word32.fmt StringCvt.DEC r) (* XReg when 31 is SP *) fun prXRegOrSP 0w31 = printStream "sp" | prXRegOrSP r = printStream("x" ^ Word32.fmt StringCvt.DEC r) (* Normal WReg with 31 being WZ *) fun prWReg 0w31 = printStream "wz" | prWReg r = printStream("w" ^ Word32.fmt StringCvt.DEC r) (* WReg when 31 is WSP *) fun prWRegOrSP 0w31 = printStream "wsp" | prWRegOrSP r = printStream("w" ^ Word32.fmt StringCvt.DEC r) (* Each instruction is 32-bytes. *) fun printWordAt wordNo = let val byteNo = Word.<<(wordNo, 0w2) val () = printHex(byteNo, 6) (* Address *) val () = printStream "\t" val wordValue = word8ToWord32 (codeVecGet (codeVec, byteNo)) orb (word8ToWord32 (codeVecGet (codeVec, byteNo+0w1)) << 0w8) orb (word8ToWord32 (codeVecGet (codeVec, byteNo+0w2)) << 0w16) orb (word8ToWord32 (codeVecGet (codeVec, byteNo+0w3)) << 0w24) val () = printHex(word32ToWord wordValue, 8) (* Instr as hex *) val () = printStream "\t" in if (wordValue andb 0wxfffffc1f) = 0wxD61F0000 then let val rN = (wordValue andb 0wx3e0) >> 0w5 in printStream "br\tx"; printStream(Word32.fmt StringCvt.DEC rN) end else if (wordValue andb 0wxfffffc1f) = 0wxD63F0000 then let val rN = (wordValue andb 0wx3e0) >> 0w5 in printStream "blr\tx"; printStream(Word32.fmt StringCvt.DEC rN) end else if (wordValue andb 0wxfffffc1f) = 0wxD65F0000 then let val rN = (wordValue andb 0wx3e0) >> 0w5 in printStream "ret\tx"; printStream(Word32.fmt StringCvt.DEC rN) end else if wordValue = 0wxD503201F then printStream "nop" else if wordValue = 0wxD503203F then printStream "yield" else if wordValue = 0wxD5033BBF then printStream "dmb\tish" else if (wordValue andb 0wx1f800000) = 0wx12800000 then (* Move of constants. Includes movn and movk. 
*) let val rD = wordValue andb 0wx1f val imm16 = Word32.toInt((wordValue >> 0w5) andb 0wxffff) val isXReg = (wordValue andb 0wx80000000) <> 0w0 val opc = (wordValue >> 0w29) andb 0w3 val shift = (wordValue >> 0w21) andb 0w3 in printStream (if opc = 0w3 then "movk\t" else "mov\t"); printStream (if isXReg then "x" else "w"); printStream(Word32.fmt StringCvt.DEC rD); printStream ",#"; printStream(Int.toString(if opc = 0w0 then ~1 - imm16 else imm16)); if shift = 0w0 then () else (printStream ",lsl #"; printStream(Word32.fmt StringCvt.DEC (shift*0w16))) end else if (wordValue andb 0wx3b000000) = 0wx39000000 then (* Load/Store with unsigned, scaled offset. *) let (* The offset is in units of the size of the operand. *) val size = wordValue >> 0w30 and v = (wordValue >> 0w26) andb 0w1 and opc = (wordValue >> 0w22) andb 0w3 val rT = wordValue andb 0wx1f and rN = (wordValue andb 0wx3e0) >> 0w5 and imm12 = (wordValue andb 0wx3ffc00) >> 0w10 val (opcode, r, scale) = case (size, v, opc) of (0w0, 0w0, 0w0) => ("strb", "w", 0w0) | (0w0, 0w0, 0w1) => ("ldrb", "w", 0w0) | (0w1, 0w0, 0w0) => ("strh", "w", 0w2) | (0w1, 0w0, 0w1) => ("ldrh", "w", 0w2) | (0w2, 0w0, 0w0) => ("str", "w", 0w4) | (0w2, 0w0, 0w1) => ("ldr", "w", 0w4) | (0w3, 0w0, 0w0) => ("str", "x", 0w8) | (0w3, 0w0, 0w1) => ("ldr", "x", 0w8) | (0w2, 0w1, 0w0) => ("str", "s", 0w4) | (0w2, 0w1, 0w1) => ("ldr", "s", 0w4) | (0w3, 0w1, 0w0) => ("str", "d", 0w8) | (0w3, 0w1, 0w1) => ("ldr", "d", 0w8) | _ => ("??", "?", 0w1) in printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rT); printStream ",[x"; printStream(Word32.fmt StringCvt.DEC rN); printStream ",#"; printStream(Word32.fmt StringCvt.DEC(imm12*scale)); printStream "]" end else if (wordValue andb 0wx3b200c00) = 0wx38000000 then (* Load/store unscaled immediate *) let val size = wordValue >> 0w30 and v = (wordValue >> 0w26) andb 0w1 and opc = (wordValue >> 0w22) andb 0w3 val rT = wordValue andb 0wx1f and rN = (wordValue andb 0wx3e0) >> 0w5 and imm9 = (wordValue andb 0wx1ff000) >> 0w12 val imm9Text = if imm9 > 0wxff then "-" ^ Word32.fmt StringCvt.DEC (0wx200 - imm9) else Word32.fmt StringCvt.DEC imm9 val (opcode, r) = case (size, v, opc) of (0w0, 0w0, 0w0) => ("strub", "w") | (0w0, 0w0, 0w1) => ("ldrub", "w") | (0w1, 0w0, 0w0) => ("struh", "w") | (0w1, 0w0, 0w1) => ("ldruh", "w") | (0w2, 0w0, 0w0) => ("stur", "w") | (0w2, 0w0, 0w1) => ("ldur", "w") | (0w3, 0w0, 0w0) => ("stur", "x") | (0w3, 0w0, 0w1) => ("ldur", "x") | (0w2, 0w1, 0w0) => ("stur", "s") | (0w2, 0w1, 0w1) => ("ldur", "s") | (0w3, 0w1, 0w0) => ("stur", "d") | (0w3, 0w1, 0w1) => ("ldur", "d") | _ => ("???", "?") in printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rT); printStream ",[x"; printStream(Word32.fmt StringCvt.DEC rN); printStream ",#"; printStream imm9Text; printStream "]" end else if (wordValue andb 0wx3b200c00) = 0wx38000400 then (* Load/store immediate post-indexed *) let val size = wordValue >> 0w30 and v = (wordValue >> 0w26) andb 0w1 and opc = (wordValue >> 0w22) andb 0w3 val rT = wordValue andb 0wx1f and rN = (wordValue andb 0wx3e0) >> 0w5 and imm9 = (wordValue andb 0wx1ff000) >> 0w12 val imm9Text = if imm9 > 0wxff then "-" ^ Word32.fmt StringCvt.DEC (0wx200 - imm9) else Word32.fmt StringCvt.DEC imm9 val (opcode, r) = case (size, v, opc) of (0w0, 0w0, 0w0) => ("strb", "w") | (0w0, 0w0, 0w1) => ("ldrb", "w") + | (0w2, 0w0, 0w0) => ("str", "w") + | (0w2, 0w0, 0w1) => ("ldr", "w") | (0w3, 0w0, 0w0) => ("str", "x") | (0w3, 0w0, 0w1) => 
("ldr", "x") | _ => ("???", "?") in printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rT); printStream ",[x"; printStream(Word32.fmt StringCvt.DEC rN); printStream "],#"; printStream imm9Text end else if (wordValue andb 0wx3b200c00) = 0wx38000c00 then (* Load/store immediate pre-indexed *) let val size = wordValue >> 0w30 and v = (wordValue >> 0w26) andb 0w1 and opc = (wordValue >> 0w22) andb 0w3 val rT = wordValue andb 0wx1f and rN = (wordValue andb 0wx3e0) >> 0w5 and imm9 = (wordValue andb 0wx1ff000) >> 0w12 val imm9Text = if imm9 > 0wxff then "-" ^ Word32.fmt StringCvt.DEC (0wx200 - imm9) else Word32.fmt StringCvt.DEC imm9 val (opcode, r) = case (size, v, opc) of (0w0, 0w0, 0w0) => ("strb", "w") | (0w0, 0w0, 0w1) => ("ldrb", "w") + | (0w2, 0w0, 0w0) => ("str", "w") + | (0w2, 0w0, 0w1) => ("ldr", "w") | (0w3, 0w0, 0w0) => ("str", "x") | (0w3, 0w0, 0w1) => ("ldr", "x") | _ => ("???", "?") in printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rT); printStream ",[x"; printStream(Word32.fmt StringCvt.DEC rN); printStream ",#"; printStream imm9Text; printStream "]!" end else if (wordValue andb 0wx3b200c00) = 0wx38200800 then (* Load/store with register offset i.e. an index register. *) let val size = wordValue >> 0w30 and v = (wordValue >> 0w26) andb 0w1 and opc = (wordValue >> 0w22) andb 0w3 val rT = wordValue andb 0wx1f and rN = (wordValue >> 0w5) andb 0wx1f and rM = (wordValue >> 0w16) andb 0wx1f val option = (wordValue >> 0w13) andb 0w7 val s = (wordValue andb 0wx1000) <> 0w0 val (opcode, r) = case (size, v, opc) of (0w0, 0w0, 0w0) => ("strb", "w") | (0w0, 0w0, 0w1) => ("ldrb", "w") | (0w1, 0w0, 0w0) => ("strh", "w") | (0w1, 0w0, 0w1) => ("ldrh", "w") | (0w2, 0w0, 0w0) => ("str", "w") | (0w2, 0w0, 0w1) => ("ldr", "w") | (0w3, 0w0, 0w0) => ("str", "x") | (0w3, 0w0, 0w1) => ("ldr", "x") | (0w2, 0w1, 0w0) => ("str", "s") | (0w2, 0w1, 0w1) => ("ldr", "s") | (0w3, 0w1, 0w0) => ("str", "d") | (0w3, 0w1, 0w1) => ("ldr", "d") | _ => ("???", "?") val (extend, xr) = case option of 0w2 => (" uxtw", "w") | 0w3 => if s then (" lsl", "x") else ("", "x") | 0w6 => (" sxtw", "w") | 0w7 => (" sxtx", "x") | _ => ("?", "?") val indexShift = case (size, s) of (0w0, true) => " #1" | (0w1, true) => " #1" | (0w2, true) => " #2" | (0w3, true) => " #3" | _ => "" in printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rT); printStream ",[x"; printStream(Word32.fmt StringCvt.DEC rN); printStream ","; printStream xr; printStream(Word32.fmt StringCvt.DEC rM); printStream extend; printStream indexShift; printStream "]" end else if (wordValue andb 0wx3f000000) = 0wx08000000 then (* Loads and stores with special ordering. 
*) let val size = (wordValue >> 0w30) andb 0w3 and o2 = (wordValue >> 0w23) andb 0w1 and l = (wordValue >> 0w22) andb 0w1 and o1 = (wordValue >> 0w21) andb 0w1 and o0 = (wordValue >> 0w15) andb 0w1 val rT = wordValue andb 0wx1f and rN = (wordValue >> 0w5) andb 0wx1f and rS = (wordValue >> 0w16) andb 0wx1f val (opcode, r) = case (size, o2, l, o1, o0) of (0w3, 0w1, 0w1, 0w0, 0w1) => ("ldar", "x") | (0w3, 0w1, 0w0, 0w0, 0w1) => ("stlr", "x") + | (0w2, 0w1, 0w1, 0w0, 0w1) => ("ldar", "w") + | (0w2, 0w1, 0w0, 0w0, 0w1) => ("stlr", "w") | (0w3, 0w0, 0w1, 0w0, 0w1) => ("ldaxr", "x") | (0w3, 0w0, 0w0, 0w0, 0w1) => ("stlxr", "x") | _ => ("??", "?") in printStream opcode; printStream "\t"; if opcode = "stlxr" then (printStream "w"; printStream(Word32.fmt StringCvt.DEC rS); printStream ",") else (); printStream r; printStream(Word32.fmt StringCvt.DEC rT); printStream ",[x"; printStream(Word32.fmt StringCvt.DEC rN); printStream "]" end else if (wordValue andb 0wxbf800000) = 0wx91000000 then let (* Add/Subtract a 12-bit immediate with possible shift. *) val rD = wordValue andb 0wx1f and rN = (wordValue andb 0wx3e0) >> 0w5 and imm12 = (wordValue andb 0wx3ffc00) >> 0w10 and shiftBit = wordValue andb 0wx400000 val imm = if shiftBit <> 0w0 then imm12 << 0w12 else imm12 val opr = if (wordValue andb 0wx40000000) = 0w0 then "add" else "sub" in printStream opr; printStream "\t"; prXRegOrSP rD; printStream ","; prXRegOrSP rN; printStream ",#"; printStream(Word32.fmt StringCvt.DEC imm) end else if (wordValue andb 0wxff800000) = 0wxF1000000 then let (* Subtract a 12-bit immediate with possible shift, setting flags. *) val rD = wordValue andb 0wx1f and rN = (wordValue andb 0wx3e0) >> 0w5 and imm12 = (wordValue andb 0wx3ffc00) >> 0w10 and shiftBit = wordValue andb 0wx400000 val imm = if shiftBit <> 0w0 then imm12 << 0w12 else imm12 in if rD = 0w31 then printStream "cmp\t" else (printStream "subs\t"; prXReg rD; printStream ","); prXRegOrSP rN; printStream ",#"; printStream(Word32.fmt StringCvt.DEC imm) end else if (wordValue andb 0wx7fe0ffe0) = 0wx2A0003E0 then (* Move reg,reg. This is a subset of ORR shifted register. *) let val reg = if (wordValue andb 0wx80000000) <> 0w0 then "x" else "w" in printStream "mov\t"; printStream reg; printStream(Word32.fmt StringCvt.DEC(wordValue andb 0wx1f)); printStream ","; printStream reg; printStream(Word32.fmt StringCvt.DEC((wordValue >> 0w16) andb 0wx1f)) end else if (wordValue andb 0wx1f000000) = 0wx0A000000 then let (* Logical operations with shifted register. *) val rD = wordValue andb 0wx1f and rN = (wordValue >> 0w5) andb 0wx1f and rM = (wordValue >> 0w16) andb 0wx1f and imm6 = (wordValue >> 0w10) andb 0wx3f and shiftCode = (wordValue >> 0w22) andb 0wx3 val opc = (wordValue >> 0w29) andb 0wx3 val nBit = (wordValue >> 0w21) andb 0w1 val reg = if (wordValue andb 0wx80000000) <> 0w0 then "x" else "w" val opcode = case (opc, nBit) of (0w0, 0w0) => "and" | (0w1, 0w0) => "orr" | (0w2, 0w0) => "eor" | (0w3, 0w0) => "ands" | _ => "??" in printStream opcode; printStream"\t"; printStream reg; printStream(Word32.fmt StringCvt.DEC rD); printStream ","; printStream reg; printStream(Word32.fmt StringCvt.DEC rN); printStream ","; printStream reg; printStream(Word32.fmt StringCvt.DEC rM); if imm6 <> 0w0 then ( case shiftCode of 0w0 => printStream ",lsl #" | 0w1 => printStream ",lsr #" | 0w2 => printStream ",asr #" | _ => printStream ",?? 
#"; printStream(Word32.fmt StringCvt.DEC imm6) ) else () end else if (wordValue andb 0wx1f200000) = 0wx0B000000 then let (* Add/subtract shifted register. *) val rD = wordValue andb 0wx1f and rN = (wordValue >> 0w5) andb 0wx1f and rM = (wordValue >> 0w16) andb 0wx1f and imm6 = (wordValue >> 0w10) andb 0wx3f and shiftCode = (wordValue >> 0w22) andb 0wx3 val oper = (wordValue andb 0wx40000000) = 0w0 val isS = (wordValue andb 0wx20000000) <> 0w0 val pReg = if (wordValue andb 0wx80000000) <> 0w0 then prXReg else prWReg in if isS andalso rD = 0w31 then printStream(if oper then "cmn\t" else "cmp\t") else ( printStream(if oper then "add" else "sub"); printStream(if isS then "s\t" else "\t"); pReg rD; printStream "," ); pReg rN; printStream ","; pReg rM; if imm6 <> 0w0 then ( case shiftCode of 0w0 => printStream ",lsl #" | 0w1 => printStream ",lsr #" | 0w2 => printStream ",asr #" | _ => printStream ",?? #"; printStream(Word32.fmt StringCvt.DEC imm6) ) else () end else if (wordValue andb 0wx1fe00000) = 0wx0b200000 then let (* Add/subtract extended register. *) val rD = wordValue andb 0wx1f and rN = (wordValue >> 0w5) andb 0wx1f and rM = (wordValue >> 0w16) andb 0wx1f and extend = (wordValue >> 0w13) andb 0w7 and amount = (wordValue >> 0w10) andb 0w7 and sf = (wordValue >> 0w31) andb 0w1 and p = (wordValue >> 0w30) andb 0w1 and s = (wordValue >> 0w29) andb 0w1 in if s = 0w1 andalso rD = 0w31 then printStream(if p = 0w0 then "cmn\t" else "cmp\t") else ( printStream(if p = 0w0 then "add" else "sub"); printStream(if s = 0w1 then "s\t" else "\t"); (if sf = 0w1 then prXRegOrSP else prWRegOrSP) rD; printStream "," ); (if sf = 0w1 then prXRegOrSP else prWRegOrSP) rN; printStream ","; (if extend = 0w3 orelse extend = 0w7 then prXReg else prWReg) rM; case extend of 0w0 => printStream ",uxtb" | 0w1 => printStream ",uxth" | 0w2 => if amount = 0w0 andalso sf = 0w0 then () else printStream ",uxtw" | 0w3 => if amount = 0w0 andalso sf = 0w1 then () else printStream ",uxtx" | 0w4 => printStream ",sxtb" | 0w5 => printStream ",sxth" | 0w6 => printStream ",sxtw" | 0w7 => printStream ",sxtx" | _ => printStream "?"; if amount <> 0w0 then printStream(" #" ^ Word32.fmt StringCvt.DEC amount) else () end else if (wordValue andb 0wxbf000000) = 0wx18000000 then let (* Load from a PC-relative address. This may refer to the address constant area or the non-address constant area. *) val rT = wordValue andb 0wx1f val s = (wordValue >> 0w30) andb 0w1 (* The offset is in 32-bit words *) val byteAddr = word32ToWord(((wordValue andb 0wx00ffffe0) >> (0w5-0w2))) + byteNo val wordAddr = byteAddr div wordSize (* We must NOT use codeVecGetWord if this is in the non-address area. It may well not be a tagged value. *) local fun getConstant(cVal, 0w0) = cVal | getConstant(cVal, offset) = let val byteVal = Word64.fromLarge(Word8.toLarge(codeVecGet (codeVec, byteAddr+offset-0w1))) in getConstant(Word64.orb(Word64.<<(cVal, 0w8), byteVal), offset-0w1) end in val constantValue = if wordAddr <= wordsOfCode then "0x" ^ Word64.toString(getConstant(0w0, 0w8)) (* It's a non-address constant *) else stringOfWord(codeVecGetWord(codeVec, wordAddr)) end in printStream "ldr\t"; printStream (if s = 0w0 then "w" else "x"); printStream(Word32.fmt StringCvt.DEC rT); printStream ",0x"; printStream(Word.fmt StringCvt.HEX byteAddr); printStream "\t// "; printStream constantValue end else if (wordValue andb 0wxbf000000) = 0wx10000000 then let (* Put a pc-relative address into a register. 
*) val rT = wordValue andb 0wx1f val byteOffset = - ((wordValue andb 0wx00ffffe0) << (Word.fromInt Word.wordSize - 0w23) ~>> - (Word.fromInt Word.wordSize - 0w20)) + ((wordValue >> 0w29) andb 0w3) + ((wordValue andb 0wx00ffffe0) << (Word.fromInt Word32.wordSize - 0w23) ~>> + (Word.fromInt Word32.wordSize - 0w20)) + ((wordValue >> 0w29) andb 0w3) in printStream "adr\tx"; printStream(Word32.fmt StringCvt.DEC rT); - printStream ",0x"; printStream(Word.fmt StringCvt.HEX (byteNo+word32ToWord byteOffset)) + printStream ",0x"; printStream(Word32.fmt StringCvt.HEX (wordToWord32 byteNo+byteOffset)) end else if (wordValue andb 0wxfc000000) = 0wx14000000 then (* Unconditional branch. *) let (* The offset is signed and the destination may be earlier. *) val byteOffset = - (wordValue andb 0wx03ffffff) << (Word.fromInt Word.wordSize - 0w26) ~>> - (Word.fromInt Word.wordSize - 0w28) + (wordValue andb 0wx03ffffff) << (Word.fromInt Word32.wordSize - 0w26) ~>> + (Word.fromInt Word32.wordSize - 0w28) in printStream "b\t0x"; - printStream(Word.fmt StringCvt.HEX (byteNo+word32ToWord byteOffset)) + printStream(Word32.fmt StringCvt.HEX (wordToWord32 byteNo + byteOffset)) end else if (wordValue andb 0wxff000000) = 0wx54000000 then (* Conditional branch *) let val byteOffset = - (wordValue andb 0wx00ffffe0) << (Word.fromInt Word.wordSize - 0w24) ~>> - (Word.fromInt Word.wordSize - 0w21) + (wordValue andb 0wx00ffffe0) << (Word.fromInt Word32.wordSize - 0w24) ~>> + (Word.fromInt Word32.wordSize - 0w21) in printStream "b."; printCondition(wordValue andb 0wxf); printStream "\t0x"; - printStream(Word.fmt StringCvt.HEX (byteNo+word32ToWord byteOffset)) + printStream(Word32.fmt StringCvt.HEX (wordToWord32 byteNo+byteOffset)) end else if (wordValue andb 0wx7e000000) = 0wx34000000 then (* Compare and branch *) let val byteOffset = - (wordValue andb 0wx00ffffe0) << (Word.fromInt Word.wordSize - 0w24) ~>> - (Word.fromInt Word.wordSize - 0w21) + (wordValue andb 0wx00ffffe0) << (Word.fromInt Word32.wordSize - 0w24) ~>> + (Word.fromInt Word32.wordSize - 0w21) val oper = if (wordValue andb 0wx01000000) = 0w0 then "cbz" else "cbnz" val r = if (wordValue andb 0wx80000000) = 0w0 then "w" else "x" in printStream oper; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC (wordValue andb 0wx1f)); printStream ",0x"; - printStream(Word.fmt StringCvt.HEX (byteNo+word32ToWord byteOffset)) + printStream(Word32.fmt StringCvt.HEX (wordToWord32 byteNo+byteOffset)) end else if (wordValue andb 0wx7e000000) = 0wx36000000 then (* Test bit and branch *) let val byteOffset = (wordValue andb 0wx00ffffe0) << (Word.fromInt Word.wordSize - 0w19) ~>> (Word.fromInt Word.wordSize - 0w16) val oper = if (wordValue andb 0wx01000000) = 0w0 then "tbz" else "tbnz" val b40 = (wordValue >> 0w19) andb 0wx1f val bitNo = b40 orb ((wordValue >> 0w26) andb 0wx20) val r = if bitNo < 0w32 then "w" else "x" in printStream oper; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC (wordValue andb 0wx1f)); printStream ",#"; printStream(Word32.fmt StringCvt.DEC bitNo); printStream ",0x"; printStream(Word.fmt StringCvt.HEX (byteNo+word32ToWord byteOffset)) end else if (wordValue andb 0wx3fe00000) = 0wx1A800000 then let val sf = wordValue >> 0w31 val opc = (wordValue >> 0w30) andb 0w1 val op2 = (wordValue >> 0w10) andb 0w3 val rT = wordValue andb 0wx1f val rN = (wordValue >> 0w5) andb 0wx1f val rM = (wordValue >> 0w16) andb 0wx1f val cond = (wordValue >> 0w12) andb 0wxf val opcode = case (opc, op2) of (0w0, 0w0) => "csel" | (0w0, 0w1) => 
"csinc" | (0w1, 0w0) => "csinv" | (0w1, 0w1) => "csneg" | _ => "??" val r = if sf = 0w0 then "w" else "x" in printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rT); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rN); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rM); printStream ","; printCondition cond end else if (wordValue andb 0wx7f800000) = 0wx13000000 then (* signed bitfield *) let val sf = wordValue >> 0w31 (* N is always the same as sf. *) (*val nBit = (wordValue >> 0w22) andb 0w1*) val immr = (wordValue >> 0w16) andb 0wx3f val imms = (wordValue >> 0w10) andb 0wx3f val rN = (wordValue >> 0w5) andb 0wx1f val rD = wordValue andb 0wx1f val (r, wordSize) = if sf = 0w0 then ("w", 0w32) else ("x", 0w64) in if imms = wordSize - 0w1 then printStream "asr\t" else printStream "sbfm\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rD); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rN); if imms = wordSize - 0w1 then (printStream ",#0x"; printStream(Word32.toString immr)) else ( printStream ",#0x"; printStream(Word32.toString immr); printStream ",#0x"; printStream(Word32.toString imms) ) end else if (wordValue andb 0wx7f800000) = 0wx53000000 then (* unsigned bitfield move *) let val sf = wordValue >> 0w31 (* N is always the same as sf. *) (*val nBit = (wordValue >> 0w22) andb 0w1*) val immr = (wordValue >> 0w16) andb 0wx3f val imms = (wordValue >> 0w10) andb 0wx3f val rN = (wordValue >> 0w5) andb 0wx1f val rD = wordValue andb 0wx1f val (r, wordSize) = if sf = 0w0 then ("w", 0w32) else ("x", 0w64) in if imms + 0w1 = immr then printStream "lsl\t" else if imms = wordSize - 0w1 then printStream "lsr\t" else printStream "ubfm\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rD); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rN); if imms + 0w1 = immr then (printStream ",#0x"; printStream(Word32.toString(wordSize - immr))) else if imms = wordSize - 0w1 then (printStream ",#0x"; printStream(Word32.toString immr)) else ( printStream ",#0x"; printStream(Word32.toString immr); printStream ",#0x"; printStream(Word32.toString imms) ) end else if (wordValue andb 0wx1f800000) = 0wx12000000 then (* logical immediate *) let val sf = wordValue >> 0w31 val opc = (wordValue >> 0w29) andb 0w3 val nBit = (wordValue >> 0w22) andb 0w1 val immr = (wordValue >> 0w16) andb 0wx3f val imms = (wordValue >> 0w10) andb 0wx3f val rN = (wordValue >> 0w5) andb 0wx1f val rD = wordValue andb 0wx1f val (opcode, r) = case (sf, opc, nBit) of (0w0, 0w0, 0w0) => ("and", "w") | (0w0, 0w1, 0w0) => ("orr", "w") | (0w0, 0w2, 0w0) => ("eor", "w") | (0w0, 0w3, 0w0) => ("ands", "w") | (0w1, 0w0, _) => ("and", "x") | (0w1, 0w1, _) => ("orr", "x") | (0w1, 0w2, _) => ("eor", "x") | (0w1, 0w3, _) => ("ands", "x") | _ => ("??", "?") in printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rD); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rN); printStream ",#0x"; printStream(Word64.toString(decodeBitPattern{sf=sf, n=nBit, immr=immr, imms=imms})) end else if (wordValue andb 0wx5fe00000) = 0wx1ac00000 then (* Two source operations - shifts and divide. 
*) let val sf = wordValue >> 0w31 val s = (wordValue >> 0w29) andb 0w1 val rM = (wordValue >> 0w16) andb 0wx1f val opcode = (wordValue >> 0w10) andb 0wx3f val rN = (wordValue >> 0w5) andb 0wx1f val rD = wordValue andb 0wx1f val (oper, r) = case (sf, s, opcode) of (0w1, 0w0, 0wx2) => ("udiv", "x") | (0w1, 0w0, 0wx3) => ("sdiv", "x") | (0w1, 0w0, 0wx8) => ("lsl", "x") | (0w1, 0w0, 0wx9) => ("lsr", "x") | (0w1, 0w0, 0wxa) => ("asr", "x") | _ => ("??", "?") in printStream oper; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rD); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rN); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rM) end else if (wordValue andb 0wx1f000000) = 0wx1b000000 then (* Three source operations - multiply add/subtract. *) let val sf = wordValue >> 0w31 val op54 = (wordValue >> 0w29) andb 0w3 val op31 = (wordValue >> 0w21) andb 0w7 val o0 = (wordValue >> 0w15) andb 0w1 val rM = (wordValue >> 0w16) andb 0wx1f val rA = (wordValue >> 0w10) andb 0wx1f val rN = (wordValue >> 0w5) andb 0wx1f val rD = wordValue andb 0wx1f val (oper, r) = case (sf, op54, op31, o0, rA) of (0w1, 0w0, 0w0, 0w0, 0w31) => ("mul", "x") | (0w1, 0w0, 0w0, 0w0, _) => ("madd", "x") | (0w1, 0w0, 0w0, 0w1, 0w31) => ("mneg", "x") | (0w1, 0w0, 0w0, 0w1, _) => ("msub", "x") | (0w1, 0w0, 0w2, 0w0, 0w31) => ("smulh", "x") | _ => ("??", "?") in printStream oper; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rD); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rN); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rM); if rA = 0w31 then () else (printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rA)) end else if (wordValue andb 0wxffe0fc00) = 0wxC800FC00 then let val rS = (wordValue >> 0w16) andb 0wx1f val rN = (wordValue >> 0w5) andb 0wx1f val rT = wordValue andb 0wx1f in printStream "stlxr\tw"; printStream(Word32.fmt StringCvt.DEC rS); printStream ",x"; printStream(Word32.fmt StringCvt.DEC rT); printStream ",[x"; printStream(Word32.fmt StringCvt.DEC rN); printStream "]" end else if (wordValue andb 0wx7f20fc00) = 0wx1E200000 then (* Moves between floating point and general regs.
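   This group covers fmov between general and floating point registers, scvtf (integer to floating point) and the fcvt* conversions (floating point to integer with the various roundings), in both the 32-bit (w/s) and 64-bit (x/d) forms.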
*) let val sf = (wordValue >> 0w31) andb 0w1 and s = (wordValue >> 0w29) andb 0w1 and ptype = (wordValue >> 0w22) andb 0w3 and mode = (wordValue >> 0w19) andb 0w3 and opcode = (wordValue >> 0w16) andb 0w7 and rN = (wordValue >> 0w5) andb 0wx1f and rD = wordValue andb 0wx1f val (opc, dr, nr) = case (sf, s, ptype, mode, opcode) of (0w0, 0w0, 0w0, 0w0, 0w7) => ("fmov", "s", "w") (* w -> s *) | (0w0, 0w0, 0w0, 0w0, 0w6) => ("fmov", "w", "s") (* s -> w *) | (0w1, 0w0, 0w1, 0w0, 0w7) => ("fmov", "d", "x") (* d -> x *) | (0w1, 0w0, 0w1, 0w0, 0w6) => ("fmov", "x", "d") (* x -> d *) + | (0w0, 0w0, 0w0, 0w0, 0w2) => ("scvtf", "w", "s") + | (0w0, 0w0, 0w1, 0w0, 0w2) => ("scvtf", "w", "d") | (0w1, 0w0, 0w0, 0w0, 0w2) => ("scvtf", "x", "s") | (0w1, 0w0, 0w1, 0w0, 0w2) => ("scvtf", "x", "d") - | (0w1, 0w0, 0w0, 0w0, 0w4) => ("fcvtas", "w", "s") (* s -> w *) - | (0w1, 0w0, 0w0, 0w2, 0w0) => ("fcvtms", "w", "s") (* s -> w *) - | (0w1, 0w0, 0w0, 0w1, 0w0) => ("fcvtps", "w", "s") (* s -> w *) - | (0w1, 0w0, 0w0, 0w3, 0w0) => ("fcvtzs", "w", "s") (* s -> w *) - | (0w1, 0w0, 0w1, 0w0, 0w4) => ("fcvtas", "x", "s") (* s -> x *) - | (0w1, 0w0, 0w1, 0w2, 0w0) => ("fcvtms", "x", "s") (* s -> x *) - | (0w1, 0w0, 0w1, 0w1, 0w0) => ("fcvtps", "x", "s") (* s -> x *) - | (0w1, 0w0, 0w1, 0w3, 0w0) => ("fcvtzs", "x", "s") (* s -> x *) + + | (0w0, 0w0, 0w0, 0w0, 0w4) => ("fcvtas", "w", "s") (* s -> w *) + | (0w0, 0w0, 0w0, 0w2, 0w0) => ("fcvtms", "w", "s") (* s -> w *) + | (0w0, 0w0, 0w0, 0w1, 0w0) => ("fcvtps", "w", "s") (* s -> w *) + | (0w0, 0w0, 0w0, 0w3, 0w0) => ("fcvtzs", "w", "s") (* s -> w *) + | (0w0, 0w0, 0w1, 0w0, 0w4) => ("fcvtas", "w", "d") (* d -> w *) + | (0w0, 0w0, 0w1, 0w2, 0w0) => ("fcvtms", "w", "d") (* d -> w *) + | (0w0, 0w0, 0w1, 0w1, 0w0) => ("fcvtps", "w", "d") (* d -> w *) + | (0w0, 0w0, 0w1, 0w3, 0w0) => ("fcvtzs", "w", "d") (* d -> w *) + + | (0w1, 0w0, 0w0, 0w0, 0w4) => ("fcvtas", "x", "s") (* s -> x *) + | (0w1, 0w0, 0w0, 0w2, 0w0) => ("fcvtms", "x", "s") (* s -> x *) + | (0w1, 0w0, 0w0, 0w1, 0w0) => ("fcvtps", "x", "s") (* s -> x *) + | (0w1, 0w0, 0w0, 0w3, 0w0) => ("fcvtzs", "x", "s") (* s -> x *) + | (0w1, 0w0, 0w1, 0w0, 0w4) => ("fcvtas", "x", "d") (* d -> x *) + | (0w1, 0w0, 0w1, 0w2, 0w0) => ("fcvtms", "x", "d") (* d -> x *) + | (0w1, 0w0, 0w1, 0w1, 0w0) => ("fcvtps", "x", "d") (* d -> x *) + | (0w1, 0w0, 0w1, 0w3, 0w0) => ("fcvtzs", "x", "d") (* d -> x *) | _ => ("?", "?", "?") in printStream opc; printStream "\t"; printStream dr; printStream(Word32.fmt StringCvt.DEC rD); printStream ","; printStream nr; printStream(Word32.fmt StringCvt.DEC rN) end else if (wordValue andb 0wxff200c00) = 0wx1E200800 then (* Floating point two source operations. *) let val pt = (wordValue >> 0w22) andb 0w3 and rM = (wordValue >> 0w16) andb 0wx1f and opc = (wordValue >> 0w12) andb 0wxf and rN = (wordValue >> 0w5) andb 0wx1f and rT = wordValue andb 0wx1f val (opcode, r) = case (pt, opc) of (0w0, 0wx0) => ("fmul", "s") | (0w0, 0wx1) => ("fdiv", "s") | (0w0, 0wx2) => ("fadd", "s") | (0w0, 0wx3) => ("fsub", "s") | (0w1, 0wx0) => ("fmul", "d") | (0w1, 0wx1) => ("fdiv", "d") | (0w1, 0wx2) => ("fadd", "d") | (0w1, 0wx3) => ("fsub", "d") | _ => ("??", "?") in printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rT); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rN); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rM) end else if (wordValue andb 0wxff207c00) = 0wx1E204000 then (* Floating point single source. 
*) let val pt = (wordValue >> 0w22) andb 0w3 and opc = (wordValue >> 0w15) andb 0wx3f and rN = (wordValue >> 0w5) andb 0wx1f and rT = wordValue andb 0wx1f val (opcode, rS, rD) = case (pt, opc) of (0w0, 0wx0) => ("fmov", "s", "s") | (0w0, 0wx1) => ("fabs", "s", "s") | (0w0, 0wx2) => ("fneg", "s", "s") | (0w0, 0wx5) => ("fcvt", "s", "d") | (0w1, 0wx0) => ("fmov", "d", "d") | (0w1, 0wx1) => ("fabs", "d", "d") | (0w1, 0wx2) => ("fneg", "d", "d") | (0w1, 0wx4) => ("fcvt", "d", "s") | _ => ("??", "?", "?") in printStream opcode; printStream "\t"; printStream rD; printStream(Word32.fmt StringCvt.DEC rT); printStream ","; printStream rS; printStream(Word32.fmt StringCvt.DEC rN) end else if (wordValue andb 0wxff20fc07) = 0wx1E202000 then (* Floating point comparison *) let val pt = (wordValue >> 0w22) andb 0w3 and rM = (wordValue >> 0w16) andb 0wx1f and rN = (wordValue >> 0w5) andb 0wx1f and opc = (wordValue >> 0w3) andb 0w3 val (opcode, r) = case (pt, opc) of (0w0, 0wx0) => ("fcmp", "s") | (0w1, 0wx0) => ("fcmp", "d") | (0w0, 0wx2) => ("fcmpe", "s") | (0w1, 0wx2) => ("fcmpe", "d") | _ => ("??", "?") in printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rN); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rM) end else if (wordValue andb 0wx1e000000) = 0wx02000000 then (* This is an unallocated range. We use it for the register mask. *) let fun printMask (0w25, _) = () | printMask (i, comma) = if ((0w1 << i) andb wordValue) <> 0w0 then ( if comma then printStream ", " else (); printStream "x"; printStream(Word.fmt StringCvt.DEC i); printMask(i+0w1, true) ) else printMask(i+0w1, comma) in printStream "["; printMask(0w0, false); printStream "]" end else printStream "?" ; printStream "\n" end fun printAll i = if i = numInstructions then () else (printWordAt i; printAll(i+0w1)) in printStream functionName; printStream ":\n"; printAll 0w0 end (* Adds the constants onto the code, and copies the code into a new segment *) fun generateCode {instrs, name=functionName, parameters, resultClosure} = let val printStream = Pretty.getSimplePrinter(parameters, []) and printAssemblyCode = Debug.getParameter Debug.assemblyCodeTag parameters local (* Extract the constants. *) fun getConsts(LoadAddressLiteral {value, ...}, (addrs, nonAddrs)) = (value::addrs, nonAddrs) | getConsts(LoadNonAddressLiteral {value, ...}, (addrs, nonAddrs)) = (addrs, value::nonAddrs) | getConsts(_, consts) = consts val (addrConsts, nonAddrConsts) = List.foldl getConsts ([], []) instrs in val addressConsts = List.rev addrConsts and nonAddressConsts = List.rev nonAddrConsts end val (byteVec, nativeWordsOfCode) = genCode(instrs, addressConsts, nonAddressConsts) val wordsOfCode = nativeWordsOfCode * wordsPerNativeWord (* +3 for profile count, function name and constants count *) val numOfConst = List.length addressConsts val segSize = wordsOfCode + Word.fromInt numOfConst + 0w4 val firstConstant = wordsOfCode + 0w3 (* Add 3 for no of consts, fn name and profile count. *) (* Put in the number of constants. This must go in before we actually put in any constants. *) local val lastWord = segSize - 0w1 in val () = setWord(numOfConst + 2, wordsOfCode, byteVec) (* Set the last word of the code to the (negative) byte offset of the start of the code area from the end of this word. 
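   Presumably this allows the start of the code object to be recovered from its end.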
*) val () = setWord((numOfConst + 3) * ~(Word.toInt Address.wordSize), lastWord, byteVec) end (* Now we've filled in all the size info we need to convert the segment into a proper code segment before it's safe to put in any ML values. *) val codeVec = byteVecToCodeVec(byteVec, resultClosure) local val name : string = functionName val nameWord : machineWord = toMachineWord name in val () = codeVecPutWord (codeVec, wordsOfCode+0w1, nameWord) end (* Profile ref. A byte ref used by the profiler in the RTS. *) local val v = RunCall.allocateByteMemory(0w1, Word.fromLargeWord(Word8.toLargeWord(Word8.orb(F_mutable, F_bytes)))) fun clear 0w0 = () | clear i = (assignByte(v, i-0w1, 0w0); clear (i-0w1)) val () = clear(wordSize) in val () = codeVecPutWord (codeVec, wordsOfCode+0w2, toMachineWord v) end (* and then copy the constants from the constant list. *) local fun setConstant(value, num) = ( codeVecPutWord (codeVec, firstConstant + num, value); num+0w1 ) in val _ = List.foldl setConstant 0w0 addressConsts end in if printAssemblyCode then (* print out the code *) (printCode (codeVec, functionName, wordsOfCode, printStream); printStream"\n") else (); codeVecLock(codeVec, resultClosure) end (* copyCode *) structure Sharing = struct type closureRef = closureRef type instr = instr type xReg = xReg type vReg = vReg type labels = labels type condition = condition type shiftType = shiftType type wordSize = wordSize type 'a extend = 'a extend type scale = scale end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64Assembly.sig b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64Assembly.sig index c970dda3..134b55dc 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64Assembly.sig +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64Assembly.sig @@ -1,368 +1,374 @@ (* Copyright (c) 2021 David C. J. Matthews This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public Licence version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public Licence for more details. You should have received a copy of the GNU Lesser General Public Licence along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) signature Arm64Assembly = sig type closureRef type instr type machineWord = Address.machineWord type labels type condition (* XZero and XSP are both encoded as 31 but the interpretation depends on the instruction The datatype definition is included here to allow for pattern matching on XSP and XZero. *) datatype xReg = XReg of Word8.word | XZero | XSP and vReg = VReg of Word8.word val X0: xReg and X1: xReg and X2: xReg and X3: xReg and X4: xReg and X5: xReg and X6: xReg and X7: xReg and X8: xReg and X9: xReg and X10: xReg and X11: xReg and X12: xReg and X13: xReg and X14: xReg and X15: xReg and X16: xReg and X17: xReg and X18: xReg and X19: xReg and X20: xReg and X21: xReg and X22: xReg and X23: xReg and X24: xReg and X25: xReg and X26: xReg and X27: xReg and X28: xReg and X29: xReg and X30: xReg val X_MLHeapLimit: xReg (* ML Heap limit pointer *) and X_MLAssemblyInt: xReg (* ML assembly interface pointer. *) and X_MLHeapAllocPtr: xReg (* ML Heap allocation pointer. *) and X_MLStackPtr: xReg (* ML Stack pointer. 
*) and X_LinkReg: xReg (* Link reg - return address *) and X_Base32in64: xReg (* X24 is used for the heap base in 32-in-64. *) val V0: vReg and V1: vReg and V2: vReg and V3: vReg and V4: vReg and V5: vReg and V6: vReg and V7: vReg (* Condition for conditional branches etc. *) val condEqual: condition and condNotEqual: condition and condCarrySet: condition and condCarryClear: condition and condNegative: condition and condPositive: condition and condOverflow: condition and condNoOverflow: condition and condUnsignedHigher: condition and condUnsignedLowOrEq: condition and condSignedGreaterEq: condition and condSignedLess: condition and condSignedGreater: condition and condSignedLessEq: condition datatype shiftType = ShiftLSL of Word8.word | ShiftLSR of Word8.word | ShiftASR of Word8.word | ShiftNone datatype wordSize = WordSize32 | WordSize64 datatype 'a extend = ExtUXTB of 'a (* Unsigned extend byte *) | ExtUXTH of 'a (* Unsigned extend halfword *) | ExtUXTW of 'a (* Unsigned extend word *) | ExtUXTX of 'a (* Left shift *) | ExtSXTB of 'a (* Sign extend byte *) | ExtSXTH of 'a (* Sign extend halfword *) | ExtSXTW of 'a (* Sign extend word *) | ExtSXTX of 'a (* Left shift *) (* Load/store instructions have only a single bit for the shift. For byte operations this is one bit shift; for others it scales by the size of the operand if set. *) datatype scale = ScaleOrShift | NoScale (* Jump to the address in the register and put the address of the next instruction into X30. *) val branchAndLinkReg: xReg -> instr (* Jump to the address in the register. *) and branchRegister: xReg -> instr (* Jump to the address in the register and hint this is a return. *) and returnRegister: xReg -> instr (* Move an address constant to a register. *) val loadAddressConstant: xReg * machineWord -> instr (* Move a constant into a register that is not an address. The argument is the actual bit pattern to be copied. For tagged integers that means that the value must have been shifted and the tag bit set. *) and loadNonAddressConstant: xReg * Word64.word -> instr (* Move a value into a register. The immediate is 16 bits and the shift is 0, 16, 32, or 48. moveKeep affects only the specified 16 bits and leaves the remainder unchanged. *) val moveNot32: {regD: xReg, immediate: word, shift: word} -> instr and moveZero32: {regD: xReg, immediate: word, shift: word} -> instr and moveKeep32: {regD: xReg, immediate: word, shift: word} -> instr val moveNot: {regD: xReg, immediate: word, shift: word} -> instr and moveZero: {regD: xReg, immediate: word, shift: word} -> instr and moveKeep: {regD: xReg, immediate: word, shift: word} -> instr (* Add/subtract an optionally shifted 12-bit immediate (i.e. constant) to/from a register. The constant is zero-extended. *) val addImmediate: {regN: xReg, regD: xReg, immed: word, shifted: bool} -> instr and addSImmediate: {regN: xReg, regD: xReg, immed: word, shifted: bool} -> instr and subImmediate: {regN: xReg, regD: xReg, immed: word, shifted: bool} -> instr and subSImmediate: {regN: xReg, regD: xReg, immed: word, shifted: bool} -> instr (* Add/subtract a shifted register, optionally setting the flags.
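   For example, addShiftedReg{regM=X1, regN=X2, regD=X0, shift=ShiftLSL 0w3} is roughly add x0,x2,x1,lsl #3; the ...S variants also set the condition flags.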
*) val addShiftedReg: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr and addSShiftedReg: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr and subShiftedReg: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr and subSShiftedReg: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr and addShiftedReg32: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr and addSShiftedReg32: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr and subShiftedReg32: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr and subSShiftedReg32: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr (* Add/subtract an extended register, optionally setting the flags. *) val addExtendedReg: {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend} -> instr and addSExtendedReg: {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend} -> instr and subExtendedReg: {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend} -> instr and subSExtendedReg: {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend} -> instr (* Multiplication *) (* regD = regA + regN * regM *) val multiplyAndAdd: {regM: xReg, regN: xReg, regA: xReg, regD: xReg} -> instr (* regD = regA - regN * regM *) and multiplyAndSub: {regM: xReg, regN: xReg, regA: xReg, regD: xReg} -> instr (* Return the high-order part of a signed multiplication. *) and signedMultiplyHigh: {regM: xReg, regN: xReg, regD: xReg} -> instr (* Division *) val unsignedDivide: {regM: xReg, regN: xReg, regD: xReg} -> instr and signedDivide: {regM: xReg, regN: xReg, regD: xReg} -> instr (* Logical operations on a shifted register, optionally setting the flags. *) val andShiftedReg: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr and orrShiftedReg: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr and eorShiftedReg: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr and andsShiftedReg: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr (* And a register with a bit pattern, discarding the results but setting the condition codes. The bit pattern must be encodable. *) val testBitPattern: xReg * Word64.word -> instr (* Check whether a constant can be encoded. *) val isEncodableBitPattern: Word64.word * wordSize -> bool (* Load/Store an aligned word using a 12-bit offset. The offset is in units of the size of the operand. *) val loadRegScaled: {regT: xReg, regN: xReg, unitOffset: int} -> instr and storeRegScaled: {regT: xReg, regN: xReg, unitOffset: int} -> instr and loadRegScaledByte: {regT: xReg, regN: xReg, unitOffset: int} -> instr and storeRegScaledByte: {regT: xReg, regN: xReg, unitOffset: int} -> instr and loadRegScaled16: {regT: xReg, regN: xReg, unitOffset: int} -> instr and storeRegScaled16: {regT: xReg, regN: xReg, unitOffset: int} -> instr and loadRegScaled32: {regT: xReg, regN: xReg, unitOffset: int} -> instr and storeRegScaled32: {regT: xReg, regN: xReg, unitOffset: int} -> instr and loadRegScaledDouble: {regT: vReg, regN: xReg, unitOffset: int} -> instr and storeRegScaledDouble: {regT: vReg, regN: xReg, unitOffset: int} -> instr and loadRegScaledFloat: {regT: vReg, regN: xReg, unitOffset: int} -> instr and storeRegScaledFloat: {regT: vReg, regN: xReg, unitOffset: int} -> instr (* Load/Store a value using a signed byte offset. 
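   These are the unscaled (ldur/stur) forms taking a 9-bit signed byte offset, so for example loadRegUnscaled{regT=X0, regN=X1, byteOffset= ~8} reads the 64-bit word immediately below the address in X1.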
*) val loadRegUnscaled: {regT: xReg, regN: xReg, byteOffset: int} -> instr and storeRegUnscaled: {regT: xReg, regN: xReg, byteOffset: int} -> instr and loadRegUnscaledByte: {regT: xReg, regN: xReg, byteOffset: int} -> instr and storeRegUnscaledByte: {regT: xReg, regN: xReg, byteOffset: int} -> instr and loadRegUnscaled16: {regT: xReg, regN: xReg, byteOffset: int} -> instr and storeRegUnscaled16: {regT: xReg, regN: xReg, byteOffset: int} -> instr and loadRegUnscaled32: {regT: xReg, regN: xReg, byteOffset: int} -> instr and storeRegUnscaled32: {regT: xReg, regN: xReg, byteOffset: int} -> instr and loadRegUnscaledFloat: {regT: vReg, regN: xReg, byteOffset: int} -> instr and storeRegUnscaledFloat: {regT: vReg, regN: xReg, byteOffset: int} -> instr and loadRegUnscaledDouble: {regT: vReg, regN: xReg, byteOffset: int} -> instr and storeRegUnscaledDouble: {regT: vReg, regN: xReg, byteOffset: int} -> instr (* Load/store with a register offset i.e. an index register. *) val loadRegIndexed: {regN: xReg, regM: xReg, regT: xReg, option: scale extend} -> instr and storeRegIndexed: {regN: xReg, regM: xReg, regT: xReg, option: scale extend} -> instr and loadRegIndexedByte: {regN: xReg, regM: xReg, regT: xReg, option: scale extend} -> instr and storeRegIndexedByte: {regN: xReg, regM: xReg, regT: xReg, option: scale extend} -> instr and loadRegIndexed16: {regN: xReg, regM: xReg, regT: xReg, option: scale extend} -> instr and storeRegIndexed16: {regN: xReg, regM: xReg, regT: xReg, option: scale extend} -> instr and loadRegIndexed32: {regN: xReg, regM: xReg, regT: xReg, option: scale extend} -> instr and storeRegIndexed32: {regN: xReg, regM: xReg, regT: xReg, option: scale extend} -> instr and loadRegIndexedFloat: {regN: xReg, regM: xReg, regT: vReg, option: scale extend} -> instr and storeRegIndexedFloat: {regN: xReg, regM: xReg, regT: vReg, option: scale extend} -> instr and loadRegIndexedDouble: {regN: xReg, regM: xReg, regT: vReg, option: scale extend} -> instr and storeRegIndexedDouble: {regN: xReg, regM: xReg, regT: vReg, option: scale extend} -> instr (* Load/Store a value using a signed byte offset and post-indexing (post-increment). *) (* The terminology is confusing. Pre-indexing means adding the offset into base address before loading the value, typically used for push, and post-index means using the original value of the base register as the address and adding in the offset after the value has been loaded, e.g. pop. *) val loadRegPostIndex: {regT: xReg, regN: xReg, byteOffset: int} -> instr and storeRegPostIndex: {regT: xReg, regN: xReg, byteOffset: int} -> instr and loadRegPostIndex32: {regT: xReg, regN: xReg, byteOffset: int} -> instr and storeRegPostIndex32: {regT: xReg, regN: xReg, byteOffset: int} -> instr and loadRegPostIndexByte: {regT: xReg, regN: xReg, byteOffset: int} -> instr and storeRegPostIndexByte: {regT: xReg, regN: xReg, byteOffset: int} -> instr (* Load/Store a value using a signed byte offset and pre-indexing (pre-increment). *) val loadRegPreIndex: {regT: xReg, regN: xReg, byteOffset: int} -> instr and storeRegPreIndex: {regT: xReg, regN: xReg, byteOffset: int} -> instr + and loadRegPreIndex32: {regT: xReg, regN: xReg, byteOffset: int} -> instr + and storeRegPreIndex32: {regT: xReg, regN: xReg, byteOffset: int} -> instr and loadRegPreIndexByte: {regT: xReg, regN: xReg, byteOffset: int} -> instr and storeRegPreIndexByte: {regT: xReg, regN: xReg, byteOffset: int} -> instr (* Loads and stores with special ordering. 
*) val loadAcquire: {regN: xReg, regT: xReg} -> instr and storeRelease: {regN: xReg, regT: xReg} -> instr and loadAcquire32: {regN: xReg, regT: xReg} -> instr and storeRelease32: {regN: xReg, regT: xReg} -> instr (* This word is put in after a call to the RTS trap-handler. All the registers are saved and restored across a call to the trap-handler; the register mask contains those that may contain an address and so need to be scanned and possibly updated if there is a GC. *) val registerMask: xReg list -> instr (* Create a label. *) val createLabel: unit -> labels (* Put a label into the code. *) val setLabel: labels -> instr (* A conditional branch. *) val conditionalBranch: condition * labels -> instr (* Unconditional branch *) and unconditionalBranch: labels -> instr (* Put the address of a label into a register - used for handlers and cases. *) and loadLabelAddress: xReg * labels -> instr (* Test a bit in a register and branch if zero/nonzero *) and testBitBranchZero: xReg * Word8.word * labels -> instr and testBitBranchNonZero: xReg * Word8.word * labels -> instr (* Compare a register with zero and branch if zero/nonzero *) and compareBranchZero: xReg * wordSize * labels -> instr and compareBranchNonZero: xReg * wordSize * labels -> instr (* Set the destination register to the value of the first reg if the condition is true otherwise to a, possibly modified, version of the second argument. There are variants that set it unmodified, incremented, inverted and negated. *) val conditionalSet: {regD: xReg, regTrue: xReg, regFalse: xReg, cond: condition} -> instr val conditionalSetIncrement: {regD: xReg, regTrue: xReg, regFalse: xReg, cond: condition} -> instr val conditionalSetInverted: {regD: xReg, regTrue: xReg, regFalse: xReg, cond: condition} -> instr val conditionalSetNegated: {regD: xReg, regTrue: xReg, regFalse: xReg, cond: condition} -> instr (* Various shifts *) val logicalShiftLeft: {wordSize: wordSize, shift: word, regN: xReg, regD: xReg} -> instr and logicalShiftRight: {wordSize: wordSize, shift: word, regN: xReg, regD: xReg} -> instr and arithmeticShiftRight: {wordSize: wordSize, shift: word, regN: xReg, regD: xReg} -> instr (* Extract bits and set the rest of the register to zero. *) and unsignedBitfieldInsertinZeros: {wordSize: wordSize, lsb: word, width: word, regN: xReg, regD: xReg} -> instr (* Extract bits but leave the rest of the register unchanged. Can be used to clear a specific range of bits by using XZero as the source. *) and bitfieldInsert: {wordSize: wordSize, lsb: word, width: word, regN: xReg, regD: xReg} -> instr (* Logical shift left Rd = Rn << (Rm mod 0w64) *) val logicalShiftLeftVariable: {regM: xReg, regN: xReg, regD: xReg} -> instr (* Logical shift right Rd = Rn >> (Rm mod 0w64) *) and logicalShiftRightVariable: {regM: xReg, regN: xReg, regD: xReg} -> instr (* Arithmetic shift right Rd = Rn ~>> (Rm mod 0w64) *) and arithmeticShiftRightVariable: {regM: xReg, regN: xReg, regD: xReg} -> instr and logicalShiftLeftVariable32: {regM: xReg, regN: xReg, regD: xReg} -> instr (* Logical operations on bit patterns. The pattern must be valid. ANDS is an AND that also sets the flags, typically used for a test. 
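   testBitPattern above is the ANDS form that discards the result; for example testBitPattern(X0, 0w1) tests the tag bit of the value in X0.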
*) val bitwiseAndImmediate: {wordSize: wordSize, bits: Word64.word, regN: xReg, regD: xReg} -> instr and bitwiseOrImmediate: {wordSize: wordSize, bits: Word64.word, regN: xReg, regD: xReg} -> instr and bitwiseXorImmediate: {wordSize: wordSize, bits: Word64.word, regN: xReg, regD: xReg} -> instr and bitwiseAndSImmediate: {wordSize: wordSize, bits: Word64.word, regN: xReg, regD: xReg} -> instr (* Instructions involved in thread synchronisation. *) val yield: instr and dmbIsh: instr val loadAcquireExclusiveRegister: {regN: xReg, regT: xReg} -> instr val storeReleaseExclusiveRegister: {regN: xReg, regS: xReg, regT: xReg} -> instr (* Floating point moves and conversions. Moves simply copy the bits. The integer argument is signed: 64 bits, except for the Int32 variants which use 32 bits. *) val moveGeneralToDouble: {regN: xReg, regD: vReg} -> instr and moveGeneralToFloat: {regN: xReg, regD: vReg} -> instr and moveDoubleToGeneral: {regN: vReg, regD: xReg} -> instr and moveFloatToGeneral: {regN: vReg, regD: xReg} -> instr and convertIntToDouble: {regN: xReg, regD: vReg} -> instr and convertIntToFloat: {regN: xReg, regD: vReg} -> instr and convertFloatToInt: IEEEReal.rounding_mode -> {regN: vReg, regD: xReg} -> instr and convertDoubleToInt: IEEEReal.rounding_mode -> {regN: vReg, regD: xReg} -> instr + and convertInt32ToDouble: {regN: xReg, regD: vReg} -> instr + and convertInt32ToFloat: {regN: xReg, regD: vReg} -> instr + and convertFloatToInt32: IEEEReal.rounding_mode -> {regN: vReg, regD: xReg} -> instr + and convertDoubleToInt32: IEEEReal.rounding_mode -> {regN: vReg, regD: xReg} -> instr (* Floating point operations. *) val multiplyFloat: {regM: vReg, regN: vReg, regD: vReg} -> instr and divideFloat: {regM: vReg, regN: vReg, regD: vReg} -> instr and addFloat: {regM: vReg, regN: vReg, regD: vReg} -> instr and subtractFloat: {regM: vReg, regN: vReg, regD: vReg} -> instr and multiplyDouble: {regM: vReg, regN: vReg, regD: vReg} -> instr and divideDouble: {regM: vReg, regN: vReg, regD: vReg} -> instr and addDouble: {regM: vReg, regN: vReg, regD: vReg} -> instr and subtractDouble: {regM: vReg, regN: vReg, regD: vReg} -> instr val compareFloat: {regM: vReg, regN: vReg} -> instr and compareDouble: {regM: vReg, regN: vReg} -> instr val moveFloatToFloat: {regN: vReg, regD: vReg} -> instr and absFloat: {regN: vReg, regD: vReg} -> instr and negFloat: {regN: vReg, regD: vReg} -> instr and convertFloatToDouble: {regN: vReg, regD: vReg} -> instr and moveDoubleToDouble: {regN: vReg, regD: vReg} -> instr and absDouble: {regN: vReg, regD: vReg} -> instr and negDouble: {regN: vReg, regD: vReg} -> instr and convertDoubleToFloat: {regN: vReg, regD: vReg} -> instr (* Create the vector of code from the list of instructions and update the closure reference to point to it. *) val generateCode: {instrs: instr list, name: string, parameters: Universal.universal list, resultClosure: closureRef} -> unit (* Offsets in the assembly code interface pointed at by X26. These are in units of 64-bits NOT bytes.
*) val heapOverflowCallOffset: int and stackOverflowCallOffset: int and stackOverflowXCallOffset: int and exceptionHandlerOffset: int and stackLimitOffset: int and exceptionPacketOffset: int and threadIdOffset: int and heapLimitPtrOffset: int and heapAllocPtrOffset: int and mlStackPtrOffset: int val is32in64: bool structure Sharing: sig type closureRef = closureRef type instr = instr type xReg = xReg type vReg = vReg type labels = labels type condition = condition type shiftType = shiftType type wordSize = wordSize type 'a extend = 'a extend type scale = scale end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64GenCode.sml b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64GenCode.sml index c61c7fd3..d1605f6c 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64GenCode.sml +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64GenCode.sml @@ -1,2809 +1,2832 @@ (* Copyright (c) 2021 David C. J. Matthews This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public Licence version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public Licence for more details. You should have received a copy of the GNU Lesser General Public Licence along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64GenCode ( structure FallBackCG: GENCODESIG and BackendTree: BackendIntermediateCodeSig and CodeArray: CODEARRAYSIG and Arm64Assembly: Arm64Assembly and Debug: DEBUG and Arm64Foreign: FOREIGNCALLSIG and Arm64Sequences: Arm64Sequences sharing FallBackCG.Sharing = BackendTree.Sharing = CodeArray.Sharing = Arm64Assembly.Sharing = Arm64Sequences.Sharing ) : GENCODESIG = struct open BackendTree CodeArray Arm64Assembly Address Arm64Sequences open BuiltIns exception InternalError = Misc.InternalError exception Fallback of string (* tag a short constant *) fun tag c = 2 * c + 1 and semitag c = 2*c fun taggedWord w: word = w * 0w2 + 0w1 and taggedWord64 w: Word64.word = w * 0w2 + 0w1 val tagBitMask = Word64.<<(Word64.fromInt ~1, 0w1) fun gen(instr, code) = code := instr :: !code fun genList([], _) = () | genList(instr :: instrs, code) = (gen(instr, code); genList(instrs, code)) fun genPushReg(reg, code) = gen(storeRegPreIndex{regT=reg, regN=X_MLStackPtr, byteOffset= ~8}, code) and genPopReg(reg, code) = gen(loadRegPostIndex{regT=reg, regN=X_MLStackPtr, byteOffset= 8}, code) (* Load a value using a scaled offset. This uses a normal scaled load if the offset is in the range and an indexed offset if it is not. *) fun loadScaledOffset(scale, loadScaled, loadIndexed) {base, dest, work, offset} = if offset < 0 then raise InternalError "loadScaledOffset: negative offset" else if offset < 0x1000 then [loadScaled{regT=dest, regN=base, unitOffset=offset}] else loadNonAddress(work, Word64.fromInt offset) @ [loadIndexed{regN=base, regM=work, regT=dest, option=ExtUXTX(if scale = 1 then NoScale else ScaleOrShift)}] (* Similar store. 
*) and storeScaledOffset(scale, storeScaled, storeIndexed) {base, store, work, offset} = if offset < 0 then raise InternalError "storeScaledOffset: negative offset" else if offset < 0x1000 then [storeScaled{regT=store, regN=base, unitOffset=offset}] else loadNonAddress(work, Word64.fromInt offset) @ [storeIndexed{regN=base, regM=work, regT=store, option=ExtUXTX(if scale = 1 then NoScale else ScaleOrShift)}] val loadScaledWord = loadScaledOffset(8, loadRegScaled, loadRegIndexed) and storeScaledWord = storeScaledOffset(8, storeRegScaled, storeRegIndexed) val loadScaledPolyWord = if is32in64 then loadScaledOffset(4, loadRegScaled32, loadRegIndexed32) else loadScaledOffset(8, loadRegScaled, loadRegIndexed) and storeScaledPolyWord = if is32in64 then storeScaledOffset(4, storeRegScaled32, storeRegIndexed32) else storeScaledOffset(8, storeRegScaled, storeRegIndexed) (* Add a constant word to the source register and put the result in the destination. regW is used as a work register if necessary. This is used both for addition and subtraction. *) fun addConstantWord({regS, regD, value=0w0, ...}, code) = if regS = regD then () else gen(moveRegToReg{sReg=regS, dReg=regD}, code) | addConstantWord({regS, regD, regW, value}, code) = let (* If we have to load the constant it's better if the top 32-bits are zero if possible. *) val (isSub, unsigned) = if value > Word64.<<(0w1, 0w63) then (true, ~ value) else (false, value) in if unsigned < Word64.<<(0w1, 0w24) then (* We can put up to 24 bits in a shifted and an unshifted constant. *) let val w = Word.fromLarge(Word64.toLarge unsigned) val high = Word.andb(Word.>>(w, 0w12), 0wxfff) val low = Word.andb(w, 0wxfff) val addSub = if isSub then subImmediate else addImmediate in if high <> 0w0 then ( gen(addSub{regN=regS, regD=regD, immed=high, shifted=true}, code); if low <> 0w0 then gen(addSub{regN=regD, regD=regD, immed=low, shifted=false}, code) else () ) else gen(addSub{regN=regS, regD=regD, immed=low, shifted=false}, code) end else let (* To minimise the constant and increase the chances that it will fit in a single word look to see if we can shift it. *) fun getShift(value, shift) = if Word64.andb(value, 0w1) = 0w0 then getShift(Word64.>>(value, 0w1), shift+0w1) else (value, shift) val (shifted, shift) = getShift(unsigned, 0w0) in genList(loadNonAddress(regW, shifted), code); gen((if isSub then subShiftedReg else addShiftedReg) {regM=regW, regN=regS, regD=regD, shift=ShiftLSL shift}, code) end end (* Remove items from the stack. If the second argument is true the value on the top of the stack has to be moved. *) fun resetStack(0, _, _) = () | resetStack(nItems, true, code) = ( genPopReg(X0, code); resetStack(nItems, false, code); genPushReg(X0, code) ) | resetStack(nItems, false, code) = addConstantWord({regS=X_MLStackPtr, regD=X_MLStackPtr, regW=X3, value=Word64.fromLarge(Word.toLarge nativeWordSize) * Word64.fromInt nItems}, code) fun compareRegs(reg1, reg2, code) = gen(subSShiftedReg{regM=reg2, regN=reg1, regD=XZero, shift=ShiftNone}, code) (* Compare registers using a 32-bit operation on 32-in-64. *) fun comparePolyRegs(reg1, reg2, code) = gen((if is32in64 then subSShiftedReg32 else subSShiftedReg){regM=reg2, regN=reg1, regD=XZero, shift=ShiftNone}, code) (* Turn an absolute address into an object index. Does nothing in native mode.
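   In 32-in-64 an object pointer is held as a 32-bit index: subtracting the heap base (X_Base32in64) and shifting right by two gives the offset from the base in 32-bit words. indexToAbsoluteAddress below is the inverse operation.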
*) fun absoluteAddressToIndex(reg, cvec) = if is32in64 then ( gen(subShiftedReg{regM=X_Base32in64, regN=reg, regD=reg, shift=ShiftNone}, cvec); gen(logicalShiftRight{wordSize=WordSize64, shift=0w2, regN=reg, regD=reg}, cvec) ) else () (* Turn an index into an absolute address. *) and indexToAbsoluteAddress(iReg, absReg, cvec) = if is32in64 then gen(addShiftedReg{regM=iReg, regN=X_Base32in64, regD=absReg, shift=ShiftLSL 0w2}, cvec) else if iReg = absReg then () else gen(moveRegToReg{sReg=iReg, dReg=absReg}, cvec) (* Get a large word value from a "box". *) fun unboxLargeWord(addrReg, valReg, cvec) = if is32in64 then ( indexToAbsoluteAddress(addrReg, valReg, cvec); gen(loadRegScaled{regT=valReg, regN=valReg, unitOffset=0}, cvec) ) else gen(loadRegScaled{regT=valReg, regN=addrReg, unitOffset=0}, cvec) fun unboxDouble(addrReg, workReg, valReg, cvec) = if is32in64 then ( indexToAbsoluteAddress(addrReg, workReg, cvec); gen(loadRegScaledDouble{regT=valReg, regN=workReg, unitOffset=0}, cvec) ) else gen(loadRegScaledDouble{regT=valReg, regN=addrReg, unitOffset=0}, cvec) fun unboxOrUntagSingle(addrReg, workReg, valReg, cvec) = if is32in64 then gen(loadRegIndexedFloat{regN=X_Base32in64, regM=addrReg, regT=valReg, option=ExtUXTX ScaleOrShift}, cvec) else ( gen(logicalShiftRight{wordSize=WordSize64, shift=0w32, regN=addrReg, regD=workReg}, cvec); gen(moveGeneralToFloat{regN=workReg, regD=valReg}, cvec) ) (* Sequence to allocate on the heap. The words are not initialised apart from the length word. Returns the absolute address. *) fun genAllocateFixedSize(words, flags, resultReg, workReg, code) = let val label = createLabel() val wordsRequired = if is32in64 then (* Have to round this up to 8 bytes *) Word64.andb(Word64.fromInt(words+2), ~ 0w2) else Word64.fromInt(words+1) in (* Subtract the number of bytes required from the heap pointer and put in X0. *) addConstantWord({regS=X_MLHeapAllocPtr, regD=X0, regW=X3, value= ~ (Word64.fromLarge(Word.toLarge wordSize)) * wordsRequired}, code); compareRegs(resultReg, X_MLHeapLimit, code); gen(conditionalBranch(condCarrySet, label), code); gen(loadRegScaled{regT=X16, regN=X_MLAssemblyInt, unitOffset=heapOverflowCallOffset}, code); gen(branchAndLinkReg X16, code); gen(registerMask [], code); (* Not used at the moment. *) gen(setLabel label, code); gen(moveRegToReg{sReg=resultReg, dReg=X_MLHeapAllocPtr}, code); genList(loadNonAddress(workReg, Word64.orb(Word64.fromInt words, Word64.<<(Word64.fromLarge(Word8.toLarge flags), if is32in64 then 0w24 else 0w56))), code); (* Store the length word. Have to use the unaligned version because offset is -ve. *) if is32in64 then gen(storeRegUnscaled32{regT=workReg, regN=resultReg, byteOffset= ~4}, code) else gen(storeRegUnscaled{regT=workReg, regN=resultReg, byteOffset= ~8}, code) end (* Allocate space on the heap for a vector, string etc. sizeReg and flagsReg contain the size and flags as untagged values. sizeReg is unchanged, flagsReg is modified. The result address is in resultReg. All the registers must be different. *) fun allocateVariableSize({sizeReg, flagsReg, resultReg}, code) = let val trapLabel = createLabel() and noTrapLabel = createLabel() in (* Subtract the size as a number of bytes from the allocation ptr. *) if is32in64 then ( gen(subShiftedReg{regM=sizeReg, regN=X_MLHeapAllocPtr, regD=resultReg, shift=ShiftLSL 0w2}, code); (* Subtract another 4 to allow for the length word. *) gen(subImmediate{regN=resultReg, regD=resultReg, immed=0w4, shifted=false}, code); (* Round this down to an 8-byte boundary. 
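   Allocation moves the heap pointer downwards, so masking the address with ~8 here keeps the result aligned on a native 64-bit word.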
*) gen(bitwiseAndImmediate{wordSize=WordSize64, bits= ~ 0w8, regN=resultReg, regD=resultReg}, code) ) else ( gen(subShiftedReg{regM=sizeReg, regN=X_MLHeapAllocPtr, regD=resultReg, shift=ShiftLSL 0w3}, code); (* Subtract another 8 to allow for the length word. *) gen(subImmediate{regN=resultReg, regD=resultReg, immed=0w8, shifted=false}, code) ); (* If the size is large enough it is possible that this could wrap round. To check for that we trap if either the result is less than the limit or if it is now greater than the allocation pointer. *) compareRegs(resultReg, X_MLHeapLimit, code); gen(conditionalBranch(condCarryClear, trapLabel), code); compareRegs(resultReg, X_MLHeapAllocPtr, code); gen(conditionalBranch(condCarryClear, noTrapLabel), code); gen(setLabel trapLabel, code); gen(loadRegScaled{regT=X16, regN=X_MLAssemblyInt, unitOffset=heapOverflowCallOffset}, code); gen(branchAndLinkReg X16, code); gen(registerMask [], code); (* Not used at the moment. *) gen(setLabel noTrapLabel, code); gen(moveRegToReg{sReg=resultReg, dReg=X_MLHeapAllocPtr}, code); (* Combine the size with the flags in the top byte. *) gen(orrShiftedReg{regM=flagsReg, regN=sizeReg, regD=flagsReg, shift=ShiftLSL(if is32in64 then 0w24 else 0w56)}, code); (* Store the length word. Have to use the unaligned version because offset is -ve. *) if is32in64 then gen(storeRegUnscaled32{regT=flagsReg, regN=resultReg, byteOffset= ~4}, code) else gen(storeRegUnscaled{regT=flagsReg, regN=resultReg, byteOffset= ~8}, code) end (* Set a register to either tagged(1) i.e. true or tagged(0) i.e. false. *) fun setBooleanCondition(reg, condition, code) = ( genList(loadNonAddress(reg, Word64.fromInt(tag 1)), code); (* If the condition is false the value used is the XZero incremented by 1 i.e. 1 *) gen(conditionalSetIncrement{regD=reg, regTrue=reg, regFalse=XZero, cond=condition}, code) ) (* Raise the overflow exception if the overflow bit has been set. *) fun checkOverflow code = let val noOverflow = createLabel() in gen(conditionalBranch(condNoOverflow, noOverflow), code); gen(loadAddressConstant(X0, toMachineWord Overflow), code); gen(loadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset}, code); gen(loadRegScaled{regT=X1, regN=X_MLStackPtr, unitOffset=0}, code); gen(branchRegister X1, code); gen(setLabel noOverflow, code) end (* Stack check code: this is inserted at the start of a function to check that there is sufficient ML stack available. It is also inserted, with a zero space value, in a loop to ensure that the RTS can interrupt a function. debugTrapAlways can be used to set a sort of breakpoint during debugging. *) fun checkStackCode(regW, space, debugTrapAlways, code) = let val skipCheck = createLabel() val defaultWords = 10 (* This is wired into the RTS. *) val (testReg, entryPt) = if space <= defaultWords then (X_MLStackPtr, stackOverflowCallOffset) else ( (* This is only used at the start of the code. X9 is wired into the RTS. *) addConstantWord({regS=X_MLStackPtr, regD=X9, regW=regW, value= ~ (Word64.fromLarge(Word.toLarge nativeWordSize)) * Word64.fromInt space}, code); (X9, stackOverflowXCallOffset) ) in gen(loadRegScaled{regT=regW, regN=X_MLAssemblyInt, unitOffset=stackLimitOffset}, code); if debugTrapAlways then () else ( compareRegs(testReg, regW, code); gen(conditionalBranch(condCarrySet, skipCheck), code) ); gen(loadRegScaled{regT=X16, regN=X_MLAssemblyInt, unitOffset=entryPt}, code); gen(branchAndLinkReg X16, code); gen(registerMask [], code); (* Not used at the moment. 
*) gen(setLabel skipCheck, code) end (* Allocate a single byte cell and store the register into it. The result is in X0 so reg must not be X0. *) fun boxLargeWord(reg, code) = ( reg <> X0 orelse raise InternalError "boxLargeWord: X0"; genAllocateFixedSize(Word.toInt(nativeWordSize div wordSize), F_bytes, X0, X5, code); gen(storeRegScaled{regT=reg, regN=X0, unitOffset=0}, code); absoluteAddressToIndex(X0, code) ) (* Allocate a single byte cell for a "real" i.e. double-precision floating point number. *) fun boxDouble(reg, code) = ( genAllocateFixedSize(Word.toInt(0w8 div wordSize), F_bytes, X0, X5, code); gen(storeRegScaledDouble{regT=reg, regN=X0, unitOffset=0}, code); absoluteAddressToIndex(X0, code) ) (* Single precision floats are shifted and tagged in native 64-bit but boxed in 32-in-64. *) fun boxOrTagFloat(reg, code) = if is32in64 then ( genAllocateFixedSize(1, F_bytes, X0, X5, code); gen(storeRegScaledFloat{regT=reg, regN=X0, unitOffset=0}, code); absoluteAddressToIndex(X0, code) ) else ( gen(moveFloatToGeneral{regN=V0, regD=X0}, code); gen(logicalShiftLeft{wordSize=WordSize64, shift=0w32, regN=X0, regD=X0}, code); gen(bitwiseOrImmediate{regN=X0, regD=X0, wordSize=WordSize64, bits=0w1}, code) ) type caseForm = { cases : (backendIC * word) list, test : backendIC, caseType: caseType, default : backendIC } (* Where the result, if any, should go *) datatype whereto = NoResult (* discard result *) | ToStack (* Need a result but it can stay on the pseudo-stack *) | ToX0 (* Need a result in X0. *) (* Are we at the end of the function. *) datatype tail = EndOfProc | NotEnd (* Code generate a function or global declaration *) fun codegen (pt, name, resultClosure, numOfArgs, localCount, parameters) = let val cvec = ref [] datatype decEntry = StackAddr of int | Empty val decVec = Array.array (localCount, Empty) (* Count of number of items on the stack. This excludes the arguments and the return address. *) val realstackptr = ref 1 (* The closure ptr is already there *) (* Maximum size of the stack. *) val maxStack = ref 1 (* Whether the top of the stack is actually in X0. *) val topInX0 = ref false (* Push a value onto the stack. *) fun incsp () = ( realstackptr := !realstackptr + 1; if !realstackptr > !maxStack then maxStack := !realstackptr else () ) (* An entry has been removed from the stack. *) fun decsp () = realstackptr := !realstackptr - 1 fun ensureX0 () = if ! topInX0 then (genPushReg(X0, cvec); incsp(); topInX0 := false) else () (* generates code from the tree *) fun gencde (pt : backendIC, whereto : whereto, tailKind : tail, loopAddr) : unit = let val _ = !topInX0 andalso raise InternalError "topInX0 true at start" (* Save the stack pointer value here. We may want to reset the stack. *) val oldsp = !realstackptr; (* Operations on ML memory always have the base as an ML address. Word operations are always word aligned. The higher level will have extracted any constant offset and scaled it if necessary. That's helpful for the X86 but not for the ARM. We have to turn them back into indexes. *) (* This pushes two values to the stack: the base address and the index. 
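   The index pushed is a tagged ML integer and any constant offset is folded into it after dividing by the scale, so e.g. a word access with offset=16 and scale=8 pushes tagged(2).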
*) fun genMLAddress({base, index, offset}, scale) = ( gencde (base, ToStack, NotEnd, loopAddr); offset mod scale = 0 orelse raise InternalError "genMLAddress"; case (index, offset div scale) of (NONE, soffset) => (genList(loadNonAddress(X0, Word64.fromInt(tag soffset)), cvec); genPushReg(X0, cvec); incsp()) | (SOME indexVal, 0) => gencde (indexVal, ToStack, NotEnd, loopAddr) | (SOME indexVal, soffset) => ( gencde (indexVal, ToX0, NotEnd, loopAddr); (* Add the offset as a shifted but not tagged value. *) addConstantWord({regS=X0, regD=X0, regW=X1, value=Word64.fromInt(semitag soffset)}, cvec); genPushReg(X0, cvec); incsp(); topInX0 := false ) ) val genCAddress = genMLAddress datatype mlLoadKind = MLLoadOffset of int | MLLoadReg of xReg fun genMLLoadAddress({base, index=NONE, offset}, scale) = (* The index, if any, is a constant. *) ( gencde (base, ToX0, NotEnd, loopAddr); indexToAbsoluteAddress(X0, X0, cvec); (X0, MLLoadOffset(offset div scale)) ) | genMLLoadAddress({base, index=SOME indexVal, offset}, scale) = ( gencde (base, ToStack, NotEnd, loopAddr); (* Push base addr to stack. *) gencde (indexVal, ToX0, NotEnd, loopAddr); (* Shift right to remove the tag. N.B. Indexes into ML memory are unsigned. Unlike on the X86 we can't remove the tag by providing a displacement and the only options are to scale by either 1 or 8. *) gen(logicalShiftRight{regN=X0, regD=X0, wordSize=WordSize64, shift=0w1}, cvec); (* Add any constant offset. Does nothing if it's zero. *) addConstantWord({regS=X0, regD=X0, regW=X3, value=Word64.fromInt (* unsigned *)(offset div scale)}, cvec); genPopReg(X1, cvec); (* Pop base reg into X1. *) decsp(); indexToAbsoluteAddress(X1, X1, cvec); (X1, MLLoadReg X0) ) (* Similar to genMLLoadAddress but for C addresses. There are two differences. The index is signed so we use an arithmetic shift and the base address is a LargeWord value i.e. the actual address is held in the word pointed at by "base", unlike with ML addresses. *) fun genCLoadAddress({base, index=NONE, offset}, scale) = ( gencde (base, ToX0, NotEnd, loopAddr); indexToAbsoluteAddress(X0, X0, cvec); gen(loadRegScaled{regT=X0, regN=X0, unitOffset=0}, cvec); (X0, MLLoadOffset(offset div scale)) ) | genCLoadAddress({base, index=SOME indexVal, offset}, scale) = ( gencde (base, ToStack, NotEnd, loopAddr); (* Push base addr to stack. *) gencde (indexVal, ToX0, NotEnd, loopAddr); (* Shift right to remove the tag. C indexes are SIGNED. *) gen(arithmeticShiftRight{regN=X0, regD=X0, wordSize=WordSize64, shift=0w1}, cvec); (* Add any constant offset. Does nothing if it's zero. *) addConstantWord({regS=X0, regD=X0, regW=X3, value=Word64.fromInt (* unsigned *)(offset div scale)}, cvec); genPopReg(X1, cvec); (* Pop base reg into X1. *) indexToAbsoluteAddress(X1, X1, cvec); gen(loadRegScaled{regT=X1, regN=X1, unitOffset=0}, cvec); decsp(); (X1, MLLoadReg X0) ) (* Compare a block of bytes. Jumps to labelEqual if all the bytes are equal up to the length. Otherwise it drops through with the condition code set to the last byte comparison that tested unequal. *) fun blockCompareBytes(leftArg, rightArg, length, labelEqual, setZeroCC) = let val loopLabel = createLabel() in genMLAddress(leftArg, 1); genMLAddress(rightArg, 1); gencde (length, ToX0, NotEnd, loopAddr); genPopReg(X2, cvec); (* right arg index - tagged value. *) genPopReg(X1, cvec); (* right arg base address. *) indexToAbsoluteAddress(X1, X1, cvec); (* Add in the index N.B. ML index values are unsigned. 
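A runnable sketch of the offset folding in genMLAddress: a constant offset, already divided by the scale, is added to the tagged index as a shifted-but-untagged quantity, so the sum is still a correctly tagged index. The definitions below mirror what tag and semitag appear to compute elsewhere in this code generator (2n+1 and 2n respectively).

    fun tag n = 2 * n + 1
    and semitag n = 2 * n
    val check = (tag 4 + semitag 3 = tag 7)   (* true *)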
*) gen(addShiftedReg{regM=X2, regN=X1, regD=X1, shift=ShiftLSR 0w1}, cvec); genPopReg(X3, cvec); (* left index *) genPopReg(X2, cvec); indexToAbsoluteAddress(X2, X2, cvec); decsp(); decsp(); decsp(); decsp(); gen(addShiftedReg{regM=X3, regN=X2, regD=X2, shift=ShiftLSR 0w1}, cvec); (* Untag the length *) gen(logicalShiftRight{regN=X0, regD=X0, wordSize=WordSize64, shift=0w1}, cvec); (* If necessary set the cc for the case where the length is zero. *) if setZeroCC then compareRegs(X0, X0, cvec) else (); gen(setLabel loopLabel, cvec); gen(compareBranchZero(X0, WordSize64, labelEqual), cvec); (* X2 is left arg addr, X1 is right arg addr. *) gen(loadRegPostIndexByte{regT=X4, regN=X2, byteOffset=1}, cvec); gen(loadRegPostIndexByte{regT=X3, regN=X1, byteOffset=1}, cvec); compareRegs(X4, X3, cvec); gen(subImmediate{regN=X0, regD=X0, immed=0w1, shifted=false}, cvec); (* Loop if they're equal. *) gen(conditionalBranch(condEqual, loopLabel), cvec) end val () = case pt of BICEval evl => genEval (evl, tailKind) | BICExtract ext => (* This may just be being used to discard a value which isn't used on this branch. N.B. genProc for mutual closures assumes that this does not affect X1. *) if whereto = NoResult then () else let fun loadLocalStackValue addr = ( genList (loadScaledWord{dest=X0, base=X_MLStackPtr, work=X16, offset= !realstackptr + addr}, cvec); topInX0 := true ) in case ext of BICLoadArgument locn => (* The register arguments appear in order on the stack, followed by the stack argumens in reverse order. *) if locn < 8 then loadLocalStackValue (locn+1) else loadLocalStackValue (numOfArgs-locn+8) | BICLoadLocal locn => ( case Array.sub (decVec, locn) of StackAddr n => loadLocalStackValue (~ n) | _ => (* Should be on the stack, not a function. *) raise InternalError "locaddr: bad stack address" ) | BICLoadClosure locn => ( loadLocalStackValue ~1; (* The closure itself. *) - - genList (loadScaledPolyWord{dest=X0, base=X0, work=X16, + indexToAbsoluteAddress(X0, X0, cvec); + genList(loadScaledPolyWord{dest=X0, base=X0, work=X16, (* The first word is the code. This is two poly words in 32-in-64. *) offset=if is32in64 then locn+2 else locn+1}, cvec) ) | BICLoadRecursive => loadLocalStackValue ~1 (* The closure itself - first value on the stack. *) end | BICField {base, offset} => ( gencde (base, ToX0, NotEnd, loopAddr); if is32in64 then ( (* Can use an indexed load if the offset is zero otherwise it takes two instrs. *) if offset = 0 then gen(loadRegIndexed32{regN=X_Base32in64, regM=X0, regT=X0, option=ExtUXTX ScaleOrShift}, cvec) else ( indexToAbsoluteAddress(X0, X0, cvec); genList (loadScaledPolyWord{dest=X0, base=X0, work=X16, offset=offset}, cvec) ) ) else genList (loadScaledPolyWord{dest=X0, base=X0, work=X16, offset=offset}, cvec) ) | BICLoadContainer {base, offset} => ( gencde (base, ToX0, NotEnd, loopAddr); genList (loadScaledWord{dest=X0, base=X0, work=X16, offset=offset}, cvec) ) | BICLambda lam => genProc (lam, false, fn () => ()) | BICConstnt(w, _) => ( if isShort w then genList(loadNonAddress(X0, taggedWord64(Word64.fromLarge(Word.toLargeX(toShort w)))), cvec) else gen(loadAddressConstant(X0, w), cvec); topInX0 := true ) | BICCond (testPart, thenPart, elsePart) => genCond (testPart, thenPart, elsePart, whereto, tailKind, loopAddr) | BICNewenv(decls, exp) => let (* Processes a list of entries. *) (* Mutually recursive declarations. May be either lambdas or constants. Recurse down the list pushing the addresses of the closure vectors, then unwind the recursion and fill them in. 
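A sketch of the stack-offset rule used by BICLoadArgument, assuming the calling convention described in the comment: the first eight arguments arrive in registers and are pushed in order, and any further arguments were pushed by the caller in reverse order. The helper name is hypothetical; the offset is relative to the current realstackptr.

    fun argumentOffset (locn, numOfArgs) =
        if locn < 8
        then locn + 1               (* register arguments, pushed in order on entry *)
        else numOfArgs - locn + 8   (* stack arguments, pushed in reverse by the caller *)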
*) fun genMutualDecs [] = () | genMutualDecs ({lambda, addr, ...} :: otherDecs) = genProc (lambda, true, fn() => ( Array.update (decVec, addr, StackAddr (! realstackptr)); genMutualDecs (otherDecs) )) fun codeDecls(BICRecDecs dl) = genMutualDecs dl | codeDecls(BICDecContainer{size, addr}) = ( (* If this is a container we have to process it here otherwise it will be removed in the stack adjustment code. *) (* The stack entries have to be initialised. Set them to tagged(0). *) genList(loadNonAddress(X0, Word64.fromInt(tag 0)), cvec); let fun pushN 0 = () | pushN n = (genPushReg(X0, cvec); pushN (n-1)) in pushN size end; gen(moveRegToReg{sReg=X_MLStackPtr, dReg=X0}, cvec); genPushReg(X0, cvec); (* Push the address of this container. *) realstackptr := !realstackptr + size + 1; (* Pushes N words plus the address. *) Array.update (decVec, addr, StackAddr(!realstackptr)) ) | codeDecls(BICDeclar{value, addr, ...}) = ( gencde (value, ToStack, NotEnd, loopAddr); Array.update (decVec, addr, StackAddr(!realstackptr)) ) | codeDecls(BICNullBinding exp) = gencde (exp, NoResult, NotEnd, loopAddr) in List.app codeDecls decls; gencde (exp, whereto, tailKind, loopAddr) end | BICBeginLoop {loop=body, arguments} => (* Execute the body which will contain at least one Loop instruction. There will also be path(s) which don't contain Loops and these will drop through. *) let val args = List.map #1 arguments (* Evaluate each of the arguments, pushing the result onto the stack. *) fun genLoopArg ({addr, value, ...}) = ( gencde (value, ToStack, NotEnd, loopAddr); Array.update (decVec, addr, StackAddr (!realstackptr)); !realstackptr (* Return the posn on the stack. *) ) val argIndexList = map genLoopArg args; val startSp = ! realstackptr; (* Remember the current top of stack. *) val startLoop = createLabel () val () = gen(setLabel startLoop, cvec) (* Start of loop *) in (* Process the body, passing the jump-back address down for the Loop instruction(s). *) gencde (body, whereto, tailKind, SOME(startLoop, startSp, argIndexList)) (* Leave the arguments on the stack. They can be cleared later if needed. *) end | BICLoop argList => (* Jump back to the enclosing BeginLoop. *) let val (startLoop, startSp, argIndexList) = case loopAddr of SOME l => l | NONE => raise InternalError "No BeginLoop for Loop instr" (* Evaluate the arguments. First push them to the stack because evaluating an argument may depend on the current value of others. Only when we've evaluated all of them can we overwrite the original argument positions. *) fun loadArgs ([], []) = !realstackptr - startSp (* The offset of all the args. *) | loadArgs (arg:: argList, _ :: argIndexList) = let (* Evaluate all the arguments. *) val () = gencde (arg, ToStack, NotEnd, NONE); val argOffset = loadArgs(argList, argIndexList); in genPopReg(X0, cvec); genList(storeScaledWord{store=X0, base=X_MLStackPtr, work=X16, offset=argOffset-1}, cvec); decsp(); (* The argument has now been popped. *) argOffset end | loadArgs _ = raise InternalError "loadArgs: Mismatched arguments"; val _: int = loadArgs(List.map #1 argList, argIndexList) in if !realstackptr <> startSp then resetStack (!realstackptr - startSp, false, cvec) (* Remove any local variables. *) else (); (* Jump back to the start of the loop. *) checkStackCode(X10, 0, false, cvec); gen(unconditionalBranch startLoop, cvec) end | BICRaise exp => ( gencde (exp, ToStack, NotEnd, loopAddr); genPopReg(X0, cvec); (* Copy the handler "register" into the stack pointer. Then jump to the address in the first word. 
The second word is the next handler. This is set up in the handler. We have a lot more raises than handlers since most raises are exceptional conditions such as overflow so it makes sense to minimise the code in each raise. *) gen(loadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset}, cvec); gen(loadRegScaled{regT=X1, regN=X_MLStackPtr, unitOffset=0}, cvec); gen(branchRegister X1, cvec) ) | BICHandle {exp, handler, exPacketAddr} => let (* Save old handler *) val () = gen(loadRegScaled{regT=X0, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset}, cvec) val () = genPushReg(X0, cvec) val () = incsp () val handlerLabel = createLabel() (* Push address of handler. *) val () = gen(loadLabelAddress(X0, handlerLabel), cvec) val () = genPushReg(X0, cvec) val () = incsp() (* Store the address of the stack pointer into the handler register. *) val () = gen(storeRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset}, cvec) (* Code generate the body; "NotEnd" because we have to come back to remove the handler; "ToStack" because delHandler needs a result to carry down. *) val () = gencde (exp, ToStack, NotEnd, loopAddr) (* Now get out of the handler and restore the old one. *) val () = genPopReg(X0, cvec) (* Pop the result. *) val () = genPopReg(X1, cvec) (* Pop and discard the handler address. *) val () = genPopReg(X1, cvec) (* Pop the old handler. *) val () = gen(storeRegScaled{regT=X1, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset}, cvec) val () = genPushReg(X0, cvec) (* Push the result. *) val skipHandler = createLabel() val () = gen(unconditionalBranch skipHandler, cvec) val () = realstackptr := oldsp val () = gen(setLabel handlerLabel, cvec) (* The exception raise code resets the stack pointer to the value in the exception handler so this is probably redundant. Leave it for the moment, *) val () = gen(loadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset}, cvec) (* We must, though, restore the old handler. *) val () = genPopReg(X1, cvec) (* Pop and discard the handler address. *) val () = genPopReg(X1, cvec) (* Pop the old handler. *) val () = gen(storeRegScaled{regT=X1, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset}, cvec) (* Push the exception packet which is in X0 and set the address. *) val () = genPushReg(X0, cvec) val () = incsp () val () = Array.update (decVec, exPacketAddr, StackAddr(!realstackptr)) val () = gencde (handler, ToStack, NotEnd, loopAddr) (* Have to remove the exception packet. *) val () = resetStack(1, true, cvec) val () = decsp() (* Finally fix-up the jump around the handler *) val () = gen(setLabel skipHandler, cvec) in () end | BICCase ({cases, test, default, firstIndex, ...}) => let val () = gencde (test, ToStack, NotEnd, loopAddr) (* Label to jump to at the end of each case. *) val exitJump = createLabel() val () = genPopReg(X0, cvec) val () = decsp () (* Subtract the minimum even if it is zero to remove the tag. This leaves us with a shifted but untagged value. Don't check for overflow. Instead allow large values to wrap around and check later.*) val () = addConstantWord({regS=X0, regD=X0, regW=X1, value= ~(taggedWord64(Word64.fromLarge(Word.toLargeX firstIndex)))}, cvec) (* Create the case labels. *) val nCases = List.length cases val caseLabels = List.tabulate(nCases, fn _ => createLabel()) val defaultLabel = createLabel() (* Compare with the number of cases and go to the default if it is not less. 
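A purely illustrative model of the two-word frame that BICHandle builds on the ML stack, as described in the surrounding comments: the first word is the address to jump to when an exception is raised and the second is the previous handler to restore afterwards; the handler slot in the assembly interface points at this frame.

    fun handlerFrame {handlerAddress, previousHandler} =
        [handlerAddress, previousHandler]   (* word 0: jump target; word 1: handler to restore *)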
We use an unsigned comparison and compare with the semitagged value because we've removed the tag bit. *) (* TODO: Not necessary if it exhaustive. *) (* For the moment load the value into a register and compare. *) val () = genList(loadNonAddress(X1, Word64.fromInt nCases * 0w2), cvec) val () = compareRegs(X0, X1, cvec) val () = gen(conditionalBranch(condCarrySet, defaultLabel), cvec) (* Load the address of the jump table. *) val tableLabel = createLabel() val () = gen(loadLabelAddress(X1, tableLabel), cvec) (* Add the value shifted by one since it's already shifted. *) val () = gen(addShiftedReg{regM=X0, regN=X1, regD=X0, shift=ShiftLSL 0w1}, cvec) val () = gen(branchRegister X0, cvec) (* Put in the branch table. *) val () = gen(setLabel tableLabel, cvec) val () = List.app(fn label => gen(unconditionalBranch label, cvec)) caseLabels (* The default case, if any, follows the case statement. *) (* If we have a jump to the default set it to jump here. *) local fun fixDefault(NONE, defCase) = gen(setLabel defCase, cvec) | fixDefault(SOME _, _) = () in val () = ListPair.appEq fixDefault (cases, caseLabels) end val () = gen(setLabel defaultLabel, cvec) val () = gencde (default, whereto, tailKind, loopAddr) fun genCases(SOME body, label) = ( (* First exit from the previous case or the default if this is the first. *) gen(unconditionalBranch exitJump, cvec); (* Remove the result - the last case will leave it. *) case whereto of ToStack => decsp () | NoResult => () | ToX0 => (); topInX0 := false; (* Fix up the jump to come here. *) gen(setLabel label, cvec); gencde (body, whereto, tailKind, loopAddr) ) | genCases(NONE, _) = () val () = ListPair.appEq genCases (cases, caseLabels) (* Finally set the exit jump to come here. *) val () = gen(setLabel exitJump, cvec) in () end | BICTuple recList => let val size = List.length recList in (* Get the fields and push them to the stack. *) List.app(fn v => gencde (v, ToStack, NotEnd, loopAddr)) recList; genAllocateFixedSize(size, 0w0, X0, X1, cvec); List.foldl(fn (_, w) => ( genPopReg(X1, cvec); genList(storeScaledPolyWord{store=X1, base=X0, work=X16, offset=w-1}, cvec); w-1) ) size recList; (* If it's 32-in-64 this has to be converted to an object pointer. *) absoluteAddressToIndex(X0, cvec); topInX0 := true; realstackptr := !realstackptr - size end | BICSetContainer{container, tuple, filter} => (* Copy the contents of a tuple into a container. If the tuple is a Tuple instruction we can avoid generating the tuple and then unpacking it and simply copy the fields that make up the tuple directly into the container. *) ( case tuple of BICTuple cl => (* Simply set the container from the values. *) let (* Push the address of the container to the stack. *) val _ = gencde (container, ToStack, NotEnd, loopAddr) fun setValues([], _, _) = () | setValues(v::tl, sourceOffset, destOffset) = if sourceOffset < BoolVector.length filter andalso BoolVector.sub(filter, sourceOffset) then ( (* Get the value to store into X0. *) gencde (v, ToX0, NotEnd, loopAddr); (* Load the address of the container from the stack and store the value into the container. *) gen(loadRegScaled{regT=X1, regN=X_MLStackPtr, unitOffset=0}, cvec); genList(storeScaledWord{store=X0, base=X1, work=X16, offset=destOffset}, cvec); topInX0 := false; (* We've used it. *) setValues(tl, sourceOffset+1, destOffset+1) ) else setValues(tl, sourceOffset+1, destOffset) in setValues(cl, 0, 0) (* The container address is still on the stack. *) end | _ => let (* General case: copy values from the source tuple. 
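A runnable sketch of the branch-table arithmetic in BICCase: once the semitagged first index has been subtracted, the selector is twice the case number, and adding it to the table address shifted left one more place selects the case's 4-byte unconditional branch. The function name is illustrative.

    fun tableByteOffset caseNumber =
        let val semitagged = 2 * caseNumber   (* selector after the tag bit has gone *)
        in semitagged * 2 end                 (* ShiftLSL 0w1: four bytes per branch *)
    val fourth = tableByteOffset 3            (* 12: the offset of the fourth branch *)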
*) (* First the target tuple, then the container. *) val () = gencde (tuple, ToStack, NotEnd, loopAddr) val () = gencde (container, ToX0, NotEnd, loopAddr) val () = genPopReg(X1, cvec) val () = indexToAbsoluteAddress(X1, X1, cvec) val () = decsp() (* Container address is in X0, tuple in X1. *) val last = BoolVector.foldli(fn (i, true, _) => i | (_, false, n) => n) ~1 filter fun copy (sourceOffset, destOffset) = if BoolVector.sub(filter, sourceOffset) then ( (* Load the value in the tuple. *) genList(loadScaledPolyWord{dest=X2, base=X1, work=X16, offset=sourceOffset}, cvec); (* Store into the container. *) genList(storeScaledWord{store=X2, base=X0, work=X16, offset=destOffset}, cvec); if sourceOffset = last then () else copy (sourceOffset+1, destOffset+1) ) else copy(sourceOffset+1, destOffset) in copy (0, 0); topInX0 := true (* Container address is in X0 *) end ) | BICTagTest { test, tag=tagValue, ... } => ( gencde (test, ToStack, NotEnd, loopAddr); genPopReg(X0, cvec); gen(subSImmediate{regN=X0, regD=XZero, immed=taggedWord tagValue, shifted=false}, cvec); setBooleanCondition(X0, condEqual, cvec); genPushReg(X0, cvec) ) | BICNullary {oper=GetCurrentThreadId} => ( gen(loadRegScaled{regT=X0, regN=X_MLAssemblyInt, unitOffset=threadIdOffset}, cvec); topInX0 := true ) | BICNullary {oper=CheckRTSException} => (* Raise an exception in ML if the last RTS call set the exception packet. *) let (* It may be better to do this in all RTS calls. *) val noException = createLabel() in (* Load the packet and see if it is nil (tagged 0) *) gen(loadRegScaled{regT=X0, regN=X_MLAssemblyInt, unitOffset=exceptionPacketOffset}, cvec); gen(subSImmediate{regN=X0, regD=XZero, immed=taggedWord 0w0, shifted=false}, cvec); gen(conditionalBranch(condEqual, noException), cvec); (* If it isn't then raise the exception. *) gen(loadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset}, cvec); gen(loadRegScaled{regT=X1, regN=X_MLStackPtr, unitOffset=0}, cvec); gen(branchRegister X1, cvec); gen(setLabel noException, cvec) end | BICNullary {oper=CreateMutex} => let (* Allocate memory for a mutex. Use a native word as a mutable, weak, no-overwrite, byte cell which is the same as a volatileRef. This ensures that it will always be cleared when it is loaded even if it was locked when it was saved. *) val flags = Word8.orb(F_mutable, Word8.orb(F_weak, Word8.orb(F_noOverwrite, F_bytes))) (* 0wx69 *) in genAllocateFixedSize(Word.toInt(nativeWordSize div wordSize), flags, X0, X5, cvec); gen(storeRegScaled{regT=XZero, regN=X0, unitOffset=0}, cvec); absoluteAddressToIndex(X0, cvec); topInX0 := true end | BICUnary { oper, arg1 } => genUnary(oper, arg1, loopAddr) | BICBinary { oper, arg1, arg2 } => genBinary(oper, arg1, arg2, loopAddr) | BICAllocateWordMemory {numWords, flags, initial } => let fun doAllocateAndInit() = let val () = gencde (numWords, ToStack, NotEnd, loopAddr) val () = gencde (flags, ToStack, NotEnd, loopAddr) val () = gencde (initial, ToX0, NotEnd, loopAddr) val exitLabel = createLabel() and loopLabel = createLabel() in genPopReg(X2, cvec); (* Flags as tagged value. *) gen(logicalShiftRight{regN=X2, regD=X2, wordSize=WordSize32 (*byte*), shift=0w1}, cvec); genPopReg(X1, cvec); (* Length as tagged value. *) gen(logicalShiftRight{regN=X1, regD=X1, wordSize=WordSize64, shift=0w1}, cvec); genPushReg(X0, cvec); (* Save initialiser - TODO: Add it to save set. *) allocateVariableSize({sizeReg=X1, flagsReg=X2, resultReg=X0}, cvec); genPopReg(X3, cvec); (* Pop initialiser. 
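A small runnable illustration of the filter handling in the general BICSetContainer case just above: BoolVector.foldli yields the index of the last true entry, or ~1 when there is none, and that index is where the copy loop stops. The example data is hypothetical.

    val filter = BoolVector.fromList [true, false, true, false]
    val lastToCopy =
        BoolVector.foldli (fn (i, true, _) => i | (_, false, n) => n) ~1 filter   (* 2 *)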
*) (* Add the length in bytes so we point at the end. *) - gen(addShiftedReg{regM=X1, regN=X0, regD=X1, shift=ShiftLSL 0w3}, cvec); + gen(addShiftedReg{regM=X1, regN=X0, regD=X1, + shift=ShiftLSL(if is32in64 then 0w2 else 0w3)}, cvec); (* Loop to initialise. *) gen(setLabel loopLabel, cvec); compareRegs(X1, X0, cvec); (* Are we at the start? *) gen(conditionalBranch(condEqual, exitLabel), cvec); - gen(storeRegPreIndex{regT=X3, regN=X1, byteOffset= ~8}, cvec); + if is32in64 + then gen(storeRegPreIndex32{regT=X3, regN=X1, byteOffset= ~4}, cvec) + else gen(storeRegPreIndex{regT=X3, regN=X1, byteOffset= ~8}, cvec); gen(unconditionalBranch loopLabel, cvec); gen(setLabel exitLabel, cvec); absoluteAddressToIndex(X0, cvec); decsp(); decsp() end in case (numWords, flags) of (BICConstnt(length, _), BICConstnt(flagValue, _)) => if isShort length andalso toShort length = 0w1 andalso isShort flagValue then (* This is a very common case for refs. *) let val flagByte = Word8.fromLargeWord(Word.toLargeWord(toShort flagValue)) in gencde (initial, ToStack, NotEnd, loopAddr); (* Initialiser. *) genAllocateFixedSize(1, flagByte, X0, X1, cvec); genPopReg(X1, cvec); - gen(storeRegScaled{regT=X1, regN=X0, unitOffset=0}, cvec); + gen((if is32in64 then storeRegScaled32 else storeRegScaled){regT=X1, regN=X0, unitOffset=0}, cvec); absoluteAddressToIndex(X0, cvec); decsp(); topInX0 := true end else (* Constant but not a single. *) doAllocateAndInit() | _ => (* Not constant. *) doAllocateAndInit() end | BICLoadOperation { kind=LoadStoreMLWord {isImmutable=false}, address={base, index=NONE, offset=0}} => ( gencde (base, ToX0, NotEnd, loopAddr); indexToAbsoluteAddress(X0, X0, cvec); gen((if is32in64 then loadAcquire32 else loadAcquire){regN=X0, regT=X0}, cvec) ) | BICLoadOperation { kind=LoadStoreMLWord _, address} => ( case genMLLoadAddress(address, Word.toInt wordSize) of (base, MLLoadOffset offset) => genList(loadScaledPolyWord{dest=X0, base=base, work=X16, offset=offset}, cvec) | (base, MLLoadReg indexR) => gen((if is32in64 then loadRegIndexed32 else loadRegIndexed) {regN=base, regM=indexR, regT=X0, option=ExtUXTX ScaleOrShift}, cvec) ) | BICLoadOperation { kind=LoadStoreMLByte _, address} => ( case genMLLoadAddress(address, 1) of (base, MLLoadOffset offset) => gen(loadRegScaledByte{regT=X0, regN=base, unitOffset=offset}, cvec) | (base, MLLoadReg indexR) => gen(loadRegIndexedByte{regN=base, regM=indexR, regT=X0, option=ExtUXTX NoScale}, cvec); (* Have to tag the result. *) gen(logicalShiftLeft{regN=X0, regD=X0, wordSize=WordSize32, shift=0w1}, cvec); gen(bitwiseOrImmediate{regN=X0, regD=X0, wordSize=WordSize32, bits=0w1}, cvec) ) | BICLoadOperation { kind=LoadStoreC8, address} => ( case genCLoadAddress(address, 1) of (base, MLLoadOffset offset) => if offset < 0 (* C offsets can be negative. *) then gen(loadRegUnscaledByte{regT=X0, regN=base, byteOffset=offset}, cvec) else gen(loadRegScaledByte{regT=X0, regN=base, unitOffset=offset}, cvec) | (base, MLLoadReg indexR) => gen(loadRegIndexedByte{regN=base, regM=indexR, regT=X0, option=ExtUXTX NoScale}, cvec); (* Have to tag the result. *) gen(logicalShiftLeft{regN=X0, regD=X0, wordSize=WordSize32, shift=0w1}, cvec); gen(bitwiseOrImmediate{regN=X0, regD=X0, wordSize=WordSize32, bits=0w1}, cvec) ) | BICLoadOperation { kind=LoadStoreC16, address} => ( case genCLoadAddress(address, 2) of (base, MLLoadOffset offset) => if offset < 0 (* C offsets can be negative. 
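A sketch of the end-address arithmetic in doAllocateAndInit after the change above: the untagged length is shifted by the log of the word size, giving the byte length that is added to the start of the cell so the initialisation loop can walk backwards with pre-indexed stores. The helper is illustrative.

    fun objectByteLength (nWords, is32in64) =
        nWords * (if is32in64 then 4 else 8)   (* ShiftLSL 0w2 in 32-in-64, 0w3 natively *)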
*) then gen(loadRegUnscaled16{regT=X0, regN=base, byteOffset=offset*2}, cvec) else gen(loadRegScaled16{regT=X0, regN=base, unitOffset=offset}, cvec) | (base, MLLoadReg indexR) => gen(loadRegIndexed16{regN=base, regM=indexR, regT=X0, option=ExtUXTX ScaleOrShift}, cvec); (* Have to tag the result. *) gen(logicalShiftLeft{regN=X0, regD=X0, wordSize=WordSize32, shift=0w1}, cvec); gen(bitwiseOrImmediate{regN=X0, regD=X0, wordSize=WordSize32, bits=0w1}, cvec) ) | BICLoadOperation { kind=LoadStoreC32, address} => ( case genCLoadAddress(address, 4) of (base, MLLoadOffset offset) => if offset < 0 (* C offsets can be negative. *) then gen(loadRegUnscaled32{regT=X0, regN=base, byteOffset=offset*4}, cvec) else gen(loadRegScaled32{regT=X0, regN=base, unitOffset=offset}, cvec) | (base, MLLoadReg indexR) => gen(loadRegIndexed32{regN=base, regM=indexR, regT=X0, option=ExtUXTX ScaleOrShift}, cvec); (* Have to tag the result. *) gen(logicalShiftLeft{regN=X0, regD=X0, wordSize=WordSize64 (* Must use 64-bits *), shift=0w1}, cvec); gen(bitwiseOrImmediate{regN=X0, regD=X0, wordSize=WordSize64, bits=0w1}, cvec) ) | BICLoadOperation { kind=LoadStoreC64, address} => ( case genCLoadAddress(address, 8) of (base, MLLoadOffset offset) => if offset < 0 (* C offsets can be negative. *) then gen(loadRegUnscaled{regT=X1, regN=base, byteOffset=offset*8}, cvec) else gen(loadRegScaled{regT=X1, regN=base, unitOffset=offset}, cvec) | (base, MLLoadReg indexR) => gen(loadRegIndexed{regN=base, regM=indexR, regT=X1, option=ExtUXTX ScaleOrShift}, cvec); (* Load the value at the address and box it. *) boxLargeWord(X1, cvec) ) | BICLoadOperation { kind=LoadStoreCFloat, address} => ( case genCLoadAddress(address, 4) of (base, MLLoadOffset offset) => if offset < 0 (* C offsets can be negative. *) then gen(loadRegUnscaledFloat{regT=V0, regN=base, byteOffset=offset*4}, cvec) else gen(loadRegScaledFloat{regT=V0, regN=base, unitOffset=offset}, cvec) | (base, MLLoadReg indexR) => gen(loadRegIndexedFloat{regN=base, regM=indexR, regT=V0, option=ExtUXTX ScaleOrShift}, cvec); (* This is defined to return a "real" i.e. a double so we need to convert it to a double and then box the result. *) gen(convertFloatToDouble{regN=V0, regD=V0}, cvec); boxDouble(V0, cvec) ) | BICLoadOperation { kind=LoadStoreCDouble, address} => ( case genCLoadAddress(address, 8) of (base, MLLoadOffset offset) => if offset < 0 (* C offsets can be negative. *) then gen(loadRegUnscaledDouble{regT=V0, regN=base, byteOffset=offset*8}, cvec) else gen(loadRegScaledDouble{regT=V0, regN=base, unitOffset=offset}, cvec) | (base, MLLoadReg indexR) => gen(loadRegIndexedDouble{regN=base, regM=indexR, regT=V0, option=ExtUXTX ScaleOrShift}, cvec); (* Box the result. *) boxDouble(V0, cvec) ) | BICLoadOperation { kind=LoadStoreUntaggedUnsigned, address} => ( case genMLLoadAddress(address, Word.toInt wordSize) of (base, MLLoadOffset offset) => - gen(loadRegScaled{regT=X0, regN=base, unitOffset=offset}, cvec) + genList(loadScaledPolyWord{dest=X0, base=base, work=X16, offset=offset}, cvec) | (base, MLLoadReg indexR) => - gen(loadRegIndexed{regN=base, regM=indexR, regT=X0, option=ExtUXTX ScaleOrShift}, cvec); + gen((if is32in64 then loadRegIndexed32 else loadRegIndexed) + {regN=base, regM=indexR, regT=X0, option=ExtUXTX ScaleOrShift}, cvec); (* Have to tag the result. 
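A sketch of the addressing choice made for constant C offsets in these load cases: the scaled form only encodes non-negative unit offsets, so negative offsets fall back to the unscaled form with the offset converted to bytes. The helper below is hypothetical and simply mirrors that decision.

    fun cOffsetForm (offset, elemSize) =
        if offset < 0
        then ("unscaled", offset * elemSize)   (* byteOffset for the unscaled form *)
        else ("scaled", offset)                (* unitOffset for the scaled form *)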
*) gen(logicalShiftLeft{regN=X0, regD=X0, wordSize=WordSize64, shift=0w1}, cvec); gen(bitwiseOrImmediate{regN=X0, regD=X0, wordSize=WordSize64, bits=0w1}, cvec) ) | BICStoreOperation { kind=LoadStoreMLWord _, address={base, index=NONE, offset=0}, value } => ( gencde (base, ToStack, NotEnd, loopAddr); gencde (value, ToX0, NotEnd, loopAddr); genPopReg(X1, cvec); (* Address *) indexToAbsoluteAddress(X1, X1, cvec); decsp(); gen((if is32in64 then storeRelease32 else storeRelease){regN=X1, regT=X0}, cvec) ) | BICStoreOperation { kind=LoadStoreMLWord _, address, value } => ( genMLAddress(address, Word.toInt wordSize); gencde (value, ToStack, NotEnd, loopAddr); genPopReg(X0, cvec); (* Value to store *) genPopReg(X1, cvec); (* Index: a tagged value. *) (* Shift right to remove the tag. N.B. Indexes into ML memory are unsigned. Unlike on the X86 we can't remove the tag by providing a displacement and the only options are to scale by either 1 or 8. *) gen(logicalShiftRight{regN=X1, regD=X1, wordSize=WordSize64, shift=0w1}, cvec); genPopReg(X2, cvec); (* Base address. *) indexToAbsoluteAddress(X2, X2, cvec); gen((if is32in64 then storeRegIndexed32 else storeRegIndexed) {regN=X2, regM=X1, regT=X0, option=ExtUXTX ScaleOrShift}, cvec); (* Don't put the unit result in; it probably isn't needed, *) decsp(); decsp(); decsp() ) | BICStoreOperation { kind=LoadStoreMLByte _, address, value } => ( (* Untag the value and store the byte. *) genMLAddress(address, 1); gencde (value, ToStack, NotEnd, loopAddr); genPopReg(X0, cvec); (* Value to store *) gen(logicalShiftRight{regN=X0, regD=X0, wordSize=WordSize64, shift=0w1}, cvec); (* Untag it *) genPopReg(X1, cvec); (* Index: a tagged value. *) (* Shift right to remove the tag. N.B. Indexes into ML memory are unsigned. Unlike on the X86 we can't remove the tag by providing a displacement and the only options are to scale by either 1 or 8. *) gen(logicalShiftRight{regN=X1, regD=X1, wordSize=WordSize64, shift=0w1}, cvec); genPopReg(X2, cvec); (* Base address. *) indexToAbsoluteAddress(X2, X2, cvec); gen(storeRegIndexedByte{regN=X2, regM=X1, regT=X0, option=ExtUXTX NoScale}, cvec); (* Don't put the unit result in; it probably isn't needed, *) decsp(); decsp(); decsp() ) | BICStoreOperation { kind=LoadStoreC8, address, value} => ( genCAddress(address, 1); gencde (value, ToX0, NotEnd, loopAddr); (* Value to store *) gen(logicalShiftRight{regN=X0, regD=X0, wordSize=WordSize32, shift=0w1}, cvec); (* Untag it *) genPopReg(X1, cvec); (* Index: a tagged value. *) (* Untag. C indexes are signed. *) gen(arithmeticShiftRight{regN=X1, regD=X1, wordSize=WordSize64, shift=0w1}, cvec); genPopReg(X2, cvec); (* Base address as a SysWord.word value. *) indexToAbsoluteAddress(X2, X2, cvec); gen(loadRegScaled{regT=X2, regN=X2, unitOffset=0}, cvec); (* Actual address *) gen(storeRegIndexedByte{regN=X2, regM=X1, regT=X0, option=ExtUXTX NoScale}, cvec); topInX0 := false; decsp(); decsp() ) | BICStoreOperation { kind=LoadStoreC16, address, value} => ( genCAddress(address, 2); gencde (value, ToX0, NotEnd, loopAddr); (* Value to store *) gen(logicalShiftRight{regN=X0, regD=X0, wordSize=WordSize32, shift=0w1}, cvec); (* Untag it *) genPopReg(X1, cvec); (* Index: a tagged value. *) (* Untag. C indexes are signed. *) gen(arithmeticShiftRight{regN=X1, regD=X1, wordSize=WordSize64, shift=0w1}, cvec); genPopReg(X2, cvec); (* Base address as a SysWord.word value. 
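A runnable sketch of the two untagging shifts used throughout these store cases, assuming the 2i+1 tagged representation: ML indexes are unsigned, so a logical shift right removes the tag, while C indexes may be negative, so an arithmetic shift is needed to preserve the sign. The helper names are illustrative.

    fun untagMLIndex (w : Word64.word) = Word64.>>(w, 0w1)    (* logicalShiftRight *)
    fun untagCIndex  (w : Word64.word) = Word64.~>>(w, 0w1)   (* arithmeticShiftRight *)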
*) indexToAbsoluteAddress(X2, X2, cvec); gen(loadRegScaled{regT=X2, regN=X2, unitOffset=0}, cvec); (* Actual address *) gen(storeRegIndexed16{regN=X2, regM=X1, regT=X0, option=ExtUXTX ScaleOrShift}, cvec); topInX0 := false; decsp(); decsp() ) | BICStoreOperation { kind=LoadStoreC32, address, value} => ( genCAddress(address, 4); gencde (value, ToX0, NotEnd, loopAddr); (* Value to store *) gen(logicalShiftRight{regN=X0, regD=X0, wordSize=WordSize64, shift=0w1}, cvec); (* Untag it *) genPopReg(X1, cvec); (* Index: a tagged value. *) (* Untag. C indexes are signed. *) gen(arithmeticShiftRight{regN=X1, regD=X1, wordSize=WordSize64, shift=0w1}, cvec); genPopReg(X2, cvec); (* Base address as a SysWord.word value. *) indexToAbsoluteAddress(X2, X2, cvec); gen(loadRegScaled{regT=X2, regN=X2, unitOffset=0}, cvec); (* Actual address *) gen(storeRegIndexed32{regN=X2, regM=X1, regT=X0, option=ExtUXTX ScaleOrShift}, cvec); topInX0 := false; decsp(); decsp() ) | BICStoreOperation { kind=LoadStoreC64, address, value} => ( genCAddress(address, 8); gencde (value, ToX0, NotEnd, loopAddr); (* Value to store. This is boxed. *) gen(loadRegScaled{regT=X0, regN=X0, unitOffset=0}, cvec); genPopReg(X1, cvec); (* Index: a tagged value. *) (* Untag. C indexes are signed. *) gen(arithmeticShiftRight{regN=X1, regD=X1, wordSize=WordSize64, shift=0w1}, cvec); genPopReg(X2, cvec); (* Base address as a SysWord.word value. *) indexToAbsoluteAddress(X2, X2, cvec); gen(loadRegScaled{regT=X2, regN=X2, unitOffset=0}, cvec); (* Actual address *) gen(storeRegIndexed{regN=X2, regM=X1, regT=X0, option=ExtUXTX ScaleOrShift}, cvec); topInX0 := false; decsp(); decsp() ) | BICStoreOperation { kind=LoadStoreCFloat, address, value} => ( genCAddress(address, 4); gencde (value, ToX0, NotEnd, loopAddr); (* Value to store *) (* This is a boxed double. It needs to be converted to a float. *) gen(loadRegScaledDouble{regT=V0, regN=X0, unitOffset=0}, cvec); (* Untag it *) gen(convertDoubleToFloat{regN=V0, regD=V0}, cvec); genPopReg(X1, cvec); (* Index: a tagged value. *) (* Untag. C indexes are signed. *) gen(arithmeticShiftRight{regN=X1, regD=X1, wordSize=WordSize64, shift=0w1}, cvec); genPopReg(X2, cvec); (* Base address as a SysWord.word value. *) indexToAbsoluteAddress(X2, X2, cvec); gen(loadRegScaled{regT=X2, regN=X2, unitOffset=0}, cvec); (* Actual address *) gen(storeRegIndexedFloat{regN=X2, regM=X1, regT=V0, option=ExtUXTX ScaleOrShift}, cvec); topInX0 := false; decsp(); decsp() ) | BICStoreOperation { kind=LoadStoreCDouble, address, value} => ( genCAddress(address, 8); gencde (value, ToX0, NotEnd, loopAddr); (* Value to store *) (* This is a boxed double. *) gen(loadRegScaledDouble{regT=V0, regN=X0, unitOffset=0}, cvec); (* Untag it *) genPopReg(X1, cvec); (* Index: a tagged value. *) (* Untag. C indexes are signed. *) gen(arithmeticShiftRight{regN=X1, regD=X1, wordSize=WordSize64, shift=0w1}, cvec); genPopReg(X2, cvec); (* Base address as a SysWord.word value. *) indexToAbsoluteAddress(X2, X2, cvec); gen(loadRegScaled{regT=X2, regN=X2, unitOffset=0}, cvec); (* Actual address *) gen(storeRegIndexedDouble{regN=X2, regM=X1, regT=V0, option=ExtUXTX ScaleOrShift}, cvec); topInX0 := false; decsp(); decsp() ) | BICStoreOperation { kind=LoadStoreUntaggedUnsigned, address, value} => ( (* Almost the same as LoadStoreMLWord except that the value to be stored must be untagged before it is stored. This is used primarily to set the length word on a string. 
*) genMLAddress(address, Word.toInt wordSize); gencde (value, ToStack, NotEnd, loopAddr); genPopReg(X0, cvec); (* Value to store *) gen(logicalShiftRight{regN=X0, regD=X0, wordSize=WordSize64, shift=0w1}, cvec); genPopReg(X1, cvec); (* Index: a tagged value. *) (* Shift right to remove the tag. N.B. Indexes into ML memory are unsigned. Unlike on the X86 we can't remove the tag by providing a displacement and the only options are to scale by either 1 or 8. *) gen(logicalShiftRight{regN=X1, regD=X1, wordSize=WordSize64, shift=0w1}, cvec); genPopReg(X2, cvec); (* Base address. *) indexToAbsoluteAddress(X2, X2, cvec); gen((if is32in64 then storeRegIndexed32 else storeRegIndexed) {regN=X2, regM=X1, regT=X0, option=ExtUXTX ScaleOrShift}, cvec); (* Don't put the unit result in; it probably isn't needed, *) decsp(); decsp(); decsp() ) | BICBlockOperation { kind=BlockOpMove{isByteMove}, sourceLeft, destRight, length } => let val exitLabel = createLabel() and loopLabel = createLabel() in genMLAddress(sourceLeft, 1); genMLAddress(destRight, 1); gencde (length, ToX0, NotEnd, loopAddr); (* Length *) genPopReg(X2, cvec); (* Dest index - tagged value. *) genPopReg(X1, cvec); (* Dest base address. *) indexToAbsoluteAddress(X1, X1, cvec); (* Add in the index N.B. ML index values are unsigned. *) gen(addShiftedReg{regM=X2, regN=X1, regD=X1, shift=ShiftLSR 0w1}, cvec); genPopReg(X3, cvec); (* Source index *) genPopReg(X2, cvec); indexToAbsoluteAddress(X2, X2, cvec); gen(addShiftedReg{regM=X3, regN=X2, regD=X2, shift=ShiftLSR 0w1}, cvec); (* Untag the length *) gen(logicalShiftRight{regN=X0, regD=X0, wordSize=WordSize64, shift=0w1}, cvec); (* Test the loop value at the top in case it's already zero. *) compareRegs(X0, X0, cvec); (* Set condition code just in case. *) gen(setLabel loopLabel, cvec); gen(compareBranchZero(X0, WordSize64, exitLabel), cvec); if isByteMove then ( gen(loadRegPostIndexByte{regT=X3, regN=X2, byteOffset=1}, cvec); gen(storeRegPostIndexByte{regT=X3, regN=X1, byteOffset=1}, cvec) ) else if is32in64 then ( gen(loadRegPostIndex32{regT=X3, regN=X2, byteOffset=4}, cvec); gen(storeRegPostIndex32{regT=X3, regN=X1, byteOffset=4}, cvec) ) else ( gen(loadRegPostIndex{regT=X3, regN=X2, byteOffset=8}, cvec); gen(storeRegPostIndex{regT=X3, regN=X1, byteOffset=8}, cvec) ); gen(subImmediate{regN=X0, regD=X0, immed=0w1, shifted=false}, cvec); (* Back to the start. *) gen(unconditionalBranch loopLabel, cvec); gen(setLabel exitLabel, cvec); topInX0 := false; (* X0 does not contain "unit" *) decsp(); decsp(); decsp(); decsp() end | BICBlockOperation { kind=BlockOpEqualByte, sourceLeft, destRight, length } => (* Compare byte vectors for equality - returns a boolean result. *) let val equalLabel = createLabel() in blockCompareBytes(sourceLeft, destRight, length, equalLabel, true); gen(setLabel equalLabel, cvec); (* Set the result condition. *) setBooleanCondition(X0, condEqual, cvec) end | BICBlockOperation { kind=BlockOpCompareByte, sourceLeft, destRight, length } => (* Compare byte vectors for ordering - return tagged -1, 0, +1. *) let val equalLabel = createLabel() and resultLabel = createLabel() in blockCompareBytes(sourceLeft, destRight, length, equalLabel, false); (* We drop through if we have found unequal bytes. *) genList(loadNonAddress(X0, Word64.fromInt(tag 1)), cvec); (* Set X0 to either 1 or -1 depending on whether it's greater or less. 
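A sketch of the three-way result that BlockOpCompareByte returns, assuming the 2n+1 tagged representation: the last unequal byte comparison leaves the flags set, and the drop-through produces tagged 1 or tagged ~1, while the equal path produces tagged 0. The function below is only a model of that mapping.

    fun tag n = 2 * n + 1
    fun compareResult EQUAL   = tag 0    (* the path through equalLabel *)
      | compareResult GREATER = tag 1
      | compareResult LESS    = tag ~1   (* all ones: the inverted XZero *)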
*) gen(conditionalSetInverted{regD=X0, regTrue=X0, regFalse=XZero, cond=condUnsignedHigher}, cvec); gen(unconditionalBranch resultLabel, cvec); gen(setLabel equalLabel, cvec); (* Equal case - set it to zero. *) genList(loadNonAddress(X0, Word64.fromInt(tag 0)), cvec); gen(setLabel resultLabel, cvec) end | BICArbitrary { oper=ArithAdd, shortCond, arg1, arg2, longCall } => let val startLong = createLabel() and resultLabel = createLabel() in (* Check tag bits *) gencde (shortCond, ToX0, NotEnd, loopAddr); gen(subSImmediate{regN=X0, regD=XZero, immed=taggedWord 0w1, shifted=false}, cvec); (* Go to the long case if it's not short. *) gen(conditionalBranch(condNotEqual, startLong), cvec); topInX0 := false; gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); gen(subImmediate{regN=X0, regD=X0, immed=0w1, shifted=false}, cvec); genPopReg(X1, cvec); decsp(); (* Add and set the flag bits *) gen(addSShiftedReg{regN=X1, regM=X0, regD=X0, shift=ShiftNone}, cvec); (* If there's no overflow skip to the end otherwise drop into the call to the RTS. *) gen(conditionalBranch(condNoOverflow, resultLabel), cvec); gen(setLabel startLong, cvec); topInX0 := false; gencde (longCall, ToX0, tailKind, loopAddr); gen(setLabel resultLabel, cvec) end | BICArbitrary { oper=ArithSub, shortCond, arg1, arg2, longCall } => let val startLong = createLabel() and resultLabel = createLabel() in (* Check tag bits *) gencde (shortCond, ToX0, NotEnd, loopAddr); gen(subSImmediate{regN=X0, regD=XZero, immed=taggedWord 0w1, shifted=false}, cvec); (* Go to the long case if it's not short. *) gen(conditionalBranch(condNotEqual, startLong), cvec); topInX0 := false; gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); gen(subImmediate{regN=X0, regD=X0, immed=0w1, shifted=false}, cvec); genPopReg(X1, cvec); decsp(); (* Subtract and set the flag bits *) gen(subSShiftedReg{regN=X1, regM=X0, regD=X0, shift=ShiftNone}, cvec); gen(conditionalBranch(condNoOverflow, resultLabel), cvec); gen(setLabel startLong, cvec); topInX0 := false; gencde (longCall, ToX0, tailKind, loopAddr); gen(setLabel resultLabel, cvec) end | BICArbitrary { oper=ArithMult, longCall, ... } => (* Multiply - Just implement as a call to the long-precision case. *) ( gencde (longCall, whereto, tailKind, loopAddr) ) | BICArbitrary _ => raise InternalError "BICArbitrary: unimplemented operation" in (* body of gencde *) (* This ensures that there is precisely one item on the stack if whereto = ToStack and no items if whereto = NoResult. *) case whereto of ToStack => let val () = ensureX0() val newsp = oldsp + 1; val adjustment = !realstackptr - newsp val () = if adjustment = 0 then () else if adjustment < ~1 then raise InternalError ("gencde: bad adjustment " ^ Int.toString adjustment) (* Hack for declarations that should push values, but don't *) else if adjustment = ~1 then ( genList(loadNonAddress(X0, Word64.fromInt(tag 0)), cvec); genPushReg(X0, cvec) ) else resetStack (adjustment, true, cvec) in realstackptr := newsp end | NoResult => let val () = topInX0 := false val adjustment = !realstackptr - oldsp val () = if adjustment = 0 then () else if adjustment < 0 then raise InternalError ("gencde: bad adjustment " ^ Int.toString adjustment) else resetStack (adjustment, false, cvec) in realstackptr := oldsp end | ToX0 => let (* If we have not pushed anything we have to push a unit result. 
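A runnable sketch of the short-path arithmetic in BICArbitrary, assuming tag n = 2n+1: one operand has its tag bit removed before the operation, so addition does not set the tag twice and subtraction does not lose it.

    fun tag n = 2 * n + 1
    fun shortAdd (a, b) = tag a + (tag b - 1)   (* = tag (a + b) *)
    fun shortSub (a, b) = tag a - (tag b - 1)   (* = tag (a - b) *)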
*) val () = if !topInX0 then () else if !realstackptr = oldsp then genList(loadNonAddress(X0, Word64.fromInt(tag 0)), cvec) else ( genPopReg(X0, cvec); decsp() ) val () = topInX0 := true val adjustment = !realstackptr - oldsp val () = if adjustment = 0 then () else if adjustment < 0 then raise InternalError ("gencde: bad adjustment " ^ Int.toString adjustment) else resetStack (adjustment, false, cvec) in realstackptr := oldsp end end (* gencde *) and genUnary(NotBoolean, arg1, loopAddr) = ( gencde (arg1, ToX0, NotEnd, loopAddr); (* Flip true to false and the reverse. *) gen(bitwiseXorImmediate{wordSize=WordSize32, bits=0w2, regN=X0, regD=X0}, cvec) ) | genUnary(IsTaggedValue, arg1, loopAddr) = ( gencde (arg1, ToX0, NotEnd, loopAddr); gen(testBitPattern(X0, 0w1), cvec); setBooleanCondition(X0, condNotEqual (*Non-zero*), cvec) ) | genUnary(MemoryCellLength, arg1, loopAddr) = ( gencde (arg1, ToX0, NotEnd, loopAddr); indexToAbsoluteAddress(X0, X0, cvec); (* Load the length word. *) if is32in64 then ( gen(loadRegUnscaled32{regT=X0, regN=X0, byteOffset= ~4}, cvec); (* Extract the length, excluding the flag bytes and shift by one bit. *) gen(unsignedBitfieldInsertinZeros {wordSize=WordSize32, lsb=0w1, width=0w24, regN=X0, regD=X0}, cvec) ) else ( gen(loadRegUnscaled{regT=X0, regN=X0, byteOffset= ~8}, cvec); (* Extract the length, excluding the flag bytes and shift by one bit. *) gen(unsignedBitfieldInsertinZeros {wordSize=WordSize64, lsb=0w1, width=0w56, regN=X0, regD=X0}, cvec) ); (* Set the tag bit. *) gen(bitwiseOrImmediate{wordSize=WordSize64, bits=0w1, regN=X0, regD=X0}, cvec) ) | genUnary(MemoryCellFlags, arg1, loopAddr) = ( gencde (arg1, ToX0, NotEnd, loopAddr); indexToAbsoluteAddress(X0, X0, cvec); (* Load the flags byte. *) gen(loadRegUnscaledByte{regT=X0, regN=X0, byteOffset= ~1}, cvec); (* Tag the result. *) gen(logicalShiftLeft{wordSize=WordSize64, shift=0w1, regN=X0, regD=X0}, cvec); gen(bitwiseOrImmediate{wordSize=WordSize64, bits=0w1, regN=X0, regD=X0}, cvec) ) | genUnary(ClearMutableFlag, arg1, loopAddr) = ( gencde (arg1, ToX0, NotEnd, loopAddr); indexToAbsoluteAddress(X0, X0, cvec); gen(loadRegUnscaledByte{regT=X1, regN=X0, byteOffset= ~1}, cvec); gen(bitwiseAndImmediate{wordSize=WordSize32, bits=Word64.xorb(0wxffffffff, 0wx40), regN=X1, regD=X1}, cvec); gen(storeRegUnscaledByte{regT=X1, regN=X0, byteOffset= ~1}, cvec) ) | genUnary(LongWordToTagged, arg1, loopAddr) = ( gencde (arg1, ToX0, NotEnd, loopAddr); indexToAbsoluteAddress(X0, X0, cvec); (* Load the value and tag it. *) gen(loadRegScaled{regT=X0, regN=X0, unitOffset=0}, cvec); (* Tag the result. 
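A runnable check of the NotBoolean trick above: with true held as tagged 1 (3) and false as tagged 0 (1), an exclusive-or with 2 flips one into the other without disturbing the tag bit.

    fun notBoolean (w : Word32.word) = Word32.xorb(w, 0w2)
    val flipped = (notBoolean 0w3, notBoolean 0w1)   (* (0w1, 0w3) *)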
*) gen(logicalShiftLeft{wordSize=WordSize64, shift=0w1, regN=X0, regD=X0}, cvec); gen(bitwiseOrImmediate{wordSize=WordSize64, bits=0w1, regN=X0, regD=X0}, cvec) ) | genUnary(SignedToLongWord, arg1, loopAddr) = ( gencde (arg1, ToX0, NotEnd, loopAddr); gen(arithmeticShiftRight{wordSize=WordSize64, shift=0w1, regN=X0, regD=X1}, cvec); boxLargeWord(X1, cvec) ) | genUnary(UnsignedToLongWord, arg1, loopAddr) = ( gencde (arg1, ToX0, NotEnd, loopAddr); gen(logicalShiftRight{wordSize=WordSize64, shift=0w1, regN=X0, regD=X1}, cvec); boxLargeWord(X1, cvec) ) | genUnary(RealAbs PrecDouble, arg1, loopAddr) = ( gencde (arg1, ToX0, NotEnd, loopAddr); + unboxDouble(X0, X0, V0, cvec); gen(loadRegScaledDouble{regT=V0, regN=X0, unitOffset=0}, cvec); gen(absDouble{regN=V0, regD=V0}, cvec); boxDouble(V0, cvec) ) | genUnary(RealNeg PrecDouble, arg1, loopAddr) = ( gencde (arg1, ToX0, NotEnd, loopAddr); + unboxDouble(X0, X0, V0, cvec); gen(loadRegScaledDouble{regT=V0, regN=X0, unitOffset=0}, cvec); gen(negDouble{regN=V0, regD=V0}, cvec); boxDouble(V0, cvec) ) | genUnary(RealFixedInt PrecDouble, arg1, loopAddr) = ( gencde (arg1, ToX0, NotEnd, loopAddr); (* Shift to remove the tag. *) gen(arithmeticShiftRight{wordSize=WordSize64, shift=0w1, regN=X0, regD=X0}, cvec); gen(convertIntToDouble{regN=X0, regD=V0}, cvec); boxDouble(V0, cvec) ) | genUnary(RealAbs PrecSingle, arg1, loopAddr) = ( gencde (arg1, ToX0, NotEnd, loopAddr); - gen(logicalShiftRight{wordSize=WordSize64, shift=0w32, regN=X0, regD=X0}, cvec); - gen(moveGeneralToFloat{regN=X0, regD=V0}, cvec); + unboxOrUntagSingle(X0, X0, V0, cvec); gen(absFloat{regN=V0, regD=V0}, cvec); - gen(moveFloatToGeneral{regN=V0, regD=X0}, cvec); - gen(logicalShiftLeft{wordSize=WordSize64, shift=0w32, regN=X0, regD=X0}, cvec); - gen(bitwiseOrImmediate{regN=X0, regD=X0, wordSize=WordSize64, bits=0w1}, cvec) + boxOrTagFloat(V0, cvec) ) | genUnary(RealNeg PrecSingle, arg1, loopAddr) = ( gencde (arg1, ToX0, NotEnd, loopAddr); - gen(logicalShiftRight{wordSize=WordSize64, shift=0w32, regN=X0, regD=X0}, cvec); - gen(moveGeneralToFloat{regN=X0, regD=V0}, cvec); + unboxOrUntagSingle(X0, X0, V0, cvec); gen(negFloat{regN=V0, regD=V0}, cvec); - gen(moveFloatToGeneral{regN=V0, regD=X0}, cvec); - gen(logicalShiftLeft{wordSize=WordSize64, shift=0w32, regN=X0, regD=X0}, cvec); - gen(bitwiseOrImmediate{regN=X0, regD=X0, wordSize=WordSize64, bits=0w1}, cvec) + boxOrTagFloat(V0, cvec) ) | genUnary(RealFixedInt PrecSingle, arg1, loopAddr) = ( gencde (arg1, ToX0, NotEnd, loopAddr); - (* Shift to remove the tag. *) + (* Shift to remove the tag. *) gen(arithmeticShiftRight{wordSize=WordSize64, shift=0w1, regN=X0, regD=X0}, cvec); - gen(convertIntToFloat{regN=X0, regD=V0}, cvec); - gen(moveFloatToGeneral{regN=V0, regD=X0}, cvec); - gen(logicalShiftLeft{wordSize=WordSize64, shift=0w32, regN=X0, regD=X0}, cvec); - gen(bitwiseOrImmediate{regN=X0, regD=X0, wordSize=WordSize64, bits=0w1}, cvec) + gen((if is32in64 then convertInt32ToFloat else convertIntToFloat){regN=X0, regD=V0}, cvec); + boxOrTagFloat(V0, cvec) ) | genUnary(FloatToDouble, arg1, loopAddr) = ( gencde (arg1, ToX0, NotEnd, loopAddr); - gen(logicalShiftRight{wordSize=WordSize64, shift=0w32, regN=X0, regD=X0}, cvec); - gen(moveGeneralToFloat{regN=X0, regD=V0}, cvec); + unboxOrUntagSingle(X0, X0, V0, cvec); gen(convertFloatToDouble{regN=V0, regD=V0}, cvec); boxDouble(V0, cvec) ) | genUnary(DoubleToFloat, arg1, loopAddr) = ( gencde (arg1, ToX0, NotEnd, loopAddr); (* Convert double to float using current rounding mode. 
*) + unboxDouble(X0, X0, V0, cvec); gen(loadRegScaledDouble{regT=V0, regN=X0, unitOffset=0}, cvec); gen(convertDoubleToFloat{regN=V0, regD=V0}, cvec); - gen(moveFloatToGeneral{regN=V0, regD=X0}, cvec); - gen(logicalShiftLeft{wordSize=WordSize64, shift=0w32, regN=X0, regD=X0}, cvec); - gen(bitwiseOrImmediate{regN=X0, regD=X0, wordSize=WordSize64, bits=0w1}, cvec) + boxOrTagFloat(V0, cvec) ) | genUnary(RealToInt (PrecDouble, rnding), arg1, loopAddr) = ( gencde (arg1, ToX0, NotEnd, loopAddr); (* We could get an overflow in either the conversion to integer or in the conversion to a tagged value. Fortunately if the conversion detects an overflow it sets the result to a value that will cause an overflow in the addition. *) + unboxDouble(X0, X0, V0, cvec); gen(loadRegScaledDouble{regT=V0, regN=X0, unitOffset=0}, cvec); - gen(convertDoubleToInt rnding {regN=V0, regD=X0}, cvec); - gen(addSShiftedReg{regM=X0, regN=X0, regD=X0, shift=ShiftNone}, cvec); + if is32in64 + then + ( + gen(convertDoubleToInt32 rnding {regN=V0, regD=X0}, cvec); + gen(addSShiftedReg32{regM=X0, regN=X0, regD=X0, shift=ShiftNone}, cvec) + ) + else + ( + gen(convertDoubleToInt rnding {regN=V0, regD=X0}, cvec); + gen(addSShiftedReg{regM=X0, regN=X0, regD=X0, shift=ShiftNone}, cvec) + ); gen(bitwiseOrImmediate{regN=X0, regD=X0, wordSize=WordSize64, bits=0w1}, cvec); checkOverflow cvec ) | genUnary(RealToInt (PrecSingle, rnding), arg1, loopAddr) = ( gencde (arg1, ToX0, NotEnd, loopAddr); - gen(logicalShiftRight{wordSize=WordSize64, shift=0w32, regN=X0, regD=X0}, cvec); - gen(moveGeneralToFloat{regN=X0, regD=V0}, cvec); - gen(convertFloatToInt rnding {regN=V0, regD=X0}, cvec); + unboxOrUntagSingle(X0, X0, V0, cvec); + if is32in64 + then + ( + gen(convertFloatToInt32 rnding {regN=V0, regD=X0}, cvec); + gen(addSShiftedReg32{regM=X0, regN=X0, regD=X0, shift=ShiftNone}, cvec) + ) + else + ( + gen(convertFloatToInt rnding {regN=V0, regD=X0}, cvec); + gen(addSShiftedReg{regM=X0, regN=X0, regD=X0, shift=ShiftNone}, cvec) + ); gen(addSShiftedReg{regM=X0, regN=X0, regD=X0, shift=ShiftNone}, cvec); gen(bitwiseOrImmediate{regN=X0, regD=X0, wordSize=WordSize64, bits=0w1}, cvec); checkOverflow cvec ) | genUnary(TouchAddress, arg1, loopAddr) = ( gencde (arg1, ToX0, NotEnd, loopAddr); topInX0 := false (* Discard this *) ) | genUnary(AllocCStack, arg1, loopAddr) = ( gencde (arg1, ToX0, NotEnd, loopAddr); (* Allocate space on the stack. The higher levels have already aligned the size to a multiple of 16. *) (* Remove the tag and then use add-extended. This can use SP unlike the shifted case. *) gen(logicalShiftRight{wordSize=WordSize64, shift=0w1, regN=X0, regD=X0}, cvec); gen(subExtendedReg{regM=X0, regN=XSP, regD=XSP, extend=ExtUXTX 0w0}, cvec); (* The result is a large-word. We can't box SP directly. We have to use add here to get the SP into X1 instead of the usual move. *) gen(addImmediate{regN=XSP, regD=X1, immed=0w0, shifted=false}, cvec); boxLargeWord(X1, cvec) ) | genUnary(LockMutex, arg1, loopAddr) = (* The earliest versions of the Arm8 do not have the LDADD instruction which will do this directly. To preserve compatibility we use LDAXR/STLXR which require a loop. *) let val loopLabel = createLabel() (* For the moment don't try to use a spin lock. *) in gencde (arg1, ToX0, NotEnd, loopAddr); indexToAbsoluteAddress(X0, X0, cvec); gen(setLabel loopLabel, cvec); (* Get the original value into X1. 
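A model of the tag-and-check idiom used by RealToInt: the converted integer is added to itself with the flags set, so anything outside the tagged range overflows; the tag bit is then or-ed in and checkOverflow raises Overflow if the flags were set. The sketch below collapses those steps into fixed-precision arithmetic that raises Overflow directly.

    fun tagChecked (i : FixedInt.int) =
        FixedInt.+(FixedInt.+(i, i), FixedInt.fromInt 1)   (* 2i + 1, or Overflow *)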
*) gen(loadAcquireExclusiveRegister{regN=X0, regT=X1}, cvec); (* Add and put the result into X2 *) gen(addImmediate{regN=X1, regD=X2, immed=0w1, shifted=false}, cvec); (* Store the result of the addition. W4 will be zero if this succeeded. *) gen(storeReleaseExclusiveRegister{regS=X4, regT=X2, regN=X0}, cvec); gen(compareBranchNonZero(X4, WordSize32, loopLabel), cvec); (* Put in the memory barrier. *) gen(dmbIsh, cvec); (* The result is true if the old value was zero. *) gen(subSImmediate{regN=X1, regD=XZero, immed=0w0, shifted=false}, cvec); setBooleanCondition(X0, condEqual, cvec) end | genUnary(TryLockMutex, arg1, loopAddr) = (* Could use LDUMAXAL to set it the greater of the current value or 1. *) let val loopLabel = createLabel() and exitLabel = createLabel() in gencde (arg1, ToX0, NotEnd, loopAddr); indexToAbsoluteAddress(X0, X0, cvec); gen(setLabel loopLabel, cvec); (* Get the original value into X1. *) gen(loadAcquireExclusiveRegister{regN=X0, regT=X1}, cvec); (* If it is non-zero the lock is already taken. *) gen(compareBranchNonZero(X1, WordSize64, exitLabel), cvec); genList(loadNonAddress(X2, 0w1), cvec); (* Store zero into the memory. W4 will be zero if this succeeded. *) gen(storeReleaseExclusiveRegister{regS=X4, regT=X2, regN=X0}, cvec); gen(compareBranchNonZero(X4, WordSize32, loopLabel), cvec); gen(setLabel exitLabel, cvec); gen(dmbIsh, cvec); (* The result is true if the old value was zero. *) gen(subSImmediate{regN=X1, regD=XZero, immed=0w0, shifted=false}, cvec); setBooleanCondition(X0, condEqual, cvec) end | genUnary(UnlockMutex, arg1, loopAddr) = (* Could use SWAPAL *) let val loopLabel = createLabel() in gencde (arg1, ToX0, NotEnd, loopAddr); indexToAbsoluteAddress(X0, X0, cvec); gen(setLabel loopLabel, cvec); (* Get the original value into X1. *) gen(loadAcquireExclusiveRegister{regN=X0, regT=X1}, cvec); (* Store zero into the memory. W4 will be zero if this succeeded. *) gen(storeReleaseExclusiveRegister{regS=X4, regT=XZero, regN=X0}, cvec); gen(compareBranchNonZero(X4, WordSize32, loopLabel), cvec); (* Put in the memory barrier. *) gen(dmbIsh, cvec); (* The result is true if the old value was 1. *) gen(subSImmediate{regN=X1, regD=XZero, immed=0w1, shifted=false}, cvec); setBooleanCondition(X0, condEqual, cvec) end and genBinary(WordComparison{test, isSigned}, arg1, arg2, loopAddr) = let val cond = case (test, isSigned) of (TestEqual, _) => condEqual | (TestLess, true) => condSignedLess | (TestLessEqual, true) => condSignedLessEq | (TestGreater, true) => condSignedGreater | (TestGreaterEqual, true) => condSignedGreaterEq | (TestLess, false) => condCarryClear | (TestLessEqual, false) => condUnsignedLowOrEq | (TestGreater, false) => condUnsignedHigher | (TestGreaterEqual, false) => condCarrySet | (TestUnordered, _) => raise InternalError "WordComparison: TestUnordered" in gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); genPopReg(X1, cvec); (* First argument. *) comparePolyRegs(X1, X0, cvec); setBooleanCondition(X0, cond, cvec) end | genBinary(PointerEq, arg1, arg2, loopAddr) = ( gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); genPopReg(X1, cvec); (* First argument. *) comparePolyRegs(X1, X0, cvec); setBooleanCondition(X0, condEqual, cvec) ) | genBinary(FixedPrecisionArith ArithAdd, arg1, arg2, loopAddr) = ( gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); (* Subtract the tag bit. 
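A purely functional model of the visible effect of the LockMutex loop above: the counter is incremented atomically and the operation reports success only when the previous value was zero, that is, when nobody already held the lock.

    fun lockMutex previousCount = (previousCount + 1, previousCount = 0)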
*) gen(subImmediate{regN=X0, regD=X0, immed=0w1, shifted=false}, cvec); genPopReg(X1, cvec); (* Add and set the flag bits *) gen((if is32in64 then addSShiftedReg32 else addSShiftedReg) {regN=X1, regM=X0, regD=X0, shift=ShiftNone}, cvec); checkOverflow cvec ) | genBinary(FixedPrecisionArith ArithSub, arg1, arg2, loopAddr) = ( gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); (* Subtract the tag bit. *) gen(subImmediate{regN=X0, regD=X0, immed=0w1, shifted=false}, cvec); genPopReg(X1, cvec); (* Subtract and set the flag bits *) gen((if is32in64 then subSShiftedReg32 else subSShiftedReg) {regN=X1, regM=X0, regD=X0, shift=ShiftNone}, cvec); checkOverflow cvec ) | genBinary(FixedPrecisionArith ArithMult, arg1, arg2, loopAddr) = let (* There's no simple way of detecting overflow. We have to compute the high-order word and then check that it is either all zeros with the sign bit zero or all ones with the sign bit one. *) val noOverflow = createLabel() val _ = raise Fallback "multiply" in gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); (* Compute the result in the same way as for Word.* apart from the arithmetic shift. *) genPopReg(X1, cvec); (* Shift to remove the tags on one argument suing . *) gen(arithmeticShiftRight{regN=X0, regD=X2, wordSize=WordSize64, shift=0w1}, cvec); (* Remove the tag on the other. *) gen(bitwiseAndImmediate{regN=X1, regD=X1, wordSize=WordSize64, bits=tagBitMask}, cvec); gen(multiplyAndAdd{regM=X1, regN=X2, regA=XZero, regD=X0}, cvec); (* Put back the tag. *) gen(bitwiseOrImmediate{regN=X0, regD=X0, wordSize=WordSize64, bits=0w1}, cvec); (* Compute the high order part into X2 *) gen(signedMultiplyHigh{regM=X1, regN=X2, regD=X2}, cvec); (* Compare with the sign bit of the result. *) gen(subSShiftedReg{regD=XZero, regN=X2, regM=X0, shift=ShiftASR 0w63}, cvec); gen(conditionalBranch(condEqual, noOverflow), cvec); gen(loadAddressConstant(X0, toMachineWord Overflow), cvec); gen(loadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset}, cvec); gen(loadRegScaled{regT=X1, regN=X_MLStackPtr, unitOffset=0}, cvec); gen(branchRegister X1, cvec); gen(setLabel noOverflow, cvec) end | genBinary(FixedPrecisionArith ArithQuot, arg1, arg2, loopAddr) = ( raise Fallback "quot"; gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); (* The word version avoids an extra shift. Don't do that here at least for the moment. Division by zero and overflow are checked for at the higher level. *) genPopReg(X1, cvec); (* Shift to remove the tags on the arguments *) gen(arithmeticShiftRight{regN=X0, regD=X0, wordSize=WordSize64, shift=0w1}, cvec); gen(arithmeticShiftRight{regN=X1, regD=X1, wordSize=WordSize64, shift=0w1}, cvec); gen(signedDivide{regM=X0, regN=X1, regD=X0}, cvec); (* Restore the tag. *) gen(logicalShiftLeft{regN=X0, regD=X0, wordSize=WordSize64, shift=0w1}, cvec); gen(bitwiseOrImmediate{regN=X0, regD=X0, wordSize=WordSize64, bits=0w1}, cvec) ) | genBinary(FixedPrecisionArith ArithRem, arg1, arg2, loopAddr) = ( raise Fallback "rem"; gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); (* For the moment we remove the tags and then retag afterwards. The word version avoids this but at least for the moment we do it the longer way. *) (* There's no direct way to get the remainder - have to use divide and multiply. 
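A model of the overflow test used for fixed-precision multiplication: comparing the signed high word of the product with the low word shifted right 63 places asks whether the high word is a pure sign extension, which, ignoring the tag adjustment, is the same as asking whether the full product fits in a signed 64-bit value.

    fun fitsInSigned64 (a : IntInf.int, b : IntInf.int) =
        let val p = a * b
        in p >= ~0x8000000000000000 andalso p <= 0x7FFFFFFFFFFFFFFF end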
*) genPopReg(X1, cvec); (* Shift to remove the tags on the arguments *) gen(arithmeticShiftRight{regN=X0, regD=X0, wordSize=WordSize64, shift=0w1}, cvec); gen(arithmeticShiftRight{regN=X1, regD=X1, wordSize=WordSize64, shift=0w1}, cvec); gen(signedDivide{regM=X0, regN=X1, regD=X2}, cvec); (* X0 = X1 - (X2/X0)*X0 *) gen(multiplyAndSub{regM=X2, regN=X0, regA=X1, regD=X0}, cvec); (* Restore the tag. *) gen(logicalShiftLeft{regN=X0, regD=X0, wordSize=WordSize64, shift=0w1}, cvec); gen(bitwiseOrImmediate{regN=X0, regD=X0, wordSize=WordSize64, bits=0w1}, cvec) ) | genBinary(FixedPrecisionArith ArithDiv, _, _, _) = raise InternalError "unimplemented operation: FixedPrecisionArith ArithDiv" | genBinary(FixedPrecisionArith ArithMod, _, _, _) = raise InternalError "unimplemented operation: FixedPrecisionArith ArithMod" | genBinary(WordArith ArithAdd, arg1, arg2, loopAddr) = ( gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); (* Subtract the tag bit. *) gen(subImmediate{regN=X0, regD=X0, immed=0w1, shifted=false}, cvec); genPopReg(X1, cvec); gen((if is32in64 then addShiftedReg32 else addShiftedReg){regN=X1, regM=X0, regD=X0, shift=ShiftNone}, cvec) ) | genBinary(WordArith ArithSub, arg1, arg2, loopAddr) = ( gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); (* Subtract the tag bit. *) gen(subImmediate{regN=X0, regD=X0, immed=0w1, shifted=false}, cvec); genPopReg(X1, cvec); gen((if is32in64 then subShiftedReg32 else subShiftedReg){regN=X1, regM=X0, regD=X0, shift=ShiftNone}, cvec) ) | genBinary(WordArith ArithMult, arg1, arg2, loopAddr) = ( raise Fallback "mult"; gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); genPopReg(X1, cvec); (* Shift to remove the tags on one argument. *) gen(logicalShiftRight{regN=X0, regD=X0, wordSize=WordSize64, shift=0w1}, cvec); (* Remove the tag on the other. *) gen(bitwiseAndImmediate{regN=X1, regD=X1, wordSize=WordSize64, bits=tagBitMask}, cvec); gen(multiplyAndAdd{regM=X1, regN=X0, regA=XZero, regD=X0}, cvec); (* Put back the tag. *) gen(bitwiseOrImmediate{regN=X0, regD=X0, wordSize=WordSize64, bits=0w1}, cvec) ) | genBinary(WordArith ArithDiv, arg1, arg2, loopAddr) = ( raise Fallback "div"; gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); genPopReg(X1, cvec); (* Shift to remove the tag on the divisor *) gen(logicalShiftRight{regN=X0, regD=X0, wordSize=WordSize64, shift=0w1}, cvec); (* Untag but don't shift the dividend. *) gen(bitwiseAndImmediate{regN=X1, regD=X1, wordSize=WordSize64, bits=tagBitMask}, cvec); gen(unsignedDivide{regM=X0, regN=X1, regD=X0}, cvec); (* Restore the tag: Note: it may already be set depending on the result of the division. *) gen(bitwiseOrImmediate{regN=X0, regD=X0, wordSize=WordSize64, bits=0w1}, cvec) ) | genBinary(WordArith ArithMod, arg1, arg2, loopAddr) = ( raise Fallback "mod"; gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); (* There's no direct way to get the remainder - have to use divide and multiply. *) genPopReg(X1, cvec); (* Shift to remove the tag on the divisor *) gen(logicalShiftRight{regN=X0, regD=X0, wordSize=WordSize64, shift=0w1}, cvec); (* Untag but don't shift the dividend. *) gen(bitwiseAndImmediate{regN=X1, regD=X2, wordSize=WordSize64, bits=tagBitMask}, cvec); gen(unsignedDivide{regM=X0, regN=X2, regD=X2}, cvec); (* Clear the bottom bit before the multiplication. 
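The dividend was untagged without shifting, so it is 2a while the divisor b has been shifted; the quotient (2a) div b is 2*(a div b), possibly plus one, and clearing the bottom bit leaves exactly 2*(a div b). Subtracting 2*(a div b)*b from the original tagged dividend 2a+1 then gives 2*(a mod b)+1.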
*) gen(bitwiseAndImmediate{regN=X2, regD=X2, wordSize=WordSize64, bits=tagBitMask}, cvec); (* X0 = X1 - (X2/X0)*X0 *) gen(multiplyAndSub{regM=X2, regN=X0, regA=X1, regD=X0}, cvec) (* Because we're subtracting from the original, tagged, dividend the result is tagged. *) ) | genBinary(WordArith _, _, _, _) = raise InternalError "WordArith - unimplemented instruction" | genBinary(WordLogical LogicalAnd, arg1, arg2, loopAddr) = ( gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); genPopReg(X1, cvec); (* Since they're both tagged the tag bit is preserved. *) gen(andShiftedReg{regN=X1, regM=X0, regD=X0, shift=ShiftNone}, cvec) ) | genBinary(WordLogical LogicalOr, arg1, arg2, loopAddr) = ( gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); genPopReg(X1, cvec); (* Since they're both tagged the tag bit is preserved. *) gen(orrShiftedReg{regN=X1, regM=X0, regD=X0, shift=ShiftNone}, cvec) ) | genBinary(WordLogical LogicalXor, arg1, arg2, loopAddr) = ( gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); genPopReg(X1, cvec); (* Have to restore the tag bit because that will be cleared. *) gen(eorShiftedReg{regN=X1, regM=X0, regD=X0, shift=ShiftNone}, cvec); gen(bitwiseOrImmediate{regN=X0, regD=X0, wordSize=WordSize64, bits=0w1}, cvec) ) (* Shifts: ARM64 shifts are taken modulo the word length but that's dealt with at a higher level. *) | genBinary(WordShift ShiftLeft, arg1, arg2, loopAddr) = ( gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); genPopReg(X1, cvec); (* Remove the tag from value we're shifting. *) gen(bitwiseAndImmediate{regN=X1, regD=X1, wordSize=WordSize64, bits=tagBitMask}, cvec); (* Untag the shift amount. Can use 32-bit op here. *) gen(logicalShiftRight{regN=X0, regD=X0, wordSize=WordSize32, shift=0w1}, cvec); gen((if is32in64 then logicalShiftLeftVariable32 else logicalShiftLeftVariable) {regM=X0, regN=X1, regD=X0}, cvec); (* Put back the tag. *) gen(bitwiseOrImmediate{regN=X0, regD=X0, wordSize=WordSize64, bits=0w1}, cvec) ) | genBinary(WordShift ShiftRightLogical, arg1, arg2, loopAddr) = ( gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); genPopReg(X1, cvec); (* Don't need to remove the tag. *) (* Untag the shift amount. Can use 32-bit op here. *) gen(logicalShiftRight{regN=X0, regD=X0, wordSize=WordSize32, shift=0w1}, cvec); gen(logicalShiftRightVariable{regM=X0, regN=X1, regD=X0}, cvec); (* Put back the tag. *) gen(bitwiseOrImmediate{regN=X0, regD=X0, wordSize=WordSize64, bits=0w1}, cvec) ) | genBinary(WordShift ShiftRightArithmetic, arg1, arg2, loopAddr) = ( gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); genPopReg(X1, cvec); (* Don't need to remove the tag. *) (* Untag the shift amount. Can use 32-bit op here. *) gen(logicalShiftRight{regN=X0, regD=X0, wordSize=WordSize32, shift=0w1}, cvec); gen(arithmeticShiftRightVariable{regM=X0, regN=X1, regD=X0}, cvec); (* Put back the tag. *) gen(bitwiseOrImmediate{regN=X0, regD=X0, wordSize=WordSize64, bits=0w1}, cvec) ) | genBinary(AllocateByteMemory, arg1, arg2, loopAddr) = (* Allocate memory for byte data. Unlike for word data it is not necessary to initialise it before any further allocation provided it has the mutable bit set. *) ( gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); (* Load and untag the size and flags. 
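The flags are in X0 and are a small tagged value, so a 32-bit shift is enough to untag them; the size is popped into X1 and untagged with a 64-bit shift.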
The size is the number of words even though this is byte data. *) gen(logicalShiftRight{regN=X0, regD=X0, wordSize=WordSize32 (*byte*), shift=0w1}, cvec); genPopReg(X1, cvec); gen(logicalShiftRight{regN=X1, regD=X1, wordSize=WordSize64, shift=0w1}, cvec); allocateVariableSize({sizeReg=X1, flagsReg=X0, resultReg=X2}, cvec); gen(moveRegToReg{sReg=X2, dReg=X0}, cvec); absoluteAddressToIndex(X0, cvec) ) | genBinary(LargeWordComparison test, arg1, arg2, loopAddr) = let val cond = case test of TestEqual => condEqual | TestLess => condCarryClear | TestLessEqual => condUnsignedLowOrEq | TestGreater => condUnsignedHigher | TestGreaterEqual => condCarrySet | TestUnordered => raise InternalError "LargeWordComparison: TestUnordered" in gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); (* The values are boxed so have to be loaded first. *) genPopReg(X1, cvec); unboxLargeWord(X0, X0, cvec); unboxLargeWord(X1, X1, cvec); compareRegs(X1, X0, cvec); setBooleanCondition(X0, cond, cvec) end | genBinary(LargeWordArith ArithAdd, arg1, arg2, loopAddr) = ( gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); genPopReg(X1, cvec); unboxLargeWord(X0, X0, cvec); unboxLargeWord(X1, X1, cvec); gen(addShiftedReg{regN=X1, regM=X0, regD=X1, shift=ShiftNone}, cvec); boxLargeWord(X1, cvec) ) | genBinary(LargeWordArith ArithSub, arg1, arg2, loopAddr) = ( gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); genPopReg(X1, cvec); unboxLargeWord(X0, X0, cvec); unboxLargeWord(X1, X1, cvec); gen(subShiftedReg{regN=X1, regM=X0, regD=X1, shift=ShiftNone}, cvec); boxLargeWord(X1, cvec) ) | genBinary(LargeWordArith ArithMult, arg1, arg2, loopAddr) = ( gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); genPopReg(X1, cvec); unboxLargeWord(X0, X0, cvec); unboxLargeWord(X1, X1, cvec); gen(multiplyAndAdd{regM=X1, regN=X0, regA=XZero, regD=X1}, cvec); boxLargeWord(X1, cvec) ) | genBinary(LargeWordArith ArithDiv, arg1, arg2, loopAddr) = ( gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); genPopReg(X1, cvec); unboxLargeWord(X0, X0, cvec); unboxLargeWord(X1, X1, cvec); gen(unsignedDivide{regM=X0, regN=X1, regD=X1}, cvec); boxLargeWord(X1, cvec) ) | genBinary(LargeWordArith ArithMod, arg1, arg2, loopAddr) = ( gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); genPopReg(X1, cvec); unboxLargeWord(X0, X0, cvec); unboxLargeWord(X1, X1, cvec); gen(unsignedDivide{regM=X0, regN=X1, regD=X2}, cvec); gen(multiplyAndSub{regM=X2, regN=X0, regA=X1, regD=X1}, cvec); boxLargeWord(X1, cvec) ) | genBinary(LargeWordArith _, _, _, _) = raise InternalError "LargeWordArith - unimplemented instruction" | genBinary(LargeWordLogical LogicalAnd, arg1, arg2, loopAddr) = ( gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); genPopReg(X1, cvec); unboxLargeWord(X0, X0, cvec); unboxLargeWord(X1, X1, cvec); gen(andShiftedReg{regN=X1, regM=X0, regD=X1, shift=ShiftNone}, cvec); boxLargeWord(X1, cvec) ) | genBinary(LargeWordLogical LogicalOr, arg1, arg2, loopAddr) = ( gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); genPopReg(X1, cvec); unboxLargeWord(X0, X0, cvec); unboxLargeWord(X1, X1, cvec); gen(orrShiftedReg{regN=X1, regM=X0, regD=X1, shift=ShiftNone}, cvec); boxLargeWord(X1, cvec) ) | genBinary(LargeWordLogical LogicalXor, arg1, arg2, loopAddr) = ( gencde (arg1, 
ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); genPopReg(X1, cvec); unboxLargeWord(X0, X0, cvec); unboxLargeWord(X1, X1, cvec); gen(eorShiftedReg{regN=X1, regM=X0, regD=X1, shift=ShiftNone}, cvec); boxLargeWord(X1, cvec) ) (* The shift is always a Word.word value i.e. tagged. There is a check at the higher level that the shift does not exceed 32/64 bits. *) | genBinary(LargeWordShift ShiftLeft, arg1, arg2, loopAddr) = ( gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); genPopReg(X1, cvec); unboxLargeWord(X1, X1, cvec); (* Untag the shift amount. Can use 32-bit op here. *) gen(logicalShiftRight{regN=X0, regD=X0, wordSize=WordSize32, shift=0w1}, cvec); gen(logicalShiftLeftVariable{regM=X0, regN=X1, regD=X1}, cvec); boxLargeWord(X1, cvec) ) | genBinary(LargeWordShift ShiftRightLogical, arg1, arg2, loopAddr) = ( gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); genPopReg(X1, cvec); unboxLargeWord(X1, X1, cvec); (* Untag the shift amount. Can use 32-bit op here. *) gen(logicalShiftRight{regN=X0, regD=X0, wordSize=WordSize32, shift=0w1}, cvec); gen(logicalShiftRightVariable{regM=X0, regN=X1, regD=X1}, cvec); boxLargeWord(X1, cvec) ) | genBinary(LargeWordShift ShiftRightArithmetic, arg1, arg2, loopAddr) = ( gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); genPopReg(X1, cvec); unboxLargeWord(X1, X1, cvec); (* Untag the shift amount. Can use 32-bit op here. *) gen(logicalShiftRight{regN=X0, regD=X0, wordSize=WordSize32, shift=0w1}, cvec); gen(arithmeticShiftRightVariable{regM=X0, regN=X1, regD=X1}, cvec); boxLargeWord(X1, cvec) ) (* Floating point comparisons. The fcmp instruction differs from integer comparison. If either argument is a NaN the result is "unordered": the C and V flags are set and N and Z are cleared. To get a true result only when neither value is a NaN we therefore have to choose conditions that are false in that flag state. We use unsigned tests for < and <= and signed tests for > and >=.
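For reference, fcmp sets NZCV to 0110 for equal, 1000 for less, 0010 for greater and 0011 for unordered, so each of the conditions chosen below is false when either operand is a NaN, apart from condOverflow which is used to implement TestUnordered.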
*) | genBinary(RealComparison (test, PrecDouble), arg1, arg2, loopAddr) = let val cond = case test of TestEqual => condEqual | TestLess => condCarryClear | TestLessEqual => condUnsignedLowOrEq | TestGreater => condSignedGreater | TestGreaterEqual => condSignedGreaterEq | TestUnordered => condOverflow in gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); genPopReg(X1, cvec); unboxDouble(X0, X0, V0, cvec); unboxDouble(X1, X1, V1, cvec); gen(compareDouble{regM=V0, regN=V1}, cvec); setBooleanCondition(X0, cond, cvec) end | genBinary(RealComparison (test, PrecSingle), arg1, arg2, loopAddr) = let val cond = case test of TestEqual => condEqual | TestLess => condCarryClear | TestLessEqual => condUnsignedLowOrEq | TestGreater => condSignedGreater | TestGreaterEqual => condSignedGreaterEq | TestUnordered => condOverflow in gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); genPopReg(X1, cvec); unboxOrUntagSingle(X0, X0, V0, cvec); unboxOrUntagSingle(X1, X1, V1, cvec); gen(compareFloat{regM=V0, regN=V1}, cvec); setBooleanCondition(X0, cond, cvec) end | genBinary(RealArith (oper, PrecDouble), arg1, arg2, loopAddr) = let val operation = case oper of ArithAdd => addDouble | ArithSub => subtractDouble | ArithMult => multiplyDouble | ArithDiv => divideDouble | _ => raise InternalError "RealArith - unimplemented instruction" in gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); genPopReg(X1, cvec); unboxDouble(X0, X0, V0, cvec); unboxDouble(X1, X1, V1, cvec); gen(operation{regM=V0, regN=V1, regD=V0}, cvec); boxDouble(V0, cvec) end | genBinary(RealArith (oper, PrecSingle), arg1, arg2, loopAddr) = let val operation = case oper of ArithAdd => addFloat | ArithSub => subtractFloat | ArithMult => multiplyFloat | ArithDiv => divideFloat | _ => raise InternalError "RealArith - unimplemented instruction" in (* 32-bit floats are represented as the value in the top 32-bits of a general register with the low-order word containing all zeros except the bottom bit which is one. *) gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); genPopReg(X1, cvec); unboxOrUntagSingle(X0, X0, V0, cvec); unboxOrUntagSingle(X1, X1, V1, cvec); gen(operation{regM=V0, regN=V1, regD=V0}, cvec); boxOrTagFloat(V0, cvec) end | genBinary(FreeCStack, arg1, arg2, loopAddr) = (* Free space on the C stack. This is a binary operation that takes the base address and the size. The base address isn't used in this version. *) ( gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); genPopReg(X1, cvec); (* Pop and discard the address *) (* Can't use the shifted addition which would remove the tag as part of the add. *) gen(logicalShiftRight{wordSize=WordSize64, shift=0w1, regN=X0, regD=X0}, cvec); gen(addExtendedReg{regM=X0, regN=XSP, regD=XSP, extend=ExtUXTX 0w0}, cvec) ) (* doNext is only used for mutually recursive functions where a function may not be able to fill in its closure if it does not have all the remaining declarations. *) (* TODO: This always creates the closure on the heap even when makeClosure is false. *) and genProc ({ closure=[], localCount, body, argTypes, name, ...}: bicLambdaForm, mutualDecs, doNext: unit -> unit) : unit = let (* Create a one word item for the closure. This is returned for recursive references and filled in with the address of the code when we've finished. *) val closure = makeConstantClosure() (* Code-gen function. 
No non-local references. *) val () = codegen (body, name, closure, List.length argTypes, localCount, parameters); val () = gen(loadAddressConstant(X0, closureAsAddress closure), cvec) val () = genPushReg(X0, cvec) val () = incsp(); in if mutualDecs then doNext () else () end | genProc ({ localCount, body, name, argTypes, closure, ...}, mutualDecs, doNext) = let (* Full closure required. *) val resClosure = makeConstantClosure() (* Code-gen function. *) val () = codegen (body, name, resClosure, List.length argTypes, localCount, parameters) val closureVars = List.length closure (* Size excluding the code address *) (* The first native word is the code address. *) val firstEntry = Word.toInt(nativeWordSize div wordSize) in if mutualDecs then let (* Have to make the closure now and fill it in later. *) - val () = genAllocateFixedSize(closureVars+firstEntry, F_mutable, X0, X1, cvec) + val () = genAllocateFixedSize(closureVars+firstEntry, Word8.orb(F_mutable, F_closure), X0, X1, cvec) val () = if is32in64 then (* Have to get the code address at run-time. *) ( gen(loadAddressConstant(X1, toMachineWord resClosure), cvec); indexToAbsoluteAddress(X1, X1, cvec); gen(loadRegScaled{regT=X1, regN=X1, unitOffset=0}, cvec) ) else gen(loadAddressConstant(X1, codeAddressFromClosure resClosure), cvec) val () = gen(storeRegScaled{regT=X1, regN=X0, unitOffset=0}, cvec) val () = absoluteAddressToIndex(X0, cvec) val () = genPushReg(X0, cvec) val () = incsp () val entryAddr : int = !realstackptr (* Set the address of this entry in the declaration table and then process any other mutual-recursive functions. *) val () = doNext () (* Reload the address of the vector - If we have processed other closures the closure will no longer be on the top of the stack. *) val () = genList(loadScaledWord{dest=X1, base=X_MLStackPtr, work=X16, offset= !realstackptr - entryAddr}, cvec) val () = indexToAbsoluteAddress(X1, X1, cvec) (* Load items for the closure. *) fun loadItems ([], _) = () | loadItems (v :: vs, addr : int) = ( (* Generate an item and move it into the closure *) gencde (BICExtract v, ToX0, NotEnd, NONE); (* The closure "address" excludes the code address. *) genList(storeScaledPolyWord{store=X0, base=X1, work=X16, offset=addr+firstEntry}, cvec); topInX0 := false; loadItems (vs, addr + 1) ) val () = loadItems (closure, 0) - (* Lock it by setting the top byte to zero. *) - val () = gen(storeRegUnscaledByte{regT=XZero, regN=X1, byteOffset= ~1}, cvec) + (* Lock it by setting the top byte to the zero or the closure bit. *) + val () = + if is32in64 + then + ( + genList(loadNonAddress(X16, Word64.fromLargeWord(Word8.toLargeWord F_closure)), cvec); + gen(storeRegUnscaledByte{regT=X16, regN=X1, byteOffset= ~1}, cvec) + ) + else gen(storeRegUnscaledByte{regT=XZero, regN=X1, byteOffset= ~1}, cvec) in () (* Don't need to do anything now. *) end else let val () = List.app (fn pt => gencde (BICExtract pt, ToStack, NotEnd, NONE)) closure in - genAllocateFixedSize(closureVars+firstEntry, 0w0, X0, X1, cvec); + genAllocateFixedSize(closureVars+firstEntry, if is32in64 then F_closure else 0w0, X0, X1, cvec); List.foldl(fn (_, w) => ( genPopReg(X1, cvec); genList(storeScaledPolyWord{store=X1, base=X0, work=X16, offset=w-1}, cvec); w-1 ) ) (closureVars+firstEntry) closure; if is32in64 then (* Have to get the code address at run-time. 
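In 32-in-64 mode the constant we can embed is only the compressed (index) address of the closure object, so it must be converted to an absolute address and the code address loaded from its first word at run time; in native 64-bit mode the code address itself can be loaded as a constant.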
*) ( gen(loadAddressConstant(X1, toMachineWord resClosure), cvec); indexToAbsoluteAddress(X1, X1, cvec); gen(loadRegScaled{regT=X1, regN=X1, unitOffset=0}, cvec) ) else gen(loadAddressConstant(X1, codeAddressFromClosure resClosure), cvec); gen(storeRegScaled{regT=X1, regN=X0, unitOffset=0}, cvec); + absoluteAddressToIndex(X0, cvec); genPushReg(X0, cvec); realstackptr := !realstackptr - closureVars + 1 (* Popped the closure vars and pushed the address. *) end end and genCond (testCode, thenCode, elseCode, whereto, tailKind, loopAddr) = let val toElse = createLabel() and exitJump = createLabel() val () = genTest(testCode, false, toElse, loopAddr) val () = gencde (thenCode, whereto, tailKind, loopAddr) (* Get rid of the result from the stack. If there is a result then the ``else-part'' will push it. *) val () = case whereto of ToStack => decsp () | NoResult => () | ToX0 => () val () = topInX0 := false val () = gen(unconditionalBranch exitJump, cvec) (* start of "else part" *) val () = gen(setLabel toElse, cvec) val () = gencde (elseCode, whereto, tailKind, loopAddr) val () = gen(setLabel exitJump, cvec) in () end (* genCond *) (* andalso and orelse are turned into conditionals with constants. Convert this into a series of tests. *) and genTest(BICConstnt(w, _), jumpOn, targetLabel, _) = let val cVal = case toShort w of 0w0 => false | 0w1 => true | _ => raise InternalError "genTest" in if cVal = jumpOn then gen(unconditionalBranch targetLabel, cvec) else () end | genTest(BICUnary { oper=NotBoolean, arg1 }, jumpOn, targetLabel, loopAddr) = genTest(arg1, not jumpOn, targetLabel, loopAddr) | genTest(BICUnary { oper=IsTaggedValue, arg1 }, jumpOn, targetLabel, loopAddr) = ( gencde (arg1, ToX0, NotEnd, loopAddr); topInX0 := false; gen((if jumpOn then testBitBranchNonZero else testBitBranchZero) (X0, 0w0, targetLabel), cvec) ) | genTest(BICBinary{oper=WordComparison{test, isSigned}, arg1, arg2}, jumpOn, targetLabel, loopAddr) = let val (cond, condNot) = case (test, isSigned) of (TestEqual, _) => (condEqual, condNotEqual) | (TestLess, true) => (condSignedLess, condSignedGreaterEq) | (TestLessEqual, true) => (condSignedLessEq, condSignedGreater) | (TestGreater, true) => (condSignedGreater, condSignedLessEq) | (TestGreaterEqual, true) => (condSignedGreaterEq, condSignedLess) | (TestLess, false) => (condCarryClear, condCarrySet) | (TestLessEqual, false) => (condUnsignedLowOrEq, condUnsignedHigher) | (TestGreater, false) => (condUnsignedHigher, condUnsignedLowOrEq) | (TestGreaterEqual, false) => (condCarrySet, condCarryClear) | (TestUnordered, _) => raise InternalError "WordComparison: TestUnordered" in gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); topInX0 := false; genPopReg(X1, cvec); (* First argument. *) comparePolyRegs(X1, X0, cvec); gen(conditionalBranch(if jumpOn then cond else condNot, targetLabel), cvec) end | genTest(BICBinary{oper=PointerEq, arg1, arg2}, jumpOn, targetLabel, loopAddr) = ( gencde (arg1, ToStack, NotEnd, loopAddr); gencde (arg2, ToX0, NotEnd, loopAddr); decsp(); topInX0 := false; genPopReg(X1, cvec); (* First argument. *) compareRegs(X1, X0, cvec); gen(conditionalBranch(if jumpOn then condEqual else condNotEqual, targetLabel), cvec) ) | genTest(BICTagTest { test, tag=tagValue, ... 
}, jumpOn, targetLabel, loopAddr) = ( gencde (test, ToX0, NotEnd, loopAddr); topInX0 := false; gen(subSImmediate{regN=X0, regD=XZero, immed=taggedWord tagValue, shifted=false}, cvec); gen(conditionalBranch(if jumpOn then condEqual else condNotEqual, targetLabel), cvec) ) | genTest(BICCond (testPart, thenPart, elsePart), jumpOn, targetLabel, loopAddr) = let val toElse = createLabel() and exitJump = createLabel() in genTest(testPart, false, toElse, loopAddr); genTest(thenPart, jumpOn, targetLabel, loopAddr); gen(unconditionalBranch exitJump, cvec); gen(setLabel toElse, cvec); genTest(elsePart, jumpOn, targetLabel, loopAddr); gen(setLabel exitJump, cvec) end | genTest(testCode, jumpOn, targetLabel, loopAddr) = ( gencde (testCode, ToStack, NotEnd, loopAddr); genPopReg(X0, cvec); gen(subSImmediate{regN=X0, regD=XZero, immed=taggedWord 0w1, shifted=false}, cvec); gen(conditionalBranch(if jumpOn then condEqual else condNotEqual, targetLabel), cvec); decsp() (* conditional branch pops a value. *) ) and genEval (eval, tailKind : tail) : unit = let val argList : backendIC list = List.map #1 (#argList eval) val argsToPass : int = List.length argList; (* Load arguments *) fun loadArgs [] = () | loadArgs (v :: vs) = let (* Push each expression onto the stack. *) val () = gencde(v, ToStack, NotEnd, NONE) in loadArgs vs end; (* Have to guarantee that the expression to return the function is evaluated before the arguments. *) (* Returns true if evaluating it later is safe. *) fun safeToLeave (BICConstnt _) = true | safeToLeave (BICLambda _) = true | safeToLeave (BICExtract _) = true | safeToLeave (BICField {base, ...}) = safeToLeave base | safeToLeave (BICLoadContainer {base, ...}) = safeToLeave base | safeToLeave _ = false val () = if (case argList of [] => true | _ => safeToLeave (#function eval)) then let (* Can load the args first. *) val () = loadArgs argList in gencde (#function eval, ToStack, NotEnd, NONE) end else let (* The expression for the function is too complicated to risk leaving. It might have a side-effect and we must ensure that any side-effects it has are done before the arguments are loaded. *) val () = gencde(#function eval, ToStack, NotEnd, NONE); val () = loadArgs(argList); (* Load the function again. *) val () = genList(loadScaledWord{dest=X0, base=X_MLStackPtr, work=X16, offset=argsToPass}, cvec) val () = genPushReg(X0, cvec) in incsp () end in (* body of genEval *) case tailKind of NotEnd => (* Normal call. *) let val () = genPopReg(X8, cvec) (* Pop the closure pointer. *) (* We need to put the first 8 arguments into registers and leave the rest on the stack. *) fun loadArg(n, reg) = if argsToPass > n then genList(loadScaledWord{dest=reg, base=X_MLStackPtr, work=X16, offset=argsToPass-n-1}, cvec) else () val () = loadArg(0, X0) val () = loadArg(1, X1) val () = loadArg(2, X2) val () = loadArg(3, X3) val () = loadArg(4, X4) val () = loadArg(5, X5) val () = loadArg(6, X6) val () = loadArg(7, X7) in if is32in64 then ( (* Can't use an indexed load because the only options for scaling are either 1 or 8. *) indexToAbsoluteAddress(X8, X9, cvec); gen(loadRegScaled{regT=X9, regN=X9, unitOffset=0}, cvec) (* Entry point *) ) else gen(loadRegScaled{regT=X9, regN=X8, unitOffset=0}, cvec); (* Entry point *) gen(branchAndLinkReg X9, cvec); (* We have popped the closure pointer. The caller has popped the stack arguments and we have pushed the result value. The register arguments are still on the stack. 
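Only the closure pointer and any stack arguments (those beyond the first eight) have actually been removed, which is why realstackptr is reduced by Int.max(argsToPass-8, 0) + 1 below; the copies of the register arguments are still counted.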
*) topInX0 := true; realstackptr := !realstackptr - Int.max(argsToPass-8, 0) - 1 (* Args popped by caller. *) end | EndOfProc => (* Tail recursive call. *) let val () = genPopReg(X8, cvec) (* Pop the closure pointer. *) val () = decsp() (* Get the return address into X30. *) val () = genList(loadScaledWord{dest=X30, base=X_MLStackPtr, work=X16, offset= !realstackptr}, cvec) (* Load the register arguments *) fun loadArg(n, reg) = if argsToPass > n then genList(loadScaledWord{dest=reg, base=X_MLStackPtr, work=X16, offset=argsToPass-n-1}, cvec) else () val () = loadArg(0, X0) val () = loadArg(1, X1) val () = loadArg(2, X2) val () = loadArg(3, X3) val () = loadArg(4, X4) val () = loadArg(5, X5) val () = loadArg(6, X6) val () = loadArg(7, X7) (* We need to move the stack arguments into the original argument area. *) (* This is the total number of words that this function is responsible for. It includes the stack arguments that the caller expects to be removed. *) val itemsOnStack = !realstackptr + 1 + numOfArgs (* Stack arguments are moved using X9. *) fun moveStackArg n = if n >= argsToPass then () else let val () = loadArg(n, X9) val destOffset = itemsOnStack - (n-8) - 1 val () = genList(storeScaledWord{store=X9, base=X_MLStackPtr, work=X16, offset=destOffset}, cvec) in moveStackArg(n+1) end val () = moveStackArg 8 in resetStack(itemsOnStack - Int.max(argsToPass-8, 0), false, cvec); if is32in64 then ( indexToAbsoluteAddress(X8, X9, cvec); gen(loadRegScaled{regT=X9, regN=X9, unitOffset=0}, cvec) (* Entry point *) ) else gen(loadRegScaled{regT=X9, regN=X8, unitOffset=0}, cvec); (* Entry point *) gen(branchRegister X9, cvec) (* Since we're not returning we can ignore the stack pointer value. *) end end (* genEval *) (* Begin generating the code for the function. *) val prefix = ref [] (* Push the arguments passed in registers. *) val () = if numOfArgs >= 8 then genPushReg (X7, prefix) else () val () = if numOfArgs >= 7 then genPushReg (X6, prefix) else () val () = if numOfArgs >= 6 then genPushReg (X5, prefix) else () val () = if numOfArgs >= 5 then genPushReg (X4, prefix) else () val () = if numOfArgs >= 4 then genPushReg (X3, prefix) else () val () = if numOfArgs >= 3 then genPushReg (X2, prefix) else () val () = if numOfArgs >= 2 then genPushReg (X1, prefix) else () val () = if numOfArgs >= 1 then genPushReg (X0, prefix) else () val () = genPushReg (X30, prefix) val () = genPushReg (X8, prefix) (* Push closure pointer *) (* Generate the function. *) (* Assume we always want a result. There is otherwise a problem if the called routine returns a result of type void (i.e. no result) but the caller wants a result (e.g. the identity function). *) val () = gencde (pt, ToX0, EndOfProc, NONE) val () = resetStack(1, false, cvec) (* Skip over the pushed closure *) val () = genPopReg(X30, cvec) (* Return address => pop into X30 *) val () = resetStack(numOfArgs, false, cvec) (* Remove the arguments *) val () = gen(returnRegister X30, cvec) (* Jump to X30 *) (* Now we know the maximum stack size we can code-gen the stack check. This needs to go in after we have saved X30. *) val () = checkStackCode(X10, !maxStack, false, prefix) val instructions = List.rev(!prefix) @ List.rev(!cvec) in (* body of codegen *) (* Having code-generated the body of the function, it is copied into a new data segment. 
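The prefix, built in reverse, holds the argument pushes and the stack check; it is prepended to the body instructions and the combined list is passed to generateCode together with resultClosure.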
*) generateCode{instrs=instructions, name=name, parameters=parameters, resultClosure=resultClosure} end (* codegen *) fun gencodeLambda(lambda as { name, body, argTypes, localCount, ...}:bicLambdaForm, parameters, closure) = - if (*false andalso*) Debug.getParameter Debug.compilerDebugTag parameters = 0 + if false andalso Debug.getParameter Debug.compilerDebugTag parameters = 0 then FallBackCG.gencodeLambda(lambda, parameters, closure) else ( codegen (body, name, closure, List.length argTypes, localCount, parameters) handle Fallback _ => FallBackCG.gencodeLambda(lambda, parameters, closure) ) structure Foreign = Arm64Foreign structure Sharing = struct open BackendTree.Sharing type closureRef = closureRef end end;