diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ASSEMBLY.sig b/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ASSEMBLY.sig index 96611c92..b7a093a6 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ASSEMBLY.sig +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ASSEMBLY.sig @@ -1,471 +1,474 @@ (* Copyright (c) 2021-2 David C. J. Matthews This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public Licence version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public Licence for more details. You should have received a copy of the GNU Lesser General Public Licence along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) signature ARM64ASSEMBLY = sig type closureRef type instr type machineWord = Address.machineWord type labels (* XZero and XSP are both encoded as 31 but the interpretation depends on the instruction. The datatype definition is included here to allow for pattern matching on XSP and XZero. *) datatype xReg = XReg of Word8.word | XZero | XSP and vReg = VReg of Word8.word val X0: xReg and X1: xReg and X2: xReg and X3: xReg and X4: xReg and X5: xReg and X6: xReg and X7: xReg and X8: xReg and X9: xReg and X10: xReg and X11: xReg and X12: xReg and X13: xReg and X14: xReg and X15: xReg and X16: xReg and X17: xReg and X18: xReg and X19: xReg and X20: xReg and X21: xReg and X22: xReg and X23: xReg and X24: xReg and X25: xReg and X26: xReg and X27: xReg and X28: xReg and X29: xReg and X30: xReg val X_MLHeapLimit: xReg (* ML Heap limit pointer *) and X_MLAssemblyInt: xReg (* ML assembly interface pointer. *) and X_MLHeapAllocPtr: xReg (* ML Heap allocation pointer. *) and X_MLStackPtr: xReg (* ML Stack pointer. *) and X_LinkReg: xReg (* Link reg - return address *) and X_Base32in64: xReg (* X24 is used for the heap base in 32-in-64. *) val V0: vReg and V1: vReg and V2: vReg and V3: vReg and V4: vReg and V5: vReg and V6: vReg and V7: vReg (* Condition for conditional branches etc. *) datatype condition = CondEqual (* Z=1 *) | CondNotEqual (* Z=0 *) | CondCarrySet (* C=1 *) | CondCarryClear (* C=0 *) | CondNegative (* N=1 *) | CondPositive (* N=0 includes zero *) | CondOverflow (* V=1 *) | CondNoOverflow (* V=0 *) | CondUnsignedHigher (* C=1 && Z=0 *) | CondUnsignedLowOrEq (* ! (C=1 && Z=0) *) | CondSignedGreaterEq (* N=V *) | CondSignedLess (* N<>V *) | CondSignedGreater (* Z==0 && N=V *) | CondSignedLessEq (* !(Z==0 && N=V) *) val invertTest: condition -> condition (* i.e. jump when the condition is not true. *) val condToString: condition -> string datatype shiftType = ShiftLSL of Word8.word | ShiftLSR of Word8.word | ShiftASR of Word8.word | ShiftNone datatype wordSize = WordSize32 | WordSize64 datatype 'a extend = ExtUXTB of 'a (* Unsigned extend byte *) | ExtUXTH of 'a (* Unsigned extend halfword *) | ExtUXTW of 'a (* Unsigned extend word *) | ExtUXTX of 'a (* Left shift *) | ExtSXTB of 'a (* Sign extend byte *) | ExtSXTH of 'a (* Sign extend halfword *) | ExtSXTW of 'a (* Sign extend word *) | ExtSXTX of 'a (* Left shift *) (* Load/store instructions have only a single bit for the shift. For byte operations this is a one-bit shift; for others it scales by the size of the operand if set. 
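For example (an illustrative sketch using declarations from later in this signature, not part of the original text): with a 64-bit load the scaled form multiplies the index by the operand size, so
    loadRegIndexed {regN=X0, regM=X1, regT=X2, option=ExtUXTX ScaleOrShift}
would load the 64-bit word at X0 + X1*8, assuming ExtUXTX with ScaleOrShift encodes the usual LSL #3 addressing form, while NoScale uses the index as a plain byte offset.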
*) datatype scale = ScaleOrShift | NoScale (* Jump to the address in the register and put the address of the next instruction into X30. *) val branchAndLinkReg: xReg -> instr (* Jump to the address in the register. *) and branchRegister: xReg -> instr (* Jump to the address in the register and hint this is a return. *) and returnRegister: xReg -> instr (* Move an address constant to a register. *) val loadAddressConstant: xReg * machineWord -> instr (* Move a constant into a register that is not an address. The argument is the actual bit pattern to be copied. For tagged integers that means that the value must have been shifted and the tag bit set. *) and loadNonAddressConstant: xReg * Word64.word -> instr and loadFloatConstant: vReg * Word64.word * xReg -> instr and loadDoubleConstant: vReg * Word64.word * xReg -> instr (* Move a value into a register. The immediate is 16 bits and the shift is 0, 16, 32, or 48. moveKeep affects only the specified 16 bits and leaves the remainder unchanged. *) val moveNot32: {regD: xReg, immediate: word, shift: word} -> instr and moveZero32: {regD: xReg, immediate: word, shift: word} -> instr and moveKeep32: {regD: xReg, immediate: word, shift: word} -> instr val moveNot: {regD: xReg, immediate: word, shift: word} -> instr and moveZero: {regD: xReg, immediate: word, shift: word} -> instr and moveKeep: {regD: xReg, immediate: word, shift: word} -> instr (* Add/subtract an optionally shifted 12-bit immediate (i.e. constant) to/from a register. The constant is zero-extended. *) val addImmediate: {regN: xReg, regD: xReg, immed: word, shifted: bool} -> instr and addSImmediate: {regN: xReg, regD: xReg, immed: word, shifted: bool} -> instr and subImmediate: {regN: xReg, regD: xReg, immed: word, shifted: bool} -> instr and subSImmediate: {regN: xReg, regD: xReg, immed: word, shifted: bool} -> instr and addImmediate32: {regN: xReg, regD: xReg, immed: word, shifted: bool} -> instr and addSImmediate32: {regN: xReg, regD: xReg, immed: word, shifted: bool} -> instr and subImmediate32: {regN: xReg, regD: xReg, immed: word, shifted: bool} -> instr and subSImmediate32: {regN: xReg, regD: xReg, immed: word, shifted: bool} -> instr (* Add/subtract a shifted register, optionally setting the flags. *) val addShiftedReg: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr and addSShiftedReg: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr and subShiftedReg: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr and subSShiftedReg: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr and addShiftedReg32: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr and addSShiftedReg32: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr and subShiftedReg32: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr and subSShiftedReg32: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr (* Add/subtract an extended register, optionally setting the flags. 
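For instance (an illustrative sketch): adding a 32-bit unsigned index, scaled by 4, to a base pointer might be written
    addExtendedReg {regM=X1, regN=X0, regD=X2, extend=ExtUXTW 0w2}
i.e. X2 := X0 + (X1 zero-extended from 32 bits) << 2, assuming the usual Rd = Rn + extend(Rm) << shift reading of these fields.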
*) val addExtendedReg: {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend} -> instr and addSExtendedReg: {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend} -> instr and subExtendedReg: {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend} -> instr and subSExtendedReg: {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend} -> instr (* Multiplication *) (* regD = regA + regN * regM *) val multiplyAndAdd: {regM: xReg, regN: xReg, regA: xReg, regD: xReg} -> instr (* regD = regA - regN * regM *) and multiplyAndSub: {regM: xReg, regN: xReg, regA: xReg, regD: xReg} -> instr (* Return the high-order part of a signed multiplication. *) and signedMultiplyHigh: {regM: xReg, regN: xReg, regD: xReg} -> instr and multiplyAndAdd32: {regM: xReg, regN: xReg, regA: xReg, regD: xReg} -> instr and multiplyAndSub32: {regM: xReg, regN: xReg, regA: xReg, regD: xReg} -> instr (* Multiply two 32-bit quantities and add/subtract a 64-bit quantity. *) and signedMultiplyAndAddLong: {regM: xReg, regN: xReg, regA: xReg, regD: xReg} -> instr and signedMultiplyAndSubLong: {regM: xReg, regN: xReg, regA: xReg, regD: xReg} -> instr (* Division *) val unsignedDivide: {regM: xReg, regN: xReg, regD: xReg} -> instr and signedDivide: {regM: xReg, regN: xReg, regD: xReg} -> instr and unsignedDivide32: {regM: xReg, regN: xReg, regD: xReg} -> instr and signedDivide32: {regM: xReg, regN: xReg, regD: xReg} -> instr (* Logical operations on a shifted register, optionally setting the flags. *) val andShiftedReg: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr and orrShiftedReg: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr and eorShiftedReg: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr and andsShiftedReg: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr and andShiftedReg32: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr and orrShiftedReg32: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr and eorShiftedReg32: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr and andsShiftedReg32: {regM: xReg, regN: xReg, regD: xReg, shift: shiftType} -> instr (* Check whether a constant can be encoded. *) val isEncodableBitPattern: Word64.word * wordSize -> bool (* Load/Store an aligned word using a 12-bit offset. The offset is in units of the size of the operand. *) val loadRegScaled: {regT: xReg, regN: xReg, unitOffset: int} -> instr and storeRegScaled: {regT: xReg, regN: xReg, unitOffset: int} -> instr and loadRegScaledByte: {regT: xReg, regN: xReg, unitOffset: int} -> instr and storeRegScaledByte: {regT: xReg, regN: xReg, unitOffset: int} -> instr and loadRegScaled16: {regT: xReg, regN: xReg, unitOffset: int} -> instr and storeRegScaled16: {regT: xReg, regN: xReg, unitOffset: int} -> instr and loadRegScaled32: {regT: xReg, regN: xReg, unitOffset: int} -> instr and storeRegScaled32: {regT: xReg, regN: xReg, unitOffset: int} -> instr and loadRegScaledDouble: {regT: vReg, regN: xReg, unitOffset: int} -> instr and storeRegScaledDouble: {regT: vReg, regN: xReg, unitOffset: int} -> instr and loadRegScaledFloat: {regT: vReg, regN: xReg, unitOffset: int} -> instr and storeRegScaledFloat: {regT: vReg, regN: xReg, unitOffset: int} -> instr (* Load/Store a value using a signed byte offset. 
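These are presumably the unscaled (LDUR/STUR-style) forms, so a negative offset is representable; for example (illustrative only)
    loadRegUnscaled {regT=X0, regN=X1, byteOffset = ~8}
would load the 64-bit word starting eight bytes below the address in X1.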
*) val loadRegUnscaled: {regT: xReg, regN: xReg, byteOffset: int} -> instr and storeRegUnscaled: {regT: xReg, regN: xReg, byteOffset: int} -> instr and loadRegUnscaledByte: {regT: xReg, regN: xReg, byteOffset: int} -> instr and loadRegUnscaledSignedByteTo64: {regT: xReg, regN: xReg, byteOffset: int} -> instr and loadRegUnscaledSignedByteTo32: {regT: xReg, regN: xReg, byteOffset: int} -> instr and storeRegUnscaledByte: {regT: xReg, regN: xReg, byteOffset: int} -> instr and loadRegUnscaled16: {regT: xReg, regN: xReg, byteOffset: int} -> instr and loadRegUnscaledSigned16To64: {regT: xReg, regN: xReg, byteOffset: int} -> instr and loadRegUnscaledSigned16To32: {regT: xReg, regN: xReg, byteOffset: int} -> instr and storeRegUnscaled16: {regT: xReg, regN: xReg, byteOffset: int} -> instr and loadRegUnscaled32: {regT: xReg, regN: xReg, byteOffset: int} -> instr and loadRegUnscaledSigned32To64: {regT: xReg, regN: xReg, byteOffset: int} -> instr and storeRegUnscaled32: {regT: xReg, regN: xReg, byteOffset: int} -> instr and loadRegUnscaledFloat: {regT: vReg, regN: xReg, byteOffset: int} -> instr and storeRegUnscaledFloat: {regT: vReg, regN: xReg, byteOffset: int} -> instr and loadRegUnscaledDouble: {regT: vReg, regN: xReg, byteOffset: int} -> instr and storeRegUnscaledDouble: {regT: vReg, regN: xReg, byteOffset: int} -> instr (* Load/store with a register offset i.e. an index register. *) val loadRegIndexed: {regN: xReg, regM: xReg, regT: xReg, option: scale extend} -> instr and storeRegIndexed: {regN: xReg, regM: xReg, regT: xReg, option: scale extend} -> instr and loadRegIndexedByte: {regN: xReg, regM: xReg, regT: xReg, option: scale extend} -> instr and storeRegIndexedByte: {regN: xReg, regM: xReg, regT: xReg, option: scale extend} -> instr and loadRegIndexed16: {regN: xReg, regM: xReg, regT: xReg, option: scale extend} -> instr and storeRegIndexed16: {regN: xReg, regM: xReg, regT: xReg, option: scale extend} -> instr and loadRegIndexed32: {regN: xReg, regM: xReg, regT: xReg, option: scale extend} -> instr and storeRegIndexed32: {regN: xReg, regM: xReg, regT: xReg, option: scale extend} -> instr and loadRegIndexedFloat: {regN: xReg, regM: xReg, regT: vReg, option: scale extend} -> instr and storeRegIndexedFloat: {regN: xReg, regM: xReg, regT: vReg, option: scale extend} -> instr and loadRegIndexedDouble: {regN: xReg, regM: xReg, regT: vReg, option: scale extend} -> instr and storeRegIndexedDouble: {regN: xReg, regM: xReg, regT: vReg, option: scale extend} -> instr (* Load/Store a value using a signed byte offset and post-indexing (post-increment). *) (* The terminology is confusing. Pre-indexing means adding the offset to the base address before loading the value, typically used for push, and post-index means using the original value of the base register as the address and adding in the offset after the value has been loaded, e.g. pop. *) val loadRegPostIndex: {regT: xReg, regN: xReg, byteOffset: int} -> instr and storeRegPostIndex: {regT: xReg, regN: xReg, byteOffset: int} -> instr and loadRegPostIndex32: {regT: xReg, regN: xReg, byteOffset: int} -> instr and storeRegPostIndex32: {regT: xReg, regN: xReg, byteOffset: int} -> instr and loadRegPostIndexByte: {regT: xReg, regN: xReg, byteOffset: int} -> instr and storeRegPostIndexByte: {regT: xReg, regN: xReg, byteOffset: int} -> instr (* Load/Store a value using a signed byte offset and pre-indexing (pre-increment). 
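A typical use is a push onto a full-descending stack; for example (an illustrative sketch)
    storeRegPreIndex {regT=X0, regN=X_MLStackPtr, byteOffset = ~8}
decrements the stack pointer by eight bytes and then stores X0, and the matching pop is loadRegPostIndex {regT=X0, regN=X_MLStackPtr, byteOffset=8}.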
*) val loadRegPreIndex: {regT: xReg, regN: xReg, byteOffset: int} -> instr and storeRegPreIndex: {regT: xReg, regN: xReg, byteOffset: int} -> instr and loadRegPreIndex32: {regT: xReg, regN: xReg, byteOffset: int} -> instr and storeRegPreIndex32: {regT: xReg, regN: xReg, byteOffset: int} -> instr and loadRegPreIndexByte: {regT: xReg, regN: xReg, byteOffset: int} -> instr and storeRegPreIndexByte: {regT: xReg, regN: xReg, byteOffset: int} -> instr (* Loads and stores with special ordering. *) val loadAcquire: {regN: xReg, regT: xReg} -> instr and storeRelease: {regN: xReg, regT: xReg} -> instr and loadAcquire32: {regN: xReg, regT: xReg} -> instr and storeRelease32: {regN: xReg, regT: xReg} -> instr and loadAcquireByte: {regN: xReg, regT: xReg} -> instr and storeReleaseByte: {regN: xReg, regT: xReg} -> instr (* Load and store pairs of registers. The offsets are signed scaled values. *) val storePairOffset: {regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int} -> instr and loadPairOffset: {regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int} -> instr and storePairPostIndexed: {regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int} -> instr and loadPairPostIndexed: {regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int} -> instr and storePairPreIndexed: {regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int} -> instr and loadPairPreIndexed: {regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int} -> instr and storePairOffset32: {regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int} -> instr and loadPairOffset32: {regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int} -> instr and storePairPostIndexed32: {regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int} -> instr and loadPairPostIndexed32: {regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int} -> instr and storePairPreIndexed32: {regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int} -> instr and loadPairPreIndexed32: {regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int} -> instr and storePairOffsetFloat: {regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int} -> instr and loadPairOffsetFloat: {regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int} -> instr and storePairPostIndexedFloat: {regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int} -> instr and loadPairPostIndexedFloat: {regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int} -> instr and storePairPreIndexedFloat: {regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int} -> instr and loadPairPreIndexedFloat: {regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int} -> instr and storePairOffsetDouble: {regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int} -> instr and loadPairOffsetDouble: {regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int} -> instr and storePairPostIndexedDouble: {regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int} -> instr and loadPairPostIndexedDouble: {regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int} -> instr and storePairPreIndexedDouble: {regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int} -> instr and loadPairPreIndexedDouble: {regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int} -> instr (* This word is put in after a call to the RTS trap-handler. All the registers are saved and restored across a call to the trap-handler; the register mask contains those that may contain an address and so need to be scanned and possibly updated if there is a GC. *) val registerMask: xReg list -> instr (* Create a label. *) val createLabel: unit -> labels (* Put a label into the code. *) val setLabel: labels -> instr (* A conditional branch. 
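Labels tie the branch instructions together; a minimal sketch (illustrative values only) of a forward conditional skip is
    val skip = createLabel ()
    val code =
        [ subSImmediate {regN=X0, regD=XZero, immed=0w1, shifted=false}, (* compare X0 with 1 *)
          conditionalBranch (CondEqual, skip),
          addImmediate {regN=X1, regD=X1, immed=0w1, shifted=false},
          setLabel skip ]
where the flag-setting subtraction into XZero acts as a compare.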
*) val conditionalBranch: condition * labels -> instr (* Unconditional branch *) and unconditionalBranch: labels -> instr (* Unconditional branch and link. Only ever goes to the start of the function. *) and branchAndLink: labels -> instr (* Put the address of a label into a register - used for handlers and cases. *) and loadLabelAddress: xReg * labels -> instr (* Test a bit in a register and branch if zero/nonzero *) and testBitBranchZero: xReg * Word8.word * labels -> instr and testBitBranchNonZero: xReg * Word8.word * labels -> instr (* Compare a register with zero and branch if zero/nonzero *) and compareBranchZero: xReg * labels -> instr and compareBranchZero32: xReg * labels -> instr and compareBranchNonZero: xReg * labels -> instr and compareBranchNonZero32: xReg * labels -> instr (* Set the destination register to the value of the first reg if the condition is true otherwise to a, possibly modified, version of the second argument. There are variants that set it unmodified, incremented, inverted and negated. *) val conditionalSet: {regD: xReg, regTrue: xReg, regFalse: xReg, cond: condition} -> instr and conditionalSetIncrement: {regD: xReg, regTrue: xReg, regFalse: xReg, cond: condition} -> instr and conditionalSetInverted: {regD: xReg, regTrue: xReg, regFalse: xReg, cond: condition} -> instr and conditionalSetNegated: {regD: xReg, regTrue: xReg, regFalse: xReg, cond: condition} -> instr and conditionalSet32: {regD: xReg, regTrue: xReg, regFalse: xReg, cond: condition} -> instr and conditionalSetIncrement32: {regD: xReg, regTrue: xReg, regFalse: xReg, cond: condition} -> instr and conditionalSetInverted32: {regD: xReg, regTrue: xReg, regFalse: xReg, cond: condition} -> instr and conditionalSetNegated32: {regD: xReg, regTrue: xReg, regFalse: xReg, cond: condition} -> instr (* General form of shift/bit extraction. *) val signedBitfieldMove32: {immr: word, imms: word, regN: xReg, regD: xReg} -> instr and bitfieldMove32: {immr: word, imms: word, regN: xReg, regD: xReg} -> instr and unsignedBitfieldMove32: {immr: word, imms: word, regN: xReg, regD: xReg} -> instr and signedBitfieldMove64: {immr: word, imms: word, regN: xReg, regD: xReg} -> instr and bitfieldMove64: {immr: word, imms: word, regN: xReg, regD: xReg} -> instr and unsignedBitfieldMove64: {immr: word, imms: word, regN: xReg, regD: xReg} -> instr (* Derived forms: Various shifts *) val logicalShiftLeft: {shift: word, regN: xReg, regD: xReg} -> instr and logicalShiftLeft32: {shift: word, regN: xReg, regD: xReg} -> instr and logicalShiftRight: {shift: word, regN: xReg, regD: xReg} -> instr and logicalShiftRight32: {shift: word, regN: xReg, regD: xReg} -> instr and arithmeticShiftRight: {shift: word, regN: xReg, regD: xReg} -> instr and arithmeticShiftRight32: {shift: word, regN: xReg, regD: xReg} -> instr (* Extract "width" least significant bits and place at offset "lsb" in the destination setting the rest of the register to zero. *) and unsignedBitfieldInsertinZeros: {lsb: word, width: word, regN: xReg, regD: xReg} -> instr and unsignedBitfieldInsertinZeros32: {lsb: word, width: word, regN: xReg, regD: xReg} -> instr (* Extract bits but leave the rest of the register unchanged. Can be used to clear a specific range of bits by using XZero as the source. 
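For example (illustrative), clearing bits 8 to 15 of X0 while leaving the rest of the register intact could be written
    bitfieldInsert {lsb=0w8, width=0w8, regN=XZero, regD=X0}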
*) and bitfieldInsert: {lsb: word, width: word, regN: xReg, regD: xReg} -> instr and bitfieldInsert32: {lsb: word, width: word, regN: xReg, regD: xReg} -> instr (* Extract "width" bits starting from "lsb" in the source and place in the least significant bits of the destination, setting the high order bits to the sign bit. *) and signedBitfieldExtract: {lsb: word, width: word, regN: xReg, regD: xReg} -> instr (* Logical shift left Rd = Rn << (Rm mod 0w64) *) val logicalShiftLeftVariable: {regM: xReg, regN: xReg, regD: xReg} -> instr (* Logical shift right Rd = Rn >> (Rm mod 0w64) *) and logicalShiftRightVariable: {regM: xReg, regN: xReg, regD: xReg} -> instr (* Arithmetic shift right Rd = Rn ~>> (Rm mod 0w64) *) and arithmeticShiftRightVariable: {regM: xReg, regN: xReg, regD: xReg} -> instr and logicalShiftLeftVariable32: {regM: xReg, regN: xReg, regD: xReg} -> instr and logicalShiftRightVariable32: {regM: xReg, regN: xReg, regD: xReg} -> instr and arithmeticShiftRightVariable32: {regM: xReg, regN: xReg, regD: xReg} -> instr (* Logical operations on bit patterns. The pattern must be valid. ANDS is an AND that also sets the flags, typically used for a test. *) val bitwiseAndImmediate: {bits: Word64.word, regN: xReg, regD: xReg} -> instr and bitwiseAndImmediate32: {bits: Word64.word, regN: xReg, regD: xReg} -> instr and bitwiseOrImmediate: {bits: Word64.word, regN: xReg, regD: xReg} -> instr and bitwiseOrImmediate32: {bits: Word64.word, regN: xReg, regD: xReg} -> instr and bitwiseXorImmediate: {bits: Word64.word, regN: xReg, regD: xReg} -> instr and bitwiseXorImmediate32: {bits: Word64.word, regN: xReg, regD: xReg} -> instr and bitwiseAndSImmediate: {bits: Word64.word, regN: xReg, regD: xReg} -> instr and bitwiseAndSImmediate32: {bits: Word64.word, regN: xReg, regD: xReg} -> instr (* Instructions involved in thread synchronisation. *) val yield: instr and dmbIsh: instr val loadAcquireExclusiveRegister: {regN: xReg, regT: xReg} -> instr val storeReleaseExclusiveRegister: {regN: xReg, regS: xReg, regT: xReg} -> instr (* Floating point moves and conversions. Moves simply copy the bits. In all cases the integer argument is a signed 64-bit value. *) val moveGeneralToDouble: {regN: xReg, regD: vReg} -> instr and moveGeneralToFloat: {regN: xReg, regD: vReg} -> instr and moveDoubleToGeneral: {regN: vReg, regD: xReg} -> instr and moveFloatToGeneral: {regN: vReg, regD: xReg} -> instr and convertIntToDouble: {regN: xReg, regD: vReg} -> instr and convertIntToFloat: {regN: xReg, regD: vReg} -> instr and convertFloatToInt: IEEEReal.rounding_mode -> {regN: vReg, regD: xReg} -> instr and convertDoubleToInt: IEEEReal.rounding_mode -> {regN: vReg, regD: xReg} -> instr and convertInt32ToDouble: {regN: xReg, regD: vReg} -> instr and convertInt32ToFloat: {regN: xReg, regD: vReg} -> instr and convertFloatToInt32: IEEEReal.rounding_mode -> {regN: vReg, regD: xReg} -> instr and convertDoubleToInt32: IEEEReal.rounding_mode -> {regN: vReg, regD: xReg} -> instr (* Floating point operations. 
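Assuming the usual Rd = Rn op Rm reading of these records (the signature itself does not say), evaluating V0 := (V0 + V1) * V2 is the two-instruction sequence
    [ addDouble {regM=V1, regN=V0, regD=V0},
      multiplyDouble {regM=V2, regN=V0, regD=V0} ]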
*) val multiplyFloat: {regM: vReg, regN: vReg, regD: vReg} -> instr and divideFloat: {regM: vReg, regN: vReg, regD: vReg} -> instr and addFloat: {regM: vReg, regN: vReg, regD: vReg} -> instr and subtractFloat: {regM: vReg, regN: vReg, regD: vReg} -> instr and multiplyDouble: {regM: vReg, regN: vReg, regD: vReg} -> instr and divideDouble: {regM: vReg, regN: vReg, regD: vReg} -> instr and addDouble: {regM: vReg, regN: vReg, regD: vReg} -> instr and subtractDouble: {regM: vReg, regN: vReg, regD: vReg} -> instr val compareFloat: {regM: vReg, regN: vReg} -> instr and compareDouble: {regM: vReg, regN: vReg} -> instr val moveFloatToFloat: {regN: vReg, regD: vReg} -> instr and absFloat: {regN: vReg, regD: vReg} -> instr and negFloat: {regN: vReg, regD: vReg} -> instr and convertFloatToDouble: {regN: vReg, regD: vReg} -> instr and moveDoubleToDouble: {regN: vReg, regD: vReg} -> instr and absDouble: {regN: vReg, regD: vReg} -> instr and negDouble: {regN: vReg, regD: vReg} -> instr and convertDoubleToFloat: {regN: vReg, regD: vReg} -> instr (* Some of the atomic operations added in 8.1 *) val loadAddAL: { regN: xReg, regT: xReg, regS: xReg } -> instr and loadUMaxAL: { regN: xReg, regT: xReg, regS: xReg } -> instr and swapAL: { regN: xReg, regT: xReg, regS: xReg } -> instr + and loadAddA: { regN: xReg, regT: xReg, regS: xReg } -> instr + and loadUMaxA: { regN: xReg, regT: xReg, regS: xReg } -> instr + and swapL: { regN: xReg, regT: xReg, regS: xReg } -> instr (* Special hack for callbacks in 32-in-64. Must appear as the first instructions in the callback. *) val loadGlobalHeapBaseInCallback: xReg -> instr list (* Create the vector of code from the list of instructions and update the closure reference to point to it. *) val generateCode: {instrs: instr list, name: string, parameters: Universal.universal list, resultClosure: closureRef, profileObject: machineWord} -> unit (* Offsets in the assembly code interface pointed at by X26. These are in units of 64 bits, NOT bytes. *) val heapOverflowCallOffset: int and stackOverflowCallOffset: int and stackOverflowXCallOffset: int and exceptionHandlerOffset: int and stackLimitOffset: int and exceptionPacketOffset: int and threadIdOffset: int and heapLimitPtrOffset: int and heapAllocPtrOffset: int and mlStackPtrOffset: int val is32in64: bool and isBigEndian: bool structure Sharing: sig type closureRef = closureRef type instr = instr type xReg = xReg type vReg = vReg type labels = labels type condition = condition type shiftType = shiftType type wordSize = wordSize type 'a extend = 'a extend type scale = scale end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODE.sig b/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODE.sig index b12465c3..eb3d9c81 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODE.sig +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODE.sig @@ -1,453 +1,453 @@ (* Signature for the high-level ARM64 code Copyright David C. J. Matthews 2021-2 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) signature ARM64ICODE = sig type machineWord = Address.machineWord type address = Address.address type closureRef (* Registers. *) datatype xReg = XReg of Word8.word | XZero | XSP and vReg = VReg of Word8.word (* It is simpler to use a single type for all registers. *) datatype reg = GenReg of xReg | FPReg of vReg val X0: xReg and X1: xReg and X2: xReg and X3: xReg and X4: xReg and X5: xReg and X6: xReg and X7: xReg and X8: xReg and X9: xReg and X10: xReg and X11: xReg and X12: xReg and X13: xReg and X14: xReg and X15: xReg and X16: xReg and X17: xReg and X18: xReg and X19: xReg and X20: xReg and X21: xReg and X22: xReg and X23: xReg and X24: xReg and X25: xReg and X26: xReg and X27: xReg and X28: xReg and X29: xReg and X30: xReg val V0: vReg and V1: vReg and V2: vReg and V3: vReg and V4: vReg and V5: vReg and V6: vReg and V7: vReg val is32in64: bool and isBigEndian: bool (* Condition for conditional branches etc. *) datatype condition = CondEqual (* Z=1 *) | CondNotEqual (* Z=0 *) | CondCarrySet (* C=1 *) | CondCarryClear (* C=0 *) | CondNegative (* N=1 *) | CondPositive (* N=0 includes zero *) | CondOverflow (* V=1 *) | CondNoOverflow (* V=0 *) | CondUnsignedHigher (* C=1 && Z=0 *) | CondUnsignedLowOrEq (* ! (C=1 && Z=0) *) | CondSignedGreaterEq (* N=V *) | CondSignedLess (* N<>V *) | CondSignedGreater (* Z==0 && N=V *) | CondSignedLessEq (* !(Z==0 && N=V) *) (* The shift used in arithmetic operations. *) and shiftType = ShiftLSL of Word8.word | ShiftLSR of Word8.word | ShiftASR of Word8.word | ShiftNone datatype preg = PReg of int (* A pseudo-register - an abstract register. *) (* If the value is zero we can use X0/W0. *) datatype pregOrZero = SomeReg of preg | ZeroReg (* A location on the stack. May be more than one word if this is a container or a handler entry. *) datatype stackLocn = StackLoc of {size: int, rno: int } (* This combines pregKind and stackLocn. *) datatype regProperty = RegPropGeneral (* A general register. *) | RegPropUntagged (* An untagged general register. *) | RegPropStack of int (* A stack location or container. *) | RegPropCacheTagged | RegPropCacheUntagged | RegPropMultiple (* The result of a conditional or case. May be defined at multiple points. *) (* The reference to a condition code. *) datatype ccRef = CcRef of int datatype loadType = Load64 | Load32 | Load16 | Load8 and opSize = OpSize32 | OpSize64 and logicalOp = LogAnd | LogOr | LogXor and callKind = Recursive | ConstantCode of machineWord | FullCall and floatSize = Float32 | Double64 and shiftDirection = ShiftLeft | ShiftRightLogical | ShiftRightArithmetic and multKind = MultAdd32 | MultSub32 | MultAdd64 | MultSub64 | SignedMultAddLong (* 32bit*32bit + 64bit => 64Bit *) | SignedMultHigh (* High order part of 64bit*64Bit *) and fpUnary = NegFloat | NegDouble | AbsFloat | AbsDouble | ConvFloatToDble | ConvDbleToFloat and fpBinary = MultiplyFP | DivideFP | AddFP | SubtractFP (* Some of the atomic operations added in 8.1 *) - and atomicOp = LoadAddAL | LoadUmaxAL | SwapAL + and atomicOp = LoadAddAL | LoadUmaxAL | SwapAL | LoadAddAcquire | LoadUMaxAcquire | SwapRelease (* Function calls can have an unlimited number of arguments so it isn't always going to be possible to load them into registers. 
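An argument is therefore described either by a register or by a stack slot; for example (illustrative values only)
    ArgInReg someArg
    ArgOnStack {wordOffset=1, container=StackLoc {size=2, rno=0}, field=0}
where someArg is a preg chosen by the allocator.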
*) datatype 'genReg fnarg = ArgInReg of 'genReg | ArgOnStack of { wordOffset: int, container: stackLocn, field: int } datatype ('genReg, 'optGenReg, 'fpReg) arm64ICode = (* Move the contents of one preg to another. These are always 64-bits. *) MoveRegister of { source: 'genReg, dest: 'genReg } (* Numerical constant. *) | LoadNonAddressConstant of { source: Word64.word, dest: 'genReg } (* Floating point constant *) | LoadFPConstant of { source: Word64.word, dest: 'fpReg, floatSize: floatSize } (* Address constant. *) | LoadAddressConstant of { source: machineWord, dest: 'genReg } (* Load a value into a register using a constant, signed, byte offset. The offset is in the range of -256 to (+4095*unit size). *) | LoadWithConstantOffset of { base: 'genReg, dest: 'genReg, byteOffset: int, loadType: loadType } (* Similarly for FP registers. *) | LoadFPWithConstantOffset of { base: 'genReg, dest: 'fpReg, byteOffset: int, floatSize: floatSize } (* Load a value into a register using an index register. *) | LoadWithIndexedOffset of { base: 'genReg, dest: 'genReg, index: 'genReg, loadType: loadType, signExtendIndex: bool } (* Ditto for FP. *) | LoadFPWithIndexedOffset of { base: 'genReg, dest: 'fpReg, index: 'genReg, floatSize: floatSize, signExtendIndex: bool } (* Returns the current thread ID. Always a 64-bit value. *) | GetThreadId of { dest: 'genReg } (* Convert a 32-in-64 object index into an absolute address. *) | ObjectIndexAddressToAbsolute of { source: 'genReg, dest: 'genReg } (* Convert an absolute address into an object index. *) | AbsoluteToObjectIndex of { source: 'genReg, dest: 'genReg } (* Allocate a fixed-size piece of memory and put the absolute address into dest. bytesRequired is the total number of bytes including the length word and any alignment necessary for 32-in-64. saveRegs is the list of registers that need to be saved if we need to do a garbage collection. *) | AllocateMemoryFixed of { bytesRequired: Word64.word, dest: 'genReg, saveRegs: 'genReg list } (* Allocate a piece of memory. The size argument is an untagged value containing the number of words i.e. the same value used for InitialiseMemory and to store in the length word. *) | AllocateMemoryVariable of { size: 'genReg, dest: 'genReg, saveRegs: 'genReg list } (* Initialise a piece of memory by writing "size" copies of the value in "init". N.B. The size is an untagged value containing the number of words. *) | InitialiseMem of { size: 'genReg, addr: 'genReg, init: 'genReg } (* Mark the beginning of a loop. This is really only to prevent the initialisation code being duplicated in ICodeOptimise. *) | BeginLoop (* Set up the registers for a jump back to the start of a loop. *) | JumpLoop of { regArgs: {src: 'genReg fnarg, dst: 'genReg} list, stackArgs: {src: 'genReg fnarg, wordOffset: int, stackloc: stackLocn} list, checkInterrupt: 'genReg list option } (* Store a register using a constant, signed, byte offset. The offset is in the range of -256 to (+4095*unit size). *) | StoreWithConstantOffset of { source: 'genReg, base: 'genReg, byteOffset: int, loadType: loadType } (* Ditto for FP regs. *) | StoreFPWithConstantOffset of { source: 'fpReg, base: 'genReg, byteOffset: int, floatSize: floatSize } (* Store a register using an index register. *) | StoreWithIndexedOffset of { source: 'genReg, base: 'genReg, index: 'genReg, loadType: loadType, signExtendIndex: bool } (* and for FP regs. 
*) | StoreFPWithIndexedOffset of { source: 'fpReg, base: 'genReg, index: 'genReg, floatSize: floatSize, signExtendIndex: bool } (* Add/Subtract immediate. The destination is optional in which case XZero is used. ccRef is optional. If it is NONE the version of the instruction that does not generate a condition code is used. immed must be < 0wx1000. *) | AddSubImmediate of { source: 'genReg, dest: 'optGenReg, ccRef: ccRef option, immed: word, isAdd: bool, length: opSize } (* Add/Subtract register. As with AddSubImmediate, both the destination and cc are optional. *) | AddSubRegister of { base: 'genReg, shifted: 'genReg, dest: 'optGenReg, ccRef: ccRef option, isAdd: bool, length: opSize, shift: shiftType } (* Bitwise logical operations. The immediate value must be a valid bit pattern. ccRef can only be SOME if logOp is LogAnd. *) | LogicalImmediate of { source: 'genReg, dest: 'optGenReg, ccRef: ccRef option, immed: Word64.word, logOp: logicalOp, length: opSize } (* Register logical operations. ccRef can only be SOME if logOp is LogAnd. *) | LogicalRegister of { base: 'genReg, shifted: 'genReg, dest: 'optGenReg, ccRef: ccRef option, logOp: logicalOp, length: opSize, shift: shiftType } (* Shift a word by an amount specified in a register. *) | ShiftRegister of { direction: shiftDirection, dest: 'genReg, source: 'genReg, shift: 'genReg, opSize: opSize } (* The various forms of multiply all take three arguments and the general form is dest = M * N +/- A. *) | Multiplication of { kind: multKind, dest: 'genReg, sourceA: 'optGenReg, sourceM: 'genReg, sourceN: 'genReg } (* Signed or unsigned division. Sets the result to zero if the divisor is zero. *) | Division of { isSigned: bool, dest: 'genReg, dividend: 'genReg, divisor: 'genReg, opSize: opSize } (* Start of function. Set the register arguments. stackArgs is the list of stack arguments. If the function has a real closure regArgs includes the closure register (X8). The register arguments include the return register (X30). *) | BeginFunction of { regArgs: ('genReg * xReg) list, stackArgs: stackLocn list } (* Call a function. If the code address is a constant it is passed here. Otherwise the address is obtained by indirecting through X8 which has been loaded as one of the argument registers. The results are stored in the result registers, usually just X0. The "containers" argument is used to ensure that any container whose address is passed as one of the other arguments continues to be referenced until the function is called since there's a possibility that it isn't actually used after the function. *) | FunctionCall of { callKind: callKind, regArgs: ('genReg fnarg * xReg) list, stackArgs: 'genReg fnarg list, dests: ('genReg * xReg) list, saveRegs: 'genReg list, containers: stackLocn list} (* Jump to a tail-recursive function. This is similar to FunctionCall but complicated for stack arguments because the stack and the return address need to be overwritten. stackAdjust is the number of words to remove (positive) or add (negative) to the stack before the call. currStackSize contains the number of items currently on the stack. *) | TailRecursiveCall of { callKind: callKind, regArgs: ('genReg fnarg * xReg) list, stackArgs: {src: 'genReg fnarg, stack: int} list, stackAdjust: int, currStackSize: int } (* Return from the function. results are the registers containing the result, returnReg is the preg that contains the return address. 
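For a function returning a single value in X0 with no stack arguments this might look like (a sketch with made-up pregs)
    ReturnResultFromFunction {results=[(resultReg, X0)], returnReg=retReg, numStackArgs=0}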
*) | ReturnResultFromFunction of { results: ('genReg * xReg) list, returnReg: 'genReg, numStackArgs: int } (* Raise an exception. The packet is always loaded into X0. *) | RaiseExceptionPacket of { packetReg: 'genReg } (* Push a register to the stack. This is used both for a normal push, copies=1, and also to reserve a container. *) | PushToStack of { source: 'genReg, copies: int, container: stackLocn } (* Load a register from the stack. The container is the stack location identifier, the field is an offset in a container. *) | LoadStack of { dest: 'genReg, wordOffset: int, container: stackLocn, field: int } (* Store a value into the stack. *) | StoreToStack of { source: 'genReg, container: stackLocn, field: int, stackOffset: int } (* Set the register to the address of the container i.e. a specific offset on the stack. *) | ContainerAddress of { dest: 'genReg, container: stackLocn, stackOffset: int } (* Remove items from the stack. Used to remove containers or registers pushed to the stack. *) | ResetStackPtr of { numWords: int } (* Tag a value by shifting and setting the tag bit. *) | TagValue of { source: 'genReg, dest: 'genReg, isSigned: bool, opSize: opSize } (* Shift a value to remove the tag bit. The cache is used if this is untagging a value that has previously been tagged. *) | UntagValue of { source: 'genReg, dest: 'genReg, isSigned: bool, opSize: opSize } (* Box a largeword value. Stores a value into a byte area. This can be implemented using AllocateMemoryFixed but keeping it separate makes optimisation easier. The result is always an address and needs to be converted to an object index on 32-in-64. *) | BoxLarge of { source: 'genReg, dest: 'genReg, saveRegs: 'genReg list } (* Load a value from a box. This can be implemented using a load but is kept separate to simplify optimisation. The source is always an absolute address. *) | UnboxLarge of { source: 'genReg, dest: 'genReg } (* Convert a floating point value into a value suitable for storing in the heap. This normally involves boxing except that 32-bit floats can be tagged in native 64-bits. *) | BoxTagFloat of { floatSize: floatSize, source: 'fpReg, dest: 'genReg, saveRegs: 'genReg list } (* The reverse of BoxTagFloat. *) | UnboxTagFloat of { floatSize: floatSize, source: 'genReg, dest: 'fpReg } (* Load a value with acquire semantics. This means that any other load in this thread after this sees the value of the shared memory at this point and not earlier. This is used for references and arrays to ensure that if another thread has built a data structure on the heap and then assigns the address to a shared ref this thread will see the updated heap and not any locally cached previous version. *) | LoadAcquire of { base: 'genReg, dest: 'genReg, loadType: loadType } (* Store a value with release semantics. This ensures that any other write completes before this operation and works with LoadAcquire. *) | StoreRelease of { base: 'genReg, source: 'genReg, loadType: loadType } (* This is a generalised constant shift which includes selection of a range of bits. *) | BitFieldShift of { source: 'genReg, dest: 'genReg, isSigned: bool, length: opSize, immr: word, imms: word } (* Copy a range of bits and insert it into another register. This is the only case where a register functions both as a source and a destination. *) | BitFieldInsert of { source: 'genReg, destAsSource: 'genReg, dest: 'genReg, length: opSize, immr: word, imms: word } (* Indexed case. *) | IndexedCaseOperation of { testReg: 'genReg } (* Exception handling. 
- Set up an exception handler. *) | PushExceptionHandler (* End of a handled section. Restore the previous handler. *) | PopExceptionHandler (* Marks the start of a handler. This sets the stack pointer and restores the old handler. Sets the exception packet register. *) | BeginHandler of { packetReg: 'genReg } (* Compare two vectors of bytes and set the condition code on the result. The registers are modified by the instruction. *) | CompareByteVectors of { vec1Addr: 'genReg, vec2Addr: 'genReg, length: 'genReg, ccRef: ccRef } (* Move a block of bytes (isByteMove true) or words (isByteMove false). The length is the number of items (bytes or words) to move. The registers are modified by the instruction. *) | BlockMove of { srcAddr: 'genReg, destAddr: 'genReg, length: 'genReg, isByteMove: bool } (* Add or subtract to the system stack pointer and optionally return the new value. This is used to allocate and deallocate C space. *) | AddSubXSP of { source: 'genReg, dest: 'optGenReg, isAdd: bool } (* Ensures the value will actually be referenced although it doesn't generate any code. *) | TouchValue of { source: 'genReg } (* Load a value at the address and get exclusive access. Always loads a 64-bit value. *) | LoadAcquireExclusive of { base: 'genReg, dest: 'genReg } (* Store a value into an address releasing the lock. Sets the result to 0 if it succeeds and to 1 if it fails. *) | StoreReleaseExclusive of { base: 'genReg, source: 'optGenReg, result: 'genReg } (* Insert a memory barrier. dmb ish. *) | MemoryBarrier (* Convert an integer to a floating point value. *) | ConvertIntToFloat of { source: 'genReg, dest: 'fpReg, srcSize: opSize, destSize: floatSize } (* Convert a floating point value to an integer using the specified rounding mode. We could get an overflow here but fortunately the ARM generates a value that will cause an overflow when we tag it, provided we tag it explicitly. *) | ConvertFloatToInt of { source: 'fpReg, dest: 'genReg, srcSize: floatSize, destSize: opSize, rounding: IEEEReal.rounding_mode } (* Unary floating point. This includes conversions between float and double. *) | UnaryFloatingPt of { source: 'fpReg, dest: 'fpReg, fpOp: fpUnary } (* Binary floating point: addition, subtraction, multiplication and division. *) | BinaryFloatingPoint of { arg1: 'fpReg, arg2: 'fpReg, dest: 'fpReg, fpOp: fpBinary, opSize: floatSize } (* Floating point comparison. *) | CompareFloatingPoint of { arg1: 'fpReg, arg2: 'fpReg, ccRef: ccRef, opSize: floatSize } (* Yield control during a spin-lock. *) | CPUYield (* Atomic operations added for ARM 8.1 *) | AtomicOperation of { base: 'genReg, source: 'optGenReg, dest: 'optGenReg, atOp: atomicOp } (* Debugging - fault if values don't match. *) | CacheCheck of { arg1: 'genReg, arg2: 'genReg } (* Destinations at the end of a basic block. *) and controlFlow = (* Unconditional branch to a label - should be a merge point. *) Unconditional of int (* Conditional branch. Jumps to trueJump if the condition is true, to falseJump if false. *) | Conditional of { ccRef: ccRef, condition: condition, trueJump: int, falseJump: int } (* Exit - the last instruction of the block is a return, raise or tailcall. *) | ExitCode (* Indexed case - this branches to one of a number of labels *) | IndexedBr of int list (* Set up a handler. This doesn't cause an immediate branch but the state at the start of the handler is the state at this point. *) | SetHandler of { handler: int, continue: int } (* Unconditional branch to a handler. 
Occurs if an exception is raised explicitly within the scope of a handler. *) | UnconditionalHandle of int (* Conditional branch to a handler. Occurs if there is a call to a function within the scope of a handler. It may jump to the handler. *) | ConditionalHandle of { handler: int, continue: int } and ('genReg, 'optGenReg, 'fpReg) basicBlock = BasicBlock of { block: ('genReg, 'optGenReg, 'fpReg) arm64ICode list, flow: controlFlow } (* Return the successor blocks from a control flow. *) val successorBlocks: controlFlow -> int list type iCodeAbstract = (preg, pregOrZero, preg) arm64ICode and basicBlockAbstract = (preg, pregOrZero, preg) basicBlock and iCodeConcrete = (xReg, xReg, vReg) arm64ICode and basicBlockConcrete = (xReg, xReg, vReg) basicBlock val printICodeAbstract: basicBlockAbstract vector * (string -> unit) -> unit and printICodeConcrete: basicBlockConcrete vector * (string -> unit) -> unit (* Check whether this value is acceptable for LogicalImmediate. *) val isEncodableBitPattern: Word64.word * opSize -> bool (* This generates a BitField instruction with the appropriate values for immr and imms. *) val shiftConstant: { direction: shiftDirection, dest: preg, source: preg, shift: word, opSize: opSize } -> iCodeAbstract structure Sharing: sig type xReg = xReg and vReg = vReg and reg = reg and condition = condition and shiftType = shiftType and ('genReg, 'optGenReg, 'fpReg) arm64ICode = ('genReg, 'optGenReg, 'fpReg) arm64ICode and preg = preg and pregOrZero = pregOrZero and controlFlow = controlFlow and ('genReg, 'optGenReg, 'fpReg) basicBlock = ('genReg, 'optGenReg, 'fpReg) basicBlock and stackLocn = stackLocn and regProperty = regProperty and ccRef = ccRef and 'genReg fnarg = 'genReg fnarg and closureRef = closureRef and loadType = loadType and opSize = opSize and logicalOp = logicalOp and callKind = callKind and floatSize = floatSize and shiftDirection = shiftDirection and multKind = multKind and fpUnary = fpUnary and fpBinary = fpBinary and atomicOp = atomicOp end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64PREASSEMBLY.sig b/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64PREASSEMBLY.sig index 4fdfdb5e..36be1203 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64PREASSEMBLY.sig +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64PREASSEMBLY.sig @@ -1,267 +1,267 @@ (* Copyright (c) 2021-2 David C. J. Matthews This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public Licence version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public Licence for more details. You should have received a copy of the GNU Lesser General Public Licence along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) (* The pre-assembly layer goes below the icode and allows peep-hole optimisation. *) signature ARM64PREASSEMBLY = sig type closureRef type machineWord = Address.machineWord (* XZero and XSP are both encoded as 31 but the interpretation depends on the instruction. The datatype definition is included here to allow for pattern matching on XSP and XZero. 
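For example, a register-name function (a sketch, not part of this signature) can distinguish the two interpretations directly:
    fun regName XZero = "xzr"
      | regName XSP = "sp"
      | regName (XReg w) = "x" ^ Int.toString (Word8.toInt w)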
*) datatype xReg = XReg of Word8.word | XZero | XSP and vReg = VReg of Word8.word val X0: xReg and X1: xReg and X2: xReg and X3: xReg and X4: xReg and X5: xReg and X6: xReg and X7: xReg and X8: xReg and X9: xReg and X10: xReg and X11: xReg and X12: xReg and X13: xReg and X14: xReg and X15: xReg and X16: xReg and X17: xReg and X18: xReg and X19: xReg and X20: xReg and X21: xReg and X22: xReg and X23: xReg and X24: xReg and X25: xReg and X26: xReg and X27: xReg and X28: xReg and X29: xReg and X30: xReg val X_MLHeapLimit: xReg (* ML Heap limit pointer *) and X_MLAssemblyInt: xReg (* ML assembly interface pointer. *) and X_MLHeapAllocPtr: xReg (* ML Heap allocation pointer. *) and X_MLStackPtr: xReg (* ML Stack pointer. *) and X_LinkReg: xReg (* Link reg - return address *) and X_Base32in64: xReg (* X24 is used for the heap base in 32-in-64. *) val V0: vReg and V1: vReg and V2: vReg and V3: vReg and V4: vReg and V5: vReg and V6: vReg and V7: vReg (* Condition for conditional branches etc. *) datatype condition = CondEqual (* Z=1 *) | CondNotEqual (* Z=0 *) | CondCarrySet (* C=1 *) | CondCarryClear (* C=0 *) | CondNegative (* N=1 *) | CondPositive (* N=0 includes zero *) | CondOverflow (* V=1 *) | CondNoOverflow (* V=0 *) | CondUnsignedHigher (* C=1 && Z=0 *) | CondUnsignedLowOrEq (* ! (C=1 && Z=0) *) | CondSignedGreaterEq (* N=V *) | CondSignedLess (* N<>V *) | CondSignedGreater (* Z==0 && N=V *) | CondSignedLessEq (* !(Z==0 && N=V) *) val invertTest: condition -> condition (* i.e. jump when the condition is not true. *) val condToString: condition -> string datatype shiftType = ShiftLSL of Word8.word | ShiftLSR of Word8.word | ShiftASR of Word8.word | ShiftNone datatype wordSize = WordSize32 | WordSize64 datatype 'a extend = ExtUXTB of 'a (* Unsigned extend byte *) | ExtUXTH of 'a (* Unsigned extend halfword *) | ExtUXTW of 'a (* Unsigned extend word *) | ExtUXTX of 'a (* Left shift *) | ExtSXTB of 'a (* Sign extend byte *) | ExtSXTH of 'a (* Sign extend halfword *) | ExtSXTW of 'a (* Sign extend word *) | ExtSXTX of 'a (* Left shift *) (* Load/store instructions have only a single bit for the shift. For byte operations this is a one-bit shift; for others it scales by the size of the operand if set. 
*) datatype scale = ScaleOrShift | NoScale datatype loadType = Load64 | Load32 | Load16 | Load8 and opSize = OpSize32 | OpSize64 and logicalOp = LogAnd | LogOr | LogXor and floatSize = Float32 | Double64 and shiftDirection = ShiftLeft | ShiftRightLogical | ShiftRightArithmetic and multKind = MultAdd32 | MultSub32 | MultAdd64 | MultSub64 | SignedMultAddLong (* 32bit*32bit + 64bit => 64Bit *) | SignedMultHigh (* High order part of 64bit*64Bit *) and fpUnary = NegFloat | NegDouble | AbsFloat | AbsDouble | ConvFloatToDble | ConvDbleToFloat and fpBinary = MultiplyFP | DivideFP | AddFP | SubtractFP and unscaledType = NoUpdate | PreIndex | PostIndex and condSet = CondSet | CondSetIncr | CondSetInvert | CondSetNegate and bitfieldKind = BFUnsigned | BFSigned | BFInsert and brRegType = BRRBranch | BRRAndLink | BRRReturn (* Some of the atomic operations added in 8.1 *) - and atomicOp = LoadAddAL | LoadUmaxAL | SwapAL + and atomicOp = LoadAddAL | LoadUmaxAL | SwapAL | LoadAddAcquire | LoadUMaxAcquire | SwapRelease type label and labelMaker val createLabelMaker: unit -> labelMaker and createLabel: labelMaker -> label datatype precode = (* Basic instructions *) AddImmediate of {regN: xReg, regD: xReg, immed: word, shifted: bool, opSize: opSize, setFlags: bool} | SubImmediate of {regN: xReg, regD: xReg, immed: word, shifted: bool, opSize: opSize, setFlags: bool} | AddShiftedReg of {regM: xReg, regN: xReg, regD: xReg, shift: shiftType, opSize: opSize, setFlags: bool} | SubShiftedReg of {regM: xReg, regN: xReg, regD: xReg, shift: shiftType, opSize: opSize, setFlags: bool} | AddExtendedReg of {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend, opSize: opSize, setFlags: bool} | SubExtendedReg of {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend, opSize: opSize, setFlags: bool} | MultiplyAndAddSub of {regM: xReg, regN: xReg, regA: xReg, regD: xReg, multKind: multKind} | DivideRegs of {regM: xReg, regN: xReg, regD: xReg, isSigned: bool, opSize: opSize} | LogicalShiftedReg of {regM: xReg, regN: xReg, regD: xReg, shift: shiftType, logOp: logicalOp, opSize: opSize, setFlags: bool} | LoadRegScaled of {regT: xReg, regN: xReg, unitOffset: int, loadType: loadType} | LoadFPRegScaled of {regT: vReg, regN: xReg, unitOffset: int, floatSize: floatSize} | StoreRegScaled of {regT: xReg, regN: xReg, unitOffset: int, loadType: loadType} | StoreFPRegScaled of {regT: vReg, regN: xReg, unitOffset: int, floatSize: floatSize} | LoadRegUnscaled of {regT: xReg, regN: xReg, byteOffset: int, loadType: loadType, unscaledType: unscaledType} | StoreRegUnscaled of {regT: xReg, regN: xReg, byteOffset: int, loadType: loadType, unscaledType: unscaledType} | LoadFPRegUnscaled of {regT: vReg, regN: xReg, byteOffset: int, floatSize: floatSize, unscaledType: unscaledType} | StoreFPRegUnscaled of {regT: vReg, regN: xReg, byteOffset: int, floatSize: floatSize, unscaledType: unscaledType} | LoadRegIndexed of {regT: xReg, regN: xReg, regM: xReg, loadType: loadType, option: scale extend} | StoreRegIndexed of {regT: xReg, regN: xReg, regM: xReg, loadType: loadType, option: scale extend} | LoadFPRegIndexed of {regT: vReg, regN: xReg, regM: xReg, floatSize: floatSize, option: scale extend} | StoreFPRegIndexed of {regT: vReg, regN: xReg, regM: xReg, floatSize: floatSize, option: scale extend} (* LoadAcquire and StoreRelease are used for mutables. 
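For instance (illustrative), publishing a new value into a mutable cell addressed by X0 and reading it back with the matching ordering would be
    StoreReleaseReg {regN=X0, regT=X1, loadType=Load64}
    LoadAcquireReg {regN=X0, regT=X2, loadType=Load64}
so a reader that sees the stored value also sees every write made before the release.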
*) | LoadAcquireReg of {regN: xReg, regT: xReg, loadType: loadType} | StoreReleaseReg of {regN: xReg, regT: xReg, loadType: loadType} (* LoadAcquireExclusiveRegister and StoreReleaseExclusiveRegister are used for mutexes. *) | LoadAcquireExclusiveRegister of {regN: xReg, regT: xReg} | StoreReleaseExclusiveRegister of {regS: xReg, regT: xReg, regN: xReg} | MemBarrier (* Additional atomic operations. *) | AtomicExtension of { regT: xReg, regN: xReg, regS: xReg, atOp: atomicOp } | LoadRegPair of { regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int, loadType: loadType, unscaledType: unscaledType} | StoreRegPair of { regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int, loadType: loadType, unscaledType: unscaledType} | LoadFPRegPair of { regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int, floatSize: floatSize, unscaledType: unscaledType} | StoreFPRegPair of { regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int, floatSize: floatSize, unscaledType: unscaledType} | ConditionalSet of {regD: xReg, regTrue: xReg, regFalse: xReg, cond: condition, condSet: condSet, opSize: opSize} | BitField of {immr: word, imms: word, regN: xReg, regD: xReg, opSize: opSize, bitfieldKind: bitfieldKind} | ShiftRegisterVariable of {regM: xReg, regN: xReg, regD: xReg, opSize: opSize, shiftDirection: shiftDirection} | BitwiseLogical of { bits: Word64.word, regN: xReg, regD: xReg, opSize: opSize, setFlags: bool, logOp: logicalOp} (* Floating point *) | MoveGeneralToFP of { regN: xReg, regD: vReg, floatSize: floatSize} | MoveFPToGeneral of {regN: vReg, regD: xReg, floatSize: floatSize} | CvtIntToFP of { regN: xReg, regD: vReg, floatSize: floatSize, opSize: opSize} | CvtFloatToInt of { round: IEEEReal.rounding_mode, regN: vReg, regD: xReg, floatSize: floatSize, opSize: opSize} | FPBinaryOp of { regM: vReg, regN: vReg, regD: vReg, floatSize: floatSize, fpOp: fpBinary} | FPComparison of { regM: vReg, regN: vReg, floatSize: floatSize} | FPUnaryOp of {regN: vReg, regD: vReg, fpOp: fpUnary} (* Branches and Labels. *) | SetLabel of label | ConditionalBranch of condition * label | UnconditionalBranch of label | BranchAndLink of label | BranchReg of {regD: xReg, brRegType: brRegType } | LoadLabelAddress of xReg * label | TestBitBranch of { test: xReg, bit: Word8.word, label: label, onZero: bool } | CompareBranch of { test: xReg, label: label, onZero: bool, opSize: opSize } (* Composite instructions *) | MoveXRegToXReg of {sReg: xReg, dReg: xReg} | LoadNonAddr of xReg * Word64.word | LoadFPConst of {dest: vReg, value: Word64.word, floatSize: floatSize, work: xReg} | LoadAddr of xReg * machineWord | RTSTrap of { rtsEntry: int, work: xReg, save: xReg list } (* Allocate memory - bytes includes the length word and rounding. *) | AllocateMemoryFixedSize of { bytes: word, dest: xReg, save: xReg list, work: xReg } (* Allocate memory - sizeReg is number of ML words needed for cell. *) | AllocateMemoryVariableSize of { sizeReg: xReg, dest: xReg, save: xReg list, work: xReg } (* Branch table for indexed case. startLabel is the address of the first label in the list. The branch table is a sequence of unconditional branches. *) | BranchTable of { startLabel: label, brTable: label list } | LoadGlobalHeapBaseInCallback of xReg | Yield (* Wrapper for BitField *) val shiftConstant: { direction: shiftDirection, regD: xReg, regN: xReg, shift: word, opSize: opSize } -> precode (* Convenient sequences. N.B. These are in reverse order. 
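That is, each function takes the code generated so far, most recent instruction first, and returns the extended list; a sketch (illustrative register choices) boxing the double in V0 is
    val revCode' = boxDouble ({source=V0, destination=X0, workReg=X1, saveRegs=[]}, revCode)
where revCode is the existing reversed instruction list.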
*) val boxDouble: {source: vReg, destination: xReg, workReg: xReg, saveRegs: xReg list} * precode list -> precode list and boxFloat: {source: vReg, destination: xReg, workReg: xReg, saveRegs: xReg list} * precode list -> precode list and boxSysWord: {source: xReg, destination: xReg, workReg: xReg, saveRegs: xReg list} * precode list -> precode list (* Create the vector of code from the list of instructions and update the closure reference to point to it. *) val generateFinalCode: {instrs: precode list, name: string, parameters: Universal.universal list, resultClosure: closureRef, profileObject: machineWord, labelMaker: labelMaker} -> unit (* Offsets in the assembly code interface pointed at by X26. These are in units of 64 bits, NOT bytes. *) val heapOverflowCallOffset: int and stackOverflowCallOffset: int and stackOverflowXCallOffset: int and exceptionHandlerOffset: int and stackLimitOffset: int and threadIdOffset: int and heapLimitPtrOffset: int and heapAllocPtrOffset: int and mlStackPtrOffset: int and exceptionPacketOffset: int val is32in64: bool and isBigEndian: bool val isEncodableBitPattern: Word64.word * wordSize -> bool structure Sharing: sig type closureRef = closureRef type loadType = loadType type opSize = opSize type logicalOp = logicalOp type floatSize = floatSize type shiftDirection = shiftDirection type multKind = multKind type fpUnary = fpUnary type fpBinary = fpBinary type unscaledType = unscaledType type condSet = condSet type bitfieldKind = bitfieldKind type brRegType = brRegType type precode = precode type xReg = xReg type vReg = vReg type label = label type labelMaker = labelMaker type condition = condition type shiftType = shiftType type wordSize = wordSize type 'a extend = 'a extend type scale = scale type atomicOp = atomicOp end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64Assembly.sml b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64Assembly.sml index 8f5231eb..9efadc38 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64Assembly.sml +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64Assembly.sml @@ -1,2729 +1,2732 @@ (* Copyright (c) 2021-2 David C. J. Matthews This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public Licence version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public Licence for more details. You should have received a copy of the GNU Lesser General Public Licence along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64Assembly ( structure Debug: DEBUG and Pretty: PRETTY and CodeArray: CODEARRAY ) : ARM64ASSEMBLY = struct open CodeArray Address val is32in64 = Address.wordSize = 0w4 val wordsPerNativeWord: word = Address.nativeWordSize div Address.wordSize local (* Almost every ARM64 platform is little-endian but it is possible to run it in big-endian mode. Instructions are always little-endian. The value of isBigEndian will be determined when the structure is constructed. That's not a problem since it will be built on the target machine. 
*) val isBigEndian: unit -> bool = RunCall.rtsCallFast1 "PolyIsBigEndian" in val isBigEndian = isBigEndian() end exception InternalError = Misc.InternalError infix 5 << <<+ <<- >> >>+ >>- ~>> ~>>+ ~>>- (* Shift operators *) infix 3 andb orb xorb andbL orbL xorbL andb8 orb8 xorb8 val op << = Word32.<< and op >> = Word32.>> and op ~>> = Word32.~>> and op andb = Word32.andb and op orb = Word32.orb val word32ToWord8 = Word8.fromLargeWord o Word32.toLargeWord and word8ToWord32 = Word32.fromLargeWord o Word8.toLargeWord and word32ToWord = Word.fromLargeWord o Word32.toLargeWord and wordToWord32 = Word32.fromLargeWord o Word.toLargeWord and word8ToWord = Word.fromLargeWord o Word8.toLargeWord (* The maximum positive number that will fit in a signed "bits" field. *) fun maxSigned bits = Word.<<(0w1, bits-0w1) - 0w1 fun willFitInRange(offset, bits) = offset <= Word.toInt(maxSigned bits) andalso offset >= ~ (Word.toInt(maxSigned bits)) - 1 (* XReg is used for fixed point registers since X0 and W0 are the same register. *) datatype xReg = XReg of Word8.word | XZero | XSP (* VReg is used for the floating point registers since V0, D0 and S0 are the same register. *) and vReg = VReg of Word8.word (* A Label is a ref that is later set to the location. Several labels can be linked together so that they are only set at a single point. Only forward jumps are linked so when we come to finally set the label we will have the full list. *) type labels = Word.word ref list ref (* Condition codes. *) (* N.B. On subtraction and comparison the ARM uses an inverted carry flag for borrow. The C flag is set if there is NO borrow. This is the reverse of the X86. *) datatype condition = CondEqual (* Z=1 *) | CondNotEqual (* Z=0 *) | CondCarrySet (* C=1 *) | CondCarryClear (* C=0 *) | CondNegative (* N=1 *) | CondPositive (* N=0 includes zero *) | CondOverflow (* V=1 *) | CondNoOverflow (* V=0 *) | CondUnsignedHigher (* C=1 && Z=0 *) | CondUnsignedLowOrEq (* ! (C=1 && Z=0) *) | CondSignedGreaterEq (* N=V *) | CondSignedLess (* N<>V *) | CondSignedGreater (* Z==0 && N=V *) | CondSignedLessEq (* !(Z==0 && N=V) *) (* The negation of a test just involves inverting the bottom bit. *) fun invertTest CondEqual = CondNotEqual | invertTest CondNotEqual = CondEqual | invertTest CondCarrySet = CondCarryClear | invertTest CondCarryClear = CondCarrySet | invertTest CondNegative = CondPositive | invertTest CondPositive = CondNegative | invertTest CondOverflow = CondNoOverflow | invertTest CondNoOverflow = CondOverflow | invertTest CondUnsignedHigher = CondUnsignedLowOrEq | invertTest CondUnsignedLowOrEq = CondUnsignedHigher | invertTest CondSignedGreaterEq = CondSignedLess | invertTest CondSignedLess = CondSignedGreaterEq | invertTest CondSignedGreater = CondSignedLessEq | invertTest CondSignedLessEq = CondSignedGreater fun condToString CondEqual = "EQ" | condToString CondNotEqual = "NE" | condToString CondCarrySet = "CS" | condToString CondCarryClear = "CC" | condToString CondNegative = "MI" | condToString CondPositive = "PL" | condToString CondOverflow = "VS" | condToString CondNoOverflow = "VC" | condToString CondUnsignedHigher = "HI" | condToString CondUnsignedLowOrEq = "LS" | condToString CondSignedGreaterEq = "GE" | condToString CondSignedLess = "LT" | condToString CondSignedGreater = "GT" | condToString CondSignedLessEq = "LE" (* Condition codes to binary encoding. 
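The encoding that follows pairs each condition with its inverse in adjacent values, so invertTest above corresponds exactly to flipping bit 0 of the encoding. A minimal check of that invariant in terms of cCode below, illustrative only:

    fun invertFlipsBit0 c = cCode(invertTest c) = Word32.xorb(cCode c, 0w1)

For example invertFlipsBit0 CondEqual holds because EQ is 0wx0 and NE is 0wx1.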
*) fun cCode CondEqual = 0wx0: Word32.word | cCode CondNotEqual = 0wx1 | cCode CondCarrySet = 0wx2 (* C=1 *) | cCode CondCarryClear = 0wx3 (* C=0 *) | cCode CondNegative = 0wx4 (* N=1 *) | cCode CondPositive = 0wx5 (* N=0 includes zero *) | cCode CondOverflow = 0wx6 (* V=1 *) | cCode CondNoOverflow = 0wx7 (* V=0 *) | cCode CondUnsignedHigher = 0wx8 (* C=1 && Z=0 *) | cCode CondUnsignedLowOrEq = 0wx9 (* ! (C=1 && Z=0) *) | cCode CondSignedGreaterEq = 0wxa (* N=V *) | cCode CondSignedLess = 0wxb (* N<>V *) | cCode CondSignedGreater = 0wxc (* Z==0 && N=V *) | cCode CondSignedLessEq = 0wxd (* !(Z==0 && N=V) *) (* Offsets in the assembly code interface pointed at by X26. These are in units of 64-bits NOT bytes. *) val heapOverflowCallOffset = 1 and stackOverflowCallOffset = 2 and stackOverflowXCallOffset= 3 and exceptionHandlerOffset = 5 and stackLimitOffset = 6 and exceptionPacketOffset = 7 and threadIdOffset = 8 and heapLimitPtrOffset = 42 and heapAllocPtrOffset = 43 and mlStackPtrOffset = 44 (* 31 in the register field can either mean the zero register or the hardware stack pointer. Which meaning depends on the instruction. *) fun xRegOrXZ(XReg w) = w | xRegOrXZ XZero = 0w31 | xRegOrXZ XSP = raise InternalError "XSP not valid here" and xRegOrXSP(XReg w) = w | xRegOrXSP XZero = raise InternalError "XZero not valid here" | xRegOrXSP XSP = 0w31 (* There are cases where it isn't clear. *) and xRegOnly (XReg w) = w | xRegOnly XZero = raise InternalError "XZero not valid here" | xRegOnly XSP = raise InternalError "XSP not valid here" val X0 = XReg 0w0 and X1 = XReg 0w1 and X2 = XReg 0w2 and X3 = XReg 0w3 and X4 = XReg 0w4 and X5 = XReg 0w5 and X6 = XReg 0w6 and X7 = XReg 0w7 and X8 = XReg 0w8 and X9 = XReg 0w9 and X10= XReg 0w10 and X11 = XReg 0w11 and X12 = XReg 0w12 and X13 = XReg 0w13 and X14= XReg 0w14 and X15 = XReg 0w15 and X16 = XReg 0w16 and X17 = XReg 0w17 and X18= XReg 0w18 and X19 = XReg 0w19 and X20 = XReg 0w20 and X21 = XReg 0w21 and X22= XReg 0w22 and X23 = XReg 0w23 and X24 = XReg 0w24 and X25 = XReg 0w25 and X26= XReg 0w26 and X27 = XReg 0w27 and X28 = XReg 0w28 and X29 = XReg 0w29 and X30= XReg 0w30 val X_MLHeapLimit = X25 (* ML Heap limit pointer *) and X_MLAssemblyInt = X26 (* ML assembly interface pointer. *) and X_MLHeapAllocPtr = X27 (* ML Heap allocation pointer. *) and X_MLStackPtr = X28 (* ML Stack pointer. *) and X_LinkReg = X30 (* Link reg - return address *) and X_Base32in64 = X24 (* X24 is used for the heap base in 32-in-64. *) fun vReg(VReg v) = v (* Only the first eight registers are currently used by ML. *) val V0 = VReg 0w0 and V1 = VReg 0w1 and V2 = VReg 0w2 and V3 = VReg 0w3 and V4 = VReg 0w4 and V5 = VReg 0w5 and V6 = VReg 0w6 and V7 = VReg 0w7 (* Some data instructions include a possible shift. *) datatype shiftType = ShiftLSL of Word8.word | ShiftLSR of Word8.word | ShiftASR of Word8.word | ShiftNone local fun checkImm6 w = if w > 0w63 then raise InternalError "shift > 63" else w in fun shiftEncode(ShiftLSL w) = (0w0, checkImm6 w) | shiftEncode(ShiftLSR w) = (0w1, checkImm6 w) | shiftEncode(ShiftASR w) = (0w2, checkImm6 w) | shiftEncode ShiftNone = (0w0, 0w0) end (* Other instructions include an extension i.e. a sign- or zero-extended value from one of the argument registers. When an extension is encoded there can also be a left shift which applies after the extension. I don't understand what difference, if any, there is between UXTX and SXTX. 
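As an illustration of how these extensions are used by the arithmetic instructions later in this file: an unsigned 32-bit index held in W1 can be zero-extended, scaled by 8 and added to a base address in X0 in a single instruction, add x2, x0, w1, uxtw #3, which in terms of this interface is, as illustrative usage only:

    addExtendedReg{regM=X1, regN=X0, regD=X2, extend=ExtUXTW 0w3}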
There's no ExtNone because we need to use either UXTW or UXTX depending on the length. *) datatype 'a extend = ExtUXTB of 'a (* Unsigned extend byte *) | ExtUXTH of 'a (* Unsigned extend halfword *) | ExtUXTW of 'a (* Unsigned extend word *) | ExtUXTX of 'a (* Left shift *) | ExtSXTB of 'a (* Sign extend byte *) | ExtSXTH of 'a (* Sign extend halfword *) | ExtSXTW of 'a (* Sign extend word *) | ExtSXTX of 'a (* Left shift *) (* Load/store instructions have only a single bit for the shift. For byte operations this is one bit shift; for others it scales by the size of the operand if set. *) datatype scale = ScaleOrShift | NoScale local (* Although there are three bits it seems that the shift is limited to 0 to 4. *) fun checkImm3 w = if w > 0w4 then raise InternalError "extend shift > 4" else w in fun extendArithEncode(ExtUXTB w) = (0w0, checkImm3 w) | extendArithEncode(ExtUXTH w) = (0w1, checkImm3 w) | extendArithEncode(ExtUXTW w) = (0w2, checkImm3 w) | extendArithEncode(ExtUXTX w) = (0w3, checkImm3 w) | extendArithEncode(ExtSXTB w) = (0w4, checkImm3 w) | extendArithEncode(ExtSXTH w) = (0w5, checkImm3 w) | extendArithEncode(ExtSXTW w) = (0w6, checkImm3 w) | extendArithEncode(ExtSXTX w) = (0w7, checkImm3 w) fun extendLSEncode(ExtUXTB v) = (0w0, v) | extendLSEncode(ExtUXTH v) = (0w1, v) | extendLSEncode(ExtUXTW v) = (0w2, v) | extendLSEncode(ExtUXTX v) = (0w3, v) | extendLSEncode(ExtSXTB v) = (0w4, v) | extendLSEncode(ExtSXTH v) = (0w5, v) | extendLSEncode(ExtSXTW v) = (0w6, v) | extendLSEncode(ExtSXTX v) = (0w7, v) end datatype wordSize = WordSize32 | WordSize64 (* Bit patterns on the ARM64 are encoded using a complicated scheme and only certain values can be encoded. An element can be 2, 4, 8, 16, 32 or 64 bits and must be a sequence of at least one zero bit followed by at least one one bit. This sequence can then be rotated within the element. Finally the element is replicated within the register up to 32 or 64 bits. All this information is encoded in 13 bits. N.B. Bit patterns of all zeros or all ones cannot be encoded. *) (* Encode the value if it is possible. *) fun encodeBitPattern(value, sf (* size flag *)) = (* Can't encode 0 or all ones. *) if value = 0w0 orelse value = Word64.notb 0w0 then NONE (* If this is 32-bits we can't encode all ones in the low-order 32-bits or any value that won't fit in 32-bits. *) else if sf = WordSize32 andalso value >= 0wxffffffff then NONE else let val regSize = case sf of WordSize32 => 0w32 | WordSize64 => 0w64 (* Get the element size. Look for the repeat of the pattern. *) fun getElemSize size = let val ns = size div 0w2 val mask = Word64.<<(0w1, ns) - 0w1 in if Word64.andb(value, mask) <> Word64.andb(Word64.>>(value, ns), mask) then size else if ns <= 0w2 then ns else getElemSize ns end val elemSize = getElemSize regSize fun log2 0w1 = 0w0 | log2 n = 0w1 + log2(Word.>>(n, 0w1)) val elemBits = log2 elemSize (* Find the rotation that puts as many of the zero bits in the element at the top. *) val elemMask = Word64.>>(Word64.notb 0w0, 0w64-elemSize) fun ror elt = Word64.orb((Word64.<<(Word64.andb(elt, 0w1), elemSize-0w1), Word64.>>(elt, 0w1))) and rol elt = Word64.orb(Word64.andb(elemMask, Word64.<<(elt, 0w1)), Word64.>>(elt, elemSize-0w1)) fun findRotation(v, n) = if ror v < v then findRotation(ror v, (n-0w1) mod elemSize) else if rol v < v then findRotation(rol v, n+0w1) else (v, n) val (rotated, rotation) = findRotation(Word64.andb(value, elemMask), 0w0) (* Count out the low order ones. 
If the result is zero then we've got a valid sequence of zeros followed by ones, but if we discover a zero bit and the result isn't zero then we can't encode this. *) fun countLowOrderOnes(v, n) = if v = 0w0 then SOME n else if Word64.andb(v, 0w1) = 0w1 then countLowOrderOnes(Word64.>>(v, 0w1), n+0w1) else NONE in case countLowOrderOnes(rotated, 0w0) of NONE => NONE | SOME lowOrderOnes => let (* Encode the element size. *) val elemSizeEnc = 0wx7f - (Word.<<(0w1, elemBits+0w1) - 0w1) val n = if Word.andb(elemSizeEnc, 0wx40) = 0w0 then 0w1 else 0w0 val imms = Word.andb(Word.orb(elemSizeEnc, lowOrderOnes-0w1), 0wx3f) in SOME{n=n, imms=imms, immr=rotation} end end; (* Decode a pattern for printing. *) fun decodeBitPattern{sf, n, immr, imms} = let (* Find the highest bit set in N:NOT(imms) *) fun highestBitSet 0w0 = 0 | highestBitSet n = 1+highestBitSet(Word32.>>(n, 0w1)) val len = highestBitSet(Word32.orb(Word32.<<(n, 0w6), Word32.xorb(imms, 0wx3f))) - 1 val _ = if len < 0 then raise InternalError "decodeBitPattern: invalid" else () val size = Word32.<<(0w1, Word.fromInt len) val r = Word32.andb(immr, size-0w1) and s = Word32.andb(imms, size-0w1) val _ = if s = size-0w1 then raise InternalError "decodeBitPattern: invalid" else () val pattern = Word64.<<(0w1, word32ToWord(s+0w1)) - 0w1 (* Rotate right: shift right and put the bottom bit into the high order bit. *) fun ror elt = Word64.orb((Word64.<<(Word64.andb(elt, 0w1), word32ToWord(size-0w1)), Word64.>>(elt, 0w1))) fun rotateBits(value, 0w0) = value | rotateBits(value, n) = rotateBits(ror value, n-0w1) val rotated = rotateBits(pattern, r) val regSize = if sf = 0w0 then 0w32 else 0w64 (* Replicate the rotated pattern to fill the register. *) fun replicate(pattern, size) = if size >= regSize then pattern else replicate(Word64.orb(pattern, Word64.<<(pattern, word32ToWord size)), size * 0w2) in replicate(rotated, size) end val isEncodableBitPattern = isSome o encodeBitPattern datatype instr = SimpleInstr of Word32.word | LoadAddressLiteral of {reg: xReg, value: machineWord, length: brLength ref} | LoadNonAddressLiteral of {reg: xReg, value: Word64.word, length: brLength ref} | LoadFPLiteral of {reg: vReg, value: Word64.word, length: brLength ref, isDouble: bool, work: xReg} | Label of labels | UnconditionalBranch of {label: labels, andLink: bool} | ConditionalBranch of { label: labels, jumpCondition: condition, length: brLength ref } | LoadLabelAddress of { label: labels, reg: xReg, length: brLength ref } | TestBitBranch of { label: labels, bitNo: Word8.word, brNonZero: bool, reg: xReg, length: brLength ref } | CompareBranch of { label: labels, brNonZero: bool, size: wordSize, reg: xReg, length: brLength ref } and brLength = BrShort | BrExtended val nopCode = 0wxD503201F and undefCode = 0wx00000000 (* Permanently undefined instruction. *) (* Add/subtract an optionally shifted 12-bit immediate (i.e. constant) to/from a register. The constant is zero-extended. The versions that do not set the flags can use XSP as the destination; the versions that set the flags can use XZero as the destination i.e. they discard the result and act as a comparison. 
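For example a comparison such as cmp x1, #42 is just a flag-setting subtraction that discards its result, i.e. in terms of the functions below, as illustrative usage only:

    subSImmediate{regN=X1, regD=XZero, immed=0w42, shifted=false}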
*) local fun addSubRegImmediate(sf, oper, s, xdOp) ({regN, regD, immed, shifted}) = let val () = if immed >= 0wx1000 then raise InternalError "addSubRegImmediate: immed > 12 bits" else () in SimpleInstr( 0wx11000000 orb (sf << 0w31) orb (oper << 0w30) orb (s << 0w29) orb (if shifted then 0wx400000 else 0w0) orb (wordToWord32 immed << 0w10) orb (word8ToWord32(xRegOrXSP regN) << 0w5) orb word8ToWord32(xdOp regD)) end in val addImmediate = addSubRegImmediate(0w1, 0w0, 0w0, xRegOrXSP) and addSImmediate = addSubRegImmediate(0w1, 0w0, 0w1, xRegOrXZ) and subImmediate = addSubRegImmediate(0w1, 0w1, 0w0, xRegOrXSP) and subSImmediate = addSubRegImmediate(0w1, 0w1, 0w1, xRegOrXZ) and addImmediate32 = addSubRegImmediate(0w0, 0w0, 0w0, xRegOrXSP) and addSImmediate32 = addSubRegImmediate(0w0, 0w0, 0w1, xRegOrXZ) and subImmediate32 = addSubRegImmediate(0w0, 0w1, 0w0, xRegOrXSP) and subSImmediate32 = addSubRegImmediate(0w0, 0w1, 0w1, xRegOrXZ) end (* Add/subtract a shifted register, optionally setting the flags. *) local (* X31 is XZ here unlike the extended version.*) fun addSubtractShiftedReg (sf, oper, s) ({regM, regN, regD, shift}) = let val (shift, imm6) = shiftEncode shift in SimpleInstr(0wx0b000000 orb (sf << 0w31) orb (oper << 0w30) orb (s << 0w29) orb (shift << 0w22) orb (word8ToWord32(xRegOnly regM) << 0w16) orb (word8ToWord32 imm6 << 0w10) orb (word8ToWord32(xRegOrXZ regN) << 0w5) orb word8ToWord32(xRegOrXZ regD)) end in val addShiftedReg = addSubtractShiftedReg(0w1, 0w0, 0w0) and addSShiftedReg = addSubtractShiftedReg(0w1, 0w0, 0w1) and subShiftedReg = addSubtractShiftedReg(0w1, 0w1, 0w0) and subSShiftedReg = addSubtractShiftedReg(0w1, 0w1, 0w1) and addShiftedReg32 = addSubtractShiftedReg(0w0, 0w0, 0w0) and addSShiftedReg32 = addSubtractShiftedReg(0w0, 0w0, 0w1) and subShiftedReg32 = addSubtractShiftedReg(0w0, 0w1, 0w0) and subSShiftedReg32 = addSubtractShiftedReg(0w0, 0w1, 0w1) end (* Add/subtract an extended register, optionally setting the flags. *) local (* SP can be used as Xn and also for Xd for the non-flags versions. *) fun addSubtractExtendedReg (sf, oper, s, opt, xD) ({regM, regN, regD, extend}) = let val (option, imm3) = extendArithEncode extend in SimpleInstr(0wx0b200000 orb (sf << 0w31) orb (oper << 0w30) orb (s << 0w29) orb (opt << 0w22) orb (word8ToWord32(xRegOnly regM) << 0w16) orb (option << 0w13) orb (word8ToWord32 imm3 << 0w10) orb (word8ToWord32(xRegOrXSP regN) << 0w5) orb word8ToWord32(xD regD)) end in val addExtendedReg = addSubtractExtendedReg(0w1, 0w0, 0w0, 0w0, xRegOrXSP) and addSExtendedReg = addSubtractExtendedReg(0w1, 0w0, 0w1, 0w0, xRegOrXZ) and subExtendedReg = addSubtractExtendedReg(0w1, 0w1, 0w0, 0w0, xRegOrXSP) and subSExtendedReg = addSubtractExtendedReg(0w1, 0w1, 0w1, 0w0, xRegOrXZ) end (* Logical operations on a shifted register. 
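For example orr x0, x1, x2, lsl #3 would be written with the functions below as, illustrative usage only:

    orrShiftedReg{regN=X1, regM=X2, regD=X0, shift=ShiftLSL 0w3}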
*) local fun logicalShiftedReg (sf, oper, n) ({regM, regN, regD, shift}) = let val (shift, imm6) = shiftEncode shift in SimpleInstr(0wx0a000000 orb (sf << 0w31) orb (oper << 0w29) orb (shift << 0w22) orb (n << 0w21) orb (word8ToWord32(xRegOrXZ regM) << 0w16) orb (word8ToWord32 imm6 << 0w10) orb (word8ToWord32(xRegOrXZ regN) << 0w5) orb word8ToWord32(xRegOrXZ regD)) end in val andShiftedReg = logicalShiftedReg(0w1, 0w0, 0w0) and orrShiftedReg = logicalShiftedReg(0w1, 0w1, 0w0) and eorShiftedReg = logicalShiftedReg(0w1, 0w2, 0w0) and andsShiftedReg = logicalShiftedReg(0w1, 0w3, 0w0) val andShiftedReg32 = logicalShiftedReg(0w0, 0w0, 0w0) and orrShiftedReg32 = logicalShiftedReg(0w0, 0w1, 0w0) and eorShiftedReg32 = logicalShiftedReg(0w0, 0w2, 0w0) and andsShiftedReg32 = logicalShiftedReg(0w0, 0w3, 0w0) (* There are also versions that operate with an inverted version of the argument. *) end (* Two-source operations. *) local fun twoSourceInstr (sf, s, opcode) ({regM, regN, regD}) = SimpleInstr(0wx1ac00000 orb (sf << 0w31) orb (s << 0w29) orb (word8ToWord32(xRegOnly regM) << 0w16) orb (opcode << 0w10) orb (word8ToWord32(xRegOnly regN) << 0w5) orb word8ToWord32(xRegOnly regD)) in (* Signed and unsigned division. *) val unsignedDivide = twoSourceInstr(0w1, 0w0, 0wx2) and signedDivide = twoSourceInstr(0w1, 0w0, 0wx3) and unsignedDivide32 = twoSourceInstr(0w0, 0w0, 0wx2) and signedDivide32 = twoSourceInstr(0w0, 0w0, 0wx3) (* Logical shift left Rd = Rn << (Rm mod 0w64) *) and logicalShiftLeftVariable = twoSourceInstr(0w1, 0w0, 0wx8) (* Logical shift right Rd = Rn >> (Rm mod 0w64) *) and logicalShiftRightVariable = twoSourceInstr(0w1, 0w0, 0wx9) (* Arithmetic shift right Rd = Rn ~>> (Rm mod 0w64) *) and arithmeticShiftRightVariable = twoSourceInstr(0w1, 0w0, 0wxa) and logicalShiftLeftVariable32 = twoSourceInstr(0w0, 0w0, 0wx8) and logicalShiftRightVariable32 = twoSourceInstr(0w0, 0w0, 0wx9) and arithmeticShiftRightVariable32 = twoSourceInstr(0w0, 0w0, 0wxa) end (* Three source operations. These are all variations of multiply. *) local fun threeSourceInstr (sf, op54, op31, o0) ({regM, regA, regN, regD}) = SimpleInstr(0wx1b000000 orb (sf << 0w31) orb (op54 << 0w29) orb (op31 << 0w21) orb (word8ToWord32(xRegOnly regM) << 0w16) orb (o0 << 0w15) orb (word8ToWord32(xRegOrXZ regA) << 0w10) orb (word8ToWord32(xRegOnly regN) << 0w5) orb word8ToWord32(xRegOnly regD)) in (* regD = regA + regN * regM *) val multiplyAndAdd = threeSourceInstr(0w1, 0w0, 0w0, 0w0) (* regD = regA - regN * regM *) and multiplyAndSub = threeSourceInstr(0w1, 0w0, 0w0, 0w1) and multiplyAndAdd32 = threeSourceInstr(0w0, 0w0, 0w0, 0w0) and multiplyAndSub32 = threeSourceInstr(0w0, 0w0, 0w0, 0w1) (* Multiply two 32-bit quantities and add/subtract a 64-bit quantity. *) and signedMultiplyAndAddLong = threeSourceInstr(0w1, 0w0, 0w1, 0w0) and signedMultiplyAndSubLong = threeSourceInstr(0w1, 0w0, 0w1, 0w1) (* Return the high-order part of a signed multiplication. *) fun signedMultiplyHigh({regM, regN, regD}) = threeSourceInstr(0w1, 0w0, 0w2, 0w0) { regM=regM, regN=regN, regD=regD, regA=XZero} end (* Loads: There are two versions of this on the ARM. There is a version that takes a signed 9-bit byte offset and a version that takes an unsigned 12-bit word offset. 
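For example, the scaled form takes its offset in units of the operand size, so ldr x0, [x1, #16] is loadRegScaled{regT=X0, regN=X1, unitOffset=2}, whereas the unscaled form takes the offset in bytes, so the same load is loadRegUnscaled{regT=X0, regN=X1, byteOffset=16}. Illustrative usage only.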
*) local fun loadStoreRegScaled (size, v, opc, xD) ({regT, regN, unitOffset}) = let val _ = (unitOffset >= 0 andalso unitOffset < 0x1000) orelse raise InternalError "loadStoreRegScaled: value out of range" in SimpleInstr(0wx39000000 orb (size << 0w30) orb (opc << 0w22) orb (v << 0w26) orb (Word32.fromInt unitOffset << 0w10) orb (word8ToWord32(xRegOrXSP regN) << 0w5) orb word8ToWord32(xD regT)) end in val loadRegScaled = loadStoreRegScaled(0w3, 0w0, 0w1, xRegOrXZ) and storeRegScaled = loadStoreRegScaled(0w3, 0w0, 0w0, xRegOrXZ) (* (Unsigned) byte operations. There are also signed versions. *) and loadRegScaledByte = loadStoreRegScaled (0w0, 0w0, 0w1, xRegOrXZ) and storeRegScaledByte = loadStoreRegScaled (0w0, 0w0, 0w0, xRegOrXZ) and loadRegScaled16 = loadStoreRegScaled (0w1, 0w0, 0w1, xRegOrXZ) and storeRegScaled16 = loadStoreRegScaled (0w1, 0w0, 0w0, xRegOrXZ) and loadRegScaled32 = loadStoreRegScaled (0w2, 0w0, 0w1, xRegOrXZ) and storeRegScaled32 = loadStoreRegScaled (0w2, 0w0, 0w0, xRegOrXZ) and loadRegScaledDouble = loadStoreRegScaled(0w3, 0w1, 0w1, vReg) and storeRegScaledDouble = loadStoreRegScaled(0w3, 0w1, 0w0, vReg) and loadRegScaledFloat = loadStoreRegScaled(0w2, 0w1, 0w1, vReg) and storeRegScaledFloat = loadStoreRegScaled(0w2, 0w1, 0w0, vReg) end local (* Loads and stores with a signed byte offset. This includes simple unscaled addresses, pre-indexing and post-indexing. *) fun loadStoreByteAddress (op4, xD) (size, v, opc) ({regT, regN, byteOffset}) = let val _ = (byteOffset >= ~256 andalso byteOffset < 256) orelse raise InternalError "loadStoreUnscaled: value out of range" val imm9 = Word32.fromInt byteOffset andb 0wx1ff in SimpleInstr(0wx38000000 orb (size << 0w30) orb (opc << 0w22) orb (v << 0w26) orb (imm9 << 0w12) orb (op4 << 0w10) orb (word8ToWord32(xRegOrXSP regN) << 0w5) orb word8ToWord32(xD regT)) end val loadStoreUnscaled = loadStoreByteAddress (0w0, xRegOrXZ) and loadStoreUnscaledSIMD = loadStoreByteAddress (0w0, vReg) and loadStorePostIndex = loadStoreByteAddress (0w1, xRegOrXZ) and loadStorePreIndex = loadStoreByteAddress (0w3, xRegOrXZ) in val loadRegUnscaled = loadStoreUnscaled (0w3, 0w0, 0w1) and storeRegUnscaled = loadStoreUnscaled (0w3, 0w0, 0w0) (* (Unsigned) byte operations. There are also signed versions. 
*) and loadRegUnscaledByte = loadStoreUnscaled (0w0, 0w0, 0w1) and loadRegUnscaledSignedByteTo64 = loadStoreUnscaled (0w0, 0w0, 0w2) and loadRegUnscaledSignedByteTo32 = loadStoreUnscaled (0w0, 0w0, 0w3) and storeRegUnscaledByte = loadStoreUnscaled (0w0, 0w0, 0w0) and loadRegUnscaled16 = loadStoreUnscaled (0w1, 0w0, 0w1) and loadRegUnscaledSigned16To64 = loadStoreUnscaled (0w1, 0w0, 0w2) and loadRegUnscaledSigned16To32 = loadStoreUnscaled (0w1, 0w0, 0w3) and storeRegUnscaled16 = loadStoreUnscaled (0w1, 0w0, 0w0) and loadRegUnscaled32 = loadStoreUnscaled (0w2, 0w0, 0w1) and loadRegUnscaledSigned32To64 = loadStoreUnscaled (0w2, 0w0, 0w2) and storeRegUnscaled32 = loadStoreUnscaled (0w2, 0w0, 0w0) and loadRegUnscaledFloat = loadStoreUnscaledSIMD (0w2, 0w1, 0w1) and storeRegUnscaledFloat = loadStoreUnscaledSIMD (0w2, 0w1, 0w0) and loadRegUnscaledDouble = loadStoreUnscaledSIMD (0w3, 0w1, 0w1) and storeRegUnscaledDouble = loadStoreUnscaledSIMD (0w3, 0w1, 0w0) val loadRegPostIndex = loadStorePostIndex (0w3, 0w0, 0w1) and storeRegPostIndex = loadStorePostIndex (0w3, 0w0, 0w0) and loadRegPostIndex32 = loadStorePostIndex (0w2, 0w0, 0w1) and storeRegPostIndex32 = loadStorePostIndex (0w2, 0w0, 0w0) and loadRegPostIndexByte = loadStorePostIndex (0w0, 0w0, 0w1) and storeRegPostIndexByte = loadStorePostIndex (0w0, 0w0, 0w0) val loadRegPreIndex = loadStorePreIndex (0w3, 0w0, 0w1) and storeRegPreIndex = loadStorePreIndex (0w3, 0w0, 0w0) and loadRegPreIndex32 = loadStorePreIndex (0w2, 0w0, 0w1) and storeRegPreIndex32 = loadStorePreIndex (0w2, 0w0, 0w0) and loadRegPreIndexByte = loadStorePreIndex (0w0, 0w0, 0w1) and storeRegPreIndexByte = loadStorePreIndex (0w0, 0w0, 0w0) end (* Load/store with a register offset i.e. an index register. *) local fun loadStoreRegRegisterOffset (size, v, opc, xD) ({regT, regN, regM, option}) = let val (opt, s) = case extendLSEncode option of (opt, ScaleOrShift) => (opt, 0w1) | (opt, NoScale) => (opt, 0w0) in SimpleInstr(0wx38200800 orb (size << 0w30) orb (v << 0w26) orb (opc << 0w22) orb (word8ToWord32(xRegOnly regM) << 0w16) orb (opt << 0w13) orb (s << 0w12) orb (word8ToWord32(xRegOrXSP regN) << 0w5) orb word8ToWord32(xD regT)) end in val loadRegIndexed = loadStoreRegRegisterOffset(0w3, 0w0, 0w1, xRegOrXZ) and storeRegIndexed = loadStoreRegRegisterOffset(0w3, 0w0, 0w0, xRegOrXZ) and loadRegIndexedByte = loadStoreRegRegisterOffset(0w0, 0w0, 0w1, xRegOrXZ) and storeRegIndexedByte = loadStoreRegRegisterOffset(0w0, 0w0, 0w0, xRegOrXZ) and loadRegIndexed16 = loadStoreRegRegisterOffset(0w1, 0w0, 0w1, xRegOrXZ) and storeRegIndexed16 = loadStoreRegRegisterOffset(0w1, 0w0, 0w0, xRegOrXZ) and loadRegIndexed32 = loadStoreRegRegisterOffset(0w2, 0w0, 0w1, xRegOrXZ) and storeRegIndexed32 = loadStoreRegRegisterOffset(0w2, 0w0, 0w0, xRegOrXZ) and loadRegIndexedFloat = loadStoreRegRegisterOffset(0w2, 0w1, 0w1, vReg) and storeRegIndexedFloat = loadStoreRegRegisterOffset(0w2, 0w1, 0w0, vReg) and loadRegIndexedDouble = loadStoreRegRegisterOffset(0w3, 0w1, 0w1, vReg) and storeRegIndexedDouble = loadStoreRegRegisterOffset(0w3, 0w1, 0w0, vReg) end local (* Loads and stores with special ordering. 
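The typical mutex-acquire sequence built from the exclusive pair defined below is a retry loop; a sketch in terms of this interface, illustrative only since a real lock also needs a barrier and a contention path:

    val retry = createLabel()
    val acquireLoop =
        [setLabel retry,
         loadAcquireExclusiveRegister{regN=X0, regT=X1},
         storeReleaseExclusiveRegister{regN=X0, regS=X2, regT=X3},
         compareBranchNonZero32(X2, retry)]

loadAcquireExclusiveRegister is ldaxr x1,[x0]; storeReleaseExclusiveRegister is stlxr w2,x3,[x0] and sets X2 to 0 only if the exclusive store succeeded, so the final compare-branch retries on failure.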
*) fun loadStoreExclusive(size, o2, l, o1, o0) {regS, regT2, regN, regT} = SimpleInstr(0wx08000000 orb (size << 0w30) orb (o2 << 0w23) orb (l << 0w22) orb (o1 << 0w21) orb (word8ToWord32(xRegOrXZ regS) << 0w16) orb (o0 << 0w15) orb (word8ToWord32(xRegOrXZ regT2) << 0w10) orb (word8ToWord32(xRegOrXSP regN) << 0w5) orb word8ToWord32(xRegOrXZ regT)) in fun loadAcquire{regN, regT} = loadStoreExclusive(0w3, 0w1, 0w1, 0w0, 0w1) {regS=XZero, regT2=XZero, regN=regN, regT=regT} and storeRelease{regN, regT} = loadStoreExclusive(0w3, 0w1, 0w0, 0w0, 0w1) {regS=XZero, regT2=XZero, regN=regN, regT=regT} and loadAcquire32{regN, regT} = loadStoreExclusive(0w2, 0w1, 0w1, 0w0, 0w1) {regS=XZero, regT2=XZero, regN=regN, regT=regT} and storeRelease32{regN, regT} = loadStoreExclusive(0w2, 0w1, 0w0, 0w0, 0w1) {regS=XZero, regT2=XZero, regN=regN, regT=regT} and loadAcquireByte{regN, regT} = loadStoreExclusive(0w0, 0w1, 0w1, 0w0, 0w1) {regS=XZero, regT2=XZero, regN=regN, regT=regT} and storeReleaseByte{regN, regT} = loadStoreExclusive(0w0, 0w1, 0w0, 0w0, 0w1) {regS=XZero, regT2=XZero, regN=regN, regT=regT} (* Acquire exclusive access to a memory location and load its current value *) and loadAcquireExclusiveRegister{regN, regT} = loadStoreExclusive(0w3, 0w0, 0w1, 0w0, 0w1) {regS=XZero, regT2=XZero, regN=regN, regT=regT} (* Release exclusive access and test whether it succeeded. Sets regS to 0 if successful otherwise 1, in which case we have to repeat the operation. *) and storeReleaseExclusiveRegister{regN, regS, regT} = loadStoreExclusive(0w3, 0w0, 0w0, 0w0, 0w1) {regS=regS, regT2=XZero, regN=regN, regT=regT} end local (* Load and store pairs. The offsets are signed scaled values. *) fun loadStorePair op2 (opc, v, l, rT) {regT1, regT2, regN, unitOffset} = let val _ = (unitOffset >= ~64 andalso unitOffset < 64) orelse raise InternalError "loadStorePair: value out of range" val imm7 = Word32.fromInt unitOffset andb 0wx7f in SimpleInstr(0wx28000000 orb (opc << 0w30) orb (v << 0w26) orb (op2 << 0w23) orb (l << 0w22) orb (imm7 << 0w15) orb (word8ToWord32(rT regT2) << 0w10) orb (word8ToWord32(xRegOrXSP regN) << 0w5) orb word8ToWord32(rT regT1)) end fun loadStorePairOffset args = loadStorePair 0w2 args and loadStorePairPostIndexed args = loadStorePair 0w1 args and loadStorePairPreIndexed args = loadStorePair 0w3 args in val storePairOffset = loadStorePairOffset(0w2, 0w0, 0w0, xRegOrXZ) and loadPairOffset = loadStorePairOffset(0w2, 0w0, 0w1, xRegOrXZ) and storePairPostIndexed = loadStorePairPostIndexed(0w2, 0w0, 0w0, xRegOrXZ) and loadPairPostIndexed = loadStorePairPostIndexed(0w2, 0w0, 0w1, xRegOrXZ) and storePairPreIndexed = loadStorePairPreIndexed(0w2, 0w0, 0w0, xRegOrXZ) and loadPairPreIndexed = loadStorePairPreIndexed(0w2, 0w0, 0w1, xRegOrXZ) and storePairOffset32 = loadStorePairOffset(0w0, 0w0, 0w0, xRegOrXZ) and loadPairOffset32 = loadStorePairOffset(0w0, 0w0, 0w1, xRegOrXZ) and storePairPostIndexed32 = loadStorePairPostIndexed(0w0, 0w0, 0w0, xRegOrXZ) and loadPairPostIndexed32 = loadStorePairPostIndexed(0w0, 0w0, 0w1, xRegOrXZ) and storePairPreIndexed32 = loadStorePairPreIndexed(0w0, 0w0, 0w0, xRegOrXZ) and loadPairPreIndexed32 = loadStorePairPreIndexed(0w0, 0w0, 0w1, xRegOrXZ) and storePairOffsetFloat = loadStorePairOffset(0w0, 0w1, 0w0, vReg) and loadPairOffsetFloat = loadStorePairOffset(0w0, 0w1, 0w1, vReg) and storePairPostIndexedFloat = loadStorePairPostIndexed(0w0, 0w1, 0w0, vReg) and loadPairPostIndexedFloat = loadStorePairPostIndexed(0w0, 0w1, 0w1, vReg) and storePairPreIndexedFloat = 
loadStorePairPreIndexed(0w0, 0w1, 0w0, vReg) and loadPairPreIndexedFloat = loadStorePairPreIndexed(0w0, 0w1, 0w1, vReg) and storePairOffsetDouble = loadStorePairOffset(0w1, 0w1, 0w0, vReg) and loadPairOffsetDouble = loadStorePairOffset(0w1, 0w1, 0w1, vReg) and storePairPostIndexedDouble = loadStorePairPostIndexed(0w1, 0w1, 0w0, vReg) and loadPairPostIndexedDouble = loadStorePairPostIndexed(0w1, 0w1, 0w1, vReg) and storePairPreIndexedDouble = loadStorePairPreIndexed(0w1, 0w1, 0w0, vReg) and loadPairPreIndexedDouble = loadStorePairPreIndexed(0w1, 0w1, 0w1, vReg) end (* Addresses must go in the constant area at the end of the code where they can be found by the GC. *) fun loadAddressConstant(xReg, valu) = LoadAddressLiteral{reg=xReg, value=valu, length=ref BrExtended} (* Non-address constants. These may or may not be tagged values. *) fun loadNonAddressConstant(xReg, valu) = LoadNonAddressLiteral{reg=xReg, value=valu, length=ref BrExtended} (* Floating point constants. TODO: We can use fmov dn,c for various constant values. *) local (* Use the same instruction for both float and double. *) fun moviZero regD = SimpleInstr(0wx2F00E400 orb word8ToWord32(vReg regD)) in fun loadFloatConstant(vReg, 0w0, _) = moviZero vReg | loadFloatConstant(vReg, valu, work) = LoadFPLiteral{reg=vReg, value=valu, isDouble=false, length=ref BrExtended, work=work} and loadDoubleConstant(vReg, 0w0, _) = moviZero vReg | loadDoubleConstant(vReg, valu, work) = LoadFPLiteral{reg=vReg, value=valu, isDouble=true, length=ref BrExtended, work=work} end local fun moveWideImmediate(sf, opc) {regD, immediate, shift} = let val hw = case (shift, sf) of (0w0, _) => 0w0 | (0w16, _) => 0w1 | (0w32, 0w1) => 0w2 | (0w48, 0w1) => 0w3 | _ => raise InternalError "moveWideImmediate: invalid shift" val _ = immediate <= 0wxffff orelse raise InternalError "moveWideImmediate: immediate too large" in SimpleInstr(0wx12800000 orb (sf << 0w31) orb (opc << 0w29) orb (hw << 0w21) orb (wordToWord32 immediate << 0w5) orb word8ToWord32(xRegOnly regD)) end in val moveNot32 = moveWideImmediate(0w0, 0w0) and moveZero32 = moveWideImmediate(0w0, 0w2) and moveKeep32 = moveWideImmediate(0w0, 0w3) and moveNot = moveWideImmediate(0w1, 0w0) and moveZero = moveWideImmediate(0w1, 0w2) and moveKeep = moveWideImmediate(0w1, 0w3) end (* Instructions involved in thread synchronisation. *) val yield = SimpleInstr 0wxD503203F (* Yield inside a spin-lock. *) and dmbIsh = SimpleInstr 0wxD5033BBF (* Memory barrier. *) (* Jump to the address in the register and put the address of the next instruction into X30. *) fun branchAndLinkReg(dest) = SimpleInstr(0wxD63F0000 orb (word8ToWord32(xRegOnly dest) << 0w5)) (* Jump to the address in the register. *) fun branchRegister(dest) = SimpleInstr(0wxD61F0000 orb (word8ToWord32(xRegOnly dest) << 0w5)) (* Jump to the address in the register and hint this is a return. *) fun returnRegister(dest) = SimpleInstr(0wxD65F0000 orb (word8ToWord32(xRegOnly dest) << 0w5)) (* Put a label into the code. *) val setLabel = Label (* Create a label. *) fun createLabel () = ref [ref 0w0] (* A conditional or unconditional branch. *) and conditionalBranch(cond, label) = ConditionalBranch{label=label, jumpCondition=cond, length=ref BrExtended } and unconditionalBranch label = UnconditionalBranch{label=label, andLink=false} and branchAndLink label = UnconditionalBranch{label=label, andLink=true} (* Put the address of a label into a register - used for handlers and cases. 
*) and loadLabelAddress(reg, label) = LoadLabelAddress{label=label, reg=reg, length=ref BrExtended} (* Test a bit in a register and branch if zero/nonzero *) and testBitBranchZero(reg, bit, label) = TestBitBranch{label=label, bitNo=bit, brNonZero=false, reg=reg, length=ref BrExtended} and testBitBranchNonZero(reg, bit, label) = TestBitBranch{label=label, bitNo=bit, brNonZero=true, reg=reg, length=ref BrExtended} (* Compare a register with zero and branch if zero/nonzero *) and compareBranchZero(reg, label) = CompareBranch{label=label, brNonZero=false, size=WordSize64, reg=reg, length=ref BrExtended} and compareBranchNonZero(reg, label) = CompareBranch{label=label, brNonZero=true, size=WordSize64, reg=reg, length=ref BrExtended} and compareBranchZero32(reg, label) = CompareBranch{label=label, brNonZero=false, size=WordSize32, reg=reg, length=ref BrExtended} and compareBranchNonZero32(reg, label) = CompareBranch{label=label, brNonZero=true, size=WordSize32, reg=reg, length=ref BrExtended} (* Set the destination register to the value of the first reg if the condition is true, otherwise to a (possibly modified) version of the second argument. There are variants that set it unmodified, incremented, inverted and negated. *) local fun conditionalSelect (sf, opc, op2) {regD, regFalse, regTrue, cond} = SimpleInstr(0wx1A800000 orb (sf << 0w31) orb (opc << 0w30) orb (word8ToWord32(xRegOrXZ regFalse) << 0w16) orb (cCode cond << 0w12) orb (op2 << 0w10) orb (word8ToWord32(xRegOrXZ regTrue) << 0w5) orb word8ToWord32(xRegOrXZ regD)) in val conditionalSet = conditionalSelect(0w1, 0w0, 0w0) and conditionalSetIncrement = conditionalSelect(0w1, 0w0, 0w1) and conditionalSetInverted = conditionalSelect(0w1, 0w1, 0w0) and conditionalSetNegated = conditionalSelect(0w1, 0w1, 0w1) and conditionalSet32 = conditionalSelect(0w0, 0w0, 0w0) and conditionalSetIncrement32 = conditionalSelect(0w0, 0w0, 0w1) and conditionalSetInverted32 = conditionalSelect(0w0, 0w1, 0w0) and conditionalSetNegated32 = conditionalSelect(0w0, 0w1, 0w1) end (* This combines the effect of a left and right shift. There are various derived forms of this depending on the relative values of immr and imms. If imms >= immr it copies imms-immr+1 bits from bit position immr to the lsb bits of the destination. If imms < immr it copies imms+1 bits from the lsb bit to bit position regsize-immr. How the remaining bits are affected depends on the instruction. BitField instructions do not affect other bits. UnsignedBitField instructions zero other bits. SignedBitField instructions set the high order bits to a copy of the high order bit copied and zero the low order bits. *) local fun bitfield (sf, opc, n) {immr, imms, regN, regD} = SimpleInstr(0wx13000000 orb (sf << 0w31) orb (opc << 0w29) orb (n << 0w22) orb (wordToWord32 immr << 0w16) orb (wordToWord32 imms << 0w10) orb (word8ToWord32(xRegOrXZ regN) << 0w5) orb word8ToWord32(xRegOrXZ regD)) in val signedBitfieldMove32 = bitfield(0w0, 0w0, 0w0) and bitfieldMove32 = bitfield(0w0, 0w1, 0w0) and unsignedBitfieldMove32 = bitfield(0w0, 0w2, 0w0) and signedBitfieldMove64 = bitfield(0w1, 0w0, 0w1) and bitfieldMove64 = bitfield(0w1, 0w1, 0w1) and unsignedBitfieldMove64 = bitfield(0w1, 0w2, 0w1) (* Derived forms. 
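As a worked example of the derived forms: logicalShiftRight{shift=0w3, regN=X1, regD=X0} below expands to

    unsignedBitfieldMove64{immr=0w3, imms=0wx3f, regN=X1, regD=X0}

Since imms >= immr this copies imms-immr+1 = 61 bits from bit position 3 down to the lsb and zeroes the high bits, which is exactly lsr x0, x1, #3.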
*) fun logicalShiftLeft{shift, regN, regD} = unsignedBitfieldMove64{immr=Word.~ shift mod 0w64, imms=0w64-0w1-shift, regN=regN, regD=regD} and logicalShiftLeft32{shift, regN, regD} = unsignedBitfieldMove32{immr=Word.~ shift mod 0w32, imms=0w32-0w1-shift, regN=regN, regD=regD} and logicalShiftRight{shift, regN, regD} = unsignedBitfieldMove64{immr=shift, imms=0wx3f, regN=regN, regD=regD} and logicalShiftRight32{shift, regN, regD} = unsignedBitfieldMove32{immr=shift, imms=0wx1f, regN=regN, regD=regD} and unsignedBitfieldInsertinZeros{lsb, width, regN, regD} = unsignedBitfieldMove64{immr=Word.~ lsb mod 0w64, imms=width-0w1, regN=regN, regD=regD} and unsignedBitfieldInsertinZeros32{lsb, width, regN, regD} = unsignedBitfieldMove32{immr=Word.~ lsb mod 0w32, imms=width-0w1, regN=regN, regD=regD} and arithmeticShiftRight{shift, regN, regD} = signedBitfieldMove64{immr=shift, imms=0wx3f, regN=regN, regD=regD} and arithmeticShiftRight32{shift, regN, regD} = signedBitfieldMove32{immr=shift, imms=0wx1f, regN=regN, regD=regD} and signedBitfieldExtract{lsb, width, regN, regD} = signedBitfieldMove64{immr=lsb, imms=lsb+width-0w1, regN=regN, regD=regD} and bitfieldInsert{lsb, width, regN, regD} = bitfieldMove64{immr=Word.~ lsb mod 0w64, imms=width-0w1, regN=regN, regD=regD} and bitfieldInsert32{lsb, width, regN, regD} = bitfieldMove32{immr=Word.~ lsb mod 0w32, imms=width-0w1, regN=regN, regD=regD} end local (* Logical immediates. AND, OR, XOR and ANDS. Assumes that the immediate value has already been checked as valid. The non-flags versions can use SP as the destination. *) fun logicalImmediate (s, opc, xD) {bits, regN, regD} = let val {n, imms, immr} = case encodeBitPattern(bits, if s = 0w0 then WordSize32 else WordSize64) of NONE => raise InternalError "testBitPattern: unable to encode bit pattern" | SOME res => res in SimpleInstr(0wx12000000 orb (opc << 0w29) orb (s << 0w31) orb (n << 0w22) orb (wordToWord32 immr << 0w16) orb (wordToWord32 imms << 0w10) orb (word8ToWord32(xRegOrXZ regN) << 0w5) orb word8ToWord32(xD regD)) end in val bitwiseAndImmediate = logicalImmediate (0w1, 0w0, xRegOrXSP) and bitwiseOrImmediate = logicalImmediate (0w1, 0w1, xRegOrXSP) and bitwiseXorImmediate = logicalImmediate (0w1, 0w2, xRegOrXSP) and bitwiseAndSImmediate = logicalImmediate (0w1, 0w3, xRegOrXZ) and bitwiseAndImmediate32 = logicalImmediate (0w0, 0w0, xRegOrXSP) and bitwiseOrImmediate32 = logicalImmediate (0w0, 0w1, xRegOrXSP) and bitwiseXorImmediate32 = logicalImmediate (0w0, 0w2, xRegOrXSP) and bitwiseAndSImmediate32 = logicalImmediate (0w0, 0w3, xRegOrXZ) end local (* Floating point operations - 2 source *) fun floatingPoint2Source (pt, opc) {regM, regN, regD} = SimpleInstr(0wx1E200800 orb (pt << 0w22) orb (word8ToWord32(vReg regM) << 0w16) orb (opc << 0w12) orb (word8ToWord32(vReg regN) << 0w5) orb word8ToWord32(vReg regD)) in val multiplyFloat = floatingPoint2Source(0w0, 0wx0) and divideFloat = floatingPoint2Source(0w0, 0wx1) and addFloat = floatingPoint2Source(0w0, 0wx2) and subtractFloat = floatingPoint2Source(0w0, 0wx3) and multiplyDouble = floatingPoint2Source(0w1, 0wx0) and divideDouble = floatingPoint2Source(0w1, 0wx1) and addDouble = floatingPoint2Source(0w1, 0wx2) and subtractDouble = floatingPoint2Source(0w1, 0wx3) end local (* Move between a floating point and a general register with or without conversion. 
*) fun fmoveGeneral (sf, s, ptype, mode, opcode, rN, rD) {regN, regD} = SimpleInstr(0wx1E200000 orb (sf << 0w31) orb (s << 0w29) orb (ptype << 0w22) orb (mode << 0w19) orb (opcode << 0w16) orb (word8ToWord32(rN regN) << 0w5) orb word8ToWord32(rD regD)) open IEEEReal in (* Moves without conversion *) val moveGeneralToFloat = fmoveGeneral(0w0, 0w0, 0w0, 0w0, 0w7, xRegOrXZ, vReg) and moveFloatToGeneral = fmoveGeneral(0w0, 0w0, 0w0, 0w0, 0w6, vReg, xRegOnly) and moveGeneralToDouble = fmoveGeneral(0w1, 0w0, 0w1, 0w0, 0w7, xRegOrXZ, vReg) and moveDoubleToGeneral = fmoveGeneral(0w1, 0w0, 0w1, 0w0, 0w6, vReg, xRegOnly) (* Moves with conversion - signed. The argument is a 64-bit value. *) and convertIntToFloat = fmoveGeneral(0w1, 0w0, 0w0, 0w0, 0w2, xRegOrXZ, vReg) and convertIntToDouble = fmoveGeneral(0w1, 0w0, 0w1, 0w0, 0w2, xRegOrXZ, vReg) and convertInt32ToFloat = fmoveGeneral(0w0, 0w0, 0w0, 0w0, 0w2, xRegOrXZ, vReg) and convertInt32ToDouble = fmoveGeneral(0w0, 0w0, 0w1, 0w0, 0w2, xRegOrXZ, vReg) fun convertFloatToInt TO_NEAREST = fmoveGeneral(0w1, 0w0, 0w0, 0w0, 0w4, vReg, xRegOnly) (* fcvtas *) | convertFloatToInt TO_NEGINF = fmoveGeneral(0w1, 0w0, 0w0, 0w2, 0w0, vReg, xRegOnly) (* fcvtms *) | convertFloatToInt TO_POSINF = fmoveGeneral(0w1, 0w0, 0w0, 0w1, 0w0, vReg, xRegOnly) (* fcvtps *) | convertFloatToInt TO_ZERO = fmoveGeneral(0w1, 0w0, 0w0, 0w3, 0w0, vReg, xRegOnly) (* fcvtzs *) and convertDoubleToInt TO_NEAREST = fmoveGeneral(0w1, 0w0, 0w1, 0w0, 0w4, vReg, xRegOnly) (* fcvtas *) | convertDoubleToInt TO_NEGINF = fmoveGeneral(0w1, 0w0, 0w1, 0w2, 0w0, vReg, xRegOnly) (* fcvtms *) | convertDoubleToInt TO_POSINF = fmoveGeneral(0w1, 0w0, 0w1, 0w1, 0w0, vReg, xRegOnly) (* fcvtps *) | convertDoubleToInt TO_ZERO = fmoveGeneral(0w1, 0w0, 0w1, 0w3, 0w0, vReg, xRegOnly) (* fcvtzs *) and convertFloatToInt32 TO_NEAREST = fmoveGeneral(0w0, 0w0, 0w0, 0w0, 0w4, vReg, xRegOnly) (* fcvtas *) | convertFloatToInt32 TO_NEGINF = fmoveGeneral(0w0, 0w0, 0w0, 0w2, 0w0, vReg, xRegOnly) (* fcvtms *) | convertFloatToInt32 TO_POSINF = fmoveGeneral(0w0, 0w0, 0w0, 0w1, 0w0, vReg, xRegOnly) (* fcvtps *) | convertFloatToInt32 TO_ZERO = fmoveGeneral(0w0, 0w0, 0w0, 0w3, 0w0, vReg, xRegOnly) (* fcvtzs *) and convertDoubleToInt32 TO_NEAREST = fmoveGeneral(0w0, 0w0, 0w1, 0w0, 0w4, vReg, xRegOnly) (* fcvtas *) | convertDoubleToInt32 TO_NEGINF = fmoveGeneral(0w0, 0w0, 0w1, 0w2, 0w0, vReg, xRegOnly) (* fcvtms *) | convertDoubleToInt32 TO_POSINF = fmoveGeneral(0w0, 0w0, 0w1, 0w1, 0w0, vReg, xRegOnly) (* fcvtps *) | convertDoubleToInt32 TO_ZERO = fmoveGeneral(0w0, 0w0, 0w1, 0w3, 0w0, vReg, xRegOnly) (* fcvtzs *) end local fun floatingPtCompare(ptype, opc) {regM, regN} = SimpleInstr(0wx1E202000 orb (ptype << 0w22) orb (word8ToWord32(vReg regM) << 0w16) orb (word8ToWord32(vReg regN) << 0w5) orb (opc << 0w3)) in val compareFloat = floatingPtCompare(0w0, 0w0) (* fcmp *) and compareDouble = floatingPtCompare(0w1, 0w0) (* It is also possible to compare a single register with zero using opc=1/3 *) end local (* Floating point single source. 
*) fun floatingPtSingle (ptype, opc) {regN, regD} = SimpleInstr(0wx1E204000 orb (ptype << 0w22) orb (opc << 0w15) orb (word8ToWord32(vReg regN) << 0w5) orb word8ToWord32(vReg regD)) in val moveFloatToFloat = floatingPtSingle(0w0, 0wx0) and absFloat = floatingPtSingle(0w0, 0wx1) and negFloat = floatingPtSingle(0w0, 0wx2) and convertFloatToDouble = floatingPtSingle(0w0, 0wx5) and moveDoubleToDouble = floatingPtSingle(0w1, 0wx0) and absDouble = floatingPtSingle(0w1, 0wx1) and negDouble = floatingPtSingle(0w1, 0wx2) and convertDoubleToFloat = floatingPtSingle(0w1, 0wx4) end local fun atomicMemory (size, v, a, r, o3, opc) {regS, regN, regT} = SimpleInstr(0wx38200000 orb (size << 0w30) orb (v << 0w26) orb (a << 0w23) orb (r << 0w22) orb (o3 << 0w15) orb (opc << 0w12) orb (word8ToWord32(xRegOrXZ regS) << 0w16) orb (word8ToWord32(xRegOrXSP regN) << 0w5) orb word8ToWord32(xRegOrXZ regT)) in val loadAddAL = atomicMemory(0w3, 0w0, 0w1, 0w1, 0w0, 0w0) and loadUMaxAL = atomicMemory(0w3, 0w0, 0w1, 0w1, 0w0, 0w6) and swapAL = atomicMemory(0w3, 0w0, 0w1, 0w1, 0w1, 0w0) + and loadAddA = atomicMemory(0w3, 0w0, 0w1, 0w0, 0w0, 0w0) + and loadUMaxA = atomicMemory(0w3, 0w0, 0w1, 0w0, 0w0, 0w6) + and swapL = atomicMemory(0w3, 0w0, 0w0, 0w1, 0w1, 0w0) end (* This word is put in after a call to the RTS trap-handler. All the registers are saved and restored across a call to the trap-handler; the register mask contains those that may contain an address and so need to be scanned and possibly updated if there is a GC. *) fun registerMask(regs) = let fun addToMask(r, mask) = let val rno = word8ToWord(xRegOnly r) in if rno > 0w24 (* In particular this can't be X30. *) then raise InternalError ("registerMask: invalid register "^Word.toString rno) else mask orb (0w1 << word8ToWord(xRegOnly r)) end val maskWord = List.foldl addToMask 0w0 regs in SimpleInstr(0wx02000000 (* Reserved instr range. *) orb maskWord) end (* This is a bit of a hack but is the only way to get round the problem that when a callback (FFI closure) is called the code has none of the global registers. This isn't a problem in the native addressing version because we have absolute addresses but in 32-in-64 we need at least one absolute address to begin. This embeds the global heap base pointer as a constant in the non-address constant area. It requires the RTS to be able to find it and update it when the code is loaded. We insert a nop followed by the pc-relative load. This MUST be the first instruction in the code. *) local val getHeapBase: unit -> LargeWord.word = RunCall.rtsCallFull0 "PolyGetHeapBase" in fun loadGlobalHeapBaseInCallback reg = if is32in64 then [SimpleInstr nopCode, loadNonAddressConstant(reg, getHeapBase())] else raise InternalError "loadGlobalHeapBaseInCallback called with native addressing" end (* Size of each code word. 
*) fun codeSize (SimpleInstr _) = 1 (* Number of 32-bit words *) | codeSize (LoadAddressLiteral{ length=ref BrShort, ...}) = 1 | codeSize (LoadAddressLiteral{ length=ref BrExtended, ...}) = 2 | codeSize (LoadNonAddressLiteral{ length=ref BrShort, ...}) = 1 | codeSize (LoadNonAddressLiteral{ length=ref BrExtended, ...}) = 2 | codeSize (LoadFPLiteral{ length=ref BrShort, ...}) = 1 | codeSize (LoadFPLiteral{ length=ref BrExtended, ...}) = 2 | codeSize (Label _) = 0 | codeSize (UnconditionalBranch _) = 1 | codeSize (LoadLabelAddress { length=ref BrShort, ...}) = 1 | codeSize (LoadLabelAddress { length=ref BrExtended, ...}) = 2 | codeSize (ConditionalBranch { length=ref BrShort, ...}) = 1 | codeSize (ConditionalBranch { length=ref BrExtended, ...}) = 2 | codeSize (TestBitBranch { length=ref BrShort, ...}) = 1 | codeSize (TestBitBranch { length=ref BrExtended, ...}) = 2 | codeSize (CompareBranch { length=ref BrShort, ...}) = 1 | codeSize (CompareBranch { length=ref BrExtended, ...}) = 2 (* Store a 32-bit value in the code. Always little-endian. *) fun writeInstr(value, wordAddr, seg) = let fun putBytes(value, a, seg, i) = if i = 0w4 then () else ( byteVecSet(seg, a+i, word32ToWord8(value andb 0wxff)); putBytes(value >> 0w8, a, seg, i+0w1) ) in putBytes(value, Word.<<(wordAddr, 0w2), seg, 0w0) end (* Store a 64-bit constant in the code area. *) fun write64Bit(value, word64Addr, seg) = let fun putBytes(value, a, seg, i) = if i = 0w8 then () else ( byteVecSet(seg, if not isBigEndian then a+i else a+0w8-i-0w1, Word8.fromLarge(Word64.toLarge value)); putBytes(Word64.>>(value, 0w8), a, seg, i+0w1) ) in putBytes(value, Word.<<(word64Addr, 0w3), seg, 0w0) end (* Set the sizes of branches depending on the distance to the destination. *) fun setLabelsAndSizes(ops, maxConstantSize) = let (* Set the labels and get the current size of the code. *) fun setLabels(Label(ref labs) :: ops, ic) = (List.app(fn d => d := ic) labs; setLabels(ops, ic)) | setLabels(oper :: ops, ic) = setLabels(ops, ic + Word.fromInt(codeSize oper)) | setLabels([], ic) = ic (* Set the labels and adjust the sizes, repeating until it never gets smaller *) fun setLabAndSize(ops, lastSize) = let (* See if we can shorten any branches. The "addr" is the original address since that's what we've used to set the labels. *) fun adjust([], _) = () | adjust(ConditionalBranch { length as ref BrExtended, label=ref labs, ...} :: instrs, addr) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt addr in if willFitInRange(offset, 0w19) then length := BrShort else (); adjust(instrs, addr + 0w2) (* N.B. Size BEFORE any adjustment *) end | adjust(TestBitBranch { length as ref BrExtended, label=ref labs, ...} :: instrs, addr) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt addr in if willFitInRange(offset, 0w14) then length := BrShort else (); adjust(instrs, addr + 0w2) (* N.B. Size BEFORE any adjustment *) end | adjust(CompareBranch { length as ref BrExtended, label=ref labs, ...} :: instrs, addr) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt addr in if willFitInRange(offset, 0w19) then length := BrShort else (); adjust(instrs, addr + 0w2) (* N.B. Size BEFORE any adjustment *) end | adjust(LoadAddressLiteral { length as ref BrExtended, ...} :: instrs, addr) = let val offset = Word.toInt (lastSize + maxConstantSize) - Word.toInt addr in (* We can only shorten these in 32-in-64. 
In native 64-bits we may need to move the constant area *) if is32in64 andalso willFitInRange(offset, 0w19) then length := BrShort else (); adjust(instrs, addr + 0w2) (* N.B. Size BEFORE any adjustment *) end | adjust(LoadNonAddressLiteral { length as ref BrExtended, ...} :: instrs, addr) = let val offset = Word.toInt (lastSize + maxConstantSize) - Word.toInt addr in if willFitInRange(offset, 0w19) then length := BrShort else (); adjust(instrs, addr + 0w2) (* N.B. Size BEFORE any adjustment *) end | adjust(LoadFPLiteral { length as ref BrExtended, ...} :: instrs, addr) = let val offset = Word.toInt (lastSize + maxConstantSize) - Word.toInt addr in if willFitInRange(offset, 0w19) then length := BrShort else (); adjust(instrs, addr + 0w2) (* N.B. Size BEFORE any adjustment *) end | adjust(LoadLabelAddress { length as ref BrExtended, label=ref labs, ...} :: instrs, addr) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt addr in if willFitInRange(offset, 0w19) then length := BrShort else (); adjust(instrs, addr + 0w2) (* N.B. Size BEFORE any adjustment *) end | adjust(instr :: instrs, addr) = adjust(instrs, addr + Word.fromInt(codeSize instr)) val () = adjust(ops, 0w0) val nextSize = setLabels(ops, 0w0) in if nextSize < lastSize then setLabAndSize(ops, nextSize) else if nextSize = lastSize then lastSize else raise InternalError "setLabAndSize - size increased" end in setLabAndSize(ops, setLabels(ops, 0w0)) end fun genCode(ops, addressConsts, nonAddressConsts, addrConstMap, nonAddrConstMap) = let val numNonAddrConsts = Word.fromInt(List.length nonAddressConsts) and numAddrConsts = Word.fromInt(List.length addressConsts) (* 32-bit words. *) val constSizePlusExtras = (* Number of extra (poly)words needed. *) numNonAddrConsts * wordsPerNativeWord + numAddrConsts + 0w4 (* 4 extra words *) val codeSize (* Number of 32-bit instructions *) = setLabelsAndSizes(ops, constSizePlusExtras * (Address.wordSize div 0w4) + 0w2 (*allow 2 UDFs*)) val wordsOfCode = (codeSize + 0w2) div 0w2 (* Round up to 64-bits with the UDF marker(s) added. *) (* Put one or two UDF instructions at the end as markers. *) val endOfCodeWords = if Word.andb(codeSize, 0w1) = 0w0 then [SimpleInstr undefCode, SimpleInstr undefCode] else [SimpleInstr undefCode] (* Segment size in Poly words. 
*) val segSize = wordsOfCode*wordsPerNativeWord + constSizePlusExtras val codeVec = byteVecMake segSize fun testBit(bitNo, brNonZero, offset, reg) = 0wx36000000 orb (if bitNo >= 0w32 then 0wx80000000 else 0w0) orb (if brNonZero then 0wx01000000 else 0w0) orb (word8ToWord32(Word8.andb(bitNo, 0wx3f)) << 0w19) orb ((offset andb 0wx3fff) << 0w5) orb word8ToWord32(xRegOnly reg) and compareBranch(size, brNonZero, offset, reg) = 0wx34000000 orb (case size of WordSize64 => 0wx80000000 | WordSize32 => 0w0) orb (if brNonZero then 0wx01000000 else 0w0) orb ((offset andb 0wx7ffff) << 0w5) orb word8ToWord32(xRegOnly reg) fun genCodeWords([], _ , _, _) = () | genCodeWords(SimpleInstr code :: tail, wordNo, aConstNum, nonAConstNum) = ( writeInstr(code, wordNo, codeVec); genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum) ) | genCodeWords(LoadAddressLiteral{reg, length=ref BrExtended, ...} :: tail, wordNo, aConstNum, nonAConstNum) = let val code1 = 0wx90000000 orb word8ToWord32(xRegOnly reg) val code2 = (if is32in64 then loadRegScaled32 else loadRegScaled) {regT=reg, regN=reg, unitOffset=0} in writeInstr(code1, wordNo, codeVec); genCodeWords(code2 :: tail, wordNo+0w1, aConstNum+1, nonAConstNum) end | genCodeWords(LoadAddressLiteral{reg, length=ref BrShort, ...} :: tail, wordNo, aConstNum, nonAConstNum) = (* Address literals can be shortened in 32-in-64 but are always 2 instrs in 64-bit. That allows for the constant area to be pulled out if necessary to make the code position-independent. *) let (* The offset is in 32-bit words. The first of the constants is at offset wordsOfCode+3. Non-address constants are always 8 bytes but address constants are 4 bytes in 32-in-64. *) val s = if is32in64 then 0w0 else 0w1 (* Load 64-bit word in 64-bit mode and 32-bits in 32-in-64. *) val constPos = Array.sub(addrConstMap, aConstNum) val offsetOfConstant = (wordsOfCode+numNonAddrConsts)*0w2 + (0w3+constPos)*(Address.wordSize div 0w4) - wordNo val _ = willFitInRange(Word.toInt offsetOfConstant, 0w19) orelse raise InternalError "Offset to constant is too large" val code = 0wx18000000 orb (s << 0w30) orb (wordToWord32 offsetOfConstant << 0w5) orb word8ToWord32(xRegOnly reg) in writeInstr(code, wordNo, codeVec); genCodeWords(tail, wordNo+0w1, aConstNum+1, nonAConstNum) end | genCodeWords(LoadNonAddressLiteral{reg, length=ref BrExtended, ...} :: tail, wordNo, aConstNum, nonAConstNum) = let val code1 = 0wx90000000 orb word8ToWord32(xRegOnly reg) (* The load instruction is always 64-bits even in 32-in-64. *) val code2 = loadRegScaled{regT=reg, regN=reg, unitOffset=0} in writeInstr(code1, wordNo, codeVec); genCodeWords(code2 :: tail, wordNo+0w1, aConstNum, nonAConstNum+1) end | genCodeWords(LoadNonAddressLiteral{reg, length=ref BrShort, ...} :: tail, wordNo, aConstNum, nonAConstNum) = (* These can be shortened since they're always part of the code. *) let (* The offset is in 32-bit words. These are always 64-bits. 
*) val constPos = Array.sub(nonAddrConstMap, nonAConstNum) val offsetOfConstant = (wordsOfCode+constPos)*0w2 - wordNo val _ = willFitInRange(Word.toInt offsetOfConstant, 0w19) orelse raise InternalError "Offset to constant is too large" val code = 0wx58000000 orb (wordToWord32 offsetOfConstant << 0w5) orb word8ToWord32(xRegOnly reg) in writeInstr(code, wordNo, codeVec); genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum+1) end | genCodeWords(LoadFPLiteral{reg, work, isDouble, length=ref BrExtended, ...} :: tail, wordNo, aConstNum, nonAConstNum) = let val code1 = 0wx90000000 orb word8ToWord32(xRegOnly work) val code2 = (if isDouble then loadRegScaledDouble else loadRegScaledFloat){regT=reg, regN=work, unitOffset=0} in writeInstr(code1, wordNo, codeVec); genCodeWords(code2 :: tail, wordNo+0w1, aConstNum, nonAConstNum+1) end | genCodeWords(LoadFPLiteral{reg, isDouble, length=ref BrShort, ...} :: tail, wordNo, aConstNum, nonAConstNum) = let (* The offset is in 32-bit words. These are always 64-bits. *) val constPos = Array.sub(nonAddrConstMap, nonAConstNum) val offsetOfConstant = (wordsOfCode+constPos)*0w2 - wordNo val _ = willFitInRange(Word.toInt offsetOfConstant, 0w19) orelse raise InternalError "Offset to constant is too large" val code = (if isDouble then 0wx5c000000 else 0wx1c000000) orb (wordToWord32 offsetOfConstant << 0w5) orb word8ToWord32(vReg reg) in writeInstr(code, wordNo, codeVec); genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum+1) end | genCodeWords(Label _ :: tail, wordNo, aConstNum, nonAConstNum) = genCodeWords(tail, wordNo, aConstNum, nonAConstNum) (* No code. *) | genCodeWords(UnconditionalBranch{label=ref labs, andLink} :: tail, wordNo, aConstNum, nonAConstNum) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt wordNo val _ = willFitInRange(offset, 0w26) orelse raise InternalError "genCodeWords: branch too far" val linkBit = if andLink then 0wx80000000 else 0w0 in writeInstr(0wx14000000 orb linkBit orb (Word32.fromInt offset andb 0wx03ffffff), wordNo, codeVec); genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum) end | genCodeWords(ConditionalBranch{ label=ref labs, jumpCondition=cond, length=ref BrShort }:: tail, wordNo, aConstNum, nonAConstNum) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt wordNo val _ = willFitInRange(offset, 0w19) orelse raise InternalError "genCodeWords: branch too far" in writeInstr(0wx54000000 orb ((Word32.fromInt offset andb 0wx07ffff) << 0w5) orb cCode cond, wordNo, codeVec); genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum) end | genCodeWords(ConditionalBranch{ label=ref labs, jumpCondition, length=ref BrExtended }:: tail, wordNo, aConstNum, nonAConstNum) = let (* Long form - put a conditional branch with reversed sense round an unconditional branch. *) val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt (wordNo + 0w1) (* Next instruction. 
*) val _ = willFitInRange(offset, 0w26) orelse raise InternalError "genCodeWords: branch too far" val revCond = invertTest jumpCondition in writeInstr(0wx54000000 orb (0w2 << 0w5) orb cCode revCond, wordNo, codeVec); writeInstr(0wx14000000 orb (Word32.fromInt offset andb 0wx03ffffff), wordNo+0w1, codeVec); genCodeWords(tail, wordNo+0w2, aConstNum, nonAConstNum) end | genCodeWords(LoadLabelAddress{reg, length=ref BrExtended, ...} :: tail, wordNo, aConstNum, nonAConstNum) = let val code1 = 0wx90000000 orb word8ToWord32(xRegOnly reg) val code2 = addImmediate{regN=reg, regD=reg, immed=0w0, shifted=false} in writeInstr(code1, wordNo, codeVec); genCodeWords(code2 :: tail, wordNo+0w1, aConstNum, nonAConstNum) end | genCodeWords(LoadLabelAddress{label=ref labs, reg, length=ref BrShort, ...} :: tail, wordNo, aConstNum, nonAConstNum) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt wordNo val _ = willFitInRange(offset, 0w19) orelse raise InternalError "Offset to label address is too large" val code = 0wx10000000 orb ((Word32.fromInt offset andb 0wx7ffff) << 0w5) orb word8ToWord32(xRegOnly reg) in writeInstr(code, wordNo, codeVec); genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum) end | genCodeWords(TestBitBranch{label=ref labs, bitNo, brNonZero, reg, length=ref BrExtended} :: tail, wordNo, aConstNum, nonAConstNum) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt (wordNo + 0w1) (* Next instruction *) val _ = willFitInRange(offset, 0w25) orelse raise InternalError "genCodeWords: branch too far" val _ = bitNo <= 0w63 orelse raise InternalError "TestBitBranch: bit number > 63" val code = testBit(bitNo, (* Invert test *) not brNonZero, 0w2 (* Skip branch *), reg) in writeInstr(code, wordNo, codeVec); writeInstr(0wx14000000 orb (Word32.fromInt offset andb 0wx03ffffff), wordNo+0w1, codeVec); genCodeWords(tail, wordNo+0w2, aConstNum, nonAConstNum) end | genCodeWords(TestBitBranch{label=ref labs, bitNo, brNonZero, reg, length=ref BrShort} :: tail, wordNo, aConstNum, nonAConstNum) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt wordNo val _ = willFitInRange(offset, 0w14) orelse raise InternalError "TestBitBranch: Offset to label address is too large" val _ = bitNo <= 0w63 orelse raise InternalError "TestBitBranch: bit number > 63" val code = testBit(bitNo, brNonZero, Word32.fromInt offset, reg) in writeInstr(code, wordNo, codeVec); genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum) end | genCodeWords(CompareBranch{label=ref labs, brNonZero, size, reg, length=ref BrExtended} :: tail, wordNo, aConstNum, nonAConstNum) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt (wordNo+0w1) val _ = willFitInRange(offset, 0w25) orelse raise InternalError "genCodeWords: branch too far" val code = compareBranch(size, (* Invert test *) not brNonZero, 0w2, reg) in writeInstr(code, wordNo, codeVec); writeInstr(0wx14000000 orb (Word32.fromInt offset andb 0wx03ffffff), wordNo+0w1, codeVec); genCodeWords(tail, wordNo+0w2, aConstNum, nonAConstNum) end | genCodeWords(CompareBranch{label=ref labs, brNonZero, size, reg, length=ref BrShort} :: tail, wordNo, aConstNum, nonAConstNum) = let val dest = !(hd labs) val offset = Word.toInt dest - Word.toInt wordNo val _ = willFitInRange(offset, 0w19) orelse raise InternalError "CompareBranch: Offset to label address is too large" val code = compareBranch(size, brNonZero, Word32.fromInt offset, reg) in writeInstr(code, wordNo, codeVec); genCodeWords(tail, wordNo+0w1, aConstNum, nonAConstNum) end in 
genCodeWords (ops @ endOfCodeWords, 0w0, 0, 0); (* Copy in the non-address constants. *) List.foldl(fn (cVal, addr) => (write64Bit(cVal, addr, codeVec); addr+0w1)) wordsOfCode nonAddressConsts; (codeVec (* Return the completed code. *), wordsOfCode+numNonAddrConsts (* And the size in 64-bit words. *)) end (* Store a word, either 64-bit or 32-bit. *) fun setWord(value, wordNo, seg) = let val addrs = wordNo * Address.wordSize fun putBytes(value, a, seg, i) = if i = Address.wordSize then () else ( byteVecSet(seg, if not isBigEndian then a+i else a+wordSize-i-0w1, Word8.fromLarge value); putBytes(LargeWord.>>(value, 0w8), a, seg, i+0w1) ) in putBytes(value, addrs, seg, 0w0) end (* Print the instructions in the code. *) fun printCode (codeVec, functionName, wordsOfCode, printStream) = let val numInstructions = wordsOfCode * (Address.wordSize div 0w4) fun printHex (v, n) = let val s = Word32.fmt StringCvt.HEX v val pad = CharVector.tabulate(Int.max(0, n-size s), fn _ => #"0") in printStream pad; printStream s end fun printCondition 0wx0 = printStream "eq" | printCondition 0wx1 = printStream "ne" | printCondition 0wx2 = printStream "cs" | printCondition 0wx3 = printStream "cc" | printCondition 0wx4 = printStream "mi" | printCondition 0wx5 = printStream "pl" | printCondition 0wx6 = printStream "vs" | printCondition 0wx7 = printStream "vc" | printCondition 0wx8 = printStream "hi" | printCondition 0wx9 = printStream "ls" | printCondition 0wxa = printStream "ge" | printCondition 0wxb = printStream "lt" | printCondition 0wxc = printStream "gt" | printCondition 0wxd = printStream "le" | printCondition 0wxe = printStream "al" | printCondition _ = printStream "nv" (* Normal XReg with 31 being XZ *) fun prXReg 0w31 = printStream "xz" | prXReg r = printStream("x" ^ Word32.fmt StringCvt.DEC r) (* XReg when 31 is SP *) fun prXRegOrSP 0w31 = printStream "sp" | prXRegOrSP r = printStream("x" ^ Word32.fmt StringCvt.DEC r) (* Normal WReg with 31 being WZ *) fun prWReg 0w31 = printStream "wz" | prWReg r = printStream("w" ^ Word32.fmt StringCvt.DEC r) (* WReg when 31 is WSP *) fun prWRegOrSP 0w31 = printStream "wsp" | prWRegOrSP r = printStream("w" ^ Word32.fmt StringCvt.DEC r) (* Each instruction is 32 bits (4 bytes). *) fun printWordAt wordNo = let val byteNo = Word.<<(wordNo, 0w2) val () = printHex(wordToWord32 byteNo, 6) (* Address *) val () = printStream "\t" val wordValue = word8ToWord32 (codeVecGet (codeVec, byteNo)) orb (word8ToWord32 (codeVecGet (codeVec, byteNo+0w1)) << 0w8) orb (word8ToWord32 (codeVecGet (codeVec, byteNo+0w2)) << 0w16) orb (word8ToWord32 (codeVecGet (codeVec, byteNo+0w3)) << 0w24) val () = printHex(wordValue, 8) (* Instr as hex *) val () = printStream "\t" in if (wordValue andb 0wxfffffc1f) = 0wxD61F0000 then let val rN = (wordValue andb 0wx3e0) >> 0w5 in printStream "br\tx"; printStream(Word32.fmt StringCvt.DEC rN) end else if (wordValue andb 0wxfffffc1f) = 0wxD63F0000 then let val rN = (wordValue andb 0wx3e0) >> 0w5 in printStream "blr\tx"; printStream(Word32.fmt StringCvt.DEC rN) end else if (wordValue andb 0wxfffffc1f) = 0wxD65F0000 then let val rN = (wordValue andb 0wx3e0) >> 0w5 in printStream "ret\tx"; printStream(Word32.fmt StringCvt.DEC rN) end else if wordValue = 0wxD503201F then printStream "nop" else if wordValue = 0wxD503203F then printStream "yield" else if wordValue = 0wxD5033BBF then printStream "dmb\tish" else if (wordValue andb 0wx1f800000) = 0wx12800000 then (* Move of constants. Includes movn and movk.
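               Worked examples with assumed values: 0xD2800540 has opc=10
               (movz), imm16=42, rd=0 and prints as mov x0,#42; 0x92800000
               has opc=00 (movn) and prints as mov x0,#-1, i.e. the value
               shown is -(imm16+1).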
*) let val rD = wordValue andb 0wx1f val imm16 = Word32.toInt((wordValue >> 0w5) andb 0wxffff) val isXReg = (wordValue andb 0wx80000000) <> 0w0 val opc = (wordValue >> 0w29) andb 0w3 val shift = (wordValue >> 0w21) andb 0w3 in printStream (if opc = 0w3 then "movk\t" else "mov\t"); printStream (if isXReg then "x" else "w"); printStream(Word32.fmt StringCvt.DEC rD); printStream ",#"; printStream(Int.toString(if opc = 0w0 then ~1 - imm16 else imm16)); if shift = 0w0 then () else (printStream ",lsl #"; printStream(Word32.fmt StringCvt.DEC (shift*0w16))) end else if (wordValue andb 0wx3b000000) = 0wx39000000 then (* Load/Store with unsigned, scaled offset. *) let (* The offset is in units of the size of the operand. *) val size = wordValue >> 0w30 and v = (wordValue >> 0w26) andb 0w1 and opc = (wordValue >> 0w22) andb 0w3 val rT = wordValue andb 0wx1f and rN = (wordValue andb 0wx3e0) >> 0w5 and imm12 = (wordValue andb 0wx3ffc00) >> 0w10 val (opcode, r, scale) = case (size, v, opc) of (0w0, 0w0, 0w0) => ("strb", "w", 0w1) | (0w0, 0w0, 0w1) => ("ldrb", "w", 0w1) | (0w1, 0w0, 0w0) => ("strh", "w", 0w2) | (0w1, 0w0, 0w1) => ("ldrh", "w", 0w2) | (0w2, 0w0, 0w0) => ("str", "w", 0w4) | (0w2, 0w0, 0w1) => ("ldr", "w", 0w4) | (0w3, 0w0, 0w0) => ("str", "x", 0w8) | (0w3, 0w0, 0w1) => ("ldr", "x", 0w8) | (0w2, 0w1, 0w0) => ("str", "s", 0w4) | (0w2, 0w1, 0w1) => ("ldr", "s", 0w4) | (0w3, 0w1, 0w0) => ("str", "d", 0w8) | (0w3, 0w1, 0w1) => ("ldr", "d", 0w8) | _ => ("??", "?", 0w1) in printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rT); printStream ",["; prXRegOrSP rN; printStream ",#"; printStream(Word32.fmt StringCvt.DEC(imm12*scale)); printStream "]" end else if (wordValue andb 0wx3b200c00) = 0wx38000000 then (* Load/store unscaled immediate *) let val size = wordValue >> 0w30 and v = (wordValue >> 0w26) andb 0w1 and opc = (wordValue >> 0w22) andb 0w3 val rT = wordValue andb 0wx1f and rN = (wordValue andb 0wx3e0) >> 0w5 and imm9 = (wordValue andb 0wx1ff000) >> 0w12 val imm9Text = if imm9 > 0wxff then "-" ^ Word32.fmt StringCvt.DEC (0wx200 - imm9) else Word32.fmt StringCvt.DEC imm9 val (opcode, r) = case (size, v, opc) of (0w0, 0w0, 0w0) => ("sturb", "w") | (0w0, 0w0, 0w1) => ("ldurb", "w") | (0w0, 0w0, 0w2) => ("ldursb", "w") | (0w0, 0w0, 0w3) => ("ldursb", "x") | (0w1, 0w0, 0w0) => ("sturh", "w") | (0w1, 0w0, 0w1) => ("ldurh", "w") | (0w1, 0w0, 0w2) => ("ldursh", "w") | (0w1, 0w0, 0w3) => ("ldursh", "x") | (0w2, 0w0, 0w0) => ("stur", "w") | (0w2, 0w0, 0w1) => ("ldur", "w") | (0w2, 0w0, 0w2) => ("ldursw", "x") | (0w3, 0w0, 0w0) => ("stur", "x") | (0w3, 0w0, 0w1) => ("ldur", "x") | (0w2, 0w1, 0w0) => ("stur", "s") | (0w2, 0w1, 0w1) => ("ldur", "s") | (0w3, 0w1, 0w0) => ("stur", "d") | (0w3, 0w1, 0w1) => ("ldur", "d") | _ => ("???", "?") in printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rT); printStream ",["; prXRegOrSP rN; printStream ",#"; printStream imm9Text; printStream "]" end else if (wordValue andb 0wx3b200c00) = 0wx38000400 then (* Load/store immediate post-indexed *) let val size = wordValue >> 0w30 and v = (wordValue >> 0w26) andb 0w1 and opc = (wordValue >> 0w22) andb 0w3 val rT = wordValue andb 0wx1f and rN = (wordValue andb 0wx3e0) >> 0w5 and imm9 = (wordValue andb 0wx1ff000) >> 0w12 val imm9Text = if imm9 > 0wxff then "-" ^ Word32.fmt StringCvt.DEC (0wx200 - imm9) else Word32.fmt StringCvt.DEC imm9 val (opcode, r) = case (size, v, opc) of (0w0, 0w0, 0w0) => ("strb", "w") | (0w0, 0w0, 0w1) => ("ldrb", "w") |
(0w2, 0w0, 0w0) => ("str", "w") | (0w2, 0w0, 0w1) => ("ldr", "w") | (0w3, 0w0, 0w0) => ("str", "x") | (0w3, 0w0, 0w1) => ("ldr", "x") | _ => ("???", "?") in printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rT); printStream ",["; prXRegOrSP rN; printStream "],#"; printStream imm9Text end else if (wordValue andb 0wx3b200c00) = 0wx38000c00 then (* Load/store immediate pre-indexed *) let val size = wordValue >> 0w30 and v = (wordValue >> 0w26) andb 0w1 and opc = (wordValue >> 0w22) andb 0w3 val rT = wordValue andb 0wx1f and rN = (wordValue andb 0wx3e0) >> 0w5 and imm9 = (wordValue andb 0wx1ff000) >> 0w12 val imm9Text = if imm9 > 0wxff then "-" ^ Word32.fmt StringCvt.DEC (0wx200 - imm9) else Word32.fmt StringCvt.DEC imm9 val (opcode, r) = case (size, v, opc) of (0w0, 0w0, 0w0) => ("strb", "w") | (0w0, 0w0, 0w1) => ("ldrb", "w") | (0w2, 0w0, 0w0) => ("str", "w") | (0w2, 0w0, 0w1) => ("ldr", "w") | (0w3, 0w0, 0w0) => ("str", "x") | (0w3, 0w0, 0w1) => ("ldr", "x") | _ => ("???", "?") in printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rT); printStream ",["; prXRegOrSP rN; printStream ",#"; printStream imm9Text; printStream "]!" end else if (wordValue andb 0wx3b200c00) = 0wx38200800 then (* Load/store with register offset i.e. an index register. *) let val size = wordValue >> 0w30 and v = (wordValue >> 0w26) andb 0w1 and opc = (wordValue >> 0w22) andb 0w3 val rT = wordValue andb 0wx1f and rN = (wordValue >> 0w5) andb 0wx1f and rM = (wordValue >> 0w16) andb 0wx1f val option = (wordValue >> 0w13) andb 0w7 val s = (wordValue andb 0wx1000) <> 0w0 val (opcode, r) = case (size, v, opc) of (0w0, 0w0, 0w0) => ("strb", "w") | (0w0, 0w0, 0w1) => ("ldrb", "w") | (0w1, 0w0, 0w0) => ("strh", "w") | (0w1, 0w0, 0w1) => ("ldrh", "w") | (0w2, 0w0, 0w0) => ("str", "w") | (0w2, 0w0, 0w1) => ("ldr", "w") | (0w3, 0w0, 0w0) => ("str", "x") | (0w3, 0w0, 0w1) => ("ldr", "x") | (0w2, 0w1, 0w0) => ("str", "s") | (0w2, 0w1, 0w1) => ("ldr", "s") | (0w3, 0w1, 0w0) => ("str", "d") | (0w3, 0w1, 0w1) => ("ldr", "d") | _ => ("???", "?") val (extend, xr) = case option of 0w2 => (" uxtw", "w") | 0w3 => if s then (" lsl", "x") else ("", "x") | 0w6 => (" sxtw", "w") | 0w7 => (" sxtx", "x") | _ => ("?", "?") val indexShift = case (size, s) of (0w0, true) => " #1" | (0w1, true) => " #1" | (0w2, true) => " #2" | (0w3, true) => " #3" | _ => "" in printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rT); printStream ",["; prXRegOrSP rN; printStream ","; printStream xr; printStream(Word32.fmt StringCvt.DEC rM); printStream extend; printStream indexShift; printStream "]" end else if (wordValue andb 0wx3f000000) = 0wx08000000 then (* Loads and stores with special ordering. 
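               ldar/stlr give acquire/release ordering and ldaxr/stlxr are the
               exclusive (load-linked/store-conditional) pair.  An illustrative
               retry loop, not code produced here:
                   loop:  ldaxr  x1,[x0]         load-exclusive with acquire
                          ...                    compute the new value in x3
                          stlxr  w2,x3,[x0]      w2 is 0 on success
                          cbnz   w2,loop         retry if the store failed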
*) let val size = (wordValue >> 0w30) andb 0w3 and o2 = (wordValue >> 0w23) andb 0w1 and l = (wordValue >> 0w22) andb 0w1 and o1 = (wordValue >> 0w21) andb 0w1 and o0 = (wordValue >> 0w15) andb 0w1 val rT = wordValue andb 0wx1f and rN = (wordValue >> 0w5) andb 0wx1f and rS = (wordValue >> 0w16) andb 0wx1f val (opcode, r) = case (size, o2, l, o1, o0) of (0w3, 0w1, 0w1, 0w0, 0w1) => ("ldar", "x") | (0w3, 0w1, 0w0, 0w0, 0w1) => ("stlr", "x") | (0w2, 0w1, 0w1, 0w0, 0w1) => ("ldar", "w") | (0w2, 0w1, 0w0, 0w0, 0w1) => ("stlr", "w") | (0w3, 0w0, 0w1, 0w0, 0w1) => ("ldaxr", "x") | (0w3, 0w0, 0w0, 0w0, 0w1) => ("stlxr", "x") | (0w0, 0w1, 0w1, 0w0, 0w1) => ("ldarb", "w") | (0w0, 0w1, 0w0, 0w0, 0w1) => ("stlrb", "w") | _ => ("??", "?") in printStream opcode; printStream "\t"; if opcode = "stlxr" then (printStream "w"; printStream(Word32.fmt StringCvt.DEC rS); printStream ",") else (); printStream r; printStream(Word32.fmt StringCvt.DEC rT); printStream ",["; prXRegOrSP rN; printStream "]" end else if (wordValue andb 0wx3a000000) = 0wx28000000 then (* Load/store pairs of registers *) let val opc = (wordValue >> 0w30) andb 0w3 and v = (wordValue >> 0w26) andb 0w1 and op2 = (wordValue >> 0w23) andb 0w3 and l = (wordValue >> 0w22) andb 0w1 and imm7 = (wordValue >> 0w15) andb 0wx7f and rT2 = (wordValue >> 0w10) andb 0wx1f and rN = (wordValue >> 0w5) andb 0wx1f and rT1 = wordValue andb 0wx1f val (opcode, r, scale) = case (opc, v, l) of (0w0, 0w0, 0w0) => ("stp", "w", 0w4) | (0w0, 0w0, 0w1) => ("ldp", "w", 0w4) | (0w2, 0w0, 0w0) => ("stp", "x", 0w8) | (0w2, 0w0, 0w1) => ("ldp", "x", 0w8) | (0w0, 0w1, 0w0) => ("stp", "s", 0w4) | (0w0, 0w1, 0w1) => ("ldp", "s", 0w4) | (0w1, 0w1, 0w0) => ("stp", "d", 0w8) | (0w1, 0w1, 0w1) => ("ldp", "d", 0w8) | _ => ("??", "?", 0w1) val imm7Text = if imm7 > 0wx3f then "-" ^ Word32.fmt StringCvt.DEC ((0wx80 - imm7) * scale) else Word32.fmt StringCvt.DEC (imm7 * scale) in printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rT1); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rT2); printStream ",["; prXRegOrSP rN; case op2 of 0w1 => (* Post indexed *) (printStream "],#"; printStream imm7Text) | 0w2 => (* Offset *) (printStream ",#"; printStream imm7Text; printStream "]") | 0w3 => (* Pre indexed *) (printStream ",#"; printStream imm7Text; printStream "]!") | _ => printStream "??" end else if (wordValue andb 0wx1f800000) = 0wx11000000 then let (* Add/Subtract a 12-bit immediate with possible shift. *) val sf = (wordValue >> 0w31) andb 0w1 val rD = wordValue andb 0wx1f and rN = (wordValue andb 0wx3e0) >> 0w5 and imm12 = (wordValue andb 0wx3ffc00) >> 0w10 and shiftBit = wordValue andb 0wx400000 val imm = if shiftBit <> 0w0 then imm12 << 0w12 else imm12 val oper = (wordValue andb 0wx40000000) = 0w0 val isS = (wordValue andb 0wx20000000) <> 0w0 val prReg = if sf = 0w1 then prXRegOrSP else prWRegOrSP in if imm12 = 0w0 andalso (rN = 0w31 orelse rD = 0w31) andalso not isS then (printStream "mov\t"; prReg rD; printStream ","; prReg rN) else ( if isS andalso rD = 0w31 then printStream(if oper then "cmn\t" else "cmp\t") else ( printStream(if oper then "add" else "sub"); printStream(if isS then "s\t" else "\t"); prReg rD; printStream "," ); prReg rN; printStream ",#"; printStream(Word32.fmt StringCvt.DEC imm) ) end else if (wordValue andb 0wx7fe0ffe0) = 0wx2A0003E0 then (* Move reg,reg. This is a subset of ORR shifted register. 
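               For example (illustrative encoding): mov x1,x2 is orr x1,xzr,x2,
               i.e. 0xAA0203E1 with Rm=2 at bit 16, Rn=31 (xzr) at bit 5 and
               Rd=1 at bit 0.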
*) let val reg = if (wordValue andb 0wx80000000) <> 0w0 then "x" else "w" in printStream "mov\t"; printStream reg; printStream(Word32.fmt StringCvt.DEC(wordValue andb 0wx1f)); printStream ","; printStream reg; printStream(Word32.fmt StringCvt.DEC((wordValue >> 0w16) andb 0wx1f)) end else if (wordValue andb 0wx1f000000) = 0wx0A000000 then let (* Logical operations with shifted register. *) val rD = wordValue andb 0wx1f and rN = (wordValue >> 0w5) andb 0wx1f and rM = (wordValue >> 0w16) andb 0wx1f and imm6 = (wordValue >> 0w10) andb 0wx3f and shiftCode = (wordValue >> 0w22) andb 0wx3 val opc = (wordValue >> 0w29) andb 0wx3 val nBit = (wordValue >> 0w21) andb 0w1 val reg = if (wordValue andb 0wx80000000) <> 0w0 then "x" else "w" val opcode = case (opc, nBit) of (0w0, 0w0) => "and" | (0w1, 0w0) => "orr" | (0w2, 0w0) => "eor" | (0w3, 0w0) => "ands" | _ => "??" in if rD = 0w31 andalso opc=0w3 andalso nBit = 0w0 then printStream "tst\t" else ( printStream opcode; printStream"\t"; printStream reg; printStream(Word32.fmt StringCvt.DEC rD); printStream "," ); printStream reg; printStream(Word32.fmt StringCvt.DEC rN); printStream ","; printStream reg; printStream(Word32.fmt StringCvt.DEC rM); if imm6 <> 0w0 then ( case shiftCode of 0w0 => printStream ",lsl #" | 0w1 => printStream ",lsr #" | 0w2 => printStream ",asr #" | _ => printStream ",?? #"; printStream(Word32.fmt StringCvt.DEC imm6) ) else () end else if (wordValue andb 0wx1f200000) = 0wx0B000000 then let (* Add/subtract shifted register. *) val rD = wordValue andb 0wx1f and rN = (wordValue >> 0w5) andb 0wx1f and rM = (wordValue >> 0w16) andb 0wx1f and imm6 = (wordValue >> 0w10) andb 0wx3f and shiftCode = (wordValue >> 0w22) andb 0wx3 val oper = (wordValue andb 0wx40000000) = 0w0 val isS = (wordValue andb 0wx20000000) <> 0w0 val pReg = if (wordValue andb 0wx80000000) <> 0w0 then prXReg else prWReg in if isS andalso rD = 0w31 then printStream(if oper then "cmn\t" else "cmp\t") else ( printStream(if oper then "add" else "sub"); printStream(if isS then "s\t" else "\t"); pReg rD; printStream "," ); pReg rN; printStream ","; pReg rM; if imm6 <> 0w0 then ( case shiftCode of 0w0 => printStream ",lsl #" | 0w1 => printStream ",lsr #" | 0w2 => printStream ",asr #" | _ => printStream ",?? #"; printStream(Word32.fmt StringCvt.DEC imm6) ) else () end else if (wordValue andb 0wx1fe00000) = 0wx0b200000 then let (* Add/subtract extended register. 
*) val rD = wordValue andb 0wx1f and rN = (wordValue >> 0w5) andb 0wx1f and rM = (wordValue >> 0w16) andb 0wx1f and extend = (wordValue >> 0w13) andb 0w7 and amount = (wordValue >> 0w10) andb 0w7 and sf = (wordValue >> 0w31) andb 0w1 and p = (wordValue >> 0w30) andb 0w1 and s = (wordValue >> 0w29) andb 0w1 in if s = 0w1 andalso rD = 0w31 then printStream(if p = 0w0 then "cmn\t" else "cmp\t") else ( printStream(if p = 0w0 then "add" else "sub"); printStream(if s = 0w1 then "s\t" else "\t"); (if sf = 0w1 then prXRegOrSP else prWRegOrSP) rD; printStream "," ); (if sf = 0w1 then prXRegOrSP else prWRegOrSP) rN; printStream ","; (if extend = 0w3 orelse extend = 0w7 then prXReg else prWReg) rM; case extend of 0w0 => printStream ",uxtb" | 0w1 => printStream ",uxth" | 0w2 => if amount = 0w0 andalso sf = 0w0 then () else printStream ",uxtw" | 0w3 => if amount = 0w0 andalso sf = 0w1 then () else printStream ",uxtx" | 0w4 => printStream ",sxtb" | 0w5 => printStream ",sxth" | 0w6 => printStream ",sxtw" | 0w7 => printStream ",sxtx" | _ => printStream "?"; if amount <> 0w0 then printStream(" #" ^ Word32.fmt StringCvt.DEC amount) else () end else if (wordValue andb 0wx3b000000) = 0wx18000000 then let (* Load from a PC-relative address. This may refer to the address constant area or the non-address constant area. *) val rT = wordValue andb 0wx1f val opc = (wordValue >> 0w30) andb 0w3 val v = (wordValue >> 0w26) andb 0w1 (* The offset is in 32-bit words *) val byteAddr = word32ToWord(((wordValue andb 0wx00ffffe0) >> (0w5-0w2))) + byteNo (* We must NOT use codeVecGetWord if this is in the non-address area. It may well not be a tagged value. *) local fun getConstant(cVal, 0w0) = cVal | getConstant(cVal, offset) = let val byteVal = Word64.fromLarge(Word8.toLarge(codeVecGet (codeVec, byteAddr+offset-0w1))) in getConstant(Word64.orb(Word64.<<(cVal, 0w8), byteVal), offset-0w1) end in val constantValue = "0x" ^ Word64.toString(getConstant(0w0, 0w8)) (* It's a non-address constant *) end val reg = case (opc, v) of (0w0, 0w0) => "w" | (0w1, 0w0) => "x" | (0w0, 0w1) => "s" | (0w1, 0w1) => "d" | _ => "?" in printStream "ldr\t"; printStream reg; printStream(Word32.fmt StringCvt.DEC rT); printStream ",0x"; printStream(Word.fmt StringCvt.HEX byteAddr); printStream "\t// "; printStream constantValue end else if (wordValue andb 0wxbf000000) = 0wx10000000 then let (* Put a pc-relative address into a register. *) val rT = wordValue andb 0wx1f val byteOffset = ((wordValue andb 0wx00ffffe0) << (Word.fromInt Word32.wordSize - 0w24) ~>> (Word.fromInt Word32.wordSize - 0w21)) + ((wordValue >> 0w29) andb 0w3) in printStream "adr\tx"; printStream(Word32.fmt StringCvt.DEC rT); printStream ",0x"; printStream(Word32.fmt StringCvt.HEX (wordToWord32 byteNo+byteOffset)) end else if (wordValue andb 0wx9f000000) = 0wx90000000 then (* ADRP *) let val rT = wordValue andb 0wx1f (* The value is a page offset *) val pageOffset = ((wordValue >> 0w29) andb 0w3) (* immlo *) orb ((wordValue >> 0w3) andb 0wx1ffffc) in printStream "adrp\tx"; printStream(Word32.fmt StringCvt.DEC rT); printStream ",0x"; printStream(Word32.fmt StringCvt.HEX (pageOffset*0w4096)) end else if (wordValue andb 0wx7c000000) = 0wx14000000 then (* Unconditional branch. *) let (* The offset is signed and the destination may be earlier.
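               The imm26 field is sign-extended by shifting it up until its top
               bit reaches bit 31 and then arithmetic-shifting down, which also
               multiplies by the 4-byte instruction size.  Worked example with
               an assumed field value: 0x03ffffff << 6 = 0xffffffc0, then
               ~>> 4 = 0xfffffffc, a byte offset of -4, i.e. b .-4 .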
*) val byteOffset = (wordValue andb 0wx03ffffff) << (Word.fromInt Word32.wordSize - 0w26) ~>> (Word.fromInt Word32.wordSize - 0w28) val opc = if (wordValue andb 0wx80000000) = 0w0 then "b" else "bl" in printStream opc; printStream "\t0x"; printStream(Word32.fmt StringCvt.HEX (wordToWord32 byteNo + byteOffset)) end else if (wordValue andb 0wxff000000) = 0wx54000000 then (* Conditional branch *) let val byteOffset = (wordValue andb 0wx00ffffe0) << (Word.fromInt Word32.wordSize - 0w24) ~>> (Word.fromInt Word32.wordSize - 0w21) in printStream "b."; printCondition(wordValue andb 0wxf); printStream "\t0x"; printStream(Word32.fmt StringCvt.HEX (wordToWord32 byteNo+byteOffset)) end else if (wordValue andb 0wx7e000000) = 0wx34000000 then (* Compare and branch *) let val byteOffset = (wordValue andb 0wx00ffffe0) << (Word.fromInt Word32.wordSize - 0w24) ~>> (Word.fromInt Word32.wordSize - 0w21) val oper = if (wordValue andb 0wx01000000) = 0w0 then "cbz" else "cbnz" val r = if (wordValue andb 0wx80000000) = 0w0 then "w" else "x" in printStream oper; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC (wordValue andb 0wx1f)); printStream ",0x"; printStream(Word32.fmt StringCvt.HEX (wordToWord32 byteNo+byteOffset)) end else if (wordValue andb 0wx7e000000) = 0wx36000000 then (* Test bit and branch *) let val byteOffset = (wordValue andb 0wx000fffe0) << (Word.fromInt Word32.wordSize - 0w20) ~>> (Word.fromInt Word32.wordSize - 0w17) val oper = if (wordValue andb 0wx01000000) = 0w0 then "tbz" else "tbnz" val b40 = (wordValue >> 0w19) andb 0wx1f val bitNo = b40 orb ((wordValue >> 0w26) andb 0wx20) val r = if bitNo < 0w32 then "w" else "x" in printStream oper; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC (wordValue andb 0wx1f)); printStream ",#"; printStream(Word32.fmt StringCvt.DEC bitNo); printStream ",0x"; printStream(Word32.fmt StringCvt.HEX (wordToWord32 byteNo+byteOffset)) end else if (wordValue andb 0wx3fe00000) = 0wx1A800000 then let val sf = wordValue >> 0w31 val opc = (wordValue >> 0w30) andb 0w1 val op2 = (wordValue >> 0w10) andb 0w3 val rT = wordValue andb 0wx1f val rN = (wordValue >> 0w5) andb 0wx1f val rM = (wordValue >> 0w16) andb 0wx1f val cond = (wordValue >> 0w12) andb 0wxf val opcode = case (opc, op2) of (0w0, 0w0) => "csel" | (0w0, 0w1) => "csinc" | (0w1, 0w0) => "csinv" | (0w1, 0w1) => "csneg" | _ => "??" val r = if sf = 0w0 then "w" else "x" in printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rT); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rN); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rM); printStream ","; printCondition cond end else if (wordValue andb 0wx7f800000) = 0wx13000000 then (* signed bitfield *) let val sf = wordValue >> 0w31 (* N is always the same as sf. 
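               For example (illustrative): asr x0,x1,#3 is the alias of
               sbfm x0,x1,#3,#63 and is recognised here because
               imms = 63 = wordSize-1; the immediates are printed in hex.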
*) (*val nBit = (wordValue >> 0w22) andb 0w1*) val immr = (wordValue >> 0w16) andb 0wx3f val imms = (wordValue >> 0w10) andb 0wx3f val rN = (wordValue >> 0w5) andb 0wx1f val rD = wordValue andb 0wx1f val (r, wordSize) = if sf = 0w0 then ("w", 0w32) else if sf = 0w1 then ("x", 0w64) else raise InternalError "Neither" in if imms = wordSize - 0w1 then printStream "asr\t" else printStream "sbfm\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rD); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rN); if imms = wordSize - 0w1 then (printStream ",#0x"; printStream(Word32.toString immr)) else ( printStream ",#0x"; printStream(Word32.toString immr); printStream ",#0x"; printStream(Word32.toString imms) ) end else if (wordValue andb 0wx7f800000) = 0wx33000000 then (* bitfield move *) let val sf = wordValue >> 0w31 (* N is always the same as sf. *) (*val nBit = (wordValue >> 0w22) andb 0w1*) val immr = (wordValue >> 0w16) andb 0wx3f val imms = (wordValue >> 0w10) andb 0wx3f val rN = (wordValue >> 0w5) andb 0wx1f val rD = wordValue andb 0wx1f val (r, wordSize) = if sf = 0w0 then ("w", 0w32) else ("x", 0w64) in (* bfc is bfm with the zero register as the source. *) if imms < immr then if rN = 0w31 then printStream "bfc\t" else printStream "bfi\t" else printStream "bfxil\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rD); if imms >= immr orelse rN <> 0w31 then ( printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rN) ) else (); (* Not certain that these are correct. *) if imms < immr then ( printStream ",#0x"; printStream(Word32.toString(wordSize - immr)); printStream ",#0x"; printStream(Word32.toString(imms+0w1)) ) else ( printStream ",#0x"; printStream(Word32.toString immr); printStream ",#0x"; printStream(Word32.toString(imms+0w1-immr)) ) end else if (wordValue andb 0wx7f800000) = 0wx53000000 then (* unsigned bitfield move *) let val sf = wordValue >> 0w31 (* N is always the same as sf.
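               The aliases are recognised from the immediates, e.g.
               (illustrative): lsl x0,x1,#8 is ubfm x0,x1,#56,#55, matching
               imms+1 = immr, while lsr x0,x1,#8 is ubfm x0,x1,#8,#63,
               matching imms = wordSize-1.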
*) (*val nBit = (wordValue >> 0w22) andb 0w1*) val immr = (wordValue >> 0w16) andb 0wx3f val imms = (wordValue >> 0w10) andb 0wx3f val rN = (wordValue >> 0w5) andb 0wx1f val rD = wordValue andb 0wx1f val (r, wordSize) = if sf = 0w0 then ("w", 0w32) else ("x", 0w64) in if imms + 0w1 = immr then ( printStream "lsl\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rD); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rN); printStream ",#0x"; printStream(Word32.toString(wordSize - immr)) ) else if imms = wordSize - 0w1 then ( printStream "lsr\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rD); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rN); printStream ",#0x"; printStream(Word32.toString immr) ) else if imms < immr then ( printStream "ubfiz\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rD); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rN); printStream ",#0x"; printStream(Word32.toString(wordSize - immr)); printStream ",#0x"; printStream(Word32.toString(imms+0w1)) ) else ( printStream "ubfm\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rD); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rN); printStream ",#0x"; printStream(Word32.toString immr); printStream ",#0x"; printStream(Word32.toString imms) ) end else if (wordValue andb 0wx1f800000) = 0wx12000000 then (* logical immediate *) let val sf = wordValue >> 0w31 val opc = (wordValue >> 0w29) andb 0w3 val nBit = (wordValue >> 0w22) andb 0w1 val immr = (wordValue >> 0w16) andb 0wx3f val imms = (wordValue >> 0w10) andb 0wx3f val rN = (wordValue >> 0w5) andb 0wx1f val rD = wordValue andb 0wx1f val (opcode, r) = case (sf, opc, nBit) of (0w0, 0w0, 0w0) => ("and", "w") | (0w0, 0w1, 0w0) => ("orr", "w") | (0w0, 0w2, 0w0) => ("eor", "w") | (0w0, 0w3, 0w0) => ("ands", "w") | (0w1, 0w0, _) => ("and", "x") | (0w1, 0w1, _) => ("orr", "x") | (0w1, 0w2, _) => ("eor", "x") | (0w1, 0w3, _) => ("ands", "x") | _ => ("??", "?") in if rD = 0w31 andalso opc=0w3 then printStream "tst\t" else ( printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rD); printStream "," ); printStream r; printStream(Word32.fmt StringCvt.DEC rN); printStream ",#0x"; printStream(Word64.toString(decodeBitPattern{sf=sf, n=nBit, immr=immr, imms=imms})) end else if (wordValue andb 0wx5fe00000) = 0wx1ac00000 then (* Two source operations - shifts and divide. *) let val sf = wordValue >> 0w31 val s = (wordValue >> 0w29) andb 0w1 val rM = (wordValue >> 0w16) andb 0wx1f val opcode = (wordValue >> 0w10) andb 0wx3f val rN = (wordValue >> 0w5) andb 0wx1f val rD = wordValue andb 0wx1f val (oper, r) = case (sf, s, opcode) of (0w1, 0w0, 0wx2) => ("udiv", "x") | (0w1, 0w0, 0wx3) => ("sdiv", "x") | (0w0, 0w0, 0wx2) => ("udiv", "w") | (0w0, 0w0, 0wx3) => ("sdiv", "w") | (0w1, 0w0, 0wx8) => ("lsl", "x") | (0w0, 0w0, 0wx8) => ("lsl", "w") | (0w1, 0w0, 0wx9) => ("lsr", "x") | (0w0, 0w0, 0wx9) => ("lsr", "w") | (0w1, 0w0, 0wxa) => ("asr", "x") | (0w0, 0w0, 0wxa) => ("asr", "w") | _ => ("??", "?") in printStream oper; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rD); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rN); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rM) end else if (wordValue andb 0wx1f000000) = 0wx1b000000 then (* Three source operations - multiply add/subtract. 
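               When rA is 31 (the zero register) the simpler aliases are used,
               e.g. (illustrative encoding): 0x9B027C20 is madd x0,x1,x2,xzr
               and prints as mul x0,x1,x2.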
*) let val sf = wordValue >> 0w31 val op54 = (wordValue >> 0w29) andb 0w3 val op31 = (wordValue >> 0w21) andb 0w7 val o0 = (wordValue >> 0w15) andb 0w1 val rM = (wordValue >> 0w16) andb 0wx1f val rA = (wordValue >> 0w10) andb 0wx1f val rN = (wordValue >> 0w5) andb 0wx1f val rD = wordValue andb 0wx1f val (oper, r1, r2) = case (sf, op54, op31, o0, rA) of (0w1, 0w0, 0w0, 0w0, 0w31) => ("mul", "x", "x") | (0w1, 0w0, 0w0, 0w0, _) => ("madd", "x", "x") | (0w1, 0w0, 0w0, 0w1, 0w31) => ("mneg", "x", "x") | (0w1, 0w0, 0w0, 0w1, _) => ("msub", "x", "x") | (0w0, 0w0, 0w0, 0w0, _) => ("madd", "w", "w") | (0w0, 0w0, 0w0, 0w1, _) => ("msub", "w", "w") | (0w1, 0w0, 0w2, 0w0, 0w31) => ("smulh", "x", "x") | (0w1, 0w0, 0w1, 0w0, 0w31) => ("smull", "x", "w") | (0w1, 0w0, 0w1, 0w0, _) => ("smaddl", "x", "w") | (0w1, 0w0, 0w1, 0w1, _) => ("smsubl", "x", "w") | _ => ("??", "?", "?") in printStream oper; printStream "\t"; printStream r1; printStream(Word32.fmt StringCvt.DEC rD); printStream ","; printStream r2; printStream(Word32.fmt StringCvt.DEC rN); printStream ","; printStream r2; printStream(Word32.fmt StringCvt.DEC rM); if rA = 0w31 then () else (printStream ","; printStream r1; printStream(Word32.fmt StringCvt.DEC rA)) end else if (wordValue andb 0wx7f20fc00) = 0wx1E200000 then (* Moves between floating point and general regs. *) let val sf = (wordValue >> 0w31) andb 0w1 and s = (wordValue >> 0w29) andb 0w1 and ptype = (wordValue >> 0w22) andb 0w3 and mode = (wordValue >> 0w19) andb 0w3 and opcode = (wordValue >> 0w16) andb 0w7 and rN = (wordValue >> 0w5) andb 0wx1f and rD = wordValue andb 0wx1f val (opc, dr, nr) = case (sf, s, ptype, mode, opcode) of (0w0, 0w0, 0w0, 0w0, 0w7) => ("fmov", "s", "w") (* w -> s *) | (0w0, 0w0, 0w0, 0w0, 0w6) => ("fmov", "w", "s") (* s -> w *) | (0w1, 0w0, 0w1, 0w0, 0w7) => ("fmov", "d", "x") (* d -> x *) | (0w1, 0w0, 0w1, 0w0, 0w6) => ("fmov", "x", "d") (* x -> d *) | (0w0, 0w0, 0w0, 0w0, 0w2) => ("scvtf", "s", "w") (* w -> s *) | (0w0, 0w0, 0w1, 0w0, 0w2) => ("scvtf", "d", "w") (* w -> d *) | (0w1, 0w0, 0w0, 0w0, 0w2) => ("scvtf", "s", "x") (* x -> s *) | (0w1, 0w0, 0w1, 0w0, 0w2) => ("scvtf", "d", "x") (* x -> d *) | (0w0, 0w0, 0w0, 0w0, 0w4) => ("fcvtas", "w", "s") (* s -> w *) | (0w0, 0w0, 0w0, 0w2, 0w0) => ("fcvtms", "w", "s") (* s -> w *) | (0w0, 0w0, 0w0, 0w1, 0w0) => ("fcvtps", "w", "s") (* s -> w *) | (0w0, 0w0, 0w0, 0w3, 0w0) => ("fcvtzs", "w", "s") (* s -> w *) | (0w0, 0w0, 0w1, 0w0, 0w4) => ("fcvtas", "w", "d") (* d -> w *) | (0w0, 0w0, 0w1, 0w2, 0w0) => ("fcvtms", "w", "d") (* d -> w *) | (0w0, 0w0, 0w1, 0w1, 0w0) => ("fcvtps", "w", "d") (* d -> w *) | (0w0, 0w0, 0w1, 0w3, 0w0) => ("fcvtzs", "w", "d") (* d -> w *) | (0w1, 0w0, 0w0, 0w0, 0w4) => ("fcvtas", "x", "s") (* s -> x *) | (0w1, 0w0, 0w0, 0w2, 0w0) => ("fcvtms", "x", "s") (* s -> x *) | (0w1, 0w0, 0w0, 0w1, 0w0) => ("fcvtps", "x", "s") (* s -> x *) | (0w1, 0w0, 0w0, 0w3, 0w0) => ("fcvtzs", "x", "s") (* s -> x *) | (0w1, 0w0, 0w1, 0w0, 0w4) => ("fcvtas", "x", "d") (* d -> x *) | (0w1, 0w0, 0w1, 0w2, 0w0) => ("fcvtms", "x", "d") (* d -> x *) | (0w1, 0w0, 0w1, 0w1, 0w0) => ("fcvtps", "x", "d") (* d -> x *) | (0w1, 0w0, 0w1, 0w3, 0w0) => ("fcvtzs", "x", "d") (* d -> x *) | _ => ("?", "?", "?") in printStream opc; printStream "\t"; printStream dr; printStream(Word32.fmt StringCvt.DEC rD); printStream ","; printStream nr; printStream(Word32.fmt StringCvt.DEC rN) end else if (wordValue andb 0wxff200c00) = 0wx1E200800 then (* Floating point two source operations.
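               For example (illustrative encoding): 0x1E622820 has ptype=01
               (double) and opc=2 and prints as fadd d0,d1,d2.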
*) let val pt = (wordValue >> 0w22) andb 0w3 and rM = (wordValue >> 0w16) andb 0wx1f and opc = (wordValue >> 0w12) andb 0wxf and rN = (wordValue >> 0w5) andb 0wx1f and rT = wordValue andb 0wx1f val (opcode, r) = case (pt, opc) of (0w0, 0wx0) => ("fmul", "s") | (0w0, 0wx1) => ("fdiv", "s") | (0w0, 0wx2) => ("fadd", "s") | (0w0, 0wx3) => ("fsub", "s") | (0w1, 0wx0) => ("fmul", "d") | (0w1, 0wx1) => ("fdiv", "d") | (0w1, 0wx2) => ("fadd", "d") | (0w1, 0wx3) => ("fsub", "d") | _ => ("??", "?") in printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rT); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rN); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rM) end else if (wordValue andb 0wxff207c00) = 0wx1E204000 then (* Floating point single source. *) let val pt = (wordValue >> 0w22) andb 0w3 and opc = (wordValue >> 0w15) andb 0wx3f and rN = (wordValue >> 0w5) andb 0wx1f and rT = wordValue andb 0wx1f val (opcode, rS, rD) = case (pt, opc) of (0w0, 0wx0) => ("fmov", "s", "s") | (0w0, 0wx1) => ("fabs", "s", "s") | (0w0, 0wx2) => ("fneg", "s", "s") | (0w0, 0wx5) => ("fcvt", "s", "d") | (0w1, 0wx0) => ("fmov", "d", "d") | (0w1, 0wx1) => ("fabs", "d", "d") | (0w1, 0wx2) => ("fneg", "d", "d") | (0w1, 0wx4) => ("fcvt", "d", "s") | _ => ("??", "?", "?") in printStream opcode; printStream "\t"; printStream rD; printStream(Word32.fmt StringCvt.DEC rT); printStream ","; printStream rS; printStream(Word32.fmt StringCvt.DEC rN) end else if (wordValue andb 0wxff20fc07) = 0wx1E202000 then (* Floating point comparison *) let val pt = (wordValue >> 0w22) andb 0w3 and rM = (wordValue >> 0w16) andb 0wx1f and rN = (wordValue >> 0w5) andb 0wx1f and opc = (wordValue >> 0w3) andb 0w3 val (opcode, r) = case (pt, opc) of (0w0, 0wx0) => ("fcmp", "s") | (0w1, 0wx0) => ("fcmp", "d") | (0w0, 0wx2) => ("fcmpe", "s") | (0w1, 0wx2) => ("fcmpe", "d") | _ => ("??", "?") in printStream opcode; printStream "\t"; printStream r; printStream(Word32.fmt StringCvt.DEC rN); printStream ","; printStream r; printStream(Word32.fmt StringCvt.DEC rM) end else if (wordValue andb 0wxffffffe0) = 0wx2F00E400 then (* movi dn,#0 *) let val rD = wordValue andb 0wx1f in printStream "movi\td"; printStream(Word32.fmt StringCvt.DEC rD); printStream ",#0" end else if (wordValue andb 0wx1e000000) = 0wx02000000 then (* This is an unallocated range. We use it for the register mask. *) let fun printMask (0w25, _) = () | printMask (i, comma) = if ((0w1 << i) andb wordValue) <> 0w0 then ( if comma then printStream ", " else (); printStream "x"; printStream(Word.fmt StringCvt.DEC i); printMask(i+0w1, true) ) else printMask(i+0w1, comma) in printStream "["; printMask(0w0, false); printStream "]" end else printStream "?" ; printStream "\n" end fun printAll i = if i = numInstructions then () else (printWordAt i; printAll(i+0w1)) in printStream functionName; printStream ":\n"; printAll 0w0 end (* Set the offsets of ADRP+LDR and ADRP+ADD instruction pairs. The values in these instructions are, to some extent, absolute addresses so this needs to be done by the RTS. firstNonAddrConst and firstAddrConst are the offsets in bytes. 
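               Each extended literal load was emitted as an adrp followed by a
               load or add, on the pattern (a sketch only; C is a hypothetical
               constant and Xn/Xt stand for the registers actually used):
                   adrp  Xn,page(C)
                   ldr   Xt,[Xn,#pageoff(C)]
               codeVecPutConstant fills in the page number and page offset
               once the final layout is known.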
*) fun setADRPAddresses(ops, codeVec, firstNonAddrConst, firstAddrConst, addrConstMap, nonAddrConstMap) = let fun setADRPAddrs([], _ , _, _) = () | setADRPAddrs(LoadAddressLiteral{length=ref BrExtended, ...} :: tail, wordNo, aConstNum, nonAConstNum) = let (* Address constants are 32-bits in 32-in-64 and 64-bits in native 64-bits *) val constPos = Array.sub(addrConstMap, aConstNum) val addrOfConstant (* byte offset *) = firstAddrConst + constPos * Address.wordSize in codeVecPutConstant (codeVec, wordNo * 0w4, toMachineWord addrOfConstant, if is32in64 then ConstArm64AdrpLdr32 else ConstArm64AdrpLdr64); setADRPAddrs(tail, wordNo+0w2, aConstNum+1, nonAConstNum) end | setADRPAddrs(LoadNonAddressLiteral{length=ref BrExtended, ...} :: tail, wordNo, aConstNum, nonAConstNum) = let (* The offset is in 32-bit words. These are always 64-bits. *) val constPos = Array.sub(nonAddrConstMap, nonAConstNum) val offsetOfConstant (* byte offset *) = firstNonAddrConst+constPos*0w8 in codeVecPutConstant (codeVec, wordNo * 0w4, toMachineWord offsetOfConstant, ConstArm64AdrpLdr64); setADRPAddrs(tail, wordNo+0w2, aConstNum, nonAConstNum+1) end | setADRPAddrs(LoadFPLiteral{length=ref BrExtended, isDouble, ...} :: tail, wordNo, aConstNum, nonAConstNum) = let (* The offset is in 32-bit words and the constants themselves are always 64-bits. If we're loading a 32-bit float we have to use 32-bit offsets. *) val constPos = Array.sub(nonAddrConstMap, nonAConstNum) val offsetOfConstant (* byte offset *) = firstNonAddrConst+constPos*0w8 in codeVecPutConstant (codeVec, wordNo * 0w4, toMachineWord offsetOfConstant, if isDouble then ConstArm64AdrpLdr64 else ConstArm64AdrpLdr32); setADRPAddrs(tail, wordNo+0w2, aConstNum, nonAConstNum+1) end | setADRPAddrs(LoadLabelAddress{label=ref labs, length=ref BrExtended, ...} :: tail, wordNo, aConstNum, nonAConstNum) = let val dest = !(hd labs) * 0w4 in codeVecPutConstant (codeVec, wordNo * 0w4, toMachineWord dest, ConstArm64AdrpAdd); setADRPAddrs(tail, wordNo+0w2, aConstNum, nonAConstNum) end | setADRPAddrs(instr :: tail, wordNo, aConstNum, nonAConstNum) = setADRPAddrs(tail, wordNo+Word.fromInt(codeSize instr), aConstNum, nonAConstNum) in setADRPAddrs (ops, 0w0, 0, 0) end (* Although this is used locally it must be defined at the top level otherwise a new RTS function will be compiled every time the containing function is called *) val sortFunction: (machineWord * int) array -> bool = RunCall.rtsCallFast1 "PolySortArrayOfAddresses" (* Adds the constants onto the code, and copies the code into a new segment *) fun generateCode {instrs, name=functionName, parameters, resultClosure, profileObject} = let val printStream = Pretty.getSimplePrinter(parameters, []) and printAssemblyCode = Debug.getParameter Debug.assemblyCodeTag parameters local (* Extract the constants. *) fun getConsts(LoadAddressLiteral {value, ...}, (addrs, nonAddrs, addrCount, nonAddrCount)) = ((value, addrCount)::addrs, nonAddrs, addrCount+1, nonAddrCount) | getConsts(LoadNonAddressLiteral {value, ...}, (addrs, nonAddrs, addrCount, nonAddrCount)) = (addrs, (value, nonAddrCount)::nonAddrs, addrCount, nonAddrCount+1) | getConsts(LoadFPLiteral {value, isDouble, ...}, (addrs, nonAddrs, addrCount, nonAddrCount)) = let (* When loading a float we will only access the first 32-bits so if this is big-endian we have to shift the value so it's there. 
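               For example (assumed value): 1.0 as a 32-bit float is
               0x3f800000, so on a big-endian machine the 64-bit constant word
               must hold 0x3f80000000000000 for a 32-bit load from the start
               of the word to pick up the float.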
*) val shifted = if not isDouble andalso isBigEndian then LargeWord.<<(value, 0w32) else value in (addrs, (shifted, nonAddrCount)::nonAddrs, addrCount, nonAddrCount+1) end | getConsts(_, consts) = consts val (addressConstants, nonAddressConstants, addrConstCount, nonAddrConstCount) = List.foldl getConsts ([], [], 0, 0) instrs (* Sort the non-address constants to remove duplicates. There don't seem to be many in practice. Since we're not actually interested in the order but only sorting to remove duplicates we can use a stripped-down Quicksort. *) fun sort([], out) = out | sort((median, addr) :: tl, out) = partition(median, tl, [addr], [], [], out) and partition(median, [], addrs, less, greater, out) = sort(less, sort(greater, (median, addrs) :: out)) | partition(median, (entry as (value, addr)) :: tl, addrs, less, greater, out) = if value = median then partition(median, tl, addr::addrs, less, greater, out) else if value < median then partition(median, tl, addrs, entry :: less, greater, out) else partition(median, tl, addrs, less, entry :: greater, out) (* Address constants. We can't use any ordering on them because a GC could move them, and so change the values, half way through the sort. Instead we use a simple search for a small number of constants and use an RTS call for larger numbers. We want to avoid quadratic cost when there are large numbers. *) val sortedConstants = if List.length addressConstants < 10 then let fun findDups([], out) = out | findDups((value, addr) :: tl, out) = let fun partition(e as (v, a), (eq, neq)) = if PolyML.pointerEq(value, v) then (a :: eq, neq) else (eq, e :: neq) val (eqAddr, neq) = List.foldl partition ([addr], []) tl in findDups(neq, (value, eqAddr) :: out) end in findDups(addressConstants, []) end else let val arrayToSort = Array.fromList addressConstants val _ = sortFunction arrayToSort fun makeList((v, a), []) = [(v, [a])] | makeList((v, a), l as (vv, aa) :: tl) = if PolyML.pointerEq(v, vv) then (vv, a :: aa) :: tl else (v, [a]) :: l in (Array.foldl makeList [] arrayToSort) end in val addressConsts = sortedConstants and nonAddressConsts = sort(nonAddressConstants, []) : (Word64.word * int list) list and addrConstCount = addrConstCount and nonAddrConstCount = nonAddrConstCount end (* Create maps that indicate for each constant where it is in the constant area. *) val addrConstMap = Array.array(addrConstCount, 0w0) and nonAddrConstMap = Array.array(nonAddrConstCount, 0w0) val _ = List.foldl(fn ((_, cnums), n) => (List.app(fn i => Array.update(addrConstMap, i, n)) cnums; n+0w1)) 0w0 addressConsts val _ = List.foldl(fn ((_, cnums), n) => (List.app(fn i => Array.update(nonAddrConstMap, i, n)) cnums; n+0w1)) 0w0 nonAddressConsts (* Generate the code and set the constant addresses at the same time. *) val (byteVec, nativeWordsOfCode) = genCode(instrs, List.map #1 addressConsts, List.map #1 nonAddressConsts, addrConstMap, nonAddrConstMap) val wordsOfCode = nativeWordsOfCode * wordsPerNativeWord (* +4 for the constants count, function name, profile count and the trailing offset word. *) val numOfConst = List.length addressConsts val segSize = wordsOfCode + Word.fromInt numOfConst + 0w4 val firstConstant = wordsOfCode + 0w3 (* Add 3 for no of consts, fn name and profile count. *) (* Put in the number of constants. This must go in before we actually put in any constants. *) local val lastWord = segSize - 0w1 in val () = setWord(LargeWord.fromInt(numOfConst + 2), wordsOfCode, byteVec) (* Set the last word of the code to the (negative) byte offset of the start of the code area from the end of this word.
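               Worked example with assumed values: on native 64-bit with two
               address constants the value stored is (2+3) * ~8 = ~40 bytes.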
*) val () = setWord(LargeWord.fromInt(numOfConst + 3) * ~(Word.toLarge Address.wordSize), lastWord, byteVec) end (* Now we've filled in all the size info we need to convert the segment into a proper code segment before it's safe to put in any ML values. *) val codeVec = byteVecToCodeVec(byteVec, resultClosure) local val name : string = functionName val nameWord : machineWord = toMachineWord name in val () = codeVecPutWord (codeVec, wordsOfCode+0w1, nameWord) end (* Profile ref. A byte ref used by the profiler in the RTS. *) val () = codeVecPutWord (codeVec, wordsOfCode+0w2, profileObject) (* and then copy the constants from the constant list. *) local fun setConstant((value, _), num) = ( codeVecPutWord (codeVec, firstConstant + num, value); num+0w1 ) in val _ = List.foldl setConstant 0w0 addressConsts end val () = setADRPAddresses(instrs, codeVec, (nativeWordsOfCode-Word.fromInt(List.length nonAddressConsts)) * Address.nativeWordSize, firstConstant * Address.wordSize, addrConstMap, nonAddrConstMap) in if printAssemblyCode then (* print out the code *) (printCode (codeVec, functionName, wordsOfCode, printStream); printStream"\n") else (); codeVecLock(codeVec, resultClosure) end (* copyCode *) structure Sharing = struct type closureRef = closureRef type instr = instr type xReg = xReg type vReg = vReg type labels = labels type condition = condition type shiftType = shiftType type wordSize = wordSize type 'a extend = 'a extend type scale = scale end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64CodetreeToICode.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64CodetreeToICode.ML index 4f2f834c..fa961bfc 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64CodetreeToICode.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64CodetreeToICode.ML @@ -1,3290 +1,3284 @@ (* Copyright David C. J. Matthews 2021-2 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64CodetreeToICode( structure BackendTree: BACKENDINTERMEDIATECODE structure Arm64ICode: ARM64ICODE structure Debug: DEBUG structure Arm64Foreign: FOREIGNCALL structure ICodeTransform: ARM64ICODETRANSFORM structure CodeArray: CODEARRAY structure Pretty:PRETTY sharing Arm64ICode.Sharing = ICodeTransform.Sharing = CodeArray.Sharing = BackendTree.Sharing ): GENCODE = struct open BackendTree open Address open Arm64ICode open CodeArray open BuiltIns + + val useLSEAtomics = true (* Use 8.1 atomics? *) (* Reversed cons and append to make the code easier to read. 
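               For example: tailCode <::> instrA <::> instrB evaluates to
               instrB :: instrA :: tailCode, so the instructions appear in
               execution order in the source while the list is built in
               reverse; similarly acc <@> [i1, i2] is [i1, i2] @ acc.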
*) infix 5 <::> <@> fun tl <::> hd = hd :: tl and snd <@> fst = fst @ snd type iCodeAbstract = (preg, pregOrZero, preg) arm64ICode and basicBlockAbstract = (preg, pregOrZero, preg) basicBlock exception InternalError = Misc.InternalError fun taggedWord64 w: Word64.word = w * 0w2 + 0w1 and taggedWord w: word = w * 0w2 + 0w1 datatype blockStruct = BlockSimple of iCodeAbstract | BlockExit of iCodeAbstract | BlockLabel of int | BlockFlow of controlFlow | BlockBegin of { regArgs: (preg * xReg) list, stackArgs: stackLocn list } | BlockRaiseAndHandle of iCodeAbstract * int | BlockOptionalHandle of {call: iCodeAbstract, handler: int, label: int } val moveRegister = BlockSimple o MoveRegister and loadNonAddressConstant = BlockSimple o LoadNonAddressConstant and loadAddressConstant = BlockSimple o LoadAddressConstant and loadWithConstantOffset = BlockSimple o LoadWithConstantOffset and loadFPWithConstantOffset = BlockSimple o LoadFPWithConstantOffset and loadWithIndexedOffset = BlockSimple o LoadWithIndexedOffset and loadFPWithIndexedOffset = BlockSimple o LoadFPWithIndexedOffset and getThreadId = BlockSimple o GetThreadId and objectIndexAddressToAbsolute = BlockSimple o ObjectIndexAddressToAbsolute and absoluteToObjectIndex = BlockSimple o AbsoluteToObjectIndex and allocateMemoryFixed = BlockSimple o AllocateMemoryFixed and allocateMemoryVariable = BlockSimple o AllocateMemoryVariable and initialiseMem = BlockSimple o InitialiseMem and storeWithConstantOffset = BlockSimple o StoreWithConstantOffset and storeFPWithConstantOffset = BlockSimple o StoreFPWithConstantOffset and storeWithIndexedOffset = BlockSimple o StoreWithIndexedOffset and storeFPWithIndexedOffset = BlockSimple o StoreFPWithIndexedOffset and addSubImmediate = BlockSimple o AddSubImmediate and addSubRegister = BlockSimple o AddSubRegister and logicalImmediate = BlockSimple o LogicalImmediate and logicalRegister = BlockSimple o LogicalRegister and shiftRegister = BlockSimple o ShiftRegister and multiplication = BlockSimple o Multiplication and division = BlockSimple o Division and pushToStack = BlockSimple o PushToStack and loadStack = BlockSimple o LoadStack and storeToStack = BlockSimple o StoreToStack and containerAddress = BlockSimple o ContainerAddress and resetStackPtr = BlockSimple o ResetStackPtr and tagValue = BlockSimple o TagValue and untagValue = BlockSimple o UntagValue and boxLarge = BlockSimple o BoxLarge and unboxLarge = BlockSimple o UnboxLarge and boxTagFloat = BlockSimple o BoxTagFloat and unboxTagFloat = BlockSimple o UnboxTagFloat and loadAcquire = BlockSimple o LoadAcquire and storeRelease = BlockSimple o StoreRelease and bitFieldShift = BlockSimple o BitFieldShift and bitFieldInsert = BlockSimple o BitFieldInsert and compareByteVectors = BlockSimple o CompareByteVectors and blockMove = BlockSimple o BlockMove and addSubXSP = BlockSimple o AddSubXSP and touchValue = BlockSimple o TouchValue and loadAcquireExclusive = BlockSimple o LoadAcquireExclusive and storeReleaseExclusive = BlockSimple o StoreReleaseExclusive and memoryBarrier = BlockSimple MemoryBarrier and convertIntToFloat = BlockSimple o ConvertIntToFloat and convertFloatToInt = BlockSimple o ConvertFloatToInt and unaryFloatingPt = BlockSimple o UnaryFloatingPt and binaryFloatingPoint = BlockSimple o BinaryFloatingPoint and compareFloatingPoint = BlockSimple o CompareFloatingPoint and cpuYield = BlockSimple CPUYield val atomicOperation = BlockSimple o AtomicOperation val shiftConstant = BlockSimple o shiftConstant (* Many operations use 32-bit arguments in 
32-in-64 and 64-bit in native 64. *) val polyWordLoadSize = if is32in64 then Load32 else Load64 val polyWordOpSize = if is32in64 then OpSize32 else OpSize64 val tagBitMask64 = Word64.<<(Word64.fromInt ~1, 0w1) val tagBitMask32 = Word64.andb(tagBitMask64, 0wxffffffff) val polyWordTagBitMask = if is32in64 then tagBitMask32 else tagBitMask64 (* The flags byte is the high-order byte of length word. *) val flagsByteOffset = if isBigEndian then ~ (Word.toInt wordSize) else ~1 (* Size of operand in bytes and therefore the scale factor. *) fun opWordSize Load64 = 8 | opWordSize Load32 = 4 | opWordSize Load16 = 2 | opWordSize Load8 = 1 (* Shift for each size. i.e. log2 of opWordSize. *) fun loadShift Load64 = 0w3 | loadShift Load32 = 0w2 | loadShift Load16 = 0w1 | loadShift Load8 = 0w0 fun precisionToFpSize PrecSingle = Float32 | precisionToFpSize PrecDouble = Double64 fun codeFunctionToArm64({body, localCount, name, argTypes, closure, ...}:bicLambdaForm, debugSwitches, resultClosure) = let (* Pseudo-registers are allocated sequentially and the properties added to the list. *) val pregCounter = ref 0 val pregPropList = ref [] fun newPReg() = let val regNo = !pregCounter before pregCounter := !pregCounter + 1 val () = pregPropList := RegPropGeneral :: !pregPropList in PReg regNo end and newUReg() = let val regNo = !pregCounter before pregCounter := !pregCounter + 1 val () = pregPropList := RegPropUntagged :: !pregPropList in PReg regNo end and newStackLoc size = let val regNo = !pregCounter before pregCounter := !pregCounter + 1 val () = pregPropList := RegPropStack size :: !pregPropList in StackLoc{size=size, rno=regNo} end and newMergeReg() = let val regNo = !pregCounter before pregCounter := !pregCounter + 1 val () = pregPropList := RegPropMultiple :: !pregPropList in PReg regNo end datatype locationValue = NoLocation | PregLocation of preg | StackContainer of { container: stackLocn, stackOffset: int } | RegisterContainer of preg list val locToPregArray = Array.array(localCount, NoLocation) val labelCounter = ref 1 (* Start at 1. Zero is used for the root. *) fun newLabel() = !labelCounter before labelCounter := !labelCounter + 1 val ccRefCounter = ref 0 fun newCCRef() = CcRef(!ccRefCounter) before ccRefCounter := !ccRefCounter + 1 (* The profile object is a single mutable with the F_bytes bit set. *) val profileObject = CodeArray.createProfileObject() (* Switch to indicate if we want to trace where live data has been allocated. *) (* TODO: This should be used in AllocateMemoryOperation and BoxValue and possibly AllocateMemoryVariable. *) val addAllocatingFunction = Debug.getParameter Debug.profileAllocationTag debugSwitches = 1 datatype destination = SpecificPReg of preg | NoResult | AnyReg (* Context type. *) type context = { loopArgs: (preg list * int * int) option, stackPtr: int, currHandler: int option, overflowBlock: int option ref } datatype argLoc = ArgumentIsInReg of { realReg: xReg, argReg: preg } | ArgumentIsOnStack of { stackOffset: int, stackReg: stackLocn } | ArgumentIsRegContainer of preg list (* An address as either suitable for Load/StoreWithConstantOffset or else Load/StoreWithIndexedOffset. *) datatype addressKind = AddrOffset of {base: preg, offset: int} | AddrIndex of {base: preg, index: preg} (* Pseudo-regs for the result, the closure and the args that were passed in real regs. 
*) val resultTarget = newPReg() val closureRegAddr = newPReg() val returnAddrReg = newPReg() val generalArgRegs = [X0, X1, X2, X3, X4, X5, X6, X7] (* If a container is larger than this it is passed on the stack. *) val smallContainerSize = 4 (* Create a map for the arguments indicating their register or stack location. *) local val containerRegs = case List.filter(fn ContainerType _ => true | _ => false) argTypes of [] => NONE | [ContainerType s] => if s <= smallContainerSize then SOME(List.tabulate(s, fn _ => newMergeReg())) else SOME [] (* Larger containers return their result on the stack. *) | _ => raise InternalError "more than one container arg" (* Select the appropriate argument register depending on the argument type. *) fun argTypesToArgEntries([], _, _) = ([], [], [], []) | argTypesToArgEntries(ContainerType s :: tl, gRegs, n) = if s <= smallContainerSize then let val (argTypes, argCode, argRegs, stackArgs) = argTypesToArgEntries(tl, gRegs, n-1) val regs = valOf containerRegs in (ArgumentIsRegContainer regs :: argTypes, argCode, argRegs, stackArgs) end (* The address of a larger container is passed as an argument *) else argTypesToArgEntries(GeneralType :: tl, gRegs, n) | argTypesToArgEntries(_ :: tl, gReg :: gRegs, n) = (* This deals with general arguments but also with extra floating point arguments. They are boxed as usual. *) let val (argTypes, argCode, argRegs, stackArgs) = argTypesToArgEntries(tl, gRegs, n-1) val argReg=newPReg() in (ArgumentIsInReg{realReg=gReg, argReg=argReg} :: argTypes, argCode, (argReg, gReg) :: argRegs, stackArgs) end | argTypesToArgEntries(_ :: tl, [], n) = let val (argTypes, argCode, argRegs, stackArgs) = argTypesToArgEntries(tl, [], n-1) val stackLoc = newStackLoc 1 in (ArgumentIsOnStack {stackOffset=n, stackReg = stackLoc } :: argTypes, argCode, argRegs, stackLoc :: stackArgs) end val (argEntries, argCode, argRegs, stackArguments) = argTypesToArgEntries(argTypes, generalArgRegs, List.length argTypes) val clReg = case closure of [] => [] | _ => [(closureRegAddr, X8)] val retReg = [(returnAddrReg, X30)] in val argumentVector = Vector.fromList argEntries (* Start code for the function. *) val beginInstructions = argCode @ [BlockBegin{regArgs=retReg @ clReg @ argRegs, stackArgs=stackArguments }] (* The number of arguments on the stack. Needed in return instrs and tail calls. *) val currentStackArgs = List.length stackArguments val containerResults = Option.map(fn regs => ListPair.zip(regs, generalArgRegs)) containerRegs end (* TODO: Return the values of the container registers if we have multiple results. *) fun returnInstruction({stackPtr, ...}, resReg, tailCode) = let val results = getOpt(containerResults, [(resReg, X0)]) (* Return the result in X0 unless there's a container. *) in BlockExit(ReturnResultFromFunction{results=results, returnReg = returnAddrReg, numStackArgs=currentStackArgs}) :: (if stackPtr <> 0 then resetStackPtr{numWords=stackPtr} :: tailCode else tailCode) end fun asTarget(SpecificPReg preg) = preg | asTarget _ = newPReg() fun moveToResult(SpecificPReg tReg, code, sReg) = (moveRegister{source=sReg, dest=tReg} :: code, tReg, false) | moveToResult(AnyReg, code, sReg) = (code, sReg, false) | moveToResult(NoResult, code, sReg) = let val tReg = newPReg() in (moveRegister{source=sReg, dest=tReg} :: code, tReg, false) end (* Store a register at a given offset. This may have to use an index register if the offset is too large. 
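               For example (illustrative offsets with a 64-bit operand): word
               offset 5 is byte offset 40 and uses a single scaled store, while
               word offset 5000 exceeds the 4096-unit limit, so the offset is
               first loaded into a scratch register and an indexed store is
               used.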
*) fun storeAtWordOffset(toStore, offset, base, loadSize, tailCode) = let val wSize = opWordSize loadSize val byteOffset = offset*wSize in if offset < 4096 andalso byteOffset > ~256 then storeWithConstantOffset{base=base, source=toStore, byteOffset=byteOffset, loadType=loadSize} :: tailCode else let val indexReg = newUReg() in storeWithIndexedOffset{ base=base, source=toStore, index=indexReg, loadType=loadSize, signExtendIndex=false } :: loadNonAddressConstant{ source=LargeWord.fromInt offset, dest=indexReg } :: tailCode end end (* Allocate a fixed size cell with a reference to the profile object if we want to trace the location of live data. Currently only used for tuples and closures. *) fun allocateWithProfileRev(n, flags, memAddr, tlCode) = let fun doAllocation(words, flags, tlCode) = let val wordsRequired = if is32in64 then (* Have to round this up to 8 bytes *) Word64.andb(Word64.fromInt(words+2), ~ 0w2) else Word64.fromInt(words+1) val bytesRequired = Word64.fromLarge(Word.toLarge wordSize) * wordsRequired val lengthWord = Word64.orb(Word64.fromInt words, Word64.<<(Word64.fromLarge(Word8.toLarge flags), if is32in64 then 0w24 else 0w56)) val lengthReg = newUReg() in storeWithConstantOffset{ source=lengthReg, base=memAddr, byteOffset= ~(Word.toInt wordSize), loadType=polyWordLoadSize } :: loadNonAddressConstant{ source=lengthWord, dest=lengthReg } :: allocateMemoryFixed{bytesRequired=bytesRequired, dest=memAddr, saveRegs=[]} :: tlCode end in if addAllocatingFunction then let val profReg = newPReg() in storeAtWordOffset(profReg, n, memAddr, polyWordLoadSize, loadAddressConstant{ source=profileObject, dest=profReg} :: doAllocation(n+1, Word8.orb(flags, Address.F_profile), tlCode)) end else doAllocation(n, flags, tlCode) end (* Return a unit result. *) fun returnUnit(target, code, exit) = let val tReg = asTarget target in (loadNonAddressConstant{source=taggedWord64 0w0, dest=tReg} :: code, tReg, exit) end (* Create a bool result from a test by returning true or false. *) fun makeBoolResultRev(condition, ccRef, target, testCode) = let val trueLab = newLabel() and falseLab = newLabel() and mergeLab = newLabel() val mergeReg = newMergeReg() in moveRegister{dest=target, source=mergeReg} :: BlockLabel mergeLab :: BlockFlow(Unconditional mergeLab) :: loadNonAddressConstant{dest=mergeReg, source=taggedWord64 0w0} :: BlockLabel falseLab :: BlockFlow(Unconditional mergeLab) :: loadNonAddressConstant{dest=mergeReg, source=taggedWord64 0w1} :: BlockLabel trueLab :: BlockFlow(Conditional{ ccRef=ccRef, condition=condition, trueJump=trueLab, falseJump=falseLab }) :: testCode end (* Return an absolute address in both native addressing and 32-in-64. *) fun getAbsoluteAddress(code, baseReg) = if is32in64 then let val absReg = newUReg() in (objectIndexAddressToAbsolute{ source=baseReg, dest=absReg } :: code, absReg) end else (code, baseReg) (* Load a value aligned on a 64 or 32-bit boundary. offset is the number of units. Typically this will be a polyword. 
*) fun wordAddressOffset(destination, baseReg1, offset, loadOp, code) = let val dReg = asTarget destination val opWordSize = opWordSize loadOp val byteOffset = offset * opWordSize val (codeBase, baseReg) = getAbsoluteAddress(code, baseReg1) val code = if offset < 4096 andalso byteOffset > ~256 then loadWithConstantOffset{base=baseReg, dest=dReg, byteOffset=byteOffset, loadType=loadOp} :: codeBase else let val indexReg = newUReg() in loadWithIndexedOffset{ base=baseReg, dest=dReg, index=indexReg, loadType=loadOp, signExtendIndex=false } :: loadNonAddressConstant{ source=LargeWord.fromInt offset, dest=indexReg } :: codeBase end in (code, dReg, false) end (* See if we have a container and return the entry if present. *) datatype containerType = NoContainer | ContainerOnStack of { container: stackLocn, stackOffset: int } | ContainerInRegs of preg list fun getContainerIfPresent(BICExtract(BICLoadLocal l)) = ( case Array.sub(locToPregArray, l) of StackContainer container => ContainerOnStack container | RegisterContainer rc => ContainerInRegs rc | _ => NoContainer ) | getContainerIfPresent(BICExtract(BICLoadArgument a)) = ( case Vector.sub(argumentVector, a) of ArgumentIsRegContainer rc => ContainerInRegs rc | _ => NoContainer ) | getContainerIfPresent _ = NoContainer (* General function for loads and stores. *) fun loadAndStoreWithAddress ({base=bReg1, index, offset}, loadSize, loadShift, isCAddress, loadStoreOffset, loadStoreIndex, code) = let val byteOffset = offset * loadSize (* Get the base register value *) val bCode = code val sCode = bCode (* Get any index register value. *) val (iCode, iReg1Opt) = case index of NONE => if offset < 4096 andalso byteOffset > ~256 then (sCode, NONE) (* We can use this offset. *) else let val iReg = newUReg() in (loadNonAddressConstant{ source=LargeWord.fromInt offset, dest=iReg } :: sCode, SOME iReg) end | SOME iReg1 => let val iCode1 = sCode (* The index is a tagged integer containing the number of units (words, bytes etc). It has to be untagged. If this is a C address it may be negative. *) val iReg2 = newUReg() (* Logical shift if this is a Poly address, arithmetic shift if this is a C address. *) val iCode2 = untagValue{source=iReg1, dest=iReg2, opSize=polyWordOpSize, isSigned=isCAddress } :: iCode1 in if offset = 0 then (iCode2, SOME iReg2) else let (* If there's some constant offset add it to the index. Because it's a byte offset we need to divide it by the scale but it should always be a multiple. N.B. In 32-in-64 the index register contains a 32-bit value even when the offset is negative. *) val cReg = newUReg() and iReg3 = newUReg() val offsetAsWord = LargeWord.fromInt offset (* It could be negative if it's a C address. *) val shiftedOffset = (if isCAddress then LargeWord.~>> else LargeWord.>>) (offsetAsWord, loadShift) in (addSubRegister{ base=iReg2, shifted=cReg, dest=SomeReg iReg3, ccRef=NONE, isAdd=true, length=polyWordOpSize, shift=ShiftNone} :: loadNonAddressConstant{ source=shiftedOffset, dest=cReg } :: iCode2, SOME iReg3) end end (* If this is 32in64 get the absolute address. *) val (absBCode, absBReg) = getAbsoluteAddress(iCode, bReg1) (* If this is a C address the "base address" is actually a box containing the address. 
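A hedged illustration of the offset folding above: for a C address with byte offset ~16 and loadShift 0w3 the constant added to the untagged index is ~16 ~>> 0w3 = ~2, the arithmetic shift preserving the sign, while Poly addresses always use the logical shift; the box holding a C address is then read with a plain Load64 at byte offset 0 to recover the raw machine address before the index is applied.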
*) val (effBCode, effBReg) = if isCAddress then let val bReg = newUReg() in (loadWithConstantOffset{ base=absBReg, dest=bReg, byteOffset=0, loadType=Load64 } :: absBCode, bReg) end else (absBCode, absBReg) in case iReg1Opt of SOME iReg => loadStoreIndex(effBReg, iReg, effBCode) | NONE => loadStoreOffset(effBReg, offset, effBCode) end (* Some operations require a single absolute address. These are all ML addresses so the index/offset is always unsigned. *) fun loadAndStoreWithAbsolute (address, loadSize, loadShift, loadStore, code) = let (* Have to add the offset/index register. *) fun loadStoreOffset(bReg, 0, code) = loadStore(bReg, code) | loadStoreOffset(bReg, offset, code) = let val cReg = newUReg() and aReg = newUReg() in loadStore(aReg, addSubRegister{ base=bReg, shifted=cReg, dest=SomeReg aReg, ccRef=NONE, isAdd=true, length=OpSize64, shift=ShiftNone} :: loadNonAddressConstant{ source=LargeWord.fromInt offset, dest=cReg } :: code) end and loadStoreIndex(bReg, iReg, code) = let val aReg = newUReg() (* The index register is a number of words/bytes etc so has to be multiplied when it's added in. *) val indexShift = if loadShift = 0w0 then ShiftNone else ShiftLSL(Word8.fromLarge(Word.toLarge loadShift)) in loadStore(aReg, addSubRegister{ base=bReg, shifted=iReg, dest=SomeReg aReg, ccRef=NONE, isAdd=true, length=OpSize64, shift=indexShift} :: code) end in loadAndStoreWithAddress (address, loadSize, loadShift, false, loadStoreOffset, loadStoreIndex, code) end (* Overflow check. This raises Overflow if the condition is satisfied. Normally this will be that the overflow bit is set but for multiplication it's more complicated. This generates a single block for the function unless there is a handler. As well as reducing the size of the code this also means that overflow checks are generally branch-on-overflow instructions that jump forward to the end of the code. Since the default branch prediction is not to take forward jumps this should improve prefetching on the normal, non-overflow, path. *) fun checkOverflow (condition, {currHandler=NONE, overflowBlock=ref(SOME overFlowLab), ...}, ccRef) = (* It's already been set and there's no surrounding handler - use this. *) let val noOverflowLab = newLabel() in [ BlockLabel noOverflowLab, BlockFlow(Conditional{ ccRef=ccRef, condition=condition, trueJump=overFlowLab, falseJump=noOverflowLab }) ] end | checkOverflow (condition, {currHandler=NONE, overflowBlock, ...}, ccRef) = let val overFlowLab = newLabel() and noOverflowLab = newLabel() val packetReg = newPReg() val () = overflowBlock := SOME overFlowLab in [ BlockLabel noOverflowLab, BlockExit(RaiseExceptionPacket{packetReg=packetReg}), loadAddressConstant{source=toMachineWord(Overflow), dest=packetReg}, BlockLabel overFlowLab, BlockFlow(Conditional{ ccRef=ccRef, condition=condition, trueJump=overFlowLab, falseJump=noOverflowLab }) ] end | checkOverflow (condition, {currHandler=SOME h, ...}, ccRef) = let val overFlowLab = newLabel() and noOverflowLab = newLabel() val packetReg = newPReg() in [ BlockLabel noOverflowLab, BlockRaiseAndHandle(RaiseExceptionPacket{packetReg=packetReg}, h), loadAddressConstant{source=toMachineWord(Overflow), dest=packetReg}, BlockLabel overFlowLab, BlockFlow(Conditional{ ccRef=ccRef, condition=condition, trueJump=overFlowLab, falseJump=noOverflowLab }) ] end fun codeToICodeRev(BICNewenv (bindings, exp), context: context as {stackPtr=initialSp, ...}, isTail, destination, tailCode) = let (* Process a list of bindings. 
We need to accumulate the space used by any containers and reset the stack pointer at the end if necessary. *) fun doBindings([], context, tailCode) = (tailCode, context) | doBindings(BICDeclar{value=BICExtract(BICLoadLocal l), addr, ...} :: decs, context, tailCode) = let (* Giving a new name to an existing entry. This should have been removed at a higher level but it doesn't always seem to be. In particular we must treat this specially if it's a container. *) val original = Array.sub(locToPregArray, l) val () = Array.update(locToPregArray, addr, original) in doBindings(decs, context, tailCode) end | doBindings(BICDeclar{value, addr, ...} :: decs, context, tailCode) = let val (code, dest, _) = codeToICodeRev(value, context, false, AnyReg, tailCode) val () = Array.update(locToPregArray, addr, PregLocation dest) in doBindings(decs, context, code) end | doBindings(BICRecDecs [{lambda, addr, ...}] :: decs, context, tailCode) = (* We shouldn't have single entries in RecDecs but it seems to occur at the moment. *) let val dest = newPReg() val (code, _, _) = codeToICodeRev(BICLambda lambda, context, false, SpecificPReg dest, tailCode) val () = Array.update(locToPregArray, addr, PregLocation dest) in doBindings(decs, context, code) end | doBindings(BICRecDecs recDecs :: decs, context, tailCode) = let val destRegs = map (fn _ => newPReg()) recDecs val flagsValue = if is32in64 then F_closure else 0w0 (* First build the closures as mutable cells containing zeros. Set the entry in the address table to the register containing the address. *) fun makeClosure({lambda={closure, ...}, addr, ...}, dest, tailCode) = let val () = Array.update(locToPregArray, addr, PregLocation dest) val wordsRequired = List.length closure + (if is32in64 then 2 else 1) val absAddr = if is32in64 then newUReg() else dest val zeroReg = newPReg() val allocAndSetZero = loadNonAddressConstant{ source=taggedWord64 0w0, dest=zeroReg} :: allocateWithProfileRev(wordsRequired, Word8.orb(F_mutable, flagsValue), absAddr, tailCode) val (_, clearCode) = List.foldl(fn (_, (n, l)) => (n+1, storeAtWordOffset(zeroReg, n, absAddr, polyWordLoadSize, l))) (0, allocAndSetZero) closure in if is32in64 then absoluteToObjectIndex{ source=absAddr, dest=dest } :: clearCode else clearCode end val allocClosures = ListPair.foldlEq makeClosure tailCode (recDecs, destRegs) fun setClosure({lambda, ...}, dest, l) = let val absAddr = if is32in64 then newUReg() else dest val flagsReg = newUReg() (* Lock the closure by storing the flags byte without the mutable flag. TODO: We could simply use XZ here. *) in storeWithConstantOffset{ base=absAddr, source=flagsReg, byteOffset=flagsByteOffset, loadType=Load8 } :: loadNonAddressConstant{ source=Word8.toLarge flagsValue, dest=flagsReg } :: storeIntoClosure(lambda, absAddr, context, if is32in64 then objectIndexAddressToAbsolute{ source=dest, dest=absAddr } :: l else l) end val setAndLockClosures = ListPair.foldlEq setClosure allocClosures (recDecs, destRegs) in doBindings(decs, context, setAndLockClosures) end | doBindings(BICNullBinding exp :: decs, context, tailCode) = let val (code, _, _) = codeToICodeRev(exp, context, false, NoResult, tailCode) (* And discard result. 
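As an aside on the container bindings below (sizes invented for illustration): a size-3 container just receives three fresh merge registers since 3 <= smallContainerSize, whereas a size-6 container reserves six stack words, zeroed from a register holding tagged 0, and records its stack offset for later loads and stores.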
*) in doBindings(decs, context, code) end | doBindings(BICDecContainer{ addr, size } :: decs, context as {loopArgs, stackPtr, currHandler, overflowBlock}, tailCode) = if size <= smallContainerSize then let val regs = List.tabulate(size, fn _ => newMergeReg()) val () = Array.update(locToPregArray, addr, RegisterContainer regs) in doBindings(decs, context, tailCode) end else let (* Larger container - reserve a portion of stack and zero it. *) val containerLoc = newStackLoc size val () = Array.update(locToPregArray, addr, StackContainer{container=containerLoc, stackOffset=stackPtr+size}) val zeroReg = newPReg() in doBindings(decs, {loopArgs=loopArgs, stackPtr=stackPtr+size, currHandler=currHandler, overflowBlock=overflowBlock}, tailCode <::> loadNonAddressConstant{ source=taggedWord64 0w0, dest=zeroReg } <::> pushToStack{copies=size, container=containerLoc, source=zeroReg}) end val (codeBindings, resContext as {stackPtr=finalSp, ...}) = doBindings(bindings, context, tailCode) (* If we have had a container we'll need to reset the stack *) in if initialSp <> finalSp then let val _ = finalSp >= initialSp orelse raise InternalError "codeToICode - stack ptr" val bodyReg = newPReg() and resultReg = asTarget destination val (codeExp, result, haveExited) = codeToICodeRev(exp, resContext, isTail, SpecificPReg bodyReg, codeBindings) val afterAdjustSp = if haveExited then codeExp else moveRegister{source=result, dest=resultReg} :: resetStackPtr{numWords=finalSp-initialSp} :: codeExp in (afterAdjustSp, resultReg, haveExited) end else codeToICodeRev(exp, resContext, isTail, destination, codeBindings) end | codeToICodeRev(BICExtract(BICLoadLocal l), {stackPtr, ...}, _, destination, tailCode) = ( case Array.sub(locToPregArray, l) of NoLocation => raise InternalError "codeToICodeRev - local unset" | PregLocation preg => moveToResult(destination, tailCode, preg) | StackContainer{container, stackOffset} => let val target = asTarget destination in (containerAddress{dest=target, container=container, stackOffset=stackPtr-stackOffset} :: tailCode, target, false) end | RegisterContainer _ => raise InternalError "BICExtract local: reg container" ) | codeToICodeRev(BICExtract(BICLoadArgument a), {stackPtr, ...}, _, destination, tailCode) = ( case Vector.sub(argumentVector, a) of ArgumentIsInReg{argReg, ...} => (* It was originally in a register. It's now in a preg. *) moveToResult(destination, tailCode, argReg) | ArgumentIsOnStack{stackOffset, stackReg} => (* Pushed before call. *) let val target = asTarget destination in (loadStack{wordOffset=stackOffset+stackPtr, container=stackReg, field=0, dest=target} :: tailCode, target, false) end | ArgumentIsRegContainer _ => raise InternalError "BICExtract argument: reg container" ) | codeToICodeRev(BICExtract(BICLoadClosure c), _, _, destination, tailCode) = let (* Add the number of words for the code address. This is 1 in native but 2 in 32-in-64. *) val offset = if is32in64 then c+2 else c+1 in if c >= List.length closure then raise InternalError "BICExtract: closure" else (); wordAddressOffset(destination, closureRegAddr, offset, polyWordLoadSize, tailCode) end | codeToICodeRev(BICExtract BICLoadRecursive, _, _, destination, tailCode) = (* If the closure is empty we must use the constant. We can't guarantee that the caller will actually load the closure register if it knows the closure is empty. 
*) ( case closure of [] => let val dReg = asTarget destination in (loadAddressConstant{source=closureAsAddress resultClosure, dest=dReg} :: tailCode, dReg, false) end | _ => moveToResult(destination, tailCode, closureRegAddr) ) | codeToICodeRev(BICConstnt(w, _), _, _, destination, tailCode) = let val dReg = asTarget destination val instr = if isShort w then (* When converting to Word64 we do NOT want to use sign-extension. In 32-in-64 signed fixed-precision ints need to have zeros in the top 32 bits. *) loadNonAddressConstant{source=taggedWord64(Word64.fromLarge(Word.toLarge(toShort w))), dest=dReg} else loadAddressConstant{source=w, dest=dReg} in (instr :: tailCode, dReg, false) end | codeToICodeRev(BICField{base, offset}, context, _, destination, tailCode) = let val (codeBase, baseReg, _) = codeToICodeRev(base, context, false, AnyReg, tailCode) in wordAddressOffset(destination, baseReg, offset, polyWordLoadSize, codeBase) end | codeToICodeRev(BICCond(test, thenPt, elsePt), context, isTail, NoResult, tailCode) = let (* If we don't want the result but are only evaluating for side-effects we may be able to optimise special cases. This was easier in the forward case but for now we don't bother and leave it to the lower levels. *) val startElse = newLabel() and skipElse = newLabel() val codeTest = codeConditionRev(test, context, false, startElse, tailCode) val (codeThen, _, _) = codeToICodeRev(thenPt, context, isTail, NoResult, codeTest) val (codeElse, _, _) = codeToICodeRev(elsePt, context, isTail, NoResult, BlockLabel startElse :: BlockFlow(Unconditional skipElse) :: codeThen) in returnUnit(NoResult, BlockLabel skipElse :: codeElse, false(*??*)) end | codeToICodeRev(BICCond(test, thenPt, elsePt), context, isTail, destination, tailCode) = let (* Because we may push the result onto the stack we have to create a new preg to hold the result and then copy that to the final result. *) (* If this is a tail each arm will exit separately and neither will return a result. *) val target = asTarget destination val condResult = newMergeReg() val thenTarget = if isTail then newPReg() else condResult val startElse = newLabel() val testCode = codeConditionRev(test, context, false, startElse, tailCode) (* Put the result in the target register. *) val (thenCode, _, thenExited) = codeToICodeRev(thenPt, context, isTail, SpecificPReg thenTarget, testCode) (* Add a jump round the else-part except that if this is a tail we return. The then-part could have exited e.g. with a raise or a loop. *) val (exitThen, thenLabel, elseTarget) = if thenExited then (thenCode, [], target (* Can use original target. *)) else if isTail then (returnInstruction(context, thenTarget, thenCode), [], newPReg()) else let val skipElse = newLabel() in (BlockFlow(Unconditional skipElse) :: thenCode, [moveRegister{source=condResult, dest=target}, BlockLabel skipElse], condResult) end val (elseCode, _, elseExited) = codeToICodeRev(elsePt, context, isTail, SpecificPReg elseTarget, BlockLabel startElse :: exitThen) (* Add a return to the else-part if necessary so we will always exit on a tail. 
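To illustrate the non-tail shape (hypothetical): both arms write their value into the merge register condResult, the then-arm jumps over the else-arm to skipElse, and a single moveRegister afterwards copies condResult into the real target; in the tail case each arm instead ends with its own return and no merge register is needed.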
*) val exitElse = if isTail andalso not elseExited then returnInstruction(context, elseTarget, elseCode) else elseCode in (thenLabel @ exitElse, target, isTail orelse thenExited andalso elseExited) end | codeToICodeRev(BICUnary instr, context, isTail, destination, tailCode) = codeToICodeUnaryRev(instr, context, isTail, destination, tailCode) | codeToICodeRev(BICBinary instr, context, isTail, destination, tailCode) = codeToICodeBinaryRev(instr, context, isTail, destination, tailCode) | codeToICodeRev(BICTagTest{test, tag=tagValue, ...}, context, isTail, destination, tailCode) = (* Check the "tag" word of a union (datatype). N.B. Not the same as testing the tag bit of a word. Just generate it as a general word comparison. The optimiser will sort out whether the tag value can be an immediate. *) codeToICodeRev(BICBinary{oper=WordComparison{test=TestEqual, isSigned=false}, arg1=test, arg2=BICConstnt(toMachineWord tagValue, [])}, context, isTail, destination, tailCode) | codeToICodeRev(BICTuple fields, context, _, destination, tailCode) = let val target = asTarget destination (* The allocator sets the register to the absolute address. It has to be converted to an object pointer in 32-in-64. *) val absAddr = if is32in64 then newUReg() else target fun loadFields([], n, tlCode) = allocateWithProfileRev(n, 0w0, absAddr, tlCode) | loadFields((f as BICConstnt _) :: rest, n, tlCode) = let (* Unlike the X86 we still need to load a constant into a register in order to store it in the new tuple. However, it's better to leave that until after the allocation and move it then. That way we can use the same register for different constants if we have a very large tuple. *) val restAndAlloc = loadFields(rest, n+1, tlCode) val (code1, source, _) = codeToICodeRev(f, context, false, AnyReg, restAndAlloc) in storeAtWordOffset(source, n, absAddr, polyWordLoadSize, code1) end | loadFields(f :: rest, n, tlCode) = let val (code1, source, _) = codeToICodeRev(f, context, false, AnyReg, tlCode) val restAndAlloc = loadFields(rest, n+1, code1) in storeAtWordOffset(source, n, absAddr, polyWordLoadSize, restAndAlloc) end val allocAndStore = loadFields(fields, 0, tailCode) val code = if is32in64 then absoluteToObjectIndex{source=absAddr, dest=target} :: allocAndStore else allocAndStore in (code, target, false) end | codeToICodeRev(BICRaise exc, context as { currHandler, ...}, _, destination, tailCode) = let val (code, packetReg, _) = codeToICodeRev(exc, context, false, AnyReg, tailCode) val raiseCode = RaiseExceptionPacket{packetReg=packetReg} val block = case currHandler of NONE => BlockExit raiseCode | SOME h => BlockRaiseAndHandle(raiseCode, h) in returnUnit(destination, block :: code, true (* Always exits *)) end | codeToICodeRev(BICEval{function, argList, ...}, context as { currHandler, ...}, isTail, destination, tailCode) = let val target = asTarget destination (* Create pregs for the closure and each argument. *) val clPReg = newPReg() (* If we have a constant closure we can go directly to the entry point. If the closure is a single word we don't need to load the closure register. *) val (functionCode, closureEntry, callKind) = case function of BICConstnt(addr, _) => let val addrAsAddr = toAddress addr (* If this is a closure we're still compiling we can't get the code address. However if this is directly recursive we can use the recursive convention. 
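Roughly, the outcomes below are: a directly recursive call becomes callKind=Recursive and may avoid loading X8 altogether; a known native-64-bit function whose closure is a single code word becomes ConstantCode with no closure register; and the remaining cases load the closure, or an address constant for it, into X8 with callKind FullCall or ConstantCode.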
*) in if wordEq(closureAsAddress resultClosure, addr) then (tailCode, [], Recursive) else if flags addrAsAddr <> Address.F_words andalso flags addrAsAddr <> Address.F_closure then (loadAddressConstant{source=addr, dest=clPReg} :: tailCode, [(ArgInReg clPReg, X8)], FullCall) else if is32in64 then (* The code address is a 64-bit value so we have to load it at run-time. The X86 version passes the closure address here and generates a relative CALL/JMP. The actual offset is computed by the RTS. For the moment just use a full call. *) (loadAddressConstant{source=addr, dest=clPReg} :: tailCode, [(ArgInReg clPReg, X8)], FullCall) else (* Native 64-bits. *) let val addrLength = length addrAsAddr val _ = addrLength >= 0w1 orelse raise InternalError "BICEval address" val codeAddr = loadWord(addrAsAddr, 0w0) val _ = isCode (toAddress codeAddr) orelse raise InternalError "BICEval address not code" in if addrLength = 0w1 then (tailCode, [], ConstantCode codeAddr) else (loadAddressConstant{source=addr, dest=clPReg} :: tailCode, [(ArgInReg clPReg, X8)], ConstantCode codeAddr) end end | BICExtract BICLoadRecursive => ( (* If the closure is empty we don't need to load X8 *) case closure of [] => (tailCode, [], Recursive) | _ => (moveRegister {source=closureRegAddr, dest=clPReg} :: tailCode, [(ArgInReg clPReg, X8)], Recursive) ) | function => (* General case. *) (#1 (codeToICodeRev(function, context, false, SpecificPReg clPReg, tailCode)), [(ArgInReg clPReg, X8)], FullCall) local (* Load the first arguments into registers and the rest to the stack. *) fun loadArgs ([], _, tailCode) = (tailCode, [], []) | loadArgs ((arg, _) :: args, gReg::gRegs, tailCode) = let (* General register argument. *) val (c, r, _) = codeToICodeRev(arg, context, false, AnyReg, tailCode) val (code, regArgs, stackArgs) = loadArgs(args, gRegs, c) in (code, (ArgInReg r, gReg) :: regArgs, stackArgs) end | loadArgs ((arg, _) :: args, [], tailCode) = let (* Stack argument. *) val (c, r, _) = codeToICodeRev(arg, context, false, AnyReg, tailCode) val (code, regArgs, stackArgs) = loadArgs(args, [], c) in (code, regArgs, ArgInReg r :: stackArgs) end fun isSmallContainer(ContainerType s) = s <= smallContainerSize | isSmallContainer _ = false in val (codeArgs, regArgs, stackArgs) = loadArgs(List.filter(not o isSmallContainer o #2) argList, generalArgRegs, functionCode) end (* If this is at the end of the function and the result types are the same we can use a tail-recursive call. *) val tailCall = isTail (*andalso resultType = fnResultType*) val callCode = if tailCall then let val {stackPtr, ...} = context (* The number of arguments currently on the stack. *) val currentStackArgCount = currentStackArgs val newStackArgCount = List.length stackArgs (* The offset of the first argument. Offsets can be negative. *) val stackOffset = stackPtr fun makeStackArgs([], _) = [] | makeStackArgs(arg::args, offset) = {src=arg, stack=offset} :: makeStackArgs(args, offset-1) val stackArgs = makeStackArgs(stackArgs, currentStackArgCount-1) (* The stack adjustment needed to compensate for any items that have been pushed and the differences in the number of arguments. May be positive or negative. *) val stackAdjust = currentStackArgCount - newStackArgCount (* Add an entry for the return address to the register arguments. 
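The X30 entry below is that return-address argument. As a worked example with invented counts: if the current function received three stack arguments and the tail call pushes one, then stackAdjust = 3 - 1 = 2, the callee's first stack argument is placed at offset currentStackArgCount - 1 = 2, and the caller's frame shrinks by two words before the jump.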
*) in BlockExit(TailRecursiveCall{regArgs=(ArgInReg returnAddrReg, X30) :: closureEntry @ regArgs, stackArgs=stackArgs, stackAdjust = stackAdjust, currStackSize=stackOffset, callKind=callKind}) :: codeArgs end else let (* See if there is a container argument. *) val containerArg = List.find(fn (_, ContainerType _) => true | _ => false) argList val containerValue = case containerArg of SOME(argVal, _) => getContainerIfPresent argVal | NONE => NoContainer (* When a container is passed as an argument we put the address into a register. Normally the container will be referenced after the call in order to extract the values but if it's discarded we need to make sure it will continue to be referenced at least as far as the call. This isn't a problem for the X86 code-generator since container addresses are a form of the "argument" datatype. *) val stackContainers = case containerValue of ContainerOnStack{container, ...} => [container] | _ => [] (* Get the results. If we're returning the result through a container the target isn't used so we return unit. *) val (results, setTarget) = case containerValue of ContainerInRegs regs => (ListPair.zip(regs, generalArgRegs), [loadNonAddressConstant{source=taggedWord64 0w0, dest=target}]) | ContainerOnStack _ => ([], [loadNonAddressConstant{source=taggedWord64 0w0, dest=target}]) | NoContainer => ([(target, X0)], []) val call = FunctionCall{regArgs=closureEntry @ regArgs, stackArgs=stackArgs, dests=results, callKind=callKind, saveRegs=[], containers=stackContainers} val callBlock = case currHandler of NONE => BlockSimple call :: codeArgs | SOME h => BlockOptionalHandle{call=call, handler=h, label=newLabel()} :: codeArgs in callBlock <@> setTarget end in (callCode, target, tailCall (* We've exited if this was a tail jump *)) end | codeToICodeRev(BICNullary{oper=BuiltIns.GetCurrentThreadId}, _, _, destination, tailCode) = (* Get the ID of the current thread. *) let val target = asTarget destination in (getThreadId{dest=target} :: tailCode, target, false) end | codeToICodeRev(BICNullary{oper=BuiltIns.CPUPause}, _, _, destination, tailCode) = (* This is now done in the RTS call code. *) returnUnit(destination, tailCode <::> cpuYield, false) | codeToICodeRev(BICNullary {oper=CreateMutex}, _, _, destination, tailCode) = let (* Allocate memory for a mutex. Use a native word as a mutable, weak, no-overwrite, byte cell which is the same as a volatileRef. This ensures that it will always be cleared when it is loaded even if it was locked when it was saved. *) val target = asTarget destination val flags = Word8.orb(F_mutable, Word8.orb(F_weak, Word8.orb(F_noOverwrite, F_bytes))) (* 0wx69 *) val absAddr = if is32in64 then newUReg() else target val zeroReg = newUReg() val allocAndStore = storeWithConstantOffset{ source=zeroReg, base=absAddr, byteOffset=0, loadType=Load64 } :: loadNonAddressConstant{source=0w0, dest=zeroReg} :: allocateWithProfileRev(if is32in64 then 2 else 1, flags, absAddr, tailCode) val code = if is32in64 then absoluteToObjectIndex{source=absAddr, dest=target} :: allocAndStore else allocAndStore in (code, target, false) end | codeToICodeRev(BICArbitrary { oper=ArithMult, longCall, ... }, context, isTail, destination, tailCode) = (* Just call the long function to do this. Overflow detection makes this too complicated. 
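The short path below relies on the tagged representation: with tagged n = 2*n+1, computing tagged a + (tagged b - 1) gives 2*(a+b)+1, i.e. tagged (a+b), and tagged a - (tagged b - 1) gives tagged (a-b); the addSubImmediate that subtracts 0w1 performs the untagging and the flags set by addSubRegister detect overflow, branching to the long case. Multiplication has no such single-operation form, hence this clause.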
*) codeToICodeRev(longCall, context, isTail, destination, tailCode) | codeToICodeRev(BICArbitrary { oper, shortCond, arg1, arg2, longCall }, context, _, destination, tailCode) = let val startLong = newLabel() and resultLabel = newLabel() val condResult = newMergeReg() (* Test to see if the arguments are short and go straight to the long case if not. *) val testCode = codeConditionRev(shortCond, context, false, startLong, tailCode) (* Do the short case *) val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, testCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) (* We need to subtract the tag from one of the arguments and then do the addition. The optimiser will do the subtraction at compile time if we subtract from a constant so if this is an Add we try to put the constant in the second arg. *) val (firstReg, secondReg) = case (arg1, oper) of (BICConstnt _, ArithAdd) => (aReg2, aReg1) | _ => (aReg1, aReg2) (* Generate code for the short case. Put the result in the merge register. Jump to the result if there's no overflow and to the long case if there is. *) val codeShort = case oper of ArithAdd => let val uReg = newUReg() and chkOverflow = newCCRef() in BlockFlow(Conditional{ ccRef=chkOverflow, condition=CondOverflow, trueJump=startLong, falseJump=resultLabel }) :: addSubRegister{base=firstReg, shifted=uReg, dest=SomeReg condResult, length=polyWordOpSize, ccRef=SOME chkOverflow, isAdd=true, shift=ShiftNone} :: addSubImmediate{dest=SomeReg uReg, source=secondReg, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false}:: arg2Code end | ArithSub => let val uReg = newUReg() and chkOverflow = newCCRef() in BlockFlow(Conditional{ ccRef=chkOverflow, condition=CondOverflow, trueJump=startLong, falseJump=resultLabel }) :: addSubRegister{base=firstReg, shifted=uReg, dest=SomeReg condResult, length=polyWordOpSize, ccRef=SOME chkOverflow, isAdd=false, shift=ShiftNone} :: addSubImmediate{dest=SomeReg uReg, source=secondReg, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false}:: arg2Code end | _ => raise InternalError "BICArbitrary: unimplemented operation" (* Code for the long case. Put the result into the merge register. *) (* TODO: This could use a tail call if this is at the end of the function. *) val (codeLong, _, _) = codeToICodeRev(longCall, context, false, SpecificPReg condResult, BlockLabel startLong :: codeShort) val target = asTarget destination (* Copy the merge register into the result. *) val finalCode = moveRegister{source=condResult, dest=target} :: BlockLabel resultLabel :: codeLong in (finalCode, target, false) end | codeToICodeRev(BICLambda(lambda as { closure = [], ...}), _, _, destination, tailCode) = (* Empty closure - create a constant closure for any recursive calls. *) let val closure = makeConstantClosure() val () = codeFunctionToArm64(lambda, debugSwitches, closure) val dReg = asTarget destination (* Return the closure itself as the value. *) in (BlockSimple(LoadAddressConstant{source=closureAsAddress closure, dest=dReg}) :: tailCode, dReg, false) end | codeToICodeRev(BICLambda(lambda as { closure, ...}), context, _, destination, tailCode) = (* Non-empty closure. Ignore stack closure option at the moment. *) let val wordsRequired = List.length closure + (if is32in64 then 2 else 1) val target = asTarget destination val absAddr = if is32in64 then newUReg() else target (* The values we're storing are all either constants or local/closure variables so we can allocate the memory and then store into it. 
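For a hypothetical closure with three free variables this requests 3+1 = 4 words natively or 3+2 = 5 words in 32-in-64, the extra words holding the code pointer; allocateWithProfileRev then adds the length word and any alignment padding, and storeIntoClosure fills in the code address and free variables.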
*) val allocCode = allocateWithProfileRev(wordsRequired, if is32in64 then F_closure else 0w0, absAddr, tailCode) val storeCode = storeIntoClosure(lambda, absAddr, context, allocCode) val finalCode = if is32in64 then BlockSimple(AbsoluteToObjectIndex{source=absAddr, dest=target}) :: storeCode else storeCode in (finalCode, target, false) end | codeToICodeRev(BICCase { cases, test, default, isExhaustive, firstIndex}, context, isTail, destination, tailCode) = let (* We have to create a new preg for the result in case we need to push it to the stack. *) val targetReg = newMergeReg() local val (testCode, initialTestReg, _) = codeToICodeRev(test, context, false, AnyReg, tailCode) (* Subtract the minimum even if it is zero to remove the tag. This leaves us with a shifted but untagged value. Don't check for overflow. Instead allow large values to wrap around and check later. *) val cReg1 = newUReg() val subValue = taggedWord64(Word64.fromLarge(Word.toLargeX firstIndex)) in val testReg = newUReg() val testCode = addSubRegister{ base=initialTestReg, shifted=cReg1, dest=SomeReg testReg, ccRef=NONE, isAdd=false, length=polyWordOpSize, shift=ShiftNone} :: loadNonAddressConstant{ source=subValue, dest=cReg1 } :: testCode end val (rangeCheck, extraDefaults) = if isExhaustive then (testCode, []) else let (* Check the value is within the number of cases, *2 because this is shifted. *) val cReg2 = newUReg() and ccRef1 = newCCRef() val nCases = List.length cases val continueLab = newLabel() and defaultLab1 = newLabel() val rangeCheck = BlockLabel continueLab :: BlockFlow(Conditional{ccRef=ccRef1, condition=CondCarrySet, trueJump=defaultLab1, falseJump=continueLab}) :: addSubRegister{base=testReg, shifted=cReg2, dest=ZeroReg, ccRef=SOME ccRef1, isAdd=false, length=OpSize64, shift=ShiftNone} :: loadNonAddressConstant{ source=Word64.fromInt nCases * 0w2, dest=cReg2 } :: testCode in (rangeCheck, [defaultLab1]) end (* Make a label for each item in the list. *) val codeLabels = map (fn _ => newLabel()) cases (* Create an exit label in case it's needed. *) val labelForExit = newLabel() (* Generate the code for each of the cases and the default. We need to put an unconditional branch after each to skip the other cases. *) fun codeCases (SOME c :: otherCases, startLabel :: otherLabels, tailCode) = let val caseTarget = if isTail then newPReg() else targetReg (* Put in the case with a jump to the end of the sequence. *) val (codeThisCase, _, caseExited) = codeToICodeRev(c, context, isTail, SpecificPReg caseTarget, BlockLabel startLabel :: tailCode) val exitThisCase = if caseExited then codeThisCase else if isTail then returnInstruction(context, caseTarget, codeThisCase) else BlockFlow(Unconditional labelForExit) :: codeThisCase in codeCases(otherCases, otherLabels, exitThisCase) end | codeCases(NONE :: otherCases, _ :: otherLabels, tailCode) = codeCases(otherCases, otherLabels, tailCode) | codeCases ([], [], tailCode) = let (* We need to add labels for all the gaps we filled and also for a "default" label for the indexed-case instruction itself as well as any range checks. 
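For instance, with firstIndex = 3 and a tagged test value of 5 the subtraction leaves 2*(5-3) = 4 in testReg, untagged but still shifted; a non-exhaustive match over nCases = 5 alternatives therefore compares against 5*2 = 10, and the carry-set branch catches anything out of range, including values that wrapped around.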
*) fun addDefault (startLabel, NONE, l) = BlockLabel startLabel :: l | addDefault (_, SOME _, l) = l fun asForward l = BlockLabel l val dLabs = map asForward extraDefaults @ tailCode val defLabels = ListPair.foldlEq addDefault dLabs (codeLabels, cases) val defaultTarget = if isTail then newPReg() else targetReg val (defaultCode, _, defaultExited) = codeToICodeRev(default, context, isTail, SpecificPReg defaultTarget, defLabels) in (* Put in the default. Because this is the last we don't need to jump round it. However if this is a tail and we haven't exited we put in a return. That way the case will always have exited if this is a tail. *) if isTail andalso not defaultExited then returnInstruction(context, defaultTarget, defaultCode) else defaultCode end | codeCases _ = raise InternalError "codeCases: mismatch" val codedCases = codeCases(cases, codeLabels, BlockFlow(IndexedBr codeLabels) :: BlockSimple(IndexedCaseOperation{testReg=testReg}) :: rangeCheck) (* We can now copy to the target. If we need to push the result this load will be converted into a push. *) val target = asTarget destination val copyToTarget = if isTail then codedCases else moveRegister{source=targetReg, dest=target} :: BlockLabel labelForExit :: codedCases in (copyToTarget, target, isTail (* We have always exited on a tail. *)) end | codeToICodeRev(BICBeginLoop {loop, arguments}, context as { stackPtr, currHandler, overflowBlock, ...}, isTail, destination, tailCode) = let val target = asTarget destination fun codeArgs ([], tailCode) = ([], tailCode) | codeArgs (({value, addr}, _) :: rest, tailCode) = let val pr = newPReg() val () = Array.update(locToPregArray, addr, PregLocation pr) val (code, _, _) = codeToICodeRev(value, context, false, SpecificPReg pr, tailCode) val (pregs, othercode) = codeArgs(rest, code) in (pr::pregs, othercode) end val (loopRegs, argCode) = codeArgs(arguments, tailCode) val loopLabel = newLabel() val (loopBody, _, loopExited) = codeToICodeRev(loop, {loopArgs=SOME (loopRegs, loopLabel, stackPtr), stackPtr=stackPtr, currHandler=currHandler, overflowBlock=overflowBlock }, isTail, SpecificPReg target, BlockLabel loopLabel :: BlockSimple BeginLoop :: argCode) in (loopBody, target, loopExited) end | codeToICodeRev(BICLoop args, context as {loopArgs=SOME (loopRegs, loopLabel, loopSp), stackPtr, currHandler, ...}, _, destination, tailCode) = let val target = asTarget destination (* Registers to receive the evaluated arguments. We can't put the values into the loop variables yet because the values could depend on the current values of the loop variables. *) val argPRegs = map(fn _ => newPReg()) args val codeArgs = ListPair.foldlEq(fn ((arg, _), pr, l) => #1 (codeToICodeRev(arg, context, false, SpecificPReg pr, l))) tailCode (args, argPRegs) val jumpArgs = ListPair.mapEq(fn (s, l) => {src=ArgInReg s, dst=l}) (argPRegs, loopRegs) (* If we've allocated a container in the loop we have to remove it before jumping back. *) val stackReset = if loopSp = stackPtr then codeArgs else resetStackPtr{numWords=stackPtr-loopSp} :: codeArgs val jumpLoop = JumpLoop{regArgs=jumpArgs, stackArgs=[], checkInterrupt=SOME[]} (* "checkInterrupt" could result in an Interrupt exception so we treat this like a function call. 
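Concretely, with invented numbers: if the loop body allocated a size-3 stack container then loopSp = 2 and stackPtr = 5, so resetStackPtr{numWords=3} is emitted before the jump; and with a handler in scope the jump is wrapped in BlockOptionalHandle since the interrupt check can raise.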
*) val code = case currHandler of NONE => BlockFlow(Unconditional loopLabel) :: BlockSimple jumpLoop :: stackReset | SOME h => BlockOptionalHandle{call=jumpLoop, handler=h, label=loopLabel} :: stackReset in (code, target, true) end | codeToICodeRev(BICLoop _, {loopArgs=NONE, ...}, _, _, _) = raise InternalError "BICLoop without BICBeginLoop" (* Copy the source tuple into the container. There are important special cases for both the source tuple and the container. If the source tuple is a BICTuple we have the fields and can store them without creating a tuple on the heap. If the destination is a local container we can store directly into the stack. *) | codeToICodeRev(BICSetContainer{container, tuple, filter}, context as {stackPtr, ...}, _, destination, tailCode) = let local fun createStore containerReg (source, destWord, tail) = storeAtWordOffset(source, destWord, containerReg, Load64, tail) in val (codeContainer, storeInstr) = case getContainerIfPresent container of ContainerOnStack{container, stackOffset} => let fun store(source, destWord, tail) = storeToStack{source=source, container=container, field=destWord, stackOffset=stackPtr-stackOffset+destWord} :: tail in (tailCode, store) end | ContainerInRegs regs => let fun copy(source, destWord, tail) = tail <::> moveRegister{source=source, dest=List.nth(regs, destWord)} in (tailCode, copy) end | NoContainer => let val containerTarget = newPReg() val (codeContainer, _, _) = codeToICodeRev(container, context, false, SpecificPReg containerTarget, tailCode) in (codeContainer, createStore containerTarget) end end val filterLength = BoolVector.length filter val code = case tuple of BICTuple cl => let (* In theory it's possible that the tuple could contain fields that are not used but nevertheless need to be evaluated for their side-effects. Create all the fields and push to the stack. *) fun codeField(arg, (regs, tailCode)) = let val (c, r, _) = codeToICodeRev(arg, context, false, AnyReg, tailCode) in (r :: regs, c) end val (pregsRev, codeFields) = List.foldl codeField ([], codeContainer) cl val pregs = List.rev pregsRev fun copyField(srcReg, (sourceWord, destWord, tailCode)) = if sourceWord < filterLength andalso BoolVector.sub(filter, sourceWord) then (sourceWord+1, destWord+1, storeInstr(srcReg, destWord, tailCode)) else (sourceWord+1, destWord, tailCode) val (_, _, resultCode) = List.foldl copyField (0, 0, codeFields) pregs in resultCode end | tuple => let (* Copy a heap tuple. It is possible that this is another container in which case we must load the fields directly. We mustn't load its address and then copy because loading the address would be the last reference and might cause the container to be reused prematurely. ??? Is that an old comment ?? 
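As an example of the filter handling here (hypothetical values): with a three-field tuple and filter true, false, true, source words 0 and 2 are copied into container words 0 and 1, source word 1 is evaluated but discarded, and the destination counter only advances on copied fields.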
*) val (codeTuple, loadField) = case getContainerIfPresent tuple of ContainerOnStack {container, stackOffset} => let fun getAddr(destReg, sourceWord, tail) = loadStack{dest=destReg, wordOffset=stackPtr-stackOffset+sourceWord, container=container, field=sourceWord} :: tail in (codeContainer, getAddr) end | ContainerInRegs regs => let fun copyReg(destReg, sourceWord, tail) = tail <::> moveRegister{dest=destReg, source=List.nth(regs, sourceWord)} in (codeContainer, copyReg) end | NoContainer => let val (codeTuple, tupleTarget, _) = codeToICodeRev(tuple, context, false, AnyReg, codeContainer) fun loadField(destReg: preg, sourceWord: int, tail): blockStruct list = let val (code, _, _) = wordAddressOffset(SpecificPReg destReg, tupleTarget, sourceWord, polyWordLoadSize, tail) in code end in (codeTuple, loadField) end fun copyContainer(sourceWord, destWord, tailCode) = if sourceWord = filterLength then tailCode else if BoolVector.sub(filter, sourceWord) then let val loadReg = newPReg() val code = storeInstr(loadReg, destWord, loadField(loadReg, sourceWord, tailCode)) in copyContainer(sourceWord+1, destWord+1, code) end else copyContainer(sourceWord+1, destWord, tailCode) in copyContainer(0, 0, codeTuple) end in returnUnit(destination, code, false) end | codeToICodeRev(BICLoadContainer{base, offset}, context as {stackPtr, ...}, _, destination, tailCode) = ( case getContainerIfPresent base of ContainerOnStack {container, stackOffset} => let (* If this is a local container we extract the field. *) val target = asTarget destination val finalOffset = stackPtr-stackOffset+offset val _ = finalOffset >= 0 orelse raise InternalError "offset" in (BlockSimple(LoadStack{wordOffset=finalOffset, container=container, field=offset, dest=target}) :: tailCode, target, false) end | NoContainer => let val (codeBase, baseEntry, _) = codeToICodeRev(base, context, false, AnyReg, tailCode) in wordAddressOffset(destination, baseEntry, offset, Load64, codeBase) end | ContainerInRegs regs => let (* Always copy this into a new register because the source will be a merge reg. *) val target = asTarget destination in (moveRegister{source=List.nth(regs, offset), dest=target} :: tailCode, target, false) end ) | codeToICodeRev(BICLoadOperation{ kind, address}, context, _, destination, tailCode) = codeLoadOperation(kind, address, context, asTarget destination, tailCode) | codeToICodeRev(BICStoreOperation{ kind, address, value}, context, _, destination, tailCode) = codeStoreOperation(kind, address, value, context, destination, tailCode) | codeToICodeRev(BICBlockOperation{ kind=BlockOpMove{isByteMove}, sourceLeft, destRight, length }, context, _, destination, tailCode) = (* Assume these are copying immutable data i.e. vector to vector and string to string. The simplifier now assumes that when optimising short constant moves e.g. concatenating with a constant string. *) let (* Move bytes or words from the source to the destination. Need to get the start addresses and length into new registers because they will be modified. *) val (leftAddr, codeLft) = addressToPregAddress(sourceLeft, context, tailCode) val (rightAddr, codeRt) = addressToPregAddress(destRight, context, codeLft) val (codeLength, lengthReg, _) = codeToICodeRev(length, context, false, AnyReg, codeRt) val loadOp = if isByteMove then Load8 else if is32in64 then Load32 else Load64 (* This threads the calls through two calls to loadAndStoreWithAbsolute to compute the addresses. 
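In other words, the outer loadAndStoreWithAbsolute below computes the absolute source address and passes ltReg to getDestAndMove, whose inner call computes the destination address and finally emits the block move; the untagged length and both addresses are first copied into fresh untagged registers because the instruction updates them as it runs.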
*) fun getDestAndMove(ltReg, tailCode) = let fun doMove (rtReg, code) = let val lengthReg2 = newUReg() and ltReg2 = newUReg() and rtReg2 = newUReg() in blockMove{ srcAddr=ltReg2, destAddr=rtReg2, length=lengthReg2, isByteMove=isByteMove } :: moveRegister{dest=rtReg2, source=rtReg} :: moveRegister{dest=ltReg2, source=ltReg} :: untagValue{dest=lengthReg2, source=lengthReg, isSigned=false, opSize=polyWordOpSize} :: code end in loadAndStoreWithAbsolute (rightAddr, opWordSize loadOp, loadShift loadOp, doMove, tailCode) end in returnUnit(destination, loadAndStoreWithAbsolute (leftAddr, opWordSize loadOp, loadShift loadOp, getDestAndMove, codeLength), false) end | codeToICodeRev(BICBlockOperation{ kind=BlockOpEqualByte, sourceLeft, destRight, length }, context, _, destination, tailCode) = let val target = asTarget destination val ccRef = newCCRef() (* Compare bytes for equality. Need to get the start addresses and length into new registers because they will be modified. *) val (leftAddr, codeLft) = addressToPregAddress(sourceLeft, context, tailCode) val (rightAddr, codeRt) = addressToPregAddress(destRight, context, codeLft) val (codeLength, lengthReg, _) = codeToICodeRev(length, context, false, AnyReg, codeRt) (* This threads the calls through two calls to loadAndStoreWithAbsolute to compute the addresses. *) fun getRightAndCompare(ltReg, tailCode) = let fun doComparison (rtReg, code) = let val lengthReg2 = newUReg() and ltReg2 = newUReg() and rtReg2 = newUReg() in compareByteVectors{ vec1Addr=ltReg2, vec2Addr=rtReg2, length=lengthReg2, ccRef=ccRef } :: moveRegister{dest=rtReg2, source=rtReg} :: moveRegister{dest=ltReg2, source=ltReg} :: untagValue{dest=lengthReg2, source=lengthReg, isSigned=false, opSize=polyWordOpSize} :: code end in loadAndStoreWithAbsolute (rightAddr, opWordSize Load8, loadShift Load8, doComparison, tailCode) end val testCode = loadAndStoreWithAbsolute (leftAddr, opWordSize Load8, loadShift Load8, getRightAndCompare, codeLength) in (makeBoolResultRev(CondEqual, ccRef, target, testCode), target, false) end | codeToICodeRev(BICBlockOperation{ kind=BlockOpCompareByte, sourceLeft, destRight, length }, context, _, destination, tailCode) = let val target = asTarget destination val ccRef = newCCRef() (* Similar to OpEqualByte except it returns -1, 0, +1 depending on the condition code. *) (* Compare the byte vectors. Need to get the start addresses and length into new registers because they will be modified. *) val (leftAddr, codeLft) = addressToPregAddress(sourceLeft, context, tailCode) val (rightAddr, codeRt) = addressToPregAddress(destRight, context, codeLft) val (codeLength, lengthReg, _) = codeToICodeRev(length, context, false, AnyReg, codeRt) (* This threads the calls through two calls to loadAndStoreWithAbsolute to compute the addresses. *) fun getRightAndCompare(ltReg, tailCode) = let fun doComparison (rtReg, code) = let val lengthReg2 = newUReg() and ltReg2 = newUReg() and rtReg2 = newUReg() val exitLab = newLabel() and labGreater = newLabel() and labNotGreater = newLabel() and labLess = newLabel() and labNotLess = newLabel() val mergeResult = newMergeReg() val taggedMinus1 = if is32in64 then 0wxffffffff else 0wxffffffffffffffff in (* Compare the words then a series of comparisons to set the result. TODO: The old code-generator makes the "equal" exit of compareByteVectors jump directly to code to set the result to zero. 
It then uses loadNonAddress(X0, Word64.fromInt(tag 1)) followed by conditionalSetInverted{regD=X0, regTrue=X0, regFalse=XZero, cond=CondUnsignedHigher} to set the result to one or minus one. N.B. This needs to use a 32-bit operation on 32-in-64. *) moveRegister{dest=target, source=mergeResult} :: BlockLabel exitLab :: loadNonAddressConstant{source=taggedWord64 0w1, dest=mergeResult} :: BlockLabel labGreater :: BlockFlow(Unconditional exitLab) :: loadNonAddressConstant{source=taggedMinus1, dest=mergeResult} :: BlockLabel labLess :: BlockFlow(Unconditional exitLab) :: loadNonAddressConstant{source=taggedWord64 0w0, dest=mergeResult} :: BlockLabel labNotGreater :: BlockFlow(Conditional{ ccRef=ccRef, condition=CondUnsignedHigher, trueJump=labGreater, falseJump=labNotGreater }) :: BlockLabel labNotLess :: BlockFlow(Conditional{ ccRef=ccRef, condition=CondCarryClear, trueJump=labLess, falseJump=labNotLess }) :: compareByteVectors{ vec1Addr=ltReg2, vec2Addr=rtReg2, length=lengthReg2, ccRef=ccRef } :: moveRegister{dest=rtReg2, source=rtReg} :: moveRegister{dest=ltReg2, source=ltReg} :: untagValue{dest=lengthReg2, source=lengthReg, isSigned=false, opSize=polyWordOpSize} :: code end in loadAndStoreWithAbsolute (rightAddr, opWordSize Load8, loadShift Load8, doComparison, tailCode) end val testCode = loadAndStoreWithAbsolute (leftAddr, opWordSize Load8, loadShift Load8, getRightAndCompare, codeLength) in (testCode, target, false) end | codeToICodeRev(BICAllocateWordMemory {numWords, flags, initial }, context, _, destination, tailCode) = let (* Allocate a block of memory and initialise it. *) val target = asTarget destination val (codeSize, sizeReg, _) = codeToICodeRev(numWords, context, false, AnyReg, tailCode) val (codeFlags, flagsReg, _) = codeToICodeRev(flags, context, false, AnyReg, codeSize) val (codeInit, initReg, _) = codeToICodeRev(initial, context, false, AnyReg, codeFlags) val uSizeReg = newUReg() and shiftFReg = newUReg() and lengthWord = newUReg() val absAddr = if is32in64 then newUReg() else target val untagSize = untagValue{source=sizeReg, dest=uSizeReg, opSize=polyWordOpSize, isSigned=false} :: codeInit val allocateMem = allocateMemoryVariable{ size=uSizeReg, dest=absAddr, saveRegs=[]} :: untagSize (* Make the length word by first shifting the flags into the length word reg by 55 or 23 bits. This puts the tag bit in the top bit of the size. Then insert the size into this which will overwrite the flag's tag bit. *) val makeLengthWord = bitFieldInsert{ source=uSizeReg, destAsSource=shiftFReg, dest=lengthWord, length=polyWordOpSize, immr=0w0 (*bit 0*), imms=if is32in64 then 0w23 else 0w55 (*width-1*) } :: shiftConstant{direction=Arm64ICode.ShiftLeft, dest=shiftFReg, source=flagsReg, shift=if is32in64 then 0w23 else 0w55, opSize=polyWordOpSize } :: allocateMem val setLengthWordAndInit = initialiseMem{ size=uSizeReg, addr=absAddr, init=initReg} :: storeWithConstantOffset{ source=lengthWord, base=absAddr, byteOffset= ~(Word.toInt wordSize), loadType=polyWordLoadSize } :: makeLengthWord val finalCode = if is32in64 then absoluteToObjectIndex{ source=absAddr, dest=target } :: setLengthWordAndInit else setLengthWordAndInit in (finalCode, target, false) end | codeToICodeRev(BICHandle{exp, handler, exPacketAddr}, context as { stackPtr, loopArgs, overflowBlock, ... }, isTail, destination, tailCode) = let (* As with BICCond and BICCase we need to create a new register for the result in case we need to push it to the stack. 
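Illustratively, for a handle expression in non-tail position the emitted shape is: PushExceptionHandler, a SetHandler fork, the body writing handleResult, PopExceptionHandler plus a jump over the handler, the handler label, BeginHandler binding the packet register, the handler body, and finally a single move of handleResult into the real target.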
*) val handleResult = newMergeReg() val handlerLab = newLabel() and startHandling = newLabel() val (bodyTarget, handlerTarget) = if isTail then (newPReg(), newPReg()) else (handleResult, handleResult) (* TODO: Even if we don't actually want a result we force one in here by using "asTarget". *) (* The expression cannot be treated as a tail because the handler has to be removed after. It may "exit" if it has raised an unconditional exception. If it has we mustn't generate a PopExceptionHandler because there won't be any result for resultReg. We need to add two words to the stack to account for the items pushed by PushExceptionHandler. We create an instruction to push the handler followed by a block fork to the start of the code and, potentially the handler, then a label to start the code that the handler is in effect for. *) val initialCode = BlockLabel startHandling :: BlockFlow(SetHandler{handler=handlerLab, continue=startHandling}) :: BlockSimple(PushExceptionHandler) :: tailCode val (expCode, _, expExit) = codeToICodeRev(exp, {stackPtr=stackPtr+2, loopArgs=loopArgs, currHandler=SOME handlerLab, overflowBlock=overflowBlock}, false (* Not tail *), SpecificPReg bodyTarget, initialCode) (* If this is the tail we can replace the jump at the end of the handled code with returns. If the handler has exited we don't need a return there. Otherwise we need to add an unconditional jump to skip the handler. *) val (atExpEnd, skipExpLabel) = case (isTail, expExit) of (true, true) => (* Tail and exited. *) (expCode, NONE) | (true, false) => (* Tail and not exited. *) (returnInstruction(context, bodyTarget, BlockSimple(PopExceptionHandler) :: expCode), NONE) | (false, true) => (* Not tail but exited. *) (expCode, NONE) | (false, false) => let val skipHandler = newLabel() in (BlockFlow(Unconditional skipHandler) :: BlockSimple(PopExceptionHandler) :: expCode, SOME skipHandler) end (* Make a register to hold the exception packet; BeginHandler will move the packet into it. *) val packetAddr = newPReg() val () = Array.update(locToPregArray, exPacketAddr, PregLocation packetAddr) val (handleCode, _, handleExit) = codeToICodeRev(handler, context, isTail, SpecificPReg handlerTarget, BlockSimple(BeginHandler{packetReg=packetAddr}) :: BlockLabel handlerLab :: atExpEnd) val target = asTarget destination val afterHandler = case (isTail, handleExit) of (true, true) => (* Tail and exited. *) handleCode | (true, false) => (* Tail and not exited. *) returnInstruction(context, handlerTarget, handleCode) | (false, _) => (* Not tail. *) handleCode val addLabel = case skipExpLabel of SOME lab => BlockLabel lab:: afterHandler | NONE => afterHandler in (moveRegister{source=handleResult, dest=target} :: addLabel, target, isTail) end and codeConditionRev(condition, context, jumpOn, jumpLabel, tailCode) = (* Jump optimisation is done later. Just generate the general case. Load the value into a register and compare it with 1 (true) *) let val ccRef = newCCRef() val (testCode, testReg, _) = codeToICodeRev(condition, context, false, AnyReg, tailCode) val noJumpLabel = newLabel() in BlockLabel noJumpLabel :: BlockFlow(Conditional{ccRef=ccRef, condition=if jumpOn then CondEqual else CondNotEqual, trueJump=jumpLabel, falseJump=noJumpLabel}) :: (* Compare: SUBS XZ,reg,3. Can use 32-bit comparison because it's either tagged 0 or tagged 1. 
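Since tagged false is 1 and tagged true is 3, subtracting tagged 1, i.e. 3, sets the Z flag exactly when the value is true; a 32-bit subtraction towards the zero register suffices because both encodings fit in the low word.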
*) addSubImmediate{source=testReg, immed=taggedWord 0w1, isAdd=false, dest=ZeroReg, length=OpSize32, ccRef=SOME ccRef} :: testCode end and codeToICodeUnaryRev({oper=NotBoolean, arg1}, context, _, destination, tailCode) = let val target = asTarget destination val ccRef = newCCRef() val (argCode, testDest, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) in (* Test the argument and return a boolean result. If either the argument is a condition or the result is used in a test this will be better than using XOR. *) (makeBoolResultRev(CondNotEqual, ccRef, target, addSubImmediate{source=testDest, immed=taggedWord 0w1, isAdd=false, dest=ZeroReg, length=OpSize32 (* Always either tagged 0 or tagged 1 *), ccRef=SOME ccRef} :: argCode), target, false) end | codeToICodeUnaryRev({oper=IsTaggedValue, arg1}, context, _, destination, tailCode) = let val target = asTarget destination val ccRef = newCCRef() val (argCode, testDest, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) in (* Test the argument and return a boolean result. This ought to be optimised at a lower level to use a test-and-branch. *) (makeBoolResultRev(CondNotEqual, ccRef, target, logicalImmediate{source=testDest, immed=0w1 (* The tag bit*), logOp=LogAnd, dest=ZeroReg, length=OpSize32 (* Always either tagged 0 or tagged 1 *), ccRef=SOME ccRef} :: argCode), target, false) end | codeToICodeUnaryRev({oper=MemoryCellLength, arg1}, context, _, destination, tailCode) = let val ureg1 = newUReg() and ureg2 = newUReg() val (codeBase, baseReg, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) (* Load the word at -1 (words) into a ureg *) val (codeLoad, _, _) = wordAddressOffset(SpecificPReg ureg1, baseReg, ~1, polyWordLoadSize, codeBase) (* Select 56 or 24 bits and shift it left. This disassembles as UBFIZ..*) val lsb = 0w1 and width = if is32in64 then 0w24 else 0w56 (* Encoding for unsignedBitfieldInsertinZeros64/32 *) val immr = if is32in64 then Word.~ lsb mod 0w32 else Word.~ lsb mod 0w64 val imms = width-0w1 val maskAndShift = bitFieldShift{source=ureg1, dest=ureg2, isSigned=false, length=polyWordOpSize, immr=immr, imms=imms} :: codeLoad val target = asTarget destination val addTag = addSubImmediate{dest=SomeReg target, source=ureg2, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=true} :: maskAndShift in (addTag, target, false) end | codeToICodeUnaryRev({oper=MemoryCellFlags, arg1}, context, _, destination, tailCode) = let (* Load the flags byte and tag it. *) val (codeBase, baseReg, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (codeRealBase, realBaseReg) = getAbsoluteAddress(codeBase, baseReg) val ureg = newUReg() val codeLoad = loadWithConstantOffset{ base=realBaseReg, dest=ureg, byteOffset=flagsByteOffset, loadType=Load8 } :: codeRealBase val target = asTarget destination val withTag = tagValue{ source=ureg, dest=target, isSigned=false, opSize=OpSize32 } :: codeLoad in (withTag, target, false) end | codeToICodeUnaryRev({oper=ClearMutableFlag, arg1}, context, _, destination, tailCode) = let val (codeBase, baseReg, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (codeRealBase, realBaseReg) = getAbsoluteAddress(codeBase, baseReg) val ureg1 = newUReg() and ureg2 = newUReg() (* Load the flags, mask off the mutable bit and store it back. 
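For example, a mutable byte cell whose flags byte is 0wx41 becomes 0wx01 after the AND with the mask below, Word64.xorb(0wxffffffff, 0wx40) = 0wxffffffbf, clearing only bit 6, the mutable bit.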
*) val code = storeWithConstantOffset{ base=realBaseReg, source=ureg2, byteOffset=flagsByteOffset, loadType=Load8 } :: logicalImmediate{ source=ureg1, dest=SomeReg ureg2, ccRef=NONE, immed=Word64.xorb(0wxffffffff, 0wx40), logOp=LogAnd, length=OpSize32 } :: loadWithConstantOffset{ base=realBaseReg, dest=ureg1, byteOffset=flagsByteOffset, loadType=Load8 } :: codeRealBase in returnUnit(destination, code, false) end | codeToICodeUnaryRev({oper=LongWordToTagged, arg1}, context, _, destination, tailCode) = let val (codeBase, baseReg, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val uReg = newUReg() val target = asTarget destination val code = tagValue{ source=uReg, dest=target, isSigned=false, opSize=polyWordOpSize } :: unboxLarge{ source=baseReg, dest=uReg } :: codeBase in (code, target, false) end | codeToICodeUnaryRev({oper=SignedToLongWord, arg1}, context, _, destination, tailCode) = let val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val uReg = newUReg() val target = asTarget destination (* We can use a single instruction here on both 32-in-64 and native 64-bits. On 64-bits this is equivalent to an arithmetic shift; on 32-bits it propagates the sign bit into the high-order part. *) val code = boxLarge{ source=uReg, dest=target, saveRegs=[] } :: bitFieldShift{ source=aReg1, dest=uReg, isSigned=true, length=OpSize64, immr=0w1, imms=if is32in64 then 0wx1f else 0wx3f } :: arg1Code in (code, target, false) end | codeToICodeUnaryRev({oper=UnsignedToLongWord, arg1}, context, _, destination, tailCode) = let val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val uReg = newUReg() val target = asTarget destination (* This amounts to a logical shift. Since the top half of the register is zero in 32-in-64 we don't have to select just the low word but there's no advantage in not doing so. *) val code = boxLarge{ source=uReg, dest=target, saveRegs=[] } :: bitFieldShift{ source=aReg1, dest=uReg, isSigned=false, length=OpSize64, immr=0w1, imms=if is32in64 then 0wx1f else 0wx3f } :: arg1Code in (code, target, false) end | codeToICodeUnaryRev({oper=RealAbs precision, arg1}, context, _, destination, tailCode) = let val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val uReg1 = newUReg() and uReg2 = newUReg() val target = asTarget destination val fpSize = precisionToFpSize precision val fpOp = case precision of PrecSingle => AbsFloat | PrecDouble => AbsDouble val code = boxTagFloat{ floatSize=fpSize, source=uReg2, dest=target, saveRegs=[] } :: unaryFloatingPt{ source=uReg1, dest=uReg2, fpOp=fpOp } :: unboxTagFloat{ floatSize=fpSize, source=aReg1, dest=uReg1 } :: arg1Code in (code, target, false) end | codeToICodeUnaryRev({oper=RealNeg precision, arg1}, context, _, destination, tailCode) = let val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val uReg1 = newUReg() and uReg2 = newUReg() val target = asTarget destination val fpSize = precisionToFpSize precision val fpOp = case precision of PrecSingle => NegFloat | PrecDouble => NegDouble val code = boxTagFloat{ floatSize=fpSize, source=uReg2, dest=target, saveRegs=[] } :: unaryFloatingPt{ source=uReg1, dest=uReg2, fpOp=fpOp } :: unboxTagFloat{ floatSize=fpSize, source=aReg1, dest=uReg1 } :: arg1Code in (code, target, false) end | codeToICodeUnaryRev({oper=RealFixedInt precision, arg1}, context, _, destination, tailCode) = let (* Convert a tagged integer (FixedInt.int) to float or double. 
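A hypothetical trace: tagged 5 arrives as the bit pattern 1011; untagValue with isSigned=true arithmetic-shifts it to 5, convertIntToFloat, typically an SCVTF instruction, produces 5.0 in a floating-point register, and boxTagFloat wraps that as the ML result.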
*) val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val uReg1 = newUReg() and uReg2 = newUReg() val target = asTarget destination val fpSize = precisionToFpSize precision val code = boxTagFloat{ floatSize=fpSize, source=uReg2, dest=target, saveRegs=[] } :: convertIntToFloat{ source=uReg1, dest=uReg2, srcSize=polyWordOpSize, destSize=fpSize } :: untagValue{ source=aReg1, dest=uReg1, opSize=polyWordOpSize, isSigned=true } :: arg1Code in (code, target, false) end | codeToICodeUnaryRev({oper=FloatToDouble, arg1}, context, _, destination, tailCode) = let val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val uReg1 = newUReg() and uReg2 = newUReg() val target = asTarget destination val code = boxTagFloat{ floatSize=Double64, source=uReg2, dest=target, saveRegs=[] } :: unaryFloatingPt{ source=uReg1, dest=uReg2, fpOp=ConvFloatToDble } :: unboxTagFloat{ floatSize=Float32, source=aReg1, dest=uReg1 } :: arg1Code in (code, target, false) end | codeToICodeUnaryRev({oper=DoubleToFloat, arg1}, context, _, destination, tailCode) = let val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val uReg1 = newUReg() and uReg2 = newUReg() val target = asTarget destination val code = boxTagFloat{ floatSize=Float32, source=uReg2, dest=target, saveRegs=[] } :: unaryFloatingPt{ source=uReg1, dest=uReg2, fpOp=ConvDbleToFloat } :: unboxTagFloat{ floatSize=Double64, source=aReg1, dest=uReg1 } :: arg1Code in (code, target, false) end | codeToICodeUnaryRev({oper=RealToInt(precision, rounding), arg1}, context, _, destination, tailCode) = let (* Convert a float or double to a tagged int. We could get an overflow in either the conversion to integer or in the conversion to a tagged value. Fortunately if the conversion detects an overflow it sets the result to a value that will cause an overflow in the addition. *) val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val target = asTarget destination val chkOverflow = newCCRef() val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() val fpSize = precisionToFpSize precision val code = (* Set the tag bit. *) addSubImmediate{dest=SomeReg target, source=uReg3, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=true} :: checkOverflow(CondOverflow, context, chkOverflow) @ (* Add it to itself and set the condition code. *) addSubRegister{base=uReg2, shifted=uReg2, dest=SomeReg uReg3, length=polyWordOpSize, ccRef=SOME chkOverflow, isAdd=true, shift=ShiftNone} :: convertFloatToInt{ source=uReg1, dest=uReg2, srcSize=fpSize, destSize=polyWordOpSize, rounding=rounding } :: unboxTagFloat{ floatSize=fpSize, source=aReg1, dest=uReg1 } :: arg1Code in (code, target, false) end | codeToICodeUnaryRev({oper=TouchAddress, arg1}, context, _, destination, tailCode) = let val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) in returnUnit(destination, touchValue{source=aReg1} :: arg1Code, false) end | codeToICodeUnaryRev({oper=AllocCStack, arg1}, context, _, destination, tailCode) = let (* Allocate space on the stack. The higher levels have already aligned the size to a multiple of 16. The number of bytes to allocate is a Word.word value. The result is a boxed large word. 
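In outline: the tagged byte count is untagged, addSubXSP with isAdd=false subtracts it from the C stack pointer because that stack grows downwards, and the new stack pointer value is boxed as the result. For example a request for 0w32 bytes arrives as the bit pattern 65, untags to 32 and moves XSP down by 32.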
*) val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val uReg1 = newUReg() and uReg2 = newUReg() val target = asTarget destination val code = boxLarge{ source=uReg2, dest=target, saveRegs=[] } :: addSubXSP{ source=uReg1, dest=SomeReg uReg2, isAdd=false } :: untagValue{ source=aReg1, dest=uReg1, isSigned=false, opSize=polyWordOpSize } :: arg1Code in (code, target, false) end | codeToICodeUnaryRev({oper=LockMutex, arg1}, context, _, destination, tailCode) = (* The earliest versions of the Arm8 do not have the LDADD instruction which will do this directly. To preserve compatibility we use LDAXR/STLXR which require a loop. *) let local val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) in val (baseCode, baseReg) = getAbsoluteAddress(arg1Code, aReg1) end - val target = asTarget destination -(* val loopLabel = newLabel() and noLoopLabel = newLabel() + val loopLabel = newLabel() and noLoopLabel = newLabel() val target = asTarget destination val ccRef1 = newCCRef() and ccRef2 = newCCRef() - val uRegNew = newUReg() and uRegTest = newUReg() and uRegOld = newUReg() - (* N.B. in reverse order. *) - val code = - (* The result is true if the old value was zero. *) - addSubImmediate{source=uRegOld, dest=ZeroReg, immed=0w0, isAdd=false, length=OpSize64, ccRef=SOME ccRef2} :: - memoryBarrier :: (* Put in the memory barrier. *) - (* If the result is zero we've been successful otherwise we loop. *) - BlockLabel noLoopLabel :: - BlockFlow(Conditional{ ccRef=ccRef1, condition=CondNotEqual, trueJump=loopLabel, falseJump=noLoopLabel }) :: - addSubImmediate{source=uRegTest, dest=ZeroReg, immed=0w0, isAdd=false, length=OpSize32, ccRef=SOME ccRef1} :: - (* Add and try to store the result *) - storeReleaseExclusive{ base=baseReg, source=SomeReg uRegNew, result=uRegTest } :: - addSubImmediate{source=uRegOld, dest=SomeReg uRegNew, immed=0w1, isAdd=true, length=OpSize64, ccRef=NONE} :: - loadAcquireExclusive{ base=baseReg, dest=uRegOld } :: - BlockLabel loopLabel :: baseCode*) - val ccRef2 = newCCRef() - val uRegIncr = newUReg() and uRegOld = newUReg() + val uRegNew = newUReg() and uRegTest = newUReg() and uRegOld = newUReg() and uRegIncr = newUReg() val code = - baseCode <::> loadNonAddressConstant{ source=0w1, dest=uRegIncr } <::> - atomicOperation{atOp=LoadAddAL, base=baseReg, source=SomeReg uRegIncr, dest=SomeReg uRegOld} <::> + if useLSEAtomics + then baseCode <::> loadNonAddressConstant{ source=0w1, dest=uRegIncr } <::> + atomicOperation{atOp=LoadAddAcquire, base=baseReg, source=SomeReg uRegIncr, dest=SomeReg uRegOld} <::> (* If the previous value was zero we've set it to one and we've got the lock. *) addSubImmediate{source=uRegOld, dest=ZeroReg, immed=0w0, isAdd=false, length=OpSize64, ccRef=SOME ccRef2} + else + (* The result is true if the old value was zero. *) + addSubImmediate{source=uRegOld, dest=ZeroReg, immed=0w0, isAdd=false, length=OpSize64, ccRef=SOME ccRef2} :: + memoryBarrier :: (* Put in the memory barrier. *) + (* If the result is zero we've been successful otherwise we loop. 
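To spell out the exclusive-access protocol used here: loadAcquireExclusive (LDAXR) reads the current value and arms the exclusive monitor, and storeReleaseExclusive (STLXR) writes old+1, setting its result register to zero only if the monitor was still held; a non-zero result means another thread touched the word and the whole read-modify-write must be retried. The lock was free exactly when the old value read back was zero.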
*) + BlockLabel noLoopLabel :: + BlockFlow(Conditional{ ccRef=ccRef1, condition=CondNotEqual, trueJump=loopLabel, falseJump=noLoopLabel }) :: + addSubImmediate{source=uRegTest, dest=ZeroReg, immed=0w0, isAdd=false, length=OpSize32, ccRef=SOME ccRef1} :: + (* Add and try to store the result *) + storeReleaseExclusive{ base=baseReg, source=SomeReg uRegNew, result=uRegTest } :: + addSubImmediate{source=uRegOld, dest=SomeReg uRegNew, immed=0w1, isAdd=true, length=OpSize64, ccRef=NONE} :: + loadAcquireExclusive{ base=baseReg, dest=uRegOld } :: + BlockLabel loopLabel :: baseCode in (makeBoolResultRev(CondEqual, ccRef2, target, code), target, false) end | codeToICodeUnaryRev({oper=TryLockMutex, arg1}, context, _, destination, tailCode) = (* *) let - (* Could use LDUMAXAL to set it the greater of the current value or 1. *) local val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) in val (baseCode, baseReg) = getAbsoluteAddress(arg1Code, aReg1) end val target = asTarget destination - (* val loopLabel = newLabel() and noLoopLabel = newLabel() and okLabel = newLabel() val ccRef0 = newCCRef() and ccRef1 = newCCRef() and ccRef2 = newCCRef() val uRegNew = newUReg() and uRegTest = newUReg() and uRegOld = newUReg() val code = - (* The result is true if the old value was zero. *) - addSubImmediate{source=uRegOld, dest=ZeroReg, immed=0w0, isAdd=false, length=OpSize64, ccRef=SOME ccRef2} :: - memoryBarrier :: (* Put in the memory barrier. *) - (* If the result is zero we've been successful otherwise we loop. *) - BlockLabel noLoopLabel :: - BlockFlow(Conditional{ ccRef=ccRef1, condition=CondNotEqual, trueJump=loopLabel, falseJump=noLoopLabel }) :: - addSubImmediate{source=uRegTest, dest=ZeroReg, immed=0w0, isAdd=false, length=OpSize32, ccRef=SOME ccRef1} :: - (* If the lock wasn't taken set it to one to lock it. *) - storeReleaseExclusive{ base=baseReg, source=SomeReg uRegNew, result=uRegTest } :: - loadNonAddressConstant{source=0w1, dest=uRegNew } :: BlockLabel okLabel :: - (* If it's not zero don't try to store anything back and exit the loop. *) - BlockFlow(Conditional{ ccRef=ccRef0, condition=CondNotEqual, trueJump=noLoopLabel, falseJump=okLabel }) :: - addSubImmediate{source=uRegOld, dest=ZeroReg, immed=0w0, isAdd=false, length=OpSize64, ccRef=SOME ccRef0} :: - (* Get the old value and see if it's zero i.e. unlocked. *) - loadAcquireExclusive{ base=baseReg, dest=uRegOld } :: - BlockLabel loopLabel :: baseCode*) - val uRegNew = newUReg() and uRegOld = newUReg() - val ccRef2 = newCCRef() - val code = - baseCode <::> loadNonAddressConstant{ source=0w1, dest=uRegNew } <::> - atomicOperation{atOp=LoadUmaxAL, base=baseReg, source=SomeReg uRegNew, dest=SomeReg uRegOld} <::> + if useLSEAtomics + then baseCode <::> loadNonAddressConstant{ source=0w1, dest=uRegNew } <::> + atomicOperation{atOp=LoadUMaxAcquire, base=baseReg, source=SomeReg uRegNew, dest=SomeReg uRegOld} <::> (* If the previous value was zero we've set it to one and we've got the lock. *) addSubImmediate{source=uRegOld, dest=ZeroReg, immed=0w0, isAdd=false, length=OpSize64, ccRef=SOME ccRef2} + else + (* The result is true if the old value was zero. *) + addSubImmediate{source=uRegOld, dest=ZeroReg, immed=0w0, isAdd=false, length=OpSize64, ccRef=SOME ccRef2} :: + memoryBarrier :: (* Put in the memory barrier. *) + (* If the result is zero we've been successful otherwise we loop. 
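Note the difference from LockMutex: this version only retries when the exclusive store itself fails; if the old value was non-zero the mutex is already held and we fall out of the loop without storing anything. The LSE path above gets the same effect in one instruction by taking an atomic unsigned maximum with 1, which leaves a held lock unchanged while still returning the old value.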
*) + BlockLabel noLoopLabel :: + BlockFlow(Conditional{ ccRef=ccRef1, condition=CondNotEqual, trueJump=loopLabel, falseJump=noLoopLabel }) :: + addSubImmediate{source=uRegTest, dest=ZeroReg, immed=0w0, isAdd=false, length=OpSize32, ccRef=SOME ccRef1} :: + (* If the lock wasn't taken set it to one to lock it. *) + storeReleaseExclusive{ base=baseReg, source=SomeReg uRegNew, result=uRegTest } :: + loadNonAddressConstant{source=0w1, dest=uRegNew } :: BlockLabel okLabel :: + (* If it's not zero don't try to store anything back and exit the loop. *) + BlockFlow(Conditional{ ccRef=ccRef0, condition=CondNotEqual, trueJump=noLoopLabel, falseJump=okLabel }) :: + addSubImmediate{source=uRegOld, dest=ZeroReg, immed=0w0, isAdd=false, length=OpSize64, ccRef=SOME ccRef0} :: + (* Get the old value and see if it's zero i.e. unlocked. *) + loadAcquireExclusive{ base=baseReg, dest=uRegOld } :: + BlockLabel loopLabel :: baseCode in (makeBoolResultRev(CondEqual, ccRef2, target, code), target, false) end | codeToICodeUnaryRev({oper=UnlockMutex, arg1}, context, _, destination, tailCode) = (* Get the previous value of the mutex to see if another thread had tried to lock it and set the result to zero. *) let (* Could use SWAPAL *) local val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) in val (baseCode, baseReg) = getAbsoluteAddress(arg1Code, aReg1) end val target = asTarget destination - (* val loopLabel = newLabel() and noLoopLabel = newLabel() val ccRef1 = newCCRef() and ccRef2 = newCCRef() val uRegTest = newUReg() and uRegOld = newUReg() val code = - (* The result is true if the old value was one. i.e. we were the only thread - that locked it. *) - addSubImmediate{source=uRegOld, dest=ZeroReg, immed=0w1, isAdd=false, length=OpSize64, ccRef=SOME ccRef2} :: - memoryBarrier :: (* Put in the memory barrier. *) - (* If the result is zero we've been successful otherwise we loop. *) - BlockLabel noLoopLabel :: - BlockFlow(Conditional{ ccRef=ccRef1, condition=CondNotEqual, trueJump=loopLabel, falseJump=noLoopLabel }) :: - addSubImmediate{source=uRegTest, dest=ZeroReg, immed=0w0, isAdd=false, length=OpSize32, ccRef=SOME ccRef1} :: - (* Try to set this to zero *) - storeReleaseExclusive{ base=baseReg, source=ZeroReg, result=uRegTest } :: - loadAcquireExclusive{ base=baseReg, dest=uRegOld } :: - BlockLabel loopLabel :: baseCode - *) - val ccRef2 = newCCRef() - val uRegOld = newUReg() - val code = baseCode <::> atomicOperation{atOp=SwapAL, base=baseReg, source=ZeroReg, dest=SomeReg uRegOld} <::> - memoryBarrier <::> addSubImmediate{source=uRegOld, dest=ZeroReg, immed=0w1, isAdd=false, length=OpSize64, ccRef=SOME ccRef2} + if useLSEAtomics + then baseCode <::> atomicOperation{atOp=SwapRelease, base=baseReg, source=ZeroReg, dest=SomeReg uRegOld} <::> + addSubImmediate{source=uRegOld, dest=ZeroReg, immed=0w1, isAdd=false, length=OpSize64, ccRef=SOME ccRef2} + else + (* The result is true if the old value was one. i.e. we were the only thread + that locked it. *) + addSubImmediate{source=uRegOld, dest=ZeroReg, immed=0w1, isAdd=false, length=OpSize64, ccRef=SOME ccRef2} :: + memoryBarrier :: (* Put in the memory barrier. *) + (* If the result is zero we've been successful otherwise we loop. 
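Either way the word is atomically reset to zero and the old contents are examined: an old value of exactly one means no other thread ever queued on the mutex, so the caller can skip waking waiters. With LSE a single swap with release ordering suffices; the exclusive-pair loop adds an explicit barrier as well.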
*) + BlockLabel noLoopLabel :: + BlockFlow(Conditional{ ccRef=ccRef1, condition=CondNotEqual, trueJump=loopLabel, falseJump=noLoopLabel }) :: + addSubImmediate{source=uRegTest, dest=ZeroReg, immed=0w0, isAdd=false, length=OpSize32, ccRef=SOME ccRef1} :: + (* Try to set this to zero *) + storeReleaseExclusive{ base=baseReg, source=ZeroReg, result=uRegTest } :: + loadAcquireExclusive{ base=baseReg, dest=uRegOld } :: + BlockLabel loopLabel :: baseCode in (makeBoolResultRev(CondEqual, ccRef2, target, code), target, false) end and codeToICodeBinaryRev({oper=WordComparison{test, isSigned}, arg1, arg2}, context, _, destination, tailCode) = let (* Comparisons. This is now only used for tagged values, not for pointer equality. *) val ccRef = newCCRef() val (testCode1, testDest1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (testCode2, testDest2, _) = codeToICodeRev(arg2, context, false, AnyReg, testCode1) val comparison = addSubRegister{base=testDest1, shifted=testDest2, dest=ZeroReg, length=polyWordOpSize, ccRef=SOME ccRef, isAdd=false, shift=ShiftNone} :: testCode2 val target = asTarget destination open BuiltIns val cond = case (test, isSigned) of (TestEqual, _) => CondEqual | (TestLess, true) => CondSignedLess | (TestLessEqual, true) => CondSignedLessEq | (TestGreater, true) => CondSignedGreater | (TestGreaterEqual, true) => CondSignedGreaterEq | (TestLess, false) => CondCarryClear | (TestLessEqual, false) => CondUnsignedLowOrEq | (TestGreater, false) => CondUnsignedHigher | (TestGreaterEqual, false) => CondCarrySet | (TestUnordered, _) => raise InternalError "WordComparison: TestUnordered" in (makeBoolResultRev(cond, ccRef, target, comparison), target, false) end | codeToICodeBinaryRev({oper=FixedPrecisionArith ArithAdd, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) (* We need to subtract the tag from one of the arguments and then do the addition. The optimiser will do the subtraction at compile time if we subtract from a constant so try to put the constant in the second arg. 
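The identity being relied on, assuming tagged n is represented as 2*n+1: tagged a + (tagged b - 1) = (2a+1) + 2b = 2(a+b) + 1 = tagged (a+b), so removing one tag bit is the only correction needed and the add itself can set the overflow flag for the check that follows.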
*) val (firstReg, secondReg) = case arg1 of BICConstnt _ => (aReg2, aReg1) | _ => (aReg1, aReg2) val uReg = newUReg() val chkOverflow = newCCRef() val code = checkOverflow(CondOverflow, context, chkOverflow) @ addSubRegister{base=firstReg, shifted=uReg, dest=SomeReg target, length=polyWordOpSize, ccRef=SOME chkOverflow, isAdd=true, shift=ShiftNone} :: addSubImmediate{dest=SomeReg uReg, source=secondReg, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false}:: arg2Code in (code , target, false) end | codeToICodeBinaryRev({oper=FixedPrecisionArith ArithSub, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg = newUReg() val chkOverflow = newCCRef() val code = checkOverflow(CondOverflow, context, chkOverflow) @ addSubRegister{base=aReg1, shifted=uReg, dest=SomeReg target, length=polyWordOpSize, ccRef=SOME chkOverflow, isAdd=false, shift=ShiftNone} :: addSubImmediate{dest=SomeReg uReg, source=aReg2, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false}:: arg2Code in (code , target, false) end | codeToICodeBinaryRev({oper=FixedPrecisionArith ArithMult, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() and uReg4 = newUReg() val chkOverflow = newCCRef() (* Untag one argument. subtract the tag from the second, multiply and add back the tag. *) val multiplyCode = addSubImmediate{dest=SomeReg target, source=uReg3, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=true} :: multiplication{kind=if is32in64 then SignedMultAddLong else MultAdd64, dest=uReg3, sourceA=ZeroReg, sourceM=uReg1, sourceN=uReg2} :: addSubImmediate{dest=SomeReg uReg2, source=aReg2, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false} :: untagValue{ source=aReg1, dest=uReg1, isSigned=true, opSize=polyWordOpSize } :: arg2Code (* Overflow check: The condition for overflow is that the high order part (64-bits in native 64-bits, 32-bits in 32-in-64) must be zero if the result is positive and all ones if the result is negative. The high-order part is in uReg3 in 32-in-64 since we've already used SignedMultAddLong but in native 64-bits we need to use SignedMultHigh to get the high order part. In both cases we can use a comparison with ShiftASR to give a value containing just the sign of the result. *) val checkOverflowCode = if is32in64 then addSubRegister{ base=uReg4, shifted=target, dest=ZeroReg, ccRef=SOME chkOverflow, isAdd=false, length=OpSize32, shift=ShiftASR 0w31 } :: shiftConstant{direction=Arm64ICode.ShiftRightArithmetic, source=uReg3, dest=uReg4, shift=0w32, opSize=OpSize64 (* Have to start with 64-bits *)} :: multiplyCode else addSubRegister{ base=uReg4, shifted=target, dest=ZeroReg, ccRef=SOME chkOverflow, isAdd=false, length=OpSize64, shift=ShiftASR 0w63 } :: multiplication{kind=SignedMultHigh, dest=uReg4, sourceA=ZeroReg, sourceM=uReg1, sourceN=uReg2} :: multiplyCode val code = checkOverflow(CondNotEqual, context, chkOverflow) @ checkOverflowCode in (code, target, false) end | codeToICodeBinaryRev({oper=FixedPrecisionArith ArithQuot, arg1, arg2}, context, _, destination, tailCode) = let (* The word version avoids an extra shift. 
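To sketch why that works for unsigned words, assuming tagged n = 2*n+1: for non-negative a and b, (2a) div b differs from 2*(a div b) by at most the low bit, so the word version can divide the dividend with just its tag subtracted by the untagged divisor and OR the tag into the result. That reasoning does not carry over to signed quotients.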
Don't do that here at least for the moment. Division by zero and overflow are checked for at the higher level. *) val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() val code = tagValue { source=uReg3, dest=target, opSize=polyWordOpSize, isSigned=true } :: division{isSigned=true, opSize=polyWordOpSize, dest=uReg3, dividend=uReg1, divisor=uReg2} :: untagValue{ source=aReg2, dest=uReg2, isSigned=true, opSize=polyWordOpSize } :: untagValue{ source=aReg1, dest=uReg1, isSigned=true, opSize=polyWordOpSize } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=FixedPrecisionArith ArithRem, arg1, arg2}, context, _, destination, tailCode) = let (* For the moment we remove the tags and then retag afterwards. The word version avoids this but at least for the moment we do it the longer way. *) (* There's no direct way to get the remainder - have to use divide and multiply. *) val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() and uReg4 = newUReg() val code = tagValue { source=uReg4, dest=target, opSize=polyWordOpSize, isSigned=true } :: multiplication{kind=if is32in64 then MultSub32 else MultSub64, dest=uReg4, sourceM=uReg3, sourceN=uReg2, sourceA=SomeReg uReg1} :: division{isSigned=true, opSize=polyWordOpSize, dest=uReg3, dividend=uReg1, divisor=uReg2} :: untagValue{ source=aReg2, dest=uReg2, isSigned=true, opSize=polyWordOpSize } :: untagValue{ source=aReg1, dest=uReg1, isSigned=true, opSize=polyWordOpSize } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=FixedPrecisionArith ArithDiv, ...}, _, _, _, _) = raise InternalError "unimplemented operation: FixedPrecisionArith ArithDiv" | codeToICodeBinaryRev({oper=FixedPrecisionArith ArithMod, ...}, _, _, _, _) = raise InternalError "unimplemented operation: FixedPrecisionArith ArithMod" | codeToICodeBinaryRev({oper=WordArith ArithAdd, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) (* We need to subtract the tag from one of the arguments and then do the addition. The optimiser will do the subtraction at compile time if we subtract from a constant so try to put the constant in the second arg. 
*) val (firstReg, secondReg) = case arg1 of BICConstnt _ => (aReg2, aReg1) | _ => (aReg1, aReg2) val uReg = newUReg() val code = addSubRegister{base=firstReg, shifted=uReg, dest=SomeReg target, length=polyWordOpSize, ccRef=NONE, isAdd=true, shift=ShiftNone} :: addSubImmediate{dest=SomeReg uReg, source=secondReg, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false} :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=WordArith ArithSub, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg = newUReg() (* TODO: If the first argument is a constant we could add one to that rather than subtracting one from the second argument. We're not concerned with overflow. *) val code = addSubRegister{base=aReg1, shifted=uReg, dest=SomeReg target, length=polyWordOpSize, ccRef=NONE, isAdd=false, shift=ShiftNone} :: addSubImmediate{dest=SomeReg uReg, source=aReg2, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false} :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=WordArith ArithMult, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() (* Untag one argument, subtract the tag from the second, multiply and add back the tag. *) val code = addSubImmediate{dest=SomeReg target, source=uReg3, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=true} :: multiplication{kind=if is32in64 then MultAdd32 else MultAdd64, dest=uReg3, sourceA=ZeroReg, sourceM=uReg1, sourceN=uReg2} :: addSubImmediate{dest=SomeReg uReg2, source=aReg2, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false} :: untagValue{ source=aReg1, dest=uReg1, isSigned=false, opSize=polyWordOpSize } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=WordArith ArithDiv, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() (* Untag the divisor (into uReg2) and subtract the tag from the dividend (into uReg1), then divide and OR in the tag. The tag may have been set already depending on the result of the division. *) val code = logicalImmediate{dest=SomeReg target, source=uReg3, immed=0w1, length=polyWordOpSize, ccRef=NONE, logOp=LogOr} :: division{isSigned=false, opSize=polyWordOpSize, dest=uReg3, dividend=uReg1, divisor=uReg2} :: addSubImmediate{dest=SomeReg uReg1, source=aReg1, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false} :: untagValue{ source=aReg2, dest=uReg2, isSigned=false, opSize=polyWordOpSize } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=WordArith ArithMod, arg1, arg2}, context, _, destination, tailCode) = let (* There's no direct way to get the remainder - have to use divide and multiply.
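AArch64 provides no remainder instruction, so the standard idiom is quotient then multiply-subtract: q = a quot b via SDIV and then r = a - q*b in a single step, which is what the MultSub multiplication below amounts to.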
*) val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() and uReg4 = newUReg() (* Untag the divisor (into uReg2) and subtract the tag from the dividend (into uReg1), then divide. Because the dividend keeps its tag bit the bottom bit of the quotient may be set, depending on the result of the division, so it has to be cleared before the multiplication. *) val tagBitMask = Word64.<<(Word64.fromInt ~1, 0w1) (* Requires a 64-bit AND. *) val code = (* Multiply the result of the division by the divisor and subtract this from the original, tagged dividend. This leaves us a tagged value so it can go straight into the result. *) multiplication{kind=if is32in64 then MultSub32 else MultSub64, dest=target, sourceM=uReg4, sourceN=uReg2, sourceA=SomeReg aReg1} :: (* Clear the bottom bit before the multiplication. *) logicalImmediate{dest=SomeReg uReg4, source=uReg3, immed=tagBitMask, length=OpSize64, ccRef=NONE, logOp=LogAnd} :: division{isSigned=false, opSize=polyWordOpSize, dest=uReg3, dividend=uReg1, divisor=uReg2} :: addSubImmediate{dest=SomeReg uReg1, source=aReg1, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false} :: untagValue{ source=aReg2, dest=uReg2, isSigned=false, opSize=polyWordOpSize } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=WordArith _, ...}, _, _, _, _) = raise InternalError "WordArith - unimplemented instruction" | codeToICodeBinaryRev({oper=WordLogical LogicalAnd, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) (* Since both values are tagged the tag will be preserved. *) val code = logicalRegister{base=aReg1, shifted=aReg2, dest=SomeReg target, length=polyWordOpSize, ccRef=NONE, logOp=LogAnd, shift=ShiftNone} :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=WordLogical LogicalOr, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) (* Since both values are tagged the tag will be preserved. *) val code = logicalRegister{base=aReg1, shifted=aReg2, dest=SomeReg target, length=polyWordOpSize, ccRef=NONE, logOp=LogOr, shift=ShiftNone} :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=WordLogical LogicalXor, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) (* If we just XOR the values together the tag bit in the result will be zero. It's better to remove one of the tag bits beforehand. As with Add, we try to choose a constant.
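The identity, with tagged n = 2*n+1: tagged a XOR (tagged b - 1) = (2a+1) XOR 2b = 2 (a XOR b) + 1 = tagged (a XOR b), because the subtraction merely clears the low bit of one operand, so the result needs no separate retagging.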
*) val (firstReg, secondReg) = case arg1 of BICConstnt _ => (aReg2, aReg1) | _ => (aReg1, aReg2) val uReg = newUReg() val code = logicalRegister{base=firstReg, shifted=uReg, dest=SomeReg target, length=polyWordOpSize, ccRef=NONE, logOp=LogXor, shift=ShiftNone} :: addSubImmediate{dest=SomeReg uReg, source=secondReg, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false} :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=WordShift ShiftLeft, arg1, arg2}, context, _, destination, tailCode) = let val ureg1 = newUReg() and ureg2 = newUReg() and ureg3 = newUReg() val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val code = (* Put back the tag. *) logicalImmediate{ source=ureg3, dest=SomeReg target, ccRef=NONE, immed=0w1, logOp=LogOr, length=polyWordOpSize } :: (* Do the shift *) shiftRegister{direction=Arm64ICode.ShiftLeft, dest=ureg3, source=ureg1, shift=ureg2, opSize=polyWordOpSize} :: (* Untag the shift amount. Since it's at most 64 we can use a 32-bit operation. *) untagValue{source=aReg2, dest=ureg2, opSize=OpSize32, isSigned=false} :: (* Remove tag bit from the value we're shifting. *) logicalImmediate{ source=aReg1, dest=SomeReg ureg1, ccRef=NONE, immed=polyWordTagBitMask, logOp=LogAnd, length=polyWordOpSize } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=WordShift ShiftRightLogical, arg1, arg2}, context, _, destination, tailCode) = let val ureg1 = newUReg() and ureg2 = newUReg() val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val code = (* Put back the tag. *) logicalImmediate{ source=ureg2, dest=SomeReg target, ccRef=NONE, immed=0w1, logOp=LogOr, length=polyWordOpSize } :: (* Do the shift *) shiftRegister{direction=Arm64ICode.ShiftRightLogical, dest=ureg2, source=aReg1, shift=ureg1, opSize=polyWordOpSize} :: (* Untag the shift amount. Since it's at most 64 we can use a 32-bit operation. *) untagValue{source=aReg2, dest=ureg1, opSize=OpSize32, isSigned=false} :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=WordShift ShiftRightArithmetic, arg1, arg2}, context, _, destination, tailCode) = let val ureg1 = newUReg() and ureg2 = newUReg() val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val code = (* Put back the tag. *) logicalImmediate{ source=ureg2, dest=SomeReg target, ccRef=NONE, immed=0w1, logOp=LogOr, length=polyWordOpSize } :: (* Do the shift *) shiftRegister{direction=Arm64ICode.ShiftRightArithmetic, dest=ureg2, source=aReg1, shift=ureg1, opSize=polyWordOpSize} :: (* Untag the shift amount. Since it's at most 64 we can use a 32-bit operation. *) untagValue{source=aReg2, dest=ureg1, opSize=OpSize32, isSigned=false} :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=AllocateByteMemory, arg1, arg2}, context, _, destination, tailCode) = let (* Allocate a block of memory without initialising it. If the flags include the "bytes" bit the GC won't look at it so it doesn't matter that it's not initialised. This is identical to AllocateWordMemory apart from the lack of initialisation.
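To sketch the length word built below on native 64 bits, where the object length occupies bits 0 to 55 and the flags the top byte: the flags value is still tagged, so shifting it left by 55 gives flags * 2^56 + 2^55, and the bitFieldInsert of the untagged size into bits 0 to 55 overwrites the stray tag bit at bit 55, leaving flags * 2^56 + size. The 32-in-64 case is the same with a 24-bit length field and shifts of 23.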
*) val target = asTarget destination val (codeSize, sizeReg, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (codeFlags, flagsReg, _) = codeToICodeRev(arg2, context, false, AnyReg, codeSize) val uSizeReg = newUReg() and shiftFReg = newUReg() and lengthWord = newUReg() val absAddr = if is32in64 then newUReg() else target val untagSize = untagValue{source=sizeReg, dest=uSizeReg, opSize=polyWordOpSize, isSigned=false} :: codeFlags val allocateMem = allocateMemoryVariable{ size=uSizeReg, dest=absAddr, saveRegs=[]} :: untagSize (* Make the length word by first shifting the flags into the length word reg by 55 or 23 bits. This puts the tag bit in the top bit of the size. Then insert the size into this which will overwrite the flag's tag bit. *) val makeLengthWord = bitFieldInsert{ source=uSizeReg, destAsSource=shiftFReg, dest=lengthWord, length=polyWordOpSize, immr=0w0 (*bit 0*), imms=if is32in64 then 0w23 else 0w55 (*width-1*) } :: shiftConstant{direction=Arm64ICode.ShiftLeft, dest=shiftFReg, source=flagsReg, shift=if is32in64 then 0w23 else 0w55, opSize=polyWordOpSize } :: allocateMem val setLengthWordAndInit = storeWithConstantOffset{ source=lengthWord, base=absAddr, byteOffset= ~(Word.toInt wordSize), loadType=polyWordLoadSize } :: makeLengthWord val finalCode = if is32in64 then absoluteToObjectIndex{ source=absAddr, dest=target } :: setLengthWordAndInit else setLengthWordAndInit in (finalCode, target, false) end | codeToICodeBinaryRev({oper=LargeWordComparison test, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val ccRef = newCCRef() val (testCode1, testDest1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (testCode2, testDest2, _) = codeToICodeRev(arg2, context, false, AnyReg, testCode1) val uReg1 = newUReg() and uReg2 = newUReg() val comparison = addSubRegister{base=uReg1, shifted=uReg2, dest=ZeroReg, length=OpSize64, ccRef=SOME ccRef, isAdd=false, shift=ShiftNone} :: unboxLarge{ source=testDest2, dest=uReg2 } :: unboxLarge{ source=testDest1, dest=uReg1 } :: testCode2 open BuiltIns val cond = case test of TestEqual => CondEqual | TestLess => CondCarryClear | TestLessEqual => CondUnsignedLowOrEq | TestGreater => CondUnsignedHigher | TestGreaterEqual => CondCarrySet | TestUnordered => raise InternalError "LargeWordComparison: TestUnordered" in (makeBoolResultRev(cond, ccRef, target, comparison), target, false) end | codeToICodeBinaryRev({oper=LargeWordArith ArithAdd, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() val code = boxLarge{ source=uReg3, dest=target, saveRegs=[] } :: addSubRegister{base=uReg1, shifted=uReg2, dest=SomeReg uReg3, length=OpSize64, ccRef=NONE, isAdd=true, shift=ShiftNone} :: unboxLarge{ source=aReg2, dest=uReg2 } :: unboxLarge{ source=aReg1, dest=uReg1 } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=LargeWordArith ArithSub, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() val code = boxLarge{ source=uReg3, dest=target, saveRegs=[] } :: 
addSubRegister{base=uReg1, shifted=uReg2, dest=SomeReg uReg3, length=OpSize64, ccRef=NONE, isAdd=false, shift=ShiftNone} :: unboxLarge{ source=aReg2, dest=uReg2 } :: unboxLarge{ source=aReg1, dest=uReg1 } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=LargeWordArith ArithMult, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() val code = boxLarge{ source=uReg3, dest=target, saveRegs=[] } :: multiplication{kind=MultAdd64, sourceA=ZeroReg, sourceM=uReg1, sourceN=uReg2, dest=uReg3} :: unboxLarge{ source=aReg2, dest=uReg2 } :: unboxLarge{ source=aReg1, dest=uReg1 } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=LargeWordArith ArithDiv, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() val code = boxLarge{ source=uReg3, dest=target, saveRegs=[] } :: division{isSigned=false, opSize=OpSize64, dividend=uReg1, divisor=uReg2, dest=uReg3} :: unboxLarge{ source=aReg2, dest=uReg2 } :: unboxLarge{ source=aReg1, dest=uReg1 } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=LargeWordArith ArithMod, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() and uReg4 = newUReg() val code = boxLarge{ source=uReg4, dest=target, saveRegs=[] } :: multiplication{kind=MultSub64, dest=uReg4, sourceM=uReg3, sourceN=uReg2, sourceA=SomeReg uReg1} :: division{isSigned=false, opSize=OpSize64, dividend=uReg1, divisor=uReg2, dest=uReg3} :: unboxLarge{ source=aReg2, dest=uReg2 } :: unboxLarge{ source=aReg1, dest=uReg1 } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=LargeWordArith _, ...}, _, _, _, _) = raise InternalError "LargeWordArith - unimplemented instruction" | codeToICodeBinaryRev({oper=LargeWordLogical logop, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() val logicalOp = case logop of LogicalAnd => LogAnd | LogicalOr => LogOr | LogicalXor => LogXor val code = boxLarge{ source=uReg3, dest=target, saveRegs=[] } :: logicalRegister{base=uReg1, shifted=uReg2, dest=SomeReg uReg3, length=OpSize64, ccRef=NONE, logOp=logicalOp, shift=ShiftNone} :: unboxLarge{ source=aReg2, dest=uReg2 } :: unboxLarge{ source=aReg1, dest=uReg1 } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=LargeWordShift shiftKind, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = 
newUReg() and uReg2 = newUReg() and uReg3 = newUReg() val shiftType = case shiftKind of ShiftLeft => Arm64ICode.ShiftLeft | ShiftRightLogical => Arm64ICode.ShiftRightLogical | ShiftRightArithmetic => Arm64ICode.ShiftRightArithmetic val code = boxLarge{ source=uReg3, dest=target, saveRegs=[] } :: shiftRegister{direction=shiftType, source=uReg1, shift=uReg2, dest=uReg3, opSize=OpSize64 } :: (* The shift amount is a word, not a large word. *) untagValue{ source=aReg2, dest=uReg2, opSize=OpSize32, isSigned=false } :: unboxLarge{ source=aReg1, dest=uReg1 } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=RealComparison(test, precision), arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val ccRef = newCCRef() val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val fpSize = precisionToFpSize precision val uReg1 = newUReg() and uReg2 = newUReg() (* Floating point comparisons. The fcmp instruction differs from integer comparison. If either argument is a NaN the result is unordered: the C and V flags are set and N and Z are cleared. The conditions below are therefore chosen so that they are false whenever either value is a NaN, apart from TestUnordered itself which tests V. We use unsigned tests for < and <= and signed tests for > and >=. *) val cond = case test of TestEqual => CondEqual | TestLess => CondCarryClear | TestLessEqual => CondUnsignedLowOrEq | TestGreater => CondSignedGreater | TestGreaterEqual => CondSignedGreaterEq | TestUnordered => CondOverflow val code = compareFloatingPoint{arg1=uReg1, arg2=uReg2, ccRef=ccRef, opSize=fpSize} :: unboxTagFloat{ floatSize=fpSize, source=aReg2, dest=uReg2 } :: unboxTagFloat{ floatSize=fpSize, source=aReg1, dest=uReg1 } :: arg2Code in (makeBoolResultRev(cond, ccRef, target, code), target, false) end | codeToICodeBinaryRev({oper=RealArith(oper, precision), arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val fpSize = precisionToFpSize precision val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() val fpOp = case oper of ArithAdd => AddFP | ArithSub => SubtractFP | ArithMult => MultiplyFP | ArithDiv => DivideFP | _ => raise InternalError "RealArith - unimplemented instruction" val code = boxTagFloat{ floatSize=fpSize, source=uReg3, dest=target, saveRegs=[] } :: binaryFloatingPoint{arg1=uReg1, arg2=uReg2, dest=uReg3, fpOp=fpOp, opSize=fpSize } :: unboxTagFloat{ floatSize=fpSize, source=aReg2, dest=uReg2 } :: unboxTagFloat{ floatSize=fpSize, source=aReg1, dest=uReg1 } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=PointerEq, arg1, arg2}, context, _, destination, tailCode) = let (* Equality of general values which can include pointers. This can be treated exactly as a word equality. It has to be analysed differently for indexed cases.
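Word equality is sound here because of the representation invariants: identical bit patterns are either the same heap object or the same tagged immediate, and a tagged immediate can never alias a heap reference since its bottom bit is set while heap references have it clear.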
*) val ccRef = newCCRef() val (testCode1, testDest1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (testCode2, testDest2, _) = codeToICodeRev(arg2, context, false, AnyReg, testCode1) val comparison = addSubRegister{base=testDest1, shifted=testDest2, dest=ZeroReg, length=polyWordOpSize, ccRef=SOME ccRef, isAdd=false, shift=ShiftNone} :: testCode2 val target = asTarget destination in (makeBoolResultRev(CondEqual, ccRef, target, comparison), target, false) end | codeToICodeBinaryRev({oper=FreeCStack, arg1, arg2}, context, _, destination, tailCode) = let (* Free space on the C stack. This is a binary operation that takes the base address and the size. The base address isn't used in this version. *) val (arg1Code, _, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg = newUReg() val code = addSubXSP{ source=uReg, dest=ZeroReg, isAdd=true } :: untagValue{ source=aReg2, dest=uReg, isSigned=false, opSize=polyWordOpSize } :: arg2Code in returnUnit(destination, code, false) end (* Code-generate an address into one or two Pregs. At this point they are in a state where we can code-generate arbitrary code before the address is used *) and addressToPregAddress({base, index, offset}, context, code) = let val (bCode, bReg, _) = codeToICodeRev(base, context, false, AnyReg, code) in case index of NONE => ({base=bReg, index=NONE, offset=offset}, bCode) | SOME index => let val (iCode, iReg, _) = codeToICodeRev(index, context, false, AnyReg, bCode) in ({base=bReg, index=SOME iReg, offset=offset}, iCode) end end (* Store the code address and the closure items into a previously allocated closure on the heap. This is used both in the simple case and also with mutually recursive declarations. *) and storeIntoClosure(lambda as { closure, ...}, absClosureAddr, context, tailCode) = let val closureRef = makeConstantClosure() val () = codeFunctionToArm64(lambda, debugSwitches, closureRef) val codeAddrWords = if is32in64 then 2 else 1 fun storeAValue(f, (n, tlCode)) = let val (code, source, _) = codeToICodeRev(BICExtract f, context, false, AnyReg, tlCode) in (n+1, storeAtWordOffset(source, n, absClosureAddr, polyWordLoadSize, code)) end (* Store the code address in the first 64-bits. *) val storeCodeAddress = if is32in64 then let (* We can't use codeAddressFromClosure on 32-in-64 because it always returns a 64-bit value. Instead we have to get the code address at run-time. *) val clReg = newPReg() and absClReg = newUReg() and absCodeReg = newUReg() in storeAtWordOffset(absCodeReg, 0, absClosureAddr, Load64, loadWithConstantOffset{base=absClReg, dest=absCodeReg, byteOffset=0, loadType=Load64} :: objectIndexAddressToAbsolute{ source=clReg, dest=absClReg } :: loadAddressConstant{source=closureAsAddress closureRef, dest=clReg} :: tailCode) end else let val cReg = newPReg() in storeAtWordOffset(cReg, 0, absClosureAddr, Load64, loadAddressConstant{source=codeAddressFromClosure closureRef, dest=cReg} :: tailCode) end val (_, storeCode) = List.foldl storeAValue (codeAddrWords, storeCodeAddress) closure in storeCode end (* Load operations. 
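A note on the pattern below: loads from mutable ML memory use load-acquire so that they pair with the store-release performed when mutable cells are written, giving release/acquire ordering between threads, while immutable data, which never changes once visible, is loaded with plain loads that are free to use offset and indexed addressing.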
*) and codeLoadOperation(kind, address, context, target, tailCode) = let val (regAddr, codeAddr) = addressToPregAddress(address, context, tailCode) val code = case kind of LoadStoreMLWord {isImmutable=false} => let fun loadOp(addrReg, code) = loadAcquire{base=addrReg, dest=target, loadType=polyWordLoadSize} :: code in loadAndStoreWithAbsolute (regAddr, opWordSize polyWordLoadSize, loadShift polyWordLoadSize, loadOp, codeAddr) end | LoadStoreMLWord {isImmutable=true} => let fun loadConstOffset(base, offset, code) = loadWithConstantOffset{base=base, dest=target, byteOffset=offset, loadType=polyWordLoadSize} :: code fun loadIndexed(base, index, code) = loadWithIndexedOffset{base=base, dest=target, index=index, loadType=polyWordLoadSize, signExtendIndex=false} :: code in loadAndStoreWithAddress (regAddr, opWordSize polyWordLoadSize, loadShift polyWordLoadSize, false, loadConstOffset, loadIndexed, codeAddr) end | LoadStoreMLByte {isImmutable=false} => let (* Have to load into a ureg and then tag it. *) val destReg = newUReg() fun loadOp(addrReg, code) = loadAcquire{base=addrReg, dest=destReg, loadType=Load8} :: code in tagValue{source=destReg, dest=target, isSigned=false, opSize=OpSize32} :: loadAndStoreWithAbsolute (regAddr, opWordSize Load8, loadShift Load8, loadOp, codeAddr) end | LoadStoreMLByte {isImmutable=true} => let (* Have to load into a ureg and then tag it. *) val destReg = newUReg() fun loadConstOffset(base, offset, code) = loadWithConstantOffset{base=base, dest=destReg, byteOffset=offset, loadType=Load8} :: code fun loadIndexed(base, index, code) = loadWithIndexedOffset{base=base, dest=destReg, index=index, loadType=Load8, signExtendIndex=false} :: code in tagValue{source=destReg, dest=target, isSigned=false, opSize=OpSize32} :: loadAndStoreWithAddress(regAddr, opWordSize Load8, loadShift Load8, false, loadConstOffset, loadIndexed, codeAddr) end | LoadStoreC8 => let (* Have to load into a ureg and then tag it. *) val destReg = newUReg() fun loadConstOffset(base, offset, code) = loadWithConstantOffset{base=base, dest=destReg, byteOffset=offset, loadType=Load8} :: code fun loadIndexed(base, index, code) = loadWithIndexedOffset{base=base, dest=destReg, index=index, loadType=Load8, signExtendIndex=true} :: code in tagValue{source=destReg, dest=target, isSigned=false, opSize=OpSize32} :: loadAndStoreWithAddress(regAddr, opWordSize Load8, loadShift Load8, true, loadConstOffset, loadIndexed, codeAddr) end | LoadStoreC16 => let (* Have to load into a ureg and then tag it. *) val destReg = newUReg() fun loadConstOffset(base, offset, code) = loadWithConstantOffset{base=base, dest=destReg, byteOffset=offset, loadType=Load16} :: code fun loadIndexed(base, index, code) = loadWithIndexedOffset{base=base, dest=destReg, index=index, loadType=Load16, signExtendIndex=true} :: code in tagValue{source=destReg, dest=target, isSigned=false, opSize=OpSize32} :: loadAndStoreWithAddress(regAddr, opWordSize Load16, loadShift Load16, true, loadConstOffset, loadIndexed, codeAddr) end | LoadStoreC32 => let (* This is tagged in native 64-bits and boxed in 32-in-64. 
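The split is forced by the poly word size: tagging a 32-bit value v produces v * 2 + 1, which may need 33 bits, fine within a native 64-bit tagged word (hence the comment below) but too wide for the 32-bit poly words of 32-in-64, where the value must be boxed instead.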
*) val destReg = newUReg() fun loadConstOffset(base, offset, code) = loadWithConstantOffset{base=base, dest=destReg, byteOffset=offset, loadType=Load32} :: code fun loadIndexed(base, index, code) = loadWithIndexedOffset{base=base, dest=destReg, index=index, loadType=Load32, signExtendIndex=true} :: code in (if is32in64 then boxLarge{ source=destReg, dest=target, saveRegs=[] } else tagValue{source=destReg, dest=target, isSigned=false, opSize=OpSize64 (* It becomes 33 bits *)}) :: loadAndStoreWithAddress(regAddr, opWordSize Load32, loadShift Load32, true, loadConstOffset, loadIndexed, codeAddr) end | LoadStoreC64 => let (* This is always boxed. *) val destReg = newUReg() fun loadConstOffset(base, offset, code) = loadWithConstantOffset{base=base, dest=destReg, byteOffset=offset, loadType=Load64} :: code fun loadIndexed(base, index, code) = loadWithIndexedOffset{base=base, dest=destReg, index=index, loadType=Load64, signExtendIndex=true} :: code in boxLarge{ source=destReg, dest=target, saveRegs=[]} :: loadAndStoreWithAddress(regAddr, opWordSize Load64, loadShift Load64, true, loadConstOffset, loadIndexed, codeAddr) end | LoadStoreCFloat => let (* This always returns a double, not a 32-bit float. *) val destReg = newUReg() and convertReg = newUReg() fun loadConstOffset(base, offset, code) = loadFPWithConstantOffset{base=base, dest=destReg, byteOffset=offset, floatSize=Float32} :: code fun loadIndexed(base, index, code) = loadFPWithIndexedOffset{base=base, dest=destReg, index=index, floatSize=Float32, signExtendIndex=true} :: code in boxTagFloat{floatSize=Double64, source=convertReg, dest=target, saveRegs=[]} :: unaryFloatingPt{source=destReg, dest=convertReg, fpOp=ConvFloatToDble} :: loadAndStoreWithAddress(regAddr, 4, 0w2, true, loadConstOffset, loadIndexed, codeAddr) end | LoadStoreCDouble => let (* This is always boxed. *) val destReg = newUReg() fun loadConstOffset(base, offset, code) = loadFPWithConstantOffset{base=base, dest=destReg, byteOffset=offset, floatSize=Double64} :: code fun loadIndexed(base, index, code) = loadFPWithIndexedOffset{base=base, dest=destReg, index=index, floatSize=Double64, signExtendIndex=true} :: code in boxTagFloat{floatSize=Double64, source=destReg, dest=target, saveRegs=[]} :: loadAndStoreWithAddress(regAddr, 8, 0w3, true, loadConstOffset, loadIndexed, codeAddr) end | LoadStoreUntaggedUnsigned => let (* LoadStoreMLWord {isImmutable=true} except it has to be tagged. *) val ureg = newUReg() fun loadConstOffset(base, offset, code) = loadWithConstantOffset{base=base, dest=ureg, byteOffset=offset, loadType=polyWordLoadSize} :: code fun loadIndexed(base, index, code) = loadWithIndexedOffset{base=base, dest=ureg, index=index, loadType=polyWordLoadSize, signExtendIndex=false} :: code in tagValue{source=ureg, dest=target, isSigned=false, opSize=polyWordOpSize} :: loadAndStoreWithAddress(regAddr, opWordSize polyWordLoadSize, loadShift polyWordLoadSize, false, loadConstOffset, loadIndexed, codeAddr) end | LoadStorePolyWord _ => let (* LoadStoreMLWord {isImmutable=true} except it has to be boxed. For the moment don't use load-acquire. 
*) val ureg = newUReg() fun loadConstOffset(base, offset, code) = loadWithConstantOffset{base=base, dest=ureg, byteOffset=offset, loadType=polyWordLoadSize} :: code fun loadIndexed(base, index, code) = loadWithIndexedOffset{base=base, dest=ureg, index=index, loadType=polyWordLoadSize, signExtendIndex=false} :: code in boxLarge{source=ureg, dest=target, saveRegs=[]} :: loadAndStoreWithAddress(regAddr, opWordSize polyWordLoadSize, loadShift polyWordLoadSize, false, loadConstOffset, loadIndexed, codeAddr) end | LoadStoreNativeWord _ => let (* Similar to LoadStorePolyWord but a native word. *) val ureg = newUReg() fun loadConstOffset(base, offset, code) = loadWithConstantOffset{base=base, dest=ureg, byteOffset=offset, loadType=Load64} :: code fun loadIndexed(base, index, code) = loadWithIndexedOffset{base=base, dest=ureg, index=index, loadType=Load64, signExtendIndex=false} :: code in boxLarge{source=ureg, dest=target, saveRegs=[]} :: loadAndStoreWithAddress(regAddr, opWordSize Load64, loadShift polyWordLoadSize, false, loadConstOffset, loadIndexed, codeAddr) end in (code, target, false) end (* Store operations. *) and codeStoreOperation(kind, address, value, context, destination, tailCode1) = let val (regAddr, codeAddr) = addressToPregAddress(address, context, tailCode1) val (sourceCode, sourceReg, _) = codeToICodeRev(value, context, false, AnyReg, codeAddr) val storeCode = case kind of LoadStoreMLWord {isImmutable=false} => let fun storeOp(addrReg, code) = storeRelease{base=addrReg, source=sourceReg, loadType=polyWordLoadSize} :: code in loadAndStoreWithAbsolute(regAddr, opWordSize polyWordLoadSize, loadShift polyWordLoadSize, storeOp, sourceCode) end | LoadStoreMLWord {isImmutable=true} => let (* Used when initialising immutables that do not require store-release. *) fun loadConstOffset(base, offset, code) = storeWithConstantOffset{base=base, source=sourceReg, byteOffset=offset, loadType=polyWordLoadSize} :: code fun loadIndexed(base, index, code) = storeWithIndexedOffset{base=base, source=sourceReg, index=index, loadType=polyWordLoadSize, signExtendIndex=false} :: code in loadAndStoreWithAddress (regAddr, opWordSize polyWordLoadSize, loadShift polyWordLoadSize, false, loadConstOffset, loadIndexed, sourceCode) end | LoadStoreMLByte {isImmutable=false} => let fun storeOp(addrReg, code) = let val tReg = newUReg() in storeRelease{base=addrReg, source=tReg, loadType=Load8} :: untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=OpSize32} :: code end in loadAndStoreWithAbsolute(regAddr, opWordSize Load8, loadShift Load8, storeOp, sourceCode) end | LoadStoreMLByte {isImmutable=true} => let (* Used when initialising immutables that do not require store-release. 
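As elsewhere the ML value arrives tagged, so it is untagged into a scratch register before the one-byte store: the character #"A", held as the pattern 2 * 65 + 1 = 131, is shifted down to 65 and that byte is stored.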
*) fun loadConstOffset(base, offset, code) = let val tReg = newUReg() in storeWithConstantOffset{base=base, source=tReg, byteOffset=offset, loadType=Load8} :: untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=OpSize32} :: code end fun loadIndexed(base, index, code) = let val tReg = newUReg() in storeWithIndexedOffset{base=base, source=tReg, index=index, loadType=Load8, signExtendIndex=false} :: untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=OpSize32} :: code end in loadAndStoreWithAddress(regAddr, opWordSize Load8, loadShift Load8, false, loadConstOffset, loadIndexed, sourceCode) end | LoadStoreC8 => let fun loadConstOffset(base, offset, code) = let val tReg = newUReg() in storeWithConstantOffset{base=base, source=tReg, byteOffset=offset, loadType=Load8} :: untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=OpSize32} :: code end fun loadIndexed(base, index, code) = let val tReg = newUReg() in storeWithIndexedOffset{base=base, source=tReg, index=index, loadType=Load8, signExtendIndex=true} :: untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=OpSize32} :: code end in loadAndStoreWithAddress(regAddr, opWordSize Load8, loadShift Load8, true, loadConstOffset, loadIndexed, sourceCode) end | LoadStoreC16 => let fun loadConstOffset(base, offset, code) = let val tReg = newUReg() in storeWithConstantOffset{base=base, source=tReg, byteOffset=offset, loadType=Load16} :: untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=OpSize32} :: code end fun loadIndexed(base, index, code) = let val tReg = newUReg() in storeWithIndexedOffset{base=base, source=tReg, index=index, loadType=Load16, signExtendIndex=true} :: untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=OpSize32} :: code end in loadAndStoreWithAddress(regAddr, opWordSize Load16, loadShift Load16, true, loadConstOffset, loadIndexed, sourceCode) end | LoadStoreC32 => let fun loadConstOffset(base, offset, code) = let val tReg = newUReg() in storeWithConstantOffset{base=base, source=tReg, byteOffset=offset, loadType=Load32} :: (if is32in64 then unboxLarge{source=sourceReg, dest=tReg} else untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=OpSize64}) :: code end fun loadIndexed(base, index, code) = let val tReg = newUReg() in storeWithIndexedOffset{base=base, source=tReg, index=index, loadType=Load32, signExtendIndex=true} :: (if is32in64 then unboxLarge{source=sourceReg, dest=tReg} else untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=OpSize64}) :: code end in loadAndStoreWithAddress(regAddr, opWordSize Load32, loadShift Load32, true, loadConstOffset, loadIndexed, sourceCode) end | LoadStoreC64 => let fun loadConstOffset(base, offset, code) = let val tReg = newUReg() in storeWithConstantOffset{base=base, source=tReg, byteOffset=offset, loadType=Load64} :: unboxLarge{source=sourceReg, dest=tReg} :: code end fun loadIndexed(base, index, code) = let val tReg = newUReg() in storeWithIndexedOffset{base=base, source=tReg, index=index, loadType=Load64, signExtendIndex=true} :: unboxLarge{source=sourceReg, dest=tReg} :: code end in loadAndStoreWithAddress(regAddr, opWordSize Load64, loadShift Load64, true, loadConstOffset, loadIndexed, sourceCode) end | LoadStoreCFloat => let (* The "real" value is a double, not a 32-bit float *) fun loadConstOffset(base, offset, code) = let val tReg = newUReg() and cReg = newUReg() in storeFPWithConstantOffset{base=base, source=tReg, byteOffset=offset, floatSize=Float32} :: unaryFloatingPt{source=cReg, dest=tReg, 
fpOp=ConvDbleToFloat} :: unboxTagFloat{floatSize=Double64, source=sourceReg, dest=cReg} :: code end fun loadIndexed(base, index, code) = let val tReg = newUReg() and cReg = newUReg() in storeFPWithIndexedOffset{base=base, source=tReg, index=index, floatSize=Float32, signExtendIndex=true} :: unaryFloatingPt{source=cReg, dest=tReg, fpOp=ConvDbleToFloat} :: unboxTagFloat{floatSize=Double64, source=sourceReg, dest=cReg} :: code end in loadAndStoreWithAddress(regAddr, 4, 0w2, true, loadConstOffset, loadIndexed, sourceCode) end | LoadStoreCDouble => let fun loadConstOffset(base, offset, code) = let val tReg = newUReg() in storeFPWithConstantOffset{base=base, source=tReg, byteOffset=offset, floatSize=Double64} :: unboxTagFloat{floatSize=Double64, source=sourceReg, dest=tReg} :: code end fun loadIndexed(base, index, code) = let val tReg = newUReg() in storeFPWithIndexedOffset{base=base, source=tReg, index=index, floatSize=Double64, signExtendIndex=true} :: unboxTagFloat{floatSize=Double64, source=sourceReg, dest=tReg} :: code end in loadAndStoreWithAddress(regAddr, 8, 0w3, true, loadConstOffset, loadIndexed, sourceCode) end | LoadStoreUntaggedUnsigned => let (* Only used when initialising strings so this does not require store-release. *) fun loadConstOffset(base, offset, code) = let val tReg = newUReg() in storeWithConstantOffset{base=base, source=tReg, byteOffset=offset, loadType=polyWordLoadSize} :: untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=polyWordOpSize} :: code end fun loadIndexed(base, index, code) = let val tReg = newUReg() in storeWithIndexedOffset{base=base, source=tReg, index=index, loadType=polyWordLoadSize, signExtendIndex=false} :: untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=polyWordOpSize} :: code end in loadAndStoreWithAddress(regAddr, opWordSize polyWordLoadSize, loadShift polyWordLoadSize, false, loadConstOffset, loadIndexed, sourceCode) end | LoadStorePolyWord _ => let (* For the moment assume we don't require store-release. *) fun loadConstOffset(base, offset, code) = let val tReg = newUReg() in storeWithConstantOffset{base=base, source=tReg, byteOffset=offset, loadType=polyWordLoadSize} :: unboxLarge{source=sourceReg, dest=tReg} :: code end fun loadIndexed(base, index, code) = let val tReg = newUReg() in storeWithIndexedOffset{base=base, source=tReg, index=index, loadType=polyWordLoadSize, signExtendIndex=false} :: unboxLarge{source=sourceReg, dest=tReg} :: code end in loadAndStoreWithAddress(regAddr, opWordSize polyWordLoadSize, loadShift polyWordLoadSize, false, loadConstOffset, loadIndexed, sourceCode) end | LoadStoreNativeWord _ => let (* For the moment assume we don't require store-release. *) fun loadConstOffset(base, offset, code) = let val tReg = newUReg() in storeWithConstantOffset{base=base, source=tReg, byteOffset=offset, loadType=Load64} :: unboxLarge{source=sourceReg, dest=tReg} :: code end fun loadIndexed(base, index, code) = let val tReg = newUReg() in storeWithIndexedOffset{base=base, source=tReg, index=index, loadType=Load64, signExtendIndex=false} :: unboxLarge{source=sourceReg, dest=tReg} :: code end in loadAndStoreWithAddress(regAddr, opWordSize Load64, loadShift Load64, false, loadConstOffset, loadIndexed, sourceCode) end in returnUnit(destination, storeCode, false) end (*Turn the codetree structure into icode. 
*) val bodyContext = {loopArgs=NONE, stackPtr=0, currHandler=NONE, overflowBlock=ref NONE} val (bodyCode, _, bodyExited) = codeToICodeRev(body, bodyContext, true, SpecificPReg resultTarget, beginInstructions) val icode = if bodyExited then bodyCode else returnInstruction(bodyContext, resultTarget, bodyCode) (* Turn the icode list into basic blocks. The input list is in reverse so as part of this we reverse the list. *) local val resArray = Array.array(!labelCounter, BasicBlock{ block=[], flow=ExitCode }) fun createEntry (blockNo, block, flow) = Array.update(resArray, blockNo, BasicBlock{ block=block, flow=flow}) fun splitCode([], _, _) = (* End of code. We should have had a BeginFunction. *) raise InternalError "splitCode - no begin" | splitCode(BlockBegin args :: _, sinceLabel, flow) = (* Final instruction. Create the initial block and exit. *) createEntry(0, BeginFunction args ::sinceLabel, flow) | splitCode(BlockSimple instr :: rest, sinceLabel, flow) = splitCode(rest, instr :: sinceLabel, flow) | splitCode(BlockLabel label :: rest, sinceLabel, flow) = (* Label - finish this block and start another. *) ( createEntry(label, sinceLabel, flow); (* Default to a jump to this label. That is used if we have assumed a drop-through. *) splitCode(rest, [], Unconditional label) ) | splitCode(BlockExit instr :: rest, _, _) = splitCode(rest, [instr], ExitCode) | splitCode(BlockFlow flow :: rest, _, _) = splitCode(rest, [], flow) | splitCode(BlockRaiseAndHandle(instr, handler) :: rest, _, _) = splitCode(rest, [instr], UnconditionalHandle handler) | splitCode(BlockOptionalHandle{call, handler, label} :: rest, sinceLabel, flow) = let (* A function call within a handler. This could go to the handler but if there is no exception will go to the next instruction. Also includes JumpLoop since the stack check could result in an Interrupt exception. *) in createEntry(label, sinceLabel, flow); splitCode(rest, [call], ConditionalHandle{handler=handler, continue=label}) end in val () = splitCode(icode, [], ExitCode) val resultVector = Array.vector resArray end open ICodeTransform val pregProperties = Vector.fromList(List.rev(! pregPropList)) in codeICodeFunctionToArm64{blocks = resultVector, functionName = name, pregProps = pregProperties, ccCount= ! ccRefCounter, debugSwitches = debugSwitches, resultClosure = resultClosure, profileObject = profileObject} end val gencodeLambda = codeFunctionToArm64 structure Foreign = Arm64Foreign structure Sharing = struct type backendIC = backendIC and bicLoadForm = bicLoadForm and argumentType = argumentType and closureRef = closureRef end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICode.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICode.ML index a803bfaf..f591a361 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICode.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICode.ML @@ -1,1011 +1,1014 @@ (* Copyright David C. J. Matthews 2021-2 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64ICode( structure Arm64Code: ARM64PREASSEMBLY ): ARM64ICODE = struct open Arm64Code open Address datatype preg = PReg of int (* A pseudo-register - an abstract register. *) (* If the value is zero we can use the zero register XZR/WZR. *) datatype pregOrZero = SomeReg of preg | ZeroReg (* A location on the stack. May be more than one word if this is a container or a handler entry. *) datatype stackLocn = StackLoc of {size: int, rno: int } (* This combines pregKind and stackLocn. *) datatype regProperty = RegPropGeneral (* A general register. *) | RegPropUntagged (* An untagged general register. *) | RegPropStack of int (* A stack location or container. *) | RegPropCacheTagged | RegPropCacheUntagged | RegPropMultiple (* The result of a conditional or case. May be defined at multiple points. *) (* The reference to a condition code. *) datatype ccRef = CcRef of int datatype reg = GenReg of xReg | FPReg of vReg datatype callKind = Recursive | ConstantCode of machineWord | FullCall (* Function calls can have an unlimited number of arguments so it isn't always going to be possible to load them into registers. *) datatype 'genReg fnarg = ArgInReg of 'genReg | ArgOnStack of { wordOffset: int, container: stackLocn, field: int } datatype ('genReg, 'optGenReg, 'fpReg) arm64ICode = (* Move the contents of one preg to another. These are always 64 bits. *) MoveRegister of { source: 'genReg, dest: 'genReg } (* Numerical constant. *) | LoadNonAddressConstant of { source: Word64.word, dest: 'genReg } (* Floating point constant. *) | LoadFPConstant of { source: Word64.word, dest: 'fpReg, floatSize: floatSize } (* Address constant. *) | LoadAddressConstant of { source: machineWord, dest: 'genReg } (* Load a value into a register using a constant, signed, byte offset. The offset is in the range of -256 to (+4095*unit size). *) | LoadWithConstantOffset of { base: 'genReg, dest: 'genReg, byteOffset: int, loadType: loadType } (* Similarly for FP registers. *) | LoadFPWithConstantOffset of { base: 'genReg, dest: 'fpReg, byteOffset: int, floatSize: floatSize } (* Load a value into a register using an index register. *) | LoadWithIndexedOffset of { base: 'genReg, dest: 'genReg, index: 'genReg, loadType: loadType, signExtendIndex: bool } (* Ditto for FP. *) | LoadFPWithIndexedOffset of { base: 'genReg, dest: 'fpReg, index: 'genReg, floatSize: floatSize, signExtendIndex: bool } (* Returns the current thread ID. Always a 64-bit value. *) | GetThreadId of { dest: 'genReg } (* Convert a 32-in-64 object index into an absolute address. *) | ObjectIndexAddressToAbsolute of { source: 'genReg, dest: 'genReg } (* Convert an absolute address into an object index. *) | AbsoluteToObjectIndex of { source: 'genReg, dest: 'genReg } (* Allocates a fixed-size piece of memory and puts the absolute address into dest. bytesRequired is the total number of bytes including the length word and any alignment necessary for 32-in-64. saveRegs is the list of registers that need to be saved if we need to do a garbage collection. *) | AllocateMemoryFixed of { bytesRequired: Word64.word, dest: 'genReg, saveRegs: 'genReg list } (* Allocate a piece of memory. The size argument is an untagged value containing the number of words i.e. the same value used for InitialiseMemory and to store in the length word.
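For example (sizes invented for illustration): a three-word mutable cell has size holding an untagged 3; the same 3 goes into the length word and is the count InitialiseMem uses, while the total allocation is four words including the length word, i.e. 32 bytes on native 64-bit.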
*) | AllocateMemoryVariable of { size: 'genReg, dest: 'genReg, saveRegs: 'genReg list } (* Initialise a piece of memory by writing "size" copies of the value in "init". N.B. The size is an untagged value containing the number of words. *) | InitialiseMem of { size: 'genReg, addr: 'genReg, init: 'genReg } (* Mark the beginning of a loop. This is really only to prevent the initialisation code being duplicated in ICodeOptimise. *) | BeginLoop (* Set up the registers for a jump back to the start of a loop. *) | JumpLoop of { regArgs: {src: 'genReg fnarg, dst: 'genReg} list, stackArgs: {src: 'genReg fnarg, wordOffset: int, stackloc: stackLocn} list, checkInterrupt: 'genReg list option } (* Store a register using a constant, signed, byte offset. The offset is in the range of -256 to (+4095*unit size). *) | StoreWithConstantOffset of { source: 'genReg, base: 'genReg, byteOffset: int, loadType: loadType } (* Ditto for FP regs. *) | StoreFPWithConstantOffset of { source: 'fpReg, base: 'genReg, byteOffset: int, floatSize: floatSize } (* Store a register using an index register. *) | StoreWithIndexedOffset of { source: 'genReg, base: 'genReg, index: 'genReg, loadType: loadType, signExtendIndex: bool } (* and for FP regs. *) | StoreFPWithIndexedOffset of { source: 'fpReg, base: 'genReg, index: 'genReg, floatSize: floatSize, signExtendIndex: bool } (* Add/Subtract immediate. The destination is optional in which case XZero is used. ccRef is optional. If it is NONE the version of the instruction that does not generate a condition code is used. immed must be < 0wx1000. *) | AddSubImmediate of { source: 'genReg, dest: 'optGenReg, ccRef: ccRef option, immed: word, isAdd: bool, length: opSize } (* Add/Subtract register. As with AddSubImmediate, both the destination and cc are optional. *) | AddSubRegister of { base: 'genReg, shifted: 'genReg, dest: 'optGenReg, ccRef: ccRef option, isAdd: bool, length: opSize, shift: shiftType } (* Bitwise logical operations. The immediate value must be a valid bit pattern. ccRef can only be SOME if logOp is LogAnd. *) | LogicalImmediate of { source: 'genReg, dest: 'optGenReg, ccRef: ccRef option, immed: Word64.word, logOp: logicalOp, length: opSize } (* Register logical operations. ccRef can only be SOME if logOp is LogAnd. *) | LogicalRegister of { base: 'genReg, shifted: 'genReg, dest: 'optGenReg, ccRef: ccRef option, logOp: logicalOp, length: opSize, shift: shiftType } (* Shift a word by an amount specified in a register. *) | ShiftRegister of { direction: shiftDirection, dest: 'genReg, source: 'genReg, shift: 'genReg, opSize: opSize } (* The various forms of multiply all take three arguments and the general form is dest = M * N +/- A. *) | Multiplication of { kind: multKind, dest: 'genReg, sourceA: 'optGenReg, sourceM: 'genReg, sourceN: 'genReg } (* Signed or unsigned division. Sets the result to zero if the divisor is zero. *) | Division of { isSigned: bool, dest: 'genReg, dividend: 'genReg, divisor: 'genReg, opSize: opSize } (* Start of function. Set the register arguments. stackArgs is the list of stack arguments. If the function has a real closure, regArgs includes the closure register (X8). The register arguments include the return register (X30). *) | BeginFunction of { regArgs: ('genReg * xReg) list, stackArgs: stackLocn list } (* Call a function. If the code address is a constant it is passed here. Otherwise the address is obtained by indirecting through X8 which has been loaded as one of the argument registers.
The results are stored in the result registers, usually just X0. The "containers" argument is used to ensure that any container whose address is passed as one of the other arguments continues to be referenced until the function is called since there's a possibility that it isn't actually used after the function. *) | FunctionCall of { callKind: callKind, regArgs: ('genReg fnarg * xReg) list, stackArgs: 'genReg fnarg list, dests: ('genReg * xReg) list, saveRegs: 'genReg list, containers: stackLocn list} (* Tail-recursive call to a function. This is similar to FunctionCall but is complicated for stack arguments because the stack and the return address need to be overwritten. stackAdjust is the number of words to remove (positive) or add (negative) to the stack before the call. currStackSize contains the number of items currently on the stack. *) | TailRecursiveCall of { callKind: callKind, regArgs: ('genReg fnarg * xReg) list, stackArgs: {src: 'genReg fnarg, stack: int} list, stackAdjust: int, currStackSize: int } (* Return from the function. results are the registers containing the result values, returnReg is the preg that contains the return address. *) | ReturnResultFromFunction of { results: ('genReg * xReg) list, returnReg: 'genReg, numStackArgs: int } (* Raise an exception. The packet is always loaded into X0. *) | RaiseExceptionPacket of { packetReg: 'genReg } (* Push a register to the stack. This is used both for a normal push, copies=1, and also to reserve a container. *) | PushToStack of { source: 'genReg, copies: int, container: stackLocn } (* Load a register from the stack. The container is the stack location identifier, the field is an offset in a container. *) | LoadStack of { dest: 'genReg, wordOffset: int, container: stackLocn, field: int } (* Store a value into the stack. *) | StoreToStack of { source: 'genReg, container: stackLocn, field: int, stackOffset: int } (* Set the register to the address of the container i.e. a specific offset on the stack. *) | ContainerAddress of { dest: 'genReg, container: stackLocn, stackOffset: int } (* Remove items from the stack. Used to remove containers or registers pushed to the stack. *) | ResetStackPtr of { numWords: int } (* Tag a value by shifting and setting the tag bit. *) | TagValue of { source: 'genReg, dest: 'genReg, isSigned: bool, opSize: opSize } (* Shift a value to remove the tag bit. The cache is used if this is untagging a value that has previously been tagged. *) | UntagValue of { source: 'genReg, dest: 'genReg, isSigned: bool, opSize: opSize } (* Box a largeword value. Stores a value into a byte area. This can be implemented using AllocateMemoryFixed but keeping it separate makes optimisation easier. The result is always an address and needs to be converted to an object index on 32-in-64. *) | BoxLarge of { source: 'genReg, dest: 'genReg, saveRegs: 'genReg list } (* Load a value from a box. This can be implemented using a load but is kept separate to simplify optimisation. The source is always an absolute address. *) | UnboxLarge of { source: 'genReg, dest: 'genReg } (* Convert a floating point value into a value suitable for storing in the heap. This normally involves boxing except that 32-bit floats can be tagged in native 64-bit mode. *) | BoxTagFloat of { floatSize: floatSize, source: 'fpReg, dest: 'genReg, saveRegs: 'genReg list } (* The reverse of BoxTagFloat. *) | UnboxTagFloat of { floatSize: floatSize, source: 'genReg, dest: 'fpReg } (* Load a value with acquire semantics.
This means that any other load in this thread after this sees the value of the shared memory at this point and not earlier. This is used for references and arrays to ensure that if another thread has built a data structure on the heap and then assigns the address to a shared ref, this thread will see the updated heap and not any locally cached previous version. *) | LoadAcquire of { base: 'genReg, dest: 'genReg, loadType: loadType } (* Store a value with release semantics. This ensures that any other write completes before this operation and works with LoadAcquire. *) | StoreRelease of { base: 'genReg, source: 'genReg, loadType: loadType } (* This is a generalised constant shift which includes selection of a range of bits. *) | BitFieldShift of { source: 'genReg, dest: 'genReg, isSigned: bool, length: opSize, immr: word, imms: word } (* Copy a range of bits and insert it into another register. This is the only case where a register functions both as a source and a destination. *) | BitFieldInsert of { source: 'genReg, destAsSource: 'genReg, dest: 'genReg, length: opSize, immr: word, imms: word } (* Indexed case. *) | IndexedCaseOperation of { testReg: 'genReg } (* Exception handling: set up an exception handler. *) | PushExceptionHandler (* End of a handled section. Restore the previous handler. *) | PopExceptionHandler (* Marks the start of a handler. This sets the stack pointer and restores the old handler. Sets the exception packet register. *) | BeginHandler of { packetReg: 'genReg } (* Compare two vectors of bytes and set the condition code on the result. The registers are modified by the instruction. *) | CompareByteVectors of { vec1Addr: 'genReg, vec2Addr: 'genReg, length: 'genReg, ccRef: ccRef } (* Move a block of bytes (isByteMove true) or words (isByteMove false). The length is the number of items (bytes or words) to move. The registers are modified by the instruction. *) | BlockMove of { srcAddr: 'genReg, destAddr: 'genReg, length: 'genReg, isByteMove: bool } (* Add to or subtract from the system stack pointer and optionally return the new value. This is used to allocate and deallocate C space. *) | AddSubXSP of { source: 'genReg, dest: 'optGenReg, isAdd: bool } (* Ensures the value will actually be referenced although it doesn't generate any code. *) | TouchValue of { source: 'genReg } (* Load a value at the address and get exclusive access. Always loads a 64-bit value. *) | LoadAcquireExclusive of { base: 'genReg, dest: 'genReg } (* Store a value into an address releasing the lock. Sets the result to 0 if it succeeds or to 1 if it fails. *) | StoreReleaseExclusive of { base: 'genReg, source: 'optGenReg, result: 'genReg } (* Insert a memory barrier. dmb ish. *) | MemoryBarrier (* Convert an integer to a floating point value. *) | ConvertIntToFloat of { source: 'genReg, dest: 'fpReg, srcSize: opSize, destSize: floatSize } (* Convert a floating point value to an integer using the specified rounding mode. We could get an overflow here but fortunately the ARM generates a value that will cause an overflow when we tag it, provided we tag it explicitly. *) | ConvertFloatToInt of { source: 'fpReg, dest: 'genReg, srcSize: floatSize, destSize: opSize, rounding: IEEEReal.rounding_mode } (* Unary floating point. This includes conversions between float and double. *) | UnaryFloatingPt of { source: 'fpReg, dest: 'fpReg, fpOp: fpUnary } (* Binary floating point: addition, subtraction, multiplication and division.
*) | BinaryFloatingPoint of { arg1: 'fpReg, arg2: 'fpReg, dest: 'fpReg, fpOp: fpBinary, opSize: floatSize } (* Floating point comparison. *) | CompareFloatingPoint of { arg1: 'fpReg, arg2: 'fpReg, ccRef: ccRef, opSize: floatSize } (* Yield control during a spin-lock. *) | CPUYield (* Atomic operations added for ARM 8.1 *) | AtomicOperation of { base: 'genReg, source: 'optGenReg, dest: 'optGenReg, atOp: atomicOp } (* Debugging - fault if values don't match. *) | CacheCheck of { arg1: 'genReg, arg2: 'genReg } (* Destinations at the end of a basic block. *) and controlFlow = (* Unconditional branch to a label - should be a merge point. *) Unconditional of int (* Conditional branch. Jumps to trueJump if the condition is true, falseJump if false. *) | Conditional of { ccRef: ccRef, condition: condition, trueJump: int, falseJump: int } (* Exit - the last instruction of the block is a return, raise or tailcall. *) | ExitCode (* Indexed case - this branches to one of a number of labels. *) | IndexedBr of int list (* Set up a handler. This doesn't cause an immediate branch but the state at the start of the handler is the state at this point. *) | SetHandler of { handler: int, continue: int } (* Unconditional branch to a handler, used if an exception is raised explicitly within the scope of a handler. *) | UnconditionalHandle of int (* Conditional branch to a handler. Occurs if there is a call to a function within the scope of a handler. It may jump to the handler. *) | ConditionalHandle of { handler: int, continue: int } and ('genReg, 'optGenReg, 'fpReg) basicBlock = BasicBlock of { block: ('genReg, 'optGenReg, 'fpReg) arm64ICode list, flow: controlFlow } type iCodeAbstract = (preg, pregOrZero, preg) arm64ICode and basicBlockAbstract = (preg, pregOrZero, preg) basicBlock and iCodeConcrete = (xReg, xReg, vReg) arm64ICode and basicBlockConcrete = (xReg, xReg, vReg) basicBlock (* Return the list of blocks that are the immediate successors of this block. *) fun successorBlocks(Unconditional l) = [l] | successorBlocks(Conditional{trueJump, falseJump, ...}) = [trueJump, falseJump] | successorBlocks ExitCode = [] | successorBlocks(IndexedBr cases) = cases | successorBlocks(SetHandler{handler, continue, ...}) = [handler, continue] (* We only need "handler" in SetHandler because we may have a handler that is never actually jumped to.
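As a small illustration (block numbers invented): successorBlocks(Conditional{ccRef=CcRef 0, condition=CondEqual, trueJump=3, falseJump=4}) = [3, 4] and successorBlocks(SetHandler{handler=7, continue=2}) = [7, 2], so a handler block stays in the flow graph even when no exception is ever raised.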
*) | successorBlocks(UnconditionalHandle handler) = [handler] | successorBlocks(ConditionalHandle{handler, continue, ...}) = [handler, continue] local fun printCC(CcRef ccRef, stream) = stream ("CC" ^ Int.toString ccRef) fun printStackLoc(StackLoc{size, rno}, stream) = (stream "S"; stream(Int.toString rno); stream "("; stream(Int.toString size); stream ")") fun regRepr(XReg w) = "X" ^ Int.toString(Word8.toInt w) | regRepr XZero = "XZ" | regRepr XSP = "SP" fun arithRepr OpSize64 = "64" | arithRepr OpSize32 = "32" fun printLoadType(Load64, stream) = stream "64" | printLoadType(Load32, stream) = stream "32" | printLoadType(Load16, stream) = stream "16" | printLoadType(Load8, stream) = stream "8" fun printSaves([], _, _) = () | printSaves([areg], _, printReg) = printReg areg | printSaves(areg::more, stream, printReg) = (printReg areg; stream ","; printSaves(more, stream, printReg)) fun printArg(ArgInReg reg, _, printReg) = printReg reg | printArg(ArgOnStack{wordOffset, container, field, ...}, stream, _) = ( printStackLoc(container, stream); stream " + "; stream(Int.toString field); stream " ("; stream(Int.toString wordOffset); stream ")" ) fun printShift(ShiftLSL w, stream) = stream(" LSL " ^ Word8.toString w) | printShift(ShiftLSR w, stream) = stream(" LSR " ^ Word8.toString w) | printShift(ShiftASR w, stream) = stream(" ASR " ^ Word8.toString w) | printShift(ShiftNone, _) = () fun printFloatSize(Float32, stream) = stream "F" | printFloatSize(Double64, stream) = stream "D" fun printICode {stream, printGenReg, ...} (MoveRegister{ source, dest }: ('a, 'b, 'c) arm64ICode) = ( stream "\tMove\t"; printGenReg source; stream " => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (LoadNonAddressConstant{ source, dest }) = ( stream "\tLoadNonAddress\t"; stream(Word64.toString source); stream " => "; printGenReg dest ) | printICode {stream, printFPReg, ...} (LoadFPConstant{ source, dest, floatSize }) = ( stream "\tLoadFPConstant"; printFloatSize(floatSize, stream); stream "\t"; stream(Word64.toString source); stream " => "; printFPReg dest ) | printICode {stream, printGenReg, ...} (LoadAddressConstant{ source, dest }) = ( stream "\tLoadAddress\t"; stream(Address.stringOfWord source); stream " => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (LoadWithConstantOffset{ base, dest, byteOffset, loadType }) = ( stream "\tLoadConstOffset"; printLoadType(loadType, stream); stream "\t["; printGenReg base; stream "]+"; stream(Int.toString byteOffset); stream " => "; printGenReg dest ) | printICode {stream, printGenReg, printFPReg, ...} (LoadFPWithConstantOffset{ base, dest, byteOffset, floatSize }) = ( stream "\tLoadConstOffset"; printFloatSize(floatSize, stream); stream "\t["; printGenReg base; stream "]+"; stream(Int.toString byteOffset); stream " => "; printFPReg dest ) | printICode {stream, printGenReg, ...} (LoadWithIndexedOffset{ base, dest, index, loadType, signExtendIndex }) = ( stream "\tLoadIndexed"; printLoadType(loadType, stream); stream "\t["; printGenReg base; stream "+"; printGenReg index; if signExtendIndex then stream " SX" else (); stream "] => "; printGenReg dest ) | printICode {stream, printGenReg, printFPReg, ...} (LoadFPWithIndexedOffset{ base, dest, index, floatSize, signExtendIndex }) = ( stream "\tLoadIndexed"; printFloatSize(floatSize, stream); stream "\t["; printGenReg base; stream "+"; printGenReg index; if signExtendIndex then stream " SX" else (); stream "] => "; printFPReg dest ) | printICode {stream, printGenReg, ...} (GetThreadId { dest}) = ( stream 
"\tGetThreadId\t"; stream " => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (ObjectIndexAddressToAbsolute{ source, dest }) = ( stream "\tObjectAddrToAbs\t"; printGenReg source; stream " => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (AbsoluteToObjectIndex{ source, dest }) = ( stream "\tAbsToObjectAddr\t"; printGenReg source; stream " => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (AllocateMemoryFixed{bytesRequired, dest, saveRegs}) = ( stream "\tAllocateMemory\t"; stream(Word64.fmt StringCvt.DEC bytesRequired); stream " => "; printGenReg dest; stream " save="; printSaves(saveRegs, stream, printGenReg) ) | printICode {stream, printGenReg, ...} (AllocateMemoryVariable{size, dest, saveRegs}) = ( stream "\tAllocateMemory\t"; stream "s="; printGenReg(size); stream " => "; printGenReg dest; stream " save="; printSaves(saveRegs, stream, printGenReg) ) | printICode {stream, printGenReg, ...} (InitialiseMem{size, addr, init}) = ( stream "\tInitialiseMem\t"; stream "s="; printGenReg(size); stream ",i="; printGenReg(init); stream ",a="; printGenReg(addr) ) | printICode {stream, ...} BeginLoop = stream "\tBeginLoop" | printICode {stream, printGenReg, ...} (JumpLoop{regArgs, stackArgs, checkInterrupt, ... }) = ( stream "\tJumpLoop\t"; List.app(fn {src, dst} => (printGenReg(dst); stream "="; printArg(src, stream, printGenReg); stream " ")) regArgs; List.app( fn {src, wordOffset, stackloc} => (printStackLoc(stackloc, stream); stream("(sp" ^ Int.toString wordOffset); stream ")="; printArg(src, stream, printGenReg); stream " ") ) stackArgs; case checkInterrupt of NONE => () | SOME saveRegs => (stream " Check:save="; printSaves(saveRegs, stream, printGenReg)) ) | printICode {stream, printGenReg, ...} (StoreWithConstantOffset{ base, source, byteOffset, loadType }) = ( stream "\tStoreConstOffset"; printLoadType(loadType, stream); stream "\t"; printGenReg source; stream " => ["; printGenReg base; stream "+"; stream(Int.toString byteOffset); stream "]" ) | printICode {stream, printGenReg, printFPReg, ...} (StoreFPWithConstantOffset{ base, source, byteOffset, floatSize }) = ( stream "\tStoreConstOffset"; printFloatSize(floatSize, stream); stream "\t"; printFPReg source; stream " => ["; printGenReg base; stream "+"; stream(Int.toString byteOffset); stream "]" ) | printICode {stream, printGenReg, ...} (StoreWithIndexedOffset{ base, source, index, loadType, signExtendIndex }) = ( stream "\tStoreIndexed"; printLoadType(loadType, stream); stream "\t"; printGenReg source; stream " => ["; printGenReg base; stream "+"; printGenReg index; if signExtendIndex then stream " SX" else (); stream "]" ) | printICode {stream, printGenReg, printFPReg, ...} (StoreFPWithIndexedOffset{ base, source, index, floatSize, signExtendIndex }) = ( stream "\tStoreIndexed"; printFloatSize(floatSize, stream); stream "\t"; printFPReg source; stream " => ["; printGenReg base; stream "+"; printGenReg index; if signExtendIndex then stream " SX" else (); stream "]" ) | printICode {stream, printGenReg, printOptGenReg, ...} (AddSubImmediate{ source, dest, ccRef, immed, isAdd, length }) = ( stream (if isAdd then "\tAddImmediate" else "\tSubImmediate"); stream(arithRepr length); stream "\t"; printGenReg source; stream ",0x"; stream(Word.toString immed); stream " => "; printOptGenReg dest; case ccRef of NONE => () | SOME cc => (stream ", "; printCC(cc, stream)) ) | printICode {stream, printGenReg, printOptGenReg, ...} (AddSubRegister{ base, shifted, dest, ccRef, isAdd, length, shift }) = ( stream (if 
isAdd then "\tAddRegister" else "\tSubRegister"); stream(arithRepr length); stream "\t"; printGenReg base; stream ", "; printGenReg(shifted); printShift(shift, stream); stream " => "; printOptGenReg dest; case ccRef of NONE => () | SOME cc => (stream ", "; printCC(cc, stream)) ) | printICode {stream, printGenReg, printOptGenReg, ...} (LogicalImmediate{ source, dest, ccRef, immed, logOp, length }) = ( stream (case logOp of LogAnd => "\tAndImmediate" | LogOr => "\tOrImmediate" | LogXor => "\tXorImmediate"); stream(arithRepr length); stream "\t"; printGenReg source; stream ",0x"; stream(Word64.toString immed); stream " => "; printOptGenReg dest; case ccRef of NONE => () | SOME cc => (stream ", "; printCC(cc, stream)) ) | printICode {stream, printGenReg, printOptGenReg, ...} (LogicalRegister{ base, shifted, dest, ccRef, logOp, length, shift }) = ( stream (case logOp of LogAnd => "\tAndRegister" | LogOr => "\tOrRegister" | LogXor => "\tXorRegister"); stream(arithRepr length); stream "\t"; printGenReg base; stream ", "; printGenReg(shifted); printShift(shift, stream); stream " => "; printOptGenReg dest; case ccRef of NONE => () | SOME cc => (stream ", "; printCC(cc, stream)) ) | printICode {stream, printGenReg, ...} (ShiftRegister{ direction, dest, source, shift, opSize }) = ( stream ( case direction of ShiftLeft => "\tShiftLeft" | ShiftRightLogical => "\tShiftRightLog" | ShiftRightArithmetic => "\tShiftRightArith"); stream(arithRepr opSize); stream "\t"; printGenReg source; stream " by "; printGenReg(shift); stream " => "; printGenReg dest ) | printICode {stream, printGenReg, printOptGenReg, ...} (Multiplication{ kind, dest, sourceA, sourceM, sourceN }) = ( stream ( case kind of MultAdd32 => "\tMultAdd32\t" | MultSub32 => "\tMultSub32\t" | MultAdd64 => "\tMultAdd64\t" | MultSub64 => "\tMultSub64\t" | SignedMultAddLong => "\tSignedMultAddLong\t" | SignedMultHigh => "\tSignedMultHigh\t"); printGenReg(sourceM); stream " * "; printGenReg(sourceN); stream " +/- "; printOptGenReg sourceA; stream " => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (Division{ isSigned, dest, dividend, divisor, opSize }) = ( stream (if isSigned then "\tSignedDivide" else "\tUnsignedDivide"); stream(arithRepr opSize); stream "\t"; printGenReg(dividend); stream " by "; printGenReg(divisor); stream " => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (BeginFunction{ regArgs, stackArgs }) = ( stream "\tBeginFunction\t"; List.app(fn (arg, r) => (stream(regRepr r); stream "="; printGenReg(arg); stream " ")) regArgs; List.app(fn s => printStackLoc(s, stream)) stackArgs ) | printICode {stream, printGenReg, ...} (FunctionCall{callKind, regArgs, stackArgs, dests, saveRegs, containers}) = ( stream "\tFunctionCall\t"; case callKind of Recursive => stream "recursive " | ConstantCode m => (stream(stringOfWord m); stream " ") | FullCall => (); stream "("; List.app(fn (arg, r) => (stream(regRepr r); stream "="; printArg(arg, stream, printGenReg); stream " ")) regArgs; List.app(fn arg => (stream "p="; printArg(arg, stream, printGenReg); stream " ")) stackArgs; stream ") "; List.app(fn (pr, r) => (stream(regRepr r); stream "=>"; printGenReg pr; stream " ")) dests; stream " save="; printSaves(saveRegs, stream, printGenReg); if null containers then () else (stream " containers="; List.app (fn c => (printStackLoc(c, stream); stream " ")) containers) ) | printICode {stream, printGenReg, ...} (TailRecursiveCall{callKind, regArgs, stackArgs, stackAdjust, currStackSize, ...}) = ( stream "\tTailCall\t"; case callKind 
of Recursive => stream "recursive " | ConstantCode m => (stream(stringOfWord m); stream " ") | FullCall => (); List.app(fn (arg, r) => (stream(regRepr r); stream "="; printArg(arg, stream, printGenReg); stream " ")) regArgs; List.app(fn {src, stack} => (stream (Int.toString stack); stream "<="; printArg(src, stream, printGenReg); stream " ")) stackArgs; stream "adjust="; stream(Int.toString stackAdjust); stream " stackSize="; stream(Int.toString currStackSize) ) | printICode {stream, printGenReg, ...} (ReturnResultFromFunction{ results, returnReg, numStackArgs }) = ( stream "\tReturnFromFunction\t"; printGenReg(returnReg); stream "with "; List.app(fn (reg, r) => (stream(regRepr r); stream "=>"; printGenReg reg; stream " ")) results; stream("," ^ Int.toString numStackArgs) ) | printICode {stream, printGenReg, ...} (RaiseExceptionPacket{ packetReg }) = ( stream "\tRaiseException\t"; printGenReg(packetReg) ) | printICode {stream, printGenReg, ...} (PushToStack{ source, copies, container }) = ( stream "\tPushToStack\t"; printGenReg source; if copies > 1 then (stream " * "; stream(Int.toString copies)) else (); stream " => "; printStackLoc(container, stream) ) | printICode {stream, printGenReg, ...} (LoadStack{ dest, wordOffset, container, field }) = ( stream "\tLoadStack\t"; printStackLoc(container, stream); stream " + "; stream(Int.toString field); stream " ("; stream(Int.toString wordOffset); stream ")"; stream " => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (StoreToStack{ source, container, field, stackOffset }) = ( stream "\tStoreToStack\t"; printGenReg source; stream " => "; printStackLoc(container, stream); stream "+"; stream (Int.toString field); stream "("; stream(Int.toString stackOffset); stream ")" ) | printICode {stream, printGenReg, ...} (ContainerAddress{ dest, container, stackOffset }) = ( stream "\tContainerAddress\t"; stream "@"; printStackLoc(container, stream); stream " ("; stream(Int.toString stackOffset); stream ") => "; printGenReg dest ) | printICode {stream, ...} (ResetStackPtr{ numWords }) = ( stream "\tResetStackPtr\t"; stream(Int.toString numWords) ) | printICode {stream, printGenReg, ...} (TagValue{ source, dest, isSigned, opSize }) = ( stream "\tTag"; stream(if isSigned then "Signed" else "Unsigned"); stream(arithRepr opSize); stream "\t"; printGenReg source; stream " => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (UntagValue{ source, dest, isSigned, opSize }) = ( stream "\tUntag"; stream(if isSigned then "Signed" else "Unsigned"); stream(arithRepr opSize); stream "\t"; printGenReg source; stream " => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (BoxLarge{source, dest, saveRegs}) = ( stream "\tBoxLarge\t"; printGenReg source; stream " => "; printGenReg dest; stream " save="; printSaves(saveRegs, stream, printGenReg) ) | printICode {stream, printGenReg, ...} (UnboxLarge{source, dest}) = ( stream "\tUnboxLarge\t"; printGenReg source; stream " => "; printGenReg dest ) | printICode {stream, printGenReg, printFPReg, ...} (BoxTagFloat{floatSize, source, dest, saveRegs}) = ( stream "\tBoxTagFloat"; printFloatSize(floatSize, stream); stream "\t"; printFPReg source; stream " => "; printGenReg dest; stream " save="; printSaves(saveRegs, stream, printGenReg) ) | printICode {stream, printGenReg, printFPReg, ...} (UnboxTagFloat{floatSize, source, dest}) = ( stream "\tUnboxTagFloat"; printFloatSize(floatSize, stream); stream "\t"; printGenReg source; stream " => "; printFPReg dest ) | printICode {stream, printGenReg, ...} 
(LoadAcquire{ base, dest, loadType }) = ( stream "\tLoadAcquire"; printLoadType(loadType, stream); stream "\t["; printGenReg base; stream "] => "; printGenReg dest ) | printICode {stream, printGenReg, ...} (StoreRelease{ base, source, loadType }) = ( stream "\tStoreRelease"; printLoadType(loadType, stream); stream "\t"; printGenReg source; stream " => ["; printGenReg base; stream "]" ) | printICode {stream, printGenReg, ...} (BitFieldShift{ source, dest, isSigned, length, immr, imms }) = ( stream "\tBitShift"; stream(if isSigned then "Signed" else "Unsigned"); stream(arithRepr length); stream "\t"; printGenReg source; stream " => "; printGenReg dest; stream " immr="; stream(Word.fmt StringCvt.DEC immr); stream " imms="; stream(Word.fmt StringCvt.DEC imms) ) | printICode {stream, printGenReg, ...} (BitFieldInsert{ source, dest, destAsSource, length, immr, imms }) = ( stream "\tBitInsert"; stream(arithRepr length); stream "\t"; printGenReg source; stream " with "; printGenReg destAsSource; stream " => "; printGenReg dest; stream " immr="; stream(Word.fmt StringCvt.DEC immr); stream " imms="; stream(Word.fmt StringCvt.DEC imms) ) | printICode {stream, printGenReg, ...} (IndexedCaseOperation{testReg}) = ( stream "\tIndexedCase\t"; printGenReg testReg ) | printICode {stream, ...} PushExceptionHandler = stream "\tPushExcHandler" | printICode {stream, ...} PopExceptionHandler = stream "\tPopExcHandler" | printICode {stream, printGenReg, ...} (BeginHandler{packetReg}) = ( stream "\tBeginHandler\t"; printGenReg packetReg ) | printICode {stream, printGenReg, ...} (CompareByteVectors{vec1Addr, vec2Addr, length, ccRef, ...}) = ( stream "\tCompareByteVectors\t"; printGenReg(vec1Addr); stream ","; printGenReg(vec2Addr); stream ","; printGenReg(length); stream " => "; printCC(ccRef, stream) ) | printICode {stream, printGenReg, ...} (BlockMove{srcAddr, destAddr, length, isByteMove}) = ( stream(if isByteMove then "\tBlockByteMove\t" else "\tBlockWordMove\t"); stream "src="; printGenReg(srcAddr); stream ",dest="; printGenReg(destAddr); stream ",len="; printGenReg(length) ) | printICode {stream, printGenReg, printOptGenReg, ...} (AddSubXSP{ source, dest, isAdd }) = ( stream(if isAdd then "\tAdd\t" else "\tSubtract\t"); printGenReg source; stream " XSP => "; printOptGenReg dest ) | printICode {stream, printGenReg, ...} (TouchValue{ source }) = ( stream "\tTouchValue\t"; printGenReg source ) | printICode {stream, printGenReg, ...} (LoadAcquireExclusive{ base, dest }) = ( stream "\tLoadExclusive\t["; printGenReg base; stream "] => "; printGenReg dest ) | printICode {stream, printGenReg, printOptGenReg, ...} (StoreReleaseExclusive{ base, source, result }) = ( stream "\tStoreExclusive\t"; printOptGenReg source; stream " => ["; printGenReg base; stream "] result => "; printGenReg result ) | printICode {stream, ...} MemoryBarrier = stream "\tMemoryBarrier" | printICode {stream, printGenReg, printFPReg, ...} (ConvertIntToFloat{ source, dest, srcSize, destSize}) = ( stream "\tConvert"; stream(arithRepr srcSize); stream "To"; printFloatSize(destSize, stream); stream "\t"; printGenReg source; stream " => "; printFPReg dest ) | printICode {stream, printGenReg, printFPReg, ...} (ConvertFloatToInt{ source, dest, srcSize, destSize, rounding}) = let open IEEEReal in stream "\tConvert"; printFloatSize(srcSize, stream); stream "To"; stream(arithRepr destSize); stream "\t"; printFPReg source; stream " => "; printGenReg dest; stream( case rounding of TO_NEAREST => " rounding" | TO_NEGINF => " rounding down" | TO_POSINF => " 
rounding up" | TO_ZERO => " truncating" ) end | printICode {stream, printFPReg, ...} (UnaryFloatingPt{ source, dest, fpOp}) = ( stream( case fpOp of NegFloat => "\tNegFloat\t" | NegDouble => "\tNegDouble\t" | AbsFloat => "\tAbsFloat\t" | AbsDouble => "\tAbsDouble\t" | ConvFloatToDble => "\tFloatToDble\t" | ConvDbleToFloat => "\t\t" ); printFPReg source; stream " => "; printFPReg dest ) | printICode {stream, printFPReg, ...} (BinaryFloatingPoint{ arg1, arg2, dest, fpOp, opSize}) = ( stream( case fpOp of MultiplyFP => "\tMultiply" | DivideFP => "\tDivide" | AddFP => "\tAdd" | SubtractFP => "\tSubtract" ); printFloatSize(opSize, stream); stream "\t"; printFPReg arg1; stream ", "; printFPReg arg2; stream " => "; printFPReg dest ) | printICode {stream, printFPReg, ...} (CompareFloatingPoint{ arg1, arg2, opSize, ccRef}) = ( stream "\tCompare"; printFloatSize(opSize, stream); stream "\t"; printFPReg arg1; stream ", "; printFPReg arg2; stream ", "; printCC(ccRef, stream) ) | printICode {stream, ...} CPUYield = stream "\tCpuYield" | printICode {stream, printGenReg, printOptGenReg, ...} (AtomicOperation{ base, source, dest, atOp }) = ( case atOp of LoadAddAL => stream "\tLoadAddAL\t" | LoadUmaxAL => stream "\tLoadUmaxAL\t" - | SwapAL => stream "\tSwapAL\t"; + | SwapAL => stream "\tSwapAL\t" + | LoadAddAcquire => stream "\tLoadAddAcquire\t" + | LoadUMaxAcquire => stream "\tLoadUMaxAcquire\t" + | SwapRelease => stream "\tSwapRelease\t"; printOptGenReg source; stream ",["; printGenReg base; stream "] => "; printOptGenReg dest ) | printICode {stream, printGenReg, ...} (CacheCheck{ arg1, arg2}) = ( stream "\tCacheCheck\t"; printGenReg arg1; stream ", "; printGenReg arg2 ) and printCondition(cond, stream) = stream(condToString cond) (* Print a basic block. *) fun printBlock {stream, printGenReg, printOptGenReg, printFPReg} (blockNo, BasicBlock{block, flow, ...}) = ( (* Put a label on all but the first. 
*) if blockNo <> 0 then stream("L" ^ Int.toString blockNo ^ ":") else (); List.app (fn icode => (printICode {stream=stream, printGenReg=printGenReg, printOptGenReg=printOptGenReg, printFPReg=printFPReg} (icode); stream "\n")) block; case flow of Unconditional l => stream("\tJump\tL" ^ Int.toString l ^ "\n") | Conditional {condition, trueJump, falseJump, ccRef, ...} => ( stream "\tJump"; printCondition(condition, stream); stream "\t"; printCC(ccRef, stream); stream " L"; stream (Int.toString trueJump); stream " else L"; stream (Int.toString falseJump); stream "\n" ) | ExitCode => () | IndexedBr _ => () | SetHandler{handler, continue} => stream(concat["\tSetHandler\tH", Int.toString handler, "\n", "\tJump\tL", Int.toString continue, "\n"]) | UnconditionalHandle handler => stream("\tJump\tH" ^ Int.toString handler ^ "\n") | ConditionalHandle{handler, continue} => stream(concat["\tJump\tL", Int.toString continue, " or H", Int.toString handler, "\n"]) ) in fun printPReg stream (PReg i) = stream("R" ^ Int.toString i) fun printOptPReg stream ZeroReg = stream "Zero" | printOptPReg stream (SomeReg reg) = printPReg stream reg fun printXReg stream (XReg w) = stream("X" ^ Int.toString(Word8.toInt w)) | printXReg stream XZero = stream "XZ" | printXReg stream XSP = stream "XSP" fun printVReg stream (VReg w) = stream("V" ^ Int.toString(Word8.toInt w)) fun printICodeAbstract(blockVec, stream) = Vector.appi(printBlock{stream=stream, printGenReg=printPReg stream, printOptGenReg=printOptPReg stream, printFPReg=printPReg stream}) blockVec and printICodeConcrete(blockVec, stream) = Vector.appi(printBlock{stream=stream, printGenReg=printXReg stream, printOptGenReg=printXReg stream, printFPReg=printVReg stream}) blockVec end (* Only certain bit patterns are allowed in a logical immediate instruction but the encoding is complex so it's easiest to inherit the test from the assembler layer. *) local fun optow OpSize32 = WordSize32 | optow OpSize64 = WordSize64 in fun isEncodableBitPattern(v, w) = Arm64Code.isEncodableBitPattern(v, optow w) end (* This generates a BitField instruction with the appropriate values for immr and imms. 
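As a worked example of the encoding below (d and s stand for arbitrary registers): shiftConstant{direction=ShiftLeft, dest=d, source=s, shift=0w3, opSize=OpSize64} produces immr = ~3 mod 64 = 61 and imms = 64-1-3 = 60, the standard UBFM encoding of LSL #3, while a 64-bit logical shift right by 3 produces immr = 3 and imms = 0wx3f.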
*) fun shiftConstant{ direction, dest, source, shift, opSize } = let val (isSigned, immr, imms) = case (direction, opSize) of (ShiftLeft, OpSize64) => (false, Word.~ shift mod 0w64, 0w64-0w1-shift) | (ShiftLeft, OpSize32) => (false, Word.~ shift mod 0w32, 0w32-0w1-shift) | (ShiftRightLogical, OpSize64) => (false, shift, 0wx3f) | (ShiftRightLogical, OpSize32) => (false, shift, 0wx1f) | (ShiftRightArithmetic, OpSize64) => (true, shift, 0wx3f) | (ShiftRightArithmetic, OpSize32) => (true, shift, 0wx1f) in BitFieldShift{ source=source, dest=dest, isSigned=isSigned, length=opSize, immr=immr, imms=imms } end structure Sharing = struct type xReg = xReg and vReg = vReg and reg = reg and condition = condition and shiftType = shiftType and ('genReg, 'optGenReg, 'fpReg) arm64ICode = ('genReg, 'optGenReg, 'fpReg) arm64ICode and preg = preg and pregOrZero = pregOrZero and controlFlow = controlFlow and ('genReg, 'optGenReg, 'fpReg) basicBlock = ('genReg, 'optGenReg, 'fpReg) basicBlock and stackLocn = stackLocn and regProperty = regProperty and ccRef = ccRef and 'genReg fnarg = 'genReg fnarg and closureRef = closureRef and loadType = loadType and opSize = opSize and logicalOp = logicalOp and callKind = callKind and floatSize = floatSize and shiftDirection = shiftDirection and multKind = multKind and fpUnary = fpUnary and fpBinary = fpBinary and atomicOp = atomicOp end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PreAssembly.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PreAssembly.ML index 0c8af938..ae3d5acc 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PreAssembly.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PreAssembly.ML @@ -1,1098 +1,1101 @@ (* Copyright (c) 2021-2 David C. J. Matthews This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public Licence version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public Licence for more details. You should have received a copy of the GNU Lesser General Public Licence along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) (* The pre-assembly layer goes below the icode and allows peep-hole optimisation. *) functor Arm64PreAssembly( structure Arm64Assembly: ARM64ASSEMBLY structure Debug: DEBUG structure Pretty: PRETTY ): ARM64PREASSEMBLY = struct open Arm64Assembly exception InternalError = Misc.InternalError (* Reversed cons and append to make the code easier to read. 
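For example, with an accumulator built in reverse, code <::> instrA <::> instrB evaluates to instrB :: instrA :: code, so the instructions still read in program order (instrA and instrB stand for arbitrary instructions).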
*) infix 5 <::> <@> fun tl <::> hd = hd :: tl and snd <@> fst = fst @ snd (* Many of the datatypes are inherited from Arm64Assembly *) datatype loadType = Load64 | Load32 | Load16 | Load8 and opSize = OpSize32 | OpSize64 and logicalOp = LogAnd | LogOr | LogXor and floatSize = Float32 | Double64 and shiftDirection = ShiftLeft | ShiftRightLogical | ShiftRightArithmetic and multKind = MultAdd32 | MultSub32 | MultAdd64 | MultSub64 | SignedMultAddLong (* 32bit*32bit + 64bit => 64Bit *) | SignedMultHigh (* High order part of 64bit*64Bit *) and fpUnary = NegFloat | NegDouble | AbsFloat | AbsDouble | ConvFloatToDble | ConvDbleToFloat and fpBinary = MultiplyFP | DivideFP | AddFP | SubtractFP and unscaledType = NoUpdate | PreIndex | PostIndex and condSet = CondSet | CondSetIncr | CondSetInvert | CondSetNegate and bitfieldKind = BFUnsigned | BFSigned | BFInsert and brRegType = BRRBranch | BRRAndLink | BRRReturn (* Some of the atomic operations added in 8.1 *) - and atomicOp = LoadAddAL | LoadUmaxAL | SwapAL + and atomicOp = LoadAddAL | LoadUmaxAL | SwapAL | LoadAddAcquire | LoadUMaxAcquire | SwapRelease datatype label = Label of int type labelMaker = int ref fun createLabelMaker() = ref 0 fun createLabel(r as ref n) = Label n before r := n+1 datatype precode = (* Basic instructions *) AddImmediate of {regN: xReg, regD: xReg, immed: word, shifted: bool, opSize: opSize, setFlags: bool} | SubImmediate of {regN: xReg, regD: xReg, immed: word, shifted: bool, opSize: opSize, setFlags: bool} | AddShiftedReg of {regM: xReg, regN: xReg, regD: xReg, shift: shiftType, opSize: opSize, setFlags: bool} | SubShiftedReg of {regM: xReg, regN: xReg, regD: xReg, shift: shiftType, opSize: opSize, setFlags: bool} | AddExtendedReg of {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend, opSize: opSize, setFlags: bool} | SubExtendedReg of {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend, opSize: opSize, setFlags: bool} | MultiplyAndAddSub of {regM: xReg, regN: xReg, regA: xReg, regD: xReg, multKind: multKind} | DivideRegs of {regM: xReg, regN: xReg, regD: xReg, isSigned: bool, opSize: opSize} | LogicalShiftedReg of {regM: xReg, regN: xReg, regD: xReg, shift: shiftType, logOp: logicalOp, opSize: opSize, setFlags: bool} | LoadRegScaled of {regT: xReg, regN: xReg, unitOffset: int, loadType: loadType} | LoadFPRegScaled of {regT: vReg, regN: xReg, unitOffset: int, floatSize: floatSize} | StoreRegScaled of {regT: xReg, regN: xReg, unitOffset: int, loadType: loadType} | StoreFPRegScaled of {regT: vReg, regN: xReg, unitOffset: int, floatSize: floatSize} | LoadRegUnscaled of {regT: xReg, regN: xReg, byteOffset: int, loadType: loadType, unscaledType: unscaledType} | StoreRegUnscaled of {regT: xReg, regN: xReg, byteOffset: int, loadType: loadType, unscaledType: unscaledType} | LoadFPRegUnscaled of {regT: vReg, regN: xReg, byteOffset: int, floatSize: floatSize, unscaledType: unscaledType} | StoreFPRegUnscaled of {regT: vReg, regN: xReg, byteOffset: int, floatSize: floatSize, unscaledType: unscaledType} | LoadRegIndexed of {regT: xReg, regN: xReg, regM: xReg, loadType: loadType, option: scale extend} | StoreRegIndexed of {regT: xReg, regN: xReg, regM: xReg, loadType: loadType, option: scale extend} | LoadFPRegIndexed of {regT: vReg, regN: xReg, regM: xReg, floatSize: floatSize, option: scale extend} | StoreFPRegIndexed of {regT: vReg, regN: xReg, regM: xReg, floatSize: floatSize, option: scale extend} (* LoadAcquire and StoreRelease are used for mutables. 
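These correspond to the ARM64 load-acquire (LDAR) and store-release (STLR) family: a writer that fills in an object and then publishes its address with a store-release guarantees that a reader which load-acquires the address also sees the object's contents.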
*) | LoadAcquireReg of {regN: xReg, regT: xReg, loadType: loadType} | StoreReleaseReg of {regN: xReg, regT: xReg, loadType: loadType} (* LoadAcquireExclusiveRegister and StoreReleaseExclusiveRegister are used for mutexes. *) | LoadAcquireExclusiveRegister of {regN: xReg, regT: xReg} | StoreReleaseExclusiveRegister of {regS: xReg, regT: xReg, regN: xReg} | MemBarrier (* Additional atomic operations. *) | AtomicExtension of { regT: xReg, regN: xReg, regS: xReg, atOp: atomicOp } | LoadRegPair of { regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int, loadType: loadType, unscaledType: unscaledType} | StoreRegPair of { regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int, loadType: loadType, unscaledType: unscaledType} | LoadFPRegPair of { regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int, floatSize: floatSize, unscaledType: unscaledType} | StoreFPRegPair of { regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int, floatSize: floatSize, unscaledType: unscaledType} | ConditionalSet of {regD: xReg, regTrue: xReg, regFalse: xReg, cond: condition, condSet: condSet, opSize: opSize} | BitField of {immr: word, imms: word, regN: xReg, regD: xReg, opSize: opSize, bitfieldKind: bitfieldKind} | ShiftRegisterVariable of {regM: xReg, regN: xReg, regD: xReg, opSize: opSize, shiftDirection: shiftDirection} | BitwiseLogical of { bits: Word64.word, regN: xReg, regD: xReg, opSize: opSize, setFlags: bool, logOp: logicalOp} (* Floating point *) | MoveGeneralToFP of { regN: xReg, regD: vReg, floatSize: floatSize} | MoveFPToGeneral of {regN: vReg, regD: xReg, floatSize: floatSize} | CvtIntToFP of { regN: xReg, regD: vReg, floatSize: floatSize, opSize: opSize} | CvtFloatToInt of { round: IEEEReal.rounding_mode, regN: vReg, regD: xReg, floatSize: floatSize, opSize: opSize} | FPBinaryOp of { regM: vReg, regN: vReg, regD: vReg, floatSize: floatSize, fpOp: fpBinary} | FPComparison of { regM: vReg, regN: vReg, floatSize: floatSize} | FPUnaryOp of {regN: vReg, regD: vReg, fpOp: fpUnary} (* Branches and Labels. *) | SetLabel of label | ConditionalBranch of condition * label | UnconditionalBranch of label | BranchAndLink of label | BranchReg of {regD: xReg, brRegType: brRegType } | LoadLabelAddress of xReg * label | TestBitBranch of { test: xReg, bit: Word8.word, label: label, onZero: bool } | CompareBranch of { test: xReg, label: label, onZero: bool, opSize: opSize } (* Composite instructions *) | MoveXRegToXReg of {sReg: xReg, dReg: xReg} | LoadNonAddr of xReg * Word64.word | LoadFPConst of {dest: vReg, value: Word64.word, floatSize: floatSize, work: xReg} | LoadAddr of xReg * machineWord | RTSTrap of { rtsEntry: int, work: xReg, save: xReg list } | AllocateMemoryFixedSize of { bytes: word, dest: xReg, save: xReg list, work: xReg } | AllocateMemoryVariableSize of { sizeReg: xReg, dest: xReg, save: xReg list, work: xReg } (* Branch table for indexed case. startLabel is the address of the first label in the list. The branch table is a sequence of unconditional branches. *) | BranchTable of { startLabel: label, brTable: label list } | LoadGlobalHeapBaseInCallback of xReg | Yield (* Optimise the pre-assembler code and then generate the final code. *) fun generateFinalCode {instrs, name, parameters, resultClosure, profileObject, labelMaker=ref labelCount} = let val labelTargets = Array.tabulate(labelCount, fn i => (Arm64Assembly.createLabel(), i) ) (* Follow the chain of forwarded labels. 
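For example (label numbers invented): if label 5 has been forwarded to 2 and 2 to 7, where 7 maps to itself, getLabel 5 follows 5 -> 2 -> 7 and returns the entry for 7; a cycle such as 5 -> 2 -> 5 is caught by the visited list and raises InternalError rather than looping.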
*) local fun forwardLab(labelNo, labels) = let val dest as (_, dNo) = Array.sub(labelTargets, labelNo) in if dNo = labelNo then dest (* This should not happen but just in case... *) else if List.exists(fn i => i = dNo) labels then raise InternalError "Infinite loop" else forwardLab(dNo, dNo::labels) end in fun getLabel labelNo = forwardLab(labelNo, [labelNo]) val getLabelTarget = #1 o getLabel end fun toAssembler([], code) = code | toAssembler(AddImmediate{regN, regD, immed, shifted, opSize, setFlags} :: rest, code) = let val instr = case (opSize, setFlags) of (OpSize64, false) => addImmediate | (OpSize32, false) => addImmediate32 | (OpSize64, true) => addSImmediate | (OpSize32, true) => addSImmediate32 in toAssembler(rest, code <::> instr{regN=regN, regD=regD, immed=immed, shifted=shifted}) end | toAssembler(SubImmediate{regN, regD, immed, shifted, opSize, setFlags} :: rest, code) = let val instr = case (opSize, setFlags) of (OpSize64, false) => subImmediate | (OpSize32, false) => subImmediate32 | (OpSize64, true) => subSImmediate | (OpSize32, true) => subSImmediate32 in toAssembler(rest, code <::> instr{regN=regN, regD=regD, immed=immed, shifted=shifted}) end | toAssembler(AddShiftedReg{regM, regN, regD, shift, opSize, setFlags} :: rest, code) = let val instr = case (opSize, setFlags) of (OpSize64, false) => addShiftedReg | (OpSize32, false) => addShiftedReg32 | (OpSize64, true) => addSShiftedReg | (OpSize32, true) => addSShiftedReg32 in toAssembler(rest, code <::> instr{regM=regM, regN=regN, regD=regD, shift=shift}) end | toAssembler(SubShiftedReg{regM, regN, regD, shift, opSize, setFlags} :: rest, code) = let val instr = case (opSize, setFlags) of (OpSize64, false) => subShiftedReg | (OpSize32, false) => subShiftedReg32 | (OpSize64, true) => subSShiftedReg | (OpSize32, true) => subSShiftedReg32 in toAssembler(rest, code <::> instr{regM=regM, regN=regN, regD=regD, shift=shift}) end | toAssembler(AddExtendedReg{regM, regN, regD, extend, opSize, setFlags} :: rest, code) = (* Add/SubExtended are only used to access XSP. 
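Register number 31 means XSP in the extended-register and immediate forms of add/subtract but XZR in the shifted-register forms, so the extended form is the one needed to read or modify the stack pointer.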
let val instr = case (opSize, setFlags) of (OpSize64, false) => addExtendedReg | (OpSize32, false) => raise InternalError "AddExtendedReg; 32" | (OpSize64, true) => addSExtendedReg | (OpSize32, true) => raise InternalError "AddExtendedReg; 32" in toAssembler(rest, code <::> instr{regM=regM, regN=regN, regD=regD, extend=extend}) end | toAssembler(SubExtendedReg{regM, regN, regD, extend, opSize, setFlags} :: rest, code) = let val instr = case (opSize, setFlags) of (OpSize64, false) => subExtendedReg | (OpSize32, false) => raise InternalError "SubExtendedReg; 32" | (OpSize64, true) => subSExtendedReg | (OpSize32, true) => raise InternalError "SubExtendedReg; 32" in toAssembler(rest, code <::> instr{regM=regM, regN=regN, regD=regD, extend=extend}) end | toAssembler(MultiplyAndAddSub{regM, regN, regA, regD, multKind} :: rest, code) = let val instr = case multKind of MultAdd32 => multiplyAndAdd32{regM=regM, regN=regN, regA=regA, regD=regD} | MultSub32 => multiplyAndSub32{regM=regM, regN=regN, regA=regA, regD=regD} | MultAdd64 => multiplyAndAdd{regM=regM, regN=regN, regA=regA, regD=regD} | MultSub64 => multiplyAndSub{regM=regM, regN=regN, regA=regA, regD=regD} | SignedMultAddLong => signedMultiplyAndAddLong{regM=regM, regN=regN, regA=regA, regD=regD} | SignedMultHigh => signedMultiplyHigh{regM=regM, regN=regN, regD=regD} in toAssembler(rest, code <::> instr) end | toAssembler(DivideRegs{regM, regN, regD, isSigned, opSize} :: rest, code) = let val instr = case (isSigned, opSize) of (true, OpSize64) => signedDivide | (true, OpSize32) => signedDivide32 | (false, OpSize64) => unsignedDivide | (false, OpSize32) => unsignedDivide32 in toAssembler(rest, code <::> instr{regN=regN, regM=regM, regD=regD}) end | toAssembler(LogicalShiftedReg{regM, regN, regD, shift, logOp, opSize, setFlags} :: rest, code) = let val instr = case (logOp, setFlags, opSize) of (LogAnd, false, OpSize64) => andShiftedReg | (LogAnd, true, OpSize64) => andsShiftedReg | (LogOr, false, OpSize64) => orrShiftedReg | (LogXor, false, OpSize64) => eorShiftedReg | (LogAnd, false, OpSize32) => andShiftedReg32 | (LogAnd, true, OpSize32) => andsShiftedReg32 | (LogOr, false, OpSize32) => orrShiftedReg32 | (LogXor, false, OpSize32) => eorShiftedReg32 | _ => raise InternalError "setFlags not valid with OR or XOR" (* There are also versions of AND/OR/XOR which operate on a complement (NOT) of the shifted register. It's probably not worth looking for a use for them.
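(These are the BIC, ORN and EON instructions.)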
        |   toAssembler(LoadRegScaled{regT, regN, unitOffset, loadType} :: rest, code) =
            let
                val instr =
                    case loadType of
                        Load64 => loadRegScaled
                    |   Load32 => loadRegScaled32
                    |   Load16 => loadRegScaled16
                    |   Load8 => loadRegScaledByte
            in
                toAssembler(rest, code <::> instr{regT=regT, regN=regN, unitOffset=unitOffset})
            end

        |   toAssembler(StoreRegScaled{regT, regN, unitOffset, loadType} :: rest, code) =
            let
                val instr =
                    case loadType of
                        Load64 => storeRegScaled
                    |   Load32 => storeRegScaled32
                    |   Load16 => storeRegScaled16
                    |   Load8 => storeRegScaledByte
            in
                toAssembler(rest, code <::> instr{regT=regT, regN=regN, unitOffset=unitOffset})
            end

        |   toAssembler(LoadFPRegScaled{regT, regN, unitOffset, floatSize} :: rest, code) =
            let
                val instr =
                    case floatSize of Float32 => loadRegScaledFloat | Double64 => loadRegScaledDouble
            in
                toAssembler(rest, code <::> instr{regT=regT, regN=regN, unitOffset=unitOffset})
            end

        |   toAssembler(StoreFPRegScaled{regT, regN, unitOffset, floatSize} :: rest, code) =
            let
                val instr =
                    case floatSize of Float32 => storeRegScaledFloat | Double64 => storeRegScaledDouble
            in
                toAssembler(rest, code <::> instr{regT=regT, regN=regN, unitOffset=unitOffset})
            end

        |   toAssembler(LoadRegUnscaled{regT, regN, byteOffset, loadType, unscaledType} :: rest, code) =
            let
                val instr =
                    case (loadType, unscaledType) of
                        (Load64, NoUpdate) => loadRegUnscaled
                    |   (Load32, NoUpdate) => loadRegUnscaled32
                    |   (Load16, NoUpdate) => loadRegUnscaled16
                    |   (Load8, NoUpdate) => loadRegUnscaledByte
                    |   (Load64, PreIndex) => loadRegPreIndex
                    |   (Load32, PreIndex) => loadRegPreIndex32
                    |   (Load16, PreIndex) => raise InternalError "loadRegPreIndex16"
                    |   (Load8, PreIndex) => loadRegPreIndexByte
                    |   (Load64, PostIndex) => loadRegPostIndex
                    |   (Load32, PostIndex) => loadRegPostIndex32
                    |   (Load16, PostIndex) => raise InternalError "loadRegPostIndex16"
                    |   (Load8, PostIndex) => loadRegPostIndexByte
            in
                toAssembler(rest, code <::> instr{regT=regT, regN=regN, byteOffset=byteOffset})
            end

        |   toAssembler(LoadFPRegUnscaled{regT, regN, byteOffset, floatSize, unscaledType} :: rest, code) =
            let
                val instr =
                    case (floatSize, unscaledType) of
                        (Float32, NoUpdate) => loadRegUnscaledFloat
                    |   (Double64, NoUpdate) => loadRegUnscaledDouble
                    |   _ => raise InternalError "LoadFPRegUnscaled: pre/post indexed"
            in
                toAssembler(rest, code <::> instr{regT=regT, regN=regN, byteOffset=byteOffset})
            end

        |   toAssembler(StoreRegUnscaled{regT, regN, byteOffset, loadType, unscaledType} :: rest, code) =
            let
                val instr =
                    case (loadType, unscaledType) of
                        (Load64, NoUpdate) => storeRegUnscaled
                    |   (Load32, NoUpdate) => storeRegUnscaled32
                    |   (Load16, NoUpdate) => storeRegUnscaled16
                    |   (Load8, NoUpdate) => storeRegUnscaledByte
                    |   (Load64, PreIndex) => storeRegPreIndex
                    |   (Load32, PreIndex) => storeRegPreIndex32
                    |   (Load16, PreIndex) => raise InternalError "storeRegPreIndex16"
                    |   (Load8, PreIndex) => storeRegPreIndexByte
                    |   (Load64, PostIndex) => storeRegPostIndex
                    |   (Load32, PostIndex) => storeRegPostIndex32
                    |   (Load16, PostIndex) => raise InternalError "storeRegPostIndex16"
                    |   (Load8, PostIndex) => storeRegPostIndexByte
            in
                toAssembler(rest, code <::> instr{regT=regT, regN=regN, byteOffset=byteOffset})
            end

        |   toAssembler(StoreFPRegUnscaled{regT, regN, byteOffset, floatSize, unscaledType} :: rest, code) =
            let
                val instr =
                    case (floatSize, unscaledType) of
                        (Float32, NoUpdate) => storeRegUnscaledFloat
                    |   (Double64, NoUpdate) => storeRegUnscaledDouble
                    |   _ => raise InternalError "StoreFPRegUnscaled: pre/post indexed"
            in
                toAssembler(rest, code <::> instr{regT=regT, regN=regN, byteOffset=byteOffset})
            end
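        (* Note (illustrative): the scaled forms take unitOffset in multiples of
           the transfer size, so unitOffset=2 with Load64 addresses [regN, #16];
           the unscaled forms take a signed byteOffset and are also the ones used
           for the pre- and post-indexed writeback modes. *)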
        |   toAssembler(LoadRegIndexed{regT, regN, regM, loadType, option} :: rest, code) =
            let
                val instr =
                    case loadType of
                        Load64 => loadRegIndexed
                    |   Load32 => loadRegIndexed32
                    |   Load16 => loadRegIndexed16
                    |   Load8 => loadRegIndexedByte
            in
                toAssembler(rest, code <::> instr{regT=regT, regN=regN, regM=regM, option=option})
            end

        |   toAssembler(StoreRegIndexed{regT, regN, regM, loadType, option} :: rest, code) =
            let
                val instr =
                    case loadType of
                        Load64 => storeRegIndexed
                    |   Load32 => storeRegIndexed32
                    |   Load16 => storeRegIndexed16
                    |   Load8 => storeRegIndexedByte
            in
                toAssembler(rest, code <::> instr{regT=regT, regN=regN, regM=regM, option=option})
            end

        |   toAssembler(LoadFPRegIndexed{regT, regN, regM, floatSize, option} :: rest, code) =
            let
                val instr =
                    case floatSize of Float32 => loadRegIndexedFloat | Double64 => loadRegIndexedDouble
            in
                toAssembler(rest, code <::> instr{regT=regT, regN=regN, regM=regM, option=option})
            end

        |   toAssembler(StoreFPRegIndexed{regT, regN, regM, floatSize, option} :: rest, code) =
            let
                val instr =
                    case floatSize of Float32 => storeRegIndexedFloat | Double64 => storeRegIndexedDouble
            in
                toAssembler(rest, code <::> instr{regT=regT, regN=regN, regM=regM, option=option})
            end

        |   toAssembler(LoadAcquireReg{regN, regT, loadType} :: rest, code) =
            let
                val loadInstr =
                    case loadType of
                        Load64 => loadAcquire
                    |   Load32 => loadAcquire32
                    |   Load8 => loadAcquireByte
                    |   _ => raise InternalError "LoadAcquire: Unsupported size" (* Not used *)
            in
                toAssembler(rest, code <::> loadInstr{regT=regT, regN=regN})
            end

        |   toAssembler(StoreReleaseReg{regN, regT, loadType} :: rest, code) =
            let
                val storeInstr =
                    case loadType of
                        Load64 => storeRelease
                    |   Load32 => storeRelease32
                    |   Load8 => storeReleaseByte
                    |   _ => raise InternalError "StoreRelease: Unsupported size" (* Not used *)
            in
                toAssembler(rest, code <::> storeInstr{regT=regT, regN=regN})
            end

        |   toAssembler(LoadAcquireExclusiveRegister{regN, regT} :: rest, code) =
                toAssembler(rest, code <::> loadAcquireExclusiveRegister{regN=regN, regT=regT})

        |   toAssembler(StoreReleaseExclusiveRegister{regN, regT, regS} :: rest, code) =
                toAssembler(rest, code <::> storeReleaseExclusiveRegister{regN=regN, regT=regT, regS=regS})

        |   toAssembler(MemBarrier :: rest, code) = toAssembler(rest, code <::> dmbIsh)

        |   toAssembler(AtomicExtension{ regT, regN, regS, atOp} :: rest, code) =
            let
-               val instr = case atOp of LoadAddAL => loadAddAL | LoadUmaxAL => loadUMaxAL | SwapAL => swapAL
+               val instr =
+                   case atOp of
+                       LoadAddAL => loadAddAL | LoadUmaxAL => loadUMaxAL | SwapAL => swapAL
+                   |   LoadAddAcquire => loadAddA | LoadUMaxAcquire => loadUMaxA | SwapRelease => swapL
            in
                toAssembler(rest, code <::> instr{regT=regT, regN=regN, regS=regS})
            end
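        (* Note, added here and not in the source: these select the ARMv8.1 LSE
           atomics.  Going by the naming, the AL forms (e.g. LDADDAL) have both
           acquire and release semantics, while the newly added cases pick the
           acquire-only (loadAddA, loadUMaxA) and release-only (swapL) variants. *)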
        |   toAssembler(LoadRegPair{ regT1, regT2, regN, unitOffset, loadType, unscaledType} :: rest, code) =
            let
                val _ = regT1 <> regT2 orelse raise InternalError "LoadRegPair: same register"
                val instr =
                    case (loadType, unscaledType) of
                        (Load64, NoUpdate) => loadPairOffset
                    |   (Load64, PreIndex) => loadPairPreIndexed
                    |   (Load64, PostIndex) => loadPairPostIndexed
                    |   (Load32, NoUpdate) => loadPairOffset32
                    |   (Load32, PreIndex) => loadPairPreIndexed32
                    |   (Load32, PostIndex) => loadPairPostIndexed32
                    |   _ => raise InternalError "LoadRegPair: unimplemented"
            in
                toAssembler(rest, code <::> instr{regT1=regT1, regT2=regT2, regN=regN, unitOffset=unitOffset})
            end

        |   toAssembler(StoreRegPair{ regT1, regT2, regN, unitOffset, loadType, unscaledType} :: rest, code) =
            let
                val instr =
                    case (loadType, unscaledType) of
                        (Load64, NoUpdate) => storePairOffset
                    |   (Load64, PreIndex) => storePairPreIndexed
                    |   (Load64, PostIndex) => storePairPostIndexed
                    |   (Load32, NoUpdate) => storePairOffset32
                    |   (Load32, PreIndex) => storePairPreIndexed32
                    |   (Load32, PostIndex) => storePairPostIndexed32
                    |   _ => raise InternalError "StoreRegPair: unimplemented"
            in
                toAssembler(rest, code <::> instr{regT1=regT1, regT2=regT2, regN=regN, unitOffset=unitOffset})
            end

        |   toAssembler(LoadFPRegPair{ regT1, regT2, regN, unitOffset, floatSize, unscaledType} :: rest, code) =
            let
                val _ = regT1 <> regT2 orelse raise InternalError "LoadFPRegPair: same register"
                val instr =
                    case (floatSize, unscaledType) of
                        (Double64, NoUpdate) => loadPairOffsetDouble
                    |   (Double64, PreIndex) => loadPairPreIndexedDouble
                    |   (Double64, PostIndex) => loadPairPostIndexedDouble
                    |   (Float32, NoUpdate) => loadPairOffsetFloat
                    |   (Float32, PreIndex) => loadPairPreIndexedFloat
                    |   (Float32, PostIndex) => loadPairPostIndexedFloat
            in
                toAssembler(rest, code <::> instr{regT1=regT1, regT2=regT2, regN=regN, unitOffset=unitOffset})
            end

        |   toAssembler(StoreFPRegPair{ regT1, regT2, regN, unitOffset, floatSize, unscaledType} :: rest, code) =
            let
                val instr =
                    case (floatSize, unscaledType) of
                        (Double64, NoUpdate) => storePairOffsetDouble
                    |   (Double64, PreIndex) => storePairPreIndexedDouble
                    |   (Double64, PostIndex) => storePairPostIndexedDouble
                    |   (Float32, NoUpdate) => storePairOffsetFloat
                    |   (Float32, PreIndex) => storePairPreIndexedFloat
                    |   (Float32, PostIndex) => storePairPostIndexedFloat
            in
                toAssembler(rest, code <::> instr{regT1=regT1, regT2=regT2, regN=regN, unitOffset=unitOffset})
            end

        |   toAssembler(ConditionalSet{regD, regTrue, regFalse, cond, condSet, opSize} :: rest, code) =
            let
                val instr =
                    case (condSet, opSize) of
                        (CondSet, OpSize64) => conditionalSet
                    |   (CondSetIncr, OpSize64) => conditionalSetIncrement
                    |   (CondSetInvert, OpSize64) => conditionalSetInverted
                    |   (CondSetNegate, OpSize64) => conditionalSetNegated
                    |   (CondSet, OpSize32) => conditionalSet32
                    |   (CondSetIncr, OpSize32) => conditionalSetIncrement32
                    |   (CondSetInvert, OpSize32) => conditionalSetInverted32
                    |   (CondSetNegate, OpSize32) => conditionalSetNegated32
            in
                toAssembler(rest, code <::> instr{regD=regD, regTrue=regTrue, regFalse=regFalse, cond=cond})
            end
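        (* Note (illustrative): conditionalSetIncrement corresponds to CSINC,
           i.e. regD := if cond then regTrue else regFalse + 1; the Invert and
           Negate forms are CSINV and CSNEG.  With XZero as both sources this is
           the usual way of materialising a 0/1 value from a condition. *)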
        |   toAssembler(BitField{immr, imms, regN, regD, opSize, bitfieldKind} :: rest, code) =
            let
                val bfInstr =
                    case (bitfieldKind, opSize) of
                        (BFSigned, OpSize64) => signedBitfieldMove64
                    |   (BFUnsigned, OpSize64) => unsignedBitfieldMove64
                    |   (BFInsert, OpSize64) => bitfieldMove64
                    |   (BFSigned, OpSize32) => signedBitfieldMove32
                    |   (BFUnsigned, OpSize32) => unsignedBitfieldMove32
                    |   (BFInsert, OpSize32) => bitfieldMove32
            in
                toAssembler(rest, code <::> bfInstr{immr=immr, imms=imms, regN=regN, regD=regD})
            end

        |   toAssembler(ShiftRegisterVariable{regM, regN, regD, opSize, shiftDirection} :: rest, code) =
            let
                val instr =
                    case (shiftDirection, opSize) of
                        (ShiftLeft, OpSize64) => logicalShiftLeftVariable
                    |   (ShiftLeft, OpSize32) => logicalShiftLeftVariable32
                    |   (ShiftRightLogical, OpSize64) => logicalShiftRightVariable
                    |   (ShiftRightLogical, OpSize32) => logicalShiftRightVariable32
                    |   (ShiftRightArithmetic, OpSize64) => arithmeticShiftRightVariable
                    |   (ShiftRightArithmetic, OpSize32) => arithmeticShiftRightVariable32
            in
                toAssembler(rest, code <::> instr{regN=regN, regM=regM, regD=regD})
            end

        |   toAssembler(BitwiseLogical{ bits, regN, regD, opSize, setFlags, logOp} :: rest, code) =
            let
                val instr =
                    case (logOp, setFlags, opSize) of
                        (LogAnd, false, OpSize64) => bitwiseAndImmediate
                    |   (LogAnd, true, OpSize64) => bitwiseAndSImmediate
                    |   (LogOr, false, OpSize64) => bitwiseOrImmediate
                    |   (LogXor, false, OpSize64) => bitwiseXorImmediate
                    |   (LogAnd, false, OpSize32) => bitwiseAndImmediate32
                    |   (LogAnd, true, OpSize32) => bitwiseAndSImmediate32
                    |   (LogOr, false, OpSize32) => bitwiseOrImmediate32
                    |   (LogXor, false, OpSize32) => bitwiseXorImmediate32
                    |   _ => raise InternalError "flags not valid with OR or XOR"
            in
                toAssembler(rest, code <::> instr{regN=regN, regD=regD, bits=bits})
            end

        |   toAssembler(MoveGeneralToFP{ regN, regD, floatSize=Float32} :: rest, code) =
                toAssembler(rest, code <::> moveGeneralToFloat{regN=regN, regD=regD})
        |   toAssembler(MoveGeneralToFP{ regN, regD, floatSize=Double64} :: rest, code) =
                toAssembler(rest, code <::> moveGeneralToDouble{regN=regN, regD=regD})
        |   toAssembler(MoveFPToGeneral{ regN, regD, floatSize=Float32} :: rest, code) =
                toAssembler(rest, code <::> moveFloatToGeneral{regN=regN, regD=regD})
        |   toAssembler(MoveFPToGeneral{ regN, regD, floatSize=Double64} :: rest, code) =
                toAssembler(rest, code <::> moveDoubleToGeneral{regN=regN, regD=regD})

        |   toAssembler(CvtIntToFP{ regN, regD, floatSize, opSize} :: rest, code) =
            let
                val instr =
                    case (opSize, floatSize) of
                        (OpSize32, Float32) => convertInt32ToFloat
                    |   (OpSize64, Float32) => convertIntToFloat
                    |   (OpSize32, Double64) => convertInt32ToDouble
                    |   (OpSize64, Double64) => convertIntToDouble
            in
                toAssembler(rest, code <::> instr{regN=regN, regD=regD})
            end

        |   toAssembler(CvtFloatToInt{ round, regN, regD, floatSize, opSize} :: rest, code) =
            let
                val instr =
                    case (floatSize, opSize) of
                        (Float32, OpSize32) => convertFloatToInt32
                    |   (Float32, OpSize64) => convertFloatToInt
                    |   (Double64, OpSize32) => convertDoubleToInt32
                    |   (Double64, OpSize64) => convertDoubleToInt
            in
                toAssembler(rest, code <::> instr round {regN=regN, regD=regD})
            end

        |   toAssembler(FPBinaryOp{ regM, regN, regD, floatSize, fpOp} :: rest, code) =
            let
                val instr =
                    case (fpOp, floatSize) of
                        (MultiplyFP, Float32) => multiplyFloat
                    |   (DivideFP, Float32) => divideFloat
                    |   (AddFP, Float32) => addFloat
                    |   (SubtractFP, Float32) => subtractFloat
                    |   (MultiplyFP, Double64) => multiplyDouble
                    |   (DivideFP, Double64) => divideDouble
                    |   (AddFP, Double64) => addDouble
                    |   (SubtractFP, Double64) => subtractDouble
            in
                toAssembler(rest, code <::> instr {regN=regN, regM=regM, regD=regD})
            end

        |   toAssembler(FPComparison{ regM, regN, floatSize} :: rest, code) =
                toAssembler(rest,
                    code <::> (case floatSize of Float32 => compareFloat | Double64 => compareDouble){regN=regN, regM=regM})

        |   toAssembler(FPUnaryOp{ regN, regD, fpOp} :: rest, code) =
            let
                val instr =
                    case fpOp of
                        NegFloat => negFloat
                    |   NegDouble => negDouble
                    |   AbsFloat => absFloat
                    |   AbsDouble => absDouble
                    |   ConvFloatToDble => convertFloatToDouble
                    |   ConvDbleToFloat => convertDoubleToFloat
            in
                toAssembler(rest, code <::> instr {regN=regN, regD=regD})
            end
        |   toAssembler(SetLabel(Label lab) :: rest, code) =
                toAssembler(rest, code <::> setLabel(getLabelTarget lab))
        |   toAssembler(ConditionalBranch(cond, Label lab) :: rest, code) =
                toAssembler(rest, code <::> conditionalBranch(cond, getLabelTarget lab))
        |   toAssembler(UnconditionalBranch(Label lab) :: rest, code) =
                toAssembler(rest, code <::> unconditionalBranch(getLabelTarget lab))
        |   toAssembler(BranchAndLink(Label lab) :: rest, code) =
                toAssembler(rest, code <::> branchAndLink(getLabelTarget lab))
        |   toAssembler(BranchReg{regD, brRegType=BRRBranch} :: rest, code) =
                toAssembler(rest, code <::> branchRegister regD)
        |   toAssembler(BranchReg{regD, brRegType=BRRAndLink} :: rest, code) =
                toAssembler(rest, code <::> branchAndLinkReg regD)
        |   toAssembler(BranchReg{regD, brRegType=BRRReturn} :: rest, code) =
                toAssembler(rest, code <::> returnRegister regD)
        |   toAssembler(LoadLabelAddress(reg, Label lab) :: rest, code) =
                toAssembler(rest, code <::> loadLabelAddress(reg, getLabelTarget lab))

        |   toAssembler(TestBitBranch{ test, bit, label=Label lab, onZero } :: rest, code) =
                toAssembler(rest,
                    code <::> (if onZero then testBitBranchZero else testBitBranchNonZero)(test, bit, getLabelTarget lab))

        |   toAssembler(CompareBranch{ test, label=Label lab, onZero, opSize } :: rest, code) =
            let
                val instr =
                    case (onZero, opSize) of
                        (true, OpSize64) => compareBranchZero
                    |   (false, OpSize64) => compareBranchNonZero
                    |   (true, OpSize32) => compareBranchZero32
                    |   (false, OpSize32) => compareBranchNonZero32
            in
                toAssembler(rest, code <::> instr(test, getLabelTarget lab))
            end

            (* Register-register moves - special case for XSP. *)
        |   toAssembler(MoveXRegToXReg{sReg=XSP, dReg} :: rest, code) =
                toAssembler(rest, code <::> addImmediate{regN=XSP, regD=dReg, immed=0w0, shifted=false})
        |   toAssembler(MoveXRegToXReg{sReg, dReg=XSP} :: rest, code) =
                toAssembler(rest, code <::> addImmediate{regN=sReg, regD=XSP, immed=0w0, shifted=false})
        |   toAssembler(MoveXRegToXReg{sReg, dReg} :: rest, code) =
                toAssembler(rest, code <::> orrShiftedReg{regN=XZero, regM=sReg, regD=dReg, shift=ShiftNone})
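        (* Note (illustrative): register number 31 means XZero in the
           shifted-register ORR form but XSP in the ADD-immediate form, so a
           move to or from XSP is encoded as ADD reg, reg, #0 rather than as
           the usual ORR with XZero. *)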
        |   toAssembler(LoadNonAddr(xReg, value) :: rest, code) =
            let
                (* Load a non-address constant.  Tries to use movz/movn/movk if that
                   can be done easily, otherwise uses loadNonAddressConstant to load
                   the value from the non-address constant area. *)
                fun extW (v, h) = Word.andb(Word.fromLarge(LargeWord.>>(Word64.toLarge v, h*0w16)), 0wxffff)
                val hw0 = extW(value, 0w3)
                and hw1 = extW(value, 0w2)
                and hw2 = extW(value, 0w1)
                and hw3 = extW(value, 0w0)
                val nextCode =
                    if value < 0wx100000000
                    then
                    let
                        (* 32-bit constants can be loaded using at most a movz and a movk
                           but various cases can be reduced since all 32-bit operations
                           set the top word to zero. *)
                        val hi = hw2
                        and lo = hw3
                    in
                        (* Look for something shorter than the full movz/movk pair. *)
                        if hi = 0w0
                        then code <::> moveZero32{regD=xReg, immediate=lo, shift=0w0}
                        else if hi = 0wxffff
                        then code <::> moveNot32{regD=xReg, immediate=Word.xorb(0wxffff, lo), shift=0w0}
                        else if lo = 0w0
                        then code <::> moveZero32{regD=xReg, immediate=hi, shift=0w16}
                        else if isEncodableBitPattern(value, WordSize32)
                        then code <::> bitwiseOrImmediate32{bits=value, regN=XZero, regD=xReg}
                        else (* Have to use two instructions *)
                            code <::>
                                moveZero32{regD=xReg, immediate=lo, shift=0w0} <::>
                                moveKeep{regD=xReg, immediate=hi, shift=0w16}
                    end
                    else if isEncodableBitPattern(value, WordSize64)
                    then code <::> bitwiseOrImmediate{bits=value, regN=XZero, regD=xReg}
                    else if hw0 = 0wxffff andalso hw1 = 0wxffff andalso hw2 = 0wxffff
                    then code <::> moveNot{regD=xReg, immediate=Word.xorb(0wxffff, hw3), shift=0w0}
                    else if hw1 = 0w0 andalso hw2 = 0w0
                    then (* This is common for length words with a flags byte. *)
                        code <::>
                            moveZero32{regD=xReg, immediate=hw3, shift=0w0} <::>
                            moveKeep{regD=xReg, immediate=hw0, shift=0w48}
                    else code <::> loadNonAddressConstant(xReg, value)
            in
                toAssembler(rest, nextCode)
            end
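        (* Worked example (illustrative): for value = 0wxFFFFFFFFFFFF1234 the top
           three halfwords are all 0xffff, so a single MOVN suffices:
           moveNot{regD=xReg, immediate=0wxEDCB, shift=0w0} loads
           NOT 0x000000000000EDCB = 0xFFFFFFFFFFFF1234. *)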
        |   toAssembler(LoadFPConst{dest, value, floatSize=Float32, work} :: rest, code) =
                toAssembler(rest, loadFloatConstant(dest, value, work) :: code)
        |   toAssembler(LoadFPConst{dest, value, floatSize=Double64, work} :: rest, code) =
                toAssembler(rest, loadDoubleConstant(dest, value, work) :: code)
        |   toAssembler(LoadAddr(dReg, source) :: rest, code) =
                toAssembler(rest, loadAddressConstant(dReg, source) :: code)

        |   toAssembler(RTSTrap{ rtsEntry, work, save } :: rest, code) =
            let
                (* Because X30 is used in the branchAndLink it has to be pushed
                   across any trap. *)
                val saveX30 = List.exists (fn r => r = X30) save
                val preserve = List.filter (fn r => r <> X30) save
            in
                toAssembler(rest,
                    code <@>
                        (if saveX30
                         then [storeRegPreIndex{regT=X30, regN=X_MLStackPtr, byteOffset= ~8}]
                         else []) <::>
                    loadRegScaled{regT=work, regN=X_MLAssemblyInt, unitOffset=rtsEntry} <::>
                    branchAndLinkReg work <::>
                    registerMask preserve <@>
                        (if saveX30
                         then [loadRegPostIndex{regT=X30, regN=X_MLStackPtr, byteOffset= 8}]
                         else [])
                )
            end

        |   toAssembler(AllocateMemoryFixedSize{ bytes, dest, save, work } :: rest, code) =
            let
                val label = Arm64Assembly.createLabel()
                val saveX30 = List.exists (fn r => r = X30) save
                val preserve = List.filter (fn r => r <> X30) save
                val allocCode =
                    code <@>
                        (* Subtract the number of bytes required from the heap pointer. *)
                        (if bytes >= 0w4096
                         then [subShiftedReg{regM=work, regN=X_MLHeapAllocPtr, regD=dest, shift=ShiftNone},
                               loadNonAddressConstant(work, Word.toLarge bytes)]
                         else [subImmediate{regN=X_MLHeapAllocPtr, regD=dest, immed=bytes, shifted=false}]) <::>
                    (* Compare the result with the heap limit. *)
                    subSShiftedReg{regM=X_MLHeapLimit, regN=dest, regD=XZero, shift=ShiftNone} <::>
                    conditionalBranch(CondCarrySet, label) <@>
                        (if saveX30
                         then [storeRegPreIndex{regT=X30, regN=X_MLStackPtr, byteOffset= ~8}]
                         else []) <::>
                    loadRegScaled{regT=work, regN=X_MLAssemblyInt, unitOffset=heapOverflowCallOffset} <::>
                    branchAndLinkReg work <::>
                    registerMask preserve <@>
                        (if saveX30
                         then [loadRegPostIndex{regT=X30, regN=X_MLStackPtr, byteOffset= 8}]
                         else []) <::>
                    setLabel label <::>
                    (* Update the heap pointer. *)
                    orrShiftedReg{regN=XZero, regM=dest, regD=X_MLHeapAllocPtr, shift=ShiftNone}
            in
                toAssembler(rest, allocCode)
            end

        |   toAssembler(AllocateMemoryVariableSize{ sizeReg, dest, save, work } :: rest, code) =
            let
                val trapLabel = Arm64Assembly.createLabel()
                and noTrapLabel = Arm64Assembly.createLabel()
                val saveX30 = List.exists (fn r => r = X30) save
                val preserve = List.filter (fn r => r <> X30) save
                val allocCode =
                    (
                        (* Subtract the size into the result register.  Subtract a
                           further word for the length word and round down in 32-in-64. *)
                        if is32in64
                        then
                            code <::>
                                subShiftedReg{regM=sizeReg, regN=X_MLHeapAllocPtr, regD=dest, shift=ShiftLSL 0w2} <::>
                                subImmediate{regN=dest, regD=dest, immed=0w4, shifted=false} <::>
                                bitwiseAndImmediate{bits= ~ 0w8, regN=dest, regD=dest}
                        else
                            code <::>
                                subShiftedReg{regM=sizeReg, regN=X_MLHeapAllocPtr, regD=dest, shift=ShiftLSL 0w3} <::>
                                subImmediate{regN=dest, regD=dest, immed=0w8, shifted=false}
                    ) <::>
                    (* Check against the limit.  If the size is large enough it is
                       possible that this could wrap round.  To check for that we trap
                       if either the result is less than the limit or if it is now
                       greater than the allocation pointer. *)
                    subSShiftedReg{regM=X_MLHeapLimit, regN=dest, regD=XZero, shift=ShiftNone} <::>
                    conditionalBranch(CondCarryClear, trapLabel) <::>
                    subSShiftedReg{regM=X_MLHeapAllocPtr, regN=dest, regD=XZero, shift=ShiftNone} <::>
                    conditionalBranch(CondCarryClear, noTrapLabel) <::>
                    setLabel trapLabel <@>
                        (if saveX30
                         then [storeRegPreIndex{regT=X30, regN=X_MLStackPtr, byteOffset= ~8}]
                         else []) <::>
                    loadRegScaled{regT=work, regN=X_MLAssemblyInt, unitOffset=heapOverflowCallOffset} <::>
                    branchAndLinkReg work <::>
                    registerMask preserve <@>
                        (if saveX30
                         then [loadRegPostIndex{regT=X30, regN=X_MLStackPtr, byteOffset= 8}]
                         else []) <::>
                    setLabel noTrapLabel <::>
                    (* Update the heap pointer. *)
                    orrShiftedReg{regN=XZero, regM=dest, regD=X_MLHeapAllocPtr, shift=ShiftNone}
            in
                toAssembler(rest, allocCode)
            end
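        (* Sketch of the fixed-size case in execution order (illustrative,
           bytes < 4096 and X30 not in the save set):
               sub  dest, allocptr, #bytes
               cmp  dest, heaplimit        -- the subS with an XZero destination
               b.cs ok                     -- carry set: allocation fits
               ldr  work, [assemblyint, #heapOverflowCallOffset]
               blr  work
               <register mask>
           ok: mov  allocptr, dest         -- the ORR with XZero
           Remember that the code list is built most-recent-first and reversed
           before generateCode is called. *)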
        |   toAssembler(BranchTable{ startLabel=Label lab, brTable } :: rest, code) =
                toAssembler(rest,
                    List.foldl (fn (Label lab, code) => (unconditionalBranch(getLabelTarget lab)) :: code)
                        (code <::> setLabel(getLabelTarget lab)) brTable)

        |   toAssembler(LoadGlobalHeapBaseInCallback dest :: rest, code) =
                toAssembler(rest, code <@> List.rev(loadGlobalHeapBaseInCallback dest))

        |   toAssembler(Yield :: rest, code) = toAssembler(rest, code <::> yield)

        (* Optimisation passes. *)
        fun isValidForPair(offset1, offset2) =
        let
            val v = Int.min(offset1, offset2)
        in
            v >= ~64 andalso v < 64
        end

        fun forward([], list, rep) = reverse(list, [], rep)

        |   forward(SetLabel(Label srcLab) :: (ubr as UnconditionalBranch(Label destLab)) :: tl, list, _) =
                if srcLab = destLab
                (* We should never get this because there should always be a stack-check
                   to allow a loop to be broken.  If that ever changes we need to retain
                   the label. *)
                then raise InternalError "Infinite loop detected"
                else (* Mark this to forward to its destination. *)
                (
                    Array.update(labelTargets, srcLab, getLabel destLab);
                    forward(ubr :: tl, list, true)
                )

        |   forward(SetLabel(Label jmpLab1) :: (tl as SetLabel(Label jmpLab2) :: _), list, _) =
                (* Eliminate adjacent labels.  They complicate the other tests although
                   they don't incur any run-time cost. *)
                (
                    (* Any reference to the first label is forwarded to the second. *)
                    Array.update(labelTargets, jmpLab1, getLabel jmpLab2);
                    forward(tl, list, true)
                )

        |   forward((ubr as UnconditionalBranch(Label ubrLab)) :: (tl as SetLabel(Label jumpLab) :: _), list, rep) =
                (* Eliminate unconditional jumps to the next instruction. *)
                if ubrLab = jumpLab
                then forward(tl, list, true)
                else forward(tl, ubr :: list, rep)

        |   forward((cbr as ConditionalBranch(test, Label cbrLab)) ::
                    (ubr as UnconditionalBranch(Label ubrLab)) ::
                    (tl as SetLabel(Label jumpLab) :: _), list, rep) =
                if cbrLab = jumpLab
                then
                    (* We have a conditional branch followed by an unconditional branch
                       followed by the destination of the conditional branch.  Eliminate
                       the unconditional branch by reversing the test.  This can often
                       happen if one branch of an if-then-else has been reduced to zero
                       because the same register has been chosen for the input and output. *)
                    forward(tl (* Leave the label just in case it's used elsewhere *),
                        ConditionalBranch(invertTest test, Label ubrLab) :: list, true)
                else forward(ubr :: tl, cbr :: list, rep)
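        (* Example of the inversion rewrite (illustrative):
               b.eq L1 ; b L2 ; L1: ...   becomes   b.ne L2 ; L1: ...
           with the reversed condition supplied by invertTest. *)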
        |   forward((load as LoadRegScaled{regT=regT1, regN=regN1, unitOffset=offset1, loadType=lt1}) ::
                    (tl1 as LoadRegScaled{regT=regT2, regN=regN2, unitOffset=offset2, loadType=lt2} :: tl2), list, rep) =
                (* Two adjacent loads - can this be converted to load-pair?
                   N.B. We have to be careful about the sequence ldr x0,[x0]; ldr x1,[x0+8]
                   which isn't the same at all. *)
                if regN1 = regN2 andalso regN1 <> regT1 andalso lt1 = lt2 andalso
                    (offset2 = offset1 + 1 orelse offset2 = offset1 - 1) andalso
                    (case lt1 of Load64 => true | Load32 => true | _ => false) andalso
                    isValidForPair(offset1, offset2)
                then
                let
                    val (reg1, reg2, offset) =
                        if offset1 < offset2 then (regT1, regT2, offset1) else (regT2, regT1, offset2)
                in
                    forward(tl2,
                        LoadRegPair{ regT1=reg1, regT2=reg2, regN=regN1, unitOffset=offset,
                                     loadType=lt1, unscaledType=NoUpdate} :: list, true)
                end
                else forward(tl1, load :: list, rep)

        |   forward((store as StoreRegScaled{regT=regT1, regN=regN1, unitOffset=offset1, loadType=lt1}) ::
                    (tl1 as StoreRegScaled{regT=regT2, regN=regN2, unitOffset=offset2, loadType=lt2} :: tl2), list, rep) =
                (* Two adjacent stores - can this be converted to store-pair? *)
                if regN1 = regN2 andalso lt1 = lt2 andalso
                    (offset2 = offset1 + 1 orelse offset2 = offset1 - 1) andalso
                    (case lt1 of Load64 => true | Load32 => true | _ => false) andalso
                    isValidForPair(offset1, offset2)
                then
                let
                    val (reg1, reg2, offset) =
                        if offset1 < offset2 then (regT1, regT2, offset1) else (regT2, regT1, offset2)
                in
                    forward(tl2,
                        StoreRegPair{ regT1=reg1, regT2=reg2, regN=regN1, unitOffset=offset,
                                      loadType=lt1, unscaledType=NoUpdate} :: list, true)
                end
                else forward(tl1, store :: list, rep)

        |   forward((store as StoreRegUnscaled{regT=regT1, regN=regN1, byteOffset= ~8, loadType=Load64, unscaledType=NoUpdate}) ::
                    (tl1 as StoreRegScaled{regT=regT2, regN=regN2, unitOffset=0, loadType=Load64} :: tl2), list, rep) =
                (* Common case - store the length word and then the first word of the cell. *)
                if regN1 = regN2
                then forward(tl2,
                        StoreRegPair{ regT1=regT1, regT2=regT2, regN=regN1, unitOffset= ~1,
                                      loadType=Load64, unscaledType=NoUpdate} :: list, true)
                else forward(tl1, store :: list, rep)

        |   forward((store as StoreRegUnscaled{regT=regT1, regN=regN1, byteOffset= ~4, loadType=Load32, unscaledType=NoUpdate}) ::
                    (tl1 as StoreRegScaled{regT=regT2, regN=regN2, unitOffset=0, loadType=Load32} :: tl2), list, rep) =
                (* Common case - store the length word and then the first word of the cell. *)
                if regN1 = regN2
                then forward(tl2,
                        StoreRegPair{ regT1=regT1, regT2=regT2, regN=regN1, unitOffset= ~1,
                                      loadType=Load32, unscaledType=NoUpdate} :: list, true)
                else forward(tl1, store :: list, rep)

        |   forward((store as StoreRegUnscaled{regT=regT1, regN=regN1, byteOffset= ~8, loadType=Load64, unscaledType=PreIndex}) ::
                    (tl1 as StoreRegUnscaled{regT=regT2, regN=regN2, byteOffset= ~8, loadType=Load64, unscaledType=PreIndex} :: tl2), list, rep) =
                (* Adjacent pushes.  T2 ends up at the lower address so the order is
                   T2, T1.  The stack is always 64-bit aligned so this works for both
                   native addressing and 32-in-64. *)
                if regN1 = regN2
                then forward(tl2,
                        StoreRegPair{ regT1=regT2, regT2=regT1, regN=regN1, unitOffset= ~2,
                                      loadType=Load64, unscaledType=PreIndex} :: list, true)
                else forward(tl1, store :: list, rep)

        |   forward((add1 as AddImmediate{regN=regN1, regD=regD1, immed=immed1, shifted=false, opSize=OpSize64, setFlags=false}) ::
                    (tl1 as AddImmediate{regN=regN2, regD=regD2, immed=immed2, shifted=false, opSize=OpSize64, setFlags=false} :: tl2), list, rep) =
                (* Adjacent stack resets.  This can apply more generally but only if the
                   result registers are the same.  If they're not we may need the
                   intermediate result.  We put the result back into the input stream
                   in case it can be combined with another stack reset. *)
                if regN2 = regD2 andalso regD1 = regD2 andalso immed2+immed1 < 0w4096
                then forward(AddImmediate{regN=regN1, regD=regD2, immed=immed2+immed1,
                                          shifted=false, opSize=OpSize64, setFlags=false} :: tl2, list, true)
                else forward(tl1, add1 :: list, rep)

        |   forward(BitwiseLogical{bits=0w1, regN, regD=XZero, logOp=LogAnd, opSize=_, setFlags=true} ::
                    ConditionalBranch(CondEqual, label) :: tl2, list, _) =
                (* Test the tag bit: bit 0.  This is very common to test for nil/not nil.
                   We could include other values but they're far less likely. *)
                forward(TestBitBranch{test=regN, bit=0w0, label=label, onZero=true} :: tl2, list, true)

        |   forward(BitwiseLogical{bits=0w1, regN, regD=XZero, logOp=LogAnd, opSize=_, setFlags=true} ::
                    ConditionalBranch(CondNotEqual, label) :: tl2, list, _) =
                forward(TestBitBranch{test=regN, bit=0w0, label=label, onZero=false} :: tl2, list, true)

        |   forward(hd :: tl, list, rep) = forward(tl, hd :: list, rep)
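        (* Example of the tag-bit rewrite above (illustrative): testing bit 0 with
               ands xzr, xN, #1   followed by   b.eq L
           is fused into the single instruction
               tbz xN, #0, L *)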
        and reverse([], list, rep) = (list, rep)

        |   reverse((add as AddImmediate{regN=regN2, regD=regD2, immed, shifted=false, opSize=OpSize64, setFlags=false}) ::
                    (tl1 as LoadRegScaled{regT=regT1, regN=regN1, unitOffset=0, loadType=Load64} :: tl2), list, rep) =
                (* A stack reset occurring after a load.  This is usually the ML SP but
                   can also occur with C memory ops.  It might be possible to consider
                   other cases. *)
                if regN1 = regD2 andalso regN2 = regD2 andalso regT1 <> regN1 andalso immed < 0w256
                then reverse(tl2,
                        LoadRegUnscaled{regT=regT1, regN=regN1, byteOffset=Word.toInt immed,
                                        loadType=Load64, unscaledType=PostIndex} :: list, true)
                else reverse(tl1, add :: list, rep)

        |   reverse((add as AddImmediate{regN=regN2, regD=regD2, immed, shifted=false, opSize=OpSize64, setFlags=false}) ::
                    (tl1 as LoadRegPair{regT1=regT1, regT2=regT2, regN=regN1, unitOffset=0, loadType=Load64, unscaledType=NoUpdate} :: tl2), list, rep) =
                (* A stack reset occurring after a load pair. *)
                if regN1 = regD2 andalso regN2 = regD2 andalso regT1 <> regN1 andalso
                    regT2 <> regN1 andalso immed < 0w64 * 0w8
                then reverse(tl2,
                        LoadRegPair{regT1=regT1, regT2=regT2, regN=regN1,
                                    unitOffset=Word.toInt(immed div 0w8),
                                    loadType=Load64, unscaledType=PostIndex} :: list, true)
                else reverse(tl1, add :: list, rep)

        |   reverse(hd :: tl, list, rep) = reverse(tl, hd :: list, rep)

        (* Repeat scans through the code until there are no further changes. *)
        fun repeat ops =
            case forward(ops, [], false) of
                (list, false) => list
            |   (list, true) => repeat list

        val optimised = repeat instrs
    in
        generateCode{instrs=List.rev(toAssembler(optimised, [])), name=name, parameters=parameters,
                     resultClosure=resultClosure, profileObject=profileObject}
    end

    (* Constant shifts are encoded in the immr and imms fields of the bit-field
       instruction. *)
    fun shiftConstant{ direction, regD, regN, shift, opSize } =
    let
        val (bitfieldKind, immr, imms) =
            case (direction, opSize) of
                (ShiftLeft, OpSize64) => (BFUnsigned, Word.~ shift mod 0w64, 0w64-0w1-shift)
            |   (ShiftLeft, OpSize32) => (BFUnsigned, Word.~ shift mod 0w32, 0w32-0w1-shift)
            |   (ShiftRightLogical, OpSize64) => (BFUnsigned, shift, 0wx3f)
            |   (ShiftRightLogical, OpSize32) => (BFUnsigned, shift, 0wx1f)
            |   (ShiftRightArithmetic, OpSize64) => (BFSigned, shift, 0wx3f)
            |   (ShiftRightArithmetic, OpSize32) => (BFSigned, shift, 0wx1f)
    in
        BitField{ regN=regN, regD=regD, opSize=opSize, immr=immr, imms=imms, bitfieldKind=bitfieldKind }
    end
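    (* Worked example (illustrative): shiftConstant with direction=ShiftLeft,
       shift=0w3 and opSize=OpSize64 gives immr = ~3 mod 64 = 61 and imms = 60,
       i.e. the UBFM encoding that is the standard alias for LSL Xd, Xn, #3. *)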
    (* These sequences are used both in the ML code-generator and in the FFI code
       so it is convenient to have them here and share the code. *)
    local
        fun allocateWords(fixedReg, workReg, words, bytes, regMask, code) =
        let
            val (lengthWord, setLength, flagShift) =
                if is32in64 then (~4, Load32, 0w24) else (~8, Load64, 0w56)
        in
            code <::>
                AllocateMemoryFixedSize{ bytes=bytes, dest=fixedReg, save=regMask, work=X16 } <::>
                LoadNonAddr(workReg,
                    Word64.orb(words, Word64.<<(Word64.fromLarge(Word8.toLarge Address.F_bytes), flagShift))) <::>
                (* Store the length word.  Have to use the unaligned version because
                   the offset is -ve. *)
                StoreRegUnscaled{regT=workReg, regN=fixedReg, byteOffset=lengthWord,
                                 loadType=setLength, unscaledType=NoUpdate}
        end

        fun absoluteAddressToIndex(reg, code) =
            if is32in64
            then
                code <::>
                    SubShiftedReg{regM=X_Base32in64, regN=reg, regD=reg, shift=ShiftNone,
                                  opSize=OpSize64, setFlags=false} <::>
                    shiftConstant{direction=ShiftRightLogical, regN=reg, regD=reg, shift=0w2, opSize=OpSize64}
            else code
    in
        fun boxDouble({source, destination, workReg, saveRegs}, code) =
            absoluteAddressToIndex(destination,
                allocateWords(destination, workReg, if is32in64 then 0w2 else 0w1, 0w16, saveRegs, code) <::>
                StoreFPRegScaled{regT=source, regN=destination, unitOffset=0, floatSize=Double64})

        and boxSysWord({source, destination, workReg, saveRegs}, code) =
            absoluteAddressToIndex(destination,
                allocateWords(destination, workReg, if is32in64 then 0w2 else 0w1, 0w16, saveRegs, code) <::>
                StoreRegScaled{regT=source, regN=destination, unitOffset=0, loadType=Load64})

        and boxFloat({source, destination, workReg, saveRegs}, code) =
            absoluteAddressToIndex(destination,
                allocateWords(destination, workReg, 0w1, 0w8, saveRegs, code) <::>
                StoreFPRegScaled{regT=source, regN=destination, unitOffset=0, floatSize=Float32})
    end

    structure Sharing =
    struct
        type closureRef = closureRef
        type loadType = loadType
        type opSize = opSize
        type logicalOp = logicalOp
        type floatSize = floatSize
        type shiftDirection = shiftDirection
        type multKind = multKind
        type fpUnary = fpUnary
        type fpBinary = fpBinary
        type unscaledType = unscaledType
        type condSet = condSet
        type bitfieldKind = bitfieldKind
        type brRegType = brRegType
        type precode = precode
        type xReg = xReg
        type vReg = vReg
        type label = label
        type labelMaker = labelMaker
        type condition = condition
        type shiftType = shiftType
        type wordSize = wordSize
        type 'a extend = 'a extend
        type scale = scale
        type instr = instr
        type atomicOp = atomicOp
    end
end;