diff --git a/RootArm64.ML b/RootArm64.ML index 4c94d26c..14c5c7c5 100644 --- a/RootArm64.ML +++ b/RootArm64.ML @@ -1,153 +1,150 @@ (* Copyright (c) 2021 David C. J. Matthews This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) (* Compiler root file for Arm64. This gives the "use" instructions necessary to build the compiler and suitable for use with an IDE project file. It was constructed from the Poly/ML make files. *) PolyML.print_depth 1; PolyML.Compiler.reportUnreferencedIds := true; use "mlsource/MLCompiler/Address.ML"; use "mlsource/MLCompiler/Misc.ML"; use "mlsource/MLCompiler/HashTable.ML"; use "mlsource/MLCompiler/UniversalTable.ML"; use "mlsource/MLCompiler/StronglyConnected.sml"; use "mlsource/MLCompiler/StretchArray.ML"; use "mlsource/MLCompiler/STRUCTVALSIG.sml"; use "mlsource/MLCompiler/PRETTY.sig"; use "mlsource/MLCompiler/LEXSIG.sml"; use "mlsource/MLCompiler/SymbolsSig.sml"; use "mlsource/MLCompiler/COMPILERBODYSIG.sml"; use "mlsource/MLCompiler/DEBUG.sig"; use "mlsource/MLCompiler/MAKESIG.sml"; use "mlsource/MLCompiler/MAKE_.ML"; use "mlsource/MLCompiler/FOREIGNCALL.sig"; use "mlsource/MLCompiler/BUILTINS.sml"; use "mlsource/MLCompiler/CODETREE.sig"; use "mlsource/MLCompiler/STRUCT_VALS.ML"; use "mlsource/MLCompiler/CodeTree/BACKENDINTERMEDIATECODE.sig"; use "mlsource/MLCompiler/CodeTree/BASECODETREE.sig"; use "mlsource/MLCompiler/CodeTree/CODETREEFUNCTIONS.sig"; use "mlsource/MLCompiler/CodeTree/CODEARRAY.sig"; use "mlsource/MLCompiler/CodeTree/CODEGENTREE.sig"; use "mlsource/MLCompiler/CodeTree/GENCODE.sig"; use "mlsource/MLCompiler/CodeTree/CodetreeFunctions.ML"; use "mlsource/MLCompiler/CodeTree/CodetreeStaticLinkAndCases.ML"; use "mlsource/MLCompiler/CodeTree/CodetreeCodegenConstantFunctions.ML"; use "mlsource/MLCompiler/CodeTree/CodetreeLambdaLift.ML"; use "mlsource/MLCompiler/CodeTree/CodetreeRemoveRedundant.ML"; use "mlsource/MLCompiler/CodeTree/CodetreeSimplifier.ML"; use "mlsource/MLCompiler/CodeTree/CodetreeOptimiser.ML"; use "mlsource/MLCompiler/CodeTree/CodeTreeConstruction.ML"; use "mlsource/MLCompiler/Pretty.sml"; use "mlsource/MLCompiler/CodeTree/CodeArray.ML"; use "mlsource/MLCompiler/Debug.ML"; use "mlsource/MLCompiler/CodeTree/BackendIntermediateCode.sml"; use "mlsource/MLCompiler/CodeTree/BaseCodeTree.ML"; use "mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ASSEMBLY.sig"; use "mlsource/MLCompiler/CodeTree/Arm64Code/Arm64Assembly.sml"; -use "mlsource/MLCompiler/CodeTree/Arm64Code/ARM64SEQUENCES.sig"; -use "mlsource/MLCompiler/CodeTree/Arm64Code/Arm64Sequences.sml"; use "mlsource/MLCompiler/CodeTree/Arm64Code/ARM64PREASSEMBLY.sig"; use "mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PreAssembly.ML"; -use "mlsource/MLCompiler/CodeTree/Arm64Code/Arm64GenCode.sml"; use "mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODE.sig"; use "mlsource/MLCompiler/CodeTree/INTSET.sig"; use "mlsource/MLCompiler/CodeTree/IntSet.sml"; use "mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICode.ML"; use 
"mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ForeignCall.sml"; use "mlsource/MLCompiler/CodeTree/Arm64Code/ARM64IDENTIFYREFERENCES.sig"; use "mlsource/MLCompiler/CodeTree/Arm64Code/Arm64IdentifyReferences.ML"; use "mlsource/MLCompiler/CodeTree/Arm64Code/ARM64PUSHREGISTERS.sig"; use "mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PushRegisters.ML"; use "mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODEOPTIMISE.sig"; use "mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICodeOptimise.ML"; use "mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ALLOCATEREGISTERS.sig"; use "mlsource/MLCompiler/CodeTree/Arm64Code/Arm64AllocateRegisters.ML"; use "mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODEGENERATE.sig"; use "mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICodeToArm64Code.ML"; use "mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODETRANSFORM.sig"; use "mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICodeTransform.ML"; use "mlsource/MLCompiler/CodeTree/Arm64Code/Arm64CodetreeToICode.ML"; use "mlsource/MLCompiler/CodeTree/Arm64Code/ml_bind.ML"; use "mlsource/MLCompiler/CodeTree/GCode.arm64.ML"; use "mlsource/MLCompiler/CodeTree/ml_bind.ML"; use "mlsource/MLCompiler/StructVals.ML"; use "mlsource/MLCompiler/LEX_.ML"; use "mlsource/MLCompiler/Symbols.ML"; use "mlsource/MLCompiler/Lex.ML"; use "mlsource/MLCompiler/SymsetSig.sml"; use "mlsource/MLCompiler/DATATYPEREPSIG.sml"; use "mlsource/MLCompiler/VALUEOPSSIG.sml"; use "mlsource/MLCompiler/EXPORTTREESIG.sml"; use "mlsource/MLCompiler/STRUCTURESSIG.sml"; use "mlsource/MLCompiler/COMPILER_BODY.ML"; use "mlsource/MLCompiler/SymSet.ML"; use "mlsource/MLCompiler/TYPETREESIG.sml"; use "mlsource/MLCompiler/COPIERSIG.sml"; use "mlsource/MLCompiler/TYPEIDCODESIG.sml"; use "mlsource/MLCompiler/DATATYPE_REP.ML"; use "mlsource/MLCompiler/PRINTTABLESIG.sml"; use "mlsource/MLCompiler/VALUE_OPS.ML"; use "mlsource/MLCompiler/TYPE_TREE.ML"; use "mlsource/MLCompiler/UTILITIES_.ML"; use "mlsource/MLCompiler/Utilities.ML"; use "mlsource/MLCompiler/PRINT_TABLE.ML"; use "mlsource/MLCompiler/PrintTable.ML"; use "mlsource/MLCompiler/ExportTree.sml"; use "mlsource/MLCompiler/ExportTreeStruct.sml"; use "mlsource/MLCompiler/TypeTree.ML"; use "mlsource/MLCompiler/COPIER.sml"; use "mlsource/MLCompiler/CopierStruct.sml"; use "mlsource/MLCompiler/TYPEIDCODE.sml"; use "mlsource/MLCompiler/TypeIDCodeStruct.sml"; use "mlsource/MLCompiler/DatatypeRep.ML"; use "mlsource/MLCompiler/ValueOps.ML"; use "mlsource/MLCompiler/PARSETREESIG.sml"; use "mlsource/MLCompiler/SIGNATURESSIG.sml"; use "mlsource/MLCompiler/DEBUGGER.sig"; use "mlsource/MLCompiler/STRUCTURES_.ML"; use "mlsource/MLCompiler/DEBUGGER_.sml"; use "mlsource/MLCompiler/Debugger.sml"; use "mlsource/MLCompiler/ParseTree/BaseParseTreeSig.sml"; use "mlsource/MLCompiler/ParseTree/BASE_PARSE_TREE.sml"; use "mlsource/MLCompiler/ParseTree/PrintParsetreeSig.sml"; use "mlsource/MLCompiler/ParseTree/PRINT_PARSETREE.sml"; use "mlsource/MLCompiler/ParseTree/ExportParsetreeSig.sml"; use "mlsource/MLCompiler/ParseTree/EXPORT_PARSETREE.sml"; use "mlsource/MLCompiler/ParseTree/TypeCheckParsetreeSig.sml"; use "mlsource/MLCompiler/ParseTree/TYPECHECK_PARSETREE.sml"; use "mlsource/MLCompiler/ParseTree/MatchCompilerSig.sml"; use "mlsource/MLCompiler/ParseTree/MATCH_COMPILER.sml"; use "mlsource/MLCompiler/ParseTree/CodegenParsetreeSig.sml"; use "mlsource/MLCompiler/ParseTree/CODEGEN_PARSETREE.sml"; use "mlsource/MLCompiler/ParseTree/PARSE_TREE.ML"; use "mlsource/MLCompiler/ParseTree/ml_bind.ML"; use "mlsource/MLCompiler/SIGNATURES.sml"; use 
"mlsource/MLCompiler/SignaturesStruct.sml"; use "mlsource/MLCompiler/Structures.ML"; use "mlsource/MLCompiler/PARSE_DEC.ML"; use "mlsource/MLCompiler/SKIPS_.ML"; use "mlsource/MLCompiler/Skips.ML"; use "mlsource/MLCompiler/PARSE_TYPE.ML"; use "mlsource/MLCompiler/ParseType.ML"; use "mlsource/MLCompiler/ParseDec.ML"; use "mlsource/MLCompiler/CompilerBody.ML"; use "mlsource/MLCompiler/CompilerVersion.sml"; use "mlsource/MLCompiler/Make.ML"; use "mlsource/MLCompiler/INITIALISE_.ML"; use "mlsource/MLCompiler/Initialise.ML"; use "mlsource/MLCompiler/ml_bind.ML"; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64PREASSEMBLY.sig b/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64PREASSEMBLY.sig index 9b09ce91..998feaee 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64PREASSEMBLY.sig +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64PREASSEMBLY.sig @@ -1,264 +1,266 @@ (* Copyright (c) 2021-2 David C. J. Matthews This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public Licence version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public Licence for more details. You should have received a copy of the GNU Lesser General Public Licence along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) (* The pre-assembly layer goes below the icode and allows peep-hole optimisation. *) signature ARM64PREASSEMBLY = sig type closureRef type machineWord = Address.machineWord (* XZero and XSP are both encoded as 31 but the interpretation depends on the instruction The datatype definition is included here to allow for pattern matching on XSP and XZero. *) datatype xReg = XReg of Word8.word | XZero | XSP and vReg = VReg of Word8.word val X0: xReg and X1: xReg and X2: xReg and X3: xReg and X4: xReg and X5: xReg and X6: xReg and X7: xReg and X8: xReg and X9: xReg and X10: xReg and X11: xReg and X12: xReg and X13: xReg and X14: xReg and X15: xReg and X16: xReg and X17: xReg and X18: xReg and X19: xReg and X20: xReg and X21: xReg and X22: xReg and X23: xReg and X24: xReg and X25: xReg and X26: xReg and X27: xReg and X28: xReg and X29: xReg and X30: xReg val X_MLHeapLimit: xReg (* ML Heap limit pointer *) and X_MLAssemblyInt: xReg (* ML assembly interface pointer. *) and X_MLHeapAllocPtr: xReg (* ML Heap allocation pointer. *) and X_MLStackPtr: xReg (* ML Stack pointer. *) and X_LinkReg: xReg (* Link reg - return address *) and X_Base32in64: xReg (* X24 is used for the heap base in 32-in-64. *) val V0: vReg and V1: vReg and V2: vReg and V3: vReg and V4: vReg and V5: vReg and V6: vReg and V7: vReg (* Condition for conditional branches etc. *) datatype condition = CondEqual (* Z=1 *) | CondNotEqual (* Z=0 *) | CondCarrySet (* C=1 *) | CondCarryClear (* C=0 *) | CondNegative (* N=1 *) | CondPositive (* N=0 imcludes zero *) | CondOverflow (* V=1 *) | CondNoOverflow (* V=0 *) | CondUnsignedHigher (* C=1 && Z=0 *) | CondUnsignedLowOrEq (* ! (C=1 && Z=0) *) | CondSignedGreaterEq (* N=V *) | CondSignedLess (* N<>V *) | CondSignedGreater (* Z==0 && N=V *) | CondSignedLessEq (* !(Z==0 && N=V) *) val invertTest: condition -> condition (* i.e. jump when the condition is not true. 
*) val condToString: condition -> string datatype shiftType = ShiftLSL of Word8.word | ShiftLSR of Word8.word | ShiftASR of Word8.word | ShiftNone datatype wordSize = WordSize32 | WordSize64 datatype 'a extend = ExtUXTB of 'a (* Unsigned extend byte *) | ExtUXTH of 'a (* Unsigned extend byte *) | ExtUXTW of 'a (* Unsigned extend byte *) | ExtUXTX of 'a (* Left shift *) | ExtSXTB of 'a (* Sign extend byte *) | ExtSXTH of 'a (* Sign extend halfword *) | ExtSXTW of 'a (* Sign extend word *) | ExtSXTX of 'a (* Left shift *) (* Load/store instructions have only a single bit for the shift. For byte operations this is one bit shift; for others it scales by the size of the operand if set. *) datatype scale = ScaleOrShift | NoScale datatype loadType = Load64 | Load32 | Load16 | Load8 and opSize = OpSize32 | OpSize64 and logicalOp = LogAnd | LogOr | LogXor and floatSize = Float32 | Double64 and shiftDirection = ShiftLeft | ShiftRightLogical | ShiftRightArithmetic and multKind = MultAdd32 | MultSub32 | MultAdd64 | MultSub64 | SignedMultAddLong (* 32bit*32bit + 64bit => 64Bit *) | SignedMultHigh (* High order part of 64bit*64Bit *) and fpUnary = NegFloat | NegDouble | AbsFloat | AbsDouble | ConvFloatToDble | ConvDbleToFloat and fpBinary = MultiplyFP | DivideFP | AddFP | SubtractFP and unscaledType = NoUpdate | PreIndex | PostIndex and condSet = CondSet | CondSetIncr | CondSetInvert | CondSetNegate and bitfieldKind = BFUnsigned | BFSigned | BFInsert and brRegType = BRRBranch | BRRAndLink | BRRReturn type precodeLabel val createLabel: unit -> precodeLabel datatype precode = (* Basic instructions *) AddImmediate of {regN: xReg, regD: xReg, immed: word, shifted: bool, opSize: opSize, setFlags: bool} | SubImmediate of {regN: xReg, regD: xReg, immed: word, shifted: bool, opSize: opSize, setFlags: bool} | AddShiftedReg of {regM: xReg, regN: xReg, regD: xReg, shift: shiftType, opSize: opSize, setFlags: bool} | SubShiftedReg of {regM: xReg, regN: xReg, regD: xReg, shift: shiftType, opSize: opSize, setFlags: bool} | AddExtendedReg of {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend, opSize: opSize, setFlags: bool} | SubExtendedReg of {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend, opSize: opSize, setFlags: bool} | MultiplyAndAddSub of {regM: xReg, regN: xReg, regA: xReg, regD: xReg, multKind: multKind} | DivideRegs of {regM: xReg, regN: xReg, regD: xReg, isSigned: bool, opSize: opSize} | LogicalShiftedReg of {regM: xReg, regN: xReg, regD: xReg, shift: shiftType, logOp: logicalOp, opSize: opSize, setFlags: bool} | LoadRegScaled of {regT: xReg, regN: xReg, unitOffset: int, loadType: loadType} | LoadFPRegScaled of {regT: vReg, regN: xReg, unitOffset: int, floatSize: floatSize} | StoreRegScaled of {regT: xReg, regN: xReg, unitOffset: int, loadType: loadType} | StoreFPRegScaled of {regT: vReg, regN: xReg, unitOffset: int, floatSize: floatSize} | LoadRegUnscaled of {regT: xReg, regN: xReg, byteOffset: int, loadType: loadType, unscaledType: unscaledType} | StoreRegUnscaled of {regT: xReg, regN: xReg, byteOffset: int, loadType: loadType, unscaledType: unscaledType} | LoadFPRegUnscaled of {regT: vReg, regN: xReg, byteOffset: int, floatSize: floatSize, unscaledType: unscaledType} | StoreFPRegUnscaled of {regT: vReg, regN: xReg, byteOffset: int, floatSize: floatSize, unscaledType: unscaledType} | LoadRegIndexed of {regT: xReg, regN: xReg, regM: xReg, loadType: loadType, option: scale extend} | StoreRegIndexed of {regT: xReg, regN: xReg, regM: xReg, loadType: loadType, option: scale extend} | 
LoadFPRegIndexed of {regT: vReg, regN: xReg, regM: xReg, floatSize: floatSize, option: scale extend} | StoreFPRegIndexed of {regT: vReg, regN: xReg, regM: xReg, floatSize: floatSize, option: scale extend} (* LoadAcquire and StoreRelease are used for mutables. *) | LoadAcquireReg of {regN: xReg, regT: xReg, loadType: loadType} | StoreReleaseReg of {regN: xReg, regT: xReg, loadType: loadType} (* LoadAcquireExclusiveRegister and StoreReleaseExclusiveRegister are used for mutexes. *) | LoadAcquireExclusiveRegister of {regN: xReg, regT: xReg} | StoreReleaseExclusiveRegister of {regS: xReg, regT: xReg, regN: xReg} | MemBarrier | LoadRegPair of { regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int, loadType: loadType, unscaledType: unscaledType} | StoreRegPair of { regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int, loadType: loadType, unscaledType: unscaledType} | LoadFPRegPair of { regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int, floatSize: floatSize, unscaledType: unscaledType} | StoreFPRegPair of { regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int, floatSize: floatSize, unscaledType: unscaledType} | ConditionalSet of {regD: xReg, regTrue: xReg, regFalse: xReg, cond: condition, condSet: condSet, opSize: opSize} | BitField of {immr: word, imms: word, regN: xReg, regD: xReg, opSize: opSize, bitfieldKind: bitfieldKind} | ShiftRegisterVariable of {regM: xReg, regN: xReg, regD: xReg, opSize: opSize, shiftDirection: shiftDirection} | BitwiseLogical of { bits: Word64.word, regN: xReg, regD: xReg, opSize: opSize, setFlags: bool, logOp: logicalOp} (* Floating point *) | MoveGeneralToFP of { regN: xReg, regD: vReg, floatSize: floatSize} | MoveFPToGeneral of {regN: vReg, regD: xReg, floatSize: floatSize} | CvtIntToFP of { regN: xReg, regD: vReg, floatSize: floatSize, opSize: opSize} | CvtFloatToInt of { round: IEEEReal.rounding_mode, regN: vReg, regD: xReg, floatSize: floatSize, opSize: opSize} | FPBinaryOp of { regM: vReg, regN: vReg, regD: vReg, floatSize: floatSize, fpOp: fpBinary} | FPComparison of { regM: vReg, regN: vReg, floatSize: floatSize} | FPUnaryOp of {regN: vReg, regD: vReg, fpOp: fpUnary} (* Branches and Labels. *) | SetLabel of precodeLabel | ConditionalBranch of condition * precodeLabel | UnconditionalBranch of precodeLabel | BranchAndLink of precodeLabel | BranchReg of {regD: xReg, brRegType: brRegType } | LoadLabelAddress of xReg * precodeLabel | TestBitBranch of { test: xReg, bit: Word8.word, label: precodeLabel, onZero: bool } | CompareBranch of { test: xReg, label: precodeLabel, onZero: bool, opSize: opSize } (* Composite instructions *) | MoveXRegToXReg of {sReg: xReg, dReg: xReg} | LoadNonAddr of xReg * Word64.word | LoadAddr of xReg * machineWord | RTSTrap of { rtsEntry: int, work: xReg, save: xReg list } (* Allocate memory - bytes includes the length word and rounding. *) | AllocateMemoryFixedSize of { bytes: word, dest: xReg, save: xReg list, work: xReg } (* Allocate memory - sizeReg is number of ML words needed for cell. *) | AllocateMemoryVariableSize of { sizeReg: xReg, dest: xReg, save: xReg list, work: xReg } (* Branch table for indexed case. startLabel is the address of the first label in the list. The branch table is a sequence of unconditional branches. *) | BranchTable of { startLabel: precodeLabel, brTable: precodeLabel list } + | LoadGlobalHeapBaseInCallback of xReg (* Wrapper for BitField *) val shiftConstant: { direction: shiftDirection, regD: xReg, regN: xReg, shift: word, opSize: opSize } -> precode (* Convenient sequences. N.B. 
These are in reverse order. *) val boxDouble: {source: vReg, destination: xReg, workReg: xReg, saveRegs: xReg list} * precode list -> precode list and boxFloat: {source: vReg, destination: xReg, workReg: xReg, saveRegs: xReg list} * precode list -> precode list and boxSysWord: {source: xReg, destination: xReg, workReg: xReg, saveRegs: xReg list} * precode list -> precode list (* Create the vector of code from the list of instructions and update the closure reference to point to it. *) val generateFinalCode: {instrs: precode list, name: string, parameters: Universal.universal list, resultClosure: closureRef, profileObject: machineWord, labelCount: int} -> unit (* Temporarily for development. *) type instr val toInstr: precode -> instr (* Take a forward order sequence of instructions and generate a forward order output sequence. *) val toInstrs: precode list -> instr list (* Offsets in the assembly code interface pointed at by X26 These are in units of 64-bits NOT bytes. *) val heapOverflowCallOffset: int and stackOverflowCallOffset: int and stackOverflowXCallOffset: int and exceptionHandlerOffset: int and stackLimitOffset: int and threadIdOffset: int and heapLimitPtrOffset: int and heapAllocPtrOffset: int and mlStackPtrOffset: int + and exceptionPacketOffset: int val is32in64: bool and isBigEndian: bool val isEncodableBitPattern: Word64.word * wordSize -> bool structure Sharing: sig type closureRef = closureRef type loadType = loadType type opSize = opSize type logicalOp = logicalOp type floatSize = floatSize type shiftDirection = shiftDirection type multKind = multKind type fpUnary = fpUnary type fpBinary = fpBinary type unscaledType = unscaledType type condSet = condSet type bitfieldKind = bitfieldKind type brRegType = brRegType type precode = precode type xReg = xReg type vReg = vReg type precodeLabel = precodeLabel type condition = condition type shiftType = shiftType type wordSize = wordSize type 'a extend = 'a extend type scale = scale type instr = instr end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64SEQUENCES.sig b/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64SEQUENCES.sig deleted file mode 100644 index 767a0fe9..00000000 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64SEQUENCES.sig +++ /dev/null @@ -1,47 +0,0 @@ -(* - Copyright (c) 2021 David C. J. Matthews - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - Licence version 2.1 as published by the Free Software Foundation. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public Licence for more details. - - You should have received a copy of the GNU Lesser General Public - Licence along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -*) - -signature ARM64SEQUENCES = -sig - type instr and xReg and vReg - - (* Sequences used in both the main code-generator and the FFI code*) - - (* Copy a value to another register. *) - val moveRegToReg: {sReg: xReg, dReg: xReg} -> instr - - (* Load a non-address constant. Tries to use movz/movn/movk if - that can be done easily, othewise uses loadNonAddressConstant to - load the value from the non-address constant area. *) - val loadNonAddress: xReg * Word64.word -> instr list - - (* Boxing operations. Floats are tagged rather than boxed in native 64-bit. 
- The destination is a poly address. *) - val boxDouble: - {source: vReg, destination: xReg, workReg: xReg, saveRegs: xReg list} -> instr list - and boxFloat: - {source: vReg, destination: xReg, workReg: xReg, saveRegs: xReg list} -> instr list - and boxSysWord: - {source: xReg, destination: xReg, workReg: xReg, saveRegs: xReg list} -> instr list - - structure Sharing: - sig - type instr = instr - type xReg = xReg - type vReg = vReg - end -end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ForeignCall.sml b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ForeignCall.sml index 43adacf4..0b791ac9 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ForeignCall.sml +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ForeignCall.sml @@ -1,860 +1,854 @@ (* Copyright (c) 2021-2 David C. J. Matthews This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public Licence version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public Licence for more details. You should have received a copy of the GNU Lesser General Public Licence along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64ForeignCall( structure CodeArray: CODEARRAY and Arm64PreAssembly: ARM64PREASSEMBLY - and Arm64Assembly: ARM64ASSEMBLY and Debug: DEBUG - and Arm64Sequences: ARM64SEQUENCES - sharing CodeArray.Sharing = Arm64Assembly.Sharing = Arm64Sequences.Sharing + sharing CodeArray.Sharing = Arm64PreAssembly.Sharing ): FOREIGNCALL = struct - open CodeArray Arm64Assembly Arm64Sequences + open CodeArray Arm64PreAssembly exception InternalError = Misc.InternalError and Foreign = Foreign.Foreign datatype fastArgs = FastArgFixed | FastArgDouble | FastArgFloat val makeEntryPoint: string -> machineWord = RunCall.rtsCallFull1 "PolyCreateEntryPointObject" (* Turn an index into an absolute address. 
*) fun indexToAbsoluteAddress(iReg, absReg) = if is32in64 - then [addShiftedReg{regM=iReg, regN=X_Base32in64, regD=absReg, shift=ShiftLSL 0w2}] + then [AddShiftedReg{regM=iReg, regN=X_Base32in64, regD=absReg, shift=ShiftLSL 0w2, opSize=OpSize64, setFlags=false}] else if iReg = absReg then [] - else [moveRegToReg{sReg=iReg, dReg=absReg}] + else [MoveXRegToXReg{sReg=iReg, dReg=absReg}] fun unboxDouble(addrReg, workReg, valReg) = if is32in64 then indexToAbsoluteAddress(addrReg, workReg) @ - [loadRegScaledDouble{regT=valReg, regN=workReg, unitOffset=0}] - else [loadRegScaledDouble{regT=valReg, regN=addrReg, unitOffset=0}] + [LoadFPRegScaled{regT=valReg, regN=workReg, unitOffset=0, floatSize=Double64}] + else [LoadFPRegScaled{regT=valReg, regN=addrReg, unitOffset=0, floatSize=Double64}] fun unboxOrUntagSingle(addrReg, workReg, valReg) = if is32in64 - then [loadRegIndexedFloat{regN=X_Base32in64, regM=addrReg, regT=valReg, option=ExtUXTX ScaleOrShift}] + then [LoadFPRegIndexed{regN=X_Base32in64, regM=addrReg, regT=valReg, option=ExtUXTX ScaleOrShift, floatSize=Float32}] else [ - logicalShiftRight{shift=0w32, regN=addrReg, regD=workReg}, - moveGeneralToFloat{regN=workReg, regD=valReg} + shiftConstant{direction=ShiftRightLogical, shift=0w32, regN=addrReg, regD=workReg, opSize=OpSize64}, + MoveGeneralToFP{regN=workReg, regD=valReg, floatSize=Float32} ] fun boxOrTagFloat{floatReg, fixedReg, workReg, saveRegs} = if is32in64 - then boxFloat{source=floatReg, destination=fixedReg, workReg=workReg, saveRegs=saveRegs} + then List.rev(boxFloat({source=floatReg, destination=fixedReg, workReg=workReg, saveRegs=saveRegs}, [])) else [ - moveFloatToGeneral{regN=floatReg, regD=fixedReg}, - logicalShiftLeft{shift=0w32, regN=fixedReg, regD=fixedReg}, - bitwiseOrImmediate{regN=fixedReg, regD=fixedReg, bits=0w1} + MoveFPToGeneral{regN=floatReg, regD=fixedReg, floatSize=Float32}, + shiftConstant{direction=ShiftLeft, shift=0w32, regN=fixedReg, regD=fixedReg, opSize=OpSize64}, + BitwiseLogical{regN=fixedReg, regD=fixedReg, bits=0w1, logOp=LogOr, setFlags=false, opSize=OpSize64} ] (* Call the RTS. Previously this did not check for exceptions raised in the RTS and instead there was code added after each call. Doing it after the call doesn't affect the time taken but makes the code larger especially as this is needed in every arbitrary precision operation. Currently we clear the RTS exception packet field before the call. The field is cleared in "full" calls that may raise an exception but not in fast calls. They may not raise an exception but the packet may not have been cleared from a previous call. *) fun rtsCallFastGeneral (functionName, argFormats, resultFormat, debugSwitches) = let val entryPointAddr = makeEntryPoint functionName (* The maximum we currently have is five so we don't need to worry about stack args. *) fun loadArgs([], _, _, _) = [] | loadArgs(FastArgFixed :: argTypes, srcReg :: srcRegs, fixed :: fixedRegs, fpRegs) = if srcReg = fixed then loadArgs(argTypes, srcRegs, fixedRegs, fpRegs) (* Already in the right reg *) - else moveRegToReg{sReg=srcReg, dReg=fixed} :: + else MoveXRegToXReg{sReg=srcReg, dReg=fixed} :: loadArgs(argTypes, srcRegs, fixedRegs, fpRegs) | loadArgs(FastArgDouble :: argTypes, srcReg :: srcRegs, fixedRegs, fp :: fpRegs) = (* Unbox the value into a fp reg. 
*) unboxDouble(srcReg, srcReg, fp) @ loadArgs(argTypes, srcRegs, fixedRegs, fpRegs) | loadArgs(FastArgFloat :: argTypes, srcReg :: srcRegs, fixedRegs, fp :: fpRegs) = (* Untag and move into the fp reg *) unboxOrUntagSingle(srcReg, srcReg, fp) @ loadArgs(argTypes, srcRegs, fixedRegs, fpRegs) | loadArgs _ = raise InternalError "rtsCall: Too many arguments" val noRTSException = createLabel() val instructions = loadArgs(argFormats, (* ML Arguments *) [X0, X1, X2, X3, X4, X5, X6, X7], (* C fixed pt args *) [X0, X1, X2, X3, X4, X5, X6, X7], (* C floating pt args *) [V0, V1, V2, V3, V4, V5, V6, V7]) @ (* Clear the RTS exception state. *) - loadNonAddress(X16, 0w1) @ + LoadNonAddr(X16, 0w1) :: [ - storeRegScaled{regT=X16, regN=X_MLAssemblyInt, unitOffset=exceptionPacketOffset}, + StoreRegScaled{regT=X16, regN=X_MLAssemblyInt, unitOffset=exceptionPacketOffset, loadType=Load64}, (* Move X30 to X23, a callee-save register. *) (* Note: maybe we should push X24 just in case this is the only reachable reference to the code. *) - orrShiftedReg{regN=XZero, regM=X_LinkReg, regD=X23, shift=ShiftNone}, - loadAddressConstant(X16, entryPointAddr) (* Load entry point *) + LogicalShiftedReg{regN=XZero, regM=X_LinkReg, regD=X23, shift=ShiftNone, logOp=LogOr, opSize=OpSize64, setFlags=false}, + LoadAddr(X16, entryPointAddr) (* Load entry point *) ] @ indexToAbsoluteAddress(X16, X16) @ [ - loadRegScaled{regT=X16, regN=X16, unitOffset=0}, (* Load the actual address. *) + LoadRegScaled{regT=X16, regN=X16, unitOffset=0, loadType=Load64}, (* Load the actual address. *) (* Store the current heap allocation pointer. *) - storeRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset}, + StoreRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset, loadType=Load64}, (* For the moment save and restore the ML stack pointer. No RTS call should change it and it's callee-save but just in case... *) - storeRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset}, - branchAndLinkReg X16, (* Call the function. *) + StoreRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset, loadType=Load64}, + BranchReg{regD=X16, brRegType=BRRAndLink}, (* Call the function. *) (* Restore the ML stack pointer. *) - loadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset}, + LoadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset, loadType=Load64}, (* Load the heap allocation ptr and limit. We could have GCed in the RTS call. *) - loadRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset}, - loadRegScaled{regT=X_MLHeapLimit, regN=X_MLAssemblyInt, unitOffset=heapLimitPtrOffset}, + LoadRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset, loadType=Load64}, + LoadRegScaled{regT=X_MLHeapLimit, regN=X_MLAssemblyInt, unitOffset=heapLimitPtrOffset, loadType=Load64}, (* Check the RTS exception. *) - loadRegScaled{regT=X16, regN=X_MLAssemblyInt, unitOffset=exceptionPacketOffset}, - subSImmediate{regN=X16, regD=XZero, immed=0w1, shifted=false}, - conditionalBranch(CondEqual, noRTSException), + LoadRegScaled{regT=X16, regN=X_MLAssemblyInt, unitOffset=exceptionPacketOffset, loadType=Load64}, + SubImmediate{regN=X16, regD=XZero, immed=0w1, shifted=false, setFlags=true, opSize=OpSize64}, + ConditionalBranch(CondEqual, noRTSException), (* If it isn't then raise the exception. 
*) - moveRegToReg{sReg=X16, dReg=X0}, - loadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset}, - loadRegScaled{regT=X16, regN=X_MLStackPtr, unitOffset=0}, - branchRegister X16, - setLabel noRTSException + MoveXRegToXReg{sReg=X16, dReg=X0}, + LoadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset, loadType=Load64}, + LoadRegScaled{regT=X16, regN=X_MLStackPtr, unitOffset=0, loadType=Load64}, + BranchReg{regD=X16, brRegType=BRRBranch}, + SetLabel noRTSException ] @ ( case resultFormat of FastArgFixed => [] - | FastArgDouble => (* This must be boxed. *) boxDouble{source=V0, destination=X0, workReg=X1, saveRegs=[]} + | FastArgDouble => (* This must be boxed. *) List.rev(boxDouble({source=V0, destination=X0, workReg=X1, saveRegs=[]}, [])) | FastArgFloat => (* This must be tagged or boxed *) boxOrTagFloat{floatReg=V0, fixedReg=X0, workReg=X1, saveRegs=[]} ) @ [ - returnRegister X23 + BranchReg{regD=X23, brRegType=BRRReturn} ] val closure = makeConstantClosure() - val () = generateCode{instrs=instructions, name=functionName, parameters=debugSwitches, resultClosure=closure, - profileObject=createProfileObject()} + val () = generateFinalCode{instrs=instructions, name=functionName, parameters=debugSwitches, resultClosure=closure, + profileObject=createProfileObject(), labelCount=1} in closureAsAddress closure end fun rtsCallFast (functionName, nArgs, debugSwitches) = rtsCallFastGeneral (functionName, List.tabulate(nArgs, fn _ => FastArgFixed), FastArgFixed, debugSwitches) (* RTS call with one double-precision floating point argument and a floating point result. *) fun rtsCallFastRealtoReal (functionName, debugSwitches) = rtsCallFastGeneral (functionName, [FastArgDouble], FastArgDouble, debugSwitches) (* RTS call with two double-precision floating point arguments and a floating point result. *) fun rtsCallFastRealRealtoReal (functionName, debugSwitches) = rtsCallFastGeneral (functionName, [FastArgDouble, FastArgDouble], FastArgDouble, debugSwitches) (* RTS call with one double-precision floating point argument, one fixed point argument and a floating point result. *) fun rtsCallFastRealGeneraltoReal (functionName, debugSwitches) = rtsCallFastGeneral (functionName, [FastArgDouble, FastArgFixed], FastArgDouble, debugSwitches) (* RTS call with one general (i.e. ML word) argument and a floating point result. This is used only to convert arbitrary precision values to floats. *) fun rtsCallFastGeneraltoReal (functionName, debugSwitches) = rtsCallFastGeneral (functionName, [FastArgFixed], FastArgDouble, debugSwitches) (* Operations on Real32.real values. *) fun rtsCallFastFloattoFloat (functionName, debugSwitches) = rtsCallFastGeneral (functionName, [FastArgFloat], FastArgFloat, debugSwitches) fun rtsCallFastFloatFloattoFloat (functionName, debugSwitches) = rtsCallFastGeneral (functionName, [FastArgFloat, FastArgFloat], FastArgFloat, debugSwitches) (* RTS call with one double-precision floating point argument, one fixed point argument and a floating point result. *) fun rtsCallFastFloatGeneraltoFloat (functionName, debugSwitches) = rtsCallFastGeneral (functionName, [FastArgFloat, FastArgFixed], FastArgFloat, debugSwitches) (* RTS call with one general (i.e. ML word) argument and a floating point result. This is used only to convert arbitrary precision values to floats. 
*) fun rtsCallFastGeneraltoFloat (functionName, debugSwitches) = rtsCallFastGeneral (functionName, [FastArgFixed], FastArgFloat, debugSwitches) (* There is only one ABI value. *) datatype abi = ARM64Abi fun abiList () = [("default", ARM64Abi)] fun alignUp(s, align) = Word.andb(s + align-0w1, ~ align) val getThreadDataCall = makeEntryPoint "PolyArm64GetThreadData" (* This must match the type in Foreign.LowLevel. Once this is bootstrapped we could use that type but note that this is the type we use within the compiler and we build Foreign.LowLevel AFTER compiling this. *) datatype cTypeForm = CTypeFloatingPt | CTypePointer | CTypeSignedInt | CTypeUnsignedInt | CTypeStruct of cType list | CTypeVoid withtype cType = { typeForm: cTypeForm, align: word, size: word } (* Load a byte, halfword, word or long *) fun loadAlignedValue(reg, base, offset, size) = let val _ = offset mod size = 0w0 orelse raise InternalError "loadAlignedValue: not aligned" val loadOp = case size of - 0w8 => loadRegScaled - | 0w4 => loadRegScaled32 - | 0w2 => loadRegScaled16 - | 0w1 => loadRegScaledByte + 0w8 => Load64 + | 0w4 => Load32 + | 0w2 => Load16 + | 0w1 => Load8 | _ => raise InternalError "loadAlignedValue: invalid length" in - loadOp{regT=reg, regN=base, unitOffset=Word.toInt(offset div size)} + LoadRegScaled{regT=reg, regN=base, unitOffset=Word.toInt(offset div size), loadType=loadOp} end (* Store a register into upto 8 bytes. Most values will involve a single store but odd-sized structs can require shifts and multiple stores. N.B. May modify the source register. *) and storeUpTo8(reg, base, offset, size) = let val storeOp = - if size = 0w8 then storeRegUnscaled else if size >= 0w4 then storeRegUnscaled32 - else if size >= 0w2 then storeRegUnscaled16 else storeRegUnscaledByte + if size = 0w8 then Load64 else if size >= 0w4 then Load32 + else if size >= 0w2 then Load16 else Load8 in - [storeOp{regT=reg, regN=base, byteOffset=offset}] + [StoreRegUnscaled{regT=reg, regN=base, byteOffset=offset, loadType=storeOp, unscaledType=NoUpdate}] end @ ( if size = 0w6 orelse size = 0w7 then [ - logicalShiftRight{regN=reg, regD=reg, shift=0w32 }, - storeRegUnscaled16{regT=reg, regN=base, byteOffset=offset+4} + shiftConstant{direction=ShiftRightLogical, regN=reg, regD=reg, shift=0w32, opSize=OpSize64 }, + StoreRegUnscaled{regT=reg, regN=base, byteOffset=offset+4, loadType=Load16, unscaledType=NoUpdate} ] else [] ) @ ( if size = 0w3 orelse size = 0w5 orelse size = 0w7 then [ - logicalShiftRight{regN=reg, regD=reg, shift=(size-0w1)*0w8 }, - storeRegUnscaledByte{regT=reg, regN=base, byteOffset=offset+Word.toInt(size-0w1)} + shiftConstant{direction=ShiftRightLogical, regN=reg, regD=reg, shift=(size-0w1)*0w8, opSize=OpSize64 }, + StoreRegUnscaled{regT=reg, regN=base, byteOffset=offset+Word.toInt(size-0w1), loadType=Load8, unscaledType=NoUpdate} ] else [] ) (* Extract the elements of structures. *) fun unwrap(CTypeStruct ctypes, _) = List.foldr(fn({typeForm, size, ...}, l) => unwrap(typeForm, size) @ l) [] ctypes | unwrap (ctype, size) = [(ctype, size)] (* Structures of up to four floating point values of the same precision are treated specially. 
*) datatype argClass = ArgClassHFA of Word8.word * bool (* 1 - 4 floating pt values *) | ArgLargeStruct (* > 16 bytes and not an HFA *) | ArgSmall (* Scalars or small structures *) fun classifyArg(ctype, size) = case unwrap (ctype, size) of [(CTypeFloatingPt, 0w4)] => ArgClassHFA(0w1, false) | [(CTypeFloatingPt, 0w4), (CTypeFloatingPt, 0w4)] => ArgClassHFA(0w2, false) | [(CTypeFloatingPt, 0w4), (CTypeFloatingPt, 0w4), (CTypeFloatingPt, 0w4)] => ArgClassHFA(0w3, false) | [(CTypeFloatingPt, 0w4), (CTypeFloatingPt, 0w4), (CTypeFloatingPt, 0w4), (CTypeFloatingPt, 0w4)] => ArgClassHFA(0w4, false) | [(CTypeFloatingPt, 0w8)] => ArgClassHFA(0w1, true) | [(CTypeFloatingPt, 0w8), (CTypeFloatingPt, 0w8)] => ArgClassHFA(0w2, true) | [(CTypeFloatingPt, 0w8), (CTypeFloatingPt, 0w8), (CTypeFloatingPt, 0w8)] => ArgClassHFA(0w3, true) | [(CTypeFloatingPt, 0w8), (CTypeFloatingPt, 0w8), (CTypeFloatingPt, 0w8), (CTypeFloatingPt, 0w8)] => ArgClassHFA(0w4, true) | _ => if size > 0w16 then ArgLargeStruct else ArgSmall (* Can we load this in a single instruction? *) fun alignedLoadStore(_, 0w1) = true | alignedLoadStore(addr, 0w2) = addr mod 0w2 = 0w0 | alignedLoadStore(addr, 0w4) = addr mod 0w4 = 0w0 | alignedLoadStore(addr, 0w8) = addr mod 0w8 = 0w0 | alignedLoadStore(addr, 0w16) = addr mod 0w8 = 0w0 (* Can use load-pair. *) | alignedLoadStore _ = false (* This builds a piece of code that takes three arguments and returns a unit result. All three arguments are SysWord.word values i.e. ML addresses containing the address of the actual C value. The first argument (X0) is the address of the function to call. The second argument (X1) points to a struct that contains the argument(s) for the function. The arguments have to be unpacked from the struct into the appropriate registers or to the C stack. The third argument (X2) points to a piece of memory to receive the result of the call. It may be empty if the function returns void. It may only be as big as required for the result type. *) fun foreignCall(_: abi, args: cType list, result: cType): Address.machineWord = let val resultAreaPtr = X19 (* Unboxed value from X2 - This is callee save. *) val argPtrReg = X9 (* A scratch register that isn't used for arguments. *) val entryPtReg = X16 (* Contains the address of the function to call. *) val argWorkReg = X10 (* Used in loading arguments if necessary. *) and argWorkReg2 = X11 and structSpacePtr = X12 and argWorkReg3 = X13 and argWorkReg4 = X14 fun loadArgs([], stackOffset, _, _, _, code, largeStructSpace) = (code, stackOffset, largeStructSpace) | loadArgs(arg::args, stackOffset, argOffset, gRegNo, fpRegNo, code, largeStructSpace) = let val {size, align, typeForm, ...} = arg val newArgOffset = alignUp(argOffset, align) in case classifyArg(typeForm, size) of ArgClassHFA(numItems, isDouble) => if fpRegNo + numItems <= 0w8 then let val scale = if isDouble then 0w8 else 0w4 (* Load the values to the floating point registers. 
*) fun loadFPRegs(0w0, _, _) = [] | loadFPRegs(0w1, fpRegNo, offset) = - [(if isDouble then loadRegScaledDouble else loadRegScaledFloat) - {regT=VReg fpRegNo, regN=argPtrReg, unitOffset=offset}] + [LoadFPRegScaled{regT=VReg fpRegNo, regN=argPtrReg, unitOffset=offset, floatSize=if isDouble then Double64 else Float32}] | loadFPRegs(n, fpRegNo, offset) = - (if isDouble then loadPairOffsetDouble else loadPairOffsetFloat) - {regT1=VReg fpRegNo, regT2=VReg(fpRegNo+0w1), regN=argPtrReg, unitOffset=offset} :: - loadFPRegs(n-0w2, fpRegNo+0w2, offset+2) + (LoadFPRegPair{regT1=VReg fpRegNo, regT2=VReg(fpRegNo+0w1), regN=argPtrReg, unitOffset=offset, + floatSize=if isDouble then Double64 else Float32, unscaledType=NoUpdate} :: + loadFPRegs(n-0w2, fpRegNo+0w2, offset+2)) in loadArgs(args, stackOffset, newArgOffset+size, gRegNo, fpRegNo+numItems, loadFPRegs(numItems, fpRegNo, Word.toInt(newArgOffset div scale)) @ code, largeStructSpace) end else let (* If we have insufficient number of registers we discard any that are left and push the argument to the stack. *) (* The floating point value or structure is copied to the stack as a contiguous area. Use general registers to copy the data. It could be on a 4-byte alignment. In the typical case of a single floating point value this will just be a single load and store. *) fun copyData(0w0, _, _) = [] | copyData(n, srcOffset, stackOffset) = if isDouble - then loadRegScaled{regT=argWorkReg2, regN=argPtrReg, unitOffset=srcOffset} :: - storeRegScaled{regT=argWorkReg2, regN=XSP, unitOffset=stackOffset} :: + then LoadRegScaled{loadType=Load64, regT=argWorkReg2, regN=argPtrReg, unitOffset=srcOffset} :: + StoreRegScaled{loadType=Load64, regT=argWorkReg2, regN=XSP, unitOffset=stackOffset} :: copyData(n-0w1, srcOffset+1, stackOffset+1) - else loadRegScaled32{regT=argWorkReg2, regN=argPtrReg, unitOffset=srcOffset} :: - storeRegScaled32{regT=argWorkReg2, regN=XSP, unitOffset=stackOffset} :: + else LoadRegScaled{loadType=Load32, regT=argWorkReg2, regN=argPtrReg, unitOffset=srcOffset} :: + StoreRegScaled{loadType=Load32, regT=argWorkReg2, regN=XSP, unitOffset=stackOffset} :: copyData(n-0w1, srcOffset+1, stackOffset+1) val copyToStack = if isDouble then copyData(numItems, Word.toInt(newArgOffset div 0w8), stackOffset) else copyData(numItems, Word.toInt(newArgOffset div 0w4), stackOffset*2) (* The overall size is rounded up to a multiple of 8 *) val newStackOffset = stackOffset + Word.toInt(alignUp(size, 0w8) div 0w8) in loadArgs(args, newStackOffset, newArgOffset+size, gRegNo, 0w8, copyToStack @ code, largeStructSpace) end | _ => let (* Load an aligned argument into one or two registers or copy it to the stack. 
*) fun loadArgumentValues(argSize, sourceOffset, sourceBase, newStructSpace, preCode) = if gRegNo <= 0w6 orelse (size <= 0w8 andalso gRegNo <= 0w7) then (* There are sufficient registers *) let val (loadInstr, nextGReg) = if argSize = 0w16 - then ([loadPairOffset{regT1=XReg gRegNo, regT2=XReg(gRegNo+0w1), + then ([LoadRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=XReg gRegNo, regT2=XReg(gRegNo+0w1), regN=sourceBase, unitOffset=Word.toInt(sourceOffset div 0w8)}], gRegNo+0w2) else ([loadAlignedValue(XReg gRegNo, sourceBase, sourceOffset, size)], gRegNo+0w1) in loadArgs(args, stackOffset, newArgOffset+size, nextGReg, fpRegNo, preCode @ loadInstr @ code, newStructSpace) end else if argSize = 0w16 then loadArgs(args, stackOffset+2, newArgOffset+size, 0w8, fpRegNo, preCode @ - loadPairOffset{regT1=argWorkReg2, regT2=argWorkReg3, regN=sourceBase, unitOffset=Word.toInt(sourceOffset div 0w8)} :: - storePairOffset{regT1=argWorkReg2, regT2=argWorkReg3, regN=XSP, unitOffset=stackOffset} :: code, newStructSpace) + LoadRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=argWorkReg2, regT2=argWorkReg3, regN=sourceBase, unitOffset=Word.toInt(sourceOffset div 0w8)} :: + StoreRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=argWorkReg2, regT2=argWorkReg3, regN=XSP, unitOffset=stackOffset} :: code, newStructSpace) else loadArgs(args, stackOffset+1, newArgOffset+size, 0w8, fpRegNo, preCode @ loadAlignedValue(argWorkReg2, sourceBase, sourceOffset, argSize) :: - storeRegScaled{regT=argWorkReg2, regN=XSP, unitOffset=stackOffset} :: code, newStructSpace) + StoreRegScaled{loadType=Load64, regT=argWorkReg2, regN=XSP, unitOffset=stackOffset} :: code, newStructSpace) in if alignedLoadStore(newArgOffset, size) then loadArgumentValues(size, newArgOffset, argPtrReg, largeStructSpace, []) else (* General case. Either a large structure or a small structure that can't easily be loaded, First copy it to the stack, and either pass the address or load it once it's aligned. *) let val newStructSpace = alignUp(largeStructSpace + size, 0w16) val loopLabel = createLabel() (* The address of the area we're copying to is in argRegNo. *) val argRegNo = if gRegNo < 0w8 then XReg gRegNo else argWorkReg (* Copy from the end back to the start. *) val copyToStructSpace = [ - addImmediate{regN=structSpacePtr, regD=argRegNo, immed=largeStructSpace, shifted=false}, - addImmediate{regN=argRegNo, regD=argWorkReg2, immed=size, shifted=false}, (* End of dest area *) - addImmediate{regN=argPtrReg, regD=argWorkReg3, immed=newArgOffset+size, shifted=false}, (* end of source *) - setLabel loopLabel, - loadRegPreIndexByte{regT=argWorkReg4, regN=argWorkReg3, byteOffset= ~1}, - storeRegPreIndexByte{regT=argWorkReg4, regN=argWorkReg2, byteOffset= ~1}, - subSShiftedReg{regM=argWorkReg2, regN=argRegNo, regD=XZero, shift=ShiftNone}, (* At start? 
*) - conditionalBranch(CondNotEqual, loopLabel) + AddImmediate{opSize=OpSize64, setFlags=false, regN=structSpacePtr, regD=argRegNo, immed=largeStructSpace, shifted=false}, + AddImmediate{opSize=OpSize64, setFlags=false, regN=argRegNo, regD=argWorkReg2, immed=size, shifted=false}, (* End of dest area *) + AddImmediate{opSize=OpSize64, setFlags=false, regN=argPtrReg, regD=argWorkReg3, immed=newArgOffset+size, shifted=false}, (* end of source *) + SetLabel loopLabel, + LoadRegUnscaled{loadType=Load8, unscaledType=PreIndex, regT=argWorkReg4, regN=argWorkReg3, byteOffset= ~1}, + StoreRegUnscaled{loadType=Load8, unscaledType=PreIndex, regT=argWorkReg4, regN=argWorkReg2, byteOffset= ~1}, + SubShiftedReg{opSize=OpSize64, setFlags=true, regM=argWorkReg2, regN=argRegNo, regD=XZero, shift=ShiftNone}, (* At start? *) + ConditionalBranch(CondNotEqual, loopLabel) ] in if size > 0w16 then (* Large struct - pass by reference *) ( if gRegNo < 0w8 then loadArgs(args, stackOffset, newArgOffset+size, gRegNo+0w1, fpRegNo, copyToStructSpace @ code, newStructSpace) else loadArgs(args, stackOffset+1, newArgOffset+size, 0w8, fpRegNo, - copyToStructSpace @ storeRegScaled{regT=argWorkReg, regN=XSP, unitOffset=stackOffset} :: code, + copyToStructSpace @ StoreRegScaled{loadType=Load64, regT=argWorkReg, regN=XSP, unitOffset=stackOffset} :: code, newStructSpace) ) else (* Small struct. Since it's now in an area at least 16 bytes and properly aligned we can load it. *) (* argRegNo points to where we copied it *) loadArgumentValues(if size > 0w8 then 0w16 else 0w8, 0w0, argRegNo, newStructSpace, copyToStructSpace) end end end local val {size, typeForm, ...} = result (* Store a result register into the result area. In almost all cases this is very simple: the only complication is with structs of odd sizes. *) fun storeResult(reg, offset, size) = storeUpTo8(reg, resultAreaPtr, offset, size) in val (getResult, passArgAddress) = if typeForm = CTypeVoid then ([], false) else case classifyArg(typeForm, size) of (* Floating point values are returned in s0-sn, d0-dn. *) ArgClassHFA(numItems, isDouble) => let fun storeFPRegs(0w0, _, _) = [] | storeFPRegs(0w1, fpRegNo, offset) = - [(if isDouble then storeRegScaledDouble else storeRegScaledFloat) - {regT=VReg fpRegNo, regN=resultAreaPtr, unitOffset=offset}] + [StoreFPRegScaled{regT=VReg fpRegNo, regN=resultAreaPtr, unitOffset=offset, floatSize=if isDouble then Double64 else Float32}] | storeFPRegs(n, fpRegNo, offset) = - (if isDouble then storePairOffsetDouble else storePairOffsetFloat) - {regT1=VReg fpRegNo, regT2=VReg(fpRegNo+0w1), regN=resultAreaPtr, unitOffset=offset} :: - storeFPRegs(n-0w2, fpRegNo+0w2, offset+2) + StoreFPRegPair{regT1=VReg fpRegNo, regT2=VReg(fpRegNo+0w1), regN=resultAreaPtr, unitOffset=offset, + floatSize=if isDouble then Double64 else Float32, unscaledType=NoUpdate} :: + storeFPRegs(n-0w2, fpRegNo+0w2, offset+2) in (storeFPRegs(numItems, 0w0 (* V0-Vn*), 0), false) end | ArgLargeStruct => ([], true) (* Structures larger than 16 bytes are passed by reference. 
*) | _ => if size = 0w16 - then ([storePairOffset{regT1=X0, regT2=X1, regN=resultAreaPtr, unitOffset=0}], false) + then ([StoreRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=X0, regT2=X1, regN=resultAreaPtr, unitOffset=0}], false) else if size > 0w8 - then (storeRegScaled{regT=X0, regN=resultAreaPtr, unitOffset=0} :: storeResult(X1, 8, size-0w8), false) + then (StoreRegScaled{loadType=Load64, regT=X0, regN=resultAreaPtr, unitOffset=0} :: storeResult(X1, 8, size-0w8), false) else (storeResult(X0, 0, size), false) end val (argCode, argStack, largeStructSpace) = loadArgs(args, 0, 0w0, 0w0, 0w0, if passArgAddress (* If we have to pass the address of the result struct it goes in X8. *) - then [moveRegToReg{sReg=resultAreaPtr, dReg=X8}] + then [MoveXRegToXReg{sReg=resultAreaPtr, dReg=X8}] else [], 0w0) val stackSpaceRequired = alignUp(Word.fromInt argStack * 0w8, 0w16) + largeStructSpace val instructions = [(* Push the return address to the stack. We could put it in a callee-save register but there's a very small chance that this could be the last reference to a piece of code. *) - storeRegPreIndex{regT=X30, regN=X_MLStackPtr, byteOffset= ~8}, + StoreRegUnscaled{loadType=Load64, unscaledType=PreIndex, regT=X30, regN=X_MLStackPtr, byteOffset= ~8}, (* Save heap ptr. Needed in case we have a callback. *) - storeRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset} + StoreRegScaled{loadType=Load64, regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset} ] @ indexToAbsoluteAddress(X0, X0) @ (* Load the entry point address. *) - loadRegScaled{regT=entryPtReg, regN=X0, unitOffset=0} :: + LoadRegScaled{loadType=Load64, regT=entryPtReg, regN=X0, unitOffset=0} :: ( (* Unbox the address of the result area into a callee save resgister. This is where the result will be stored on return if it is anything other than a struct. We have to put the C address in there now because an ML address wouldn't be updated by a possible GC in a callback. *) if #typeForm(result) <> CTypeVoid - then indexToAbsoluteAddress(X2, X2) @ [loadRegScaled{regT=resultAreaPtr, regN=X2, unitOffset=0}] + then indexToAbsoluteAddress(X2, X2) @ [LoadRegScaled{loadType=Load64, regT=resultAreaPtr, regN=X2, unitOffset=0}] else [] ) @ - [storeRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset}] @ (* Save the stack pointer. *) + [StoreRegScaled{loadType=Load64, regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset}] @ (* Save the stack pointer. *) ( if stackSpaceRequired = 0w0 then [] - else [subImmediate{regN=XSP, regD=XSP, immed=stackSpaceRequired, shifted=false}] + else [SubImmediate{opSize=OpSize64, setFlags=false, regN=XSP, regD=XSP, immed=stackSpaceRequired, shifted=false}] ) @ ( (* If we need to copy a struct load a register with a pointer to the area for it. *) if largeStructSpace = 0w0 then [] - else [addImmediate{regN=XSP, regD=structSpacePtr, immed=stackSpaceRequired-largeStructSpace, shifted=false}] + else [AddImmediate{opSize=OpSize64, setFlags=false, regN=XSP, regD=structSpacePtr, immed=stackSpaceRequired-largeStructSpace, shifted=false}] ) @ ( (* The second argument is a SysWord containing the address of a malloced area of memory with the actual arguments in it. 
*) if null args then [] - else indexToAbsoluteAddress(X1, X1) @ [loadRegScaled{regT=argPtrReg, regN=X1, unitOffset=0}] + else indexToAbsoluteAddress(X1, X1) @ [LoadRegScaled{loadType=Load64, regT=argPtrReg, regN=X1, unitOffset=0}] ) @ argCode @ - [branchAndLinkReg X16] @ (* Call the function. *) + [BranchReg{regD=X16, brRegType=BRRAndLink}] @ (* Call the function. *) (* Restore the C stack value in case it's been changed by a callback. *) ( if stackSpaceRequired = 0w0 then [] - else [addImmediate{regN=XSP, regD=XSP, immed=stackSpaceRequired, shifted=false}] + else [AddImmediate{opSize=OpSize64, setFlags=false, regN=XSP, regD=XSP, immed=stackSpaceRequired, shifted=false}] ) @ [ (* Reload the ML stack pointer even though it's callee save. If we've made a callback the ML stack could have grown and so moved to a different address. *) - loadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset}, + LoadRegScaled{loadType=Load64, regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset}, (* Load the heap allocation ptr and limit in case of a callback. *) - loadRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset}, - loadRegScaled{regT=X_MLHeapLimit, regN=X_MLAssemblyInt, unitOffset=heapLimitPtrOffset} + LoadRegScaled{loadType=Load64, regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset}, + LoadRegScaled{loadType=Load64, regT=X_MLHeapLimit, regN=X_MLAssemblyInt, unitOffset=heapLimitPtrOffset} ] @ (* Store the result in the destination. *) getResult @ (* Pop the return address and return. *) - [ loadRegPostIndex{regT=X30, regN=X_MLStackPtr, byteOffset= 8}, returnRegister X30 ] + [ LoadRegUnscaled{regT=X30, regN=X_MLStackPtr, byteOffset= 8, loadType=Load64, unscaledType=PostIndex}, BranchReg{regD=X30,brRegType=BRRReturn} ] val functionName = "foreignCall" val debugSwitches = [(*Universal.tagInject Pretty.compilerOutputTag (Pretty.prettyPrint(print, 70)), Universal.tagInject Debug.assemblyCodeTag true*)] val closure = makeConstantClosure() - val () = generateCode{instrs=instructions, name=functionName, parameters=debugSwitches, resultClosure=closure, - profileObject=createProfileObject()} + val () = generateFinalCode{instrs=instructions, name=functionName, parameters=debugSwitches, resultClosure=closure, + profileObject=createProfileObject(), labelCount=0} in closureAsAddress closure end (* Build a callback function. The arguments are the abi, the list of argument types and the result type. The result is an ML function that takes an ML function, f, as its argument, registers it as a callback and returns the C function as its result. When the C function is called the arguments are copied into temporary memory and the vector passed to f along with the address of the memory for the result. "f" stores the result in it when it returns and the result is then passed back as the result of the callback. N.B. This returns a closure cell which contains the address of the code. It can be used as a SysWord.word value except that while it exists the code will not be GCed. *) fun buildCallBack(_: abi, args: cType list, result: cType): Address.machineWord = let val argWorkReg = X10 (* Used in loading arguments if necessary. *) and argWorkReg2 = X11 and argWorkReg3 = X13 and argWorkReg4 = X14 (* The stack contains a 32-byte result area then an aligned area for the arguments. *) (* Store the argument values to the structure that will be passed to the ML callback function. *) (* Note. 
We've loaded the frame pointer with the original stack ptr-96 so we can access any stack arguments from that. *) fun moveArgs([], _, _, _, _, moveFromStack) = moveFromStack | moveArgs(arg::args, stackSpace, argOffset, gRegNo, fpRegNo, moveFromStack) = let val {size, align, typeForm, ...} = arg val newArgOffset = alignUp(argOffset, align) in case classifyArg(typeForm, size) of ArgClassHFA(numItems, isDouble) => if fpRegNo + numItems <= 0w8 then let val scale = if isDouble then 0w8 else 0w4 (* Store the values from the FP registers. *) fun storeFPRegs(0w0, _, _) = [] | storeFPRegs(0w1, fpRegNo, offset) = - [(if isDouble then storeRegScaledDouble else storeRegScaledFloat) - {regT=VReg fpRegNo, regN=XSP, unitOffset=offset}] + [StoreFPRegScaled{regT=VReg fpRegNo, regN=XSP, unitOffset=offset, floatSize=if isDouble then Double64 else Float32}] | storeFPRegs(n, fpRegNo, offset) = - (if isDouble then storePairOffsetDouble else storePairOffsetFloat) - {regT1=VReg fpRegNo, regT2=VReg(fpRegNo+0w1), regN=XSP, unitOffset=offset} :: + StoreFPRegPair{regT1=VReg fpRegNo, regT2=VReg(fpRegNo+0w1), regN=XSP, unitOffset=offset, + floatSize=if isDouble then Double64 else Float32, unscaledType=NoUpdate} :: storeFPRegs(n-0w2, fpRegNo+0w2, offset+2) in moveArgs(args, stackSpace, newArgOffset+size, gRegNo, fpRegNo+numItems, storeFPRegs(numItems, fpRegNo, Word.toInt(newArgOffset div scale)) @ moveFromStack) end else let (* Load the arguments from the stack and store into the result area. *) fun copyData(0w0, _, _) = [] | copyData(n, dstOffset, stackOffset) = if isDouble - then loadRegScaled{regT=argWorkReg2, regN=X29, unitOffset=stackOffset} :: - storeRegScaled{regT=argWorkReg2, regN=XSP, unitOffset=dstOffset} :: + then LoadRegScaled{loadType=Load64, regT=argWorkReg2, regN=X29, unitOffset=stackOffset} :: + StoreRegScaled{loadType=Load64, regT=argWorkReg2, regN=XSP, unitOffset=dstOffset} :: copyData(n-0w1, dstOffset+1, stackOffset+1) - else loadRegScaled32{regT=argWorkReg2, regN=X29, unitOffset=stackOffset} :: - storeRegScaled32{regT=argWorkReg2, regN=XSP, unitOffset=dstOffset} :: + else LoadRegScaled{loadType=Load32, regT=argWorkReg2, regN=X29, unitOffset=stackOffset} :: + StoreRegScaled{loadType=Load32, regT=argWorkReg2, regN=XSP, unitOffset=dstOffset} :: copyData(n-0w1, dstOffset+1, stackOffset+1) val copyFromStack = if isDouble then copyData(numItems, Word.toInt(newArgOffset div 0w8), stackSpace) else copyData(numItems, Word.toInt(newArgOffset div 0w4), stackSpace*2) (* The overall size is rounded up to a multiple of 8 *) val newStackOffset = stackSpace + Word.toInt(alignUp(size, 0w8) div 0w8) in moveArgs(args, newStackOffset, newArgOffset+size, gRegNo, 0w8, copyFromStack @ moveFromStack) end | _ => if alignedLoadStore(newArgOffset, size) andalso (gRegNo <= 0w6 orelse gRegNo = 0w7 andalso size <= 0w8) then (* Usual case: argument passed in one or two registers. *) ( if size > 0w8 then moveArgs(args, stackSpace, newArgOffset+size, gRegNo + 0w2, fpRegNo, - storePairOffset{regT1=XReg gRegNo, regT2=XReg(gRegNo+0w1), regN=XSP, + StoreRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=XReg gRegNo, regT2=XReg(gRegNo+0w1), regN=XSP, unitOffset=Word.toInt(newArgOffset div 0w8)} :: moveFromStack) else moveArgs(args, stackSpace, newArgOffset+size, gRegNo + 0w1, fpRegNo, storeUpTo8(XReg gRegNo, XSP, Word.toInt newArgOffset, size) @ moveFromStack) ) else (* General case. Store the argument registers if necessary and then use a byte copy to copy into the argument area. This sorts out any odd alignments or lengths. 
In some cases the source will be in memory already. *) let (* The source is either the register value or the value on the stack. *) val (argRegNo, nextGReg, newStack, loadArg) = if size > 0w16 then ( if gRegNo < 0w8 then (XReg gRegNo, gRegNo + 0w1, stackSpace, []) - else (argWorkReg, 0w8, stackSpace+1, [loadRegScaled{regT=argWorkReg, regN=X29, unitOffset=stackSpace}]) + else (argWorkReg, 0w8, stackSpace+1, [LoadRegScaled{loadType=Load64, regT=argWorkReg, regN=X29, unitOffset=stackSpace}]) ) else let val regsNeeded = if size > 0w8 then 0w2 else 0w1 in if gRegNo + regsNeeded <= 0w8 then (XReg gRegNo, gRegNo+regsNeeded, stackSpace, [if size > 0w8 - then storePairOffset{regT1=XReg gRegNo, regT2=XReg(gRegNo+0w1), regN=XSP, unitOffset=2} - else storeRegScaled{regT=XReg gRegNo, regN=XSP, unitOffset=2}, - addImmediate{regD=XReg gRegNo, regN=XSP, immed=0w16, shifted=false}]) + then StoreRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=XReg gRegNo, regT2=XReg(gRegNo+0w1), regN=XSP, unitOffset=2} + else StoreRegScaled{loadType=Load64, regT=XReg gRegNo, regN=XSP, unitOffset=2}, + AddImmediate{opSize=OpSize64, setFlags=false, regD=XReg gRegNo, regN=XSP, immed=0w16, shifted=false}]) else (* Being passed on the stack *) (argWorkReg, 0w8, stackSpace+Word8.toInt regsNeeded, - [addImmediate{regD=argWorkReg, regN=X29, immed=Word.fromInt stackSpace*0w8, shifted=false}]) + [AddImmediate{opSize=OpSize64, setFlags=false, regD=argWorkReg, regN=X29, immed=Word.fromInt stackSpace*0w8, shifted=false}]) end val loopLabel = createLabel() val copyCode = [ - addImmediate{regN=argRegNo, regD=argWorkReg3, immed=size, shifted=false}, (* End of source area *) - addImmediate{regN=XSP, regD=argWorkReg2, immed=newArgOffset+size, shifted=false}, (* end of dest *) - setLabel loopLabel, - loadRegPreIndexByte{regT=argWorkReg4, regN=argWorkReg3, byteOffset= ~1}, - storeRegPreIndexByte{regT=argWorkReg4, regN=argWorkReg2, byteOffset= ~1}, - subSShiftedReg{regM=argWorkReg3, regN=argRegNo, regD=XZero, shift=ShiftNone}, (* At start? *) - conditionalBranch(CondNotEqual, loopLabel) + AddImmediate{opSize=OpSize64, setFlags=false, regN=argRegNo, regD=argWorkReg3, immed=size, shifted=false}, (* End of source area *) + AddImmediate{opSize=OpSize64, setFlags=false, regN=XSP, regD=argWorkReg2, immed=newArgOffset+size, shifted=false}, (* end of dest *) + SetLabel loopLabel, + LoadRegUnscaled{loadType=Load8, unscaledType=PreIndex, regT=argWorkReg4, regN=argWorkReg3, byteOffset= ~1}, + StoreRegUnscaled{loadType=Load8, unscaledType=PreIndex, regT=argWorkReg4, regN=argWorkReg2, byteOffset= ~1}, + SubShiftedReg{opSize=OpSize64, setFlags=true, regM=argWorkReg3, regN=argRegNo, regD=XZero, shift=ShiftNone}, (* At start? *) + ConditionalBranch(CondNotEqual, loopLabel) ] in moveArgs(args, newStack, newArgOffset+size, nextGReg, fpRegNo, loadArg @ copyCode @ moveFromStack) end end val copyArgsFromRegsAndStack = moveArgs(args, 12 (* Offset to first stack arg *), 0w32 (* Size of result area *), 0w0, 0w0, []) local fun getNextSize (arg, argOffset) = let val {size, align, ...} = arg in alignUp(argOffset, align) + size end in val argumentSpace = alignUp(List.foldl getNextSize 0w0 args, 0w16) end local val {size, typeForm, ...} = result in (* Load the results from the result area except that if we're passing the result structure by reference this is done by the caller. Generally similar to how arguments are passed in a call. 
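As an illustration, not something in this patch: assuming classifyArg treats a C struct of three floats as ArgClassHFA(0w3, false), loadFPRegs(0w3, 0w0, 0) below would build the instruction-list fragment

    LoadFPRegPair{regT1=VReg 0w0, regT2=VReg 0w1, regN=XSP, unitOffset=0,
                  unscaledType=NoUpdate, floatSize=Float32} ::
    LoadFPRegScaled{regT=VReg 0w2, regN=XSP, unitOffset=2, floatSize=Float32} :: []

i.e. S0 and S1 are reloaded as a pair from the start of the 32-byte result area and S2 from the following float slot.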
*) val (loadResults, resultByReference) = if typeForm = CTypeVoid then ([], false) else case classifyArg(typeForm, size) of ArgClassHFA(numItems, isDouble) => let (* Load the values to the floating point registers. *) fun loadFPRegs(0w0, _, _) = [] | loadFPRegs(0w1, fpRegNo, offset) = - [(if isDouble then loadRegScaledDouble else loadRegScaledFloat) - {regT=VReg fpRegNo, regN=XSP, unitOffset=offset}] + [LoadFPRegScaled{regT=VReg fpRegNo, regN=XSP, unitOffset=offset, floatSize=if isDouble then Double64 else Float32}] | loadFPRegs(n, fpRegNo, offset) = - (if isDouble then loadPairOffsetDouble else loadPairOffsetFloat) - {regT1=VReg fpRegNo, regT2=VReg(fpRegNo+0w1), regN=XSP, unitOffset=offset} :: + LoadFPRegPair{regT1=VReg fpRegNo, regT2=VReg(fpRegNo+0w1), regN=XSP, unitOffset=offset, unscaledType=NoUpdate, + floatSize=if isDouble then Double64 else Float32} :: loadFPRegs(n-0w2, fpRegNo+0w2, offset+2) in (loadFPRegs(numItems, 0w0, 0 (* result area *)), false) end | ArgLargeStruct => ([], true) (* Structures larger than 16 bytes are passed by reference. *) | _ => (* We've allocated a 32-byte area aligned onto a 16-byte boundary so we can simply load one or two registers. *) if size > 0w8 - then ([loadPairOffset{regT1=X0, regT2=X1, regN=XSP, unitOffset=0}], false) - else ([loadRegScaled{regT=X0, regN=XSP, unitOffset=0}], false) + then ([LoadRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=X0, regT2=X1, regN=XSP, unitOffset=0}], false) + else ([LoadRegScaled{loadType=Load64, regT=X0, regN=XSP, unitOffset=0}], false) end val instructions = [ (* Push LR, FP and the callee-save registers. *) - storePairPreIndexed{regT1=X29, regT2=X30, regN=XSP, unitOffset= ~12}, - moveRegToReg{sReg=XSP, dReg=X29}, - storePairOffset{regT1=X19, regT2=X20, regN=X29, unitOffset=2}, - storePairOffset{regT1=X21, regT2=X22, regN=X29, unitOffset=4}, - storePairOffset{regT1=X23, regT2=X24, regN=X29, unitOffset=6}, - storePairOffset{regT1=X25, regT2=X26, regN=X29, unitOffset=8}, - storePairOffset{regT1=X27, regT2=X28, regN=X29, unitOffset=10}, + StoreRegPair{loadType=Load64, unscaledType=PreIndex, regT1=X29, regT2=X30, regN=XSP, unitOffset= ~12}, + MoveXRegToXReg{sReg=XSP, dReg=X29}, + StoreRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=X19, regT2=X20, regN=X29, unitOffset=2}, + StoreRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=X21, regT2=X22, regN=X29, unitOffset=4}, + StoreRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=X23, regT2=X24, regN=X29, unitOffset=6}, + StoreRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=X25, regT2=X26, regN=X29, unitOffset=8}, + StoreRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=X27, regT2=X28, regN=X29, unitOffset=10}, (* Reserve space for the arguments and results. *) - subImmediate{regN=XSP, regD=XSP, immed=argumentSpace+0w32, shifted=false}, + SubImmediate{opSize=OpSize64, setFlags=false, regN=XSP, regD=XSP, immed=argumentSpace+0w32, shifted=false}, (* We passed the function we're calling in X9 but we need to move it to a callee-save register before we call the RTS. *) - moveRegToReg{sReg=X9, dReg=X20} + MoveXRegToXReg{sReg=X9, dReg=X20} ] @ (* Save X8 if we're going to need it. *) - (if resultByReference then [storeRegScaled{regT=X8, regN=XSP, unitOffset=0}] else []) @ + (if resultByReference then [StoreRegScaled{loadType=Load64, regT=X8, regN=XSP, unitOffset=0}] else []) @ (* Now we've saved X24 we can move the global heap base into it. 
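By this point the whole frame is in place; to spell out what the offsets above imply: the pre-indexed StoreRegPair of X29/X30 with unitOffset= ~12 claims twelve 64-bit words, i.e. 96 bytes, and X29 is then set to the new stack pointer, so X19/X20 through X27/X28 sit at byte offsets 16 to 80 from X29 and the caller's original stack pointer is at X29+96. That is why moveArgs starts its stack-argument offsets at 12 words from X29.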
*) - (if is32in64 then [moveRegToReg{sReg=X10, dReg=X_Base32in64}] else []) @ + (if is32in64 then [MoveXRegToXReg{sReg=X10, dReg=X_Base32in64}] else []) @ copyArgsFromRegsAndStack @ - [loadAddressConstant(X0, getThreadDataCall)] @ + [LoadAddr(X0, getThreadDataCall)] @ ( if is32in64 - then [addShiftedReg{regM=X0, regN=X_Base32in64, regD=X0, shift=ShiftLSL 0w2}] + then [AddShiftedReg{setFlags=false, opSize=OpSize64, regM=X0, regN=X_Base32in64, regD=X0, shift=ShiftLSL 0w2}] else [] ) @ [ (* Call into the RTS to get the thread data ptr. *) - loadRegScaled{regT=X0, regN=X0, unitOffset=0}, - branchAndLinkReg X0, - moveRegToReg{sReg=X0, dReg=X_MLAssemblyInt}, + LoadRegScaled{loadType=Load64, regT=X0, regN=X0, unitOffset=0}, + BranchReg{regD=X0, brRegType=BRRAndLink}, + MoveXRegToXReg{sReg=X0, dReg=X_MLAssemblyInt}, (* Load the ML regs. *) - loadRegScaled{regT=X_MLHeapLimit, regN=X_MLAssemblyInt, unitOffset=heapLimitPtrOffset}, - loadRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset}, - loadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset}, + LoadRegScaled{loadType=Load64, regT=X_MLHeapLimit, regN=X_MLAssemblyInt, unitOffset=heapLimitPtrOffset}, + LoadRegScaled{loadType=Load64, regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset}, + LoadRegScaled{loadType=Load64, regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset}, (* Prepare the arguments. They are both syswords so have to be boxed. First load the address of the argument area which is after the 32-byte result area. *) - addImmediate{regN=XSP, regD=X2, immed=0w32, shifted=false} - ] @ boxSysWord{source=X2, destination=X0, workReg=X3, saveRegs=[]} @ (* Address of arguments. *) + AddImmediate{opSize=OpSize64, setFlags=false, regN=XSP, regD=X2, immed=0w32, shifted=false} + ] @ List.rev(boxSysWord({source=X2, destination=X0, workReg=X3, saveRegs=[]}, [])) @ (* Address of arguments. *) ( (* Result area pointer. If we're returning by reference this is the original value of X8 otherwise it's the address of the 32 bytes we've reserved. *) if resultByReference - then [loadRegScaled{regT=X2, regN=XSP, unitOffset=0}] - else [moveRegToReg{sReg=XSP, dReg=X2}] - ) @ boxSysWord{source=X2, destination=X1, workReg=X3, saveRegs=[]} @ + then [LoadRegScaled{loadType=Load64, regT=X2, regN=XSP, unitOffset=0}] + else [MoveXRegToXReg{sReg=XSP, dReg=X2}] + ) @ List.rev(boxSysWord({source=X2, destination=X1, workReg=X3, saveRegs=[]}, [])) @ (* Put the ML closure pointer, originally in X9 now in X20, into the ML closure pointer register, X8. Then call the ML code. *) - [moveRegToReg{sReg=X20, dReg=X8}] @ + [MoveXRegToXReg{sReg=X20, dReg=X8}] @ ( if is32in64 then [ - addShiftedReg{regM=X8, regN=X_Base32in64, regD=X16, shift=ShiftLSL 0w2}, - loadRegScaled{regT=X16, regN=X16, unitOffset=0} + AddShiftedReg{regM=X8, regN=X_Base32in64, regD=X16, shift=ShiftLSL 0w2, opSize=OpSize64, setFlags=false}, + LoadRegScaled{loadType=Load64, regT=X16, regN=X16, unitOffset=0} ] - else [loadRegScaled{regT=X16, regN=X8, unitOffset=0}] + else [LoadRegScaled{loadType=Load64, regT=X16, regN=X8, unitOffset=0}] ) @ [ - branchAndLinkReg X16, + BranchReg{regD=X16, brRegType=BRRAndLink}, (* Save the ML stack and heap pointers. We could have allocated or grown the stack. The limit pointer is maintained by the RTS. 
*) - storeRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset}, - storeRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset} + StoreRegScaled{loadType=Load64, regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset}, + StoreRegScaled{loadType=Load64, regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset} ] @ loadResults @ (* Load the return values *) [ (* Restore the callee-save registers and return. *) - moveRegToReg{sReg=X29, dReg=XSP}, - loadPairOffset{regT1=X19, regT2=X20, regN=X29, unitOffset=2}, - loadPairOffset{regT1=X21, regT2=X22, regN=X29, unitOffset=4}, - loadPairOffset{regT1=X23, regT2=X24, regN=X29, unitOffset=6}, - loadPairOffset{regT1=X25, regT2=X26, regN=X29, unitOffset=8}, - loadPairOffset{regT1=X27, regT2=X28, regN=X29, unitOffset=10}, - loadPairPostIndexed{regT1=X29, regT2=X30, regN=XSP, unitOffset=12}, - returnRegister X30 + MoveXRegToXReg{sReg=X29, dReg=XSP}, + LoadRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=X19, regT2=X20, regN=X29, unitOffset=2}, + LoadRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=X21, regT2=X22, regN=X29, unitOffset=4}, + LoadRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=X23, regT2=X24, regN=X29, unitOffset=6}, + LoadRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=X25, regT2=X26, regN=X29, unitOffset=8}, + LoadRegPair{loadType=Load64, unscaledType=NoUpdate, regT1=X27, regT2=X28, regN=X29, unitOffset=10}, + LoadRegPair{loadType=Load64, unscaledType=PostIndex, regT1=X29, regT2=X30, regN=XSP, unitOffset=12}, + BranchReg{regD=X30, brRegType=BRRReturn} ] val functionName = "foreignCallBack(2)" val debugSwitches = [(*Universal.tagInject Pretty.compilerOutputTag (Pretty.prettyPrint(print, 70)), Universal.tagInject Debug.assemblyCodeTag true*)] val closure = makeConstantClosure() - val () = generateCode{instrs=instructions, name=functionName, parameters=debugSwitches, - resultClosure=closure, profileObject=createProfileObject()} + val () = generateFinalCode{instrs=instructions, name=functionName, parameters=debugSwitches, + resultClosure=closure, profileObject=createProfileObject(), labelCount=0} val stage2Code = closureAsAddress closure fun resultFunction f = let (* Generate a small function to load the address of f into a register and then jump to stage2. The idea is that it should be possible to generate this eventually in a single RTS call. That could be done by using a version of this as a model. *) val instructions = if is32in64 then (* Get the global heap base into X10. *) - loadGlobalHeapBaseInCallback X10 @ [ - loadAddressConstant(X9, Address.toMachineWord f), + LoadGlobalHeapBaseInCallback X10, + LoadAddr(X9, Address.toMachineWord f), (* Have to load the actual address at run-time. *) - loadAddressConstant(X16, stage2Code), - addShiftedReg{regM=X16, regN=X10, regD=X16, shift=ShiftLSL 0w2}, - loadRegScaled{regT=X16, regN=X16, unitOffset=0}, - branchRegister X16 + LoadAddr(X16, stage2Code), + AddShiftedReg{setFlags=false, opSize=OpSize64, regM=X16, regN=X10, regD=X16, shift=ShiftLSL 0w2}, + LoadRegScaled{loadType=Load64, regT=X16, regN=X16, unitOffset=0}, + BranchReg{regD=X16, brRegType=BRRBranch} ] else let (* We can extract the actual code address in the native address version. 
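Put another way: in native mode the first word of the stage2 closure already holds an absolute code address, so it can be read once with Address.loadWord when this stub is built and planted as a constant. In 32-in-64 the stage2Code value is an object index, so the branch above has to form the absolute closure address at run time as X_Base32in64 plus the index shifted left by two, and only then load the code address from its first word.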
*) val codeAddress = Address.loadWord(Address.toAddress stage2Code, 0w0) in [ - loadAddressConstant(X9, Address.toMachineWord f), - loadAddressConstant(X16, codeAddress), - branchRegister X16 + LoadAddr(X9, Address.toMachineWord f), + LoadAddr(X16, codeAddress), + BranchReg{regD=X16, brRegType=BRRBranch} ] end val functionName = "foreignCallBack(1)" val debugSwitches = [(*Universal.tagInject Pretty.compilerOutputTag (Pretty.prettyPrint(print, 70)), Universal.tagInject Debug.assemblyCodeTag true*)] val closure = makeConstantClosure() - val () = generateCode{instrs=instructions, name=functionName, parameters=debugSwitches, - resultClosure=closure, profileObject=createProfileObject()} + val () = generateFinalCode{instrs=instructions, name=functionName, parameters=debugSwitches, + resultClosure=closure, profileObject=createProfileObject(), labelCount=0} val res = closureAsAddress closure (*val _ = print("Address is " ^ (LargeWord.toString(RunCall.unsafeCast res)) ^ "\n")*) in res end in Address.toMachineWord resultFunction end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64GenCode.sml b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64GenCode.sml deleted file mode 100644 index f062ad5f..00000000 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64GenCode.sml +++ /dev/null @@ -1,2950 +0,0 @@ -(* - Copyright (c) 2021 David C. J. Matthews - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - Licence version 2.1 as published by the Free Software Foundation. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public Licence for more details. - - You should have received a copy of the GNU Lesser General Public - Licence along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -*) - -functor Arm64GenCode ( - structure BackendTree: BACKENDINTERMEDIATECODE - and CodeArray: CODEARRAY - and Arm64Assembly: ARM64ASSEMBLY - and Debug: DEBUG - and Arm64Foreign: FOREIGNCALL - and Arm64Sequences: ARM64SEQUENCES - - sharing BackendTree.Sharing = CodeArray.Sharing = Arm64Assembly.Sharing = Arm64Sequences.Sharing - -) : GENCODE = -struct - - open BackendTree CodeArray Arm64Assembly Address Arm64Sequences - - open BuiltIns - - exception InternalError = Misc.InternalError - - (* tag a short constant *) - fun tag c = 2 * c + 1 - and semitag c = 2*c - - fun taggedWord w: word = w * 0w2 + 0w1 - and taggedWord64 w: Word64.word = w * 0w2 + 0w1 - - val tagBitMask = Word64.<<(Word64.fromInt ~1, 0w1) - val tagBitMask32 = Word64.andb(tagBitMask, 0wxffffffff) - - fun gen(instr, code) = code := instr :: !code - - fun genList([], _) = () - | genList(instr :: instrs, code) = (gen(instr, code); genList(instrs, code)) - - fun genPushReg(reg, code) = gen(storeRegPreIndex{regT=reg, regN=X_MLStackPtr, byteOffset= ~8}, code) - and genPopReg(reg, code) = gen(loadRegPostIndex{regT=reg, regN=X_MLStackPtr, byteOffset= 8}, code) - - (* Load a value using a scaled offset. This uses a normal scaled load if - the offset is in the range and an indexed offset if it is not. 
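A hedged illustration of the two paths, with made-up register choices and offsets picked for the example: loadScaledWord{dest=X1, base=X0, work=X16, offset=3} yields just

    [loadRegScaled{regT=X1, regN=X0, unitOffset=3}]

whereas an offset of 5000 is outside the 12-bit scaled range (0x1000), so the offset is first built in the work register and then used as an index:

    loadNonAddress(X16, 0w5000) @
    [loadRegIndexed{regN=X0, regM=X16, regT=X1, option=ExtUXTX ScaleOrShift}]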
*) - fun loadScaledOffset(scale, loadScaled, loadIndexed) {base, dest, work, offset} = - if offset < 0 then raise InternalError "loadScaledOffset: negative offset" - else if offset < 0x1000 - then [loadScaled{regT=dest, regN=base, unitOffset=offset}] - else - loadNonAddress(work, Word64.fromInt offset) @ - [loadIndexed{regN=base, regM=work, regT=dest, - option=ExtUXTX(if scale = 1 then NoScale else ScaleOrShift)}] - - (* Similar store. *) - and storeScaledOffset(scale, storeScaled, storeIndexed) {base, store, work, offset} = - if offset < 0 then raise InternalError "storeScaledOffset: negative offset" - else if offset < 0x1000 - then [storeScaled{regT=store, regN=base, unitOffset=offset}] - else - loadNonAddress(work, Word64.fromInt offset) @ - [storeIndexed{regN=base, regM=work, regT=store, - option=ExtUXTX(if scale = 1 then NoScale else ScaleOrShift)}] - - val loadScaledWord = loadScaledOffset(8, loadRegScaled, loadRegIndexed) - and storeScaledWord = storeScaledOffset(8, storeRegScaled, storeRegIndexed) - - val loadScaledPolyWord = - if is32in64 - then loadScaledOffset(4, loadRegScaled32, loadRegIndexed32) - else loadScaledOffset(8, loadRegScaled, loadRegIndexed) - - and storeScaledPolyWord = - if is32in64 - then storeScaledOffset(4, storeRegScaled32, storeRegIndexed32) - else storeScaledOffset(8, storeRegScaled, storeRegIndexed) - - (* Add a constant word to the source register and put the result in the - destination. regW is used as a work register if necessary. This is used - both for addition and subtraction. *) - fun addConstantWord({regS, regD, value=0w0, ...}, code) = - if regS = regD then () else gen(moveRegToReg{sReg=regS, dReg=regD}, code) - - | addConstantWord({regS, regD, regW, value}, code) = - let - (* If we have to load the constant it's better if the top 32-bits are - zero if possible. *) - val (isSub, unsigned) = - if value > Word64.<<(0w1, 0w63) - then (true, ~ value) - else (false, value) - in - if unsigned < Word64.<<(0w1, 0w24) - then (* We can put up to 24 in a shifted and an unshifted constant. *) - let - val w = Word.fromLarge(Word64.toLarge unsigned) - val high = Word.andb(Word.>>(w, 0w12), 0wxfff) - val low = Word.andb(w, 0wxfff) - val addSub = if isSub then subImmediate else addImmediate - in - if high <> 0w0 - then - ( - gen(addSub{regN=regS, regD=regD, immed=high, shifted=true}, code); - if low <> 0w0 - then gen(addSub{regN=regD, regD=regD, immed=low, shifted=false}, code) - else () - ) - else gen(addSub{regN=regS, regD=regD, immed=low, shifted=false}, code) - end - else - let - (* To minimise the constant and increase the chances that it - will fit in a single word look to see if we can shift it. *) - fun getShift(value, shift) = - if Word64.andb(value, 0w1) = 0w0 - then getShift(Word64.>>(value, 0w1), shift+0w1) - else (value, shift) - val (shifted, shift) = getShift(unsigned, 0w0) - in - genList(loadNonAddress(regW, shifted), code); - gen((if isSub then subShiftedReg else addShiftedReg) - {regM=regW, regN=regS, regD=regD, shift=ShiftLSL shift}, code) - end - end - - (* Remove items from the stack. If the second argument is true the value - on the top of the stack has to be moved. 
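For example, resetStack(2, true, code) pops the current top into X0, steps X_MLStackPtr up by two native words and pushes X0 back, so the two slots underneath the preserved top value are discarded.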
*) - fun resetStack(0, _, _) = () - | resetStack(nItems, true, code) = - ( - genPopReg(X0, code); - resetStack(nItems, false, code); - genPushReg(X0, code) - ) - | resetStack(nItems, false, code) = - addConstantWord({regS=X_MLStackPtr, regD=X_MLStackPtr, regW=X3, - value=Word64.fromLarge(Word.toLarge nativeWordSize) * Word64.fromInt nItems}, code) - - fun compareRegs(reg1, reg2, code) = - gen(subSShiftedReg{regM=reg2, regN=reg1, regD=XZero, shift=ShiftNone}, code) - - (* Compare registers using 32-bit operation on 323-in-64. *) - fun comparePolyRegs(reg1, reg2, code) = - gen((if is32in64 then subSShiftedReg32 else subSShiftedReg){regM=reg2, regN=reg1, regD=XZero, shift=ShiftNone}, code) - - (* Turn an absolute address into an object index. Does nothing in - native mode. *) - fun absoluteAddressToIndex(reg, cvec) = - if is32in64 - then - ( - gen(subShiftedReg{regM=X_Base32in64, regN=reg, regD=reg, shift=ShiftNone}, cvec); - gen(logicalShiftRight{shift=0w2, regN=reg, regD=reg}, cvec) - ) - else () - - (* Turn an index into an absolute address. *) - and indexToAbsoluteAddress(iReg, absReg, cvec) = - if is32in64 - then gen(addShiftedReg{regM=iReg, regN=X_Base32in64, regD=absReg, shift=ShiftLSL 0w2}, cvec) - else if iReg = absReg - then () - else gen(moveRegToReg{sReg=iReg, dReg=absReg}, cvec) - - (* Get a large word value from a "box". *) - fun unboxLargeWord(addrReg, valReg, cvec) = - if is32in64 - then - ( - indexToAbsoluteAddress(addrReg, valReg, cvec); - gen(loadRegScaled{regT=valReg, regN=valReg, unitOffset=0}, cvec) - ) - else gen(loadRegScaled{regT=valReg, regN=addrReg, unitOffset=0}, cvec) - - fun unboxDouble(addrReg, workReg, valReg, cvec) = - if is32in64 - then - ( - indexToAbsoluteAddress(addrReg, workReg, cvec); - gen(loadRegScaledDouble{regT=valReg, regN=workReg, unitOffset=0}, cvec) - ) - else gen(loadRegScaledDouble{regT=valReg, regN=addrReg, unitOffset=0}, cvec) - - fun unboxOrUntagSingle(addrReg, workReg, valReg, cvec) = - if is32in64 - then gen(loadRegIndexedFloat{regN=X_Base32in64, regM=addrReg, regT=valReg, option=ExtUXTX ScaleOrShift}, cvec) - else - ( - gen(logicalShiftRight{shift=0w32, regN=addrReg, regD=workReg}, cvec); - gen(moveGeneralToFloat{regN=workReg, regD=valReg}, cvec) - ) - - (* Sequence to allocate on the heap. The words are not initialised - apart from the length word. Returns the absolute address. *) - fun genAllocateFixedSize(words, flags, resultReg, workReg, code) = - let - val label = createLabel() - val wordsRequired = - if is32in64 - then (* Have to round this up to 8 bytes *) - Word64.andb(Word64.fromInt(words+2), ~ 0w2) - else Word64.fromInt(words+1) - in - (* Subtract the number of bytes required from the heap pointer and put in X0. *) - addConstantWord({regS=X_MLHeapAllocPtr, regD=X0, regW=X3, - value= ~ (Word64.fromLarge(Word.toLarge wordSize)) * wordsRequired}, code); - compareRegs(resultReg, X_MLHeapLimit, code); - gen(conditionalBranch(CondCarrySet, label), code); - gen(loadRegScaled{regT=X16, regN=X_MLAssemblyInt, unitOffset=heapOverflowCallOffset}, code); - gen(branchAndLinkReg X16, code); - gen(registerMask [], code); (* Not used at the moment. *) - gen(setLabel label, code); - gen(moveRegToReg{sReg=resultReg, dReg=X_MLHeapAllocPtr}, code); - genList(loadNonAddress(workReg, - Word64.orb(Word64.fromInt words, Word64.<<(Word64.fromLarge(Word8.toLarge flags), - if is32in64 then 0w24 else 0w56))), code); - (* Store the length word. Have to use the unaligned version because offset is -ve. 
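As a concrete reading of the header layout, not a value taken from this patch: for a one-word mutable byte cell in native 64-bit mode the length word is Word64.orb(0w1, Word64.<<(Word64.fromLarge(Word8.toLarge flags), 0w56)), the length in the low bits and the flags byte at the top, written at byte offset ~8 immediately below the address that is returned; in 32-in-64 the flags are shifted by 0w24 and the word is written at offset ~4.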
*) - if is32in64 - then gen(storeRegUnscaled32{regT=workReg, regN=resultReg, byteOffset= ~4}, code) - else gen(storeRegUnscaled{regT=workReg, regN=resultReg, byteOffset= ~8}, code) - end - - (* Allocate space on the heap for a vector, string etc. sizeReg and flagsReg - contain the size and flags as untagged values. sizeReg is unchanged, flagsReg - is modified. The result address is in resultReg. All the registers must - be different. *) - fun allocateVariableSize({sizeReg, flagsReg, resultReg}, code) = - let - val trapLabel = createLabel() and noTrapLabel = createLabel() - in - (* Subtract the size as a number of bytes from the allocation ptr. *) - if is32in64 - then - ( - gen(subShiftedReg{regM=sizeReg, regN=X_MLHeapAllocPtr, regD=resultReg, shift=ShiftLSL 0w2}, code); - (* Subtract another 4 to allow for the length word. *) - gen(subImmediate{regN=resultReg, regD=resultReg, immed=0w4, shifted=false}, code); - (* Round this down to an 8-byte boundary. *) - gen(bitwiseAndImmediate{bits= ~ 0w8, regN=resultReg, regD=resultReg}, code) - ) - else - ( - gen(subShiftedReg{regM=sizeReg, regN=X_MLHeapAllocPtr, regD=resultReg, shift=ShiftLSL 0w3}, code); - (* Subtract another 8 to allow for the length word. *) - gen(subImmediate{regN=resultReg, regD=resultReg, immed=0w8, shifted=false}, code) - ); - (* If the size is large enough it is possible that this could wrap round. To check for that - we trap if either the result is less than the limit or if it is now greater than - the allocation pointer. *) - compareRegs(resultReg, X_MLHeapLimit, code); - gen(conditionalBranch(CondCarryClear, trapLabel), code); - compareRegs(resultReg, X_MLHeapAllocPtr, code); - gen(conditionalBranch(CondCarryClear, noTrapLabel), code); - gen(setLabel trapLabel, code); - gen(loadRegScaled{regT=X16, regN=X_MLAssemblyInt, unitOffset=heapOverflowCallOffset}, code); - gen(branchAndLinkReg X16, code); - gen(registerMask [], code); (* Not used at the moment. *) - gen(setLabel noTrapLabel, code); - gen(moveRegToReg{sReg=resultReg, dReg=X_MLHeapAllocPtr}, code); - (* Combine the size with the flags in the top byte. *) - gen(orrShiftedReg{regM=flagsReg, regN=sizeReg, regD=flagsReg, - shift=ShiftLSL(if is32in64 then 0w24 else 0w56)}, code); - (* Store the length word. Have to use the unaligned version because offset is -ve. *) - if is32in64 - then gen(storeRegUnscaled32{regT=flagsReg, regN=resultReg, byteOffset= ~4}, code) - else gen(storeRegUnscaled{regT=flagsReg, regN=resultReg, byteOffset= ~8}, code) - end - - (* Set a register to either tagged(1) i.e. true or tagged(0) i.e. false. *) - fun setBooleanCondition(reg, condition, code) = - ( - genList(loadNonAddress(reg, Word64.fromInt(tag 1)), code); - (* If the condition is false the value used is the XZero incremented by 1 i.e. 1 *) - gen(conditionalSetIncrement{regD=reg, regTrue=reg, regFalse=XZero, cond=condition}, code) - ) - - (* Raise the overflow exception if the overflow bit has been set. *) - fun checkOverflow code = - let - val noOverflow = createLabel() - in - gen(conditionalBranch(CondNoOverflow, noOverflow), code); - gen(loadAddressConstant(X0, toMachineWord Overflow), code); - gen(loadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset}, code); - gen(loadRegScaled{regT=X1, regN=X_MLStackPtr, unitOffset=0}, code); - gen(branchRegister X1, code); - gen(setLabel noOverflow, code) - end - - (* Stack check code: this is inserted at the start of a function to check that there - is sufficient ML stack available. 
It is also inserted, with a zero space value, - in a loop to ensure that the RTS can interrupt a function. - debugTrapAlways can be used to set a sort of breakpoint during debugging. *) - fun checkStackCode(regW, space, debugTrapAlways, code) = - let - val skipCheck = createLabel() - val defaultWords = 10 (* This is wired into the RTS. *) - val (testReg, entryPt) = - if space <= defaultWords - then (X_MLStackPtr, stackOverflowCallOffset) - else - ( - (* This is only used at the start of the code. X9 is wired into the RTS. *) - addConstantWord({regS=X_MLStackPtr, regD=X9, regW=regW, - value= ~ (Word64.fromLarge(Word.toLarge nativeWordSize)) * Word64.fromInt space}, code); - (X9, stackOverflowXCallOffset) - ) - in - gen(loadRegScaled{regT=regW, regN=X_MLAssemblyInt, unitOffset=stackLimitOffset}, code); - if debugTrapAlways - then () - else - ( - compareRegs(testReg, regW, code); - gen(conditionalBranch(CondCarrySet, skipCheck), code) - ); - gen(loadRegScaled{regT=X16, regN=X_MLAssemblyInt, unitOffset=entryPt}, code); - gen(branchAndLinkReg X16, code); - gen(registerMask [], code); (* Not used at the moment. *) - gen(setLabel skipCheck, code) - end - - (* Allocate a single byte cell and store the register into it. The result is - in X0 so reg must not be X0. *) - fun boxLargeWord(reg, code) = - ( - reg <> X0 orelse raise InternalError "boxLargeWord: X0"; - genAllocateFixedSize(Word.toInt(nativeWordSize div wordSize), F_bytes, X0, X5, code); - gen(storeRegScaled{regT=reg, regN=X0, unitOffset=0}, code); - absoluteAddressToIndex(X0, code) - ) - - (* Allocate a single byte cell for a "real" i.e. double-precision floating - point number. *) - fun boxDouble(reg, code) = - ( - genAllocateFixedSize(Word.toInt(0w8 div wordSize), F_bytes, X0, X5, code); - gen(storeRegScaledDouble{regT=reg, regN=X0, unitOffset=0}, code); - absoluteAddressToIndex(X0, code) - ) - - (* Single precision floats are shifted and tagged in native 64-bit but - boxed in 32-in-64. *) - fun boxOrTagFloat(reg, code) = - if is32in64 - then - ( - genAllocateFixedSize(1, F_bytes, X0, X5, code); - gen(storeRegScaledFloat{regT=reg, regN=X0, unitOffset=0}, code); - absoluteAddressToIndex(X0, code) - ) - else - ( - gen(moveFloatToGeneral{regN=V0, regD=X0}, code); - gen(logicalShiftLeft{shift=0w32, regN=X0, regD=X0}, code); - gen(bitwiseOrImmediate{regN=X0, regD=X0, bits=0w1}, code) - ) - - type caseForm = - { - cases : (backendIC * word) list, - test : backendIC, - caseType: caseType, - default : backendIC - } - - (* Where the result, if any, should go *) - datatype whereto = - NoResult (* discard result *) - | ToStack (* Need a result but it can stay on the pseudo-stack *) - | ToX0 (* Need a result in X0. *) - - (* Are we at the end of the function. *) - datatype tail = - EndOfProc - | NotEnd - - - (* The flags byte is the high-order byte of length word. *) - val flagsByteOffset = if isBigEndian then ~ (Word.toInt wordSize) else ~1 - - (* Code generate a function or global declaration *) - fun codegen (pt, name, resultClosure, numOfArgs, localCount, parameters, hasClosure) = - let - val cvec = ref [] - - datatype decEntry = - StackAddr of int - | Empty - - val decVec = Array.array (localCount, Empty) - - (* Count of number of items on the stack. This excludes the arguments and - the return address. *) - val realstackptr = ref 1 (* The closure ptr is already there *) - - (* Maximum size of the stack. *) - val maxStack = ref 1 - - (* Whether the top of the stack is actually in X0. 
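Keeping this flag lets the generator avoid a push followed by an immediate pop in the common case where a value is computed into X0 and consumed straight away; ensureX0 below spills the cached value onto the real stack before anything else needs to be pushed.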
*) - val topInX0 = ref false - - (* Push a value onto the stack. *) - fun incsp () = - ( - realstackptr := !realstackptr + 1; - if !realstackptr > !maxStack - then maxStack := !realstackptr - else () - ) - - (* An entry has been removed from the stack. *) - fun decsp () = realstackptr := !realstackptr - 1 - - fun ensureX0 () = if ! topInX0 then (genPushReg(X0, cvec); incsp(); topInX0 := false) else () - - (* generates code from the tree *) - fun gencde (pt : backendIC, whereto : whereto, tailKind : tail, loopAddr) : unit = - let - val _ = !topInX0 andalso raise InternalError "topInX0 true at start" - - (* Save the stack pointer value here. We may want to reset the stack. *) - val oldsp = !realstackptr; - - (* Operations on ML memory always have the base as an ML address. - Word operations are always word aligned. The higher level will - have extracted any constant offset and scaled it if necessary. - That's helpful for the X86 but not for the ARM. We - have to turn them back into indexes. *) - (* This pushes two values to the stack: the base address and the index. *) - fun genMLAddress({base, index, offset}, scale) = - ( - gencde (base, ToStack, NotEnd, loopAddr); - offset mod scale = 0 orelse raise InternalError "genMLAddress"; - case (index, offset div scale) of - (NONE, soffset) => - (genList(loadNonAddress(X0, Word64.fromInt(tag soffset)), cvec); genPushReg(X0, cvec); incsp()) - | (SOME indexVal, 0) => gencde (indexVal, ToStack, NotEnd, loopAddr) - | (SOME indexVal, soffset) => - ( - gencde (indexVal, ToX0, NotEnd, loopAddr); - (* Add the offset as a shifted but not tagged value. *) - addConstantWord({regS=X0, regD=X0, regW=X1, value=Word64.fromInt(semitag soffset)}, cvec); - genPushReg(X0, cvec); - incsp(); - topInX0 := false - ) - ) - - val genCAddress = genMLAddress - - datatype mlLoadKind = MLLoadOffset of int | MLLoadReg of xReg - - fun genMLLoadAddress({base, index=NONE, offset}, scale) = - (* The index, if any, is a constant. *) - ( - gencde (base, ToX0, NotEnd, loopAddr); - indexToAbsoluteAddress(X0, X0, cvec); - (X0, MLLoadOffset(offset div scale)) - ) - - | genMLLoadAddress({base, index=SOME indexVal, offset}, scale) = - ( - gencde (base, ToStack, NotEnd, loopAddr); (* Push base addr to stack. *) - gencde (indexVal, ToX0, NotEnd, loopAddr); - (* Shift right to remove the tag. N.B. Indexes into ML memory are - unsigned. Unlike on the X86 we can't remove the tag by providing - a displacement and the only options are to scale by either 1 or 8. *) - gen(logicalShiftRight{regN=X0, regD=X0, shift=0w1}, cvec); - (* Add any constant offset. Does nothing if it's zero. *) - addConstantWord({regS=X0, regD=X0, regW=X3, - value=Word64.fromInt (* unsigned *)(offset div scale)}, cvec); - genPopReg(X1, cvec); (* Pop base reg into X1. *) - decsp(); - indexToAbsoluteAddress(X1, X1, cvec); - (X1, MLLoadReg X0) - ) - - (* Similar to genMLLoadAddress but for C addresses. There are two - differences. The index is signed so we use an arithmetic shift and - the base address is a LargeWord value i.e. the actual - address is held in the word pointed at by "base", unlike with - ML addresses. *) - fun genCLoadAddress({base, index=NONE, offset}, scale) = - ( - gencde (base, ToX0, NotEnd, loopAddr); - indexToAbsoluteAddress(X0, X0, cvec); - gen(loadRegScaled{regT=X0, regN=X0, unitOffset=0}, cvec); - (* In principle we could have a problem if the index is out of the - range for the offset field. However the simplifier only - converts index offsets into constant offsets for small positive - offsets. 
*) - (X0, MLLoadOffset(offset div scale)) - ) - | genCLoadAddress({base, index=SOME indexVal, offset}, scale) = - ( - gencde (base, ToStack, NotEnd, loopAddr); (* Push base addr to stack. *) - gencde (indexVal, ToX0, NotEnd, loopAddr); - (* Shift right to remove the tag. C indexes are SIGNED. *) - gen((if is32in64 then arithmeticShiftRight32 else arithmeticShiftRight) - {regN=X0, regD=X0, shift=0w1}, cvec); - (* Add any constant offset. Does nothing if it's zero. *) - addConstantWord({regS=X0, regD=X0, regW=X3, - value=Word64.fromInt (* unsigned *)(offset div scale)}, cvec); - genPopReg(X1, cvec); (* Pop base reg into X1. *) - indexToAbsoluteAddress(X1, X1, cvec); - gen(loadRegScaled{regT=X1, regN=X1, unitOffset=0}, cvec); - decsp(); - (X1, MLLoadReg X0) - ) - - (* Have to sign-extend 32-bit signed index in 32-in64. *) - val cIndexFactor = (if is32in64 then ExtSXTW else ExtUXTX) - - (* Compare a block of bytes. Jumps to labelEqual if all the bytes are - equal up to the length. Otherwise it drops through with the condition - code set to the last byte comparison that tested unequal. *) - fun blockCompareBytes(leftArg, rightArg, length, labelEqual, setZeroCC) = - let - val loopLabel = createLabel() - in - genMLAddress(leftArg, 1); - genMLAddress(rightArg, 1); - gencde (length, ToX0, NotEnd, loopAddr); - genPopReg(X2, cvec); (* right arg index - tagged value. *) - genPopReg(X1, cvec); (* right arg base address. *) - indexToAbsoluteAddress(X1, X1, cvec); - (* Add in the index N.B. ML index values are unsigned. *) - gen(addShiftedReg{regM=X2, regN=X1, regD=X1, shift=ShiftLSR 0w1}, cvec); - genPopReg(X3, cvec); (* left index *) - genPopReg(X2, cvec); - indexToAbsoluteAddress(X2, X2, cvec); - decsp(); decsp(); decsp(); decsp(); - gen(addShiftedReg{regM=X3, regN=X2, regD=X2, shift=ShiftLSR 0w1}, cvec); - (* Untag the length *) - gen(logicalShiftRight{regN=X0, regD=X0, shift=0w1}, cvec); - (* If necessary set the cc for the case where the length is zero. *) - if setZeroCC then compareRegs(X0, X0, cvec) else (); - gen(setLabel loopLabel, cvec); - gen(compareBranchZero(X0, labelEqual), cvec); - (* X2 is left arg addr, X1 is right arg addr. *) - gen(loadRegPostIndexByte{regT=X4, regN=X2, byteOffset=1}, cvec); - gen(loadRegPostIndexByte{regT=X3, regN=X1, byteOffset=1}, cvec); - compareRegs(X4, X3, cvec); - gen(subImmediate{regN=X0, regD=X0, immed=0w1, shifted=false}, cvec); - (* Loop if they're equal. *) - gen(conditionalBranch(CondEqual, loopLabel), cvec) - end - - val () = - case pt of - BICEval evl => genEval (evl, tailKind) - - | BICExtract ext => - (* This may just be being used to discard a value which isn't - used on this branch. N.B. genProc for mutual closures - assumes that this does not affect X1. *) - if whereto = NoResult then () - else - let - fun loadLocalStackValue addr = - ( - genList (loadScaledWord{dest=X0, base=X_MLStackPtr, work=X16, offset= !realstackptr + addr}, cvec); - topInX0 := true - ) - in - case ext of - BICLoadArgument locn => - (* The register arguments appear in order on the - stack, followed by the stack argumens in reverse - order. *) - if locn < 8 - then loadLocalStackValue (locn+1) - else loadLocalStackValue (numOfArgs-locn+8) - | BICLoadLocal locn => - ( - case Array.sub (decVec, locn) of - StackAddr n => loadLocalStackValue (~ n) - | _ => (* Should be on the stack, not a function. *) - raise InternalError "locaddr: bad stack address" - ) - | BICLoadClosure locn => - ( - loadLocalStackValue ~1; (* The closure itself. 
*) - indexToAbsoluteAddress(X0, X0, cvec); - genList(loadScaledPolyWord{dest=X0, base=X0, work=X16, - (* The first word is the code. This is two poly words in 32-in-64. *) - offset=if is32in64 then locn+2 else locn+1}, cvec) - ) - | BICLoadRecursive => - loadLocalStackValue ~1 (* The closure itself - first value on the stack. *) - end - - | BICField {base, offset} => - ( - gencde (base, ToX0, NotEnd, loopAddr); - if is32in64 - then - ( - (* Can use an indexed load if the offset is zero otherwise it takes two instrs. *) - if offset = 0 - then gen(loadRegIndexed32{regN=X_Base32in64, regM=X0, regT=X0, option=ExtUXTX ScaleOrShift}, cvec) - else - ( - indexToAbsoluteAddress(X0, X0, cvec); - genList (loadScaledPolyWord{dest=X0, base=X0, work=X16, offset=offset}, cvec) - ) - ) - else genList (loadScaledPolyWord{dest=X0, base=X0, work=X16, offset=offset}, cvec) - ) - - | BICLoadContainer {base, offset} => - ( - gencde (base, ToX0, NotEnd, loopAddr); - genList (loadScaledWord{dest=X0, base=X0, work=X16, offset=offset}, cvec) - ) - - | BICLambda lam => genProc (lam, false, fn () => ()) - - | BICConstnt(w, _) => - ( - if isShort w - then - let - val taggedValue = taggedWord64(Word64.fromLarge(Word.toLarge(toShort w))) - in - genList(loadNonAddress(X0, - (* We have sign extended this to a 64-bit value but the high-order bits - should always be zero. *) - if is32in64 then LargeWord.andb(taggedValue, 0wxffffffff) else taggedValue), cvec) - end - else gen(loadAddressConstant(X0, w), cvec); - topInX0 := true - ) - - | BICCond (testPart, thenPart, elsePart) => - genCond (testPart, thenPart, elsePart, whereto, tailKind, loopAddr) - - | BICNewenv(decls, exp) => - let - (* Processes a list of entries. *) - - (* Mutually recursive declarations. May be either lambdas or constants. Recurse down - the list pushing the addresses of the closure vectors, then unwind the - recursion and fill them in. *) - fun genMutualDecs [] = () - - | genMutualDecs ({lambda, addr, ...} :: otherDecs) = - genProc (lambda, true, - fn() => - ( - Array.update (decVec, addr, StackAddr (! realstackptr)); - genMutualDecs (otherDecs) - )) - - fun codeDecls(BICRecDecs dl) = genMutualDecs dl - - | codeDecls(BICDecContainer{size, addr}) = - ( - (* If this is a container we have to process it here otherwise it - will be removed in the stack adjustment code. *) - (* The stack entries have to be initialised. Set them to tagged(0). *) - genList(loadNonAddress(X0, Word64.fromInt(tag 0)), cvec); - let fun pushN 0 = () | pushN n = (genPushReg(X0, cvec); pushN (n-1)) in pushN size end; - gen(moveRegToReg{sReg=X_MLStackPtr, dReg=X0}, cvec); - genPushReg(X0, cvec); (* Push the address of this container. *) - realstackptr := !realstackptr + size + 1; (* Pushes N words plus the address. *) - Array.update (decVec, addr, StackAddr(!realstackptr)) - ) - - | codeDecls(BICDeclar{value, addr, ...}) = - ( - gencde (value, ToStack, NotEnd, loopAddr); - Array.update (decVec, addr, StackAddr(!realstackptr)) - ) - | codeDecls(BICNullBinding exp) = gencde (exp, NoResult, NotEnd, loopAddr) - in - List.app codeDecls decls; - gencde (exp, whereto, tailKind, loopAddr) - end - - | BICBeginLoop {loop=body, arguments} => - (* Execute the body which will contain at least one Loop instruction. - There will also be path(s) which don't contain Loops and these - will drop through. *) - let - val args = List.map #1 arguments - (* Evaluate each of the arguments, pushing the result onto the stack. 
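Each argument therefore ends up in a fixed stack slot, recorded both in decVec and in argIndexList; the matching Loop code further down evaluates the new values onto the top of the stack first, so later arguments can still see the old values of earlier ones, and only then copies them down into those recorded slots before branching back to startLoop.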
*) - fun genLoopArg ({addr, value, ...}) = - ( - gencde (value, ToStack, NotEnd, loopAddr); - Array.update (decVec, addr, StackAddr (!realstackptr)); - !realstackptr (* Return the posn on the stack. *) - ) - val argIndexList = map genLoopArg args; - - val startSp = ! realstackptr; (* Remember the current top of stack. *) - val startLoop = createLabel () - val () = gen(setLabel startLoop, cvec) (* Start of loop *) - in - (* Process the body, passing the jump-back address down for the Loop instruction(s). *) - gencde (body, whereto, tailKind, SOME(startLoop, startSp, argIndexList)) - (* Leave the arguments on the stack. They can be cleared later if needed. *) - end - - | BICLoop argList => (* Jump back to the enclosing BeginLoop. *) - let - val (startLoop, startSp, argIndexList) = - case loopAddr of - SOME l => l - | NONE => raise InternalError "No BeginLoop for Loop instr" - (* Evaluate the arguments. First push them to the stack because evaluating - an argument may depend on the current value of others. Only when we've - evaluated all of them can we overwrite the original argument positions. *) - fun loadArgs ([], []) = !realstackptr - startSp (* The offset of all the args. *) - | loadArgs (arg:: argList, _ :: argIndexList) = - let - (* Evaluate all the arguments. *) - val () = gencde (arg, ToStack, NotEnd, NONE); - val argOffset = loadArgs(argList, argIndexList); - in - genPopReg(X0, cvec); - genList(storeScaledWord{store=X0, base=X_MLStackPtr, work=X16, offset=argOffset-1}, cvec); - decsp(); (* The argument has now been popped. *) - argOffset - end - | loadArgs _ = raise InternalError "loadArgs: Mismatched arguments"; - - val _: int = loadArgs(List.map #1 argList, argIndexList) - in - if !realstackptr <> startSp - then resetStack (!realstackptr - startSp, false, cvec) (* Remove any local variables. *) - else (); - - (* Jump back to the start of the loop. *) - checkStackCode(X10, 0, false, cvec); - - gen(unconditionalBranch startLoop, cvec) - end - - | BICRaise exp => - ( - gencde (exp, ToStack, NotEnd, loopAddr); - genPopReg(X0, cvec); - (* Copy the handler "register" into the stack pointer. Then - jump to the address in the first word. The second word is - the next handler. This is set up in the handler. We have a lot - more raises than handlers since most raises are exceptional conditions - such as overflow so it makes sense to minimise the code in each raise. *) - gen(loadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset}, cvec); - gen(loadRegScaled{regT=X1, regN=X_MLStackPtr, unitOffset=0}, cvec); - gen(branchRegister X1, cvec) - ) - - | BICHandle {exp, handler, exPacketAddr} => - let - (* Save old handler *) - val () = gen(loadRegScaled{regT=X0, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset}, cvec) - val () = genPushReg(X0, cvec) - val () = incsp () - val handlerLabel = createLabel() - (* Push address of handler. *) - val () = gen(loadLabelAddress(X0, handlerLabel), cvec) - val () = genPushReg(X0, cvec) - val () = incsp() - (* Store the address of the stack pointer into the handler register. *) - val () = gen(storeRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset}, cvec) - - (* Code generate the body; "NotEnd" because we have to come back - to remove the handler; "ToStack" because delHandler needs - a result to carry down. *) - val () = gencde (exp, ToStack, NotEnd, loopAddr) - - (* Now get out of the handler and restore the old one. *) - val () = genPopReg(X0, cvec) (* Pop the result. 
*) - val () = genPopReg(X1, cvec) (* Pop and discard the handler address. *) - val () = genPopReg(X1, cvec) (* Pop the old handler. *) - val () = gen(storeRegScaled{regT=X1, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset}, cvec) - val () = genPushReg(X0, cvec) (* Push the result. *) - - val skipHandler = createLabel() - val () = gen(unconditionalBranch skipHandler, cvec) - val () = realstackptr := oldsp - val () = gen(setLabel handlerLabel, cvec) - (* The exception raise code resets the stack pointer to the value in the exception handler - so this is probably redundant. Leave it for the moment, *) - val () = gen(loadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset}, cvec) - (* We must, though, restore the old handler. *) - val () = genPopReg(X1, cvec) (* Pop and discard the handler address. *) - val () = genPopReg(X1, cvec) (* Pop the old handler. *) - val () = gen(storeRegScaled{regT=X1, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset}, cvec) - (* Push the exception packet which is in X0 and set the address. *) - val () = genPushReg(X0, cvec) - val () = incsp () - val () = Array.update (decVec, exPacketAddr, StackAddr(!realstackptr)) - val () = gencde (handler, ToStack, NotEnd, loopAddr) - (* Have to remove the exception packet. *) - val () = resetStack(1, true, cvec) - val () = decsp() - - (* Finally fix-up the jump around the handler *) - val () = gen(setLabel skipHandler, cvec) - in - () - end - - | BICCase ({cases, test, default, firstIndex, ...}) => - let - val () = gencde (test, ToStack, NotEnd, loopAddr) - (* Label to jump to at the end of each case. *) - val exitJump = createLabel() - val () = genPopReg(X0, cvec) - val () = decsp () - - (* Subtract the minimum even if it is zero to remove the tag. - This leaves us with a shifted but untagged value. Don't check for overflow. - Instead allow large values to wrap around and check later.*) - val () = addConstantWord({regS=X0, regD=X0, regW=X1, - value= ~(taggedWord64(Word64.fromLarge(Word.toLargeX firstIndex)))}, cvec) - - (* Create the case labels. *) - val nCases = List.length cases - val caseLabels = List.tabulate(nCases, fn _ => createLabel()) - val defaultLabel = createLabel() - - (* Compare with the number of cases and go to the default if it is - not less. We use an unsigned comparison and compare with - the semitagged value because we've removed the tag bit. *) - (* TODO: Not necessary if it exhaustive. *) - (* For the moment load the value into a register and compare. *) - val () = genList(loadNonAddress(X1, Word64.fromInt nCases * 0w2), cvec) - val () = compareRegs(X0, X1, cvec) - val () = gen(conditionalBranch(CondCarrySet, defaultLabel), cvec) - (* Load the address of the jump table. *) - val tableLabel = createLabel() - val () = gen(loadLabelAddress(X1, tableLabel), cvec) - (* Add the value shifted by one since it's already shifted. *) - val () = gen(addShiftedReg{regM=X0, regN=X1, regD=X0, shift=ShiftLSL 0w1}, cvec) - val () = gen(branchRegister X0, cvec) - (* Put in the branch table. *) - val () = gen(setLabel tableLabel, cvec) - val () = List.app(fn label => gen(unconditionalBranch label, cvec)) caseLabels - - (* The default case, if any, follows the case statement. *) - (* If we have a jump to the default set it to jump here. 
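A worked note on the dispatch sequence above: after the tagged firstIndex has been subtracted, X0 holds 2*i for case i, so adding it to the table address shifted left by one more bit lands 4*i bytes past tableLabel, which is exactly the i-th unconditionalBranch since each planted branch is a single 4-byte instruction.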
*) - local - fun fixDefault(NONE, defCase) = gen(setLabel defCase, cvec) - | fixDefault(SOME _, _) = () - in - val () = ListPair.appEq fixDefault (cases, caseLabels) - end - val () = gen(setLabel defaultLabel, cvec) - val () = gencde (default, whereto, tailKind, loopAddr) - - fun genCases(SOME body, label) = - ( - (* First exit from the previous case or the default if - this is the first. *) - gen(unconditionalBranch exitJump, cvec); - (* Remove the result - the last case will leave it. *) - case whereto of ToStack => decsp () | NoResult => () | ToX0 => (); - topInX0 := false; - (* Fix up the jump to come here. *) - gen(setLabel label, cvec); - gencde (body, whereto, tailKind, loopAddr) - ) - | genCases(NONE, _) = () - - val () = ListPair.appEq genCases (cases, caseLabels) - - (* Finally set the exit jump to come here. *) - val () = gen(setLabel exitJump, cvec) - in - () - end - - | BICTuple recList => - let - val size = List.length recList - in - (* Get the fields and push them to the stack. *) - List.app(fn v => gencde (v, ToStack, NotEnd, loopAddr)) recList; - genAllocateFixedSize(size, 0w0, X0, X1, cvec); - List.foldl(fn (_, w) => - ( - genPopReg(X1, cvec); - genList(storeScaledPolyWord{store=X1, base=X0, work=X16, offset=w-1}, cvec); - w-1) - ) - size recList; - (* If it's 32-in-64 this has to be converted to an object pointer. *) - absoluteAddressToIndex(X0, cvec); - topInX0 := true; - realstackptr := !realstackptr - size - end - - | BICSetContainer{container, tuple, filter} => - (* Copy the contents of a tuple into a container. If the tuple is a - Tuple instruction we can avoid generating the tuple and then - unpacking it and simply copy the fields that make up the tuple - directly into the container. *) - ( - case tuple of - BICTuple cl => - (* Simply set the container from the values. *) - let - (* Push the address of the container to the stack. *) - val _ = gencde (container, ToStack, NotEnd, loopAddr) - - fun setValues([], _, _) = () - - | setValues(v::tl, sourceOffset, destOffset) = - if sourceOffset < BoolVector.length filter andalso BoolVector.sub(filter, sourceOffset) - then - ( - (* Get the value to store into X0. *) - gencde (v, ToX0, NotEnd, loopAddr); - (* Load the address of the container from the stack and store - the value into the container. *) - gen(loadRegScaled{regT=X1, regN=X_MLStackPtr, unitOffset=0}, cvec); - genList(storeScaledWord{store=X0, base=X1, work=X16, offset=destOffset}, cvec); - topInX0 := false; (* We've used it. *) - setValues(tl, sourceOffset+1, destOffset+1) - ) - else setValues(tl, sourceOffset+1, destOffset) - in - setValues(cl, 0, 0) - (* The container address is still on the stack. *) - end - - | _ => - let (* General case: copy values from the source tuple. *) - (* First the target tuple, then the container. *) - val () = gencde (tuple, ToStack, NotEnd, loopAddr) - val () = gencde (container, ToX0, NotEnd, loopAddr) - val () = genPopReg(X1, cvec) - val () = indexToAbsoluteAddress(X1, X1, cvec) - val () = decsp() - (* Container address is in X0, tuple in X1. *) - - val last = BoolVector.foldli(fn (i, true, _) => i | (_, false, n) => n) ~1 filter - - fun copy (sourceOffset, destOffset) = - if BoolVector.sub(filter, sourceOffset) - then - ( - (* Load the value in the tuple. *) - genList(loadScaledPolyWord{dest=X2, base=X1, work=X16, offset=sourceOffset}, cvec); - (* Store into the container. 
*) - genList(storeScaledWord{store=X2, base=X0, work=X16, offset=destOffset}, cvec); - if sourceOffset = last - then () - else copy (sourceOffset+1, destOffset+1) - ) - else copy(sourceOffset+1, destOffset) - in - copy (0, 0); - topInX0 := true (* Container address is in X0 *) - end - ) - - | BICTagTest { test, tag=tagValue, ... } => - ( - gencde (test, ToStack, NotEnd, loopAddr); - genPopReg(X0, cvec); - gen(subSImmediate{regN=X0, regD=XZero, immed=taggedWord tagValue, shifted=false}, cvec); - setBooleanCondition(X0, CondEqual, cvec); - genPushReg(X0, cvec) - ) - - | BICNullary {oper=GetCurrentThreadId} => - ( - gen(loadRegScaled{regT=X0, regN=X_MLAssemblyInt, unitOffset=threadIdOffset}, cvec); - topInX0 := true - ) - - | BICNullary {oper=CheckRTSException} => - (* Raise an exception in ML if the last RTS call set the exception packet. *) - let (* It may be better to do this in all RTS calls. *) - val noException = createLabel() - in - (* Load the packet and see if it is nil (tagged 0) *) - gen(loadRegScaled{regT=X0, regN=X_MLAssemblyInt, unitOffset=exceptionPacketOffset}, cvec); - gen(subSImmediate{regN=X0, regD=XZero, immed=taggedWord 0w0, shifted=false}, cvec); - gen(conditionalBranch(CondEqual, noException), cvec); - (* If it isn't then raise the exception. *) - gen(loadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset}, cvec); - gen(loadRegScaled{regT=X1, regN=X_MLStackPtr, unitOffset=0}, cvec); - gen(branchRegister X1, cvec); - gen(setLabel noException, cvec) - end - - | BICNullary {oper=CreateMutex} => - let - (* Allocate memory for a mutex. Use a native word as a mutable, weak, no-overwrite, byte cell - which is the same as a volatileRef. This ensures that it will always be cleared when it is - loaded even if it was locked when it was saved. *) - val flags = Word8.orb(F_mutable, Word8.orb(F_weak, Word8.orb(F_noOverwrite, F_bytes))) (* 0wx69 *) - in - genAllocateFixedSize(Word.toInt(nativeWordSize div wordSize), flags, X0, X5, cvec); - gen(storeRegScaled{regT=XZero, regN=X0, unitOffset=0}, cvec); - absoluteAddressToIndex(X0, cvec); - topInX0 := true - end - - | BICUnary { oper, arg1 } => genUnary(oper, arg1, loopAddr) - - | BICBinary { oper, arg1, arg2 } => genBinary(oper, arg1, arg2, loopAddr) - - | BICAllocateWordMemory {numWords, flags, initial } => - let - fun doAllocateAndInit() = - let - val () = gencde (numWords, ToStack, NotEnd, loopAddr) - val () = gencde (flags, ToStack, NotEnd, loopAddr) - val () = gencde (initial, ToX0, NotEnd, loopAddr) - val exitLabel = createLabel() and loopLabel = createLabel() - in - genPopReg(X2, cvec); (* Flags as tagged value. *) - gen(logicalShiftRight32(*byte*){regN=X2, regD=X2, shift=0w1}, cvec); - genPopReg(X1, cvec); (* Length as tagged value. *) - gen(logicalShiftRight{regN=X1, regD=X1, shift=0w1}, cvec); - genPushReg(X0, cvec); (* Save initialiser - TODO: Add it to save set. *) - allocateVariableSize({sizeReg=X1, flagsReg=X2, resultReg=X0}, cvec); - genPopReg(X3, cvec); (* Pop initialiser. *) - (* Add the length in bytes so we point at the end. *) - gen(addShiftedReg{regM=X1, regN=X0, regD=X1, - shift=ShiftLSL(if is32in64 then 0w2 else 0w3)}, cvec); - (* Loop to initialise. *) - gen(setLabel loopLabel, cvec); - compareRegs(X1, X0, cvec); (* Are we at the start? 
*) - gen(conditionalBranch(CondEqual, exitLabel), cvec); - if is32in64 - then gen(storeRegPreIndex32{regT=X3, regN=X1, byteOffset= ~4}, cvec) - else gen(storeRegPreIndex{regT=X3, regN=X1, byteOffset= ~8}, cvec); - gen(unconditionalBranch loopLabel, cvec); - gen(setLabel exitLabel, cvec); - absoluteAddressToIndex(X0, cvec); - decsp(); decsp() - end - in - case (numWords, flags) of - (BICConstnt(length, _), BICConstnt(flagValue, _)) => - if isShort length andalso toShort length = 0w1 andalso isShort flagValue - then (* This is a very common case for refs. *) - let - val flagByte = Word8.fromLargeWord(Word.toLargeWord(toShort flagValue)) - in - gencde (initial, ToStack, NotEnd, loopAddr); (* Initialiser. *) - genAllocateFixedSize(1, flagByte, X0, X1, cvec); - genPopReg(X1, cvec); - gen((if is32in64 then storeRegScaled32 else storeRegScaled){regT=X1, regN=X0, unitOffset=0}, cvec); - absoluteAddressToIndex(X0, cvec); - decsp(); topInX0 := true - end - else (* Constant but not a single. *) doAllocateAndInit() - | _ => (* Not constant. *) doAllocateAndInit() - end - - | BICLoadOperation { kind=LoadStoreMLWord {isImmutable=false}, address={base, index=NONE, offset=0}} => - ( - gencde (base, ToX0, NotEnd, loopAddr); - indexToAbsoluteAddress(X0, X0, cvec); - gen((if is32in64 then loadAcquire32 else loadAcquire){regN=X0, regT=X0}, cvec) - ) - - | BICLoadOperation { kind=LoadStoreMLWord _, address} => - ( - case genMLLoadAddress(address, Word.toInt wordSize) of - (base, MLLoadOffset offset) => - genList(loadScaledPolyWord{dest=X0, base=base, work=X16, offset=offset}, cvec) - | (base, MLLoadReg indexR) => - gen((if is32in64 then loadRegIndexed32 else loadRegIndexed) - {regN=base, regM=indexR, regT=X0, option=ExtUXTX ScaleOrShift}, cvec) - ) - - | BICLoadOperation { kind=LoadStoreMLByte _, address} => - ( - case genMLLoadAddress(address, 1) of - (base, MLLoadOffset offset) => - gen(loadRegScaledByte{regT=X0, regN=base, unitOffset=offset}, cvec) - | (base, MLLoadReg indexR) => - gen(loadRegIndexedByte{regN=base, regM=indexR, regT=X0, option=ExtUXTX NoScale}, cvec); - - (* Have to tag the result. *) - gen(logicalShiftLeft32{regN=X0, regD=X0, shift=0w1}, cvec); - gen(bitwiseOrImmediate32{regN=X0, regD=X0, bits=0w1}, cvec) - ) - - | BICLoadOperation { kind=LoadStoreC8, address} => - ( - case genCLoadAddress(address, 1) of - (base, MLLoadOffset offset) => - if offset < 0 (* C offsets can be negative. *) - then gen(loadRegUnscaledByte{regT=X0, regN=base, byteOffset=offset}, cvec) - else gen(loadRegScaledByte{regT=X0, regN=base, unitOffset=offset}, cvec) - | (base, MLLoadReg indexR) => - gen(loadRegIndexedByte{regN=base, regM=indexR, regT=X0, option=cIndexFactor NoScale}, cvec); - - (* Have to tag the result. *) - gen(logicalShiftLeft32{regN=X0, regD=X0, shift=0w1}, cvec); - gen(bitwiseOrImmediate32{regN=X0, regD=X0, bits=0w1}, cvec) - ) - - | BICLoadOperation { kind=LoadStoreC16, address} => - ( - case genCLoadAddress(address, 2) of - (base, MLLoadOffset offset) => - if offset < 0 (* C offsets can be negative. *) - then gen(loadRegUnscaled16{regT=X0, regN=base, byteOffset=offset*2}, cvec) - else gen(loadRegScaled16{regT=X0, regN=base, unitOffset=offset}, cvec) - | (base, MLLoadReg indexR) => - gen(loadRegIndexed16{regN=base, regM=indexR, regT=X0, option=cIndexFactor ScaleOrShift}, cvec); - - (* Have to tag the result. 
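Short ML values are represented as 2*v+1, so an untagged byte b read here has to become the word 2*b+1; the shift left by one followed by the OR with 1 below does exactly that, e.g. the byte 0x41 is pushed as 0x83.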
*) - gen(logicalShiftLeft32{regN=X0, regD=X0, shift=0w1}, cvec); - gen(bitwiseOrImmediate32{regN=X0, regD=X0, bits=0w1}, cvec) - ) - - | BICLoadOperation { kind=LoadStoreC32, address} => - ( - case genCLoadAddress(address, 4) of - (base, MLLoadOffset offset) => - if offset < 0 (* C offsets can be negative. *) - then gen(loadRegUnscaled32{regT=X1, regN=base, byteOffset=offset*4}, cvec) - else gen(loadRegScaled32{regT=X1, regN=base, unitOffset=offset}, cvec) - | (base, MLLoadReg indexR) => - gen(loadRegIndexed32{regN=base, regM=indexR, regT=X1, option=cIndexFactor ScaleOrShift}, cvec); - - (* Tag this in native 64-bits but box it in 32-in-64 *) - if is32in64 - then boxLargeWord(X1, cvec) - else - ( - gen(logicalShiftLeft{regN=X1, regD=X0, shift=0w1}, cvec); - gen(bitwiseOrImmediate{regN=X0, regD=X0, bits=0w1}, cvec) - ) - ) - - | BICLoadOperation { kind=LoadStoreC64, address} => - ( - case genCLoadAddress(address, 8) of - (base, MLLoadOffset offset) => - if offset < 0 (* C offsets can be negative. *) - then gen(loadRegUnscaled{regT=X1, regN=base, byteOffset=offset*8}, cvec) - else gen(loadRegScaled{regT=X1, regN=base, unitOffset=offset}, cvec) - | (base, MLLoadReg indexR) => - gen(loadRegIndexed{regN=base, regM=indexR, regT=X1, option=cIndexFactor ScaleOrShift}, cvec); - - (* Load the value at the address and box it. *) - boxLargeWord(X1, cvec) - ) - - | BICLoadOperation { kind=LoadStoreCFloat, address} => - ( - case genCLoadAddress(address, 4) of - (base, MLLoadOffset offset) => - if offset < 0 (* C offsets can be negative. *) - then gen(loadRegUnscaledFloat{regT=V0, regN=base, byteOffset=offset*4}, cvec) - else gen(loadRegScaledFloat{regT=V0, regN=base, unitOffset=offset}, cvec) - | (base, MLLoadReg indexR) => - gen(loadRegIndexedFloat{regN=base, regM=indexR, regT=V0, option=cIndexFactor ScaleOrShift}, cvec); - (* This is defined to return a "real" i.e. a double so we need to convert it - to a double and then box the result. *) - gen(convertFloatToDouble{regN=V0, regD=V0}, cvec); - boxDouble(V0, cvec) - ) - - | BICLoadOperation { kind=LoadStoreCDouble, address} => - ( - case genCLoadAddress(address, 8) of - (base, MLLoadOffset offset) => - if offset < 0 (* C offsets can be negative. *) - then gen(loadRegUnscaledDouble{regT=V0, regN=base, byteOffset=offset*8}, cvec) - else gen(loadRegScaledDouble{regT=V0, regN=base, unitOffset=offset}, cvec) - | (base, MLLoadReg indexR) => - gen(loadRegIndexedDouble{regN=base, regM=indexR, regT=V0, option=cIndexFactor ScaleOrShift}, cvec); - (* Box the result. *) - boxDouble(V0, cvec) - ) - - | BICLoadOperation { kind=LoadStoreUntaggedUnsigned, address} => - ( - case genMLLoadAddress(address, Word.toInt wordSize) of - (base, MLLoadOffset offset) => - genList(loadScaledPolyWord{dest=X0, base=base, work=X16, offset=offset}, cvec) - | (base, MLLoadReg indexR) => - gen((if is32in64 then loadRegIndexed32 else loadRegIndexed) - {regN=base, regM=indexR, regT=X0, option=ExtUXTX ScaleOrShift}, cvec); - - (* Have to tag the result. 
*) - gen(logicalShiftLeft{regN=X0, regD=X0, shift=0w1}, cvec); - gen(bitwiseOrImmediate{regN=X0, regD=X0, bits=0w1}, cvec) - ) - - | BICStoreOperation { kind=LoadStoreMLWord _, address={base, index=NONE, offset=0}, value } => - ( - gencde (base, ToStack, NotEnd, loopAddr); - gencde (value, ToX0, NotEnd, loopAddr); - genPopReg(X1, cvec); (* Address *) - indexToAbsoluteAddress(X1, X1, cvec); - decsp(); - gen((if is32in64 then storeRelease32 else storeRelease){regN=X1, regT=X0}, cvec) - ) - - | BICStoreOperation { kind=LoadStoreMLWord _, address, value } => - ( - genMLAddress(address, Word.toInt wordSize); - gencde (value, ToStack, NotEnd, loopAddr); - genPopReg(X0, cvec); (* Value to store *) - genPopReg(X1, cvec); (* Index: a tagged value. *) - (* Shift right to remove the tag. N.B. Indexes into ML memory are - unsigned. Unlike on the X86 we can't remove the tag by providing - a displacement and the only options are to scale by either 1 or 8. *) - gen(logicalShiftRight{regN=X1, regD=X1, shift=0w1}, cvec); - genPopReg(X2, cvec); (* Base address. *) - indexToAbsoluteAddress(X2, X2, cvec); - gen((if is32in64 then storeRegIndexed32 else storeRegIndexed) - {regN=X2, regM=X1, regT=X0, option=ExtUXTX ScaleOrShift}, cvec); - (* Don't put the unit result in; it probably isn't needed, *) - decsp(); decsp(); decsp() - ) - - | BICStoreOperation { kind=LoadStoreMLByte _, address, value } => - ( - (* Untag the value and store the byte. *) - genMLAddress(address, 1); - gencde (value, ToStack, NotEnd, loopAddr); - genPopReg(X0, cvec); (* Value to store *) - gen(logicalShiftRight{regN=X0, regD=X0, shift=0w1}, cvec); (* Untag it *) - genPopReg(X1, cvec); (* Index: a tagged value. *) - (* Shift right to remove the tag. N.B. Indexes into ML memory are - unsigned. Unlike on the X86 we can't remove the tag by providing - a displacement and the only options are to scale by either 1 or 8. *) - gen(logicalShiftRight{regN=X1, regD=X1, shift=0w1}, cvec); - genPopReg(X2, cvec); (* Base address. *) - indexToAbsoluteAddress(X2, X2, cvec); - gen(storeRegIndexedByte{regN=X2, regM=X1, regT=X0, option=ExtUXTX NoScale}, cvec); - (* Don't put the unit result in; it probably isn't needed, *) - decsp(); decsp(); decsp() - ) - - | BICStoreOperation { kind=LoadStoreC8, address, value} => - ( - genCAddress(address, 1); - gencde (value, ToX0, NotEnd, loopAddr); (* Value to store *) - gen(logicalShiftRight32{regN=X0, regD=X0, shift=0w1}, cvec); (* Untag it *) - genPopReg(X1, cvec); (* Index: a tagged value. *) - (* Untag. C indexes are signed. *) - gen((if is32in64 then arithmeticShiftRight32 else arithmeticShiftRight) - {regN=X1, regD=X1, shift=0w1}, cvec); - genPopReg(X2, cvec); (* Base address as a SysWord.word value. *) - indexToAbsoluteAddress(X2, X2, cvec); - gen(loadRegScaled{regT=X2, regN=X2, unitOffset=0}, cvec); (* Actual address *) - gen(storeRegIndexedByte{regN=X2, regM=X1, regT=X0, option=cIndexFactor NoScale}, cvec); - topInX0 := false; - decsp(); decsp() - ) - - | BICStoreOperation { kind=LoadStoreC16, address, value} => - ( - genCAddress(address, 2); - gencde (value, ToX0, NotEnd, loopAddr); (* Value to store *) - gen(logicalShiftRight32{regN=X0, regD=X0, shift=0w1}, cvec); (* Untag it *) - genPopReg(X1, cvec); (* Index: a tagged value. *) - (* Untag. C indexes are signed. *) - gen((if is32in64 then arithmeticShiftRight32 else arithmeticShiftRight) - {regN=X1, regD=X1, shift=0w1}, cvec); - genPopReg(X2, cvec); (* Base address as a SysWord.word value. 
*) - indexToAbsoluteAddress(X2, X2, cvec); - gen(loadRegScaled{regT=X2, regN=X2, unitOffset=0}, cvec); (* Actual address *) - gen(storeRegIndexed16{regN=X2, regM=X1, regT=X0, option=cIndexFactor ScaleOrShift}, cvec); - topInX0 := false; - decsp(); decsp() - ) - - | BICStoreOperation { kind=LoadStoreC32, address, value} => - ( - genCAddress(address, 4); - gencde (value, ToX0, NotEnd, loopAddr); (* Value to store *) - (* This is tagged in 64-bit but boxed in 32-bit *) - if is32in64 - then - ( - indexToAbsoluteAddress(X0, X0, cvec); - gen(loadRegScaled32{regT=X0, regN=X0, unitOffset=0}, cvec) - ) - else gen(logicalShiftRight{regN=X0, regD=X0, shift=0w1}, cvec); (* Untag it *) - genPopReg(X1, cvec); (* Index: a tagged value. *) - (* Untag. C indexes are signed. *) - gen((if is32in64 then arithmeticShiftRight32 else arithmeticShiftRight) - {regN=X1, regD=X1, shift=0w1}, cvec); - genPopReg(X2, cvec); (* Base address as a SysWord.word value. *) - indexToAbsoluteAddress(X2, X2, cvec); - gen(loadRegScaled{regT=X2, regN=X2, unitOffset=0}, cvec); (* Actual address *) - gen(storeRegIndexed32{regN=X2, regM=X1, regT=X0, option=cIndexFactor ScaleOrShift}, cvec); - topInX0 := false; - decsp(); decsp() - ) - - | BICStoreOperation { kind=LoadStoreC64, address, value} => - ( - genCAddress(address, 8); - gencde (value, ToX0, NotEnd, loopAddr); (* Value to store. This is boxed. *) - indexToAbsoluteAddress(X0, X0, cvec); - gen(loadRegScaled{regT=X0, regN=X0, unitOffset=0}, cvec); - genPopReg(X1, cvec); (* Index: a tagged value. *) - (* Untag. C indexes are signed. *) - gen((if is32in64 then arithmeticShiftRight32 else arithmeticShiftRight) - {regN=X1, regD=X1, shift=0w1}, cvec); - genPopReg(X2, cvec); (* Base address as a SysWord.word value. *) - indexToAbsoluteAddress(X2, X2, cvec); - gen(loadRegScaled{regT=X2, regN=X2, unitOffset=0}, cvec); (* Actual address *) - gen(storeRegIndexed{regN=X2, regM=X1, regT=X0, option=cIndexFactor ScaleOrShift}, cvec); - topInX0 := false; - decsp(); decsp() - ) - - | BICStoreOperation { kind=LoadStoreCFloat, address, value} => - ( - genCAddress(address, 4); - gencde (value, ToX0, NotEnd, loopAddr); (* Value to store *) - (* This is a boxed double. It needs to be converted to a float. *) - indexToAbsoluteAddress(X0, X0, cvec); - gen(loadRegScaledDouble{regT=V0, regN=X0, unitOffset=0}, cvec); - gen(convertDoubleToFloat{regN=V0, regD=V0}, cvec); - genPopReg(X1, cvec); (* Index: a tagged value. *) - (* Untag. C indexes are signed. *) - gen((if is32in64 then arithmeticShiftRight32 else arithmeticShiftRight) - {regN=X1, regD=X1, shift=0w1}, cvec); - genPopReg(X2, cvec); (* Base address as a SysWord.word value. *) - indexToAbsoluteAddress(X2, X2, cvec); - gen(loadRegScaled{regT=X2, regN=X2, unitOffset=0}, cvec); (* Actual address *) - gen(storeRegIndexedFloat{regN=X2, regM=X1, regT=V0, option=cIndexFactor ScaleOrShift}, cvec); - topInX0 := false; - decsp(); decsp() - ) - - | BICStoreOperation { kind=LoadStoreCDouble, address, value} => - ( - genCAddress(address, 8); - gencde (value, ToX0, NotEnd, loopAddr); (* Value to store *) - (* This is a boxed double. *) - indexToAbsoluteAddress(X0, X0, cvec); - gen(loadRegScaledDouble{regT=V0, regN=X0, unitOffset=0}, cvec); - genPopReg(X1, cvec); (* Index: a tagged value. *) - (* Untag. C indexes are signed. *) - gen((if is32in64 then arithmeticShiftRight32 else arithmeticShiftRight) - {regN=X1, regD=X1, shift=0w1}, cvec); - genPopReg(X2, cvec); (* Base address as a SysWord.word value. 
*) - indexToAbsoluteAddress(X2, X2, cvec); - gen(loadRegScaled{regT=X2, regN=X2, unitOffset=0}, cvec); (* Actual address *) - gen(storeRegIndexedDouble{regN=X2, regM=X1, regT=V0, option=cIndexFactor ScaleOrShift}, cvec); - topInX0 := false; - decsp(); decsp() - ) - - | BICStoreOperation { kind=LoadStoreUntaggedUnsigned, address, value} => - ( - (* Almost the same as LoadStoreMLWord except that the value to be stored - must be untagged before it is stored. This is used primarily to set - the length word on a string. *) - genMLAddress(address, Word.toInt wordSize); - gencde (value, ToStack, NotEnd, loopAddr); - genPopReg(X0, cvec); (* Value to store *) - gen(logicalShiftRight{regN=X0, regD=X0, shift=0w1}, cvec); - genPopReg(X1, cvec); (* Index: a tagged value. *) - (* Shift right to remove the tag. N.B. Indexes into ML memory are - unsigned. Unlike on the X86 we can't remove the tag by providing - a displacement and the only options are to scale by either 1 or 8. *) - gen(logicalShiftRight{regN=X1, regD=X1, shift=0w1}, cvec); - genPopReg(X2, cvec); (* Base address. *) - indexToAbsoluteAddress(X2, X2, cvec); - gen((if is32in64 then storeRegIndexed32 else storeRegIndexed) - {regN=X2, regM=X1, regT=X0, option=ExtUXTX ScaleOrShift}, cvec); - (* Don't put the unit result in; it probably isn't needed, *) - decsp(); decsp(); decsp() - ) - - | BICBlockOperation { kind=BlockOpMove{isByteMove}, sourceLeft, destRight, length } => - let - val exitLabel = createLabel() and loopLabel = createLabel() - (* Add the index, an unsigned, tagged value, to the base to create the - effective address. *) - fun addIndex(baseReg, indexReg) = - if isByteMove - then gen(addShiftedReg{regM=indexReg, regN=baseReg, regD=baseReg, shift=ShiftLSR 0w1}, cvec) - else - ( - gen(logicalShiftRight{regN=indexReg, regD=indexReg, shift=0w1}, cvec); (* Untag *) - gen(addShiftedReg{regM=indexReg, regN=baseReg, regD=baseReg, - shift=ShiftLSL (if is32in64 then 0w2 else 0w3)}, cvec) - ) - val scale = if isByteMove then 1 else if is32in64 then 4 else 8 - in - genMLAddress(sourceLeft, scale); - genMLAddress(destRight, scale); - gencde (length, ToX0, NotEnd, loopAddr); (* Length *) - genPopReg(X2, cvec); (* Dest index - tagged value. *) - genPopReg(X1, cvec); (* Dest base address. *) - indexToAbsoluteAddress(X1, X1, cvec); - (* Add in the index N.B. ML index values are unsigned. *) - addIndex(X1, X2); - (*gen(addShiftedReg{regM=X2, regN=X1, regD=X1, shift=ShiftLSR 0w1}, cvec);*) - genPopReg(X3, cvec); (* Source index *) - genPopReg(X2, cvec); - indexToAbsoluteAddress(X2, X2, cvec); - addIndex(X2, X3); - (*gen(addShiftedReg{regM=X3, regN=X2, regD=X2, shift=ShiftLSR 0w1}, cvec);*) - (* Untag the length *) - gen(logicalShiftRight{regN=X0, regD=X0, shift=0w1}, cvec); - (* Test the loop value at the top in case it's already zero. *) - compareRegs(X0, X0, cvec); - gen(setLabel loopLabel, cvec); - gen(compareBranchZero(X0, exitLabel), cvec); - if isByteMove - then - ( - gen(loadRegPostIndexByte{regT=X3, regN=X2, byteOffset=1}, cvec); - gen(storeRegPostIndexByte{regT=X3, regN=X1, byteOffset=1}, cvec) - ) - else if is32in64 - then - ( - gen(loadRegPostIndex32{regT=X3, regN=X2, byteOffset=4}, cvec); - gen(storeRegPostIndex32{regT=X3, regN=X1, byteOffset=4}, cvec) - ) - else - ( - gen(loadRegPostIndex{regT=X3, regN=X2, byteOffset=8}, cvec); - gen(storeRegPostIndex{regT=X3, regN=X1, byteOffset=8}, cvec) - ); - gen(subImmediate{regN=X0, regD=X0, immed=0w1, shifted=false}, cvec); - (* Back to the start. 
*) - gen(unconditionalBranch loopLabel, cvec); - gen(setLabel exitLabel, cvec); - topInX0 := false; (* X0 does not contain "unit" *) - decsp(); decsp(); decsp(); decsp() - end - - | BICBlockOperation { kind=BlockOpEqualByte, sourceLeft, destRight, length } => - (* Compare byte vectors for equality - returns a boolean result. *) - let - val equalLabel = createLabel() - in - blockCompareBytes(sourceLeft, destRight, length, equalLabel, true); - gen(setLabel equalLabel, cvec); - (* Set the result condition. *) - setBooleanCondition(X0, CondEqual, cvec) - end - - | BICBlockOperation { kind=BlockOpCompareByte, sourceLeft, destRight, length } => - (* Compare byte vectors for ordering - return tagged -1, 0, +1. *) - let - val equalLabel = createLabel() and resultLabel = createLabel() - in - blockCompareBytes(sourceLeft, destRight, length, equalLabel, false); - (* We drop through if we have found unequal bytes. *) - genList(loadNonAddress(X0, Word64.fromInt(tag 1)), cvec); - (* Set X0 to either 1 or -1 depending on whether it's greater or less. *) - gen(conditionalSetInverted{regD=X0, regTrue=X0, regFalse=XZero, cond=CondUnsignedHigher}, cvec); - gen(unconditionalBranch resultLabel, cvec); - gen(setLabel equalLabel, cvec); - (* Equal case - set it to zero. *) - genList(loadNonAddress(X0, Word64.fromInt(tag 0)), cvec); - gen(setLabel resultLabel, cvec) - end - - | BICArbitrary { oper=ArithAdd, shortCond, arg1, arg2, longCall } => - let - val startLong = createLabel() and resultLabel = createLabel() - in - (* Check tag bits *) - gencde (shortCond, ToX0, NotEnd, loopAddr); - gen(subSImmediate{regN=X0, regD=XZero, immed=taggedWord 0w1, shifted=false}, cvec); - (* Go to the long case if it's not short. *) - gen(conditionalBranch(CondNotEqual, startLong), cvec); - topInX0 := false; - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - gen(subImmediate{regN=X0, regD=X0, immed=0w1, shifted=false}, cvec); - genPopReg(X1, cvec); - decsp(); - (* Add and set the flag bits *) - gen((if is32in64 then addSShiftedReg32 else addSShiftedReg) - {regN=X1, regM=X0, regD=X0, shift=ShiftNone}, cvec); - (* If there's no overflow skip to the end otherwise drop into - the call to the RTS. *) - gen(conditionalBranch(CondNoOverflow, resultLabel), cvec); - gen(setLabel startLong, cvec); - topInX0 := false; - gencde (longCall, ToX0, tailKind, loopAddr); - gen(setLabel resultLabel, cvec) - end - - | BICArbitrary { oper=ArithSub, shortCond, arg1, arg2, longCall } => - let - val startLong = createLabel() and resultLabel = createLabel() - in - (* Check tag bits *) - gencde (shortCond, ToX0, NotEnd, loopAddr); - gen(subSImmediate{regN=X0, regD=XZero, immed=taggedWord 0w1, shifted=false}, cvec); - (* Go to the long case if it's not short. *) - gen(conditionalBranch(CondNotEqual, startLong), cvec); - topInX0 := false; - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - gen(subImmediate{regN=X0, regD=X0, immed=0w1, shifted=false}, cvec); - genPopReg(X1, cvec); - decsp(); - (* Subtract and set the flag bits *) - gen((if is32in64 then subSShiftedReg32 else subSShiftedReg) - {regN=X1, regM=X0, regD=X0, shift=ShiftNone}, cvec); - gen(conditionalBranch(CondNoOverflow, resultLabel), cvec); - gen(setLabel startLong, cvec); - topInX0 := false; - gencde (longCall, ToX0, tailKind, loopAddr); - gen(setLabel resultLabel, cvec) - end - - | BICArbitrary { oper=ArithMult, longCall, ... } => - (* Multiply - Just implement as a call to the long-precision case. 
*) - ( - gencde (longCall, whereto, tailKind, loopAddr) - ) - - | BICArbitrary _ => - raise InternalError "BICArbitrary: unimplemented operation" - - in (* body of gencde *) - - (* This ensures that there is precisely one item on the stack if - whereto = ToStack and no items if whereto = NoResult. *) - case whereto of - ToStack => - let - val () = ensureX0() - val newsp = oldsp + 1; - val adjustment = !realstackptr - newsp - - val () = - if adjustment = 0 - then () - else if adjustment < ~1 - then raise InternalError ("gencde: bad adjustment " ^ Int.toString adjustment) - (* Hack for declarations that should push values, but don't *) - else if adjustment = ~1 - then - ( - genList(loadNonAddress(X0, Word64.fromInt(tag 0)), cvec); - genPushReg(X0, cvec) - ) - else resetStack (adjustment, true, cvec) - in - realstackptr := newsp - end - - | NoResult => - let - val () = topInX0 := false - val adjustment = !realstackptr - oldsp - - val () = - if adjustment = 0 - then () - else if adjustment < 0 - then raise InternalError ("gencde: bad adjustment " ^ Int.toString adjustment) - else resetStack (adjustment, false, cvec) - in - realstackptr := oldsp - end - - | ToX0 => - let - (* If we have not pushed anything we have to push a unit result. *) - val () = - if !topInX0 then () - else if !realstackptr = oldsp - then genList(loadNonAddress(X0, Word64.fromInt(tag 0)), cvec) - else - ( - genPopReg(X0, cvec); - decsp() - ) - val () = topInX0 := true - - val adjustment = !realstackptr - oldsp - - val () = - if adjustment = 0 - then () - else if adjustment < 0 - then raise InternalError ("gencde: bad adjustment " ^ Int.toString adjustment) - else resetStack (adjustment, false, cvec) - in - realstackptr := oldsp - end - end (* gencde *) - - - and genUnary(NotBoolean, arg1, loopAddr) = - ( - gencde (arg1, ToX0, NotEnd, loopAddr); - (* Flip true to false and the reverse. *) - gen(bitwiseXorImmediate32{bits=0w2, regN=X0, regD=X0}, cvec) - ) - - | genUnary(IsTaggedValue, arg1, loopAddr) = - ( - gencde (arg1, ToX0, NotEnd, loopAddr); - gen(bitwiseAndSImmediate32{bits=0w1, regN=X0, regD=XZero}, cvec); - setBooleanCondition(X0, CondNotEqual (*Non-zero*), cvec) - ) - - | genUnary(MemoryCellLength, arg1, loopAddr) = - ( - gencde (arg1, ToX0, NotEnd, loopAddr); - indexToAbsoluteAddress(X0, X0, cvec); - (* Load the length word. *) - if is32in64 - then - ( - gen(loadRegUnscaled32{regT=X0, regN=X0, byteOffset= ~4}, cvec); - (* Extract the length, excluding the flag bytes and shift by one bit. *) - gen(unsignedBitfieldInsertinZeros32{lsb=0w1, width=0w24, regN=X0, regD=X0}, cvec) - ) - else - ( - gen(loadRegUnscaled{regT=X0, regN=X0, byteOffset= ~8}, cvec); - (* Extract the length, excluding the flag bytes and shift by one bit. *) - gen(unsignedBitfieldInsertinZeros{lsb=0w1, width=0w56, regN=X0, regD=X0}, cvec) - ); - (* Set the tag bit. *) - gen(bitwiseOrImmediate{bits=0w1, regN=X0, regD=X0}, cvec) - ) - - | genUnary(MemoryCellFlags, arg1, loopAddr) = - ( - gencde (arg1, ToX0, NotEnd, loopAddr); - indexToAbsoluteAddress(X0, X0, cvec); - (* Load the flags byte. *) - gen(loadRegUnscaledByte{regT=X0, regN=X0, byteOffset=flagsByteOffset}, cvec); - (* Tag the result. 
*) - gen(logicalShiftLeft{shift=0w1, regN=X0, regD=X0}, cvec); - gen(bitwiseOrImmediate{bits=0w1, regN=X0, regD=X0}, cvec) - ) - - | genUnary(ClearMutableFlag, arg1, loopAddr) = - ( - gencde (arg1, ToX0, NotEnd, loopAddr); - indexToAbsoluteAddress(X0, X0, cvec); - gen(loadRegUnscaledByte{regT=X1, regN=X0, byteOffset=flagsByteOffset}, cvec); - gen(bitwiseAndImmediate32{bits=Word64.xorb(0wxffffffff, 0wx40), regN=X1, regD=X1}, cvec); - gen(storeRegUnscaledByte{regT=X1, regN=X0, byteOffset=flagsByteOffset}, cvec) - ) - - | genUnary(LongWordToTagged, arg1, loopAddr) = - ( - gencde (arg1, ToX0, NotEnd, loopAddr); - indexToAbsoluteAddress(X0, X0, cvec); - (* Load the value and tag it. *) - gen(loadRegScaled{regT=X0, regN=X0, unitOffset=0}, cvec); - (* Tag the result. *) - gen(logicalShiftLeft{shift=0w1, regN=X0, regD=X0}, cvec); - gen((if is32in64 then bitwiseOrImmediate32 else bitwiseOrImmediate) - {bits=0w1, regN=X0, regD=X0}, cvec) - ) - - | genUnary(SignedToLongWord, arg1, loopAddr) = - ( - gencde (arg1, ToX0, NotEnd, loopAddr); - (* We want to generate a 64-bit signed value. In 32-in-64 we use a - 64-bit SBFX to do the right shift and set the sign bits. *) - if is32in64 - then gen(signedBitfieldExtract{lsb=0w1, width=0w31, regN=X0, regD=X1}, cvec) - else gen(arithmeticShiftRight{shift=0w1, regN=X0, regD=X1}, cvec); - boxLargeWord(X1, cvec) - ) - - | genUnary(UnsignedToLongWord, arg1, loopAddr) = - ( - gencde (arg1, ToX0, NotEnd, loopAddr); - gen(logicalShiftRight{shift=0w1, regN=X0, regD=X1}, cvec); - boxLargeWord(X1, cvec) - ) - - | genUnary(RealAbs PrecDouble, arg1, loopAddr) = - ( - gencde (arg1, ToX0, NotEnd, loopAddr); - unboxDouble(X0, X0, V0, cvec); - gen(loadRegScaledDouble{regT=V0, regN=X0, unitOffset=0}, cvec); - gen(absDouble{regN=V0, regD=V0}, cvec); - boxDouble(V0, cvec) - ) - - | genUnary(RealNeg PrecDouble, arg1, loopAddr) = - ( - gencde (arg1, ToX0, NotEnd, loopAddr); - unboxDouble(X0, X0, V0, cvec); - gen(loadRegScaledDouble{regT=V0, regN=X0, unitOffset=0}, cvec); - gen(negDouble{regN=V0, regD=V0}, cvec); - boxDouble(V0, cvec) - ) - - | genUnary(RealFixedInt PrecDouble, arg1, loopAddr) = - ( - gencde (arg1, ToX0, NotEnd, loopAddr); - (* Shift to remove the tag. *) - gen(arithmeticShiftRight{shift=0w1, regN=X0, regD=X0}, cvec); - gen(convertIntToDouble{regN=X0, regD=V0}, cvec); - boxDouble(V0, cvec) - ) - - | genUnary(RealAbs PrecSingle, arg1, loopAddr) = - ( - gencde (arg1, ToX0, NotEnd, loopAddr); - unboxOrUntagSingle(X0, X0, V0, cvec); - gen(absFloat{regN=V0, regD=V0}, cvec); - boxOrTagFloat(V0, cvec) - ) - - | genUnary(RealNeg PrecSingle, arg1, loopAddr) = - ( - gencde (arg1, ToX0, NotEnd, loopAddr); - unboxOrUntagSingle(X0, X0, V0, cvec); - gen(negFloat{regN=V0, regD=V0}, cvec); - boxOrTagFloat(V0, cvec) - ) - - | genUnary(RealFixedInt PrecSingle, arg1, loopAddr) = - ( - gencde (arg1, ToX0, NotEnd, loopAddr); - (* Shift to remove the tag. *) - gen(arithmeticShiftRight{shift=0w1, regN=X0, regD=X0}, cvec); - gen((if is32in64 then convertInt32ToFloat else convertIntToFloat){regN=X0, regD=V0}, cvec); - boxOrTagFloat(V0, cvec) - ) - - | genUnary(FloatToDouble, arg1, loopAddr) = - ( - gencde (arg1, ToX0, NotEnd, loopAddr); - unboxOrUntagSingle(X0, X0, V0, cvec); - gen(convertFloatToDouble{regN=V0, regD=V0}, cvec); - boxDouble(V0, cvec) - ) - - | genUnary(DoubleToFloat, arg1, loopAddr) = - ( - gencde (arg1, ToX0, NotEnd, loopAddr); - (* Convert double to float using current rounding mode. 
*) - unboxDouble(X0, X0, V0, cvec); - gen(loadRegScaledDouble{regT=V0, regN=X0, unitOffset=0}, cvec); - gen(convertDoubleToFloat{regN=V0, regD=V0}, cvec); - boxOrTagFloat(V0, cvec) - ) - - | genUnary(RealToInt (PrecDouble, rnding), arg1, loopAddr) = - ( - gencde (arg1, ToX0, NotEnd, loopAddr); - (* We could get an overflow in either the conversion to integer - or in the conversion to a tagged value. Fortunately if the - conversion detects an overflow it sets the result to a - value that will cause an overflow in the addition. *) - unboxDouble(X0, X0, V0, cvec); - gen(loadRegScaledDouble{regT=V0, regN=X0, unitOffset=0}, cvec); - if is32in64 - then - ( - gen(convertDoubleToInt32 rnding {regN=V0, regD=X0}, cvec); - gen(addSShiftedReg32{regM=X0, regN=X0, regD=X0, shift=ShiftNone}, cvec) - ) - else - ( - gen(convertDoubleToInt rnding {regN=V0, regD=X0}, cvec); - gen(addSShiftedReg{regM=X0, regN=X0, regD=X0, shift=ShiftNone}, cvec) - ); - gen(bitwiseOrImmediate{regN=X0, regD=X0, bits=0w1}, cvec); - checkOverflow cvec - ) - - | genUnary(RealToInt (PrecSingle, rnding), arg1, loopAddr) = - ( - gencde (arg1, ToX0, NotEnd, loopAddr); - unboxOrUntagSingle(X0, X0, V0, cvec); - if is32in64 - then - ( - gen(convertFloatToInt32 rnding {regN=V0, regD=X0}, cvec); - gen(addSShiftedReg32{regM=X0, regN=X0, regD=X0, shift=ShiftNone}, cvec) - ) - else - ( - gen(convertFloatToInt rnding {regN=V0, regD=X0}, cvec); - gen(addSShiftedReg{regM=X0, regN=X0, regD=X0, shift=ShiftNone}, cvec) - ); - gen(addSShiftedReg{regM=X0, regN=X0, regD=X0, shift=ShiftNone}, cvec); - gen(bitwiseOrImmediate{regN=X0, regD=X0, bits=0w1}, cvec); - checkOverflow cvec - ) - - | genUnary(TouchAddress, arg1, loopAddr) = - ( - gencde (arg1, ToX0, NotEnd, loopAddr); - topInX0 := false (* Discard this *) - ) - - | genUnary(AllocCStack, arg1, loopAddr) = - ( - gencde (arg1, ToX0, NotEnd, loopAddr); - (* Allocate space on the stack. The higher levels have already aligned - the size to a multiple of 16. *) - (* Remove the tag and then use add-extended. This can use SP unlike - the shifted case. *) - gen(logicalShiftRight{shift=0w1, regN=X0, regD=X0}, cvec); - gen(subExtendedReg{regM=X0, regN=XSP, regD=XSP, extend=ExtUXTX 0w0}, cvec); - (* The result is a large-word. We can't box SP directly. - We have to use add here to get the SP into X1 instead of the usual move. *) - gen(addImmediate{regN=XSP, regD=X1, immed=0w0, shifted=false}, cvec); - boxLargeWord(X1, cvec) - ) - - | genUnary(LockMutex, arg1, loopAddr) = - (* The earliest versions of the Arm8 do not have the LDADD instruction which - will do this directly. To preserve compatibility we use LDAXR/STLXR - which require a loop. *) - let - val loopLabel = createLabel() - (* For the moment don't try to use a spin lock. *) - in - gencde (arg1, ToX0, NotEnd, loopAddr); - indexToAbsoluteAddress(X0, X0, cvec); - gen(setLabel loopLabel, cvec); - (* Get the original value into X1. *) - gen(loadAcquireExclusiveRegister{regN=X0, regT=X1}, cvec); - (* Add and put the result into X2 *) - gen(addImmediate{regN=X1, regD=X2, immed=0w1, shifted=false}, cvec); - (* Store the result of the addition. W4 will be zero if this succeeded. *) - gen(storeReleaseExclusiveRegister{regS=X4, regT=X2, regN=X0}, cvec); - gen(compareBranchNonZero32(X4, loopLabel), cvec); - (* Put in the memory barrier. *) - gen(dmbIsh, cvec); - (* The result is true if the old value was zero. 
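Ignoring atomicity, the operation implemented by the load-exclusive/store-exclusive loop is roughly
       fun lockMutex m = let val old = !m in m := old + 1; old = 0 end
with m an int ref: the lock is acquired only when the previous contents were zero, and the comparison that follows simply tests the value that the exclusive load put in X1.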
*) - gen(subSImmediate{regN=X1, regD=XZero, immed=0w0, shifted=false}, cvec); - setBooleanCondition(X0, CondEqual, cvec) - end - - | genUnary(TryLockMutex, arg1, loopAddr) = - (* Could use LDUMAXAL to set it the greater of the current value or 1. *) - let - val loopLabel = createLabel() and exitLabel = createLabel() - in - gencde (arg1, ToX0, NotEnd, loopAddr); - indexToAbsoluteAddress(X0, X0, cvec); - gen(setLabel loopLabel, cvec); - (* Get the original value into X1. *) - gen(loadAcquireExclusiveRegister{regN=X0, regT=X1}, cvec); - (* If it is non-zero the lock is already taken. *) - gen(compareBranchNonZero(X1, exitLabel), cvec); - genList(loadNonAddress(X2, 0w1), cvec); - (* Store zero into the memory. W4 will be zero if this succeeded. *) - gen(storeReleaseExclusiveRegister{regS=X4, regT=X2, regN=X0}, cvec); - gen(compareBranchNonZero32(X4, loopLabel), cvec); - gen(setLabel exitLabel, cvec); - gen(dmbIsh, cvec); - (* The result is true if the old value was zero. *) - gen(subSImmediate{regN=X1, regD=XZero, immed=0w0, shifted=false}, cvec); - setBooleanCondition(X0, CondEqual, cvec) - end - - | genUnary(UnlockMutex, arg1, loopAddr) = - (* Could use SWAPAL *) - let - val loopLabel = createLabel() - in - gencde (arg1, ToX0, NotEnd, loopAddr); - indexToAbsoluteAddress(X0, X0, cvec); - gen(setLabel loopLabel, cvec); - (* Get the original value into X1. *) - gen(loadAcquireExclusiveRegister{regN=X0, regT=X1}, cvec); - (* Store zero into the memory. W4 will be zero if this succeeded. *) - gen(storeReleaseExclusiveRegister{regS=X4, regT=XZero, regN=X0}, cvec); - gen(compareBranchNonZero32(X4, loopLabel), cvec); - (* Put in the memory barrier. *) - gen(dmbIsh, cvec); - (* The result is true if the old value was 1. *) - gen(subSImmediate{regN=X1, regD=XZero, immed=0w1, shifted=false}, cvec); - setBooleanCondition(X0, CondEqual, cvec) - end - - and genBinary(WordComparison{test, isSigned}, arg1, arg2, loopAddr) = - let - val cond = - case (test, isSigned) of - (TestEqual, _) => CondEqual - | (TestLess, true) => CondSignedLess - | (TestLessEqual, true) => CondSignedLessEq - | (TestGreater, true) => CondSignedGreater - | (TestGreaterEqual, true) => CondSignedGreaterEq - | (TestLess, false) => CondCarryClear - | (TestLessEqual, false) => CondUnsignedLowOrEq - | (TestGreater, false) => CondUnsignedHigher - | (TestGreaterEqual, false) => CondCarrySet - | (TestUnordered, _) => raise InternalError "WordComparison: TestUnordered" - in - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - genPopReg(X1, cvec); (* First argument. *) - comparePolyRegs(X1, X0, cvec); - setBooleanCondition(X0, cond, cvec) - end - - | genBinary(PointerEq, arg1, arg2, loopAddr) = - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - genPopReg(X1, cvec); (* First argument. *) - comparePolyRegs(X1, X0, cvec); - setBooleanCondition(X0, CondEqual, cvec) - ) - - | genBinary(FixedPrecisionArith ArithAdd, arg1, arg2, loopAddr) = - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - (* Subtract the tag bit. 
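With the usual representation a short value n is held as 2 * n + 1, so adding two tagged values directly would give 2 * (a + b) + 2; removing one tag first keeps the invariant, since ta + (tb - 1) = (2a + 1) + 2b = 2 * (a + b) + 1. A minimal sketch of that identity, assuming this tagging:
       fun tag n = 2 * n + 1
       fun taggedAdd (ta, tb) = ta + (tb - 1)
The addS form that follows also sets the condition flags so that checkOverflow can detect any signed overflow of the untagged result.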
*) - gen(subImmediate{regN=X0, regD=X0, immed=0w1, shifted=false}, cvec); - genPopReg(X1, cvec); - (* Add and set the flag bits *) - gen((if is32in64 then addSShiftedReg32 else addSShiftedReg) - {regN=X1, regM=X0, regD=X0, shift=ShiftNone}, cvec); - checkOverflow cvec - ) - - | genBinary(FixedPrecisionArith ArithSub, arg1, arg2, loopAddr) = - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - (* Subtract the tag bit. *) - gen(subImmediate{regN=X0, regD=X0, immed=0w1, shifted=false}, cvec); - genPopReg(X1, cvec); - (* Subtract and set the flag bits *) - gen((if is32in64 then subSShiftedReg32 else subSShiftedReg) - {regN=X1, regM=X0, regD=X0, shift=ShiftNone}, cvec); - checkOverflow cvec - ) - - | genBinary(FixedPrecisionArith ArithMult, arg1, arg2, loopAddr) = - let - (* There's no simple way of detecting overflow. We have to compute the - high-order word and then check that it is either all zeros with - the sign bit zero or all ones with the sign bit one. *) - val noOverflow = createLabel() - in - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - (* Compute the result in the same way as for Word.* apart from the - arithmetic shift. *) - genPopReg(X1, cvec); - - if is32in64 - then - ( - (* Shift to remove the tags on one argument. *) - gen(arithmeticShiftRight32{regN=X0, regD=X2, shift=0w1}, cvec); - (* Remove the tag on the other. *) - gen(bitwiseAndImmediate32{regN=X1, regD=X1, bits=tagBitMask32}, cvec); - (* Multiply two 32-bit quantities. *) - gen(signedMultiplyAndAddLong{regM=X1, regN=X2, regA=XZero, regD=X0}, cvec); - (* Get the top word which should be either all ones or all zeros. *) - gen(arithmeticShiftRight{shift=0w32, regN=X0, regD=X2}, cvec); - (* The result is in X0. Use a 32-bit operation to clear the top word. *) - gen(bitwiseOrImmediate32{regN=X0, regD=X0, bits=0w1}, cvec); - (* Compare with the sign bit of the result. *) - gen(subSShiftedReg32{regD=XZero, regN=X2, regM=X0, shift=ShiftASR 0w31}, cvec) - ) - else - ( - (* Shift to remove the tags on one argument. *) - gen(arithmeticShiftRight{regN=X0, regD=X2, shift=0w1}, cvec); - (* Remove the tag on the other. *) - gen(bitwiseAndImmediate{regN=X1, regD=X1, bits=tagBitMask}, cvec); - gen(multiplyAndAdd{regM=X1, regN=X2, regA=XZero, regD=X0}, cvec); - (* Put back the tag. *) - gen(bitwiseOrImmediate{regN=X0, regD=X0, bits=0w1}, cvec); - (* Compute the high order part into X2 *) - gen(signedMultiplyHigh{regM=X1, regN=X2, regD=X2}, cvec); - (* Compare with the sign bit of the result. *) - gen(subSShiftedReg{regD=XZero, regN=X2, regM=X0, shift=ShiftASR 0w63}, cvec) - ); - gen(conditionalBranch(CondEqual, noOverflow), cvec); - gen(loadAddressConstant(X0, toMachineWord Overflow), cvec); - gen(loadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset}, cvec); - gen(loadRegScaled{regT=X1, regN=X_MLStackPtr, unitOffset=0}, cvec); - gen(branchRegister X1, cvec); - gen(setLabel noOverflow, cvec) - end - - | genBinary(FixedPrecisionArith ArithQuot, arg1, arg2, loopAddr) = - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - (* The word version avoids an extra shift. Don't do that here at least - for the moment. Division by zero and overflow are checked for at - the higher level. 
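Both operands are untagged with an arithmetic shift (shifting 2n + 1 right by one gives n, including for negative n), the division is done with a signed divide, and the quotient is retagged. A minimal SML sketch of the arithmetic, assuming the 2n + 1 tagging:
       fun untag t = t div 2
       fun tag n = 2 * n + 1
       fun taggedQuot (ta, tb) = tag (Int.quot (untag ta, untag tb))
Int.quot rather than div is the right model here because the hardware divide truncates towards zero.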
*) - genPopReg(X1, cvec); - if is32in64 - then - ( - (* Shift to remove the tags on the arguments *) - gen(arithmeticShiftRight32{regN=X0, regD=X0, shift=0w1}, cvec); - gen(arithmeticShiftRight32{regN=X1, regD=X1, shift=0w1}, cvec); - gen(signedDivide32{regM=X0, regN=X1, regD=X0}, cvec); - (* Restore the tag. *) - gen(logicalShiftLeft32{regN=X0, regD=X0, shift=0w1}, cvec); - gen(bitwiseOrImmediate32{regN=X0, regD=X0, bits=0w1}, cvec) - ) - else - ( - (* Shift to remove the tags on the arguments *) - gen(arithmeticShiftRight{regN=X0, regD=X0, shift=0w1}, cvec); - gen(arithmeticShiftRight{regN=X1, regD=X1, shift=0w1}, cvec); - gen(signedDivide{regM=X0, regN=X1, regD=X0}, cvec); - (* Restore the tag. *) - gen(logicalShiftLeft{regN=X0, regD=X0, shift=0w1}, cvec); - gen(bitwiseOrImmediate{regN=X0, regD=X0, bits=0w1}, cvec) - ) - ) - - | genBinary(FixedPrecisionArith ArithRem, arg1, arg2, loopAddr) = - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - (* For the moment we remove the tags and then retag afterwards. The word - version avoids this but at least for the moment we do it the longer way. *) - (* There's no direct way to get the remainder - have to use divide and multiply. *) - genPopReg(X1, cvec); - (* Shift to remove the tags on the arguments *) - if is32in64 - then - ( - gen(arithmeticShiftRight32{regN=X0, regD=X0, shift=0w1}, cvec); - gen(arithmeticShiftRight32{regN=X1, regD=X1, shift=0w1}, cvec); - gen(signedDivide32{regM=X0, regN=X1, regD=X2}, cvec); - (* X0 = X1 - (X2/X0)*X0 *) - gen(multiplyAndSub32{regM=X2, regN=X0, regA=X1, regD=X0}, cvec); - (* Restore the tag. *) - gen(logicalShiftLeft32{regN=X0, regD=X0, shift=0w1}, cvec); - gen(bitwiseOrImmediate32{regN=X0, regD=X0, bits=0w1}, cvec) - ) - else - ( - gen(arithmeticShiftRight{regN=X0, regD=X0, shift=0w1}, cvec); - gen(arithmeticShiftRight{regN=X1, regD=X1, shift=0w1}, cvec); - gen(signedDivide{regM=X0, regN=X1, regD=X2}, cvec); - (* X0 = X1 - (X2/X0)*X0 *) - gen(multiplyAndSub{regM=X2, regN=X0, regA=X1, regD=X0}, cvec); - (* Restore the tag. *) - gen(logicalShiftLeft{regN=X0, regD=X0, shift=0w1}, cvec); - gen(bitwiseOrImmediate{regN=X0, regD=X0, bits=0w1}, cvec) - ) - ) - - | genBinary(FixedPrecisionArith ArithDiv, _, _, _) = - raise InternalError "unimplemented operation: FixedPrecisionArith ArithDiv" - - | genBinary(FixedPrecisionArith ArithMod, _, _, _) = - raise InternalError "unimplemented operation: FixedPrecisionArith ArithMod" - - | genBinary(WordArith ArithAdd, arg1, arg2, loopAddr) = - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - (* Subtract the tag bit. *) - gen(subImmediate{regN=X0, regD=X0, immed=0w1, shifted=false}, cvec); - genPopReg(X1, cvec); - gen((if is32in64 then addShiftedReg32 else addShiftedReg){regN=X1, regM=X0, regD=X0, shift=ShiftNone}, cvec) - ) - - | genBinary(WordArith ArithSub, arg1, arg2, loopAddr) = - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - (* Subtract the tag bit. *) - gen(subImmediate{regN=X0, regD=X0, immed=0w1, shifted=false}, cvec); - genPopReg(X1, cvec); - gen((if is32in64 then subShiftedReg32 else subShiftedReg){regN=X1, regM=X0, regD=X0, shift=ShiftNone}, cvec) - ) - - | genBinary(WordArith ArithMult, arg1, arg2, loopAddr) = - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - genPopReg(X1, cvec); - (* Shift to remove the tags on one argument. 
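Only one operand needs a full untag: if ta = 2a + 1 and tb = 2b + 1 then shifting ta right gives a, clearing just the tag bit of tb gives 2b, and their product 2ab is even, so or-ing the tag back in produces the tagged result. As a small sketch under that representation, on the default word type:
       fun taggedMulW (ta, tb) = Word.orb (Word.>> (ta, 0w1) * Word.andb (tb, Word.notb 0w1), 0w1)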
*) - gen(logicalShiftRight{regN=X0, regD=X0, shift=0w1}, cvec); - (* Remove the tag on the other. *) - gen(bitwiseAndImmediate{regN=X1, regD=X1, bits=tagBitMask}, cvec); - gen((if is32in64 then multiplyAndAdd32 else multiplyAndAdd) - {regM=X1, regN=X0, regA=XZero, regD=X0}, cvec); - (* Put back the tag. *) - gen((if is32in64 then bitwiseOrImmediate32 else bitwiseOrImmediate) - {regN=X0, regD=X0, bits=0w1}, cvec) - ) - - | genBinary(WordArith ArithDiv, arg1, arg2, loopAddr) = - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - genPopReg(X1, cvec); - (* Shift to remove the tag on the divisor *) - gen(logicalShiftRight{regN=X0, regD=X0, shift=0w1}, cvec); - (* Untag but don't shift the dividend. *) - gen(bitwiseAndImmediate{regN=X1, regD=X1, bits=tagBitMask}, cvec); - gen((if is32in64 then unsignedDivide32 else unsignedDivide){regM=X0, regN=X1, regD=X0}, cvec); - (* Restore the tag: Note: it may already be set depending on the result of - the division. *) - gen((if is32in64 then bitwiseOrImmediate32 else bitwiseOrImmediate) - {regN=X0, regD=X0, bits=0w1}, cvec) - ) - - | genBinary(WordArith ArithMod, arg1, arg2, loopAddr) = - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - (* There's no direct way to get the remainder - have to use divide and multiply. *) - genPopReg(X1, cvec); - (* Shift to remove the tag on the divisor *) - gen(logicalShiftRight{regN=X0, regD=X0, shift=0w1}, cvec); - (* Untag but don't shift the dividend. *) - gen(bitwiseAndImmediate{regN=X1, regD=X2, bits=tagBitMask}, cvec); - gen((if is32in64 then unsignedDivide32 else unsignedDivide){regM=X0, regN=X2, regD=X2}, cvec); - (* Clear the bottom bit before the multiplication. *) - gen(bitwiseAndImmediate{regN=X2, regD=X2, bits=tagBitMask}, cvec); - (* X0 = X1 - (X2/X0)*X0 *) - gen((if is32in64 then multiplyAndSub32 else multiplyAndSub){regM=X2, regN=X0, regA=X1, regD=X0}, cvec) - (* Because we're subtracting from the original, tagged, dividend - the result is tagged. *) - ) - - | genBinary(WordArith _, _, _, _) = raise InternalError "WordArith - unimplemented instruction" - - | genBinary(WordLogical LogicalAnd, arg1, arg2, loopAddr) = - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - genPopReg(X1, cvec); - (* Since they're both tagged the tag bit is preserved. *) - gen(andShiftedReg{regN=X1, regM=X0, regD=X0, shift=ShiftNone}, cvec) - ) - - | genBinary(WordLogical LogicalOr, arg1, arg2, loopAddr) = - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - genPopReg(X1, cvec); - (* Since they're both tagged the tag bit is preserved. *) - gen(orrShiftedReg{regN=X1, regM=X0, regD=X0, shift=ShiftNone}, cvec) - ) - - | genBinary(WordLogical LogicalXor, arg1, arg2, loopAddr) = - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - genPopReg(X1, cvec); - (* Have to restore the tag bit because that will be cleared. *) - gen(eorShiftedReg{regN=X1, regM=X0, regD=X0, shift=ShiftNone}, cvec); - gen((if is32in64 then bitwiseOrImmediate32 else bitwiseOrImmediate) - {regN=X0, regD=X0, bits=0w1}, cvec) - ) - - (* Shifts: ARM64 shifts are taken modulo the word length but that's - dealt with at a higher level. 
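For a tagged shift left the tag bit is first cleared from the value, the shift amount is untagged, the variable shift is applied and the tag is put back; the zeros shifted in at the bottom make room for it. A small sketch on the default word type, assuming the 2n + 1 tagging:
       fun taggedLsl (tv, tshift) = Word.orb (Word.<< (Word.andb (tv, Word.notb 0w1), Word.>> (tshift, 0w1)), 0w1)
The logical and arithmetic right shifts that follow use the same pattern except that the tag bit can be left in place on the value before shifting right.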
*) - | genBinary(WordShift ShiftLeft, arg1, arg2, loopAddr) = - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - genPopReg(X1, cvec); - (* Remove the tag from value we're shifting. *) - gen(bitwiseAndImmediate{regN=X1, regD=X1, bits=tagBitMask}, cvec); - (* Untag the shift amount. Can use 32-bit op here. *) - gen(logicalShiftRight32{regN=X0, regD=X0, shift=0w1}, cvec); - gen((if is32in64 then logicalShiftLeftVariable32 else logicalShiftLeftVariable) - {regM=X0, regN=X1, regD=X0}, cvec); - (* Put back the tag. *) - gen((if is32in64 then bitwiseOrImmediate32 else bitwiseOrImmediate) - {regN=X0, regD=X0, bits=0w1}, cvec) - ) - - | genBinary(WordShift ShiftRightLogical, arg1, arg2, loopAddr) = - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - genPopReg(X1, cvec); - (* Don't need to remove the tag. *) - (* Untag the shift amount. Can use 32-bit op here. *) - gen(logicalShiftRight32{regN=X0, regD=X0, shift=0w1}, cvec); - gen(logicalShiftRightVariable{regM=X0, regN=X1, regD=X0}, cvec); - (* Put back the tag. *) - gen((if is32in64 then bitwiseOrImmediate32 else bitwiseOrImmediate) - {regN=X0, regD=X0, bits=0w1}, cvec) - ) - - | genBinary(WordShift ShiftRightArithmetic, arg1, arg2, loopAddr) = - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - genPopReg(X1, cvec); - (* Don't need to remove the tag. *) - (* Untag the shift amount. Can use 32-bit op here. *) - gen(logicalShiftRight32{regN=X0, regD=X0, shift=0w1}, cvec); - if is32in64 - then - ( - (* Must use a 32 bit shift here. *) - gen(arithmeticShiftRightVariable32{regM=X0, regN=X1, regD=X0}, cvec); - (* Put back the tag. *) - gen(bitwiseOrImmediate32{regN=X0, regD=X0, bits=0w1}, cvec) - ) - else - ( - gen(arithmeticShiftRightVariable{regM=X0, regN=X1, regD=X0}, cvec); - (* Put back the tag. *) - gen(bitwiseOrImmediate{regN=X0, regD=X0, bits=0w1}, cvec) - ) - ) - - | genBinary(AllocateByteMemory, arg1, arg2, loopAddr) = - (* Allocate memory for byte data. Unlike for word data it is not necessary to - initialise it before any further allocation provided it has the mutable bit - set. *) - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - (* Load and untag the size and flags. The size is the number of words even - though this is byte data. *) - gen(logicalShiftRight32{regN=X0, regD=X0, shift=0w1}, cvec); - genPopReg(X1, cvec); - gen(logicalShiftRight{regN=X1, regD=X1, shift=0w1}, cvec); - allocateVariableSize({sizeReg=X1, flagsReg=X0, resultReg=X2}, cvec); - gen(moveRegToReg{sReg=X2, dReg=X0}, cvec); - absoluteAddressToIndex(X0, cvec) - ) - - | genBinary(LargeWordComparison test, arg1, arg2, loopAddr) = - let - val cond = - case test of - TestEqual => CondEqual - | TestLess => CondCarryClear - | TestLessEqual => CondUnsignedLowOrEq - | TestGreater => CondUnsignedHigher - | TestGreaterEqual => CondCarrySet - | TestUnordered => raise InternalError "LargeWordComparison: TestUnordered" - in - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - (* The values are boxed so have to be loaded first. 
*) - genPopReg(X1, cvec); - unboxLargeWord(X0, X0, cvec); - unboxLargeWord(X1, X1, cvec); - compareRegs(X1, X0, cvec); - setBooleanCondition(X0, cond, cvec) - end - - | genBinary(LargeWordArith ArithAdd, arg1, arg2, loopAddr) = - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - genPopReg(X1, cvec); - unboxLargeWord(X0, X0, cvec); - unboxLargeWord(X1, X1, cvec); - gen(addShiftedReg{regN=X1, regM=X0, regD=X1, shift=ShiftNone}, cvec); - boxLargeWord(X1, cvec) - ) - - | genBinary(LargeWordArith ArithSub, arg1, arg2, loopAddr) = - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - genPopReg(X1, cvec); - unboxLargeWord(X0, X0, cvec); - unboxLargeWord(X1, X1, cvec); - gen(subShiftedReg{regN=X1, regM=X0, regD=X1, shift=ShiftNone}, cvec); - boxLargeWord(X1, cvec) - ) - - | genBinary(LargeWordArith ArithMult, arg1, arg2, loopAddr) = - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - genPopReg(X1, cvec); - unboxLargeWord(X0, X0, cvec); - unboxLargeWord(X1, X1, cvec); - gen(multiplyAndAdd{regM=X1, regN=X0, regA=XZero, regD=X1}, cvec); - boxLargeWord(X1, cvec) - ) - - | genBinary(LargeWordArith ArithDiv, arg1, arg2, loopAddr) = - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - genPopReg(X1, cvec); - unboxLargeWord(X0, X0, cvec); - unboxLargeWord(X1, X1, cvec); - gen(unsignedDivide{regM=X0, regN=X1, regD=X1}, cvec); - boxLargeWord(X1, cvec) - ) - - | genBinary(LargeWordArith ArithMod, arg1, arg2, loopAddr) = - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - genPopReg(X1, cvec); - unboxLargeWord(X0, X0, cvec); - unboxLargeWord(X1, X1, cvec); - gen(unsignedDivide{regM=X0, regN=X1, regD=X2}, cvec); - gen(multiplyAndSub{regM=X2, regN=X0, regA=X1, regD=X1}, cvec); - boxLargeWord(X1, cvec) - ) - - | genBinary(LargeWordArith _, _, _, _) = - raise InternalError "LargeWordArith - unimplemented instruction" - - | genBinary(LargeWordLogical LogicalAnd, arg1, arg2, loopAddr) = - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - genPopReg(X1, cvec); - unboxLargeWord(X0, X0, cvec); - unboxLargeWord(X1, X1, cvec); - gen(andShiftedReg{regN=X1, regM=X0, regD=X1, shift=ShiftNone}, cvec); - boxLargeWord(X1, cvec) - ) - - | genBinary(LargeWordLogical LogicalOr, arg1, arg2, loopAddr) = - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - genPopReg(X1, cvec); - unboxLargeWord(X0, X0, cvec); - unboxLargeWord(X1, X1, cvec); - gen(orrShiftedReg{regN=X1, regM=X0, regD=X1, shift=ShiftNone}, cvec); - boxLargeWord(X1, cvec) - ) - - | genBinary(LargeWordLogical LogicalXor, arg1, arg2, loopAddr) = - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - genPopReg(X1, cvec); - unboxLargeWord(X0, X0, cvec); - unboxLargeWord(X1, X1, cvec); - gen(eorShiftedReg{regN=X1, regM=X0, regD=X1, shift=ShiftNone}, cvec); - boxLargeWord(X1, cvec) - ) - - (* The shift is always a Word.word value i.e. tagged. There is a check at the higher level - that the shift does not exceed 32/64 bits. *) - | genBinary(LargeWordShift ShiftLeft, arg1, arg2, loopAddr) = - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - genPopReg(X1, cvec); - unboxLargeWord(X1, X1, cvec); - (* Untag the shift amount. Can use 32-bit op here. 
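The boxed LargeWord value is already untagged inside its box, so only the shift amount, which arrives as a tagged Word.word, needs untagging; on the unboxed value the operation is simply LargeWord.<< (v, Word.>> (tshift, 0w1)) before the result is boxed again.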
*) - gen(logicalShiftRight32{regN=X0, regD=X0, shift=0w1}, cvec); - gen(logicalShiftLeftVariable{regM=X0, regN=X1, regD=X1}, cvec); - boxLargeWord(X1, cvec) - ) - - | genBinary(LargeWordShift ShiftRightLogical, arg1, arg2, loopAddr) = - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - genPopReg(X1, cvec); - unboxLargeWord(X1, X1, cvec); - (* Untag the shift amount. Can use 32-bit op here. *) - gen(logicalShiftRight32{regN=X0, regD=X0, shift=0w1}, cvec); - gen(logicalShiftRightVariable{regM=X0, regN=X1, regD=X1}, cvec); - boxLargeWord(X1, cvec) - ) - - | genBinary(LargeWordShift ShiftRightArithmetic, arg1, arg2, loopAddr) = - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - genPopReg(X1, cvec); - unboxLargeWord(X1, X1, cvec); - (* Untag the shift amount. Can use 32-bit op here. *) - gen(logicalShiftRight32{regN=X0, regD=X0, shift=0w1}, cvec); - gen(arithmeticShiftRightVariable{regM=X0, regN=X1, regD=X1}, cvec); - boxLargeWord(X1, cvec) - ) - - (* Floating point comparisons. - The fcmp instruction differs from integer comparison. If either - argument is a NaN the overflow bit is set and the other bits are - cleared. That means that in order to get a true result only - if the values are not NaNs we have to test that at least one of - C, N, or Z are set. We use unsigned tests for < and <= - and signed tests for > and >=. *) - | genBinary(RealComparison (test, PrecDouble), arg1, arg2, loopAddr) = - let - val cond = - case test of - TestEqual => CondEqual - | TestLess => CondCarryClear - | TestLessEqual => CondUnsignedLowOrEq - | TestGreater => CondSignedGreater - | TestGreaterEqual => CondSignedGreaterEq - | TestUnordered => CondOverflow - in - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - genPopReg(X1, cvec); - unboxDouble(X0, X0, V0, cvec); - unboxDouble(X1, X1, V1, cvec); - gen(compareDouble{regM=V0, regN=V1}, cvec); - setBooleanCondition(X0, cond, cvec) - end - - | genBinary(RealComparison (test, PrecSingle), arg1, arg2, loopAddr) = - let - val cond = - case test of - TestEqual => CondEqual - | TestLess => CondCarryClear - | TestLessEqual => CondUnsignedLowOrEq - | TestGreater => CondSignedGreater - | TestGreaterEqual => CondSignedGreaterEq - | TestUnordered => CondOverflow - in - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - genPopReg(X1, cvec); - unboxOrUntagSingle(X0, X0, V0, cvec); - unboxOrUntagSingle(X1, X1, V1, cvec); - gen(compareFloat{regM=V0, regN=V1}, cvec); - setBooleanCondition(X0, cond, cvec) - end - - | genBinary(RealArith (oper, PrecDouble), arg1, arg2, loopAddr) = - let - val operation = - case oper of - ArithAdd => addDouble - | ArithSub => subtractDouble - | ArithMult => multiplyDouble - | ArithDiv => divideDouble - | _ => raise InternalError "RealArith - unimplemented instruction" - in - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - genPopReg(X1, cvec); - unboxDouble(X0, X0, V0, cvec); - unboxDouble(X1, X1, V1, cvec); - gen(operation{regM=V0, regN=V1, regD=V0}, cvec); - boxDouble(V0, cvec) - end - - | genBinary(RealArith (oper, PrecSingle), arg1, arg2, loopAddr) = - let - val operation = - case oper of - ArithAdd => addFloat - | ArithSub => subtractFloat - | ArithMult => multiplyFloat - | ArithDiv => divideFloat - | _ => raise InternalError "RealArith - unimplemented instruction" - in - (* 32-bit floats are represented as 
the value in the top 32-bits of - a general register with the low-order word containing all zeros - except the bottom bit which is one. *) - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - genPopReg(X1, cvec); - unboxOrUntagSingle(X0, X0, V0, cvec); - unboxOrUntagSingle(X1, X1, V1, cvec); - gen(operation{regM=V0, regN=V1, regD=V0}, cvec); - boxOrTagFloat(V0, cvec) - end - - | genBinary(FreeCStack, arg1, arg2, loopAddr) = - (* Free space on the C stack. This is a binary operation that takes the base address - and the size. The base address isn't used in this version. *) - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - genPopReg(X1, cvec); (* Pop and discard the address *) - (* Can't use the shifted addition which would remove the tag as part of the add. *) - gen(logicalShiftRight{shift=0w1, regN=X0, regD=X0}, cvec); - gen(addExtendedReg{regM=X0, regN=XSP, regD=XSP, extend=ExtUXTX 0w0}, cvec) - ) - - (* doNext is only used for mutually recursive functions where a - function may not be able to fill in its closure if it does not have - all the remaining declarations. *) - (* TODO: This always creates the closure on the heap even when makeClosure is false. *) - and genProc ({ closure=[], localCount, body, argTypes, name, ...}: bicLambdaForm, mutualDecs, doNext: unit -> unit) : unit = - let - (* Create a one word item for the closure. This is returned for recursive references - and filled in with the address of the code when we've finished. *) - val closure = makeConstantClosure() - (* Code-gen function. No non-local references. *) - val () = codegen (body, name, closure, List.length argTypes, localCount, parameters, false) - val () = gen(loadAddressConstant(X0, closureAsAddress closure), cvec) - val () = genPushReg(X0, cvec) - val () = incsp(); - in - if mutualDecs then doNext () else () - end - - | genProc ({ localCount, body, name, argTypes, closure, ...}, mutualDecs, doNext) = - let (* Full closure required. *) - val resClosure = makeConstantClosure() - (* Code-gen function. *) - val () = codegen (body, name, resClosure, List.length argTypes, localCount, parameters, true) - val closureVars = List.length closure (* Size excluding the code address *) - (* The first native word is the code address. *) - val firstEntry = Word.toInt(nativeWordSize div wordSize) - in - if mutualDecs - then - let (* Have to make the closure now and fill it in later. *) - val () = genAllocateFixedSize(closureVars+firstEntry, Word8.orb(F_mutable, F_closure), X0, X1, cvec) - val () = - if is32in64 - then (* Have to get the code address at run-time. *) - ( - gen(loadAddressConstant(X1, toMachineWord resClosure), cvec); - indexToAbsoluteAddress(X1, X1, cvec); - gen(loadRegScaled{regT=X1, regN=X1, unitOffset=0}, cvec) - ) - else gen(loadAddressConstant(X1, codeAddressFromClosure resClosure), cvec) - val () = gen(storeRegScaled{regT=X1, regN=X0, unitOffset=0}, cvec) - val () = absoluteAddressToIndex(X0, cvec) - val () = genPushReg(X0, cvec) - val () = incsp () - - val entryAddr : int = !realstackptr - (* Set the address of this entry in the declaration table and - then process any other mutual-recursive functions. *) - val () = doNext () - - (* Reload the address of the vector - If we have processed other - closures the closure will no longer be on the top of the stack. 
*) - val () = genList(loadScaledWord{dest=X1, base=X_MLStackPtr, work=X16, offset= !realstackptr - entryAddr}, cvec) - val () = indexToAbsoluteAddress(X1, X1, cvec) - - (* Load items for the closure. *) - fun loadItems ([], _) = () - | loadItems (v :: vs, addr : int) = - ( - (* Generate an item and move it into the closure *) - gencde (BICExtract v, ToX0, NotEnd, NONE); - (* The closure "address" excludes the code address. *) - genList(storeScaledPolyWord{store=X0, base=X1, work=X16, offset=addr+firstEntry}, cvec); - topInX0 := false; - loadItems (vs, addr + 1) - ) - - val () = loadItems (closure, 0) - - (* Lock it by setting the top byte to the zero or the closure bit. *) - val () = - if is32in64 - then - ( - genList(loadNonAddress(X16, Word64.fromLargeWord(Word8.toLargeWord F_closure)), cvec); - gen(storeRegUnscaledByte{regT=X16, regN=X1, byteOffset=flagsByteOffset}, cvec) - ) - else gen(storeRegUnscaledByte{regT=XZero, regN=X1, byteOffset=flagsByteOffset}, cvec) - in - () (* Don't need to do anything now. *) - end - - else - let - val () = List.app (fn pt => gencde (BICExtract pt, ToStack, NotEnd, NONE)) closure - in - genAllocateFixedSize(closureVars+firstEntry, if is32in64 then F_closure else 0w0, X0, X1, cvec); - List.foldl(fn (_, w) => - ( - genPopReg(X1, cvec); - genList(storeScaledPolyWord{store=X1, base=X0, work=X16, offset=w-1}, cvec); - w-1 - ) - ) (closureVars+firstEntry) closure; - if is32in64 - then (* Have to get the code address at run-time. *) - ( - gen(loadAddressConstant(X1, toMachineWord resClosure), cvec); - indexToAbsoluteAddress(X1, X1, cvec); - gen(loadRegScaled{regT=X1, regN=X1, unitOffset=0}, cvec) - ) - else gen(loadAddressConstant(X1, codeAddressFromClosure resClosure), cvec); - gen(storeRegScaled{regT=X1, regN=X0, unitOffset=0}, cvec); - absoluteAddressToIndex(X0, cvec); - genPushReg(X0, cvec); - realstackptr := !realstackptr - closureVars + 1 (* Popped the closure vars and pushed the address. *) - end - end - - and genCond (testCode, thenCode, elseCode, whereto, tailKind, loopAddr) = - let - val toElse = createLabel() and exitJump = createLabel() - val () = genTest(testCode, false, toElse, loopAddr) - val () = gencde (thenCode, whereto, tailKind, loopAddr) - (* Get rid of the result from the stack. If there is a result then the - ``else-part'' will push it. *) - val () = case whereto of ToStack => decsp () | NoResult => () | ToX0 => () - val () = topInX0 := false - - val () = gen(unconditionalBranch exitJump, cvec) - - (* start of "else part" *) - val () = gen(setLabel toElse, cvec) - val () = gencde (elseCode, whereto, tailKind, loopAddr) - val () = gen(setLabel exitJump, cvec) - in - () - end (* genCond *) - - (* andalso and orelse are turned into conditionals with constants. - Convert this into a series of tests. 
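In source terms the front end has already rewritten
       a andalso b   as   if a then b else false
       a orelse b    as   if a then true else b
so the cases that follow only have to handle boolean constants, NotBoolean, IsTaggedValue, word and pointer comparisons, tag tests and nested conditionals; anything else is evaluated and compared against the tagged representation of true.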
*) - and genTest(BICConstnt(w, _), jumpOn, targetLabel, _) = - let - val cVal = case toShort w of 0w0 => false | 0w1 => true | _ => raise InternalError "genTest" - in - if cVal = jumpOn - then gen(unconditionalBranch targetLabel, cvec) - else () - end - - | genTest(BICUnary { oper=NotBoolean, arg1 }, jumpOn, targetLabel, loopAddr) = - genTest(arg1, not jumpOn, targetLabel, loopAddr) - - | genTest(BICUnary { oper=IsTaggedValue, arg1 }, jumpOn, targetLabel, loopAddr) = - ( - gencde (arg1, ToX0, NotEnd, loopAddr); - topInX0 := false; - gen((if jumpOn then testBitBranchNonZero else testBitBranchZero) - (X0, 0w0, targetLabel), cvec) - ) - - | genTest(BICBinary{oper=WordComparison{test, isSigned}, arg1, arg2}, jumpOn, targetLabel, loopAddr) = - let - val (cond, condNot) = - case (test, isSigned) of - (TestEqual, _) => (CondEqual, CondNotEqual) - | (TestLess, true) => (CondSignedLess, CondSignedGreaterEq) - | (TestLessEqual, true) => (CondSignedLessEq, CondSignedGreater) - | (TestGreater, true) => (CondSignedGreater, CondSignedLessEq) - | (TestGreaterEqual, true) => (CondSignedGreaterEq, CondSignedLess) - | (TestLess, false) => (CondCarryClear, CondCarrySet) - | (TestLessEqual, false) => (CondUnsignedLowOrEq, CondUnsignedHigher) - | (TestGreater, false) => (CondUnsignedHigher, CondUnsignedLowOrEq) - | (TestGreaterEqual, false) => (CondCarrySet, CondCarryClear) - | (TestUnordered, _) => raise InternalError "WordComparison: TestUnordered" - in - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - topInX0 := false; - genPopReg(X1, cvec); (* First argument. *) - comparePolyRegs(X1, X0, cvec); - gen(conditionalBranch(if jumpOn then cond else condNot, targetLabel), cvec) - end - - | genTest(BICBinary{oper=PointerEq, arg1, arg2}, jumpOn, targetLabel, loopAddr) = - ( - gencde (arg1, ToStack, NotEnd, loopAddr); - gencde (arg2, ToX0, NotEnd, loopAddr); - decsp(); - topInX0 := false; - genPopReg(X1, cvec); (* First argument. *) - comparePolyRegs(X1, X0, cvec); - gen(conditionalBranch(if jumpOn then CondEqual else CondNotEqual, targetLabel), cvec) - ) - - | genTest(BICTagTest { test, tag=tagValue, ... }, jumpOn, targetLabel, loopAddr) = - ( - gencde (test, ToX0, NotEnd, loopAddr); - topInX0 := false; - gen((if is32in64 then subSImmediate32 else subSImmediate) - {regN=X0, regD=XZero, immed=taggedWord tagValue, shifted=false}, cvec); - gen(conditionalBranch(if jumpOn then CondEqual else CondNotEqual, targetLabel), cvec) - ) - - | genTest(BICCond (testPart, thenPart, elsePart), jumpOn, targetLabel, loopAddr) = - let - val toElse = createLabel() and exitJump = createLabel() - in - genTest(testPart, false, toElse, loopAddr); - genTest(thenPart, jumpOn, targetLabel, loopAddr); - gen(unconditionalBranch exitJump, cvec); - gen(setLabel toElse, cvec); - genTest(elsePart, jumpOn, targetLabel, loopAddr); - gen(setLabel exitJump, cvec) - end - - | genTest(testCode, jumpOn, targetLabel, loopAddr) = - ( - gencde (testCode, ToStack, NotEnd, loopAddr); - genPopReg(X0, cvec); - gen((if is32in64 then subSImmediate32 else subSImmediate) - {regN=X0, regD=XZero, immed=taggedWord 0w1, shifted=false}, cvec); - gen(conditionalBranch(if jumpOn then CondEqual else CondNotEqual, targetLabel), cvec); - decsp() (* conditional branch pops a value. 
*) - ) - - and genEval (eval, tailKind : tail) : unit = - let - val argList : backendIC list = List.map #1 (#argList eval) - val argsToPass : int = List.length argList; - - (* Load arguments *) - fun loadArgs [] = () - | loadArgs (v :: vs) = - let (* Push each expression onto the stack. *) - val () = gencde(v, ToStack, NotEnd, NONE) - in - loadArgs vs - end; - - (* Have to guarantee that the expression to return the function - is evaluated before the arguments. *) - - (* Returns true if evaluating it later is safe. *) - fun safeToLeave (BICConstnt _) = true - | safeToLeave (BICLambda _) = true - | safeToLeave (BICExtract _) = true - | safeToLeave (BICField {base, ...}) = safeToLeave base - | safeToLeave (BICLoadContainer {base, ...}) = safeToLeave base - | safeToLeave _ = false - - val () = - if (case argList of [] => true | _ => safeToLeave (#function eval)) - then - let - (* Can load the args first. *) - val () = loadArgs argList - in - gencde (#function eval, ToStack, NotEnd, NONE) - end - - else - let - (* The expression for the function is too complicated to - risk leaving. It might have a side-effect and we must - ensure that any side-effects it has are done before the - arguments are loaded. *) - val () = gencde(#function eval, ToStack, NotEnd, NONE); - val () = loadArgs(argList); - (* Load the function again. *) - val () = genList(loadScaledWord{dest=X0, base=X_MLStackPtr, work=X16, offset=argsToPass}, cvec) - val () = genPushReg(X0, cvec) - in - incsp () - end - - in (* body of genEval *) - case tailKind of - NotEnd => (* Normal call. *) - let - val () = genPopReg(X8, cvec) (* Pop the closure pointer. *) - (* We need to put the first 8 arguments into registers and - leave the rest on the stack. *) - fun loadArg(n, reg) = - if argsToPass > n - then genList(loadScaledWord{dest=reg, base=X_MLStackPtr, work=X16, offset=argsToPass-n-1}, cvec) - else () - val () = loadArg(0, X0) - val () = loadArg(1, X1) - val () = loadArg(2, X2) - val () = loadArg(3, X3) - val () = loadArg(4, X4) - val () = loadArg(5, X5) - val () = loadArg(6, X6) - val () = loadArg(7, X7) - in - if is32in64 - then - ( - (* Can't use an indexed load because the only options for scaling are either 1 or 8. *) - indexToAbsoluteAddress(X8, X9, cvec); - gen(loadRegScaled{regT=X9, regN=X9, unitOffset=0}, cvec) (* Entry point *) - ) - else gen(loadRegScaled{regT=X9, regN=X8, unitOffset=0}, cvec); (* Entry point *) - gen(branchAndLinkReg X9, cvec); - (* We have popped the closure pointer. The caller has popped the stack - arguments and we have pushed the result value. The register arguments - are still on the stack. *) - topInX0 := true; - realstackptr := !realstackptr - Int.max(argsToPass-8, 0) - 1 (* Args popped by caller. *) - end - - | EndOfProc => (* Tail recursive call. *) - let - val () = genPopReg(X8, cvec) (* Pop the closure pointer. *) - val () = decsp() - (* Get the return address into X30. *) - val () = genList(loadScaledWord{dest=X30, base=X_MLStackPtr, work=X16, offset= !realstackptr}, cvec) - - (* Load the register arguments *) - fun loadArg(n, reg) = - if argsToPass > n - then genList(loadScaledWord{dest=reg, base=X_MLStackPtr, work=X16, offset=argsToPass-n-1}, cvec) - else () - val () = loadArg(0, X0) - val () = loadArg(1, X1) - val () = loadArg(2, X2) - val () = loadArg(3, X3) - val () = loadArg(4, X4) - val () = loadArg(5, X5) - val () = loadArg(6, X6) - val () = loadArg(7, X7) - (* We need to move the stack arguments into the original argument area. 
*) - - (* This is the total number of words that this function is responsible for. - It includes the stack arguments that the caller expects to be removed. *) - val itemsOnStack = !realstackptr + 1 + numOfArgs - - (* Stack arguments are moved using X9. *) - fun moveStackArg n = - if n >= argsToPass - then () - else - let - val () = loadArg(n, X9) - val destOffset = itemsOnStack - (n-8) - 1 - val () = genList(storeScaledWord{store=X9, base=X_MLStackPtr, work=X16, offset=destOffset}, cvec) - in - moveStackArg(n+1) - end - - val () = moveStackArg 8 - in - resetStack(itemsOnStack - Int.max(argsToPass-8, 0), false, cvec); - if is32in64 - then - ( - indexToAbsoluteAddress(X8, X9, cvec); - gen(loadRegScaled{regT=X9, regN=X9, unitOffset=0}, cvec) (* Entry point *) - ) - else gen(loadRegScaled{regT=X9, regN=X8, unitOffset=0}, cvec); (* Entry point *) - gen(branchRegister X9, cvec) - (* Since we're not returning we can ignore the stack pointer value. *) - end - end (* genEval *) - - (* Begin generating the code for the function. *) - val prefix = ref [] - (* Push the arguments passed in registers. *) - val () = if numOfArgs >= 8 then genPushReg (X7, prefix) else () - val () = if numOfArgs >= 7 then genPushReg (X6, prefix) else () - val () = if numOfArgs >= 6 then genPushReg (X5, prefix) else () - val () = if numOfArgs >= 5 then genPushReg (X4, prefix) else () - val () = if numOfArgs >= 4 then genPushReg (X3, prefix) else () - val () = if numOfArgs >= 3 then genPushReg (X2, prefix) else () - val () = if numOfArgs >= 2 then genPushReg (X1, prefix) else () - val () = if numOfArgs >= 1 then genPushReg (X0, prefix) else () - val () = genPushReg (X30, prefix) - (* The new code generator does not pass X8 if it is calling a constant function with no - closure. For simplicity we set X8 to the current closure here. *) - val () = if hasClosure then () else gen(loadAddressConstant(X8, closureAsAddress resultClosure), prefix) - val () = genPushReg (X8, prefix) (* Push closure pointer *) - - (* Generate the function. *) - (* Assume we always want a result. There is otherwise a problem if the - called routine returns a result of type void (i.e. no result) but the - caller wants a result (e.g. the identity function). *) - val () = gencde (pt, ToX0, EndOfProc, NONE) - val () = resetStack(1, false, cvec) (* Skip over the pushed closure *) - val () = genPopReg(X30, cvec) (* Return address => pop into X30 *) - val () = resetStack(numOfArgs, false, cvec) (* Remove the arguments *) - val () = gen(returnRegister X30, cvec) (* Jump to X30 *) - - (* Now we know the maximum stack size we can code-gen the stack check. - This needs to go in after we have saved X30. *) - val () = checkStackCode(X10, !maxStack, false, prefix) - val instructions = List.rev(!prefix) @ List.rev(!cvec) - - in (* body of codegen *) - (* Having code-generated the body of the function, it is copied - into a new data segment. 
*) - generateCode{instrs=instructions, name=name, parameters=parameters, resultClosure=resultClosure, profileObject=createProfileObject()} - end (* codegen *) - - fun gencodeLambda({ name, body, argTypes, localCount, closure, ...}:bicLambdaForm, parameters, closureRef) = - codegen (body, name, closureRef, List.length argTypes, localCount, parameters, not(null closure)) - - - structure Foreign = Arm64Foreign - - structure Sharing = - struct - open BackendTree.Sharing - type closureRef = closureRef - end - -end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PreAssembly.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PreAssembly.ML index 554583bb..2d2bef55 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PreAssembly.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PreAssembly.ML @@ -1,898 +1,904 @@ (* Copyright (c) 2021-2 David C. J. Matthews This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public Licence version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public Licence for more details. You should have received a copy of the GNU Lesser General Public Licence along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) (* The pre-assembly layer goes below the icode and allows peep-hole optimisation. *) functor Arm64PreAssembly( structure Arm64Assembly: ARM64ASSEMBLY structure Debug: DEBUG structure Pretty: PRETTY ): ARM64PREASSEMBLY = struct open Arm64Assembly exception InternalError = Misc.InternalError (* Reversed cons and append to make the code easier to read. 
*) infix 5 <::> <@> fun tl <::> hd = hd :: tl and snd <@> fst = fst @ snd (* Many of the datatypes are inherited from Arm64Assembly *) datatype loadType = Load64 | Load32 | Load16 | Load8 and opSize = OpSize32 | OpSize64 and logicalOp = LogAnd | LogOr | LogXor and floatSize = Float32 | Double64 and shiftDirection = ShiftLeft | ShiftRightLogical | ShiftRightArithmetic and multKind = MultAdd32 | MultSub32 | MultAdd64 | MultSub64 | SignedMultAddLong (* 32bit*32bit + 64bit => 64Bit *) | SignedMultHigh (* High order part of 64bit*64Bit *) and fpUnary = NegFloat | NegDouble | AbsFloat | AbsDouble | ConvFloatToDble | ConvDbleToFloat and fpBinary = MultiplyFP | DivideFP | AddFP | SubtractFP and unscaledType = NoUpdate | PreIndex | PostIndex and condSet = CondSet | CondSetIncr | CondSetInvert | CondSetNegate and bitfieldKind = BFUnsigned | BFSigned | BFInsert and brRegType = BRRBranch | BRRAndLink | BRRReturn type precodeLabel = labels datatype precode = (* Basic instructions *) AddImmediate of {regN: xReg, regD: xReg, immed: word, shifted: bool, opSize: opSize, setFlags: bool} | SubImmediate of {regN: xReg, regD: xReg, immed: word, shifted: bool, opSize: opSize, setFlags: bool} | AddShiftedReg of {regM: xReg, regN: xReg, regD: xReg, shift: shiftType, opSize: opSize, setFlags: bool} | SubShiftedReg of {regM: xReg, regN: xReg, regD: xReg, shift: shiftType, opSize: opSize, setFlags: bool} | AddExtendedReg of {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend, opSize: opSize, setFlags: bool} | SubExtendedReg of {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend, opSize: opSize, setFlags: bool} | MultiplyAndAddSub of {regM: xReg, regN: xReg, regA: xReg, regD: xReg, multKind: multKind} | DivideRegs of {regM: xReg, regN: xReg, regD: xReg, isSigned: bool, opSize: opSize} | LogicalShiftedReg of {regM: xReg, regN: xReg, regD: xReg, shift: shiftType, logOp: logicalOp, opSize: opSize, setFlags: bool} | LoadRegScaled of {regT: xReg, regN: xReg, unitOffset: int, loadType: loadType} | LoadFPRegScaled of {regT: vReg, regN: xReg, unitOffset: int, floatSize: floatSize} | StoreRegScaled of {regT: xReg, regN: xReg, unitOffset: int, loadType: loadType} | StoreFPRegScaled of {regT: vReg, regN: xReg, unitOffset: int, floatSize: floatSize} | LoadRegUnscaled of {regT: xReg, regN: xReg, byteOffset: int, loadType: loadType, unscaledType: unscaledType} | StoreRegUnscaled of {regT: xReg, regN: xReg, byteOffset: int, loadType: loadType, unscaledType: unscaledType} | LoadFPRegUnscaled of {regT: vReg, regN: xReg, byteOffset: int, floatSize: floatSize, unscaledType: unscaledType} | StoreFPRegUnscaled of {regT: vReg, regN: xReg, byteOffset: int, floatSize: floatSize, unscaledType: unscaledType} | LoadRegIndexed of {regT: xReg, regN: xReg, regM: xReg, loadType: loadType, option: scale extend} | StoreRegIndexed of {regT: xReg, regN: xReg, regM: xReg, loadType: loadType, option: scale extend} | LoadFPRegIndexed of {regT: vReg, regN: xReg, regM: xReg, floatSize: floatSize, option: scale extend} | StoreFPRegIndexed of {regT: vReg, regN: xReg, regM: xReg, floatSize: floatSize, option: scale extend} (* LoadAcquire and StoreRelease are used for mutables. *) | LoadAcquireReg of {regN: xReg, regT: xReg, loadType: loadType} | StoreReleaseReg of {regN: xReg, regT: xReg, loadType: loadType} (* LoadAcquireExclusiveRegister and StoreReleaseExclusiveRegister are used for mutexes. 
*) | LoadAcquireExclusiveRegister of {regN: xReg, regT: xReg} | StoreReleaseExclusiveRegister of {regS: xReg, regT: xReg, regN: xReg} | MemBarrier | LoadRegPair of { regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int, loadType: loadType, unscaledType: unscaledType} | StoreRegPair of { regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int, loadType: loadType, unscaledType: unscaledType} | LoadFPRegPair of { regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int, floatSize: floatSize, unscaledType: unscaledType} | StoreFPRegPair of { regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int, floatSize: floatSize, unscaledType: unscaledType} | ConditionalSet of {regD: xReg, regTrue: xReg, regFalse: xReg, cond: condition, condSet: condSet, opSize: opSize} | BitField of {immr: word, imms: word, regN: xReg, regD: xReg, opSize: opSize, bitfieldKind: bitfieldKind} | ShiftRegisterVariable of {regM: xReg, regN: xReg, regD: xReg, opSize: opSize, shiftDirection: shiftDirection} | BitwiseLogical of { bits: Word64.word, regN: xReg, regD: xReg, opSize: opSize, setFlags: bool, logOp: logicalOp} (* Floating point *) | MoveGeneralToFP of { regN: xReg, regD: vReg, floatSize: floatSize} | MoveFPToGeneral of {regN: vReg, regD: xReg, floatSize: floatSize} | CvtIntToFP of { regN: xReg, regD: vReg, floatSize: floatSize, opSize: opSize} | CvtFloatToInt of { round: IEEEReal.rounding_mode, regN: vReg, regD: xReg, floatSize: floatSize, opSize: opSize} | FPBinaryOp of { regM: vReg, regN: vReg, regD: vReg, floatSize: floatSize, fpOp: fpBinary} | FPComparison of { regM: vReg, regN: vReg, floatSize: floatSize} | FPUnaryOp of {regN: vReg, regD: vReg, fpOp: fpUnary} (* Branches and Labels. *) | SetLabel of precodeLabel | ConditionalBranch of condition * precodeLabel | UnconditionalBranch of precodeLabel | BranchAndLink of precodeLabel | BranchReg of {regD: xReg, brRegType: brRegType } | LoadLabelAddress of xReg * precodeLabel | TestBitBranch of { test: xReg, bit: Word8.word, label: precodeLabel, onZero: bool } | CompareBranch of { test: xReg, label: precodeLabel, onZero: bool, opSize: opSize } (* Composite instructions *) | MoveXRegToXReg of {sReg: xReg, dReg: xReg} | LoadNonAddr of xReg * Word64.word | LoadAddr of xReg * machineWord | RTSTrap of { rtsEntry: int, work: xReg, save: xReg list } | AllocateMemoryFixedSize of { bytes: word, dest: xReg, save: xReg list, work: xReg } | AllocateMemoryVariableSize of { sizeReg: xReg, dest: xReg, save: xReg list, work: xReg } (* Branch table for indexed case. startLabel is the address of the first label in the list. The branch table is a sequence of unconditional branches. 
*) | BranchTable of { startLabel: precodeLabel, brTable: precodeLabel list } + | LoadGlobalHeapBaseInCallback of xReg fun toAssembler([], code) = code | toAssembler(AddImmediate{regN, regD, immed, shifted, opSize, setFlags} :: rest, code) = let val instr = case (opSize, setFlags) of (OpSize64, false) => addImmediate | (OpSize32, false) => addImmediate32 | (OpSize64, true) => addSImmediate | (OpSize32, true) => addSImmediate32 in toAssembler(rest, code <::> instr{regN=regN, regD=regD, immed=immed, shifted=shifted}) end | toAssembler(SubImmediate{regN, regD, immed, shifted, opSize, setFlags} :: rest, code) = let val instr = case (opSize, setFlags) of (OpSize64, false) => subImmediate | (OpSize32, false) => subImmediate32 | (OpSize64, true) => subSImmediate | (OpSize32, true) => subSImmediate32 in toAssembler(rest, code <::> instr{regN=regN, regD=regD, immed=immed, shifted=shifted}) end | toAssembler(AddShiftedReg{regM, regN, regD, shift, opSize, setFlags} :: rest, code) = let val instr = case (opSize, setFlags) of (OpSize64, false) => addShiftedReg | (OpSize32, false) => addShiftedReg32 | (OpSize64, true) => addSShiftedReg | (OpSize32, true) => addSShiftedReg32 in toAssembler(rest, code <::> instr{regM=regM, regN=regN, regD=regD, shift=shift}) end | toAssembler(SubShiftedReg{regM, regN, regD, shift, opSize, setFlags} :: rest, code) = let val instr = case (opSize, setFlags) of (OpSize64, false) => subShiftedReg | (OpSize32, false) => subShiftedReg32 | (OpSize64, true) => subSShiftedReg | (OpSize32, true) => subSShiftedReg32 in toAssembler(rest, code <::> instr{regM=regM, regN=regN, regD=regD, shift=shift}) end | toAssembler(AddExtendedReg{regM, regN, regD, extend, opSize, setFlags} :: rest, code) = (* Add/SubExtended are only used to access XSP. *) let val instr = case (opSize, setFlags) of (OpSize64, false) => addExtendedReg | (OpSize32, false) => raise InternalError "AddExtendedReg; 32" | (OpSize64, true) => addSExtendedReg | (OpSize32, true) => raise InternalError "AddExtendedReg; 32" in toAssembler(rest, code <::> instr{regM=regM, regN=regN, regD=regD, extend=extend}) end | toAssembler(SubExtendedReg{regM, regN, regD, extend, opSize, setFlags} :: rest, code) = let val instr = case (opSize, setFlags) of (OpSize64, false) => subExtendedReg | (OpSize32, false) => raise InternalError "AddExtendedReg; 32" | (OpSize64, true) => subSExtendedReg | (OpSize32, true) => raise InternalError "AddExtendedReg; 32" in toAssembler(rest, code <::> instr{regM=regM, regN=regN, regD=regD, extend=extend}) end | toAssembler(MultiplyAndAddSub{regM, regN, regA, regD, multKind} :: rest, code) = let val instr = case multKind of MultAdd32 => multiplyAndAdd32{regM=regM, regN=regN, regA=regA, regD=regD} | MultSub32 => multiplyAndSub32{regM=regM, regN=regN, regA=regA, regD=regD} | MultAdd64 => multiplyAndAdd{regM=regM, regN=regN, regA=regA, regD=regD} | MultSub64 => multiplyAndSub{regM=regM, regN=regN, regA=regA, regD=regD} | SignedMultAddLong => signedMultiplyAndAddLong{regM=regM, regN=regN, regA=regA, regD=regD} | SignedMultHigh => signedMultiplyHigh{regM=regM, regN=regN, regD=regD} in toAssembler(rest, code <::> instr) end | toAssembler(DivideRegs{regM, regN, regD, isSigned, opSize} :: rest, code) = let val instr = case (isSigned, opSize) of (true, OpSize64) => signedDivide | (true, OpSize32) => signedDivide32 | (false, OpSize64) => unsignedDivide | (false, OpSize32) => unsignedDivide32 in toAssembler(rest, code <::> instr{regN=regN, regM=regM, regD=regD}) end | toAssembler(LogicalShiftedReg{regM, regN, regD, shift, 
logOp, opSize, setFlags} :: rest, code) = let val instr = case (logOp, setFlags, opSize) of (LogAnd, false, OpSize64) => andShiftedReg | (LogAnd, true, OpSize64) => andsShiftedReg | (LogOr, false, OpSize64) => orrShiftedReg | (LogXor, false, OpSize64) => eorShiftedReg | (LogAnd, false, OpSize32) => andShiftedReg32 | (LogAnd, true, OpSize32) => andsShiftedReg32 | (LogOr, false, OpSize32) => orrShiftedReg32 | (LogXor, false, OpSize32) => eorShiftedReg32 | _ => raise InternalError "setFlags not valid with OR or XOR" (* There are also versions of AND/OR/XOR which operate on a complement (NOT) of the shifted register. It's probably not worth looking for a use for them. *) in toAssembler(rest, code <::> instr{regN=regN, regM=regM, regD=regD, shift=shift}) end | toAssembler(LoadRegScaled{regT, regN, unitOffset, loadType} :: rest, code) = let val instr = case loadType of Load64 => loadRegScaled | Load32 => loadRegScaled32 | Load16 => loadRegScaled16 | Load8 => loadRegScaledByte in toAssembler(rest, code <::> instr{regT=regT, regN=regN, unitOffset=unitOffset}) end | toAssembler(StoreRegScaled{regT, regN, unitOffset, loadType} :: rest, code) = let val instr = case loadType of Load64 => storeRegScaled | Load32 => storeRegScaled32 | Load16 => storeRegScaled16 | Load8 => storeRegScaledByte in toAssembler(rest, code <::> instr{regT=regT, regN=regN, unitOffset=unitOffset}) end | toAssembler(LoadFPRegScaled{regT, regN, unitOffset, floatSize} :: rest, code) = let val instr = case floatSize of Float32 => loadRegScaledFloat | Double64 => loadRegScaledDouble in toAssembler(rest, code <::> instr{regT=regT, regN=regN, unitOffset=unitOffset}) end | toAssembler(StoreFPRegScaled{regT, regN, unitOffset, floatSize} :: rest, code) = let val instr = case floatSize of Float32 => storeRegScaledFloat | Double64 => storeRegScaledDouble in toAssembler(rest, code <::> instr{regT=regT, regN=regN, unitOffset=unitOffset}) end | toAssembler(LoadRegUnscaled{regT, regN, byteOffset, loadType, unscaledType} :: rest, code) = let val instr = case (loadType, unscaledType) of (Load64, NoUpdate) => loadRegUnscaled | (Load32, NoUpdate) => loadRegUnscaled32 | (Load16, NoUpdate) => loadRegUnscaled16 | (Load8, NoUpdate) => loadRegUnscaledByte | (Load64, PreIndex) => loadRegPreIndex | (Load32, PreIndex) => loadRegPreIndex32 | (Load16, PreIndex) => raise InternalError "loadRegPreIndex16" | (Load8, PreIndex) => loadRegPreIndexByte | (Load64, PostIndex) => loadRegPostIndex | (Load32, PostIndex) => loadRegPostIndex32 | (Load16, PostIndex) => raise InternalError "loadRegPostIndex16" | (Load8, PostIndex) => loadRegPostIndexByte in toAssembler(rest, code <::> instr{regT=regT, regN=regN, byteOffset=byteOffset}) end | toAssembler(LoadFPRegUnscaled{regT, regN, byteOffset, floatSize, unscaledType} :: rest, code) = let val instr = case (floatSize, unscaledType) of (Float32, NoUpdate) => loadRegUnscaledFloat | (Double64, NoUpdate) => loadRegUnscaledDouble | _ => raise InternalError "LoadFPRegUnscaled: pre/post indexed" in toAssembler(rest, code <::> instr{regT=regT, regN=regN, byteOffset=byteOffset}) end | toAssembler(StoreRegUnscaled{regT, regN, byteOffset, loadType, unscaledType} :: rest, code) = let val instr = case (loadType, unscaledType) of (Load64, NoUpdate) => storeRegUnscaled | (Load32, NoUpdate) => storeRegUnscaled32 | (Load16, NoUpdate) => storeRegUnscaled16 | (Load8, NoUpdate) => storeRegUnscaledByte | (Load64, PreIndex) => storeRegPreIndex | (Load32, PreIndex) => storeRegPreIndex32 | (Load16, PreIndex) => raise InternalError 
"storeRegPreIndex16" | (Load8, PreIndex) => storeRegPreIndexByte | (Load64, PostIndex) => storeRegPostIndex | (Load32, PostIndex) => storeRegPostIndex32 | (Load16, PostIndex) => raise InternalError "storeRegPostIndex16" | (Load8, PostIndex) => storeRegPostIndexByte in toAssembler(rest, code <::> instr{regT=regT, regN=regN, byteOffset=byteOffset}) end | toAssembler(StoreFPRegUnscaled{regT, regN, byteOffset, floatSize, unscaledType} :: rest, code) = let val instr = case (floatSize, unscaledType) of (Float32, NoUpdate) => storeRegUnscaledFloat | (Double64, NoUpdate) => storeRegUnscaledDouble | _ => raise InternalError "StoreFPRegUnscaled: pre/post indexed" in toAssembler(rest, code <::> instr{regT=regT, regN=regN, byteOffset=byteOffset}) end | toAssembler(LoadRegIndexed{regT, regN, regM, loadType, option} :: rest, code) = let val instr = case loadType of Load64 => loadRegIndexed | Load32 => loadRegIndexed32 | Load16 => loadRegIndexed16 | Load8 => loadRegIndexedByte in toAssembler(rest, code <::> instr{regT=regT, regN=regN, regM=regM, option=option}) end | toAssembler(StoreRegIndexed{regT, regN, regM, loadType, option} :: rest, code) = let val instr = case loadType of Load64 => storeRegIndexed | Load32 => storeRegIndexed32 | Load16 => storeRegIndexed16 | Load8 => storeRegIndexedByte in toAssembler(rest, code <::> instr{regT=regT, regN=regN, regM=regM, option=option}) end | toAssembler(LoadFPRegIndexed{regT, regN, regM, floatSize, option} :: rest, code) = let val instr = case floatSize of Float32 => loadRegIndexedFloat | Double64 => loadRegIndexedDouble in toAssembler(rest, code <::> instr{regT=regT, regN=regN, regM=regM, option=option}) end | toAssembler(StoreFPRegIndexed{regT, regN, regM, floatSize, option} :: rest, code) = let val instr = case floatSize of Float32 => storeRegIndexedFloat | Double64 => storeRegIndexedDouble in toAssembler(rest, code <::> instr{regT=regT, regN=regN, regM=regM, option=option}) end | toAssembler(LoadAcquireReg{regN, regT, loadType} :: rest, code) = let val loadInstr = case loadType of Load64 => loadAcquire | Load32 => loadAcquire32 | Load8 => loadAcquireByte | _ => raise InternalError "LoadAcquire: Unsupported size" (* Not used *) in toAssembler(rest, code <::> loadInstr{regT=regT, regN=regN}) end | toAssembler(StoreReleaseReg{regN, regT, loadType} :: rest, code) = let val storeInstr = case loadType of Load64 => storeRelease | Load32 => storeRelease32 | Load8 => storeReleaseByte | _ => raise InternalError "StoreRelease: Unsupported size" (* Not used *) in toAssembler(rest, code <::> storeInstr{regT=regT, regN=regN}) end | toAssembler(LoadAcquireExclusiveRegister{regN, regT} :: rest, code) = toAssembler(rest, code <::> loadAcquireExclusiveRegister{regN=regN, regT=regT}) | toAssembler(StoreReleaseExclusiveRegister{regN, regT, regS} :: rest, code) = toAssembler(rest, code <::> storeReleaseExclusiveRegister{regN=regN, regT=regT, regS=regS}) | toAssembler(MemBarrier :: rest, code) = toAssembler(rest, code <::> dmbIsh) | toAssembler(LoadRegPair{ regT1, regT2, regN, unitOffset, loadType, unscaledType} :: rest, code) = let val instr = case (loadType, unscaledType) of (Load64, NoUpdate) => loadPairOffset | (Load64, PreIndex) => loadPairPreIndexed | (Load64, PostIndex) => loadPairPostIndexed | (Load32, NoUpdate) => loadPairOffset32 | (Load32, PreIndex) => loadPairPreIndexed32 | (Load32, PostIndex) => loadPairPostIndexed32 | _ => raise InternalError "LoadRegPair: unimplemented" in toAssembler(rest, code <::> instr{regT1=regT1, regT2=regT2, regN=regN, 
unitOffset=unitOffset}) end | toAssembler(StoreRegPair{ regT1, regT2, regN, unitOffset, loadType, unscaledType} :: rest, code) = let val instr = case (loadType, unscaledType) of (Load64, NoUpdate) => storePairOffset | (Load64, PreIndex) => storePairPreIndexed | (Load64, PostIndex) => storePairPostIndexed | (Load32, NoUpdate) => storePairOffset32 | (Load32, PreIndex) => storePairPreIndexed32 | (Load32, PostIndex) => storePairPostIndexed32 | _ => raise InternalError "StoreRegPair: unimplemented" in toAssembler(rest, code <::> instr{regT1=regT1, regT2=regT2, regN=regN, unitOffset=unitOffset}) end | toAssembler(LoadFPRegPair{ regT1, regT2, regN, unitOffset, floatSize, unscaledType} :: rest, code) = let val instr = case (floatSize, unscaledType) of (Double64, NoUpdate) => loadPairOffsetDouble | (Double64, PreIndex) => loadPairPreIndexedDouble | (Double64, PostIndex) => loadPairPostIndexedDouble | (Float32, NoUpdate) => loadPairOffsetFloat | (Float32, PreIndex) => loadPairPreIndexedFloat | (Float32, PostIndex) => loadPairPostIndexedFloat in toAssembler(rest, code <::> instr{regT1=regT1, regT2=regT2, regN=regN, unitOffset=unitOffset}) end | toAssembler(StoreFPRegPair{ regT1, regT2, regN, unitOffset, floatSize, unscaledType} :: rest, code) = let val instr = case (floatSize, unscaledType) of (Double64, NoUpdate) => storePairOffsetDouble | (Double64, PreIndex) => storePairPreIndexedDouble | (Double64, PostIndex) => storePairPostIndexedDouble | (Float32, NoUpdate) => storePairOffsetFloat | (Float32, PreIndex) => storePairPreIndexedFloat | (Float32, PostIndex) => storePairPostIndexedFloat in toAssembler(rest, code <::> instr{regT1=regT1, regT2=regT2, regN=regN, unitOffset=unitOffset}) end | toAssembler(ConditionalSet{regD, regTrue, regFalse, cond, condSet, opSize} :: rest, code) = let val instr = case (condSet, opSize) of (CondSet, OpSize64) => conditionalSet | (CondSetIncr, OpSize64) => conditionalSetIncrement | (CondSetInvert, OpSize64) => conditionalSetInverted | (CondSetNegate, OpSize64) => conditionalSetNegated | (CondSet, OpSize32) => conditionalSet32 | (CondSetIncr, OpSize32) => conditionalSetIncrement32 | (CondSetInvert, OpSize32) => conditionalSetInverted32 | (CondSetNegate, OpSize32) => conditionalSetNegated32 in toAssembler(rest, code <::> instr{regD=regD, regTrue=regTrue, regFalse=regFalse, cond=cond}) end | toAssembler(BitField{immr, imms, regN, regD, opSize, bitfieldKind} :: rest, code) = let val bfInstr = case (bitfieldKind, opSize) of (BFSigned, OpSize64) => signedBitfieldMove64 | (BFUnsigned, OpSize64) => unsignedBitfieldMove64 | (BFInsert, OpSize64) => bitfieldMove64 | (BFSigned, OpSize32) => signedBitfieldMove32 | (BFUnsigned, OpSize32) => unsignedBitfieldMove32 | (BFInsert, OpSize32) => bitfieldMove32 in toAssembler(rest, code <::> bfInstr{immr=immr, imms=imms, regN=regN, regD=regD}) end | toAssembler(ShiftRegisterVariable{regM, regN, regD, opSize, shiftDirection} :: rest, code) = let val instr = case (shiftDirection, opSize) of (ShiftLeft, OpSize64) => logicalShiftLeftVariable | (ShiftLeft, OpSize32) => logicalShiftLeftVariable32 | (ShiftRightLogical, OpSize64) => logicalShiftRightVariable | (ShiftRightLogical, OpSize32) => logicalShiftRightVariable32 | (ShiftRightArithmetic, OpSize64) => arithmeticShiftRightVariable | (ShiftRightArithmetic, OpSize32) => arithmeticShiftRightVariable32 in toAssembler(rest, code <::> instr{regN=regN, regM=regM, regD=regD}) end | toAssembler(BitwiseLogical{ bits, regN, regD, opSize, setFlags, logOp} :: rest, code) = let val instr = case (logOp, 
setFlags, opSize) of (LogAnd, false, OpSize64) => bitwiseAndImmediate | (LogAnd, true, OpSize64) => bitwiseAndSImmediate | (LogOr, false, OpSize64) => bitwiseOrImmediate | (LogXor, false, OpSize64) => bitwiseXorImmediate | (LogAnd, false, OpSize32) => bitwiseAndImmediate32 | (LogAnd, true, OpSize32) => bitwiseAndSImmediate32 | (LogOr, false, OpSize32) => bitwiseOrImmediate32 | (LogXor, false, OpSize32) => bitwiseXorImmediate32 | _ => raise InternalError "flags not valid with OR or XOR" in toAssembler(rest, code <::> instr{regN=regN, regD=regD, bits=bits}) end | toAssembler(MoveGeneralToFP{ regN, regD, floatSize=Float32} :: rest, code) = toAssembler(rest, code <::> moveGeneralToFloat{regN=regN, regD=regD}) | toAssembler(MoveGeneralToFP{ regN, regD, floatSize=Double64} :: rest, code) = toAssembler(rest, code <::> moveGeneralToDouble{regN=regN, regD=regD}) | toAssembler(MoveFPToGeneral{ regN, regD, floatSize=Float32} :: rest, code) = toAssembler(rest, code <::> moveFloatToGeneral{regN=regN, regD=regD}) | toAssembler(MoveFPToGeneral{ regN, regD, floatSize=Double64} :: rest, code) = toAssembler(rest, code <::> moveDoubleToGeneral{regN=regN, regD=regD}) | toAssembler(CvtIntToFP{ regN, regD, floatSize, opSize} :: rest, code) = let val instr = case (opSize, floatSize) of (OpSize32, Float32) => convertInt32ToFloat | (OpSize64, Float32) => convertIntToFloat | (OpSize32, Double64) => convertInt32ToDouble | (OpSize64, Double64) => convertIntToDouble in toAssembler(rest, code <::> instr{regN=regN, regD=regD}) end | toAssembler(CvtFloatToInt{ round, regN, regD, floatSize, opSize} :: rest, code) = let val instr = case (floatSize, opSize) of (Float32, OpSize32) => convertFloatToInt32 | (Float32, OpSize64) => convertFloatToInt | (Double64, OpSize32) => convertDoubleToInt32 | (Double64, OpSize64) => convertDoubleToInt in toAssembler(rest, code <::> instr round {regN=regN, regD=regD}) end | toAssembler(FPBinaryOp{ regM, regN, regD, floatSize, fpOp} :: rest, code) = let val instr = case (fpOp, floatSize) of (MultiplyFP, Float32) => multiplyFloat | (DivideFP, Float32) => divideFloat | (AddFP, Float32) => addFloat | (SubtractFP, Float32) => subtractFloat | (MultiplyFP, Double64) => multiplyDouble | (DivideFP, Double64) => divideDouble | (AddFP, Double64) => addDouble | (SubtractFP, Double64) => subtractDouble in toAssembler(rest, code <::> instr {regN=regN, regM=regM, regD=regD}) end | toAssembler(FPComparison{ regM, regN, floatSize} :: rest, code) = toAssembler(rest, code <::> (case floatSize of Float32 => compareFloat | Double64 => compareDouble){regN=regN, regM=regM}) | toAssembler(FPUnaryOp{ regN, regD, fpOp} :: rest, code) = let val instr = case fpOp of NegFloat => negFloat | NegDouble => negDouble | AbsFloat => absFloat | AbsDouble => absDouble | ConvFloatToDble => convertFloatToDouble | ConvDbleToFloat => convertDoubleToFloat in toAssembler(rest, code <::> instr {regN=regN, regD=regD}) end | toAssembler(SetLabel label :: rest, code) = toAssembler(rest, code <::> setLabel label) | toAssembler(ConditionalBranch(cond, label) :: rest, code) = toAssembler(rest, code <::> conditionalBranch(cond, label)) | toAssembler(UnconditionalBranch label :: rest, code) = toAssembler(rest, code <::> unconditionalBranch label) | toAssembler(BranchAndLink label :: rest, code) = toAssembler(rest, code <::> branchAndLink label) | toAssembler(BranchReg{regD, brRegType=BRRBranch} :: rest, code) = toAssembler(rest, code <::> branchRegister regD) | toAssembler(BranchReg{regD, brRegType=BRRAndLink} :: rest, code) = 
toAssembler(rest, code <::> branchAndLinkReg regD) | toAssembler(BranchReg{regD, brRegType=BRRReturn} :: rest, code) = toAssembler(rest, code <::> returnRegister regD) | toAssembler(LoadLabelAddress(reg, label) :: rest, code) = toAssembler(rest, code <::> loadLabelAddress(reg, label)) | toAssembler(TestBitBranch{ test, bit, label, onZero } :: rest, code) = toAssembler(rest, code <::> (if onZero then testBitBranchZero else testBitBranchNonZero)(test, bit, label)) | toAssembler(CompareBranch{ test, label, onZero, opSize } :: rest, code) = let val instr = case (onZero, opSize) of (true, OpSize64) => compareBranchZero | (false, OpSize64) => compareBranchNonZero | (true, OpSize32) => compareBranchZero32 | (false, OpSize32) => compareBranchNonZero32 in toAssembler(rest, code <::> instr(test, label)) end (* Register-register moves - special case for XSP. *) | toAssembler(MoveXRegToXReg{sReg=XSP, dReg} :: rest, code) = toAssembler(rest, code <::> addImmediate{regN=XSP, regD=dReg, immed=0w0, shifted=false}) | toAssembler(MoveXRegToXReg{sReg, dReg=XSP} :: rest, code) = toAssembler(rest, code <::> addImmediate{regN=sReg, regD=XSP, immed=0w0, shifted=false}) | toAssembler(MoveXRegToXReg{sReg, dReg} :: rest, code) = toAssembler(rest, code <::> orrShiftedReg{regN=XZero, regM=sReg, regD=dReg, shift=ShiftNone}) | toAssembler(LoadNonAddr(xReg, value) :: rest, code) = let (* Load a non-address constant. Tries to use movz/movn/movk if that can be done easily, othewise uses loadNonAddressConstant to load the value from the non-address constant area. *) fun extW (v, h) = Word.andb(Word.fromLarge(LargeWord.>>(Word64.toLarge v, h*0w16)), 0wxffff) val hw0 = extW(value, 0w3) and hw1 = extW(value, 0w2) and hw2 = extW(value, 0w1) and hw3 = extW(value, 0w0) val nextCode = if value < 0wx100000000 then let (* 32-bit constants can be loaded using at most a movz and movk but various cases can be reduced since all 32-bit operations set the top word to zero. *) val hi = hw2 and lo = hw3 in (* 32-bit constants can be loaded with at most a movz and a movk but it may be that there is something shorter. *) if hi = 0w0 then code <::> moveZero32{regD=xReg, immediate=lo, shift=0w0} else if hi = 0wxffff then code <::> moveNot32{regD=xReg, immediate=Word.xorb(0wxffff, lo), shift=0w0} else if lo = 0w0 then code <::> moveZero32{regD=xReg, immediate=hi, shift=0w16} else if isEncodableBitPattern(value, WordSize32) then code <::> bitwiseOrImmediate32{bits=value, regN=XZero, regD=xReg} else (* Have to use two instructions *) code <::> moveZero32{regD=xReg, immediate=lo, shift=0w0} <::> moveKeep{regD=xReg, immediate=hi, shift=0w16} end else if hw0 = 0wxffff andalso hw1 = 0wxffff andalso hw2 = 0wxffff then code <::> moveNot{regD=xReg, immediate=Word.xorb(0wxffff, hw3), shift=0w0} else if hw1 = 0w0 andalso hw2 = 0w0 then (* This is common for length words with a flags byte *) code <::> moveZero32{regD=xReg, immediate=hw3, shift=0w0} <::> moveKeep{regD=xReg, immediate=hw0, shift=0w48} else code <::> loadNonAddressConstant(xReg, value) in toAssembler(rest, nextCode) end | toAssembler(LoadAddr(dReg, source) :: rest, code) = toAssembler(rest, loadAddressConstant(dReg, source) :: code) | toAssembler(RTSTrap{ rtsEntry, work, save } :: rest, code) = let (* Because X30 is used in the branchAndLink it has to be pushed across any trap. 
*) val saveX30 = List.exists (fn r => r = X30) save val preserve = List.filter (fn r => r <> X30) save in toAssembler(rest, code <@> (if saveX30 then [storeRegPreIndex{regT=X30, regN=X_MLStackPtr, byteOffset= ~8}] else []) <::> loadRegScaled{regT=work, regN=X_MLAssemblyInt, unitOffset=rtsEntry} <::> branchAndLinkReg work <::> registerMask preserve <@> (if saveX30 then [loadRegPostIndex{regT=X30, regN=X_MLStackPtr, byteOffset= 8}] else []) ) end | toAssembler(AllocateMemoryFixedSize{ bytes, dest, save, work } :: rest, code) = let val label = createLabel() val saveX30 = List.exists (fn r => r = X30) save val preserve = List.filter (fn r => r <> X30) save val allocCode = code <::> (* Subtract the number of bytes required from the heap pointer. *) subImmediate{regN=X_MLHeapAllocPtr, regD=dest, immed=bytes, shifted=false} <::> (* Compare the result with the heap limit. *) subSShiftedReg{regM=X_MLHeapLimit, regN=dest, regD=XZero, shift=ShiftNone} <::> conditionalBranch(CondCarrySet, label) <@> (if saveX30 then [storeRegPreIndex{regT=X30, regN=X_MLStackPtr, byteOffset= ~8}] else []) <::> loadRegScaled{regT=work, regN=X_MLAssemblyInt, unitOffset=heapOverflowCallOffset} <::> branchAndLinkReg work <::> registerMask preserve <@> (if saveX30 then [loadRegPostIndex{regT=X30, regN=X_MLStackPtr, byteOffset= 8}] else []) <::> setLabel label <::> (* Update the heap pointer. *) orrShiftedReg{regN=XZero, regM=dest, regD=X_MLHeapAllocPtr, shift=ShiftNone} in toAssembler(rest, allocCode) end | toAssembler(AllocateMemoryVariableSize{ sizeReg, dest, save, work } :: rest, code) = let val trapLabel = createLabel() and noTrapLabel = createLabel() val saveX30 = List.exists (fn r => r = X30) save val preserve = List.filter (fn r => r <> X30) save val allocCode = ( (* Subtract the size into the result register. Subtract a further word for the length word and round down in 32-in-64. *) if is32in64 then code <::> subShiftedReg{regM=sizeReg, regN=X_MLHeapAllocPtr, regD=dest, shift=ShiftLSL 0w2} <::> subImmediate{regN=dest, regD=dest, immed=0w4, shifted=false} <::> bitwiseAndImmediate{bits= ~ 0w8, regN=dest, regD=dest} else code <::> subShiftedReg{regM=sizeReg, regN=X_MLHeapAllocPtr, regD=dest, shift=ShiftLSL 0w3} <::> subImmediate{regN=dest, regD=dest, immed=0w8, shifted=false} ) <::> (* Check against the limit. If the size is large enough it is possible that this could wrap round. To check for that we trap if either the result is less than the limit or if it is now greater than the allocation pointer. *) subSShiftedReg{regM=X_MLHeapLimit, regN=dest, regD=XZero, shift=ShiftNone} <::> conditionalBranch(CondCarryClear, trapLabel) <::> subSShiftedReg{regM=X_MLHeapAllocPtr, regN=dest, regD=XZero, shift=ShiftNone} <::> conditionalBranch(CondCarryClear, noTrapLabel) <::> setLabel trapLabel <@> (if saveX30 then [storeRegPreIndex{regT=X30, regN=X_MLStackPtr, byteOffset= ~8}] else []) <::> loadRegScaled{regT=work, regN=X_MLAssemblyInt, unitOffset=heapOverflowCallOffset} <::> branchAndLinkReg work <::> registerMask preserve <@> (if saveX30 then [loadRegPostIndex{regT=X30, regN=X_MLStackPtr, byteOffset= 8}] else []) <::> setLabel noTrapLabel <::> (* Update the heap pointer. 
*) orrShiftedReg{regN=XZero, regM=dest, regD=X_MLHeapAllocPtr, shift=ShiftNone} in toAssembler(rest, allocCode) end | toAssembler(BranchTable{ startLabel, brTable } :: rest, code) = toAssembler(rest, List.foldl (fn (label, code) => (unconditionalBranch label) :: code) (code <::> setLabel startLabel) brTable) + | toAssembler(LoadGlobalHeapBaseInCallback dest :: rest, code) = + toAssembler(rest, + code <@> List.rev(loadGlobalHeapBaseInCallback dest)) + + fun toInstr precode = case toAssembler([precode], []) of [single] => single | _ => raise InternalError "toInstr" (* Take a forward order sequence of instructions and generate a forward order output sequence. *) fun toInstrs precode = List.rev(toAssembler(precode, [])) (* Constant shifts are encoded in the immr and imms fields of the bit-field instruction. *) fun shiftConstant{ direction, regD, regN, shift, opSize } = let val (bitfieldKind, immr, imms) = case (direction, opSize) of (ShiftLeft, OpSize64) => (BFUnsigned, Word.~ shift mod 0w64, 0w64-0w1-shift) | (ShiftLeft, OpSize32) => (BFUnsigned, Word.~ shift mod 0w32, 0w32-0w1-shift) | (ShiftRightLogical, OpSize64) => (BFUnsigned, shift, 0wx3f) | (ShiftRightLogical, OpSize32) => (BFUnsigned, shift, 0wx1f) | (ShiftRightArithmetic, OpSize64) => (BFSigned, shift, 0wx3f) | (ShiftRightArithmetic, OpSize32) => (BFSigned, shift, 0wx1f) in BitField{ regN=regN, regD=regD, opSize=opSize, immr=immr, imms=imms, bitfieldKind=bitfieldKind } end (* These sequences are used both in the ML code-generator and in the FFI code so it is convenient to have them here and share the code. *) local fun allocateWords(fixedReg, workReg, words, bytes, regMask, code) = let val (lengthWord, setLength, flagShift) = if is32in64 then (~4, Load32, 0w24) else (~8, Load64, 0w56) in code <::> AllocateMemoryFixedSize{ bytes=bytes, dest=fixedReg, save=regMask, work=X16 } <::> LoadNonAddr(workReg, Word64.orb(words, Word64.<<(Word64.fromLarge(Word8.toLarge Address.F_bytes), flagShift))) <::> (* Store the length word. Have to use the unaligned version because offset is -ve. *) StoreRegUnscaled{regT=workReg, regN=fixedReg, byteOffset= lengthWord, loadType=setLength, unscaledType=NoUpdate} end fun absoluteAddressToIndex(reg, code) = if is32in64 then code <::> SubShiftedReg{regM=X_Base32in64, regN=reg, regD=reg, shift=ShiftNone, opSize=OpSize64, setFlags=false} <::> shiftConstant{direction=ShiftRightLogical, regN=reg, regD=reg, shift=0w2, opSize=OpSize64} else code in fun boxDouble({source, destination, workReg, saveRegs}, code) = absoluteAddressToIndex(destination, allocateWords(destination, workReg, if is32in64 then 0w2 else 0w1, 0w16, saveRegs, code) <::> StoreFPRegScaled{regT=source, regN=destination, unitOffset=0, floatSize=Double64}) and boxSysWord({source, destination, workReg, saveRegs}, code) = absoluteAddressToIndex(destination, allocateWords(destination, workReg, if is32in64 then 0w2 else 0w1, 0w16, saveRegs, code) <::> StoreRegScaled{regT=source, regN=destination, unitOffset=0, loadType=Load64}) and boxFloat({source, destination, workReg, saveRegs}, code) = absoluteAddressToIndex(destination, allocateWords(destination, workReg, 0w1, 0w8, saveRegs, code) <::> StoreFPRegScaled{regT=source, regN=destination, unitOffset=0, floatSize=Float32}) end (* Optimise the pre-assembler code and then generate the final code. 
*) fun generateFinalCode {instrs, name, parameters, resultClosure, profileObject, labelCount=_} = let in generateCode{instrs=List.rev(toAssembler(instrs, [])), name=name, parameters=parameters, resultClosure=resultClosure, profileObject=profileObject} end structure Sharing = struct type closureRef = closureRef type loadType = loadType type opSize = opSize type logicalOp = logicalOp type floatSize = floatSize type shiftDirection = shiftDirection type multKind = multKind type fpUnary = fpUnary type fpBinary = fpBinary type unscaledType = unscaledType type condSet = condSet type bitfieldKind = bitfieldKind type brRegType = brRegType type precode = precode type xReg = xReg type vReg = vReg type precodeLabel = precodeLabel type condition = condition type shiftType = shiftType type wordSize = wordSize type 'a extend = 'a extend type scale = scale type instr = instr end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64Sequences.sml b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64Sequences.sml deleted file mode 100644 index 46303d15..00000000 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64Sequences.sml +++ /dev/null @@ -1,181 +0,0 @@ -(* - Copyright (c) 2021 David C. J. Matthews - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - Licence version 2.1 as published by the Free Software Foundation. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public Licence for more details. - - You should have received a copy of the GNU Lesser General Public - Licence along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -*) - -(* Sequences of instructions for various purposes that are - used both in the main code-generator and the FFI code. *) - -functor Arm64Sequences ( - structure Arm64Assembly: ARM64ASSEMBLY -) : ARM64SEQUENCES = - -struct - open Arm64Assembly - - (* Move register. If we're moving to/from SP we have to use addImmediate *) - fun moveRegToReg{sReg=XSP, dReg} = - addImmediate{regN=XSP, regD=dReg, immed=0w0, shifted=false} - | moveRegToReg{sReg, dReg=XSP} = - addImmediate{regN=sReg, regD=XSP, immed=0w0, shifted=false} - | moveRegToReg{sReg, dReg} = - orrShiftedReg{regN=XZero, regM=sReg, regD=dReg, shift=ShiftNone} - - (* Load a non-address constant. Tries to use movz/movn/movk if - that can be done easily, othewise uses loadNonAddressConstant to - load the value from the non-address constant area. *) - local - fun extW (v, h) = Word.andb(Word.fromLarge(LargeWord.>>(Word64.toLarge v, h*0w16)), 0wxffff) - (* Use one or more movk instructions to change the current value into - the target value. 
*) - (*fun modifyValue(xReg, current, target) = - if current = target - then [] - else - let - val curr0 = extW(current, 0w3) and curr1 = extW(current, 0w2) - and curr2 = extW(current, 0w1) - and targ0 = extW(target, 0w3) and targ1 = extW(target, 0w2) - and targ2 = extW(target, 0w1) and targ3 = extW(target, 0w0) - fun replace(v, s) = - Word64.orb(Word64.<<(Word64.fromLarge(Word.toLarge v), s), - Word64.andb(current, Word64.notb(Word64.<<(0wxffff, s)))) - in - if curr0 <> targ0 - then moveKeep{regD=xReg, immediate=targ0, shift=0w48} :: - modifyValue(xReg, replace(targ0, 0w48), target) - else if curr1 <> targ1 - then moveKeep{regD=xReg, immediate=targ1, shift=0w32} :: - modifyValue(xReg, replace(targ1, 0w32), target) - else if curr2 <> targ2 - then moveKeep{regD=xReg, immediate=targ2, shift=0w16} :: - modifyValue(xReg, replace(targ2, 0w16), target) - else [moveKeep{regD=xReg, immediate=targ3, shift=0w0}] - end*) - in - fun loadNonAddress(xReg, value) = - let - val hw0 = extW(value, 0w3) and hw1 = extW(value, 0w2) - and hw2 = extW(value, 0w1) and hw3 = extW(value, 0w0) - in - if value < 0wx100000000 - then - let - (* 32-bit constants can be loaded using at most a movz and movk but - various cases can be reduced since all 32-bit operations set - the top word to zero. *) - val hi = hw2 - and lo = hw3 - in - (* 32-bit constants can be loaded with at most a movz and a movk but - it may be that there is something shorter. *) - if hi = 0w0 - then [moveZero32{regD=xReg, immediate=lo, shift=0w0}] - else if hi = 0wxffff - then [moveNot32{regD=xReg, immediate=Word.xorb(0wxffff, lo), shift=0w0}] - else if lo = 0w0 - then [moveZero32{regD=xReg, immediate=hi, shift=0w16}] - else if isEncodableBitPattern(value, WordSize32) - then [bitwiseOrImmediate32{bits=value, regN=XZero, regD=xReg}] - else (* Have to use two instructions *) - [ - moveZero32{regD=xReg, immediate=lo, shift=0w0}, - moveKeep{regD=xReg, immediate=hi, shift=0w16} - ] - end - else if hw0 = 0wxffff andalso hw1 = 0wxffff andalso hw2 = 0wxffff - then [moveNot{regD=xReg, immediate=Word.xorb(0wxffff, hw3), shift=0w0}] - else if hw1 = 0w0 andalso hw2 = 0w0 - then (* This is common for length words with a flags byte *) - [ - moveZero32{regD=xReg, immediate=hw3, shift=0w0}, - moveKeep{regD=xReg, immediate=hw0, shift=0w48} - ] - else [loadNonAddressConstant(xReg, value)] - - end - end - - local - fun allocateWords(fixedReg, workReg, words, bytes, regMask) = - let - val label = createLabel() - (* Because X30 is used in the branchAndLink it has to be pushed - across any trap. *) - val saveX30 = List.exists (fn r => r = X30) regMask - val preserve = List.filter (fn r => r <> X30) regMask - in - [ - (* Subtract the number of bytes required from the heap pointer. *) - subImmediate{regN=X_MLHeapAllocPtr, regD=fixedReg, immed=bytes, shifted=false}, - (* Compare the result with the heap limit. *) - subSShiftedReg{regM=X_MLHeapLimit, regN=fixedReg, regD=XZero, shift=ShiftNone}, - conditionalBranch(CondCarrySet, label) - ] @ - (if saveX30 then [storeRegPreIndex{regT=X30, regN=X_MLStackPtr, byteOffset= ~8}] else []) @ - [ - loadRegScaled{regT=X16, regN=X_MLAssemblyInt, unitOffset=heapOverflowCallOffset}, - branchAndLinkReg X16, - registerMask preserve - ] @ - (if saveX30 then [loadRegPostIndex{regT=X30, regN=X_MLStackPtr, byteOffset= 8}] else []) @ - [ - setLabel label, - (* Update the heap pointer. 
*) - moveRegToReg{sReg=fixedReg, dReg=X_MLHeapAllocPtr} - ] @ - loadNonAddress(workReg, - Word64.orb(words, Word64.<<(Word64.fromLarge(Word8.toLarge Address.F_bytes), - if is32in64 then 0w24 else 0w56))) - @ - [ - (* Store the length word. Have to use the unaligned version because offset is -ve. *) - if is32in64 - then storeRegUnscaled32{regT=workReg, regN=fixedReg, byteOffset= ~4} - else storeRegUnscaled{regT=workReg, regN=fixedReg, byteOffset= ~8} - ] - end - - fun absoluteAddressToIndex reg = - if is32in64 - then - [ - subShiftedReg{regM=X_Base32in64, regN=reg, regD=reg, shift=ShiftNone}, - logicalShiftRight{shift=0w2, regN=reg, regD=reg} - ] - else [] - - in - fun boxDouble{source, destination, workReg, saveRegs} = - allocateWords(destination, workReg, if is32in64 then 0w2 else 0w1, 0w16, saveRegs) @ - storeRegScaledDouble{regT=source, regN=destination, unitOffset=0} :: - absoluteAddressToIndex destination - and boxSysWord{source, destination, workReg, saveRegs} = - allocateWords(destination, workReg, if is32in64 then 0w2 else 0w1, 0w16, saveRegs) @ - storeRegScaled{regT=source, regN=destination, unitOffset=0} :: - absoluteAddressToIndex destination - and boxFloat{source, destination, workReg, saveRegs} = - allocateWords(destination, workReg, 0w1, 0w8, saveRegs) @ - storeRegScaledFloat{regT=source, regN=destination, unitOffset=0} :: - absoluteAddressToIndex destination - end - - structure Sharing = - struct - type instr = instr - type xReg = xReg - type vReg = vReg - end -end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/ml_bind.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/ml_bind.ML index 8a58b0d2..53af1a62 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/ml_bind.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/ml_bind.ML @@ -1,121 +1,113 @@ (* Copyright (c) 2021-2 David C. J. Matthews This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public Licence version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public Licence for more details. 
You should have received a copy of the GNU Lesser General Public Licence along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) local structure Arm64Assembly = Arm64Assembly ( structure Debug = Debug and Pretty = Pretty and CodeArray = CodeArray ) - - structure Arm64Sequences = - Arm64Sequences ( - structure Arm64Assembly = Arm64Assembly - ) structure Arm64PreAssembly = Arm64PreAssembly ( structure Arm64Assembly = Arm64Assembly and Debug = Debug and Pretty = Pretty ) structure Arm64Foreign = Arm64ForeignCall ( structure CodeArray = CodeArray and Arm64PreAssembly = Arm64PreAssembly - and Arm64Assembly = Arm64Assembly - and Arm64Sequences = Arm64Sequences and Debug = Debug ) structure Arm64ICode = Arm64ICode ( structure Arm64Code = Arm64PreAssembly ) structure Arm64ICodeIdentify = Arm64IdentifyReferences ( structure Debug = Debug structure Arm64ICode = Arm64ICode structure IntSet = IntSet ) structure Arm64PushRegs = Arm64PushRegisters ( structure Arm64ICode = Arm64ICode structure IntSet = IntSet structure Identify = Arm64ICodeIdentify ) structure Arm64Opt = Arm64ICodeOptimise ( structure Arm64ICode = Arm64ICode structure IntSet = IntSet structure Identify = Arm64ICodeIdentify structure Debug = Debug structure Pretty = Pretty ) structure Arm64IAllocate = Arm64AllocateRegisters ( structure Arm64ICode = Arm64ICode structure Identify = Arm64ICodeIdentify structure IntSet = IntSet ) structure Arm64ICodeGenerate = Arm64ICodeToArm64Code ( structure Debug = Debug structure Arm64ICode = Arm64ICode structure Identify = Arm64ICodeIdentify structure Pretty = Pretty structure IntSet = IntSet structure Arm64PreAssembly = Arm64PreAssembly structure Arm64Assembly = Arm64Assembly - structure Arm64Sequences = Arm64Sequences structure Strongly = StronglyConnected ) structure Arm64ICodeTransform = Arm64ICodeTransform ( structure Debug = Debug structure Arm64ICode = Arm64ICode structure Identify = Arm64ICodeIdentify structure Allocate = Arm64IAllocate structure PushRegisters = Arm64PushRegs structure Optimise = Arm64Opt structure Pretty = Pretty structure IntSet = IntSet structure Codegen = Arm64ICodeGenerate ) in structure Arm64Code = Arm64CodetreeToICode ( structure BackendTree = BackendIntermediateCode structure Debug = Debug - structure Arm64ICode = Arm64ICode + structure Arm64ICode = Arm64ICode structure Arm64Foreign = Arm64Foreign structure ICodeTransform = Arm64ICodeTransform structure CodeArray = CodeArray and Pretty = Pretty ) end; diff --git a/polymlArm64.pyp b/polymlArm64.pyp index f8c402f9..28ef5ae8 100644 --- a/polymlArm64.pyp +++ b/polymlArm64.pyp @@ -1,243 +1,240 @@ - - -