diff --git a/RootArm64.ML b/RootArm64.ML index cb340ef0..95a798e8 100644 --- a/RootArm64.ML +++ b/RootArm64.ML @@ -1,153 +1,155 @@ (* Copyright (c) 2021 David C. J. Matthews This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) (* Compiler root file for Arm64. This gives the "use" instructions necessary to build the compiler and suitable for use with an IDE project file. It was constructed from the Poly/ML make files. *) PolyML.print_depth 1; PolyML.Compiler.reportUnreferencedIds := true; use "mlsource/MLCompiler/Address.ML"; use "mlsource/MLCompiler/Misc.ML"; use "mlsource/MLCompiler/HashTable.ML"; use "mlsource/MLCompiler/UniversalTable.ML"; use "mlsource/MLCompiler/StronglyConnected.sml"; use "mlsource/MLCompiler/StretchArray.ML"; use "mlsource/MLCompiler/STRUCTVALSIG.sml"; use "mlsource/MLCompiler/PRETTY.sig"; use "mlsource/MLCompiler/LEXSIG.sml"; use "mlsource/MLCompiler/SymbolsSig.sml"; use "mlsource/MLCompiler/COMPILERBODYSIG.sml"; use "mlsource/MLCompiler/DEBUG.sig"; use "mlsource/MLCompiler/MAKESIG.sml"; use "mlsource/MLCompiler/MAKE_.ML"; use "mlsource/MLCompiler/FOREIGNCALL.sig"; use "mlsource/MLCompiler/BUILTINS.sml"; use "mlsource/MLCompiler/CODETREE.sig"; use "mlsource/MLCompiler/STRUCT_VALS.ML"; use "mlsource/MLCompiler/CodeTree/BACKENDINTERMEDIATECODE.sig"; use "mlsource/MLCompiler/CodeTree/BASECODETREE.sig"; use "mlsource/MLCompiler/CodeTree/CODETREEFUNCTIONS.sig"; use "mlsource/MLCompiler/CodeTree/CODEARRAY.sig"; use "mlsource/MLCompiler/CodeTree/CODEGENTREE.sig"; use "mlsource/MLCompiler/CodeTree/GENCODE.sig"; use "mlsource/MLCompiler/CodeTree/CodetreeFunctions.ML"; use "mlsource/MLCompiler/CodeTree/CodetreeStaticLinkAndCases.ML"; use "mlsource/MLCompiler/CodeTree/CodetreeCodegenConstantFunctions.ML"; use "mlsource/MLCompiler/CodeTree/CodetreeLambdaLift.ML"; use "mlsource/MLCompiler/CodeTree/CodetreeRemoveRedundant.ML"; use "mlsource/MLCompiler/CodeTree/CodetreeSimplifier.ML"; use "mlsource/MLCompiler/CodeTree/CodetreeOptimiser.ML"; use "mlsource/MLCompiler/CodeTree/CodeTreeConstruction.ML"; use "mlsource/MLCompiler/Pretty.sml"; use "mlsource/MLCompiler/CodeTree/CodeArray.ML"; use "mlsource/MLCompiler/Debug.ML"; use "mlsource/MLCompiler/CodeTree/BackendIntermediateCode.sml"; use "mlsource/MLCompiler/CodeTree/BaseCodeTree.ML"; use "mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ASSEMBLY.sig"; use "mlsource/MLCompiler/CodeTree/Arm64Code/Arm64Assembly.sml"; use "mlsource/MLCompiler/CodeTree/Arm64Code/ARM64SEQUENCES.sig"; use "mlsource/MLCompiler/CodeTree/Arm64Code/Arm64Sequences.sml"; +use "mlsource/MLCompiler/CodeTree/Arm64Code/ARM64PREASSEMBLY.sig"; +use "mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PreAssembly.ML"; use "mlsource/MLCompiler/CodeTree/Arm64Code/Arm64GenCode.sml"; use "mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODE.sig"; use "mlsource/MLCompiler/CodeTree/INTSET.sig"; use "mlsource/MLCompiler/CodeTree/IntSet.sml"; use "mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICode.ML"; use 
"mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ForeignCall.sml"; use "mlsource/MLCompiler/CodeTree/Arm64Code/ARM64IDENTIFYREFERENCES.sig"; use "mlsource/MLCompiler/CodeTree/Arm64Code/Arm64IdentifyReferences.ML"; use "mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODECONFLICTS.sig"; use "mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICodeConflicts.ML"; use "mlsource/MLCompiler/CodeTree/Arm64Code/ARM64PUSHREGISTERS.sig"; use "mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PushRegisters.ML"; use "mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODEOPTIMISE.sig"; use "mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICodeOptimise.ML"; use "mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ALLOCATEREGISTERS.sig"; use "mlsource/MLCompiler/CodeTree/Arm64Code/Arm64AllocateRegisters.ML"; use "mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODEGENERATE.sig"; use "mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICodeToArm64Code.ML"; use "mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODETRANSFORM.sig"; use "mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICodeTransform.ML"; use "mlsource/MLCompiler/CodeTree/Arm64Code/Arm64CodetreeToICode.ML"; use "mlsource/MLCompiler/CodeTree/Arm64Code/ml_bind.ML"; use "mlsource/MLCompiler/CodeTree/GCode.arm64.ML"; use "mlsource/MLCompiler/CodeTree/ml_bind.ML"; use "mlsource/MLCompiler/StructVals.ML"; use "mlsource/MLCompiler/LEX_.ML"; use "mlsource/MLCompiler/Symbols.ML"; use "mlsource/MLCompiler/Lex.ML"; use "mlsource/MLCompiler/SymsetSig.sml"; use "mlsource/MLCompiler/DATATYPEREPSIG.sml"; use "mlsource/MLCompiler/VALUEOPSSIG.sml"; use "mlsource/MLCompiler/EXPORTTREESIG.sml"; use "mlsource/MLCompiler/STRUCTURESSIG.sml"; use "mlsource/MLCompiler/COMPILER_BODY.ML"; use "mlsource/MLCompiler/SymSet.ML"; use "mlsource/MLCompiler/TYPETREESIG.sml"; use "mlsource/MLCompiler/COPIERSIG.sml"; use "mlsource/MLCompiler/TYPEIDCODESIG.sml"; use "mlsource/MLCompiler/DATATYPE_REP.ML"; use "mlsource/MLCompiler/PRINTTABLESIG.sml"; use "mlsource/MLCompiler/VALUE_OPS.ML"; use "mlsource/MLCompiler/TYPE_TREE.ML"; use "mlsource/MLCompiler/UTILITIES_.ML"; use "mlsource/MLCompiler/Utilities.ML"; use "mlsource/MLCompiler/PRINT_TABLE.ML"; use "mlsource/MLCompiler/PrintTable.ML"; use "mlsource/MLCompiler/ExportTree.sml"; use "mlsource/MLCompiler/ExportTreeStruct.sml"; use "mlsource/MLCompiler/TypeTree.ML"; use "mlsource/MLCompiler/COPIER.sml"; use "mlsource/MLCompiler/CopierStruct.sml"; use "mlsource/MLCompiler/TYPEIDCODE.sml"; use "mlsource/MLCompiler/TypeIDCodeStruct.sml"; use "mlsource/MLCompiler/DatatypeRep.ML"; use "mlsource/MLCompiler/ValueOps.ML"; use "mlsource/MLCompiler/PARSETREESIG.sml"; use "mlsource/MLCompiler/SIGNATURESSIG.sml"; use "mlsource/MLCompiler/DEBUGGER.sig"; use "mlsource/MLCompiler/STRUCTURES_.ML"; use "mlsource/MLCompiler/DEBUGGER_.sml"; use "mlsource/MLCompiler/Debugger.sml"; use "mlsource/MLCompiler/ParseTree/BaseParseTreeSig.sml"; use "mlsource/MLCompiler/ParseTree/BASE_PARSE_TREE.sml"; use "mlsource/MLCompiler/ParseTree/PrintParsetreeSig.sml"; use "mlsource/MLCompiler/ParseTree/PRINT_PARSETREE.sml"; use "mlsource/MLCompiler/ParseTree/ExportParsetreeSig.sml"; use "mlsource/MLCompiler/ParseTree/EXPORT_PARSETREE.sml"; use "mlsource/MLCompiler/ParseTree/TypeCheckParsetreeSig.sml"; use "mlsource/MLCompiler/ParseTree/TYPECHECK_PARSETREE.sml"; use "mlsource/MLCompiler/ParseTree/MatchCompilerSig.sml"; use "mlsource/MLCompiler/ParseTree/MATCH_COMPILER.sml"; use "mlsource/MLCompiler/ParseTree/CodegenParsetreeSig.sml"; use "mlsource/MLCompiler/ParseTree/CODEGEN_PARSETREE.sml"; use 
"mlsource/MLCompiler/ParseTree/PARSE_TREE.ML"; use "mlsource/MLCompiler/ParseTree/ml_bind.ML"; use "mlsource/MLCompiler/SIGNATURES.sml"; use "mlsource/MLCompiler/SignaturesStruct.sml"; use "mlsource/MLCompiler/Structures.ML"; use "mlsource/MLCompiler/PARSE_DEC.ML"; use "mlsource/MLCompiler/SKIPS_.ML"; use "mlsource/MLCompiler/Skips.ML"; use "mlsource/MLCompiler/PARSE_TYPE.ML"; use "mlsource/MLCompiler/ParseType.ML"; use "mlsource/MLCompiler/ParseDec.ML"; use "mlsource/MLCompiler/CompilerBody.ML"; use "mlsource/MLCompiler/CompilerVersion.sml"; use "mlsource/MLCompiler/Make.ML"; use "mlsource/MLCompiler/INITIALISE_.ML"; use "mlsource/MLCompiler/Initialise.ML"; use "mlsource/MLCompiler/ml_bind.ML"; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODE.sig b/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODE.sig index 27fe44c8..6caa1ce1 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODE.sig +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64ICODE.sig @@ -1,424 +1,430 @@ (* Signature for the high-level ARM64 code Copyright David C. J. Matthews 2021 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) signature ARM64ICODE = sig type machineWord = Address.machineWord type address = Address.address type closureRef (* Registers. *) datatype xReg = XReg of Word8.word | XZero | XSP and vReg = VReg of Word8.word (* It is simpler to use a single type for all registers. *) datatype reg = GenReg of xReg | FPReg of vReg val X0: xReg and X1: xReg and X2: xReg and X3: xReg and X4: xReg and X5: xReg and X6: xReg and X7: xReg and X8: xReg and X9: xReg and X10: xReg and X11: xReg and X12: xReg and X13: xReg and X14: xReg and X15: xReg and X16: xReg and X17: xReg and X18: xReg and X19: xReg and X20: xReg and X21: xReg and X22: xReg and X23: xReg and X24: xReg and X25: xReg and X26: xReg and X27: xReg and X28: xReg and X29: xReg and X30: xReg val V0: vReg and V1: vReg and V2: vReg and V3: vReg and V4: vReg and V5: vReg and V6: vReg and V7: vReg val is32in64: bool and isBigEndian: bool (* Condition for conditional branches etc. *) datatype condition = CondEqual (* Z=1 *) | CondNotEqual (* Z=0 *) | CondCarrySet (* C=1 *) | CondCarryClear (* C=0 *) | CondNegative (* N=1 *) | CondPositive (* N=0 imcludes zero *) | CondOverflow (* V=1 *) | CondNoOverflow (* V=0 *) | CondUnsignedHigher (* C=1 && Z=0 *) | CondUnsignedLowOrEq (* ! (C=1 && Z=0) *) | CondSignedGreaterEq (* N=V *) | CondSignedLess (* N<>V *) | CondSignedGreater (* Z==0 && N=V *) | CondSignedLessEq (* !(Z==0 && N=V) *) (* The shift used in arithemtic operations. *) and shiftType = ShiftLSL of Word8.word | ShiftLSR of Word8.word | ShiftASR of Word8.word | ShiftNone datatype preg = PReg of int (* A pseudo-register - an abstract register. *) (* If the value is zero we can use X0/W0. *) datatype pregOrZero = SomeReg of preg | ZeroReg (* A location on the stack. May be more than word if this is a container or a handler entry. 
*) datatype stackLocn = StackLoc of {size: int, rno: int } (* This combines pregKind and stackLocn. *) datatype regProperty = RegPropGeneral (* A general register. *) | RegPropUntagged (* An untagged general register. *) | RegPropStack of int (* A stack location or container. *) | RegPropCacheTagged | RegPropCacheUntagged | RegPropMultiple (* The result of a conditional or case. May be defined at multiple points. *) (* The reference to a condition code. *) datatype ccRef = CcRef of int - datatype loadType = Load64 | Load32 | Load16 | Load8 | LoadDouble | LoadFloat + datatype loadType = Load64 | Load32 | Load16 | Load8 and opSize = OpSize32 | OpSize64 and logicalOp = LogAnd | LogOr | LogXor and callKind = Recursive | ConstantCode of machineWord | FullCall and floatSize = Float32 | Double64 and shiftDirection = ShiftLeft | ShiftRightLogical | ShiftRightArithmetic and multKind = MultAdd32 | MultSub32 | MultAdd64 | MultSub64 | SignedMultAddLong (* 32bit*32bit + 64bit => 64Bit *) | SignedMultHigh (* High order part of 64bit*64Bit *) and fpUnary = NegFloat | NegDouble | AbsFloat | AbsDouble | ConvFloatToDble | ConvDbleToFloat and fpBinary = MultiplyFP | DivideFP | AddFP | SubtractFP (* Function calls can have an unlimited number of arguments so it isn't always going to be possible to load them into registers. *) datatype fnarg = ArgInReg of preg | ArgOnStack of { wordOffset: int, container: stackLocn, field: int } datatype arm64ICode = (* Move the contents of one preg to another. These are always 64-bits. *) MoveRegister of { source: preg, dest: preg } (* Numerical constant. *) | LoadNonAddressConstant of { source: Word64.word, dest: preg } (* Address constant. *) | LoadAddressConstant of { source: machineWord, dest: preg } (* Load a value into a register using a constant, signed, byte offset. The offset is in the range of -256 to (+4095*unit size). *) | LoadWithConstantOffset of { base: preg, dest: preg, byteOffset: int, loadType: loadType } + (* Similarly for FP registers. *) + | LoadFPWithConstantOffset of { base: preg, dest: preg, byteOffset: int, floatSize: floatSize } + (* Load a value into a register using an index register. *) | LoadWithIndexedOffset of { base: preg, dest: preg, index: preg, loadType: loadType } - (* Load an entry from the "memory registers". Used for ThreadSelf and CheckRTSException. - These are always 64-bit values. *) - | LoadMemReg of { wordOffset: int, dest: preg } + (* Ditto for FP. *) + | LoadFPWithIndexedOffset of { base: preg, dest: preg, index: preg, floatSize: floatSize } + + (* Returns the current thread ID. Always a 64-bit value.. *) + | GetThreadId of { dest: preg } (* Convert a 32-in-64 object index into an absolute address. *) | ObjectIndexAddressToAbsolute of { source: preg, dest: preg } (* Convert an absolute address into an object index. *) | AbsoluteToObjectIndex of { source: preg, dest: preg } (* Allocate a fixed sized piece of memory and puts the absolute address into dest. bytesRequired is the total number of bytes including the length word and any alignment necessary for 32-in-64. saveRegs is the list of registers that need to be saved if we need to do a garbage collection. *) | AllocateMemoryFixed of { bytesRequired: Word64.word, dest: preg, saveRegs: preg list } (* Allocate a piece of memory. The size argument is an untagged value containing the number of words i.e. the same value used for InitialiseMemory and to store in the length word. 
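As a sketch (the pseudo-register names are invented), allocating a vector of sizeReg words and zero-filling it could be expressed as the pair

        AllocateMemoryVariable {size = sizeReg, dest = addrReg, saveRegs = []}
        InitialiseMem {size = sizeReg, addr = addrReg, init = zeroValueReg}

with the same untagged word count in sizeReg feeding the allocation, the initialisation and the length word.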
*) | AllocateMemoryVariable of { size: preg, dest: preg, saveRegs: preg list } (* Initialise a piece of memory by writing "size" copies of the value in "init". N.B. The size is an untagged value containing the number of words. *) | InitialiseMem of { size: preg, addr: preg, init: preg } (* Mark the beginning of a loop. This is really only to prevent the initialisation code being duplicated in ICodeOptimise. *) | BeginLoop (* Set up the registers for a jump back to the start of a loop. *) | JumpLoop of { regArgs: {src: fnarg, dst: preg} list, stackArgs: {src: fnarg, wordOffset: int, stackloc: stackLocn} list, checkInterrupt: preg list option } (* Store a register using a constant, signed, byte offset. The offset is in the range of -256 to (+4095*unit size). *) | StoreWithConstantOffset of { source: preg, base: preg, byteOffset: int, loadType: loadType } + (* Ditto for FP regs. *) + | StoreFPWithConstantOffset of { source: preg, base: preg, byteOffset: int, floatSize: floatSize } + (* Store a register using an index register. *) | StoreWithIndexedOffset of { source: preg, base: preg, index: preg, loadType: loadType } + (* and for FP regs. *) + | StoreFPWithIndexedOffset of { source: preg, base: preg, index: preg, floatSize: floatSize } + (* Add/Subtract immediate. The destination is optional in which case XZero is used. ccRef is optional. If it is NONE the version of the instruction that does not generate a condition code is used. immed must be < 0wx1000. *) | AddSubImmediate of { source: preg, dest: preg option, ccRef: ccRef option, immed: word, isAdd: bool, length: opSize } (* Add/Subtract register. As with AddSubImmediate, both the destination and cc are optional. *) | AddSubRegister of { base: preg, shifted: preg, dest: preg option, ccRef: ccRef option, isAdd: bool, length: opSize, shift: shiftType } (* Bitwise logical operations. The immediate value must be a valid bit pattern. ccRef can only be SOME if logOp is LogAnd. *) | LogicalImmediate of { source: preg, dest: preg option, ccRef: ccRef option, immed: Word64.word, logOp: logicalOp, length: opSize } (* Register logical operations. ccRef can only be SOME if logOp is LogAnd.*) | LogicalRegister of { base: preg, shifted: preg, dest: preg option, ccRef: ccRef option, logOp: logicalOp, length: opSize, shift: shiftType } (* Shift a word by an amount specified in a register. *) | ShiftRegister of { direction: shiftDirection, dest: preg, source: preg, shift: preg, opSize: opSize } (* The various forms of multiply all take three arguments and the general form is dest = M * N +/- A.. *) | Multiplication of { kind: multKind, dest: preg, sourceA: preg option, sourceM: preg, sourceN: preg } (* Signed or unsigned division. Sets the result to zero if the divisor is zero. *) | Division of { isSigned: bool, dest: preg, dividend: preg, divisor: preg, opSize: opSize } (* Start of function. Set the register arguments. stackArgs is the list of stack arguments. If the function has a real closure regArgs includes the closure register (X8). The register arguments include the return register (X30). *) | BeginFunction of { regArgs: (preg * xReg) list, stackArgs: stackLocn list } (* Call a function. If the code address is a constant it is passed here. Otherwise the address is obtained by indirecting through X8 which has been loaded as one of the argument registers. The result is stored in the destination register. 
The "containers" argument is used to ensure that any container whose address is passed as one of the other arguments continues to be referenced until the function is called since there's a possibility that it isn't actually used after the function. *) | FunctionCall of { callKind: callKind, regArgs: (fnarg * xReg) list, stackArgs: fnarg list, dest: preg, saveRegs: preg list, containers: stackLocn list} (* Jump to a tail-recursive function. This is similar to FunctionCall but complicated for stack arguments because the stack and the return address need to be overwritten. stackAdjust is the number of words to remove (positive) or add (negative) to the stack before the call. currStackSize contains the number of items currently on the stack. *) | TailRecursiveCall of { callKind: callKind, regArgs: (fnarg * xReg) list, stackArgs: {src: fnarg, stack: int} list, stackAdjust: int, currStackSize: int } (* Return from the function. resultReg is the preg that contains the result, returnReg is the preg that contains the return address. *) | ReturnResultFromFunction of { resultReg: preg, returnReg: preg, numStackArgs: int } (* Raise an exception. The packet is always loaded into X0. *) | RaiseExceptionPacket of { packetReg: preg } (* Push a register to the stack. This is used both for a normal push, copies=1, and also to reserve a container. *) | PushToStack of { source: preg, copies: int, container: stackLocn } (* Load a register from the stack. The container is the stack location identifier, the field is an offset in a container. *) | LoadStack of { dest: preg, wordOffset: int, container: stackLocn, field: int } (* Store a value into the stack. *) | StoreToStack of { source: preg, container: stackLocn, field: int, stackOffset: int } (* Set the register to the address of the container i.e. a specific offset on the stack. *) | ContainerAddress of { dest: preg, container: stackLocn, stackOffset: int } (* Remove items from the stack. Used to remove containers or registers pushed to the stack.. *) | ResetStackPtr of { numWords: int } (* Tag a value by shifting and setting the tag bit. *) | TagValue of { source: preg, dest: preg, isSigned: bool, opSize: opSize } (* Shift a value to remove the tag bit. The cache is used if this is untagging a value that has previously been tagged. *) | UntagValue of { source: preg, dest: preg, isSigned: bool, opSize: opSize } (* Box a largeword value. Stores a value into a byte area. This can be implemented using AllocateMemoryFixed but keeping it separate makes optimisation easier. The result is always an address and needs to be converted to an object index on 32-in-64. *) | BoxLarge of { source: preg, dest: preg, saveRegs: preg list } (* Load a value from a box. This can be implemented using a load but is kept separate to simplify optimisation. The source is always an absolute address. *) | UnboxLarge of { source: preg, dest: preg } (* Convert a floating point value into a value suitable for storing in the heap. This normally involves boxing except that 32-bit floats can be tagged in native 64-bits. *) | BoxTagFloat of { floatSize: floatSize, source: preg, dest: preg, saveRegs: preg list } (* The reverse of BoxTagFloat. *) | UnboxTagFloat of { floatSize: floatSize, source: preg, dest: preg } (* Load a value with acquire semantics. This means that any other load in this thread after this sees the value of the shared memory at this point and not earlier. 
This is used for references and arrays to ensure that if another thread has built a data structure on the heap and then assigns the address to a shared ref this thread will see the updated heap and not any locally cached previous version. *) | LoadAcquire of { base: preg, dest: preg, loadType: loadType } (* Store a value with release semantics. This ensures that any other write completes before this operation and works with LoadAcquire. *) | StoreRelease of { base: preg, source: preg, loadType: loadType } (* This is a generalised constant shift which includes selection of a range of bits. *) | BitFieldShift of { source: preg, dest: preg, isSigned: bool, length: opSize, immr: word, imms: word } (* Copy a range of bits and insert it into another register. This is the only case where a register functions both as a source and a destination. *) | BitFieldInsert of { source: preg, destAsSource: preg, dest: preg, length: opSize, immr: word, imms: word } (* Indexed case. *) | IndexedCaseOperation of { testReg: preg } (* Exception handling. - Set up an exception handler. *) | PushExceptionHandler (* End of a handled section. Restore the previous handler. *) | PopExceptionHandler (* Marks the start of a handler. This sets the stack pointer and restores the old handler. Sets the exception packet register. *) | BeginHandler of { packetReg: preg } (* Compare two vectors of bytes and set the condition code on the result. The registers are modified by the instruction. *) | CompareByteVectors of { vec1Addr: preg, vec2Addr: preg, length: preg, ccRef: ccRef } (* Move a block of bytes (isByteMove true) or words (isByteMove false). The length is the number of items (bytes or words) to move. The registers are modified by the instruction. *) | BlockMove of { srcAddr: preg, destAddr: preg, length: preg, isByteMove: bool } (* Add or subtract to the system stack pointer and optionally return the new value. This is used to allocate and deallocate C space. *) | AddSubXSP of { source: preg, dest: pregOrZero, isAdd: bool } (* Ensures the value will actually be referenced although it doesn't generate any code. *) | TouchValue of { source: preg } (* Load a value at the address and get exclusive access. Always loads a 64-bit value. *) | LoadAcquireExclusive of { base: preg, dest: preg } (* Store a value into an address releasing the lock. Sets the result to either 0 or 1 if it succeeds or fails. *) | StoreReleaseExclusive of { base: preg, source: pregOrZero, result: preg } (* Insert a memory barrier. dmb ish. *) | MemoryBarrier (* Convert an integer to a floating point value. *) | ConvertIntToFloat of { source: preg, dest: preg, srcSize: opSize, destSize: floatSize } (* Convert a floating point value to an integer using the specified rounding mode. We could get an overflow here but fortunately the ARM generates a value that will cause an overflow when we tag it, provided we tag it explicitly. *) | ConvertFloatToInt of { source: preg, dest: preg, srcSize: floatSize, destSize: opSize, rounding: IEEEReal.rounding_mode } (* Unary floating point. This includes conversions between float and double. *) | UnaryFloatingPt of { source: preg, dest: preg, fpOp: fpUnary } (* Binary floating point: addition, subtraction, multiplication and division. *) | BinaryFloatingPoint of { arg1: preg, arg2: preg, dest: preg, fpOp: fpBinary, opSize: floatSize } (* Floating point comparison. *) | CompareFloatingPoint of { arg1: preg, arg2: preg, ccRef: ccRef, opSize: floatSize } (* Destinations at the end of a basic block. 
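As an example of how the pieces fit together (block numbers, register numbers and the exact encoding are invented for illustration), a test of the form "if x = 0 then ... else ..." typically ends a block with a flag-setting comparison followed by a conditional flow:

        BasicBlock {
            block = [AddSubImmediate {source = PReg 4, dest = NONE, ccRef = SOME (CcRef 0),
                                      immed = 0w1, isAdd = false, length = OpSize64}],
            flow = Conditional {ccRef = CcRef 0, condition = CondEqual, trueJump = 1, falseJump = 2}
        }

Here the subtraction of the tagged constant 0w1 sets the flags read through CcRef 0, and control continues at block 1 when the condition holds and at block 2 otherwise.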
*) and controlFlow = (* Unconditional branch to a label - should be a merge point. *) Unconditional of int (* Conditional branch. Jumps to trueJump if the condional is false, falseJump if false. *) | Conditional of { ccRef: ccRef, condition: condition, trueJump: int, falseJump: int } (* Exit - the last instruction of the block is a return, raise or tailcall. *) | ExitCode (* Indexed case - this branches to one of a number of labels *) | IndexedBr of int list (* Set up a handler. This doesn't cause an immediate branch but the state at the start of the handler is the state at this point. *) | SetHandler of { handler: int, continue: int } (* Unconditional branch to a handler. If an exception is raised explicitly within the scope of a handler. *) | UnconditionalHandle of int (* Conditional branch to a handler. Occurs if there is a call to a function within the scope of a handler. It may jump to the handler. *) | ConditionalHandle of { handler: int, continue: int } and basicBlock = BasicBlock of { block: arm64ICode list, flow: controlFlow } (* Return the successor blocks from a control flow. *) val successorBlocks: controlFlow -> int list val printICodeAbstract: basicBlock vector * (string -> unit) -> unit (* Check whether this value is acceptable for LogicalImmediate. *) val isEncodableBitPattern: Word64.word * opSize -> bool (* This generates a BitField instruction with the appropriate values for immr and imms. *) val shiftConstant: { direction: shiftDirection, dest: preg, source: preg, shift: word, opSize: opSize } -> arm64ICode - - (* Offsets in the assembly code interface pointed at by X26 - These are in units of 64-bits NOT bytes. *) - val exceptionPacketOffset: int - and threadIdOffset: int structure Sharing: sig type xReg = xReg and vReg = vReg and reg = reg and condition = condition and shiftType = shiftType and arm64ICode = arm64ICode and preg = preg and pregOrZero = pregOrZero and controlFlow = controlFlow and basicBlock = basicBlock and stackLocn = stackLocn and regProperty = regProperty and ccRef = ccRef and fnarg = fnarg and closureRef = closureRef and loadType = loadType and opSize = opSize and logicalOp = logicalOp and callKind = callKind and floatSize = floatSize and shiftDirection = shiftDirection and multKind = multKind and fpUnary = fpUnary and fpBinary = fpBinary end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64PREASSEMBLY.sig b/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64PREASSEMBLY.sig new file mode 100644 index 00000000..f365c608 --- /dev/null +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/ARM64PREASSEMBLY.sig @@ -0,0 +1,211 @@ +(* + Copyright (c) 2021 David C. J. Matthews + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + Licence version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public Licence for more details. + + You should have received a copy of the GNU Lesser General Public + Licence along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*) + +(* The pre-assembly layer goes below the icode and allows peep-hole optimisation. 
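The interface is still symbolic: for instance, the composite LoadNonAddr instruction below carries a full 64-bit constant, and it is left to the later assembly stages to expand it into a suitable sequence of moves (or a literal load) for that particular value. A deliberately simple, hypothetical fragment:

       LoadNonAddr (X0, 0w42)
       AddSubImmediate {regN = X0, regD = X1, immed = 0w1, shifted = false,
                        isAdd = true, opSize = OpSize64, setFlags = false}

Such a list of precode instructions is handed to generateCode, which produces the final code vector and updates the closure reference.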
*) +signature ARM64PREASSEMBLY = +sig + type closureRef + type machineWord = Address.machineWord + type labels + + (* XZero and XSP are both encoded as 31 but the interpretation + depends on the instruction + The datatype definition is included here to allow for + pattern matching on XSP and XZero. *) + datatype xReg = XReg of Word8.word | XZero | XSP + and vReg = VReg of Word8.word + + val X0: xReg and X1: xReg and X2: xReg and X3: xReg + and X4: xReg and X5: xReg and X6: xReg and X7: xReg + and X8: xReg and X9: xReg and X10: xReg and X11: xReg + and X12: xReg and X13: xReg and X14: xReg and X15: xReg + and X16: xReg and X17: xReg and X18: xReg and X19: xReg + and X20: xReg and X21: xReg and X22: xReg and X23: xReg + and X24: xReg and X25: xReg and X26: xReg and X27: xReg + and X28: xReg and X29: xReg and X30: xReg + + val X_MLHeapLimit: xReg (* ML Heap limit pointer *) + and X_MLAssemblyInt: xReg (* ML assembly interface pointer. *) + and X_MLHeapAllocPtr: xReg (* ML Heap allocation pointer. *) + and X_MLStackPtr: xReg (* ML Stack pointer. *) + and X_LinkReg: xReg (* Link reg - return address *) + and X_Base32in64: xReg (* X24 is used for the heap base in 32-in-64. *) + + val V0: vReg and V1: vReg and V2: vReg and V3: vReg + and V4: vReg and V5: vReg and V6: vReg and V7: vReg + + (* Condition for conditional branches etc. *) + datatype condition = + CondEqual (* Z=1 *) + | CondNotEqual (* Z=0 *) + | CondCarrySet (* C=1 *) + | CondCarryClear (* C=0 *) + | CondNegative (* N=1 *) + | CondPositive (* N=0 imcludes zero *) + | CondOverflow (* V=1 *) + | CondNoOverflow (* V=0 *) + | CondUnsignedHigher (* C=1 && Z=0 *) + | CondUnsignedLowOrEq (* ! (C=1 && Z=0) *) + | CondSignedGreaterEq (* N=V *) + | CondSignedLess (* N<>V *) + | CondSignedGreater (* Z==0 && N=V *) + | CondSignedLessEq (* !(Z==0 && N=V) *) + + val invertTest: condition -> condition (* i.e. jump when the condition is not true. *) + val condToString: condition -> string + + datatype shiftType = + ShiftLSL of Word8.word + | ShiftLSR of Word8.word + | ShiftASR of Word8.word + | ShiftNone + + datatype wordSize = WordSize32 | WordSize64 + + datatype 'a extend = + ExtUXTB of 'a (* Unsigned extend byte *) + | ExtUXTH of 'a (* Unsigned extend byte *) + | ExtUXTW of 'a (* Unsigned extend byte *) + | ExtUXTX of 'a (* Left shift *) + | ExtSXTB of 'a (* Sign extend byte *) + | ExtSXTH of 'a (* Sign extend halfword *) + | ExtSXTW of 'a (* Sign extend word *) + | ExtSXTX of 'a (* Left shift *) + + (* Load/store instructions have only a single bit for the shift. For byte + operations this is one bit shift; for others it scales by the size of + the operand if set. 
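As a worked example rather than part of the interface: with Load64 the operand size is eight bytes, so with the scale bit set an index of 3 addresses byte offset 24 from the base, whereas NoScale uses the index as a byte offset unchanged.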
*) + datatype scale = + ScaleOrShift + | NoScale + + datatype loadType = Load64 | Load32 | Load16 | Load8 + and opSize = OpSize32 | OpSize64 + and logicalOp = LogAnd | LogOr | LogXor + and floatSize = Float32 | Double64 + and shiftDirection = ShiftLeft | ShiftRightLogical | ShiftRightArithmetic + and multKind = + MultAdd32 | MultSub32 | MultAdd64 | MultSub64 | + SignedMultAddLong (* 32bit*32bit + 64bit => 64Bit *) | + SignedMultHigh (* High order part of 64bit*64Bit *) + and fpUnary = NegFloat | NegDouble | AbsFloat | AbsDouble | ConvFloatToDble | ConvDbleToFloat + and fpBinary = MultiplyFP | DivideFP | AddFP | SubtractFP + and unscaledType = NoUpdate | PreIndex | PostIndex + and condSet = CondSet | CondSetIncr | CondSetInvert | CondSetNegate + and bitfieldKind = BitFieldUnsigned | BitFieldSigned | BitFieldInsert + + datatype precode = + (* Basic instructions *) + AddSubImmediate of + {regN: xReg, regD: xReg, immed: word, shifted: bool, isAdd: bool, opSize: opSize, setFlags: bool} + | AddSubShiftedReg of + {regM: xReg, regN: xReg, regD: xReg, shift: shiftType, isAdd: bool, opSize: opSize, setFlags: bool} + | AddSubExtendedReg of + {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend, isAdd: bool, opSize: opSize, setFlags: bool} + | MultiplyAndAddSub of + {regM: xReg, regN: xReg, regA: xReg, regD: xReg, isAdd: bool, opSize: opSize, multKind: multKind} + | DivideRegs of + {regM: xReg, regN: xReg, regD: xReg, isSigned: bool, opSize: opSize} + | LogicalShiftedReg of + {regM: xReg, regN: xReg, regD: xReg, shift: shiftType, logOp: logicalOp, opSize: opSize, setFlags: bool} + | LoadRegScaled of + {regT: xReg, regN: xReg, unitOffset: int, loadType: loadType} + | LoadFPRegScaled of + {regT: vReg, regN: xReg, unitOffset: int, floatSize: floatSize} + | StoreRegScaled of + {regT: xReg, regN: xReg, unitOffset: int, loadType: loadType} + | StoreFPRegScaled of + {regT: vReg, regN: xReg, unitOffset: int, floatSize: floatSize} + | LoadRegUnscaled of + {regT: xReg, regN: xReg, byteOffset: int, loadType: loadType, unscaledType: unscaledType} + | StoreRegUnscaled of + {regT: xReg, regN: xReg, byteOffset: int, loadType: loadType, unscaledType: unscaledType} + | LoadFPRegUnscaled of + {regT: vReg, regN: xReg, byteOffset: int, floatSize: floatSize, unscaledType: unscaledType} + | StoreFPRegUnscaled of + {regT: vReg, regN: xReg, byteOffset: int, floatSize: floatSize, unscaledType: unscaledType} + | LoadAcquire of {regN: xReg, regT: xReg, loadType: loadType} + | StoreRelease of {regN: xReg, regT: xReg, loadType: loadType} + | LoadRegPair of { regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int, unscaledType: unscaledType} + | StoreRegPair of{ regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int, unscaledType: unscaledType} + | LoadFPRegPair of { regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int, unscaledType: unscaledType} + | StoreFPRegPair of { regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int, unscaledType: unscaledType} + | ConditionalSet of {regD: xReg, regTrue: xReg, regFalse: xReg, cond: condition, condSet: condSet} + | BitField of {immr: word, imms: word, regN: xReg, regD: xReg, opSize: opSize, bitfieldKind: bitfieldKind} + | ShiftRegisterVariable of {regM: xReg, regN: xReg, regD: xReg, opSize: opSize, shiftDirection: shiftDirection} + | BitwiseLogical of { bits: Word64.word, regN: xReg, regD: xReg, opSize: opSize, setFlags: bool, logOp: logicalOp} + (* Floating point *) + | MoveGeneralToFP of { regN: xReg, regD: vReg, floatSize: floatSize} + | MoveFPToGeneral of {regN: vReg, regD: 
xReg, floatSize: floatSize} + | ConvertIntToFP of { regN: xReg, regD: vReg, floatSize: floatSize, opSize: opSize} + | ConvertFloatToInt of { round: IEEEReal.rounding_mode, regN: vReg, regD: xReg, floatSize: floatSize, opSize: opSize} + | FPBinaryOp of { regM: vReg, regN: vReg, regD: vReg, floatSize: floatSize, fpOp: fpBinary} + | FPComparison of { regM: vReg, regN: vReg, floatSize: floatSize} + | FPUnaryOp of {regN: vReg, regD: vReg, floatSize: floatSize, fpOp: fpUnary} + (* Branches and Labels. *) + | SetLabel of labels + | ConditionalBranch of condition * labels + | UnconditionalBranch of labels + | BranchAndLink of labels + | LoadLabelAddress of xReg * labels + | TestBitBranch of { test: xReg, bit: Word8.word, label: labels, onZero: bool } + | CompareBranch of { test: xReg, label: labels, onZero: bool, opSize: opSize } + (* Composite instructions *) + | MoveXRegToXReg of {sReg: xReg, dReg: xReg} + | LoadNonAddr of xReg * Word64.word + | LoadAddr of xReg * machineWord + + val createLabel: unit -> labels + + (* Create the vector of code from the list of instructions and update the + closure reference to point to it. *) + val generateCode: + {instrs: precode list, name: string, parameters: Universal.universal list, resultClosure: closureRef, + profileObject: machineWord} -> unit + + + (* Offsets in the assembly code interface pointed at by X26 + These are in units of 64-bits NOT bytes. *) + val heapOverflowCallOffset: int + and stackOverflowCallOffset: int + and stackOverflowXCallOffset: int + and exceptionHandlerOffset: int + and stackLimitOffset: int + and threadIdOffset: int + and heapLimitPtrOffset: int + and heapAllocPtrOffset: int + and mlStackPtrOffset: int + + val is32in64: bool and isBigEndian: bool + + val isEncodableBitPattern: Word64.word * wordSize -> bool + + structure Sharing: + sig + type closureRef = closureRef + type precode = precode + type xReg = xReg + type vReg = vReg + type labels = labels + type condition = condition + type shiftType = shiftType + type wordSize = wordSize + type 'a extend = 'a extend + type scale = scale + end + +end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64AllocateRegisters.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64AllocateRegisters.ML index 72789628..a8e4a591 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64AllocateRegisters.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64AllocateRegisters.ML @@ -1,556 +1,572 @@ (* Copyright David C. J. Matthews 2016-21 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64AllocateRegisters( structure Arm64ICode: ARM64ICODE structure Identify: ARM64IDENTIFYREFERENCES structure ConflictSets: ARM64ICODECONFLICTS structure IntSet: INTSET sharing Arm64ICode.Sharing = Identify.Sharing = ConflictSets.Sharing = IntSet ): ARM64ALLOCATEREGISTERS = struct open Arm64ICode open Identify open ConflictSets open IntSet open Address exception InternalError = Misc.InternalError datatype allocateResult = AllocateSuccess of reg vector | AllocateFailure of intSet list (* General registers. X24 is used as the global heap base in 32-in-64. X30 is the return address set by blr but is otherwise a general register. Put the argument registers at the end of the list so they'll only be used when hinted. *) val generalRegisters = map GenReg ([X9, X10, X11, X12, X13, X14, X15, X19, X20, X21, X22, X23, X0, X1, X2, X3, X4, X5, X6, X7, X8, X30] @ (if is32in64 then [] else [X24])) val floatingPtRegisters = map FPReg [V7, V6, V5, V4, V3, V2, V1] fun allocateRegisters{blocks, regStates, regProps, ...} = let val maxPRegs = Vector.length regStates and numBlocks = Vector.length blocks (* Hint values. The idea of hints is that by using a hinted register we may avoid an unnecessary move instruction. realHints is set when a pseudo-register is going to be loaded from a specific register e.g. a register argument, or moved into one e.g. ecx for a shift. friends is set to the other pseudo-registers that may be associated with the pseudo-register. E.g. the argument and destination of an arithmetic operation where choosing the same register for each may avoid a move. *) val realHints = Array.array(maxPRegs, NONE: reg option) (* Sources and destinations. These indicate the registers that are the sources and destinations of the indexing register and are used as hints. If a register has been allocated for a source or destination we may be able to reuse it. *) val sourceRegs = Array.array(maxPRegs, []: int list) and destinationRegs = Array.array(maxPRegs, []: int list) local fun addRealHint(r, reg) = case Array.sub(realHints, r) of NONE => Array.update(realHints, r, SOME reg) | SOME _ => () fun addSourceAndDestinationHint{src, dst} = let val {conflicts, ...} = Vector.sub(regStates, src) in (* If they conflict we can't add them. *) if member(dst, conflicts) then () else let val currentDests = Array.sub(destinationRegs, src) val currentSources = Array.sub(sourceRegs, dst) in if List.exists(fn i => i=dst) currentDests then () else Array.update(destinationRegs, src, dst :: currentDests); if List.exists(fn i => i=src) currentSources then () else Array.update(sourceRegs, dst, src :: currentSources) end end in (* Add the hints to steer the register allocation. The idea is to avoid moves between registers by getting values into the appropriate register in advance. *) fun addHints{instr=MoveRegister{source=PReg sreg, dest=PReg dreg, ...}, ...} = addSourceAndDestinationHint {src=sreg, dst=dreg} | addHints{instr=BitFieldInsert{destAsSource=PReg dsReg, dest=PReg dReg, ...}, ...} = (* The "destAsSource" is the destination if some bits are retained. *) addSourceAndDestinationHint {src=dsReg, dst=dReg} | addHints{instr=ReturnResultFromFunction { resultReg=PReg resReg, returnReg=PReg retReg, ... }, ...} = ( addRealHint(resReg, GenReg X0); addRealHint(retReg, GenReg X30) (* It may be there from earlier. 
*) ) | addHints{instr=JumpLoop{regArgs, ...}, ...} = let fun addRegArg {src=ArgInReg(PReg argReg), dst=PReg resReg} = addSourceAndDestinationHint {dst=resReg, src=argReg} | addRegArg {src=ArgOnStack _, ...} = () in List.app addRegArg regArgs end | addHints{instr=BeginFunction{regArgs, ...}, ...} = List.app (fn (PReg pr, reg) => addRealHint(pr, GenReg reg)) regArgs | addHints{instr=TailRecursiveCall{regArgs, ...}, ...} = let fun setHint(ArgInReg(PReg pr), reg) = addRealHint(pr, GenReg reg) | setHint _ = () in List.app setHint regArgs end | addHints{instr=FunctionCall{regArgs, dest=PReg dreg, ...}, ...} = let fun setHint(ArgInReg(PReg pr), reg) = addRealHint(pr, GenReg reg) | setHint _ = () in addRealHint(dreg, GenReg X0); List.app setHint regArgs end (* Exception packets are in X0 *) | addHints{instr=RaiseExceptionPacket{ packetReg=PReg preg }, ...} = addRealHint(preg, GenReg X0) | addHints{instr=BeginHandler{ packetReg=PReg preg }, ...} = addRealHint(preg, GenReg X0) | addHints _ = () end val allocatedRegs = Array.array(maxPRegs, NONE: reg option) val failures = ref []: intSet list ref (* Find a real register for a preg. 1. If a register is already allocated use that. 2. Try the "preferred" register if one has been given. 3. Try the realHints value if there is one. 4. See if there is a "friend" that has an appropriate register 5. Look at all the registers and find one. *) fun findRegister(r, pref, regSet) = case Array.sub(allocatedRegs, r) of SOME reg => reg | NONE => let val {conflicts, realConflicts, ...} = Vector.sub(regStates, r) (* Find the registers we've already allocated that may conflict. *) val conflictingRegs = List.mapPartial(fn i => Array.sub(allocatedRegs, i)) (setToList conflicts) @ realConflicts fun isFree aReg = not (List.exists(fn i => i=aReg) conflictingRegs) fun tryAReg NONE = NONE | tryAReg (somePref as SOME prefReg) = if isFree prefReg then (Array.update(allocatedRegs, r, somePref); somePref) else NONE fun findAReg [] = ( (* This failed. We're going to have to spill something. *) failures := conflicts :: ! failures; hd regSet (* Return a register to satisfy everything. *) ) | findAReg (reg::regs) = if isFree reg then (Array.update(allocatedRegs, r, SOME reg); reg) else findAReg regs (* Search the sources and destinations to see if a register has already been allocated or there is a hint. *) fun findAFriend([], [], _) = NONE | findAFriend(aDest :: otherDests, sources, alreadySeen) = let val possReg = case Array.sub(allocatedRegs, aDest) of v as SOME _ => tryAReg v | NONE => tryAReg(Array.sub(realHints, aDest)) in case possReg of reg as SOME _ => reg | NONE => let (* Add the destinations of the destinations to the list if they don't conflict and haven't been seen. *) fun newFriend f = not(List.exists (fn n => n=f) alreadySeen) andalso not(member(f, conflicts)) val fOfF = List.filter newFriend (Array.sub(destinationRegs, aDest)) in findAFriend(otherDests @ fOfF, sources, aDest :: alreadySeen) end end | findAFriend([], aSrc :: otherSrcs, alreadySeen) = let val possReg = case Array.sub(allocatedRegs, aSrc) of v as SOME _ => tryAReg v | NONE => tryAReg(Array.sub(realHints, aSrc)) in case possReg of reg as SOME _ => reg | NONE => let (* Add the sources of the sources to the list if they don't conflict and haven't been seen. 
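A concrete trace (register numbers invented): suppose PReg 7 is set by MoveRegister {source = PReg 5, dest = PReg 7} and PReg 5 is a function argument arriving in X0. Then realHints for 5 is SOME (GenReg X0) and Array.sub (sourceRegs, 7) = [5], so findAFriend starting from PReg 7 reaches PReg 5, finds its real hint and, provided X0 is free, allocates X0 to PReg 7 as well, making the move redundant.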
*) fun newFriend f = not(List.exists (fn n => n=f) alreadySeen) andalso not(member(f, conflicts)) val fOfF = List.filter newFriend (Array.sub(sourceRegs, aSrc)) in findAFriend([], otherSrcs @ fOfF, aSrc :: alreadySeen) end end (* See if there is a friend that has a register already or a hint. Friends are registers that don't conflict and can possibly avoid an extra move. *) (* fun findAFriend([], _) = NONE | findAFriend(friend :: tail, old) = let val possReg = case Array.sub(allocatedRegs, friend) of v as SOME _ => tryAReg v | NONE => tryAReg(Array.sub(realHints, friend)) in case possReg of reg as SOME _ => reg | NONE => let (* Add a friend of a friend to the list if we haven't already seen it and it doesn't conflict. *) fun newFriend f = not(List.exists (fn n => n=f) old) andalso not(List.exists (fn n => n=f) conflicts) val fOfF = List.filter newFriend (Array.sub(friends, friend)) in findAFriend(tail @ fOfF, friend :: old) end end*) in case tryAReg pref of SOME r => r | NONE => ( case tryAReg (Array.sub(realHints, r)) of SOME r => r | NONE => ( case findAFriend(Array.sub(destinationRegs, r), Array.sub(sourceRegs, r), []) of SOME r => r (* Look through the registers to find one that's free. *) | NONE => findAReg regSet ) ) end fun allocateRegister args = ignore(findRegister args) val allocateFindRegister = findRegister fun allocateGenReg(PReg r) = allocateRegister(r, NONE, generalRegisters) and allocateFloatReg(PReg r) = allocateRegister(r, NONE, floatingPtRegisters) and allocateOptGenReg(SomeReg reg) = allocateGenReg reg | allocateOptGenReg ZeroReg = () val allocateGenRegs = List.app allocateGenReg and allocateFloatRegs = List.app allocateFloatReg fun registerAllocate{instr=MoveRegister{source=PReg sreg, dest=PReg dreg}, ...} = let val realDestReg = findRegister(dreg, NONE, generalRegisters) in allocateRegister(sreg, SOME realDestReg, generalRegisters) end | registerAllocate{instr=LoadNonAddressConstant{dest, ...}, ...} = allocateGenReg dest | registerAllocate{instr=LoadAddressConstant{dest, ...}, ...} = allocateGenReg dest - | registerAllocate{instr=LoadWithConstantOffset{dest, base, loadType, ...}, ...} = + | registerAllocate{instr=LoadWithConstantOffset{dest, base, ...}, ...} = ( - case loadType of LoadFloat => allocateFloatReg dest | LoadDouble => allocateFloatReg dest | _ => allocateGenReg dest; + allocateGenReg dest; + allocateGenReg base + ) + + | registerAllocate{instr=LoadFPWithConstantOffset{dest, base, ...}, ...} = + ( + allocateFloatReg dest; allocateGenReg base ) - | registerAllocate{instr=LoadWithIndexedOffset{dest, base, index, loadType, ...}, ...} = + | registerAllocate{instr=LoadWithIndexedOffset{dest, base, index, ...}, ...} = + ( + allocateGenReg dest; + allocateGenRegs[base, index] + ) + + | registerAllocate{instr=LoadFPWithIndexedOffset{dest, base, index, ...}, ...} = ( - case loadType of LoadFloat => allocateFloatReg dest | LoadDouble => allocateFloatReg dest | _ => allocateGenReg dest; + allocateFloatReg dest; allocateGenRegs[base, index] ) - | registerAllocate{instr=LoadMemReg { dest, ...}, ...} = allocateGenReg dest + | registerAllocate{instr=GetThreadId { dest, ...}, ...} = allocateGenReg dest | registerAllocate{instr=ObjectIndexAddressToAbsolute{dest, source, ...}, ...} = allocateGenRegs[dest, source] | registerAllocate{instr=AbsoluteToObjectIndex{dest, source, ...}, ...} = allocateGenRegs[dest, source] | registerAllocate({instr=AllocateMemoryFixed{ dest, saveRegs, ...}, ...}) = allocateGenRegs (dest :: saveRegs) | registerAllocate({instr=AllocateMemoryVariable{ 
size, dest, saveRegs, ...}, ...}) = allocateGenRegs (size :: dest :: saveRegs) | registerAllocate({instr=InitialiseMem{ size, addr, init}, ...}) = allocateGenRegs [size, addr, init] | registerAllocate{instr=BeginLoop, ...} = () | registerAllocate({instr=JumpLoop{regArgs, stackArgs, checkInterrupt}, ...}) = ( List.app (fn {src=ArgInReg sreg, dst} => (allocateGenReg sreg; allocateGenReg dst) | _ => ()) regArgs; List.app (fn {src=ArgInReg sreg, ...} => allocateGenReg sreg | _ => ()) stackArgs; case checkInterrupt of SOME regs => List.app(fn reg => allocateGenReg reg) regs | NONE => () ) - | registerAllocate{instr=StoreWithConstantOffset{source, base, loadType, ...}, ...} = + | registerAllocate{instr=StoreWithConstantOffset{source, base, ...}, ...} = allocateGenRegs[source, base] + + | registerAllocate{instr=StoreFPWithConstantOffset{source, base, ...}, ...} = ( - case loadType of LoadFloat => allocateFloatReg source | LoadDouble => allocateFloatReg source | _ => allocateGenReg source; + allocateFloatReg source; allocateGenReg base ) - | registerAllocate{instr=StoreWithIndexedOffset{source, base, index, loadType, ...}, ...} = + | registerAllocate{instr=StoreWithIndexedOffset{source, base, index, ...}, ...} = allocateGenRegs[source, base, index] + + | registerAllocate{instr=StoreFPWithIndexedOffset{source, base, index, ...}, ...} = ( - case loadType of LoadFloat => allocateFloatReg source | LoadDouble => allocateFloatReg source | _ => allocateGenReg source; + allocateFloatReg source; allocateGenRegs[base, index] ) | registerAllocate{instr=AddSubImmediate{ source, dest, ... }, ...} = ( case dest of SOME dest => allocateGenReg dest | NONE => (); allocateGenReg source ) | registerAllocate{instr=AddSubRegister{ base, shifted, dest, ... }, ...} = ( case dest of SOME dest => allocateGenReg dest | NONE => (); allocateGenRegs[base, shifted] ) | registerAllocate{instr=LogicalImmediate{ source, dest, ... }, ...} = ( case dest of SOME dest => allocateGenReg dest | NONE => (); allocateGenReg source ) | registerAllocate{instr=LogicalRegister{ base, shifted, dest, ... }, ...} = ( case dest of SOME dest => allocateGenReg dest | NONE => (); allocateGenRegs[base, shifted] ) | registerAllocate{instr=ShiftRegister{ dest, source, shift, ... }, ...} = allocateGenRegs[dest, source, shift] | registerAllocate{instr=Multiplication{ dest, sourceA, sourceM, sourceN, ... }, ...} = (allocateGenRegs[dest, sourceM, sourceN]; case sourceA of SOME srcA => allocateGenReg srcA | NONE => ()) | registerAllocate{instr=Division{ dest, dividend, divisor, ... }, ...} = allocateGenRegs[dest, dividend, divisor] | registerAllocate{instr=BeginFunction{regArgs, ...}, ...} = (* Check that every argument has a register allocated including any that are unused. Unused arguments should be discarded at a higher level because we could allocate a different register and copy the argument register only to discard it. *) allocateGenRegs(List.map #1 regArgs) | registerAllocate({instr=TailRecursiveCall{regArgs=regArgs, stackArgs=stackArgs, ...}, ...}) = let fun allocateRegArg(ArgInReg argReg, _) = allocateGenReg argReg | allocateRegArg _ = () in (* We've already hinted the arguments but we want to allocate these first to reduce the chance that they'll be used for stack arguments. 
*) List.app allocateRegArg regArgs; List.app (fn {src=ArgInReg argReg, ...} => allocateGenReg argReg | _ => ()) stackArgs end | registerAllocate({instr=FunctionCall{regArgs=regArgs, stackArgs=stackArgs, dest=PReg dReg, saveRegs, ...}, ...}) = let fun allocateRegArg(ArgInReg argReg, _) = allocateGenReg argReg | allocateRegArg _ = () in (* We've already hinted the arguments but we want to allocate these first to reduce the chance that they'll be used for stack arguments. *) List.app allocateRegArg regArgs; List.app (fn ArgInReg argReg => allocateGenReg argReg | _ => ()) stackArgs; allocateGenRegs saveRegs; (* Result will be in X0. *) allocateRegister(dReg, SOME(GenReg X0), [GenReg X0]) end | registerAllocate{instr=ReturnResultFromFunction { resultReg=PReg resReg, returnReg, ... }, ...} = ( allocateRegister(resReg, SOME(GenReg X0), [GenReg X0] (* It MUST be in this register *)); allocateGenReg returnReg ) | registerAllocate{instr=RaiseExceptionPacket{packetReg}, ...} = allocateGenReg packetReg | registerAllocate{instr=PushToStack{ source, ... }, ...} = allocateGenReg source | registerAllocate{instr=LoadStack{ dest, ... }, ...} = allocateGenReg dest | registerAllocate{instr=StoreToStack{ source, ... }, ...} = allocateGenReg source | registerAllocate{instr=ContainerAddress{ dest, ... }, ...} = allocateGenReg dest | registerAllocate{instr=ResetStackPtr _, ...} = () | registerAllocate({instr=TagValue{source, dest, ...}, ...}) = allocateGenRegs[source, dest] | registerAllocate({instr=UntagValue{source, dest, ...}, ...}) = allocateGenRegs[source, dest] | registerAllocate({instr=BoxLarge{source, dest, saveRegs}, ...}) = (allocateGenRegs saveRegs; allocateGenRegs[source, dest]) | registerAllocate({instr=UnboxLarge{source, dest}, ...}) = allocateGenRegs[source, dest] | registerAllocate({instr=BoxTagFloat{source, dest, saveRegs, ...}, ...}) = ( allocateGenRegs saveRegs; allocateFloatReg source; allocateGenReg dest ) | registerAllocate({instr=UnboxTagFloat{source, dest, ...}, ...}) = ( allocateFloatReg dest; allocateGenReg source ) | registerAllocate{instr=LoadAcquire{dest, base, ...}, ...} = allocateGenRegs[dest, base] | registerAllocate{instr=StoreRelease{source, base, ...}, ...} = allocateGenRegs[source, base] | registerAllocate{instr=BitFieldShift{ source, dest, ... }, ...} = allocateGenRegs[source, dest] | registerAllocate{instr=BitFieldInsert{ source, dest, destAsSource, ... 
}, ...} = allocateGenRegs[source, destAsSource, dest] | registerAllocate({instr=IndexedCaseOperation{testReg}, ...}) = allocateGenReg testReg | registerAllocate({instr=PushExceptionHandler, ...}) = () | registerAllocate({instr=PopExceptionHandler, ...}) = () | registerAllocate({instr=BeginHandler{packetReg}, ...}) = allocateGenReg packetReg | registerAllocate({instr=CompareByteVectors{vec1Addr, vec2Addr, length, ...}, ...}) = allocateGenRegs[vec1Addr, vec2Addr, length] | registerAllocate({instr=BlockMove{srcAddr, destAddr, length, ...}, ...}) = allocateGenRegs[srcAddr, destAddr, length] | registerAllocate({instr=AddSubXSP{source, dest, ...}, ...}) = ( allocateOptGenReg dest; allocateGenReg source ) | registerAllocate({instr=TouchValue{source, ...}, ...}) = allocateGenReg source | registerAllocate({instr=LoadAcquireExclusive{ base, dest }, ...}) = allocateGenRegs[base, dest] | registerAllocate({instr=StoreReleaseExclusive{ base, source, result }, ...}) = ( allocateGenRegs[base, result]; allocateOptGenReg source ) | registerAllocate({instr=MemoryBarrier, ...}) = () | registerAllocate({instr=ConvertIntToFloat{ source, dest, ...}, ...}) = (allocateFloatReg dest; allocateGenReg source) | registerAllocate({instr=ConvertFloatToInt{ source, dest, ...}, ...}) = (allocateGenReg dest; allocateFloatReg source) | registerAllocate({instr=UnaryFloatingPt{ source, dest, ...}, ...}) = allocateFloatRegs[source, dest] | registerAllocate({instr=BinaryFloatingPoint{ arg1, arg2, dest, ...}, ...}) = allocateFloatRegs[arg1, arg2, dest] | registerAllocate({instr=CompareFloatingPoint{ arg1, arg2, ...}, ...}) = allocateFloatRegs[arg1, arg2] (* Depth-first scan. *) val visited = Array.array(numBlocks, false) fun processBlocks blockNo = if Array.sub(visited, blockNo) then () (* Done or currently being done. *) else let val () = Array.update(visited, blockNo, true) val ExtendedBasicBlock { flow, block, passThrough, exports, ...} = Vector.sub(blocks, blockNo) (* Add the hints for this block before the actual allocation of registers. *) val _ = List.app addHints block val () = (* Process the dependencies first. *) case flow of ExitCode => () | Unconditional m => processBlocks m | Conditional {trueJump, falseJump, ...} => (processBlocks trueJump; processBlocks falseJump) | IndexedBr cases => List.app processBlocks cases | SetHandler{ handler, continue } => (processBlocks handler; processBlocks continue) | UnconditionalHandle _ => () | ConditionalHandle { continue, ...} => processBlocks continue (* Now this block. *) local (* We assume that anything used later will have been allocated a register. This is generally true except for a loop where the use may occur earlier. *) val exported = setToList passThrough @ setToList exports fun findAReg r = case Vector.sub(regProps, r) of RegPropStack _ => () | _ => ignore(allocateFindRegister(r, NONE, generalRegisters)) in val () = List.app findAReg exported end in List.foldr(fn (c, ()) => registerAllocate c) () block end in processBlocks 0; (* If the failures list is empty we succeeded. *) case !failures of [] => (* Return the allocation vector. We may have unused registers, *) AllocateSuccess(Vector.tabulate(maxPRegs, fn i => getOpt(Array.sub(allocatedRegs, i), GenReg XZero))) (* Else we'll have to spill something. 
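The conflict sets collected in the failures list identify pseudo-registers that could not all be given distinct real registers; the caller is expected to push some of them to the stack and retry. Purely as an illustration of the shape of the result (spillSomeRegisters and retry are hypothetical names standing for whatever the transform phase actually does):

            case allocateRegisters args of
                AllocateSuccess regVector => regVector
            |   AllocateFailure conflictSets => (spillSomeRegisters conflictSets; retry ())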
*) | l => AllocateFailure l end val nGenRegs = List.length generalRegisters structure Sharing = struct type intSet = intSet and extendedBasicBlock = extendedBasicBlock and regProperty = regProperty and reg = reg and allocateResult = allocateResult end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64CodetreeToICode.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64CodetreeToICode.ML index 677bdde1..c29a5344 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64CodetreeToICode.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64CodetreeToICode.ML @@ -1,3105 +1,3103 @@ (* Copyright David C. J. Matthews 2021 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64CodetreeToICode( structure BackendTree: BACKENDINTERMEDIATECODE structure Arm64ICode: ARM64ICODE structure Debug: DEBUG structure Arm64Foreign: FOREIGNCALL structure ICodeTransform: ARM64ICODETRANSFORM structure CodeArray: CODEARRAY structure Pretty:PRETTY sharing Arm64ICode.Sharing = ICodeTransform.Sharing = CodeArray.Sharing = BackendTree.Sharing ): GENCODE = struct open BackendTree open Address open Arm64ICode open CodeArray open BuiltIns exception InternalError = Misc.InternalError fun taggedWord64 w: Word64.word = w * 0w2 + 0w1 and taggedWord w: word = w * 0w2 + 0w1 datatype blockStruct = BlockSimple of arm64ICode | BlockExit of arm64ICode | BlockLabel of int | BlockFlow of controlFlow | BlockBegin of { regArgs: (preg * xReg) list, stackArgs: stackLocn list } | BlockRaiseAndHandle of arm64ICode * int | BlockOptionalHandle of {call: arm64ICode, handler: int, label: int } val moveRegister = BlockSimple o MoveRegister and loadNonAddressConstant = BlockSimple o LoadNonAddressConstant and loadAddressConstant = BlockSimple o LoadAddressConstant and loadWithConstantOffset = BlockSimple o LoadWithConstantOffset + and loadFPWithConstantOffset = BlockSimple o LoadFPWithConstantOffset and loadWithIndexedOffset = BlockSimple o LoadWithIndexedOffset - and loadMemReg = BlockSimple o LoadMemReg + and loadFPWithIndexedOffset = BlockSimple o LoadFPWithIndexedOffset + and getThreadId = BlockSimple o GetThreadId and objectIndexAddressToAbsolute = BlockSimple o ObjectIndexAddressToAbsolute and absoluteToObjectIndex = BlockSimple o AbsoluteToObjectIndex and allocateMemoryFixed = BlockSimple o AllocateMemoryFixed and allocateMemoryVariable = BlockSimple o AllocateMemoryVariable and initialiseMem = BlockSimple o InitialiseMem and storeWithConstantOffset = BlockSimple o StoreWithConstantOffset + and storeFPWithConstantOffset = BlockSimple o StoreFPWithConstantOffset and storeWithIndexedOffset = BlockSimple o StoreWithIndexedOffset + and storeFPWithIndexedOffset = BlockSimple o StoreFPWithIndexedOffset and addSubImmediate = BlockSimple o AddSubImmediate and addSubRegister = BlockSimple o AddSubRegister and logicalImmediate = BlockSimple o LogicalImmediate and logicalRegister = BlockSimple o LogicalRegister and shiftRegister = BlockSimple o ShiftRegister 
and multiplication = BlockSimple o Multiplication and division = BlockSimple o Division and pushToStack = BlockSimple o PushToStack and loadStack = BlockSimple o LoadStack and storeToStack = BlockSimple o StoreToStack and containerAddress = BlockSimple o ContainerAddress and resetStackPtr = BlockSimple o ResetStackPtr and tagValue = BlockSimple o TagValue and untagValue = BlockSimple o UntagValue and boxLarge = BlockSimple o BoxLarge and unboxLarge = BlockSimple o UnboxLarge and boxTagFloat = BlockSimple o BoxTagFloat and unboxTagFloat = BlockSimple o UnboxTagFloat and loadAcquire = BlockSimple o LoadAcquire and storeRelease = BlockSimple o StoreRelease and bitFieldShift = BlockSimple o BitFieldShift and bitFieldInsert = BlockSimple o BitFieldInsert and compareByteVectors = BlockSimple o CompareByteVectors and blockMove = BlockSimple o BlockMove and addSubXSP = BlockSimple o AddSubXSP and touchValue = BlockSimple o TouchValue and loadAcquireExclusive = BlockSimple o LoadAcquireExclusive and storeReleaseExclusive = BlockSimple o StoreReleaseExclusive and memoryBarrier = BlockSimple MemoryBarrier and convertIntToFloat = BlockSimple o ConvertIntToFloat and convertFloatToInt = BlockSimple o ConvertFloatToInt and unaryFloatingPt = BlockSimple o UnaryFloatingPt and binaryFloatingPoint = BlockSimple o BinaryFloatingPoint and compareFloatingPoint = BlockSimple o CompareFloatingPoint val shiftConstant = BlockSimple o shiftConstant (* Many operations use 32-bit arguments in 32-in-64 and 64-bit in native 64. *) val polyWordLoadSize = if is32in64 then Load32 else Load64 val polyWordOpSize = if is32in64 then OpSize32 else OpSize64 val tagBitMask64 = Word64.<<(Word64.fromInt ~1, 0w1) val tagBitMask32 = Word64.andb(tagBitMask64, 0wxffffffff) val polyWordTagBitMask = if is32in64 then tagBitMask32 else tagBitMask64 (* The flags byte is the high-order byte of length word. *) val flagsByteOffset = if isBigEndian then ~ (Word.toInt wordSize) else ~1 (* Size of operand in bytes and therefore the scale factor. *) fun opWordSize Load64 = 8 | opWordSize Load32 = 4 | opWordSize Load16 = 2 | opWordSize Load8 = 1 - | opWordSize LoadDouble = 8 - | opWordSize LoadFloat = 4 (* Shift for each size. i.e. log2 of opWordSize. *) fun loadShift Load64 = 0w3 | loadShift Load32 = 0w2 | loadShift Load16 = 0w1 | loadShift Load8 = 0w0 - | loadShift LoadDouble = 0w3 - | loadShift LoadFloat = 0w2 fun precisionToFpSize PrecSingle = Float32 | precisionToFpSize PrecDouble = Double64 fun codeFunctionToArm64({body, localCount, name, argTypes, closure, ...}:bicLambdaForm, debugSwitches, resultClosure) = let (* Pseudo-registers are allocated sequentially and the properties added to the list. 
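A quick standalone check of the tagged-value arithmetic assumed by taggedWord64 and the tag bit masks above; the names below are illustrative only:

fun tag (n: Word64.word) = n * 0w2 + 0w1            (* same scheme as taggedWord64 *)
fun untag (w: Word64.word) = Word64.>> (w, 0w1)
val mask64 = Word64.<< (Word64.fromInt ~1, 0w1)      (* all ones except the tag bit *)
val _ = untag (tag 0w21) = 0w21                      (* true: tagging round-trips *)
val _ = Word64.andb (tag 0w21, mask64) = 0w42        (* true: the mask clears the tag bit, leaving 2*21 *)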
*) val pregCounter = ref 0 val pregPropList = ref [] fun newPReg() = let val regNo = !pregCounter before pregCounter := !pregCounter + 1 val () = pregPropList := RegPropGeneral :: !pregPropList in PReg regNo end and newUReg() = let val regNo = !pregCounter before pregCounter := !pregCounter + 1 val () = pregPropList := RegPropUntagged :: !pregPropList in PReg regNo end and newStackLoc size = let val regNo = !pregCounter before pregCounter := !pregCounter + 1 val () = pregPropList := RegPropStack size :: !pregPropList in StackLoc{size=size, rno=regNo} end and newMergeReg() = let val regNo = !pregCounter before pregCounter := !pregCounter + 1 val () = pregPropList := RegPropMultiple :: !pregPropList in PReg regNo end datatype locationValue = NoLocation | PregLocation of preg | ContainerLocation of { container: stackLocn, stackOffset: int } val locToPregArray = Array.array(localCount, NoLocation) val labelCounter = ref 1 (* Start at 1. Zero is used for the root. *) fun newLabel() = !labelCounter before labelCounter := !labelCounter + 1 val ccRefCounter = ref 0 fun newCCRef() = CcRef(!ccRefCounter) before ccRefCounter := !ccRefCounter + 1 (* The profile object is a single mutable with the F_bytes bit set. *) val profileObject = CodeArray.createProfileObject() (* Switch to indicate if we want to trace where live data has been allocated. *) (* TODO: This should be used in AllocateMemoryOperation and BoxValue and possibly AllocateMemoryVariable. *) val addAllocatingFunction = Debug.getParameter Debug.profileAllocationTag debugSwitches = 1 datatype destination = SpecificPReg of preg | NoResult | AnyReg (* Context type. *) type context = { loopArgs: (preg list * int * int) option, stackPtr: int, currHandler: int option, overflowBlock: int option ref } datatype argLoc = ArgumentIsInReg of { realReg: xReg, argReg: preg } | ArgumentIsOnStack of { stackOffset: int, stackReg: stackLocn } (* An address as either suitable for Load/StoreWithConstantOffset or else Load/StoreWithIndexedOffset. *) datatype addressKind = AddrOffset of {base: preg, offset: int} | AddrIndex of {base: preg, index: preg} (* Pseudo-regs for the result, the closure and the args that were passed in real regs. *) val resultTarget = newPReg() val closureRegAddr = newPReg() val returnAddrReg = newPReg() val generalArgRegs = [X0, X1, X2, X3, X4, X5, X6, X7] (* Create a map for the arguments indicating their register or stack location. *) local (* Select the appropriate argument register depending on the argument type. *) fun argTypesToArgEntries([], _, _) = ([], [], [], []) | argTypesToArgEntries(_ :: tl, gReg :: gRegs, n) = (* This deals with general arguments but also with extra floating point arguments. They are boxed as usual. *) let val (argTypes, argCode, argRegs, stackArgs) = argTypesToArgEntries(tl, gRegs, n-1) val argReg=newPReg() in (ArgumentIsInReg{realReg=gReg, argReg=argReg} :: argTypes, argCode, (argReg, gReg) :: argRegs, stackArgs) end | argTypesToArgEntries(_ :: tl, [], n) = let val (argTypes, argCode, argRegs, stackArgs) = argTypesToArgEntries(tl, [], n-1) val stackLoc = newStackLoc 1 in (ArgumentIsOnStack {stackOffset=n, stackReg = stackLoc } :: argTypes, argCode, argRegs, stackLoc :: stackArgs) end val (argEntries, argCode, argRegs, stackArguments) = argTypesToArgEntries(argTypes, generalArgRegs, List.length argTypes) val clReg = case closure of [] => [] | _ => [(closureRegAddr, X8)] val retReg = [(returnAddrReg, X30)] in val argumentVector = Vector.fromList argEntries (* Start code for the function. 
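argTypesToArgEntries above hands out one general register per argument until X0-X7 run out and then switches to stack locations (the closure address goes in X8 and the return address in X30). A stripped-down sketch of that split; the register strings are placeholders, not the xReg values used in this patch:

fun splitArgs ([], _) = ([], [])
  | splitArgs (arg :: rest, reg :: regs) =
        let val (inRegs, onStack) = splitArgs (rest, regs)
        in ((arg, reg) :: inRegs, onStack) end
  | splitArgs (arg :: rest, []) =
        let val (inRegs, onStack) = splitArgs (rest, [])
        in (inRegs, arg :: onStack) end

(* Ten arguments against eight argument registers: the last two end up on the stack. *)
val (inRegs, onStack) =
    splitArgs (List.tabulate (10, fn i => i),
               ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7"])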
*) val beginInstructions = argCode @ [BlockBegin{regArgs=retReg @ clReg @ argRegs, stackArgs=stackArguments }] (* The number of arguments on the stack. Needed in return instrs and tail calls. *) val currentStackArgs = List.length stackArguments end fun returnInstruction({stackPtr, ...}, resReg, tailCode) = let in BlockExit(ReturnResultFromFunction{resultReg=resReg, returnReg = returnAddrReg, numStackArgs=currentStackArgs}) :: (if stackPtr <> 0 then resetStackPtr{numWords=stackPtr} :: tailCode else tailCode) end fun asTarget(SpecificPReg preg) = preg | asTarget _ = newPReg() fun moveToResult(SpecificPReg tReg, code, sReg) = (moveRegister{source=sReg, dest=tReg} :: code, tReg, false) | moveToResult(AnyReg, code, sReg) = (code, sReg, false) | moveToResult(NoResult, code, sReg) = let val tReg = newPReg() in (moveRegister{source=sReg, dest=tReg} :: code, tReg, false) end (* Store a register at a given offset. This may have to use an index register if the offset is too large. *) fun storeAtWordOffset(toStore, offset, base, loadSize, tailCode) = let val wSize = opWordSize loadSize val byteOffset = offset*wSize in if offset < 4096 andalso byteOffset > ~256 then storeWithConstantOffset{base=base, source=toStore, byteOffset=byteOffset, loadType=loadSize} :: tailCode else let val indexReg = newUReg() in storeWithIndexedOffset{ base=base, source=toStore, index=indexReg, loadType=loadSize } :: loadNonAddressConstant{ source=LargeWord.fromInt offset, dest=indexReg } :: tailCode end end (* Allocate a fixed size cell with a reference to the profile object if we want to trace the location of live data. Currently only used for tuples and closures. *) fun allocateWithProfileRev(n, flags, memAddr, tlCode) = let fun doAllocation(words, flags, tlCode) = let val wordsRequired = if is32in64 then (* Have to round this up to 8 bytes *) Word64.andb(Word64.fromInt(words+2), ~ 0w2) else Word64.fromInt(words+1) val bytesRequired = Word64.fromLarge(Word.toLarge wordSize) * wordsRequired val lengthWord = Word64.orb(Word64.fromInt words, Word64.<<(Word64.fromLarge(Word8.toLarge flags), if is32in64 then 0w24 else 0w56)) val lengthReg = newUReg() in storeWithConstantOffset{ source=lengthReg, base=memAddr, byteOffset= ~(Word.toInt wordSize), loadType=polyWordLoadSize } :: loadNonAddressConstant{ source=lengthWord, dest=lengthReg } :: allocateMemoryFixed{bytesRequired=bytesRequired, dest=memAddr, saveRegs=[]} :: tlCode end in if addAllocatingFunction then let val profReg = newPReg() in storeAtWordOffset(profReg, n, memAddr, polyWordLoadSize, loadAddressConstant{ source=profileObject, dest=profReg} :: doAllocation(n+1, Word8.orb(flags, Address.F_profile), tlCode)) end else doAllocation(n, flags, tlCode) end (* Return a unit result. *) fun returnUnit(target, code, exit) = let val tReg = asTarget target in (loadNonAddressConstant{source=taggedWord64 0w0, dest=tReg} :: code, tReg, exit) end (* Create a bool result from a test by returning true or false. 
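allocateWithProfileRev a little earlier builds the object length word by packing the flags byte into the top byte: the low bits (55 native, 23 in 32-in-64) hold the length in words and the top 8 bits hold the flags. A standalone sketch for the native 64-bit layout; the flag value 0wx41 assumes F_mutable = 0wx40 and F_bytes = 0w1, consistent with the 0wx69 combination used for mutexes later in this file:

fun lengthWord (words: int, flags: Word8.word) : Word64.word =
    Word64.orb (Word64.fromInt words,
                Word64.<< (Word64.fromLarge (Word8.toLarge flags), 0w56))

val _ = lengthWord (3, 0wx41)    (* = 0wx4100000000000003: a 3-word mutable byte cell *)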
*) fun makeBoolResultRev(condition, ccRef, target, testCode) = let val trueLab = newLabel() and falseLab = newLabel() and mergeLab = newLabel() val mergeReg = newMergeReg() in moveRegister{dest=target, source=mergeReg} :: BlockLabel mergeLab :: BlockFlow(Unconditional mergeLab) :: loadNonAddressConstant{dest=mergeReg, source=taggedWord64 0w0} :: BlockLabel falseLab :: BlockFlow(Unconditional mergeLab) :: loadNonAddressConstant{dest=mergeReg, source=taggedWord64 0w1} :: BlockLabel trueLab :: BlockFlow(Conditional{ ccRef=ccRef, condition=condition, trueJump=trueLab, falseJump=falseLab }) :: testCode end (* Return an absolute address in both native addressing and 32-in-64. *) fun getAbsoluteAddress(code, baseReg) = if is32in64 then let val absReg = newUReg() in (objectIndexAddressToAbsolute{ source=baseReg, dest=absReg } :: code, absReg) end else (code, baseReg) (* Load a value aligned on a 64 or 32-bit boundary. offset is the number of units. Typically this will be a polyword. *) fun wordAddressOffset(destination, baseReg1, offset, loadOp, code) = let val dReg = asTarget destination val opWordSize = opWordSize loadOp val byteOffset = offset * opWordSize val (codeBase, baseReg) = getAbsoluteAddress(code, baseReg1) val code = if offset < 4096 andalso byteOffset > ~256 then loadWithConstantOffset{base=baseReg, dest=dReg, byteOffset=byteOffset, loadType=loadOp} :: codeBase else let val indexReg = newUReg() in loadWithIndexedOffset{ base=baseReg, dest=dReg, index=indexReg, loadType=loadOp } :: loadNonAddressConstant{ source=LargeWord.fromInt offset, dest=indexReg } :: codeBase end in (code, dReg, false) end (* See if we have a container and return the entry if present. *) fun getContainerIfPresent(BICExtract(BICLoadLocal l)) = ( case Array.sub(locToPregArray, l) of ContainerLocation container => SOME container | _ => NONE ) | getContainerIfPresent _ = NONE (* General function for loads and stores. *) fun loadAndStoreWithAddress - ({base=bReg1, index, offset}, loadType, isCAddress, loadStoreOffset, loadStoreIndex, code) = + ({base=bReg1, index, offset}, loadSize, loadShift, isCAddress, loadStoreOffset, loadStoreIndex, code) = let - val opWordSize = opWordSize loadType - val byteOffset = offset * opWordSize + val byteOffset = offset * loadSize (* Get the base register value *) val bCode = code val sCode = bCode (* Get any index register value. *) val (iCode, iReg1Opt) = case index of NONE => if offset < 4096 andalso byteOffset > ~256 then (sCode, NONE) (* We can use this offset. *) else let val iReg = newUReg() in (loadNonAddressConstant{ source=LargeWord.fromInt offset, dest=iReg } :: sCode, SOME iReg) end | SOME iReg1 => let val iCode1 = sCode (* The index is a tagged integer containing the number of units (words, bytes etc). It has to be untagged. If this is a C address it may be negative. *) val iReg2 = newUReg() (* Logical shift if this is a Poly address, arithmetic shift if this is a C address. *) val iCode2 = untagValue{source=iReg1, dest=iReg2, opSize=polyWordOpSize, isSigned=isCAddress } :: iCode1 in if offset = 0 then (iCode2, SOME iReg2) else let (* If there's some constant offset add it to the index. Because it's a byte offset we need to divide it by the scale but it should always be a multiple. N.B. In 32-in-64 the index register contains a 32-bit value even when the offset is negative. *) val cReg = newUReg() and iReg3 = newUReg() - val shift = loadShift loadType val offsetAsWord = LargeWord.fromInt offset (* It could be negative if it's a C address. 
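The repeated test offset < 4096 andalso byteOffset > ~256 in storeAtWordOffset, wordAddressOffset and loadAndStoreWithAddress reflects the two ARM64 immediate addressing forms: a scaled unsigned 12-bit element offset and an unscaled signed 9-bit byte offset. Because offset and byteOffset always have the same sign here, the single conjunction covers exactly those two cases; a sketch of the check, with illustrative names:

(* offset is in elements of the access size; byteOffset = offset * element size.
   0 <= offset <= 4095    fits the scaled unsigned immediate form,
   ~255 <= byteOffset < 0 fits the unscaled signed immediate form;
   anything else goes through an index register instead. *)
fun fitsImmediateOffset (offset: int, byteOffset: int) =
    offset < 4096 andalso byteOffset > ~256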
*) - val shiftedOffset = (if isCAddress then LargeWord.~>> else LargeWord.>>) (offsetAsWord, shift) + val shiftedOffset = (if isCAddress then LargeWord.~>> else LargeWord.>>) (offsetAsWord, loadShift) in (addSubRegister{ base=iReg2, shifted=cReg, dest=SOME iReg3, ccRef=NONE, isAdd=true, length=polyWordOpSize, shift=ShiftNone} :: loadNonAddressConstant{ source=shiftedOffset, dest=cReg } :: iCode2, SOME iReg3) end end (* If this is 32in64 get the absolute address. *) val (absBCode, absBReg) = getAbsoluteAddress(iCode, bReg1) (* If this is a C address the "base address" is actually a box containing the address. *) val (effBCode, effBReg) = if isCAddress then let val bReg = newUReg() in (loadWithConstantOffset{ base=absBReg, dest=bReg, byteOffset=0, loadType=Load64 } :: absBCode, bReg) end else (absBCode, absBReg) in case iReg1Opt of SOME iReg => loadStoreIndex(effBReg, iReg, effBCode) | NONE => loadStoreOffset(effBReg, offset, effBCode) end (* Some operations require a single absolute address. These are all ML addresses so the index/offset is always unsigned. *) - fun loadAndStoreWithAbsolute (address, loadType, loadStore, code) = + fun loadAndStoreWithAbsolute (address, loadSize, loadShift, loadStore, code) = let (* Have to add the offset/index register. *) fun loadStoreOffset(bReg, 0, code) = loadStore(bReg, code) | loadStoreOffset(bReg, offset, code) = let val cReg = newUReg() and aReg = newUReg() in loadStore(aReg, addSubRegister{ base=bReg, shifted=cReg, dest=SOME aReg, ccRef=NONE, isAdd=true, length=OpSize64, shift=ShiftNone} :: loadNonAddressConstant{ source=LargeWord.fromInt offset, dest=cReg } :: code) end and loadStoreIndex(bReg, iReg, code) = let val aReg = newUReg() (* The index register is a number of words/bytes etc so has to be multiplied when it's added in. *) val indexShift = - let val v = loadShift loadType in if v = 0w0 then ShiftNone else ShiftLSL(Word8.fromLarge(Word.toLarge v)) end + if loadShift = 0w0 then ShiftNone else ShiftLSL(Word8.fromLarge(Word.toLarge loadShift)) in loadStore(aReg, addSubRegister{ base=bReg, shifted=iReg, dest=SOME aReg, ccRef=NONE, isAdd=true, length=OpSize64, shift=indexShift} :: code) end in - loadAndStoreWithAddress (address, loadType, false, loadStoreOffset, loadStoreIndex, code) + loadAndStoreWithAddress (address, loadSize, loadShift, false, loadStoreOffset, loadStoreIndex, code) end (* Overflow check. This raises Overflow if the condition is satisfied. Normally this will be that the overflow bit is set but for multiplication it's more complicated. This generates a single block for the function unless there is a handler. As well as reducing the size of the code this also means that overflow checks are generally BO instructions to the end of the code. Since the default branch prediction is not to take forward jumps this should improve prefetching on the normal, non-overflow, path. *) fun checkOverflow (condition, {currHandler=NONE, overflowBlock=ref(SOME overFlowLab), ...}, ccRef) = (* It's already been set and there's no surrounding handler - use this. 
*) let val noOverflowLab = newLabel() in [ BlockLabel noOverflowLab, BlockFlow(Conditional{ ccRef=ccRef, condition=condition, trueJump=overFlowLab, falseJump=noOverflowLab }) ] end | checkOverflow (condition, {currHandler=NONE, overflowBlock, ...}, ccRef) = let (* *) val overFlowLab = newLabel() and noOverflowLab = newLabel() val packetReg = newPReg() val () = overflowBlock := SOME overFlowLab in [ BlockLabel noOverflowLab, BlockExit(RaiseExceptionPacket{packetReg=packetReg}), loadAddressConstant{source=toMachineWord(Overflow), dest=packetReg}, BlockLabel overFlowLab, BlockFlow(Conditional{ ccRef=ccRef, condition=condition, trueJump=overFlowLab, falseJump=noOverflowLab }) ] end | checkOverflow (condition, {currHandler=SOME h, ...}, ccRef) = let val overFlowLab = newLabel() and noOverflowLab = newLabel() val packetReg = newPReg() in [ BlockLabel noOverflowLab, BlockRaiseAndHandle(RaiseExceptionPacket{packetReg=packetReg}, h), loadAddressConstant{source=toMachineWord(Overflow), dest=packetReg}, BlockLabel overFlowLab, BlockFlow(Conditional{ ccRef=ccRef, condition=condition, trueJump=overFlowLab, falseJump=noOverflowLab }) ] end fun codeToICodeRev(BICNewenv (bindings, exp), context: context as {stackPtr=initialSp, ...}, isTail, destination, tailCode) = let (* Process a list of bindings. We need to accumulate the space used by any containers and reset the stack pointer at the end if necessary. *) fun doBindings([], context, tailCode) = (tailCode, context) | doBindings(BICDeclar{value=BICExtract(BICLoadLocal l), addr, ...} :: decs, context, tailCode) = let (* Giving a new name to an existing entry. This should have been removed at a higher level but it doesn't always seem to be. In particular we must treat this specially if it's a container. *) val original = Array.sub(locToPregArray, l) val () = Array.update(locToPregArray, addr, original) in doBindings(decs, context, tailCode) end | doBindings(BICDeclar{value, addr, ...} :: decs, context, tailCode) = let val (code, dest, _) = codeToICodeRev(value, context, false, AnyReg, tailCode) val () = Array.update(locToPregArray, addr, PregLocation dest) in doBindings(decs, context, code) end | doBindings(BICRecDecs [{lambda, addr, ...}] :: decs, context, tailCode) = (* We shouldn't have single entries in RecDecs but it seems to occur at the moment. *) let val dest = newPReg() val (code, _, _) = codeToICodeRev(BICLambda lambda, context, false, SpecificPReg dest, tailCode) val () = Array.update(locToPregArray, addr, PregLocation dest) in doBindings(decs, context, code) end | doBindings(BICRecDecs recDecs :: decs, context, tailCode) = let val destRegs = map (fn _ => newPReg()) recDecs val flagsValue = if is32in64 then F_closure else 0w0 (* First build the closures as mutable cells containing zeros. Set the entry in the address table to the register containing the address. 
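The BICRecDecs case above uses the usual two-pass recipe for mutually recursive closures: first allocate every closure as a mutable cell of zeros so that all the addresses exist, then go back and store the real contents, finally clearing the mutable flag. The same shape in miniature, using plain refs rather than heap cells; tieKnot and its builder functions are illustrative names only:

fun tieKnot (builders: (int ref list -> int) list) : int ref list =
    let
        val cells = List.map (fn _ => ref 0) builders      (* pass 1: allocate placeholders *)
        val () = ListPair.appEq (fn (cell, build) => cell := build cells)
                                (cells, builders)          (* pass 2: fill them in *)
    in
        cells
    end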
*) fun makeClosure({lambda={closure, ...}, addr, ...}, dest, tailCode) = let val () = Array.update(locToPregArray, addr, PregLocation dest) val wordsRequired = List.length closure + (if is32in64 then 2 else 1) val absAddr = if is32in64 then newUReg() else dest val zeroReg = newPReg() val allocAndSetZero = loadNonAddressConstant{ source=taggedWord64 0w0, dest=zeroReg} :: allocateWithProfileRev(wordsRequired, Word8.orb(F_mutable, flagsValue), absAddr, tailCode) val (_, clearCode) = List.foldl(fn (_, (n, l)) => (n+1, storeAtWordOffset(zeroReg, n, absAddr, polyWordLoadSize, l))) (0, allocAndSetZero) closure in if is32in64 then absoluteToObjectIndex{ source=absAddr, dest=dest } :: clearCode else clearCode end val allocClosures = ListPair.foldlEq makeClosure tailCode (recDecs, destRegs) fun setClosure({lambda, ...}, dest, l) = let val absAddr = if is32in64 then newUReg() else dest val flagsReg = newUReg() (* Lock the closure by storing the flags byte without the mutable flag. TODO: We could simply use XZ here. *) in storeWithConstantOffset{ base=absAddr, source=flagsReg, byteOffset=flagsByteOffset, loadType=Load8 } :: loadNonAddressConstant{ source=Word8.toLarge flagsValue, dest=flagsReg } :: storeIntoClosure(lambda, absAddr, context, if is32in64 then objectIndexAddressToAbsolute{ source=dest, dest=absAddr } :: l else l) end val setAndLockClosures = ListPair.foldlEq setClosure allocClosures (recDecs, destRegs) in doBindings(decs, context, setAndLockClosures) end | doBindings(BICNullBinding exp :: decs, context, tailCode) = let val (code, _, _) = codeToICodeRev(exp, context, false, NoResult, tailCode) (* And discard result. *) in doBindings(decs, context, code) end | doBindings(BICDecContainer{ addr, size } :: decs, {loopArgs, stackPtr, currHandler, overflowBlock}, tailCode) = let val containerLoc = newStackLoc size val () = Array.update(locToPregArray, addr, ContainerLocation{container=containerLoc, stackOffset=stackPtr+size}) val zeroReg = newPReg() in doBindings(decs, {loopArgs=loopArgs, stackPtr=stackPtr+size, currHandler=currHandler, overflowBlock=overflowBlock}, pushToStack{copies=size, container=containerLoc, source=zeroReg} :: loadNonAddressConstant{ source=taggedWord64 0w0, dest=zeroReg } :: tailCode) end val (codeBindings, resContext as {stackPtr=finalSp, ...}) = doBindings(bindings, context, tailCode) (* If we have had a container we'll need to reset the stack *) in if initialSp <> finalSp then let val _ = finalSp >= initialSp orelse raise InternalError "codeToICode - stack ptr" val bodyReg = newPReg() and resultReg = asTarget destination val (codeExp, result, haveExited) = codeToICodeRev(exp, resContext, isTail, SpecificPReg bodyReg, codeBindings) val afterAdjustSp = if haveExited then codeExp else moveRegister{source=result, dest=resultReg} :: resetStackPtr{numWords=finalSp-initialSp} :: codeExp in (afterAdjustSp, resultReg, haveExited) end else codeToICodeRev(exp, resContext, isTail, destination, codeBindings) end | codeToICodeRev(BICExtract(BICLoadLocal l), {stackPtr, ...}, _, destination, tailCode) = ( case Array.sub(locToPregArray, l) of NoLocation => raise InternalError "codeToICodeRev - local unset" | PregLocation preg => moveToResult(destination, tailCode, preg) | ContainerLocation{container, stackOffset} => let val target = asTarget destination in (containerAddress{dest=target, container=container, stackOffset=stackPtr-stackOffset} :: tailCode, target, false) end ) | codeToICodeRev(BICExtract(BICLoadArgument a), {stackPtr, ...}, _, destination, tailCode) = ( case 
Vector.sub(argumentVector, a) of ArgumentIsInReg{argReg, ...} => (* It was originally in a register. It's now in a preg. *) moveToResult(destination, tailCode, argReg) | ArgumentIsOnStack{stackOffset, stackReg} => (* Pushed before call. *) let val target = asTarget destination in (loadStack{wordOffset=stackOffset+stackPtr, container=stackReg, field=0, dest=target} :: tailCode, target, false) end ) | codeToICodeRev(BICExtract(BICLoadClosure c), _, _, destination, tailCode) = let (* Add the number of words for the code address. This is 1 in native but 2 in 32-in-64. *) val offset = if is32in64 then c+2 else c+1 in if c >= List.length closure then raise InternalError "BICExtract: closure" else (); wordAddressOffset(destination, closureRegAddr, offset, polyWordLoadSize, tailCode) end | codeToICodeRev(BICExtract BICLoadRecursive, _, _, destination, tailCode) = (* If the closure is empty we must use the constant. We can't guarantee that the caller will actually load the closure register if it knows the closure is empty. *) ( case closure of [] => let val dReg = asTarget destination in (loadAddressConstant{source=closureAsAddress resultClosure, dest=dReg} :: tailCode, dReg, false) end | _ => moveToResult(destination, tailCode, closureRegAddr) ) | codeToICodeRev(BICConstnt(w, _), _, _, destination, tailCode) = let val dReg = asTarget destination val instr = if isShort w then (* When converting to Word64 we do NOT want to use sign-extension. In 32-in-64 signed fixed-precision ints need to have zeros in the top 32 bits. *) loadNonAddressConstant{source=taggedWord64(Word64.fromLarge(Word.toLarge(toShort w))), dest=dReg} else loadAddressConstant{source=w, dest=dReg} in (instr :: tailCode, dReg, false) end | codeToICodeRev(BICField{base, offset}, context, _, destination, tailCode) = let val (codeBase, baseReg, _) = codeToICodeRev(base, context, false, AnyReg, tailCode) in wordAddressOffset(destination, baseReg, offset, polyWordLoadSize, codeBase) end | codeToICodeRev(BICCond(test, thenPt, elsePt), context, isTail, NoResult, tailCode) = let (* If we don't want the result but are only evaluating for side-effects we may be able to optimise special cases. This was easier in the forward case but for now we don't bother and leave it to the lower levels. *) val startElse = newLabel() and skipElse = newLabel() val codeTest = codeConditionRev(test, context, false, startElse, tailCode) val (codeThen, _, _) = codeToICodeRev(thenPt, context, isTail, NoResult, codeTest) val (codeElse, _, _) = codeToICodeRev(elsePt, context, isTail, NoResult, BlockLabel startElse :: BlockFlow(Unconditional skipElse) :: codeThen) in returnUnit(NoResult, BlockLabel skipElse :: codeElse, false(*??*)) end | codeToICodeRev(BICCond(test, thenPt, elsePt), context, isTail, destination, tailCode) = let (* Because we may push the result onto the stack we have to create a new preg to hold the result and then copy that to the final result. *) (* If this is a tail each arm will exit separately and neither will return a result. *) val target = asTarget destination val condResult = newMergeReg() val thenTarget = if isTail then newPReg() else condResult val startElse = newLabel() val testCode = codeConditionRev(test, context, false, startElse, tailCode) (* Put the result in the target register. *) val (thenCode, _, thenExited) = codeToICodeRev(thenPt, context, isTail, SpecificPReg thenTarget, testCode) (* Add a jump round the else-part except that if this is a tail we return. The then-part could have exited e.g. with a raise or a loop. 
*) val (exitThen, thenLabel, elseTarget) = if thenExited then (thenCode, [], target (* Can use original target. *)) else if isTail then (returnInstruction(context, thenTarget, thenCode), [], newPReg()) else let val skipElse = newLabel() in (BlockFlow(Unconditional skipElse) :: thenCode, [moveRegister{source=condResult, dest=target}, BlockLabel skipElse], condResult) end val (elseCode, _, elseExited) = codeToICodeRev(elsePt, context, isTail, SpecificPReg elseTarget, BlockLabel startElse :: exitThen) (* Add a return to the else-part if necessary so we will always exit on a tail. *) val exitElse = if isTail andalso not elseExited then returnInstruction(context, elseTarget, elseCode) else elseCode in (thenLabel @ exitElse, target, isTail orelse thenExited andalso elseExited) end | codeToICodeRev(BICUnary instr, context, isTail, destination, tailCode) = codeToICodeUnaryRev(instr, context, isTail, destination, tailCode) | codeToICodeRev(BICBinary instr, context, isTail, destination, tailCode) = codeToICodeBinaryRev(instr, context, isTail, destination, tailCode) | codeToICodeRev(BICTagTest{test, tag=tagValue, ...}, context, isTail, destination, tailCode) = (* Check the "tag" word of a union (datatype). N.B. Not the same as testing the tag bit of a word. Just generate it as a general word comparison. The optimiser will sort out whether the tag value can be an immediate. *) codeToICodeRev(BICBinary{oper=WordComparison{test=TestEqual, isSigned=false}, arg1=test, arg2=BICConstnt(toMachineWord tagValue, [])}, context, isTail, destination, tailCode) | codeToICodeRev(BICTuple fields, context, _, destination, tailCode) = let val target = asTarget destination (* The allocator sets the register to the absolute address. It has to be converted to an object pointer in 32-in-64. *) val absAddr = if is32in64 then newUReg() else target fun loadFields([], n, tlCode) = allocateWithProfileRev(n, 0w0, absAddr, tlCode) | loadFields((f as BICConstnt _) :: rest, n, tlCode) = let (* Unlike the X86 we still need to load a constant into a register in order to store it in the new tuple. However, it's better to leave that until after the allocation and move it then. That way we can use the same register for different constants if we have a very large tuple. *) val restAndAlloc = loadFields(rest, n+1, tlCode) val (code1, source, _) = codeToICodeRev(f, context, false, AnyReg, restAndAlloc) in storeAtWordOffset(source, n, absAddr, polyWordLoadSize, code1) end | loadFields(f :: rest, n, tlCode) = let val (code1, source, _) = codeToICodeRev(f, context, false, AnyReg, tlCode) val restAndAlloc = loadFields(rest, n+1, code1) in storeAtWordOffset(source, n, absAddr, polyWordLoadSize, restAndAlloc) end val allocAndStore = loadFields(fields, 0, tailCode) val code = if is32in64 then absoluteToObjectIndex{source=absAddr, dest=target} :: allocAndStore else allocAndStore in (code, target, false) end | codeToICodeRev(BICRaise exc, context as { currHandler, ...}, _, destination, tailCode) = let val (code, packetReg, _) = codeToICodeRev(exc, context, false, AnyReg, tailCode) val raiseCode = RaiseExceptionPacket{packetReg=packetReg} val block = case currHandler of NONE => BlockExit raiseCode | SOME h => BlockRaiseAndHandle(raiseCode, h) in returnUnit(destination, block :: code, true (* Always exits *)) end | codeToICodeRev(BICEval{function, argList, ...}, context as { currHandler, ...}, isTail, destination, tailCode) = let val target = asTarget destination (* Create pregs for the closure and each argument. 
*) val clPReg = newPReg() (* If we have a constant closure we can go directly to the entry point. If the closure is a single word we don't need to load the closure register. *) val (functionCode, closureEntry, callKind) = case function of BICConstnt(addr, _) => let val addrAsAddr = toAddress addr (* If this is a closure we're still compiling we can't get the code address. However if this is directly recursive we can use the recursive convention. *) in if wordEq(closureAsAddress resultClosure, addr) then (tailCode, [], Recursive) else if flags addrAsAddr <> Address.F_words andalso flags addrAsAddr <> Address.F_closure then (loadAddressConstant{source=addr, dest=clPReg} :: tailCode, [(ArgInReg clPReg, X8)], FullCall) else if is32in64 then (* The code address is a 64-bit value so we have to load it at run-time. The X86 version passes the closure address here and generates a relative CALL/JMP. The actual offset is computed by the RTS. For the moment just use a full call. *) (loadAddressConstant{source=addr, dest=clPReg} :: tailCode, [(ArgInReg clPReg, X8)], FullCall) else (* Native 64-bits. *) let val addrLength = length addrAsAddr val _ = addrLength >= 0w1 orelse raise InternalError "BICEval address" val codeAddr = loadWord(addrAsAddr, 0w0) val _ = isCode (toAddress codeAddr) orelse raise InternalError "BICEval address not code" in if addrLength = 0w1 then (tailCode, [], ConstantCode codeAddr) else (loadAddressConstant{source=addr, dest=clPReg} :: tailCode, [(ArgInReg clPReg, X8)], ConstantCode codeAddr) end end | BICExtract BICLoadRecursive => ( (* If the closure is empty we don't need to load X8 *) case closure of [] => (tailCode, [], Recursive) | _ => (moveRegister {source=closureRegAddr, dest=clPReg} :: tailCode, [(ArgInReg clPReg, X8)], Recursive) ) | function => (* General case. *) (#1 (codeToICodeRev(function, context, false, SpecificPReg clPReg, tailCode)), [(ArgInReg clPReg, X8)], FullCall) (* Load the first arguments into registers and the rest to the stack. *) fun loadArgs ([], _, tailCode) = (tailCode, [], []) | loadArgs ((arg, _) :: args, gReg::gRegs, tailCode) = let (* General register argument. *) val (c, r, _) = codeToICodeRev(arg, context, false, AnyReg, tailCode) val (code, regArgs, stackArgs) = loadArgs(args, gRegs, c) in (code, (ArgInReg r, gReg) :: regArgs, stackArgs) end | loadArgs ((arg, _) :: args, [], tailCode) = let (* Stack argument. *) val (c, r, _) = codeToICodeRev(arg, context, false, AnyReg, tailCode) val (code, regArgs, stackArgs) = loadArgs(args, [], c) in (code, regArgs, ArgInReg r :: stackArgs) end val (codeArgs, regArgs, stackArgs) = loadArgs(argList, generalArgRegs, functionCode) (* If this is at the end of the function and the result types are the same we can use a tail-recursive call. *) val tailCall = isTail (*andalso resultType = fnResultType*) val callCode = if tailCall then let val {stackPtr, ...} = context (* The number of arguments currently on the stack. *) val currentStackArgCount = currentStackArgs val newStackArgCount = List.length stackArgs (* The offset of the first argument. Offsets can be negative. *) val stackOffset = stackPtr fun makeStackArgs([], _) = [] | makeStackArgs(arg::args, offset) = {src=arg, stack=offset} :: makeStackArgs(args, offset-1) val stackArgs = makeStackArgs(stackArgs, currentStackArgCount-1) (* The stack adjustment needed to compensate for any items that have been pushed and the differences in the number of arguments. May be positive or negative. 
*) val stackAdjust = currentStackArgCount - newStackArgCount (* Add an entry for the return address to the register arguments. *) in BlockExit(TailRecursiveCall{regArgs=(ArgInReg returnAddrReg, X30) :: closureEntry @ regArgs, stackArgs=stackArgs, stackAdjust = stackAdjust, currStackSize=stackOffset, callKind=callKind}) :: codeArgs end else let local (* When a container is passed as an argument we put the address into a register. Normally the container will be referenced after the call in order to extract the values but if it's discarded we need to make sure it will continue to be referenced at least as far as the call. This isn't a problem for the X86 code-generator since container addresses are a form of the "argument" datatype. *) fun getContainers(BICExtract(BICLoadLocal l), _) = ( case Array.sub(locToPregArray, l) of ContainerLocation{container, ...} => SOME container | _ => NONE ) | getContainers _ = NONE in val containers = List.mapPartial getContainers argList end val call = FunctionCall{regArgs=closureEntry @ regArgs, stackArgs=stackArgs, dest=target, callKind=callKind, saveRegs=[], containers=containers} val callBlock = case currHandler of NONE => BlockSimple call :: codeArgs | SOME h => BlockOptionalHandle{call=call, handler=h, label=newLabel()} :: codeArgs in callBlock end in (callCode, target, tailCall (* We've exited if this was a tail jump *)) end | codeToICodeRev(BICNullary{oper=BuiltIns.GetCurrentThreadId}, _, _, destination, tailCode) = (* Get the ID of the current thread. *) let val target = asTarget destination in - (loadMemReg{wordOffset=threadIdOffset, dest=target} :: tailCode, target, false) + (getThreadId{dest=target} :: tailCode, target, false) end | codeToICodeRev(BICNullary{oper=BuiltIns.CheckRTSException}, _, _, destination, tailCode) = (* This is now done in the RTS call code. *) returnUnit(destination, tailCode, false) | codeToICodeRev(BICNullary {oper=CreateMutex}, _, _, destination, tailCode) = let (* Allocate memory for a mutex. Use a native word as a mutable, weak, no-overwrite, byte cell which is the same as a volatileRef. This ensures that it will always be cleared when it is loaded even if it was locked when it was saved. *) val target = asTarget destination val flags = Word8.orb(F_mutable, Word8.orb(F_weak, Word8.orb(F_noOverwrite, F_bytes))) (* 0wx69 *) val absAddr = if is32in64 then newUReg() else target val zeroReg = newUReg() val allocAndStore = storeWithConstantOffset{ source=zeroReg, base=absAddr, byteOffset=0, loadType=Load64 } :: loadNonAddressConstant{source=0w0, dest=zeroReg} :: allocateWithProfileRev(if is32in64 then 2 else 1, flags, absAddr, tailCode) val code = if is32in64 then absoluteToObjectIndex{source=absAddr, dest=target} :: allocAndStore else allocAndStore in (code, target, false) end | codeToICodeRev(BICArbitrary { oper=ArithMult, longCall, ... }, context, isTail, destination, tailCode) = (* Just call the long function to do this. Overflow detection makes this too complicated. *) codeToICodeRev(longCall, context, isTail, destination, tailCode) | codeToICodeRev(BICArbitrary { oper, shortCond, arg1, arg2, longCall }, context, _, destination, tailCode) = let val startLong = newLabel() and resultLabel = newLabel() val condResult = newMergeReg() (* Test to see if the arguments are short and go straight to the long case if not. 
*) val testCode = codeConditionRev(shortCond, context, false, startLong, tailCode) (* Do the short case *) val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, testCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) (* We need to subtract the tag from one of the arguments and then do the addition. The optimiser will do the subtraction at compile time if we subtract from a constant so if this is and Add we try to put the constant in the second arg. *) val (firstReg, secondReg) = case (arg1, oper) of (BICConstnt _, ArithAdd) => (aReg2, aReg1) | _ => (aReg1, aReg2) (* Generate code for the short case. Put the result in the merge register. Jump to the result if there's no overflow and to the long case if there is. *) val codeShort = case oper of ArithAdd => let val uReg = newUReg() and chkOverflow = newCCRef() in BlockFlow(Conditional{ ccRef=chkOverflow, condition=CondOverflow, trueJump=startLong, falseJump=resultLabel }) :: addSubRegister{base=firstReg, shifted=uReg, dest=SOME condResult, length=polyWordOpSize, ccRef=SOME chkOverflow, isAdd=true, shift=ShiftNone} :: addSubImmediate{dest=SOME uReg, source=secondReg, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false}:: arg2Code end | ArithSub => let val uReg = newUReg() and chkOverflow = newCCRef() in BlockFlow(Conditional{ ccRef=chkOverflow, condition=CondOverflow, trueJump=startLong, falseJump=resultLabel }) :: addSubRegister{base=firstReg, shifted=uReg, dest=SOME condResult, length=polyWordOpSize, ccRef=SOME chkOverflow, isAdd=false, shift=ShiftNone} :: addSubImmediate{dest=SOME uReg, source=secondReg, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false}:: arg2Code end | _ => raise InternalError "BICArbitrary: unimplemented operation" (* Code for the long case. Put the result into the merge register. *) (* TODO: This could use a tail call if this is at the end of the function. *) val (codeLong, _, _) = codeToICodeRev(longCall, context, false, SpecificPReg condResult, BlockLabel startLong :: codeShort) val target = asTarget destination (* Copy the merge register into the result. *) val finalCode = moveRegister{source=condResult, dest=target} :: BlockLabel resultLabel :: codeLong in (finalCode, target, false) end | codeToICodeRev(BICLambda(lambda as { closure = [], ...}), _, _, destination, tailCode) = (* Empty closure - create a constant closure for any recursive calls. *) let val closure = makeConstantClosure() val () = codeFunctionToArm64(lambda, debugSwitches, closure) val dReg = asTarget destination (* Return the closure itself as the value. *) in (BlockSimple(LoadAddressConstant{source=closureAsAddress closure, dest=dReg}) :: tailCode, dReg, false) end | codeToICodeRev(BICLambda(lambda as { closure, ...}), context, _, destination, tailCode) = (* Non-empty closure. Ignore stack closure option at the moment. *) let val wordsRequired = List.length closure + (if is32in64 then 2 else 1) val target = asTarget destination val absAddr = if is32in64 then newUReg() else target (* The values we're storing are all either constants or local/closure variables so we can allocate the memory and then store into it. 
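The short path of BICArbitrary just above relies on a small identity: with the tag n -> 2n+1, subtracting a single tag bit from one operand lets a plain machine add or subtract produce a correctly tagged result, with the hardware overflow flag deciding whether to fall back to the long-precision call. Checking the arithmetic in isolation (overflow is ignored here and the names are illustrative):

(* tag a + (tag b - 1) = (2a+1) + 2b = 2(a+b) + 1 = tag (a+b)
   tag a - (tag b - 1) = (2a+1) - 2b = 2(a-b) + 1 = tag (a-b) *)
fun tag n = n * 2 + 1
fun taggedAdd (ta, tb) = ta + (tb - 1)
fun taggedSub (ta, tb) = ta - (tb - 1)
val _ = taggedAdd (tag 3, tag 4) = tag 7     (* true *)
val _ = taggedSub (tag 3, tag 4) = tag ~1    (* true *)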
*) val allocCode = allocateWithProfileRev(wordsRequired, if is32in64 then F_closure else 0w0, absAddr, tailCode) val storeCode = storeIntoClosure(lambda, absAddr, context, allocCode) val finalCode = if is32in64 then BlockSimple(AbsoluteToObjectIndex{source=absAddr, dest=target}) :: storeCode else storeCode in (finalCode, target, false) end | codeToICodeRev(BICCase { cases, test, default, isExhaustive, firstIndex}, context, isTail, destination, tailCode) = let (* We have to create a new preg for the result in case we need to push it to the stack. *) val targetReg = newMergeReg() local val (testCode, initialTestReg, _) = codeToICodeRev(test, context, false, AnyReg, tailCode) (* Subtract the minimum even if it is zero to remove the tag. This leaves us with a shifted but untagged value. Don't check for overflow. Instead allow large values to wrap around and check later. *) val cReg1 = newUReg() val subValue = taggedWord64(Word64.fromLarge(Word.toLargeX firstIndex)) in val testReg = newUReg() val testCode = addSubRegister{ base=initialTestReg, shifted=cReg1, dest=SOME testReg, ccRef=NONE, isAdd=false, length=polyWordOpSize, shift=ShiftNone} :: loadNonAddressConstant{ source=subValue, dest=cReg1 } :: testCode end val (rangeCheck, extraDefaults) = if isExhaustive then (testCode, []) else let (* Check the value is within the number of cases, *2 because this is shifted. *) val cReg2 = newUReg() and ccRef1 = newCCRef() val nCases = List.length cases val continueLab = newLabel() and defaultLab1 = newLabel() val rangeCheck = BlockLabel continueLab :: BlockFlow(Conditional{ccRef=ccRef1, condition=CondCarrySet, trueJump=defaultLab1, falseJump=continueLab}) :: addSubRegister{base=testReg, shifted=cReg2, dest=NONE, ccRef=SOME ccRef1, isAdd=false, length=OpSize64, shift=ShiftNone} :: loadNonAddressConstant{ source=Word64.fromInt nCases * 0w2, dest=cReg2 } :: testCode in (rangeCheck, [defaultLab1]) end (* Make a label for each item in the list. *) val codeLabels = map (fn _ => newLabel()) cases (* Create an exit label in case it's needed. *) val labelForExit = newLabel() (* Generate the code for each of the cases and the default. We need to put an unconditional branch after each to skip the other cases. *) fun codeCases (SOME c :: otherCases, startLabel :: otherLabels, tailCode) = let val caseTarget = if isTail then newPReg() else targetReg (* Put in the case with a jump to the end of the sequence. *) val (codeThisCase, _, caseExited) = codeToICodeRev(c, context, isTail, SpecificPReg caseTarget, BlockLabel startLabel :: tailCode) val exitThisCase = if caseExited then codeThisCase else if isTail then returnInstruction(context, caseTarget, codeThisCase) else BlockFlow(Unconditional labelForExit) :: codeThisCase in codeCases(otherCases, otherLabels, exitThisCase) end | codeCases(NONE :: otherCases, _ :: otherLabels, tailCode) = codeCases(otherCases, otherLabels, tailCode) | codeCases ([], [], tailCode) = let (* We need to add labels for all the gaps we filled and also for a "default" label for the indexed-case instruction itself as well as any range checks. *) fun addDefault (startLabel, NONE, l) = BlockLabel startLabel :: l | addDefault (_, SOME _, l) = l fun asForward l = BlockLabel l val dLabs = map asForward extraDefaults @ tailCode val defLabels = ListPair.foldlEq addDefault dLabs (codeLabels, cases) val defaultTarget = if isTail then newPReg() else targetReg val (defaultCode, _, defaultExited) = codeToICodeRev(default, context, isTail, SpecificPReg defaultTarget, defLabels) in (* Put in the default. 
Because this is the last we don't need to jump round it. However if this is a tail and we haven't exited we put in a return. That way the case will always have exited if this is a tail. *) if isTail andalso not defaultExited then returnInstruction(context, defaultTarget, defaultCode) else defaultCode end | codeCases _ = raise InternalError "codeCases: mismatch" val codedCases = codeCases(cases, codeLabels, BlockFlow(IndexedBr codeLabels) :: BlockSimple(IndexedCaseOperation{testReg=testReg}) :: rangeCheck) (* We can now copy to the target. If we need to push the result this load will be converted into a push. *) val target = asTarget destination val copyToTarget = if isTail then codedCases else moveRegister{source=targetReg, dest=target} :: BlockLabel labelForExit :: codedCases in (copyToTarget, target, isTail (* We have always exited on a tail. *)) end | codeToICodeRev(BICBeginLoop {loop, arguments}, context as { stackPtr, currHandler, overflowBlock, ...}, isTail, destination, tailCode) = let val target = asTarget destination fun codeArgs ([], tailCode) = ([], tailCode) | codeArgs (({value, addr}, _) :: rest, tailCode) = let val pr = newPReg() val () = Array.update(locToPregArray, addr, PregLocation pr) val (code, _, _) = codeToICodeRev(value, context, false, SpecificPReg pr, tailCode) val (pregs, othercode) = codeArgs(rest, code) in (pr::pregs, othercode) end val (loopRegs, argCode) = codeArgs(arguments, tailCode) val loopLabel = newLabel() val (loopBody, _, loopExited) = codeToICodeRev(loop, {loopArgs=SOME (loopRegs, loopLabel, stackPtr), stackPtr=stackPtr, currHandler=currHandler, overflowBlock=overflowBlock }, isTail, SpecificPReg target, BlockLabel loopLabel :: BlockSimple BeginLoop :: argCode) in (loopBody, target, loopExited) end | codeToICodeRev(BICLoop args, context as {loopArgs=SOME (loopRegs, loopLabel, loopSp), stackPtr, currHandler, ...}, _, destination, tailCode) = let val target = asTarget destination (* Registers to receive the evaluated arguments. We can't put the values into the loop variables yet because the values could depend on the current values of the loop variables. *) val argPRegs = map(fn _ => newPReg()) args val codeArgs = ListPair.foldlEq(fn ((arg, _), pr, l) => #1 (codeToICodeRev(arg, context, false, SpecificPReg pr, l))) tailCode (args, argPRegs) val jumpArgs = ListPair.mapEq(fn (s, l) => {src=ArgInReg s, dst=l}) (argPRegs, loopRegs) (* If we've allocated a container in the loop we have to remove it before jumping back. *) val stackReset = if loopSp = stackPtr then codeArgs else resetStackPtr{numWords=stackPtr-loopSp} :: codeArgs val jumpLoop = JumpLoop{regArgs=jumpArgs, stackArgs=[], checkInterrupt=SOME[]} (* "checkInterrupt" could result in a Interrupt exception so we treat this like a function call. *) val code = case currHandler of NONE => BlockFlow(Unconditional loopLabel) :: BlockSimple jumpLoop :: stackReset | SOME h => BlockOptionalHandle{call=jumpLoop, handler=h, label=loopLabel} :: stackReset in (code, target, true) end | codeToICodeRev(BICLoop _, {loopArgs=NONE, ...}, _, _, _) = raise InternalError "BICLoop without BICBeginLoop" (* Copy the source tuple into the container. There are important special cases for both the source tuple and the container. If the source tuple is a BICTuple we have the fields and can store them without creating a tuple on the heap. If the destination is a local container we can store directly into the stack. 
*) | codeToICodeRev(BICSetContainer{container, tuple, filter}, context as {stackPtr, ...}, _, destination, tailCode) = let local fun createStore containerReg (source, destWord, tail) = storeAtWordOffset(source, destWord, containerReg, Load64, tail) in val findContainer = case container of BICExtract(BICLoadLocal l) => ( case Array.sub(locToPregArray, l) of ContainerLocation{container, stackOffset} => let fun store(source, destWord, tail) = storeToStack{source=source, container=container, field=destWord, stackOffset=stackPtr-stackOffset+destWord} :: tail in SOME store end | _ => NONE ) | _ => NONE val (codeContainer, storeInstr) = case findContainer of SOME storeToStack => (tailCode, storeToStack) | NONE => let val containerTarget = newPReg() val (codeContainer, _, _) = codeToICodeRev(container, context, false, SpecificPReg containerTarget, tailCode) in (codeContainer, createStore containerTarget) end end val filterLength = BoolVector.length filter val code = case tuple of BICTuple cl => let (* In theory it's possible that the tuple could contain fields that are not used but nevertheless need to be evaluated for their side-effects. Create all the fields and push to the stack. *) fun codeField(arg, (regs, tailCode)) = let val (c, r, _) = codeToICodeRev(arg, context, false, AnyReg, tailCode) in (r :: regs, c) end val (pregsRev, codeFields) = List.foldl codeField ([], codeContainer) cl val pregs = List.rev pregsRev fun copyField(srcReg, (sourceWord, destWord, tailCode)) = if sourceWord < filterLength andalso BoolVector.sub(filter, sourceWord) then (sourceWord+1, destWord+1, storeInstr(srcReg, destWord, tailCode)) else (sourceWord+1, destWord, tailCode) val (_, _, resultCode) = List.foldl copyField (0, 0, codeFields) pregs in resultCode end | tuple => let (* Copy a heap tuple. It is possible that this is another container in which case we must load the fields directly. We mustn't load its address and then copy because loading the address would be the last reference and might cause the container to be reused prematurely. ??? Is that an old comment ?? *) val (codeTuple, loadField) = case getContainerIfPresent tuple of SOME {container, stackOffset} => let fun getAddr(destReg, sourceWord, tail) = loadStack{dest=destReg, wordOffset=stackPtr-stackOffset+sourceWord, container=container, field=sourceWord} :: tail in (codeContainer, getAddr) end | NONE => let val (codeTuple, tupleTarget, _) = codeToICodeRev(tuple, context, false, AnyReg, codeContainer) fun loadField(destReg: preg, sourceWord: int, tail): blockStruct list = let val (code, _, _) = wordAddressOffset(SpecificPReg destReg, tupleTarget, sourceWord, polyWordLoadSize, tail) in code end in (codeTuple, loadField) end fun copyContainer(sourceWord, destWord, tailCode) = if sourceWord = filterLength then tailCode else if BoolVector.sub(filter, sourceWord) then let val loadReg = newPReg() val code = storeInstr(loadReg, destWord, loadField(loadReg, sourceWord, tailCode)) in copyContainer(sourceWord+1, destWord+1, code) end else copyContainer(sourceWord+1, destWord, tailCode) in copyContainer(0, 0, codeTuple) end in returnUnit(destination, code, false) end | codeToICodeRev(BICLoadContainer{base, offset}, context as {stackPtr, ...}, _, destination, tailCode) = ( case getContainerIfPresent base of SOME {container, stackOffset} => let (* If this is a local container we extract the field. 
*) val target = asTarget destination val finalOffset = stackPtr-stackOffset+offset val _ = finalOffset >= 0 orelse raise InternalError "offset" in (BlockSimple(LoadStack{wordOffset=finalOffset, container=container, field=offset, dest=target}) :: tailCode, target, false) end | NONE => let val (codeBase, baseEntry, _) = codeToICodeRev(base, context, false, AnyReg, tailCode) in wordAddressOffset(destination, baseEntry, offset, Load64, codeBase) end ) | codeToICodeRev(BICLoadOperation{ kind, address}, context, _, destination, tailCode) = codeLoadOperation(kind, address, context, asTarget destination, tailCode) | codeToICodeRev(BICStoreOperation{ kind, address, value}, context, _, destination, tailCode) = codeStoreOperation(kind, address, value, context, destination, tailCode) | codeToICodeRev(BICBlockOperation{ kind=BlockOpMove{isByteMove}, sourceLeft, destRight, length }, context, _, destination, tailCode) = (* Assume these are copying immutable data i.e. vector to vector and string to string. The simplifier now assumes that when optimising short constant moves e.g. concatenating with a constant string. *) let (* Move bytes or words from the source to the destination. Need to get the start addresses and length into new registers because they will be modified. *) val (leftAddr, codeLft) = addressToPregAddress(sourceLeft, context, tailCode) val (rightAddr, codeRt) = addressToPregAddress(destRight, context, codeLft) val (codeLength, lengthReg, _) = codeToICodeRev(length, context, false, AnyReg, codeRt) val loadOp = if isByteMove then Load8 else if is32in64 then Load32 else Load64 (* This threads the calls through two calls to loadAndStoreWithAbsolute to compute the addresses. *) fun getDestAndMove(ltReg, tailCode) = let fun doMove (rtReg, code) = let val lengthReg2 = newUReg() and ltReg2 = newUReg() and rtReg2 = newUReg() in blockMove{ srcAddr=ltReg2, destAddr=rtReg2, length=lengthReg2, isByteMove=isByteMove } :: moveRegister{dest=rtReg2, source=rtReg} :: moveRegister{dest=ltReg2, source=ltReg} :: untagValue{dest=lengthReg2, source=lengthReg, isSigned=false, opSize=polyWordOpSize} :: code end in - loadAndStoreWithAbsolute (rightAddr, loadOp, doMove, tailCode) + loadAndStoreWithAbsolute (rightAddr, opWordSize loadOp, loadShift loadOp, doMove, tailCode) end in - returnUnit(destination, loadAndStoreWithAbsolute (leftAddr, loadOp, getDestAndMove, codeLength), false) + returnUnit(destination, loadAndStoreWithAbsolute (leftAddr, opWordSize loadOp, loadShift loadOp, getDestAndMove, codeLength), false) end | codeToICodeRev(BICBlockOperation{ kind=BlockOpEqualByte, sourceLeft, destRight, length }, context, _, destination, tailCode) = let val target = asTarget destination val ccRef = newCCRef() (* Compare bytes for equality. Need to get the start addresses and length into new registers because they will be modified. *) val (leftAddr, codeLft) = addressToPregAddress(sourceLeft, context, tailCode) val (rightAddr, codeRt) = addressToPregAddress(destRight, context, codeLft) val (codeLength, lengthReg, _) = codeToICodeRev(length, context, false, AnyReg, codeRt) (* This threads the calls through two calls to loadAndStoreWithAbsolute to compute the addresses. 
*) fun getRightAndCompare(ltReg, tailCode) = let fun doComparison (rtReg, code) = let val lengthReg2 = newUReg() and ltReg2 = newUReg() and rtReg2 = newUReg() in compareByteVectors{ vec1Addr=ltReg2, vec2Addr=rtReg2, length=lengthReg2, ccRef=ccRef } :: moveRegister{dest=rtReg2, source=rtReg} :: moveRegister{dest=ltReg2, source=ltReg} :: untagValue{dest=lengthReg2, source=lengthReg, isSigned=false, opSize=polyWordOpSize} :: code end in - loadAndStoreWithAbsolute (rightAddr, Load8, doComparison, tailCode) + loadAndStoreWithAbsolute (rightAddr, opWordSize Load8, loadShift Load8, doComparison, tailCode) end val testCode = - loadAndStoreWithAbsolute (leftAddr, Load8, getRightAndCompare, codeLength) + loadAndStoreWithAbsolute (leftAddr, opWordSize Load8, loadShift Load8, getRightAndCompare, codeLength) in (makeBoolResultRev(CondEqual, ccRef, target, testCode), target, false) end | codeToICodeRev(BICBlockOperation{ kind=BlockOpCompareByte, sourceLeft, destRight, length }, context, _, destination, tailCode) = let val target = asTarget destination val ccRef = newCCRef() (* Similar to OpEqualByte except it returns -1, 0, +1 depending on the condition code. *) (* Compare bytes for equality. Need to get the start addresses and length into new registers because they will be modified. *) val (leftAddr, codeLft) = addressToPregAddress(sourceLeft, context, tailCode) val (rightAddr, codeRt) = addressToPregAddress(destRight, context, codeLft) val (codeLength, lengthReg, _) = codeToICodeRev(length, context, false, AnyReg, codeRt) (* This threads the calls through two calls to loadAndStoreWithAbsolute to compute the addresses. *) fun getRightAndCompare(ltReg, tailCode) = let fun doComparison (rtReg, code) = let val lengthReg2 = newUReg() and ltReg2 = newUReg() and rtReg2 = newUReg() val exitLab = newLabel() and labGreater = newLabel() and labNotGreater = newLabel() and labLess = newLabel() and labNotLess = newLabel() val mergeResult = newMergeReg() val taggedMinus1 = if is32in64 then 0wxffffffff else 0wxffffffffffffffff in (* Compare the words then a series of comparisons to set the result. TODO; The old code-generator makes the "equal" exit of compareByteVectors jump directly to code to set the result to zero. It then uses loadNonAddress(X0, Word64.fromInt(tag 1)) followed by conditionalSetInverted{regD=X0, regTrue=X0, regFalse=XZero, cond=CondUnsignedHigher} to set the result to one or minus one. N.B. This needs to use a 32-bit operation on 32-in-64. 
*) moveRegister{dest=target, source=mergeResult} :: BlockLabel exitLab :: loadNonAddressConstant{source=taggedWord64 0w1, dest=mergeResult} :: BlockLabel labGreater :: BlockFlow(Unconditional exitLab) :: loadNonAddressConstant{source=taggedMinus1, dest=mergeResult} :: BlockLabel labLess :: BlockFlow(Unconditional exitLab) :: loadNonAddressConstant{source=taggedWord64 0w0, dest=mergeResult} :: BlockLabel labNotGreater :: BlockFlow(Conditional{ ccRef=ccRef, condition=CondUnsignedHigher, trueJump=labGreater, falseJump=labNotGreater }) :: BlockLabel labNotLess :: BlockFlow(Conditional{ ccRef=ccRef, condition=CondCarryClear, trueJump=labLess, falseJump=labNotLess }) :: compareByteVectors{ vec1Addr=ltReg2, vec2Addr=rtReg2, length=lengthReg2, ccRef=ccRef } :: moveRegister{dest=rtReg2, source=rtReg} :: moveRegister{dest=ltReg2, source=ltReg} :: untagValue{dest=lengthReg2, source=lengthReg, isSigned=false, opSize=polyWordOpSize} :: code end in - loadAndStoreWithAbsolute (rightAddr, Load8, doComparison, tailCode) + loadAndStoreWithAbsolute (rightAddr, opWordSize Load8, loadShift Load8, doComparison, tailCode) end val testCode = - loadAndStoreWithAbsolute (leftAddr, Load8, getRightAndCompare, codeLength) + loadAndStoreWithAbsolute (leftAddr, opWordSize Load8, loadShift Load8, getRightAndCompare, codeLength) in (testCode, target, false) end | codeToICodeRev(BICAllocateWordMemory {numWords, flags, initial }, context, _, destination, tailCode) = let (* Allocate a block of memory and initialise it. *) val target = asTarget destination val (codeSize, sizeReg, _) = codeToICodeRev(numWords, context, false, AnyReg, tailCode) val (codeFlags, flagsReg, _) = codeToICodeRev(flags, context, false, AnyReg, codeSize) val (codeInit, initReg, _) = codeToICodeRev(initial, context, false, AnyReg, codeFlags) val uSizeReg = newUReg() and shiftFReg = newUReg() and lengthWord = newUReg() val absAddr = if is32in64 then newUReg() else target val untagSize = untagValue{source=sizeReg, dest=uSizeReg, opSize=polyWordOpSize, isSigned=false} :: codeInit val allocateMem = allocateMemoryVariable{ size=uSizeReg, dest=absAddr, saveRegs=[]} :: untagSize (* Make the length word by first shifting the flags into the length word reg by 55 or 23 bits. This puts the tag bit in the top bit of the size. Then insert the size into this which will overwrite the flag's tag bit. *) val makeLengthWord = bitFieldInsert{ source=uSizeReg, destAsSource=shiftFReg, dest=lengthWord, length=polyWordOpSize, immr=0w0 (*bit 0*), imms=if is32in64 then 0w23 else 0w55 (*width-1*) } :: shiftConstant{direction=Arm64ICode.ShiftLeft, dest=shiftFReg, source=flagsReg, shift=if is32in64 then 0w23 else 0w55, opSize=polyWordOpSize } :: allocateMem val setLengthWordAndInit = initialiseMem{ size=uSizeReg, addr=absAddr, init=initReg} :: storeWithConstantOffset{ source=lengthWord, base=absAddr, byteOffset= ~(Word.toInt wordSize), loadType=polyWordLoadSize } :: makeLengthWord val finalCode = if is32in64 then absoluteToObjectIndex{ source=absAddr, dest=target } :: setLengthWordAndInit else setLengthWordAndInit in (finalCode, target, false) end | codeToICodeRev(BICHandle{exp, handler, exPacketAddr}, context as { stackPtr, loopArgs, overflowBlock, ... }, isTail, destination, tailCode) = let (* As with BICCond and BICCase we need to create a new register for the result in case we need to push it to the stack. 
*) val handleResult = newMergeReg() val handlerLab = newLabel() and startHandling = newLabel() val (bodyTarget, handlerTarget) = if isTail then (newPReg(), newPReg()) else (handleResult, handleResult) (* TODO: Even if we don't actually want a result we force one in here by using "asTarget". *) (* The expression cannot be treated as a tail because the handler has to be removed after. It may "exit" if it has raised an unconditional exception. If it has we mustn't generate a PopExceptionHandler because there won't be any result for resultReg. We need to add two words to the stack to account for the items pushed by PushExceptionHandler. We create an instruction to push the handler followed by a block fork to the start of the code and, potentially the handler, then a label to start the code that the handler is in effect for. *) val initialCode = BlockLabel startHandling :: BlockFlow(SetHandler{handler=handlerLab, continue=startHandling}) :: BlockSimple(PushExceptionHandler) :: tailCode val (expCode, _, expExit) = codeToICodeRev(exp, {stackPtr=stackPtr+2, loopArgs=loopArgs, currHandler=SOME handlerLab, overflowBlock=overflowBlock}, false (* Not tail *), SpecificPReg bodyTarget, initialCode) (* If this is the tail we can replace the jump at the end of the handled code with returns. If the handler has exited we don't need a return there. Otherwise we need to add an unconditional jump to skip the handler. *) val (atExpEnd, skipExpLabel) = case (isTail, expExit) of (true, true) => (* Tail and exited. *) (expCode, NONE) | (true, false) => (* Tail and not exited. *) (returnInstruction(context, bodyTarget, BlockSimple(PopExceptionHandler) :: expCode), NONE) | (false, true) => (* Not tail but exited. *) (expCode, NONE) | (false, false) => let val skipHandler = newLabel() in (BlockFlow(Unconditional skipHandler) :: BlockSimple(PopExceptionHandler) :: expCode, SOME skipHandler) end (* Make a register to hold the exception packet and put eax into it. *) val packetAddr = newPReg() val () = Array.update(locToPregArray, exPacketAddr, PregLocation packetAddr) val (handleCode, _, handleExit) = codeToICodeRev(handler, context, isTail, SpecificPReg handlerTarget, BlockSimple(BeginHandler{packetReg=packetAddr}) :: BlockLabel handlerLab :: atExpEnd) val target = asTarget destination val afterHandler = case (isTail, handleExit) of (true, true) => (* Tail and exited. *) handleCode | (true, false) => (* Tail and not exited. *) returnInstruction(context, handlerTarget, handleCode) | (false, _) => (* Not tail. *) handleCode val addLabel = case skipExpLabel of SOME lab => BlockLabel lab:: afterHandler | NONE => afterHandler in (moveRegister{source=handleResult, dest=target} :: addLabel, target, isTail) end and codeConditionRev(condition, context, jumpOn, jumpLabel, tailCode) = (* Jump optimisation is done later. Just generate the general case. Load the value into a register and compare it with 1 (true) *) let val ccRef = newCCRef() val (testCode, testReg, _) = codeToICodeRev(condition, context, false, AnyReg, tailCode) val noJumpLabel = newLabel() in BlockLabel noJumpLabel :: BlockFlow(Conditional{ccRef=ccRef, condition=if jumpOn then CondEqual else CondNotEqual, trueJump=jumpLabel, falseJump=noJumpLabel}) :: (* Compare: SUBS XZ,reg,3. Can use 32-bit comparison because it's either tagged 0 or tagged 1. 
*) addSubImmediate{source=testReg, immed=taggedWord 0w1, isAdd=false, dest=NONE, length=OpSize32, ccRef=SOME ccRef} :: testCode end and codeToICodeUnaryRev({oper=NotBoolean, arg1}, context, _, destination, tailCode) = let val target = asTarget destination val ccRef = newCCRef() val (argCode, testDest, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) in (* Test the argument and return a boolean result. If either the argument is a condition or the result is used in a test this will be better than using XOR. *) (makeBoolResultRev(CondNotEqual, ccRef, target, addSubImmediate{source=testDest, immed=taggedWord 0w1, isAdd=false, dest=NONE, length=OpSize32 (* Always either tagged 0 or tagged 1 *), ccRef=SOME ccRef} :: argCode), target, false) end | codeToICodeUnaryRev({oper=IsTaggedValue, arg1}, context, _, destination, tailCode) = let val target = asTarget destination val ccRef = newCCRef() val (argCode, testDest, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) in (* Test the argument and return a boolean result. This ought to be optimised at a lower level to use a test-and-branch. *) (makeBoolResultRev(CondNotEqual, ccRef, target, logicalImmediate{source=testDest, immed=0w1 (* The tag bit*), logOp=LogAnd, dest=NONE, length=OpSize32 (* Always either tagged 0 or tagged 1 *), ccRef=SOME ccRef} :: argCode), target, false) end | codeToICodeUnaryRev({oper=MemoryCellLength, arg1}, context, _, destination, tailCode) = let val ureg1 = newUReg() and ureg2 = newUReg() val (codeBase, baseReg, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) (* Load the word at -1 (words) into a ureg *) val (codeLoad, _, _) = wordAddressOffset(SpecificPReg ureg1, baseReg, ~1, polyWordLoadSize, codeBase) (* Select 56 or 24 bits and shift it left. This disassembles as UBFIZ..*) val lsb = 0w1 and width = if is32in64 then 0w24 else 0w56 (* Encoding for unsignedBitfieldInsertinZeros64/32 *) val immr = if is32in64 then Word.~ lsb mod 0w32 else Word.~ lsb mod 0w64 val imms = width-0w1 val maskAndShift = bitFieldShift{source=ureg1, dest=ureg2, isSigned=false, length=polyWordOpSize, immr=immr, imms=imms} :: codeLoad val target = asTarget destination val addTag = addSubImmediate{dest=SOME target, source=ureg2, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=true} :: maskAndShift in (addTag, target, false) end | codeToICodeUnaryRev({oper=MemoryCellFlags, arg1}, context, _, destination, tailCode) = let (* Load the flags byte and tag it. *) val (codeBase, baseReg, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (codeRealBase, realBaseReg) = getAbsoluteAddress(codeBase, baseReg) val ureg = newUReg() val codeLoad = loadWithConstantOffset{ base=realBaseReg, dest=ureg, byteOffset=flagsByteOffset, loadType=Load8 } :: codeRealBase val target = asTarget destination val withTag = tagValue{ source=ureg, dest=target, isSigned=false, opSize=OpSize32 } :: codeLoad in (withTag, target, false) end | codeToICodeUnaryRev({oper=ClearMutableFlag, arg1}, context, _, destination, tailCode) = let val (codeBase, baseReg, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (codeRealBase, realBaseReg) = getAbsoluteAddress(codeBase, baseReg) val ureg1 = newUReg() and ureg2 = newUReg() (* Load the flags, mask off the mutable bit and store it back. 
*) val code = storeWithConstantOffset{ base=realBaseReg, source=ureg2, byteOffset=flagsByteOffset, loadType=Load8 } :: logicalImmediate{ source=ureg1, dest=SOME ureg2, ccRef=NONE, immed=Word64.xorb(0wxffffffff, 0wx40), logOp=LogAnd, length=OpSize32 } :: loadWithConstantOffset{ base=realBaseReg, dest=ureg1, byteOffset=flagsByteOffset, loadType=Load8 } :: codeRealBase in returnUnit(destination, code, false) end | codeToICodeUnaryRev({oper=LongWordToTagged, arg1}, context, _, destination, tailCode) = let val (codeBase, baseReg, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val uReg = newUReg() val target = asTarget destination val code = tagValue{ source=uReg, dest=target, isSigned=false, opSize=polyWordOpSize } :: unboxLarge{ source=baseReg, dest=uReg } :: codeBase in (code, target, false) end | codeToICodeUnaryRev({oper=SignedToLongWord, arg1}, context, _, destination, tailCode) = let val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val uReg = newUReg() val target = asTarget destination (* We can use a single instruction here on both 32-in-64 and native 64-bits. On 64-bits this is equivalent to an arithmetic shift; on 32-bits it propagates the sign bit into the high-order part. *) val code = boxLarge{ source=uReg, dest=target, saveRegs=[] } :: bitFieldShift{ source=aReg1, dest=uReg, isSigned=true, length=OpSize64, immr=0w1, imms=if is32in64 then 0wx1f else 0wx3f } :: arg1Code in (code, target, false) end | codeToICodeUnaryRev({oper=UnsignedToLongWord, arg1}, context, _, destination, tailCode) = let val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val uReg = newUReg() val target = asTarget destination (* This amounts to a logical shift. Since the top half of the register is zero in 32-in-64 we don't have to select just the low word but there's no advantage in not. *) val code = boxLarge{ source=uReg, dest=target, saveRegs=[] } :: bitFieldShift{ source=aReg1, dest=uReg, isSigned=false, length=OpSize64, immr=0w1, imms=if is32in64 then 0wx1f else 0wx3f } :: arg1Code in (code, target, false) end | codeToICodeUnaryRev({oper=RealAbs precision, arg1}, context, _, destination, tailCode) = let val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val uReg1 = newUReg() and uReg2 = newUReg() val target = asTarget destination val fpSize = precisionToFpSize precision val fpOp = case precision of PrecSingle => AbsFloat | PrecDouble => AbsDouble val code = boxTagFloat{ floatSize=fpSize, source=uReg2, dest=target, saveRegs=[] } :: unaryFloatingPt{ source=uReg1, dest=uReg2, fpOp=fpOp } :: unboxTagFloat{ floatSize=fpSize, source=aReg1, dest=uReg1 } :: arg1Code in (code, target, false) end | codeToICodeUnaryRev({oper=RealNeg precision, arg1}, context, _, destination, tailCode) = let val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val uReg1 = newUReg() and uReg2 = newUReg() val target = asTarget destination val fpSize = precisionToFpSize precision val fpOp = case precision of PrecSingle => NegFloat | PrecDouble => NegDouble val code = boxTagFloat{ floatSize=fpSize, source=uReg2, dest=target, saveRegs=[] } :: unaryFloatingPt{ source=uReg1, dest=uReg2, fpOp=fpOp } :: unboxTagFloat{ floatSize=fpSize, source=aReg1, dest=uReg1 } :: arg1Code in (code, target, false) end | codeToICodeUnaryRev({oper=RealFixedInt precision, arg1}, context, _, destination, tailCode) = let (* Convert a tagged integer (FixedInt.int) to float or double. 
*) val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val uReg1 = newUReg() and uReg2 = newUReg() val target = asTarget destination val fpSize = precisionToFpSize precision val code = boxTagFloat{ floatSize=fpSize, source=uReg2, dest=target, saveRegs=[] } :: convertIntToFloat{ source=uReg1, dest=uReg2, srcSize=polyWordOpSize, destSize=fpSize } :: untagValue{ source=aReg1, dest=uReg1, opSize=polyWordOpSize, isSigned=true } :: arg1Code in (code, target, false) end | codeToICodeUnaryRev({oper=FloatToDouble, arg1}, context, _, destination, tailCode) = let val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val uReg1 = newUReg() and uReg2 = newUReg() val target = asTarget destination val code = boxTagFloat{ floatSize=Double64, source=uReg2, dest=target, saveRegs=[] } :: unaryFloatingPt{ source=uReg1, dest=uReg2, fpOp=ConvFloatToDble } :: unboxTagFloat{ floatSize=Float32, source=aReg1, dest=uReg1 } :: arg1Code in (code, target, false) end | codeToICodeUnaryRev({oper=DoubleToFloat, arg1}, context, _, destination, tailCode) = let val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val uReg1 = newUReg() and uReg2 = newUReg() val target = asTarget destination val code = boxTagFloat{ floatSize=Float32, source=uReg2, dest=target, saveRegs=[] } :: unaryFloatingPt{ source=uReg1, dest=uReg2, fpOp=ConvDbleToFloat } :: unboxTagFloat{ floatSize=Double64, source=aReg1, dest=uReg1 } :: arg1Code in (code, target, false) end | codeToICodeUnaryRev({oper=RealToInt(precision, rounding), arg1}, context, _, destination, tailCode) = let (* Convert a float or double to a tagged int. We could get an overflow in either the conversion to integer or in the conversion to a tagged value. Fortunately if the conversion detects an overflow it sets the result to a value that will cause an overflow in the addition. *) val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val target = asTarget destination val chkOverflow = newCCRef() val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() val fpSize = precisionToFpSize precision val code = (* Set the tag bit. *) addSubImmediate{dest=SOME target, source=uReg3, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=true} :: checkOverflow(CondOverflow, context, chkOverflow) @ (* Add it to itself and set the condition code. *) addSubRegister{base=uReg2, shifted=uReg2, dest=SOME uReg3, length=polyWordOpSize, ccRef=SOME chkOverflow, isAdd=true, shift=ShiftNone} :: convertFloatToInt{ source=uReg1, dest=uReg2, srcSize=fpSize, destSize=polyWordOpSize, rounding=rounding } :: unboxTagFloat{ floatSize=fpSize, source=aReg1, dest=uReg1 } :: arg1Code in (code, target, false) end | codeToICodeUnaryRev({oper=TouchAddress, arg1}, context, _, destination, tailCode) = let val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) in returnUnit(destination, touchValue{source=aReg1} :: arg1Code, false) end | codeToICodeUnaryRev({oper=AllocCStack, arg1}, context, _, destination, tailCode) = let (* Allocate space on the stack. The higher levels have already aligned the size to a multiple of 16. The number of bytes to allocate is a Word.word value. The result is a boxed large word. 
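(* The sequence generated is: untag the byte count, subtract it from the C stack pointer with addSubXSP (isAdd=false), and box the new stack pointer as the large-word result. *)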
*) val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val uReg1 = newUReg() and uReg2 = newUReg() val target = asTarget destination val code = boxLarge{ source=uReg2, dest=target, saveRegs=[] } :: addSubXSP{ source=uReg1, dest=SomeReg uReg2, isAdd=false } :: untagValue{ source=aReg1, dest=uReg1, isSigned=false, opSize=polyWordOpSize } :: arg1Code in (code, target, false) end | codeToICodeUnaryRev({oper=LockMutex, arg1}, context, _, destination, tailCode) = (* The earliest versions of the Arm8 do not have the LDADD instruction which will do this directly. To preserve compatibility we use LDAXR/STLXR which require a loop. *) let local val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) in val (baseCode, baseReg) = getAbsoluteAddress(arg1Code, aReg1) end val loopLabel = newLabel() and noLoopLabel = newLabel() val target = asTarget destination val ccRef1 = newCCRef() and ccRef2 = newCCRef() val uRegNew = newUReg() and uRegTest = newUReg() and uRegOld = newUReg() (* N.B. in reverse order. *) val code = (* The result is true if the old value was zero. *) addSubImmediate{source=uRegOld, dest=NONE, immed=0w0, isAdd=false, length=OpSize64, ccRef=SOME ccRef2} :: memoryBarrier :: (* Put in the memory barrier. *) (* If the result is zero we've been successful otherwise we loop. *) BlockLabel noLoopLabel :: BlockFlow(Conditional{ ccRef=ccRef1, condition=CondNotEqual, trueJump=loopLabel, falseJump=noLoopLabel }) :: addSubImmediate{source=uRegTest, dest=NONE, immed=0w0, isAdd=false, length=OpSize32, ccRef=SOME ccRef1} :: (* Add and try to store the result *) storeReleaseExclusive{ base=baseReg, source=SomeReg uRegNew, result=uRegTest } :: addSubImmediate{source=uRegOld, dest=SOME uRegNew, immed=0w1, isAdd=true, length=OpSize64, ccRef=NONE} :: loadAcquireExclusive{ base=baseReg, dest=uRegOld } :: BlockLabel loopLabel :: baseCode in (makeBoolResultRev(CondEqual, ccRef2, target, code), target, false) end | codeToICodeUnaryRev({oper=TryLockMutex, arg1}, context, _, destination, tailCode) = (* *) let (* Could use LDUMAXAL to set it the greater of the current value or 1. *) local val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) in val (baseCode, baseReg) = getAbsoluteAddress(arg1Code, aReg1) end val loopLabel = newLabel() and noLoopLabel = newLabel() and okLabel = newLabel() val target = asTarget destination val ccRef0 = newCCRef() and ccRef1 = newCCRef() and ccRef2 = newCCRef() val uRegNew = newUReg() and uRegTest = newUReg() and uRegOld = newUReg() val code = (* The result is true if the old value was zero. *) addSubImmediate{source=uRegOld, dest=NONE, immed=0w0, isAdd=false, length=OpSize64, ccRef=SOME ccRef2} :: memoryBarrier :: (* Put in the memory barrier. *) (* If the result is zero we've been successful otherwise we loop. *) BlockLabel noLoopLabel :: BlockFlow(Conditional{ ccRef=ccRef1, condition=CondNotEqual, trueJump=loopLabel, falseJump=noLoopLabel }) :: addSubImmediate{source=uRegTest, dest=NONE, immed=0w0, isAdd=false, length=OpSize32, ccRef=SOME ccRef1} :: (* If the lock wasn't taken set it to one to lock it. *) storeReleaseExclusive{ base=baseReg, source=SomeReg uRegNew, result=uRegTest } :: loadNonAddressConstant{source=0w1, dest=uRegNew } :: BlockLabel okLabel :: (* If it's not zero don't try to store anything back and exit the loop. 
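(* That test is on the old value of the mutex. Separately, as in LockMutex above, storeReleaseExclusive delivers a status value in uRegTest: zero if the store-exclusive succeeded and non-zero if the exclusive monitor was lost, and a non-zero status branches back to loopLabel to retry the whole load/store sequence. *)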
*) BlockFlow(Conditional{ ccRef=ccRef0, condition=CondNotEqual, trueJump=noLoopLabel, falseJump=okLabel }) :: addSubImmediate{source=uRegOld, dest=NONE, immed=0w0, isAdd=false, length=OpSize64, ccRef=SOME ccRef0} :: (* Get the old value and see if it's zero i.e. unlocked. *) loadAcquireExclusive{ base=baseReg, dest=uRegOld } :: BlockLabel loopLabel :: baseCode in (makeBoolResultRev(CondEqual, ccRef2, target, code), target, false) end | codeToICodeUnaryRev({oper=UnlockMutex, arg1}, context, _, destination, tailCode) = (* Get the previous value of the mutex to see if another thread had tried to lock it and set the result to zero. *) let (* Could use SWAPAL *) local val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) in val (baseCode, baseReg) = getAbsoluteAddress(arg1Code, aReg1) end val loopLabel = newLabel() and noLoopLabel = newLabel() val target = asTarget destination val ccRef1 = newCCRef() and ccRef2 = newCCRef() val uRegTest = newUReg() and uRegOld = newUReg() val code = (* The result is true if the old value was one. i.e. we were the only thread that locked it. *) addSubImmediate{source=uRegOld, dest=NONE, immed=0w1, isAdd=false, length=OpSize64, ccRef=SOME ccRef2} :: memoryBarrier :: (* Put in the memory barrier. *) (* If the result is zero we've been successful otherwise we loop. *) BlockLabel noLoopLabel :: BlockFlow(Conditional{ ccRef=ccRef1, condition=CondNotEqual, trueJump=loopLabel, falseJump=noLoopLabel }) :: addSubImmediate{source=uRegTest, dest=NONE, immed=0w0, isAdd=false, length=OpSize32, ccRef=SOME ccRef1} :: (* Try to set this to zero *) storeReleaseExclusive{ base=baseReg, source=ZeroReg, result=uRegTest } :: loadAcquireExclusive{ base=baseReg, dest=uRegOld } :: BlockLabel loopLabel :: baseCode in (makeBoolResultRev(CondEqual, ccRef2, target, code), target, false) end and codeToICodeBinaryRev({oper=WordComparison{test, isSigned}, arg1, arg2}, context, _, destination, tailCode) = let (* Comparisons. This is now only used for tagged values, not for pointer equality. *) val ccRef = newCCRef() val (testCode1, testDest1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (testCode2, testDest2, _) = codeToICodeRev(arg2, context, false, AnyReg, testCode1) val comparison = addSubRegister{base=testDest1, shifted=testDest2, dest=NONE, length=polyWordOpSize, ccRef=SOME ccRef, isAdd=false, shift=ShiftNone} :: testCode2 val target = asTarget destination open BuiltIns val cond = case (test, isSigned) of (TestEqual, _) => CondEqual | (TestLess, true) => CondSignedLess | (TestLessEqual, true) => CondSignedLessEq | (TestGreater, true) => CondSignedGreater | (TestGreaterEqual, true) => CondSignedGreaterEq | (TestLess, false) => CondCarryClear | (TestLessEqual, false) => CondUnsignedLowOrEq | (TestGreater, false) => CondUnsignedHigher | (TestGreaterEqual, false) => CondCarrySet | (TestUnordered, _) => raise InternalError "WordComparison: TestUnordered" in (makeBoolResultRev(cond, ccRef, target, comparison), target, false) end | codeToICodeBinaryRev({oper=FixedPrecisionArith ArithAdd, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) (* We need to subtract the tag from one of the arguments and then do the addition. The optimiser will do the subtraction at compile time if we subtract from a constant so try to put the constant in the second arg. 
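(* With the 2*n+1 representation the identity is (2a+1) + ((2b+1) - 1) = 2(a+b) + 1, so subtracting the tag from one operand before the addition yields a correctly tagged sum with no further adjustment. *)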
*) val (firstReg, secondReg) = case arg1 of BICConstnt _ => (aReg2, aReg1) | _ => (aReg1, aReg2) val uReg = newUReg() val chkOverflow = newCCRef() val code = checkOverflow(CondOverflow, context, chkOverflow) @ addSubRegister{base=firstReg, shifted=uReg, dest=SOME target, length=polyWordOpSize, ccRef=SOME chkOverflow, isAdd=true, shift=ShiftNone} :: addSubImmediate{dest=SOME uReg, source=secondReg, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false}:: arg2Code in (code , target, false) end | codeToICodeBinaryRev({oper=FixedPrecisionArith ArithSub, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg = newUReg() val chkOverflow = newCCRef() val code = checkOverflow(CondOverflow, context, chkOverflow) @ addSubRegister{base=aReg1, shifted=uReg, dest=SOME target, length=polyWordOpSize, ccRef=SOME chkOverflow, isAdd=false, shift=ShiftNone} :: addSubImmediate{dest=SOME uReg, source=aReg2, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false}:: arg2Code in (code , target, false) end | codeToICodeBinaryRev({oper=FixedPrecisionArith ArithMult, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() and uReg4 = newUReg() val chkOverflow = newCCRef() (* Untag one argument. subtract the tag from the second, multiply and add back the tag. *) val multiplyCode = addSubImmediate{dest=SOME target, source=uReg3, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=true} :: multiplication{kind=if is32in64 then SignedMultAddLong else MultAdd64, dest=uReg3, sourceA=NONE, sourceM=uReg1, sourceN=uReg2} :: addSubImmediate{dest=SOME uReg2, source=aReg2, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false} :: untagValue{ source=aReg1, dest=uReg1, isSigned=true, opSize=polyWordOpSize } :: arg2Code (* Overflow check: The condition for overflow is that the high order part (64-bits in native 64-bits, 32-bits in 32-in-64) must be zero if the result is positive and all ones if the result is negative. The high-order part is in uReg3 in 32-in-64 since we've already used SignedMultAddLong but in native 64-bits we need to use SignedMultHigh to get the high order part. In both cases we can use a comparison with ShiftASR to give a value containing just the sign of the result. *) val checkOverflowCode = if is32in64 then addSubRegister{ base=uReg4, shifted=target, dest=NONE, ccRef=SOME chkOverflow, isAdd=false, length=OpSize32, shift=ShiftASR 0w31 } :: shiftConstant{direction=Arm64ICode.ShiftRightArithmetic, source=uReg3, dest=uReg4, shift=0w32, opSize=OpSize64 (* Have to start with 64-bits *)} :: multiplyCode else addSubRegister{ base=uReg4, shifted=target, dest=NONE, ccRef=SOME chkOverflow, isAdd=false, length=OpSize64, shift=ShiftASR 0w63 } :: multiplication{kind=SignedMultHigh, dest=uReg4, sourceA=NONE, sourceM=uReg1, sourceN=uReg2} :: multiplyCode val code = checkOverflow(CondNotEqual, context, chkOverflow) @ checkOverflowCode in (code, target, false) end | codeToICodeBinaryRev({oper=FixedPrecisionArith ArithQuot, arg1, arg2}, context, _, destination, tailCode) = let (* The word version avoids an extra shift. 
Don't do that here at least for the moment. Division by zero and overflow are checked for at the higher level. *) val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() val code = tagValue { source=uReg3, dest=target, opSize=polyWordOpSize, isSigned=true } :: division{isSigned=true, opSize=polyWordOpSize, dest=uReg3, dividend=uReg1, divisor=uReg2} :: untagValue{ source=aReg2, dest=uReg2, isSigned=true, opSize=polyWordOpSize } :: untagValue{ source=aReg1, dest=uReg1, isSigned=true, opSize=polyWordOpSize } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=FixedPrecisionArith ArithRem, arg1, arg2}, context, _, destination, tailCode) = let (* For the moment we remove the tags and then retag afterwards. The word version avoids this but at least for the moment we do it the longer way. *) (* There's no direct way to get the remainder - have to use divide and multiply. *) val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() and uReg4 = newUReg() val code = tagValue { source=uReg4, dest=target, opSize=polyWordOpSize, isSigned=true } :: multiplication{kind=if is32in64 then MultSub32 else MultSub64, dest=uReg4, sourceM=uReg3, sourceN=uReg2, sourceA=SOME uReg1} :: division{isSigned=true, opSize=polyWordOpSize, dest=uReg3, dividend=uReg1, divisor=uReg2} :: untagValue{ source=aReg2, dest=uReg2, isSigned=true, opSize=polyWordOpSize } :: untagValue{ source=aReg1, dest=uReg1, isSigned=true, opSize=polyWordOpSize } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=FixedPrecisionArith ArithDiv, ...}, _, _, _, _) = raise InternalError "unimplemented operation: FixedPrecisionArith ArithDiv" | codeToICodeBinaryRev({oper=FixedPrecisionArith ArithMod, ...}, _, _, _, _) = raise InternalError "unimplemented operation: FixedPrecisionArith ArithMod" | codeToICodeBinaryRev({oper=WordArith ArithAdd, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) (* We need to subtract the tag from one of the arguments and then do the addition. The optimiser will do the subtraction at compile time if we subtract from a constant so try to put the constant in the second arg. 
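(* This is the same sequence as the FixedPrecisionArith ArithAdd case above except that no overflow check is generated, since Word.word arithmetic is modular. *)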
*) val (firstReg, secondReg) = case arg1 of BICConstnt _ => (aReg2, aReg1) | _ => (aReg1, aReg2) val uReg = newUReg() val code = addSubRegister{base=firstReg, shifted=uReg, dest=SOME target, length=polyWordOpSize, ccRef=NONE, isAdd=true, shift=ShiftNone} :: addSubImmediate{dest=SOME uReg, source=secondReg, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false}:: arg2Code in (code , target, false) end | codeToICodeBinaryRev({oper=WordArith ArithSub, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg = newUReg() (* TODO: If the first argument is a constant we could add one to that rather than subtracting one from the second argument. We're not concerned with overflow. *) val code = addSubRegister{base=aReg1, shifted=uReg, dest=SOME target, length=polyWordOpSize, ccRef=NONE, isAdd=false, shift=ShiftNone} :: addSubImmediate{dest=SOME uReg, source=aReg2, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false}:: arg2Code in (code , target, false) end | codeToICodeBinaryRev({oper=WordArith ArithMult, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() (* Untag one argument. subtract the tag from the second, multiply and add back the tag. *) val code = addSubImmediate{dest=SOME target, source=uReg3, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=true} :: multiplication{kind=if is32in64 then MultAdd32 else MultAdd64, dest=uReg3, sourceA=NONE, sourceM=uReg1, sourceN=uReg2} :: addSubImmediate{dest=SOME uReg2, source=aReg2, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false} :: untagValue{ source=aReg1, dest=uReg1, isSigned=false, opSize=polyWordOpSize } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=WordArith ArithDiv, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() (* Untag the divisor (into uReg2). subtract the tag from the dividend (into uReg1), divide and or in the tag. The tag may have been set already depending on the result of the division. *) val code = logicalImmediate{dest=SOME target, source=uReg3, immed=0w1, length=polyWordOpSize, ccRef=NONE, logOp=LogOr} :: division{isSigned=false, opSize=polyWordOpSize, dest=uReg3, dividend=uReg1, divisor=uReg2} :: addSubImmediate{dest=SOME uReg1, source=aReg1, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false} :: untagValue{ source=aReg2, dest=uReg2, isSigned=false, opSize=polyWordOpSize } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=WordArith ArithMod, arg1, arg2}, context, _, destination, tailCode) = let (* There's no direct way to get the remainder - have to use divide and multiply. 
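(* Writing the dividend as a and the untagged divisor as d: subtracting the tag gives 2a, and dividing 2a by d yields either 2*(a div d) or 2*(a div d)+1. For ArithDiv above it is therefore enough to OR in the tag bit; here the bottom bit is cleared instead, so that multiplying by d and subtracting the product from the tagged dividend 2a+1 leaves 2*(a mod d)+1, which is already tagged. *)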
*) val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() and uReg4 = newUReg() (* Untag the divisor (into uReg2) and subtract the tag from the dividend (into uReg1). Divide, clear the bottom bit of the quotient, then multiply by the divisor and subtract the product from the original, tagged dividend, leaving the tagged remainder. *) val tagBitMask = Word64.<<(Word64.fromInt ~1, 0w1) (* Requires a 64-bit AND. *) val code = (* Multiply the result of the division by the divisor and subtract this from the original, tagged dividend. This leaves us a tagged value so it can go straight into the result. *) multiplication{kind=if is32in64 then MultSub32 else MultSub64, dest=target, sourceM=uReg4, sourceN=uReg2, sourceA=SOME aReg1} :: (* Clear the bottom bit before the multiplication. *) logicalImmediate{dest=SOME uReg4, source=uReg3, immed=tagBitMask, length=OpSize64, ccRef=NONE, logOp=LogAnd} :: division{isSigned=false, opSize=polyWordOpSize, dest=uReg3, dividend=uReg1, divisor=uReg2} :: addSubImmediate{dest=SOME uReg1, source=aReg1, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false} :: untagValue{ source=aReg2, dest=uReg2, isSigned=false, opSize=polyWordOpSize } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=WordArith _, ...}, _, _, _, _) = raise InternalError "WordArith - unimplemented instruction" | codeToICodeBinaryRev({oper=WordLogical LogicalAnd, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) (* Since both values are tagged the tag will be preserved. *) val code = logicalRegister{base=aReg1, shifted=aReg2, dest=SOME target, length=polyWordOpSize, ccRef=NONE, logOp=LogAnd, shift=ShiftNone} :: arg2Code in (code , target, false) end | codeToICodeBinaryRev({oper=WordLogical LogicalOr, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) (* Since both values are tagged the tag will be preserved. *) val code = logicalRegister{base=aReg1, shifted=aReg2, dest=SOME target, length=polyWordOpSize, ccRef=NONE, logOp=LogOr, shift=ShiftNone} :: arg2Code in (code , target, false) end | codeToICodeBinaryRev({oper=WordLogical LogicalXor, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) (* If we just XOR the values together the tag bit in the result will be zero. It's better to remove one of the tag bits beforehand. As with Add, we try to choose a constant.
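(* With one tag removed the identity is (2a+1) XOR 2b = 2(a XOR b) + 1, since the second operand now has a zero bottom bit. *)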
*) val (firstReg, secondReg) = case arg1 of BICConstnt _ => (aReg2, aReg1) | _ => (aReg1, aReg2) val uReg = newUReg() val code = logicalRegister{base=firstReg, shifted=uReg, dest=SOME target, length=polyWordOpSize, ccRef=NONE, logOp=LogXor, shift=ShiftNone} :: addSubImmediate{dest=SOME uReg, source=secondReg, immed=0w1, length=polyWordOpSize, ccRef=NONE, isAdd=false}:: arg2Code in (code , target, false) end | codeToICodeBinaryRev({oper=WordShift ShiftLeft, arg1, arg2}, context, _, destination, tailCode) = let val ureg1 = newUReg() and ureg2 = newUReg() and ureg3 = newUReg() val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val code = (* Put back the tag. *) logicalImmediate{ source=ureg3, dest=SOME target, ccRef=NONE, immed=0w1, logOp=LogOr, length=polyWordOpSize } :: (* Do the shift *) shiftRegister{direction=Arm64ICode.ShiftLeft, dest=ureg3, source=ureg1, shift=ureg2, opSize=polyWordOpSize} :: (* Untag the shift amount. Since it's at most 64 we can use a 32-bit operation. *) untagValue{source=aReg2, dest=ureg2, opSize=OpSize32, isSigned=false} :: (* Remove tag bit from the value we're shifting. *) logicalImmediate{ source=aReg1, dest=SOME ureg1, ccRef=NONE, immed=polyWordTagBitMask, logOp=LogAnd, length=polyWordOpSize } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=WordShift ShiftRightLogical, arg1, arg2}, context, _, destination, tailCode) = let val ureg1 = newUReg() and ureg2 = newUReg() val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val code = (* Put back the tag. *) logicalImmediate{ source=ureg2, dest=SOME target, ccRef=NONE, immed=0w1, logOp=LogOr, length=polyWordOpSize } :: (* Do the shift *) shiftRegister{direction=Arm64ICode.ShiftRightLogical, dest=ureg2, source=aReg1, shift=ureg1, opSize=polyWordOpSize} :: (* Untag the shift amount. Since it's at most 64 we can use a 32-bit operation. *) untagValue{source=aReg2, dest=ureg1, opSize=OpSize32, isSigned=false} :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=WordShift ShiftRightArithmetic, arg1, arg2}, context, _, destination, tailCode) = let val ureg1 = newUReg() and ureg2 = newUReg() val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val code = (* Put back the tag. *) logicalImmediate{ source=ureg2, dest=SOME target, ccRef=NONE, immed=0w1, logOp=LogOr, length=polyWordOpSize } :: (* Do the shift *) shiftRegister{direction=Arm64ICode.ShiftRightArithmetic, dest=ureg2, source=aReg1, shift=ureg1, opSize=polyWordOpSize} :: (* Untag the shift amount. Since it's at most 64 we can use a 32-bit operation. *) untagValue{source=aReg2, dest=ureg1, opSize=OpSize32, isSigned=false} :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=AllocateByteMemory, arg1, arg2}, context, _, destination, tailCode) = let (* Allocate a block of memory and without initialisation. If the flags include the "bytes" bit the GC won't look at it so it doesn't matter that it's not initialised. This is identical to AllocateWordMemory apart from the lack of initialisation. 
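(* For example, in native 64-bit mode the flags byte F is held tagged as 2*F+1; shifting it left by 55 puts F into bits 56-63 and its tag bit into bit 55, and the bitFieldInsert of the 56-bit untagged length then overwrites bits 0-55, giving the header of flags byte followed by length. In 32-in-64 the same applies with a 24-bit length field and a shift of 23. *)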
*) val target = asTarget destination val (codeSize, sizeReg, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (codeFlags, flagsReg, _) = codeToICodeRev(arg2, context, false, AnyReg, codeSize) val uSizeReg = newUReg() and shiftFReg = newUReg() and lengthWord = newUReg() val absAddr = if is32in64 then newUReg() else target val untagSize = untagValue{source=sizeReg, dest=uSizeReg, opSize=polyWordOpSize, isSigned=false} :: codeFlags val allocateMem = allocateMemoryVariable{ size=uSizeReg, dest=absAddr, saveRegs=[]} :: untagSize (* Make the length word by first shifting the flags into the length word reg by 55 or 23 bits. This puts the tag bit in the top bit of the size. Then insert the size into this which will overwrite the flag's tag bit. *) val makeLengthWord = bitFieldInsert{ source=uSizeReg, destAsSource=shiftFReg, dest=lengthWord, length=polyWordOpSize, immr=0w0 (*bit 0*), imms=if is32in64 then 0w23 else 0w55 (*width-1*) } :: shiftConstant{direction=Arm64ICode.ShiftLeft, dest=shiftFReg, source=flagsReg, shift=if is32in64 then 0w23 else 0w55, opSize=polyWordOpSize } :: allocateMem val setLengthWordAndInit = storeWithConstantOffset{ source=lengthWord, base=absAddr, byteOffset= ~(Word.toInt wordSize), loadType=polyWordLoadSize } :: makeLengthWord val finalCode = if is32in64 then absoluteToObjectIndex{ source=absAddr, dest=target } :: setLengthWordAndInit else setLengthWordAndInit in (finalCode, target, false) end | codeToICodeBinaryRev({oper=LargeWordComparison test, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val ccRef = newCCRef() val (testCode1, testDest1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (testCode2, testDest2, _) = codeToICodeRev(arg2, context, false, AnyReg, testCode1) val uReg1 = newUReg() and uReg2 = newUReg() val comparison = addSubRegister{base=uReg1, shifted=uReg2, dest=NONE, length=OpSize64, ccRef=SOME ccRef, isAdd=false, shift=ShiftNone} :: unboxLarge{ source=testDest2, dest=uReg2 } :: unboxLarge{ source=testDest1, dest=uReg1 } :: testCode2 open BuiltIns val cond = case test of TestEqual => CondEqual | TestLess => CondCarryClear | TestLessEqual => CondUnsignedLowOrEq | TestGreater => CondUnsignedHigher | TestGreaterEqual => CondCarrySet | TestUnordered => raise InternalError "LargeWordComparison: TestUnordered" in (makeBoolResultRev(cond, ccRef, target, comparison), target, false) end | codeToICodeBinaryRev({oper=LargeWordArith ArithAdd, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() val code = boxLarge{ source=uReg3, dest=target, saveRegs=[] } :: addSubRegister{base=uReg1, shifted=uReg2, dest=SOME uReg3, length=OpSize64, ccRef=NONE, isAdd=true, shift=ShiftNone} :: unboxLarge{ source=aReg2, dest=uReg2 } :: unboxLarge{ source=aReg1, dest=uReg1 } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=LargeWordArith ArithSub, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() val code = boxLarge{ source=uReg3, dest=target, saveRegs=[] } :: 
addSubRegister{base=uReg1, shifted=uReg2, dest=SOME uReg3, length=OpSize64, ccRef=NONE, isAdd=false, shift=ShiftNone} :: unboxLarge{ source=aReg2, dest=uReg2 } :: unboxLarge{ source=aReg1, dest=uReg1 } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=LargeWordArith ArithMult, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() val code = boxLarge{ source=uReg3, dest=target, saveRegs=[] } :: multiplication{kind=MultAdd64, sourceA=NONE, sourceM=uReg1, sourceN=uReg2, dest=uReg3} :: unboxLarge{ source=aReg2, dest=uReg2 } :: unboxLarge{ source=aReg1, dest=uReg1 } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=LargeWordArith ArithDiv, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() val code = boxLarge{ source=uReg3, dest=target, saveRegs=[] } :: division{isSigned=false, opSize=OpSize64, dividend=uReg1, divisor=uReg2, dest=uReg3} :: unboxLarge{ source=aReg2, dest=uReg2 } :: unboxLarge{ source=aReg1, dest=uReg1 } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=LargeWordArith ArithMod, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() and uReg4 = newUReg() val code = boxLarge{ source=uReg4, dest=target, saveRegs=[] } :: multiplication{kind=MultSub64, dest=uReg4, sourceM=uReg3, sourceN=uReg2, sourceA=SOME uReg1} :: division{isSigned=false, opSize=OpSize64, dividend=uReg1, divisor=uReg2, dest=uReg3} :: unboxLarge{ source=aReg2, dest=uReg2 } :: unboxLarge{ source=aReg1, dest=uReg1 } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=LargeWordArith _, ...}, _, _, _, _) = raise InternalError "LargeWordArith - unimplemented instruction" | codeToICodeBinaryRev({oper=LargeWordLogical logop, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() val logicalOp = case logop of LogicalAnd => LogAnd | LogicalOr => LogOr | LogicalXor => LogXor val code = boxLarge{ source=uReg3, dest=target, saveRegs=[] } :: logicalRegister{base=uReg1, shifted=uReg2, dest=SOME uReg3, length=OpSize64, ccRef=NONE, logOp=logicalOp, shift=ShiftNone} :: unboxLarge{ source=aReg2, dest=uReg2 } :: unboxLarge{ source=aReg1, dest=uReg1 } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=LargeWordShift shiftKind, arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg1 = newUReg() and uReg2 
= newUReg() and uReg3 = newUReg() val shiftType = case shiftKind of ShiftLeft => Arm64ICode.ShiftLeft | ShiftRightLogical => Arm64ICode.ShiftRightLogical | ShiftRightArithmetic => Arm64ICode.ShiftRightArithmetic val code = boxLarge{ source=uReg3, dest=target, saveRegs=[] } :: shiftRegister{direction=shiftType, source=uReg1, shift=uReg2, dest=uReg3, opSize=OpSize64 } :: (* The shift amount is a word, not a large word. *) untagValue{ source=aReg2, dest=uReg2, opSize=OpSize32, isSigned=false } :: unboxLarge{ source=aReg1, dest=uReg1 } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=RealComparison(test, precision), arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val ccRef = newCCRef() val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val fpSize = precisionToFpSize precision val uReg1 = newUReg() and uReg2 = newUReg() (* Floating point comparisons. The fcmp instruction differs from integer comparison. If either argument is a NaN the overflow bit is set and the other bits are cleared. That means that in order to get a true result only if the values are not NaNs we have to test that at least one of C, N, or Z are set. We use unsigned tests for < and <= and signed tests for > and >=. *) val cond = case test of TestEqual => CondEqual | TestLess => CondCarryClear | TestLessEqual => CondUnsignedLowOrEq | TestGreater => CondSignedGreater | TestGreaterEqual => CondSignedGreaterEq | TestUnordered => CondOverflow val code = compareFloatingPoint{arg1=uReg1, arg2=uReg2, ccRef=ccRef, opSize=fpSize} :: unboxTagFloat{ floatSize=fpSize, source=aReg2, dest=uReg2 } :: unboxTagFloat{ floatSize=fpSize, source=aReg1, dest=uReg1 } :: arg2Code in (makeBoolResultRev(cond, ccRef, target, code), target, false) end | codeToICodeBinaryRev({oper=RealArith(oper, precision), arg1, arg2}, context, _, destination, tailCode) = let val target = asTarget destination val (arg1Code, aReg1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val fpSize = precisionToFpSize precision val uReg1 = newUReg() and uReg2 = newUReg() and uReg3 = newUReg() val fpOp = case oper of ArithAdd => AddFP | ArithSub => SubtractFP | ArithMult => MultiplyFP | ArithDiv => DivideFP | _ => raise InternalError "RealArith - unimplemented instruction" val code = boxTagFloat{ floatSize=fpSize, source=uReg3, dest=target, saveRegs=[] } :: binaryFloatingPoint{arg1=uReg1, arg2=uReg2, dest=uReg3, fpOp=fpOp, opSize=fpSize } :: unboxTagFloat{ floatSize=fpSize, source=aReg2, dest=uReg2 } :: unboxTagFloat{ floatSize=fpSize, source=aReg1, dest=uReg1 } :: arg2Code in (code, target, false) end | codeToICodeBinaryRev({oper=PointerEq, arg1, arg2}, context, _, destination, tailCode) = let (* Equality of general values which can include pointers. This can be treated exactly as a word equality. It has to be analysed differently for indexed cases. 
*) val ccRef = newCCRef() val (testCode1, testDest1, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (testCode2, testDest2, _) = codeToICodeRev(arg2, context, false, AnyReg, testCode1) val comparison = addSubRegister{base=testDest1, shifted=testDest2, dest=NONE, length=polyWordOpSize, ccRef=SOME ccRef, isAdd=false, shift=ShiftNone} :: testCode2 val target = asTarget destination in (makeBoolResultRev(CondEqual, ccRef, target, comparison), target, false) end | codeToICodeBinaryRev({oper=FreeCStack, arg1, arg2}, context, _, destination, tailCode) = let (* Free space on the C stack. This is a binary operation that takes the base address and the size. The base address isn't used in this version. *) val (arg1Code, _, _) = codeToICodeRev(arg1, context, false, AnyReg, tailCode) val (arg2Code, aReg2, _) = codeToICodeRev(arg2, context, false, AnyReg, arg1Code) val uReg = newUReg() val code = addSubXSP{ source=uReg, dest=ZeroReg, isAdd=true } :: untagValue{ source=aReg2, dest=uReg, isSigned=false, opSize=polyWordOpSize } :: arg2Code in returnUnit(destination, code, false) end (* Code-generate an address into one or two Pregs. At this point they are in a state where we can code-generate arbitrary code before the address is used *) and addressToPregAddress({base, index, offset}, context, code) = let val (bCode, bReg, _) = codeToICodeRev(base, context, false, AnyReg, code) in case index of NONE => ({base=bReg, index=NONE, offset=offset}, bCode) | SOME index => let val (iCode, iReg, _) = codeToICodeRev(index, context, false, AnyReg, bCode) in ({base=bReg, index=SOME iReg, offset=offset}, iCode) end end (* Store the code address and the closure items into a previously allocated closure on the heap. This is used both in the simple case and also with mutually recursive declarations. *) and storeIntoClosure(lambda as { closure, ...}, absClosureAddr, context, tailCode) = let val closureRef = makeConstantClosure() val () = codeFunctionToArm64(lambda, debugSwitches, closureRef) val codeAddrWords = if is32in64 then 2 else 1 fun storeAValue(f, (n, tlCode)) = let val (code, source, _) = codeToICodeRev(BICExtract f, context, false, AnyReg, tlCode) in (n+1, storeAtWordOffset(source, n, absClosureAddr, polyWordLoadSize, code)) end (* Store the code address in the first 64-bits. *) val storeCodeAddress = if is32in64 then let (* We can't use codeAddressFromClosure on 32-in-64 because it always returns a 64-bit value. Instead we have to get the code address at run-time. *) val clReg = newPReg() and absClReg = newUReg() and absCodeReg = newUReg() in storeAtWordOffset(absCodeReg, 0, absClosureAddr, Load64, loadWithConstantOffset{base=absClReg, dest=absCodeReg, byteOffset=0, loadType=Load64} :: objectIndexAddressToAbsolute{ source=clReg, dest=absClReg } :: loadAddressConstant{source=closureAsAddress closureRef, dest=clReg} :: tailCode) end else let val cReg = newPReg() in storeAtWordOffset(cReg, 0, absClosureAddr, Load64, loadAddressConstant{source=codeAddressFromClosure closureRef, dest=cReg} :: tailCode) end val (_, storeCode) = List.foldl storeAValue (codeAddrWords, storeCodeAddress) closure in storeCode end (* Load operations. 
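(* The general pattern below: loads from mutable ML memory go through loadAndStoreWithAbsolute and use load-acquire; loads from immutable memory use a constant-offset or register-indexed addressing mode via loadAndStoreWithAddress; byte, 16-bit and 32-bit C loads go into an untagged register and the value is then tagged, or boxed where it would not fit in a tagged word. *)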
*) and codeLoadOperation(kind, address, context, target, tailCode) = let val (regAddr, codeAddr) = addressToPregAddress(address, context, tailCode) val code = case kind of LoadStoreMLWord {isImmutable=false} => let fun loadOp(addrReg, code) = loadAcquire{base=addrReg, dest=target, loadType=polyWordLoadSize} :: code in - loadAndStoreWithAbsolute (regAddr, polyWordLoadSize, loadOp, codeAddr) + loadAndStoreWithAbsolute (regAddr, opWordSize polyWordLoadSize, loadShift polyWordLoadSize, loadOp, codeAddr) end | LoadStoreMLWord {isImmutable=true} => let fun loadConstOffset(base, offset, code) = loadWithConstantOffset{base=base, dest=target, byteOffset=offset, loadType=polyWordLoadSize} :: code fun loadIndexed(base, index, code) = loadWithIndexedOffset{base=base, dest=target, index=index, loadType=polyWordLoadSize} :: code in - loadAndStoreWithAddress (regAddr, polyWordLoadSize, false, loadConstOffset, loadIndexed, codeAddr) + loadAndStoreWithAddress (regAddr, opWordSize polyWordLoadSize, loadShift polyWordLoadSize, false, loadConstOffset, loadIndexed, codeAddr) end | LoadStoreMLByte {isImmutable=false} => let (* Have to load into a ureg and then tag it. *) val destReg = newUReg() fun loadOp(addrReg, code) = loadAcquire{base=addrReg, dest=destReg, loadType=Load8} :: code in tagValue{source=destReg, dest=target, isSigned=false, opSize=OpSize32} :: - loadAndStoreWithAbsolute (regAddr, Load8, loadOp, codeAddr) + loadAndStoreWithAbsolute (regAddr, opWordSize Load8, loadShift Load8, loadOp, codeAddr) end | LoadStoreMLByte {isImmutable=true} => let (* Have to load into a ureg and then tag it. *) val destReg = newUReg() fun loadConstOffset(base, offset, code) = loadWithConstantOffset{base=base, dest=destReg, byteOffset=offset, loadType=Load8} :: code fun loadIndexed(base, index, code) = loadWithIndexedOffset{base=base, dest=destReg, index=index, loadType=Load8} :: code in tagValue{source=destReg, dest=target, isSigned=false, opSize=OpSize32} :: - loadAndStoreWithAddress(regAddr, Load8, false, loadConstOffset, loadIndexed, codeAddr) + loadAndStoreWithAddress(regAddr, opWordSize Load8, loadShift Load8, false, loadConstOffset, loadIndexed, codeAddr) end | LoadStoreC8 => let (* Have to load into a ureg and then tag it. *) val destReg = newUReg() fun loadConstOffset(base, offset, code) = loadWithConstantOffset{base=base, dest=destReg, byteOffset=offset, loadType=Load8} :: code fun loadIndexed(base, index, code) = loadWithIndexedOffset{base=base, dest=destReg, index=index, loadType=Load8} :: code in tagValue{source=destReg, dest=target, isSigned=false, opSize=OpSize32} :: - loadAndStoreWithAddress(regAddr, Load8, true, loadConstOffset, loadIndexed, codeAddr) + loadAndStoreWithAddress(regAddr, opWordSize Load8, loadShift Load8, true, loadConstOffset, loadIndexed, codeAddr) end | LoadStoreC16 => let (* Have to load into a ureg and then tag it. *) val destReg = newUReg() fun loadConstOffset(base, offset, code) = loadWithConstantOffset{base=base, dest=destReg, byteOffset=offset, loadType=Load16} :: code fun loadIndexed(base, index, code) = loadWithIndexedOffset{base=base, dest=destReg, index=index, loadType=Load16} :: code in tagValue{source=destReg, dest=target, isSigned=false, opSize=OpSize32} :: - loadAndStoreWithAddress(regAddr, Load16, true, loadConstOffset, loadIndexed, codeAddr) + loadAndStoreWithAddress(regAddr, opWordSize Load16, loadShift Load16, true, loadConstOffset, loadIndexed, codeAddr) end | LoadStoreC32 => let (* This is tagged in native 64-bits and boxed in 32-in-64. 
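(* A 32-bit C value needs 33 bits once the tag is added, so it still fits in a native 64-bit tagged word but not in a 32-bit one; hence 32-in-64 boxes it as a large word instead. *)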
*) val destReg = newUReg() fun loadConstOffset(base, offset, code) = loadWithConstantOffset{base=base, dest=destReg, byteOffset=offset, loadType=Load32} :: code fun loadIndexed(base, index, code) = loadWithIndexedOffset{base=base, dest=destReg, index=index, loadType=Load32} :: code in (if is32in64 then boxLarge{ source=destReg, dest=target, saveRegs=[] } else tagValue{source=destReg, dest=target, isSigned=false, opSize=OpSize64 (* It becomes 33 bits *)}) :: - loadAndStoreWithAddress(regAddr, Load32, true, loadConstOffset, loadIndexed, codeAddr) + loadAndStoreWithAddress(regAddr, opWordSize Load32, loadShift Load32, true, loadConstOffset, loadIndexed, codeAddr) end | LoadStoreC64 => let (* This is always boxed. *) val destReg = newUReg() fun loadConstOffset(base, offset, code) = loadWithConstantOffset{base=base, dest=destReg, byteOffset=offset, loadType=Load64} :: code fun loadIndexed(base, index, code) = loadWithIndexedOffset{base=base, dest=destReg, index=index, loadType=Load64} :: code in boxLarge{ source=destReg, dest=target, saveRegs=[]} :: - loadAndStoreWithAddress(regAddr, Load64, true, loadConstOffset, loadIndexed, codeAddr) + loadAndStoreWithAddress(regAddr, opWordSize Load64, loadShift Load64, true, loadConstOffset, loadIndexed, codeAddr) end | LoadStoreCFloat => let (* This always returns a double, not a 32-bit float. *) val destReg = newUReg() and convertReg = newUReg() fun loadConstOffset(base, offset, code) = - loadWithConstantOffset{base=base, dest=destReg, byteOffset=offset, loadType=LoadFloat} :: code + loadFPWithConstantOffset{base=base, dest=destReg, byteOffset=offset, floatSize=Float32} :: code fun loadIndexed(base, index, code) = - loadWithIndexedOffset{base=base, dest=destReg, index=index, loadType=LoadFloat} :: code + loadFPWithIndexedOffset{base=base, dest=destReg, index=index, floatSize=Float32} :: code in boxTagFloat{floatSize=Double64, source=convertReg, dest=target, saveRegs=[]} :: unaryFloatingPt{source=destReg, dest=convertReg, fpOp=ConvFloatToDble} :: - loadAndStoreWithAddress(regAddr, LoadFloat, true, loadConstOffset, loadIndexed, codeAddr) + loadAndStoreWithAddress(regAddr, 4, 0w2, true, loadConstOffset, loadIndexed, codeAddr) end | LoadStoreCDouble => let (* This is always boxed. *) val destReg = newUReg() fun loadConstOffset(base, offset, code) = - loadWithConstantOffset{base=base, dest=destReg, byteOffset=offset, loadType=LoadDouble} :: code + loadFPWithConstantOffset{base=base, dest=destReg, byteOffset=offset, floatSize=Double64} :: code fun loadIndexed(base, index, code) = - loadWithIndexedOffset{base=base, dest=destReg, index=index, loadType=LoadDouble} :: code + loadFPWithIndexedOffset{base=base, dest=destReg, index=index, floatSize=Double64} :: code in boxTagFloat{floatSize=Double64, source=destReg, dest=target, saveRegs=[]} :: - loadAndStoreWithAddress(regAddr, LoadDouble, true, loadConstOffset, loadIndexed, codeAddr) + loadAndStoreWithAddress(regAddr, 8, 0w3, true, loadConstOffset, loadIndexed, codeAddr) end | LoadStoreUntaggedUnsigned => let (* LoadStoreMLWord {isImmutable=true} except it has to be tagged. 
*) val ureg = newUReg() fun loadConstOffset(base, offset, code) = loadWithConstantOffset{base=base, dest=ureg, byteOffset=offset, loadType=polyWordLoadSize} :: code fun loadIndexed(base, index, code) = loadWithIndexedOffset{base=base, dest=ureg, index=index, loadType=polyWordLoadSize} :: code in tagValue{source=ureg, dest=target, isSigned=false, opSize=polyWordOpSize} :: - loadAndStoreWithAddress(regAddr, polyWordLoadSize, false, loadConstOffset, loadIndexed, codeAddr) + loadAndStoreWithAddress(regAddr, opWordSize polyWordLoadSize, loadShift polyWordLoadSize, false, loadConstOffset, loadIndexed, codeAddr) end in (code, target, false) end (* Store operations. *) and codeStoreOperation(kind, address, value, context, destination, tailCode1) = let val (regAddr, codeAddr) = addressToPregAddress(address, context, tailCode1) val (sourceCode, sourceReg, _) = codeToICodeRev(value, context, false, AnyReg, codeAddr) val storeCode = case kind of LoadStoreMLWord {isImmutable=false} => let fun storeOp(addrReg, code) = storeRelease{base=addrReg, source=sourceReg, loadType=polyWordLoadSize} :: code in - loadAndStoreWithAbsolute(regAddr, polyWordLoadSize, storeOp, sourceCode) + loadAndStoreWithAbsolute(regAddr, opWordSize polyWordLoadSize, loadShift polyWordLoadSize, storeOp, sourceCode) end | LoadStoreMLWord {isImmutable=true} => let (* Used when initialising immutables that do not require store-release. *) fun loadConstOffset(base, offset, code) = storeWithConstantOffset{base=base, source=sourceReg, byteOffset=offset, loadType=polyWordLoadSize} :: code fun loadIndexed(base, index, code) = storeWithIndexedOffset{base=base, source=sourceReg, index=index, loadType=polyWordLoadSize} :: code in - loadAndStoreWithAddress (regAddr, polyWordLoadSize, false, loadConstOffset, loadIndexed, sourceCode) + loadAndStoreWithAddress (regAddr, opWordSize polyWordLoadSize, loadShift polyWordLoadSize, false, loadConstOffset, loadIndexed, sourceCode) end | LoadStoreMLByte {isImmutable=false} => let fun storeOp(addrReg, code) = let val tReg = newUReg() in storeRelease{base=addrReg, source=tReg, loadType=Load8} :: untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=OpSize32} :: code end in - loadAndStoreWithAbsolute(regAddr, Load8, storeOp, sourceCode) + loadAndStoreWithAbsolute(regAddr, opWordSize Load8, loadShift Load8, storeOp, sourceCode) end | LoadStoreMLByte {isImmutable=true} => let (* Used when initialising immutables that do not require store-release. 
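(* As in the mutable case above, the tagged byte value is first shifted down into an untagged register with untagValue so that the raw byte, rather than its tagged form, is written; the only difference is that a plain store is used instead of store-release. *)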
*) fun loadConstOffset(base, offset, code) = let val tReg = newUReg() in storeWithConstantOffset{base=base, source=tReg, byteOffset=offset, loadType=Load8} :: untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=OpSize32} :: code end fun loadIndexed(base, index, code) = let val tReg = newUReg() in storeWithIndexedOffset{base=base, source=tReg, index=index, loadType=Load8} :: untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=OpSize32} :: code end in - loadAndStoreWithAddress(regAddr, Load8, false, loadConstOffset, loadIndexed, sourceCode) + loadAndStoreWithAddress(regAddr, opWordSize Load8, loadShift Load8, false, loadConstOffset, loadIndexed, sourceCode) end | LoadStoreC8 => let fun loadConstOffset(base, offset, code) = let val tReg = newUReg() in storeWithConstantOffset{base=base, source=tReg, byteOffset=offset, loadType=Load8} :: untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=OpSize32} :: code end fun loadIndexed(base, index, code) = let val tReg = newUReg() in storeWithIndexedOffset{base=base, source=tReg, index=index, loadType=Load8} :: untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=OpSize32} :: code end in - loadAndStoreWithAddress(regAddr, Load8, true, loadConstOffset, loadIndexed, sourceCode) + loadAndStoreWithAddress(regAddr, opWordSize Load8, loadShift Load8, true, loadConstOffset, loadIndexed, sourceCode) end | LoadStoreC16 => let fun loadConstOffset(base, offset, code) = let val tReg = newUReg() in storeWithConstantOffset{base=base, source=tReg, byteOffset=offset, loadType=Load16} :: untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=OpSize32} :: code end fun loadIndexed(base, index, code) = let val tReg = newUReg() in storeWithIndexedOffset{base=base, source=tReg, index=index, loadType=Load16} :: untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=OpSize32} :: code end in - loadAndStoreWithAddress(regAddr, Load16, true, loadConstOffset, loadIndexed, sourceCode) + loadAndStoreWithAddress(regAddr, opWordSize Load16, loadShift Load16, true, loadConstOffset, loadIndexed, sourceCode) end | LoadStoreC32 => let fun loadConstOffset(base, offset, code) = let val tReg = newUReg() in storeWithConstantOffset{base=base, source=tReg, byteOffset=offset, loadType=Load32} :: (if is32in64 then unboxLarge{source=sourceReg, dest=tReg} else untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=OpSize64}) :: code end fun loadIndexed(base, index, code) = let val tReg = newUReg() in storeWithIndexedOffset{base=base, source=tReg, index=index, loadType=Load32} :: (if is32in64 then unboxLarge{source=sourceReg, dest=tReg} else untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=OpSize64}) :: code end in - loadAndStoreWithAddress(regAddr, Load32, true, loadConstOffset, loadIndexed, sourceCode) + loadAndStoreWithAddress(regAddr, opWordSize Load32, loadShift Load32, true, loadConstOffset, loadIndexed, sourceCode) end | LoadStoreC64 => let fun loadConstOffset(base, offset, code) = let val tReg = newUReg() in storeWithConstantOffset{base=base, source=tReg, byteOffset=offset, loadType=Load64} :: unboxLarge{source=sourceReg, dest=tReg} :: code end fun loadIndexed(base, index, code) = let val tReg = newUReg() in storeWithIndexedOffset{base=base, source=tReg, index=index, loadType=Load64} :: unboxLarge{source=sourceReg, dest=tReg} :: code end in - loadAndStoreWithAddress(regAddr, Load64, true, loadConstOffset, loadIndexed, sourceCode) + loadAndStoreWithAddress(regAddr, opWordSize Load64, loadShift Load64, true, 
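(* Illustrative sketch: the reworked loadAndStoreWithAddress calls pass the access size and
   the index scaling separately instead of the loadType itself.  Judging by the literal
   values used for the floating-point cases (4 and 0w2 for Float32, 8 and 0w3 for Double64),
   opWordSize presumably returns the size of the access in bytes and loadShift the shift
   applied to an index register, roughly:
       fun opWordSize Load64 = 8 | opWordSize Load32 = 4
         | opWordSize Load16 = 2 | opWordSize Load8  = 1
       and loadShift  Load64 = 0w3 | loadShift Load32 = 0w2
         | loadShift  Load16 = 0w1 | loadShift Load8  = 0w0
   These definitions are only a guess for illustration; the real ones are defined elsewhere
   in this change. *)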
loadConstOffset, loadIndexed, sourceCode) end | LoadStoreCFloat => let (* The "real" value is a double, not a 32-bit float *) fun loadConstOffset(base, offset, code) = let val tReg = newUReg() and cReg = newUReg() in - storeWithConstantOffset{base=base, source=tReg, byteOffset=offset, loadType=LoadFloat} :: + storeFPWithConstantOffset{base=base, source=tReg, byteOffset=offset, floatSize=Float32} :: unaryFloatingPt{source=cReg, dest=tReg, fpOp=ConvDbleToFloat} :: unboxTagFloat{floatSize=Double64, source=sourceReg, dest=cReg} :: code end fun loadIndexed(base, index, code) = let val tReg = newUReg() and cReg = newUReg() in - storeWithIndexedOffset{base=base, source=tReg, index=index, loadType=LoadFloat} :: + storeFPWithIndexedOffset{base=base, source=tReg, index=index, floatSize=Float32} :: unaryFloatingPt{source=cReg, dest=tReg, fpOp=ConvDbleToFloat} :: unboxTagFloat{floatSize=Double64, source=sourceReg, dest=cReg} :: code end in - loadAndStoreWithAddress(regAddr, LoadFloat, true, loadConstOffset, loadIndexed, sourceCode) + loadAndStoreWithAddress(regAddr, 4, 0w2, true, loadConstOffset, loadIndexed, sourceCode) end | LoadStoreCDouble => let fun loadConstOffset(base, offset, code) = let val tReg = newUReg() in - storeWithConstantOffset{base=base, source=tReg, byteOffset=offset, loadType=LoadDouble} :: + storeFPWithConstantOffset{base=base, source=tReg, byteOffset=offset, floatSize=Double64} :: unboxTagFloat{floatSize=Double64, source=sourceReg, dest=tReg} :: code end fun loadIndexed(base, index, code) = let val tReg = newUReg() in - storeWithIndexedOffset{base=base, source=tReg, index=index, loadType=LoadDouble} :: + storeFPWithIndexedOffset{base=base, source=tReg, index=index, floatSize=Double64} :: unboxTagFloat{floatSize=Double64, source=sourceReg, dest=tReg} :: code end in - loadAndStoreWithAddress(regAddr, LoadDouble, true, loadConstOffset, loadIndexed, sourceCode) + loadAndStoreWithAddress(regAddr, 8, 0w3, true, loadConstOffset, loadIndexed, sourceCode) end | LoadStoreUntaggedUnsigned => let (* Only used when initialising strings so this does not require store-release. *) fun loadConstOffset(base, offset, code) = let val tReg = newUReg() in storeWithConstantOffset{base=base, source=tReg, byteOffset=offset, loadType=polyWordLoadSize} :: untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=polyWordOpSize} :: code end fun loadIndexed(base, index, code) = let val tReg = newUReg() in storeWithIndexedOffset{base=base, source=tReg, index=index, loadType=polyWordLoadSize} :: untagValue{source=sourceReg, dest=tReg, isSigned=false, opSize=polyWordOpSize} :: code end in - loadAndStoreWithAddress(regAddr, polyWordLoadSize, false, loadConstOffset, loadIndexed, sourceCode) + loadAndStoreWithAddress(regAddr, opWordSize polyWordLoadSize, loadShift polyWordLoadSize, false, loadConstOffset, loadIndexed, sourceCode) end in returnUnit(destination, storeCode, false) end (*Turn the codetree structure into icode. *) val bodyContext = {loopArgs=NONE, stackPtr=0, currHandler=NONE, overflowBlock=ref NONE} val (bodyCode, _, bodyExited) = codeToICodeRev(body, bodyContext, true, SpecificPReg resultTarget, beginInstructions) val icode = if bodyExited then bodyCode else returnInstruction(bodyContext, resultTarget, bodyCode) (* Turn the icode list into basic blocks. The input list is in reverse so as part of this we reverse the list. 
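(* Worked example of splitCode below, where ret, i1 and i2 stand for arbitrary instructions:
   given the reversed input
       [BlockExit ret, BlockSimple i2, BlockLabel 1,
        BlockFlow(Unconditional 1), BlockSimple i1, BlockBegin args]
   the BlockLabel entry creates block 1 = { block=[i2, ret], flow=ExitCode } and the
   BlockBegin entry creates block 0 = { block=[BeginFunction args, i1], flow=Unconditional 1 }.
   A label always terminates the block being accumulated and defaults the earlier block's
   flow to an unconditional jump to that label. *)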
*) local val resArray = Array.array(!labelCounter, BasicBlock{ block=[], flow=ExitCode }) fun createEntry (blockNo, block, flow) = Array.update(resArray, blockNo, BasicBlock{ block=block, flow=flow}) fun splitCode([], _, _) = (* End of code. We should have had a BeginFunction. *) raise InternalError "splitCode - no begin" | splitCode(BlockBegin args :: _, sinceLabel, flow) = (* Final instruction. Create the initial block and exit. *) createEntry(0, BeginFunction args ::sinceLabel, flow) | splitCode(BlockSimple instr :: rest, sinceLabel, flow) = splitCode(rest, instr :: sinceLabel, flow) | splitCode(BlockLabel label :: rest, sinceLabel, flow) = (* Label - finish this block and start another. *) ( createEntry(label, sinceLabel, flow); (* Default to a jump to this label. That is used if we have assumed a drop-through. *) splitCode(rest, [], Unconditional label) ) | splitCode(BlockExit instr :: rest, _, _) = splitCode(rest, [instr], ExitCode) | splitCode(BlockFlow flow :: rest, _, _) = splitCode(rest, [], flow) | splitCode(BlockRaiseAndHandle(instr, handler) :: rest, _, _) = splitCode(rest, [instr], UnconditionalHandle handler) | splitCode(BlockOptionalHandle{call, handler, label} :: rest, sinceLabel, flow) = let (* A function call within a handler. This could go to the handler but if there is no exception will go to the next instruction. Also includes JumpLoop since the stack check could result in an Interrupt exception. *) in createEntry(label, sinceLabel, flow); splitCode(rest, [call], ConditionalHandle{handler=handler, continue=label}) end in val () = splitCode(icode, [], ExitCode) val resultVector = Array.vector resArray end open ICodeTransform val pregProperties = Vector.fromList(List.rev(! pregPropList)) in codeICodeFunctionToArm64{blocks = resultVector, functionName = name, pregProps = pregProperties, ccCount= ! ccRefCounter, debugSwitches = debugSwitches, resultClosure = resultClosure, profileObject = profileObject} end val gencodeLambda = codeFunctionToArm64 structure Foreign = Arm64Foreign structure Sharing = struct type backendIC = backendIC and bicLoadForm = bicLoadForm and argumentType = argumentType and closureRef = closureRef end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICode.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICode.ML index 09fc51c0..27b67095 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICode.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICode.ML @@ -1,929 +1,958 @@ (* Copyright David C. J. Matthews 2021 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64ICode( - structure Arm64Code: ARM64ASSEMBLY + structure Arm64Code: ARM64PREASSEMBLY ): ARM64ICODE = struct open Arm64Code open Address datatype preg = PReg of int (* A pseudo-register - an abstract register. *) (* If the value is zero we can use X0/W0. *) datatype pregOrZero = SomeReg of preg | ZeroReg (* A location on the stack. 
May be more than word if this is a container or a handler entry. *) datatype stackLocn = StackLoc of {size: int, rno: int } (* This combines pregKind and stackLocn. *) datatype regProperty = RegPropGeneral (* A general register. *) | RegPropUntagged (* An untagged general register. *) | RegPropStack of int (* A stack location or container. *) | RegPropCacheTagged | RegPropCacheUntagged | RegPropMultiple (* The result of a conditional or case. May be defined at multiple points. *) (* The reference to a condition code. *) datatype ccRef = CcRef of int datatype reg = GenReg of xReg | FPReg of vReg - datatype loadType = Load64 | Load32 | Load16 | Load8 | LoadDouble | LoadFloat - - and opSize = OpSize32 | OpSize64 - and logicalOp = LogAnd | LogOr | LogXor - and callKind = Recursive | ConstantCode of machineWord | FullCall - and floatSize = Float32 | Double64 - and shiftDirection = ShiftLeft | ShiftRightLogical | ShiftRightArithmetic - and multKind = - MultAdd32 | MultSub32 | MultAdd64 | MultSub64 | - SignedMultAddLong (* 32bit*32bit + 64bit => 64Bit *) | - SignedMultHigh (* High order part of 64bit*64Bit *) - and fpUnary = NegFloat | NegDouble | AbsFloat | AbsDouble | ConvFloatToDble | ConvDbleToFloat - and fpBinary = MultiplyFP | DivideFP | AddFP | SubtractFP + datatype callKind = Recursive | ConstantCode of machineWord | FullCall (* Function calls can have an unlimited number of arguments so it isn't always going to be possible to load them into registers. *) datatype fnarg = ArgInReg of preg | ArgOnStack of { wordOffset: int, container: stackLocn, field: int } + datatype arm64ICode = (* Move the contents of one preg to another. These are always 64-bits. *) MoveRegister of { source: preg, dest: preg } (* Numerical constant. *) | LoadNonAddressConstant of { source: Word64.word, dest: preg } (* Address constant. *) | LoadAddressConstant of { source: machineWord, dest: preg } (* Load a value into a register using a constant, signed, byte offset. The offset is in the range of -256 to (+4095*unit size). *) | LoadWithConstantOffset of { base: preg, dest: preg, byteOffset: int, loadType: loadType } + (* Similarly for FP registers. *) + | LoadFPWithConstantOffset of { base: preg, dest: preg, byteOffset: int, floatSize: floatSize } + (* Load a value into a register using an index register. *) | LoadWithIndexedOffset of { base: preg, dest: preg, index: preg, loadType: loadType } - (* Load an entry from the "memory registers". Used for ThreadSelf and CheckRTSException. - These are always 64-bit values. *) - | LoadMemReg of { wordOffset: int, dest: preg } + (* Ditto for FP. *) + | LoadFPWithIndexedOffset of { base: preg, dest: preg, index: preg, floatSize: floatSize } + + (* Returns the current thread ID. Always a 64-bit value.. *) + | GetThreadId of { dest: preg } (* Convert a 32-in-64 object index into an absolute address. *) | ObjectIndexAddressToAbsolute of { source: preg, dest: preg } (* Convert an absolute address into an object index. *) | AbsoluteToObjectIndex of { source: preg, dest: preg } (* Allocate a fixed sized piece of memory and puts the absolute address into dest. bytesRequired is the total number of bytes including the length word and any alignment necessary for 32-in-64. saveRegs is the list of registers that need to be saved if we need to do a garbage collection. *) | AllocateMemoryFixed of { bytesRequired: Word64.word, dest: preg, saveRegs: preg list } (* Allocate a piece of memory. The size argument is an untagged value containing the number of words i.e. 
the same value used for InitialiseMemory and to store in the length word. *) | AllocateMemoryVariable of { size: preg, dest: preg, saveRegs: preg list } (* Initialise a piece of memory by writing "size" copies of the value in "init". N.B. The size is an untagged value containing the number of words. *) | InitialiseMem of { size: preg, addr: preg, init: preg } (* Mark the beginning of a loop. This is really only to prevent the initialisation code being duplicated in ICodeOptimise. *) | BeginLoop (* Set up the registers for a jump back to the start of a loop. *) | JumpLoop of { regArgs: {src: fnarg, dst: preg} list, stackArgs: {src: fnarg, wordOffset: int, stackloc: stackLocn} list, checkInterrupt: preg list option } (* Store a register using a constant, signed, byte offset. The offset is in the range of -256 to (+4095*unit size). *) | StoreWithConstantOffset of { source: preg, base: preg, byteOffset: int, loadType: loadType } + (* Ditto for FP regs. *) + | StoreFPWithConstantOffset of { source: preg, base: preg, byteOffset: int, floatSize: floatSize } + (* Store a register using an index register. *) | StoreWithIndexedOffset of { source: preg, base: preg, index: preg, loadType: loadType } + (* and for FP regs. *) + | StoreFPWithIndexedOffset of { source: preg, base: preg, index: preg, floatSize: floatSize } + (* Add/Subtract immediate. The destination is optional in which case XZero is used. ccRef is optional. If it is NONE the version of the instruction that does not generate a condition code is used. immed must be < 0wx1000. *) | AddSubImmediate of { source: preg, dest: preg option, ccRef: ccRef option, immed: word, isAdd: bool, length: opSize } (* Add/Subtract register. As with AddSubImmediate, both the destination and cc are optional. *) | AddSubRegister of { base: preg, shifted: preg, dest: preg option, ccRef: ccRef option, isAdd: bool, length: opSize, shift: shiftType } (* Bitwise logical operations. The immediate value must be a valid bit pattern. ccRef can only be SOME if logOp is LogAnd. *) | LogicalImmediate of { source: preg, dest: preg option, ccRef: ccRef option, immed: Word64.word, logOp: logicalOp, length: opSize } (* Register logical operations. ccRef can only be SOME if logOp is LogAnd.*) | LogicalRegister of { base: preg, shifted: preg, dest: preg option, ccRef: ccRef option, logOp: logicalOp, length: opSize, shift: shiftType } (* Shift a word by an amount specified in a register. *) | ShiftRegister of { direction: shiftDirection, dest: preg, source: preg, shift: preg, opSize: opSize } (* The various forms of multiply all take three arguments and the general form is dest = M * N +/- A.. *) | Multiplication of { kind: multKind, dest: preg, sourceA: preg option, sourceM: preg, sourceN: preg } (* Signed or unsigned division. Sets the result to zero if the divisor is zero. *) | Division of { isSigned: bool, dest: preg, dividend: preg, divisor: preg, opSize: opSize } (* Start of function. Set the register arguments. stackArgs is the list of stack arguments. If the function has a real closure regArgs includes the closure register (X8). The register arguments include the return register (X30). *) | BeginFunction of { regArgs: (preg * xReg) list, stackArgs: stackLocn list } (* Call a function. If the code address is a constant it is passed here. Otherwise the address is obtained by indirecting through X8 which has been loaded as one of the argument registers. The result is stored in the destination register. 
The "containers" argument is used to ensure that any container whose address is passed as one of the other arguments continues to be referenced until the function is called since there's a possibility that it isn't actually used after the function. *) | FunctionCall of { callKind: callKind, regArgs: (fnarg * xReg) list, stackArgs: fnarg list, dest: preg, saveRegs: preg list, containers: stackLocn list} (* Jump to a tail-recursive function. This is similar to FunctionCall but complicated for stack arguments because the stack and the return address need to be overwritten. stackAdjust is the number of words to remove (positive) or add (negative) to the stack before the call. currStackSize contains the number of items currently on the stack. *) | TailRecursiveCall of { callKind: callKind, regArgs: (fnarg * xReg) list, stackArgs: {src: fnarg, stack: int} list, stackAdjust: int, currStackSize: int } (* Return from the function. resultReg is the preg that contains the result, returnReg is the preg that contains the return address. *) | ReturnResultFromFunction of { resultReg: preg, returnReg: preg, numStackArgs: int } (* Raise an exception. The packet is always loaded into X0. *) | RaiseExceptionPacket of { packetReg: preg } (* Push a register to the stack. This is used both for a normal push, copies=1, and also to reserve a container. *) | PushToStack of { source: preg, copies: int, container: stackLocn } (* Load a register from the stack. The container is the stack location identifier, the field is an offset in a container. *) | LoadStack of { dest: preg, wordOffset: int, container: stackLocn, field: int } (* Store a value into the stack. *) | StoreToStack of { source: preg, container: stackLocn, field: int, stackOffset: int } (* Set the register to the address of the container i.e. a specific offset on the stack. *) | ContainerAddress of { dest: preg, container: stackLocn, stackOffset: int } (* Remove items from the stack. Used to remove containers or registers pushed to the stack.. *) | ResetStackPtr of { numWords: int } (* Tag a value by shifting and setting the tag bit. *) | TagValue of { source: preg, dest: preg, isSigned: bool, opSize: opSize } (* Shift a value to remove the tag bit. The cache is used if this is untagging a value that has previously been tagged. *) | UntagValue of { source: preg, dest: preg, isSigned: bool, opSize: opSize } (* Box a largeword value. Stores a value into a byte area. This can be implemented using AllocateMemoryFixed but keeping it separate makes optimisation easier. The result is always an address and needs to be converted to an object index on 32-in-64. *) | BoxLarge of { source: preg, dest: preg, saveRegs: preg list } (* Load a value from a box. This can be implemented using a load but is kept separate to simplify optimisation. The source is always an absolute address. *) | UnboxLarge of { source: preg, dest: preg } (* Convert a floating point value into a value suitable for storing in the heap. This normally involves boxing except that 32-bit floats can be tagged in native 64-bits. *) | BoxTagFloat of { floatSize: floatSize, source: preg, dest: preg, saveRegs: preg list } (* The reverse of BoxTagFloat. *) | UnboxTagFloat of { floatSize: floatSize, source: preg, dest: preg } (* Load a value with acquire semantics. This means that any other load in this thread after this sees the value of the shared memory at this point and not earlier. 
This is used for references and arrays to ensure that if another thread has built a data structure on the heap and then assigns the address to a shared ref this thread will see the updated heap and not any locally cached previous version. *) | LoadAcquire of { base: preg, dest: preg, loadType: loadType } (* Store a value with release semantics. This ensures that any other write completes before this operation and works with LoadAcquire. *) | StoreRelease of { base: preg, source: preg, loadType: loadType } (* This is a generalised constant shift which includes selection of a range of bits. *) | BitFieldShift of { source: preg, dest: preg, isSigned: bool, length: opSize, immr: word, imms: word } (* Copy a range of bits and insert it into another register. This is the only case where a register functions both as a source and a destination. *) | BitFieldInsert of { source: preg, destAsSource: preg, dest: preg, length: opSize, immr: word, imms: word } (* Indexed case. *) | IndexedCaseOperation of { testReg: preg } (* Exception handling. - Set up an exception handler. *) | PushExceptionHandler (* End of a handled section. Restore the previous handler. *) | PopExceptionHandler (* Marks the start of a handler. This sets the stack pointer and restores the old handler. Sets the exception packet register. *) | BeginHandler of { packetReg: preg } (* Compare two vectors of bytes and set the condition code on the result. The registers are modified by the instruction. *) | CompareByteVectors of { vec1Addr: preg, vec2Addr: preg, length: preg, ccRef: ccRef } (* Move a block of bytes (isByteMove true) or words (isByteMove false). The length is the number of items (bytes or words) to move. The registers are modified by the instruction. *) | BlockMove of { srcAddr: preg, destAddr: preg, length: preg, isByteMove: bool } (* Add or subtract to the system stack pointer and optionally return the new value. This is used to allocate and deallocate C space. *) | AddSubXSP of { source: preg, dest: pregOrZero, isAdd: bool } (* Ensures the value will actually be referenced although it doesn't generate any code. *) | TouchValue of { source: preg } (* Load a value at the address and get exclusive access. Always loads a 64-bit value. *) | LoadAcquireExclusive of { base: preg, dest: preg } (* Store a value into an address releasing the lock. Sets the result to either 0 or 1 if it succeeds or fails. *) | StoreReleaseExclusive of { base: preg, source: pregOrZero, result: preg } (* Insert a memory barrier. dmb ish. *) | MemoryBarrier (* Convert an integer to a floating point value. *) | ConvertIntToFloat of { source: preg, dest: preg, srcSize: opSize, destSize: floatSize } (* Convert a floating point value to an integer using the specified rounding mode. We could get an overflow here but fortunately the ARM generates a value that will cause an overflow when we tag it, provided we tag it explicitly. *) | ConvertFloatToInt of { source: preg, dest: preg, srcSize: floatSize, destSize: opSize, rounding: IEEEReal.rounding_mode } (* Unary floating point. This includes conversions between float and double. *) | UnaryFloatingPt of { source: preg, dest: preg, fpOp: fpUnary } (* Binary floating point: addition, subtraction, multiplication and division. *) | BinaryFloatingPoint of { arg1: preg, arg2: preg, dest: preg, fpOp: fpBinary, opSize: floatSize } (* Floating point comparison. *) | CompareFloatingPoint of { arg1: preg, arg2: preg, ccRef: ccRef, opSize: floatSize } (* Destinations at the end of a basic block. 
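(* Illustration of the LoadAcquire/StoreRelease pairing described above, with the register
   numbers invented for the example: a thread that builds a heap cell and then publishes its
   address through a shared ref would end with
       StoreRelease{ base=PReg 1, source=PReg 2, loadType=Load64 }
   and a reader of that ref would use
       LoadAcquire{ base=PReg 1, dest=PReg 3, loadType=Load64 }
   so that the heap stores made before the release are visible after the acquire. *)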
*) and controlFlow = (* Unconditional branch to a label - should be a merge point. *) Unconditional of int (* Conditional branch. Jumps to trueJump if the condional is true, falseJump if false. *) | Conditional of { ccRef: ccRef, condition: condition, trueJump: int, falseJump: int } (* Exit - the last instruction of the block is a return, raise or tailcall. *) | ExitCode (* Indexed case - this branches to one of a number of labels *) | IndexedBr of int list (* Set up a handler. This doesn't cause an immediate branch but the state at the start of the handler is the state at this point. *) | SetHandler of { handler: int, continue: int } (* Unconditional branch to a handler. If an exception is raised explicitly within the scope of a handler. *) | UnconditionalHandle of int (* Conditional branch to a handler. Occurs if there is a call to a function within the scope of a handler. It may jump to the handler. *) | ConditionalHandle of { handler: int, continue: int } and basicBlock = BasicBlock of { block: arm64ICode list, flow: controlFlow } (* Return the list of blocks that are the immediate successor of this. *) fun successorBlocks(Unconditional l) = [l] | successorBlocks(Conditional{trueJump, falseJump, ...}) = [trueJump, falseJump] | successorBlocks ExitCode = [] | successorBlocks(IndexedBr cases) = cases | successorBlocks(SetHandler{handler, continue, ...}) = [handler, continue] (* We only need "handler" in SetHandler because we may have a handler that is never actually jumped to. *) | successorBlocks(UnconditionalHandle handler) = [handler] | successorBlocks(ConditionalHandle{handler, continue, ...}) = [handler, continue] local fun printReg(PReg i, stream) = stream("R" ^ Int.toString i) and printCC(CcRef ccRef, stream) = stream ("CC" ^ Int.toString ccRef) fun printOptReg(ZeroReg, stream) = stream "Zero" | printOptReg(SomeReg reg, stream) = printReg(reg, stream) fun printStackLoc(StackLoc{size, rno}, stream) = (stream "S"; stream(Int.toString rno); stream "("; stream(Int.toString size); stream ")") fun regRepr(XReg w) = "X" ^ Int.toString(Word8.toInt w) | regRepr XZero = "XZ" | regRepr XSP = "SP" fun arithRepr OpSize64 = "64" | arithRepr OpSize32 = "32" fun printLoadType(Load64, stream) = stream "64" | printLoadType(Load32, stream) = stream "32" | printLoadType(Load16, stream) = stream "16" | printLoadType(Load8, stream) = stream "8" - | printLoadType(LoadDouble, stream) = stream "D" - | printLoadType(LoadFloat, stream) = stream "F" fun printSaves([], _) = () | printSaves([areg], stream) = printReg(areg, stream) | printSaves(areg::more, stream) = (printReg(areg, stream); stream ","; printSaves(more, stream)) fun printArg(ArgInReg reg, stream) = printReg(reg, stream) | printArg(ArgOnStack{wordOffset, container, field, ...}, stream) = ( printStackLoc(container, stream); stream " + "; stream(Int.toString field); stream " ("; stream(Int.toString wordOffset); stream ")" ) fun printShift(ShiftLSL w, stream) = stream(" LSL " ^ Word8.toString w) | printShift(ShiftLSR w, stream) = stream(" LSR " ^ Word8.toString w) | printShift(ShiftASR w, stream) = stream(" ASR " ^ Word8.toString w) | printShift(ShiftNone, _) = () fun printFloatSize(Float32, stream) = stream "F" | printFloatSize(Double64, stream) = stream "D" fun printICode(MoveRegister{ source, dest }, stream) = ( stream "\tMove\t"; printReg(source, stream); stream " => "; printReg(dest, stream) ) | printICode(LoadNonAddressConstant{ source, dest }, stream) = ( stream "\tLoadNonAddress\t"; stream(Word64.toString source); stream " => "; 
printReg(dest, stream) ) | printICode(LoadAddressConstant{ source, dest }, stream) = ( stream "\tLoadAddress\t"; stream(Address.stringOfWord source); stream " => "; printReg(dest, stream) ) | printICode(LoadWithConstantOffset{ base, dest, byteOffset, loadType }, stream) = ( stream "\tLoadConstOffset"; printLoadType(loadType, stream); stream "\t["; printReg(base, stream); stream "]+"; stream(Int.toString byteOffset); stream " => "; printReg(dest, stream) ) + | printICode(LoadFPWithConstantOffset{ base, dest, byteOffset, floatSize }, stream) = + ( + stream "\tLoadConstOffset"; printFloatSize(floatSize, stream); stream "\t["; + printReg(base, stream); stream "]+"; + stream(Int.toString byteOffset); + stream " => "; + printReg(dest, stream) + ) + | printICode(LoadWithIndexedOffset{ base, dest, index, loadType }, stream) = ( stream "\tLoadIndexed"; printLoadType(loadType, stream); stream "\t["; printReg(base, stream); stream "+"; printReg(index, stream); stream "] => "; printReg(dest, stream) ) - | printICode(LoadMemReg { wordOffset, dest}, stream) = - ( stream "\tLoadMemReg\t"; stream(Int.toString wordOffset); stream " => "; printReg(dest, stream) ) + | printICode(LoadFPWithIndexedOffset{ base, dest, index, floatSize }, stream) = + ( + stream "\tLoadIndexed"; printFloatSize(floatSize, stream); stream "\t["; + printReg(base, stream); stream "+"; printReg(index, stream); + stream "] => "; printReg(dest, stream) + ) + + | printICode(GetThreadId { dest}, stream) = + ( stream "\tGetThreadId\t"; stream " => "; printReg(dest, stream) ) | printICode(ObjectIndexAddressToAbsolute{ source, dest }, stream) = ( stream "\tObjectAddrToAbs\t"; printReg(source, stream); stream " => "; printReg(dest, stream) ) | printICode(AbsoluteToObjectIndex{ source, dest }, stream) = ( stream "\tAbsToObjectAddr\t"; printReg(source, stream); stream " => "; printReg(dest, stream) ) | printICode(AllocateMemoryFixed{bytesRequired, dest, saveRegs}, stream) = ( stream "\tAllocateMemory\t"; stream(Word64.fmt StringCvt.DEC bytesRequired); stream " => "; printReg(dest, stream); stream " save="; printSaves(saveRegs, stream) ) | printICode(AllocateMemoryVariable{size, dest, saveRegs}, stream) = ( stream "\tAllocateMemory\t"; stream "s="; printReg(size, stream); stream " => "; printReg(dest, stream); stream " save="; printSaves(saveRegs, stream) ) | printICode(InitialiseMem{size, addr, init}, stream) = ( stream "\tInitialiseMem\t"; stream "s="; printReg(size, stream); stream ",i="; printReg(init, stream); stream ",a="; printReg(addr, stream) ) | printICode(BeginLoop, stream) = stream "\tBeginLoop" | printICode(JumpLoop{regArgs, stackArgs, checkInterrupt, ... 
}, stream) = ( stream "\tJumpLoop\t"; List.app(fn {src, dst} => (printReg(dst, stream); stream "="; printArg(src, stream); stream " ")) regArgs; List.app( fn {src, wordOffset, stackloc} => (printStackLoc(stackloc, stream); stream("(sp" ^ Int.toString wordOffset); stream ")="; printArg(src, stream); stream " ") ) stackArgs; case checkInterrupt of NONE => () | SOME saveRegs => (stream " Check:save="; printSaves(saveRegs, stream)) ) | printICode(StoreWithConstantOffset{ base, source, byteOffset, loadType }, stream) = ( stream "\tStoreConstOffset"; printLoadType(loadType, stream); stream "\t"; printReg(source, stream); stream " => ["; printReg(base, stream); stream "+"; stream(Int.toString byteOffset); stream "]" ) + | printICode(StoreFPWithConstantOffset{ base, source, byteOffset, floatSize }, stream) = + ( + stream "\tStoreConstOffset"; printFloatSize(floatSize, stream); stream "\t"; + printReg(source, stream); stream " => ["; + printReg(base, stream); stream "+"; + stream(Int.toString byteOffset); stream "]" + ) + | printICode(StoreWithIndexedOffset{ base, source, index, loadType }, stream) = ( stream "\tStoreIndexed"; printLoadType(loadType, stream); stream "\t"; printReg(source, stream); stream " => ["; printReg(base, stream); stream "+"; printReg(index, stream); stream "]" ) + | printICode(StoreFPWithIndexedOffset{ base, source, index, floatSize }, stream) = + ( + stream "\tStoreIndexed"; printFloatSize(floatSize, stream); stream "\t"; + printReg(source, stream); stream " => ["; + printReg(base, stream); stream "+"; printReg(index, stream); stream "]" + ) + | printICode(AddSubImmediate{ source, dest, ccRef, immed, isAdd, length }, stream) = ( stream (if isAdd then "\tAddImmediate" else "\tSubImmediate"); stream(arithRepr length); stream "\t"; printReg(source, stream); stream ",0x"; stream(Word.toString immed); stream " => "; case dest of NONE => stream "_" | SOME reg => printReg(reg, stream); case ccRef of NONE => () | SOME cc => (stream ", "; printCC(cc, stream)) ) | printICode(AddSubRegister{ base, shifted, dest, ccRef, isAdd, length, shift }, stream) = ( stream (if isAdd then "\tAddRegister" else "\tSubRegister"); stream(arithRepr length); stream "\t"; printReg(base, stream); stream ", "; printReg(shifted, stream); printShift(shift, stream); stream " => "; case dest of NONE => stream "_" | SOME reg => printReg(reg, stream); case ccRef of NONE => () | SOME cc => (stream ", "; printCC(cc, stream)) ) | printICode(LogicalImmediate{ source, dest, ccRef, immed, logOp, length }, stream) = ( stream (case logOp of LogAnd => "\tAndImmediate" | LogOr => "\tOrImmediate" | LogXor => "\tXorImmediate"); stream(arithRepr length); stream "\t"; printReg(source, stream); stream ",0x"; stream(Word64.toString immed); stream " => "; case dest of NONE => stream "_" | SOME reg => printReg(reg, stream); case ccRef of NONE => () | SOME cc => (stream ", "; printCC(cc, stream)) ) | printICode(LogicalRegister{ base, shifted, dest, ccRef, logOp, length, shift }, stream) = ( stream (case logOp of LogAnd => "\tAndRegister" | LogOr => "\tOrRegister" | LogXor => "\tXorRegister"); stream(arithRepr length); stream "\t"; printReg(base, stream); stream ", "; printReg(shifted, stream); printShift(shift, stream); stream " => "; case dest of NONE => stream "_" | SOME reg => printReg(reg, stream); case ccRef of NONE => () | SOME cc => (stream ", "; printCC(cc, stream)) ) | printICode(ShiftRegister{ direction, dest, source, shift, opSize }, stream) = ( stream ( case direction of ShiftLeft => "\tShiftLeft" | ShiftRightLogical 
=> "\tShiftRightLog" | ShiftRightArithmetic => "\tShiftRightArith"); stream(arithRepr opSize); stream "\t"; printReg(source, stream); stream " by "; printReg(shift, stream); stream " => "; printReg(dest, stream) ) | printICode(Multiplication{ kind, dest, sourceA, sourceM, sourceN }, stream) = ( stream ( case kind of MultAdd32 => "\tMultAdd32\t" | MultSub32 => "\tMultSub32\t" | MultAdd64 => "\tMultAdd64\t" | MultSub64 => "\tMultSub64\t" | SignedMultAddLong => "\tSignedMultAddLong\t" | SignedMultHigh => "\tSignedMultHigh\t"); printReg(sourceM, stream); stream " * "; printReg(sourceN, stream); case sourceA of SOME srcA => (stream " +/- "; printReg(srcA, stream)) | NONE => (); stream " => "; printReg(dest, stream) ) | printICode(Division{ isSigned, dest, dividend, divisor, opSize }, stream) = ( stream (if isSigned then "\tSignedDivide" else "\tUnsignedDivide"); stream(arithRepr opSize); stream "\t"; printReg(dividend, stream); stream " by "; printReg(divisor, stream); stream " => "; printReg(dest, stream) ) | printICode(BeginFunction{ regArgs, stackArgs }, stream) = ( stream "\tBeginFunction\t"; List.app(fn (arg, r) => (stream(regRepr r); stream "="; printReg(arg, stream); stream " ")) regArgs; List.app(fn s => printStackLoc(s, stream)) stackArgs ) | printICode(FunctionCall{callKind, regArgs, stackArgs, dest, saveRegs, containers}, stream) = ( stream "\tFunctionCall\t"; case callKind of Recursive => stream "recursive " | ConstantCode m => (stream(stringOfWord m); stream " ") | FullCall => (); List.app(fn (arg, r) => (stream(regRepr r); stream "="; printArg(arg, stream); stream " ")) regArgs; List.app(fn arg => (stream "p="; printArg(arg, stream); stream " ")) stackArgs; stream "=> "; printReg(dest, stream); stream " save="; printSaves(saveRegs, stream); if null containers then () else (stream " containers="; List.app (fn c => (printStackLoc(c, stream); stream " ")) containers) ) | printICode(TailRecursiveCall{callKind, regArgs, stackArgs, stackAdjust, currStackSize, ...}, stream) = ( stream "\tTailCall\t"; case callKind of Recursive => stream "recursive " | ConstantCode m => (stream(stringOfWord m); stream " ") | FullCall => (); List.app(fn (arg, r) => (stream(regRepr r); stream "="; printArg(arg, stream); stream " ")) regArgs; List.app(fn {src, stack} => (stream (Int.toString stack); stream "<="; printArg(src, stream); stream " ")) stackArgs; stream "adjust="; stream(Int.toString stackAdjust); stream " stackSize="; stream(Int.toString currStackSize) ) | printICode(ReturnResultFromFunction{ resultReg, returnReg, numStackArgs }, stream) = ( stream "\tReturnFromFunction\t"; printReg(returnReg, stream); stream " with "; printReg(resultReg, stream); stream("," ^ Int.toString numStackArgs) ) | printICode(RaiseExceptionPacket{ packetReg }, stream) = ( stream "\tRaiseException\t"; printReg(packetReg, stream) ) | printICode(PushToStack{ source, copies, container }, stream) = ( stream "\tPushToStack\t"; printReg(source, stream); if copies > 1 then (stream " * "; stream(Int.toString copies)) else (); stream " => "; printStackLoc(container, stream) ) | printICode(LoadStack{ dest, wordOffset, container, field }, stream) = ( stream "\tLoadStack\t"; printStackLoc(container, stream); stream " + "; stream(Int.toString field); stream " ("; stream(Int.toString wordOffset); stream ")"; stream " => "; printReg(dest, stream) ) | printICode(StoreToStack{ source, container, field, stackOffset }, stream) = ( stream "\tStoreToStack\t"; printReg(source, stream); stream " => "; printStackLoc(container, stream); stream 
"+"; stream (Int.toString field); stream "("; stream(Int.toString stackOffset); stream ")" ) | printICode(ContainerAddress{ dest, container, stackOffset }, stream) = ( stream "\tContainerAddress\t"; stream "@"; printStackLoc(container, stream); stream " ("; stream(Int.toString stackOffset); stream ") => "; printReg(dest, stream) ) | printICode(ResetStackPtr{ numWords }, stream) = ( stream "\tResetStackPtr\t"; stream(Int.toString numWords) ) | printICode(TagValue{ source, dest, isSigned, opSize }, stream) = ( stream "\tTag"; stream(if isSigned then "Signed" else "Unsigned"); stream(arithRepr opSize); stream "\t"; printReg(source, stream); stream " => "; printReg(dest, stream) ) | printICode(UntagValue{ source, dest, isSigned, opSize }, stream) = ( stream "\tUntag"; stream(if isSigned then "Signed" else "Unsigned"); stream(arithRepr opSize); stream "\t"; printReg(source, stream); stream " => "; printReg(dest, stream) ) | printICode(BoxLarge{source, dest, saveRegs}, stream) = ( stream "\tBoxLarge\t"; printReg(source, stream); stream " => "; printReg(dest, stream); stream " save="; printSaves(saveRegs, stream) ) | printICode(UnboxLarge{source, dest}, stream) = ( stream "\tUnboxLarge\t"; printReg(source, stream); stream " => "; printReg(dest, stream) ) | printICode(BoxTagFloat{floatSize, source, dest, saveRegs}, stream) = ( stream "\tBoxTagFloat"; printFloatSize(floatSize, stream); stream "\t"; printReg(source, stream); stream " => "; printReg(dest, stream); stream " save="; printSaves(saveRegs, stream) ) | printICode(UnboxTagFloat{floatSize, source, dest}, stream) = ( stream "\tUnboxTagFloat"; printFloatSize(floatSize, stream); stream "\t"; printReg(source, stream); stream " => "; printReg(dest, stream) ) | printICode(LoadAcquire{ base, dest, loadType }, stream) = ( stream "\tLoadAcquire"; printLoadType(loadType, stream); stream "\t["; printReg(base, stream); stream "] => "; printReg(dest, stream) ) | printICode(StoreRelease{ base, source, loadType }, stream) = ( stream "\tStoreRelease"; printLoadType(loadType, stream); stream "\t"; printReg(source, stream); stream " => ["; printReg(base, stream); stream "]" ) | printICode(BitFieldShift{ source, dest, isSigned, length, immr, imms }, stream) = ( stream "\tBitShift"; stream(if isSigned then "Signed" else "Unsigned"); stream(arithRepr length); stream "\t"; printReg(source, stream); stream " => "; printReg(dest, stream); stream " immr="; stream(Word.fmt StringCvt.DEC immr); stream " imms="; stream(Word.fmt StringCvt.DEC imms) ) | printICode(BitFieldInsert{ source, dest, destAsSource, length, immr, imms }, stream) = ( stream "\tBitInsert"; stream(arithRepr length); stream "\t"; printReg(source, stream); stream " with "; printReg(destAsSource, stream); stream " => "; printReg(dest, stream); stream " immr="; stream(Word.fmt StringCvt.DEC immr); stream " imms="; stream(Word.fmt StringCvt.DEC imms) ) | printICode(IndexedCaseOperation{testReg}, stream) = ( stream "\tIndexedCase\t"; printReg(testReg, stream) ) | printICode(PushExceptionHandler, stream) = stream "\tPushExcHandler" | printICode(PopExceptionHandler, stream) = stream "\tPopExcHandler" | printICode(BeginHandler{packetReg}, stream) = ( stream "\tBeginHandler\t"; printReg(packetReg, stream) ) | printICode(CompareByteVectors{vec1Addr, vec2Addr, length, ccRef, ...}, stream) = ( stream "\tCompareByteVectors\t"; printReg(vec1Addr, stream); stream ","; printReg(vec2Addr, stream); stream ","; printReg(length, stream); stream " => "; printCC(ccRef, stream) ) | printICode(BlockMove{srcAddr, destAddr, 
length, isByteMove}, stream) = ( stream(if isByteMove then "\tBlockByteMove\t" else "\tBlockWordMove\t"); stream "src="; printReg(srcAddr, stream); stream ",dest="; printReg(destAddr, stream); stream ",len="; printReg(length, stream) ) | printICode(AddSubXSP{ source, dest, isAdd }, stream) = ( stream(if isAdd then "\tAdd\t" else "\tSubtract\t"); printReg(source, stream); stream " XSP => "; printOptReg(dest, stream) ) | printICode(TouchValue{ source }, stream) = ( stream "\tTouchValue\t"; printReg(source, stream) ) | printICode(LoadAcquireExclusive{ base, dest }, stream) = ( stream "\tLoadExclusive\t["; printReg(base, stream); stream "] => "; printReg(dest, stream) ) | printICode(StoreReleaseExclusive{ base, source, result }, stream) = ( stream "\tStoreExclusive\t"; printOptReg(source, stream); stream " => ["; printReg(base, stream); stream "] result => "; printReg(result, stream) ) | printICode(MemoryBarrier, stream) = stream "\tMemoryBarrier" | printICode(ConvertIntToFloat{ source, dest, srcSize, destSize}, stream) = ( stream "\tConvert"; stream(arithRepr srcSize); stream "To"; printFloatSize(destSize, stream); stream "\t"; printReg(source, stream); stream " => "; printReg(dest, stream) ) | printICode(ConvertFloatToInt{ source, dest, srcSize, destSize, rounding}, stream) = let open IEEEReal in stream "\tConvert"; printFloatSize(srcSize, stream); stream "To"; stream(arithRepr destSize); stream "\t"; printReg(source, stream); stream " => "; printReg(dest, stream); stream( case rounding of TO_NEAREST => " rounding" | TO_NEGINF => " rounding down" | TO_POSINF => " rounding up" | TO_ZERO => " truncating" ) end | printICode(UnaryFloatingPt{ source, dest, fpOp}, stream) = ( stream( case fpOp of NegFloat => "\tNegFloat\t" | NegDouble => "\tNegDouble\t" | AbsFloat => "\tAbsFloat\t" | AbsDouble => "\tAbsDouble\t" | ConvFloatToDble => "\tFloatToDble\t" | ConvDbleToFloat => "\t\t" ); printReg(source, stream); stream " => "; printReg(dest, stream) ) | printICode(BinaryFloatingPoint{ arg1, arg2, dest, fpOp, opSize}, stream) = ( stream( case fpOp of MultiplyFP => "\tMultiply" | DivideFP => "\tDivide" | AddFP => "\tAdd" | SubtractFP => "\tSubtract" ); printFloatSize(opSize, stream); stream "\t"; printReg(arg1, stream); stream ", "; printReg(arg2, stream); stream " => "; printReg(dest, stream) ) | printICode(CompareFloatingPoint{ arg1, arg2, opSize, ccRef}, stream) = ( stream "\tCompare"; printFloatSize(opSize, stream); stream "\t"; printReg(arg1, stream); stream ", "; printReg(arg2, stream); stream ", "; printCC(ccRef, stream) ) and printCondition(cond, stream) = stream(condToString cond) (* Print a basic block. *) fun printBlock stream (blockNo, BasicBlock{block, flow, ...}) = ( (* Put a label on all but the first. 
*) if blockNo <> 0 then stream("L" ^ Int.toString blockNo ^ ":") else (); List.app (fn icode => (printICode(icode, stream); stream "\n")) block; case flow of Unconditional l => stream("\tJump\tL" ^ Int.toString l ^ "\n") | Conditional {condition, trueJump, falseJump, ccRef, ...} => ( stream "\tJump"; printCondition(condition, stream); stream "\t"; printCC(ccRef, stream); stream " L"; stream (Int.toString trueJump); stream " else L"; stream (Int.toString falseJump); stream "\n" ) | ExitCode => () | IndexedBr _ => () | SetHandler{handler, continue} => stream(concat["\tSetHandler\tH", Int.toString handler, "\n", "\tJump\tL", Int.toString continue, "\n"]) | UnconditionalHandle handler => stream("\tJump\tH" ^ Int.toString handler ^ "\n") | ConditionalHandle{handler, continue} => stream(concat["\tJump\tL", Int.toString continue, " or H", Int.toString handler, "\n"]) ) in fun printICodeAbstract(blockVec, stream) = Vector.appi(printBlock stream) blockVec end (* Only certain bit patterns are allowed in a logical immediate instruction but the encoding is complex so it's easiest to inherit the test from the assembler layer. *) local fun optow OpSize32 = WordSize32 | optow OpSize64 = WordSize64 in fun isEncodableBitPattern(v, w) = Arm64Code.isEncodableBitPattern(v, optow w) end (* This generates a BitField instruction with the appropriate values for immr and imms. *) fun shiftConstant{ direction, dest, source, shift, opSize } = let val (isSigned, immr, imms) = case (direction, opSize) of (ShiftLeft, OpSize64) => (false, Word.~ shift mod 0w64, 0w64-0w1-shift) | (ShiftLeft, OpSize32) => (false, Word.~ shift mod 0w32, 0w32-0w1-shift) | (ShiftRightLogical, OpSize64) => (false, shift, 0wx3f) | (ShiftRightLogical, OpSize32) => (false, shift, 0wx1f) | (ShiftRightArithmetic, OpSize64) => (true, shift, 0wx3f) | (ShiftRightArithmetic, OpSize32) => (true, shift, 0wx1f) in BitFieldShift{ source=source, dest=dest, isSigned=isSigned, length=opSize, immr=immr, imms=imms } end structure Sharing = struct type xReg = xReg and vReg = vReg and reg = reg and condition = condition and shiftType = shiftType and arm64ICode = arm64ICode and preg = preg and pregOrZero = pregOrZero and controlFlow = controlFlow and basicBlock = basicBlock and stackLocn = stackLocn and regProperty = regProperty and ccRef = ccRef and fnarg = fnarg and closureRef = closureRef and loadType = loadType and opSize = opSize and logicalOp = logicalOp and callKind = callKind and floatSize = floatSize and shiftDirection = shiftDirection and multKind = multKind and fpUnary = fpUnary and fpBinary = fpBinary end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICodeToArm64Code.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICodeToArm64Code.ML index 21cfcbdb..87075f04 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICodeToArm64Code.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ICodeToArm64Code.ML @@ -1,1473 +1,1535 @@ (* Copyright David C. J. Matthews 2021 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64ICodeToArm64Code( structure Arm64Assembly: ARM64ASSEMBLY structure Arm64Sequences: ARM64SEQUENCES structure Debug: DEBUG structure Arm64ICode: ARM64ICODE structure Identify: ARM64IDENTIFYREFERENCES structure IntSet: INTSET structure Pretty: PRETTY structure Strongly: sig val stronglyConnectedComponents: {nodeAddress: 'a -> int, arcs: 'a -> int list } -> 'a list -> 'a list list end sharing Arm64Assembly.Sharing = Arm64Sequences.Sharing = Arm64ICode.Sharing = Identify.Sharing = IntSet ): ARM64ICODEGENERATE = struct open Identify open Arm64ICode open Arm64Assembly open Arm64Sequences open Address exception InternalError = Misc.InternalError (* These aren't currently used for anything. *) val workReg1 = X16 and workReg2 = X17 fun icodeToArm64Code {blocks, functionName, stackRequired, debugSwitches, allocatedRegisters: reg vector, resultClosure, profileObject, ...} = let val numBlocks = Vector.length blocks fun getAllocatedReg(PReg r) = Vector.sub(allocatedRegisters, r) fun getAllocatedGenReg r = case getAllocatedReg r of GenReg r => r | FPReg _ => raise InternalError "getAllocateGenReg: returned FP Reg" and getAllocatedFPReg r = case getAllocatedReg r of FPReg r => r | GenReg _ => raise InternalError "getAllocatedFPReg: returned Gen Reg" fun getAllocatedGenRegOrZero ZeroReg = XZero | getAllocatedGenRegOrZero (SomeReg reg) = getAllocatedGenReg reg (* Load from and store to stack. *) fun loadFromStack(destReg, wordOffset, code) = if wordOffset >= 4096 then loadRegIndexed{regT=destReg, regN=X_MLStackPtr, regM=destReg, option=ExtUXTX ScaleOrShift} :: List.rev(loadNonAddress(destReg, Word64.fromInt wordOffset)) @ code else loadRegScaled{regT=destReg, regN=X_MLStackPtr, unitOffset=wordOffset} :: code and storeToStack(sourceReg, wordOffset, workReg, code) = if wordOffset >= 4096 then storeRegIndexed{regT=sourceReg, regN=X_MLStackPtr, regM=workReg, option=ExtUXTX ScaleOrShift} :: List.rev(loadNonAddress(workReg, Word64.fromInt wordOffset)) @ code else storeRegScaled{regT=sourceReg, regN=X_MLStackPtr, unitOffset=wordOffset} :: code datatype srcAndDest = IsInReg of xReg | IsOnStack of int local (* The registers are numbered from 0. Choose values that don't conflict with the stack addresses. *) fun regNo(XReg r) = ~1 - Word8.toInt r | regNo _ = ~1 - 31 type node = {src: srcAndDest, dst: srcAndDest } fun nodeAddress({dst=IsInReg r, ...}: node) = regNo r | nodeAddress({dst=IsOnStack a, ...}) = a fun arcs({src=IsOnStack wordOffset, ...}: node) = [wordOffset] | arcs{src=IsInReg r, ...} = [regNo r] in val stronglyConnected = Strongly.stronglyConnectedComponents { nodeAddress=nodeAddress, arcs=arcs } end (* This is a general function for moving values into registers or to the stack where it is possible that the source values might also be in use as destinations. The stack is used for destinations only for tail recursive calls. *) fun moveMultipleValues(moves, code) = let fun moveValues ([], code) = code (* We're done. *) | moveValues (arguments, code) = let (* stronglyConnectedComponents does two things. It detects loops where it's not possible to move items without breaking the loop but more importantly it orders the dependencies so that if there are no loops we can load the source and store it in the destination knowing that we won't overwrite anything we might later need. 
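(* Example of the ordering, using X1-X3 purely for illustration: for the register moves
   {src=X1, dst=X2} and {src=X2, dst=X3} the component ordering emits X2 -> X3 before
   X1 -> X2, so X2 is read before it is overwritten.  For the two-element cycle
   {src=X1, dst=X2} and {src=X2, dst=X1} no such order exists, so one pair is exchanged
   through the work registers and the remaining moves are rewritten and reprocessed. *)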
*) val ordered = stronglyConnected arguments fun loadIntoReg(IsInReg sReg, dReg, code) = if sReg = dReg then code else moveRegToReg{sReg=sReg, dReg=dReg} :: code | loadIntoReg(IsOnStack wordOffset, dReg, code) = loadFromStack(dReg, wordOffset, code) fun moveEachValue ([], code) = code | moveEachValue ([{dst=IsInReg dReg, src}] :: rest, code) = moveEachValue(rest, loadIntoReg(src, dReg, code)) | moveEachValue ([{dst=IsOnStack wordOffset, src=IsInReg sReg}] :: rest, code) = (* Storing into the stack. *) moveEachValue(rest, storeToStack(sReg, wordOffset, workReg1, code)) | moveEachValue ([{dst=IsOnStack dstOffset, src=IsOnStack srcOffset}] :: rest, code) = (* Copy a stack location - needs a load and store unless the address is the same. *) if dstOffset = srcOffset then moveEachValue(rest, code) else moveEachValue(rest, storeToStack(workReg2, dstOffset, workReg1, loadFromStack(workReg2, srcOffset, code))) | moveEachValue((cycle as first :: _ :: _) :: rest, code) = (* We have a cycle. *) let (* We need to exchange some of the arguments. Doing an exchange here will set the destination with the correct source. However we have to process every subsequent entry with the swapped registers. That may well mean that one of those entries becomes trivial. We also need to rerun stronglyConnectedComponents on at least the rest of this cycle. It's easiest to flatten the rest and do everything. *) (* Exchange the source and destination. We don't have an exchange instruction and there's a further complication. We could be copying between stack locations and their offsets could be > 4096. Since we've only got two work registers we need to use the hardware stack as an extra location. Stack-stack exchange is very rare so the extra overhead to handle the general case is worth it. *) local fun storeToDest(sReg, IsInReg dReg, _, code) = moveRegToReg{sReg=sReg, dReg=dReg} :: code | storeToDest(sReg, IsOnStack wordOffset, work, code) = storeToStack(sReg, wordOffset, work, code) in fun exchange(IsInReg arg1Reg, arg2, code) = moveRegToReg{sReg=workReg2, dReg=arg1Reg} :: storeToDest(arg1Reg, arg2, workReg1, loadIntoReg(arg2, workReg2, code)) | exchange(arg1, IsInReg arg2Reg, code) = moveRegToReg{sReg=workReg2, dReg=arg2Reg} :: storeToDest(arg2Reg, arg1, workReg1, loadIntoReg(arg1, workReg2, code)) | exchange(arg1, arg2, code) = (* The hardware stack must be 16-byte aligned. *) storeToDest(workReg2, arg2, workReg1, loadRegPostIndex{regT=workReg2, regN=XSP, byteOffset=16} :: storeToDest(workReg2, arg1, workReg1, loadIntoReg(arg2, workReg2, storeRegPreIndex{regT=workReg2, regN=XSP, byteOffset= ~16} :: loadIntoReg(arg1, workReg2, code)))) end (* Try to find either a register-register move or a register-stack move. If not use the first. If there's a stack-register move there will also be a register-stack so we don't need to look for both. *) val {dst=selectDst, src=selectSrc} = first (* This includes this entry but after the swap we'll eliminate it. *) val flattened = List.foldl(fn (a, b) => a @ b) [] (cycle :: rest) val destAsSource = selectDst fun match(s1: srcAndDest, s2) = s1 = s2 fun swapSources{src, dst} = if match(src, selectSrc) then {src=destAsSource, dst=dst} else if match(src, destAsSource) then {src=selectSrc, dst=dst} else {src=src, dst=dst} val exchangeCode = exchange(selectDst, selectSrc, code) in moveValues(List.map swapSources flattened, exchangeCode) end | moveEachValue(([]) :: _, _) = (* This should not happen - avoid warning. 
*) raise InternalError "moveEachValue - empty set" in moveEachValue(ordered, code) end in moveValues(moves, code) end (* Where we have multiple specific registers as either source or destination there is the potential that a destination register if currently in use as a source. *) fun moveMultipleRegisters(regPairList, code) = let val regPairsAsDests = List.map(fn {src, dst} => {src=IsInReg src, dst=IsInReg dst}) regPairList in moveMultipleValues(regPairsAsDests, code) end fun moveIfNecessary({src, dst}, code) = if src = dst then code else moveRegToReg{sReg=src, dReg=dst} :: code (* Add a constant word to the source register and put the result in the destination. regW is used as a work register if necessary. This is used both for addition and subtraction. *) fun addConstantWord({regS, regD, value=0w0, ...}, code) = if regS = regD then code else moveRegToReg{sReg=regS, dReg=regD} :: code | addConstantWord({regS, regD, regW, value}, code) = let (* If we have to load the constant it's better if the top 32-bits are zero if possible. *) val (isSub, unsigned) = if value > Word64.<<(0w1, 0w63) then (true, ~ value) else (false, value) in if unsigned < Word64.<<(0w1, 0w24) then (* We can put up to 24 in a shifted and an unshifted constant. *) let val w = Word.fromLarge(Word64.toLarge unsigned) val high = Word.andb(Word.>>(w, 0w12), 0wxfff) val low = Word.andb(w, 0wxfff) val addSub = if isSub then subImmediate else addImmediate in if high <> 0w0 then ( (if low <> 0w0 then [addSub{regN=regD, regD=regD, immed=low, shifted=false}] else []) @ addSub{regN=regS, regD=regD, immed=high, shifted=true} :: code ) else addSub{regN=regS, regD=regD, immed=low, shifted=false} :: code end else let (* To minimise the constant and increase the chances that it will fit in a single word look to see if we can shift it. 
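(* Worked examples, with the values chosen for illustration: adding 0x345678, which is below
   2^24, takes the first branch and becomes two immediate additions, regD := regS + 0x345
   shifted left by 12 followed by regD := regD + 0x678.  Adding 0x3000000, which is 3 shifted
   left by 24, takes this branch: getShift strips the 24 trailing zero bits, 3 is loaded into
   the work register and a single addShiftedReg with ShiftLSL 24 adds it to regS. *)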
*) fun getShift(value, shift) = if Word64.andb(value, 0w1) = 0w0 then getShift(Word64.>>(value, 0w1), shift+0w1) else (value, shift) val (shifted, shift) = getShift(unsigned, 0w0) in (if isSub then subShiftedReg else addShiftedReg) {regM=regW, regN=regS, regD=regD, shift=ShiftLSL shift} :: List.rev (loadNonAddress(regW, shifted)) @ code end end val getSaveRegs = List.map getAllocatedGenReg fun getSaveRegsAndSeparate saveRegs = let val realSaveRegs = getSaveRegs saveRegs val saveX30 = List.exists (fn r => r = X30) realSaveRegs val preserve = List.filter (fn r => r <> X30) realSaveRegs in { saveX30=saveX30, saveRegs=preserve } end val startOfFunctionLabel = createLabel() (* Used for recursive calls/jumps *) val blockToLabelMap = Vector.tabulate(numBlocks, fn _ => createLabel()) fun getBlockLabel blockNo = Vector.sub(blockToLabelMap, blockNo) fun codeExtended _ (MoveRegister{source, dest, ...}, code) = moveIfNecessary({src=getAllocatedGenReg source, dst=getAllocatedGenReg dest}, code) | codeExtended _ (LoadNonAddressConstant{source, dest, ...}, code) = List.rev(loadNonAddress(getAllocatedGenReg dest, source)) @ code | codeExtended _ (LoadAddressConstant{source, dest, ...}, code) = loadAddressConstant(getAllocatedGenReg dest, source) :: code | codeExtended _ (LoadWithConstantOffset{dest, base, byteOffset, loadType, ...}, code) = if byteOffset < 0 then let val baseReg = getAllocatedGenReg base val loadInstr = case loadType of Load64 => loadRegUnscaled{regT=getAllocatedGenReg dest, regN=baseReg, byteOffset=byteOffset} | Load32 => loadRegUnscaled32{regT=getAllocatedGenReg dest, regN=baseReg, byteOffset=byteOffset} | Load16 => loadRegUnscaled16{regT=getAllocatedGenReg dest, regN=baseReg, byteOffset=byteOffset} | Load8 => loadRegUnscaledByte{regT=getAllocatedGenReg dest, regN=baseReg, byteOffset=byteOffset} - | LoadFloat => loadRegUnscaledFloat{regT=getAllocatedFPReg dest, regN=baseReg, byteOffset=byteOffset} - | LoadDouble => loadRegUnscaledDouble{regT=getAllocatedFPReg dest, regN=baseReg, byteOffset=byteOffset} in loadInstr :: code end else let val baseReg = getAllocatedGenReg base val loadInstr = case loadType of Load64 => loadRegScaled{regT=getAllocatedGenReg dest, regN=baseReg, unitOffset=Int.quot(byteOffset, 8)} | Load32 => loadRegScaled32{regT=getAllocatedGenReg dest, regN=baseReg, unitOffset=Int.quot(byteOffset, 4)} | Load16 => loadRegScaled16{regT=getAllocatedGenReg dest, regN=baseReg, unitOffset=Int.quot(byteOffset, 2)} | Load8 => loadRegScaledByte{regT=getAllocatedGenReg dest, regN=baseReg, unitOffset=byteOffset} - | LoadFloat => loadRegScaledFloat{regT=getAllocatedFPReg dest, regN=baseReg, unitOffset=Int.quot(byteOffset, 4)} - | LoadDouble => loadRegScaledDouble{regT=getAllocatedFPReg dest, regN=baseReg, unitOffset=Int.quot(byteOffset, 8)} + in + loadInstr :: code + end + + | codeExtended _ (LoadFPWithConstantOffset{dest, base, byteOffset, floatSize, ...}, code) = + if byteOffset < 0 + then + let + val baseReg = getAllocatedGenReg base + val loadInstr = + case floatSize of + Float32 => loadRegUnscaledFloat{regT=getAllocatedFPReg dest, regN=baseReg, byteOffset=byteOffset} + | Double64 => loadRegUnscaledDouble{regT=getAllocatedFPReg dest, regN=baseReg, byteOffset=byteOffset} + in + loadInstr :: code + end + else + let + val baseReg = getAllocatedGenReg base + val loadInstr = + case floatSize of + Float32 => loadRegScaledFloat{regT=getAllocatedFPReg dest, regN=baseReg, unitOffset=Int.quot(byteOffset, 4)} + | Double64 => loadRegScaledDouble{regT=getAllocatedFPReg dest, regN=baseReg, 
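(* Example of the offset selection here, with the offsets invented: a Load32 at byte offset
   20 is emitted as loadRegScaled32 with unitOffset 5, while a byte offset of ~8 falls into
   the negative case and uses loadRegUnscaled32 with byteOffset ~8.  The new FP cases follow
   the same pattern, e.g. a Double64 load at byte offset 16 becomes loadRegScaledDouble with
   unitOffset 2. *)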
unitOffset=Int.quot(byteOffset, 8)} in loadInstr :: code end | codeExtended _ (LoadWithIndexedOffset{dest, base, index, loadType, ...}, code) = let val baseReg = getAllocatedGenReg base and indexReg = getAllocatedGenReg index (* In 32-in-64 the index is a 32-bit value in the low order 32-bits. It may be signed if it is a C address. *) val scaleType = if is32in64 then ExtSXTW else ExtUXTX val loadInstr = case loadType of Load64 => loadRegIndexed{regT=getAllocatedGenReg dest, regN=baseReg, regM=indexReg, option=scaleType ScaleOrShift} | Load32 => loadRegIndexed32{regT=getAllocatedGenReg dest, regN=baseReg, regM=indexReg, option=scaleType ScaleOrShift} | Load16 => loadRegIndexed16{regT=getAllocatedGenReg dest, regN=baseReg, regM=indexReg, option=scaleType ScaleOrShift} | Load8 => loadRegIndexedByte{regT=getAllocatedGenReg dest, regN=baseReg, regM=indexReg, option=scaleType NoScale} - | LoadFloat => loadRegIndexedFloat{regT=getAllocatedFPReg dest, regN=baseReg, regM=indexReg, option=scaleType ScaleOrShift} - | LoadDouble => loadRegIndexedDouble{regT=getAllocatedFPReg dest, regN=baseReg, regM=indexReg, option=scaleType ScaleOrShift} in loadInstr :: code end - | codeExtended _ (LoadMemReg { wordOffset, dest}, code) = - (* Load the thread id or RTS exception. This is always a 64-bit value. *) - loadRegScaled{regT=getAllocatedGenReg dest, regN=X_MLAssemblyInt, unitOffset=wordOffset} :: code + | codeExtended _ (LoadFPWithIndexedOffset{dest, base, index, floatSize, ...}, code) = + let + val baseReg = getAllocatedGenReg base and indexReg = getAllocatedGenReg index + (* In 32-in-64 the index is a 32-bit value in the low order 32-bits. It may be + signed if it is a C address. *) + val scaleType = if is32in64 then ExtSXTW else ExtUXTX + val loadInstr = + case floatSize of + Float32 => loadRegIndexedFloat{regT=getAllocatedFPReg dest, regN=baseReg, regM=indexReg, option=scaleType ScaleOrShift} + | Double64 => loadRegIndexedDouble{regT=getAllocatedFPReg dest, regN=baseReg, regM=indexReg, option=scaleType ScaleOrShift} + in + loadInstr :: code + end + + | codeExtended _ (GetThreadId { dest}, code) = + (* Load the thread id. This is always a 64-bit value. *) + loadRegScaled{regT=getAllocatedGenReg dest, regN=X_MLAssemblyInt, unitOffset=threadIdOffset} :: code | codeExtended _ (ObjectIndexAddressToAbsolute{source, dest, ...}, code) = addShiftedReg{regM=getAllocatedGenReg source, regN=X_Base32in64, regD=getAllocatedGenReg dest, shift=ShiftLSL 0w2} :: code | codeExtended _ (AbsoluteToObjectIndex{source, dest, ...}, code) = let val destReg = getAllocatedGenReg dest in logicalShiftRight{shift=0w2, regN=destReg, regD=destReg} :: subShiftedReg{regM=X_Base32in64, regN=getAllocatedGenReg source, regD=destReg, shift=ShiftNone} :: code end | codeExtended _ (AllocateMemoryFixed{ bytesRequired, dest, saveRegs, ... }, code) = let val {saveX30, saveRegs=preserve} = getSaveRegsAndSeparate saveRegs val label = createLabel() val destReg = getAllocatedGenReg dest (* N.B. This is in reverse order so read from the bottom up. *) in moveRegToReg{sReg=destReg, dReg=X_MLHeapAllocPtr} :: setLabel label :: (if saveX30 then [loadRegPostIndex{regT=X30, regN=X_MLStackPtr, byteOffset= 8}] else []) @ registerMask preserve :: branchAndLinkReg workReg1 :: loadRegScaled{regT=workReg1, regN=X_MLAssemblyInt, unitOffset=heapOverflowCallOffset} :: (if saveX30 then [storeRegPreIndex{regT=X30, regN=X_MLStackPtr, byteOffset= ~8}] else []) @ conditionalBranch(CondCarrySet, label) :: (* Skip the trap if it's ok. *) (* Compare with heap limit. 
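Reading this reversed list in execution order: bytesRequired is subtracted from X_MLHeapAllocPtr into the result register (through workReg1 if it does not fit in a 12-bit immediate), the result is compared with X_MLHeapLimit, and the RTS heap-overflow entry is called, saving X30 and the live registers, only if the carry is clear; both paths then meet at the label and X_MLHeapAllocPtr is updated from the result register.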
*) subSShiftedReg{regM=X_MLHeapLimit, regN=destReg, regD=XZero, shift=ShiftNone} :: (* Subtract the number of bytes required from the heap pointer and put in result reg. *) (if bytesRequired >= 0w4096 then subShiftedReg{regM=workReg1, regN=X_MLHeapAllocPtr, regD=destReg, shift=ShiftNone} :: loadNonAddressConstant(workReg1, bytesRequired) :: code else subImmediate{regN=X_MLHeapAllocPtr, regD=destReg, immed=Word.fromLarge bytesRequired, shifted=false} :: code) end | codeExtended _ (AllocateMemoryVariable{ size, dest, saveRegs, ... }, code) = let val {saveX30, saveRegs=preserve} = getSaveRegsAndSeparate saveRegs val trapLabel = createLabel() and noTrapLabel = createLabel() val destReg = getAllocatedGenReg dest and sizeReg = getAllocatedGenReg size (* Subtract the size into the result register. Subtract a further word for the length word and round down in 32-in-64. *) val subtractSize = if is32in64 then bitwiseAndImmediate{bits= ~ 0w8, regN=destReg, regD=destReg} :: subImmediate{regN=destReg, regD=destReg, immed=0w4, shifted=false} :: subShiftedReg{regM=sizeReg, regN=X_MLHeapAllocPtr, regD=destReg, shift=ShiftLSL 0w2} :: code else subImmediate{regN=destReg, regD=destReg, immed=0w8, shifted=false} :: subShiftedReg{regM=sizeReg, regN=X_MLHeapAllocPtr, regD=destReg, shift=ShiftLSL 0w3} :: code (* Check against the limit. If the size is large enough it is possible that this could wrap round. To check for that we trap if either the result is less than the limit or if it is now greater than the allocation pointer. *) in (* N.B. This is in reverse order so read from the bottom up. *) moveRegToReg{sReg=destReg, dReg=X_MLHeapAllocPtr} :: setLabel noTrapLabel :: (if saveX30 then [loadRegPostIndex{regT=X30, regN=X_MLStackPtr, byteOffset= 8}] else []) @ registerMask preserve :: branchAndLinkReg workReg1 :: loadRegScaled{regT=X16, regN=X_MLAssemblyInt, unitOffset=heapOverflowCallOffset} :: (if saveX30 then [storeRegPreIndex{regT=X30, regN=X_MLStackPtr, byteOffset= ~8}] else []) @ setLabel trapLabel :: conditionalBranch(CondCarryClear, noTrapLabel) :: subSShiftedReg{regM=X_MLHeapAllocPtr, regN=destReg, regD=XZero, shift=ShiftNone} :: conditionalBranch(CondCarryClear, trapLabel) :: subSShiftedReg{regM=X_MLHeapLimit, regN=destReg, regD=XZero, shift=ShiftNone} :: subtractSize end | codeExtended _ (InitialiseMem{ size, addr, init}, code) = let val sizeReg = getAllocatedGenReg size and addrReg = getAllocatedGenReg addr and initReg = getAllocatedGenReg init val exitLabel = createLabel() and loopLabel = createLabel() (* This uses a loop to initialise. It's possible the size is zero so we have to check at the top of the loop. *) in setLabel exitLabel :: unconditionalBranch loopLabel :: (if is32in64 then storeRegPreIndex32{regT=initReg, regN=workReg1, byteOffset= ~4} else storeRegPreIndex{regT=initReg, regN=workReg1, byteOffset= ~8}) :: conditionalBranch(CondEqual, exitLabel) :: (* Are we at the start? *) subSShiftedReg{regM=workReg1, regN=addrReg, regD=XZero, shift=ShiftNone} :: setLabel loopLabel :: (* Add the length in bytes so we point at the end. *) addShiftedReg{regM=sizeReg, regN=addrReg, regD=workReg1, shift=ShiftLSL(if is32in64 then 0w2 else 0w3)} :: code end | codeExtended _ (BeginLoop, code) = code | codeExtended _ (JumpLoop{regArgs, stackArgs, checkInterrupt}, code) = let (* TODO: We could have a single list and use ArgOnStack and ArgInReg to distinguish.
*) fun convertArg(ArgOnStack{wordOffset, ...}) = IsOnStack wordOffset | convertArg(ArgInReg reg) = IsInReg(getAllocatedGenReg reg) val extStackArgs = map (fn {wordOffset, src, ...} => {src=convertArg src, dst=IsOnStack wordOffset}) stackArgs val extRegArgs = map (fn {dst, src} => {src=convertArg src, dst=convertArg(ArgInReg dst)}) regArgs val code2 = moveMultipleValues(extStackArgs @ extRegArgs, code) in case checkInterrupt of NONE => code2 | SOME saveRegs => let val {saveX30, saveRegs} = getSaveRegsAndSeparate saveRegs val skipCheck = createLabel() in (* Put in stack-check code to allow this to be interrupted. *) setLabel skipCheck :: (if saveX30 then [loadRegPostIndex{regT=X30, regN=X_MLStackPtr, byteOffset= 8}] else []) @ registerMask saveRegs :: branchAndLinkReg workReg1 :: loadRegScaled{regT=workReg1, regN=X_MLAssemblyInt, unitOffset=stackOverflowCallOffset} :: (if saveX30 then [storeRegPreIndex{regT=X30, regN=X_MLStackPtr, byteOffset= ~8}] else []) @ conditionalBranch(CondCarrySet, skipCheck) :: subSShiftedReg{regM=workReg1, regN=X_MLStackPtr, regD=XZero, shift=ShiftNone} :: loadRegScaled{regT=workReg1, regN=X_MLAssemblyInt, unitOffset=stackLimitOffset} :: code2 end end | codeExtended _ (StoreWithConstantOffset{source, base, byteOffset, loadType, ...}, code) = if byteOffset < 0 then let val baseReg = getAllocatedGenReg base val storeInstr = case loadType of Load64 => storeRegUnscaled{regT=getAllocatedGenReg source, regN=baseReg, byteOffset=byteOffset} | Load32 => storeRegUnscaled32{regT=getAllocatedGenReg source, regN=baseReg, byteOffset=byteOffset} | Load16 => storeRegUnscaled16{regT=getAllocatedGenReg source, regN=baseReg, byteOffset=byteOffset} | Load8 => storeRegUnscaledByte{regT=getAllocatedGenReg source, regN=baseReg, byteOffset=byteOffset} - | LoadFloat => storeRegUnscaledFloat{regT=getAllocatedFPReg source, regN=baseReg, byteOffset=byteOffset} - | LoadDouble => storeRegUnscaledDouble{regT=getAllocatedFPReg source, regN=baseReg, byteOffset=byteOffset} in storeInstr :: code end else let val baseReg = getAllocatedGenReg base val storeInstr = case loadType of Load64 => storeRegScaled{regT=getAllocatedGenReg source, regN=baseReg, unitOffset=Int.quot(byteOffset, 8)} | Load32 => storeRegScaled32{regT=getAllocatedGenReg source, regN=baseReg, unitOffset=Int.quot(byteOffset, 4)} | Load16 => storeRegScaled16{regT=getAllocatedGenReg source, regN=baseReg, unitOffset=Int.quot(byteOffset, 2)} | Load8 => storeRegScaledByte{regT=getAllocatedGenReg source, regN=baseReg, unitOffset=byteOffset} - | LoadFloat => storeRegScaledFloat{regT=getAllocatedFPReg source, regN=baseReg, unitOffset=Int.quot(byteOffset, 4)} - | LoadDouble => storeRegScaledDouble{regT=getAllocatedFPReg source, regN=baseReg, unitOffset=Int.quot(byteOffset, 8)} + in + storeInstr :: code + end + + | codeExtended _ (StoreFPWithConstantOffset{source, base, byteOffset, floatSize, ...}, code) = + if byteOffset < 0 + then + let + val baseReg = getAllocatedGenReg base + val storeInstr = + case floatSize of + Float32 => storeRegUnscaledFloat{regT=getAllocatedFPReg source, regN=baseReg, byteOffset=byteOffset} + | Double64 => storeRegUnscaledDouble{regT=getAllocatedFPReg source, regN=baseReg, byteOffset=byteOffset} + in + storeInstr :: code + end + else + let + val baseReg = getAllocatedGenReg base + val storeInstr = + case floatSize of + Float32 => storeRegScaledFloat{regT=getAllocatedFPReg source, regN=baseReg, unitOffset=Int.quot(byteOffset, 4)} + | Double64 => storeRegScaledDouble{regT=getAllocatedFPReg source, regN=baseReg, 
unitOffset=Int.quot(byteOffset, 8)} in storeInstr :: code end | codeExtended _ (StoreWithIndexedOffset{source, base, index, loadType, ...}, code) = let val baseReg = getAllocatedGenReg base and indexReg = getAllocatedGenReg index (* In 32-in-64 the index is a 32-bit value in the low order 32-bits. It may be signed if it is a C address. *) val scaleType = if is32in64 then ExtSXTW else ExtUXTX val storeInstr = case loadType of Load64 => storeRegIndexed{regT=getAllocatedGenReg source, regN=baseReg, regM=indexReg, option=scaleType ScaleOrShift} | Load32 => storeRegIndexed32{regT=getAllocatedGenReg source, regN=baseReg, regM=indexReg, option=scaleType ScaleOrShift} | Load16 => storeRegIndexed16{regT=getAllocatedGenReg source, regN=baseReg, regM=indexReg, option=scaleType ScaleOrShift} | Load8 => storeRegIndexedByte{regT=getAllocatedGenReg source, regN=baseReg, regM=indexReg, option=scaleType NoScale} - | LoadFloat => storeRegIndexedFloat{regT=getAllocatedFPReg source, regN=baseReg, regM=indexReg, option=scaleType ScaleOrShift} - | LoadDouble => storeRegIndexedDouble{regT=getAllocatedFPReg source, regN=baseReg, regM=indexReg, option=scaleType ScaleOrShift} + in + storeInstr :: code + end + + | codeExtended _ (StoreFPWithIndexedOffset{source, base, index, floatSize, ...}, code) = + let + val baseReg = getAllocatedGenReg base and indexReg = getAllocatedGenReg index + (* In 32-in-64 the index is a 32-bit value in the low order 32-bits. It may be + signed if it is a C address. *) + val scaleType = if is32in64 then ExtSXTW else ExtUXTX + val storeInstr = + case floatSize of + Float32 => storeRegIndexedFloat{regT=getAllocatedFPReg source, regN=baseReg, regM=indexReg, option=scaleType ScaleOrShift} + | Double64 => storeRegIndexedDouble{regT=getAllocatedFPReg source, regN=baseReg, regM=indexReg, option=scaleType ScaleOrShift} in storeInstr :: code end | codeExtended _ (AddSubImmediate{ source, dest, immed, isAdd, length, ccRef}, code) = let val instr = case (isAdd, ccRef, length) of (true, NONE, OpSize64) => addImmediate | (true, SOME _, OpSize64) => addSImmediate | (false, NONE, OpSize64) => subImmediate | (false, SOME _, OpSize64) => subSImmediate | (true, NONE, OpSize32) => addImmediate32 | (true, SOME _, OpSize32) => addSImmediate32 | (false, NONE, OpSize32) => subImmediate32 | (false, SOME _, OpSize32) => subSImmediate32 val destReg = case dest of NONE => XZero | SOME dreg => getAllocatedGenReg dreg in instr{regN=getAllocatedGenReg source, regD=destReg, immed=immed, shifted=false} :: code end | codeExtended _ (AddSubRegister{ base, shifted, dest, isAdd, length, ccRef, shift}, code) = let val instr = case (isAdd, ccRef, length) of (true, NONE, OpSize64) => addShiftedReg | (true, SOME _, OpSize64) => addSShiftedReg | (false, NONE, OpSize64) => subShiftedReg | (false, SOME _, OpSize64) => subSShiftedReg | (true, NONE, OpSize32) => addShiftedReg32 | (true, SOME _, OpSize32) => addSShiftedReg32 | (false, NONE, OpSize32) => subShiftedReg32 | (false, SOME _, OpSize32) => subSShiftedReg32 val destReg = case dest of NONE => XZero | SOME dreg => getAllocatedGenReg dreg in instr{regN=getAllocatedGenReg base, regM=getAllocatedGenReg shifted, regD=destReg, shift=shift} :: code end | codeExtended _ (LogicalImmediate{ source, dest, immed, logOp, length, ccRef}, code) = let val instr = case (logOp, ccRef, length) of (LogAnd, NONE, OpSize64) => bitwiseAndImmediate | (LogAnd, SOME _, OpSize64) => bitwiseAndSImmediate | (LogOr, NONE, OpSize64) => bitwiseOrImmediate | (LogXor, NONE, OpSize64) => bitwiseXorImmediate | 
(LogAnd, NONE, OpSize32) => bitwiseAndImmediate32 | (LogAnd, SOME _, OpSize32) => bitwiseAndSImmediate32 | (LogOr, NONE, OpSize32) => bitwiseOrImmediate32 | (LogXor, NONE, OpSize32) => bitwiseXorImmediate32 | _ => raise InternalError "ccRef not valid with OR or XOR" val destReg = case dest of NONE => XZero | SOME dreg => getAllocatedGenReg dreg in instr{regN=getAllocatedGenReg source, regD=destReg, bits=immed} :: code end | codeExtended _ (LogicalRegister{ base, shifted, dest, logOp, length, ccRef, shift}, code) = let val instr = case (logOp, ccRef, length) of (LogAnd, NONE, OpSize64) => andShiftedReg | (LogAnd, SOME _, OpSize64) => andsShiftedReg | (LogOr, NONE, OpSize64) => orrShiftedReg | (LogXor, NONE, OpSize64) => eorShiftedReg | (LogAnd, NONE, OpSize32) => andShiftedReg32 | (LogAnd, SOME _, OpSize32) => andsShiftedReg32 | (LogOr, NONE, OpSize32) => orrShiftedReg32 | (LogXor, NONE, OpSize32) => eorShiftedReg32 | _ => raise InternalError "ccRef not valid with OR or XOR" (* There are also versions of AND/OR/XOR which operate on a complement (NOT) of the shifted register. It's probably not worth looking for a use for them. *) val destReg = case dest of NONE => XZero | SOME dreg => getAllocatedGenReg dreg in instr{regN=getAllocatedGenReg base, regM=getAllocatedGenReg shifted, regD=destReg, shift=shift} :: code end | codeExtended _ (ShiftRegister{ direction, dest, source, shift, opSize }, code) = let val instr = case (direction, opSize) of (ShiftLeft, OpSize64) => logicalShiftLeftVariable | (ShiftLeft, OpSize32) => logicalShiftLeftVariable32 | (ShiftRightLogical, OpSize64) => logicalShiftRightVariable | (ShiftRightLogical, OpSize32) => logicalShiftRightVariable32 | (ShiftRightArithmetic, OpSize64) => arithmeticShiftRightVariable | (ShiftRightArithmetic, OpSize32) => arithmeticShiftRightVariable32 in instr{regN=getAllocatedGenReg source, regM=getAllocatedGenReg shift, regD=getAllocatedGenReg dest} :: code end | codeExtended _ (Multiplication{ kind, dest, sourceA, sourceM, sourceN }, code) = let val destReg = getAllocatedGenReg dest and srcAReg = case sourceA of SOME srcA => getAllocatedGenReg srcA | NONE => XZero and srcNReg = getAllocatedGenReg sourceN and srcMReg = getAllocatedGenReg sourceM val instr = case kind of MultAdd32 => multiplyAndAdd32{regM=srcMReg, regN=srcNReg, regA=srcAReg, regD=destReg} | MultSub32 => multiplyAndSub32{regM=srcMReg, regN=srcNReg, regA=srcAReg, regD=destReg} | MultAdd64 => multiplyAndAdd{regM=srcMReg, regN=srcNReg, regA=srcAReg, regD=destReg} | MultSub64 => multiplyAndSub{regM=srcMReg, regN=srcNReg, regA=srcAReg, regD=destReg} | SignedMultAddLong => signedMultiplyAndAddLong{regM=srcMReg, regN=srcNReg, regA=srcAReg, regD=destReg} | SignedMultHigh => signedMultiplyHigh{regM=srcMReg, regN=srcNReg, regD=destReg} in instr :: code end | codeExtended _ (Division{ isSigned, dest, dividend, divisor, opSize }, code) = let val instr = case (isSigned, opSize) of (true, OpSize64) => signedDivide | (true, OpSize32) => signedDivide32 | (false, OpSize64) => unsignedDivide | (false, OpSize32) => unsignedDivide32 in instr{regN=getAllocatedGenReg dividend, regM=getAllocatedGenReg divisor, regD=getAllocatedGenReg dest} :: code end | codeExtended _ (BeginFunction{regArgs, ...}, code) = let (* The real registers used for arguments. X30 is there but saved separately. *) val saveRegs = List.filter (fn r => r <> X30) (List.map #2 regArgs) val skipCheck = createLabel() val defaultWords = 10 (* This is wired into the RTS. 
*) val workRegister = workReg1 val debugTrapAlways = false (* Can be set to true for debugging *) (* Test with either the stack-pointer or a high-water value. The RTS assumes that X9 has been used as the high-water if it is called through stackOverflowXCallOffset rather than stackOverflowCallOffset *) val (testReg, entryPt, code1) = if stackRequired <= defaultWords then (X_MLStackPtr, stackOverflowCallOffset, code) else (X9, stackOverflowXCallOffset, addConstantWord({regS=X_MLStackPtr, regD=X9, regW=workRegister, value= ~ (Word64.fromLarge(Word.toLarge nativeWordSize)) * Word64.fromInt stackRequired}, code)) (* Skip the RTS call if there is enough stack. N.B. The RTS can modify the end-of-stack value to force a trap here even if there is really enough stack. *) val code2 = (if debugTrapAlways then [] else [conditionalBranch(CondCarrySet, skipCheck), subSShiftedReg{regM=workRegister, regN=testReg, regD=XZero, shift=ShiftNone}]) @ (* Load the end-of-stack value. *) loadRegScaled{regT=workRegister, regN=X_MLAssemblyInt, unitOffset=stackLimitOffset} :: code1 val code3 = (* Call the RTS but save X30 across the call *) setLabel skipCheck :: loadRegPostIndex{regT=X30, regN=X_MLStackPtr, byteOffset= 8} :: registerMask saveRegs :: branchAndLinkReg X16 :: loadRegScaled{regT=X16, regN=X_MLAssemblyInt, unitOffset=entryPt} :: storeRegPreIndex{regT=X30, regN=X_MLStackPtr, byteOffset= ~8} :: code2 val usedRegs = regArgs fun mkPair(pr, rr) = {src=rr,dst=getAllocatedGenReg pr} val regPairs = List.map mkPair usedRegs in moveMultipleRegisters(regPairs, code3) end | codeExtended _ (TailRecursiveCall{callKind, regArgs, stackArgs, stackAdjust, currStackSize}, code: instr list) = let fun convertArg(ArgOnStack{wordOffset, ...}) = IsOnStack wordOffset | convertArg(ArgInReg reg) = IsInReg(getAllocatedGenReg reg) val extStackArgs = map (fn {stack, src} => {dst=IsOnStack(stack+currStackSize), src=convertArg src}) stackArgs val extRegArgs = map (fn (a, r) => {src=convertArg a, dst=IsInReg r}) regArgs (* Tail recursive calls are complicated because we generally have to overwrite the existing stack. That means storing the arguments in the right order to avoid overwriting a value that we are using for a different argument. *) fun codeTailCall(arguments, stackAdjust, code) = if stackAdjust < 0 then let (* If the function we're calling takes more arguments on the stack than the current function we will have to extend the stack. Do that by pushing the argument whose offset is at -1. Then adjust all the offsets and repeat. *) val {src=argM1, ...} = valOf(List.find(fn {dst=IsOnStack ~1, ...} => true | _ => false) arguments) fun renumberArgs [] = [] | renumberArgs ({dst=IsOnStack ~1, ...} :: args) = renumberArgs args (* Remove the one we've done. 
*) | renumberArgs ({dst, src} :: args) = let val newDest = case dst of IsOnStack d => IsOnStack(d+1) | regDest => regDest val newSrc = case src of IsOnStack wordOffset => IsOnStack(wordOffset+1) | other => other in {dst=newDest, src=newSrc} :: renumberArgs args end val pushCode = case argM1 of IsOnStack wordOffset => storeRegPreIndex{regT=workReg2, regN=X_MLStackPtr, byteOffset= ~8} :: loadFromStack(workReg2, wordOffset, code) | IsInReg reg => storeRegPreIndex{regT=reg, regN=X_MLStackPtr, byteOffset= ~8} :: code in codeTailCall(renumberArgs arguments, stackAdjust+1, pushCode) end else let val loadArgs = moveMultipleValues(arguments, code) in if stackAdjust = 0 then loadArgs else addConstantWord({regS=X_MLStackPtr, regD=X_MLStackPtr, regW=workReg1, value=Word64.fromInt stackAdjust * Word.toLarge nativeWordSize}, loadArgs) end val setArgumentsCode = codeTailCall(extStackArgs @ extRegArgs, stackAdjust+currStackSize, code) val jumpToFunctionCode = case callKind of Recursive => [unconditionalBranch startOfFunctionLabel] | ConstantCode m => if is32in64 then raise InternalError "ConstantCode" (* Not currently *) else [branchRegister workReg1, loadAddressConstant(workReg1, m)] | FullCall => if is32in64 then [branchRegister workReg1, loadRegScaled{regT=workReg1, regN=workReg1, unitOffset=0}, addShiftedReg{regM=X8, regN=X_Base32in64, regD=workReg1, shift=ShiftLSL 0w2}] else [branchRegister workReg1,loadRegScaled{regT=workReg1, regN=X8, unitOffset=0}] in jumpToFunctionCode @ setArgumentsCode end | codeExtended _ (FunctionCall{callKind, regArgs=regArgs, stackArgs=stackArgs, dest, saveRegs, ...}, code: instr list) = let val destReg = getAllocatedGenReg dest local fun pushStackArgs ([], _, code) = code | pushStackArgs (ArgOnStack {wordOffset, ...} ::args, argNum, code) = let (* Have to adjust the offsets of stack arguments. *) val adjustedOffset = wordOffset+argNum in pushStackArgs(args, argNum+1, storeRegPreIndex{regT=workReg1, regN=X_MLStackPtr, byteOffset= ~8} :: loadFromStack(workReg1, adjustedOffset, code)) end | pushStackArgs (ArgInReg reg ::args, argNum, code) = pushStackArgs(args, argNum+1, storeRegPreIndex{regT=getAllocatedGenReg reg, regN=X_MLStackPtr, byteOffset= ~8} :: code) val pushedArgs = pushStackArgs(stackArgs, 0, code (* Initial code *)) (* We have to adjust any stack offset to account for the arguments we've pushed. *) val numStackArgs = List.length stackArgs fun convertArg(ArgOnStack{wordOffset, ...}) = IsOnStack(wordOffset+numStackArgs) | convertArg(ArgInReg reg) = IsInReg(getAllocatedGenReg reg) in val extRegArgs = map (fn (a, r) => {src=convertArg a, dst=IsInReg r}) regArgs val loadArgs = moveMultipleValues(extRegArgs, pushedArgs) end (* Push the registers before the call and pop them afterwards. 
*) fun makeSavesAndCall([], code) = let val callFunctionCode = case callKind of Recursive => [branchAndLink startOfFunctionLabel] | ConstantCode m => if is32in64 then raise InternalError "ConstantCode" (* Not currently *) else [branchAndLinkReg workReg1, loadAddressConstant(workReg1, m)] | FullCall => if is32in64 then [branchAndLinkReg workReg1, loadRegScaled{regT=workReg1, regN=workReg1, unitOffset=0}, addShiftedReg{regM=X8, regN=X_Base32in64, regD=workReg1, shift=ShiftLSL 0w2}] else [branchAndLinkReg workReg1,loadRegScaled{regT=workReg1, regN=X8, unitOffset=0}] in callFunctionCode @ code end | makeSavesAndCall(reg::regs, code) = let val areg = getAllocatedGenReg reg in loadRegPostIndex{regT=areg, regN=X_MLStackPtr, byteOffset= 8} :: makeSavesAndCall(regs, storeRegPreIndex{regT=areg, regN=X_MLStackPtr, byteOffset= ~8} :: code) end in moveIfNecessary({dst=destReg, src=X0}, makeSavesAndCall(saveRegs, loadArgs)) end | codeExtended _ (ReturnResultFromFunction { resultReg, returnReg, numStackArgs }, code) = let val resultReg = getAllocatedGenReg resultReg and returnReg = getAllocatedGenReg returnReg fun resetStack(0, code) = code | resetStack(nItems, code) = addConstantWord({regS=X_MLStackPtr, regD=X_MLStackPtr, regW=X3, value=Word64.fromLarge(Word.toLarge nativeWordSize) * Word64.fromInt nItems}, code) in returnRegister returnReg :: resetStack(numStackArgs, moveIfNecessary({src=resultReg, dst=X0}, code)) end | codeExtended _ (RaiseExceptionPacket{ packetReg }, code) = (* We need a work register here. It can be any register other than X0 since we don't preserve registers across calls. *) (* Copy the handler "register" into the stack pointer. Then jump to the address in the first word. The second word is the next handler. This is set up in the handler. We have a lot more raises than handlers since most raises are exceptional conditions such as overflow so it makes sense to minimise the code in each raise. *) branchRegister workReg1 :: loadRegScaled{regT=workReg1, regN=X_MLStackPtr, unitOffset=0} :: loadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset} :: moveIfNecessary({src=getAllocatedGenReg packetReg, dst=X0}, code) | codeExtended _ (PushToStack{ source, copies, ... }, code) = let val reg = getAllocatedGenReg source val _ = copies > 0 orelse raise InternalError "PushToStack: copies<1" fun pushn(0, c) = c | pushn(n, c) = pushn(n-1, storeRegPreIndex{regT=reg, regN=X_MLStackPtr, byteOffset= ~8} :: c) in pushn(copies, code) end | codeExtended _ (LoadStack{ dest, wordOffset, ... }, code) = loadFromStack(getAllocatedGenReg dest, wordOffset, code) | codeExtended _ (StoreToStack{ source, stackOffset, ... }, code) = (* Store into the stack to set a field of a container. Always 64-bits. *) storeToStack(getAllocatedGenReg source, stackOffset, workReg1, code) | codeExtended _ (ContainerAddress{ dest, stackOffset, ... }, code) = (* Set the register to an offset in the stack. *) let val destReg = getAllocatedGenReg dest val _ = stackOffset >= 0 orelse raise InternalError "codeGenICode: ContainerAddress - negative offset" val byteOffset = stackOffset * Word.toInt nativeWordSize in if byteOffset >= 4096 then addShiftedReg{regN=X_MLStackPtr, regM=destReg, regD=destReg, shift=ShiftNone} :: List.rev(loadNonAddress(destReg, Word64.fromInt byteOffset)) @ code else addImmediate{regN=X_MLStackPtr, regD=destReg, immed=Word.fromInt byteOffset, shifted=false} :: code end | codeExtended _ (ResetStackPtr{ numWords, ... 
}, code) = addConstantWord({regS=X_MLStackPtr, regD=X_MLStackPtr, regW=workReg1, value=Word64.fromInt numWords * Word.toLarge nativeWordSize}, code) | codeExtended _ (TagValue{ source, dest, isSigned=_, opSize }, code) = let val sourceReg = getAllocatedGenReg source and destReg = getAllocatedGenReg dest (* Shift left by one bit and add one. *) in case opSize of OpSize64 => bitwiseOrImmediate{regN=destReg, regD=destReg, bits=0w1} :: logicalShiftLeft{regN=sourceReg, regD=destReg, shift=0w1} :: code | OpSize32 => bitwiseOrImmediate32{regN=destReg, regD=destReg, bits=0w1} :: logicalShiftLeft32{regN=sourceReg, regD=destReg, shift=0w1} :: code end | codeExtended _ (UntagValue{ source, dest, isSigned, opSize }, code) = let (* Shift right by one bit. The type of shift depends on the length and whether it's signed. *) val shiftType = case (isSigned, opSize) of (false, OpSize64) => logicalShiftRight | (false, OpSize32) => logicalShiftRight32 | (true, OpSize64) => arithmeticShiftRight | (true, OpSize32) => arithmeticShiftRight32 in shiftType{regN=getAllocatedGenReg source, regD=getAllocatedGenReg dest, shift=0w1} :: code end | codeExtended _ (BoxLarge{ source, dest, saveRegs }, code) = List.rev(boxSysWord{source=getAllocatedGenReg source, destination=getAllocatedGenReg dest, workReg=workReg1, saveRegs=getSaveRegs saveRegs}) @ code | codeExtended _ (UnboxLarge{ source, dest }, code) = let (* Unbox a large word. The argument is a poly word. *) val destReg = getAllocatedGenReg dest and srcReg = getAllocatedGenReg source in if is32in64 then loadRegScaled{regT=destReg, regN=destReg, unitOffset=0} :: addShiftedReg{regM=srcReg, regN=X_Base32in64, regD=destReg, shift=ShiftLSL 0w2} :: code else loadRegScaled{regT=destReg, regN=srcReg, unitOffset=0} :: code end | codeExtended _ (BoxTagFloat{ floatSize=Double64, source, dest, saveRegs }, code) = List.rev(boxDouble{source=getAllocatedFPReg source, destination=getAllocatedGenReg dest, workReg=workReg1, saveRegs=getSaveRegs saveRegs}) @ code | codeExtended _ (BoxTagFloat{ floatSize=Float32, source, dest, saveRegs }, code) = let val floatReg = getAllocatedFPReg source and fixedReg = getAllocatedGenReg dest in if is32in64 then List.rev(boxFloat{source=floatReg, destination=fixedReg, workReg=workReg1, saveRegs=getSaveRegs saveRegs}) @ code else bitwiseOrImmediate{regN=fixedReg, regD=fixedReg, bits=0w1} :: logicalShiftLeft{shift=0w32, regN=fixedReg, regD=fixedReg} :: moveFloatToGeneral{regN=floatReg, regD=fixedReg} :: code end | codeExtended _ (UnboxTagFloat { floatSize=Double64, source, dest }, code) = let val addrReg = getAllocatedGenReg source and valReg = getAllocatedFPReg dest in if is32in64 then loadRegScaledDouble{regT=valReg, regN=workReg1, unitOffset=0} :: addShiftedReg{regM=addrReg, regN=X_Base32in64, regD=workReg1, shift=ShiftLSL 0w2} :: code else loadRegScaledDouble{regT=valReg, regN=addrReg, unitOffset=0} :: code end | codeExtended _ (UnboxTagFloat { floatSize=Float32, source, dest }, code) = let val addrReg = getAllocatedGenReg source and valReg = getAllocatedFPReg dest (* This is tagged in native 64-bits. In 32-in-64 we're loading 32-bits so we can use an indexed load directly. 
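In the native case the 32-bit pattern lives in the top half of the tagged word: BoxTagFloat above moves the float to a general register, shifts it left by 32 and sets the tag bit, so, as a purely illustrative value, 1.0 (bit pattern 0wx3F800000) is represented as 0wx3F80000000000001; unboxing therefore just shifts right by 32 and moves the result back to a floating-point register.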
*) in if is32in64 then loadRegIndexedFloat{regN=X_Base32in64, regM=addrReg, regT=valReg, option=ExtUXTX ScaleOrShift} :: code else moveGeneralToFloat{regN=workReg1, regD=valReg} :: logicalShiftRight{shift=0w32, regN=addrReg, regD=workReg1} :: code end | codeExtended _ (LoadAcquire{dest, base, loadType, ...}, code) = let val loadInstr = case loadType of Load64 => loadAcquire | Load32 => loadAcquire32 | Load8 => loadAcquireByte | _ => raise InternalError "LoadAcquire: Unsupported size" (* Not used *) in loadInstr{regT=getAllocatedGenReg dest, regN=getAllocatedGenReg base} :: code end | codeExtended _ (StoreRelease{source, base, loadType, ...}, code) = let val storeInstr = case loadType of Load64 => storeRelease | Load32 => storeRelease32 | Load8 => storeReleaseByte | _ => raise InternalError "StoreRelease: Unsupported size" (* Not used *) in storeInstr{regT=getAllocatedGenReg source, regN=getAllocatedGenReg base} :: code end | codeExtended _ (BitFieldShift{ source, dest, isSigned, length, immr, imms }, code) = let val bfInstr = case (isSigned, length) of (true, OpSize64) => signedBitfieldMove64 | (false, OpSize64) => unsignedBitfieldMove64 | (true, OpSize32) => signedBitfieldMove32 | (false, OpSize32) => unsignedBitfieldMove32 val srcReg = getAllocatedGenReg source val destReg = getAllocatedGenReg dest in bfInstr{immr=immr, imms=imms, regN=srcReg, regD=destReg} :: code end | codeExtended _ (BitFieldInsert{ source, destAsSource, dest, length, immr, imms }, code) = let (* If we're using BitFieldMove we retain some of the bits of the destination. The higher levels require us to treat that as a source. *) val sourceReg = getAllocatedGenReg source and destReg = getAllocatedGenReg dest val _ = sourceReg = destReg andalso raise InternalError "codeExtended: bitfield: dest=source" val bfInstr = case length of OpSize64 => bitfieldMove64 | OpSize32 => bitfieldMove32 in bfInstr{immr=immr, imms=imms, regN=getAllocatedGenReg source, regD=destReg} :: moveIfNecessary({src=getAllocatedGenReg destAsSource, dst=destReg}, code) end | codeExtended {flow} (IndexedCaseOperation{testReg}, code) = let (* testReg contains the original value after the lowest value has been subtracted. Since both the original value and the lowest value were tagged it contains a shifted but untagged value. *) (* This should only be within a block with an IndexedBr flow type. *) val cases = case flow of IndexedBr cases => cases | _ => raise InternalError "codeGenICode: IndexedCaseOperation" val caseLabels = map getBlockLabel cases val tableLabel = createLabel() val startOfCase = setLabel tableLabel :: branchRegister workReg1 :: (* Add the value shifted by one since it's already shifted. *) addShiftedReg{regN=workReg1, regD=workReg1, regM=getAllocatedGenReg testReg, shift=ShiftLSL 0w1} :: loadLabelAddress(workReg1, tableLabel) :: code val addCases = List.foldl (fn (label, code) => unconditionalBranch label :: code) startOfCase caseLabels in addCases end | codeExtended {flow} (PushExceptionHandler, code) = let (* This should only be within a block with a SetHandler flow type. *) val handleLabel = case flow of SetHandler{ handler, ...} => handler | _ => raise InternalError "codeGenICode: PushExceptionHandler" val labelRef = getBlockLabel handleLabel in (* Push the old handler and the handler entry point and set the "current handler" to point to the stack after we've pushed these. 
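In execution order (the list below is built reversed): the current handler is loaded from exceptionHandlerOffset into workReg1 and the handler label's address into workReg2; the old handler is pushed first and the entry point second, so the entry point ends up in the first (lowest-address) word as RaiseExceptionPacket above expects; finally the new stack pointer is stored back as the current handler.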
*) storeRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset} :: storeRegPreIndex{regT=workReg2, regN=X_MLStackPtr, byteOffset= ~8} :: storeRegPreIndex{regT=workReg1, regN=X_MLStackPtr, byteOffset= ~8} :: loadLabelAddress(workReg2, labelRef) :: loadRegScaled{regT=workReg1, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset} :: code end | codeExtended _ (PopExceptionHandler, code) = (* Remove and discard the handler we've set up. Pop the previous handler and put into "current handler". *) storeRegScaled{regT=workReg2, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset} :: loadRegPostIndex{regT=workReg2, regN=X_MLStackPtr, byteOffset=8} :: loadRegPostIndex{regT=workReg1, regN=X_MLStackPtr, byteOffset=8} :: code | codeExtended _ (BeginHandler{packetReg}, code) = let val beginHandleCode = (* Remove the handler entry for this handler. *) storeRegScaled{regT=workReg2, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset} :: loadRegPostIndex{regT=workReg2, regN=X_MLStackPtr, byteOffset=8} :: loadRegPostIndex{regT=workReg1, regN=X_MLStackPtr, byteOffset=8} :: (* The exception raise code resets the stack pointer to the value in the exception handler so this is probably redundant. Leave it for the moment, *) loadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=exceptionHandlerOffset} :: code in moveIfNecessary({src=X0, dst=getAllocatedGenReg packetReg }, beginHandleCode) end | codeExtended _ (CompareByteVectors{vec1Addr, vec2Addr, length, ...}, code) = let (* Construct a loop to compare two vectors of bytes. *) val vec1Reg = getAllocatedGenReg vec1Addr and vec2Reg = getAllocatedGenReg vec2Addr and lenReg = getAllocatedGenReg length val loopLabel = createLabel() and exitLabel = createLabel() (* N.B. the code is in reverse order - read from the bottom up. *) in setLabel exitLabel :: conditionalBranch(CondEqual, loopLabel) :: (* Loop if they're equal *) (* Compare *) subSShiftedReg32{regM=workReg2, regN=workReg1, regD=XZero, shift=ShiftNone} :: subImmediate{regN=lenReg, regD=lenReg, immed=0w1, shifted=false} :: (* Decr len *) (* Load the bytes for the comparison and increment each. *) loadRegPostIndexByte{regT=workReg2, regN=vec2Reg, byteOffset=1} :: loadRegPostIndexByte{regT=workReg1, regN=vec1Reg, byteOffset=1} :: compareBranchZero(lenReg, exitLabel) :: (* Go to the end when len = zero *) setLabel loopLabel :: (* Start of loop *) (* Set the CC to Equal before we start in case length = 0 *) subSShiftedReg{regM=lenReg, regN=lenReg, regD=XZero, shift=ShiftNone} :: code end | codeExtended _ (BlockMove{srcAddr, destAddr, length, isByteMove}, code) = let (* Construct a loop to move the data. *) val srcReg = getAllocatedGenReg srcAddr and destReg = getAllocatedGenReg destAddr and lenReg = getAllocatedGenReg length val loopLabel = createLabel() and exitLabel = createLabel() in setLabel exitLabel :: (* Exit from the loop. 
*) unconditionalBranch loopLabel :: (* Back to the start *) subImmediate{regN=lenReg, regD=lenReg, immed=0w1, shifted=false} :: (* Decr len *) ( if isByteMove then [ storeRegPostIndexByte{regT=workReg1, regN=destReg, byteOffset=1}, loadRegPostIndexByte{regT=workReg1, regN=srcReg, byteOffset=1} ] else if is32in64 then [ storeRegPostIndex32{regT=workReg1, regN=destReg, byteOffset=4}, loadRegPostIndex32{regT=workReg1, regN=srcReg, byteOffset=4} ] else [ storeRegPostIndex{regT=workReg1, regN=destReg, byteOffset=8}, loadRegPostIndex{regT=workReg1, regN=srcReg, byteOffset=8} ] ) @ compareBranchZero(lenReg, exitLabel) :: (* Exit when length = 0 *) setLabel loopLabel (* Start of loop *) :: code end | codeExtended _ (AddSubXSP{ source, dest, isAdd }, code) = let val allocFreeCode = (if isAdd then addExtendedReg else subExtendedReg) {regM=getAllocatedGenReg source, regN=XSP, regD=XSP, extend=ExtUXTX 0w0} :: code in case dest of ZeroReg => allocFreeCode | SomeReg destReg => (* We have to use add here to get the SP into the destination instead of the usual move. *) addImmediate{regN=XSP, regD=getAllocatedGenReg destReg, immed=0w0, shifted=false} :: allocFreeCode end | codeExtended _ (TouchValue _, code) = code (* Don't need to do anything now. *) | codeExtended _ (LoadAcquireExclusive{ base, dest }, code) = loadAcquireExclusiveRegister{regN=getAllocatedGenReg base, regT=getAllocatedGenReg dest} :: code | codeExtended _ (StoreReleaseExclusive{ base, source, result }, code) = storeReleaseExclusiveRegister{regS=getAllocatedGenReg result, regT=getAllocatedGenRegOrZero source, regN=getAllocatedGenReg base} :: code | codeExtended _ (MemoryBarrier, code) = code | codeExtended _ (ConvertIntToFloat{ source, dest, srcSize, destSize}, code) = let val instr = case (srcSize, destSize) of (OpSize32, Float32) => convertInt32ToFloat | (OpSize64, Float32) => convertIntToFloat | (OpSize32, Double64) => convertInt32ToDouble | (OpSize64, Double64) => convertIntToDouble in instr{regN=getAllocatedGenReg source, regD=getAllocatedFPReg dest} :: code end | codeExtended _ (ConvertFloatToInt{ source, dest, srcSize, destSize, rounding}, code) = let val instr = case (srcSize, destSize) of (Float32, OpSize32) => convertFloatToInt32 | (Float32, OpSize64) => convertFloatToInt | (Double64, OpSize32) => convertDoubleToInt32 | (Double64, OpSize64) => convertDoubleToInt in instr rounding {regN=getAllocatedFPReg source, regD=getAllocatedGenReg dest} :: code end | codeExtended _ (UnaryFloatingPt{ source, dest, fpOp}, code) = let val instr = case fpOp of NegFloat => negFloat | NegDouble => negDouble | AbsFloat => absFloat | AbsDouble => absDouble | ConvFloatToDble => convertFloatToDouble | ConvDbleToFloat => convertDoubleToFloat in instr {regN=getAllocatedFPReg source, regD=getAllocatedFPReg dest} :: code end | codeExtended _ (BinaryFloatingPoint{ arg1, arg2, dest, fpOp, opSize}, code) = let val instr = case (fpOp, opSize) of (MultiplyFP, Float32) => multiplyFloat | (DivideFP, Float32) => divideFloat | (AddFP, Float32) => addFloat | (SubtractFP, Float32) => subtractFloat | (MultiplyFP, Double64) => multiplyDouble | (DivideFP, Double64) => divideDouble | (AddFP, Double64) => addDouble | (SubtractFP, Double64) => subtractDouble in instr {regN=getAllocatedFPReg arg1, regM=getAllocatedFPReg arg2, regD=getAllocatedFPReg dest} :: code end | codeExtended _ (CompareFloatingPoint{ arg1, arg2, opSize, ...}, code) = (case opSize of Float32 => compareFloat | Double64 => compareDouble) {regN=getAllocatedFPReg arg1, regM=getAllocatedFPReg arg2} :: code 
local (* processed - set to true when a block has been processed. *) val processed = Array.array(numBlocks, false) fun haveProcessed n = Array.sub(processed, n) (* Find the blocks that reference this one. This isn't essential but allows us to try to generate blocks in the order of the control flow. This in turn may allow us to use short branches rather than long ones. *) val labelRefs = Array.array(numBlocks, []) datatype flowCode = FlowCodeSimple of int | FlowCodeCMove of {code: instr list, trueJump: int, falseJump: int} (* Process this recursively to set the references. If we have unreachable blocks, perhaps because they've been merged, we don't want to include them in the reference counting. This shouldn't happen now that IdentifyReferences removes unreferenced blocks. *) fun setReferences fromLabel toLabel = case Array.sub(labelRefs, toLabel) of [] => (* Not yet visited at all. *) let val ExtendedBasicBlock{ flow, ...} = Vector.sub(blocks, toLabel) val refs = case flow of ExitCode => [] | Unconditional lab => [lab] | Conditional{trueJump, falseJump, ... } => [trueJump, falseJump] | IndexedBr labs => labs | SetHandler { handler, continue } => [handler, continue] | UnconditionalHandle _ => [] | ConditionalHandle { continue, ...} => [continue] val () = if fromLabel >= 0 then Array.update(labelRefs, toLabel, [fromLabel]) else () in List.app (setReferences toLabel) refs end | refs => (* We've visited this at least once. Just add us to the list. *) Array.update(labelRefs, toLabel, fromLabel :: refs) val _ = setReferences 0 0 (* Process the blocks. We keep the "stack" explicit rather than using recursion because this allows us to select both arms of a conditional branch sooner. *) fun genCode(toDo, lastFlow, code) = case List.filter (not o haveProcessed) toDo of [] => let (* There's nothing left to do. We may need to add a final branch to the end. *) val finalBranch = case lastFlow of ExitCode => [] | IndexedBr _ => [] | Unconditional dest => [unconditionalBranch(getBlockLabel dest)] | Conditional { condition, trueJump, falseJump, ...} => [ unconditionalBranch(getBlockLabel falseJump), conditionalBranch(condition, getBlockLabel trueJump) ] | SetHandler { continue, ...} => [unconditionalBranch(getBlockLabel continue)] | UnconditionalHandle _ => [] | ConditionalHandle { continue, ...} => [unconditionalBranch(getBlockLabel continue)] in finalBranch @ code (* Done. *) end | stillToDo as head :: _ => let local (* Check the references. If all the sources that lead up to this have already been processed we won't have any backward jumps. *) fun available dest = List.all haveProcessed (Array.sub(labelRefs, dest)) val continuation = case lastFlow of ExitCode => NONE | IndexedBr _ => NONE (* We could put the last branch in here. *) | Unconditional dest => if not (haveProcessed dest) andalso available dest then SOME(FlowCodeSimple dest) else NONE | Conditional {trueJump, falseJump, condition, ...} => (* We can usually choose either destination and in nearly all cases it won't matter. The default branch is not to take forward jumps so if there is reason to believe that one branch is more likely we should follow that branch now and leave the other. If we have Cond(No)Overflow we assume that overflow is unusual. If one branch raises an exception we assume that that is unusual.
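For example (block numbers purely illustrative): given Conditional{condition=CondNoOverflow, trueJump=4, falseJump=7}, block 4, the no-overflow path, is preferred as the next block to generate; if it is indeed placed next the test is inverted and a single conditional branch on overflow goes to block 7's label, whereas if block 7 is placed next only a branch on no-overflow to block 4 is needed.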
*) let val (first, second) = case (condition, Vector.sub(blocks, falseJump)) of (CondNoOverflow, _) => (trueJump, falseJump) | (_, ExtendedBasicBlock{ flow=ExitCode, block, ...}) => if List.exists(fn{instr=RaiseExceptionPacket _, ...} => true | _ => false) block then (trueJump, falseJump) else (falseJump, trueJump) | _ => (falseJump, trueJump) in if not (haveProcessed first) andalso available first then SOME(FlowCodeSimple first) else if not (haveProcessed second) andalso available second then SOME(FlowCodeSimple second) else NONE end | SetHandler { continue, ... } => (* We want the continuation if possible. We'll need a branch round the handler so that won't help. *) if not (haveProcessed continue) andalso available continue then SOME(FlowCodeSimple continue) else NONE | UnconditionalHandle _ => NONE | ConditionalHandle _ => NONE in (* First choice - continue the existing block. Second choice - the first item whose sources have all been processed. Third choice - something from the list. *) val picked = case continuation of SOME c => c | NONE => case List.find available stillToDo of SOME c => FlowCodeSimple c | NONE => FlowCodeSimple head end in case picked of FlowCodeSimple picked => let val () = Array.update(processed, picked, true) (* Code to terminate the previous block. *) val startCode = case lastFlow of ExitCode => [] | IndexedBr _ => [] | UnconditionalHandle _ => [] | Unconditional dest => if dest = picked then [] else [unconditionalBranch(getBlockLabel dest)] | ConditionalHandle { continue, ...} => if continue = picked then [] else [unconditionalBranch(getBlockLabel continue)] | SetHandler { continue, ... } => if continue = picked then [] else [unconditionalBranch(getBlockLabel continue)] | Conditional { condition, trueJump, falseJump, ...} => if picked = falseJump (* Usual case. *) then [conditionalBranch(condition, getBlockLabel trueJump)] else if picked = trueJump then (* We have a jump to the true condition. Invert the jump. This is more than an optimisation. Because this immediately precedes the true block we're not going to generate a label. *) [conditionalBranch(invertTest condition, getBlockLabel falseJump)] else [ unconditionalBranch(getBlockLabel falseJump), conditionalBranch(condition, getBlockLabel trueJump) ] (* Code-generate the body with the code we've done so far at the end. Add a label at the start if necessary. *) local (* If the previous block dropped through to this and this was the only reference then we don't need a label. *) fun onlyJumpingHere (lab: int) = if lab <> picked then false else case Array.sub(labelRefs, picked) of [singleton] => singleton = lab | _ => false val noLabel = case lastFlow of ExitCode => picked = 0 (* Unless this was the first block. 
*) | Unconditional dest => onlyJumpingHere dest | Conditional { trueJump, falseJump, ...} => onlyJumpingHere trueJump orelse onlyJumpingHere falseJump | IndexedBr _ => false | SetHandler _ => false | UnconditionalHandle _ => false | ConditionalHandle { continue, ...} => onlyJumpingHere continue in val startLabel = if noLabel then [] else [setLabel(getBlockLabel picked)] end val ExtendedBasicBlock { flow, block, ...} = Vector.sub(blocks, picked) local fun genCodeBlock({instr, ...}, code) = codeExtended {flow=flow} (instr, code) in val bodyCode = List.foldl genCodeBlock (startLabel @ startCode @ code) block end val addSet = case flow of ExitCode => [] | IndexedBr cases => cases | Unconditional dest => [dest] | Conditional {trueJump, falseJump, ...} => [falseJump, trueJump] | SetHandler { handler, continue } => [handler, continue] | UnconditionalHandle _ => [] | ConditionalHandle { continue, ...} => [continue] in genCode(addSet @ stillToDo, flow, bodyCode) end | FlowCodeCMove{code, trueJump, falseJump} => let (* We've generated a conditional move and possibly a return. If the trueJump and falseJump are only ever referenced from this block they're done, otherwise we still need to do them. *) val _ = case Array.sub(labelRefs, trueJump) of [_] => Array.update(processed, trueJump, true) | _ => () val _ = case Array.sub(labelRefs, falseJump) of [_] => Array.update(processed, falseJump, true) | _ => () val ExtendedBasicBlock { flow, ...} = Vector.sub(blocks, trueJump) val addSet = case flow of ExitCode => [] | Unconditional dest => [dest] | _ => raise InternalError "FlowCodeCMove" in genCode(addSet @ stillToDo, flow, code) end end in val ops = genCode([0], ExitCode, [setLabel startOfFunctionLabel]) end in Arm64Assembly.generateCode{instrs=List.rev ops, name=functionName, resultClosure=resultClosure, parameters=debugSwitches, profileObject=profileObject} end structure Sharing = struct type intSet = intSet and extendedBasicBlock = extendedBasicBlock and regProperty = regProperty and reg = reg and closureRef = closureRef end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64IdentifyReferences.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64IdentifyReferences.ML index 561fe38c..d4c4eb93 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64IdentifyReferences.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64IdentifyReferences.ML @@ -1,869 +1,881 @@ (* Copyright (c) 2021 David C.J. Matthews This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64IdentifyReferences( structure Arm64ICode: ARM64ICODE structure Debug: DEBUG structure IntSet: INTSET ): ARM64IDENTIFYREFERENCES = struct open Arm64ICode open IntSet type regState = { active: int, refs: int, pushState: bool, prop: regProperty } (* CC states before and after. The instruction may use the CC or ignore it. The only instructions to use the CC is X87FPGetCondition. Conditional branches are handled at the block level. 
The result of executing the instruction may be to set the condition code to a defined state, an undefined state or leave it unchanged. N.B. Some "instructions" may involve a stack reset that could affect the CC. *) datatype outCCState = CCSet of ccRef | CCIndeterminate | CCUnchanged and inCCState = CCNeeded of ccRef | CCUnused datatype extendedBasicBlock = ExtendedBasicBlock of { block: {instr: arm64ICode, current: intSet, active: intSet, kill: intSet } list, flow: controlFlow, locals: intSet, (* Defined and used entirely within the block. *) imports: intSet, (* Defined outside the block, used inside it, but not needed afterwards. *) exports: intSet, (* Defined within the block, possibly used inside, but used outside. *) passThrough: intSet, (* Active throughout the block. May be referred to by it but needed afterwards. *) loopRegs: intSet, (* Destination registers for a loop. They will be updated by this block. *) initialStacks: intSet, (* Stack items required at the start i.e. imports+passThrough for stack items. *) inCCState: ccRef option, (* The state this block assumes. If SOME _ all predecessors must set it. *) outCCState: ccRef option (* The condition code set by this block. SOME _ if at least one successor needs it. *) } exception InternalError = Misc.InternalError (* Return the list of blocks that are the immediate successor of this. *) fun blockSuccessors(BasicBlock{flow, ...}) = successorBlocks flow fun getOptReg(SomeReg reg) = [reg] | getOptReg ZeroReg = [] fun getInstructionState(MoveRegister { source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadNonAddressConstant { dest, ...}) = { sources=[], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadAddressConstant { dest, ...}) = { sources=[], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadWithConstantOffset { base, dest, ...}) = { sources=[base], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } + | getInstructionState(LoadFPWithConstantOffset { base, dest, ...}) = + { sources=[base], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } + | getInstructionState(LoadWithIndexedOffset { base, dest, index, ...}) = { sources=[base, index], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } - | getInstructionState(LoadMemReg { dest, ...}) = + | getInstructionState(LoadFPWithIndexedOffset { base, dest, index, ...}) = + { sources=[base, index], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } + + | getInstructionState(GetThreadId { dest, ...}) = { sources=[], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(ObjectIndexAddressToAbsolute { source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(AbsoluteToObjectIndex { source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(AllocateMemoryFixed { dest, ...}) = { sources=[], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(AllocateMemoryVariable{size, dest, ...}) = { sources=[size], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(InitialiseMem{size, addr, init}) = { sources=[size, addr, init], dests=[], sStacks=[], 
dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BeginLoop) = (* This is just a marker. It doesn't actually generate any code. *) { sources=[], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(JumpLoop{regArgs, stackArgs, ...}) = let fun getSourceFromRegs({src=ArgInReg reg, ...}, (regArgs, stackArgs)) = (reg :: regArgs, stackArgs) | getSourceFromRegs({src=ArgOnStack { container, ...}, ...}, (regArgs, stackArgs)) = (regArgs, container :: stackArgs) and getSourceFromStack({src=ArgInReg reg, ...}, (regArgs, stackArgs)) = (reg :: regArgs, stackArgs) | getSourceFromStack({src=ArgOnStack { container, ...}, ...}, (regArgs, stackArgs)) = (regArgs, container :: stackArgs) val (regSources, stackSources) = List.foldl getSourceFromRegs (List.foldl getSourceFromStack ([], []) stackArgs) regArgs in { sources=regSources, dests=[], sStacks=stackSources, dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } end | getInstructionState(StoreWithConstantOffset { base, source, ...}) = { sources=[source, base], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } + | getInstructionState(StoreFPWithConstantOffset { base, source, ...}) = + { sources=[source, base], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } + | getInstructionState(StoreWithIndexedOffset { base, source, index, ...}) = { sources=[source, base, index], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } + | getInstructionState(StoreFPWithIndexedOffset { base, source, index, ...}) = + { sources=[source, base, index], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } + | getInstructionState(AddSubImmediate{ source, dest, ccRef, ... }) = { sources=[source], dests=case dest of NONE => [] | SOME d => [d], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=case ccRef of NONE => CCUnchanged | SOME cc => CCSet cc } | getInstructionState(AddSubRegister{ base, shifted, dest, ccRef, ... }) = { sources=[base, shifted], dests=case dest of NONE => [] | SOME d => [d], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=case ccRef of NONE => CCUnchanged | SOME cc => CCSet cc } | getInstructionState(LogicalImmediate{ source, dest, ccRef, ... }) = { sources=[source], dests=case dest of NONE => [] | SOME d => [d], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=case ccRef of NONE => CCUnchanged | SOME cc => CCSet cc } | getInstructionState(LogicalRegister{ base, shifted, dest, ccRef, ... }) = { sources=[base, shifted], dests=case dest of NONE => [] | SOME d => [d], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=case ccRef of NONE => CCUnchanged | SOME cc => CCSet cc } | getInstructionState(ShiftRegister{ source, shift, dest, ... }) = { sources=[source, shift], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(Multiplication{ dest, sourceA, sourceM, sourceN, ... }) = { sources=(case sourceA of SOME srcA => [srcA] | NONE => []) @ [sourceM, sourceN], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(Division{ dest, dividend, divisor, ... }) = { sources=[dividend, divisor], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BeginFunction {regArgs, stackArgs, ...}) = { sources=[], dests=map #1 regArgs, sStacks=[], dStacks=stackArgs, ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(FunctionCall{regArgs, stackArgs, dest, containers, ...}) = let (* Non-tail-recursive. Behaves as a normal reference to sources. 
*) fun getSourceFromRegs((ArgInReg reg, _), (regArgs, stackArgs)) = (reg :: regArgs, stackArgs) | getSourceFromRegs((ArgOnStack { container, ...}, _), (regArgs, stackArgs)) = (regArgs, container :: stackArgs) and getSourceFromStack(ArgInReg reg, (regArgs, stackArgs)) = (reg :: regArgs, stackArgs) | getSourceFromStack(ArgOnStack { container, ...}, (regArgs, stackArgs)) = (regArgs, container :: stackArgs) val (argSources, stackSources) = List.foldl getSourceFromRegs (List.foldl getSourceFromStack ([], []) stackArgs) regArgs in { sources=argSources, dests=[dest], sStacks=stackSources @ containers, dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } end | getInstructionState(TailRecursiveCall{regArgs, stackArgs, ...}) = let (* Tail recursive call. References the argument sources but exits. *) fun getSourceFromRegs((ArgInReg reg, _), (regArgs, stackArgs)) = (reg :: regArgs, stackArgs) | getSourceFromRegs((ArgOnStack { container, ...}, _), (regArgs, stackArgs)) = (regArgs, container :: stackArgs) and getSourceFromStack({src=ArgInReg reg, ...}, (regArgs, stackArgs)) = (reg :: regArgs, stackArgs) | getSourceFromStack({src=ArgOnStack { container, ...}, ...}, (regArgs, stackArgs)) = (regArgs, container :: stackArgs) val (argSources, stackSources) = List.foldl getSourceFromRegs (List.foldl getSourceFromStack ([], []) stackArgs) regArgs in { sources=argSources, dests=[], sStacks=stackSources, dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } end | getInstructionState(ReturnResultFromFunction{resultReg, returnReg, ...}) = { sources=[resultReg, returnReg], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(RaiseExceptionPacket{packetReg}) = { sources=[packetReg], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(PushToStack{ source, container, ... }) = { sources=[source], dests=[], sStacks=[], dStacks=[container], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadStack{ dest, container, ... }) = { sources=[], dests=[dest], sStacks=[container], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(StoreToStack{ source, container, ... }) = (* Although this stores into the container it must already exist. *) { sources=[source], dests=[], sStacks=[container], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(ContainerAddress{ dest, container, ... 
}) = { sources=[], dests=[dest], sStacks=[container], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(ResetStackPtr _) = { sources=[], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(TagValue{source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(UntagValue{source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BoxLarge{source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(UnboxLarge{source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BoxTagFloat{source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(UnboxTagFloat{source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadAcquire { base, dest, ...}) = { sources=[base], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(StoreRelease { base, source, ...}) = { sources=[source, base], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BitFieldShift{ source, dest, ... }) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BitFieldInsert{ source, destAsSource, dest, ... }) = { sources=[source, destAsSource], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(IndexedCaseOperation{ testReg }) = { sources=[testReg], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(PushExceptionHandler) = { sources=[], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(PopExceptionHandler) = { sources=[], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BeginHandler{packetReg}) = (* The packet register is a destination since this provides its definition. 
*) { sources=[], dests=[packetReg], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(CompareByteVectors{vec1Addr, vec2Addr, length, ccRef, ...}) = { sources=[vec1Addr, vec2Addr, length], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCSet ccRef } | getInstructionState(BlockMove{srcAddr, destAddr, length, ...}) = { sources=[srcAddr, destAddr, length], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCIndeterminate } | getInstructionState(AddSubXSP{source, dest, ...}) = { sources=[source], dests=getOptReg dest, sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(TouchValue{source}) = { sources=[source], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(LoadAcquireExclusive{base, dest}) = { sources=[base], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(StoreReleaseExclusive{base, source, result}) = { sources=[base] @ getOptReg source, dests=[result], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(MemoryBarrier) = { sources=[], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(ConvertIntToFloat{ source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(ConvertFloatToInt{ source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(UnaryFloatingPt{ source, dest, ...}) = { sources=[source], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(BinaryFloatingPoint{ arg1, arg2, dest, ...}) = { sources=[arg1, arg2], dests=[dest], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCUnchanged } | getInstructionState(CompareFloatingPoint{ arg1, arg2, ccRef, ...}) = { sources=[arg1, arg2], dests=[], sStacks=[], dStacks=[], ccIn=CCUnused, ccOut=CCSet ccRef } (* These instructions can be eliminated if their register sources are not used. There may be other cases. *) fun eliminateable(MoveRegister _) = true | eliminateable(LoadNonAddressConstant _) = true | eliminateable(LoadAddressConstant _) = true | eliminateable(LoadWithConstantOffset _) = true | eliminateable(LoadWithIndexedOffset _) = true | eliminateable(ObjectIndexAddressToAbsolute _) = true | eliminateable(TagValue _) = true | eliminateable(UntagValue _) = true | eliminateable(BoxLarge _) = true | eliminateable(UnboxLarge _) = true (* | eliminateable(CopyToCache _) = true | eliminateable(LoadMemReg _) = true *) | eliminateable _ = false fun identifyRegs(blockVector, pregProps): extendedBasicBlock vector * regState vector = let val maxPRegs = Vector.length pregProps val vectorLength = Vector.length blockVector (* Initial arrays - declarationArray is the set of registers given values by the block, importArray is the set of registers referenced by the block and not declared locally. *) val declarationArray = Array.array(vectorLength, emptySet) and importArray = Array.array(vectorLength, emptySet) val stackDecArray = Array.array(vectorLength, emptySet) and stackImportArray = Array.array(vectorLength, emptySet) and localLoopRegArray = Array.array(vectorLength, emptySet) (* References - this is used locally to see if a register is ever actually used and also included in the result which uses it as part of the choice of which register to spill. 
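Together with the per-instruction activity counts gathered in the third pass below, it is returned to the caller as the refs and active fields of the regState vector.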
*) val regRefs = Array.array(maxPRegs, 0) (* Registers that must be pushed because they are required after a function call. For cache registers this means "discard". *) and requirePushOrDiscard = Array.array(maxPRegs, false) fun incrRef r = Array.update(regRefs, r, Array.sub(regRefs, r)+1) (* Contains the, possibly filtered, code for each block. *) val resultCode = Array.array(vectorLength, NONE) val ccInStates = Array.array(vectorLength, CCUnused) and ccOutStates = Array.array(vectorLength, CCIndeterminate) (* First pass - for each block build up the sets of registers defined and used in the block. We do this depth-first so that we can use "refs" to see if a register is used. If this is an instruction that can be eliminated we don't need to generate it and can ignore any references it makes. *) local fun blockScan blockNo = if isSome(Array.sub(resultCode, blockNo)) then () else let val () = Array.update(resultCode, blockNo, SOME []) (* Prevent looping. *) val thisBlock as BasicBlock { block, flow, ...} = Vector.sub(blockVector, blockNo) val successors = blockSuccessors thisBlock (* Visit everything reachable first. *) val () = List.app blockScan successors fun scanCode(instr, original as { code, decs, refs, sDecs, sRefs, occIn, occOut, loopRegs, ... }) = let val { sources, dests, sStacks=stackSrcs, dStacks=stackDests, ccIn, ccOut, ... } = getInstructionState instr fun regNo(PReg i) = i and stackNo(StackLoc{rno, ...}) = rno val destRegNos = map regNo dests and sourceRegNos = map regNo sources val stackDestRegNos = map stackNo stackDests and stackSourceRegNos = map stackNo stackSrcs (* If this instruction requires a cc i.e. is SetToCondition or X87FPGetCondition we need to set this as a requirement earlier. If this sets the CC and it is the condition we've been expecting we've satisfied it and can set the previous condition to Unused. We could use this to decide if a comparison is no longer required. That can only happen in very specific circumstances e.g. some tests in Test176.ML so it's not worthwhile. *) val newInCC = case (ccIn, ccOut, occIn) of (cc as CCNeeded _, _, _) => cc (* This instr needs a particular cc. *) | (CCUnused, CCSet _, _) => CCUnused | (CCUnused, _, occIn) => occIn (* If this instruction modifies the CC check to see if it is setting an requirement. *) val _ = case (occIn, ccOut) of (CCNeeded ccRIn, CCSet ccRout) => if ccRIn = ccRout then () else raise InternalError "CCCheck failed" | (CCNeeded _, CCIndeterminate) => raise InternalError "CCCheck failed" | _ => () (* The output CC is the last CC set. Tail instructions that don't change the CC state are ignored until we reach an instruction that sets it. *) val newOutCC = case occOut of CCUnchanged => ccOut | _ => occOut val instrLoopRegs = case instr of JumpLoop{regArgs, ...} => listToSet (map (regNo o #dst) regArgs) | _ => emptySet in if eliminateable instr andalso List.all(fn dReg => Array.sub(regRefs, dReg) = 0) destRegNos then original (* Don't include this instruction. *) else let (* Only mark the sources as referred after we know we're going to need this. In that way we may eliminate the instruction that created this source. 
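For example, if a MoveRegister whose destination is never referenced copies a value defined only by a LoadAddressConstant, the move is dropped here without counting its source, so when this backwards scan reaches the load its destination still has a zero count and the load is dropped as well.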
*) val () = List.app incrRef sourceRegNos in { code = instr :: code, decs = union(listToSet destRegNos, decs), refs = union(listToSet sourceRegNos, refs), sDecs = union(listToSet stackDestRegNos, sDecs), sRefs = union(listToSet stackSourceRegNos, sRefs), occIn = newInCC, occOut = newOutCC, loopRegs = union(loopRegs, instrLoopRegs)} end end (* If we have a conditional branch at the end we need the condition code. It should either be set here or in a preceding block. *) val inCC = case flow of Conditional { ccRef, ...} => CCNeeded ccRef | _ => CCUnused val { code, decs, refs, sDecs, sRefs, occIn, occOut, loopRegs, ... } = List.foldr scanCode {code=[], decs=emptySet, refs=emptySet, sDecs=emptySet, sRefs=emptySet, occIn=inCC, occOut=CCUnchanged, loopRegs=emptySet} block in Array.update(declarationArray, blockNo, decs); (* refs includes local declarations. Remove before adding to the result. *) Array.update(importArray, blockNo, minus(refs, decs)); Array.update(localLoopRegArray, blockNo, loopRegs); Array.update(stackDecArray, blockNo, sDecs); Array.update(stackImportArray, blockNo, minus(sRefs, sDecs)); Array.update(resultCode, blockNo, SOME code); Array.update(ccInStates, blockNo, occIn); Array.update(ccOutStates, blockNo, occOut) end in val () = blockScan 0 (* Start with the root block. *) end (* Second phase - Propagate reference information between the blocks. We need to consider loops here. Do a depth-first scan marking each block. If we find a loop we save the import information we've used. If when we come to process that block we find the import information is different we need to reprocess. *) (* Pass through array - values used in other blocks after this that are not declared in this block. *) val passThroughArray = Array.array(vectorLength, emptySet) val stackPassThroughArray = Array.array(vectorLength, emptySet) (* Exports - those of our declarations that are used in other blocks. *) val exportArray = Array.array(vectorLength, emptySet) val stackExportArray = Array.array(vectorLength, emptySet) (* Loop registers. This contains the registers that are not exported from or passed through this block but are used subsequently as loop registers. *) val loopRegArray = Array.array(vectorLength, emptySet) val () = Array.copy{src=localLoopRegArray, dst=loopRegArray, di=0} (* If any one of the successors requires the CC then this is set. Otherwise we leave it as Unused. *) val ccRequiredOut = Array.array(vectorLength, CCUnused) local datatype loopData = Unprocessed | Processing | Processed | Looped of { regSet: intSet, loopSet: intSet, stackSet: intSet, ccState: inCCState } fun reprocessLoop () = let val reprocess = ref false val loopArray = Array.array(vectorLength, Unprocessed) fun processBlocks blockNo = case Array.sub(loopArray, blockNo) of Processed => (* Already seen this by a different route. *) { regSet = union(Array.sub(passThroughArray, blockNo), Array.sub(importArray, blockNo)), stackSet = union(Array.sub(stackPassThroughArray, blockNo), Array.sub(stackImportArray, blockNo)), ccState = Array.sub(ccInStates, blockNo), loopSet = Array.sub(loopRegArray, blockNo) } | Looped s => s (* We've already seen this in a loop. *) | Processing => (* We have a loop. *) let (* Use the existing input array. 
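This is only a provisional snapshot: it is recorded as a Looped entry, and once the block has been fully processed its real inputs are compared against the snapshot; if they differ, reprocess is set and the whole scan is repeated.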
*) val inputs = { regSet = union(Array.sub(passThroughArray, blockNo), Array.sub(importArray, blockNo)), stackSet = union(Array.sub(stackPassThroughArray, blockNo), Array.sub(stackImportArray, blockNo)), ccState = Array.sub(ccInStates, blockNo), loopSet = Array.sub(loopRegArray, blockNo) } val () = Array.update(loopArray, blockNo, Looped inputs) in inputs end | Unprocessed => (* Normal case - not visited yet. *) let val () = Array.update(loopArray, blockNo, Processing) val thisBlock = Vector.sub(blockVector, blockNo) val ourDeclarations = Array.sub(declarationArray, blockNo) and ourStackDeclarations = Array.sub(stackDecArray, blockNo) and ourLocalLoopRegs = Array.sub(localLoopRegArray, blockNo) val successors = blockSuccessors thisBlock fun addSuccessor b = let val {regSet=theirImports, stackSet=theirStackImports, ccState=theirInState, loopSet=theirLoops} = processBlocks b (* Remove loop regs from the imports if they are actually given new values by this block. We don't want to pass the old loop regs through here. *) val theirImports = minus(theirImports, ourLocalLoopRegs) (* Split the imports. If a register is a local declaration then it becomes an export. If it is not it becomes part of our passThrough. *) val (addToExp, addToImp) = IntSet.partition (fn i => member(i, ourDeclarations)) theirImports val (addToStackExp, addToStackImp) = IntSet.partition (fn i => member(i, ourStackDeclarations)) theirStackImports (* Merge the input states from each of the successors. *) val () = case (theirInState, Array.sub(ccRequiredOut, blockNo)) of (CCNeeded ts, CCNeeded req) => if ts = req then () else raise InternalError "Mismatched states" | (ts as CCNeeded _, _) => Array.update(ccRequiredOut, blockNo, ts) | _ => () (* Add loop registers to the set if they are not declared here. The only place they are declared is at the entry to the loop so that stops them being propagated further. *) val addToLoops = minus(theirLoops, ourDeclarations) in Array.update(exportArray, blockNo, union(Array.sub(exportArray, blockNo), addToExp)); Array.update(passThroughArray, blockNo, union(Array.sub(passThroughArray, blockNo), addToImp)); Array.update(stackExportArray, blockNo, union(Array.sub(stackExportArray, blockNo), addToStackExp)); Array.update(stackPassThroughArray, blockNo, union(Array.sub(stackPassThroughArray, blockNo), addToStackImp)); Array.update(loopRegArray, blockNo, union(Array.sub(loopRegArray, blockNo), addToLoops)) end val () = List.app addSuccessor successors val ourInputs = union(Array.sub(passThroughArray, blockNo), Array.sub(importArray, blockNo)) val ourStackInputs = union(Array.sub(stackPassThroughArray, blockNo), Array.sub(stackImportArray, blockNo)) in (* Check that we supply the required state. *) case (Array.sub(ccRequiredOut, blockNo), Array.sub(ccOutStates, blockNo)) of (CCNeeded ccReq, CCSet ccSet) => if ccReq = ccSet then () else raise InternalError "Mismatched cc states" | (CCNeeded _, CCIndeterminate) => raise InternalError "Mismatched cc states" | (cc as CCNeeded needOut, CCUnchanged) => ( (* We pass through the state. If we don't use the state then we need to set this as the input. If we do use the state it must be the same. *) case Array.sub(ccInStates, blockNo) of CCUnused => Array.update(ccInStates, blockNo, cc) | CCNeeded needIn => if needOut = needIn then () else raise InternalError "Mismatched cc states" ) | _ => (); (* Was this block used in a loop? If so we should not be requiring a CC. 
*) case Array.sub(loopArray, blockNo) of Looped {regSet, stackSet, ...} => ( case Array.sub(ccInStates, blockNo) of CCNeeded _ => raise InternalError "Looped state needs cc" | _ => (); if setToList regSet = setToList ourInputs andalso setToList stackSet = setToList ourStackInputs then () else reprocess := true ) | _ => (); Array.update(loopArray, blockNo, Processed); { regSet = ourInputs, stackSet = ourStackInputs, ccState = Array.sub(ccInStates, blockNo), loopSet=Array.sub(loopRegArray, blockNo)} end in reprocess := false; processBlocks 0; if !reprocess then reprocessLoop () else () end in val () = reprocessLoop () end (* Third pass - Build the result list with the active registers for each instruction. We don't include registers in the passThrough set since they are active throughout the block. *) local (* Number of instrs for which this is active. We use this to try to select a register to push to the stack if we have too many. Registers that have only a short lifetime are less likely to be pushed than those that are active longer. *) val regActive = Array.array(maxPRegs, 0) fun addActivity n r = Array.update(regActive, r, Array.sub(regActive, r)+n) fun createResultInstrs (passThrough, stackPassThrough) (instr, (tail, activeAfterThis, stackActiveAfterThis)) = let val { sources, dests, sStacks=stackSrcs, dStacks=stackDests, ... } = getInstructionState instr in (* Eliminate instructions if their results are not required. The earlier check for this will remove most cases but if we have duplicated a block we may have a register that is required elsewhere but not in this particular branch. *) if not(List.exists(fn PReg d => member(d, activeAfterThis)) dests) andalso eliminateable instr then (tail, activeAfterThis, stackActiveAfterThis) else let fun regNo(PReg i) = i fun stackNo(StackLoc{rno, ...}) = rno val destRegNos = map regNo dests and sourceRegNos = map regNo sources val destSet = listToSet destRegNos (* Remove any sources that are present in passThrough since they are going to be active throughout the block. *) and sourceSet = minus(listToSet sourceRegNos, passThrough) val stackDestRegNos = map stackNo stackDests and stackSourceRegNos = map stackNo stackSrcs val stackDestSet = listToSet stackDestRegNos and stackSourceSet = minus(listToSet stackSourceRegNos, stackPassThrough) (* To compute the active set for the PREVIOUS instruction (we're processing from the end back to the start) we remove any registers that have been given values in this instruction and add anything that we are using in this instruction since they will now need to have values. *) val afterRemoveDests = minus(activeAfterThis, destSet) val stackAfterRemoveDests = minus(stackActiveAfterThis, stackDestSet) val activeForPrevious = union(sourceSet, afterRemoveDests) val stackActiveForPrevious = union(stackSourceSet, stackAfterRemoveDests) (* The "active" set is the set of registers that need to be active DURING the instruction. It includes destinations, which will usually be in "activeAfterThis", because there may be destinations that are not actually used subsequently but still need a register. *) val activeForInstr = case instr of FunctionCall _ => sourceSet (* Is this still needed? *) | TailRecursiveCall _ => (* Set the active set to the total set of registers we require including the work register. This ensures that we will spill as many registers as we require when we look at the size of the active set. *) union(sourceSet, destSet) | BoxLarge _ => (* We can only store the value in the box after the box is allocated. 
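Because the allocation and the store overlap, the destination register is included in the active set along with the sources and everything that is live afterwards.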
*) union(activeAfterThis, union(sourceSet, destSet)) | BoxTagFloat _ => (* Since the source must be a V register and the destination an X register there isn't actually a problem here, but do this anyway. *) union(activeAfterThis, union(sourceSet, destSet)) | _ => union(activeAfterThis, destSet) val () = List.app(addActivity 1) (setToList activeForInstr) local (* If we are allocating memory we have to save the current registers if they could contain an address. We mustn't push untagged registers and we mustn't push the destination. *) fun getSaveSet dReg = let val activeAfter = union(activeAfterThis, passThrough) (* Remove any registers marked - must-not-push. These are registers holding non-address values. They will actually be saved by the RTS across any GC but not checked or modified by the GC. Exclude the result register. *) fun getSave i = if i = dReg then NONE else case Vector.sub(pregProps, i) of RegPropGeneral => SOME(PReg i) | RegPropCacheTagged => SOME(PReg i) | RegPropUntagged => NONE | RegPropStack _ => NONE | RegPropCacheUntagged => NONE | RegPropMultiple => raise InternalError "getSave: RegPropMultiple" in List.mapPartial getSave (setToList activeAfter) end in (* Sometimes we need to modify the instruction e.g. to include the set of registers to save. *) val convertedInstr = case instr of AllocateMemoryFixed{bytesRequired, dest, saveRegs=_} => AllocateMemoryFixed{bytesRequired=bytesRequired, dest=dest, saveRegs=getSaveSet(regNo dest)} | AllocateMemoryVariable{size, dest, saveRegs=_} => AllocateMemoryVariable{size=size, dest=dest, saveRegs=getSaveSet(regNo dest)} | BoxLarge{source, dest, saveRegs=_} => BoxLarge{source=source, dest=dest, saveRegs=getSaveSet(regNo dest)} | BoxTagFloat{source, dest, floatSize, saveRegs=_} => BoxTagFloat{source=source, dest=dest, floatSize=floatSize, saveRegs=getSaveSet(regNo dest)} | JumpLoop{regArgs, stackArgs, checkInterrupt = SOME _, ...} => let (* If we have to check for interrupts we must preserve registers across the RTS call. *) fun getSave i = case Vector.sub(pregProps, i) of RegPropGeneral => SOME(PReg i) | RegPropCacheTagged => SOME(PReg i) | RegPropUntagged => NONE | RegPropStack _ => NONE | RegPropCacheUntagged => NONE | RegPropMultiple => raise InternalError "getSave: RegPropMultiple" val currentRegs = union(activeAfterThis, passThrough) (* Have to include the loop registers. These were previously included automatically because they were part of the import set. *) val check = List.mapPartial getSave (map (regNo o #dst) regArgs @ setToList currentRegs) in JumpLoop{regArgs=regArgs, stackArgs=stackArgs, checkInterrupt=SOME check} end | FunctionCall{regArgs, stackArgs=[], dest, callKind as ConstantCode m, saveRegs=_, containers} => (* If this is arbitrary precision push the registers rather than marking them as "save". stringOfWord returns 'CODE "PolyAddArbitrary"' etc. *) if (String.isSubstring "Arbitrary\"" (Address.stringOfWord m)) then FunctionCall{regArgs=regArgs, stackArgs=[], callKind=callKind, dest=dest, containers=containers, saveRegs=getSaveSet(regNo dest) } else instr | _ => instr end (* FunctionCall must mark all registers as "push". 
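Any register still wanted after the call cannot be assumed to survive it, so it is flagged in requirePushOrDiscard and reported as the pushState field of the resulting regState.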
*) local fun pushRegisters () = let val activeAfter = union(activeAfterThis, passThrough) fun pushAllButDests i = if List.exists(fn j => i=j) destRegNos then () else case Vector.sub(pregProps, i) of RegPropCacheTagged => raise InternalError "pushRegisters: cache reg" | RegPropCacheUntagged => raise InternalError "pushRegisters: cache reg" | _ => Array.update(requirePushOrDiscard, i, true) in (* We need to push everything active after this except the result register. *) List.app pushAllButDests (setToList activeAfter) end in val () = case instr of FunctionCall{ stackArgs=[], callKind=ConstantCode m, ...} => if (String.isSubstring "Arbitrary\"" (Address.stringOfWord m)) then () else pushRegisters () | FunctionCall _ => pushRegisters () (* It should no longer be necessary to push across a handler but there still seem to be cases that need it. *) (*| BeginHandler _ => pushRegisters ()*) | _ => () end (* Which entries are active in this instruction but not afterwards? *) val kill = union(minus(stackSourceSet, stackActiveAfterThis), minus(sourceSet, activeAfterThis)) in ({instr=convertedInstr, active=activeForInstr, current=activeAfterThis, kill=kill} :: tail, activeForPrevious, stackActiveForPrevious) end end fun createResult blockNo = let val BasicBlock{ flow, ...} = Vector.sub(blockVector, blockNo) val declSet = Array.sub(declarationArray, blockNo) and importSet = Array.sub(importArray, blockNo) and passSet = Array.sub(passThroughArray, blockNo) and loopSet = Array.sub(loopRegArray, blockNo) and exportSet = Array.sub(exportArray, blockNo) and stackPassSet = Array.sub(stackPassThroughArray, blockNo) and stackImportSet = Array.sub(stackImportArray, blockNo) and stackExportSet = Array.sub(stackExportArray, blockNo) val filteredCode = getOpt(Array.sub(resultCode, blockNo), []) (* At the end of the block we should have the exports active. *) val (resultInstrs, _, _) = List.foldr (createResultInstrs (passSet, stackPassSet)) ([], exportSet, stackExportSet) filteredCode (* Set the active count for the pass through. *) val instrCount = List.length filteredCode val () = List.app(addActivity instrCount) (setToList passSet) val inCCState = case Array.sub(ccInStates, blockNo) of CCNeeded s => SOME s | CCUnused => NONE val outCCState = case Array.sub(ccRequiredOut, blockNo) of CCNeeded s => SOME s | CCUnused => NONE in ExtendedBasicBlock { block = resultInstrs, flow=flow, locals = minus(declSet, exportSet), imports = importSet, exports = exportSet, passThrough = passSet, loopRegs = loopSet, initialStacks = union(stackPassSet, stackImportSet), inCCState = inCCState, outCCState = outCCState } end in val resultBlocks = Vector.tabulate(vectorLength, createResult) val regActive = regActive end val registerState: regState vector = Vector.tabulate(maxPRegs, fn i => { active = Array.sub(regActive, i), refs = Array.sub(regRefs, i), pushState = Array.sub(requirePushOrDiscard, i), prop = Vector.sub(pregProps, i) } ) in (resultBlocks, registerState) end (* Exported function. First filter out unreferenced blocks then process the registers themselves. *) fun identifyRegisters(blockVector, pregProps) = let val vectorLength = Vector.length blockVector val mapArray = Array.array(vectorLength, NONE) and resArray = Array.array(vectorLength, NONE) val count = ref 0 fun setReferences label = case Array.sub(mapArray, label) of NONE => (* Not yet visited *) let val BasicBlock{flow, block} = Vector.sub(blockVector, label) (* Create a new entry for it. *) val newLabel = ! count before count := !count + 1 (* Add it to the map. 
Any other references will use this without reprocessing. *) val () = Array.update(mapArray, label, SOME newLabel) val newFlow = case flow of Unconditional l => Unconditional(setReferences l) | Conditional{trueJump, falseJump, ccRef, condition} => Conditional{trueJump=setReferences trueJump, falseJump=setReferences falseJump, ccRef=ccRef, condition=condition} | ExitCode => ExitCode | IndexedBr list => IndexedBr(map setReferences list) | SetHandler{handler, continue} => SetHandler{handler=setReferences handler, continue=setReferences continue} | UnconditionalHandle l => UnconditionalHandle(setReferences l) | ConditionalHandle{handler, continue} => ConditionalHandle{handler=setReferences handler, continue=setReferences continue} val () = Array.update(resArray, newLabel, SOME(BasicBlock{flow=newFlow, block=block})) in newLabel end | SOME lab => lab val _ = setReferences 0 val newBlockVector = Vector.tabulate(!count, fn i => valOf(Array.sub(resArray, i))) in identifyRegs(newBlockVector, pregProps) end (* Exported for use in GetConflictSets *) fun getInstructionRegisters instr = let val {sources, dests, ...} = getInstructionState instr in {sources=sources, dests=dests} end (* Exported for use in ICodeOptimise *) val getInstructionCC = #ccOut o getInstructionState structure Sharing = struct type arm64ICode = arm64ICode and preg = preg and intSet = intSet and basicBlock = basicBlock and extendedBasicBlock = extendedBasicBlock and controlFlow = controlFlow and regProperty = regProperty and ccRef = ccRef and outCCState = outCCState end end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PreAssembly.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PreAssembly.ML new file mode 100644 index 00000000..d20800ad --- /dev/null +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PreAssembly.ML @@ -0,0 +1,129 @@ +(* + Copyright (c) 2021 David C. J. Matthews + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + Licence version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public Licence for more details. + + You should have received a copy of the GNU Lesser General Public + Licence along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*) + +(* The pre-assembly layer goes below the icode and allows peep-hole optimisation. 
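+   It operates on a list of "precode" instructions that already use real
+   registers, so neighbouring instructions can be inspected and rewritten
+   before Arm64Assembly encodes them.  Purely as an illustration (nothing
+   in this file implements it yet; generateCode below is still a stub),
+   a peep-hole rule at this level might combine two adjacent stores such as
+       StoreRegScaled {regT=r1, regN=base, unitOffset=n, loadType=Load64}
+       StoreRegScaled {regT=r2, regN=base, unitOffset=n+1, loadType=Load64}
+   into a single
+       StoreRegPair {regT1=r1, regT2=r2, regN=base, unitOffset=n, unscaledType=NoUpdate}
+   or remove a MoveXRegToXReg whose sReg and dReg are the same register.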
*) + +functor Arm64PreAssembly( + + structure Arm64Assembly: ARM64ASSEMBLY + structure Debug: DEBUG + structure Pretty: PRETTY + +): ARM64PREASSEMBLY = +struct + open Arm64Assembly + + exception InternalError = Misc.InternalError + + (* Many of the datatypes are inherited from Arm64Assembly *) + + datatype loadType = Load64 | Load32 | Load16 | Load8 + and opSize = OpSize32 | OpSize64 + and logicalOp = LogAnd | LogOr | LogXor + and floatSize = Float32 | Double64 + and shiftDirection = ShiftLeft | ShiftRightLogical | ShiftRightArithmetic + and multKind = + MultAdd32 | MultSub32 | MultAdd64 | MultSub64 | + SignedMultAddLong (* 32bit*32bit + 64bit => 64Bit *) | + SignedMultHigh (* High order part of 64bit*64Bit *) + and fpUnary = NegFloat | NegDouble | AbsFloat | AbsDouble | ConvFloatToDble | ConvDbleToFloat + and fpBinary = MultiplyFP | DivideFP | AddFP | SubtractFP + and unscaledType = NoUpdate | PreIndex | PostIndex + and condSet = CondSet | CondSetIncr | CondSetInvert | CondSetNegate + and bitfieldKind = BitFieldUnsigned | BitFieldSigned | BitFieldInsert + + datatype precode = + (* Basic instructions *) + AddSubImmediate of + {regN: xReg, regD: xReg, immed: word, shifted: bool, isAdd: bool, opSize: opSize, setFlags: bool} + | AddSubShiftedReg of + {regM: xReg, regN: xReg, regD: xReg, shift: shiftType, isAdd: bool, opSize: opSize, setFlags: bool} + | AddSubExtendedReg of + {regM: xReg, regN: xReg, regD: xReg, extend: Word8.word extend, isAdd: bool, opSize: opSize, setFlags: bool} + | MultiplyAndAddSub of + {regM: xReg, regN: xReg, regA: xReg, regD: xReg, isAdd: bool, opSize: opSize, multKind: multKind} + | DivideRegs of + {regM: xReg, regN: xReg, regD: xReg, isSigned: bool, opSize: opSize} + | LogicalShiftedReg of + {regM: xReg, regN: xReg, regD: xReg, shift: shiftType, logOp: logicalOp, opSize: opSize, setFlags: bool} + | LoadRegScaled of + {regT: xReg, regN: xReg, unitOffset: int, loadType: loadType} + | LoadFPRegScaled of + {regT: vReg, regN: xReg, unitOffset: int, floatSize: floatSize} + | StoreRegScaled of + {regT: xReg, regN: xReg, unitOffset: int, loadType: loadType} + | StoreFPRegScaled of + {regT: vReg, regN: xReg, unitOffset: int, floatSize: floatSize} + | LoadRegUnscaled of + {regT: xReg, regN: xReg, byteOffset: int, loadType: loadType, unscaledType: unscaledType} + | StoreRegUnscaled of + {regT: xReg, regN: xReg, byteOffset: int, loadType: loadType, unscaledType: unscaledType} + | LoadFPRegUnscaled of + {regT: vReg, regN: xReg, byteOffset: int, floatSize: floatSize, unscaledType: unscaledType} + | StoreFPRegUnscaled of + {regT: vReg, regN: xReg, byteOffset: int, floatSize: floatSize, unscaledType: unscaledType} + | LoadAcquire of {regN: xReg, regT: xReg, loadType: loadType} + | StoreRelease of {regN: xReg, regT: xReg, loadType: loadType} + | LoadRegPair of { regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int, unscaledType: unscaledType} + | StoreRegPair of{ regT1: xReg, regT2: xReg, regN: xReg, unitOffset: int, unscaledType: unscaledType} + | LoadFPRegPair of { regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int, unscaledType: unscaledType} + | StoreFPRegPair of { regT1: vReg, regT2: vReg, regN: xReg, unitOffset: int, unscaledType: unscaledType} + | ConditionalSet of {regD: xReg, regTrue: xReg, regFalse: xReg, cond: condition, condSet: condSet} + | BitField of {immr: word, imms: word, regN: xReg, regD: xReg, opSize: opSize, bitfieldKind: bitfieldKind} + | ShiftRegisterVariable of {regM: xReg, regN: xReg, regD: xReg, opSize: opSize, shiftDirection: shiftDirection} + | 
BitwiseLogical of { bits: Word64.word, regN: xReg, regD: xReg, opSize: opSize, setFlags: bool, logOp: logicalOp} + (* Floating point *) + | MoveGeneralToFP of { regN: xReg, regD: vReg, floatSize: floatSize} + | MoveFPToGeneral of {regN: vReg, regD: xReg, floatSize: floatSize} + | ConvertIntToFP of { regN: xReg, regD: vReg, floatSize: floatSize, opSize: opSize} + | ConvertFloatToInt of { round: IEEEReal.rounding_mode, regN: vReg, regD: xReg, floatSize: floatSize, opSize: opSize} + | FPBinaryOp of { regM: vReg, regN: vReg, regD: vReg, floatSize: floatSize, fpOp: fpBinary} + | FPComparison of { regM: vReg, regN: vReg, floatSize: floatSize} + | FPUnaryOp of {regN: vReg, regD: vReg, floatSize: floatSize, fpOp: fpUnary} + (* Branches and Labels. *) + | SetLabel of labels + | ConditionalBranch of condition * labels + | UnconditionalBranch of labels + | BranchAndLink of labels + | LoadLabelAddress of xReg * labels + | TestBitBranch of { test: xReg, bit: Word8.word, label: labels, onZero: bool } + | CompareBranch of { test: xReg, label: labels, onZero: bool, opSize: opSize } + (* Composite instructions *) + | MoveXRegToXReg of {sReg: xReg, dReg: xReg} + | LoadNonAddr of xReg * Word64.word + | LoadAddr of xReg * machineWord + + fun generateCode + {instrs, name, parameters, resultClosure, profileObject} = raise Fail "TODO" + + + + structure Sharing = + struct + type closureRef = closureRef + type precode = precode + type xReg = xReg + type vReg = vReg + type labels = labels + type condition = condition + type shiftType = shiftType + type wordSize = wordSize + type 'a extend = 'a extend + type scale = scale + end + +end; diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PushRegisters.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PushRegisters.ML index 591bf446..97a29a00 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PushRegisters.ML +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64PushRegisters.ML @@ -1,1104 +1,1143 @@ (* Copyright David C. J. Matthews 2021 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64PushRegisters( structure Arm64ICode: ARM64ICODE structure IntSet: INTSET structure Identify: ARM64IDENTIFYREFERENCES sharing Arm64ICode.Sharing = Identify.Sharing = IntSet ) : ARM64PUSHREGISTERS = struct open Arm64ICode open IntSet open Identify (* Curried subscript functions *) fun asub a i = Array.sub(a, i) and vsub v i = Vector.sub(v, i) exception InternalError = Misc.InternalError (* Each preg in the input is mapped to either a new preg or the stack. *) datatype pregMapType = Unset | ToPReg of preg | ToStack of int * stackLocn (* The stack contains both entries in the input code and entries added here. It is really used to ensure that the stack at run time is the same size at the start of a block whichever block has jumped to it. *) datatype stackEntry = NewEntry of {pregNo: int} (* pregNo is the original preg that has been pushed here. 
*) | OriginalEntry of { stackLoc: stackLocn } | HandlerEntry fun addRegisterPushes{code: extendedBasicBlock vector, pushVec: bool vector, pregProps, firstPass=_} = let val maxPRegs = Vector.length pregProps val numberOfBlocks = Vector.length code (* Output registers and properties. *) val pregCounter = ref 0 val pregPropList = ref [] val pregMap = Array.array(maxPRegs, Unset) val maxStack = ref 0 (* The stack size we've assumed for the block. Also indicates if a block has already been processed. *) val inputStackSizes = Array.array(numberOfBlocks, NONE) (* The result of processing a block. *) val blockOutput = Array.array(numberOfBlocks, {code=[], stackCount=0}) (* Extra blocks to adjust the stack are added here. *) val extraBlocks: basicBlock list ref = ref [] val blockCounter = ref numberOfBlocks (* Get the blocks that are inputs for each one. *) local val blockRefs = Array.array(numberOfBlocks, []) fun setReferences fromBlock = let val ExtendedBasicBlock{ flow, ...} = vsub code fromBlock val refs = successorBlocks flow fun setRefs toBlock = let val oldRefs = asub blockRefs toBlock in Array.update(blockRefs, toBlock, fromBlock :: oldRefs); if null oldRefs then setReferences toBlock else () end in List.app setRefs refs end val () = setReferences 0 in val blockRefs = blockRefs end (* Recursive scan of the blocks. For each block we produce an input and output state. The input state is the output state of the predecessor i.e. some block that jumps to this, but with any entries removed that are not used in this block. It is then necessary to match the input state, if necessary by adding extra blocks that just do the matching. *) local val haveProcessed = isSome o asub inputStackSizes fun processBlocks toDo = case List.filter (fn (n, _) => not(haveProcessed n)) toDo of [] => () (* Nothing left to do *) | stillToDo as head :: _ => let (* Try to find a block all of whose predecessors have been processed. That increases the chances that we will have cached items. *) fun available(dest, _) = List.all haveProcessed (Array.sub(blockRefs, dest)) val (blockNo, lastOutputState) = case List.find available stillToDo of SOME c => c | NONE => head (* This is the first time we've come to this block. *) val ExtendedBasicBlock{ block, flow, imports, passThrough, loopRegs, initialStacks, ...} = vsub code blockNo (* Remove any items from the input state that are no longer needed for this block. They could be local to the previous block or needed by a different successor. Although the values in loopRegs are not required the stack space is so that they can be updated. *) fun removeItems(result as {stack=[], stackCount=0}) = result | removeItems{stack=[], ...} = raise InternalError "removeItems - stack size" | removeItems (thisStack as {stack=NewEntry{pregNo} :: rest, stackCount}) = if member(pregNo, imports) orelse member(pregNo, passThrough) orelse member(pregNo, loopRegs) then thisStack else removeItems{stack=rest, stackCount=stackCount-1} | removeItems (thisStack as {stack=OriginalEntry{stackLoc=StackLoc{rno, size}, ...} :: rest, stackCount}) = if member(rno, initialStacks) then thisStack else removeItems{stack=rest, stackCount=stackCount-size} | removeItems result = result val {stackCount=newSp, stack=newStack} = removeItems lastOutputState (* References to hold the current stack count (number of words on the stack) and the list of items on the stack. The list is not used directly to map stack addresses. Instead it is used to match the stack at the beginning and end of a block. 
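If two blocks jump here having pushed different amounts, an extra adjusting block (collected in extraBlocks) is generated on one of the edges so that the depth agrees on entry whichever route was taken.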
*) val stackCount = ref newSp val stack = ref newStack (* Items from the stack that have been marked as deleted but not yet removed. We only remove items from the top of the stack to avoid quadratic behaviour with a very deep stack. *) val deletedItems = ref [] (* Save the stack size in case we come by a different route. *) val () = Array.update(inputStackSizes, blockNo, SOME newSp) fun pushItemToStack item = let val size = case item of NewEntry _ => 1 | OriginalEntry{stackLoc=StackLoc{size, ...}, ...} => size | HandlerEntry => 2 in stackCount := ! stackCount+size; stack := item :: ! stack; maxStack := Int.max(!maxStack, !stackCount) end fun newPReg propKind = let val regNo = !pregCounter before pregCounter := !pregCounter + 1 val () = pregPropList := propKind :: !pregPropList in PReg regNo end and newStackLoc size = let val regNo = !pregCounter before pregCounter := !pregCounter + 1 val () = pregPropList := RegPropStack size :: !pregPropList in StackLoc{size=size, rno=regNo} end (* Map a source register. This always loads the argument. *) fun mapSrcReg(PReg n) = case Array.sub(pregMap, n) of Unset => raise InternalError "mapSrcReg - unset" | ToPReg preg => (preg, []) | ToStack(stackLoc, container as StackLoc{size, ...}) => let (* Make a new untagged register. That will prevent us pushing it if we have to spill registers. *) val newReg = newPReg RegPropUntagged in (newReg, [LoadStack{wordOffset= !stackCount-stackLoc-size, container=container, field=0, dest=newReg}]) end fun mapDestReg(PReg n) = let val currentLocation = Array.sub(pregMap, n) val kind = Vector.sub(pregProps, n) in if Vector.sub(pushVec, n) then let (* This should not have been seen before. *) val _ = case currentLocation of Unset => () | _ => raise InternalError "mapDestReg - already set" val newReg = newPReg kind val newContainer = newStackLoc 1 val () = Array.update(pregMap, n, ToStack (!stackCount, newContainer)) val () = pushItemToStack(NewEntry{pregNo=n}) in (newReg, [PushToStack{source= newReg, container=newContainer, copies=1}]) end else let (* See if we already have a number for it. We may encounter the same preg as a destination when returning the result from a conditional in which case we have to use the same number. We shouldn't have pushed it. *) val newReg = case (currentLocation, kind) of (Unset, _) => let val newReg = newPReg kind val () = Array.update(pregMap, n, ToPReg newReg) in newReg end | (ToPReg preg, RegPropMultiple) => preg | _ => raise InternalError "mapDestReg - multiply defined non-merge reg" in (newReg, []) end end (* Optional destination for arithmetic and logical ops. *) fun mapOptDestOld NONE = (NONE, []) | mapOptDestOld (SOME destReg) = let val (destVal, destCode) = mapDestReg destReg in (SOME destVal, destCode) end fun mapOptSrcOld NONE = (NONE, []) | mapOptSrcOld (SOME srcReg) = let val (srcVal, srcCode) = mapSrcReg srcReg in (SOME srcVal, srcCode) end fun mapOptDest ZeroReg = (ZeroReg, []) | mapOptDest (SomeReg destReg) = let val (destVal, destCode) = mapDestReg destReg in (SomeReg destVal, destCode) end fun mapOptSrc ZeroReg = (ZeroReg, []) | mapOptSrc (SomeReg srcReg) = let val (srcVal, srcCode) = mapSrcReg srcReg in (SomeReg srcVal, srcCode) end (* Adjust a stack offset from the old state to the new state. 
*) fun mapContainerAndStack(StackLoc{rno, size}, field) = let val (newStackAddr, newContainer) = case Array.sub(pregMap, rno) of Unset => raise InternalError "mapContainer - unset" | ToPReg _ => raise InternalError "mapContainer - ToPReg" | ToStack stackContainer => stackContainer val newOffset = !stackCount-(newStackAddr+size) + field in (newOffset, newContainer) end (* Add an entry for an existing stack entry. *) fun mapDestContainer(StackLoc{rno, size}, locn) = ( case Array.sub(pregMap, rno) of Unset => let val newContainer = newStackLoc size val () = Array.update(pregMap, rno, ToStack(locn, newContainer)) in newContainer end | _ => raise InternalError "mapDestContainer: already set" ) (* Map a function argument which could be a register or a stack entry. A register entry could have been pushed. *) fun mapArgument(ArgInReg (PReg r)) = ( case Array.sub(pregMap, r) of Unset => raise InternalError "mapSource - unset" | ToPReg preg => ArgInReg preg | ToStack(stackLoc, container as StackLoc{size, ...}) => ArgOnStack{wordOffset= !stackCount-stackLoc-size, container=container, field=0} ) | mapArgument(ArgOnStack{container, field, ...}) = let val (newOffset, newContainer) = mapContainerAndStack(container, field) in ArgOnStack{container=newContainer, wordOffset=newOffset, field=field} end (* Rewrite the code, replacing any registers that need to be pushed with references to the stack. The result is built up in reverse order and then reversed. *) fun pushRegisters({instr=MoveRegister{ source, dest as PReg dReg }, ...}, code) = if Vector.sub(pushVec, dReg) then (* We're going to push this. *) let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest (* TODO: Since we're pushing it we don't need to move it first. *) in destCode @ MoveRegister { source=sourceVal, dest=destVal} :: sourceCode @ code end else let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ MoveRegister { source=sourceVal, dest=destVal} :: sourceCode @ code end | pushRegisters({instr=LoadNonAddressConstant { dest, source}, ...}, code) = let val (destVal, destCode) = mapDestReg dest in destCode @ LoadNonAddressConstant { dest=destVal, source=source} :: code end | pushRegisters({instr=LoadAddressConstant { dest, source}, ...}, code) = let val (destVal, destCode) = mapDestReg dest in destCode @ LoadAddressConstant { dest=destVal, source=source} :: code end | pushRegisters({instr=LoadWithConstantOffset { base, dest, byteOffset, loadType}, ...}, code) = let val (baseVal, baseCode) = mapSrcReg base val (destVal, destCode) = mapDestReg dest in destCode @ LoadWithConstantOffset { base=baseVal, dest=destVal, byteOffset=byteOffset, loadType=loadType} :: baseCode @ code end + | pushRegisters({instr=LoadFPWithConstantOffset { base, dest, byteOffset, floatSize}, ...}, code) = + let + val (baseVal, baseCode) = mapSrcReg base + val (destVal, destCode) = mapDestReg dest + in + destCode @ + LoadFPWithConstantOffset { base=baseVal, dest=destVal, byteOffset=byteOffset, floatSize=floatSize} :: + baseCode @ code + end + | pushRegisters({instr=LoadWithIndexedOffset { base, dest, index, loadType}, ...}, code) = let val (baseVal, baseCode) = mapSrcReg base val (indexVal, indexCode) = mapSrcReg index val (destVal, destCode) = mapDestReg dest in destCode @ LoadWithIndexedOffset { base=baseVal, dest=destVal, index=indexVal, loadType=loadType} :: indexCode @ baseCode @ code end - | pushRegisters({instr=LoadMemReg { wordOffset, dest}, ...}, code) = + | 
pushRegisters({instr=LoadFPWithIndexedOffset { base, dest, index, floatSize}, ...}, code) = + let + val (baseVal, baseCode) = mapSrcReg base + val (indexVal, indexCode) = mapSrcReg index + val (destVal, destCode) = mapDestReg dest + in + destCode @ + LoadFPWithIndexedOffset { base=baseVal, dest=destVal, index=indexVal, floatSize=floatSize} :: + indexCode @ baseCode @ code + end + + | pushRegisters({instr=GetThreadId { dest}, ...}, code) = let val (destVal, destCode) = mapDestReg dest in - destCode @ LoadMemReg { wordOffset=wordOffset, dest=destVal} :: code + destCode @ GetThreadId { dest=destVal} :: code end | pushRegisters({instr=ObjectIndexAddressToAbsolute { source, dest}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ ObjectIndexAddressToAbsolute { source=sourceVal, dest=destVal} :: sourceCode @ code end | pushRegisters({instr=AbsoluteToObjectIndex { source, dest}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapDestReg dest in destCode @ AbsoluteToObjectIndex { source=sourceVal, dest=destVal} :: sourceCode @ code end | pushRegisters({instr=AllocateMemoryFixed { bytesRequired, dest, ...}, ...}, code) = let val (destVal, destCode) = mapDestReg dest in destCode @ AllocateMemoryFixed { dest=destVal, bytesRequired=bytesRequired, saveRegs=[]} :: code end | pushRegisters({instr=AllocateMemoryVariable{size, dest, ...}, ...}, code) = let val (sizeVal, sizeCode) = mapSrcReg size val (destVal, destCode) = mapDestReg dest in destCode @ AllocateMemoryVariable{size=sizeVal, dest=destVal, saveRegs=[]} :: sizeCode @ code end | pushRegisters({instr=InitialiseMem{size, addr, init}, ...}, code) = let val (sizeVal, sizeCode) = mapSrcReg size val (addrVal, addrCode) = mapSrcReg addr val (initVal, initCode) = mapSrcReg init in InitialiseMem{size=sizeVal, addr=addrVal, init=initVal} :: initCode @ addrCode @ sizeCode @ code end | pushRegisters({instr=BeginLoop, ...}, code) = BeginLoop :: code | pushRegisters({instr=JumpLoop{regArgs, stackArgs, checkInterrupt}, ...}, code) = let (* Normally JumpLoop will be the last item in a block but it is possible that we've added a reset-stack after it. *) fun getValues [] = ([], []) | getValues ({src=source, dst=PReg n} :: rest) = let val (otherRegArgs, otherStackArgs) = getValues rest in case Array.sub(pregMap, n) of ToPReg lReg => ({src=mapArgument source, dst=lReg} :: otherRegArgs, otherStackArgs) | ToStack(stackloc, stackC as StackLoc{size, ...}) => let val sourceVal = mapArgument source val stackOff = !stackCount - stackloc - size in (otherRegArgs, {src=sourceVal, wordOffset=stackOff, stackloc=stackC} :: otherStackArgs) end | Unset => (* Drop it. It's never used. This can happen if we are folding a function over a list such that it always returns the last value and then discard the result of the fold. 
*) (otherRegArgs, otherStackArgs) end val (newRegArguments, newStackArgs) = getValues regArgs fun loadStackArg({src=source, stackloc=destC, ...}, otherArgs) = let val sourceVal = mapArgument source val (newOffset, newContainer) = mapContainerAndStack(destC, 0) in {src=sourceVal, wordOffset=newOffset, stackloc=newContainer} :: otherArgs end val oldStackArgs = List.foldr loadStackArg [] stackArgs val check = case checkInterrupt of NONE => NONE | SOME _ => SOME [] in JumpLoop{ regArgs=newRegArguments, stackArgs=oldStackArgs @ newStackArgs, checkInterrupt=check} :: code end - | pushRegisters({instr=StoreWithConstantOffset { base, source, byteOffset, loadType}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (baseVal, baseCode) = mapSrcReg base in StoreWithConstantOffset{ base=baseVal, source=sourceVal, byteOffset=byteOffset, loadType=loadType} :: baseCode @ sourceCode @ code end + | pushRegisters({instr=StoreFPWithConstantOffset { base, source, byteOffset, floatSize}, ...}, code) = + let + val (sourceVal, sourceCode) = mapSrcReg source + val (baseVal, baseCode) = mapSrcReg base + in + StoreFPWithConstantOffset{ base=baseVal, source=sourceVal, byteOffset=byteOffset, floatSize=floatSize} :: + baseCode @ sourceCode @ code + end + | pushRegisters({instr=StoreWithIndexedOffset { base, source, index, loadType}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (baseVal, baseCode) = mapSrcReg base val (indexVal, indexCode) = mapSrcReg index in StoreWithIndexedOffset{ base=baseVal, source=sourceVal, index=indexVal, loadType=loadType} :: indexCode @ baseCode @ sourceCode @ code end + | pushRegisters({instr=StoreFPWithIndexedOffset { base, source, index, floatSize}, ...}, code) = + let + val (sourceVal, sourceCode) = mapSrcReg source + val (baseVal, baseCode) = mapSrcReg base + val (indexVal, indexCode) = mapSrcReg index + in + StoreFPWithIndexedOffset{ base=baseVal, source=sourceVal, index=indexVal, floatSize=floatSize} :: + indexCode @ baseCode @ sourceCode @ code + end + | pushRegisters({instr=AddSubImmediate{ source, dest, ccRef, immed, isAdd, length }, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapOptDestOld dest in destCode @ AddSubImmediate { source=sourceVal, dest=destVal, ccRef=ccRef, immed=immed, isAdd=isAdd, length=length} :: sourceCode @ code end | pushRegisters({instr=AddSubRegister{ base, shifted, dest, ccRef, isAdd, length, shift}, ...}, code) = let val (op1Val, op1Code) = mapSrcReg base val (op2Val, op2Code) = mapSrcReg shifted val (destVal, destCode) = mapOptDestOld dest in destCode @ AddSubRegister { base=op1Val, shifted=op2Val, dest=destVal, ccRef=ccRef, isAdd=isAdd, length=length, shift=shift} :: op2Code @ op1Code @ code end | pushRegisters({instr=LogicalImmediate{ source, dest, ccRef, immed, logOp, length }, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (destVal, destCode) = mapOptDestOld dest in destCode @ LogicalImmediate { source=sourceVal, dest=destVal, ccRef=ccRef, immed=immed, logOp=logOp, length=length} :: sourceCode @ code end | pushRegisters({instr=LogicalRegister{ base, shifted, dest, ccRef, logOp, length, shift}, ...}, code) = let val (op1Val, op1Code) = mapSrcReg base val (op2Val, op2Code) = mapSrcReg shifted val (destVal, destCode) = mapOptDestOld dest in destCode @ LogicalRegister { base=op1Val, shifted=op2Val, dest=destVal, ccRef=ccRef, logOp=logOp, length=length, shift=shift} :: op2Code @ op1Code @ code end | pushRegisters({instr=ShiftRegister{ direction, 
dest, source, shift, opSize}, ...}, code) = let val (srcVal, op1Code) = mapSrcReg source val (shiftVal, op2Code) = mapSrcReg shift val (destVal, destCode) = mapDestReg dest in destCode @ ShiftRegister { source=srcVal, shift=shiftVal, dest=destVal, direction=direction, opSize=opSize} :: op2Code @ op1Code @ code end | pushRegisters({instr=Multiplication{ kind, dest, sourceA, sourceM, sourceN }, ...}, code) = let val (srcAVal, srcACode) = mapOptSrcOld sourceA val (srcMVal, srcMCode) = mapSrcReg sourceM val (srcNVal, srcNCode) = mapSrcReg sourceN val (destVal, destCode) = mapDestReg dest in destCode @ Multiplication { kind=kind, sourceA=srcAVal, sourceM=srcMVal, sourceN=srcNVal, dest=destVal} :: srcNCode @ srcMCode @ srcACode @ code end | pushRegisters({instr=Division{ isSigned, dest, dividend, divisor, opSize }, ...}, code) = let val (dividendVal, dividendCode) = mapSrcReg dividend val (divisorVal, divisorCode) = mapSrcReg divisor val (destVal, destCode) = mapDestReg dest in destCode @ Division { isSigned=isSigned, dividend=dividendVal, divisor=divisorVal, dest=destVal, opSize=opSize} :: divisorCode @ dividendCode @ code end | pushRegisters({instr=BeginFunction {regArgs, stackArgs}, ...}, code) = let (* Create a new container list. The offsets begin at -numArgs. *) fun newContainers(src :: srcs, offset) = let val newContainer = mapDestContainer(src, offset) in newContainer :: newContainers(srcs, offset+1) end | newContainers _ = [] val newStackArgs = newContainers(stackArgs, ~ (List.length stackArgs)) (* Push any registers that need to be pushed. *) fun pushReg((preg, rreg), (others, code)) = let val (newReg, newCode) = mapDestReg(preg) in ((newReg, rreg) :: others, newCode @ code) end val (newRegArgs, pushCode) = List.foldl pushReg ([], []) regArgs in pushCode @ BeginFunction {regArgs=newRegArgs, stackArgs=newStackArgs} :: code end | pushRegisters({instr=FunctionCall{callKind, regArgs, stackArgs, dest, containers, ...}, ...}, code) = let (* It's possible that this could lead to having to spill registers in order to load others. Leave that problem for the moment. *) fun loadStackArg (arg, otherArgs) = let val argVal = mapArgument arg in argVal :: otherArgs end val newStackArgs = List.foldr loadStackArg [] stackArgs fun loadRegArg ((arg, reg), otherArgs) = let val argVal = mapArgument arg in (argVal, reg) :: otherArgs end val newRegArgs = List.foldr loadRegArg [] regArgs val (destVal, destCode) = mapDestReg dest val newContainers = List.map(fn c => #2(mapContainerAndStack(c, 0))) containers in destCode @ FunctionCall{ callKind=callKind, regArgs=newRegArgs, stackArgs=newStackArgs, dest=destVal, saveRegs=[], containers=newContainers} :: code end | pushRegisters({instr=TailRecursiveCall{callKind, regArgs, stackArgs, stackAdjust, ...}, ...}, code) = let val newStackOffset = !stackCount fun loadStackArg ({src, stack}, (otherLoads, otherArgs)) = let val (argVal, loadCode) = case mapArgument src of (source as ArgOnStack{wordOffset, container, field}) => (* If we're leaving it in its old location or we're pushing it above the current top we're ok. We're also ok if we're moving it from a somewhere above the last argument. Otherwise we have to load it. It goes into a normal tagged register which may mean that it could be pushed onto the stack in a subsequent pass. 
*) if wordOffset = stack+newStackOffset orelse stack+newStackOffset < 0 orelse newStackOffset-wordOffset > ~ stackAdjust then (source, []) else let val preg = newPReg RegPropGeneral in (ArgInReg preg, [LoadStack{wordOffset=wordOffset, container=container, field=field, dest=preg}]) end | argCode => (argCode, []) in (loadCode @ otherLoads, {src=argVal, stack=stack} :: otherArgs) end val (stackArgLoads, newStackArgs) = List.foldr loadStackArg ([], []) stackArgs fun loadRegArg ((arg, reg), otherArgs) = let val argVal = mapArgument arg in (argVal, reg) :: otherArgs end val newRegArgs = List.foldr loadRegArg [] regArgs in TailRecursiveCall{ callKind=callKind, regArgs=newRegArgs, stackArgs=newStackArgs, stackAdjust=stackAdjust, currStackSize=newStackOffset} :: stackArgLoads @ code end | pushRegisters({instr=ReturnResultFromFunction{resultReg, returnReg, numStackArgs}, ...}, code) = let val (resultValue, loadResult) = mapSrcReg resultReg val (returnValue, loadReturn) = mapSrcReg returnReg val resetCode = if !stackCount = 0 then [] else [ResetStackPtr{numWords= !stackCount}] in ReturnResultFromFunction{resultReg=resultValue, returnReg=returnValue, numStackArgs=numStackArgs} :: resetCode @ loadReturn @ loadResult @ code end | pushRegisters({instr=RaiseExceptionPacket{packetReg}, ...}, code) = let val (packetVal, packetCode) = mapSrcReg packetReg in RaiseExceptionPacket{packetReg=packetVal} :: packetCode @ code end | pushRegisters({instr=PushToStack{ source, container, copies }, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source (* This was a push from a previous pass. Treat as a container of size "copies". *) val newContainer = mapDestContainer(container, !stackCount) val () = pushItemToStack(OriginalEntry{stackLoc=container}) in PushToStack{source=sourceVal, container=newContainer, copies=copies} :: sourceCode @ code end | pushRegisters({instr=LoadStack{ dest, container, field, ... }, ...}, code) = let val (newOffset, newContainer) = mapContainerAndStack(container, field) val (destVal, destCode) = mapDestReg dest in destCode @ LoadStack{ dest=destVal, container=newContainer, field=field, wordOffset=newOffset } :: code end | pushRegisters({instr=StoreToStack{source, container, field, ...}, ...}, code) = let val (sourceVal, sourceCode) = mapSrcReg source val (newOffset, newContainer) = mapContainerAndStack(container, field) in StoreToStack{source=sourceVal, container=newContainer, field=field, stackOffset=newOffset} :: sourceCode @ code end | pushRegisters({instr=ContainerAddress{ dest, container, ... }, ...}, code) = let val (newOffset, newContainer) = mapContainerAndStack(container, 0) val (destVal, destCode) = mapDestReg dest in destCode @ ContainerAddress{ dest=destVal, container=newContainer, stackOffset=newOffset } :: code end | pushRegisters({instr=ResetStackPtr _, ...}, code) = code (* Added in a previous pass - discard it. 
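Whatever resets the new layout actually needs are generated afresh in this pass, for example before ReturnResultFromFunction or when an exception handler is popped.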
            *)

        |   pushRegisters({instr=TagValue{source, dest, isSigned, opSize}, ...}, code) =
            let
                val (sourceVal, sourceCode) = mapSrcReg source
                val (destVal, destCode) = mapDestReg dest
            in
                destCode @ TagValue{source=sourceVal, dest=destVal, isSigned=isSigned, opSize=opSize} ::
                    sourceCode @ code
            end

        |   pushRegisters({instr=UntagValue{source, dest, isSigned, opSize, ...}, ...}, code) =
            let
                val (loadedSource, sourceCode) = mapSrcReg source
                val (destVal, destCode) = mapDestReg dest
            in
                destCode @ UntagValue{source=loadedSource, dest=destVal, isSigned=isSigned, opSize=opSize} ::
                    sourceCode @ code
            end

        |   pushRegisters({instr=BoxLarge{source, dest, ...}, ...}, code) =
            let
                val (sourceVal, sourceCode) = mapSrcReg source
                val (destVal, destCode) = mapDestReg dest
            in
                destCode @ BoxLarge{source=sourceVal, dest=destVal, saveRegs=[]} :: sourceCode @ code
            end

        |   pushRegisters({instr=UnboxLarge{source, dest, ...}, ...}, code) =
            let
                val (sourceVal, sourceCode) = mapSrcReg source
                val (destVal, destCode) = mapDestReg dest
            in
                destCode @ UnboxLarge{source=sourceVal, dest=destVal} :: sourceCode @ code
            end

        |   pushRegisters({instr=BoxTagFloat{floatSize, source, dest, ...}, ...}, code) =
            let
                val (sourceVal, sourceCode) = mapSrcReg source
                val (destVal, destCode) = mapDestReg dest
            in
                destCode @ BoxTagFloat{floatSize=floatSize, source=sourceVal, dest=destVal, saveRegs=[]} ::
                    sourceCode @ code
            end

        |   pushRegisters({instr=UnboxTagFloat{floatSize, source, dest, ...}, ...}, code) =
            let
                val (sourceVal, sourceCode) = mapSrcReg source
                val (destVal, destCode) = mapDestReg dest
            in
                destCode @ UnboxTagFloat{floatSize=floatSize, source=sourceVal, dest=destVal} ::
                    sourceCode @ code
            end

        |   pushRegisters({instr=LoadAcquire { base, dest, loadType}, ...}, code) =
            let
                val (baseVal, baseCode) = mapSrcReg base
                val (destVal, destCode) = mapDestReg dest
            in
                destCode @ LoadAcquire { base=baseVal, dest=destVal, loadType=loadType} :: baseCode @ code
            end

        |   pushRegisters({instr=StoreRelease { base, source, loadType}, ...}, code) =
            let
                val (sourceVal, sourceCode) = mapSrcReg source
                val (baseVal, baseCode) = mapSrcReg base
            in
                StoreRelease{ base=baseVal, source=sourceVal, loadType=loadType} :: baseCode @ sourceCode @ code
            end

        |   pushRegisters({instr=BitFieldShift{source, dest, isSigned, length, immr, imms}, ...}, code) =
            let
                val (sourceVal, sourceCode) = mapSrcReg source
                val (destVal, destCode) = mapDestReg dest
            in
                destCode @
                    BitFieldShift { source=sourceVal, dest=destVal, isSigned=isSigned, immr=immr, imms=imms, length=length} ::
                    sourceCode @ code
            end

        |   pushRegisters({instr=BitFieldInsert{source, destAsSource, dest, length, immr, imms}, ...}, code) =
            let
                val (sourceVal, sourceCode) = mapSrcReg source
                val (destSrcVal, destSrcCode) = mapSrcReg destAsSource
                val (destVal, destCode) = mapDestReg dest
            in
                destCode @
                    BitFieldInsert { source=sourceVal, destAsSource=destSrcVal, dest=destVal, immr=immr, imms=imms, length=length} ::
                    destSrcCode @ sourceCode @ code
            end

        |   pushRegisters({instr=IndexedCaseOperation{testReg}, ...}, code) =
            let
                val (testVal, testCode) = mapSrcReg testReg
            in
                IndexedCaseOperation{testReg=testVal} :: testCode @ code
            end

        |   pushRegisters({instr=PushExceptionHandler, ...}, code) =
            let
                (* Add a handler entry to the stack. *)
                val () = pushItemToStack HandlerEntry
            in
                PushExceptionHandler :: code
            end

        |   pushRegisters({instr=PopExceptionHandler, ...}, code) =
            let
                (* Appears at the end of the block whose exceptions are being handled.
                   Delete the handler and anything above it. *)
                (* Get the state after removing the handler.
                *)
                fun popContext ([], _) = raise InternalError "pushRegisters - pop handler"
                |   popContext (HandlerEntry :: tl, new) = (tl, new-2)
                |   popContext (OriginalEntry{stackLoc=StackLoc{size, ...}, ...} :: tl, new) =
                        popContext(tl, new-size)
                |   popContext (NewEntry _ :: tl, new) = popContext(tl, new-1)

                val (newStack, nnCount) = popContext(!stack, !stackCount)
                val () = stack := newStack
                val oldStackPtr = ! stackCount
                val () = stackCount := nnCount

                (* Reset the stack to just above the two words of the handler. *)
                val resetCode =
                    if oldStackPtr <> nnCount+2
                    then [ResetStackPtr{numWords=oldStackPtr-nnCount-2}]
                    else []
            in
                PopExceptionHandler :: resetCode @ code
            end

        |   pushRegisters({instr=BeginHandler{packetReg}, ...}, code) =
            let
                (* Start of a handler. The top active entry should be the handler. *)
                val () =
                    case !stack of
                        HandlerEntry :: tl => stack := tl
                    |   _ => raise InternalError "pushRegisters: BeginHandler"
                val () = stackCount := !stackCount - 2
                val (packetVal, packetCode) = mapDestReg packetReg
            in
                packetCode @ BeginHandler{packetReg=packetVal} :: code
            end

        |   pushRegisters({instr=CompareByteVectors{vec1Addr, vec2Addr, length, ccRef}, ...}, code) =
            let
                val (vec1Val, vec1Code) = mapSrcReg vec1Addr
                val (vec2Val, vec2Code) = mapSrcReg vec2Addr
                val (lenVal, lenCode) = mapSrcReg length
            in
                CompareByteVectors{vec1Addr=vec1Val, vec2Addr=vec2Val, length=lenVal, ccRef=ccRef} ::
                    lenCode @ vec2Code @ vec1Code @ code
            end

        |   pushRegisters({instr=BlockMove{srcAddr, destAddr, length, isByteMove}, ...}, code) =
            let
                val (srcVal, srcCode) = mapSrcReg srcAddr
                val (destVal, destCode) = mapSrcReg destAddr
                val (lenVal, lenCode) = mapSrcReg length
            in
                BlockMove{srcAddr=srcVal, destAddr=destVal, length=lenVal, isByteMove=isByteMove} ::
                    lenCode @ destCode @ srcCode @ code
            end

        |   pushRegisters({instr=AddSubXSP{source, dest, isAdd}, ...}, code) =
            let
                val (sourceVal, sourceCode) = mapSrcReg source
                val (destVal, destCode) = mapOptDest dest
            in
                destCode @ AddSubXSP { source=sourceVal, dest=destVal, isAdd=isAdd} :: sourceCode @ code
            end

        |   pushRegisters({instr=TouchValue{source, ...}, ...}, code) =
            let
                val (sourceVal, sourceCode) = mapSrcReg source
            in
                TouchValue { source=sourceVal} :: sourceCode @ code
            end

        |   pushRegisters({instr=LoadAcquireExclusive{ base, dest }, ...}, code) =
            let
                val (baseVal, baseCode) = mapSrcReg base
                val (destVal, destCode) = mapDestReg dest
            in
                destCode @ LoadAcquireExclusive { base=baseVal, dest=destVal} :: baseCode @ code
            end

        |   pushRegisters({instr=StoreReleaseExclusive{ base, source, result }, ...}, code) =
            let
                val (sourceVal, sourceCode) = mapOptSrc source
                val (baseVal, baseCode) = mapSrcReg base
                val (resVal, resCode) = mapDestReg result
            in
                resCode @ StoreReleaseExclusive{ base=baseVal, source=sourceVal, result=resVal} ::
                    baseCode @ sourceCode @ code
            end

        |   pushRegisters({instr=MemoryBarrier, ...}, code) = MemoryBarrier :: code

        |   pushRegisters({instr=ConvertIntToFloat{ source, dest, srcSize, destSize}, ...}, code) =
            let
                val (sourceVal, sourceCode) = mapSrcReg source
                val (destVal, destCode) = mapDestReg dest
            in
                destCode @ ConvertIntToFloat{ source=sourceVal, dest=destVal, srcSize=srcSize, destSize=destSize} ::
                    sourceCode @ code
            end

        |   pushRegisters({instr=ConvertFloatToInt{ source, dest, srcSize, destSize, rounding}, ...}, code) =
            let
                val (sourceVal, sourceCode) = mapSrcReg source
                val (destVal, destCode) = mapDestReg dest
            in
                destCode @
                    ConvertFloatToInt{ source=sourceVal, dest=destVal, srcSize=srcSize, destSize=destSize, rounding=rounding} ::
                    sourceCode @ code
            end

        |   pushRegisters({instr=UnaryFloatingPt{ source,
                dest, fpOp}, ...}, code) =
            let
                val (sourceVal, sourceCode) = mapSrcReg source
                val (destVal, destCode) = mapDestReg dest
            in
                destCode @ UnaryFloatingPt{ source=sourceVal, dest=destVal, fpOp=fpOp} :: sourceCode @ code
            end

        |   pushRegisters({instr=BinaryFloatingPoint{ arg1, arg2, dest, fpOp, opSize}, ...}, code) =
            let
                val (arg1Val, arg1Code) = mapSrcReg arg1
                val (arg2Val, arg2Code) = mapSrcReg arg2
                val (destVal, destCode) = mapDestReg dest
            in
                destCode @ BinaryFloatingPoint{ arg1=arg1Val, arg2=arg2Val, dest=destVal, fpOp=fpOp, opSize=opSize} ::
                    arg2Code @ arg1Code @ code
            end

        |   pushRegisters({instr=CompareFloatingPoint{ arg1, arg2, opSize, ccRef}, ...}, code) =
            let
                val (arg1Val, arg1Code) = mapSrcReg arg1
                val (arg2Val, arg2Code) = mapSrcReg arg2
            in
                CompareFloatingPoint{ arg1=arg1Val, arg2=arg2Val, opSize=opSize, ccRef=ccRef} ::
                    arg2Code @ arg1Code @ code
            end

        local
            fun doPush(instr as {kill, ...}, code) =
            let
                val newCode = pushRegisters(instr, code)
                (* Can we pop the stack? *)
                val stackReset =
                    case setToList (minus(kill, loopRegs)) of
                        [] => []
                    |   killList =>
                        let
                            (* See if any of the kill items are at the top of the stack.
                               If they are we can pop them and perhaps items we've previously
                               marked for deletion but not been able to pop. *)
                            val oldStack = !stackCount
                            fun checkAndAdd(r, output) =
                                case Array.sub(pregMap, r) of
                                    ToStack(stackLoc, StackLoc{size, ...}) =>
                                        if stackLoc < 0
                                        then r :: output (* We can have arguments and return address. *)
                                        else if !stackCount = stackLoc+size
                                        then
                                        (
                                            stack := tl (!stack);
                                            stackCount := stackLoc;
                                            output
                                        )
                                        else r :: output
                                |   _ => r :: output

                            val toAdd = List.foldl checkAndAdd [] killList

                            fun reprocess list =
                            let
                                val prevStack = !stackCount
                                val outlist = List.foldl checkAndAdd [] list
                            in
                                if !stackCount = prevStack then list else reprocess outlist
                            end

                            val () =
                                if !stackCount = oldStack
                                then deletedItems := toAdd @ !deletedItems
                                else deletedItems := reprocess(toAdd @ !deletedItems)

                            val _ = oldStack >= !stackCount orelse raise InternalError "negative stack offset"
                        in
                            if !stackCount = oldStack
                            then []
                            else [ResetStackPtr{numWords=oldStack - !stackCount}]
                        end
            in
                stackReset @ newCode
            end
        in
            val codeResult = List.foldl doPush [] block
            val outputCount = ! stackCount
            val results = {code=codeResult, stackCount= outputCount}
            val stateResult = { stackCount= outputCount, stack= !stack }
            val () = Array.update(blockOutput, blockNo, results)
        end

        val addSet =
            case flow of
                ExitCode => []
            |   IndexedBr cases => cases
            |   Unconditional dest => [dest]
            |   Conditional {trueJump, falseJump, ...} => [falseJump, trueJump]
            |   SetHandler { handler, continue } => [handler, continue]
            |   UnconditionalHandle _ => []
            |   ConditionalHandle { continue, ...} => [continue]

        val addItems = List.map(fn m => (m, stateResult)) addSet
    in
        processBlocks(addItems @ stillToDo)
    end
in
    val () = processBlocks([(0, {stack=[], stackCount=0})])
end

(* Put together the result code and blocks. *)
local
    fun createBlock blockNo =
        (* Skip unreferenced blocks apart from block 0. *)
        if blockNo <> 0 andalso null (asub blockRefs blockNo)
        then BasicBlock{block=[], flow=ExitCode}
        else
        let
            val ExtendedBasicBlock{ flow, ...} = vsub code blockNo
            val {code=codeResult, stackCount=outputCount, ...} = asub blockOutput blockNo

            (* Process the successor. If we need a stack adjustment this will require an
               adjustment block. TODO: We could put a pre-adjustment if we only have one
               branch to this block. *)
            fun matchStacks targetBlock =
            let
                (* Process the destination. If it hasn't been processed.
                *)
                val expectedInput = valOf (asub inputStackSizes targetBlock)
            in
                if expectedInput = outputCount
                then targetBlock
                else
                let
                    val _ = outputCount > expectedInput orelse raise InternalError "adjustStack"
                    val adjustCode = [ResetStackPtr{numWords=outputCount-expectedInput}]
                    val newBlock = BasicBlock{block=adjustCode, flow=Unconditional targetBlock}
                    val newBlockNo = !blockCounter before blockCounter := !blockCounter+1
                    val () = extraBlocks := newBlock :: !extraBlocks
                in
                    newBlockNo
                end
            end

            val (finalCode, newFlow) =
                case flow of
                    ExitCode => (codeResult, ExitCode)
                |   Unconditional m =>
                    let
                        (* Process the block. Since we're making an unconditional jump we can
                           include any stack adjustment needed to match the destination in here.
                           In particular this includes loops. *)
                        val expectedInput = valOf (asub inputStackSizes m)
                        val _ = outputCount >= expectedInput orelse raise InternalError "negative reset"
                        val resultCode =
                            if expectedInput = outputCount
                            then codeResult
                            else ResetStackPtr{numWords=outputCount-expectedInput} :: codeResult
                    in
                        (resultCode, Unconditional m)
                    end
                    (* For any of these, if we need to adjust the stack we have to add an
                       adjustment block. *)
                |   Conditional {trueJump, falseJump, ccRef, condition} =>
                        (codeResult,
                            Conditional{trueJump=matchStacks trueJump, falseJump=matchStacks falseJump,
                                        ccRef=ccRef, condition=condition})
                |   SetHandler{ handler, continue } =>
                        (codeResult, SetHandler{ handler=matchStacks handler, continue=matchStacks continue})
                |   IndexedBr cases => (codeResult, IndexedBr(map matchStacks cases))
                |   u as UnconditionalHandle _ => (codeResult, u)
                |   c as ConditionalHandle{ continue, ... } =>
                    let
                        (* As for unconditional branch *)
                        val expectedInput = valOf (asub inputStackSizes continue)
                        val _ = outputCount >= expectedInput orelse raise InternalError "negative reset"
                        val resultCode =
                            if expectedInput = outputCount
                            then codeResult
                            else ResetStackPtr{numWords=outputCount-expectedInput} :: codeResult
                    in
                        (resultCode, c)
                    end
        in
            BasicBlock{block=List.rev finalCode, flow=newFlow}
        end
in
    val resultBlocks = Vector.tabulate(numberOfBlocks, createBlock)
end

(* Add any extra blocks to the result. *)
val finalResult =
    case !extraBlocks of
        [] => resultBlocks
    |   blocks => Vector.concat[resultBlocks, Vector.fromList(List.rev blocks)]

val pregProperties = Vector.fromList(List.rev(! pregPropList))
in
    {code=finalResult, pregProps=pregProperties, maxStack= !maxStack}
end

structure Sharing =
struct
    type extendedBasicBlock = extendedBasicBlock
    and basicBlock = basicBlock
    and regProperty = regProperty
end

end;
diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/ml_bind.ML b/mlsource/MLCompiler/CodeTree/Arm64Code/ml_bind.ML
index d86ff2a8..94340995 100644
--- a/mlsource/MLCompiler/CodeTree/Arm64Code/ml_bind.ML
+++ b/mlsource/MLCompiler/CodeTree/Arm64Code/ml_bind.ML
@@ -1,121 +1,128 @@
(*  Copyright (c) 2021 David C. J. Matthews

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    Licence version 2.1 as published by the Free Software Foundation.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
    Lesser General Public Licence for more details.
    You should have received a copy of the GNU Lesser General Public
    Licence along with this library; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*)

local
    structure Arm64Assembly =
        Arm64Assembly (
            structure Debug = Debug
            and Pretty = Pretty
            and CodeArray = CodeArray
        )

    structure Arm64Sequences =
        Arm64Sequences (
            structure Arm64Assembly = Arm64Assembly
        )

+    structure Arm64Preassembly =
+        Arm64PreAssembly (
+            structure Arm64Assembly = Arm64Assembly
+            and Debug = Debug
+            and Pretty = Pretty
+        )
+
    structure Arm64Foreign =
        Arm64ForeignCall (
            structure CodeArray = CodeArray
            and Arm64Assembly = Arm64Assembly
            and Arm64Sequences = Arm64Sequences
            and Debug = Debug
        )

    structure Arm64ICode =
        Arm64ICode (
-            structure Arm64Code = Arm64Assembly
+            structure Arm64Code = Arm64Preassembly
        )

    structure Arm64ICodeIdentify =
        Arm64IdentifyReferences (
            structure Debug = Debug
            structure Arm64ICode = Arm64ICode
            structure IntSet = IntSet
        )

    structure Arm64ICodeConflicts =
        Arm64ICodeConflicts (
            structure Arm64ICode = Arm64ICode
            structure IntSet = IntSet
            structure Identify = Arm64ICodeIdentify
        )

    structure Arm64PushRegs =
        Arm64PushRegisters (
            structure Arm64ICode = Arm64ICode
            structure IntSet = IntSet
            structure Identify = Arm64ICodeIdentify
        )

    structure Arm64Opt =
        Arm64ICodeOptimise (
            structure Arm64ICode = Arm64ICode
            structure IntSet = IntSet
            structure Identify = Arm64ICodeIdentify
            structure Debug = Debug
            structure Pretty = Pretty
        )

    structure Arm64IAllocate =
        Arm64AllocateRegisters (
            structure Arm64ICode = Arm64ICode
            structure Identify = Arm64ICodeIdentify
            structure ConflictSets = Arm64ICodeConflicts
            structure IntSet = IntSet
        )

    structure Arm64ICodeGenerate =
        Arm64ICodeToArm64Code (
            structure Debug = Debug
            structure Arm64ICode = Arm64ICode
            structure Identify = Arm64ICodeIdentify
            structure Pretty = Pretty
            structure IntSet = IntSet
            structure Arm64Assembly = Arm64Assembly
            structure Arm64Sequences = Arm64Sequences
            structure Strongly = StronglyConnected
        )

    structure Arm64ICodeTransform =
        Arm64ICodeTransform (
            structure Debug = Debug
            structure Arm64ICode = Arm64ICode
            structure Identify = Arm64ICodeIdentify
            structure ConflictSets = Arm64ICodeConflicts
            structure Allocate = Arm64IAllocate
            structure PushRegisters = Arm64PushRegs
            structure Optimise = Arm64Opt
            structure Pretty = Pretty
            structure IntSet = IntSet
            structure Codegen = Arm64ICodeGenerate
        )
in
    structure Arm64Code =
        Arm64CodetreeToICode (
            structure BackendTree = BackendIntermediateCode
            structure Debug = Debug
            structure Arm64ICode = Arm64ICode
            structure Arm64Foreign = Arm64Foreign
            structure ICodeTransform = Arm64ICodeTransform
            structure CodeArray = CodeArray
            and Pretty = Pretty
        )
end;
diff --git a/polymlArm64.pyp b/polymlArm64.pyp
index 28b539d7..dfe62740 100644
--- a/polymlArm64.pyp
+++ b/polymlArm64.pyp
@@ -1,243 +1,245 @@
+
+