diff --git a/mlsource/MLCompiler/CodeTree/X86Code/X86FOREIGNCALL.sml b/mlsource/MLCompiler/CodeTree/X86Code/X86FOREIGNCALL.sml index fedbe498..f29cc207 100644 --- a/mlsource/MLCompiler/CodeTree/X86Code/X86FOREIGNCALL.sml +++ b/mlsource/MLCompiler/CodeTree/X86Code/X86FOREIGNCALL.sml @@ -1,1145 +1,1190 @@ (* Copyright (c) 2016-19 David C.J. Matthews This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor X86FOREIGNCALL( structure X86CODE: X86CODESIG structure X86OPTIMISE: sig type operation type code type operations = operation list type closureRef (* Optimise and code-generate. *) val generateCode: {code: code, ops: operations, labelCount: int, resultClosure: closureRef} -> unit structure Sharing: sig type operation = operation type code = code type closureRef = closureRef end end structure DEBUG: DEBUGSIG structure CODE_ARRAY: CODEARRAYSIG sharing X86CODE.Sharing = X86OPTIMISE.Sharing = CODE_ARRAY.Sharing ): FOREIGNCALLSIG = struct open X86CODE open Address open CODE_ARRAY val memRegSize = 0 val (polyWordOpSize, nativeWordOpSize) = case targetArch of Native32Bit => (OpSize32, OpSize32) | Native64Bit => (OpSize64, OpSize64) | ObjectId32Bit => (OpSize32, OpSize64) (* Ebx/Rbx is used for the second argument on the native architectures but is replaced by esi on the object ID arch because ebx is used as the global base register. *) val mlArg2Reg = case targetArch of ObjectId32Bit => esi | _ => ebx exception InternalError = Misc.InternalError fun opSizeToMove OpSize32 = Move32 | opSizeToMove OpSize64 = Move64 val pushR = PushToStack o RegisterArg fun moveRR{source, output, opSize} = Move{source=RegisterArg source, destination=RegisterArg output, moveSize=opSizeToMove opSize} fun loadMemory(reg, base, offset, opSize) = Move{source=MemoryArg{base=base, offset=offset, index=NoIndex}, destination=RegisterArg reg, moveSize=opSizeToMove opSize} and storeMemory(reg, base, offset, opSize) = Move{source=RegisterArg reg, destination=MemoryArg {base=base, offset=offset, index=NoIndex}, moveSize=opSizeToMove opSize} val loadHeapMemory = case targetArch of ObjectId32Bit => ( fn (reg, base, offset, opSize) => Move{source=MemoryArg{base=ebx, offset=offset, index=Index4 base}, destination=RegisterArg reg, moveSize=opSizeToMove opSize} ) | _ => loadMemory fun loadAddress{source=(srcReg, 0), destination} = Move{source=RegisterArg srcReg, destination=RegisterArg destination, moveSize=opSizeToMove nativeWordOpSize} | loadAddress{source=(srcReg, srcOffset), destination} = LoadAddress{offset=srcOffset, base=SOME srcReg, index=NoIndex, output=destination, opSize=nativeWordOpSize } (* Sequence of operations to move memory. 
*) fun moveMemory{source, destination, length} = [ loadAddress{source=source, destination=rsi}, loadAddress{source=destination, destination=rdi}, Move{source=NonAddressConstArg(LargeInt.fromInt length), destination=RegisterArg rcx, moveSize=opSizeToMove nativeWordOpSize}, RepeatOperation MOVS8 ] fun createProfileObject _ (*functionName*) = let (* The profile object is a single mutable with the F_bytes bit set. *) open Address val profileObject = RunCall.allocateByteMemory(0w1, Word.fromLargeWord(Word8.toLargeWord(Word8.orb(F_mutable, F_bytes)))) fun clear 0w0 = () | clear i = (assignByte(profileObject, i-0w1, 0w0); clear (i-0w1)) val () = clear wordSize in toMachineWord profileObject end val makeEntryPoint: string -> machineWord = RunCall.rtsCallFull1 "PolyCreateEntryPointObject" datatype abi = X86_32 | X64Win | X64Unix local (* Get the ABI. On 64-bit Windows and Unix use different calling conventions. *) val getABICall: unit -> int = RunCall.rtsCallFast0 "PolyGetABI" in fun getABI() = case getABICall() of 0 => X86_32 | 1 => X64Unix | 2 => X64Win | n => raise InternalError ("Unknown ABI type " ^ Int.toString n) end val noException = 1 (* Full RTS call version. An extra argument is passed that contains the thread ID. This allows the taskData object to be found which is needed if the code allocates any ML memory or raises an exception. It also saves the stack and heap pointers in case of a GC. *) fun rtsCallFull (functionName, nArgs (* Not counting the thread ID *), debugSwitches) = let val entryPointAddr = makeEntryPoint functionName (* Get the ABI. On 64-bit Windows and Unix use different calling conventions. *) val abi = getABI() (* Branch to check for exception. *) val exLabel = Label{labelNo=0} (* There's just one label in this function. *) (* Unix X64. The first six arguments are in rdi, rsi, rdx, rcx, r8, r9. The rest are on the stack. Windows X64. The first four arguments are in rcx, rdx, r8 and r9. The rest are on the stack. The caller must ensure the stack is aligned on 16-byte boundary and must allocate 32-byte save area for the register args. rbx, rbp, rdi, rsi, rsp, r12-r15 are saved by the called function. X86/32. Arguments are pushed to the stack. ebx, edi, esi, ebp and esp are saved by the called function. We use esi to hold the argument data pointer and edi to save the ML stack pointer Our ML conventions use eax, ebx for the first two arguments in X86/32, rax, ebx, r8, r9, r10 for the first five arguments in X86/64 and rax, rsi, r8, r9 and r10 for the first five arguments in X86/64-32 bit. *) (* Previously the ML stack pointer was saved in a callee-save register. This works in almost all circumstances except when a call to the FFI code results in a callback and the callback moves the ML stack. Instead the RTS callback handler adjusts the value in memRegStackPtr and we reload the ML stack pointer from there. *) val entryPtrReg = if targetArch <> Native32Bit then r11 else ecx val stackSpace = case abi of X64Unix => memRegSize | X64Win => memRegSize + 32 (* Requires 32-byte save area. *) | X86_32 => let (* GCC likes to keep the stack on a 16-byte alignment. *) val argSpace = (nArgs+1)*4 val align = argSpace mod 16 in (* Add sufficient space so that esp will be 16-byte aligned *) if align = 0 then memRegSize else memRegSize + 16 - align end (* The RTS functions expect the real address of the thread Id. 
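On the native targets this is simply the value saved in memRegThreadSelf; on the 32-in-64 target that value is an object id, so it is scaled by four and added to the heap base held in rbx to form the absolute address, which is what the Index4 load below does.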
*) fun loadThreadId toReg = if targetArch <> ObjectId32Bit then [loadMemory(toReg, ebp, memRegThreadSelf, nativeWordOpSize)] else [loadMemory(toReg, ebp, memRegThreadSelf, polyWordOpSize), LoadAddress{output=toReg, offset=0, base=SOME ebx, index=Index4 toReg, opSize=nativeWordOpSize}] val code = [ Move{source=AddressConstArg entryPointAddr, destination=RegisterArg entryPtrReg, moveSize=opSizeToMove polyWordOpSize}, (* Load the entry point ref. *) loadHeapMemory(entryPtrReg, entryPtrReg, 0, nativeWordOpSize)(* Load its value. *) ] @ ( (* Save heap ptr. This is in r15 in X86/64 *) if targetArch <> Native32Bit then [storeMemory(r15, ebp, memRegLocalMPointer, nativeWordOpSize)] (* Save heap ptr *) else [] ) @ ( if abi = X86_32 andalso nArgs >= 3 then [moveRR{source=esp, output=edi, opSize=nativeWordOpSize}] (* Needed if we have to load from the stack. *) else [] ) @ [ (* Have to save the stack pointer to the arg structure in case we need to scan the stack for a GC. *) storeMemory(esp, ebp, memRegStackPtr, nativeWordOpSize), (* Save ML stack and switch to C stack. *) loadMemory(esp, ebp, memRegCStackPtr, nativeWordOpSize), (*moveRR{source=ebp, output=esp},*) (* Load the saved C stack pointer. *) (* Set the stack pointer past the data on the stack. For Windows/64 add in a 32 byte save area *) ArithToGenReg{opc=SUB, output=esp, source=NonAddressConstArg(LargeInt.fromInt stackSpace), opSize=nativeWordOpSize} ] @ ( case (abi, nArgs) of (* Set the argument registers. *) (X64Unix, 0) => loadThreadId edi | (X64Unix, 1) => moveRR{source=eax, output=esi, opSize=polyWordOpSize} :: loadThreadId edi | (X64Unix, 2) => moveRR{source=mlArg2Reg, output=edx, opSize=polyWordOpSize} :: moveRR{source=eax, output=esi, opSize=polyWordOpSize} :: loadThreadId edi | (X64Unix, 3) => moveRR{source=mlArg2Reg, output=edx, opSize=polyWordOpSize} :: moveRR{source=eax, output=esi, opSize=polyWordOpSize} :: moveRR{source=r8, output=ecx, opSize=polyWordOpSize} :: loadThreadId edi | (X64Win, 0) => loadThreadId ecx | (X64Win, 1) => moveRR{source=eax, output=edx, opSize=polyWordOpSize} :: loadThreadId ecx | (X64Win, 2) => moveRR{source=eax, output=edx, opSize=polyWordOpSize} :: moveRR{source=mlArg2Reg, output=r8, opSize=polyWordOpSize} :: loadThreadId ecx | (X64Win, 3) => moveRR{source=eax, output=edx, opSize=polyWordOpSize} :: moveRR{source=r8, output=r9, opSize=polyWordOpSize} :: moveRR{source=mlArg2Reg, output=r8, opSize=polyWordOpSize} :: loadThreadId ecx | (X86_32, 0) => [ PushToStack(MemoryArg{base=ebp, offset=memRegThreadSelf, index=NoIndex}) ] | (X86_32, 1) => [ pushR eax, PushToStack(MemoryArg{base=ebp, offset=memRegThreadSelf, index=NoIndex}) ] | (X86_32, 2) => [ pushR mlArg2Reg, pushR eax, PushToStack(MemoryArg{base=ebp, offset=memRegThreadSelf, index=NoIndex}) ] | (X86_32, 3) => [ (* We need to move an argument from the ML stack. *) PushToStack(MemoryArg{base=edi, offset=4, index=NoIndex}), pushR mlArg2Reg, pushR eax, PushToStack(MemoryArg{base=ebp, offset=memRegThreadSelf, index=NoIndex}) ] | _ => raise InternalError "rtsCall: Abi/argument count not implemented" ) @ [ CallAddress(RegisterArg entryPtrReg), (* Call the function *) loadMemory(esp, ebp, memRegStackPtr, nativeWordOpSize) (* Restore the ML stack pointer. 
*) ] @ ( if targetArch <> Native32Bit then [loadMemory(r15, ebp, memRegLocalMPointer, nativeWordOpSize) ] (* Copy back the heap ptr *) else [] ) @ [ ArithMemConst{opc=CMP, address={offset=memRegExceptionPacket, base=ebp, index=NoIndex}, source=noException, opSize=polyWordOpSize}, ConditionalBranch{test=JNE, label=exLabel}, (* Remove any arguments that have been passed on the stack. *) ReturnFromFunction(Int.max(case abi of X86_32 => nArgs-2 | _ => nArgs-5, 0)), JumpLabel exLabel, (* else raise the exception *) loadMemory(eax, ebp, memRegExceptionPacket, polyWordOpSize), RaiseException { workReg=ecx } ] val profileObject = createProfileObject functionName val newCode = codeCreate (functionName, profileObject, debugSwitches) val closure = makeConstantClosure() val () = X86OPTIMISE.generateCode{code=newCode, labelCount=1(*One label.*), ops=code, resultClosure=closure} in closureAsAddress closure end (* This is a quicker version but can only be used if the RTS entry does not allocate ML memory, raise an exception or need to suspend the thread. *) datatype fastArgs = FastArgFixed | FastArgDouble | FastArgFloat fun rtsCallFastGeneral (functionName, argFormats, (*resultFormat*) _, debugSwitches) = let val entryPointAddr = makeEntryPoint functionName (* Get the ABI. On 64-bit Windows and Unix use different calling conventions. *) val abi = getABI() val (entryPtrReg, saveMLStackPtrReg) = if targetArch <> Native32Bit then (r11, r13) else (ecx, edi) val stackSpace = case abi of X64Unix => memRegSize | X64Win => memRegSize + 32 (* Requires 32-byte save area. *) | X86_32 => let (* GCC likes to keep the stack on a 16-byte alignment. *) val argSpace = List.foldl(fn (FastArgDouble, n) => n+8 | (_, n) => n+4) 0 argFormats val align = argSpace mod 16 in (* Add sufficient space so that esp will be 16-byte aligned *) if align = 0 then memRegSize else memRegSize + 16 - align end (* The number of ML arguments passed on the stack. *) val mlArgsOnStack = Int.max(case abi of X86_32 => List.length argFormats - 2 | _ => List.length argFormats - 5, 0) val code = [ Move{source=AddressConstArg entryPointAddr, destination=RegisterArg entryPtrReg, moveSize=opSizeToMove polyWordOpSize}, (* Load the entry point ref. *) loadHeapMemory(entryPtrReg, entryPtrReg, 0, nativeWordOpSize),(* Load its value. *) moveRR{source=esp, output=saveMLStackPtrReg, opSize=nativeWordOpSize}, (* Save ML stack and switch to C stack. *) loadMemory(esp, ebp, memRegCStackPtr, nativeWordOpSize), (* Set the stack pointer past the data on the stack. For Windows/64 add in a 32 byte save area *) ArithToGenReg{opc=SUB, output=esp, source=NonAddressConstArg(LargeInt.fromInt stackSpace), opSize=nativeWordOpSize} ] @ ( case (abi, argFormats) of (* Set the argument registers. *) (_, []) => [] | (X64Unix, [FastArgFixed]) => [ moveRR{source=eax, output=edi, opSize=polyWordOpSize} ] | (X64Unix, [FastArgFixed, FastArgFixed]) => (* Since mlArg2Reg is esi on 32-in-64 this is redundant.
*) [ moveRR{source=mlArg2Reg, output=esi, opSize=polyWordOpSize}, moveRR{source=eax, output=edi, opSize=polyWordOpSize} ] | (X64Unix, [FastArgFixed, FastArgFixed, FastArgFixed]) => [ moveRR{source=mlArg2Reg, output=esi, opSize=polyWordOpSize}, moveRR{source=eax, output=edi, opSize=polyWordOpSize}, moveRR{source=r8, output=edx, opSize=polyWordOpSize} ] | (X64Unix, [FastArgFixed, FastArgFixed, FastArgFixed, FastArgFixed]) => [ moveRR{source=mlArg2Reg, output=esi, opSize=polyWordOpSize}, moveRR{source=eax, output=edi, opSize=polyWordOpSize}, moveRR{source=r8, output=edx, opSize=polyWordOpSize}, moveRR{source=r9, output=ecx, opSize=polyWordOpSize} ] | (X64Win, [FastArgFixed]) => [ moveRR{source=eax, output=ecx, opSize=polyWordOpSize} ] | (X64Win, [FastArgFixed, FastArgFixed]) => [ moveRR{source=eax, output=ecx, opSize=polyWordOpSize}, moveRR{source=mlArg2Reg, output=edx, opSize=polyWordOpSize} ] | (X64Win, [FastArgFixed, FastArgFixed, FastArgFixed]) => [ moveRR{source=eax, output=ecx, opSize=polyWordOpSize}, moveRR{source=mlArg2Reg, output=edx, opSize=polyWordOpSize} (* Arg3 is already in r8. *) ] | (X64Win, [FastArgFixed, FastArgFixed, FastArgFixed, FastArgFixed]) => [ moveRR{source=eax, output=ecx, opSize=polyWordOpSize}, moveRR{source=mlArg2Reg, output=edx, opSize=polyWordOpSize} (* Arg3 is already in r8 and arg4 in r9. *) ] | (X86_32, [FastArgFixed]) => [ pushR eax ] | (X86_32, [FastArgFixed, FastArgFixed]) => [ pushR mlArg2Reg, pushR eax ] | (X86_32, [FastArgFixed, FastArgFixed, FastArgFixed]) => [ (* We need to move an argument from the ML stack. *) loadMemory(edx, saveMLStackPtrReg, 4, polyWordOpSize), pushR edx, pushR mlArg2Reg, pushR eax ] | (X86_32, [FastArgFixed, FastArgFixed, FastArgFixed, FastArgFixed]) => [ (* We need to move an arguments from the ML stack. *) loadMemory(edx, saveMLStackPtrReg, 4, polyWordOpSize), pushR edx, loadMemory(edx, saveMLStackPtrReg, 8, polyWordOpSize), pushR edx, pushR mlArg2Reg, pushR eax ] (* One "double" argument. The value needs to be unboxed. *) | (X86_32, [FastArgDouble]) => (* eax contains the address of the value. This must be unboxed onto the stack. *) [ FPLoadFromMemory{address={base=eax, offset=0, index=NoIndex}, precision=DoublePrecision}, ArithToGenReg{ opc=SUB, output=esp, source=NonAddressConstArg 8, opSize=nativeWordOpSize}, FPStoreToMemory{ address={base=esp, offset=0, index=NoIndex}, precision=DoublePrecision, andPop=true } ] | (_, [FastArgDouble]) => [ (* Already in xmm0 *) ] | (X86_32, [FastArgDouble, FastArgDouble]) => (* eax and ebx contain the addresses of the values. They must be unboxed onto the stack. *) [ FPLoadFromMemory{address={base=ebx, offset=0, index=NoIndex}, precision=DoublePrecision}, ArithToGenReg{ opc=SUB, output=esp, source=NonAddressConstArg 8, opSize=nativeWordOpSize}, FPStoreToMemory{ address={base=esp, offset=0, index=NoIndex}, precision=DoublePrecision, andPop=true }, FPLoadFromMemory{address={base=eax, offset=0, index=NoIndex}, precision=DoublePrecision}, ArithToGenReg{ opc=SUB, output=esp, source=NonAddressConstArg 8, opSize=nativeWordOpSize}, FPStoreToMemory{ address={base=esp, offset=0, index=NoIndex}, precision=DoublePrecision, andPop=true } ] (* X64 on both Windows and Unix take the first arg in xmm0 and the second in xmm1. They are already there. *) | (_, [FastArgDouble, FastArgDouble]) => [ ] (* X64 on both Windows and Unix take the first arg in xmm0. On Unix the integer argument is treated as the first argument and goes into edi. On Windows it's treated as the second and goes into edx. N.B. 
It's also the first argument in ML so is in rax. *) | (X64Unix, [FastArgDouble, FastArgFixed]) => [ moveRR{source=eax, output=edi, opSize=nativeWordOpSize} ] | (X64Win, [FastArgDouble, FastArgFixed]) => [ moveRR{source=eax, output=edx, opSize=nativeWordOpSize} ] | (X86_32, [FastArgDouble, FastArgFixed]) => (* ebx must be pushed to the stack but eax must be unboxed.. *) [ pushR ebx, FPLoadFromMemory{address={base=eax, offset=0, index=NoIndex}, precision=DoublePrecision}, ArithToGenReg{ opc=SUB, output=esp, source=NonAddressConstArg 8, opSize=nativeWordOpSize}, FPStoreToMemory{ address={base=esp, offset=0, index=NoIndex}, precision=DoublePrecision, andPop=true } ] (* One "float" argument. The value needs to be untagged on X86/64 but unboxed on X86/32. *) | (X86_32, [FastArgFloat]) => (* eax contains the address of the value. This must be unboxed onto the stack. *) [ FPLoadFromMemory{address={base=eax, offset=0, index=NoIndex}, precision=SinglePrecision}, ArithToGenReg{ opc=SUB, output=esp, source=NonAddressConstArg 4, opSize=nativeWordOpSize}, FPStoreToMemory{ address={base=esp, offset=0, index=NoIndex}, precision=SinglePrecision, andPop=true } ] | (_, [FastArgFloat]) => [] (* Two float arguments. Untag them on X86/64 but unbox on X86/32 *) | (X86_32, [FastArgFloat, FastArgFloat]) => (* eax and ebx contain the addresses of the values. They must be unboxed onto the stack. *) [ FPLoadFromMemory{address={base=ebx, offset=0, index=NoIndex}, precision=SinglePrecision}, ArithToGenReg{ opc=SUB, output=esp, source=NonAddressConstArg 4, opSize=nativeWordOpSize}, FPStoreToMemory{ address={base=esp, offset=0, index=NoIndex}, precision=SinglePrecision, andPop=true }, FPLoadFromMemory{address={base=eax, offset=0, index=NoIndex}, precision=SinglePrecision}, ArithToGenReg{ opc=SUB, output=esp, source=NonAddressConstArg 4, opSize=nativeWordOpSize}, FPStoreToMemory{ address={base=esp, offset=0, index=NoIndex}, precision=SinglePrecision, andPop=true } ] | (_, [FastArgFloat, FastArgFloat]) => [] (* Already in xmm0 and xmm1 *) (* One float argument and one fixed. *) | (X64Unix, [FastArgFloat, FastArgFixed]) => [moveRR{source=mlArg2Reg, output=edi, opSize=polyWordOpSize} ] | (X64Win, [FastArgFloat, FastArgFixed]) => [moveRR{source=mlArg2Reg, output=edx, opSize=polyWordOpSize}] | (X86_32, [FastArgFloat, FastArgFixed]) => (* ebx must be pushed to the stack but eax must be unboxed.. *) [ pushR ebx, FPLoadFromMemory{address={base=eax, offset=0, index=NoIndex}, precision=SinglePrecision}, ArithToGenReg{ opc=SUB, output=esp, source=NonAddressConstArg 4, opSize=nativeWordOpSize}, FPStoreToMemory{ address={base=esp, offset=0, index=NoIndex}, precision=SinglePrecision, andPop=true } ] | _ => raise InternalError "rtsCall: Abi/argument count not implemented" ) @ [ CallAddress(RegisterArg entryPtrReg), (* Call the function *) moveRR{source=saveMLStackPtrReg, output=esp, opSize=nativeWordOpSize}, (* Restore the ML stack pointer *) (* Since this is an ML function we need to remove any ML stack arguments. 
*) ReturnFromFunction mlArgsOnStack ] val profileObject = createProfileObject functionName val newCode = codeCreate (functionName, profileObject, debugSwitches) val closure = makeConstantClosure() val () = X86OPTIMISE.generateCode{code=newCode, labelCount=0, ops=code, resultClosure=closure} in closureAsAddress closure end fun rtsCallFast (functionName, nArgs, debugSwitches) = rtsCallFastGeneral (functionName, List.tabulate(nArgs, fn _ => FastArgFixed), FastArgFixed, debugSwitches) (* RTS call with one double-precision floating point argument and a floating point result. *) fun rtsCallFastRealtoReal (functionName, debugSwitches) = rtsCallFastGeneral (functionName, [FastArgDouble], FastArgDouble, debugSwitches) (* RTS call with two double-precision floating point arguments and a floating point result. *) fun rtsCallFastRealRealtoReal (functionName, debugSwitches) = rtsCallFastGeneral (functionName, [FastArgDouble, FastArgDouble], FastArgDouble, debugSwitches) (* RTS call with one double-precision floating point argument, one fixed point argument and a floating point result. *) fun rtsCallFastRealGeneraltoReal (functionName, debugSwitches) = rtsCallFastGeneral (functionName, [FastArgDouble, FastArgFixed], FastArgDouble, debugSwitches) (* RTS call with one general (i.e. ML word) argument and a floating point result. This is used only to convert arbitrary precision values to floats. *) fun rtsCallFastGeneraltoReal (functionName, debugSwitches) = rtsCallFastGeneral (functionName, [FastArgFixed], FastArgDouble, debugSwitches) (* Operations on Real32.real values. *) fun rtsCallFastFloattoFloat (functionName, debugSwitches) = rtsCallFastGeneral (functionName, [FastArgFloat], FastArgFloat, debugSwitches) fun rtsCallFastFloatFloattoFloat (functionName, debugSwitches) = rtsCallFastGeneral (functionName, [FastArgFloat, FastArgFloat], FastArgFloat, debugSwitches) (* RTS call with one double-precision floating point argument, one fixed point argument and a floating point result. *) fun rtsCallFastFloatGeneraltoFloat (functionName, debugSwitches) = rtsCallFastGeneral (functionName, [FastArgFloat, FastArgFixed], FastArgFloat, debugSwitches) (* RTS call with one general (i.e. ML word) argument and a floating point result. This is used only to convert arbitrary precision values to floats. *) fun rtsCallFastGeneraltoFloat (functionName, debugSwitches) = rtsCallFastGeneral (functionName, [FastArgFixed], FastArgFloat, debugSwitches) datatype ffiABI = FFI_SYSV (* Unix 32 bit and Windows GCC 32-bit *) | FFI_STDCALL (* Windows 32-bit system ABI. Callee clears the stack. *) | FFI_MS_CDECL (* VS 32-bit. Same as SYSV except when returning a struct. Default on Windows including GCC in Mingw. *) | FFI_WIN64 (* Windows 64 bit *) | FFI_UNIX64 (* Unix 64 bit. libffi also implements this on X86/32. *) (* We don't include various other 32-bit Windows ABIs. *) local (* Get the current ABI list. N.B. Foreign.LibFFI.abiList is the ABIs on the platform we built the compiler on, not necessarily the one we're running on. 
*) val ffiGeneral = RunCall.rtsCallFull2 "PolyFFIGeneral" in fun getFFIAbi abi = let val abis: (string * Foreign.LibFFI.abi) list = ffiGeneral (50, ()) in case List.find (fn ("default", _) => false | (_, a) => a = abi) abis of SOME ("sysv", _) => FFI_SYSV | SOME ("stdcall", _) => FFI_STDCALL | SOME ("ms_cdecl", _) => FFI_MS_CDECL | SOME ("win64", _) => FFI_WIN64 | SOME ("unix64", _) => FFI_UNIX64 | _ => raise Foreign.Foreign "Unknown or unsupported ABI" end end fun alignUp(s, align) = Word.andb(s + align-0w1, ~ align) fun intAlignUp(s, align) = Word.toInt(alignUp(Word.fromInt s, align)) (* Build a foreign call function. The arguments are the abi, the list of argument types and the result type. The result is the code of the ML function that takes three arguments: the C function to call, the arguments as a vector of C values and the address of the memory for the result. *) fun call32Bits(abi, args, result) = let (* 32-bit arguments. These all go to the stack so we can simply push them. The arguments go on the stack in reverse order. *) fun loadArgs32([], stackOffset, argOffset, code, continue) = continue(stackOffset, argOffset, code) | loadArgs32(arg::args, stackOffset, argOffset, code, continue) = let val {size, align, typeCode, elements} = Foreign.LibFFI.extractFFItype arg val newArgOffset = alignUp(argOffset, align) val baseAddr = {base=mlArg2Reg, offset=Word.toInt newArgOffset, index=NoIndex} in if typeCode = Foreign.LibFFI.ffiTypeCodeUInt8 then (* Unsigned char. *) loadArgs32(args, stackOffset+4, newArgOffset+size, Move{source=MemoryArg baseAddr, destination=RegisterArg edx, moveSize=Move8 } :: PushToStack(RegisterArg edx) :: code, continue) else if typeCode = Foreign.LibFFI.ffiTypeCodeSInt8 then (* Signed char. *) loadArgs32(args, stackOffset+4, newArgOffset+size, Move{source=MemoryArg baseAddr, destination=RegisterArg edx, moveSize=Move8X32 } :: PushToStack(RegisterArg edx) :: code, continue) else if typeCode = Foreign.LibFFI.ffiTypeCodeUInt16 then (* Unsigned 16-bits. *) loadArgs32(args, stackOffset+4, newArgOffset+size, Move{source=MemoryArg baseAddr, destination=RegisterArg edx, moveSize=Move16 } :: PushToStack(RegisterArg edx) :: code, continue) else if typeCode = Foreign.LibFFI.ffiTypeCodeSInt16 then (* Signed 16-bits. *) loadArgs32(args, stackOffset+4, newArgOffset+size, Move{source=MemoryArg baseAddr, destination=RegisterArg edx, moveSize=Move16X32 } :: PushToStack(RegisterArg edx) :: code, continue) else if typeCode = Foreign.LibFFI.ffiTypeCodeUInt32 orelse typeCode = Foreign.LibFFI.ffiTypeCodeSInt32 orelse typeCode = Foreign.LibFFI.ffiTypeCodePointer orelse typeCode = Foreign.LibFFI.ffiTypeCodeFloat orelse typeCode = Foreign.LibFFI.ffiTypeCodeInt then (* 32-bits. *) loadArgs32(args, stackOffset+4, newArgOffset+size, PushToStack(MemoryArg baseAddr) :: code, continue) else if typeCode = Foreign.LibFFI.ffiTypeCodeDouble then (* Double: push the two words. High-order word first, then low-order. *) loadArgs32(args, stackOffset+8, newArgOffset+size, PushToStack(MemoryArg{base=mlArg2Reg, offset=Word.toInt newArgOffset+4, index=NoIndex}) :: PushToStack(MemoryArg{base=mlArg2Reg, offset=Word.toInt newArgOffset, index=NoIndex}) :: code, continue) else if typeCode = Foreign.LibFFI.ffiTypeCodeStruct (* structs passed as values are recursively unpacked. *) then loadArgs32(elements, stackOffset, newArgOffset (* Struct is aligned. 
*), code, fn (so, ao, code) => loadArgs32(args, so, ao, code, continue)) else if typeCode = Foreign.LibFFI.ffiTypeCodeVoid then raise Foreign.Foreign "Void cannot be used for a function argument" else (* Anything else? *) raise Foreign.Foreign "Unrecognised type for function argument" end val {typeCode, size, ...} = Foreign.LibFFI.extractFFItype result val resultMemory = {base=ecx, offset=0, index=NoIndex} (* Structures are passed by reference by storing the address of the result as the first argument except that in MS_CDECL (and STDCALL?) structures of size 1, 2, 4 and 8 are returned in EAX, and for 8, EDX. *) val (getResult, needResultAddress) = if typeCode = Foreign.LibFFI.ffiTypeCodeStruct andalso (abi = FFI_SYSV orelse (size <> 0w1 andalso size <> 0w2 andalso size <> 0w4 andalso size <> 0w8)) (* TODO: We have to get the address of the destination area. *) then ([], true) else if typeCode = Foreign.LibFFI.ffiTypeCodeVoid then ([], false) else (loadMemory(ecx, esp, 4, nativeWordOpSize) :: loadHeapMemory(ecx, ecx, 0, nativeWordOpSize) :: (if size = 0w1 then (* Single byte *) [Move{source=RegisterArg eax, destination=MemoryArg resultMemory, moveSize=Move8}] else if size = 0w2 then (* 16-bits *) [Move{source=RegisterArg eax, destination=MemoryArg resultMemory, moveSize=Move16}] else if typeCode = Foreign.LibFFI.ffiTypeCodeFloat then [FPStoreToMemory{address=resultMemory, precision=SinglePrecision, andPop=true }] else if size = 0w4 then [Move{source=RegisterArg eax, destination=MemoryArg resultMemory, moveSize=Move32}] else if typeCode = Foreign.LibFFI.ffiTypeCodeDouble then [FPStoreToMemory{address=resultMemory, precision=DoublePrecision, andPop=true }] else if size = 0w8 then [ Move{source=RegisterArg eax, destination=MemoryArg resultMemory, moveSize=Move32}, Move{source=RegisterArg edx, destination=MemoryArg {base=ecx, offset=4, index=NoIndex}, moveSize=Move32} ] else raise Foreign.Foreign "Unrecognised result type"), false) local (* Load the arguments. If we need to pass the return address for a struct that is the first arg. *) val (startStack, startCode) = if needResultAddress then (4, [PushToStack(MemoryArg{base=ecx, offset=0, index=NoIndex})]) else (0, []) in val (argCode, argStack) = loadArgs32(args, startStack, 0w0, startCode, fn (stackOffset, _, code) => (code, stackOffset)) end local val align = argStack mod 16 in (* Always align the stack. It's not always necessary on 32-bits but GCC prefers it. *) val preArgAlign = if align = 0 then 0 else 16-align end in ( (* If we're returning a struct we need the result address before we call. *) if needResultAddress then [loadMemory(ecx, esp, 4, nativeWordOpSize)] else [] ) @ [ (* Save the stack pointer. *) storeMemory(esp, ebp, memRegStackPtr, nativeWordOpSize), (* Save ML stack and switch to C stack. *) loadMemory(esp, ebp, memRegCStackPtr, nativeWordOpSize) (* Load the saved C stack pointer. *) ] @ ( if preArgAlign = 0 then [] else [ArithToGenReg{opc=SUB, output=esp, source=NonAddressConstArg(LargeInt.fromInt preArgAlign), opSize=nativeWordOpSize}] ) @ ( (* The second argument is a SysWord containing the address of a malloced area of memory with the actual arguments in it. *) if null args then [] else [loadHeapMemory(mlArg2Reg, mlArg2Reg, 0, nativeWordOpSize)] ) @ argCode @ [ (* Call the function. We're discarding the value in rsp so no need to remove args. *) CallAddress(MemoryArg{base=eax, offset=0, index=NoIndex}), loadMemory(esp, ebp, memRegStackPtr, nativeWordOpSize) (* Restore the ML stack pointer. 
*) ] @ getResult @ (* Store the result in the destination. *) [ ReturnFromFunction 1 ] end (* Windows on X64. *) fun callWindows64Bits(args, result) = let val extraStackReg = r10 (* Not used for any arguments. *) fun loadWin64Args([], stackOffset, _, _, code, extraStack, preCode) = (code, stackOffset, preCode, extraStack) | loadWin64Args(arg::args, stackOffset, argOffset, regs, code, extraStack, preCode) = let val {size, align, typeCode, ...} = Foreign.LibFFI.extractFFItype arg val newArgOffset = alignUp(argOffset, align) val baseAddr = {base=mlArg2Reg, offset=Word.toInt newArgOffset, index=NoIndex} val workReg = rcx (* rcx: the last to be loaded. *) (* Integer arguments. *) fun loadIntArg moveOp = case regs of (areg, _) :: regs' => loadWin64Args(args, stackOffset, newArgOffset+size, regs', Move{source=MemoryArg baseAddr, destination=RegisterArg areg, moveSize=moveOp } :: code, extraStack, preCode) | [] => loadWin64Args(args, stackOffset+8, newArgOffset+size, [], if size = 0w8 then PushToStack(MemoryArg baseAddr) :: code else (* Need to load it into a register first. *) Move{source=MemoryArg baseAddr, destination=RegisterArg workReg, moveSize=moveOp } :: PushToStack(RegisterArg workReg) :: code, extraStack, preCode) in (* Structs of 1, 2, 4 and 8 bytes are passed as the corresponding int. It may not be necessary to sign-extend 1, 2 or 4-byte values. 2, 4 or 8-byte structs may not be aligned onto the appropriate boundary but it should still work. *) case size of 0w1 => if typeCode = Foreign.LibFFI.ffiTypeCodeSInt8 then (* Signed char. *) loadIntArg Move8X64 else (* Unsigned char or single byte struct *) loadIntArg Move8 | 0w2 => if typeCode = Foreign.LibFFI.ffiTypeCodeSInt16 then (* Signed 16-bits. *) loadIntArg Move16X64 else (* Unsigned 16-bits. *) loadIntArg Move16 | 0w4 => if typeCode = Foreign.LibFFI.ffiTypeCodeFloat then ( case regs of (_, fpReg) :: regs' => loadWin64Args(args, stackOffset, newArgOffset+size, regs', XMMArith{opc=SSE2MoveFloat, source=MemoryArg baseAddr, output=fpReg } :: code, extraStack, preCode) | [] => loadWin64Args(args, stackOffset+8, newArgOffset+size, [], Move{source=MemoryArg baseAddr, destination=RegisterArg workReg, moveSize=Move32 } :: PushToStack(RegisterArg workReg) :: code, extraStack, preCode) ) else if typeCode = Foreign.LibFFI.ffiTypeCodeSInt32 orelse typeCode = Foreign.LibFFI.ffiTypeCodeInt then (* Signed 32-bits. *) loadIntArg Move32X64 else (* Unsigned 32-bits. *) loadIntArg Move32 | 0w8 => if typeCode = Foreign.LibFFI.ffiTypeCodeDouble then ( case regs of (_, fpReg) :: regs' => loadWin64Args(args, stackOffset, newArgOffset+size, regs', XMMArith{opc=SSE2MoveDouble, source=MemoryArg baseAddr, output=fpReg } :: code, extraStack, preCode) | [] => loadWin64Args(args, stackOffset+8, newArgOffset+size, [], Move{source=MemoryArg baseAddr, destination=RegisterArg workReg, moveSize=Move64 } :: PushToStack(RegisterArg workReg) :: code, extraStack, preCode) ) else (* 64-bits. *) loadIntArg Move64 | _ => if typeCode <> Foreign.LibFFI.ffiTypeCodeStruct then raise Foreign.Foreign "Unrecognised type for function argument" else let (* Structures of other sizes are passed by reference. They are first copied into new areas on the stack. This ensures that the called function can update the structure without changing the original values. 
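For example, a 24-byte struct is copied into the extra stack area (its slot rounded up to a 16-byte multiple, so 32 bytes here) and the address of that copy is then passed in the next free integer register, or pushed once the registers have run out.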
*) val newExtra = intAlignUp(extraStack + Word.toInt size, 0w16) val newPreCode = moveMemory{source=(mlArg2Reg, Word.toInt newArgOffset), destination=(extraStackReg, extraStack), length=Word.toInt size} @ preCode in case regs of (areg, _) :: regs' => loadWin64Args(args, stackOffset, newArgOffset+size, regs', loadAddress{source=(extraStackReg, extraStack), destination=areg} :: code, newExtra, newPreCode) | [] => loadWin64Args(args, stackOffset+8, newArgOffset+size, [], loadAddress{source=(extraStackReg, extraStack), destination=workReg} :: PushToStack(RegisterArg workReg) :: code, newExtra, newPreCode) end end val {typeCode, size, ...} = Foreign.LibFFI.extractFFItype result val resultAreaPtr = r12 (* Saved value of r8 - This is callee save. *) val resultMemory = {base=resultAreaPtr, offset=0, index=NoIndex} fun storeIntValue moveOp = ([Move{source=RegisterArg eax, destination=MemoryArg resultMemory, moveSize=moveOp}], false) and storeFloatValue precision = ([XMMStoreToMemory{toStore=xmm0, address=resultMemory, precision=precision}], false) val (getResult, passStructAddress) = if typeCode = Foreign.LibFFI.ffiTypeCodeVoid then ([], false) else if size = 0w1 (* Includes structs *) then (* Single byte *) storeIntValue Move8 else if size = 0w2 then (* 16-bits *) storeIntValue Move16 else if typeCode = Foreign.LibFFI.ffiTypeCodeFloat then storeFloatValue SinglePrecision else if size = 0w4 then storeIntValue Move32 else if typeCode = Foreign.LibFFI.ffiTypeCodeDouble then storeFloatValue DoublePrecision else if size = 0w8 then storeIntValue Move64 else if typeCode = Foreign.LibFFI.ffiTypeCodeStruct then ([], true) else raise Foreign.Foreign "Unrecognised result type" val win64ArgRegs = [ (rcx, xmm0), (rdx, xmm1), (r8, xmm2), (r9, xmm3) ] (* argCode is the code to load and push the arguments. argStack is the amount of stack space the arguments will take. It's only used to ensure that the stack is aligned onto a 16-byte boundary. preArgCode is any code that is needed to copy the arguments before they are actually loaded. Because it is done before the argument registers are loaded it can use rcx, rdi and rsi. extraStack is local stack space needed. It is usually zero but if it is non-zero it must be a multiple of 16 bytes. The address of this area is loaded into r10 before preArgCode is called. *) val (argCode, argStack, preArgCode, extraStack) = if passStructAddress then (* The address of the result structure goes in the first argument register: rcx *) loadWin64Args(args, 0, 0w0, tl win64ArgRegs, [moveRR{source=resultAreaPtr, output=rcx, opSize=nativeWordOpSize}], 0, []) else loadWin64Args(args, 0, 0w0, win64ArgRegs, [], 0, []) local val align = argStack mod 16 in (* Always align the stack. *) val preArgAlign = if align = 0 then 0 else 16-align end in (* Save heap ptr. Needed in case we have a callback. *) [storeMemory(r15, ebp, memRegLocalMPointer, nativeWordOpSize)] @ ( (* Put the destination address into a callee save resgister. We have to put the C address in there now because an ML address wouldn't be updated by a possible GC in a callback. *) if #typeCode(Foreign.LibFFI.extractFFItype result) <> Foreign.LibFFI.ffiTypeCodeVoid then [loadHeapMemory(resultAreaPtr, r8, 0, nativeWordOpSize)] else [] ) @ [ (* Save the stack pointer. *) storeMemory(esp, ebp, memRegStackPtr, nativeWordOpSize), (* Save ML stack and switch to C stack. *) loadMemory(esp, ebp, memRegCStackPtr, nativeWordOpSize) (* Load the saved C stack pointer. 
*) ] @ ( if extraStack = 0 then [] else [ ArithToGenReg{opc=SUB, output=rsp, source=NonAddressConstArg(LargeInt.fromInt extraStack), opSize=nativeWordOpSize}, Move{source=RegisterArg rsp, destination=RegisterArg extraStackReg, moveSize=Move64} ] ) @ ( if preArgAlign = 0 then [] else [ArithToGenReg{opc=SUB, output=esp, source=NonAddressConstArg(LargeInt.fromInt preArgAlign), opSize=nativeWordOpSize}] ) @ ( (* The second argument is a SysWord containing the address of a malloced area of memory with the actual arguments in it. *) if null args then [] else [loadHeapMemory(mlArg2Reg, mlArg2Reg, 0, nativeWordOpSize)] ) @ preArgCode @ argCode @ [ (* Reserve a 32-byte area after the arguments. This is specific to the Windows ABI. *) ArithToGenReg{opc=SUB, output=esp, source=NonAddressConstArg(LargeInt.fromInt 32), opSize=nativeWordOpSize}, let (* The entry point is in a SysWord.word value in RAX. *) val entryPoint = case targetArch of ObjectId32Bit => MemoryArg{base=ebx, offset=0, index=Index4 eax} | _ => MemoryArg{base=eax, offset=0, index=NoIndex} in (* Call the function. We're discarding the value in rsp so no need to remove args. *) CallAddress entryPoint end, loadMemory(esp, ebp, memRegStackPtr, nativeWordOpSize), (* Restore the ML stack pointer. *) (* Reload the heap pointer. If we've called back to ML this could well have changed. *) loadMemory(r15, ebp, memRegLocalMPointer, nativeWordOpSize) ] @ (* Store the result in the destination. *) getResult @ [ReturnFromFunction 0 ] end (* callWindows64Bits *) local (* The rules for passing structs in SysV on X86/64 are complicated but most of the special cases don't apply. We don't support floating point larger than 8 bytes, packed structures or C++ constructors. It then reduces to the following: Structures of up to 8 bytes are passed in a single register and of 8-16 bytes in two registers. Larger structures are passed on the stack. The question is whether to use general registers or SSE2 registers. Each 8 byte chunk is considered independently after any internal structs have been unwrapped. Each chunk will consist of either a single 8-byte value (i.e.. a pointer, int64_t or a double) or one or more smaller values and possibly some padding. An SSE2 register is used if the value is a double, two floats or a single float and padding. Otherwise it must have at least one shorter int-like type (e.g. int, char, short etc) in which case a general register is used. That applies even if it also contains a float. If, having selected the kind of registers to be used, there are not enough for the whole struct it is passed on the stack. We don't really need this for simple arguments but it's easier to consider them all together. *) datatype argClass = ArgInMemory | ArgInRegs of { firstInSSE: bool, secondInSSE: bool } fun classifyArg arg = let val {size, ...} = Foreign.LibFFI.extractFFItype arg (* Unwrap the struct and any internal structs. 
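For example, under the rules above a struct of an int followed by a float flattens to an int field at offset 0 and a float at offset 4: the single eight-byte chunk contains an integer member, so it is passed in a general register. A struct of two floats would instead go in an SSE register, and anything larger than 16 bytes goes in memory.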
*) fun getFields([], _) = [] | getFields(field::fields, offset) = let val {size, align, typeCode, elements, ...} = Foreign.LibFFI.extractFFItype field val alignedOffset = alignUp(offset, align) (* Align this even if it's a sub-struct *) in if typeCode = Foreign.LibFFI.ffiTypeCodeVoid then raise Foreign.Foreign "Void cannot be used for a function argument" else if typeCode = Foreign.LibFFI.ffiTypeCodeStruct then getFields(elements, alignedOffset) @ getFields(fields, alignedOffset+size) else (typeCode, alignedOffset) :: getFields(fields, alignedOffset+size) end val isSSE = List.all (fn (tc, _) => tc = Foreign.LibFFI.ffiTypeCodeFloat orelse tc = Foreign.LibFFI.ffiTypeCodeDouble) in if size > 0w16 then ArgInMemory else let val fieldsAndOffsets = getFields([arg], 0w0) in if size <= 0w8 (* Only the first register will be used. *) then ArgInRegs{firstInSSE=isSSE fieldsAndOffsets, secondInSSE=false} else let val (first8Bytes, second8Bytes) = List.partition (fn (_, off) => off <= 0w8) fieldsAndOffsets in ArgInRegs{firstInSSE=isSSE first8Bytes, secondInSSE=isSSE second8Bytes} end end end in fun callUnix64Bits(args, result) = let - val sysVGenRegs = [rdi, rsi, rdx, rcx, r8, r9] - and sysVFPRegs = [xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7] - - val extraStackReg = r10 (* Not used for any arguments. *) + val argWorkReg = r10 (* Not used for any arguments. *) val resultAreaPtr = r12 (* Saved value of r8 - This is callee save. *) - fun loadSysV64Args([], stackOffset, _, _, _, code, extraStack, preCode) = (code, stackOffset, preCode, extraStack) + fun loadSysV64Args([], stackOffset, _, _, _, code, preCode) = (code, stackOffset, preCode) - | loadSysV64Args(arg::args, stackOffset, argOffset, gRegs, fpRegs, code, extraStack, preCode) = + | loadSysV64Args(arg::args, stackOffset, argOffset, gRegs, fpRegs, code, preCode) = let val {size, align, typeCode, ...} = Foreign.LibFFI.extractFFItype arg + + (* Load a value into a register. Normally the size will be 1, 2, 4 or 8 bytes and + this will just involve a simple load. Structs, though, can be of any size up + to 8 bytes. *) + fun loadRegister(reg, offset, size) = + let + (* We don't necessarily have to sign-extend. There's a comment in libffi that + suggests that LVM expects it even though the SysV ABI doesn't require it. 
*) + val moveOp = + if size = 0w8 + then Move64 + else if typeCode = Foreign.LibFFI.ffiTypeCodeSInt32 orelse typeCode = Foreign.LibFFI.ffiTypeCodeInt + then Move32X64 + else if size >= 0w4 + then Move32 + else if typeCode = Foreign.LibFFI.ffiTypeCodeSInt16 + then Move16X64 + else if size >= 0w2 + then Move16 + else if typeCode = Foreign.LibFFI.ffiTypeCodeSInt8 + then Move8X64 else Move8 + in + [Move{source=MemoryArg{base=mlArg2Reg, offset=Word.toInt offset, index=NoIndex}, destination=RegisterArg reg, moveSize=moveOp}] + end @ + ( + if size = 0w6 orelse size = 0w7 + then + [ + Move{source=MemoryArg{base=mlArg2Reg, offset=Word.toInt offset + 4, index=NoIndex}, + destination=RegisterArg argWorkReg, moveSize=Move16}, + ShiftConstant{ shiftType=SHL, output=argWorkReg, shift=0w32, opSize=OpSize64 }, + ArithToGenReg{ opc=OR, output=reg, source=RegisterArg argWorkReg, opSize=OpSize64 } + ] + else [] + ) @ + ( + if size = 0w3 orelse size = 0w5 orelse size = 0w7 + then + [ + Move{source=MemoryArg{base=mlArg2Reg, offset=Word.toInt offset + Word.toInt(size-0w1), index=NoIndex}, + destination=RegisterArg argWorkReg, moveSize=Move8}, + ShiftConstant{ shiftType=SHL, output=argWorkReg, shift=Word8.fromLargeWord(Word.toLargeWord((size-0w1)*0w8)), opSize=OpSize64 }, + ArithToGenReg{ opc=OR, output=reg, source=RegisterArg argWorkReg, opSize=OpSize64 } + ] + else [] + ) + val newArgOffset = alignUp(argOffset, align) - (* We have to load structs of sizes up to 16 bytes into one or more registers - but that includes odd sizes and we need to be careful we don't read beyond - the source argument area. We copy the struct onto the stack in the preCode - and then load from there. - TODO: We don't actually need to copy the struct if it's 1, 2, 4, 8, 12 or 16 bytes. - In that case it could be misaligned but that's safe on X86. *) - val (baseReg, addr, preCodePost, extraStackPost) = - if typeCode = Foreign.LibFFI.ffiTypeCodeStruct - then (extraStackReg, extraStack, - moveMemory{source=(mlArg2Reg, Word.toInt newArgOffset), - destination=(extraStackReg, extraStack), length=Word.toInt size} @ preCode, - extraStack+Word.toInt(alignUp(size, 0w16))) - else (mlArg2Reg, Word.toInt newArgOffset, preCode, extraStack) - val baseAddr1 = {base=baseReg, offset=addr, index=NoIndex} - and baseAddr2 = {base=baseReg, offset=addr+8, index=NoIndex} + val word1Addr = {base=mlArg2Reg, offset=Word.toInt newArgOffset, index=NoIndex} + val word2Addr = {base=mlArg2Reg, offset=Word.toInt newArgOffset + 8, index=NoIndex} in case (classifyArg arg, size > 0w8, gRegs, fpRegs) of - (* 8 bytes or smaller - single general reg *) + (* 8 bytes or smaller - single general reg. This is the usual case. *) (ArgInRegs{firstInSSE=false, ...}, false, gReg :: gRegs', fpRegs') => - (* We don't necessarily have to sign-extend. There's a comment in libffi that - suggests that LVM expects it even though the SysV ABI doesn't require it. *) - let - val moveOp = - case size of - 0w1 => if typeCode = Foreign.LibFFI.ffiTypeCodeSInt8 then Move8X64 else Move8 - | 0w2 => if typeCode = Foreign.LibFFI.ffiTypeCodeSInt16 then Move16X64 else Move16 - | 0w4 => if typeCode = Foreign.LibFFI.ffiTypeCodeSInt32 orelse typeCode = Foreign.LibFFI.ffiTypeCodeInt - then Move32X64 else Move32 - | 0w3 => Move32 (* It's been copied onto the stack. 
*) - | _ => Move64 - in loadSysV64Args(args, stackOffset, newArgOffset+size, gRegs', fpRegs', - Move{source=MemoryArg baseAddr1, destination=RegisterArg gReg, moveSize=moveOp} :: code, - extraStackPost, preCodePost) - end + loadRegister(gReg, newArgOffset, size) @ code, preCode) - (* 8 bytes or smaller - single SSE reg. *) + (* 8 bytes or smaller - single SSE reg. Usual case for real arguments. *) | (ArgInRegs{firstInSSE=true, ...}, false, gRegs', fpReg :: fpRegs') => loadSysV64Args(args, stackOffset, newArgOffset+size, gRegs', fpRegs', - XMMArith{opc=if size = 0w4 then SSE2MoveFloat else SSE2MoveDouble, source=MemoryArg baseAddr1, output=fpReg } :: code, - extraStackPost, preCodePost) + XMMArith{opc=if size = 0w4 then SSE2MoveFloat else SSE2MoveDouble, source=MemoryArg word1Addr, output=fpReg } :: code, + preCode) (* 9-16 bytes - both values in general regs. *) | (ArgInRegs{firstInSSE=false, secondInSSE=false}, true, gReg1 :: gReg2 :: gRegs', fpRegs') => loadSysV64Args(args, stackOffset, newArgOffset+size, gRegs', fpRegs', - Move{source=MemoryArg baseAddr1, destination=RegisterArg gReg1, moveSize=Move64} :: - Move{source=MemoryArg baseAddr2, destination=RegisterArg gReg2, moveSize=if size <= 0w12 then Move32 else Move64} :: code, - extraStackPost, preCodePost) + Move{source=MemoryArg word1Addr, destination=RegisterArg gReg1, moveSize=Move64} :: + loadRegister(gReg2, newArgOffset+0w8, size-0w8) @ code, preCode) (* 9-16 bytes - first in general, second in SSE. *) | (ArgInRegs{firstInSSE=false, secondInSSE=true}, true, gReg :: gRegs', fpReg :: fpRegs') => loadSysV64Args(args, stackOffset, newArgOffset+size, gRegs', fpRegs', - Move{source=MemoryArg baseAddr1, destination=RegisterArg gReg, moveSize=Move64} :: - XMMArith{opc=if size = 0w12 then SSE2MoveFloat else SSE2MoveDouble, source=MemoryArg baseAddr2, output=fpReg } :: code, - extraStackPost, preCodePost) + Move{source=MemoryArg word1Addr, destination=RegisterArg gReg, moveSize=Move64} :: + XMMArith{opc=if size = 0w12 then SSE2MoveFloat else SSE2MoveDouble, source=MemoryArg word2Addr, output=fpReg } :: code, + preCode) (* 9-16 bytes - first in SSE, second in general. *) | (ArgInRegs{firstInSSE=true, secondInSSE=false}, true, gReg :: gRegs', fpReg :: fpRegs') => loadSysV64Args(args, stackOffset, newArgOffset+size, gRegs', fpRegs', - XMMArith{opc=SSE2MoveDouble, source=MemoryArg baseAddr1, output=fpReg } :: - Move{source=MemoryArg baseAddr2, destination=RegisterArg gReg, moveSize=if size <= 0w12 then Move32 else Move64} :: code, - extraStackPost, preCodePost) + XMMArith{opc=SSE2MoveDouble, source=MemoryArg word1Addr, output=fpReg } :: + loadRegister(gReg, newArgOffset+0w8, size-0w8) @ code, + preCode) | (* 9-16 bytes - both values in SSE regs. *) - (ArgInRegs{firstInSSE=true, ...}, true, gRegs', fpReg1 :: fpReg2 :: fpRegs') => + (ArgInRegs{firstInSSE=true, secondInSSE=true}, true, gRegs', fpReg1 :: fpReg2 :: fpRegs') => loadSysV64Args(args, stackOffset, newArgOffset+size, gRegs', fpRegs', - XMMArith{opc=SSE2MoveDouble, source=MemoryArg baseAddr1, output=fpReg1 } :: + XMMArith{opc=SSE2MoveDouble, source=MemoryArg word1Addr, output=fpReg1 } :: XMMArith{opc=if size = 0w12 then SSE2MoveFloat else SSE2MoveDouble, - source=MemoryArg baseAddr2, output=fpReg2 } :: code, - extraStackPost, preCodePost) + source=MemoryArg word2Addr, output=fpReg2 } :: code, + preCode) | (_, _, gRegs', fpRegs') => (* Either larger than 16 bytes or we've run out of the right kind of registers. *) + (* Move the argument in the preCode. 
It's possible a large struct could be the first argument + and if we left it until the end RDI and RSI would already have been loaded. *) let - (* Move it to the stack. Align the area up to 8 bytes. *) val space = intAlignUp(Word.toInt size, 0w8) in - (* Move the argument in the preCode. It's possible a large struct could be the first argument - and if we left it until the end RDI and RSI would already have been loaded. - N.B. We use preCode rather than preCodePost here to save a double move. *) - loadSysV64Args(args, stackOffset+space, newArgOffset+size, gRegs', fpRegs', code, extraStack, + loadSysV64Args(args, stackOffset+space, newArgOffset+size, gRegs', fpRegs', code, ArithToGenReg{opc=SUB, output=rsp, source=NonAddressConstArg(LargeInt.fromInt space), opSize=nativeWordOpSize} :: moveMemory{source=(mlArg2Reg, Word.toInt newArgOffset), destination=(rsp, 0), length=Word.toInt size} @ preCode) end end - (* argCode is the code to load and push the arguments. argStack is the amount of stack space - the arguments will take. It's only used to ensure that the stack is aligned onto a 16-byte - boundary. preArgCode is any code that is needed to copy the arguments before they are - actually loaded. Because it is done before the argument registers are loaded it can - use rcx, rdi and rsi. On X86/64 Unix it also pushes any values that need to be written - to the stack. extraStack is local stack space needed. It is usually zero but - if it is non-zero it must be a multiple of 16 bytes. The address of this area is loaded - into r10 before preArgCode is called. *) - - val {typeCode, ...} = Foreign.LibFFI.extractFFItype result - - val resultMemory = {base=ecx, offset=0, index=NoIndex} + (* The rules for returning structs are similar to those for parameters. *) + local + (* Store a result register into the result area. In almost all cases + this is very simple: the only complication is with structs of odd sizes. *) + fun storeResult(reg, offset, size) = + let + val moveOp = + if size = 0w8 then Move64 else if size >= 0w4 then Move32 else if size >= 0w2 then Move16 else Move8 + in + [Move{source=RegisterArg reg, destination=MemoryArg {base=ecx, offset=offset, index=NoIndex}, moveSize=moveOp}] + end @ + ( + if size = 0w6 orelse size = 0w7 + then + [ + ShiftConstant{ shiftType=SHR, output=reg, shift=0w32, opSize=OpSize64 }, + Move{source=RegisterArg reg, destination=MemoryArg {base=ecx, offset=offset+4, index=NoIndex}, moveSize=Move16} + ] + else [] + ) @ + ( + if size = 0w3 orelse size = 0w5 orelse size = 0w7 + then + [ + ShiftConstant{ shiftType=SHR, output=reg, shift=Word8.fromLargeWord(Word.toLargeWord((size-0w1)*0w8)), opSize=OpSize64 }, + Move{source=RegisterArg reg, destination=MemoryArg {base=ecx, offset=offset+Word.toInt(size-0w1), index=NoIndex}, moveSize=Move8} + ] + else [] + ) + + val {size, typeCode, ...} = Foreign.LibFFI.extractFFItype result + in + val (getResult, passArgAddress) = + if typeCode = Foreign.LibFFI.ffiTypeCodeVoid + then ([], false) + else case (classifyArg result, size > 0w8) of + (* 8 bytes or smaller - returned in RAX - Normal case for int-like results. *) + (ArgInRegs{firstInSSE=false, ...}, false) => (storeResult(rax, 0, size), false) + + (* 8 bytes or smaller - returned in XMM0 - Normal case for real results. *) + | (ArgInRegs{firstInSSE=true, ...}, false) => + ([XMMStoreToMemory{toStore=xmm0, address={base=rcx, offset=0, index=NoIndex}, + precision=if size = 0w4 then SinglePrecision else DoublePrecision}], false) + + (* 9-16 bytes - returned in RAX/RDX. 
*) + | (ArgInRegs{firstInSSE=false, secondInSSE=false}, true) => + (storeResult(rax, 0, 0w8) @ storeResult(rdx, 8, size-0w8), false) + + (* 9-16 bytes - first in RAX, second in XMM0. *) + | (ArgInRegs{firstInSSE=false, secondInSSE=true}, true) => + (XMMStoreToMemory{toStore=xmm0, address={base=rcx, offset=8, index=NoIndex}, + precision=if size = 0w12 then SinglePrecision else DoublePrecision} :: + storeResult(rax, 0, 0w8), false) + + (* 9-16 bytes - first in XMM0, second in RAX. *) + | (ArgInRegs{firstInSSE=true, secondInSSE=false}, true) => + (XMMStoreToMemory{toStore=xmm0, address={base=rcx, offset=0, index=NoIndex}, precision=DoublePrecision} :: + storeResult(rax, 8, size-0w8), false) + + (* 9-16 bytes - both values in SSE regs.*) + | (ArgInRegs{firstInSSE=true, secondInSSE=true}, true) => + ([XMMStoreToMemory{toStore=xmm0, address={base=rcx, offset=0, index=NoIndex}, precision=DoublePrecision}, + XMMStoreToMemory{toStore=xmm1, address={base=rcx, offset=8, index=NoIndex}, + precision=if size = 0w12 then SinglePrecision else DoublePrecision}], false) + + | _ => ([], true) (* Have to pass the address of the area in memory *) + end - val getResult = - if typeCode = Foreign.LibFFI.ffiTypeCodeStruct - then raise Foreign.Foreign "Struct result not yet implemented" - else if typeCode = Foreign.LibFFI.ffiTypeCodeVoid - then [] - else if typeCode = Foreign.LibFFI.ffiTypeCodeSInt8 orelse typeCode = Foreign.LibFFI.ffiTypeCodeSInt8 - then (* Single byte *) [Move{source=RegisterArg eax, destination=MemoryArg resultMemory, moveSize=Move8}] - else if typeCode = Foreign.LibFFI.ffiTypeCodeUInt16 orelse typeCode = Foreign.LibFFI.ffiTypeCodeSInt16 - then (* 16-bits *) [Move{source=RegisterArg eax, destination=MemoryArg resultMemory, moveSize=Move16}] - else if typeCode = Foreign.LibFFI.ffiTypeCodeUInt32 orelse typeCode = Foreign.LibFFI.ffiTypeCodeSInt32 - orelse typeCode = Foreign.LibFFI.ffiTypeCodeInt - then [Move{source=RegisterArg eax, destination=MemoryArg resultMemory, moveSize=Move32}] - else if typeCode = Foreign.LibFFI.ffiTypeCodeUInt64 orelse typeCode = Foreign.LibFFI.ffiTypeCodeSInt64 - orelse typeCode = Foreign.LibFFI.ffiTypeCodePointer - then [Move{source=RegisterArg eax, destination=MemoryArg resultMemory, moveSize=Move64}] - else if typeCode = Foreign.LibFFI.ffiTypeCodeFloat - then [XMMStoreToMemory{toStore=xmm0, address=resultMemory, precision=SinglePrecision}] - else if typeCode = Foreign.LibFFI.ffiTypeCodeDouble - then [XMMStoreToMemory{toStore=xmm0, address=resultMemory, precision=DoublePrecision}] - else raise Foreign.Foreign "Unrecognised result type" + val sysVGenRegs = [rdi, rsi, rdx, rcx, r8, r9] + and sysVFPRegs = [xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7] - val (argCode, argStack, preArgCode, extraStack) = - loadSysV64Args(args, 0, 0w0, sysVGenRegs, sysVFPRegs, [], 0, []) + val (argCode, argStack, preArgCode) = + if passArgAddress (* If we have to pass the address of the result struct it goes in rdi. *) + then loadSysV64Args(args, 0, 0w0, tl sysVGenRegs, sysVFPRegs, + [moveRR{source=resultAreaPtr, output=rdi, opSize=nativeWordOpSize}], []) + else loadSysV64Args(args, 0, 0w0, sysVGenRegs, sysVFPRegs, [], []) local val align = argStack mod 16 in (* Always align the stack. *) val preArgAlign = if align = 0 then 0 else 16-align end in (* Save heap ptr. Needed in case we have a callback. *) [storeMemory(r15, ebp, memRegLocalMPointer, nativeWordOpSize)] @ ( (* Put the destination address into a callee save register.
We have to put the C address in there now because an ML address wouldn't be updated by a possible GC in a callback. *) if #typeCode(Foreign.LibFFI.extractFFItype result) <> Foreign.LibFFI.ffiTypeCodeVoid then [loadHeapMemory(resultAreaPtr, r8, 0, nativeWordOpSize)] else [] ) @ [ (* Save the stack pointer. *) storeMemory(esp, ebp, memRegStackPtr, nativeWordOpSize), (* Save ML stack and switch to C stack. *) loadMemory(esp, ebp, memRegCStackPtr, nativeWordOpSize) (* Load the saved C stack pointer. *) ] @ - ( - if extraStack = 0 - then [] - else - [ - ArithToGenReg{opc=SUB, output=rsp, source=NonAddressConstArg(LargeInt.fromInt extraStack), opSize=nativeWordOpSize}, - Move{source=RegisterArg rsp, destination=RegisterArg extraStackReg, moveSize=Move64} - ] - ) @ ( if preArgAlign = 0 then [] else [ArithToGenReg{opc=SUB, output=esp, source=NonAddressConstArg(LargeInt.fromInt preArgAlign), opSize=nativeWordOpSize}] ) @ ( (* The second argument is a SysWord containing the address of a malloced area of memory with the actual arguments in it. *) if null args then [] else [loadHeapMemory(mlArg2Reg, mlArg2Reg, 0, nativeWordOpSize)] ) @ preArgCode @ argCode @ [ let (* The entry point is in a SysWord.word value in RAX. *) val entryPoint = case targetArch of ObjectId32Bit => MemoryArg{base=ebx, offset=0, index=Index4 eax} | _ => MemoryArg{base=eax, offset=0, index=NoIndex} in (* Call the function. We're discarding the value in rsp so no need to remove args. *) CallAddress entryPoint end, loadMemory(esp, ebp, memRegStackPtr, nativeWordOpSize), (* Restore the ML stack pointer. *) (* Reload the heap pointer. If we've called back to ML this could well have changed. *) loadMemory(r15, ebp, memRegLocalMPointer, nativeWordOpSize) ] @ (* Store the result in the destination. *) getResult @ [ ReturnFromFunction 0 ] end (* callUnix64Bits *) end fun foreignCall(abivalue: Foreign.LibFFI.abi, args: Foreign.LibFFI.ffiType list, result: Foreign.LibFFI.ffiType): Address.machineWord = let val abi = getFFIAbi abivalue val code = case abi of FFI_UNIX64 => callUnix64Bits(args, result) | FFI_WIN64 => callWindows64Bits(args, result) | _ => call32Bits(abi, args, result) val functionName = "foreignCall" val debugSwitches = [Universal.tagInject Pretty.compilerOutputTag (Pretty.prettyPrint(print, 70)), Universal.tagInject DEBUG.assemblyCodeTag true] val profileObject = createProfileObject functionName val newCode = codeCreate (functionName, profileObject, debugSwitches) val closure = makeConstantClosure() val () = X86OPTIMISE.generateCode{code=newCode, labelCount=0, ops=code, resultClosure=closure} in closureAsAddress closure end (* Build a callback function. The arguments are the abi, the list of argument types and the result type. The result is an ML function that takes an ML function, f, as its argument, registers it as a callback and returns the C function as its result. When the C function is called the arguments are copied into temporary memory and the vector passed to f along with the address of the memory for the result. "f" stores the result in it when it returns and the result is then passed back as the result of the callback. 
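ForeignTest.ML below shows the user-level view of this mechanism: buildClosure2(fn (i, j) => i + j, (cInt, cInt), cInt) produces a value that can be handed to a C function expecting an int (*)(int, int) callback.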
*) fun buildCallBack(abivalue: Foreign.LibFFI.abi, args: Foreign.LibFFI.ffiType list, result: Foreign.LibFFI.ffiType): Address.machineWord = let val abi = getFFIAbi abivalue in raise Fail "TODO: foreignCall" end end; diff --git a/samplecode/Foreign/Foreign.def b/samplecode/Foreign/Foreign.def index f50c23ec..0ca19806 100644 --- a/samplecode/Foreign/Foreign.def +++ b/samplecode/Foreign/Foreign.def @@ -1,15 +1,17 @@ ; Exports file for DLL LIBRARY "Foreign" EXPORTS DupNString SumTree ReturnR2 + ReturnR4 + ReturnR6 MakeCallback MakeCallback2 MakeCallback3 AllocateIt FreeIt UpdateArg ReturnFn diff --git a/samplecode/Foreign/ForeignTest.ML b/samplecode/Foreign/ForeignTest.ML index 7ae662aa..27d42246 100644 --- a/samplecode/Foreign/ForeignTest.ML +++ b/samplecode/Foreign/ForeignTest.ML @@ -1,148 +1,156 @@ (* Example code for the updated foreign-function interface. Copyright David C.J. Matthews 2015 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) open Foreign; val mylib = loadLibrary "Foreign"; (* Example of creating a conversion for a datatype. *) datatype intTree = NullTree | Node of {left: intTree, right: intTree, valu: int}; (* The corresponding C structure is typedef struct _tree { struct _tree *left, *right; int nValue; } *tree; *) local (* Start with the C structure. *) val treeNode = cStruct3(cPointer, cPointer, cInt) val {store=storeStruct, load=loadStruct, ctype = {size = sizeStruct, ...}, ... } = breakConversion treeNode in (* The following function builds a C data structure from an ML datatype. *) fun treeMake NullTree = Memory.null | treeMake(Node{left, right, valu}) = let val mem = Memory.malloc sizeStruct in ignore(storeStruct(mem, (treeMake left, treeMake right, valu))); mem end fun treeClear a = if a = Memory.null then () else let val (left, right, _) = loadStruct a in treeClear left; treeClear right; Memory.free a end fun treeStore(addr, tree) = let val mem = treeMake tree in Memory.setAddress(addr, 0w0, mem); (* The store function returns a function that frees the memory. *) fn () => treeClear mem end (* The inverse of treeStore. We don't actually use this in this example. *) fun treeGet a = if a = Memory.null then NullTree else let val (left, right, valu) = loadStruct a in Node{left=treeGet left, right=treeGet right, valu = valu } end fun treeLoad v = treeGet(Memory.getAddress(v, 0w0)) end; (* Build a conversion out of this. *) val cTree: intTree conversion = makeConversion { load = treeLoad, store = treeStore, ctype = LowLevel.cTypePointer }; val sumTree = buildCall1 (getSymbol mylib "SumTree", cTree, cInt); val aTree = Node{left=Node{left=NullTree, right=NullTree, valu=4}, right=Node{ left= Node{left=NullTree, right=NullTree, valu=3}, right=NullTree, valu=5}, valu = 7}; sumTree aTree; (* Example of returning a structure. 
*) val returnR2 = buildCall2 (getSymbol mylib "ReturnR2", (cInt, cInt), cStruct2(cInt, cInt)); returnR2(5,6); +val returnR4 = buildCall2 (getSymbol mylib "ReturnR4", (cInt, cInt), cStruct4(cInt, cInt, cFloat, cFloat)); +returnR4(5,6); + + +val returnR6 = + buildCall2 (getSymbol mylib "ReturnR6", (cInt, cInt), cStruct6(cInt, cInt, cInt, cInt, cInt, cInt)); +returnR6(5,6); + (* Example of passing and returning strings. *) val dupNString = buildCall2 (getSymbol mylib "DupNString", (cInt, cString), cString); dupNString (4, "hi"); (* Example of a callback function. *) fun f (i, j) = (PolyML.print(i, j); i+j); val fAsCFunction = buildClosure2(f, (cInt, cInt), cInt); val doAdd = buildCall2 (getSymbol mylib "MakeCallback", (cInt, cFunction: (int*int->int) closure conversion), cInt); doAdd(4, fAsCFunction); fun myCallback(a: int, b: char, c: real, d: real, e: int, f: Memory.voidStar) = ( PolyML.print(a, b, c, d, e); 99.0 ); local val myCallbackC = buildClosure6(myCallback, (cInt, cChar, cDouble, cFloat, cShort, cPointer), cDouble) val MakeCallback2 = buildCall1(getSymbol mylib "MakeCallback2", cFunction, cDouble) in val returnR3 = MakeCallback2 myCallbackC end; let val f = buildClosure1(fn i => print(Int.toString i), cInt, cVoid) val doit = buildCall2(getSymbol mylib "MakeCallback3", (cFunction, cInt), cVoid) in doit(f, 2) end; (* Call-by-reference. *) val r = ref 6; val updateArg = buildCall2 (getSymbol mylib "UpdateArg", (cInt, cStar cInt), cVoid); updateArg(5, r); (* Adds its first argument to the ref. *) !r; diff --git a/samplecode/Foreign/ForeignTest.c b/samplecode/Foreign/ForeignTest.c index 66f54f19..7e8ea35d 100644 --- a/samplecode/Foreign/ForeignTest.c +++ b/samplecode/Foreign/ForeignTest.c @@ -1,119 +1,151 @@ /* Example code for a C-library accessible from ML using the CInterface structure. Copyright David C.J. Matthews 1999, 2009, 2015 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ /* Linux: cc -shared -o Foreign ForeignTest.c Windows: cl /MT ForeignTest.c /link /dll /out:Foreign.dll /def:Foreign.def Mac OS X : cc -dynamiclib -o Foreign.dylib ForeignTest.c */ #include <stdlib.h> #include <string.h> #include <stdio.h> /* Return a string duplicated n Times. */ char *DupNString(int n, char *str) { int nSize = strlen(str); char *res = malloc(n*nSize + 1); int i; *res = 0; for (i = 0; i < n; i++) strcat(res, str); return res; } typedef struct _tree { struct _tree *left, *right; int nValue; } *tree; int SumTree(tree t) { if (t == NULL) return 0; else return t->nValue + SumTree(t->left) + SumTree(t->right); } /* Tests to see whether small structures are returned specially. */ struct r2 { int n, m; }; struct r2 ReturnR2(int n, int m) { struct r2 ret; ret.n = n + 1; ret.m = m + 1; return ret; } +// The SysV ABI for X64 Unix puts these in RAX/XMM0.
+struct r4 { + int a, b; + float c, d; +}; + +struct r4 ReturnR4(int n, int m) +{ + struct r4 ret; + ret.a = n + 1; + ret.b = m + 1; + ret.c = n + 2; + ret.d = m + 2; + return ret; +} + +struct r6 { + int a, b, c, d, e, f; +}; + +struct r6 ReturnR6(int n, int m) +{ + struct r6 ret; + ret.a = n + 1; + ret.b = m + 1; + ret.c = n + 2; + ret.d = m + 2; + ret.e = n + 3; + ret.f = m + 3; + return ret; +} + /* Added. Callback function. */ typedef int(*INT_INT_CALLBACK) (int a, int b); int MakeCallback(int i, INT_INT_CALLBACK p) { return (*p)(i, 4) + (*p)(i + 1, 5); } /* Another callback function. This tests the various argument types. */ typedef double(*DBL_CALLBACK) (int a, char b, double c, float d, short e, int *f); double MakeCallback2(DBL_CALLBACK p) { int x = 1; double y = p(12345, 'X', (double)1.414, (float)2.8, 44, &x); return y; } // Check that void results work for callbacks. void MakeCallback3(void(*mlcall)(int), int i) { mlcall(i + 1); } /* Test for finalisation. */ void *AllocateIt() { void *p = malloc(1); printf("Allocated object at %p\n", p); fflush(stdout); return p; } void FreeIt(void *p) { printf("Freed object at %p\n", p); fflush(stdout); free(p); } /* Test for call-by-reference. Added in updated FFI. */ void UpdateArg(int i, int *p) { *p += i; } /* Test for returning a function. Added in updated FFI. */ static int acallBack(int q) { return q * 2; } typedef int(*CB)(int); void ReturnFn(CB *v) { *v = acallBack; }
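The new tests cover two of the struct-return paths added above: ReturnR4 exercises the 9-16 byte case with the integer eightbyte returned in RAX and the floating-point eightbyte in XMM0, and ReturnR6 exercises the over-16-byte case that comes back through a hidden pointer. The reverse ordering, first eightbyte in XMM0 and second in RAX, is also handled by the X86FOREIGNCALL.sml change but is not exercised here. A sketch of such a test, assuming a hypothetical ReturnRMixed export that is not part of this patch, might look like this:

/* Hypothetical addition to ForeignTest.c, illustrative only.
   Under the SysV X64 ABI the double is the first eightbyte (SSE class,
   returned in XMM0) and the two ints form the second eightbyte
   (INTEGER class, returned in RAX). */
struct rMixed {
    double c;
    int a, b;
};

struct rMixed ReturnRMixed(int n, int m)
{
    struct rMixed ret;
    ret.c = n + m;   /* returned in XMM0 */
    ret.a = n + 1;   /* low 32 bits of RAX */
    ret.b = m + 1;   /* high 32 bits of RAX */
    return ret;
}

On the ML side it would be bound in the same way as the returnR4 test, for example with buildCall2 (getSymbol mylib "ReturnRMixed", (cInt, cInt), cStruct3(cDouble, cInt, cInt)), and the symbol would also need to be listed in Foreign.def.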