diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ForeignCall.sml b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ForeignCall.sml index 348ddffe..6386e6f0 100644 --- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ForeignCall.sml +++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ForeignCall.sml @@ -1,692 +1,740 @@ (* Copyright (c) 2021 David C. J. Matthews This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public Licence version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public Licence for more details. You should have received a copy of the GNU Lesser General Public Licence along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *) functor Arm64ForeignCall( structure CodeArray: CODEARRAYSIG and Arm64Assembly: Arm64Assembly and Debug: DEBUG and Arm64Sequences: Arm64Sequences sharing CodeArray.Sharing = Arm64Assembly.Sharing = Arm64Sequences.Sharing ): FOREIGNCALLSIG = struct open CodeArray Arm64Assembly Arm64Sequences exception InternalError = Misc.InternalError and Foreign = Foreign.Foreign datatype fastArgs = FastArgFixed | FastArgDouble | FastArgFloat val makeEntryPoint: string -> machineWord = RunCall.rtsCallFull1 "PolyCreateEntryPointObject" local (* Both doubles and sysword values occupy a single heap cell. *) fun allocateAWord(fixedReg, workReg) = let val label = createLabel() in [ (* Subtract the number of bytes required from the heap pointer. *) subImmediate{regN=X_MLHeapAllocPtr, regD=fixedReg, immed=0w16, shifted=false}, (* Compare the result with the heap limit. *) subSShiftedReg{regM=X_MLHeapLimit, regN=fixedReg, regD=XZero, shift=ShiftNone}, conditionalBranch(condCarrySet, label), loadRegScaled{regT=X16, regN=X_MLAssemblyInt, unitOffset=heapOverflowCallOffset}, branchAndLinkReg X16, registerMask [], (* Not used at the moment. *) setLabel label, (* Update the heap pointer. *) moveRegToReg{sReg=fixedReg, dReg=X_MLHeapAllocPtr} ] @ loadNonAddress(workReg, Word64.orb(0w1, Word64.<<(Word64.fromLarge(Word8.toLarge Address.F_bytes), 0w56))) @ [ (* Store the length word. Have to use the unaligned version because offset is -ve. *) storeRegUnscaled{regT=workReg, regN=fixedReg, byteOffset= ~8} ] end in fun boxDouble(floatReg, fixedReg, workReg) = allocateAWord(fixedReg, workReg) @ [storeRegScaledDouble{regT=floatReg, regN=fixedReg, unitOffset=0}] and boxSysWord(toBoxReg, fixedReg, workReg) = allocateAWord(fixedReg, workReg) @ [storeRegScaled{regT=toBoxReg, regN=fixedReg, unitOffset=0}] end fun rtsCallFastGeneral (functionName, argFormats, resultFormat, debugSwitches) = let val entryPointAddr = makeEntryPoint functionName (* The maximum we currently have is five so we don't need to worry about stack args. *) fun loadArgs([], _, _, _) = [] | loadArgs(FastArgFixed :: argTypes, srcReg :: srcRegs, fixed :: fixedRegs, fpRegs) = if srcReg = fixed then loadArgs(argTypes, srcRegs, fixedRegs, fpRegs) (* Already in the right reg *) else moveRegToReg{sReg=srcReg, dReg=fixed} :: loadArgs(argTypes, srcRegs, fixedRegs, fpRegs) | loadArgs(FastArgDouble :: argTypes, srcReg :: srcRegs, fixedRegs, fp :: fpRegs) = (* Unbox the value into a fp reg. *) loadRegScaledDouble{regT=fp, regN=srcReg, unitOffset=0} :: loadArgs(argTypes, srcRegs, fixedRegs, fpRegs) | loadArgs(FastArgFloat :: argTypes, srcReg :: srcRegs, fixedRegs, fp :: fpRegs) = (* Untag and move into the fp reg *) logicalShiftRight{shift=0w32, regN=srcReg, regD=srcReg} :: moveGeneralToFloat{regN=srcReg, regD=fp} :: loadArgs(argTypes, srcRegs, fixedRegs, fpRegs) | loadArgs _ = raise InternalError "rtsCall: Too many arguments" val instructions = loadArgs(argFormats, (* ML Arguments *) [X0, X1, X2, X3, X4, X5, X6, X7], (* C fixed pt args *) [X0, X1, X2, X3, X4, X5, X6, X7], (* C floating pt args *) [V0, V1, V2, V3, V4, V5, V6, V7]) @ [ (* Move X30 to X24, a callee-save register. *) (* Note: maybe we should push X24 just in case this is the only reachable reference to the code. *) orrShiftedReg{regN=XZero, regM=X_LinkReg, regD=X24, shift=ShiftNone}, loadAddressConstant(X16, entryPointAddr), (* Load entry point *) loadRegScaled{regT=X16, regN=X16, unitOffset=0}, (* Load the actual address. *) (* Store the current heap allocation pointer. *) storeRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset}, (* For the moment save and restore the ML stack pointer. No RTS call should change it and it's callee-save but just in case... *) storeRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset}, branchAndLinkReg X16, (* Call the function. *) (* Restore the ML stack pointer. *) loadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset}, (* Load the heap allocation ptr and limit. We could have GCed in the RTS call. *) loadRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset}, loadRegScaled{regT=X_MLHeapLimit, regN=X_MLAssemblyInt, unitOffset=heapLimitPtrOffset} ] @ ( case resultFormat of FastArgFixed => [] | FastArgDouble => (* This must be boxed. *) boxDouble(V0, X0, X1) | FastArgFloat => (* This must be tagged *) [ moveFloatToGeneral{regN=V0, regD=X0}, logicalShiftLeft{shift=0w32, regN=X0, regD=X0}, bitwiseOrImmediate{regN=X0, regD=X0, bits=0w1} ] ) @ [ returnRegister X24 ] val closure = makeConstantClosure() val () = generateCode{instrs=instructions, name=functionName, parameters=debugSwitches, resultClosure=closure} in closureAsAddress closure end fun rtsCallFast (functionName, nArgs, debugSwitches) = rtsCallFastGeneral (functionName, List.tabulate(nArgs, fn _ => FastArgFixed), FastArgFixed, debugSwitches) (* RTS call with one double-precision floating point argument and a floating point result. *) fun rtsCallFastRealtoReal (functionName, debugSwitches) = rtsCallFastGeneral (functionName, [FastArgDouble], FastArgDouble, debugSwitches) (* RTS call with two double-precision floating point arguments and a floating point result. *) fun rtsCallFastRealRealtoReal (functionName, debugSwitches) = rtsCallFastGeneral (functionName, [FastArgDouble, FastArgDouble], FastArgDouble, debugSwitches) (* RTS call with one double-precision floating point argument, one fixed point argument and a floating point result. *) fun rtsCallFastRealGeneraltoReal (functionName, debugSwitches) = rtsCallFastGeneral (functionName, [FastArgDouble, FastArgFixed], FastArgDouble, debugSwitches) (* RTS call with one general (i.e. ML word) argument and a floating point result. This is used only to convert arbitrary precision values to floats. *) fun rtsCallFastGeneraltoReal (functionName, debugSwitches) = rtsCallFastGeneral (functionName, [FastArgFixed], FastArgDouble, debugSwitches) (* Operations on Real32.real values. *) fun rtsCallFastFloattoFloat (functionName, debugSwitches) = rtsCallFastGeneral (functionName, [FastArgFloat], FastArgFloat, debugSwitches) fun rtsCallFastFloatFloattoFloat (functionName, debugSwitches) = rtsCallFastGeneral (functionName, [FastArgFloat, FastArgFloat], FastArgFloat, debugSwitches) (* RTS call with one double-precision floating point argument, one fixed point argument and a floating point result. *) fun rtsCallFastFloatGeneraltoFloat (functionName, debugSwitches) = rtsCallFastGeneral (functionName, [FastArgFloat, FastArgFixed], FastArgFloat, debugSwitches) (* RTS call with one general (i.e. ML word) argument and a floating point result. This is used only to convert arbitrary precision values to floats. *) fun rtsCallFastGeneraltoFloat (functionName, debugSwitches) = rtsCallFastGeneral (functionName, [FastArgFixed], FastArgFloat, debugSwitches) (* There is only one ABI value. *) datatype abi = ARM64Abi fun abiList () = [("default", ARM64Abi)] fun alignUp(s, align) = Word.andb(s + align-0w1, ~ align) val getThreadDataCall = makeEntryPoint "PolyArm64GetThreadData" (* This must match the type in Foreign.LowLevel. Once this is bootstrapped we could use that type but note that this is the type we use within the compiler and we build Foreign.LowLevel AFTER compiling this. *) datatype cTypeForm = CTypeFloatingPt | CTypePointer | CTypeSignedInt | CTypeUnsignedInt | CTypeStruct of cType list | CTypeVoid withtype cType = { typeForm: cTypeForm, align: word, size: word } (* Load a value into a register. Normally the size will be 1, 2, 4 or 8 bytes and this will just involve a simple load. Structs, though, can be of any size up to 8 bytes. *) fun loadUpTo8(reg, base, offset, size, workReg) = let (* The argument is put in the least-significant bits implying we don't sign-extend. *) val loadOp = if size = 0w8 then loadRegUnscaled else if size >= 0w4 then loadRegUnscaled32 else if size >= 0w2 then loadRegUnscaled16 else loadRegUnscaledByte in [loadOp{regT=reg, regN=base, byteOffset=Word.toInt offset}] end @ ( if size = 0w6 orelse size = 0w7 then [ loadRegUnscaled16{regT=workReg, regN=base, byteOffset=Word.toInt offset + 4}, orrShiftedReg{regM=workReg, regN=reg, regD=reg, shift=ShiftLSL 0w32} ] else [] ) @ ( if size = 0w3 orelse size = 0w5 orelse size = 0w7 then [ loadRegUnscaledByte{regT=workReg, regN=base, byteOffset=Word.toInt offset + Word.toInt(size-0w1)}, orrShiftedReg{regM=workReg, regN=reg, regD=reg, shift=ShiftLSL((size-0w1)*0w8)} ] else [] ) (* Store a register into upto 8 bytes. Most values will involve a single store but odd-sized structs can require shifts and multiple stores. N.B. May modify the source register. *) and storeUpTo8(reg, base, offset, size) = let val storeOp = if size = 0w8 then storeRegUnscaled else if size >= 0w4 then storeRegUnscaled32 - else if size >= 0w2 then storeRegUnscaled16 else loadRegUnscaledByte + else if size >= 0w2 then storeRegUnscaled16 else storeRegUnscaledByte in [storeOp{regT=reg, regN=base, byteOffset=offset}] end @ ( if size = 0w6 orelse size = 0w7 then [ logicalShiftRight{regN=reg, regD=reg, shift=0w32 }, storeRegUnscaled16{regT=reg, regN=base, byteOffset=offset+4} ] else [] ) @ ( if size = 0w3 orelse size = 0w5 orelse size = 0w7 then [ logicalShiftRight{regN=reg, regD=reg, shift=(size-0w1)*0w8 }, - storeRegUnscaledByte{regT=reg, regN=base, byteOffset=Word.toInt(size-0w1)} + storeRegUnscaledByte{regT=reg, regN=base, byteOffset=offset+Word.toInt(size-0w1)} ] else [] ) (* Extract the elements of structures. *) fun unwrap(CTypeStruct ctypes, _) = List.foldr(fn({typeForm, size, ...}, l) => unwrap(typeForm, size) @ l) [] ctypes | unwrap (ctype, size) = [(ctype, size)] (* Structures of up to four floating point values of the same precision are treated specially. *) datatype argClass = ArgClassHFA of Word8.word * bool (* 1 - 4 floating pt values *) | ArgLargeStruct (* > 16 bytes and not an HFA *) | Arg9To16Bytes (* Two X registers *) | Arg1To8Bytes (* One X register *) fun classifyArg(ctype, size) = case unwrap (ctype, size) of [(CTypeFloatingPt, 0w4)] => ArgClassHFA(0w1, false) | [(CTypeFloatingPt, 0w4), (CTypeFloatingPt, 0w4)] => ArgClassHFA(0w2, false) | [(CTypeFloatingPt, 0w4), (CTypeFloatingPt, 0w4), (CTypeFloatingPt, 0w4)] => ArgClassHFA(0w3, false) | [(CTypeFloatingPt, 0w4), (CTypeFloatingPt, 0w4), (CTypeFloatingPt, 0w4), (CTypeFloatingPt, 0w4)] => ArgClassHFA(0w4, false) | [(CTypeFloatingPt, 0w8)] => ArgClassHFA(0w1, true) | [(CTypeFloatingPt, 0w8), (CTypeFloatingPt, 0w8)] => ArgClassHFA(0w2, true) | [(CTypeFloatingPt, 0w8), (CTypeFloatingPt, 0w8), (CTypeFloatingPt, 0w8)] => ArgClassHFA(0w3, true) | [(CTypeFloatingPt, 0w8), (CTypeFloatingPt, 0w8), (CTypeFloatingPt, 0w8), (CTypeFloatingPt, 0w8)] => ArgClassHFA(0w4, true) | _ => if size > 0w16 then ArgLargeStruct else if size > 0w8 then Arg9To16Bytes else Arg1To8Bytes (* This builds a piece of code that takes three arguments and returns a unit result. All three arguments are SysWord.word values i.e. ML addresses containing the address of the actual C value. The first argument (X0) is the address of the function to call. The second argument (X1) points to a struct that contains the argument(s) for the function. The arguments have to be unpacked from the struct into the appropriate registers or to the C stack. The third argument (X2) points to a piece of memory to receive the result of the call. It may be empty if the function returns void. It may only be as big as required for the result type. *) fun foreignCall(_: abi, args: cType list, result: cType): Address.machineWord = let val resultAreaPtr = X19 (* Unboxed value from X2 - This is callee save. *) val argPtrReg = X9 (* A scratch register that isn't used for arguments. *) val entryPtReg = X16 (* Contains the address of the function to call. *) val argWorkReg = X10 (* Used in loading arguments if necessary. *) and argWorkReg2 = X11 and structSpacePtr = X12 and argWorkReg3 = X13 and argWorkReg4 = X14 fun loadArgs([], stackOffset, _, _, _, code, largeStructSpace) = (code, stackOffset, largeStructSpace) | loadArgs(arg::args, stackOffset, argOffset, gRegNo, fpRegNo, code, largeStructSpace) = let val {size, align, typeForm, ...} = arg val newArgOffset = alignUp(argOffset, align) in case classifyArg(typeForm, size) of ArgClassHFA(numItems, isDouble) => if fpRegNo + numItems <= 0w8 then let val scale = if isDouble then 0w8 else 0w4 (* Load the values to the floating point registers. *) fun loadFPRegs(0w0, _, _) = [] | loadFPRegs(0w1, fpRegNo, offset) = [(if isDouble then loadRegScaledDouble else loadRegScaledFloat) {regT=VReg fpRegNo, regN=argPtrReg, unitOffset=offset}] | loadFPRegs(n, fpRegNo, offset) = (if isDouble then loadPairOffsetDouble else loadPairOffsetFloat) {regT1=VReg fpRegNo, regT2=VReg(fpRegNo+0w1), regN=argPtrReg, unitOffset=offset} :: loadFPRegs(n-0w2, fpRegNo+0w2, offset+2) in loadArgs(args, stackOffset, newArgOffset+size, gRegNo, fpRegNo+numItems, loadFPRegs(numItems, fpRegNo, Word.toInt(newArgOffset div scale)) @ code, largeStructSpace) end else let (* If we have insufficient number of registers we discard any that are left and push the argument to the stack. *) (* The floating point value or structure is copied to the stack as a contiguous area. Use general registers to copy the data. It could be on a 4-byte alignment. In the typical case of a single floating point value this will just be a single load and store. *) fun copyData(0w0, _, _) = [] | copyData(n, srcOffset, stackOffset) = if isDouble then loadRegScaled{regT=argWorkReg2, regN=argPtrReg, unitOffset=srcOffset} :: storeRegScaled{regT=argWorkReg2, regN=XSP, unitOffset=stackOffset} :: copyData(n-0w1, srcOffset+1, stackOffset+1) else loadRegScaled32{regT=argWorkReg2, regN=argPtrReg, unitOffset=srcOffset} :: storeRegScaled32{regT=argWorkReg2, regN=XSP, unitOffset=stackOffset} :: copyData(n-0w1, srcOffset+1, stackOffset+1) val copyToStack = if isDouble then copyData(numItems, Word.toInt(newArgOffset div 0w8), stackOffset) else copyData(numItems, Word.toInt(newArgOffset div 0w4), stackOffset*2) (* The overall size is rounded up to a multiple of 8 *) val newStackOffset = stackOffset + Word.toInt(alignUp(size, 0w8) div 0w8) in loadArgs(args, newStackOffset, newArgOffset+size, gRegNo, 0w8, copyToStack @ code, largeStructSpace) end | ArgLargeStruct => (* Larger structures are passed by reference. *) let (* Structures of other sizes are passed by reference. They are first copied into new areas on the stack. This ensures that the called function can update the structure without changing the original values. *) val newStructSpace = alignUp(largeStructSpace + size, 0w16) val loopLabel = createLabel() val argRegNo = if gRegNo < 0w8 then XReg gRegNo else argWorkReg (* Copy from the end back to the start. *) val argCode = [ addImmediate{regN=structSpacePtr, regD=argRegNo, immed=largeStructSpace, shifted=false}, addImmediate{regN=argRegNo, regD=argWorkReg2, immed=size, shifted=false}, (* End of dest area *) addImmediate{regN=argPtrReg, regD=argWorkReg3, immed=newArgOffset+size, shifted=false}, (* end of source *) setLabel loopLabel, loadRegPreIndexByte{regT=argWorkReg4, regN=argWorkReg3, byteOffset= ~1}, storeRegPreIndexByte{regT=argWorkReg4, regN=argWorkReg2, byteOffset= ~1}, subSShiftedReg{regM=argWorkReg2, regN=argRegNo, regD=XZero, shift=ShiftNone}, (* At start? *) conditionalBranch(condNotEqual, loopLabel) ] in if gRegNo < 0w8 then loadArgs(args, stackOffset, newArgOffset+size, gRegNo+0w1, fpRegNo, argCode @ code, newStructSpace) else loadArgs(args, stackOffset+1, newArgOffset+size, 0w8, fpRegNo, argCode @ storeRegScaled{regT=argWorkReg, regN=XSP, unitOffset=stackOffset} :: code, newStructSpace) end | Arg9To16Bytes => (* Structs between 8 and 16 bytes require two registers. *) (* Note: If this is a struct the source value may not be aligned on any particular alignment. We ignore that and use 64-bit loads. However that means we can't necessarily use a load-pair. *) ( if gRegNo <= 0w6 then loadArgs(args, stackOffset, newArgOffset+size, gRegNo + 0w2, fpRegNo, loadRegUnscaled{regT=XReg gRegNo, regN=argPtrReg, byteOffset=Word.toInt newArgOffset} :: loadUpTo8(XReg(gRegNo+0w1), argPtrReg, newArgOffset+0w8, size-0w8, argWorkReg) @ code, largeStructSpace) else loadArgs(args, stackOffset+2, newArgOffset+size, 0w8, fpRegNo, loadRegUnscaled{regT=argWorkReg2, regN=argPtrReg, byteOffset=Word.toInt newArgOffset} :: storeRegScaled{regT=argWorkReg2, regN=XSP, unitOffset=stackOffset} :: loadUpTo8(argWorkReg2, argPtrReg, newArgOffset+0w8, size-0w8, argWorkReg) @ storeRegScaled{regT=argWorkReg2, regN=XSP, unitOffset=stackOffset + 1} :: code, largeStructSpace) ) | Arg1To8Bytes => ( if gRegNo <= 0w7 then loadArgs(args, stackOffset, newArgOffset+size, gRegNo + 0w1, fpRegNo, loadUpTo8(XReg gRegNo, argPtrReg, newArgOffset, size, argWorkReg) @ code, largeStructSpace) else loadArgs(args, stackOffset+1, newArgOffset+size, 0w8, fpRegNo, loadUpTo8(argWorkReg2, argPtrReg, newArgOffset, size, argWorkReg) @ storeRegScaled{regT=argWorkReg2, regN=XSP, unitOffset=stackOffset} :: code, largeStructSpace) ) end local val {size, typeForm, ...} = result (* Store a result register into the result area. In almost all cases this is very simple: the only complication is with structs of odd sizes. *) fun storeResult(reg, offset, size) = storeUpTo8(reg, resultAreaPtr, offset, size) in val (getResult, passArgAddress) = if typeForm = CTypeVoid then ([], false) else case classifyArg(typeForm, size) of (* Floating point values are returned in s0-sn, d0-dn. *) ArgClassHFA(numItems, isDouble) => let fun storeFPRegs(0w0, _, _) = [] | storeFPRegs(0w1, fpRegNo, offset) = [(if isDouble then storeRegScaledDouble else storeRegScaledFloat) {regT=VReg fpRegNo, regN=resultAreaPtr, unitOffset=offset}] | storeFPRegs(n, fpRegNo, offset) = (if isDouble then storePairOffsetDouble else storePairOffsetFloat) {regT1=VReg fpRegNo, regT2=VReg(fpRegNo+0w1), regN=resultAreaPtr, unitOffset=offset} :: storeFPRegs(n-0w2, fpRegNo+0w2, offset+2) in (storeFPRegs(numItems, 0w0 (* V0-Vn*), 0), false) end | ArgLargeStruct => ([], true) (* Structures larger than 16 bytes are passed by reference. *) (* Structures between 8 and 16 bytes are returned in X0/X1. *) | Arg9To16Bytes => if size = 0w16 then ([storePairOffset{regT1=X0, regT2=X1, regN=resultAreaPtr, unitOffset=0}], false) else (storeRegScaled{regT=X0, regN=resultAreaPtr, unitOffset=0} :: storeResult(X1, 8, size-0w8), false) | Arg1To8Bytes => (storeResult(X0, 0, size), false) end val (argCode, argStack, largeStructSpace) = loadArgs(args, 0, 0w0, 0w0, 0w0, if passArgAddress (* If we have to pass the address of the result struct it goes in X8. *) then [moveRegToReg{sReg=resultAreaPtr, dReg=X8}] else [], 0w0) val stackSpaceRequired = alignUp(Word.fromInt argStack * 0w8, 0w16) + largeStructSpace val instructions = [(* Push the return address to the stack. We could put it in a callee-save register but there's a very small chance that this could be the last reference to a piece of code. *) storeRegPreIndex{regT=X30, regN=X_MLStackPtr, byteOffset= ~8}, (* Save heap ptr. Needed in case we have a callback. *) storeRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset}, (* Load the entry point address. *) loadRegScaled{regT=entryPtReg, regN=X0, unitOffset=0}] @ ( (* Unbox the address of the result area into a callee save resgister. This is where the result will be stored on return if it is anything other than a struct. We have to put the C address in there now because an ML address wouldn't be updated by a possible GC in a callback. *) if #typeForm(result) <> CTypeVoid then [loadRegScaled{regT=resultAreaPtr, regN=X2, unitOffset=0}] else [] ) @ [storeRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset}] @ (* Save the stack pointer. *) ( if stackSpaceRequired = 0w0 then [] else [subImmediate{regN=XSP, regD=XSP, immed=stackSpaceRequired, shifted=false}] ) @ ( (* If we need to copy a struct load a register with a pointer to the area for it. *) if largeStructSpace = 0w0 then [] else [addImmediate{regN=XSP, regD=structSpacePtr, immed=stackSpaceRequired-largeStructSpace, shifted=false}] ) @ ( (* The second argument is a SysWord containing the address of a malloced area of memory with the actual arguments in it. *) if null args then [] else [loadRegScaled{regT=argPtrReg, regN=X1, unitOffset=0}] ) @ argCode @ [branchAndLinkReg X16] @ (* Call the function. *) (* Restore the C stack value in case it's been changed by a callback. *) ( if stackSpaceRequired = 0w0 then [] else [addImmediate{regN=XSP, regD=XSP, immed=stackSpaceRequired, shifted=false}] ) @ [ (* Reload the ML stack pointer even though it's callee save. If we've made a callback the ML stack could have grown and so moved to a different address. *) loadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset}, (* Load the heap allocation ptr and limit in case of a callback. *) loadRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset}, loadRegScaled{regT=X_MLHeapLimit, regN=X_MLAssemblyInt, unitOffset=heapLimitPtrOffset} ] @ (* Store the result in the destination. *) getResult @ (* Pop the return address and return. *) [ loadRegPostIndex{regT=X30, regN=X_MLStackPtr, byteOffset= 8}, returnRegister X30 ] val functionName = "foreignCall" val debugSwitches = [(*Universal.tagInject Pretty.compilerOutputTag (Pretty.prettyPrint(print, 70)), Universal.tagInject Debug.assemblyCodeTag true*)] val closure = makeConstantClosure() val () = generateCode{instrs=instructions, name=functionName, parameters=debugSwitches, resultClosure=closure} in closureAsAddress closure end (* Build a callback function. The arguments are the abi, the list of argument types and the result type. The result is an ML function that takes an ML function, f, as its argument, registers it as a callback and returns the C function as its result. When the C function is called the arguments are copied into temporary memory and the vector passed to f along with the address of the memory for the result. "f" stores the result in it when it returns and the result is then passed back as the result of the callback. N.B. This returns a closure cell which contains the address of the code. It can be used as a SysWord.word value except that while it exists the code will not be GCed. *) fun buildCallBack(_: abi, args: cType list, result: cType): Address.machineWord = let (* Store the argument values to the structure that will be passed to the ML callback function. *) (* Note. We've loaded the frame pointer with the original stack ptr-96 so we can access any stack arguments from that. *) fun moveArgs([], _, _, _, _, moveFromStack) = moveFromStack | moveArgs(arg::args, stackSpace, argOffset, gRegNo, fpRegNo, moveFromStack) = let val {size, align, typeForm, ...} = arg val newArgOffset = alignUp(argOffset, align) in case classifyArg(typeForm, size) of - ArgClassHFA(numItems, isDouble) => raise Foreign "TODO: ArgClassHFA" - | ArgLargeStruct => raise Foreign "TODO: ArgLargeStruct" - | Arg9To16Bytes => raise Foreign "TODO: Arg9To16Bytes" + ArgClassHFA(numItems, isDouble) => + if fpRegNo + numItems <= 0w8 + then + let + val scale = if isDouble then 0w8 else 0w4 + (* Store the values from the FP registers. *) + fun storeFPRegs(0w0, _, _) = [] + | storeFPRegs(0w1, fpRegNo, offset) = + [(if isDouble then storeRegScaledDouble else storeRegScaledFloat) + {regT=VReg fpRegNo, regN=XSP, unitOffset=offset}] + | storeFPRegs(n, fpRegNo, offset) = + (if isDouble then storePairOffsetDouble else storePairOffsetFloat) + {regT1=VReg fpRegNo, regT2=VReg(fpRegNo+0w1), regN=XSP, unitOffset=offset} :: + storeFPRegs(n-0w2, fpRegNo+0w2, offset+2) + in + moveArgs(args, stackSpace, newArgOffset+size, gRegNo, fpRegNo+numItems, + storeFPRegs(numItems, fpRegNo, Word.toInt(newArgOffset div scale)) @ moveFromStack) + end + else raise Foreign "TODO: ArgClassHFA on stack" + + | ArgLargeStruct => raise Foreign "TODO: ArgLargeStruct" + | Arg9To16Bytes => raise Foreign "TODO: Arg9To16Bytes" | Arg1To8Bytes => ( if gRegNo <= 0w7 then moveArgs(args, stackSpace, newArgOffset+size, gRegNo + 0w1, fpRegNo, storeUpTo8(XReg gRegNo, XSP, Word.toInt newArgOffset, size) @ moveFromStack) else (*moveArgs(args, stackOffset+1, newArgOffset+size, 0w8, fpRegNo, loadUpTo8(argWorkReg2, argPtrReg, newArgOffset, size, argWorkReg) @ storeRegScaled{regT=argWorkReg2, regN=XSP, unitOffset=stackOffset} :: code, moveFromStack)*) raise Foreign "TODO: Stack arguments" ) end val copyArgsFromRegsAndStack = moveArgs(args, 0, 0w0, 0w0, 0w0, []) local fun getNextSize (arg, argOffset) = let val {size, align, ...} = arg in alignUp(argOffset, align) + size end in val argumentSpace = alignUp(List.foldl getNextSize 0w0 args, 0w16) end (* After the argument space there is a 32 byte area for the results. *) local val {size, typeForm, ...} = result in + (* Load the results from the result area except that if we're + passing the result structure by reference this is done by + the caller. Generally similar to how arguments are passed + in a call. *) val (loadResults, resultByReference) = if typeForm = CTypeVoid then ([], false) else case classifyArg(typeForm, size) of - ArgClassHFA(numItems, isDouble) => raise Foreign "TODO ArgClassHFA" + ArgClassHFA(numItems, isDouble) => + let + val scale = if isDouble then 0w8 else 0w4 + (* Load the values to the floating point registers. *) + fun loadFPRegs(0w0, _, _) = [] + | loadFPRegs(0w1, fpRegNo, offset) = + [(if isDouble then loadRegScaledDouble else loadRegScaledFloat) + {regT=VReg fpRegNo, regN=XSP, unitOffset=offset}] + | loadFPRegs(n, fpRegNo, offset) = + (if isDouble then loadPairOffsetDouble else loadPairOffsetFloat) + {regT1=VReg fpRegNo, regT2=VReg(fpRegNo+0w1), regN=XSP, unitOffset=offset} :: + loadFPRegs(n-0w2, fpRegNo+0w2, offset+2) + in + (loadFPRegs(numItems, 0w0, Word.toInt(argumentSpace div scale)), false) + end + | ArgLargeStruct => ([], true) (* Structures larger than 16 bytes are passed by reference. *) - | Arg9To16Bytes => raise Foreign "TODO Arg9To16Bytes" + | Arg9To16Bytes => + let + (* Unlike the argument case this is always aligned. *) + val code = + if size = 0w16 + then [loadPairOffset{regT1=X0, regT2=X1, regN=XSP, unitOffset=Word.toInt(argumentSpace div 0w8)}] + else loadRegScaled{regT=X0, regN=XSP, unitOffset=Word.toInt(argumentSpace div 0w8)} :: + loadUpTo8(X1, XSP, argumentSpace+0w8, size-0w8, X2) + in + (code, false) + end | Arg1To8Bytes => (loadUpTo8(X0, XSP, argumentSpace, size, X2),false) end val instructions = [ (* Push LR, FP and the callee-save registers. *) storePairPreIndexed{regT1=X29, regT2=X30, regN=XSP, unitOffset= ~12}, moveRegToReg{sReg=XSP, dReg=X29}, storePairOffset{regT1=X19, regT2=X20, regN=X29, unitOffset=2}, storePairOffset{regT1=X21, regT2=X22, regN=X29, unitOffset=4}, storePairOffset{regT1=X23, regT2=X24, regN=X29, unitOffset=6}, storePairOffset{regT1=X25, regT2=X26, regN=X29, unitOffset=8}, storePairOffset{regT1=X27, regT2=X28, regN=X29, unitOffset=10}, (* Reserve an area for any results returned in registers. This could be up to 32 bytes. Save X8 in it in case we're actually returning a result by reference. *) storeRegPreIndex{regT=X8, regN=XSP, byteOffset= ~32}, (* Reserve space for the arguments. *) subImmediate{regN=XSP, regD=XSP, immed=argumentSpace, shifted=false}, (* We passed the function we're calling in X9 but we need to move it to a callee-save register. *) moveRegToReg{sReg=X9, dReg=X20} ] @ copyArgsFromRegsAndStack @ [ (* Call into the RTS to get the thread data ptr. *) loadAddressConstant(X0, getThreadDataCall), loadRegScaled{regT=X0, regN=X0, unitOffset=0}, branchAndLinkReg X0, moveRegToReg{sReg=X0, dReg=X_MLAssemblyInt}, (* Load the ML regs. *) loadRegScaled{regT=X_MLHeapLimit, regN=X_MLAssemblyInt, unitOffset=heapLimitPtrOffset}, loadRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset}, loadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset}, (* Prepare the arguments. They are both syswords so have to be boxed. *) moveRegToReg{sReg=XSP, dReg=X2} (* Move to X2 before boxing *) ] @ boxSysWord(X2, X0, X3) @ (* Address of arguments. *) ( (* Result area pointer. If we're returning by reference this is the original value of X8 otherwise it's the address of the 32 bytes we've reserved. *) if resultByReference then [loadRegScaled{regT=X2, regN=XSP, unitOffset=Word.toInt(argumentSpace div 0w8)}] else [addImmediate{regN=XSP, regD=X2, immed=argumentSpace, shifted=false}] ) @ boxSysWord(X2, X1, X3) @ [ moveRegToReg{sReg=X20, dReg=X8}, loadRegScaled{regT=X16, regN=X8, unitOffset=0}, branchAndLinkReg X16, (* Save the ML stack and heap pointers. We could have allocated or grown the stack. The limit pointer is maintained by the RTS. *) storeRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset}, storeRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset} ] @ loadResults @ (* Load the return values *) [ (* Restore the callee-save registers and return. *) moveRegToReg{sReg=X29, dReg=XSP}, loadPairOffset{regT1=X19, regT2=X20, regN=X29, unitOffset=2}, loadPairOffset{regT1=X21, regT2=X22, regN=X29, unitOffset=4}, loadPairOffset{regT1=X23, regT2=X24, regN=X29, unitOffset=6}, loadPairOffset{regT1=X25, regT2=X26, regN=X29, unitOffset=8}, loadPairOffset{regT1=X27, regT2=X28, regN=X29, unitOffset=10}, loadPairPostIndexed{regT1=X29, regT2=X30, regN=XSP, unitOffset=12}, returnRegister X30 ] val functionName = "foreignCallBack(2)" val debugSwitches = [(*Universal.tagInject Pretty.compilerOutputTag (Pretty.prettyPrint(print, 70)), Universal.tagInject Debug.assemblyCodeTag true*)] val closure = makeConstantClosure() val () = generateCode{instrs=instructions, name=functionName, parameters=debugSwitches, resultClosure=closure} val stage2Code = closureAsAddress closure fun resultFunction f = let (* Generate a small function to load the address of f into a register and then jump to stage2. The idea is that it should be possible to generate this eventually in a single RTS call. That could be done by using a version of this as a model. *) val codeAddress = Address.loadWord(Address.toAddress stage2Code, 0w0) val instructions = [ loadAddressConstant(X9, Address.toMachineWord f), loadAddressConstant(X16, codeAddress), branchRegister X16 ] val functionName = "foreignCallBack(1)" val debugSwitches = [(*Universal.tagInject Pretty.compilerOutputTag (Pretty.prettyPrint(print, 70)), Universal.tagInject Debug.assemblyCodeTag true*)] val closure = makeConstantClosure() val () = generateCode{instrs=instructions, name=functionName, parameters=debugSwitches, resultClosure=closure} val res = closureAsAddress closure (*val _ = print("Address is " ^ (LargeWord.toString(RunCall.unsafeCast res)) ^ "\n")*) in res end in Address.toMachineWord resultFunction end end;