diff --git a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ForeignCall.sml b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ForeignCall.sml
index be49cd8f..626f2dde 100644
--- a/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ForeignCall.sml
+++ b/mlsource/MLCompiler/CodeTree/Arm64Code/Arm64ForeignCall.sml
@@ -1,803 +1,848 @@
 (*
     Copyright (c) 2021 David C. J. Matthews
 
     This library is free software; you can redistribute it and/or
     modify it under the terms of the GNU Lesser General Public
     Licence version 2.1 as published by the Free Software Foundation.
 
     This library is distributed in the hope that it will be useful,
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
     Lesser General Public Licence for more details.
 
     You should have received a copy of the GNU Lesser General Public
     Licence along with this library; if not, write to the Free Software
     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 *)
 
 functor Arm64ForeignCall(
     structure CodeArray: CODEARRAYSIG
     and       Arm64Assembly: Arm64Assembly
     and       Debug: DEBUG
     and       Arm64Sequences: Arm64Sequences
 
     sharing CodeArray.Sharing = Arm64Assembly.Sharing = Arm64Sequences.Sharing
 ): FOREIGNCALLSIG
 =
 struct
 
     open CodeArray Arm64Assembly Arm64Sequences
 
     exception InternalError = Misc.InternalError
     and Foreign = Foreign.Foreign
 
     datatype fastArgs = FastArgFixed | FastArgDouble | FastArgFloat
 
     val makeEntryPoint: string -> machineWord = RunCall.rtsCallFull1 "PolyCreateEntryPointObject"
 
+    fun absoluteAddressToIndex reg =
+        if is32in64
+        then
+        [
+            subShiftedReg{regM=X_Base32in64, regN=reg, regD=reg, shift=ShiftNone},
+            logicalShiftRight{shift=0w2, regN=reg, regD=reg}
+        ]
+        else []
+
+    (* Turn an index into an absolute address. *)
+    and indexToAbsoluteAddress(iReg, absReg) =
+        if is32in64
+        then [addShiftedReg{regM=iReg, regN=X_Base32in64, regD=absReg, shift=ShiftLSL 0w2}]
+        else if iReg = absReg
+        then []
+        else [moveRegToReg{sReg=iReg, dReg=absReg}]
+
     local
-        (* Both doubles and sysword values occupy a single heap cell. *)
-        fun allocateAWord(fixedReg, workReg) =
+        fun allocateWords(fixedReg, workReg, words, bytes) =
         let
             val label = createLabel()
         in
             [
                 (* Subtract the number of bytes required from the heap pointer. *)
-                subImmediate{regN=X_MLHeapAllocPtr, regD=fixedReg, immed=0w16, shifted=false},
+                subImmediate{regN=X_MLHeapAllocPtr, regD=fixedReg, immed=bytes, shifted=false},
                 (* Compare the result with the heap limit. *)
                 subSShiftedReg{regM=X_MLHeapLimit, regN=fixedReg, regD=XZero, shift=ShiftNone},
                 conditionalBranch(condCarrySet, label),
                 loadRegScaled{regT=X16, regN=X_MLAssemblyInt, unitOffset=heapOverflowCallOffset},
                 branchAndLinkReg X16,
                 registerMask [], (* Not used at the moment. *)
                 setLabel label,
                 (* Update the heap pointer. *)
                 moveRegToReg{sReg=fixedReg, dReg=X_MLHeapAllocPtr}
             ] @
             loadNonAddress(workReg,
-                Word64.orb(0w1, Word64.<<(Word64.fromLarge(Word8.toLarge Address.F_bytes), 0w56)))
+                Word64.orb(words, Word64.<<(Word64.fromLarge(Word8.toLarge Address.F_bytes),
+                    if is32in64 then 0w24 else 0w56)))
             @
             [
                 (* Store the length word.  Have to use the unaligned version because offset is -ve. *)
                 storeRegUnscaled{regT=workReg, regN=fixedReg, byteOffset= ~8}
             ]
         end
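+
+        (* As an illustration of the length word built above: in native 64-bit
+           mode a single-word cell gets
+               Word64.orb(0w1, Word64.<<(Word64.fromLarge(Word8.toLarge Address.F_bytes), 0w56))
+           i.e. a length of one with the byte-cell flag in the top byte of a
+           64-bit header.  In 32-in-64 the header is a 32-bit word, the flag
+           byte sits at bit 24, and the length counts 32-bit words, which is
+           why the callers below ask for two words for a 64-bit value. *)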
     in
         fun boxDouble(floatReg, fixedReg, workReg) =
-            allocateAWord(fixedReg, workReg) @
-            [storeRegScaledDouble{regT=floatReg, regN=fixedReg, unitOffset=0}]
+            allocateWords(fixedReg, workReg, if is32in64 then 0w2 else 0w1, 0w16) @
+            storeRegScaledDouble{regT=floatReg, regN=fixedReg, unitOffset=0} ::
+            absoluteAddressToIndex fixedReg
 
         and boxSysWord(toBoxReg, fixedReg, workReg) =
-            allocateAWord(fixedReg, workReg) @
-            [storeRegScaled{regT=toBoxReg, regN=fixedReg, unitOffset=0}]
+            allocateWords(fixedReg, workReg, if is32in64 then 0w2 else 0w1, 0w16) @
+            storeRegScaled{regT=toBoxReg, regN=fixedReg, unitOffset=0} ::
+            absoluteAddressToIndex fixedReg
+
+        fun boxOrTagFloat(floatReg, fixedReg, workReg) =
+            if is32in64
+            then
+                allocateWords(fixedReg, workReg, 0w1, 0w8) @
+                storeRegScaledFloat{regT=floatReg, regN=fixedReg, unitOffset=0} ::
+                absoluteAddressToIndex fixedReg
+            else
+            [
+                moveFloatToGeneral{regN=floatReg, regD=fixedReg},
+                logicalShiftLeft{shift=0w32, regN=fixedReg, regD=fixedReg},
+                bitwiseOrImmediate{regN=fixedReg, regD=fixedReg, bits=0w1}
+            ]
     end
 
+    fun unboxDouble(addrReg, workReg, valReg) =
+        if is32in64
+        then indexToAbsoluteAddress(addrReg, workReg) @
+            [loadRegScaledDouble{regT=valReg, regN=workReg, unitOffset=0}]
+        else [loadRegScaledDouble{regT=valReg, regN=addrReg, unitOffset=0}]
+
+    fun unboxOrUntagSingle(addrReg, workReg, valReg) =
+        if is32in64
+        then [loadRegIndexedFloat{regN=X_Base32in64, regM=addrReg, regT=valReg, option=ExtUXTX ScaleOrShift}]
+        else
+        [
+            logicalShiftRight{shift=0w32, regN=addrReg, regD=workReg},
+            moveGeneralToFloat{regN=workReg, regD=valReg}
+        ]
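+
+    (* A sketch of the Real32 representations handled above, assuming the
+       usual Poly/ML tagging scheme: in native 64-bit mode a Real32 lives in
+       the upper half of a tagged word, roughly
+           tagged = Word64.orb(Word64.<<(floatBits, 0w32), 0w1)
+       (floatBits standing for the raw IEEE bits), so untagging is just the
+       32-bit right shift in unboxOrUntagSingle.  In 32-in-64 a Real32 no
+       longer fits in a tagged 32-bit word, so boxOrTagFloat boxes it in a
+       single-word cell addressed by an index scaled from X_Base32in64. *)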
     fun rtsCallFastGeneral (functionName, argFormats, resultFormat, debugSwitches) =
     let
         val entryPointAddr = makeEntryPoint functionName
 
         (* The maximum we currently have is five so we don't need to worry about stack args. *)
         fun loadArgs([], _, _, _) = []
 
         |   loadArgs(FastArgFixed :: argTypes, srcReg :: srcRegs, fixed :: fixedRegs, fpRegs) =
                 if srcReg = fixed
                 then loadArgs(argTypes, srcRegs, fixedRegs, fpRegs) (* Already in the right reg *)
                 else moveRegToReg{sReg=srcReg, dReg=fixed} ::
                         loadArgs(argTypes, srcRegs, fixedRegs, fpRegs)
 
         |   loadArgs(FastArgDouble :: argTypes, srcReg :: srcRegs, fixedRegs, fp :: fpRegs) =
                 (* Unbox the value into a fp reg. *)
-                loadRegScaledDouble{regT=fp, regN=srcReg, unitOffset=0} ::
+                unboxDouble(srcReg, srcReg, fp) @
                 loadArgs(argTypes, srcRegs, fixedRegs, fpRegs)
 
         |   loadArgs(FastArgFloat :: argTypes, srcReg :: srcRegs, fixedRegs, fp :: fpRegs) =
                 (* Untag and move into the fp reg *)
-                logicalShiftRight{shift=0w32, regN=srcReg, regD=srcReg} ::
-                moveGeneralToFloat{regN=srcReg, regD=fp} ::
+                unboxOrUntagSingle(srcReg, srcReg, fp) @
                 loadArgs(argTypes, srcRegs, fixedRegs, fpRegs)
 
         |   loadArgs _ = raise InternalError "rtsCall: Too many arguments"
 
         val instructions =
             loadArgs(argFormats,
                 (* ML Arguments *) [X0, X1, X2, X3, X4, X5, X6, X7],
                 (* C fixed pt args *) [X0, X1, X2, X3, X4, X5, X6, X7],
                 (* C floating pt args *) [V0, V1, V2, V3, V4, V5, V6, V7]) @
             [
-                (* Move X30 to X24, a callee-save register. *)
+                (* Move X30 to X23, a callee-save register. *)
                 (* Note: maybe we should push X24 just in case this is the only
                    reachable reference to the code. *)
-                orrShiftedReg{regN=XZero, regM=X_LinkReg, regD=X24, shift=ShiftNone},
-                loadAddressConstant(X16, entryPointAddr), (* Load entry point *)
+                orrShiftedReg{regN=XZero, regM=X_LinkReg, regD=X23, shift=ShiftNone},
+                loadAddressConstant(X16, entryPointAddr) (* Load entry point *)
+            ] @ indexToAbsoluteAddress(X16, X16) @
+            [
                 loadRegScaled{regT=X16, regN=X16, unitOffset=0}, (* Load the actual address. *)
                 (* Store the current heap allocation pointer. *)
                 storeRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset},
                 (* For the moment save and restore the ML stack pointer.  No RTS call should
                    change it and it's callee-save but just in case... *)
                 storeRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset},
                 branchAndLinkReg X16, (* Call the function. *)
                 (* Restore the ML stack pointer. *)
                 loadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset},
                 (* Load the heap allocation ptr and limit.  We could have GCed in the RTS call. *)
                 loadRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset},
                 loadRegScaled{regT=X_MLHeapLimit, regN=X_MLAssemblyInt, unitOffset=heapLimitPtrOffset}
             ] @
             (
                 case resultFormat of
                     FastArgFixed => []
                 |   FastArgDouble => (* This must be boxed. *) boxDouble(V0, X0, X1)
-                |   FastArgFloat => (* This must be tagged *)
-                    [
-                        moveFloatToGeneral{regN=V0, regD=X0},
-                        logicalShiftLeft{shift=0w32, regN=X0, regD=X0},
-                        bitwiseOrImmediate{regN=X0, regD=X0, bits=0w1}
-                    ]
+                |   FastArgFloat => (* This must be tagged or boxed *) boxOrTagFloat(V0, X0, X1)
             ) @
             [
-                returnRegister X24
+                returnRegister X23
             ]
 
         val closure = makeConstantClosure()
         val () = generateCode{instrs=instructions, name=functionName, parameters=debugSwitches, resultClosure=closure}
     in
         closureAsAddress closure
     end
 
     fun rtsCallFast (functionName, nArgs, debugSwitches) =
         rtsCallFastGeneral (functionName, List.tabulate(nArgs, fn _ => FastArgFixed), FastArgFixed, debugSwitches)
 
     (* RTS call with one double-precision floating point argument and a floating point result. *)
     fun rtsCallFastRealtoReal (functionName, debugSwitches) =
         rtsCallFastGeneral (functionName, [FastArgDouble], FastArgDouble, debugSwitches)
 
     (* RTS call with two double-precision floating point arguments and a floating point result. *)
     fun rtsCallFastRealRealtoReal (functionName, debugSwitches) =
         rtsCallFastGeneral (functionName, [FastArgDouble, FastArgDouble], FastArgDouble, debugSwitches)
 
     (* RTS call with one double-precision floating point argument, one fixed point argument and
        a floating point result. *)
     fun rtsCallFastRealGeneraltoReal (functionName, debugSwitches) =
         rtsCallFastGeneral (functionName, [FastArgDouble, FastArgFixed], FastArgDouble, debugSwitches)
 
     (* RTS call with one general (i.e. ML word) argument and a floating point result.
        This is used only to convert arbitrary precision values to floats. *)
     fun rtsCallFastGeneraltoReal (functionName, debugSwitches) =
         rtsCallFastGeneral (functionName, [FastArgFixed], FastArgDouble, debugSwitches)
 
     (* Operations on Real32.real values. *)
 
     fun rtsCallFastFloattoFloat (functionName, debugSwitches) =
         rtsCallFastGeneral (functionName, [FastArgFloat], FastArgFloat, debugSwitches)
 
     fun rtsCallFastFloatFloattoFloat (functionName, debugSwitches) =
         rtsCallFastGeneral (functionName, [FastArgFloat, FastArgFloat], FastArgFloat, debugSwitches)
 
     (* RTS call with one single-precision floating point argument, one fixed point argument and
        a floating point result. *)
     fun rtsCallFastFloatGeneraltoFloat (functionName, debugSwitches) =
         rtsCallFastGeneral (functionName, [FastArgFloat, FastArgFixed], FastArgFloat, debugSwitches)
 
     (* RTS call with one general (i.e. ML word) argument and a floating point result.
        This is used only to convert arbitrary precision values to floats. *)
     fun rtsCallFastGeneraltoFloat (functionName, debugSwitches) =
         rtsCallFastGeneral (functionName, [FastArgFixed], FastArgFloat, debugSwitches)
 
     (* There is only one ABI value. *)
     datatype abi = ARM64Abi
     fun abiList () = [("default", ARM64Abi)]
 
     fun alignUp(s, align) = Word.andb(s + align-0w1, ~ align)
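+
+    (* For example alignUp(0w5, 0w8) = 0w8 and alignUp(0w16, 0w8) = 0w16:
+       adding align-0w1 and masking with its two's complement rounds up to
+       the next multiple.  This assumes align is a power of two, which holds
+       for all the C alignments used here. *)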
     val getThreadDataCall = makeEntryPoint "PolyArm64GetThreadData"
 
     (* This must match the type in Foreign.LowLevel.  Once this is bootstrapped we could use
        that type but note that this is the type we use within the compiler and we build
        Foreign.LowLevel AFTER compiling this. *)
     datatype cTypeForm =
         CTypeFloatingPt | CTypePointer | CTypeSignedInt | CTypeUnsignedInt
     |   CTypeStruct of cType list | CTypeVoid
     withtype cType = { typeForm: cTypeForm, align: word, size: word }
 
     (* Load a byte, halfword, word or long *)
     fun loadAlignedValue(reg, base, offset, size) =
     let
         val _ = offset mod size = 0w0 orelse raise InternalError "loadAlignedValue: not aligned"
         val loadOp =
             case size of
                 0w8 => loadRegScaled
             |   0w4 => loadRegScaled32
             |   0w2 => loadRegScaled16
             |   0w1 => loadRegScaledByte
             |   _ => raise InternalError "loadAlignedValue: invalid length"
     in
         loadOp{regT=reg, regN=base, unitOffset=Word.toInt(offset div size)}
     end
 
     (* Store a register into up to 8 bytes.  Most values will involve a single store but
        odd-sized structs can require shifts and multiple stores.  N.B. May modify the
        source register. *)
     and storeUpTo8(reg, base, offset, size) =
     let
         val storeOp =
             if size = 0w8 then storeRegUnscaled
             else if size >= 0w4 then storeRegUnscaled32
             else if size >= 0w2 then storeRegUnscaled16
             else storeRegUnscaledByte
     in
         [storeOp{regT=reg, regN=base, byteOffset=offset}]
     end @
     (
         if size = 0w6 orelse size = 0w7
         then
         [
             logicalShiftRight{regN=reg, regD=reg, shift=0w32 },
             storeRegUnscaled16{regT=reg, regN=base, byteOffset=offset+4}
         ]
         else []
     ) @
     (
         if size = 0w3 orelse size = 0w5 orelse size = 0w7
         then
         [
             logicalShiftRight{regN=reg, regD=reg, shift=(size-0w1)*0w8 },
             storeRegUnscaledByte{regT=reg, regN=base, byteOffset=offset+Word.toInt(size-0w1)}
         ]
         else []
     )
 
     (* Extract the elements of structures. *)
     fun unwrap(CTypeStruct ctypes, _) =
             List.foldr(fn({typeForm, size, ...}, l) => unwrap(typeForm, size) @ l) [] ctypes
     |   unwrap (ctype, size) = [(ctype, size)]
 
     (* Structures of up to four floating point values of the same precision are
        treated specially. *)
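+
+    (* For instance, a C struct of three floats unwraps to three
+       (CTypeFloatingPt, 0w4) pairs and is classified below as
+       ArgClassHFA(0w3, false), so it is passed in three consecutive
+       single-precision registers, whereas a struct mixing a double and an
+       int is not homogeneous and falls through to the size test. *)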
     datatype argClass =
         ArgClassHFA of Word8.word * bool    (* 1 - 4 floating pt values *)
     |   ArgLargeStruct                      (* > 16 bytes and not an HFA *)
     |   ArgSmall                            (* Scalars or small structures *)
 
     fun classifyArg(ctype, size) =
         case unwrap (ctype, size) of
             [(CTypeFloatingPt, 0w4)] => ArgClassHFA(0w1, false)
         |   [(CTypeFloatingPt, 0w4), (CTypeFloatingPt, 0w4)] => ArgClassHFA(0w2, false)
         |   [(CTypeFloatingPt, 0w4), (CTypeFloatingPt, 0w4), (CTypeFloatingPt, 0w4)] => ArgClassHFA(0w3, false)
         |   [(CTypeFloatingPt, 0w4), (CTypeFloatingPt, 0w4), (CTypeFloatingPt, 0w4), (CTypeFloatingPt, 0w4)] => ArgClassHFA(0w4, false)
         |   [(CTypeFloatingPt, 0w8)] => ArgClassHFA(0w1, true)
         |   [(CTypeFloatingPt, 0w8), (CTypeFloatingPt, 0w8)] => ArgClassHFA(0w2, true)
         |   [(CTypeFloatingPt, 0w8), (CTypeFloatingPt, 0w8), (CTypeFloatingPt, 0w8)] => ArgClassHFA(0w3, true)
         |   [(CTypeFloatingPt, 0w8), (CTypeFloatingPt, 0w8), (CTypeFloatingPt, 0w8), (CTypeFloatingPt, 0w8)] => ArgClassHFA(0w4, true)
         |   _ =>
             if size > 0w16
             then ArgLargeStruct
             else ArgSmall
 
     (* Can we load this in a single instruction? *)
     fun alignedLoadStore(_, 0w1) = true
     |   alignedLoadStore(addr, 0w2) = addr mod 0w2 = 0w0
     |   alignedLoadStore(addr, 0w4) = addr mod 0w4 = 0w0
     |   alignedLoadStore(addr, 0w8) = addr mod 0w8 = 0w0
     |   alignedLoadStore(addr, 0w16) = addr mod 0w8 = 0w0 (* Can use load-pair. *)
     |   alignedLoadStore _ = false
 
     (* This builds a piece of code that takes three arguments and returns a unit result.
        All three arguments are SysWord.word values i.e. ML addresses containing the
        address of the actual C value.
        The first argument (X0) is the address of the function to call.
        The second argument (X1) points to a struct that contains the argument(s) for the
        function.  The arguments have to be unpacked from the struct into the appropriate
        registers or to the C stack.
        The third argument (X2) points to a piece of memory to receive the result of the
        call.  It may be empty if the function returns void.  It may only be as big as
        required for the result type. *)
     fun foreignCall(_: abi, args: cType list, result: cType): Address.machineWord =
     let
         val resultAreaPtr = X19 (* Unboxed value from X2 - This is callee save. *)
         val argPtrReg = X9 (* A scratch register that isn't used for arguments. *)
         val entryPtReg = X16 (* Contains the address of the function to call. *)
         val argWorkReg = X10 (* Used in loading arguments if necessary. *)
         and argWorkReg2 = X11
         and structSpacePtr = X12
         and argWorkReg3 = X13
         and argWorkReg4 = X14
 
         fun loadArgs([], stackOffset, _, _, _, code, largeStructSpace) =
                 (code, stackOffset, largeStructSpace)
 
         |   loadArgs(arg::args, stackOffset, argOffset, gRegNo, fpRegNo, code, largeStructSpace) =
             let
                 val {size, align, typeForm, ...} = arg
                 val newArgOffset = alignUp(argOffset, align)
             in
                 case classifyArg(typeForm, size) of
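+                    (* An HFA is passed either entirely in floating point
+                       registers or entirely on the stack.  If fewer than
+                       numItems V registers remain the values are copied to
+                       the stack and fpRegNo is set to 0w8 so that no later
+                       argument can use the discarded registers. *)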
                     ArgClassHFA(numItems, isDouble) =>
                         if fpRegNo + numItems <= 0w8
                         then
                         let
                             val scale = if isDouble then 0w8 else 0w4
                             (* Load the values to the floating point registers. *)
                             fun loadFPRegs(0w0, _, _) = []
                             |   loadFPRegs(0w1, fpRegNo, offset) =
                                     [(if isDouble then loadRegScaledDouble else loadRegScaledFloat)
                                         {regT=VReg fpRegNo, regN=argPtrReg, unitOffset=offset}]
                             |   loadFPRegs(n, fpRegNo, offset) =
                                     (if isDouble then loadPairOffsetDouble else loadPairOffsetFloat)
                                         {regT1=VReg fpRegNo, regT2=VReg(fpRegNo+0w1), regN=argPtrReg, unitOffset=offset} ::
                                     loadFPRegs(n-0w2, fpRegNo+0w2, offset+2)
                         in
                             loadArgs(args, stackOffset, newArgOffset+size, gRegNo, fpRegNo+numItems,
                                 loadFPRegs(numItems, fpRegNo, Word.toInt(newArgOffset div scale)) @ code,
                                 largeStructSpace)
                         end
                         else
                         let
                             (* If we have insufficient registers we discard any that are left and
                                push the argument to the stack. *)
                             (* The floating point value or structure is copied to the stack as a
                                contiguous area.  Use general registers to copy the data.  It could
                                be on a 4-byte alignment.  In the typical case of a single floating
                                point value this will just be a single load and store. *)
                             fun copyData(0w0, _, _) = []
                             |   copyData(n, srcOffset, stackOffset) =
                                     if isDouble
                                     then loadRegScaled{regT=argWorkReg2, regN=argPtrReg, unitOffset=srcOffset} ::
                                          storeRegScaled{regT=argWorkReg2, regN=XSP, unitOffset=stackOffset} ::
                                          copyData(n-0w1, srcOffset+1, stackOffset+1)
                                     else loadRegScaled32{regT=argWorkReg2, regN=argPtrReg, unitOffset=srcOffset} ::
                                          storeRegScaled32{regT=argWorkReg2, regN=XSP, unitOffset=stackOffset} ::
                                          copyData(n-0w1, srcOffset+1, stackOffset+1)
 
                             val copyToStack =
                                 if isDouble
                                 then copyData(numItems, Word.toInt(newArgOffset div 0w8), stackOffset)
                                 else copyData(numItems, Word.toInt(newArgOffset div 0w4), stackOffset*2)
                             (* The overall size is rounded up to a multiple of 8 *)
                             val newStackOffset = stackOffset + Word.toInt(alignUp(size, 0w8) div 0w8)
                         in
                             loadArgs(args, newStackOffset, newArgOffset+size, gRegNo, 0w8,
                                 copyToStack @ code, largeStructSpace)
                         end
 
                 |   _ =>
                     let
                         (* Load an aligned argument into one or two registers or copy it to the stack. *)
                         fun loadArgumentValues(argSize, sourceOffset, sourceBase, newStructSpace, preCode) =
                             if gRegNo <= 0w6 orelse (size <= 0w8 andalso gRegNo <= 0w7)
                             then (* There are sufficient registers *)
                             let
                                 val (loadInstr, nextGReg) =
                                     if argSize = 0w16
                                     then ([loadPairOffset{regT1=XReg gRegNo, regT2=XReg(gRegNo+0w1),
                                             regN=sourceBase, unitOffset=Word.toInt(sourceOffset div 0w8)}], gRegNo+0w2)
                                     else ([loadAlignedValue(XReg gRegNo, sourceBase, sourceOffset, size)], gRegNo+0w1)
                             in
                                 loadArgs(args, stackOffset, newArgOffset+size, nextGReg, fpRegNo,
                                     preCode @ loadInstr @ code, newStructSpace)
                             end
                             else if argSize = 0w16
                             then loadArgs(args, stackOffset+2, newArgOffset+size, 0w8, fpRegNo,
                                     preCode @
                                     loadPairOffset{regT1=argWorkReg2, regT2=argWorkReg3, regN=sourceBase,
                                         unitOffset=Word.toInt(sourceOffset div 0w8)} ::
                                     storePairOffset{regT1=argWorkReg2, regT2=argWorkReg3, regN=XSP, unitOffset=stackOffset} ::
                                     code, newStructSpace)
                             else loadArgs(args, stackOffset+1, newArgOffset+size, 0w8, fpRegNo,
                                     preCode @
                                     loadAlignedValue(argWorkReg2, sourceBase, sourceOffset, argSize) ::
                                     storeRegScaled{regT=argWorkReg2, regN=XSP, unitOffset=stackOffset} ::
                                     code, newStructSpace)
                     in
                         if alignedLoadStore(newArgOffset, size)
                         then loadArgumentValues(size, newArgOffset, argPtrReg, largeStructSpace, [])
 
                         else (* General case.  Either a large structure or a small structure that
                                 can't easily be loaded.  First copy it to the stack and then either
                                 pass the address or load it once it's aligned. *)
                         let
                             val newStructSpace = alignUp(largeStructSpace + size, 0w16)
                             val loopLabel = createLabel()
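+                            (* The byte copy below runs from the last byte back
+                               to the first so that the work registers can serve
+                               as the loop counters: both start one byte past
+                               the end of their areas and pre-decrement until
+                               the destination pointer gets back to argRegNo. *)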
                             (* The address of the area we're copying to is in argRegNo. *)
                             val argRegNo = if gRegNo < 0w8 then XReg gRegNo else argWorkReg
                             (* Copy from the end back to the start. *)
                             val copyToStructSpace =
                             [
                                 addImmediate{regN=structSpacePtr, regD=argRegNo, immed=largeStructSpace, shifted=false},
                                 addImmediate{regN=argRegNo, regD=argWorkReg2, immed=size, shifted=false}, (* End of dest area *)
                                 addImmediate{regN=argPtrReg, regD=argWorkReg3, immed=newArgOffset+size, shifted=false}, (* end of source *)
                                 setLabel loopLabel,
                                 loadRegPreIndexByte{regT=argWorkReg4, regN=argWorkReg3, byteOffset= ~1},
                                 storeRegPreIndexByte{regT=argWorkReg4, regN=argWorkReg2, byteOffset= ~1},
                                 subSShiftedReg{regM=argWorkReg2, regN=argRegNo, regD=XZero, shift=ShiftNone}, (* At start? *)
                                 conditionalBranch(condNotEqual, loopLabel)
                             ]
                         in
                             if size > 0w16
                             then (* Large struct - pass by reference *)
                             (
                                 if gRegNo < 0w8
                                 then loadArgs(args, stackOffset, newArgOffset+size, gRegNo+0w1, fpRegNo,
                                         copyToStructSpace @ code, newStructSpace)
                                 else loadArgs(args, stackOffset+1, newArgOffset+size, 0w8, fpRegNo,
                                         copyToStructSpace @
                                         storeRegScaled{regT=argWorkReg, regN=XSP, unitOffset=stackOffset} :: code,
                                         newStructSpace)
                             )
                             else (* Small struct.  Since it's now in an area at least 16 bytes and
                                     properly aligned we can load it. *)
                                  (* argRegNo points to where we copied it *)
                                  loadArgumentValues(if size > 0w8 then 0w16 else 0w8, 0w0, argRegNo,
                                     newStructSpace, copyToStructSpace)
                         end
                     end
             end
 
         local
             val {size, typeForm, ...} = result
             (* Store a result register into the result area.  In almost all cases this is
                very simple: the only complication is with structs of odd sizes. *)
             fun storeResult(reg, offset, size) = storeUpTo8(reg, resultAreaPtr, offset, size)
         in
             val (getResult, passArgAddress) =
                 if typeForm = CTypeVoid
                 then ([], false)
                 else case classifyArg(typeForm, size) of
                     (* Floating point values are returned in s0-sn, d0-dn. *)
                     ArgClassHFA(numItems, isDouble) =>
                     let
                         fun storeFPRegs(0w0, _, _) = []
                         |   storeFPRegs(0w1, fpRegNo, offset) =
                                 [(if isDouble then storeRegScaledDouble else storeRegScaledFloat)
                                     {regT=VReg fpRegNo, regN=resultAreaPtr, unitOffset=offset}]
                         |   storeFPRegs(n, fpRegNo, offset) =
                                 (if isDouble then storePairOffsetDouble else storePairOffsetFloat)
                                     {regT1=VReg fpRegNo, regT2=VReg(fpRegNo+0w1), regN=resultAreaPtr, unitOffset=offset} ::
                                 storeFPRegs(n-0w2, fpRegNo+0w2, offset+2)
                     in
                         (storeFPRegs(numItems, 0w0 (* V0-Vn*), 0), false)
                     end
 
                 |   ArgLargeStruct => ([], true) (* Structures larger than 16 bytes are passed by reference. *)
 
                 |   _ =>
                     if size = 0w16
                     then ([storePairOffset{regT1=X0, regT2=X1, regN=resultAreaPtr, unitOffset=0}], false)
                     else if size > 0w8
                     then (storeRegScaled{regT=X0, regN=resultAreaPtr, unitOffset=0} ::
                             storeResult(X1, 8, size-0w8), false)
                     else (storeResult(X0, 0, size), false)
         end
 
         val (argCode, argStack, largeStructSpace) =
             loadArgs(args, 0, 0w0, 0w0, 0w0,
                 if passArgAddress (* If we have to pass the address of the result struct it goes in X8. *)
                 then [moveRegToReg{sReg=resultAreaPtr, dReg=X8}]
                 else [], 0w0)
 
         val stackSpaceRequired = alignUp(Word.fromInt argStack * 0w8, 0w16) + largeStructSpace
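+        (* Each stack argument occupies an 8-byte slot and AAPCS64 requires SP
+           to remain 16-byte aligned, hence the rounding; the area for copies
+           of large structs sits above the argument slots and is addressed
+           through structSpacePtr. *)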
         val instructions =
             [(* Push the return address to the stack.  We could put it in a callee-save
                 register but there's a very small chance that this could be the last
                 reference to a piece of code. *)
              storeRegPreIndex{regT=X30, regN=X_MLStackPtr, byteOffset= ~8},
              (* Save heap ptr.  Needed in case we have a callback. *)
-             storeRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset},
+             storeRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset}
+            ] @ indexToAbsoluteAddress(X0, X0) @
             (* Load the entry point address. *)
-             loadRegScaled{regT=entryPtReg, regN=X0, unitOffset=0}] @
+            loadRegScaled{regT=entryPtReg, regN=X0, unitOffset=0} ::
             (
                 (* Unbox the address of the result area into a callee save register.  This is
                    where the result will be stored on return if it is anything other than a
                    struct.  We have to put the C address in there now because an ML address
                    wouldn't be updated by a possible GC in a callback. *)
                 if #typeForm(result) <> CTypeVoid
-                then [loadRegScaled{regT=resultAreaPtr, regN=X2, unitOffset=0}]
+                then indexToAbsoluteAddress(X2, X2) @ [loadRegScaled{regT=resultAreaPtr, regN=X2, unitOffset=0}]
                 else []
             ) @
             [storeRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset}] @ (* Save the stack pointer. *)
             (
                 if stackSpaceRequired = 0w0
                 then []
                 else [subImmediate{regN=XSP, regD=XSP, immed=stackSpaceRequired, shifted=false}]
             ) @
             (
                 (* If we need to copy a struct load a register with a pointer to the area for it. *)
                 if largeStructSpace = 0w0
                 then []
                 else [addImmediate{regN=XSP, regD=structSpacePtr, immed=stackSpaceRequired-largeStructSpace, shifted=false}]
             ) @
             (
                 (* The second argument is a SysWord containing the address of a malloced area
                    of memory with the actual arguments in it. *)
                 if null args
                 then []
-                else [loadRegScaled{regT=argPtrReg, regN=X1, unitOffset=0}]
+                else indexToAbsoluteAddress(X1, X1) @ [loadRegScaled{regT=argPtrReg, regN=X1, unitOffset=0}]
             ) @ argCode @
             [branchAndLinkReg X16] @ (* Call the function. *)
             (* Restore the C stack value in case it's been changed by a callback. *)
             (
                 if stackSpaceRequired = 0w0
                 then []
                 else [addImmediate{regN=XSP, regD=XSP, immed=stackSpaceRequired, shifted=false}]
             ) @
             [
                 (* Reload the ML stack pointer even though it's callee save.  If we've made a
                    callback the ML stack could have grown and so moved to a different address. *)
                 loadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset},
                 (* Load the heap allocation ptr and limit in case of a callback. *)
                 loadRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset},
                 loadRegScaled{regT=X_MLHeapLimit, regN=X_MLAssemblyInt, unitOffset=heapLimitPtrOffset}
             ] @
             (* Store the result in the destination. *)
             getResult @
             (* Pop the return address and return. *)
             [
                 loadRegPostIndex{regT=X30, regN=X_MLStackPtr, byteOffset= 8},
                 returnRegister X30
             ]
 
         val functionName = "foreignCall"
         val debugSwitches =
             [(*Universal.tagInject Pretty.compilerOutputTag (Pretty.prettyPrint(print, 70)),
                Universal.tagInject Debug.assemblyCodeTag true*)]
         val closure = makeConstantClosure()
         val () = generateCode{instrs=instructions, name=functionName, parameters=debugSwitches, resultClosure=closure}
     in
         closureAsAddress closure
     end
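+
+    (* To summarise the result handling above: an HFA result is returned in
+       V0-V3, any other composite of up to 16 bytes in X0 and X1, and a larger
+       struct is written by the callee through the address passed in X8, which
+       is why passArgAddress seeds loadArgs with the move of the unboxed
+       result pointer into X8. *)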
     (* Build a callback function.  The arguments are the abi, the list of argument types
        and the result type.  The result is an ML function that takes an ML function, f,
        as its argument, registers it as a callback and returns the C function as its
        result.  When the C function is called the arguments are copied into temporary
        memory and the vector passed to f along with the address of the memory for the
        result.  "f" stores the result in it when it returns and the result is then
        passed back as the result of the callback.
        N.B. This returns a closure cell which contains the address of the code.  It can
        be used as a SysWord.word value except that while it exists the code will not be
        GCed. *)
     fun buildCallBack(_: abi, args: cType list, result: cType): Address.machineWord =
     let
+        val _ = if is32in64 then raise Foreign "Callbacks not yet implemented in compact 32-bit ARM" else ()
         val argWorkReg = X10 (* Used in loading arguments if necessary. *)
         and argWorkReg2 = X11
         and argWorkReg3 = X13
         and argWorkReg4 = X14
 
         (* The stack contains a 32-byte result area then an aligned area for the arguments. *)
 
         (* Store the argument values to the structure that will be passed to the ML
           callback function. *)
         (* Note.  We've loaded the frame pointer with the original stack ptr-96 so we
            can access any stack arguments from that. *)
         fun moveArgs([], _, _, _, _, moveFromStack) = moveFromStack
 
         |   moveArgs(arg::args, stackSpace, argOffset, gRegNo, fpRegNo, moveFromStack) =
             let
                 val {size, align, typeForm, ...} = arg
                 val newArgOffset = alignUp(argOffset, align)
             in
                 case classifyArg(typeForm, size) of
                     ArgClassHFA(numItems, isDouble) =>
                         if fpRegNo + numItems <= 0w8
                         then
                         let
                             val scale = if isDouble then 0w8 else 0w4
                             (* Store the values from the FP registers. *)
                             fun storeFPRegs(0w0, _, _) = []
                             |   storeFPRegs(0w1, fpRegNo, offset) =
                                     [(if isDouble then storeRegScaledDouble else storeRegScaledFloat)
                                         {regT=VReg fpRegNo, regN=XSP, unitOffset=offset}]
                             |   storeFPRegs(n, fpRegNo, offset) =
                                     (if isDouble then storePairOffsetDouble else storePairOffsetFloat)
                                         {regT1=VReg fpRegNo, regT2=VReg(fpRegNo+0w1), regN=XSP, unitOffset=offset} ::
                                     storeFPRegs(n-0w2, fpRegNo+0w2, offset+2)
                         in
                             moveArgs(args, stackSpace, newArgOffset+size, gRegNo, fpRegNo+numItems,
                                 storeFPRegs(numItems, fpRegNo, Word.toInt(newArgOffset div scale)) @ moveFromStack)
                         end
                         else
                         let
                             (* Load the arguments from the stack and store into the result area. *)
                             fun copyData(0w0, _, _) = []
                             |   copyData(n, dstOffset, stackOffset) =
                                     if isDouble
                                     then loadRegScaled{regT=argWorkReg2, regN=X29, unitOffset=stackOffset} ::
                                          storeRegScaled{regT=argWorkReg2, regN=XSP, unitOffset=dstOffset} ::
                                          copyData(n-0w1, dstOffset+1, stackOffset+1)
                                     else loadRegScaled32{regT=argWorkReg2, regN=X29, unitOffset=stackOffset} ::
                                          storeRegScaled32{regT=argWorkReg2, regN=XSP, unitOffset=dstOffset} ::
                                          copyData(n-0w1, dstOffset+1, stackOffset+1)
 
                             val copyFromStack =
                                 if isDouble
                                 then copyData(numItems, Word.toInt(newArgOffset div 0w8), stackSpace)
                                 else copyData(numItems, Word.toInt(newArgOffset div 0w4), stackSpace*2)
                             (* The overall size is rounded up to a multiple of 8 *)
                             val newStackOffset = stackSpace + Word.toInt(alignUp(size, 0w8) div 0w8)
                         in
                             moveArgs(args, newStackOffset, newArgOffset+size, gRegNo, 0w8,
                                 copyFromStack @ moveFromStack)
                         end
 
                 |   _ =>
                     if alignedLoadStore(newArgOffset, size) andalso
                         (gRegNo <= 0w6 orelse gRegNo = 0w7 andalso size <= 0w8)
                     then (* Usual case: argument passed in one or two registers. *)
                     (
                         if size > 0w8
                         then moveArgs(args, stackSpace, newArgOffset+size, gRegNo + 0w2, fpRegNo,
                                 storePairOffset{regT1=XReg gRegNo, regT2=XReg(gRegNo+0w1), regN=XSP,
                                     unitOffset=Word.toInt(newArgOffset div 0w8)} :: moveFromStack)
                         else moveArgs(args, stackSpace, newArgOffset+size, gRegNo + 0w1, fpRegNo,
                                 storeUpTo8(XReg gRegNo, XSP, Word.toInt newArgOffset, size) @ moveFromStack)
                     )
 
                     else
                     (* General case.  Store the argument registers if necessary and then use a
                        byte copy to copy into the argument area.  This sorts out any odd
                        alignments or lengths.  In some cases the source will be in memory
                        already. *)
                     let
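+                        (* Stack arguments are reached relative to X29.  The
+                           prologue below pushes FP/LR and five pairs of
+                           callee-save registers, 96 bytes in all, and then
+                           sets X29 to the new SP, so the first incoming stack
+                           argument is at X29+96; that is also why
+                           copyArgsFromRegsAndStack starts its stack offset at
+                           unit 12 (12 * 8 = 96). *)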
                         (* The source is either the register value or the value on the stack. *)
                         val (argRegNo, nextGReg, newStack, loadArg) =
                             if size > 0w16
                             then
                             (
                                 if gRegNo < 0w8
                                 then (XReg gRegNo, gRegNo + 0w1, stackSpace, [])
                                 else (argWorkReg, 0w8, stackSpace+1,
                                         [loadRegScaled{regT=argWorkReg, regN=X29, unitOffset=stackSpace}])
                             )
                             else
                             let
                                 val regsNeeded = if size > 0w8 then 0w2 else 0w1
                             in
                                 if gRegNo + regsNeeded <= 0w8
                                 then (XReg gRegNo, gRegNo+regsNeeded, stackSpace,
                                         [if size > 0w8
                                          then storePairOffset{regT1=XReg gRegNo, regT2=XReg(gRegNo+0w1), regN=XSP, unitOffset=2}
                                          else storeRegScaled{regT=XReg gRegNo, regN=XSP, unitOffset=2},
                                          addImmediate{regD=XReg gRegNo, regN=XSP, immed=0w16, shifted=false}])
                                 else (* Being passed on the stack *)
                                     (argWorkReg, 0w8, stackSpace+Word8.toInt regsNeeded,
                                         [addImmediate{regD=argWorkReg, regN=X29, immed=Word.fromInt stackSpace*0w8, shifted=false}])
                             end
 
                         val loopLabel = createLabel()
                         val copyCode =
                         [
                             addImmediate{regN=argRegNo, regD=argWorkReg3, immed=size, shifted=false}, (* End of source area *)
                             addImmediate{regN=XSP, regD=argWorkReg2, immed=newArgOffset+size, shifted=false}, (* end of dest *)
                             setLabel loopLabel,
                             loadRegPreIndexByte{regT=argWorkReg4, regN=argWorkReg3, byteOffset= ~1},
                             storeRegPreIndexByte{regT=argWorkReg4, regN=argWorkReg2, byteOffset= ~1},
                             subSShiftedReg{regM=argWorkReg3, regN=argRegNo, regD=XZero, shift=ShiftNone}, (* At start? *)
                             conditionalBranch(condNotEqual, loopLabel)
                         ]
                     in
                         moveArgs(args, newStack, newArgOffset+size, nextGReg, fpRegNo,
                             loadArg @ copyCode @ moveFromStack)
                     end
             end
 
         val copyArgsFromRegsAndStack =
             moveArgs(args, 12 (* Offset to first stack arg *), 0w32 (* Size of result area *), 0w0, 0w0, [])
 
         local
             fun getNextSize (arg, argOffset) =
             let val {size, align, ...} = arg in alignUp(argOffset, align) + size end
         in
             val argumentSpace = alignUp(List.foldl getNextSize 0w0 args, 0w16)
         end
 
         local
             val {size, typeForm, ...} = result
         in
             (* Load the results from the result area except that if we're passing the
                result structure by reference this is done by the caller.  Generally
                similar to how arguments are passed in a call. *)
             val (loadResults, resultByReference) =
                 if typeForm = CTypeVoid
                 then ([], false)
                 else case classifyArg(typeForm, size) of
                     ArgClassHFA(numItems, isDouble) =>
                     let
                         (* Load the values to the floating point registers. *)
                         fun loadFPRegs(0w0, _, _) = []
                         |   loadFPRegs(0w1, fpRegNo, offset) =
                                 [(if isDouble then loadRegScaledDouble else loadRegScaledFloat)
                                     {regT=VReg fpRegNo, regN=XSP, unitOffset=offset}]
                         |   loadFPRegs(n, fpRegNo, offset) =
                                 (if isDouble then loadPairOffsetDouble else loadPairOffsetFloat)
                                     {regT1=VReg fpRegNo, regT2=VReg(fpRegNo+0w1), regN=XSP, unitOffset=offset} ::
                                 loadFPRegs(n-0w2, fpRegNo+0w2, offset+2)
                     in
                         (loadFPRegs(numItems, 0w0, 0 (* result area *)), false)
                     end
 
                 |   ArgLargeStruct => ([], true) (* Structures larger than 16 bytes are passed by reference. *)
 
                 |   _ =>
                     (* We've allocated a 32-byte area aligned onto a 16-byte boundary so
                        we can simply load one or two registers. *)
                     if size > 0w8
                     then ([loadPairOffset{regT1=X0, regT2=X1, regN=XSP, unitOffset=0}], false)
                     else ([loadRegScaled{regT=X0, regN=XSP, unitOffset=0}], false)
         end
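+
+        (* The result area is 32 bytes because that is the largest result ever
+           returned in registers: an HFA of four doubles.  Anything bigger is
+           ArgLargeStruct and is written through the caller's X8 pointer
+           instead, so it never touches this area. *)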
         val instructions =
             [
                 (* Push LR, FP and the callee-save registers. *)
                 storePairPreIndexed{regT1=X29, regT2=X30, regN=XSP, unitOffset= ~12},
                 moveRegToReg{sReg=XSP, dReg=X29},
                 storePairOffset{regT1=X19, regT2=X20, regN=X29, unitOffset=2},
                 storePairOffset{regT1=X21, regT2=X22, regN=X29, unitOffset=4},
                 storePairOffset{regT1=X23, regT2=X24, regN=X29, unitOffset=6},
                 storePairOffset{regT1=X25, regT2=X26, regN=X29, unitOffset=8},
                 storePairOffset{regT1=X27, regT2=X28, regN=X29, unitOffset=10},
                 (* Reserve space for the arguments and results. *)
                 subImmediate{regN=XSP, regD=XSP, immed=argumentSpace+0w32, shifted=false},
                 (* We passed the function we're calling in X9 but we need to move it to a
                    callee-save register before we call the RTS. *)
                 moveRegToReg{sReg=X9, dReg=X20}
             ] @ (* Save X8 if we're going to need it. *)
             (if resultByReference then [storeRegScaled{regT=X8, regN=XSP, unitOffset=0}] else []) @
             copyArgsFromRegsAndStack @
             [
                 (* Call into the RTS to get the thread data ptr. *)
                 loadAddressConstant(X0, getThreadDataCall),
                 loadRegScaled{regT=X0, regN=X0, unitOffset=0},
                 branchAndLinkReg X0,
                 moveRegToReg{sReg=X0, dReg=X_MLAssemblyInt},
                 (* Load the ML regs. *)
                 loadRegScaled{regT=X_MLHeapLimit, regN=X_MLAssemblyInt, unitOffset=heapLimitPtrOffset},
                 loadRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset},
                 loadRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset},
                 (* Prepare the arguments.  They are both syswords so have to be boxed.
                    First load the address of the argument area which is after the 32-byte
                    result area. *)
                 addImmediate{regN=XSP, regD=X2, immed=0w32, shifted=false}
             ] @ boxSysWord(X2, X0, X3) @ (* Address of arguments. *)
             (
                 (* Result area pointer.  If we're returning by reference this is the
                    original value of X8 otherwise it's the address of the 32 bytes
                    we've reserved. *)
                 if resultByReference
                 then [loadRegScaled{regT=X2, regN=XSP, unitOffset=0}]
                 else [moveRegToReg{sReg=XSP, dReg=X2}]
             ) @ boxSysWord(X2, X1, X3) @
             [
                 (* Put the ML closure pointer, originally in X9 now in X20, into the ML
                    closure pointer register, X8.  Then call the ML code. *)
                 moveRegToReg{sReg=X20, dReg=X8},
                 loadRegScaled{regT=X16, regN=X8, unitOffset=0},
                 branchAndLinkReg X16,
                 (* Save the ML stack and heap pointers.  We could have allocated or grown
                    the stack.  The limit pointer is maintained by the RTS. *)
                 storeRegScaled{regT=X_MLHeapAllocPtr, regN=X_MLAssemblyInt, unitOffset=heapAllocPtrOffset},
                 storeRegScaled{regT=X_MLStackPtr, regN=X_MLAssemblyInt, unitOffset=mlStackPtrOffset}
             ] @ loadResults @ (* Load the return values *)
             [
                 (* Restore the callee-save registers and return. *)
                 moveRegToReg{sReg=X29, dReg=XSP},
                 loadPairOffset{regT1=X19, regT2=X20, regN=X29, unitOffset=2},
                 loadPairOffset{regT1=X21, regT2=X22, regN=X29, unitOffset=4},
                 loadPairOffset{regT1=X23, regT2=X24, regN=X29, unitOffset=6},
                 loadPairOffset{regT1=X25, regT2=X26, regN=X29, unitOffset=8},
                 loadPairOffset{regT1=X27, regT2=X28, regN=X29, unitOffset=10},
                 loadPairPostIndexed{regT1=X29, regT2=X30, regN=XSP, unitOffset=12},
                 returnRegister X30
             ]
 
         val functionName = "foreignCallBack(2)"
         val debugSwitches =
             [(*Universal.tagInject Pretty.compilerOutputTag (Pretty.prettyPrint(print, 70)),
                Universal.tagInject Debug.assemblyCodeTag true*)]
         val closure = makeConstantClosure()
         val () = generateCode{instrs=instructions, name=functionName, parameters=debugSwitches, resultClosure=closure}
         val stage2Code = closureAsAddress closure
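+
+        (* Everything up to stage2Code is generated once per C function type;
+           only the small stub built below is generated for each individual
+           callback.  The stub loads the particular ML function into X9, which
+           the stage 2 code immediately moves to the callee-save X20, and then
+           jumps to the shared stage 2 code. *)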
         fun resultFunction f =
         let
             (* Generate a small function to load the address of f into a register and
                then jump to stage2.  The idea is that it should be possible to generate
                this eventually in a single RTS call.  That could be done by using a
                version of this as a model. *)
             val codeAddress = Address.loadWord(Address.toAddress stage2Code, 0w0)
             val instructions =
             [
                 loadAddressConstant(X9, Address.toMachineWord f),
                 loadAddressConstant(X16, codeAddress),
                 branchRegister X16
             ]
             val functionName = "foreignCallBack(1)"
             val debugSwitches =
                 [(*Universal.tagInject Pretty.compilerOutputTag (Pretty.prettyPrint(print, 70)),
                    Universal.tagInject Debug.assemblyCodeTag true*)]
             val closure = makeConstantClosure()
             val () = generateCode{instrs=instructions, name=functionName, parameters=debugSwitches, resultClosure=closure}
             val res = closureAsAddress closure
             (*val _ = print("Address is " ^ (LargeWord.toString(RunCall.unsafeCast res)) ^ "\n")*)
         in
             res
         end
     in
         Address.toMachineWord resultFunction
     end
 
 end;