You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2017/09/21 17:08:27 UTC
[1/2] systemml git commit: [SYSTEMML-1923] Support sinh,
cosh and tanh as built-in functions
Repository: systemml
Updated Branches:
refs/heads/master aa15197ec -> 50a895f86
http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
index a4aa093..541fe30 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
@@ -27,6 +27,7 @@ import org.apache.sysml.runtime.util.UtilFunctions;
public class CNodeUnary extends CNode
{
+ // TODO: Add support for SINH, COSH and TANH
public enum UnaryType {
LOOKUP_R, LOOKUP_C, LOOKUP_RC, LOOKUP0, CBIND0, //codegen specific
ROW_SUMS, ROW_MINS, ROW_MAXS, //codegen specific
http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBased.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBased.java b/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBased.java
index c230505..acb90e2 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBased.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBased.java
@@ -701,6 +701,9 @@ public class PlanSelectionFuseCostBased extends PlanSelection
case ASIN: costs = 93; break;
case ACOS: costs = 103; break;
case ATAN: costs = 40; break;
+ case SINH: costs = 93; break; // TODO: calibrate SINH/COSH/TANH costs (placeholders copied from ASIN/ACOS/ATAN)
+ case COSH: costs = 103; break;
+ case TANH: costs = 40; break;
case CUMSUM:
case CUMMIN:
case CUMMAX:
http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java b/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java
index 1bf42f6..30631d0 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java
@@ -928,6 +928,9 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
case ASIN: costs = 93; break;
case ACOS: costs = 103; break;
case ATAN: costs = 40; break;
+ case SINH: costs = 93; break; // TODO: calibrate SINH/COSH/TANH costs (placeholders copied from ASIN/ACOS/ATAN)
+ case COSH: costs = 103; break;
+ case TANH: costs = 40; break;
case CUMSUM:
case CUMMIN:
case CUMMAX:
http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/java/org/apache/sysml/lops/Unary.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/lops/Unary.java b/src/main/java/org/apache/sysml/lops/Unary.java
index cc53666..36f186a 100644
--- a/src/main/java/org/apache/sysml/lops/Unary.java
+++ b/src/main/java/org/apache/sysml/lops/Unary.java
@@ -38,7 +38,7 @@ public class Unary extends Lop
public enum OperationTypes {
ADD, SUBTRACT, SUBTRACTRIGHT, MULTIPLY, MULTIPLY2, DIVIDE, MODULUS, INTDIV, MINUS1_MULTIPLY,
- POW, POW2, LOG, MAX, MIN, NOT, ABS, SIN, COS, TAN, ASIN, ACOS, ATAN, SIGN, SQRT, EXP, Over,
+ POW, POW2, LOG, MAX, MIN, NOT, ABS, SIN, COS, TAN, ASIN, ACOS, ATAN, SINH, COSH, TANH, SIGN, SQRT, EXP, Over,
LESS_THAN, LESS_THAN_OR_EQUALS, GREATER_THAN, GREATER_THAN_OR_EQUALS, EQUALS, NOT_EQUALS,
ROUND, CEIL, FLOOR, MR_IQM, INVERSE, CHOLESKY,
CUMSUM, CUMPROD, CUMMIN, CUMMAX,
@@ -194,6 +194,12 @@ public class Unary extends Lop
return "acos";
case ATAN:
return "atan";
+ case SINH:
+ return "sinh";
+ case COSH:
+ return "cosh";
+ case TANH:
+ return "tanh";
case SIGN:
return "sign";
case SQRT:
http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/java/org/apache/sysml/lops/UnaryCP.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/lops/UnaryCP.java b/src/main/java/org/apache/sysml/lops/UnaryCP.java
index fad0304..c83baf4 100644
--- a/src/main/java/org/apache/sysml/lops/UnaryCP.java
+++ b/src/main/java/org/apache/sysml/lops/UnaryCP.java
@@ -34,7 +34,7 @@ public class UnaryCP extends Lop
{
public enum OperationTypes {
- NOT, ABS, SIN, COS, TAN, ASIN, ACOS, ATAN, SQRT, LOG, EXP,
+ NOT, ABS, SIN, COS, TAN, ASIN, ACOS, ATAN, SQRT, LOG, EXP, SINH, COSH, TANH,
CAST_AS_SCALAR, CAST_AS_MATRIX, CAST_AS_FRAME, CAST_AS_DOUBLE, CAST_AS_INT, CAST_AS_BOOLEAN,
PRINT, NROW, NCOL, LENGTH, ROUND, STOP, CEIL, FLOOR, CUMSUM
};
@@ -106,6 +106,15 @@ public class UnaryCP extends Lop
case ATAN:
return "atan";
+ case SINH:
+ return "sinh";
+
+ case COSH:
+ return "cosh";
+
+ case TANH:
+ return "tanh";
+
case SQRT:
return "sqrt";
http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/java/org/apache/sysml/parser/BuiltinFunctionExpression.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/parser/BuiltinFunctionExpression.java b/src/main/java/org/apache/sysml/parser/BuiltinFunctionExpression.java
index e03add3..1e54251 100644
--- a/src/main/java/org/apache/sysml/parser/BuiltinFunctionExpression.java
+++ b/src/main/java/org/apache/sysml/parser/BuiltinFunctionExpression.java
@@ -1322,6 +1322,9 @@ public class BuiltinFunctionExpression extends DataIdentifier
case ACOS:
case ASIN:
case ATAN:
+ case COSH:
+ case SINH:
+ case TANH:
case SIGN:
case SQRT:
case ABS:
@@ -1345,6 +1348,9 @@ public class BuiltinFunctionExpression extends DataIdentifier
case ACOS:
case ASIN:
case ATAN:
+ case COSH:
+ case SINH:
+ case TANH:
case SIGN:
case SQRT:
case ABS:
@@ -1552,6 +1558,12 @@ public class BuiltinFunctionExpression extends DataIdentifier
bifop = Expression.BuiltinFunctionOp.ASIN;
else if (functionName.equals("atan"))
bifop = Expression.BuiltinFunctionOp.ATAN;
+ else if (functionName.equals("cosh"))
+ bifop = Expression.BuiltinFunctionOp.COSH;
+ else if (functionName.equals("sinh"))
+ bifop = Expression.BuiltinFunctionOp.SINH;
+ else if (functionName.equals("tanh"))
+ bifop = Expression.BuiltinFunctionOp.TANH;
else if (functionName.equals("diag"))
bifop = Expression.BuiltinFunctionOp.DIAG;
else if (functionName.equals("exp"))
http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/java/org/apache/sysml/parser/DMLTranslator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/parser/DMLTranslator.java b/src/main/java/org/apache/sysml/parser/DMLTranslator.java
index f44c0a4..123fe19 100644
--- a/src/main/java/org/apache/sysml/parser/DMLTranslator.java
+++ b/src/main/java/org/apache/sysml/parser/DMLTranslator.java
@@ -2765,6 +2765,9 @@ public class DMLTranslator
case ASIN:
case ACOS:
case ATAN:
+ case SINH:
+ case COSH:
+ case TANH:
case SIGN:
case SQRT:
case EXP:
@@ -2798,6 +2801,15 @@ public class DMLTranslator
case ATAN:
mathOp1 = Hop.OpOp1.ATAN;
break;
+ case SINH:
+ mathOp1 = Hop.OpOp1.SINH;
+ break;
+ case COSH:
+ mathOp1 = Hop.OpOp1.COSH;
+ break;
+ case TANH:
+ mathOp1 = Hop.OpOp1.TANH;
+ break;
case SIGN:
mathOp1 = Hop.OpOp1.SIGN;
break;
http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/java/org/apache/sysml/parser/Expression.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/parser/Expression.java b/src/main/java/org/apache/sysml/parser/Expression.java
index a68f59f..053a5a3 100644
--- a/src/main/java/org/apache/sysml/parser/Expression.java
+++ b/src/main/java/org/apache/sysml/parser/Expression.java
@@ -78,6 +78,7 @@ public abstract class Expression implements ParseInfo
COLSUM,
COLVAR,
COS,
+ COSH,
COV,
CUMMAX,
CUMMIN,
@@ -123,6 +124,7 @@ public abstract class Expression implements ParseInfo
SD,
SEQ,
SIN,
+ SINH,
SIGN,
SOLVE,
SQRT,
@@ -130,6 +132,7 @@ public abstract class Expression implements ParseInfo
SVD,
TABLE,
TAN,
+ TANH,
TRACE,
TRANS,
VAR
http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/java/org/apache/sysml/runtime/functionobjects/Builtin.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/functionobjects/Builtin.java b/src/main/java/org/apache/sysml/runtime/functionobjects/Builtin.java
index 753e494..a05eefe 100644
--- a/src/main/java/org/apache/sysml/runtime/functionobjects/Builtin.java
+++ b/src/main/java/org/apache/sysml/runtime/functionobjects/Builtin.java
@@ -49,7 +49,7 @@ public class Builtin extends ValueFunction
private static final long serialVersionUID = 3836744687789840574L;
- public enum BuiltinCode { SIN, COS, TAN, ASIN, ACOS, ATAN, LOG, LOG_NZ, MIN, MAX, ABS, SIGN, SQRT, EXP, PLOGP, PRINT, PRINTF, NROW, NCOL, LENGTH, ROUND, MAXINDEX, MININDEX, STOP, CEIL, FLOOR, CUMSUM, CUMPROD, CUMMIN, CUMMAX, INVERSE, SPROP, SIGMOID, SELP };
+ public enum BuiltinCode { SIN, COS, TAN, SINH, COSH, TANH, ASIN, ACOS, ATAN, LOG, LOG_NZ, MIN, MAX, ABS, SIGN, SQRT, EXP, PLOGP, PRINT, PRINTF, NROW, NCOL, LENGTH, ROUND, MAXINDEX, MININDEX, STOP, CEIL, FLOOR, CUMSUM, CUMPROD, CUMMIN, CUMMAX, INVERSE, SPROP, SIGMOID, SELP };
public BuiltinCode bFunc;
private static final boolean FASTMATH = true;
@@ -61,6 +61,9 @@ public class Builtin extends ValueFunction
String2BuiltinCode.put( "sin" , BuiltinCode.SIN);
String2BuiltinCode.put( "cos" , BuiltinCode.COS);
String2BuiltinCode.put( "tan" , BuiltinCode.TAN);
+ String2BuiltinCode.put( "sinh" , BuiltinCode.SINH);
+ String2BuiltinCode.put( "cosh" , BuiltinCode.COSH);
+ String2BuiltinCode.put( "tanh" , BuiltinCode.TANH);
String2BuiltinCode.put( "asin" , BuiltinCode.ASIN);
String2BuiltinCode.put( "acos" , BuiltinCode.ACOS);
String2BuiltinCode.put( "atan" , BuiltinCode.ATAN);
@@ -95,7 +98,7 @@ public class Builtin extends ValueFunction
}
// We should create one object for every builtin function that we support
- private static Builtin sinObj = null, cosObj = null, tanObj = null, asinObj = null, acosObj = null, atanObj = null;
+ private static Builtin sinObj = null, cosObj = null, tanObj = null, sinhObj = null, coshObj = null, tanhObj = null, asinObj = null, acosObj = null, atanObj = null;
private static Builtin logObj = null, lognzObj = null, minObj = null, maxObj = null, maxindexObj = null, minindexObj=null;
private static Builtin absObj = null, signObj = null, sqrtObj = null, expObj = null, plogpObj = null, printObj = null, printfObj;
private static Builtin nrowObj = null, ncolObj = null, lengthObj = null, roundObj = null, ceilObj=null, floorObj=null;
@@ -135,6 +138,19 @@ public class Builtin extends ValueFunction
if ( tanObj == null )
tanObj = new Builtin(BuiltinCode.TAN);
return tanObj;
+ case SINH:
+ if ( sinhObj == null )
+ sinhObj = new Builtin(BuiltinCode.SINH);
+ return sinhObj;
+
+ case COSH:
+ if ( coshObj == null )
+ coshObj = new Builtin(BuiltinCode.COSH);
+ return coshObj;
+ case TANH:
+ if ( tanhObj == null )
+ tanhObj = new Builtin(BuiltinCode.TANH);
+ return tanhObj;
case ASIN:
if ( asinObj == null )
asinObj = new Builtin(BuiltinCode.ASIN);
@@ -282,6 +298,10 @@ public class Builtin extends ValueFunction
case ASIN: return FASTMATH ? FastMath.asin(in) : Math.asin(in);
case ACOS: return FASTMATH ? FastMath.acos(in) : Math.acos(in);
case ATAN: return Math.atan(in); //faster in Math
+ // FastMath.*h is faster 98% of time than Math.*h in initial micro-benchmarks
+ case SINH: return FASTMATH ? FastMath.sinh(in) : Math.sinh(in);
+ case COSH: return FASTMATH ? FastMath.cosh(in) : Math.cosh(in);
+ case TANH: return FASTMATH ? FastMath.tanh(in) : Math.tanh(in);
case CEIL: return FASTMATH ? FastMath.ceil(in) : Math.ceil(in);
case FLOOR: return FASTMATH ? FastMath.floor(in) : Math.floor(in);
case LOG: return Math.log(in); //faster in Math
http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
index 2f77710..f6b880d 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
@@ -158,6 +158,9 @@ public class CPInstructionParser extends InstructionParser
String2CPInstructionType.put( "sin" , CPINSTRUCTION_TYPE.BuiltinUnary);
String2CPInstructionType.put( "cos" , CPINSTRUCTION_TYPE.BuiltinUnary);
String2CPInstructionType.put( "tan" , CPINSTRUCTION_TYPE.BuiltinUnary);
+ String2CPInstructionType.put( "sinh" , CPINSTRUCTION_TYPE.BuiltinUnary);
+ String2CPInstructionType.put( "cosh" , CPINSTRUCTION_TYPE.BuiltinUnary);
+ String2CPInstructionType.put( "tanh" , CPINSTRUCTION_TYPE.BuiltinUnary);
String2CPInstructionType.put( "asin" , CPINSTRUCTION_TYPE.BuiltinUnary);
String2CPInstructionType.put( "acos" , CPINSTRUCTION_TYPE.BuiltinUnary);
String2CPInstructionType.put( "atan" , CPINSTRUCTION_TYPE.BuiltinUnary);
http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index 6f1ed91..503576f 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -91,6 +91,9 @@ public class GPUInstructionParser extends InstructionParser
String2GPUInstructionType.put( "sin", GPUINSTRUCTION_TYPE.BuiltinUnary);
String2GPUInstructionType.put( "cos", GPUINSTRUCTION_TYPE.BuiltinUnary);
String2GPUInstructionType.put( "tan", GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "sinh", GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "cosh", GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "tanh", GPUINSTRUCTION_TYPE.BuiltinUnary);
String2GPUInstructionType.put( "asin", GPUINSTRUCTION_TYPE.BuiltinUnary);
String2GPUInstructionType.put( "acos", GPUINSTRUCTION_TYPE.BuiltinUnary);
String2GPUInstructionType.put( "atan", GPUINSTRUCTION_TYPE.BuiltinUnary);
http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/java/org/apache/sysml/runtime/instructions/SPInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/SPInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/SPInstructionParser.java
index 2ebb2e1..b5f8a04 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/SPInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/SPInstructionParser.java
@@ -216,6 +216,9 @@ public class SPInstructionParser extends InstructionParser
String2SPInstructionType.put( "asin" , SPINSTRUCTION_TYPE.BuiltinUnary);
String2SPInstructionType.put( "acos" , SPINSTRUCTION_TYPE.BuiltinUnary);
String2SPInstructionType.put( "atan" , SPINSTRUCTION_TYPE.BuiltinUnary);
+ String2SPInstructionType.put( "sinh" , SPINSTRUCTION_TYPE.BuiltinUnary);
+ String2SPInstructionType.put( "cosh" , SPINSTRUCTION_TYPE.BuiltinUnary);
+ String2SPInstructionType.put( "tanh" , SPINSTRUCTION_TYPE.BuiltinUnary);
String2SPInstructionType.put( "sign" , SPINSTRUCTION_TYPE.BuiltinUnary);
String2SPInstructionType.put( "sqrt" , SPINSTRUCTION_TYPE.BuiltinUnary);
String2SPInstructionType.put( "plogp" , SPINSTRUCTION_TYPE.BuiltinUnary);
http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index 149de80..bc3ba9b 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -104,6 +104,9 @@ public abstract class GPUInstruction extends Instruction {
public final static String MISC_TIMER_SIN_KERNEL = "sink"; // time spent in the sin kernel
public final static String MISC_TIMER_COS_KERNEL = "cosk"; // time spent in the cos kernel
public final static String MISC_TIMER_TAN_KERNEL = "tank"; // time spent in the tan kernel
+ public final static String MISC_TIMER_SINH_KERNEL = "sinhk"; // time spent in the sinh kernel
+ public final static String MISC_TIMER_COSH_KERNEL = "coshk"; // time spent in the cosh kernel
+ public final static String MISC_TIMER_TANH_KERNEL = "tanhk"; // time spent in the tanh kernel
public final static String MISC_TIMER_ASIN_KERNEL = "asink"; // time spent in the asin kernel
public final static String MISC_TIMER_ACOS_KERNEL = "acosk"; // time spent in the acos kernel
public final static String MISC_TIMER_ATAN_KERNEL = "atank"; // time spent in the atan kernel
http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
index 5096566..1718d69 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
@@ -66,6 +66,12 @@ public class MatrixBuiltinGPUInstruction extends BuiltinUnaryGPUInstruction {
LibMatrixCUDA.cos(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
case "tan":
LibMatrixCUDA.tan(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
+ case "sinh":
+ LibMatrixCUDA.sinh(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
+ case "cosh":
+ LibMatrixCUDA.cosh(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
+ case "tanh":
+ LibMatrixCUDA.tanh(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
case "asin":
LibMatrixCUDA.asin(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
case "acos":
http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 5f31f28..4b2cd73 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -2604,6 +2604,57 @@ public class LibMatrixCUDA {
// tan(0) = 0
unaryOp(ec, gCtx, in1, "matrix_tan", 0, outputName, instName, GPUInstruction.MISC_TIMER_TAN_KERNEL);
}
+
+ /**
+ * Performs a "sinh" operation on a matrix on the GPU
+ * @param ec execution context
+ * @param gCtx a valid {@link GPUContext}
+ * @param instName the invoking instruction's name for record {@link Statistics}.
+ * @param in1 input matrix
+ * @param outputName output matrix name
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public static void sinh(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+ if(LOG.isTraceEnabled()) {
+ LOG.trace("GPU : sinh" + ", GPUContext=" + gCtx);
+ }
+ // sinh(0) = 0
+ unaryOp(ec, gCtx, in1, "matrix_sinh", 0, outputName, instName, GPUInstruction.MISC_TIMER_SINH_KERNEL);
+ }
+
+ /**
+ * Performs a "cosh" operation on a matrix on the GPU
+ * @param ec execution context
+ * @param gCtx a valid {@link GPUContext}
+ * @param instName the invoking instruction's name for record {@link Statistics}.
+ * @param in1 input matrix
+ * @param outputName output matrix name
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public static void cosh(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+ if(LOG.isTraceEnabled()) {
+ LOG.trace("GPU : cosh" + ", GPUContext=" + gCtx);
+ }
+ // cosh(0) = 1
+ unaryOp(ec, gCtx, in1, "matrix_cosh", 1, outputName, instName, GPUInstruction.MISC_TIMER_COSH_KERNEL);
+ }
+
+ /**
+ * Performs a "tanh" operation on a matrix on the GPU
+ * @param ec execution context
+ * @param gCtx a valid {@link GPUContext}
+ * @param instName the invoking instruction's name for record {@link Statistics}.
+ * @param in1 input matrix
+ * @param outputName output matrix name
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public static void tanh(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+ if(LOG.isTraceEnabled()) {
+ LOG.trace("GPU : tanh" + ", GPUContext=" + gCtx);
+ }
+ // tanh(0) = 0
+ unaryOp(ec, gCtx, in1, "matrix_tanh", 0, outputName, instName, GPUInstruction.MISC_TIMER_TANH_KERNEL);
+ }
/**
* Performs an "asin" operation on a matrix on the GPU
http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/java/org/apache/sysml/runtime/matrix/operators/UnaryOperator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/operators/UnaryOperator.java b/src/main/java/org/apache/sysml/runtime/matrix/operators/UnaryOperator.java
index 9b4f34f..743b1f3 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/operators/UnaryOperator.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/operators/UnaryOperator.java
@@ -43,6 +43,8 @@ public class UnaryOperator extends Operator
if( fn instanceof Builtin ) {
Builtin f=(Builtin)fn;
sparseSafe = (f.bFunc==Builtin.BuiltinCode.SIN || f.bFunc==Builtin.BuiltinCode.TAN
+ // sinh(0) = 0 and tanh(0) = 0, so zero cells stay zero and both are sparse-safe
+ || f.bFunc==Builtin.BuiltinCode.SINH || f.bFunc==Builtin.BuiltinCode.TANH
|| f.bFunc==Builtin.BuiltinCode.ROUND || f.bFunc==Builtin.BuiltinCode.ABS
|| f.bFunc==Builtin.BuiltinCode.SQRT || f.bFunc==Builtin.BuiltinCode.SPROP
|| f.bFunc==Builtin.BuiltinCode.SELP || f.bFunc==Builtin.BuiltinCode.LOG_NZ
http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/python/systemml/defmatrix.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/defmatrix.py b/src/main/python/systemml/defmatrix.py
index d24ba62..576e300 100644
--- a/src/main/python/systemml/defmatrix.py
+++ b/src/main/python/systemml/defmatrix.py
@@ -830,7 +830,16 @@ class matrix(object):
def tan(self):
return unaryMatrixFunction(self, 'tan')
+
+ def sinh(self):
+ return unaryMatrixFunction(self, 'sinh')
+
+ def cosh(self):
+ return unaryMatrixFunction(self, 'cosh')
+ def tanh(self):
+ return unaryMatrixFunction(self, 'tanh')
+
def arcsin(self):
return self.asin()
http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/test/java/org/apache/sysml/test/gpu/UnaryOpTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/UnaryOpTests.java b/src/test/java/org/apache/sysml/test/gpu/UnaryOpTests.java
index 84b1f73..510766e 100644
--- a/src/test/java/org/apache/sysml/test/gpu/UnaryOpTests.java
+++ b/src/test/java/org/apache/sysml/test/gpu/UnaryOpTests.java
@@ -50,6 +50,21 @@ public class UnaryOpTests extends UnaryOpTestsBase {
public void testTan() throws Exception {
testSimpleUnaryOpMatrixOutput("tan", "gpu_tan");
}
+
+ @Test
+ public void testSinh() throws Exception {
+ testSimpleUnaryOpMatrixOutput("sinh", "gpu_sinh");
+ }
+
+ @Test
+ public void testCosh() throws Exception {
+ testSimpleUnaryOpMatrixOutput("cosh", "gpu_cosh");
+ }
+
+ @Test
+ public void testTanh() throws Exception {
+ testSimpleUnaryOpMatrixOutput("tanh", "gpu_tanh");
+ }
@Test
public void testAsin() throws Exception {
[2/2] systemml git commit: [SYSTEMML-1923] Support sinh,
cosh and tanh as built-in functions
Posted by ni...@apache.org.
[SYSTEMML-1923] Support sinh, cosh and tanh as built-in functions
- Added sinh, cosh and tanh builtin functions to CP, Spark and GPU backend.
- Added these functions to Python DSL.
- Also, updated the DML language reference and Python reference documentation.
Closes #668.
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/50a895f8
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/50a895f8
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/50a895f8
Branch: refs/heads/master
Commit: 50a895f86167d5766b249a3db89f9c2b06f7a0d0
Parents: aa15197
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Thu Sep 21 10:06:04 2017 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Thu Sep 21 10:07:15 2017 -0700
----------------------------------------------------------------------
docs/dml-language-reference.md | 2 +-
docs/python-reference.md | 2 +-
src/main/cpp/kernels/SystemML.cu | 44 +-
src/main/cpp/kernels/SystemML.ptx | 2070 +++++++++++-------
src/main/java/org/apache/sysml/hops/Hop.java | 11 +-
.../java/org/apache/sysml/hops/UnaryOp.java | 11 +-
.../sysml/hops/codegen/cplan/CNodeUnary.java | 1 +
.../codegen/opt/PlanSelectionFuseCostBased.java | 3 +
.../opt/PlanSelectionFuseCostBasedV2.java | 3 +
src/main/java/org/apache/sysml/lops/Unary.java | 8 +-
.../java/org/apache/sysml/lops/UnaryCP.java | 11 +-
.../sysml/parser/BuiltinFunctionExpression.java | 12 +
.../org/apache/sysml/parser/DMLTranslator.java | 12 +
.../org/apache/sysml/parser/Expression.java | 3 +
.../sysml/runtime/functionobjects/Builtin.java | 24 +-
.../instructions/CPInstructionParser.java | 3 +
.../instructions/GPUInstructionParser.java | 3 +
.../instructions/SPInstructionParser.java | 3 +
.../instructions/gpu/GPUInstruction.java | 3 +
.../gpu/MatrixBuiltinGPUInstruction.java | 6 +
.../runtime/matrix/data/LibMatrixCUDA.java | 51 +
.../runtime/matrix/operators/UnaryOperator.java | 2 +
src/main/python/systemml/defmatrix.py | 9 +
.../org/apache/sysml/test/gpu/UnaryOpTests.java | 15 +
24 files changed, 1474 insertions(+), 838 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/docs/dml-language-reference.md
----------------------------------------------------------------------
diff --git a/docs/dml-language-reference.md b/docs/dml-language-reference.md
index bd66a42..d8ca07f 100644
--- a/docs/dml-language-reference.md
+++ b/docs/dml-language-reference.md
@@ -822,7 +822,7 @@ is same as
Function | Description | Parameters | Example
-------- | ----------- | ---------- | -------
exp(), log(), abs(), sqrt(), round(), floor(), ceil() | Apply mathematical function on input (cell wise if input is matrix) | Input: (<matrix>), or (<scalar>) <br/> Output: <matrix>, or <scalar> | sqrt(X) <br/> log(X,y) <br/> round(X) <br/> floor(X) <br/> ceil(X)
-sin(), cos(), tan(), asin(), acos(), atan() | Apply trigonometric function on input (cell wise if input is matrix) | Input: (<matrix>), or (<scalar>) <br/> Output: <matrix>, or <scalar> | sin(X)
+sin(), cos(), tan(), sinh(), cosh(), tanh(), asin(), acos(), atan() | Apply trigonometric or hyperbolic function on input (cell wise if input is matrix) | Input: (<matrix>), or (<scalar>) <br/> Output: <matrix>, or <scalar> | sin(X)
sign() | Returns a matrix representing the signs of the input matrix elements, where 1 represents positive, 0 represents zero, and -1 represents negative | Input : (A <matrix>) <br/> Output : <matrix> | <span style="white-space: nowrap;">A = matrix("-5 0 3 -3",</span> rows=2, cols=2) <br/>signA = sign(A)<br/>Matrix signA: [[-1, 0], [1, -1]]
http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/docs/python-reference.md
----------------------------------------------------------------------
diff --git a/docs/python-reference.md b/docs/python-reference.md
index 119c1d0..4fd78fe 100644
--- a/docs/python-reference.md
+++ b/docs/python-reference.md
@@ -191,7 +191,7 @@ In addition to the above mentioned operators, following functions are supported.
| argmax(self, axis=None) | Returns the indices of the maximum values along an axis. | axis : int, optional (only axis=1, i.e. rowIndexMax is supported in this version) |
| cumsum(self, axis=None) | Returns the indices of the maximum values along an axis. | axis : int, optional (only axis=0, i.e. cumsum along the rows is supported in this version) |
-- Global statistical built-In functions: exp, log, abs, sqrt, round, floor, ceil, sin, cos, tan, asin, acos, atan, sign, solve
+- Global statistical built-In functions: exp, log, abs, sqrt, round, floor, ceil, sin, cos, tan, sinh, cosh, tanh, asin, acos, atan, sign, solve
| | Description | Parameters |
|------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/cpp/kernels/SystemML.cu
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu
index 3e1a13a..c243564 100644
--- a/src/main/cpp/kernels/SystemML.cu
+++ b/src/main/cpp/kernels/SystemML.cu
@@ -990,6 +990,20 @@ __global__ void matrix_sin(double *A, double *C, unsigned int size) {
}
/**
+ * Do an sinh over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_sinh(double *A, double *C, unsigned int size) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index < size){
+ C[index] = sinh(A[index]);
+ }
+}
+
+/**
* Do an cos over all the elements of a matrix
* @param A the input matrix (of length = size)
* @param C the pre-allocated output matrix (of length = size)
@@ -1004,6 +1018,20 @@ __global__ void matrix_cos(double *A, double *C, unsigned int size) {
}
/**
+ * Do an cosh over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_cosh(double *A, double *C, unsigned int size) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index < size){
+ C[index] = cosh(A[index]);
+ }
+}
+
+/**
* Do an tan over all the elements of a matrix
* @param A the input matrix (of length = size)
* @param C the pre-allocated output matrix (of length = size)
@@ -1018,6 +1046,20 @@ __global__ void matrix_tan(double *A, double *C, unsigned int size) {
}
/**
+ * Compute tanh (hyperbolic tangent) of every element of a matrix, one element per thread
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices; threads with index >= size write nothing
+ */
+extern "C"
+__global__ void matrix_tanh(double *A, double *C, unsigned int size) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index < size){
+ C[index] = tanh(A[index]);
+ }
+}
+
+/**
* Do an asin over all the elements of a matrix
* @param A the input matrix (of length = size)
* @param C the pre-allocated output matrix (of length = size)
@@ -1076,4 +1118,4 @@ __global__ void matrix_sign(double *A, double *C, unsigned int size) {
C[index] = copysign(1.0, A[index]);
}
}
-}
\ No newline at end of file
+}
http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx
index b4a6559..73b057e 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -1,8 +1,8 @@
//
// Generated by NVIDIA NVVM Compiler
//
-// Compiler Build ID: CL-21554848
-// Cuda compilation tools, release 8.0, V8.0.61
+// Compiler Build ID: CL-21124049
+// Cuda compilation tools, release 8.0, V8.0.44
// Based on LLVM 3.4svn
//
@@ -689,9 +689,9 @@ BB10_6:
.param .u32 matrix_matrix_cellwise_op_param_7
)
{
- .reg .pred %p<73>;
- .reg .b32 %r<66>;
- .reg .f64 %fd<56>;
+ .reg .pred %p<77>;
+ .reg .b32 %r<65>;
+ .reg .f64 %fd<55>;
.reg .b64 %rd<19>;
@@ -712,40 +712,40 @@ BB10_6:
setp.lt.s32 %p2, %r1, %r14;
setp.gt.s32 %p3, %r10, -1;
and.pred %p4, %p2, %p3;
- @!%p4 bra BB11_77;
+ @!%p4 bra BB11_73;
bra.uni BB11_1;
BB11_1:
mad.lo.s32 %r3, %r1, %r10, %r2;
setp.eq.s32 %p5, %r11, 1;
- mov.u32 %r64, %r1;
+ mov.u32 %r63, %r1;
@%p5 bra BB11_5;
setp.ne.s32 %p6, %r11, 2;
- mov.u32 %r65, %r3;
+ mov.u32 %r64, %r3;
@%p6 bra BB11_4;
- mov.u32 %r65, %r2;
+ mov.u32 %r64, %r2;
BB11_4:
- mov.u32 %r59, %r65;
- mov.u32 %r4, %r59;
- mov.u32 %r64, %r4;
+ mov.u32 %r58, %r64;
+ mov.u32 %r4, %r58;
+ mov.u32 %r63, %r4;
BB11_5:
- mov.u32 %r5, %r64;
+ mov.u32 %r5, %r63;
setp.eq.s32 %p7, %r12, 1;
- mov.u32 %r62, %r1;
+ mov.u32 %r61, %r1;
@%p7 bra BB11_9;
setp.ne.s32 %p8, %r12, 2;
- mov.u32 %r63, %r3;
+ mov.u32 %r62, %r3;
@%p8 bra BB11_8;
- mov.u32 %r63, %r2;
+ mov.u32 %r62, %r2;
BB11_8:
- mov.u32 %r62, %r63;
+ mov.u32 %r61, %r62;
BB11_9:
cvta.to.global.u64 %rd5, %rd3;
@@ -753,10 +753,10 @@ BB11_9:
mul.wide.s32 %rd7, %r5, 8;
add.s64 %rd8, %rd6, %rd7;
ld.global.f64 %fd1, [%rd8];
- mul.wide.s32 %rd9, %r62, 8;
+ mul.wide.s32 %rd9, %r61, 8;
add.s64 %rd10, %rd5, %rd9;
ld.global.f64 %fd2, [%rd10];
- mov.f64 %fd55, 0d7FEFFFFFFFFFFFFF;
+ mov.f64 %fd54, 0d7FEFFFFFFFFFFFFF;
setp.gt.s32 %p9, %r13, 8;
@%p9 bra BB11_26;
@@ -767,12 +767,12 @@ BB11_9:
@%p30 bra BB11_15;
setp.eq.s32 %p33, %r13, 0;
- @%p33 bra BB11_75;
+ @%p33 bra BB11_71;
bra.uni BB11_13;
-BB11_75:
- add.f64 %fd55, %fd1, %fd2;
- bra.uni BB11_76;
+BB11_71:
+ add.f64 %fd54, %fd1, %fd2;
+ bra.uni BB11_72;
BB11_26:
setp.gt.s32 %p10, %r13, 13;
@@ -782,23 +782,23 @@ BB11_26:
@%p17 bra BB11_31;
setp.eq.s32 %p21, %r13, 9;
- @%p21 bra BB11_55;
+ @%p21 bra BB11_53;
bra.uni BB11_29;
-BB11_55:
- setp.eq.f64 %p48, %fd1, %fd2;
- selp.f64 %fd55, 0d3FF0000000000000, 0d0000000000000000, %p48;
- bra.uni BB11_76;
+BB11_53:
+ setp.eq.f64 %p50, %fd1, %fd2;
+ selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p50;
+ bra.uni BB11_72;
BB11_18:
setp.gt.s32 %p24, %r13, 5;
@%p24 bra BB11_22;
setp.eq.s32 %p28, %r13, 4;
- @%p28 bra BB11_58;
+ @%p28 bra BB11_56;
bra.uni BB11_20;
-BB11_58:
+BB11_56:
{
.reg .b32 %temp;
mov.b64 {%temp, %r8}, %fd1;
@@ -811,7 +811,7 @@ BB11_58:
add.s32 %r32, %r31, -1012;
mov.b64 %rd15, %fd2;
shl.b64 %rd1, %rd15, %r32;
- setp.eq.s64 %p53, %rd1, -9223372036854775808;
+ setp.eq.s64 %p55, %rd1, -9223372036854775808;
abs.f64 %fd19, %fd1;
// Callseq Start 0
{
@@ -828,192 +828,192 @@ BB11_58:
param0,
param1
);
- ld.param.f64 %fd54, [retval0+0];
+ ld.param.f64 %fd53, [retval0+0];
//{
}// Callseq End 0
- setp.lt.s32 %p54, %r8, 0;
- and.pred %p1, %p54, %p53;
- @!%p1 bra BB11_60;
- bra.uni BB11_59;
+ setp.lt.s32 %p56, %r8, 0;
+ and.pred %p1, %p56, %p55;
+ @!%p1 bra BB11_58;
+ bra.uni BB11_57;
-BB11_59:
+BB11_57:
{
.reg .b32 %temp;
- mov.b64 {%temp, %r33}, %fd54;
+ mov.b64 {%temp, %r33}, %fd53;
}
xor.b32 %r34, %r33, -2147483648;
{
.reg .b32 %temp;
- mov.b64 {%r35, %temp}, %fd54;
+ mov.b64 {%r35, %temp}, %fd53;
}
- mov.b64 %fd54, {%r35, %r34};
+ mov.b64 %fd53, {%r35, %r34};
-BB11_60:
- mov.f64 %fd53, %fd54;
- setp.eq.f64 %p55, %fd1, 0d0000000000000000;
- @%p55 bra BB11_63;
- bra.uni BB11_61;
+BB11_58:
+ mov.f64 %fd52, %fd53;
+ setp.eq.f64 %p57, %fd1, 0d0000000000000000;
+ @%p57 bra BB11_61;
+ bra.uni BB11_59;
-BB11_63:
- selp.b32 %r36, %r8, 0, %p53;
+BB11_61:
+ selp.b32 %r36, %r8, 0, %p55;
or.b32 %r37, %r36, 2146435072;
- setp.lt.s32 %p59, %r9, 0;
- selp.b32 %r38, %r37, %r36, %p59;
+ setp.lt.s32 %p61, %r9, 0;
+ selp.b32 %r38, %r37, %r36, %p61;
mov.u32 %r39, 0;
- mov.b64 %fd53, {%r39, %r38};
- bra.uni BB11_64;
+ mov.b64 %fd52, {%r39, %r38};
+ bra.uni BB11_62;
BB11_35:
setp.gt.s32 %p11, %r13, 15;
@%p11 bra BB11_39;
setp.eq.s32 %p15, %r13, 14;
- @%p15 bra BB11_52;
+ @%p15 bra BB11_50;
bra.uni BB11_37;
-BB11_52:
+BB11_50:
cvt.rni.s64.f64 %rd11, %fd1;
cvt.rni.s64.f64 %rd12, %fd2;
cvt.u32.u64 %r25, %rd11;
cvt.u32.u64 %r26, %rd12;
or.b32 %r27, %r26, %r25;
- setp.eq.s32 %p45, %r27, 0;
- selp.f64 %fd55, 0d0000000000000000, 0d3FF0000000000000, %p45;
- bra.uni BB11_76;
+ setp.eq.s32 %p47, %r27, 0;
+ selp.f64 %fd54, 0d0000000000000000, 0d3FF0000000000000, %p47;
+ bra.uni BB11_72;
BB11_15:
setp.eq.s32 %p31, %r13, 2;
- @%p31 bra BB11_74;
+ @%p31 bra BB11_70;
bra.uni BB11_16;
-BB11_74:
- mul.f64 %fd55, %fd1, %fd2;
- bra.uni BB11_76;
+BB11_70:
+ mul.f64 %fd54, %fd1, %fd2;
+ bra.uni BB11_72;
BB11_31:
setp.eq.s32 %p18, %r13, 11;
- @%p18 bra BB11_54;
+ @%p18 bra BB11_52;
setp.eq.s32 %p19, %r13, 12;
- @%p19 bra BB11_53;
+ @%p19 bra BB11_51;
bra.uni BB11_33;
-BB11_53:
- max.f64 %fd55, %fd1, %fd2;
- bra.uni BB11_76;
+BB11_51:
+ max.f64 %fd54, %fd1, %fd2;
+ bra.uni BB11_72;
BB11_22:
setp.eq.s32 %p25, %r13, 6;
- @%p25 bra BB11_57;
+ @%p25 bra BB11_55;
setp.eq.s32 %p26, %r13, 7;
- @%p26 bra BB11_56;
+ @%p26 bra BB11_54;
bra.uni BB11_24;
-BB11_56:
- setp.gt.f64 %p50, %fd1, %fd2;
- selp.f64 %fd55, 0d3FF0000000000000, 0d0000000000000000, %p50;
- bra.uni BB11_76;
+BB11_54:
+ setp.gt.f64 %p52, %fd1, %fd2;
+ selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p52;
+ bra.uni BB11_72;
BB11_39:
setp.eq.s32 %p12, %r13, 16;
- @%p12 bra BB11_51;
+ @%p12 bra BB11_49;
setp.eq.s32 %p13, %r13, 17;
- @%p13 bra BB11_46;
+ @%p13 bra BB11_45;
bra.uni BB11_41;
-BB11_46:
- setp.eq.f64 %p38, %fd2, 0d0000000000000000;
- setp.eq.f64 %p39, %fd2, 0d8000000000000000;
- or.pred %p40, %p38, %p39;
- mov.f64 %fd55, 0d7FF8000000000000;
- @%p40 bra BB11_76;
+BB11_45:
+ setp.eq.f64 %p39, %fd2, 0d0000000000000000;
+ setp.eq.f64 %p40, %fd2, 0d8000000000000000;
+ or.pred %p41, %p39, %p40;
+ mov.f64 %fd54, 0d7FF8000000000000;
+ @%p41 bra BB11_72;
- div.rn.f64 %fd55, %fd1, %fd2;
- abs.f64 %fd39, %fd55;
- setp.gtu.f64 %p41, %fd39, 0d7FF0000000000000;
- @%p41 bra BB11_76;
+ div.rn.f64 %fd54, %fd1, %fd2;
+ abs.f64 %fd39, %fd54;
+ setp.gtu.f64 %p42, %fd39, 0d7FF0000000000000;
+ @%p42 bra BB11_72;
{
.reg .b32 %temp;
- mov.b64 {%temp, %r22}, %fd55;
+ mov.b64 {%r22, %temp}, %fd54;
}
- and.b32 %r23, %r22, 2147483647;
- setp.ne.s32 %p42, %r23, 2146435072;
- @%p42 bra BB11_50;
-
{
.reg .b32 %temp;
- mov.b64 {%r24, %temp}, %fd55;
+ mov.b64 {%temp, %r23}, %fd54;
}
- setp.eq.s32 %p43, %r24, 0;
- @%p43 bra BB11_76;
-
-BB11_50:
- cvt.rmi.f64.f64 %fd40, %fd55;
+ and.b32 %r24, %r23, 2147483647;
+ setp.ne.s32 %p43, %r24, 2146435072;
+ setp.ne.s32 %p44, %r22, 0;
+ or.pred %p45, %p43, %p44;
+ @!%p45 bra BB11_72;
+ bra.uni BB11_48;
+
+BB11_48:
+ cvt.rmi.f64.f64 %fd40, %fd54;
mul.f64 %fd41, %fd2, %fd40;
- sub.f64 %fd55, %fd1, %fd41;
- bra.uni BB11_76;
+ sub.f64 %fd54, %fd1, %fd41;
+ bra.uni BB11_72;
BB11_13:
setp.eq.s32 %p34, %r13, 1;
@%p34 bra BB11_14;
- bra.uni BB11_76;
+ bra.uni BB11_72;
BB11_14:
- sub.f64 %fd55, %fd1, %fd2;
- bra.uni BB11_76;
+ sub.f64 %fd54, %fd1, %fd2;
+ bra.uni BB11_72;
BB11_29:
setp.eq.s32 %p22, %r13, 10;
@%p22 bra BB11_30;
- bra.uni BB11_76;
+ bra.uni BB11_72;
BB11_30:
- setp.neu.f64 %p47, %fd1, %fd2;
- selp.f64 %fd55, 0d3FF0000000000000, 0d0000000000000000, %p47;
- bra.uni BB11_76;
+ setp.neu.f64 %p49, %fd1, %fd2;
+ selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p49;
+ bra.uni BB11_72;
BB11_20:
setp.eq.s32 %p29, %r13, 5;
@%p29 bra BB11_21;
- bra.uni BB11_76;
+ bra.uni BB11_72;
BB11_21:
- setp.lt.f64 %p52, %fd1, %fd2;
- selp.f64 %fd55, 0d3FF0000000000000, 0d0000000000000000, %p52;
- bra.uni BB11_76;
+ setp.lt.f64 %p54, %fd1, %fd2;
+ selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p54;
+ bra.uni BB11_72;
BB11_37:
setp.eq.s32 %p16, %r13, 15;
@%p16 bra BB11_38;
- bra.uni BB11_76;
+ bra.uni BB11_72;
BB11_38:
mul.f64 %fd43, %fd1, %fd2;
mov.f64 %fd44, 0d3FF0000000000000;
- sub.f64 %fd55, %fd44, %fd43;
- bra.uni BB11_76;
+ sub.f64 %fd54, %fd44, %fd43;
+ bra.uni BB11_72;
BB11_16:
setp.eq.s32 %p32, %r13, 3;
@%p32 bra BB11_17;
- bra.uni BB11_76;
+ bra.uni BB11_72;
BB11_17:
- div.rn.f64 %fd55, %fd1, %fd2;
- bra.uni BB11_76;
+ div.rn.f64 %fd54, %fd1, %fd2;
+ bra.uni BB11_72;
-BB11_54:
- min.f64 %fd55, %fd1, %fd2;
- bra.uni BB11_76;
+BB11_52:
+ min.f64 %fd54, %fd1, %fd2;
+ bra.uni BB11_72;
BB11_33:
setp.eq.s32 %p20, %r13, 13;
@%p20 bra BB11_34;
- bra.uni BB11_76;
+ bra.uni BB11_72;
BB11_34:
cvt.rni.s64.f64 %rd13, %fd1;
@@ -1021,149 +1021,147 @@ BB11_34:
cvt.u32.u64 %r28, %rd13;
cvt.u32.u64 %r29, %rd14;
and.b32 %r30, %r29, %r28;
- setp.eq.s32 %p46, %r30, 0;
- selp.f64 %fd55, 0d0000000000000000, 0d3FF0000000000000, %p46;
- bra.uni BB11_76;
+ setp.eq.s32 %p48, %r30, 0;
+ selp.f64 %fd54, 0d0000000000000000, 0d3FF0000000000000, %p48;
+ bra.uni BB11_72;
-BB11_57:
- setp.le.f64 %p51, %fd1, %fd2;
- selp.f64 %fd55, 0d3FF0000000000000, 0d0000000000000000, %p51;
- bra.uni BB11_76;
+BB11_55:
+ setp.le.f64 %p53, %fd1, %fd2;
+ selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p53;
+ bra.uni BB11_72;
BB11_24:
setp.eq.s32 %p27, %r13, 8;
@%p27 bra BB11_25;
- bra.uni BB11_76;
+ bra.uni BB11_72;
BB11_25:
- setp.ge.f64 %p49, %fd1, %fd2;
- selp.f64 %fd55, 0d3FF0000000000000, 0d0000000000000000, %p49;
- bra.uni BB11_76;
+ setp.ge.f64 %p51, %fd1, %fd2;
+ selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p51;
+ bra.uni BB11_72;
-BB11_51:
- setp.neu.f64 %p44, %fd1, 0d0000000000000000;
+BB11_49:
+ setp.neu.f64 %p46, %fd1, 0d0000000000000000;
sub.f64 %fd42, %fd1, %fd2;
- selp.f64 %fd55, %fd42, 0d0000000000000000, %p44;
- bra.uni BB11_76;
+ selp.f64 %fd54, %fd42, 0d0000000000000000, %p46;
+ bra.uni BB11_72;
BB11_41:
setp.ne.s32 %p14, %r13, 18;
- @%p14 bra BB11_76;
+ @%p14 bra BB11_72;
- div.rn.f64 %fd55, %fd1, %fd2;
- abs.f64 %fd37, %fd55;
+ div.rn.f64 %fd54, %fd1, %fd2;
+ abs.f64 %fd37, %fd54;
setp.gtu.f64 %p35, %fd37, 0d7FF0000000000000;
- @%p35 bra BB11_76;
+ @%p35 bra BB11_72;
{
.reg .b32 %temp;
- mov.b64 {%temp, %r19}, %fd55;
+ mov.b64 {%r19, %temp}, %fd54;
}
- and.b32 %r20, %r19, 2147483647;
- setp.ne.s32 %p36, %r20, 2146435072;
- @%p36 bra BB11_45;
-
{
.reg .b32 %temp;
- mov.b64 {%r21, %temp}, %fd55;
+ mov.b64 {%temp, %r20}, %fd54;
}
- setp.eq.s32 %p37, %r21, 0;
- @%p37 bra BB11_76;
+ and.b32 %r21, %r20, 2147483647;
+ setp.ne.s32 %p36, %r21, 2146435072;
+ setp.ne.s32 %p37, %r19, 0;
+ or.pred %p38, %p36, %p37;
+ @!%p38 bra BB11_72;
+ bra.uni BB11_44;
-BB11_45:
- cvt.rmi.f64.f64 %fd55, %fd55;
- bra.uni BB11_76;
+BB11_44:
+ cvt.rmi.f64.f64 %fd54, %fd54;
+ bra.uni BB11_72;
-BB11_61:
- setp.gt.s32 %p56, %r8, -1;
- @%p56 bra BB11_64;
+BB11_59:
+ setp.gt.s32 %p58, %r8, -1;
+ @%p58 bra BB11_62;
cvt.rzi.f64.f64 %fd45, %fd2;
- setp.neu.f64 %p57, %fd45, %fd2;
- selp.f64 %fd53, 0dFFF8000000000000, %fd53, %p57;
+ setp.neu.f64 %p59, %fd45, %fd2;
+ selp.f64 %fd52, 0dFFF8000000000000, %fd52, %p59;
-BB11_64:
- mov.f64 %fd25, %fd53;
+BB11_62:
+ mov.f64 %fd25, %fd52;
add.f64 %fd26, %fd1, %fd2;
{
.reg .b32 %temp;
mov.b64 {%temp, %r40}, %fd26;
}
and.b32 %r41, %r40, 2146435072;
- setp.ne.s32 %p60, %r41, 2146435072;
- mov.f64 %fd52, %fd25;
- @%p60 bra BB11_73;
+ setp.ne.s32 %p62, %r41, 2146435072;
+ mov.f64 %fd51, %fd25;
+ @%p62 bra BB11_69;
- setp.gtu.f64 %p61, %fd19, 0d7FF0000000000000;
- mov.f64 %fd52, %fd26;
- @%p61 bra BB11_73;
-
- abs.f64 %fd46, %fd2;
- setp.gtu.f64 %p62, %fd46, 0d7FF0000000000000;
+ setp.gtu.f64 %p63, %fd19, 0d7FF0000000000000;
mov.f64 %fd51, %fd26;
- mov.f64 %fd52, %fd51;
- @%p62 bra BB11_73;
-
- and.b32 %r42, %r9, 2147483647;
- setp.ne.s32 %p63, %r42, 2146435072;
@%p63 bra BB11_69;
+ abs.f64 %fd46, %fd2;
+ setp.gtu.f64 %p64, %fd46, 0d7FF0000000000000;
+ mov.f64 %fd50, %fd26;
+ mov.f64 %fd51, %fd50;
+ @%p64 bra BB11_69;
+
{
.reg .b32 %temp;
- mov.b64 {%r43, %temp}, %fd2;
+ mov.b64 {%r42, %temp}, %fd2;
}
- setp.eq.s32 %p64, %r43, 0;
- @%p64 bra BB11_72;
-
-BB11_69:
- and.b32 %r44, %r8, 2147483647;
- setp.ne.s32 %p65, %r44, 2146435072;
- mov.f64 %fd49, %fd25;
- mov.f64 %fd52, %fd49;
- @%p65 bra BB11_73;
-
+ and.b32 %r43, %r9, 2147483647;
+ setp.eq.s32 %p65, %r43, 2146435072;
+ setp.eq.s32 %p66, %r42, 0;
+ and.pred %p67, %p65, %p66;
+ @%p67 bra BB11_68;
+ bra.uni BB11_66;
+
+BB11_68:
+ setp.gt.f64 %p71, %fd19, 0d3FF0000000000000;
+ selp.b32 %r51, 2146435072, 0, %p71;
+ xor.b32 %r52, %r51, 2146435072;
+ setp.lt.s32 %p72, %r9, 0;
+ selp.b32 %r53, %r52, %r51, %p72;
+ setp.eq.f64 %p73, %fd1, 0dBFF0000000000000;
+ selp.b32 %r54, 1072693248, %r53, %p73;
+ mov.u32 %r55, 0;
+ mov.b64 %fd51, {%r55, %r54};
+ bra.uni BB11_69;
+
+BB11_66:
{
.reg .b32 %temp;
- mov.b64 {%r45, %temp}, %fd1;
+ mov.b64 {%r44, %temp}, %fd1;
}
- setp.ne.s32 %p66, %r45, 0;
- mov.f64 %fd52, %fd25;
- @%p66 bra BB11_73;
-
+ and.b32 %r45, %r8, 2147483647;
+ setp.eq.s32 %p68, %r45, 2146435072;
+ setp.eq.s32 %p69, %r44, 0;
+ and.pred %p70, %p68, %p69;
+ mov.f64 %fd51, %fd25;
+ @!%p70 bra BB11_69;
+ bra.uni BB11_67;
+
+BB11_67:
shr.s32 %r46, %r9, 31;
and.b32 %r47, %r46, -2146435072;
- add.s32 %r48, %r47, 2146435072;
- or.b32 %r49, %r48, -2147483648;
- selp.b32 %r50, %r49, %r48, %p1;
- mov.u32 %r51, 0;
- mov.b64 %fd52, {%r51, %r50};
- bra.uni BB11_73;
-
-BB11_72:
- setp.gt.f64 %p67, %fd19, 0d3FF0000000000000;
- selp.b32 %r52, 2146435072, 0, %p67;
- xor.b32 %r53, %r52, 2146435072;
- setp.lt.s32 %p68, %r9, 0;
- selp.b32 %r54, %r53, %r52, %p68;
- setp.eq.f64 %p69, %fd1, 0dBFF0000000000000;
- selp.b32 %r55, 1072693248, %r54, %p69;
- mov.u32 %r56, 0;
- mov.b64 %fd52, {%r56, %r55};
+ selp.b32 %r48, -1048576, 2146435072, %p1;
+ add.s32 %r49, %r48, %r47;
+ mov.u32 %r50, 0;
+ mov.b64 %fd51, {%r50, %r49};
-BB11_73:
- setp.eq.f64 %p70, %fd2, 0d0000000000000000;
- setp.eq.f64 %p71, %fd1, 0d3FF0000000000000;
- or.pred %p72, %p71, %p70;
- selp.f64 %fd55, 0d3FF0000000000000, %fd52, %p72;
+BB11_69:
+ setp.eq.f64 %p74, %fd2, 0d0000000000000000;
+ setp.eq.f64 %p75, %fd1, 0d3FF0000000000000;
+ or.pred %p76, %p75, %p74;
+ selp.f64 %fd54, 0d3FF0000000000000, %fd51, %p76;
-BB11_76:
+BB11_72:
cvta.to.global.u64 %rd16, %rd4;
mul.wide.s32 %rd17, %r3, 8;
add.s64 %rd18, %rd16, %rd17;
- st.global.f64 [%rd18], %fd55;
+ st.global.f64 [%rd18], %fd54;
bar.sync 0;
-BB11_77:
+BB11_73:
ret;
}
@@ -1177,9 +1175,9 @@ BB11_77:
.param .u32 matrix_scalar_op_param_5
)
{
- .reg .pred %p<133>;
- .reg .b32 %r<88>;
- .reg .f64 %fd<109>;
+ .reg .pred %p<141>;
+ .reg .b32 %r<86>;
+ .reg .f64 %fd<107>;
.reg .b64 %rd<20>;
@@ -1194,7 +1192,7 @@ BB11_77:
mov.u32 %r11, %tid.x;
mad.lo.s32 %r1, %r9, %r10, %r11;
setp.ge.s32 %p3, %r1, %r8;
- @%p3 bra BB12_138;
+ @%p3 bra BB12_130;
cvta.to.global.u64 %rd6, %rd5;
cvta.to.global.u64 %rd7, %rd4;
@@ -1203,9 +1201,9 @@ BB11_77:
ld.global.f64 %fd1, [%rd9];
add.s64 %rd1, %rd6, %rd8;
setp.eq.s32 %p4, %r7, 0;
- @%p4 bra BB12_70;
+ @%p4 bra BB12_66;
- mov.f64 %fd99, 0d7FEFFFFFFFFFFFFF;
+ mov.f64 %fd98, 0d7FEFFFFFFFFFFFFF;
setp.gt.s32 %p5, %r6, 8;
@%p5 bra BB12_19;
@@ -1216,31 +1214,31 @@ BB11_77:
@%p26 bra BB12_8;
setp.eq.s32 %p29, %r6, 0;
- @%p29 bra BB12_68;
+ @%p29 bra BB12_64;
bra.uni BB12_6;
-BB12_68:
- add.f64 %fd99, %fd1, %fd68;
- bra.uni BB12_69;
+BB12_64:
+ add.f64 %fd98, %fd1, %fd68;
+ bra.uni BB12_65;
-BB12_70:
- mov.f64 %fd108, 0d7FEFFFFFFFFFFFFF;
- setp.gt.s32 %p69, %r6, 8;
- @%p69 bra BB12_87;
+BB12_66:
+ mov.f64 %fd106, 0d7FEFFFFFFFFFFFFF;
+ setp.gt.s32 %p73, %r6, 8;
+ @%p73 bra BB12_83;
- setp.gt.s32 %p83, %r6, 3;
- @%p83 bra BB12_79;
+ setp.gt.s32 %p87, %r6, 3;
+ @%p87 bra BB12_75;
- setp.gt.s32 %p90, %r6, 1;
- @%p90 bra BB12_76;
+ setp.gt.s32 %p94, %r6, 1;
+ @%p94 bra BB12_72;
- setp.eq.s32 %p93, %r6, 0;
- @%p93 bra BB12_136;
- bra.uni BB12_74;
+ setp.eq.s32 %p97, %r6, 0;
+ @%p97 bra BB12_128;
+ bra.uni BB12_70;
-BB12_136:
- add.f64 %fd108, %fd1, %fd68;
- bra.uni BB12_137;
+BB12_128:
+ add.f64 %fd106, %fd1, %fd68;
+ bra.uni BB12_129;
BB12_19:
setp.gt.s32 %p6, %r6, 13;
@@ -1250,39 +1248,39 @@ BB12_19:
@%p13 bra BB12_24;
setp.eq.s32 %p17, %r6, 9;
- @%p17 bra BB12_48;
+ @%p17 bra BB12_46;
bra.uni BB12_22;
-BB12_48:
- setp.eq.f64 %p44, %fd1, %fd68;
- selp.f64 %fd99, 0d3FF0000000000000, 0d0000000000000000, %p44;
- bra.uni BB12_69;
+BB12_46:
+ setp.eq.f64 %p46, %fd1, %fd68;
+ selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p46;
+ bra.uni BB12_65;
-BB12_87:
- setp.gt.s32 %p70, %r6, 13;
- @%p70 bra BB12_96;
+BB12_83:
+ setp.gt.s32 %p74, %r6, 13;
+ @%p74 bra BB12_92;
- setp.gt.s32 %p77, %r6, 10;
- @%p77 bra BB12_92;
+ setp.gt.s32 %p81, %r6, 10;
+ @%p81 bra BB12_88;
- setp.eq.s32 %p81, %r6, 9;
- @%p81 bra BB12_116;
- bra.uni BB12_90;
+ setp.eq.s32 %p85, %r6, 9;
+ @%p85 bra BB12_110;
+ bra.uni BB12_86;
-BB12_116:
- setp.eq.f64 %p108, %fd1, %fd68;
- selp.f64 %fd108, 0d3FF0000000000000, 0d0000000000000000, %p108;
- bra.uni BB12_137;
+BB12_110:
+ setp.eq.f64 %p114, %fd1, %fd68;
+ selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p114;
+ bra.uni BB12_129;
BB12_11:
setp.gt.s32 %p20, %r6, 5;
@%p20 bra BB12_15;
setp.eq.s32 %p24, %r6, 4;
- @%p24 bra BB12_51;
+ @%p24 bra BB12_49;
bra.uni BB12_13;
-BB12_51:
+BB12_49:
{
.reg .b32 %temp;
mov.b64 {%temp, %r2}, %fd68;
@@ -1295,7 +1293,7 @@ BB12_51:
add.s32 %r25, %r24, -1012;
mov.b64 %rd14, %fd1;
shl.b64 %rd2, %rd14, %r25;
- setp.eq.s64 %p49, %rd2, -9223372036854775808;
+ setp.eq.s64 %p51, %rd2, -9223372036854775808;
abs.f64 %fd18, %fd68;
// Callseq Start 1
{
@@ -1312,69 +1310,69 @@ BB12_51:
param0,
param1
);
- ld.param.f64 %fd98, [retval0+0];
+ ld.param.f64 %fd97, [retval0+0];
//{
}// Callseq End 1
- setp.lt.s32 %p50, %r2, 0;
- and.pred %p1, %p50, %p49;
- @!%p1 bra BB12_53;
- bra.uni BB12_52;
+ setp.lt.s32 %p52, %r2, 0;
+ and.pred %p1, %p52, %p51;
+ @!%p1 bra BB12_51;
+ bra.uni BB12_50;
-BB12_52:
+BB12_50:
{
.reg .b32 %temp;
- mov.b64 {%temp, %r26}, %fd98;
+ mov.b64 {%temp, %r26}, %fd97;
}
xor.b32 %r27, %r26, -2147483648;
{
.reg .b32 %temp;
- mov.b64 {%r28, %temp}, %fd98;
+ mov.b64 {%r28, %temp}, %fd97;
}
- mov.b64 %fd98, {%r28, %r27};
+ mov.b64 %fd97, {%r28, %r27};
-BB12_53:
- mov.f64 %fd97, %fd98;
- setp.eq.f64 %p51, %fd68, 0d0000000000000000;
- @%p51 bra BB12_56;
- bra.uni BB12_54;
+BB12_51:
+ mov.f64 %fd96, %fd97;
+ setp.eq.f64 %p53, %fd68, 0d0000000000000000;
+ @%p53 bra BB12_54;
+ bra.uni BB12_52;
-BB12_56:
- selp.b32 %r29, %r2, 0, %p49;
+BB12_54:
+ selp.b32 %r29, %r2, 0, %p51;
or.b32 %r30, %r29, 2146435072;
- setp.lt.s32 %p55, %r3, 0;
- selp.b32 %r31, %r30, %r29, %p55;
+ setp.lt.s32 %p57, %r3, 0;
+ selp.b32 %r31, %r30, %r29, %p57;
mov.u32 %r32, 0;
- mov.b64 %fd97, {%r32, %r31};
- bra.uni BB12_57;
+ mov.b64 %fd96, {%r32, %r31};
+ bra.uni BB12_55;
BB12_28:
setp.gt.s32 %p7, %r6, 15;
@%p7 bra BB12_32;
setp.eq.s32 %p11, %r6, 14;
- @%p11 bra BB12_45;
+ @%p11 bra BB12_43;
bra.uni BB12_30;
-BB12_45:
+BB12_43:
cvt.rni.s64.f64 %rd10, %fd68;
cvt.rni.s64.f64 %rd11, %fd1;
cvt.u32.u64 %r18, %rd10;
cvt.u32.u64 %r19, %rd11;
or.b32 %r20, %r19, %r18;
- setp.eq.s32 %p41, %r20, 0;
- selp.f64 %fd99, 0d0000000000000000, 0d3FF0000000000000, %p41;
- bra.uni BB12_69;
+ setp.eq.s32 %p43, %r20, 0;
+ selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p43;
+ bra.uni BB12_65;
-BB12_79:
- setp.gt.s32 %p84, %r6, 5;
- @%p84 bra BB12_83;
+BB12_75:
+ setp.gt.s32 %p88, %r6, 5;
+ @%p88 bra BB12_79;
- setp.eq.s32 %p88, %r6, 4;
- @%p88 bra BB12_119;
- bra.uni BB12_81;
+ setp.eq.s32 %p92, %r6, 4;
+ @%p92 bra BB12_113;
+ bra.uni BB12_77;
-BB12_119:
+BB12_113:
{
.reg .b32 %temp;
mov.b64 {%temp, %r4}, %fd1;
@@ -1383,11 +1381,11 @@ BB12_119:
.reg .b32 %temp;
mov.b64 {%temp, %r5}, %fd68;
}
- bfe.u32 %r62, %r5, 20, 11;
- add.s32 %r63, %r62, -1012;
+ bfe.u32 %r61, %r5, 20, 11;
+ add.s32 %r62, %r61, -1012;
mov.b64 %rd19, %fd68;
- shl.b64 %rd3, %rd19, %r63;
- setp.eq.s64 %p113, %rd3, -9223372036854775808;
+ shl.b64 %rd3, %rd19, %r62;
+ setp.eq.s64 %p119, %rd3, -9223372036854775808;
abs.f64 %fd51, %fd1;
// Callseq Start 2
{
@@ -1404,267 +1402,267 @@ BB12_119:
param0,
param1
);
- ld.param.f64 %fd107, [retval0+0];
+ ld.param.f64 %fd105, [retval0+0];
//{
}// Callseq End 2
- setp.lt.s32 %p114, %r4, 0;
- and.pred %p2, %p114, %p113;
- @!%p2 bra BB12_121;
- bra.uni BB12_120;
+ setp.lt.s32 %p120, %r4, 0;
+ and.pred %p2, %p120, %p119;
+ @!%p2 bra BB12_115;
+ bra.uni BB12_114;
-BB12_120:
+BB12_114:
{
.reg .b32 %temp;
- mov.b64 {%temp, %r64}, %fd107;
+ mov.b64 {%temp, %r63}, %fd105;
}
- xor.b32 %r65, %r64, -2147483648;
+ xor.b32 %r64, %r63, -2147483648;
{
.reg .b32 %temp;
- mov.b64 {%r66, %temp}, %fd107;
+ mov.b64 {%r65, %temp}, %fd105;
}
- mov.b64 %fd107, {%r66, %r65};
+ mov.b64 %fd105, {%r65, %r64};
-BB12_121:
- mov.f64 %fd106, %fd107;
- setp.eq.f64 %p115, %fd1, 0d0000000000000000;
- @%p115 bra BB12_124;
- bra.uni BB12_122;
+BB12_115:
+ mov.f64 %fd104, %fd105;
+ setp.eq.f64 %p121, %fd1, 0d0000000000000000;
+ @%p121 bra BB12_118;
+ bra.uni BB12_116;
-BB12_124:
- selp.b32 %r67, %r4, 0, %p113;
- or.b32 %r68, %r67, 2146435072;
- setp.lt.s32 %p119, %r5, 0;
- selp.b32 %r69, %r68, %r67, %p119;
- mov.u32 %r70, 0;
- mov.b64 %fd106, {%r70, %r69};
- bra.uni BB12_125;
+BB12_118:
+ selp.b32 %r66, %r4, 0, %p119;
+ or.b32 %r67, %r66, 2146435072;
+ setp.lt.s32 %p125, %r5, 0;
+ selp.b32 %r68, %r67, %r66, %p125;
+ mov.u32 %r69, 0;
+ mov.b64 %fd104, {%r69, %r68};
+ bra.uni BB12_119;
-BB12_96:
- setp.gt.s32 %p71, %r6, 15;
- @%p71 bra BB12_100;
+BB12_92:
+ setp.gt.s32 %p75, %r6, 15;
+ @%p75 bra BB12_96;
- setp.eq.s32 %p75, %r6, 14;
- @%p75 bra BB12_113;
- bra.uni BB12_98;
+ setp.eq.s32 %p79, %r6, 14;
+ @%p79 bra BB12_107;
+ bra.uni BB12_94;
-BB12_113:
+BB12_107:
cvt.rni.s64.f64 %rd15, %fd1;
cvt.rni.s64.f64 %rd16, %fd68;
- cvt.u32.u64 %r56, %rd15;
- cvt.u32.u64 %r57, %rd16;
- or.b32 %r58, %r57, %r56;
- setp.eq.s32 %p105, %r58, 0;
- selp.f64 %fd108, 0d0000000000000000, 0d3FF0000000000000, %p105;
- bra.uni BB12_137;
+ cvt.u32.u64 %r55, %rd15;
+ cvt.u32.u64 %r56, %rd16;
+ or.b32 %r57, %r56, %r55;
+ setp.eq.s32 %p111, %r57, 0;
+ selp.f64 %fd106, 0d0000000000000000, 0d3FF0000000000000, %p111;
+ bra.uni BB12_129;
BB12_8:
setp.eq.s32 %p27, %r6, 2;
- @%p27 bra BB12_67;
+ @%p27 bra BB12_63;
bra.uni BB12_9;
-BB12_67:
- mul.f64 %fd99, %fd1, %fd68;
- bra.uni BB12_69;
+BB12_63:
+ mul.f64 %fd98, %fd1, %fd68;
+ bra.uni BB12_65;
BB12_24:
setp.eq.s32 %p14, %r6, 11;
- @%p14 bra BB12_47;
+ @%p14 bra BB12_45;
setp.eq.s32 %p15, %r6, 12;
- @%p15 bra BB12_46;
+ @%p15 bra BB12_44;
bra.uni BB12_26;
-BB12_46:
- max.f64 %fd99, %fd68, %fd1;
- bra.uni BB12_69;
+BB12_44:
+ max.f64 %fd98, %fd68, %fd1;
+ bra.uni BB12_65;
BB12_15:
setp.eq.s32 %p21, %r6, 6;
- @%p21 bra BB12_50;
+ @%p21 bra BB12_48;
setp.eq.s32 %p22, %r6, 7;
- @%p22 bra BB12_49;
+ @%p22 bra BB12_47;
bra.uni BB12_17;
-BB12_49:
- setp.lt.f64 %p46, %fd1, %fd68;
- selp.f64 %fd99, 0d3FF0000000000000, 0d0000000000000000, %p46;
- bra.uni BB12_69;
+BB12_47:
+ setp.lt.f64 %p48, %fd1, %fd68;
+ selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p48;
+ bra.uni BB12_65;
BB12_32:
setp.eq.s32 %p8, %r6, 16;
- @%p8 bra BB12_44;
+ @%p8 bra BB12_42;
setp.eq.s32 %p9, %r6, 17;
- @%p9 bra BB12_39;
+ @%p9 bra BB12_38;
bra.uni BB12_34;
-BB12_39:
- setp.eq.f64 %p34, %fd1, 0d0000000000000000;
- setp.eq.f64 %p35, %fd1, 0d8000000000000000;
- or.pred %p36, %p34, %p35;
- mov.f64 %fd99, 0d7FF8000000000000;
- @%p36 bra BB12_69;
+BB12_38:
+ setp.eq.f64 %p35, %fd1, 0d0000000000000000;
+ setp.eq.f64 %p36, %fd1, 0d8000000000000000;
+ or.pred %p37, %p35, %p36;
+ mov.f64 %fd98, 0d7FF8000000000000;
+ @%p37 bra BB12_65;
- div.rn.f64 %fd99, %fd68, %fd1;
- abs.f64 %fd72, %fd99;
- setp.gtu.f64 %p37, %fd72, 0d7FF0000000000000;
- @%p37 bra BB12_69;
+ div.rn.f64 %fd98, %fd68, %fd1;
+ abs.f64 %fd72, %fd98;
+ setp.gtu.f64 %p38, %fd72, 0d7FF0000000000000;
+ @%p38 bra BB12_65;
{
.reg .b32 %temp;
- mov.b64 {%temp, %r15}, %fd99;
+ mov.b64 {%r15, %temp}, %fd98;
}
- and.b32 %r16, %r15, 2147483647;
- setp.ne.s32 %p38, %r16, 2146435072;
- @%p38 bra BB12_43;
-
{
.reg .b32 %temp;
- mov.b64 {%r17, %temp}, %fd99;
+ mov.b64 {%temp, %r16}, %fd98;
}
- setp.eq.s32 %p39, %r17, 0;
- @%p39 bra BB12_69;
-
-BB12_43:
- cvt.rmi.f64.f64 %fd73, %fd99;
+ and.b32 %r17, %r16, 2147483647;
+ setp.ne.s32 %p39, %r17, 2146435072;
+ setp.ne.s32 %p40, %r15, 0;
+ or.pred %p41, %p39, %p40;
+ @!%p41 bra BB12_65;
+ bra.uni BB12_41;
+
+BB12_41:
+ cvt.rmi.f64.f64 %fd73, %fd98;
mul.f64 %fd74, %fd1, %fd73;
- sub.f64 %fd99, %fd68, %fd74;
- bra.uni BB12_69;
+ sub.f64 %fd98, %fd68, %fd74;
+ bra.uni BB12_65;
-BB12_76:
- setp.eq.s32 %p91, %r6, 2;
- @%p91 bra BB12_135;
- bra.uni BB12_77;
+BB12_72:
+ setp.eq.s32 %p95, %r6, 2;
+ @%p95 bra BB12_127;
+ bra.uni BB12_73;
-BB12_135:
- mul.f64 %fd108, %fd1, %fd68;
- bra.uni BB12_137;
+BB12_127:
+ mul.f64 %fd106, %fd1, %fd68;
+ bra.uni BB12_129;
-BB12_92:
- setp.eq.s32 %p78, %r6, 11;
- @%p78 bra BB12_115;
+BB12_88:
+ setp.eq.s32 %p82, %r6, 11;
+ @%p82 bra BB12_109;
- setp.eq.s32 %p79, %r6, 12;
- @%p79 bra BB12_114;
- bra.uni BB12_94;
+ setp.eq.s32 %p83, %r6, 12;
+ @%p83 bra BB12_108;
+ bra.uni BB12_90;
-BB12_114:
- max.f64 %fd108, %fd1, %fd68;
- bra.uni BB12_137;
+BB12_108:
+ max.f64 %fd106, %fd1, %fd68;
+ bra.uni BB12_129;
-BB12_83:
- setp.eq.s32 %p85, %r6, 6;
- @%p85 bra BB12_118;
+BB12_79:
+ setp.eq.s32 %p89, %r6, 6;
+ @%p89 bra BB12_112;
- setp.eq.s32 %p86, %r6, 7;
- @%p86 bra BB12_117;
- bra.uni BB12_85;
+ setp.eq.s32 %p90, %r6, 7;
+ @%p90 bra BB12_111;
+ bra.uni BB12_81;
-BB12_117:
- setp.gt.f64 %p110, %fd1, %fd68;
- selp.f64 %fd108, 0d3FF0000000000000, 0d0000000000000000, %p110;
- bra.uni BB12_137;
+BB12_111:
+ setp.gt.f64 %p116, %fd1, %fd68;
+ selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p116;
+ bra.uni BB12_129;
-BB12_100:
- setp.eq.s32 %p72, %r6, 16;
- @%p72 bra BB12_112;
+BB12_96:
+ setp.eq.s32 %p76, %r6, 16;
+ @%p76 bra BB12_106;
- setp.eq.s32 %p73, %r6, 17;
- @%p73 bra BB12_107;
- bra.uni BB12_102;
+ setp.eq.s32 %p77, %r6, 17;
+ @%p77 bra BB12_102;
+ bra.uni BB12_98;
-BB12_107:
- setp.eq.f64 %p98, %fd68, 0d0000000000000000;
- setp.eq.f64 %p99, %fd68, 0d8000000000000000;
- or.pred %p100, %p98, %p99;
- mov.f64 %fd108, 0d7FF8000000000000;
- @%p100 bra BB12_137;
+BB12_102:
+ setp.eq.f64 %p103, %fd68, 0d0000000000000000;
+ setp.eq.f64 %p104, %fd68, 0d8000000000000000;
+ or.pred %p105, %p103, %p104;
+ mov.f64 %fd106, 0d7FF8000000000000;
+ @%p105 bra BB12_129;
- div.rn.f64 %fd108, %fd1, %fd68;
- abs.f64 %fd83, %fd108;
- setp.gtu.f64 %p101, %fd83, 0d7FF0000000000000;
- @%p101 bra BB12_137;
+ div.rn.f64 %fd106, %fd1, %fd68;
+ abs.f64 %fd83, %fd106;
+ setp.gtu.f64 %p106, %fd83, 0d7FF0000000000000;
+ @%p106 bra BB12_129;
{
.reg .b32 %temp;
- mov.b64 {%temp, %r53}, %fd108;
+ mov.b64 {%r52, %temp}, %fd106;
}
- and.b32 %r54, %r53, 2147483647;
- setp.ne.s32 %p102, %r54, 2146435072;
- @%p102 bra BB12_111;
-
{
.reg .b32 %temp;
- mov.b64 {%r55, %temp}, %fd108;
+ mov.b64 {%temp, %r53}, %fd106;
}
- setp.eq.s32 %p103, %r55, 0;
- @%p103 bra BB12_137;
-
-BB12_111:
- cvt.rmi.f64.f64 %fd84, %fd108;
+ and.b32 %r54, %r53, 2147483647;
+ setp.ne.s32 %p107, %r54, 2146435072;
+ setp.ne.s32 %p108, %r52, 0;
+ or.pred %p109, %p107, %p108;
+ @!%p109 bra BB12_129;
+ bra.uni BB12_105;
+
+BB12_105:
+ cvt.rmi.f64.f64 %fd84, %fd106;
mul.f64 %fd85, %fd84, %fd68;
- sub.f64 %fd108, %fd1, %fd85;
- bra.uni BB12_137;
+ sub.f64 %fd106, %fd1, %fd85;
+ bra.uni BB12_129;
BB12_6:
setp.eq.s32 %p30, %r6, 1;
@%p30 bra BB12_7;
- bra.uni BB12_69;
+ bra.uni BB12_65;
BB12_7:
- sub.f64 %fd99, %fd68, %fd1;
- bra.uni BB12_69;
+ sub.f64 %fd98, %fd68, %fd1;
+ bra.uni BB12_65;
BB12_22:
setp.eq.s32 %p18, %r6, 10;
@%p18 bra BB12_23;
- bra.uni BB12_69;
+ bra.uni BB12_65;
BB12_23:
- setp.neu.f64 %p43, %fd1, %fd68;
- selp.f64 %fd99, 0d3FF0000000000000, 0d0000000000000000, %p43;
- bra.uni BB12_69;
+ setp.neu.f64 %p45, %fd1, %fd68;
+ selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p45;
+ bra.uni BB12_65;
BB12_13:
setp.eq.s32 %p25, %r6, 5;
@%p25 bra BB12_14;
- bra.uni BB12_69;
+ bra.uni BB12_65;
BB12_14:
- setp.gt.f64 %p48, %fd1, %fd68;
- selp.f64 %fd99, 0d3FF0000000000000, 0d0000000000000000, %p48;
- bra.uni BB12_69;
+ setp.gt.f64 %p50, %fd1, %fd68;
+ selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p50;
+ bra.uni BB12_65;
BB12_30:
setp.eq.s32 %p12, %r6, 15;
@%p12 bra BB12_31;
- bra.uni BB12_69;
+ bra.uni BB12_65;
BB12_31:
mul.f64 %fd76, %fd1, %fd68;
mov.f64 %fd77, 0d3FF0000000000000;
- sub.f64 %fd99, %fd77, %fd76;
- bra.uni BB12_69;
+ sub.f64 %fd98, %fd77, %fd76;
+ bra.uni BB12_65;
BB12_9:
setp.eq.s32 %p28, %r6, 3;
@%p28 bra BB12_10;
- bra.uni BB12_69;
+ bra.uni BB12_65;
BB12_10:
- div.rn.f64 %fd99, %fd68, %fd1;
- bra.uni BB12_69;
+ div.rn.f64 %fd98, %fd68, %fd1;
+ bra.uni BB12_65;
-BB12_47:
- min.f64 %fd99, %fd68, %fd1;
- bra.uni BB12_69;
+BB12_45:
+ min.f64 %fd98, %fd68, %fd1;
+ bra.uni BB12_65;
BB12_26:
setp.eq.s32 %p16, %r6, 13;
@%p16 bra BB12_27;
- bra.uni BB12_69;
+ bra.uni BB12_65;
BB12_27:
cvt.rni.s64.f64 %rd12, %fd68;
@@ -1672,348 +1670,344 @@ BB12_27:
cvt.u32.u64 %r21, %rd12;
cvt.u32.u64 %r22, %rd13;
and.b32 %r23, %r22, %r21;
- setp.eq.s32 %p42, %r23, 0;
- selp.f64 %fd99, 0d0000000000000000, 0d3FF0000000000000, %p42;
- bra.uni BB12_69;
+ setp.eq.s32 %p44, %r23, 0;
+ selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p44;
+ bra.uni BB12_65;
-BB12_50:
- setp.ge.f64 %p47, %fd1, %fd68;
- selp.f64 %fd99, 0d3FF0000000000000, 0d0000000000000000, %p47;
- bra.uni BB12_69;
+BB12_48:
+ setp.ge.f64 %p49, %fd1, %fd68;
+ selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p49;
+ bra.uni BB12_65;
BB12_17:
setp.eq.s32 %p23, %r6, 8;
@%p23 bra BB12_18;
- bra.uni BB12_69;
+ bra.uni BB12_65;
BB12_18:
- setp.le.f64 %p45, %fd1, %fd68;
- selp.f64 %fd99, 0d3FF0000000000000, 0d0000000000000000, %p45;
- bra.uni BB12_69;
+ setp.le.f64 %p47, %fd1, %fd68;
+ selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p47;
+ bra.uni BB12_65;
-BB12_44:
- setp.neu.f64 %p40, %fd68, 0d0000000000000000;
+BB12_42:
+ setp.neu.f64 %p42, %fd68, 0d0000000000000000;
sub.f64 %fd75, %fd68, %fd1;
- selp.f64 %fd99, %fd75, 0d0000000000000000, %p40;
- bra.uni BB12_69;
+ selp.f64 %fd98, %fd75, 0d0000000000000000, %p42;
+ bra.uni BB12_65;
BB12_34:
setp.ne.s32 %p10, %r6, 18;
- @%p10 bra BB12_69;
+ @%p10 bra BB12_65;
- div.rn.f64 %fd99, %fd68, %fd1;
- abs.f64 %fd70, %fd99;
+ div.rn.f64 %fd98, %fd68, %fd1;
+ abs.f64 %fd70, %fd98;
setp.gtu.f64 %p31, %fd70, 0d7FF0000000000000;
- @%p31 bra BB12_69;
+ @%p31 bra BB12_65;
{
.reg .b32 %temp;
- mov.b64 {%temp, %r12}, %fd99;
+ mov.b64 {%r12, %temp}, %fd98;
}
- and.b32 %r13, %r12, 2147483647;
- setp.ne.s32 %p32, %r13, 2146435072;
- @%p32 bra BB12_38;
-
{
.reg .b32 %temp;
- mov.b64 {%r14, %temp}, %fd99;
+ mov.b64 {%temp, %r13}, %fd98;
}
- setp.eq.s32 %p33, %r14, 0;
- @%p33 bra BB12_69;
+ and.b32 %r14, %r13, 2147483647;
+ setp.ne.s32 %p32, %r14, 2146435072;
+ setp.ne.s32 %p33, %r12, 0;
+ or.pred %p34, %p32, %p33;
+ @!%p34 bra BB12_65;
+ bra.uni BB12_37;
-BB12_38:
- cvt.rmi.f64.f64 %fd99, %fd99;
- bra.uni BB12_69;
+BB12_37:
+ cvt.rmi.f64.f64 %fd98, %fd98;
+ bra.uni BB12_65;
-BB12_74:
- setp.eq.s32 %p94, %r6, 1;
- @%p94 bra BB12_75;
- bra.uni BB12_137;
+BB12_70:
+ setp.eq.s32 %p98, %r6, 1;
+ @%p98 bra BB12_71;
+ bra.uni BB12_129;
-BB12_75:
- sub.f64 %fd108, %fd1, %fd68;
- bra.uni BB12_137;
+BB12_71:
+ sub.f64 %fd106, %fd1, %fd68;
+ bra.uni BB12_129;
-BB12_90:
- setp.eq.s32 %p82, %r6, 10;
- @%p82 bra BB12_91;
- bra.uni BB12_137;
+BB12_86:
+ setp.eq.s32 %p86, %r6, 10;
+ @%p86 bra BB12_87;
+ bra.uni BB12_129;
-BB12_91:
- setp.neu.f64 %p107, %fd1, %fd68;
- selp.f64 %fd108, 0d3FF0000000000000, 0d0000000000000000, %p107;
- bra.uni BB12_137;
+BB12_87:
+ setp.neu.f64 %p113, %fd1, %fd68;
+ selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p113;
+ bra.uni BB12_129;
-BB12_81:
- setp.eq.s32 %p89, %r6, 5;
- @%p89 bra BB12_82;
- bra.uni BB12_137;
+BB12_77:
+ setp.eq.s32 %p93, %r6, 5;
+ @%p93 bra BB12_78;
+ bra.uni BB12_129;
-BB12_82:
- setp.lt.f64 %p112, %fd1, %fd68;
- selp.f64 %fd108, 0d3FF0000000000000, 0d0000000000000000, %p112;
- bra.uni BB12_137;
+BB12_78:
+ setp.lt.f64 %p118, %fd1, %fd68;
+ selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p118;
+ bra.uni BB12_129;
-BB12_98:
- setp.eq.s32 %p76, %r6, 15;
- @%p76 bra BB12_99;
- bra.uni BB12_137;
+BB12_94:
+ setp.eq.s32 %p80, %r6, 15;
+ @%p80 bra BB12_95;
+ bra.uni BB12_129;
-BB12_99:
+BB12_95:
mul.f64 %fd87, %fd1, %fd68;
mov.f64 %fd88, 0d3FF0000000000000;
- sub.f64 %fd108, %fd88, %fd87;
- bra.uni BB12_137;
+ sub.f64 %fd106, %fd88, %fd87;
+ bra.uni BB12_129;
-BB12_77:
- setp.eq.s32 %p92, %r6, 3;
- @%p92 bra BB12_78;
- bra.uni BB12_137;
+BB12_73:
+ setp.eq.s32 %p96, %r6, 3;
+ @%p96 bra BB12_74;
+ bra.uni BB12_129;
-BB12_78:
- div.rn.f64 %fd108, %fd1, %fd68;
- bra.uni BB12_137;
+BB12_74:
+ div.rn.f64 %fd106, %fd1, %fd68;
+ bra.uni BB12_129;
-BB12_115:
- min.f64 %fd108, %fd1, %fd68;
- bra.uni BB12_137;
+BB12_109:
+ min.f64 %fd106, %fd1, %fd68;
+ bra.uni BB12_129;
-BB12_94:
- setp.eq.s32 %p80, %r6, 13;
- @%p80 bra BB12_95;
- bra.uni BB12_137;
+BB12_90:
+ setp.eq.s32 %p84, %r6, 13;
+ @%p84 bra BB12_91;
+ bra.uni BB12_129;
-BB12_95:
+BB12_91:
cvt.rni.s64.f64 %rd17, %fd1;
cvt.rni.s64.f64 %rd18, %fd68;
- cvt.u32.u64 %r59, %rd17;
- cvt.u32.u64 %r60, %rd18;
- and.b32 %r61, %r60, %r59;
- setp.eq.s32 %p106, %r61, 0;
- selp.f64 %fd108, 0d0000000000000000, 0d3FF0000000000000, %p106;
- bra.uni BB12_137;
+ cvt.u32.u64 %r58, %rd17;
+ cvt.u32.u64 %r59, %rd18;
+ and.b32 %r60, %r59, %r58;
+ setp.eq.s32 %p112, %r60, 0;
+ selp.f64 %fd106, 0d0000000000000000, 0d3FF0000000000000, %p112;
+ bra.uni BB12_129;
-BB12_118:
- setp.le.f64 %p111, %fd1, %fd68;
- selp.f64 %fd108, 0d3FF0000000000000, 0d0000000000000000, %p111;
- bra.uni BB12_137;
+BB12_112:
+ setp.le.f64 %p117, %fd1, %fd68;
+ selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p117;
+ bra.uni BB12_129;
-BB12_85:
- setp.eq.s32 %p87, %r6, 8;
- @%p87 bra BB12_86;
- bra.uni BB12_137;
+BB12_81:
+ setp.eq.s32 %p91, %r6, 8;
+ @%p91 bra BB12_82;
+ bra.uni BB12_129;
-BB12_86:
- setp.ge.f64 %p109, %fd1, %fd68;
- selp.f64 %fd108, 0d3FF0000000000000, 0d0000000000000000, %p109;
- bra.uni BB12_137;
+BB12_82:
+ setp.ge.f64 %p115, %fd1, %fd68;
+ selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p115;
+ bra.uni BB12_129;
-BB12_112:
- setp.neu.f64 %p104, %fd1, 0d0000000000000000;
+BB12_106:
+ setp.neu.f64 %p110, %fd1, 0d0000000000000000;
sub.f64 %fd86, %fd1, %fd68;
- selp.f64 %fd108, %fd86, 0d0000000000000000, %p104;
- bra.uni BB12_137;
+ selp.f64 %fd106, %fd86, 0d0000000000000000, %p110;
+ bra.uni BB12_129;
-BB12_102:
- setp.ne.s32 %p74, %r6, 18;
- @%p74 bra BB12_137;
+BB12_98:
+ setp.ne.s32 %p78, %r6, 18;
+ @%p78 bra BB12_129;
- div.rn.f64 %fd108, %fd1, %fd68;
- abs.f64 %fd81, %fd108;
- setp.gtu.f64 %p95, %fd81, 0d7FF0000000000000;
- @%p95 bra BB12_137;
+ div.rn.f64 %fd106, %fd1, %fd68;
+ abs.f64 %fd81, %fd106;
+ setp.gtu.f64 %p99, %fd81, 0d7FF0000000000000;
+ @%p99 bra BB12_129;
{
.reg .b32 %temp;
- mov.b64 {%temp, %r50}, %fd108;
+ mov.b64 {%r49, %temp}, %fd106;
}
- and.b32 %r51, %r50, 2147483647;
- setp.ne.s32 %p96, %r51, 2146435072;
- @%p96 bra BB12_106;
-
{
.reg .b32 %temp;
- mov.b64 {%r52, %temp}, %fd108;
+ mov.b64 {%temp, %r50}, %fd106;
}
- setp.eq.s32 %p97, %r52, 0;
- @%p97 bra BB12_137;
+ and.b32 %r51, %r50, 2147483647;
+ setp.ne.s32 %p100, %r51, 2146435072;
+ setp.ne.s32 %p101, %r49, 0;
+ or.pred %p102, %p100, %p101;
+ @!%p102 bra BB12_129;
+ bra.uni BB12_101;
-BB12_106:
- cvt.rmi.f64.f64 %fd108, %fd108;
- bra.uni BB12_137;
+BB12_101:
+ cvt.rmi.f64.f64 %fd106, %fd106;
+ bra.uni BB12_129;
-BB12_54:
- setp.gt.s32 %p52, %r2, -1;
- @%p52 bra BB12_57;
+BB12_52:
+ setp.gt.s32 %p54, %r2, -1;
+ @%p54 bra BB12_55;
cvt.rzi.f64.f64 %fd78, %fd1;
- setp.neu.f64 %p53, %fd78, %fd1;
- selp.f64 %fd97, 0dFFF8000000000000, %fd97, %p53;
+ setp.neu.f64 %p55, %fd78, %fd1;
+ selp.f64 %fd96, 0dFFF8000000000000, %fd96, %p55;
-BB12_57:
- mov.f64 %fd24, %fd97;
+BB12_55:
+ mov.f64 %fd24, %fd96;
add.f64 %fd25, %fd1, %fd68;
{
.reg .b32 %temp;
mov.b64 {%temp, %r33}, %fd25;
}
and.b32 %r34, %r33, 2146435072;
- setp.ne.s32 %p56, %r34, 2146435072;
- mov.f64 %fd96, %fd24;
- @%p56 bra BB12_66;
+ setp.ne.s32 %p58, %r34, 2146435072;
+ mov.f64 %fd95, %fd24;
+ @%p58 bra BB12_62;
- setp.gtu.f64 %p57, %fd18, 0d7FF0000000000000;
- mov.f64 %fd96, %fd25;
- @%p57 bra BB12_66;
-
- abs.f64 %fd79, %fd1;
- setp.gtu.f64 %p58, %fd79, 0d7FF0000000000000;
+ setp.gtu.f64 %p59, %fd18, 0d7FF0000000000000;
mov.f64 %fd95, %fd25;
- mov.f64 %fd96, %fd95;
- @%p58 bra BB12_66;
-
- and.b32 %r35, %r3, 2147483647;
- setp.ne.s32 %p59, %r35, 2146435072;
@%p59 bra BB12_62;
- {
- .reg .b32 %temp;
- mov.b64 {%r36, %temp}, %fd1;
- }
- setp.eq.s32 %p60, %r36, 0;
- @%p60 bra BB12_65;
-
-BB12_62:
- and.b32 %r37, %r2, 2147483647;
- setp.ne.s32 %p61, %r37, 2146435072;
- mov.f64 %fd93, %fd24;
- mov.f64 %fd96, %fd93;
- @%p61 bra BB12_66;
+ abs.f64 %fd79, %fd1;
+ setp.gtu.f64 %p60, %fd79, 0d7FF0000000000000;
+ mov.f64 %fd94, %fd25;
+ mov.f64 %fd95, %fd94;
+ @%p60 bra BB12_62;
{
.reg .b32 %temp;
- mov.b64 {%r38, %temp}, %fd68;
+ mov.b64 {%r35, %temp}, %fd1;
}
- setp.ne.s32 %p62, %r38, 0;
- mov.f64 %fd96, %fd24;
- @%p62 bra BB12_66;
-
- shr.s32 %r39, %r3, 31;
- and.b32 %r40, %r39, -2146435072;
- add.s32 %r41, %r40, 2146435072;
- or.b32 %r42, %r41, -2147483648;
- selp.b32 %r43, %r42, %r41, %p1;
- mov.u32 %r44, 0;
- mov.b64 %fd96, {%r44, %r43};
- bra.uni BB12_66;
+ and.b32 %r36, %r3, 2147483647;
+ setp.eq.s32 %p61, %r36, 2146435072;
+ setp.eq.s32 %p62, %r35, 0;
+ and.pred %p63, %p61, %p62;
+ @%p63 bra BB12_61;
+ bra.uni BB12_59;
+
+BB12_61:
+ setp.gt.f64 %p67, %fd18, 0d3FF0000000000000;
+ selp.b32 %r44, 2146435072, 0, %p67;
+ xor.b32 %r45, %r44, 2146435072;
+ setp.lt.s32 %p68, %r3, 0;
+ selp.b32 %r46, %r45, %r44, %p68;
+ setp.eq.f64 %p69, %fd68, 0dBFF0000000000000;
+ selp.b32 %r47, 1072693248, %r46, %p69;
+ mov.u32 %r48, 0;
+ mov.b64 %fd95, {%r48, %r47};
+ bra.uni BB12_62;
-BB12_122:
- setp.gt.s32 %p116, %r4, -1;
- @%p116 bra BB12_125;
+BB12_116:
+ setp.gt.s32 %p122, %r4, -1;
+ @%p122 bra BB12_119;
cvt.rzi.f64.f64 %fd89, %fd68;
- setp.neu.f64 %p117, %fd89, %fd68;
- selp.f64 %fd106, 0dFFF8000000000000, %fd106, %p117;
+ setp.neu.f64 %p123, %fd89, %fd68;
+ selp.f64 %fd104, 0dFFF8000000000000, %fd104, %p123;
-BB12_125:
- mov.f64 %fd57, %fd106;
+BB12_119:
+ mov.f64 %fd57, %fd104;
add.f64 %fd58, %fd1, %fd68;
{
.reg .b32 %temp;
- mov.b64 {%temp, %r71}, %fd58;
+ mov.b64 {%temp, %r70}, %fd58;
}
- and.b32 %r72, %r71, 2146435072;
- setp.ne.s32 %p120, %r72, 2146435072;
- mov.f64 %fd105, %fd57;
- @%p120 bra BB12_134;
+ and.b32 %r71, %r70, 2146435072;
+ setp.ne.s32 %p126, %r71, 2146435072;
+ mov.f64 %fd103, %fd57;
+ @%p126 bra BB12_126;
- setp.gtu.f64 %p121, %fd51, 0d7FF0000000000000;
- mov.f64 %fd105, %fd58;
- @%p121 bra BB12_134;
+ setp.gtu.f64 %p127, %fd51, 0d7FF0000000000000;
+ mov.f64 %fd103, %fd58;
+ @%p127 bra BB12_126;
abs.f64 %fd90, %fd68;
- setp.gtu.f64 %p122, %fd90, 0d7FF0000000000000;
- mov.f64 %fd104, %fd58;
- mov.f64 %fd105, %fd104;
- @%p122 bra BB12_134;
+ setp.gtu.f64 %p128, %fd90, 0d7FF0000000000000;
+ mov.f64 %fd102, %fd58;
+ mov.f64 %fd103, %fd102;
+ @%p128 bra BB12_126;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r72, %temp}, %fd68;
+ }
and.b32 %r73, %r5, 2147483647;
- setp.ne.s32 %p123, %r73, 2146435072;
- @%p123 bra BB12_130;
+ setp.eq.s32 %p129, %r73, 2146435072;
+ setp.eq.s32 %p130, %r72, 0;
+ and.pred %p131, %p129, %p130;
+ @%p131 bra BB12_125;
+ bra.uni BB12_123;
+BB12_125:
+ setp.gt.f64 %p135, %fd51, 0d3FF0000000000000;
+ selp.b32 %r81, 2146435072, 0, %p135;
+ xor.b32 %r82, %r81, 2146435072;
+ setp.lt.s32 %p136, %r5, 0;
+ selp.b32 %r83, %r82, %r81, %p136;
+ setp.eq.f64 %p137, %fd1, 0dBFF0000000000000;
+ selp.b32 %r84, 1072693248, %r83, %p137;
+ mov.u32 %r85, 0;
+ mov.b64 %fd103, {%r85, %r84};
+ bra.uni BB12_126;
+
+BB12_59:
{
.reg .b32 %temp;
- mov.b64 {%r74, %temp}, %fd68;
+ mov.b64 {%r37, %temp}, %fd68;
}
- setp.eq.s32 %p124, %r74, 0;
- @%p124 bra BB12_133;
+ and.b32 %r38, %r2, 2147483647;
+ setp.eq.s32 %p64, %r38, 2146435072;
+ setp.eq.s32 %p65, %r37, 0;
+ and.pred %p66, %p64, %p65;
+ mov.f64 %fd95, %fd24;
+ @!%p66 bra BB12_62;
+ bra.uni BB12_60;
+
+BB12_60:
+ shr.s32 %r39, %r3, 31;
+ and.b32 %r40, %r39, -2146435072;
+ selp.b32 %r41, -1048576, 2146435072, %p1;
+ add.s32 %r42, %r41, %r40;
+ mov.u32 %r43, 0;
+ mov.b64 %fd95, {%r43, %r42};
-BB12_130:
- and.b32 %r75, %r4, 2147483647;
- setp.ne.s32 %p125, %r75, 2146435072;
- mov.f64 %fd102, %fd57;
- mov.f64 %fd105, %fd102;
- @%p125 bra BB12_134;
+BB12_62:
+ setp.eq.f64 %p70, %fd1, 0d0000000000000000;
+ setp.eq.f64 %p71, %fd68, 0d3FF0000000000000;
+ or.pred %p72, %p71, %p70;
+ selp.f64 %fd98, 0d3FF0000000000000, %fd95, %p72;
+BB12_65:
+ st.global.f64 [%rd1], %fd98;
+ bra.uni BB12_130;
+
+BB12_123:
{
.reg .b32 %temp;
- mov.b64 {%r76, %temp}, %fd1;
+ mov.b64 {%r74, %temp}, %fd1;
}
- setp.ne.s32 %p126, %r76, 0;
- mov.f64 %fd105, %fd57;
- @%p126 bra BB12_134;
-
- shr.s32 %r77, %r5, 31;
- and.b32 %r78, %r77, -2146435072;
- add.s32 %r79, %r78, 2146435072;
- or.b32 %r80, %r79, -2147483648;
- selp.b32 %r81, %r80, %r79, %p2;
- mov.u32 %r82, 0;
- mov.b64 %fd105, {%r82, %r81};
- bra.uni BB12_134;
+ and.b32 %r75, %r4, 2147483647;
+ setp.eq.s32 %p132, %r75, 2146435072;
+ setp.eq.s32 %p133, %r74, 0;
+ and.pred %p134, %p132, %p133;
+ mov.f64 %fd103, %fd57;
+ @!%p134 bra BB12_126;
+ bra.uni BB12_124;
-BB12_65:
- setp.gt.f64 %p63, %fd18, 0d3FF0000000000000;
- selp.b32 %r45, 2146435072, 0, %p63;
- xor.b32 %r46, %r45, 2146435072;
- setp.lt.s32 %p64, %r3, 0;
- selp.b32 %r47, %r46, %r45, %p64;
- setp.eq.f64 %p65, %fd68, 0dBFF0000000000000;
- selp.b32 %r48, 1072693248, %r47, %p65;
- mov.u32 %r49, 0;
- mov.b64 %fd96, {%r49, %r48};
+BB12_124:
+ shr.s32 %r76, %r5, 31;
+ and.b32 %r77, %r76, -2146435072;
+ selp.b32 %r78, -1048576, 2146435072, %p2;
+ add.s32 %r79, %r78, %r77;
+ mov.u32 %r80, 0;
+ mov.b64 %fd103, {%r80, %r79};
+
+BB12_126:
+ setp.eq.f64 %p138, %fd68, 0d0000000000000000;
+ setp.eq.f64 %p139, %fd1, 0d3FF0000000000000;
+ or.pred %p140, %p139, %p138;
+ selp.f64 %fd106, 0d3FF0000000000000, %fd103, %p140;
+
+BB12_129:
+ st.global.f64 [%rd1], %fd106;
-BB12_66:
- setp.eq.f64 %p66, %fd1, 0d0000000000000000;
- setp.eq.f64 %p67, %fd68, 0d3FF0000000000000;
- or.pred %p68, %p67, %p66;
- selp.f64 %fd99, 0d3FF0000000000000, %fd96, %p68;
-
-BB12_69:
- st.global.f64 [%rd1], %fd99;
- bra.uni BB12_138;
-
-BB12_133:
- setp.gt.f64 %p127, %fd51, 0d3FF0000000000000;
- selp.b32 %r83, 2146435072, 0, %p127;
- xor.b32 %r84, %r83, 2146435072;
- setp.lt.s32 %p128, %r5, 0;
- selp.b32 %r85, %r84, %r83, %p128;
- setp.eq.f64 %p129, %fd1, 0dBFF0000000000000;
- selp.b32 %r86, 1072693248, %r85, %p129;
- mov.u32 %r87, 0;
- mov.b64 %fd105, {%r87, %r86};
-
-BB12_134:
- setp.eq.f64 %p130, %fd68, 0d0000000000000000;
- setp.eq.f64 %p131, %fd1, 0d3FF0000000000000;
- or.pred %p132, %p131, %p130;
- selp.f64 %fd108, 0d3FF0000000000000, %fd105, %p132;
-
-BB12_137:
- st.global.f64 [%rd1], %fd108;
-
-BB12_138:
+BB12_130:
bar.sync 0;
ret;
}
@@ -4431,7 +4425,7 @@ BB34_2:
.local .align 4 .b8 __local_depot35[4];
.reg .b64 %SP;
.reg .b64 %SPL;
- .reg .pred %p<7>;
+ .reg .pred %p<9>;
.reg .b32 %r<18>;
.reg .f64 %fd<41>;
.reg .b64 %rd<17>;
@@ -4449,7 +4443,7 @@ BB34_2:
mov.u32 %r8, %tid.x;
mad.lo.s32 %r1, %r6, %r7, %r8;
setp.ge.u32 %p1, %r1, %r5;
- @%p1 bra BB35_11;
+ @%p1 bra BB35_10;
cvta.to.global.u64 %rd6, %rd3;
cvt.s64.s32 %rd2, %r1;
@@ -4458,23 +4452,24 @@ BB34_2:
ld.global.f64 %fd38, [%rd8];
{
.reg .b32 %temp;
- mov.b64 {%temp, %r9}, %fd38;
+ mov.b64 {%r9, %temp}, %fd38;
}
- and.b32 %r10, %r9, 2147483647;
- setp.ne.s32 %p2, %r10, 2146435072;
- @%p2 bra BB35_4;
-
{
.reg .b32 %temp;
- mov.b64 {%r11, %temp}, %fd38;
+ mov.b64 {%temp, %r10}, %fd38;
}
- setp.ne.s32 %p3, %r11, 0;
- @%p3 bra BB35_4;
+ and.b32 %r11, %r10, 2147483647;
+ setp.eq.s32 %p2, %r11, 2146435072;
+ setp.eq.s32 %p3, %r9, 0;
+ and.pred %p4, %p2, %p3;
+ @!%p4 bra BB35_3;
+ bra.uni BB35_2;
+BB35_2:
mov.f64 %fd14, 0d0000000000000000;
mul.rn.f64 %fd38, %fd38, %fd14;
-BB35_4:
+BB35_3:
mul.f64 %fd15, %fd38, 0d3FE45F306DC9C883;
cvt.rni.s32.f64 %r17, %fd15;
st.local.u32 [%rd1], %r17;
@@ -4491,8 +4486,8 @@ BB35_4:
mov.b64 {%temp, %r12}, %fd38;
}
and.b32 %r13, %r12, 2145386496;
- setp.lt.u32 %p4, %r13, 1105199104;
- @%p4 bra BB35_6;
+ setp.lt.u32 %p5, %r13, 1105199104;
+ @%p5 bra BB35_5;
// Callseq Start 3
{
@@ -4515,11 +4510,11 @@ BB35_4:
}// Callseq End 3
ld.local.u32 %r17, [%rd1];
-BB35_6:
+BB35_5:
and.b32 %r14, %r17, 1;
shl.b32 %r15, %r14, 3;
- setp.eq.s32 %p5, %r14, 0;
- selp.f64 %fd23, 0d3DE5DB65F9785EBA, 0dBDA8FF8320FD8164, %p5;
+ setp.eq.b32 %p6, %r14, 1;
+ selp.f64 %fd23, 0dBDA8FF8320FD8164, 0d3DE5DB65F9785EBA, %p6;
mul.wide.u32 %rd10, %r15, 8;
mov.u64 %rd11, __cudart_sin_cos_coeffs;
add.s64 %rd12, %rd10, %rd11;
@@ -4537,27 +4532,179 @@ BB35_6:
ld.const.f64 %fd34, [%rd12+48];
fma.rn.f64 %fd8, %fd33, %fd7, %fd34;
fma.rn.f64 %fd40, %fd8, %fd39, %fd39;
- @%p5 bra BB35_8;
+ setp.eq.s32 %p7, %r14, 0;
+ @%p7 bra BB35_7;
mov.f64 %fd35, 0d3FF0000000000000;
fma.rn.f64 %fd40, %fd8, %fd7, %fd35;
-BB35_8:
+BB35_7:
and.b32 %r16, %r17, 2;
- setp.eq.s32 %p6, %r16, 0;
- @%p6 bra BB35_10;
+ setp.eq.s32 %p8, %r16, 0;
+ @%p8 bra BB35_9;
mov.f64 %fd36, 0d0000000000000000;
mov.f64 %fd37, 0dBFF0000000000000;
fma.rn.f64 %fd40, %fd40, %fd37, %fd36;
-BB35_10:
+BB35_9:
cvta.to.global.u64 %rd13, %rd4;
shl.b64 %rd14, %rd2, 3;
add.s64 %rd15, %rd13, %rd14;
st.global.f64 [%rd15], %fd40;
-BB35_11:
+BB35_10:
+ ret;
+}
+
+ // .globl matrix_sinh
+.visible .entry matrix_sinh(
+ .param .u64 matrix_sinh_param_0,
+ .param .u64 matrix_sinh_param_1,
+ .param .u32 matrix_sinh_param_2
+)
+{
+ .reg .pred %p<7>;
+ .reg .b32 %r<24>;
+ .reg .f64 %fd<68>;
+ .reg .b64 %rd<10>;
+
+
+ ld.param.u64 %rd2, [matrix_sinh_param_0];
+ ld.param.u64 %rd3, [matrix_sinh_param_1];
+ ld.param.u32 %r3, [matrix_sinh_param_2];
+ mov.u32 %r4, %ctaid.x;
+ mov.u32 %r5, %ntid.x;
+ mov.u32 %r6, %tid.x;
+ mad.lo.s32 %r1, %r5, %r4, %r6;
+ setp.ge.u32 %p1, %r1, %r3;
+ @%p1 bra BB36_5;
+
+ cvta.to.global.u64 %rd4, %rd2;
+ cvt.s64.s32 %rd1, %r1;
+ mul.wide.s32 %rd5, %r1, 8;
+ add.s64 %rd6, %rd4, %rd5;
+ ld.global.f64 %fd5, [%rd6];
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r2}, %fd5;
+ }
+ and.b32 %r7, %r2, 2147483647;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r8, %temp}, %fd5;
+ }
+ mov.b64 %fd1, {%r8, %r7};
+ setp.lt.u32 %p2, %r7, 1072693248;
+ @%p2 bra BB36_3;
+ bra.uni BB36_2;
+
+BB36_3:
+ mul.f64 %fd51, %fd1, %fd1;
+ mov.f64 %fd52, 0d3DE611A561D87DEF;
+ mov.f64 %fd53, 0d3D6B4C75AB274C53;
+ fma.rn.f64 %fd54, %fd53, %fd51, %fd52;
+ mov.f64 %fd55, 0d3E5AE64671B18F5C;
+ fma.rn.f64 %fd56, %fd54, %fd51, %fd55;
+ mov.f64 %fd57, 0d3EC71DE3A465B1E4;
+ fma.rn.f64 %fd58, %fd56, %fd51, %fd57;
+ mov.f64 %fd59, 0d3F2A01A01A02899D;
+ fma.rn.f64 %fd60, %fd58, %fd51, %fd59;
+ mov.f64 %fd61, 0d3F811111111110A6;
+ fma.rn.f64 %fd62, %fd60, %fd51, %fd61;
+ mov.f64 %fd63, 0d3FC5555555555556;
+ fma.rn.f64 %fd64, %fd62, %fd51, %fd63;
+ mul.f64 %fd65, %fd51, %fd64;
+ fma.rn.f64 %fd67, %fd65, %fd1, %fd1;
+ bra.uni BB36_4;
+
+BB36_2:
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r9}, %fd1;
+ }
+ mov.f64 %fd6, 0d4338000000000000;
+ mov.f64 %fd7, 0d3FF71547652B82FE;
+ fma.rn.f64 %fd8, %fd1, %fd7, %fd6;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r10, %temp}, %fd8;
+ }
+ add.s32 %r11, %r10, -1;
+ mov.f64 %fd9, 0dC338000000000000;
+ add.rn.f64 %fd10, %fd8, %fd9;
+ mov.f64 %fd11, 0dBFE62E42FEFA39EF;
+ fma.rn.f64 %fd12, %fd10, %fd11, %fd1;
+ mov.f64 %fd13, 0dBC7ABC9E3B39803F;
+ fma.rn.f64 %fd14, %fd10, %fd13, %fd12;
+ add.s32 %r12, %r9, %r9;
+ setp.lt.u32 %p3, %r12, 2142496327;
+ selp.b32 %r13, 0, %r11, %p3;
+ selp.f64 %fd15, %fd1, %fd14, %p3;
+ mov.f64 %fd16, 0d3E5AF86D8EBD13CD;
+ mov.f64 %fd17, 0d3E21F4076ACD15B6;
+ fma.rn.f64 %fd18, %fd17, %fd15, %fd16;
+ mov.f64 %fd19, 0d3E927E5092BA033D;
+ fma.rn.f64 %fd20, %fd18, %fd15, %fd19;
+ mov.f64 %fd21, 0d3EC71DDE6C5F9DA1;
+ fma.rn.f64 %fd22, %fd20, %fd15, %fd21;
+ mov.f64 %fd23, 0d3EFA01A018D034E6;
+ fma.rn.f64 %fd24, %fd22, %fd15, %fd23;
+ mov.f64 %fd25, 0d3F2A01A01B3B6940;
+ fma.rn.f64 %fd26, %fd24, %fd15, %fd25;
+ mov.f64 %fd27, 0d3F56C16C16C1B5DD;
+ fma.rn.f64 %fd28, %fd26, %fd15, %fd27;
+ mov.f64 %fd29, 0d3F8111111110F74D;
+ fma.rn.f64 %fd30, %fd28, %fd15, %fd29;
+ mov.f64 %fd31, 0d3FA555555555554D;
+ fma.rn.f64 %fd32, %fd30, %fd15, %fd31;
+ mov.f64 %fd33, 0d3FC5555555555557;
+ fma.rn.f64 %fd34, %fd32, %fd15, %fd33;
+ mov.f64 %fd35, 0d3FE0000000000000;
+ fma.rn.f64 %fd36, %fd34, %fd15, %fd35;
+ mul.f64 %fd37, %fd15, %fd36;
+ fma.rn.f64 %fd38, %fd37, %fd15, %fd15;
+ setp.eq.s32 %p4, %r13, 1024;
+ selp.b32 %r14, -1, 0, %p4;
+ add.s32 %r15, %r14, %r13;
+ shl.b32 %r16, %r15, 20;
+ add.s32 %r17, %r16, 1072693248;
+ mov.u32 %r18, 0;
+ mov.b64 %fd39, {%r18, %r17};
+ mov.u32 %r19, 1071644672;
+ mov.b64 %fd40, {%r18, %r19};
+ sub.f64 %fd41, %fd39, %fd40;
+ fma.rn.f64 %fd42, %fd38, %fd39, %fd41;
+ add.f64 %fd43, %fd42, %fd42;
+ selp.f64 %fd44, %fd43, %fd42, %p4;
+ setp.eq.s32 %p5, %r12, 0;
+ selp.f64 %fd45, %fd15, %fd44, %p5;
+ mov.f64 %fd46, 0d3FF0000000000000;
+ mov.f64 %fd47, 0d4000000000000000;
+ fma.rn.f64 %fd48, %fd47, %fd45, %fd46;
+ div.rn.f64 %fd49, %fd45, %fd48;
+ add.f64 %fd50, %fd49, %fd45;
+ setp.ge.f64 %p6, %fd1, 0d408633CE8FB9F87E;
+ selp.f64 %fd67, 0d7FF0000000000000, %fd50, %p6;
+
+BB36_4:
+ cvta.to.global.u64 %rd7, %rd3;
+ and.b32 %r20, %r2, -2147483648;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r21}, %fd67;
+ }
+ or.b32 %r22, %r21, %r20;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r23, %temp}, %fd67;
+ }
+ mov.b64 %fd66, {%r23, %r22};
+ shl.b64 %rd8, %rd1, 3;
+ add.s64 %rd9, %rd7, %rd8;
+ st.global.f64 [%rd9], %fd66;
+
+BB36_5:
ret;
}
@@ -4568,16 +4715,16 @@ BB35_11:
.param .u32 matrix_cos_param_2
)
{
- .local .align 4 .b8 __local_depot36[4];
+ .local .align 4 .b8 __local_depot37[4];
.reg .b64 %SP;
.reg .b64 %SPL;
- .reg .pred %p<7>;
+ .reg .pred %p<9>;
.reg .b32 %r<19>;
.reg .f64 %fd<41>;
.reg .b64 %rd<17>;
- mov.u64 %rd16, __local_depot36;
+ mov.u64 %rd16, __local_depot37;
cvta.local.u64 %SP, %rd16;
ld.param.u64 %rd3, [matrix_cos_param_0];
ld.param.u64 %rd4, [matrix_cos_param_1];
@@ -4589,7 +4736,7 @@ BB35_11:
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
- @%p1 bra BB36_11;
+ @%p1 bra BB37_10;
cvta.to.global.u64 %rd6, %rd3;
cvt.s64.s32 %rd2, %r1;
@@ -4598,23 +4745,24 @@ BB35_11:
ld.global.f64 %fd38, [%rd8];
{
.reg .b32 %temp;
- mov.b64 {%temp, %r10}, %fd38;
+ mov.b64 {%r10, %temp}, %fd38;
}
- and.b32 %r11, %r10, 2147483647;
- setp.ne.s32 %p2, %r11, 2146435072;
- @%p2 bra BB36_4;
-
{
.reg .b32 %temp;
- mov.b64 {%r12, %temp}, %fd38;
+ mov.b64 {%temp, %r11}, %fd38;
}
- setp.ne.s32 %p3, %r12, 0;
- @%p3 bra BB36_4;
+ and.b32 %r12, %r11, 2147483647;
+ setp.eq.s32 %p2, %r12, 2146435072;
+ setp.eq.s32 %p3, %r10, 0;
+ and.pred %p4, %p2, %p3;
+ @!%p4 bra BB37_3;
+ bra.uni BB37_2;
+BB37_2:
mov.f64 %fd14, 0d0000000000000000;
mul.rn.f64 %fd38, %fd38, %fd14;
-BB36_4:
+BB37_3:
mul.f64 %fd15, %fd38, 0d3FE45F306DC9C883;
cvt.rni.s32.f64 %r18, %fd15;
st.local.u32 [%rd1], %r18;
@@ -4631,8 +4779,8 @@ BB36_4:
mov.b64 {%temp, %r13}, %fd38;
}
and.b32 %r14, %r13, 2145386496;
- setp.lt.u32 %p4, %r14, 1105199104;
- @%p4 bra BB36_6;
+ setp.lt.u32 %p5, %r14, 1105199104;
+ @%p5 bra BB37_5;
// Callseq Start 4
{
@@ -4655,12 +4803,12 @@ BB36_4:
}// Callseq End 4
ld.local.u32 %r18, [%rd1];
-BB36_6:
+BB37_5:
add.s32 %r5, %r18, 1;
and.b32 %r15, %r5, 1;
shl.b32 %r16, %r15, 3;
- setp.eq.s32 %p5, %r15, 0;
- selp.f64 %fd23, 0d3DE5DB65F9785EBA, 0dBDA8FF8320FD8164, %p5;
+ setp.eq.b32 %p6, %r15, 1;
+ selp.f64 %fd23, 0dBDA8FF8320FD8164, 0d3DE5DB65F9785EBA, %p6;
mul.wide.u32 %rd10, %r16, 8;
mov.u64 %rd11, __cudart_sin_cos_coeffs;
add.s64 %rd12, %rd10, %rd11;
@@ -4678,27 +4826,148 @@ BB36_6:
ld.const.f64 %fd34, [%rd12+48];
fma.rn.f64 %fd8, %fd33, %fd7, %fd34;
fma.rn.f64 %fd40, %fd8, %fd39, %fd39;
- @%p5 bra BB36_8;
+ setp.eq.s32 %p7, %r15, 0;
+ @%p7 bra BB37_7;
mov.f64 %fd35, 0d3FF0000000000000;
fma.rn.f64 %fd40, %fd8, %fd7, %fd35;
-BB36_8:
+BB37_7:
and.b32 %r17, %r5, 2;
- setp.eq.s32 %p6, %r17, 0;
- @%p6 bra BB36_10;
+ setp.eq.s32 %p8, %r17, 0;
+ @%p8 bra BB37_9;
mov.f64 %fd36, 0d0000000000000000;
mov.f64 %fd37, 0dBFF0000000000000;
fma.rn.f64 %fd40, %fd40, %fd37, %fd36;
-BB36_10:
+BB37_9:
cvta.to.global.u64 %rd13, %rd4;
shl.b64 %rd14, %rd2, 3;
add.s64 %rd15, %rd13, %rd14;
st.global.f64 [%rd15], %fd40;
-BB36_11:
+BB37_10:
+ ret;
+}
+
+ // .globl matrix_cosh
+.visible .entry matrix_cosh(
+ .param .u64 matrix_cosh_param_0,
+ .param .u64 matrix_cosh_param_1,
+ .param .u32 matrix_cosh_param_2
+)
+{
+ .reg .pred %p<4>;
+ .reg .b32 %r<16>;
+ .reg .f64 %fd<46>;
+ .reg .b64 %rd<10>;
+
+
+ ld.param.u64 %rd2, [matrix_cosh_param_0];
+ ld.param.u64 %rd3, [matrix_cosh_param_1];
+ ld.param.u32 %r2, [matrix_cosh_param_2];
+ mov.u32 %r3, %ctaid.x;
+ mov.u32 %r4, %ntid.x;
+ mov.u32 %r5, %tid.x;
+ mad.lo.s32 %r1, %r4, %r3, %r5;
+ setp.ge.u32 %p1, %r1, %r2;
+ @%p1 bra BB38_5;
+
+ cvta.to.global.u64 %rd4, %rd2;
+ cvt.s64.s32 %rd1, %r1;
+ mul.wide.s32 %rd5, %r1, 8;
+ add.s64 %rd6, %rd4, %rd5;
+ ld.global.f64 %fd1, [%rd6];
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r6}, %fd1;
+ }
+ and.b32 %r7, %r6, 2147483647;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r8, %temp}, %fd1;
+ }
+ mov.b64 %fd2, {%r8, %r7};
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r9}, %fd2;
+ }
+ setp.lt.u32 %p2, %r9, 1082536911;
+ @%p2 bra BB38_3;
+ bra.uni BB38_2;
+
+BB38_3:
+ mov.f64 %fd8, 0d4338000000000000;
+ mov.f64 %fd9, 0d3FF71547652B82FE;
+ fma.rn.f64 %fd10, %fd2, %fd9, %fd8;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r10, %temp}, %fd10;
+ }
+ mov.f64 %fd11, 0dC338000000000000;
+ add.rn.f64 %fd12, %fd10, %fd11;
+ mov.f64 %fd13, 0dBFE62E42FEFA39EF;
+ fma.rn.f64 %fd14, %fd12, %fd13, %fd2;
+ mov.f64 %fd15, 0dBC7ABC9E3B39803F;
+ fma.rn.f64 %fd16, %fd12, %fd15, %fd14;
+ mov.f64 %fd17, 0d3E928AF3FCA213EA;
+ mov.f64 %fd18, 0d3E5ADE1569CE2BDF;
+ fma.rn.f64 %fd19, %fd18, %fd16, %fd17;
+ mov.f64 %fd20, 0d3EC71DEE62401315;
+ fma.rn.f64 %fd21, %fd19, %fd16, %fd20;
+ mov.f64 %fd22, 0d3EFA01997C89EB71;
+ fma.rn.f64 %fd23, %fd21, %fd16, %fd22;
+ mov.f64 %fd24, 0d3F2A01A014761F65;
+ fma.rn.f64 %fd25, %fd23, %fd16, %fd24;
+ mov.f64 %fd26, 0d3F56C16C1852B7AF;
+ fma.rn.f64 %fd27, %fd25, %fd16, %fd26;
+ mov.f64 %fd28, 0d3F81111111122322;
+ fma.rn.f64 %fd29, %fd27, %fd16, %fd28;
+ mov.f64 %fd30, 0d3FA55555555502A1;
+ fma.rn.f64 %fd31, %fd29, %fd16, %fd30;
+ mov.f64 %fd32, 0d3FC5555555555511;
+ fma.rn.f64 %fd33, %fd31, %fd16, %fd32;
+ mov.f64 %fd34, 0d3FE000000000000B;
+ fma.rn.f64 %fd35, %fd33, %fd16, %fd34;
+ mov.f64 %fd36, 0d3FF0000000000000;
+ fma.rn.f64 %fd37, %fd35, %fd16, %fd36;
+ fma.rn.f64 %fd38, %fd37, %fd16, %fd36;
+ shl.b32 %r11, %r10, 20;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r12, %temp}, %fd38;
+ }
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r13}, %fd38;
+ }
+ add.s32 %r14, %r11, %r13;
+ add.s32 %r15, %r14, -2097152;
+ mov.b64 %fd7, {%r12, %r15};
+ // inline asm
+ rcp.approx.ftz.f64 %fd6,%fd7;
+ // inline asm
+ neg.f64 %fd39, %fd7;
+ fma.rn.f64 %fd40, %fd39, %fd6, %fd36;
+ fma.rn.f64 %fd41, %fd40, %fd40, %fd40;
+ fma.rn.f64 %fd42, %fd41, %fd6, %fd6;
+ mov.f64 %fd43, 0d3FB0000000000000;
+ fma.rn.f64 %fd45, %fd42, %fd43, %fd7;
+ bra.uni BB38_4;
+
+BB38_2:
+ setp.le.f64 %p3, %fd1, 0d7FF0000000000000;
+ selp.f64 %fd45, 0d7FF0000000000000, %fd1, %p3;
+
+BB38_4:
+ cvta.to.global.u64 %rd7, %rd3;
+ shl.b64 %rd8, %rd1, 3;
+ add.s64 %rd9, %rd7, %rd8;
+ add.f64 %fd44, %fd45, %fd45;
+ st.global.f64 [%rd9], %fd44;
+
+BB38_5:
ret;
}
@@ -4709,16 +4978,16 @@ BB36_11:
.param .u32 matrix_tan_param_2
)
{
- .local .align 4 .b8 __local_depot37[4];
+ .local .align 4 .b8 __local_depot39[4];
.reg .b64 %SP;
.reg .b64 %SPL;
- .reg .pred %p<6>;
+ .reg .pred %p<7>;
.reg .b32 %r<16>;
.reg .f64 %fd<66>;
.reg .b64 %rd<14>;
- mov.u64 %rd13, __local_depot37;
+ mov.u64 %rd13, __local_depot39;
cvta.local.u64 %SP, %rd13;
ld.param.u64 %rd3, [matrix_tan_param_0];
ld.param.u64 %rd4, [matrix_tan_param_1];
@@ -4730,7 +4999,7 @@ BB36_11:
mov.u32 %r8, %tid.x;
mad.lo.s32 %r1, %r6, %r7, %r8;
setp.ge.u32 %p1, %r1, %r5;
- @%p1 bra BB37_9;
+ @%p1 bra BB39_8;
cvta.to.global.u64 %rd6, %rd3;
cvt.s64.s32 %rd2, %r1;
@@ -4739,23 +5008,24 @@ BB36_11:
ld.global.f64 %fd63, [%rd8];
{
.reg .b32 %temp;
- mov.b64 {%temp, %r9}, %fd63;
+ mov.b64 {%r9, %temp}, %fd63;
}
- and.b32 %r10, %r9, 2147483647;
- setp.ne.s32 %p2, %r10, 2146435072;
- @%p2 bra BB37_4;
-
{
.reg .b32 %temp;
- mov.b64 {%r11, %temp}, %fd63;
+ mov.b64 {%temp, %r10}, %fd63;
}
- setp.ne.s32 %p3, %r11, 0;
- @%p3 bra BB37_4;
+ and.b32 %r11, %r10, 2147483647;
+ setp.eq.s32 %p2, %r11, 2146435072;
+ setp.eq.s32 %p3, %r9, 0;
+ and.pred %p4, %p2, %p3;
+ @!%p4 bra BB39_3;
+ bra.uni BB39_2;
+BB39_2:
mov.f64 %fd11, 0d0000000000000000;
mul.rn.f64 %fd63, %fd63, %fd11;
-BB37_4:
+BB39_3:
mul.f64 %fd12, %fd63, 0d3FE45F306DC9C883;
cvt.rni.s32.f64 %r15, %fd12;
st.local.u32 [%rd1], %r15;
@@ -4772,8 +5042,8 @@ BB37_4:
mov.b64 {%temp, %r12}, %fd63;
}
and.b32 %r13, %r12, 2145386496;
- setp.lt.u32 %p4, %r13, 1105199104;
- @%p4 bra BB37_6;
+ setp.lt.u32 %p5, %r13, 1105199104;
+ @%p5 bra BB39_5;
// Callseq Start 5
{
@@ -4796,7 +5066,7 @@ BB37_4:
}// Callseq End 5
ld.local.u32 %r15, [%rd1];
-BB37_6:
+BB39_5:
mul.f64 %fd20, %fd64, %fd64;
mov.f64 %fd21, 0dBEF9757C5B27EBB1;
mov.f64 %fd22, 0d3EE48DAC2799BCB9;
@@ -4830,11 +5100,11 @@ BB37_6:
mul.f64 %fd7, %fd20, %fd49;
fma.rn.f64 %fd65, %fd7, %fd64, %fd64;
and.b32 %r14, %r15, 1;
- setp.eq.b32 %p5, %r14, 1;
- @!%p5 bra BB37_8;
- bra.uni BB37_7;
+ setp.eq.b32 %p6, %r14, 1;
+ @!%p6 bra BB39_7;
+ bra.uni BB39_6;
-BB37_7:
+BB39_6:
sub.f64 %fd52, %fd65, %fd64;
neg.f64 %fd53, %fd52;
fma.rn.f64 %fd54, %fd7, %fd64, %fd53;
@@ -4851,13 +5121,159 @@ BB37_7:
fma.rn.f64 %fd62, %fd60, %fd54, %fd61;
fma.rn.f64 %fd65, %fd62, %fd60, %fd60;
-BB37_8:
+BB39_7:
cvta.to.global.u64 %rd10, %rd4;
shl.b64 %rd11, %rd2, 3;
add.s64 %rd12, %rd10, %rd11;
st.global.f64 [%rd12], %fd65;
-BB37_9:
+BB39_8:
+ ret;
+}
+
+ // .globl matrix_tanh
+.visible .entry matrix_tanh(
+ .param .u64 matrix_tanh_param_0,
+ .param .u64 matrix_tanh_param_1,
+ .param .u32 matrix_tanh_param_2
+)
+{
+ .reg .pred %p<4>;
+ .reg .b32 %r<17>;
+ .reg .f64 %fd<74>;
+ .reg .b64 %rd<10>;
+
+
+ ld.param.u64 %rd2, [matrix_tanh_param_0];
+ ld.param.u64 %rd3, [matrix_tanh_param_1];
+ ld.param.u32 %r4, [matrix_tanh_param_2];
+ mov.u32 %r5, %ctaid.x;
+ mov.u32 %r6, %ntid.x;
+ mov.u32 %r7, %tid.x;
+ mad.lo.s32 %r1, %r6, %r5, %r7;
+ setp.ge.u32 %p1, %r1, %r4;
+ @%p1 bra BB40_5;
+
+ cvta.to.global.u64 %rd4, %rd2;
+ cvt.s64.s32 %rd1, %r1;
+ mul.wide.s32 %rd5, %r1, 8;
+ add.s64 %rd6, %rd4, %rd5;
+ ld.global.f64 %fd1, [%rd6];
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r2}, %fd1;
+ }
+ and.b32 %r3, %r2, 2147483647;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r8, %temp}, %fd1;
+ }
+ mov.b64 %fd2, {%r8, %r3};
+ setp.ltu.f64 %p2, %fd2, 0d3FE1C7A398201CD6;
+ @%p2 bra BB40_3;
+ bra.uni BB40_2;
+
+BB40_3:
+ mul.f64 %fd51, %fd1, %fd1;
+ mov.f64 %fd52, 0dBF2B9093D89F0E23;
+ mov.f64 %fd53, 0d3F0ABFFC9B5786C4;
+ fma.rn.f64 %fd54, %fd53, %fd51, %fd52;
+ mov.f64 %fd55, 0d3F42FA2744C30B61;
+ fma.rn.f64 %fd56, %fd54, %fd51, %fd55;
+ mov.f64 %fd57, 0dBF57CF3B9C1E491D;
+ fma.rn.f64 %fd58, %fd56, %fd51, %fd57;
+ mov.f64 %fd59, 0d3F6D6C61D450119A;
+ fma.rn.f64 %fd60, %fd58, %fd51, %fd59;
+ mov.f64 %fd61, 0dBF8226DDD44294F5;
+ fma.rn.f64 %fd62, %fd60, %fd51, %fd61;
+ mov.f64 %fd63, 0d3F9664F45C2B04A6;
+ fma.rn.f64 %fd64, %fd62, %fd51, %fd63;
+ mov.f64 %fd65, 0dBFABA1BA1AD70754;
+ fma.rn.f64 %fd66, %fd64, %fd51, %fd65;
+ mov.f64 %fd67, 0d3FC111111110295E;
+ fma.rn.f64 %fd68, %fd66, %fd51, %fd67;
+ mov.f64 %fd69, 0dBFD555555555549F;
+ fma.rn.f64 %fd70, %fd68, %fd51, %fd69;
+ mul.f64 %fd71, %fd51, %fd70;
+ fma.rn.f64 %fd73, %fd71, %fd1, %fd1;
+ bra.uni BB40_4;
+
+BB40_2:
+ add.f64 %fd8, %fd2, %fd2;
+ mov.f64 %fd9, 0d4338000000000000;
+ mov.f64 %fd10, 0d3FF71547652B82FE;
+ fma.rn.f64 %fd11, %fd8, %fd10, %fd9;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r9, %temp}, %fd11;
+ }
+ mov.f64 %fd12, 0dC338000000000000;
+ add.rn.f64 %fd13, %fd11, %fd12;
+ mov.f64 %fd14, 0dBFE62E42FEFA39EF;
+ fma.rn.f64 %fd15, %fd13, %fd14, %fd8;
+ mov.f64 %fd16, 0dBC7ABC9E3B39803F;
+ fma.rn.f64 %fd17, %fd13, %fd16, %fd15;
+ mov.f64 %fd18, 0d3E5AF86D8EBD13CD;
+ mov.f64 %fd19, 0d3E21F4076ACD15B6;
+ fma.rn.f64 %fd20, %fd19, %fd17, %fd18;
+ mov.f64 %fd21, 0d3E927E5092BA033D;
+ fma.rn.f64 %fd22, %fd20, %fd17, %fd21;
+ mov.f64 %fd23, 0d3EC71DDE6C5F9DA1;
+ fma.rn.f64 %fd24, %fd22, %fd17, %fd23;
+ mov.f64 %fd25, 0d3EFA01A018D034E6;
+ fma.rn.f64 %fd26, %fd24, %fd17, %fd25;
+ mov.f64 %fd27, 0d3F2A01A01B3B6940;
+ fma.rn.f64 %fd28, %fd26, %fd17, %fd27;
+ mov.f64 %fd29, 0d3F56C16C16C1B5DD;
+ fma.rn.f64 %fd30, %fd28, %fd17, %fd29;
+ mov.f64 %fd31, 0d3F8111111110F74D;
+ fma.rn.f64 %fd32, %fd30, %fd17, %fd31;
+ mov.f64 %fd33, 0d3FA555555555554D;
+ fma.rn.f64 %fd34, %fd32, %fd17, %fd33;
+ mov.f64 %fd35, 0d3FC5555555555557;
+ fma.rn.f64 %fd36, %fd34, %fd17, %fd35;
+ mov.f64 %fd37, 0d3FE0000000000000;
+ fma.rn.f64 %fd38, %fd36, %fd17, %fd37;
+ mul.f64 %fd39, %fd17, %fd38;
+ fma.rn.f64 %fd40, %fd39, %fd17, %fd17;
+ shl.b32 %r10, %r9, 20;
+ add.s32 %r11, %r10, 1072693248;
+ mov.u32 %r12, 0;
+ mov.b64 %fd41, {%r12, %r11};
+ fma.rn.f64 %fd42, %fd40, %fd41, %fd41;
+ add.f64 %fd7, %fd42, 0d3FF0000000000000;
+ // inline asm
+ rcp.approx.ftz.f64 %fd6,%fd7;
+ // inline asm
+ neg.f64 %fd43, %fd7;
+ mov.f64 %fd44, 0d3FF0000000000000;
+ fma.rn.f64 %fd45, %fd43, %fd6, %fd44;
+ fma.rn.f64 %fd46, %fd45, %fd45, %fd45;
+ fma.rn.f64 %fd47, %fd46, %fd6, %fd6;
+ neg.f64 %fd48, %fd47;
+ mov.f64 %fd49, 0d4000000000000000;
+ fma.rn.f64 %fd50, %fd49, %fd48, %fd44;
+ setp.gt.u32 %p3, %r3, 1077936127;
+ selp.f64 %fd73, 0d3FF0000000000000, %fd50, %p3;
+
+BB40_4:
+ cvta.to.global.u64 %rd7, %rd3;
+ and.b32 %r13, %r2, -2147483648;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r14}, %fd73;
+ }
+ or.b32 %r15, %r14, %r13;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r16, %temp}, %fd73;
+ }
+ mov.b64 %fd72, {%r16, %r15};
+ shl.b64 %rd8, %rd1, 3;
+ add.s64 %rd9, %rd7, %rd8;
+ st.global.f64 [%rd9], %fd72;
+
+BB40_5:
ret;
}
@@ -4883,7 +5299,7 @@ BB37_9:
mov.u32 %r6, %tid.x;
mad.lo.s32 %r1, %r5, %r4, %r6;
setp.ge.u32 %p1, %r1, %r3;
- @%p1 bra BB38_5;
+ @%p1 bra BB41_5;
cvta.to.global.u64 %rd4, %rd2;
cvt.s64.s32 %rd1, %r1;
@@ -4897,10 +5313,10 @@ BB37_9:
mov.b32 %f1, %r2;
abs.f32 %f2, %f1;
setp.lt.f32 %p2, %f2, 0f3FE26666;
- @%p2 bra BB38_3;
- bra.uni BB38_2;
+ @%p2 bra BB41_3;
+ bra.uni BB41_2;
-BB38_3:
+BB41_3:
mul.f64 %fd55, %fd1, %fd1;
mov.f64 %fd56, 0dBFB3823B180754AF;
mov.f64 %fd57, 0d3FB0066BDC1895E9;
@@ -4929,9 +5345,9 @@ BB38_3:
fma.rn.f64 %fd80, %fd78, %fd55, %fd79;
mul.f64 %fd81, %fd55, %fd80;
fma.rn.f64 %fd82, %fd81, %fd1, %fd1;
- bra.uni BB38_4;
+ bra.uni BB41_4;
-BB38_2:
+BB41_2:
abs.f64 %fd7, %fd1;
mov.f64 %fd8, 0d3FE0000000000000;
mov.f64 %fd9, 0dBFE0000000000000;
@@ -4965,8 +5381,8 @@ BB38_2:
}
setp.lt.s32 %p3, %r10, 0;
selp.f64 %fd21, 0dFFF8000000000000, %fd20, %p3;
- setp.equ.f64 %p4, %fd6, 0d0000000000000000;
- selp.f64 %fd22, %fd6, %fd21, %p4;
+ setp.ne.f64 %p4, %fd6, 0d0000000000000000;
+ selp.f64 %fd22, %fd21, %fd6, %p4;
mov.f64 %fd23, 0dBFB3823B180754AF;
mov.f64 %fd24, 0d3FB0066BDC1895E9;
fma.rn.f64 %fd25, %fd24, %fd6, %fd23;
@@ -5011,13 +5427,13 @@ BB38_2:
or.b32 %r14, %r12, %r13;
mov.b64 %fd82, {%r11, %r14};
-BB38_4:
+BB41_4:
cvta.to.global.u64 %rd7, %rd3;
shl.b64 %rd8, %rd1, 3;
add.s64 %rd9, %rd7, %rd8;
st.global.f64 [%rd9], %fd82;
-BB38_5:
+BB41_5:
ret;
}
@@ -5042,7 +5458,7 @@ BB38_5:
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r6, %r5, %r7;
setp.ge.u32 %p1, %r1, %r4;
- @%p1 bra BB39_14;
+ @%p1 bra BB42_14;
cvta.to.global.u64 %rd4, %rd2;
cvt.s64.s32 %rd1, %r1;
@@ -5059,10 +5475,10 @@ BB38_5:
mov.b64 {%temp, %r8}, %fd1;
}
setp.lt.s32 %p2, %r8, 1071801958;
- @%p2 bra BB39_9;
- bra.uni BB39_2;
+ @%p2 bra BB42_9;
+ bra.uni BB42_2;
-BB39_9:
+BB42_9:
mul.f64 %fd62, %fd1, %fd1;
mov.f64 %fd63, 0dBFB3823B180754AF;
mov.f64 %fd64, 0d3FB0066BDC1895E9;
@@ -5092,14 +5508,14 @@ BB39_9:
mul.f64 %fd88, %fd62, %fd87;
fma.rn.f64 %fd10, %fd88, %fd1, %fd1;
setp.lt.s32 %p6, %r2, 0;
- @%p6 bra BB39_11;
+ @%p6 bra BB42_11;
mov.f64 %fd89, 0dBC91A62633145C07;
add.rn.f64 %fd90, %fd10, %fd89;
neg.f64 %fd93, %fd90;
- bra.uni BB39_12;
+ bra.uni BB42_12;
-BB39_2:
+BB42_2:
mov.f64 %fd19, 0d3FF0000000000000;
sub.f64 %fd2, %fd19, %fd1;
{
@@ -5135,7 +5551,7 @@ BB39_2:
fma.rn.f64 %fd28, %fd24, %fd25, %fd18;
fma.rn.f64 %fd3, %fd28, %fd27, %fd24;
setp.lt.s32 %p3, %r3, 1;
- @%p3 bra BB39_4;
+ @%p3 bra BB42_4;
{
.reg .b32 %temp;
@@ -5174,31 +5590,31 @@ BB39_2:
fma.rn.f64 %fd54, %fd52, %fd2, %fd53;
mul.f64 %fd55, %fd2, %fd54;
fma.rn.f64 %fd94, %fd55, %fd29, %fd29;
- bra.uni BB39_5;
+ bra.uni BB42_5;
-BB39_11:
+BB42_11:
mov.f64 %fd91, 0d3C91A62633145C07;
add.rn.f64 %fd93, %fd10, %fd91;
-BB39_12:
+BB42_12:
mov.f64 %fd92, 0d3FF921FB54442D18;
add.rn.f64 %fd94, %fd92, %fd93;
- bra.uni BB39_13;
+ bra.uni BB42_13;
-BB39_4:
+BB42_4:
mov.f64 %fd56, 0d0000000000000000;
mul.rn.f64 %fd94, %fd1, %fd56;
-BB39_5:
+BB42_5:
setp.gt.s32 %p4, %r3, -1;
- @%p4 bra BB39_7;
+ @%p4 bra BB42_7;
mov.f64 %fd57, 0d7FF0000000000000;
mul.rn.f64 %fd94, %fd94, %fd57;
-BB39_7:
+BB42_7:
setp.gt.s32 %p5, %r2, -1;
- @%p5 bra BB39_13;
+ @%p5 bra BB42_13;
mov.f64 %fd58, 0dBCA1A62633145C07;
add.rn.f64 %fd59, %fd94, %fd58;
@@ -5206,13 +5622,13 @@ BB39_7:
mov.f64 %fd61, 0d400921FB54442D18;
add.rn.f64 %fd94, %fd61, %fd60;
-BB39_13:
+BB42_13:
cvta.to.global.u64 %rd7, %rd3;
shl.b64 %rd8, %rd1, 3;
add.s64 %rd9, %rd7, %rd8;
st.global.f64 [%rd9], %fd94;
-BB39_14:
+BB42_14:
ret;
}
@@ -5237,7 +5653,7 @@ BB39_14:
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
- @%p1 bra BB40_4;
+ @%p1 bra BB43_4;
cvta.to.global.u64 %rd4, %rd2;
cvt.s64.s32 %rd1, %r1;
@@ -5247,7 +5663,7 @@ BB39_14:
abs.f64 %fd2, %fd1;
setp.leu.f64 %p2, %fd2, 0d3FF0000000000000;
mov.f64 %fd56, %fd2;
- @%p2 bra BB40_3;
+ @%p2 bra BB43_3;
// inline asm
rcp.approx.ftz.f64 %fd5,%fd2;
@@ -5261,7 +5677,7 @@ BB39_14:
selp.f64 %fd3, 0d0000000000000000, %fd11, %p3;
mov.f64 %fd56, %fd3;
-BB40_3:
+BB43_3:
mov.f64 %fd4, %fd56;
cvta.to.global.u64 %rd7, %rd3;
mul.f64 %fd12, %fd4, %fd4;
@@ -5327,7 +5743,7 @@ BB40_3:
add.s64 %rd9, %rd7, %rd8;
st.global.f64 [%rd9], %fd55;
-BB40_4:
+BB43_4:
ret;
}
@@ -5352,7 +5768,7 @@ BB40_4:
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
- @%p1 bra BB41_4;
+ @%p1 bra BB44_4;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.s32 %rd5, %r1, 8;
@@ -5361,15 +5777,15 @@ BB40_4:
setp.eq.f64 %p2, %fd1, 0d0000000000000000;
cvta.to.global.u64 %rd7, %rd3;
add.s64 %rd1, %rd7, %rd5;
- @%p2 bra BB41_3;
- bra.uni BB41_2;
+ @%p2 bra BB44_3;
+ bra.uni BB44_2;
-BB41_3:
+BB44_3:
mov.u64 %rd8, 0;
st.global.u64 [%rd1], %rd8;
- bra.uni BB41_4;
+ bra.uni BB44_4;
-BB41_2:
+BB44_2:
{
.reg .b32 %temp;
mov.b64 {%temp, %r6}, %fd1;
@@ -5389,7 +5805,7 @@ BB41_2:
mov.b64 %fd3, {%r11, %r10};
st.global.f64 [%rd1], %fd3;
-BB41_4:
+BB44_4:
ret;
}
@@ -5398,7 +5814,7 @@ BB41_4:
.param .b64 __internal_trig_reduction_slowpathd_param_1
)
{
- .local .align 8 .b8 __local_depot42[40];
+ .local .align 8 .b8 __local_depot45[40];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<9>;
@@ -5407,7 +5823,7 @@ BB41_4:
.reg .b64 %rd<101>;
- mov.u64 %rd100, __local_depot42;
+ mov.u64 %rd100, __local_depot45;
cvta.local.u64 %SP, %rd100;
ld.param.f64 %fd4, [__internal_trig_reduction_slowpathd_param_0];
ld.param.u64 %rd37, [__internal_trig_reduction_slowpathd_param_1];
@@ -5421,7 +5837,7 @@ BB41_4:
shr.u32 %r3, %r1, 20;
bfe.u32 %r4, %r1, 20, 11;
setp.eq.s32 %p1, %r4, 2047;
- @%p1 bra BB42_13;
+ @%p1 bra BB45_13;
add.s32 %r16, %r4, -1024;
shr.u32 %r17, %r16, 6;
@@ -5434,7 +5850,7 @@ BB41_4:
setp.gt.s32 %p2, %r5, %r6;
mov.u64 %rd94, 0;
mov.u64 %rd93, %rd1;
- @%p2 bra BB42_4;
+ @%p2 bra BB45_4;
mov.b64 %rd41, %fd4;
shl.b64 %rd42, %rd41, 11;
@@ -5453,7 +5869,7 @@ BB41_4:
mov.u64 %rd91, %rd1;
mov.u32 %r39, %r7;
-BB42_3:
+BB45_3:
.pragma "nounroll";
mov.u32 %r8, %r39;
mov.u64 %rd7, %rd91;
@@ -5488,15 +5904,15 @@ BB42_3:
setp.lt.s32 %p3, %r9, %r6;
mov.u64 %rd91, %rd13;
mov.u32 %r39, %r9;
- @%p3 bra BB42_3;
+ @%p3 bra BB45_3;
-BB42_4:
+BB45_4:
st.local.u64 [%rd93], %rd94;
ld.local.u64 %rd95, [%rd1+16];
ld.local.u64 %rd96, [%rd1+24];
and.b32 %r10, %r3, 63;
setp.eq.s32 %p4, %r10, 0;
- @%p4 bra BB42_6;
+ @%p4 bra BB45_6;
mov.u32 %r27, 64;
sub.s32 %r28, %r27, %r10;
@@ -5508,7 +5924,7 @@ BB42_4:
shr.u64 %rd56, %rd55, %r28;
or.b64 %rd95, %rd56, %rd54;
-BB42_6:
+BB45_6:
cvta.to.local.u64 %rd57, %rd37;
shr.u64 %rd58, %rd96, 62;
cvt.u32.u64 %r29, %rd58;
@@ -5521,11 +5937,11 @@ BB42_6:
and.b32 %r31, %r30, 1;
add.s32 %r32, %r31, %r29;
neg.s32 %r33, %r32;
- setp.eq.s32 %p5, %r40, 0;
- selp.b32 %r34, %r32, %r33, %p5;
+ setp.ne.s32 %p5, %r40, 0;
+ selp.b32 %r34, %r33, %r32, %p5;
st.local.u32 [%rd57], %r34;
setp.eq.s32 %p6, %r31, 0;
- @%p6 bra BB42_8;
+ @%p6 bra BB45_8;
mov.u64 %rd65, 0;
// inline asm
@@ -5545,10 +5961,10 @@ BB42_6:
// inline asm
xor.b32 %r40, %r40, -2147483648;
-BB42_8:
+BB45_8:
clz.b64 %r41, %rd98;
setp.eq.s32 %p7, %r41, 0;
- @%p7 bra BB42_10;
+ @%p7 bra BB45_10;
shl.b64 %rd68, %rd98, %r41;
mov.u32 %r35, 64;
@@ -5556,7 +5972,7 @@ BB42_8:
shr.u64 %rd69, %rd97, %r36;
or.b64 %rd98, %rd69, %rd68;
-BB42_10:
+BB45_10:
mov.u64 %rd73, -3958705157555305931;
// inline asm
{
@@ -5577,7 +5993,7 @@ BB42_10:
}
// inline asm
setp.lt.s64 %p8, %rd99, 1;
- @%p8 bra BB42_12;
+ @%p8 bra BB45_12;
// inline asm
{
@@ -5596,7 +6012,7 @@ BB42_10:
// inline asm
add.s32 %r41, %r41, 1;
-BB42_12:
+BB45_12:
cvt.u64.u32 %rd80, %r40;
shl.b64 %rd81, %rd80, 32;
mov.u32 %r37, 1022;
@@ -5611,7 +6027,7 @@ BB42_12:
or.b64 %rd89, %rd88, %rd81;
mov.b64 %fd4, %rd89;
-BB42_13:
+BB45_13:
st.param.f64 [func_retval0+0], %fd4;
ret;
}
@@ -5621,7 +6037,7 @@ BB42_13:
.param .b64 __internal_accurate_pow_param_1
)
{
- .reg .pred %p<9>;
+ .reg .pred %p<10>;
.reg .f32 %f<3>;
.reg .b32 %r<52>;
.reg .f64 %fd<134>;
@@ -5639,7 +6055,7 @@ BB42_13:
}
shr.u32 %r50, %r49, 20;
setp.ne.s32 %p1, %r50, 0;
- @%p1 bra BB43_2;
+ @%p1 bra BB46_2;
mul.f64 %fd14, %fd12, 0d4350000000000000;
{
@@ -5653,13 +6069,13 @@ BB42_13:
shr.u32 %r16, %r49, 20;
add.s32 %r50, %r16, -54;
-BB43_2:
+BB46_2:
add.s32 %r51, %r50, -1023;
and.b32 %r17, %r49, -2146435073;
or.b32 %r18, %r17, 1072693248;
mov.b64 %fd132, {%r48, %r18};
setp.lt.u32 %p2, %r18, 1073127583;
- @%p2 bra BB43_4;
+ @%p2 bra BB46_4;
{
.reg .b32 %temp;
@@ -5673,7 +6089,7 @@ BB43_2:
mov.b64 %fd132, {%r19, %r21};
add.s32 %r51, %r50, -1022;
-BB43_4:
+BB46_4:
add.f64 %fd16, %fd132, 0d3FF0000000000000;
// inline asm
rcp.approx.ftz.f64 %fd15,%fd16;
@@ -5838,13 +6254,13 @@ BB43_4:
mov.b32 %f2, %r35;
abs.f32 %f1, %f2;
setp.lt.f32 %p4, %f1, 0f4086232B;
- @%p4 bra BB43_7;
+ @%p4 bra BB46_7;
setp.lt.f64 %p5, %fd4, 0d0000000000000000;
add.f64 %fd129, %fd4, 0d7FF0000000000000;
selp.f64 %fd133, 0d0000000000000000, %fd129, %p5;
setp.geu.f32 %p6, %f1, 0f40874800;
- @%p6 bra BB43_7;
+ @%p6 bra BB46_7;
shr.u32 %r36, %r13, 31;
add.s32 %r37, %r13, %r36;
@@ -5859,26 +6275,26 @@ BB43_4:
mov.b64 %fd131, {%r44, %r43};
mul.f64 %fd133, %fd130, %fd131;
-BB43_7:
+BB46_7:
{
.reg .b32 %temp;
mov.b64 {%temp, %r45}, %fd133;
}
and.b32 %r46, %r45, 2147483647;
setp.ne.s32 %p7, %r46, 2146435072;
- @%p7 bra BB43_9;
-
{
.reg .b32 %temp;
mov.b64 {%r47, %temp}, %fd133;
}
- setp.eq.s32 %p8, %r47, 0;
- @%p8 bra BB43_10;
+ setp.ne.s32 %p8, %r47, 0;
+ or.pred %p9, %p8, %p7;
+ @!%p9 bra BB46_9;
+ bra.uni BB46_8;
-BB43_9:
+BB46_8:
fma.rn.f64 %fd133, %fd133, %fd5, %fd133;
-BB43_10:
+BB46_9:
st.param.f64 [func_retval0+0], %fd133;
ret;
}
http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/java/org/apache/sysml/hops/Hop.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/Hop.java b/src/main/java/org/apache/sysml/hops/Hop.java
index 5ee0b56..f8ee068 100644
--- a/src/main/java/org/apache/sysml/hops/Hop.java
+++ b/src/main/java/org/apache/sysml/hops/Hop.java
@@ -1051,7 +1051,7 @@ public abstract class Hop implements ParseInfo
}
public enum OpOp1 {
- NOT, ABS, SIN, COS, TAN, ASIN, ACOS, ATAN, SIGN, SQRT, LOG, EXP,
+ NOT, ABS, SIN, COS, TAN, ASIN, ACOS, ATAN, SINH, COSH, TANH, SIGN, SQRT, LOG, EXP,
CAST_AS_SCALAR, CAST_AS_MATRIX, CAST_AS_FRAME, CAST_AS_DOUBLE, CAST_AS_INT, CAST_AS_BOOLEAN,
PRINT, EIGEN, NROW, NCOL, LENGTH, ROUND, IQM, STOP, CEIL, FLOOR, MEDIAN, INVERSE, CHOLESKY,
SVD,
@@ -1276,6 +1276,9 @@ public abstract class Hop implements ParseInfo
HopsOpOp1LopsU.put(OpOp1.ASIN, org.apache.sysml.lops.Unary.OperationTypes.ASIN);
HopsOpOp1LopsU.put(OpOp1.ACOS, org.apache.sysml.lops.Unary.OperationTypes.ACOS);
HopsOpOp1LopsU.put(OpOp1.ATAN, org.apache.sysml.lops.Unary.OperationTypes.ATAN);
+ HopsOpOp1LopsU.put(OpOp1.SINH, org.apache.sysml.lops.Unary.OperationTypes.SINH);
+ HopsOpOp1LopsU.put(OpOp1.COSH, org.apache.sysml.lops.Unary.OperationTypes.COSH);
+ HopsOpOp1LopsU.put(OpOp1.TANH, org.apache.sysml.lops.Unary.OperationTypes.TANH);
HopsOpOp1LopsU.put(OpOp1.SIGN, org.apache.sysml.lops.Unary.OperationTypes.SIGN);
HopsOpOp1LopsU.put(OpOp1.SQRT, org.apache.sysml.lops.Unary.OperationTypes.SQRT);
HopsOpOp1LopsU.put(OpOp1.EXP, org.apache.sysml.lops.Unary.OperationTypes.EXP);
@@ -1310,6 +1313,9 @@ public abstract class Hop implements ParseInfo
HopsOpOp1LopsUS.put(OpOp1.ASIN, org.apache.sysml.lops.UnaryCP.OperationTypes.ASIN);
HopsOpOp1LopsUS.put(OpOp1.ACOS, org.apache.sysml.lops.UnaryCP.OperationTypes.ACOS);
HopsOpOp1LopsUS.put(OpOp1.ATAN, org.apache.sysml.lops.UnaryCP.OperationTypes.ATAN);
+ HopsOpOp1LopsUS.put(OpOp1.SINH, org.apache.sysml.lops.UnaryCP.OperationTypes.SINH);
+ HopsOpOp1LopsUS.put(OpOp1.COSH, org.apache.sysml.lops.UnaryCP.OperationTypes.COSH);
+ HopsOpOp1LopsUS.put(OpOp1.TANH, org.apache.sysml.lops.UnaryCP.OperationTypes.TANH);
HopsOpOp1LopsUS.put(OpOp1.SQRT, org.apache.sysml.lops.UnaryCP.OperationTypes.SQRT);
HopsOpOp1LopsUS.put(OpOp1.EXP, org.apache.sysml.lops.UnaryCP.OperationTypes.EXP);
HopsOpOp1LopsUS.put(OpOp1.LOG, org.apache.sysml.lops.UnaryCP.OperationTypes.LOG);
@@ -1365,6 +1371,9 @@ public abstract class Hop implements ParseInfo
HopsOpOp12String.put(OpOp1.ASIN, "asin");
HopsOpOp12String.put(OpOp1.ACOS, "acos");
HopsOpOp12String.put(OpOp1.ATAN, "atan");
+ HopsOpOp12String.put(OpOp1.SINH, "sinh");
+ HopsOpOp12String.put(OpOp1.COSH, "cosh");
+ HopsOpOp12String.put(OpOp1.TANH, "tanh");
HopsOpOp12String.put(OpOp1.STOP, "stop");
HopsOpOp12String.put(OpOp1.INVERSE, "inv");
HopsOpOp12String.put(OpOp1.SPROP, "sprop");
http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/java/org/apache/sysml/hops/UnaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/UnaryOp.java b/src/main/java/org/apache/sysml/hops/UnaryOp.java
index ca1fc84..46cabcc 100644
--- a/src/main/java/org/apache/sysml/hops/UnaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/UnaryOp.java
@@ -109,7 +109,9 @@ public class UnaryOp extends Hop implements MultiThreadedHop
switch(_op) {
case SELP:case EXP:case SQRT:case LOG:case ABS:
case ROUND:case FLOOR:case CEIL:
- case SIN:case COS: case TAN:case ASIN:case ACOS:case ATAN:
+ case SIN:case COS: case TAN:
+ case ASIN:case ACOS:case ATAN:
+ case SINH:case COSH: case TANH:
case SIGN:
return true;
default:
@@ -601,6 +603,7 @@ public class UnaryOp extends Hop implements MultiThreadedHop
if( mc.dimsKnown() ) {
if( _op==OpOp1.ABS || _op==OpOp1.COS || _op==OpOp1.SIN || _op==OpOp1.TAN
|| _op==OpOp1.ACOS || _op==OpOp1.ASIN || _op==OpOp1.ATAN
+ || _op==OpOp1.COSH || _op==OpOp1.SINH || _op==OpOp1.TANH
|| _op==OpOp1.SQRT || _op==OpOp1.ROUND
|| _op==OpOp1.SPROP || _op==OpOp1.SELP ) //sparsity preserving
{
@@ -723,8 +726,10 @@ public class UnaryOp extends Hop implements MultiThreadedHop
Hop input = getInput().get(0);
setDim1( input.getDim1() );
setDim2( input.getDim2() );
- if( _op==OpOp1.ABS || _op==OpOp1.COS || _op==OpOp1.SIN || _op==OpOp1.TAN
- || _op==OpOp1.ACOS || _op==OpOp1.ASIN || _op==OpOp1.ATAN
+ // cos, cosh and acos are excluded below because they map 0 to a non-zero value: cosh(0)=cos(0)=1, acos(0)=1.5707963267948966
+ if( _op==OpOp1.ABS || _op==OpOp1.SIN || _op==OpOp1.TAN
+ || _op==OpOp1.SINH || _op==OpOp1.TANH
+ || _op==OpOp1.ASIN || _op==OpOp1.ATAN
|| _op==OpOp1.SQRT || _op==OpOp1.ROUND || _op==OpOp1.SPROP ) //sparsity preserving
{
setNnz( input.getNnz() );