Skip to content

Commit

Permalink
Fix a number of problems with ARM fused multiply add/subtract instructions.
Browse files Browse the repository at this point in the history

1. The new instruction itinerary entries are not properly described.
2. The asm parser can't handle vfms and vfnms.
3. There were no assembler, disassembler test cases.
4. HasNEON2 has the wrong assembler predicate.
rdar://10139676


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@154456 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
Evan Cheng committed Apr 11, 2012
1 parent 71fbed4 commit 82509e5
Show file tree
Hide file tree
Showing 10 changed files with 160 additions and 10 deletions.
2 changes: 0 additions & 2 deletions lib/Target/ARM/ARM.td
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,6 @@ def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding",
def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP",
"true",
"Use NEON for single precision FP">;
// Allow more precision in FP computation
def FPContractions : Predicate<"!TM.Options.NoExcessFPPrecision">;

// Disable 32-bit to 16-bit narrowing for experimentation.
def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true",
Expand Down
7 changes: 5 additions & 2 deletions lib/Target/ARM/ARMInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -181,11 +181,11 @@ def HasVFP3 : Predicate<"Subtarget->hasVFP3()">,
AssemblerPredicate<"FeatureVFP3">;
def HasVFP4 : Predicate<"Subtarget->hasVFP4()">,
AssemblerPredicate<"FeatureVFP4">;
def NoVFP4 : Predicate<"!Subtarget->hasVFP4()">;
def NoVFP4 : Predicate<"!Subtarget->hasVFP4()">;
def HasNEON : Predicate<"Subtarget->hasNEON()">,
AssemblerPredicate<"FeatureNEON">;
def HasNEON2 : Predicate<"Subtarget->hasNEON2()">,
AssemblerPredicate<"FeatureNEON2">;
AssemblerPredicate<"FeatureNEON,FeatureVFP4">;
def NoNEON2 : Predicate<"!Subtarget->hasNEON2()">;
def HasFP16 : Predicate<"Subtarget->hasFP16()">,
AssemblerPredicate<"FeatureFP16">;
Expand Down Expand Up @@ -221,6 +221,9 @@ def UseMovt : Predicate<"Subtarget->useMovt()">;
def DontUseMovt : Predicate<"!Subtarget->useMovt()">;
def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">;

// Allow more precision in FP computation
def FPContractions : Predicate<"!TM.Options.NoExcessFPPrecision">;

//===----------------------------------------------------------------------===//
// ARM Flag Definitions.

Expand Down
9 changes: 4 additions & 5 deletions lib/Target/ARM/ARMInstrNEON.td
Original file line number Diff line number Diff line change
Expand Up @@ -4115,7 +4115,6 @@ defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D,
"vqdmlsl", "s", int_arm_neon_vqdmlsl>;
defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl", "s", int_arm_neon_vqdmlsl>;


// Fused Vector Multiply-Accumulate and Fused Multiply-Subtract Operations.
def VFMAfd : N3VDMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACD, "vfma", "f32",
v2f32, fmul_su, fadd_mlx>,
Expand All @@ -4136,10 +4135,10 @@ def VFMSfq : N3VQMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACQ, "vfms", "f32",
// Match @llvm.fma.* intrinsics
def : Pat<(fma (v2f32 DPR:$src1), (v2f32 DPR:$Vn), (v2f32 DPR:$Vm)),
(VFMAfd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
Requires<[HasNEON, HasVFP4]>;
Requires<[HasNEON2]>;
def : Pat<(fma (v4f32 QPR:$src1), (v4f32 QPR:$Vn), (v4f32 QPR:$Vm)),
(VFMAfq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
Requires<[HasNEON, HasVFP4]>;
Requires<[HasNEON2]>;

// Vector Subtract Operations.

Expand Down Expand Up @@ -5497,9 +5496,9 @@ def : N3VSMulOpPat<fmul, fadd, VMLAfd>,
def : N3VSMulOpPat<fmul, fsub, VMLSfd>,
Requires<[HasNEON, UseNEONForFP, UseFPVMLx, NoNEON2]>;
def : N3VSMulOpPat<fmul, fadd, VFMAfd>,
Requires<[HasNEON2, UseNEONForFP,FPContractions]>;
Requires<[HasNEON2, UseNEONForFP, FPContractions]>;
def : N3VSMulOpPat<fmul, fsub, VFMSfd>,
Requires<[HasNEON2, UseNEONForFP,FPContractions]>;
Requires<[HasNEON2, UseNEONForFP, FPContractions]>;
def : N2VSPat<fabs, VABSfd>;
def : N2VSPat<fneg, VNEGfd>;
def : N3VSPat<NEONfmax, VMAXfd>;
Expand Down
19 changes: 19 additions & 0 deletions lib/Target/ARM/ARMScheduleA8.td
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,15 @@ def CortexA8Itineraries : ProcessorItineraries<
InstrStage<19, [A8_NPipe], 0>,
InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>,
//
// Single-precision Fused FP MAC
InstrItinData<IIC_fpFMAC32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
InstrStage<1, [A8_NPipe]>], [7, 2, 1, 1]>,
//
// Double-precision Fused FP MAC
InstrItinData<IIC_fpFMAC64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
InstrStage<19, [A8_NPipe], 0>,
InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>,
//
// Single-precision FP DIV
InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
InstrStage<20, [A8_NPipe], 0>,
Expand Down Expand Up @@ -860,6 +869,16 @@ def CortexA8Itineraries : ProcessorItineraries<
InstrItinData<IIC_VMACQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>,
//
// Double-register Fused FP Multiply-Accumulate
InstrItinData<IIC_VFMACD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
InstrStage<1, [A8_NPipe]>], [9, 3, 2, 2]>,
//
// Quad-register Fused FP Multiply-Accumulate
// Result written in N9, but that is relative to the last cycle of multicycle,
// so we use 10 for those cases
InstrItinData<IIC_VFMACQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>,
//
// Double-register Reciprocal Step
InstrItinData<IIC_VRECSD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
InstrStage<1, [A8_NPipe]>], [9, 2, 2]>,
Expand Down
36 changes: 36 additions & 0 deletions lib/Target/ARM/ARMScheduleA9.td
Original file line number Diff line number Diff line change
Expand Up @@ -604,6 +604,22 @@ def CortexA9Itineraries : ProcessorItineraries<
InstrStage<2, [A9_NPipe]>],
[9, 1, 1, 1]>,
//
// Single-precision Fused FP MAC
InstrItinData<IIC_fpFMAC32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<9, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[8, 1, 1, 1]>,
//
// Double-precision Fused FP MAC
InstrItinData<IIC_fpFMAC64, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<10, [A9_DRegsN], 0, Reserved>,
InstrStage<2, [A9_NPipe]>],
[9, 1, 1, 1]>,
//
// Single-precision FP DIV
InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
Expand Down Expand Up @@ -1697,6 +1713,26 @@ def CortexA9Itineraries : ProcessorItineraries<
InstrStage<4, [A9_NPipe]>],
[8, 4, 2, 1]>,
//
// Double-register Fused FP Multiply-Accumulate
InstrItinData<IIC_VFMACD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 7 cycles
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe]>],
[6, 3, 2, 1]>,
//
// Quad-register Fused FP Multiply-Accumulate
// Result written in N9, but that is relative to the last cycle of multicycle,
// so we use 10 for those cases
InstrItinData<IIC_VFMACQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 9 cycles
InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
InstrStage<4, [A9_NPipe]>],
[8, 4, 2, 1]>,
//
// Double-register Reciprocal Step
InstrItinData<IIC_VRECSD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
Expand Down
6 changes: 6 additions & 0 deletions lib/Target/ARM/ARMScheduleV6.td
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,12 @@ def ARMV6Itineraries : ProcessorItineraries<
// Double-precision FP MAC
InstrItinData<IIC_fpMAC64 , [InstrStage<2, [V6_Pipe]>], [9, 2, 2, 2]>,
//
// Single-precision Fused FP MAC
InstrItinData<IIC_fpFMAC32, [InstrStage<1, [V6_Pipe]>], [9, 2, 2, 2]>,
//
// Double-precision Fused FP MAC
InstrItinData<IIC_fpFMAC64, [InstrStage<2, [V6_Pipe]>], [9, 2, 2, 2]>,
//
// Single-precision FP DIV
InstrItinData<IIC_fpDIV32 , [InstrStage<15, [V6_Pipe]>], [20, 2, 2]>,
//
Expand Down
2 changes: 1 addition & 1 deletion lib/Target/ARM/ARMSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
bool HasV6T2Ops;
bool HasV7Ops;

/// HasVFPv2, HasVFPv3, HasVFPv4, HasNEON, HasNEONVFPv4 - Specify what
/// HasVFPv2, HasVFPv3, HasVFPv4, HasNEON, HasNEON2 - Specify what
/// floating point ISAs are supported.
bool HasVFPv2;
bool HasVFPv3;
Expand Down
2 changes: 2 additions & 0 deletions lib/Target/ARM/AsmParser/ARMAsmParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4659,6 +4659,7 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
Mnemonic == "fmrs" || Mnemonic == "fsqrts" || Mnemonic == "fsubs" ||
Mnemonic == "fsts" || Mnemonic == "fcpys" || Mnemonic == "fdivs" ||
Mnemonic == "fmuls" || Mnemonic == "fcmps" || Mnemonic == "fcmpzs" ||
Mnemonic == "vfms" || Mnemonic == "vfnms" ||
(Mnemonic == "movs" && isThumb()))) {
Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 1);
CarrySetting = true;
Expand Down Expand Up @@ -4702,6 +4703,7 @@ getMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
Mnemonic == "orr" || Mnemonic == "mvn" ||
Mnemonic == "rsb" || Mnemonic == "rsc" || Mnemonic == "orn" ||
Mnemonic == "sbc" || Mnemonic == "eor" || Mnemonic == "neg" ||
Mnemonic == "vfm" || Mnemonic == "vfnm" ||
(!isThumb() && (Mnemonic == "smull" || Mnemonic == "mov" ||
Mnemonic == "mla" || Mnemonic == "smlal" ||
Mnemonic == "umlal" || Mnemonic == "umull"))) {
Expand Down
50 changes: 50 additions & 0 deletions test/MC/ARM/vfp4.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
@ RUN: llvm-mc < %s -triple armv7-unknown-unknown -show-encoding -mattr=+neon,+vfp4 | FileCheck %s --check-prefix=ARM
@ RUN: llvm-mc < %s -triple thumbv7-unknown-unknown -show-encoding -mattr=+neon,+vfp4 | FileCheck %s --check-prefix=THUMB
@
@ Assembler encoding test for the fused multiply-accumulate family
@ (vfma, vfnma, vfms, vfnms). The file is assembled twice, once for the
@ armv7 triple and once for the thumbv7 triple, both with +neon,+vfp4, and
@ the printed instruction plus its encoding bytes are matched against the
@ per-mode check lines that precede each instruction.
@ vfma and vfms are covered in four forms: .f64 on d registers, .f32 on
@ s registers, and .f32 on d and q registers. vfnma and vfnms are covered
@ in the .f64 d-register and .f32 s-register forms only.

@ ARM: vfma.f64 d16, d18, d17 @ encoding: [0xa1,0x0b,0xe2,0xee]
@ THUMB: vfma.f64 d16, d18, d17 @ encoding: [0xe2,0xee,0xa1,0x0b]
vfma.f64 d16, d18, d17

@ ARM: vfma.f32 s2, s4, s0 @ encoding: [0x00,0x1a,0xa2,0xee]
@ THUMB: vfma.f32 s2, s4, s0 @ encoding: [0xa2,0xee,0x00,0x1a]
vfma.f32 s2, s4, s0

@ ARM: vfma.f32 d16, d18, d17 @ encoding: [0xb1,0x0c,0x42,0xf2]
@ THUMB: vfma.f32 d16, d18, d17 @ encoding: [0x42,0xef,0xb1,0x0c]
vfma.f32 d16, d18, d17

@ ARM: vfma.f32 q2, q4, q0 @ encoding: [0x50,0x4c,0x08,0xf2]
@ THUMB: vfma.f32 q2, q4, q0 @ encoding: [0x08,0xef,0x50,0x4c]
vfma.f32 q2, q4, q0

@ ARM: vfnma.f64 d16, d18, d17 @ encoding: [0xe1,0x0b,0xd2,0xee]
@ THUMB: vfnma.f64 d16, d18, d17 @ encoding: [0xd2,0xee,0xe1,0x0b]
vfnma.f64 d16, d18, d17

@ ARM: vfnma.f32 s2, s4, s0 @ encoding: [0x40,0x1a,0x92,0xee]
@ THUMB: vfnma.f32 s2, s4, s0 @ encoding: [0x92,0xee,0x40,0x1a]
vfnma.f32 s2, s4, s0

@ ARM: vfms.f64 d16, d18, d17 @ encoding: [0xe1,0x0b,0xe2,0xee]
@ THUMB: vfms.f64 d16, d18, d17 @ encoding: [0xe2,0xee,0xe1,0x0b]
vfms.f64 d16, d18, d17

@ ARM: vfms.f32 s2, s4, s0 @ encoding: [0x40,0x1a,0xa2,0xee]
@ THUMB: vfms.f32 s2, s4, s0 @ encoding: [0xa2,0xee,0x40,0x1a]
vfms.f32 s2, s4, s0

@ ARM: vfms.f32 d16, d18, d17 @ encoding: [0xb1,0x0c,0x62,0xf2]
@ THUMB: vfms.f32 d16, d18, d17 @ encoding: [0x62,0xef,0xb1,0x0c]
vfms.f32 d16, d18, d17

@ ARM: vfms.f32 q2, q4, q0 @ encoding: [0x50,0x4c,0x28,0xf2]
@ THUMB: vfms.f32 q2, q4, q0 @ encoding: [0x28,0xef,0x50,0x4c]
vfms.f32 q2, q4, q0

@ ARM: vfnms.f64 d16, d18, d17 @ encoding: [0xa1,0x0b,0xd2,0xee]
@ THUMB: vfnms.f64 d16, d18, d17 @ encoding: [0xd2,0xee,0xa1,0x0b]
vfnms.f64 d16, d18, d17

@ ARM: vfnms.f32 s2, s4, s0 @ encoding: [0x00,0x1a,0x92,0xee]
@ THUMB: vfnms.f32 s2, s4, s0 @ encoding: [0x92,0xee,0x00,0x1a]
vfnms.f32 s2, s4, s0
37 changes: 37 additions & 0 deletions test/MC/Disassembler/ARM/vfp4.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# RUN: llvm-mc < %s -triple thumbv7-unknown-unknown --disassemble -mattr=+neon,+vfp4 | FileCheck %s
#
# Disassembler test for the fused multiply-accumulate family (vfma, vfnma,
# vfms, vfnms). Each four-byte sequence below is decoded for the thumbv7
# triple with +neon,+vfp4, and the printed instruction is matched against
# the check line directly above it. The byte sequences are the same ones
# listed as the Thumb encodings in test/MC/ARM/vfp4.s.

# CHECK: vfma.f64 d16, d18, d17
0xe2 0xee 0xa1 0x0b

# CHECK: vfma.f32 s2, s4, s0
0xa2 0xee 0x00 0x1a

# CHECK: vfma.f32 d16, d18, d17
0x42 0xef 0xb1 0x0c

# CHECK: vfma.f32 q2, q4, q0
0x08 0xef 0x50 0x4c

# CHECK: vfnms.f64 d16, d18, d17
0xd2 0xee 0xa1 0x0b

# CHECK: vfnms.f32 s2, s4, s0
0x92 0xee 0x00 0x1a

# CHECK: vfms.f64 d16, d18, d17
0xe2 0xee 0xe1 0x0b

# CHECK: vfms.f32 s2, s4, s0
0xa2 0xee 0x40 0x1a

# CHECK: vfms.f32 d16, d18, d17
0x62 0xef 0xb1 0x0c

# CHECK: vfms.f32 q2, q4, q0
0x28 0xef 0x50 0x4c

# CHECK: vfnma.f64 d16, d18, d17
0xd2 0xee 0xe1 0x0b

# CHECK: vfnma.f32 s2, s4, s0
0x92 0xee 0x40 0x1a

0 comments on commit 82509e5

Please sign in to comment.