summaryrefslogtreecommitdiffhomepage
path: root/ir/be
diff options
context:
space:
mode:
authorJohannes Bucher <johannes.bucher2@student.kit.edu>2020-05-15 11:21:49 +0200
committerJohannes Bucher <johannes.bucher2@student.kit.edu>2021-03-22 12:04:24 +0100
commitdf6f8a5f86fa65bb390ff8533490b9f1927960c2 (patch)
tree1ac24064ccba7de0b12b151cb8cfff3b3ecdf658 /ir/be
parent7faa813a52b93d75d71f3d8379622bfd60c53743 (diff)
amd64: support scalar fused-multiply-add instructions (FMA3)
Adds support for fused multiply-add of scalar double- and single-precision floating point values from the FMA3 instruction set. Comprises the instructions VFMADD132SD, VFMADD213SD, VFMADD231SD, VFMADD132SS, VFMADD213SS, and VFMADD231SS. This feature can be enabled with the -mfma option.
Diffstat (limited to 'ir/be')
-rw-r--r--ir/be/amd64/amd64_bearch.c3
-rw-r--r--ir/be/amd64/amd64_bearch_t.h1
-rw-r--r--ir/be/amd64/amd64_emitter.c16
-rw-r--r--ir/be/amd64/amd64_new_nodes.c2
-rw-r--r--ir/be/amd64/amd64_new_nodes.h3
-rw-r--r--ir/be/amd64/amd64_nodes_attr.h2
-rw-r--r--ir/be/amd64/amd64_spec.pl16
-rw-r--r--ir/be/amd64/amd64_transform.c138
8 files changed, 181 insertions, 0 deletions
diff --git a/ir/be/amd64/amd64_bearch.c b/ir/be/amd64/amd64_bearch.c
index 00fd5be..4b9d82d 100644
--- a/ir/be/amd64/amd64_bearch.c
+++ b/ir/be/amd64/amd64_bearch.c
@@ -47,6 +47,8 @@ pmap *amd64_constants;
ir_mode *amd64_mode_xmm;
+bool use_scalar_fma3 = false;
+
static ir_node *create_push(ir_node *node, ir_node *schedpoint, ir_node *sp,
ir_node *mem, ir_entity *ent, x86_insn_size_t size)
{
@@ -834,6 +836,7 @@ void be_init_arch_amd64(void)
{
static const lc_opt_table_entry_t options[] = {
LC_OPT_ENT_BOOL("no-red-zone", "gcc compatibility", &amd64_use_red_zone),
+ LC_OPT_ENT_BOOL("fma", "support FMA3 code generation", &use_scalar_fma3),
LC_OPT_LAST
};
lc_opt_entry_t *be_grp = lc_opt_get_grp(firm_opt_get_root(), "be");
diff --git a/ir/be/amd64/amd64_bearch_t.h b/ir/be/amd64/amd64_bearch_t.h
index d72c4e9..94f9cbd 100644
--- a/ir/be/amd64/amd64_bearch_t.h
+++ b/ir/be/amd64/amd64_bearch_t.h
@@ -23,6 +23,7 @@ extern pmap *amd64_constants; /**< A map of entities that store const tarvals */
extern ir_mode *amd64_mode_xmm;
extern bool amd64_use_red_zone;
+extern bool use_scalar_fma3;
#define AMD64_REGISTER_SIZE 8
/** power of two stack alignment on calls */
diff --git a/ir/be/amd64/amd64_emitter.c b/ir/be/amd64/amd64_emitter.c
index da13af9..920fbe7 100644
--- a/ir/be/amd64/amd64_emitter.c
+++ b/ir/be/amd64/amd64_emitter.c
@@ -242,6 +242,22 @@ static void amd64_emit_am(const ir_node *const node, bool indirect_star)
be_emit_cstring(", ");
goto emit_addr_reg;
}
+ case AMD64_OP_REG_REG_ADDR: {
+ x86_emit_addr(node, &attr->addr);
+ be_emit_cstring(", ");
+ const arch_register_t *reg1 = arch_get_irn_register_in(node, 1);
+ emit_register_mode(reg1, attr->base.size);
+ be_emit_cstring(", ");
+ const arch_register_t *reg0 = arch_get_irn_register_in(node, 0);
+ emit_register_mode(reg0, attr->base.size);
+ return;
+ }
+ case AMD64_OP_REG_REG_REG: {
+ const arch_register_t *reg2 = arch_get_irn_register_in(node, 2);
+ emit_register_mode(reg2, attr->base.size);
+ be_emit_cstring(", ");
+ // fallthrough
+ }
case AMD64_OP_REG_REG: {
const arch_register_t *reg1 = arch_get_irn_register_in(node, 1);
emit_register_mode(reg1, attr->base.size);
diff --git a/ir/be/amd64/amd64_new_nodes.c b/ir/be/amd64/amd64_new_nodes.c
index 94e7f73..f7845ba 100644
--- a/ir/be/amd64/amd64_new_nodes.c
+++ b/ir/be/amd64/amd64_new_nodes.c
@@ -65,6 +65,8 @@ static const char *get_op_mode_string(amd64_op_mode_t const op_mode)
case AMD64_OP_SHIFT_REG: return "shift_reg";
case AMD64_OP_X87: return "x87";
case AMD64_OP_X87_ADDR_REG: return "x87+addr+reg";
+ case AMD64_OP_REG_REG_REG: return "reg+reg+reg";
+ case AMD64_OP_REG_REG_ADDR: return "reg+reg+addr";
case AMD64_OP_CC: return "cc";
}
return "invalid op_mode";
diff --git a/ir/be/amd64/amd64_new_nodes.h b/ir/be/amd64/amd64_new_nodes.h
index 9b3dd00..2e97143 100644
--- a/ir/be/amd64/amd64_new_nodes.h
+++ b/ir/be/amd64/amd64_new_nodes.h
@@ -46,6 +46,8 @@ static inline bool amd64_has_addr_attr(amd64_op_mode_t const op_mode)
case AMD64_OP_ADDR:
case AMD64_OP_REG:
case AMD64_OP_IMM32:
+ case AMD64_OP_REG_REG_REG:
+ case AMD64_OP_REG_REG_ADDR:
return true;
default:
return amd64_has_binop_attr(op_mode);
@@ -59,6 +61,7 @@ static inline bool amd64_loads(const ir_node *node)
case AMD64_OP_ADDR:
return !is_amd64_lea(node);
case AMD64_OP_REG_ADDR:
+ case AMD64_OP_REG_REG_ADDR:
return true;
/* Note: AMD64_OP_ADDR_REG, AMD64_OP_X87_ADDR_REG are stores */
default:
diff --git a/ir/be/amd64/amd64_nodes_attr.h b/ir/be/amd64/amd64_nodes_attr.h
index 4d35c63..82c4ea5 100644
--- a/ir/be/amd64/amd64_nodes_attr.h
+++ b/ir/be/amd64/amd64_nodes_attr.h
@@ -39,6 +39,8 @@ typedef enum amd64_op_mode_t {
AMD64_OP_SHIFT_IMM,
AMD64_OP_X87,
AMD64_OP_X87_ADDR_REG,
+ AMD64_OP_REG_REG_REG,
+ AMD64_OP_REG_REG_ADDR,
AMD64_OP_CC,
} amd64_op_mode_t;
diff --git a/ir/be/amd64/amd64_spec.pl b/ir/be/amd64/amd64_spec.pl
index 906ea7d..bf9b105 100644
--- a/ir/be/amd64/amd64_spec.pl
+++ b/ir/be/amd64/amd64_spec.pl
@@ -301,6 +301,16 @@ my $x87store = {
attr => "const amd64_binop_addr_attr_t *attr_init",
};
+my $fmaop = {
+ irn_flags => [ "rematerializable" ],
+ in_reqs => "...",
+ out_reqs => [ "xmm", "none", "mem" ],
+ outs => [ "res", "none", "M" ],
+ attr_type => "amd64_addr_attr_t",
+ attr => "x86_insn_size_t size, amd64_op_mode_t op_mode, x86_addr_t addr",
+ emit => "{name}%MX %AM",
+};
+
%nodes = (
push_am => {
op_flags => [ "uses_memory" ],
@@ -854,4 +864,10 @@ fpop => {
emit => "fstp %F0",
},
+# FMA instructions
+
+vfmadd132s => { template => $fmaop },
+vfmadd213s => { template => $fmaop },
+vfmadd231s => { template => $fmaop },
+
);
diff --git a/ir/be/amd64/amd64_transform.c b/ir/be/amd64/amd64_transform.c
index 16cdc20..d263df1 100644
--- a/ir/be/amd64/amd64_transform.c
+++ b/ir/be/amd64/amd64_transform.c
@@ -115,6 +115,12 @@ static const arch_register_req_t amd64_requirement_x87killed = {
.kills_value = true,
};
+static const arch_register_req_t amd64_requirement_xmm_killed = {
+ .cls = &amd64_reg_classes[CLASS_amd64_xmm],
+ .width = 1,
+ .kills_value = true,
+};
+
static const arch_register_req_t *mem_reqs[] = {
&arch_memory_requirement,
};
@@ -183,6 +189,27 @@ static const arch_register_req_t *xmm_reg_reg_mem_reqs[] = {
&arch_memory_requirement,
};
+static const arch_register_req_t *xmm_xmm_mem_reqs[] = {
+ &amd64_requirement_xmm_killed,
+ &amd64_class_reg_req_xmm,
+ &arch_memory_requirement,
+};
+
+static const arch_register_req_t *xmm_xmm_reg_mem_reqs[] = {
+ &amd64_requirement_xmm_killed,
+ &amd64_class_reg_req_xmm,
+ &amd64_class_reg_req_gp,
+ &arch_memory_requirement,
+};
+
+static const arch_register_req_t *xmm_xmm_reg_reg_mem_reqs[] = {
+ &amd64_requirement_xmm_killed,
+ &amd64_class_reg_req_xmm,
+ &amd64_class_reg_req_gp,
+ &amd64_class_reg_req_gp,
+ &arch_memory_requirement,
+};
+
static const arch_register_req_t *x87K_reg_reg_mem_reqs[] = {
&amd64_requirement_x87killed,
&amd64_class_reg_req_gp,
@@ -251,6 +278,12 @@ arch_register_req_t const *amd64_xmm_xmm_reqs[] = {
&amd64_class_reg_req_xmm,
};
+arch_register_req_t const *amd64_xmm_xmm_xmm_reqs[] = {
+ &amd64_requirement_xmm_killed,
+ &amd64_class_reg_req_xmm,
+ &amd64_class_reg_req_xmm,
+};
+
arch_register_req_t const **const gp_am_reqs[] = {
mem_reqs,
reg_mem_reqs,
@@ -265,6 +298,14 @@ static arch_register_req_t const **const xmm_am_reqs[] = {
xmm_reg_reg_mem_reqs,
};
+static arch_register_req_t const **const xmm_fma_am_reqs[] = {
+ mem_reqs,
+ xmm_mem_reqs,
+ xmm_xmm_mem_reqs,
+ xmm_xmm_reg_mem_reqs,
+ xmm_xmm_reg_reg_mem_reqs,
+};
+
static arch_register_req_t const **const x87K_am_reqs[] = {
mem_reqs,
x87K_mem_reqs,
@@ -985,6 +1026,100 @@ static x86_insn_size_t get_size_32_64_from_mode(ir_mode *const mode)
return get_mode_size_bits(mode) <= 32 ? X86_SIZE_32 : X86_SIZE_64;
}
+static ir_node *gen_fma(ir_node *const add, ir_node *const op1, ir_node *const op2)
+{
+ if (!use_scalar_fma3)
+ return NULL;
+ ir_mode *const add_mode = get_irn_mode(add);
+ if (get_mode_size_bits(add_mode) != 64 && get_mode_size_bits(add_mode) != 32)
+ return NULL;
+ ir_node *mul, *add_op;
+ if (is_Mul(op1)) {
+ mul = op1;
+ add_op = op2;
+ } else if (is_Mul(op2)) {
+ mul = op2;
+ add_op = op1;
+ } else {
+ return NULL;
+ }
+ if (get_irn_mode(mul) != add_mode)
+ return NULL;
+ if (get_irn_n_edges(mul) != 1)
+ return NULL;
+
+ ir_node *const block = get_nodes_block(add);
+ ir_node *const mul_op1 = get_Mul_left(mul);
+ ir_node *const mul_op2 = get_Mul_right(mul);
+ ir_node *load, *reg_op, *source1, *source2;
+ bool use_am;
+ ir_node *(*fma_variant)(dbg_info *, ir_node *, const int, ir_node *const *, const arch_register_req_t **,
+ x86_insn_size_t, amd64_op_mode_t, x86_addr_t);
+ //try if Add operand, left Mul operand or right Mul operand can be used as AM input
+ if ((use_am = use_address_matching(add_mode, match_am, block, mul_op1, add_op, &load, &reg_op)
+ && (!input_depends_on_load(load, mul_op2)))) {
+ source1 = mul_op2;
+ source2 = reg_op;
+ fma_variant = &new_bd_amd64_vfmadd213s;
+ } else if ((use_am = use_address_matching(add_mode, match_am, block, add_op, mul_op1, &load, &reg_op)
+ && (!input_depends_on_load(load, mul_op2)))) {
+ source1 = mul_op2;
+ source2 = reg_op;
+ fma_variant = &new_bd_amd64_vfmadd132s;
+ } else if ((use_am = use_address_matching(add_mode, match_am, block, add_op, mul_op2, &load, &reg_op)
+ && (!input_depends_on_load(load, mul_op1)))) {
+ source1 = reg_op;
+ source2 = mul_op1;
+ fma_variant = &new_bd_amd64_vfmadd231s;
+ }
+ int arity = 0;
+ amd64_op_mode_t op_mode;
+ ir_node *mem_proj = NULL;
+ ir_node *in[5];
+ const arch_register_req_t **reqs;
+ x86_addr_t addr = {
+ .base_input = 0,
+ .variant = X86_ADDR_REG,
+ };
+
+ if (use_am) {
+ int reg_input = arity++;
+ in[reg_input] = be_transform_node(source1);
+ reg_input = arity++;
+ in[reg_input] = be_transform_node(source2);
+
+ ir_node *ptr = get_Load_ptr(load);
+ perform_address_matching(ptr, &arity, in, &addr);
+
+ reqs = xmm_fma_am_reqs[arity];
+
+ ir_node *new_mem = be_transform_node(get_Load_mem(load));
+ int mem_input = arity++;
+ in[mem_input] = new_mem;
+ addr.mem_input = mem_input;
+ mem_proj = get_Proj_for_pn(load, pn_Load_M);
+ op_mode = AMD64_OP_REG_REG_ADDR;
+ } else {
+ int const input0 = arity++;
+ int const input1 = arity++;
+ int const input2 = arity++;
+ in[input0] = be_transform_node(add_op);
+ in[input1] = be_transform_node(mul_op1);
+ in[input2] = be_transform_node(mul_op2);
+ op_mode = AMD64_OP_REG_REG_REG;
+ reqs = amd64_xmm_xmm_xmm_reqs;
+ fma_variant = &new_bd_amd64_vfmadd231s;
+ }
+ x86_insn_size_t size = x86_size_from_mode(add_mode);
+ dbg_info *const dbgi = get_irn_dbg_info(add);
+ ir_node *const new_block = be_transform_node(block);
+ ir_node *const new_node = fma_variant(dbgi, new_block, arity, in, reqs, size, op_mode, addr);
+
+ fix_node_mem_proj(new_node, mem_proj);
+ arch_set_irn_register_req_out(new_node, 0, &amd64_requirement_xmm_same_0);
+ return be_new_Proj(new_node, pn_amd64_vfmadd132s_res);
+}
+
static ir_node *gen_Add(ir_node *const node)
{
ir_node *const op1 = get_Add_left(node);
@@ -995,6 +1130,9 @@ static ir_node *gen_Add(ir_node *const node)
if (mode_is_float(mode)) {
if (mode == x86_mode_E)
return gen_binop_x87(node, op1, op2, new_bd_amd64_fadd);
+ ir_node *const fma = gen_fma(node, op1, op2);
+ if (fma)
+ return fma;
return gen_binop_am(node, op1, op2, new_bd_amd64_adds,
pn_amd64_adds_res, match_commutative | match_am);
}