1/*
2 * ARM translation: AArch32 Neon instructions
3 *
4 * Copyright (c) 2003 Fabrice Bellard
5 * Copyright (c) 2005-2007 CodeSourcery
6 * Copyright (c) 2007 OpenedHand, Ltd.
7 * Copyright (c) 2020 Linaro, Ltd.
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
21 */
22
23/*
24 * This file is intended to be included from translate.c; it uses
25 * some macros and definitions provided by that file.
26 * It might be possible to convert it to a standalone .c file eventually.
27 */
28
29static inline int plus1(DisasContext *s, int x)
30{
31 return x + 1;
32}
33
34static inline int rsub_64(DisasContext *s, int x)
35{
36 return 64 - x;
37}
38
39static inline int rsub_32(DisasContext *s, int x)
40{
41 return 32 - x;
42}
43static inline int rsub_16(DisasContext *s, int x)
44{
45 return 16 - x;
46}
47static inline int rsub_8(DisasContext *s, int x)
48{
49 return 8 - x;
50}
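/*
 * Descriptive note (not part of the original file): plus1() and the rsub_*
 * helpers above are not called from this file directly; they appear to be
 * field-transform functions referenced via !function= modifiers from the
 * generated decodetree sources included below, which is why they must be
 * defined before those includes. The rsub_* forms recover right-shift
 * amounts that the encoding stores as "element width minus shift"; see the
 * neon-*.decode files for the exact uses.
 */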
51
52/* Include the generated Neon decoder */
53#include "decode-neon-dp.inc.c"
54#include "decode-neon-ls.inc.c"
55#include "decode-neon-shared.inc.c"
56
57static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a)
58{
59 int opr_sz;
60 TCGv_ptr fpst;
61 gen_helper_gvec_3_ptr *fn_gvec_ptr;
62
63 if (!dc_isar_feature(aa32_vcma, s)
64 || (!a->size && !dc_isar_feature(aa32_fp16_arith, s))) {
65 return false;
66 }
67
68 /* UNDEF accesses to D16-D31 if they don't exist. */
69 if (!dc_isar_feature(aa32_simd_r32, s) &&
70 ((a->vd | a->vn | a->vm) & 0x10)) {
71 return false;
72 }
73
74 if ((a->vn | a->vm | a->vd) & a->q) {
75 return false;
76 }
77
78 if (!vfp_access_check(s)) {
79 return true;
80 }
81
82 opr_sz = (1 + a->q) * 8;
83 fpst = get_fpstatus_ptr(1);
84 fn_gvec_ptr = a->size ? gen_helper_gvec_fcmlas : gen_helper_gvec_fcmlah;
85 tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
86 vfp_reg_offset(1, a->vn),
87 vfp_reg_offset(1, a->vm),
88 fpst, opr_sz, opr_sz, a->rot,
89 fn_gvec_ptr);
90 tcg_temp_free_ptr(fpst);
91 return true;
92}
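/*
 * Descriptive note: the sequence of checks above is the pattern repeated
 * throughout this file -- ISA feature gate first, then the UNDEF check for
 * D16-D31 accesses on cores without the larger register file, then the
 * requirement that Q-sized operands use even register numbers, and finally
 * vfp_access_check(), which returns false (having already generated the
 * FP-disabled exception) if FP/Neon access is not currently enabled.
 */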
93
94static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
95{
96 int opr_sz;
97 TCGv_ptr fpst;
98 gen_helper_gvec_3_ptr *fn_gvec_ptr;
99
100 if (!dc_isar_feature(aa32_vcma, s)
101 || (!a->size && !dc_isar_feature(aa32_fp16_arith, s))) {
102 return false;
103 }
104
105 /* UNDEF accesses to D16-D31 if they don't exist. */
106 if (!dc_isar_feature(aa32_simd_r32, s) &&
107 ((a->vd | a->vn | a->vm) & 0x10)) {
108 return false;
109 }
110
111 if ((a->vn | a->vm | a->vd) & a->q) {
112 return false;
113 }
114
115 if (!vfp_access_check(s)) {
116 return true;
117 }
118
119 opr_sz = (1 + a->q) * 8;
120 fpst = get_fpstatus_ptr(1);
121 fn_gvec_ptr = a->size ? gen_helper_gvec_fcadds : gen_helper_gvec_fcaddh;
122 tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
123 vfp_reg_offset(1, a->vn),
124 vfp_reg_offset(1, a->vm),
125 fpst, opr_sz, opr_sz, a->rot,
126 fn_gvec_ptr);
127 tcg_temp_free_ptr(fpst);
128 return true;
129}
130
131static bool trans_VDOT(DisasContext *s, arg_VDOT *a)
132{
133 int opr_sz;
134 gen_helper_gvec_3 *fn_gvec;
135
136 if (!dc_isar_feature(aa32_dp, s)) {
137 return false;
138 }
139
140 /* UNDEF accesses to D16-D31 if they don't exist. */
141 if (!dc_isar_feature(aa32_simd_r32, s) &&
142 ((a->vd | a->vn | a->vm) & 0x10)) {
143 return false;
144 }
145
146 if ((a->vn | a->vm | a->vd) & a->q) {
147 return false;
148 }
149
150 if (!vfp_access_check(s)) {
151 return true;
152 }
153
154 opr_sz = (1 + a->q) * 8;
155 fn_gvec = a->u ? gen_helper_gvec_udot_b : gen_helper_gvec_sdot_b;
156 tcg_gen_gvec_3_ool(vfp_reg_offset(1, a->vd),
157 vfp_reg_offset(1, a->vn),
158 vfp_reg_offset(1, a->vm),
159 opr_sz, opr_sz, 0, fn_gvec);
160 return true;
161}
162
163static bool trans_VFML(DisasContext *s, arg_VFML *a)
164{
165 int opr_sz;
166
167 if (!dc_isar_feature(aa32_fhm, s)) {
168 return false;
169 }
170
171 /* UNDEF accesses to D16-D31 if they don't exist. */
172 if (!dc_isar_feature(aa32_simd_r32, s) &&
173 (a->vd & 0x10)) {
174 return false;
175 }
176
177 if (a->vd & a->q) {
178 return false;
179 }
180
181 if (!vfp_access_check(s)) {
182 return true;
183 }
184
185 opr_sz = (1 + a->q) * 8;
186 tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
187 vfp_reg_offset(a->q, a->vn),
188 vfp_reg_offset(a->q, a->vm),
189 cpu_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */
190 gen_helper_gvec_fmlal_a32);
191 return true;
192}
193
194static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
195{
196 gen_helper_gvec_3_ptr *fn_gvec_ptr;
197 int opr_sz;
198 TCGv_ptr fpst;
199
200 if (!dc_isar_feature(aa32_vcma, s)) {
201 return false;
202 }
203 if (a->size == 0 && !dc_isar_feature(aa32_fp16_arith, s)) {
204 return false;
205 }
206
207 /* UNDEF accesses to D16-D31 if they don't exist. */
208 if (!dc_isar_feature(aa32_simd_r32, s) &&
209 ((a->vd | a->vn | a->vm) & 0x10)) {
210 return false;
211 }
212
213 if ((a->vd | a->vn) & a->q) {
214 return false;
215 }
216
217 if (!vfp_access_check(s)) {
218 return true;
219 }
220
221 fn_gvec_ptr = (a->size ? gen_helper_gvec_fcmlas_idx
222 : gen_helper_gvec_fcmlah_idx);
223 opr_sz = (1 + a->q) * 8;
224 fpst = get_fpstatus_ptr(1);
225 tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
226 vfp_reg_offset(1, a->vn),
227 vfp_reg_offset(1, a->vm),
228 fpst, opr_sz, opr_sz,
229 (a->index << 2) | a->rot, fn_gvec_ptr);
230 tcg_temp_free_ptr(fpst);
231 return true;
232}
233
234static bool trans_VDOT_scalar(DisasContext *s, arg_VDOT_scalar *a)
235{
236 gen_helper_gvec_3 *fn_gvec;
237 int opr_sz;
238 TCGv_ptr fpst;
239
240 if (!dc_isar_feature(aa32_dp, s)) {
241 return false;
242 }
243
244 /* UNDEF accesses to D16-D31 if they don't exist. */
245 if (!dc_isar_feature(aa32_simd_r32, s) &&
246 ((a->vd | a->vn) & 0x10)) {
247 return false;
248 }
249
250 if ((a->vd | a->vn) & a->q) {
251 return false;
252 }
253
254 if (!vfp_access_check(s)) {
255 return true;
256 }
257
258 fn_gvec = a->u ? gen_helper_gvec_udot_idx_b : gen_helper_gvec_sdot_idx_b;
259 opr_sz = (1 + a->q) * 8;
260 fpst = get_fpstatus_ptr(1);
261 tcg_gen_gvec_3_ool(vfp_reg_offset(1, a->vd),
262 vfp_reg_offset(1, a->vn),
263 vfp_reg_offset(1, a->rm),
264 opr_sz, opr_sz, a->index, fn_gvec);
265 tcg_temp_free_ptr(fpst);
266 return true;
267}
268
269static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
270{
271 int opr_sz;
272
273 if (!dc_isar_feature(aa32_fhm, s)) {
274 return false;
275 }
276
277 /* UNDEF accesses to D16-D31 if they don't exist. */
278 if (!dc_isar_feature(aa32_simd_r32, s) &&
279 ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) {
280 return false;
281 }
282
283 if (a->vd & a->q) {
284 return false;
285 }
286
287 if (!vfp_access_check(s)) {
288 return true;
289 }
290
291 opr_sz = (1 + a->q) * 8;
292 tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
293 vfp_reg_offset(a->q, a->vn),
294 vfp_reg_offset(a->q, a->rm),
295 cpu_env, opr_sz, opr_sz,
296 (a->index << 2) | a->s, /* is_2 == 0 */
297 gen_helper_gvec_fmlal_idx_a32);
298 return true;
299}
300
301static struct {
302 int nregs;
303 int interleave;
304 int spacing;
305} const neon_ls_element_type[11] = {
306 {1, 4, 1},
307 {1, 4, 2},
308 {4, 1, 1},
309 {2, 2, 2},
310 {1, 3, 1},
311 {1, 3, 2},
312 {3, 1, 1},
313 {1, 1, 1},
314 {1, 2, 1},
315 {1, 2, 2},
316 {2, 1, 1}
317};
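/*
 * Descriptive note: this table is indexed by the insn's itype field.
 * interleave is the number of registers a single structure is spread
 * across (1 for VLD1/VST1, up to 4 for VLD4/VST4), spacing is the
 * register-number stride between those registers, and nregs is how many
 * such groups the insn transfers. trans_VLDST_multiple() below rejects
 * itype values beyond the end of the table.
 */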
318
319static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn,
320 int stride)
321{
322 if (rm != 15) {
323 TCGv_i32 base;
324
325 base = load_reg(s, rn);
326 if (rm == 13) {
327 tcg_gen_addi_i32(base, base, stride);
328 } else {
329 TCGv_i32 index;
330 index = load_reg(s, rm);
331 tcg_gen_add_i32(base, base, index);
332 tcg_temp_free_i32(index);
333 }
334 store_reg(s, rn, base);
335 }
336}
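/*
 * Descriptive note: this implements the post-indexed writeback forms of
 * the Neon load/store insns: rm == 15 means no writeback, rm == 13 means
 * "writeback by the transfer size" (the immediate stride computed by the
 * caller), and any other rm means writeback by that general-purpose
 * register's value.
 */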
337
338static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
339{
340 /* Neon load/store multiple structures */
341 int nregs, interleave, spacing, reg, n;
342 MemOp endian = s->be_data;
343 int mmu_idx = get_mem_index(s);
344 int size = a->size;
345 TCGv_i64 tmp64;
346 TCGv_i32 addr, tmp;
347
348 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
349 return false;
350 }
351
352 /* UNDEF accesses to D16-D31 if they don't exist */
353 if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
354 return false;
355 }
356 if (a->itype > 10) {
357 return false;
358 }
359 /* Catch UNDEF cases for bad values of align field */
360 switch (a->itype & 0xc) {
361 case 4:
362 if (a->align >= 2) {
363 return false;
364 }
365 break;
366 case 8:
367 if (a->align == 3) {
368 return false;
369 }
370 break;
371 default:
372 break;
373 }
374 nregs = neon_ls_element_type[a->itype].nregs;
375 interleave = neon_ls_element_type[a->itype].interleave;
376 spacing = neon_ls_element_type[a->itype].spacing;
377 if (size == 3 && (interleave | spacing) != 1) {
378 return false;
379 }
380
381 if (!vfp_access_check(s)) {
382 return true;
383 }
384
385 /* For our purposes, bytes are always little-endian. */
386 if (size == 0) {
387 endian = MO_LE;
388 }
389 /*
390 * Consecutive little-endian elements from a single register
391 * can be promoted to a larger little-endian operation.
392 */
393 if (interleave == 1 && endian == MO_LE) {
394 size = 3;
395 }
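    /*
     * Illustrative example (added note): a little-endian VLD1 of 16-bit
     * elements (size == 1, interleave == 1) takes this path, size is
     * promoted to 3, and each D register is then filled with a single
     * 64-bit load instead of four separate 16-bit loads.
     */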
396 tmp64 = tcg_temp_new_i64();
397 addr = tcg_temp_new_i32();
398 tmp = tcg_const_i32(1 << size);
399 load_reg_var(s, addr, a->rn);
400 for (reg = 0; reg < nregs; reg++) {
401 for (n = 0; n < 8 >> size; n++) {
402 int xs;
403 for (xs = 0; xs < interleave; xs++) {
404 int tt = a->vd + reg + spacing * xs;
405
406 if (a->l) {
407 gen_aa32_ld_i64(s, tmp64, addr, mmu_idx, endian | size);
408 neon_store_element64(tt, n, size, tmp64);
409 } else {
410 neon_load_element64(tmp64, tt, n, size);
411 gen_aa32_st_i64(s, tmp64, addr, mmu_idx, endian | size);
412 }
413 tcg_gen_add_i32(addr, addr, tmp);
414 }
415 }
416 }
417 tcg_temp_free_i32(addr);
418 tcg_temp_free_i32(tmp);
419 tcg_temp_free_i64(tmp64);
420
421 gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
422 return true;
423}
424
425static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
426{
427 /* Neon load single structure to all lanes */
428 int reg, stride, vec_size;
429 int vd = a->vd;
430 int size = a->size;
431 int nregs = a->n + 1;
432 TCGv_i32 addr, tmp;
433
434 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
435 return false;
436 }
437
438 /* UNDEF accesses to D16-D31 if they don't exist */
439 if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
440 return false;
441 }
442
443 if (size == 3) {
444 if (nregs != 4 || a->a == 0) {
445 return false;
446 }
447 /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */
448 size = 2;
449 }
450 if (nregs == 1 && a->a == 1 && size == 0) {
451 return false;
452 }
453 if (nregs == 3 && a->a == 1) {
454 return false;
455 }
456
457 if (!vfp_access_check(s)) {
458 return true;
459 }
460
461 /*
462 * VLD1 to all lanes: T bit indicates how many Dregs to write.
463 * VLD2/3/4 to all lanes: T bit indicates register stride.
464 */
465 stride = a->t ? 2 : 1;
466 vec_size = nregs == 1 ? stride * 8 : 8;
467
468 tmp = tcg_temp_new_i32();
469 addr = tcg_temp_new_i32();
470 load_reg_var(s, addr, a->rn);
471 for (reg = 0; reg < nregs; reg++) {
472 gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s),
473 s->be_data | size);
474 if ((vd & 1) && vec_size == 16) {
475 /*
476 * We cannot write 16 bytes at once because the
477 * destination is unaligned.
478 */
479 tcg_gen_gvec_dup_i32(size, neon_reg_offset(vd, 0),
480 8, 8, tmp);
481 tcg_gen_gvec_mov(0, neon_reg_offset(vd + 1, 0),
482 neon_reg_offset(vd, 0), 8, 8);
483 } else {
484 tcg_gen_gvec_dup_i32(size, neon_reg_offset(vd, 0),
485 vec_size, vec_size, tmp);
486 }
487 tcg_gen_addi_i32(addr, addr, 1 << size);
488 vd += stride;
489 }
490 tcg_temp_free_i32(tmp);
491 tcg_temp_free_i32(addr);
492
493 gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);
494
495 return true;
496}
497
498static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
499{
500 /* Neon load/store single structure to one lane */
501 int reg;
502 int nregs = a->n + 1;
503 int vd = a->vd;
504 TCGv_i32 addr, tmp;
505
506 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
507 return false;
508 }
509
510 /* UNDEF accesses to D16-D31 if they don't exist */
511 if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
512 return false;
513 }
514
515 /* Catch the UNDEF cases. This is unavoidably a bit messy. */
516 switch (nregs) {
517 case 1:
518 if (((a->align & (1 << a->size)) != 0) ||
519 (a->size == 2 && ((a->align & 3) == 1 || (a->align & 3) == 2))) {
520 return false;
521 }
522 break;
523 case 3:
524 if ((a->align & 1) != 0) {
525 return false;
526 }
527 /* fall through */
528 case 2:
529 if (a->size == 2 && (a->align & 2) != 0) {
530 return false;
531 }
532 break;
533 case 4:
534 if ((a->size == 2) && ((a->align & 3) == 3)) {
535 return false;
536 }
537 break;
538 default:
539 abort();
540 }
541 if ((vd + a->stride * (nregs - 1)) > 31) {
542 /*
543 * Attempts to write off the end of the register file are
544 * UNPREDICTABLE; we choose to UNDEF because otherwise we would
545 * access off the end of the array that holds the register data.
546 */
547 return false;
548 }
549
550 if (!vfp_access_check(s)) {
551 return true;
552 }
553
554 tmp = tcg_temp_new_i32();
555 addr = tcg_temp_new_i32();
556 load_reg_var(s, addr, a->rn);
557 /*
558 * TODO: if we implemented alignment exceptions, we should check
559 * addr against the alignment encoded in a->align here.
560 */
561 for (reg = 0; reg < nregs; reg++) {
562 if (a->l) {
563 gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s),
564 s->be_data | a->size);
565 neon_store_element(vd, a->reg_idx, a->size, tmp);
566 } else { /* Store */
567 neon_load_element(tmp, vd, a->reg_idx, a->size);
568 gen_aa32_st_i32(s, tmp, addr, get_mem_index(s),
569 s->be_data | a->size);
570 }
571 vd += a->stride;
572 tcg_gen_addi_i32(addr, addr, 1 << a->size);
573 }
574 tcg_temp_free_i32(addr);
575 tcg_temp_free_i32(tmp);
576
577 gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);
578
579 return true;
580}
581
582static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
583{
584 int vec_size = a->q ? 16 : 8;
585 int rd_ofs = neon_reg_offset(a->vd, 0);
586 int rn_ofs = neon_reg_offset(a->vn, 0);
587 int rm_ofs = neon_reg_offset(a->vm, 0);
588
589 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
590 return false;
591 }
592
593 /* UNDEF accesses to D16-D31 if they don't exist. */
594 if (!dc_isar_feature(aa32_simd_r32, s) &&
595 ((a->vd | a->vn | a->vm) & 0x10)) {
596 return false;
597 }
598
599 if ((a->vn | a->vm | a->vd) & a->q) {
600 return false;
601 }
602
603 if (!vfp_access_check(s)) {
604 return true;
605 }
606
607 fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size);
608 return true;
609}
610
611#define DO_3SAME(INSN, FUNC) \
612 static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \
613 { \
614 return do_3same(s, a, FUNC); \
615 }
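/*
 * For illustration, DO_3SAME(VADD, tcg_gen_gvec_add) expands to roughly:
 *
 *   static bool trans_VADD_3s(DisasContext *s, arg_3same *a)
 *   {
 *       return do_3same(s, a, tcg_gen_gvec_add);
 *   }
 *
 * so each "three registers, same length" insn is a thin wrapper that hands
 * the appropriate whole-vector gvec expander to do_3same().
 */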
616
617DO_3SAME(VADD, tcg_gen_gvec_add)
618DO_3SAME(VSUB, tcg_gen_gvec_sub)
619DO_3SAME(VAND, tcg_gen_gvec_and)
620DO_3SAME(VBIC, tcg_gen_gvec_andc)
621DO_3SAME(VORR, tcg_gen_gvec_or)
622DO_3SAME(VORN, tcg_gen_gvec_orc)
623DO_3SAME(VEOR, tcg_gen_gvec_xor)
624DO_3SAME(VSHL_S, gen_gvec_sshl)
625DO_3SAME(VSHL_U, gen_gvec_ushl)
626DO_3SAME(VQADD_S, gen_gvec_sqadd_qc)
627DO_3SAME(VQADD_U, gen_gvec_uqadd_qc)
628DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc)
629DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc)
630
631/* These insns are all gvec_bitsel but with the inputs in various orders. */
632#define DO_3SAME_BITSEL(INSN, O1, O2, O3) \
633 static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \
634 uint32_t rn_ofs, uint32_t rm_ofs, \
635 uint32_t oprsz, uint32_t maxsz) \
636 { \
637 tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz); \
638 } \
639 DO_3SAME(INSN, gen_##INSN##_3s)
640
641DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs)
642DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs)
643DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs)
644
645#define DO_3SAME_NO_SZ_3(INSN, FUNC) \
646 static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \
647 { \
648 if (a->size == 3) { \
649 return false; \
650 } \
651 return do_3same(s, a, FUNC); \
652 }
653
654DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax)
655DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax)
656DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin)
657DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin)
658DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul)
659DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla)
660DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls)
661DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst)
662DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd)
663DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba)
664DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd)
665DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba)
666
667#define DO_3SAME_CMP(INSN, COND) \
668 static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \
669 uint32_t rn_ofs, uint32_t rm_ofs, \
670 uint32_t oprsz, uint32_t maxsz) \
671 { \
672 tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \
673 } \
674 DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s)
675
676DO_3SAME_CMP(VCGT_S, TCG_COND_GT)
677DO_3SAME_CMP(VCGT_U, TCG_COND_GTU)
678DO_3SAME_CMP(VCGE_S, TCG_COND_GE)
679DO_3SAME_CMP(VCGE_U, TCG_COND_GEU)
680DO_3SAME_CMP(VCEQ, TCG_COND_EQ)
681
682#define WRAP_OOL_FN(WRAPNAME, FUNC) \
683 static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, \
684 uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz) \
685 { \
686 tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \
687 }
688
689WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b)
690
691static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a)
692{
693 if (a->size != 0) {
694 return false;
695 }
696 return do_3same(s, a, gen_VMUL_p_3s);
697}
698
699#define DO_VQRDMLAH(INSN, FUNC) \
700 static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \
701 { \
702 if (!dc_isar_feature(aa32_rdm, s)) { \
703 return false; \
704 } \
705 if (a->size != 1 && a->size != 2) { \
706 return false; \
707 } \
708 return do_3same(s, a, FUNC); \
709 }
710
711DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc)
712DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc)
713
714#define DO_SHA1(NAME, FUNC) \
715 WRAP_OOL_FN(gen_##NAME##_3s, FUNC) \
716 static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a) \
717 { \
718 if (!dc_isar_feature(aa32_sha1, s)) { \
719 return false; \
720 } \
721 return do_3same(s, a, gen_##NAME##_3s); \
722 }
723
724DO_SHA1(SHA1C, gen_helper_crypto_sha1c)
725DO_SHA1(SHA1P, gen_helper_crypto_sha1p)
726DO_SHA1(SHA1M, gen_helper_crypto_sha1m)
727DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0)
728
729#define DO_SHA2(NAME, FUNC) \
730 WRAP_OOL_FN(gen_##NAME##_3s, FUNC) \
731 static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a) \
732 { \
733 if (!dc_isar_feature(aa32_sha2, s)) { \
734 return false; \
735 } \
736 return do_3same(s, a, gen_##NAME##_3s); \
737 }
738
739DO_SHA2(SHA256H, gen_helper_crypto_sha256h)
740DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2)
741DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1)
742
743#define DO_3SAME_64(INSN, FUNC) \
744 static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \
745 uint32_t rn_ofs, uint32_t rm_ofs, \
746 uint32_t oprsz, uint32_t maxsz) \
747 { \
748 static const GVecGen3 op = { .fni8 = FUNC }; \
749 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &op); \
750 } \
751 DO_3SAME(INSN, gen_##INSN##_3s)
752
753#define DO_3SAME_64_ENV(INSN, FUNC) \
754 static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m) \
755 { \
756 FUNC(d, cpu_env, n, m); \
757 } \
758 DO_3SAME_64(INSN, gen_##INSN##_elt)
759
760DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64)
761DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64)
762DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64)
763DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64)
764DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64)
765DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64)
766
767#define DO_3SAME_32(INSN, FUNC) \
768 static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \
769 uint32_t rn_ofs, uint32_t rm_ofs, \
770 uint32_t oprsz, uint32_t maxsz) \
771 { \
772 static const GVecGen3 ops[4] = { \
773 { .fni4 = gen_helper_neon_##FUNC##8 }, \
774 { .fni4 = gen_helper_neon_##FUNC##16 }, \
775 { .fni4 = gen_helper_neon_##FUNC##32 }, \
776 { 0 }, \
777 }; \
778 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
779 } \
780 static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \
781 { \
782 if (a->size > 2) { \
783 return false; \
784 } \
785 return do_3same(s, a, gen_##INSN##_3s); \
786 }
787
788/*
789 * Some helper functions need to be passed the cpu_env. In order
790 * to use those with the gvec APIs like tcg_gen_gvec_3() we need
791 * to create wrapper functions whose prototype is a NeonGenTwoOpFn()
792 * and which call a NeonGenTwoOpEnvFn().
793 */
794#define WRAP_ENV_FN(WRAPNAME, FUNC) \
795 static void WRAPNAME(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m) \
796 { \
797 FUNC(d, cpu_env, n, m); \
798 }
799
800#define DO_3SAME_32_ENV(INSN, FUNC) \
801 WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8); \
802 WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16); \
803 WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32); \
804 static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \
805 uint32_t rn_ofs, uint32_t rm_ofs, \
806 uint32_t oprsz, uint32_t maxsz) \
807 { \
808 static const GVecGen3 ops[4] = { \
809 { .fni4 = gen_##INSN##_tramp8 }, \
810 { .fni4 = gen_##INSN##_tramp16 }, \
811 { .fni4 = gen_##INSN##_tramp32 }, \
812 { 0 }, \
813 }; \
814 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
815 } \
816 static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \
817 { \
818 if (a->size > 2) { \
819 return false; \
820 } \
821 return do_3same(s, a, gen_##INSN##_3s); \
822 }
823
824DO_3SAME_32(VHADD_S, hadd_s)
825DO_3SAME_32(VHADD_U, hadd_u)
826DO_3SAME_32(VHSUB_S, hsub_s)
827DO_3SAME_32(VHSUB_U, hsub_u)
828DO_3SAME_32(VRHADD_S, rhadd_s)
829DO_3SAME_32(VRHADD_U, rhadd_u)
830DO_3SAME_32(VRSHL_S, rshl_s)
831DO_3SAME_32(VRSHL_U, rshl_u)
832
833DO_3SAME_32_ENV(VQSHL_S, qshl_s)
834DO_3SAME_32_ENV(VQSHL_U, qshl_u)
835DO_3SAME_32_ENV(VQRSHL_S, qrshl_s)
836DO_3SAME_32_ENV(VQRSHL_U, qrshl_u)
837
838static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn)
839{
840 /* Operations handled pairwise 32 bits at a time */
841 TCGv_i32 tmp, tmp2, tmp3;
842
843 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
844 return false;
845 }
846
847 /* UNDEF accesses to D16-D31 if they don't exist. */
848 if (!dc_isar_feature(aa32_simd_r32, s) &&
849 ((a->vd | a->vn | a->vm) & 0x10)) {
850 return false;
851 }
852
853 if (a->size == 3) {
854 return false;
855 }
856
857 if (!vfp_access_check(s)) {
858 return true;
859 }
860
861 assert(a->q == 0); /* enforced by decode patterns */
862
863 /*
864 * Note that we have to be careful not to clobber the source operands
865 * in the "vm == vd" case by storing the result of the first pass too
866 * early. Since Q is 0 there are always just two passes, so instead
867 * of a complicated loop over each pass we just unroll.
868 */
869 tmp = neon_load_reg(a->vn, 0);
870 tmp2 = neon_load_reg(a->vn, 1);
871 fn(tmp, tmp, tmp2);
872 tcg_temp_free_i32(tmp2);
873
874 tmp3 = neon_load_reg(a->vm, 0);
875 tmp2 = neon_load_reg(a->vm, 1);
876 fn(tmp3, tmp3, tmp2);
877 tcg_temp_free_i32(tmp2);
878
879 neon_store_reg(a->vd, 0, tmp);
880 neon_store_reg(a->vd, 1, tmp3);
881 return true;
882}
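/*
 * Descriptive note: with Dn, Dm and Dd viewed as pairs of 32-bit words,
 * the code above computes Dd[0] = fn(Dn[0], Dn[1]) and
 * Dd[1] = fn(Dm[0], Dm[1]); the per-size helpers handle the element-wise
 * pairing within those 32-bit words.
 */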
883
884#define DO_3SAME_PAIR(INSN, func) \
885 static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \
886 { \
887 static NeonGenTwoOpFn * const fns[] = { \
888 gen_helper_neon_##func##8, \
889 gen_helper_neon_##func##16, \
890 gen_helper_neon_##func##32, \
891 }; \
892 if (a->size > 2) { \
893 return false; \
894 } \
895 return do_3same_pair(s, a, fns[a->size]); \
896 }
897
898/* 32-bit pairwise ops end up the same as the elementwise versions. */
899#define gen_helper_neon_pmax_s32 tcg_gen_smax_i32
900#define gen_helper_neon_pmax_u32 tcg_gen_umax_i32
901#define gen_helper_neon_pmin_s32 tcg_gen_smin_i32
902#define gen_helper_neon_pmin_u32 tcg_gen_umin_i32
903#define gen_helper_neon_padd_u32 tcg_gen_add_i32
904
905DO_3SAME_PAIR(VPMAX_S, pmax_s)
906DO_3SAME_PAIR(VPMIN_S, pmin_s)
907DO_3SAME_PAIR(VPMAX_U, pmax_u)
908DO_3SAME_PAIR(VPMIN_U, pmin_u)
909DO_3SAME_PAIR(VPADD, padd_u)
910
911#define DO_3SAME_VQDMULH(INSN, FUNC) \
912 WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16); \
913 WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32); \
914 static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \
915 uint32_t rn_ofs, uint32_t rm_ofs, \
916 uint32_t oprsz, uint32_t maxsz) \
917 { \
918 static const GVecGen3 ops[2] = { \
919 { .fni4 = gen_##INSN##_tramp16 }, \
920 { .fni4 = gen_##INSN##_tramp32 }, \
921 }; \
922 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \
923 } \
924 static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \
925 { \
926 if (a->size != 1 && a->size != 2) { \
927 return false; \
928 } \
929 return do_3same(s, a, gen_##INSN##_3s); \
930 }
931
932DO_3SAME_VQDMULH(VQDMULH, qdmulh)
933DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)
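/*
 * Descriptive note: the ops[vece - 1] indexing in the expander above works
 * because only halfword and word element sizes are architecturally valid
 * for VQDMULH/VQRDMULH, and the size check in the trans function
 * guarantees vece is 1 or 2.
 */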
934
935static bool do_3same_fp(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn,
936 bool reads_vd)
937{
938 /*
939 * FP operations handled elementwise 32 bits at a time.
940 * If reads_vd is true then the old value of Vd will be
941 * loaded before calling the callback function. This is
942 * used for multiply-accumulate type operations.
943 */
944 TCGv_i32 tmp, tmp2;
945 int pass;
946
947 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
948 return false;
949 }
950
951 /* UNDEF accesses to D16-D31 if they don't exist. */
952 if (!dc_isar_feature(aa32_simd_r32, s) &&
953 ((a->vd | a->vn | a->vm) & 0x10)) {
954 return false;
955 }
956
957 if ((a->vn | a->vm | a->vd) & a->q) {
958 return false;
959 }
960
961 if (!vfp_access_check(s)) {
962 return true;
963 }
964
965 TCGv_ptr fpstatus = get_fpstatus_ptr(1);
966 for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
967 tmp = neon_load_reg(a->vn, pass);
968 tmp2 = neon_load_reg(a->vm, pass);
969 if (reads_vd) {
970 TCGv_i32 tmp_rd = neon_load_reg(a->vd, pass);
971 fn(tmp_rd, tmp, tmp2, fpstatus);
972 neon_store_reg(a->vd, pass, tmp_rd);
973 tcg_temp_free_i32(tmp);
974 } else {
975 fn(tmp, tmp, tmp2, fpstatus);
976 neon_store_reg(a->vd, pass, tmp);
977 }
978 tcg_temp_free_i32(tmp2);
979 }
980 tcg_temp_free_ptr(fpstatus);
981 return true;
982}
983
984/*
985 * For all the functions using this macro, size == 1 means fp16,
986 * which is an architecture extension we don't implement yet.
987 */
988#define DO_3S_FP_GVEC(INSN,FUNC) \
989 static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \
990 uint32_t rn_ofs, uint32_t rm_ofs, \
991 uint32_t oprsz, uint32_t maxsz) \
992 { \
993 TCGv_ptr fpst = get_fpstatus_ptr(1); \
994 tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpst, \
995 oprsz, maxsz, 0, FUNC); \
996 tcg_temp_free_ptr(fpst); \
997 } \
998 static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
999 { \
1000 if (a->size != 0) { \
1001 /* TODO fp16 support */ \
1002 return false; \
1003 } \
1004 return do_3same(s, a, gen_##INSN##_3s); \
1005 }
1006
1007
1008DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s)
1009DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s)
1010DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s)
1011DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s)
1012
1013/*
1014 * For all the functions using this macro, size == 1 means fp16,
1015 * which is an architecture extension we don't implement yet.
1016 */
1017#define DO_3S_FP(INSN,FUNC,READS_VD) \
1018 static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
1019 { \
1020 if (a->size != 0) { \
1021 /* TODO fp16 support */ \
1022 return false; \
1023 } \
1024 return do_3same_fp(s, a, FUNC, READS_VD); \
1025 }
1026
1027DO_3S_FP(VCEQ, gen_helper_neon_ceq_f32, false)
1028DO_3S_FP(VCGE, gen_helper_neon_cge_f32, false)
1029DO_3S_FP(VCGT, gen_helper_neon_cgt_f32, false)
1030DO_3S_FP(VACGE, gen_helper_neon_acge_f32, false)
1031DO_3S_FP(VACGT, gen_helper_neon_acgt_f32, false)
1032DO_3S_FP(VMAX, gen_helper_vfp_maxs, false)
1033DO_3S_FP(VMIN, gen_helper_vfp_mins, false)
1034
1035static void gen_VMLA_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
1036 TCGv_ptr fpstatus)
1037{
1038 gen_helper_vfp_muls(vn, vn, vm, fpstatus);
1039 gen_helper_vfp_adds(vd, vd, vn, fpstatus);
1040}
1041
1042static void gen_VMLS_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
1043 TCGv_ptr fpstatus)
1044{
1045 gen_helper_vfp_muls(vn, vn, vm, fpstatus);
1046 gen_helper_vfp_subs(vd, vd, vn, fpstatus);
1047}
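/*
 * Descriptive note: both helpers above reuse the vn temporary as scratch
 * for the product; that is safe because do_3same_fp() has already loaded
 * the original Vn value into it and does not reuse it after the callback
 * returns.
 */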
1048
1049DO_3S_FP(VMLA, gen_VMLA_fp_3s, true)
1050DO_3S_FP(VMLS, gen_VMLS_fp_3s, true)
1051
1052static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
1053{
1054 if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1055 return false;
1056 }
1057
1058 if (a->size != 0) {
1059 /* TODO fp16 support */
1060 return false;
1061 }
1062
1063 return do_3same_fp(s, a, gen_helper_vfp_maxnums, false);
1064}
1065
1066static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
1067{
1068 if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1069 return false;
1070 }
1071
1072 if (a->size != 0) {
1073 /* TODO fp16 support */
1074 return false;
1075 }
1076
1077 return do_3same_fp(s, a, gen_helper_vfp_minnums, false);
1078}
1079
1080WRAP_ENV_FN(gen_VRECPS_tramp, gen_helper_recps_f32)
1081
1082static void gen_VRECPS_fp_3s(unsigned vece, uint32_t rd_ofs,
1083 uint32_t rn_ofs, uint32_t rm_ofs,
1084 uint32_t oprsz, uint32_t maxsz)
1085{
1086 static const GVecGen3 ops = { .fni4 = gen_VRECPS_tramp };
1087 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops);
1088}
1089
1090static bool trans_VRECPS_fp_3s(DisasContext *s, arg_3same *a)
1091{
1092 if (a->size != 0) {
1093 /* TODO fp16 support */
1094 return false;
1095 }
1096
1097 return do_3same(s, a, gen_VRECPS_fp_3s);
1098}
1099
1100WRAP_ENV_FN(gen_VRSQRTS_tramp, gen_helper_rsqrts_f32)
1101
1102static void gen_VRSQRTS_fp_3s(unsigned vece, uint32_t rd_ofs,
1103 uint32_t rn_ofs, uint32_t rm_ofs,
1104 uint32_t oprsz, uint32_t maxsz)
1105{
1106 static const GVecGen3 ops = { .fni4 = gen_VRSQRTS_tramp };
1107 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops);
1108}
1109
1110static bool trans_VRSQRTS_fp_3s(DisasContext *s, arg_3same *a)
1111{
1112 if (a->size != 0) {
1113 /* TODO fp16 support */
1114 return false;
1115 }
1116
1117 return do_3same(s, a, gen_VRSQRTS_fp_3s);
1118}
1119
1120static void gen_VFMA_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
1121 TCGv_ptr fpstatus)
1122{
1123 gen_helper_vfp_muladds(vd, vn, vm, vd, fpstatus);
1124}
1125
1126static bool trans_VFMA_fp_3s(DisasContext *s, arg_3same *a)
1127{
1128 if (!dc_isar_feature(aa32_simdfmac, s)) {
1129 return false;
1130 }
1131
1132 if (a->size != 0) {
1133 /* TODO fp16 support */
1134 return false;
1135 }
1136
1137 return do_3same_fp(s, a, gen_VFMA_fp_3s, true);
1138}
1139
1140static void gen_VFMS_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
1141 TCGv_ptr fpstatus)
1142{
1143 gen_helper_vfp_negs(vn, vn);
1144 gen_helper_vfp_muladds(vd, vn, vm, vd, fpstatus);
1145}
1146
1147static bool trans_VFMS_fp_3s(DisasContext *s, arg_3same *a)
1148{
1149 if (!dc_isar_feature(aa32_simdfmac, s)) {
1150 return false;
1151 }
1152
1153 if (a->size != 0) {
1154 /* TODO fp16 support */
1155 return false;
1156 }
1157
1158 return do_3same_fp(s, a, gen_VFMS_fp_3s, true);
1159}
1160
1161static bool do_3same_fp_pair(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn)
1162{
1163 /* FP operations handled pairwise 32 bits at a time */
1164 TCGv_i32 tmp, tmp2, tmp3;
1165 TCGv_ptr fpstatus;
1166
1167 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1168 return false;
1169 }
1170
1171 /* UNDEF accesses to D16-D31 if they don't exist. */
1172 if (!dc_isar_feature(aa32_simd_r32, s) &&
1173 ((a->vd | a->vn | a->vm) & 0x10)) {
1174 return false;
1175 }
1176
1177 if (!vfp_access_check(s)) {
1178 return true;
1179 }
1180
1181 assert(a->q == 0); /* enforced by decode patterns */
1182
1183 /*
1184 * Note that we have to be careful not to clobber the source operands
1185 * in the "vm == vd" case by storing the result of the first pass too
1186 * early. Since Q is 0 there are always just two passes, so instead
1187 * of a complicated loop over each pass we just unroll.
1188 */
1189 fpstatus = get_fpstatus_ptr(1);
1190 tmp = neon_load_reg(a->vn, 0);
1191 tmp2 = neon_load_reg(a->vn, 1);
1192 fn(tmp, tmp, tmp2, fpstatus);
1193 tcg_temp_free_i32(tmp2);
1194
1195 tmp3 = neon_load_reg(a->vm, 0);
1196 tmp2 = neon_load_reg(a->vm, 1);
1197 fn(tmp3, tmp3, tmp2, fpstatus);
1198 tcg_temp_free_i32(tmp2);
1199 tcg_temp_free_ptr(fpstatus);
1200
1201 neon_store_reg(a->vd, 0, tmp);
1202 neon_store_reg(a->vd, 1, tmp3);
1203 return true;
1204}
1205
1206/*
1207 * For all the functions using this macro, size == 1 means fp16,
1208 * which is an architecture extension we don't implement yet.
1209 */
1210#define DO_3S_FP_PAIR(INSN,FUNC) \
1211 static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
1212 { \
1213 if (a->size != 0) { \
1214 /* TODO fp16 support */ \
1215 return false; \
1216 } \
1217 return do_3same_fp_pair(s, a, FUNC); \
1218 }
1219
1220DO_3S_FP_PAIR(VPADD, gen_helper_vfp_adds)
1221DO_3S_FP_PAIR(VPMAX, gen_helper_vfp_maxs)
1222DO_3S_FP_PAIR(VPMIN, gen_helper_vfp_mins)
1223
1224static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
1225{
1226 /* Handle a 2-reg-shift insn which can be vectorized. */
1227 int vec_size = a->q ? 16 : 8;
1228 int rd_ofs = neon_reg_offset(a->vd, 0);
1229 int rm_ofs = neon_reg_offset(a->vm, 0);
1230
1231 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1232 return false;
1233 }
1234
1235 /* UNDEF accesses to D16-D31 if they don't exist. */
1236 if (!dc_isar_feature(aa32_simd_r32, s) &&
1237 ((a->vd | a->vm) & 0x10)) {
1238 return false;
1239 }
1240
1241 if ((a->vm | a->vd) & a->q) {
1242 return false;
1243 }
1244
1245 if (!vfp_access_check(s)) {
1246 return true;
1247 }
1248
1249 fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size);
1250 return true;
1251}
1252
1253#define DO_2SH(INSN, FUNC) \
1254 static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \
1255 { \
1256 return do_vector_2sh(s, a, FUNC); \
1257 } \
1258
1259DO_2SH(VSHL, tcg_gen_gvec_shli)
1260DO_2SH(VSLI, gen_gvec_sli)
1261DO_2SH(VSRI, gen_gvec_sri)
1262DO_2SH(VSRA_S, gen_gvec_ssra)
1263DO_2SH(VSRA_U, gen_gvec_usra)
1264DO_2SH(VRSHR_S, gen_gvec_srshr)
1265DO_2SH(VRSHR_U, gen_gvec_urshr)
1266DO_2SH(VRSRA_S, gen_gvec_srsra)
1267DO_2SH(VRSRA_U, gen_gvec_ursra)
1268
1269static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
1270{
1271 /* Signed shift out of range results in all-sign-bits */
1272 a->shift = MIN(a->shift, (8 << a->size) - 1);
1273 return do_vector_2sh(s, a, tcg_gen_gvec_sari);
1274}
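/*
 * Worked example (added note): for byte elements (size == 0) the encoded
 * shift can be as large as 8, but arithmetic-shifting an 8-bit signed
 * value right by 8 gives the same result as shifting by 7 -- every result
 * bit is a copy of the sign bit -- so clamping to esize - 1 lets us use
 * the ordinary gvec sari expander, which expects the count to be less
 * than the element width.
 */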
1275
1276static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
1277 int64_t shift, uint32_t oprsz, uint32_t maxsz)
1278{
1279 tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0);
1280}
1281
1282static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
1283{
1284 /* Shift out of range is architecturally valid and results in zero. */
1285 if (a->shift >= (8 << a->size)) {
1286 return do_vector_2sh(s, a, gen_zero_rd_2sh);
1287 } else {
1288 return do_vector_2sh(s, a, tcg_gen_gvec_shri);
1289 }
1290}
1291
1292static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
1293 NeonGenTwo64OpEnvFn *fn)
1294{
1295 /*
1296 * 2-reg-and-shift operations, size == 3 case, where the
1297 * function needs to be passed cpu_env.
1298 */
1299 TCGv_i64 constimm;
1300 int pass;
1301
1302 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1303 return false;
1304 }
1305
1306 /* UNDEF accesses to D16-D31 if they don't exist. */
1307 if (!dc_isar_feature(aa32_simd_r32, s) &&
1308 ((a->vd | a->vm) & 0x10)) {
1309 return false;
1310 }
1311
1312 if ((a->vm | a->vd) & a->q) {
1313 return false;
1314 }
1315
1316 if (!vfp_access_check(s)) {
1317 return true;
1318 }
1319
1320 /*
1321 * To avoid excessive duplication of ops we implement shift
1322 * by immediate using the variable shift operations.
1323 */
1324 constimm = tcg_const_i64(dup_const(a->size, a->shift));
1325
1326 for (pass = 0; pass < a->q + 1; pass++) {
1327 TCGv_i64 tmp = tcg_temp_new_i64();
1328
1329 neon_load_reg64(tmp, a->vm + pass);
1330 fn(tmp, cpu_env, tmp, constimm);
1331 neon_store_reg64(tmp, a->vd + pass);
1332 }
1333 tcg_temp_free_i64(constimm);
1334 return true;
1335}
1336
1337static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
1338 NeonGenTwoOpEnvFn *fn)
1339{
1340 /*
1341 * 2-reg-and-shift operations, size < 3 case, where the
1342 * helper needs to be passed cpu_env.
1343 */
1344 TCGv_i32 constimm;
1345 int pass;
1346
1347 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1348 return false;
1349 }
1350
1351 /* UNDEF accesses to D16-D31 if they don't exist. */
1352 if (!dc_isar_feature(aa32_simd_r32, s) &&
1353 ((a->vd | a->vm) & 0x10)) {
1354 return false;
1355 }
1356
1357 if ((a->vm | a->vd) & a->q) {
1358 return false;
1359 }
1360
1361 if (!vfp_access_check(s)) {
1362 return true;
1363 }
1364
1365 /*
1366 * To avoid excessive duplication of ops we implement shift
1367 * by immediate using the variable shift operations.
1368 */
1369 constimm = tcg_const_i32(dup_const(a->size, a->shift));
1370
1371 for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
1372 TCGv_i32 tmp = neon_load_reg(a->vm, pass);
1373 fn(tmp, cpu_env, tmp, constimm);
1374 neon_store_reg(a->vd, pass, tmp);
1375 }
1376 tcg_temp_free_i32(constimm);
1377 return true;
1378}
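/*
 * Illustrative example (added note): dup_const() replicates the shift
 * count into every element of the constant, so for 16-bit elements
 * (a->size == 1) and a shift of 5 the 32-bit constimm above is
 * 0x00050005, and each call to the per-element helper shifts both
 * halfwords of a 32-bit word in one go.
 */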
1379
1380#define DO_2SHIFT_ENV(INSN, FUNC) \
1381 static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
1382 { \
1383 return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64); \
1384 } \
1385 static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \
1386 { \
1387 static NeonGenTwoOpEnvFn * const fns[] = { \
1388 gen_helper_neon_##FUNC##8, \
1389 gen_helper_neon_##FUNC##16, \
1390 gen_helper_neon_##FUNC##32, \
1391 }; \
1392 assert(a->size < ARRAY_SIZE(fns)); \
1393 return do_2shift_env_32(s, a, fns[a->size]); \
1394 }
1395
1396DO_2SHIFT_ENV(VQSHLU, qshlu_s)
1397DO_2SHIFT_ENV(VQSHL_U, qshl_u)
1398DO_2SHIFT_ENV(VQSHL_S, qshl_s)
1399
1400static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
1401 NeonGenTwo64OpFn *shiftfn,
1402 NeonGenNarrowEnvFn *narrowfn)
1403{
1404 /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
1405 TCGv_i64 constimm, rm1, rm2;
1406 TCGv_i32 rd;
1407
1408 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1409 return false;
1410 }
1411
1412 /* UNDEF accesses to D16-D31 if they don't exist. */
1413 if (!dc_isar_feature(aa32_simd_r32, s) &&
1414 ((a->vd | a->vm) & 0x10)) {
1415 return false;
1416 }
1417
1418 if (a->vm & 1) {
1419 return false;
1420 }
1421
1422 if (!vfp_access_check(s)) {
1423 return true;
1424 }
1425
1426 /*
1427 * This is always a right shift, and the shiftfn is always a
1428 * left-shift helper, which thus needs the negated shift count.
1429 */
1430 constimm = tcg_const_i64(-a->shift);
1431 rm1 = tcg_temp_new_i64();
1432 rm2 = tcg_temp_new_i64();
1433
1434 /* Load both inputs first to avoid potential overwrite if rm == rd */
1435 neon_load_reg64(rm1, a->vm);
1436 neon_load_reg64(rm2, a->vm + 1);
1437
1438 shiftfn(rm1, rm1, constimm);
1439 rd = tcg_temp_new_i32();
1440 narrowfn(rd, cpu_env, rm1);
1441 neon_store_reg(a->vd, 0, rd);
1442
1443 shiftfn(rm2, rm2, constimm);
1444 rd = tcg_temp_new_i32();
1445 narrowfn(rd, cpu_env, rm2);
1446 neon_store_reg(a->vd, 1, rd);
1447
1448 tcg_temp_free_i64(rm1);
1449 tcg_temp_free_i64(rm2);
1450 tcg_temp_free_i64(constimm);
1451
1452 return true;
1453}
1454
1455static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
1456 NeonGenTwoOpFn *shiftfn,
1457 NeonGenNarrowEnvFn *narrowfn)
1458{
1459 /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
1460 TCGv_i32 constimm, rm1, rm2, rm3, rm4;
1461 TCGv_i64 rtmp;
1462 uint32_t imm;
1463
1464 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1465 return false;
1466 }
1467
1468 /* UNDEF accesses to D16-D31 if they don't exist. */
1469 if (!dc_isar_feature(aa32_simd_r32, s) &&
1470 ((a->vd | a->vm) & 0x10)) {
1471 return false;
1472 }
1473
1474 if (a->vm & 1) {
1475 return false;
1476 }
1477
1478 if (!vfp_access_check(s)) {
1479 return true;
1480 }
1481
1482 /*
1483 * This is always a right shift, and the shiftfn is always a
1484 * left-shift helper, which thus needs the negated shift count
1485 * duplicated into each lane of the immediate value.
1486 */
1487 if (a->size == 1) {
1488 imm = (uint16_t)(-a->shift);
1489 imm |= imm << 16;
1490 } else {
1491 /* size == 2 */
1492 imm = -a->shift;
1493 }
1494 constimm = tcg_const_i32(imm);
1495
1496 /* Load all inputs first to avoid potential overwrite */
1497 rm1 = neon_load_reg(a->vm, 0);
1498 rm2 = neon_load_reg(a->vm, 1);
1499 rm3 = neon_load_reg(a->vm + 1, 0);
1500 rm4 = neon_load_reg(a->vm + 1, 1);
1501 rtmp = tcg_temp_new_i64();
1502
1503 shiftfn(rm1, rm1, constimm);
1504 shiftfn(rm2, rm2, constimm);
1505
1506 tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
1507 tcg_temp_free_i32(rm2);
1508
1509 narrowfn(rm1, cpu_env, rtmp);
1510 neon_store_reg(a->vd, 0, rm1);
1511
1512 shiftfn(rm3, rm3, constimm);
1513 shiftfn(rm4, rm4, constimm);
1514 tcg_temp_free_i32(constimm);
1515
1516 tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
1517 tcg_temp_free_i32(rm4);
1518
1519 narrowfn(rm3, cpu_env, rtmp);
1520 tcg_temp_free_i64(rtmp);
1521 neon_store_reg(a->vd, 1, rm3);
1522 return true;
1523}
1524
1525#define DO_2SN_64(INSN, FUNC, NARROWFUNC) \
1526 static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \
1527 { \
1528 return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC); \
1529 }
1530#define DO_2SN_32(INSN, FUNC, NARROWFUNC) \
1531 static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \
1532 { \
1533 return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC); \
1534 }
1535
1536static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1537{
1538 tcg_gen_extrl_i64_i32(dest, src);
1539}
1540
1541static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1542{
1543 gen_helper_neon_narrow_u16(dest, src);
1544}
1545
1546static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1547{
1548 gen_helper_neon_narrow_u8(dest, src);
1549}
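/*
 * Descriptive note: the three wrappers above exist only to give the
 * non-saturating narrowing operations the same NeonGenNarrowEnvFn
 * signature as the saturating helpers used below; the env argument is
 * simply ignored.
 */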
1550
1551DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
1552DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
1553DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)
1554
1555DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
1556DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
1557DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)
1558
1559DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
1560DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
1561DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)
1562
1563DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
1564DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
1565DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)