+#define DO_VCMULH(N, M, D, S) float16_mul(N, M, S) /* float16_mul of N and M, S forwarded last; D is accepted but unused */
+#define DO_VCMULS(N, M, D, S) float32_mul(N, M, S) /* float32_mul of N and M, S forwarded last; D is accepted but unused */
+
+#define DO_VCMLAH(N, M, D, S) float16_muladd(N, M, D, 0, S) /* float16_muladd on N, M, D; 4th arg fixed at 0 (presumably "no flags", i.e. N * M + D -- confirm) */
+#define DO_VCMLAS(N, M, D, S) float32_muladd(N, M, D, 0, S) /* float32_muladd on N, M, D; 4th arg fixed at 0 (presumably "no flags", i.e. N * M + D -- confirm) */
+
+DO_VCMLA(vcmul0h, 2, uint16_t, 0, float16_chs, DO_VCMULH) /* "h" names: uint16_t + float16_chs; 2nd arg equals sizeof(uint16_t) */
+DO_VCMLA(vcmul0s, 4, uint32_t, 0, float32_chs, DO_VCMULS) /* "s" names: uint32_t + float32_chs; 2nd arg equals sizeof(uint32_t) */
+DO_VCMLA(vcmul90h, 2, uint16_t, 1, float16_chs, DO_VCMULH) /* 4th arg steps 0,1,2,3 as the name steps 0/90/180/270 */
+DO_VCMLA(vcmul90s, 4, uint32_t, 1, float32_chs, DO_VCMULS) /* NOTE(review): 4th arg is presumably a rotation selector -- confirm in DO_VCMLA */
+DO_VCMLA(vcmul180h, 2, uint16_t, 2, float16_chs, DO_VCMULH)
+DO_VCMLA(vcmul180s, 4, uint32_t, 2, float32_chs, DO_VCMULS)
+DO_VCMLA(vcmul270h, 2, uint16_t, 3, float16_chs, DO_VCMULH) /* all eight pass a DO_VCMUL* op, whose D parameter is unused */
+DO_VCMLA(vcmul270s, 4, uint32_t, 3, float32_chs, DO_VCMULS)