qemu-riscv
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-riscv] [Qemu-devel] [PATCH v2 08/17] RISC-V: add vector extens


From: Richard Henderson
Subject: Re: [Qemu-riscv] [Qemu-devel] [PATCH v2 08/17] RISC-V: add vector extension integer instructions part1, add/sub/adc/sbc
Date: Thu, 12 Sep 2019 11:27:20 -0400
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Thunderbird/60.8.0

On 9/11/19 2:25 AM, liuzhiwei wrote:
>  #define VECTOR_HELPER(name) HELPER(glue(vector_, name))
> +#define SIGNBIT8    (1 << 7)
> +#define SIGNBIT16   (1 << 15)
> +#define SIGNBIT32   (1 << 31)
> +#define SIGNBIT64   ((uint64_t)1 << 63)

Perhaps make up your mind if you want signed or unsigned values?  Perhaps just
use or redefine INT<N>_MIN instead?

> +static int64_t extend_gpr(target_ulong reg)
> +{
> +    return sign_extend(reg, sizeof(target_ulong) * 8);
> +}

Note wrt usage:
+                extend_rs1 = (uint64_t)extend_gpr(env->gpr[rs1]);

This is equivalent to "extend_rs1 = (target_long)env->gpr[rs1]".

I don't see how this helper function is helping, really.
Also, pass gprs by value, not by index.

> +static inline int vector_get_carry(CPURISCVState *env, int width, int lmul,
> +    int index)
> +{
> +    int mlen = width / lmul;
> +    int idx = (index * mlen) / 8;
> +    int pos = (index * mlen) % 8;
> +
> +    return (env->vfp.vreg[0].u8[idx] >> pos) & 0x1;
> +}

Any reason not to re-use vector_elem_mask?

> +static inline uint64_t u64xu64_lh(uint64_t a, uint64_t b)
> +{
> +    uint64_t hi_64, carry;
> +
> +    /* first get the whole product in {hi_64, lo_64} */
> +    uint64_t a_hi = a >> 32;
> +    uint64_t a_lo = (uint32_t)a;
> +    uint64_t b_hi = b >> 32;
> +    uint64_t b_lo = (uint32_t)b;
> +
> +    /*
> +     * a * b = (a_hi << 32 + a_lo) * (b_hi << 32 + b_lo)
> +     *               = (a_hi * b_hi) << 64 + (a_hi * b_lo) << 32 +
> +     *                 (a_lo * b_hi) << 32 + a_lo * b_lo
> +     *               = {hi_64, lo_64}
> +     * hi_64 = ((a_hi * b_lo) << 32 + (a_lo * b_hi) << 32 + (a_lo * b_lo)) 
> >> 64
> +     *       = (a_hi * b_lo) >> 32 + (a_lo * b_hi) >> 32 + carry
> +     * carry = ((uint64_t)(uint32_t)(a_hi * b_lo) +
> +     *           (uint64_t)(uint32_t)(a_lo * b_hi) + (a_lo * b_lo) >> 32) >> 
> 32
> +     */
> +
> +    carry =  ((uint64_t)(uint32_t)(a_hi * b_lo) +
> +              (uint64_t)(uint32_t)(a_lo * b_hi) +
> +              ((a_lo * b_lo) >> 32)) >> 32;
> +
> +    hi_64 = a_hi * b_hi +
> +            ((a_hi * b_lo) >> 32) + ((a_lo * b_hi) >> 32) +
> +            carry;
> +
> +    return hi_64;
> +}

Use mulu64().

> +static inline int64_t s64xu64_lh(int64_t a, uint64_t b)
> +{
> +    uint64_t abs_a = a;
> +    uint64_t lo_64, hi_64;
> +
> +    if (a < 0) {
> +        abs_a =  ~a + 1;

 abs_a = -a

> +static inline int64_t s64xs64_lh(int64_t a, int64_t b)
> +{
> +    uint64_t abs_a = a, abs_b = b;
> +    uint64_t lo_64, hi_64;
> +
> +    if (a < 0) {
> +        abs_a =  ~a + 1;
> +    }
> +    if (b < 0) {
> +        abs_b = ~b + 1;
> +    }
> +    lo_64 = abs_a * abs_b;
> +    hi_64 = u64xu64_lh(abs_a, abs_b);
> +
> +    if ((a ^ b) & SIGNBIT64) {
> +        lo_64 = ~lo_64;
> +        hi_64 = ~hi_64;
> +        if (lo_64 == UINT64_MAX) {
> +            lo_64 = 0;
> +            hi_64 += 1;
> +        } else {
> +            lo_64 += 1;
> +        }
> +    }
> +    return hi_64;
> +}

Use muls64().


> +void VECTOR_HELPER(vadc_vvm)(CPURISCVState *env, uint32_t rs1,
> +    uint32_t rs2, uint32_t rd)
> +{
> +    int i, j, vl;
> +    uint32_t lmul, width, src1, src2, dest, vlmax, carry;
> +
> +    vl    = env->vfp.vl;
> +    lmul  = vector_get_lmul(env);
> +    width   = vector_get_width(env);
> +    vlmax = vector_get_vlmax(env);
> +
> +    if (vector_vtype_ill(env) || vector_overlap_carry(lmul, rd)) {
> +        riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
> +        return;
> +    }
> +    vector_lmul_check_reg(env, lmul, rs1, false);
> +    vector_lmul_check_reg(env, lmul, rs2, false);
> +    vector_lmul_check_reg(env, lmul, rd, false);
> +
> +    for (i = 0; i < vlmax; i++) {
> +        src1 = rs1 + (i / (VLEN / width));
> +        src2 = rs2 + (i / (VLEN / width));
> +        dest = rd + (i / (VLEN / width));
> +        j = i % (VLEN / width);
> +        if (i < env->vfp.vstart) {
> +            continue;

Again, hoist.

> +        } else if (i < vl) {

I would think this too could be moved into the loop condition.

> +            switch (width) {
> +            case 8:
> +                carry = vector_get_carry(env, width, lmul, i);
> +                env->vfp.vreg[dest].u8[j] = env->vfp.vreg[src1].u8[j]
> +                    + env->vfp.vreg[src2].u8[j] + carry;
> +                break;
> +            case 16:
> +                carry = vector_get_carry(env, width, lmul, i);
> +                env->vfp.vreg[dest].u16[j] = env->vfp.vreg[src1].u16[j]
> +                    + env->vfp.vreg[src2].u16[j] + carry;
> +                break;
> +            case 32:
> +                carry = vector_get_carry(env, width, lmul, i);
> +                env->vfp.vreg[dest].u32[j] = env->vfp.vreg[src1].u32[j]
> +                    + env->vfp.vreg[src2].u32[j] + carry;
> +                break;
> +            case 64:
> +                carry = vector_get_carry(env, width, lmul, i);
> +                env->vfp.vreg[dest].u64[j] = env->vfp.vreg[src1].u64[j]
> +                    + env->vfp.vreg[src2].u64[j] + carry;
> +                break;
> +            default:
> +                riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
> +                break;
> +            }
> +        } else {
> +            vector_tail_common(env, dest, j, width);

With this tail clearing being done as a loop of its own, which would devolve to
memset on a little-endian host.


> +        }
> +    }
> +    env->vfp.vstart = 0;
> +}
> +void VECTOR_HELPER(vadc_vxm)(CPURISCVState *env, uint32_t rs1,
> +    uint32_t rs2, uint32_t rd)
> +{

Watch the spacing between functions.
Pass gpr rs1 by value.

> +void VECTOR_HELPER(vadc_vim)(CPURISCVState *env, uint32_t rs1,
> +    uint32_t rs2, uint32_t rd)
> +{
...
> +                env->vfp.vreg[dest].u8[j] = sign_extend(rs1, 5)

Pass the immediate as a sign-extended immediate to begin with, not as an
unsigned 5-bit field.

All of the rest of the helpers are about the same.

Consider creating a helper function that contains the basic outline of the
vector processing, and takes a (set of) function pointers that perform the
operation.  With optimization, compiler inlining should produce the same code
as you have here without having to replicate quite so much code for each
helper.  You can also fix a bug in the basic outline in one place instead of
hundreds.


r~



reply via email to

[Prev in Thread] Current Thread [Next in Thread]