Agner's CPU blog


 
thread Test results for Broadwell and Skylake - Agner - 2015-12-26
replythread Sustained 64B loads per cycle on Haswell & Sky - Nathan Kurz - 2015-12-26
last replythread Sustained 64B loads per cycle on Haswell & Sky - Agner - 2015-12-27
last replythread Sustained 64B loads per cycle on Haswell & Sky - Nathan Kurz - 2015-12-27
reply Sustained 64B loads per cycle on Haswell & Sky - John D. McCalpin - 2016-01-04
reply Sustained 64B loads per cycle on Haswell & Sky - T - 2016-06-18
last reply Sustained 64B loads per cycle on Haswell & Sky - Jens Nurmann - 2017-01-12
replythread Test results for Broadwell and Skylake - Peter Cordes - 2015-12-28
last reply Test results for Broadwell and Skylake - Agner - 2015-12-29
replythread Test results for Broadwell and Skylake - Tacit Murky - 2016-01-04
last replythread Test results for Broadwell and Skylake - Agner - 2016-01-05
last replythread Test results for Broadwell and Skylake - Tacit Murky - 2016-03-09
last reply Test results for Broadwell and Skylake - Tacit Murky - 2016-06-05
replythread Minor bug in the microarchitecture manual - SHK - 2016-01-10
last reply Minor bug in the microarchitecture manual - Agner - 2016-01-16
replythread Test results for Broadwell and Skylake - John D. McCalpin - 2016-01-12
last replythread Test results for Broadwell and Skylake - Jess - 2016-02-11
last reply Description of discrepancy - Nathan Kurz - 2016-03-13
reply Test results for Broadwell and Skylake - Russell Van Zandt - 2016-02-22
replythread Instruction Throughput on Skylake - Nathan Kurz - 2016-04-23
last replythread Instruction Throughput on Skylake - Agner - 2016-04-24
last replythread Instruction Throughput on Skylake - Nathan Kurz - 2016-04-26
last replythread Instruction Throughput on Skylake - Agner - 2016-04-27
last replythread Instruction Throughput on Skylake - T - 2016-06-18
reply Instruction Throughput on Skylake - Agner - 2016-06-19
last replythread Instruction Throughput on Skylake - Nathan Kurz - 2016-07-08
last replythread Instruction Throughput on Skylake - Nathan Kurz - 2016-07-11
replythread Instruction Throughput on Skylake - Tacit Murky - 2016-07-17
last replythread Haswell register renaming / unfused limits - Peter Cordes - 2017-05-11
reply Haswell register renaming / unfused limits - Tacit Murky - 2017-05-11
last reply Haswell register renaming / unfused limits - Peter Cordes - 2017-05-12
last reply Instruction Throughput on Skylake - T - 2016-08-08
reply Unlamination of micro-fused ops in SKL and earlier - Travis - 2016-09-09
replythread 32B store-forwarding is slower than 16B - Peter Cordes - 2017-05-11
last replythread 32B store-forwarding is slower than 16B - Fabian Giesen - 2017-06-28
last reply 32B store-forwarding is slower than 16B - Agner - 2017-06-28
reply SHL/SHR r,cl latency is lower than throughput - Peter Cordes - 2017-05-27
replythread Test results for Broadwell and Skylake - Bulat Ziganshin - 2017-05-30
last replythread Test results for Broadwell and Skylake - Agner - 2017-05-30
last replythread Test results for Broadwell and Skylake - Bulat Ziganshin - 2017-05-30
last replythread Test results for Broadwell and Skylake - - - 2017-06-19
replythread Test results for Broadwell and Skylake - Jorcy Neto - 2017-06-20
last reply Test results for Broadwell and Skylake - Jorcy Neto - 2017-06-20
replythread Test results for Broadwell and Skylake - Bulat Ziganshin - 2017-06-21
reply Test results for Broadwell and Skylake - Jorcy Neto - 2017-06-26
last replythread Test results for Broadwell and Skylake - - - 2017-07-05
last replythread Test results for Broadwell and Skylake - - - 2017-07-12
last reply Test results for Broadwell and Skylake - Jorcy Neto - 2017-07-19
last replythread Test results for Broadwell and Skylake - Xing Liu - 2017-06-28
last replythread Test results for Broadwell and Skylake - Travis - 2017-06-29
last replythread Test results for Broadwell and Skylake - Xing Liu - 2017-06-30
last reply Test results for Broadwell and Skylake - Travis - 2017-07-13
last reply Official information about uOps and latency SNB+ - SEt - 2017-07-17
 
Test results for Broadwell and Skylake
Author: Agner Date: 2015-12-26 08:27
The optimization manuals at www.agner.org/optimize/#manuals have been updated. I have now tested the Intel Broadwell and Skylake processors. I have not tested the AMD Excavator and Puma because I cannot find suitable motherboards for testing them.

The test results show that the pipeline and execution units in Broadwell are very similar to those of its predecessor Haswell, while the Skylake has been reorganized a little.

The Skylake has a somewhat improved cache throughput and supports the new DDR4 RAM. This is important since RAM access is the bottleneck in many applications. On the other hand, the Skylake has reduced the level-2 cache associativity from 8 to 4.

Floating point division has been improved a little in Broadwell and integer division has been improved a little in Skylake. Gather instructions, which are used for collecting non-contiguous data from memory and joining them into a vector register, are improved somewhat in Broadwell, and a little more in Skylake. This makes it more efficient to collect data into vector registers.

Ever since the first Intel processor with out-of-order execution was released in 1995, there has been a limitation that no micro-operation could have more than two input dependencies. This meant that instructions with more than two input dependencies were split into two or more micro-operations. The introduction of fused multiply-and-add (FMA) instructions in Haswell made it necessary to overcome this limitation. Thus, the FMA instructions were the first instructions to be implemented with micro-operations with three input dependencies in an Intel processor. Once this limitation has been broken, the new capability can also be applied to other instructions. The Broadwell has extended the capability for three-input micro-operations to add-with-carry, subtract-with-borrow and conditional move instructions. The Skylake has extended it further to a blend instruction. AMD processors have never had this limitation of two input dependencies. Perhaps this is the reason why AMD came before Intel with FMA instructions.

The Haswell and Broadwell have two execution units for floating point multiplication and FMA, but only one for addition. This is odd since most floating point code has more additions than multiplications. To get the maximum floating point throughput on these processors, one might have to replace some additions with FMA instructions with a multiplier of 1. Fortunately, the Skylake has fixed this imbalance: it has two floating point arithmetic units, both of which can handle addition, multiplication and FMA. This gives a maximum throughput of two floating point vector operations per clock cycle.

The Skylake has increased the number of execution units for integer vector arithmetic from two to three. In general, the Skylake now has multiple execution units for almost all common operations (except memory write and data permutations). This means that an instruction or micro-operation rarely has to wait for a vacant execution unit. A throughput of four instructions per clock cycle is now a realistic goal for CPU-intensive code, unless the software contains long dependency chains. All arithmetic and logic units support vectors of up to 256 bits. The anticipated support for 512-bit vectors with the AVX-512 instruction set has been postponed to 2016 or 2017.

Intel's design has traditionally tried to standardize operation latencies, i.e. the number of clock cycles that a micro-operation takes. Operations with the same latency were organized under the same execution port in order to avoid a clash when operations that start at different times would otherwise finish at the same time and need the result bus simultaneously. The Skylake microarchitecture has been improved to allow operations with several different latencies under the same execution port. There is still some standardization of latencies left, though. All floating point additions, multiplications and FMA operations have a latency of 4 clock cycles on Skylake, where previous processors had 3 for addition and 5 for multiplication and FMA.

Store forwarding is one clock cycle faster on Skylake than on previous processors. Store forwarding is the time it takes to read from a memory address immediately after writing to the same address.

Previous Intel processors have different states for code that uses the AVX instruction sets with 256-bit vectors versus legacy code with 128-bit vectors and no VEX prefixes. The Sandy Bridge, Ivy Bridge, Haswell and Broadwell processors all have these states and a serious penalty of 70 clock cycles for state switching when a piece of code accidentally mixes VEX and non-VEX instructions. This annoying state shift and penalty has been eliminated on the Skylake. Apparently, the implementation of 256-bit registers has become more streamlined.

I observed an interesting phenomenon when executing 256-bit vector instructions on the Skylake. There is a warm-up period of approximately 14 µs before it can execute 256-bit vector instructions at full speed. Apparently, the upper 128-bit half of the execution units and data buses is turned off in order to save power when it is not used. As soon as the processor sees a 256-bit instruction it starts to power up the upper half. It can still execute 256-bit instructions during the warm-up period, but it does so by using the lower 128-bit units twice for every 256-bit vector. The result is that the throughput for 256-bit vectors is 4-5 times slower during this warm-up period. If you know in advance that you will need to use 256-bit instructions soon, then you can start the warm-up process by placing a dummy 256-bit instruction at a strategic place in the code. My measurements showed that the upper half of the units is shut down again after 675 µs of inactivity.

This warm-up phenomenon has reportedly been observed in previous processors as well (see agner.org/optimize/blog/read.php?i=378#378), but I have not observed it before in any of the processors that I have tested. Perhaps some high-end versions of Intel processors have this ability to shut down the upper 128-bit lane in order to save power, while other variants of the same processors have no such feature. This is something that needs further investigation.

   
Sustained 64B loads per cycle on Haswell & Sky
Author: Nathan Kurz Date: 2015-12-26 18:03
Hi Agner --

Great to see the updates for Skylake! Thanks for putting all the effort into making these. Your guides are tremendous resources.

You mention in your guides that bank conflicts should no longer be a problem for Haswell or Skylake, and that "There are two identical memory read ports (port 2 and 3) and one write port (port 4). These ports all have the full 256 bits width. This makes it possible to make two memory reads and one memory write per clock cycle, with any register size up to 256 bits.". You also say that cache bank conflicts are not a problem, and that "It is always possible to do two cache reads in the same clock cycle without causing a cache bank conflict."

Do you have code that demonstrates this? Even without writes, I'm currently unable to create code that can sustain 2 256-bit loads per cycle from L1D. I started with code that used a fused-multiply-add, but then realized that I was being slowed down by the loads rather than the math. I'm also seeing timing effects that make me suspect that some sort of bank conflict must be occurring, since some orderings of loads from L1 are consistently faster than others. I've put my current test code up here: https://gist.github.com/nkurz/9a0ed5a9a6e591019b8e

When compiled with "gcc -fno-inline -std=gnu99 -Wall -O3 -g -march=native l1d.c -o l1d", results look like this on Haswell:
Testing with SIZE=4096...
calc_simple(array1, array2, size): 5.01 cycles/input
calc_fma(array1, array2, size): 0.22 cycles/input
calc_fma_reordered(array1, array2, size): 0.20 cycles/input
calc_load_only(array1, array2, size): 0.21 cycles/input [ERROR]
calc_load_only_reordered(array1, array2, size): 0.18 cycles/input [ERROR]

And like this on Skylake:
Testing with SIZE=4096...
calc_simple(array1, array2, size): 4.02 cycles/input
calc_fma(array1, array2, size): 0.20 cycles/input
calc_fma_reordered(array1, array2, size): 0.17 cycles/input
calc_load_only(array1, array2, size): 0.20 cycles/input [ERROR]
calc_load_only_reordered(array1, array2, size): 0.17 cycles/input [ERROR]


calc_simple() shows that the latency of an FMA on Haswell is 5 cycles, while it's only 4 cycles on Skylake. It's a simple approach in that there is no unrolling, so we are latency limited. So far, so good.

calc_fma() shows a straightforward approach of loading 4 YMM vectors of floats, and then multiplying them by another 4 YMM vectors of floats, using 4 separate accumulators. Results are slightly slower on Haswell than on Skylake, presumably because 4-way unrolling is not enough to hide the 5 cycle latency of the FMA on Haswell.

calc_fma_reordered() is the first surprise. This is the same as calc_fma(), but loads the vectors in a different order: +96, +32, +64, +0 instead of the in-order byte offsets of +0, +32, +64, +96. I haven't seen any theory that would explain why there would be a difference in speed for these two orders.

calc_load_only() is the next surprise. I dropped the FMA altogether, and just did the loads. We get a slight speed up on Haswell (agreeing with the FMA latency), but no speed up on Skylake. Since there is nothing in the loop but the loads, if we can execute 2 32B loads per cycle, I would have expected to see .125 cycles per input. The [ERROR] on the line is expected, and is because we are not actually calculating the sum.

calc_load_only_reordered() continues the surprise. Once again, reading the vectors in non-linear order improves the speed considerably. But the speed is still much less than .125 that we would see for the theoretical 2 loads per cycle. Again, [ERROR] is expected because there is no math being done.

Do you have any idea what's happening here? Why would the ordering of the loads matter if all the results are in L1D? Why can't I get to .125 cycles per float? I've inspected the results with 'perf record -F 10000 ./l1d' / 'perf report' on both machines, and the assembly looks like I'd expect. I can make the loop logic slightly better, but this doesn't seem to be the limiting factor. What do I need to do differently to achieve sustained load speeds of 64B per cycle on Haswell and Skylake?

   
Sustained 64B loads per cycle on Haswell & Sky
Author: Agner Date: 2015-12-27 01:48
Nathan Kurz wrote:
reading the vectors in non-linear order improves the speed considerably. But the speed is still much less than .125 that we would see for the theoretical 2 loads per cycle.
It is possible to make two reads and one write in the same clock cycle, but it is not possible to obtain a continuous throughput at this theoretical maximum. You are always limited by cache ways, read/write buffers, faulty prefetching, suboptimal reordering, etc. The write operations may sometimes use port 2 or 3 for address calculation, where the maximum throughput requires that they use port 7. It is quite likely that there are other effects that I am not aware of. The execution times that I have measured for 2 reads and 1 write are fluctuating a lot, and typically 40 - 60 % longer than the theoretical minimum.
   
Sustained 64B loads per cycle on Haswell & Sky
Author: Nathan Kurz Date: 2015-12-27 18:59
Agner wrote:
You are always limited by cache ways, read/write buffers, faulty prefetching, suboptimal reordering, etc.
Yes, although in my example I'm considering the much simpler case where there are two reads but no writes, and all data is already in L1. So although problematic in the real world, these shouldn't be a factor here. In fact, I see the same maximum speed if I read the same 4 vectors over and over rather than striding over all the data. I've refined my example, though, and think I now understand what's happening. The problem isn't a bank conflict, rather it's a slowdown due to unaligned access. I don't think I've seen this discussed before.

Contrary to my previous understanding, alignment makes a big difference on the speed at which vectors are read from L1 to register. If your data is 16B aligned rather than 32B aligned, a sequential read from L1 is no faster with 256-bit YMM reads than it is with 128-bit XMM reads. VMOVAPS and VMOVUPS have the same speed, but you cannot achieve 2 32B loads per cycle if the underlying data is not 32B aligned. If the data is 32B aligned, you still can't quite sustain 64 B/cycle of load with either, but you can get to about 54 B/cycle with both.

I put up new test code here: https://gist.github.com/nkurz/439ca1044e11181c1089

Results at L1 sizes are essentially the same on Haswell and Skylake.

Loading 4096 floats with 64 byte raw alignment
Vector alignment 8:
load_xmm : 19.79 bytes/cycle
load_xmm_nonsequential : 23.41 bytes/cycle
load_ymm : 28.64 bytes/cycle
load_ymm_nonsequential : 36.57 bytes/cycle

Vector alignment 16:
load_xmm : 29.26 bytes/cycle
load_xmm_nonsequential : 29.05 bytes/cycle
load_ymm : 28.44 bytes/cycle
load_ymm_nonsequential : 36.90 bytes/cycle

Vector alignment 24:
load_xmm : 19.79 bytes/cycle
load_xmm_nonsequential : 23.54 bytes/cycle
load_ymm : 28.64 bytes/cycle
load_ymm_nonsequential : 36.57 bytes/cycle

Vector alignment 32:
load_xmm : 29.05 bytes/cycle
load_xmm_nonsequential : 28.85 bytes/cycle
load_ymm : 53.19 bytes/cycle
load_ymm_nonsequential : 52.51 bytes/cycle

What this says is that unless your loads are 32B aligned, regardless
of method you are limited to about 40B loaded per cycle. If you are
sequentially loading non-32B aligned data from L1, the speeds for 16B
loads and 32B loads are identical, and limited to less than 32B per
cycle. All alignments not shown were the same as 8B alignment.

Loading in a non-sequential order is about 20% faster for unaligned
XMM and unaligned YMM loads. It's possible there is a faster order
than I have found so far. Aligned loads are the same speed
regardless of order. Maximum speed for aligned XMM loads is about 30
B/cycle, and maximum speed for aligned YMM loads is about 54 B/cycle.

At L2 sizes, the effect still exists, but is less extreme. XMM loads
are limited to 13-15 B/cycle on both Haswell and Skylake. On Haswell,
YMM non-aligned loads are 18-20 B/cycle, and YMM aligned loads are
24-26 B/cycle. On Skylake, YMM aligned loads are slightly faster at
27 B/cycle. Interestingly, sequential unaligned L2 loads on Skylake
are almost the same as aligned loads (26 B/cycle), while non-sequential
loads are much slower (17 B/cycle).

At L3 sizes, alignment is barely a factor. On Haswell, all loads are
limited to 11-13 B/cycle. On Skylake, XMM loads are the same 11-13
B/cycle, while YMM loads are slightly faster at 14-17 B/cycle.

Coming from memory, XMM and YMM loads on Haswell are the same
regardless of alignment, at about 5 B/cycle. On Skylake, XMM loads
are about 6.25 B/cycle, and YMM loads are about 6.75 B/cycle, with
little dependence on alignment. It's possible that prefetch can
improve these speeds slightly.

Agner wrote:
The write operations may sometimes use port 2 or 3 for address calculation, where the maximum throughput requires that they use port 7.
I don't recall if you mention it in your manuals, but I presume you are aware that Port 7 on Haswell and Skylake is only capable of "simple" address calculations? Thus sustaining 2 loads and a store is only possible if the store address is [const + base] form rather than [const + index*scale + base]. And as you point out, even if you do this, it can still be difficult to force the processor to use only Port 7 for the store address.
   
Sustained 64B loads per cycle on Haswell & Sky
Author: John D. McCalpin Date: 2016-01-04 07:21
Thanks to Nathan Kurz for the interesting test code.

I was able to reproduce the results on a Xeon E5-2660 v3 system once I pinned the core frequency to match the nominal frequency (2.5 GHz on that system).

It looks like the results are actually a bit better than reported because the tests are short enough that the timer overhead is not negligible. I modified the code to print out the "cycle_diff" variable in each case and see that the fastest tests are only about 312 cycles. RDTSCP overhead on this system is 32 cycles (for my very similar inline assembly), which suggests that the loop is only taking about 280 cycles. This raises the estimate of the throughput from 52.5 Bytes/cycle to 52.5*312/280 = 58.5 Bytes/cycle. This is 91.4% of peak, which is almost as fast as the best results I have been able to obtain with a DDOT kernel.

For my DDOT measurements, I ran a variety of problem sizes and did a least-squares fit to estimate the slope and intercept of the cycle count as a function of problem size. This gave estimated slopes corresponding to up to ~95% of 64 Bytes/cycle. (I used this approach because I was reading not only the TSC, but up to 8 PMCs as well, and the total overhead became quite large -- well over 200 cycles.)

In my experience, it is exceedingly difficult to understand performance limiters once you have reached this level of performance -- even if you are on the hardware engineering team! As a rule of thumb, anything exceeding 8/9 (88.9%) of the simple theoretical peak is pretty close to asymptotic, and exceeding 16/17 (94.1%) of peak is extremely uncommon.

   
Sustained 64B loads per cycle on Haswell & Sky
Author: T Date: 2016-06-18 20:32
The aligned vs unaligned results make intuitive sense. In recent processors, the penalty for unaligned access has been progressively reduced: the penalty went to zero on Sandy Bridge (and perhaps earlier), at least for loads that didn't cross a 64B cache-line boundary. In Haswell, even the 64B latency penalty disappeared - although only for loads, not stores. You can see this all graphically here:

blog.stuffedcow.net/2014/01/x86-memory-disambiguation/

The 2D charts are trying to get at the penalty of store-to-load forwarding, but the cells off of the main diagonal do a great job of showing the unaligned load/store penalties as well.

So you are finding that unaligned loads *still* have a penalty, even on Skylake - right? The key is loads that cross a 64B boundary. Fundamentally that requires bringing in two different lines from the L1, and merging the results so you get a word composed of some of one line and some of another. The improvements culminating in Haswell reduced the latency of this operation to the point where it fits inside the standard 4 cycle latency for ideal L1 access, but it can't avoid the double bandwidth usage of the unaligned loads. In many algorithms, the maximum bandwidth of the L1 isn't approached (i.e., the loads-per-cycle are 1 or less), so unaligned access ends up the same as aligned. In your loop, however, you do saturate the load bandwidth, so loads that cross a 64B boundary will cut your throughput in half, or worse.

It doesn't explain the results you got by inverting the load order, but perhaps some of that can be explained by how the loads "pair up". That is, two aligned loads can pair up in the same cycle since each only needs 1 of the 2 "load paths" from L1. An unaligned load needs both, however. So if you have a load pattern like AAUUAAUU (where A is an aligned load and U is unaligned) you get:

cycle loads
0 AA
1 U
2 U
3 AA
4 U
5 U
...

So you get 4 loads every 3 cycles, because the aligned loads are always able to pair.

On the other hand, if you have a load pattern like AUAUAUAUA, you get the following:

cycle loads
0 A
1 U
2 A
3 U
....

I.e., only 3 loads every 3 cycles, or a 25% penalty to throughput, because the aligned loads end up being singletons as well. You might ask why OoO wouldn't solve this - well OoO is based on the scheduler which understands instruction dependencies, and has a few other special-case tricks to re-order things (e.g., to avoid port retirement conflicts), but otherwise still does stuff in-order. So it likely can't understand that it should try to reorder the loads to pair aligned loads. Furthermore the memory model imposes restrictions on reordering loads (but I don't fully grok how this actually falls out in practice when you consider load buffers and the coherency protocol and so on).

All that to say that reordering the loads might easily swap the behavior from an AAUU behavior to an AUAU one.

   
Sustained 64B loads per cycle on Haswell & Sky
Author: Jens Nurmann Date: 2017-01-12 02:40
Nathan Kurz wrote:

...

The write operations may sometimes use port 2 or 3 for address calculation, where the maximum throughput requires that they use port 7.
I don't recall if you mention it in your manuals, but I presume you are aware that Port 7 on Haswell and Skylake is only capable of "simple" address calculations? Thus sustaining 2 loads and a store is only possible if the store address is [const + base] form rather than [const + index*scale + base]. And as you point out, even if you do this, it can still be difficult to force the processor to use only Port 7 for the store address.
I know I am really late in response to this but I think that Skylake can be "hinted" somewhat on the use of port 7 - at least for GPR based code. Consider the following (which is a core loop for a long addition routine)

.Loop:

mov Limb0, [Op1] ;1 1 p23 2 0.5
adc Limb0, [Op2] ;2 2 p06 p23 1
mov [Op3], Limb0 ;1 2 p237 p4 3 1
mov Limb1, [Op1+8] ;1 1 p23 2 0.5
adc Limb1, [Op2+8] ;2 2 p06 p23 1
mov [Op3+8], Limb1 ;1 2 p237 p4 3 1
mov Limb2, [Op1+16] ;1 1 p23 2 0.5
adc Limb2, [Op2+16] ;2 2 p06 p23 1
mov [Op3+16], Limb2 ;1 2 p237 p4 3 1
mov Limb3, [Op1+24] ;1 1 p23 2 0.5
adc Limb3, [Op2+24] ;2 2 p06 p23 1
mov [Op3+24], Limb3 ;1 2 p237 p4 3 1

mov Limb0, [Op1+32] ;1 1 p23 2 0.5
adc Limb0, [Op2+32] ;2 2 p06 p23 1
mov [Op3+32], Limb0 ;1 2 p237 p4 3 1
mov Limb1, [Op1+40] ;1 1 p23 2 0.5
adc Limb1, [Op2+40] ;2 2 p06 p23 1
mov [Op3+40], Limb1 ;1 2 p237 p4 3 1
mov Limb2, [Op1+48] ;1 1 p23 2 0.5
adc Limb2, [Op2+48] ;2 2 p06 p23 1
mov [Op3+48], Limb2 ;1 2 p237 p4 3 1
mov Limb3, [Op1+56] ;1 1 p23 2 0.5
adc Limb3, [Op2+56] ;2 2 p06 p23 1
mov [Op3+56], Limb3 ;1 2 p237 p4 3 1

lea Op1, [Op1+64] ;1 1 p15 1 0.5
lea Op2, [Op2+64] ;1 1 p15 1 0.5
lea Op3, [Op3+64] ;1 1 p15 1 0.5

.Check:

dec Size1
jne .Loop

On my Skylake system it executes in 817 cycles for Size1=683 (measured with RDTSCP). If I insert a "vpblend YMM0, YMM0, YMM0, 0" after "mov [Op3], Limb0" the execution time goes down to 698 cycles repeatedly! This seems to imply that port 7 is always correctly used for the write. So far I haven't tried whether a similar scheme - inserting a carefully chosen GPR opcode inside an AVX2 loop - yields similar results.

   
Test results for Broadwell and Skylake
Author: Peter Cordes Date: 2015-12-28 06:19
Thanks for your excellent work on the instruction tables and microarchitecture guide.

Agner wrote:

This annoying state shift and penalty has been eliminated on the Skylake. Apparently, the implementation of 256-bit registers has become more streamlined.

I wonder if the performance penalty has been replaced with a power-consumption penalty. Perhaps there's still a "state C" where Skylake uses more power? The performance penalty on the earlier CPUs ensures most software will still avoid this. I don't think this is very likely; probably they came up with some clever way to avoid penalties except maybe when forwarding results from a non-VEX op to a 256b op (over the bypass network).

Do 128b non-VEX ops have a "false" dependency on the upper128 of a register? Is there a latency penalty when a 256b insn reads a ymm register last written by a non-VEX insn (or an extra uop to merge the xmm into the ymm)?

More importantly, is VZEROUPPER helpful in any way on Skylake? (Obviously this is a bad idea for binaries that might be run on older CPUs).

There is one use-case for mixing VEX and non-VEX: PBLENDVB x,x,xmm0 is 1 uop, p015. VPBLENDVB v,v,v,v is 2 uops, 2p015, and 2c latency. I'm picturing a function that needs to do a lot of blends, but can also benefit from using 3-operand non-destructive VEX insns, except for non-VEX PBLENDVB.

Also: I remember reading something in a realworldtech forum thread about wider uop fetch in Skylake. (The forum isn't searchable, so I prob. can't find it now). Is there any improvement in the frontend for loops that don't fit in the loop buffer? I was hoping Skylake would fetch whole uop cache lines (up to 6 uops) per clock, and put them into a small buffer to more consistently issue 4 fused-domain uops per clock.

I've considered trying to align / re-ordering insns for uop-cache throughput in a loop that didn't quite fit in the loop buffer. I saw performance differences (on SnB) from reordering, but I never went beyond trial and error. I don't have an editor that shows the assembled binary updated on the fly as source edits are made, let alone with 32B boundaries marked and uops grouped into cache lines, so it would have been very time consuming.

   
Test results for Broadwell and Skylake
Author: Agner Date: 2015-12-29 01:36
Peter Cordes wrote:
Perhaps there's still a "state C" where Skylake uses more power?
I find no evidence of states, and I don't think it requires more power. The 128/256-bit vectors are probably treated somewhat like 8/16/32/64 bit general purpose registers.
Do 128b non-VEX ops have a "false" dependency on the upper128 of a register? Is there a latency penalty when a 256b insn reads a ymm register last written by a non-VEX insn (or an extra uop to merge the xmm into the ymm)?
There is false dependency and 1 clock extra latency, but no extra µop seen in the counters. I see no difference in the clock counts here whether the 128-bit instruction has VEX prefix or not.
   
Test results for Broadwell and Skylake
Author: Tacit Murky Date: 2016-01-04 15:04
Hello, Agner. Thanks for the detailed work, but there is some strangeness in the results, which looks like mistakes. Here are 2 examples:
For Haswell — «MOVBE r64,m64» is a 3-mop instruction with TP of 0.5 CPI (2 IPC), which is impossible with 4 IPC total pipeline restriction. AIDA64 readout (see instlatx64.atw.hu ) shows 1 IPC here.
For Skylake — «PMUL* (v,)v,v» is a 1-mop instruction with only 1 IPC, despite 2 ports available for execution (p01). AIDA64 shows TP of 2 IPC (0.5 CPI) because of second integer multiplier.
There are more minor mistakes elsewhere.
   
Test results for Broadwell and Skylake
Author: Agner Date: 2016-01-05 13:16
You are right.
The throughput for MOVBE r64,m64 is 4 instructions per 3 clock cycles.
The throughput for integer vector multiplication instructions and several other integer vector instructions is 2 instructions per clock for 128-bit and 256-bit registers, but 1 instruction per clock for 64-bit registers, because port 0 supports these instructions for all vector sizes, while port 1 supports the same instructions only for 128-bit and 256-bit vectors.
   
Test results for Broadwell and Skylake
Author: Tacit Murky Date: 2016-03-09 20:58
More stuff. Have you measured total T-put of immediate data? AIDA64 readout is inconsistent and may be erroneous. Things to consider:
1) Legacy decoder should have different T-put than µop-cache; IDQ queue may or may not impose its own restrictions.
2) As it is known for SB and IB (but may not be true for Haswell and newer CPUs; would be cool to test all of them), µop-cache slot has 4 bytes of data for both imm and ofs fields; so if (there is an 8-byte const) or (total length of imm and ofs consts is >4 bytes) — 2 entries are allocated for that µop. Literal pool in scheduler may have its own restrictions in port number (3…6) and width (4 or 8 bytes).
3) Instructions of interest:
—MOV r32/64,imm32/64 : 4/8 bytes of literals per instruction with 4 IPC of max. T-put (ideally should be 16/32 bytes/cl.);
—ADD r32,imm32 : 4 bytes of literals per instruction with 4 IPC of max. T-put;
—BLENDPS/PD xmm,[r+ofs32],imm8 : 5 bytes of total literals per instruction with 3 IPC of max. T-put, but only 2 L1D reads/cl.; one may substitute the 3rd blend with MOVAPS [r+ofs32],xmm, giving 5+5+4=14 bytes of literals for 3 IPC (but 5 µops).
   
Test results for Broadwell and Skylake
Author: Tacit Murky Date: 2016-06-05 15:26
Intel's Optimisation Manual says certain things about Skylake's OoO-machine updates:
1. «Legacy decode pipeline» can deliver 5 µops/cl to IDQ, 1 more than before;
2. DSB can deliver 6 µops/cl to IDQ, 2 more than before;
3. There are 2 IDQ's (1 per thread) 64 µops each; all 64 can be used for a loop (in both threads);
4. Improved SMT performance with HT on, via the longer-latency PAUSE instruction and/or a wider retire stage.

All of this contradicts your results for Skylake. Or was that info related only to Broadwell?

   
Minor bug in the microarchitecture manual
Author: SHK Date: 2016-01-10 13:05
Hi Agner, thanks a lot for your manuals, they're an invaluable source, even better than the official ones.

I've noticed a small error in microarchitecture.pdf. On page 148 (description of Skylake's pipeline), you say that "The sizes of the reorder buffer, reservation station and register file have allegedly been increased, but the details have not been published".
Their sizes have been published (224 slots for the ROB, 97 RS entries, 180 PREGS, and so on); you can view them on page 12 of this presentation from IDF15 (it's the SPCS001 session):

https://hubb.blob.core.windows.net/e5888822-986f-45f5-b1d7-08f96e618a7b-published/73ed87d8-209a-4ca1-b456-42a167ffd0bd/SPCS001%20-%20SF15_SPCS001_103f.pdf?sv=2014-02-14&sr=c&sig=XKetbBtWcJzdBjJEc1bFubMzOrEPpoVcK6%2Bm693ZUts%3D&se=2016-01-11T18%3A50%3A10Z&sp=rwd

Thanks again and keep up with the good work!

   
Minor bug in the microarchitecture manual
Author: Agner Date: 2016-01-16 03:26
Thanks for the tip. The link doesn't work. I found it here: myeventagenda.com/sessions/0B9F4191-1C29-408A-8B61-65D7520025A8/7/5 session SPCS001.
   
Test results for Broadwell and Skylake
Author: John D. McCalpin Date: 2016-01-12 13:54
I just ran across some performance counter bugs on Haswell that may influence one's interpretation of instruction retirement rates and may bias measurements of uops per instruction.

I put performance counters around 100 (outer) iterations of a simple 10-instruction loop that executed 1000 times. According to Agner's instruction tables this loop should have 12 uops. Both the fixed-function "instructions retired" and the programmable "INST_RETIRED.ANY_P" events report 12 instructions per loop iteration (not 10), while the UOPS_RETIRED.ALL programmable counter event reported 14 uops per loop iteration (not 12). While I could be misinterpreting the uop counts, there is no way that I could have mis-counted the instructions --- it took all of my fingers, but did not generate an overflow condition. ;-)

It turns out that there are a number of errata for both the instructions retired events and the uops retired event on all Intel Haswell processors. Somewhat perversely, the different Haswell products have different errata listed, even though they have the same DISPLAYFAMILY_DISPLAYMODEL designation, but all of them that I checked (Xeon E5 v3 (HSE71 in doc 330785), Xeon E3 v3 (HSW141 in doc 328908), and 4th Generation Core Desktop (HSD140 in doc 328899)) include an erratum to the effect that the "instructions retired" counts may overcount or undercount. This erratum is also listed for the 5th Generation Core (Broadwell) processors (BDM61 in doc 330836), but is not listed in the "specification update" document for the Skylake processors (doc 332689).

For this particular loop the counts are completely stable with respect to variations in loop length (e.g., from 500 to 11000 shows no effect other than asymptotically decreasing overhead). The machine is running with HyperThreading enabled, but there are no other users or non-OS tasks and this job was pinned to (local) core 4 on socket 1, so there is no way that interference with another thread (mentioned in several other errata) could account for seeing identical behavior over several hundred trials.

Reading between the lines, the language that Intel uses in the descriptions of these performance counter errata seems consistent with the language used in other cases for which the errors are not "large" (not approaching 100%), but are also not "small" (not limited to single-digit percentages). It is very hard to decide whether I want to take the time to try to characterize or bound this particular performance counter error. It may end up having an easy story, or it may end up being completely inexplicable without inspection of the processor RTL.

   
Test results for Broadwell and Skylake
Author: Jess Date: 2016-02-11 11:00
I notice that SKD044 on page 28 of this PDF:

www.intel.com/content/dam/www/public/us/en/documents/specification-updates/desktop-6th-gen-core-family-spec-update.pdf

explains why the discrepancy occurs and how large it is likely to be for this chip. Similar errata for other chips seem to be less detailed, though I haven't checked exhaustively.

   
Description of discrepancy
Author: Nathan Kurz Date: 2016-03-13 17:54
Jess wrote:
I notice that SKD044 on page 28 of this PDF:

www.intel.com/content/dam/www/public/us/en/documents/specification-updates/desktop-6th-gen-core-family-spec-update.pdf

explains why the discrepancy occurs and how large it is likely to be for this chip.

I appreciate the link, but I'm unable to find the portion that you refer to. Could you point more exactly to the details you found?

SKD044 doesn't exist in that document, SKL044 is about WRMSR, and nothing on page 28 seems relevant. I did find SKD044 in a different document (http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/6th-gen-core-u-y-spec-update.pdf) but still about WRMSR. The closest erratum I did find was SKL048 "Processor May Run Intel AVX Code Much Slower than Expected", but this is only when coming out of C6, and doesn't give other details.

   
Test results for Broadwell and Skylake
Author: Russell Van Zandt Date: 2016-02-22 17:50
Thank you all for the useful information. FYI, the latest Intel architecture optimization manual discusses the Skylake changes for the mixed AVX / SSE problem in great detail, including diagrams and tables. This is in section 11.3 "Mixing AVX Code with SSE Code" in the January 2016 edition. Skylake has not eliminated the problem entirely, with "partial register dependency + blend" as the penalty in one mode, and ~XSAVE in another mode. Use of VZEROUPPER is still recommended, in rule 72. "The Skylake microarchitecture implements a different state machine than prior generations to manage the YMM state transition associated with mixing SSE and AVX instructions. It no longer saves the entire upper YMM state transition ... but saves the upper bits of individual register. As a result ... will experience a penalty associated with partial register dependency...".

Other topics discussed include "Align data to 32 bytes" (Section 11.6.1), which was recently discussed on this blog too.

There is lots and lots of Skylake material, including the tradeoffs between electrical power reduction vs. performance. Like "The latency of the PAUSE instruction in prior generation microarchitecture is about 10 cycles, whereas on Skylake microarchitecture it has been extended to as many as 140 cycles... There's also a small power benefit in 2-core and 4-core systems... As the PAUSE latency has been increased significantly, workloads that are sensitive to PAUSE latency will suffer some performance loss." Section 8.4.7

   
Instruction Throughput on Skylake
Author: Nathan Kurz Date: 2016-04-23 13:16
In the Section 11 "Skylake" of your Microarchitecture Guide (http://www.agner.org/optimize/microarchitecture.pdf), you say: "There are four decoders, which can handle instructions generating up to four μops per clock cycle in the way described on page 121 for Sandy Bridge" and "Code that runs out of the μop cache are not subject to the limitations of the fetch and decode units. It can deliver a throughput of 4 (possibly fused) μops or the equivalent of 32 bytes of code per clock cycle."

This seems contradicted by Section 2.1 "Skylake Microarchitecture" of the Intel Optimization manual (http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf): "Legacy Decode Pipeline delivery of 5 uops per cycle to the IDQ compared to 4 uops in previous generations" and "The DSB delivers 6 uops per cycle to the IDQ compared to 4 uops in previous generations." These numbers also match Figure 2.1 in that guide, which makes me think the Intel manual is probably correct here.

About Skylake, you also say "It is designed for a throughput of four instructions per clock cycle." I've recently measured a few results that make me wonder if it's actually capable of more than that. Did you happen to do any tests that would confirm whether Skylake might be able to sustain 5 or 6 unfused instructions per cycle (thus possibly 7 or 8 including fused branches not taken) if the correct execution ports are available? From the published specs, I haven't been able to find evidence of a hard limit of 4 unfused instructions per cycle.

One stage for which I haven't been able to find documentation of the Skylake limits is retirement. Section 2.6.5 on Hyperthreading Retirement says "If one logical processor is not ready to retire any instructions, then all retirement bandwidth is dedicated to the other logical processor." I've seen claims that Skylake has "wider Hyperthreading retirement" than previous generations, and there is also a documented performance monitor event for "Cycles with less than 10 actually retired uops", which would imply that the maximum is at least 10. Do you know if this is true?

   
Instruction Throughput on Skylake
Author: Agner Date: 2016-04-24 00:02
Nathan Kurz wrote:
Did you happen to do any tests that would confirm whether Skylake might be able to sustain 5 or 6 unfused instructions per cycle (thus possibly 7 or 8 including fused branches not taken) if the correct execution ports are available?
NOPs have a throughput of 4 per clock cycle, and NOPs are not using any execution unit. I have never seen a higher throughput than 4 if you count a fused jump as one instruction. If two threads are running in the same core then each thread gets 2 NOPs per clock.

It is possible that the decoders have a higher throughput, but then there must be a bottleneck somewhere else. This will be hard to verify.

   
Instruction Throughput on Skylake
Author: Nathan Kurz Date: 2016-04-26 13:50
Agner wrote:
It is possible that the decoders have a higher throughput, but then there must be a bottleneck somewhere else. This will be hard to verify.
I'm starting to understand this better. Using Likwid and defining some custom events, I've determined that Skylake can sustain execution and retirement of 5 or 6 µops per cycle. This is ignoring jump/cc "macro-fusion", which would presumably boost us up to 7 or 8. The bottleneck appears to be the "renamer", which can only "issue" 4 µops per cycle.
The question is "What constitutes a µop for this stage?"

In 2.3.3.1 of the Intel Optimization Guide, when discussing Sandy Bridge it says: "The Renamer is the bridge between the in-order part in Figure 2-5, and the dataflow world of the Scheduler. It moves up to four micro-ops every cycle from the micro-op queue to the out-of-order engine. Although the renamer can send up to 4 micro-ops (unfused, micro-fused, or macro-fused) per cycle, this is equivalent to the issue port can dispatch six micro-ops per cycle."

The grammar is atrocious, but I think it means that while the Renamer can only move 4 µops, these can be micro-fused µops that will be "unlaminated" to a load µop and an action µop. From what I can tell, Skylake can move 6 fused µops per cycle from the DSB to the IDQ, but can only "issue" 4 fused µops per cycle from the IDQ. But since the scheduler only handles unfused µops, this means that we can "dispatch" up to twice that many depending on fusion.

The result of this is that while it is probably true to say that Skylake is "designed for a throughput of four instructions per clock cycle", instructions per clock cycle can be a poor metric to use when comparing fused and unfused instructions. Previously, I'd naively thought that once the instructions were decoded to the DSB, it didn't matter whether one expressed LOAD-OP as a single instruction, or as a separate LOAD then OP.

But if one is being constrained by the Renamer, it turns out that it can make a big difference in total execution time. For example, I'm finding that in a tight loop, this (two combined load-adds):

#define ASM_ADD_ADD_INDEX(in, sum1, sum2, index) \
__asm volatile ("add 0x0(%[IN], %[INDEX]), %[SUM1]\n" \
"add 0x8(%[IN], %[INDEX]), %[SUM2]\n" \
"add $0x10, %[INDEX]\n" : \
[IN] "+&r" (in), \
[SUM1] "+&r" (sum1), \
[SUM2] "+&r" (sum2), \
[INDEX] "+&r" (index))


Is about 20% faster than this (two separate loads and adds):

#define ASM_LOAD_LOAD_INDEX(in, sum1, sum2, index, tmp) \
__asm volatile ("mov 0x0(%[IN], %[INDEX]), %[TMP]\n" \
"add %[TMP], %[SUM1]\n" \
"mov 0x8(%[IN], %[INDEX]), %[TMP]\n" \
"add %[TMP], %[SUM2]\n" \
"add $0x10, %[INDEX]\n" : \
[IN] "+&r" (in), \
[SUM1] "+&r" (sum1), \
[SUM2] "+&r" (sum2), \
[INDEX] "+&r" (index), \
[TMP] "=r" (tmp))

While the hybrid (one and one) is the same speed as the fast version:

#define ASM_LOAD_ADD_INDEX(in, sum1, sum2, index, tmp) \
__asm volatile ("mov 0x0(%[IN], %[INDEX]), %[TMP]\n" \
"add %[TMP], %[SUM1]\n" \
"add 0x8(%[IN], %[INDEX]), %[SUM2]\n" \
"add $0x10, %[INDEX]\n" : \
[IN] "+&r" (in), \
[SUM1] "+&r" (sum1), \
[SUM2] "+&r" (sum2), \
[INDEX] "+&r" (index), \
[TMP] "=r" (tmp))


What I don't understand yet is why all variations that directly increment %[IN] are almost twice as slow as the versions that use and increment %[INDEX]:

#define ASM_ADD_ADD_DIRECT(in, sum1, sum2) \
__asm volatile ("add 0x0(%[IN]), %[SUM1]\n" \
"add 0x8(%[IN]), %[SUM2]\n" \
"add $0x10, %[IN]\n" : \
[IN] "+&r" (in), \
[SUM1] "+&r" (sum1), \
[SUM2] "+&r" (sum2))

I also don't understand yet why I get 30% faster speeds for loops small enough to fit in the LSD than when unrolled such that the number of µops requires the DSB. Apparently the Loop Stream Detector still plays a performance role in some cases.

   
Instruction Throughput on Skylake
Author: Agner Date: 2016-04-27 01:14
Nathan Kurz wrote:
The bottleneck appears to be the "renamer", which can only "issue" 4 µops per cycle.
I think the decoding front end and the renamer are designed with a 4-wide pipeline for a throughput of four µops per clock. These µops are queuing up in the reservation station if execution of them is delayed for any reason. The scheduler can issue more than 4 µops per clock cycle in bursts until the queue is empty.

I also don't understand yet why I get 30% faster speeds for loops small enough to fit in the LSD than when unrolled such that the number of µops requires the DSB. Apparently the Loop Stream Detector still plays a performance role in some cases.
Instruction fetch and decode is often a bottleneck - you need to check the instruction lengths. Alignment of the loop entry can also influence the results. Finally, you will often see cache effects influencing the results in a less than obvious way.
   
Instruction Throughput on Skylake
Author: T Date: 2016-06-18 19:27
When you say:

> I think the decoding front end and the renamer are designed with a 4-wide pipeline for a throughput of four µops per clock.

Are you talking fused-domain or unfused-domain µops? Here I'm only interested in micro-fusion. Let's assume there are no opportunities for macro-fusion. If that's 4-wide in the fused domain, it implies that the processor could sustain 6 µops throughput in the unfused domain, if there are no 4 (or 5) wide bottlenecks downstream of the scheduler (e.g., issue or retirement). That would be a big deal, since it implies that read-modify instructions may be highly preferred in many scenarios over two separate load and reg-reg op instructions.

Hi Nathan,

Are you able to share your results about 5 or 6 wide throughput? You hinted at them in your post, but anything reproducible would be great.

T

   
Instruction Throughput on Skylake
Author: Agner Date: 2016-06-19 00:59
T wrote:
If that's 4-wide in the fused domain, it implies that the processor could sustain 6 µops throughput in the unfused domain, if there are no 4 (or 5) wide bottlenecks downstream of the scheduler (e.g., issue or retirement).
Yes, it can do 6 µops in the unfused domain.
   
Instruction Throughput on Skylake
Author: Nathan Kurz Date: 2016-07-08 02:50
T wrote:

Are you able to share your results about 5 or 6 wide throughput? You hinted at them in your post, but anything reproducible would be great.

All sharable, but I haven't been thinking about this direction for a couple months. I'll try to post something here if I can dig it up, but I won't be able to get to it immediately.

But if my recollection is correct, the short answer is that yes, Read-Modify instructions should almost always be used as heavily as possible for inner loops on modern Intel processors. They have significant upside if you would otherwise be limited by the renamer.

And while you say you are not interested in it, the corollary for micro-fusion is that CMP-JCC instructions should almost always be adjacent in assembly. I'm pretty sure that both GCC and LLVM would benefit from putting a higher penalty on the split.

   
Instruction Throughput on Skylake
Author: Nathan Kurz Date: 2016-07-11 22:21
OK, here's my cleaned up test code.

// gcc -g -Wall -O2 fusion.c -o fusion -DLIKWID -llikwid [may also need -lm -lpthread]
// likwid-perfctr -m -g UOPS_ISSUED_ANY:PMC0,UOPS_EXECUTED_CORE:PMC1,UOPS_RETIRED_ALL:PMC2,BR_INST_RETIRED_NEAR_TAKEN:PMC3 -C 1 fusion

#include <x86intrin.h>
#include <stdint.h>
#include <stdio.h>

#ifdef LIKWID
#include <likwid.h>
#define MEASURE_INIT() \
  do { \
    likwid_markerInit(); \
    likwid_markerThreadInit(); \
  } while (0)
#define MEASURE_FINI() \
  do { \
    likwid_markerClose(); \
  } while (0)
#define MEASURE(name, code) \
  do { \
    sum1 = sum2 = 0; \
    likwid_markerStartRegion(name); \
    code; \
    likwid_markerStopRegion(name); \
    printf("%s: sum1=%ld, sum2=%ld\n", name, sum1, sum2); \
  } while (0)
#else // not LIKWID
#define MEASURE_INIT()
#define MEASURE_FINI()
#define MEASURE(name, code) \
  do { \
    sum1 = sum2 = 0; \
    code; \
    printf("%s: sum1=%ld, sum2=%ld\n", name, sum1, sum2); \
  } while (0)
#endif // not LIKWID

#define ASM_TWO_MICRO_TWO_MACRO(in1, sum1, in2, sum2, max) \
__asm volatile ("1:\n" \
                "add (%[IN1]), %[SUM1]\n" \
                "cmp %[MAX], %[SUM1]\n" \
                "jae 2f\n" \
                "add (%[IN2]), %[SUM2]\n" \
                "cmp %[MAX], %[SUM2]\n" \
                "jb 1b\n" \
                "2:" : \
                [SUM1] "+&r" (sum1), \
                [SUM2] "+&r" (sum2) : \
                [IN1] "r" (in1), \
                [IN2] "r" (in2), \
                [MAX] "r" (max))

#define ASM_NO_MICRO_TWO_MACRO(in1, sum1, in2, sum2, max, tmp1, tmp2) \
__asm volatile ("1:\n" \
                "mov (%[IN1]), %[TMP1]\n" \
                "add %[TMP1], %[SUM1]\n" \
                "cmp %[MAX], %[SUM1]\n" \
                "jae 2f\n" \
                "mov (%[IN2]), %[TMP2]\n" \
                "add %[TMP2], %[SUM2]\n" \
                "cmp %[MAX], %[SUM2]\n" \
                "jb 1b\n" \
                "2:" : \
                [TMP1] "=&r" (tmp1), \
                [TMP2] "=&r" (tmp2), \
                [SUM1] "+&r" (sum1), \
                [SUM2] "+&r" (sum2) : \
                [IN1] "r" (in1), \
                [IN2] "r" (in2), \
                [MAX] "r" (max))

#define ASM_ONE_MICRO_TWO_MACRO(in1, sum1, in2, sum2, max, tmp) \
__asm volatile ("1:\n" \
                "add (%[IN1]), %[SUM1]\n" \
                "cmp %[MAX], %[SUM1]\n" \
                "jae 2f\n" \
                "mov (%[IN2]), %[TMP]\n" \
                "add %[TMP], %[SUM2]\n" \
                "cmp %[MAX], %[SUM2]\n" \
                "jb 1b\n" \
                "2:" : \
                [TMP] "=&r" (tmp), \
                [SUM1] "+&r" (sum1), \
                [SUM2] "+&r" (sum2) : \
                [IN1] "r" (in1), \
                [IN2] "r" (in2), \
                [MAX] "r" (max))

#define ASM_ONE_MICRO_ONE_MACRO(in1, sum1, in2, sum2, max, tmp) \
__asm volatile ("1:\n" \
                "add (%[IN1]), %[SUM1]\n" \
                "cmp %[MAX], %[SUM1]\n" \
                "mov (%[IN1]), %[TMP]\n" \
                "jae 2f\n" \
                "add %[TMP], %[SUM2]\n" \
                "cmp %[MAX], %[SUM2]\n" \
                "jb 1b\n" \
                "2:" : \
                [TMP] "=&r" (tmp), \
                [SUM1] "+&r" (sum1), \
                [SUM2] "+&r" (sum2) : \
                [IN1] "r" (in1), \
                [IN2] "r" (in2), \
                [MAX] "r" (max))

// two separate loads and adds, two non-fused cmp then jcc
#define ASM_NO_MICRO_NO_MACRO(in1, sum1, in2, sum2, max, tmp1, tmp2) \
__asm volatile ("mov (%[IN1]), %[TMP1]\n" \
                "1:\n" \
                "add %[TMP1], %[SUM1]\n" \
                "cmp %[MAX], %[SUM1]\n" \
                "mov (%[IN2]), %[TMP2]\n" \
                "jae 2f\n" \
                "add %[TMP2], %[SUM2]\n" \
                "cmp %[MAX], %[SUM2]\n" \
                "mov (%[IN1]), %[TMP1]\n" \
                "jb 1b\n" \
                "2:" : \
                [TMP1] "=&r" (tmp1), \
                [TMP2] "=&r" (tmp2), \
                [SUM1] "+&r" (sum1), \
                [SUM2] "+&r" (sum2) : \
                [IN1] "r" (in1), \
                [IN2] "r" (in2), \
                [MAX] "r" (max))

int main(/* int argc, char **argv */)
{
    uint64_t tmp, tmp1, tmp2;
    uint64_t sum1, sum2;
    uint64_t in1 = 1;
    uint64_t in2 = 1;
    uint64_t max = 10000000;

MEASURE_INIT();

MEASURE("two_micro_two_macro", ASM_TWO_MICRO_TWO_MACRO(&in1, sum1, &in2, sum2, max));

MEASURE("one_micro_two_macro", ASM_ONE_MICRO_TWO_MACRO(&in1, sum1, &in2, sum2, max, tmp));

MEASURE("one_micro_one_macro", ASM_ONE_MICRO_ONE_MACRO(&in1, sum1, &in2, sum2, max, tmp));

MEASURE("no_micro_two_macro", ASM_NO_MICRO_TWO_MACRO(&in1, sum1, &in2, sum2, max, tmp1, tmp2));

MEASURE("no_micro_no_macro", ASM_NO_MICRO_NO_MACRO(&in1, sum1, &in2, sum2, max, tmp1, tmp2));

MEASURE_FINI();

    return 0;
}

And here's what I see on Skylake:

nate@skylake:~/src$ likwid-perfctr -m -g UOPS_ISSUED_ANY:PMC0,UOPS_EXECUTED_CORE:PMC1,UOPS_RETIRED_ALL:PMC2,BR_INST_RETIRED_NEAR_TAKEN:PMC3 -C 1 fusion
CPU name:	Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz
CPU type:	Intel Skylake processor
CPU clock:	3.41 GHz
--------------------------------------------------------------------------------
two_micro_two_macro: sum1=10000000, sum2=9999999
one_micro_two_macro: sum1=10000000, sum2=9999999
one_micro_one_macro: sum1=10000000, sum2=9999999
no_micro_two_macro: sum1=10000000, sum2=9999999
no_micro_no_macro: sum1=10000000, sum2=9999999
--------------------------------------------------------------------------------
================================================================================
Group 1 Custom: Region two_micro_two_macro
================================================================================
|       UOPS_ISSUED_ANY      |   PMC0  | 4.000816e+07 |
|     UOPS_EXECUTED_CORE     |   PMC1  | 6.000806e+07 |
|      UOPS_RETIRED_ALL      |   PMC2  | 6.000724e+07 |
| BR_INST_RETIRED_NEAR_TAKEN |   PMC3  | 1.000056e+07 |
|      INSTR_RETIRED_ANY     |  FIXC0  | 6.000540e+07 |
|    CPU_CLK_UNHALTED_CORE   |  FIXC1  | 1.001363e+07 |
================================================================================
Group 1 Custom: Region one_micro_two_macro
================================================================================
|       UOPS_ISSUED_ANY      |   PMC0  | 5.000502e+07 |
|     UOPS_EXECUTED_CORE     |   PMC1  | 6.000506e+07 |
|      UOPS_RETIRED_ALL      |   PMC2  | 6.000471e+07 |
| BR_INST_RETIRED_NEAR_TAKEN |   PMC3  | 1.000040e+07 |
|      INSTR_RETIRED_ANY     |  FIXC0  | 7.000316e+07 |
|    CPU_CLK_UNHALTED_CORE   |  FIXC1  | 1.334216e+07 |
================================================================================
Group 1 Custom: Region one_micro_one_macro
================================================================================
|       UOPS_ISSUED_ANY      |   PMC0  | 6.000435e+07 |
|     UOPS_EXECUTED_CORE     |   PMC1  | 7.000444e+07 |
|      UOPS_RETIRED_ALL      |   PMC2  | 7.000445e+07 |
| BR_INST_RETIRED_NEAR_TAKEN |   PMC3  | 1.000039e+07 |
|      INSTR_RETIRED_ANY     |  FIXC0  | 7.000310e+07 |
|    CPU_CLK_UNHALTED_CORE   |  FIXC1  | 1.672351e+07 |
================================================================================
Group 1 Custom: Region no_micro_two_macro
================================================================================
|       UOPS_ISSUED_ANY      |   PMC0  | 6.000429e+07 |
|     UOPS_EXECUTED_CORE     |   PMC1  | 6.000438e+07 |
|      UOPS_RETIRED_ALL      |   PMC2  | 6.000438e+07 |
| BR_INST_RETIRED_NEAR_TAKEN |   PMC3  | 1.000039e+07 |
|      INSTR_RETIRED_ANY     |  FIXC0  | 8.000307e+07 |
|    CPU_CLK_UNHALTED_CORE   |  FIXC1  | 1.500636e+07 |
================================================================================
Group 1 Custom: Region no_micro_no_macro
================================================================================
|       UOPS_ISSUED_ANY      |   PMC0  | 8.000476e+07 |
|     UOPS_EXECUTED_CORE     |   PMC1  | 8.000483e+07 |
|      UOPS_RETIRED_ALL      |   PMC2  | 8.000466e+07 |
| BR_INST_RETIRED_NEAR_TAKEN |   PMC3  | 1.000039e+07 |
|      INSTR_RETIRED_ANY     |  FIXC0  | 8.000312e+07 |
|    CPU_CLK_UNHALTED_CORE   |  FIXC1  | 2.000775e+07 |

And on Haswell:

nate@haswell:~/src$ likwid-perfctr -m -g UOPS_ISSUED_ANY:PMC0,UOPS_EXECUTED_CORE:PMC1,UOPS_RETIRED_ALL:PMC2,BR_INST_RETIRED_NEAR_TAKEN:PMC3 -C 1 fusion
-------------------------------------------------------------
-------------------------------------------------------------
CPU type:	Intel Core Haswell processor
CPU clock:	3.39 GHz
-------------------------------------------------------------
fusion
two_micro_two_macro: sum1=10000000, sum2=9999999
one_micro_two_macro: sum1=10000000, sum2=9999999
one_micro_one_macro: sum1=10000000, sum2=9999999
no_micro_two_macro: sum1=10000000, sum2=9999999
no_micro_no_macro: sum1=10000000, sum2=9999999
=====================
Region: two_micro_two_macro
=====================
|      UOPS_ISSUED_ANY       | 4.00061e+07 |
|     UOPS_EXECUTED_CORE     | 6.00062e+07 |
|      UOPS_RETIRED_ALL      | 6.00046e+07 |
| BR_INST_RETIRED_NEAR_TAKEN | 1.00002e+07 |
|     INSTR_RETIRED_ANY      | 6.00013e+07 |
|   CPU_CLK_UNHALTED_CORE    | 1.7392e+07  |
=====================
Region: one_micro_two_macro
=====================
+----------------------------+-------------+
|           Event            |   core 1    |
+----------------------------+-------------+
|      UOPS_ISSUED_ANY       | 5.00062e+07 |
|     UOPS_EXECUTED_CORE     | 6.00062e+07 |
|      UOPS_RETIRED_ALL      | 6.00046e+07 |
| BR_INST_RETIRED_NEAR_TAKEN | 1.00002e+07 |
|     INSTR_RETIRED_ANY      | 7.00013e+07 |
|   CPU_CLK_UNHALTED_CORE    | 1.4247e+07  |
=====================
Region: one_micro_one_macro
=====================
+----------------------------+-------------+
|           Event            |   core 1    |
+----------------------------+-------------+
|      UOPS_ISSUED_ANY       | 6.00065e+07 |
|     UOPS_EXECUTED_CORE     | 7.00065e+07 |
|      UOPS_RETIRED_ALL      | 7.00048e+07 |
| BR_INST_RETIRED_NEAR_TAKEN | 1.00002e+07 |
|     INSTR_RETIRED_ANY      | 7.00013e+07 |
|   CPU_CLK_UNHALTED_CORE    | 1.69403e+07 |
=====================
Region: no_micro_two_macro
=====================
+----------------------------+-------------+
|           Event            |   core 1    |
+----------------------------+-------------+
|      UOPS_ISSUED_ANY       | 6.00062e+07 |
|     UOPS_EXECUTED_CORE     | 6.00062e+07 |
|      UOPS_RETIRED_ALL      | 6.00046e+07 |
| BR_INST_RETIRED_NEAR_TAKEN | 1.00002e+07 |
|     INSTR_RETIRED_ANY      | 8.00013e+07 |
|   CPU_CLK_UNHALTED_CORE    | 1.57365e+07 |
=====================
Region: no_micro_no_macro
=====================
|      UOPS_ISSUED_ANY       | 8.00062e+07 |
|     UOPS_EXECUTED_CORE     | 8.00062e+07 |
|      UOPS_RETIRED_ALL      | 8.00046e+07 |
| BR_INST_RETIRED_NEAR_TAKEN | 1.00002e+07 |
|     INSTR_RETIRED_ANY      | 8.00013e+07 |
|   CPU_CLK_UNHALTED_CORE    | 2.0043e+07  |
+----------------------------+-------------+

The main thing to notice is that on Skylake the "two macro two micro" version is fastest and executes at 1 cycle per iteration, while on Haswell it is slower than a couple of options with less fusion. BR_INST_RETIRED_NEAR_TAKEN shows the number of loop iterations. Run time in cycles is shown by CPU_CLK_UNHALTED_CORE. The difference between INSTR_RETIRED_ANY and UOPS_RETIRED_ALL shows the effect of macro-fusion of CMP/JCC. The difference between UOPS_ISSUED_ANY and UOPS_EXECUTED_CORE shows the effect of micro-fusion of LOAD/ADD. UOPS_EXECUTED_CORE and UOPS_RETIRED_ALL are the same on both machines, showing that there is no branch misprediction occurring.

   
Instruction Throughput on Skylake
Author: Tacit Murky Date: 2016-07-17 14:14
Interesting results. Looks like it's about the number of renamed registers. Apparently, Hwl had a lower TP restriction in the renamer, and it was upgraded for Skl. This explains the faster case for Hwl (more µops with fewer arguments each, but only up to a certain point). Peak issue rate is still 4 fused µIPC from the IDQ to rename, but 6 unfused µIPC (corresponding to up to 6 IPC) at retire for Skl. Hwl can't allow more than 5 unfused µIPC.
   
Haswell register renaming / unfused limits
Author:  Date: 2017-05-11 09:32
Tacit Murky wrote:
Looks like it's about the number of renamed registers.
Agreed. Simply changing Nathan's loops to use an immediate instead of a register for `max` produces a dramatic speedup on HSW:
  • Nathan's 2 micro / 2 macro on my HSW: one iteration per 1.42275c (~4.21 unfused-domain uops per clock). Very consistent, +- 0.0001 cycles per iter (for 1G iterations).
  • cmp r,imm instead of cmp r,max for both compares : one iteration per ~1.12c (~5.35 unfused-domain uops per clock). Pretty noisy, from 1.116c to 1.124c per iter.

My Skylake results match Nathan's: this bottleneck is gone, so the loop always runs at 1.0 cycles per iteration. (6 unfused-domain uops / clock). HSW and SKL measured with `perf stat` on Linux 4.8 and 4.10, counting only user-space counts for a statically linked binary. With 10^9 iterations on an otherwise-idle system, this is an easy way to get accurate low-noise numbers. Skylake isn't perfect: some runs are as bad as 1.02c / iter (for these and other loops). I think this is due to settling into a sub-optimal pattern rather than measurement noise, at least in some cases.

IDK why my HSW result is so much faster than Nathan's (1.42c instead of 1.73c). I measured on an i5-4210U and i7-6700k, with HT enabled but inactive (no other processes running). I still get stable and matching results even with max=10^7. The top of my loop is 32B-aligned, and both memory addresses are 64B-aligned.

I haven't tried to construct a loop that reads even more registers per clock, e.g. 3-operand FMA with a micro-fused memory operand, unrolled with different registers to avoid a latency bottleneck. Or ADC (flag input as well as flag output).

Hwl can't allow more than 5 unfused µIPC.

That's not right. With a somewhat artificial example, I can get HSW to sustain 6 unfused-domain uops per ~1.00 clocks (see below).

It seems more like a register-read limit, since reducing the number of input registers makes it run faster (e.g. changing a macro-fused cmp/jcc to an inc helps). Perhaps it is also partly reduced resource conflicts (the not-taken branch stealing cycles on p6), but maybe not, because Skylake doesn't have that problem.

    .loop:   ;; runs at 1.053c per iter on HSW
       add   rax, [rdi]
       inc    ebx
       blsi   rdx, [rsp]
       dec   ecx          ; ecx = max to start.
       jnz .loop

Predicted-not-taken CMP r,r/JCC has 2 inputs, 1 output (just flags). INC r has 1 input, 2 outputs (r and partial-flags).

With an ADD r,m instead of BLSI r,m, the loop runs at 1.08c per iteration on HSW. (Still about 1.00c on SKL). BLSI's destination register is write-only, unlike ADD's. This is also one fewer loop-carried dep chain, which may be significant. Replacing both ADDs with BLSI slows it down (to 1.076c per iter on HSW, 1.05c per iter on SKL), presumably because of imperfect scheduling leading to resource conflicts, since BLSI can only run on p15.

I got a slowdown on HSW and SKL from using imul r,m,imm to replace the second ADD, which is weird because its destination is write-only and out-of-order execution should easily hide its 3c latency. Presumably resource-conflicts for p1 are a problem. SKL: 1.29c to 1.55c (highly variable). HSW: more stable around 1.455c +- 0.05. IMUL writes flags, but BLSI doesn't. (Using add ebx,1 instead of inc didn't help, but using test ebx,ebx instead of inc did speed it up to about 1.18c on both HSW and SKL. I guess having 1 duplicated input and 1 output instead of 2 does help!)

With a somewhat artificial example, I can hit 1.005c on HSW (still not as fast as SKL's 1.0005c best-case for this: 10 times as far away from 1c per iter). Perhaps HSW is hitting PRF limitations. Using a micro-fused AVX instruction splits things between the integer and vector PRFs.

.loop:
    vpaddd xmm0, xmm0, [rdi]
    test ebx, ebx
    test rdx, [rsp]
    dec     ecx
    jnz .loop

Strangely, VPABSD xmm, m (write-only destination) was slower (1.04c) than VPADDD xmm0,xmm0,m (read-modify-write dest). This might be from resource conflicts, since it's also slower on SKL (1.004c to 1.015c). It's odd because HSW runs it on the same two ports as VPADDD. (SKL runs it on p01, but VPADDD on p015).

Avoiding the loop-carried dependency with VPADDD xmm0, xmm1, [rdi] was slightly slower on HSW (1.043c) than VPADDD xmm0,xmm0,[rdi], which smells like a register-read bottleneck on reading "cold" registers from the PRF.

Non-loop-carried dependency chains between two instructions in the loop seem to prevent it from running at 1c per iteration, even on SKL. (e.g. test ecx,ecx is a problem when ecx was written by the macro-fused loop-branch dec ecx/jnz, slowing HSW down to 1.068c). ----

Using indexed addressing-modes makes it run slower even on SKL. (But micro-fusion still happens on both HSW and SKL. Apparently un-lamination before the IDQ for indexed addressing modes only applies to SnB/IvB, not HSW! We already knew it didn't apply to SKL, but I had been assuming that change was new with SKL. I only got a HSW perf-counter test setup this week.)

  ;rsi=r8=0
  ;rsp and rdi are both 64B aligned.  rdi points into the BSS, in case that matters.
.loop:
    add  rdx, [rsp+rsi*4]

    cmp  r11, r12
    jne .end                   ; never taken, r11==r12

    add  ebx, [rdi+r8*4]

    sub ecx, r9d    ; alternatively,  sub ecx,1  to replace a reg with an immediate
    jnz .loop
.end:

Notice that although this is very similar to Nathan's two_micro_two_macro, there are no dependencies between any of the fused-domain uops. The loop-exit condition is just from decrementing ecx with a macro-fused uop.

This reads 7 "cold" registers (addressing modes, r11, r12, and r9), and 3 hot registers (rdx, ebx, and ecx) per iteration. It writes the 3 hot registers once each, and flags 4 times.

SKL runs it at 1.5566c / iter. Input registers per clock: 6.42 total, 4.50 cold, 1.93 hot. Total non-flag regs read+written per cycle: 8.35 = 13/1.5566. There's clearly a bottleneck, but IDK what it is. Touching fewer regs in the other 2 fused-domain uops makes it possible to micro-fuse indexed addressing modes and still run at 1c / iter on SKL.

HSW runs it at 1.6327c per iteration. Input registers per clock: 6.12 total, 4.29 cold, 1.83 hot. Total non-flag regs read+written per cycle: 7.96 = 13/1.6327.

uops_issued.stall_cycles shows that the front-end stalled instead of issuing a group of less than 4 (on HSW and SKL).

Reducing the number of cold inputs regs in different ways has different effects, so it's not as simple as just a bottleneck on that.

  • Changing the addressing mode on the second add to just [rdi], HSW runs it at 1.631c / iter. (very slightly faster than indexed)
  • Changing the CMP r11,r12 to TEST r11,r11 has no effect (same 1.6327c / iter)
  • Changing the CMP r11,r12 to CMP r11, 0 speeds it up to 1.594c / iter.
  • Changing the CMP r11,r12 to CMP r9d, 1 also speeds it up to 1.594c / iter (even though r9d is also read by sub, so it's not like P6-family register-read stalls where reading the same cold reg twice doesn't use extra resources)
  • Changing the CMP r11,r12/jne to CMP rdx,0/jl speeds it up to 1.35c / iter. (rdx was written by the previous ADD uop, so this macro-fused uop has no cold inputs anymore)
  • Using sub ecx,1 instead of sub ecx,r9d, HSW runs it at 1.3895c/iter +-0.0001. Input regs per clock: 6.48 total, 4.31 cold, 2.15 hot. Total non-flags read+written: 8.64/c = 12/1.3895c.

The results of these different changes are similar on SKL; things that speed up HSW significantly also speed up SKL.

I'm not sure if it matters whether input registers are cold or not (read from the PRF vs. forwarded from a not-yet-executed uop), or whether there's a different cause for what I'm seeing. Further testing is needed. Interesting things that could be tested:

  • micro-fused FMA with a base+index addressing mode should be a 4-input fused-domain uop. (or maybe this will be unlaminated)
  • On Skylake, ADCX / ADOX if they micro-fuse. (ADC doesn't, according to the instruction tables). Or even just ADC r,r might be interesting.
  • Does add r,r matter vs. andn r,r,r? I'm guessing not, since register renaming turns a RMW of an architectural register into a write to a new physical register anyway.
   
Haswell register renaming / unfused limits
Author: Tacit Murky Date: 2017-05-11 13:33
Very interesting, thanks. Maybe by replacing «inc ebx» in your 1st example with «mov [non-conflicting-address], r» you can get 7 unfused µops and 11 GPR reads per clock — if the hardware allows this.
   
Haswell register renaming / unfused limits
Author:  Date: 2017-05-12 20:22
Agner, your insn table says cmovcc r,m and adc r,m don't micro-fuse at all on HSW/SKL, but that doesn't match my experiments. They do micro-fuse on both HSW and SKL. (I didn't check SBB r,m).

I assume indexed addressing modes for cmov/adc are still fused in the decoder and un-laminated later, but I didn't check that. All I can see is that they're not micro-fused when they issue/retire.

I just made a major update to stackoverflow.com/questions/26046634/micro-fusion-and-addressing-modes, after testing things on HSW and SKL.

Peter Cordes wrote:

Interesting things that could be tested:
  • micro-fused FMA with a base+index addressing mode should be a 4-input fused-domain uop. (or maybe this will be unlaminated)
  • On Skylake, ADCX / ADOX if they micro-fuse. (ADC doesn't, according to the instruction tables). Or even just ADC r,r might be interesting.
Answer: FMA/ADC/CMOV on HSW and SKL are un-laminated with indexed addressing modes, so we can't have 4-input fused-domain uops.

This applies even to ADC/CMOV on Haswell, where they decode to 2 uops. So that's weird. I'm guessing they simply left those instructions alone from IvyBridge; maybe they ran into deadlines and didn't have time to change them until Broadwell. i.e. maybe they decided not to invest time in getting 3-input micro-fused uop support right when they knew they really wanted to make the register-source version a single uop (which would behave like FMA and un-laminate indexed addressing modes).

Unanswered questions: does un-lamination happen before the IDQ, or only at issue?

---------------

Re: Tacit Murky's suggestion to use a store to achieve 7 unfused-domain uops per clock: Good idea, this worked. Surprisingly, it even got it to run at 1.0 iterations per clock on SKL, with none of the stores stealing p23 from the loads.

.loop:   ; HSW: 1.12c / iter.  SKL: 1.0001c
    add   edx, [rsp]
    mov   [rax], edi
    blsi  ebx, [rdi]
    dec   ecx
    jnz .loop

SKL: 7 unfused uops per clock. HSW: 6.25. Register-reads per clock: 6 (not counting flags) total on SKL.

In my previous testing, I had assumed 32 vs. 64b operand-size didn't matter. But this loop runs at 1 iter per 1.12c with a 64b add, vs. 1.000c with a 32b add, on SKL. Totally bizarre. All three memory ops are in separate cache lines. I forget if that mattered.

The store has to be a simple addressing mode to run on port7, which is of course essential. IDK why HSW only runs this at 1.12c per iter, not nearly as close to 1.00 as SKL.

blsi r, [r+r] is 2 fused-domain uops, which is unexpected. (Changing it to an add is also a slowdown, I think because of reading the destination register).


With maximum register-reads:

.loop:   ; HSW: 1.75c  SKL: 1.42c
    add  edx, [rsp+rsi]
    mov  [rax], edi    ; an indexed store brings us up to HSW: 1.90c  SKL: 1.55c
    add  ebx, [rdi+r8]
    sub  ecx, r9d      ; r9d = 1
    jnz .loop

Register reads per clock: HSW: 10/1.75 = 5.71 /c total. SKL: 7.04/c total. Or with an indexed store: HSW: 5.79/c total GPRs read, SKL: 11/1.55 = 7.08/c.

-------------

To test for issue/rename bottlenecks vs. execution bottlenecks, I could make the loop longer and have a section of all-micro-fused instructions, and then a section of "easy" instructions. So the OOO core can easily keep up on average if the front-end issues 4 fused-domain uops per clock. But to do that, it would have to issue 8 unfused uops in a single cycle without stalling if there are at least 7 micro-fused uops in a row. I'll try that later, when I have time to get back to this.

   
Instruction Throughput on Skylake
Author: T Date: 2016-08-08 01:57
Thank you very much for that. It is really interesting and implies that compilers and assembly writers should tune differently for Haswell vs. Skylake. I wonder if icc has been updated to reflect it?
   
Unlamination of micro-fused ops in SKL and earlier
Author:  Date: 2016-09-09 19:36
There is an interesting effect which changed in Skylake (or at least some architecture after Sandy Bridge, up to and including Skylake), but isn't covered in your manual. It concerns the behavior of micro-fused instructions with *complex* memory source or destination operands. Here complex means with base and index registers, so something like

add rax, [rbx + rcx]

In Sandy Bridge, this doesn't seem to micro-fuse in the same way as simpler addressing modes such as:

add rax, [rbx + 16]

In particular, while it seems that the complex addressing modes fuse in the uop cache, the constituent ops are later "unlaminated" and consume rename and retirement resources. This means that you cannot achieve 4 micro-fused uops/cycle throughput with these addressing modes. The Intel optimization doc does touch on it briefly in 2.3.2.4 Micro-op Queue and the Loop Stream Detector (LSD):

In particular, loads combined with computational operations and all stores, when used with indexed addressing, are represented as a single micro-op in the decoder or Decoded ICache. In the micro-op queue they are fragmented into two micro-ops through a process called un-lamination, one does the load and the other does the operation. A typical example is the following "load plus operation" instruction:

ADD RAX, [RBP+RSI]; rax := rax + LD( RBP+RSI )

The Intel section is a bit unclear because they don't make it very explicit that this only applies to indexed addressing modes, and that if you don't use indexed addressing you can potentially achieve higher throughput.

This issue could be pretty critical for optimization of high IPC loops, on a par with many similar issues covered in your doc. In particular, it means jumping through a few hoops to be able to use a simpler addressing mode could be worth it - beyond the latency benefits already documented in your guide (and beyond the ability to use port 7 AGU for store address calculation as well).

It might be nice to add it to your doc! There is an extensive investigation on this stackoverflow question, which is what prompted me to post here. See in particular the answer from Peter Cordes, who shows the issue on Sandy Bridge. In another answer I have some tests that show the limitation is removed on Skylake, but we don't know exactly in which arch it was removed. The Intel doc is mostly silent on that topic (unlamination is only discussed in the one SnB-specific section I quoted above). If you have some other machines at your disposal, I have some code here that makes it easy to test the behavior (on Linux).

   
32B store-forwarding is slower than 16B
Author:  Date: 2017-05-11 10:37
Your microarch manual says that store-forwarding latency is 5c on Skylake for operand sizes other than 32/64b. I can confirm 5c for 128b vectors, but I've found that 256b store-forwarding is 6c on Skylake. I see your instruction tables already reflect this, so it's just a wording error in the microarch guide.

Also, in your instruction tables, you say that splitting up the store-forwarding latency between stores and loads is arbitrary. I disagree: It would be nice if loads listed the L1 load-use latency (from address being ready to data being ready). I don't think this is the case currently (e.g. you list Merom/Wolfdale/NHM/SnB's mov r,m as 2c latency, which is unreasonably low.)

If there are any CPUs where store-forwarding is faster than L1 load-use latency, that would mean negative latency for stores. But that's not the case on any x86 microarchitecture, I think.

----

While testing this on HSW and SKL, I found something weirder: an AVX128 load into an xmm register (zero-extending to 256) has an extra 1c of latency when read by a 256b instruction.


  • SKL: 12c for 3x dependent vmulps (xmm or ymm). HSW:15
  • 17c for 3x vmulps xmm and store/reload xmm. HSW:21. SF=5c/6c
  • 18c for 3x vmulps ymm and store/reload xmm. HSW:21 SF=6c/6c, or is it 5+1c?
  • 18c for 3x vmulps xmm and store/reload ymm. HSW:22 SF=6c/7c
  • 18c for 3x vmulps ymm and store/reload ymm. HSW:22 SF=6c/7c



vxorps xmm0,xmm0,xmm0
.loop:
vmulps ymm0, ymm0,ymm0
vmulps ymm0, ymm0,ymm0
vmulps ymm0, ymm0,ymm0
vmovaps [rdi], xmm0 ; This is the weird case for SKL: xmm store/reload with ymm FPU
vmovaps xmm0, [rdi]
dec ecx
jnz .loop

Also strange: with the mulps instructions commented out, I'm seeing SKL run the loop at only ~6.2c to 6.9c per iteration for *just* ymm store->reload with no ALU, rather than the expected 6.0c. So is there a limit to how often a 256b store-forward can happen? With xmm store/reload (and just a dec/jnz), the loop runs at one per 5.0c best case, sometimes as high as 5.02c per iter.

Same pattern for integer vectors: SKL doesn't benefit from narrowing the store/reload to xmm when the ALU loop is using ymm.

9c for 3x vpermd ymm SKL and HSW
15c for that + store/reload xmm (SKL and HSW). SF latency = 6c. (or 5+1c / 6c?)
15c for that + store/reload ymm SKL, 16c HSW. (movaps or movdqa). SF lat = 6c SKL, 7c HSW.

3c for 3x vpunpckldq ymm or xmm (SKL/HSW)
8.08 to 8.23c for vpunpck xmm + store/reload xmm. 9c HSW. SF=5.15c / 6c. (stabilizes to 5c / 6c with a longer ALU dependency chain between store/reload)
9c for vpunpck ymm + store/reload xmm (SKL). 9c HSW. SF=5+1c? / 6c
9c for vpunpck xmm + store/reload ymm. 10c HSW. SF=6c / 7c
9c for vpunpck ymm + store/reload ymm (SKL). 10c HSW. SF=6c / 7c

Using vmovaps vs. vmovdqa made no difference for either ivec or FPU instructions. rdi is pointing to a 64B-aligned buffer in the BSS.

So I'm seeing unstable results on SKL for doing a 128b store-forwarding with only 3c of ALU latency between the load and doing another store to the same address. Inserting more shuffles so fewer store-forwardings need to be kept in-flight stabilizes things so the store-forwarding latency is the expected 5.0c. HSW doesn't have that problem.

If the first shuffle is xmm and the others are ymm, then xmm store/reload only has 5c latency on SKL. So there's no extra latency for an ALU instruction to zero-extend, but there is for a load?

   
32B store-forwarding is slower than 16B
Author:  Date: 2017-06-28 18:33
I believe this is, ultimately, an artifact of how 256-bit vectors are implemented internally. Namely, I believe that the lower and upper 128-bit data paths are 1 cycle offset from each other (the upper datapath issues its half of the instruction one cycle later than the lower datapath does). [This helps switching between true 256-bit execution and 256bit-cracked-into-two-128bit-ops execution, because the load operand etc. timings are the same in both cases; this should simplify the load path and the bypass network.]

This is also my leading guess for the explanation of the 3-cycle latency of operations that cross the 128-bit halves: the lower and upper 128 bits are not only skewed in time, they are also separate bypass domains. So potentially cross-128b operations like vextracti128 have 1 extra cycle of latency purely from upper half of the input being available 1 cycle later than the lower half, and another extra cycle cross-domain bypass delay to shuttle the result from the upper bypass to the lower datapath.

Anyway, all of this is speculation, but if correct, then while 256-bit stores have full throughput (when running in 256-bit mode anyway), the second half of their data arrives in the designated store buffer slot one cycle later, and the store buffer is only marked as "data available" (with the values available for forwarding) once both halves have arrived. Thus the extra 1-cycle forwarding delay.

   
32B store-forwarding is slower than 16B
Author: Agner Date: 2017-06-28 23:57
Fabian Giesen wrote:
I believe this is, ultimately, an artifact of how 256-bit vectors are implemented internally. Namely, I believe that the lower and upper 128-bit data paths are 1 cycle offset from each other
I have found no evidence of the two 128-bit lanes being offset by one clock. Why would they do this if all execution units are 256 bits? It's a matter of physical distances on the chip. I think that all units belonging to the same 128-bit lane are clustered together to minimize the length of data paths within the same lane. Any instruction that transports data between different 128-bit lanes has an extra clock cycle delay for moving data from one lane-cluster to another. I guess the 256-bit store somehow uses the permute machinery or some lane-crossing paths even though the write port is 256 bits.
   
SHL/SHR r,cl latency is lower than throughput
Author:  Date: 2017-05-27 17:00

Your table lists variable-count SHL and SHR as 2c throughput, 2c latency. It appears that the 2c latency is only for flags. My results match yours for consecutive SHL instructions, but SHL is faster if surrounded by instructions that write all flags without reading them. (This is one case where ADD 1 is preferable to INC). In that case, it can achieve 1.5c throughput.

For SHL r,cl the latency from r to r, and from cl to r, is much less than 2c. (I measure more than 1c, but maybe only because of resource conflicts). I think only one of the three p06 uops is the actual shift that writes the dest reg (probably the same internally as SHLX/SHRX), while the other two are purely for flag-handling. We know it's 2c from input-flags -> output-flags, but I didn't measure the latency from r or cl to flags.

I think the instruction table should say: lat=1 tput=1.5 with a note saying "EFLAGS dependency limits throughput to 2c for consecutive shifts, and resource conflicts raise the average latency for the register operands". That's a lot to stick in a note, but 2c/2c does not reflect the performance in real use-cases very well at all. It's still a lot worse than SHLX, but not as bad as that.


    mov eax, 1000000000    ; iteration count
    mov ecx, 3
align 32
.loop:
    add edx, 1
    add edx, 1
    shl edx, cl
    add edx, 1
    add edx, 1

    sub rax, 1
    jnz .loop

perf counters from an otherwise-idle i7-6700k, using ocperf.py
5,228,964,721 cycles:u # 3.841 GHz
7,000,000,418 instructions:u # 1.34 insn per cycle
1,000,000,412 branches:u # 734.565 M/sec
8,000,128,015 uops_issued_any:u # 5876.614 M/sec
8,000,101,258 uops_executed_thread:u # 5876.594 M/sec

Without the SHL, the loop of course runs at the expected 4c per iter. The SHL slows it down by 1.229 cycles, not 2. Haswell goes from 4c to 5.296c, so the slowdown is higher (~1.30 instead of ~1.23).

With 13 dependent ADD instructions and one SHL in the loop, Skylake goes from 13c to 14.35c, but Haswell goes from 13c to 14.19c. So it's very weird and inconsistent, with Haswell seeing lower SHL latency the more infrequent they are, but SKL doing better when they're more frequent.

Results are fairly similar for SHL ecx, cl (so the shift-count input doesn't need to be ready early).

I was also able to hit 1.5c throughput for independent shifts with the same count by breaking SHL's flag input-dependency:


.loop:
    shl r8d, cl
    add ebx, 1     ; xor edx,edx also works here
    shl r9d, cl
    add esi, 1
    shl r10d, cl

    sub eax, 1     ; not DEC
    jnz .loop

5,000,450,873 cycles:u # 3.898 GHz
7,000,000,393 instructions:u # 1.40 insn per cycle
1,000,000,387 branches:u # 779.520 M/sec
12,000,132,094 uops_issued_any:u # 9354.338 M/sec
12,000,102,844 uops_executed_thread:u # 9354.315 M/sec

Results are the same on HSW and SKL to within measurement error. 5c per iteration with 3 SHL instructions in the loop is 1.666c throughput, bottlenecked on p06 throughput (including the loop-branch which has to run on p6). 3*3 + 1 = 10 p06 uops, which takes at least 5 cycles to execute.

Be careful of uop-cache issues when testing: making the loop longer can create a situation where it bottlenecks on the front-end because it's too dense to fit in the uop cache. e.g. adding another xor/shl pair makes a loop of 16 fused-domain uops that works as expected on SKL: 6.5 cycles per iter to execute 13 p06 uops, even though they're coming from the legacy decoders. But HSW only manages 8c throughput, apparently bottlenecked on the front-end. Using long instructions like ADD rsi, 12345 (7 bytes), and putting redundant REP prefixes on the add and shift instructions, restores performance on HSW as soon as the loop fits in the uop cache and can issue from the LSD.

   
Test results for Broadwell and Skylake
Author:  Date: 2017-05-30 12:31
More than a year ago, I wrote you that Skylake will have single-issue AVX-512. Here are a few more details on why I was led to this conclusion (partly copied from my post on AnandTech):

I can give you details about AVX-512 - they are pretty obvious from an analysis of Skylake's execution ports. So:

1) AVX-512 is mainly single-issue. All the AVX commands that are now supported on BOTH port 0 & port 1 will become AVX-512 commands supported on the joined port 0+1.

2) A few commands that are supported only on port 5 (these are various bit shuffles) will also be single-issued in AVX-512, which still means doubled performance - from single-issued AVX-256 to single-issued AVX-512.

3) A few commands that can be issued on any of 3 ports (0, 1, 5), including booleans and add/sub/cmp - the so-called PADD group - will be double-issued in AVX-512, so they will get a 33% uplift.

Overall, ports 0 & 1 will join when executing 512-bit commands, while port 5 is extended to 512-bit operands. The joined port 0+1 can execute almost any AVX-512 command except for the bit shuffle ones; port 5 can execute bit shuffles and the PADD group.

---------

When going from SSE to AVX, Intel sacrificed ease of programming for ease of hardware implementation, resulting in an almost complete lack of commands that can exchange data between the upper & lower parts of a ymm register (so-called lanes). AVX-512 was done right, but this means that bit shuffle commands require a full 512-bit mesh. So Intel moved all these commands to port 5, making it the only full 512-bit port, while most remaining commands were moved onto ports 0 & 1, where a 512-bit command can be implemented as a simple pair of 256-bit ones.

Looking at power budgets, it's obvious that a simple doubling of execution resources (i.e. support of 512-bit commands instead of 256-bit ones) is impossible. In the previous CPU generation, even AVX commands increased energy usage by 40%, so it's easy to predict that extending each executed command to 512 bits would require another 80% increase.

Also, it's easy to compare Skylake with Broadwell and see many strange changes:

1) The Intel microarchitecture implements SIMD commands on ports 0/1/5 and usually tries to spread commands equally among these 3 ports to increase final performance. But Skylake is much more asymmetric in that regard - it implements all but the bit shuffle commands on ports 0 & 1.

2) Skylake tries to implement commands on BOTH ports 0 & 1 with maniacal diligence, including such rarely-used commands as PMUL and PCMPGTQ. As a result, PCMPGTQ throughput was quadrupled! And PMUL is now supported by 2 ports while scalar MUL is only on one. You will find many more examples, while only extremely expensive commands like division didn't get doubled throughput.

3) When Intel added AVX/AVX2 in SB/HW, it decreased the throughput of some commands - e.g. Nehalem had double-issue both for bit shuffle and bit-combine commands, while SB/HW reduced their throughput to 1. So if Skylake were going to add AVX-512 support, it might be expected to do the same (i.e. reduce the throughput of rarely used commands), again to reduce the power/transistor budget. But in practice, it doubled the throughput of many commands while keeping the single-issue throughput of shuffles. The idea that ports 0 & 1 will co-execute 512-bit commands while port 5 will extend all its commands to 512 bits explains excellently why this was done, while the idea that everything will just be extended to 512 bits fails miserably.

So once I read the Intel optimization manual and thought a while, it became obvious. Moreover, I believe that Skylake implemented all 4 ISA extensions that Intel has marketed (SGX/MPX/SHA/AVX3), but they were not enabled earlier due to marketing/market-slicing requirements. Intel just needs a counter-weapon against Ryzen, so it didn't show all of Skylake's strength in 2015 when its position was already strong.

---------

Of course, microarchitecture analysis can't say anything about commands absent from the AVX2 set, so my guess is that predicate register manipulations will also go to port 5, just to make the microarchitecture a bit less asymmetric.

Also, it's easy to predict that in the next generations the first "improvement" will be to add FMAD capability to port 5, further doubling the marketing performance figures.

   
Test results for Broadwell and Skylake
Author: Agner Date: 2017-05-30 12:49
Bulat Ziganshin wrote:
more than year ago, i wrote you that skylake will have single-issue avx-512.
I think this kind of speculation is unsound if you have no inside information.
   
Test results for Broadwell and Skylake
Author:  Date: 2017-05-30 16:24
I think we will see that in a few weeks :) Please keep that message, so we can compare it to the facts.

I have no insider info, just thorough knowledge of all these microarchitectures, from your and Intel's manuals. As you see, my analysis rests on the strange aspects of the SKL microarchitecture - the proposed implementation perfectly explains them all.

SKL doubled and sometimes even quadrupled the throughput of many commands in order to make ports 0 & 1 highly symmetric, and this doesn't make any sense other than preparing these ports to perform 512-bit commands in tandem.

SKL moved all but the shuffle commands to ports 0 & 1 - and I think that is because only shuffle commands cannot be split into two 256-bit subcommands, so only these commands require a port with full 512-bit capability, and they dedicated port 5 to that task.

Yes, my explanation is highly speculative, but I don't see any other explanation for all these changes, which made AVX-256 execution less efficient (because most commands are now executed only by ports 0 & 1), nor for why many rare commands got higher throughput. If Intel planned to just extend each 256-bit command to 512 bits, they would, on the contrary, reduce the throughput of rarely-used commands (as was done in SB/HW compared to Nehalem), and keep ports 0/1/5 equally populated.

Just one question - do you agree that the SKL changes compared to HW are strange, and either decrease performance (moving most commands to ports 0 & 1) or add more hardware for a tiny speedup (implementing almost everything on BOTH ports 0 & 1)?

Btw, one hint is that Intel claims their 18-core CPU will outperform 1 TFLOPS. If SKL-X can perform two 512-bit FMA commands per CPU cycle, they may easily claim breaking the 2 TFLOPS barrier.

   
Test results for Broadwell and Skylake
Author:  Date: 2017-06-19 20:22
> so we can compare it to the facts

So we have some info out now: www.anandtech.com/show/11550/the-intel-skylakex-review-core-i9-7900x-i7-7820x-and-i7-7800x-tested/3

From the article:

> Nominally the FMAs on ports 0 and 1 are 256-bit, so in order to drive towards the AVX-512-F these two ports are fused together, similar to how AVX-512-F is implemented in Knights Landing. The six-core and eight-core Skylake-X parts support one fused FMA for AVX-512-F, although the 10-core will support dual 512-bit AVX-512-F ports, which seems to be located on port 5. This means that the 10-core i9-7900X can support 64 SP or 32 DP calculations per cycle, whereas the 8-core/6-core parts can support 32 SP or 16 DP per cycle.

I don't recall Intel ever doing anything similar for product segmentation in the past, so limiting execution ports on cheaper SKUs seems to be a first to me.
Anyway, it sounds like you were on the ball, except that port 5 can also do FMA on higher SKUs, for more FLOPS.

   
Test results for Broadwell and Skylake
Author:  Date: 2017-06-20 12:18
- wrote:
> so we can compare it to the facts


Based also on Bulat Ziganshin's comments, I'd find a 3-uop dual AVX-512 issue (2 256-bit uops on ports 0-1 + 1 512-bit uop on port 5) highly unlikely. I'd assume the 256-bit units on ports 0-1 were kept mostly intact (except for some extra vector-mask logic required for AVX-512) on the six- and eight-core variants, whereas both ports were expanded to 512 bits for the 10-core variants (since, as mentioned before, most functionality was "cloned" between them in Skylake's first iteration).

Also, let's recall that Knights Landing doesn't "fuse ports" for AVX-512 execution, since both per-core VPUs are 512 bits wide :)

   
Test results for Broadwell and Skylake
Author:  Date: 2017-06-20 14:37
Adding on to my previous comment: the point is that, short of some ambiguous decoding scheme for AVX-512 (1 512-bit uop vs. 2 256-bit uops), which would depend entirely on efficiently monitoring which port (the combined port "0+1" or port 5) would be available sooner, it would look more like a Pentium M/Core multiple-port, single-execution-unit scheme (i.e. where some instructions can be dispatched through multiple ports but share some common execution units). The "combined port 0+1" concept would be very similar to the Pentium M/Core scheme, which, as stated in Agner's manuals, can lead to some performance issues when mixing dual-port (combined) against single-port instructions, mainly when the shared execution units are used.

Or there is even one last option (but I'd bet it's highly unlikely too): only one of the 256-bit units (maybe port 0) being promoted to 512 bits on the lower-end models, and both on the higher-end models.

   
Test results for Broadwell and Skylake
Author:  Date: 2017-06-21 12:49
The Russian review https://3dnews.ru/954174 has, as usual, more thorough low-level benchmarks than AnandTech. In particular, an important test: https://3dnews.ru/assets/external/illustrations/2017/06/19/954174/avx-512.png

As we can see there, FP computations got almost a 2x speedup, while INT got only 20-40% improvements.

I think the latter result lines up perfectly with my prediction - port 5 was extended to 512 bits, so bit shuffling becomes 2x faster, and the PADD group got a 33% boost. I expected a 10-20% overall speedup, but probably new AVX-512 features (new instructions, built-in masking) further improved the performance.

My earlier prediction was: "also it's easy to predict that in the next generations the first "improvement" will be to add FMAD capability to port 5, further doubling the marketing performance figures"

I didn't expect it in the Skylake generation due to the excessive TDP increase (as we know, even using AVX2 on previous generations increased TDP by 40%, so two full-featured AVX-512 ports should increase TDP by a *further* 80%!). Nevertheless, they did exactly that, and ran into exactly the TDP problems you'd expect.

Note that from the 3dnews test we can conclude that port 5 gained only an FMA engine, and no other AVX-512 operations (apart from the mere widening of the AVX2 operations already present on this port).

So I can say my speculation turned out to be 200% right :)


But stepping back over everything we know, it seems that from a technical viewpoint Skylake is a total mess! The SKL architecture I predicted was a compromise: it added as little hardware unused in AVX-256 mode as possible, but still had AVX-512 support. It was a great step toward future processors: add 512-bit support for forward compatibility, but don't invest heavily in AVX-512-only hardware until more 512-bit programs arrive. To reach this goal, they made some changes that were bad for AVX2 programs (see my second post).

But when they added the second FMA512 engine, this became meaningless. Now we have a design that both limits AVX2 performance and has a lot of hardware unused in AVX2 mode! By simply extending the Haswell engines 2x, they could have got a slightly higher transistor count and much better AVX-512 performance.

I think this is the result of marketing games: SKL-S already had AVX-512 support (without the second FMA engine, though), but they decided to disable it on all SKUs. The newer SKL-X added the second engine, but enabled it only on selected SKUs, so the i7 provides exactly the architecture I predicted (and probably it was their plan B: use SKL-S cores with a single FMA engine for HEDT/Xeon products).

Now we can also see why SKL-S reduced the L2$ associativity to 4. It was preparation for increasing the cache size: the SKL-S cache is just a quarter of the SKL-X cache with the same organization, and the reduced associativity allowed them to cut the transistor budget of the massive 1 MB cache. This is a sign that the SKL-X core is a much smaller modification of the SKL-S core than one might think at first sight.

   
Test results for Broadwell and Skylake
Author:  Date: 2017-06-26 08:35
Bulat Ziganshin wrote:
russian review https://3dnews.ru/954174 , as usual, has more thorough low-level benchmarks than anand. In particular, important test: https://3dnews.ru/assets/external/illustrations/2017/06/19/954174/avx-512.png

Notice from https://3dnews.ru/assets/external/illustrations/2017/06/19/954174/cpuz-1.png that the (probably Xeon-only) AVX512BW, DQ and VL extensions are missing. It's also still unknown how much of the integer performance increase was due to the improved gather/scatter and the bigger L2$.
Maybe one could expect some Sandy Bridge/Ivy Bridge-style doubled FP performance (with almost the same integer performance) on ports 0 and 1, plus a 512-bit shuffle on port 5. This might be a more reasonable decision, since it would require a minimal amount of non-AVX-512 hardware, as opposed to a full 512-bit FMA on port 5.
I'm still wondering how an asymmetrical (256-bit uOps vs. a 512-bit uOp) decoding scheme could be made to work for AVX-512 across ports 0+1 and 5.
   
Test results for Broadwell and Skylake
Author:  Date: 2017-07-05 21:07
I came across an instruction latency dump for the 7900X (10-core, dual-issue 512-bit FMA): users.atw.hu/instlatx64/GenuineIntel0050654_SkylakeX_InstLatX64.txt

From a cursory scan, AVX-512 actually looks mostly dual-issue on this CPU. In general, 512-bit instructions have the same throughput as their 256-bit counterparts, except for instructions implemented on 3x 256-bit ports, which get "reduced" to 2x 512-bit ports.
Presumably the lower SKUs are mostly single-issue?

It's also interesting to note that the K* mask instructions appear to be implemented on a single port only.

   
Test results for Broadwell and Skylake
Author:  Date: 2017-07-12 00:13
Slide from Intel: https://www.pcper.com/image/view/83900?return=node%2F68093
Also from that article, it's interesting to note that the reduced AVX clocks depend on the type of instruction as well; presumably this means that 256-bit integer AVX2 code won't be throttled, as opposed to 256-bit FP code.

Intel's optimization manual has also been updated, with more details: https://software.intel.com/sites/default/files/managed/9e/bc/64-ia-32-architectures-optimization-manual.pdf

a 1MB L2 cache and an additional Intel AVX-512 FMA unit on port 5 which is available on some parts.

Since port 0 and port 1 are 256-bits wide, Intel AVX-512 operations that will be dispatched to port 0 will execute on both port 0 and port 1; however, other operations such as lea can still execute on port 1 in parallel. See the red block in Figure 2-3 for the fusion of ports 0 and 1.

Notice that, unlike Skylake microarchitecture for client, the Skylake Server microarchitecture has its front end loop stream detector (LSD) disabled.

The guide also provides an example of how to detect chips with 1 vs. 2 FMA units (section 13.20), which seems to compare shuffle+FMA throughput against FMA-only throughput (apparently it cannot be detected via CPUID :O).

Also interesting to note is that mixing 256-bit and 512-bit instructions causes the CPU to run in "512b port mode" (section 13.19), where the 256-bit instructions only get the throughput of the equivalent 512-bit instruction.

   
Test results for Broadwell and Skylake
Author:  Date: 2017-07-19 06:52
- wrote:
https://software.intel.com/sites/default/files/managed/9e/bc/64-ia-32-architectures-optimization-manual.pdf

Also interesting to note is that mixing 256b and 512b instructions causes the CPU to run in '512b port mode' (section 13.19), where the 256b instructions only get the throughput of the equivalent 512b instruction.

Also in 13.19: "The maximum register width in the reservation station (RS) determines the 256 or 512 port scheme."
I guess this was the adopted solution for avoiding vector stalls on port 1 when port 0 is in use under the port 0+1 AVX-512 scheme, even though it puts higher stress on port 5, as noted in the manual.
   
Test results for Broadwell and Skylake
Author:  Date: 2017-06-28 09:54
Has anybody got their hands on a 7900X? Could you please run a test on the gather/scatter performance?
I'm interested to know whether the throughput of the AVX2/AVX-512 gather instructions has improved.
   
Test results for Broadwell and Skylake
Author:  Date: 2017-06-29 11:24
It would be pretty surprising if gather performance improved much, since at least since the original Skylake it has been pretty much at 0.5 cycles/element, which is the limit of the DCU. So only by adding extra load hardware (quite expensive), or by optimizing certain gathers with identical or nearby elements (e.g., from the same cache line), could I see the throughput going up much.

The latter approach probably doesn't cost too much, but only benefits certain workloads and not the general gather capability.

   
Test results for Broadwell and Skylake
Author:  Date: 2017-06-30 03:21
According to the following URL:
techreport.com/review/32111/intel-core-i9-7900x-cpu-reviewed-part-one

Skylake-X is able to handle two 64-byte loads per cycle, so there is a chance the throughput of gather can be improved.

   
Test results for Broadwell and Skylake
Author:  Date: 2017-07-13 13:07
Xing Liu wrote:
According to the following URL:
techreport.com/review/32111/intel-core-i9-7900x-cpu-reviewed-part-one

SKYLAKE-X is able to handle two 64-byte loads per cycle, so there is a chance the throughput of gather can be improved.

Not really, because it's still two loads; only the load width has been increased to 64 B. The gather implementations are limited by the *number of loads*, not by the width of the loads (indeed, even 32 bytes was already much wider than the largest gather element of 8 bytes).

Gather pretty much runs at 2 loads/cycle on the existing implementations, so unless you break that barrier (i.e., go to 3 load ports) you are very unlikely to see gather perform better than that in the general case. What you might see first are optimizations for special cases of adjacent or overlapping elements, but that's more or less orthogonal to load width.

   
Official information about uOps and latency SNB+
Author: SEt Date: 2017-07-17 20:41
It looks like Intel has released some information about the inner workings of Sandy Bridge and newer CPUs: https://reviews.llvm.org/rL307529

Is it indeed accurate? Should the instruction tables manual be updated with that information?