/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H

/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 */

#include <linux/compiler.h>
#include <asm/fpu/api.h>
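
/*
 * Each xor_avx_N() routine below defines its own BLOCK(offset, reg) that
 * XORs one 32-byte chunk through YMM register 'reg'.  BLOCK4() and BLOCK16()
 * simply tile that macro across ymm0-ymm3 so that one expansion covers a
 * full 512-byte line.
 */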
#define BLOCK4(i) \
		BLOCK(32 * i, 0) \
		BLOCK(32 * (i + 1), 1) \
		BLOCK(32 * (i + 2), 2) \
		BLOCK(32 * (i + 3), 3)

#define BLOCK16() \
		BLOCK4(0) \
		BLOCK4(4) \
		BLOCK4(8) \
		BLOCK4(12)
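
/*
 * XOR one source buffer (p1) into the destination (p0), 512 bytes per loop
 * iteration: each BLOCK() loads 32 bytes of p1 into a YMM register, XORs in
 * the matching 32 bytes of p0 with vxorps, and stores the result back to p0.
 * The YMM registers may only be touched between kernel_fpu_begin() and
 * kernel_fpu_end().
 */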
static void xor_avx_2(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
	}

	kernel_fpu_end();
}
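
/* Same as xor_avx_2(), but folds two source buffers (p1, p2) into p0. */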
static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
	}

	kernel_fpu_end();
}
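
/* Same pattern with three source buffers (p1, p2, p3) XORed into p0. */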
static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2,
		      const unsigned long * __restrict p3)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
	}

	kernel_fpu_end();
}
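
/* Same pattern with four source buffers (p1..p4) XORed into p0. */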
static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2,
		      const unsigned long * __restrict p3,
		      const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
		p4 = (unsigned long *)((uintptr_t)p4 + 512);
	}

	kernel_fpu_end();
}
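
/*
 * Dispatch table handed to the generic xor code; the generic layer picks
 * the do_N() helper that matches the number of source buffers.
 */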
static struct xor_block_template xor_block_avx = {
	.name = "avx",
	.do_2 = xor_avx_2,
	.do_3 = xor_avx_3,
	.do_4 = xor_avx_4,
	.do_5 = xor_avx_5,
};
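
/*
 * Only benchmark the AVX template when both AVX and OSXSAVE are available,
 * i.e. the CPU supports AVX and the OS has enabled XSAVE so the YMM state
 * is actually managed across context switches.
 */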
#define AVX_XOR_SPEED \
do { \
	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \
		xor_speed(&xor_block_avx); \
} while (0)
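
/*
 * AVX_SELECT() below lets the arch xor.h wrapper prefer the AVX template
 * over whatever the generic benchmarking chose, falling back to FASTEST
 * when AVX/OSXSAVE are not available.  A typical (illustrative) use:
 *
 *	#define XOR_SELECT_TEMPLATE(FASTEST) AVX_SELECT(FASTEST)
 */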
#define AVX_SELECT(FASTEST) \
	(boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST)

#endif