/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/cpufeature.h>
#include <asm/dwarf2.h>
#include <asm/alternative-asm.h>

/*
 * We build a jump to memcpy_orig by default which gets NOPped out on
 * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
 * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
 * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
 */

.weak memcpy

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 */
ENTRY(__memcpy)
ENTRY(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

	movq %rdi, %rax
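	/*
	 * Copy count/8 quadwords with REP MOVSQ, then the remaining
	 * count%8 bytes with REP MOVSB. RAX already holds the original
	 * destination, which is memcpy's return value.
	 */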
	movq %rdx, %rcx
	shrq $3, %rcx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
ENDPROC(memcpy)
ENDPROC(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
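/*
 * (A note on why this is enough: on CPUs advertising ERMS, the
 * microcode optimizes a single byte-granular REP MOVSB internally,
 * choosing its own block sizes, so no manual quadword splitting or
 * unrolling is worth doing here.)
 */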
ENTRY(memcpy_erms)
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	ret
ENDPROC(memcpy_erms)

ENTRY(memcpy_orig)
	CFI_STARTPROC
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * We check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
	cmp  %dil, %sil
	jl .Lcopy_backward
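	/*
	 * As I read this check: only the low bytes of the two pointers
	 * are compared, as a cheap proxy for aliasing. If the source's
	 * low byte is below the destination's, loads in a forward copy
	 * could falsely appear dependent on earlier stores (the hardware
	 * disambiguates on low address bits), so copy backward instead.
	 */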
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop
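	/*
	 * MOV and LEA do not touch the flags, so the JAE above still
	 * tests the carry from the "subq $0x20, %rdx" at the loop top:
	 * iterate while at least another 32 bytes remain (no borrow).
	 */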
	addl $0x20, %edx
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
	.p2align 4
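	/*
	 * (.p2align 4 pads to a 16-byte boundary; as I read the note
	 * above, the idea is that the padding NOPs land in the fetch
	 * chunk ahead of the loop rather than inside it.)
	 */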
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop
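	/* Same flags trick as the forward loop: JAE sees the SUBQ's carry. */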

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
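	/*
	 * Trick: load the first 16 and the last 16 bytes; for any count
	 * in [16, 31] the two stores simply overlap in the middle, so no
	 * loop is needed. The same overlapping pattern repeats below at
	 * quadword and then dword granularity.
	 */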
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
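	/*
	 * SUBL set the flags: a borrow (the JB above) means the count
	 * was 0 and we are done. ZF means the count was 1; MOVZBL below
	 * leaves the flags alone, so the JZ still tests it. Otherwise 2
	 * or 3 bytes remain and %rdx now holds count - 1, making
	 * (%rdi, %rdx) address the last byte.
	 */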
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
	CFI_ENDPROC
ENDPROC(memcpy_orig)