/*
 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 */
26 #include <linux/linkage.h>
27 #include "glue_helper-asm-avx.S"
29 .file "cast6-avx-x86_64-asm_64.S"
/* structure of crypto context */

/**********************************************************************
  8-way AVX cast6
 **********************************************************************/
99 #define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
100 movzbl src ## bh, RID1d; \
101 movzbl src ## bl, RID2d; \
103 movl s1(, RID1, 4), dst ## d; \
104 op1 s2(, RID2, 4), dst ## d; \
105 movzbl src ## bh, RID1d; \
106 movzbl src ## bl, RID2d; \
107 interleave_op(il_reg); \
108 op2 s3(, RID1, 4), dst ## d; \
109 op3 s4(, RID2, 4), dst ## d;
111 #define dummy(d) /* do nothing */
113 #define shr_next(reg) \
116 #define F_head(a, x, gi1, gi2, op0) \
118 vpslld RKRF, x, RTMP; \
125 #define F_tail(a, x, gi1, gi2, op1, op2, op3) \
126 lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
127 lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
129 lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \
132 lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \
137 vpinsrq $1, RFS3, x, x;
139 #define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
140 F_head(b1, RX, RGI1, RGI2, op0); \
141 F_head(b2, RX, RGI3, RGI4, op0); \
143 F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \
144 F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
149 #define F1_2(a1, b1, a2, b2) \
150 F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
151 #define F2_2(a1, b1, a2, b2) \
152 F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
153 #define F3_2(a1, b1, a2, b2) \
154 F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)
156 #define qop(in, out, f) \
157 F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);
159 #define get_round_keys(nn) \
160 vbroadcastss (km+(4*(nn)))(CTX), RKM; \
161 vpand R1ST, RKR, RKRF; \
162 vpsubq RKRF, R32, RKRR; \
163 vpsrldq $1, RKR, RKR;
166 get_round_keys(4*n+0); \
169 get_round_keys(4*n+1); \
172 get_round_keys(4*n+2); \
175 get_round_keys(4*n+3); \
179 get_round_keys(4*n+3); \
182 get_round_keys(4*n+2); \
185 get_round_keys(4*n+1); \
188 get_round_keys(4*n+0); \
191 #define shuffle(mask) \
192 vpshufb mask, RKR, RKR;
194 #define preload_rkr(n, do_mask, mask) \
195 vbroadcastss .L16_mask, RKR; \
196 /* add 16-bit rotation to key rotations (mod 32) */ \
197 vpxor (kr+n*16)(CTX), RKR, RKR; \
200 #define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
201 vpunpckldq x1, x0, t0; \
202 vpunpckhdq x1, x0, t2; \
203 vpunpckldq x3, x2, t1; \
204 vpunpckhdq x3, x2, x3; \
206 vpunpcklqdq t1, t0, x0; \
207 vpunpckhqdq t1, t0, x1; \
208 vpunpcklqdq x3, t2, x2; \
209 vpunpckhqdq x3, t2, x3;
211 #define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
212 vpshufb rmask, x0, x0; \
213 vpshufb rmask, x1, x1; \
214 vpshufb rmask, x2, x2; \
215 vpshufb rmask, x3, x3; \
217 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
219 #define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
220 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
222 vpshufb rmask, x0, x0; \
223 vpshufb rmask, x1, x1; \
224 vpshufb rmask, x2, x2; \
225 vpshufb rmask, x3, x3;
231 .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
233 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
234 .Lrkr_enc_Q_Q_QBAR_QBAR:
235 .byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
236 .Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
237 .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
239 .byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
240 .Lrkr_dec_Q_Q_QBAR_QBAR:
241 .byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
242 .Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
243 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
257 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
259 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
265 vmovdqa .Lbswap_mask, RKM;
266 vmovd .Lfirst_mask, R1ST;
267 vmovd .L32_mask, R32;
269 inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
270 inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
272 preload_rkr(0, dummy, none);
277 preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
282 preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
291 vmovdqa .Lbswap_mask, RKM;
293 outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
294 outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
297 ENDPROC(__cast6_enc_blk8)
303 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
305 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
311 vmovdqa .Lbswap_mask, RKM;
312 vmovd .Lfirst_mask, R1ST;
313 vmovd .L32_mask, R32;
315 inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
316 inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
318 preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
323 preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
328 preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
337 vmovdqa .Lbswap_mask, RKM;
338 outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
339 outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
342 ENDPROC(__cast6_dec_blk8)
344 ENTRY(cast6_ecb_enc_8way)
353 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
355 call __cast6_enc_blk8;
357 store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
360 ENDPROC(cast6_ecb_enc_8way)
362 ENTRY(cast6_ecb_dec_8way)
371 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
373 call __cast6_dec_blk8;
375 store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
378 ENDPROC(cast6_ecb_dec_8way)
380 ENTRY(cast6_cbc_dec_8way)
392 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
394 call __cast6_dec_blk8;
396 store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
401 ENDPROC(cast6_cbc_dec_8way)
403 ENTRY(cast6_ctr_8way)
408 * %rcx: iv (little endian, 128bit)
416 load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
419 call __cast6_enc_blk8;
421 store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
426 ENDPROC(cast6_ctr_8way)