dect
/
linux-2.6
Archived
13
0
Fork 0
This repository has been archived on 2022-02-17. You can view files and clone it, but cannot push or open issues or pull requests.
linux-2.6/arch/x86/crypto/twofish_avx_glue.c

572 lines
15 KiB
C
Raw Normal View History

crypto: twofish - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Twofish block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) twofish-avx-x86_64 vs. twofish-x86_64-3way 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.97x 1.00x 0.95x 0.97x 0.97x 0.96x 0.95x 0.95x 0.98x 64B 0.99x 0.99x 1.00x 0.99x 0.98x 0.98x 0.99x 0.98x 0.99x 0.98x 256B 1.20x 1.21x 1.00x 1.19x 1.15x 1.14x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.24x 1.26x 1.28x 1.26x 1.27x 8192B 1.31x 1.32x 1.00x 1.31x 1.25x 1.25x 1.28x 1.29x 1.28x 1.30x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.96x 1.00x 0.96x 0.97x 0.98x 0.95x 0.95x 0.95x 0.96x 64B 1.00x 0.99x 1.00x 0.98x 0.98x 1.01x 0.98x 0.98x 0.98x 0.98x 256B 1.20x 1.21x 1.00x 1.21x 1.15x 1.15x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.23x 1.26x 1.27x 1.26x 1.27x 8192B 1.31x 1.33x 1.00x 1.31x 1.26x 1.26x 1.29x 1.29x 1.28x 1.30x twofish-avx-x86_64 vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.19x 1.63x ecb-dec 1.18x 1.62x cbc-enc 0.75x 1.03x cbc-dec 1.23x 1.67x ctr-enc 1.24x 1.65x ctr-dec 1.24x 1.65x lrw-enc 1.15x 1.53x lrw-dec 1.14x 1.52x xts-enc 1.16x 1.56x xts-dec 1.16x 1.56x Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2012-05-28 13:54:24 +00:00
/*
* Glue Code for AVX assembler version of Twofish Cipher
*
* Copyright (C) 2012 Johannes Goetzfried
* <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
* USA
*
*/
#include <linux/module.h>
#include <linux/hardirq.h>
#include <linux/types.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <crypto/algapi.h>
#include <crypto/twofish.h>
#include <crypto/cryptd.h>
#include <crypto/b128ops.h>
#include <crypto/ctr.h>
#include <crypto/lrw.h>
#include <crypto/xts.h>
#include <asm/i387.h>
#include <asm/xcr.h>
#include <asm/xsave.h>
#include <asm/crypto/twofish.h>
#include <asm/crypto/ablk_helper.h>
#include <asm/crypto/glue_helper.h>
crypto: twofish - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Twofish block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) twofish-avx-x86_64 vs. twofish-x86_64-3way 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.97x 1.00x 0.95x 0.97x 0.97x 0.96x 0.95x 0.95x 0.98x 64B 0.99x 0.99x 1.00x 0.99x 0.98x 0.98x 0.99x 0.98x 0.99x 0.98x 256B 1.20x 1.21x 1.00x 1.19x 1.15x 1.14x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.24x 1.26x 1.28x 1.26x 1.27x 8192B 1.31x 1.32x 1.00x 1.31x 1.25x 1.25x 1.28x 1.29x 1.28x 1.30x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.96x 1.00x 0.96x 0.97x 0.98x 0.95x 0.95x 0.95x 0.96x 64B 1.00x 0.99x 1.00x 0.98x 0.98x 1.01x 0.98x 0.98x 0.98x 0.98x 256B 1.20x 1.21x 1.00x 1.21x 1.15x 1.15x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.23x 1.26x 1.27x 1.26x 1.27x 8192B 1.31x 1.33x 1.00x 1.31x 1.26x 1.26x 1.29x 1.29x 1.28x 1.30x twofish-avx-x86_64 vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.19x 1.63x ecb-dec 1.18x 1.62x cbc-enc 0.75x 1.03x cbc-dec 1.23x 1.67x ctr-enc 1.24x 1.65x ctr-dec 1.24x 1.65x lrw-enc 1.15x 1.53x lrw-dec 1.14x 1.52x xts-enc 1.16x 1.56x xts-dec 1.16x 1.56x Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2012-05-28 13:54:24 +00:00
#include <crypto/scatterwalk.h>
#include <linux/workqueue.h>
#include <linux/spinlock.h>
#define TWOFISH_PARALLEL_BLOCKS 8
/* 8-way parallel cipher functions */
asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src);
asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
crypto: twofish - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Twofish block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) twofish-avx-x86_64 vs. twofish-x86_64-3way 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.97x 1.00x 0.95x 0.97x 0.97x 0.96x 0.95x 0.95x 0.98x 64B 0.99x 0.99x 1.00x 0.99x 0.98x 0.98x 0.99x 0.98x 0.99x 0.98x 256B 1.20x 1.21x 1.00x 1.19x 1.15x 1.14x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.24x 1.26x 1.28x 1.26x 1.27x 8192B 1.31x 1.32x 1.00x 1.31x 1.25x 1.25x 1.28x 1.29x 1.28x 1.30x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.96x 1.00x 0.96x 0.97x 0.98x 0.95x 0.95x 0.95x 0.96x 64B 1.00x 0.99x 1.00x 0.98x 0.98x 1.01x 0.98x 0.98x 0.98x 0.98x 256B 1.20x 1.21x 1.00x 1.21x 1.15x 1.15x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.23x 1.26x 1.27x 1.26x 1.27x 8192B 1.31x 1.33x 1.00x 1.31x 1.26x 1.26x 1.29x 1.29x 1.28x 1.30x twofish-avx-x86_64 vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.19x 1.63x ecb-dec 1.18x 1.62x cbc-enc 0.75x 1.03x cbc-dec 1.23x 1.67x ctr-enc 1.24x 1.65x ctr-dec 1.24x 1.65x lrw-enc 1.15x 1.53x lrw-dec 1.14x 1.52x xts-enc 1.16x 1.56x xts-dec 1.16x 1.56x Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2012-05-28 13:54:24 +00:00
const u8 *src);
asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src);
asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src, le128 *iv);
crypto: twofish - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Twofish block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) twofish-avx-x86_64 vs. twofish-x86_64-3way 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.97x 1.00x 0.95x 0.97x 0.97x 0.96x 0.95x 0.95x 0.98x 64B 0.99x 0.99x 1.00x 0.99x 0.98x 0.98x 0.99x 0.98x 0.99x 0.98x 256B 1.20x 1.21x 1.00x 1.19x 1.15x 1.14x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.24x 1.26x 1.28x 1.26x 1.27x 8192B 1.31x 1.32x 1.00x 1.31x 1.25x 1.25x 1.28x 1.29x 1.28x 1.30x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.96x 1.00x 0.96x 0.97x 0.98x 0.95x 0.95x 0.95x 0.96x 64B 1.00x 0.99x 1.00x 0.98x 0.98x 1.01x 0.98x 0.98x 0.98x 0.98x 256B 1.20x 1.21x 1.00x 1.21x 1.15x 1.15x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.23x 1.26x 1.27x 1.26x 1.27x 8192B 1.31x 1.33x 1.00x 1.31x 1.26x 1.26x 1.29x 1.29x 1.28x 1.30x twofish-avx-x86_64 vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.19x 1.63x ecb-dec 1.18x 1.62x cbc-enc 0.75x 1.03x cbc-dec 1.23x 1.67x ctr-enc 1.24x 1.65x ctr-dec 1.24x 1.65x lrw-enc 1.15x 1.53x lrw-dec 1.14x 1.52x xts-enc 1.16x 1.56x xts-dec 1.16x 1.56x Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2012-05-28 13:54:24 +00:00
static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
crypto: twofish - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Twofish block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) twofish-avx-x86_64 vs. twofish-x86_64-3way 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.97x 1.00x 0.95x 0.97x 0.97x 0.96x 0.95x 0.95x 0.98x 64B 0.99x 0.99x 1.00x 0.99x 0.98x 0.98x 0.99x 0.98x 0.99x 0.98x 256B 1.20x 1.21x 1.00x 1.19x 1.15x 1.14x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.24x 1.26x 1.28x 1.26x 1.27x 8192B 1.31x 1.32x 1.00x 1.31x 1.25x 1.25x 1.28x 1.29x 1.28x 1.30x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.96x 1.00x 0.96x 0.97x 0.98x 0.95x 0.95x 0.95x 0.96x 64B 1.00x 0.99x 1.00x 0.98x 0.98x 1.01x 0.98x 0.98x 0.98x 0.98x 256B 1.20x 1.21x 1.00x 1.21x 1.15x 1.15x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.23x 1.26x 1.27x 1.26x 1.27x 8192B 1.31x 1.33x 1.00x 1.31x 1.26x 1.26x 1.29x 1.29x 1.28x 1.30x twofish-avx-x86_64 vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.19x 1.63x ecb-dec 1.18x 1.62x cbc-enc 0.75x 1.03x cbc-dec 1.23x 1.67x ctr-enc 1.24x 1.65x ctr-dec 1.24x 1.65x lrw-enc 1.15x 1.53x lrw-dec 1.14x 1.52x xts-enc 1.16x 1.56x xts-dec 1.16x 1.56x Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2012-05-28 13:54:24 +00:00
const u8 *src)
{
__twofish_enc_blk_3way(ctx, dst, src, false);
crypto: twofish - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Twofish block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) twofish-avx-x86_64 vs. twofish-x86_64-3way 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.97x 1.00x 0.95x 0.97x 0.97x 0.96x 0.95x 0.95x 0.98x 64B 0.99x 0.99x 1.00x 0.99x 0.98x 0.98x 0.99x 0.98x 0.99x 0.98x 256B 1.20x 1.21x 1.00x 1.19x 1.15x 1.14x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.24x 1.26x 1.28x 1.26x 1.27x 8192B 1.31x 1.32x 1.00x 1.31x 1.25x 1.25x 1.28x 1.29x 1.28x 1.30x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.96x 1.00x 0.96x 0.97x 0.98x 0.95x 0.95x 0.95x 0.96x 64B 1.00x 0.99x 1.00x 0.98x 0.98x 1.01x 0.98x 0.98x 0.98x 0.98x 256B 1.20x 1.21x 1.00x 1.21x 1.15x 1.15x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.23x 1.26x 1.27x 1.26x 1.27x 8192B 1.31x 1.33x 1.00x 1.31x 1.26x 1.26x 1.29x 1.29x 1.28x 1.30x twofish-avx-x86_64 vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.19x 1.63x ecb-dec 1.18x 1.62x cbc-enc 0.75x 1.03x cbc-dec 1.23x 1.67x ctr-enc 1.24x 1.65x ctr-dec 1.24x 1.65x lrw-enc 1.15x 1.53x lrw-dec 1.14x 1.52x xts-enc 1.16x 1.56x xts-dec 1.16x 1.56x Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2012-05-28 13:54:24 +00:00
}
static const struct common_glue_ctx twofish_enc = {
.num_funcs = 3,
.fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
.funcs = { {
.num_blocks = TWOFISH_PARALLEL_BLOCKS,
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) }
}, {
.num_blocks = 3,
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
}, {
.num_blocks = 1,
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) }
} }
};
static const struct common_glue_ctx twofish_ctr = {
.num_funcs = 3,
.fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
.funcs = { {
.num_blocks = TWOFISH_PARALLEL_BLOCKS,
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) }
}, {
.num_blocks = 3,
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) }
}, {
.num_blocks = 1,
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) }
} }
};
static const struct common_glue_ctx twofish_dec = {
.num_funcs = 3,
.fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
.funcs = { {
.num_blocks = TWOFISH_PARALLEL_BLOCKS,
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) }
}, {
.num_blocks = 3,
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
}, {
.num_blocks = 1,
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) }
} }
};
static const struct common_glue_ctx twofish_dec_cbc = {
.num_funcs = 3,
.fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
.funcs = { {
.num_blocks = TWOFISH_PARALLEL_BLOCKS,
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) }
}, {
.num_blocks = 3,
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
}, {
.num_blocks = 1,
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) }
} }
};
crypto: twofish - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Twofish block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) twofish-avx-x86_64 vs. twofish-x86_64-3way 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.97x 1.00x 0.95x 0.97x 0.97x 0.96x 0.95x 0.95x 0.98x 64B 0.99x 0.99x 1.00x 0.99x 0.98x 0.98x 0.99x 0.98x 0.99x 0.98x 256B 1.20x 1.21x 1.00x 1.19x 1.15x 1.14x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.24x 1.26x 1.28x 1.26x 1.27x 8192B 1.31x 1.32x 1.00x 1.31x 1.25x 1.25x 1.28x 1.29x 1.28x 1.30x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.96x 1.00x 0.96x 0.97x 0.98x 0.95x 0.95x 0.95x 0.96x 64B 1.00x 0.99x 1.00x 0.98x 0.98x 1.01x 0.98x 0.98x 0.98x 0.98x 256B 1.20x 1.21x 1.00x 1.21x 1.15x 1.15x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.23x 1.26x 1.27x 1.26x 1.27x 8192B 1.31x 1.33x 1.00x 1.31x 1.26x 1.26x 1.29x 1.29x 1.28x 1.30x twofish-avx-x86_64 vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.19x 1.63x ecb-dec 1.18x 1.62x cbc-enc 0.75x 1.03x cbc-dec 1.23x 1.67x ctr-enc 1.24x 1.65x ctr-dec 1.24x 1.65x lrw-enc 1.15x 1.53x lrw-dec 1.14x 1.52x xts-enc 1.16x 1.56x xts-dec 1.16x 1.56x Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2012-05-28 13:54:24 +00:00
static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes);
crypto: twofish - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Twofish block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) twofish-avx-x86_64 vs. twofish-x86_64-3way 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.97x 1.00x 0.95x 0.97x 0.97x 0.96x 0.95x 0.95x 0.98x 64B 0.99x 0.99x 1.00x 0.99x 0.98x 0.98x 0.99x 0.98x 0.99x 0.98x 256B 1.20x 1.21x 1.00x 1.19x 1.15x 1.14x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.24x 1.26x 1.28x 1.26x 1.27x 8192B 1.31x 1.32x 1.00x 1.31x 1.25x 1.25x 1.28x 1.29x 1.28x 1.30x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.96x 1.00x 0.96x 0.97x 0.98x 0.95x 0.95x 0.95x 0.96x 64B 1.00x 0.99x 1.00x 0.98x 0.98x 1.01x 0.98x 0.98x 0.98x 0.98x 256B 1.20x 1.21x 1.00x 1.21x 1.15x 1.15x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.23x 1.26x 1.27x 1.26x 1.27x 8192B 1.31x 1.33x 1.00x 1.31x 1.26x 1.26x 1.29x 1.29x 1.28x 1.30x twofish-avx-x86_64 vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.19x 1.63x ecb-dec 1.18x 1.62x cbc-enc 0.75x 1.03x cbc-dec 1.23x 1.67x ctr-enc 1.24x 1.65x ctr-dec 1.24x 1.65x lrw-enc 1.15x 1.53x lrw-dec 1.14x 1.52x xts-enc 1.16x 1.56x xts-dec 1.16x 1.56x Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2012-05-28 13:54:24 +00:00
}
static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes);
crypto: twofish - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Twofish block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) twofish-avx-x86_64 vs. twofish-x86_64-3way 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.97x 1.00x 0.95x 0.97x 0.97x 0.96x 0.95x 0.95x 0.98x 64B 0.99x 0.99x 1.00x 0.99x 0.98x 0.98x 0.99x 0.98x 0.99x 0.98x 256B 1.20x 1.21x 1.00x 1.19x 1.15x 1.14x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.24x 1.26x 1.28x 1.26x 1.27x 8192B 1.31x 1.32x 1.00x 1.31x 1.25x 1.25x 1.28x 1.29x 1.28x 1.30x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.96x 1.00x 0.96x 0.97x 0.98x 0.95x 0.95x 0.95x 0.96x 64B 1.00x 0.99x 1.00x 0.98x 0.98x 1.01x 0.98x 0.98x 0.98x 0.98x 256B 1.20x 1.21x 1.00x 1.21x 1.15x 1.15x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.23x 1.26x 1.27x 1.26x 1.27x 8192B 1.31x 1.33x 1.00x 1.31x 1.26x 1.26x 1.29x 1.29x 1.28x 1.30x twofish-avx-x86_64 vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.19x 1.63x ecb-dec 1.18x 1.62x cbc-enc 0.75x 1.03x cbc-dec 1.23x 1.67x ctr-enc 1.24x 1.65x ctr-dec 1.24x 1.65x lrw-enc 1.15x 1.53x lrw-dec 1.14x 1.52x xts-enc 1.16x 1.56x xts-dec 1.16x 1.56x Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2012-05-28 13:54:24 +00:00
}
static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc,
dst, src, nbytes);
crypto: twofish - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Twofish block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) twofish-avx-x86_64 vs. twofish-x86_64-3way 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.97x 1.00x 0.95x 0.97x 0.97x 0.96x 0.95x 0.95x 0.98x 64B 0.99x 0.99x 1.00x 0.99x 0.98x 0.98x 0.99x 0.98x 0.99x 0.98x 256B 1.20x 1.21x 1.00x 1.19x 1.15x 1.14x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.24x 1.26x 1.28x 1.26x 1.27x 8192B 1.31x 1.32x 1.00x 1.31x 1.25x 1.25x 1.28x 1.29x 1.28x 1.30x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.96x 1.00x 0.96x 0.97x 0.98x 0.95x 0.95x 0.95x 0.96x 64B 1.00x 0.99x 1.00x 0.98x 0.98x 1.01x 0.98x 0.98x 0.98x 0.98x 256B 1.20x 1.21x 1.00x 1.21x 1.15x 1.15x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.23x 1.26x 1.27x 1.26x 1.27x 8192B 1.31x 1.33x 1.00x 1.31x 1.26x 1.26x 1.29x 1.29x 1.28x 1.30x twofish-avx-x86_64 vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.19x 1.63x ecb-dec 1.18x 1.62x cbc-enc 0.75x 1.03x cbc-dec 1.23x 1.67x ctr-enc 1.24x 1.65x ctr-dec 1.24x 1.65x lrw-enc 1.15x 1.53x lrw-dec 1.14x 1.52x xts-enc 1.16x 1.56x xts-dec 1.16x 1.56x Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2012-05-28 13:54:24 +00:00
}
static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src,
nbytes);
crypto: twofish - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Twofish block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) twofish-avx-x86_64 vs. twofish-x86_64-3way 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.97x 1.00x 0.95x 0.97x 0.97x 0.96x 0.95x 0.95x 0.98x 64B 0.99x 0.99x 1.00x 0.99x 0.98x 0.98x 0.99x 0.98x 0.99x 0.98x 256B 1.20x 1.21x 1.00x 1.19x 1.15x 1.14x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.24x 1.26x 1.28x 1.26x 1.27x 8192B 1.31x 1.32x 1.00x 1.31x 1.25x 1.25x 1.28x 1.29x 1.28x 1.30x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.96x 1.00x 0.96x 0.97x 0.98x 0.95x 0.95x 0.95x 0.96x 64B 1.00x 0.99x 1.00x 0.98x 0.98x 1.01x 0.98x 0.98x 0.98x 0.98x 256B 1.20x 1.21x 1.00x 1.21x 1.15x 1.15x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.23x 1.26x 1.27x 1.26x 1.27x 8192B 1.31x 1.33x 1.00x 1.31x 1.26x 1.26x 1.29x 1.29x 1.28x 1.30x twofish-avx-x86_64 vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.19x 1.63x ecb-dec 1.18x 1.62x cbc-enc 0.75x 1.03x cbc-dec 1.23x 1.67x ctr-enc 1.24x 1.65x ctr-dec 1.24x 1.65x lrw-enc 1.15x 1.53x lrw-dec 1.14x 1.52x xts-enc 1.16x 1.56x xts-dec 1.16x 1.56x Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2012-05-28 13:54:24 +00:00
}
static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
crypto: twofish - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Twofish block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) twofish-avx-x86_64 vs. twofish-x86_64-3way 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.97x 1.00x 0.95x 0.97x 0.97x 0.96x 0.95x 0.95x 0.98x 64B 0.99x 0.99x 1.00x 0.99x 0.98x 0.98x 0.99x 0.98x 0.99x 0.98x 256B 1.20x 1.21x 1.00x 1.19x 1.15x 1.14x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.24x 1.26x 1.28x 1.26x 1.27x 8192B 1.31x 1.32x 1.00x 1.31x 1.25x 1.25x 1.28x 1.29x 1.28x 1.30x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.96x 1.00x 0.96x 0.97x 0.98x 0.95x 0.95x 0.95x 0.96x 64B 1.00x 0.99x 1.00x 0.98x 0.98x 1.01x 0.98x 0.98x 0.98x 0.98x 256B 1.20x 1.21x 1.00x 1.21x 1.15x 1.15x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.23x 1.26x 1.27x 1.26x 1.27x 8192B 1.31x 1.33x 1.00x 1.31x 1.26x 1.26x 1.29x 1.29x 1.28x 1.30x twofish-avx-x86_64 vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.19x 1.63x ecb-dec 1.18x 1.62x cbc-enc 0.75x 1.03x cbc-dec 1.23x 1.67x ctr-enc 1.24x 1.65x ctr-dec 1.24x 1.65x lrw-enc 1.15x 1.53x lrw-dec 1.14x 1.52x xts-enc 1.16x 1.56x xts-dec 1.16x 1.56x Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2012-05-28 13:54:24 +00:00
{
return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes);
crypto: twofish - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Twofish block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) twofish-avx-x86_64 vs. twofish-x86_64-3way 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.97x 1.00x 0.95x 0.97x 0.97x 0.96x 0.95x 0.95x 0.98x 64B 0.99x 0.99x 1.00x 0.99x 0.98x 0.98x 0.99x 0.98x 0.99x 0.98x 256B 1.20x 1.21x 1.00x 1.19x 1.15x 1.14x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.24x 1.26x 1.28x 1.26x 1.27x 8192B 1.31x 1.32x 1.00x 1.31x 1.25x 1.25x 1.28x 1.29x 1.28x 1.30x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.96x 1.00x 0.96x 0.97x 0.98x 0.95x 0.95x 0.95x 0.96x 64B 1.00x 0.99x 1.00x 0.98x 0.98x 1.01x 0.98x 0.98x 0.98x 0.98x 256B 1.20x 1.21x 1.00x 1.21x 1.15x 1.15x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.23x 1.26x 1.27x 1.26x 1.27x 8192B 1.31x 1.33x 1.00x 1.31x 1.26x 1.26x 1.29x 1.29x 1.28x 1.30x twofish-avx-x86_64 vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.19x 1.63x ecb-dec 1.18x 1.62x cbc-enc 0.75x 1.03x cbc-dec 1.23x 1.67x ctr-enc 1.24x 1.65x ctr-dec 1.24x 1.65x lrw-enc 1.15x 1.53x lrw-dec 1.14x 1.52x xts-enc 1.16x 1.56x xts-dec 1.16x 1.56x Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2012-05-28 13:54:24 +00:00
}
static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes)
crypto: twofish - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Twofish block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) twofish-avx-x86_64 vs. twofish-x86_64-3way 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.97x 1.00x 0.95x 0.97x 0.97x 0.96x 0.95x 0.95x 0.98x 64B 0.99x 0.99x 1.00x 0.99x 0.98x 0.98x 0.99x 0.98x 0.99x 0.98x 256B 1.20x 1.21x 1.00x 1.19x 1.15x 1.14x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.24x 1.26x 1.28x 1.26x 1.27x 8192B 1.31x 1.32x 1.00x 1.31x 1.25x 1.25x 1.28x 1.29x 1.28x 1.30x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.96x 1.00x 0.96x 0.97x 0.98x 0.95x 0.95x 0.95x 0.96x 64B 1.00x 0.99x 1.00x 0.98x 0.98x 1.01x 0.98x 0.98x 0.98x 0.98x 256B 1.20x 1.21x 1.00x 1.21x 1.15x 1.15x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.23x 1.26x 1.27x 1.26x 1.27x 8192B 1.31x 1.33x 1.00x 1.31x 1.26x 1.26x 1.29x 1.29x 1.28x 1.30x twofish-avx-x86_64 vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.19x 1.63x ecb-dec 1.18x 1.62x cbc-enc 0.75x 1.03x cbc-dec 1.23x 1.67x ctr-enc 1.24x 1.65x ctr-dec 1.24x 1.65x lrw-enc 1.15x 1.53x lrw-dec 1.14x 1.52x xts-enc 1.16x 1.56x xts-dec 1.16x 1.56x Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2012-05-28 13:54:24 +00:00
{
return glue_fpu_begin(TF_BLOCK_SIZE, TWOFISH_PARALLEL_BLOCKS, NULL,
fpu_enabled, nbytes);
crypto: twofish - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Twofish block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) twofish-avx-x86_64 vs. twofish-x86_64-3way 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.97x 1.00x 0.95x 0.97x 0.97x 0.96x 0.95x 0.95x 0.98x 64B 0.99x 0.99x 1.00x 0.99x 0.98x 0.98x 0.99x 0.98x 0.99x 0.98x 256B 1.20x 1.21x 1.00x 1.19x 1.15x 1.14x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.24x 1.26x 1.28x 1.26x 1.27x 8192B 1.31x 1.32x 1.00x 1.31x 1.25x 1.25x 1.28x 1.29x 1.28x 1.30x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.96x 1.00x 0.96x 0.97x 0.98x 0.95x 0.95x 0.95x 0.96x 64B 1.00x 0.99x 1.00x 0.98x 0.98x 1.01x 0.98x 0.98x 0.98x 0.98x 256B 1.20x 1.21x 1.00x 1.21x 1.15x 1.15x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.23x 1.26x 1.27x 1.26x 1.27x 8192B 1.31x 1.33x 1.00x 1.31x 1.26x 1.26x 1.29x 1.29x 1.28x 1.30x twofish-avx-x86_64 vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.19x 1.63x ecb-dec 1.18x 1.62x cbc-enc 0.75x 1.03x cbc-dec 1.23x 1.67x ctr-enc 1.24x 1.65x ctr-dec 1.24x 1.65x lrw-enc 1.15x 1.53x lrw-dec 1.14x 1.52x xts-enc 1.16x 1.56x xts-dec 1.16x 1.56x Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2012-05-28 13:54:24 +00:00
}
static inline void twofish_fpu_end(bool fpu_enabled)
crypto: twofish - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Twofish block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) twofish-avx-x86_64 vs. twofish-x86_64-3way 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.97x 1.00x 0.95x 0.97x 0.97x 0.96x 0.95x 0.95x 0.98x 64B 0.99x 0.99x 1.00x 0.99x 0.98x 0.98x 0.99x 0.98x 0.99x 0.98x 256B 1.20x 1.21x 1.00x 1.19x 1.15x 1.14x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.24x 1.26x 1.28x 1.26x 1.27x 8192B 1.31x 1.32x 1.00x 1.31x 1.25x 1.25x 1.28x 1.29x 1.28x 1.30x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.96x 1.00x 0.96x 0.97x 0.98x 0.95x 0.95x 0.95x 0.96x 64B 1.00x 0.99x 1.00x 0.98x 0.98x 1.01x 0.98x 0.98x 0.98x 0.98x 256B 1.20x 1.21x 1.00x 1.21x 1.15x 1.15x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.23x 1.26x 1.27x 1.26x 1.27x 8192B 1.31x 1.33x 1.00x 1.31x 1.26x 1.26x 1.29x 1.29x 1.28x 1.30x twofish-avx-x86_64 vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.19x 1.63x ecb-dec 1.18x 1.62x cbc-enc 0.75x 1.03x cbc-dec 1.23x 1.67x ctr-enc 1.24x 1.65x ctr-dec 1.24x 1.65x lrw-enc 1.15x 1.53x lrw-dec 1.14x 1.52x xts-enc 1.16x 1.56x xts-dec 1.16x 1.56x Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2012-05-28 13:54:24 +00:00
{
glue_fpu_end(fpu_enabled);
crypto: twofish - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Twofish block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) twofish-avx-x86_64 vs. twofish-x86_64-3way 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.97x 1.00x 0.95x 0.97x 0.97x 0.96x 0.95x 0.95x 0.98x 64B 0.99x 0.99x 1.00x 0.99x 0.98x 0.98x 0.99x 0.98x 0.99x 0.98x 256B 1.20x 1.21x 1.00x 1.19x 1.15x 1.14x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.24x 1.26x 1.28x 1.26x 1.27x 8192B 1.31x 1.32x 1.00x 1.31x 1.25x 1.25x 1.28x 1.29x 1.28x 1.30x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.96x 1.00x 0.96x 0.97x 0.98x 0.95x 0.95x 0.95x 0.96x 64B 1.00x 0.99x 1.00x 0.98x 0.98x 1.01x 0.98x 0.98x 0.98x 0.98x 256B 1.20x 1.21x 1.00x 1.21x 1.15x 1.15x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.23x 1.26x 1.27x 1.26x 1.27x 8192B 1.31x 1.33x 1.00x 1.31x 1.26x 1.26x 1.29x 1.29x 1.28x 1.30x twofish-avx-x86_64 vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.19x 1.63x ecb-dec 1.18x 1.62x cbc-enc 0.75x 1.03x cbc-dec 1.23x 1.67x ctr-enc 1.24x 1.65x ctr-dec 1.24x 1.65x lrw-enc 1.15x 1.53x lrw-dec 1.14x 1.52x xts-enc 1.16x 1.56x xts-dec 1.16x 1.56x Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2012-05-28 13:54:24 +00:00
}
struct crypt_priv {
struct twofish_ctx *ctx;
bool fpu_enabled;
};
static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
{
const unsigned int bsize = TF_BLOCK_SIZE;
struct crypt_priv *ctx = priv;
int i;
ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
crypto: twofish - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Twofish block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) twofish-avx-x86_64 vs. twofish-x86_64-3way 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.97x 1.00x 0.95x 0.97x 0.97x 0.96x 0.95x 0.95x 0.98x 64B 0.99x 0.99x 1.00x 0.99x 0.98x 0.98x 0.99x 0.98x 0.99x 0.98x 256B 1.20x 1.21x 1.00x 1.19x 1.15x 1.14x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.24x 1.26x 1.28x 1.26x 1.27x 8192B 1.31x 1.32x 1.00x 1.31x 1.25x 1.25x 1.28x 1.29x 1.28x 1.30x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.96x 1.00x 0.96x 0.97x 0.98x 0.95x 0.95x 0.95x 0.96x 64B 1.00x 0.99x 1.00x 0.98x 0.98x 1.01x 0.98x 0.98x 0.98x 0.98x 256B 1.20x 1.21x 1.00x 1.21x 1.15x 1.15x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.23x 1.26x 1.27x 1.26x 1.27x 8192B 1.31x 1.33x 1.00x 1.31x 1.26x 1.26x 1.29x 1.29x 1.28x 1.30x twofish-avx-x86_64 vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.19x 1.63x ecb-dec 1.18x 1.62x cbc-enc 0.75x 1.03x cbc-dec 1.23x 1.67x ctr-enc 1.24x 1.65x ctr-dec 1.24x 1.65x lrw-enc 1.15x 1.53x lrw-dec 1.14x 1.52x xts-enc 1.16x 1.56x xts-dec 1.16x 1.56x Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2012-05-28 13:54:24 +00:00
return;
}
for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst);
nbytes %= bsize * 3;
for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
twofish_enc_blk(ctx->ctx, srcdst, srcdst);
}
static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
{
const unsigned int bsize = TF_BLOCK_SIZE;
struct crypt_priv *ctx = priv;
int i;
ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
crypto: twofish - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Twofish block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) twofish-avx-x86_64 vs. twofish-x86_64-3way 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.97x 1.00x 0.95x 0.97x 0.97x 0.96x 0.95x 0.95x 0.98x 64B 0.99x 0.99x 1.00x 0.99x 0.98x 0.98x 0.99x 0.98x 0.99x 0.98x 256B 1.20x 1.21x 1.00x 1.19x 1.15x 1.14x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.24x 1.26x 1.28x 1.26x 1.27x 8192B 1.31x 1.32x 1.00x 1.31x 1.25x 1.25x 1.28x 1.29x 1.28x 1.30x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.96x 1.00x 0.96x 0.97x 0.98x 0.95x 0.95x 0.95x 0.96x 64B 1.00x 0.99x 1.00x 0.98x 0.98x 1.01x 0.98x 0.98x 0.98x 0.98x 256B 1.20x 1.21x 1.00x 1.21x 1.15x 1.15x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.23x 1.26x 1.27x 1.26x 1.27x 8192B 1.31x 1.33x 1.00x 1.31x 1.26x 1.26x 1.29x 1.29x 1.28x 1.30x twofish-avx-x86_64 vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.19x 1.63x ecb-dec 1.18x 1.62x cbc-enc 0.75x 1.03x cbc-dec 1.23x 1.67x ctr-enc 1.24x 1.65x ctr-dec 1.24x 1.65x lrw-enc 1.15x 1.53x lrw-dec 1.14x 1.52x xts-enc 1.16x 1.56x xts-dec 1.16x 1.56x Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2012-05-28 13:54:24 +00:00
return;
}
for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst);
nbytes %= bsize * 3;
for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
twofish_dec_blk(ctx->ctx, srcdst, srcdst);
}
static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
be128 buf[TWOFISH_PARALLEL_BLOCKS];
struct crypt_priv crypt_ctx = {
.ctx = &ctx->twofish_ctx,
.fpu_enabled = false,
};
struct lrw_crypt_req req = {
.tbuf = buf,
.tbuflen = sizeof(buf),
.table_ctx = &ctx->lrw_table,
.crypt_ctx = &crypt_ctx,
.crypt_fn = encrypt_callback,
};
int ret;
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
ret = lrw_crypt(desc, dst, src, nbytes, &req);
twofish_fpu_end(crypt_ctx.fpu_enabled);
return ret;
}
static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
be128 buf[TWOFISH_PARALLEL_BLOCKS];
struct crypt_priv crypt_ctx = {
.ctx = &ctx->twofish_ctx,
.fpu_enabled = false,
};
struct lrw_crypt_req req = {
.tbuf = buf,
.tbuflen = sizeof(buf),
.table_ctx = &ctx->lrw_table,
.crypt_ctx = &crypt_ctx,
.crypt_fn = decrypt_callback,
};
int ret;
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
ret = lrw_crypt(desc, dst, src, nbytes, &req);
twofish_fpu_end(crypt_ctx.fpu_enabled);
return ret;
}
static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
be128 buf[TWOFISH_PARALLEL_BLOCKS];
struct crypt_priv crypt_ctx = {
.ctx = &ctx->crypt_ctx,
.fpu_enabled = false,
};
struct xts_crypt_req req = {
.tbuf = buf,
.tbuflen = sizeof(buf),
.tweak_ctx = &ctx->tweak_ctx,
.tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
.crypt_ctx = &crypt_ctx,
.crypt_fn = encrypt_callback,
};
int ret;
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
ret = xts_crypt(desc, dst, src, nbytes, &req);
twofish_fpu_end(crypt_ctx.fpu_enabled);
return ret;
}
static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
be128 buf[TWOFISH_PARALLEL_BLOCKS];
struct crypt_priv crypt_ctx = {
.ctx = &ctx->crypt_ctx,
.fpu_enabled = false,
};
struct xts_crypt_req req = {
.tbuf = buf,
.tbuflen = sizeof(buf),
.tweak_ctx = &ctx->tweak_ctx,
.tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
.crypt_ctx = &crypt_ctx,
.crypt_fn = decrypt_callback,
};
int ret;
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
ret = xts_crypt(desc, dst, src, nbytes, &req);
twofish_fpu_end(crypt_ctx.fpu_enabled);
return ret;
}
static struct crypto_alg twofish_algs[10] = { {
.cra_name = "__ecb-twofish-avx",
.cra_driver_name = "__driver-ecb-twofish-avx",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
.cra_blocksize = TF_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct twofish_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_blkcipher_type,
.cra_module = THIS_MODULE,
.cra_u = {
.blkcipher = {
.min_keysize = TF_MIN_KEY_SIZE,
.max_keysize = TF_MAX_KEY_SIZE,
.setkey = twofish_setkey,
.encrypt = ecb_encrypt,
.decrypt = ecb_decrypt,
},
},
}, {
.cra_name = "__cbc-twofish-avx",
.cra_driver_name = "__driver-cbc-twofish-avx",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
.cra_blocksize = TF_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct twofish_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_blkcipher_type,
.cra_module = THIS_MODULE,
.cra_u = {
.blkcipher = {
.min_keysize = TF_MIN_KEY_SIZE,
.max_keysize = TF_MAX_KEY_SIZE,
.setkey = twofish_setkey,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
},
},
}, {
.cra_name = "__ctr-twofish-avx",
.cra_driver_name = "__driver-ctr-twofish-avx",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct twofish_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_blkcipher_type,
.cra_module = THIS_MODULE,
.cra_u = {
.blkcipher = {
.min_keysize = TF_MIN_KEY_SIZE,
.max_keysize = TF_MAX_KEY_SIZE,
.ivsize = TF_BLOCK_SIZE,
.setkey = twofish_setkey,
.encrypt = ctr_crypt,
.decrypt = ctr_crypt,
},
},
}, {
.cra_name = "__lrw-twofish-avx",
.cra_driver_name = "__driver-lrw-twofish-avx",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
.cra_blocksize = TF_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct twofish_lrw_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_blkcipher_type,
.cra_module = THIS_MODULE,
.cra_exit = lrw_twofish_exit_tfm,
crypto: twofish - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Twofish block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) twofish-avx-x86_64 vs. twofish-x86_64-3way 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.97x 1.00x 0.95x 0.97x 0.97x 0.96x 0.95x 0.95x 0.98x 64B 0.99x 0.99x 1.00x 0.99x 0.98x 0.98x 0.99x 0.98x 0.99x 0.98x 256B 1.20x 1.21x 1.00x 1.19x 1.15x 1.14x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.24x 1.26x 1.28x 1.26x 1.27x 8192B 1.31x 1.32x 1.00x 1.31x 1.25x 1.25x 1.28x 1.29x 1.28x 1.30x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.96x 1.00x 0.96x 0.97x 0.98x 0.95x 0.95x 0.95x 0.96x 64B 1.00x 0.99x 1.00x 0.98x 0.98x 1.01x 0.98x 0.98x 0.98x 0.98x 256B 1.20x 1.21x 1.00x 1.21x 1.15x 1.15x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.23x 1.26x 1.27x 1.26x 1.27x 8192B 1.31x 1.33x 1.00x 1.31x 1.26x 1.26x 1.29x 1.29x 1.28x 1.30x twofish-avx-x86_64 vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.19x 1.63x ecb-dec 1.18x 1.62x cbc-enc 0.75x 1.03x cbc-dec 1.23x 1.67x ctr-enc 1.24x 1.65x ctr-dec 1.24x 1.65x lrw-enc 1.15x 1.53x lrw-dec 1.14x 1.52x xts-enc 1.16x 1.56x xts-dec 1.16x 1.56x Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2012-05-28 13:54:24 +00:00
.cra_u = {
.blkcipher = {
.min_keysize = TF_MIN_KEY_SIZE +
TF_BLOCK_SIZE,
.max_keysize = TF_MAX_KEY_SIZE +
TF_BLOCK_SIZE,
.ivsize = TF_BLOCK_SIZE,
.setkey = lrw_twofish_setkey,
.encrypt = lrw_encrypt,
.decrypt = lrw_decrypt,
},
},
}, {
.cra_name = "__xts-twofish-avx",
.cra_driver_name = "__driver-xts-twofish-avx",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
.cra_blocksize = TF_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct twofish_xts_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_blkcipher_type,
.cra_module = THIS_MODULE,
.cra_u = {
.blkcipher = {
.min_keysize = TF_MIN_KEY_SIZE * 2,
.max_keysize = TF_MAX_KEY_SIZE * 2,
.ivsize = TF_BLOCK_SIZE,
.setkey = xts_twofish_setkey,
.encrypt = xts_encrypt,
.decrypt = xts_decrypt,
},
},
}, {
.cra_name = "ecb(twofish)",
.cra_driver_name = "ecb-twofish-avx",
.cra_priority = 400,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
.cra_blocksize = TF_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct async_helper_ctx),
crypto: twofish - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Twofish block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) twofish-avx-x86_64 vs. twofish-x86_64-3way 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.97x 1.00x 0.95x 0.97x 0.97x 0.96x 0.95x 0.95x 0.98x 64B 0.99x 0.99x 1.00x 0.99x 0.98x 0.98x 0.99x 0.98x 0.99x 0.98x 256B 1.20x 1.21x 1.00x 1.19x 1.15x 1.14x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.24x 1.26x 1.28x 1.26x 1.27x 8192B 1.31x 1.32x 1.00x 1.31x 1.25x 1.25x 1.28x 1.29x 1.28x 1.30x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.96x 1.00x 0.96x 0.97x 0.98x 0.95x 0.95x 0.95x 0.96x 64B 1.00x 0.99x 1.00x 0.98x 0.98x 1.01x 0.98x 0.98x 0.98x 0.98x 256B 1.20x 1.21x 1.00x 1.21x 1.15x 1.15x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.23x 1.26x 1.27x 1.26x 1.27x 8192B 1.31x 1.33x 1.00x 1.31x 1.26x 1.26x 1.29x 1.29x 1.28x 1.30x twofish-avx-x86_64 vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.19x 1.63x ecb-dec 1.18x 1.62x cbc-enc 0.75x 1.03x cbc-dec 1.23x 1.67x ctr-enc 1.24x 1.65x ctr-dec 1.24x 1.65x lrw-enc 1.15x 1.53x lrw-dec 1.14x 1.52x xts-enc 1.16x 1.56x xts-dec 1.16x 1.56x Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2012-05-28 13:54:24 +00:00
.cra_alignmask = 0,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
.cra_init = ablk_init,
.cra_exit = ablk_exit,
.cra_u = {
.ablkcipher = {
.min_keysize = TF_MIN_KEY_SIZE,
.max_keysize = TF_MAX_KEY_SIZE,
.setkey = ablk_set_key,
.encrypt = ablk_encrypt,
.decrypt = ablk_decrypt,
},
},
}, {
.cra_name = "cbc(twofish)",
.cra_driver_name = "cbc-twofish-avx",
.cra_priority = 400,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
.cra_blocksize = TF_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct async_helper_ctx),
crypto: twofish - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Twofish block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) twofish-avx-x86_64 vs. twofish-x86_64-3way 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.97x 1.00x 0.95x 0.97x 0.97x 0.96x 0.95x 0.95x 0.98x 64B 0.99x 0.99x 1.00x 0.99x 0.98x 0.98x 0.99x 0.98x 0.99x 0.98x 256B 1.20x 1.21x 1.00x 1.19x 1.15x 1.14x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.24x 1.26x 1.28x 1.26x 1.27x 8192B 1.31x 1.32x 1.00x 1.31x 1.25x 1.25x 1.28x 1.29x 1.28x 1.30x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.96x 1.00x 0.96x 0.97x 0.98x 0.95x 0.95x 0.95x 0.96x 64B 1.00x 0.99x 1.00x 0.98x 0.98x 1.01x 0.98x 0.98x 0.98x 0.98x 256B 1.20x 1.21x 1.00x 1.21x 1.15x 1.15x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.23x 1.26x 1.27x 1.26x 1.27x 8192B 1.31x 1.33x 1.00x 1.31x 1.26x 1.26x 1.29x 1.29x 1.28x 1.30x twofish-avx-x86_64 vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.19x 1.63x ecb-dec 1.18x 1.62x cbc-enc 0.75x 1.03x cbc-dec 1.23x 1.67x ctr-enc 1.24x 1.65x ctr-dec 1.24x 1.65x lrw-enc 1.15x 1.53x lrw-dec 1.14x 1.52x xts-enc 1.16x 1.56x xts-dec 1.16x 1.56x Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2012-05-28 13:54:24 +00:00
.cra_alignmask = 0,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
.cra_init = ablk_init,
.cra_exit = ablk_exit,
.cra_u = {
.ablkcipher = {
.min_keysize = TF_MIN_KEY_SIZE,
.max_keysize = TF_MAX_KEY_SIZE,
.ivsize = TF_BLOCK_SIZE,
.setkey = ablk_set_key,
.encrypt = __ablk_encrypt,
.decrypt = ablk_decrypt,
},
},
}, {
.cra_name = "ctr(twofish)",
.cra_driver_name = "ctr-twofish-avx",
.cra_priority = 400,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct async_helper_ctx),
crypto: twofish - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Twofish block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) twofish-avx-x86_64 vs. twofish-x86_64-3way 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.97x 1.00x 0.95x 0.97x 0.97x 0.96x 0.95x 0.95x 0.98x 64B 0.99x 0.99x 1.00x 0.99x 0.98x 0.98x 0.99x 0.98x 0.99x 0.98x 256B 1.20x 1.21x 1.00x 1.19x 1.15x 1.14x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.24x 1.26x 1.28x 1.26x 1.27x 8192B 1.31x 1.32x 1.00x 1.31x 1.25x 1.25x 1.28x 1.29x 1.28x 1.30x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.96x 1.00x 0.96x 0.97x 0.98x 0.95x 0.95x 0.95x 0.96x 64B 1.00x 0.99x 1.00x 0.98x 0.98x 1.01x 0.98x 0.98x 0.98x 0.98x 256B 1.20x 1.21x 1.00x 1.21x 1.15x 1.15x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.23x 1.26x 1.27x 1.26x 1.27x 8192B 1.31x 1.33x 1.00x 1.31x 1.26x 1.26x 1.29x 1.29x 1.28x 1.30x twofish-avx-x86_64 vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.19x 1.63x ecb-dec 1.18x 1.62x cbc-enc 0.75x 1.03x cbc-dec 1.23x 1.67x ctr-enc 1.24x 1.65x ctr-dec 1.24x 1.65x lrw-enc 1.15x 1.53x lrw-dec 1.14x 1.52x xts-enc 1.16x 1.56x xts-dec 1.16x 1.56x Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2012-05-28 13:54:24 +00:00
.cra_alignmask = 0,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
.cra_init = ablk_init,
.cra_exit = ablk_exit,
.cra_u = {
.ablkcipher = {
.min_keysize = TF_MIN_KEY_SIZE,
.max_keysize = TF_MAX_KEY_SIZE,
.ivsize = TF_BLOCK_SIZE,
.setkey = ablk_set_key,
.encrypt = ablk_encrypt,
.decrypt = ablk_encrypt,
.geniv = "chainiv",
},
},
}, {
.cra_name = "lrw(twofish)",
.cra_driver_name = "lrw-twofish-avx",
.cra_priority = 400,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
.cra_blocksize = TF_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct async_helper_ctx),
crypto: twofish - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Twofish block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) twofish-avx-x86_64 vs. twofish-x86_64-3way 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.97x 1.00x 0.95x 0.97x 0.97x 0.96x 0.95x 0.95x 0.98x 64B 0.99x 0.99x 1.00x 0.99x 0.98x 0.98x 0.99x 0.98x 0.99x 0.98x 256B 1.20x 1.21x 1.00x 1.19x 1.15x 1.14x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.24x 1.26x 1.28x 1.26x 1.27x 8192B 1.31x 1.32x 1.00x 1.31x 1.25x 1.25x 1.28x 1.29x 1.28x 1.30x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.96x 1.00x 0.96x 0.97x 0.98x 0.95x 0.95x 0.95x 0.96x 64B 1.00x 0.99x 1.00x 0.98x 0.98x 1.01x 0.98x 0.98x 0.98x 0.98x 256B 1.20x 1.21x 1.00x 1.21x 1.15x 1.15x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.23x 1.26x 1.27x 1.26x 1.27x 8192B 1.31x 1.33x 1.00x 1.31x 1.26x 1.26x 1.29x 1.29x 1.28x 1.30x twofish-avx-x86_64 vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.19x 1.63x ecb-dec 1.18x 1.62x cbc-enc 0.75x 1.03x cbc-dec 1.23x 1.67x ctr-enc 1.24x 1.65x ctr-dec 1.24x 1.65x lrw-enc 1.15x 1.53x lrw-dec 1.14x 1.52x xts-enc 1.16x 1.56x xts-dec 1.16x 1.56x Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2012-05-28 13:54:24 +00:00
.cra_alignmask = 0,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
.cra_init = ablk_init,
.cra_exit = ablk_exit,
.cra_u = {
.ablkcipher = {
.min_keysize = TF_MIN_KEY_SIZE +
TF_BLOCK_SIZE,
.max_keysize = TF_MAX_KEY_SIZE +
TF_BLOCK_SIZE,
.ivsize = TF_BLOCK_SIZE,
.setkey = ablk_set_key,
.encrypt = ablk_encrypt,
.decrypt = ablk_decrypt,
},
},
}, {
.cra_name = "xts(twofish)",
.cra_driver_name = "xts-twofish-avx",
.cra_priority = 400,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
.cra_blocksize = TF_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct async_helper_ctx),
crypto: twofish - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Twofish block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) twofish-avx-x86_64 vs. twofish-x86_64-3way 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.97x 1.00x 0.95x 0.97x 0.97x 0.96x 0.95x 0.95x 0.98x 64B 0.99x 0.99x 1.00x 0.99x 0.98x 0.98x 0.99x 0.98x 0.99x 0.98x 256B 1.20x 1.21x 1.00x 1.19x 1.15x 1.14x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.24x 1.26x 1.28x 1.26x 1.27x 8192B 1.31x 1.32x 1.00x 1.31x 1.25x 1.25x 1.28x 1.29x 1.28x 1.30x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.96x 1.00x 0.96x 0.97x 0.98x 0.95x 0.95x 0.95x 0.96x 64B 1.00x 0.99x 1.00x 0.98x 0.98x 1.01x 0.98x 0.98x 0.98x 0.98x 256B 1.20x 1.21x 1.00x 1.21x 1.15x 1.15x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.23x 1.26x 1.27x 1.26x 1.27x 8192B 1.31x 1.33x 1.00x 1.31x 1.26x 1.26x 1.29x 1.29x 1.28x 1.30x twofish-avx-x86_64 vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.19x 1.63x ecb-dec 1.18x 1.62x cbc-enc 0.75x 1.03x cbc-dec 1.23x 1.67x ctr-enc 1.24x 1.65x ctr-dec 1.24x 1.65x lrw-enc 1.15x 1.53x lrw-dec 1.14x 1.52x xts-enc 1.16x 1.56x xts-dec 1.16x 1.56x Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2012-05-28 13:54:24 +00:00
.cra_alignmask = 0,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
.cra_init = ablk_init,
.cra_exit = ablk_exit,
.cra_u = {
.ablkcipher = {
.min_keysize = TF_MIN_KEY_SIZE * 2,
.max_keysize = TF_MAX_KEY_SIZE * 2,
.ivsize = TF_BLOCK_SIZE,
.setkey = ablk_set_key,
.encrypt = ablk_encrypt,
.decrypt = ablk_decrypt,
},
},
} };
static int __init twofish_init(void)
{
u64 xcr0;
if (!cpu_has_avx || !cpu_has_osxsave) {
printk(KERN_INFO "AVX instructions are not detected.\n");
return -ENODEV;
}
xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
printk(KERN_INFO "AVX detected but unusable.\n");
return -ENODEV;
}
return crypto_register_algs(twofish_algs, ARRAY_SIZE(twofish_algs));
}
static void __exit twofish_exit(void)
{
crypto_unregister_algs(twofish_algs, ARRAY_SIZE(twofish_algs));
}
module_init(twofish_init);
module_exit(twofish_exit);
MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX optimized");
MODULE_LICENSE("GPL");
MODULE_ALIAS("twofish");