/******************************************************************************
*
* M4RI: Linear Algebra over GF(2)
*
* Copyright (C) 2007 Gregory Bard <gregory.bard@ieee.org>
* Copyright (C) 2009-2013 Martin Albrecht <martinralbrecht+m4ri@googlemail.com>
* Copyright (C) 2011 Carlo Wood <carlo@alinoe.com>
*
* Distributed under the terms of the GNU General Public License (GPL)
* version 2 or higher.
*
* This code is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* The full text of the GPL is available at:
*
* http://www.gnu.org/licenses/
******************************************************************************/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#ifdef __M4RI_HAVE_LIBPNG
#include <png.h>
#endif

#include <stdlib.h>
#include <string.h>
#include "mzd.h"
#include "parity.h"
#include "mmc.h"


/**
 * \brief Cache of mzd_t containers
 */

typedef struct mzd_t_cache {
  mzd_t mzd[64];            /*!< cached matrices */
  struct mzd_t_cache *prev; /*!< previous block */
  struct mzd_t_cache *next; /*!< next block */
  uint64_t used;            /*!< bitmask indicating which matrices in this block are used */
  unsigned char padding[sizeof(mzd_t) - 2 * sizeof(struct mzd_t_cache*) - sizeof(uint64_t)]; /*!< alignment */
#ifdef __GNUC__
} mzd_t_cache_t __attribute__ ((__aligned__ (64)));
#else
} mzd_t_cache_t;
#endif

#define __M4RI_MZD_T_CACHE_MAX 16
static mzd_t_cache_t mzd_cache;
static mzd_t_cache_t* current_cache = &mzd_cache;

static int log2_floor(uint64_t v) {
  static uint64_t const b[] = { 0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000, 0xFFFFFFFF00000000 };
  static unsigned int const S[] = { 1, 2, 4, 8, 16, 32 };
  unsigned int r = 0;
  for (int i = 5; i >= 0; --i)
  {
    if ((v & b[i]))
    {
      v >>= S[i];
      r |= S[i];
    }
  }
  return r;
}
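
/*
 * Worked example (illustrative comment, not part of the original source):
 * log2_floor() repeatedly tests whether v has a bit set in the upper half of
 * the current scale, shifting it down and recording the shift. For
 * v = 0x50 (decimal 80):
 *
 *   i = 2: v & 0xF0 != 0  ->  v >>= 4, r = 4   (v is now 0x5)
 *   i = 1: v & 0xC  != 0  ->  v >>= 2, r = 6   (v is now 0x1)
 *   i = 0: v & 0x2  == 0
 *
 * so log2_floor(80) == 6, matching floor(log2(80)). mzd_t_malloc() below
 * applies it to ~used to locate a free slot in the cache bitmask.
 */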

/*
 * Return a pointer to a new mzd_t structure.
 * The structure will be 64 byte aligned.
 * Call mzd_t_free to free the structure for next use.
 */

static mzd_t* mzd_t_malloc() {
#if __M4RI_HAVE_OPENMP
  return (mzd_t*)m4ri_mm_malloc(sizeof(mzd_t));
#else
  mzd_t *ret = NULL;
  int i = 0;

  if (current_cache->used == (uint64_t)-1) {
    mzd_t_cache_t *cache = &mzd_cache;
    while (cache && cache->used == (uint64_t)-1) {
      current_cache = cache;
      cache = cache->next;
      i++;
    }
    if (!cache && i < __M4RI_MZD_T_CACHE_MAX) {
      cache = (mzd_t_cache_t*)m4ri_mm_malloc_aligned(sizeof(mzd_t_cache_t), 64);
      memset((char*)cache, 0, sizeof(mzd_t_cache_t));

      cache->prev = current_cache;
      current_cache->next = cache;
      current_cache = cache;
    } else if (!cache && i >= __M4RI_MZD_T_CACHE_MAX) {
      /* We have reached the upper limit on the number of caches */
      ret = (mzd_t*)m4ri_mm_malloc(sizeof(mzd_t));
    } else {
      current_cache = cache;
    }
  }
  if (ret == NULL) {
    int free_entry = log2_floor(~current_cache->used);
    current_cache->used |= ((uint64_t)1 << free_entry);
    ret = &current_cache->mzd[free_entry];
  }
  return ret;
#endif //__M4RI_HAVE_OPENMP
}

static void mzd_t_free(mzd_t *M) {
#if __M4RI_HAVE_OPENMP
  m4ri_mm_free(M);
#else
  int foundit = 0;
  mzd_t_cache_t *cache = &mzd_cache;
  while(cache) {
    size_t entry = M - cache->mzd;
    if (entry < 64) {
      cache->used &= ~((uint64_t)1 << entry);
      if (cache->used == 0) {
        if (cache == &mzd_cache) {
          current_cache = cache;
        } else {
          if (cache == current_cache) {
            current_cache = cache->prev;
          }
          cache->prev->next = cache->next;
          if (cache->next)
            cache->next->prev = cache->prev;
          m4ri_mm_free(cache);
        }
      }
      foundit = 1;
      break;
    }
    cache = cache->next;
  }
  if(!foundit) {
    m4ri_mm_free(M);
  }
#endif
}

mzd_t *mzd_init(rci_t r, rci_t c) {
  assert(sizeof(mzd_t) == 64);

  mzd_t *A = mzd_t_malloc();

  A->nrows = r;
  A->ncols = c;
  A->width = (c + m4ri_radix - 1) / m4ri_radix;
  A->rowstride = (A->width < mzd_paddingwidth || (A->width & 1) == 0) ? A->width : A->width + 1;
  A->high_bitmask = __M4RI_LEFT_BITMASK(c % m4ri_radix);
  A->flags = (A->high_bitmask != m4ri_ffff) ? mzd_flag_nonzero_excess : 0;
  A->offset_vector = 0;
  A->row_offset = 0;

  A->rows = (word**)m4ri_mmc_calloc(r + 1, sizeof(word*)); // We're overcommitting here.

  if (r && c) {
    int blockrows = __M4RI_MAX_MZD_BLOCKSIZE / A->rowstride;
    A->blockrows_log = 0;
    while(blockrows >>= 1)
      A->blockrows_log++;
    blockrows = 1 << A->blockrows_log;

    int const blockrows_mask = blockrows - 1;
    int const nblocks = (r + blockrows - 1) / blockrows;
    A->flags |= (nblocks > 1) ? mzd_flag_multiple_blocks : 0;
    A->blocks = (mzd_block_t*)m4ri_mmc_calloc(nblocks + 1, sizeof(mzd_block_t));

    size_t block_words = (r - (nblocks - 1) * blockrows) * A->rowstride;
    for(int i = nblocks - 1; i >= 0; --i) {
      A->blocks[i].size = block_words * sizeof(word);
      A->blocks[i].begin = (word*)m4ri_mmc_calloc(1, A->blocks[i].size);
      A->blocks[i].end = A->blocks[i].begin + block_words;
      block_words = blockrows * A->rowstride;
    }

    for(rci_t i = 0; i < A->nrows; ++i) {
      A->rows[i] = A->blocks[i >> A->blockrows_log].begin + (i & blockrows_mask) * A->rowstride;
    }

  } else {
    A->blocks = NULL;
  }

  return A;
}

/*
   Explanation of offset_vector (in words), and row_offset.

 <------------------------------- row_stride (in words) -------------------->
 .---------------------------------------------------------------------------. <-- m->blocks[0].begin          ^
 |                ^                                       _/                  |                                 |
 |  m->row_offset |                     m->offset_vector _/                   |                                 |
 |                v                                     _/                    |                                 |
 |  .--------------------------------------------------v<---------------- <--|---- m->rows[0]                  |_ skipped_blocks (in blocks)
 |  | m (also a window)            ^                                      |   |                                 |
 |  |                              |                                      |   |                                 |
 `---------------------------------|------------------------------------------'                                v
 .---------------------------------|----------------------------------------_. <-- m->blocks[1].begin     <-- window->blocks[0].begin
 |  |                              | ^ lowr                               |_^ |
 |  |      window->row_offset      | |      window->offset_vector       _-^   |
 |  |                              v v                                _-^     |
 |  |  .----------------------------------------------------------v<--.   <--|---- m->rows[lowr]
 |  |  | window                                                     `-|-------|---- window->rows[0]
 |  |  |                                                               |      |
 `----------------------------------------------------------------------------'
 .----------------------------------------------------------------------------. <-- m->blocks[2].begin     <-- window->blocks[1].begin
 |  |  |                                                               |      |
 |  |  |  lowc                                                         |      |
 |  |  |<------>                                                       |      |
 |  |  |        \__ also wrd_offset (in words)                         |      |
 |  |  `---------------------------------------------------------------'     |
 |  `--------------------------------------------------------------------'   |
 `----------------------------------------------------------------------------'
 .----------------------------------------------------------------------------.
 |                                                                            |

*/
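
/*
 * Usage sketch (illustrative comment, not part of the original source): a
 * window is a view that shares the parent's memory blocks, so writes through
 * the window modify the parent in place. lowc must be a multiple of
 * m4ri_radix (64) so that window rows start on a word boundary.
 *
 *   mzd_t *M = mzd_init(256, 256);
 *   mzd_randomize(M);
 *   mzd_t *W = mzd_init_window(M, 64, 64, 192, 192); // 128 x 128 view
 *   mzd_set_ui(W, 0);  // clears the middle 128 x 128 block of M
 *   mzd_free(W);       // the blocks stay owned by M
 *   mzd_free(M);
 */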

mzd_t *mzd_init_window(mzd_t *M, rci_t lowr, rci_t lowc, rci_t highr, rci_t highc) {
  assert(lowc % m4ri_radix == 0);

  mzd_t *W = mzd_t_malloc();

  rci_t nrows = MIN(highr - lowr, M->nrows - lowr);
  rci_t ncols = highc - lowc;

  W->nrows = nrows;
  W->ncols = ncols;
  W->rowstride = M->rowstride;
  W->width = (ncols + m4ri_radix - 1) / m4ri_radix;
  W->high_bitmask = __M4RI_LEFT_BITMASK(ncols % m4ri_radix);

  W->flags = mzd_flag_windowed_zerooffset;
  W->flags |= (ncols % m4ri_radix == 0) ? mzd_flag_windowed_zeroexcess : mzd_flag_nonzero_excess;
  W->blockrows_log = M->blockrows_log;

  wi_t const blockrows_mask = (1 << W->blockrows_log) - 1;
  int const skipped_blocks = (M->row_offset + lowr) >> W->blockrows_log;
  assert(skipped_blocks == 0 || ((M->flags & mzd_flag_multiple_blocks)));
  W->row_offset = (M->row_offset + lowr) & blockrows_mask;
  W->blocks = &M->blocks[skipped_blocks];
  wi_t const wrd_offset = lowc / m4ri_radix;
  W->offset_vector = (M->offset_vector + wrd_offset) + (W->row_offset - M->row_offset) * W->rowstride;
  if(nrows)
    W->rows = (word**)m4ri_mmc_calloc(nrows + 1, sizeof(word*));
  else
    W->rows = NULL;
  for(rci_t i = 0; i < nrows; ++i) {
    W->rows[i] = M->rows[lowr + i] + wrd_offset;
  }
  if (mzd_row_to_block(W, nrows - 1) > 0)
    W->flags |= M->flags & mzd_flag_multiple_blocks;

  /* offset_vector is the distance from the start of the first block to the first word of the first row. */
  assert(nrows == 0 || W->blocks[0].begin + W->offset_vector == W->rows[0]);

  __M4RI_DD_MZD(W);
  return W;
}

void mzd_free(mzd_t *A) {
  if(A->rows)
    m4ri_mmc_free(A->rows, (A->nrows + 1) * sizeof(word*));
  if(mzd_owns_blocks(A)) {
    int i;
    for(i = 0; A->blocks[i].size; ++i) {
      m4ri_mmc_free(A->blocks[i].begin, A->blocks[i].size);
    }
    m4ri_mmc_free(A->blocks, (i + 1) * sizeof(mzd_block_t));
  }
  mzd_t_free(A);
}

void mzd_row_add(mzd_t *M, rci_t sourcerow, rci_t destrow) {
  mzd_row_add_offset(M, destrow, sourcerow, 0);
}

void mzd_row_clear_offset(mzd_t *M, rci_t row, rci_t coloffset) {
  wi_t const startblock = coloffset / m4ri_radix;
  word temp;

  /* make sure to start clearing at coloffset */
  if (coloffset % m4ri_radix) {
    temp = M->rows[row][startblock];
    temp &= __M4RI_RIGHT_BITMASK(m4ri_radix - coloffset);
  } else {
    temp = 0;
  }
  M->rows[row][startblock] = temp;
  for (wi_t i = startblock + 1; i < M->width; ++i) {
    M->rows[row][i] = 0;
  }

  __M4RI_DD_ROW(M, row);
}


rci_t mzd_gauss_delayed(mzd_t *M, rci_t startcol, int full) {
  rci_t startrow = startcol;
  rci_t pivots = 0;
  for (rci_t i = startcol; i < M->ncols; ++i) {
    for(rci_t j = startrow; j < M->nrows; ++j) {
      if (mzd_read_bit(M, j, i)) {
        mzd_row_swap(M, startrow, j);
        ++pivots;

        for(rci_t ii = full ? 0 : startrow + 1; ii < M->nrows; ++ii) {
          if (ii != startrow) {
            if (mzd_read_bit(M, ii, i)) {
              mzd_row_add_offset(M, ii, startrow, i);
            }
          }
        }
        startrow = startrow + 1;
        break;
      }
    }
  }

  __M4RI_DD_MZD(M);
  __M4RI_DD_RCI(pivots);
  return pivots;
}

rci_t mzd_echelonize_naive(mzd_t *M, int full) {
  return mzd_gauss_delayed(M, 0, full);
}
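
/*
 * Usage sketch (illustrative comment, not part of the original source):
 * computing the rank of a matrix via the naive echelon form. With
 * full == TRUE the matrix is also reduced above the pivots (RREF); with
 * full == FALSE only below (REF). The return value is the number of
 * pivots, i.e. the rank.
 *
 *   mzd_t *A = mzd_init(100, 120);
 *   mzd_randomize(A);
 *   rci_t rank = mzd_echelonize_naive(A, TRUE);  // A is replaced by its RREF
 *   mzd_free(A);
 */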

/**
 * Transpose a 64 x 64 matrix with width 1.
 *
 * \param dst First word of destination matrix.
 * \param src First word of source matrix.
 * \param rowstride_dst Rowstride of matrix dst.
 * \param rowstride_src Rowstride of matrix src.
 *
 * Rows of both matrices are expected to fit exactly in a word (offset == 0)
 * and lie entirely inside a single block.
 *
 * \note This function also works when dst == src.
 */

static inline void _mzd_copy_transpose_64x64(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src)
{
  /*
   * m runs over the values:
   *   0x00000000FFFFFFFF
   *   0x0000FFFF0000FFFF
   *   0x00FF00FF00FF00FF
   *   0x0F0F0F0F0F0F0F0F
   *   0x3333333333333333
   *   0x5555555555555555,
   * alternating j zeroes with j ones.
   *
   * Assume we have a matrix consisting of four jxj matrices ((0,0) is in the top-right corner,
   * this is the memory-model view, see the layout on http://m4ri.sagemath.org/doxygen/structmzd__t.html):
   *   ...[A1][B1][A0][B0]
   *   ...[C1][D1][C0][D0]
   *          .  [A2][B2]
   *          .  [C2][D2]
   *          .       .
   *                  .
   * The following calculates the XOR between A and D,
   * and subsequently applies that to A and D respectively,
   * swapping A and D as a result.
   * Therefore wk starts at the first row and then has rowstride
   * added j times, running over the rows of A, then skips C
   * by adding j * rowstride to continue with the next A below C.
   */

  word m = __M4RI_CONVERT_TO_WORD(0xFFFFFFFF);
  wi_t j_rowstride_dst = rowstride_dst * 64;
  wi_t j_rowstride_src = rowstride_src * 32;
  word* const end = dst + j_rowstride_dst;
  // We start with j = 32, and a one-time unrolled loop, where
  // we copy from src and write the result to dst, swapping
  // the two 32x32 corner matrices.
  int j = 32;
  j_rowstride_dst >>= 1;
  word* RESTRICT wk = dst;
  for (word const* RESTRICT wks = src; wk < end; wk += j_rowstride_dst, wks += j_rowstride_src) {
    for (int k = 0; k < j; ++k, wk += rowstride_dst, wks += rowstride_src) {
      word xor = ((*wks >> j) ^ *(wks + j_rowstride_src)) & m;
      *wk = *wks ^ (xor << j);
      *(wk + j_rowstride_dst) = *(wks + j_rowstride_src) ^ xor;
    }
  }
  // Next we work in-place in dst and swap the corners of
  // each of the last matrices, all in parallel, for all
  // remaining values of j.
  m ^= m << 16;
  for (j = 16; j != 0; j = j >> 1, m ^= m << j) {
    j_rowstride_dst >>= 1;
    for (wk = dst; wk < end; wk += j_rowstride_dst) {
      for (int k = 0; k < j; ++k, wk += rowstride_dst) {
        word xor = ((*wk >> j) ^ *(wk + j_rowstride_dst)) & m;
        *wk ^= xor << j;
        *(wk + j_rowstride_dst) ^= xor;
      }
    }
  }
}
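
/*
 * Worked example (illustrative comment, not part of the original source):
 * one butterfly step on a 2x2 bit matrix packed into two 2-bit rows, i.e.
 * j = 1 and m = 0b01. With rows
 *
 *   w0 = [b a]   (bit 1 = b, bit 0 = a)
 *   w1 = [d c]
 *
 * the step computes xor = ((w0 >> 1) ^ w1) & 1 = b ^ c, then
 *
 *   w0 ^= xor << 1   ->  w0 = [c a]
 *   w1 ^= xor        ->  w1 = [d b]
 *
 * which swaps the off-diagonal entries b and c: exactly the transpose. The
 * function above performs this swap for j = 32, 16, ..., 1 on all sub-blocks
 * of a 64x64 bit matrix simultaneously, using the masks listed in its
 * leading comment.
 */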

/**
 * Transpose two 64 x 64 matrices with width 1.
 *
 * \param dst1 First word of destination matrix 1.
 * \param dst2 First word of destination matrix 2.
 * \param src1 First word of source matrix 1.
 * \param src2 First word of source matrix 2.
 * \param rowstride_dst Rowstride of destination matrices.
 * \param rowstride_src Rowstride of source matrices.
 *
 * Rows of all matrices are expected to fit exactly in a word (offset == 0)
 * and lie entirely inside a single block.
 *
 * \note This function also works to transpose in-place.
 */

static inline void _mzd_copy_transpose_64x64_2(word* RESTRICT dst1, word* RESTRICT dst2, word const* RESTRICT src1, word const* RESTRICT src2, wi_t rowstride_dst, wi_t rowstride_src)
{
  word m = __M4RI_CONVERT_TO_WORD(0xFFFFFFFF);
  wi_t j_rowstride_dst = rowstride_dst * 64;
  wi_t j_rowstride_src = rowstride_src * 32;
  word* const end = dst1 + j_rowstride_dst;
  int j = 32;
  word* RESTRICT wk[2];
  word const* RESTRICT wks[2];
  word xor[2];

  j_rowstride_dst >>= 1;
  wk[0] = dst1;
  wk[1] = dst2;
  wks[0] = src1;
  wks[1] = src2;

  do {

    for (int k = 0; k < j; ++k) {
      xor[0] = ((*wks[0] >> j) ^ *(wks[0] + j_rowstride_src)) & m;
      xor[1] = ((*wks[1] >> j) ^ *(wks[1] + j_rowstride_src)) & m;
      *wk[0] = *wks[0] ^ (xor[0] << j);
      *wk[1] = *wks[1] ^ (xor[1] << j);
      *(wk[0] + j_rowstride_dst) = *(wks[0] + j_rowstride_src) ^ xor[0];
      *(wk[1] + j_rowstride_dst) = *(wks[1] + j_rowstride_src) ^ xor[1];
      wk[0] += rowstride_dst;
      wk[1] += rowstride_dst;
      wks[0] += rowstride_src;
      wks[1] += rowstride_src;
    }

    wk[0] += j_rowstride_dst;
    wk[1] += j_rowstride_dst;
    wks[0] += j_rowstride_src;
    wks[1] += j_rowstride_src;

  } while(wk[0] < end);

  m ^= m << 16;
  for (j = 16; j != 0; j = j >> 1, m ^= m << j) {

    j_rowstride_dst >>= 1;
    wk[0] = dst1;
    wk[1] = dst2;

    do {

      for (int k = 0; k < j; ++k) {
        xor[0] = ((*wk[0] >> j) ^ *(wk[0] + j_rowstride_dst)) & m;
        xor[1] = ((*wk[1] >> j) ^ *(wk[1] + j_rowstride_dst)) & m;
        *wk[0] ^= xor[0] << j;
        *wk[1] ^= xor[1] << j;
        *(wk[0] + j_rowstride_dst) ^= xor[0];
        *(wk[1] + j_rowstride_dst) ^= xor[1];
        wk[0] += rowstride_dst;
        wk[1] += rowstride_dst;
      }

      wk[0] += j_rowstride_dst;
      wk[1] += j_rowstride_dst;

    } while(wk[0] < end);
  }
}

static unsigned char log2_ceil_table[64] = {
  0, 1, 2, 2, 3, 3, 3, 3,
  4, 4, 4, 4, 4, 4, 4, 4,
  5, 5, 5, 5, 5, 5, 5, 5,
  5, 5, 5, 5, 5, 5, 5, 5,
  6, 6, 6, 6, 6, 6, 6, 6,
  6, 6, 6, 6, 6, 6, 6, 6,
  6, 6, 6, 6, 6, 6, 6, 6,
  6, 6, 6, 6, 6, 6, 6, 6
};

static inline int log2_ceil(int n)
{
  return log2_ceil_table[n - 1];
}

static word const transpose_mask[6] = {
  0x5555555555555555ULL,
  0x3333333333333333ULL,
  0x0F0F0F0F0F0F0F0FULL,
  0x00FF00FF00FF00FFULL,
  0x0000FFFF0000FFFFULL,
  0x00000000FFFFFFFFULL,
};

/**
 * Transpose 64/j matrices of size jxj in parallel.
 *
 * Where j equals n rounded up to the nearest power of 2.
 * The input array t must be of size j (containing the rows i of all matrices in t[i]).
 *
 * t[0..{j-1}] = [Al]...[A1][A0]
 *
 * \param t An array of j words.
 * \param n The number of rows in each matrix.
 *
 * \return log2(j)
 */

static inline int _mzd_transpose_Nxjx64(word* RESTRICT t, int n)
{
  int j = 1;
  int mi = 0; // Index into the transpose_mask array.

  while (j < n) // Don't swap with entirely undefined data (where [D] consists entirely of non-existent rows).
  {
    // Swap 64/j matrices of size jxj in 2j rows. Thus,
    // <---- one word --->
    // [Al][Bl]...[A0][B0]
    // [Cl][Dl]...[C0][D0], where l = 64/j - 1 and each matrix [A], [B] etc is jxj.
    // Then swap [A] and [D] in-place.

    // m runs over the values in transpose_mask, so that at all
    // times m consists of j zeroes followed by j ones, repeated.
    word const m = transpose_mask[mi];
    int k = 0; // Index into t[].
    do {
      // Run over all rows of [A] and [D].
      for (int i = 0; i < j; ++i, ++k) {
        // t[k] contains row i of all [A], and t[k + j] contains row i of all [D]. Swap them.
        word xor = ((t[k] >> j) ^ t[k + j]) & m;
        t[k] ^= xor << j;
        t[k + j] ^= xor;
      }
      k += j; // Skip [C].
    } while (k < n); // Stop if we passed all valid input.

    // Double the size of j and repeat this for the next 2j rows until all
    // n rows have been swapped (possibly with non-existent rows).
    j <<= 1;
    ++mi;
  }

  return mi;
}

/**
 * Transpose a n x 64 matrix with width 1.
 *
 * \param dst First word of destination matrix.
 * \param src First word of source matrix.
 * \param rowstride_dst Rowstride of destination matrix.
 * \param rowstride_src Rowstride of source matrix.
 * \param n Number of rows in source matrix, must be less than 64.
 *
 * Rows of all matrices are expected to have offset zero
 * and lie entirely inside a single block.
 *
 * \note This function also works to transpose in-place.
 */

static inline void _mzd_copy_transpose_lt64x64(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n)
{
  // Preload the n input rows into level 1, using a minimum of cache lines (compact storage).
  word t[64];
  word const* RESTRICT wks = src;
  int k;
  for (k = 0; k < n; ++k) {
    t[k] = *wks;
    wks += rowstride_src;
  }
  if (n > 32) {
    while (k < 64)
      t[k++] = 0;
    _mzd_copy_transpose_64x64(dst, t, rowstride_dst, 1);
    return;
  }
  int log2j = _mzd_transpose_Nxjx64(t, n);
  // All output bits are now transposed, but still might need to be shifted in place.
  // What we have now is 64/j matrices of size jxj. Thus,
  // [Al]...[A1][A0], where l = 64/j - 1,
  // while the actual output is:
  // [A0]
  // [A1]
  // ...
  // [Al]
  word const m = __M4RI_LEFT_BITMASK(n);
  word* RESTRICT wk = dst;
  switch (log2j) {
  case 5:
  {
    wi_t const j_rowstride_dst = 32 * rowstride_dst;
    for (int k = 0; k < 32; ++k) {
      wk[0] = t[k] & m;
      wk[j_rowstride_dst] = (t[k] >> 32) & m;
      wk += rowstride_dst;
    }
    break;
  }
  case 4:
  {
    wi_t const j_rowstride_dst = 16 * rowstride_dst;
    for (int k = 0; k < 16; ++k) {
      wk[0] = t[k] & m;
      wk[j_rowstride_dst] = (t[k] >> 16) & m;
      wk[2 * j_rowstride_dst] = (t[k] >> 32) & m;
      wk[3 * j_rowstride_dst] = (t[k] >> 48) & m;
      wk += rowstride_dst;
    }
    break;
  }
  case 3:
  {
    wi_t const j_rowstride_dst = 8 * rowstride_dst;
    for (int k = 0; k < 8; ++k) {
      wk[0] = t[k] & m;
      wk[j_rowstride_dst] = (t[k] >> 8) & m;
      wk[2 * j_rowstride_dst] = (t[k] >> 16) & m;
      wk[3 * j_rowstride_dst] = (t[k] >> 24) & m;
      wk[4 * j_rowstride_dst] = (t[k] >> 32) & m;
      wk[5 * j_rowstride_dst] = (t[k] >> 40) & m;
      wk[6 * j_rowstride_dst] = (t[k] >> 48) & m;
      wk[7 * j_rowstride_dst] = (t[k] >> 56) & m;
      wk += rowstride_dst;
    }
    break;
  }
  case 2:
  {
    wi_t const j_rowstride_dst = 4 * rowstride_dst;
    for (int k = 0; k < 4; ++k) {
      word* RESTRICT wk2 = wk;
      word tk = t[k];
      for (int i = 0; i < 2; ++i) {
        wk2[0] = tk & m;
        wk2[j_rowstride_dst] = (tk >> 4) & m;
        wk2[2 * j_rowstride_dst] = (tk >> 8) & m;
        wk2[3 * j_rowstride_dst] = (tk >> 12) & m;
        wk2[4 * j_rowstride_dst] = (tk >> 16) & m;
        wk2[5 * j_rowstride_dst] = (tk >> 20) & m;
        wk2[6 * j_rowstride_dst] = (tk >> 24) & m;
        wk2[7 * j_rowstride_dst] = (tk >> 28) & m;
        wk2 += 8 * j_rowstride_dst;
        tk >>= 32;
      }
      wk += rowstride_dst;
    }
    break;
  }
  case 1:
  {
    wi_t const j_rowstride_dst = 2 * rowstride_dst;
    for (int k = 0; k < 2; ++k) {
      word* RESTRICT wk2 = wk;
      word tk = t[k];
      for (int i = 0; i < 8; ++i) {
        wk2[0] = tk & m;
        wk2[j_rowstride_dst] = (tk >> 2) & m;
        wk2[2 * j_rowstride_dst] = (tk >> 4) & m;
        wk2[3 * j_rowstride_dst] = (tk >> 6) & m;
        wk2 += 4 * j_rowstride_dst;
        tk >>= 8;
      }
      wk += rowstride_dst;
    }
    break;
  }
  case 0:
  {
    word* RESTRICT wk2 = wk;
    word tk = t[0];
    for (int i = 0; i < 16; ++i) {
      wk2[0] = tk & m;
      wk2[rowstride_dst] = (tk >> 1) & m;
      wk2[2 * rowstride_dst] = (tk >> 2) & m;
      wk2[3 * rowstride_dst] = (tk >> 3) & m;
      wk2 += 4 * rowstride_dst;
      tk >>= 4;
    }
    break;
  }
  }
}

/**
 * Transpose a 64 x n matrix with width 1.
 *
 * \param dst First word of destination matrix.
 * \param src First word of source matrix.
 * \param rowstride_dst Rowstride of destination matrix.
 * \param rowstride_src Rowstride of source matrix.
 * \param n Number of columns in source matrix, must be less than 64.
 *
 * Rows of all matrices are expected to have offset zero
 * and lie entirely inside a single block.
 *
 * \note This function also works to transpose in-place.
 */

static inline void _mzd_copy_transpose_64xlt64(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n)
{
  word t[64];
  int log2j = log2_ceil(n);
  word const* RESTRICT wks = src;
  switch (log2j) {
  case 6:
  {
    _mzd_copy_transpose_64x64(t, src, 1, rowstride_src);
    word* RESTRICT wk = dst;
    for (int k = 0; k < n; ++k) {
      *wk = t[k];
      wk += rowstride_dst;
    }
    return;
  }
  case 5:
  {
    wi_t const j_rowstride_src = 32 * rowstride_src;
    for (int k = 0; k < 32; ++k) {
      t[k] = wks[0] | (wks[j_rowstride_src] << 32);
      wks += rowstride_src;
    }
    break;
  }
  case 4:
  {
    wi_t const j_rowstride_src = 16 * rowstride_src;
    for (int k = 0; k < 16; ++k) {
      t[k] = wks[0] | (wks[j_rowstride_src] << 16);
      t[k] |= (wks[2 * j_rowstride_src] << 32) | (wks[3 * j_rowstride_src] << 48);
      wks += rowstride_src;
    }
    break;
  }
  case 3:
  {
    wi_t const j_rowstride_src = 8 * rowstride_src;
    word tt;
    for (int k = 0; k < 8; ++k) {
      tt = wks[0] | (wks[j_rowstride_src] << 8);
      t[k] = (wks[2 * j_rowstride_src] << 16) | (wks[3 * j_rowstride_src] << 24);
      tt |= (wks[4 * j_rowstride_src] << 32) | (wks[5 * j_rowstride_src] << 40);
      t[k] |= (wks[6 * j_rowstride_src] << 48) | (wks[7 * j_rowstride_src] << 56);
      wks += rowstride_src;
      t[k] |= tt;
    }
    break;
  }
  case 2:
  {
    word const* RESTRICT wks2 = wks + 60 * rowstride_src;
    t[0] = wks2[0];
    t[1] = wks2[rowstride_src];
    t[2] = wks2[2 * rowstride_src];
    t[3] = wks2[3 * rowstride_src];
    for (int i = 0; i < 15; ++i) {
      wks2 -= 4 * rowstride_src;
      t[0] <<= 4;
      t[1] <<= 4;
      t[2] <<= 4;
      t[3] <<= 4;
      t[0] |= wks2[0];
      t[1] |= wks2[rowstride_src];
      t[2] |= wks2[2 * rowstride_src];
      t[3] |= wks2[3 * rowstride_src];
    }
    break;
  }
  case 1:
  {
    wks += 62 * rowstride_src;
    t[0] = wks[0];
    t[1] = wks[rowstride_src];
    for (int i = 0; i < 31; ++i) {
      wks -= 2 * rowstride_src;
      t[0] <<= 2;
      t[1] <<= 2;
      t[0] |= wks[0];
      t[1] |= wks[rowstride_src];
    }
    break;
  }
  case 0:
  {
    word tt[2];
    tt[0] = wks[0];
    tt[1] = wks[rowstride_src];
    for (int i = 2; i < 64; i += 2) {
      wks += 2 * rowstride_src;
      tt[0] |= wks[0] << i;
      tt[1] |= wks[rowstride_src] << i;
    }
    *dst = tt[0] | (tt[1] << 1);
    return;
  }
  }
  int j = 1 << log2j;
  _mzd_transpose_Nxjx64(t, j);
  word* RESTRICT wk = dst;
  for (int k = 0; k < n; ++k) {
    *wk = t[k];
    wk += rowstride_dst;
  }
}

/**
 * Transpose a n x m matrix with width 1, offset 0 and m and n less than or equal 8.
 *
 * \param dst First word of destination matrix.
 * \param src First word of source matrix.
 * \param rowstride_dst Rowstride of destination matrix.
 * \param rowstride_src Rowstride of source matrix.
 * \param n Number of rows in source matrix, must be less than or equal 8.
 * \param m Number of columns in source matrix, must be less than or equal 8.
 *
 * Rows of all matrices are expected to have offset zero
 * and lie entirely inside a single block.
 *
 * \note This function also works to transpose in-place.
 */

static inline void _mzd_copy_transpose_le8xle8(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n, int m, int maxsize)
{
  int end = maxsize * 7;
  word const* RESTRICT wks = src;
  word w = *wks;
  int shift = 0;
  for (int i = 1; i < n; ++i) {
    wks += rowstride_src;
    shift += 8;
    w |= (*wks << shift);
  }
  word mask = 0x80402010080402ULL;
  word w7 = w >> 7;
  shift = 7;
  --m;
  do {
    word xor = (w ^ w7) & mask;
    mask >>= 8;
    w ^= (xor << shift);
    shift += 7;
    w7 >>= 7;
    w ^= xor;
  } while(shift < end);
  word* RESTRICT wk = dst + m * rowstride_dst;
  for (int shift = 8 * m; shift > 0; shift -= 8) {
    *wk = (unsigned char)(w >> shift);
    wk -= rowstride_dst;
  }
  *wk = (unsigned char)w;
}

/**
 * Transpose a n x m matrix with width 1, offset 0 and m and n less than or equal 16.
 *
 * \param dst First word of destination matrix.
 * \param src First word of source matrix.
 * \param rowstride_dst Rowstride of destination matrix.
 * \param rowstride_src Rowstride of source matrix.
 * \param n Number of rows in source matrix, must be less than or equal 16.
 * \param m Number of columns in source matrix, must be less than or equal 16.
 *
 * Rows of all matrices are expected to have offset zero
 * and lie entirely inside a single block.
 *
 * \note This function also works to transpose in-place.
 */

static inline void _mzd_copy_transpose_le16xle16(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n, int m, int maxsize)
{
  int end = maxsize * 3;
  word const* RESTRICT wks = src;
  word t[4];
  int i = n;
  do {
    t[0] = wks[0];
    if (--i == 0) {
      t[1] = 0;
      t[2] = 0;
      t[3] = 0;
      break;
    }
    t[1] = wks[rowstride_src];
    if (--i == 0) {
      t[2] = 0;
      t[3] = 0;
      break;
    }
    t[2] = wks[2 * rowstride_src];
    if (--i == 0) {
      t[3] = 0;
      break;
    }
    t[3] = wks[3 * rowstride_src];
    if (--i == 0)
      break;
    wks += 4 * rowstride_src;
    for(int shift = 16;; shift += 16) {
      t[0] |= (*wks << shift);
      if (--i == 0)
        break;
      t[1] |= (wks[rowstride_src] << shift);
      if (--i == 0)
        break;
      t[2] |= (wks[2 * rowstride_src] << shift);
      if (--i == 0)
        break;
      t[3] |= (wks[3 * rowstride_src] << shift);
      if (--i == 0)
        break;
      wks += 4 * rowstride_src;
    }
  } while(0);
  word mask = 0xF0000F0000F0ULL;
  int shift = 12;
  word xor[4];
  do {
    xor[0] = (t[0] ^ (t[0] >> shift)) & mask;
    xor[1] = (t[1] ^ (t[1] >> shift)) & mask;
    xor[2] = (t[2] ^ (t[2] >> shift)) & mask;
    xor[3] = (t[3] ^ (t[3] >> shift)) & mask;
    mask >>= 16;
    t[0] ^= (xor[0] << shift);
    t[1] ^= (xor[1] << shift);
    t[2] ^= (xor[2] << shift);
    t[3] ^= (xor[3] << shift);
    shift += 12;
    t[0] ^= xor[0];
    t[1] ^= xor[1];
    t[2] ^= xor[2];
    t[3] ^= xor[3];
  } while(shift < end);
  _mzd_transpose_Nxjx64(t, 4);
  i = m;
  word* RESTRICT wk = dst;
  do {
    wk[0] = (uint16_t)t[0];
    if (--i == 0)
      break;
    wk[rowstride_dst] = (uint16_t)t[1];
    if (--i == 0)
      break;
    wk[2 * rowstride_dst] = (uint16_t)t[2];
    if (--i == 0)
      break;
    wk[3 * rowstride_dst] = (uint16_t)t[3];
    if (--i == 0)
      break;
    wk += 4 * rowstride_dst;
    for(int shift = 16;; shift += 16) {
      wk[0] = (uint16_t)(t[0] >> shift);
      if (--i == 0)
        break;
      wk[rowstride_dst] = (uint16_t)(t[1] >> shift);
      if (--i == 0)
        break;
      wk[2 * rowstride_dst] = (uint16_t)(t[2] >> shift);
      if (--i == 0)
        break;
      wk[3 * rowstride_dst] = (uint16_t)(t[3] >> shift);
      if (--i == 0)
        break;
      wk += 4 * rowstride_dst;
    }
  } while(0);
}

/**
 * Transpose a n x m matrix with width 1, offset 0 and m and n less than or equal 32.
 *
 * \param dst First word of destination matrix.
 * \param src First word of source matrix.
 * \param rowstride_dst Rowstride of destination matrix.
 * \param rowstride_src Rowstride of source matrix.
 * \param n Number of rows in source matrix, must be less than or equal 32.
 * \param m Number of columns in source matrix, must be less than or equal 32.
 *
 * Rows of all matrices are expected to have offset zero
 * and lie entirely inside a single block.
 *
 * \note This function also works to transpose in-place.
 */

static inline void _mzd_copy_transpose_le32xle32(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n, int m)
{
  word const* RESTRICT wks = src;
  word t[16];
  int i = n;
  if (n > 16) {
    i -= 16;
    for (int j = 0; j < 16; ++j) {
      t[j] = *wks;
      wks += rowstride_src;
    }
    int j = 0;
    do {
      t[j++] |= (*wks << 32);
      wks += rowstride_src;
    } while(--i);
  } else {
    int j;
    for (j = 0; j < n; ++j) {
      t[j] = *wks;
      wks += rowstride_src;
    }
    for (; j < 16; ++j)
      t[j] = 0;
  }
  _mzd_transpose_Nxjx64(t, 16);
  int one_more = (m & 1);
  word* RESTRICT wk = dst;
  if (m > 16) {
    m -= 16;
    for (int j = 0; j < 16; j += 2) {
      *wk = (t[j] & 0xFFFF) | ((t[j] >> 16) & 0xFFFF0000);
      wk[rowstride_dst] = (t[j + 1] & 0xFFFF) | ((t[j + 1] >> 16) & 0xFFFF0000);
      wk += 2 * rowstride_dst;
    }
    for (int j = 1; j < m; j += 2) {
      *wk = ((t[j - 1] >> 16) & 0xFFFF) | ((t[j - 1] >> 32) & 0xFFFF0000);
      wk[rowstride_dst] = ((t[j] >> 16) & 0xFFFF) | ((t[j] >> 32) & 0xFFFF0000);
      wk += 2 * rowstride_dst;
    }
    if (one_more) {
      *wk = ((t[m - 1] >> 16) & 0xFFFF) | ((t[m - 1] >> 32) & 0xFFFF0000);
    }
  } else {
    for (int j = 1; j < m; j += 2) {
      *wk = (t[j - 1] & 0xFFFF) | ((t[j - 1] >> 16) & 0xFFFF0000);
      wk[rowstride_dst] = (t[j] & 0xFFFF) | ((t[j] >> 16) & 0xFFFF0000);
      wk += 2 * rowstride_dst;
    }
    if (one_more) {
      *wk = (t[m - 1] & 0xFFFF) | ((t[m - 1] >> 16) & 0xFFFF0000);
    }
  }
}

static inline void _mzd_copy_transpose_le64xle64(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n, int m)
{
  word const* RESTRICT wks = src;
  word t[64];
  int k;
  for (k = 0; k < n; ++k) {
    t[k] = *wks;
    wks += rowstride_src;
  }
  while(k < 64)
    t[k++] = 0;
  _mzd_copy_transpose_64x64(t, t, 1, 1);
  word* RESTRICT wk = dst;
  for (int k = 0; k < m; ++k) {
    *wk = t[k];
    wk += rowstride_dst;
  }
  return;
}

void _mzd_transpose_multiblock(mzd_t *DST, mzd_t const *A, word* RESTRICT* fwdp, word const* RESTRICT* fwsp, rci_t* nrowsp, rci_t* ncolsp);

mzd_t *_mzd_transpose(mzd_t *DST, mzd_t const *A) {
  assert(!mzd_is_windowed(DST) && !mzd_is_windowed(A));
  // We assume that at least 64 rows fit in a block; if
  // that is the case then each block will contain a multiple
  // of 64 rows, since blockrows is a power of 2.
  assert(A->blockrows_log >= 6 && DST->blockrows_log >= 6);

  rci_t nrows = A->nrows;
  rci_t ncols = A->ncols;
  rci_t maxsize = MAX(nrows, ncols);

  word* RESTRICT fwd = mzd_first_row(DST);
  word const* RESTRICT fws = mzd_first_row(A);

  if (maxsize >= 64) {

    // This is the most non-intrusive way to deal with the case of multiple blocks.
    // Note that this code is VERY sensitive. ANY change to _mzd_transpose can easily
    // reduce the speed for small matrices (up to 64x64) by 5 to 10%.
    int const multiple_blocks = (A->flags | DST->flags) & mzd_flag_multiple_blocks;
    if (__M4RI_UNLIKELY(multiple_blocks)) {
      word* RESTRICT non_register_fwd;
      word const* RESTRICT non_register_fws;
      rci_t non_register_nrows;
      rci_t non_register_ncols;
      _mzd_transpose_multiblock(DST, A, &non_register_fwd, &non_register_fws, &non_register_nrows, &non_register_ncols);
      fwd = non_register_fwd;
      fws = non_register_fws;
      nrows = non_register_nrows;
      ncols = non_register_ncols;
    }

    if (nrows >= 64) {
      /*
       * This is an interesting #if ...
       * I recommend to investigate the number of instructions, and the clocks per instruction,
       * as a function of various sizes of the matrix (most likely especially the number of columns
       * (the size of a row) will have influence; also always use multiples of 64 or even 128),
       * for both cases below.
       *
       * To measure this run for example:
       *
       * ./bench_mzd -m 10 -x 10 -p PAPI_TOT_INS,PAPI_L1_TCM,PAPI_L2_TCM mzd_transpose 32000 32000
       * ./bench_mzd -m 10 -x 100 -p PAPI_TOT_INS,PAPI_L1_TCM,PAPI_L2_TCM mzd_transpose 128 10240
       * etc (increase -x for smaller sizes to get better accuracy).
       *
       * --Carlo Wood
       */
#if 1
      int js = ncols & nrows & 64; // True if the total number of whole 64x64 matrices is odd.
      wi_t const rowstride_64_dst = 64 * DST->rowstride;
      word* RESTRICT fwd_current = fwd;
      word const* RESTRICT fws_current = fws;
      if (js) {
        js = 1;
        _mzd_copy_transpose_64x64(fwd, fws, DST->rowstride, A->rowstride);
        if ((nrows | ncols) == 64) {
          __M4RI_DD_MZD(DST);
          return DST;
        }
        fwd_current += rowstride_64_dst;
        ++fws_current;
      }
      rci_t const whole_64cols = ncols / 64;
      // The use of delayed and even is to avoid calling _mzd_copy_transpose_64x64_2 twice.
      // This way it can be inlined without duplicating the amount of code that has to be loaded.
      word* RESTRICT fwd_delayed = NULL;
      word const* RESTRICT fws_delayed = NULL;
      int even = 0;
      while (1)
      {
        for (int j = js; j < whole_64cols; ++j) {
          if (!even) {
            fwd_delayed = fwd_current;
            fws_delayed = fws_current;
          } else {
            _mzd_copy_transpose_64x64_2(fwd_delayed, fwd_current, fws_delayed, fws_current, DST->rowstride, A->rowstride);
          }
          fwd_current += rowstride_64_dst;
          ++fws_current;
          even = !even;
        }
        nrows -= 64;
        if (ncols % 64) {
          _mzd_copy_transpose_64xlt64(fwd + whole_64cols * rowstride_64_dst, fws + whole_64cols, DST->rowstride, A->rowstride, ncols % 64);
        }
        fwd += 1;
        fws += 64 * A->rowstride;
        if (nrows < 64)
          break;
        js = 0;
        fws_current = fws;
        fwd_current = fwd;
      }
#else
      // The same as the above, but without using _mzd_copy_transpose_64x64_2.
      wi_t const rowstride_64_dst = 64 * DST->rowstride;
      rci_t const whole_64cols = ncols / 64;
      assert(nrows >= 64);
      do {
        for (int j = 0; j < whole_64cols; ++j) {
          _mzd_copy_transpose_64x64(fwd + j * rowstride_64_dst, fws + j, DST->rowstride, A->rowstride);
        }
        nrows -= 64;
        if (ncols % 64) {
          _mzd_copy_transpose_64xlt64(fwd + whole_64cols * rowstride_64_dst, fws + whole_64cols, DST->rowstride, A->rowstride, ncols % 64);
        }
        fwd += 1;
        fws += 64 * A->rowstride;
      } while(nrows >= 64);
#endif
    }

    if (nrows == 0) {
      __M4RI_DD_MZD(DST);
      return DST;
    }

    // Transpose the remaining top rows. Now 0 < nrows < 64.

    while (ncols >= 64)
    {
      _mzd_copy_transpose_lt64x64(fwd, fws, DST->rowstride, A->rowstride, nrows);
      ncols -= 64;
      fwd += 64 * DST->rowstride;
      fws += 1;
    }

    if (ncols == 0) {
      __M4RI_DD_MZD(DST);
      return DST;
    }

    maxsize = MAX(nrows, ncols);
  }

  // Transpose the remaining corner. Now both 0 < nrows < 64 and 0 < ncols < 64.

  if (maxsize <= 8) {
    _mzd_copy_transpose_le8xle8(fwd, fws, DST->rowstride, A->rowstride, nrows, ncols, maxsize);
  }
  else if (maxsize <= 16) {
    _mzd_copy_transpose_le16xle16(fwd, fws, DST->rowstride, A->rowstride, nrows, ncols, maxsize);
  }
  else if (maxsize <= 32) {
    _mzd_copy_transpose_le32xle32(fwd, fws, DST->rowstride, A->rowstride, nrows, ncols);
  }
  else {
    _mzd_copy_transpose_le64xle64(fwd, fws, DST->rowstride, A->rowstride, nrows, ncols);
  }

  __M4RI_DD_MZD(DST);
  return DST;
}

void _mzd_transpose_multiblock(mzd_t *DST, mzd_t const *A, word* RESTRICT* fwdp, word const* RESTRICT* fwsp, rci_t* nrowsp, rci_t* ncolsp) {

  rci_t nrows = A->nrows;
  rci_t ncols = A->ncols;

  rci_t blockrows_dst = 1 << DST->blockrows_log; // The maximum number of rows in a block of DST.
  rci_t blockrows_src = 1 << A->blockrows_log;   // The maximum number of rows in a block of A.

  /* We're dividing the source matrix into blocks of multiples of 64x64, such that each
   * block fits entirely inside a single memory allocation block, both in the source
   * as well as the corresponding destination.
   *
   *                     <-------------------ncols----------------->
   *   <---------blockrows_dst------->
   *  .---------------------------------------------------------------.
   *  |P        ^  Matrix A:        | .   |Q      .       .       .   |<-^---- A->blocks[0].begin
   *  |         |                   | .   |       .       .       .   |  |
   *  |         |                   | .   |       .       .       .   |  |
   *  |         |                   |- - -|- - - - - - - - - - - - - -|  |
   *  |         |                   | .   |   ^   .       .       .   |  |
   *  |         |                   | .   |<64x64>.       .       .   |  |
   *  |         |                   | .   |   v   .       .       .   |  |
   *  |         |                   |- - -|- - - - - - - - - - - - - -|  |- blockrows_src
   *  |         |                   | .   |       .       .       .   |  |
   *  |         |                   | .   |       .       .       .   |  |
   *  |         |                   | .   |       .       .       .   |  |
   *  |         | nrows             |- - -|- - - - - - - - - - - - - -|  |
   *  |         |                   | .   |       .       .       .   |  |
   *  |         |                   | .   |       .       .       .   |  |
   *  |         |                   | .   |       .       .       .   |  v
   *  |=========+=====================================================|
   *  |R        |                   | .   |S      .       .       .   |<------ A->blocks[1].begin
   *  |         |                   | .   |       .       .       .   |
   *  |         |                   | .   |       .       .       .   |
   *  |         |                   |- - -|- - - - - - - - - - - - - -|
   *  |         |                   | .   |       .       .       .   |
   *  |         |                   | .   |       .       .       .   |
   *  |         |                   | .   |       .       .       .   |
   *  |         |                   |- - -|- - - - - - - - - - - - - -|
   *  |         v                   | .   |       .       .       .   |
   *  |         `-----------------------------------------------------|
   *  |                                                               |
   *  |                                                               |
   *  |                                                               |
   *  |                                                               |
   *  |                                                               |
   *  `---------------------------------------------------------------'
   *
   * Imagine this also to be the memory map of DST, which then would be
   * mirrored in the diagonal line from the top/right to the bottom/left.
   * Then each of the squares P, Q, R and S lies entirely inside one
   * memory block in both the source as well as the destination matrix.
   * P and Q are really the same block for matrix A (as are R and S),
   * while P and R (and Q and S) are really the same block for DST.
   *
   * We're going to run over the top/right corners of each of these
   * memory "blocks" and then transpose it, one by one, running
   * from right to left and top to bottom. The last one (R) will be
   * done by the calling function, so we just return when we get there.
   */

  rci_t R_top = (nrows >> A->blockrows_log) << A->blockrows_log;
  rci_t R_right = (ncols >> DST->blockrows_log) << DST->blockrows_log;
  for (rci_t col = 0; col < ncols; col += blockrows_dst) {
    rci_t end = (col == R_right) ? R_top : nrows;
    for (rci_t row = 0; row < end; row += blockrows_src) {

      rci_t nrowsb = (row < R_top) ? blockrows_src : (nrows - R_top);
      rci_t ncolsb = (col < R_right) ? blockrows_dst : (ncols - R_right);
      word const* RESTRICT fws = mzd_row(A, row) + col / m4ri_radix;
      word* RESTRICT fwd = mzd_row(DST, col) + row / m4ri_radix;

      // The following code is (almost) duplicated from _mzd_transpose.
      if (nrowsb >= 64) {

        int js = ncolsb & nrowsb & 64; // True if the total number of whole 64x64 matrices is odd.
        wi_t const rowstride_64_dst = 64 * DST->rowstride;
        word* RESTRICT fwd_current = fwd;
        word const* RESTRICT fws_current = fws;
        if (js) {
          js = 1;
          _mzd_copy_transpose_64x64(fwd, fws, DST->rowstride, A->rowstride);
          fwd_current += rowstride_64_dst;
          ++fws_current;
        }
        rci_t const whole_64cols = ncolsb / 64;
        // The use of delayed and even is to avoid calling _mzd_copy_transpose_64x64_2 twice.
        // This way it can be inlined without duplicating the amount of code that has to be loaded.
        word* RESTRICT fwd_delayed = NULL;
        word const* RESTRICT fws_delayed = NULL;
        int even = 0;
        while (1)
        {
          for (int j = js; j < whole_64cols; ++j) {
            if (!even) {
              fwd_delayed = fwd_current;
              fws_delayed = fws_current;
            } else {
              _mzd_copy_transpose_64x64_2(fwd_delayed, fwd_current, fws_delayed, fws_current, DST->rowstride, A->rowstride);
            }
            fwd_current += rowstride_64_dst;
            ++fws_current;
            even = !even;
          }
          nrowsb -= 64;
          if (ncolsb % 64) {
            _mzd_copy_transpose_64xlt64(fwd + whole_64cols * rowstride_64_dst, fws + whole_64cols, DST->rowstride, A->rowstride, ncolsb % 64);
          }
          fwd += 1;
          fws += 64 * A->rowstride;
          if (nrowsb < 64)
            break;
          js = 0;
          fws_current = fws;
          fwd_current = fwd;
        }
      }

      if (nrowsb == 0)
        continue;

      // Transpose the remaining top rows. Now 0 < nrowsb < 64.

      while (ncolsb >= 64)
      {
        _mzd_copy_transpose_lt64x64(fwd, fws, DST->rowstride, A->rowstride, nrowsb);
        ncolsb -= 64;
        fwd += 64 * DST->rowstride;
        fws += 1;
      }

      // This is true because if it wasn't then nrowsb would have to be 0 and we continued before already.
      assert(ncolsb == 0);
    }
  }

  *nrowsp = nrows - R_top;
  *ncolsp = ncols - R_right;
  if (R_top < nrows)
    *fwsp = mzd_row(A, R_top) + R_right / m4ri_radix;
  if (R_right < ncols)
    *fwdp = mzd_row(DST, R_right) + R_top / m4ri_radix;
}

mzd_t *mzd_transpose(mzd_t *DST, mzd_t const *A) {
  if (DST == NULL) {
    DST = mzd_init(A->ncols, A->nrows);
  } else if (__M4RI_UNLIKELY(DST->nrows != A->ncols || DST->ncols != A->nrows)) {
    m4ri_die("mzd_transpose: Wrong size for return matrix.\n");
  } else {
    /** it seems this is taken care of in the subroutines, re-enable if running into problems **/
    //mzd_set_ui(DST,0);
  }

  if(A->nrows == 0 || A->ncols == 0)
    return mzd_copy(DST, A);

  if (__M4RI_LIKELY(!mzd_is_windowed(DST) && !mzd_is_windowed(A)))
    return _mzd_transpose(DST, A);
  int A_windowed = mzd_is_windowed(A);
  if (A_windowed)
    A = mzd_copy(NULL, A);
  if (__M4RI_LIKELY(!mzd_is_windowed(DST)))
    _mzd_transpose(DST, A);
  else {
    mzd_t *D = mzd_init(DST->nrows, DST->ncols);
    _mzd_transpose(D, A);
    mzd_copy(DST, D);
    mzd_free(D);
  }
  if (A_windowed)
    mzd_free((mzd_t*)A);
  return DST;
}
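
/*
 * Usage sketch (illustrative comment, not part of the original source):
 *
 *   mzd_t *A  = mzd_init(100, 200);
 *   mzd_randomize(A);
 *   mzd_t *AT = mzd_transpose(NULL, A);  // freshly allocated 200 x 100 matrix
 *   assert(mzd_read_bit(A, 3, 7) == mzd_read_bit(AT, 7, 3));
 *   mzd_free(AT);
 *   mzd_free(A);
 */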

mzd_t *mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B) {
  if (C == NULL) {
    C = mzd_init(A->nrows, B->ncols);
  } else {
    if (C->nrows != A->nrows || C->ncols != B->ncols) {
      m4ri_die("mzd_mul_naive: Provided return matrix has wrong dimensions.\n");
    }
  }
  if(B->ncols < m4ri_radix-10) { /* this cutoff is rather arbitrary */
    mzd_t *BT = mzd_transpose(NULL, B);
    _mzd_mul_naive(C, A, BT, 1);
    mzd_free (BT);
  } else {
    _mzd_mul_va(C, A, B, 1);
  }
  return C;
}

mzd_t *mzd_addmul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B) {
  if (C->nrows != A->nrows || C->ncols != B->ncols) {
    m4ri_die("mzd_addmul_naive: Provided return matrix has wrong dimensions.\n");
  }

  if(B->ncols < m4ri_radix-10) { /* this cutoff is rather arbitrary */
    mzd_t *BT = mzd_transpose(NULL, B);
    _mzd_mul_naive(C, A, BT, 0);
    mzd_free (BT);
  } else {
    _mzd_mul_va(C, A, B, 0);
  }
  return C;
}

mzd_t *_mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B, const int clear) {
  wi_t eol;
  word *a, *b, *c;

  if (clear) {
    word const mask_end = C->high_bitmask;
    /* improves performance on x86_64 but is not cross-platform */
    /* asm __volatile__ (".p2align 4\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop"); */
    for (rci_t i = 0; i < C->nrows; ++i) {
      wi_t j = 0;
      for (; j < C->width - 1; ++j) {
        C->rows[i][j] = 0;
      }
      C->rows[i][j] &= ~mask_end;
    }
  }

  if(C->ncols % m4ri_radix) {
    eol = (C->width - 1);
  } else {
    eol = (C->width);
  }

  word parity[64];
  for (int i = 0; i < 64; ++i) {
    parity[i] = 0;
  }
  wi_t const wide = A->width;
  int const blocksize = __M4RI_MUL_BLOCKSIZE;
  for (rci_t start = 0; start + blocksize <= C->nrows; start += blocksize) {
    for (rci_t i = start; i < start + blocksize; ++i) {
      a = A->rows[i];
      c = C->rows[i];
      for (rci_t j = 0; j < m4ri_radix * eol; j += m4ri_radix) {
        for (int k = 0; k < m4ri_radix; ++k) {
          b = B->rows[j + k];
          parity[k] = a[0] & b[0];
          for (wi_t ii = wide - 1; ii >= 1; --ii)
            parity[k] ^= a[ii] & b[ii];
        }
        c[j / m4ri_radix] ^= m4ri_parity64(parity);
      }

      if (eol != C->width) {
        word const mask_end = C->high_bitmask;
        /* improves performance on x86_64 but is not cross-platform */
        /* asm __volatile__ (".p2align 4\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop"); */
        for (int k = 0; k < (C->ncols % m4ri_radix); ++k) {
          b = B->rows[m4ri_radix * eol + k];
          parity[k] = a[0] & b[0];
          for (wi_t ii = 1; ii < A->width; ++ii)
            parity[k] ^= a[ii] & b[ii];
        }
        c[eol] ^= m4ri_parity64(parity) & mask_end;
      }
    }
  }

  for (rci_t i = C->nrows - (C->nrows % blocksize); i < C->nrows; ++i) {
    a = A->rows[i];
    c = C->rows[i];
    for (rci_t j = 0; j < m4ri_radix * eol; j += m4ri_radix) {
      for (int k = 0; k < m4ri_radix; ++k) {
        b = B->rows[j+k];
        parity[k] = a[0] & b[0];
        for (wi_t ii = wide - 1; ii >= 1; --ii)
          parity[k] ^= a[ii] & b[ii];
      }
      c[j/m4ri_radix] ^= m4ri_parity64(parity);
    }

    if (eol != C->width) {
      word const mask_end = C->high_bitmask;
      /* improves performance on x86_64 but is not cross-platform */
      /* asm __volatile__ (".p2align 4\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop"); */
      for (int k = 0; k < (C->ncols % m4ri_radix); ++k) {
        b = B->rows[m4ri_radix * eol + k];
        parity[k] = a[0] & b[0];
        for (wi_t ii = 1; ii < A->width; ++ii)
          parity[k] ^= a[ii] & b[ii];
      }
      c[eol] ^= m4ri_parity64(parity) & mask_end;
    }
  }

  __M4RI_DD_MZD(C);
  return C;
}
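
/*
 * Note on the parity trick above (illustrative comment, not part of the
 * original source): over GF(2) the entry C[i][j+k] is the parity of the
 * bitwise AND of row i of A with row j+k of B (the operand playing the
 * role of B^T here). The loop therefore accumulates one word per k, and
 * m4ri_parity64() folds all 64 accumulators at once into a single word
 * whose k-th bit is the parity of parity[k]. A scalar sketch of one entry
 * (using the GCC builtin as a stand-in for the batched m4ri_parity64):
 *
 *   word acc = a[0] & b[0];
 *   for (wi_t ii = 1; ii < wide; ++ii)
 *     acc ^= a[ii] & b[ii];
 *   int bit = __builtin_parityll(acc);  // 1 iff the rows share an odd number of common bits
 */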
|
||
|
|
||
|
mzd_t *_mzd_mul_va(mzd_t *C, mzd_t const *v, mzd_t const *A, int const clear) {
|
||
|
if(clear)
|
||
|
mzd_set_ui(C, 0);
|
||
|
|
||
|
rci_t const m = v->nrows;
|
||
|
rci_t const n = v->ncols;
|
||
|
|
||
|
for(rci_t i = 0; i < m; ++i)
|
||
|
for(rci_t j = 0; j < n; ++j)
|
||
|
if (mzd_read_bit(v,i,j))
|
||
|
mzd_combine(C,i,0, C,i,0, A,j,0);
|
||
|
|
||
|
__M4RI_DD_MZD(C);
|
||
|
return C;
|
||
|
}
|
||
|
|
||
|
void mzd_randomize(mzd_t *A) {
|
||
|
wi_t const width = A->width - 1;
|
||
|
word const mask_end = A->high_bitmask;
|
||
|
for(rci_t i = 0; i < A->nrows; ++i) {
|
||
|
for(wi_t j = 0; j < width; ++j)
|
||
|
A->rows[i][j] = m4ri_random_word();
|
||
|
A->rows[i][width] ^= (A->rows[i][width] ^ m4ri_random_word()) & mask_end;
|
||
|
}
|
||
|
|
||
|
__M4RI_DD_MZD(A);
|
||
|
}
|
||
|
|
||
|
void mzd_set_ui( mzd_t *A, unsigned int value) {
|
||
|
word const mask_end = A->high_bitmask;
|
||
|
|
||
|
for (rci_t i = 0; i < A->nrows; ++i) {
|
||
|
word *row = A->rows[i];
|
||
|
for(wi_t j = 0; j < A->width - 1; ++j)
|
||
|
row[j] = 0;
|
||
|
row[A->width - 1] &= ~mask_end;
|
||
|
}
|
||
|
|
||
|
if(value % 2 == 0) {
|
||
|
__M4RI_DD_MZD(A);
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
rci_t const stop = MIN(A->nrows, A->ncols);
|
||
|
for (rci_t i = 0; i < stop; ++i) {
|
||
|
mzd_write_bit(A, i, i, 1);
|
||
|
}
|
||
|
|
||
|
__M4RI_DD_MZD(A);
|
||
|
}
|
||
|
|
||
|
int mzd_equal(mzd_t const *A, mzd_t const *B) {
|
||
|
if (A->nrows != B->nrows) return FALSE;
|
||
|
if (A->ncols != B->ncols) return FALSE;
|
||
|
if (A == B) return TRUE;
|
||
|
|
||
|
wi_t Awidth = A->width - 1;
|
||
|
|
||
|
for (rci_t i = 0; i < A->nrows; ++i) {
|
||
|
for (wi_t j = 0; j < Awidth; ++j) {
|
||
|
if (A->rows[i][j] != B->rows[i][j])
|
||
|
return FALSE;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
word const mask_end = A->high_bitmask;
|
||
|
for (rci_t i = 0; i < A->nrows; ++i) {
|
||
|
if (((A->rows[i][Awidth] ^ B->rows[i][Awidth]) & mask_end))
|
||
|
return FALSE;
|
||
|
}
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
int mzd_cmp(mzd_t const *A, mzd_t const *B) {
|
||
|
if(A->nrows < B->nrows) return -1;
|
||
|
if(B->nrows < A->nrows) return 1;
|
||
|
if(A->ncols < B->ncols) return -1;
|
||
|
if(B->ncols < A->ncols) return 1;
|
||
|
|
||
|
const word mask_end = A->high_bitmask;
|
||
|
const wi_t n = A->width-1;
|
||
|
|
||
|
/* Columns with large index are "larger", but rows with small index
|
||
|
are more important than with large index. */
|
||
|
|
||
|
for(rci_t i=0; i<A->nrows; i++) {
|
||
|
if ((A->rows[i][n]&mask_end) < (B->rows[i][n]&mask_end))
|
||
|
return -1;
|
||
|
else if ((A->rows[i][n]&mask_end) > (B->rows[i][n]&mask_end))
|
||
|
return 1;
|
||
|
|
||
|
for(wi_t j=n-1; j>=0; j--) {
|
||
|
if (A->rows[i][j] < B->rows[i][j])
|
||
|
return -1;
|
||
|
else if (A->rows[i][j] > B->rows[i][j])
|
||
|
return 1;
|
||
|
}
|
||
|
}
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
mzd_t *mzd_copy(mzd_t *N, mzd_t const *P) {
|
||
|
if (N == P)
|
||
|
return N;
|
||
|
|
||
|
if (N == NULL) {
|
||
|
N = mzd_init(P->nrows, P->ncols);
|
||
|
} else {
|
||
|
if (N->nrows < P->nrows || N->ncols < P->ncols)
|
||
|
m4ri_die("mzd_copy: Target matrix is too small.");
|
||
|
}
|
||
|
word *p_truerow, *n_truerow;
|
||
|
wi_t const wide = P->width - 1;
|
||
|
word mask_end = P->high_bitmask;
|
||
|
for (rci_t i = 0; i < P->nrows; ++i) {
|
||
|
p_truerow = P->rows[i];
|
||
|
n_truerow = N->rows[i];
|
||
|
for (wi_t j = 0; j < wide; ++j)
|
||
|
n_truerow[j] = p_truerow[j];
|
||
|
n_truerow[wide] = (n_truerow[wide] & ~mask_end) | (p_truerow[wide] & mask_end);
|
||
|
}
|
||
|
__M4RI_DD_MZD(N);
|
||
|
return N;
|
||
|
}

/* This is sometimes called augment. */
mzd_t *mzd_concat(mzd_t *C, mzd_t const *A, mzd_t const *B) {
  if (A->nrows != B->nrows) {
    m4ri_die("mzd_concat: Bad arguments to concat!\n");
  }

  if (C == NULL) {
    C = mzd_init(A->nrows, A->ncols + B->ncols);
  } else if (C->nrows != A->nrows || C->ncols != (A->ncols + B->ncols)) {
    m4ri_die("mzd_concat: C has wrong dimension!\n");
  }

  for (rci_t i = 0; i < A->nrows; ++i) {
    word *dst_truerow = C->rows[i];
    word *src_truerow = A->rows[i];
    for (wi_t j = 0; j < A->width; ++j) {
      dst_truerow[j] = src_truerow[j];
    }
  }

  for (rci_t i = 0; i < B->nrows; ++i) {
    for (rci_t j = 0; j < B->ncols; ++j) {
      mzd_write_bit(C, i, j + A->ncols, mzd_read_bit(B, i, j));
    }
  }

  __M4RI_DD_MZD(C);
  return C;
}
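
/*
 * Usage sketch for mzd_concat (illustration only, assuming
 * <m4ri/m4ri.h> and <assert.h>): augmenting A by the identity, the same
 * construction mzd_invert_naive uses below.
 *
 * \code
 * mzd_t *A = mzd_init(4, 4);
 * mzd_randomize(A);
 * mzd_t *I = mzd_init(4, 4);
 * mzd_set_ui(I, 1);
 * mzd_t *AI = mzd_concat(NULL, A, I); // AI is 4 x 8, AI = [A | I]
 * assert(mzd_read_bit(AI, 0, 4) == 1);
 * mzd_free(A); mzd_free(I); mzd_free(AI);
 * \endcode
 */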

mzd_t *mzd_stack(mzd_t *C, mzd_t const *A, mzd_t const *B) {
  if (A->ncols != B->ncols) {
    m4ri_die("mzd_stack: A->ncols (%d) != B->ncols (%d)!\n", A->ncols, B->ncols);
  }

  if (C == NULL) {
    C = mzd_init(A->nrows + B->nrows, A->ncols);
  } else if (C->nrows != (A->nrows + B->nrows) || C->ncols != A->ncols) {
    m4ri_die("mzd_stack: C has wrong dimension!\n");
  }

  for(rci_t i = 0; i < A->nrows; ++i) {
    word *src_truerow = A->rows[i];
    word *dst_truerow = C->rows[i];
    for (wi_t j = 0; j < A->width; ++j) {
      dst_truerow[j] = src_truerow[j];
    }
  }

  for(rci_t i = 0; i < B->nrows; ++i) {
    word *dst_truerow = C->rows[A->nrows + i];
    word *src_truerow = B->rows[i];
    for (wi_t j = 0; j < B->width; ++j) {
      dst_truerow[j] = src_truerow[j];
    }
  }

  __M4RI_DD_MZD(C);
  return C;
}
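
/*
 * Usage sketch for mzd_stack (illustration only, assuming
 * <m4ri/m4ri.h> and <assert.h>): the rows of B are placed below the
 * rows of A.
 *
 * \code
 * mzd_t *A = mzd_init(2, 8);
 * mzd_t *B = mzd_init(3, 8);
 * mzd_randomize(A);
 * mzd_randomize(B);
 * mzd_t *C = mzd_stack(NULL, A, B);   // C is 5 x 8, C = [A; B]
 * assert(mzd_read_bit(C, 2, 0) == mzd_read_bit(B, 0, 0));
 * mzd_free(A); mzd_free(B); mzd_free(C);
 * \endcode
 */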

mzd_t *mzd_invert_naive(mzd_t *INV, mzd_t const *A, mzd_t const *I) {
  mzd_t *H;

  H = mzd_concat(NULL, A, I);

  rci_t x = mzd_echelonize_naive(H, TRUE);

  if (x == 0) {
    mzd_free(H);
    return NULL;
  }

  INV = mzd_submatrix(INV, H, 0, A->ncols, A->nrows, 2 * A->ncols);

  mzd_free(H);

  __M4RI_DD_MZD(INV);
  return INV;
}
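
/*
 * Usage sketch for mzd_invert_naive (illustration only, assuming
 * <m4ri/m4ri.h> and <assert.h>): Gaussian elimination on [A | I], then
 * the right half is extracted. The check below additionally assumes
 * mzd_mul_naive, the schoolbook multiplication routine from the same
 * public API. A NULL return signals failure.
 *
 * \code
 * mzd_t *A = mzd_init(8, 8);
 * mzd_t *I = mzd_init(8, 8);
 * mzd_set_ui(I, 1);
 * mzd_randomize(A);
 * mzd_t *INV = mzd_invert_naive(NULL, A, I);
 * if (INV != NULL) {
 *   mzd_t *P = mzd_mul_naive(NULL, A, INV);
 *   assert(mzd_equal(P, I));          // A * A^-1 == I
 *   mzd_free(P);
 *   mzd_free(INV);
 * }
 * mzd_free(A); mzd_free(I);
 * \endcode
 */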

mzd_t *mzd_add(mzd_t *ret, mzd_t const *left, mzd_t const *right) {
  if (left->nrows != right->nrows || left->ncols != right->ncols) {
    m4ri_die("mzd_add: rows and columns must match.\n");
  }
  if (ret == NULL) {
    ret = mzd_init(left->nrows, left->ncols);
  } else if (ret != left) {
    if (ret->nrows != left->nrows || ret->ncols != left->ncols) {
      m4ri_die("mzd_add: rows and columns of returned matrix must match.\n");
    }
  }
  return _mzd_add(ret, left, right);
}

mzd_t *_mzd_add(mzd_t *C, mzd_t const *A, mzd_t const *B) {
  rci_t const nrows = MIN(MIN(A->nrows, B->nrows), C->nrows);

  if (C == B) { /* swap */
    mzd_t const *tmp = A;
    A = B;
    B = tmp;
  }

  word const mask_end = C->high_bitmask;

  switch(A->width) {
  case 0:
    return C;
  case 1:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] ^= ((A->rows[i][0] ^ B->rows[i][0] ^ C->rows[i][0]) & mask_end);
    }
    break;
  case 2:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] ^= ((A->rows[i][1] ^ B->rows[i][1] ^ C->rows[i][1]) & mask_end);
    }
    break;
  case 3:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] ^= ((A->rows[i][2] ^ B->rows[i][2] ^ C->rows[i][2]) & mask_end);
    }
    break;
  case 4:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2];
      C->rows[i][3] ^= ((A->rows[i][3] ^ B->rows[i][3] ^ C->rows[i][3]) & mask_end);
    }
    break;
  case 5:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2];
      C->rows[i][3] = A->rows[i][3] ^ B->rows[i][3];
      C->rows[i][4] ^= ((A->rows[i][4] ^ B->rows[i][4] ^ C->rows[i][4]) & mask_end);
    }
    break;
  case 6:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2];
      C->rows[i][3] = A->rows[i][3] ^ B->rows[i][3];
      C->rows[i][4] = A->rows[i][4] ^ B->rows[i][4];
      C->rows[i][5] ^= ((A->rows[i][5] ^ B->rows[i][5] ^ C->rows[i][5]) & mask_end);
    }
    break;
  case 7:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2];
      C->rows[i][3] = A->rows[i][3] ^ B->rows[i][3];
      C->rows[i][4] = A->rows[i][4] ^ B->rows[i][4];
      C->rows[i][5] = A->rows[i][5] ^ B->rows[i][5];
      C->rows[i][6] ^= ((A->rows[i][6] ^ B->rows[i][6] ^ C->rows[i][6]) & mask_end);
    }
    break;
  case 8:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2];
      C->rows[i][3] = A->rows[i][3] ^ B->rows[i][3];
      C->rows[i][4] = A->rows[i][4] ^ B->rows[i][4];
      C->rows[i][5] = A->rows[i][5] ^ B->rows[i][5];
      C->rows[i][6] = A->rows[i][6] ^ B->rows[i][6];
      C->rows[i][7] ^= ((A->rows[i][7] ^ B->rows[i][7] ^ C->rows[i][7]) & mask_end);
    }
    break;

  default:
    for(rci_t i = 0; i < nrows; ++i) {
      mzd_combine_even(C,i,0, A,i,0, B,i,0);
    }
  }

  __M4RI_DD_MZD(C);
  return C;
}
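
/*
 * Usage sketch for mzd_add (illustration only, assuming <m4ri/m4ri.h>
 * and <assert.h>): over GF(2) addition is XOR, so A + A = 0 for any A.
 *
 * \code
 * mzd_t *A = mzd_init(100, 100);
 * mzd_randomize(A);
 * mzd_t *C = mzd_add(NULL, A, A);     // C = A + A = 0 over GF(2)
 * assert(mzd_is_zero(C));
 * mzd_free(A); mzd_free(C);
 * \endcode
 */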

mzd_t *mzd_submatrix(mzd_t *S, mzd_t const *M, rci_t const startrow, rci_t const startcol, rci_t const endrow, rci_t const endcol) {
  rci_t const nrows = endrow - startrow;
  rci_t const ncols = endcol - startcol;

  if (S == NULL) {
    S = mzd_init(nrows, ncols);
  } else if( (S->nrows < nrows) | (S->ncols < ncols) ) {
    m4ri_die("mzd_submatrix: got S with dimension %d x %d but expected %d x %d\n", S->nrows, S->ncols, nrows, ncols);
  }

  if (startcol % m4ri_radix == 0) {
    /* we start at the beginning of a word */
    wi_t const startword = startcol / m4ri_radix;

    if(ncols / m4ri_radix != 0) {
      for(rci_t x = startrow, i = 0; i < nrows; ++i, ++x) {
        memcpy(S->rows[i], M->rows[x] + startword, sizeof(word) * (ncols / m4ri_radix));
      }
    }
    if (ncols % m4ri_radix) {
      word const mask_end = __M4RI_LEFT_BITMASK(ncols % m4ri_radix);
      for(rci_t x = startrow, i = 0; i < nrows; ++i, ++x) {
        /* process remaining bits */
        word temp = M->rows[x][startword + ncols / m4ri_radix] & mask_end;
        S->rows[i][ncols / m4ri_radix] = temp;
      }
    }
  } else {
    wi_t j;
    for(rci_t i=0; i<nrows; i++) {
      for(j=0; j+m4ri_radix<=ncols; j+=m4ri_radix)
        S->rows[i][j/m4ri_radix] = mzd_read_bits(M, startrow+i, startcol+j, m4ri_radix);
      if (ncols - j) {
        /* guard the tail: when ncols is a multiple of m4ri_radix there are
           no leftover bits and j/m4ri_radix would be out of range */
        S->rows[i][j/m4ri_radix] &= ~S->high_bitmask;
        S->rows[i][j/m4ri_radix] |= mzd_read_bits(M, startrow+i, startcol+j, ncols - j) & S->high_bitmask;
      }
    }
  }
  __M4RI_DD_MZD(S);
  return S;
}
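
/*
 * Usage sketch for mzd_submatrix (illustration only, assuming
 * <m4ri/m4ri.h> and <assert.h>): the row/column ranges are half-open,
 * i.e. the window is rows [startrow, endrow) and columns
 * [startcol, endcol). A start column of 65 exercises the unaligned path.
 *
 * \code
 * mzd_t *A = mzd_init(8, 200);
 * mzd_randomize(A);
 * mzd_t *S = mzd_submatrix(NULL, A, 2, 65, 6, 130);  // 4 x 65 window
 * assert(mzd_read_bit(S, 0, 0) == mzd_read_bit(A, 2, 65));
 * mzd_free(A); mzd_free(S);
 * \endcode
 */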

void mzd_col_swap(mzd_t *M, rci_t const cola, rci_t const colb) {
  if (cola == colb)
    return;

  rci_t const _cola = cola;
  rci_t const _colb = colb;

  wi_t const a_word = _cola / m4ri_radix;
  wi_t const b_word = _colb / m4ri_radix;

  int const a_bit = _cola % m4ri_radix;
  int const b_bit = _colb % m4ri_radix;

  word* RESTRICT ptr = mzd_first_row(M);
  int max_bit = MAX(a_bit, b_bit);
  int count = mzd_rows_in_block(M, 0);
  assert(count > 0);
  int min_bit = a_bit + b_bit - max_bit;
  int block = 0;
  int offset = max_bit - min_bit;
  word mask = m4ri_one << min_bit;

  if (a_word == b_word) {
    while(1) {
      ptr += a_word;
      int fast_count = count / 4;
      int rest_count = count - 4 * fast_count;
      word xor[4];
      wi_t const rowstride = M->rowstride;
      while (fast_count--) {
        xor[0] = ptr[0];
        xor[1] = ptr[rowstride];
        xor[2] = ptr[2 * rowstride];
        xor[3] = ptr[3 * rowstride];
        xor[0] ^= xor[0] >> offset;
        xor[1] ^= xor[1] >> offset;
        xor[2] ^= xor[2] >> offset;
        xor[3] ^= xor[3] >> offset;
        xor[0] &= mask;
        xor[1] &= mask;
        xor[2] &= mask;
        xor[3] &= mask;
        xor[0] |= xor[0] << offset;
        xor[1] |= xor[1] << offset;
        xor[2] |= xor[2] << offset;
        xor[3] |= xor[3] << offset;
        ptr[0] ^= xor[0];
        ptr[rowstride] ^= xor[1];
        ptr[2 * rowstride] ^= xor[2];
        ptr[3 * rowstride] ^= xor[3];
        ptr += 4 * rowstride;
      }
      while (rest_count--) {
        word xor = *ptr;
        xor ^= xor >> offset;
        xor &= mask;
        *ptr ^= xor | (xor << offset);
        ptr += rowstride;
      }
      if ((count = mzd_rows_in_block(M, ++block)) <= 0)
        break;
      ptr = mzd_first_row_next_block(M, block);
    }
  } else {
    word* RESTRICT min_ptr;
    wi_t max_offset;
    if (min_bit == a_bit) {
      min_ptr = ptr + a_word;
      max_offset = b_word - a_word;
    } else {
      min_ptr = ptr + b_word;
      max_offset = a_word - b_word;
    }
    while(1) {
      wi_t const rowstride = M->rowstride;
      while(count--) {
        word xor = (min_ptr[0] ^ (min_ptr[max_offset] >> offset)) & mask;
        min_ptr[0] ^= xor;
        min_ptr[max_offset] ^= xor << offset;
        min_ptr += rowstride;
      }
      if ((count = mzd_rows_in_block(M, ++block)) <= 0)
        break;
      ptr = mzd_first_row_next_block(M, block);
      if (min_bit == a_bit)
        min_ptr = ptr + a_word;
      else
        min_ptr = ptr + b_word;
    }
  }

  __M4RI_DD_MZD(M);
}
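
/*
 * Usage sketch for mzd_col_swap (illustration only, assuming
 * <m4ri/m4ri.h> and <assert.h>). The implementation above avoids
 * per-bit reads and writes: for each row it XORs the two bits together,
 * masks the result, and XORs it back into both positions -- a
 * branch-free conditional swap.
 *
 * \code
 * mzd_t *A = mzd_init(4, 128);
 * mzd_randomize(A);
 * BIT b0 = mzd_read_bit(A, 0, 3);
 * BIT b1 = mzd_read_bit(A, 0, 70);    // column 70 lies in another word
 * mzd_col_swap(A, 3, 70);
 * assert(mzd_read_bit(A, 0, 3) == b1);
 * assert(mzd_read_bit(A, 0, 70) == b0);
 * mzd_free(A);
 * \endcode
 */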

int mzd_is_zero(mzd_t const *A) {
  word status = 0;
  word mask_end = A->high_bitmask;
  for (rci_t i = 0; i < A->nrows; ++i) {
    for (wi_t j = 0; j < A->width - 1; ++j)
      status |= A->rows[i][j];
    status |= A->rows[i][A->width - 1] & mask_end;
    if(status)
      return 0;
  }
  return !status;
}

void mzd_copy_row(mzd_t *B, rci_t i, mzd_t const *A, rci_t j) {
  assert(B->ncols >= A->ncols);
  wi_t const width = MIN(B->width, A->width) - 1;

  word const *a = A->rows[j];
  word *b = B->rows[i];

  word const mask_end = __M4RI_LEFT_BITMASK(A->ncols % m4ri_radix);

  if (width != 0) {
    for(wi_t k = 0; k < width; ++k)
      b[k] = a[k];
    b[width] = (b[width] & ~mask_end) | (a[width] & mask_end);

  } else {
    /* single-word case: replace the masked bits of b[0] by those of a[0] */
    b[0] = (a[0] & mask_end) | (b[0] & ~mask_end);
  }

  __M4RI_DD_ROW(B, i);
}
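
/*
 * Usage sketch for mzd_copy_row (illustration only, assuming
 * <m4ri/m4ri.h> and <assert.h>): row j of A is copied into row i of B;
 * B may have more columns than A, and the extra columns of B are left
 * untouched.
 *
 * \code
 * mzd_t *A = mzd_init(4, 64);
 * mzd_t *B = mzd_init(4, 128);
 * mzd_randomize(A);
 * mzd_copy_row(B, 0, A, 3);           // B[0, 0..63] <- A[3, 0..63]
 * assert(mzd_read_bit(B, 0, 5) == mzd_read_bit(A, 3, 5));
 * mzd_free(A); mzd_free(B);
 * \endcode
 */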

int mzd_find_pivot(mzd_t const *A, rci_t start_row, rci_t start_col, rci_t *r, rci_t *c) {
  rci_t const nrows = A->nrows;
  rci_t const ncols = A->ncols;
  word data = 0;
  rci_t row_candidate = 0;
  if(A->ncols - start_col < m4ri_radix) {
    for(rci_t j = start_col; j < A->ncols; j += m4ri_radix) {
      int const length = MIN(m4ri_radix, ncols - j);
      for(rci_t i = start_row; i < nrows; ++i) {
        word const curr_data = mzd_read_bits(A, i, j, length);
        if (m4ri_lesser_LSB(curr_data, data)) {
          row_candidate = i;
          data = curr_data;
        }
      }
      if(data) {
        *r = row_candidate;
        for(int l = 0; l < length; ++l) {
          if(__M4RI_GET_BIT(data, l)) {
            *c = j + l;
            break;
          }
        }
        __M4RI_DD_RCI(*r);
        __M4RI_DD_RCI(*c);
        __M4RI_DD_INT(1);
        return 1;
      }
    }
  } else {
    /* we definitely have more than one word */
    /* handle first word */
    int const bit_offset = (start_col % m4ri_radix);
    wi_t const word_offset = start_col / m4ri_radix;
    word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix-bit_offset);
    for(rci_t i = start_row; i < nrows; ++i) {
      word const curr_data = A->rows[i][word_offset] & mask_begin;
      if (m4ri_lesser_LSB(curr_data, data)) {
        row_candidate = i;
        data = curr_data;
        if(__M4RI_GET_BIT(data,bit_offset)) {
          break;
        }
      }
    }
    if(data) {
      *r = row_candidate;
      data >>= bit_offset;
      assert(data);
      for(int l = 0; l < (m4ri_radix - bit_offset); ++l) {
        if(__M4RI_GET_BIT(data, l)) {
          *c = start_col + l;
          break;
        }
      }
      __M4RI_DD_RCI(*r);
      __M4RI_DD_RCI(*c);
      __M4RI_DD_INT(1);
      return 1;
    }
    /* handle complete words */
    for(wi_t wi = word_offset + 1; wi < A->width - 1; ++wi) {
      for(rci_t i = start_row; i < nrows; ++i) {
        word const curr_data = A->rows[i][wi];
        if (m4ri_lesser_LSB(curr_data, data)) {
          row_candidate = i;
          data = curr_data;
          if(__M4RI_GET_BIT(data, 0))
            break;
        }
      }
      if(data) {
        *r = row_candidate;
        for(int l = 0; l < m4ri_radix; ++l) {
          if(__M4RI_GET_BIT(data, l)) {
            *c = wi * m4ri_radix + l;
            break;
          }
        }
        __M4RI_DD_RCI(*r);
        __M4RI_DD_RCI(*c);
        __M4RI_DD_INT(1);
        return 1;
      }
    }
    /* handle last word */
    int const end_offset = (A->ncols % m4ri_radix) ? (A->ncols % m4ri_radix) : m4ri_radix;
    word const mask_end = __M4RI_LEFT_BITMASK(end_offset % m4ri_radix);
    wi_t wi = A->width - 1;
    for(rci_t i = start_row; i < nrows; ++i) {
      word const curr_data = A->rows[i][wi] & mask_end;
      if (m4ri_lesser_LSB(curr_data, data)) {
        row_candidate = i;
        data = curr_data;
        if(__M4RI_GET_BIT(data,0))
          break;
      }
    }
    if(data) {
      *r = row_candidate;
      for(int l = 0; l < end_offset; ++l) {
        if(__M4RI_GET_BIT(data, l)) {
          *c = wi * m4ri_radix + l;
          break;
        }
      }
      __M4RI_DD_RCI(*r);
      __M4RI_DD_RCI(*c);
      __M4RI_DD_INT(1);
      return 1;
    }
  }
  __M4RI_DD_RCI(*r);
  __M4RI_DD_RCI(*c);
  __M4RI_DD_INT(0);
  return 0;
}
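
/*
 * Usage sketch for mzd_find_pivot (illustration only, assuming
 * <m4ri/m4ri.h> and <assert.h>): starting at (start_row, start_col) it
 * reports a nonzero entry in the smallest available column, returning 0
 * when the remaining submatrix is zero.
 *
 * \code
 * mzd_t *A = mzd_init(8, 8);
 * mzd_write_bit(A, 5, 3, 1);          // the only nonzero entry
 * rci_t r, c;
 * if (mzd_find_pivot(A, 0, 0, &r, &c)) {
 *   assert(r == 5 && c == 3);
 * }
 * mzd_free(A);
 * \endcode
 */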

#define MASK(c) (((uint64_t)(-1)) / (__M4RI_TWOPOW(__M4RI_TWOPOW(c)) + 1))
#define COUNT(x,c) (((x) & MASK(c)) + (((x) >> (__M4RI_TWOPOW(c))) & MASK(c)))

static inline int m4ri_bitcount(word w) {
  uint64_t n = __M4RI_CONVERT_TO_UINT64_T(w);
  n = COUNT(n, 0);
  n = COUNT(n, 1);
  n = COUNT(n, 2);
  n = COUNT(n, 3);
  n = COUNT(n, 4);
  n = COUNT(n, 5);
  return (int)n;
}
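
/*
 * m4ri_bitcount above is a classic SWAR population count: MASK(c)
 * selects alternating groups of 2^c bits (0x5555..., 0x3333...,
 * 0x0f0f..., and so on), and COUNT(x, c) adds each group to its
 * neighbour, so six rounds reduce 64 one-bit counters to a single sum.
 * A worked illustration (not compiled into the library, assuming
 * <assert.h>):
 *
 * \code
 * assert(m4ri_bitcount(0) == 0);
 * assert(m4ri_bitcount(0xF0F0F0F0F0F0F0F0ULL) == 32);  // 4 bits per byte
 * assert(m4ri_bitcount(~(word)0) == 64);
 * \endcode
 */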

double _mzd_density(mzd_t const *A, wi_t res, rci_t r, rci_t c) {
  size_t count = 0;
  size_t total = 0;

  if(A->width == 1) {
    for(rci_t i = r; i < A->nrows; ++i)
      for(rci_t j = c; j < A->ncols; ++j)
        if(mzd_read_bit(A, i, j))
          ++count;
    return ((double)count)/(1.0 * A->ncols * A->nrows);
  }

  if(res == 0)
    res = A->width / 100;
  if (res < 1)
    res = 1;

  for(rci_t i = r; i < A->nrows; ++i) {
    word *truerow = A->rows[i];
    for(rci_t j = c; j < m4ri_radix; ++j)
      if(mzd_read_bit(A, i, j))
        ++count;
    total += m4ri_radix;

    for(wi_t j = MAX(1, c / m4ri_radix); j < A->width - 1; j += res) {
      count += m4ri_bitcount(truerow[j]);
      total += m4ri_radix;
    }
    for(int j = 0; j < A->ncols % m4ri_radix; ++j)
      if(mzd_read_bit(A, i, m4ri_radix * (A->ncols / m4ri_radix) + j))
        ++count;
    total += A->ncols % m4ri_radix;
  }

  return (double)count / total;
}

double mzd_density(mzd_t const *A, wi_t res) {
  return _mzd_density(A, res, 0, 0);
}
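
/*
 * Usage sketch for mzd_density (illustration only, assuming
 * <m4ri/m4ri.h>): res is the sampling resolution -- only every res-th
 * word of each row is inspected, so res = 1 counts exactly while larger
 * values trade accuracy for speed on wide matrices.
 *
 * \code
 * mzd_t *A = mzd_init(1000, 1000);
 * mzd_randomize(A);
 * double d_exact  = mzd_density(A, 1);   // expected to be close to 0.5
 * double d_approx = mzd_density(A, 16);  // sampled estimate
 * (void)d_exact; (void)d_approx;
 * mzd_free(A);
 * \endcode
 */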

rci_t mzd_first_zero_row(mzd_t const *A) {
  word const mask_end = __M4RI_LEFT_BITMASK(A->ncols % m4ri_radix);
  wi_t const end = A->width - 1;
  word *row;

  for(rci_t i = A->nrows - 1; i >= 0; --i) {
    row = A->rows[i];
    /* mask the last word so stray padding bits cannot make a zero row
       look nonzero; the remaining words are complete and ORed in full */
    word tmp = row[end] & mask_end;
    for (wi_t j = 0; j < end; ++j)
      tmp |= row[j];
    if(tmp) {
      __M4RI_DD_INT(i + 1);
      return i + 1;
    }
  }
  __M4RI_DD_INT(0);
  return 0;
}
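
/*
 * Usage sketch for mzd_first_zero_row (illustration only, assuming
 * <m4ri/m4ri.h> and <assert.h>): it scans from the bottom and returns
 * the index of the first row after which all rows are zero -- for a
 * matrix in echelon form this equals the rank.
 *
 * \code
 * mzd_t *A = mzd_init(8, 8);
 * mzd_write_bit(A, 0, 0, 1);
 * mzd_write_bit(A, 1, 1, 1);          // rows 2..7 remain zero
 * assert(mzd_first_zero_row(A) == 2);
 * mzd_free(A);
 * \endcode
 */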

mzd_t *mzd_extract_u(mzd_t *U, mzd_t const *A) {
  rci_t k = MIN(A->nrows, A->ncols);
  if (U == NULL)
    U = mzd_submatrix(NULL, A, 0, 0, k, k);
  else
    assert(U->nrows == k && U->ncols == k);
  for(rci_t i=1; i<U->nrows; i++) {
    for(wi_t j=0; j<i/m4ri_radix; j++) {
      U->rows[i][j] = 0;
    }
    if(i%m4ri_radix)
      mzd_clear_bits(U, i, (i/m4ri_radix)*m4ri_radix, i%m4ri_radix);
  }
  return U;
}

mzd_t *mzd_extract_l(mzd_t *L, mzd_t const *A) {
  rci_t k = MIN(A->nrows, A->ncols);
  if (L == NULL)
    L = mzd_submatrix(NULL, A, 0, 0, k, k);
  else
    assert(L->nrows == k && L->ncols == k);
  for(rci_t i=0; i<L->nrows-1; i++) {
    if(m4ri_radix - (i+1)%m4ri_radix)
      mzd_clear_bits(L, i, i+1, m4ri_radix - (i+1)%m4ri_radix);
    for(wi_t j=(i/m4ri_radix+1); j<L->width; j++) {
      L->rows[i][j] = 0;
    }
  }
  return L;
}
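
/*
 * Usage sketch for mzd_extract_u / mzd_extract_l (illustration only,
 * assuming <m4ri/m4ri.h> and <assert.h>): both take the leading k x k
 * block of A with k = min(nrows, ncols) and zero the strictly lower
 * (respectively strictly upper) triangle, as used to unpack an in-place
 * LU-style factorisation.
 *
 * \code
 * mzd_t *A = mzd_init(8, 8);
 * mzd_randomize(A);
 * mzd_t *U = mzd_extract_u(NULL, A);
 * mzd_t *L = mzd_extract_l(NULL, A);
 * assert(mzd_read_bit(U, 5, 2) == 0);  // below the diagonal in U
 * assert(mzd_read_bit(L, 2, 5) == 0);  // above the diagonal in L
 * mzd_free(A); mzd_free(U); mzd_free(L);
 * \endcode
 */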