/******************************************************************************
*
*            M4RI: Linear Algebra over GF(2)
*
*    Copyright (C) 2007 Gregory Bard <gregory.bard@ieee.org> 
*    Copyright (C) 2009-2013 Martin Albrecht <martinralbrecht+m4ri@googlemail.com>
*    Copyright (C) 2011 Carlo Wood <carlo@alinoe.com>
*
*  Distributed under the terms of the GNU General Public License (GPL)
*  version 2 or higher.
*
*    This code is distributed in the hope that it will be useful,
*    but WITHOUT ANY WARRANTY; without even the implied warranty of
*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
*    General Public License for more details.
*
*  The full text of the GPL is available at:
*
*                  http://www.gnu.org/licenses/
******************************************************************************/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#ifdef __M4RI_HAVE_LIBPNG
#include <png.h>
#endif

#include <stdlib.h>
#include <string.h>
#include "mzd.h"
#include "parity.h"
#include "mmc.h"


/**
 * \brief Cache of mzd_t containers
 */

typedef struct mzd_t_cache {
  mzd_t mzd[64]; /*!< cached matrices */
  struct mzd_t_cache *prev; /*!< previous block */
  struct mzd_t_cache *next; /*!< next block */
  uint64_t used; /*!< bitmask marking which matrices in this block are in use */
  unsigned char padding[sizeof(mzd_t) - 2 * sizeof(struct mzd_t_cache*) - sizeof(uint64_t)]; /*!< alignment */
#ifdef __GNUC__
} mzd_t_cache_t __attribute__ ((__aligned__ (64)));
#else
} mzd_t_cache_t;
#endif

#define __M4RI_MZD_T_CACHE_MAX 16
static mzd_t_cache_t mzd_cache;
static mzd_t_cache_t* current_cache = &mzd_cache;

static int log2_floor(uint64_t v) {
  static uint64_t const b[] = { 0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000, 0xFFFFFFFF00000000 };
  static unsigned int const S[] = { 1, 2, 4, 8, 16, 32 };
  unsigned int r = 0;
  for (int i = 5; i >= 0; --i)
  {
    if ((v & b[i]))
    {
      v >>= S[i];
      r |= S[i];
    }
  }
  return r;
}
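
/*
 * Worked example (illustration only): 80 = 0b1010000, so the highest set bit
 * of 80 is bit 6 and log2_floor(80) == 6. mzd_t_malloc() below uses
 * log2_floor(~used) to pick the index of a free (zero) bit in a cache
 * block's 64-bit `used` mask.
 */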

/*
 * Return a pointer to a new mzd_t structure.
 * The structure will be 64-byte aligned.
 * Call mzd_t_free to return the structure to the cache for reuse.
 */

static mzd_t* mzd_t_malloc() {
#if __M4RI_HAVE_OPENMP
  return (mzd_t*)m4ri_mm_malloc(sizeof(mzd_t));
#else
  mzd_t *ret = NULL;
  int i = 0;

  if (current_cache->used == (uint64_t)-1) {
    mzd_t_cache_t *cache = &mzd_cache;
    while (cache && cache->used == (uint64_t)-1) {
      current_cache = cache;
      cache = cache->next;
      i++;
    }
    if (!cache && i < __M4RI_MZD_T_CACHE_MAX) {
      cache = (mzd_t_cache_t*)m4ri_mm_malloc_aligned(sizeof(mzd_t_cache_t), 64);
      memset((char*)cache, 0, sizeof(mzd_t_cache_t));

      cache->prev = current_cache;
      current_cache->next = cache;
      current_cache = cache;
    } else if (!cache && i >= __M4RI_MZD_T_CACHE_MAX) {
      /* We have reached the upper limit on the number of caches */
      ret = (mzd_t*)m4ri_mm_malloc(sizeof(mzd_t));
    } else {
      current_cache = cache;
    }
  }
  if (ret == NULL) {
    int free_entry = log2_floor(~current_cache->used);
    current_cache->used |= ((uint64_t)1 << free_entry);
    ret = &current_cache->mzd[free_entry];
  }
  return ret;
#endif //__M4RI_HAVE_OPENMP
}

static void mzd_t_free(mzd_t *M) {
#if __M4RI_HAVE_OPENMP
  m4ri_mm_free(M);
#else
  int foundit = 0;
  mzd_t_cache_t *cache = &mzd_cache;
  while(cache) {
    size_t entry = M - cache->mzd;
    if (entry < 64) {
      cache->used &= ~((uint64_t)1 << entry);
      if (cache->used == 0) {
        if (cache == &mzd_cache) {
          current_cache = cache;
        } else {
          if (cache == current_cache) {
            current_cache = cache->prev;
          }
          cache->prev->next = cache->next;
          if (cache->next)
            cache->next->prev = cache->prev;
          m4ri_mm_free(cache);
        }
      }
      foundit = 1;
      break;
    }
    cache = cache->next;
  }
  if(!foundit) {
    m4ri_mm_free(M);
  }
#endif
}

mzd_t *mzd_init(rci_t r, rci_t c) {
  assert(sizeof(mzd_t) == 64);

  mzd_t *A = mzd_t_malloc();

  A->nrows = r;
  A->ncols = c;
  A->width = (c + m4ri_radix - 1) / m4ri_radix;
  A->rowstride = (A->width < mzd_paddingwidth || (A->width & 1) == 0) ? A->width : A->width + 1;
  A->high_bitmask = __M4RI_LEFT_BITMASK(c % m4ri_radix);
  A->flags = (A->high_bitmask != m4ri_ffff) ? mzd_flag_nonzero_excess : 0;
  A->offset_vector = 0;
  A->row_offset = 0;

  A->rows = (word**)m4ri_mmc_calloc(r + 1, sizeof(word*)); // We're overcommitting here.

  if (r && c) {
    int blockrows = __M4RI_MAX_MZD_BLOCKSIZE / A->rowstride;
    A->blockrows_log = 0;
    while(blockrows >>= 1)
      A->blockrows_log++;
    blockrows = 1 << A->blockrows_log;

    int const blockrows_mask = blockrows - 1;
    int const nblocks = (r + blockrows - 1) / blockrows;
    A->flags |= (nblocks > 1) ? mzd_flag_multiple_blocks : 0;
    A->blocks = (mzd_block_t*)m4ri_mmc_calloc(nblocks + 1, sizeof(mzd_block_t));

    size_t block_words = (r - (nblocks - 1) * blockrows) * A->rowstride;
    for(int i = nblocks - 1; i >= 0; --i) {
      A->blocks[i].size = block_words * sizeof(word);
      A->blocks[i].begin = (word*)m4ri_mmc_calloc(1, A->blocks[i].size);
      A->blocks[i].end = A->blocks[i].begin + block_words;
      block_words = blockrows * A->rowstride;
    }

    for(rci_t i = 0; i < A->nrows; ++i) {
      A->rows[i] = A->blocks[i >> A->blockrows_log].begin + (i & blockrows_mask) * A->rowstride;
    }

  } else {
    A->blocks = NULL;
  }

  return A;
}
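
/*
 * Usage sketch (illustrative only, assuming the public m4ri.h interface):
 *
 *   mzd_t *A = mzd_init(100, 100);  // 100 x 100 zero matrix over GF(2)
 *   mzd_randomize(A);
 *   mzd_write_bit(A, 0, 0, 1);      // set entry (0,0)
 *   mzd_free(A);                    // returns the container to the cache
 */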

/*
   Explanation of offset_vector (in words), and row_offset.

   <------------------------------- row_stride (in words)--------------------->
   .---------------------------------------------------------------------------.  <-- m->blocks[0].begin   ^
   |                                 ^                                        /|                           |
   |                    m->row_offset|                      m->offset_vector_/ |                           |
   |                                 v                                      /  |                           |
   |  .--------------------------------------------------------------------v<--|---- m->rows[0]            |_ skipped_blocks (in blocks)
   |  |m (also a window)             ^                                     |   |                           |
   |  |                              |                                     |   |                           |
   `---------------------------------|-----------------------------------------'                           v
   .---------------------------------|----------------------------------------_.  <-- m->blocks[1].begin <-- windows.blocks[0].begin
   |  |                      ^   lowr|                                     |_^ |
   |  |    window->row_offset|       |            window->offset_vector _-^|   |
   |  |                      v       v                               _-^   |   |
   |  |  .----------------------------------------------------------v<--.  |<--|---- m->rows[lowr]
   |  |  |window                                                    |    `-|---|---- window->rows[0]
   |  |  |                                                          |      |   |
   `---------------------------------------------------------------------------'
   .---------------------------------------------------------------------------.  <-- m->blocks[2].begin <-- windows.blocks[1].begin
   |  |  |                                                          |      |   |
   |  |  |                                                          | lowc |   |
   |  |  |                                                          |<---->|   |
   |  |  |                                                          |   \__|___|__ also wrd_offset (in words)
   |  |  `----------------------------------------------------------'      |   |
   |  `--------------------------------------------------------------------'   |
   `---------------------------------------------------------------------------'
   .---------------------------------------------------------------------------.
   |                                                                           |

*/

mzd_t *mzd_init_window(mzd_t *M, rci_t lowr, rci_t lowc, rci_t highr, rci_t highc) {
  assert(lowc % m4ri_radix == 0);

  mzd_t *W = mzd_t_malloc();

  rci_t nrows = MIN(highr - lowr, M->nrows - lowr);
  rci_t ncols = highc - lowc;
  
  W->nrows = nrows;
  W->ncols = ncols;
  W->rowstride = M->rowstride;
  W->width = (ncols + m4ri_radix - 1) / m4ri_radix;
  W->high_bitmask = __M4RI_LEFT_BITMASK(ncols % m4ri_radix);

  W->flags = mzd_flag_windowed_zerooffset;
  W->flags |= (ncols % m4ri_radix == 0) ? mzd_flag_windowed_zeroexcess : mzd_flag_nonzero_excess;
  W->blockrows_log = M->blockrows_log;

  wi_t const blockrows_mask = (1 << W->blockrows_log) - 1;
  int const skipped_blocks = (M->row_offset + lowr) >> W->blockrows_log;
  assert(skipped_blocks == 0 || ((M->flags & mzd_flag_multiple_blocks)));
  W->row_offset = (M->row_offset + lowr) & blockrows_mask;
  W->blocks = &M->blocks[skipped_blocks];
  wi_t const wrd_offset = lowc / m4ri_radix;
  W->offset_vector = (M->offset_vector + wrd_offset) + (W->row_offset - M->row_offset) * W->rowstride;
  if(nrows)
    W->rows = (word**)m4ri_mmc_calloc(nrows + 1, sizeof(word*));
  else
    W->rows = NULL;
  for(rci_t i = 0; i < nrows; ++i) {
    W->rows[i] = M->rows[lowr + i] + wrd_offset;
  }
  if (mzd_row_to_block(W, nrows - 1) > 0)
    W->flags |= M->flags & mzd_flag_multiple_blocks;

  /* offset_vector is the distance from the start of the first block to the first word of the first row. */
  assert(nrows == 0 || W->blocks[0].begin + W->offset_vector == W->rows[0]);

  __M4RI_DD_MZD(W);
  return W;
}
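
/*
 * Usage sketch (illustrative only): a window is a view sharing memory with
 * its parent; lowc must be a multiple of m4ri_radix (64):
 *
 *   mzd_t *A = mzd_init(128, 256);
 *   mzd_t *W = mzd_init_window(A, 0, 64, 64, 128);  // 64 x 64 view
 *   mzd_set_ui(W, 1);  // writes through to A
 *   mzd_free(W);       // frees only the view, not A's data
 *   mzd_free(A);
 */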

void mzd_free(mzd_t *A) {
  if(A->rows)
    m4ri_mmc_free(A->rows, (A->nrows + 1) * sizeof(word*));
  if(mzd_owns_blocks(A)) {
    int i;
    for(i = 0; A->blocks[i].size; ++i) {
      m4ri_mmc_free(A->blocks[i].begin, A->blocks[i].size);
    }
    m4ri_mmc_free(A->blocks, (i + 1) * sizeof(mzd_block_t));
  }
  mzd_t_free(A);
}

void mzd_row_add(mzd_t *M, rci_t sourcerow, rci_t destrow) {
  mzd_row_add_offset(M, destrow, sourcerow, 0);
}

void mzd_row_clear_offset(mzd_t *M, rci_t row, rci_t coloffset) {
  wi_t const startblock = coloffset / m4ri_radix;
  word temp;

  /* make sure to start clearing at coloffset */
  if (coloffset%m4ri_radix) {
    temp = M->rows[row][startblock];
    temp &= __M4RI_RIGHT_BITMASK(m4ri_radix - coloffset);
  } else {
    temp = 0;
  }
  M->rows[row][startblock] = temp;
  for (wi_t i = startblock + 1; i < M->width; ++i) {
    M->rows[row][i] = 0;
  }

  __M4RI_DD_ROW(M, row);
}


rci_t mzd_gauss_delayed(mzd_t *M, rci_t startcol, int full) {
  rci_t startrow = startcol;
  rci_t pivots = 0;
  for (rci_t i = startcol; i < M->ncols ; ++i) {
    for(rci_t j = startrow ; j < M->nrows; ++j) {
      if (mzd_read_bit(M, j, i)) {
	mzd_row_swap(M, startrow, j);
	++pivots;

	for(rci_t ii = full ? 0 : startrow + 1;  ii < M->nrows; ++ii) {
	  if (ii != startrow) {
	    if (mzd_read_bit(M, ii, i)) {
	      mzd_row_add_offset(M, ii, startrow, i);
	    }
	  }
	}
	startrow = startrow + 1;
	break;
      }
    }
  }

  __M4RI_DD_MZD(M);
  __M4RI_DD_RCI(pivots);
  return pivots;
}

rci_t mzd_echelonize_naive(mzd_t *M, int full) { 
  return mzd_gauss_delayed(M, 0, full); 
}
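
/*
 * Usage sketch (illustrative only): the return value is the number of
 * pivots, i.e. the rank of M; passing full != 0 yields the reduced row
 * echelon form:
 *
 *   mzd_t *M = mzd_init(10, 20);
 *   mzd_randomize(M);
 *   rci_t rank = mzd_echelonize_naive(M, 1);  // M is now in RREF
 *   mzd_free(M);
 */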

/**
 * Transpose a 64 x 64 matrix with width 1.
 *
 * \param dst First word of destination matrix.
 * \param src First word of source matrix.
 * \param rowstride_dst Rowstride of matrix dst.
 * \param rowstride_src Rowstride of matrix src.
 *
 * Rows of both matrices are expected to fit exactly in a word (offset == 0)
 * and lie entirely inside a single block.
 *
 * \note This function also works when dst == src.
 */

static inline void _mzd_copy_transpose_64x64(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src)
{
  /*
   * m runs over the values:
   *   0x00000000FFFFFFFF
   *   0x0000FFFF0000FFFF
   *   0x00FF00FF00FF00FF
   *   0x0F0F0F0F0F0F0F0F
   *   0x3333333333333333
   *   0x5555555555555555,
   * alternating j zeroes with j ones.
   *
   * Assume we have a matrix consisting of four jxj matrices ((0,0) is in the top-right corner,
   * this is the memory-model view, see the layout on http://m4ri.sagemath.org/doxygen/structmzd__t.html):
   * ...[A1][B1][A0][B0]
   * ...[C1][D1][C0][D0]
   *          . [A2][B2]
   *        .   [C2][D2]
   *      .         .
   *                .
   * The following calculates the XOR between A and D,
   * and subsequently applies that to A and D respectively,
   * swapping A and D as a result.
   * Therefore wk starts at the first row and then has rowstride
   * added j times, running over the rows of A, then skips C
   * by adding j * rowstride to continue with the next A below C.
   */
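  /*
   * Worked micro-example (j = 1, illustration only): for two rows w0, w1 and
   * m = 0x5555555555555555,
   *   xor = ((w0 >> 1) ^ w1) & m;  // bit 2k of xor is w0[2k+1] ^ w1[2k]
   *   w0 ^= xor << 1;              // w0[2k+1] becomes the old w1[2k]
   *   w1 ^= xor;                   // w1[2k]   becomes the old w0[2k+1]
   * i.e. every 2x2 block of bits is transposed in place. The loops below run
   * the same butterfly for j = 32, 16, ..., 1.
   */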

  word m = __M4RI_CONVERT_TO_WORD(0xFFFFFFFF);
  wi_t j_rowstride_dst = rowstride_dst * 64;
  wi_t j_rowstride_src = rowstride_src * 32;
  word* const end = dst + j_rowstride_dst;
  // We start with j = 32, and a one-time unrolled loop, where
  // we copy from src and write the result to dst, swapping
  // the two 32x32 corner matrices.
  int j = 32;
  j_rowstride_dst >>= 1;
  word* RESTRICT wk = dst;
  for (word const* RESTRICT wks = src; wk < end; wk += j_rowstride_dst, wks += j_rowstride_src) {
    for (int k = 0; k < j; ++k, wk += rowstride_dst, wks += rowstride_src) {
      word xor = ((*wks >> j) ^ *(wks + j_rowstride_src)) & m;
      *wk = *wks ^ (xor << j);
      *(wk + j_rowstride_dst) = *(wks + j_rowstride_src) ^ xor;
    }
  }
  // Next we work in-place in dst and swap the corners of
  // each of the last matrices, all in parallel, for all
  // remaining values of j.
  m ^= m << 16;
  for (j = 16; j != 0; j = j >> 1, m ^= m << j) {
    j_rowstride_dst >>= 1;
    for (wk = dst; wk < end; wk += j_rowstride_dst) {
      for (int k = 0; k < j; ++k, wk += rowstride_dst) {
	word xor = ((*wk >> j) ^ *(wk + j_rowstride_dst)) & m;
        *wk ^= xor << j;
	*(wk + j_rowstride_dst) ^= xor;
      }
    }
  }
}

/**
 * Transpose two 64 x 64 matrices with width 1.
 *
 * \param dst1 First word of destination matrix 1.
 * \param dst2 First word of destination matrix 2.
 * \param src1 First word of source matrix 1.
 * \param src2 First word of source matrix 2.
 * \param rowstride_dst Rowstride of destination matrices.
 * \param rowstride_src Rowstride of source matrices.
 *
 * Rows of all matrices are expected to fit exactly in a word (offset == 0)
 * and lie entirely inside a single block.
 *
 * \note This function also works to transpose in-place.
 */

static inline void _mzd_copy_transpose_64x64_2(word* RESTRICT dst1, word* RESTRICT dst2, word const* RESTRICT src1, word const* RESTRICT src2, wi_t rowstride_dst, wi_t rowstride_src)
{
  word m = __M4RI_CONVERT_TO_WORD(0xFFFFFFFF);
  wi_t j_rowstride_dst = rowstride_dst * 64;
  wi_t j_rowstride_src = rowstride_src * 32;
  word* const end = dst1 + j_rowstride_dst;
  int j = 32;
  word* RESTRICT wk[2];
  word const* RESTRICT wks[2];
  word xor[2];

  j_rowstride_dst >>= 1;
  wk[0] = dst1;
  wk[1] = dst2;
  wks[0] = src1;
  wks[1] = src2;

  do {

    for (int k = 0; k < j; ++k) {
      xor[0] = ((*wks[0] >> j) ^ *(wks[0] + j_rowstride_src)) & m;
      xor[1] = ((*wks[1] >> j) ^ *(wks[1] + j_rowstride_src)) & m;
      *wk[0] = *wks[0] ^ (xor[0] << j);
      *wk[1] = *wks[1] ^ (xor[1] << j);
      *(wk[0] + j_rowstride_dst) = *(wks[0] + j_rowstride_src) ^ xor[0];
      *(wk[1] + j_rowstride_dst) = *(wks[1] + j_rowstride_src) ^ xor[1];
      wk[0] += rowstride_dst;
      wk[1] += rowstride_dst;
      wks[0] += rowstride_src;
      wks[1] += rowstride_src;
    }

    wk[0] += j_rowstride_dst;
    wk[1] += j_rowstride_dst;
    wks[0] += j_rowstride_src;
    wks[1] += j_rowstride_src;

  } while(wk[0] < end);

  m ^= m << 16;
  for (j = 16; j != 0; j = j >> 1, m ^= m << j) {

    j_rowstride_dst >>= 1;
    wk[0] = dst1;
    wk[1] = dst2;

    do {

      for (int k = 0; k < j; ++k) {
	xor[0] = ((*wk[0] >> j) ^ *(wk[0] + j_rowstride_dst)) & m;
	xor[1] = ((*wk[1] >> j) ^ *(wk[1] + j_rowstride_dst)) & m;
	*wk[0] ^= xor[0] << j;
	*wk[1] ^= xor[1] << j;
	*(wk[0] + j_rowstride_dst) ^= xor[0];
	*(wk[1] + j_rowstride_dst) ^= xor[1];
	wk[0] += rowstride_dst;
	wk[1] += rowstride_dst;
      }

      wk[0] += j_rowstride_dst;
      wk[1] += j_rowstride_dst;

    } while(wk[0] < end);
  }
}

static unsigned char log2_ceil_table[64] = {
  0, 1, 2, 2, 3, 3, 3, 3,
  4, 4, 4, 4, 4, 4, 4, 4,
  5, 5, 5, 5, 5, 5, 5, 5,
  5, 5, 5, 5, 5, 5, 5, 5,
  6, 6, 6, 6, 6, 6, 6, 6,
  6, 6, 6, 6, 6, 6, 6, 6,
  6, 6, 6, 6, 6, 6, 6, 6,
  6, 6, 6, 6, 6, 6, 6, 6
};

static inline int log2_ceil(int n)
{
  return log2_ceil_table[n - 1];
}
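
/*
 * Worked examples (valid for 1 <= n <= 64, illustration only):
 * log2_ceil(1) == 0, log2_ceil(5) == 3 and log2_ceil(64) == 6.
 */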

static word const transpose_mask[6] = {
  0x5555555555555555ULL,
  0x3333333333333333ULL,
  0x0F0F0F0F0F0F0F0FULL,
  0x00FF00FF00FF00FFULL,
  0x0000FFFF0000FFFFULL,
  0x00000000FFFFFFFFULL,
};

/**
 * Transpose 64/j matrices of size jxj in parallel.
 *
 * Here j equals n rounded up to the nearest power of 2.
 * The input array t must be of size j (containing the rows i of all matrices in t[i]).
 *
 * t[0..{j-1}]  = [Al]...[A1][A0]
 *
 * \param t An array of j words.
 * \param n The number of rows in each matrix.
 *
 * \return log2(j)
 */

static inline int _mzd_transpose_Nxjx64(word* RESTRICT t, int n)
{
  int j = 1;
  int mi = 0;		// Index into the transpose_mask array.

  while (j < n)		// Don't swap with entirely undefined data (where [D] consists entirely of non-existent rows).
  {
    // Swap 64/j matrices of size jxj in 2j rows. Thus,
    // <---- one word --->
    // [Al][Bl]...[A0][B0]
    // [Cl][Dl]...[C0][D0], where l = 64/j - 1 and each matrix [A], [B] etc is jxj.
    // Then swap [A] and [D] in-place.

    // m runs over the values in transpose_mask, so that at all
    // times m consists of j zeroes followed by j ones, repeated.
    word const m = transpose_mask[mi];
    int k = 0;		// Index into t[].
    do {
      // Run over all rows of [A] and [D].
      for (int i = 0; i < j; ++i, ++k) {
	// t[k] contains row i of all [A], and t[k + j] contains row i of all [D]. Swap them.
	word xor = ((t[k] >> j) ^ t[k + j]) & m;
	t[k] ^= xor << j;
	t[k + j] ^= xor;
      }
      k += j;		// Skip [C].
    } while (k < n);	// Stop if we passed all valid input.

    // Double the size of j and repeat this for the next 2j rows until all
    // n rows have been swapped (possibly with non-existent rows).
    j <<= 1;
    ++mi;
  }

  return mi;
}

/**
 * Transpose an n x 64 matrix with width 1.
 *
 * \param dst First word of destination matrix.
 * \param src First word of source matrix.
 * \param rowstride_dst Rowstride of destination matrix.
 * \param rowstride_src Rowstride of source matrix.
 * \param n Number of rows in source matrix, must be less than 64.
 *
 * Rows of all matrices are expected to have offset zero
 * and lie entirely inside a single block.
 *
 * \note This function also works to transpose in-place.
 */

static inline void _mzd_copy_transpose_lt64x64(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n)
{
  // Preload the n input rows into the L1 cache, using a minimum of cache lines (compact storage).
  word t[64];
  word const* RESTRICT wks = src;
  int k;
  for (k = 0; k < n; ++k) {
    t[k] = *wks;
    wks += rowstride_src;
  }
  if (n > 32) {
    while (k < 64)
      t[k++] = 0;
    _mzd_copy_transpose_64x64(dst, t, rowstride_dst, 1);
    return;
  }
  int log2j = _mzd_transpose_Nxjx64(t, n);
  // All output bits are now transposed, but still might need to be shifted in place.
  // What we have now is 64/j matrices of size jxj. Thus,
  // [Al]...[A1][A0], where l = 64/j - 1.
  // while the actual output is:
  // [A0]
  // [A1]
  // ...
  // [Al]
  word const m = __M4RI_LEFT_BITMASK(n);
  word* RESTRICT wk = dst;
  switch (log2j) {
    case 5:
    {
      wi_t const j_rowstride_dst = 32 * rowstride_dst;
      for (int k = 0; k < 32; ++k) {
	wk[0] = t[k] & m;
	wk[j_rowstride_dst] = (t[k] >> 32) & m;
	wk += rowstride_dst;
      }
      break;
    }
    case 4:
    {
      wi_t const j_rowstride_dst = 16 * rowstride_dst;
      for (int k = 0; k < 16; ++k) {
	wk[0] = t[k] & m;
	wk[j_rowstride_dst] = (t[k] >> 16) & m;
	wk[2 * j_rowstride_dst] = (t[k] >> 32) & m;
	wk[3 * j_rowstride_dst] = (t[k] >> 48) & m;
	wk += rowstride_dst;
      }
      break;
    }
    case 3:
    {
      wi_t const j_rowstride_dst = 8 * rowstride_dst;
      for (int k = 0; k < 8; ++k) {
	wk[0] = t[k] & m;
	wk[j_rowstride_dst] = (t[k] >> 8) & m;
	wk[2 * j_rowstride_dst] = (t[k] >> 16) & m;
	wk[3 * j_rowstride_dst] = (t[k] >> 24) & m;
	wk[4 * j_rowstride_dst] = (t[k] >> 32) & m;
	wk[5 * j_rowstride_dst] = (t[k] >> 40) & m;
	wk[6 * j_rowstride_dst] = (t[k] >> 48) & m;
	wk[7 * j_rowstride_dst] = (t[k] >> 56) & m;
	wk += rowstride_dst;
      }
      break;
    }
    case 2:
    {
      wi_t const j_rowstride_dst = 4 * rowstride_dst;
      for (int k = 0; k < 4; ++k) {
	word* RESTRICT wk2 = wk;
	word tk = t[k];
	for (int i = 0; i < 2; ++i) {
	  wk2[0] = tk & m;
	  wk2[j_rowstride_dst] = (tk >> 4) & m;
	  wk2[2 * j_rowstride_dst] = (tk >> 8) & m;
	  wk2[3 * j_rowstride_dst] = (tk >> 12) & m;
	  wk2[4 * j_rowstride_dst] = (tk >> 16) & m;
	  wk2[5 * j_rowstride_dst] = (tk >> 20) & m;
	  wk2[6 * j_rowstride_dst] = (tk >> 24) & m;
	  wk2[7 * j_rowstride_dst] = (tk >> 28) & m;
	  wk2 += 8 * j_rowstride_dst;
	  tk >>= 32;
	}
	wk += rowstride_dst;
      }
      break;
    }
    case 1:
    {
      wi_t const j_rowstride_dst = 2 * rowstride_dst;
      for (int k = 0; k < 2; ++k) {
	word* RESTRICT wk2 = wk;
	word tk = t[k];
	for (int i = 0; i < 8; ++i) {
	  wk2[0] = tk & m;
	  wk2[j_rowstride_dst] = (tk >> 2) & m;
	  wk2[2 * j_rowstride_dst] = (tk >> 4) & m;
	  wk2[3 * j_rowstride_dst] = (tk >> 6) & m;
	  wk2 += 4 * j_rowstride_dst;
	  tk >>= 8;
	}
	wk += rowstride_dst;
      }
      break;
    }
    case 0:
    {
      word* RESTRICT wk2 = wk;
      word tk = t[0];
      for (int i = 0; i < 16; ++i) {
	wk2[0] = tk & m;
	wk2[rowstride_dst] = (tk >> 1) & m;
	wk2[2 * rowstride_dst] = (tk >> 2) & m;
	wk2[3 * rowstride_dst] = (tk >> 3) & m;
	wk2 += 4 * rowstride_dst;
	tk >>= 4;
      }
      break;
    }
  }
}

/**
 * Transpose a 64 x n matrix with width 1.
 *
 * \param dst First word of destination matrix.
 * \param src First word of source matrix.
 * \param rowstride_dst Rowstride of destination matrix.
 * \param rowstride_src Rowstride of source matrix.
 * \param n Number of columns in source matrix, must be less than 64.
 *
 * Rows of all matrices are expected to have offset zero
 * and lie entirely inside a single block.
 *
 * \note This function also works to transpose in-place.
 */

static inline void _mzd_copy_transpose_64xlt64(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n)
{
  word t[64];
  int log2j = log2_ceil(n);
  word const* RESTRICT wks = src;
  switch (log2j) {
    case 6:
    {
      _mzd_copy_transpose_64x64(t, src, 1, rowstride_src);
      word* RESTRICT wk = dst;
      for (int k = 0; k < n; ++k) {
	*wk = t[k];
	wk += rowstride_dst;
      }
      return;
    }
    case 5:
    {
      wi_t const j_rowstride_src = 32 * rowstride_src;
      for (int k = 0; k < 32; ++k) {
	t[k] = wks[0] | (wks[j_rowstride_src] << 32);
	wks += rowstride_src;
      }
      break;
    }
    case 4:
    {
      wi_t const j_rowstride_src = 16 * rowstride_src;
      for (int k = 0; k < 16; ++k) {
	t[k] = wks[0] | (wks[j_rowstride_src] << 16);
	t[k] |= (wks[2 * j_rowstride_src] << 32) | (wks[3 * j_rowstride_src] << 48);
	wks += rowstride_src;
      }
      break;
    }
    case 3:
    {
      wi_t const j_rowstride_src = 8 * rowstride_src;
      word tt;
      for (int k = 0; k < 8; ++k) {
	tt = wks[0] | (wks[j_rowstride_src] << 8);
	t[k] = (wks[2 * j_rowstride_src] << 16) | (wks[3 * j_rowstride_src] << 24);
	tt |= (wks[4 * j_rowstride_src] << 32) | (wks[5 * j_rowstride_src] << 40);
	t[k] |= (wks[6 * j_rowstride_src] << 48) | (wks[7 * j_rowstride_src] << 56);
	wks += rowstride_src;
	t[k] |= tt;
      }
      break;
    }
    case 2:
    {
      word const* RESTRICT wks2 = wks + 60 * rowstride_src;
      t[0] = wks2[0];
      t[1] = wks2[rowstride_src];
      t[2] = wks2[2 * rowstride_src];
      t[3] = wks2[3 * rowstride_src];
      for (int i = 0; i < 15; ++i) {
	wks2 -= 4 * rowstride_src;
	t[0] <<= 4;
	t[1] <<= 4;
	t[2] <<= 4;
	t[3] <<= 4;
	t[0] |= wks2[0];
	t[1] |= wks2[rowstride_src];
	t[2] |= wks2[2 * rowstride_src];
	t[3] |= wks2[3 * rowstride_src];
      }
      break;
    }
    case 1:
    {
      wks += 62 * rowstride_src;
      t[0] = wks[0];
      t[1] = wks[rowstride_src];
      for (int i = 0; i < 31; ++i) {
	wks -= 2 * rowstride_src;
	t[0] <<= 2;
	t[1] <<= 2;
	t[0] |= wks[0];
	t[1] |= wks[rowstride_src];
      }
      break;
    }
    case 0:
    {
      word tt[2];
      tt[0] = wks[0];
      tt[1] = wks[rowstride_src];
      for (int i = 2; i < 64; i += 2) {
	wks += 2 * rowstride_src;
	tt[0] |= wks[0] << i;
	tt[1] |= wks[rowstride_src] << i;
      }
      *dst = tt[0] | (tt[1] << 1);
      return;
    }
  }
  int j  = 1 << log2j;
  _mzd_transpose_Nxjx64(t, j);
  word* RESTRICT wk = dst;
  for (int k = 0; k < n; ++k) {
    *wk = t[k];
    wk += rowstride_dst;
  }
}

/**
 * Transpose an n x m matrix with width 1, offset 0, and m and n less than or equal to 8.
 *
 * \param dst First word of destination matrix.
 * \param src First word of source matrix.
 * \param rowstride_dst Rowstride of destination matrix.
 * \param rowstride_src Rowstride of source matrix.
 * \param n Number of rows in source matrix, must be less than or equal to 8.
 * \param m Number of columns in source matrix, must be less than or equal to 8.
 *
 * Rows of all matrices are expected to have offset zero
 * and lie entirely inside a single block.
 *
 * \note This function also works to transpose in-place.
 */

static inline void _mzd_copy_transpose_le8xle8(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n, int m, int maxsize)
{
  int end = maxsize * 7;
  word const* RESTRICT wks = src;
  word w = *wks;
  int shift = 0;
  for (int i = 1; i < n; ++i) {
    wks += rowstride_src;
    shift += 8;
    w |= (*wks << shift);
  }
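  /*
   * What follows is the classic bit-packed 8x8 transpose (illustrative
   * reading, cf. Hacker's Delight): w holds the matrix row-major, 8 bits per
   * row, and iteration k swaps the pair of diagonals at distance k from the
   * main diagonal. The mask 0x80402010080402 selects the elements (r, r+1)
   * just above the diagonal; w >> 7 aligns each mirror element (r+1, r) with
   * them, so xor holds the difference of each mirror pair.
   */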
  word mask = 0x80402010080402ULL;
  word w7 = w >> 7;
  shift = 7;
  --m;
  do {
    word xor = (w ^ w7) & mask;
    mask >>= 8;
    w ^= (xor << shift);
    shift += 7;
    w7 >>= 7;
    w ^= xor;
  } while(shift < end);
  word* RESTRICT wk = dst + m * rowstride_dst;
  for (int shift = 8 * m; shift > 0; shift -= 8) {
    *wk = (unsigned char)(w >> shift);
    wk -= rowstride_dst;
  }
  *wk = (unsigned char)w;
}

/**
 * Transpose an n x m matrix with width 1, offset 0, and m and n less than or equal to 16.
 *
 * \param dst First word of destination matrix.
 * \param src First word of source matrix.
 * \param rowstride_dst Rowstride of destination matrix.
 * \param rowstride_src Rowstride of source matrix.
 * \param n Number of rows in source matrix, must be less than or equal to 16.
 * \param m Number of columns in source matrix, must be less than or equal to 16.
 *
 * Rows of all matrices are expected to have offset zero
 * and lie entirely inside a single block.
 *
 * \note This function also works to transpose in-place.
 */

static inline void _mzd_copy_transpose_le16xle16(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n, int m, int maxsize)
{
  int end = maxsize * 3;
  word const* RESTRICT wks = src;
  word t[4];
  int i = n;
  do {
    t[0] = wks[0];
    if (--i == 0) {
      t[1] = 0;
      t[2] = 0;
      t[3] = 0;
      break;
    }
    t[1] = wks[rowstride_src];
    if (--i == 0) {
      t[2] = 0;
      t[3] = 0;
      break;
    }
    t[2] = wks[2 * rowstride_src];
    if (--i == 0) {
      t[3] = 0;
      break;
    }
    t[3] = wks[3 * rowstride_src];
    if (--i == 0)
      break;
    wks += 4 * rowstride_src;
    for(int shift = 16;; shift += 16) {
      t[0] |= (*wks << shift);
      if (--i == 0)
	break;
      t[1] |= (wks[rowstride_src] << shift);
      if (--i == 0)
	break;
      t[2] |= (wks[2 * rowstride_src] << shift);
      if (--i == 0)
	break;
      t[3] |= (wks[3 * rowstride_src] << shift);
      if (--i == 0)
	break;
      wks += 4 * rowstride_src;
    }
  } while(0);
  word mask = 0xF0000F0000F0ULL;
  int shift = 12;
  word xor[4];
  do {
    xor[0] = (t[0] ^ (t[0] >> shift)) & mask;
    xor[1] = (t[1] ^ (t[1] >> shift)) & mask;
    xor[2] = (t[2] ^ (t[2] >> shift)) & mask;
    xor[3] = (t[3] ^ (t[3] >> shift)) & mask;
    mask >>= 16;
    t[0] ^= (xor[0] << shift);
    t[1] ^= (xor[1] << shift);
    t[2] ^= (xor[2] << shift);
    t[3] ^= (xor[3] << shift);
    shift += 12;
    t[0] ^= xor[0];
    t[1] ^= xor[1];
    t[2] ^= xor[2];
    t[3] ^= xor[3];
  } while(shift < end);
  _mzd_transpose_Nxjx64(t, 4);
  i = m;
  word* RESTRICT wk = dst;
  do {
    wk[0] = (uint16_t)t[0];
    if (--i == 0)
      break;
    wk[rowstride_dst] = (uint16_t)t[1];
    if (--i == 0)
      break;
    wk[2 * rowstride_dst] = (uint16_t)t[2];
    if (--i == 0)
      break;
    wk[3 * rowstride_dst] = (uint16_t)t[3];
    if (--i == 0)
      break;
    wk += 4 * rowstride_dst;
    for(int shift = 16;; shift += 16) {
      wk[0] = (uint16_t)(t[0] >> shift);
      if (--i == 0)
	break;
      wk[rowstride_dst] = (uint16_t)(t[1] >> shift);
      if (--i == 0)
	break;
      wk[2 * rowstride_dst] = (uint16_t)(t[2] >> shift);
      if (--i == 0)
	break;
      wk[3 * rowstride_dst] = (uint16_t)(t[3] >> shift);
      if (--i == 0)
	break;
      wk += 4 * rowstride_dst;
    }
  } while(0);
}

/**
 * Transpose an n x m matrix with width 1, offset 0, and m and n less than or equal to 32.
 *
 * \param dst First word of destination matrix.
 * \param src First word of source matrix.
 * \param rowstride_dst Rowstride of destination matrix.
 * \param rowstride_src Rowstride of source matrix.
 * \param n Number of rows in source matrix, must be less than or equal to 32.
 * \param m Number of columns in source matrix, must be less than or equal to 32.
 *
 * Rows of all matrices are expected to have offset zero
 * and lie entirely inside a single block.
 *
 * \note This function also works to transpose in-place.
 */

static inline void _mzd_copy_transpose_le32xle32(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n, int m)
{
  word const* RESTRICT wks = src;
  word t[16];
  int i = n;
  if (n > 16) {
    i -= 16;
    for (int j = 0; j < 16; ++j) {
      t[j] = *wks;
      wks += rowstride_src;
    }
    int j = 0;
    do {
      t[j++] |= (*wks << 32);
      wks += rowstride_src;
    } while(--i);
  } else {
    int j;
    for (j = 0; j < n; ++j) {
      t[j] = *wks;
      wks += rowstride_src;
    }
    for (; j < 16; ++j)
      t[j] = 0;
  }
  _mzd_transpose_Nxjx64(t, 16);
  int one_more = (m & 1);
  word* RESTRICT wk = dst;
  if (m > 16) {
    m -= 16;
    for (int j = 0; j < 16; j += 2) {
      *wk = (t[j] & 0xFFFF) | ((t[j] >> 16) & 0xFFFF0000);
      wk[rowstride_dst] = (t[j + 1] & 0xFFFF) | ((t[j + 1] >> 16) & 0xFFFF0000);
      wk += 2 * rowstride_dst;
    }
    for (int j = 1; j < m; j += 2) {
      *wk = ((t[j - 1] >> 16) & 0xFFFF) | ((t[j - 1] >> 32) & 0xFFFF0000);
      wk[rowstride_dst] = ((t[j] >> 16) & 0xFFFF) | ((t[j] >> 32) & 0xFFFF0000);
      wk += 2 * rowstride_dst;
    }
    if (one_more) {
      *wk = ((t[m - 1] >> 16) & 0xFFFF) | ((t[m - 1] >> 32) & 0xFFFF0000);
    }
  } else {
    for (int j = 1; j < m; j += 2) {
      *wk = (t[j - 1] & 0xFFFF) | ((t[j - 1] >> 16) & 0xFFFF0000);
      wk[rowstride_dst] = (t[j] & 0xFFFF) | ((t[j] >> 16) & 0xFFFF0000);
      wk += 2 * rowstride_dst;
    }
    if (one_more) {
      *wk = (t[m - 1] & 0xFFFF) | ((t[m - 1] >> 16) & 0xFFFF0000);
    }
  }
}

static inline void _mzd_copy_transpose_le64xle64(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n, int m)
{
  word const* RESTRICT wks = src;
  word t[64];
  int k;
  for (k = 0; k < n; ++k) {
    t[k] = *wks;
    wks += rowstride_src;
  }
  while(k < 64)
    t[k++] = 0;
  _mzd_copy_transpose_64x64(t, t, 1, 1);
  word* RESTRICT wk = dst;
  for (int k = 0; k < m; ++k) {
    *wk = t[k];
    wk += rowstride_dst;
  }
  return;
}

void _mzd_transpose_multiblock(mzd_t *DST, mzd_t const *A, word* RESTRICT* fwdp, word const* RESTRICT* fwsp, rci_t* nrowsp, rci_t* ncolsp);

mzd_t *_mzd_transpose(mzd_t *DST, mzd_t const *A) {
  assert(!mzd_is_windowed(DST) && !mzd_is_windowed(A));
  // We assume that at least 64 rows fit in a block; if that is the case,
  // then each block will contain a multiple of 64 rows, since blockrows
  // is a power of 2.
  assert(A->blockrows_log >= 6 && DST->blockrows_log >= 6);

  rci_t nrows = A->nrows;
  rci_t ncols = A->ncols;
  rci_t maxsize = MAX(nrows, ncols);

  word* RESTRICT fwd = mzd_first_row(DST);
  word const* RESTRICT fws = mzd_first_row(A);

  if (maxsize >= 64) {

    // This is the most non-intrusive way to deal with the case of multiple blocks.
    // Note that this code is VERY sensitive. ANY change to _mzd_transpose can easily
    // reduce the speed for small matrices (up to 64x64) by 5 to 10%.
    int const multiple_blocks = (A->flags | DST->flags) & mzd_flag_multiple_blocks;
    if (__M4RI_UNLIKELY(multiple_blocks)) {
      word* RESTRICT non_register_fwd;
      word const* RESTRICT non_register_fws;
      rci_t non_register_nrows;
      rci_t non_register_ncols;
      _mzd_transpose_multiblock(DST, A, &non_register_fwd, &non_register_fws, &non_register_nrows, &non_register_ncols);
      fwd = non_register_fwd;
      fws = non_register_fws;
      nrows = non_register_nrows;
      ncols = non_register_ncols;
    }

    if (nrows >= 64) {
      /*
       * This is an interesting #if ...
       * I recommend investigating the number of instructions, and the clocks per instruction,
       * as function of various sizes of the matrix (most likely especially the number of columns
       * (the size of a row) will have influence; also always use multiples of 64 or even 128),
       * for both cases below.
       *
       * To measure this run for example:
       *
       * ./bench_mzd -m 10 -x 10 -p PAPI_TOT_INS,PAPI_L1_TCM,PAPI_L2_TCM mzd_transpose 32000 32000
       * ./bench_mzd -m 10 -x 100 -p PAPI_TOT_INS,PAPI_L1_TCM,PAPI_L2_TCM mzd_transpose 128 10240
       * etc (increase -x for smaller sizes to get better accuracy).
       *
       * --Carlo Wood
       */
#if 1
      int js = ncols & nrows & 64;	// True if the total number of whole 64x64 matrices is odd.
      wi_t const rowstride_64_dst = 64 * DST->rowstride;
      word* RESTRICT fwd_current = fwd;
      word const* RESTRICT fws_current = fws;
      if (js) {
	js = 1;
	_mzd_copy_transpose_64x64(fwd, fws, DST->rowstride, A->rowstride);
	if ((nrows | ncols) == 64) {
	  __M4RI_DD_MZD(DST);
	  return DST;
	}
	fwd_current += rowstride_64_dst;
	++fws_current;
      }
      rci_t const whole_64cols = ncols / 64;
      // The use of delayed and even is to avoid writing the call to _mzd_copy_transpose_64x64_2 twice.
      // This way it can be inlined without duplicating the amount of code that has to be loaded.
      word* RESTRICT fwd_delayed = NULL;
      word const* RESTRICT fws_delayed = NULL;
      int even = 0;
      while (1)
      {
	for (int j = js; j < whole_64cols; ++j) {
	  if (!even) {
	    fwd_delayed = fwd_current;
	    fws_delayed = fws_current;
	  } else {
	    _mzd_copy_transpose_64x64_2(fwd_delayed, fwd_current, fws_delayed, fws_current, DST->rowstride, A->rowstride);
	  }
	  fwd_current += rowstride_64_dst;
	  ++fws_current;
	  even = !even;
	}
	nrows -= 64;
	if (ncols % 64) {
	  _mzd_copy_transpose_64xlt64(fwd + whole_64cols * rowstride_64_dst, fws + whole_64cols, DST->rowstride, A->rowstride, ncols % 64);
	}
	fwd += 1;
	fws += 64 * A->rowstride;
	if (nrows < 64)
	  break;
	js = 0;
	fws_current = fws;
	fwd_current = fwd;
      }
#else
      // The same as the above, but without using _mzd_copy_transpose_64x64_2.
      wi_t const rowstride_64_dst = 64 * DST->rowstride;
      rci_t const whole_64cols = ncols / 64;
      assert(nrows >= 64);
      do {
	for (int j = 0; j < whole_64cols; ++j) {
	  _mzd_copy_transpose_64x64(fwd + j * rowstride_64_dst, fws + j, DST->rowstride, A->rowstride);
	}
	nrows -= 64;
	if (ncols % 64) {
	  _mzd_copy_transpose_64xlt64(fwd + whole_64cols * rowstride_64_dst, fws + whole_64cols, DST->rowstride, A->rowstride, ncols % 64);
	}
	fwd += 1;
	fws += 64 * A->rowstride;
      } while(nrows >= 64);
#endif
    }

    if (nrows == 0) {
      __M4RI_DD_MZD(DST);
      return DST;
    }

    // Transpose the remaining top rows. Now 0 < nrows < 64.

    while (ncols >= 64)
    {
      _mzd_copy_transpose_lt64x64(fwd, fws, DST->rowstride, A->rowstride, nrows);
      ncols -= 64;
      fwd += 64 * DST->rowstride;
      fws += 1;
    }

    if (ncols == 0) {
      __M4RI_DD_MZD(DST);
      return DST;
    }

    maxsize = MAX(nrows, ncols);
  }

  // Transpose the remaining corner. Now both 0 < nrows < 64 and 0 < ncols < 64.

  if (maxsize <= 8) {
    _mzd_copy_transpose_le8xle8(fwd, fws, DST->rowstride, A->rowstride, nrows, ncols, maxsize);
  }
  else if (maxsize <= 16) {
    _mzd_copy_transpose_le16xle16(fwd, fws, DST->rowstride, A->rowstride, nrows, ncols, maxsize);
  }
  else if (maxsize <= 32) {
    _mzd_copy_transpose_le32xle32(fwd, fws, DST->rowstride, A->rowstride, nrows, ncols);
  }
  else {
    _mzd_copy_transpose_le64xle64(fwd, fws, DST->rowstride, A->rowstride, nrows, ncols);
  }

  __M4RI_DD_MZD(DST);
  return DST;
}

void _mzd_transpose_multiblock(mzd_t *DST, mzd_t const *A, word* RESTRICT* fwdp, word const* RESTRICT* fwsp, rci_t* nrowsp, rci_t* ncolsp) {

  rci_t nrows = A->nrows;
  rci_t ncols = A->ncols;

  rci_t blockrows_dst = 1 << DST->blockrows_log;	// The maximum number of rows in a block of DST.
  rci_t blockrows_src = 1 << A->blockrows_log;		// The maximum number of rows in a block of A.

  /* We're dividing the source matrix into blocks of multiples of 64x64, such that each
   * block fits entirely inside a single memory allocation block, both in the source
   * as well as the corresponding destination.
   *
   *                      <-------------------ncols----------------->
   *                                  <---------blockrows_dst------->
   * .---------------------------------------------------------------.
   * |P      ^  Matrix A:|   .       |Q      .       .       .       |<-^---- A->blocks[0].begin
   * |       |           |   .       |       .       .       .       |  |
   * |       |           |   .       |       .       .       .       |  |
   * |       |           |- - - - - -|- - - - - - - - - - - - - - - -|  |
   * |       |           |   .       |       .   ^   .       .       |  |
   * |       |           |   .       |       .<64x64>.       .       |  |
   * |       |           |   .       |       .   v   .       .       |  |
   * |       |           |- - - - - -|- - - - - - - - - - - - - - - -|  |- blockrows_src
   * |       |           |   .       |       .       .       .       |  |
   * |       |           |   .       |       .       .       .       |  |
   * |       |           |   .       |       .       .       .       |  |
   * |       |nrows      |- - - - - -|- - - - - - - - - - - - - - - -|  |
   * |       |           |   .       |       .       .       .       |  |
   * |       |           |   .       |       .       .       .       |  |
   * |       |           |   .       |       .       .       .       |  v
   * |===================+===========================================|
   * |R      |           |   .       |S      .       .       .       |<------ A->blocks[1].begin
   * |       |           |   .       |       .       .       .       |
   * |       |           |   .       |       .       .       .       |
   * |       |           |- - - - - -|- - - - - - - - - - - - - - - -|
   * |       |           |   .       |       .       .       .       |
   * |       |           |   .       |       .       .       .       |
   * |       |           |   .       |       .       .       .       |
   * |       |           |- - - - - -|- - - - - - - - - - - - - - - -|
   * |       v           |   .       |       .       .       .       |
   * |                   `-------------------------------------------|
   * |                               |                               |
   * |                               |                               |
   * |                               |                               |
   * |                               |                               |
   * |                               |                               |
   * `---------------------------------------------------------------'
   *
   * Imagine this also to be the memory map of DST, which then would be
   * mirrored in the diagonal line from the top/right to the bottom/left.
   * Then each of the squares P, Q, R and S lay entirely inside one
   * memory block in both the source as well as the destination matrix.
   * P and Q are really the same block for matrix A (as are R and S),
   * while P and R (and Q and S) are really the same block for DST.
   *
   * We're going to run over the top/right corners of each of these
   * memory "blocks" and then transpose it, one by one, running
   * from right to left and top to bottom. The last one (R) will be
   * done by the calling function, so we just return when we get there.
   */

  rci_t R_top = (nrows >> A->blockrows_log) << A->blockrows_log;
  rci_t R_right = (ncols >> DST->blockrows_log) << DST->blockrows_log;
  for (rci_t col = 0; col < ncols; col += blockrows_dst) {
    rci_t end = (col == R_right) ? R_top : nrows;
    for (rci_t row = 0; row < end; row += blockrows_src) {

      rci_t nrowsb = (row < R_top) ? blockrows_src : (nrows - R_top);
      rci_t ncolsb = (col < R_right) ? blockrows_dst : (ncols - R_right);
      word const* RESTRICT fws = mzd_row(A, row) + col / m4ri_radix;
      word* RESTRICT fwd = mzd_row(DST, col) + row / m4ri_radix;

      // The following code is (almost) duplicated from _mzd_transpose.
      if (nrowsb >= 64) {

	int js = ncolsb & nrowsb & 64;	// True if the total number of whole 64x64 matrices is odd.
	wi_t const rowstride_64_dst = 64 * DST->rowstride;
	word* RESTRICT fwd_current = fwd;
	word const* RESTRICT fws_current = fws;
	if (js) {
	  js = 1;
	  _mzd_copy_transpose_64x64(fwd, fws, DST->rowstride, A->rowstride);
	  fwd_current += rowstride_64_dst;
	  ++fws_current;
	}
	rci_t const whole_64cols = ncolsb / 64;
	// The use of delayed and even is to avoid writing the call to _mzd_copy_transpose_64x64_2 twice.
	// This way it can be inlined without duplicating the amount of code that has to be loaded.
	word* RESTRICT fwd_delayed = NULL;
	word const* RESTRICT fws_delayed = NULL;
	int even = 0;
	while (1)
	{
	  for (int j = js; j < whole_64cols; ++j) {
	    if (!even) {
	      fwd_delayed = fwd_current;
	      fws_delayed = fws_current;
	    } else {
	      _mzd_copy_transpose_64x64_2(fwd_delayed, fwd_current, fws_delayed, fws_current, DST->rowstride, A->rowstride);
	    }
	    fwd_current += rowstride_64_dst;
	    ++fws_current;
	    even = !even;
	  }
	  nrowsb -= 64;
	  if (ncolsb % 64) {
	    _mzd_copy_transpose_64xlt64(fwd + whole_64cols * rowstride_64_dst, fws + whole_64cols, DST->rowstride, A->rowstride, ncolsb % 64);
	  }
	  fwd += 1;
	  fws += 64 * A->rowstride;
	  if (nrowsb < 64)
	    break;
	  js = 0;
	  fws_current = fws;
	  fwd_current = fwd;
	}
      }

      if (nrowsb == 0)
	continue;

      // Transpose the remaining top rows. Now 0 < nrowsb < 64.

      while (ncolsb >= 64)
      {
	_mzd_copy_transpose_lt64x64(fwd, fws, DST->rowstride, A->rowstride, nrowsb);
	ncolsb -= 64;
	fwd += 64 * DST->rowstride;
	fws += 1;
      }

      // This is true because if it wasn't then nrowsb has to be 0 and we continued before already.
      assert(ncolsb == 0);
    }
  }

  *nrowsp = nrows - R_top;
  *ncolsp = ncols - R_right;
  if (R_top < nrows)
    *fwsp = mzd_row(A, R_top) + R_right / m4ri_radix;
  if (R_right < ncols)
    *fwdp = mzd_row(DST, R_right) + R_top / m4ri_radix;
}

mzd_t *mzd_transpose(mzd_t *DST, mzd_t const *A) {
  if (DST == NULL) {
    DST = mzd_init( A->ncols, A->nrows );
  } else if (__M4RI_UNLIKELY(DST->nrows != A->ncols || DST->ncols != A->nrows)) {
    m4ri_die("mzd_transpose: Wrong size for return matrix.\n");
  } else {
    /** it seems this is taken care of in the subroutines, re-enable if running into problems **/
    //mzd_set_ui(DST,0);
  }

  if(A->nrows == 0 || A->ncols == 0)
    return mzd_copy(DST, A);
 
  if (__M4RI_LIKELY(!mzd_is_windowed(DST) && !mzd_is_windowed(A)))
    return _mzd_transpose(DST, A);
  int A_windowed = mzd_is_windowed(A);
  if (A_windowed)
    A = mzd_copy(NULL, A);
  if (__M4RI_LIKELY(!mzd_is_windowed(DST)))
    _mzd_transpose(DST, A);
  else {
    mzd_t *D = mzd_init(DST->nrows, DST->ncols);
    _mzd_transpose(D, A);
    mzd_copy(DST, D);
    mzd_free(D);
  }
  if (A_windowed)
    mzd_free((mzd_t*)A);
  return DST;
}
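
/*
 * Usage sketch (illustrative only):
 *
 *   mzd_t *A  = mzd_init(7, 130);
 *   mzd_randomize(A);
 *   mzd_t *AT = mzd_transpose(NULL, A);  // allocates the 130 x 7 result
 *   assert(mzd_read_bit(A, 3, 100) == mzd_read_bit(AT, 100, 3));
 *   mzd_free(AT);
 *   mzd_free(A);
 */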

mzd_t *mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B) {
  if (C == NULL) {
    C = mzd_init(A->nrows, B->ncols);
  } else {
    if (C->nrows != A->nrows || C->ncols != B->ncols) {
      m4ri_die("mzd_mul_naive: Provided return matrix has wrong dimensions.\n");
    }
  }
  if(B->ncols < m4ri_radix-10) { /* this cutoff is rather arbitrary */
    mzd_t *BT = mzd_transpose(NULL, B);
    _mzd_mul_naive(C, A, BT, 1);
    mzd_free (BT);
  } else {
    _mzd_mul_va(C, A, B, 1);
  }
  return C;
}

mzd_t *mzd_addmul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B) {
  if (C->nrows != A->nrows || C->ncols != B->ncols) {
    m4ri_die("mzd_addmul_naive: Provided return matrix has wrong dimensions.\n");
  }

  if(B->ncols < m4ri_radix-10) { /* this cutoff is rather arbitrary */
    mzd_t *BT = mzd_transpose(NULL, B);
    _mzd_mul_naive(C, A, BT, 0);
    mzd_free (BT);
  } else {
    _mzd_mul_va(C, A, B, 0);
  }
  return C;
}
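
/*
 * Usage sketch (illustrative only): over GF(2), adding A*B twice cancels:
 *
 *   mzd_t *A = mzd_init(64, 128), *B = mzd_init(128, 64);
 *   mzd_randomize(A); mzd_randomize(B);
 *   mzd_t *C = mzd_mul_naive(NULL, A, B);  // C  = A*B
 *   mzd_addmul_naive(C, A, B);             // C += A*B, so C == 0 again
 *   mzd_free(A); mzd_free(B); mzd_free(C);
 */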

mzd_t *_mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B, const int clear) {
  wi_t eol;
  word *a, *b, *c;

  if (clear) {
    word const mask_end = C->high_bitmask;
    /* improves performance on x86_64 but is not cross-platform */
    /* asm __volatile__ (".p2align 4\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop"); */
    for (rci_t i = 0; i < C->nrows; ++i) {
      wi_t j = 0;
      for (; j < C->width - 1; ++j) {
  	C->rows[i][j] = 0;
      }
      C->rows[i][j] &= ~mask_end;
    }
  }

  if(C->ncols % m4ri_radix) {
    eol = (C->width - 1);
  } else {
    eol = (C->width);
  }

  word parity[64];
  for (int i = 0; i < 64; ++i) {
    parity[i] = 0;
  }
  wi_t const wide = A->width;
  int const blocksize = __M4RI_MUL_BLOCKSIZE;
  for (rci_t start = 0; start + blocksize <= C->nrows; start += blocksize) {
    for (rci_t i = start; i < start + blocksize; ++i) {
      a = A->rows[i];
      c = C->rows[i];
      for (rci_t j = 0; j < m4ri_radix * eol; j += m4ri_radix) {
	for (int k = 0; k < m4ri_radix; ++k) {
          b = B->rows[j + k];
          parity[k] = a[0] & b[0];
          for (wi_t ii = wide - 1; ii >= 1; --ii)
	    parity[k] ^= a[ii] & b[ii];
        }
        c[j / m4ri_radix] ^= m4ri_parity64(parity);
      }
      
      if (eol != C->width) {
	word const mask_end = C->high_bitmask;
        /* improves performance on x86_64 but is not cross-platform */
	/* asm __volatile__ (".p2align 4\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop"); */
        for (int k = 0; k < (C->ncols % m4ri_radix); ++k) {
          b = B->rows[m4ri_radix * eol + k];
          parity[k] = a[0] & b[0];
          for (wi_t ii = 1; ii < A->width; ++ii)
            parity[k] ^= a[ii] & b[ii];
        }
        c[eol] ^= m4ri_parity64(parity) & mask_end;
      }
    }
  }

  for (rci_t i = C->nrows - (C->nrows % blocksize); i < C->nrows; ++i) {
    a = A->rows[i];
    c = C->rows[i];
    for (rci_t j = 0; j < m4ri_radix * eol; j += m4ri_radix) {
      for (int k = 0; k < m4ri_radix; ++k) {
        b = B->rows[j+k];
        parity[k] = a[0] & b[0];
        for (wi_t ii = wide - 1; ii >= 1; --ii)
          parity[k] ^= a[ii] & b[ii];
      }
      c[j/m4ri_radix] ^= m4ri_parity64(parity);
    }
    
    if (eol != C->width) {
      word const mask_end = C->high_bitmask;
      /* improves performance on x86_64 but is not cross-platform */
      /* asm __volatile__ (".p2align 4\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop"); */
      for (int k = 0; k < (C->ncols % m4ri_radix); ++k) {
        b = B->rows[m4ri_radix * eol + k];
        parity[k] = a[0] & b[0];
        for (wi_t ii = 1; ii < A->width; ++ii)
          parity[k] ^= a[ii] & b[ii];
      }
      c[eol] ^= m4ri_parity64(parity) & mask_end;
    }
  }

  __M4RI_DD_MZD(C);
  return C;
}

mzd_t *_mzd_mul_va(mzd_t *C, mzd_t const *v, mzd_t const *A, int const clear) {
  if(clear)
    mzd_set_ui(C, 0);

  rci_t const m = v->nrows;
  rci_t const n = v->ncols;
  
  for(rci_t i = 0; i < m; ++i)
    for(rci_t j = 0; j < n; ++j)
      if (mzd_read_bit(v,i,j))
        mzd_combine(C,i,0, C,i,0, A,j,0);

  __M4RI_DD_MZD(C);
  return C;
}

void mzd_randomize(mzd_t *A) {
  wi_t const width = A->width - 1;
  word const mask_end = A->high_bitmask;
  for(rci_t i = 0; i < A->nrows; ++i) {
    for(wi_t j = 0; j < width; ++j)
      A->rows[i][j] = m4ri_random_word();
    A->rows[i][width] ^= (A->rows[i][width] ^ m4ri_random_word()) & mask_end;
  }

  __M4RI_DD_MZD(A);
}

void mzd_set_ui( mzd_t *A, unsigned int value) {
  word const mask_end = A->high_bitmask;
  
  for (rci_t i = 0; i < A->nrows; ++i) {
    word *row = A->rows[i];
    for(wi_t j = 0; j < A->width - 1; ++j)
      row[j] = 0;
    row[A->width - 1] &= ~mask_end;
  }

  if(value % 2 == 0) {
    __M4RI_DD_MZD(A);
    return;
  }

  rci_t const stop = MIN(A->nrows, A->ncols);
  for (rci_t i = 0; i < stop; ++i) {
    mzd_write_bit(A, i, i, 1);
  }

  __M4RI_DD_MZD(A);
}
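
/*
 * Usage sketch (illustrative only): mzd_set_ui(A, 0) clears A, while any odd
 * value writes ones on the main diagonal:
 *
 *   mzd_t *A = mzd_init(4, 4);
 *   mzd_set_ui(A, 1);  // A is now the 4 x 4 identity matrix
 *   mzd_free(A);
 */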

int mzd_equal(mzd_t const *A, mzd_t const *B) {
  if (A->nrows != B->nrows) return FALSE;
  if (A->ncols != B->ncols) return FALSE;
  if (A == B) return TRUE;

  wi_t Awidth = A->width - 1;
  
  for (rci_t i = 0; i < A->nrows; ++i) {
    for (wi_t j = 0; j < Awidth; ++j) {
      if (A->rows[i][j] != B->rows[i][j])
        return FALSE;
    }
  }

  word const mask_end = A->high_bitmask;
  for (rci_t i = 0; i < A->nrows; ++i) {
    if (((A->rows[i][Awidth] ^ B->rows[i][Awidth]) & mask_end))
      return FALSE;
  }
  return TRUE;
}

int mzd_cmp(mzd_t const *A, mzd_t const *B) {
  if(A->nrows < B->nrows) return -1;
  if(B->nrows < A->nrows) return 1;
  if(A->ncols < B->ncols) return -1;
  if(B->ncols < A->ncols) return 1;

  const word mask_end   = A->high_bitmask;
  const wi_t n = A->width-1;

  /* Columns with large index are "larger", but rows with small index
     are more significant than rows with large index. */

  for(rci_t i=0; i<A->nrows; i++) {
    if ((A->rows[i][n]&mask_end) < (B->rows[i][n]&mask_end))
      return -1;
    else if ((A->rows[i][n]&mask_end) > (B->rows[i][n]&mask_end))
      return 1;

    for(wi_t j=n-1; j>=0; j--) {
      if (A->rows[i][j] < B->rows[i][j])
        return -1;
      else if (A->rows[i][j] > B->rows[i][j])
        return 1;
    }
  }
  return 0;
}

mzd_t *mzd_copy(mzd_t *N, mzd_t const *P) {
  if (N == P)
    return N;

  if (N == NULL) {
    N = mzd_init(P->nrows, P->ncols);
  } else {
    if (N->nrows < P->nrows || N->ncols < P->ncols)
      m4ri_die("mzd_copy: Target matrix is too small.");
  }
  word *p_truerow, *n_truerow;
  wi_t const wide = P->width - 1;
  word mask_end = P->high_bitmask;
  for (rci_t i = 0; i < P->nrows; ++i) {
    p_truerow = P->rows[i];
    n_truerow = N->rows[i];
    for (wi_t j = 0; j < wide; ++j)
      n_truerow[j] = p_truerow[j];
    n_truerow[wide] = (n_truerow[wide] & ~mask_end) | (p_truerow[wide] & mask_end);
  }
  __M4RI_DD_MZD(N);
  return N;
}

/* This is sometimes called augment */
mzd_t *mzd_concat(mzd_t *C, mzd_t const *A, mzd_t const *B) {
  if (A->nrows != B->nrows) {
    m4ri_die("mzd_concat: Bad arguments to concat!\n");
  }

  if (C == NULL) {
    C = mzd_init(A->nrows, A->ncols + B->ncols);
  } else if (C->nrows != A->nrows || C->ncols != (A->ncols + B->ncols)) {
    m4ri_die("mzd_concat: C has wrong dimension!\n");
  }

  for (rci_t i = 0; i < A->nrows; ++i) {
    word *dst_truerow = C->rows[i];
    word *src_truerow = A->rows[i];
    for (wi_t j = 0; j < A->width; ++j) {
      dst_truerow[j] = src_truerow[j];
    }
  }

  for (rci_t i = 0; i < B->nrows; ++i) {
    for (rci_t j = 0; j < B->ncols; ++j) {
      mzd_write_bit(C, i, j + A->ncols, mzd_read_bit(B, i, j));
    }
  }

  __M4RI_DD_MZD(C);
  return C;
}

mzd_t *mzd_stack(mzd_t *C, mzd_t const *A, mzd_t const *B) {
  if (A->ncols != B->ncols) {
    m4ri_die("mzd_stack: A->ncols (%d) != B->ncols (%d)!\n", A->ncols, B->ncols);
  }

  if (C == NULL) {
    C = mzd_init(A->nrows + B->nrows, A->ncols);
  } else if (C->nrows != (A->nrows + B->nrows) || C->ncols != A->ncols) {
    m4ri_die("mzd_stack: C has wrong dimension!\n");
  }
  
  for(rci_t i = 0; i < A->nrows; ++i) {
    word *src_truerow = A->rows[i];
    word *dst_truerow = C->rows[i];
    for (wi_t j = 0; j < A->width; ++j) {
      dst_truerow[j] = src_truerow[j]; 
    }
  }

  for(rci_t i = 0; i < B->nrows; ++i) {
    word *dst_truerow = C->rows[A->nrows + i];
    word *src_truerow = B->rows[i];
    for (wi_t j = 0; j < B->width; ++j) {
      dst_truerow[j] = src_truerow[j]; 
    }
  }

  __M4RI_DD_MZD(C);
  return C;
}
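
/*
 * Usage sketch (illustrative only): mzd_concat builds [A|B] side by side,
 * while mzd_stack puts A on top of B:
 *
 *   mzd_t *A   = mzd_init(4, 6), *B = mzd_init(4, 6);
 *   mzd_t *AB  = mzd_concat(NULL, A, B);  // 4 x 12
 *   mzd_t *AoB = mzd_stack(NULL, A, B);   // 8 x 6
 *   mzd_free(AB); mzd_free(AoB); mzd_free(A); mzd_free(B);
 */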

mzd_t *mzd_invert_naive(mzd_t *INV, mzd_t const *A, mzd_t const *I) {
  mzd_t *H;

  H = mzd_concat(NULL, A, I);

  rci_t x = mzd_echelonize_naive(H, TRUE);

  if (x == 0) { 
    mzd_free(H); 
    return NULL; 
  }
  
  INV = mzd_submatrix(INV, H, 0, A->ncols, A->nrows, 2 * A->ncols);

  mzd_free(H);

  __M4RI_DD_MZD(INV);
  return INV;
}

mzd_t *mzd_add(mzd_t *ret, mzd_t const *left, mzd_t const *right) {
  if (left->nrows != right->nrows || left->ncols != right->ncols) {
    m4ri_die("mzd_add: rows and columns must match.\n");
  }
  if (ret == NULL) {
    ret = mzd_init(left->nrows, left->ncols);
  } else if (ret != left) {
    if (ret->nrows != left->nrows || ret->ncols != left->ncols) {
      m4ri_die("mzd_add: rows and columns of returned matrix must match.\n");
    }
  }
  return _mzd_add(ret, left, right);
}
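
/*
 * Usage sketch (illustrative only): addition over GF(2) is XOR, so A + A = 0:
 *
 *   mzd_t *A = mzd_init(5, 5);
 *   mzd_randomize(A);
 *   mzd_t *Z = mzd_add(NULL, A, A);  // the 5 x 5 zero matrix
 *   mzd_free(Z); mzd_free(A);
 */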

mzd_t *_mzd_add(mzd_t *C, mzd_t const *A, mzd_t const *B) {
  rci_t const nrows = MIN(MIN(A->nrows, B->nrows), C->nrows);

  if (C == B) { //swap
    mzd_t const *tmp = A;
    A = B;
    B = tmp;
  }

  word const mask_end = C->high_bitmask;

  switch(A->width) {
  case 0:
    return C;
  case 1:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] ^= ((A->rows[i][0] ^ B->rows[i][0] ^ C->rows[i][0]) & mask_end);
    }
    break;
  case 2:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] ^= ((A->rows[i][1] ^ B->rows[i][1] ^ C->rows[i][1]) & mask_end);
    }
    break;
  case 3:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] ^= ((A->rows[i][2] ^ B->rows[i][2] ^ C->rows[i][2]) & mask_end);
    }
    break;
  case 4:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2];
      C->rows[i][3] ^= ((A->rows[i][3] ^ B->rows[i][3] ^ C->rows[i][3]) & mask_end);
    }
    break;
  case 5:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2];
      C->rows[i][3] = A->rows[i][3] ^ B->rows[i][3];
      C->rows[i][4] ^= ((A->rows[i][4] ^ B->rows[i][4] ^ C->rows[i][4]) & mask_end);
    }
    break;
  case 6:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2];
      C->rows[i][3] = A->rows[i][3] ^ B->rows[i][3];
      C->rows[i][4] = A->rows[i][4] ^ B->rows[i][4];
      C->rows[i][5] ^= ((A->rows[i][5] ^ B->rows[i][5] ^ C->rows[i][5]) & mask_end);
    }
    break;
  case 7:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2];
      C->rows[i][3] = A->rows[i][3] ^ B->rows[i][3];
      C->rows[i][4] = A->rows[i][4] ^ B->rows[i][4];
      C->rows[i][5] = A->rows[i][5] ^ B->rows[i][5];
      C->rows[i][6] ^= ((A->rows[i][6] ^ B->rows[i][6] ^ C->rows[i][6]) & mask_end);
    }
    break;
  case 8:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2];
      C->rows[i][3] = A->rows[i][3] ^ B->rows[i][3];
      C->rows[i][4] = A->rows[i][4] ^ B->rows[i][4];
      C->rows[i][5] = A->rows[i][5] ^ B->rows[i][5];
      C->rows[i][6] = A->rows[i][6] ^ B->rows[i][6];
      C->rows[i][7] ^= ((A->rows[i][7] ^ B->rows[i][7] ^ C->rows[i][7]) & mask_end);
    }
    break;

  default:
    for(rci_t i = 0; i < nrows; ++i) {
      mzd_combine_even(C,i,0, A,i,0, B,i,0);
    }
  }

  __M4RI_DD_MZD(C);
  return C;
}

mzd_t *mzd_submatrix(mzd_t *S, mzd_t const *M, rci_t const startrow, rci_t const startcol, rci_t const endrow, rci_t const endcol) {
  rci_t const nrows = endrow - startrow;
  rci_t const ncols = endcol - startcol;

  if (S == NULL) {
    S = mzd_init(nrows, ncols);
  } else if( (S->nrows < nrows) || (S->ncols < ncols) ) {
    m4ri_die("mzd_submatrix: got S with dimension %d x %d but expected %d x %d\n", S->nrows, S->ncols, nrows, ncols);
  }

  if (startcol % m4ri_radix == 0) {

    wi_t const startword = startcol / m4ri_radix;
    /* we start at the beginning of a word */
    if(ncols / m4ri_radix != 0) {
      for(rci_t x = startrow, i = 0; i < nrows; ++i, ++x) {
        memcpy(S->rows[i], M->rows[x] + startword, sizeof(word) * (ncols / m4ri_radix));
      }
    }
    if (ncols % m4ri_radix) {
      word const mask_end = __M4RI_LEFT_BITMASK(ncols % m4ri_radix);
      for(rci_t x = startrow, i = 0; i < nrows; ++i, ++x) {
        /* process remaining bits */
        word temp = M->rows[x][startword + ncols / m4ri_radix] & mask_end;
        S->rows[i][ncols / m4ri_radix] = temp;
      } 
    }
  } else {
    wi_t j;
    for(rci_t i=0; i<nrows; i++) {
      for(j=0; j+m4ri_radix<ncols; j+=m4ri_radix)
        S->rows[i][j/m4ri_radix] = mzd_read_bits(M, startrow+i, startcol+j, m4ri_radix);
      S->rows[i][j/m4ri_radix] &= ~S->high_bitmask;
      S->rows[i][j/m4ri_radix] |= mzd_read_bits(M, startrow+i, startcol+j, ncols - j) & S->high_bitmask;
    }
  }
  __M4RI_DD_MZD(S);
  return S;
}
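
/*
 * Usage sketch (illustrative only): mzd_submatrix takes half-open row and
 * column ranges, so this copies rows 2..3 and columns 8..23 of M:
 *
 *   mzd_t *S = mzd_submatrix(NULL, M, 2, 8, 4, 24);   // 2 x 16 window
 */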

void mzd_col_swap(mzd_t *M, rci_t const cola, rci_t const colb) {
  if (cola == colb)
    return;

  wi_t const a_word = cola / m4ri_radix;
  wi_t const b_word = colb / m4ri_radix;

  int const a_bit = cola % m4ri_radix;
  int const b_bit = colb % m4ri_radix;

  word* RESTRICT ptr = mzd_first_row(M);
  int max_bit = MAX(a_bit, b_bit);
  int count = mzd_rows_in_block(M, 0);
  assert(count > 0);
  int min_bit = a_bit + b_bit - max_bit;
  int block = 0;
  int offset = max_bit - min_bit;
  word mask = m4ri_one << min_bit;
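  /* Both branches below swap the two column bits of every row with an XOR
   * trick: x = (w ^ (w >> offset)) & mask is non-zero exactly when the two
   * bits differ, and XORing x (and x << offset) back in flips both bits in
   * that case only, i.e. swaps them. */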

  if (a_word == b_word) {
    while(1) {
      ptr += a_word;
      int fast_count = count / 4;
      int rest_count = count - 4 * fast_count;
      word xor[4];
      wi_t const rowstride = M->rowstride;
      while (fast_count--) {
        xor[0] = ptr[0];
        xor[1] = ptr[rowstride];
        xor[2] = ptr[2 * rowstride];
        xor[3] = ptr[3 * rowstride];
        xor[0] ^= xor[0] >> offset;
        xor[1] ^= xor[1] >> offset;
        xor[2] ^= xor[2] >> offset;
        xor[3] ^= xor[3] >> offset;
        xor[0] &= mask;
        xor[1] &= mask;
        xor[2] &= mask;
        xor[3] &= mask;
        xor[0] |= xor[0] << offset;
        xor[1] |= xor[1] << offset;
        xor[2] |= xor[2] << offset;
        xor[3] |= xor[3] << offset;
        ptr[0] ^= xor[0];
        ptr[rowstride] ^= xor[1];
        ptr[2 * rowstride] ^= xor[2];
        ptr[3 * rowstride] ^= xor[3];
        ptr += 4 * rowstride;
      }
      while (rest_count--) {
        word xor = *ptr;
        xor ^= xor >> offset;
        xor &= mask;
        *ptr ^= xor | (xor << offset);
        ptr += rowstride;
      }
      if ((count = mzd_rows_in_block(M, ++block)) <= 0)
        break;
      ptr = mzd_first_row_next_block(M, block);
    }
  } else {
    word* RESTRICT min_ptr;
    wi_t max_offset;
    if (min_bit == a_bit) {
      min_ptr = ptr + a_word;
      max_offset = b_word - a_word;
    } else {
      min_ptr = ptr + b_word;
      max_offset = a_word - b_word;
    }
    while(1) {
      wi_t const rowstride = M->rowstride;
      while(count--) {
        word xor = (min_ptr[0] ^ (min_ptr[max_offset] >> offset)) & mask;
        min_ptr[0] ^= xor;
        min_ptr[max_offset] ^= xor << offset;
        min_ptr += rowstride;
      }
      if ((count = mzd_rows_in_block(M, ++block)) <= 0)
        break;
      ptr = mzd_first_row_next_block(M, block);
      if (min_bit == a_bit)
        min_ptr = ptr + a_word;
      else
        min_ptr = ptr + b_word;
    }
  }

  __M4RI_DD_MZD(M);
}

int mzd_is_zero(mzd_t const *A) {
  word status = 0;
  word mask_end = A->high_bitmask;
  for (rci_t i = 0; i < A->nrows; ++i) {
    for (wi_t j = 0; j < A->width - 1; ++j)
      status |= A->rows[i][j];
    status |= A->rows[i][A->width - 1] & mask_end;
    if(status)
      return 0;
  }
  return !status;
}

void mzd_copy_row(mzd_t *B, rci_t i, mzd_t const *A, rci_t j) {
  assert(B->ncols >= A->ncols);
  wi_t const width = MIN(B->width, A->width) - 1;

  word const *a = A->rows[j];
  word *b = B->rows[i];
 
  word const mask_end = __M4RI_LEFT_BITMASK(A->ncols % m4ri_radix);

  if (width != 0) {
    for(wi_t k = 0; k < width; ++k)
      b[k] = a[k];
    b[width] = (b[width] & ~mask_end) | (a[width] & mask_end);
  } else {
    b[0] = (a[0] & mask_end) | (b[0] & ~mask_end);
  }

  __M4RI_DD_ROW(B, i);
}


int mzd_find_pivot(mzd_t const *A, rci_t start_row, rci_t start_col, rci_t *r, rci_t *c) {
  rci_t const nrows = A->nrows;
  rci_t const ncols = A->ncols;
  word data = 0;
  rci_t row_candidate = 0;
  if(A->ncols - start_col < m4ri_radix) {
    for(rci_t j = start_col; j < A->ncols; j += m4ri_radix) {
      int const length = MIN(m4ri_radix, ncols - j);
      for(rci_t i = start_row; i < nrows; ++i) {
        word const curr_data = mzd_read_bits(A, i, j, length);
        if (m4ri_lesser_LSB(curr_data, data)) {
          row_candidate = i;
          data = curr_data;
        }
      }
      if(data) {
        *r = row_candidate;
        for(int l = 0; l < length; ++l) {
          if(__M4RI_GET_BIT(data, l)) {
            *c = j + l;
            break;
          }
        }
        __M4RI_DD_RCI(*r);
        __M4RI_DD_RCI(*c);
        __M4RI_DD_INT(1);
        return 1;
      }
    }
  } else {
    /* we definitely have more than one word */
    /* handle first word */
    int const bit_offset = (start_col % m4ri_radix);
    wi_t const word_offset = start_col / m4ri_radix;
    word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix-bit_offset);
    for(rci_t i = start_row; i < nrows; ++i) {
      word const curr_data = A->rows[i][word_offset] & mask_begin;
      if (m4ri_lesser_LSB(curr_data, data)) {
        row_candidate = i;
        data = curr_data;
        if(__M4RI_GET_BIT(data,bit_offset)) {
          break;
        }
      }
    }
    if(data) {
      *r = row_candidate;
      data >>= bit_offset;
      assert(data);
      for(int l = 0; l < (m4ri_radix - bit_offset); ++l) {
        if(__M4RI_GET_BIT(data, l)) {
          *c = start_col + l;
          break;
        }
      }
      __M4RI_DD_RCI(*r);
      __M4RI_DD_RCI(*c);
      __M4RI_DD_INT(1);
      return 1;
    }
    /* handle complete words */
    for(wi_t wi = word_offset + 1; wi < A->width - 1; ++wi) {
      for(rci_t i = start_row; i < nrows; ++i) {
        word const curr_data = A->rows[i][wi];
        if (m4ri_lesser_LSB(curr_data, data)) {
          row_candidate = i;
          data = curr_data;
          if(__M4RI_GET_BIT(data, 0))
            break;
        }
      }
      if(data) {
        *r = row_candidate;
        for(int l = 0; l < m4ri_radix; ++l) {
          if(__M4RI_GET_BIT(data, l)) {
            *c = wi * m4ri_radix + l;
            break;
          }
        }
        __M4RI_DD_RCI(*r);
        __M4RI_DD_RCI(*c);
        __M4RI_DD_INT(1);
        return 1;
      }
    }
    /* handle last word */
    int const end_offset = (A->ncols % m4ri_radix) ? (A->ncols % m4ri_radix) : m4ri_radix;
    word const mask_end = __M4RI_LEFT_BITMASK(end_offset % m4ri_radix);
    wi_t wi = A->width - 1;
    for(rci_t i = start_row; i < nrows; ++i) {
      word const curr_data = A->rows[i][wi] & mask_end;
      if (m4ri_lesser_LSB(curr_data, data)) {
        row_candidate = i;
        data = curr_data;
        if(__M4RI_GET_BIT(data,0))
          break;
      }
    }
    if(data) {
      *r = row_candidate;
      for(int l = 0; l < end_offset; ++l) {
        if(__M4RI_GET_BIT(data, l)) {
          *c = wi * m4ri_radix + l;
          break;
        }
      }
      __M4RI_DD_RCI(*r);
      __M4RI_DD_RCI(*c);
      __M4RI_DD_INT(1);
      return 1;
    }
  }
  __M4RI_DD_RCI(*r);
  __M4RI_DD_RCI(*c);
  __M4RI_DD_INT(0);
  return 0;
}
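
/*
 * Usage sketch (illustrative only): locating the next pivot of A during
 * Gaussian elimination, starting the search at (start_row, start_col):
 *
 *   rci_t r, c;
 *   if (mzd_find_pivot(A, start_row, start_col, &r, &c)) {
 *     mzd_row_swap(A, start_row, r);
 *     ...eliminate below the pivot, continue at column c + 1...
 *   }
 */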


#define MASK(c)    (((uint64_t)(-1)) / (__M4RI_TWOPOW(__M4RI_TWOPOW(c)) + 1))
#define COUNT(x,c) (((x) & MASK(c)) + (((x) >> (__M4RI_TWOPOW(c))) & MASK(c)))
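
/*
 * MASK(c) selects alternating groups of 2^c bits: MASK(0) = 0x5555...,
 * MASK(1) = 0x3333..., MASK(2) = 0x0f0f....  Each COUNT(x, c) step adds
 * neighbouring 2^c-bit fields pairwise, so six steps reduce a 64-bit word
 * to its population count (a standard SWAR bit count).
 */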

static inline int m4ri_bitcount(word w)  {
   uint64_t n = __M4RI_CONVERT_TO_UINT64_T(w);
   n = COUNT(n, 0);
   n = COUNT(n, 1);
   n = COUNT(n, 2);
   n = COUNT(n, 3);
   n = COUNT(n, 4);
   n = COUNT(n, 5);
   return (int)n;
}


double _mzd_density(mzd_t const *A, wi_t res, rci_t r, rci_t c) {
  size_t count = 0;
  size_t total = 0;
  
  if(A->width == 1) {
    for(rci_t i = r; i < A->nrows; ++i)
      for(rci_t j = c; j < A->ncols; ++j)
        if(mzd_read_bit(A, i, j))
          ++count;
    return ((double)count)/(1.0 * (A->nrows - r) * (A->ncols - c));
  }

  if(res == 0)
    res = A->width / 100;
  if (res < 1)
    res = 1;

  for(rci_t i = r; i < A->nrows; ++i) {
    word *truerow = A->rows[i];
    for(rci_t j = c; j < m4ri_radix; ++j)
      if(mzd_read_bit(A, i, j))
        ++count;
    total += m4ri_radix - c;

    for(wi_t j = MAX(1, c / m4ri_radix); j < A->width - 1; j += res) {
      count += m4ri_bitcount(truerow[j]);
      total += m4ri_radix;
    }
    for(int j = 0; j < A->ncols % m4ri_radix; ++j)
      if(mzd_read_bit(A, i, m4ri_radix * (A->ncols / m4ri_radix) + j))
        ++count;
    total += A->ncols % m4ri_radix;
  }

  return (double)count / total;
}

double mzd_density(mzd_t const *A, wi_t res) {
  return _mzd_density(A, res, 0, 0);
}
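
/*
 * Usage sketch (illustrative only): res is the sampling resolution.  With
 * res > 1 only every res-th word of a row is inspected, trading accuracy
 * for speed; res = 0 picks a step of width/100, i.e. roughly 100 sampled
 * words per row.
 *
 *   double d_exact  = mzd_density(A, 1);
 *   double d_approx = mzd_density(A, 16);
 */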

rci_t mzd_first_zero_row(mzd_t const *A) {
  word const mask_end = __M4RI_LEFT_BITMASK(A->ncols % m4ri_radix);
  wi_t const end = A->width - 1;
  word *row;
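  /* Scan upwards from the bottom: if the first non-zero row from below has
   * index i, then rows i+1 .. nrows-1 are zero and i+1 is returned.  This
   * is the index of the first zero row provided all zero rows are trailing,
   * as in (reduced) row echelon forms. */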

  for(rci_t i = A->nrows - 1; i >= 0; --i) {
    row = A->rows[i];
    word tmp = row[0];
    for (wi_t j = 1; j < end; ++j)
      tmp |= row[j];
    tmp |= row[end] & mask_end;
    if(tmp) {
      __M4RI_DD_INT(i + 1);
      return i + 1;
    }
  }
  __M4RI_DD_INT(0);
  return 0;
}

mzd_t *mzd_extract_u(mzd_t *U, mzd_t const *A) {
  rci_t k = MIN(A->nrows, A->ncols);
  if (U == NULL)
    U = mzd_submatrix(NULL, A, 0, 0, k, k);
  else
    assert(U->nrows == k && U->ncols == k);
  for(rci_t i=1; i<U->nrows; i++) {
    for(wi_t j=0; j<i/m4ri_radix; j++) {
      U->rows[i][j] = 0;
    }
    if(i%m4ri_radix)
      mzd_clear_bits(U, i, (i/m4ri_radix)*m4ri_radix, i%m4ri_radix);
  }
  return U;
}

mzd_t *mzd_extract_l(mzd_t *L, mzd_t const *A) {
  rci_t k = MIN(A->nrows, A->ncols);
  if (L == NULL)
    L = mzd_submatrix(NULL, A, 0, 0, k, k);
  else
    assert(L->nrows == k && L->ncols == k);
  for(rci_t i=0; i<L->nrows-1; i++) {
    if((i+1)%m4ri_radix)
      mzd_clear_bits(L, i, i+1, m4ri_radix - (i+1)%m4ri_radix);
    for(wi_t j=(i/m4ri_radix+1); j<L->width; j++) {
      L->rows[i][j] = 0;
    }
  }
  return L;
}
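
/*
 * Usage sketch (illustrative only): splitting a matrix that holds both
 * triangular factors packed together (e.g. after an LU-style
 * factorization) into its parts, each including the diagonal:
 *
 *   mzd_t *U = mzd_extract_u(NULL, A);   // upper triangular part
 *   mzd_t *L = mzd_extract_l(NULL, A);   // lower triangular part
 */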