// NOTE(review): this excerpt keeps only some lines of the routine, each still
// prefixed with its original line number. The template header, the function
// name, the braces and several guard/else lines are not visible here.
// Visible behaviour: consumes `length` bytes in 16-byte blocks, six at a time
// through func6 when BT_AllowParallel is set, then one at a time through func1.
// Parameters seen below:
//   subKeys, rounds - handed to func6/func1 (rounds is cast to unsigned int)
//   inBlocks        - source blocks; read and updated in place as a counter
//                     when BT_InBlockIsCounter is set
//   xorBlocks       - optional blocks XORed into the data; may be NULLPTR
//   outBlocks       - destination of the processed blocks
//   length          - number of bytes still to process
//   flags           - BT_* bit flags tested below
101            const W *subKeys, 
size_t rounds, 
const byte *inBlocks,
 
  102            const byte *xorBlocks, 
byte *outBlocks, 
size_t length, 
word32 flags)
 
         // Vector holding 1<<24 in its last 32-bit word; it is added to a
         // block below to derive the next counter block.
  109    const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
 
  110    const uint32x4_t s_one = vld1q_u32(w_one);
 
  112    const size_t blockSize = 16;
 
         // Per-block pointer steps. A step of 0 keeps the pointer in place:
         // the input does not advance in counter mode or when
         // BT_DontIncrementInOutPointers is set.
  115    size_t inIncrement = (flags & (
EnumToInt(BT_InBlockIsCounter)|
EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
 
  116    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
 
  117    size_t outIncrement = (flags & 
EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;
 
         // With xorBlocks present, BT_XorInput decides between the two.
         // NOTE(review): the lines that test xorInput/xorOutput are elided in
         // this excerpt; presumably they guard the XOR groups further down.
  120    const bool xorInput = (xorBlocks != NULLPTR) && (flags & 
EnumToInt(BT_XorInput));
 
  121    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & 
EnumToInt(BT_XorInput));
 
         // Reverse direction: start at the last block and negate the steps.
         // The steps are size_t, so 0-x wraps; presumably PtrAdd then walks
         // backwards - confirm. Assumes length >= blockSize, otherwise
         // length - blockSize wraps - TODO confirm the caller guarantees it.
  123    if (flags & BT_ReverseDirection)
 
  125        inBlocks = 
PtrAdd(inBlocks, length - blockSize);
 
  126        xorBlocks = 
PtrAdd(xorBlocks, length - blockSize);
 
  127        outBlocks = 
PtrAdd(outBlocks, length - blockSize);
 
  128        inIncrement = 0-inIncrement;
 
  129        xorIncrement = 0-xorIncrement;
 
  130        outIncrement = 0-outIncrement;
 
         // Parallel path: six blocks per iteration.
  133    if (flags & BT_AllowParallel)
 
  135        while (length >= 6*blockSize)
 
  137            uint64x2_t block0, block1, block2, block3, block4, block5;
 
                 // Counter mode: derive six consecutive counter blocks from
                 // the one at inBlocks and write the following counter back.
  138            if (flags & BT_InBlockIsCounter)
 
  140                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
 
  141                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
 
  142                block1 = vaddq_u64(block0, one);
 
  143                block2 = vaddq_u64(block1, one);
 
  144                block3 = vaddq_u64(block2, one);
 
  145                block4 = vaddq_u64(block3, one);
 
  146                block5 = vaddq_u64(block4, one);
 
  147                vst1q_u8(
const_cast<byte*
>(inBlocks),
 
  148                    vreinterpretq_u8_u64(vaddq_u64(block5, one)));
 
                     // NOTE(review): the 'else' line is elided here;
                     // presumably these loads are the non-counter path.
  152                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
 
  153                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
  154                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
 
  155                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
  156                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
 
  157                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
  158                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
 
  159                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
  160                block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
 
  161                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
  162                block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
 
  163                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
                     // XOR before the transform.
                     // NOTE(review): guard line elided; presumably runs only
                     // when xorInput is set - confirm.
  168                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  169                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  170                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  171                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  172                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  173                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  174                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  175                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  176                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  177                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  178                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  179                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
                 // Transform all six blocks with one call.
  182            func6(block0, block1, block2, block3, block4, block5, subKeys, 
static_cast<unsigned int>(rounds));
 
                     // XOR after the transform.
                     // NOTE(review): guard line elided; presumably runs only
                     // when xorOutput is set - confirm.
  186                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  187                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  188                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  189                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  190                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  191                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  192                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  193                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  194                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  195                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  196                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  197                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
                 // Write the six results, stepping outBlocks after each store.
  200            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
 
  201            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  202            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
 
  203            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  204            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
 
  205            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  206            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
 
  207            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  208            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
 
  209            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  210            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
 
  211            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  213            length -= 6*blockSize;
 
         // Remaining blocks, one per iteration.
         // NOTE(review): the declaration of 'block' and the XOR guards are
         // elided in this excerpt.
  217    while (length >= blockSize)
 
  220        block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
 
  223            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
             // Counter mode: bump only the last byte of the counter block,
             // in place, after it has been loaded.
  225        if (flags & BT_InBlockIsCounter)
 
  226            const_cast<byte *
>(inBlocks)[15]++;
 
  228        func1(block, subKeys, 
static_cast<unsigned int>(rounds));
 
  231            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  233        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));
 
  235        inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
  236        outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  237        xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
 
  // NOTE(review): this excerpt keeps only some lines of the routine, each
  // still prefixed with its original line number. The template header, the
  // function name, the braces and several guard/else lines are not visible.
  // Visible behaviour: consumes `length` bytes in 16-byte blocks, four at a
  // time through func4 when BT_AllowParallel is set, then one at a time
  // through func1. Blocks are held as uint32x4_t.
  // Parameters seen below:
  //   subKeys, rounds - handed to func4/func1 (rounds is cast to unsigned int)
  //   inBlocks        - source blocks; read and updated in place as a counter
  //                     when BT_InBlockIsCounter is set
  //   xorBlocks       - optional blocks XORed into the data; may be NULLPTR
  //   outBlocks       - destination of the processed blocks
  //   length          - number of bytes still to process
  //   flags           - BT_* bit flags tested below
  255            const W *subKeys, 
size_t rounds, 
const byte *inBlocks,
 
  256            const byte *xorBlocks, 
byte *outBlocks, 
size_t length, 
word32 flags)
 
         // Vector holding 1<<24 in its last 32-bit word; it is added to a
         // block below to derive the next counter block.
  263    const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
 
  264    const uint32x4_t s_one = vld1q_u32(w_one);
 
  266    const size_t blockSize = 16;
 
         // Per-block pointer steps; 0 keeps the pointer in place.
  269    size_t inIncrement = (flags & (
EnumToInt(BT_InBlockIsCounter)|
EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
 
  270    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
 
  271    size_t outIncrement = (flags & 
EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;
 
         // With xorBlocks present, BT_XorInput decides between the two.
         // NOTE(review): the lines that test xorInput/xorOutput are elided in
         // this excerpt; presumably they guard the XOR groups further down.
  274    const bool xorInput = (xorBlocks != NULLPTR) && (flags & 
EnumToInt(BT_XorInput));
 
  275    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & 
EnumToInt(BT_XorInput));
 
         // Reverse direction: start at the last block and negate the steps
         // (size_t, so 0-x wraps). Assumes length >= blockSize - TODO confirm.
  277    if (flags & BT_ReverseDirection)
 
  279        inBlocks = 
PtrAdd(inBlocks, length - blockSize);
 
  280        xorBlocks = 
PtrAdd(xorBlocks, length - blockSize);
 
  281        outBlocks = 
PtrAdd(outBlocks, length - blockSize);
 
  282        inIncrement = 0-inIncrement;
 
  283        xorIncrement = 0-xorIncrement;
 
  284        outIncrement = 0-outIncrement;
 
         // Parallel path: four blocks per iteration.
  287    if (flags & BT_AllowParallel)
 
  289        while (length >= 4*blockSize)
 
  291            uint32x4_t block0, block1, block2, block3;
 
                 // Counter mode: derive four consecutive counter blocks from
                 // the one at inBlocks and write the following counter back.
                 // The additions are done on the 64-bit view of the vectors.
  292            if (flags & BT_InBlockIsCounter)
 
  294                const uint32x4_t one = s_one;
 
  295                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
 
  296                block1 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block0), vreinterpretq_u64_u32(one)));
 
  297                block2 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block1), vreinterpretq_u64_u32(one)));
 
  298                block3 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block2), vreinterpretq_u64_u32(one)));
 
  299                vst1q_u8(
const_cast<byte*
>(inBlocks), vreinterpretq_u8_u64(vaddq_u64(
 
  300                    vreinterpretq_u64_u32(block3), vreinterpretq_u64_u32(one))));
 
                     // NOTE(review): the 'else' line is elided here;
                     // presumably these loads are the non-counter path.
  304                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
 
  305                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
  306                block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
 
  307                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
  308                block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
 
  309                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
  310                block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
 
  311                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
                     // XOR before the transform.
                     // NOTE(review): guard line elided; presumably runs only
                     // when xorInput is set - confirm.
  316                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
 
  317                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  318                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
 
  319                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  320                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
 
  321                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  322                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
 
  323                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
                 // Transform all four blocks with one call.
  326            func4(block0, block1, block2, block3, subKeys, 
static_cast<unsigned int>(rounds));
 
                     // XOR after the transform.
                     // NOTE(review): guard line elided; presumably runs only
                     // when xorOutput is set - confirm.
  330                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
 
  331                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  332                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
 
  333                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  334                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
 
  335                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  336                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
 
  337                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
                 // Write the four results, stepping outBlocks after each store.
  340            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
 
  341            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  342            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
 
  343            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  344            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block2));
 
  345            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  346            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block3));
 
  347            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  349            length -= 4*blockSize;
 
         // Remaining blocks, one per iteration.
         // NOTE(review): the XOR guard lines are elided in this excerpt.
  353    while (length >= blockSize)
 
  355        uint32x4_t block = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
 
  358            block = veorq_u32(block, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
 
             // Counter mode: bump only the last byte of the counter block,
             // in place, after it has been loaded.
  360        if (flags & BT_InBlockIsCounter)
 
  361            const_cast<byte *
>(inBlocks)[15]++;
 
  363        func1(block, subKeys, 
static_cast<unsigned int>(rounds));
 
  366            block = veorq_u32(block, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
 
  368        vst1q_u8(outBlocks, vreinterpretq_u8_u32(block));
 
  370        inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
  371        outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  372        xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
 
  // NOTE(review): this excerpt keeps only some lines of the routine, each
  // still prefixed with its original line number. The template header, the
  // function name, the braces and several guard/else lines are not visible.
  // Visible behaviour: consumes `length` bytes in 16-byte blocks. With
  // BT_AllowParallel set it takes six blocks at a time through func6, then
  // two at a time through func2; leftovers go one at a time through func2
  // paired with a zero vector.
  // Parameters seen below:
  //   subKeys, rounds - handed to func6/func2 (rounds is cast to unsigned int)
  //   inBlocks        - source blocks; read and updated in place as a counter
  //                     when BT_InBlockIsCounter is set
  //   xorBlocks       - optional blocks XORed into the data; may be NULLPTR
  //   outBlocks       - destination of the processed blocks
  //   length          - number of bytes still to process
  //   flags           - BT_* bit flags tested below
  389            const W *subKeys, 
size_t rounds, 
const byte *inBlocks,
 
  390            const byte *xorBlocks, 
byte *outBlocks, 
size_t length, 
word32 flags)
 
         // Vector holding 1<<24 in its last 32-bit word; it is added to a
         // block below to derive the next counter block.
  397    const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
 
  398    const uint32x4_t s_one = vld1q_u32(w_one);
 
  400    const size_t blockSize = 16;
 
         // Per-block pointer steps; 0 keeps the pointer in place.
  403    size_t inIncrement = (flags & (
EnumToInt(BT_InBlockIsCounter)|
EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
 
  404    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
 
  405    size_t outIncrement = (flags & 
EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;
 
         // With xorBlocks present, BT_XorInput decides between the two.
         // NOTE(review): the lines that test xorInput/xorOutput are elided in
         // this excerpt; presumably they guard the XOR groups further down.
  408    const bool xorInput = (xorBlocks != NULLPTR) && (flags & 
EnumToInt(BT_XorInput));
 
  409    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & 
EnumToInt(BT_XorInput));
 
         // Reverse direction: start at the last block and negate the steps
         // (size_t, so 0-x wraps). Assumes length >= blockSize - TODO confirm.
  411    if (flags & BT_ReverseDirection)
 
  413        inBlocks = 
PtrAdd(inBlocks, length - blockSize);
 
  414        xorBlocks = 
PtrAdd(xorBlocks, length - blockSize);
 
  415        outBlocks = 
PtrAdd(outBlocks, length - blockSize);
 
  416        inIncrement = 0-inIncrement;
 
  417        xorIncrement = 0-xorIncrement;
 
  418        outIncrement = 0-outIncrement;
 
         // Parallel path: six blocks per iteration first.
  421    if (flags & BT_AllowParallel)
 
  423        while (length >= 6*blockSize)
 
  425            uint64x2_t block0, block1, block2, block3, block4, block5;
 
                 // Counter mode: derive six consecutive counter blocks from
                 // the one at inBlocks and write the following counter back.
  426            if (flags & BT_InBlockIsCounter)
 
  428                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
 
  429                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
 
  430                block1 = vaddq_u64(block0, one);
 
  431                block2 = vaddq_u64(block1, one);
 
  432                block3 = vaddq_u64(block2, one);
 
  433                block4 = vaddq_u64(block3, one);
 
  434                block5 = vaddq_u64(block4, one);
 
  435                vst1q_u8(
const_cast<byte*
>(inBlocks),
 
  436                    vreinterpretq_u8_u64(vaddq_u64(block5, one)));
 
                     // NOTE(review): the 'else' line is elided here;
                     // presumably these loads are the non-counter path.
  440                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
 
  441                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
  442                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
 
  443                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
  444                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
 
  445                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
  446                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
 
  447                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
  448                block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
 
  449                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
  450                block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
 
  451                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
                     // XOR before the transform.
                     // NOTE(review): guard line elided; presumably runs only
                     // when xorInput is set - confirm.
  456                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  457                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  458                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  459                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  460                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  461                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  462                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  463                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  464                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  465                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  466                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  467                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
                 // Transform all six blocks with one call.
  470            func6(block0, block1, block2, block3, block4, block5, subKeys, 
static_cast<unsigned int>(rounds));
 
                     // XOR after the transform.
                     // NOTE(review): guard line elided; presumably runs only
                     // when xorOutput is set - confirm.
  474                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  475                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  476                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  477                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  478                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  479                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  480                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  481                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  482                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  483                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  484                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  485                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
                 // Write the six results, stepping outBlocks after each store.
  488            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
 
  489            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  490            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
 
  491            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  492            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
 
  493            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  494            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
 
  495            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  496            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
 
  497            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  498            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
 
  499            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  501            length -= 6*blockSize;
 
             // Second parallel stage: two blocks per iteration through func2.
  504        while (length >= 2*blockSize)
 
  506            uint64x2_t block0, block1;
 
                 // Counter mode: two consecutive counter blocks, then write
                 // the following counter back to inBlocks.
  507            if (flags & BT_InBlockIsCounter)
 
  509                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
 
  510                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
 
  511                block1 = vaddq_u64(block0, one);
 
  512                vst1q_u8(
const_cast<byte*
>(inBlocks),
 
  513                    vreinterpretq_u8_u64(vaddq_u64(block1, one)));
 
                     // NOTE(review): the 'else' line is elided here;
                     // presumably these loads are the non-counter path.
  517                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
 
  518                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
  519                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
 
  520                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
                     // XOR before the transform (guard line elided;
                     // presumably xorInput - confirm).
  525                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  526                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  527                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  528                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  531            func2(block0, block1, subKeys, 
static_cast<unsigned int>(rounds));
 
                     // XOR after the transform (guard line elided;
                     // presumably xorOutput - confirm).
  535                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  536                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  537                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  538                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  541            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
 
  542            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  543            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
 
  544            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  546            length -= 2*blockSize;
 
         // Remaining blocks, one per iteration. The single block is paired
         // with a zero vector so that func2 can be reused; only 'block' is
         // stored afterwards.
         // NOTE(review): the XOR guard lines are elided in this excerpt.
  550    while (length >= blockSize)
 
  552        uint64x2_t block, zero = {0,0};
 
  553        block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
 
  556            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
             // Counter mode: bump only the last byte of the counter block,
             // in place, after it has been loaded.
  558        if (flags & BT_InBlockIsCounter)
 
  559            const_cast<byte *
>(inBlocks)[15]++;
 
  561        func2(block, zero, subKeys, 
static_cast<unsigned int>(rounds));
 
  564            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
  566        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));
 
  568        inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
  569        outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  570        xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
 
  // NOTE(review): this excerpt keeps only some lines of the routine, each
  // still prefixed with its original line number. The template header, the
  // function name, the braces, several guard/else lines and the statements
  // that load block0..block5 from inBlocks are not visible here.
  // Visible behaviour: consumes `length` bytes in 16-byte blocks held as
  // __m128i. With BT_AllowParallel set it takes six blocks at a time through
  // func6, then two at a time through func2; leftovers go one at a time
  // through func2 paired with a zero vector.
  // Parameters seen below:
  //   subKeys, rounds - handed to func6/func2 (rounds is cast to unsigned int)
  //   inBlocks        - source blocks; updated in place as a counter when
  //                     BT_InBlockIsCounter is set
  //   xorBlocks       - optional blocks XORed into the data; may be NULLPTR
  //   outBlocks       - destination of the processed blocks
  //   length          - number of bytes still to process
  //   flags           - BT_* bit flags tested below
  636        MAYBE_CONST W *subKeys, 
size_t rounds, 
const byte *inBlocks,
 
  637        const byte *xorBlocks, 
byte *outBlocks, 
size_t length, 
word32 flags)
 
  644    const size_t blockSize = 16;
 
         // Per-block pointer steps; 0 keeps the pointer in place.
  647    size_t inIncrement = (flags & (
EnumToInt(BT_InBlockIsCounter)|
EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
 
  648    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
 
  649    size_t outIncrement = (flags & 
EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;
 
         // With xorBlocks present, BT_XorInput decides between the two.
         // NOTE(review): the lines that test xorInput/xorOutput are elided in
         // this excerpt; presumably they guard the XOR groups further down.
  652    const bool xorInput = (xorBlocks != NULLPTR) && (flags & 
EnumToInt(BT_XorInput));
 
  653    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & 
EnumToInt(BT_XorInput));
 
         // Reverse direction: start at the last block and negate the steps
         // (size_t, so 0-x wraps). Assumes length >= blockSize - TODO confirm.
  655    if (flags & BT_ReverseDirection)
 
  657        inBlocks = 
PtrAdd(inBlocks, length - blockSize);
 
  658        xorBlocks = 
PtrAdd(xorBlocks, length - blockSize);
 
  659        outBlocks = 
PtrAdd(outBlocks, length - blockSize);
 
  660        inIncrement = 0-inIncrement;
 
  661        xorIncrement = 0-xorIncrement;
 
  662        outIncrement = 0-outIncrement;
 
         // Parallel path: six blocks per iteration first.
  665    if (flags & BT_AllowParallel)
 
  667        while (length >= 6*blockSize)
 
  669            __m128i block0, block1, block2, block3, block4, block5;
 
                 // Counter mode: each next block is the previous one plus
                 // s_one (1<<24 in the highest 32-bit element); the value
                 // after block5 is stored back to inBlocks.
                 // NOTE(review): the load of block0 is elided here.
  670            if (flags & BT_InBlockIsCounter)
 
  673                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
 
  675                block1 = _mm_add_epi32(block0, s_one);
 
  676                block2 = _mm_add_epi32(block1, s_one);
 
  677                block3 = _mm_add_epi32(block2, s_one);
 
  678                block4 = _mm_add_epi32(block3, s_one);
 
  679                block5 = _mm_add_epi32(block4, s_one);
 
  680                _mm_storeu_si128(
M128_CAST(inBlocks), _mm_add_epi32(block5, s_one));
 
                     // NOTE(review): the 'else' line and the six block loads
                     // are elided; only the pointer steps remain visible.
  685                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
  687                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
  689                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
  691                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
  693                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
  695                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
                     // XOR before the transform.
                     // NOTE(review): guard line elided; presumably runs only
                     // when xorInput is set - confirm.
  700                block0 = _mm_xor_si128(block0, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
 
  701                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  702                block1 = _mm_xor_si128(block1, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
 
  703                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  704                block2 = _mm_xor_si128(block2, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
 
  705                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  706                block3 = _mm_xor_si128(block3, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
 
  707                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  708                block4 = _mm_xor_si128(block4, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
 
  709                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  710                block5 = _mm_xor_si128(block5, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
 
  711                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
                 // Transform all six blocks with one call.
  714            func6(block0, block1, block2, block3, block4, block5, subKeys, 
static_cast<unsigned int>(rounds));
 
                     // XOR after the transform.
                     // NOTE(review): guard line elided; presumably runs only
                     // when xorOutput is set - confirm.
  718                block0 = _mm_xor_si128(block0, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
 
  719                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  720                block1 = _mm_xor_si128(block1, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
 
  721                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  722                block2 = _mm_xor_si128(block2, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
 
  723                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  724                block3 = _mm_xor_si128(block3, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
 
  725                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  726                block4 = _mm_xor_si128(block4, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
 
  727                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  728                block5 = _mm_xor_si128(block5, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
 
  729                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
                 // Write the six results with unaligned stores, stepping
                 // outBlocks after each one.
  732            _mm_storeu_si128(
M128_CAST(outBlocks), block0);
 
  733            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  734            _mm_storeu_si128(
M128_CAST(outBlocks), block1);
 
  735            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  736            _mm_storeu_si128(
M128_CAST(outBlocks), block2);
 
  737            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  738            _mm_storeu_si128(
M128_CAST(outBlocks), block3);
 
  739            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  740            _mm_storeu_si128(
M128_CAST(outBlocks), block4);
 
  741            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  742            _mm_storeu_si128(
M128_CAST(outBlocks), block5);
 
  743            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  745            length -= 6*blockSize;
 
             // Second parallel stage: two blocks per iteration through func2.
  748        while (length >= 2*blockSize)
 
  750            __m128i block0, block1;
 
                 // Counter mode: block1 is block0 plus s_one and the value
                 // after block1 is stored back to inBlocks.
                 // NOTE(review): the load of block0 is elided here.
  751            if (flags & BT_InBlockIsCounter)
 
  754                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
 
  756                block1 = _mm_add_epi32(block0, s_one);
 
  757                _mm_storeu_si128(
M128_CAST(inBlocks), _mm_add_epi32(block1, s_one));
 
                     // NOTE(review): the 'else' line and the two block loads
                     // are elided; only the pointer steps remain visible.
  762                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
  764                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
                     // XOR before the transform (guard line elided;
                     // presumably xorInput - confirm).
  769                block0 = _mm_xor_si128(block0, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
 
  770                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  771                block1 = _mm_xor_si128(block1, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
 
  772                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  775            func2(block0, block1, subKeys, 
static_cast<unsigned int>(rounds));
 
                     // XOR after the transform (guard line elided;
                     // presumably xorOutput - confirm).
  779                block0 = _mm_xor_si128(block0, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
 
  780                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  781                block1 = _mm_xor_si128(block1, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
 
  782                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  785            _mm_storeu_si128(
M128_CAST(outBlocks), block0);
 
  786            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  787            _mm_storeu_si128(
M128_CAST(outBlocks), block1);
 
  788            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  790            length -= 2*blockSize;
 
         // Remaining blocks, one per iteration. The single block is paired
         // with a zero vector so that func2 can be reused; only 'block' is
         // stored afterwards.
         // NOTE(review): the load of 'block' and the XOR guard lines are
         // elided in this excerpt.
  794    while (length >= blockSize)
 
  796        __m128i block, zero = _mm_setzero_si128();
 
  800            block = _mm_xor_si128(block, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
 
             // Counter mode: bump only the last byte of the counter block,
             // in place.
  802        if (flags & BT_InBlockIsCounter)
 
  803            const_cast<byte *
>(inBlocks)[15]++;
 
  805        func2(block, zero, subKeys, 
static_cast<unsigned int>(rounds));
 
  808            block = _mm_xor_si128(block, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
 
  810        _mm_storeu_si128(
M128_CAST(outBlocks), block);
 
  812        inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
  813        outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  814        xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
 
  // NOTE(review): this excerpt keeps only some lines of the routine, each
  // still prefixed with its original line number. The template header, the
  // function name, the braces, several guard/else lines and the statements
  // that load the blocks from inBlocks are not visible here.
  // Visible behaviour: consumes `length` bytes in 16-byte blocks held as
  // __m128i, four at a time through func4 when BT_AllowParallel is set, then
  // one at a time through func1.
  // Parameters seen below:
  //   subKeys, rounds - handed to func4/func1 (rounds is cast to unsigned int)
  //   inBlocks        - source blocks; updated in place as a counter when
  //                     BT_InBlockIsCounter is set
  //   xorBlocks       - optional blocks XORed into the data; may be NULLPTR
  //   outBlocks       - destination of the processed blocks
  //   length          - number of bytes still to process
  //   flags           - BT_* bit flags tested below
  831        MAYBE_CONST W *subKeys, 
size_t rounds, 
const byte *inBlocks,
 
  832        const byte *xorBlocks, 
byte *outBlocks, 
size_t length, 
word32 flags)
 
  839    const size_t blockSize = 16;
 
         // Per-block pointer steps; 0 keeps the pointer in place.
  842    size_t inIncrement = (flags & (
EnumToInt(BT_InBlockIsCounter)|
EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
 
  843    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
 
  844    size_t outIncrement = (flags & 
EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;
 
         // With xorBlocks present, BT_XorInput decides between the two.
         // NOTE(review): the lines that test xorInput/xorOutput are elided in
         // this excerpt; presumably they guard the XOR groups further down.
  847    const bool xorInput = (xorBlocks != NULLPTR) && (flags & 
EnumToInt(BT_XorInput));
 
  848    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & 
EnumToInt(BT_XorInput));
 
         // Reverse direction: start at the last block and negate the steps
         // (size_t, so 0-x wraps). Assumes length >= blockSize - TODO confirm.
  850    if (flags & BT_ReverseDirection)
 
  852        inBlocks = 
PtrAdd(inBlocks, length - blockSize);
 
  853        xorBlocks = 
PtrAdd(xorBlocks, length - blockSize);
 
  854        outBlocks = 
PtrAdd(outBlocks, length - blockSize);
 
  855        inIncrement = 0-inIncrement;
 
  856        xorIncrement = 0-xorIncrement;
 
  857        outIncrement = 0-outIncrement;
 
         // Parallel path: four blocks per iteration.
  860    if (flags & BT_AllowParallel)
 
  862        while (length >= 4*blockSize)
 
  864            __m128i block0, block1, block2, block3;
 
                 // Counter mode: each next block is the previous one plus
                 // s_one (1<<24 in the highest 32-bit element); the value
                 // after block3 is stored back to inBlocks.
                 // NOTE(review): the load of block0 is elided here.
  865            if (flags & BT_InBlockIsCounter)
 
  868                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
 
  870                block1 = _mm_add_epi32(block0, s_one);
 
  871                block2 = _mm_add_epi32(block1, s_one);
 
  872                block3 = _mm_add_epi32(block2, s_one);
 
  873                _mm_storeu_si128(
M128_CAST(inBlocks), _mm_add_epi32(block3, s_one));
 
                     // NOTE(review): the 'else' line and the four block loads
                     // are elided; only the pointer steps remain visible.
  878                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
  880                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
  882                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
  884                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
                     // XOR before the transform.
                     // NOTE(review): guard line elided; presumably runs only
                     // when xorInput is set - confirm.
  889                block0 = _mm_xor_si128(block0, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
 
  890                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  891                block1 = _mm_xor_si128(block1, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
 
  892                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  893                block2 = _mm_xor_si128(block2, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
 
  894                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  895                block3 = _mm_xor_si128(block3, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
 
  896                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
                 // Transform all four blocks with one call.
  899            func4(block0, block1, block2, block3, subKeys, 
static_cast<unsigned int>(rounds));
 
                     // XOR after the transform.
                     // NOTE(review): guard line elided; presumably runs only
                     // when xorOutput is set - confirm.
  903                block0 = _mm_xor_si128(block0, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
 
  904                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  905                block1 = _mm_xor_si128(block1, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
 
  906                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  907                block2 = _mm_xor_si128(block2, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
 
  908                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
  909                block3 = _mm_xor_si128(block3, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
 
  910                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
                 // Write the four results with unaligned stores, stepping
                 // outBlocks after each one.
  913            _mm_storeu_si128(
M128_CAST(outBlocks), block0);
 
  914            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  915            _mm_storeu_si128(
M128_CAST(outBlocks), block1);
 
  916            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  917            _mm_storeu_si128(
M128_CAST(outBlocks), block2);
 
  918            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  919            _mm_storeu_si128(
M128_CAST(outBlocks), block3);
 
  920            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  922            length -= 4*blockSize;
 
         // Remaining blocks, one per iteration through func1.
         // NOTE(review): the declaration/load of 'block' and the XOR guard
         // lines are elided in this excerpt.
  926    while (length >= blockSize)
 
  931            block = _mm_xor_si128(block, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
 
             // Counter mode: bump only the last byte of the counter block,
             // in place.
  933        if (flags & BT_InBlockIsCounter)
 
  934            const_cast<byte *
>(inBlocks)[15]++;
 
  936        func1(block, subKeys, 
static_cast<unsigned int>(rounds));
 
  939            block = _mm_xor_si128(block, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
 
  941        _mm_storeu_si128(
M128_CAST(outBlocks), block);
 
  943        inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
  944        outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
  945        xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
 
  972        const W *subKeys, 
size_t rounds, 
const byte *inBlocks,
 
  973        const byte *xorBlocks, 
byte *outBlocks, 
size_t length, 
word32 flags)
 
         // Block-processing driver: while BT_AllowParallel is set and at least four
         // 16-byte blocks remain, func4 is applied to four blocks at a time; any
         // remaining whole blocks are handled one at a time by func1.
         // NOTE(review): the routine's name line, its braces and the block
         // loads/stores are not visible in this listing; the comments below describe
         // only the statements that are shown.
  980#if (CRYPTOPP_LITTLE_ENDIAN) 
  986    const size_t blockSize = 16;
 
         // Per-block pointer strides. The input pointer stays put when the input is a
         // counter block or when the caller asked for fixed pointers; the xor pointer
         // only advances when an xor buffer was supplied.
  989    size_t inIncrement = (flags & (
EnumToInt(BT_InBlockIsCounter)|
EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
 
  990    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
 
  991    size_t outIncrement = (flags & 
EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;
 
         // With a non-null xorBlocks exactly one of these is true, selected by
         // BT_XorInput; with a null xorBlocks both are false.
  994    const bool xorInput = (xorBlocks != NULLPTR) && (flags & 
EnumToInt(BT_XorInput));
 
  995    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & 
EnumToInt(BT_XorInput));
 
         // Reverse direction: start every pointer at the last block of its buffer and
         // negate the strides (unsigned wrap-around of size_t yields the negative step).
  997    if (flags & BT_ReverseDirection)
 
  999        inBlocks = 
PtrAdd(inBlocks, length - blockSize);
 
 1000        xorBlocks = 
PtrAdd(xorBlocks, length - blockSize);
 
 1001        outBlocks = 
PtrAdd(outBlocks, length - blockSize);
 
 1002        inIncrement = 0-inIncrement;
 
 1003        xorIncrement = 0-xorIncrement;
 
 1004        outIncrement = 0-outIncrement;
 
 1007    if (flags & BT_AllowParallel)
 
 1009        while (length >= 4*blockSize)
 
 1013            if (flags & BT_InBlockIsCounter)
 
                     // Counter mode: block1..block3 are derived from block0 by
                     // repeatedly adding s_one.
 1016                block1 = 
VecAdd(block0, s_one);
 
 1017                block2 = 
VecAdd(block1, s_one);
 
 1018                block3 = 
VecAdd(block2, s_one);
 
                     // Advance the caller's counter by the number of blocks this
                     // iteration consumes. The loop takes four blocks (block0..block3,
                     // length -= 4*blockSize), so the last counter byte moves by 4.
                     // The earlier value of 6 did not match the four-block batch and
                     // made each following iteration start two counter values too far
                     // ahead.
 1028                const_cast<byte*
>(inBlocks)[15] += 4;
 
 1033                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
 1035                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
 1037                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
 1039                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
 1045                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
 1047                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
 1049                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
 1051                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
                 // Transform the four blocks in one call.
 1054            func4(block0, block1, block2, block3, subKeys, rounds);
 
 1059                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
 1061                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
 1063                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
 1065                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
 1069            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
 1071            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
 1073            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
 1075            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
 1077            length -= 4*blockSize;
 
         // Remaining whole blocks, one per iteration.
 1081    while (length >= blockSize)
 
             // Counter mode: bump the last byte of the caller's counter block by one
             // for each block consumed here.
 1088        if (flags & BT_InBlockIsCounter)
 
 1089            const_cast<byte *
>(inBlocks)[15]++;
 
 1091        func1(block, subKeys, rounds);
 
 1098        inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
 1099        outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
 1100        xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
 1101        length -= blockSize;
 
 
 1117        const W *subKeys, 
size_t rounds, 
const byte *inBlocks,
 
 1118        const byte *xorBlocks, 
byte *outBlocks, 
size_t length, 
word32 flags)
 
         // Block-processing driver: while BT_AllowParallel is set and at least six
         // 16-byte blocks remain, func6 is applied to six blocks at a time; any
         // remaining whole blocks are handled one at a time by func1.
         // NOTE(review): the routine's name line, its braces and the block
         // loads/stores are not visible in this listing; the comments below describe
         // only the statements that are shown.
 1125#if (CRYPTOPP_LITTLE_ENDIAN) 
 1131    const size_t blockSize = 16;
 
         // Per-block pointer strides. The input pointer stays put when the input is a
         // counter block or when the caller asked for fixed pointers; the xor pointer
         // only advances when an xor buffer was supplied.
 1134    size_t inIncrement = (flags & (
EnumToInt(BT_InBlockIsCounter)|
EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
 
 1135    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
 
 1136    size_t outIncrement = (flags & 
EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;
 
         // With a non-null xorBlocks exactly one of these is true, selected by
         // BT_XorInput; with a null xorBlocks both are false.
 1139    const bool xorInput = (xorBlocks != NULLPTR) && (flags & 
EnumToInt(BT_XorInput));
 
 1140    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & 
EnumToInt(BT_XorInput));
 
         // Reverse direction: start every pointer at the last block of its buffer and
         // negate the strides (unsigned wrap-around of size_t yields the negative step).
 1142    if (flags & BT_ReverseDirection)
 
 1144        inBlocks = 
PtrAdd(inBlocks, length - blockSize);
 
 1145        xorBlocks = 
PtrAdd(xorBlocks, length - blockSize);
 
 1146        outBlocks = 
PtrAdd(outBlocks, length - blockSize);
 
 1147        inIncrement = 0-inIncrement;
 
 1148        xorIncrement = 0-xorIncrement;
 
 1149        outIncrement = 0-outIncrement;
 
 1152    if (flags & BT_AllowParallel)
 
 1154        while (length >= 6*blockSize)
 
 1156            uint32x4_p block0, block1, block2, block3, block4, block5;
 
 1158            if (flags & BT_InBlockIsCounter)
 
                     // Counter mode: block1..block5 are derived from block0 by
                     // repeatedly adding s_one.
 1161                block1 = 
VecAdd(block0, s_one);
 
 1162                block2 = 
VecAdd(block1, s_one);
 
 1163                block3 = 
VecAdd(block2, s_one);
 
 1164                block4 = 
VecAdd(block3, s_one);
 
 1165                block5 = 
VecAdd(block4, s_one);
 
                     // Write a vector back into the caller's counter buffer (hence the
                     // const_cast). NOTE(review): temp is defined on a line not shown
                     // here; presumably it is the counter following block5 - confirm.
 1182                VecStoreBE(temp, 
const_cast<byte*
>(inBlocks));
 
 1187                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
 1189                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
 1191                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
 1193                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
 1195                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
 1197                inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
 1203                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
 1205                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
 1207                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
 1209                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
 1211                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
 1213                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
                 // Transform the six blocks in one call.
 1216            func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);
 
 1221                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
 1223                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
 1225                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
 1227                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
 1229                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
 1231                xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
 1235            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
 1237            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
 1239            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
 1241            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
 1243            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
 1245            outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
 1247            length -= 6*blockSize;
 
         // Remaining whole blocks, one per iteration.
 1251    while (length >= blockSize)
 
             // Counter mode: bump the last byte of the caller's counter block by one
             // for each block consumed here.
 1258        if (flags & BT_InBlockIsCounter)
 
 1259            const_cast<byte *
>(inBlocks)[15]++;
 
 1261        func1(block, subKeys, rounds);
 
 1268        inBlocks = 
PtrAdd(inBlocks, inIncrement);
 
 1269        outBlocks = 
PtrAdd(outBlocks, outIncrement);
 
 1270        xorBlocks = 
PtrAdd(xorBlocks, xorIncrement);
 
 1271        length -= blockSize;